net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #include <linux/module.h>
  66 #include <asm/uaccess.h>
  67 #include <asm/system.h>
  68 #include <linux/bitops.h>
  69 #include <linux/types.h>
  70 #include <linux/kernel.h>
  71 #include <linux/mm.h>
  72 #include <linux/bootmem.h>
  73 #include <linux/string.h>
  74 #include <linux/socket.h>
  75 #include <linux/sockios.h>
  76 #include <linux/errno.h>
  77 #include <linux/in.h>
  78 #include <linux/inet.h>
  79 #include <linux/netdevice.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/init.h>
  82 #include <linux/workqueue.h>
  83 #include <linux/skbuff.h>
  84 #include <linux/inetdevice.h>
  85 #include <linux/igmp.h>
  86 #include <linux/pkt_sched.h>
  87 #include <linux/mroute.h>
  88 #include <linux/netfilter_ipv4.h>
  89 #include <linux/random.h>
  90 #include <linux/jhash.h>
  91 #include <linux/rcupdate.h>
  92 #include <linux/times.h>
  93 #include <net/dst.h>
  94 #include <net/net_namespace.h>
  95 #include <net/protocol.h>
  96 #include <net/ip.h>
  97 #include <net/route.h>
  98 #include <net/inetpeer.h>
  99 #include <net/sock.h>
 100 #include <net/ip_fib.h>
 101 #include <net/arp.h>
 102 #include <net/tcp.h>
 103 #include <net/icmp.h>
 104 #include <net/xfrm.h>
 105 #include <net/netevent.h>
 106 #include <net/rtnetlink.h>
 107 #ifdef CONFIG_SYSCTL
 108 #include <linux/sysctl.h>
 109 #endif
 110
 /*
  * Routing-relevant TOS bits (IPTOS_RT_MASK plus the RTO_ONLINK flag) of the
  * flow key that @oldflp points to, as a u32.  The argument is parenthesised
  * so that any pointer-valued expression can be passed safely.
  */
 #define RT_FL_TOS(oldflp) \
     ((u32)((oldflp)->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
 113
 114 #define IP_MAX_MTU      0xFFF0
 115
 116 #define RT_GC_TIMEOUT (300*HZ)
 117
 118 static int ip_rt_max_size;
     /* Base idle timeout that rt_check_expire() hands to rt_may_expire(). */
 119 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
     /* Delay, in jiffies, after which rt_worker_func() re-queues itself. */
 120 static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
 121 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
 122 static int ip_rt_redirect_number __read_mostly  = 9;
 123 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 124 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 125 static int ip_rt_error_cost __read_mostly       = HZ;
 126 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
     /* Lower bound used when rt_check_expire() recomputes rt_chain_length_max. */
 127 static int ip_rt_gc_elasticity __read_mostly    = 8;
 128 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 129 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 130 static int ip_rt_min_advmss __read_mostly       = 256;
 131 static int ip_rt_secret_interval __read_mostly  = 10 * 60 * HZ;
     /* Re-estimated by rt_check_expire() as max(elasticity, avg + 4*sd). */
 132 static int rt_chain_length_max __read_mostly    = 20;
 133
 134 static void rt_worker_func(struct work_struct *work);
     /* Delayed work item bound to rt_worker_func(), which re-queues it itself. */
 135 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
 136
 137 /*
 138  *      Interface to generic destination cache.
 139  */
 140
 141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 142 static void              ipv4_dst_destroy(struct dst_entry *dst);
 143 static void              ipv4_dst_ifdown(struct dst_entry *dst,
 144                                          struct net_device *dev, int how);
 145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 146 static void              ipv4_link_failure(struct sk_buff *skb);
 147 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 148 static int rt_garbage_collect(struct dst_ops *ops);
 149 static void rt_emergency_hash_rebuild(struct net *net);
 150
 151
 152 static struct dst_ops ipv4_dst_ops = {
 153         .family =               AF_INET,
 154         .protocol =             cpu_to_be16(ETH_P_IP),
 155         .gc =                   rt_garbage_collect,
 156         .check =                ipv4_dst_check,
 157         .destroy =              ipv4_dst_destroy,
 158         .ifdown =               ipv4_dst_ifdown,
 159         .negative_advice =      ipv4_negative_advice,
 160         .link_failure =         ipv4_link_failure,
 161         .update_pmtu =          ip_rt_update_pmtu,
 162         .local_out =            __ip_local_out,
 163         .entries =              ATOMIC_INIT(0),
 164 };
 165
 166 #define ECN_OR_COST(class)      TC_PRIO_##class
 167
 168 const __u8 ip_tos2prio[16] = {
 169         TC_PRIO_BESTEFFORT,
 170         ECN_OR_COST(FILLER),
 171         TC_PRIO_BESTEFFORT,
 172         ECN_OR_COST(BESTEFFORT),
 173         TC_PRIO_BULK,
 174         ECN_OR_COST(BULK),
 175         TC_PRIO_BULK,
 176         ECN_OR_COST(BULK),
 177         TC_PRIO_INTERACTIVE,
 178         ECN_OR_COST(INTERACTIVE),
 179         TC_PRIO_INTERACTIVE,
 180         ECN_OR_COST(INTERACTIVE),
 181         TC_PRIO_INTERACTIVE_BULK,
 182         ECN_OR_COST(INTERACTIVE_BULK),
 183         TC_PRIO_INTERACTIVE_BULK,
 184         ECN_OR_COST(INTERACTIVE_BULK)
 185 };
 186
 187
 188 /*
 189  * Route cache.
 190  */
 191
 192 /* The locking scheme is rather straight forward:
 193  *
 194  * 1) Read-Copy Update protects the buckets of the central route hash.
 195  * 2) Only writers remove entries, and they hold the lock
 196  *    as they look at rtable reference counts.
 197  * 3) Only readers acquire references to rtable entries,
 198  *    they do so with atomic increments and with the
 199  *    lock held.
 200  */
 201
 202 struct rt_hash_bucket {
             /* Head of this bucket's singly linked list (linked via u.dst.rt_next). */
 203         struct rtable   *chain;
 204 };
 205
 206 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 207         defined(CONFIG_PROVE_LOCKING)
 208 /*
 209  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 210  * The size of this table is a power of two and depends on the number of CPUS.
 211  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 212  */
 213 #ifdef CONFIG_LOCKDEP
 214 # define RT_HASH_LOCK_SZ        256
 215 #else
 216 # if NR_CPUS >= 32
 217 #  define RT_HASH_LOCK_SZ       4096
 218 # elif NR_CPUS >= 16
 219 #  define RT_HASH_LOCK_SZ       2048
 220 # elif NR_CPUS >= 8
 221 #  define RT_HASH_LOCK_SZ       1024
 222 # elif NR_CPUS >= 4
 223 #  define RT_HASH_LOCK_SZ       512
 224 # else
 225 #  define RT_HASH_LOCK_SZ       256
 226 # endif
 227 #endif
 228
 229 static spinlock_t       *rt_hash_locks;
 230 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 231
 232 static __init void rt_hash_lock_init(void)
 233 {
 234         int i;
 235
 236         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
 237                         GFP_KERNEL);
 238         if (!rt_hash_locks)
 239                 panic("IP: failed to allocate rt_hash_locks\n");
 240
 241         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
 242                 spin_lock_init(&rt_hash_locks[i]);
 243 }
 244 #else
 245 # define rt_hash_lock_addr(slot) NULL
 246
 /* No lock table exists in this configuration, so there is nothing to set up. */
 static inline void rt_hash_lock_init(void) { }
 250 #endif
 251
 252 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
 253 static unsigned                 rt_hash_mask __read_mostly;
 254 static unsigned int             rt_hash_log  __read_mostly;
 255
 256 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 257 #define RT_CACHE_STAT_INC(field) \
 258         (__raw_get_cpu_var(rt_cache_stat).field++)
 259
 260 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
 261                 int genid)
 262 {
 263         return jhash_3words((__force u32)(__be32)(daddr),
 264                             (__force u32)(__be32)(saddr),
 265                             idx, genid)
 266                 & rt_hash_mask;
 267 }
 268
 269 static inline int rt_genid(struct net *net)
 270 {
 271         return atomic_read(&net->ipv4.rt_genid);
 272 }
 273
 274 #ifdef CONFIG_PROC_FS
 275 struct rt_cache_iter_state {
 276         struct seq_net_private p;
             /* Bucket being walked; the iterators count it down from rt_hash_mask. */
 277         int bucket;
             /* rt_genid sampled in rt_cache_seq_start(); other generations are skipped. */
 278         int genid;
 279 };
 280
 281 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 282 {
 283         struct rt_cache_iter_state *st = seq->private;
 284         struct rtable *r = NULL;
 285
 286         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 287                 if (!rt_hash_table[st->bucket].chain)
 288                         continue;
 289                 rcu_read_lock_bh();
 290                 r = rcu_dereference(rt_hash_table[st->bucket].chain);
 291                 while (r) {
 292                         if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
 293                             r->rt_genid == st->genid)
 294                                 return r;
 295                         r = rcu_dereference(r->u.dst.rt_next);
 296                 }
 297                 rcu_read_unlock_bh();
 298         }
 299         return r;
 300 }
 301
 302 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 303                                           struct rtable *r)
 304 {
 305         struct rt_cache_iter_state *st = seq->private;
 306
 307         r = r->u.dst.rt_next;
 308         while (!r) {
 309                 rcu_read_unlock_bh();
 310                 do {
 311                         if (--st->bucket < 0)
 312                                 return NULL;
 313                 } while (!rt_hash_table[st->bucket].chain);
 314                 rcu_read_lock_bh();
 315                 r = rt_hash_table[st->bucket].chain;
 316         }
 317         return rcu_dereference(r);
 318 }
 319
 320 static struct rtable *rt_cache_get_next(struct seq_file *seq,
 321                                         struct rtable *r)
 322 {
 323         struct rt_cache_iter_state *st = seq->private;
 324         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 325                 if (dev_net(r->u.dst.dev) != seq_file_net(seq))
 326                         continue;
 327                 if (r->rt_genid == st->genid)
 328                         break;
 329         }
 330         return r;
 331 }
 332
 333 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 334 {
 335         struct rtable *r = rt_cache_get_first(seq);
 336
 337         if (r)
 338                 while (pos && (r = rt_cache_get_next(seq, r)))
 339                         --pos;
 340         return pos ? NULL : r;
 341 }
 342
 343 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 344 {
 345         struct rt_cache_iter_state *st = seq->private;
 346         if (*pos)
 347                 return rt_cache_get_idx(seq, *pos - 1);
 348         st->genid = rt_genid(seq_file_net(seq));
 349         return SEQ_START_TOKEN;
 350 }
 351
 352 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 353 {
 354         struct rtable *r;
 355
 356         if (v == SEQ_START_TOKEN)
 357                 r = rt_cache_get_first(seq);
 358         else
 359                 r = rt_cache_get_next(seq, v);
 360         ++*pos;
 361         return r;
 362 }
 363
 364 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 365 {
 366         if (v && v != SEQ_START_TOKEN)
 367                 rcu_read_unlock_bh();
 368 }
 369
 370 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 371 {
 372         if (v == SEQ_START_TOKEN)
 373                 seq_printf(seq, "%-127s\n",
 374                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 375                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 376                            "HHUptod\tSpecDst");
 377         else {
 378                 struct rtable *r = v;
 379                 int len;
 380
 381                 seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
 382                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 383                         r->u.dst.dev ? r->u.dst.dev->name : "*",
 384                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
 385                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
 386                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
 387                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
 388                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
 389                         dst_metric(&r->u.dst, RTAX_WINDOW),
 390                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
 391                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
 392                         r->fl.fl4_tos,
 393                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
 394                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
 395                                        dev_queue_xmit) : 0,
 396                         r->rt_spec_dst, &len);
 397
 398                 seq_printf(seq, "%*s\n", 127 - len, "");
 399         }
 400         return 0;
 401 }
 402
 403 static const struct seq_operations rt_cache_seq_ops = {
 404         .start  = rt_cache_seq_start,
 405         .next   = rt_cache_seq_next,
 406         .stop   = rt_cache_seq_stop,
 407         .show   = rt_cache_seq_show,
 408 };
 409
 410 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 411 {
 412         return seq_open_net(inode, file, &rt_cache_seq_ops,
 413                         sizeof(struct rt_cache_iter_state));
 414 }
 415
 416 static const struct file_operations rt_cache_seq_fops = {
 417         .owner   = THIS_MODULE,
 418         .open    = rt_cache_seq_open,
 419         .read    = seq_read,
 420         .llseek  = seq_lseek,
 421         .release = seq_release_net,
 422 };
 423
 424
 425 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 426 {
 427         int cpu;
 428
 429         if (*pos == 0)
 430                 return SEQ_START_TOKEN;
 431
 432         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 433                 if (!cpu_possible(cpu))
 434                         continue;
 435                 *pos = cpu+1;
 436                 return &per_cpu(rt_cache_stat, cpu);
 437         }
 438         return NULL;
 439 }
 440
 441 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 442 {
 443         int cpu;
 444
 445         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 446                 if (!cpu_possible(cpu))
 447                         continue;
 448                 *pos = cpu+1;
 449                 return &per_cpu(rt_cache_stat, cpu);
 450         }
 451         return NULL;
 452
 453 }
 454
 /* seq_file ->stop: the per-CPU walk takes no locks, so nothing to undo. */
 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 {
 }
 459
 460 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 461 {
 462         struct rt_cache_stat *st = v;
 463
 464         if (v == SEQ_START_TOKEN) {
 465                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 466                 return 0;
 467         }
 468
 469         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 470                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 471                    atomic_read(&ipv4_dst_ops.entries),
 472                    st->in_hit,
 473                    st->in_slow_tot,
 474                    st->in_slow_mc,
 475                    st->in_no_route,
 476                    st->in_brd,
 477                    st->in_martian_dst,
 478                    st->in_martian_src,
 479
 480                    st->out_hit,
 481                    st->out_slow_tot,
 482                    st->out_slow_mc,
 483
 484                    st->gc_total,
 485                    st->gc_ignored,
 486                    st->gc_goal_miss,
 487                    st->gc_dst_overflow,
 488                    st->in_hlist_search,
 489                    st->out_hlist_search
 490                 );
 491         return 0;
 492 }
 493
 494 static const struct seq_operations rt_cpu_seq_ops = {
 495         .start  = rt_cpu_seq_start,
 496         .next   = rt_cpu_seq_next,
 497         .stop   = rt_cpu_seq_stop,
 498         .show   = rt_cpu_seq_show,
 499 };
 500
 501
 502 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 503 {
 504         return seq_open(file, &rt_cpu_seq_ops);
 505 }
 506
 507 static const struct file_operations rt_cpu_seq_fops = {
 508         .owner   = THIS_MODULE,
 509         .open    = rt_cpu_seq_open,
 510         .read    = seq_read,
 511         .llseek  = seq_lseek,
 512         .release = seq_release,
 513 };
 514
 515 #ifdef CONFIG_NET_CLS_ROUTE
 516 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
 517                            int length, int *eof, void *data)
 518 {
 519         unsigned int i;
 520
 521         if ((offset & 3) || (length & 3))
 522                 return -EIO;
 523
 524         if (offset >= sizeof(struct ip_rt_acct) * 256) {
 525                 *eof = 1;
 526                 return 0;
 527         }
 528
 529         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
 530                 length = sizeof(struct ip_rt_acct) * 256 - offset;
 531                 *eof = 1;
 532         }
 533
 534         offset /= sizeof(u32);
 535
 536         if (length > 0) {
 537                 u32 *dst = (u32 *) buffer;
 538
 539                 *start = buffer;
 540                 memset(dst, 0, length);
 541
 542                 for_each_possible_cpu(i) {
 543                         unsigned int j;
 544                         u32 *src;
 545
 546                         src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
 547                         for (j = 0; j < length/4; j++)
 548                                 dst[j] += src[j];
 549                 }
 550         }
 551         return length;
 552 }
 553 #endif
 554
 555 static int __net_init ip_rt_do_proc_init(struct net *net)
 556 {
 557         struct proc_dir_entry *pde;
 558
 559         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 560                         &rt_cache_seq_fops);
 561         if (!pde)
 562                 goto err1;
 563
 564         pde = proc_create("rt_cache", S_IRUGO,
 565                           net->proc_net_stat, &rt_cpu_seq_fops);
 566         if (!pde)
 567                 goto err2;
 568
 569 #ifdef CONFIG_NET_CLS_ROUTE
 570         pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
 571                         ip_rt_acct_read, NULL);
 572         if (!pde)
 573                 goto err3;
 574 #endif
 575         return 0;
 576
 577 #ifdef CONFIG_NET_CLS_ROUTE
 578 err3:
 579         remove_proc_entry("rt_cache", net->proc_net_stat);
 580 #endif
 581 err2:
 582         remove_proc_entry("rt_cache", net->proc_net);
 583 err1:
 584         return -ENOMEM;
 585 }
 586
 587 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 588 {
 589         remove_proc_entry("rt_cache", net->proc_net_stat);
 590         remove_proc_entry("rt_cache", net->proc_net);
 591         remove_proc_entry("rt_acct", net->proc_net);
 592 }
 593
 594 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 595         .init = ip_rt_do_proc_init,
 596         .exit = ip_rt_do_proc_exit,
 597 };
 598
 599 static int __init ip_rt_proc_init(void)
 600 {
 601         return register_pernet_subsys(&ip_rt_proc_ops);
 602 }
 603
 604 #else
 /* Without CONFIG_PROC_FS there is nothing to register; report success. */
 static inline int ip_rt_proc_init(void) { return 0; }
 609 #endif /* CONFIG_PROC_FS */
 610
 611 static inline void rt_free(struct rtable *rt)
 612 {
 613         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 614 }
 615
 616 static inline void rt_drop(struct rtable *rt)
 617 {
 618         ip_rt_put(rt);
 619         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 620 }
 621
 622 static inline int rt_fast_clean(struct rtable *rth)
 623 {
 624         /* Kill broadcast/multicast entries very aggresively, if they
 625            collide in hash table with more useful entries */
 626         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 627                 rth->fl.iif && rth->u.dst.rt_next;
 628 }
 629
 630 static inline int rt_valuable(struct rtable *rth)
 631 {
 632         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 633                 rth->u.dst.expires;
 634 }
 635
 636 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 637 {
 638         unsigned long age;
 639         int ret = 0;
 640
 641         if (atomic_read(&rth->u.dst.__refcnt))
 642                 goto out;
 643
 644         ret = 1;
 645         if (rth->u.dst.expires &&
 646             time_after_eq(jiffies, rth->u.dst.expires))
 647                 goto out;
 648
 649         age = jiffies - rth->u.dst.lastuse;
 650         ret = 0;
 651         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 652             (age <= tmo2 && rt_valuable(rth)))
 653                 goto out;
 654         ret = 1;
 655 out:    return ret;
 656 }
 657
 658 /* Bits of score are:
 659  * 31: very valuable
 660  * 30: not quite useless
 661  * 29..0: usage counter
 662  */
 663 static inline u32 rt_score(struct rtable *rt)
 664 {
 665         u32 score = jiffies - rt->u.dst.lastuse;
 666
 667         score = ~score & ~(3<<30);
 668
 669         if (rt_valuable(rt))
 670                 score |= (1<<31);
 671
 672         if (!rt->fl.iif ||
 673             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 674                 score |= (1<<30);
 675
 676         return score;
 677 }
 678
 679 static inline bool rt_caching(const struct net *net)
 680 {
 681         return net->ipv4.current_rt_cache_rebuild_count <=
 682                 net->ipv4.sysctl_rt_cache_rebuild_count;
 683 }
 684
 685 static inline bool compare_hash_inputs(const struct flowi *fl1,
 686                                         const struct flowi *fl2)
 687 {
 688         return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
 689                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
 690                 (fl1->iif ^ fl2->iif)) == 0);
 691 }
 692
 693 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 694 {
 695         return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
 696                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
 697                 (fl1->mark ^ fl2->mark) |
 698                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
 699                  *(u16 *)&fl2->nl_u.ip4_u.tos) |
 700                 (fl1->oif ^ fl2->oif) |
 701                 (fl1->iif ^ fl2->iif)) == 0;
 702 }
 703
 704 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 705 {
 706         return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
 707 }
 708
 709 static inline int rt_is_expired(struct rtable *rth)
 710 {
 711         return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
 712 }
 713
 714 /*
 715  * Perform a full scan of hash table and free all entries.
      * (With CONFIG_NET_NS only entries for which rt_is_expired() is true
      * are unlinked and freed; otherwise every chain is emptied.)
 716  * Can be called by a softirq or a process.
 717  * In the latter case, we want to reschedule if necessary
      * (@process_context is non-zero).
 718  */
 719 static void rt_do_flush(int process_context)
 720 {
 721         unsigned int i;
 722         struct rtable *rth, *next;
 723         struct rtable * tail;
 724
 725         for (i = 0; i <= rt_hash_mask; i++) {
 726                 if (process_context && need_resched())
 727                         cond_resched();
                     /* Unlocked peek: empty buckets are skipped without locking. */
 728                 rth = rt_hash_table[i].chain;
 729                 if (!rth)
 730                         continue;
 731
 732                 spin_lock_bh(rt_hash_lock_addr(i));
 733 #ifdef CONFIG_NET_NS
 734                 {
 735                 struct rtable ** prev, * p;
 736
                     /* Re-read the head now that the bucket lock is held. */
 737                 rth = rt_hash_table[i].chain;
 738
 739                 /* defer releasing the head of the list after spin_unlock */
                     /* tail = first non-expired entry; [rth, tail) is detached here. */
 740                 for (tail = rth; tail; tail = tail->u.dst.rt_next)
 741                         if (!rt_is_expired(tail))
 742                                 break;
 743                 if (rth != tail)
 744                         rt_hash_table[i].chain = tail;
 745
 746                 /* call rt_free on entries after the tail requiring flush */
 747                 prev = &rt_hash_table[i].chain;
 748                 for (p = *prev; p; p = next) {
 749                         next = p->u.dst.rt_next;
 750                         if (!rt_is_expired(p)) {
 751                                 prev = &p->u.dst.rt_next;
 752                         } else {
 753                                 *prev = next;
 754                                 rt_free(p);
 755                         }
 756                 }
 757                 }
 758 #else
                     /* Detach the whole chain; tail == NULL makes the loop below free it all. */
 759                 rth = rt_hash_table[i].chain;
 760                 rt_hash_table[i].chain = NULL;
 761                 tail = NULL;
 762 #endif
 763                 spin_unlock_bh(rt_hash_lock_addr(i));
 764
                     /* Free the detached run [rth, tail) after the lock is dropped. */
 765                 for (; rth != tail; rth = next) {
 766                         next = rth->u.dst.rt_next;
 767                         rt_free(rth);
 768                 }
 769         }
 770 }
 771
 772 /*
 773  * While freeing expired entries, we compute the average chain length
 774  * and its standard deviation, using fixed-point arithmetic.
 775  * This gives an estimate of rt_chain_length_max:
 776  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
 777  * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
 778  */
 779
 780 #define FRACT_BITS 3
 781 #define ONE (1UL << FRACT_BITS)
 782
 783 static void rt_check_expire(void)
 784 {
             /*
              * Scan part of the route hash table, freeing entries that are stale
              * (rt_is_expired()), past their expiry time, or aged out according
              * to rt_may_expire(), and re-estimate rt_chain_length_max from the
              * chain lengths seen.  'rover' remembers where the previous run
              * stopped, so successive runs continue from there.
              */
 785         static unsigned int rover;
 786         unsigned int i = rover, goal;
 787         struct rtable *rth, *aux, **rthp;
 788         unsigned long samples = 0;
 789         unsigned long sum = 0, sum2 = 0;
 790         u64 mult;
 791
             /*
              * Buckets to visit this run: (gc_interval << rt_hash_log) divided by
              * gc_timeout (the division is skipped when the timeout is <= 1),
              * capped at the table size.
              */
 792         mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
 793         if (ip_rt_gc_timeout > 1)
 794                 do_div(mult, ip_rt_gc_timeout);
 795         goal = (unsigned int)mult;
 796         if (goal > rt_hash_mask)
 797                 goal = rt_hash_mask + 1;
 798         for (; goal > 0; goal--) {
 799                 unsigned long tmo = ip_rt_gc_timeout;
 800                 unsigned long length;
 801
 802                 i = (i + 1) & rt_hash_mask;
 803                 rthp = &rt_hash_table[i].chain;
 804
 805                 if (need_resched())
 806                         cond_resched();
 807
                     /* Empty buckets still count as a sample (of length zero). */
 808                 samples++;
 809
 810                 if (*rthp == NULL)
 811                         continue;
 812                 length = 0;
 813                 spin_lock_bh(rt_hash_lock_addr(i));
 814                 while ((rth = *rthp) != NULL) {
 815                         prefetch(rth->u.dst.rt_next);
 816                         if (rt_is_expired(rth)) {
 817                                 *rthp = rth->u.dst.rt_next;
 818                                 rt_free(rth);
 819                                 continue;
 820                         }
 821                         if (rth->u.dst.expires) {
 822                                 /* Entry is expired even if it is in use */
 823                                 if (time_before_eq(jiffies, rth->u.dst.expires)) {
 824 nofree:
                                             /*
                                              * Entry is kept: halve the timeout
                                              * used for the rest of this chain
                                              * and step past the entry.
                                              */
 825                                         tmo >>= 1;
 826                                         rthp = &rth->u.dst.rt_next;
 827                                         /*
 828                                          * We only count entries on
 829                                          * a chain with equal hash inputs once
 830                                          * so that entries for different QOS
 831                                          * levels, and other non-hash input
 832                                          * attributes don't unfairly skew
 833                                          * the length computation
 834                                          */
 835                                         for (aux = rt_hash_table[i].chain;;) {
 836                                                 if (aux == rth) {
 837                                                         length += ONE;
 838                                                         break;
 839                                                 }
 840                                                 if (compare_hash_inputs(&aux->fl, &rth->fl))
 841                                                         break;
 842                                                 aux = aux->u.dst.rt_next;
 843                                         }
 844                                         continue;
 845                                 }
 846                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
 847                                 goto nofree;
 848
 849                         /* Cleanup aged off entries. */
 850                         *rthp = rth->u.dst.rt_next;
 851                         rt_free(rth);
 852                 }
 853                 spin_unlock_bh(rt_hash_lock_addr(i));
                     /* Fixed-point (FRACT_BITS) sum and sum of squares of chain lengths. */
 854                 sum += length;
 855                 sum2 += length*length;
 856         }
 857         if (samples) {
 858                 unsigned long avg = sum / samples;
 859                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
 860                 rt_chain_length_max = max_t(unsigned long,
 861                                         ip_rt_gc_elasticity,
 862                                         (avg + 4*sd) >> FRACT_BITS);
 863         }
 864         rover = i;
 865 }
 866
 867 /*
 868  * rt_worker_func() is run in process context.
 869  * we call rt_check_expire() to scan part of the hash table
      * and then queue expires_work again, so that the next scan runs
      * after ip_rt_gc_interval.
      *
      * @work: the work item being run; the body does not use it.
 870  */
 871 static void rt_worker_func(struct work_struct *work)
 872 {
 873         rt_check_expire();
 874         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 875 }
 876
 877 /*
 878  * Perturbation of rt_genid by a small quantity [1..256]
 879  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 880  * many times (2^24) without giving recent rt_genid.
 881  * Jenkins hash is strong enough that little changes of rt_genid are OK.
 882  */
 883 static void rt_cache_invalidate(struct net *net)
 884 {
 885         unsigned char shuffle;
 886
 887         get_random_bytes(&shuffle, sizeof(shuffle));
              /* shuffle is 0..255, so the increment is 1..256 and never zero. */
 888         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 889 }
 890
 891 /*
 892  * delay < 0  : invalidate cache (fast : entries will be deleted later)
 893  * delay >= 0 : invalidate & flush cache (can be long)
      *
      * Only the sign of @delay is examined.  The generation id of @net is
      * always changed first; the flush, when requested, is told whether we
      * are running outside of softirq context.
 894  */
 895 void rt_cache_flush(struct net *net, int delay)
 896 {
 897         rt_cache_invalidate(net);
 898         if (delay >= 0)
 899                 rt_do_flush(!in_softirq());
 900 }
 901
 902 /*
 903  * We change rt_genid and let gc do the cleanup
      *
      * @__net is the struct net pointer carried as an unsigned long
      * (presumably because this is installed as the rt_secret_timer
      * handler -- confirm at the timer setup site).  After invalidating,
      * the timer is re-armed ip_rt_secret_interval jiffies from now.
 904  */
 905 static void rt_secret_rebuild(unsigned long __net)
 906 {
 907         struct net *net = (struct net *)__net;
 908         rt_cache_invalidate(net);
 909         mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
 910 }
 911
 912 static void rt_secret_rebuild_oneshot(struct net *net)
 913 {
 914         del_timer_sync(&net->ipv4.rt_secret_timer);
 915         rt_cache_invalidate(net);
 916         if (ip_rt_secret_interval) {
 917                 net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval;
 918                 add_timer(&net->ipv4.rt_secret_timer);
 919         }
 920 }
 921
     /*
      * Called when a hash chain has grown too long: emit a rate limited
      * warning and force an immediate, out-of-schedule invalidation of the
      * routing cache of @net.
      */
 922 static void rt_emergency_hash_rebuild(struct net *net)
 923 {
 924         if (net_ratelimit()) {
 925                 printk(KERN_WARNING "Route hash chain too long!\n");
 926                 printk(KERN_WARNING "Adjust your secret_interval!\n");
 927         }
 928
 929         rt_secret_rebuild_oneshot(net);
 930 }
 931
 932 /*
 933    Short description of GC goals.
 934
 935    We want to build an algorithm which will keep the routing cache
 936    at some equilibrium point, where the number of aged off entries
 937    is kept approximately equal to the number of newly generated ones.
 938
 939    The current expiration strength is the variable "expire".
 940    We try to adjust it dynamically, so that if networking
 941    is idle expire is large enough to keep enough warm entries,
 942    and when load increases it is reduced to limit the cache size.
 943  */
 944
     /*
      * Garbage collect the routing cache.
      *
      * @ops: not referenced by the body, which works on ipv4_dst_ops directly.
      *
      * State kept across calls (function-local statics):
      *   expire      - current expiration strength, halved after every pass
      *                 that misses the goal and grown again at work_done
      *   last_gc     - jiffies of the last run that was not rate limited
      *   rover       - last hash bucket visited, so scans resume from there
      *   equilibrium - cache size the collector currently aims for
      *
      * Returns 0 normally and 1 when the cache still holds at least
      * ip_rt_max_size entries after collection ("dst cache overflow").
      */
 945 static int rt_garbage_collect(struct dst_ops *ops)
 946 {
 947         static unsigned long expire = RT_GC_TIMEOUT;
 948         static unsigned long last_gc;
 949         static int rover;
 950         static int equilibrium;
 951         struct rtable *rth, **rthp;
 952         unsigned long now = jiffies;
 953         int goal;
 954
 955         /*
 956          * Garbage collection is pretty expensive,
 957          * do not make it too frequently.
 958          */
 959
 960         RT_CACHE_STAT_INC(gc_total);
 961
              /* Rate limit, unless the cache has already reached its maximum. */
 962         if (now - last_gc < ip_rt_gc_min_interval &&
 963             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
 964                 RT_CACHE_STAT_INC(gc_ignored);
 965                 goto out;
 966         }
 967
 968         /* Calculate number of entries, which we want to expire now. */
 969         goal = atomic_read(&ipv4_dst_ops.entries) -
 970                 (ip_rt_gc_elasticity << rt_hash_log);
 971         if (goal <= 0) {
                      /*
                       * Not above the elasticity based limit: aim for the
                       * equilibrium (at least gc_thresh), and let the
                       * equilibrium rise by half of the excess, bounded by
                       * the number of hash buckets.
                       */
 972                 if (equilibrium < ipv4_dst_ops.gc_thresh)
 973                         equilibrium = ipv4_dst_ops.gc_thresh;
 974                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 975                 if (goal > 0) {
 976                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 977                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 978                 }
 979         } else {
 980                 /* We are in dangerous area. Try to reduce cache really
 981                  * aggressively.
 982                  */
 983                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 984                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
 985         }
 986
 987         if (now - last_gc >= ip_rt_gc_min_interval)
 988                 last_gc = now;
 989
 990         if (goal <= 0) {
 991                 equilibrium += goal;
 992                 goto work_done;
 993         }
 994
 995         do {
 996                 int i, k;
 997
                      /* At most one visit to every bucket, starting after rover. */
 998                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
 999                         unsigned long tmo = expire;
1000
1001                         k = (k + 1) & rt_hash_mask;
1002                         rthp = &rt_hash_table[k].chain;
1003                         spin_lock_bh(rt_hash_lock_addr(k));
1004                         while ((rth = *rthp) != NULL) {
1005                                 if (!rt_is_expired(rth) &&
1006                                         !rt_may_expire(rth, tmo, expire)) {
                                              /*
                                               * Entry is kept; the timeout is
                                               * halved for the entries further
                                               * down this chain.
                                               */
1007                                         tmo >>= 1;
1008                                         rthp = &rth->u.dst.rt_next;
1009                                         continue;
1010                                 }
                                      /* Unlink and free; rthp stays on the same link. */
1011                                 *rthp = rth->u.dst.rt_next;
1012                                 rt_free(rth);
1013                                 goal--;
1014                         }
1015                         spin_unlock_bh(rt_hash_lock_addr(k));
1016                         if (goal <= 0)
1017                                 break;
1018                 }
1019                 rover = k;
1020
1021                 if (goal <= 0)
1022                         goto work_done;
1023
1024                 /* Goal is not achieved. We stop process if:
1025
1026                    - if expire reduced to zero. Otherwise, expire is halved.
1027                    - if table is not full.
1028                    - if we are called from interrupt.
1029                    - jiffies check is just fallback/debug loop breaker.
1030                      We will not spin here for long time in any case.
1031                  */
1032
1033                 RT_CACHE_STAT_INC(gc_goal_miss);
1034
1035                 if (expire == 0)
1036                         break;
1037
1038                 expire >>= 1;
1039 #if RT_CACHE_DEBUG >= 2
1040                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1041                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
1042 #endif
1043
1044                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1045                         goto out;
1046         } while (!in_softirq() && time_before_eq(jiffies, now));
1047
              /* Gave up: this is only an error if the cache is really full. */
1048         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1049                 goto out;
1050         if (net_ratelimit())
1051                 printk(KERN_WARNING "dst cache overflow\n");
1052         RT_CACHE_STAT_INC(gc_dst_overflow);
1053         return 1;
1054
1055 work_done:
              /*
               * Goal reached: relax the expiration strength again, up to
               * ip_rt_gc_timeout (at once if the cache is below gc_thresh).
               */
1056         expire += ip_rt_gc_min_interval;
1057         if (expire > ip_rt_gc_timeout ||
1058             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
1059                 expire = ip_rt_gc_timeout;
1060 #if RT_CACHE_DEBUG >= 2
1061         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1062                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
1063 #endif
1064 out:    return 0;
1065 }
1066
1067 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
1068 {
1069         struct rtable   *rth, **rthp;
1070         unsigned long   now;
1071         struct rtable *cand, **candp;
1072         u32             min_score;
1073         int             chain_length;
1074         int attempts = !in_softirq();
1075
1076 restart:
1077         chain_length = 0;
1078         min_score = ~(u32)0;
1079         cand = NULL;
1080         candp = NULL;
1081         now = jiffies;
1082
1083         if (!rt_caching(dev_net(rt->u.dst.dev))) {
1084                 rt_drop(rt);
1085                 return 0;
1086         }
1087
1088         rthp = &rt_hash_table[hash].chain;
1089
1090         spin_lock_bh(rt_hash_lock_addr(hash));
1091         while ((rth = *rthp) != NULL) {
1092                 if (rt_is_expired(rth)) {
1093                         *rthp = rth->u.dst.rt_next;
1094                         rt_free(rth);
1095                         continue;
1096                 }
1097                 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1098                         /* Put it first */
1099                         *rthp = rth->u.dst.rt_next;
1100                         /*
1101                          * Since lookup is lockfree, the deletion
1102                          * must be visible to another weakly ordered CPU before
1103                          * the insertion at the start of the hash chain.
1104                          */
1105                         rcu_assign_pointer(rth->u.dst.rt_next,
1106                                            rt_hash_table[hash].chain);
1107                         /*
1108                          * Since lookup is lockfree, the update writes
1109                          * must be ordered for consistency on SMP.
1110                          */
1111                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1112
1113                         dst_use(&rth->u.dst, now);
1114                         spin_unlock_bh(rt_hash_lock_addr(hash));
1115
1116                         rt_drop(rt);
1117                         *rp = rth;
1118                         return 0;
1119                 }
1120
1121                 if (!atomic_read(&rth->u.dst.__refcnt)) {
1122                         u32 score = rt_score(rth);
1123
1124                         if (score <= min_score) {
1125                                 cand = rth;
1126                                 candp = rthp;
1127                                 min_score = score;
1128                         }
1129                 }
1130
1131                 chain_length++;
1132
1133                 rthp = &rth->u.dst.rt_next;
1134         }
1135
1136         if (cand) {
1137                 /* ip_rt_gc_elasticity used to be average length of chain
1138                  * length, when exceeded gc becomes really aggressive.
1139                  *
1140                  * The second limit is less certain. At the moment it allows
1141                  * only 2 entries per bucket. We will see.
1142                  */
1143                 if (chain_length > ip_rt_gc_elasticity) {
1144                         *candp = cand->u.dst.rt_next;
1145                         rt_free(cand);
1146                 }
1147         } else {
1148                 if (chain_length > rt_chain_length_max) {
1149                         struct net *net = dev_net(rt->u.dst.dev);
1150                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1151                         if (!rt_caching(dev_net(rt->u.dst.dev))) {
1152                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1153                                         rt->u.dst.dev->name, num);
1154                         }
1155                         rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev));
1156                 }
1157         }
1158
1159         /* Try to bind route to arp only if it is output
1160            route or unicast forwarding path.
1161          */
1162         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1163                 int err = arp_bind_neighbour(&rt->u.dst);
1164                 if (err) {
1165                         spin_unlock_bh(rt_hash_lock_addr(hash));
1166
1167                         if (err != -ENOBUFS) {
1168                                 rt_drop(rt);
1169                                 return err;
1170                         }
1171
1172                         /* Neighbour tables are full and nothing
1173                            can be released. Try to shrink route cache,
1174                            it is most likely it holds some neighbour records.
1175                          */
1176                         if (attempts-- > 0) {
1177                                 int saved_elasticity = ip_rt_gc_elasticity;
1178                                 int saved_int = ip_rt_gc_min_interval;
1179                                 ip_rt_gc_elasticity     = 1;
1180                                 ip_rt_gc_min_interval   = 0;
1181                                 rt_garbage_collect(&ipv4_dst_ops);
1182                                 ip_rt_gc_min_interval   = saved_int;
1183                                 ip_rt_gc_elasticity     = saved_elasticity;
1184                                 goto restart;
1185                         }
1186
1187                         if (net_ratelimit())
1188                                 printk(KERN_WARNING "Neighbour table overflow.\n");
1189                         rt_drop(rt);
1190                         return -ENOBUFS;
1191                 }
1192         }
1193
1194         rt->u.dst.rt_next = rt_hash_table[hash].chain;
1195
1196 #if RT_CACHE_DEBUG >= 2
1197         if (rt->u.dst.rt_next) {
1198                 struct rtable *trt;
1199                 printk(KERN_DEBUG "rt_cache @%02x: %pI4", hash, &rt->rt_dst);
1200                 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1201                         printk(" . %pI4", &trt->rt_dst);
1202                 printk("\n");
1203         }
1204 #endif
1205         /*
1206          * Since lookup is lockfree, we must make sure
1207          * previous writes to rt are comitted to memory
1208          * before making rt visible to other CPUS.
1209          */
1210         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1211
1212         spin_unlock_bh(rt_hash_lock_addr(hash));
1213         *rp = rt;
1214         return 0;
1215 }
1216
1217 void rt_bind_peer(struct rtable *rt, int create)
1218 {
1219         static DEFINE_SPINLOCK(rt_peer_lock);
1220         struct inet_peer *peer;
1221
1222         peer = inet_getpeer(rt->rt_dst, create);
1223
1224         spin_lock_bh(&rt_peer_lock);
1225         if (rt->peer == NULL) {
1226                 rt->peer = peer;
1227                 peer = NULL;
1228         }
1229         spin_unlock_bh(&rt_peer_lock);
1230         if (peer)
1231                 inet_putpeer(peer);
1232 }
1233
1234 /*
1235  * Peer allocation may fail only in serious out-of-memory conditions.  However
1236  * we still can generate some output.
1237  * Random ID selection looks a bit dangerous because we have no chances to
1238  * select ID being unique in a reasonable period of time.
1239  * But broken packet identifier may be better than no packet at all.
1240  */
     /*
      * Fallback IP ID selection: derive the ID from a global running value
      * mixed with the destination address of @iph.  The running value is
      * replaced by the new salt under ip_fb_id_lock, and the low 16 bits of
      * the salt become iph->id in network byte order.
      */
1241 static void ip_select_fb_ident(struct iphdr *iph)
1242 {
1243         static DEFINE_SPINLOCK(ip_fb_id_lock);
1244         static u32 ip_fallback_id;
1245         u32 salt;
1246
1247         spin_lock_bh(&ip_fb_id_lock);
1248         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1249         iph->id = htons(salt & 0xFFFF);
1250         ip_fallback_id = salt;
1251         spin_unlock_bh(&ip_fb_id_lock);
1252 }
1253
     /*
      * Choose the IP ID for @iph.
      *
      * @dst:  route of the packet, treated as a struct rtable; may be NULL,
      *        which is logged and handled by the fallback generator.
      * @more: passed through to inet_getid().
      *
      * The ID comes from the peer bound to the route (binding one on demand);
      * if no peer can be obtained, ip_select_fb_ident() is used instead.
      */
1254 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1255 {
1256         struct rtable *rt = (struct rtable *) dst;
1257
1258         if (rt) {
1259                 if (rt->peer == NULL)
1260                         rt_bind_peer(rt, 1);
1261
1262                 /* If peer is attached to destination, it is never detached,
1263                    so that we need not to grab a lock to dereference it.
1264                  */
1265                 if (rt->peer) {
1266                         iph->id = htons(inet_getid(rt->peer, more));
1267                         return;
1268                 }
1269         } else
1270                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1271                        __builtin_return_address(0));
1272
              /* No route or no peer: fall back to the global generator. */
1273         ip_select_fb_ident(iph);
1274 }
1275
     /*
      * Remove @rt from hash chain @hash, purging expired entries of the
      * same chain along the way.
      *
      * The caller's reference on @rt is released here with ip_rt_put(),
      * whether or not @rt is still found on the chain.
      */
1276 static void rt_del(unsigned hash, struct rtable *rt)
1277 {
1278         struct rtable **rthp, *aux;
1279
1280         rthp = &rt_hash_table[hash].chain;
1281         spin_lock_bh(rt_hash_lock_addr(hash));
1282         ip_rt_put(rt);
1283         while ((aux = *rthp) != NULL) {
1284                 if (aux == rt || rt_is_expired(aux)) {
                              /* Unlink; rthp keeps pointing at the same link. */
1285                         *rthp = aux->u.dst.rt_next;
1286                         rt_free(aux);
1287                         continue;
1288                 }
1289                 rthp = &aux->u.dst.rt_next;
1290         }
1291         spin_unlock_bh(rt_hash_lock_addr(hash));
1292 }
1293
     /*
      * Process a redirect: traffic from @saddr to @daddr that currently uses
      * gateway @old_gw on @dev is advised to use @new_gw instead.
      *
      * After validating the advice, every cache entry that matches the four
      * combinations of (source = @saddr or 0) x (oif = dev->ifindex or 0) is
      * replaced by a copy whose gateway is @new_gw and which is flagged
      * RTCF_REDIRECTED.  Rejected advice is only logged (when
      * CONFIG_IP_ROUTE_VERBOSE is set and martian logging is enabled).
      *
      * A reference on the in_device of @dev is held for the duration of the
      * call and released on every exit path.
      */
1294 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1295                     __be32 saddr, struct net_device *dev)
1296 {
1297         int i, k;
1298         struct in_device *in_dev = in_dev_get(dev);
1299         struct rtable *rth, **rthp;
1300         __be32  skeys[2] = { saddr, 0 };
1301         int  ikeys[2] = { dev->ifindex, 0 };
1302         struct netevent_redirect netevent;
1303         struct net *net;
1304
1305         if (!in_dev)
1306                 return;
1307
1308         net = dev_net(dev);
              /* Sanity checks on the advised gateway and the device policy. */
1309         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1310             || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1311             || ipv4_is_zeronet(new_gw))
1312                 goto reject_redirect;
1313
              /* Without a route cache there is nothing to update. */
1314         if (!rt_caching(net))
1315                 goto reject_redirect;
1316
1317         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1318                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1319                         goto reject_redirect;
1320                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1321                         goto reject_redirect;
1322         } else {
1323                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1324                         goto reject_redirect;
1325         }
1326
1327         for (i = 0; i < 2; i++) {
1328                 for (k = 0; k < 2; k++) {
1329                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1330                                                 rt_genid(net));
1331
1332                         rthp=&rt_hash_table[hash].chain;
1333
1334                         rcu_read_lock();
1335                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1336                                 struct rtable *rt;
1337
                                      /* Skip entries that do not match the lookup key. */
1338                                 if (rth->fl.fl4_dst != daddr ||
1339                                     rth->fl.fl4_src != skeys[i] ||
1340                                     rth->fl.oif != ikeys[k] ||
1341                                     rth->fl.iif != 0 ||
1342                                     rt_is_expired(rth) ||
1343                                     !net_eq(dev_net(rth->u.dst.dev), net)) {
1344                                         rthp = &rth->u.dst.rt_next;
1345                                         continue;
1346                                 }
1347
                                      /*
                                       * Key matches but the route is not the one
                                       * the redirect talks about: stop searching
                                       * this chain.
                                       */
1348                                 if (rth->rt_dst != daddr ||
1349                                     rth->rt_src != saddr ||
1350                                     rth->u.dst.error ||
1351                                     rth->rt_gateway != old_gw ||
1352                                     rth->u.dst.dev != dev)
1353                                         break;
1354
                                      /* Pin the old entry before leaving the RCU section. */
1355                                 dst_hold(&rth->u.dst);
1356                                 rcu_read_unlock();
1357
1358                                 rt = dst_alloc(&ipv4_dst_ops);
1359                                 if (rt == NULL) {
1360                                         ip_rt_put(rth);
1361                                         in_dev_put(in_dev);
1362                                         return;
1363                                 }
1364
1365                                 /* Copy all the information. */
1366                                 *rt = *rth;
                                      /*
                                       * The copy duplicated pointers only; reset the
                                       * per-entry state and take our own references.
                                       */
1367                                 rt->u.dst.__use         = 1;
1368                                 atomic_set(&rt->u.dst.__refcnt, 1);
1369                                 rt->u.dst.child         = NULL;
1370                                 if (rt->u.dst.dev)
1371                                         dev_hold(rt->u.dst.dev);
1372                                 if (rt->idev)
1373                                         in_dev_hold(rt->idev);
1374                                 rt->u.dst.obsolete      = 0;
1375                                 rt->u.dst.lastuse       = jiffies;
1376                                 rt->u.dst.path          = &rt->u.dst;
1377                                 rt->u.dst.neighbour     = NULL;
1378                                 rt->u.dst.hh            = NULL;
1379 #ifdef CONFIG_XFRM
1380                                 rt->u.dst.xfrm          = NULL;
1381 #endif
1382                                 rt->rt_genid            = rt_genid(net);
1383                                 rt->rt_flags            |= RTCF_REDIRECTED;
1384
1385                                 /* Gateway is different ... */
1386                                 rt->rt_gateway          = new_gw;
1387
1388                                 /* Redirect received -> path was valid */
1389                                 dst_confirm(&rth->u.dst);
1390
1391                                 if (rt->peer)
1392                                         atomic_inc(&rt->peer->refcnt);
1393
                                      /*
                                       * The new gateway must resolve to a valid
                                       * neighbour; otherwise start resolution (if a
                                       * neighbour was bound) and give up on this entry.
                                       */
1394                                 if (arp_bind_neighbour(&rt->u.dst) ||
1395                                     !(rt->u.dst.neighbour->nud_state &
1396                                             NUD_VALID)) {
1397                                         if (rt->u.dst.neighbour)
1398                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1399                                         ip_rt_put(rth);
1400                                         rt_drop(rt);
1401                                         goto do_next;
1402                                 }
1403
1404                                 netevent.old = &rth->u.dst;
1405                                 netevent.new = &rt->u.dst;
1406                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1407                                                         &netevent);
1408
                                      /*
                                       * rt_del() releases the reference taken above;
                                       * on successful insertion the reference returned
                                       * through &rt is dropped again.
                                       */
1409                                 rt_del(hash, rth);
1410                                 if (!rt_intern_hash(hash, rt, &rt))
1411                                         ip_rt_put(rt);
1412                                 goto do_next;
1413                         }
1414                         rcu_read_unlock();
1415                 do_next:
1416                         ;
1417                 }
1418         }
1419         in_dev_put(in_dev);
1420         return;
1421
1422 reject_redirect:
1423 #ifdef CONFIG_IP_ROUTE_VERBOSE
1424         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1425                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1426                         "  Advised path = %pI4 -> %pI4\n",
1427                        &old_gw, dev->name, &new_gw,
1428                        &saddr, &daddr);
1429 #endif
1430         in_dev_put(in_dev);
1431 }
1432
     /*
      * React to negative advice about route @dst (may be NULL).
      *
      * Returns @dst when the route is kept.  Returns NULL, with the caller's
      * reference released, when the route is obsolete, or when it is a
      * redirected or expiring route, which is additionally removed from the
      * cache.
      */
1433 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1434 {
1435         struct rtable *rt = (struct rtable *)dst;
1436         struct dst_entry *ret = dst;
1437
1438         if (rt) {
1439                 if (dst->obsolete) {
1440                         ip_rt_put(rt);
1441                         ret = NULL;
1442                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1443                            rt->u.dst.expires) {
1444                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1445                                                 rt->fl.oif,
1446                                                 rt_genid(dev_net(dst->dev)));
1447 #if RT_CACHE_DEBUG >= 1
1448                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1449                                 &rt->rt_dst, rt->fl.fl4_tos);
1450 #endif
                              /* rt_del() also drops the reference held by the caller. */
1451                         rt_del(hash, rt);
1452                         ret = NULL;
1453                 }
1454         }
1455         return ret;
1456 }
1457
1458 /*
1459  * Algorithm:
1460  *      1. The first ip_rt_redirect_number redirects are sent
1461  *         with exponential backoff, then we stop sending them at all,
1462  *         assuming that the host ignores our redirects.
1463  *      2. If we did not see packets requiring redirects
1464  *         during ip_rt_redirect_silence, we assume that the host
1465  *         forgot redirected route and start to send redirects again.
1466  *
1467  * This algorithm is much cheaper and more intelligent than dumb load limiting
1468  * in icmp.c.
1469  *
1470  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1471  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1472  */
1473
     /*
      * Send an ICMP host redirect for @skb, subject to the backoff scheme:
      * rate_tokens of the route counts the redirects sent so far and
      * rate_last records the time of the last relevant event.
      * Nothing is sent when the route's device has no in_device or when
      * sending redirects is disabled on it.
      */
1474 void ip_rt_send_redirect(struct sk_buff *skb)
1475 {
1476         struct rtable *rt = skb->rtable;
1477         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1478
1479         if (!in_dev)
1480                 return;
1481
1482         if (!IN_DEV_TX_REDIRECTS(in_dev))
1483                 goto out;
1484
1485         /* No redirected packets during ip_rt_redirect_silence;
1486          * reset the algorithm.
1487          */
1488         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1489                 rt->u.dst.rate_tokens = 0;
1490
1491         /* Too many ignored redirects; do not send anything
1492          * set u.dst.rate_last to the last seen redirected packet.
1493          */
1494         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1495                 rt->u.dst.rate_last = jiffies;
1496                 goto out;
1497         }
1498
1499         /* Check for load limit; set rate_last to the latest sent
1500          * redirect.
              * The required gap doubles with every redirect already sent.
1501          */
1502         if (rt->u.dst.rate_tokens == 0 ||
1503             time_after(jiffies,
1504                        (rt->u.dst.rate_last +
1505                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1506                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1507                 rt->u.dst.rate_last = jiffies;
1508                 ++rt->u.dst.rate_tokens;
1509 #ifdef CONFIG_IP_ROUTE_VERBOSE
                      /* Logged once, when the redirect limit has just been reached. */
1510                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1511                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1512                     net_ratelimit())
1513                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1514                                 &rt->rt_src, rt->rt_iif,
1515                                 &rt->rt_dst, &rt->rt_gateway);
1516 #endif
1517         }
1518 out:
1519         in_dev_put(in_dev);
1520 }
1521
1522 static int ip_error(struct sk_buff *skb)
1523 {
1524         struct rtable *rt = skb->rtable;
1525         unsigned long now;
1526         int code;
1527
1528         switch (rt->u.dst.error) {
1529                 case EINVAL:
1530                 default:
1531                         goto out;
1532                 case EHOSTUNREACH:
1533                         code = ICMP_HOST_UNREACH;
1534                         break;
1535                 case ENETUNREACH:
1536                         code = ICMP_NET_UNREACH;
1537                         IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
1538                                         IPSTATS_MIB_INNOROUTES);
1539                         break;
1540                 case EACCES:
1541                         code = ICMP_PKT_FILTERED;
1542                         break;
1543         }
1544
1545         now = jiffies;
1546         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1547         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1548                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1549         rt->u.dst.rate_last = now;
1550         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1551                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1552                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1553         }
1554
1555 out:    kfree_skb(skb);
1556         return 0;
1557 }
1558
1559 /*
1560  *      The last two values are not from the RFC but
1561  *      are needed for AMPRnet AX.25 paths.
1562  */
1563
1564 static const unsigned short mtu_plateau[] =
1565 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1566
     /*
      * Return the first plateau value strictly below @old_mtu.  The table is
      * in descending order, so that is the largest such value.  When
      * @old_mtu is not above any plateau, 68 is returned.
      */
1567 static inline unsigned short guess_mtu(unsigned short old_mtu)
1568 {
1569         unsigned int i = 0;
1570
1571         while (i < ARRAY_SIZE(mtu_plateau) && old_mtu <= mtu_plateau[i])
1572                 i++;
1573
1574         return i < ARRAY_SIZE(mtu_plateau) ? mtu_plateau[i] : 68;
1575 }
1576
1577 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1578                                  unsigned short new_mtu,
1579                                  struct net_device *dev)
1580 {
1581         int i, k;
1582         unsigned short old_mtu = ntohs(iph->tot_len);
1583         struct rtable *rth;
1584         int  ikeys[2] = { dev->ifindex, 0 };
1585         __be32  skeys[2] = { iph->saddr, 0, };
1586         __be32  daddr = iph->daddr;
1587         unsigned short est_mtu = 0;
1588
1589         if (ipv4_config.no_pmtu_disc)
1590                 return 0;
1591
1592         for (k = 0; k < 2; k++) {
1593                 for (i = 0; i < 2; i++) {
1594                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1595                                                 rt_genid(net));
1596
1597                         rcu_read_lock();
1598                         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1599                              rth = rcu_dereference(rth->u.dst.rt_next)) {
1600                                 unsigned short mtu = new_mtu;
1601
1602                                 if (rth->fl.fl4_dst != daddr ||
1603                                     rth->fl.fl4_src != skeys[i] ||
1604                                     rth->rt_dst != daddr ||
1605                                     rth->rt_src != iph->saddr ||
1606                                     rth->fl.oif != ikeys[k] ||
1607                                     rth->fl.iif != 0 ||
1608                                     dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1609                                     !net_eq(dev_net(rth->u.dst.dev), net) ||
1610                                     rt_is_expired(rth))
1611                                         continue;
1612
1613                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1614
1615                                         /* BSD 4.2 compatibility hack :-( */
1616                                         if (mtu == 0 &&
1617                                             old_mtu >= dst_mtu(&rth->u.dst) &&
1618                                             old_mtu >= 68 + (iph->ihl << 2))
1619                                                 old_mtu -= iph->ihl << 2;
1620
1621                                         mtu = guess_mtu(old_mtu);
1622                                 }
1623                                 if (mtu <= dst_mtu(&rth->u.dst)) {
1624                                         if (mtu < dst_mtu(&rth->u.dst)) {
1625                                                 dst_confirm(&rth->u.dst);
1626                                                 if (mtu < ip_rt_min_pmtu) {
1627                                                         mtu = ip_rt_min_pmtu;
1628                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1629                                                                 (1 << RTAX_MTU);
1630                                                 }
1631                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1632                                                 dst_set_expires(&rth->u.dst,
1633                                                         ip_rt_mtu_expires);
1634                                         }
1635                                         est_mtu = mtu;
1636                                 }
1637                         }
1638                         rcu_read_unlock();
1639                 }
1640         }
1641         return est_mtu ? : new_mtu;
1642 }
1643
1644 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1645 {
1646         if (dst_mtu(dst) > mtu && mtu >= 68 &&
1647             !(dst_metric_locked(dst, RTAX_MTU))) {
1648                 if (mtu < ip_rt_min_pmtu) {
1649                         mtu = ip_rt_min_pmtu;
1650                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1651                 }
1652                 dst->metrics[RTAX_MTU-1] = mtu;
1653                 dst_set_expires(dst, ip_rt_mtu_expires);
1654                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1655         }
1656 }
1657
1658 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1659 {
1660         return NULL;
1661 }
1662
1663 static void ipv4_dst_destroy(struct dst_entry *dst)
1664 {
1665         struct rtable *rt = (struct rtable *) dst;
1666         struct inet_peer *peer = rt->peer;
1667         struct in_device *idev = rt->idev;
1668
1669         if (peer) {
1670                 rt->peer = NULL;
1671                 inet_putpeer(peer);
1672         }
1673
1674         if (idev) {
1675                 rt->idev = NULL;
1676                 in_dev_put(idev);
1677         }
1678 }
1679
1680 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1681                             int how)
1682 {
1683         struct rtable *rt = (struct rtable *) dst;
1684         struct in_device *idev = rt->idev;
1685         if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1686                 struct in_device *loopback_idev =
1687                         in_dev_get(dev_net(dev)->loopback_dev);
1688                 if (loopback_idev) {
1689                         rt->idev = loopback_idev;
1690                         in_dev_put(idev);
1691                 }
1692         }
1693 }
1694
1695 static void ipv4_link_failure(struct sk_buff *skb)
1696 {
1697         struct rtable *rt;
1698
1699         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1700
1701         rt = skb->rtable;
1702         if (rt)
1703                 dst_set_expires(&rt->u.dst, 0);
1704 }
1705
1706 static int ip_rt_bug(struct sk_buff *skb)
1707 {
1708         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1709                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1710                 skb->dev ? skb->dev->name : "?");
1711         kfree_skb(skb);
1712         return 0;
1713 }
1714
1715 /*
1716    We do not cache source address of outgoing interface,
1717    because it is used only by IP RR, TS and SRR options,
1718    so that it out of fast path.
1719
1720    BTW remember: "addr" is allowed to be not aligned
1721    in IP options!
1722  */
1723
1724 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1725 {
1726         __be32 src;
1727         struct fib_result res;
1728
1729         if (rt->fl.iif == 0)
1730                 src = rt->rt_src;
1731         else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1732                 src = FIB_RES_PREFSRC(res);
1733                 fib_res_put(&res);
1734         } else
1735                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1736                                         RT_SCOPE_UNIVERSE);
1737         memcpy(addr, &src, 4);
1738 }
1739
#ifdef CONFIG_NET_CLS_ROUTE
/*
 *	Merge @tag into the route's tclassid.  The low and the high 16-bit
 *	halves are handled independently: a half is taken from @tag only
 *	if that half of tclassid is still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	u32 *classid = &rt->u.dst.tclassid;

	if ((*classid & 0xFFFF) == 0)
		*classid |= tag & 0xFFFF;
	if ((*classid & 0xFFFF0000) == 0)
		*classid |= tag & 0xFFFF0000;
}
#endif
1749
1750 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1751 {
1752         struct fib_info *fi = res->fi;
1753
1754         if (fi) {
1755                 if (FIB_RES_GW(*res) &&
1756                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1757                         rt->rt_gateway = FIB_RES_GW(*res);
1758                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1759                        sizeof(rt->u.dst.metrics));
1760                 if (fi->fib_mtu == 0) {
1761                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1762                         if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1763                             rt->rt_gateway != rt->rt_dst &&
1764                             rt->u.dst.dev->mtu > 576)
1765                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1766                 }
1767 #ifdef CONFIG_NET_CLS_ROUTE
1768                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1769 #endif
1770         } else
1771                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1772
1773         if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1774                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1775         if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
1776                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1777         if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1778                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1779                                        ip_rt_min_advmss);
1780         if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1781                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1782
1783 #ifdef CONFIG_NET_CLS_ROUTE
1784 #ifdef CONFIG_IP_MULTIPLE_TABLES
1785         set_class_tag(rt, fib_rules_tclass(res));
1786 #endif
1787         set_class_tag(rt, itag);
1788 #endif
1789         rt->rt_type = res->type;
1790 }
1791
1792 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1793                                 u8 tos, struct net_device *dev, int our)
1794 {
1795         unsigned hash;
1796         struct rtable *rth;
1797         __be32 spec_dst;
1798         struct in_device *in_dev = in_dev_get(dev);
1799         u32 itag = 0;
1800
1801         /* Primary sanity checks. */
1802
1803         if (in_dev == NULL)
1804                 return -EINVAL;
1805
1806         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1807             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1808                 goto e_inval;
1809
1810         if (ipv4_is_zeronet(saddr)) {
1811                 if (!ipv4_is_local_multicast(daddr))
1812                         goto e_inval;
1813                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1814         } else if (fib_validate_source(saddr, 0, tos, 0,
1815                                         dev, &spec_dst, &itag) < 0)
1816                 goto e_inval;
1817
1818         rth = dst_alloc(&ipv4_dst_ops);
1819         if (!rth)
1820                 goto e_nobufs;
1821
1822         rth->u.dst.output= ip_rt_bug;
1823
1824         atomic_set(&rth->u.dst.__refcnt, 1);
1825         rth->u.dst.flags= DST_HOST;
1826         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1827                 rth->u.dst.flags |= DST_NOPOLICY;
1828         rth->fl.fl4_dst = daddr;
1829         rth->rt_dst     = daddr;
1830         rth->fl.fl4_tos = tos;
1831         rth->fl.mark    = skb->mark;
1832         rth->fl.fl4_src = saddr;
1833         rth->rt_src     = saddr;
1834 #ifdef CONFIG_NET_CLS_ROUTE
1835         rth->u.dst.tclassid = itag;
1836 #endif
1837         rth->rt_iif     =
1838         rth->fl.iif     = dev->ifindex;
1839         rth->u.dst.dev  = init_net.loopback_dev;
1840         dev_hold(rth->u.dst.dev);
1841         rth->idev       = in_dev_get(rth->u.dst.dev);
1842         rth->fl.oif     = 0;
1843         rth->rt_gateway = daddr;
1844         rth->rt_spec_dst= spec_dst;
1845         rth->rt_genid   = rt_genid(dev_net(dev));
1846         rth->rt_flags   = RTCF_MULTICAST;
1847         rth->rt_type    = RTN_MULTICAST;
1848         if (our) {
1849                 rth->u.dst.input= ip_local_deliver;
1850                 rth->rt_flags |= RTCF_LOCAL;
1851         }
1852
1853 #ifdef CONFIG_IP_MROUTE
1854         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1855                 rth->u.dst.input = ip_mr_input;
1856 #endif
1857         RT_CACHE_STAT_INC(in_slow_mc);
1858
1859         in_dev_put(in_dev);
1860         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1861         return rt_intern_hash(hash, rth, &skb->rtable);
1862
1863 e_nobufs:
1864         in_dev_put(in_dev);
1865         return -ENOBUFS;
1866
1867 e_inval:
1868         in_dev_put(in_dev);
1869         return -EINVAL;
1870 }
1871
1872
1873 static void ip_handle_martian_source(struct net_device *dev,
1874                                      struct in_device *in_dev,
1875                                      struct sk_buff *skb,
1876                                      __be32 daddr,
1877                                      __be32 saddr)
1878 {
1879         RT_CACHE_STAT_INC(in_martian_src);
1880 #ifdef CONFIG_IP_ROUTE_VERBOSE
1881         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1882                 /*
1883                  *      RFC1812 recommendation, if source is martian,
1884                  *      the only hint is MAC header.
1885                  */
1886                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1887                         &daddr, &saddr, dev->name);
1888                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1889                         int i;
1890                         const unsigned char *p = skb_mac_header(skb);
1891                         printk(KERN_WARNING "ll header: ");
1892                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1893                                 printk("%02x", *p);
1894                                 if (i < (dev->hard_header_len - 1))
1895                                         printk(":");
1896                         }
1897                         printk("\n");
1898                 }
1899         }
1900 #endif
1901 }
1902
1903 static int __mkroute_input(struct sk_buff *skb,
1904                            struct fib_result *res,
1905                            struct in_device *in_dev,
1906                            __be32 daddr, __be32 saddr, u32 tos,
1907                            struct rtable **result)
1908 {
1909
1910         struct rtable *rth;
1911         int err;
1912         struct in_device *out_dev;
1913         unsigned flags = 0;
1914         __be32 spec_dst;
1915         u32 itag;
1916
1917         /* get a working reference to the output device */
1918         out_dev = in_dev_get(FIB_RES_DEV(*res));
1919         if (out_dev == NULL) {
1920                 if (net_ratelimit())
1921                         printk(KERN_CRIT "Bug in ip_route_input" \
1922                                "_slow(). Please, report\n");
1923                 return -EINVAL;
1924         }
1925
1926
1927         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1928                                   in_dev->dev, &spec_dst, &itag);
1929         if (err < 0) {
1930                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1931                                          saddr);
1932
1933                 err = -EINVAL;
1934                 goto cleanup;
1935         }
1936
1937         if (err)
1938                 flags |= RTCF_DIRECTSRC;
1939
1940         if (out_dev == in_dev && err &&
1941             (IN_DEV_SHARED_MEDIA(out_dev) ||
1942              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1943                 flags |= RTCF_DOREDIRECT;
1944
1945         if (skb->protocol != htons(ETH_P_IP)) {
1946                 /* Not IP (i.e. ARP). Do not create route, if it is
1947                  * invalid for proxy arp. DNAT routes are always valid.
1948                  */
1949                 if (out_dev == in_dev) {
1950                         err = -EINVAL;
1951                         goto cleanup;
1952                 }
1953         }
1954
1955
1956         rth = dst_alloc(&ipv4_dst_ops);
1957         if (!rth) {
1958                 err = -ENOBUFS;
1959                 goto cleanup;
1960         }
1961
1962         atomic_set(&rth->u.dst.__refcnt, 1);
1963         rth->u.dst.flags= DST_HOST;
1964         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1965                 rth->u.dst.flags |= DST_NOPOLICY;
1966         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1967                 rth->u.dst.flags |= DST_NOXFRM;
1968         rth->fl.fl4_dst = daddr;
1969         rth->rt_dst     = daddr;
1970         rth->fl.fl4_tos = tos;
1971         rth->fl.mark    = skb->mark;
1972         rth->fl.fl4_src = saddr;
1973         rth->rt_src     = saddr;
1974         rth->rt_gateway = daddr;
1975         rth->rt_iif     =
1976                 rth->fl.iif     = in_dev->dev->ifindex;
1977         rth->u.dst.dev  = (out_dev)->dev;
1978         dev_hold(rth->u.dst.dev);
1979         rth->idev       = in_dev_get(rth->u.dst.dev);
1980         rth->fl.oif     = 0;
1981         rth->rt_spec_dst= spec_dst;
1982
1983         rth->u.dst.input = ip_forward;
1984         rth->u.dst.output = ip_output;
1985         rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
1986
1987         rt_set_nexthop(rth, res, itag);
1988
1989         rth->rt_flags = flags;
1990
1991         *result = rth;
1992         err = 0;
1993  cleanup:
1994         /* release the working reference to the output device */
1995         in_dev_put(out_dev);
1996         return err;
1997 }
1998
1999 static int ip_mkroute_input(struct sk_buff *skb,
2000                             struct fib_result *res,
2001                             const struct flowi *fl,
2002                             struct in_device *in_dev,
2003                             __be32 daddr, __be32 saddr, u32 tos)
2004 {
2005         struct rtable* rth = NULL;
2006         int err;
2007         unsigned hash;
2008
2009 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2010         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2011                 fib_select_multipath(fl, res);
2012 #endif
2013
2014         /* create a routing cache entry */
2015         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2016         if (err)
2017                 return err;
2018
2019         /* put it into the cache */
2020         hash = rt_hash(daddr, saddr, fl->iif,
2021                        rt_genid(dev_net(rth->u.dst.dev)));
2022         return rt_intern_hash(hash, rth, &skb->rtable);
2023 }
2024
2025 /*
2026  *      NOTE. We drop all the packets that has local source
2027  *      addresses, because every properly looped back packet
2028  *      must have correct destination already attached by output routine.
2029  *
2030  *      Such approach solves two big problems:
2031  *      1. Not simplex devices are handled properly.
2032  *      2. IP spoofing attempts are filtered with 100% of guarantee.
2033  */
2034
2035 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2036                                u8 tos, struct net_device *dev)
2037 {
2038         struct fib_result res;
2039         struct in_device *in_dev = in_dev_get(dev);
2040         struct flowi fl = { .nl_u = { .ip4_u =
2041                                       { .daddr = daddr,
2042                                         .saddr = saddr,
2043                                         .tos = tos,
2044                                         .scope = RT_SCOPE_UNIVERSE,
2045                                       } },
2046                             .mark = skb->mark,
2047                             .iif = dev->ifindex };
2048         unsigned        flags = 0;
2049         u32             itag = 0;
2050         struct rtable * rth;
2051         unsigned        hash;
2052         __be32          spec_dst;
2053         int             err = -EINVAL;
2054         int             free_res = 0;
2055         struct net    * net = dev_net(dev);
2056
2057         /* IP on this device is disabled. */
2058
2059         if (!in_dev)
2060                 goto out;
2061
2062         /* Check for the most weird martians, which can be not detected
2063            by fib_lookup.
2064          */
2065
2066         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2067             ipv4_is_loopback(saddr))
2068                 goto martian_source;
2069
2070         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
2071                 goto brd_input;
2072
2073         /* Accept zero addresses only to limited broadcast;
2074          * I even do not know to fix it or not. Waiting for complains :-)
2075          */
2076         if (ipv4_is_zeronet(saddr))
2077                 goto martian_source;
2078
2079         if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
2080             ipv4_is_loopback(daddr))
2081                 goto martian_destination;
2082
2083         /*
2084          *      Now we are ready to route packet.
2085          */
2086         if ((err = fib_lookup(net, &fl, &res)) != 0) {
2087                 if (!IN_DEV_FORWARD(in_dev))
2088                         goto e_hostunreach;
2089                 goto no_route;
2090         }
2091         free_res = 1;
2092
2093         RT_CACHE_STAT_INC(in_slow_tot);
2094
2095         if (res.type == RTN_BROADCAST)
2096                 goto brd_input;
2097
2098         if (res.type == RTN_LOCAL) {
2099                 int result;
2100                 result = fib_validate_source(saddr, daddr, tos,
2101                                              net->loopback_dev->ifindex,
2102                                              dev, &spec_dst, &itag);
2103                 if (result < 0)
2104                         goto martian_source;
2105                 if (result)
2106                         flags |= RTCF_DIRECTSRC;
2107                 spec_dst = daddr;
2108                 goto local_input;
2109         }
2110
2111         if (!IN_DEV_FORWARD(in_dev))
2112                 goto e_hostunreach;
2113         if (res.type != RTN_UNICAST)
2114                 goto martian_destination;
2115
2116         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2117 done:
2118         in_dev_put(in_dev);
2119         if (free_res)
2120                 fib_res_put(&res);
2121 out:    return err;
2122
2123 brd_input:
2124         if (skb->protocol != htons(ETH_P_IP))
2125                 goto e_inval;
2126
2127         if (ipv4_is_zeronet(saddr))
2128                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2129         else {
2130                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2131                                           &itag);
2132                 if (err < 0)
2133                         goto martian_source;
2134                 if (err)
2135                         flags |= RTCF_DIRECTSRC;
2136         }
2137         flags |= RTCF_BROADCAST;
2138         res.type = RTN_BROADCAST;
2139         RT_CACHE_STAT_INC(in_brd);
2140
2141 local_input:
2142         rth = dst_alloc(&ipv4_dst_ops);
2143         if (!rth)
2144                 goto e_nobufs;
2145
2146         rth->u.dst.output= ip_rt_bug;
2147         rth->rt_genid = rt_genid(net);
2148
2149         atomic_set(&rth->u.dst.__refcnt, 1);
2150         rth->u.dst.flags= DST_HOST;
2151         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2152                 rth->u.dst.flags |= DST_NOPOLICY;
2153         rth->fl.fl4_dst = daddr;
2154         rth->rt_dst     = daddr;
2155         rth->fl.fl4_tos = tos;
2156         rth->fl.mark    = skb->mark;
2157         rth->fl.fl4_src = saddr;
2158         rth->rt_src     = saddr;
2159 #ifdef CONFIG_NET_CLS_ROUTE
2160         rth->u.dst.tclassid = itag;
2161 #endif
2162         rth->rt_iif     =
2163         rth->fl.iif     = dev->ifindex;
2164         rth->u.dst.dev  = net->loopback_dev;
2165         dev_hold(rth->u.dst.dev);
2166         rth->idev       = in_dev_get(rth->u.dst.dev);
2167         rth->rt_gateway = daddr;
2168         rth->rt_spec_dst= spec_dst;
2169         rth->u.dst.input= ip_local_deliver;
2170         rth->rt_flags   = flags|RTCF_LOCAL;
2171         if (res.type == RTN_UNREACHABLE) {
2172                 rth->u.dst.input= ip_error;
2173                 rth->u.dst.error= -err;
2174                 rth->rt_flags   &= ~RTCF_LOCAL;
2175         }
2176         rth->rt_type    = res.type;
2177         hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2178         err = rt_intern_hash(hash, rth, &skb->rtable);
2179         goto done;
2180
2181 no_route:
2182         RT_CACHE_STAT_INC(in_no_route);
2183         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2184         res.type = RTN_UNREACHABLE;
2185         if (err == -ESRCH)
2186                 err = -ENETUNREACH;
2187         goto local_input;
2188
2189         /*
2190          *      Do not cache martian addresses: they should be logged (RFC1812)
2191          */
2192 martian_destination:
2193         RT_CACHE_STAT_INC(in_martian_dst);
2194 #ifdef CONFIG_IP_ROUTE_VERBOSE
2195         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2196                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2197                         &daddr, &saddr, dev->name);
2198 #endif
2199
2200 e_hostunreach:
2201         err = -EHOSTUNREACH;
2202         goto done;
2203
2204 e_inval:
2205         err = -EINVAL;
2206         goto done;
2207
2208 e_nobufs:
2209         err = -ENOBUFS;
2210         goto done;
2211
2212 martian_source:
2213         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2214         goto e_inval;
2215 }
2216
2217 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2218                    u8 tos, struct net_device *dev)
2219 {
2220         struct rtable * rth;
2221         unsigned        hash;
2222         int iif = dev->ifindex;
2223         struct net *net;
2224
2225         net = dev_net(dev);
2226
2227         if (!rt_caching(net))
2228                 goto skip_cache;
2229
2230         tos &= IPTOS_RT_MASK;
2231         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2232
2233         rcu_read_lock();
2234         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2235              rth = rcu_dereference(rth->u.dst.rt_next)) {
2236                 if (((rth->fl.fl4_dst ^ daddr) |
2237                      (rth->fl.fl4_src ^ saddr) |
2238                      (rth->fl.iif ^ iif) |
2239                      rth->fl.oif |
2240                      (rth->fl.fl4_tos ^ tos)) == 0 &&
2241                     rth->fl.mark == skb->mark &&
2242                     net_eq(dev_net(rth->u.dst.dev), net) &&
2243                     !rt_is_expired(rth)) {
2244                         dst_use(&rth->u.dst, jiffies);
2245                         RT_CACHE_STAT_INC(in_hit);
2246                         rcu_read_unlock();
2247                         skb->rtable = rth;
2248                         return 0;
2249                 }
2250                 RT_CACHE_STAT_INC(in_hlist_search);
2251         }
2252         rcu_read_unlock();
2253
2254 skip_cache:
2255         /* Multicast recognition logic is moved from route cache to here.
2256            The problem was that too many Ethernet cards have broken/missing
2257            hardware multicast filters :-( As result the host on multicasting
2258            network acquires a lot of useless route cache entries, sort of
2259            SDR messages from all the world. Now we try to get rid of them.
2260            Really, provided software IP multicast filter is organized
2261            reasonably (at least, hashed), it does not result in a slowdown
2262            comparing with route cache reject entries.
2263            Note, that multicast routers are not affected, because
2264            route cache entry is created eventually.
2265          */
2266         if (ipv4_is_multicast(daddr)) {
2267                 struct in_device *in_dev;
2268
2269                 rcu_read_lock();
2270                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2271                         int our = ip_check_mc(in_dev, daddr, saddr,
2272                                 ip_hdr(skb)->protocol);
2273                         if (our
2274 #ifdef CONFIG_IP_MROUTE
2275                             || (!ipv4_is_local_multicast(daddr) &&
2276                                 IN_DEV_MFORWARD(in_dev))
2277 #endif
2278                             ) {
2279                                 rcu_read_unlock();
2280                                 return ip_route_input_mc(skb, daddr, saddr,
2281                                                          tos, dev, our);
2282                         }
2283                 }
2284                 rcu_read_unlock();
2285                 return -EINVAL;
2286         }
2287         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2288 }
2289
2290 static int __mkroute_output(struct rtable **result,
2291                             struct fib_result *res,
2292                             const struct flowi *fl,
2293                             const struct flowi *oldflp,
2294                             struct net_device *dev_out,
2295                             unsigned flags)
2296 {
2297         struct rtable *rth;
2298         struct in_device *in_dev;
2299         u32 tos = RT_FL_TOS(oldflp);
2300         int err = 0;
2301
2302         if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2303                 return -EINVAL;
2304
2305         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2306                 res->type = RTN_BROADCAST;
2307         else if (ipv4_is_multicast(fl->fl4_dst))
2308                 res->type = RTN_MULTICAST;
2309         else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2310                 return -EINVAL;
2311
2312         if (dev_out->flags & IFF_LOOPBACK)
2313                 flags |= RTCF_LOCAL;
2314
2315         /* get work reference to inet device */
2316         in_dev = in_dev_get(dev_out);
2317         if (!in_dev)
2318                 return -EINVAL;
2319
2320         if (res->type == RTN_BROADCAST) {
2321                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2322                 if (res->fi) {
2323                         fib_info_put(res->fi);
2324                         res->fi = NULL;
2325                 }
2326         } else if (res->type == RTN_MULTICAST) {
2327                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2328                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2329                                  oldflp->proto))
2330                         flags &= ~RTCF_LOCAL;
2331                 /* If multicast route do not exist use
2332                    default one, but do not gateway in this case.
2333                    Yes, it is hack.
2334                  */
2335                 if (res->fi && res->prefixlen < 4) {
2336                         fib_info_put(res->fi);
2337                         res->fi = NULL;
2338                 }
2339         }
2340
2341
2342         rth = dst_alloc(&ipv4_dst_ops);
2343         if (!rth) {
2344                 err = -ENOBUFS;
2345                 goto cleanup;
2346         }
2347
2348         atomic_set(&rth->u.dst.__refcnt, 1);
2349         rth->u.dst.flags= DST_HOST;
2350         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2351                 rth->u.dst.flags |= DST_NOXFRM;
2352         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2353                 rth->u.dst.flags |= DST_NOPOLICY;
2354
2355         rth->fl.fl4_dst = oldflp->fl4_dst;
2356         rth->fl.fl4_tos = tos;
2357         rth->fl.fl4_src = oldflp->fl4_src;
2358         rth->fl.oif     = oldflp->oif;
2359         rth->fl.mark    = oldflp->mark;
2360         rth->rt_dst     = fl->fl4_dst;
2361         rth->rt_src     = fl->fl4_src;
2362         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2363         /* get references to the devices that are to be hold by the routing
2364            cache entry */
2365         rth->u.dst.dev  = dev_out;
2366         dev_hold(dev_out);
2367         rth->idev       = in_dev_get(dev_out);
2368         rth->rt_gateway = fl->fl4_dst;
2369         rth->rt_spec_dst= fl->fl4_src;
2370
2371         rth->u.dst.output=ip_output;
2372         rth->rt_genid = rt_genid(dev_net(dev_out));
2373
2374         RT_CACHE_STAT_INC(out_slow_tot);
2375
2376         if (flags & RTCF_LOCAL) {
2377                 rth->u.dst.input = ip_local_deliver;
2378                 rth->rt_spec_dst = fl->fl4_dst;
2379         }
2380         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2381                 rth->rt_spec_dst = fl->fl4_src;
2382                 if (flags & RTCF_LOCAL &&
2383                     !(dev_out->flags & IFF_LOOPBACK)) {
2384                         rth->u.dst.output = ip_mc_output;
2385                         RT_CACHE_STAT_INC(out_slow_mc);
2386                 }
2387 #ifdef CONFIG_IP_MROUTE
2388                 if (res->type == RTN_MULTICAST) {
2389                         if (IN_DEV_MFORWARD(in_dev) &&
2390                             !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2391                                 rth->u.dst.input = ip_mr_input;
2392                                 rth->u.dst.output = ip_mc_output;
2393                         }
2394                 }
2395 #endif
2396         }
2397
2398         rt_set_nexthop(rth, res, 0);
2399
2400         rth->rt_flags = flags;
2401
2402         *result = rth;
2403  cleanup:
2404         /* release work reference to inet device */
2405         in_dev_put(in_dev);
2406
2407         return err;
2408 }
2409
     /*
      * Build an output route with __mkroute_output() and, if that succeeds,
      * insert it into the route cache with rt_intern_hash().  The hash is
      * computed from the caller's original flow (oldflp) together with
      * rt_genid() of dev_out's namespace.
      *
      * Returns 0 on success, otherwise the error reported by whichever of
      * the two helpers failed.
      */
2410 static int ip_mkroute_output(struct rtable **rp,
2411                              struct fib_result *res,
2412                              const struct flowi *fl,
2413                              const struct flowi *oldflp,
2414                              struct net_device *dev_out,
2415                              unsigned flags)
2416 {
2417         struct rtable *new_rt = NULL;
2418         unsigned bucket;
2419         int rc;
2420
2421         rc = __mkroute_output(&new_rt, res, fl, oldflp, dev_out, flags);
2422         if (rc != 0)
2423                 return rc;
2424         bucket = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2425                          rt_genid(dev_net(dev_out)));
2426         return rt_intern_hash(bucket, new_rt, rp);
2427 }
2428
2429 /*
2430  * Major route resolver routine.
      *
      * Slow-path resolution of an output route for the flow described by
      * oldflp.  A private lookup key (fl) is derived from oldflp; oldflp
      * itself is only read.  The work proceeds in this order:
      *   1. sanity-check a caller-supplied source address;
      *   2. resolve a caller-supplied output interface (oif);
      *   3. route an unspecified destination to the loopback device;
      *   4. consult the FIB with fib_lookup() and pick device and source.
      * Every successful path ends at make_route, where ip_mkroute_output()
      * builds the entry and stores it through rp.
      *
      * Returns 0 on success or a negative errno: -EINVAL, -ENODEV,
      * -ENETUNREACH, or whatever ip_mkroute_output() reports.
2431  */
2432
2433 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2434                                 const struct flowi *oldflp)
2435 {
2436         u32 tos = RT_FL_TOS(oldflp);
             /* Lookup key: scope is narrowed to LINK when RTO_ONLINK is set. */
2437         struct flowi fl = { .nl_u = { .ip4_u =
2438                                       { .daddr = oldflp->fl4_dst,
2439                                         .saddr = oldflp->fl4_src,
2440                                         .tos = tos & IPTOS_RT_MASK,
2441                                         .scope = ((tos & RTO_ONLINK) ?
2442                                                   RT_SCOPE_LINK :
2443                                                   RT_SCOPE_UNIVERSE),
2444                                       } },
2445                             .mark = oldflp->mark,
2446                             .iif = net->loopback_dev->ifindex,
2447                             .oif = oldflp->oif };
2448         struct fib_result res;
2449         unsigned flags = 0;
2450         struct net_device *dev_out = NULL;
             /* Set once fib_lookup() succeeded; res is then dropped at make_route. */
2451         int free_res = 0;
2452         int err;
2453
2454
2455         res.fi          = NULL;
2456 #ifdef CONFIG_IP_MULTIPLE_TABLES
2457         res.r           = NULL;
2458 #endif
2459
             /* Step 1: a source address was requested by the caller. */
2460         if (oldflp->fl4_src) {
2461                 err = -EINVAL;
2462                 if (ipv4_is_multicast(oldflp->fl4_src) ||
2463                     ipv4_is_lbcast(oldflp->fl4_src) ||
2464                     ipv4_is_zeronet(oldflp->fl4_src))
2465                         goto out;
2466
2467                 /* I removed check for oif == dev_out->oif here.
2468                    It was wrong for two reasons:
2469                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2470                       is assigned to multiple interfaces.
2471                    2. Moreover, we are allowed to send packets with saddr
2472                       of another iface. --ANK
2473                  */
2474
2475                 if (oldflp->oif == 0
2476                     && (ipv4_is_multicast(oldflp->fl4_dst) ||
2477                         oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2478                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2479                         dev_out = ip_dev_find(net, oldflp->fl4_src);
2480                         if (dev_out == NULL)
2481                                 goto out;
2482
2483                         /* Special hack: user can direct multicasts
2484                            and limited broadcast via necessary interface
2485                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2486                            This hack is not just for fun, it allows
2487                            vic,vat and friends to work.
2488                            They bind socket to loopback, set ttl to zero
2489                            and expect that it will work.
2490                            From the viewpoint of routing cache they are broken,
2491                            because we are not allowed to build multicast path
2492                            with loopback source addr (look, routing cache
2493                            cannot know, that ttl is zero, so that packet
2494                            will not leave this host and route is valid).
2495                            Luckily, this hack is good workaround.
2496                          */
2497
                             /* dev_out stays referenced; it is released after make_route. */
2498                         fl.oif = dev_out->ifindex;
2499                         goto make_route;
2500                 }
2501
2502                 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2503                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2504                         dev_out = ip_dev_find(net, oldflp->fl4_src);
2505                         if (dev_out == NULL)
2506                                 goto out;
                             /* Only the existence check was wanted: drop the device again. */
2507                         dev_put(dev_out);
2508                         dev_out = NULL;
2509                 }
2510         }
2511
2512
             /* Step 2: an output interface was requested by the caller. */
2513         if (oldflp->oif) {
2514                 dev_out = dev_get_by_index(net, oldflp->oif);
2515                 err = -ENODEV;
2516                 if (dev_out == NULL)
2517                         goto out;
2518
2519                 /* RACE: Check return value of inet_select_addr instead. */
                     /*
                      * NOTE(review): err is still -ENODEV on this path although
                      * the device itself was found; the comment on the goto
                      * already flags the error code as wrong.
                      */
2520                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2521                         dev_put(dev_out);
2522                         goto out;       /* Wrong error code */
2523                 }
2524
2525                 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2526                     oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2527                         if (!fl.fl4_src)
2528                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2529                                                               RT_SCOPE_LINK);
2530                         goto make_route;
2531                 }
2532                 if (!fl.fl4_src) {
2533                         if (ipv4_is_multicast(oldflp->fl4_dst))
2534                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2535                                                               fl.fl4_scope);
2536                         else if (!oldflp->fl4_dst)
2537                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2538                                                               RT_SCOPE_HOST);
2539                 }
2540         }
2541
             /*
              * Step 3: no destination given.  Use the source address (or
              * INADDR_LOOPBACK when that is unset too) as destination and
              * route through the loopback device as a local route.
              */
2542         if (!fl.fl4_dst) {
2543                 fl.fl4_dst = fl.fl4_src;
2544                 if (!fl.fl4_dst)
2545                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2546                 if (dev_out)
2547                         dev_put(dev_out);
2548                 dev_out = net->loopback_dev;
2549                 dev_hold(dev_out);
2550                 fl.oif = net->loopback_dev->ifindex;
2551                 res.type = RTN_LOCAL;
2552                 flags |= RTCF_LOCAL;
2553                 goto make_route;
2554         }
2555
             /* Step 4: FIB lookup; a nonzero return is treated as "no route". */
2556         if (fib_lookup(net, &fl, &res)) {
2557                 res.fi = NULL;
2558                 if (oldflp->oif) {
2559                         /* Apparently, routing tables are wrong. Assume,
2560                            that the destination is on link.
2561
2562                            WHY? DW.
2563                            Because we are allowed to send to iface
2564                            even if it has NO routes and NO assigned
2565                            addresses. When oif is specified, routing
2566                            tables are looked up with only one purpose:
2567                            to catch if destination is gatewayed, rather than
2568                            direct. Moreover, if MSG_DONTROUTE is set,
2569                            we send packet, ignoring both routing tables
2570                            and ifaddr state. --ANK
2571
2572
2573                            We could make it even if oif is unknown,
2574                            likely IPv6, but we do not.
2575                          */
2576
2577                         if (fl.fl4_src == 0)
2578                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2579                                                               RT_SCOPE_LINK);
2580                         res.type = RTN_UNICAST;
2581                         goto make_route;
2582                 }
2583                 if (dev_out)
2584                         dev_put(dev_out);
2585                 err = -ENETUNREACH;
2586                 goto out;
2587         }
2588         free_res = 1;
2589
             /* Destination is local: deliver through the loopback device. */
2590         if (res.type == RTN_LOCAL) {
2591                 if (!fl.fl4_src)
2592                         fl.fl4_src = fl.fl4_dst;
2593                 if (dev_out)
2594                         dev_put(dev_out);
2595                 dev_out = net->loopback_dev;
2596                 dev_hold(dev_out);
2597                 fl.oif = dev_out->ifindex;
2598                 if (res.fi)
2599                         fib_info_put(res.fi);
2600                 res.fi = NULL;
2601                 flags |= RTCF_LOCAL;
2602                 goto make_route;
2603         }
2604
             /*
              * With multipath support the "else" below binds to the
              * default-route check that follows the #endif, so at most one
              * of the two selection helpers runs.
              */
2605 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2606         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2607                 fib_select_multipath(&fl, &res);
2608         else
2609 #endif
2610         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2611                 fib_select_default(net, &fl, &res);
2612
2613         if (!fl.fl4_src)
2614                 fl.fl4_src = FIB_RES_PREFSRC(res);
2615
             /* Replace any earlier device with the one from the FIB result. */
2616         if (dev_out)
2617                 dev_put(dev_out);
2618         dev_out = FIB_RES_DEV(res);
2619         dev_hold(dev_out);
2620         fl.oif = dev_out->ifindex;
2621
2622
             /* Common tail: build the route, then drop the local references. */
2623 make_route:
2624         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2625
2626
2627         if (free_res)
2628                 fib_res_put(&res);
2629         if (dev_out)
2630                 dev_put(dev_out);
2631 out:    return err;
2632 }
2633
     /*
      * Resolve an output route for flp, trying the route cache first.
      *
      * When rt_caching(net) is true, the chain selected by rt_hash() is
      * walked under rcu_read_lock_bh().  An entry matches when destination,
      * source, oif and mark equal the request, its fl.iif is 0, the TOS
      * bits under (IPTOS_RT_MASK | RTO_ONLINK) agree, its device belongs to
      * net and rt_is_expired() is false.  On a hit the entry is passed to
      * dst_use(), stored in *rp and 0 is returned.
      *
      * On a miss, or when caching is off, the result of
      * ip_route_output_slow() is returned.
      */
2634 int __ip_route_output_key(struct net *net, struct rtable **rp,
2635                           const struct flowi *flp)
2636 {
2637         unsigned hash;
2638         struct rtable *rth;
2639
2640         if (!rt_caching(net))
2641                 goto slow_output;
2642
2643         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2644
2645         rcu_read_lock_bh();
2646         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2647                 rth = rcu_dereference(rth->u.dst.rt_next)) {
2648                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2649                     rth->fl.fl4_src == flp->fl4_src &&
2650                     rth->fl.iif == 0 &&
2651                     rth->fl.oif == flp->oif &&
2652                     rth->fl.mark == flp->mark &&
2653                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2654                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2655                     net_eq(dev_net(rth->u.dst.dev), net) &&
2656                     !rt_is_expired(rth)) {
                             /* Cache hit: dst_use() runs before the RCU section ends. */
2657                         dst_use(&rth->u.dst, jiffies);
2658                         RT_CACHE_STAT_INC(out_hit);
2659                         rcu_read_unlock_bh();
2660                         *rp = rth;
2661                         return 0;
2662                 }
                     /* Counted once per chain entry that did not match. */
2663                 RT_CACHE_STAT_INC(out_hlist_search);
2664         }
2665         rcu_read_unlock_bh();
2666
2667 slow_output:
2668         return ip_route_output_slow(net, rp, flp);
2669 }
2670
2671 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2672
     /*
      * No-op update_pmtu handler installed in ipv4_dst_blackhole_ops: both
      * arguments are ignored.
      */
2673 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2674 {
2675 }
2676
     /*
      * dst_ops used by ipv4_dst_blackhole() when it allocates a blackhole
      * entry.  destroy and check reuse the regular IPv4 handlers, while
      * update_pmtu is the empty ipv4_rt_blackhole_update_pmtu().
      */
2677 static struct dst_ops ipv4_dst_blackhole_ops = {
2678         .family                 =       AF_INET,
2679         .protocol               =       cpu_to_be16(ETH_P_IP),
2680         .destroy                =       ipv4_dst_destroy,
2681         .check                  =       ipv4_dst_check,
2682         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2683         .entries                =       ATOMIC_INIT(0),
2684 };
2685
2686
     /*
      * Replace the route in *rp with a blackhole copy whose input and
      * output handlers are both dst_discard.
      *
      * The copy is allocated from ipv4_dst_blackhole_ops and inherits the
      * metrics, device, flow key, inet device, addressing fields and peer
      * of the original, taking a reference on the device, inet device and
      * peer when they are set.  The original route is released with
      * dst_release() whether or not the allocation worked.
      *
      * Returns 0 with *rp pointing at the copy, or -ENOMEM with *rp set to
      * NULL.  flp is not used.
      */
2687 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2688 {
2689         struct rtable *orig = *rp;
2690         struct rtable *copy;
2691         struct dst_entry *d;
2692         int rc = -ENOMEM;
2693
2694         copy = (struct rtable *)dst_alloc(&ipv4_dst_blackhole_ops);
2695         if (copy == NULL)
2696                 goto release_orig;
2697
2698         d = &copy->u.dst;
2699         atomic_set(&d->__refcnt, 1);
2700         d->__use = 1;
2701         d->input = dst_discard;
2702         d->output = dst_discard;
2703         memcpy(d->metrics, orig->u.dst.metrics, RTAX_MAX*sizeof(u32));
2704         d->dev = orig->u.dst.dev;
2705         if (d->dev)
2706                 dev_hold(d->dev);
2707         copy->fl = orig->fl;
2708         copy->idev = orig->idev;
2709         if (copy->idev)
2710                 in_dev_hold(copy->idev);
2711         copy->rt_genid = rt_genid(net);
2712         copy->rt_flags = orig->rt_flags;
2713         copy->rt_type = orig->rt_type;
2714         copy->rt_dst = orig->rt_dst;
2715         copy->rt_src = orig->rt_src;
2716         copy->rt_iif = orig->rt_iif;
2717         copy->rt_gateway = orig->rt_gateway;
2718         copy->rt_spec_dst = orig->rt_spec_dst;
2719         copy->peer = orig->peer;
2720         if (copy->peer)
2721                 atomic_inc(&copy->peer->refcnt);
2722
2723         dst_free(d);
2724         rc = 0;
2725 release_orig:
2726         dst_release(&orig->u.dst);
2727         *rp = copy;
2728         return rc;
2729 }
2730
     /*
      * Resolve an output route for flp and, when flp->proto is set, run the
      * result through __xfrm_lookup().
      *
      * Before the xfrm lookup, unset source and destination addresses in
      * flp are filled in from the resolved route.  A nonzero flags argument
      * requests XFRM_LOOKUP_WAIT.  If the xfrm lookup returns -EREMOTE the
      * route is replaced by a blackhole copy via ipv4_dst_blackhole().
      *
      * Returns 0 on success or a negative errno from the failing step.
      */
2731 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2732                          struct sock *sk, int flags)
2733 {
2734         int xfrm_flags = flags ? XFRM_LOOKUP_WAIT : 0;
2735         int rc;
2736
2737         rc = __ip_route_output_key(net, rp, flp);
2738         if (rc != 0)
2739                 return rc;
2740
2741         if (!flp->proto)
2742                 return 0;
2743
2744         if (flp->fl4_src == 0)
2745                 flp->fl4_src = (*rp)->rt_src;
2746         if (flp->fl4_dst == 0)
2747                 flp->fl4_dst = (*rp)->rt_dst;
2748
2749         rc = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk, xfrm_flags);
2750         if (rc == -EREMOTE)
2751                 rc = ipv4_dst_blackhole(net, rp, flp);
2752         return rc;
2753 }
2754
2755 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2756
     /*
      * Convenience wrapper: ip_route_output_flow() with no socket (sk is
      * NULL) and flags of 0.  Returns whatever that call returns.
      */
2757 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2758 {
2759         return ip_route_output_flow(net, rp, flp, NULL, 0);
2760 }
2761
     /*
      * Append one netlink route message describing skb->rtable to skb.
      *
      * pid, seq, event and flags go into the netlink header via nlmsg_put().
      * nowait is forwarded to ipmr_get_route() and also selects how that
      * call's failures are handled.
      *
      * Returns the value of nlmsg_end() on success, or -EMSGSIZE when the
      * header or an attribute cannot be added (the partial message is
      * undone with nlmsg_cancel()).  It returns 0 when ipmr_get_route()
      * returned 0 and nowait is 0.
      *
      * NOTE(review): the NLA_PUT*() macros are expected to jump to
      * nla_put_failure when an attribute does not fit - confirm against
      * their definition.
      */
2762 static int rt_fill_info(struct net *net,
2763                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2764                         int nowait, unsigned int flags)
2765 {
2766         struct rtable *rt = skb->rtable;
2767         struct rtmsg *r;
2768         struct nlmsghdr *nlh;
2769         long expires;
2770         u32 id = 0, ts = 0, tsage = 0, error;
2771
2772         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2773         if (nlh == NULL)
2774                 return -EMSGSIZE;
2775
             /* Fixed rtmsg header; cache entries are reported as /32 in the main table. */
2776         r = nlmsg_data(nlh);
2777         r->rtm_family    = AF_INET;
2778         r->rtm_dst_len  = 32;
2779         r->rtm_src_len  = 0;
2780         r->rtm_tos      = rt->fl.fl4_tos;
2781         r->rtm_table    = RT_TABLE_MAIN;
2782         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2783         r->rtm_type     = rt->rt_type;
2784         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2785         r->rtm_protocol = RTPROT_UNSPEC;
             /* Keep only the bits of rt_flags above the low 16 and mark as cloned. */
2786         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2787         if (rt->rt_flags & RTCF_NOTIFY)
2788                 r->rtm_flags |= RTM_F_NOTIFY;
2789
2790         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2791
2792         if (rt->fl.fl4_src) {
2793                 r->rtm_src_len = 32;
2794                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2795         }
2796         if (rt->u.dst.dev)
2797                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2798 #ifdef CONFIG_NET_CLS_ROUTE
2799         if (rt->u.dst.tclassid)
2800                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2801 #endif
             /*
              * Preferred source: rt_spec_dst when fl.iif is set, otherwise
              * rt_src when it differs from the source in the flow key.
              */
2802         if (rt->fl.iif)
2803                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2804         else if (rt->rt_src != rt->fl.fl4_src)
2805                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2806
2807         if (rt->rt_dst != rt->rt_gateway)
2808                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2809
2810         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2811                 goto nla_put_failure;
2812
             /* Values handed to rtnl_put_cacheinfo() at the end. */
2813         error = rt->u.dst.error;
2814         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2815         if (rt->peer) {
2816                 id = rt->peer->ip_id_count;
2817                 if (rt->peer->tcp_ts_stamp) {
2818                         ts = rt->peer->tcp_ts;
2819                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2820                 }
2821         }
2822
2823         if (rt->fl.iif) {
2824 #ifdef CONFIG_IP_MROUTE
2825                 __be32 dst = rt->rt_dst;
2826
2827                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2828                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2829                         int err = ipmr_get_route(net, skb, r, nowait);
                             /*
                              * err <= 0: with nowait == 0 the function ends
                              * here (0 is returned as is, anything else is
                              * treated as a put failure); with nowait set
                              * only -EMSGSIZE aborts, other values are
                              * reported through the cacheinfo error field.
                              */
2830                         if (err <= 0) {
2831                                 if (!nowait) {
2832                                         if (err == 0)
2833                                                 return 0;
2834                                         goto nla_put_failure;
2835                                 } else {
2836                                         if (err == -EMSGSIZE)
2837                                                 goto nla_put_failure;
2838                                         error = err;
2839                                 }
2840                         }
2841                 } else
2842 #endif
2843                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2844         }
2845
2846         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2847                                expires, error) < 0)
2848                 goto nla_put_failure;
2849
2850         return nlmsg_end(skb, nlh);
2851
2852 nla_put_failure:
2853         nlmsg_cancel(skb, nlh);
2854         return -EMSGSIZE;
2855 }
2856
     /*
      * Netlink handler that resolves a single route and sends the answer
      * back to the requester.
      *
      * RTA_SRC, RTA_DST and RTA_IIF are read from the request (each
      * defaults to 0).  With an input interface the lookup goes through
      * ip_route_input() using a dummy skb; otherwise an output lookup is
      * done with ip_route_output_key(), honouring RTA_OIF and rtm_tos.  The
      * route is then encoded with rt_fill_info() and unicast to the
      * sender's pid.
      *
      * Returns the rtnl_unicast() result on success or a negative errno.
      * When rt_fill_info() returns 0 the reply skb is freed and 0 is
      * returned.  arg is not used.
      */
2857 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2858 {
2859         struct net *net = sock_net(in_skb->sk);
2860         struct rtmsg *rtm;
2861         struct nlattr *tb[RTA_MAX+1];
2862         struct rtable *rt = NULL;
2863         __be32 dst = 0;
2864         __be32 src = 0;
2865         u32 iif;
2866         int err;
2867         struct sk_buff *skb;
2868
2869         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2870         if (err < 0)
2871                 goto errout;
2872
2873         rtm = nlmsg_data(nlh);
2874
             /* This skb doubles as the dummy packet for ip_route_input() and as the reply. */
2875         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2876         if (skb == NULL) {
2877                 err = -ENOBUFS;
2878                 goto errout;
2879         }
2880
2881         /* Reserve room for dummy headers, this skb can pass
2882            through good chunk of routing engine.
2883          */
2884         skb_reset_mac_header(skb);
2885         skb_reset_network_header(skb);
2886
2887         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2888         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2889         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2890
2891         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2892         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2893         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2894
2895         if (iif) {
2896                 struct net_device *dev;
2897
2898                 dev = __dev_get_by_index(net, iif);
2899                 if (dev == NULL) {
2900                         err = -ENODEV;
2901                         goto errout_free;
2902                 }
2903
2904                 skb->protocol   = htons(ETH_P_IP);
2905                 skb->dev        = dev;
                     /* The input lookup is made with bottom halves disabled. */
2906                 local_bh_disable();
2907                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2908                 local_bh_enable();
2909
                     /* A route that carries an error is reported as a failed lookup. */
2910                 rt = skb->rtable;
2911                 if (err == 0 && rt->u.dst.error)
2912                         err = -rt->u.dst.error;
2913         } else {
2914                 struct flowi fl = {
2915                         .nl_u = {
2916                                 .ip4_u = {
2917                                         .daddr = dst,
2918                                         .saddr = src,
2919                                         .tos = rtm->rtm_tos,
2920                                 },
2921                         },
2922                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2923                 };
2924                 err = ip_route_output_key(net, &rt, &fl);
2925         }
2926
2927         if (err)
2928                 goto errout_free;
2929
             /* rt_fill_info() reads the route from skb->rtable. */
2930         skb->rtable = rt;
2931         if (rtm->rtm_flags & RTM_F_NOTIFY)
2932                 rt->rt_flags |= RTCF_NOTIFY;
2933
2934         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2935                            RTM_NEWROUTE, 0, 0);
2936         if (err <= 0)
2937                 goto errout_free;
2938
2939         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2940 errout:
2941         return err;
2942
2943 errout_free:
2944         kfree_skb(skb);
2945         goto errout;
2946 }
2947
     /*
      * Netlink dump callback for the route cache.
      *
      * Walks rt_hash_table starting at bucket cb->args[0] and chain index
      * cb->args[1].  Each entry that belongs to the requesting socket's
      * namespace and is not expired is encoded with rt_fill_info() (nowait
      * set, NLM_F_MULTI).  The walk stops as soon as rt_fill_info() returns
      * a value <= 0.  The current bucket and index are written back to
      * cb->args[] before returning.
      *
      * Returns skb->len.
      */
2948 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2949 {
2950         struct rtable *rt;
2951         int h, s_h;
2952         int idx, s_idx;
2953         struct net *net;
2954
2955         net = sock_net(skb->sk);
2956
2957         s_h = cb->args[0];
2958         if (s_h < 0)
2959                 s_h = 0;
2960         s_idx = idx = cb->args[1];
             /* s_idx only applies to the first bucket visited; later ones start at 0. */
2961         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
2962                 if (!rt_hash_table[h].chain)
2963                         continue;
2964                 rcu_read_lock_bh();
2965                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2966                      rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2967                         if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
2968                                 continue;
2969                         if (rt_is_expired(rt))
2970                                 continue;
                             /*
                              * The entry is attached to skb only for the
                              * duration of rt_fill_info() and released
                              * again on both outcomes.
                              */
2971                         skb->dst = dst_clone(&rt->u.dst);
2972                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
2973                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2974                                          1, NLM_F_MULTI) <= 0) {
2975                                 dst_release(xchg(&skb->dst, NULL));
2976                                 rcu_read_unlock_bh();
2977                                 goto done;
2978                         }
2979                         dst_release(xchg(&skb->dst, NULL));
2980                 }
2981                 rcu_read_unlock_bh();
2982         }
2983
2984 done:
2985         cb->args[0] = h;
2986         cb->args[1] = idx;
2987         return skb->len;
2988 }
2989
     /*
      * Flush the route cache of the namespace that owns in_dev's device by
      * calling rt_cache_flush() with a delay of 0.
      */
2990 void ip_rt_multicast_event(struct in_device *in_dev)
2991 {
2992         rt_cache_flush(dev_net(in_dev->dev), 0);
2993 }
2994
2995 #ifdef CONFIG_SYSCTL
2996 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2997                                         struct file *filp, void __user *buffer,
2998                                         size_t *lenp, loff_t *ppos)
2999 {
             /*
              * proc handler for a write-only "flush" sysctl.
              *
              * One integer (the flush delay) is parsed from the user buffer
              * with proc_dointvec() into a local variable, using a private
              * copy of the table entry so the shared entry is not modified.
              * rt_cache_flush() is then called for the namespace stored in
              * __ctl->extra1.
              *
              * Returns 0 on success, -EINVAL for reads, or the error from
              * proc_dointvec().  A failed parse returns before any flush,
              * and flush_delay is initialised, so rt_cache_flush() never
              * receives an indeterminate delay.
              */
3000         int flush_delay = 0;
3001         ctl_table ctl;
3002         int ret;
3003
3004         if (!write)
3005                 return -EINVAL;
3006
3007         memcpy(&ctl, __ctl, sizeof(ctl));
3008         ctl.data = &flush_delay;
3009         ret = proc_dointvec(&ctl, write, filp, buffer, lenp, ppos);
3010         if (ret)
3011                 return ret;
3012
3013         rt_cache_flush((struct net *)__ctl->extra1, flush_delay);
3014         return 0;
3015 }
3016
     /*
      * sysctl strategy routine for the route cache flush entry.
      *
      * Expects exactly sizeof(int) bytes at newval, fetches them from user
      * space as the flush delay and calls rt_cache_flush() for the namespace
      * stored in table->extra1.  oldval and oldlenp are not used.
      *
      * Returns 0 on success, -EINVAL for a wrong length, or -EFAULT when
      * the value cannot be read from user space.
      */
3017 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
3018                                                 void __user *oldval,
3019                                                 size_t __user *oldlenp,
3020                                                 void __user *newval,
3021                                                 size_t newlen)
3022 {
3023         int __user *uval = (int __user *)newval;
3024         int flush_delay;
3025
3026         if (newlen != sizeof(int))
3027                 return -EINVAL;
3028         if (get_user(flush_delay, uval))
3029                 return -EFAULT;
3030         rt_cache_flush((struct net *)table->extra1, flush_delay);
3031         return 0;
3032 }
3033
     /*
      * Re-arm every namespace's rt_secret_timer after ip_rt_secret_interval
      * changed from old to its current value.
      *
      * Nothing happens when the value is unchanged.  Otherwise, with the
      * RTNL lock held, each timer is stopped.  If the new interval is 0 it
      * stays stopped.  A timer that was pending is restarted with its
      * remaining time adjusted by the difference between new and old
      * interval (never below 0); a timer that was not pending is started
      * with the full new interval.
      */
3034 static void rt_secret_reschedule(int old)
3035 {
3036         const int interval = ip_rt_secret_interval;
3037         const int delta = interval - old;
3038         struct net *net;
3039
3040         if (delta == 0)
3041                 return;
3042
3043         rtnl_lock();
3044         for_each_net(net) {
3045                 long delay = interval;
3046                 int was_pending;
3047
3048                 was_pending = del_timer_sync(&net->ipv4.rt_secret_timer);
3049                 if (interval == 0)
3050                         continue;
3051
3052                 if (was_pending) {
3053                         delay = net->ipv4.rt_secret_timer.expires - jiffies;
3054                         if (delay > 0)
3055                                 delay += delta;
3056                         if (delay <= 0)
3057                                 delay = 0;
3058                 }
3059
3060                 net->ipv4.rt_secret_timer.expires = jiffies + delay;
3061                 add_timer(&net->ipv4.rt_secret_timer);
3062         }
3063         rtnl_unlock();
3064 }
3065
     /*
      * proc handler for "secret_interval": lets proc_dointvec_jiffies() do
      * the read or write, then calls rt_secret_reschedule() with the value
      * the interval had beforehand so the timers follow any change.
      *
      * Returns the result of proc_dointvec_jiffies().
      */
3066 static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
3067                                           struct file *filp,
3068                                           void __user *buffer, size_t *lenp,
3069                                           loff_t *ppos)
3070 {
3071         const int prev_interval = ip_rt_secret_interval;
3072         int rc;
3073
3074         rc = proc_dointvec_jiffies(ctl, write, filp, buffer, lenp, ppos);
3075         rt_secret_reschedule(prev_interval);
3076         return rc;
3077 }
3078
     /*
      * sysctl strategy routine for "secret_interval": delegates the access
      * to sysctl_jiffies(), then calls rt_secret_reschedule() with the value
      * the interval had beforehand so the timers follow any change.
      *
      * Returns the result of sysctl_jiffies().
      */
3079 static int ipv4_sysctl_rt_secret_interval_strategy(ctl_table *table,
3080                                                    void __user *oldval,
3081                                                    size_t __user *oldlenp,
3082                                                    void __user *newval,
3083                                                    size_t newlen)
3084 {
3085         const int prev_interval = ip_rt_secret_interval;
3086         int rc;
3087
3088         rc = sysctl_jiffies(table, oldval, oldlenp, newval, newlen);
3089         rt_secret_reschedule(prev_interval);
3090         return rc;
3091 }
3092
     /*
      * sysctl entries for the IPv4 routing tunables (child table of the
      * "route" directory declared in ipv4_skeleton).
      *
      * Every entry exposes one int with mode 0644.  Entries that use a
      * *_jiffies handler also set a matching .strategy; entries handled by
      * plain proc_dointvec set none.  "gc_min_interval" and
      * "gc_min_interval_ms" are two views of the same variable,
      * ip_rt_gc_min_interval.  The table ends with a zero ctl_name entry.
      */
3093 static ctl_table ipv4_route_table[] = {
3094         {
3095                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
3096                 .procname       = "gc_thresh",
3097                 .data           = &ipv4_dst_ops.gc_thresh,
3098                 .maxlen         = sizeof(int),
3099                 .mode           = 0644,
3100                 .proc_handler   = proc_dointvec,
3101         },
3102         {
3103                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
3104                 .procname       = "max_size",
3105                 .data           = &ip_rt_max_size,
3106                 .maxlen         = sizeof(int),
3107                 .mode           = 0644,
3108                 .proc_handler   = proc_dointvec,
3109         },
3110         {
3111                 /*  Deprecated. Use gc_min_interval_ms */
3112
3113                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
3114                 .procname       = "gc_min_interval",
3115                 .data           = &ip_rt_gc_min_interval,
3116                 .maxlen         = sizeof(int),
3117                 .mode           = 0644,
3118                 .proc_handler   = proc_dointvec_jiffies,
3119                 .strategy       = sysctl_jiffies,
3120         },
3121         {
3122                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
3123                 .procname       = "gc_min_interval_ms",
3124                 .data           = &ip_rt_gc_min_interval,
3125                 .maxlen         = sizeof(int),
3126                 .mode           = 0644,
3127                 .proc_handler   = proc_dointvec_ms_jiffies,
3128                 .strategy       = sysctl_ms_jiffies,
3129         },
3130         {
3131                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
3132                 .procname       = "gc_timeout",
3133                 .data           = &ip_rt_gc_timeout,
3134                 .maxlen         = sizeof(int),
3135                 .mode           = 0644,
3136                 .proc_handler   = proc_dointvec_jiffies,
3137                 .strategy       = sysctl_jiffies,
3138         },
3139         {
3140                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
3141                 .procname       = "gc_interval",
3142                 .data           = &ip_rt_gc_interval,
3143                 .maxlen         = sizeof(int),
3144                 .mode           = 0644,
3145                 .proc_handler   = proc_dointvec_jiffies,
3146                 .strategy       = sysctl_jiffies,
3147         },
3148         {
3149                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
3150                 .procname       = "redirect_load",
3151                 .data           = &ip_rt_redirect_load,
3152                 .maxlen         = sizeof(int),
3153                 .mode           = 0644,
3154                 .proc_handler   = proc_dointvec,
3155         },
3156         {
3157                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
3158                 .procname       = "redirect_number",
3159                 .data           = &ip_rt_redirect_number,
3160                 .maxlen         = sizeof(int),
3161                 .mode           = 0644,
3162                 .proc_handler   = proc_dointvec,
3163         },
3164         {
3165                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
3166                 .procname       = "redirect_silence",
3167                 .data           = &ip_rt_redirect_silence,
3168                 .maxlen         = sizeof(int),
3169                 .mode           = 0644,
3170                 .proc_handler   = proc_dointvec,
3171         },
3172         {
3173                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
3174                 .procname       = "error_cost",
3175                 .data           = &ip_rt_error_cost,
3176                 .maxlen         = sizeof(int),
3177                 .mode           = 0644,
3178                 .proc_handler   = proc_dointvec,
3179         },
3180         {
3181                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
3182                 .procname       = "error_burst",
3183                 .data           = &ip_rt_error_burst,
3184                 .maxlen         = sizeof(int),
3185                 .mode           = 0644,
3186                 .proc_handler   = proc_dointvec,
3187         },
3188         {
3189                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
3190                 .procname       = "gc_elasticity",
3191                 .data           = &ip_rt_gc_elasticity,
3192                 .maxlen         = sizeof(int),
3193                 .mode           = 0644,
3194                 .proc_handler   = proc_dointvec,
3195         },
3196         {
3197                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
3198                 .procname       = "mtu_expires",
3199                 .data           = &ip_rt_mtu_expires,
3200                 .maxlen         = sizeof(int),
3201                 .mode           = 0644,
3202                 .proc_handler   = proc_dointvec_jiffies,
3203                 .strategy       = sysctl_jiffies,
3204         },
3205         {
3206                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
3207                 .procname       = "min_pmtu",
3208                 .data           = &ip_rt_min_pmtu,
3209                 .maxlen         = sizeof(int),
3210                 .mode           = 0644,
3211                 .proc_handler   = proc_dointvec,
3212         },
3213         {
3214                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
3215                 .procname       = "min_adv_mss",
3216                 .data           = &ip_rt_min_advmss,
3217                 .maxlen         = sizeof(int),
3218                 .mode           = 0644,
3219                 .proc_handler   = proc_dointvec,
3220         },
             /* Custom handlers: a change also reschedules the secret timers. */
3221         {
3222                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
3223                 .procname       = "secret_interval",
3224                 .data           = &ip_rt_secret_interval,
3225                 .maxlen         = sizeof(int),
3226                 .mode           = 0644,
3227                 .proc_handler   = ipv4_sysctl_rt_secret_interval,
3228                 .strategy       = ipv4_sysctl_rt_secret_interval_strategy,
3229         },
3230         { .ctl_name = 0 }
3231 };
3232
3233 static struct ctl_table empty[1];
3234
3235 static struct ctl_table ipv4_skeleton[] =
3236 {
3237         { .procname = "route", .ctl_name = NET_IPV4_ROUTE,
3238           .mode = 0555, .child = ipv4_route_table},
3239         { .procname = "neigh", .ctl_name = NET_IPV4_NEIGH,
3240           .mode = 0555, .child = empty},
3241         { }
3242 };
3243
3244 static __net_initdata struct ctl_path ipv4_path[] = {
3245         { .procname = "net", .ctl_name = CTL_NET, },
3246         { .procname = "ipv4", .ctl_name = NET_IPV4, },
3247         { },
3248 };
3249
3250 static struct ctl_table ipv4_route_flush_table[] = {
3251         {
3252                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
3253                 .procname       = "flush",
3254                 .maxlen         = sizeof(int),
3255                 .mode           = 0200,
3256                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3257                 .strategy       = ipv4_sysctl_rtcache_flush_strategy,
3258         },
3259         { .ctl_name = 0 },
3260 };
3261
3262 static __net_initdata struct ctl_path ipv4_route_path[] = {
3263         { .procname = "net", .ctl_name = CTL_NET, },
3264         { .procname = "ipv4", .ctl_name = NET_IPV4, },
3265         { .procname = "route", .ctl_name = NET_IPV4_ROUTE, },
3266         { },
3267 };
3268
3269 static __net_init int sysctl_route_net_init(struct net *net)
3270 {
3271         struct ctl_table *tbl;
3272
3273         tbl = ipv4_route_flush_table;
3274         if (net != &init_net) {
3275                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3276                 if (tbl == NULL)
3277                         goto err_dup;
3278         }
3279         tbl[0].extra1 = net;
3280
3281         net->ipv4.route_hdr =
3282                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3283         if (net->ipv4.route_hdr == NULL)
3284                 goto err_reg;
3285         return 0;
3286
3287 err_reg:
3288         if (tbl != ipv4_route_flush_table)
3289                 kfree(tbl);
3290 err_dup:
3291         return -ENOMEM;
3292 }
3293
3294 static __net_exit void sysctl_route_net_exit(struct net *net)
3295 {
3296         struct ctl_table *tbl;
3297
3298         tbl = net->ipv4.route_hdr->ctl_table_arg;
3299         unregister_net_sysctl_table(net->ipv4.route_hdr);
3300         BUG_ON(tbl == ipv4_route_flush_table);
3301         kfree(tbl);
3302 }
3303
3304 static __net_initdata struct pernet_operations sysctl_route_ops = {
3305         .init = sysctl_route_net_init,
3306         .exit = sysctl_route_net_exit,
3307 };
3308 #endif
3309
3310
3311 static __net_init int rt_secret_timer_init(struct net *net)
3312 {
3313         atomic_set(&net->ipv4.rt_genid,
3314                         (int) ((num_physpages ^ (num_physpages>>8)) ^
3315                         (jiffies ^ (jiffies >> 7))));
3316
3317         net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
3318         net->ipv4.rt_secret_timer.data = (unsigned long)net;
3319         init_timer_deferrable(&net->ipv4.rt_secret_timer);
3320
3321         if (ip_rt_secret_interval) {
3322                 net->ipv4.rt_secret_timer.expires =
3323                         jiffies + net_random() % ip_rt_secret_interval +
3324                         ip_rt_secret_interval;
3325                 add_timer(&net->ipv4.rt_secret_timer);
3326         }
3327         return 0;
3328 }
3329
3330 static __net_exit void rt_secret_timer_exit(struct net *net)
3331 {
3332         del_timer_sync(&net->ipv4.rt_secret_timer);
3333 }
3334
3335 static __net_initdata struct pernet_operations rt_secret_timer_ops = {
3336         .init = rt_secret_timer_init,
3337         .exit = rt_secret_timer_exit,
3338 };
3339
3340
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Per-cpu accounting area; ip_rt_init() allocates room for 256
 * struct ip_rt_acct entries.
 */
struct ip_rt_acct *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */
3344
3345 static __initdata unsigned long rhash_entries;
3346 static int __init set_rhash_entries(char *str)
3347 {
3348         if (!str)
3349                 return 0;
3350         rhash_entries = simple_strtoul(str, &str, 0);
3351         return 1;
3352 }
3353 __setup("rhash_entries=", set_rhash_entries);
3354
3355 int __init ip_rt_init(void)
3356 {
3357         int rc = 0;
3358
3359 #ifdef CONFIG_NET_CLS_ROUTE
3360         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3361         if (!ip_rt_acct)
3362                 panic("IP: failed to allocate ip_rt_acct\n");
3363 #endif
3364
3365         ipv4_dst_ops.kmem_cachep =
3366                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3367                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3368
3369         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3370
3371         rt_hash_table = (struct rt_hash_bucket *)
3372                 alloc_large_system_hash("IP route cache",
3373                                         sizeof(struct rt_hash_bucket),
3374                                         rhash_entries,
3375                                         (num_physpages >= 128 * 1024) ?
3376                                         15 : 17,
3377                                         0,
3378                                         &rt_hash_log,
3379                                         &rt_hash_mask,
3380                                         rhash_entries ? 0 : 512 * 1024);
3381         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3382         rt_hash_lock_init();
3383
3384         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3385         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3386
3387         devinet_init();
3388         ip_fib_init();
3389
3390         /* All the timers, started at system startup tend
3391            to synchronize. Perturb it a bit.
3392          */
3393         schedule_delayed_work(&expires_work,
3394                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3395
3396         if (register_pernet_subsys(&rt_secret_timer_ops))
3397                 printk(KERN_ERR "Unable to setup rt_secret_timer\n");
3398
3399         if (ip_rt_proc_init())
3400                 printk(KERN_ERR "Unable to create route proc files\n");
3401 #ifdef CONFIG_XFRM
3402         xfrm_init();
3403         xfrm4_init();
3404 #endif
3405         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3406
3407 #ifdef CONFIG_SYSCTL
3408         register_pernet_subsys(&sysctl_route_ops);
3409 #endif
3410         return rc;
3411 }
3412
#ifdef CONFIG_SYSCTL
/*
 * Register the static net/ipv4 sysctl skeleton (ipv4_skeleton under
 * ipv4_path).
 *
 * This separate entry point only exists because of the current ipv4
 * initialisation order; once that order is cleaned up it can go away.
 */
void __init ip_static_sysctl_init(void)
{
        register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif
3423
3424 EXPORT_SYMBOL(__ip_select_ident);
3425 EXPORT_SYMBOL(ip_route_input);
3426 EXPORT_SYMBOL(ip_route_output_key);