kernel/2.6.29.6-aldebaran-rt/net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #include <linux/module.h>
  66 #include <asm/uaccess.h>
  67 #include <asm/system.h>
  68 #include <linux/bitops.h>
  69 #include <linux/types.h>
  70 #include <linux/kernel.h>
  71 #include <linux/mm.h>
  72 #include <linux/bootmem.h>
  73 #include <linux/string.h>
  74 #include <linux/socket.h>
  75 #include <linux/sockios.h>
  76 #include <linux/errno.h>
  77 #include <linux/in.h>
  78 #include <linux/inet.h>
  79 #include <linux/netdevice.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/init.h>
  82 #include <linux/workqueue.h>
  83 #include <linux/skbuff.h>
  84 #include <linux/inetdevice.h>
  85 #include <linux/igmp.h>
  86 #include <linux/pkt_sched.h>
  87 #include <linux/mroute.h>
  88 #include <linux/netfilter_ipv4.h>
  89 #include <linux/random.h>
  90 #include <linux/jhash.h>
  91 #include <linux/rcupdate.h>
  92 #include <linux/times.h>
  93 #include <net/dst.h>
  94 #include <net/net_namespace.h>
  95 #include <net/protocol.h>
  96 #include <net/ip.h>
  97 #include <net/route.h>
  98 #include <net/inetpeer.h>
  99 #include <net/sock.h>
 100 #include <net/ip_fib.h>
 101 #include <net/arp.h>
 102 #include <net/tcp.h>
 103 #include <net/icmp.h>
 104 #include <net/xfrm.h>
 105 #include <net/netevent.h>
 106 #include <net/rtnetlink.h>
 107 #ifdef CONFIG_SYSCTL
 108 #include <linux/sysctl.h>
 109 #endif
 110
 111 #define RT_FL_TOS(oldflp) \
 112     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
 113
 114 #define IP_MAX_MTU      0xFFF0
 115
 116 #define RT_GC_TIMEOUT (300*HZ)
 117
 118 static int ip_rt_max_size;
 119 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 120 static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
 121 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
 122 static int ip_rt_redirect_number __read_mostly  = 9;
 123 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 124 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 125 static int ip_rt_error_cost __read_mostly       = HZ;
 126 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 127 static int ip_rt_gc_elasticity __read_mostly    = 8;
 128 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 129 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 130 static int ip_rt_min_advmss __read_mostly       = 256;
 131 static int ip_rt_secret_interval __read_mostly  = 10 * 60 * HZ;
 132 static int rt_chain_length_max __read_mostly    = 20;
 133
 134 static void rt_worker_func(struct work_struct *work);
 135 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
 136
 137 /*
 138  *      Interface to generic destination cache.
 139  */
 140
 141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 142 static void              ipv4_dst_destroy(struct dst_entry *dst);
 143 static void              ipv4_dst_ifdown(struct dst_entry *dst,
 144                                          struct net_device *dev, int how);
 145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 146 static void              ipv4_link_failure(struct sk_buff *skb);
 147 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 148 static int rt_garbage_collect(struct dst_ops *ops);
 149 static void rt_emergency_hash_rebuild(struct net *net);
 150
 151
 152 static struct dst_ops ipv4_dst_ops = {
 153         .family =               AF_INET,
 154         .protocol =             __constant_htons(ETH_P_IP),
 155         .gc =                   rt_garbage_collect,
 156         .check =                ipv4_dst_check,
 157         .destroy =              ipv4_dst_destroy,
 158         .ifdown =               ipv4_dst_ifdown,
 159         .negative_advice =      ipv4_negative_advice,
 160         .link_failure =         ipv4_link_failure,
 161         .update_pmtu =          ip_rt_update_pmtu,
 162         .local_out =            __ip_local_out,
 163         .entries =              ATOMIC_INIT(0),
 164 };
 165
 166 #define ECN_OR_COST(class)      TC_PRIO_##class
 167
 168 const __u8 ip_tos2prio[16] = {
 169         TC_PRIO_BESTEFFORT,
 170         ECN_OR_COST(FILLER),
 171         TC_PRIO_BESTEFFORT,
 172         ECN_OR_COST(BESTEFFORT),
 173         TC_PRIO_BULK,
 174         ECN_OR_COST(BULK),
 175         TC_PRIO_BULK,
 176         ECN_OR_COST(BULK),
 177         TC_PRIO_INTERACTIVE,
 178         ECN_OR_COST(INTERACTIVE),
 179         TC_PRIO_INTERACTIVE,
 180         ECN_OR_COST(INTERACTIVE),
 181         TC_PRIO_INTERACTIVE_BULK,
 182         ECN_OR_COST(INTERACTIVE_BULK),
 183         TC_PRIO_INTERACTIVE_BULK,
 184         ECN_OR_COST(INTERACTIVE_BULK)
 185 };
 186
 187
 188 /*
 189  * Route cache.
 190  */
 191
  192 /* The locking scheme is rather straightforward:
 193  *
 194  * 1) Read-Copy Update protects the buckets of the central route hash.
 195  * 2) Only writers remove entries, and they hold the lock
 196  *    as they look at rtable reference counts.
 197  * 3) Only readers acquire references to rtable entries,
 198  *    they do so with atomic increments and with the
 199  *    lock held.
 200  */
 201
 202 struct rt_hash_bucket {
 203         struct rtable   *chain;
 204 };
 205
 206 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 207         defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_PREEMPT_RT)
 208 /*
 209  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 210  * The size of this table is a power of two and depends on the number of CPUS.
 211  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 212  */
 213 #if defined(CONFIG_LOCKDEP) || defined(CONFIG_PREEMPT_RT)
 214 # define RT_HASH_LOCK_SZ        256
 215 #else
 216 # if NR_CPUS >= 32
 217 #  define RT_HASH_LOCK_SZ       4096
 218 # elif NR_CPUS >= 16
 219 #  define RT_HASH_LOCK_SZ       2048
 220 # elif NR_CPUS >= 8
 221 #  define RT_HASH_LOCK_SZ       1024
 222 # elif NR_CPUS >= 4
 223 #  define RT_HASH_LOCK_SZ       512
 224 # else
 225 #  define RT_HASH_LOCK_SZ       256
 226 # endif
 227 #endif
 228
 229 static spinlock_t       *rt_hash_locks;
 230 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 231
 232 static __init void rt_hash_lock_init(void)
 233 {
 234         int i;
 235
 236         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
 237                         GFP_KERNEL);
 238         if (!rt_hash_locks)
 239                 panic("IP: failed to allocate rt_hash_locks\n");
 240
 241         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
 242                 spin_lock_init(&rt_hash_locks[i]);
 243 }
 244 #else
 245 # define rt_hash_lock_addr(slot) ((spinlock_t *)NULL)
 246
 247 static inline void rt_hash_lock_init(void)
 248 {
 249 }
 250 #endif
 251
 252 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
 253 static unsigned                 rt_hash_mask __read_mostly;
 254 static unsigned int             rt_hash_log  __read_mostly;
 255
 256 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 257 #define RT_CACHE_STAT_INC(field) \
 258         (__raw_get_cpu_var(rt_cache_stat).field++)
 259
 260 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
 261                 int genid)
 262 {
 263         return jhash_3words((__force u32)(__be32)(daddr),
 264                             (__force u32)(__be32)(saddr),
 265                             idx, genid)
 266                 & rt_hash_mask;
 267 }
 268
 269 static inline int rt_genid(struct net *net)
 270 {
 271         return atomic_read(&net->ipv4.rt_genid);
 272 }
 273
 274 #ifdef CONFIG_PROC_FS
 275 struct rt_cache_iter_state {
 276         struct seq_net_private p;
 277         int bucket;
 278         int genid;
 279 };
 280
 281 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 282 {
 283         struct rt_cache_iter_state *st = seq->private;
 284         struct rtable *r = NULL;
 285
 286         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 287                 if (!rt_hash_table[st->bucket].chain)
 288                         continue;
 289                 rcu_read_lock_bh();
 290                 r = rcu_dereference(rt_hash_table[st->bucket].chain);
 291                 while (r) {
 292                         if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
 293                             r->rt_genid == st->genid)
 294                                 return r;
 295                         r = rcu_dereference(r->u.dst.rt_next);
 296                 }
 297                 rcu_read_unlock_bh();
 298         }
 299         return r;
 300 }
 301
 302 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 303                                           struct rtable *r)
 304 {
 305         struct rt_cache_iter_state *st = seq->private;
 306
 307         r = r->u.dst.rt_next;
 308         while (!r) {
 309                 rcu_read_unlock_bh();
 310                 do {
 311                         if (--st->bucket < 0)
 312                                 return NULL;
 313                 } while (!rt_hash_table[st->bucket].chain);
 314                 rcu_read_lock_bh();
 315                 r = rt_hash_table[st->bucket].chain;
 316         }
 317         return rcu_dereference(r);
 318 }
 319
 320 static struct rtable *rt_cache_get_next(struct seq_file *seq,
 321                                         struct rtable *r)
 322 {
 323         struct rt_cache_iter_state *st = seq->private;
 324         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 325                 if (dev_net(r->u.dst.dev) != seq_file_net(seq))
 326                         continue;
 327                 if (r->rt_genid == st->genid)
 328                         break;
 329         }
 330         return r;
 331 }
 332
 333 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 334 {
 335         struct rtable *r = rt_cache_get_first(seq);
 336
 337         if (r)
 338                 while (pos && (r = rt_cache_get_next(seq, r)))
 339                         --pos;
 340         return pos ? NULL : r;
 341 }
 342
 343 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 344 {
 345         struct rt_cache_iter_state *st = seq->private;
 346         if (*pos)
 347                 return rt_cache_get_idx(seq, *pos - 1);
 348         st->genid = rt_genid(seq_file_net(seq));
 349         return SEQ_START_TOKEN;
 350 }
 351
 352 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 353 {
 354         struct rtable *r;
 355
 356         if (v == SEQ_START_TOKEN)
 357                 r = rt_cache_get_first(seq);
 358         else
 359                 r = rt_cache_get_next(seq, v);
 360         ++*pos;
 361         return r;
 362 }
 363
 364 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 365 {
 366         if (v && v != SEQ_START_TOKEN)
 367                 rcu_read_unlock_bh();
 368 }
 369
 370 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 371 {
 372         if (v == SEQ_START_TOKEN)
 373                 seq_printf(seq, "%-127s\n",
 374                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 375                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 376                            "HHUptod\tSpecDst");
 377         else {
 378                 struct rtable *r = v;
 379                 int len;
 380
 381                 seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
 382                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 383                         r->u.dst.dev ? r->u.dst.dev->name : "*",
 384                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
 385                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
 386                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
 387                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
 388                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
 389                         dst_metric(&r->u.dst, RTAX_WINDOW),
 390                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
 391                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
 392                         r->fl.fl4_tos,
 393                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
 394                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
 395                                        dev_queue_xmit) : 0,
 396                         r->rt_spec_dst, &len);
 397
 398                 seq_printf(seq, "%*s\n", 127 - len, "");
 399         }
 400         return 0;
 401 }
 402
 403 static const struct seq_operations rt_cache_seq_ops = {
 404         .start  = rt_cache_seq_start,
 405         .next   = rt_cache_seq_next,
 406         .stop   = rt_cache_seq_stop,
 407         .show   = rt_cache_seq_show,
 408 };
 409
 410 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 411 {
 412         return seq_open_net(inode, file, &rt_cache_seq_ops,
 413                         sizeof(struct rt_cache_iter_state));
 414 }
 415
 416 static const struct file_operations rt_cache_seq_fops = {
 417         .owner   = THIS_MODULE,
 418         .open    = rt_cache_seq_open,
 419         .read    = seq_read,
 420         .llseek  = seq_lseek,
 421         .release = seq_release_net,
 422 };
 423
 424
 425 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 426 {
 427         int cpu;
 428
 429         if (*pos == 0)
 430                 return SEQ_START_TOKEN;
 431
 432         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 433                 if (!cpu_possible(cpu))
 434                         continue;
 435                 *pos = cpu+1;
 436                 return &per_cpu(rt_cache_stat, cpu);
 437         }
 438         return NULL;
 439 }
 440
 441 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 442 {
 443         int cpu;
 444
 445         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 446                 if (!cpu_possible(cpu))
 447                         continue;
 448                 *pos = cpu+1;
 449                 return &per_cpu(rt_cache_stat, cpu);
 450         }
 451         return NULL;
 452
 453 }
 454
 455 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 456 {
 457
 458 }
 459
 460 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 461 {
 462         struct rt_cache_stat *st = v;
 463
 464         if (v == SEQ_START_TOKEN) {
 465                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 466                 return 0;
 467         }
 468
 469         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 470                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 471                    atomic_read(&ipv4_dst_ops.entries),
 472                    st->in_hit,
 473                    st->in_slow_tot,
 474                    st->in_slow_mc,
 475                    st->in_no_route,
 476                    st->in_brd,
 477                    st->in_martian_dst,
 478                    st->in_martian_src,
 479
 480                    st->out_hit,
 481                    st->out_slow_tot,
 482                    st->out_slow_mc,
 483
 484                    st->gc_total,
 485                    st->gc_ignored,
 486                    st->gc_goal_miss,
 487                    st->gc_dst_overflow,
 488                    st->in_hlist_search,
 489                    st->out_hlist_search
 490                 );
 491         return 0;
 492 }
 493
 494 static const struct seq_operations rt_cpu_seq_ops = {
 495         .start  = rt_cpu_seq_start,
 496         .next   = rt_cpu_seq_next,
 497         .stop   = rt_cpu_seq_stop,
 498         .show   = rt_cpu_seq_show,
 499 };
 500
 501
 502 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 503 {
 504         return seq_open(file, &rt_cpu_seq_ops);
 505 }
 506
 507 static const struct file_operations rt_cpu_seq_fops = {
 508         .owner   = THIS_MODULE,
 509         .open    = rt_cpu_seq_open,
 510         .read    = seq_read,
 511         .llseek  = seq_lseek,
 512         .release = seq_release,
 513 };
 514
 515 #ifdef CONFIG_NET_CLS_ROUTE
 516 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
 517                            int length, int *eof, void *data)
 518 {
 519         unsigned int i;
 520
 521         if ((offset & 3) || (length & 3))
 522                 return -EIO;
 523
 524         if (offset >= sizeof(struct ip_rt_acct) * 256) {
 525                 *eof = 1;
 526                 return 0;
 527         }
 528
 529         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
 530                 length = sizeof(struct ip_rt_acct) * 256 - offset;
 531                 *eof = 1;
 532         }
 533
 534         offset /= sizeof(u32);
 535
 536         if (length > 0) {
 537                 u32 *dst = (u32 *) buffer;
 538
 539                 *start = buffer;
 540                 memset(dst, 0, length);
 541
 542                 for_each_possible_cpu(i) {
 543                         unsigned int j;
 544                         u32 *src;
 545
 546                         src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
 547                         for (j = 0; j < length/4; j++)
 548                                 dst[j] += src[j];
 549                 }
 550         }
 551         return length;
 552 }
 553 #endif
 554
 555 static int __net_init ip_rt_do_proc_init(struct net *net)
 556 {
 557         struct proc_dir_entry *pde;
 558
 559         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 560                         &rt_cache_seq_fops);
 561         if (!pde)
 562                 goto err1;
 563
 564         pde = proc_create("rt_cache", S_IRUGO,
 565                           net->proc_net_stat, &rt_cpu_seq_fops);
 566         if (!pde)
 567                 goto err2;
 568
 569 #ifdef CONFIG_NET_CLS_ROUTE
 570         pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
 571                         ip_rt_acct_read, NULL);
 572         if (!pde)
 573                 goto err3;
 574 #endif
 575         return 0;
 576
 577 #ifdef CONFIG_NET_CLS_ROUTE
 578 err3:
 579         remove_proc_entry("rt_cache", net->proc_net_stat);
 580 #endif
 581 err2:
 582         remove_proc_entry("rt_cache", net->proc_net);
 583 err1:
 584         return -ENOMEM;
 585 }
 586
 587 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 588 {
 589         remove_proc_entry("rt_cache", net->proc_net_stat);
 590         remove_proc_entry("rt_cache", net->proc_net);
 591         remove_proc_entry("rt_acct", net->proc_net);
 592 }
 593
 594 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 595         .init = ip_rt_do_proc_init,
 596         .exit = ip_rt_do_proc_exit,
 597 };
 598
 599 static int __init ip_rt_proc_init(void)
 600 {
 601         return register_pernet_subsys(&ip_rt_proc_ops);
 602 }
 603
 604 #else
 605 static inline int ip_rt_proc_init(void)
 606 {
 607         return 0;
 608 }
 609 #endif /* CONFIG_PROC_FS */
 610
 611 static inline void rt_free(struct rtable *rt)
 612 {
 613         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 614 }
 615
 616 static inline void rt_drop(struct rtable *rt)
 617 {
 618         ip_rt_put(rt);
 619         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 620 }
 621
 622 static inline int rt_fast_clean(struct rtable *rth)
 623 {
 624         /* Kill broadcast/multicast entries very aggresively, if they
 625            collide in hash table with more useful entries */
 626         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 627                 rth->fl.iif && rth->u.dst.rt_next;
 628 }
 629
 630 static inline int rt_valuable(struct rtable *rth)
 631 {
 632         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 633                 rth->u.dst.expires;
 634 }
 635
 636 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 637 {
 638         unsigned long age;
 639         int ret = 0;
 640
 641         if (atomic_read(&rth->u.dst.__refcnt))
 642                 goto out;
 643
 644         ret = 1;
 645         if (rth->u.dst.expires &&
 646             time_after_eq(jiffies, rth->u.dst.expires))
 647                 goto out;
 648
 649         age = jiffies - rth->u.dst.lastuse;
 650         ret = 0;
 651         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 652             (age <= tmo2 && rt_valuable(rth)))
 653                 goto out;
 654         ret = 1;
 655 out:    return ret;
 656 }
 657
 658 /* Bits of score are:
 659  * 31: very valuable
 660  * 30: not quite useless
 661  * 29..0: usage counter
 662  */
 663 static inline u32 rt_score(struct rtable *rt)
 664 {
 665         u32 score = jiffies - rt->u.dst.lastuse;
 666
 667         score = ~score & ~(3<<30);
 668
 669         if (rt_valuable(rt))
 670                 score |= (1<<31);
 671
 672         if (!rt->fl.iif ||
 673             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 674                 score |= (1<<30);
 675
 676         return score;
 677 }
 678
 679 static inline bool rt_caching(const struct net *net)
 680 {
 681         return net->ipv4.current_rt_cache_rebuild_count <=
 682                 net->ipv4.sysctl_rt_cache_rebuild_count;
 683 }
 684
 685 static inline bool compare_hash_inputs(const struct flowi *fl1,
 686                                         const struct flowi *fl2)
 687 {
 688         return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
 689                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
 690                 (fl1->iif ^ fl2->iif)) == 0);
 691 }
 692
 693 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 694 {
 695         return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
 696                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
 697                 (fl1->mark ^ fl2->mark) |
 698                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
 699                  *(u16 *)&fl2->nl_u.ip4_u.tos) |
 700                 (fl1->oif ^ fl2->oif) |
 701                 (fl1->iif ^ fl2->iif)) == 0;
 702 }
 703
 704 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 705 {
 706         return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
 707 }
 708
 709 static inline int rt_is_expired(struct rtable *rth)
 710 {
 711         return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
 712 }
 713
 714 /*
 715  * Perform a full scan of hash table and free all entries.
 716  * Can be called by a softirq or a process.
 717  * In the later case, we want to be reschedule if necessary
 718  */
 719 static void rt_do_flush(int process_context)
 720 {
 721         unsigned int i;
 722         struct rtable *rth, *next;
 723         struct rtable * tail;
 724
 725         for (i = 0; i <= rt_hash_mask; i++) {
 726                 if (process_context && need_resched())
 727                         cond_resched();
 728                 rth = rt_hash_table[i].chain;
 729                 if (!rth)
 730                         continue;
 731
 732                 spin_lock_bh(rt_hash_lock_addr(i));
 733 #ifdef CONFIG_NET_NS
 734                 {
 735                 struct rtable ** prev, * p;
 736
 737                 rth = rt_hash_table[i].chain;
 738
 739                 /* defer releasing the head of the list after spin_unlock */
 740                 for (tail = rth; tail; tail = tail->u.dst.rt_next)
 741                         if (!rt_is_expired(tail))
 742                                 break;
 743                 if (rth != tail)
 744                         rt_hash_table[i].chain = tail;
 745
 746                 /* call rt_free on entries after the tail requiring flush */
 747                 prev = &rt_hash_table[i].chain;
 748                 for (p = *prev; p; p = next) {
 749                         next = p->u.dst.rt_next;
 750                         if (!rt_is_expired(p)) {
 751                                 prev = &p->u.dst.rt_next;
 752                         } else {
 753                                 *prev = next;
 754                                 rt_free(p);
 755                         }
 756                 }
 757                 }
 758 #else
 759                 rth = rt_hash_table[i].chain;
 760                 rt_hash_table[i].chain = NULL;
 761                 tail = NULL;
 762 #endif
 763                 spin_unlock_bh(rt_hash_lock_addr(i));
 764
 765                 for (; rth != tail; rth = next) {
 766                         next = rth->u.dst.rt_next;
 767                         rt_free(rth);
 768                 }
 769         }
 770 }
 771
 772 /*
 773  * While freeing expired entries, we compute average chain length
 774  * and standard deviation, using fixed-point arithmetic.
 775  * This to have an estimation of rt_chain_length_max
 776  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
 777  * We use 3 bits for frational part, and 29 (or 61) for magnitude.
 778  */
 779
 780 #define FRACT_BITS 3
 781 #define ONE (1UL << FRACT_BITS)
 782
 783 static void rt_check_expire(void)
 784 {
 785         static unsigned int rover;
 786         unsigned int i = rover, goal;
 787         struct rtable *rth, *aux, **rthp;
 788         unsigned long samples = 0;
 789         unsigned long sum = 0, sum2 = 0;
 790         u64 mult;
 791
 792         mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
 793         if (ip_rt_gc_timeout > 1)
 794                 do_div(mult, ip_rt_gc_timeout);
 795         goal = (unsigned int)mult;
 796         if (goal > rt_hash_mask)
 797                 goal = rt_hash_mask + 1;
 798         for (; goal > 0; goal--) {
 799                 unsigned long tmo = ip_rt_gc_timeout;
 800                 unsigned long length;
 801
 802                 i = (i + 1) & rt_hash_mask;
 803                 rthp = &rt_hash_table[i].chain;
 804
 805                 if (need_resched())
 806                         cond_resched();
 807
 808                 samples++;
 809
 810                 if (*rthp == NULL)
 811                         continue;
 812                 length = 0;
 813                 spin_lock_bh(rt_hash_lock_addr(i));
 814                 while ((rth = *rthp) != NULL) {
 815                         prefetch(rth->u.dst.rt_next);
 816                         if (rt_is_expired(rth)) {
 817                                 *rthp = rth->u.dst.rt_next;
 818                                 rt_free(rth);
 819                                 continue;
 820                         }
 821                         if (rth->u.dst.expires) {
 822                                 /* Entry is expired even if it is in use */
 823                                 if (time_before_eq(jiffies, rth->u.dst.expires)) {
 824 nofree:
 825                                         tmo >>= 1;
 826                                         rthp = &rth->u.dst.rt_next;
 827                                         /*
 828                                          * We only count entries on
 829                                          * a chain with equal hash inputs once
 830                                          * so that entries for different QOS
 831                                          * levels, and other non-hash input
 832                                          * attributes don't unfairly skew
 833                                          * the length computation
 834                                          */
 835                                         for (aux = rt_hash_table[i].chain;;) {
 836                                                 if (aux == rth) {
 837                                                         length += ONE;
 838                                                         break;
 839                                                 }
 840                                                 if (compare_hash_inputs(&aux->fl, &rth->fl))
 841                                                         break;
 842                                                 aux = aux->u.dst.rt_next;
 843                                         }
 844                                         continue;
 845                                 }
 846                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
 847                                 goto nofree;
 848
 849                         /* Cleanup aged off entries. */
 850                         *rthp = rth->u.dst.rt_next;
 851                         rt_free(rth);
 852                 }
 853                 spin_unlock_bh(rt_hash_lock_addr(i));
 854                 sum += length;
 855                 sum2 += length*length;
 856         }
 857         if (samples) {
 858                 unsigned long avg = sum / samples;
 859                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
 860                 rt_chain_length_max = max_t(unsigned long,
 861                                         ip_rt_gc_elasticity,
 862                                         (avg + 4*sd) >> FRACT_BITS);
 863         }
 864         rover = i;
 865 }
 866
/*
 * rt_worker_func() is run in process context.
 * We call rt_check_expire() to scan part of the hash table, then queue
 * expires_work again so that the next scan happens ip_rt_gc_interval
 * later.  The work argument itself is not used.
 */
static void rt_worker_func(struct work_struct *work)
{
        rt_check_expire();
        /* Queue the next pass. */
        schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}
 876
/*
 * Perturbation of rt_genid by a small quantity [1..256].
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without giving a recent rt_genid.
 * Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
        unsigned char shuffle;

        get_random_bytes(&shuffle, sizeof(shuffle));
        /* shuffle is 0..255, so the increment is 1..256 and never zero. */
        atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
}
 890
/*
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 *
 * Only the sign of delay is examined here; its magnitude is ignored.
 */
void rt_cache_flush(struct net *net, int delay)
{
        /* Always change the generation id of this namespace first. */
        rt_cache_invalidate(net);
        /* The flush is told whether we are running outside softirq context. */
        if (delay >= 0)
                rt_do_flush(!in_softirq());
}
 901
/*
 * We change rt_genid and let gc do the cleanup.
 *
 * The function has the timer-callback signature: __net is the
 * struct net pointer passed as an unsigned long.  On every call the
 * namespace's rt_secret_timer is (re)armed to expire
 * ip_rt_secret_interval jiffies from now.
 */
static void rt_secret_rebuild(unsigned long __net)
{
        struct net *net = (struct net *)__net;
        rt_cache_invalidate(net);
        mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
}
 911
 912 static void rt_secret_rebuild_oneshot(struct net *net)
 913 {
 914         del_timer_sync(&net->ipv4.rt_secret_timer);
 915         rt_cache_invalidate(net);
 916         if (ip_rt_secret_interval) {
 917                 net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval;
 918                 add_timer(&net->ipv4.rt_secret_timer);
 919         }
 920 }
 921
/*
 * Called when a hash chain has grown too long: emit a rate-limited
 * warning and force an immediate cache invalidation for @net through
 * rt_secret_rebuild_oneshot().
 */
static void rt_emergency_hash_rebuild(struct net *net)
{
        /* net_ratelimit() keeps a burst of long chains from flooding the log. */
        if (net_ratelimit()) {
                printk(KERN_WARNING "Route hash chain too long!\n");
                printk(KERN_WARNING "Adjust your secret_interval!\n");
        }

        rt_secret_rebuild_oneshot(net);
}
 931
 932 /*
 933    Short description of GC goals.
 934
 935    We want to build algorithm, which will keep routing cache
 936    at some equilibrium point, when number of aged off entries
 937    is kept approximately equal to newly generated ones.
 938
 939    Current expiration strength is variable "expire".
 940    We try to adjust it dynamically, so that if networking
 941    is idle expires is large enough to keep enough of warm entries,
 942    and when load increases it reduces to limit cache size.
 943  */
 944
 945 static int rt_garbage_collect(struct dst_ops *ops)
 946 {
 947         static unsigned long expire = RT_GC_TIMEOUT;
 948         static unsigned long last_gc;
 949         static int rover;
 950         static int equilibrium;
 951         struct rtable *rth, **rthp;
 952         unsigned long now = jiffies;
 953         int goal;
 954
 955         /*
 956          * Garbage collection is pretty expensive,
 957          * do not make it too frequently.
 958          */
 959
 960         RT_CACHE_STAT_INC(gc_total);
 961
 962         if (now - last_gc < ip_rt_gc_min_interval &&
 963             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
 964                 RT_CACHE_STAT_INC(gc_ignored);
 965                 goto out;
 966         }
 967
 968         /* Calculate number of entries, which we want to expire now. */
 969         goal = atomic_read(&ipv4_dst_ops.entries) -
 970                 (ip_rt_gc_elasticity << rt_hash_log);
 971         if (goal <= 0) {
 972                 if (equilibrium < ipv4_dst_ops.gc_thresh)
 973                         equilibrium = ipv4_dst_ops.gc_thresh;
 974                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 975                 if (goal > 0) {
 976                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 977                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 978                 }
 979         } else {
 980                 /* We are in dangerous area. Try to reduce cache really
 981                  * aggressively.
 982                  */
 983                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 984                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
 985         }
 986
 987         if (now - last_gc >= ip_rt_gc_min_interval)
 988                 last_gc = now;
 989
 990         if (goal <= 0) {
 991                 equilibrium += goal;
 992                 goto work_done;
 993         }
 994
 995         do {
 996                 int i, k;
 997
 998                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
 999                         unsigned long tmo = expire;
1000
1001                         k = (k + 1) & rt_hash_mask;
1002                         rthp = &rt_hash_table[k].chain;
1003                         spin_lock_bh(rt_hash_lock_addr(k));
1004                         while ((rth = *rthp) != NULL) {
1005                                 if (!rt_is_expired(rth) &&
1006                                         !rt_may_expire(rth, tmo, expire)) {
1007                                         tmo >>= 1;
1008                                         rthp = &rth->u.dst.rt_next;
1009                                         continue;
1010                                 }
1011                                 *rthp = rth->u.dst.rt_next;
1012                                 rt_free(rth);
1013                                 goal--;
1014                         }
1015                         spin_unlock_bh(rt_hash_lock_addr(k));
1016                         if (goal <= 0)
1017                                 break;
1018                 }
1019                 rover = k;
1020
1021                 if (goal <= 0)
1022                         goto work_done;
1023
1024                 /* Goal is not achieved. We stop process if:
1025
1026                    - if expire reduced to zero. Otherwise, expire is halfed.
1027                    - if table is not full.
1028                    - if we are called from interrupt.
1029                    - jiffies check is just fallback/debug loop breaker.
1030                      We will not spin here for long time in any case.
1031                  */
1032
1033                 RT_CACHE_STAT_INC(gc_goal_miss);
1034
1035                 if (expire == 0)
1036                         break;
1037
1038                 expire >>= 1;
1039 #if RT_CACHE_DEBUG >= 2
1040                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1041                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
1042 #endif
1043
1044                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1045                         goto out;
1046         } while (!in_softirq() && time_before_eq(jiffies, now));
1047
1048         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1049                 goto out;
1050         if (net_ratelimit())
1051                 printk(KERN_WARNING "dst cache overflow\n");
1052         RT_CACHE_STAT_INC(gc_dst_overflow);
1053         return 1;
1054
1055 work_done:
1056         expire += ip_rt_gc_min_interval;
1057         if (expire > ip_rt_gc_timeout ||
1058             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
1059                 expire = ip_rt_gc_timeout;
1060 #if RT_CACHE_DEBUG >= 2
1061         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1062                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
1063 #endif
1064 out:    return 0;
1065 }
1066
/*
 * Insert route rt into hash bucket `hash` of the route cache.
 *
 * While walking the chain (under the bucket lock):
 *  - expired entries are unlinked and freed;
 *  - if an entry with the same keys and namespace already exists, it is
 *    moved to the head of the chain, marked used, rt is dropped and the
 *    existing entry is returned through *rp;
 *  - the unreferenced entry with the lowest rt_score() is remembered as
 *    an eviction candidate and is removed when the chain is longer than
 *    ip_rt_gc_elasticity.
 *
 * For unicast or output routes a neighbour is bound before rt is made
 * visible.  If that fails with -ENOBUFS and we are not in softirq
 * context, one garbage-collection pass is run and the insertion is
 * retried once.
 *
 * Returns 0 on success, otherwise the error from arp_bind_neighbour()
 * (rt has been dropped in that case).
 */
static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
        struct rtable   *rth, **rthp;
        unsigned long   now;
        struct rtable *cand, **candp;
        u32             min_score;
        int             chain_length;
        /* Number of GC-and-retry attempts allowed: 1 outside softirq, else 0. */
        int attempts = !in_softirq();

restart:
        chain_length = 0;
        min_score = ~(u32)0;
        cand = NULL;
        candp = NULL;
        now = jiffies;

        /*
         * NOTE(review): this path drops rt and returns 0 without storing
         * anything in *rp -- confirm that no caller reads *rp afterwards.
         */
        if (!rt_caching(dev_net(rt->u.dst.dev))) {
                rt_drop(rt);
                return 0;
        }

        rthp = &rt_hash_table[hash].chain;

        spin_lock_bh(rt_hash_lock_addr(hash));
        while ((rth = *rthp) != NULL) {
                if (rt_is_expired(rth)) {
                        *rthp = rth->u.dst.rt_next;
                        rt_free(rth);
                        continue;
                }
                if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
                        /* Put it first */
                        *rthp = rth->u.dst.rt_next;
                        /*
                         * Since lookup is lockfree, the deletion
                         * must be visible to another weakly ordered CPU before
                         * the insertion at the start of the hash chain.
                         */
                        rcu_assign_pointer(rth->u.dst.rt_next,
                                           rt_hash_table[hash].chain);
                        /*
                         * Since lookup is lockfree, the update writes
                         * must be ordered for consistency on SMP.
                         */
                        rcu_assign_pointer(rt_hash_table[hash].chain, rth);

                        dst_use(&rth->u.dst, now);
                        spin_unlock_bh(rt_hash_lock_addr(hash));

                        /* The caller gets the existing entry instead of rt. */
                        rt_drop(rt);
                        *rp = rth;
                        return 0;
                }

                /* Only entries nobody holds a reference on may be evicted. */
                if (!atomic_read(&rth->u.dst.__refcnt)) {
                        u32 score = rt_score(rth);

                        /* "<=" means the last of several equal scores wins. */
                        if (score <= min_score) {
                                cand = rth;
                                candp = rthp;
                                min_score = score;
                        }
                }

                chain_length++;

                rthp = &rth->u.dst.rt_next;
        }

        if (cand) {
                /* ip_rt_gc_elasticity used to be average length of chain
                 * length, when exceeded gc becomes really aggressive.
                 *
                 * The second limit is less certain. At the moment it allows
                 * only 2 entries per bucket. We will see.
                 */
                if (chain_length > ip_rt_gc_elasticity) {
                        *candp = cand->u.dst.rt_next;
                        rt_free(cand);
                }
        } else {
                /*
                 * Nothing can be evicted.  If the chain is longer than
                 * rt_chain_length_max, count a rebuild for this namespace
                 * and force an emergency rebuild.
                 * NOTE(review): this runs with the bucket lock held and the
                 * rebuild path syncs with a timer -- confirm that is safe
                 * in every calling context.
                 */
                if (chain_length > rt_chain_length_max) {
                        struct net *net = dev_net(rt->u.dst.dev);
                        int num = ++net->ipv4.current_rt_cache_rebuild_count;
                        if (!rt_caching(dev_net(rt->u.dst.dev))) {
                                printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
                                        rt->u.dst.dev->name, num);
                        }
                        rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev));
                }
        }

        /* Try to bind route to arp only if it is output
           route or unicast forwarding path.
         */
        if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
                int err = arp_bind_neighbour(&rt->u.dst);
                if (err) {
                        spin_unlock_bh(rt_hash_lock_addr(hash));

                        if (err != -ENOBUFS) {
                                rt_drop(rt);
                                return err;
                        }

                        /* Neighbour tables are full and nothing
                           can be released. Try to shrink route cache,
                           it is most likely it holds some neighbour records.
                         */
                        if (attempts-- > 0) {
                                /*
                                 * Temporarily force the most aggressive GC
                                 * settings, then restore the saved values.
                                 */
                                int saved_elasticity = ip_rt_gc_elasticity;
                                int saved_int = ip_rt_gc_min_interval;
                                ip_rt_gc_elasticity     = 1;
                                ip_rt_gc_min_interval   = 0;
                                rt_garbage_collect(&ipv4_dst_ops);
                                ip_rt_gc_min_interval   = saved_int;
                                ip_rt_gc_elasticity     = saved_elasticity;
                                goto restart;
                        }

                        if (net_ratelimit())
                                printk(KERN_WARNING "Neighbour table overflow.\n");
                        rt_drop(rt);
                        return -ENOBUFS;
                }
        }

        /* Link rt in front of the current chain head. */
        rt->u.dst.rt_next = rt_hash_table[hash].chain;

#if RT_CACHE_DEBUG >= 2
        if (rt->u.dst.rt_next) {
                struct rtable *trt;
                printk(KERN_DEBUG "rt_cache @%02x: %pI4", hash, &rt->rt_dst);
                for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
                        printk(" . %pI4", &trt->rt_dst);
                printk("\n");
        }
#endif
        /*
         * Since lookup is lockfree, we must make sure
         * previous writes to rt are committed to memory
         * before making rt visible to other CPUS.
         */
        rcu_assign_pointer(rt_hash_table[hash].chain, rt);

        spin_unlock_bh(rt_hash_lock_addr(hash));
        *rp = rt;
        return 0;
}
1216
1217 void rt_bind_peer(struct rtable *rt, int create)
1218 {
1219         static DEFINE_SPINLOCK(rt_peer_lock);
1220         struct inet_peer *peer;
1221
1222         peer = inet_getpeer(rt->rt_dst, create);
1223
1224         spin_lock_bh(&rt_peer_lock);
1225         if (rt->peer == NULL) {
1226                 rt->peer = peer;
1227                 peer = NULL;
1228         }
1229         spin_unlock_bh(&rt_peer_lock);
1230         if (peer)
1231                 inet_putpeer(peer);
1232 }
1233
1234 /*
1235  * Peer allocation may fail only in serious out-of-memory conditions.  However
1236  * we still can generate some output.
1237  * Random ID selection looks a bit dangerous because we have no chances to
1238  * select ID being unique in a reasonable period of time.
1239  * But broken packet identifier may be better than no packet at all.
1240  */
1241 static void ip_select_fb_ident(struct iphdr *iph)
1242 {
1243         static DEFINE_SPINLOCK(ip_fb_id_lock);
1244         static u32 ip_fallback_id;
1245         u32 salt;
1246
1247         spin_lock_bh(&ip_fb_id_lock);
1248         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1249         iph->id = htons(salt & 0xFFFF);
1250         ip_fallback_id = salt;
1251         spin_unlock_bh(&ip_fb_id_lock);
1252 }
1253
1254 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1255 {
1256         struct rtable *rt = (struct rtable *) dst;
1257
1258         if (rt) {
1259                 if (rt->peer == NULL)
1260                         rt_bind_peer(rt, 1);
1261
1262                 /* If peer is attached to destination, it is never detached,
1263                    so that we need not to grab a lock to dereference it.
1264                  */
1265                 if (rt->peer) {
1266                         iph->id = htons(inet_getid(rt->peer, more));
1267                         return;
1268                 }
1269         } else
1270                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1271                        __builtin_return_address(0));
1272
1273         ip_select_fb_ident(iph);
1274 }
1275
1276 static void rt_del(unsigned hash, struct rtable *rt)
1277 {
1278         struct rtable **rthp, *aux;
1279
1280         rthp = &rt_hash_table[hash].chain;
1281         spin_lock_bh(rt_hash_lock_addr(hash));
1282         ip_rt_put(rt);
1283         while ((aux = *rthp) != NULL) {
1284                 if (aux == rt || rt_is_expired(aux)) {
1285                         *rthp = aux->u.dst.rt_next;
1286                         rt_free(aux);
1287                         continue;
1288                 }
1289                 rthp = &aux->u.dst.rt_next;
1290         }
1291         spin_unlock_bh(rt_hash_lock_addr(hash));
1292 }
1293
/*
 * Process a redirect received on dev: old_gw is the gateway the
 * redirect came from, new_gw the gateway it advises, and saddr -> daddr
 * the path it is about.
 *
 * The redirect is rejected when new_gw equals old_gw, when the
 * interface does not accept redirects, when new_gw is a multicast,
 * limited-broadcast or zeronet address, when route caching is disabled
 * for the namespace, or when new_gw fails the on-link / address-type
 * checks below.
 *
 * Otherwise every cached output route to daddr (keyed by saddr or 0,
 * and by dev's ifindex or 0) that currently uses old_gw on dev is
 * replaced by a copy that uses new_gw.
 */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
                    __be32 saddr, struct net_device *dev)
{
        int i, k;
        struct in_device *in_dev = in_dev_get(dev);
        struct rtable *rth, **rthp;
        /* Key variants to probe: exact source / any source ... */
        __be32  skeys[2] = { saddr, 0 };
        /* ... and exact output interface / any interface. */
        int  ikeys[2] = { dev->ifindex, 0 };
        struct netevent_redirect netevent;
        struct net *net;

        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
            || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
            || ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!rt_caching(net))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        for (i = 0; i < 2; i++) {
                for (k = 0; k < 2; k++) {
                        unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
                                                rt_genid(net));

                        rthp=&rt_hash_table[hash].chain;

                        rcu_read_lock();
                        while ((rth = rcu_dereference(*rthp)) != NULL) {
                                struct rtable *rt;

                                /* Skip entries whose lookup key does not match. */
                                if (rth->fl.fl4_dst != daddr ||
                                    rth->fl.fl4_src != skeys[i] ||
                                    rth->fl.oif != ikeys[k] ||
                                    rth->fl.iif != 0 ||
                                    rt_is_expired(rth) ||
                                    !net_eq(dev_net(rth->u.dst.dev), net)) {
                                        rthp = &rth->u.dst.rt_next;
                                        continue;
                                }

                                /*
                                 * Key matches but the route itself is not the
                                 * one being redirected: stop scanning this chain.
                                 */
                                if (rth->rt_dst != daddr ||
                                    rth->rt_src != saddr ||
                                    rth->u.dst.error ||
                                    rth->rt_gateway != old_gw ||
                                    rth->u.dst.dev != dev)
                                        break;

                                /* Pin rth before leaving the RCU read section. */
                                dst_hold(&rth->u.dst);
                                rcu_read_unlock();

                                rt = dst_alloc(&ipv4_dst_ops);
                                if (rt == NULL) {
                                        ip_rt_put(rth);
                                        in_dev_put(in_dev);
                                        return;
                                }

                                /* Copy all the information. */
                                *rt = *rth;
                                /*
                                 * The copy is a new object: reset its counters
                                 * and per-object pointers, and take the
                                 * references it now holds itself.
                                 */
                                rt->u.dst.__use         = 1;
                                atomic_set(&rt->u.dst.__refcnt, 1);
                                rt->u.dst.child         = NULL;
                                if (rt->u.dst.dev)
                                        dev_hold(rt->u.dst.dev);
                                if (rt->idev)
                                        in_dev_hold(rt->idev);
                                rt->u.dst.obsolete      = 0;
                                rt->u.dst.lastuse       = jiffies;
                                rt->u.dst.path          = &rt->u.dst;
                                rt->u.dst.neighbour     = NULL;
                                rt->u.dst.hh            = NULL;
#ifdef CONFIG_XFRM
                                rt->u.dst.xfrm          = NULL;
#endif
                                rt->rt_genid            = rt_genid(net);
                                rt->rt_flags            |= RTCF_REDIRECTED;

                                /* Gateway is different ... */
                                rt->rt_gateway          = new_gw;

                                /* Redirect received -> path was valid */
                                dst_confirm(&rth->u.dst);

                                /* The copied peer pointer needs its own reference. */
                                if (rt->peer)
                                        atomic_inc(&rt->peer->refcnt);

                                /*
                                 * Only install the new route if the new
                                 * gateway's neighbour entry could be bound and
                                 * is in a valid state; otherwise kick neighbour
                                 * resolution (if bound) and discard the copy.
                                 */
                                if (arp_bind_neighbour(&rt->u.dst) ||
                                    !(rt->u.dst.neighbour->nud_state &
                                            NUD_VALID)) {
                                        if (rt->u.dst.neighbour)
                                                neigh_event_send(rt->u.dst.neighbour, NULL);
                                        ip_rt_put(rth);
                                        rt_drop(rt);
                                        goto do_next;
                                }

                                netevent.old = &rth->u.dst;
                                netevent.new = &rt->u.dst;
                                call_netevent_notifiers(NETEVENT_REDIRECT,
                                                        &netevent);

                                /* rt_del() also drops the reference taken above. */
                                rt_del(hash, rth);
                                if (!rt_intern_hash(hash, rt, &rt))
                                        ip_rt_put(rt);
                                goto do_next;
                        }
                        rcu_read_unlock();
                do_next:
                        /*
                         * Reached by goto only after rcu_read_unlock() has
                         * already been called inside the loop.
                         */
                        ;
                }
        }
        in_dev_put(in_dev);
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
                printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
                        "  Advised path = %pI4 -> %pI4\n",
                       &old_gw, dev->name, &new_gw,
                       &saddr, &daddr);
#endif
        in_dev_put(in_dev);
}
1432
1433 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1434 {
1435         struct rtable *rt = (struct rtable *)dst;
1436         struct dst_entry *ret = dst;
1437
1438         if (rt) {
1439                 if (dst->obsolete) {
1440                         ip_rt_put(rt);
1441                         ret = NULL;
1442                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1443                            rt->u.dst.expires) {
1444                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1445                                                 rt->fl.oif,
1446                                                 rt_genid(dev_net(dst->dev)));
1447 #if RT_CACHE_DEBUG >= 1
1448                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1449                                 &rt->rt_dst, rt->fl.fl4_tos);
1450 #endif
1451                         rt_del(hash, rt);
1452                         ret = NULL;
1453                 }
1454         }
1455         return ret;
1456 }
1457
1458 /*
1459  * Algorithm:
1460  *      1. The first ip_rt_redirect_number redirects are sent
1461  *         with exponential backoff, then we stop sending them at all,
1462  *         assuming that the host ignores our redirects.
1463  *      2. If we did not see packets requiring redirects
1464  *         during ip_rt_redirect_silence, we assume that the host
1465  *         forgot redirected route and start to send redirects again.
1466  *
1467  * This algorithm is much cheaper and more intelligent than dumb load limiting
1468  * in icmp.c.
1469  *
1470  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1471  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1472  */
1473
1474 void ip_rt_send_redirect(struct sk_buff *skb)
1475 {
1476         struct rtable *rt = skb->rtable;
1477         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1478
1479         if (!in_dev)
1480                 return;
1481
1482         if (!IN_DEV_TX_REDIRECTS(in_dev))
1483                 goto out;
1484
1485         /* No redirected packets during ip_rt_redirect_silence;
1486          * reset the algorithm.
1487          */
1488         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1489                 rt->u.dst.rate_tokens = 0;
1490
1491         /* Too many ignored redirects; do not send anything
1492          * set u.dst.rate_last to the last seen redirected packet.
1493          */
1494         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1495                 rt->u.dst.rate_last = jiffies;
1496                 goto out;
1497         }
1498
1499         /* Check for load limit; set rate_last to the latest sent
1500          * redirect.
1501          */
1502         if (rt->u.dst.rate_tokens == 0 ||
1503             time_after(jiffies,
1504                        (rt->u.dst.rate_last +
1505                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1506                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1507                 rt->u.dst.rate_last = jiffies;
1508                 ++rt->u.dst.rate_tokens;
1509 #ifdef CONFIG_IP_ROUTE_VERBOSE
1510                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1511                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1512                     net_ratelimit())
1513                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1514                                 &rt->rt_src, rt->rt_iif,
1515                                 &rt->rt_dst, &rt->rt_gateway);
1516 #endif
1517         }
1518 out:
1519         in_dev_put(in_dev);
1520 }
1521
1522 static int ip_error(struct sk_buff *skb)
1523 {
1524         struct rtable *rt = skb->rtable;
1525         unsigned long now;
1526         int code;
1527
1528         switch (rt->u.dst.error) {
1529                 case EINVAL:
1530                 default:
1531                         goto out;
1532                 case EHOSTUNREACH:
1533                         code = ICMP_HOST_UNREACH;
1534                         break;
1535                 case ENETUNREACH:
1536                         code = ICMP_NET_UNREACH;
1537                         IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
1538                                         IPSTATS_MIB_INNOROUTES);
1539                         break;
1540                 case EACCES:
1541                         code = ICMP_PKT_FILTERED;
1542                         break;
1543         }
1544
1545         now = jiffies;
1546         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1547         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1548                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1549         rt->u.dst.rate_last = now;
1550         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1551                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1552                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1553         }
1554
1555 out:    kfree_skb(skb);
1556         return 0;
1557 }
1558
/*
 *      Table of MTU plateau values, listed in strictly decreasing order;
 *      guess_mtu() scans it front to back and depends on that ordering.
 *
 *      The last two values are not from the RFC but
 *      are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1566
1567 static inline unsigned short guess_mtu(unsigned short old_mtu)
1568 {
1569         int i;
1570
1571         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1572                 if (old_mtu > mtu_plateau[i])
1573                         return mtu_plateau[i];
1574         return 68;
1575 }
1576
/*
 *      Lower the cached path MTU of the routing cache entries that match
 *      the flow described by iph.  The hash chains are probed for every
 *      combination of output interface key { dev->ifindex, 0 } and source
 *      key { iph->saddr, 0 }.
 *
 *      When new_mtu is below 68 or not smaller than the total length taken
 *      from iph, the next lower plateau from guess_mtu() is used instead.
 *
 *      Returns 0 when ipv4_config.no_pmtu_disc is set.  Otherwise returns
 *      the MTU recorded for the last matching entry whose current MTU was
 *      not smaller than it, or new_mtu when no entry qualified.
 */
1577 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1578                                  unsigned short new_mtu,
1579                                  struct net_device *dev)
1580 {
1581         int i, k;
1582         unsigned short old_mtu = ntohs(iph->tot_len);
1583         struct rtable *rth;
1584         int  ikeys[2] = { dev->ifindex, 0 };
1585         __be32  skeys[2] = { iph->saddr, 0, };
1586         __be32  daddr = iph->daddr;
1587         unsigned short est_mtu = 0;
1588
1589         if (ipv4_config.no_pmtu_disc)
1590                 return 0;
1591
1592         for (k = 0; k < 2; k++) {
1593                 for (i = 0; i < 2; i++) {
1594                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1595                                                 rt_genid(net));
1596
1597                         rcu_read_lock();
1598                         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1599                              rth = rcu_dereference(rth->u.dst.rt_next)) {
1600                                 unsigned short mtu = new_mtu;
1601
                                     /*
                                      * Skip entries for another flow, entries
                                      * whose MTU metric is locked, entries of
                                      * another namespace and expired entries.
                                      */
1602                                 if (rth->fl.fl4_dst != daddr ||
1603                                     rth->fl.fl4_src != skeys[i] ||
1604                                     rth->rt_dst != daddr ||
1605                                     rth->rt_src != iph->saddr ||
1606                                     rth->fl.oif != ikeys[k] ||
1607                                     rth->fl.iif != 0 ||
1608                                     dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1609                                     !net_eq(dev_net(rth->u.dst.dev), net) ||
1610                                     rt_is_expired(rth))
1611                                         continue;
1612
                                     /* Reported value is unusable: guess a plateau. */
1613                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1614
1615                                         /* BSD 4.2 compatibility hack :-( */
1616                                         if (mtu == 0 &&
1617                                             old_mtu >= dst_mtu(&rth->u.dst) &&
1618                                             old_mtu >= 68 + (iph->ihl << 2))
1619                                                 old_mtu -= iph->ihl << 2;
1620
1621                                         mtu = guess_mtu(old_mtu);
1622                                 }
1623                                 if (mtu <= dst_mtu(&rth->u.dst)) {
1624                                         if (mtu < dst_mtu(&rth->u.dst)) {
1625                                                 dst_confirm(&rth->u.dst);
                                                     /*
                                                      * Never go below ip_rt_min_pmtu;
                                                      * set the lock bit so the entry
                                                      * is skipped by the check above
                                                      * from now on.
                                                      */
1626                                                 if (mtu < ip_rt_min_pmtu) {
1627                                                         mtu = ip_rt_min_pmtu;
1628                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1629                                                                 (1 << RTAX_MTU);
1630                                                 }
1631                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1632                                                 dst_set_expires(&rth->u.dst,
1633                                                         ip_rt_mtu_expires);
1634                                         }
1635                                         est_mtu = mtu;
1636                                 }
1637                         }
1638                         rcu_read_unlock();
1639                 }
1640         }
1641         return est_mtu ? : new_mtu;
1642 }
1643
1644 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1645 {
1646         if (dst_mtu(dst) > mtu && mtu >= 68 &&
1647             !(dst_metric_locked(dst, RTAX_MTU))) {
1648                 if (mtu < ip_rt_min_pmtu) {
1649                         mtu = ip_rt_min_pmtu;
1650                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1651                 }
1652                 dst->metrics[RTAX_MTU-1] = mtu;
1653                 dst_set_expires(dst, ip_rt_mtu_expires);
1654                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1655         }
1656 }
1657
1658 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1659 {
1660         return NULL;
1661 }
1662
1663 static void ipv4_dst_destroy(struct dst_entry *dst)
1664 {
1665         struct rtable *rt = (struct rtable *) dst;
1666         struct inet_peer *peer = rt->peer;
1667         struct in_device *idev = rt->idev;
1668
1669         if (peer) {
1670                 rt->peer = NULL;
1671                 inet_putpeer(peer);
1672         }
1673
1674         if (idev) {
1675                 rt->idev = NULL;
1676                 in_dev_put(idev);
1677         }
1678 }
1679
1680 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1681                             int how)
1682 {
1683         struct rtable *rt = (struct rtable *) dst;
1684         struct in_device *idev = rt->idev;
1685         if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1686                 struct in_device *loopback_idev =
1687                         in_dev_get(dev_net(dev)->loopback_dev);
1688                 if (loopback_idev) {
1689                         rt->idev = loopback_idev;
1690                         in_dev_put(idev);
1691                 }
1692         }
1693 }
1694
1695 static void ipv4_link_failure(struct sk_buff *skb)
1696 {
1697         struct rtable *rt;
1698
1699         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1700
1701         rt = skb->rtable;
1702         if (rt)
1703                 dst_set_expires(&rt->u.dst, 0);
1704 }
1705
1706 static int ip_rt_bug(struct sk_buff *skb)
1707 {
1708         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1709                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1710                 skb->dev ? skb->dev->name : "?");
1711         kfree_skb(skb);
1712         return 0;
1713 }
1714
1715 /*
1716    We do not cache source address of outgoing interface,
1717    because it is used only by IP RR, TS and SRR options,
1718    so that it is out of the fast path.
1719
1720    BTW remember: "addr" is allowed to be not aligned
1721    in IP options!
1722  */
1723
1724 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1725 {
1726         __be32 src;
1727         struct fib_result res;
1728
1729         if (rt->fl.iif == 0)
1730                 src = rt->rt_src;
1731         else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1732                 src = FIB_RES_PREFSRC(res);
1733                 fib_res_put(&res);
1734         } else
1735                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1736                                         RT_SCOPE_UNIVERSE);
1737         memcpy(addr, &src, 4);
1738 }
1739
1740 #ifdef CONFIG_NET_CLS_ROUTE
1741 static void set_class_tag(struct rtable *rt, u32 tag)
1742 {
1743         if (!(rt->u.dst.tclassid & 0xFFFF))
1744                 rt->u.dst.tclassid |= tag & 0xFFFF;
1745         if (!(rt->u.dst.tclassid & 0xFFFF0000))
1746                 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1747 }
1748 #endif
1749
/*
 *      Complete a routing cache entry from a FIB lookup result: gateway,
 *      metrics, classifier tag (CONFIG_NET_CLS_ROUTE only) and route type.
 *
 *      rt->u.dst.dev must already be set, its mtu is read here.
 *      res->fi may be NULL, in which case only the device MTU and the
 *      default metrics below are applied.
 */
1750 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1751 {
1752         struct fib_info *fi = res->fi;
1753
1754         if (fi) {
             /* Only a link-scope next hop replaces the gateway set by the caller. */
1755                 if (FIB_RES_GW(*res) &&
1756                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1757                         rt->rt_gateway = FIB_RES_GW(*res);
1758                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1759                        sizeof(rt->u.dst.metrics));
             /*
              * No MTU configured on the route: take the device MTU, but fall
              * back to 576 when the MTU metric is locked, the destination is
              * reached through a gateway and the device MTU is larger.
              */
1760                 if (fi->fib_mtu == 0) {
1761                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1762                         if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1763                             rt->rt_gateway != rt->rt_dst &&
1764                             rt->u.dst.dev->mtu > 576)
1765                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1766                 }
1767 #ifdef CONFIG_NET_CLS_ROUTE
1768                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1769 #endif
1770         } else
1771                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1772
             /* Fill in defaults and clamp MTU and advertised MSS to their limits. */
1773         if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1774                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1775         if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
1776                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1777         if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1778                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1779                                        ip_rt_min_advmss);
1780         if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1781                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1782
1783 #ifdef CONFIG_NET_CLS_ROUTE
1784 #ifdef CONFIG_IP_MULTIPLE_TABLES
1785         set_class_tag(rt, fib_rules_tclass(res));
1786 #endif
1787         set_class_tag(rt, itag);
1788 #endif
1789         rt->rt_type = res->type;
1790 }
1791
1792 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1793                                 u8 tos, struct net_device *dev, int our)
1794 {
1795         unsigned hash;
1796         struct rtable *rth;
1797         __be32 spec_dst;
1798         struct in_device *in_dev = in_dev_get(dev);
1799         u32 itag = 0;
1800
1801         /* Primary sanity checks. */
1802
1803         if (in_dev == NULL)
1804                 return -EINVAL;
1805
1806         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1807             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1808                 goto e_inval;
1809
1810         if (ipv4_is_zeronet(saddr)) {
1811                 if (!ipv4_is_local_multicast(daddr))
1812                         goto e_inval;
1813                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1814         } else if (fib_validate_source(saddr, 0, tos, 0,
1815                                         dev, &spec_dst, &itag) < 0)
1816                 goto e_inval;
1817
1818         rth = dst_alloc(&ipv4_dst_ops);
1819         if (!rth)
1820                 goto e_nobufs;
1821
1822         rth->u.dst.output= ip_rt_bug;
1823
1824         atomic_set(&rth->u.dst.__refcnt, 1);
1825         rth->u.dst.flags= DST_HOST;
1826         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1827                 rth->u.dst.flags |= DST_NOPOLICY;
1828         rth->fl.fl4_dst = daddr;
1829         rth->rt_dst     = daddr;
1830         rth->fl.fl4_tos = tos;
1831         rth->fl.mark    = skb->mark;
1832         rth->fl.fl4_src = saddr;
1833         rth->rt_src     = saddr;
1834 #ifdef CONFIG_NET_CLS_ROUTE
1835         rth->u.dst.tclassid = itag;
1836 #endif
1837         rth->rt_iif     =
1838         rth->fl.iif     = dev->ifindex;
1839         rth->u.dst.dev  = init_net.loopback_dev;
1840         dev_hold(rth->u.dst.dev);
1841         rth->idev       = in_dev_get(rth->u.dst.dev);
1842         rth->fl.oif     = 0;
1843         rth->rt_gateway = daddr;
1844         rth->rt_spec_dst= spec_dst;
1845         rth->rt_genid   = rt_genid(dev_net(dev));
1846         rth->rt_flags   = RTCF_MULTICAST;
1847         rth->rt_type    = RTN_MULTICAST;
1848         if (our) {
1849                 rth->u.dst.input= ip_local_deliver;
1850                 rth->rt_flags |= RTCF_LOCAL;
1851         }
1852
1853 #ifdef CONFIG_IP_MROUTE
1854         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1855                 rth->u.dst.input = ip_mr_input;
1856 #endif
1857         RT_CACHE_STAT_INC(in_slow_mc);
1858
1859         in_dev_put(in_dev);
1860         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1861         return rt_intern_hash(hash, rth, &skb->rtable);
1862
1863 e_nobufs:
1864         in_dev_put(in_dev);
1865         return -ENOBUFS;
1866
1867 e_inval:
1868         in_dev_put(in_dev);
1869         return -EINVAL;
1870 }
1871
1872
1873 static void ip_handle_martian_source(struct net_device *dev,
1874                                      struct in_device *in_dev,
1875                                      struct sk_buff *skb,
1876                                      __be32 daddr,
1877                                      __be32 saddr)
1878 {
1879         RT_CACHE_STAT_INC(in_martian_src);
1880 #ifdef CONFIG_IP_ROUTE_VERBOSE
1881         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1882                 /*
1883                  *      RFC1812 recommendation, if source is martian,
1884                  *      the only hint is MAC header.
1885                  */
1886                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1887                         &daddr, &saddr, dev->name);
1888                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1889                         int i;
1890                         const unsigned char *p = skb_mac_header(skb);
1891                         printk(KERN_WARNING "ll header: ");
1892                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1893                                 printk("%02x", *p);
1894                                 if (i < (dev->hard_header_len - 1))
1895                                         printk(":");
1896                         }
1897                         printk("\n");
1898                 }
1899         }
1900 #endif
1901 }
1902
/*
 *      Allocate and fill a routing cache entry for a packet that is to be
 *      forwarded according to the FIB result res.  The entry is returned
 *      through *result; it is not inserted into the hash table here.
 *
 *      Returns 0 on success, -EINVAL when the output device has no
 *      in_device, when source validation fails, or when a non-IP packet
 *      would leave through the interface it came in on, and -ENOBUFS when
 *      no entry can be allocated.
 */
1903 static int __mkroute_input(struct sk_buff *skb,
1904                            struct fib_result *res,
1905                            struct in_device *in_dev,
1906                            __be32 daddr, __be32 saddr, u32 tos,
1907                            struct rtable **result)
1908 {
1909
1910         struct rtable *rth;
1911         int err;
1912         struct in_device *out_dev;
1913         unsigned flags = 0;
1914         __be32 spec_dst;
1915         u32 itag;
1916
1917         /* get a working reference to the output device */
1918         out_dev = in_dev_get(FIB_RES_DEV(*res));
1919         if (out_dev == NULL) {
1920                 if (net_ratelimit())
1921                         printk(KERN_CRIT "Bug in ip_route_input" \
1922                                "_slow(). Please, report\n");
1923                 return -EINVAL;
1924         }
1925
1926
             /* Negative: martian source.  Positive: flagged RTCF_DIRECTSRC below. */
1927         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1928                                   in_dev->dev, &spec_dst, &itag);
1929         if (err < 0) {
1930                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1931                                          saddr);
1932
1933                 err = -EINVAL;
1934                 goto cleanup;
1935         }
1936
1937         if (err)
1938                 flags |= RTCF_DIRECTSRC;
1939
             /* Packet would go back out the interface it arrived on. */
1940         if (out_dev == in_dev && err &&
1941             (IN_DEV_SHARED_MEDIA(out_dev) ||
1942              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1943                 flags |= RTCF_DOREDIRECT;
1944
1945         if (skb->protocol != htons(ETH_P_IP)) {
1946                 /* Not IP (i.e. ARP). Do not create route, if it is
1947                  * invalid for proxy arp. DNAT routes are always valid.
1948                  */
1949                 if (out_dev == in_dev) {
1950                         err = -EINVAL;
1951                         goto cleanup;
1952                 }
1953         }
1954
1955
1956         rth = dst_alloc(&ipv4_dst_ops);
1957         if (!rth) {
1958                 err = -ENOBUFS;
1959                 goto cleanup;
1960         }
1961
1962         atomic_set(&rth->u.dst.__refcnt, 1);
1963         rth->u.dst.flags= DST_HOST;
1964         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1965                 rth->u.dst.flags |= DST_NOPOLICY;
1966         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1967                 rth->u.dst.flags |= DST_NOXFRM;
1968         rth->fl.fl4_dst = daddr;
1969         rth->rt_dst     = daddr;
1970         rth->fl.fl4_tos = tos;
1971         rth->fl.mark    = skb->mark;
1972         rth->fl.fl4_src = saddr;
1973         rth->rt_src     = saddr;
1974         rth->rt_gateway = daddr;
1975         rth->rt_iif     =
1976                 rth->fl.iif     = in_dev->dev->ifindex;
             /* The entry takes its own references on the device and its in_device. */
1977         rth->u.dst.dev  = (out_dev)->dev;
1978         dev_hold(rth->u.dst.dev);
1979         rth->idev       = in_dev_get(rth->u.dst.dev);
1980         rth->fl.oif     = 0;
1981         rth->rt_spec_dst= spec_dst;
1982
1983         rth->u.dst.input = ip_forward;
1984         rth->u.dst.output = ip_output;
1985         rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
1986
1987         rt_set_nexthop(rth, res, itag);
1988
1989         rth->rt_flags = flags;
1990
1991         *result = rth;
1992         err = 0;
1993  cleanup:
1994         /* release the working reference to the output device */
1995         in_dev_put(out_dev);
1996         return err;
1997 }
1998
1999 static int ip_mkroute_input(struct sk_buff *skb,
2000                             struct fib_result *res,
2001                             const struct flowi *fl,
2002                             struct in_device *in_dev,
2003                             __be32 daddr, __be32 saddr, u32 tos)
2004 {
2005         struct rtable* rth = NULL;
2006         int err;
2007         unsigned hash;
2008
2009 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2010         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2011                 fib_select_multipath(fl, res);
2012 #endif
2013
2014         /* create a routing cache entry */
2015         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2016         if (err)
2017                 return err;
2018
2019         /* put it into the cache */
2020         hash = rt_hash(daddr, saddr, fl->iif,
2021                        rt_genid(dev_net(rth->u.dst.dev)));
2022         return rt_intern_hash(hash, rth, &skb->rtable);
2023 }
2024
2025 /*
2026  *      NOTE. We drop all the packets that have local source
2027  *      addresses, because every properly looped back packet
2028  *      must have correct destination already attached by output routine.
2029  *
2030  *      Such approach solves two big problems:
2031  *      1. Not simplex devices are handled properly.
2032  *      2. IP spoofing attempts are filtered with 100% of guarantee.
2033  */
2034
/*
 *      Resolve the input route for a packet without using the cache fast
 *      path: classify the addresses, consult the FIB and create a cache
 *      entry for local delivery, broadcast, forwarding or an unreachable
 *      destination.
 *
 *      Returns -EINVAL when dev has no in_device, for a martian source or
 *      a non-IP broadcast, -EHOSTUNREACH for a martian destination or when
 *      forwarding is needed but disabled, -ENOBUFS when no entry can be
 *      allocated, otherwise the result of ip_mkroute_input() or
 *      rt_intern_hash().
 */
2035 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2036                                u8 tos, struct net_device *dev)
2037 {
2038         struct fib_result res;
2039         struct in_device *in_dev = in_dev_get(dev);
2040         struct flowi fl = { .nl_u = { .ip4_u =
2041                                       { .daddr = daddr,
2042                                         .saddr = saddr,
2043                                         .tos = tos,
2044                                         .scope = RT_SCOPE_UNIVERSE,
2045                                       } },
2046                             .mark = skb->mark,
2047                             .iif = dev->ifindex };
2048         unsigned        flags = 0;
2049         u32             itag = 0;
2050         struct rtable * rth;
2051         unsigned        hash;
2052         __be32          spec_dst;
2053         int             err = -EINVAL;
2054         int             free_res = 0;
2055         struct net    * net = dev_net(dev);
2056
2057         /* IP on this device is disabled. */
2058
2059         if (!in_dev)
2060                 goto out;
2061
2062         /* Check for the most weird martians, which can be not detected
2063            by fib_lookup.
2064          */
2065
2066         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2067             ipv4_is_loopback(saddr))
2068                 goto martian_source;
2069
             /* Limited broadcast: handled without a FIB lookup, res stays unset. */
2070         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
2071                 goto brd_input;
2072
2073         /* Accept zero addresses only to limited broadcast;
2074          * I even do not know to fix it or not. Waiting for complains :-)
2075          */
2076         if (ipv4_is_zeronet(saddr))
2077                 goto martian_source;
2078
2079         if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
2080             ipv4_is_loopback(daddr))
2081                 goto martian_destination;
2082
2083         /*
2084          *      Now we are ready to route packet.
2085          */
2086         if ((err = fib_lookup(net, &fl, &res)) != 0) {
2087                 if (!IN_DEV_FORWARD(in_dev))
2088                         goto e_hostunreach;
2089                 goto no_route;
2090         }
             /* From here on res holds a reference that "done" must drop. */
2091         free_res = 1;
2092
2093         RT_CACHE_STAT_INC(in_slow_tot);
2094
2095         if (res.type == RTN_BROADCAST)
2096                 goto brd_input;
2097
2098         if (res.type == RTN_LOCAL) {
2099                 int result;
2100                 result = fib_validate_source(saddr, daddr, tos,
2101                                              net->loopback_dev->ifindex,
2102                                              dev, &spec_dst, &itag);
2103                 if (result < 0)
2104                         goto martian_source;
2105                 if (result)
2106                         flags |= RTCF_DIRECTSRC;
2107                 spec_dst = daddr;
2108                 goto local_input;
2109         }
2110
2111         if (!IN_DEV_FORWARD(in_dev))
2112                 goto e_hostunreach;
2113         if (res.type != RTN_UNICAST)
2114                 goto martian_destination;
2115
2116         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
     /* Common exit: release the in_device and, if taken, the FIB result. */
2117 done:
2118         in_dev_put(in_dev);
2119         if (free_res)
2120                 fib_res_put(&res);
2121 out:    return err;
2122
2123 brd_input:
2124         if (skb->protocol != htons(ETH_P_IP))
2125                 goto e_inval;
2126
2127         if (ipv4_is_zeronet(saddr))
2128                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2129         else {
2130                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2131                                           &itag);
2132                 if (err < 0)
2133                         goto martian_source;
2134                 if (err)
2135                         flags |= RTCF_DIRECTSRC;
2136         }
2137         flags |= RTCF_BROADCAST;
2138         res.type = RTN_BROADCAST;
2139         RT_CACHE_STAT_INC(in_brd);
2140
     /*
      * Shared by local delivery, broadcast (falls through from above) and
      * unreachable destinations (jumps in from no_route).
      */
2141 local_input:
2142         rth = dst_alloc(&ipv4_dst_ops);
2143         if (!rth)
2144                 goto e_nobufs;
2145
2146         rth->u.dst.output= ip_rt_bug;
2147         rth->rt_genid = rt_genid(net);
2148
2149         atomic_set(&rth->u.dst.__refcnt, 1);
2150         rth->u.dst.flags= DST_HOST;
2151         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2152                 rth->u.dst.flags |= DST_NOPOLICY;
2153         rth->fl.fl4_dst = daddr;
2154         rth->rt_dst     = daddr;
2155         rth->fl.fl4_tos = tos;
2156         rth->fl.mark    = skb->mark;
2157         rth->fl.fl4_src = saddr;
2158         rth->rt_src     = saddr;
2159 #ifdef CONFIG_NET_CLS_ROUTE
2160         rth->u.dst.tclassid = itag;
2161 #endif
2162         rth->rt_iif     =
2163         rth->fl.iif     = dev->ifindex;
2164         rth->u.dst.dev  = net->loopback_dev;
2165         dev_hold(rth->u.dst.dev);
2166         rth->idev       = in_dev_get(rth->u.dst.dev);
2167         rth->rt_gateway = daddr;
2168         rth->rt_spec_dst= spec_dst;
2169         rth->u.dst.input= ip_local_deliver;
2170         rth->rt_flags   = flags|RTCF_LOCAL;
             /* Unreachable: deliver to ip_error with the positive errno stored. */
2171         if (res.type == RTN_UNREACHABLE) {
2172                 rth->u.dst.input= ip_error;
2173                 rth->u.dst.error= -err;
2174                 rth->rt_flags   &= ~RTCF_LOCAL;
2175         }
2176         rth->rt_type    = res.type;
2177         hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2178         err = rt_intern_hash(hash, rth, &skb->rtable);
2179         goto done;
2180
2181 no_route:
2182         RT_CACHE_STAT_INC(in_no_route);
2183         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2184         res.type = RTN_UNREACHABLE;
2185         if (err == -ESRCH)
2186                 err = -ENETUNREACH;
2187         goto local_input;
2188
2189         /*
2190          *      Do not cache martian addresses: they should be logged (RFC1812)
2191          */
2192 martian_destination:
2193         RT_CACHE_STAT_INC(in_martian_dst);
2194 #ifdef CONFIG_IP_ROUTE_VERBOSE
2195         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2196                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2197                         &daddr, &saddr, dev->name);
2198 #endif
2199
2200 e_hostunreach:
2201         err = -EHOSTUNREACH;
2202         goto done;
2203
2204 e_inval:
2205         err = -EINVAL;
2206         goto done;
2207
2208 e_nobufs:
2209         err = -ENOBUFS;
2210         goto done;
2211
2212 martian_source:
2213         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2214         goto e_inval;
2215 }
2216
2217 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2218                    u8 tos, struct net_device *dev)
2219 {
2220         struct rtable * rth;
2221         unsigned        hash;
2222         int iif = dev->ifindex;
2223         struct net *net;
2224
2225         net = dev_net(dev);
2226
2227         if (!rt_caching(net))
2228                 goto skip_cache;
2229
2230         tos &= IPTOS_RT_MASK;
2231         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2232
2233         rcu_read_lock();
2234         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2235              rth = rcu_dereference(rth->u.dst.rt_next)) {
2236                 if (((rth->fl.fl4_dst ^ daddr) |
2237                      (rth->fl.fl4_src ^ saddr) |
2238                      (rth->fl.iif ^ iif) |
2239                      rth->fl.oif |
2240                      (rth->fl.fl4_tos ^ tos)) == 0 &&
2241                     rth->fl.mark == skb->mark &&
2242                     net_eq(dev_net(rth->u.dst.dev), net) &&
2243                     !rt_is_expired(rth)) {
2244                         dst_use(&rth->u.dst, jiffies);
2245                         RT_CACHE_STAT_INC(in_hit);
2246                         rcu_read_unlock();
2247                         skb->rtable = rth;
2248                         return 0;
2249                 }
2250                 RT_CACHE_STAT_INC(in_hlist_search);
2251         }
2252         rcu_read_unlock();
2253
2254 skip_cache:
2255         /* Multicast recognition logic is moved from route cache to here.
2256            The problem was that too many Ethernet cards have broken/missing
2257            hardware multicast filters :-( As result the host on multicasting
2258            network acquires a lot of useless route cache entries, sort of
2259            SDR messages from all the world. Now we try to get rid of them.
2260            Really, provided software IP multicast filter is organized
2261            reasonably (at least, hashed), it does not result in a slowdown
2262            comparing with route cache reject entries.
2263            Note, that multicast routers are not affected, because
2264            route cache entry is created eventually.
2265          */
2266         if (ipv4_is_multicast(daddr)) {
2267                 struct in_device *in_dev;
2268
2269                 rcu_read_lock();
2270                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2271                         int our = ip_check_mc(in_dev, daddr, saddr,
2272                                 ip_hdr(skb)->protocol);
2273                         if (our
2274 #ifdef CONFIG_IP_MROUTE
2275                             || (!ipv4_is_local_multicast(daddr) &&
2276                                 IN_DEV_MFORWARD(in_dev))
2277 #endif
2278                             ) {
2279                                 rcu_read_unlock();
2280                                 return ip_route_input_mc(skb, daddr, saddr,
2281                                                          tos, dev, our);
2282                         }
2283                 }
2284                 rcu_read_unlock();
2285                 return -EINVAL;
2286         }
2287         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2288 }
2289
/*
 *      Allocate and fill a routing cache entry for an output route through
 *      dev_out.  fl is the resolved flow, oldflp the flow as requested by
 *      the caller; the cache keys are taken from oldflp.  The entry is
 *      returned through *result and is not inserted into the hash table
 *      here.
 *
 *      Returns 0 on success, -EINVAL for a loopback source on a
 *      non-loopback device, for a destination rejected by the address
 *      checks below or when dev_out has no in_device, and -ENOBUFS when no
 *      entry can be allocated.
 */
2290 static int __mkroute_output(struct rtable **result,
2291                             struct fib_result *res,
2292                             const struct flowi *fl,
2293                             const struct flowi *oldflp,
2294                             struct net_device *dev_out,
2295                             unsigned flags)
2296 {
2297         struct rtable *rth;
2298         struct in_device *in_dev;
2299         u32 tos = RT_FL_TOS(oldflp);
2300         int err = 0;
2301
2302         if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2303                 return -EINVAL;
2304
             /* The destination address overrides the route type from the FIB. */
2305         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2306                 res->type = RTN_BROADCAST;
2307         else if (ipv4_is_multicast(fl->fl4_dst))
2308                 res->type = RTN_MULTICAST;
2309         else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2310                 return -EINVAL;
2311
2312         if (dev_out->flags & IFF_LOOPBACK)
2313                 flags |= RTCF_LOCAL;
2314
2315         /* get work reference to inet device */
2316         in_dev = in_dev_get(dev_out);
2317         if (!in_dev)
2318                 return -EINVAL;
2319
2320         if (res->type == RTN_BROADCAST) {
2321                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
                     /* Drop the fib_info so rt_set_nexthop() applies defaults only. */
2322                 if (res->fi) {
2323                         fib_info_put(res->fi);
2324                         res->fi = NULL;
2325                 }
2326         } else if (res->type == RTN_MULTICAST) {
2327                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
                     /* Keep RTCF_LOCAL only when ip_check_mc() reports a match. */
2328                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2329                                  oldflp->proto))
2330                         flags &= ~RTCF_LOCAL;
2331                 /* If multicast route do not exist use
2332                    default one, but do not gateway in this case.
2333                    Yes, it is hack.
2334                  */
2335                 if (res->fi && res->prefixlen < 4) {
2336                         fib_info_put(res->fi);
2337                         res->fi = NULL;
2338                 }
2339         }
2340
2341
2342         rth = dst_alloc(&ipv4_dst_ops);
2343         if (!rth) {
2344                 err = -ENOBUFS;
2345                 goto cleanup;
2346         }
2347
2348         atomic_set(&rth->u.dst.__refcnt, 1);
2349         rth->u.dst.flags= DST_HOST;
2350         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2351                 rth->u.dst.flags |= DST_NOXFRM;
2352         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2353                 rth->u.dst.flags |= DST_NOPOLICY;
2354
             /* Lookup keys come from the original request, addresses from fl. */
2355         rth->fl.fl4_dst = oldflp->fl4_dst;
2356         rth->fl.fl4_tos = tos;
2357         rth->fl.fl4_src = oldflp->fl4_src;
2358         rth->fl.oif     = oldflp->oif;
2359         rth->fl.mark    = oldflp->mark;
2360         rth->rt_dst     = fl->fl4_dst;
2361         rth->rt_src     = fl->fl4_src;
2362         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2363         /* get references to the devices that are to be hold by the routing
2364            cache entry */
2365         rth->u.dst.dev  = dev_out;
2366         dev_hold(dev_out);
2367         rth->idev       = in_dev_get(dev_out);
2368         rth->rt_gateway = fl->fl4_dst;
2369         rth->rt_spec_dst= fl->fl4_src;
2370
2371         rth->u.dst.output=ip_output;
2372         rth->rt_genid = rt_genid(dev_net(dev_out));
2373
2374         RT_CACHE_STAT_INC(out_slow_tot);
2375
2376         if (flags & RTCF_LOCAL) {
2377                 rth->u.dst.input = ip_local_deliver;
2378                 rth->rt_spec_dst = fl->fl4_dst;
2379         }
2380         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2381                 rth->rt_spec_dst = fl->fl4_src;
2382                 if (flags & RTCF_LOCAL &&
2383                     !(dev_out->flags & IFF_LOOPBACK)) {
2384                         rth->u.dst.output = ip_mc_output;
2385                         RT_CACHE_STAT_INC(out_slow_mc);
2386                 }
2387 #ifdef CONFIG_IP_MROUTE
2388                 if (res->type == RTN_MULTICAST) {
2389                         if (IN_DEV_MFORWARD(in_dev) &&
2390                             !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2391                                 rth->u.dst.input = ip_mr_input;
2392                                 rth->u.dst.output = ip_mc_output;
2393                         }
2394                 }
2395 #endif
2396         }
2397
2398         rt_set_nexthop(rth, res, 0);
2399
2400         rth->rt_flags = flags;
2401
2402         *result = rth;
2403  cleanup:
2404         /* release work reference to inet device */
2405         in_dev_put(in_dev);
2406
2407         return err;
2408 }
2409
/*
 * Create an output route with __mkroute_output() and, if that succeeds,
 * pass it to rt_intern_hash() together with rp.
 *
 * The hash is derived from the caller's original key (oldflp destination,
 * source and oif) and the generation id of dev_out's namespace, not from
 * the resolved key fl.
 *
 * Returns 0 on success, otherwise the error reported by
 * __mkroute_output() or rt_intern_hash().
 */
2410 static int ip_mkroute_output(struct rtable **rp,
2411                              struct fib_result *res,
2412                              const struct flowi *fl,
2413                              const struct flowi *oldflp,
2414                              struct net_device *dev_out,
2415                              unsigned flags)
2416 {
2417         struct rtable *rth = NULL;
2418         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2419         unsigned hash;
2420         if (err == 0) {
2421                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2422                                rt_genid(dev_net(dev_out)));
2423                 err = rt_intern_hash(hash, rth, rp);
2424         }
2425
2426         return err;
2427 }
2428
2429 /*
2430  * Major route resolver routine.
2431  */
2432
/*
 * Slow-path resolution of an output route for the key oldflp in namespace
 * net.
 *
 * A working copy of the key (fl) is built with the TOS masked by
 * IPTOS_RT_MASK, the scope chosen from the RTO_ONLINK bit and iif set to
 * the loopback device.  The function then:
 *   - rejects multicast, limited-broadcast and zeronet source addresses
 *     with -EINVAL;
 *   - picks the output device from the source address (multicast or
 *     limited-broadcast destination without oif) or from oldflp->oif;
 *   - sends a flow without destination to the loopback device as RTN_LOCAL;
 *   - otherwise calls fib_lookup(); when that fails but an oif was given
 *     the destination is assumed to be on-link, without oif the result is
 *     -ENETUNREACH;
 *   - finally hands the chosen device, key and flags to
 *     ip_mkroute_output().
 *
 * Every dev_out reference taken here is dropped again before returning,
 * and res is released with fib_res_put() once fib_lookup() has succeeded.
 *
 * Returns 0 on success or a negative errno.
 */
2433 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2434                                 const struct flowi *oldflp)
2435 {
2436         u32 tos = RT_FL_TOS(oldflp);
2437         struct flowi fl = { .nl_u = { .ip4_u =
2438                                       { .daddr = oldflp->fl4_dst,
2439                                         .saddr = oldflp->fl4_src,
2440                                         .tos = tos & IPTOS_RT_MASK,
2441                                         .scope = ((tos & RTO_ONLINK) ?
2442                                                   RT_SCOPE_LINK :
2443                                                   RT_SCOPE_UNIVERSE),
2444                                       } },
2445                             .mark = oldflp->mark,
2446                             .iif = net->loopback_dev->ifindex,
2447                             .oif = oldflp->oif };
2448         struct fib_result res;
2449         unsigned flags = 0;
2450         struct net_device *dev_out = NULL;
             /* Set once fib_lookup() succeeded and res must be released. */
2451         int free_res = 0;
2452         int err;
2453
2454
2455         res.fi          = NULL;
2456 #ifdef CONFIG_IP_MULTIPLE_TABLES
2457         res.r           = NULL;
2458 #endif
2459
             /* A source address was supplied: validate it first. */
2460         if (oldflp->fl4_src) {
2461                 err = -EINVAL;
2462                 if (ipv4_is_multicast(oldflp->fl4_src) ||
2463                     ipv4_is_lbcast(oldflp->fl4_src) ||
2464                     ipv4_is_zeronet(oldflp->fl4_src))
2465                         goto out;
2466
2467                 /* I removed check for oif == dev_out->oif here.
2468                    It was wrong for two reasons:
2469                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2470                       is assigned to multiple interfaces.
2471                    2. Moreover, we are allowed to send packets with saddr
2472                       of another iface. --ANK
2473                  */
2474
2475                 if (oldflp->oif == 0
2476                     && (ipv4_is_multicast(oldflp->fl4_dst) ||
2477                         oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2478                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2479                         dev_out = ip_dev_find(net, oldflp->fl4_src);
2480                         if (dev_out == NULL)
2481                                 goto out;
2482
2483                         /* Special hack: user can direct multicasts
2484                            and limited broadcast via necessary interface
2485                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2486                            This hack is not just for fun, it allows
2487                            vic,vat and friends to work.
2488                            They bind socket to loopback, set ttl to zero
2489                            and expect that it will work.
2490                            From the viewpoint of routing cache they are broken,
2491                            because we are not allowed to build multicast path
2492                            with loopback source addr (look, routing cache
2493                            cannot know, that ttl is zero, so that packet
2494                            will not leave this host and route is valid).
2495                            Luckily, this hack is good workaround.
2496                          */
2497
2498                         fl.oif = dev_out->ifindex;
2499                         goto make_route;
2500                 }
2501
2502                 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2503                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2504                         dev_out = ip_dev_find(net, oldflp->fl4_src);
2505                         if (dev_out == NULL)
2506                                 goto out;
                             /* Only the existence of the address was checked;
                                the device itself is not kept. */
2507                         dev_put(dev_out);
2508                         dev_out = NULL;
2509                 }
2510         }
2511
2512
             /* An explicit output interface was requested. */
2513         if (oldflp->oif) {
2514                 dev_out = dev_get_by_index(net, oldflp->oif);
2515                 err = -ENODEV;
2516                 if (dev_out == NULL)
2517                         goto out;
2518
2519                 /* RACE: Check return value of inet_select_addr instead. */
2520                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2521                         dev_put(dev_out);
2522                         goto out;       /* Wrong error code */
2523                 }
2524
2525                 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2526                     oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2527                         if (!fl.fl4_src)
2528                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2529                                                               RT_SCOPE_LINK);
2530                         goto make_route;
2531                 }
2532                 if (!fl.fl4_src) {
2533                         if (ipv4_is_multicast(oldflp->fl4_dst))
2534                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2535                                                               fl.fl4_scope);
2536                         else if (!oldflp->fl4_dst)
2537                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2538                                                               RT_SCOPE_HOST);
2539                 }
2540         }
2541
             /* No destination: use the source address, or 127.0.0.1 for both
                when no source is known either, and route via loopback. */
2542         if (!fl.fl4_dst) {
2543                 fl.fl4_dst = fl.fl4_src;
2544                 if (!fl.fl4_dst)
2545                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2546                 if (dev_out)
2547                         dev_put(dev_out);
2548                 dev_out = net->loopback_dev;
2549                 dev_hold(dev_out);
2550                 fl.oif = net->loopback_dev->ifindex;
2551                 res.type = RTN_LOCAL;
2552                 flags |= RTCF_LOCAL;
2553                 goto make_route;
2554         }
2555
2556         if (fib_lookup(net, &fl, &res)) {
2557                 res.fi = NULL;
2558                 if (oldflp->oif) {
2559                         /* Apparently, routing tables are wrong. Assume,
2560                            that the destination is on link.
2561
2562                            WHY? DW.
2563                            Because we are allowed to send to iface
2564                            even if it has NO routes and NO assigned
2565                            addresses. When oif is specified, routing
2566                            tables are looked up with only one purpose:
2567                            to catch if destination is gatewayed, rather than
2568                            direct. Moreover, if MSG_DONTROUTE is set,
2569                            we send packet, ignoring both routing tables
2570                            and ifaddr state. --ANK
2571
2572
2573                            We could make it even if oif is unknown,
2574                            likely IPv6, but we do not.
2575                          */
2576
2577                         if (fl.fl4_src == 0)
2578                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2579                                                               RT_SCOPE_LINK);
2580                         res.type = RTN_UNICAST;
2581                         goto make_route;
2582                 }
2583                 if (dev_out)
2584                         dev_put(dev_out);
2585                 err = -ENETUNREACH;
2586                 goto out;
2587         }
2588         free_res = 1;
2589
             /* Destination is local: deliver through the loopback device and
                drop the fib_info, which is not used for such routes here. */
2590         if (res.type == RTN_LOCAL) {
2591                 if (!fl.fl4_src)
2592                         fl.fl4_src = fl.fl4_dst;
2593                 if (dev_out)
2594                         dev_put(dev_out);
2595                 dev_out = net->loopback_dev;
2596                 dev_hold(dev_out);
2597                 fl.oif = dev_out->ifindex;
2598                 if (res.fi)
2599                         fib_info_put(res.fi);
2600                 res.fi = NULL;
2601                 flags |= RTCF_LOCAL;
2602                 goto make_route;
2603         }
2604
2605 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2606         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2607                 fib_select_multipath(&fl, &res);
2608         else
2609 #endif
2610         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2611                 fib_select_default(net, &fl, &res);
2612
2613         if (!fl.fl4_src)
2614                 fl.fl4_src = FIB_RES_PREFSRC(res);
2615
             /* Replace whatever device was held so far with the device of the
                selected FIB result. */
2616         if (dev_out)
2617                 dev_put(dev_out);
2618         dev_out = FIB_RES_DEV(res);
2619         dev_hold(dev_out);
2620         fl.oif = dev_out->ifindex;
2621
2622
2623 make_route:
2624         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2625
2626
2627         if (free_res)
2628                 fib_res_put(&res);
2629         if (dev_out)
2630                 dev_put(dev_out);
2631 out:    return err;
2632 }
2633
/*
 * Look up an output route for flp, trying the route cache first.
 *
 * When rt_caching(net) is false the cache is skipped.  Otherwise the hash
 * chain selected by (dst, src, oif, generation id) is walked under
 * rcu_read_lock_bh().  An entry matches when destination, source, oif and
 * mark are equal, its iif is 0, the TOS agrees in the
 * IPTOS_RT_MASK | RTO_ONLINK bits, its device belongs to net and it is not
 * expired.  A hit is marked used with dst_use(), stored in *rp and 0 is
 * returned.
 *
 * On a miss (or with caching disabled) the result of
 * ip_route_output_slow() is returned.
 */
2634 int __ip_route_output_key(struct net *net, struct rtable **rp,
2635                           const struct flowi *flp)
2636 {
2637         unsigned hash;
2638         struct rtable *rth;
2639
2640         if (!rt_caching(net))
2641                 goto slow_output;
2642
2643         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2644
2645         rcu_read_lock_bh();
2646         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2647                 rth = rcu_dereference(rth->u.dst.rt_next)) {
2648                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2649                     rth->fl.fl4_src == flp->fl4_src &&
2650                     rth->fl.iif == 0 &&
2651                     rth->fl.oif == flp->oif &&
2652                     rth->fl.mark == flp->mark &&
2653                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2654                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2655                     net_eq(dev_net(rth->u.dst.dev), net) &&
2656                     !rt_is_expired(rth)) {
2657                         dst_use(&rth->u.dst, jiffies);
2658                         RT_CACHE_STAT_INC(out_hit);
2659                         rcu_read_unlock_bh();
2660                         *rp = rth;
2661                         return 0;
2662                 }
                     /* Counts every chain entry inspected without a match. */
2663                 RT_CACHE_STAT_INC(out_hlist_search);
2664         }
2665         rcu_read_unlock_bh();
2666
2667 slow_output:
2668         return ip_route_output_slow(net, rp, flp);
2669 }
2670
2671 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2672
/*
 * update_pmtu handler installed in ipv4_dst_blackhole_ops.  The body is
 * intentionally empty: both arguments are ignored.
 */
2673 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2674 {
2675 }
2676
/*
 * dst_ops used for the entries allocated by ipv4_dst_blackhole().  It
 * reuses ipv4_dst_destroy and ipv4_dst_check, installs the no-op
 * ipv4_rt_blackhole_update_pmtu, and starts with an entry count of 0.
 */
2677 static struct dst_ops ipv4_dst_blackhole_ops = {
2678         .family                 =       AF_INET,
2679         .protocol               =       __constant_htons(ETH_P_IP),
2680         .destroy                =       ipv4_dst_destroy,
2681         .check                  =       ipv4_dst_check,
2682         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2683         .entries                =       ATOMIC_INIT(0),
2684 };
2685
2686
/*
 * Replace the route in *rp with a "blackhole" copy whose input and output
 * handlers are both dst_discard.
 *
 * The copy is allocated from ipv4_dst_blackhole_ops with a reference count
 * of 1.  It takes over the metrics, flow key, flags, type and addressing
 * fields of the original, and takes its own references on the device, the
 * inet device and the inet peer where those are set.  The generation id is
 * taken from net.  flp is not used.
 *
 * The caller's reference on the original route is released in every case.
 * Returns 0 with *rp pointing to the copy, or -ENOMEM with *rp set to NULL.
 */
2687 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2688 {
2689         struct rtable *ort = *rp;
2690         struct rtable *rt = (struct rtable *)
2691                 dst_alloc(&ipv4_dst_blackhole_ops);
2692
2693         if (rt) {
2694                 struct dst_entry *new = &rt->u.dst;
2695
2696                 atomic_set(&new->__refcnt, 1);
2697                 new->__use = 1;
2698                 new->input = dst_discard;
2699                 new->output = dst_discard;
2700                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2701
2702                 new->dev = ort->u.dst.dev;
2703                 if (new->dev)
2704                         dev_hold(new->dev);
2705
2706                 rt->fl = ort->fl;
2707
2708                 rt->idev = ort->idev;
2709                 if (rt->idev)
2710                         in_dev_hold(rt->idev);
2711                 rt->rt_genid = rt_genid(net);
2712                 rt->rt_flags = ort->rt_flags;
2713                 rt->rt_type = ort->rt_type;
2714                 rt->rt_dst = ort->rt_dst;
2715                 rt->rt_src = ort->rt_src;
2716                 rt->rt_iif = ort->rt_iif;
2717                 rt->rt_gateway = ort->rt_gateway;
2718                 rt->rt_spec_dst = ort->rt_spec_dst;
2719                 rt->peer = ort->peer;
2720                 if (rt->peer)
2721                         atomic_inc(&rt->peer->refcnt);
2722
                     /* NOTE(review): dst_free() is called while the entry still
                        has a reference count of 1 and is returned to the caller;
                        presumably this only queues it for reclaim once the last
                        reference is dropped -- confirm against dst_free(). */
2723                 dst_free(new);
2724         }
2725
             /* Drop the original route whether or not the copy was made. */
2726         dst_release(&(*rp)->u.dst);
2727         *rp = rt;
2728         return (rt ? 0 : -ENOMEM);
2729 }
2730
/*
 * Resolve an output route for flp and, when flp->proto is set, run the
 * result through __xfrm_lookup().
 *
 * Before the xfrm lookup, an unset source or destination in flp is filled
 * in from the resolved route (rt_src / rt_dst), so flp may be modified.
 * A non-zero flags argument selects XFRM_LOOKUP_WAIT.  When the xfrm
 * lookup returns -EREMOTE the route in *rp is replaced through
 * ipv4_dst_blackhole() and that function's result is returned.
 *
 * Returns 0 on success or a negative errno from the route or xfrm lookup.
 */
2731 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2732                          struct sock *sk, int flags)
2733 {
2734         int err;
2735
2736         if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2737                 return err;
2738
2739         if (flp->proto) {
2740                 if (!flp->fl4_src)
2741                         flp->fl4_src = (*rp)->rt_src;
2742                 if (!flp->fl4_dst)
2743                         flp->fl4_dst = (*rp)->rt_dst;
2744                 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2745                                     flags ? XFRM_LOOKUP_WAIT : 0);
2746                 if (err == -EREMOTE)
2747                         err = ipv4_dst_blackhole(net, rp, flp);
2748
2749                 return err;
2750         }
2751
2752         return 0;
2753 }
2754
2755 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2756
/*
 * Convenience wrapper: ip_route_output_flow() without a socket (sk == NULL)
 * and with flags == 0.  Returns that function's result.
 */
2757 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2758 {
2759         return ip_route_output_flow(net, rp, flp, NULL, 0);
2760 }
2761
/*
 * Append a netlink route message describing skb->rtable to skb.
 *
 * @skb:    message buffer; its rtable pointer names the route to describe
 * @pid:    netlink pid passed to nlmsg_put()
 * @seq:    netlink sequence number passed to nlmsg_put()
 * @event:  netlink message type passed to nlmsg_put()
 * @nowait: forwarded to ipmr_get_route(); also selects how its errors are
 *          handled (see below)
 * @flags:  netlink message flags passed to nlmsg_put()
 *
 * The message always reports table RT_TABLE_MAIN, a destination prefix
 * length of 32 and RTM_F_CLONED.  RTA_SRC, RTA_OIF, RTA_FLOW, RTA_PREFSRC,
 * RTA_GATEWAY and RTA_IIF are added only when the corresponding route
 * fields call for them; metrics and cache info are always attempted.
 *
 * Returns the value of nlmsg_end() on success.  Returns -EMSGSIZE if the
 * header cannot be added, or after cancelling the partial message on
 * nla_put_failure.  With CONFIG_IP_MROUTE and !nowait it may return 0
 * straight after ipmr_get_route() returned 0.
 *
 * NOTE(review): the NLA_PUT* macros are expected to jump to
 * nla_put_failure when the attribute does not fit -- confirm against their
 * definition.
 */
2762 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2763                         int nowait, unsigned int flags)
2764 {
2765         struct rtable *rt = skb->rtable;
2766         struct rtmsg *r;
2767         struct nlmsghdr *nlh;
2768         long expires;
2769         u32 id = 0, ts = 0, tsage = 0, error;
2770
2771         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2772         if (nlh == NULL)
2773                 return -EMSGSIZE;
2774
2775         r = nlmsg_data(nlh);
2776         r->rtm_family    = AF_INET;
2777         r->rtm_dst_len  = 32;
2778         r->rtm_src_len  = 0;
2779         r->rtm_tos      = rt->fl.fl4_tos;
2780         r->rtm_table    = RT_TABLE_MAIN;
2781         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2782         r->rtm_type     = rt->rt_type;
2783         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2784         r->rtm_protocol = RTPROT_UNSPEC;
2785         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2786         if (rt->rt_flags & RTCF_NOTIFY)
2787                 r->rtm_flags |= RTM_F_NOTIFY;
2788
2789         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2790
2791         if (rt->fl.fl4_src) {
2792                 r->rtm_src_len = 32;
2793                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2794         }
2795         if (rt->u.dst.dev)
2796                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2797 #ifdef CONFIG_NET_CLS_ROUTE
2798         if (rt->u.dst.tclassid)
2799                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2800 #endif
             /* Routes with an input interface report rt_spec_dst; other routes
                report rt_src, and only when it differs from the key's source. */
2801         if (rt->fl.iif)
2802                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2803         else if (rt->rt_src != rt->fl.fl4_src)
2804                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2805
2806         if (rt->rt_dst != rt->rt_gateway)
2807                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2808
2809         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2810                 goto nla_put_failure;
2811
2812         error = rt->u.dst.error;
             /* Remaining lifetime relative to now, or 0 when no expiry is set. */
2813         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2814         if (rt->peer) {
2815                 id = rt->peer->ip_id_count;
2816                 if (rt->peer->tcp_ts_stamp) {
2817                         ts = rt->peer->tcp_ts;
2818                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2819                 }
2820         }
2821
2822         if (rt->fl.iif) {
2823 #ifdef CONFIG_IP_MROUTE
2824                 __be32 dst = rt->rt_dst;
2825
2826                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2827                     IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
2828                         int err = ipmr_get_route(skb, r, nowait);
                             /* With !nowait a result of 0 ends the function at
                                once and a negative one aborts the message; with
                                nowait only -EMSGSIZE aborts, any other result
                                <= 0 is reported through the cache info. */
2829                         if (err <= 0) {
2830                                 if (!nowait) {
2831                                         if (err == 0)
2832                                                 return 0;
2833                                         goto nla_put_failure;
2834                                 } else {
2835                                         if (err == -EMSGSIZE)
2836                                                 goto nla_put_failure;
2837                                         error = err;
2838                                 }
2839                         }
2840                 } else
2841 #endif
2842                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2843         }
2844
2845         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2846                                expires, error) < 0)
2847                 goto nla_put_failure;
2848
2849         return nlmsg_end(skb, nlh);
2850
2851 nla_put_failure:
2852         nlmsg_cancel(skb, nlh);
2853         return -EMSGSIZE;
2854 }
2855
/*
 * Netlink handler that answers a route query.
 *
 * The request attributes are parsed against rtm_ipv4_policy and a reply skb
 * is allocated.  With RTA_IIF present the route is resolved as if a packet
 * had arrived on that interface (ip_route_input(), called with bottom
 * halves disabled); a non-zero dst.error on the resulting route is turned
 * into the return value.  Without RTA_IIF an output lookup is done with
 * ip_route_output_key(), using RTA_OIF if given.
 *
 * The resolved route is attached to the reply skb, described with
 * rt_fill_info() as RTM_NEWROUTE and unicast back to the requester.
 *
 * Returns the result of rtnl_unicast() on success.  On failure the reply
 * skb (if allocated) is freed and a negative errno is returned: the
 * nlmsg_parse() error, -ENOBUFS when the skb cannot be allocated, -ENODEV
 * for an unknown input interface, or the lookup / fill error.  A result of
 * 0 from rt_fill_info() also frees the skb and returns 0.
 */
2856 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2857 {
2858         struct net *net = sock_net(in_skb->sk);
2859         struct rtmsg *rtm;
2860         struct nlattr *tb[RTA_MAX+1];
2861         struct rtable *rt = NULL;
2862         __be32 dst = 0;
2863         __be32 src = 0;
2864         u32 iif;
2865         int err;
2866         struct sk_buff *skb;
2867
2868         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2869         if (err < 0)
2870                 goto errout;
2871
2872         rtm = nlmsg_data(nlh);
2873
2874         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2875         if (skb == NULL) {
2876                 err = -ENOBUFS;
2877                 goto errout;
2878         }
2879
2880         /* Reserve room for dummy headers, this skb can pass
2881            through good chunk of routing engine.
2882          */
2883         skb_reset_mac_header(skb);
2884         skb_reset_network_header(skb);
2885
2886         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2887         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2888         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2889
             /* Absent attributes default to 0. */
2890         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2891         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2892         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2893
2894         if (iif) {
2895                 struct net_device *dev;
2896
2897                 dev = __dev_get_by_index(net, iif);
2898                 if (dev == NULL) {
2899                         err = -ENODEV;
2900                         goto errout_free;
2901                 }
2902
2903                 skb->protocol   = htons(ETH_P_IP);
2904                 skb->dev        = dev;
2905                 local_bh_disable();
2906                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2907                 local_bh_enable();
2908
2909                 rt = skb->rtable;
2910                 if (err == 0 && rt->u.dst.error)
2911                         err = -rt->u.dst.error;
2912         } else {
2913                 struct flowi fl = {
2914                         .nl_u = {
2915                                 .ip4_u = {
2916                                         .daddr = dst,
2917                                         .saddr = src,
2918                                         .tos = rtm->rtm_tos,
2919                                 },
2920                         },
2921                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2922                 };
2923                 err = ip_route_output_key(net, &rt, &fl);
2924         }
2925
2926         if (err)
2927                 goto errout_free;
2928
2929         skb->rtable = rt;
2930         if (rtm->rtm_flags & RTM_F_NOTIFY)
2931                 rt->rt_flags |= RTCF_NOTIFY;
2932
2933         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2934                            RTM_NEWROUTE, 0, 0);
2935         if (err <= 0)
2936                 goto errout_free;
2937
2938         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2939 errout:
2940         return err;
2941
2942 errout_free:
2943         kfree_skb(skb);
2944         goto errout;
2945 }
2946
/*
 * Netlink dump callback for the route cache.
 *
 * The walk resumes at hash bucket cb->args[0] (negative values are treated
 * as 0) and chain index cb->args[1]; the index only applies to the first
 * bucket visited and is reset to 0 for the following ones.  Each chain is
 * traversed under rcu_read_lock_bh().  Entries whose device belongs to a
 * different namespace than the requesting socket, entries before the resume
 * index and expired entries are skipped.
 *
 * For every reported entry a reference is temporarily stored in skb->dst
 * while rt_fill_info() runs and is released right afterwards.
 * NOTE(review): rt_fill_info() reads skb->rtable, so skb->dst and
 * skb->rtable presumably alias the same storage -- confirm in sk_buff.
 *
 * The walk stops as soon as rt_fill_info() returns <= 0.  In every case
 * the current bucket and index are saved back into cb->args and skb->len
 * is returned.
 */
2947 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2948 {
2949         struct rtable *rt;
2950         int h, s_h;
2951         int idx, s_idx;
2952         struct net *net;
2953
2954         net = sock_net(skb->sk);
2955
2956         s_h = cb->args[0];
2957         if (s_h < 0)
2958                 s_h = 0;
2959         s_idx = idx = cb->args[1];
2960         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
                     /* Skip empty buckets without entering the RCU section. */
2961                 if (!rt_hash_table[h].chain)
2962                         continue;
2963                 rcu_read_lock_bh();
2964                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2965                      rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2966                         if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
2967                                 continue;
2968                         if (rt_is_expired(rt))
2969                                 continue;
2970                         skb->dst = dst_clone(&rt->u.dst);
2971                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2972                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2973                                          1, NLM_F_MULTI) <= 0) {
2974                                 dst_release(xchg(&skb->dst, NULL));
2975                                 rcu_read_unlock_bh();
2976                                 goto done;
2977                         }
2978                         dst_release(xchg(&skb->dst, NULL));
2979                 }
2980                 rcu_read_unlock_bh();
2981         }
2982
2983 done:
2984         cb->args[0] = h;
2985         cb->args[1] = idx;
2986         return skb->len;
2987 }
2988
/*
 * Flush the route cache of the namespace that owns in_dev's device,
 * passing a delay of 0 to rt_cache_flush().
 */
2989 void ip_rt_multicast_event(struct in_device *in_dev)
2990 {
2991         rt_cache_flush(dev_net(in_dev->dev), 0);
2992 }
2993
2994 #ifdef CONFIG_SYSCTL
/*
 * proc handler for the write-only route cache flush sysctl.
 *
 * The written integer is parsed with proc_dointvec() into a local variable
 * (through a private copy of the table entry whose data pointer is
 * redirected) and then passed as the delay to rt_cache_flush() for the
 * namespace stored in __ctl->extra1.
 *
 * The delay starts out as 0, the same value ip_rt_multicast_event() uses,
 * so a write that supplies no number no longer hands an indeterminate
 * value to rt_cache_flush().  A parse or copy failure reported by
 * proc_dointvec() is returned to the caller and no flush is performed.
 *
 * Returns 0 after a flush, the proc_dointvec() error on a failed write,
 * and -EINVAL for reads.
 */
2995 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2996                                         struct file *filp, void __user *buffer,
2997                                         size_t *lenp, loff_t *ppos)
2998 {
2999         if (write) {
3000                 int flush_delay = 0;
3001                 ctl_table ctl;
3002                 struct net *net;
                     int ret;
3003
                     /* Work on a copy so the shared table entry is not modified. */
3004                 memcpy(&ctl, __ctl, sizeof(ctl));
3005                 ctl.data = &flush_delay;
3006                 ret = proc_dointvec(&ctl, write, filp, buffer, lenp, ppos);
                     if (ret)
                             return ret;
3007
3008                 net = (struct net *)__ctl->extra1;
3009                 rt_cache_flush(net, flush_delay);
3010                 return 0;
3011         }
3012
3013         return -EINVAL;
3014 }
3015
/*
 * Strategy routine (binary sysctl interface) for the route cache flush
 * entry.
 *
 * The new value must be exactly sizeof(int) bytes; it is fetched from user
 * space and passed as the delay to rt_cache_flush() for the namespace
 * stored in table->extra1.  oldval and oldlenp are not used.
 *
 * Returns 0 on success, -EINVAL for a wrong length and -EFAULT when the
 * value cannot be read from user space.
 */
3016 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
3017                                                 void __user *oldval,
3018                                                 size_t __user *oldlenp,
3019                                                 void __user *newval,
3020                                                 size_t newlen)
3021 {
3022         int delay;
3023         struct net *net;
3024         if (newlen != sizeof(int))
3025                 return -EINVAL;
3026         if (get_user(delay, (int __user *)newval))
3027                 return -EFAULT;
3028         net = (struct net *)table->extra1;
3029         rt_cache_flush(net, delay);
3030         return 0;
3031 }
3032
/*
 * Re-arm the per-namespace rt_secret_timer after ip_rt_secret_interval
 * changed from old to its current value.
 *
 * Nothing is done when the interval is unchanged.  Otherwise, under the
 * rtnl lock, the timer of every namespace is stopped with
 * del_timer_sync().  If the new interval is 0 the timer stays stopped.
 * If the timer was pending, its remaining time is shifted by the interval
 * difference and clamped to 0; if it was not pending, the full new
 * interval is used.  The timer is then restarted relative to the current
 * jiffies.
 */
3033 static void rt_secret_reschedule(int old)
3034 {
3035         struct net *net;
3036         int new = ip_rt_secret_interval;
3037         int diff = new - old;
3038
3039         if (!diff)
3040                 return;
3041
3042         rtnl_lock();
3043         for_each_net(net) {
3044                 int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);
3045
3046                 if (!new)
3047                         continue;
3048
3049                 if (deleted) {
                             /* Time that was left on the old schedule. */
3050                         long time = net->ipv4.rt_secret_timer.expires - jiffies;
3051
3052                         if (time <= 0 || (time += diff) <= 0)
3053                                 time = 0;
3054
3055                         net->ipv4.rt_secret_timer.expires = time;
3056                 } else
3057                         net->ipv4.rt_secret_timer.expires = new;
3058
                     /* expires held a relative value up to here; make it absolute. */
3059                 net->ipv4.rt_secret_timer.expires += jiffies;
3060                 add_timer(&net->ipv4.rt_secret_timer);
3061         }
3062         rtnl_unlock();
3063 }
3064
/*
 * proc handler for the secret_interval sysctl.
 *
 * The current interval is remembered, the access is carried out by
 * proc_dointvec_jiffies(), and rt_secret_reschedule() is then called with
 * the remembered value; that call returns immediately when the interval
 * did not change.
 *
 * Returns the result of proc_dointvec_jiffies().
 */
3065 static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
3066                                           struct file *filp,
3067                                           void __user *buffer, size_t *lenp,
3068                                           loff_t *ppos)
3069 {
3070         int old = ip_rt_secret_interval;
3071         int ret = proc_dointvec_jiffies(ctl, write, filp, buffer, lenp, ppos);
3072
3073         rt_secret_reschedule(old);
3074
3075         return ret;
3076 }
3077
/*
 * Strategy routine (binary sysctl interface) for the secret_interval
 * entry.
 *
 * Works like the proc handler: remember the current interval, let
 * sysctl_jiffies() perform the access, then call rt_secret_reschedule()
 * with the remembered value.
 *
 * Returns the result of sysctl_jiffies().
 */
3078 static int ipv4_sysctl_rt_secret_interval_strategy(ctl_table *table,
3079                                                    void __user *oldval,
3080                                                    size_t __user *oldlenp,
3081                                                    void __user *newval,
3082                                                    size_t newlen)
3083 {
3084         int old = ip_rt_secret_interval;
3085         int ret = sysctl_jiffies(table, oldval, oldlenp, newval, newlen);
3086
3087         rt_secret_reschedule(old);
3088
3089         return ret;
3090 }
3091
/*
 * sysctl entries for the IPv4 routing tunables.
 *
 * Every entry exposes one int-sized variable with mode 0644.  Entries that
 * use proc_dointvec_jiffies / sysctl_jiffies (or the _ms_ variants) convert
 * between the user-visible unit and jiffies; the remaining entries are
 * plain integers.  gc_min_interval and gc_min_interval_ms are two views of
 * the same variable, ip_rt_gc_min_interval.  secret_interval uses the
 * dedicated handlers above so that the secret timers are rescheduled when
 * the value changes.
 *
 * The table ends with an entry whose ctl_name is 0.
 */
3092 static ctl_table ipv4_route_table[] = {
3093         {
3094                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
3095                 .procname       = "gc_thresh",
3096                 .data           = &ipv4_dst_ops.gc_thresh,
3097                 .maxlen         = sizeof(int),
3098                 .mode           = 0644,
3099                 .proc_handler   = proc_dointvec,
3100         },
3101         {
3102                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
3103                 .procname       = "max_size",
3104                 .data           = &ip_rt_max_size,
3105                 .maxlen         = sizeof(int),
3106                 .mode           = 0644,
3107                 .proc_handler   = proc_dointvec,
3108         },
3109         {
3110                 /*  Deprecated. Use gc_min_interval_ms */
3111
3112                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
3113                 .procname       = "gc_min_interval",
3114                 .data           = &ip_rt_gc_min_interval,
3115                 .maxlen         = sizeof(int),
3116                 .mode           = 0644,
3117                 .proc_handler   = proc_dointvec_jiffies,
3118                 .strategy       = sysctl_jiffies,
3119         },
3120         {
3121                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
3122                 .procname       = "gc_min_interval_ms",
3123                 .data           = &ip_rt_gc_min_interval,
3124                 .maxlen         = sizeof(int),
3125                 .mode           = 0644,
3126                 .proc_handler   = proc_dointvec_ms_jiffies,
3127                 .strategy       = sysctl_ms_jiffies,
3128         },
3129         {
3130                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
3131                 .procname       = "gc_timeout",
3132                 .data           = &ip_rt_gc_timeout,
3133                 .maxlen         = sizeof(int),
3134                 .mode           = 0644,
3135                 .proc_handler   = proc_dointvec_jiffies,
3136                 .strategy       = sysctl_jiffies,
3137         },
3138         {
3139                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
3140                 .procname       = "gc_interval",
3141                 .data           = &ip_rt_gc_interval,
3142                 .maxlen         = sizeof(int),
3143                 .mode           = 0644,
3144                 .proc_handler   = proc_dointvec_jiffies,
3145                 .strategy       = sysctl_jiffies,
3146         },
3147         {
3148                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
3149                 .procname       = "redirect_load",
3150                 .data           = &ip_rt_redirect_load,
3151                 .maxlen         = sizeof(int),
3152                 .mode           = 0644,
3153                 .proc_handler   = proc_dointvec,
3154         },
3155         {
3156                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
3157                 .procname       = "redirect_number",
3158                 .data           = &ip_rt_redirect_number,
3159                 .maxlen         = sizeof(int),
3160                 .mode           = 0644,
3161                 .proc_handler   = proc_dointvec,
3162         },
3163         {
3164                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
3165                 .procname       = "redirect_silence",
3166                 .data           = &ip_rt_redirect_silence,
3167                 .maxlen         = sizeof(int),
3168                 .mode           = 0644,
3169                 .proc_handler   = proc_dointvec,
3170         },
3171         {
3172                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
3173                 .procname       = "error_cost",
3174                 .data           = &ip_rt_error_cost,
3175                 .maxlen         = sizeof(int),
3176                 .mode           = 0644,
3177                 .proc_handler   = proc_dointvec,
3178         },
3179         {
3180                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
3181                 .procname       = "error_burst",
3182                 .data           = &ip_rt_error_burst,
3183                 .maxlen         = sizeof(int),
3184                 .mode           = 0644,
3185                 .proc_handler   = proc_dointvec,
3186         },
3187         {
3188                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
3189                 .procname       = "gc_elasticity",
3190                 .data           = &ip_rt_gc_elasticity,
3191                 .maxlen         = sizeof(int),
3192                 .mode           = 0644,
3193                 .proc_handler   = proc_dointvec,
3194         },
3195         {
3196                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
3197                 .procname       = "mtu_expires",
3198                 .data           = &ip_rt_mtu_expires,
3199                 .maxlen         = sizeof(int),
3200                 .mode           = 0644,
3201                 .proc_handler   = proc_dointvec_jiffies,
3202                 .strategy       = sysctl_jiffies,
3203         },
3204         {
3205                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
3206                 .procname       = "min_pmtu",
3207                 .data           = &ip_rt_min_pmtu,
3208                 .maxlen         = sizeof(int),
3209                 .mode           = 0644,
3210                 .proc_handler   = proc_dointvec,
3211         },
3212         {
3213                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
3214                 .procname       = "min_adv_mss",
3215                 .data           = &ip_rt_min_advmss,
3216                 .maxlen         = sizeof(int),
3217                 .mode           = 0644,
3218                 .proc_handler   = proc_dointvec,
3219         },
3220         {
3221                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
3222                 .procname       = "secret_interval",
3223                 .data           = &ip_rt_secret_interval,
3224                 .maxlen         = sizeof(int),
3225                 .mode           = 0644,
3226                 .proc_handler   = ipv4_sysctl_rt_secret_interval,
3227                 .strategy       = ipv4_sysctl_rt_secret_interval_strategy,
3228         },
3229         { .ctl_name = 0 }
3230 };
3231
3232 static struct ctl_table empty[1];
3233
3234 static struct ctl_table ipv4_skeleton[] =
3235 {
3236         { .procname = "route", .ctl_name = NET_IPV4_ROUTE,
3237           .mode = 0555, .child = ipv4_route_table},
3238         { .procname = "neigh", .ctl_name = NET_IPV4_NEIGH,
3239           .mode = 0555, .child = empty},
3240         { }
3241 };
3242
3243 static __net_initdata struct ctl_path ipv4_path[] = {
3244         { .procname = "net", .ctl_name = CTL_NET, },
3245         { .procname = "ipv4", .ctl_name = NET_IPV4, },
3246         { },
3247 };
3248
3249 static struct ctl_table ipv4_route_flush_table[] = {
3250         {
3251                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
3252                 .procname       = "flush",
3253                 .maxlen         = sizeof(int),
3254                 .mode           = 0200,
3255                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3256                 .strategy       = ipv4_sysctl_rtcache_flush_strategy,
3257         },
3258         { .ctl_name = 0 },
3259 };
3260
3261 static __net_initdata struct ctl_path ipv4_route_path[] = {
3262         { .procname = "net", .ctl_name = CTL_NET, },
3263         { .procname = "ipv4", .ctl_name = NET_IPV4, },
3264         { .procname = "route", .ctl_name = NET_IPV4_ROUTE, },
3265         { },
3266 };
3267
3268 static __net_init int sysctl_route_net_init(struct net *net)
3269 {
3270         struct ctl_table *tbl;
3271
3272         tbl = ipv4_route_flush_table;
3273         if (net != &init_net) {
3274                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3275                 if (tbl == NULL)
3276                         goto err_dup;
3277         }
3278         tbl[0].extra1 = net;
3279
3280         net->ipv4.route_hdr =
3281                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3282         if (net->ipv4.route_hdr == NULL)
3283                 goto err_reg;
3284         return 0;
3285
3286 err_reg:
3287         if (tbl != ipv4_route_flush_table)
3288                 kfree(tbl);
3289 err_dup:
3290         return -ENOMEM;
3291 }
3292
3293 static __net_exit void sysctl_route_net_exit(struct net *net)
3294 {
3295         struct ctl_table *tbl;
3296
3297         tbl = net->ipv4.route_hdr->ctl_table_arg;
3298         unregister_net_sysctl_table(net->ipv4.route_hdr);
3299         BUG_ON(tbl == ipv4_route_flush_table);
3300         kfree(tbl);
3301 }
3302
3303 static __net_initdata struct pernet_operations sysctl_route_ops = {
3304         .init = sysctl_route_net_init,
3305         .exit = sysctl_route_net_exit,
3306 };
3307 #endif
3308
3309
3310 static __net_init int rt_secret_timer_init(struct net *net)
3311 {
3312         atomic_set(&net->ipv4.rt_genid,
3313                         (int) ((num_physpages ^ (num_physpages>>8)) ^
3314                         (jiffies ^ (jiffies >> 7))));
3315
3316         net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
3317         net->ipv4.rt_secret_timer.data = (unsigned long)net;
3318         init_timer_deferrable(&net->ipv4.rt_secret_timer);
3319
3320         if (ip_rt_secret_interval) {
3321                 net->ipv4.rt_secret_timer.expires =
3322                         jiffies + net_random() % ip_rt_secret_interval +
3323                         ip_rt_secret_interval;
3324                 add_timer(&net->ipv4.rt_secret_timer);
3325         }
3326         return 0;
3327 }
3328
3329 static __net_exit void rt_secret_timer_exit(struct net *net)
3330 {
3331         del_timer_sync(&net->ipv4.rt_secret_timer);
3332 }
3333
3334 static __net_initdata struct pernet_operations rt_secret_timer_ops = {
3335         .init = rt_secret_timer_init,
3336         .exit = rt_secret_timer_exit,
3337 };
3338
3339
#ifdef CONFIG_NET_CLS_ROUTE
/* Per-cpu accounting area; allocated in ip_rt_init() (256 entries). */
struct ip_rt_acct *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */
3343
3344 static __initdata unsigned long rhash_entries;
3345 static int __init set_rhash_entries(char *str)
3346 {
3347         if (!str)
3348                 return 0;
3349         rhash_entries = simple_strtoul(str, &str, 0);
3350         return 1;
3351 }
3352 __setup("rhash_entries=", set_rhash_entries);
3353
3354 int __init ip_rt_init(void)
3355 {
3356         int rc = 0;
3357
3358 #ifdef CONFIG_NET_CLS_ROUTE
3359         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3360         if (!ip_rt_acct)
3361                 panic("IP: failed to allocate ip_rt_acct\n");
3362 #endif
3363
3364         ipv4_dst_ops.kmem_cachep =
3365                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3366                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3367
3368         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3369
3370         rt_hash_table = (struct rt_hash_bucket *)
3371                 alloc_large_system_hash("IP route cache",
3372                                         sizeof(struct rt_hash_bucket),
3373                                         rhash_entries,
3374                                         (num_physpages >= 128 * 1024) ?
3375                                         15 : 17,
3376                                         0,
3377                                         &rt_hash_log,
3378                                         &rt_hash_mask,
3379                                         0);
3380         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3381         rt_hash_lock_init();
3382
3383         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3384         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3385
3386         devinet_init();
3387         ip_fib_init();
3388
3389         /* All the timers, started at system startup tend
3390            to synchronize. Perturb it a bit.
3391          */
3392         schedule_delayed_work(&expires_work,
3393                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3394
3395         if (register_pernet_subsys(&rt_secret_timer_ops))
3396                 printk(KERN_ERR "Unable to setup rt_secret_timer\n");
3397
3398         if (ip_rt_proc_init())
3399                 printk(KERN_ERR "Unable to create route proc files\n");
3400 #ifdef CONFIG_XFRM
3401         xfrm_init();
3402         xfrm4_init();
3403 #endif
3404         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3405
3406 #ifdef CONFIG_SYSCTL
3407         register_pernet_subsys(&sysctl_route_ops);
3408 #endif
3409         return rc;
3410 }
3411
3412 #ifdef CONFIG_SYSCTL
3413 /*
3414  * We really need to sanitize the damn ipv4 init order, then all
3415  * this nonsense will go away.
3416  */
3417 void __init ip_static_sysctl_init(void)
3418 {
3419         register_sysctl_paths(ipv4_path, ipv4_skeleton);
3420 }
3421 #endif
3422
/* Symbols exported from this file. */
EXPORT_SYMBOL(__ip_select_ident);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(ip_route_output_key);