net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  13  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  14  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  15  *
  16  * Fixes:
  17  *              Alan Cox        :       Verify area fixes.
  18  *              Alan Cox        :       cli() protects routing changes
  19  *              Rui Oliveira    :       ICMP routing table updates
  20  *              (rco@di.uminho.pt)      Routing table insertion and update
  21  *              Linus Torvalds  :       Rewrote bits to be sensible
  22  *              Alan Cox        :       Added BSD route gw semantics
  23  *              Alan Cox        :       Super /proc >4K
  24  *              Alan Cox        :       MTU in route table
  25  *              Alan Cox        :       MSS actually. Also added the window
  26  *                                      clamper.
  27  *              Sam Lantinga    :       Fixed route matching in rt_del()
  28  *              Alan Cox        :       Routing cache support.
  29  *              Alan Cox        :       Removed compatibility cruft.
  30  *              Alan Cox        :       RTF_REJECT support.
  31  *              Alan Cox        :       TCP irtt support.
  32  *              Jonathan Naylor :       Added Metric support.
  33  *      Miquel van Smoorenburg  :       BSD API fixes.
  34  *      Miquel van Smoorenburg  :       Metrics.
  35  *              Alan Cox        :       Use __u32 properly
  36  *              Alan Cox        :       Aligned routing errors more closely with BSD
  37  *                                      our system is still very different.
  38  *              Alan Cox        :       Faster /proc handling
  39  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  40  *                                      routing caches and better behaviour.
  41  *
  42  *              Olaf Erb        :       irtt wasn't being copied right.
  43  *              Bjorn Ekwall    :       Kerneld route support.
  44  *              Alan Cox        :       Multicast fixed (I hope)
  45  *              Pavel Krauz     :       Limited broadcast fixed
  46  *              Mike McLagan    :       Routing by source
  47  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  48  *                                      route.c and rewritten from scratch.
  49  *              Andi Kleen      :       Load-limit warning messages.
  50  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  51  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  52  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  53  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  54  *              Marc Boucher    :       routing by fwmark
  55  *      Robert Olsson           :       Added rt_cache statistics
  56  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  57  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  58  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  59  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  60  *
  61  *              This program is free software; you can redistribute it and/or
  62  *              modify it under the terms of the GNU General Public License
  63  *              as published by the Free Software Foundation; either version
  64  *              2 of the License, or (at your option) any later version.
  65  */
  66
  67 #include <linux/module.h>
  68 #include <asm/uaccess.h>
  69 #include <asm/system.h>
  70 #include <linux/bitops.h>
  71 #include <linux/types.h>
  72 #include <linux/kernel.h>
  73 #include <linux/mm.h>
  74 #include <linux/bootmem.h>
  75 #include <linux/string.h>
  76 #include <linux/socket.h>
  77 #include <linux/sockios.h>
  78 #include <linux/errno.h>
  79 #include <linux/in.h>
  80 #include <linux/inet.h>
  81 #include <linux/netdevice.h>
  82 #include <linux/proc_fs.h>
  83 #include <linux/init.h>
  84 #include <linux/workqueue.h>
  85 #include <linux/skbuff.h>
  86 #include <linux/inetdevice.h>
  87 #include <linux/igmp.h>
  88 #include <linux/pkt_sched.h>
  89 #include <linux/mroute.h>
  90 #include <linux/netfilter_ipv4.h>
  91 #include <linux/random.h>
  92 #include <linux/jhash.h>
  93 #include <linux/rcupdate.h>
  94 #include <linux/times.h>
  95 #include <net/dst.h>
  96 #include <net/net_namespace.h>
  97 #include <net/protocol.h>
  98 #include <net/ip.h>
  99 #include <net/route.h>
 100 #include <net/inetpeer.h>
 101 #include <net/sock.h>
 102 #include <net/ip_fib.h>
 103 #include <net/arp.h>
 104 #include <net/tcp.h>
 105 #include <net/icmp.h>
 106 #include <net/xfrm.h>
 107 #include <net/netevent.h>
 108 #include <net/rtnetlink.h>
 109 #ifdef CONFIG_SYSCTL
 110 #include <linux/sysctl.h>
 111 #endif
 112
 113 #define RT_FL_TOS(oldflp) \
 114     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
 115
 116 #define IP_MAX_MTU      0xFFF0
 117
 118 #define RT_GC_TIMEOUT (300*HZ)
 119
 120 static int ip_rt_max_size;
 121 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 122 static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
 123 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
 124 static int ip_rt_redirect_number __read_mostly  = 9;
 125 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 126 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 127 static int ip_rt_error_cost __read_mostly       = HZ;
 128 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 129 static int ip_rt_gc_elasticity __read_mostly    = 8;
 130 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 131 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 132 static int ip_rt_min_advmss __read_mostly       = 256;
 133 static int ip_rt_secret_interval __read_mostly  = 10 * 60 * HZ;
 134
 135 static void rt_worker_func(struct work_struct *work);
 136 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
 137 static struct timer_list rt_secret_timer;
 138
 139 /*
 140  *      Interface to generic destination cache.
 141  */
 142
 143 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 144 static void              ipv4_dst_destroy(struct dst_entry *dst);
 145 static void              ipv4_dst_ifdown(struct dst_entry *dst,
 146                                          struct net_device *dev, int how);
 147 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 148 static void              ipv4_link_failure(struct sk_buff *skb);
 149 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 150 static int rt_garbage_collect(struct dst_ops *ops);
 151
 152
 153 static struct dst_ops ipv4_dst_ops = {
 154         .family =               AF_INET,
 155         .protocol =             __constant_htons(ETH_P_IP),
 156         .gc =                   rt_garbage_collect,
 157         .check =                ipv4_dst_check,
 158         .destroy =              ipv4_dst_destroy,
 159         .ifdown =               ipv4_dst_ifdown,
 160         .negative_advice =      ipv4_negative_advice,
 161         .link_failure =         ipv4_link_failure,
 162         .update_pmtu =          ip_rt_update_pmtu,
 163         .local_out =            ip_local_out,
 164         .entry_size =           sizeof(struct rtable),
 165         .entries =              ATOMIC_INIT(0),
 166 };
 167
 168 #define ECN_OR_COST(class)      TC_PRIO_##class
 169
 170 const __u8 ip_tos2prio[16] = {
 171         TC_PRIO_BESTEFFORT,
 172         ECN_OR_COST(FILLER),
 173         TC_PRIO_BESTEFFORT,
 174         ECN_OR_COST(BESTEFFORT),
 175         TC_PRIO_BULK,
 176         ECN_OR_COST(BULK),
 177         TC_PRIO_BULK,
 178         ECN_OR_COST(BULK),
 179         TC_PRIO_INTERACTIVE,
 180         ECN_OR_COST(INTERACTIVE),
 181         TC_PRIO_INTERACTIVE,
 182         ECN_OR_COST(INTERACTIVE),
 183         TC_PRIO_INTERACTIVE_BULK,
 184         ECN_OR_COST(INTERACTIVE_BULK),
 185         TC_PRIO_INTERACTIVE_BULK,
 186         ECN_OR_COST(INTERACTIVE_BULK)
 187 };
 188
 189
 190 /*
 191  * Route cache.
 192  */
 193
/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries;
 *    they do so with atomic increments and with the
 *    lock held.
 */
 203
 204 struct rt_hash_bucket {
 205         struct rtable   *chain;
 206 };
 207 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 208         defined(CONFIG_PROVE_LOCKING)
 209 /*
 210  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 211  * The size of this table is a power of two and depends on the number of CPUS.
 212  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 213  */
 214 #ifdef CONFIG_LOCKDEP
 215 # define RT_HASH_LOCK_SZ        256
 216 #else
 217 # if NR_CPUS >= 32
 218 #  define RT_HASH_LOCK_SZ       4096
 219 # elif NR_CPUS >= 16
 220 #  define RT_HASH_LOCK_SZ       2048
 221 # elif NR_CPUS >= 8
 222 #  define RT_HASH_LOCK_SZ       1024
 223 # elif NR_CPUS >= 4
 224 #  define RT_HASH_LOCK_SZ       512
 225 # else
 226 #  define RT_HASH_LOCK_SZ       256
 227 # endif
 228 #endif
 229
 230 static spinlock_t       *rt_hash_locks;
 231 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 232
 233 static __init void rt_hash_lock_init(void)
 234 {
 235         int i;
 236
 237         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
 238                         GFP_KERNEL);
 239         if (!rt_hash_locks)
 240                 panic("IP: failed to allocate rt_hash_locks\n");
 241
 242         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
 243                 spin_lock_init(&rt_hash_locks[i]);
 244 }
 245 #else
 246 # define rt_hash_lock_addr(slot) NULL
 247
 248 static inline void rt_hash_lock_init(void)
 249 {
 250 }
 251 #endif
 252
 253 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
 254 static unsigned                 rt_hash_mask __read_mostly;
 255 static unsigned int             rt_hash_log  __read_mostly;
 256 static atomic_t                 rt_genid __read_mostly;
 257
 258 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 259 #define RT_CACHE_STAT_INC(field) \
 260         (__raw_get_cpu_var(rt_cache_stat).field++)
 261
 262 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx)
 263 {
 264         return jhash_3words((__force u32)(__be32)(daddr),
 265                             (__force u32)(__be32)(saddr),
 266                             idx, atomic_read(&rt_genid))
 267                 & rt_hash_mask;
 268 }
 269
 270 #ifdef CONFIG_PROC_FS
 271 struct rt_cache_iter_state {
 272         struct seq_net_private p;
 273         int bucket;
 274         int genid;
 275 };
 276
 277 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 278 {
 279         struct rt_cache_iter_state *st = seq->private;
 280         struct rtable *r = NULL;
 281
 282         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 283                 rcu_read_lock_bh();
 284                 r = rcu_dereference(rt_hash_table[st->bucket].chain);
 285                 while (r) {
 286                         if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
 287                             r->rt_genid == st->genid)
 288                                 return r;
 289                         r = rcu_dereference(r->u.dst.rt_next);
 290                 }
 291                 rcu_read_unlock_bh();
 292         }
 293         return r;
 294 }
 295
 296 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 297                                           struct rtable *r)
 298 {
 299         struct rt_cache_iter_state *st = seq->private;
 300         r = r->u.dst.rt_next;
 301         while (!r) {
 302                 rcu_read_unlock_bh();
 303                 if (--st->bucket < 0)
 304                         break;
 305                 rcu_read_lock_bh();
 306                 r = rt_hash_table[st->bucket].chain;
 307         }
 308         return rcu_dereference(r);
 309 }
 310
 311 static struct rtable *rt_cache_get_next(struct seq_file *seq,
 312                                         struct rtable *r)
 313 {
 314         struct rt_cache_iter_state *st = seq->private;
 315         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 316                 if (dev_net(r->u.dst.dev) != seq_file_net(seq))
 317                         continue;
 318                 if (r->rt_genid == st->genid)
 319                         break;
 320         }
 321         return r;
 322 }
 323
 324 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 325 {
 326         struct rtable *r = rt_cache_get_first(seq);
 327
 328         if (r)
 329                 while (pos && (r = rt_cache_get_next(seq, r)))
 330                         --pos;
 331         return pos ? NULL : r;
 332 }
 333
 334 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 335 {
 336         struct rt_cache_iter_state *st = seq->private;
 337         if (*pos)
 338                 return rt_cache_get_idx(seq, *pos - 1);
 339         st->genid = atomic_read(&rt_genid);
 340         return SEQ_START_TOKEN;
 341 }
 342
 343 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 344 {
 345         struct rtable *r;
 346
 347         if (v == SEQ_START_TOKEN)
 348                 r = rt_cache_get_first(seq);
 349         else
 350                 r = rt_cache_get_next(seq, v);
 351         ++*pos;
 352         return r;
 353 }
 354
 355 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 356 {
 357         if (v && v != SEQ_START_TOKEN)
 358                 rcu_read_unlock_bh();
 359 }
 360
 361 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 362 {
 363         if (v == SEQ_START_TOKEN)
 364                 seq_printf(seq, "%-127s\n",
 365                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 366                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 367                            "HHUptod\tSpecDst");
 368         else {
 369                 struct rtable *r = v;
 370                 int len;
 371
 372                 seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
 373                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 374                         r->u.dst.dev ? r->u.dst.dev->name : "*",
 375                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
 376                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
 377                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
 378                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
 379                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
 380                         dst_metric(&r->u.dst, RTAX_WINDOW),
 381                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
 382                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
 383                         r->fl.fl4_tos,
 384                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
 385                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
 386                                        dev_queue_xmit) : 0,
 387                         r->rt_spec_dst, &len);
 388
 389                 seq_printf(seq, "%*s\n", 127 - len, "");
 390         }
 391         return 0;
 392 }
 393
 394 static const struct seq_operations rt_cache_seq_ops = {
 395         .start  = rt_cache_seq_start,
 396         .next   = rt_cache_seq_next,
 397         .stop   = rt_cache_seq_stop,
 398         .show   = rt_cache_seq_show,
 399 };
 400
 401 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 402 {
 403         return seq_open_net(inode, file, &rt_cache_seq_ops,
 404                         sizeof(struct rt_cache_iter_state));
 405 }
 406
 407 static const struct file_operations rt_cache_seq_fops = {
 408         .owner   = THIS_MODULE,
 409         .open    = rt_cache_seq_open,
 410         .read    = seq_read,
 411         .llseek  = seq_lseek,
 412         .release = seq_release_net,
 413 };
 414
 415
 416 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 417 {
 418         int cpu;
 419
 420         if (*pos == 0)
 421                 return SEQ_START_TOKEN;
 422
 423         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
 424                 if (!cpu_possible(cpu))
 425                         continue;
 426                 *pos = cpu+1;
 427                 return &per_cpu(rt_cache_stat, cpu);
 428         }
 429         return NULL;
 430 }
 431
 432 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 433 {
 434         int cpu;
 435
 436         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
 437                 if (!cpu_possible(cpu))
 438                         continue;
 439                 *pos = cpu+1;
 440                 return &per_cpu(rt_cache_stat, cpu);
 441         }
 442         return NULL;
 443
 444 }
 445
 446 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 447 {
 448
 449 }
 450
 451 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 452 {
 453         struct rt_cache_stat *st = v;
 454
 455         if (v == SEQ_START_TOKEN) {
 456                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 457                 return 0;
 458         }
 459
 460         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 461                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 462                    atomic_read(&ipv4_dst_ops.entries),
 463                    st->in_hit,
 464                    st->in_slow_tot,
 465                    st->in_slow_mc,
 466                    st->in_no_route,
 467                    st->in_brd,
 468                    st->in_martian_dst,
 469                    st->in_martian_src,
 470
 471                    st->out_hit,
 472                    st->out_slow_tot,
 473                    st->out_slow_mc,
 474
 475                    st->gc_total,
 476                    st->gc_ignored,
 477                    st->gc_goal_miss,
 478                    st->gc_dst_overflow,
 479                    st->in_hlist_search,
 480                    st->out_hlist_search
 481                 );
 482         return 0;
 483 }
 484
 485 static const struct seq_operations rt_cpu_seq_ops = {
 486         .start  = rt_cpu_seq_start,
 487         .next   = rt_cpu_seq_next,
 488         .stop   = rt_cpu_seq_stop,
 489         .show   = rt_cpu_seq_show,
 490 };
 491
 492
 493 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 494 {
 495         return seq_open(file, &rt_cpu_seq_ops);
 496 }
 497
 498 static const struct file_operations rt_cpu_seq_fops = {
 499         .owner   = THIS_MODULE,
 500         .open    = rt_cpu_seq_open,
 501         .read    = seq_read,
 502         .llseek  = seq_lseek,
 503         .release = seq_release,
 504 };
 505
 506 #ifdef CONFIG_NET_CLS_ROUTE
 507 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
 508                            int length, int *eof, void *data)
 509 {
 510         unsigned int i;
 511
 512         if ((offset & 3) || (length & 3))
 513                 return -EIO;
 514
 515         if (offset >= sizeof(struct ip_rt_acct) * 256) {
 516                 *eof = 1;
 517                 return 0;
 518         }
 519
 520         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
 521                 length = sizeof(struct ip_rt_acct) * 256 - offset;
 522                 *eof = 1;
 523         }
 524
 525         offset /= sizeof(u32);
 526
 527         if (length > 0) {
 528                 u32 *dst = (u32 *) buffer;
 529
 530                 *start = buffer;
 531                 memset(dst, 0, length);
 532
 533                 for_each_possible_cpu(i) {
 534                         unsigned int j;
 535                         u32 *src;
 536
 537                         src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
 538                         for (j = 0; j < length/4; j++)
 539                                 dst[j] += src[j];
 540                 }
 541         }
 542         return length;
 543 }
 544 #endif
 545
 546 static int __net_init ip_rt_do_proc_init(struct net *net)
 547 {
 548         struct proc_dir_entry *pde;
 549
 550         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 551                         &rt_cache_seq_fops);
 552         if (!pde)
 553                 goto err1;
 554
 555         pde = proc_create("rt_cache", S_IRUGO,
 556                           net->proc_net_stat, &rt_cpu_seq_fops);
 557         if (!pde)
 558                 goto err2;
 559
 560 #ifdef CONFIG_NET_CLS_ROUTE
 561         pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
 562                         ip_rt_acct_read, NULL);
 563         if (!pde)
 564                 goto err3;
 565 #endif
 566         return 0;
 567
 568 #ifdef CONFIG_NET_CLS_ROUTE
 569 err3:
 570         remove_proc_entry("rt_cache", net->proc_net_stat);
 571 #endif
 572 err2:
 573         remove_proc_entry("rt_cache", net->proc_net);
 574 err1:
 575         return -ENOMEM;
 576 }
 577
 578 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 579 {
 580         remove_proc_entry("rt_cache", net->proc_net_stat);
 581         remove_proc_entry("rt_cache", net->proc_net);
 582         remove_proc_entry("rt_acct", net->proc_net);
 583 }
 584
 585 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 586         .init = ip_rt_do_proc_init,
 587         .exit = ip_rt_do_proc_exit,
 588 };
 589
 590 static int __init ip_rt_proc_init(void)
 591 {
 592         return register_pernet_subsys(&ip_rt_proc_ops);
 593 }
 594
 595 #else
 596 static inline int ip_rt_proc_init(void)
 597 {
 598         return 0;
 599 }
 600 #endif /* CONFIG_PROC_FS */
 601
 602 static inline void rt_free(struct rtable *rt)
 603 {
 604         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 605 }
 606
 607 static inline void rt_drop(struct rtable *rt)
 608 {
 609         ip_rt_put(rt);
 610         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 611 }
 612
 613 static inline int rt_fast_clean(struct rtable *rth)
 614 {
 615         /* Kill broadcast/multicast entries very aggresively, if they
 616            collide in hash table with more useful entries */
 617         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 618                 rth->fl.iif && rth->u.dst.rt_next;
 619 }
 620
 621 static inline int rt_valuable(struct rtable *rth)
 622 {
 623         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 624                 rth->u.dst.expires;
 625 }
 626
 627 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 628 {
 629         unsigned long age;
 630         int ret = 0;
 631
 632         if (atomic_read(&rth->u.dst.__refcnt))
 633                 goto out;
 634
 635         ret = 1;
 636         if (rth->u.dst.expires &&
 637             time_after_eq(jiffies, rth->u.dst.expires))
 638                 goto out;
 639
 640         age = jiffies - rth->u.dst.lastuse;
 641         ret = 0;
 642         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 643             (age <= tmo2 && rt_valuable(rth)))
 644                 goto out;
 645         ret = 1;
 646 out:    return ret;
 647 }
 648
 649 /* Bits of score are:
 650  * 31: very valuable
 651  * 30: not quite useless
 652  * 29..0: usage counter
 653  */
 654 static inline u32 rt_score(struct rtable *rt)
 655 {
 656         u32 score = jiffies - rt->u.dst.lastuse;
 657
 658         score = ~score & ~(3<<30);
 659
 660         if (rt_valuable(rt))
 661                 score |= (1<<31);
 662
 663         if (!rt->fl.iif ||
 664             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 665                 score |= (1<<30);
 666
 667         return score;
 668 }
 669
 670 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 671 {
 672         return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
 673                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
 674                 (fl1->mark ^ fl2->mark) |
 675                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
 676                  *(u16 *)&fl2->nl_u.ip4_u.tos) |
 677                 (fl1->oif ^ fl2->oif) |
 678                 (fl1->iif ^ fl2->iif)) == 0;
 679 }
 680
 681 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 682 {
 683         return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
 684 }
 685
 686 /*
 687  * Perform a full scan of hash table and free all entries.
 688  * Can be called by a softirq or a process.
 689  * In the later case, we want to be reschedule if necessary
 690  */
 691 static void rt_do_flush(int process_context)
 692 {
 693         unsigned int i;
 694         struct rtable *rth, *next;
 695
 696         for (i = 0; i <= rt_hash_mask; i++) {
 697                 if (process_context && need_resched())
 698                         cond_resched();
 699                 rth = rt_hash_table[i].chain;
 700                 if (!rth)
 701                         continue;
 702
 703                 spin_lock_bh(rt_hash_lock_addr(i));
 704                 rth = rt_hash_table[i].chain;
 705                 rt_hash_table[i].chain = NULL;
 706                 spin_unlock_bh(rt_hash_lock_addr(i));
 707
 708                 for (; rth; rth = next) {
 709                         next = rth->u.dst.rt_next;
 710                         rt_free(rth);
 711                 }
 712         }
 713 }
 714
 715 static void rt_check_expire(void)
 716 {
 717         static unsigned int rover;
 718         unsigned int i = rover, goal;
 719         struct rtable *rth, **rthp;
 720         u64 mult;
 721
 722         mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
 723         if (ip_rt_gc_timeout > 1)
 724                 do_div(mult, ip_rt_gc_timeout);
 725         goal = (unsigned int)mult;
 726         if (goal > rt_hash_mask)
 727                 goal = rt_hash_mask + 1;
 728         for (; goal > 0; goal--) {
 729                 unsigned long tmo = ip_rt_gc_timeout;
 730
 731                 i = (i + 1) & rt_hash_mask;
 732                 rthp = &rt_hash_table[i].chain;
 733
 734                 if (need_resched())
 735                         cond_resched();
 736
 737                 if (*rthp == NULL)
 738                         continue;
 739                 spin_lock_bh(rt_hash_lock_addr(i));
 740                 while ((rth = *rthp) != NULL) {
 741                         if (rth->rt_genid != atomic_read(&rt_genid)) {
 742                                 *rthp = rth->u.dst.rt_next;
 743                                 rt_free(rth);
 744                                 continue;
 745                         }
 746                         if (rth->u.dst.expires) {
 747                                 /* Entry is expired even if it is in use */
 748                                 if (time_before_eq(jiffies, rth->u.dst.expires)) {
 749                                         tmo >>= 1;
 750                                         rthp = &rth->u.dst.rt_next;
 751                                         continue;
 752                                 }
 753                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
 754                                 tmo >>= 1;
 755                                 rthp = &rth->u.dst.rt_next;
 756                                 continue;
 757                         }
 758
 759                         /* Cleanup aged off entries. */
 760                         *rthp = rth->u.dst.rt_next;
 761                         rt_free(rth);
 762                 }
 763                 spin_unlock_bh(rt_hash_lock_addr(i));
 764         }
 765         rover = i;
 766 }
 767
 768 /*
 769  * rt_worker_func() is run in process context.
 770  * we call rt_check_expire() to scan part of the hash table
 771  */
 772 static void rt_worker_func(struct work_struct *work)
 773 {
 774         rt_check_expire();
 775         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 776 }
 777
 778 /*
 779  * Pertubation of rt_genid by a small quantity [1..256]
 780  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
 781  * many times (2^24) without giving recent rt_genid.
 782  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
 783  */
 784 static void rt_cache_invalidate(void)
 785 {
 786         unsigned char shuffle;
 787
 788         get_random_bytes(&shuffle, sizeof(shuffle));
 789         atomic_add(shuffle + 1U, &rt_genid);
 790 }
 791
 792 /*
 793  * delay < 0  : invalidate cache (fast : entries will be deleted later)
 794  * delay >= 0 : invalidate & flush cache (can be long)
 795  */
 796 void rt_cache_flush(int delay)
 797 {
 798         rt_cache_invalidate();
 799         if (delay >= 0)
 800                 rt_do_flush(!in_softirq());
 801 }
 802
 803 /*
 804  * We change rt_genid and let gc do the cleanup
 805  */
 806 static void rt_secret_rebuild(unsigned long dummy)
 807 {
 808         rt_cache_invalidate();
 809         mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
 810 }
 811
/*
   Short description of GC goals.

   We want to build an algorithm which keeps the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle "expire" is large enough to keep enough warm entries,
   and when load increases it is reduced to limit the cache size.
 */
 824
 825 static int rt_garbage_collect(struct dst_ops *ops)
 826 {
 827         static unsigned long expire = RT_GC_TIMEOUT;
 828         static unsigned long last_gc;
 829         static int rover;
 830         static int equilibrium;
 831         struct rtable *rth, **rthp;
 832         unsigned long now = jiffies;
 833         int goal;
 834
 835         /*
 836          * Garbage collection is pretty expensive,
 837          * do not make it too frequently.
 838          */
 839
 840         RT_CACHE_STAT_INC(gc_total);
 841
 842         if (now - last_gc < ip_rt_gc_min_interval &&
 843             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
 844                 RT_CACHE_STAT_INC(gc_ignored);
 845                 goto out;
 846         }
 847
 848         /* Calculate number of entries, which we want to expire now. */
 849         goal = atomic_read(&ipv4_dst_ops.entries) -
 850                 (ip_rt_gc_elasticity << rt_hash_log);
 851         if (goal <= 0) {
 852                 if (equilibrium < ipv4_dst_ops.gc_thresh)
 853                         equilibrium = ipv4_dst_ops.gc_thresh;
 854                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 855                 if (goal > 0) {
 856                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 857                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 858                 }
 859         } else {
 860                 /* We are in dangerous area. Try to reduce cache really
 861                  * aggressively.
 862                  */
 863                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 864                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
 865         }
 866
 867         if (now - last_gc >= ip_rt_gc_min_interval)
 868                 last_gc = now;
 869
 870         if (goal <= 0) {
 871                 equilibrium += goal;
 872                 goto work_done;
 873         }
 874
 875         do {
 876                 int i, k;
 877
 878                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
 879                         unsigned long tmo = expire;
 880
 881                         k = (k + 1) & rt_hash_mask;
 882                         rthp = &rt_hash_table[k].chain;
 883                         spin_lock_bh(rt_hash_lock_addr(k));
 884                         while ((rth = *rthp) != NULL) {
 885                                 if (rth->rt_genid == atomic_read(&rt_genid) &&
 886                                         !rt_may_expire(rth, tmo, expire)) {
 887                                         tmo >>= 1;
 888                                         rthp = &rth->u.dst.rt_next;
 889                                         continue;
 890                                 }
 891                                 *rthp = rth->u.dst.rt_next;
 892                                 rt_free(rth);
 893                                 goal--;
 894                         }
 895                         spin_unlock_bh(rt_hash_lock_addr(k));
 896                         if (goal <= 0)
 897                                 break;
 898                 }
 899                 rover = k;
 900
 901                 if (goal <= 0)
 902                         goto work_done;
 903
 904                 /* Goal is not achieved. We stop process if:
 905
 906                    - if expire reduced to zero. Otherwise, expire is halfed.
 907                    - if table is not full.
 908                    - if we are called from interrupt.
 909                    - jiffies check is just fallback/debug loop breaker.
 910                      We will not spin here for long time in any case.
 911                  */
 912
 913                 RT_CACHE_STAT_INC(gc_goal_miss);
 914
 915                 if (expire == 0)
 916                         break;
 917
 918                 expire >>= 1;
 919 #if RT_CACHE_DEBUG >= 2
 920                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
 921                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
 922 #endif
 923
 924                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
 925                         goto out;
 926         } while (!in_softirq() && time_before_eq(jiffies, now));
 927
 928         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
 929                 goto out;
 930         if (net_ratelimit())
 931                 printk(KERN_WARNING "dst cache overflow\n");
 932         RT_CACHE_STAT_INC(gc_dst_overflow);
 933         return 1;
 934
 935 work_done:
 936         expire += ip_rt_gc_min_interval;
 937         if (expire > ip_rt_gc_timeout ||
 938             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
 939                 expire = ip_rt_gc_timeout;
 940 #if RT_CACHE_DEBUG >= 2
 941         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
 942                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
 943 #endif
 944 out:    return 0;
 945 }
 946
 947 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
 948 {
 949         struct rtable   *rth, **rthp;
 950         unsigned long   now;
 951         struct rtable *cand, **candp;
 952         u32             min_score;
 953         int             chain_length;
 954         int attempts = !in_softirq();
 955
 956 restart:
 957         chain_length = 0;
 958         min_score = ~(u32)0;
 959         cand = NULL;
 960         candp = NULL;
 961         now = jiffies;
 962
 963         rthp = &rt_hash_table[hash].chain;
 964
 965         spin_lock_bh(rt_hash_lock_addr(hash));
 966         while ((rth = *rthp) != NULL) {
 967                 if (rth->rt_genid != atomic_read(&rt_genid)) {
 968                         *rthp = rth->u.dst.rt_next;
 969                         rt_free(rth);
 970                         continue;
 971                 }
 972                 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
 973                         /* Put it first */
 974                         *rthp = rth->u.dst.rt_next;
 975                         /*
 976                          * Since lookup is lockfree, the deletion
 977                          * must be visible to another weakly ordered CPU before
 978                          * the insertion at the start of the hash chain.
 979                          */
 980                         rcu_assign_pointer(rth->u.dst.rt_next,
 981                                            rt_hash_table[hash].chain);
 982                         /*
 983                          * Since lookup is lockfree, the update writes
 984                          * must be ordered for consistency on SMP.
 985                          */
 986                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
 987
 988                         dst_use(&rth->u.dst, now);
 989                         spin_unlock_bh(rt_hash_lock_addr(hash));
 990
 991                         rt_drop(rt);
 992                         *rp = rth;
 993                         return 0;
 994                 }
 995
 996                 if (!atomic_read(&rth->u.dst.__refcnt)) {
 997                         u32 score = rt_score(rth);
 998
 999                         if (score <= min_score) {
1000                                 cand = rth;
1001                                 candp = rthp;
1002                                 min_score = score;
1003                         }
1004                 }
1005
1006                 chain_length++;
1007
1008                 rthp = &rth->u.dst.rt_next;
1009         }
1010
1011         if (cand) {
1012                 /* ip_rt_gc_elasticity used to be average length of chain
1013                  * length, when exceeded gc becomes really aggressive.
1014                  *
1015                  * The second limit is less certain. At the moment it allows
1016                  * only 2 entries per bucket. We will see.
1017                  */
1018                 if (chain_length > ip_rt_gc_elasticity) {
1019                         *candp = cand->u.dst.rt_next;
1020                         rt_free(cand);
1021                 }
1022         }
1023
1024         /* Try to bind route to arp only if it is output
1025            route or unicast forwarding path.
1026          */
1027         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1028                 int err = arp_bind_neighbour(&rt->u.dst);
1029                 if (err) {
1030                         spin_unlock_bh(rt_hash_lock_addr(hash));
1031
1032                         if (err != -ENOBUFS) {
1033                                 rt_drop(rt);
1034                                 return err;
1035                         }
1036
1037                         /* Neighbour tables are full and nothing
1038                            can be released. Try to shrink route cache,
1039                            it is most likely it holds some neighbour records.
1040                          */
1041                         if (attempts-- > 0) {
1042                                 int saved_elasticity = ip_rt_gc_elasticity;
1043                                 int saved_int = ip_rt_gc_min_interval;
1044                                 ip_rt_gc_elasticity     = 1;
1045                                 ip_rt_gc_min_interval   = 0;
1046                                 rt_garbage_collect(&ipv4_dst_ops);
1047                                 ip_rt_gc_min_interval   = saved_int;
1048                                 ip_rt_gc_elasticity     = saved_elasticity;
1049                                 goto restart;
1050                         }
1051
1052                         if (net_ratelimit())
1053                                 printk(KERN_WARNING "Neighbour table overflow.\n");
1054                         rt_drop(rt);
1055                         return -ENOBUFS;
1056                 }
1057         }
1058
1059         rt->u.dst.rt_next = rt_hash_table[hash].chain;
1060 #if RT_CACHE_DEBUG >= 2
1061         if (rt->u.dst.rt_next) {
1062                 struct rtable *trt;
1063                 printk(KERN_DEBUG "rt_cache @%02x: " NIPQUAD_FMT, hash,
1064                        NIPQUAD(rt->rt_dst));
1065                 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1066                         printk(" . " NIPQUAD_FMT, NIPQUAD(trt->rt_dst));
1067                 printk("\n");
1068         }
1069 #endif
1070         rt_hash_table[hash].chain = rt;
1071         spin_unlock_bh(rt_hash_lock_addr(hash));
1072         *rp = rt;
1073         return 0;
1074 }
1075
1076 void rt_bind_peer(struct rtable *rt, int create)
1077 {
1078         static DEFINE_SPINLOCK(rt_peer_lock);
1079         struct inet_peer *peer;
1080
1081         peer = inet_getpeer(rt->rt_dst, create);
1082
1083         spin_lock_bh(&rt_peer_lock);
1084         if (rt->peer == NULL) {
1085                 rt->peer = peer;
1086                 peer = NULL;
1087         }
1088         spin_unlock_bh(&rt_peer_lock);
1089         if (peer)
1090                 inet_putpeer(peer);
1091 }
1092
1093 /*
1094  * Peer allocation may fail only in serious out-of-memory conditions.  However
1095  * we still can generate some output.
1096  * Random ID selection looks a bit dangerous because we have no chances to
1097  * select ID being unique in a reasonable period of time.
1098  * But broken packet identifier may be better than no packet at all.
1099  */
1100 static void ip_select_fb_ident(struct iphdr *iph)
1101 {
1102         static DEFINE_SPINLOCK(ip_fb_id_lock);
1103         static u32 ip_fallback_id;
1104         u32 salt;
1105
1106         spin_lock_bh(&ip_fb_id_lock);
1107         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1108         iph->id = htons(salt & 0xFFFF);
1109         ip_fallback_id = salt;
1110         spin_unlock_bh(&ip_fb_id_lock);
1111 }
1112
1113 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1114 {
1115         struct rtable *rt = (struct rtable *) dst;
1116
1117         if (rt) {
1118                 if (rt->peer == NULL)
1119                         rt_bind_peer(rt, 1);
1120
1121                 /* If peer is attached to destination, it is never detached,
1122                    so that we need not to grab a lock to dereference it.
1123                  */
1124                 if (rt->peer) {
1125                         iph->id = htons(inet_getid(rt->peer, more));
1126                         return;
1127                 }
1128         } else
1129                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1130                        __builtin_return_address(0));
1131
1132         ip_select_fb_ident(iph);
1133 }
1134
1135 static void rt_del(unsigned hash, struct rtable *rt)
1136 {
1137         struct rtable **rthp, *aux;
1138
1139         rthp = &rt_hash_table[hash].chain;
1140         spin_lock_bh(rt_hash_lock_addr(hash));
1141         ip_rt_put(rt);
1142         while ((aux = *rthp) != NULL) {
1143                 if (aux == rt || (aux->rt_genid != atomic_read(&rt_genid))) {
1144                         *rthp = aux->u.dst.rt_next;
1145                         rt_free(aux);
1146                         continue;
1147                 }
1148                 rthp = &aux->u.dst.rt_next;
1149         }
1150         spin_unlock_bh(rt_hash_lock_addr(hash));
1151 }
1152
1153 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1154                     __be32 saddr, struct net_device *dev)
1155 {
1156         int i, k;
1157         struct in_device *in_dev = in_dev_get(dev);
1158         struct rtable *rth, **rthp;
1159         __be32  skeys[2] = { saddr, 0 };
1160         int  ikeys[2] = { dev->ifindex, 0 };
1161         struct netevent_redirect netevent;
1162         struct net *net;
1163
1164         if (!in_dev)
1165                 return;
1166
1167         net = dev_net(dev);
1168         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1169             || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1170             || ipv4_is_zeronet(new_gw))
1171                 goto reject_redirect;
1172
1173         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1174                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1175                         goto reject_redirect;
1176                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1177                         goto reject_redirect;
1178         } else {
1179                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1180                         goto reject_redirect;
1181         }
1182
1183         for (i = 0; i < 2; i++) {
1184                 for (k = 0; k < 2; k++) {
1185                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1186
1187                         rthp=&rt_hash_table[hash].chain;
1188
1189                         rcu_read_lock();
1190                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1191                                 struct rtable *rt;
1192
1193                                 if (rth->fl.fl4_dst != daddr ||
1194                                     rth->fl.fl4_src != skeys[i] ||
1195                                     rth->fl.oif != ikeys[k] ||
1196                                     rth->fl.iif != 0 ||
1197                                     rth->rt_genid != atomic_read(&rt_genid) ||
1198                                     !net_eq(dev_net(rth->u.dst.dev), net)) {
1199                                         rthp = &rth->u.dst.rt_next;
1200                                         continue;
1201                                 }
1202
1203                                 if (rth->rt_dst != daddr ||
1204                                     rth->rt_src != saddr ||
1205                                     rth->u.dst.error ||
1206                                     rth->rt_gateway != old_gw ||
1207                                     rth->u.dst.dev != dev)
1208                                         break;
1209
1210                                 dst_hold(&rth->u.dst);
1211                                 rcu_read_unlock();
1212
1213                                 rt = dst_alloc(&ipv4_dst_ops);
1214                                 if (rt == NULL) {
1215                                         ip_rt_put(rth);
1216                                         in_dev_put(in_dev);
1217                                         return;
1218                                 }
1219
1220                                 /* Copy all the information. */
1221                                 *rt = *rth;
1222                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1223                                 rt->u.dst.__use         = 1;
1224                                 atomic_set(&rt->u.dst.__refcnt, 1);
1225                                 rt->u.dst.child         = NULL;
1226                                 if (rt->u.dst.dev)
1227                                         dev_hold(rt->u.dst.dev);
1228                                 if (rt->idev)
1229                                         in_dev_hold(rt->idev);
1230                                 rt->u.dst.obsolete      = 0;
1231                                 rt->u.dst.lastuse       = jiffies;
1232                                 rt->u.dst.path          = &rt->u.dst;
1233                                 rt->u.dst.neighbour     = NULL;
1234                                 rt->u.dst.hh            = NULL;
1235                                 rt->u.dst.xfrm          = NULL;
1236                                 rt->rt_genid            = atomic_read(&rt_genid);
1237                                 rt->rt_flags            |= RTCF_REDIRECTED;
1238
1239                                 /* Gateway is different ... */
1240                                 rt->rt_gateway          = new_gw;
1241
1242                                 /* Redirect received -> path was valid */
1243                                 dst_confirm(&rth->u.dst);
1244
1245                                 if (rt->peer)
1246                                         atomic_inc(&rt->peer->refcnt);
1247
1248                                 if (arp_bind_neighbour(&rt->u.dst) ||
1249                                     !(rt->u.dst.neighbour->nud_state &
1250                                             NUD_VALID)) {
1251                                         if (rt->u.dst.neighbour)
1252                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1253                                         ip_rt_put(rth);
1254                                         rt_drop(rt);
1255                                         goto do_next;
1256                                 }
1257
1258                                 netevent.old = &rth->u.dst;
1259                                 netevent.new = &rt->u.dst;
1260                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1261                                                         &netevent);
1262
1263                                 rt_del(hash, rth);
1264                                 if (!rt_intern_hash(hash, rt, &rt))
1265                                         ip_rt_put(rt);
1266                                 goto do_next;
1267                         }
1268                         rcu_read_unlock();
1269                 do_next:
1270                         ;
1271                 }
1272         }
1273         in_dev_put(in_dev);
1274         return;
1275
1276 reject_redirect:
1277 #ifdef CONFIG_IP_ROUTE_VERBOSE
1278         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1279                 printk(KERN_INFO "Redirect from " NIPQUAD_FMT " on %s about "
1280                         NIPQUAD_FMT " ignored.\n"
1281                         "  Advised path = " NIPQUAD_FMT " -> " NIPQUAD_FMT "\n",
1282                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1283                        NIPQUAD(saddr), NIPQUAD(daddr));
1284 #endif
1285         in_dev_put(in_dev);
1286 }
1287
1288 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1289 {
1290         struct rtable *rt = (struct rtable *)dst;
1291         struct dst_entry *ret = dst;
1292
1293         if (rt) {
1294                 if (dst->obsolete) {
1295                         ip_rt_put(rt);
1296                         ret = NULL;
1297                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1298                            rt->u.dst.expires) {
1299                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1300                                                 rt->fl.oif);
1301 #if RT_CACHE_DEBUG >= 1
1302                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1303                                           NIPQUAD_FMT "/%02x dropped\n",
1304                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1305 #endif
1306                         rt_del(hash, rt);
1307                         ret = NULL;
1308                 }
1309         }
1310         return ret;
1311 }
1312
1313 /*
1314  * Algorithm:
1315  *      1. The first ip_rt_redirect_number redirects are sent
1316  *         with exponential backoff, then we stop sending them at all,
1317  *         assuming that the host ignores our redirects.
1318  *      2. If we did not see packets requiring redirects
1319  *         during ip_rt_redirect_silence, we assume that the host
1320  *         forgot redirected route and start to send redirects again.
1321  *
1322  * This algorithm is much cheaper and more intelligent than dumb load limiting
1323  * in icmp.c.
1324  *
1325  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1326  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1327  */
1328
1329 void ip_rt_send_redirect(struct sk_buff *skb)
1330 {
1331         struct rtable *rt = skb->rtable;
1332         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1333
1334         if (!in_dev)
1335                 return;
1336
1337         if (!IN_DEV_TX_REDIRECTS(in_dev))
1338                 goto out;
1339
1340         /* No redirected packets during ip_rt_redirect_silence;
1341          * reset the algorithm.
1342          */
1343         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1344                 rt->u.dst.rate_tokens = 0;
1345
1346         /* Too many ignored redirects; do not send anything
1347          * set u.dst.rate_last to the last seen redirected packet.
1348          */
1349         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1350                 rt->u.dst.rate_last = jiffies;
1351                 goto out;
1352         }
1353
1354         /* Check for load limit; set rate_last to the latest sent
1355          * redirect.
1356          */
1357         if (rt->u.dst.rate_tokens == 0 ||
1358             time_after(jiffies,
1359                        (rt->u.dst.rate_last +
1360                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1361                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1362                 rt->u.dst.rate_last = jiffies;
1363                 ++rt->u.dst.rate_tokens;
1364 #ifdef CONFIG_IP_ROUTE_VERBOSE
1365                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1366                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1367                     net_ratelimit())
1368                         printk(KERN_WARNING "host " NIPQUAD_FMT "/if%d ignores "
1369                                 "redirects for " NIPQUAD_FMT " to " NIPQUAD_FMT ".\n",
1370                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1371                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1372 #endif
1373         }
1374 out:
1375         in_dev_put(in_dev);
1376 }
1377
1378 static int ip_error(struct sk_buff *skb)
1379 {
1380         struct rtable *rt = skb->rtable;
1381         unsigned long now;
1382         int code;
1383
1384         switch (rt->u.dst.error) {
1385                 case EINVAL:
1386                 default:
1387                         goto out;
1388                 case EHOSTUNREACH:
1389                         code = ICMP_HOST_UNREACH;
1390                         break;
1391                 case ENETUNREACH:
1392                         code = ICMP_NET_UNREACH;
1393                         IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1394                         break;
1395                 case EACCES:
1396                         code = ICMP_PKT_FILTERED;
1397                         break;
1398         }
1399
1400         now = jiffies;
1401         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1402         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1403                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1404         rt->u.dst.rate_last = now;
1405         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1406                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1407                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1408         }
1409
1410 out:    kfree_skb(skb);
1411         return 0;
1412 }
1413
/*
 * MTU plateau table, in descending order.
 *
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

/*
 * Return the largest plateau strictly below @old_mtu, or 68 when
 * @old_mtu is not above the smallest plateau.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	const unsigned short *step = mtu_plateau;
	const unsigned short *end = mtu_plateau + ARRAY_SIZE(mtu_plateau);

	for (; step < end; step++) {
		if (*step < old_mtu)
			return *step;
	}
	return 68;
}
1431
1432 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1433                                  unsigned short new_mtu,
1434                                  struct net_device *dev)
1435 {
1436         int i, k;
1437         unsigned short old_mtu = ntohs(iph->tot_len);
1438         struct rtable *rth;
1439         int  ikeys[2] = { dev->ifindex, 0 };
1440         __be32  skeys[2] = { iph->saddr, 0, };
1441         __be32  daddr = iph->daddr;
1442         unsigned short est_mtu = 0;
1443
1444         if (ipv4_config.no_pmtu_disc)
1445                 return 0;
1446
1447         for (k = 0; k < 2; k++) {
1448                 for (i = 0; i < 2; i++) {
1449                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1450
1451                         rcu_read_lock();
1452                         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1453                              rth = rcu_dereference(rth->u.dst.rt_next)) {
1454                                 unsigned short mtu = new_mtu;
1455
1456                                 if (rth->fl.fl4_dst != daddr ||
1457                                     rth->fl.fl4_src != skeys[i] ||
1458                                     rth->rt_dst != daddr ||
1459                                     rth->rt_src != iph->saddr ||
1460                                     rth->fl.oif != ikeys[k] ||
1461                                     rth->fl.iif != 0 ||
1462                                     dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1463                                     !net_eq(dev_net(rth->u.dst.dev), net) ||
1464                                     rth->rt_genid != atomic_read(&rt_genid))
1465                                         continue;
1466
1467                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1468
1469                                         /* BSD 4.2 compatibility hack :-( */
1470                                         if (mtu == 0 &&
1471                                             old_mtu >= dst_metric(&rth->u.dst, RTAX_MTU) &&
1472                                             old_mtu >= 68 + (iph->ihl << 2))
1473                                                 old_mtu -= iph->ihl << 2;
1474
1475                                         mtu = guess_mtu(old_mtu);
1476                                 }
1477                                 if (mtu <= dst_metric(&rth->u.dst, RTAX_MTU)) {
1478                                         if (mtu < dst_metric(&rth->u.dst, RTAX_MTU)) {
1479                                                 dst_confirm(&rth->u.dst);
1480                                                 if (mtu < ip_rt_min_pmtu) {
1481                                                         mtu = ip_rt_min_pmtu;
1482                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1483                                                                 (1 << RTAX_MTU);
1484                                                 }
1485                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1486                                                 dst_set_expires(&rth->u.dst,
1487                                                         ip_rt_mtu_expires);
1488                                         }
1489                                         est_mtu = mtu;
1490                                 }
1491                         }
1492                         rcu_read_unlock();
1493                 }
1494         }
1495         return est_mtu ? : new_mtu;
1496 }
1497
1498 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1499 {
1500         if (dst_metric(dst, RTAX_MTU) > mtu && mtu >= 68 &&
1501             !(dst_metric_locked(dst, RTAX_MTU))) {
1502                 if (mtu < ip_rt_min_pmtu) {
1503                         mtu = ip_rt_min_pmtu;
1504                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1505                 }
1506                 dst->metrics[RTAX_MTU-1] = mtu;
1507                 dst_set_expires(dst, ip_rt_mtu_expires);
1508                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1509         }
1510 }
1511
1512 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1513 {
1514         return NULL;
1515 }
1516
1517 static void ipv4_dst_destroy(struct dst_entry *dst)
1518 {
1519         struct rtable *rt = (struct rtable *) dst;
1520         struct inet_peer *peer = rt->peer;
1521         struct in_device *idev = rt->idev;
1522
1523         if (peer) {
1524                 rt->peer = NULL;
1525                 inet_putpeer(peer);
1526         }
1527
1528         if (idev) {
1529                 rt->idev = NULL;
1530                 in_dev_put(idev);
1531         }
1532 }
1533
1534 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1535                             int how)
1536 {
1537         struct rtable *rt = (struct rtable *) dst;
1538         struct in_device *idev = rt->idev;
1539         if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1540                 struct in_device *loopback_idev =
1541                         in_dev_get(dev_net(dev)->loopback_dev);
1542                 if (loopback_idev) {
1543                         rt->idev = loopback_idev;
1544                         in_dev_put(idev);
1545                 }
1546         }
1547 }
1548
1549 static void ipv4_link_failure(struct sk_buff *skb)
1550 {
1551         struct rtable *rt;
1552
1553         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1554
1555         rt = skb->rtable;
1556         if (rt)
1557                 dst_set_expires(&rt->u.dst, 0);
1558 }
1559
1560 static int ip_rt_bug(struct sk_buff *skb)
1561 {
1562         printk(KERN_DEBUG "ip_rt_bug: " NIPQUAD_FMT " -> " NIPQUAD_FMT ", %s\n",
1563                 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1564                 skb->dev ? skb->dev->name : "?");
1565         kfree_skb(skb);
1566         return 0;
1567 }
1568
1569 /*
1570    We do not cache source address of outgoing interface,
1571    because it is used only by IP RR, TS and SRR options,
1572    so that it out of fast path.
1573
1574    BTW remember: "addr" is allowed to be not aligned
1575    in IP options!
1576  */
1577
1578 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1579 {
1580         __be32 src;
1581         struct fib_result res;
1582
1583         if (rt->fl.iif == 0)
1584                 src = rt->rt_src;
1585         else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1586                 src = FIB_RES_PREFSRC(res);
1587                 fib_res_put(&res);
1588         } else
1589                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1590                                         RT_SCOPE_UNIVERSE);
1591         memcpy(addr, &src, 4);
1592 }
1593
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Merge @tag into the route's tclassid, one 16-bit half at a time: a half
 * is taken from @tag only if that half of tclassid is still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	const u32 low_half = 0xFFFF;
	const u32 high_half = 0xFFFF0000;

	if ((rt->u.dst.tclassid & low_half) == 0)
		rt->u.dst.tclassid |= tag & low_half;
	if ((rt->u.dst.tclassid & high_half) == 0)
		rt->u.dst.tclassid |= tag & high_half;
}
#endif
1603
1604 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1605 {
1606         struct fib_info *fi = res->fi;
1607
1608         if (fi) {
1609                 if (FIB_RES_GW(*res) &&
1610                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1611                         rt->rt_gateway = FIB_RES_GW(*res);
1612                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1613                        sizeof(rt->u.dst.metrics));
1614                 if (fi->fib_mtu == 0) {
1615                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1616                         if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1617                             rt->rt_gateway != rt->rt_dst &&
1618                             rt->u.dst.dev->mtu > 576)
1619                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1620                 }
1621 #ifdef CONFIG_NET_CLS_ROUTE
1622                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1623 #endif
1624         } else
1625                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1626
1627         if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1628                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1629         if (dst_metric(&rt->u.dst, RTAX_MTU) > IP_MAX_MTU)
1630                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1631         if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1632                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1633                                        ip_rt_min_advmss);
1634         if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1635                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1636
1637 #ifdef CONFIG_NET_CLS_ROUTE
1638 #ifdef CONFIG_IP_MULTIPLE_TABLES
1639         set_class_tag(rt, fib_rules_tclass(res));
1640 #endif
1641         set_class_tag(rt, itag);
1642 #endif
1643         rt->rt_type = res->type;
1644 }
1645
1646 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1647                                 u8 tos, struct net_device *dev, int our)
1648 {
1649         unsigned hash;
1650         struct rtable *rth;
1651         __be32 spec_dst;
1652         struct in_device *in_dev = in_dev_get(dev);
1653         u32 itag = 0;
1654
1655         /* Primary sanity checks. */
1656
1657         if (in_dev == NULL)
1658                 return -EINVAL;
1659
1660         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1661             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1662                 goto e_inval;
1663
1664         if (ipv4_is_zeronet(saddr)) {
1665                 if (!ipv4_is_local_multicast(daddr))
1666                         goto e_inval;
1667                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1668         } else if (fib_validate_source(saddr, 0, tos, 0,
1669                                         dev, &spec_dst, &itag) < 0)
1670                 goto e_inval;
1671
1672         rth = dst_alloc(&ipv4_dst_ops);
1673         if (!rth)
1674                 goto e_nobufs;
1675
1676         rth->u.dst.output= ip_rt_bug;
1677
1678         atomic_set(&rth->u.dst.__refcnt, 1);
1679         rth->u.dst.flags= DST_HOST;
1680         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1681                 rth->u.dst.flags |= DST_NOPOLICY;
1682         rth->fl.fl4_dst = daddr;
1683         rth->rt_dst     = daddr;
1684         rth->fl.fl4_tos = tos;
1685         rth->fl.mark    = skb->mark;
1686         rth->fl.fl4_src = saddr;
1687         rth->rt_src     = saddr;
1688 #ifdef CONFIG_NET_CLS_ROUTE
1689         rth->u.dst.tclassid = itag;
1690 #endif
1691         rth->rt_iif     =
1692         rth->fl.iif     = dev->ifindex;
1693         rth->u.dst.dev  = init_net.loopback_dev;
1694         dev_hold(rth->u.dst.dev);
1695         rth->idev       = in_dev_get(rth->u.dst.dev);
1696         rth->fl.oif     = 0;
1697         rth->rt_gateway = daddr;
1698         rth->rt_spec_dst= spec_dst;
1699         rth->rt_genid   = atomic_read(&rt_genid);
1700         rth->rt_flags   = RTCF_MULTICAST;
1701         rth->rt_type    = RTN_MULTICAST;
1702         if (our) {
1703                 rth->u.dst.input= ip_local_deliver;
1704                 rth->rt_flags |= RTCF_LOCAL;
1705         }
1706
1707 #ifdef CONFIG_IP_MROUTE
1708         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1709                 rth->u.dst.input = ip_mr_input;
1710 #endif
1711         RT_CACHE_STAT_INC(in_slow_mc);
1712
1713         in_dev_put(in_dev);
1714         hash = rt_hash(daddr, saddr, dev->ifindex);
1715         return rt_intern_hash(hash, rth, &skb->rtable);
1716
1717 e_nobufs:
1718         in_dev_put(in_dev);
1719         return -ENOBUFS;
1720
1721 e_inval:
1722         in_dev_put(in_dev);
1723         return -EINVAL;
1724 }
1725
1726
1727 static void ip_handle_martian_source(struct net_device *dev,
1728                                      struct in_device *in_dev,
1729                                      struct sk_buff *skb,
1730                                      __be32 daddr,
1731                                      __be32 saddr)
1732 {
1733         RT_CACHE_STAT_INC(in_martian_src);
1734 #ifdef CONFIG_IP_ROUTE_VERBOSE
1735         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1736                 /*
1737                  *      RFC1812 recommendation, if source is martian,
1738                  *      the only hint is MAC header.
1739                  */
1740                 printk(KERN_WARNING "martian source " NIPQUAD_FMT " from "
1741                         NIPQUAD_FMT", on dev %s\n",
1742                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1743                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1744                         int i;
1745                         const unsigned char *p = skb_mac_header(skb);
1746                         printk(KERN_WARNING "ll header: ");
1747                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1748                                 printk("%02x", *p);
1749                                 if (i < (dev->hard_header_len - 1))
1750                                         printk(":");
1751                         }
1752                         printk("\n");
1753                 }
1754         }
1755 #endif
1756 }
1757
1758 static int __mkroute_input(struct sk_buff *skb,
1759                            struct fib_result *res,
1760                            struct in_device *in_dev,
1761                            __be32 daddr, __be32 saddr, u32 tos,
1762                            struct rtable **result)
1763 {
1764
1765         struct rtable *rth;
1766         int err;
1767         struct in_device *out_dev;
1768         unsigned flags = 0;
1769         __be32 spec_dst;
1770         u32 itag;
1771
1772         /* get a working reference to the output device */
1773         out_dev = in_dev_get(FIB_RES_DEV(*res));
1774         if (out_dev == NULL) {
1775                 if (net_ratelimit())
1776                         printk(KERN_CRIT "Bug in ip_route_input" \
1777                                "_slow(). Please, report\n");
1778                 return -EINVAL;
1779         }
1780
1781
1782         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1783                                   in_dev->dev, &spec_dst, &itag);
1784         if (err < 0) {
1785                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1786                                          saddr);
1787
1788                 err = -EINVAL;
1789                 goto cleanup;
1790         }
1791
1792         if (err)
1793                 flags |= RTCF_DIRECTSRC;
1794
1795         if (out_dev == in_dev && err && !(flags & RTCF_MASQ) &&
1796             (IN_DEV_SHARED_MEDIA(out_dev) ||
1797              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1798                 flags |= RTCF_DOREDIRECT;
1799
1800         if (skb->protocol != htons(ETH_P_IP)) {
1801                 /* Not IP (i.e. ARP). Do not create route, if it is
1802                  * invalid for proxy arp. DNAT routes are always valid.
1803                  */
1804                 if (out_dev == in_dev) {
1805                         err = -EINVAL;
1806                         goto cleanup;
1807                 }
1808         }
1809
1810
1811         rth = dst_alloc(&ipv4_dst_ops);
1812         if (!rth) {
1813                 err = -ENOBUFS;
1814                 goto cleanup;
1815         }
1816
1817         atomic_set(&rth->u.dst.__refcnt, 1);
1818         rth->u.dst.flags= DST_HOST;
1819         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1820                 rth->u.dst.flags |= DST_NOPOLICY;
1821         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1822                 rth->u.dst.flags |= DST_NOXFRM;
1823         rth->fl.fl4_dst = daddr;
1824         rth->rt_dst     = daddr;
1825         rth->fl.fl4_tos = tos;
1826         rth->fl.mark    = skb->mark;
1827         rth->fl.fl4_src = saddr;
1828         rth->rt_src     = saddr;
1829         rth->rt_gateway = daddr;
1830         rth->rt_iif     =
1831                 rth->fl.iif     = in_dev->dev->ifindex;
1832         rth->u.dst.dev  = (out_dev)->dev;
1833         dev_hold(rth->u.dst.dev);
1834         rth->idev       = in_dev_get(rth->u.dst.dev);
1835         rth->fl.oif     = 0;
1836         rth->rt_spec_dst= spec_dst;
1837
1838         rth->u.dst.input = ip_forward;
1839         rth->u.dst.output = ip_output;
1840         rth->rt_genid = atomic_read(&rt_genid);
1841
1842         rt_set_nexthop(rth, res, itag);
1843
1844         rth->rt_flags = flags;
1845
1846         *result = rth;
1847         err = 0;
1848  cleanup:
1849         /* release the working reference to the output device */
1850         in_dev_put(out_dev);
1851         return err;
1852 }
1853
1854 static int ip_mkroute_input(struct sk_buff *skb,
1855                             struct fib_result *res,
1856                             const struct flowi *fl,
1857                             struct in_device *in_dev,
1858                             __be32 daddr, __be32 saddr, u32 tos)
1859 {
1860         struct rtable* rth = NULL;
1861         int err;
1862         unsigned hash;
1863
1864 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1865         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1866                 fib_select_multipath(fl, res);
1867 #endif
1868
1869         /* create a routing cache entry */
1870         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1871         if (err)
1872                 return err;
1873
1874         /* put it into the cache */
1875         hash = rt_hash(daddr, saddr, fl->iif);
1876         return rt_intern_hash(hash, rth, &skb->rtable);
1877 }
1878
1879 /*
1880  *      NOTE. We drop all the packets that has local source
1881  *      addresses, because every properly looped back packet
1882  *      must have correct destination already attached by output routine.
1883  *
1884  *      Such approach solves two big problems:
1885  *      1. Not simplex devices are handled properly.
1886  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1887  */
1888
1889 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1890                                u8 tos, struct net_device *dev)
1891 {
1892         struct fib_result res;
1893         struct in_device *in_dev = in_dev_get(dev);
1894         struct flowi fl = { .nl_u = { .ip4_u =
1895                                       { .daddr = daddr,
1896                                         .saddr = saddr,
1897                                         .tos = tos,
1898                                         .scope = RT_SCOPE_UNIVERSE,
1899                                       } },
1900                             .mark = skb->mark,
1901                             .iif = dev->ifindex };
1902         unsigned        flags = 0;
1903         u32             itag = 0;
1904         struct rtable * rth;
1905         unsigned        hash;
1906         __be32          spec_dst;
1907         int             err = -EINVAL;
1908         int             free_res = 0;
1909         struct net    * net = dev_net(dev);
1910
1911         /* IP on this device is disabled. */
1912
1913         if (!in_dev)
1914                 goto out;
1915
1916         /* Check for the most weird martians, which can be not detected
1917            by fib_lookup.
1918          */
1919
1920         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1921             ipv4_is_loopback(saddr))
1922                 goto martian_source;
1923
1924         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1925                 goto brd_input;
1926
1927         /* Accept zero addresses only to limited broadcast;
1928          * I even do not know to fix it or not. Waiting for complains :-)
1929          */
1930         if (ipv4_is_zeronet(saddr))
1931                 goto martian_source;
1932
1933         if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
1934             ipv4_is_loopback(daddr))
1935                 goto martian_destination;
1936
1937         /*
1938          *      Now we are ready to route packet.
1939          */
1940         if ((err = fib_lookup(net, &fl, &res)) != 0) {
1941                 if (!IN_DEV_FORWARD(in_dev))
1942                         goto e_hostunreach;
1943                 goto no_route;
1944         }
1945         free_res = 1;
1946
1947         RT_CACHE_STAT_INC(in_slow_tot);
1948
1949         if (res.type == RTN_BROADCAST)
1950                 goto brd_input;
1951
1952         if (res.type == RTN_LOCAL) {
1953                 int result;
1954                 result = fib_validate_source(saddr, daddr, tos,
1955                                              net->loopback_dev->ifindex,
1956                                              dev, &spec_dst, &itag);
1957                 if (result < 0)
1958                         goto martian_source;
1959                 if (result)
1960                         flags |= RTCF_DIRECTSRC;
1961                 spec_dst = daddr;
1962                 goto local_input;
1963         }
1964
1965         if (!IN_DEV_FORWARD(in_dev))
1966                 goto e_hostunreach;
1967         if (res.type != RTN_UNICAST)
1968                 goto martian_destination;
1969
1970         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1971 done:
1972         in_dev_put(in_dev);
1973         if (free_res)
1974                 fib_res_put(&res);
1975 out:    return err;
1976
1977 brd_input:
1978         if (skb->protocol != htons(ETH_P_IP))
1979                 goto e_inval;
1980
1981         if (ipv4_is_zeronet(saddr))
1982                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1983         else {
1984                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1985                                           &itag);
1986                 if (err < 0)
1987                         goto martian_source;
1988                 if (err)
1989                         flags |= RTCF_DIRECTSRC;
1990         }
1991         flags |= RTCF_BROADCAST;
1992         res.type = RTN_BROADCAST;
1993         RT_CACHE_STAT_INC(in_brd);
1994
1995 local_input:
1996         rth = dst_alloc(&ipv4_dst_ops);
1997         if (!rth)
1998                 goto e_nobufs;
1999
2000         rth->u.dst.output= ip_rt_bug;
2001         rth->rt_genid = atomic_read(&rt_genid);
2002
2003         atomic_set(&rth->u.dst.__refcnt, 1);
2004         rth->u.dst.flags= DST_HOST;
2005         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2006                 rth->u.dst.flags |= DST_NOPOLICY;
2007         rth->fl.fl4_dst = daddr;
2008         rth->rt_dst     = daddr;
2009         rth->fl.fl4_tos = tos;
2010         rth->fl.mark    = skb->mark;
2011         rth->fl.fl4_src = saddr;
2012         rth->rt_src     = saddr;
2013 #ifdef CONFIG_NET_CLS_ROUTE
2014         rth->u.dst.tclassid = itag;
2015 #endif
2016         rth->rt_iif     =
2017         rth->fl.iif     = dev->ifindex;
2018         rth->u.dst.dev  = net->loopback_dev;
2019         dev_hold(rth->u.dst.dev);
2020         rth->idev       = in_dev_get(rth->u.dst.dev);
2021         rth->rt_gateway = daddr;
2022         rth->rt_spec_dst= spec_dst;
2023         rth->u.dst.input= ip_local_deliver;
2024         rth->rt_flags   = flags|RTCF_LOCAL;
2025         if (res.type == RTN_UNREACHABLE) {
2026                 rth->u.dst.input= ip_error;
2027                 rth->u.dst.error= -err;
2028                 rth->rt_flags   &= ~RTCF_LOCAL;
2029         }
2030         rth->rt_type    = res.type;
2031         hash = rt_hash(daddr, saddr, fl.iif);
2032         err = rt_intern_hash(hash, rth, &skb->rtable);
2033         goto done;
2034
2035 no_route:
2036         RT_CACHE_STAT_INC(in_no_route);
2037         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2038         res.type = RTN_UNREACHABLE;
2039         if (err == -ESRCH)
2040                 err = -ENETUNREACH;
2041         goto local_input;
2042
2043         /*
2044          *      Do not cache martian addresses: they should be logged (RFC1812)
2045          */
2046 martian_destination:
2047         RT_CACHE_STAT_INC(in_martian_dst);
2048 #ifdef CONFIG_IP_ROUTE_VERBOSE
2049         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2050                 printk(KERN_WARNING "martian destination " NIPQUAD_FMT " from "
2051                         NIPQUAD_FMT ", dev %s\n",
2052                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2053 #endif
2054
2055 e_hostunreach:
2056         err = -EHOSTUNREACH;
2057         goto done;
2058
2059 e_inval:
2060         err = -EINVAL;
2061         goto done;
2062
2063 e_nobufs:
2064         err = -ENOBUFS;
2065         goto done;
2066
2067 martian_source:
2068         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2069         goto e_inval;
2070 }
2071
2072 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2073                    u8 tos, struct net_device *dev)
2074 {
2075         struct rtable * rth;
2076         unsigned        hash;
2077         int iif = dev->ifindex;
2078         struct net *net;
2079
2080         net = dev_net(dev);
2081         tos &= IPTOS_RT_MASK;
2082         hash = rt_hash(daddr, saddr, iif);
2083
2084         rcu_read_lock();
2085         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2086              rth = rcu_dereference(rth->u.dst.rt_next)) {
2087                 if (((rth->fl.fl4_dst ^ daddr) |
2088                      (rth->fl.fl4_src ^ saddr) |
2089                      (rth->fl.iif ^ iif) |
2090                      rth->fl.oif |
2091                      (rth->fl.fl4_tos ^ tos)) == 0 &&
2092                     rth->fl.mark == skb->mark &&
2093                     net_eq(dev_net(rth->u.dst.dev), net) &&
2094                     rth->rt_genid == atomic_read(&rt_genid)) {
2095                         dst_use(&rth->u.dst, jiffies);
2096                         RT_CACHE_STAT_INC(in_hit);
2097                         rcu_read_unlock();
2098                         skb->rtable = rth;
2099                         return 0;
2100                 }
2101                 RT_CACHE_STAT_INC(in_hlist_search);
2102         }
2103         rcu_read_unlock();
2104
2105         /* Multicast recognition logic is moved from route cache to here.
2106            The problem was that too many Ethernet cards have broken/missing
2107            hardware multicast filters :-( As result the host on multicasting
2108            network acquires a lot of useless route cache entries, sort of
2109            SDR messages from all the world. Now we try to get rid of them.
2110            Really, provided software IP multicast filter is organized
2111            reasonably (at least, hashed), it does not result in a slowdown
2112            comparing with route cache reject entries.
2113            Note, that multicast routers are not affected, because
2114            route cache entry is created eventually.
2115          */
2116         if (ipv4_is_multicast(daddr)) {
2117                 struct in_device *in_dev;
2118
2119                 rcu_read_lock();
2120                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2121                         int our = ip_check_mc(in_dev, daddr, saddr,
2122                                 ip_hdr(skb)->protocol);
2123                         if (our
2124 #ifdef CONFIG_IP_MROUTE
2125                             || (!ipv4_is_local_multicast(daddr) &&
2126                                 IN_DEV_MFORWARD(in_dev))
2127 #endif
2128                             ) {
2129                                 rcu_read_unlock();
2130                                 return ip_route_input_mc(skb, daddr, saddr,
2131                                                          tos, dev, our);
2132                         }
2133                 }
2134                 rcu_read_unlock();
2135                 return -EINVAL;
2136         }
2137         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2138 }
2139
2140 static int __mkroute_output(struct rtable **result,
2141                             struct fib_result *res,
2142                             const struct flowi *fl,
2143                             const struct flowi *oldflp,
2144                             struct net_device *dev_out,
2145                             unsigned flags)
2146 {
2147         struct rtable *rth;
2148         struct in_device *in_dev;
2149         u32 tos = RT_FL_TOS(oldflp);
2150         int err = 0;
2151
2152         if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2153                 return -EINVAL;
2154
2155         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2156                 res->type = RTN_BROADCAST;
2157         else if (ipv4_is_multicast(fl->fl4_dst))
2158                 res->type = RTN_MULTICAST;
2159         else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2160                 return -EINVAL;
2161
2162         if (dev_out->flags & IFF_LOOPBACK)
2163                 flags |= RTCF_LOCAL;
2164
2165         /* get work reference to inet device */
2166         in_dev = in_dev_get(dev_out);
2167         if (!in_dev)
2168                 return -EINVAL;
2169
2170         if (res->type == RTN_BROADCAST) {
2171                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2172                 if (res->fi) {
2173                         fib_info_put(res->fi);
2174                         res->fi = NULL;
2175                 }
2176         } else if (res->type == RTN_MULTICAST) {
2177                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2178                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2179                                  oldflp->proto))
2180                         flags &= ~RTCF_LOCAL;
2181                 /* If multicast route do not exist use
2182                    default one, but do not gateway in this case.
2183                    Yes, it is hack.
2184                  */
2185                 if (res->fi && res->prefixlen < 4) {
2186                         fib_info_put(res->fi);
2187                         res->fi = NULL;
2188                 }
2189         }
2190
2191
2192         rth = dst_alloc(&ipv4_dst_ops);
2193         if (!rth) {
2194                 err = -ENOBUFS;
2195                 goto cleanup;
2196         }
2197
2198         atomic_set(&rth->u.dst.__refcnt, 1);
2199         rth->u.dst.flags= DST_HOST;
2200         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2201                 rth->u.dst.flags |= DST_NOXFRM;
2202         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2203                 rth->u.dst.flags |= DST_NOPOLICY;
2204
2205         rth->fl.fl4_dst = oldflp->fl4_dst;
2206         rth->fl.fl4_tos = tos;
2207         rth->fl.fl4_src = oldflp->fl4_src;
2208         rth->fl.oif     = oldflp->oif;
2209         rth->fl.mark    = oldflp->mark;
2210         rth->rt_dst     = fl->fl4_dst;
2211         rth->rt_src     = fl->fl4_src;
2212         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2213         /* get references to the devices that are to be hold by the routing
2214            cache entry */
2215         rth->u.dst.dev  = dev_out;
2216         dev_hold(dev_out);
2217         rth->idev       = in_dev_get(dev_out);
2218         rth->rt_gateway = fl->fl4_dst;
2219         rth->rt_spec_dst= fl->fl4_src;
2220
2221         rth->u.dst.output=ip_output;
2222         rth->rt_genid = atomic_read(&rt_genid);
2223
2224         RT_CACHE_STAT_INC(out_slow_tot);
2225
2226         if (flags & RTCF_LOCAL) {
2227                 rth->u.dst.input = ip_local_deliver;
2228                 rth->rt_spec_dst = fl->fl4_dst;
2229         }
2230         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2231                 rth->rt_spec_dst = fl->fl4_src;
2232                 if (flags & RTCF_LOCAL &&
2233                     !(dev_out->flags & IFF_LOOPBACK)) {
2234                         rth->u.dst.output = ip_mc_output;
2235                         RT_CACHE_STAT_INC(out_slow_mc);
2236                 }
2237 #ifdef CONFIG_IP_MROUTE
2238                 if (res->type == RTN_MULTICAST) {
2239                         if (IN_DEV_MFORWARD(in_dev) &&
2240                             !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2241                                 rth->u.dst.input = ip_mr_input;
2242                                 rth->u.dst.output = ip_mc_output;
2243                         }
2244                 }
2245 #endif
2246         }
2247
2248         rt_set_nexthop(rth, res, 0);
2249
2250         rth->rt_flags = flags;
2251
2252         *result = rth;
2253  cleanup:
2254         /* release work reference to inet device */
2255         in_dev_put(in_dev);
2256
2257         return err;
2258 }
2259
2260 static int ip_mkroute_output(struct rtable **rp,
2261                              struct fib_result *res,
2262                              const struct flowi *fl,
2263                              const struct flowi *oldflp,
2264                              struct net_device *dev_out,
2265                              unsigned flags)
2266 {
2267         struct rtable *rth = NULL;
2268         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2269         unsigned hash;
2270         if (err == 0) {
2271                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2272                 err = rt_intern_hash(hash, rth, rp);
2273         }
2274
2275         return err;
2276 }
2277
2278 /*
2279  * Major route resolver routine.
2280  */
2281
2282 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2283                                 const struct flowi *oldflp)
2284 {
2285         u32 tos = RT_FL_TOS(oldflp);
2286         struct flowi fl = { .nl_u = { .ip4_u =
2287                                       { .daddr = oldflp->fl4_dst,
2288                                         .saddr = oldflp->fl4_src,
2289                                         .tos = tos & IPTOS_RT_MASK,
2290                                         .scope = ((tos & RTO_ONLINK) ?
2291                                                   RT_SCOPE_LINK :
2292                                                   RT_SCOPE_UNIVERSE),
2293                                       } },
2294                             .mark = oldflp->mark,
2295                             .iif = net->loopback_dev->ifindex,
2296                             .oif = oldflp->oif };
2297         struct fib_result res;
2298         unsigned flags = 0;
2299         struct net_device *dev_out = NULL;
2300         int free_res = 0;
2301         int err;
2302
2303
2304         res.fi          = NULL;
2305 #ifdef CONFIG_IP_MULTIPLE_TABLES
2306         res.r           = NULL;
2307 #endif
2308
2309         if (oldflp->fl4_src) {
2310                 err = -EINVAL;
2311                 if (ipv4_is_multicast(oldflp->fl4_src) ||
2312                     ipv4_is_lbcast(oldflp->fl4_src) ||
2313                     ipv4_is_zeronet(oldflp->fl4_src))
2314                         goto out;
2315
2316                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2317                 dev_out = ip_dev_find(net, oldflp->fl4_src);
2318                 if (dev_out == NULL)
2319                         goto out;
2320
2321                 /* I removed check for oif == dev_out->oif here.
2322                    It was wrong for two reasons:
2323                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2324                       is assigned to multiple interfaces.
2325                    2. Moreover, we are allowed to send packets with saddr
2326                       of another iface. --ANK
2327                  */
2328
2329                 if (oldflp->oif == 0
2330                     && (ipv4_is_multicast(oldflp->fl4_dst) ||
2331                         oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2332                         /* Special hack: user can direct multicasts
2333                            and limited broadcast via necessary interface
2334                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2335                            This hack is not just for fun, it allows
2336                            vic,vat and friends to work.
2337                            They bind socket to loopback, set ttl to zero
2338                            and expect that it will work.
2339                            From the viewpoint of routing cache they are broken,
2340                            because we are not allowed to build multicast path
2341                            with loopback source addr (look, routing cache
2342                            cannot know, that ttl is zero, so that packet
2343                            will not leave this host and route is valid).
2344                            Luckily, this hack is good workaround.
2345                          */
2346
2347                         fl.oif = dev_out->ifindex;
2348                         goto make_route;
2349                 }
2350                 if (dev_out)
2351                         dev_put(dev_out);
2352                 dev_out = NULL;
2353         }
2354
2355
2356         if (oldflp->oif) {
2357                 dev_out = dev_get_by_index(net, oldflp->oif);
2358                 err = -ENODEV;
2359                 if (dev_out == NULL)
2360                         goto out;
2361
2362                 /* RACE: Check return value of inet_select_addr instead. */
2363                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2364                         dev_put(dev_out);
2365                         goto out;       /* Wrong error code */
2366                 }
2367
2368                 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2369                     oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2370                         if (!fl.fl4_src)
2371                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2372                                                               RT_SCOPE_LINK);
2373                         goto make_route;
2374                 }
2375                 if (!fl.fl4_src) {
2376                         if (ipv4_is_multicast(oldflp->fl4_dst))
2377                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2378                                                               fl.fl4_scope);
2379                         else if (!oldflp->fl4_dst)
2380                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2381                                                               RT_SCOPE_HOST);
2382                 }
2383         }
2384
2385         if (!fl.fl4_dst) {
2386                 fl.fl4_dst = fl.fl4_src;
2387                 if (!fl.fl4_dst)
2388                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2389                 if (dev_out)
2390                         dev_put(dev_out);
2391                 dev_out = net->loopback_dev;
2392                 dev_hold(dev_out);
2393                 fl.oif = net->loopback_dev->ifindex;
2394                 res.type = RTN_LOCAL;
2395                 flags |= RTCF_LOCAL;
2396                 goto make_route;
2397         }
2398
2399         if (fib_lookup(net, &fl, &res)) {
2400                 res.fi = NULL;
2401                 if (oldflp->oif) {
2402                         /* Apparently, routing tables are wrong. Assume,
2403                            that the destination is on link.
2404
2405                            WHY? DW.
2406                            Because we are allowed to send to iface
2407                            even if it has NO routes and NO assigned
2408                            addresses. When oif is specified, routing
2409                            tables are looked up with only one purpose:
2410                            to catch if destination is gatewayed, rather than
2411                            direct. Moreover, if MSG_DONTROUTE is set,
2412                            we send packet, ignoring both routing tables
2413                            and ifaddr state. --ANK
2414
2415
2416                            We could make it even if oif is unknown,
2417                            likely IPv6, but we do not.
2418                          */
2419
2420                         if (fl.fl4_src == 0)
2421                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2422                                                               RT_SCOPE_LINK);
2423                         res.type = RTN_UNICAST;
2424                         goto make_route;
2425                 }
2426                 if (dev_out)
2427                         dev_put(dev_out);
2428                 err = -ENETUNREACH;
2429                 goto out;
2430         }
2431         free_res = 1;
2432
2433         if (res.type == RTN_LOCAL) {
2434                 if (!fl.fl4_src)
2435                         fl.fl4_src = fl.fl4_dst;
2436                 if (dev_out)
2437                         dev_put(dev_out);
2438                 dev_out = net->loopback_dev;
2439                 dev_hold(dev_out);
2440                 fl.oif = dev_out->ifindex;
2441                 if (res.fi)
2442                         fib_info_put(res.fi);
2443                 res.fi = NULL;
2444                 flags |= RTCF_LOCAL;
2445                 goto make_route;
2446         }
2447
2448 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2449         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2450                 fib_select_multipath(&fl, &res);
2451         else
2452 #endif
2453         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2454                 fib_select_default(net, &fl, &res);
2455
2456         if (!fl.fl4_src)
2457                 fl.fl4_src = FIB_RES_PREFSRC(res);
2458
2459         if (dev_out)
2460                 dev_put(dev_out);
2461         dev_out = FIB_RES_DEV(res);
2462         dev_hold(dev_out);
2463         fl.oif = dev_out->ifindex;
2464
2465
2466 make_route:
2467         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2468
2469
2470         if (free_res)
2471                 fib_res_put(&res);
2472         if (dev_out)
2473                 dev_put(dev_out);
2474 out:    return err;
2475 }
2476
     /*
      * __ip_route_output_key - resolve an output route for a flow, trying the
      * route cache before the slow path.
      * @net: network namespace the lookup is done for
      * @rp:  receives the route on success
      * @flp: flow key (destination, source, oif, mark, tos are consulted here)
      *
      * The cache bucket selected by rt_hash(dst, src, oif) is walked inside an
      * rcu_read_lock_bh() section.  An entry is accepted only when all of the
      * following hold: same destination and source, no input interface recorded
      * (fl.iif == 0), same oif and mark, identical TOS within the bits
      * IPTOS_RT_MASK | RTO_ONLINK, its device belongs to @net, and its
      * generation id equals the current rt_genid.
      *
      * Returns 0 with *@rp set on a cache hit; otherwise returns whatever
      * ip_route_output_slow() returns.
      */
2477 int __ip_route_output_key(struct net *net, struct rtable **rp,
2478                           const struct flowi *flp)
2479 {
2480         unsigned hash;
2481         struct rtable *rth;
2482
2483         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2484
2485         rcu_read_lock_bh();
2486         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2487                 rth = rcu_dereference(rth->u.dst.rt_next)) {
2488                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2489                     rth->fl.fl4_src == flp->fl4_src &&
2490                     rth->fl.iif == 0 &&
2491                     rth->fl.oif == flp->oif &&
2492                     rth->fl.mark == flp->mark &&
2493                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2494                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2495                     net_eq(dev_net(rth->u.dst.dev), net) &&
2496                     rth->rt_genid == atomic_read(&rt_genid)) {
                             /*
                              * Hit.  dst_use() is called before leaving the
                              * read section; presumably it takes the reference
                              * handed to the caller -- confirm in dst.h.
                              */
2497                         dst_use(&rth->u.dst, jiffies);
2498                         RT_CACHE_STAT_INC(out_hit);
2499                         rcu_read_unlock_bh();
2500                         *rp = rth;
2501                         return 0;
2502                 }
                     /* Counted once for every non-matching entry examined. */
2503                 RT_CACHE_STAT_INC(out_hlist_search);
2504         }
2505         rcu_read_unlock_bh();
2506
             /* Cache miss: fall back to the slow lookup. */
2507         return ip_route_output_slow(net, rp, flp);
2508 }
2509
2510 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2511
     /*
      * Empty update_pmtu callback installed in ipv4_dst_blackhole_ops;
      * both @dst and @mtu are ignored.
      */
2512 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2513 {
2514 }
2515
     /*
      * dst_ops for the discard-only entries allocated by ipv4_dst_blackhole().
      * It shares the destroy and check callbacks with the regular IPv4 routes
      * but uses the no-op update_pmtu above.  kmem_cachep is not initialised
      * here: ip_rt_init() points it at the cache of ipv4_dst_ops.
      */
2516 static struct dst_ops ipv4_dst_blackhole_ops = {
2517         .family                 =       AF_INET,
2518         .protocol               =       __constant_htons(ETH_P_IP),
2519         .destroy                =       ipv4_dst_destroy,
2520         .check                  =       ipv4_dst_check,
2521         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2522         .entry_size             =       sizeof(struct rtable),
2523         .entries                =       ATOMIC_INIT(0),
2524 };
2525
2526
     /*
      * ipv4_dst_blackhole - replace *@rp by a copy that discards all packets.
      * @rp:  in: route to copy; out: the new entry, or NULL if allocation failed
      * @flp: not used by this function
      *
      * A new rtable is allocated from ipv4_dst_blackhole_ops with both its
      * input and output handlers set to dst_discard.  Metrics, device, flow
      * key, idev, flags, type, addresses, gateway and peer are copied from the
      * original; a reference is taken on the device, idev and peer when they
      * are set.  The caller's reference on the original route is dropped
      * whether or not the allocation succeeded.
      *
      * Returns 0 on success, -ENOMEM if dst_alloc() failed.
      */
2527 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp)
2528 {
2529         struct rtable *ort = *rp;
2530         struct rtable *rt = (struct rtable *)
2531                 dst_alloc(&ipv4_dst_blackhole_ops);
2532
2533         if (rt) {
2534                 struct dst_entry *new = &rt->u.dst;
2535
                     /* The single reference is the one returned through *rp. */
2536                 atomic_set(&new->__refcnt, 1);
2537                 new->__use = 1;
2538                 new->input = dst_discard;
2539                 new->output = dst_discard;
2540                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2541
2542                 new->dev = ort->u.dst.dev;
2543                 if (new->dev)
2544                         dev_hold(new->dev);
2545
2546                 rt->fl = ort->fl;
2547
2548                 rt->idev = ort->idev;
2549                 if (rt->idev)
2550                         in_dev_hold(rt->idev);
2551                 rt->rt_genid = atomic_read(&rt_genid);
2552                 rt->rt_flags = ort->rt_flags;
2553                 rt->rt_type = ort->rt_type;
2554                 rt->rt_dst = ort->rt_dst;
2555                 rt->rt_src = ort->rt_src;
2556                 rt->rt_iif = ort->rt_iif;
2557                 rt->rt_gateway = ort->rt_gateway;
2558                 rt->rt_spec_dst = ort->rt_spec_dst;
2559                 rt->peer = ort->peer;
2560                 if (rt->peer)
2561                         atomic_inc(&rt->peer->refcnt);
2562
                     /*
                      * NOTE(review): dst_free() is called on the entry that is
                      * about to be returned.  Presumably, with a non-zero
                      * refcount, it only arranges for the entry to be released
                      * once the last reference is dropped -- confirm against
                      * dst_free() before touching this.
                      */
2563                 dst_free(new);
2564         }
2565
             /* Drop the original route even when no replacement was built. */
2566         dst_release(&(*rp)->u.dst);
2567         *rp = rt;
2568         return (rt ? 0 : -ENOMEM);
2569 }
2570
     /*
      * ip_route_output_flow - output route lookup followed by an xfrm lookup.
      * @net:   network namespace
      * @rp:    receives the resulting route
      * @flp:   flow key; a zero source or destination address is filled in
      *         from the route that was found (only when flp->proto is set)
      * @sk:    socket handed to __xfrm_lookup(), may be NULL
      * @flags: non-zero requests XFRM_LOOKUP_WAIT for the xfrm lookup
      *
      * If the plain route lookup fails its error is returned unchanged.  When
      * flp->proto is zero the xfrm step is skipped and 0 is returned.  An
      * -EREMOTE result from __xfrm_lookup() is converted into a blackhole
      * route by ipv4_dst_blackhole(), whose result is then returned.
      */
2571 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2572                          struct sock *sk, int flags)
2573 {
2574         int err;
2575
2576         if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2577                 return err;
2578
2579         if (flp->proto) {
                     /* Complete the flow key from the route before the xfrm lookup. */
2580                 if (!flp->fl4_src)
2581                         flp->fl4_src = (*rp)->rt_src;
2582                 if (!flp->fl4_dst)
2583                         flp->fl4_dst = (*rp)->rt_dst;
2584                 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
2585                                     flags ? XFRM_LOOKUP_WAIT : 0);
2586                 if (err == -EREMOTE)
2587                         err = ipv4_dst_blackhole(rp, flp);
2588
2589                 return err;
2590         }
2591
2592         return 0;
2593 }
2594
2595 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2596
     /*
      * ip_route_output_key - ip_route_output_flow() without a socket and with
      * flags 0.  Arguments and return value are those of
      * ip_route_output_flow().
      */
2597 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2598 {
2599         return ip_route_output_flow(net, rp, flp, NULL, 0);
2600 }
2601
     /*
      * rt_fill_info - append one route message describing skb->rtable to @skb.
      * @skb:    message buffer; the route reported is taken from skb->rtable
      * @pid:    passed to nlmsg_put()
      * @seq:    passed to nlmsg_put()
      * @event:  message type passed to nlmsg_put()
      * @nowait: passed to ipmr_get_route(); also selects how its failure is
      *          handled (see below)
      * @flags:  netlink message flags passed to nlmsg_put()
      *
      * Returns the value of nlmsg_end() on success and -EMSGSIZE when the
      * header or an attribute could not be added (the partial message is
      * cancelled first, except when nlmsg_put() itself failed).  It returns 0
      * when ipmr_get_route() returned 0 and @nowait is zero; in that case the
      * message is neither finished nor cancelled here.
      *
      * The NLA_PUT_* macros are expected to jump to nla_put_failure when an
      * attribute does not fit.
      */
2602 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2603                         int nowait, unsigned int flags)
2604 {
2605         struct rtable *rt = skb->rtable;
2606         struct rtmsg *r;
2607         struct nlmsghdr *nlh;
2608         long expires;
2609         u32 id = 0, ts = 0, tsage = 0, error;
2610
2611         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2612         if (nlh == NULL)
2613                 return -EMSGSIZE;
2614
2615         r = nlmsg_data(nlh);
2616         r->rtm_family    = AF_INET;
2617         r->rtm_dst_len  = 32;
2618         r->rtm_src_len  = 0;
2619         r->rtm_tos      = rt->fl.fl4_tos;
2620         r->rtm_table    = RT_TABLE_MAIN;
2621         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2622         r->rtm_type     = rt->rt_type;
2623         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2624         r->rtm_protocol = RTPROT_UNSPEC;
             /* Drop the low 16 bits of rt_flags; RTM_F_CLONED is always set. */
2625         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2626         if (rt->rt_flags & RTCF_NOTIFY)
2627                 r->rtm_flags |= RTM_F_NOTIFY;
2628
2629         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2630
2631         if (rt->fl.fl4_src) {
2632                 r->rtm_src_len = 32;
2633                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2634         }
2635         if (rt->u.dst.dev)
2636                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2637 #ifdef CONFIG_NET_CLS_ROUTE
2638         if (rt->u.dst.tclassid)
2639                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2640 #endif
             /*
              * Entries with an input interface report rt_spec_dst as preferred
              * source; the others report rt_src, and only when it differs from
              * the source address in the flow key.
              */
2641         if (rt->fl.iif)
2642                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2643         else if (rt->rt_src != rt->fl.fl4_src)
2644                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2645
             /* A gateway is reported only when it differs from the destination. */
2646         if (rt->rt_dst != rt->rt_gateway)
2647                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2648
2649         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2650                 goto nla_put_failure;
2651
             /* Values for the cacheinfo attribute emitted at the end. */
2652         error = rt->u.dst.error;
             /* Relative to the current jiffies; 0 when no expiry is set. */
2653         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2654         if (rt->peer) {
2655                 id = rt->peer->ip_id_count;
2656                 if (rt->peer->tcp_ts_stamp) {
2657                         ts = rt->peer->tcp_ts;
2658                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2659                 }
2660         }
2661
2662         if (rt->fl.iif) {
                     /*
                      * With CONFIG_IP_MROUTE, a non-link-local multicast
                      * destination is handed to ipmr_get_route() when
                      * MC_FORWARDING is enabled (read from init_net), and
                      * RTA_IIF is emitted only in the else branch.  Without
                      * CONFIG_IP_MROUTE, RTA_IIF is always emitted here.
                      */
2663 #ifdef CONFIG_IP_MROUTE
2664                 __be32 dst = rt->rt_dst;
2665
2666                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2667                     IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
2668                         int err = ipmr_get_route(skb, r, nowait);
                             /*
                              * err <= 0 handling:
                              *  !nowait: 0 returns 0 straight away, a negative
                              *           value fails the message as -EMSGSIZE;
                              *  nowait:  -EMSGSIZE fails the message, any other
                              *           value is reported through the
                              *           cacheinfo error field.
                              * A positive err needs no special handling.
                              */
2669                         if (err <= 0) {
2670                                 if (!nowait) {
2671                                         if (err == 0)
2672                                                 return 0;
2673                                         goto nla_put_failure;
2674                                 } else {
2675                                         if (err == -EMSGSIZE)
2676                                                 goto nla_put_failure;
2677                                         error = err;
2678                                 }
2679                         }
2680                 } else
2681 #endif
2682                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2683         }
2684
2685         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2686                                expires, error) < 0)
2687                 goto nla_put_failure;
2688
2689         return nlmsg_end(skb, nlh);
2690
2691 nla_put_failure:
             /* Remove the partially built message from the skb. */
2692         nlmsg_cancel(skb, nlh);
2693         return -EMSGSIZE;
2694 }
2695
     /*
      * inet_rtm_getroute - RTM_GETROUTE handler (registered in ip_rt_init()).
      * @in_skb: request skb; its socket selects the namespace and its netlink
      *          control block supplies the pid the reply is sent to
      * @nlh:    request header, parsed against rtm_ipv4_policy
      * @arg:    not used
      *
      * A reply skb is allocated and a route is looked up for the RTA_SRC /
      * RTA_DST pair of the request: through ip_route_input() when RTA_IIF is
      * given and non-zero, otherwise through ip_route_output_key() (optionally
      * bound to RTA_OIF).  The result is formatted by rt_fill_info() and sent
      * back with rtnl_unicast().
      *
      * Returns the rtnl_unicast() result on success, otherwise a negative
      * errno: the parse error, -ENOBUFS, -ENODEV, the lookup error, or the
      * rt_fill_info() result.  A 0 from rt_fill_info() also ends up here as a
      * return value of 0 with the reply skb freed and nothing sent.
      */
2696 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2697 {
2698         struct net *net = sock_net(in_skb->sk);
2699         struct rtmsg *rtm;
2700         struct nlattr *tb[RTA_MAX+1];
2701         struct rtable *rt = NULL;
2702         __be32 dst = 0;
2703         __be32 src = 0;
2704         u32 iif;
2705         int err;
2706         struct sk_buff *skb;
2707
2708         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2709         if (err < 0)
2710                 goto errout;
2711
2712         rtm = nlmsg_data(nlh);
2713
2714         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2715         if (skb == NULL) {
2716                 err = -ENOBUFS;
2717                 goto errout;
2718         }
2719
2720         /* Reserve room for dummy headers, this skb can pass
2721            through good chunk of routing engine.
2722          */
2723         skb_reset_mac_header(skb);
2724         skb_reset_network_header(skb);
2725
2726         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2727         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2728         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2729
             /* Absent attributes are treated as 0. */
2730         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2731         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2732         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2733
2734         if (iif) {
                     /* Input path: act as if a packet had arrived on device iif. */
2735                 struct net_device *dev;
2736
2737                 dev = __dev_get_by_index(net, iif);
2738                 if (dev == NULL) {
2739                         err = -ENODEV;
2740                         goto errout_free;
2741                 }
2742
2743                 skb->protocol   = htons(ETH_P_IP);
2744                 skb->dev        = dev;
                     /* ip_route_input() is called with bottom halves disabled. */
2745                 local_bh_disable();
2746                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2747                 local_bh_enable();
2748
2749                 rt = skb->rtable;
                     /*
                      * The lookup succeeded but the route itself carries an
                      * error: report that error, negated, to the caller.
                      */
2750                 if (err == 0 && rt->u.dst.error)
2751                         err = -rt->u.dst.error;
2752         } else {
                     /* Output path: ordinary output route lookup. */
2753                 struct flowi fl = {
2754                         .nl_u = {
2755                                 .ip4_u = {
2756                                         .daddr = dst,
2757                                         .saddr = src,
2758                                         .tos = rtm->rtm_tos,
2759                                 },
2760                         },
2761                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2762                 };
2763                 err = ip_route_output_key(net, &rt, &fl);
2764         }
2765
2766         if (err)
2767                 goto errout_free;
2768
             /* rt_fill_info() takes the route to report from skb->rtable. */
2769         skb->rtable = rt;
2770         if (rtm->rtm_flags & RTM_F_NOTIFY)
2771                 rt->rt_flags |= RTCF_NOTIFY;
2772
2773         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2774                            RTM_NEWROUTE, 0, 0);
             /* Zero is handled like an error here: free the skb, send nothing. */
2775         if (err <= 0)
2776                 goto errout_free;
2777
2778         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2779 errout:
2780         return err;
2781
2782 errout_free:
2783         kfree_skb(skb);
2784         goto errout;
2785 }
2786
     /*
      * ip_rt_dump - netlink dump callback for the route cache.
      * @skb: buffer the route messages are appended to
      * @cb:  dump state; args[0] holds the hash bucket and args[1] the index
      *       within that bucket's chain at which to resume
      *
      * Every bucket from the resume point on is walked under
      * rcu_read_lock_bh().  Entries of another namespace, entries before the
      * resume index and entries with a stale generation id are skipped; each
      * remaining entry is emitted as an RTM_NEWROUTE message with NLM_F_MULTI.
      * The walk stops as soon as rt_fill_info() returns <= 0, and the current
      * position is stored back into @cb.
      *
      * Returns skb->len.
      */
2787 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2788 {
2789         struct rtable *rt;
2790         int h, s_h;
2791         int idx, s_idx;
2792         struct net *net;
2793
2794         net = sock_net(skb->sk);
2795
2796         s_h = cb->args[0];
2797         if (s_h < 0)
2798                 s_h = 0;
2799         s_idx = idx = cb->args[1];
2800         for (h = s_h; h <= rt_hash_mask; h++) {
2801                 rcu_read_lock_bh();
                     /* idx restarts at 0 for every bucket. */
2802                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2803                      rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2804                         if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
2805                                 continue;
2806                         if (rt->rt_genid != atomic_read(&rt_genid))
2807                                 continue;
                             /*
                              * Attach the entry to the skb for rt_fill_info(),
                              * which reads skb->rtable -- presumably an alias
                              * of skb->dst; confirm in struct sk_buff.  The
                              * clone is released again on both paths below.
                              */
2808                         skb->dst = dst_clone(&rt->u.dst);
2809                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2810                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2811                                          1, NLM_F_MULTI) <= 0) {
2812                                 dst_release(xchg(&skb->dst, NULL));
2813                                 rcu_read_unlock_bh();
2814                                 goto done;
2815                         }
2816                         dst_release(xchg(&skb->dst, NULL));
2817                 }
2818                 rcu_read_unlock_bh();
                     /* The resume index applies to the first bucket only. */
2819                 s_idx = 0;
2820         }
2821
2822 done:
             /* Remember where to continue on the next invocation. */
2823         cb->args[0] = h;
2824         cb->args[1] = idx;
2825         return skb->len;
2826 }
2827
     /*
      * ip_rt_multicast_event - flush the route cache with a delay argument
      * of 0.  @in_dev is not used.
      */
2828 void ip_rt_multicast_event(struct in_device *in_dev)
2829 {
2830         rt_cache_flush(0);
2831 }
2832
2833 #ifdef CONFIG_SYSCTL
     /* Backing store (.data) of the "flush" entry in ipv4_route_table. */
2834 static int flush_delay;
2835
     /*
      * ipv4_sysctl_rtcache_flush - proc handler of the "flush" sysctl entry.
      *
      * On a write the value is parsed by proc_dointvec() and the route cache
      * is then flushed with rt_cache_flush(flush_delay); 0 is returned.  The
      * return value of proc_dointvec() is not checked, so a failed parse still
      * triggers a flush with whatever flush_delay currently holds.
      *
      * Reads are rejected with -EINVAL.
      */
2836 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2837                                         struct file *filp, void __user *buffer,
2838                                         size_t *lenp, loff_t *ppos)
2839 {
2840         if (write) {
2841                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2842                 rt_cache_flush(flush_delay);
2843                 return 0;
2844         }
2845
2846         return -EINVAL;
2847 }
2848
     /*
      * ipv4_sysctl_rtcache_flush_strategy - strategy handler of the "flush"
      * sysctl entry.
      *
      * Reads one int from the user buffer @newval and flushes the route cache
      * with that value as delay.  @table, @name, @nlen, @oldval and @oldlenp
      * are not used.
      *
      * Returns 0 on success, -EINVAL if @newlen is not sizeof(int), -EFAULT if
      * the value cannot be read from user space.
      */
2849 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2850                                                 int __user *name,
2851                                                 int nlen,
2852                                                 void __user *oldval,
2853                                                 size_t __user *oldlenp,
2854                                                 void __user *newval,
2855                                                 size_t newlen)
2856 {
2857         int delay;
2858         if (newlen != sizeof(int))
2859                 return -EINVAL;
2860         if (get_user(delay, (int __user *)newval))
2861                 return -EFAULT;
2862         rt_cache_flush(delay);
2863         return 0;
2864 }
2865
     /*
      * ipv4_route_table - sysctl entries for the IPv4 routing code.
      *
      * "flush" is write-only (mode 0200) and uses the two custom handlers
      * defined in this file.  All other entries are mode 0644 ints handled by
      * proc_dointvec or by one of the *_jiffies handlers (paired with the
      * matching sysctl_*jiffies strategy); the latter presumably convert
      * between user-visible time units and jiffies -- see kernel/sysctl.c.
      * The table is terminated by an entry whose ctl_name is 0.
      */
2866 ctl_table ipv4_route_table[] = {
2867         {
2868                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2869                 .procname       = "flush",
2870                 .data           = &flush_delay,
2871                 .maxlen         = sizeof(int),
2872                 .mode           = 0200,
2873                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2874                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2875         },
2876         {
2877                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2878                 .procname       = "gc_thresh",
2879                 .data           = &ipv4_dst_ops.gc_thresh,
2880                 .maxlen         = sizeof(int),
2881                 .mode           = 0644,
2882                 .proc_handler   = &proc_dointvec,
2883         },
2884         {
2885                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2886                 .procname       = "max_size",
2887                 .data           = &ip_rt_max_size,
2888                 .maxlen         = sizeof(int),
2889                 .mode           = 0644,
2890                 .proc_handler   = &proc_dointvec,
2891         },
2892         {
2893                 /*  Deprecated. Use gc_min_interval_ms */
2894
2895                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2896                 .procname       = "gc_min_interval",
2897                 .data           = &ip_rt_gc_min_interval,
2898                 .maxlen         = sizeof(int),
2899                 .mode           = 0644,
2900                 .proc_handler   = &proc_dointvec_jiffies,
2901                 .strategy       = &sysctl_jiffies,
2902         },
2903         {
                     /*
                      * Same variable as "gc_min_interval" above, accessed
                      * through the ms_jiffies handlers instead.
                      */
2904                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2905                 .procname       = "gc_min_interval_ms",
2906                 .data           = &ip_rt_gc_min_interval,
2907                 .maxlen         = sizeof(int),
2908                 .mode           = 0644,
2909                 .proc_handler   = &proc_dointvec_ms_jiffies,
2910                 .strategy       = &sysctl_ms_jiffies,
2911         },
2912         {
2913                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2914                 .procname       = "gc_timeout",
2915                 .data           = &ip_rt_gc_timeout,
2916                 .maxlen         = sizeof(int),
2917                 .mode           = 0644,
2918                 .proc_handler   = &proc_dointvec_jiffies,
2919                 .strategy       = &sysctl_jiffies,
2920         },
2921         {
2922                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2923                 .procname       = "gc_interval",
2924                 .data           = &ip_rt_gc_interval,
2925                 .maxlen         = sizeof(int),
2926                 .mode           = 0644,
2927                 .proc_handler   = &proc_dointvec_jiffies,
2928                 .strategy       = &sysctl_jiffies,
2929         },
2930         {
2931                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2932                 .procname       = "redirect_load",
2933                 .data           = &ip_rt_redirect_load,
2934                 .maxlen         = sizeof(int),
2935                 .mode           = 0644,
2936                 .proc_handler   = &proc_dointvec,
2937         },
2938         {
2939                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2940                 .procname       = "redirect_number",
2941                 .data           = &ip_rt_redirect_number,
2942                 .maxlen         = sizeof(int),
2943                 .mode           = 0644,
2944                 .proc_handler   = &proc_dointvec,
2945         },
2946         {
2947                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2948                 .procname       = "redirect_silence",
2949                 .data           = &ip_rt_redirect_silence,
2950                 .maxlen         = sizeof(int),
2951                 .mode           = 0644,
2952                 .proc_handler   = &proc_dointvec,
2953         },
2954         {
2955                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2956                 .procname       = "error_cost",
2957                 .data           = &ip_rt_error_cost,
2958                 .maxlen         = sizeof(int),
2959                 .mode           = 0644,
2960                 .proc_handler   = &proc_dointvec,
2961         },
2962         {
2963                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
2964                 .procname       = "error_burst",
2965                 .data           = &ip_rt_error_burst,
2966                 .maxlen         = sizeof(int),
2967                 .mode           = 0644,
2968                 .proc_handler   = &proc_dointvec,
2969         },
2970         {
2971                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
2972                 .procname       = "gc_elasticity",
2973                 .data           = &ip_rt_gc_elasticity,
2974                 .maxlen         = sizeof(int),
2975                 .mode           = 0644,
2976                 .proc_handler   = &proc_dointvec,
2977         },
2978         {
2979                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
2980                 .procname       = "mtu_expires",
2981                 .data           = &ip_rt_mtu_expires,
2982                 .maxlen         = sizeof(int),
2983                 .mode           = 0644,
2984                 .proc_handler   = &proc_dointvec_jiffies,
2985                 .strategy       = &sysctl_jiffies,
2986         },
2987         {
2988                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
2989                 .procname       = "min_pmtu",
2990                 .data           = &ip_rt_min_pmtu,
2991                 .maxlen         = sizeof(int),
2992                 .mode           = 0644,
2993                 .proc_handler   = &proc_dointvec,
2994         },
2995         {
2996                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
2997                 .procname       = "min_adv_mss",
2998                 .data           = &ip_rt_min_advmss,
2999                 .maxlen         = sizeof(int),
3000                 .mode           = 0644,
3001                 .proc_handler   = &proc_dointvec,
3002         },
3003         {
3004                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
3005                 .procname       = "secret_interval",
3006                 .data           = &ip_rt_secret_interval,
3007                 .maxlen         = sizeof(int),
3008                 .mode           = 0644,
3009                 .proc_handler   = &proc_dointvec_jiffies,
3010                 .strategy       = &sysctl_jiffies,
3011         },
             /* Terminating entry. */
3012         { .ctl_name = 0 }
3013 };
3014 #endif
3015
     /*
      * Per-cpu accounting area, present only with CONFIG_NET_CLS_ROUTE.  It is
      * allocated in ip_rt_init() with room for 256 struct ip_rt_acct.
      */
3016 #ifdef CONFIG_NET_CLS_ROUTE
3017 struct ip_rt_acct *ip_rt_acct __read_mostly;
3018 #endif /* CONFIG_NET_CLS_ROUTE */
3019
     /*
      * Value of the "rhash_entries=" boot parameter; ip_rt_init() passes it to
      * alloc_large_system_hash() when sizing the route cache hash table.
      */
3020 static __initdata unsigned long rhash_entries;
     /*
      * Parser for "rhash_entries=".  The number is read with simple_strtoul()
      * using base 0; trailing characters are not checked.  Returns 0 when
      * @str is NULL and 1 otherwise.
      */
3021 static int __init set_rhash_entries(char *str)
3022 {
3023         if (!str)
3024                 return 0;
3025         rhash_entries = simple_strtoul(str, &str, 0);
3026         return 1;
3027 }
3028 __setup("rhash_entries=", set_rhash_entries);
3029
     /*
      * ip_rt_init - boot-time initialisation of the IPv4 routing code.
      *
      * In order: seeds rt_genid, allocates the per-cpu accounting area (with
      * CONFIG_NET_CLS_ROUTE), creates the dst slab cache and shares it with
      * the blackhole ops, allocates and clears the route cache hash table,
      * derives gc_thresh and ip_rt_max_size from the table size, initialises
      * devinet and the FIB, starts the periodic expiry work and the secret
      * rebuild timer, creates the proc files, initialises xfrm (with
      * CONFIG_XFRM) and registers the RTM_GETROUTE handler.
      *
      * Returns rc, which is never changed from 0.  A failure of
      * __alloc_percpu() panics, and the slab cache is created with SLAB_PANIC.
      */
3030 int __init ip_rt_init(void)
3031 {
3032         int rc = 0;
3033
             /* Initial generation id mixed from the page count and jiffies. */
3034         atomic_set(&rt_genid, (int) ((num_physpages ^ (num_physpages>>8)) ^
3035                              (jiffies ^ (jiffies >> 7))));
3036
3037 #ifdef CONFIG_NET_CLS_ROUTE
3038         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
3039         if (!ip_rt_acct)
3040                 panic("IP: failed to allocate ip_rt_acct\n");
3041 #endif
3042
3043         ipv4_dst_ops.kmem_cachep =
3044                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3045                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3046
             /* Blackhole entries are allocated from the same slab cache. */
3047         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3048
             /*
              * Size is requested through rhash_entries (boot parameter, 0 when
              * not given); the scale argument is chosen from the number of
              * physical pages.  rt_hash_log and rt_hash_mask are filled in by
              * the allocator.
              */
3049         rt_hash_table = (struct rt_hash_bucket *)
3050                 alloc_large_system_hash("IP route cache",
3051                                         sizeof(struct rt_hash_bucket),
3052                                         rhash_entries,
3053                                         (num_physpages >= 128 * 1024) ?
3054                                         15 : 17,
3055                                         0,
3056                                         &rt_hash_log,
3057                                         &rt_hash_mask,
3058                                         0);
3059         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3060         rt_hash_lock_init();
3061
             /* Defaults: gc threshold = bucket count, max size = 16 per bucket. */
3062         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3063         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3064
3065         devinet_init();
3066         ip_fib_init();
3067
3068         rt_secret_timer.function = rt_secret_rebuild;
3069         rt_secret_timer.data = 0;
3070         init_timer_deferrable(&rt_secret_timer);
3071
3072         /* All the timers, started at system startup tend
3073            to synchronize. Perturb it a bit.
3074          */
             /* First run after a random delay in [interval, 2 * interval). */
3075         schedule_delayed_work(&expires_work,
3076                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3077
3078         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3079                 ip_rt_secret_interval;
3080         add_timer(&rt_secret_timer);
3081
             /* Failure to create the proc files is only logged. */
3082         if (ip_rt_proc_init())
3083                 printk(KERN_ERR "Unable to create route proc files\n");
3084 #ifdef CONFIG_XFRM
3085         xfrm_init();
3086         xfrm4_init();
3087 #endif
3088         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3089
3090         return rc;
3091 }
3092
3093 EXPORT_SYMBOL(__ip_select_ident);
3094 EXPORT_SYMBOL(ip_route_input);
3095 EXPORT_SYMBOL(ip_route_output_key);