net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #include <linux/module.h>
  66 #include <asm/uaccess.h>
  67 #include <asm/system.h>
  68 #include <linux/bitops.h>
  69 #include <linux/types.h>
  70 #include <linux/kernel.h>
  71 #include <linux/mm.h>
  72 #include <linux/bootmem.h>
  73 #include <linux/string.h>
  74 #include <linux/socket.h>
  75 #include <linux/sockios.h>
  76 #include <linux/errno.h>
  77 #include <linux/in.h>
  78 #include <linux/inet.h>
  79 #include <linux/netdevice.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/init.h>
  82 #include <linux/workqueue.h>
  83 #include <linux/skbuff.h>
  84 #include <linux/inetdevice.h>
  85 #include <linux/igmp.h>
  86 #include <linux/pkt_sched.h>
  87 #include <linux/mroute.h>
  88 #include <linux/netfilter_ipv4.h>
  89 #include <linux/random.h>
  90 #include <linux/jhash.h>
  91 #include <linux/rcupdate.h>
  92 #include <linux/times.h>
  93 #include <linux/slab.h>
  94 #include <net/dst.h>
  95 #include <net/net_namespace.h>
  96 #include <net/protocol.h>
  97 #include <net/ip.h>
  98 #include <net/route.h>
  99 #include <net/inetpeer.h>
 100 #include <net/sock.h>
 101 #include <net/ip_fib.h>
 102 #include <net/arp.h>
 103 #include <net/tcp.h>
 104 #include <net/icmp.h>
 105 #include <net/xfrm.h>
 106 #include <net/netevent.h>
 107 #include <net/rtnetlink.h>
 108 #ifdef CONFIG_SYSCTL
 109 #include <linux/sysctl.h>
 110 #endif
 111 #include <net/atmclip.h>
 112 #include <net/secure_seq.h>
 113
 114 #define RT_FL_TOS(oldflp4) \
 115         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 116
 117 #define IP_MAX_MTU      0xFFF0
 118
 119 #define RT_GC_TIMEOUT (300*HZ)
 120
 121 static int ip_rt_max_size;
 122 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 123 static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
 124 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
 125 static int ip_rt_redirect_number __read_mostly  = 9;
 126 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 127 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 128 static int ip_rt_error_cost __read_mostly       = HZ;
 129 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 130 static int ip_rt_gc_elasticity __read_mostly    = 8;
 131 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 132 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 133 static int ip_rt_min_advmss __read_mostly       = 256;
 134 static int rt_chain_length_max __read_mostly    = 20;
 135 static int redirect_genid;
 136
 137 /*
 138  *      Interface to generic destination cache.
 139  */
 140
 141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 142 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 143 static unsigned int      ipv4_default_mtu(const struct dst_entry *dst);
 144 static void              ipv4_dst_destroy(struct dst_entry *dst);
 145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 146 static void              ipv4_link_failure(struct sk_buff *skb);
 147 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 148 static int rt_garbage_collect(struct dst_ops *ops);
 149
 150 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 151                             int how)
 152 {
 153 }
 154
 155 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 156 {
 157         struct rtable *rt = (struct rtable *) dst;
 158         struct inet_peer *peer;
 159         u32 *p = NULL;
 160
 161         if (!rt->peer)
 162                 rt_bind_peer(rt, rt->rt_dst, 1);
 163
 164         peer = rt->peer;
 165         if (peer) {
 166                 u32 *old_p = __DST_METRICS_PTR(old);
 167                 unsigned long prev, new;
 168
 169                 p = peer->metrics;
 170                 if (inet_metrics_new(peer))
 171                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
 172
 173                 new = (unsigned long) p;
 174                 prev = cmpxchg(&dst->_metrics, old, new);
 175
 176                 if (prev != old) {
 177                         p = __DST_METRICS_PTR(prev);
 178                         if (prev & DST_METRICS_READ_ONLY)
 179                                 p = NULL;
 180                 } else {
 181                         if (rt->fi) {
 182                                 fib_info_put(rt->fi);
 183                                 rt->fi = NULL;
 184                         }
 185                 }
 186         }
 187         return p;
 188 }
 189
 190 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
 191
 192 static struct dst_ops ipv4_dst_ops = {
 193         .family =               AF_INET,
 194         .protocol =             cpu_to_be16(ETH_P_IP),
 195         .gc =                   rt_garbage_collect,
 196         .check =                ipv4_dst_check,
 197         .default_advmss =       ipv4_default_advmss,
 198         .default_mtu =          ipv4_default_mtu,
 199         .cow_metrics =          ipv4_cow_metrics,
 200         .destroy =              ipv4_dst_destroy,
 201         .ifdown =               ipv4_dst_ifdown,
 202         .negative_advice =      ipv4_negative_advice,
 203         .link_failure =         ipv4_link_failure,
 204         .update_pmtu =          ip_rt_update_pmtu,
 205         .local_out =            __ip_local_out,
 206         .neigh_lookup =         ipv4_neigh_lookup,
 207 };
 208
 209 #define ECN_OR_COST(class)      TC_PRIO_##class
 210
 211 const __u8 ip_tos2prio[16] = {
 212         TC_PRIO_BESTEFFORT,
 213         ECN_OR_COST(BESTEFFORT),
 214         TC_PRIO_BESTEFFORT,
 215         ECN_OR_COST(BESTEFFORT),
 216         TC_PRIO_BULK,
 217         ECN_OR_COST(BULK),
 218         TC_PRIO_BULK,
 219         ECN_OR_COST(BULK),
 220         TC_PRIO_INTERACTIVE,
 221         ECN_OR_COST(INTERACTIVE),
 222         TC_PRIO_INTERACTIVE,
 223         ECN_OR_COST(INTERACTIVE),
 224         TC_PRIO_INTERACTIVE_BULK,
 225         ECN_OR_COST(INTERACTIVE_BULK),
 226         TC_PRIO_INTERACTIVE_BULK,
 227         ECN_OR_COST(INTERACTIVE_BULK)
 228 };
 229
 230
 231 /*
 232  * Route cache.
 233  */
 234
 235 /* The locking scheme is rather straight forward:
 236  *
 237  * 1) Read-Copy Update protects the buckets of the central route hash.
 238  * 2) Only writers remove entries, and they hold the lock
 239  *    as they look at rtable reference counts.
 240  * 3) Only readers acquire references to rtable entries,
 241  *    they do so with atomic increments and with the
 242  *    lock held.
 243  */
 244
 245 struct rt_hash_bucket {
 246         struct rtable __rcu     *chain;
 247 };
 248
 249 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 250         defined(CONFIG_PROVE_LOCKING)
 251 /*
 252  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 253  * The size of this table is a power of two and depends on the number of CPUS.
 254  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 255  */
 256 #ifdef CONFIG_LOCKDEP
 257 # define RT_HASH_LOCK_SZ        256
 258 #else
 259 # if NR_CPUS >= 32
 260 #  define RT_HASH_LOCK_SZ       4096
 261 # elif NR_CPUS >= 16
 262 #  define RT_HASH_LOCK_SZ       2048
 263 # elif NR_CPUS >= 8
 264 #  define RT_HASH_LOCK_SZ       1024
 265 # elif NR_CPUS >= 4
 266 #  define RT_HASH_LOCK_SZ       512
 267 # else
 268 #  define RT_HASH_LOCK_SZ       256
 269 # endif
 270 #endif
 271
 272 static spinlock_t       *rt_hash_locks;
 273 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 274
 275 static __init void rt_hash_lock_init(void)
 276 {
 277         int i;
 278
 279         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
 280                         GFP_KERNEL);
 281         if (!rt_hash_locks)
 282                 panic("IP: failed to allocate rt_hash_locks\n");
 283
 284         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
 285                 spin_lock_init(&rt_hash_locks[i]);
 286 }
 287 #else
 288 # define rt_hash_lock_addr(slot) NULL
 289
 290 static inline void rt_hash_lock_init(void)
 291 {
 292 }
 293 #endif
 294
 295 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
 296 static unsigned                 rt_hash_mask __read_mostly;
 297 static unsigned int             rt_hash_log  __read_mostly;
 298
 299 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 300 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 301
 302 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
 303                                    int genid)
 304 {
 305         return jhash_3words((__force u32)daddr, (__force u32)saddr,
 306                             idx, genid)
 307                 & rt_hash_mask;
 308 }
 309
 310 static inline int rt_genid(struct net *net)
 311 {
 312         return atomic_read(&net->ipv4.rt_genid);
 313 }
 314
 315 #ifdef CONFIG_PROC_FS
 316 struct rt_cache_iter_state {
 317         struct seq_net_private p;
 318         int bucket;
 319         int genid;
 320 };
 321
 322 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 323 {
 324         struct rt_cache_iter_state *st = seq->private;
 325         struct rtable *r = NULL;
 326
 327         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 328                 if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
 329                         continue;
 330                 rcu_read_lock_bh();
 331                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 332                 while (r) {
 333                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
 334                             r->rt_genid == st->genid)
 335                                 return r;
 336                         r = rcu_dereference_bh(r->dst.rt_next);
 337                 }
 338                 rcu_read_unlock_bh();
 339         }
 340         return r;
 341 }
 342
 343 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 344                                           struct rtable *r)
 345 {
 346         struct rt_cache_iter_state *st = seq->private;
 347
 348         r = rcu_dereference_bh(r->dst.rt_next);
 349         while (!r) {
 350                 rcu_read_unlock_bh();
 351                 do {
 352                         if (--st->bucket < 0)
 353                                 return NULL;
 354                 } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
 355                 rcu_read_lock_bh();
 356                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 357         }
 358         return r;
 359 }
 360
 361 static struct rtable *rt_cache_get_next(struct seq_file *seq,
 362                                         struct rtable *r)
 363 {
 364         struct rt_cache_iter_state *st = seq->private;
 365         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 366                 if (dev_net(r->dst.dev) != seq_file_net(seq))
 367                         continue;
 368                 if (r->rt_genid == st->genid)
 369                         break;
 370         }
 371         return r;
 372 }
 373
 374 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 375 {
 376         struct rtable *r = rt_cache_get_first(seq);
 377
 378         if (r)
 379                 while (pos && (r = rt_cache_get_next(seq, r)))
 380                         --pos;
 381         return pos ? NULL : r;
 382 }
 383
 384 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 385 {
 386         struct rt_cache_iter_state *st = seq->private;
 387         if (*pos)
 388                 return rt_cache_get_idx(seq, *pos - 1);
 389         st->genid = rt_genid(seq_file_net(seq));
 390         return SEQ_START_TOKEN;
 391 }
 392
 393 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 394 {
 395         struct rtable *r;
 396
 397         if (v == SEQ_START_TOKEN)
 398                 r = rt_cache_get_first(seq);
 399         else
 400                 r = rt_cache_get_next(seq, v);
 401         ++*pos;
 402         return r;
 403 }
 404
 405 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 406 {
 407         if (v && v != SEQ_START_TOKEN)
 408                 rcu_read_unlock_bh();
 409 }
 410
 411 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 412 {
 413         if (v == SEQ_START_TOKEN)
 414                 seq_printf(seq, "%-127s\n",
 415                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 416                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 417                            "HHUptod\tSpecDst");
 418         else {
 419                 struct rtable *r = v;
 420                 struct neighbour *n;
 421                 int len, HHUptod;
 422
 423                 rcu_read_lock();
 424                 n = dst_get_neighbour(&r->dst);
 425                 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
 426                 rcu_read_unlock();
 427
 428                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
 429                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 430                         r->dst.dev ? r->dst.dev->name : "*",
 431                         (__force u32)r->rt_dst,
 432                         (__force u32)r->rt_gateway,
 433                         r->rt_flags, atomic_read(&r->dst.__refcnt),
 434                         r->dst.__use, 0, (__force u32)r->rt_src,
 435                         dst_metric_advmss(&r->dst) + 40,
 436                         dst_metric(&r->dst, RTAX_WINDOW),
 437                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
 438                               dst_metric(&r->dst, RTAX_RTTVAR)),
 439                         r->rt_key_tos,
 440                         -1,
 441                         HHUptod,
 442                         r->rt_spec_dst, &len);
 443
 444                 seq_printf(seq, "%*s\n", 127 - len, "");
 445         }
 446         return 0;
 447 }
 448
 449 static const struct seq_operations rt_cache_seq_ops = {
 450         .start  = rt_cache_seq_start,
 451         .next   = rt_cache_seq_next,
 452         .stop   = rt_cache_seq_stop,
 453         .show   = rt_cache_seq_show,
 454 };
 455
 456 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 457 {
 458         return seq_open_net(inode, file, &rt_cache_seq_ops,
 459                         sizeof(struct rt_cache_iter_state));
 460 }
 461
 462 static const struct file_operations rt_cache_seq_fops = {
 463         .owner   = THIS_MODULE,
 464         .open    = rt_cache_seq_open,
 465         .read    = seq_read,
 466         .llseek  = seq_lseek,
 467         .release = seq_release_net,
 468 };
 469
 470
 471 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 472 {
 473         int cpu;
 474
 475         if (*pos == 0)
 476                 return SEQ_START_TOKEN;
 477
 478         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 479                 if (!cpu_possible(cpu))
 480                         continue;
 481                 *pos = cpu+1;
 482                 return &per_cpu(rt_cache_stat, cpu);
 483         }
 484         return NULL;
 485 }
 486
 487 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 488 {
 489         int cpu;
 490
 491         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 492                 if (!cpu_possible(cpu))
 493                         continue;
 494                 *pos = cpu+1;
 495                 return &per_cpu(rt_cache_stat, cpu);
 496         }
 497         return NULL;
 498
 499 }
 500
 501 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 502 {
 503
 504 }
 505
 506 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 507 {
 508         struct rt_cache_stat *st = v;
 509
 510         if (v == SEQ_START_TOKEN) {
 511                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 512                 return 0;
 513         }
 514
 515         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 516                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 517                    dst_entries_get_slow(&ipv4_dst_ops),
 518                    st->in_hit,
 519                    st->in_slow_tot,
 520                    st->in_slow_mc,
 521                    st->in_no_route,
 522                    st->in_brd,
 523                    st->in_martian_dst,
 524                    st->in_martian_src,
 525
 526                    st->out_hit,
 527                    st->out_slow_tot,
 528                    st->out_slow_mc,
 529
 530                    st->gc_total,
 531                    st->gc_ignored,
 532                    st->gc_goal_miss,
 533                    st->gc_dst_overflow,
 534                    st->in_hlist_search,
 535                    st->out_hlist_search
 536                 );
 537         return 0;
 538 }
 539
 540 static const struct seq_operations rt_cpu_seq_ops = {
 541         .start  = rt_cpu_seq_start,
 542         .next   = rt_cpu_seq_next,
 543         .stop   = rt_cpu_seq_stop,
 544         .show   = rt_cpu_seq_show,
 545 };
 546
 547
 548 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 549 {
 550         return seq_open(file, &rt_cpu_seq_ops);
 551 }
 552
 553 static const struct file_operations rt_cpu_seq_fops = {
 554         .owner   = THIS_MODULE,
 555         .open    = rt_cpu_seq_open,
 556         .read    = seq_read,
 557         .llseek  = seq_lseek,
 558         .release = seq_release,
 559 };
 560
 561 #ifdef CONFIG_IP_ROUTE_CLASSID
 562 static int rt_acct_proc_show(struct seq_file *m, void *v)
 563 {
 564         struct ip_rt_acct *dst, *src;
 565         unsigned int i, j;
 566
 567         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 568         if (!dst)
 569                 return -ENOMEM;
 570
 571         for_each_possible_cpu(i) {
 572                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 573                 for (j = 0; j < 256; j++) {
 574                         dst[j].o_bytes   += src[j].o_bytes;
 575                         dst[j].o_packets += src[j].o_packets;
 576                         dst[j].i_bytes   += src[j].i_bytes;
 577                         dst[j].i_packets += src[j].i_packets;
 578                 }
 579         }
 580
 581         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 582         kfree(dst);
 583         return 0;
 584 }
 585
 586 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 587 {
 588         return single_open(file, rt_acct_proc_show, NULL);
 589 }
 590
 591 static const struct file_operations rt_acct_proc_fops = {
 592         .owner          = THIS_MODULE,
 593         .open           = rt_acct_proc_open,
 594         .read           = seq_read,
 595         .llseek         = seq_lseek,
 596         .release        = single_release,
 597 };
 598 #endif
 599
 600 static int __net_init ip_rt_do_proc_init(struct net *net)
 601 {
 602         struct proc_dir_entry *pde;
 603
 604         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 605                         &rt_cache_seq_fops);
 606         if (!pde)
 607                 goto err1;
 608
 609         pde = proc_create("rt_cache", S_IRUGO,
 610                           net->proc_net_stat, &rt_cpu_seq_fops);
 611         if (!pde)
 612                 goto err2;
 613
 614 #ifdef CONFIG_IP_ROUTE_CLASSID
 615         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 616         if (!pde)
 617                 goto err3;
 618 #endif
 619         return 0;
 620
 621 #ifdef CONFIG_IP_ROUTE_CLASSID
 622 err3:
 623         remove_proc_entry("rt_cache", net->proc_net_stat);
 624 #endif
 625 err2:
 626         remove_proc_entry("rt_cache", net->proc_net);
 627 err1:
 628         return -ENOMEM;
 629 }
 630
 631 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 632 {
 633         remove_proc_entry("rt_cache", net->proc_net_stat);
 634         remove_proc_entry("rt_cache", net->proc_net);
 635 #ifdef CONFIG_IP_ROUTE_CLASSID
 636         remove_proc_entry("rt_acct", net->proc_net);
 637 #endif
 638 }
 639
 640 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 641         .init = ip_rt_do_proc_init,
 642         .exit = ip_rt_do_proc_exit,
 643 };
 644
 645 static int __init ip_rt_proc_init(void)
 646 {
 647         return register_pernet_subsys(&ip_rt_proc_ops);
 648 }
 649
 650 #else
 651 static inline int ip_rt_proc_init(void)
 652 {
 653         return 0;
 654 }
 655 #endif /* CONFIG_PROC_FS */
 656
 657 static inline void rt_free(struct rtable *rt)
 658 {
 659         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 660 }
 661
 662 static inline void rt_drop(struct rtable *rt)
 663 {
 664         ip_rt_put(rt);
 665         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 666 }
 667
 668 static inline int rt_fast_clean(struct rtable *rth)
 669 {
 670         /* Kill broadcast/multicast entries very aggresively, if they
 671            collide in hash table with more useful entries */
 672         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 673                 rt_is_input_route(rth) && rth->dst.rt_next;
 674 }
 675
 676 static inline int rt_valuable(struct rtable *rth)
 677 {
 678         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 679                 (rth->peer && rth->peer->pmtu_expires);
 680 }
 681
 682 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 683 {
 684         unsigned long age;
 685         int ret = 0;
 686
 687         if (atomic_read(&rth->dst.__refcnt))
 688                 goto out;
 689
 690         age = jiffies - rth->dst.lastuse;
 691         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 692             (age <= tmo2 && rt_valuable(rth)))
 693                 goto out;
 694         ret = 1;
 695 out:    return ret;
 696 }
 697
 698 /* Bits of score are:
 699  * 31: very valuable
 700  * 30: not quite useless
 701  * 29..0: usage counter
 702  */
 703 static inline u32 rt_score(struct rtable *rt)
 704 {
 705         u32 score = jiffies - rt->dst.lastuse;
 706
 707         score = ~score & ~(3<<30);
 708
 709         if (rt_valuable(rt))
 710                 score |= (1<<31);
 711
 712         if (rt_is_output_route(rt) ||
 713             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 714                 score |= (1<<30);
 715
 716         return score;
 717 }
 718
 719 static inline bool rt_caching(const struct net *net)
 720 {
 721         return net->ipv4.current_rt_cache_rebuild_count <=
 722                 net->ipv4.sysctl_rt_cache_rebuild_count;
 723 }
 724
 725 static inline bool compare_hash_inputs(const struct rtable *rt1,
 726                                        const struct rtable *rt2)
 727 {
 728         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 729                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 730                 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
 731 }
 732
 733 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
 734 {
 735         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 736                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 737                 (rt1->rt_mark ^ rt2->rt_mark) |
 738                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
 739                 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
 740                 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
 741 }
 742
 743 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 744 {
 745         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
 746 }
 747
 748 static inline int rt_is_expired(struct rtable *rth)
 749 {
 750         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 751 }
 752
 753 /*
 754  * Perform a full scan of hash table and free all entries.
 755  * Can be called by a softirq or a process.
 756  * In the later case, we want to be reschedule if necessary
 757  */
 758 static void rt_do_flush(struct net *net, int process_context)
 759 {
 760         unsigned int i;
 761         struct rtable *rth, *next;
 762
 763         for (i = 0; i <= rt_hash_mask; i++) {
 764                 struct rtable __rcu **pprev;
 765                 struct rtable *list;
 766
 767                 if (process_context && need_resched())
 768                         cond_resched();
 769                 rth = rcu_dereference_raw(rt_hash_table[i].chain);
 770                 if (!rth)
 771                         continue;
 772
 773                 spin_lock_bh(rt_hash_lock_addr(i));
 774
 775                 list = NULL;
 776                 pprev = &rt_hash_table[i].chain;
 777                 rth = rcu_dereference_protected(*pprev,
 778                         lockdep_is_held(rt_hash_lock_addr(i)));
 779
 780                 while (rth) {
 781                         next = rcu_dereference_protected(rth->dst.rt_next,
 782                                 lockdep_is_held(rt_hash_lock_addr(i)));
 783
 784                         if (!net ||
 785                             net_eq(dev_net(rth->dst.dev), net)) {
 786                                 rcu_assign_pointer(*pprev, next);
 787                                 rcu_assign_pointer(rth->dst.rt_next, list);
 788                                 list = rth;
 789                         } else {
 790                                 pprev = &rth->dst.rt_next;
 791                         }
 792                         rth = next;
 793                 }
 794
 795                 spin_unlock_bh(rt_hash_lock_addr(i));
 796
 797                 for (; list; list = next) {
 798                         next = rcu_dereference_protected(list->dst.rt_next, 1);
 799                         rt_free(list);
 800                 }
 801         }
 802 }
 803
 804 /*
 805  * While freeing expired entries, we compute average chain length
 806  * and standard deviation, using fixed-point arithmetic.
 807  * This to have an estimation of rt_chain_length_max
 808  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
 809  * We use 3 bits for frational part, and 29 (or 61) for magnitude.
 810  */
 811
 812 #define FRACT_BITS 3
 813 #define ONE (1UL << FRACT_BITS)
 814
 815 /*
 816  * Given a hash chain and an item in this hash chain,
 817  * find if a previous entry has the same hash_inputs
 818  * (but differs on tos, mark or oif)
 819  * Returns 0 if an alias is found.
 820  * Returns ONE if rth has no alias before itself.
 821  */
 822 static int has_noalias(const struct rtable *head, const struct rtable *rth)
 823 {
 824         const struct rtable *aux = head;
 825
 826         while (aux != rth) {
 827                 if (compare_hash_inputs(aux, rth))
 828                         return 0;
 829                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
 830         }
 831         return ONE;
 832 }
 833
 834 /*
 835  * Perturbation of rt_genid by a small quantity [1..256]
 836  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
 837  * many times (2^24) without giving recent rt_genid.
 838  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
 839  */
 840 static void rt_cache_invalidate(struct net *net)
 841 {
 842         unsigned char shuffle;
 843
 844         get_random_bytes(&shuffle, sizeof(shuffle));
 845         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 846         redirect_genid++;
 847 }
 848
 849 /*
 850  * delay < 0  : invalidate cache (fast : entries will be deleted later)
 851  * delay >= 0 : invalidate & flush cache (can be long)
 852  */
 853 void rt_cache_flush(struct net *net, int delay)
 854 {
 855         rt_cache_invalidate(net);
 856         if (delay >= 0)
 857                 rt_do_flush(net, !in_softirq());
 858 }
 859
 860 /* Flush previous cache invalidated entries from the cache */
 861 void rt_cache_flush_batch(struct net *net)
 862 {
 863         rt_do_flush(net, !in_softirq());
 864 }
 865
 866 static void rt_emergency_hash_rebuild(struct net *net)
 867 {
 868         if (net_ratelimit())
 869                 printk(KERN_WARNING "Route hash chain too long!\n");
 870         rt_cache_invalidate(net);
 871 }
 872
 873 /*
 874    Short description of GC goals.
 875
 876    We want to build algorithm, which will keep routing cache
 877    at some equilibrium point, when number of aged off entries
 878    is kept approximately equal to newly generated ones.
 879
 880    Current expiration strength is variable "expire".
 881    We try to adjust it dynamically, so that if networking
 882    is idle expires is large enough to keep enough of warm entries,
 883    and when load increases it reduces to limit cache size.
 884  */
 885
 886 static int rt_garbage_collect(struct dst_ops *ops)
 887 {
 888         static unsigned long expire = RT_GC_TIMEOUT;
 889         static unsigned long last_gc;
 890         static int rover;
 891         static int equilibrium;
 892         struct rtable *rth;
 893         struct rtable __rcu **rthp;
 894         unsigned long now = jiffies;
 895         int goal;
 896         int entries = dst_entries_get_fast(&ipv4_dst_ops);
 897
 898         /*
 899          * Garbage collection is pretty expensive,
 900          * do not make it too frequently.
 901          */
 902
 903         RT_CACHE_STAT_INC(gc_total);
 904
 905         if (now - last_gc < ip_rt_gc_min_interval &&
 906             entries < ip_rt_max_size) {
 907                 RT_CACHE_STAT_INC(gc_ignored);
 908                 goto out;
 909         }
 910
 911         entries = dst_entries_get_slow(&ipv4_dst_ops);
 912         /* Calculate number of entries, which we want to expire now. */
 913         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
 914         if (goal <= 0) {
 915                 if (equilibrium < ipv4_dst_ops.gc_thresh)
 916                         equilibrium = ipv4_dst_ops.gc_thresh;
 917                 goal = entries - equilibrium;
 918                 if (goal > 0) {
 919                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 920                         goal = entries - equilibrium;
 921                 }
 922         } else {
 923                 /* We are in dangerous area. Try to reduce cache really
 924                  * aggressively.
 925                  */
 926                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 927                 equilibrium = entries - goal;
 928         }
 929
 930         if (now - last_gc >= ip_rt_gc_min_interval)
 931                 last_gc = now;
 932
 933         if (goal <= 0) {
 934                 equilibrium += goal;
 935                 goto work_done;
 936         }
 937
 938         do {
 939                 int i, k;
 940
 941                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
 942                         unsigned long tmo = expire;
 943
 944                         k = (k + 1) & rt_hash_mask;
 945                         rthp = &rt_hash_table[k].chain;
 946                         spin_lock_bh(rt_hash_lock_addr(k));
 947                         while ((rth = rcu_dereference_protected(*rthp,
 948                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
 949                                 if (!rt_is_expired(rth) &&
 950                                         !rt_may_expire(rth, tmo, expire)) {
 951                                         tmo >>= 1;
 952                                         rthp = &rth->dst.rt_next;
 953                                         continue;
 954                                 }
 955                                 *rthp = rth->dst.rt_next;
 956                                 rt_free(rth);
 957                                 goal--;
 958                         }
 959                         spin_unlock_bh(rt_hash_lock_addr(k));
 960                         if (goal <= 0)
 961                                 break;
 962                 }
 963                 rover = k;
 964
 965                 if (goal <= 0)
 966                         goto work_done;
 967
 968                 /* Goal is not achieved. We stop process if:
 969
 970                    - if expire reduced to zero. Otherwise, expire is halfed.
 971                    - if table is not full.
 972                    - if we are called from interrupt.
 973                    - jiffies check is just fallback/debug loop breaker.
 974                      We will not spin here for long time in any case.
 975                  */
 976
 977                 RT_CACHE_STAT_INC(gc_goal_miss);
 978
 979                 if (expire == 0)
 980                         break;
 981
 982                 expire >>= 1;
 983
 984                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
 985                         goto out;
 986         } while (!in_softirq() && time_before_eq(jiffies, now));
 987
 988         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
 989                 goto out;
 990         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
 991                 goto out;
 992         if (net_ratelimit())
 993                 printk(KERN_WARNING "dst cache overflow\n");
 994         RT_CACHE_STAT_INC(gc_dst_overflow);
 995         return 1;
 996
 997 work_done:
 998         expire += ip_rt_gc_min_interval;
 999         if (expire > ip_rt_gc_timeout ||
1000             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1001             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1002                 expire = ip_rt_gc_timeout;
1003 out:    return 0;
1004 }
1005
1006 /*
1007  * Returns number of entries in a hash chain that have different hash_inputs
1008  */
1009 static int slow_chain_length(const struct rtable *head)
1010 {
1011         int length = 0;
1012         const struct rtable *rth = head;
1013
1014         while (rth) {
1015                 length += has_noalias(head, rth);
1016                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1017         }
1018         return length >> FRACT_BITS;
1019 }
1020
1021 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1022 {
1023         struct neigh_table *tbl = &arp_tbl;
1024         static const __be32 inaddr_any = 0;
1025         struct net_device *dev = dst->dev;
1026         const __be32 *pkey = daddr;
1027         struct neighbour *n;
1028
1029 #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
1030         if (dev->type == ARPHRD_ATM)
1031                 tbl = clip_tbl_hook;
1032 #endif
1033         if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1034                 pkey = &inaddr_any;
1035
1036         n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey);
1037         if (n)
1038                 return n;
1039         return neigh_create(tbl, pkey, dev);
1040 }
1041
1042 static int rt_bind_neighbour(struct rtable *rt)
1043 {
1044         struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1045         if (IS_ERR(n))
1046                 return PTR_ERR(n);
1047         dst_set_neighbour(&rt->dst, n);
1048
1049         return 0;
1050 }
1051
1052 static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1053                                      struct sk_buff *skb, int ifindex)
1054 {
1055         struct rtable   *rth, *cand;
1056         struct rtable __rcu **rthp, **candp;
1057         unsigned long   now;
1058         u32             min_score;
1059         int             chain_length;
1060         int attempts = !in_softirq();
1061
1062 restart:
1063         chain_length = 0;
1064         min_score = ~(u32)0;
1065         cand = NULL;
1066         candp = NULL;
1067         now = jiffies;
1068
1069         if (!rt_caching(dev_net(rt->dst.dev))) {
1070                 /*
1071                  * If we're not caching, just tell the caller we
1072                  * were successful and don't touch the route.  The
1073                  * caller hold the sole reference to the cache entry, and
1074                  * it will be released when the caller is done with it.
1075                  * If we drop it here, the callers have no way to resolve routes
1076                  * when we're not caching.  Instead, just point *rp at rt, so
1077                  * the caller gets a single use out of the route
1078                  * Note that we do rt_free on this new route entry, so that
1079                  * once its refcount hits zero, we are still able to reap it
1080                  * (Thanks Alexey)
1081                  * Note: To avoid expensive rcu stuff for this uncached dst,
1082                  * we set DST_NOCACHE so that dst_release() can free dst without
1083                  * waiting a grace period.
1084                  */
1085
1086                 rt->dst.flags |= DST_NOCACHE;
1087                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1088                         int err = rt_bind_neighbour(rt);
1089                         if (err) {
1090                                 if (net_ratelimit())
1091                                         printk(KERN_WARNING
1092                                             "Neighbour table failure & not caching routes.\n");
1093                                 ip_rt_put(rt);
1094                                 return ERR_PTR(err);
1095                         }
1096                 }
1097
1098                 goto skip_hashing;
1099         }
1100
1101         rthp = &rt_hash_table[hash].chain;
1102
1103         spin_lock_bh(rt_hash_lock_addr(hash));
1104         while ((rth = rcu_dereference_protected(*rthp,
1105                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1106                 if (rt_is_expired(rth)) {
1107                         *rthp = rth->dst.rt_next;
1108                         rt_free(rth);
1109                         continue;
1110                 }
1111                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1112                         /* Put it first */
1113                         *rthp = rth->dst.rt_next;
1114                         /*
1115                          * Since lookup is lockfree, the deletion
1116                          * must be visible to another weakly ordered CPU before
1117                          * the insertion at the start of the hash chain.
1118                          */
1119                         rcu_assign_pointer(rth->dst.rt_next,
1120                                            rt_hash_table[hash].chain);
1121                         /*
1122                          * Since lookup is lockfree, the update writes
1123                          * must be ordered for consistency on SMP.
1124                          */
1125                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1126
1127                         dst_use(&rth->dst, now);
1128                         spin_unlock_bh(rt_hash_lock_addr(hash));
1129
1130                         rt_drop(rt);
1131                         if (skb)
1132                                 skb_dst_set(skb, &rth->dst);
1133                         return rth;
1134                 }
1135
1136                 if (!atomic_read(&rth->dst.__refcnt)) {
1137                         u32 score = rt_score(rth);
1138
1139                         if (score <= min_score) {
1140                                 cand = rth;
1141                                 candp = rthp;
1142                                 min_score = score;
1143                         }
1144                 }
1145
1146                 chain_length++;
1147
1148                 rthp = &rth->dst.rt_next;
1149         }
1150
1151         if (cand) {
1152                 /* ip_rt_gc_elasticity used to be average length of chain
1153                  * length, when exceeded gc becomes really aggressive.
1154                  *
1155                  * The second limit is less certain. At the moment it allows
1156                  * only 2 entries per bucket. We will see.
1157                  */
1158                 if (chain_length > ip_rt_gc_elasticity) {
1159                         *candp = cand->dst.rt_next;
1160                         rt_free(cand);
1161                 }
1162         } else {
1163                 if (chain_length > rt_chain_length_max &&
1164                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1165                         struct net *net = dev_net(rt->dst.dev);
1166                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1167                         if (!rt_caching(net)) {
1168                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1169                                         rt->dst.dev->name, num);
1170                         }
1171                         rt_emergency_hash_rebuild(net);
1172                         spin_unlock_bh(rt_hash_lock_addr(hash));
1173
1174                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1175                                         ifindex, rt_genid(net));
1176                         goto restart;
1177                 }
1178         }
1179
1180         /* Try to bind route to arp only if it is output
1181            route or unicast forwarding path.
1182          */
1183         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1184                 int err = rt_bind_neighbour(rt);
1185                 if (err) {
1186                         spin_unlock_bh(rt_hash_lock_addr(hash));
1187
1188                         if (err != -ENOBUFS) {
1189                                 rt_drop(rt);
1190                                 return ERR_PTR(err);
1191                         }
1192
1193                         /* Neighbour tables are full and nothing
1194                            can be released. Try to shrink route cache,
1195                            it is most likely it holds some neighbour records.
1196                          */
1197                         if (attempts-- > 0) {
1198                                 int saved_elasticity = ip_rt_gc_elasticity;
1199                                 int saved_int = ip_rt_gc_min_interval;
1200                                 ip_rt_gc_elasticity     = 1;
1201                                 ip_rt_gc_min_interval   = 0;
1202                                 rt_garbage_collect(&ipv4_dst_ops);
1203                                 ip_rt_gc_min_interval   = saved_int;
1204                                 ip_rt_gc_elasticity     = saved_elasticity;
1205                                 goto restart;
1206                         }
1207
1208                         if (net_ratelimit())
1209                                 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1210                         rt_drop(rt);
1211                         return ERR_PTR(-ENOBUFS);
1212                 }
1213         }
1214
1215         rt->dst.rt_next = rt_hash_table[hash].chain;
1216
1217         /*
1218          * Since lookup is lockfree, we must make sure
1219          * previous writes to rt are committed to memory
1220          * before making rt visible to other CPUS.
1221          */
1222         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1223
1224         spin_unlock_bh(rt_hash_lock_addr(hash));
1225
1226 skip_hashing:
1227         if (skb)
1228                 skb_dst_set(skb, &rt->dst);
1229         return rt;
1230 }
1231
1232 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1233
1234 static u32 rt_peer_genid(void)
1235 {
1236         return atomic_read(&__rt_peer_genid);
1237 }
1238
1239 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1240 {
1241         struct inet_peer *peer;
1242
1243         peer = inet_getpeer_v4(daddr, create);
1244
1245         if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1246                 inet_putpeer(peer);
1247         else
1248                 rt->rt_peer_genid = rt_peer_genid();
1249 }
1250
1251 /*
1252  * Peer allocation may fail only in serious out-of-memory conditions.  However
1253  * we still can generate some output.
1254  * Random ID selection looks a bit dangerous because we have no chances to
1255  * select ID being unique in a reasonable period of time.
1256  * But broken packet identifier may be better than no packet at all.
1257  */
1258 static void ip_select_fb_ident(struct iphdr *iph)
1259 {
1260         static DEFINE_SPINLOCK(ip_fb_id_lock);
1261         static u32 ip_fallback_id;
1262         u32 salt;
1263
1264         spin_lock_bh(&ip_fb_id_lock);
1265         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1266         iph->id = htons(salt & 0xFFFF);
1267         ip_fallback_id = salt;
1268         spin_unlock_bh(&ip_fb_id_lock);
1269 }
1270
1271 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1272 {
1273         struct rtable *rt = (struct rtable *) dst;
1274
1275         if (rt) {
1276                 if (rt->peer == NULL)
1277                         rt_bind_peer(rt, rt->rt_dst, 1);
1278
1279                 /* If peer is attached to destination, it is never detached,
1280                    so that we need not to grab a lock to dereference it.
1281                  */
1282                 if (rt->peer) {
1283                         iph->id = htons(inet_getid(rt->peer, more));
1284                         return;
1285                 }
1286         } else
1287                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1288                        __builtin_return_address(0));
1289
1290         ip_select_fb_ident(iph);
1291 }
1292 EXPORT_SYMBOL(__ip_select_ident);
1293
1294 static void rt_del(unsigned hash, struct rtable *rt)
1295 {
1296         struct rtable __rcu **rthp;
1297         struct rtable *aux;
1298
1299         rthp = &rt_hash_table[hash].chain;
1300         spin_lock_bh(rt_hash_lock_addr(hash));
1301         ip_rt_put(rt);
1302         while ((aux = rcu_dereference_protected(*rthp,
1303                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1304                 if (aux == rt || rt_is_expired(aux)) {
1305                         *rthp = aux->dst.rt_next;
1306                         rt_free(aux);
1307                         continue;
1308                 }
1309                 rthp = &aux->dst.rt_next;
1310         }
1311         spin_unlock_bh(rt_hash_lock_addr(hash));
1312 }
1313
1314 static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1315 {
1316         struct rtable *rt = (struct rtable *) dst;
1317         __be32 orig_gw = rt->rt_gateway;
1318         struct neighbour *n, *old_n;
1319
1320         dst_confirm(&rt->dst);
1321
1322         rt->rt_gateway = peer->redirect_learned.a4;
1323
1324         n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1325         if (IS_ERR(n)) {
1326                 rt->rt_gateway = orig_gw;
1327                 return;
1328         }
1329         old_n = xchg(&rt->dst._neighbour, n);
1330         if (old_n)
1331                 neigh_release(old_n);
1332         if (!(n->nud_state & NUD_VALID)) {
1333                 neigh_event_send(n, NULL);
1334         } else {
1335                 rt->rt_flags |= RTCF_REDIRECTED;
1336                 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1337         }
1338 }
1339
1340 /* called in rcu_read_lock() section */
1341 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1342                     __be32 saddr, struct net_device *dev)
1343 {
1344         int s, i;
1345         struct in_device *in_dev = __in_dev_get_rcu(dev);
1346         __be32 skeys[2] = { saddr, 0 };
1347         int    ikeys[2] = { dev->ifindex, 0 };
1348         struct inet_peer *peer;
1349         struct net *net;
1350
1351         if (!in_dev)
1352                 return;
1353
1354         net = dev_net(dev);
1355         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1356             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1357             ipv4_is_zeronet(new_gw))
1358                 goto reject_redirect;
1359
1360         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1361                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1362                         goto reject_redirect;
1363                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1364                         goto reject_redirect;
1365         } else {
1366                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1367                         goto reject_redirect;
1368         }
1369
1370         for (s = 0; s < 2; s++) {
1371                 for (i = 0; i < 2; i++) {
1372                         unsigned int hash;
1373                         struct rtable __rcu **rthp;
1374                         struct rtable *rt;
1375
1376                         hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1377
1378                         rthp = &rt_hash_table[hash].chain;
1379
1380                         while ((rt = rcu_dereference(*rthp)) != NULL) {
1381                                 rthp = &rt->dst.rt_next;
1382
1383                                 if (rt->rt_key_dst != daddr ||
1384                                     rt->rt_key_src != skeys[s] ||
1385                                     rt->rt_oif != ikeys[i] ||
1386                                     rt_is_input_route(rt) ||
1387                                     rt_is_expired(rt) ||
1388                                     !net_eq(dev_net(rt->dst.dev), net) ||
1389                                     rt->dst.error ||
1390                                     rt->dst.dev != dev ||
1391                                     rt->rt_gateway != old_gw)
1392                                         continue;
1393
1394                                 if (!rt->peer)
1395                                         rt_bind_peer(rt, rt->rt_dst, 1);
1396
1397                                 peer = rt->peer;
1398                                 if (peer) {
1399                                         if (peer->redirect_learned.a4 != new_gw ||
1400                                             peer->redirect_genid != redirect_genid) {
1401                                                 peer->redirect_learned.a4 = new_gw;
1402                                                 peer->redirect_genid = redirect_genid;
1403                                                 atomic_inc(&__rt_peer_genid);
1404                                         }
1405                                         check_peer_redir(&rt->dst, peer);
1406                                 }
1407                         }
1408                 }
1409         }
1410         return;
1411
1412 reject_redirect:
1413 #ifdef CONFIG_IP_ROUTE_VERBOSE
1414         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1415                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1416                         "  Advised path = %pI4 -> %pI4\n",
1417                        &old_gw, dev->name, &new_gw,
1418                        &saddr, &daddr);
1419 #endif
1420         ;
1421 }
1422
1423 static bool peer_pmtu_expired(struct inet_peer *peer)
1424 {
1425         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1426
1427         return orig &&
1428                time_after_eq(jiffies, orig) &&
1429                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1430 }
1431
1432 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1433 {
1434         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1435
1436         return orig &&
1437                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1438 }
1439
1440 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1441 {
1442         struct rtable *rt = (struct rtable *)dst;
1443         struct dst_entry *ret = dst;
1444
1445         if (rt) {
1446                 if (dst->obsolete > 0) {
1447                         ip_rt_put(rt);
1448                         ret = NULL;
1449                 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1450                         unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1451                                                 rt->rt_oif,
1452                                                 rt_genid(dev_net(dst->dev)));
1453                         rt_del(hash, rt);
1454                         ret = NULL;
1455                 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1456                         dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1457                 }
1458         }
1459         return ret;
1460 }
1461
1462 /*
1463  * Algorithm:
1464  *      1. The first ip_rt_redirect_number redirects are sent
1465  *         with exponential backoff, then we stop sending them at all,
1466  *         assuming that the host ignores our redirects.
1467  *      2. If we did not see packets requiring redirects
1468  *         during ip_rt_redirect_silence, we assume that the host
1469  *         forgot redirected route and start to send redirects again.
1470  *
1471  * This algorithm is much cheaper and more intelligent than dumb load limiting
1472  * in icmp.c.
1473  *
1474  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1475  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1476  */
1477
1478 void ip_rt_send_redirect(struct sk_buff *skb)
1479 {
1480         struct rtable *rt = skb_rtable(skb);
1481         struct in_device *in_dev;
1482         struct inet_peer *peer;
1483         int log_martians;
1484
1485         rcu_read_lock();
1486         in_dev = __in_dev_get_rcu(rt->dst.dev);
1487         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1488                 rcu_read_unlock();
1489                 return;
1490         }
1491         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1492         rcu_read_unlock();
1493
1494         if (!rt->peer)
1495                 rt_bind_peer(rt, rt->rt_dst, 1);
1496         peer = rt->peer;
1497         if (!peer) {
1498                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1499                 return;
1500         }
1501
1502         /* No redirected packets during ip_rt_redirect_silence;
1503          * reset the algorithm.
1504          */
1505         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1506                 peer->rate_tokens = 0;
1507
1508         /* Too many ignored redirects; do not send anything
1509          * set dst.rate_last to the last seen redirected packet.
1510          */
1511         if (peer->rate_tokens >= ip_rt_redirect_number) {
1512                 peer->rate_last = jiffies;
1513                 return;
1514         }
1515
1516         /* Check for load limit; set rate_last to the latest sent
1517          * redirect.
1518          */
1519         if (peer->rate_tokens == 0 ||
1520             time_after(jiffies,
1521                        (peer->rate_last +
1522                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1523                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1524                 peer->rate_last = jiffies;
1525                 ++peer->rate_tokens;
1526 #ifdef CONFIG_IP_ROUTE_VERBOSE
1527                 if (log_martians &&
1528                     peer->rate_tokens == ip_rt_redirect_number &&
1529                     net_ratelimit())
1530                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1531                                &ip_hdr(skb)->saddr, rt->rt_iif,
1532                                 &rt->rt_dst, &rt->rt_gateway);
1533 #endif
1534         }
1535 }
1536
1537 static int ip_error(struct sk_buff *skb)
1538 {
1539         struct rtable *rt = skb_rtable(skb);
1540         struct inet_peer *peer;
1541         unsigned long now;
1542         bool send;
1543         int code;
1544
1545         switch (rt->dst.error) {
1546         case EINVAL:
1547         default:
1548                 goto out;
1549         case EHOSTUNREACH:
1550                 code = ICMP_HOST_UNREACH;
1551                 break;
1552         case ENETUNREACH:
1553                 code = ICMP_NET_UNREACH;
1554                 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1555                                 IPSTATS_MIB_INNOROUTES);
1556                 break;
1557         case EACCES:
1558                 code = ICMP_PKT_FILTERED;
1559                 break;
1560         }
1561
1562         if (!rt->peer)
1563                 rt_bind_peer(rt, rt->rt_dst, 1);
1564         peer = rt->peer;
1565
1566         send = true;
1567         if (peer) {
1568                 now = jiffies;
1569                 peer->rate_tokens += now - peer->rate_last;
1570                 if (peer->rate_tokens > ip_rt_error_burst)
1571                         peer->rate_tokens = ip_rt_error_burst;
1572                 peer->rate_last = now;
1573                 if (peer->rate_tokens >= ip_rt_error_cost)
1574                         peer->rate_tokens -= ip_rt_error_cost;
1575                 else
1576                         send = false;
1577         }
1578         if (send)
1579                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1580
1581 out:    kfree_skb(skb);
1582         return 0;
1583 }
1584
1585 /*
1586  *      The last two values are not from the RFC but
1587  *      are needed for AMPRnet AX.25 paths.
1588  */
1589
1590 static const unsigned short mtu_plateau[] =
1591 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1592
1593 static inline unsigned short guess_mtu(unsigned short old_mtu)
1594 {
1595         int i;
1596
1597         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1598                 if (old_mtu > mtu_plateau[i])
1599                         return mtu_plateau[i];
1600         return 68;
1601 }
1602
1603 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1604                                  unsigned short new_mtu,
1605                                  struct net_device *dev)
1606 {
1607         unsigned short old_mtu = ntohs(iph->tot_len);
1608         unsigned short est_mtu = 0;
1609         struct inet_peer *peer;
1610
1611         peer = inet_getpeer_v4(iph->daddr, 1);
1612         if (peer) {
1613                 unsigned short mtu = new_mtu;
1614
1615                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1616                         /* BSD 4.2 derived systems incorrectly adjust
1617                          * tot_len by the IP header length, and report
1618                          * a zero MTU in the ICMP message.
1619                          */
1620                         if (mtu == 0 &&
1621                             old_mtu >= 68 + (iph->ihl << 2))
1622                                 old_mtu -= iph->ihl << 2;
1623                         mtu = guess_mtu(old_mtu);
1624                 }
1625
1626                 if (mtu < ip_rt_min_pmtu)
1627                         mtu = ip_rt_min_pmtu;
1628                 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1629                         unsigned long pmtu_expires;
1630
1631                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1632                         if (!pmtu_expires)
1633                                 pmtu_expires = 1UL;
1634
1635                         est_mtu = mtu;
1636                         peer->pmtu_learned = mtu;
1637                         peer->pmtu_expires = pmtu_expires;
1638                         atomic_inc(&__rt_peer_genid);
1639                 }
1640
1641                 inet_putpeer(peer);
1642         }
1643         return est_mtu ? : new_mtu;
1644 }
1645
1646 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1647 {
1648         unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1649
1650         if (!expires)
1651                 return;
1652         if (time_before(jiffies, expires)) {
1653                 u32 orig_dst_mtu = dst_mtu(dst);
1654                 if (peer->pmtu_learned < orig_dst_mtu) {
1655                         if (!peer->pmtu_orig)
1656                                 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1657                         dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1658                 }
1659         } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1660                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1661 }
1662
1663 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1664 {
1665         struct rtable *rt = (struct rtable *) dst;
1666         struct inet_peer *peer;
1667
1668         dst_confirm(dst);
1669
1670         if (!rt->peer)
1671                 rt_bind_peer(rt, rt->rt_dst, 1);
1672         peer = rt->peer;
1673         if (peer) {
1674                 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1675
1676                 if (mtu < ip_rt_min_pmtu)
1677                         mtu = ip_rt_min_pmtu;
1678                 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1679
1680                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1681                         if (!pmtu_expires)
1682                                 pmtu_expires = 1UL;
1683
1684                         peer->pmtu_learned = mtu;
1685                         peer->pmtu_expires = pmtu_expires;
1686
1687                         atomic_inc(&__rt_peer_genid);
1688                         rt->rt_peer_genid = rt_peer_genid();
1689                 }
1690                 check_peer_pmtu(dst, peer);
1691         }
1692 }
1693
1694
1695 static void ipv4_validate_peer(struct rtable *rt)
1696 {
1697         if (rt->rt_peer_genid != rt_peer_genid()) {
1698                 struct inet_peer *peer;
1699
1700                 if (!rt->peer)
1701                         rt_bind_peer(rt, rt->rt_dst, 0);
1702
1703                 peer = rt->peer;
1704                 if (peer) {
1705                         check_peer_pmtu(&rt->dst, peer);
1706
1707                         if (peer->redirect_genid != redirect_genid)
1708                                 peer->redirect_learned.a4 = 0;
1709                         if (peer->redirect_learned.a4 &&
1710                             peer->redirect_learned.a4 != rt->rt_gateway)
1711                                 check_peer_redir(&rt->dst, peer);
1712                 }
1713
1714                 rt->rt_peer_genid = rt_peer_genid();
1715         }
1716 }
1717
1718 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1719 {
1720         struct rtable *rt = (struct rtable *) dst;
1721
1722         if (rt_is_expired(rt))
1723                 return NULL;
1724         ipv4_validate_peer(rt);
1725         return dst;
1726 }
1727
1728 static void ipv4_dst_destroy(struct dst_entry *dst)
1729 {
1730         struct rtable *rt = (struct rtable *) dst;
1731         struct inet_peer *peer = rt->peer;
1732
1733         if (rt->fi) {
1734                 fib_info_put(rt->fi);
1735                 rt->fi = NULL;
1736         }
1737         if (peer) {
1738                 rt->peer = NULL;
1739                 inet_putpeer(peer);
1740         }
1741 }
1742
1743
1744 static void ipv4_link_failure(struct sk_buff *skb)
1745 {
1746         struct rtable *rt;
1747
1748         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1749
1750         rt = skb_rtable(skb);
1751         if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1752                 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1753 }
1754
1755 static int ip_rt_bug(struct sk_buff *skb)
1756 {
1757         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1758                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1759                 skb->dev ? skb->dev->name : "?");
1760         kfree_skb(skb);
1761         WARN_ON(1);
1762         return 0;
1763 }
1764
1765 /*
1766    We do not cache source address of outgoing interface,
1767    because it is used only by IP RR, TS and SRR options,
1768    so that it out of fast path.
1769
1770    BTW remember: "addr" is allowed to be not aligned
1771    in IP options!
1772  */
1773
1774 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1775 {
1776         __be32 src;
1777
1778         if (rt_is_output_route(rt))
1779                 src = ip_hdr(skb)->saddr;
1780         else {
1781                 struct fib_result res;
1782                 struct flowi4 fl4;
1783                 struct iphdr *iph;
1784
1785                 iph = ip_hdr(skb);
1786
1787                 memset(&fl4, 0, sizeof(fl4));
1788                 fl4.daddr = iph->daddr;
1789                 fl4.saddr = iph->saddr;
1790                 fl4.flowi4_tos = RT_TOS(iph->tos);
1791                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1792                 fl4.flowi4_iif = skb->dev->ifindex;
1793                 fl4.flowi4_mark = skb->mark;
1794
1795                 rcu_read_lock();
1796                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1797                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1798                 else
1799                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1800                                         RT_SCOPE_UNIVERSE);
1801                 rcu_read_unlock();
1802         }
1803         memcpy(addr, &src, 4);
1804 }
1805
1806 #ifdef CONFIG_IP_ROUTE_CLASSID
1807 static void set_class_tag(struct rtable *rt, u32 tag)
1808 {
1809         if (!(rt->dst.tclassid & 0xFFFF))
1810                 rt->dst.tclassid |= tag & 0xFFFF;
1811         if (!(rt->dst.tclassid & 0xFFFF0000))
1812                 rt->dst.tclassid |= tag & 0xFFFF0000;
1813 }
1814 #endif
1815
1816 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1817 {
1818         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1819
1820         if (advmss == 0) {
1821                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1822                                ip_rt_min_advmss);
1823                 if (advmss > 65535 - 40)
1824                         advmss = 65535 - 40;
1825         }
1826         return advmss;
1827 }
1828
1829 static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1830 {
1831         unsigned int mtu = dst->dev->mtu;
1832
1833         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1834                 const struct rtable *rt = (const struct rtable *) dst;
1835
1836                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1837                         mtu = 576;
1838         }
1839
1840         if (mtu > IP_MAX_MTU)
1841                 mtu = IP_MAX_MTU;
1842
1843         return mtu;
1844 }
1845
1846 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1847                             struct fib_info *fi)
1848 {
1849         struct inet_peer *peer;
1850         int create = 0;
1851
1852         /* If a peer entry exists for this destination, we must hook
1853          * it up in order to get at cached metrics.
1854          */
1855         if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1856                 create = 1;
1857
1858         rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1859         if (peer) {
1860                 rt->rt_peer_genid = rt_peer_genid();
1861                 if (inet_metrics_new(peer))
1862                         memcpy(peer->metrics, fi->fib_metrics,
1863                                sizeof(u32) * RTAX_MAX);
1864                 dst_init_metrics(&rt->dst, peer->metrics, false);
1865
1866                 check_peer_pmtu(&rt->dst, peer);
1867                 if (peer->redirect_genid != redirect_genid)
1868                         peer->redirect_learned.a4 = 0;
1869                 if (peer->redirect_learned.a4 &&
1870                     peer->redirect_learned.a4 != rt->rt_gateway) {
1871                         rt->rt_gateway = peer->redirect_learned.a4;
1872                         rt->rt_flags |= RTCF_REDIRECTED;
1873                 }
1874         } else {
1875                 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1876                         rt->fi = fi;
1877                         atomic_inc(&fi->fib_clntref);
1878                 }
1879                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1880         }
1881 }
1882
1883 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1884                            const struct fib_result *res,
1885                            struct fib_info *fi, u16 type, u32 itag)
1886 {
1887         struct dst_entry *dst = &rt->dst;
1888
1889         if (fi) {
1890                 if (FIB_RES_GW(*res) &&
1891                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1892                         rt->rt_gateway = FIB_RES_GW(*res);
1893                 rt_init_metrics(rt, fl4, fi);
1894 #ifdef CONFIG_IP_ROUTE_CLASSID
1895                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1896 #endif
1897         }
1898
1899         if (dst_mtu(dst) > IP_MAX_MTU)
1900                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1901         if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1902                 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1903
1904 #ifdef CONFIG_IP_ROUTE_CLASSID
1905 #ifdef CONFIG_IP_MULTIPLE_TABLES
1906         set_class_tag(rt, fib_rules_tclass(res));
1907 #endif
1908         set_class_tag(rt, itag);
1909 #endif
1910 }
1911
1912 static struct rtable *rt_dst_alloc(struct net_device *dev,
1913                                    bool nopolicy, bool noxfrm)
1914 {
1915         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1916                          DST_HOST |
1917                          (nopolicy ? DST_NOPOLICY : 0) |
1918                          (noxfrm ? DST_NOXFRM : 0));
1919 }
1920
1921 /* called in rcu_read_lock() section */
1922 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1923                                 u8 tos, struct net_device *dev, int our)
1924 {
1925         unsigned int hash;
1926         struct rtable *rth;
1927         __be32 spec_dst;
1928         struct in_device *in_dev = __in_dev_get_rcu(dev);
1929         u32 itag = 0;
1930         int err;
1931
1932         /* Primary sanity checks. */
1933
1934         if (in_dev == NULL)
1935                 return -EINVAL;
1936
1937         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1938             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1939                 goto e_inval;
1940
1941         if (ipv4_is_zeronet(saddr)) {
1942                 if (!ipv4_is_local_multicast(daddr))
1943                         goto e_inval;
1944                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1945         } else {
1946                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
1947                                           &itag);
1948                 if (err < 0)
1949                         goto e_err;
1950         }
1951         rth = rt_dst_alloc(init_net.loopback_dev,
1952                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1953         if (!rth)
1954                 goto e_nobufs;
1955
1956 #ifdef CONFIG_IP_ROUTE_CLASSID
1957         rth->dst.tclassid = itag;
1958 #endif
1959         rth->dst.output = ip_rt_bug;
1960
1961         rth->rt_key_dst = daddr;
1962         rth->rt_key_src = saddr;
1963         rth->rt_genid   = rt_genid(dev_net(dev));
1964         rth->rt_flags   = RTCF_MULTICAST;
1965         rth->rt_type    = RTN_MULTICAST;
1966         rth->rt_key_tos = tos;
1967         rth->rt_dst     = daddr;
1968         rth->rt_src     = saddr;
1969         rth->rt_route_iif = dev->ifindex;
1970         rth->rt_iif     = dev->ifindex;
1971         rth->rt_oif     = 0;
1972         rth->rt_mark    = skb->mark;
1973         rth->rt_gateway = daddr;
1974         rth->rt_spec_dst= spec_dst;
1975         rth->rt_peer_genid = 0;
1976         rth->peer = NULL;
1977         rth->fi = NULL;
1978         if (our) {
1979                 rth->dst.input= ip_local_deliver;
1980                 rth->rt_flags |= RTCF_LOCAL;
1981         }
1982
1983 #ifdef CONFIG_IP_MROUTE
1984         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1985                 rth->dst.input = ip_mr_input;
1986 #endif
1987         RT_CACHE_STAT_INC(in_slow_mc);
1988
1989         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1990         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
1991         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
1992
1993 e_nobufs:
1994         return -ENOBUFS;
1995 e_inval:
1996         return -EINVAL;
1997 e_err:
1998         return err;
1999 }
2000
2001
2002 static void ip_handle_martian_source(struct net_device *dev,
2003                                      struct in_device *in_dev,
2004                                      struct sk_buff *skb,
2005                                      __be32 daddr,
2006                                      __be32 saddr)
2007 {
2008         RT_CACHE_STAT_INC(in_martian_src);
2009 #ifdef CONFIG_IP_ROUTE_VERBOSE
2010         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2011                 /*
2012                  *      RFC1812 recommendation, if source is martian,
2013                  *      the only hint is MAC header.
2014                  */
2015                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
2016                         &daddr, &saddr, dev->name);
2017                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2018                         int i;
2019                         const unsigned char *p = skb_mac_header(skb);
2020                         printk(KERN_WARNING "ll header: ");
2021                         for (i = 0; i < dev->hard_header_len; i++, p++) {
2022                                 printk("%02x", *p);
2023                                 if (i < (dev->hard_header_len - 1))
2024                                         printk(":");
2025                         }
2026                         printk("\n");
2027                 }
2028         }
2029 #endif
2030 }
2031
2032 /* called in rcu_read_lock() section */
2033 static int __mkroute_input(struct sk_buff *skb,
2034                            const struct fib_result *res,
2035                            struct in_device *in_dev,
2036                            __be32 daddr, __be32 saddr, u32 tos,
2037                            struct rtable **result)
2038 {
2039         struct rtable *rth;
2040         int err;
2041         struct in_device *out_dev;
2042         unsigned int flags = 0;
2043         __be32 spec_dst;
2044         u32 itag;
2045
2046         /* get a working reference to the output device */
2047         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2048         if (out_dev == NULL) {
2049                 if (net_ratelimit())
2050                         printk(KERN_CRIT "Bug in ip_route_input" \
2051                                "_slow(). Please, report\n");
2052                 return -EINVAL;
2053         }
2054
2055
2056         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2057                                   in_dev->dev, &spec_dst, &itag);
2058         if (err < 0) {
2059                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2060                                          saddr);
2061
2062                 goto cleanup;
2063         }
2064
2065         if (err)
2066                 flags |= RTCF_DIRECTSRC;
2067
2068         if (out_dev == in_dev && err &&
2069             (IN_DEV_SHARED_MEDIA(out_dev) ||
2070              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2071                 flags |= RTCF_DOREDIRECT;
2072
2073         if (skb->protocol != htons(ETH_P_IP)) {
2074                 /* Not IP (i.e. ARP). Do not create route, if it is
2075                  * invalid for proxy arp. DNAT routes are always valid.
2076                  *
2077                  * Proxy arp feature have been extended to allow, ARP
2078                  * replies back to the same interface, to support
2079                  * Private VLAN switch technologies. See arp.c.
2080                  */
2081                 if (out_dev == in_dev &&
2082                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2083                         err = -EINVAL;
2084                         goto cleanup;
2085                 }
2086         }
2087
2088         rth = rt_dst_alloc(out_dev->dev,
2089                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2090                            IN_DEV_CONF_GET(out_dev, NOXFRM));
2091         if (!rth) {
2092                 err = -ENOBUFS;
2093                 goto cleanup;
2094         }
2095
2096         rth->rt_key_dst = daddr;
2097         rth->rt_key_src = saddr;
2098         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2099         rth->rt_flags = flags;
2100         rth->rt_type = res->type;
2101         rth->rt_key_tos = tos;
2102         rth->rt_dst     = daddr;
2103         rth->rt_src     = saddr;
2104         rth->rt_route_iif = in_dev->dev->ifindex;
2105         rth->rt_iif     = in_dev->dev->ifindex;
2106         rth->rt_oif     = 0;
2107         rth->rt_mark    = skb->mark;
2108         rth->rt_gateway = daddr;
2109         rth->rt_spec_dst= spec_dst;
2110         rth->rt_peer_genid = 0;
2111         rth->peer = NULL;
2112         rth->fi = NULL;
2113
2114         rth->dst.input = ip_forward;
2115         rth->dst.output = ip_output;
2116
2117         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2118
2119         *result = rth;
2120         err = 0;
2121  cleanup:
2122         return err;
2123 }
2124
2125 static int ip_mkroute_input(struct sk_buff *skb,
2126                             struct fib_result *res,
2127                             const struct flowi4 *fl4,
2128                             struct in_device *in_dev,
2129                             __be32 daddr, __be32 saddr, u32 tos)
2130 {
2131         struct rtable* rth = NULL;
2132         int err;
2133         unsigned hash;
2134
2135 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2136         if (res->fi && res->fi->fib_nhs > 1)
2137                 fib_select_multipath(res);
2138 #endif
2139
2140         /* create a routing cache entry */
2141         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2142         if (err)
2143                 return err;
2144
2145         /* put it into the cache */
2146         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2147                        rt_genid(dev_net(rth->dst.dev)));
2148         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2149         if (IS_ERR(rth))
2150                 return PTR_ERR(rth);
2151         return 0;
2152 }
2153
2154 /*
2155  *      NOTE. We drop all the packets that has local source
2156  *      addresses, because every properly looped back packet
2157  *      must have correct destination already attached by output routine.
2158  *
2159  *      Such approach solves two big problems:
2160  *      1. Not simplex devices are handled properly.
2161  *      2. IP spoofing attempts are filtered with 100% of guarantee.
2162  *      called with rcu_read_lock()
2163  */
2164
2165 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2166                                u8 tos, struct net_device *dev)
2167 {
2168         struct fib_result res;
2169         struct in_device *in_dev = __in_dev_get_rcu(dev);
2170         struct flowi4   fl4;
2171         unsigned        flags = 0;
2172         u32             itag = 0;
2173         struct rtable * rth;
2174         unsigned        hash;
2175         __be32          spec_dst;
2176         int             err = -EINVAL;
2177         struct net    * net = dev_net(dev);
2178
2179         /* IP on this device is disabled. */
2180
2181         if (!in_dev)
2182                 goto out;
2183
2184         /* Check for the most weird martians, which can be not detected
2185            by fib_lookup.
2186          */
2187
2188         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2189             ipv4_is_loopback(saddr))
2190                 goto martian_source;
2191
2192         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2193                 goto brd_input;
2194
2195         /* Accept zero addresses only to limited broadcast;
2196          * I even do not know to fix it or not. Waiting for complains :-)
2197          */
2198         if (ipv4_is_zeronet(saddr))
2199                 goto martian_source;
2200
2201         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2202                 goto martian_destination;
2203
2204         /*
2205          *      Now we are ready to route packet.
2206          */
2207         fl4.flowi4_oif = 0;
2208         fl4.flowi4_iif = dev->ifindex;
2209         fl4.flowi4_mark = skb->mark;
2210         fl4.flowi4_tos = tos;
2211         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2212         fl4.daddr = daddr;
2213         fl4.saddr = saddr;
2214         err = fib_lookup(net, &fl4, &res);
2215         if (err != 0) {
2216                 if (!IN_DEV_FORWARD(in_dev))
2217                         goto e_hostunreach;
2218                 goto no_route;
2219         }
2220
2221         RT_CACHE_STAT_INC(in_slow_tot);
2222
2223         if (res.type == RTN_BROADCAST)
2224                 goto brd_input;
2225
2226         if (res.type == RTN_LOCAL) {
2227                 err = fib_validate_source(skb, saddr, daddr, tos,
2228                                           net->loopback_dev->ifindex,
2229                                           dev, &spec_dst, &itag);
2230                 if (err < 0)
2231                         goto martian_source_keep_err;
2232                 if (err)
2233                         flags |= RTCF_DIRECTSRC;
2234                 spec_dst = daddr;
2235                 goto local_input;
2236         }
2237
2238         if (!IN_DEV_FORWARD(in_dev))
2239                 goto e_hostunreach;
2240         if (res.type != RTN_UNICAST)
2241                 goto martian_destination;
2242
2243         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2244 out:    return err;
2245
2246 brd_input:
2247         if (skb->protocol != htons(ETH_P_IP))
2248                 goto e_inval;
2249
2250         if (ipv4_is_zeronet(saddr))
2251                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2252         else {
2253                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2254                                           &itag);
2255                 if (err < 0)
2256                         goto martian_source_keep_err;
2257                 if (err)
2258                         flags |= RTCF_DIRECTSRC;
2259         }
2260         flags |= RTCF_BROADCAST;
2261         res.type = RTN_BROADCAST;
2262         RT_CACHE_STAT_INC(in_brd);
2263
2264 local_input:
2265         rth = rt_dst_alloc(net->loopback_dev,
2266                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2267         if (!rth)
2268                 goto e_nobufs;
2269
2270         rth->dst.input= ip_local_deliver;
2271         rth->dst.output= ip_rt_bug;
2272 #ifdef CONFIG_IP_ROUTE_CLASSID
2273         rth->dst.tclassid = itag;
2274 #endif
2275
2276         rth->rt_key_dst = daddr;
2277         rth->rt_key_src = saddr;
2278         rth->rt_genid = rt_genid(net);
2279         rth->rt_flags   = flags|RTCF_LOCAL;
2280         rth->rt_type    = res.type;
2281         rth->rt_key_tos = tos;
2282         rth->rt_dst     = daddr;
2283         rth->rt_src     = saddr;
2284 #ifdef CONFIG_IP_ROUTE_CLASSID
2285         rth->dst.tclassid = itag;
2286 #endif
2287         rth->rt_route_iif = dev->ifindex;
2288         rth->rt_iif     = dev->ifindex;
2289         rth->rt_oif     = 0;
2290         rth->rt_mark    = skb->mark;
2291         rth->rt_gateway = daddr;
2292         rth->rt_spec_dst= spec_dst;
2293         rth->rt_peer_genid = 0;
2294         rth->peer = NULL;
2295         rth->fi = NULL;
2296         if (res.type == RTN_UNREACHABLE) {
2297                 rth->dst.input= ip_error;
2298                 rth->dst.error= -err;
2299                 rth->rt_flags   &= ~RTCF_LOCAL;
2300         }
2301         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2302         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2303         err = 0;
2304         if (IS_ERR(rth))
2305                 err = PTR_ERR(rth);
2306         goto out;
2307
2308 no_route:
2309         RT_CACHE_STAT_INC(in_no_route);
2310         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2311         res.type = RTN_UNREACHABLE;
2312         if (err == -ESRCH)
2313                 err = -ENETUNREACH;
2314         goto local_input;
2315
2316         /*
2317          *      Do not cache martian addresses: they should be logged (RFC1812)
2318          */
2319 martian_destination:
2320         RT_CACHE_STAT_INC(in_martian_dst);
2321 #ifdef CONFIG_IP_ROUTE_VERBOSE
2322         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2323                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2324                         &daddr, &saddr, dev->name);
2325 #endif
2326
2327 e_hostunreach:
2328         err = -EHOSTUNREACH;
2329         goto out;
2330
2331 e_inval:
2332         err = -EINVAL;
2333         goto out;
2334
2335 e_nobufs:
2336         err = -ENOBUFS;
2337         goto out;
2338
2339 martian_source:
2340         err = -EINVAL;
2341 martian_source_keep_err:
2342         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2343         goto out;
2344 }
2345
2346 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2347                            u8 tos, struct net_device *dev, bool noref)
2348 {
2349         struct rtable * rth;
2350         unsigned        hash;
2351         int iif = dev->ifindex;
2352         struct net *net;
2353         int res;
2354
2355         net = dev_net(dev);
2356
2357         rcu_read_lock();
2358
2359         if (!rt_caching(net))
2360                 goto skip_cache;
2361
2362         tos &= IPTOS_RT_MASK;
2363         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2364
2365         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2366              rth = rcu_dereference(rth->dst.rt_next)) {
2367                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2368                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2369                      (rth->rt_route_iif ^ iif) |
2370                      (rth->rt_key_tos ^ tos)) == 0 &&
2371                     rth->rt_mark == skb->mark &&
2372                     net_eq(dev_net(rth->dst.dev), net) &&
2373                     !rt_is_expired(rth)) {
2374                         ipv4_validate_peer(rth);
2375                         if (noref) {
2376                                 dst_use_noref(&rth->dst, jiffies);
2377                                 skb_dst_set_noref(skb, &rth->dst);
2378                         } else {
2379                                 dst_use(&rth->dst, jiffies);
2380                                 skb_dst_set(skb, &rth->dst);
2381                         }
2382                         RT_CACHE_STAT_INC(in_hit);
2383                         rcu_read_unlock();
2384                         return 0;
2385                 }
2386                 RT_CACHE_STAT_INC(in_hlist_search);
2387         }
2388
2389 skip_cache:
2390         /* Multicast recognition logic is moved from route cache to here.
2391            The problem was that too many Ethernet cards have broken/missing
2392            hardware multicast filters :-( As result the host on multicasting
2393            network acquires a lot of useless route cache entries, sort of
2394            SDR messages from all the world. Now we try to get rid of them.
2395            Really, provided software IP multicast filter is organized
2396            reasonably (at least, hashed), it does not result in a slowdown
2397            comparing with route cache reject entries.
2398            Note, that multicast routers are not affected, because
2399            route cache entry is created eventually.
2400          */
2401         if (ipv4_is_multicast(daddr)) {
2402                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2403
2404                 if (in_dev) {
2405                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2406                                                   ip_hdr(skb)->protocol);
2407                         if (our
2408 #ifdef CONFIG_IP_MROUTE
2409                                 ||
2410                             (!ipv4_is_local_multicast(daddr) &&
2411                              IN_DEV_MFORWARD(in_dev))
2412 #endif
2413                            ) {
2414                                 int res = ip_route_input_mc(skb, daddr, saddr,
2415                                                             tos, dev, our);
2416                                 rcu_read_unlock();
2417                                 return res;
2418                         }
2419                 }
2420                 rcu_read_unlock();
2421                 return -EINVAL;
2422         }
2423         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2424         rcu_read_unlock();
2425         return res;
2426 }
2427 EXPORT_SYMBOL(ip_route_input_common);
2428
2429 /* called with rcu_read_lock() */
2430 static struct rtable *__mkroute_output(const struct fib_result *res,
2431                                        const struct flowi4 *fl4,
2432                                        __be32 orig_daddr, __be32 orig_saddr,
2433                                        int orig_oif, __u8 orig_rtos,
2434                                        struct net_device *dev_out,
2435                                        unsigned int flags)
2436 {
2437         struct fib_info *fi = res->fi;
2438         struct in_device *in_dev;
2439         u16 type = res->type;
2440         struct rtable *rth;
2441
2442         if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2443                 return ERR_PTR(-EINVAL);
2444
2445         if (ipv4_is_lbcast(fl4->daddr))
2446                 type = RTN_BROADCAST;
2447         else if (ipv4_is_multicast(fl4->daddr))
2448                 type = RTN_MULTICAST;
2449         else if (ipv4_is_zeronet(fl4->daddr))
2450                 return ERR_PTR(-EINVAL);
2451
2452         if (dev_out->flags & IFF_LOOPBACK)
2453                 flags |= RTCF_LOCAL;
2454
2455         in_dev = __in_dev_get_rcu(dev_out);
2456         if (!in_dev)
2457                 return ERR_PTR(-EINVAL);
2458
2459         if (type == RTN_BROADCAST) {
2460                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2461                 fi = NULL;
2462         } else if (type == RTN_MULTICAST) {
2463                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2464                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2465                                      fl4->flowi4_proto))
2466                         flags &= ~RTCF_LOCAL;
2467                 /* If multicast route do not exist use
2468                  * default one, but do not gateway in this case.
2469                  * Yes, it is hack.
2470                  */
2471                 if (fi && res->prefixlen < 4)
2472                         fi = NULL;
2473         }
2474
2475         rth = rt_dst_alloc(dev_out,
2476                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2477                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2478         if (!rth)
2479                 return ERR_PTR(-ENOBUFS);
2480
2481         rth->dst.output = ip_output;
2482
2483         rth->rt_key_dst = orig_daddr;
2484         rth->rt_key_src = orig_saddr;
2485         rth->rt_genid = rt_genid(dev_net(dev_out));
2486         rth->rt_flags   = flags;
2487         rth->rt_type    = type;
2488         rth->rt_key_tos = orig_rtos;
2489         rth->rt_dst     = fl4->daddr;
2490         rth->rt_src     = fl4->saddr;
2491         rth->rt_route_iif = 0;
2492         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2493         rth->rt_oif     = orig_oif;
2494         rth->rt_mark    = fl4->flowi4_mark;
2495         rth->rt_gateway = fl4->daddr;
2496         rth->rt_spec_dst= fl4->saddr;
2497         rth->rt_peer_genid = 0;
2498         rth->peer = NULL;
2499         rth->fi = NULL;
2500
2501         RT_CACHE_STAT_INC(out_slow_tot);
2502
2503         if (flags & RTCF_LOCAL) {
2504                 rth->dst.input = ip_local_deliver;
2505                 rth->rt_spec_dst = fl4->daddr;
2506         }
2507         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2508                 rth->rt_spec_dst = fl4->saddr;
2509                 if (flags & RTCF_LOCAL &&
2510                     !(dev_out->flags & IFF_LOOPBACK)) {
2511                         rth->dst.output = ip_mc_output;
2512                         RT_CACHE_STAT_INC(out_slow_mc);
2513                 }
2514 #ifdef CONFIG_IP_MROUTE
2515                 if (type == RTN_MULTICAST) {
2516                         if (IN_DEV_MFORWARD(in_dev) &&
2517                             !ipv4_is_local_multicast(fl4->daddr)) {
2518                                 rth->dst.input = ip_mr_input;
2519                                 rth->dst.output = ip_mc_output;
2520                         }
2521                 }
2522 #endif
2523         }
2524
2525         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2526
2527         return rth;
2528 }
2529
2530 /*
2531  * Major route resolver routine.
2532  * called with rcu_read_lock();
2533  */
2534
2535 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2536 {
2537         struct net_device *dev_out = NULL;
2538         __u8 tos = RT_FL_TOS(fl4);
2539         unsigned int flags = 0;
2540         struct fib_result res;
2541         struct rtable *rth;
2542         __be32 orig_daddr;
2543         __be32 orig_saddr;
2544         int orig_oif;
2545
2546         res.fi          = NULL;
2547 #ifdef CONFIG_IP_MULTIPLE_TABLES
2548         res.r           = NULL;
2549 #endif
2550
2551         orig_daddr = fl4->daddr;
2552         orig_saddr = fl4->saddr;
2553         orig_oif = fl4->flowi4_oif;
2554
2555         fl4->flowi4_iif = net->loopback_dev->ifindex;
2556         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2557         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2558                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2559
2560         rcu_read_lock();
2561         if (fl4->saddr) {
2562                 rth = ERR_PTR(-EINVAL);
2563                 if (ipv4_is_multicast(fl4->saddr) ||
2564                     ipv4_is_lbcast(fl4->saddr) ||
2565                     ipv4_is_zeronet(fl4->saddr))
2566                         goto out;
2567
2568                 /* I removed check for oif == dev_out->oif here.
2569                    It was wrong for two reasons:
2570                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2571                       is assigned to multiple interfaces.
2572                    2. Moreover, we are allowed to send packets with saddr
2573                       of another iface. --ANK
2574                  */
2575
2576                 if (fl4->flowi4_oif == 0 &&
2577                     (ipv4_is_multicast(fl4->daddr) ||
2578                      ipv4_is_lbcast(fl4->daddr))) {
2579                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2580                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2581                         if (dev_out == NULL)
2582                                 goto out;
2583
2584                         /* Special hack: user can direct multicasts
2585                            and limited broadcast via necessary interface
2586                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2587                            This hack is not just for fun, it allows
2588                            vic,vat and friends to work.
2589                            They bind socket to loopback, set ttl to zero
2590                            and expect that it will work.
2591                            From the viewpoint of routing cache they are broken,
2592                            because we are not allowed to build multicast path
2593                            with loopback source addr (look, routing cache
2594                            cannot know, that ttl is zero, so that packet
2595                            will not leave this host and route is valid).
2596                            Luckily, this hack is good workaround.
2597                          */
2598
2599                         fl4->flowi4_oif = dev_out->ifindex;
2600                         goto make_route;
2601                 }
2602
2603                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2604                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2605                         if (!__ip_dev_find(net, fl4->saddr, false))
2606                                 goto out;
2607                 }
2608         }
2609
2610
2611         if (fl4->flowi4_oif) {
2612                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2613                 rth = ERR_PTR(-ENODEV);
2614                 if (dev_out == NULL)
2615                         goto out;
2616
2617                 /* RACE: Check return value of inet_select_addr instead. */
2618                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2619                         rth = ERR_PTR(-ENETUNREACH);
2620                         goto out;
2621                 }
2622                 if (ipv4_is_local_multicast(fl4->daddr) ||
2623                     ipv4_is_lbcast(fl4->daddr)) {
2624                         if (!fl4->saddr)
2625                                 fl4->saddr = inet_select_addr(dev_out, 0,
2626                                                               RT_SCOPE_LINK);
2627                         goto make_route;
2628                 }
2629                 if (fl4->saddr) {
2630                         if (ipv4_is_multicast(fl4->daddr))
2631                                 fl4->saddr = inet_select_addr(dev_out, 0,
2632                                                               fl4->flowi4_scope);
2633                         else if (!fl4->daddr)
2634                                 fl4->saddr = inet_select_addr(dev_out, 0,
2635                                                               RT_SCOPE_HOST);
2636                 }
2637         }
2638
2639         if (!fl4->daddr) {
2640                 fl4->daddr = fl4->saddr;
2641                 if (!fl4->daddr)
2642                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2643                 dev_out = net->loopback_dev;
2644                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2645                 res.type = RTN_LOCAL;
2646                 flags |= RTCF_LOCAL;
2647                 goto make_route;
2648         }
2649
2650         if (fib_lookup(net, fl4, &res)) {
2651                 res.fi = NULL;
2652                 if (fl4->flowi4_oif) {
2653                         /* Apparently, routing tables are wrong. Assume,
2654                            that the destination is on link.
2655
2656                            WHY? DW.
2657                            Because we are allowed to send to iface
2658                            even if it has NO routes and NO assigned
2659                            addresses. When oif is specified, routing
2660                            tables are looked up with only one purpose:
2661                            to catch if destination is gatewayed, rather than
2662                            direct. Moreover, if MSG_DONTROUTE is set,
2663                            we send packet, ignoring both routing tables
2664                            and ifaddr state. --ANK
2665
2666
2667                            We could make it even if oif is unknown,
2668                            likely IPv6, but we do not.
2669                          */
2670
2671                         if (fl4->saddr == 0)
2672                                 fl4->saddr = inet_select_addr(dev_out, 0,
2673                                                               RT_SCOPE_LINK);
2674                         res.type = RTN_UNICAST;
2675                         goto make_route;
2676                 }
2677                 rth = ERR_PTR(-ENETUNREACH);
2678                 goto out;
2679         }
2680
2681         if (res.type == RTN_LOCAL) {
2682                 if (!fl4->saddr) {
2683                         if (res.fi->fib_prefsrc)
2684                                 fl4->saddr = res.fi->fib_prefsrc;
2685                         else
2686                                 fl4->saddr = fl4->daddr;
2687                 }
2688                 dev_out = net->loopback_dev;
2689                 fl4->flowi4_oif = dev_out->ifindex;
2690                 res.fi = NULL;
2691                 flags |= RTCF_LOCAL;
2692                 goto make_route;
2693         }
2694
2695 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2696         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2697                 fib_select_multipath(&res);
2698         else
2699 #endif
2700         if (!res.prefixlen &&
2701             res.table->tb_num_default > 1 &&
2702             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2703                 fib_select_default(&res);
2704
2705         if (!fl4->saddr)
2706                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2707
2708         dev_out = FIB_RES_DEV(res);
2709         fl4->flowi4_oif = dev_out->ifindex;
2710
2711
2712 make_route:
2713         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2714                                tos, dev_out, flags);
2715         if (!IS_ERR(rth)) {
2716                 unsigned int hash;
2717
2718                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2719                                rt_genid(dev_net(dev_out)));
2720                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2721         }
2722
2723 out:
2724         rcu_read_unlock();
2725         return rth;
2726 }
2727
2728 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2729 {
2730         struct rtable *rth;
2731         unsigned int hash;
2732
2733         if (!rt_caching(net))
2734                 goto slow_output;
2735
2736         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2737
2738         rcu_read_lock_bh();
2739         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2740                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2741                 if (rth->rt_key_dst == flp4->daddr &&
2742                     rth->rt_key_src == flp4->saddr &&
2743                     rt_is_output_route(rth) &&
2744                     rth->rt_oif == flp4->flowi4_oif &&
2745                     rth->rt_mark == flp4->flowi4_mark &&
2746                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2747                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2748                     net_eq(dev_net(rth->dst.dev), net) &&
2749                     !rt_is_expired(rth)) {
2750                         ipv4_validate_peer(rth);
2751                         dst_use(&rth->dst, jiffies);
2752                         RT_CACHE_STAT_INC(out_hit);
2753                         rcu_read_unlock_bh();
2754                         if (!flp4->saddr)
2755                                 flp4->saddr = rth->rt_src;
2756                         if (!flp4->daddr)
2757                                 flp4->daddr = rth->rt_dst;
2758                         return rth;
2759                 }
2760                 RT_CACHE_STAT_INC(out_hlist_search);
2761         }
2762         rcu_read_unlock_bh();
2763
2764 slow_output:
2765         return ip_route_output_slow(net, flp4);
2766 }
2767 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2768
2769 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2770 {
2771         return NULL;
2772 }
2773
2774 static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
2775 {
2776         return 0;
2777 }
2778
2779 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2780 {
2781 }
2782
2783 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2784                                           unsigned long old)
2785 {
2786         return NULL;
2787 }
2788
2789 static struct dst_ops ipv4_dst_blackhole_ops = {
2790         .family                 =       AF_INET,
2791         .protocol               =       cpu_to_be16(ETH_P_IP),
2792         .destroy                =       ipv4_dst_destroy,
2793         .check                  =       ipv4_blackhole_dst_check,
2794         .default_mtu            =       ipv4_blackhole_default_mtu,
2795         .default_advmss         =       ipv4_default_advmss,
2796         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2797         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2798         .neigh_lookup           =       ipv4_neigh_lookup,
2799 };
2800
2801 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2802 {
2803         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2804         struct rtable *ort = (struct rtable *) dst_orig;
2805
2806         if (rt) {
2807                 struct dst_entry *new = &rt->dst;
2808
2809                 new->__use = 1;
2810                 new->input = dst_discard;
2811                 new->output = dst_discard;
2812                 dst_copy_metrics(new, &ort->dst);
2813
2814                 new->dev = ort->dst.dev;
2815                 if (new->dev)
2816                         dev_hold(new->dev);
2817
2818                 rt->rt_key_dst = ort->rt_key_dst;
2819                 rt->rt_key_src = ort->rt_key_src;
2820                 rt->rt_key_tos = ort->rt_key_tos;
2821                 rt->rt_route_iif = ort->rt_route_iif;
2822                 rt->rt_iif = ort->rt_iif;
2823                 rt->rt_oif = ort->rt_oif;
2824                 rt->rt_mark = ort->rt_mark;
2825
2826                 rt->rt_genid = rt_genid(net);
2827                 rt->rt_flags = ort->rt_flags;
2828                 rt->rt_type = ort->rt_type;
2829                 rt->rt_dst = ort->rt_dst;
2830                 rt->rt_src = ort->rt_src;
2831                 rt->rt_gateway = ort->rt_gateway;
2832                 rt->rt_spec_dst = ort->rt_spec_dst;
2833                 rt->peer = ort->peer;
2834                 if (rt->peer)
2835                         atomic_inc(&rt->peer->refcnt);
2836                 rt->fi = ort->fi;
2837                 if (rt->fi)
2838                         atomic_inc(&rt->fi->fib_clntref);
2839
2840                 dst_free(new);
2841         }
2842
2843         dst_release(dst_orig);
2844
2845         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2846 }
2847
2848 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2849                                     struct sock *sk)
2850 {
2851         struct rtable *rt = __ip_route_output_key(net, flp4);
2852
2853         if (IS_ERR(rt))
2854                 return rt;
2855
2856         if (flp4->flowi4_proto)
2857                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2858                                                    flowi4_to_flowi(flp4),
2859                                                    sk, 0);
2860
2861         return rt;
2862 }
2863 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2864
2865 static int rt_fill_info(struct net *net,
2866                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2867                         int nowait, unsigned int flags)
2868 {
2869         struct rtable *rt = skb_rtable(skb);
2870         struct rtmsg *r;
2871         struct nlmsghdr *nlh;
2872         long expires = 0;
2873         const struct inet_peer *peer = rt->peer;
2874         u32 id = 0, ts = 0, tsage = 0, error;
2875
2876         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2877         if (nlh == NULL)
2878                 return -EMSGSIZE;
2879
2880         r = nlmsg_data(nlh);
2881         r->rtm_family    = AF_INET;
2882         r->rtm_dst_len  = 32;
2883         r->rtm_src_len  = 0;
2884         r->rtm_tos      = rt->rt_key_tos;
2885         r->rtm_table    = RT_TABLE_MAIN;
2886         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2887         r->rtm_type     = rt->rt_type;
2888         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2889         r->rtm_protocol = RTPROT_UNSPEC;
2890         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2891         if (rt->rt_flags & RTCF_NOTIFY)
2892                 r->rtm_flags |= RTM_F_NOTIFY;
2893
2894         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2895
2896         if (rt->rt_key_src) {
2897                 r->rtm_src_len = 32;
2898                 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2899         }
2900         if (rt->dst.dev)
2901                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2902 #ifdef CONFIG_IP_ROUTE_CLASSID
2903         if (rt->dst.tclassid)
2904                 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2905 #endif
2906         if (rt_is_input_route(rt))
2907                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2908         else if (rt->rt_src != rt->rt_key_src)
2909                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2910
2911         if (rt->rt_dst != rt->rt_gateway)
2912                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2913
2914         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2915                 goto nla_put_failure;
2916
2917         if (rt->rt_mark)
2918                 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
2919
2920         error = rt->dst.error;
2921         if (peer) {
2922                 inet_peer_refcheck(rt->peer);
2923                 id = atomic_read(&peer->ip_id_count) & 0xffff;
2924                 if (peer->tcp_ts_stamp) {
2925                         ts = peer->tcp_ts;
2926                         tsage = get_seconds() - peer->tcp_ts_stamp;
2927                 }
2928                 expires = ACCESS_ONCE(peer->pmtu_expires);
2929                 if (expires)
2930                         expires -= jiffies;
2931         }
2932
2933         if (rt_is_input_route(rt)) {
2934 #ifdef CONFIG_IP_MROUTE
2935                 __be32 dst = rt->rt_dst;
2936
2937                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2938                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2939                         int err = ipmr_get_route(net, skb,
2940                                                  rt->rt_src, rt->rt_dst,
2941                                                  r, nowait);
2942                         if (err <= 0) {
2943                                 if (!nowait) {
2944                                         if (err == 0)
2945                                                 return 0;
2946                                         goto nla_put_failure;
2947                                 } else {
2948                                         if (err == -EMSGSIZE)
2949                                                 goto nla_put_failure;
2950                                         error = err;
2951                                 }
2952                         }
2953                 } else
2954 #endif
2955                         NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
2956         }
2957
2958         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2959                                expires, error) < 0)
2960                 goto nla_put_failure;
2961
2962         return nlmsg_end(skb, nlh);
2963
2964 nla_put_failure:
2965         nlmsg_cancel(skb, nlh);
2966         return -EMSGSIZE;
2967 }
2968
2969 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2970 {
2971         struct net *net = sock_net(in_skb->sk);
2972         struct rtmsg *rtm;
2973         struct nlattr *tb[RTA_MAX+1];
2974         struct rtable *rt = NULL;
2975         __be32 dst = 0;
2976         __be32 src = 0;
2977         u32 iif;
2978         int err;
2979         int mark;
2980         struct sk_buff *skb;
2981
2982         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2983         if (err < 0)
2984                 goto errout;
2985
2986         rtm = nlmsg_data(nlh);
2987
2988         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2989         if (skb == NULL) {
2990                 err = -ENOBUFS;
2991                 goto errout;
2992         }
2993
2994         /* Reserve room for dummy headers, this skb can pass
2995            through good chunk of routing engine.
2996          */
2997         skb_reset_mac_header(skb);
2998         skb_reset_network_header(skb);
2999
3000         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3001         ip_hdr(skb)->protocol = IPPROTO_ICMP;
3002         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3003
3004         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3005         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3006         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3007         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3008
3009         if (iif) {
3010                 struct net_device *dev;
3011
3012                 dev = __dev_get_by_index(net, iif);
3013                 if (dev == NULL) {
3014                         err = -ENODEV;
3015                         goto errout_free;
3016                 }
3017
3018                 skb->protocol   = htons(ETH_P_IP);
3019                 skb->dev        = dev;
3020                 skb->mark       = mark;
3021                 local_bh_disable();
3022                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3023                 local_bh_enable();
3024
3025                 rt = skb_rtable(skb);
3026                 if (err == 0 && rt->dst.error)
3027                         err = -rt->dst.error;
3028         } else {
3029                 struct flowi4 fl4 = {
3030                         .daddr = dst,
3031                         .saddr = src,
3032                         .flowi4_tos = rtm->rtm_tos,
3033                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3034                         .flowi4_mark = mark,
3035                 };
3036                 rt = ip_route_output_key(net, &fl4);
3037
3038                 err = 0;
3039                 if (IS_ERR(rt))
3040                         err = PTR_ERR(rt);
3041         }
3042
3043         if (err)
3044                 goto errout_free;
3045
3046         skb_dst_set(skb, &rt->dst);
3047         if (rtm->rtm_flags & RTM_F_NOTIFY)
3048                 rt->rt_flags |= RTCF_NOTIFY;
3049
3050         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3051                            RTM_NEWROUTE, 0, 0);
3052         if (err <= 0)
3053                 goto errout_free;
3054
3055         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3056 errout:
3057         return err;
3058
3059 errout_free:
3060         kfree_skb(skb);
3061         goto errout;
3062 }
3063
3064 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3065 {
3066         struct rtable *rt;
3067         int h, s_h;
3068         int idx, s_idx;
3069         struct net *net;
3070
3071         net = sock_net(skb->sk);
3072
3073         s_h = cb->args[0];
3074         if (s_h < 0)
3075                 s_h = 0;
3076         s_idx = idx = cb->args[1];
3077         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3078                 if (!rt_hash_table[h].chain)
3079                         continue;
3080                 rcu_read_lock_bh();
3081                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3082                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3083                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3084                                 continue;
3085                         if (rt_is_expired(rt))
3086                                 continue;
3087                         skb_dst_set_noref(skb, &rt->dst);
3088                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3089                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3090                                          1, NLM_F_MULTI) <= 0) {
3091                                 skb_dst_drop(skb);
3092                                 rcu_read_unlock_bh();
3093                                 goto done;
3094                         }
3095                         skb_dst_drop(skb);
3096                 }
3097                 rcu_read_unlock_bh();
3098         }
3099
3100 done:
3101         cb->args[0] = h;
3102         cb->args[1] = idx;
3103         return skb->len;
3104 }
3105
3106 void ip_rt_multicast_event(struct in_device *in_dev)
3107 {
3108         rt_cache_flush(dev_net(in_dev->dev), 0);
3109 }
3110
3111 #ifdef CONFIG_SYSCTL
3112 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3113                                         void __user *buffer,
3114                                         size_t *lenp, loff_t *ppos)
3115 {
3116         if (write) {
3117                 int flush_delay;
3118                 ctl_table ctl;
3119                 struct net *net;
3120
3121                 memcpy(&ctl, __ctl, sizeof(ctl));
3122                 ctl.data = &flush_delay;
3123                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3124
3125                 net = (struct net *)__ctl->extra1;
3126                 rt_cache_flush(net, flush_delay);
3127                 return 0;
3128         }
3129
3130         return -EINVAL;
3131 }
3132
3133 static ctl_table ipv4_route_table[] = {
3134         {
3135                 .procname       = "gc_thresh",
3136                 .data           = &ipv4_dst_ops.gc_thresh,
3137                 .maxlen         = sizeof(int),
3138                 .mode           = 0644,
3139                 .proc_handler   = proc_dointvec,
3140         },
3141         {
3142                 .procname       = "max_size",
3143                 .data           = &ip_rt_max_size,
3144                 .maxlen         = sizeof(int),
3145                 .mode           = 0644,
3146                 .proc_handler   = proc_dointvec,
3147         },
3148         {
3149                 /*  Deprecated. Use gc_min_interval_ms */
3150
3151                 .procname       = "gc_min_interval",
3152                 .data           = &ip_rt_gc_min_interval,
3153                 .maxlen         = sizeof(int),
3154                 .mode           = 0644,
3155                 .proc_handler   = proc_dointvec_jiffies,
3156         },
3157         {
3158                 .procname       = "gc_min_interval_ms",
3159                 .data           = &ip_rt_gc_min_interval,
3160                 .maxlen         = sizeof(int),
3161                 .mode           = 0644,
3162                 .proc_handler   = proc_dointvec_ms_jiffies,
3163         },
3164         {
3165                 .procname       = "gc_timeout",
3166                 .data           = &ip_rt_gc_timeout,
3167                 .maxlen         = sizeof(int),
3168                 .mode           = 0644,
3169                 .proc_handler   = proc_dointvec_jiffies,
3170         },
3171         {
3172                 .procname       = "gc_interval",
3173                 .data           = &ip_rt_gc_interval,
3174                 .maxlen         = sizeof(int),
3175                 .mode           = 0644,
3176                 .proc_handler   = proc_dointvec_jiffies,
3177         },
3178         {
3179                 .procname       = "redirect_load",
3180                 .data           = &ip_rt_redirect_load,
3181                 .maxlen         = sizeof(int),
3182                 .mode           = 0644,
3183                 .proc_handler   = proc_dointvec,
3184         },
3185         {
3186                 .procname       = "redirect_number",
3187                 .data           = &ip_rt_redirect_number,
3188                 .maxlen         = sizeof(int),
3189                 .mode           = 0644,
3190                 .proc_handler   = proc_dointvec,
3191         },
3192         {
3193                 .procname       = "redirect_silence",
3194                 .data           = &ip_rt_redirect_silence,
3195                 .maxlen         = sizeof(int),
3196                 .mode           = 0644,
3197                 .proc_handler   = proc_dointvec,
3198         },
3199         {
3200                 .procname       = "error_cost",
3201                 .data           = &ip_rt_error_cost,
3202                 .maxlen         = sizeof(int),
3203                 .mode           = 0644,
3204                 .proc_handler   = proc_dointvec,
3205         },
3206         {
3207                 .procname       = "error_burst",
3208                 .data           = &ip_rt_error_burst,
3209                 .maxlen         = sizeof(int),
3210                 .mode           = 0644,
3211                 .proc_handler   = proc_dointvec,
3212         },
3213         {
3214                 .procname       = "gc_elasticity",
3215                 .data           = &ip_rt_gc_elasticity,
3216                 .maxlen         = sizeof(int),
3217                 .mode           = 0644,
3218                 .proc_handler   = proc_dointvec,
3219         },
3220         {
3221                 .procname       = "mtu_expires",
3222                 .data           = &ip_rt_mtu_expires,
3223                 .maxlen         = sizeof(int),
3224                 .mode           = 0644,
3225                 .proc_handler   = proc_dointvec_jiffies,
3226         },
3227         {
3228                 .procname       = "min_pmtu",
3229                 .data           = &ip_rt_min_pmtu,
3230                 .maxlen         = sizeof(int),
3231                 .mode           = 0644,
3232                 .proc_handler   = proc_dointvec,
3233         },
3234         {
3235                 .procname       = "min_adv_mss",
3236                 .data           = &ip_rt_min_advmss,
3237                 .maxlen         = sizeof(int),
3238                 .mode           = 0644,
3239                 .proc_handler   = proc_dointvec,
3240         },
3241         { }
3242 };
3243
3244 static struct ctl_table empty[1];
3245
3246 static struct ctl_table ipv4_skeleton[] =
3247 {
3248         { .procname = "route",
3249           .mode = 0555, .child = ipv4_route_table},
3250         { .procname = "neigh",
3251           .mode = 0555, .child = empty},
3252         { }
3253 };
3254
3255 static __net_initdata struct ctl_path ipv4_path[] = {
3256         { .procname = "net", },
3257         { .procname = "ipv4", },
3258         { },
3259 };
3260
3261 static struct ctl_table ipv4_route_flush_table[] = {
3262         {
3263                 .procname       = "flush",
3264                 .maxlen         = sizeof(int),
3265                 .mode           = 0200,
3266                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3267         },
3268         { },
3269 };
3270
3271 static __net_initdata struct ctl_path ipv4_route_path[] = {
3272         { .procname = "net", },
3273         { .procname = "ipv4", },
3274         { .procname = "route", },
3275         { },
3276 };
3277
3278 static __net_init int sysctl_route_net_init(struct net *net)
3279 {
3280         struct ctl_table *tbl;
3281
3282         tbl = ipv4_route_flush_table;
3283         if (!net_eq(net, &init_net)) {
3284                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3285                 if (tbl == NULL)
3286                         goto err_dup;
3287         }
3288         tbl[0].extra1 = net;
3289
3290         net->ipv4.route_hdr =
3291                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3292         if (net->ipv4.route_hdr == NULL)
3293                 goto err_reg;
3294         return 0;
3295
3296 err_reg:
3297         if (tbl != ipv4_route_flush_table)
3298                 kfree(tbl);
3299 err_dup:
3300         return -ENOMEM;
3301 }
3302
3303 static __net_exit void sysctl_route_net_exit(struct net *net)
3304 {
3305         struct ctl_table *tbl;
3306
3307         tbl = net->ipv4.route_hdr->ctl_table_arg;
3308         unregister_net_sysctl_table(net->ipv4.route_hdr);
3309         BUG_ON(tbl == ipv4_route_flush_table);
3310         kfree(tbl);
3311 }
3312
3313 static __net_initdata struct pernet_operations sysctl_route_ops = {
3314         .init = sysctl_route_net_init,
3315         .exit = sysctl_route_net_exit,
3316 };
3317 #endif
3318
3319 static __net_init int rt_genid_init(struct net *net)
3320 {
3321         get_random_bytes(&net->ipv4.rt_genid,
3322                          sizeof(net->ipv4.rt_genid));
3323         get_random_bytes(&net->ipv4.dev_addr_genid,
3324                          sizeof(net->ipv4.dev_addr_genid));
3325         return 0;
3326 }
3327
3328 static __net_initdata struct pernet_operations rt_genid_ops = {
3329         .init = rt_genid_init,
3330 };
3331
3332
3333 #ifdef CONFIG_IP_ROUTE_CLASSID
3334 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3335 #endif /* CONFIG_IP_ROUTE_CLASSID */
3336
3337 static __initdata unsigned long rhash_entries;
3338 static int __init set_rhash_entries(char *str)
3339 {
3340         if (!str)
3341                 return 0;
3342         rhash_entries = simple_strtoul(str, &str, 0);
3343         return 1;
3344 }
3345 __setup("rhash_entries=", set_rhash_entries);
3346
3347 int __init ip_rt_init(void)
3348 {
3349         int rc = 0;
3350
3351 #ifdef CONFIG_IP_ROUTE_CLASSID
3352         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3353         if (!ip_rt_acct)
3354                 panic("IP: failed to allocate ip_rt_acct\n");
3355 #endif
3356
3357         ipv4_dst_ops.kmem_cachep =
3358                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3359                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3360
3361         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3362
3363         if (dst_entries_init(&ipv4_dst_ops) < 0)
3364                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3365
3366         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3367                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3368
3369         rt_hash_table = (struct rt_hash_bucket *)
3370                 alloc_large_system_hash("IP route cache",
3371                                         sizeof(struct rt_hash_bucket),
3372                                         rhash_entries,
3373                                         (totalram_pages >= 128 * 1024) ?
3374                                         15 : 17,
3375                                         0,
3376                                         &rt_hash_log,
3377                                         &rt_hash_mask,
3378                                         rhash_entries ? 0 : 512 * 1024);
3379         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3380         rt_hash_lock_init();
3381
3382         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3383         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3384
3385         devinet_init();
3386         ip_fib_init();
3387
3388         if (ip_rt_proc_init())
3389                 printk(KERN_ERR "Unable to create route proc files\n");
3390 #ifdef CONFIG_XFRM
3391         xfrm_init();
3392         xfrm4_init(ip_rt_max_size);
3393 #endif
3394         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3395
3396 #ifdef CONFIG_SYSCTL
3397         register_pernet_subsys(&sysctl_route_ops);
3398 #endif
3399         register_pernet_subsys(&rt_genid_ops);
3400         return rc;
3401 }
3402
3403 #ifdef CONFIG_SYSCTL
3404 /*
3405  * We really need to sanitize the damn ipv4 init order, then all
3406  * this nonsense will go away.
3407  */
3408 void __init ip_static_sysctl_init(void)
3409 {
3410         register_sysctl_paths(ipv4_path, ipv4_skeleton);
3411 }
3412 #endif