mfd: Copy the device pointer to the twl4030-madc structure
[linux-2.6/linux-acpi-2.6/ibm-acpi-2.6.git] / net / ipv4 / route.c
blob4845bfe02d24edfeef0d1c1015ac4b1ac0074cc8
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <linux/prefetch.h>
95 #include <net/dst.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
98 #include <net/ip.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
103 #include <net/arp.h>
104 #include <net/tcp.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
112 #include <net/secure_seq.h>
114 #define RT_FL_TOS(oldflp4) \
115 ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
117 #define IP_MAX_MTU 0xFFF0
119 #define RT_GC_TIMEOUT (300*HZ)
121 static int ip_rt_max_size;
122 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
123 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
124 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
125 static int ip_rt_redirect_number __read_mostly = 9;
126 static int ip_rt_redirect_load __read_mostly = HZ / 50;
127 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
128 static int ip_rt_error_cost __read_mostly = HZ;
129 static int ip_rt_error_burst __read_mostly = 5 * HZ;
130 static int ip_rt_gc_elasticity __read_mostly = 8;
131 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
132 static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
133 static int ip_rt_min_advmss __read_mostly = 256;
134 static int rt_chain_length_max __read_mostly = 20;
136 static struct delayed_work expires_work;
137 static unsigned long expires_ljiffies;
140 * Interface to generic destination cache.
143 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
144 static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
145 static unsigned int ipv4_default_mtu(const struct dst_entry *dst);
146 static void ipv4_dst_destroy(struct dst_entry *dst);
147 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
148 static void ipv4_link_failure(struct sk_buff *skb);
149 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
150 static int rt_garbage_collect(struct dst_ops *ops);
152 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
153 int how)
157 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
159 struct rtable *rt = (struct rtable *) dst;
160 struct inet_peer *peer;
161 u32 *p = NULL;
163 if (!rt->peer)
164 rt_bind_peer(rt, rt->rt_dst, 1);
166 peer = rt->peer;
167 if (peer) {
168 u32 *old_p = __DST_METRICS_PTR(old);
169 unsigned long prev, new;
171 p = peer->metrics;
172 if (inet_metrics_new(peer))
173 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
175 new = (unsigned long) p;
176 prev = cmpxchg(&dst->_metrics, old, new);
178 if (prev != old) {
179 p = __DST_METRICS_PTR(prev);
180 if (prev & DST_METRICS_READ_ONLY)
181 p = NULL;
182 } else {
183 if (rt->fi) {
184 fib_info_put(rt->fi);
185 rt->fi = NULL;
189 return p;
192 static struct dst_ops ipv4_dst_ops = {
193 .family = AF_INET,
194 .protocol = cpu_to_be16(ETH_P_IP),
195 .gc = rt_garbage_collect,
196 .check = ipv4_dst_check,
197 .default_advmss = ipv4_default_advmss,
198 .default_mtu = ipv4_default_mtu,
199 .cow_metrics = ipv4_cow_metrics,
200 .destroy = ipv4_dst_destroy,
201 .ifdown = ipv4_dst_ifdown,
202 .negative_advice = ipv4_negative_advice,
203 .link_failure = ipv4_link_failure,
204 .update_pmtu = ip_rt_update_pmtu,
205 .local_out = __ip_local_out,
208 #define ECN_OR_COST(class) TC_PRIO_##class
210 const __u8 ip_tos2prio[16] = {
211 TC_PRIO_BESTEFFORT,
212 ECN_OR_COST(BESTEFFORT),
213 TC_PRIO_BESTEFFORT,
214 ECN_OR_COST(BESTEFFORT),
215 TC_PRIO_BULK,
216 ECN_OR_COST(BULK),
217 TC_PRIO_BULK,
218 ECN_OR_COST(BULK),
219 TC_PRIO_INTERACTIVE,
220 ECN_OR_COST(INTERACTIVE),
221 TC_PRIO_INTERACTIVE,
222 ECN_OR_COST(INTERACTIVE),
223 TC_PRIO_INTERACTIVE_BULK,
224 ECN_OR_COST(INTERACTIVE_BULK),
225 TC_PRIO_INTERACTIVE_BULK,
226 ECN_OR_COST(INTERACTIVE_BULK)
231 * Route cache.
234 /* The locking scheme is rather straight forward:
236 * 1) Read-Copy Update protects the buckets of the central route hash.
237 * 2) Only writers remove entries, and they hold the lock
238 * as they look at rtable reference counts.
239 * 3) Only readers acquire references to rtable entries,
240 * they do so with atomic increments and with the
241 * lock held.
244 struct rt_hash_bucket {
245 struct rtable __rcu *chain;
248 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
249 defined(CONFIG_PROVE_LOCKING)
251 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
252 * The size of this table is a power of two and depends on the number of CPUS.
253 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
255 #ifdef CONFIG_LOCKDEP
256 # define RT_HASH_LOCK_SZ 256
257 #else
258 # if NR_CPUS >= 32
259 # define RT_HASH_LOCK_SZ 4096
260 # elif NR_CPUS >= 16
261 # define RT_HASH_LOCK_SZ 2048
262 # elif NR_CPUS >= 8
263 # define RT_HASH_LOCK_SZ 1024
264 # elif NR_CPUS >= 4
265 # define RT_HASH_LOCK_SZ 512
266 # else
267 # define RT_HASH_LOCK_SZ 256
268 # endif
269 #endif
271 static spinlock_t *rt_hash_locks;
272 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
274 static __init void rt_hash_lock_init(void)
276 int i;
278 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
279 GFP_KERNEL);
280 if (!rt_hash_locks)
281 panic("IP: failed to allocate rt_hash_locks\n");
283 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
284 spin_lock_init(&rt_hash_locks[i]);
286 #else
287 # define rt_hash_lock_addr(slot) NULL
289 static inline void rt_hash_lock_init(void)
292 #endif
294 static struct rt_hash_bucket *rt_hash_table __read_mostly;
295 static unsigned rt_hash_mask __read_mostly;
296 static unsigned int rt_hash_log __read_mostly;
298 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
299 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
301 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
302 int genid)
304 return jhash_3words((__force u32)daddr, (__force u32)saddr,
305 idx, genid)
306 & rt_hash_mask;
309 static inline int rt_genid(struct net *net)
311 return atomic_read(&net->ipv4.rt_genid);
314 #ifdef CONFIG_PROC_FS
315 struct rt_cache_iter_state {
316 struct seq_net_private p;
317 int bucket;
318 int genid;
321 static struct rtable *rt_cache_get_first(struct seq_file *seq)
323 struct rt_cache_iter_state *st = seq->private;
324 struct rtable *r = NULL;
326 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
327 if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
328 continue;
329 rcu_read_lock_bh();
330 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
331 while (r) {
332 if (dev_net(r->dst.dev) == seq_file_net(seq) &&
333 r->rt_genid == st->genid)
334 return r;
335 r = rcu_dereference_bh(r->dst.rt_next);
337 rcu_read_unlock_bh();
339 return r;
342 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
343 struct rtable *r)
345 struct rt_cache_iter_state *st = seq->private;
347 r = rcu_dereference_bh(r->dst.rt_next);
348 while (!r) {
349 rcu_read_unlock_bh();
350 do {
351 if (--st->bucket < 0)
352 return NULL;
353 } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
354 rcu_read_lock_bh();
355 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
357 return r;
360 static struct rtable *rt_cache_get_next(struct seq_file *seq,
361 struct rtable *r)
363 struct rt_cache_iter_state *st = seq->private;
364 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
365 if (dev_net(r->dst.dev) != seq_file_net(seq))
366 continue;
367 if (r->rt_genid == st->genid)
368 break;
370 return r;
373 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
375 struct rtable *r = rt_cache_get_first(seq);
377 if (r)
378 while (pos && (r = rt_cache_get_next(seq, r)))
379 --pos;
380 return pos ? NULL : r;
383 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
385 struct rt_cache_iter_state *st = seq->private;
386 if (*pos)
387 return rt_cache_get_idx(seq, *pos - 1);
388 st->genid = rt_genid(seq_file_net(seq));
389 return SEQ_START_TOKEN;
392 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
394 struct rtable *r;
396 if (v == SEQ_START_TOKEN)
397 r = rt_cache_get_first(seq);
398 else
399 r = rt_cache_get_next(seq, v);
400 ++*pos;
401 return r;
404 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
406 if (v && v != SEQ_START_TOKEN)
407 rcu_read_unlock_bh();
410 static int rt_cache_seq_show(struct seq_file *seq, void *v)
412 if (v == SEQ_START_TOKEN)
413 seq_printf(seq, "%-127s\n",
414 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
415 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
416 "HHUptod\tSpecDst");
417 else {
418 struct rtable *r = v;
419 int len;
421 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
422 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
423 r->dst.dev ? r->dst.dev->name : "*",
424 (__force u32)r->rt_dst,
425 (__force u32)r->rt_gateway,
426 r->rt_flags, atomic_read(&r->dst.__refcnt),
427 r->dst.__use, 0, (__force u32)r->rt_src,
428 dst_metric_advmss(&r->dst) + 40,
429 dst_metric(&r->dst, RTAX_WINDOW),
430 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
431 dst_metric(&r->dst, RTAX_RTTVAR)),
432 r->rt_key_tos,
433 r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
434 r->dst.hh ? (r->dst.hh->hh_output ==
435 dev_queue_xmit) : 0,
436 r->rt_spec_dst, &len);
438 seq_printf(seq, "%*s\n", 127 - len, "");
440 return 0;
443 static const struct seq_operations rt_cache_seq_ops = {
444 .start = rt_cache_seq_start,
445 .next = rt_cache_seq_next,
446 .stop = rt_cache_seq_stop,
447 .show = rt_cache_seq_show,
450 static int rt_cache_seq_open(struct inode *inode, struct file *file)
452 return seq_open_net(inode, file, &rt_cache_seq_ops,
453 sizeof(struct rt_cache_iter_state));
456 static const struct file_operations rt_cache_seq_fops = {
457 .owner = THIS_MODULE,
458 .open = rt_cache_seq_open,
459 .read = seq_read,
460 .llseek = seq_lseek,
461 .release = seq_release_net,
465 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
467 int cpu;
469 if (*pos == 0)
470 return SEQ_START_TOKEN;
472 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
473 if (!cpu_possible(cpu))
474 continue;
475 *pos = cpu+1;
476 return &per_cpu(rt_cache_stat, cpu);
478 return NULL;
481 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
483 int cpu;
485 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
486 if (!cpu_possible(cpu))
487 continue;
488 *pos = cpu+1;
489 return &per_cpu(rt_cache_stat, cpu);
491 return NULL;
495 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
500 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
502 struct rt_cache_stat *st = v;
504 if (v == SEQ_START_TOKEN) {
505 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
506 return 0;
509 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
510 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
511 dst_entries_get_slow(&ipv4_dst_ops),
512 st->in_hit,
513 st->in_slow_tot,
514 st->in_slow_mc,
515 st->in_no_route,
516 st->in_brd,
517 st->in_martian_dst,
518 st->in_martian_src,
520 st->out_hit,
521 st->out_slow_tot,
522 st->out_slow_mc,
524 st->gc_total,
525 st->gc_ignored,
526 st->gc_goal_miss,
527 st->gc_dst_overflow,
528 st->in_hlist_search,
529 st->out_hlist_search
531 return 0;
534 static const struct seq_operations rt_cpu_seq_ops = {
535 .start = rt_cpu_seq_start,
536 .next = rt_cpu_seq_next,
537 .stop = rt_cpu_seq_stop,
538 .show = rt_cpu_seq_show,
542 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
544 return seq_open(file, &rt_cpu_seq_ops);
547 static const struct file_operations rt_cpu_seq_fops = {
548 .owner = THIS_MODULE,
549 .open = rt_cpu_seq_open,
550 .read = seq_read,
551 .llseek = seq_lseek,
552 .release = seq_release,
555 #ifdef CONFIG_IP_ROUTE_CLASSID
556 static int rt_acct_proc_show(struct seq_file *m, void *v)
558 struct ip_rt_acct *dst, *src;
559 unsigned int i, j;
561 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
562 if (!dst)
563 return -ENOMEM;
565 for_each_possible_cpu(i) {
566 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
567 for (j = 0; j < 256; j++) {
568 dst[j].o_bytes += src[j].o_bytes;
569 dst[j].o_packets += src[j].o_packets;
570 dst[j].i_bytes += src[j].i_bytes;
571 dst[j].i_packets += src[j].i_packets;
575 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
576 kfree(dst);
577 return 0;
580 static int rt_acct_proc_open(struct inode *inode, struct file *file)
582 return single_open(file, rt_acct_proc_show, NULL);
585 static const struct file_operations rt_acct_proc_fops = {
586 .owner = THIS_MODULE,
587 .open = rt_acct_proc_open,
588 .read = seq_read,
589 .llseek = seq_lseek,
590 .release = single_release,
592 #endif
594 static int __net_init ip_rt_do_proc_init(struct net *net)
596 struct proc_dir_entry *pde;
598 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
599 &rt_cache_seq_fops);
600 if (!pde)
601 goto err1;
603 pde = proc_create("rt_cache", S_IRUGO,
604 net->proc_net_stat, &rt_cpu_seq_fops);
605 if (!pde)
606 goto err2;
608 #ifdef CONFIG_IP_ROUTE_CLASSID
609 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
610 if (!pde)
611 goto err3;
612 #endif
613 return 0;
615 #ifdef CONFIG_IP_ROUTE_CLASSID
616 err3:
617 remove_proc_entry("rt_cache", net->proc_net_stat);
618 #endif
619 err2:
620 remove_proc_entry("rt_cache", net->proc_net);
621 err1:
622 return -ENOMEM;
625 static void __net_exit ip_rt_do_proc_exit(struct net *net)
627 remove_proc_entry("rt_cache", net->proc_net_stat);
628 remove_proc_entry("rt_cache", net->proc_net);
629 #ifdef CONFIG_IP_ROUTE_CLASSID
630 remove_proc_entry("rt_acct", net->proc_net);
631 #endif
634 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
635 .init = ip_rt_do_proc_init,
636 .exit = ip_rt_do_proc_exit,
639 static int __init ip_rt_proc_init(void)
641 return register_pernet_subsys(&ip_rt_proc_ops);
644 #else
645 static inline int ip_rt_proc_init(void)
647 return 0;
649 #endif /* CONFIG_PROC_FS */
651 static inline void rt_free(struct rtable *rt)
653 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
656 static inline void rt_drop(struct rtable *rt)
658 ip_rt_put(rt);
659 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
662 static inline int rt_fast_clean(struct rtable *rth)
664 /* Kill broadcast/multicast entries very aggresively, if they
665 collide in hash table with more useful entries */
666 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
667 rt_is_input_route(rth) && rth->dst.rt_next;
670 static inline int rt_valuable(struct rtable *rth)
672 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
673 (rth->peer && rth->peer->pmtu_expires);
676 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
678 unsigned long age;
679 int ret = 0;
681 if (atomic_read(&rth->dst.__refcnt))
682 goto out;
684 age = jiffies - rth->dst.lastuse;
685 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
686 (age <= tmo2 && rt_valuable(rth)))
687 goto out;
688 ret = 1;
689 out: return ret;
692 /* Bits of score are:
693 * 31: very valuable
694 * 30: not quite useless
695 * 29..0: usage counter
697 static inline u32 rt_score(struct rtable *rt)
699 u32 score = jiffies - rt->dst.lastuse;
701 score = ~score & ~(3<<30);
703 if (rt_valuable(rt))
704 score |= (1<<31);
706 if (rt_is_output_route(rt) ||
707 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
708 score |= (1<<30);
710 return score;
713 static inline bool rt_caching(const struct net *net)
715 return net->ipv4.current_rt_cache_rebuild_count <=
716 net->ipv4.sysctl_rt_cache_rebuild_count;
719 static inline bool compare_hash_inputs(const struct rtable *rt1,
720 const struct rtable *rt2)
722 return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
723 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
724 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
727 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
729 return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
730 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
731 (rt1->rt_mark ^ rt2->rt_mark) |
732 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
733 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
734 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
737 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
739 return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
742 static inline int rt_is_expired(struct rtable *rth)
744 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
748 * Perform a full scan of hash table and free all entries.
749 * Can be called by a softirq or a process.
750 * In the later case, we want to be reschedule if necessary
752 static void rt_do_flush(struct net *net, int process_context)
754 unsigned int i;
755 struct rtable *rth, *next;
757 for (i = 0; i <= rt_hash_mask; i++) {
758 struct rtable __rcu **pprev;
759 struct rtable *list;
761 if (process_context && need_resched())
762 cond_resched();
763 rth = rcu_dereference_raw(rt_hash_table[i].chain);
764 if (!rth)
765 continue;
767 spin_lock_bh(rt_hash_lock_addr(i));
769 list = NULL;
770 pprev = &rt_hash_table[i].chain;
771 rth = rcu_dereference_protected(*pprev,
772 lockdep_is_held(rt_hash_lock_addr(i)));
774 while (rth) {
775 next = rcu_dereference_protected(rth->dst.rt_next,
776 lockdep_is_held(rt_hash_lock_addr(i)));
778 if (!net ||
779 net_eq(dev_net(rth->dst.dev), net)) {
780 rcu_assign_pointer(*pprev, next);
781 rcu_assign_pointer(rth->dst.rt_next, list);
782 list = rth;
783 } else {
784 pprev = &rth->dst.rt_next;
786 rth = next;
789 spin_unlock_bh(rt_hash_lock_addr(i));
791 for (; list; list = next) {
792 next = rcu_dereference_protected(list->dst.rt_next, 1);
793 rt_free(list);
799 * While freeing expired entries, we compute average chain length
800 * and standard deviation, using fixed-point arithmetic.
801 * This to have an estimation of rt_chain_length_max
802 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
803 * We use 3 bits for frational part, and 29 (or 61) for magnitude.
806 #define FRACT_BITS 3
807 #define ONE (1UL << FRACT_BITS)
810 * Given a hash chain and an item in this hash chain,
811 * find if a previous entry has the same hash_inputs
812 * (but differs on tos, mark or oif)
813 * Returns 0 if an alias is found.
814 * Returns ONE if rth has no alias before itself.
816 static int has_noalias(const struct rtable *head, const struct rtable *rth)
818 const struct rtable *aux = head;
820 while (aux != rth) {
821 if (compare_hash_inputs(aux, rth))
822 return 0;
823 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
825 return ONE;
828 static void rt_check_expire(void)
830 static unsigned int rover;
831 unsigned int i = rover, goal;
832 struct rtable *rth;
833 struct rtable __rcu **rthp;
834 unsigned long samples = 0;
835 unsigned long sum = 0, sum2 = 0;
836 unsigned long delta;
837 u64 mult;
839 delta = jiffies - expires_ljiffies;
840 expires_ljiffies = jiffies;
841 mult = ((u64)delta) << rt_hash_log;
842 if (ip_rt_gc_timeout > 1)
843 do_div(mult, ip_rt_gc_timeout);
844 goal = (unsigned int)mult;
845 if (goal > rt_hash_mask)
846 goal = rt_hash_mask + 1;
847 for (; goal > 0; goal--) {
848 unsigned long tmo = ip_rt_gc_timeout;
849 unsigned long length;
851 i = (i + 1) & rt_hash_mask;
852 rthp = &rt_hash_table[i].chain;
854 if (need_resched())
855 cond_resched();
857 samples++;
859 if (rcu_dereference_raw(*rthp) == NULL)
860 continue;
861 length = 0;
862 spin_lock_bh(rt_hash_lock_addr(i));
863 while ((rth = rcu_dereference_protected(*rthp,
864 lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
865 prefetch(rth->dst.rt_next);
866 if (rt_is_expired(rth)) {
867 *rthp = rth->dst.rt_next;
868 rt_free(rth);
869 continue;
871 if (rth->dst.expires) {
872 /* Entry is expired even if it is in use */
873 if (time_before_eq(jiffies, rth->dst.expires)) {
874 nofree:
875 tmo >>= 1;
876 rthp = &rth->dst.rt_next;
878 * We only count entries on
879 * a chain with equal hash inputs once
880 * so that entries for different QOS
881 * levels, and other non-hash input
882 * attributes don't unfairly skew
883 * the length computation
885 length += has_noalias(rt_hash_table[i].chain, rth);
886 continue;
888 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
889 goto nofree;
891 /* Cleanup aged off entries. */
892 *rthp = rth->dst.rt_next;
893 rt_free(rth);
895 spin_unlock_bh(rt_hash_lock_addr(i));
896 sum += length;
897 sum2 += length*length;
899 if (samples) {
900 unsigned long avg = sum / samples;
901 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
902 rt_chain_length_max = max_t(unsigned long,
903 ip_rt_gc_elasticity,
904 (avg + 4*sd) >> FRACT_BITS);
906 rover = i;
910 * rt_worker_func() is run in process context.
911 * we call rt_check_expire() to scan part of the hash table
913 static void rt_worker_func(struct work_struct *work)
915 rt_check_expire();
916 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
920 * Perturbation of rt_genid by a small quantity [1..256]
921 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
922 * many times (2^24) without giving recent rt_genid.
923 * Jenkins hash is strong enough that litle changes of rt_genid are OK.
925 static void rt_cache_invalidate(struct net *net)
927 unsigned char shuffle;
929 get_random_bytes(&shuffle, sizeof(shuffle));
930 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
934 * delay < 0 : invalidate cache (fast : entries will be deleted later)
935 * delay >= 0 : invalidate & flush cache (can be long)
937 void rt_cache_flush(struct net *net, int delay)
939 rt_cache_invalidate(net);
940 if (delay >= 0)
941 rt_do_flush(net, !in_softirq());
944 /* Flush previous cache invalidated entries from the cache */
945 void rt_cache_flush_batch(struct net *net)
947 rt_do_flush(net, !in_softirq());
950 static void rt_emergency_hash_rebuild(struct net *net)
952 if (net_ratelimit())
953 printk(KERN_WARNING "Route hash chain too long!\n");
954 rt_cache_invalidate(net);
958 Short description of GC goals.
960 We want to build algorithm, which will keep routing cache
961 at some equilibrium point, when number of aged off entries
962 is kept approximately equal to newly generated ones.
964 Current expiration strength is variable "expire".
965 We try to adjust it dynamically, so that if networking
966 is idle expires is large enough to keep enough of warm entries,
967 and when load increases it reduces to limit cache size.
970 static int rt_garbage_collect(struct dst_ops *ops)
972 static unsigned long expire = RT_GC_TIMEOUT;
973 static unsigned long last_gc;
974 static int rover;
975 static int equilibrium;
976 struct rtable *rth;
977 struct rtable __rcu **rthp;
978 unsigned long now = jiffies;
979 int goal;
980 int entries = dst_entries_get_fast(&ipv4_dst_ops);
983 * Garbage collection is pretty expensive,
984 * do not make it too frequently.
987 RT_CACHE_STAT_INC(gc_total);
989 if (now - last_gc < ip_rt_gc_min_interval &&
990 entries < ip_rt_max_size) {
991 RT_CACHE_STAT_INC(gc_ignored);
992 goto out;
995 entries = dst_entries_get_slow(&ipv4_dst_ops);
996 /* Calculate number of entries, which we want to expire now. */
997 goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
998 if (goal <= 0) {
999 if (equilibrium < ipv4_dst_ops.gc_thresh)
1000 equilibrium = ipv4_dst_ops.gc_thresh;
1001 goal = entries - equilibrium;
1002 if (goal > 0) {
1003 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1004 goal = entries - equilibrium;
1006 } else {
1007 /* We are in dangerous area. Try to reduce cache really
1008 * aggressively.
1010 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1011 equilibrium = entries - goal;
1014 if (now - last_gc >= ip_rt_gc_min_interval)
1015 last_gc = now;
1017 if (goal <= 0) {
1018 equilibrium += goal;
1019 goto work_done;
1022 do {
1023 int i, k;
1025 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1026 unsigned long tmo = expire;
1028 k = (k + 1) & rt_hash_mask;
1029 rthp = &rt_hash_table[k].chain;
1030 spin_lock_bh(rt_hash_lock_addr(k));
1031 while ((rth = rcu_dereference_protected(*rthp,
1032 lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1033 if (!rt_is_expired(rth) &&
1034 !rt_may_expire(rth, tmo, expire)) {
1035 tmo >>= 1;
1036 rthp = &rth->dst.rt_next;
1037 continue;
1039 *rthp = rth->dst.rt_next;
1040 rt_free(rth);
1041 goal--;
1043 spin_unlock_bh(rt_hash_lock_addr(k));
1044 if (goal <= 0)
1045 break;
1047 rover = k;
1049 if (goal <= 0)
1050 goto work_done;
1052 /* Goal is not achieved. We stop process if:
1054 - if expire reduced to zero. Otherwise, expire is halfed.
1055 - if table is not full.
1056 - if we are called from interrupt.
1057 - jiffies check is just fallback/debug loop breaker.
1058 We will not spin here for long time in any case.
1061 RT_CACHE_STAT_INC(gc_goal_miss);
1063 if (expire == 0)
1064 break;
1066 expire >>= 1;
1068 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1069 goto out;
1070 } while (!in_softirq() && time_before_eq(jiffies, now));
1072 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1073 goto out;
1074 if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1075 goto out;
1076 if (net_ratelimit())
1077 printk(KERN_WARNING "dst cache overflow\n");
1078 RT_CACHE_STAT_INC(gc_dst_overflow);
1079 return 1;
1081 work_done:
1082 expire += ip_rt_gc_min_interval;
1083 if (expire > ip_rt_gc_timeout ||
1084 dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1085 dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1086 expire = ip_rt_gc_timeout;
1087 out: return 0;
1091 * Returns number of entries in a hash chain that have different hash_inputs
1093 static int slow_chain_length(const struct rtable *head)
1095 int length = 0;
1096 const struct rtable *rth = head;
1098 while (rth) {
1099 length += has_noalias(head, rth);
1100 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1102 return length >> FRACT_BITS;
1105 static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1106 struct sk_buff *skb, int ifindex)
1108 struct rtable *rth, *cand;
1109 struct rtable __rcu **rthp, **candp;
1110 unsigned long now;
1111 u32 min_score;
1112 int chain_length;
1113 int attempts = !in_softirq();
1115 restart:
1116 chain_length = 0;
1117 min_score = ~(u32)0;
1118 cand = NULL;
1119 candp = NULL;
1120 now = jiffies;
1122 if (!rt_caching(dev_net(rt->dst.dev))) {
1124 * If we're not caching, just tell the caller we
1125 * were successful and don't touch the route. The
1126 * caller hold the sole reference to the cache entry, and
1127 * it will be released when the caller is done with it.
1128 * If we drop it here, the callers have no way to resolve routes
1129 * when we're not caching. Instead, just point *rp at rt, so
1130 * the caller gets a single use out of the route
1131 * Note that we do rt_free on this new route entry, so that
1132 * once its refcount hits zero, we are still able to reap it
1133 * (Thanks Alexey)
1134 * Note: To avoid expensive rcu stuff for this uncached dst,
1135 * we set DST_NOCACHE so that dst_release() can free dst without
1136 * waiting a grace period.
1139 rt->dst.flags |= DST_NOCACHE;
1140 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1141 int err = arp_bind_neighbour(&rt->dst);
1142 if (err) {
1143 if (net_ratelimit())
1144 printk(KERN_WARNING
1145 "Neighbour table failure & not caching routes.\n");
1146 ip_rt_put(rt);
1147 return ERR_PTR(err);
1151 goto skip_hashing;
1154 rthp = &rt_hash_table[hash].chain;
1156 spin_lock_bh(rt_hash_lock_addr(hash));
1157 while ((rth = rcu_dereference_protected(*rthp,
1158 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1159 if (rt_is_expired(rth)) {
1160 *rthp = rth->dst.rt_next;
1161 rt_free(rth);
1162 continue;
1164 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1165 /* Put it first */
1166 *rthp = rth->dst.rt_next;
1168 * Since lookup is lockfree, the deletion
1169 * must be visible to another weakly ordered CPU before
1170 * the insertion at the start of the hash chain.
1172 rcu_assign_pointer(rth->dst.rt_next,
1173 rt_hash_table[hash].chain);
1175 * Since lookup is lockfree, the update writes
1176 * must be ordered for consistency on SMP.
1178 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1180 dst_use(&rth->dst, now);
1181 spin_unlock_bh(rt_hash_lock_addr(hash));
1183 rt_drop(rt);
1184 if (skb)
1185 skb_dst_set(skb, &rth->dst);
1186 return rth;
1189 if (!atomic_read(&rth->dst.__refcnt)) {
1190 u32 score = rt_score(rth);
1192 if (score <= min_score) {
1193 cand = rth;
1194 candp = rthp;
1195 min_score = score;
1199 chain_length++;
1201 rthp = &rth->dst.rt_next;
1204 if (cand) {
1205 /* ip_rt_gc_elasticity used to be average length of chain
1206 * length, when exceeded gc becomes really aggressive.
1208 * The second limit is less certain. At the moment it allows
1209 * only 2 entries per bucket. We will see.
1211 if (chain_length > ip_rt_gc_elasticity) {
1212 *candp = cand->dst.rt_next;
1213 rt_free(cand);
1215 } else {
1216 if (chain_length > rt_chain_length_max &&
1217 slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1218 struct net *net = dev_net(rt->dst.dev);
1219 int num = ++net->ipv4.current_rt_cache_rebuild_count;
1220 if (!rt_caching(net)) {
1221 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1222 rt->dst.dev->name, num);
1224 rt_emergency_hash_rebuild(net);
1225 spin_unlock_bh(rt_hash_lock_addr(hash));
1227 hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1228 ifindex, rt_genid(net));
1229 goto restart;
1233 /* Try to bind route to arp only if it is output
1234 route or unicast forwarding path.
1236 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1237 int err = arp_bind_neighbour(&rt->dst);
1238 if (err) {
1239 spin_unlock_bh(rt_hash_lock_addr(hash));
1241 if (err != -ENOBUFS) {
1242 rt_drop(rt);
1243 return ERR_PTR(err);
1246 /* Neighbour tables are full and nothing
1247 can be released. Try to shrink route cache,
1248 it is most likely it holds some neighbour records.
1250 if (attempts-- > 0) {
1251 int saved_elasticity = ip_rt_gc_elasticity;
1252 int saved_int = ip_rt_gc_min_interval;
1253 ip_rt_gc_elasticity = 1;
1254 ip_rt_gc_min_interval = 0;
1255 rt_garbage_collect(&ipv4_dst_ops);
1256 ip_rt_gc_min_interval = saved_int;
1257 ip_rt_gc_elasticity = saved_elasticity;
1258 goto restart;
1261 if (net_ratelimit())
1262 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1263 rt_drop(rt);
1264 return ERR_PTR(-ENOBUFS);
1268 rt->dst.rt_next = rt_hash_table[hash].chain;
1271 * Since lookup is lockfree, we must make sure
1272 * previous writes to rt are committed to memory
1273 * before making rt visible to other CPUS.
1275 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1277 spin_unlock_bh(rt_hash_lock_addr(hash));
1279 skip_hashing:
1280 if (skb)
1281 skb_dst_set(skb, &rt->dst);
1282 return rt;
1285 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1287 static u32 rt_peer_genid(void)
1289 return atomic_read(&__rt_peer_genid);
1292 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1294 struct inet_peer *peer;
1296 peer = inet_getpeer_v4(daddr, create);
1298 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1299 inet_putpeer(peer);
1300 else
1301 rt->rt_peer_genid = rt_peer_genid();
1305 * Peer allocation may fail only in serious out-of-memory conditions. However
1306 * we still can generate some output.
1307 * Random ID selection looks a bit dangerous because we have no chances to
1308 * select ID being unique in a reasonable period of time.
1309 * But broken packet identifier may be better than no packet at all.
1311 static void ip_select_fb_ident(struct iphdr *iph)
1313 static DEFINE_SPINLOCK(ip_fb_id_lock);
1314 static u32 ip_fallback_id;
1315 u32 salt;
1317 spin_lock_bh(&ip_fb_id_lock);
1318 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1319 iph->id = htons(salt & 0xFFFF);
1320 ip_fallback_id = salt;
1321 spin_unlock_bh(&ip_fb_id_lock);
1324 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1326 struct rtable *rt = (struct rtable *) dst;
1328 if (rt) {
1329 if (rt->peer == NULL)
1330 rt_bind_peer(rt, rt->rt_dst, 1);
1332 /* If peer is attached to destination, it is never detached,
1333 so that we need not to grab a lock to dereference it.
1335 if (rt->peer) {
1336 iph->id = htons(inet_getid(rt->peer, more));
1337 return;
1339 } else
1340 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1341 __builtin_return_address(0));
1343 ip_select_fb_ident(iph);
1345 EXPORT_SYMBOL(__ip_select_ident);
1347 static void rt_del(unsigned hash, struct rtable *rt)
1349 struct rtable __rcu **rthp;
1350 struct rtable *aux;
1352 rthp = &rt_hash_table[hash].chain;
1353 spin_lock_bh(rt_hash_lock_addr(hash));
1354 ip_rt_put(rt);
1355 while ((aux = rcu_dereference_protected(*rthp,
1356 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1357 if (aux == rt || rt_is_expired(aux)) {
1358 *rthp = aux->dst.rt_next;
1359 rt_free(aux);
1360 continue;
1362 rthp = &aux->dst.rt_next;
1364 spin_unlock_bh(rt_hash_lock_addr(hash));
1367 /* called in rcu_read_lock() section */
1368 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1369 __be32 saddr, struct net_device *dev)
1371 struct in_device *in_dev = __in_dev_get_rcu(dev);
1372 struct inet_peer *peer;
1373 struct net *net;
1375 if (!in_dev)
1376 return;
1378 net = dev_net(dev);
1379 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1380 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1381 ipv4_is_zeronet(new_gw))
1382 goto reject_redirect;
1384 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1385 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1386 goto reject_redirect;
1387 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1388 goto reject_redirect;
1389 } else {
1390 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1391 goto reject_redirect;
1394 peer = inet_getpeer_v4(daddr, 1);
1395 if (peer) {
1396 peer->redirect_learned.a4 = new_gw;
1398 inet_putpeer(peer);
1400 atomic_inc(&__rt_peer_genid);
1402 return;
1404 reject_redirect:
1405 #ifdef CONFIG_IP_ROUTE_VERBOSE
1406 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1407 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1408 " Advised path = %pI4 -> %pI4\n",
1409 &old_gw, dev->name, &new_gw,
1410 &saddr, &daddr);
1411 #endif
1415 static bool peer_pmtu_expired(struct inet_peer *peer)
1417 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1419 return orig &&
1420 time_after_eq(jiffies, orig) &&
1421 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1424 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1426 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1428 return orig &&
1429 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1432 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1434 struct rtable *rt = (struct rtable *)dst;
1435 struct dst_entry *ret = dst;
1437 if (rt) {
1438 if (dst->obsolete > 0) {
1439 ip_rt_put(rt);
1440 ret = NULL;
1441 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1442 unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1443 rt->rt_oif,
1444 rt_genid(dev_net(dst->dev)));
1445 rt_del(hash, rt);
1446 ret = NULL;
1447 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1448 dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1451 return ret;
1455 * Algorithm:
1456 * 1. The first ip_rt_redirect_number redirects are sent
1457 * with exponential backoff, then we stop sending them at all,
1458 * assuming that the host ignores our redirects.
1459 * 2. If we did not see packets requiring redirects
1460 * during ip_rt_redirect_silence, we assume that the host
1461 * forgot redirected route and start to send redirects again.
1463 * This algorithm is much cheaper and more intelligent than dumb load limiting
1464 * in icmp.c.
1466 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1467 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1470 void ip_rt_send_redirect(struct sk_buff *skb)
1472 struct rtable *rt = skb_rtable(skb);
1473 struct in_device *in_dev;
1474 struct inet_peer *peer;
1475 int log_martians;
1477 rcu_read_lock();
1478 in_dev = __in_dev_get_rcu(rt->dst.dev);
1479 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1480 rcu_read_unlock();
1481 return;
1483 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1484 rcu_read_unlock();
1486 if (!rt->peer)
1487 rt_bind_peer(rt, rt->rt_dst, 1);
1488 peer = rt->peer;
1489 if (!peer) {
1490 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1491 return;
1494 /* No redirected packets during ip_rt_redirect_silence;
1495 * reset the algorithm.
1497 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1498 peer->rate_tokens = 0;
1500 /* Too many ignored redirects; do not send anything
1501 * set dst.rate_last to the last seen redirected packet.
1503 if (peer->rate_tokens >= ip_rt_redirect_number) {
1504 peer->rate_last = jiffies;
1505 return;
1508 /* Check for load limit; set rate_last to the latest sent
1509 * redirect.
1511 if (peer->rate_tokens == 0 ||
1512 time_after(jiffies,
1513 (peer->rate_last +
1514 (ip_rt_redirect_load << peer->rate_tokens)))) {
1515 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1516 peer->rate_last = jiffies;
1517 ++peer->rate_tokens;
1518 #ifdef CONFIG_IP_ROUTE_VERBOSE
1519 if (log_martians &&
1520 peer->rate_tokens == ip_rt_redirect_number &&
1521 net_ratelimit())
1522 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1523 &ip_hdr(skb)->saddr, rt->rt_iif,
1524 &rt->rt_dst, &rt->rt_gateway);
1525 #endif
1529 static int ip_error(struct sk_buff *skb)
1531 struct rtable *rt = skb_rtable(skb);
1532 struct inet_peer *peer;
1533 unsigned long now;
1534 bool send;
1535 int code;
1537 switch (rt->dst.error) {
1538 case EINVAL:
1539 default:
1540 goto out;
1541 case EHOSTUNREACH:
1542 code = ICMP_HOST_UNREACH;
1543 break;
1544 case ENETUNREACH:
1545 code = ICMP_NET_UNREACH;
1546 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1547 IPSTATS_MIB_INNOROUTES);
1548 break;
1549 case EACCES:
1550 code = ICMP_PKT_FILTERED;
1551 break;
1554 if (!rt->peer)
1555 rt_bind_peer(rt, rt->rt_dst, 1);
1556 peer = rt->peer;
1558 send = true;
1559 if (peer) {
1560 now = jiffies;
1561 peer->rate_tokens += now - peer->rate_last;
1562 if (peer->rate_tokens > ip_rt_error_burst)
1563 peer->rate_tokens = ip_rt_error_burst;
1564 peer->rate_last = now;
1565 if (peer->rate_tokens >= ip_rt_error_cost)
1566 peer->rate_tokens -= ip_rt_error_cost;
1567 else
1568 send = false;
1570 if (send)
1571 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1573 out: kfree_skb(skb);
1574 return 0;
1578 * The last two values are not from the RFC but
1579 * are needed for AMPRnet AX.25 paths.
1582 static const unsigned short mtu_plateau[] =
1583 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1585 static inline unsigned short guess_mtu(unsigned short old_mtu)
1587 int i;
1589 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1590 if (old_mtu > mtu_plateau[i])
1591 return mtu_plateau[i];
1592 return 68;
1595 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1596 unsigned short new_mtu,
1597 struct net_device *dev)
1599 unsigned short old_mtu = ntohs(iph->tot_len);
1600 unsigned short est_mtu = 0;
1601 struct inet_peer *peer;
1603 peer = inet_getpeer_v4(iph->daddr, 1);
1604 if (peer) {
1605 unsigned short mtu = new_mtu;
1607 if (new_mtu < 68 || new_mtu >= old_mtu) {
1608 /* BSD 4.2 derived systems incorrectly adjust
1609 * tot_len by the IP header length, and report
1610 * a zero MTU in the ICMP message.
1612 if (mtu == 0 &&
1613 old_mtu >= 68 + (iph->ihl << 2))
1614 old_mtu -= iph->ihl << 2;
1615 mtu = guess_mtu(old_mtu);
1618 if (mtu < ip_rt_min_pmtu)
1619 mtu = ip_rt_min_pmtu;
1620 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1621 unsigned long pmtu_expires;
1623 pmtu_expires = jiffies + ip_rt_mtu_expires;
1624 if (!pmtu_expires)
1625 pmtu_expires = 1UL;
1627 est_mtu = mtu;
1628 peer->pmtu_learned = mtu;
1629 peer->pmtu_expires = pmtu_expires;
1632 inet_putpeer(peer);
1634 atomic_inc(&__rt_peer_genid);
1636 return est_mtu ? : new_mtu;
1639 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1641 unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1643 if (!expires)
1644 return;
1645 if (time_before(jiffies, expires)) {
1646 u32 orig_dst_mtu = dst_mtu(dst);
1647 if (peer->pmtu_learned < orig_dst_mtu) {
1648 if (!peer->pmtu_orig)
1649 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1650 dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1652 } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1653 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1656 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1658 struct rtable *rt = (struct rtable *) dst;
1659 struct inet_peer *peer;
1661 dst_confirm(dst);
1663 if (!rt->peer)
1664 rt_bind_peer(rt, rt->rt_dst, 1);
1665 peer = rt->peer;
1666 if (peer) {
1667 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1669 if (mtu < ip_rt_min_pmtu)
1670 mtu = ip_rt_min_pmtu;
1671 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1673 pmtu_expires = jiffies + ip_rt_mtu_expires;
1674 if (!pmtu_expires)
1675 pmtu_expires = 1UL;
1677 peer->pmtu_learned = mtu;
1678 peer->pmtu_expires = pmtu_expires;
1680 atomic_inc(&__rt_peer_genid);
1681 rt->rt_peer_genid = rt_peer_genid();
1683 check_peer_pmtu(dst, peer);
1687 static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1689 struct rtable *rt = (struct rtable *) dst;
1690 __be32 orig_gw = rt->rt_gateway;
1692 dst_confirm(&rt->dst);
1694 neigh_release(rt->dst.neighbour);
1695 rt->dst.neighbour = NULL;
1697 rt->rt_gateway = peer->redirect_learned.a4;
1698 if (arp_bind_neighbour(&rt->dst) ||
1699 !(rt->dst.neighbour->nud_state & NUD_VALID)) {
1700 if (rt->dst.neighbour)
1701 neigh_event_send(rt->dst.neighbour, NULL);
1702 rt->rt_gateway = orig_gw;
1703 return -EAGAIN;
1704 } else {
1705 rt->rt_flags |= RTCF_REDIRECTED;
1706 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE,
1707 rt->dst.neighbour);
1709 return 0;
1712 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1714 struct rtable *rt = (struct rtable *) dst;
1716 if (rt_is_expired(rt))
1717 return NULL;
1718 if (rt->rt_peer_genid != rt_peer_genid()) {
1719 struct inet_peer *peer;
1721 if (!rt->peer)
1722 rt_bind_peer(rt, rt->rt_dst, 0);
1724 peer = rt->peer;
1725 if (peer) {
1726 check_peer_pmtu(dst, peer);
1728 if (peer->redirect_learned.a4 &&
1729 peer->redirect_learned.a4 != rt->rt_gateway) {
1730 if (check_peer_redir(dst, peer))
1731 return NULL;
1735 rt->rt_peer_genid = rt_peer_genid();
1737 return dst;
1740 static void ipv4_dst_destroy(struct dst_entry *dst)
1742 struct rtable *rt = (struct rtable *) dst;
1743 struct inet_peer *peer = rt->peer;
1745 if (rt->fi) {
1746 fib_info_put(rt->fi);
1747 rt->fi = NULL;
1749 if (peer) {
1750 rt->peer = NULL;
1751 inet_putpeer(peer);
1756 static void ipv4_link_failure(struct sk_buff *skb)
1758 struct rtable *rt;
1760 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1762 rt = skb_rtable(skb);
1763 if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1764 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1767 static int ip_rt_bug(struct sk_buff *skb)
1769 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1770 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1771 skb->dev ? skb->dev->name : "?");
1772 kfree_skb(skb);
1773 WARN_ON(1);
1774 return 0;
1778 We do not cache source address of outgoing interface,
1779 because it is used only by IP RR, TS and SRR options,
1780 so that it out of fast path.
1782 BTW remember: "addr" is allowed to be not aligned
1783 in IP options!
1786 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1788 __be32 src;
1790 if (rt_is_output_route(rt))
1791 src = ip_hdr(skb)->saddr;
1792 else {
1793 struct fib_result res;
1794 struct flowi4 fl4;
1795 struct iphdr *iph;
1797 iph = ip_hdr(skb);
1799 memset(&fl4, 0, sizeof(fl4));
1800 fl4.daddr = iph->daddr;
1801 fl4.saddr = iph->saddr;
1802 fl4.flowi4_tos = RT_TOS(iph->tos);
1803 fl4.flowi4_oif = rt->dst.dev->ifindex;
1804 fl4.flowi4_iif = skb->dev->ifindex;
1805 fl4.flowi4_mark = skb->mark;
1807 rcu_read_lock();
1808 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1809 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1810 else
1811 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1812 RT_SCOPE_UNIVERSE);
1813 rcu_read_unlock();
1815 memcpy(addr, &src, 4);
1818 #ifdef CONFIG_IP_ROUTE_CLASSID
1819 static void set_class_tag(struct rtable *rt, u32 tag)
1821 if (!(rt->dst.tclassid & 0xFFFF))
1822 rt->dst.tclassid |= tag & 0xFFFF;
1823 if (!(rt->dst.tclassid & 0xFFFF0000))
1824 rt->dst.tclassid |= tag & 0xFFFF0000;
1826 #endif
1828 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1830 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1832 if (advmss == 0) {
1833 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1834 ip_rt_min_advmss);
1835 if (advmss > 65535 - 40)
1836 advmss = 65535 - 40;
1838 return advmss;
1841 static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1843 unsigned int mtu = dst->dev->mtu;
1845 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1846 const struct rtable *rt = (const struct rtable *) dst;
1848 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1849 mtu = 576;
1852 if (mtu > IP_MAX_MTU)
1853 mtu = IP_MAX_MTU;
1855 return mtu;
1858 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1859 struct fib_info *fi)
1861 struct inet_peer *peer;
1862 int create = 0;
1864 /* If a peer entry exists for this destination, we must hook
1865 * it up in order to get at cached metrics.
1867 if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1868 create = 1;
1870 rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1871 if (peer) {
1872 rt->rt_peer_genid = rt_peer_genid();
1873 if (inet_metrics_new(peer))
1874 memcpy(peer->metrics, fi->fib_metrics,
1875 sizeof(u32) * RTAX_MAX);
1876 dst_init_metrics(&rt->dst, peer->metrics, false);
1878 check_peer_pmtu(&rt->dst, peer);
1879 if (peer->redirect_learned.a4 &&
1880 peer->redirect_learned.a4 != rt->rt_gateway) {
1881 rt->rt_gateway = peer->redirect_learned.a4;
1882 rt->rt_flags |= RTCF_REDIRECTED;
1884 } else {
1885 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1886 rt->fi = fi;
1887 atomic_inc(&fi->fib_clntref);
1889 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1893 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1894 const struct fib_result *res,
1895 struct fib_info *fi, u16 type, u32 itag)
1897 struct dst_entry *dst = &rt->dst;
1899 if (fi) {
1900 if (FIB_RES_GW(*res) &&
1901 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1902 rt->rt_gateway = FIB_RES_GW(*res);
1903 rt_init_metrics(rt, fl4, fi);
1904 #ifdef CONFIG_IP_ROUTE_CLASSID
1905 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1906 #endif
1909 if (dst_mtu(dst) > IP_MAX_MTU)
1910 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1911 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1912 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1914 #ifdef CONFIG_IP_ROUTE_CLASSID
1915 #ifdef CONFIG_IP_MULTIPLE_TABLES
1916 set_class_tag(rt, fib_rules_tclass(res));
1917 #endif
1918 set_class_tag(rt, itag);
1919 #endif
1922 static struct rtable *rt_dst_alloc(struct net_device *dev,
1923 bool nopolicy, bool noxfrm)
1925 return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1926 DST_HOST |
1927 (nopolicy ? DST_NOPOLICY : 0) |
1928 (noxfrm ? DST_NOXFRM : 0));
1931 /* called in rcu_read_lock() section */
1932 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1933 u8 tos, struct net_device *dev, int our)
1935 unsigned int hash;
1936 struct rtable *rth;
1937 __be32 spec_dst;
1938 struct in_device *in_dev = __in_dev_get_rcu(dev);
1939 u32 itag = 0;
1940 int err;
1942 /* Primary sanity checks. */
1944 if (in_dev == NULL)
1945 return -EINVAL;
1947 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1948 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1949 goto e_inval;
1951 if (ipv4_is_zeronet(saddr)) {
1952 if (!ipv4_is_local_multicast(daddr))
1953 goto e_inval;
1954 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1955 } else {
1956 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
1957 &itag);
1958 if (err < 0)
1959 goto e_err;
1961 rth = rt_dst_alloc(init_net.loopback_dev,
1962 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1963 if (!rth)
1964 goto e_nobufs;
1966 #ifdef CONFIG_IP_ROUTE_CLASSID
1967 rth->dst.tclassid = itag;
1968 #endif
1969 rth->dst.output = ip_rt_bug;
1971 rth->rt_key_dst = daddr;
1972 rth->rt_key_src = saddr;
1973 rth->rt_genid = rt_genid(dev_net(dev));
1974 rth->rt_flags = RTCF_MULTICAST;
1975 rth->rt_type = RTN_MULTICAST;
1976 rth->rt_key_tos = tos;
1977 rth->rt_dst = daddr;
1978 rth->rt_src = saddr;
1979 rth->rt_route_iif = dev->ifindex;
1980 rth->rt_iif = dev->ifindex;
1981 rth->rt_oif = 0;
1982 rth->rt_mark = skb->mark;
1983 rth->rt_gateway = daddr;
1984 rth->rt_spec_dst= spec_dst;
1985 rth->rt_peer_genid = 0;
1986 rth->peer = NULL;
1987 rth->fi = NULL;
1988 if (our) {
1989 rth->dst.input= ip_local_deliver;
1990 rth->rt_flags |= RTCF_LOCAL;
1993 #ifdef CONFIG_IP_MROUTE
1994 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1995 rth->dst.input = ip_mr_input;
1996 #endif
1997 RT_CACHE_STAT_INC(in_slow_mc);
1999 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2000 rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2001 return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2003 e_nobufs:
2004 return -ENOBUFS;
2005 e_inval:
2006 return -EINVAL;
2007 e_err:
2008 return err;
2012 static void ip_handle_martian_source(struct net_device *dev,
2013 struct in_device *in_dev,
2014 struct sk_buff *skb,
2015 __be32 daddr,
2016 __be32 saddr)
2018 RT_CACHE_STAT_INC(in_martian_src);
2019 #ifdef CONFIG_IP_ROUTE_VERBOSE
2020 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2022 * RFC1812 recommendation, if source is martian,
2023 * the only hint is MAC header.
2025 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
2026 &daddr, &saddr, dev->name);
2027 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2028 int i;
2029 const unsigned char *p = skb_mac_header(skb);
2030 printk(KERN_WARNING "ll header: ");
2031 for (i = 0; i < dev->hard_header_len; i++, p++) {
2032 printk("%02x", *p);
2033 if (i < (dev->hard_header_len - 1))
2034 printk(":");
2036 printk("\n");
2039 #endif
2042 /* called in rcu_read_lock() section */
2043 static int __mkroute_input(struct sk_buff *skb,
2044 const struct fib_result *res,
2045 struct in_device *in_dev,
2046 __be32 daddr, __be32 saddr, u32 tos,
2047 struct rtable **result)
2049 struct rtable *rth;
2050 int err;
2051 struct in_device *out_dev;
2052 unsigned int flags = 0;
2053 __be32 spec_dst;
2054 u32 itag;
2056 /* get a working reference to the output device */
2057 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2058 if (out_dev == NULL) {
2059 if (net_ratelimit())
2060 printk(KERN_CRIT "Bug in ip_route_input" \
2061 "_slow(). Please, report\n");
2062 return -EINVAL;
2066 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2067 in_dev->dev, &spec_dst, &itag);
2068 if (err < 0) {
2069 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2070 saddr);
2072 goto cleanup;
2075 if (err)
2076 flags |= RTCF_DIRECTSRC;
2078 if (out_dev == in_dev && err &&
2079 (IN_DEV_SHARED_MEDIA(out_dev) ||
2080 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2081 flags |= RTCF_DOREDIRECT;
2083 if (skb->protocol != htons(ETH_P_IP)) {
2084 /* Not IP (i.e. ARP). Do not create route, if it is
2085 * invalid for proxy arp. DNAT routes are always valid.
2087 * Proxy arp feature have been extended to allow, ARP
2088 * replies back to the same interface, to support
2089 * Private VLAN switch technologies. See arp.c.
2091 if (out_dev == in_dev &&
2092 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2093 err = -EINVAL;
2094 goto cleanup;
2098 rth = rt_dst_alloc(out_dev->dev,
2099 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2100 IN_DEV_CONF_GET(out_dev, NOXFRM));
2101 if (!rth) {
2102 err = -ENOBUFS;
2103 goto cleanup;
2106 rth->rt_key_dst = daddr;
2107 rth->rt_key_src = saddr;
2108 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2109 rth->rt_flags = flags;
2110 rth->rt_type = res->type;
2111 rth->rt_key_tos = tos;
2112 rth->rt_dst = daddr;
2113 rth->rt_src = saddr;
2114 rth->rt_route_iif = in_dev->dev->ifindex;
2115 rth->rt_iif = in_dev->dev->ifindex;
2116 rth->rt_oif = 0;
2117 rth->rt_mark = skb->mark;
2118 rth->rt_gateway = daddr;
2119 rth->rt_spec_dst= spec_dst;
2120 rth->rt_peer_genid = 0;
2121 rth->peer = NULL;
2122 rth->fi = NULL;
2124 rth->dst.input = ip_forward;
2125 rth->dst.output = ip_output;
2127 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2129 *result = rth;
2130 err = 0;
2131 cleanup:
2132 return err;
2135 static int ip_mkroute_input(struct sk_buff *skb,
2136 struct fib_result *res,
2137 const struct flowi4 *fl4,
2138 struct in_device *in_dev,
2139 __be32 daddr, __be32 saddr, u32 tos)
2141 struct rtable* rth = NULL;
2142 int err;
2143 unsigned hash;
2145 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2146 if (res->fi && res->fi->fib_nhs > 1)
2147 fib_select_multipath(res);
2148 #endif
2150 /* create a routing cache entry */
2151 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2152 if (err)
2153 return err;
2155 /* put it into the cache */
2156 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2157 rt_genid(dev_net(rth->dst.dev)));
2158 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2159 if (IS_ERR(rth))
2160 return PTR_ERR(rth);
2161 return 0;
2165 * NOTE. We drop all packets that have a local source
2166 * address, because every properly looped-back packet
2167 * must have the correct destination already attached by the output routine.
2169 * Such an approach solves two big problems:
2170 * 1. Non-simplex devices are handled properly.
2171 * 2. IP spoofing attempts are filtered with a 100% guarantee.
2172 * called with rcu_read_lock()
2175 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2176 u8 tos, struct net_device *dev)
2178 struct fib_result res;
2179 struct in_device *in_dev = __in_dev_get_rcu(dev);
2180 struct flowi4 fl4;
2181 unsigned flags = 0;
2182 u32 itag = 0;
2183 struct rtable * rth;
2184 unsigned hash;
2185 __be32 spec_dst;
2186 int err = -EINVAL;
2187 struct net * net = dev_net(dev);
2189 /* IP on this device is disabled. */
2191 if (!in_dev)
2192 goto out;
2194 /* Check for the most weird martians, which cannot be detected
2195 by fib_lookup.
2198 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2199 ipv4_is_loopback(saddr))
2200 goto martian_source;
2202 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2203 goto brd_input;
2205 /* Accept zero addresses only for limited broadcast;
2206 * I do not even know whether to fix this or not. Waiting for complaints :-)
2208 if (ipv4_is_zeronet(saddr))
2209 goto martian_source;
2211 if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2212 goto martian_destination;
2215 * Now we are ready to route packet.
2217 fl4.flowi4_oif = 0;
2218 fl4.flowi4_iif = dev->ifindex;
2219 fl4.flowi4_mark = skb->mark;
2220 fl4.flowi4_tos = tos;
2221 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2222 fl4.daddr = daddr;
2223 fl4.saddr = saddr;
2224 err = fib_lookup(net, &fl4, &res);
2225 if (err != 0) {
2226 if (!IN_DEV_FORWARD(in_dev))
2227 goto e_hostunreach;
2228 goto no_route;
2231 RT_CACHE_STAT_INC(in_slow_tot);
2233 if (res.type == RTN_BROADCAST)
2234 goto brd_input;
2236 if (res.type == RTN_LOCAL) {
2237 err = fib_validate_source(skb, saddr, daddr, tos,
2238 net->loopback_dev->ifindex,
2239 dev, &spec_dst, &itag);
2240 if (err < 0)
2241 goto martian_source_keep_err;
2242 if (err)
2243 flags |= RTCF_DIRECTSRC;
2244 spec_dst = daddr;
2245 goto local_input;
2248 if (!IN_DEV_FORWARD(in_dev))
2249 goto e_hostunreach;
2250 if (res.type != RTN_UNICAST)
2251 goto martian_destination;
2253 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2254 out: return err;
2256 brd_input:
2257 if (skb->protocol != htons(ETH_P_IP))
2258 goto e_inval;
2260 if (ipv4_is_zeronet(saddr))
2261 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2262 else {
2263 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2264 &itag);
2265 if (err < 0)
2266 goto martian_source_keep_err;
2267 if (err)
2268 flags |= RTCF_DIRECTSRC;
2270 flags |= RTCF_BROADCAST;
2271 res.type = RTN_BROADCAST;
2272 RT_CACHE_STAT_INC(in_brd);
2274 local_input:
2275 rth = rt_dst_alloc(net->loopback_dev,
2276 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2277 if (!rth)
2278 goto e_nobufs;
2280 rth->dst.input= ip_local_deliver;
2281 rth->dst.output= ip_rt_bug;
2282 #ifdef CONFIG_IP_ROUTE_CLASSID
2283 rth->dst.tclassid = itag;
2284 #endif
2286 rth->rt_key_dst = daddr;
2287 rth->rt_key_src = saddr;
2288 rth->rt_genid = rt_genid(net);
2289 rth->rt_flags = flags|RTCF_LOCAL;
2290 rth->rt_type = res.type;
2291 rth->rt_key_tos = tos;
2292 rth->rt_dst = daddr;
2293 rth->rt_src = saddr;
2294 #ifdef CONFIG_IP_ROUTE_CLASSID
2295 rth->dst.tclassid = itag;
2296 #endif
2297 rth->rt_route_iif = dev->ifindex;
2298 rth->rt_iif = dev->ifindex;
2299 rth->rt_oif = 0;
2300 rth->rt_mark = skb->mark;
2301 rth->rt_gateway = daddr;
2302 rth->rt_spec_dst= spec_dst;
2303 rth->rt_peer_genid = 0;
2304 rth->peer = NULL;
2305 rth->fi = NULL;
2306 if (res.type == RTN_UNREACHABLE) {
2307 rth->dst.input= ip_error;
2308 rth->dst.error= -err;
2309 rth->rt_flags &= ~RTCF_LOCAL;
2311 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2312 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2313 err = 0;
2314 if (IS_ERR(rth))
2315 err = PTR_ERR(rth);
2316 goto out;
2318 no_route:
2319 RT_CACHE_STAT_INC(in_no_route);
2320 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2321 res.type = RTN_UNREACHABLE;
2322 if (err == -ESRCH)
2323 err = -ENETUNREACH;
2324 goto local_input;
2327 * Do not cache martian addresses: they should be logged (RFC1812)
2329 martian_destination:
2330 RT_CACHE_STAT_INC(in_martian_dst);
2331 #ifdef CONFIG_IP_ROUTE_VERBOSE
2332 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2333 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2334 &daddr, &saddr, dev->name);
2335 #endif
2337 e_hostunreach:
2338 err = -EHOSTUNREACH;
2339 goto out;
2341 e_inval:
2342 err = -EINVAL;
2343 goto out;
2345 e_nobufs:
2346 err = -ENOBUFS;
2347 goto out;
2349 martian_source:
2350 err = -EINVAL;
2351 martian_source_keep_err:
2352 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2353 goto out;
2356 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2357 u8 tos, struct net_device *dev, bool noref)
2359 struct rtable * rth;
2360 unsigned hash;
2361 int iif = dev->ifindex;
2362 struct net *net;
2363 int res;
2365 net = dev_net(dev);
2367 rcu_read_lock();
2369 if (!rt_caching(net))
2370 goto skip_cache;
2372 tos &= IPTOS_RT_MASK;
2373 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
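/*
 * Note on the lookup below: the whole cache key is compared in one
 * branch-free expression. Each key field (daddr, saddr, iif, tos) is
 * XORed with the candidate entry and the results are ORed together,
 * which is zero only when every field matches; mark, namespace and
 * expiry are then checked separately.
 */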
2375 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2376 rth = rcu_dereference(rth->dst.rt_next)) {
2377 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2378 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2379 (rth->rt_route_iif ^ iif) |
2380 (rth->rt_key_tos ^ tos)) == 0 &&
2381 rth->rt_mark == skb->mark &&
2382 net_eq(dev_net(rth->dst.dev), net) &&
2383 !rt_is_expired(rth)) {
2384 if (noref) {
2385 dst_use_noref(&rth->dst, jiffies);
2386 skb_dst_set_noref(skb, &rth->dst);
2387 } else {
2388 dst_use(&rth->dst, jiffies);
2389 skb_dst_set(skb, &rth->dst);
2391 RT_CACHE_STAT_INC(in_hit);
2392 rcu_read_unlock();
2393 return 0;
2395 RT_CACHE_STAT_INC(in_hlist_search);
2398 skip_cache:
2399 /* Multicast recognition logic is moved from the route cache to here.
2400 The problem was that too many Ethernet cards have broken/missing
2401 hardware multicast filters :-( As a result, a host on a multicast
2402 network acquires a lot of useless route cache entries, e.g. from
2403 SDR messages from all over the world. Now we try to get rid of them.
2404 Really, provided the software IP multicast filter is organized
2405 reasonably (at least, hashed), this does not result in a slowdown
2406 compared with route cache reject entries.
2407 Note that multicast routers are not affected, because a
2408 route cache entry is created eventually.
2410 if (ipv4_is_multicast(daddr)) {
2411 struct in_device *in_dev = __in_dev_get_rcu(dev);
2413 if (in_dev) {
2414 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2415 ip_hdr(skb)->protocol);
2416 if (our
2417 #ifdef CONFIG_IP_MROUTE
2418 ||
2419 (!ipv4_is_local_multicast(daddr) &&
2420 IN_DEV_MFORWARD(in_dev))
2421 #endif
2422 ) {
2423 int res = ip_route_input_mc(skb, daddr, saddr,
2424 tos, dev, our);
2425 rcu_read_unlock();
2426 return res;
2429 rcu_read_unlock();
2430 return -EINVAL;
2432 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2433 rcu_read_unlock();
2434 return res;
2436 EXPORT_SYMBOL(ip_route_input_common);
2438 /* called with rcu_read_lock() */
2439 static struct rtable *__mkroute_output(const struct fib_result *res,
2440 const struct flowi4 *fl4,
2441 __be32 orig_daddr, __be32 orig_saddr,
2442 int orig_oif, struct net_device *dev_out,
2443 unsigned int flags)
2445 struct fib_info *fi = res->fi;
2446 u32 tos = RT_FL_TOS(fl4);
2447 struct in_device *in_dev;
2448 u16 type = res->type;
2449 struct rtable *rth;
2451 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2452 return ERR_PTR(-EINVAL);
2454 if (ipv4_is_lbcast(fl4->daddr))
2455 type = RTN_BROADCAST;
2456 else if (ipv4_is_multicast(fl4->daddr))
2457 type = RTN_MULTICAST;
2458 else if (ipv4_is_zeronet(fl4->daddr))
2459 return ERR_PTR(-EINVAL);
2461 if (dev_out->flags & IFF_LOOPBACK)
2462 flags |= RTCF_LOCAL;
2464 in_dev = __in_dev_get_rcu(dev_out);
2465 if (!in_dev)
2466 return ERR_PTR(-EINVAL);
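/* Broadcast destinations never carry a fib_info. Multicast destinations
 * keep RTCF_LOCAL only if ip_check_mc_rcu() says the group has actually
 * been joined on this device; otherwise the packet is not delivered
 * locally.
 */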
2468 if (type == RTN_BROADCAST) {
2469 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2470 fi = NULL;
2471 } else if (type == RTN_MULTICAST) {
2472 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2473 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2474 fl4->flowi4_proto))
2475 flags &= ~RTCF_LOCAL;
2476 /* If a multicast route does not exist, use
2477 * the default one, but do not gateway in this case.
2478 * Yes, it is a hack.
2480 if (fi && res->prefixlen < 4)
2481 fi = NULL;
2484 rth = rt_dst_alloc(dev_out,
2485 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2486 IN_DEV_CONF_GET(in_dev, NOXFRM));
2487 if (!rth)
2488 return ERR_PTR(-ENOBUFS);
2490 rth->dst.output = ip_output;
2492 rth->rt_key_dst = orig_daddr;
2493 rth->rt_key_src = orig_saddr;
2494 rth->rt_genid = rt_genid(dev_net(dev_out));
2495 rth->rt_flags = flags;
2496 rth->rt_type = type;
2497 rth->rt_key_tos = tos;
2498 rth->rt_dst = fl4->daddr;
2499 rth->rt_src = fl4->saddr;
2500 rth->rt_route_iif = 0;
2501 rth->rt_iif = orig_oif ? : dev_out->ifindex;
2502 rth->rt_oif = orig_oif;
2503 rth->rt_mark = fl4->flowi4_mark;
2504 rth->rt_gateway = fl4->daddr;
2505 rth->rt_spec_dst= fl4->saddr;
2506 rth->rt_peer_genid = 0;
2507 rth->peer = NULL;
2508 rth->fi = NULL;
2510 RT_CACHE_STAT_INC(out_slow_tot);
2512 if (flags & RTCF_LOCAL) {
2513 rth->dst.input = ip_local_deliver;
2514 rth->rt_spec_dst = fl4->daddr;
2516 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2517 rth->rt_spec_dst = fl4->saddr;
2518 if (flags & RTCF_LOCAL &&
2519 !(dev_out->flags & IFF_LOOPBACK)) {
2520 rth->dst.output = ip_mc_output;
2521 RT_CACHE_STAT_INC(out_slow_mc);
2523 #ifdef CONFIG_IP_MROUTE
2524 if (type == RTN_MULTICAST) {
2525 if (IN_DEV_MFORWARD(in_dev) &&
2526 !ipv4_is_local_multicast(fl4->daddr)) {
2527 rth->dst.input = ip_mr_input;
2528 rth->dst.output = ip_mc_output;
2531 #endif
2534 rt_set_nexthop(rth, fl4, res, fi, type, 0);
2536 return rth;
2540 * Major route resolver routine.
2541 * called with rcu_read_lock();
2544 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2546 struct net_device *dev_out = NULL;
2547 u32 tos = RT_FL_TOS(fl4);
2548 unsigned int flags = 0;
2549 struct fib_result res;
2550 struct rtable *rth;
2551 __be32 orig_daddr;
2552 __be32 orig_saddr;
2553 int orig_oif;
2555 res.fi = NULL;
2556 #ifdef CONFIG_IP_MULTIPLE_TABLES
2557 res.r = NULL;
2558 #endif
2560 orig_daddr = fl4->daddr;
2561 orig_saddr = fl4->saddr;
2562 orig_oif = fl4->flowi4_oif;
2564 fl4->flowi4_iif = net->loopback_dev->ifindex;
2565 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2566 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2567 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2569 rcu_read_lock();
2570 if (fl4->saddr) {
2571 rth = ERR_PTR(-EINVAL);
2572 if (ipv4_is_multicast(fl4->saddr) ||
2573 ipv4_is_lbcast(fl4->saddr) ||
2574 ipv4_is_zeronet(fl4->saddr))
2575 goto out;
2577 /* I removed the check for oif == dev_out->oif here.
2578 It was wrong for two reasons:
2579 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2580 is assigned to multiple interfaces.
2581 2. Moreover, we are allowed to send packets with a saddr
2582 of another iface. --ANK
2585 if (fl4->flowi4_oif == 0 &&
2586 (ipv4_is_multicast(fl4->daddr) ||
2587 ipv4_is_lbcast(fl4->daddr))) {
2588 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2589 dev_out = __ip_dev_find(net, fl4->saddr, false);
2590 if (dev_out == NULL)
2591 goto out;
2593 /* Special hack: the user can direct multicasts
2594 and limited broadcast via the necessary interface
2595 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2596 This hack is not just for fun, it allows
2597 vic, vat and friends to work.
2598 They bind a socket to loopback, set ttl to zero
2599 and expect that it will work.
2600 From the viewpoint of the routing cache they are broken,
2601 because we are not allowed to build a multicast path
2602 with a loopback source addr (look, the routing cache
2603 cannot know that ttl is zero, so the packet
2604 will not leave this host and the route is valid).
2605 Luckily, this hack is a good workaround.
2608 fl4->flowi4_oif = dev_out->ifindex;
2609 goto make_route;
2612 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2613 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2614 if (!__ip_dev_find(net, fl4->saddr, false))
2615 goto out;
2620 if (fl4->flowi4_oif) {
2621 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2622 rth = ERR_PTR(-ENODEV);
2623 if (dev_out == NULL)
2624 goto out;
2626 /* RACE: Check return value of inet_select_addr instead. */
2627 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2628 rth = ERR_PTR(-ENETUNREACH);
2629 goto out;
2631 if (ipv4_is_local_multicast(fl4->daddr) ||
2632 ipv4_is_lbcast(fl4->daddr)) {
2633 if (!fl4->saddr)
2634 fl4->saddr = inet_select_addr(dev_out, 0,
2635 RT_SCOPE_LINK);
2636 goto make_route;
2638 if (fl4->saddr) {
2639 if (ipv4_is_multicast(fl4->daddr))
2640 fl4->saddr = inet_select_addr(dev_out, 0,
2641 fl4->flowi4_scope);
2642 else if (!fl4->daddr)
2643 fl4->saddr = inet_select_addr(dev_out, 0,
2644 RT_SCOPE_HOST);
2648 if (!fl4->daddr) {
2649 fl4->daddr = fl4->saddr;
2650 if (!fl4->daddr)
2651 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2652 dev_out = net->loopback_dev;
2653 fl4->flowi4_oif = net->loopback_dev->ifindex;
2654 res.type = RTN_LOCAL;
2655 flags |= RTCF_LOCAL;
2656 goto make_route;
2659 if (fib_lookup(net, fl4, &res)) {
2660 res.fi = NULL;
2661 if (fl4->flowi4_oif) {
2662 /* Apparently, the routing tables are wrong. Assume
2663 that the destination is on-link.
2665 WHY? DW.
2666 Because we are allowed to send to an iface
2667 even if it has NO routes and NO assigned
2668 addresses. When oif is specified, the routing
2669 tables are looked up with only one purpose:
2670 to catch whether the destination is gatewayed, rather than
2671 direct. Moreover, if MSG_DONTROUTE is set,
2672 we send the packet, ignoring both routing tables
2673 and ifaddr state. --ANK
2676 We could do this even if oif is unknown,
2677 as IPv6 likely does, but we do not.
2680 if (fl4->saddr == 0)
2681 fl4->saddr = inet_select_addr(dev_out, 0,
2682 RT_SCOPE_LINK);
2683 res.type = RTN_UNICAST;
2684 goto make_route;
2686 rth = ERR_PTR(-ENETUNREACH);
2687 goto out;
2690 if (res.type == RTN_LOCAL) {
2691 if (!fl4->saddr) {
2692 if (res.fi->fib_prefsrc)
2693 fl4->saddr = res.fi->fib_prefsrc;
2694 else
2695 fl4->saddr = fl4->daddr;
2697 dev_out = net->loopback_dev;
2698 fl4->flowi4_oif = dev_out->ifindex;
2699 res.fi = NULL;
2700 flags |= RTCF_LOCAL;
2701 goto make_route;
2704 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2705 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2706 fib_select_multipath(&res);
2707 else
2708 #endif
2709 if (!res.prefixlen &&
2710 res.table->tb_num_default > 1 &&
2711 res.type == RTN_UNICAST && !fl4->flowi4_oif)
2712 fib_select_default(&res);
2714 if (!fl4->saddr)
2715 fl4->saddr = FIB_RES_PREFSRC(net, res);
2717 dev_out = FIB_RES_DEV(res);
2718 fl4->flowi4_oif = dev_out->ifindex;
2721 make_route:
2722 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2723 dev_out, flags);
2724 if (!IS_ERR(rth)) {
2725 unsigned int hash;
2727 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2728 rt_genid(dev_net(dev_out)));
2729 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2732 out:
2733 rcu_read_unlock();
2734 return rth;
2737 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2739 struct rtable *rth;
2740 unsigned int hash;
2742 if (!rt_caching(net))
2743 goto slow_output;
2745 hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2747 rcu_read_lock_bh();
2748 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2749 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2750 if (rth->rt_key_dst == flp4->daddr &&
2751 rth->rt_key_src == flp4->saddr &&
2752 rt_is_output_route(rth) &&
2753 rth->rt_oif == flp4->flowi4_oif &&
2754 rth->rt_mark == flp4->flowi4_mark &&
2755 !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2756 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2757 net_eq(dev_net(rth->dst.dev), net) &&
2758 !rt_is_expired(rth)) {
2759 dst_use(&rth->dst, jiffies);
2760 RT_CACHE_STAT_INC(out_hit);
2761 rcu_read_unlock_bh();
2762 if (!flp4->saddr)
2763 flp4->saddr = rth->rt_src;
2764 if (!flp4->daddr)
2765 flp4->daddr = rth->rt_dst;
2766 return rth;
2768 RT_CACHE_STAT_INC(out_hlist_search);
2770 rcu_read_unlock_bh();
2772 slow_output:
2773 return ip_route_output_slow(net, flp4);
2775 EXPORT_SYMBOL_GPL(__ip_route_output_key);
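/*
 * Blackhole dst: ipv4_blackhole_route() below clones an existing route but
 * points input/output at dst_discard, so anything sent through it is
 * silently dropped. The stub ops make the clone inert - dst_check() always
 * fails, PMTU updates are ignored and metrics are never COWed. Used e.g.
 * by the XFRM code when a flow must be held rather than routed normally.
 */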
2777 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2779 return NULL;
2782 static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
2784 return 0;
2787 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2791 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2792 unsigned long old)
2794 return NULL;
2797 static struct dst_ops ipv4_dst_blackhole_ops = {
2798 .family = AF_INET,
2799 .protocol = cpu_to_be16(ETH_P_IP),
2800 .destroy = ipv4_dst_destroy,
2801 .check = ipv4_blackhole_dst_check,
2802 .default_mtu = ipv4_blackhole_default_mtu,
2803 .default_advmss = ipv4_default_advmss,
2804 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2805 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2808 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2810 struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2811 struct rtable *ort = (struct rtable *) dst_orig;
2813 if (rt) {
2814 struct dst_entry *new = &rt->dst;
2816 new->__use = 1;
2817 new->input = dst_discard;
2818 new->output = dst_discard;
2819 dst_copy_metrics(new, &ort->dst);
2821 new->dev = ort->dst.dev;
2822 if (new->dev)
2823 dev_hold(new->dev);
2825 rt->rt_key_dst = ort->rt_key_dst;
2826 rt->rt_key_src = ort->rt_key_src;
2827 rt->rt_key_tos = ort->rt_key_tos;
2828 rt->rt_route_iif = ort->rt_route_iif;
2829 rt->rt_iif = ort->rt_iif;
2830 rt->rt_oif = ort->rt_oif;
2831 rt->rt_mark = ort->rt_mark;
2833 rt->rt_genid = rt_genid(net);
2834 rt->rt_flags = ort->rt_flags;
2835 rt->rt_type = ort->rt_type;
2836 rt->rt_dst = ort->rt_dst;
2837 rt->rt_src = ort->rt_src;
2838 rt->rt_gateway = ort->rt_gateway;
2839 rt->rt_spec_dst = ort->rt_spec_dst;
2840 rt->peer = ort->peer;
2841 if (rt->peer)
2842 atomic_inc(&rt->peer->refcnt);
2843 rt->fi = ort->fi;
2844 if (rt->fi)
2845 atomic_inc(&rt->fi->fib_clntref);
2847 dst_free(new);
2850 dst_release(dst_orig);
2852 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2855 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2856 struct sock *sk)
2858 struct rtable *rt = __ip_route_output_key(net, flp4);
2860 if (IS_ERR(rt))
2861 return rt;
2863 if (flp4->flowi4_proto)
2864 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2865 flowi4_to_flowi(flp4),
2866 sk, 0);
2868 return rt;
2870 EXPORT_SYMBOL_GPL(ip_route_output_flow);
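/*
 * rt_fill_info() translates a cached rtable into an RTM_NEWROUTE netlink
 * message: an rtmsg header plus RTA_* attributes (table, destination,
 * source, output interface, gateway, metrics, mark) and the cache info
 * (id, timestamps, expiry, error), taken from the inet_peer entry when
 * one is attached.
 */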
2872 static int rt_fill_info(struct net *net,
2873 struct sk_buff *skb, u32 pid, u32 seq, int event,
2874 int nowait, unsigned int flags)
2876 struct rtable *rt = skb_rtable(skb);
2877 struct rtmsg *r;
2878 struct nlmsghdr *nlh;
2879 long expires = 0;
2880 const struct inet_peer *peer = rt->peer;
2881 u32 id = 0, ts = 0, tsage = 0, error;
2883 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2884 if (nlh == NULL)
2885 return -EMSGSIZE;
2887 r = nlmsg_data(nlh);
2888 r->rtm_family = AF_INET;
2889 r->rtm_dst_len = 32;
2890 r->rtm_src_len = 0;
2891 r->rtm_tos = rt->rt_key_tos;
2892 r->rtm_table = RT_TABLE_MAIN;
2893 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2894 r->rtm_type = rt->rt_type;
2895 r->rtm_scope = RT_SCOPE_UNIVERSE;
2896 r->rtm_protocol = RTPROT_UNSPEC;
2897 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2898 if (rt->rt_flags & RTCF_NOTIFY)
2899 r->rtm_flags |= RTM_F_NOTIFY;
2901 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2903 if (rt->rt_key_src) {
2904 r->rtm_src_len = 32;
2905 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2907 if (rt->dst.dev)
2908 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2909 #ifdef CONFIG_IP_ROUTE_CLASSID
2910 if (rt->dst.tclassid)
2911 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2912 #endif
2913 if (rt_is_input_route(rt))
2914 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2915 else if (rt->rt_src != rt->rt_key_src)
2916 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2918 if (rt->rt_dst != rt->rt_gateway)
2919 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2921 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2922 goto nla_put_failure;
2924 if (rt->rt_mark)
2925 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
2927 error = rt->dst.error;
2928 if (peer) {
2929 inet_peer_refcheck(rt->peer);
2930 id = atomic_read(&peer->ip_id_count) & 0xffff;
2931 if (peer->tcp_ts_stamp) {
2932 ts = peer->tcp_ts;
2933 tsage = get_seconds() - peer->tcp_ts_stamp;
2935 expires = ACCESS_ONCE(peer->pmtu_expires);
2936 if (expires)
2937 expires -= jiffies;
2940 if (rt_is_input_route(rt)) {
2941 #ifdef CONFIG_IP_MROUTE
2942 __be32 dst = rt->rt_dst;
2944 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2945 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2946 int err = ipmr_get_route(net, skb,
2947 rt->rt_src, rt->rt_dst,
2948 r, nowait);
2949 if (err <= 0) {
2950 if (!nowait) {
2951 if (err == 0)
2952 return 0;
2953 goto nla_put_failure;
2954 } else {
2955 if (err == -EMSGSIZE)
2956 goto nla_put_failure;
2957 error = err;
2960 } else
2961 #endif
2962 NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
2965 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2966 expires, error) < 0)
2967 goto nla_put_failure;
2969 return nlmsg_end(skb, nlh);
2971 nla_put_failure:
2972 nlmsg_cancel(skb, nlh);
2973 return -EMSGSIZE;
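/*
 * RTM_GETROUTE handler, used e.g. by "ip route get". A dummy skb is routed
 * exactly as a real packet would be - through the input path when RTA_IIF
 * is supplied, through the output path otherwise - and the resulting cache
 * entry is reported back via rt_fill_info().
 */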
2976 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2978 struct net *net = sock_net(in_skb->sk);
2979 struct rtmsg *rtm;
2980 struct nlattr *tb[RTA_MAX+1];
2981 struct rtable *rt = NULL;
2982 __be32 dst = 0;
2983 __be32 src = 0;
2984 u32 iif;
2985 int err;
2986 int mark;
2987 struct sk_buff *skb;
2989 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2990 if (err < 0)
2991 goto errout;
2993 rtm = nlmsg_data(nlh);
2995 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2996 if (skb == NULL) {
2997 err = -ENOBUFS;
2998 goto errout;
3001 /* Reserve room for dummy headers; this skb can pass
3002 through a good chunk of the routing engine.
3004 skb_reset_mac_header(skb);
3005 skb_reset_network_header(skb);
3007 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3008 ip_hdr(skb)->protocol = IPPROTO_ICMP;
3009 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3011 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3012 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3013 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3014 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3016 if (iif) {
3017 struct net_device *dev;
3019 dev = __dev_get_by_index(net, iif);
3020 if (dev == NULL) {
3021 err = -ENODEV;
3022 goto errout_free;
3025 skb->protocol = htons(ETH_P_IP);
3026 skb->dev = dev;
3027 skb->mark = mark;
3028 local_bh_disable();
3029 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3030 local_bh_enable();
3032 rt = skb_rtable(skb);
3033 if (err == 0 && rt->dst.error)
3034 err = -rt->dst.error;
3035 } else {
3036 struct flowi4 fl4 = {
3037 .daddr = dst,
3038 .saddr = src,
3039 .flowi4_tos = rtm->rtm_tos,
3040 .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3041 .flowi4_mark = mark,
3043 rt = ip_route_output_key(net, &fl4);
3045 err = 0;
3046 if (IS_ERR(rt))
3047 err = PTR_ERR(rt);
3050 if (err)
3051 goto errout_free;
3053 skb_dst_set(skb, &rt->dst);
3054 if (rtm->rtm_flags & RTM_F_NOTIFY)
3055 rt->rt_flags |= RTCF_NOTIFY;
3057 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3058 RTM_NEWROUTE, 0, 0);
3059 if (err <= 0)
3060 goto errout_free;
3062 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3063 errout:
3064 return err;
3066 errout_free:
3067 kfree_skb(skb);
3068 goto errout;
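/*
 * Dump the whole route cache over netlink. cb->args[0] and cb->args[1]
 * remember the current hash bucket and the index within it, so a partial
 * dump can be resumed where it left off.
 */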
3071 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3073 struct rtable *rt;
3074 int h, s_h;
3075 int idx, s_idx;
3076 struct net *net;
3078 net = sock_net(skb->sk);
3080 s_h = cb->args[0];
3081 if (s_h < 0)
3082 s_h = 0;
3083 s_idx = idx = cb->args[1];
3084 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3085 if (!rt_hash_table[h].chain)
3086 continue;
3087 rcu_read_lock_bh();
3088 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3089 rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3090 if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3091 continue;
3092 if (rt_is_expired(rt))
3093 continue;
3094 skb_dst_set_noref(skb, &rt->dst);
3095 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3096 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3097 1, NLM_F_MULTI) <= 0) {
3098 skb_dst_drop(skb);
3099 rcu_read_unlock_bh();
3100 goto done;
3102 skb_dst_drop(skb);
3104 rcu_read_unlock_bh();
3107 done:
3108 cb->args[0] = h;
3109 cb->args[1] = idx;
3110 return skb->len;
3113 void ip_rt_multicast_event(struct in_device *in_dev)
3115 rt_cache_flush(dev_net(in_dev->dev), 0);
3118 #ifdef CONFIG_SYSCTL
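/*
 * Writing an integer to /proc/sys/net/ipv4/route/flush invokes this
 * handler; the value is passed as the delay to rt_cache_flush() for the
 * table's network namespace. Illustrative usage from userspace:
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 */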
3119 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3120 void __user *buffer,
3121 size_t *lenp, loff_t *ppos)
3123 if (write) {
3124 int flush_delay;
3125 ctl_table ctl;
3126 struct net *net;
3128 memcpy(&ctl, __ctl, sizeof(ctl));
3129 ctl.data = &flush_delay;
3130 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3132 net = (struct net *)__ctl->extra1;
3133 rt_cache_flush(net, flush_delay);
3134 return 0;
3137 return -EINVAL;
3140 static ctl_table ipv4_route_table[] = {
3142 .procname = "gc_thresh",
3143 .data = &ipv4_dst_ops.gc_thresh,
3144 .maxlen = sizeof(int),
3145 .mode = 0644,
3146 .proc_handler = proc_dointvec,
3149 .procname = "max_size",
3150 .data = &ip_rt_max_size,
3151 .maxlen = sizeof(int),
3152 .mode = 0644,
3153 .proc_handler = proc_dointvec,
3156 /* Deprecated. Use gc_min_interval_ms */
3158 .procname = "gc_min_interval",
3159 .data = &ip_rt_gc_min_interval,
3160 .maxlen = sizeof(int),
3161 .mode = 0644,
3162 .proc_handler = proc_dointvec_jiffies,
3165 .procname = "gc_min_interval_ms",
3166 .data = &ip_rt_gc_min_interval,
3167 .maxlen = sizeof(int),
3168 .mode = 0644,
3169 .proc_handler = proc_dointvec_ms_jiffies,
3172 .procname = "gc_timeout",
3173 .data = &ip_rt_gc_timeout,
3174 .maxlen = sizeof(int),
3175 .mode = 0644,
3176 .proc_handler = proc_dointvec_jiffies,
3179 .procname = "gc_interval",
3180 .data = &ip_rt_gc_interval,
3181 .maxlen = sizeof(int),
3182 .mode = 0644,
3183 .proc_handler = proc_dointvec_jiffies,
3193 .procname = "redirect_load",
3194 .data = &ip_rt_redirect_load,
3195 .maxlen = sizeof(int),
3196 .mode = 0644,
3197 .proc_handler = proc_dointvec,
3200 .procname = "redirect_number",
3201 .data = &ip_rt_redirect_number,
3202 .maxlen = sizeof(int),
3203 .mode = 0644,
3204 .proc_handler = proc_dointvec,
3207 .procname = "redirect_silence",
3208 .data = &ip_rt_redirect_silence,
3209 .maxlen = sizeof(int),
3210 .mode = 0644,
3211 .proc_handler = proc_dointvec,
3214 .procname = "error_cost",
3215 .data = &ip_rt_error_cost,
3216 .maxlen = sizeof(int),
3217 .mode = 0644,
3218 .proc_handler = proc_dointvec,
3221 .procname = "error_burst",
3222 .data = &ip_rt_error_burst,
3223 .maxlen = sizeof(int),
3224 .mode = 0644,
3225 .proc_handler = proc_dointvec,
3228 .procname = "gc_elasticity",
3229 .data = &ip_rt_gc_elasticity,
3230 .maxlen = sizeof(int),
3231 .mode = 0644,
3232 .proc_handler = proc_dointvec,
3235 .procname = "mtu_expires",
3236 .data = &ip_rt_mtu_expires,
3237 .maxlen = sizeof(int),
3238 .mode = 0644,
3239 .proc_handler = proc_dointvec_jiffies,
3242 .procname = "min_pmtu",
3243 .data = &ip_rt_min_pmtu,
3244 .maxlen = sizeof(int),
3245 .mode = 0644,
3246 .proc_handler = proc_dointvec,
3249 .procname = "min_adv_mss",
3250 .data = &ip_rt_min_advmss,
3251 .maxlen = sizeof(int),
3252 .mode = 0644,
3253 .proc_handler = proc_dointvec,
3258 static struct ctl_table empty[1];
3260 static struct ctl_table ipv4_skeleton[] =
3262 { .procname = "route",
3263 .mode = 0555, .child = ipv4_route_table},
3264 { .procname = "neigh",
3265 .mode = 0555, .child = empty},
3269 static __net_initdata struct ctl_path ipv4_path[] = {
3270 { .procname = "net", },
3271 { .procname = "ipv4", },
3272 { },
3275 static struct ctl_table ipv4_route_flush_table[] = {
3277 .procname = "flush",
3278 .maxlen = sizeof(int),
3279 .mode = 0200,
3280 .proc_handler = ipv4_sysctl_rtcache_flush,
3282 { },
3285 static __net_initdata struct ctl_path ipv4_route_path[] = {
3286 { .procname = "net", },
3287 { .procname = "ipv4", },
3288 { .procname = "route", },
3289 { },
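/*
 * Every non-init network namespace gets its own copy of the flush table so
 * that each table's extra1 pointer can name its own namespace; the init
 * namespace uses the static table itself.
 */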
3292 static __net_init int sysctl_route_net_init(struct net *net)
3294 struct ctl_table *tbl;
3296 tbl = ipv4_route_flush_table;
3297 if (!net_eq(net, &init_net)) {
3298 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3299 if (tbl == NULL)
3300 goto err_dup;
3302 tbl[0].extra1 = net;
3304 net->ipv4.route_hdr =
3305 register_net_sysctl_table(net, ipv4_route_path, tbl);
3306 if (net->ipv4.route_hdr == NULL)
3307 goto err_reg;
3308 return 0;
3310 err_reg:
3311 if (tbl != ipv4_route_flush_table)
3312 kfree(tbl);
3313 err_dup:
3314 return -ENOMEM;
3317 static __net_exit void sysctl_route_net_exit(struct net *net)
3319 struct ctl_table *tbl;
3321 tbl = net->ipv4.route_hdr->ctl_table_arg;
3322 unregister_net_sysctl_table(net->ipv4.route_hdr);
3323 BUG_ON(tbl == ipv4_route_flush_table);
3324 kfree(tbl);
3327 static __net_initdata struct pernet_operations sysctl_route_ops = {
3328 .init = sysctl_route_net_init,
3329 .exit = sysctl_route_net_exit,
3331 #endif
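/*
 * Seed the per-namespace generation counters with random values. Bumping
 * rt_genid later (see rt_cache_flush) invalidates every cached route in
 * that namespace without having to walk the hash table.
 */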
3333 static __net_init int rt_genid_init(struct net *net)
3335 get_random_bytes(&net->ipv4.rt_genid,
3336 sizeof(net->ipv4.rt_genid));
3337 get_random_bytes(&net->ipv4.dev_addr_genid,
3338 sizeof(net->ipv4.dev_addr_genid));
3339 return 0;
3342 static __net_initdata struct pernet_operations rt_genid_ops = {
3343 .init = rt_genid_init,
3347 #ifdef CONFIG_IP_ROUTE_CLASSID
3348 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3349 #endif /* CONFIG_IP_ROUTE_CLASSID */
3351 static __initdata unsigned long rhash_entries;
3352 static int __init set_rhash_entries(char *str)
3354 if (!str)
3355 return 0;
3356 rhash_entries = simple_strtoul(str, &str, 0);
3357 return 1;
3359 __setup("rhash_entries=", set_rhash_entries);
3361 int __init ip_rt_init(void)
3363 int rc = 0;
3365 #ifdef CONFIG_IP_ROUTE_CLASSID
3366 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3367 if (!ip_rt_acct)
3368 panic("IP: failed to allocate ip_rt_acct\n");
3369 #endif
3371 ipv4_dst_ops.kmem_cachep =
3372 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3373 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3375 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3377 if (dst_entries_init(&ipv4_dst_ops) < 0)
3378 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3380 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3381 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
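/*
 * The route cache hash table is sized from available memory by
 * alloc_large_system_hash() and, unless overridden with the rhash_entries=
 * boot parameter, capped at 512K buckets; gc_thresh and ip_rt_max_size are
 * then derived from the resulting table size.
 */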
3383 rt_hash_table = (struct rt_hash_bucket *)
3384 alloc_large_system_hash("IP route cache",
3385 sizeof(struct rt_hash_bucket),
3386 rhash_entries,
3387 (totalram_pages >= 128 * 1024) ?
3388 15 : 17,
3389 0,
3390 &rt_hash_log,
3391 &rt_hash_mask,
3392 rhash_entries ? 0 : 512 * 1024);
3393 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3394 rt_hash_lock_init();
3396 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3397 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3399 devinet_init();
3400 ip_fib_init();
3402 INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3403 expires_ljiffies = jiffies;
3404 schedule_delayed_work(&expires_work,
3405 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3407 if (ip_rt_proc_init())
3408 printk(KERN_ERR "Unable to create route proc files\n");
3409 #ifdef CONFIG_XFRM
3410 xfrm_init();
3411 xfrm4_init(ip_rt_max_size);
3412 #endif
3413 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3415 #ifdef CONFIG_SYSCTL
3416 register_pernet_subsys(&sysctl_route_ops);
3417 #endif
3418 register_pernet_subsys(&rt_genid_ops);
3419 return rc;
3422 #ifdef CONFIG_SYSCTL
3424 * We really need to sanitize the damn ipv4 init order, then all
3425 * this nonsense will go away.
3427 void __init ip_static_sysctl_init(void)
3429 register_sysctl_paths(ipv4_path, ipv4_skeleton);
3431 #endif