ipv4: fix lockdep splat in rt_cache_seq_show
net/ipv4/route.c (blob 8b3661b3cda6cd6ad088aad61d7c93896545889c)
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <net/dst.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/netevent.h>
107 #include <net/rtnetlink.h>
108 #ifdef CONFIG_SYSCTL
109 #include <linux/sysctl.h>
110 #endif
111 #include <net/atmclip.h>
112 #include <net/secure_seq.h>
114 #define RT_FL_TOS(oldflp4) \
115 ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
117 #define IP_MAX_MTU 0xFFF0
119 #define RT_GC_TIMEOUT (300*HZ)
121 static int ip_rt_max_size;
122 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
123 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
124 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
125 static int ip_rt_redirect_number __read_mostly = 9;
126 static int ip_rt_redirect_load __read_mostly = HZ / 50;
127 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
128 static int ip_rt_error_cost __read_mostly = HZ;
129 static int ip_rt_error_burst __read_mostly = 5 * HZ;
130 static int ip_rt_gc_elasticity __read_mostly = 8;
131 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
132 static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
133 static int ip_rt_min_advmss __read_mostly = 256;
134 static int rt_chain_length_max __read_mostly = 20;
137 * Interface to generic destination cache.
140 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
141 static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
142 static unsigned int ipv4_default_mtu(const struct dst_entry *dst);
143 static void ipv4_dst_destroy(struct dst_entry *dst);
144 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
145 static void ipv4_link_failure(struct sk_buff *skb);
146 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
147 static int rt_garbage_collect(struct dst_ops *ops);
149 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
150 int how)
154 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
156 struct rtable *rt = (struct rtable *) dst;
157 struct inet_peer *peer;
158 u32 *p = NULL;
160 if (!rt->peer)
161 rt_bind_peer(rt, rt->rt_dst, 1);
163 peer = rt->peer;
164 if (peer) {
165 u32 *old_p = __DST_METRICS_PTR(old);
166 unsigned long prev, new;
168 p = peer->metrics;
169 if (inet_metrics_new(peer))
170 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
172 new = (unsigned long) p;
173 prev = cmpxchg(&dst->_metrics, old, new);
175 if (prev != old) {
176 p = __DST_METRICS_PTR(prev);
177 if (prev & DST_METRICS_READ_ONLY)
178 p = NULL;
179 } else {
180 if (rt->fi) {
181 fib_info_put(rt->fi);
182 rt->fi = NULL;
186 return p;
189 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
191 static struct dst_ops ipv4_dst_ops = {
192 .family = AF_INET,
193 .protocol = cpu_to_be16(ETH_P_IP),
194 .gc = rt_garbage_collect,
195 .check = ipv4_dst_check,
196 .default_advmss = ipv4_default_advmss,
197 .default_mtu = ipv4_default_mtu,
198 .cow_metrics = ipv4_cow_metrics,
199 .destroy = ipv4_dst_destroy,
200 .ifdown = ipv4_dst_ifdown,
201 .negative_advice = ipv4_negative_advice,
202 .link_failure = ipv4_link_failure,
203 .update_pmtu = ip_rt_update_pmtu,
204 .local_out = __ip_local_out,
205 .neigh_lookup = ipv4_neigh_lookup,
208 #define ECN_OR_COST(class) TC_PRIO_##class
210 const __u8 ip_tos2prio[16] = {
211 TC_PRIO_BESTEFFORT,
212 ECN_OR_COST(BESTEFFORT),
213 TC_PRIO_BESTEFFORT,
214 ECN_OR_COST(BESTEFFORT),
215 TC_PRIO_BULK,
216 ECN_OR_COST(BULK),
217 TC_PRIO_BULK,
218 ECN_OR_COST(BULK),
219 TC_PRIO_INTERACTIVE,
220 ECN_OR_COST(INTERACTIVE),
221 TC_PRIO_INTERACTIVE,
222 ECN_OR_COST(INTERACTIVE),
223 TC_PRIO_INTERACTIVE_BULK,
224 ECN_OR_COST(INTERACTIVE_BULK),
225 TC_PRIO_INTERACTIVE_BULK,
226 ECN_OR_COST(INTERACTIVE_BULK)
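/*
 * Illustrative note (added, not part of the original file): this table is
 * normally indexed with the four TOS bits shifted down by one, e.g. via a
 * helper along the lines of rt_tos2priority() in include/net/route.h doing
 * ip_tos2prio[IPTOS_TOS(tos) >> 1] (assumed here, not shown in this file).
 * Under that assumption a packet marked IPTOS_LOWDELAY (0x10) maps to
 * index 8 and is queued at TC_PRIO_INTERACTIVE.
 */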
231 * Route cache.
234 /* The locking scheme is rather straightforward:
236 * 1) Read-Copy Update protects the buckets of the central route hash.
237 * 2) Only writers remove entries, and they hold the lock
238 * as they look at rtable reference counts.
239 * 3) Only readers acquire references to rtable entries,
240 * they do so with atomic increments and with the
241 * lock held.
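/*
 * Minimal usage sketch of the scheme above (illustrative only; it assumes the
 * rt_hash_table, rt_hash_lock_addr() and compare_keys() helpers defined below):
 *
 *	rcu_read_lock_bh();                        // reader: lockless lookup
 *	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference_bh(rth->dst.rt_next))
 *		if (compare_keys(rth, rt))
 *			break;
 *	rcu_read_unlock_bh();
 *
 *	spin_lock_bh(rt_hash_lock_addr(hash));     // writer: unlink under the lock
 *	*rthp = rth->dst.rt_next;
 *	rt_free(rth);
 *	spin_unlock_bh(rt_hash_lock_addr(hash));
 */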
244 struct rt_hash_bucket {
245 struct rtable __rcu *chain;
248 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
249 defined(CONFIG_PROVE_LOCKING)
251 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
252 * The size of this table is a power of two and depends on the number of CPUs.
253 * (with lockdep we have a quite big spinlock_t, so keep the size down there)
255 #ifdef CONFIG_LOCKDEP
256 # define RT_HASH_LOCK_SZ 256
257 #else
258 # if NR_CPUS >= 32
259 # define RT_HASH_LOCK_SZ 4096
260 # elif NR_CPUS >= 16
261 # define RT_HASH_LOCK_SZ 2048
262 # elif NR_CPUS >= 8
263 # define RT_HASH_LOCK_SZ 1024
264 # elif NR_CPUS >= 4
265 # define RT_HASH_LOCK_SZ 512
266 # else
267 # define RT_HASH_LOCK_SZ 256
268 # endif
269 #endif
271 static spinlock_t *rt_hash_locks;
272 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
274 static __init void rt_hash_lock_init(void)
276 int i;
278 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
279 GFP_KERNEL);
280 if (!rt_hash_locks)
281 panic("IP: failed to allocate rt_hash_locks\n");
283 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
284 spin_lock_init(&rt_hash_locks[i]);
286 #else
287 # define rt_hash_lock_addr(slot) NULL
289 static inline void rt_hash_lock_init(void)
292 #endif
294 static struct rt_hash_bucket *rt_hash_table __read_mostly;
295 static unsigned rt_hash_mask __read_mostly;
296 static unsigned int rt_hash_log __read_mostly;
298 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
299 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
301 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
302 int genid)
304 return jhash_3words((__force u32)daddr, (__force u32)saddr,
305 idx, genid)
306 & rt_hash_mask;
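/*
 * Usage sketch (illustrative): the input and output paths below derive the
 * bucket in the same way, e.g.
 *
 *	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
 *	rth  = rcu_dereference_bh(rt_hash_table[hash].chain);
 *
 * Because the per-namespace genid is mixed in, bumping it in
 * rt_cache_invalidate() implicitly moves all future insertions to new buckets.
 */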
309 static inline int rt_genid(struct net *net)
311 return atomic_read(&net->ipv4.rt_genid);
314 #ifdef CONFIG_PROC_FS
315 struct rt_cache_iter_state {
316 struct seq_net_private p;
317 int bucket;
318 int genid;
321 static struct rtable *rt_cache_get_first(struct seq_file *seq)
323 struct rt_cache_iter_state *st = seq->private;
324 struct rtable *r = NULL;
326 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
327 if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
328 continue;
329 rcu_read_lock_bh();
330 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
331 while (r) {
332 if (dev_net(r->dst.dev) == seq_file_net(seq) &&
333 r->rt_genid == st->genid)
334 return r;
335 r = rcu_dereference_bh(r->dst.rt_next);
337 rcu_read_unlock_bh();
339 return r;
342 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
343 struct rtable *r)
345 struct rt_cache_iter_state *st = seq->private;
347 r = rcu_dereference_bh(r->dst.rt_next);
348 while (!r) {
349 rcu_read_unlock_bh();
350 do {
351 if (--st->bucket < 0)
352 return NULL;
353 } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
354 rcu_read_lock_bh();
355 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
357 return r;
360 static struct rtable *rt_cache_get_next(struct seq_file *seq,
361 struct rtable *r)
363 struct rt_cache_iter_state *st = seq->private;
364 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
365 if (dev_net(r->dst.dev) != seq_file_net(seq))
366 continue;
367 if (r->rt_genid == st->genid)
368 break;
370 return r;
373 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
375 struct rtable *r = rt_cache_get_first(seq);
377 if (r)
378 while (pos && (r = rt_cache_get_next(seq, r)))
379 --pos;
380 return pos ? NULL : r;
383 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
385 struct rt_cache_iter_state *st = seq->private;
386 if (*pos)
387 return rt_cache_get_idx(seq, *pos - 1);
388 st->genid = rt_genid(seq_file_net(seq));
389 return SEQ_START_TOKEN;
392 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
394 struct rtable *r;
396 if (v == SEQ_START_TOKEN)
397 r = rt_cache_get_first(seq);
398 else
399 r = rt_cache_get_next(seq, v);
400 ++*pos;
401 return r;
404 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
406 if (v && v != SEQ_START_TOKEN)
407 rcu_read_unlock_bh();
410 static int rt_cache_seq_show(struct seq_file *seq, void *v)
412 if (v == SEQ_START_TOKEN)
413 seq_printf(seq, "%-127s\n",
414 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
415 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
416 "HHUptod\tSpecDst");
417 else {
418 struct rtable *r = v;
419 struct neighbour *n;
420 int len, HHUptod;
422 rcu_read_lock();
423 n = dst_get_neighbour(&r->dst);
424 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
425 rcu_read_unlock();
427 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
428 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
429 r->dst.dev ? r->dst.dev->name : "*",
430 (__force u32)r->rt_dst,
431 (__force u32)r->rt_gateway,
432 r->rt_flags, atomic_read(&r->dst.__refcnt),
433 r->dst.__use, 0, (__force u32)r->rt_src,
434 dst_metric_advmss(&r->dst) + 40,
435 dst_metric(&r->dst, RTAX_WINDOW),
436 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
437 dst_metric(&r->dst, RTAX_RTTVAR)),
438 r->rt_key_tos,
439 -1,
440 HHUptod,
441 r->rt_spec_dst, &len);
443 seq_printf(seq, "%*s\n", 127 - len, "");
445 return 0;
448 static const struct seq_operations rt_cache_seq_ops = {
449 .start = rt_cache_seq_start,
450 .next = rt_cache_seq_next,
451 .stop = rt_cache_seq_stop,
452 .show = rt_cache_seq_show,
455 static int rt_cache_seq_open(struct inode *inode, struct file *file)
457 return seq_open_net(inode, file, &rt_cache_seq_ops,
458 sizeof(struct rt_cache_iter_state));
461 static const struct file_operations rt_cache_seq_fops = {
462 .owner = THIS_MODULE,
463 .open = rt_cache_seq_open,
464 .read = seq_read,
465 .llseek = seq_lseek,
466 .release = seq_release_net,
470 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
472 int cpu;
474 if (*pos == 0)
475 return SEQ_START_TOKEN;
477 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
478 if (!cpu_possible(cpu))
479 continue;
480 *pos = cpu+1;
481 return &per_cpu(rt_cache_stat, cpu);
483 return NULL;
486 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
488 int cpu;
490 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
491 if (!cpu_possible(cpu))
492 continue;
493 *pos = cpu+1;
494 return &per_cpu(rt_cache_stat, cpu);
496 return NULL;
500 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
505 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
507 struct rt_cache_stat *st = v;
509 if (v == SEQ_START_TOKEN) {
510 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
511 return 0;
514 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
515 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
516 dst_entries_get_slow(&ipv4_dst_ops),
517 st->in_hit,
518 st->in_slow_tot,
519 st->in_slow_mc,
520 st->in_no_route,
521 st->in_brd,
522 st->in_martian_dst,
523 st->in_martian_src,
525 st->out_hit,
526 st->out_slow_tot,
527 st->out_slow_mc,
529 st->gc_total,
530 st->gc_ignored,
531 st->gc_goal_miss,
532 st->gc_dst_overflow,
533 st->in_hlist_search,
534 st->out_hlist_search
536 return 0;
539 static const struct seq_operations rt_cpu_seq_ops = {
540 .start = rt_cpu_seq_start,
541 .next = rt_cpu_seq_next,
542 .stop = rt_cpu_seq_stop,
543 .show = rt_cpu_seq_show,
547 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
549 return seq_open(file, &rt_cpu_seq_ops);
552 static const struct file_operations rt_cpu_seq_fops = {
553 .owner = THIS_MODULE,
554 .open = rt_cpu_seq_open,
555 .read = seq_read,
556 .llseek = seq_lseek,
557 .release = seq_release,
560 #ifdef CONFIG_IP_ROUTE_CLASSID
561 static int rt_acct_proc_show(struct seq_file *m, void *v)
563 struct ip_rt_acct *dst, *src;
564 unsigned int i, j;
566 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
567 if (!dst)
568 return -ENOMEM;
570 for_each_possible_cpu(i) {
571 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
572 for (j = 0; j < 256; j++) {
573 dst[j].o_bytes += src[j].o_bytes;
574 dst[j].o_packets += src[j].o_packets;
575 dst[j].i_bytes += src[j].i_bytes;
576 dst[j].i_packets += src[j].i_packets;
580 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
581 kfree(dst);
582 return 0;
585 static int rt_acct_proc_open(struct inode *inode, struct file *file)
587 return single_open(file, rt_acct_proc_show, NULL);
590 static const struct file_operations rt_acct_proc_fops = {
591 .owner = THIS_MODULE,
592 .open = rt_acct_proc_open,
593 .read = seq_read,
594 .llseek = seq_lseek,
595 .release = single_release,
597 #endif
599 static int __net_init ip_rt_do_proc_init(struct net *net)
601 struct proc_dir_entry *pde;
603 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
604 &rt_cache_seq_fops);
605 if (!pde)
606 goto err1;
608 pde = proc_create("rt_cache", S_IRUGO,
609 net->proc_net_stat, &rt_cpu_seq_fops);
610 if (!pde)
611 goto err2;
613 #ifdef CONFIG_IP_ROUTE_CLASSID
614 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
615 if (!pde)
616 goto err3;
617 #endif
618 return 0;
620 #ifdef CONFIG_IP_ROUTE_CLASSID
621 err3:
622 remove_proc_entry("rt_cache", net->proc_net_stat);
623 #endif
624 err2:
625 remove_proc_entry("rt_cache", net->proc_net);
626 err1:
627 return -ENOMEM;
630 static void __net_exit ip_rt_do_proc_exit(struct net *net)
632 remove_proc_entry("rt_cache", net->proc_net_stat);
633 remove_proc_entry("rt_cache", net->proc_net);
634 #ifdef CONFIG_IP_ROUTE_CLASSID
635 remove_proc_entry("rt_acct", net->proc_net);
636 #endif
639 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
640 .init = ip_rt_do_proc_init,
641 .exit = ip_rt_do_proc_exit,
644 static int __init ip_rt_proc_init(void)
646 return register_pernet_subsys(&ip_rt_proc_ops);
649 #else
650 static inline int ip_rt_proc_init(void)
652 return 0;
654 #endif /* CONFIG_PROC_FS */
656 static inline void rt_free(struct rtable *rt)
658 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
661 static inline void rt_drop(struct rtable *rt)
663 ip_rt_put(rt);
664 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
667 static inline int rt_fast_clean(struct rtable *rth)
669 /* Kill broadcast/multicast entries very aggressively, if they
670 collide in the hash table with more useful entries */
671 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
672 rt_is_input_route(rth) && rth->dst.rt_next;
675 static inline int rt_valuable(struct rtable *rth)
677 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
678 (rth->peer && rth->peer->pmtu_expires);
681 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
683 unsigned long age;
684 int ret = 0;
686 if (atomic_read(&rth->dst.__refcnt))
687 goto out;
689 age = jiffies - rth->dst.lastuse;
690 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
691 (age <= tmo2 && rt_valuable(rth)))
692 goto out;
693 ret = 1;
694 out: return ret;
697 /* Bits of score are:
698 * 31: very valuable
699 * 30: not quite useless
700 * 29..0: usage counter
702 static inline u32 rt_score(struct rtable *rt)
704 u32 score = jiffies - rt->dst.lastuse;
706 score = ~score & ~(3<<30);
708 if (rt_valuable(rt))
709 score |= (1<<31);
711 if (rt_is_output_route(rt) ||
712 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
713 score |= (1<<30);
715 return score;
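/*
 * Worked example for the bit layout above (illustrative): an unreferenced
 * output-route entry last used 100 jiffies ago that also passes rt_valuable()
 * gets score = ((~100) & ~(3 << 30)) | (1 << 31) | (1 << 30), i.e. both high
 * bits set on top of the aged usage counter, so rt_intern_hash() below will
 * prefer to evict lower-scored (older, less valuable) entries from a long
 * chain first.
 */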
718 static inline bool rt_caching(const struct net *net)
720 return net->ipv4.current_rt_cache_rebuild_count <=
721 net->ipv4.sysctl_rt_cache_rebuild_count;
724 static inline bool compare_hash_inputs(const struct rtable *rt1,
725 const struct rtable *rt2)
727 return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
728 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
729 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
732 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
734 return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
735 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
736 (rt1->rt_mark ^ rt2->rt_mark) |
737 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
738 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
739 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
742 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
744 return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
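/*
 * Note on the comparison style above (illustrative): XOR-ing each pair of
 * fields and OR-ing the results gives one branch-free test, since
 *
 *	((a1 ^ a2) | (b1 ^ b2)) == 0
 *
 * holds exactly when a1 == a2 && b1 == b2.  That is why compare_keys() and
 * compare_hash_inputs() are written this way instead of chaining &&.
 */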
747 static inline int rt_is_expired(struct rtable *rth)
749 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
753 * Perform a full scan of the hash table and free all entries.
754 * Can be called by a softirq or a process.
755 * In the latter case, we want to reschedule if necessary
757 static void rt_do_flush(struct net *net, int process_context)
759 unsigned int i;
760 struct rtable *rth, *next;
762 for (i = 0; i <= rt_hash_mask; i++) {
763 struct rtable __rcu **pprev;
764 struct rtable *list;
766 if (process_context && need_resched())
767 cond_resched();
768 rth = rcu_dereference_raw(rt_hash_table[i].chain);
769 if (!rth)
770 continue;
772 spin_lock_bh(rt_hash_lock_addr(i));
774 list = NULL;
775 pprev = &rt_hash_table[i].chain;
776 rth = rcu_dereference_protected(*pprev,
777 lockdep_is_held(rt_hash_lock_addr(i)));
779 while (rth) {
780 next = rcu_dereference_protected(rth->dst.rt_next,
781 lockdep_is_held(rt_hash_lock_addr(i)));
783 if (!net ||
784 net_eq(dev_net(rth->dst.dev), net)) {
785 rcu_assign_pointer(*pprev, next);
786 rcu_assign_pointer(rth->dst.rt_next, list);
787 list = rth;
788 } else {
789 pprev = &rth->dst.rt_next;
791 rth = next;
794 spin_unlock_bh(rt_hash_lock_addr(i));
796 for (; list; list = next) {
797 next = rcu_dereference_protected(list->dst.rt_next, 1);
798 rt_free(list);
804 * While freeing expired entries, we compute average chain length
805 * and standard deviation, using fixed-point arithmetic.
806 * This gives an estimate of rt_chain_length_max:
807 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
808 * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
811 #define FRACT_BITS 3
812 #define ONE (1UL << FRACT_BITS)
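/*
 * Fixed-point example (illustrative): with FRACT_BITS == 3, ONE == 8 and a
 * real value x is stored as x * 8, so an average chain length of 2.5 is kept
 * as the integer 20 and converted back with ">> FRACT_BITS", as
 * slow_chain_length() does below.
 */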
815 * Given a hash chain and an item in this hash chain,
816 * find if a previous entry has the same hash_inputs
817 * (but differs on tos, mark or oif)
818 * Returns 0 if an alias is found.
819 * Returns ONE if rth has no alias before itself.
821 static int has_noalias(const struct rtable *head, const struct rtable *rth)
823 const struct rtable *aux = head;
825 while (aux != rth) {
826 if (compare_hash_inputs(aux, rth))
827 return 0;
828 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
830 return ONE;
834 * Perturbation of rt_genid by a small quantity [1..256]
835 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
836 * many times (2^24) without reusing a recent rt_genid.
837 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
839 static void rt_cache_invalidate(struct net *net)
841 unsigned char shuffle;
843 get_random_bytes(&shuffle, sizeof(shuffle));
844 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
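/*
 * Illustrative consequence of the genid bump above: an entry whose rt_genid
 * no longer matches the namespace value is treated as dead by rt_is_expired(),
 * roughly
 *
 *	if (rth->rt_genid != rt_genid(dev_net(rth->dst.dev)))
 *		... skip or free the entry ...
 *
 * so invalidation itself is O(1) and stale entries are reaped lazily.
 */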
848 * delay < 0 : invalidate cache (fast : entries will be deleted later)
849 * delay >= 0 : invalidate & flush cache (can be long)
851 void rt_cache_flush(struct net *net, int delay)
853 rt_cache_invalidate(net);
854 if (delay >= 0)
855 rt_do_flush(net, !in_softirq());
858 /* Flush previously invalidated entries from the cache */
859 void rt_cache_flush_batch(struct net *net)
861 rt_do_flush(net, !in_softirq());
864 static void rt_emergency_hash_rebuild(struct net *net)
866 if (net_ratelimit())
867 printk(KERN_WARNING "Route hash chain too long!\n");
868 rt_cache_invalidate(net);
872 Short description of GC goals.
874 We want to build an algorithm which keeps the routing cache
875 at some equilibrium point, where the number of aged-off entries
876 is kept approximately equal to the number of newly generated ones.
878 The current expiration strength is the variable "expire".
879 We try to adjust it dynamically, so that when the network
880 is idle, expire is large enough to keep enough warm entries,
881 and when load increases it shrinks to limit the cache size.
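   Illustrative example of those dynamics (added note): rt_garbage_collect()
   below halves "expire" every time a pass misses its goal and adds
   ip_rt_gc_min_interval back once the goal is met, clamping at
   ip_rt_gc_timeout (RT_GC_TIMEOUT, i.e. 5 minutes).  Under light load expire
   therefore drifts upwards and entries live long; under pressure it drops
   quickly, making even young entries eligible via rt_may_expire().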
884 static int rt_garbage_collect(struct dst_ops *ops)
886 static unsigned long expire = RT_GC_TIMEOUT;
887 static unsigned long last_gc;
888 static int rover;
889 static int equilibrium;
890 struct rtable *rth;
891 struct rtable __rcu **rthp;
892 unsigned long now = jiffies;
893 int goal;
894 int entries = dst_entries_get_fast(&ipv4_dst_ops);
897 * Garbage collection is pretty expensive,
898 * do not run it too frequently.
901 RT_CACHE_STAT_INC(gc_total);
903 if (now - last_gc < ip_rt_gc_min_interval &&
904 entries < ip_rt_max_size) {
905 RT_CACHE_STAT_INC(gc_ignored);
906 goto out;
909 entries = dst_entries_get_slow(&ipv4_dst_ops);
910 /* Calculate number of entries, which we want to expire now. */
911 goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
912 if (goal <= 0) {
913 if (equilibrium < ipv4_dst_ops.gc_thresh)
914 equilibrium = ipv4_dst_ops.gc_thresh;
915 goal = entries - equilibrium;
916 if (goal > 0) {
917 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
918 goal = entries - equilibrium;
920 } else {
921 /* We are in a dangerous area. Try to reduce the cache really
922 * aggressively.
924 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
925 equilibrium = entries - goal;
928 if (now - last_gc >= ip_rt_gc_min_interval)
929 last_gc = now;
931 if (goal <= 0) {
932 equilibrium += goal;
933 goto work_done;
936 do {
937 int i, k;
939 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
940 unsigned long tmo = expire;
942 k = (k + 1) & rt_hash_mask;
943 rthp = &rt_hash_table[k].chain;
944 spin_lock_bh(rt_hash_lock_addr(k));
945 while ((rth = rcu_dereference_protected(*rthp,
946 lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
947 if (!rt_is_expired(rth) &&
948 !rt_may_expire(rth, tmo, expire)) {
949 tmo >>= 1;
950 rthp = &rth->dst.rt_next;
951 continue;
953 *rthp = rth->dst.rt_next;
954 rt_free(rth);
955 goal--;
957 spin_unlock_bh(rt_hash_lock_addr(k));
958 if (goal <= 0)
959 break;
961 rover = k;
963 if (goal <= 0)
964 goto work_done;
966 /* The goal is not achieved. We stop the process if:
968 - expire has been reduced to zero; otherwise, expire is halved.
969 - the table is not full.
970 - we are called from interrupt context.
971 - the jiffies check is just a fallback/debug loop breaker.
972 We will not spin here for a long time in any case.
975 RT_CACHE_STAT_INC(gc_goal_miss);
977 if (expire == 0)
978 break;
980 expire >>= 1;
982 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
983 goto out;
984 } while (!in_softirq() && time_before_eq(jiffies, now));
986 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
987 goto out;
988 if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
989 goto out;
990 if (net_ratelimit())
991 printk(KERN_WARNING "dst cache overflow\n");
992 RT_CACHE_STAT_INC(gc_dst_overflow);
993 return 1;
995 work_done:
996 expire += ip_rt_gc_min_interval;
997 if (expire > ip_rt_gc_timeout ||
998 dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
999 dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1000 expire = ip_rt_gc_timeout;
1001 out: return 0;
1005 * Returns the number of entries in a hash chain that have different hash_inputs
1007 static int slow_chain_length(const struct rtable *head)
1009 int length = 0;
1010 const struct rtable *rth = head;
1012 while (rth) {
1013 length += has_noalias(head, rth);
1014 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1016 return length >> FRACT_BITS;
1019 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1021 struct neigh_table *tbl = &arp_tbl;
1022 static const __be32 inaddr_any = 0;
1023 struct net_device *dev = dst->dev;
1024 const __be32 *pkey = daddr;
1025 struct neighbour *n;
1027 #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
1028 if (dev->type == ARPHRD_ATM)
1029 tbl = clip_tbl_hook;
1030 #endif
1031 if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1032 pkey = &inaddr_any;
1034 n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey);
1035 if (n)
1036 return n;
1037 return neigh_create(tbl, pkey, dev);
1040 static int rt_bind_neighbour(struct rtable *rt)
1042 struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1043 if (IS_ERR(n))
1044 return PTR_ERR(n);
1045 dst_set_neighbour(&rt->dst, n);
1047 return 0;
1050 static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1051 struct sk_buff *skb, int ifindex)
1053 struct rtable *rth, *cand;
1054 struct rtable __rcu **rthp, **candp;
1055 unsigned long now;
1056 u32 min_score;
1057 int chain_length;
1058 int attempts = !in_softirq();
1060 restart:
1061 chain_length = 0;
1062 min_score = ~(u32)0;
1063 cand = NULL;
1064 candp = NULL;
1065 now = jiffies;
1067 if (!rt_caching(dev_net(rt->dst.dev))) {
1069 * If we're not caching, just tell the caller we
1070 * were successful and don't touch the route. The
1071 * caller holds the sole reference to the cache entry, and
1072 * it will be released when the caller is done with it.
1073 * If we drop it here, the callers have no way to resolve routes
1074 * when we're not caching. Instead, just point *rp at rt, so
1075 * the caller gets a single use out of the route
1076 * Note that we do rt_free on this new route entry, so that
1077 * once its refcount hits zero, we are still able to reap it
1078 * (Thanks Alexey)
1079 * Note: To avoid expensive rcu stuff for this uncached dst,
1080 * we set DST_NOCACHE so that dst_release() can free dst without
1081 * waiting for a grace period.
1084 rt->dst.flags |= DST_NOCACHE;
1085 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1086 int err = rt_bind_neighbour(rt);
1087 if (err) {
1088 if (net_ratelimit())
1089 printk(KERN_WARNING
1090 "Neighbour table failure & not caching routes.\n");
1091 ip_rt_put(rt);
1092 return ERR_PTR(err);
1096 goto skip_hashing;
1099 rthp = &rt_hash_table[hash].chain;
1101 spin_lock_bh(rt_hash_lock_addr(hash));
1102 while ((rth = rcu_dereference_protected(*rthp,
1103 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1104 if (rt_is_expired(rth)) {
1105 *rthp = rth->dst.rt_next;
1106 rt_free(rth);
1107 continue;
1109 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1110 /* Put it first */
1111 *rthp = rth->dst.rt_next;
1113 * Since lookup is lockfree, the deletion
1114 * must be visible to another weakly ordered CPU before
1115 * the insertion at the start of the hash chain.
1117 rcu_assign_pointer(rth->dst.rt_next,
1118 rt_hash_table[hash].chain);
1120 * Since lookup is lockfree, the update writes
1121 * must be ordered for consistency on SMP.
1123 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1125 dst_use(&rth->dst, now);
1126 spin_unlock_bh(rt_hash_lock_addr(hash));
1128 rt_drop(rt);
1129 if (skb)
1130 skb_dst_set(skb, &rth->dst);
1131 return rth;
1134 if (!atomic_read(&rth->dst.__refcnt)) {
1135 u32 score = rt_score(rth);
1137 if (score <= min_score) {
1138 cand = rth;
1139 candp = rthp;
1140 min_score = score;
1144 chain_length++;
1146 rthp = &rth->dst.rt_next;
1149 if (cand) {
1150 /* ip_rt_gc_elasticity used to be the average chain
1151 * length; when exceeded, gc becomes really aggressive.
1153 * The second limit is less certain. At the moment it allows
1154 * only 2 entries per bucket. We will see.
1156 if (chain_length > ip_rt_gc_elasticity) {
1157 *candp = cand->dst.rt_next;
1158 rt_free(cand);
1160 } else {
1161 if (chain_length > rt_chain_length_max &&
1162 slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1163 struct net *net = dev_net(rt->dst.dev);
1164 int num = ++net->ipv4.current_rt_cache_rebuild_count;
1165 if (!rt_caching(net)) {
1166 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1167 rt->dst.dev->name, num);
1169 rt_emergency_hash_rebuild(net);
1170 spin_unlock_bh(rt_hash_lock_addr(hash));
1172 hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1173 ifindex, rt_genid(net));
1174 goto restart;
1178 /* Try to bind the route to arp only if it is an output
1179 route or a unicast forwarding path.
1181 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1182 int err = rt_bind_neighbour(rt);
1183 if (err) {
1184 spin_unlock_bh(rt_hash_lock_addr(hash));
1186 if (err != -ENOBUFS) {
1187 rt_drop(rt);
1188 return ERR_PTR(err);
1191 /* Neighbour tables are full and nothing
1192 can be released. Try to shrink the route cache;
1193 it most likely holds some neighbour records.
1195 if (attempts-- > 0) {
1196 int saved_elasticity = ip_rt_gc_elasticity;
1197 int saved_int = ip_rt_gc_min_interval;
1198 ip_rt_gc_elasticity = 1;
1199 ip_rt_gc_min_interval = 0;
1200 rt_garbage_collect(&ipv4_dst_ops);
1201 ip_rt_gc_min_interval = saved_int;
1202 ip_rt_gc_elasticity = saved_elasticity;
1203 goto restart;
1206 if (net_ratelimit())
1207 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1208 rt_drop(rt);
1209 return ERR_PTR(-ENOBUFS);
1213 rt->dst.rt_next = rt_hash_table[hash].chain;
1216 * Since lookup is lockfree, we must make sure
1217 * previous writes to rt are committed to memory
1218 * before making rt visible to other CPUS.
1220 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1222 spin_unlock_bh(rt_hash_lock_addr(hash));
1224 skip_hashing:
1225 if (skb)
1226 skb_dst_set(skb, &rt->dst);
1227 return rt;
1230 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1232 static u32 rt_peer_genid(void)
1234 return atomic_read(&__rt_peer_genid);
1237 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1239 struct inet_peer *peer;
1241 peer = inet_getpeer_v4(daddr, create);
1243 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1244 inet_putpeer(peer);
1245 else
1246 rt->rt_peer_genid = rt_peer_genid();
1250 * Peer allocation may fail only in serious out-of-memory conditions. However
1251 * we can still generate some output.
1252 * Random ID selection looks a bit dangerous because we have no chance of
1253 * selecting an ID that is unique over a reasonable period of time.
1254 * But a broken packet identifier may be better than no packet at all.
1256 static void ip_select_fb_ident(struct iphdr *iph)
1258 static DEFINE_SPINLOCK(ip_fb_id_lock);
1259 static u32 ip_fallback_id;
1260 u32 salt;
1262 spin_lock_bh(&ip_fb_id_lock);
1263 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1264 iph->id = htons(salt & 0xFFFF);
1265 ip_fallback_id = salt;
1266 spin_unlock_bh(&ip_fb_id_lock);
1269 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1271 struct rtable *rt = (struct rtable *) dst;
1273 if (rt) {
1274 if (rt->peer == NULL)
1275 rt_bind_peer(rt, rt->rt_dst, 1);
1277 /* If a peer is attached to the destination, it is never detached,
1278 so we do not need to grab a lock to dereference it.
1280 if (rt->peer) {
1281 iph->id = htons(inet_getid(rt->peer, more));
1282 return;
1284 } else
1285 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1286 __builtin_return_address(0));
1288 ip_select_fb_ident(iph);
1290 EXPORT_SYMBOL(__ip_select_ident);
1292 static void rt_del(unsigned hash, struct rtable *rt)
1294 struct rtable __rcu **rthp;
1295 struct rtable *aux;
1297 rthp = &rt_hash_table[hash].chain;
1298 spin_lock_bh(rt_hash_lock_addr(hash));
1299 ip_rt_put(rt);
1300 while ((aux = rcu_dereference_protected(*rthp,
1301 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1302 if (aux == rt || rt_is_expired(aux)) {
1303 *rthp = aux->dst.rt_next;
1304 rt_free(aux);
1305 continue;
1307 rthp = &aux->dst.rt_next;
1309 spin_unlock_bh(rt_hash_lock_addr(hash));
1312 /* called in rcu_read_lock() section */
1313 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1314 __be32 saddr, struct net_device *dev)
1316 struct in_device *in_dev = __in_dev_get_rcu(dev);
1317 struct inet_peer *peer;
1318 struct net *net;
1320 if (!in_dev)
1321 return;
1323 net = dev_net(dev);
1324 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1325 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1326 ipv4_is_zeronet(new_gw))
1327 goto reject_redirect;
1329 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1330 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1331 goto reject_redirect;
1332 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1333 goto reject_redirect;
1334 } else {
1335 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1336 goto reject_redirect;
1339 peer = inet_getpeer_v4(daddr, 1);
1340 if (peer) {
1341 peer->redirect_learned.a4 = new_gw;
1343 inet_putpeer(peer);
1345 atomic_inc(&__rt_peer_genid);
1347 return;
1349 reject_redirect:
1350 #ifdef CONFIG_IP_ROUTE_VERBOSE
1351 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1352 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1353 " Advised path = %pI4 -> %pI4\n",
1354 &old_gw, dev->name, &new_gw,
1355 &saddr, &daddr);
1356 #endif
1360 static bool peer_pmtu_expired(struct inet_peer *peer)
1362 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1364 return orig &&
1365 time_after_eq(jiffies, orig) &&
1366 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1369 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1371 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1373 return orig &&
1374 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1377 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1379 struct rtable *rt = (struct rtable *)dst;
1380 struct dst_entry *ret = dst;
1382 if (rt) {
1383 if (dst->obsolete > 0) {
1384 ip_rt_put(rt);
1385 ret = NULL;
1386 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1387 unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1388 rt->rt_oif,
1389 rt_genid(dev_net(dst->dev)));
1390 rt_del(hash, rt);
1391 ret = NULL;
1392 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1393 dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1396 return ret;
1400 * Algorithm:
1401 * 1. The first ip_rt_redirect_number redirects are sent
1402 * with exponential backoff, then we stop sending them at all,
1403 * assuming that the host ignores our redirects.
1404 * 2. If we did not see packets requiring redirects
1405 * during ip_rt_redirect_silence, we assume that the host
1406 * forgot the redirected route and start sending redirects again.
1408 * This algorithm is much cheaper and more intelligent than dumb load limiting
1409 * in icmp.c.
1411 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1412 * and "frag. need" (breaks PMTU discovery) in icmp.c.
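 * Worked example (illustrative, using the defaults declared near the top of
 * this file): ip_rt_redirect_load is HZ/50 and ip_rt_redirect_number is 9,
 * so the k-th redirect to a host is only sent once
 * rate_last + (ip_rt_redirect_load << rate_tokens) has passed, i.e. the gap
 * between redirects doubles each time (about 40 ms, 80 ms, 160 ms, ...);
 * after 9 unanswered redirects we stay silent until ip_rt_redirect_silence
 * (roughly 20 s) without redirect-worthy traffic lets ip_rt_send_redirect()
 * below reset rate_tokens.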
1415 void ip_rt_send_redirect(struct sk_buff *skb)
1417 struct rtable *rt = skb_rtable(skb);
1418 struct in_device *in_dev;
1419 struct inet_peer *peer;
1420 int log_martians;
1422 rcu_read_lock();
1423 in_dev = __in_dev_get_rcu(rt->dst.dev);
1424 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1425 rcu_read_unlock();
1426 return;
1428 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1429 rcu_read_unlock();
1431 if (!rt->peer)
1432 rt_bind_peer(rt, rt->rt_dst, 1);
1433 peer = rt->peer;
1434 if (!peer) {
1435 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1436 return;
1439 /* No redirected packets during ip_rt_redirect_silence;
1440 * reset the algorithm.
1442 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1443 peer->rate_tokens = 0;
1445 /* Too many ignored redirects; do not send anything,
1446 * just set peer->rate_last to the time of the last seen redirected packet.
1448 if (peer->rate_tokens >= ip_rt_redirect_number) {
1449 peer->rate_last = jiffies;
1450 return;
1453 /* Check for load limit; set rate_last to the latest sent
1454 * redirect.
1456 if (peer->rate_tokens == 0 ||
1457 time_after(jiffies,
1458 (peer->rate_last +
1459 (ip_rt_redirect_load << peer->rate_tokens)))) {
1460 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1461 peer->rate_last = jiffies;
1462 ++peer->rate_tokens;
1463 #ifdef CONFIG_IP_ROUTE_VERBOSE
1464 if (log_martians &&
1465 peer->rate_tokens == ip_rt_redirect_number &&
1466 net_ratelimit())
1467 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1468 &ip_hdr(skb)->saddr, rt->rt_iif,
1469 &rt->rt_dst, &rt->rt_gateway);
1470 #endif
1474 static int ip_error(struct sk_buff *skb)
1476 struct rtable *rt = skb_rtable(skb);
1477 struct inet_peer *peer;
1478 unsigned long now;
1479 bool send;
1480 int code;
1482 switch (rt->dst.error) {
1483 case EINVAL:
1484 default:
1485 goto out;
1486 case EHOSTUNREACH:
1487 code = ICMP_HOST_UNREACH;
1488 break;
1489 case ENETUNREACH:
1490 code = ICMP_NET_UNREACH;
1491 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1492 IPSTATS_MIB_INNOROUTES);
1493 break;
1494 case EACCES:
1495 code = ICMP_PKT_FILTERED;
1496 break;
1499 if (!rt->peer)
1500 rt_bind_peer(rt, rt->rt_dst, 1);
1501 peer = rt->peer;
1503 send = true;
1504 if (peer) {
1505 now = jiffies;
1506 peer->rate_tokens += now - peer->rate_last;
1507 if (peer->rate_tokens > ip_rt_error_burst)
1508 peer->rate_tokens = ip_rt_error_burst;
1509 peer->rate_last = now;
1510 if (peer->rate_tokens >= ip_rt_error_cost)
1511 peer->rate_tokens -= ip_rt_error_cost;
1512 else
1513 send = false;
1515 if (send)
1516 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1518 out: kfree_skb(skb);
1519 return 0;
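/*
 * Illustrative reading of the rate limiting above: peer->rate_tokens acts as
 * a token bucket refilled by elapsed jiffies and capped at ip_rt_error_burst
 * (5 * HZ); each ICMP_DEST_UNREACH sent costs ip_rt_error_cost (one HZ).
 * With the defaults that allows a burst of about five errors towards one
 * destination and then roughly one per second afterwards.
 */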
1523 * The last two values are not from the RFC but
1524 * are needed for AMPRnet AX.25 paths.
1527 static const unsigned short mtu_plateau[] =
1528 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1530 static inline unsigned short guess_mtu(unsigned short old_mtu)
1532 int i;
1534 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1535 if (old_mtu > mtu_plateau[i])
1536 return mtu_plateau[i];
1537 return 68;
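/*
 * Worked example (illustrative): a "fragmentation needed" report with a zero
 * or bogus MTU for a path that was carrying 1500-byte packets makes
 * guess_mtu(1500) walk the plateau table and return 1492, the first value
 * strictly below the old MTU; if nothing fits, it falls back to the protocol
 * minimum of 68.
 */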
1540 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1541 unsigned short new_mtu,
1542 struct net_device *dev)
1544 unsigned short old_mtu = ntohs(iph->tot_len);
1545 unsigned short est_mtu = 0;
1546 struct inet_peer *peer;
1548 peer = inet_getpeer_v4(iph->daddr, 1);
1549 if (peer) {
1550 unsigned short mtu = new_mtu;
1552 if (new_mtu < 68 || new_mtu >= old_mtu) {
1553 /* BSD 4.2 derived systems incorrectly adjust
1554 * tot_len by the IP header length, and report
1555 * a zero MTU in the ICMP message.
1557 if (mtu == 0 &&
1558 old_mtu >= 68 + (iph->ihl << 2))
1559 old_mtu -= iph->ihl << 2;
1560 mtu = guess_mtu(old_mtu);
1563 if (mtu < ip_rt_min_pmtu)
1564 mtu = ip_rt_min_pmtu;
1565 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1566 unsigned long pmtu_expires;
1568 pmtu_expires = jiffies + ip_rt_mtu_expires;
1569 if (!pmtu_expires)
1570 pmtu_expires = 1UL;
1572 est_mtu = mtu;
1573 peer->pmtu_learned = mtu;
1574 peer->pmtu_expires = pmtu_expires;
1577 inet_putpeer(peer);
1579 atomic_inc(&__rt_peer_genid);
1581 return est_mtu ? : new_mtu;
1584 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1586 unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1588 if (!expires)
1589 return;
1590 if (time_before(jiffies, expires)) {
1591 u32 orig_dst_mtu = dst_mtu(dst);
1592 if (peer->pmtu_learned < orig_dst_mtu) {
1593 if (!peer->pmtu_orig)
1594 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1595 dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1597 } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1598 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1601 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1603 struct rtable *rt = (struct rtable *) dst;
1604 struct inet_peer *peer;
1606 dst_confirm(dst);
1608 if (!rt->peer)
1609 rt_bind_peer(rt, rt->rt_dst, 1);
1610 peer = rt->peer;
1611 if (peer) {
1612 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1614 if (mtu < ip_rt_min_pmtu)
1615 mtu = ip_rt_min_pmtu;
1616 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1618 pmtu_expires = jiffies + ip_rt_mtu_expires;
1619 if (!pmtu_expires)
1620 pmtu_expires = 1UL;
1622 peer->pmtu_learned = mtu;
1623 peer->pmtu_expires = pmtu_expires;
1625 atomic_inc(&__rt_peer_genid);
1626 rt->rt_peer_genid = rt_peer_genid();
1628 check_peer_pmtu(dst, peer);
1632 static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1634 struct rtable *rt = (struct rtable *) dst;
1635 __be32 orig_gw = rt->rt_gateway;
1636 struct neighbour *n, *old_n;
1638 dst_confirm(&rt->dst);
1640 rt->rt_gateway = peer->redirect_learned.a4;
1642 n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1643 if (IS_ERR(n))
1644 return PTR_ERR(n);
1645 old_n = xchg(&rt->dst._neighbour, n);
1646 if (old_n)
1647 neigh_release(old_n);
1648 if (!n || !(n->nud_state & NUD_VALID)) {
1649 if (n)
1650 neigh_event_send(n, NULL);
1651 rt->rt_gateway = orig_gw;
1652 return -EAGAIN;
1653 } else {
1654 rt->rt_flags |= RTCF_REDIRECTED;
1655 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1657 return 0;
1660 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1662 struct rtable *rt = (struct rtable *) dst;
1664 if (rt_is_expired(rt))
1665 return NULL;
1666 if (rt->rt_peer_genid != rt_peer_genid()) {
1667 struct inet_peer *peer;
1669 if (!rt->peer)
1670 rt_bind_peer(rt, rt->rt_dst, 0);
1672 peer = rt->peer;
1673 if (peer) {
1674 check_peer_pmtu(dst, peer);
1676 if (peer->redirect_learned.a4 &&
1677 peer->redirect_learned.a4 != rt->rt_gateway) {
1678 if (check_peer_redir(dst, peer))
1679 return NULL;
1683 rt->rt_peer_genid = rt_peer_genid();
1685 return dst;
1688 static void ipv4_dst_destroy(struct dst_entry *dst)
1690 struct rtable *rt = (struct rtable *) dst;
1691 struct inet_peer *peer = rt->peer;
1693 if (rt->fi) {
1694 fib_info_put(rt->fi);
1695 rt->fi = NULL;
1697 if (peer) {
1698 rt->peer = NULL;
1699 inet_putpeer(peer);
1704 static void ipv4_link_failure(struct sk_buff *skb)
1706 struct rtable *rt;
1708 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1710 rt = skb_rtable(skb);
1711 if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1712 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1715 static int ip_rt_bug(struct sk_buff *skb)
1717 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1718 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1719 skb->dev ? skb->dev->name : "?");
1720 kfree_skb(skb);
1721 WARN_ON(1);
1722 return 0;
1726 We do not cache the source address of the outgoing interface,
1727 because it is used only by IP RR, TS and SRR options,
1728 so it is out of the fast path.
1730 BTW remember: "addr" is allowed to be unaligned
1731 in IP options!
1734 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1736 __be32 src;
1738 if (rt_is_output_route(rt))
1739 src = ip_hdr(skb)->saddr;
1740 else {
1741 struct fib_result res;
1742 struct flowi4 fl4;
1743 struct iphdr *iph;
1745 iph = ip_hdr(skb);
1747 memset(&fl4, 0, sizeof(fl4));
1748 fl4.daddr = iph->daddr;
1749 fl4.saddr = iph->saddr;
1750 fl4.flowi4_tos = RT_TOS(iph->tos);
1751 fl4.flowi4_oif = rt->dst.dev->ifindex;
1752 fl4.flowi4_iif = skb->dev->ifindex;
1753 fl4.flowi4_mark = skb->mark;
1755 rcu_read_lock();
1756 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1757 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1758 else
1759 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1760 RT_SCOPE_UNIVERSE);
1761 rcu_read_unlock();
1763 memcpy(addr, &src, 4);
1766 #ifdef CONFIG_IP_ROUTE_CLASSID
1767 static void set_class_tag(struct rtable *rt, u32 tag)
1769 if (!(rt->dst.tclassid & 0xFFFF))
1770 rt->dst.tclassid |= tag & 0xFFFF;
1771 if (!(rt->dst.tclassid & 0xFFFF0000))
1772 rt->dst.tclassid |= tag & 0xFFFF0000;
1774 #endif
1776 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1778 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1780 if (advmss == 0) {
1781 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1782 ip_rt_min_advmss);
1783 if (advmss > 65535 - 40)
1784 advmss = 65535 - 40;
1786 return advmss;
1789 static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1791 unsigned int mtu = dst->dev->mtu;
1793 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1794 const struct rtable *rt = (const struct rtable *) dst;
1796 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1797 mtu = 576;
1800 if (mtu > IP_MAX_MTU)
1801 mtu = IP_MAX_MTU;
1803 return mtu;
1806 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1807 struct fib_info *fi)
1809 struct inet_peer *peer;
1810 int create = 0;
1812 /* If a peer entry exists for this destination, we must hook
1813 * it up in order to get at cached metrics.
1815 if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1816 create = 1;
1818 rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1819 if (peer) {
1820 rt->rt_peer_genid = rt_peer_genid();
1821 if (inet_metrics_new(peer))
1822 memcpy(peer->metrics, fi->fib_metrics,
1823 sizeof(u32) * RTAX_MAX);
1824 dst_init_metrics(&rt->dst, peer->metrics, false);
1826 check_peer_pmtu(&rt->dst, peer);
1827 if (peer->redirect_learned.a4 &&
1828 peer->redirect_learned.a4 != rt->rt_gateway) {
1829 rt->rt_gateway = peer->redirect_learned.a4;
1830 rt->rt_flags |= RTCF_REDIRECTED;
1832 } else {
1833 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1834 rt->fi = fi;
1835 atomic_inc(&fi->fib_clntref);
1837 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1841 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1842 const struct fib_result *res,
1843 struct fib_info *fi, u16 type, u32 itag)
1845 struct dst_entry *dst = &rt->dst;
1847 if (fi) {
1848 if (FIB_RES_GW(*res) &&
1849 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1850 rt->rt_gateway = FIB_RES_GW(*res);
1851 rt_init_metrics(rt, fl4, fi);
1852 #ifdef CONFIG_IP_ROUTE_CLASSID
1853 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1854 #endif
1857 if (dst_mtu(dst) > IP_MAX_MTU)
1858 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1859 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1860 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1862 #ifdef CONFIG_IP_ROUTE_CLASSID
1863 #ifdef CONFIG_IP_MULTIPLE_TABLES
1864 set_class_tag(rt, fib_rules_tclass(res));
1865 #endif
1866 set_class_tag(rt, itag);
1867 #endif
1870 static struct rtable *rt_dst_alloc(struct net_device *dev,
1871 bool nopolicy, bool noxfrm)
1873 return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1874 DST_HOST |
1875 (nopolicy ? DST_NOPOLICY : 0) |
1876 (noxfrm ? DST_NOXFRM : 0));
1879 /* called in rcu_read_lock() section */
1880 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1881 u8 tos, struct net_device *dev, int our)
1883 unsigned int hash;
1884 struct rtable *rth;
1885 __be32 spec_dst;
1886 struct in_device *in_dev = __in_dev_get_rcu(dev);
1887 u32 itag = 0;
1888 int err;
1890 /* Primary sanity checks. */
1892 if (in_dev == NULL)
1893 return -EINVAL;
1895 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1896 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1897 goto e_inval;
1899 if (ipv4_is_zeronet(saddr)) {
1900 if (!ipv4_is_local_multicast(daddr))
1901 goto e_inval;
1902 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1903 } else {
1904 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
1905 &itag);
1906 if (err < 0)
1907 goto e_err;
1909 rth = rt_dst_alloc(init_net.loopback_dev,
1910 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1911 if (!rth)
1912 goto e_nobufs;
1914 #ifdef CONFIG_IP_ROUTE_CLASSID
1915 rth->dst.tclassid = itag;
1916 #endif
1917 rth->dst.output = ip_rt_bug;
1919 rth->rt_key_dst = daddr;
1920 rth->rt_key_src = saddr;
1921 rth->rt_genid = rt_genid(dev_net(dev));
1922 rth->rt_flags = RTCF_MULTICAST;
1923 rth->rt_type = RTN_MULTICAST;
1924 rth->rt_key_tos = tos;
1925 rth->rt_dst = daddr;
1926 rth->rt_src = saddr;
1927 rth->rt_route_iif = dev->ifindex;
1928 rth->rt_iif = dev->ifindex;
1929 rth->rt_oif = 0;
1930 rth->rt_mark = skb->mark;
1931 rth->rt_gateway = daddr;
1932 rth->rt_spec_dst= spec_dst;
1933 rth->rt_peer_genid = 0;
1934 rth->peer = NULL;
1935 rth->fi = NULL;
1936 if (our) {
1937 rth->dst.input= ip_local_deliver;
1938 rth->rt_flags |= RTCF_LOCAL;
1941 #ifdef CONFIG_IP_MROUTE
1942 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1943 rth->dst.input = ip_mr_input;
1944 #endif
1945 RT_CACHE_STAT_INC(in_slow_mc);
1947 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1948 rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
1949 return IS_ERR(rth) ? PTR_ERR(rth) : 0;
1951 e_nobufs:
1952 return -ENOBUFS;
1953 e_inval:
1954 return -EINVAL;
1955 e_err:
1956 return err;
1960 static void ip_handle_martian_source(struct net_device *dev,
1961 struct in_device *in_dev,
1962 struct sk_buff *skb,
1963 __be32 daddr,
1964 __be32 saddr)
1966 RT_CACHE_STAT_INC(in_martian_src);
1967 #ifdef CONFIG_IP_ROUTE_VERBOSE
1968 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1970 * RFC1812 recommendation: if the source is martian,
1971 * the only hint is the MAC header.
1973 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1974 &daddr, &saddr, dev->name);
1975 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1976 int i;
1977 const unsigned char *p = skb_mac_header(skb);
1978 printk(KERN_WARNING "ll header: ");
1979 for (i = 0; i < dev->hard_header_len; i++, p++) {
1980 printk("%02x", *p);
1981 if (i < (dev->hard_header_len - 1))
1982 printk(":");
1984 printk("\n");
1987 #endif
1990 /* called in rcu_read_lock() section */
1991 static int __mkroute_input(struct sk_buff *skb,
1992 const struct fib_result *res,
1993 struct in_device *in_dev,
1994 __be32 daddr, __be32 saddr, u32 tos,
1995 struct rtable **result)
1997 struct rtable *rth;
1998 int err;
1999 struct in_device *out_dev;
2000 unsigned int flags = 0;
2001 __be32 spec_dst;
2002 u32 itag;
2004 /* get a working reference to the output device */
2005 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2006 if (out_dev == NULL) {
2007 if (net_ratelimit())
2008 printk(KERN_CRIT "Bug in ip_route_input" \
2009 "_slow(). Please, report\n");
2010 return -EINVAL;
2014 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2015 in_dev->dev, &spec_dst, &itag);
2016 if (err < 0) {
2017 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2018 saddr);
2020 goto cleanup;
2023 if (err)
2024 flags |= RTCF_DIRECTSRC;
2026 if (out_dev == in_dev && err &&
2027 (IN_DEV_SHARED_MEDIA(out_dev) ||
2028 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2029 flags |= RTCF_DOREDIRECT;
2031 if (skb->protocol != htons(ETH_P_IP)) {
2032 /* Not IP (i.e. ARP). Do not create a route if it is
2033 * invalid for proxy arp. DNAT routes are always valid.
2035 * The proxy arp feature has been extended to allow ARP
2036 * replies back on the same interface, to support
2037 * Private VLAN switch technologies. See arp.c.
2039 if (out_dev == in_dev &&
2040 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2041 err = -EINVAL;
2042 goto cleanup;
2046 rth = rt_dst_alloc(out_dev->dev,
2047 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2048 IN_DEV_CONF_GET(out_dev, NOXFRM));
2049 if (!rth) {
2050 err = -ENOBUFS;
2051 goto cleanup;
2054 rth->rt_key_dst = daddr;
2055 rth->rt_key_src = saddr;
2056 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2057 rth->rt_flags = flags;
2058 rth->rt_type = res->type;
2059 rth->rt_key_tos = tos;
2060 rth->rt_dst = daddr;
2061 rth->rt_src = saddr;
2062 rth->rt_route_iif = in_dev->dev->ifindex;
2063 rth->rt_iif = in_dev->dev->ifindex;
2064 rth->rt_oif = 0;
2065 rth->rt_mark = skb->mark;
2066 rth->rt_gateway = daddr;
2067 rth->rt_spec_dst= spec_dst;
2068 rth->rt_peer_genid = 0;
2069 rth->peer = NULL;
2070 rth->fi = NULL;
2072 rth->dst.input = ip_forward;
2073 rth->dst.output = ip_output;
2075 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2077 *result = rth;
2078 err = 0;
2079 cleanup:
2080 return err;
2083 static int ip_mkroute_input(struct sk_buff *skb,
2084 struct fib_result *res,
2085 const struct flowi4 *fl4,
2086 struct in_device *in_dev,
2087 __be32 daddr, __be32 saddr, u32 tos)
2089 struct rtable* rth = NULL;
2090 int err;
2091 unsigned hash;
2093 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2094 if (res->fi && res->fi->fib_nhs > 1)
2095 fib_select_multipath(res);
2096 #endif
2098 /* create a routing cache entry */
2099 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2100 if (err)
2101 return err;
2103 /* put it into the cache */
2104 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2105 rt_genid(dev_net(rth->dst.dev)));
2106 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2107 if (IS_ERR(rth))
2108 return PTR_ERR(rth);
2109 return 0;
2113 * NOTE. We drop all packets that have local source
2114 * addresses, because every properly looped-back packet
2115 * must already have the correct destination attached by the output routine.
2117 * This approach solves two big problems:
2118 * 1. Non-simplex devices are handled properly.
2119 * 2. IP spoofing attempts are filtered with a 100% guarantee.
2120 * called with rcu_read_lock()
2123 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2124 u8 tos, struct net_device *dev)
2126 struct fib_result res;
2127 struct in_device *in_dev = __in_dev_get_rcu(dev);
2128 struct flowi4 fl4;
2129 unsigned flags = 0;
2130 u32 itag = 0;
2131 struct rtable * rth;
2132 unsigned hash;
2133 __be32 spec_dst;
2134 int err = -EINVAL;
2135 struct net * net = dev_net(dev);
2137 /* IP on this device is disabled. */
2139 if (!in_dev)
2140 goto out;
2142 /* Check for the weirdest martians, which cannot be detected
2143 by fib_lookup.
2146 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2147 ipv4_is_loopback(saddr))
2148 goto martian_source;
2150 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2151 goto brd_input;
2153 /* Accept zero addresses only for limited broadcast;
2154 * I do not even know whether to fix it or not. Waiting for complaints :-)
2156 if (ipv4_is_zeronet(saddr))
2157 goto martian_source;
2159 if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2160 goto martian_destination;
2163 * Now we are ready to route the packet.
2165 fl4.flowi4_oif = 0;
2166 fl4.flowi4_iif = dev->ifindex;
2167 fl4.flowi4_mark = skb->mark;
2168 fl4.flowi4_tos = tos;
2169 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2170 fl4.daddr = daddr;
2171 fl4.saddr = saddr;
2172 err = fib_lookup(net, &fl4, &res);
2173 if (err != 0) {
2174 if (!IN_DEV_FORWARD(in_dev))
2175 goto e_hostunreach;
2176 goto no_route;
2179 RT_CACHE_STAT_INC(in_slow_tot);
2181 if (res.type == RTN_BROADCAST)
2182 goto brd_input;
2184 if (res.type == RTN_LOCAL) {
2185 err = fib_validate_source(skb, saddr, daddr, tos,
2186 net->loopback_dev->ifindex,
2187 dev, &spec_dst, &itag);
2188 if (err < 0)
2189 goto martian_source_keep_err;
2190 if (err)
2191 flags |= RTCF_DIRECTSRC;
2192 spec_dst = daddr;
2193 goto local_input;
2196 if (!IN_DEV_FORWARD(in_dev))
2197 goto e_hostunreach;
2198 if (res.type != RTN_UNICAST)
2199 goto martian_destination;
2201 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2202 out: return err;
2204 brd_input:
2205 if (skb->protocol != htons(ETH_P_IP))
2206 goto e_inval;
2208 if (ipv4_is_zeronet(saddr))
2209 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2210 else {
2211 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2212 &itag);
2213 if (err < 0)
2214 goto martian_source_keep_err;
2215 if (err)
2216 flags |= RTCF_DIRECTSRC;
2218 flags |= RTCF_BROADCAST;
2219 res.type = RTN_BROADCAST;
2220 RT_CACHE_STAT_INC(in_brd);
2222 local_input:
2223 rth = rt_dst_alloc(net->loopback_dev,
2224 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2225 if (!rth)
2226 goto e_nobufs;
2228 rth->dst.input= ip_local_deliver;
2229 rth->dst.output= ip_rt_bug;
2230 #ifdef CONFIG_IP_ROUTE_CLASSID
2231 rth->dst.tclassid = itag;
2232 #endif
2234 rth->rt_key_dst = daddr;
2235 rth->rt_key_src = saddr;
2236 rth->rt_genid = rt_genid(net);
2237 rth->rt_flags = flags|RTCF_LOCAL;
2238 rth->rt_type = res.type;
2239 rth->rt_key_tos = tos;
2240 rth->rt_dst = daddr;
2241 rth->rt_src = saddr;
2242 #ifdef CONFIG_IP_ROUTE_CLASSID
2243 rth->dst.tclassid = itag;
2244 #endif
2245 rth->rt_route_iif = dev->ifindex;
2246 rth->rt_iif = dev->ifindex;
2247 rth->rt_oif = 0;
2248 rth->rt_mark = skb->mark;
2249 rth->rt_gateway = daddr;
2250 rth->rt_spec_dst= spec_dst;
2251 rth->rt_peer_genid = 0;
2252 rth->peer = NULL;
2253 rth->fi = NULL;
2254 if (res.type == RTN_UNREACHABLE) {
2255 rth->dst.input= ip_error;
2256 rth->dst.error= -err;
2257 rth->rt_flags &= ~RTCF_LOCAL;
2259 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2260 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2261 err = 0;
2262 if (IS_ERR(rth))
2263 err = PTR_ERR(rth);
2264 goto out;
2266 no_route:
2267 RT_CACHE_STAT_INC(in_no_route);
2268 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2269 res.type = RTN_UNREACHABLE;
2270 if (err == -ESRCH)
2271 err = -ENETUNREACH;
2272 goto local_input;
2275 * Do not cache martian addresses: they should be logged (RFC1812)
2277 martian_destination:
2278 RT_CACHE_STAT_INC(in_martian_dst);
2279 #ifdef CONFIG_IP_ROUTE_VERBOSE
2280 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2281 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2282 &daddr, &saddr, dev->name);
2283 #endif
2285 e_hostunreach:
2286 err = -EHOSTUNREACH;
2287 goto out;
2289 e_inval:
2290 err = -EINVAL;
2291 goto out;
2293 e_nobufs:
2294 err = -ENOBUFS;
2295 goto out;
2297 martian_source:
2298 err = -EINVAL;
2299 martian_source_keep_err:
2300 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2301 goto out;
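/*
 * Input routing fast path: scan the route cache bucket for a matching
 * entry (keyed on daddr/saddr/iif/tos, plus mark and netns) and attach
 * it to the skb, either refcounted or noref.  On a miss, or when
 * caching is disabled, recognise multicast destinations here and hand
 * everything else to ip_route_input_slow().
 */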
2304 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2305 u8 tos, struct net_device *dev, bool noref)
2307 struct rtable * rth;
2308 unsigned hash;
2309 int iif = dev->ifindex;
2310 struct net *net;
2311 int res;
2313 net = dev_net(dev);
2315 rcu_read_lock();
2317 if (!rt_caching(net))
2318 goto skip_cache;
2320 tos &= IPTOS_RT_MASK;
2321 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2323 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2324 rth = rcu_dereference(rth->dst.rt_next)) {
2325 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2326 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2327 (rth->rt_route_iif ^ iif) |
2328 (rth->rt_key_tos ^ tos)) == 0 &&
2329 rth->rt_mark == skb->mark &&
2330 net_eq(dev_net(rth->dst.dev), net) &&
2331 !rt_is_expired(rth)) {
2332 if (noref) {
2333 dst_use_noref(&rth->dst, jiffies);
2334 skb_dst_set_noref(skb, &rth->dst);
2335 } else {
2336 dst_use(&rth->dst, jiffies);
2337 skb_dst_set(skb, &rth->dst);
2339 RT_CACHE_STAT_INC(in_hit);
2340 rcu_read_unlock();
2341 return 0;
2343 RT_CACHE_STAT_INC(in_hlist_search);
2346 skip_cache:
2347 /* Multicast recognition logic was moved from the route cache to here.
2348 The problem was that too many Ethernet cards have broken/missing
2349 hardware multicast filters :-( As a result, a host on a multicast
2350 network acquires a lot of useless route cache entries, e.g. for
2351 SDR messages from all over the world. Now we try to get rid of them.
2352 Really, provided the software IP multicast filter is organized
2353 reasonably (at least, hashed), it does not result in a slowdown
2354 compared with route cache reject entries.
2355 Note that multicast routers are not affected, because
2356 a route cache entry is created eventually.
2358 if (ipv4_is_multicast(daddr)) {
2359 struct in_device *in_dev = __in_dev_get_rcu(dev);
2361 if (in_dev) {
2362 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2363 ip_hdr(skb)->protocol);
2364 if (our
2365 #ifdef CONFIG_IP_MROUTE
2367 (!ipv4_is_local_multicast(daddr) &&
2368 IN_DEV_MFORWARD(in_dev))
2369 #endif
2371 int res = ip_route_input_mc(skb, daddr, saddr,
2372 tos, dev, our);
2373 rcu_read_unlock();
2374 return res;
2377 rcu_read_unlock();
2378 return -EINVAL;
2380 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2381 rcu_read_unlock();
2382 return res;
2384 EXPORT_SYMBOL(ip_route_input_common);
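/*
 * Build an output route cache entry for a resolved fib_result:
 * reject invalid loopback/zeronet combinations, classify broadcast,
 * multicast and local destinations, allocate the rtable and fill in
 * its keys before rt_set_nexthop() attaches the nexthop data.
 */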
2386 /* called with rcu_read_lock() */
2387 static struct rtable *__mkroute_output(const struct fib_result *res,
2388 const struct flowi4 *fl4,
2389 __be32 orig_daddr, __be32 orig_saddr,
2390 int orig_oif, struct net_device *dev_out,
2391 unsigned int flags)
2393 struct fib_info *fi = res->fi;
2394 u32 tos = RT_FL_TOS(fl4);
2395 struct in_device *in_dev;
2396 u16 type = res->type;
2397 struct rtable *rth;
2399 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2400 return ERR_PTR(-EINVAL);
2402 if (ipv4_is_lbcast(fl4->daddr))
2403 type = RTN_BROADCAST;
2404 else if (ipv4_is_multicast(fl4->daddr))
2405 type = RTN_MULTICAST;
2406 else if (ipv4_is_zeronet(fl4->daddr))
2407 return ERR_PTR(-EINVAL);
2409 if (dev_out->flags & IFF_LOOPBACK)
2410 flags |= RTCF_LOCAL;
2412 in_dev = __in_dev_get_rcu(dev_out);
2413 if (!in_dev)
2414 return ERR_PTR(-EINVAL);
2416 if (type == RTN_BROADCAST) {
2417 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2418 fi = NULL;
2419 } else if (type == RTN_MULTICAST) {
2420 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2421 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2422 fl4->flowi4_proto))
2423 flags &= ~RTCF_LOCAL;
2424 /* If a multicast route does not exist, use the
2425 * default one, but do not use a gateway in this case.
2426 * Yes, it is a hack.
2428 if (fi && res->prefixlen < 4)
2429 fi = NULL;
2432 rth = rt_dst_alloc(dev_out,
2433 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2434 IN_DEV_CONF_GET(in_dev, NOXFRM));
2435 if (!rth)
2436 return ERR_PTR(-ENOBUFS);
2438 rth->dst.output = ip_output;
2440 rth->rt_key_dst = orig_daddr;
2441 rth->rt_key_src = orig_saddr;
2442 rth->rt_genid = rt_genid(dev_net(dev_out));
2443 rth->rt_flags = flags;
2444 rth->rt_type = type;
2445 rth->rt_key_tos = tos;
2446 rth->rt_dst = fl4->daddr;
2447 rth->rt_src = fl4->saddr;
2448 rth->rt_route_iif = 0;
2449 rth->rt_iif = orig_oif ? : dev_out->ifindex;
2450 rth->rt_oif = orig_oif;
2451 rth->rt_mark = fl4->flowi4_mark;
2452 rth->rt_gateway = fl4->daddr;
2453 rth->rt_spec_dst= fl4->saddr;
2454 rth->rt_peer_genid = 0;
2455 rth->peer = NULL;
2456 rth->fi = NULL;
2458 RT_CACHE_STAT_INC(out_slow_tot);
2460 if (flags & RTCF_LOCAL) {
2461 rth->dst.input = ip_local_deliver;
2462 rth->rt_spec_dst = fl4->daddr;
2464 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2465 rth->rt_spec_dst = fl4->saddr;
2466 if (flags & RTCF_LOCAL &&
2467 !(dev_out->flags & IFF_LOOPBACK)) {
2468 rth->dst.output = ip_mc_output;
2469 RT_CACHE_STAT_INC(out_slow_mc);
2471 #ifdef CONFIG_IP_MROUTE
2472 if (type == RTN_MULTICAST) {
2473 if (IN_DEV_MFORWARD(in_dev) &&
2474 !ipv4_is_local_multicast(fl4->daddr)) {
2475 rth->dst.input = ip_mr_input;
2476 rth->dst.output = ip_mc_output;
2479 #endif
2482 rt_set_nexthop(rth, fl4, res, fi, type, 0);
2484 return rth;
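/*
 * ip_route_output_slow() resolves an output route from scratch: it
 * validates any requested source address, handles the multicast /
 * limited-broadcast and bound-oif special cases, performs the FIB
 * lookup (falling back to an on-link assumption when a lookup with an
 * explicit oif fails), applies multipath and default-route selection,
 * and finally hands the result to __mkroute_output().
 */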
2488 * Major route resolver routine.
2489 * called with rcu_read_lock();
2492 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2494 struct net_device *dev_out = NULL;
2495 u32 tos = RT_FL_TOS(fl4);
2496 unsigned int flags = 0;
2497 struct fib_result res;
2498 struct rtable *rth;
2499 __be32 orig_daddr;
2500 __be32 orig_saddr;
2501 int orig_oif;
2503 res.fi = NULL;
2504 #ifdef CONFIG_IP_MULTIPLE_TABLES
2505 res.r = NULL;
2506 #endif
2508 orig_daddr = fl4->daddr;
2509 orig_saddr = fl4->saddr;
2510 orig_oif = fl4->flowi4_oif;
2512 fl4->flowi4_iif = net->loopback_dev->ifindex;
2513 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2514 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2515 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2517 rcu_read_lock();
2518 if (fl4->saddr) {
2519 rth = ERR_PTR(-EINVAL);
2520 if (ipv4_is_multicast(fl4->saddr) ||
2521 ipv4_is_lbcast(fl4->saddr) ||
2522 ipv4_is_zeronet(fl4->saddr))
2523 goto out;
2525 /* I removed the check for oif == dev_out->oif here.
2526 It was wrong for two reasons:
2527 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2528 is assigned to multiple interfaces.
2529 2. Moreover, we are allowed to send packets with the saddr
2530 of another iface. --ANK
2533 if (fl4->flowi4_oif == 0 &&
2534 (ipv4_is_multicast(fl4->daddr) ||
2535 ipv4_is_lbcast(fl4->daddr))) {
2536 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2537 dev_out = __ip_dev_find(net, fl4->saddr, false);
2538 if (dev_out == NULL)
2539 goto out;
2541 /* Special hack: the user can direct multicasts
2542 and limited broadcast via the necessary interface
2543 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2544 This hack is not just for fun; it allows
2545 vic, vat and friends to work.
2546 They bind a socket to loopback, set the ttl to zero
2547 and expect that it will work.
2548 From the viewpoint of the routing cache they are broken,
2549 because we are not allowed to build a multicast path
2550 with a loopback source addr (look, the routing cache
2551 cannot know that the ttl is zero, so the packet
2552 will not leave this host and the route is valid).
2553 Luckily, this hack is a good workaround.
2556 fl4->flowi4_oif = dev_out->ifindex;
2557 goto make_route;
2560 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2561 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2562 if (!__ip_dev_find(net, fl4->saddr, false))
2563 goto out;
2568 if (fl4->flowi4_oif) {
2569 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2570 rth = ERR_PTR(-ENODEV);
2571 if (dev_out == NULL)
2572 goto out;
2574 /* RACE: Check return value of inet_select_addr instead. */
2575 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2576 rth = ERR_PTR(-ENETUNREACH);
2577 goto out;
2579 if (ipv4_is_local_multicast(fl4->daddr) ||
2580 ipv4_is_lbcast(fl4->daddr)) {
2581 if (!fl4->saddr)
2582 fl4->saddr = inet_select_addr(dev_out, 0,
2583 RT_SCOPE_LINK);
2584 goto make_route;
2586 if (fl4->saddr) {
2587 if (ipv4_is_multicast(fl4->daddr))
2588 fl4->saddr = inet_select_addr(dev_out, 0,
2589 fl4->flowi4_scope);
2590 else if (!fl4->daddr)
2591 fl4->saddr = inet_select_addr(dev_out, 0,
2592 RT_SCOPE_HOST);
2596 if (!fl4->daddr) {
2597 fl4->daddr = fl4->saddr;
2598 if (!fl4->daddr)
2599 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2600 dev_out = net->loopback_dev;
2601 fl4->flowi4_oif = net->loopback_dev->ifindex;
2602 res.type = RTN_LOCAL;
2603 flags |= RTCF_LOCAL;
2604 goto make_route;
2607 if (fib_lookup(net, fl4, &res)) {
2608 res.fi = NULL;
2609 if (fl4->flowi4_oif) {
2610 /* Apparently, the routing tables are wrong. Assume
2611 that the destination is on-link.
2613 WHY? DW.
2614 Because we are allowed to send to an iface
2615 even if it has NO routes and NO assigned
2616 addresses. When oif is specified, the routing
2617 tables are looked up with only one purpose:
2618 to catch whether the destination is gatewayed, rather than
2619 direct. Moreover, if MSG_DONTROUTE is set,
2620 we send the packet, ignoring both the routing tables
2621 and the ifaddr state. --ANK
2624 We could do this even if oif is unknown,
2625 likely IPv6, but we do not.
2628 if (fl4->saddr == 0)
2629 fl4->saddr = inet_select_addr(dev_out, 0,
2630 RT_SCOPE_LINK);
2631 res.type = RTN_UNICAST;
2632 goto make_route;
2634 rth = ERR_PTR(-ENETUNREACH);
2635 goto out;
2638 if (res.type == RTN_LOCAL) {
2639 if (!fl4->saddr) {
2640 if (res.fi->fib_prefsrc)
2641 fl4->saddr = res.fi->fib_prefsrc;
2642 else
2643 fl4->saddr = fl4->daddr;
2645 dev_out = net->loopback_dev;
2646 fl4->flowi4_oif = dev_out->ifindex;
2647 res.fi = NULL;
2648 flags |= RTCF_LOCAL;
2649 goto make_route;
2652 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2653 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2654 fib_select_multipath(&res);
2655 else
2656 #endif
2657 if (!res.prefixlen &&
2658 res.table->tb_num_default > 1 &&
2659 res.type == RTN_UNICAST && !fl4->flowi4_oif)
2660 fib_select_default(&res);
2662 if (!fl4->saddr)
2663 fl4->saddr = FIB_RES_PREFSRC(net, res);
2665 dev_out = FIB_RES_DEV(res);
2666 fl4->flowi4_oif = dev_out->ifindex;
2669 make_route:
2670 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2671 dev_out, flags);
2672 if (!IS_ERR(rth)) {
2673 unsigned int hash;
2675 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2676 rt_genid(dev_net(dev_out)));
2677 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2680 out:
2681 rcu_read_unlock();
2682 return rth;
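/*
 * Output routing fast path: when caching is enabled, scan the hash
 * chain for an entry matching daddr/saddr/oif/mark/tos in this netns
 * and take a reference on it; otherwise fall through to
 * ip_route_output_slow().  On a hit, missing saddr/daddr in the flow
 * are filled in from the cached route.
 */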
2685 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2687 struct rtable *rth;
2688 unsigned int hash;
2690 if (!rt_caching(net))
2691 goto slow_output;
2693 hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2695 rcu_read_lock_bh();
2696 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2697 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2698 if (rth->rt_key_dst == flp4->daddr &&
2699 rth->rt_key_src == flp4->saddr &&
2700 rt_is_output_route(rth) &&
2701 rth->rt_oif == flp4->flowi4_oif &&
2702 rth->rt_mark == flp4->flowi4_mark &&
2703 !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2704 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2705 net_eq(dev_net(rth->dst.dev), net) &&
2706 !rt_is_expired(rth)) {
2707 dst_use(&rth->dst, jiffies);
2708 RT_CACHE_STAT_INC(out_hit);
2709 rcu_read_unlock_bh();
2710 if (!flp4->saddr)
2711 flp4->saddr = rth->rt_src;
2712 if (!flp4->daddr)
2713 flp4->daddr = rth->rt_dst;
2714 return rth;
2716 RT_CACHE_STAT_INC(out_hlist_search);
2718 rcu_read_unlock_bh();
2720 slow_output:
2721 return ip_route_output_slow(net, flp4);
2723 EXPORT_SYMBOL_GPL(__ip_route_output_key);
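/*
 * Blackhole dst_ops: a stripped-down variant whose entries never
 * validate (check returns NULL), report no MTU and ignore PMTU
 * updates.  ipv4_blackhole_route() clones an existing route into such
 * an entry with dst_discard input/output, e.g. so that the xfrm code
 * can quietly drop packets while their transforms are still being
 * resolved.
 */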
2725 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2727 return NULL;
2730 static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
2732 return 0;
2735 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2739 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2740 unsigned long old)
2742 return NULL;
2745 static struct dst_ops ipv4_dst_blackhole_ops = {
2746 .family = AF_INET,
2747 .protocol = cpu_to_be16(ETH_P_IP),
2748 .destroy = ipv4_dst_destroy,
2749 .check = ipv4_blackhole_dst_check,
2750 .default_mtu = ipv4_blackhole_default_mtu,
2751 .default_advmss = ipv4_default_advmss,
2752 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2753 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2754 .neigh_lookup = ipv4_neigh_lookup,
2757 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2759 struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2760 struct rtable *ort = (struct rtable *) dst_orig;
2762 if (rt) {
2763 struct dst_entry *new = &rt->dst;
2765 new->__use = 1;
2766 new->input = dst_discard;
2767 new->output = dst_discard;
2768 dst_copy_metrics(new, &ort->dst);
2770 new->dev = ort->dst.dev;
2771 if (new->dev)
2772 dev_hold(new->dev);
2774 rt->rt_key_dst = ort->rt_key_dst;
2775 rt->rt_key_src = ort->rt_key_src;
2776 rt->rt_key_tos = ort->rt_key_tos;
2777 rt->rt_route_iif = ort->rt_route_iif;
2778 rt->rt_iif = ort->rt_iif;
2779 rt->rt_oif = ort->rt_oif;
2780 rt->rt_mark = ort->rt_mark;
2782 rt->rt_genid = rt_genid(net);
2783 rt->rt_flags = ort->rt_flags;
2784 rt->rt_type = ort->rt_type;
2785 rt->rt_dst = ort->rt_dst;
2786 rt->rt_src = ort->rt_src;
2787 rt->rt_gateway = ort->rt_gateway;
2788 rt->rt_spec_dst = ort->rt_spec_dst;
2789 rt->peer = ort->peer;
2790 if (rt->peer)
2791 atomic_inc(&rt->peer->refcnt);
2792 rt->fi = ort->fi;
2793 if (rt->fi)
2794 atomic_inc(&rt->fi->fib_clntref);
2796 dst_free(new);
2799 dst_release(dst_orig);
2801 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
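/*
 * ip_route_output_flow(): resolve an output route via
 * __ip_route_output_key() and, when the flow carries a protocol,
 * run the result through xfrm_lookup() so IPsec policy is applied.
 */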
2804 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2805 struct sock *sk)
2807 struct rtable *rt = __ip_route_output_key(net, flp4);
2809 if (IS_ERR(rt))
2810 return rt;
2812 if (flp4->flowi4_proto)
2813 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2814 flowi4_to_flowi(flp4),
2815 sk, 0);
2817 return rt;
2819 EXPORT_SYMBOL_GPL(ip_route_output_flow);
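/*
 * Fill an RTM_NEWROUTE netlink message from a cached rtable: route
 * keys, output device, classid, preferred source, gateway, metrics,
 * mark and cache info, plus the multicast forwarding data for input
 * routes when CONFIG_IP_MROUTE is enabled.
 */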
2821 static int rt_fill_info(struct net *net,
2822 struct sk_buff *skb, u32 pid, u32 seq, int event,
2823 int nowait, unsigned int flags)
2825 struct rtable *rt = skb_rtable(skb);
2826 struct rtmsg *r;
2827 struct nlmsghdr *nlh;
2828 long expires = 0;
2829 const struct inet_peer *peer = rt->peer;
2830 u32 id = 0, ts = 0, tsage = 0, error;
2832 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2833 if (nlh == NULL)
2834 return -EMSGSIZE;
2836 r = nlmsg_data(nlh);
2837 r->rtm_family = AF_INET;
2838 r->rtm_dst_len = 32;
2839 r->rtm_src_len = 0;
2840 r->rtm_tos = rt->rt_key_tos;
2841 r->rtm_table = RT_TABLE_MAIN;
2842 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2843 r->rtm_type = rt->rt_type;
2844 r->rtm_scope = RT_SCOPE_UNIVERSE;
2845 r->rtm_protocol = RTPROT_UNSPEC;
2846 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2847 if (rt->rt_flags & RTCF_NOTIFY)
2848 r->rtm_flags |= RTM_F_NOTIFY;
2850 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2852 if (rt->rt_key_src) {
2853 r->rtm_src_len = 32;
2854 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2856 if (rt->dst.dev)
2857 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2858 #ifdef CONFIG_IP_ROUTE_CLASSID
2859 if (rt->dst.tclassid)
2860 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2861 #endif
2862 if (rt_is_input_route(rt))
2863 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2864 else if (rt->rt_src != rt->rt_key_src)
2865 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2867 if (rt->rt_dst != rt->rt_gateway)
2868 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2870 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2871 goto nla_put_failure;
2873 if (rt->rt_mark)
2874 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
2876 error = rt->dst.error;
2877 if (peer) {
2878 inet_peer_refcheck(rt->peer);
2879 id = atomic_read(&peer->ip_id_count) & 0xffff;
2880 if (peer->tcp_ts_stamp) {
2881 ts = peer->tcp_ts;
2882 tsage = get_seconds() - peer->tcp_ts_stamp;
2884 expires = ACCESS_ONCE(peer->pmtu_expires);
2885 if (expires)
2886 expires -= jiffies;
2889 if (rt_is_input_route(rt)) {
2890 #ifdef CONFIG_IP_MROUTE
2891 __be32 dst = rt->rt_dst;
2893 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2894 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2895 int err = ipmr_get_route(net, skb,
2896 rt->rt_src, rt->rt_dst,
2897 r, nowait);
2898 if (err <= 0) {
2899 if (!nowait) {
2900 if (err == 0)
2901 return 0;
2902 goto nla_put_failure;
2903 } else {
2904 if (err == -EMSGSIZE)
2905 goto nla_put_failure;
2906 error = err;
2909 } else
2910 #endif
2911 NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
2914 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2915 expires, error) < 0)
2916 goto nla_put_failure;
2918 return nlmsg_end(skb, nlh);
2920 nla_put_failure:
2921 nlmsg_cancel(skb, nlh);
2922 return -EMSGSIZE;
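/*
 * RTM_GETROUTE handler: build a dummy skb, resolve the route with
 * ip_route_input() when RTA_IIF is given or ip_route_output_key()
 * otherwise, and unicast the result back via rt_fill_info().  This is
 * what e.g. "ip route get <addr>" ends up exercising from user space.
 */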
2925 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2927 struct net *net = sock_net(in_skb->sk);
2928 struct rtmsg *rtm;
2929 struct nlattr *tb[RTA_MAX+1];
2930 struct rtable *rt = NULL;
2931 __be32 dst = 0;
2932 __be32 src = 0;
2933 u32 iif;
2934 int err;
2935 int mark;
2936 struct sk_buff *skb;
2938 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2939 if (err < 0)
2940 goto errout;
2942 rtm = nlmsg_data(nlh);
2944 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2945 if (skb == NULL) {
2946 err = -ENOBUFS;
2947 goto errout;
2950 /* Reserve room for dummy headers; this skb can pass
2951 through a good chunk of the routing engine.
2953 skb_reset_mac_header(skb);
2954 skb_reset_network_header(skb);
2956 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2957 ip_hdr(skb)->protocol = IPPROTO_ICMP;
2958 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2960 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2961 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2962 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2963 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2965 if (iif) {
2966 struct net_device *dev;
2968 dev = __dev_get_by_index(net, iif);
2969 if (dev == NULL) {
2970 err = -ENODEV;
2971 goto errout_free;
2974 skb->protocol = htons(ETH_P_IP);
2975 skb->dev = dev;
2976 skb->mark = mark;
2977 local_bh_disable();
2978 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2979 local_bh_enable();
2981 rt = skb_rtable(skb);
2982 if (err == 0 && rt->dst.error)
2983 err = -rt->dst.error;
2984 } else {
2985 struct flowi4 fl4 = {
2986 .daddr = dst,
2987 .saddr = src,
2988 .flowi4_tos = rtm->rtm_tos,
2989 .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2990 .flowi4_mark = mark,
2992 rt = ip_route_output_key(net, &fl4);
2994 err = 0;
2995 if (IS_ERR(rt))
2996 err = PTR_ERR(rt);
2999 if (err)
3000 goto errout_free;
3002 skb_dst_set(skb, &rt->dst);
3003 if (rtm->rtm_flags & RTM_F_NOTIFY)
3004 rt->rt_flags |= RTCF_NOTIFY;
3006 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3007 RTM_NEWROUTE, 0, 0);
3008 if (err <= 0)
3009 goto errout_free;
3011 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3012 errout:
3013 return err;
3015 errout_free:
3016 kfree_skb(skb);
3017 goto errout;
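/*
 * Dump the route cache over netlink: walk every hash bucket from the
 * position saved in cb->args, skip entries from other namespaces and
 * expired entries, and emit each remaining one as an RTM_NEWROUTE
 * multipart message.
 */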
3020 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3022 struct rtable *rt;
3023 int h, s_h;
3024 int idx, s_idx;
3025 struct net *net;
3027 net = sock_net(skb->sk);
3029 s_h = cb->args[0];
3030 if (s_h < 0)
3031 s_h = 0;
3032 s_idx = idx = cb->args[1];
3033 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3034 if (!rt_hash_table[h].chain)
3035 continue;
3036 rcu_read_lock_bh();
3037 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3038 rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3039 if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3040 continue;
3041 if (rt_is_expired(rt))
3042 continue;
3043 skb_dst_set_noref(skb, &rt->dst);
3044 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3045 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3046 1, NLM_F_MULTI) <= 0) {
3047 skb_dst_drop(skb);
3048 rcu_read_unlock_bh();
3049 goto done;
3051 skb_dst_drop(skb);
3053 rcu_read_unlock_bh();
3056 done:
3057 cb->args[0] = h;
3058 cb->args[1] = idx;
3059 return skb->len;
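/*
 * Called when the multicast configuration of an interface changes;
 * simply flushes the route cache for the device's namespace.
 */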
3062 void ip_rt_multicast_event(struct in_device *in_dev)
3064 rt_cache_flush(dev_net(in_dev->dev), 0);
3067 #ifdef CONFIG_SYSCTL
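/*
 * Handler for /proc/sys/net/ipv4/route/flush.  Writes only: the value
 * written is parsed as an integer and passed to rt_cache_flush() as
 * the flush delay, so e.g. "echo 0 > /proc/sys/net/ipv4/route/flush"
 * flushes the cache right away.  Reads return -EINVAL.
 */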
3068 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3069 void __user *buffer,
3070 size_t *lenp, loff_t *ppos)
3072 if (write) {
3073 int flush_delay;
3074 ctl_table ctl;
3075 struct net *net;
3077 memcpy(&ctl, __ctl, sizeof(ctl));
3078 ctl.data = &flush_delay;
3079 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3081 net = (struct net *)__ctl->extra1;
3082 rt_cache_flush(net, flush_delay);
3083 return 0;
3086 return -EINVAL;
3089 static ctl_table ipv4_route_table[] = {
3091 .procname = "gc_thresh",
3092 .data = &ipv4_dst_ops.gc_thresh,
3093 .maxlen = sizeof(int),
3094 .mode = 0644,
3095 .proc_handler = proc_dointvec,
3098 .procname = "max_size",
3099 .data = &ip_rt_max_size,
3100 .maxlen = sizeof(int),
3101 .mode = 0644,
3102 .proc_handler = proc_dointvec,
3105 /* Deprecated. Use gc_min_interval_ms */
3107 .procname = "gc_min_interval",
3108 .data = &ip_rt_gc_min_interval,
3109 .maxlen = sizeof(int),
3110 .mode = 0644,
3111 .proc_handler = proc_dointvec_jiffies,
3114 .procname = "gc_min_interval_ms",
3115 .data = &ip_rt_gc_min_interval,
3116 .maxlen = sizeof(int),
3117 .mode = 0644,
3118 .proc_handler = proc_dointvec_ms_jiffies,
3121 .procname = "gc_timeout",
3122 .data = &ip_rt_gc_timeout,
3123 .maxlen = sizeof(int),
3124 .mode = 0644,
3125 .proc_handler = proc_dointvec_jiffies,
3128 .procname = "gc_interval",
3129 .data = &ip_rt_gc_interval,
3130 .maxlen = sizeof(int),
3131 .mode = 0644,
3132 .proc_handler = proc_dointvec_jiffies,
3135 .procname = "redirect_load",
3136 .data = &ip_rt_redirect_load,
3137 .maxlen = sizeof(int),
3138 .mode = 0644,
3139 .proc_handler = proc_dointvec,
3142 .procname = "redirect_number",
3143 .data = &ip_rt_redirect_number,
3144 .maxlen = sizeof(int),
3145 .mode = 0644,
3146 .proc_handler = proc_dointvec,
3149 .procname = "redirect_silence",
3150 .data = &ip_rt_redirect_silence,
3151 .maxlen = sizeof(int),
3152 .mode = 0644,
3153 .proc_handler = proc_dointvec,
3156 .procname = "error_cost",
3157 .data = &ip_rt_error_cost,
3158 .maxlen = sizeof(int),
3159 .mode = 0644,
3160 .proc_handler = proc_dointvec,
3163 .procname = "error_burst",
3164 .data = &ip_rt_error_burst,
3165 .maxlen = sizeof(int),
3166 .mode = 0644,
3167 .proc_handler = proc_dointvec,
3170 .procname = "gc_elasticity",
3171 .data = &ip_rt_gc_elasticity,
3172 .maxlen = sizeof(int),
3173 .mode = 0644,
3174 .proc_handler = proc_dointvec,
3177 .procname = "mtu_expires",
3178 .data = &ip_rt_mtu_expires,
3179 .maxlen = sizeof(int),
3180 .mode = 0644,
3181 .proc_handler = proc_dointvec_jiffies,
3184 .procname = "min_pmtu",
3185 .data = &ip_rt_min_pmtu,
3186 .maxlen = sizeof(int),
3187 .mode = 0644,
3188 .proc_handler = proc_dointvec,
3191 .procname = "min_adv_mss",
3192 .data = &ip_rt_min_advmss,
3193 .maxlen = sizeof(int),
3194 .mode = 0644,
3195 .proc_handler = proc_dointvec,
3200 static struct ctl_table empty[1];
3202 static struct ctl_table ipv4_skeleton[] =
3204 { .procname = "route",
3205 .mode = 0555, .child = ipv4_route_table},
3206 { .procname = "neigh",
3207 .mode = 0555, .child = empty},
3211 static __net_initdata struct ctl_path ipv4_path[] = {
3212 { .procname = "net", },
3213 { .procname = "ipv4", },
3214 { },
3217 static struct ctl_table ipv4_route_flush_table[] = {
3219 .procname = "flush",
3220 .maxlen = sizeof(int),
3221 .mode = 0200,
3222 .proc_handler = ipv4_sysctl_rtcache_flush,
3224 { },
3227 static __net_initdata struct ctl_path ipv4_route_path[] = {
3228 { .procname = "net", },
3229 { .procname = "ipv4", },
3230 { .procname = "route", },
3231 { },
3234 static __net_init int sysctl_route_net_init(struct net *net)
3236 struct ctl_table *tbl;
3238 tbl = ipv4_route_flush_table;
3239 if (!net_eq(net, &init_net)) {
3240 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3241 if (tbl == NULL)
3242 goto err_dup;
3244 tbl[0].extra1 = net;
3246 net->ipv4.route_hdr =
3247 register_net_sysctl_table(net, ipv4_route_path, tbl);
3248 if (net->ipv4.route_hdr == NULL)
3249 goto err_reg;
3250 return 0;
3252 err_reg:
3253 if (tbl != ipv4_route_flush_table)
3254 kfree(tbl);
3255 err_dup:
3256 return -ENOMEM;
3259 static __net_exit void sysctl_route_net_exit(struct net *net)
3261 struct ctl_table *tbl;
3263 tbl = net->ipv4.route_hdr->ctl_table_arg;
3264 unregister_net_sysctl_table(net->ipv4.route_hdr);
3265 BUG_ON(tbl == ipv4_route_flush_table);
3266 kfree(tbl);
3269 static __net_initdata struct pernet_operations sysctl_route_ops = {
3270 .init = sysctl_route_net_init,
3271 .exit = sysctl_route_net_exit,
3273 #endif
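/*
 * Per-namespace init: seed rt_genid and dev_addr_genid with random
 * values.  rt_genid is the generation counter that cached routes are
 * compared against in rt_is_expired(), so bumping it invalidates the
 * whole cache for that namespace.
 */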
3275 static __net_init int rt_genid_init(struct net *net)
3277 get_random_bytes(&net->ipv4.rt_genid,
3278 sizeof(net->ipv4.rt_genid));
3279 get_random_bytes(&net->ipv4.dev_addr_genid,
3280 sizeof(net->ipv4.dev_addr_genid));
3281 return 0;
3284 static __net_initdata struct pernet_operations rt_genid_ops = {
3285 .init = rt_genid_init,
3289 #ifdef CONFIG_IP_ROUTE_CLASSID
3290 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3291 #endif /* CONFIG_IP_ROUTE_CLASSID */
3293 static __initdata unsigned long rhash_entries;
3294 static int __init set_rhash_entries(char *str)
3296 if (!str)
3297 return 0;
3298 rhash_entries = simple_strtoul(str, &str, 0);
3299 return 1;
3301 __setup("rhash_entries=", set_rhash_entries);
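/*
 * ip_rt_init(): boot-time setup of the routing subsystem.  Allocates
 * the per-cpu classid accounting (if configured) and the dst slab,
 * sizes the route cache hash with alloc_large_system_hash() (the
 * "rhash_entries=" boot parameter, e.g. rhash_entries=262144, overrides
 * the automatic sizing), derives gc_thresh and ip_rt_max_size from the
 * hash size, and registers the proc files, the RTM_GETROUTE handler
 * and the per-net sysctl/genid operations.
 */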
3303 int __init ip_rt_init(void)
3305 int rc = 0;
3307 #ifdef CONFIG_IP_ROUTE_CLASSID
3308 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3309 if (!ip_rt_acct)
3310 panic("IP: failed to allocate ip_rt_acct\n");
3311 #endif
3313 ipv4_dst_ops.kmem_cachep =
3314 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3315 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3317 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3319 if (dst_entries_init(&ipv4_dst_ops) < 0)
3320 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3322 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3323 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3325 rt_hash_table = (struct rt_hash_bucket *)
3326 alloc_large_system_hash("IP route cache",
3327 sizeof(struct rt_hash_bucket),
3328 rhash_entries,
3329 (totalram_pages >= 128 * 1024) ?
3330 15 : 17,
3332 &rt_hash_log,
3333 &rt_hash_mask,
3334 rhash_entries ? 0 : 512 * 1024);
3335 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3336 rt_hash_lock_init();
3338 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3339 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3341 devinet_init();
3342 ip_fib_init();
3344 if (ip_rt_proc_init())
3345 printk(KERN_ERR "Unable to create route proc files\n");
3346 #ifdef CONFIG_XFRM
3347 xfrm_init();
3348 xfrm4_init(ip_rt_max_size);
3349 #endif
3350 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3352 #ifdef CONFIG_SYSCTL
3353 register_pernet_subsys(&sysctl_route_ops);
3354 #endif
3355 register_pernet_subsys(&rt_genid_ops);
3356 return rc;
3359 #ifdef CONFIG_SYSCTL
3361 * We really need to sanitize the damn ipv4 init order, then all
3362 * this nonsense will go away.
3364 void __init ip_static_sysctl_init(void)
3366 register_sysctl_paths(ipv4_path, ipv4_skeleton);
3368 #endif