net/ipv4/route.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <net/dst.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/netevent.h>
107 #include <net/rtnetlink.h>
108 #ifdef CONFIG_SYSCTL
109 #include <linux/sysctl.h>
110 #endif
112 #define RT_FL_TOS(oldflp) \
113 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
115 #define IP_MAX_MTU 0xFFF0
117 #define RT_GC_TIMEOUT (300*HZ)
119 static int ip_rt_max_size;
120 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
121 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
122 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
123 static int ip_rt_redirect_number __read_mostly = 9;
124 static int ip_rt_redirect_load __read_mostly = HZ / 50;
125 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
126 static int ip_rt_error_cost __read_mostly = HZ;
127 static int ip_rt_error_burst __read_mostly = 5 * HZ;
128 static int ip_rt_gc_elasticity __read_mostly = 8;
129 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
130 static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
131 static int ip_rt_min_advmss __read_mostly = 256;
132 static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ;
133 static int rt_chain_length_max __read_mostly = 20;
135 static struct delayed_work expires_work;
136 static unsigned long expires_ljiffies;
139 * Interface to generic destination cache.
142 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
143 static void ipv4_dst_destroy(struct dst_entry *dst);
144 static void ipv4_dst_ifdown(struct dst_entry *dst,
145 struct net_device *dev, int how);
146 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
147 static void ipv4_link_failure(struct sk_buff *skb);
148 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
149 static int rt_garbage_collect(struct dst_ops *ops);
152 static struct dst_ops ipv4_dst_ops = {
153 .family = AF_INET,
154 .protocol = cpu_to_be16(ETH_P_IP),
155 .gc = rt_garbage_collect,
156 .check = ipv4_dst_check,
157 .destroy = ipv4_dst_destroy,
158 .ifdown = ipv4_dst_ifdown,
159 .negative_advice = ipv4_negative_advice,
160 .link_failure = ipv4_link_failure,
161 .update_pmtu = ip_rt_update_pmtu,
162 .local_out = __ip_local_out,
163 .entries = ATOMIC_INIT(0),
166 #define ECN_OR_COST(class) TC_PRIO_##class
168 const __u8 ip_tos2prio[16] = {
169 TC_PRIO_BESTEFFORT,
170 ECN_OR_COST(FILLER),
171 TC_PRIO_BESTEFFORT,
172 ECN_OR_COST(BESTEFFORT),
173 TC_PRIO_BULK,
174 ECN_OR_COST(BULK),
175 TC_PRIO_BULK,
176 ECN_OR_COST(BULK),
177 TC_PRIO_INTERACTIVE,
178 ECN_OR_COST(INTERACTIVE),
179 TC_PRIO_INTERACTIVE,
180 ECN_OR_COST(INTERACTIVE),
181 TC_PRIO_INTERACTIVE_BULK,
182 ECN_OR_COST(INTERACTIVE_BULK),
183 TC_PRIO_INTERACTIVE_BULK,
184 ECN_OR_COST(INTERACTIVE_BULK)
189 * Route cache.
192 /* The locking scheme is rather straightforward:
194 * 1) Read-Copy Update protects the buckets of the central route hash.
195 * 2) Only writers remove entries, and they hold the lock
196 * as they look at rtable reference counts.
197 * 3) Only readers acquire references to rtable entries,
198 * they do so with atomic increments and with the
199 * lock held.
202 struct rt_hash_bucket {
203 struct rtable *chain;
206 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
207 defined(CONFIG_PROVE_LOCKING)
209 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
210 * The size of this table is a power of two and depends on the number of CPUs.
211 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
213 #ifdef CONFIG_LOCKDEP
214 # define RT_HASH_LOCK_SZ 256
215 #else
216 # if NR_CPUS >= 32
217 # define RT_HASH_LOCK_SZ 4096
218 # elif NR_CPUS >= 16
219 # define RT_HASH_LOCK_SZ 2048
220 # elif NR_CPUS >= 8
221 # define RT_HASH_LOCK_SZ 1024
222 # elif NR_CPUS >= 4
223 # define RT_HASH_LOCK_SZ 512
224 # else
225 # define RT_HASH_LOCK_SZ 256
226 # endif
227 #endif
229 static spinlock_t *rt_hash_locks;
230 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
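/*
 * Example of the lock striping above: a bucket index is folded into the
 * smaller, power-of-two lock table, so several hash buckets may share one
 * spinlock.  With RT_HASH_LOCK_SZ == 256, buckets 5 and 261 both map to
 * rt_hash_locks[5]; writers serialize per stripe while readers keep
 * traversing the chains locklessly under RCU.
 */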
232 static __init void rt_hash_lock_init(void)
234 int i;
236 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
237 GFP_KERNEL);
238 if (!rt_hash_locks)
239 panic("IP: failed to allocate rt_hash_locks\n");
241 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
242 spin_lock_init(&rt_hash_locks[i]);
244 #else
245 # define rt_hash_lock_addr(slot) NULL
247 static inline void rt_hash_lock_init(void)
250 #endif
252 static struct rt_hash_bucket *rt_hash_table __read_mostly;
253 static unsigned rt_hash_mask __read_mostly;
254 static unsigned int rt_hash_log __read_mostly;
256 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
257 #define RT_CACHE_STAT_INC(field) \
258 (__raw_get_cpu_var(rt_cache_stat).field++)
260 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
261 int genid)
263 return jhash_3words((__force u32)(__be32)(daddr),
264 (__force u32)(__be32)(saddr),
265 idx, genid)
266 & rt_hash_mask;
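/*
 * Note that the per-namespace generation id is mixed into the hash along
 * with the addresses and the interface index.  Bumping rt_genid (see
 * rt_cache_invalidate() below) therefore makes existing entries generally
 * hash to different buckets and, in any case, fail the rt_is_expired()
 * check, which is how the whole cache is invalidated without walking the
 * hash table.
 */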
269 static inline int rt_genid(struct net *net)
271 return atomic_read(&net->ipv4.rt_genid);
274 #ifdef CONFIG_PROC_FS
275 struct rt_cache_iter_state {
276 struct seq_net_private p;
277 int bucket;
278 int genid;
281 static struct rtable *rt_cache_get_first(struct seq_file *seq)
283 struct rt_cache_iter_state *st = seq->private;
284 struct rtable *r = NULL;
286 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
287 if (!rt_hash_table[st->bucket].chain)
288 continue;
289 rcu_read_lock_bh();
290 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
291 while (r) {
292 if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
293 r->rt_genid == st->genid)
294 return r;
295 r = rcu_dereference_bh(r->u.dst.rt_next);
297 rcu_read_unlock_bh();
299 return r;
302 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
303 struct rtable *r)
305 struct rt_cache_iter_state *st = seq->private;
307 r = r->u.dst.rt_next;
308 while (!r) {
309 rcu_read_unlock_bh();
310 do {
311 if (--st->bucket < 0)
312 return NULL;
313 } while (!rt_hash_table[st->bucket].chain);
314 rcu_read_lock_bh();
315 r = rt_hash_table[st->bucket].chain;
317 return rcu_dereference_bh(r);
320 static struct rtable *rt_cache_get_next(struct seq_file *seq,
321 struct rtable *r)
323 struct rt_cache_iter_state *st = seq->private;
324 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
325 if (dev_net(r->u.dst.dev) != seq_file_net(seq))
326 continue;
327 if (r->rt_genid == st->genid)
328 break;
330 return r;
333 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
335 struct rtable *r = rt_cache_get_first(seq);
337 if (r)
338 while (pos && (r = rt_cache_get_next(seq, r)))
339 --pos;
340 return pos ? NULL : r;
343 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
345 struct rt_cache_iter_state *st = seq->private;
346 if (*pos)
347 return rt_cache_get_idx(seq, *pos - 1);
348 st->genid = rt_genid(seq_file_net(seq));
349 return SEQ_START_TOKEN;
352 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
354 struct rtable *r;
356 if (v == SEQ_START_TOKEN)
357 r = rt_cache_get_first(seq);
358 else
359 r = rt_cache_get_next(seq, v);
360 ++*pos;
361 return r;
364 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
366 if (v && v != SEQ_START_TOKEN)
367 rcu_read_unlock_bh();
370 static int rt_cache_seq_show(struct seq_file *seq, void *v)
372 if (v == SEQ_START_TOKEN)
373 seq_printf(seq, "%-127s\n",
374 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
375 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
376 "HHUptod\tSpecDst");
377 else {
378 struct rtable *r = v;
379 int len;
381 seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
382 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
383 r->u.dst.dev ? r->u.dst.dev->name : "*",
384 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
385 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
386 r->u.dst.__use, 0, (unsigned long)r->rt_src,
387 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
388 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
389 dst_metric(&r->u.dst, RTAX_WINDOW),
390 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
391 dst_metric(&r->u.dst, RTAX_RTTVAR)),
392 r->fl.fl4_tos,
393 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
394 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
395 dev_queue_xmit) : 0,
396 r->rt_spec_dst, &len);
398 seq_printf(seq, "%*s\n", 127 - len, "");
400 return 0;
403 static const struct seq_operations rt_cache_seq_ops = {
404 .start = rt_cache_seq_start,
405 .next = rt_cache_seq_next,
406 .stop = rt_cache_seq_stop,
407 .show = rt_cache_seq_show,
410 static int rt_cache_seq_open(struct inode *inode, struct file *file)
412 return seq_open_net(inode, file, &rt_cache_seq_ops,
413 sizeof(struct rt_cache_iter_state));
416 static const struct file_operations rt_cache_seq_fops = {
417 .owner = THIS_MODULE,
418 .open = rt_cache_seq_open,
419 .read = seq_read,
420 .llseek = seq_lseek,
421 .release = seq_release_net,
425 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
427 int cpu;
429 if (*pos == 0)
430 return SEQ_START_TOKEN;
432 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
433 if (!cpu_possible(cpu))
434 continue;
435 *pos = cpu+1;
436 return &per_cpu(rt_cache_stat, cpu);
438 return NULL;
441 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
443 int cpu;
445 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
446 if (!cpu_possible(cpu))
447 continue;
448 *pos = cpu+1;
449 return &per_cpu(rt_cache_stat, cpu);
451 return NULL;
455 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
460 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
462 struct rt_cache_stat *st = v;
464 if (v == SEQ_START_TOKEN) {
465 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
466 return 0;
469 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
470 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
471 atomic_read(&ipv4_dst_ops.entries),
472 st->in_hit,
473 st->in_slow_tot,
474 st->in_slow_mc,
475 st->in_no_route,
476 st->in_brd,
477 st->in_martian_dst,
478 st->in_martian_src,
480 st->out_hit,
481 st->out_slow_tot,
482 st->out_slow_mc,
484 st->gc_total,
485 st->gc_ignored,
486 st->gc_goal_miss,
487 st->gc_dst_overflow,
488 st->in_hlist_search,
489 st->out_hlist_search
491 return 0;
494 static const struct seq_operations rt_cpu_seq_ops = {
495 .start = rt_cpu_seq_start,
496 .next = rt_cpu_seq_next,
497 .stop = rt_cpu_seq_stop,
498 .show = rt_cpu_seq_show,
502 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
504 return seq_open(file, &rt_cpu_seq_ops);
507 static const struct file_operations rt_cpu_seq_fops = {
508 .owner = THIS_MODULE,
509 .open = rt_cpu_seq_open,
510 .read = seq_read,
511 .llseek = seq_lseek,
512 .release = seq_release,
515 #ifdef CONFIG_NET_CLS_ROUTE
516 static int rt_acct_proc_show(struct seq_file *m, void *v)
518 struct ip_rt_acct *dst, *src;
519 unsigned int i, j;
521 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
522 if (!dst)
523 return -ENOMEM;
525 for_each_possible_cpu(i) {
526 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
527 for (j = 0; j < 256; j++) {
528 dst[j].o_bytes += src[j].o_bytes;
529 dst[j].o_packets += src[j].o_packets;
530 dst[j].i_bytes += src[j].i_bytes;
531 dst[j].i_packets += src[j].i_packets;
535 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
536 kfree(dst);
537 return 0;
540 static int rt_acct_proc_open(struct inode *inode, struct file *file)
542 return single_open(file, rt_acct_proc_show, NULL);
545 static const struct file_operations rt_acct_proc_fops = {
546 .owner = THIS_MODULE,
547 .open = rt_acct_proc_open,
548 .read = seq_read,
549 .llseek = seq_lseek,
550 .release = single_release,
552 #endif
554 static int __net_init ip_rt_do_proc_init(struct net *net)
556 struct proc_dir_entry *pde;
558 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
559 &rt_cache_seq_fops);
560 if (!pde)
561 goto err1;
563 pde = proc_create("rt_cache", S_IRUGO,
564 net->proc_net_stat, &rt_cpu_seq_fops);
565 if (!pde)
566 goto err2;
568 #ifdef CONFIG_NET_CLS_ROUTE
569 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
570 if (!pde)
571 goto err3;
572 #endif
573 return 0;
575 #ifdef CONFIG_NET_CLS_ROUTE
576 err3:
577 remove_proc_entry("rt_cache", net->proc_net_stat);
578 #endif
579 err2:
580 remove_proc_entry("rt_cache", net->proc_net);
581 err1:
582 return -ENOMEM;
585 static void __net_exit ip_rt_do_proc_exit(struct net *net)
587 remove_proc_entry("rt_cache", net->proc_net_stat);
588 remove_proc_entry("rt_cache", net->proc_net);
589 #ifdef CONFIG_NET_CLS_ROUTE
590 remove_proc_entry("rt_acct", net->proc_net);
591 #endif
594 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
595 .init = ip_rt_do_proc_init,
596 .exit = ip_rt_do_proc_exit,
599 static int __init ip_rt_proc_init(void)
601 return register_pernet_subsys(&ip_rt_proc_ops);
604 #else
605 static inline int ip_rt_proc_init(void)
607 return 0;
609 #endif /* CONFIG_PROC_FS */
611 static inline void rt_free(struct rtable *rt)
613 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
616 static inline void rt_drop(struct rtable *rt)
618 ip_rt_put(rt);
619 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
622 static inline int rt_fast_clean(struct rtable *rth)
624 /* Kill broadcast/multicast entries very aggressively, if they
625 collide in the hash table with more useful entries */
626 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
627 rth->fl.iif && rth->u.dst.rt_next;
630 static inline int rt_valuable(struct rtable *rth)
632 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
633 rth->u.dst.expires;
636 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
638 unsigned long age;
639 int ret = 0;
641 if (atomic_read(&rth->u.dst.__refcnt))
642 goto out;
644 ret = 1;
645 if (rth->u.dst.expires &&
646 time_after_eq(jiffies, rth->u.dst.expires))
647 goto out;
649 age = jiffies - rth->u.dst.lastuse;
650 ret = 0;
651 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
652 (age <= tmo2 && rt_valuable(rth)))
653 goto out;
654 ret = 1;
655 out: return ret;
658 /* Bits of score are:
659 * 31: very valuable
660 * 30: not quite useless
661 * 29..0: usage counter
663 static inline u32 rt_score(struct rtable *rt)
665 u32 score = jiffies - rt->u.dst.lastuse;
667 score = ~score & ~(3<<30);
669 if (rt_valuable(rt))
670 score |= (1<<31);
672 if (!rt->fl.iif ||
673 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
674 score |= (1<<30);
676 return score;
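/*
 * Worked example: an unreferenced output-route entry last used 10 jiffies
 * ago gets a usage field of ~10 & ~(3<<30), i.e. close to 0x3FFFFFFF,
 * plus bit 30 for being an output (or non-broadcast/multicast/local)
 * route; a "valuable" entry (redirected, notify, or with an expiry set)
 * additionally gets bit 31.  rt_intern_hash() picks the unreferenced
 * entry with the lowest score as the eviction candidate when a chain
 * grows too long, so old, input-only, non-valuable entries go first.
 */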
679 static inline bool rt_caching(const struct net *net)
681 return net->ipv4.current_rt_cache_rebuild_count <=
682 net->ipv4.sysctl_rt_cache_rebuild_count;
685 static inline bool compare_hash_inputs(const struct flowi *fl1,
686 const struct flowi *fl2)
688 return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
689 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
690 (fl1->iif ^ fl2->iif)) == 0);
693 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
695 return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
696 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
697 (fl1->mark ^ fl2->mark) |
698 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
699 *(u16 *)&fl2->nl_u.ip4_u.tos) |
700 (fl1->oif ^ fl2->oif) |
701 (fl1->iif ^ fl2->iif)) == 0;
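/*
 * Both helpers above use the same branch-free idiom: each pair of fields
 * is XORed (zero only when equal) and the per-field results are ORed
 * together, so the whole expression is zero exactly when every field
 * matches.  compare_hash_inputs() only looks at daddr, saddr and iif,
 * while compare_keys() compares the full flow key including mark, tos,
 * oif and iif.
 */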
704 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
706 return net_eq(dev_net(rt1->u.dst.dev), dev_net(rt2->u.dst.dev));
709 static inline int rt_is_expired(struct rtable *rth)
711 return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
715 * Perform a full scan of hash table and free all entries.
716 * Can be called by a softirq or a process.
717 * In the latter case, we want to be rescheduled if necessary
719 static void rt_do_flush(int process_context)
721 unsigned int i;
722 struct rtable *rth, *next;
723 struct rtable * tail;
725 for (i = 0; i <= rt_hash_mask; i++) {
726 if (process_context && need_resched())
727 cond_resched();
728 rth = rt_hash_table[i].chain;
729 if (!rth)
730 continue;
732 spin_lock_bh(rt_hash_lock_addr(i));
733 #ifdef CONFIG_NET_NS
735 struct rtable ** prev, * p;
737 rth = rt_hash_table[i].chain;
739 /* defer releasing the head of the list after spin_unlock */
740 for (tail = rth; tail; tail = tail->u.dst.rt_next)
741 if (!rt_is_expired(tail))
742 break;
743 if (rth != tail)
744 rt_hash_table[i].chain = tail;
746 /* call rt_free on entries after the tail requiring flush */
747 prev = &rt_hash_table[i].chain;
748 for (p = *prev; p; p = next) {
749 next = p->u.dst.rt_next;
750 if (!rt_is_expired(p)) {
751 prev = &p->u.dst.rt_next;
752 } else {
753 *prev = next;
754 rt_free(p);
758 #else
759 rth = rt_hash_table[i].chain;
760 rt_hash_table[i].chain = NULL;
761 tail = NULL;
762 #endif
763 spin_unlock_bh(rt_hash_lock_addr(i));
765 for (; rth != tail; rth = next) {
766 next = rth->u.dst.rt_next;
767 rt_free(rth);
773 * While freeing expired entries, we compute average chain length
774 * and standard deviation, using fixed-point arithmetic.
775 * This is to have an estimation of rt_chain_length_max:
776 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
777 * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
780 #define FRACT_BITS 3
781 #define ONE (1UL << FRACT_BITS)
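/*
 * Example of this fixed-point bookkeeping (see rt_check_expire() below):
 * each counted entry contributes ONE (= 8) to its chain's length, so a
 * scan over four chains holding 1, 2, 2 and 3 counted entries gives
 * sum = 64 and sum2 = 1152.  Then avg = 64/4 = 16 (2.0 entries),
 * sd = int_sqrt(1152/4 - 16*16) = 5, and (avg + 4*sd) >> FRACT_BITS =
 * 36 >> 3 = 4, so rt_chain_length_max stays clamped at
 * ip_rt_gc_elasticity (8) in this case.
 */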
784 * Given a hash chain and an item in this hash chain,
785 * find if a previous entry has the same hash_inputs
786 * (but differs on tos, mark or oif)
787 * Returns 0 if an alias is found.
788 * Returns ONE if rth has no alias before itself.
790 static int has_noalias(const struct rtable *head, const struct rtable *rth)
792 const struct rtable *aux = head;
794 while (aux != rth) {
795 if (compare_hash_inputs(&aux->fl, &rth->fl))
796 return 0;
797 aux = aux->u.dst.rt_next;
799 return ONE;
802 static void rt_check_expire(void)
804 static unsigned int rover;
805 unsigned int i = rover, goal;
806 struct rtable *rth, **rthp;
807 unsigned long samples = 0;
808 unsigned long sum = 0, sum2 = 0;
809 unsigned long delta;
810 u64 mult;
812 delta = jiffies - expires_ljiffies;
813 expires_ljiffies = jiffies;
814 mult = ((u64)delta) << rt_hash_log;
815 if (ip_rt_gc_timeout > 1)
816 do_div(mult, ip_rt_gc_timeout);
817 goal = (unsigned int)mult;
818 if (goal > rt_hash_mask)
819 goal = rt_hash_mask + 1;
820 for (; goal > 0; goal--) {
821 unsigned long tmo = ip_rt_gc_timeout;
822 unsigned long length;
824 i = (i + 1) & rt_hash_mask;
825 rthp = &rt_hash_table[i].chain;
827 if (need_resched())
828 cond_resched();
830 samples++;
832 if (*rthp == NULL)
833 continue;
834 length = 0;
835 spin_lock_bh(rt_hash_lock_addr(i));
836 while ((rth = *rthp) != NULL) {
837 prefetch(rth->u.dst.rt_next);
838 if (rt_is_expired(rth)) {
839 *rthp = rth->u.dst.rt_next;
840 rt_free(rth);
841 continue;
843 if (rth->u.dst.expires) {
844 /* Entry is expired even if it is in use */
845 if (time_before_eq(jiffies, rth->u.dst.expires)) {
846 nofree:
847 tmo >>= 1;
848 rthp = &rth->u.dst.rt_next;
850 * We only count entries on
851 * a chain with equal hash inputs once
852 * so that entries for different QOS
853 * levels, and other non-hash input
854 * attributes don't unfairly skew
855 * the length computation
857 length += has_noalias(rt_hash_table[i].chain, rth);
858 continue;
860 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
861 goto nofree;
863 /* Cleanup aged off entries. */
864 *rthp = rth->u.dst.rt_next;
865 rt_free(rth);
867 spin_unlock_bh(rt_hash_lock_addr(i));
868 sum += length;
869 sum2 += length*length;
871 if (samples) {
872 unsigned long avg = sum / samples;
873 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
874 rt_chain_length_max = max_t(unsigned long,
875 ip_rt_gc_elasticity,
876 (avg + 4*sd) >> FRACT_BITS);
878 rover = i;
882 * rt_worker_func() is run in process context.
883 * We call rt_check_expire() to scan part of the hash table.
885 static void rt_worker_func(struct work_struct *work)
887 rt_check_expire();
888 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
892 * Perturbation of rt_genid by a small quantity [1..256].
893 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
894 * many times (2^24) without repeating a recent rt_genid.
895 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
897 static void rt_cache_invalidate(struct net *net)
899 unsigned char shuffle;
901 get_random_bytes(&shuffle, sizeof(shuffle));
902 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
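/* The "+ 1" guarantees a non-zero increment, so rt_genid changes even
 * when get_random_bytes() happens to return 0. */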
906 * delay < 0 : invalidate cache (fast : entries will be deleted later)
907 * delay >= 0 : invalidate & flush cache (can be long)
909 void rt_cache_flush(struct net *net, int delay)
911 rt_cache_invalidate(net);
912 if (delay >= 0)
913 rt_do_flush(!in_softirq());
916 /* Flush previous cache invalidated entries from the cache */
917 void rt_cache_flush_batch(void)
919 rt_do_flush(!in_softirq());
923 * We change rt_genid and let gc do the cleanup
925 static void rt_secret_rebuild(unsigned long __net)
927 struct net *net = (struct net *)__net;
928 rt_cache_invalidate(net);
929 mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
932 static void rt_secret_rebuild_oneshot(struct net *net)
934 del_timer_sync(&net->ipv4.rt_secret_timer);
935 rt_cache_invalidate(net);
936 if (ip_rt_secret_interval)
937 mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
940 static void rt_emergency_hash_rebuild(struct net *net)
942 if (net_ratelimit()) {
943 printk(KERN_WARNING "Route hash chain too long!\n");
944 printk(KERN_WARNING "Adjust your secret_interval!\n");
947 rt_secret_rebuild_oneshot(net);
951 Short description of GC goals.
953 We want to build an algorithm which will keep the routing cache
954 at some equilibrium point, where the number of aged-off entries
955 is kept approximately equal to the number of newly generated ones.
957 The current expiration strength is the variable "expire".
958 We try to adjust it dynamically, so that when networking
959 is idle "expire" is large enough to keep enough warm entries,
960 and when load increases it shrinks to limit cache size.
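/*
 * Concretely, "expire" starts at RT_GC_TIMEOUT and is halved each time a
 * pass of rt_garbage_collect() fails to free enough entries, making
 * eviction more aggressive under pressure, while a successful pass
 * (work_done) grows it again by ip_rt_gc_min_interval, capped at
 * ip_rt_gc_timeout.
 */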
963 static int rt_garbage_collect(struct dst_ops *ops)
965 static unsigned long expire = RT_GC_TIMEOUT;
966 static unsigned long last_gc;
967 static int rover;
968 static int equilibrium;
969 struct rtable *rth, **rthp;
970 unsigned long now = jiffies;
971 int goal;
974 * Garbage collection is pretty expensive,
975 * do not run it too frequently.
978 RT_CACHE_STAT_INC(gc_total);
980 if (now - last_gc < ip_rt_gc_min_interval &&
981 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
982 RT_CACHE_STAT_INC(gc_ignored);
983 goto out;
986 /* Calculate number of entries, which we want to expire now. */
987 goal = atomic_read(&ipv4_dst_ops.entries) -
988 (ip_rt_gc_elasticity << rt_hash_log);
989 if (goal <= 0) {
990 if (equilibrium < ipv4_dst_ops.gc_thresh)
991 equilibrium = ipv4_dst_ops.gc_thresh;
992 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
993 if (goal > 0) {
994 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
995 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
997 } else {
998 /* We are in a dangerous area. Try to reduce the cache really
999 * aggressively.
1001 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1002 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
1005 if (now - last_gc >= ip_rt_gc_min_interval)
1006 last_gc = now;
1008 if (goal <= 0) {
1009 equilibrium += goal;
1010 goto work_done;
1013 do {
1014 int i, k;
1016 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1017 unsigned long tmo = expire;
1019 k = (k + 1) & rt_hash_mask;
1020 rthp = &rt_hash_table[k].chain;
1021 spin_lock_bh(rt_hash_lock_addr(k));
1022 while ((rth = *rthp) != NULL) {
1023 if (!rt_is_expired(rth) &&
1024 !rt_may_expire(rth, tmo, expire)) {
1025 tmo >>= 1;
1026 rthp = &rth->u.dst.rt_next;
1027 continue;
1029 *rthp = rth->u.dst.rt_next;
1030 rt_free(rth);
1031 goal--;
1033 spin_unlock_bh(rt_hash_lock_addr(k));
1034 if (goal <= 0)
1035 break;
1037 rover = k;
1039 if (goal <= 0)
1040 goto work_done;
1042 /* Goal is not achieved. We stop the process if:
1044 - expire is reduced to zero. Otherwise, expire is halved.
1045 - the table is not full.
1046 - we are called from interrupt.
1047 - the jiffies check is just a fallback/debug loop breaker.
1048 We will not spin here for a long time in any case.
1051 RT_CACHE_STAT_INC(gc_goal_miss);
1053 if (expire == 0)
1054 break;
1056 expire >>= 1;
1057 #if RT_CACHE_DEBUG >= 2
1058 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1059 atomic_read(&ipv4_dst_ops.entries), goal, i);
1060 #endif
1062 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1063 goto out;
1064 } while (!in_softirq() && time_before_eq(jiffies, now));
1066 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1067 goto out;
1068 if (net_ratelimit())
1069 printk(KERN_WARNING "dst cache overflow\n");
1070 RT_CACHE_STAT_INC(gc_dst_overflow);
1071 return 1;
1073 work_done:
1074 expire += ip_rt_gc_min_interval;
1075 if (expire > ip_rt_gc_timeout ||
1076 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
1077 expire = ip_rt_gc_timeout;
1078 #if RT_CACHE_DEBUG >= 2
1079 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1080 atomic_read(&ipv4_dst_ops.entries), goal, rover);
1081 #endif
1082 out: return 0;
1086 * Returns number of entries in a hash chain that have different hash_inputs
1088 static int slow_chain_length(const struct rtable *head)
1090 int length = 0;
1091 const struct rtable *rth = head;
1093 while (rth) {
1094 length += has_noalias(head, rth);
1095 rth = rth->u.dst.rt_next;
1097 return length >> FRACT_BITS;
1100 static int rt_intern_hash(unsigned hash, struct rtable *rt,
1101 struct rtable **rp, struct sk_buff *skb, int ifindex)
1103 struct rtable *rth, **rthp;
1104 unsigned long now;
1105 struct rtable *cand, **candp;
1106 u32 min_score;
1107 int chain_length;
1108 int attempts = !in_softirq();
1110 restart:
1111 chain_length = 0;
1112 min_score = ~(u32)0;
1113 cand = NULL;
1114 candp = NULL;
1115 now = jiffies;
1117 if (!rt_caching(dev_net(rt->u.dst.dev))) {
1119 * If we're not caching, just tell the caller we
1120 * were successful and don't touch the route. The
1121 * caller holds the sole reference to the cache entry, and
1122 * it will be released when the caller is done with it.
1123 * If we drop it here, the callers have no way to resolve routes
1124 * when we're not caching. Instead, just point *rp at rt, so
1125 * the caller gets a single use out of the route
1126 * Note that we do rt_free on this new route entry, so that
1127 * once its refcount hits zero, we are still able to reap it
1128 * (Thanks Alexey)
1129 * Note also the rt_free uses call_rcu. We don't actually
1130 * need rcu protection here, this is just our path to get
1131 * on the route gc list.
1134 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1135 int err = arp_bind_neighbour(&rt->u.dst);
1136 if (err) {
1137 if (net_ratelimit())
1138 printk(KERN_WARNING
1139 "Neighbour table failure & not caching routes.\n");
1140 rt_drop(rt);
1141 return err;
1145 rt_free(rt);
1146 goto skip_hashing;
1149 rthp = &rt_hash_table[hash].chain;
1151 spin_lock_bh(rt_hash_lock_addr(hash));
1152 while ((rth = *rthp) != NULL) {
1153 if (rt_is_expired(rth)) {
1154 *rthp = rth->u.dst.rt_next;
1155 rt_free(rth);
1156 continue;
1158 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1159 /* Put it first */
1160 *rthp = rth->u.dst.rt_next;
1162 * Since lookup is lockfree, the deletion
1163 * must be visible to another weakly ordered CPU before
1164 * the insertion at the start of the hash chain.
1166 rcu_assign_pointer(rth->u.dst.rt_next,
1167 rt_hash_table[hash].chain);
1169 * Since lookup is lockfree, the update writes
1170 * must be ordered for consistency on SMP.
1172 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1174 dst_use(&rth->u.dst, now);
1175 spin_unlock_bh(rt_hash_lock_addr(hash));
1177 rt_drop(rt);
1178 if (rp)
1179 *rp = rth;
1180 else
1181 skb_dst_set(skb, &rth->u.dst);
1182 return 0;
1185 if (!atomic_read(&rth->u.dst.__refcnt)) {
1186 u32 score = rt_score(rth);
1188 if (score <= min_score) {
1189 cand = rth;
1190 candp = rthp;
1191 min_score = score;
1195 chain_length++;
1197 rthp = &rth->u.dst.rt_next;
1200 if (cand) {
1201 /* ip_rt_gc_elasticity used to be the average chain
1202 * length; when it is exceeded, gc becomes really aggressive.
1204 * The second limit is less certain. At the moment it allows
1205 * only 2 entries per bucket. We will see.
1207 if (chain_length > ip_rt_gc_elasticity) {
1208 *candp = cand->u.dst.rt_next;
1209 rt_free(cand);
1211 } else {
1212 if (chain_length > rt_chain_length_max &&
1213 slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1214 struct net *net = dev_net(rt->u.dst.dev);
1215 int num = ++net->ipv4.current_rt_cache_rebuild_count;
1216 if (!rt_caching(net)) {
1217 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1218 rt->u.dst.dev->name, num);
1220 rt_emergency_hash_rebuild(net);
1221 spin_unlock_bh(rt_hash_lock_addr(hash));
1223 hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1224 ifindex, rt_genid(net));
1225 goto restart;
1229 /* Try to bind the route to an ARP neighbour only if it is an output
1230 route or on the unicast forwarding path.
1232 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1233 int err = arp_bind_neighbour(&rt->u.dst);
1234 if (err) {
1235 spin_unlock_bh(rt_hash_lock_addr(hash));
1237 if (err != -ENOBUFS) {
1238 rt_drop(rt);
1239 return err;
1242 /* Neighbour tables are full and nothing
1243 can be released. Try to shrink the route cache;
1244 most likely it holds some neighbour records.
1246 if (attempts-- > 0) {
1247 int saved_elasticity = ip_rt_gc_elasticity;
1248 int saved_int = ip_rt_gc_min_interval;
1249 ip_rt_gc_elasticity = 1;
1250 ip_rt_gc_min_interval = 0;
1251 rt_garbage_collect(&ipv4_dst_ops);
1252 ip_rt_gc_min_interval = saved_int;
1253 ip_rt_gc_elasticity = saved_elasticity;
1254 goto restart;
1257 if (net_ratelimit())
1258 printk(KERN_WARNING "Neighbour table overflow.\n");
1259 rt_drop(rt);
1260 return -ENOBUFS;
1264 rt->u.dst.rt_next = rt_hash_table[hash].chain;
1266 #if RT_CACHE_DEBUG >= 2
1267 if (rt->u.dst.rt_next) {
1268 struct rtable *trt;
1269 printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1270 hash, &rt->rt_dst);
1271 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1272 printk(" . %pI4", &trt->rt_dst);
1273 printk("\n");
1275 #endif
1277 * Since lookup is lockfree, we must make sure
1278 * previous writes to rt are committed to memory
1279 * before making rt visible to other CPUs.
1281 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1283 spin_unlock_bh(rt_hash_lock_addr(hash));
1285 skip_hashing:
1286 if (rp)
1287 *rp = rt;
1288 else
1289 skb_dst_set(skb, &rt->u.dst);
1290 return 0;
1293 void rt_bind_peer(struct rtable *rt, int create)
1295 static DEFINE_SPINLOCK(rt_peer_lock);
1296 struct inet_peer *peer;
1298 peer = inet_getpeer(rt->rt_dst, create);
1300 spin_lock_bh(&rt_peer_lock);
1301 if (rt->peer == NULL) {
1302 rt->peer = peer;
1303 peer = NULL;
1305 spin_unlock_bh(&rt_peer_lock);
1306 if (peer)
1307 inet_putpeer(peer);
1311 * Peer allocation may fail only in serious out-of-memory conditions. However
1312 * we can still generate some output.
1313 * Random ID selection looks a bit dangerous because we have no chance of
1314 * selecting an ID that is unique over a reasonable period of time.
1315 * But a broken packet identifier may be better than no packet at all.
1317 static void ip_select_fb_ident(struct iphdr *iph)
1319 static DEFINE_SPINLOCK(ip_fb_id_lock);
1320 static u32 ip_fallback_id;
1321 u32 salt;
1323 spin_lock_bh(&ip_fb_id_lock);
1324 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1325 iph->id = htons(salt & 0xFFFF);
1326 ip_fallback_id = salt;
1327 spin_unlock_bh(&ip_fb_id_lock);
1330 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1332 struct rtable *rt = (struct rtable *) dst;
1334 if (rt) {
1335 if (rt->peer == NULL)
1336 rt_bind_peer(rt, 1);
1338 /* If peer is attached to destination, it is never detached,
1339 so we do not need to grab a lock to dereference it.
1341 if (rt->peer) {
1342 iph->id = htons(inet_getid(rt->peer, more));
1343 return;
1345 } else
1346 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1347 __builtin_return_address(0));
1349 ip_select_fb_ident(iph);
1352 static void rt_del(unsigned hash, struct rtable *rt)
1354 struct rtable **rthp, *aux;
1356 rthp = &rt_hash_table[hash].chain;
1357 spin_lock_bh(rt_hash_lock_addr(hash));
1358 ip_rt_put(rt);
1359 while ((aux = *rthp) != NULL) {
1360 if (aux == rt || rt_is_expired(aux)) {
1361 *rthp = aux->u.dst.rt_next;
1362 rt_free(aux);
1363 continue;
1365 rthp = &aux->u.dst.rt_next;
1367 spin_unlock_bh(rt_hash_lock_addr(hash));
1370 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1371 __be32 saddr, struct net_device *dev)
1373 int i, k;
1374 struct in_device *in_dev = in_dev_get(dev);
1375 struct rtable *rth, **rthp;
1376 __be32 skeys[2] = { saddr, 0 };
1377 int ikeys[2] = { dev->ifindex, 0 };
1378 struct netevent_redirect netevent;
1379 struct net *net;
1381 if (!in_dev)
1382 return;
1384 net = dev_net(dev);
1385 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1386 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1387 ipv4_is_zeronet(new_gw))
1388 goto reject_redirect;
1390 if (!rt_caching(net))
1391 goto reject_redirect;
1393 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1394 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1395 goto reject_redirect;
1396 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1397 goto reject_redirect;
1398 } else {
1399 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1400 goto reject_redirect;
1403 for (i = 0; i < 2; i++) {
1404 for (k = 0; k < 2; k++) {
1405 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1406 rt_genid(net));
1408 rthp=&rt_hash_table[hash].chain;
1410 rcu_read_lock();
1411 while ((rth = rcu_dereference(*rthp)) != NULL) {
1412 struct rtable *rt;
1414 if (rth->fl.fl4_dst != daddr ||
1415 rth->fl.fl4_src != skeys[i] ||
1416 rth->fl.oif != ikeys[k] ||
1417 rth->fl.iif != 0 ||
1418 rt_is_expired(rth) ||
1419 !net_eq(dev_net(rth->u.dst.dev), net)) {
1420 rthp = &rth->u.dst.rt_next;
1421 continue;
1424 if (rth->rt_dst != daddr ||
1425 rth->rt_src != saddr ||
1426 rth->u.dst.error ||
1427 rth->rt_gateway != old_gw ||
1428 rth->u.dst.dev != dev)
1429 break;
1431 dst_hold(&rth->u.dst);
1432 rcu_read_unlock();
1434 rt = dst_alloc(&ipv4_dst_ops);
1435 if (rt == NULL) {
1436 ip_rt_put(rth);
1437 in_dev_put(in_dev);
1438 return;
1441 /* Copy all the information. */
1442 *rt = *rth;
1443 rt->u.dst.__use = 1;
1444 atomic_set(&rt->u.dst.__refcnt, 1);
1445 rt->u.dst.child = NULL;
1446 if (rt->u.dst.dev)
1447 dev_hold(rt->u.dst.dev);
1448 if (rt->idev)
1449 in_dev_hold(rt->idev);
1450 rt->u.dst.obsolete = -1;
1451 rt->u.dst.lastuse = jiffies;
1452 rt->u.dst.path = &rt->u.dst;
1453 rt->u.dst.neighbour = NULL;
1454 rt->u.dst.hh = NULL;
1455 #ifdef CONFIG_XFRM
1456 rt->u.dst.xfrm = NULL;
1457 #endif
1458 rt->rt_genid = rt_genid(net);
1459 rt->rt_flags |= RTCF_REDIRECTED;
1461 /* Gateway is different ... */
1462 rt->rt_gateway = new_gw;
1464 /* Redirect received -> path was valid */
1465 dst_confirm(&rth->u.dst);
1467 if (rt->peer)
1468 atomic_inc(&rt->peer->refcnt);
1470 if (arp_bind_neighbour(&rt->u.dst) ||
1471 !(rt->u.dst.neighbour->nud_state &
1472 NUD_VALID)) {
1473 if (rt->u.dst.neighbour)
1474 neigh_event_send(rt->u.dst.neighbour, NULL);
1475 ip_rt_put(rth);
1476 rt_drop(rt);
1477 goto do_next;
1480 netevent.old = &rth->u.dst;
1481 netevent.new = &rt->u.dst;
1482 call_netevent_notifiers(NETEVENT_REDIRECT,
1483 &netevent);
1485 rt_del(hash, rth);
1486 if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
1487 ip_rt_put(rt);
1488 goto do_next;
1490 rcu_read_unlock();
1491 do_next:
1495 in_dev_put(in_dev);
1496 return;
1498 reject_redirect:
1499 #ifdef CONFIG_IP_ROUTE_VERBOSE
1500 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1501 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1502 " Advised path = %pI4 -> %pI4\n",
1503 &old_gw, dev->name, &new_gw,
1504 &saddr, &daddr);
1505 #endif
1506 in_dev_put(in_dev);
1509 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1511 struct rtable *rt = (struct rtable *)dst;
1512 struct dst_entry *ret = dst;
1514 if (rt) {
1515 if (dst->obsolete > 0) {
1516 ip_rt_put(rt);
1517 ret = NULL;
1518 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1519 (rt->u.dst.expires &&
1520 time_after_eq(jiffies, rt->u.dst.expires))) {
1521 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1522 rt->fl.oif,
1523 rt_genid(dev_net(dst->dev)));
1524 #if RT_CACHE_DEBUG >= 1
1525 printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1526 &rt->rt_dst, rt->fl.fl4_tos);
1527 #endif
1528 rt_del(hash, rt);
1529 ret = NULL;
1532 return ret;
1536 * Algorithm:
1537 * 1. The first ip_rt_redirect_number redirects are sent
1538 * with exponential backoff, then we stop sending them at all,
1539 * assuming that the host ignores our redirects.
1540 * 2. If we did not see packets requiring redirects
1541 * during ip_rt_redirect_silence, we assume that the host
1542 * forgot the redirected route and start sending redirects again.
1544 * This algorithm is much cheaper and more intelligent than dumb load limiting
1545 * in icmp.c.
1547 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1548 * and "frag. need" (breaks PMTU discovery) in icmp.c.
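/*
 * In concrete numbers, with the defaults above: after the first redirect
 * the gap required before the next one is ip_rt_redirect_load <<
 * rate_tokens, i.e. 2*(HZ/50), then 4*(HZ/50), 8*(HZ/50), ... jiffies,
 * and once rate_tokens reaches ip_rt_redirect_number (9) no further
 * redirects are sent.  Seeing no offending packets for
 * ip_rt_redirect_silence = (HZ/50) << 10 jiffies (roughly 20 seconds)
 * resets rate_tokens and the cycle starts over.
 */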
1551 void ip_rt_send_redirect(struct sk_buff *skb)
1553 struct rtable *rt = skb_rtable(skb);
1554 struct in_device *in_dev;
1555 int log_martians;
1557 rcu_read_lock();
1558 in_dev = __in_dev_get_rcu(rt->u.dst.dev);
1559 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1560 rcu_read_unlock();
1561 return;
1563 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1564 rcu_read_unlock();
1566 /* No redirected packets during ip_rt_redirect_silence;
1567 * reset the algorithm.
1569 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1570 rt->u.dst.rate_tokens = 0;
1572 /* Too many ignored redirects; do not send anything.
1573 * Set u.dst.rate_last to the last seen redirected packet.
1575 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1576 rt->u.dst.rate_last = jiffies;
1577 return;
1580 /* Check for load limit; set rate_last to the latest sent
1581 * redirect.
1583 if (rt->u.dst.rate_tokens == 0 ||
1584 time_after(jiffies,
1585 (rt->u.dst.rate_last +
1586 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1587 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1588 rt->u.dst.rate_last = jiffies;
1589 ++rt->u.dst.rate_tokens;
1590 #ifdef CONFIG_IP_ROUTE_VERBOSE
1591 if (log_martians &&
1592 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1593 net_ratelimit())
1594 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1595 &rt->rt_src, rt->rt_iif,
1596 &rt->rt_dst, &rt->rt_gateway);
1597 #endif
1601 static int ip_error(struct sk_buff *skb)
1603 struct rtable *rt = skb_rtable(skb);
1604 unsigned long now;
1605 int code;
1607 switch (rt->u.dst.error) {
1608 case EINVAL:
1609 default:
1610 goto out;
1611 case EHOSTUNREACH:
1612 code = ICMP_HOST_UNREACH;
1613 break;
1614 case ENETUNREACH:
1615 code = ICMP_NET_UNREACH;
1616 IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
1617 IPSTATS_MIB_INNOROUTES);
1618 break;
1619 case EACCES:
1620 code = ICMP_PKT_FILTERED;
1621 break;
1624 now = jiffies;
1625 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1626 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1627 rt->u.dst.rate_tokens = ip_rt_error_burst;
1628 rt->u.dst.rate_last = now;
1629 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1630 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1631 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1634 out: kfree_skb(skb);
1635 return 0;
1639 * The last two values are not from the RFC but
1640 * are needed for AMPRnet AX.25 paths.
1643 static const unsigned short mtu_plateau[] =
1644 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1646 static inline unsigned short guess_mtu(unsigned short old_mtu)
1648 int i;
1650 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1651 if (old_mtu > mtu_plateau[i])
1652 return mtu_plateau[i];
1653 return 68;
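/*
 * guess_mtu() simply drops to the next lower plateau: guess_mtu(1500)
 * returns 1492, guess_mtu(600) returns 576, and anything at or below 128
 * falls through to the 68-byte minimum IPv4 MTU.
 */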
1656 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1657 unsigned short new_mtu,
1658 struct net_device *dev)
1660 int i, k;
1661 unsigned short old_mtu = ntohs(iph->tot_len);
1662 struct rtable *rth;
1663 int ikeys[2] = { dev->ifindex, 0 };
1664 __be32 skeys[2] = { iph->saddr, 0, };
1665 __be32 daddr = iph->daddr;
1666 unsigned short est_mtu = 0;
1668 for (k = 0; k < 2; k++) {
1669 for (i = 0; i < 2; i++) {
1670 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1671 rt_genid(net));
1673 rcu_read_lock();
1674 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1675 rth = rcu_dereference(rth->u.dst.rt_next)) {
1676 unsigned short mtu = new_mtu;
1678 if (rth->fl.fl4_dst != daddr ||
1679 rth->fl.fl4_src != skeys[i] ||
1680 rth->rt_dst != daddr ||
1681 rth->rt_src != iph->saddr ||
1682 rth->fl.oif != ikeys[k] ||
1683 rth->fl.iif != 0 ||
1684 dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1685 !net_eq(dev_net(rth->u.dst.dev), net) ||
1686 rt_is_expired(rth))
1687 continue;
1689 if (new_mtu < 68 || new_mtu >= old_mtu) {
1691 /* BSD 4.2 compatibility hack :-( */
1692 if (mtu == 0 &&
1693 old_mtu >= dst_mtu(&rth->u.dst) &&
1694 old_mtu >= 68 + (iph->ihl << 2))
1695 old_mtu -= iph->ihl << 2;
1697 mtu = guess_mtu(old_mtu);
1699 if (mtu <= dst_mtu(&rth->u.dst)) {
1700 if (mtu < dst_mtu(&rth->u.dst)) {
1701 dst_confirm(&rth->u.dst);
1702 if (mtu < ip_rt_min_pmtu) {
1703 mtu = ip_rt_min_pmtu;
1704 rth->u.dst.metrics[RTAX_LOCK-1] |=
1705 (1 << RTAX_MTU);
1707 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1708 dst_set_expires(&rth->u.dst,
1709 ip_rt_mtu_expires);
1711 est_mtu = mtu;
1714 rcu_read_unlock();
1717 return est_mtu ? : new_mtu;
1720 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1722 if (dst_mtu(dst) > mtu && mtu >= 68 &&
1723 !(dst_metric_locked(dst, RTAX_MTU))) {
1724 if (mtu < ip_rt_min_pmtu) {
1725 mtu = ip_rt_min_pmtu;
1726 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1728 dst->metrics[RTAX_MTU-1] = mtu;
1729 dst_set_expires(dst, ip_rt_mtu_expires);
1730 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1734 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1736 if (rt_is_expired((struct rtable *)dst))
1737 return NULL;
1738 return dst;
1741 static void ipv4_dst_destroy(struct dst_entry *dst)
1743 struct rtable *rt = (struct rtable *) dst;
1744 struct inet_peer *peer = rt->peer;
1745 struct in_device *idev = rt->idev;
1747 if (peer) {
1748 rt->peer = NULL;
1749 inet_putpeer(peer);
1752 if (idev) {
1753 rt->idev = NULL;
1754 in_dev_put(idev);
1758 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1759 int how)
1761 struct rtable *rt = (struct rtable *) dst;
1762 struct in_device *idev = rt->idev;
1763 if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1764 struct in_device *loopback_idev =
1765 in_dev_get(dev_net(dev)->loopback_dev);
1766 if (loopback_idev) {
1767 rt->idev = loopback_idev;
1768 in_dev_put(idev);
1773 static void ipv4_link_failure(struct sk_buff *skb)
1775 struct rtable *rt;
1777 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1779 rt = skb_rtable(skb);
1780 if (rt)
1781 dst_set_expires(&rt->u.dst, 0);
1784 static int ip_rt_bug(struct sk_buff *skb)
1786 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1787 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1788 skb->dev ? skb->dev->name : "?");
1789 kfree_skb(skb);
1790 return 0;
1794 We do not cache the source address of the outgoing interface,
1795 because it is used only by IP RR, TS and SRR options,
1796 so it is out of the fast path.
1798 BTW remember: "addr" is allowed to be unaligned
1799 in IP options!
1802 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1804 __be32 src;
1805 struct fib_result res;
1807 if (rt->fl.iif == 0)
1808 src = rt->rt_src;
1809 else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1810 src = FIB_RES_PREFSRC(res);
1811 fib_res_put(&res);
1812 } else
1813 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1814 RT_SCOPE_UNIVERSE);
1815 memcpy(addr, &src, 4);
1818 #ifdef CONFIG_NET_CLS_ROUTE
1819 static void set_class_tag(struct rtable *rt, u32 tag)
1821 if (!(rt->u.dst.tclassid & 0xFFFF))
1822 rt->u.dst.tclassid |= tag & 0xFFFF;
1823 if (!(rt->u.dst.tclassid & 0xFFFF0000))
1824 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1826 #endif
1828 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1830 struct fib_info *fi = res->fi;
1832 if (fi) {
1833 if (FIB_RES_GW(*res) &&
1834 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1835 rt->rt_gateway = FIB_RES_GW(*res);
1836 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1837 sizeof(rt->u.dst.metrics));
1838 if (fi->fib_mtu == 0) {
1839 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1840 if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1841 rt->rt_gateway != rt->rt_dst &&
1842 rt->u.dst.dev->mtu > 576)
1843 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1845 #ifdef CONFIG_NET_CLS_ROUTE
1846 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1847 #endif
1848 } else
1849 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1851 if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1852 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1853 if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
1854 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1855 if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1856 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1857 ip_rt_min_advmss);
1858 if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1859 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1861 #ifdef CONFIG_NET_CLS_ROUTE
1862 #ifdef CONFIG_IP_MULTIPLE_TABLES
1863 set_class_tag(rt, fib_rules_tclass(res));
1864 #endif
1865 set_class_tag(rt, itag);
1866 #endif
1867 rt->rt_type = res->type;
1870 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1871 u8 tos, struct net_device *dev, int our)
1873 unsigned hash;
1874 struct rtable *rth;
1875 __be32 spec_dst;
1876 struct in_device *in_dev = in_dev_get(dev);
1877 u32 itag = 0;
1879 /* Primary sanity checks. */
1881 if (in_dev == NULL)
1882 return -EINVAL;
1884 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1885 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1886 goto e_inval;
1888 if (ipv4_is_zeronet(saddr)) {
1889 if (!ipv4_is_local_multicast(daddr))
1890 goto e_inval;
1891 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1892 } else if (fib_validate_source(saddr, 0, tos, 0,
1893 dev, &spec_dst, &itag, 0) < 0)
1894 goto e_inval;
1896 rth = dst_alloc(&ipv4_dst_ops);
1897 if (!rth)
1898 goto e_nobufs;
1900 rth->u.dst.output = ip_rt_bug;
1901 rth->u.dst.obsolete = -1;
1903 atomic_set(&rth->u.dst.__refcnt, 1);
1904 rth->u.dst.flags= DST_HOST;
1905 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1906 rth->u.dst.flags |= DST_NOPOLICY;
1907 rth->fl.fl4_dst = daddr;
1908 rth->rt_dst = daddr;
1909 rth->fl.fl4_tos = tos;
1910 rth->fl.mark = skb->mark;
1911 rth->fl.fl4_src = saddr;
1912 rth->rt_src = saddr;
1913 #ifdef CONFIG_NET_CLS_ROUTE
1914 rth->u.dst.tclassid = itag;
1915 #endif
1916 rth->rt_iif =
1917 rth->fl.iif = dev->ifindex;
1918 rth->u.dst.dev = init_net.loopback_dev;
1919 dev_hold(rth->u.dst.dev);
1920 rth->idev = in_dev_get(rth->u.dst.dev);
1921 rth->fl.oif = 0;
1922 rth->rt_gateway = daddr;
1923 rth->rt_spec_dst= spec_dst;
1924 rth->rt_genid = rt_genid(dev_net(dev));
1925 rth->rt_flags = RTCF_MULTICAST;
1926 rth->rt_type = RTN_MULTICAST;
1927 if (our) {
1928 rth->u.dst.input= ip_local_deliver;
1929 rth->rt_flags |= RTCF_LOCAL;
1932 #ifdef CONFIG_IP_MROUTE
1933 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1934 rth->u.dst.input = ip_mr_input;
1935 #endif
1936 RT_CACHE_STAT_INC(in_slow_mc);
1938 in_dev_put(in_dev);
1939 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1940 return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);
1942 e_nobufs:
1943 in_dev_put(in_dev);
1944 return -ENOBUFS;
1946 e_inval:
1947 in_dev_put(in_dev);
1948 return -EINVAL;
1952 static void ip_handle_martian_source(struct net_device *dev,
1953 struct in_device *in_dev,
1954 struct sk_buff *skb,
1955 __be32 daddr,
1956 __be32 saddr)
1958 RT_CACHE_STAT_INC(in_martian_src);
1959 #ifdef CONFIG_IP_ROUTE_VERBOSE
1960 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1962 * RFC1812 recommendation: if the source is martian,
1963 * the only hint is the MAC header.
1965 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1966 &daddr, &saddr, dev->name);
1967 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1968 int i;
1969 const unsigned char *p = skb_mac_header(skb);
1970 printk(KERN_WARNING "ll header: ");
1971 for (i = 0; i < dev->hard_header_len; i++, p++) {
1972 printk("%02x", *p);
1973 if (i < (dev->hard_header_len - 1))
1974 printk(":");
1976 printk("\n");
1979 #endif
1982 static int __mkroute_input(struct sk_buff *skb,
1983 struct fib_result *res,
1984 struct in_device *in_dev,
1985 __be32 daddr, __be32 saddr, u32 tos,
1986 struct rtable **result)
1989 struct rtable *rth;
1990 int err;
1991 struct in_device *out_dev;
1992 unsigned flags = 0;
1993 __be32 spec_dst;
1994 u32 itag;
1996 /* get a working reference to the output device */
1997 out_dev = in_dev_get(FIB_RES_DEV(*res));
1998 if (out_dev == NULL) {
1999 if (net_ratelimit())
2000 printk(KERN_CRIT "Bug in ip_route_input" \
2001 "_slow(). Please, report\n");
2002 return -EINVAL;
2006 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
2007 in_dev->dev, &spec_dst, &itag, skb->mark);
2008 if (err < 0) {
2009 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2010 saddr);
2012 err = -EINVAL;
2013 goto cleanup;
2016 if (err)
2017 flags |= RTCF_DIRECTSRC;
2019 if (out_dev == in_dev && err &&
2020 (IN_DEV_SHARED_MEDIA(out_dev) ||
2021 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2022 flags |= RTCF_DOREDIRECT;
2024 if (skb->protocol != htons(ETH_P_IP)) {
2025 /* Not IP (i.e. ARP). Do not create a route if it is
2026 * invalid for proxy arp. DNAT routes are always valid.
2028 * The proxy arp feature has been extended to allow ARP
2029 * replies back on the same interface, to support
2030 * Private VLAN switch technologies. See arp.c.
2032 if (out_dev == in_dev &&
2033 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2034 err = -EINVAL;
2035 goto cleanup;
2040 rth = dst_alloc(&ipv4_dst_ops);
2041 if (!rth) {
2042 err = -ENOBUFS;
2043 goto cleanup;
2046 atomic_set(&rth->u.dst.__refcnt, 1);
2047 rth->u.dst.flags= DST_HOST;
2048 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2049 rth->u.dst.flags |= DST_NOPOLICY;
2050 if (IN_DEV_CONF_GET(out_dev, NOXFRM))
2051 rth->u.dst.flags |= DST_NOXFRM;
2052 rth->fl.fl4_dst = daddr;
2053 rth->rt_dst = daddr;
2054 rth->fl.fl4_tos = tos;
2055 rth->fl.mark = skb->mark;
2056 rth->fl.fl4_src = saddr;
2057 rth->rt_src = saddr;
2058 rth->rt_gateway = daddr;
2059 rth->rt_iif =
2060 rth->fl.iif = in_dev->dev->ifindex;
2061 rth->u.dst.dev = (out_dev)->dev;
2062 dev_hold(rth->u.dst.dev);
2063 rth->idev = in_dev_get(rth->u.dst.dev);
2064 rth->fl.oif = 0;
2065 rth->rt_spec_dst= spec_dst;
2067 rth->u.dst.obsolete = -1;
2068 rth->u.dst.input = ip_forward;
2069 rth->u.dst.output = ip_output;
2070 rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
2072 rt_set_nexthop(rth, res, itag);
2074 rth->rt_flags = flags;
2076 *result = rth;
2077 err = 0;
2078 cleanup:
2079 /* release the working reference to the output device */
2080 in_dev_put(out_dev);
2081 return err;
2084 static int ip_mkroute_input(struct sk_buff *skb,
2085 struct fib_result *res,
2086 const struct flowi *fl,
2087 struct in_device *in_dev,
2088 __be32 daddr, __be32 saddr, u32 tos)
2090 struct rtable* rth = NULL;
2091 int err;
2092 unsigned hash;
2094 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2095 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2096 fib_select_multipath(fl, res);
2097 #endif
2099 /* create a routing cache entry */
2100 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2101 if (err)
2102 return err;
2104 /* put it into the cache */
2105 hash = rt_hash(daddr, saddr, fl->iif,
2106 rt_genid(dev_net(rth->u.dst.dev)));
2107 return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
2111 * NOTE. We drop all packets that have local source
2112 * addresses, because every properly looped-back packet
2113 * must have the correct destination already attached by the output routine.
2115 * Such an approach solves two big problems:
2116 * 1. Non-simplex devices are handled properly.
2117 * 2. IP spoofing attempts are filtered with a 100% guarantee.
2120 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2121 u8 tos, struct net_device *dev)
2123 struct fib_result res;
2124 struct in_device *in_dev = in_dev_get(dev);
2125 struct flowi fl = { .nl_u = { .ip4_u =
2126 { .daddr = daddr,
2127 .saddr = saddr,
2128 .tos = tos,
2129 .scope = RT_SCOPE_UNIVERSE,
2130 } },
2131 .mark = skb->mark,
2132 .iif = dev->ifindex };
2133 unsigned flags = 0;
2134 u32 itag = 0;
2135 struct rtable * rth;
2136 unsigned hash;
2137 __be32 spec_dst;
2138 int err = -EINVAL;
2139 int free_res = 0;
2140 struct net * net = dev_net(dev);
2142 /* IP on this device is disabled. */
2144 if (!in_dev)
2145 goto out;
2147 /* Check for the weirdest martians, which cannot be detected
2148 by fib_lookup.
2151 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2152 ipv4_is_loopback(saddr))
2153 goto martian_source;
2155 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
2156 goto brd_input;
2158 /* Accept zero addresses only for limited broadcast;
2159 * I do not even know whether to fix this or not. Waiting for complaints :-)
2161 if (ipv4_is_zeronet(saddr))
2162 goto martian_source;
2164 if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
2165 ipv4_is_loopback(daddr))
2166 goto martian_destination;
2169 * Now we are ready to route the packet.
2171 if ((err = fib_lookup(net, &fl, &res)) != 0) {
2172 if (!IN_DEV_FORWARD(in_dev))
2173 goto e_hostunreach;
2174 goto no_route;
2176 free_res = 1;
2178 RT_CACHE_STAT_INC(in_slow_tot);
2180 if (res.type == RTN_BROADCAST)
2181 goto brd_input;
2183 if (res.type == RTN_LOCAL) {
2184 int result;
2185 result = fib_validate_source(saddr, daddr, tos,
2186 net->loopback_dev->ifindex,
2187 dev, &spec_dst, &itag, skb->mark);
2188 if (result < 0)
2189 goto martian_source;
2190 if (result)
2191 flags |= RTCF_DIRECTSRC;
2192 spec_dst = daddr;
2193 goto local_input;
2196 if (!IN_DEV_FORWARD(in_dev))
2197 goto e_hostunreach;
2198 if (res.type != RTN_UNICAST)
2199 goto martian_destination;
2201 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2202 done:
2203 in_dev_put(in_dev);
2204 if (free_res)
2205 fib_res_put(&res);
2206 out: return err;
2208 brd_input:
2209 if (skb->protocol != htons(ETH_P_IP))
2210 goto e_inval;
2212 if (ipv4_is_zeronet(saddr))
2213 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2214 else {
2215 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2216 &itag, skb->mark);
2217 if (err < 0)
2218 goto martian_source;
2219 if (err)
2220 flags |= RTCF_DIRECTSRC;
2222 flags |= RTCF_BROADCAST;
2223 res.type = RTN_BROADCAST;
2224 RT_CACHE_STAT_INC(in_brd);
2226 local_input:
2227 rth = dst_alloc(&ipv4_dst_ops);
2228 if (!rth)
2229 goto e_nobufs;
2231 rth->u.dst.output= ip_rt_bug;
2232 rth->u.dst.obsolete = -1;
2233 rth->rt_genid = rt_genid(net);
2235 atomic_set(&rth->u.dst.__refcnt, 1);
2236 rth->u.dst.flags= DST_HOST;
2237 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2238 rth->u.dst.flags |= DST_NOPOLICY;
2239 rth->fl.fl4_dst = daddr;
2240 rth->rt_dst = daddr;
2241 rth->fl.fl4_tos = tos;
2242 rth->fl.mark = skb->mark;
2243 rth->fl.fl4_src = saddr;
2244 rth->rt_src = saddr;
2245 #ifdef CONFIG_NET_CLS_ROUTE
2246 rth->u.dst.tclassid = itag;
2247 #endif
2248 rth->rt_iif =
2249 rth->fl.iif = dev->ifindex;
2250 rth->u.dst.dev = net->loopback_dev;
2251 dev_hold(rth->u.dst.dev);
2252 rth->idev = in_dev_get(rth->u.dst.dev);
2253 rth->rt_gateway = daddr;
2254 rth->rt_spec_dst= spec_dst;
2255 rth->u.dst.input= ip_local_deliver;
2256 rth->rt_flags = flags|RTCF_LOCAL;
2257 if (res.type == RTN_UNREACHABLE) {
2258 rth->u.dst.input= ip_error;
2259 rth->u.dst.error= -err;
2260 rth->rt_flags &= ~RTCF_LOCAL;
2262 rth->rt_type = res.type;
2263 hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2264 err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
2265 goto done;
2267 no_route:
2268 RT_CACHE_STAT_INC(in_no_route);
2269 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2270 res.type = RTN_UNREACHABLE;
2271 if (err == -ESRCH)
2272 err = -ENETUNREACH;
2273 goto local_input;
2276 * Do not cache martian addresses: they should be logged (RFC1812)
2278 martian_destination:
2279 RT_CACHE_STAT_INC(in_martian_dst);
2280 #ifdef CONFIG_IP_ROUTE_VERBOSE
2281 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2282 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2283 &daddr, &saddr, dev->name);
2284 #endif
2286 e_hostunreach:
2287 err = -EHOSTUNREACH;
2288 goto done;
2290 e_inval:
2291 err = -EINVAL;
2292 goto done;
2294 e_nobufs:
2295 err = -ENOBUFS;
2296 goto done;
2298 martian_source:
2299 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2300 goto e_inval;
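/*
 * Illustrative sketch, not part of route.c: the early checks in
 * ip_route_input_slow() reject "martian" source addresses before any
 * fib lookup.  Condensed into one predicate using the ipv4_is_*()
 * helpers from <linux/in.h>; note that in the real function the
 * zeronet case is only rejected after the limited-broadcast branch,
 * and the example_* name is hypothetical.
 */
static inline bool example_martian_source(__be32 saddr)
{
	return ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	       ipv4_is_loopback(saddr) || ipv4_is_zeronet(saddr);
}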
2303 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2304 u8 tos, struct net_device *dev)
2306 struct rtable * rth;
2307 unsigned hash;
2308 int iif = dev->ifindex;
2309 struct net *net;
2311 net = dev_net(dev);
2313 if (!rt_caching(net))
2314 goto skip_cache;
2316 tos &= IPTOS_RT_MASK;
2317 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2319 rcu_read_lock();
2320 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2321 rth = rcu_dereference(rth->u.dst.rt_next)) {
2322 if (((rth->fl.fl4_dst ^ daddr) |
2323 (rth->fl.fl4_src ^ saddr) |
2324 (rth->fl.iif ^ iif) |
2325 rth->fl.oif |
2326 (rth->fl.fl4_tos ^ tos)) == 0 &&
2327 rth->fl.mark == skb->mark &&
2328 net_eq(dev_net(rth->u.dst.dev), net) &&
2329 !rt_is_expired(rth)) {
2330 dst_use(&rth->u.dst, jiffies);
2331 RT_CACHE_STAT_INC(in_hit);
2332 rcu_read_unlock();
2333 skb_dst_set(skb, &rth->u.dst);
2334 return 0;
2336 RT_CACHE_STAT_INC(in_hlist_search);
2338 rcu_read_unlock();
2340 skip_cache:
2341 /* Multicast recognition logic has been moved from the route cache to here.
2342 The problem was that too many Ethernet cards have broken/missing
2343 hardware multicast filters :-( As a result, a host on a multicast
2344 network acquires a lot of useless route cache entries, e.g. for
2345 SDR messages from all over the world. Now we try to get rid of them.
2346 Really, provided the software IP multicast filter is organized
2347 reasonably (at least hashed), it does not cause a slowdown
2348 compared with route cache reject entries.
2349 Note that multicast routers are not affected, because
2350 a route cache entry is created for them eventually.
2352 if (ipv4_is_multicast(daddr)) {
2353 struct in_device *in_dev;
2355 rcu_read_lock();
2356 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2357 int our = ip_check_mc(in_dev, daddr, saddr,
2358 ip_hdr(skb)->protocol);
2359 if (our
2360 #ifdef CONFIG_IP_MROUTE
2361 ||
2362 (!ipv4_is_local_multicast(daddr) &&
2363 IN_DEV_MFORWARD(in_dev))
2364 #endif
2365 ) {
2366 rcu_read_unlock();
2367 return ip_route_input_mc(skb, daddr, saddr,
2368 tos, dev, our);
2371 rcu_read_unlock();
2372 return -EINVAL;
2374 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
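/*
 * Illustrative sketch, not part of route.c: the cache-hit test in
 * ip_route_input() above folds several field comparisons into a single
 * branch by OR-ing XOR differences; the expression is zero only when
 * every field matches.  A standalone version of the idiom with
 * hypothetical example_* names:
 */
struct example_key {
	u32 dst, src, iif, tos;
};

static inline int example_keys_match(const struct example_key *a,
				     const struct example_key *b)
{
	return ((a->dst ^ b->dst) |
		(a->src ^ b->src) |
		(a->iif ^ b->iif) |
		(a->tos ^ b->tos)) == 0;
}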
2377 static int __mkroute_output(struct rtable **result,
2378 struct fib_result *res,
2379 const struct flowi *fl,
2380 const struct flowi *oldflp,
2381 struct net_device *dev_out,
2382 unsigned flags)
2384 struct rtable *rth;
2385 struct in_device *in_dev;
2386 u32 tos = RT_FL_TOS(oldflp);
2387 int err = 0;
2389 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2390 return -EINVAL;
2392 if (fl->fl4_dst == htonl(0xFFFFFFFF))
2393 res->type = RTN_BROADCAST;
2394 else if (ipv4_is_multicast(fl->fl4_dst))
2395 res->type = RTN_MULTICAST;
2396 else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2397 return -EINVAL;
2399 if (dev_out->flags & IFF_LOOPBACK)
2400 flags |= RTCF_LOCAL;
2402 /* get a working reference to the inet device */
2403 in_dev = in_dev_get(dev_out);
2404 if (!in_dev)
2405 return -EINVAL;
2407 if (res->type == RTN_BROADCAST) {
2408 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2409 if (res->fi) {
2410 fib_info_put(res->fi);
2411 res->fi = NULL;
2413 } else if (res->type == RTN_MULTICAST) {
2414 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2415 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2416 oldflp->proto))
2417 flags &= ~RTCF_LOCAL;
2418 /* If a multicast route does not exist, use
2419 the default one, but do not gateway in this case.
2420 Yes, it is a hack.
2422 if (res->fi && res->prefixlen < 4) {
2423 fib_info_put(res->fi);
2424 res->fi = NULL;
2429 rth = dst_alloc(&ipv4_dst_ops);
2430 if (!rth) {
2431 err = -ENOBUFS;
2432 goto cleanup;
2435 atomic_set(&rth->u.dst.__refcnt, 1);
2436 rth->u.dst.flags= DST_HOST;
2437 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2438 rth->u.dst.flags |= DST_NOXFRM;
2439 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2440 rth->u.dst.flags |= DST_NOPOLICY;
2442 rth->fl.fl4_dst = oldflp->fl4_dst;
2443 rth->fl.fl4_tos = tos;
2444 rth->fl.fl4_src = oldflp->fl4_src;
2445 rth->fl.oif = oldflp->oif;
2446 rth->fl.mark = oldflp->mark;
2447 rth->rt_dst = fl->fl4_dst;
2448 rth->rt_src = fl->fl4_src;
2449 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2450 /* get references to the devices that are to be held by the routing
2451 cache entry */
2452 rth->u.dst.dev = dev_out;
2453 dev_hold(dev_out);
2454 rth->idev = in_dev_get(dev_out);
2455 rth->rt_gateway = fl->fl4_dst;
2456 rth->rt_spec_dst= fl->fl4_src;
2458 rth->u.dst.output=ip_output;
2459 rth->u.dst.obsolete = -1;
2460 rth->rt_genid = rt_genid(dev_net(dev_out));
2462 RT_CACHE_STAT_INC(out_slow_tot);
2464 if (flags & RTCF_LOCAL) {
2465 rth->u.dst.input = ip_local_deliver;
2466 rth->rt_spec_dst = fl->fl4_dst;
2468 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2469 rth->rt_spec_dst = fl->fl4_src;
2470 if (flags & RTCF_LOCAL &&
2471 !(dev_out->flags & IFF_LOOPBACK)) {
2472 rth->u.dst.output = ip_mc_output;
2473 RT_CACHE_STAT_INC(out_slow_mc);
2475 #ifdef CONFIG_IP_MROUTE
2476 if (res->type == RTN_MULTICAST) {
2477 if (IN_DEV_MFORWARD(in_dev) &&
2478 !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2479 rth->u.dst.input = ip_mr_input;
2480 rth->u.dst.output = ip_mc_output;
2483 #endif
2486 rt_set_nexthop(rth, res, 0);
2488 rth->rt_flags = flags;
2490 *result = rth;
2491 cleanup:
2492 /* release the working reference to the inet device */
2493 in_dev_put(in_dev);
2495 return err;
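/*
 * Illustrative sketch, not part of route.c: once __mkroute_output()
 * has computed the RTCF_* flags, the dst callbacks follow mechanically,
 * as the code above shows.  A condensed view of that mapping; the
 * example_* helper is hypothetical, while ip_output, ip_mc_output and
 * ip_local_deliver are the handlers actually used above.
 */
static void example_pick_handlers(struct rtable *rth, unsigned flags,
				  bool out_dev_is_loopback)
{
	rth->u.dst.output = ip_output;
	if (flags & RTCF_LOCAL)
		rth->u.dst.input = ip_local_deliver;
	if ((flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
	    (flags & RTCF_LOCAL) && !out_dev_is_loopback)
		rth->u.dst.output = ip_mc_output;
}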
2498 static int ip_mkroute_output(struct rtable **rp,
2499 struct fib_result *res,
2500 const struct flowi *fl,
2501 const struct flowi *oldflp,
2502 struct net_device *dev_out,
2503 unsigned flags)
2505 struct rtable *rth = NULL;
2506 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2507 unsigned hash;
2508 if (err == 0) {
2509 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2510 rt_genid(dev_net(dev_out)));
2511 err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
2514 return err;
2518 * Major route resolver routine.
2521 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2522 const struct flowi *oldflp)
2524 u32 tos = RT_FL_TOS(oldflp);
2525 struct flowi fl = { .nl_u = { .ip4_u =
2526 { .daddr = oldflp->fl4_dst,
2527 .saddr = oldflp->fl4_src,
2528 .tos = tos & IPTOS_RT_MASK,
2529 .scope = ((tos & RTO_ONLINK) ?
2530 RT_SCOPE_LINK :
2531 RT_SCOPE_UNIVERSE),
2532 } },
2533 .mark = oldflp->mark,
2534 .iif = net->loopback_dev->ifindex,
2535 .oif = oldflp->oif };
2536 struct fib_result res;
2537 unsigned flags = 0;
2538 struct net_device *dev_out = NULL;
2539 int free_res = 0;
2540 int err;
2543 res.fi = NULL;
2544 #ifdef CONFIG_IP_MULTIPLE_TABLES
2545 res.r = NULL;
2546 #endif
2548 if (oldflp->fl4_src) {
2549 err = -EINVAL;
2550 if (ipv4_is_multicast(oldflp->fl4_src) ||
2551 ipv4_is_lbcast(oldflp->fl4_src) ||
2552 ipv4_is_zeronet(oldflp->fl4_src))
2553 goto out;
2555 /* I removed the check for oif == dev_out->oif here.
2556 It was wrong for two reasons:
2557 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2558 is assigned to multiple interfaces.
2559 2. Moreover, we are allowed to send packets with the saddr
2560 of another iface. --ANK
2563 if (oldflp->oif == 0 &&
2564 (ipv4_is_multicast(oldflp->fl4_dst) ||
2565 oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2566 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2567 dev_out = ip_dev_find(net, oldflp->fl4_src);
2568 if (dev_out == NULL)
2569 goto out;
2571 /* Special hack: the user can direct multicasts
2572 and limited broadcast via the necessary interface
2573 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2574 This hack is not just for fun, it allows
2575 vic, vat and friends to work.
2576 They bind a socket to loopback, set the ttl to zero
2577 and expect that it will work.
2578 From the viewpoint of the routing cache they are broken,
2579 because we are not allowed to build a multicast path
2580 with a loopback source addr (look: the routing cache
2581 cannot know that the ttl is zero, so the packet
2582 will not leave this host and the route is valid).
2583 Luckily, this hack is a good workaround.
2586 fl.oif = dev_out->ifindex;
2587 goto make_route;
2590 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2591 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2592 dev_out = ip_dev_find(net, oldflp->fl4_src);
2593 if (dev_out == NULL)
2594 goto out;
2595 dev_put(dev_out);
2596 dev_out = NULL;
2601 if (oldflp->oif) {
2602 dev_out = dev_get_by_index(net, oldflp->oif);
2603 err = -ENODEV;
2604 if (dev_out == NULL)
2605 goto out;
2607 /* RACE: Check return value of inet_select_addr instead. */
2608 if (__in_dev_get_rtnl(dev_out) == NULL) {
2609 dev_put(dev_out);
2610 goto out; /* Wrong error code */
2613 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2614 oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2615 if (!fl.fl4_src)
2616 fl.fl4_src = inet_select_addr(dev_out, 0,
2617 RT_SCOPE_LINK);
2618 goto make_route;
2620 if (!fl.fl4_src) {
2621 if (ipv4_is_multicast(oldflp->fl4_dst))
2622 fl.fl4_src = inet_select_addr(dev_out, 0,
2623 fl.fl4_scope);
2624 else if (!oldflp->fl4_dst)
2625 fl.fl4_src = inet_select_addr(dev_out, 0,
2626 RT_SCOPE_HOST);
2630 if (!fl.fl4_dst) {
2631 fl.fl4_dst = fl.fl4_src;
2632 if (!fl.fl4_dst)
2633 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2634 if (dev_out)
2635 dev_put(dev_out);
2636 dev_out = net->loopback_dev;
2637 dev_hold(dev_out);
2638 fl.oif = net->loopback_dev->ifindex;
2639 res.type = RTN_LOCAL;
2640 flags |= RTCF_LOCAL;
2641 goto make_route;
2644 if (fib_lookup(net, &fl, &res)) {
2645 res.fi = NULL;
2646 if (oldflp->oif) {
2647 /* Apparently, the routing tables are wrong. Assume
2648 that the destination is on-link.
2650 WHY? DW.
2651 Because we are allowed to send to an iface
2652 even if it has NO routes and NO assigned
2653 addresses. When oif is specified, the routing
2654 tables are looked up with only one purpose:
2655 to see whether the destination is gatewayed rather than
2656 direct. Moreover, if MSG_DONTROUTE is set,
2657 we send the packet, ignoring both the routing tables
2658 and the ifaddr state. --ANK
2661 We could do this even when oif is unknown,
2662 as IPv6 likely does, but we do not.
2665 if (fl.fl4_src == 0)
2666 fl.fl4_src = inet_select_addr(dev_out, 0,
2667 RT_SCOPE_LINK);
2668 res.type = RTN_UNICAST;
2669 goto make_route;
2671 if (dev_out)
2672 dev_put(dev_out);
2673 err = -ENETUNREACH;
2674 goto out;
2676 free_res = 1;
2678 if (res.type == RTN_LOCAL) {
2679 if (!fl.fl4_src)
2680 fl.fl4_src = fl.fl4_dst;
2681 if (dev_out)
2682 dev_put(dev_out);
2683 dev_out = net->loopback_dev;
2684 dev_hold(dev_out);
2685 fl.oif = dev_out->ifindex;
2686 if (res.fi)
2687 fib_info_put(res.fi);
2688 res.fi = NULL;
2689 flags |= RTCF_LOCAL;
2690 goto make_route;
2693 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2694 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2695 fib_select_multipath(&fl, &res);
2696 else
2697 #endif
2698 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2699 fib_select_default(net, &fl, &res);
2701 if (!fl.fl4_src)
2702 fl.fl4_src = FIB_RES_PREFSRC(res);
2704 if (dev_out)
2705 dev_put(dev_out);
2706 dev_out = FIB_RES_DEV(res);
2707 dev_hold(dev_out);
2708 fl.oif = dev_out->ifindex;
2711 make_route:
2712 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2715 if (free_res)
2716 fib_res_put(&res);
2717 if (dev_out)
2718 dev_put(dev_out);
2719 out: return err;
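/*
 * Illustrative sketch, not part of route.c: the lookup scope used by
 * ip_route_output_slow() is derived from the caller's tos, where the
 * RTO_ONLINK bit requests a link-scope (no-gateway) lookup.  The
 * example_* name is hypothetical.
 */
static inline unsigned char example_scope_from_tos(u32 tos)
{
	return (tos & RTO_ONLINK) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
}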
2722 int __ip_route_output_key(struct net *net, struct rtable **rp,
2723 const struct flowi *flp)
2725 unsigned hash;
2726 struct rtable *rth;
2728 if (!rt_caching(net))
2729 goto slow_output;
2731 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2733 rcu_read_lock_bh();
2734 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2735 rth = rcu_dereference_bh(rth->u.dst.rt_next)) {
2736 if (rth->fl.fl4_dst == flp->fl4_dst &&
2737 rth->fl.fl4_src == flp->fl4_src &&
2738 rth->fl.iif == 0 &&
2739 rth->fl.oif == flp->oif &&
2740 rth->fl.mark == flp->mark &&
2741 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2742 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2743 net_eq(dev_net(rth->u.dst.dev), net) &&
2744 !rt_is_expired(rth)) {
2745 dst_use(&rth->u.dst, jiffies);
2746 RT_CACHE_STAT_INC(out_hit);
2747 rcu_read_unlock_bh();
2748 *rp = rth;
2749 return 0;
2751 RT_CACHE_STAT_INC(out_hlist_search);
2753 rcu_read_unlock_bh();
2755 slow_output:
2756 return ip_route_output_slow(net, rp, flp);
2759 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2761 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2762 {
2763 }
2765 static struct dst_ops ipv4_dst_blackhole_ops = {
2766 .family = AF_INET,
2767 .protocol = cpu_to_be16(ETH_P_IP),
2768 .destroy = ipv4_dst_destroy,
2769 .check = ipv4_dst_check,
2770 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2771 .entries = ATOMIC_INIT(0),
2775 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2777 struct rtable *ort = *rp;
2778 struct rtable *rt = (struct rtable *)
2779 dst_alloc(&ipv4_dst_blackhole_ops);
2781 if (rt) {
2782 struct dst_entry *new = &rt->u.dst;
2784 atomic_set(&new->__refcnt, 1);
2785 new->__use = 1;
2786 new->input = dst_discard;
2787 new->output = dst_discard;
2788 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2790 new->dev = ort->u.dst.dev;
2791 if (new->dev)
2792 dev_hold(new->dev);
2794 rt->fl = ort->fl;
2796 rt->idev = ort->idev;
2797 if (rt->idev)
2798 in_dev_hold(rt->idev);
2799 rt->rt_genid = rt_genid(net);
2800 rt->rt_flags = ort->rt_flags;
2801 rt->rt_type = ort->rt_type;
2802 rt->rt_dst = ort->rt_dst;
2803 rt->rt_src = ort->rt_src;
2804 rt->rt_iif = ort->rt_iif;
2805 rt->rt_gateway = ort->rt_gateway;
2806 rt->rt_spec_dst = ort->rt_spec_dst;
2807 rt->peer = ort->peer;
2808 if (rt->peer)
2809 atomic_inc(&rt->peer->refcnt);
2811 dst_free(new);
2814 dst_release(&(*rp)->u.dst);
2815 *rp = rt;
2816 return (rt ? 0 : -ENOMEM);
2819 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2820 struct sock *sk, int flags)
2822 int err;
2824 if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2825 return err;
2827 if (flp->proto) {
2828 if (!flp->fl4_src)
2829 flp->fl4_src = (*rp)->rt_src;
2830 if (!flp->fl4_dst)
2831 flp->fl4_dst = (*rp)->rt_dst;
2832 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2833 flags ? XFRM_LOOKUP_WAIT : 0);
2834 if (err == -EREMOTE)
2835 err = ipv4_dst_blackhole(net, rp, flp);
2837 return err;
2840 return 0;
2843 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2845 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2847 return ip_route_output_flow(net, rp, flp, NULL, 0);
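/*
 * Illustrative sketch, not part of route.c: the typical in-kernel
 * calling pattern for the API that ends here - fill a struct flowi
 * key, call ip_route_output_key(), use the returned rtable and drop
 * the reference with ip_rt_put().  Field values and the example_*
 * name are illustrative only.
 */
static int example_output_lookup(struct net *net, __be32 daddr, __be32 saddr)
{
	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = daddr,
						 .saddr = saddr } } };
	struct rtable *rt;
	int err;

	err = ip_route_output_key(net, &rt, &fl);
	if (err)
		return err;
	/* ... transmit via rt->u.dst ... */
	ip_rt_put(rt);
	return 0;
}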
2850 static int rt_fill_info(struct net *net,
2851 struct sk_buff *skb, u32 pid, u32 seq, int event,
2852 int nowait, unsigned int flags)
2854 struct rtable *rt = skb_rtable(skb);
2855 struct rtmsg *r;
2856 struct nlmsghdr *nlh;
2857 long expires;
2858 u32 id = 0, ts = 0, tsage = 0, error;
2860 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2861 if (nlh == NULL)
2862 return -EMSGSIZE;
2864 r = nlmsg_data(nlh);
2865 r->rtm_family = AF_INET;
2866 r->rtm_dst_len = 32;
2867 r->rtm_src_len = 0;
2868 r->rtm_tos = rt->fl.fl4_tos;
2869 r->rtm_table = RT_TABLE_MAIN;
2870 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2871 r->rtm_type = rt->rt_type;
2872 r->rtm_scope = RT_SCOPE_UNIVERSE;
2873 r->rtm_protocol = RTPROT_UNSPEC;
2874 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2875 if (rt->rt_flags & RTCF_NOTIFY)
2876 r->rtm_flags |= RTM_F_NOTIFY;
2878 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2880 if (rt->fl.fl4_src) {
2881 r->rtm_src_len = 32;
2882 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2884 if (rt->u.dst.dev)
2885 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2886 #ifdef CONFIG_NET_CLS_ROUTE
2887 if (rt->u.dst.tclassid)
2888 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2889 #endif
2890 if (rt->fl.iif)
2891 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2892 else if (rt->rt_src != rt->fl.fl4_src)
2893 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2895 if (rt->rt_dst != rt->rt_gateway)
2896 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2898 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2899 goto nla_put_failure;
2901 error = rt->u.dst.error;
2902 expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2903 if (rt->peer) {
2904 id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
2905 if (rt->peer->tcp_ts_stamp) {
2906 ts = rt->peer->tcp_ts;
2907 tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2911 if (rt->fl.iif) {
2912 #ifdef CONFIG_IP_MROUTE
2913 __be32 dst = rt->rt_dst;
2915 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2916 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2917 int err = ipmr_get_route(net, skb, r, nowait);
2918 if (err <= 0) {
2919 if (!nowait) {
2920 if (err == 0)
2921 return 0;
2922 goto nla_put_failure;
2923 } else {
2924 if (err == -EMSGSIZE)
2925 goto nla_put_failure;
2926 error = err;
2929 } else
2930 #endif
2931 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2934 if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2935 expires, error) < 0)
2936 goto nla_put_failure;
2938 return nlmsg_end(skb, nlh);
2940 nla_put_failure:
2941 nlmsg_cancel(skb, nlh);
2942 return -EMSGSIZE;
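/*
 * Illustrative sketch, not part of route.c: the NLA_PUT_*() calls in
 * rt_fill_info() are macros that emit a netlink attribute and jump to
 * the nla_put_failure label if the skb runs out of tailroom, which is
 * why no per-call error checks appear above.  Roughly equivalent to
 * (simplified; the EXAMPLE_ name is hypothetical):
 */
#define EXAMPLE_NLA_PUT_U32(skb, attrtype, value)			\
	do {								\
		if (nla_put_u32(skb, attrtype, value) < 0)		\
			goto nla_put_failure;				\
	} while (0)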
2945 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2947 struct net *net = sock_net(in_skb->sk);
2948 struct rtmsg *rtm;
2949 struct nlattr *tb[RTA_MAX+1];
2950 struct rtable *rt = NULL;
2951 __be32 dst = 0;
2952 __be32 src = 0;
2953 u32 iif;
2954 int err;
2955 struct sk_buff *skb;
2957 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2958 if (err < 0)
2959 goto errout;
2961 rtm = nlmsg_data(nlh);
2963 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2964 if (skb == NULL) {
2965 err = -ENOBUFS;
2966 goto errout;
2969 /* Reserve room for dummy headers; this skb can pass
2970 through a good chunk of the routing engine.
2972 skb_reset_mac_header(skb);
2973 skb_reset_network_header(skb);
2975 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2976 ip_hdr(skb)->protocol = IPPROTO_ICMP;
2977 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2979 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2980 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2981 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2983 if (iif) {
2984 struct net_device *dev;
2986 dev = __dev_get_by_index(net, iif);
2987 if (dev == NULL) {
2988 err = -ENODEV;
2989 goto errout_free;
2992 skb->protocol = htons(ETH_P_IP);
2993 skb->dev = dev;
2994 local_bh_disable();
2995 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2996 local_bh_enable();
2998 rt = skb_rtable(skb);
2999 if (err == 0 && rt->u.dst.error)
3000 err = -rt->u.dst.error;
3001 } else {
3002 struct flowi fl = {
3003 .nl_u = {
3004 .ip4_u = {
3005 .daddr = dst,
3006 .saddr = src,
3007 .tos = rtm->rtm_tos,
3010 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3012 err = ip_route_output_key(net, &rt, &fl);
3015 if (err)
3016 goto errout_free;
3018 skb_dst_set(skb, &rt->u.dst);
3019 if (rtm->rtm_flags & RTM_F_NOTIFY)
3020 rt->rt_flags |= RTCF_NOTIFY;
3022 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3023 RTM_NEWROUTE, 0, 0);
3024 if (err <= 0)
3025 goto errout_free;
3027 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3028 errout:
3029 return err;
3031 errout_free:
3032 kfree_skb(skb);
3033 goto errout;
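/*
 * Illustrative sketch, not part of route.c: a minimal userspace
 * counterpart to inet_rtm_getroute() - an RTM_GETROUTE query over
 * NETLINK_ROUTE asking which route a destination resolves to.  Error
 * handling is abbreviated and the RTM_NEWROUTE reply (built by
 * rt_fill_info() above) is left unparsed; example_getroute is a
 * hypothetical name.
 */
#include <arpa/inet.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int example_getroute(const char *dst_str)
{
	struct {
		struct nlmsghdr nh;
		struct rtmsg rtm;
		char attrs[64];
	} req;
	struct rtattr *rta;
	char reply[4096];
	int fd, len;

	memset(&req, 0, sizeof(req));
	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
	req.nh.nlmsg_type = RTM_GETROUTE;
	req.nh.nlmsg_flags = NLM_F_REQUEST;
	req.rtm.rtm_family = AF_INET;
	req.rtm.rtm_dst_len = 32;

	/* append an RTA_DST attribute carrying the destination address */
	rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nh.nlmsg_len));
	rta->rta_type = RTA_DST;
	rta->rta_len = RTA_LENGTH(4);
	inet_pton(AF_INET, dst_str, RTA_DATA(rta));
	req.nh.nlmsg_len = NLMSG_ALIGN(req.nh.nlmsg_len) + rta->rta_len;

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd < 0)
		return -1;
	if (send(fd, &req, req.nh.nlmsg_len, 0) < 0 ||
	    (len = recv(fd, reply, sizeof(reply), 0)) < 0)
		len = -1;
	close(fd);
	return len;
}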
3036 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3038 struct rtable *rt;
3039 int h, s_h;
3040 int idx, s_idx;
3041 struct net *net;
3043 net = sock_net(skb->sk);
3045 s_h = cb->args[0];
3046 if (s_h < 0)
3047 s_h = 0;
3048 s_idx = idx = cb->args[1];
3049 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3050 if (!rt_hash_table[h].chain)
3051 continue;
3052 rcu_read_lock_bh();
3053 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3054 rt = rcu_dereference_bh(rt->u.dst.rt_next), idx++) {
3055 if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
3056 continue;
3057 if (rt_is_expired(rt))
3058 continue;
3059 skb_dst_set(skb, dst_clone(&rt->u.dst));
3060 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3061 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3062 1, NLM_F_MULTI) <= 0) {
3063 skb_dst_drop(skb);
3064 rcu_read_unlock_bh();
3065 goto done;
3067 skb_dst_drop(skb);
3069 rcu_read_unlock_bh();
3072 done:
3073 cb->args[0] = h;
3074 cb->args[1] = idx;
3075 return skb->len;
3078 void ip_rt_multicast_event(struct in_device *in_dev)
3080 rt_cache_flush(dev_net(in_dev->dev), 0);
3083 #ifdef CONFIG_SYSCTL
3084 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3085 void __user *buffer,
3086 size_t *lenp, loff_t *ppos)
3088 if (write) {
3089 int flush_delay;
3090 ctl_table ctl;
3091 struct net *net;
3093 memcpy(&ctl, __ctl, sizeof(ctl));
3094 ctl.data = &flush_delay;
3095 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3097 net = (struct net *)__ctl->extra1;
3098 rt_cache_flush(net, flush_delay);
3099 return 0;
3102 return -EINVAL;
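/*
 * Illustrative sketch, not part of route.c: the handler above services
 * writes to the write-only sysctl file /proc/sys/net/ipv4/route/flush
 * (registered below in ipv4_route_flush_table).  A trivial userspace
 * trigger; the written integer is what reaches rt_cache_flush() as the
 * delay, and example_* is a hypothetical name.
 */
#include <fcntl.h>
#include <unistd.h>

static void example_flush_route_cache(void)
{
	int fd = open("/proc/sys/net/ipv4/route/flush", O_WRONLY);

	if (fd >= 0) {
		write(fd, "0\n", 2);	/* flush the route cache now */
		close(fd);
	}
}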
3105 static void rt_secret_reschedule(int old)
3107 struct net *net;
3108 int new = ip_rt_secret_interval;
3109 int diff = new - old;
3111 if (!diff)
3112 return;
3114 rtnl_lock();
3115 for_each_net(net) {
3116 int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);
3117 long time;
3119 if (!new)
3120 continue;
3122 if (deleted) {
3123 time = net->ipv4.rt_secret_timer.expires - jiffies;
3125 if (time <= 0 || (time += diff) <= 0)
3126 time = 0;
3127 } else
3128 time = new;
3130 mod_timer(&net->ipv4.rt_secret_timer, jiffies + time);
3132 rtnl_unlock();
3135 static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
3136 void __user *buffer, size_t *lenp,
3137 loff_t *ppos)
3139 int old = ip_rt_secret_interval;
3140 int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);
3142 rt_secret_reschedule(old);
3144 return ret;
3147 static ctl_table ipv4_route_table[] = {
3149 .procname = "gc_thresh",
3150 .data = &ipv4_dst_ops.gc_thresh,
3151 .maxlen = sizeof(int),
3152 .mode = 0644,
3153 .proc_handler = proc_dointvec,
3156 .procname = "max_size",
3157 .data = &ip_rt_max_size,
3158 .maxlen = sizeof(int),
3159 .mode = 0644,
3160 .proc_handler = proc_dointvec,
3163 /* Deprecated. Use gc_min_interval_ms */
3165 .procname = "gc_min_interval",
3166 .data = &ip_rt_gc_min_interval,
3167 .maxlen = sizeof(int),
3168 .mode = 0644,
3169 .proc_handler = proc_dointvec_jiffies,
3172 .procname = "gc_min_interval_ms",
3173 .data = &ip_rt_gc_min_interval,
3174 .maxlen = sizeof(int),
3175 .mode = 0644,
3176 .proc_handler = proc_dointvec_ms_jiffies,
3179 .procname = "gc_timeout",
3180 .data = &ip_rt_gc_timeout,
3181 .maxlen = sizeof(int),
3182 .mode = 0644,
3183 .proc_handler = proc_dointvec_jiffies,
3186 .procname = "gc_interval",
3187 .data = &ip_rt_gc_interval,
3188 .maxlen = sizeof(int),
3189 .mode = 0644,
3190 .proc_handler = proc_dointvec_jiffies,
3193 .procname = "redirect_load",
3194 .data = &ip_rt_redirect_load,
3195 .maxlen = sizeof(int),
3196 .mode = 0644,
3197 .proc_handler = proc_dointvec,
3200 .procname = "redirect_number",
3201 .data = &ip_rt_redirect_number,
3202 .maxlen = sizeof(int),
3203 .mode = 0644,
3204 .proc_handler = proc_dointvec,
3207 .procname = "redirect_silence",
3208 .data = &ip_rt_redirect_silence,
3209 .maxlen = sizeof(int),
3210 .mode = 0644,
3211 .proc_handler = proc_dointvec,
3214 .procname = "error_cost",
3215 .data = &ip_rt_error_cost,
3216 .maxlen = sizeof(int),
3217 .mode = 0644,
3218 .proc_handler = proc_dointvec,
3221 .procname = "error_burst",
3222 .data = &ip_rt_error_burst,
3223 .maxlen = sizeof(int),
3224 .mode = 0644,
3225 .proc_handler = proc_dointvec,
3228 .procname = "gc_elasticity",
3229 .data = &ip_rt_gc_elasticity,
3230 .maxlen = sizeof(int),
3231 .mode = 0644,
3232 .proc_handler = proc_dointvec,
3235 .procname = "mtu_expires",
3236 .data = &ip_rt_mtu_expires,
3237 .maxlen = sizeof(int),
3238 .mode = 0644,
3239 .proc_handler = proc_dointvec_jiffies,
3242 .procname = "min_pmtu",
3243 .data = &ip_rt_min_pmtu,
3244 .maxlen = sizeof(int),
3245 .mode = 0644,
3246 .proc_handler = proc_dointvec,
3249 .procname = "min_adv_mss",
3250 .data = &ip_rt_min_advmss,
3251 .maxlen = sizeof(int),
3252 .mode = 0644,
3253 .proc_handler = proc_dointvec,
3256 .procname = "secret_interval",
3257 .data = &ip_rt_secret_interval,
3258 .maxlen = sizeof(int),
3259 .mode = 0644,
3260 .proc_handler = ipv4_sysctl_rt_secret_interval,
3265 static struct ctl_table empty[1];
3267 static struct ctl_table ipv4_skeleton[] =
3269 { .procname = "route",
3270 .mode = 0555, .child = ipv4_route_table},
3271 { .procname = "neigh",
3272 .mode = 0555, .child = empty},
3276 static __net_initdata struct ctl_path ipv4_path[] = {
3277 { .procname = "net", },
3278 { .procname = "ipv4", },
3279 { },
3282 static struct ctl_table ipv4_route_flush_table[] = {
3284 .procname = "flush",
3285 .maxlen = sizeof(int),
3286 .mode = 0200,
3287 .proc_handler = ipv4_sysctl_rtcache_flush,
3289 { },
3292 static __net_initdata struct ctl_path ipv4_route_path[] = {
3293 { .procname = "net", },
3294 { .procname = "ipv4", },
3295 { .procname = "route", },
3296 { },
3299 static __net_init int sysctl_route_net_init(struct net *net)
3301 struct ctl_table *tbl;
3303 tbl = ipv4_route_flush_table;
3304 if (!net_eq(net, &init_net)) {
3305 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3306 if (tbl == NULL)
3307 goto err_dup;
3309 tbl[0].extra1 = net;
3311 net->ipv4.route_hdr =
3312 register_net_sysctl_table(net, ipv4_route_path, tbl);
3313 if (net->ipv4.route_hdr == NULL)
3314 goto err_reg;
3315 return 0;
3317 err_reg:
3318 if (tbl != ipv4_route_flush_table)
3319 kfree(tbl);
3320 err_dup:
3321 return -ENOMEM;
3324 static __net_exit void sysctl_route_net_exit(struct net *net)
3326 struct ctl_table *tbl;
3328 tbl = net->ipv4.route_hdr->ctl_table_arg;
3329 unregister_net_sysctl_table(net->ipv4.route_hdr);
3330 BUG_ON(tbl == ipv4_route_flush_table);
3331 kfree(tbl);
3334 static __net_initdata struct pernet_operations sysctl_route_ops = {
3335 .init = sysctl_route_net_init,
3336 .exit = sysctl_route_net_exit,
3338 #endif
3341 static __net_init int rt_secret_timer_init(struct net *net)
3343 atomic_set(&net->ipv4.rt_genid,
3344 (int) ((num_physpages ^ (num_physpages>>8)) ^
3345 (jiffies ^ (jiffies >> 7))));
3347 net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
3348 net->ipv4.rt_secret_timer.data = (unsigned long)net;
3349 init_timer_deferrable(&net->ipv4.rt_secret_timer);
3351 if (ip_rt_secret_interval) {
3352 net->ipv4.rt_secret_timer.expires =
3353 jiffies + net_random() % ip_rt_secret_interval +
3354 ip_rt_secret_interval;
3355 add_timer(&net->ipv4.rt_secret_timer);
3357 return 0;
3360 static __net_exit void rt_secret_timer_exit(struct net *net)
3362 del_timer_sync(&net->ipv4.rt_secret_timer);
3365 static __net_initdata struct pernet_operations rt_secret_timer_ops = {
3366 .init = rt_secret_timer_init,
3367 .exit = rt_secret_timer_exit,
3371 #ifdef CONFIG_NET_CLS_ROUTE
3372 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3373 #endif /* CONFIG_NET_CLS_ROUTE */
3375 static __initdata unsigned long rhash_entries;
3376 static int __init set_rhash_entries(char *str)
3378 if (!str)
3379 return 0;
3380 rhash_entries = simple_strtoul(str, &str, 0);
3381 return 1;
3383 __setup("rhash_entries=", set_rhash_entries);
3385 int __init ip_rt_init(void)
3387 int rc = 0;
3389 #ifdef CONFIG_NET_CLS_ROUTE
3390 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3391 if (!ip_rt_acct)
3392 panic("IP: failed to allocate ip_rt_acct\n");
3393 #endif
3395 ipv4_dst_ops.kmem_cachep =
3396 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3397 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3399 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3401 rt_hash_table = (struct rt_hash_bucket *)
3402 alloc_large_system_hash("IP route cache",
3403 sizeof(struct rt_hash_bucket),
3404 rhash_entries,
3405 (totalram_pages >= 128 * 1024) ?
3406 15 : 17,
3407 0,
3408 &rt_hash_log,
3409 &rt_hash_mask,
3410 rhash_entries ? 0 : 512 * 1024);
3411 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3412 rt_hash_lock_init();
3414 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3415 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3417 devinet_init();
3418 ip_fib_init();
3420 /* All the timers started at system startup tend
3421 to synchronize. Perturb them a bit.
3423 INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3424 expires_ljiffies = jiffies;
3425 schedule_delayed_work(&expires_work,
3426 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3428 if (register_pernet_subsys(&rt_secret_timer_ops))
3429 printk(KERN_ERR "Unable to setup rt_secret_timer\n");
3431 if (ip_rt_proc_init())
3432 printk(KERN_ERR "Unable to create route proc files\n");
3433 #ifdef CONFIG_XFRM
3434 xfrm_init();
3435 xfrm4_init(ip_rt_max_size);
3436 #endif
3437 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3439 #ifdef CONFIG_SYSCTL
3440 register_pernet_subsys(&sysctl_route_ops);
3441 #endif
3442 return rc;
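/*
 * Worked example of the sizing above: if the allocator returns a
 * 2^17-bucket table (rt_hash_mask == 0x1ffff), then gc_thresh becomes
 * rt_hash_mask + 1 = 131072 and ip_rt_max_size sixteen times that,
 * 2097152 cached routes.  The actual table size depends on
 * rhash_entries and available memory.
 */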
3445 #ifdef CONFIG_SYSCTL
3447 * We really need to sanitize the damn ipv4 init order, then all
3448 * this nonsense will go away.
3450 void __init ip_static_sysctl_init(void)
3452 register_sysctl_paths(ipv4_path, ipv4_skeleton);
3454 #endif
3456 EXPORT_SYMBOL(__ip_select_ident);
3457 EXPORT_SYMBOL(ip_route_input);
3458 EXPORT_SYMBOL(ip_route_output_key);