net/ipv4/route.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <net/dst.h>
94 #include <net/net_namespace.h>
95 #include <net/protocol.h>
96 #include <net/ip.h>
97 #include <net/route.h>
98 #include <net/inetpeer.h>
99 #include <net/sock.h>
100 #include <net/ip_fib.h>
101 #include <net/arp.h>
102 #include <net/tcp.h>
103 #include <net/icmp.h>
104 #include <net/xfrm.h>
105 #include <net/netevent.h>
106 #include <net/rtnetlink.h>
107 #ifdef CONFIG_SYSCTL
108 #include <linux/sysctl.h>
109 #endif
111 #define RT_FL_TOS(oldflp) \
112 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114 #define IP_MAX_MTU 0xFFF0
116 #define RT_GC_TIMEOUT (300*HZ)
118 static int ip_rt_max_size;
119 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
120 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
121 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
122 static int ip_rt_redirect_number __read_mostly = 9;
123 static int ip_rt_redirect_load __read_mostly = HZ / 50;
124 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
125 static int ip_rt_error_cost __read_mostly = HZ;
126 static int ip_rt_error_burst __read_mostly = 5 * HZ;
127 static int ip_rt_gc_elasticity __read_mostly = 8;
128 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
129 static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
130 static int ip_rt_min_advmss __read_mostly = 256;
131 static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ;
132 static int rt_chain_length_max __read_mostly = 20;
134 static void rt_worker_func(struct work_struct *work);
135 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
138 * Interface to generic destination cache.
141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
142 static void ipv4_dst_destroy(struct dst_entry *dst);
143 static void ipv4_dst_ifdown(struct dst_entry *dst,
144 struct net_device *dev, int how);
145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146 static void ipv4_link_failure(struct sk_buff *skb);
147 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
148 static int rt_garbage_collect(struct dst_ops *ops);
149 static void rt_emergency_hash_rebuild(struct net *net);
152 static struct dst_ops ipv4_dst_ops = {
153 .family = AF_INET,
154 .protocol = cpu_to_be16(ETH_P_IP),
155 .gc = rt_garbage_collect,
156 .check = ipv4_dst_check,
157 .destroy = ipv4_dst_destroy,
158 .ifdown = ipv4_dst_ifdown,
159 .negative_advice = ipv4_negative_advice,
160 .link_failure = ipv4_link_failure,
161 .update_pmtu = ip_rt_update_pmtu,
162 .local_out = __ip_local_out,
163 .entries = ATOMIC_INIT(0),
166 #define ECN_OR_COST(class) TC_PRIO_##class
168 const __u8 ip_tos2prio[16] = {
169 TC_PRIO_BESTEFFORT,
170 ECN_OR_COST(FILLER),
171 TC_PRIO_BESTEFFORT,
172 ECN_OR_COST(BESTEFFORT),
173 TC_PRIO_BULK,
174 ECN_OR_COST(BULK),
175 TC_PRIO_BULK,
176 ECN_OR_COST(BULK),
177 TC_PRIO_INTERACTIVE,
178 ECN_OR_COST(INTERACTIVE),
179 TC_PRIO_INTERACTIVE,
180 ECN_OR_COST(INTERACTIVE),
181 TC_PRIO_INTERACTIVE_BULK,
182 ECN_OR_COST(INTERACTIVE_BULK),
183 TC_PRIO_INTERACTIVE_BULK,
184 ECN_OR_COST(INTERACTIVE_BULK)
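/*
 * Illustrative sketch (not part of the original source): assuming the
 * rt_tos2priority() helper from include/net/route.h, a packet with
 * TOS 0x10 (IPTOS_LOWDELAY) is mapped as
 *	ip_tos2prio[(0x10 & IPTOS_TOS_MASK) >> 1] == ip_tos2prio[8]
 * which is TC_PRIO_INTERACTIVE.
 */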
189 * Route cache.
192 /* The locking scheme is rather straightforward:
194 * 1) Read-Copy Update protects the buckets of the central route hash.
195 * 2) Only writers remove entries, and they hold the lock
196 * as they look at rtable reference counts.
197 * 3) Only readers acquire references to rtable entries,
198 * they do so with atomic increments and with the
199 * lock held.
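/*
 * A minimal sketch of the two access patterns described above; the
 * identifiers mirror the real code further down in this file:
 *
 *	// reader: lockless RCU walk of one hash chain
 *	rcu_read_lock();
 *	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference(rth->u.dst.rt_next))
 *		...;
 *	rcu_read_unlock();
 *
 *	// writer: unlink under the per-bucket lock, free via RCU
 *	spin_lock_bh(rt_hash_lock_addr(hash));
 *	*rthp = rth->u.dst.rt_next;
 *	rt_free(rth);
 *	spin_unlock_bh(rt_hash_lock_addr(hash));
 */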
202 struct rt_hash_bucket {
203 struct rtable *chain;
206 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
207 defined(CONFIG_PROVE_LOCKING)
209 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
210 * The size of this table is a power of two and depends on the number of CPUS.
211 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
213 #ifdef CONFIG_LOCKDEP
214 # define RT_HASH_LOCK_SZ 256
215 #else
216 # if NR_CPUS >= 32
217 # define RT_HASH_LOCK_SZ 4096
218 # elif NR_CPUS >= 16
219 # define RT_HASH_LOCK_SZ 2048
220 # elif NR_CPUS >= 8
221 # define RT_HASH_LOCK_SZ 1024
222 # elif NR_CPUS >= 4
223 # define RT_HASH_LOCK_SZ 512
224 # else
225 # define RT_HASH_LOCK_SZ 256
226 # endif
227 #endif
229 static spinlock_t *rt_hash_locks;
230 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
232 static __init void rt_hash_lock_init(void)
234 int i;
236 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
237 GFP_KERNEL);
238 if (!rt_hash_locks)
239 panic("IP: failed to allocate rt_hash_locks\n");
241 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
242 spin_lock_init(&rt_hash_locks[i]);
244 #else
245 # define rt_hash_lock_addr(slot) NULL
247 static inline void rt_hash_lock_init(void)
250 #endif
252 static struct rt_hash_bucket *rt_hash_table __read_mostly;
253 static unsigned rt_hash_mask __read_mostly;
254 static unsigned int rt_hash_log __read_mostly;
256 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
257 #define RT_CACHE_STAT_INC(field) \
258 (__raw_get_cpu_var(rt_cache_stat).field++)
260 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
261 int genid)
263 return jhash_3words((__force u32)(__be32)(daddr),
264 (__force u32)(__be32)(saddr),
265 idx, genid)
266 & rt_hash_mask;
269 static inline int rt_genid(struct net *net)
271 return atomic_read(&net->ipv4.rt_genid);
274 #ifdef CONFIG_PROC_FS
275 struct rt_cache_iter_state {
276 struct seq_net_private p;
277 int bucket;
278 int genid;
281 static struct rtable *rt_cache_get_first(struct seq_file *seq)
283 struct rt_cache_iter_state *st = seq->private;
284 struct rtable *r = NULL;
286 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
287 if (!rt_hash_table[st->bucket].chain)
288 continue;
289 rcu_read_lock_bh();
290 r = rcu_dereference(rt_hash_table[st->bucket].chain);
291 while (r) {
292 if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
293 r->rt_genid == st->genid)
294 return r;
295 r = rcu_dereference(r->u.dst.rt_next);
297 rcu_read_unlock_bh();
299 return r;
302 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
303 struct rtable *r)
305 struct rt_cache_iter_state *st = seq->private;
307 r = r->u.dst.rt_next;
308 while (!r) {
309 rcu_read_unlock_bh();
310 do {
311 if (--st->bucket < 0)
312 return NULL;
313 } while (!rt_hash_table[st->bucket].chain);
314 rcu_read_lock_bh();
315 r = rt_hash_table[st->bucket].chain;
317 return rcu_dereference(r);
320 static struct rtable *rt_cache_get_next(struct seq_file *seq,
321 struct rtable *r)
323 struct rt_cache_iter_state *st = seq->private;
324 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
325 if (dev_net(r->u.dst.dev) != seq_file_net(seq))
326 continue;
327 if (r->rt_genid == st->genid)
328 break;
330 return r;
333 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
335 struct rtable *r = rt_cache_get_first(seq);
337 if (r)
338 while (pos && (r = rt_cache_get_next(seq, r)))
339 --pos;
340 return pos ? NULL : r;
343 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
345 struct rt_cache_iter_state *st = seq->private;
346 if (*pos)
347 return rt_cache_get_idx(seq, *pos - 1);
348 st->genid = rt_genid(seq_file_net(seq));
349 return SEQ_START_TOKEN;
352 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
354 struct rtable *r;
356 if (v == SEQ_START_TOKEN)
357 r = rt_cache_get_first(seq);
358 else
359 r = rt_cache_get_next(seq, v);
360 ++*pos;
361 return r;
364 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
366 if (v && v != SEQ_START_TOKEN)
367 rcu_read_unlock_bh();
370 static int rt_cache_seq_show(struct seq_file *seq, void *v)
372 if (v == SEQ_START_TOKEN)
373 seq_printf(seq, "%-127s\n",
374 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
375 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
376 "HHUptod\tSpecDst");
377 else {
378 struct rtable *r = v;
379 int len;
381 seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
382 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
383 r->u.dst.dev ? r->u.dst.dev->name : "*",
384 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
385 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
386 r->u.dst.__use, 0, (unsigned long)r->rt_src,
387 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
388 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
389 dst_metric(&r->u.dst, RTAX_WINDOW),
390 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
391 dst_metric(&r->u.dst, RTAX_RTTVAR)),
392 r->fl.fl4_tos,
393 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
394 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
395 dev_queue_xmit) : 0,
396 r->rt_spec_dst, &len);
398 seq_printf(seq, "%*s\n", 127 - len, "");
400 return 0;
403 static const struct seq_operations rt_cache_seq_ops = {
404 .start = rt_cache_seq_start,
405 .next = rt_cache_seq_next,
406 .stop = rt_cache_seq_stop,
407 .show = rt_cache_seq_show,
410 static int rt_cache_seq_open(struct inode *inode, struct file *file)
412 return seq_open_net(inode, file, &rt_cache_seq_ops,
413 sizeof(struct rt_cache_iter_state));
416 static const struct file_operations rt_cache_seq_fops = {
417 .owner = THIS_MODULE,
418 .open = rt_cache_seq_open,
419 .read = seq_read,
420 .llseek = seq_lseek,
421 .release = seq_release_net,
425 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
427 int cpu;
429 if (*pos == 0)
430 return SEQ_START_TOKEN;
432 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
433 if (!cpu_possible(cpu))
434 continue;
435 *pos = cpu+1;
436 return &per_cpu(rt_cache_stat, cpu);
438 return NULL;
441 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
443 int cpu;
445 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
446 if (!cpu_possible(cpu))
447 continue;
448 *pos = cpu+1;
449 return &per_cpu(rt_cache_stat, cpu);
451 return NULL;
455 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
460 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
462 struct rt_cache_stat *st = v;
464 if (v == SEQ_START_TOKEN) {
465 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
466 return 0;
469 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
470 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
471 atomic_read(&ipv4_dst_ops.entries),
472 st->in_hit,
473 st->in_slow_tot,
474 st->in_slow_mc,
475 st->in_no_route,
476 st->in_brd,
477 st->in_martian_dst,
478 st->in_martian_src,
480 st->out_hit,
481 st->out_slow_tot,
482 st->out_slow_mc,
484 st->gc_total,
485 st->gc_ignored,
486 st->gc_goal_miss,
487 st->gc_dst_overflow,
488 st->in_hlist_search,
489 st->out_hlist_search
491 return 0;
494 static const struct seq_operations rt_cpu_seq_ops = {
495 .start = rt_cpu_seq_start,
496 .next = rt_cpu_seq_next,
497 .stop = rt_cpu_seq_stop,
498 .show = rt_cpu_seq_show,
502 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
504 return seq_open(file, &rt_cpu_seq_ops);
507 static const struct file_operations rt_cpu_seq_fops = {
508 .owner = THIS_MODULE,
509 .open = rt_cpu_seq_open,
510 .read = seq_read,
511 .llseek = seq_lseek,
512 .release = seq_release,
515 #ifdef CONFIG_NET_CLS_ROUTE
516 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
517 int length, int *eof, void *data)
519 unsigned int i;
521 if ((offset & 3) || (length & 3))
522 return -EIO;
524 if (offset >= sizeof(struct ip_rt_acct) * 256) {
525 *eof = 1;
526 return 0;
529 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
530 length = sizeof(struct ip_rt_acct) * 256 - offset;
531 *eof = 1;
534 offset /= sizeof(u32);
536 if (length > 0) {
537 u32 *dst = (u32 *) buffer;
539 *start = buffer;
540 memset(dst, 0, length);
542 for_each_possible_cpu(i) {
543 unsigned int j;
544 u32 *src;
546 src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
547 for (j = 0; j < length/4; j++)
548 dst[j] += src[j];
551 return length;
553 #endif
555 static int __net_init ip_rt_do_proc_init(struct net *net)
557 struct proc_dir_entry *pde;
559 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
560 &rt_cache_seq_fops);
561 if (!pde)
562 goto err1;
564 pde = proc_create("rt_cache", S_IRUGO,
565 net->proc_net_stat, &rt_cpu_seq_fops);
566 if (!pde)
567 goto err2;
569 #ifdef CONFIG_NET_CLS_ROUTE
570 pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
571 ip_rt_acct_read, NULL);
572 if (!pde)
573 goto err3;
574 #endif
575 return 0;
577 #ifdef CONFIG_NET_CLS_ROUTE
578 err3:
579 remove_proc_entry("rt_cache", net->proc_net_stat);
580 #endif
581 err2:
582 remove_proc_entry("rt_cache", net->proc_net);
583 err1:
584 return -ENOMEM;
587 static void __net_exit ip_rt_do_proc_exit(struct net *net)
589 remove_proc_entry("rt_cache", net->proc_net_stat);
590 remove_proc_entry("rt_cache", net->proc_net);
591 remove_proc_entry("rt_acct", net->proc_net);
594 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
595 .init = ip_rt_do_proc_init,
596 .exit = ip_rt_do_proc_exit,
599 static int __init ip_rt_proc_init(void)
601 return register_pernet_subsys(&ip_rt_proc_ops);
604 #else
605 static inline int ip_rt_proc_init(void)
607 return 0;
609 #endif /* CONFIG_PROC_FS */
611 static inline void rt_free(struct rtable *rt)
613 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
616 static inline void rt_drop(struct rtable *rt)
618 ip_rt_put(rt);
619 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
622 static inline int rt_fast_clean(struct rtable *rth)
624 /* Kill broadcast/multicast entries very aggressively, if they
625 collide in the hash table with more useful entries */
626 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
627 rth->fl.iif && rth->u.dst.rt_next;
630 static inline int rt_valuable(struct rtable *rth)
632 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
633 rth->u.dst.expires;
636 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
638 unsigned long age;
639 int ret = 0;
641 if (atomic_read(&rth->u.dst.__refcnt))
642 goto out;
644 ret = 1;
645 if (rth->u.dst.expires &&
646 time_after_eq(jiffies, rth->u.dst.expires))
647 goto out;
649 age = jiffies - rth->u.dst.lastuse;
650 ret = 0;
651 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
652 (age <= tmo2 && rt_valuable(rth)))
653 goto out;
654 ret = 1;
655 out: return ret;
658 /* Bits of score are:
659 * 31: very valuable
660 * 30: not quite useless
661 * 29..0: usage counter
663 static inline u32 rt_score(struct rtable *rt)
665 u32 score = jiffies - rt->u.dst.lastuse;
667 score = ~score & ~(3<<30);
669 if (rt_valuable(rt))
670 score |= (1<<31);
672 if (!rt->fl.iif ||
673 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
674 score |= (1<<30);
676 return score;
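/*
 * Worked example (a sketch): an output route (fl.iif == 0) that is not
 * "valuable" and was last used 10 jiffies ago gets
 *	score = (~10UL & ~(3 << 30)) | (1 << 30)
 * i.e. bit 30 set and low bits that shrink as the entry ages, so the
 * oldest, least useful entry in a chain ends up with the smallest score
 * and becomes the eviction candidate (cand) in rt_intern_hash().
 */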
679 static inline bool rt_caching(const struct net *net)
681 return net->ipv4.current_rt_cache_rebuild_count <=
682 net->ipv4.sysctl_rt_cache_rebuild_count;
685 static inline bool compare_hash_inputs(const struct flowi *fl1,
686 const struct flowi *fl2)
688 return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
689 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
690 (fl1->iif ^ fl2->iif)) == 0);
693 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
695 return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
696 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
697 (fl1->mark ^ fl2->mark) |
698 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
699 *(u16 *)&fl2->nl_u.ip4_u.tos) |
700 (fl1->oif ^ fl2->oif) |
701 (fl1->iif ^ fl2->iif)) == 0;
704 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
706 return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
709 static inline int rt_is_expired(struct rtable *rth)
711 return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
715 * Perform a full scan of the hash table and free all entries.
716 * Can be called by a softirq or a process.
717 * In the latter case, we want to be rescheduled if necessary.
719 static void rt_do_flush(int process_context)
721 unsigned int i;
722 struct rtable *rth, *next;
723 struct rtable * tail;
725 for (i = 0; i <= rt_hash_mask; i++) {
726 if (process_context && need_resched())
727 cond_resched();
728 rth = rt_hash_table[i].chain;
729 if (!rth)
730 continue;
732 spin_lock_bh(rt_hash_lock_addr(i));
733 #ifdef CONFIG_NET_NS
735 struct rtable ** prev, * p;
737 rth = rt_hash_table[i].chain;
739 /* defer releasing the head of the list after spin_unlock */
740 for (tail = rth; tail; tail = tail->u.dst.rt_next)
741 if (!rt_is_expired(tail))
742 break;
743 if (rth != tail)
744 rt_hash_table[i].chain = tail;
746 /* call rt_free on entries after the tail requiring flush */
747 prev = &rt_hash_table[i].chain;
748 for (p = *prev; p; p = next) {
749 next = p->u.dst.rt_next;
750 if (!rt_is_expired(p)) {
751 prev = &p->u.dst.rt_next;
752 } else {
753 *prev = next;
754 rt_free(p);
758 #else
759 rth = rt_hash_table[i].chain;
760 rt_hash_table[i].chain = NULL;
761 tail = NULL;
762 #endif
763 spin_unlock_bh(rt_hash_lock_addr(i));
765 for (; rth != tail; rth = next) {
766 next = rth->u.dst.rt_next;
767 rt_free(rth);
773 * While freeing expired entries, we compute average chain length
774 * and standard deviation, using fixed-point arithmetic.
775 * This is to have an estimate of rt_chain_length_max:
776 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
777 * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
780 #define FRACT_BITS 3
781 #define ONE (1UL << FRACT_BITS)
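/*
 * Worked example (a sketch): with FRACT_BITS == 3, a chain of length 5
 * contributes 5 * ONE == 40 to "sum" in rt_check_expire(), so avg and sd
 * are kept in eighths of an entry; (avg + 4*sd) >> FRACT_BITS converts
 * the bound back to whole entries before it is compared with
 * ip_rt_gc_elasticity.
 */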
783 static void rt_check_expire(void)
785 static unsigned int rover;
786 unsigned int i = rover, goal;
787 struct rtable *rth, *aux, **rthp;
788 unsigned long samples = 0;
789 unsigned long sum = 0, sum2 = 0;
790 u64 mult;
792 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
793 if (ip_rt_gc_timeout > 1)
794 do_div(mult, ip_rt_gc_timeout);
795 goal = (unsigned int)mult;
796 if (goal > rt_hash_mask)
797 goal = rt_hash_mask + 1;
798 for (; goal > 0; goal--) {
799 unsigned long tmo = ip_rt_gc_timeout;
800 unsigned long length;
802 i = (i + 1) & rt_hash_mask;
803 rthp = &rt_hash_table[i].chain;
805 if (need_resched())
806 cond_resched();
808 samples++;
810 if (*rthp == NULL)
811 continue;
812 length = 0;
813 spin_lock_bh(rt_hash_lock_addr(i));
814 while ((rth = *rthp) != NULL) {
815 prefetch(rth->u.dst.rt_next);
816 if (rt_is_expired(rth)) {
817 *rthp = rth->u.dst.rt_next;
818 rt_free(rth);
819 continue;
821 if (rth->u.dst.expires) {
822 /* Entry is expired even if it is in use */
823 if (time_before_eq(jiffies, rth->u.dst.expires)) {
824 nofree:
825 tmo >>= 1;
826 rthp = &rth->u.dst.rt_next;
828 * We only count entries on
829 * a chain with equal hash inputs once
830 * so that entries for different QOS
831 * levels, and other non-hash input
832 * attributes don't unfairly skew
833 * the length computation
835 for (aux = rt_hash_table[i].chain;;) {
836 if (aux == rth) {
837 length += ONE;
838 break;
840 if (compare_hash_inputs(&aux->fl, &rth->fl))
841 break;
842 aux = aux->u.dst.rt_next;
844 continue;
846 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
847 goto nofree;
849 /* Cleanup aged off entries. */
850 *rthp = rth->u.dst.rt_next;
851 rt_free(rth);
853 spin_unlock_bh(rt_hash_lock_addr(i));
854 sum += length;
855 sum2 += length*length;
857 if (samples) {
858 unsigned long avg = sum / samples;
859 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
860 rt_chain_length_max = max_t(unsigned long,
861 ip_rt_gc_elasticity,
862 (avg + 4*sd) >> FRACT_BITS);
864 rover = i;
868 * rt_worker_func() is run in process context.
869 * we call rt_check_expire() to scan part of the hash table
871 static void rt_worker_func(struct work_struct *work)
873 rt_check_expire();
874 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
878 * Perturbation of rt_genid by a small quantity [1..256].
879 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
880 * many times (2^24) without reusing a recent rt_genid.
881 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
883 static void rt_cache_invalidate(struct net *net)
885 unsigned char shuffle;
887 get_random_bytes(&shuffle, sizeof(shuffle));
888 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
892 * delay < 0 : invalidate cache (fast : entries will be deleted later)
893 * delay >= 0 : invalidate & flush cache (can be long)
895 void rt_cache_flush(struct net *net, int delay)
897 rt_cache_invalidate(net);
898 if (delay >= 0)
899 rt_do_flush(!in_softirq());
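/*
 * Typical usage (a sketch, not from this file): callers that only need
 * old entries to stop matching pass a negative delay, e.g.
 *	rt_cache_flush(net, -1);
 * while callers that want the hash table walked and emptied right away
 * pass
 *	rt_cache_flush(net, 0);
 */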
903 * We change rt_genid and let gc do the cleanup
905 static void rt_secret_rebuild(unsigned long __net)
907 struct net *net = (struct net *)__net;
908 rt_cache_invalidate(net);
909 mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
912 static void rt_secret_rebuild_oneshot(struct net *net)
914 del_timer_sync(&net->ipv4.rt_secret_timer);
915 rt_cache_invalidate(net);
916 if (ip_rt_secret_interval) {
917 net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval;
918 add_timer(&net->ipv4.rt_secret_timer);
922 static void rt_emergency_hash_rebuild(struct net *net)
924 if (net_ratelimit()) {
925 printk(KERN_WARNING "Route hash chain too long!\n");
926 printk(KERN_WARNING "Adjust your secret_interval!\n");
929 rt_secret_rebuild_oneshot(net);
933 Short description of GC goals.
935 We want to build an algorithm which will keep the routing cache
936 at some equilibrium point, where the number of aged-off entries
937 is kept approximately equal to the number of newly generated ones.
939 The current expiration strength is the variable "expire".
940 We try to adjust it dynamically, so that when networking
941 is idle "expire" is large enough to keep enough warm entries,
942 and when load increases it shrinks to limit the cache size.
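/*
 * Numeric sketch of the "goal" computation below: assuming a hash table
 * with rt_hash_log == 17 and the default ip_rt_gc_elasticity of 8, the
 * collector only works towards a real goal once the cache holds more
 * than 8 << 17 == 1048576 entries; below that, goal <= 0 and the code
 * falls back to trimming towards the "equilibrium" size instead.
 */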
945 static int rt_garbage_collect(struct dst_ops *ops)
947 static unsigned long expire = RT_GC_TIMEOUT;
948 static unsigned long last_gc;
949 static int rover;
950 static int equilibrium;
951 struct rtable *rth, **rthp;
952 unsigned long now = jiffies;
953 int goal;
956 * Garbage collection is pretty expensive,
957 * do not make it too frequently.
960 RT_CACHE_STAT_INC(gc_total);
962 if (now - last_gc < ip_rt_gc_min_interval &&
963 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
964 RT_CACHE_STAT_INC(gc_ignored);
965 goto out;
968 /* Calculate number of entries, which we want to expire now. */
969 goal = atomic_read(&ipv4_dst_ops.entries) -
970 (ip_rt_gc_elasticity << rt_hash_log);
971 if (goal <= 0) {
972 if (equilibrium < ipv4_dst_ops.gc_thresh)
973 equilibrium = ipv4_dst_ops.gc_thresh;
974 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
975 if (goal > 0) {
976 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
977 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
979 } else {
980 /* We are in dangerous area. Try to reduce cache really
981 * aggressively.
983 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
984 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
987 if (now - last_gc >= ip_rt_gc_min_interval)
988 last_gc = now;
990 if (goal <= 0) {
991 equilibrium += goal;
992 goto work_done;
995 do {
996 int i, k;
998 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
999 unsigned long tmo = expire;
1001 k = (k + 1) & rt_hash_mask;
1002 rthp = &rt_hash_table[k].chain;
1003 spin_lock_bh(rt_hash_lock_addr(k));
1004 while ((rth = *rthp) != NULL) {
1005 if (!rt_is_expired(rth) &&
1006 !rt_may_expire(rth, tmo, expire)) {
1007 tmo >>= 1;
1008 rthp = &rth->u.dst.rt_next;
1009 continue;
1011 *rthp = rth->u.dst.rt_next;
1012 rt_free(rth);
1013 goal--;
1015 spin_unlock_bh(rt_hash_lock_addr(k));
1016 if (goal <= 0)
1017 break;
1019 rover = k;
1021 if (goal <= 0)
1022 goto work_done;
1024 /* Goal is not achieved. We stop the process if:
1026 - expire has been reduced to zero; otherwise expire is halved.
1027 - the table is not full.
1028 - we are called from interrupt context.
1029 - the jiffies check is just a fallback/debug loop breaker.
1030 We will not spin here for a long time in any case.
1033 RT_CACHE_STAT_INC(gc_goal_miss);
1035 if (expire == 0)
1036 break;
1038 expire >>= 1;
1039 #if RT_CACHE_DEBUG >= 2
1040 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1041 atomic_read(&ipv4_dst_ops.entries), goal, i);
1042 #endif
1044 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1045 goto out;
1046 } while (!in_softirq() && time_before_eq(jiffies, now));
1048 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1049 goto out;
1050 if (net_ratelimit())
1051 printk(KERN_WARNING "dst cache overflow\n");
1052 RT_CACHE_STAT_INC(gc_dst_overflow);
1053 return 1;
1055 work_done:
1056 expire += ip_rt_gc_min_interval;
1057 if (expire > ip_rt_gc_timeout ||
1058 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
1059 expire = ip_rt_gc_timeout;
1060 #if RT_CACHE_DEBUG >= 2
1061 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1062 atomic_read(&ipv4_dst_ops.entries), goal, rover);
1063 #endif
1064 out: return 0;
1067 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
1069 struct rtable *rth, **rthp;
1070 unsigned long now;
1071 struct rtable *cand, **candp;
1072 u32 min_score;
1073 int chain_length;
1074 int attempts = !in_softirq();
1076 restart:
1077 chain_length = 0;
1078 min_score = ~(u32)0;
1079 cand = NULL;
1080 candp = NULL;
1081 now = jiffies;
1083 if (!rt_caching(dev_net(rt->u.dst.dev))) {
1085 * If we're not caching, just tell the caller we
1086 * were successful and don't touch the route. The
1087 * caller holds the sole reference to the cache entry, and
1088 * it will be released when the caller is done with it.
1089 * If we drop it here, the callers have no way to resolve routes
1090 * when we're not caching. Instead, just point *rp at rt, so
1091 * the caller gets a single use out of the route
1092 * Note that we do rt_free on this new route entry, so that
1093 * once its refcount hits zero, we are still able to reap it
1094 * (Thanks Alexey)
1095 * Note also the rt_free uses call_rcu. We don't actually
1096 * need rcu protection here, this is just our path to get
1097 * on the route gc list.
1100 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1101 int err = arp_bind_neighbour(&rt->u.dst);
1102 if (err) {
1103 if (net_ratelimit())
1104 printk(KERN_WARNING
1105 "Neighbour table failure & not caching routes.\n");
1106 rt_drop(rt);
1107 return err;
1111 rt_free(rt);
1112 goto skip_hashing;
1115 rthp = &rt_hash_table[hash].chain;
1117 spin_lock_bh(rt_hash_lock_addr(hash));
1118 while ((rth = *rthp) != NULL) {
1119 if (rt_is_expired(rth)) {
1120 *rthp = rth->u.dst.rt_next;
1121 rt_free(rth);
1122 continue;
1124 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1125 /* Put it first */
1126 *rthp = rth->u.dst.rt_next;
1128 * Since lookup is lockfree, the deletion
1129 * must be visible to another weakly ordered CPU before
1130 * the insertion at the start of the hash chain.
1132 rcu_assign_pointer(rth->u.dst.rt_next,
1133 rt_hash_table[hash].chain);
1135 * Since lookup is lockfree, the update writes
1136 * must be ordered for consistency on SMP.
1138 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1140 dst_use(&rth->u.dst, now);
1141 spin_unlock_bh(rt_hash_lock_addr(hash));
1143 rt_drop(rt);
1144 *rp = rth;
1145 return 0;
1148 if (!atomic_read(&rth->u.dst.__refcnt)) {
1149 u32 score = rt_score(rth);
1151 if (score <= min_score) {
1152 cand = rth;
1153 candp = rthp;
1154 min_score = score;
1158 chain_length++;
1160 rthp = &rth->u.dst.rt_next;
1163 if (cand) {
1164 /* ip_rt_gc_elasticity used to be average length of chain
1165 * length, when exceeded gc becomes really aggressive.
1167 * The second limit is less certain. At the moment it allows
1168 * only 2 entries per bucket. We will see.
1170 if (chain_length > ip_rt_gc_elasticity) {
1171 *candp = cand->u.dst.rt_next;
1172 rt_free(cand);
1174 } else {
1175 if (chain_length > rt_chain_length_max) {
1176 struct net *net = dev_net(rt->u.dst.dev);
1177 int num = ++net->ipv4.current_rt_cache_rebuild_count;
1178 if (!rt_caching(dev_net(rt->u.dst.dev))) {
1179 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1180 rt->u.dst.dev->name, num);
1182 rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev));
1186 /* Try to bind the route to an ARP entry only if it is an output
1187 route or a unicast forwarding path.
1189 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1190 int err = arp_bind_neighbour(&rt->u.dst);
1191 if (err) {
1192 spin_unlock_bh(rt_hash_lock_addr(hash));
1194 if (err != -ENOBUFS) {
1195 rt_drop(rt);
1196 return err;
1199 /* Neighbour tables are full and nothing
1200 can be released. Try to shrink the route cache;
1201 it most likely holds some neighbour records.
1203 if (attempts-- > 0) {
1204 int saved_elasticity = ip_rt_gc_elasticity;
1205 int saved_int = ip_rt_gc_min_interval;
1206 ip_rt_gc_elasticity = 1;
1207 ip_rt_gc_min_interval = 0;
1208 rt_garbage_collect(&ipv4_dst_ops);
1209 ip_rt_gc_min_interval = saved_int;
1210 ip_rt_gc_elasticity = saved_elasticity;
1211 goto restart;
1214 if (net_ratelimit())
1215 printk(KERN_WARNING "Neighbour table overflow.\n");
1216 rt_drop(rt);
1217 return -ENOBUFS;
1221 rt->u.dst.rt_next = rt_hash_table[hash].chain;
1223 #if RT_CACHE_DEBUG >= 2
1224 if (rt->u.dst.rt_next) {
1225 struct rtable *trt;
1226 printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1227 hash, &rt->rt_dst);
1228 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1229 printk(" . %pI4", &trt->rt_dst);
1230 printk("\n");
1232 #endif
1234 * Since lookup is lockfree, we must make sure
1235 * previous writes to rt are committed to memory
1236 * before making rt visible to other CPUs.
1238 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1240 spin_unlock_bh(rt_hash_lock_addr(hash));
1242 skip_hashing:
1243 *rp = rt;
1244 return 0;
1247 void rt_bind_peer(struct rtable *rt, int create)
1249 static DEFINE_SPINLOCK(rt_peer_lock);
1250 struct inet_peer *peer;
1252 peer = inet_getpeer(rt->rt_dst, create);
1254 spin_lock_bh(&rt_peer_lock);
1255 if (rt->peer == NULL) {
1256 rt->peer = peer;
1257 peer = NULL;
1259 spin_unlock_bh(&rt_peer_lock);
1260 if (peer)
1261 inet_putpeer(peer);
1265 * Peer allocation may fail only in serious out-of-memory conditions. However
1266 * we can still generate some output.
1267 * Random ID selection looks a bit dangerous because we have no chance of
1268 * selecting an ID that is unique over a reasonable period of time.
1269 * But a broken packet identifier may be better than no packet at all.
1271 static void ip_select_fb_ident(struct iphdr *iph)
1273 static DEFINE_SPINLOCK(ip_fb_id_lock);
1274 static u32 ip_fallback_id;
1275 u32 salt;
1277 spin_lock_bh(&ip_fb_id_lock);
1278 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1279 iph->id = htons(salt & 0xFFFF);
1280 ip_fallback_id = salt;
1281 spin_unlock_bh(&ip_fb_id_lock);
1284 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1286 struct rtable *rt = (struct rtable *) dst;
1288 if (rt) {
1289 if (rt->peer == NULL)
1290 rt_bind_peer(rt, 1);
1292 /* If peer is attached to destination, it is never detached,
1293 so we do not need to grab a lock to dereference it.
1295 if (rt->peer) {
1296 iph->id = htons(inet_getid(rt->peer, more));
1297 return;
1299 } else
1300 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1301 __builtin_return_address(0));
1303 ip_select_fb_ident(iph);
1306 static void rt_del(unsigned hash, struct rtable *rt)
1308 struct rtable **rthp, *aux;
1310 rthp = &rt_hash_table[hash].chain;
1311 spin_lock_bh(rt_hash_lock_addr(hash));
1312 ip_rt_put(rt);
1313 while ((aux = *rthp) != NULL) {
1314 if (aux == rt || rt_is_expired(aux)) {
1315 *rthp = aux->u.dst.rt_next;
1316 rt_free(aux);
1317 continue;
1319 rthp = &aux->u.dst.rt_next;
1321 spin_unlock_bh(rt_hash_lock_addr(hash));
1324 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1325 __be32 saddr, struct net_device *dev)
1327 int i, k;
1328 struct in_device *in_dev = in_dev_get(dev);
1329 struct rtable *rth, **rthp;
1330 __be32 skeys[2] = { saddr, 0 };
1331 int ikeys[2] = { dev->ifindex, 0 };
1332 struct netevent_redirect netevent;
1333 struct net *net;
1335 if (!in_dev)
1336 return;
1338 net = dev_net(dev);
1339 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1340 || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1341 || ipv4_is_zeronet(new_gw))
1342 goto reject_redirect;
1344 if (!rt_caching(net))
1345 goto reject_redirect;
1347 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1348 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1349 goto reject_redirect;
1350 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1351 goto reject_redirect;
1352 } else {
1353 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1354 goto reject_redirect;
1357 for (i = 0; i < 2; i++) {
1358 for (k = 0; k < 2; k++) {
1359 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1360 rt_genid(net));
1362 rthp=&rt_hash_table[hash].chain;
1364 rcu_read_lock();
1365 while ((rth = rcu_dereference(*rthp)) != NULL) {
1366 struct rtable *rt;
1368 if (rth->fl.fl4_dst != daddr ||
1369 rth->fl.fl4_src != skeys[i] ||
1370 rth->fl.oif != ikeys[k] ||
1371 rth->fl.iif != 0 ||
1372 rt_is_expired(rth) ||
1373 !net_eq(dev_net(rth->u.dst.dev), net)) {
1374 rthp = &rth->u.dst.rt_next;
1375 continue;
1378 if (rth->rt_dst != daddr ||
1379 rth->rt_src != saddr ||
1380 rth->u.dst.error ||
1381 rth->rt_gateway != old_gw ||
1382 rth->u.dst.dev != dev)
1383 break;
1385 dst_hold(&rth->u.dst);
1386 rcu_read_unlock();
1388 rt = dst_alloc(&ipv4_dst_ops);
1389 if (rt == NULL) {
1390 ip_rt_put(rth);
1391 in_dev_put(in_dev);
1392 return;
1395 /* Copy all the information. */
1396 *rt = *rth;
1397 rt->u.dst.__use = 1;
1398 atomic_set(&rt->u.dst.__refcnt, 1);
1399 rt->u.dst.child = NULL;
1400 if (rt->u.dst.dev)
1401 dev_hold(rt->u.dst.dev);
1402 if (rt->idev)
1403 in_dev_hold(rt->idev);
1404 rt->u.dst.obsolete = 0;
1405 rt->u.dst.lastuse = jiffies;
1406 rt->u.dst.path = &rt->u.dst;
1407 rt->u.dst.neighbour = NULL;
1408 rt->u.dst.hh = NULL;
1409 #ifdef CONFIG_XFRM
1410 rt->u.dst.xfrm = NULL;
1411 #endif
1412 rt->rt_genid = rt_genid(net);
1413 rt->rt_flags |= RTCF_REDIRECTED;
1415 /* Gateway is different ... */
1416 rt->rt_gateway = new_gw;
1418 /* Redirect received -> path was valid */
1419 dst_confirm(&rth->u.dst);
1421 if (rt->peer)
1422 atomic_inc(&rt->peer->refcnt);
1424 if (arp_bind_neighbour(&rt->u.dst) ||
1425 !(rt->u.dst.neighbour->nud_state &
1426 NUD_VALID)) {
1427 if (rt->u.dst.neighbour)
1428 neigh_event_send(rt->u.dst.neighbour, NULL);
1429 ip_rt_put(rth);
1430 rt_drop(rt);
1431 goto do_next;
1434 netevent.old = &rth->u.dst;
1435 netevent.new = &rt->u.dst;
1436 call_netevent_notifiers(NETEVENT_REDIRECT,
1437 &netevent);
1439 rt_del(hash, rth);
1440 if (!rt_intern_hash(hash, rt, &rt))
1441 ip_rt_put(rt);
1442 goto do_next;
1444 rcu_read_unlock();
1445 do_next:
1449 in_dev_put(in_dev);
1450 return;
1452 reject_redirect:
1453 #ifdef CONFIG_IP_ROUTE_VERBOSE
1454 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1455 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1456 " Advised path = %pI4 -> %pI4\n",
1457 &old_gw, dev->name, &new_gw,
1458 &saddr, &daddr);
1459 #endif
1460 in_dev_put(in_dev);
1463 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1465 struct rtable *rt = (struct rtable *)dst;
1466 struct dst_entry *ret = dst;
1468 if (rt) {
1469 if (dst->obsolete) {
1470 ip_rt_put(rt);
1471 ret = NULL;
1472 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1473 rt->u.dst.expires) {
1474 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1475 rt->fl.oif,
1476 rt_genid(dev_net(dst->dev)));
1477 #if RT_CACHE_DEBUG >= 1
1478 printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1479 &rt->rt_dst, rt->fl.fl4_tos);
1480 #endif
1481 rt_del(hash, rt);
1482 ret = NULL;
1485 return ret;
1489 * Algorithm:
1490 * 1. The first ip_rt_redirect_number redirects are sent
1491 * with exponential backoff, then we stop sending them at all,
1492 * assuming that the host ignores our redirects.
1493 * 2. If we did not see packets requiring redirects
1494 * during ip_rt_redirect_silence, we assume that the host
1495 * forgot redirected route and start to send redirects again.
1497 * This algorithm is much cheaper and more intelligent than dumb load limiting
1498 * in icmp.c.
1500 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1501 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1504 void ip_rt_send_redirect(struct sk_buff *skb)
1506 struct rtable *rt = skb->rtable;
1507 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1509 if (!in_dev)
1510 return;
1512 if (!IN_DEV_TX_REDIRECTS(in_dev))
1513 goto out;
1515 /* No redirected packets during ip_rt_redirect_silence;
1516 * reset the algorithm.
1518 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1519 rt->u.dst.rate_tokens = 0;
1521 /* Too many ignored redirects; do not send anything
1522 * set u.dst.rate_last to the last seen redirected packet.
1524 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1525 rt->u.dst.rate_last = jiffies;
1526 goto out;
1529 /* Check for load limit; set rate_last to the latest sent
1530 * redirect.
1532 if (rt->u.dst.rate_tokens == 0 ||
1533 time_after(jiffies,
1534 (rt->u.dst.rate_last +
1535 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1536 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1537 rt->u.dst.rate_last = jiffies;
1538 ++rt->u.dst.rate_tokens;
1539 #ifdef CONFIG_IP_ROUTE_VERBOSE
1540 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1541 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1542 net_ratelimit())
1543 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1544 &rt->rt_src, rt->rt_iif,
1545 &rt->rt_dst, &rt->rt_gateway);
1546 #endif
1548 out:
1549 in_dev_put(in_dev);
1552 static int ip_error(struct sk_buff *skb)
1554 struct rtable *rt = skb->rtable;
1555 unsigned long now;
1556 int code;
1558 switch (rt->u.dst.error) {
1559 case EINVAL:
1560 default:
1561 goto out;
1562 case EHOSTUNREACH:
1563 code = ICMP_HOST_UNREACH;
1564 break;
1565 case ENETUNREACH:
1566 code = ICMP_NET_UNREACH;
1567 IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
1568 IPSTATS_MIB_INNOROUTES);
1569 break;
1570 case EACCES:
1571 code = ICMP_PKT_FILTERED;
1572 break;
1575 now = jiffies;
1576 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1577 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1578 rt->u.dst.rate_tokens = ip_rt_error_burst;
1579 rt->u.dst.rate_last = now;
1580 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1581 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1582 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1585 out: kfree_skb(skb);
1586 return 0;
1590 * The last two values are not from the RFC but
1591 * are needed for AMPRnet AX.25 paths.
1594 static const unsigned short mtu_plateau[] =
1595 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1597 static inline unsigned short guess_mtu(unsigned short old_mtu)
1599 int i;
1601 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1602 if (old_mtu > mtu_plateau[i])
1603 return mtu_plateau[i];
1604 return 68;
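/*
 * Worked example (a sketch): if a router reports "fragmentation needed"
 * with a next-hop MTU of 0 (old BSD behaviour) for a 1500 byte packet,
 * guess_mtu(1500) walks the plateau table and returns 1492, the largest
 * plateau strictly below the old MTU; 68 is the protocol minimum.
 */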
1607 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1608 unsigned short new_mtu,
1609 struct net_device *dev)
1611 int i, k;
1612 unsigned short old_mtu = ntohs(iph->tot_len);
1613 struct rtable *rth;
1614 int ikeys[2] = { dev->ifindex, 0 };
1615 __be32 skeys[2] = { iph->saddr, 0, };
1616 __be32 daddr = iph->daddr;
1617 unsigned short est_mtu = 0;
1619 if (ipv4_config.no_pmtu_disc)
1620 return 0;
1622 for (k = 0; k < 2; k++) {
1623 for (i = 0; i < 2; i++) {
1624 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1625 rt_genid(net));
1627 rcu_read_lock();
1628 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1629 rth = rcu_dereference(rth->u.dst.rt_next)) {
1630 unsigned short mtu = new_mtu;
1632 if (rth->fl.fl4_dst != daddr ||
1633 rth->fl.fl4_src != skeys[i] ||
1634 rth->rt_dst != daddr ||
1635 rth->rt_src != iph->saddr ||
1636 rth->fl.oif != ikeys[k] ||
1637 rth->fl.iif != 0 ||
1638 dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1639 !net_eq(dev_net(rth->u.dst.dev), net) ||
1640 rt_is_expired(rth))
1641 continue;
1643 if (new_mtu < 68 || new_mtu >= old_mtu) {
1645 /* BSD 4.2 compatibility hack :-( */
1646 if (mtu == 0 &&
1647 old_mtu >= dst_mtu(&rth->u.dst) &&
1648 old_mtu >= 68 + (iph->ihl << 2))
1649 old_mtu -= iph->ihl << 2;
1651 mtu = guess_mtu(old_mtu);
1653 if (mtu <= dst_mtu(&rth->u.dst)) {
1654 if (mtu < dst_mtu(&rth->u.dst)) {
1655 dst_confirm(&rth->u.dst);
1656 if (mtu < ip_rt_min_pmtu) {
1657 mtu = ip_rt_min_pmtu;
1658 rth->u.dst.metrics[RTAX_LOCK-1] |=
1659 (1 << RTAX_MTU);
1661 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1662 dst_set_expires(&rth->u.dst,
1663 ip_rt_mtu_expires);
1665 est_mtu = mtu;
1668 rcu_read_unlock();
1671 return est_mtu ? : new_mtu;
1674 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1676 if (dst_mtu(dst) > mtu && mtu >= 68 &&
1677 !(dst_metric_locked(dst, RTAX_MTU))) {
1678 if (mtu < ip_rt_min_pmtu) {
1679 mtu = ip_rt_min_pmtu;
1680 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1682 dst->metrics[RTAX_MTU-1] = mtu;
1683 dst_set_expires(dst, ip_rt_mtu_expires);
1684 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1688 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1690 return NULL;
1693 static void ipv4_dst_destroy(struct dst_entry *dst)
1695 struct rtable *rt = (struct rtable *) dst;
1696 struct inet_peer *peer = rt->peer;
1697 struct in_device *idev = rt->idev;
1699 if (peer) {
1700 rt->peer = NULL;
1701 inet_putpeer(peer);
1704 if (idev) {
1705 rt->idev = NULL;
1706 in_dev_put(idev);
1710 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1711 int how)
1713 struct rtable *rt = (struct rtable *) dst;
1714 struct in_device *idev = rt->idev;
1715 if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1716 struct in_device *loopback_idev =
1717 in_dev_get(dev_net(dev)->loopback_dev);
1718 if (loopback_idev) {
1719 rt->idev = loopback_idev;
1720 in_dev_put(idev);
1725 static void ipv4_link_failure(struct sk_buff *skb)
1727 struct rtable *rt;
1729 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1731 rt = skb->rtable;
1732 if (rt)
1733 dst_set_expires(&rt->u.dst, 0);
1736 static int ip_rt_bug(struct sk_buff *skb)
1738 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1739 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1740 skb->dev ? skb->dev->name : "?");
1741 kfree_skb(skb);
1742 return 0;
1746 We do not cache the source address of the outgoing interface,
1747 because it is used only by IP RR, TS and SRR options,
1748 so it is out of the fast path.
1750 BTW remember: "addr" is allowed to be unaligned
1751 in IP options!
1754 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1756 __be32 src;
1757 struct fib_result res;
1759 if (rt->fl.iif == 0)
1760 src = rt->rt_src;
1761 else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1762 src = FIB_RES_PREFSRC(res);
1763 fib_res_put(&res);
1764 } else
1765 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1766 RT_SCOPE_UNIVERSE);
1767 memcpy(addr, &src, 4);
1770 #ifdef CONFIG_NET_CLS_ROUTE
1771 static void set_class_tag(struct rtable *rt, u32 tag)
1773 if (!(rt->u.dst.tclassid & 0xFFFF))
1774 rt->u.dst.tclassid |= tag & 0xFFFF;
1775 if (!(rt->u.dst.tclassid & 0xFFFF0000))
1776 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1778 #endif
1780 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1782 struct fib_info *fi = res->fi;
1784 if (fi) {
1785 if (FIB_RES_GW(*res) &&
1786 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1787 rt->rt_gateway = FIB_RES_GW(*res);
1788 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1789 sizeof(rt->u.dst.metrics));
1790 if (fi->fib_mtu == 0) {
1791 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1792 if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1793 rt->rt_gateway != rt->rt_dst &&
1794 rt->u.dst.dev->mtu > 576)
1795 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1797 #ifdef CONFIG_NET_CLS_ROUTE
1798 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1799 #endif
1800 } else
1801 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1803 if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1804 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1805 if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
1806 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1807 if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1808 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1809 ip_rt_min_advmss);
1810 if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1811 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1813 #ifdef CONFIG_NET_CLS_ROUTE
1814 #ifdef CONFIG_IP_MULTIPLE_TABLES
1815 set_class_tag(rt, fib_rules_tclass(res));
1816 #endif
1817 set_class_tag(rt, itag);
1818 #endif
1819 rt->rt_type = res->type;
1822 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1823 u8 tos, struct net_device *dev, int our)
1825 unsigned hash;
1826 struct rtable *rth;
1827 __be32 spec_dst;
1828 struct in_device *in_dev = in_dev_get(dev);
1829 u32 itag = 0;
1831 /* Primary sanity checks. */
1833 if (in_dev == NULL)
1834 return -EINVAL;
1836 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1837 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1838 goto e_inval;
1840 if (ipv4_is_zeronet(saddr)) {
1841 if (!ipv4_is_local_multicast(daddr))
1842 goto e_inval;
1843 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1844 } else if (fib_validate_source(saddr, 0, tos, 0,
1845 dev, &spec_dst, &itag) < 0)
1846 goto e_inval;
1848 rth = dst_alloc(&ipv4_dst_ops);
1849 if (!rth)
1850 goto e_nobufs;
1852 rth->u.dst.output= ip_rt_bug;
1854 atomic_set(&rth->u.dst.__refcnt, 1);
1855 rth->u.dst.flags= DST_HOST;
1856 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1857 rth->u.dst.flags |= DST_NOPOLICY;
1858 rth->fl.fl4_dst = daddr;
1859 rth->rt_dst = daddr;
1860 rth->fl.fl4_tos = tos;
1861 rth->fl.mark = skb->mark;
1862 rth->fl.fl4_src = saddr;
1863 rth->rt_src = saddr;
1864 #ifdef CONFIG_NET_CLS_ROUTE
1865 rth->u.dst.tclassid = itag;
1866 #endif
1867 rth->rt_iif =
1868 rth->fl.iif = dev->ifindex;
1869 rth->u.dst.dev = init_net.loopback_dev;
1870 dev_hold(rth->u.dst.dev);
1871 rth->idev = in_dev_get(rth->u.dst.dev);
1872 rth->fl.oif = 0;
1873 rth->rt_gateway = daddr;
1874 rth->rt_spec_dst= spec_dst;
1875 rth->rt_genid = rt_genid(dev_net(dev));
1876 rth->rt_flags = RTCF_MULTICAST;
1877 rth->rt_type = RTN_MULTICAST;
1878 if (our) {
1879 rth->u.dst.input= ip_local_deliver;
1880 rth->rt_flags |= RTCF_LOCAL;
1883 #ifdef CONFIG_IP_MROUTE
1884 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1885 rth->u.dst.input = ip_mr_input;
1886 #endif
1887 RT_CACHE_STAT_INC(in_slow_mc);
1889 in_dev_put(in_dev);
1890 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1891 return rt_intern_hash(hash, rth, &skb->rtable);
1893 e_nobufs:
1894 in_dev_put(in_dev);
1895 return -ENOBUFS;
1897 e_inval:
1898 in_dev_put(in_dev);
1899 return -EINVAL;
1903 static void ip_handle_martian_source(struct net_device *dev,
1904 struct in_device *in_dev,
1905 struct sk_buff *skb,
1906 __be32 daddr,
1907 __be32 saddr)
1909 RT_CACHE_STAT_INC(in_martian_src);
1910 #ifdef CONFIG_IP_ROUTE_VERBOSE
1911 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1913 * RFC1812 recommendation, if source is martian,
1914 * the only hint is MAC header.
1916 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1917 &daddr, &saddr, dev->name);
1918 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1919 int i;
1920 const unsigned char *p = skb_mac_header(skb);
1921 printk(KERN_WARNING "ll header: ");
1922 for (i = 0; i < dev->hard_header_len; i++, p++) {
1923 printk("%02x", *p);
1924 if (i < (dev->hard_header_len - 1))
1925 printk(":");
1927 printk("\n");
1930 #endif
1933 static int __mkroute_input(struct sk_buff *skb,
1934 struct fib_result *res,
1935 struct in_device *in_dev,
1936 __be32 daddr, __be32 saddr, u32 tos,
1937 struct rtable **result)
1940 struct rtable *rth;
1941 int err;
1942 struct in_device *out_dev;
1943 unsigned flags = 0;
1944 __be32 spec_dst;
1945 u32 itag;
1947 /* get a working reference to the output device */
1948 out_dev = in_dev_get(FIB_RES_DEV(*res));
1949 if (out_dev == NULL) {
1950 if (net_ratelimit())
1951 printk(KERN_CRIT "Bug in ip_route_input" \
1952 "_slow(). Please, report\n");
1953 return -EINVAL;
1957 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1958 in_dev->dev, &spec_dst, &itag);
1959 if (err < 0) {
1960 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1961 saddr);
1963 err = -EINVAL;
1964 goto cleanup;
1967 if (err)
1968 flags |= RTCF_DIRECTSRC;
1970 if (out_dev == in_dev && err &&
1971 (IN_DEV_SHARED_MEDIA(out_dev) ||
1972 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1973 flags |= RTCF_DOREDIRECT;
1975 if (skb->protocol != htons(ETH_P_IP)) {
1976 /* Not IP (i.e. ARP). Do not create route, if it is
1977 * invalid for proxy arp. DNAT routes are always valid.
1979 if (out_dev == in_dev) {
1980 err = -EINVAL;
1981 goto cleanup;
1986 rth = dst_alloc(&ipv4_dst_ops);
1987 if (!rth) {
1988 err = -ENOBUFS;
1989 goto cleanup;
1992 atomic_set(&rth->u.dst.__refcnt, 1);
1993 rth->u.dst.flags= DST_HOST;
1994 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1995 rth->u.dst.flags |= DST_NOPOLICY;
1996 if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1997 rth->u.dst.flags |= DST_NOXFRM;
1998 rth->fl.fl4_dst = daddr;
1999 rth->rt_dst = daddr;
2000 rth->fl.fl4_tos = tos;
2001 rth->fl.mark = skb->mark;
2002 rth->fl.fl4_src = saddr;
2003 rth->rt_src = saddr;
2004 rth->rt_gateway = daddr;
2005 rth->rt_iif =
2006 rth->fl.iif = in_dev->dev->ifindex;
2007 rth->u.dst.dev = (out_dev)->dev;
2008 dev_hold(rth->u.dst.dev);
2009 rth->idev = in_dev_get(rth->u.dst.dev);
2010 rth->fl.oif = 0;
2011 rth->rt_spec_dst= spec_dst;
2013 rth->u.dst.input = ip_forward;
2014 rth->u.dst.output = ip_output;
2015 rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
2017 rt_set_nexthop(rth, res, itag);
2019 rth->rt_flags = flags;
2021 *result = rth;
2022 err = 0;
2023 cleanup:
2024 /* release the working reference to the output device */
2025 in_dev_put(out_dev);
2026 return err;
2029 static int ip_mkroute_input(struct sk_buff *skb,
2030 struct fib_result *res,
2031 const struct flowi *fl,
2032 struct in_device *in_dev,
2033 __be32 daddr, __be32 saddr, u32 tos)
2035 struct rtable* rth = NULL;
2036 int err;
2037 unsigned hash;
2039 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2040 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2041 fib_select_multipath(fl, res);
2042 #endif
2044 /* create a routing cache entry */
2045 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2046 if (err)
2047 return err;
2049 /* put it into the cache */
2050 hash = rt_hash(daddr, saddr, fl->iif,
2051 rt_genid(dev_net(rth->u.dst.dev)));
2052 return rt_intern_hash(hash, rth, &skb->rtable);
2056 * NOTE. We drop all packets that have local source
2057 * addresses, because every properly looped-back packet
2058 * must already have the correct destination attached by the output routine.
2060 * Such an approach solves two big problems:
2061 * 1. Non-simplex devices are handled properly.
2062 * 2. IP spoofing attempts are filtered with a 100% guarantee.
2065 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2066 u8 tos, struct net_device *dev)
2068 struct fib_result res;
2069 struct in_device *in_dev = in_dev_get(dev);
2070 struct flowi fl = { .nl_u = { .ip4_u =
2071 { .daddr = daddr,
2072 .saddr = saddr,
2073 .tos = tos,
2074 .scope = RT_SCOPE_UNIVERSE,
2075 } },
2076 .mark = skb->mark,
2077 .iif = dev->ifindex };
2078 unsigned flags = 0;
2079 u32 itag = 0;
2080 struct rtable * rth;
2081 unsigned hash;
2082 __be32 spec_dst;
2083 int err = -EINVAL;
2084 int free_res = 0;
2085 struct net * net = dev_net(dev);
2087 /* IP on this device is disabled. */
2089 if (!in_dev)
2090 goto out;
2092 /* Check for the most weird martians, which may not be detected
2093 by fib_lookup.
2096 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2097 ipv4_is_loopback(saddr))
2098 goto martian_source;
2100 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
2101 goto brd_input;
2103 /* Accept zero addresses only for the limited broadcast;
2104 * I do not even know whether to fix this or not. Waiting for complaints :-)
2106 if (ipv4_is_zeronet(saddr))
2107 goto martian_source;
2109 if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
2110 ipv4_is_loopback(daddr))
2111 goto martian_destination;
2114 * Now we are ready to route packet.
2116 if ((err = fib_lookup(net, &fl, &res)) != 0) {
2117 if (!IN_DEV_FORWARD(in_dev))
2118 goto e_hostunreach;
2119 goto no_route;
2121 free_res = 1;
2123 RT_CACHE_STAT_INC(in_slow_tot);
2125 if (res.type == RTN_BROADCAST)
2126 goto brd_input;
2128 if (res.type == RTN_LOCAL) {
2129 int result;
2130 result = fib_validate_source(saddr, daddr, tos,
2131 net->loopback_dev->ifindex,
2132 dev, &spec_dst, &itag);
2133 if (result < 0)
2134 goto martian_source;
2135 if (result)
2136 flags |= RTCF_DIRECTSRC;
2137 spec_dst = daddr;
2138 goto local_input;
2141 if (!IN_DEV_FORWARD(in_dev))
2142 goto e_hostunreach;
2143 if (res.type != RTN_UNICAST)
2144 goto martian_destination;
2146 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2147 done:
2148 in_dev_put(in_dev);
2149 if (free_res)
2150 fib_res_put(&res);
2151 out: return err;
2153 brd_input:
2154 if (skb->protocol != htons(ETH_P_IP))
2155 goto e_inval;
2157 if (ipv4_is_zeronet(saddr))
2158 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2159 else {
2160 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2161 &itag);
2162 if (err < 0)
2163 goto martian_source;
2164 if (err)
2165 flags |= RTCF_DIRECTSRC;
2167 flags |= RTCF_BROADCAST;
2168 res.type = RTN_BROADCAST;
2169 RT_CACHE_STAT_INC(in_brd);
2171 local_input:
2172 rth = dst_alloc(&ipv4_dst_ops);
2173 if (!rth)
2174 goto e_nobufs;
2176 rth->u.dst.output= ip_rt_bug;
2177 rth->rt_genid = rt_genid(net);
2179 atomic_set(&rth->u.dst.__refcnt, 1);
2180 rth->u.dst.flags= DST_HOST;
2181 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2182 rth->u.dst.flags |= DST_NOPOLICY;
2183 rth->fl.fl4_dst = daddr;
2184 rth->rt_dst = daddr;
2185 rth->fl.fl4_tos = tos;
2186 rth->fl.mark = skb->mark;
2187 rth->fl.fl4_src = saddr;
2188 rth->rt_src = saddr;
2189 #ifdef CONFIG_NET_CLS_ROUTE
2190 rth->u.dst.tclassid = itag;
2191 #endif
2192 rth->rt_iif =
2193 rth->fl.iif = dev->ifindex;
2194 rth->u.dst.dev = net->loopback_dev;
2195 dev_hold(rth->u.dst.dev);
2196 rth->idev = in_dev_get(rth->u.dst.dev);
2197 rth->rt_gateway = daddr;
2198 rth->rt_spec_dst= spec_dst;
2199 rth->u.dst.input= ip_local_deliver;
2200 rth->rt_flags = flags|RTCF_LOCAL;
2201 if (res.type == RTN_UNREACHABLE) {
2202 rth->u.dst.input= ip_error;
2203 rth->u.dst.error= -err;
2204 rth->rt_flags &= ~RTCF_LOCAL;
2206 rth->rt_type = res.type;
2207 hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2208 err = rt_intern_hash(hash, rth, &skb->rtable);
2209 goto done;
2211 no_route:
2212 RT_CACHE_STAT_INC(in_no_route);
2213 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2214 res.type = RTN_UNREACHABLE;
2215 if (err == -ESRCH)
2216 err = -ENETUNREACH;
2217 goto local_input;
2220 * Do not cache martian addresses: they should be logged (RFC1812)
2222 martian_destination:
2223 RT_CACHE_STAT_INC(in_martian_dst);
2224 #ifdef CONFIG_IP_ROUTE_VERBOSE
2225 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2226 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2227 &daddr, &saddr, dev->name);
2228 #endif
2230 e_hostunreach:
2231 err = -EHOSTUNREACH;
2232 goto done;
2234 e_inval:
2235 err = -EINVAL;
2236 goto done;
2238 e_nobufs:
2239 err = -ENOBUFS;
2240 goto done;
2242 martian_source:
2243 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2244 goto e_inval;
2247 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2248 u8 tos, struct net_device *dev)
2250 struct rtable * rth;
2251 unsigned hash;
2252 int iif = dev->ifindex;
2253 struct net *net;
2255 net = dev_net(dev);
2257 if (!rt_caching(net))
2258 goto skip_cache;
2260 tos &= IPTOS_RT_MASK;
2261 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2263 rcu_read_lock();
2264 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2265 rth = rcu_dereference(rth->u.dst.rt_next)) {
2266 if (((rth->fl.fl4_dst ^ daddr) |
2267 (rth->fl.fl4_src ^ saddr) |
2268 (rth->fl.iif ^ iif) |
2269 rth->fl.oif |
2270 (rth->fl.fl4_tos ^ tos)) == 0 &&
2271 rth->fl.mark == skb->mark &&
2272 net_eq(dev_net(rth->u.dst.dev), net) &&
2273 !rt_is_expired(rth)) {
2274 dst_use(&rth->u.dst, jiffies);
2275 RT_CACHE_STAT_INC(in_hit);
2276 rcu_read_unlock();
2277 skb->rtable = rth;
2278 return 0;
2280 RT_CACHE_STAT_INC(in_hlist_search);
2282 rcu_read_unlock();
2284 skip_cache:
2285 /* Multicast recognition logic was moved from the route cache to here.
2286 The problem was that too many Ethernet cards have broken/missing
2287 hardware multicast filters :-( As a result, a host on a multicast
2288 network acquires a lot of useless route cache entries, e.g. from
2289 SDR messages from all over the world. Now we try to get rid of them.
2290 Really, provided the software IP multicast filter is organized
2291 reasonably (at least, hashed), it does not result in a slowdown
2292 compared with route cache reject entries.
2293 Note that multicast routers are not affected, because a
2294 route cache entry is created eventually.
2296 if (ipv4_is_multicast(daddr)) {
2297 struct in_device *in_dev;
2299 rcu_read_lock();
2300 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2301 int our = ip_check_mc(in_dev, daddr, saddr,
2302 ip_hdr(skb)->protocol);
2303 if (our
2304 #ifdef CONFIG_IP_MROUTE
2305 || (!ipv4_is_local_multicast(daddr) &&
2306 IN_DEV_MFORWARD(in_dev))
2307 #endif
2309 rcu_read_unlock();
2310 return ip_route_input_mc(skb, daddr, saddr,
2311 tos, dev, our);
2314 rcu_read_unlock();
2315 return -EINVAL;
2317 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
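/*
 * Editor's illustrative sketch (not part of the original source): how a
 * receive-path caller is expected to use ip_route_input().  The helper name
 * example_route_incoming() is hypothetical; the real caller of this form is
 * ip_rcv_finish() in net/ipv4/ip_input.c.
 *
 *	static int example_route_incoming(struct sk_buff *skb)
 *	{
 *		const struct iphdr *iph = ip_hdr(skb);
 *		int err;
 *
 *		// Attach a route to the skb; on success skb->rtable holds a
 *		// referenced cache entry, either found in the hash table or
 *		// built by ip_route_input_slow()/ip_route_input_mc().
 *		err = ip_route_input(skb, iph->daddr, iph->saddr,
 *				     iph->tos, skb->dev);
 *		if (err)
 *			return err;
 *
 *		// Dispatch via the route's input hook: ip_local_deliver,
 *		// ip_forward, ip_mr_input or ip_error.
 *		return dst_input(skb);
 *	}
 */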
2320 static int __mkroute_output(struct rtable **result,
2321 struct fib_result *res,
2322 const struct flowi *fl,
2323 const struct flowi *oldflp,
2324 struct net_device *dev_out,
2325 unsigned flags)
2327 struct rtable *rth;
2328 struct in_device *in_dev;
2329 u32 tos = RT_FL_TOS(oldflp);
2330 int err = 0;
2332 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2333 return -EINVAL;
2335 if (fl->fl4_dst == htonl(0xFFFFFFFF))
2336 res->type = RTN_BROADCAST;
2337 else if (ipv4_is_multicast(fl->fl4_dst))
2338 res->type = RTN_MULTICAST;
2339 else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2340 return -EINVAL;
2342 if (dev_out->flags & IFF_LOOPBACK)
2343 flags |= RTCF_LOCAL;
2345 /* get a work reference to the inet device */
2346 in_dev = in_dev_get(dev_out);
2347 if (!in_dev)
2348 return -EINVAL;
2350 if (res->type == RTN_BROADCAST) {
2351 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2352 if (res->fi) {
2353 fib_info_put(res->fi);
2354 res->fi = NULL;
2356 } else if (res->type == RTN_MULTICAST) {
2357 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2358 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2359 oldflp->proto))
2360 flags &= ~RTCF_LOCAL;
2361 /* If a multicast route does not exist, use the
2362 default one, but do not gateway in this case.
2363 Yes, it is a hack.
2365 if (res->fi && res->prefixlen < 4) {
2366 fib_info_put(res->fi);
2367 res->fi = NULL;
2372 rth = dst_alloc(&ipv4_dst_ops);
2373 if (!rth) {
2374 err = -ENOBUFS;
2375 goto cleanup;
2378 atomic_set(&rth->u.dst.__refcnt, 1);
2379 rth->u.dst.flags= DST_HOST;
2380 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2381 rth->u.dst.flags |= DST_NOXFRM;
2382 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2383 rth->u.dst.flags |= DST_NOPOLICY;
2385 rth->fl.fl4_dst = oldflp->fl4_dst;
2386 rth->fl.fl4_tos = tos;
2387 rth->fl.fl4_src = oldflp->fl4_src;
2388 rth->fl.oif = oldflp->oif;
2389 rth->fl.mark = oldflp->mark;
2390 rth->rt_dst = fl->fl4_dst;
2391 rth->rt_src = fl->fl4_src;
2392 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2393 /* get references to the devices that are to be held by the routing
2394 cache entry */
2395 rth->u.dst.dev = dev_out;
2396 dev_hold(dev_out);
2397 rth->idev = in_dev_get(dev_out);
2398 rth->rt_gateway = fl->fl4_dst;
2399 rth->rt_spec_dst= fl->fl4_src;
2401 rth->u.dst.output=ip_output;
2402 rth->rt_genid = rt_genid(dev_net(dev_out));
2404 RT_CACHE_STAT_INC(out_slow_tot);
2406 if (flags & RTCF_LOCAL) {
2407 rth->u.dst.input = ip_local_deliver;
2408 rth->rt_spec_dst = fl->fl4_dst;
2410 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2411 rth->rt_spec_dst = fl->fl4_src;
2412 if (flags & RTCF_LOCAL &&
2413 !(dev_out->flags & IFF_LOOPBACK)) {
2414 rth->u.dst.output = ip_mc_output;
2415 RT_CACHE_STAT_INC(out_slow_mc);
2417 #ifdef CONFIG_IP_MROUTE
2418 if (res->type == RTN_MULTICAST) {
2419 if (IN_DEV_MFORWARD(in_dev) &&
2420 !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2421 rth->u.dst.input = ip_mr_input;
2422 rth->u.dst.output = ip_mc_output;
2425 #endif
2428 rt_set_nexthop(rth, res, 0);
2430 rth->rt_flags = flags;
2432 *result = rth;
2433 cleanup:
2434 /* release the work reference to the inet device */
2435 in_dev_put(in_dev);
2437 return err;
2440 static int ip_mkroute_output(struct rtable **rp,
2441 struct fib_result *res,
2442 const struct flowi *fl,
2443 const struct flowi *oldflp,
2444 struct net_device *dev_out,
2445 unsigned flags)
2447 struct rtable *rth = NULL;
2448 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2449 unsigned hash;
2450 if (err == 0) {
2451 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2452 rt_genid(dev_net(dev_out)));
2453 err = rt_intern_hash(hash, rth, rp);
2456 return err;
2460 * Major route resolver routine.
2463 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2464 const struct flowi *oldflp)
2466 u32 tos = RT_FL_TOS(oldflp);
2467 struct flowi fl = { .nl_u = { .ip4_u =
2468 { .daddr = oldflp->fl4_dst,
2469 .saddr = oldflp->fl4_src,
2470 .tos = tos & IPTOS_RT_MASK,
2471 .scope = ((tos & RTO_ONLINK) ?
2472 RT_SCOPE_LINK :
2473 RT_SCOPE_UNIVERSE),
2474 } },
2475 .mark = oldflp->mark,
2476 .iif = net->loopback_dev->ifindex,
2477 .oif = oldflp->oif };
2478 struct fib_result res;
2479 unsigned flags = 0;
2480 struct net_device *dev_out = NULL;
2481 int free_res = 0;
2482 int err;
2485 res.fi = NULL;
2486 #ifdef CONFIG_IP_MULTIPLE_TABLES
2487 res.r = NULL;
2488 #endif
2490 if (oldflp->fl4_src) {
2491 err = -EINVAL;
2492 if (ipv4_is_multicast(oldflp->fl4_src) ||
2493 ipv4_is_lbcast(oldflp->fl4_src) ||
2494 ipv4_is_zeronet(oldflp->fl4_src))
2495 goto out;
2497 /* I removed the check for oif == dev_out->oif here.
2498 It was wrong for two reasons:
2499 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2500 is assigned to multiple interfaces.
2501 2. Moreover, we are allowed to send packets with a saddr
2502 of another iface. --ANK
2505 if (oldflp->oif == 0
2506 && (ipv4_is_multicast(oldflp->fl4_dst) ||
2507 oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2508 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2509 dev_out = ip_dev_find(net, oldflp->fl4_src);
2510 if (dev_out == NULL)
2511 goto out;
2513 /* Special hack: the user can direct multicasts
2514 and limited broadcast via the necessary interface
2515 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2516 This hack is not just for fun, it allows
2517 vic, vat and friends to work.
2518 They bind the socket to loopback, set the ttl to zero
2519 and expect that it will work.
2520 From the viewpoint of the routing cache they are broken,
2521 because we are not allowed to build a multicast path
2522 with a loopback source addr (the routing cache
2523 cannot know that the ttl is zero, so the packet
2524 will not leave this host and the route is valid).
2525 Luckily, this hack is a good workaround.
2528 fl.oif = dev_out->ifindex;
2529 goto make_route;
2532 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2533 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2534 dev_out = ip_dev_find(net, oldflp->fl4_src);
2535 if (dev_out == NULL)
2536 goto out;
2537 dev_put(dev_out);
2538 dev_out = NULL;
2543 if (oldflp->oif) {
2544 dev_out = dev_get_by_index(net, oldflp->oif);
2545 err = -ENODEV;
2546 if (dev_out == NULL)
2547 goto out;
2549 /* RACE: Check return value of inet_select_addr instead. */
2550 if (__in_dev_get_rtnl(dev_out) == NULL) {
2551 dev_put(dev_out);
2552 goto out; /* Wrong error code */
2555 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2556 oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2557 if (!fl.fl4_src)
2558 fl.fl4_src = inet_select_addr(dev_out, 0,
2559 RT_SCOPE_LINK);
2560 goto make_route;
2562 if (!fl.fl4_src) {
2563 if (ipv4_is_multicast(oldflp->fl4_dst))
2564 fl.fl4_src = inet_select_addr(dev_out, 0,
2565 fl.fl4_scope);
2566 else if (!oldflp->fl4_dst)
2567 fl.fl4_src = inet_select_addr(dev_out, 0,
2568 RT_SCOPE_HOST);
2572 if (!fl.fl4_dst) {
2573 fl.fl4_dst = fl.fl4_src;
2574 if (!fl.fl4_dst)
2575 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2576 if (dev_out)
2577 dev_put(dev_out);
2578 dev_out = net->loopback_dev;
2579 dev_hold(dev_out);
2580 fl.oif = net->loopback_dev->ifindex;
2581 res.type = RTN_LOCAL;
2582 flags |= RTCF_LOCAL;
2583 goto make_route;
2586 if (fib_lookup(net, &fl, &res)) {
2587 res.fi = NULL;
2588 if (oldflp->oif) {
2589 /* Apparently, the routing tables are wrong. Assume
2590 that the destination is on-link.
2592 WHY? DW.
2593 Because we are allowed to send to an iface
2594 even if it has NO routes and NO assigned
2595 addresses. When oif is specified, the routing
2596 tables are looked up with only one purpose:
2597 to catch whether the destination is gatewayed, rather than
2598 direct. Moreover, if MSG_DONTROUTE is set,
2599 we send the packet, ignoring both the routing tables
2600 and the ifaddr state. --ANK
2603 We could do this even if oif is unknown
2604 (as IPv6 likely does), but we do not.
2607 if (fl.fl4_src == 0)
2608 fl.fl4_src = inet_select_addr(dev_out, 0,
2609 RT_SCOPE_LINK);
2610 res.type = RTN_UNICAST;
2611 goto make_route;
2613 if (dev_out)
2614 dev_put(dev_out);
2615 err = -ENETUNREACH;
2616 goto out;
2618 free_res = 1;
2620 if (res.type == RTN_LOCAL) {
2621 if (!fl.fl4_src)
2622 fl.fl4_src = fl.fl4_dst;
2623 if (dev_out)
2624 dev_put(dev_out);
2625 dev_out = net->loopback_dev;
2626 dev_hold(dev_out);
2627 fl.oif = dev_out->ifindex;
2628 if (res.fi)
2629 fib_info_put(res.fi);
2630 res.fi = NULL;
2631 flags |= RTCF_LOCAL;
2632 goto make_route;
2635 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2636 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2637 fib_select_multipath(&fl, &res);
2638 else
2639 #endif
2640 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2641 fib_select_default(net, &fl, &res);
2643 if (!fl.fl4_src)
2644 fl.fl4_src = FIB_RES_PREFSRC(res);
2646 if (dev_out)
2647 dev_put(dev_out);
2648 dev_out = FIB_RES_DEV(res);
2649 dev_hold(dev_out);
2650 fl.oif = dev_out->ifindex;
2653 make_route:
2654 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2657 if (free_res)
2658 fib_res_put(&res);
2659 if (dev_out)
2660 dev_put(dev_out);
2661 out: return err;
2664 int __ip_route_output_key(struct net *net, struct rtable **rp,
2665 const struct flowi *flp)
2667 unsigned hash;
2668 struct rtable *rth;
2670 if (!rt_caching(net))
2671 goto slow_output;
2673 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2675 rcu_read_lock_bh();
2676 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2677 rth = rcu_dereference(rth->u.dst.rt_next)) {
2678 if (rth->fl.fl4_dst == flp->fl4_dst &&
2679 rth->fl.fl4_src == flp->fl4_src &&
2680 rth->fl.iif == 0 &&
2681 rth->fl.oif == flp->oif &&
2682 rth->fl.mark == flp->mark &&
2683 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2684 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2685 net_eq(dev_net(rth->u.dst.dev), net) &&
2686 !rt_is_expired(rth)) {
2687 dst_use(&rth->u.dst, jiffies);
2688 RT_CACHE_STAT_INC(out_hit);
2689 rcu_read_unlock_bh();
2690 *rp = rth;
2691 return 0;
2693 RT_CACHE_STAT_INC(out_hlist_search);
2695 rcu_read_unlock_bh();
2697 slow_output:
2698 return ip_route_output_slow(net, rp, flp);
2701 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2703 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2707 static struct dst_ops ipv4_dst_blackhole_ops = {
2708 .family = AF_INET,
2709 .protocol = cpu_to_be16(ETH_P_IP),
2710 .destroy = ipv4_dst_destroy,
2711 .check = ipv4_dst_check,
2712 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2713 .entries = ATOMIC_INIT(0),
2717 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2719 struct rtable *ort = *rp;
2720 struct rtable *rt = (struct rtable *)
2721 dst_alloc(&ipv4_dst_blackhole_ops);
2723 if (rt) {
2724 struct dst_entry *new = &rt->u.dst;
2726 atomic_set(&new->__refcnt, 1);
2727 new->__use = 1;
2728 new->input = dst_discard;
2729 new->output = dst_discard;
2730 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2732 new->dev = ort->u.dst.dev;
2733 if (new->dev)
2734 dev_hold(new->dev);
2736 rt->fl = ort->fl;
2738 rt->idev = ort->idev;
2739 if (rt->idev)
2740 in_dev_hold(rt->idev);
2741 rt->rt_genid = rt_genid(net);
2742 rt->rt_flags = ort->rt_flags;
2743 rt->rt_type = ort->rt_type;
2744 rt->rt_dst = ort->rt_dst;
2745 rt->rt_src = ort->rt_src;
2746 rt->rt_iif = ort->rt_iif;
2747 rt->rt_gateway = ort->rt_gateway;
2748 rt->rt_spec_dst = ort->rt_spec_dst;
2749 rt->peer = ort->peer;
2750 if (rt->peer)
2751 atomic_inc(&rt->peer->refcnt);
2753 dst_free(new);
2756 dst_release(&(*rp)->u.dst);
2757 *rp = rt;
2758 return (rt ? 0 : -ENOMEM);
2761 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2762 struct sock *sk, int flags)
2764 int err;
2766 if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2767 return err;
2769 if (flp->proto) {
2770 if (!flp->fl4_src)
2771 flp->fl4_src = (*rp)->rt_src;
2772 if (!flp->fl4_dst)
2773 flp->fl4_dst = (*rp)->rt_dst;
2774 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2775 flags ? XFRM_LOOKUP_WAIT : 0);
2776 if (err == -EREMOTE)
2777 err = ipv4_dst_blackhole(net, rp, flp);
2779 return err;
2782 return 0;
2785 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2787 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2789 return ip_route_output_flow(net, rp, flp, NULL, 0);
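/*
 * Editor's illustrative sketch (not part of the original source): a typical
 * output lookup through ip_route_output_key().  example_route_to() is a
 * hypothetical helper; real callers (e.g. ICMP, ARP) follow the same shape.
 *
 *	static struct rtable *example_route_to(struct net *net,
 *					       __be32 daddr, __be32 saddr, int oif)
 *	{
 *		struct rtable *rt;
 *		struct flowi fl = { .nl_u = { .ip4_u = { .daddr = daddr,
 *							 .saddr = saddr } },
 *				    .oif = oif };
 *
 *		if (ip_route_output_key(net, &rt, &fl))
 *			return NULL;
 *		// ... transmit via rt->u.dst ...
 *		// The caller drops the reference with ip_rt_put(rt) when done.
 *		return rt;
 *	}
 */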
2792 static int rt_fill_info(struct net *net,
2793 struct sk_buff *skb, u32 pid, u32 seq, int event,
2794 int nowait, unsigned int flags)
2796 struct rtable *rt = skb->rtable;
2797 struct rtmsg *r;
2798 struct nlmsghdr *nlh;
2799 long expires;
2800 u32 id = 0, ts = 0, tsage = 0, error;
2802 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2803 if (nlh == NULL)
2804 return -EMSGSIZE;
2806 r = nlmsg_data(nlh);
2807 r->rtm_family = AF_INET;
2808 r->rtm_dst_len = 32;
2809 r->rtm_src_len = 0;
2810 r->rtm_tos = rt->fl.fl4_tos;
2811 r->rtm_table = RT_TABLE_MAIN;
2812 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2813 r->rtm_type = rt->rt_type;
2814 r->rtm_scope = RT_SCOPE_UNIVERSE;
2815 r->rtm_protocol = RTPROT_UNSPEC;
2816 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2817 if (rt->rt_flags & RTCF_NOTIFY)
2818 r->rtm_flags |= RTM_F_NOTIFY;
2820 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2822 if (rt->fl.fl4_src) {
2823 r->rtm_src_len = 32;
2824 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2826 if (rt->u.dst.dev)
2827 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2828 #ifdef CONFIG_NET_CLS_ROUTE
2829 if (rt->u.dst.tclassid)
2830 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2831 #endif
2832 if (rt->fl.iif)
2833 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2834 else if (rt->rt_src != rt->fl.fl4_src)
2835 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2837 if (rt->rt_dst != rt->rt_gateway)
2838 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2840 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2841 goto nla_put_failure;
2843 error = rt->u.dst.error;
2844 expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2845 if (rt->peer) {
2846 id = rt->peer->ip_id_count;
2847 if (rt->peer->tcp_ts_stamp) {
2848 ts = rt->peer->tcp_ts;
2849 tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2853 if (rt->fl.iif) {
2854 #ifdef CONFIG_IP_MROUTE
2855 __be32 dst = rt->rt_dst;
2857 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2858 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2859 int err = ipmr_get_route(net, skb, r, nowait);
2860 if (err <= 0) {
2861 if (!nowait) {
2862 if (err == 0)
2863 return 0;
2864 goto nla_put_failure;
2865 } else {
2866 if (err == -EMSGSIZE)
2867 goto nla_put_failure;
2868 error = err;
2871 } else
2872 #endif
2873 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2876 if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2877 expires, error) < 0)
2878 goto nla_put_failure;
2880 return nlmsg_end(skb, nlh);
2882 nla_put_failure:
2883 nlmsg_cancel(skb, nlh);
2884 return -EMSGSIZE;
2887 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2889 struct net *net = sock_net(in_skb->sk);
2890 struct rtmsg *rtm;
2891 struct nlattr *tb[RTA_MAX+1];
2892 struct rtable *rt = NULL;
2893 __be32 dst = 0;
2894 __be32 src = 0;
2895 u32 iif;
2896 int err;
2897 struct sk_buff *skb;
2899 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2900 if (err < 0)
2901 goto errout;
2903 rtm = nlmsg_data(nlh);
2905 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2906 if (skb == NULL) {
2907 err = -ENOBUFS;
2908 goto errout;
2911 /* Reserve room for dummy headers; this skb can pass
2912 through a good chunk of the routing engine.
2914 skb_reset_mac_header(skb);
2915 skb_reset_network_header(skb);
2917 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2918 ip_hdr(skb)->protocol = IPPROTO_ICMP;
2919 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2921 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2922 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2923 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2925 if (iif) {
2926 struct net_device *dev;
2928 dev = __dev_get_by_index(net, iif);
2929 if (dev == NULL) {
2930 err = -ENODEV;
2931 goto errout_free;
2934 skb->protocol = htons(ETH_P_IP);
2935 skb->dev = dev;
2936 local_bh_disable();
2937 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2938 local_bh_enable();
2940 rt = skb->rtable;
2941 if (err == 0 && rt->u.dst.error)
2942 err = -rt->u.dst.error;
2943 } else {
2944 struct flowi fl = {
2945 .nl_u = {
2946 .ip4_u = {
2947 .daddr = dst,
2948 .saddr = src,
2949 .tos = rtm->rtm_tos,
2952 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2954 err = ip_route_output_key(net, &rt, &fl);
2957 if (err)
2958 goto errout_free;
2960 skb->rtable = rt;
2961 if (rtm->rtm_flags & RTM_F_NOTIFY)
2962 rt->rt_flags |= RTCF_NOTIFY;
2964 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2965 RTM_NEWROUTE, 0, 0);
2966 if (err <= 0)
2967 goto errout_free;
2969 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2970 errout:
2971 return err;
2973 errout_free:
2974 kfree_skb(skb);
2975 goto errout;
2978 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2980 struct rtable *rt;
2981 int h, s_h;
2982 int idx, s_idx;
2983 struct net *net;
2985 net = sock_net(skb->sk);
2987 s_h = cb->args[0];
2988 if (s_h < 0)
2989 s_h = 0;
2990 s_idx = idx = cb->args[1];
2991 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
2992 if (!rt_hash_table[h].chain)
2993 continue;
2994 rcu_read_lock_bh();
2995 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2996 rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2997 if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
2998 continue;
2999 if (rt_is_expired(rt))
3000 continue;
3001 skb->dst = dst_clone(&rt->u.dst);
3002 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3003 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3004 1, NLM_F_MULTI) <= 0) {
3005 dst_release(xchg(&skb->dst, NULL));
3006 rcu_read_unlock_bh();
3007 goto done;
3009 dst_release(xchg(&skb->dst, NULL));
3011 rcu_read_unlock_bh();
3014 done:
3015 cb->args[0] = h;
3016 cb->args[1] = idx;
3017 return skb->len;
3020 void ip_rt_multicast_event(struct in_device *in_dev)
3022 rt_cache_flush(dev_net(in_dev->dev), 0);
3025 #ifdef CONFIG_SYSCTL
3026 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3027 struct file *filp, void __user *buffer,
3028 size_t *lenp, loff_t *ppos)
3030 if (write) {
3031 int flush_delay;
3032 ctl_table ctl;
3033 struct net *net;
3035 memcpy(&ctl, __ctl, sizeof(ctl));
3036 ctl.data = &flush_delay;
3037 proc_dointvec(&ctl, write, filp, buffer, lenp, ppos);
3039 net = (struct net *)__ctl->extra1;
3040 rt_cache_flush(net, flush_delay);
3041 return 0;
3044 return -EINVAL;
3047 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
3048 void __user *oldval,
3049 size_t __user *oldlenp,
3050 void __user *newval,
3051 size_t newlen)
3053 int delay;
3054 struct net *net;
3055 if (newlen != sizeof(int))
3056 return -EINVAL;
3057 if (get_user(delay, (int __user *)newval))
3058 return -EFAULT;
3059 net = (struct net *)table->extra1;
3060 rt_cache_flush(net, delay);
3061 return 0;
3064 static void rt_secret_reschedule(int old)
3066 struct net *net;
3067 int new = ip_rt_secret_interval;
3068 int diff = new - old;
3070 if (!diff)
3071 return;
3073 rtnl_lock();
3074 for_each_net(net) {
3075 int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);
3077 if (!new)
3078 continue;
3080 if (deleted) {
3081 long time = net->ipv4.rt_secret_timer.expires - jiffies;
3083 if (time <= 0 || (time += diff) <= 0)
3084 time = 0;
3086 net->ipv4.rt_secret_timer.expires = time;
3087 } else
3088 net->ipv4.rt_secret_timer.expires = new;
3090 net->ipv4.rt_secret_timer.expires += jiffies;
3091 add_timer(&net->ipv4.rt_secret_timer);
3093 rtnl_unlock();
3096 static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
3097 struct file *filp,
3098 void __user *buffer, size_t *lenp,
3099 loff_t *ppos)
3101 int old = ip_rt_secret_interval;
3102 int ret = proc_dointvec_jiffies(ctl, write, filp, buffer, lenp, ppos);
3104 rt_secret_reschedule(old);
3106 return ret;
3109 static int ipv4_sysctl_rt_secret_interval_strategy(ctl_table *table,
3110 void __user *oldval,
3111 size_t __user *oldlenp,
3112 void __user *newval,
3113 size_t newlen)
3115 int old = ip_rt_secret_interval;
3116 int ret = sysctl_jiffies(table, oldval, oldlenp, newval, newlen);
3118 rt_secret_reschedule(old);
3120 return ret;
3123 static ctl_table ipv4_route_table[] = {
3125 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
3126 .procname = "gc_thresh",
3127 .data = &ipv4_dst_ops.gc_thresh,
3128 .maxlen = sizeof(int),
3129 .mode = 0644,
3130 .proc_handler = proc_dointvec,
3133 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
3134 .procname = "max_size",
3135 .data = &ip_rt_max_size,
3136 .maxlen = sizeof(int),
3137 .mode = 0644,
3138 .proc_handler = proc_dointvec,
3141 /* Deprecated. Use gc_min_interval_ms */
3143 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
3144 .procname = "gc_min_interval",
3145 .data = &ip_rt_gc_min_interval,
3146 .maxlen = sizeof(int),
3147 .mode = 0644,
3148 .proc_handler = proc_dointvec_jiffies,
3149 .strategy = sysctl_jiffies,
3152 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
3153 .procname = "gc_min_interval_ms",
3154 .data = &ip_rt_gc_min_interval,
3155 .maxlen = sizeof(int),
3156 .mode = 0644,
3157 .proc_handler = proc_dointvec_ms_jiffies,
3158 .strategy = sysctl_ms_jiffies,
3161 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
3162 .procname = "gc_timeout",
3163 .data = &ip_rt_gc_timeout,
3164 .maxlen = sizeof(int),
3165 .mode = 0644,
3166 .proc_handler = proc_dointvec_jiffies,
3167 .strategy = sysctl_jiffies,
3170 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
3171 .procname = "gc_interval",
3172 .data = &ip_rt_gc_interval,
3173 .maxlen = sizeof(int),
3174 .mode = 0644,
3175 .proc_handler = proc_dointvec_jiffies,
3176 .strategy = sysctl_jiffies,
3179 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
3180 .procname = "redirect_load",
3181 .data = &ip_rt_redirect_load,
3182 .maxlen = sizeof(int),
3183 .mode = 0644,
3184 .proc_handler = proc_dointvec,
3187 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
3188 .procname = "redirect_number",
3189 .data = &ip_rt_redirect_number,
3190 .maxlen = sizeof(int),
3191 .mode = 0644,
3192 .proc_handler = proc_dointvec,
3195 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
3196 .procname = "redirect_silence",
3197 .data = &ip_rt_redirect_silence,
3198 .maxlen = sizeof(int),
3199 .mode = 0644,
3200 .proc_handler = proc_dointvec,
3203 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
3204 .procname = "error_cost",
3205 .data = &ip_rt_error_cost,
3206 .maxlen = sizeof(int),
3207 .mode = 0644,
3208 .proc_handler = proc_dointvec,
3211 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
3212 .procname = "error_burst",
3213 .data = &ip_rt_error_burst,
3214 .maxlen = sizeof(int),
3215 .mode = 0644,
3216 .proc_handler = proc_dointvec,
3219 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
3220 .procname = "gc_elasticity",
3221 .data = &ip_rt_gc_elasticity,
3222 .maxlen = sizeof(int),
3223 .mode = 0644,
3224 .proc_handler = proc_dointvec,
3227 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
3228 .procname = "mtu_expires",
3229 .data = &ip_rt_mtu_expires,
3230 .maxlen = sizeof(int),
3231 .mode = 0644,
3232 .proc_handler = proc_dointvec_jiffies,
3233 .strategy = sysctl_jiffies,
3236 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
3237 .procname = "min_pmtu",
3238 .data = &ip_rt_min_pmtu,
3239 .maxlen = sizeof(int),
3240 .mode = 0644,
3241 .proc_handler = proc_dointvec,
3244 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
3245 .procname = "min_adv_mss",
3246 .data = &ip_rt_min_advmss,
3247 .maxlen = sizeof(int),
3248 .mode = 0644,
3249 .proc_handler = proc_dointvec,
3252 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
3253 .procname = "secret_interval",
3254 .data = &ip_rt_secret_interval,
3255 .maxlen = sizeof(int),
3256 .mode = 0644,
3257 .proc_handler = ipv4_sysctl_rt_secret_interval,
3258 .strategy = ipv4_sysctl_rt_secret_interval_strategy,
3260 { .ctl_name = 0 }
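/*
 * Editor's note (not part of the original source): the table above is
 * registered under the "net/ipv4/route" sysctl path, so e.g. gc_thresh is
 * tunable as /proc/sys/net/ipv4/route/gc_thresh (net.ipv4.route.gc_thresh).
 */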
3263 static struct ctl_table empty[1];
3265 static struct ctl_table ipv4_skeleton[] =
3267 { .procname = "route", .ctl_name = NET_IPV4_ROUTE,
3268 .mode = 0555, .child = ipv4_route_table},
3269 { .procname = "neigh", .ctl_name = NET_IPV4_NEIGH,
3270 .mode = 0555, .child = empty},
3274 static __net_initdata struct ctl_path ipv4_path[] = {
3275 { .procname = "net", .ctl_name = CTL_NET, },
3276 { .procname = "ipv4", .ctl_name = NET_IPV4, },
3277 { },
3280 static struct ctl_table ipv4_route_flush_table[] = {
3282 .ctl_name = NET_IPV4_ROUTE_FLUSH,
3283 .procname = "flush",
3284 .maxlen = sizeof(int),
3285 .mode = 0200,
3286 .proc_handler = ipv4_sysctl_rtcache_flush,
3287 .strategy = ipv4_sysctl_rtcache_flush_strategy,
3289 { .ctl_name = 0 },
3292 static __net_initdata struct ctl_path ipv4_route_path[] = {
3293 { .procname = "net", .ctl_name = CTL_NET, },
3294 { .procname = "ipv4", .ctl_name = NET_IPV4, },
3295 { .procname = "route", .ctl_name = NET_IPV4_ROUTE, },
3296 { },
3299 static __net_init int sysctl_route_net_init(struct net *net)
3301 struct ctl_table *tbl;
3303 tbl = ipv4_route_flush_table;
3304 if (net != &init_net) {
3305 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3306 if (tbl == NULL)
3307 goto err_dup;
3309 tbl[0].extra1 = net;
3311 net->ipv4.route_hdr =
3312 register_net_sysctl_table(net, ipv4_route_path, tbl);
3313 if (net->ipv4.route_hdr == NULL)
3314 goto err_reg;
3315 return 0;
3317 err_reg:
3318 if (tbl != ipv4_route_flush_table)
3319 kfree(tbl);
3320 err_dup:
3321 return -ENOMEM;
3324 static __net_exit void sysctl_route_net_exit(struct net *net)
3326 struct ctl_table *tbl;
3328 tbl = net->ipv4.route_hdr->ctl_table_arg;
3329 unregister_net_sysctl_table(net->ipv4.route_hdr);
3330 BUG_ON(tbl == ipv4_route_flush_table);
3331 kfree(tbl);
3334 static __net_initdata struct pernet_operations sysctl_route_ops = {
3335 .init = sysctl_route_net_init,
3336 .exit = sysctl_route_net_exit,
3338 #endif
3341 static __net_init int rt_secret_timer_init(struct net *net)
3343 atomic_set(&net->ipv4.rt_genid,
3344 (int) ((num_physpages ^ (num_physpages>>8)) ^
3345 (jiffies ^ (jiffies >> 7))));
3347 net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
3348 net->ipv4.rt_secret_timer.data = (unsigned long)net;
3349 init_timer_deferrable(&net->ipv4.rt_secret_timer);
3351 if (ip_rt_secret_interval) {
3352 net->ipv4.rt_secret_timer.expires =
3353 jiffies + net_random() % ip_rt_secret_interval +
3354 ip_rt_secret_interval;
3355 add_timer(&net->ipv4.rt_secret_timer);
3357 return 0;
3360 static __net_exit void rt_secret_timer_exit(struct net *net)
3362 del_timer_sync(&net->ipv4.rt_secret_timer);
3365 static __net_initdata struct pernet_operations rt_secret_timer_ops = {
3366 .init = rt_secret_timer_init,
3367 .exit = rt_secret_timer_exit,
3371 #ifdef CONFIG_NET_CLS_ROUTE
3372 struct ip_rt_acct *ip_rt_acct __read_mostly;
3373 #endif /* CONFIG_NET_CLS_ROUTE */
3375 static __initdata unsigned long rhash_entries;
3376 static int __init set_rhash_entries(char *str)
3378 if (!str)
3379 return 0;
3380 rhash_entries = simple_strtoul(str, &str, 0);
3381 return 1;
3383 __setup("rhash_entries=", set_rhash_entries);
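/*
 * Editor's note (not part of the original source): the handler above parses
 * a kernel command-line override for the route-cache hash size, e.g. booting
 * with "rhash_entries=262144"; when it is absent, alloc_large_system_hash()
 * below sizes the table from available memory.
 */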
3385 int __init ip_rt_init(void)
3387 int rc = 0;
3389 #ifdef CONFIG_NET_CLS_ROUTE
3390 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3391 if (!ip_rt_acct)
3392 panic("IP: failed to allocate ip_rt_acct\n");
3393 #endif
3395 ipv4_dst_ops.kmem_cachep =
3396 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3397 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3399 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3401 rt_hash_table = (struct rt_hash_bucket *)
3402 alloc_large_system_hash("IP route cache",
3403 sizeof(struct rt_hash_bucket),
3404 rhash_entries,
3405 (num_physpages >= 128 * 1024) ?
3406 15 : 17,
3408 &rt_hash_log,
3409 &rt_hash_mask,
3410 rhash_entries ? 0 : 512 * 1024);
3411 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3412 rt_hash_lock_init();
3414 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3415 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3417 devinet_init();
3418 ip_fib_init();
3420 /* All the timers started at system startup tend
3421 to synchronize. Perturb them a bit.
3423 schedule_delayed_work(&expires_work,
3424 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3426 if (register_pernet_subsys(&rt_secret_timer_ops))
3427 printk(KERN_ERR "Unable to setup rt_secret_timer\n");
3429 if (ip_rt_proc_init())
3430 printk(KERN_ERR "Unable to create route proc files\n");
3431 #ifdef CONFIG_XFRM
3432 xfrm_init();
3433 xfrm4_init();
3434 #endif
3435 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3437 #ifdef CONFIG_SYSCTL
3438 register_pernet_subsys(&sysctl_route_ops);
3439 #endif
3440 return rc;
3443 #ifdef CONFIG_SYSCTL
3445 * We really need to sanitize the damn ipv4 init order, then all
3446 * this nonsense will go away.
3448 void __init ip_static_sysctl_init(void)
3450 register_sysctl_paths(ipv4_path, ipv4_skeleton);
3452 #endif
3454 EXPORT_SYMBOL(__ip_select_ident);
3455 EXPORT_SYMBOL(ip_route_input);
3456 EXPORT_SYMBOL(ip_route_output_key);