IPV4: route rekey timer can be deferrable
net/ipv4/route.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
8 * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
16 * Fixes:
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
23 * Alan Cox : Super /proc >4K
24 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
26 * clamper.
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36 * Alan Cox : Aligned routing errors more closely with BSD
37 * our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
42 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Split to fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file
57 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
58 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
59 * Ilia Sotnikov : Removed TOS from hash calculations
61 * This program is free software; you can redistribute it and/or
62 * modify it under the terms of the GNU General Public License
63 * as published by the Free Software Foundation; either version
64 * 2 of the License, or (at your option) any later version.
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/mm.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
79 #include <linux/in.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/workqueue.h>
85 #include <linux/skbuff.h>
86 #include <linux/inetdevice.h>
87 #include <linux/igmp.h>
88 #include <linux/pkt_sched.h>
89 #include <linux/mroute.h>
90 #include <linux/netfilter_ipv4.h>
91 #include <linux/random.h>
92 #include <linux/jhash.h>
93 #include <linux/rcupdate.h>
94 #include <linux/times.h>
95 #include <net/dst.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
98 #include <net/ip.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
103 #include <net/arp.h>
104 #include <net/tcp.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
113 #define RT_FL_TOS(oldflp) \
114 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
116 #define IP_MAX_MTU 0xFFF0
118 #define RT_GC_TIMEOUT (300*HZ)
120 static int ip_rt_max_size;
121 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
122 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
123 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
124 static int ip_rt_redirect_number __read_mostly = 9;
125 static int ip_rt_redirect_load __read_mostly = HZ / 50;
126 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
127 static int ip_rt_error_cost __read_mostly = HZ;
128 static int ip_rt_error_burst __read_mostly = 5 * HZ;
129 static int ip_rt_gc_elasticity __read_mostly = 8;
130 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
131 static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
132 static int ip_rt_min_advmss __read_mostly = 256;
133 static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ;
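/*
 * The knobs above are the route-cache tunables (most of them exposed as
 * net.ipv4.route.* sysctls).  ip_rt_secret_interval in particular is the
 * period of rt_secret_timer declared below: every 10 minutes by default
 * rt_secret_rebuild() bumps rt_genid, lazily invalidating the whole cache.
 */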
135 static void rt_worker_func(struct work_struct *work);
136 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
137 static struct timer_list rt_secret_timer;
140 * Interface to generic destination cache.
143 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
144 static void ipv4_dst_destroy(struct dst_entry *dst);
145 static void ipv4_dst_ifdown(struct dst_entry *dst,
146 struct net_device *dev, int how);
147 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
148 static void ipv4_link_failure(struct sk_buff *skb);
149 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
150 static int rt_garbage_collect(struct dst_ops *ops);
153 static struct dst_ops ipv4_dst_ops = {
154 .family = AF_INET,
155 .protocol = __constant_htons(ETH_P_IP),
156 .gc = rt_garbage_collect,
157 .check = ipv4_dst_check,
158 .destroy = ipv4_dst_destroy,
159 .ifdown = ipv4_dst_ifdown,
160 .negative_advice = ipv4_negative_advice,
161 .link_failure = ipv4_link_failure,
162 .update_pmtu = ip_rt_update_pmtu,
163 .local_out = ip_local_out,
164 .entry_size = sizeof(struct rtable),
165 .entries = ATOMIC_INIT(0),
168 #define ECN_OR_COST(class) TC_PRIO_##class
170 const __u8 ip_tos2prio[16] = {
171 TC_PRIO_BESTEFFORT,
172 ECN_OR_COST(FILLER),
173 TC_PRIO_BESTEFFORT,
174 ECN_OR_COST(BESTEFFORT),
175 TC_PRIO_BULK,
176 ECN_OR_COST(BULK),
177 TC_PRIO_BULK,
178 ECN_OR_COST(BULK),
179 TC_PRIO_INTERACTIVE,
180 ECN_OR_COST(INTERACTIVE),
181 TC_PRIO_INTERACTIVE,
182 ECN_OR_COST(INTERACTIVE),
183 TC_PRIO_INTERACTIVE_BULK,
184 ECN_OR_COST(INTERACTIVE_BULK),
185 TC_PRIO_INTERACTIVE_BULK,
186 ECN_OR_COST(INTERACTIVE_BULK)
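/*
 * ip_tos2prio[] maps the IP TOS field to a packet-scheduler priority band.
 * It is indexed with IPTOS_TOS(tos) >> 1 (see the rt_tos2priority() helper
 * in <net/route.h>), so bit 0 of the index is the TOS bit shared by the old
 * "minimise monetary cost" flag and ECN, hence the alternating
 * ECN_OR_COST() entries.
 */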
191 * Route cache.
194 /* The locking scheme is rather straightforward:
196 * 1) Read-Copy Update protects the buckets of the central route hash.
197 * 2) Only writers remove entries, and they hold the lock
198 * as they look at rtable reference counts.
199 * 3) Only readers acquire references to rtable entries,
200 * they do so with atomic increments and with the
201 * lock held.
204 struct rt_hash_bucket {
205 struct rtable *chain;
207 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
208 defined(CONFIG_PROVE_LOCKING)
210 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
211 * The size of this table is a power of two and depends on the number of CPUs.
212 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
214 #ifdef CONFIG_LOCKDEP
215 # define RT_HASH_LOCK_SZ 256
216 #else
217 # if NR_CPUS >= 32
218 # define RT_HASH_LOCK_SZ 4096
219 # elif NR_CPUS >= 16
220 # define RT_HASH_LOCK_SZ 2048
221 # elif NR_CPUS >= 8
222 # define RT_HASH_LOCK_SZ 1024
223 # elif NR_CPUS >= 4
224 # define RT_HASH_LOCK_SZ 512
225 # else
226 # define RT_HASH_LOCK_SZ 256
227 # endif
228 #endif
230 static spinlock_t *rt_hash_locks;
231 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
233 static __init void rt_hash_lock_init(void)
235 int i;
237 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
238 GFP_KERNEL);
239 if (!rt_hash_locks)
240 panic("IP: failed to allocate rt_hash_locks\n");
242 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
243 spin_lock_init(&rt_hash_locks[i]);
245 #else
246 # define rt_hash_lock_addr(slot) NULL
248 static inline void rt_hash_lock_init(void)
251 #endif
253 static struct rt_hash_bucket *rt_hash_table __read_mostly;
254 static unsigned rt_hash_mask __read_mostly;
255 static unsigned int rt_hash_log __read_mostly;
256 static atomic_t rt_genid __read_mostly;
258 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
259 #define RT_CACHE_STAT_INC(field) \
260 (__raw_get_cpu_var(rt_cache_stat).field++)
262 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx)
264 return jhash_3words((__force u32)(__be32)(daddr),
265 (__force u32)(__be32)(saddr),
266 idx, atomic_read(&rt_genid))
267 & rt_hash_mask;
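/*
 * rt_genid is mixed into the hash together with the addresses and the
 * interface index.  Bumping it (see rt_cache_invalidate() below) changes
 * the hash of every flow and makes the generation recorded in existing
 * entries stale, so the whole cache is invalidated at once; stale entries
 * are skipped by lookups and reaped as chains are walked by
 * rt_check_expire(), rt_intern_hash() and the garbage collector.
 */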
270 #ifdef CONFIG_PROC_FS
271 struct rt_cache_iter_state {
272 struct seq_net_private p;
273 int bucket;
274 int genid;
277 static struct rtable *rt_cache_get_first(struct seq_file *seq)
279 struct rt_cache_iter_state *st = seq->private;
280 struct rtable *r = NULL;
282 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
283 rcu_read_lock_bh();
284 r = rcu_dereference(rt_hash_table[st->bucket].chain);
285 while (r) {
286 if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
287 r->rt_genid == st->genid)
288 return r;
289 r = rcu_dereference(r->u.dst.rt_next);
291 rcu_read_unlock_bh();
293 return r;
296 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
297 struct rtable *r)
299 struct rt_cache_iter_state *st = seq->private;
300 r = r->u.dst.rt_next;
301 while (!r) {
302 rcu_read_unlock_bh();
303 if (--st->bucket < 0)
304 break;
305 rcu_read_lock_bh();
306 r = rt_hash_table[st->bucket].chain;
308 return rcu_dereference(r);
311 static struct rtable *rt_cache_get_next(struct seq_file *seq,
312 struct rtable *r)
314 struct rt_cache_iter_state *st = seq->private;
315 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
316 if (dev_net(r->u.dst.dev) != seq_file_net(seq))
317 continue;
318 if (r->rt_genid == st->genid)
319 break;
321 return r;
324 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
326 struct rtable *r = rt_cache_get_first(seq);
328 if (r)
329 while (pos && (r = rt_cache_get_next(seq, r)))
330 --pos;
331 return pos ? NULL : r;
334 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
336 struct rt_cache_iter_state *st = seq->private;
337 if (*pos)
338 return rt_cache_get_idx(seq, *pos - 1);
339 st->genid = atomic_read(&rt_genid);
340 return SEQ_START_TOKEN;
343 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
345 struct rtable *r;
347 if (v == SEQ_START_TOKEN)
348 r = rt_cache_get_first(seq);
349 else
350 r = rt_cache_get_next(seq, v);
351 ++*pos;
352 return r;
355 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
357 if (v && v != SEQ_START_TOKEN)
358 rcu_read_unlock_bh();
361 static int rt_cache_seq_show(struct seq_file *seq, void *v)
363 if (v == SEQ_START_TOKEN)
364 seq_printf(seq, "%-127s\n",
365 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
366 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
367 "HHUptod\tSpecDst");
368 else {
369 struct rtable *r = v;
370 char temp[256];
372 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
373 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
374 r->u.dst.dev ? r->u.dst.dev->name : "*",
375 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
376 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
377 r->u.dst.__use, 0, (unsigned long)r->rt_src,
378 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
379 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
380 dst_metric(&r->u.dst, RTAX_WINDOW),
381 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
382 dst_metric(&r->u.dst, RTAX_RTTVAR)),
383 r->fl.fl4_tos,
384 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
385 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
386 dev_queue_xmit) : 0,
387 r->rt_spec_dst);
388 seq_printf(seq, "%-127s\n", temp);
390 return 0;
393 static const struct seq_operations rt_cache_seq_ops = {
394 .start = rt_cache_seq_start,
395 .next = rt_cache_seq_next,
396 .stop = rt_cache_seq_stop,
397 .show = rt_cache_seq_show,
400 static int rt_cache_seq_open(struct inode *inode, struct file *file)
402 return seq_open_net(inode, file, &rt_cache_seq_ops,
403 sizeof(struct rt_cache_iter_state));
406 static const struct file_operations rt_cache_seq_fops = {
407 .owner = THIS_MODULE,
408 .open = rt_cache_seq_open,
409 .read = seq_read,
410 .llseek = seq_lseek,
411 .release = seq_release_net,
415 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
417 int cpu;
419 if (*pos == 0)
420 return SEQ_START_TOKEN;
422 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
423 if (!cpu_possible(cpu))
424 continue;
425 *pos = cpu+1;
426 return &per_cpu(rt_cache_stat, cpu);
428 return NULL;
431 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
433 int cpu;
435 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
436 if (!cpu_possible(cpu))
437 continue;
438 *pos = cpu+1;
439 return &per_cpu(rt_cache_stat, cpu);
441 return NULL;
445 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
450 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
452 struct rt_cache_stat *st = v;
454 if (v == SEQ_START_TOKEN) {
455 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
456 return 0;
459 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
460 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
461 atomic_read(&ipv4_dst_ops.entries),
462 st->in_hit,
463 st->in_slow_tot,
464 st->in_slow_mc,
465 st->in_no_route,
466 st->in_brd,
467 st->in_martian_dst,
468 st->in_martian_src,
470 st->out_hit,
471 st->out_slow_tot,
472 st->out_slow_mc,
474 st->gc_total,
475 st->gc_ignored,
476 st->gc_goal_miss,
477 st->gc_dst_overflow,
478 st->in_hlist_search,
479 st->out_hlist_search
481 return 0;
484 static const struct seq_operations rt_cpu_seq_ops = {
485 .start = rt_cpu_seq_start,
486 .next = rt_cpu_seq_next,
487 .stop = rt_cpu_seq_stop,
488 .show = rt_cpu_seq_show,
492 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
494 return seq_open(file, &rt_cpu_seq_ops);
497 static const struct file_operations rt_cpu_seq_fops = {
498 .owner = THIS_MODULE,
499 .open = rt_cpu_seq_open,
500 .read = seq_read,
501 .llseek = seq_lseek,
502 .release = seq_release,
505 #ifdef CONFIG_NET_CLS_ROUTE
506 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
507 int length, int *eof, void *data)
509 unsigned int i;
511 if ((offset & 3) || (length & 3))
512 return -EIO;
514 if (offset >= sizeof(struct ip_rt_acct) * 256) {
515 *eof = 1;
516 return 0;
519 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
520 length = sizeof(struct ip_rt_acct) * 256 - offset;
521 *eof = 1;
524 offset /= sizeof(u32);
526 if (length > 0) {
527 u32 *dst = (u32 *) buffer;
529 *start = buffer;
530 memset(dst, 0, length);
532 for_each_possible_cpu(i) {
533 unsigned int j;
534 u32 *src;
536 src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
537 for (j = 0; j < length/4; j++)
538 dst[j] += src[j];
541 return length;
543 #endif
545 static int __net_init ip_rt_do_proc_init(struct net *net)
547 struct proc_dir_entry *pde;
549 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
550 &rt_cache_seq_fops);
551 if (!pde)
552 goto err1;
554 pde = proc_create("rt_cache", S_IRUGO,
555 net->proc_net_stat, &rt_cpu_seq_fops);
556 if (!pde)
557 goto err2;
559 #ifdef CONFIG_NET_CLS_ROUTE
560 pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
561 ip_rt_acct_read, NULL);
562 if (!pde)
563 goto err3;
564 #endif
565 return 0;
567 #ifdef CONFIG_NET_CLS_ROUTE
568 err3:
569 remove_proc_entry("rt_cache", net->proc_net_stat);
570 #endif
571 err2:
572 remove_proc_entry("rt_cache", net->proc_net);
573 err1:
574 return -ENOMEM;
577 static void __net_exit ip_rt_do_proc_exit(struct net *net)
579 remove_proc_entry("rt_cache", net->proc_net_stat);
580 remove_proc_entry("rt_cache", net->proc_net);
581 remove_proc_entry("rt_acct", net->proc_net);
584 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
585 .init = ip_rt_do_proc_init,
586 .exit = ip_rt_do_proc_exit,
589 static int __init ip_rt_proc_init(void)
591 return register_pernet_subsys(&ip_rt_proc_ops);
594 #else
595 static inline int ip_rt_proc_init(void)
597 return 0;
599 #endif /* CONFIG_PROC_FS */
601 static inline void rt_free(struct rtable *rt)
603 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
606 static inline void rt_drop(struct rtable *rt)
608 ip_rt_put(rt);
609 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
612 static inline int rt_fast_clean(struct rtable *rth)
614 /* Kill broadcast/multicast entries very aggressively, if they
615 collide in the hash table with more useful entries */
616 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
617 rth->fl.iif && rth->u.dst.rt_next;
620 static inline int rt_valuable(struct rtable *rth)
622 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
623 rth->u.dst.expires;
626 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
628 unsigned long age;
629 int ret = 0;
631 if (atomic_read(&rth->u.dst.__refcnt))
632 goto out;
634 ret = 1;
635 if (rth->u.dst.expires &&
636 time_after_eq(jiffies, rth->u.dst.expires))
637 goto out;
639 age = jiffies - rth->u.dst.lastuse;
640 ret = 0;
641 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
642 (age <= tmo2 && rt_valuable(rth)))
643 goto out;
644 ret = 1;
645 out: return ret;
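/*
 * rt_may_expire(): a referenced entry never expires and one whose hard
 * expiry (dst.expires) has passed always does.  Otherwise tmo1 is the idle
 * timeout for ordinary entries and tmo2 the one for "valuable" entries
 * (redirected, notify or expiring); broadcast/multicast input entries that
 * collide in the hash (rt_fast_clean) only survive via the valuable/tmo2
 * test.  Callers halve tmo1 as they walk a chain, so entries deep in long
 * chains are reclaimed more aggressively.
 */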
648 /* Bits of score are:
649 * 31: very valuable
650 * 30: not quite useless
651 * 29..0: usage counter
653 static inline u32 rt_score(struct rtable *rt)
655 u32 score = jiffies - rt->u.dst.lastuse;
657 score = ~score & ~(3<<30);
659 if (rt_valuable(rt))
660 score |= (1<<31);
662 if (!rt->fl.iif ||
663 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
664 score |= (1<<30);
666 return score;
669 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
671 return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
672 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
673 (fl1->mark ^ fl2->mark) |
674 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
675 *(u16 *)&fl2->nl_u.ip4_u.tos) |
676 (fl1->oif ^ fl2->oif) |
677 (fl1->iif ^ fl2->iif)) == 0;
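/*
 * compare_keys() compares the whole flow key without branches: each pair
 * of fields is XORed and the results ORed together, so the keys match only
 * if the final value is zero.  The 16-bit load compares tos together with
 * the adjacent scope byte in one go.  This keeps the hot cache-lookup path
 * branch-free.
 */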
680 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
682 return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
686 * Perform a full scan of the hash table and free all entries.
687 * Can be called by a softirq or a process.
688 * In the latter case, we want to reschedule if necessary.
690 static void rt_do_flush(int process_context)
692 unsigned int i;
693 struct rtable *rth, *next;
695 for (i = 0; i <= rt_hash_mask; i++) {
696 if (process_context && need_resched())
697 cond_resched();
698 rth = rt_hash_table[i].chain;
699 if (!rth)
700 continue;
702 spin_lock_bh(rt_hash_lock_addr(i));
703 rth = rt_hash_table[i].chain;
704 rt_hash_table[i].chain = NULL;
705 spin_unlock_bh(rt_hash_lock_addr(i));
707 for (; rth; rth = next) {
708 next = rth->u.dst.rt_next;
709 rt_free(rth);
714 static void rt_check_expire(void)
716 static unsigned int rover;
717 unsigned int i = rover, goal;
718 struct rtable *rth, **rthp;
719 u64 mult;
721 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
722 if (ip_rt_gc_timeout > 1)
723 do_div(mult, ip_rt_gc_timeout);
724 goal = (unsigned int)mult;
725 if (goal > rt_hash_mask)
726 goal = rt_hash_mask + 1;
727 for (; goal > 0; goal--) {
728 unsigned long tmo = ip_rt_gc_timeout;
730 i = (i + 1) & rt_hash_mask;
731 rthp = &rt_hash_table[i].chain;
733 if (need_resched())
734 cond_resched();
736 if (*rthp == NULL)
737 continue;
738 spin_lock_bh(rt_hash_lock_addr(i));
739 while ((rth = *rthp) != NULL) {
740 if (rth->rt_genid != atomic_read(&rt_genid)) {
741 *rthp = rth->u.dst.rt_next;
742 rt_free(rth);
743 continue;
745 if (rth->u.dst.expires) {
746 /* Entry is expired even if it is in use */
747 if (time_before_eq(jiffies, rth->u.dst.expires)) {
748 tmo >>= 1;
749 rthp = &rth->u.dst.rt_next;
750 continue;
752 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
753 tmo >>= 1;
754 rthp = &rth->u.dst.rt_next;
755 continue;
758 /* Cleanup aged off entries. */
759 *rthp = rth->u.dst.rt_next;
760 rt_free(rth);
762 spin_unlock_bh(rt_hash_lock_addr(i));
764 rover = i;
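/*
 * Each rt_check_expire() pass scans
 *     goal = hash_size * ip_rt_gc_interval / ip_rt_gc_timeout
 * buckets, continuing from the static "rover".  With the defaults
 * (gc_interval = 60s, gc_timeout = 300s) that is one fifth of the table
 * per run, so the whole table is covered roughly once per gc_timeout.
 */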
768 * rt_worker_func() is run in process context.
769 * we call rt_check_expire() to scan part of the hash table
771 static void rt_worker_func(struct work_struct *work)
773 rt_check_expire();
774 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
778 * Perturbation of rt_genid by a small quantity [1..256].
779 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
780 * many times (2^24) without repeating a recent rt_genid.
781 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
783 static void rt_cache_invalidate(void)
785 unsigned char shuffle;
787 get_random_bytes(&shuffle, sizeof(shuffle));
788 atomic_add(shuffle + 1U, &rt_genid);
792 * delay < 0 : invalidate cache (fast : entries will be deleted later)
793 * delay >= 0 : invalidate & flush cache (can be long)
795 void rt_cache_flush(int delay)
797 rt_cache_invalidate();
798 if (delay >= 0)
799 rt_do_flush(!in_softirq());
803 * We change rt_genid and let gc do the cleanup
805 static void rt_secret_rebuild(unsigned long dummy)
807 rt_cache_invalidate();
808 mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
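/*
 * rt_secret_rebuild() is the rt_secret_timer handler: every
 * ip_rt_secret_interval (10 minutes by default) it invalidates the cache
 * by bumping rt_genid and re-arms itself.  The exact firing time is not
 * critical, which is why this rekey timer is a candidate for being made
 * deferrable, so it does not have to wake up an otherwise idle CPU.
 */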
812 Short description of GC goals.
814 We want to build an algorithm which keeps the routing cache
815 at some equilibrium point, where the number of aged-off entries
816 is kept approximately equal to the newly generated ones.
818 The current expiration strength is the variable "expire".
819 We try to adjust it dynamically, so that when the network
820 is idle expire is large enough to keep enough warm entries,
821 and when load increases it shrinks to limit the cache size.
824 static int rt_garbage_collect(struct dst_ops *ops)
826 static unsigned long expire = RT_GC_TIMEOUT;
827 static unsigned long last_gc;
828 static int rover;
829 static int equilibrium;
830 struct rtable *rth, **rthp;
831 unsigned long now = jiffies;
832 int goal;
835 * Garbage collection is pretty expensive,
836 * do not make it too frequently.
839 RT_CACHE_STAT_INC(gc_total);
841 if (now - last_gc < ip_rt_gc_min_interval &&
842 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
843 RT_CACHE_STAT_INC(gc_ignored);
844 goto out;
847 /* Calculate the number of entries which we want to expire now. */
848 goal = atomic_read(&ipv4_dst_ops.entries) -
849 (ip_rt_gc_elasticity << rt_hash_log);
850 if (goal <= 0) {
851 if (equilibrium < ipv4_dst_ops.gc_thresh)
852 equilibrium = ipv4_dst_ops.gc_thresh;
853 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
854 if (goal > 0) {
855 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
856 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
858 } else {
859 /* We are in a dangerous area. Try to reduce the cache really
860 * aggressively.
862 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
863 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
866 if (now - last_gc >= ip_rt_gc_min_interval)
867 last_gc = now;
869 if (goal <= 0) {
870 equilibrium += goal;
871 goto work_done;
874 do {
875 int i, k;
877 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
878 unsigned long tmo = expire;
880 k = (k + 1) & rt_hash_mask;
881 rthp = &rt_hash_table[k].chain;
882 spin_lock_bh(rt_hash_lock_addr(k));
883 while ((rth = *rthp) != NULL) {
884 if (rth->rt_genid == atomic_read(&rt_genid) &&
885 !rt_may_expire(rth, tmo, expire)) {
886 tmo >>= 1;
887 rthp = &rth->u.dst.rt_next;
888 continue;
890 *rthp = rth->u.dst.rt_next;
891 rt_free(rth);
892 goal--;
894 spin_unlock_bh(rt_hash_lock_addr(k));
895 if (goal <= 0)
896 break;
898 rover = k;
900 if (goal <= 0)
901 goto work_done;
903 /* Goal is not achieved. We stop the process if:
905 - expire is reduced to zero; otherwise, expire is halved.
906 - the table is not full.
907 - we are called from interrupt context.
908 - the jiffies check is just a fallback/debug loop breaker.
909 We will not spin here for a long time in any case.
912 RT_CACHE_STAT_INC(gc_goal_miss);
914 if (expire == 0)
915 break;
917 expire >>= 1;
918 #if RT_CACHE_DEBUG >= 2
919 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
920 atomic_read(&ipv4_dst_ops.entries), goal, i);
921 #endif
923 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
924 goto out;
925 } while (!in_softirq() && time_before_eq(jiffies, now));
927 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
928 goto out;
929 if (net_ratelimit())
930 printk(KERN_WARNING "dst cache overflow\n");
931 RT_CACHE_STAT_INC(gc_dst_overflow);
932 return 1;
934 work_done:
935 expire += ip_rt_gc_min_interval;
936 if (expire > ip_rt_gc_timeout ||
937 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
938 expire = ip_rt_gc_timeout;
939 #if RT_CACHE_DEBUG >= 2
940 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
941 atomic_read(&ipv4_dst_ops.entries), goal, rover);
942 #endif
943 out: return 0;
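/*
 * Summary of the collector above: it is skipped entirely if it ran less
 * than ip_rt_gc_min_interval ago and the cache is below ip_rt_max_size.
 * Otherwise the target is
 *     goal = entries - (ip_rt_gc_elasticity << rt_hash_log)
 * i.e. shrink towards an average chain length of ip_rt_gc_elasticity
 * (8 by default); "expire" is raised a little after a successful run and
 * halved while the goal is missed, making eviction progressively more
 * aggressive under load.
 */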
946 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
948 struct rtable *rth, **rthp;
949 unsigned long now;
950 struct rtable *cand, **candp;
951 u32 min_score;
952 int chain_length;
953 int attempts = !in_softirq();
955 restart:
956 chain_length = 0;
957 min_score = ~(u32)0;
958 cand = NULL;
959 candp = NULL;
960 now = jiffies;
962 rthp = &rt_hash_table[hash].chain;
964 spin_lock_bh(rt_hash_lock_addr(hash));
965 while ((rth = *rthp) != NULL) {
966 if (rth->rt_genid != atomic_read(&rt_genid)) {
967 *rthp = rth->u.dst.rt_next;
968 rt_free(rth);
969 continue;
971 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
972 /* Put it first */
973 *rthp = rth->u.dst.rt_next;
975 * Since lookup is lockfree, the deletion
976 * must be visible to another weakly ordered CPU before
977 * the insertion at the start of the hash chain.
979 rcu_assign_pointer(rth->u.dst.rt_next,
980 rt_hash_table[hash].chain);
982 * Since lookup is lockfree, the update writes
983 * must be ordered for consistency on SMP.
985 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
987 dst_use(&rth->u.dst, now);
988 spin_unlock_bh(rt_hash_lock_addr(hash));
990 rt_drop(rt);
991 *rp = rth;
992 return 0;
995 if (!atomic_read(&rth->u.dst.__refcnt)) {
996 u32 score = rt_score(rth);
998 if (score <= min_score) {
999 cand = rth;
1000 candp = rthp;
1001 min_score = score;
1005 chain_length++;
1007 rthp = &rth->u.dst.rt_next;
1010 if (cand) {
1011 /* ip_rt_gc_elasticity used to be the average chain length;
1012 * when it is exceeded, gc becomes really aggressive.
1014 * The second limit is less certain. At the moment it allows
1015 * only 2 entries per bucket. We will see.
1017 if (chain_length > ip_rt_gc_elasticity) {
1018 *candp = cand->u.dst.rt_next;
1019 rt_free(cand);
1023 /* Try to bind the route to arp only if it is an output
1024 route or a unicast forwarding path.
1026 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1027 int err = arp_bind_neighbour(&rt->u.dst);
1028 if (err) {
1029 spin_unlock_bh(rt_hash_lock_addr(hash));
1031 if (err != -ENOBUFS) {
1032 rt_drop(rt);
1033 return err;
1036 /* Neighbour tables are full and nothing
1037 can be released. Try to shrink the route cache;
1038 most likely it holds some neighbour records.
1040 if (attempts-- > 0) {
1041 int saved_elasticity = ip_rt_gc_elasticity;
1042 int saved_int = ip_rt_gc_min_interval;
1043 ip_rt_gc_elasticity = 1;
1044 ip_rt_gc_min_interval = 0;
1045 rt_garbage_collect(&ipv4_dst_ops);
1046 ip_rt_gc_min_interval = saved_int;
1047 ip_rt_gc_elasticity = saved_elasticity;
1048 goto restart;
1051 if (net_ratelimit())
1052 printk(KERN_WARNING "Neighbour table overflow.\n");
1053 rt_drop(rt);
1054 return -ENOBUFS;
1058 rt->u.dst.rt_next = rt_hash_table[hash].chain;
1059 #if RT_CACHE_DEBUG >= 2
1060 if (rt->u.dst.rt_next) {
1061 struct rtable *trt;
1062 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1063 NIPQUAD(rt->rt_dst));
1064 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1065 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1066 printk("\n");
1068 #endif
1069 rt_hash_table[hash].chain = rt;
1070 spin_unlock_bh(rt_hash_lock_addr(hash));
1071 *rp = rt;
1072 return 0;
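/*
 * rt_intern_hash() summary: under the bucket lock it first looks for an
 * identical flow (in which case the existing entry is moved to the head of
 * the chain and reused), remembers the lowest-scored unreferenced entry as
 * an eviction candidate, and drops that candidate if the chain has grown
 * past ip_rt_gc_elasticity.  New output/forwarding routes are bound to an
 * ARP neighbour before insertion; if that fails with -ENOBUFS, an emergency
 * garbage collection is run and the insertion retried once (when called
 * from process context) before giving up.
 */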
1075 void rt_bind_peer(struct rtable *rt, int create)
1077 static DEFINE_SPINLOCK(rt_peer_lock);
1078 struct inet_peer *peer;
1080 peer = inet_getpeer(rt->rt_dst, create);
1082 spin_lock_bh(&rt_peer_lock);
1083 if (rt->peer == NULL) {
1084 rt->peer = peer;
1085 peer = NULL;
1087 spin_unlock_bh(&rt_peer_lock);
1088 if (peer)
1089 inet_putpeer(peer);
1093 * Peer allocation may fail only in serious out-of-memory conditions. However
1094 * we can still generate some output.
1095 * Random ID selection looks a bit dangerous because we have no chance of
1096 * selecting an ID that is unique over a reasonable period of time.
1097 * But a broken packet identifier may be better than no packet at all.
1099 static void ip_select_fb_ident(struct iphdr *iph)
1101 static DEFINE_SPINLOCK(ip_fb_id_lock);
1102 static u32 ip_fallback_id;
1103 u32 salt;
1105 spin_lock_bh(&ip_fb_id_lock);
1106 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1107 iph->id = htons(salt & 0xFFFF);
1108 ip_fallback_id = salt;
1109 spin_unlock_bh(&ip_fb_id_lock);
1112 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1114 struct rtable *rt = (struct rtable *) dst;
1116 if (rt) {
1117 if (rt->peer == NULL)
1118 rt_bind_peer(rt, 1);
1120 /* If peer is attached to destination, it is never detached,
1121 so we need not grab a lock to dereference it.
1123 if (rt->peer) {
1124 iph->id = htons(inet_getid(rt->peer, more));
1125 return;
1127 } else
1128 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1129 __builtin_return_address(0));
1131 ip_select_fb_ident(iph);
1134 static void rt_del(unsigned hash, struct rtable *rt)
1136 struct rtable **rthp, *aux;
1138 rthp = &rt_hash_table[hash].chain;
1139 spin_lock_bh(rt_hash_lock_addr(hash));
1140 ip_rt_put(rt);
1141 while ((aux = *rthp) != NULL) {
1142 if (aux == rt || (aux->rt_genid != atomic_read(&rt_genid))) {
1143 *rthp = aux->u.dst.rt_next;
1144 rt_free(aux);
1145 continue;
1147 rthp = &aux->u.dst.rt_next;
1149 spin_unlock_bh(rt_hash_lock_addr(hash));
1152 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1153 __be32 saddr, struct net_device *dev)
1155 int i, k;
1156 struct in_device *in_dev = in_dev_get(dev);
1157 struct rtable *rth, **rthp;
1158 __be32 skeys[2] = { saddr, 0 };
1159 int ikeys[2] = { dev->ifindex, 0 };
1160 struct netevent_redirect netevent;
1161 struct net *net;
1163 if (!in_dev)
1164 return;
1166 net = dev_net(dev);
1167 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1168 || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1169 || ipv4_is_zeronet(new_gw))
1170 goto reject_redirect;
1172 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1173 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1174 goto reject_redirect;
1175 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1176 goto reject_redirect;
1177 } else {
1178 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1179 goto reject_redirect;
1182 for (i = 0; i < 2; i++) {
1183 for (k = 0; k < 2; k++) {
1184 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1186 rthp=&rt_hash_table[hash].chain;
1188 rcu_read_lock();
1189 while ((rth = rcu_dereference(*rthp)) != NULL) {
1190 struct rtable *rt;
1192 if (rth->fl.fl4_dst != daddr ||
1193 rth->fl.fl4_src != skeys[i] ||
1194 rth->fl.oif != ikeys[k] ||
1195 rth->fl.iif != 0 ||
1196 rth->rt_genid != atomic_read(&rt_genid) ||
1197 !net_eq(dev_net(rth->u.dst.dev), net)) {
1198 rthp = &rth->u.dst.rt_next;
1199 continue;
1202 if (rth->rt_dst != daddr ||
1203 rth->rt_src != saddr ||
1204 rth->u.dst.error ||
1205 rth->rt_gateway != old_gw ||
1206 rth->u.dst.dev != dev)
1207 break;
1209 dst_hold(&rth->u.dst);
1210 rcu_read_unlock();
1212 rt = dst_alloc(&ipv4_dst_ops);
1213 if (rt == NULL) {
1214 ip_rt_put(rth);
1215 in_dev_put(in_dev);
1216 return;
1219 /* Copy all the information. */
1220 *rt = *rth;
1221 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1222 rt->u.dst.__use = 1;
1223 atomic_set(&rt->u.dst.__refcnt, 1);
1224 rt->u.dst.child = NULL;
1225 if (rt->u.dst.dev)
1226 dev_hold(rt->u.dst.dev);
1227 if (rt->idev)
1228 in_dev_hold(rt->idev);
1229 rt->u.dst.obsolete = 0;
1230 rt->u.dst.lastuse = jiffies;
1231 rt->u.dst.path = &rt->u.dst;
1232 rt->u.dst.neighbour = NULL;
1233 rt->u.dst.hh = NULL;
1234 rt->u.dst.xfrm = NULL;
1235 rt->rt_genid = atomic_read(&rt_genid);
1236 rt->rt_flags |= RTCF_REDIRECTED;
1238 /* Gateway is different ... */
1239 rt->rt_gateway = new_gw;
1241 /* Redirect received -> path was valid */
1242 dst_confirm(&rth->u.dst);
1244 if (rt->peer)
1245 atomic_inc(&rt->peer->refcnt);
1247 if (arp_bind_neighbour(&rt->u.dst) ||
1248 !(rt->u.dst.neighbour->nud_state &
1249 NUD_VALID)) {
1250 if (rt->u.dst.neighbour)
1251 neigh_event_send(rt->u.dst.neighbour, NULL);
1252 ip_rt_put(rth);
1253 rt_drop(rt);
1254 goto do_next;
1257 netevent.old = &rth->u.dst;
1258 netevent.new = &rt->u.dst;
1259 call_netevent_notifiers(NETEVENT_REDIRECT,
1260 &netevent);
1262 rt_del(hash, rth);
1263 if (!rt_intern_hash(hash, rt, &rt))
1264 ip_rt_put(rt);
1265 goto do_next;
1267 rcu_read_unlock();
1268 do_next:
1272 in_dev_put(in_dev);
1273 return;
1275 reject_redirect:
1276 #ifdef CONFIG_IP_ROUTE_VERBOSE
1277 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1278 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1279 "%u.%u.%u.%u ignored.\n"
1280 " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1281 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1282 NIPQUAD(saddr), NIPQUAD(daddr));
1283 #endif
1284 in_dev_put(in_dev);
1287 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1289 struct rtable *rt = (struct rtable *)dst;
1290 struct dst_entry *ret = dst;
1292 if (rt) {
1293 if (dst->obsolete) {
1294 ip_rt_put(rt);
1295 ret = NULL;
1296 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1297 rt->u.dst.expires) {
1298 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1299 rt->fl.oif);
1300 #if RT_CACHE_DEBUG >= 1
1301 printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1302 "%u.%u.%u.%u/%02x dropped\n",
1303 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1304 #endif
1305 rt_del(hash, rt);
1306 ret = NULL;
1309 return ret;
1313 * Algorithm:
1314 * 1. The first ip_rt_redirect_number redirects are sent
1315 * with exponential backoff, then we stop sending them at all,
1316 * assuming that the host ignores our redirects.
1317 * 2. If we did not see packets requiring redirects
1318 * during ip_rt_redirect_silence, we assume that the host
1319 * forgot the redirected route and start sending redirects again.
1321 * This algorithm is much cheaper and more intelligent than dumb load limiting
1322 * in icmp.c.
1324 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1325 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1328 void ip_rt_send_redirect(struct sk_buff *skb)
1330 struct rtable *rt = skb->rtable;
1331 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1333 if (!in_dev)
1334 return;
1336 if (!IN_DEV_TX_REDIRECTS(in_dev))
1337 goto out;
1339 /* No redirected packets during ip_rt_redirect_silence;
1340 * reset the algorithm.
1342 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1343 rt->u.dst.rate_tokens = 0;
1345 /* Too many ignored redirects; do not send anything;
1346 * set u.dst.rate_last to the last seen redirected packet.
1348 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1349 rt->u.dst.rate_last = jiffies;
1350 goto out;
1353 /* Check for load limit; set rate_last to the latest sent
1354 * redirect.
1356 if (rt->u.dst.rate_tokens == 0 ||
1357 time_after(jiffies,
1358 (rt->u.dst.rate_last +
1359 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1360 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1361 rt->u.dst.rate_last = jiffies;
1362 ++rt->u.dst.rate_tokens;
1363 #ifdef CONFIG_IP_ROUTE_VERBOSE
1364 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1365 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1366 net_ratelimit())
1367 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1368 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1369 NIPQUAD(rt->rt_src), rt->rt_iif,
1370 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1371 #endif
1373 out:
1374 in_dev_put(in_dev);
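/*
 * With the defaults above, a destination is sent at most
 * ip_rt_redirect_number (9) redirects, spaced with exponential backoff
 * based on ip_rt_redirect_load (20 ms), after which we go silent.  Once
 * roughly ip_rt_redirect_silence (ip_rt_redirect_load << 10, about 20
 * seconds) passes without traffic that would need a redirect, rate_tokens
 * is reset and the cycle starts again.
 */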
1377 static int ip_error(struct sk_buff *skb)
1379 struct rtable *rt = skb->rtable;
1380 unsigned long now;
1381 int code;
1383 switch (rt->u.dst.error) {
1384 case EINVAL:
1385 default:
1386 goto out;
1387 case EHOSTUNREACH:
1388 code = ICMP_HOST_UNREACH;
1389 break;
1390 case ENETUNREACH:
1391 code = ICMP_NET_UNREACH;
1392 IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1393 break;
1394 case EACCES:
1395 code = ICMP_PKT_FILTERED;
1396 break;
1399 now = jiffies;
1400 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1401 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1402 rt->u.dst.rate_tokens = ip_rt_error_burst;
1403 rt->u.dst.rate_last = now;
1404 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1405 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1406 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1409 out: kfree_skb(skb);
1410 return 0;
1414 * The last two values are not from the RFC but
1415 * are needed for AMPRnet AX.25 paths.
1418 static const unsigned short mtu_plateau[] =
1419 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1421 static inline unsigned short guess_mtu(unsigned short old_mtu)
1423 int i;
1425 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1426 if (old_mtu > mtu_plateau[i])
1427 return mtu_plateau[i];
1428 return 68;
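/*
 * mtu_plateau[] is the classic Path MTU Discovery plateau table (cf.
 * RFC 1191): when a "fragmentation needed" ICMP arrives without a usable
 * next-hop MTU, guess_mtu() drops to the next plateau below the old MTU,
 * bottoming out at 68, the minimum IPv4 MTU.
 */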
1431 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1432 unsigned short new_mtu)
1434 int i;
1435 unsigned short old_mtu = ntohs(iph->tot_len);
1436 struct rtable *rth;
1437 __be32 skeys[2] = { iph->saddr, 0, };
1438 __be32 daddr = iph->daddr;
1439 unsigned short est_mtu = 0;
1441 if (ipv4_config.no_pmtu_disc)
1442 return 0;
1444 for (i = 0; i < 2; i++) {
1445 unsigned hash = rt_hash(daddr, skeys[i], 0);
1447 rcu_read_lock();
1448 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1449 rth = rcu_dereference(rth->u.dst.rt_next)) {
1450 if (rth->fl.fl4_dst == daddr &&
1451 rth->fl.fl4_src == skeys[i] &&
1452 rth->rt_dst == daddr &&
1453 rth->rt_src == iph->saddr &&
1454 rth->fl.iif == 0 &&
1455 !(dst_metric_locked(&rth->u.dst, RTAX_MTU)) &&
1456 net_eq(dev_net(rth->u.dst.dev), net) &&
1457 rth->rt_genid == atomic_read(&rt_genid)) {
1458 unsigned short mtu = new_mtu;
1460 if (new_mtu < 68 || new_mtu >= old_mtu) {
1462 /* BSD 4.2 compatibility hack :-( */
1463 if (mtu == 0 &&
1464 old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1465 old_mtu >= 68 + (iph->ihl << 2))
1466 old_mtu -= iph->ihl << 2;
1468 mtu = guess_mtu(old_mtu);
1470 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1471 if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1472 dst_confirm(&rth->u.dst);
1473 if (mtu < ip_rt_min_pmtu) {
1474 mtu = ip_rt_min_pmtu;
1475 rth->u.dst.metrics[RTAX_LOCK-1] |=
1476 (1 << RTAX_MTU);
1478 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1479 dst_set_expires(&rth->u.dst,
1480 ip_rt_mtu_expires);
1482 est_mtu = mtu;
1486 rcu_read_unlock();
1488 return est_mtu ? : new_mtu;
1491 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1493 if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1494 !(dst_metric_locked(dst, RTAX_MTU))) {
1495 if (mtu < ip_rt_min_pmtu) {
1496 mtu = ip_rt_min_pmtu;
1497 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1499 dst->metrics[RTAX_MTU-1] = mtu;
1500 dst_set_expires(dst, ip_rt_mtu_expires);
1501 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1505 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1507 return NULL;
1510 static void ipv4_dst_destroy(struct dst_entry *dst)
1512 struct rtable *rt = (struct rtable *) dst;
1513 struct inet_peer *peer = rt->peer;
1514 struct in_device *idev = rt->idev;
1516 if (peer) {
1517 rt->peer = NULL;
1518 inet_putpeer(peer);
1521 if (idev) {
1522 rt->idev = NULL;
1523 in_dev_put(idev);
1527 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1528 int how)
1530 struct rtable *rt = (struct rtable *) dst;
1531 struct in_device *idev = rt->idev;
1532 if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1533 struct in_device *loopback_idev =
1534 in_dev_get(dev_net(dev)->loopback_dev);
1535 if (loopback_idev) {
1536 rt->idev = loopback_idev;
1537 in_dev_put(idev);
1542 static void ipv4_link_failure(struct sk_buff *skb)
1544 struct rtable *rt;
1546 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1548 rt = skb->rtable;
1549 if (rt)
1550 dst_set_expires(&rt->u.dst, 0);
1553 static int ip_rt_bug(struct sk_buff *skb)
1555 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1556 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1557 skb->dev ? skb->dev->name : "?");
1558 kfree_skb(skb);
1559 return 0;
1563 We do not cache the source address of the outgoing interface,
1564 because it is used only by IP RR, TS and SRR options,
1565 so it is out of the fast path.
1567 BTW remember: "addr" is allowed to be unaligned
1568 in IP options!
1571 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1573 __be32 src;
1574 struct fib_result res;
1576 if (rt->fl.iif == 0)
1577 src = rt->rt_src;
1578 else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1579 src = FIB_RES_PREFSRC(res);
1580 fib_res_put(&res);
1581 } else
1582 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1583 RT_SCOPE_UNIVERSE);
1584 memcpy(addr, &src, 4);
1587 #ifdef CONFIG_NET_CLS_ROUTE
1588 static void set_class_tag(struct rtable *rt, u32 tag)
1590 if (!(rt->u.dst.tclassid & 0xFFFF))
1591 rt->u.dst.tclassid |= tag & 0xFFFF;
1592 if (!(rt->u.dst.tclassid & 0xFFFF0000))
1593 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1595 #endif
1597 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1599 struct fib_info *fi = res->fi;
1601 if (fi) {
1602 if (FIB_RES_GW(*res) &&
1603 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1604 rt->rt_gateway = FIB_RES_GW(*res);
1605 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1606 sizeof(rt->u.dst.metrics));
1607 if (fi->fib_mtu == 0) {
1608 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1609 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1610 rt->rt_gateway != rt->rt_dst &&
1611 rt->u.dst.dev->mtu > 576)
1612 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1614 #ifdef CONFIG_NET_CLS_ROUTE
1615 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1616 #endif
1617 } else
1618 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1620 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1621 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1622 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1623 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1624 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1625 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1626 ip_rt_min_advmss);
1627 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1628 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1630 #ifdef CONFIG_NET_CLS_ROUTE
1631 #ifdef CONFIG_IP_MULTIPLE_TABLES
1632 set_class_tag(rt, fib_rules_tclass(res));
1633 #endif
1634 set_class_tag(rt, itag);
1635 #endif
1636 rt->rt_type = res->type;
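/*
 * rt_set_nexthop() fills in the dst metrics from the FIB result and then
 * applies defaults: the hoplimit falls back to sysctl_ip_default_ttl, the
 * MTU is capped at IP_MAX_MTU, and the advertised MSS defaults to
 * dev->mtu - 40 (IP + TCP headers) clamped between ip_rt_min_advmss and
 * 65535 - 40.
 */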
1639 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1640 u8 tos, struct net_device *dev, int our)
1642 unsigned hash;
1643 struct rtable *rth;
1644 __be32 spec_dst;
1645 struct in_device *in_dev = in_dev_get(dev);
1646 u32 itag = 0;
1648 /* Primary sanity checks. */
1650 if (in_dev == NULL)
1651 return -EINVAL;
1653 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1654 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1655 goto e_inval;
1657 if (ipv4_is_zeronet(saddr)) {
1658 if (!ipv4_is_local_multicast(daddr))
1659 goto e_inval;
1660 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1661 } else if (fib_validate_source(saddr, 0, tos, 0,
1662 dev, &spec_dst, &itag) < 0)
1663 goto e_inval;
1665 rth = dst_alloc(&ipv4_dst_ops);
1666 if (!rth)
1667 goto e_nobufs;
1669 rth->u.dst.output= ip_rt_bug;
1671 atomic_set(&rth->u.dst.__refcnt, 1);
1672 rth->u.dst.flags= DST_HOST;
1673 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1674 rth->u.dst.flags |= DST_NOPOLICY;
1675 rth->fl.fl4_dst = daddr;
1676 rth->rt_dst = daddr;
1677 rth->fl.fl4_tos = tos;
1678 rth->fl.mark = skb->mark;
1679 rth->fl.fl4_src = saddr;
1680 rth->rt_src = saddr;
1681 #ifdef CONFIG_NET_CLS_ROUTE
1682 rth->u.dst.tclassid = itag;
1683 #endif
1684 rth->rt_iif =
1685 rth->fl.iif = dev->ifindex;
1686 rth->u.dst.dev = init_net.loopback_dev;
1687 dev_hold(rth->u.dst.dev);
1688 rth->idev = in_dev_get(rth->u.dst.dev);
1689 rth->fl.oif = 0;
1690 rth->rt_gateway = daddr;
1691 rth->rt_spec_dst= spec_dst;
1692 rth->rt_genid = atomic_read(&rt_genid);
1693 rth->rt_flags = RTCF_MULTICAST;
1694 rth->rt_type = RTN_MULTICAST;
1695 if (our) {
1696 rth->u.dst.input= ip_local_deliver;
1697 rth->rt_flags |= RTCF_LOCAL;
1700 #ifdef CONFIG_IP_MROUTE
1701 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1702 rth->u.dst.input = ip_mr_input;
1703 #endif
1704 RT_CACHE_STAT_INC(in_slow_mc);
1706 in_dev_put(in_dev);
1707 hash = rt_hash(daddr, saddr, dev->ifindex);
1708 return rt_intern_hash(hash, rth, &skb->rtable);
1710 e_nobufs:
1711 in_dev_put(in_dev);
1712 return -ENOBUFS;
1714 e_inval:
1715 in_dev_put(in_dev);
1716 return -EINVAL;
1720 static void ip_handle_martian_source(struct net_device *dev,
1721 struct in_device *in_dev,
1722 struct sk_buff *skb,
1723 __be32 daddr,
1724 __be32 saddr)
1726 RT_CACHE_STAT_INC(in_martian_src);
1727 #ifdef CONFIG_IP_ROUTE_VERBOSE
1728 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1730 * RFC1812 recommendation: if the source is martian,
1731 * the only hint is the MAC header.
1733 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1734 "%u.%u.%u.%u, on dev %s\n",
1735 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1736 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1737 int i;
1738 const unsigned char *p = skb_mac_header(skb);
1739 printk(KERN_WARNING "ll header: ");
1740 for (i = 0; i < dev->hard_header_len; i++, p++) {
1741 printk("%02x", *p);
1742 if (i < (dev->hard_header_len - 1))
1743 printk(":");
1745 printk("\n");
1748 #endif
1751 static int __mkroute_input(struct sk_buff *skb,
1752 struct fib_result *res,
1753 struct in_device *in_dev,
1754 __be32 daddr, __be32 saddr, u32 tos,
1755 struct rtable **result)
1758 struct rtable *rth;
1759 int err;
1760 struct in_device *out_dev;
1761 unsigned flags = 0;
1762 __be32 spec_dst;
1763 u32 itag;
1765 /* get a working reference to the output device */
1766 out_dev = in_dev_get(FIB_RES_DEV(*res));
1767 if (out_dev == NULL) {
1768 if (net_ratelimit())
1769 printk(KERN_CRIT "Bug in ip_route_input" \
1770 "_slow(). Please, report\n");
1771 return -EINVAL;
1775 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1776 in_dev->dev, &spec_dst, &itag);
1777 if (err < 0) {
1778 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1779 saddr);
1781 err = -EINVAL;
1782 goto cleanup;
1785 if (err)
1786 flags |= RTCF_DIRECTSRC;
1788 if (out_dev == in_dev && err && !(flags & RTCF_MASQ) &&
1789 (IN_DEV_SHARED_MEDIA(out_dev) ||
1790 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1791 flags |= RTCF_DOREDIRECT;
1793 if (skb->protocol != htons(ETH_P_IP)) {
1794 /* Not IP (i.e. ARP). Do not create route, if it is
1795 * invalid for proxy arp. DNAT routes are always valid.
1797 if (out_dev == in_dev) {
1798 err = -EINVAL;
1799 goto cleanup;
1804 rth = dst_alloc(&ipv4_dst_ops);
1805 if (!rth) {
1806 err = -ENOBUFS;
1807 goto cleanup;
1810 atomic_set(&rth->u.dst.__refcnt, 1);
1811 rth->u.dst.flags= DST_HOST;
1812 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1813 rth->u.dst.flags |= DST_NOPOLICY;
1814 if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1815 rth->u.dst.flags |= DST_NOXFRM;
1816 rth->fl.fl4_dst = daddr;
1817 rth->rt_dst = daddr;
1818 rth->fl.fl4_tos = tos;
1819 rth->fl.mark = skb->mark;
1820 rth->fl.fl4_src = saddr;
1821 rth->rt_src = saddr;
1822 rth->rt_gateway = daddr;
1823 rth->rt_iif =
1824 rth->fl.iif = in_dev->dev->ifindex;
1825 rth->u.dst.dev = (out_dev)->dev;
1826 dev_hold(rth->u.dst.dev);
1827 rth->idev = in_dev_get(rth->u.dst.dev);
1828 rth->fl.oif = 0;
1829 rth->rt_spec_dst= spec_dst;
1831 rth->u.dst.input = ip_forward;
1832 rth->u.dst.output = ip_output;
1833 rth->rt_genid = atomic_read(&rt_genid);
1835 rt_set_nexthop(rth, res, itag);
1837 rth->rt_flags = flags;
1839 *result = rth;
1840 err = 0;
1841 cleanup:
1842 /* release the working reference to the output device */
1843 in_dev_put(out_dev);
1844 return err;
1847 static int ip_mkroute_input(struct sk_buff *skb,
1848 struct fib_result *res,
1849 const struct flowi *fl,
1850 struct in_device *in_dev,
1851 __be32 daddr, __be32 saddr, u32 tos)
1853 struct rtable* rth = NULL;
1854 int err;
1855 unsigned hash;
1857 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1858 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1859 fib_select_multipath(fl, res);
1860 #endif
1862 /* create a routing cache entry */
1863 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1864 if (err)
1865 return err;
1867 /* put it into the cache */
1868 hash = rt_hash(daddr, saddr, fl->iif);
1869 return rt_intern_hash(hash, rth, &skb->rtable);
1873 * NOTE. We drop all packets that have local source
1874 * addresses, because every properly looped-back packet
1875 * must have the correct destination already attached by the output routine.
1877 * Such an approach solves two big problems:
1878 * 1. Non-simplex devices are handled properly.
1879 * 2. IP spoofing attempts are filtered with a 100% guarantee.
1882 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1883 u8 tos, struct net_device *dev)
1885 struct fib_result res;
1886 struct in_device *in_dev = in_dev_get(dev);
1887 struct flowi fl = { .nl_u = { .ip4_u =
1888 { .daddr = daddr,
1889 .saddr = saddr,
1890 .tos = tos,
1891 .scope = RT_SCOPE_UNIVERSE,
1892 } },
1893 .mark = skb->mark,
1894 .iif = dev->ifindex };
1895 unsigned flags = 0;
1896 u32 itag = 0;
1897 struct rtable * rth;
1898 unsigned hash;
1899 __be32 spec_dst;
1900 int err = -EINVAL;
1901 int free_res = 0;
1902 struct net * net = dev_net(dev);
1904 /* IP on this device is disabled. */
1906 if (!in_dev)
1907 goto out;
1909 /* Check for the most weird martians, which cannot be detected
1910 by fib_lookup.
1913 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1914 ipv4_is_loopback(saddr))
1915 goto martian_source;
1917 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1918 goto brd_input;
1920 /* Accept zero addresses only to limited broadcast;
1921 * I do not even know whether to fix it or not. Waiting for complaints :-)
1923 if (ipv4_is_zeronet(saddr))
1924 goto martian_source;
1926 if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
1927 ipv4_is_loopback(daddr))
1928 goto martian_destination;
1931 * Now we are ready to route packet.
1933 if ((err = fib_lookup(net, &fl, &res)) != 0) {
1934 if (!IN_DEV_FORWARD(in_dev))
1935 goto e_hostunreach;
1936 goto no_route;
1938 free_res = 1;
1940 RT_CACHE_STAT_INC(in_slow_tot);
1942 if (res.type == RTN_BROADCAST)
1943 goto brd_input;
1945 if (res.type == RTN_LOCAL) {
1946 int result;
1947 result = fib_validate_source(saddr, daddr, tos,
1948 net->loopback_dev->ifindex,
1949 dev, &spec_dst, &itag);
1950 if (result < 0)
1951 goto martian_source;
1952 if (result)
1953 flags |= RTCF_DIRECTSRC;
1954 spec_dst = daddr;
1955 goto local_input;
1958 if (!IN_DEV_FORWARD(in_dev))
1959 goto e_hostunreach;
1960 if (res.type != RTN_UNICAST)
1961 goto martian_destination;
1963 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1964 done:
1965 in_dev_put(in_dev);
1966 if (free_res)
1967 fib_res_put(&res);
1968 out: return err;
1970 brd_input:
1971 if (skb->protocol != htons(ETH_P_IP))
1972 goto e_inval;
1974 if (ipv4_is_zeronet(saddr))
1975 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1976 else {
1977 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1978 &itag);
1979 if (err < 0)
1980 goto martian_source;
1981 if (err)
1982 flags |= RTCF_DIRECTSRC;
1984 flags |= RTCF_BROADCAST;
1985 res.type = RTN_BROADCAST;
1986 RT_CACHE_STAT_INC(in_brd);
1988 local_input:
1989 rth = dst_alloc(&ipv4_dst_ops);
1990 if (!rth)
1991 goto e_nobufs;
1993 rth->u.dst.output= ip_rt_bug;
1994 rth->rt_genid = atomic_read(&rt_genid);
1996 atomic_set(&rth->u.dst.__refcnt, 1);
1997 rth->u.dst.flags= DST_HOST;
1998 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1999 rth->u.dst.flags |= DST_NOPOLICY;
2000 rth->fl.fl4_dst = daddr;
2001 rth->rt_dst = daddr;
2002 rth->fl.fl4_tos = tos;
2003 rth->fl.mark = skb->mark;
2004 rth->fl.fl4_src = saddr;
2005 rth->rt_src = saddr;
2006 #ifdef CONFIG_NET_CLS_ROUTE
2007 rth->u.dst.tclassid = itag;
2008 #endif
2009 rth->rt_iif =
2010 rth->fl.iif = dev->ifindex;
2011 rth->u.dst.dev = net->loopback_dev;
2012 dev_hold(rth->u.dst.dev);
2013 rth->idev = in_dev_get(rth->u.dst.dev);
2014 rth->rt_gateway = daddr;
2015 rth->rt_spec_dst= spec_dst;
2016 rth->u.dst.input= ip_local_deliver;
2017 rth->rt_flags = flags|RTCF_LOCAL;
2018 if (res.type == RTN_UNREACHABLE) {
2019 rth->u.dst.input= ip_error;
2020 rth->u.dst.error= -err;
2021 rth->rt_flags &= ~RTCF_LOCAL;
2023 rth->rt_type = res.type;
2024 hash = rt_hash(daddr, saddr, fl.iif);
2025 err = rt_intern_hash(hash, rth, &skb->rtable);
2026 goto done;
2028 no_route:
2029 RT_CACHE_STAT_INC(in_no_route);
2030 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2031 res.type = RTN_UNREACHABLE;
2032 if (err == -ESRCH)
2033 err = -ENETUNREACH;
2034 goto local_input;
2037 * Do not cache martian addresses: they should be logged (RFC1812)
2039 martian_destination:
2040 RT_CACHE_STAT_INC(in_martian_dst);
2041 #ifdef CONFIG_IP_ROUTE_VERBOSE
2042 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2043 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2044 "%u.%u.%u.%u, dev %s\n",
2045 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2046 #endif
2048 e_hostunreach:
2049 err = -EHOSTUNREACH;
2050 goto done;
2052 e_inval:
2053 err = -EINVAL;
2054 goto done;
2056 e_nobufs:
2057 err = -ENOBUFS;
2058 goto done;
2060 martian_source:
2061 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2062 goto e_inval;
2065 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2066 u8 tos, struct net_device *dev)
2068 struct rtable * rth;
2069 unsigned hash;
2070 int iif = dev->ifindex;
2071 struct net *net;
2073 net = dev_net(dev);
2074 tos &= IPTOS_RT_MASK;
2075 hash = rt_hash(daddr, saddr, iif);
2077 rcu_read_lock();
2078 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2079 rth = rcu_dereference(rth->u.dst.rt_next)) {
2080 if (rth->fl.fl4_dst == daddr &&
2081 rth->fl.fl4_src == saddr &&
2082 rth->fl.iif == iif &&
2083 rth->fl.oif == 0 &&
2084 rth->fl.mark == skb->mark &&
2085 rth->fl.fl4_tos == tos &&
2086 net_eq(dev_net(rth->u.dst.dev), net) &&
2087 rth->rt_genid == atomic_read(&rt_genid)) {
2088 dst_use(&rth->u.dst, jiffies);
2089 RT_CACHE_STAT_INC(in_hit);
2090 rcu_read_unlock();
2091 skb->rtable = rth;
2092 return 0;
2094 RT_CACHE_STAT_INC(in_hlist_search);
2096 rcu_read_unlock();
2098 /* Multicast recognition logic is moved from route cache to here.
2099 The problem was that too many Ethernet cards have broken/missing
2100 hardware multicast filters :-( As a result a host on a multicast
2101 network acquires a lot of useless route cache entries, sort of
2102 SDR messages from all over the world. Now we try to get rid of them.
2103 Really, provided the software IP multicast filter is organized
2104 reasonably (at least, hashed), it does not result in a slowdown
2105 compared with route cache reject entries.
2106 Note that multicast routers are not affected, because
2107 a route cache entry is created eventually.
2109 if (ipv4_is_multicast(daddr)) {
2110 struct in_device *in_dev;
2112 rcu_read_lock();
2113 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2114 int our = ip_check_mc(in_dev, daddr, saddr,
2115 ip_hdr(skb)->protocol);
2116 if (our
2117 #ifdef CONFIG_IP_MROUTE
2118 || (!ipv4_is_local_multicast(daddr) &&
2119 IN_DEV_MFORWARD(in_dev))
2120 #endif
2122 rcu_read_unlock();
2123 return ip_route_input_mc(skb, daddr, saddr,
2124 tos, dev, our);
2127 rcu_read_unlock();
2128 return -EINVAL;
2130 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
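/*
 * ip_route_input() is the fast path for received packets: a lock-free RCU
 * walk of one hash chain, matching the full flow key plus netns and
 * rt_genid.  On a miss, multicast destinations are special-cased here (to
 * avoid filling the cache from hosts with broken hardware multicast
 * filters, as explained above) and everything else falls through to
 * ip_route_input_slow(), which consults the FIB and inserts a new entry.
 */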
static int __mkroute_output(struct rtable **result,
			    struct fib_result *res,
			    const struct flowi *fl,
			    const struct flowi *oldflp,
			    struct net_device *dev_out,
			    unsigned flags)
{
	struct rtable *rth;
	struct in_device *in_dev;
	u32 tos = RT_FL_TOS(oldflp);
	int err = 0;

	if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
		return -EINVAL;

	if (fl->fl4_dst == htonl(0xFFFFFFFF))
		res->type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl->fl4_dst))
		res->type = RTN_MULTICAST;
	else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
		return -EINVAL;

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	/* get work reference to inet device */
	in_dev = in_dev_get(dev_out);
	if (!in_dev)
		return -EINVAL;

	if (res->type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		if (res->fi) {
			fib_info_put(res->fi);
			res->fi = NULL;
		}
	} else if (res->type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
				 oldflp->proto))
			flags &= ~RTCF_LOCAL;
		/* If the multicast route does not exist, use the default
		   one, but do not gateway in this case.
		   Yes, it is a hack.
		 */
		if (res->fi && res->prefixlen < 4) {
			fib_info_put(res->fi);
			res->fi = NULL;
		}
	}

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags = DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
		rth->u.dst.flags |= DST_NOXFRM;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;

	rth->fl.fl4_dst	= oldflp->fl4_dst;
	rth->fl.fl4_tos	= tos;
	rth->fl.fl4_src	= oldflp->fl4_src;
	rth->fl.oif	= oldflp->oif;
	rth->fl.mark	= oldflp->mark;
	rth->rt_dst	= fl->fl4_dst;
	rth->rt_src	= fl->fl4_src;
	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
	/* get references to the devices that are to be held by the routing
	   cache entry */
	rth->u.dst.dev	= dev_out;
	dev_hold(dev_out);
	rth->idev	= in_dev_get(dev_out);
	rth->rt_gateway = fl->fl4_dst;
	rth->rt_spec_dst = fl->fl4_src;

	rth->u.dst.output = ip_output;
	rth->rt_genid = atomic_read(&rt_genid);

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->u.dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl->fl4_dst;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl->fl4_src;
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->u.dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (res->type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(oldflp->fl4_dst)) {
				rth->u.dst.input = ip_mr_input;
				rth->u.dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, res, 0);

	rth->rt_flags = flags;

	*result = rth;
 cleanup:
	/* release work reference to inet device */
	in_dev_put(in_dev);

	return err;
}
static int ip_mkroute_output(struct rtable **rp,
			     struct fib_result *res,
			     const struct flowi *fl,
			     const struct flowi *oldflp,
			     struct net_device *dev_out,
			     unsigned flags)
{
	struct rtable *rth = NULL;
	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
	unsigned hash;

	if (err == 0) {
		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
		err = rt_intern_hash(hash, rth, rp);
	}

	return err;
}
/*
 * Major route resolver routine.
 */

static int ip_route_output_slow(struct net *net, struct rtable **rp,
				const struct flowi *oldflp)
{
	u32 tos	= RT_FL_TOS(oldflp);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = oldflp->fl4_dst,
					.saddr = oldflp->fl4_src,
					.tos = tos & IPTOS_RT_MASK,
					.scope = ((tos & RTO_ONLINK) ?
						  RT_SCOPE_LINK :
						  RT_SCOPE_UNIVERSE),
				      } },
			    .mark = oldflp->mark,
			    .iif = net->loopback_dev->ifindex,
			    .oif = oldflp->oif };
	struct fib_result res;
	unsigned flags = 0;
	struct net_device *dev_out = NULL;
	int free_res = 0;
	int err;

	res.fi		= NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r		= NULL;
#endif

	if (oldflp->fl4_src) {
		err = -EINVAL;
		if (ipv4_is_multicast(oldflp->fl4_src) ||
		    ipv4_is_lbcast(oldflp->fl4_src) ||
		    ipv4_is_zeronet(oldflp->fl4_src))
			goto out;

		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
		dev_out = ip_dev_find(net, oldflp->fl4_src);
		if (dev_out == NULL)
			goto out;

		/* I removed the check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return the wrong iface if
		      saddr is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with saddr
		      of another iface. --ANK
		 */

		if (oldflp->oif == 0
		    && (ipv4_is_multicast(oldflp->fl4_dst) ||
			oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
			/* Special hack: a user can direct multicasts
			   and limited broadcast via the necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic, vat and friends to work.
			   They bind a socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of the routing cache they are
			   broken, because we are not allowed to build a
			   multicast path with a loopback source addr (look,
			   the routing cache cannot know that ttl is zero,
			   so the packet will not leave this host and the
			   route is valid).  Luckily, this hack is a good
			   workaround.
			 */

			fl.oif = dev_out->ifindex;
			goto make_route;
		}
		if (dev_out)
			dev_put(dev_out);
		dev_out = NULL;
	}

	if (oldflp->oif) {
		dev_out = dev_get_by_index(net, oldflp->oif);
		err = -ENODEV;
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (__in_dev_get_rtnl(dev_out) == NULL) {
			dev_put(dev_out);
			goto out;	/* Wrong error code */
		}

		if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
		    oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
			if (!fl.fl4_src)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl.fl4_src) {
			if (ipv4_is_multicast(oldflp->fl4_dst))
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      fl.fl4_scope);
			else if (!oldflp->fl4_dst)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl.fl4_dst) {
		fl.fl4_dst = fl.fl4_src;
		if (!fl.fl4_dst)
			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
		if (dev_out)
			dev_put(dev_out);
		dev_out = net->loopback_dev;
		dev_hold(dev_out);
		fl.oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(net, &fl, &res)) {
		res.fi = NULL;
		if (oldflp->oif) {
			/* Apparently, the routing tables are wrong.  Assume
			   that the destination is on-link.

			   WHY? DW.
			   Because we are allowed to send to an iface
			   even if it has NO routes and NO assigned
			   addresses.  When oif is specified, the routing
			   tables are looked up with only one purpose:
			   to catch if the destination is gatewayed, rather
			   than direct.  Moreover, if MSG_DONTROUTE is set,
			   we send the packet, ignoring both the routing
			   tables and the ifaddr state. --ANK

			   We could do this even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl.fl4_src == 0)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		if (dev_out)
			dev_put(dev_out);
		err = -ENETUNREACH;
		goto out;
	}
	free_res = 1;

	if (res.type == RTN_LOCAL) {
		if (!fl.fl4_src)
			fl.fl4_src = fl.fl4_dst;
		if (dev_out)
			dev_put(dev_out);
		dev_out = net->loopback_dev;
		dev_hold(dev_out);
		fl.oif = dev_out->ifindex;
		if (res.fi)
			fib_info_put(res.fi);
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl.oif == 0)
		fib_select_multipath(&fl, &res);
	else
#endif
	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
		fib_select_default(net, &fl, &res);

	if (!fl.fl4_src)
		fl.fl4_src = FIB_RES_PREFSRC(res);

	if (dev_out)
		dev_put(dev_out);
	dev_out = FIB_RES_DEV(res);
	dev_hold(dev_out);
	fl.oif = dev_out->ifindex;

make_route:
	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);

	if (free_res)
		fib_res_put(&res);
	if (dev_out)
		dev_put(dev_out);
out:	return err;
}
int __ip_route_output_key(struct net *net, struct rtable **rp,
			  const struct flowi *flp)
{
	unsigned hash;
	struct rtable *rth;

	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);

	rcu_read_lock_bh();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->u.dst.rt_next)) {
		if (rth->fl.fl4_dst == flp->fl4_dst &&
		    rth->fl.fl4_src == flp->fl4_src &&
		    rth->fl.iif == 0 &&
		    rth->fl.oif == flp->oif &&
		    rth->fl.mark == flp->mark &&
		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
		      (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->u.dst.dev), net) &&
		    rth->rt_genid == atomic_read(&rt_genid)) {
			dst_use(&rth->u.dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			*rp = rth;
			return 0;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

	return ip_route_output_slow(net, rp, flp);
}

EXPORT_SYMBOL_GPL(__ip_route_output_key);
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family		= AF_INET,
	.protocol	= __constant_htons(ETH_P_IP),
	.destroy	= ipv4_dst_destroy,
	.check		= ipv4_dst_check,
	.update_pmtu	= ipv4_rt_blackhole_update_pmtu,
	.entry_size	= sizeof(struct rtable),
	.entries	= ATOMIC_INIT(0),
};
static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp)
{
	struct rtable *ort = *rp;
	struct rtable *rt = (struct rtable *)
		dst_alloc(&ipv4_dst_blackhole_ops);

	if (rt) {
		struct dst_entry *new = &rt->u.dst;

		atomic_set(&new->__refcnt, 1);
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard;
		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));

		new->dev = ort->u.dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->fl = ort->fl;

		rt->idev = ort->idev;
		if (rt->idev)
			in_dev_hold(rt->idev);
		rt->rt_genid = atomic_read(&rt_genid);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_iif = ort->rt_iif;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);

		dst_free(new);
	}

	dst_release(&(*rp)->u.dst);
	*rp = rt;
	return (rt ? 0 : -ENOMEM);
}
int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
			 struct sock *sk, int flags)
{
	int err;

	if ((err = __ip_route_output_key(net, rp, flp)) != 0)
		return err;

	if (flp->proto) {
		if (!flp->fl4_src)
			flp->fl4_src = (*rp)->rt_src;
		if (!flp->fl4_dst)
			flp->fl4_dst = (*rp)->rt_dst;
		err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
				    flags ? XFRM_LOOKUP_WAIT : 0);
		if (err == -EREMOTE)
			err = ipv4_dst_blackhole(rp, flp);

		return err;
	}

	return 0;
}

EXPORT_SYMBOL_GPL(ip_route_output_flow);

int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
{
	return ip_route_output_flow(net, rp, flp, NULL, 0);
}
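/*
 * Illustrative sketch (an assumed typical caller, not code from this file):
 * output lookups are keyed by a struct flowi; the caller fills the flow and
 * drops the route reference with ip_rt_put() when it is done with it.
 */
#if 0
static int example_output_route(struct net *net, __be32 daddr, __be32 saddr)
{
	struct rtable *rt;
	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = daddr,
						 .saddr = saddr } } };
	int err = ip_route_output_key(net, &rt, &fl);

	if (err)
		return err;
	/* ... transmit via rt->u.dst ... */
	ip_rt_put(rt);
	return 0;
}
#endif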
static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb->rtable;
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	long expires;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	= AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->fl.fl4_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->fl.fl4_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
	}
	if (rt->u.dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
	if (rt->u.dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
#endif
	if (rt->fl.iif)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->fl.fl4_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
		goto nla_put_failure;

	error = rt->u.dst.error;
	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
	if (rt->peer) {
		id = rt->peer->ip_id_count;
		if (rt->peer->tcp_ts_stamp) {
			ts = rt->peer->tcp_ts;
			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
		}
	}

	if (rt->fl.iif) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
			int err = ipmr_get_route(skb, r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers; this skb can pass
	   through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb->rtable;
		if (err == 0 && rt->u.dst.error)
			err = -rt->u.dst.error;
	} else {
		struct flowi fl = {
			.nl_u = {
				.ip4_u = {
					.daddr = dst,
					.saddr = src,
					.tos = rtm->rtm_tos,
				},
			},
			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
		};
		err = ip_route_output_key(net, &rt, &fl);
	}

	if (err)
		goto errout_free;

	skb->rtable = rt;
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
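/*
 * Usage note (illustrative, not from the original sources): this handler
 * serves single-route queries from userspace; for example, "ip route get
 * 198.51.100.1" sends an RTM_GETROUTE request with RTA_DST set and receives
 * the RTM_NEWROUTE reply built by rt_fill_info() above.
 */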
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++) {
		rcu_read_lock_bh();
		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
			if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
				continue;
			if (rt->rt_genid != atomic_read(&rt_genid))
				continue;
			skb->dst = dst_clone(&rt->u.dst);
			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				dst_release(xchg(&skb->dst, NULL));
				rcu_read_unlock_bh();
				goto done;
			}
			dst_release(xchg(&skb->dst, NULL));
		}
		rcu_read_unlock_bh();
		s_idx = 0;
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(0);
}
#ifdef CONFIG_SYSCTL
static int flush_delay;

static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
					struct file *filp, void __user *buffer,
					size_t *lenp, loff_t *ppos)
{
	if (write) {
		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
		rt_cache_flush(flush_delay);
		return 0;
	}

	return -EINVAL;
}

static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
						int __user *name,
						int nlen,
						void __user *oldval,
						size_t __user *oldlenp,
						void __user *newval,
						size_t newlen)
{
	int delay;

	if (newlen != sizeof(int))
		return -EINVAL;
	if (get_user(delay, (int __user *)newval))
		return -EFAULT;
	rt_cache_flush(delay);
	return 0;
}
ctl_table ipv4_route_table[] = {
	{
		.ctl_name	= NET_IPV4_ROUTE_FLUSH,
		.procname	= "flush",
		.data		= &flush_delay,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= &ipv4_sysctl_rtcache_flush,
		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */
		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_ms_jiffies,
		.strategy	= &sysctl_ms_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
		.procname	= "secret_interval",
		.data		= &ip_rt_secret_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{ .ctl_name = 0 }
};
#endif
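/*
 * Usage note (illustrative; the values are arbitrary examples): these entries
 * show up under /proc/sys/net/ipv4/route/ and as net.ipv4.route.* sysctls,
 * e.g.:
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 *	sysctl -w net.ipv4.route.gc_timeout=300
 *	sysctl -w net.ipv4.route.secret_interval=600
 *
 * Writing to "flush" runs ipv4_sysctl_rtcache_flush() above, which calls
 * rt_cache_flush() with the written delay (in seconds); the *_jiffies
 * handlers convert between seconds and jiffies.
 */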
#ifdef CONFIG_NET_CLS_ROUTE
struct ip_rt_acct *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */

static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);
int __init ip_rt_init(void)
{
	int rc = 0;

	atomic_set(&rt_genid, (int) ((num_physpages ^ (num_physpages>>8)) ^
			     (jiffies ^ (jiffies >> 7))));

#ifdef CONFIG_NET_CLS_ROUTE
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(num_physpages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					0);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	rt_secret_timer.function = rt_secret_rebuild;
	rt_secret_timer.data = 0;
	init_timer_deferrable(&rt_secret_timer);

	/* All the timers started at system startup tend
	   to synchronize.  Perturb them a bit.
	 */
	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
		ip_rt_secret_interval;
	add_timer(&rt_secret_timer);

	if (ip_rt_proc_init())
		printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);

	return rc;
}
EXPORT_SYMBOL(__ip_select_ident);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(ip_route_output_key);