net: Fix various endianness glitches
[linux-2.6/libata-dev.git] / net / ipv4 / route.c
blob a947428ef0aecac3e606f3eaf67b74d0bd398a82
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <net/dst.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/netevent.h>
107 #include <net/rtnetlink.h>
108 #ifdef CONFIG_SYSCTL
109 #include <linux/sysctl.h>
110 #endif
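/* RT_FL_TOS() extracts the routing-relevant TOS bits from a flow key,
 * together with the RTO_ONLINK flag that is carried in the same field. */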
112 #define RT_FL_TOS(oldflp) \
113 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
115 #define IP_MAX_MTU 0xFFF0
117 #define RT_GC_TIMEOUT (300*HZ)
119 static int ip_rt_max_size;
120 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
121 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
122 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
123 static int ip_rt_redirect_number __read_mostly = 9;
124 static int ip_rt_redirect_load __read_mostly = HZ / 50;
125 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
126 static int ip_rt_error_cost __read_mostly = HZ;
127 static int ip_rt_error_burst __read_mostly = 5 * HZ;
128 static int ip_rt_gc_elasticity __read_mostly = 8;
129 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
130 static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
131 static int ip_rt_min_advmss __read_mostly = 256;
132 static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ;
133 static int rt_chain_length_max __read_mostly = 20;
135 static struct delayed_work expires_work;
136 static unsigned long expires_ljiffies;
139 * Interface to generic destination cache.
142 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
143 static void ipv4_dst_destroy(struct dst_entry *dst);
144 static void ipv4_dst_ifdown(struct dst_entry *dst,
145 struct net_device *dev, int how);
146 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
147 static void ipv4_link_failure(struct sk_buff *skb);
148 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
149 static int rt_garbage_collect(struct dst_ops *ops);
152 static struct dst_ops ipv4_dst_ops = {
153 .family = AF_INET,
154 .protocol = cpu_to_be16(ETH_P_IP),
155 .gc = rt_garbage_collect,
156 .check = ipv4_dst_check,
157 .destroy = ipv4_dst_destroy,
158 .ifdown = ipv4_dst_ifdown,
159 .negative_advice = ipv4_negative_advice,
160 .link_failure = ipv4_link_failure,
161 .update_pmtu = ip_rt_update_pmtu,
162 .local_out = __ip_local_out,
163 .entries = ATOMIC_INIT(0),
166 #define ECN_OR_COST(class) TC_PRIO_##class
168 const __u8 ip_tos2prio[16] = {
169 TC_PRIO_BESTEFFORT,
170 ECN_OR_COST(FILLER),
171 TC_PRIO_BESTEFFORT,
172 ECN_OR_COST(BESTEFFORT),
173 TC_PRIO_BULK,
174 ECN_OR_COST(BULK),
175 TC_PRIO_BULK,
176 ECN_OR_COST(BULK),
177 TC_PRIO_INTERACTIVE,
178 ECN_OR_COST(INTERACTIVE),
179 TC_PRIO_INTERACTIVE,
180 ECN_OR_COST(INTERACTIVE),
181 TC_PRIO_INTERACTIVE_BULK,
182 ECN_OR_COST(INTERACTIVE_BULK),
183 TC_PRIO_INTERACTIVE_BULK,
184 ECN_OR_COST(INTERACTIVE_BULK)
189 * Route cache.
192 /* The locking scheme is rather straightforward:
194 * 1) Read-Copy Update protects the buckets of the central route hash.
195 * 2) Only writers remove entries, and they hold the lock
196 * as they look at rtable reference counts.
197 * 3) Only readers acquire references to rtable entries,
198 * they do so with atomic increments and with the
199 * lock held.
202 struct rt_hash_bucket {
203 struct rtable *chain;
206 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
207 defined(CONFIG_PROVE_LOCKING)
209 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
210 * The size of this table is a power of two and depends on the number of CPUs.
211 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
213 #ifdef CONFIG_LOCKDEP
214 # define RT_HASH_LOCK_SZ 256
215 #else
216 # if NR_CPUS >= 32
217 # define RT_HASH_LOCK_SZ 4096
218 # elif NR_CPUS >= 16
219 # define RT_HASH_LOCK_SZ 2048
220 # elif NR_CPUS >= 8
221 # define RT_HASH_LOCK_SZ 1024
222 # elif NR_CPUS >= 4
223 # define RT_HASH_LOCK_SZ 512
224 # else
225 # define RT_HASH_LOCK_SZ 256
226 # endif
227 #endif
229 static spinlock_t *rt_hash_locks;
230 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
232 static __init void rt_hash_lock_init(void)
234 int i;
236 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
237 GFP_KERNEL);
238 if (!rt_hash_locks)
239 panic("IP: failed to allocate rt_hash_locks\n");
241 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
242 spin_lock_init(&rt_hash_locks[i]);
244 #else
245 # define rt_hash_lock_addr(slot) NULL
247 static inline void rt_hash_lock_init(void)
250 #endif
252 static struct rt_hash_bucket *rt_hash_table __read_mostly;
253 static unsigned rt_hash_mask __read_mostly;
254 static unsigned int rt_hash_log __read_mostly;
256 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
257 #define RT_CACHE_STAT_INC(field) \
258 (__raw_get_cpu_var(rt_cache_stat).field++)
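/* The cache hash mixes both addresses, the interface index and the
 * per-namespace generation id through jhash; bumping the genid therefore
 * re-keys lookups and effectively invalidates the whole cache. */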
260 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
261 int genid)
263 return jhash_3words((__force u32)daddr, (__force u32)saddr,
264 idx, genid)
265 & rt_hash_mask;
268 static inline int rt_genid(struct net *net)
270 return atomic_read(&net->ipv4.rt_genid);
273 #ifdef CONFIG_PROC_FS
274 struct rt_cache_iter_state {
275 struct seq_net_private p;
276 int bucket;
277 int genid;
280 static struct rtable *rt_cache_get_first(struct seq_file *seq)
282 struct rt_cache_iter_state *st = seq->private;
283 struct rtable *r = NULL;
285 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
286 if (!rt_hash_table[st->bucket].chain)
287 continue;
288 rcu_read_lock_bh();
289 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
290 while (r) {
291 if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
292 r->rt_genid == st->genid)
293 return r;
294 r = rcu_dereference_bh(r->u.dst.rt_next);
296 rcu_read_unlock_bh();
298 return r;
301 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
302 struct rtable *r)
304 struct rt_cache_iter_state *st = seq->private;
306 r = r->u.dst.rt_next;
307 while (!r) {
308 rcu_read_unlock_bh();
309 do {
310 if (--st->bucket < 0)
311 return NULL;
312 } while (!rt_hash_table[st->bucket].chain);
313 rcu_read_lock_bh();
314 r = rt_hash_table[st->bucket].chain;
316 return rcu_dereference_bh(r);
319 static struct rtable *rt_cache_get_next(struct seq_file *seq,
320 struct rtable *r)
322 struct rt_cache_iter_state *st = seq->private;
323 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
324 if (dev_net(r->u.dst.dev) != seq_file_net(seq))
325 continue;
326 if (r->rt_genid == st->genid)
327 break;
329 return r;
332 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
334 struct rtable *r = rt_cache_get_first(seq);
336 if (r)
337 while (pos && (r = rt_cache_get_next(seq, r)))
338 --pos;
339 return pos ? NULL : r;
342 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
344 struct rt_cache_iter_state *st = seq->private;
345 if (*pos)
346 return rt_cache_get_idx(seq, *pos - 1);
347 st->genid = rt_genid(seq_file_net(seq));
348 return SEQ_START_TOKEN;
351 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
353 struct rtable *r;
355 if (v == SEQ_START_TOKEN)
356 r = rt_cache_get_first(seq);
357 else
358 r = rt_cache_get_next(seq, v);
359 ++*pos;
360 return r;
363 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
365 if (v && v != SEQ_START_TOKEN)
366 rcu_read_unlock_bh();
369 static int rt_cache_seq_show(struct seq_file *seq, void *v)
371 if (v == SEQ_START_TOKEN)
372 seq_printf(seq, "%-127s\n",
373 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
374 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
375 "HHUptod\tSpecDst");
376 else {
377 struct rtable *r = v;
378 int len;
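/* rt_dst, rt_src and the flow addresses are __be32; the (__force u32)
 * casts below print their raw network-byte-order values with %X without
 * triggering sparse endianness warnings. */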
380 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
381 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
382 r->u.dst.dev ? r->u.dst.dev->name : "*",
383 (__force u32)r->rt_dst,
384 (__force u32)r->rt_gateway,
385 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
386 r->u.dst.__use, 0, (__force u32)r->rt_src,
387 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
388 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
389 dst_metric(&r->u.dst, RTAX_WINDOW),
390 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
391 dst_metric(&r->u.dst, RTAX_RTTVAR)),
392 r->fl.fl4_tos,
393 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
394 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
395 dev_queue_xmit) : 0,
396 r->rt_spec_dst, &len);
398 seq_printf(seq, "%*s\n", 127 - len, "");
400 return 0;
403 static const struct seq_operations rt_cache_seq_ops = {
404 .start = rt_cache_seq_start,
405 .next = rt_cache_seq_next,
406 .stop = rt_cache_seq_stop,
407 .show = rt_cache_seq_show,
410 static int rt_cache_seq_open(struct inode *inode, struct file *file)
412 return seq_open_net(inode, file, &rt_cache_seq_ops,
413 sizeof(struct rt_cache_iter_state));
416 static const struct file_operations rt_cache_seq_fops = {
417 .owner = THIS_MODULE,
418 .open = rt_cache_seq_open,
419 .read = seq_read,
420 .llseek = seq_lseek,
421 .release = seq_release_net,
425 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
427 int cpu;
429 if (*pos == 0)
430 return SEQ_START_TOKEN;
432 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
433 if (!cpu_possible(cpu))
434 continue;
435 *pos = cpu+1;
436 return &per_cpu(rt_cache_stat, cpu);
438 return NULL;
441 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
443 int cpu;
445 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
446 if (!cpu_possible(cpu))
447 continue;
448 *pos = cpu+1;
449 return &per_cpu(rt_cache_stat, cpu);
451 return NULL;
455 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
460 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
462 struct rt_cache_stat *st = v;
464 if (v == SEQ_START_TOKEN) {
465 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
466 return 0;
469 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
470 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
471 atomic_read(&ipv4_dst_ops.entries),
472 st->in_hit,
473 st->in_slow_tot,
474 st->in_slow_mc,
475 st->in_no_route,
476 st->in_brd,
477 st->in_martian_dst,
478 st->in_martian_src,
480 st->out_hit,
481 st->out_slow_tot,
482 st->out_slow_mc,
484 st->gc_total,
485 st->gc_ignored,
486 st->gc_goal_miss,
487 st->gc_dst_overflow,
488 st->in_hlist_search,
489 st->out_hlist_search
491 return 0;
494 static const struct seq_operations rt_cpu_seq_ops = {
495 .start = rt_cpu_seq_start,
496 .next = rt_cpu_seq_next,
497 .stop = rt_cpu_seq_stop,
498 .show = rt_cpu_seq_show,
502 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
504 return seq_open(file, &rt_cpu_seq_ops);
507 static const struct file_operations rt_cpu_seq_fops = {
508 .owner = THIS_MODULE,
509 .open = rt_cpu_seq_open,
510 .read = seq_read,
511 .llseek = seq_lseek,
512 .release = seq_release,
515 #ifdef CONFIG_NET_CLS_ROUTE
516 static int rt_acct_proc_show(struct seq_file *m, void *v)
518 struct ip_rt_acct *dst, *src;
519 unsigned int i, j;
521 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
522 if (!dst)
523 return -ENOMEM;
525 for_each_possible_cpu(i) {
526 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
527 for (j = 0; j < 256; j++) {
528 dst[j].o_bytes += src[j].o_bytes;
529 dst[j].o_packets += src[j].o_packets;
530 dst[j].i_bytes += src[j].i_bytes;
531 dst[j].i_packets += src[j].i_packets;
535 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
536 kfree(dst);
537 return 0;
540 static int rt_acct_proc_open(struct inode *inode, struct file *file)
542 return single_open(file, rt_acct_proc_show, NULL);
545 static const struct file_operations rt_acct_proc_fops = {
546 .owner = THIS_MODULE,
547 .open = rt_acct_proc_open,
548 .read = seq_read,
549 .llseek = seq_lseek,
550 .release = single_release,
552 #endif
554 static int __net_init ip_rt_do_proc_init(struct net *net)
556 struct proc_dir_entry *pde;
558 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
559 &rt_cache_seq_fops);
560 if (!pde)
561 goto err1;
563 pde = proc_create("rt_cache", S_IRUGO,
564 net->proc_net_stat, &rt_cpu_seq_fops);
565 if (!pde)
566 goto err2;
568 #ifdef CONFIG_NET_CLS_ROUTE
569 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
570 if (!pde)
571 goto err3;
572 #endif
573 return 0;
575 #ifdef CONFIG_NET_CLS_ROUTE
576 err3:
577 remove_proc_entry("rt_cache", net->proc_net_stat);
578 #endif
579 err2:
580 remove_proc_entry("rt_cache", net->proc_net);
581 err1:
582 return -ENOMEM;
585 static void __net_exit ip_rt_do_proc_exit(struct net *net)
587 remove_proc_entry("rt_cache", net->proc_net_stat);
588 remove_proc_entry("rt_cache", net->proc_net);
589 #ifdef CONFIG_NET_CLS_ROUTE
590 remove_proc_entry("rt_acct", net->proc_net);
591 #endif
594 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
595 .init = ip_rt_do_proc_init,
596 .exit = ip_rt_do_proc_exit,
599 static int __init ip_rt_proc_init(void)
601 return register_pernet_subsys(&ip_rt_proc_ops);
604 #else
605 static inline int ip_rt_proc_init(void)
607 return 0;
609 #endif /* CONFIG_PROC_FS */
611 static inline void rt_free(struct rtable *rt)
613 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
616 static inline void rt_drop(struct rtable *rt)
618 ip_rt_put(rt);
619 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
622 static inline int rt_fast_clean(struct rtable *rth)
624 /* Kill broadcast/multicast entries very aggressively, if they
625 collide in the hash table with more useful entries */
626 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
627 rth->fl.iif && rth->u.dst.rt_next;
630 static inline int rt_valuable(struct rtable *rth)
632 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
633 rth->u.dst.expires;
636 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
638 unsigned long age;
639 int ret = 0;
641 if (atomic_read(&rth->u.dst.__refcnt))
642 goto out;
644 ret = 1;
645 if (rth->u.dst.expires &&
646 time_after_eq(jiffies, rth->u.dst.expires))
647 goto out;
649 age = jiffies - rth->u.dst.lastuse;
650 ret = 0;
651 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
652 (age <= tmo2 && rt_valuable(rth)))
653 goto out;
654 ret = 1;
655 out: return ret;
658 /* Bits of score are:
659 * 31: very valuable
660 * 30: not quite useless
661 * 29..0: usage counter
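 *
 * rt_intern_hash() evicts the unreferenced entry with the lowest score,
 * so valuable entries (bit 31) and output/unicast-forward entries (bit 30)
 * are protected, and among the rest the least recently used entry goes first.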
663 static inline u32 rt_score(struct rtable *rt)
665 u32 score = jiffies - rt->u.dst.lastuse;
667 score = ~score & ~(3<<30);
669 if (rt_valuable(rt))
670 score |= (1<<31);
672 if (!rt->fl.iif ||
673 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
674 score |= (1<<30);
676 return score;
679 static inline bool rt_caching(const struct net *net)
681 return net->ipv4.current_rt_cache_rebuild_count <=
682 net->ipv4.sysctl_rt_cache_rebuild_count;
685 static inline bool compare_hash_inputs(const struct flowi *fl1,
686 const struct flowi *fl2)
688 return ((((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
689 ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
690 (fl1->iif ^ fl2->iif)) == 0);
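/* compare_hash_inputs() above only checks the fields that feed rt_hash()
 * (daddr, saddr and iif); compare_keys() below is the exact flow match and
 * also compares mark, tos and oif. */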
693 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
695 return (((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
696 ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
697 (fl1->mark ^ fl2->mark) |
698 (*(u16 *)&fl1->nl_u.ip4_u.tos ^ *(u16 *)&fl2->nl_u.ip4_u.tos) |
699 (fl1->oif ^ fl2->oif) |
700 (fl1->iif ^ fl2->iif)) == 0;
703 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
705 return net_eq(dev_net(rt1->u.dst.dev), dev_net(rt2->u.dst.dev));
708 static inline int rt_is_expired(struct rtable *rth)
710 return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
714 * Perform a full scan of hash table and free all entries.
715 * Can be called by a softirq or a process.
716 * In the latter case, we want to be rescheduled if necessary
718 static void rt_do_flush(int process_context)
720 unsigned int i;
721 struct rtable *rth, *next;
722 struct rtable * tail;
724 for (i = 0; i <= rt_hash_mask; i++) {
725 if (process_context && need_resched())
726 cond_resched();
727 rth = rt_hash_table[i].chain;
728 if (!rth)
729 continue;
731 spin_lock_bh(rt_hash_lock_addr(i));
732 #ifdef CONFIG_NET_NS
734 struct rtable ** prev, * p;
736 rth = rt_hash_table[i].chain;
738 /* defer releasing the head of the list after spin_unlock */
739 for (tail = rth; tail; tail = tail->u.dst.rt_next)
740 if (!rt_is_expired(tail))
741 break;
742 if (rth != tail)
743 rt_hash_table[i].chain = tail;
745 /* call rt_free on entries after the tail requiring flush */
746 prev = &rt_hash_table[i].chain;
747 for (p = *prev; p; p = next) {
748 next = p->u.dst.rt_next;
749 if (!rt_is_expired(p)) {
750 prev = &p->u.dst.rt_next;
751 } else {
752 *prev = next;
753 rt_free(p);
757 #else
758 rth = rt_hash_table[i].chain;
759 rt_hash_table[i].chain = NULL;
760 tail = NULL;
761 #endif
762 spin_unlock_bh(rt_hash_lock_addr(i));
764 for (; rth != tail; rth = next) {
765 next = rth->u.dst.rt_next;
766 rt_free(rth);
772 * While freeing expired entries, we compute average chain length
773 * and standard deviation, using fixed-point arithmetic.
774 * This is to have an estimation of rt_chain_length_max:
775 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
776 * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
779 #define FRACT_BITS 3
780 #define ONE (1UL << FRACT_BITS)
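/* Example: has_noalias() below counts chain entries in units of ONE (1/8ths),
 * so an average chain length of 2.5 distinct-input entries accumulates as
 * avg = 20, and (avg + 4*sd) >> FRACT_BITS converts back to whole entries
 * when updating rt_chain_length_max. */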
783 * Given a hash chain and an item in this hash chain,
784 * find if a previous entry has the same hash_inputs
785 * (but differs on tos, mark or oif)
786 * Returns 0 if an alias is found.
787 * Returns ONE if rth has no alias before itself.
789 static int has_noalias(const struct rtable *head, const struct rtable *rth)
791 const struct rtable *aux = head;
793 while (aux != rth) {
794 if (compare_hash_inputs(&aux->fl, &rth->fl))
795 return 0;
796 aux = aux->u.dst.rt_next;
798 return ONE;
801 static void rt_check_expire(void)
803 static unsigned int rover;
804 unsigned int i = rover, goal;
805 struct rtable *rth, **rthp;
806 unsigned long samples = 0;
807 unsigned long sum = 0, sum2 = 0;
808 unsigned long delta;
809 u64 mult;
811 delta = jiffies - expires_ljiffies;
812 expires_ljiffies = jiffies;
813 mult = ((u64)delta) << rt_hash_log;
814 if (ip_rt_gc_timeout > 1)
815 do_div(mult, ip_rt_gc_timeout);
816 goal = (unsigned int)mult;
817 if (goal > rt_hash_mask)
818 goal = rt_hash_mask + 1;
819 for (; goal > 0; goal--) {
820 unsigned long tmo = ip_rt_gc_timeout;
821 unsigned long length;
823 i = (i + 1) & rt_hash_mask;
824 rthp = &rt_hash_table[i].chain;
826 if (need_resched())
827 cond_resched();
829 samples++;
831 if (*rthp == NULL)
832 continue;
833 length = 0;
834 spin_lock_bh(rt_hash_lock_addr(i));
835 while ((rth = *rthp) != NULL) {
836 prefetch(rth->u.dst.rt_next);
837 if (rt_is_expired(rth)) {
838 *rthp = rth->u.dst.rt_next;
839 rt_free(rth);
840 continue;
842 if (rth->u.dst.expires) {
843 /* Entry is expired even if it is in use */
844 if (time_before_eq(jiffies, rth->u.dst.expires)) {
845 nofree:
846 tmo >>= 1;
847 rthp = &rth->u.dst.rt_next;
849 * We only count entries on
850 * a chain with equal hash inputs once
851 * so that entries for different QOS
852 * levels, and other non-hash input
853 * attributes don't unfairly skew
854 * the length computation
856 length += has_noalias(rt_hash_table[i].chain, rth);
857 continue;
859 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
860 goto nofree;
862 /* Cleanup aged off entries. */
863 *rthp = rth->u.dst.rt_next;
864 rt_free(rth);
866 spin_unlock_bh(rt_hash_lock_addr(i));
867 sum += length;
868 sum2 += length*length;
870 if (samples) {
871 unsigned long avg = sum / samples;
872 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
873 rt_chain_length_max = max_t(unsigned long,
874 ip_rt_gc_elasticity,
875 (avg + 4*sd) >> FRACT_BITS);
877 rover = i;
881 * rt_worker_func() is run in process context.
882 * We call rt_check_expire() to scan part of the hash table.
884 static void rt_worker_func(struct work_struct *work)
886 rt_check_expire();
887 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
891 * Perturbation of rt_genid by a small quantity [1..256]
892 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
893 * many times (2^24) without repeating a recent rt_genid.
894 * Jenkins hash is strong enough that little changes of rt_genid are OK.
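 *
 * For example, with a 32-bit genid and increments drawn from [1, 256],
 * at least 2^32 / 2^8 = 2^24 invalidations are needed before a previously
 * used genid value can wrap around and recur.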
896 static void rt_cache_invalidate(struct net *net)
898 unsigned char shuffle;
900 get_random_bytes(&shuffle, sizeof(shuffle));
901 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
905 * delay < 0 : invalidate cache (fast : entries will be deleted later)
906 * delay >= 0 : invalidate & flush cache (can be long)
908 void rt_cache_flush(struct net *net, int delay)
910 rt_cache_invalidate(net);
911 if (delay >= 0)
912 rt_do_flush(!in_softirq());
915 /* Flush previously invalidated entries from the cache */
916 void rt_cache_flush_batch(void)
918 rt_do_flush(!in_softirq());
922 * We change rt_genid and let gc do the cleanup
924 static void rt_secret_rebuild(unsigned long __net)
926 struct net *net = (struct net *)__net;
927 rt_cache_invalidate(net);
928 mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
931 static void rt_secret_rebuild_oneshot(struct net *net)
933 del_timer_sync(&net->ipv4.rt_secret_timer);
934 rt_cache_invalidate(net);
935 if (ip_rt_secret_interval)
936 mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
939 static void rt_emergency_hash_rebuild(struct net *net)
941 if (net_ratelimit()) {
942 printk(KERN_WARNING "Route hash chain too long!\n");
943 printk(KERN_WARNING "Adjust your secret_interval!\n");
946 rt_secret_rebuild_oneshot(net);
950 Short description of GC goals.
952 We want to build an algorithm which will keep the routing cache
953 at some equilibrium point, where the number of aged-off entries
954 is kept approximately equal to the number of newly generated ones.
956 The current expiration strength is the variable "expire".
957 We try to adjust it dynamically, so that when the network
958 is idle, expire is large enough to keep enough warm entries,
959 and when load increases it shrinks to limit the cache size.
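Concretely, rt_garbage_collect() below starts from
goal = entries - (ip_rt_gc_elasticity << rt_hash_log), i.e. the cache is
allowed roughly ip_rt_gc_elasticity entries per hash bucket before any
expiration work is attempted.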
962 static int rt_garbage_collect(struct dst_ops *ops)
964 static unsigned long expire = RT_GC_TIMEOUT;
965 static unsigned long last_gc;
966 static int rover;
967 static int equilibrium;
968 struct rtable *rth, **rthp;
969 unsigned long now = jiffies;
970 int goal;
973 * Garbage collection is pretty expensive,
974 * do not run it too frequently.
977 RT_CACHE_STAT_INC(gc_total);
979 if (now - last_gc < ip_rt_gc_min_interval &&
980 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
981 RT_CACHE_STAT_INC(gc_ignored);
982 goto out;
985 /* Calculate the number of entries we want to expire now. */
986 goal = atomic_read(&ipv4_dst_ops.entries) -
987 (ip_rt_gc_elasticity << rt_hash_log);
988 if (goal <= 0) {
989 if (equilibrium < ipv4_dst_ops.gc_thresh)
990 equilibrium = ipv4_dst_ops.gc_thresh;
991 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
992 if (goal > 0) {
993 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
994 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
996 } else {
997 /* We are in a dangerous area. Try to reduce the cache really
998 * aggressively.
1000 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1001 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
1004 if (now - last_gc >= ip_rt_gc_min_interval)
1005 last_gc = now;
1007 if (goal <= 0) {
1008 equilibrium += goal;
1009 goto work_done;
1012 do {
1013 int i, k;
1015 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1016 unsigned long tmo = expire;
1018 k = (k + 1) & rt_hash_mask;
1019 rthp = &rt_hash_table[k].chain;
1020 spin_lock_bh(rt_hash_lock_addr(k));
1021 while ((rth = *rthp) != NULL) {
1022 if (!rt_is_expired(rth) &&
1023 !rt_may_expire(rth, tmo, expire)) {
1024 tmo >>= 1;
1025 rthp = &rth->u.dst.rt_next;
1026 continue;
1028 *rthp = rth->u.dst.rt_next;
1029 rt_free(rth);
1030 goal--;
1032 spin_unlock_bh(rt_hash_lock_addr(k));
1033 if (goal <= 0)
1034 break;
1036 rover = k;
1038 if (goal <= 0)
1039 goto work_done;
1041 /* The goal was not achieved. We stop the process if:
1043 - expire has been reduced to zero (otherwise, expire is halved).
1044 - the table is not full.
1045 - we are called from interrupt context.
1046 - the jiffies check is just a fallback/debug loop breaker.
1047 We will not spin here for a long time in any case.
1050 RT_CACHE_STAT_INC(gc_goal_miss);
1052 if (expire == 0)
1053 break;
1055 expire >>= 1;
1056 #if RT_CACHE_DEBUG >= 2
1057 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1058 atomic_read(&ipv4_dst_ops.entries), goal, i);
1059 #endif
1061 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1062 goto out;
1063 } while (!in_softirq() && time_before_eq(jiffies, now));
1065 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1066 goto out;
1067 if (net_ratelimit())
1068 printk(KERN_WARNING "dst cache overflow\n");
1069 RT_CACHE_STAT_INC(gc_dst_overflow);
1070 return 1;
1072 work_done:
1073 expire += ip_rt_gc_min_interval;
1074 if (expire > ip_rt_gc_timeout ||
1075 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
1076 expire = ip_rt_gc_timeout;
1077 #if RT_CACHE_DEBUG >= 2
1078 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1079 atomic_read(&ipv4_dst_ops.entries), goal, rover);
1080 #endif
1081 out: return 0;
1085 * Returns the number of entries in a hash chain that have different hash_inputs
1087 static int slow_chain_length(const struct rtable *head)
1089 int length = 0;
1090 const struct rtable *rth = head;
1092 while (rth) {
1093 length += has_noalias(head, rth);
1094 rth = rth->u.dst.rt_next;
1096 return length >> FRACT_BITS;
1099 static int rt_intern_hash(unsigned hash, struct rtable *rt,
1100 struct rtable **rp, struct sk_buff *skb, int ifindex)
1102 struct rtable *rth, **rthp;
1103 unsigned long now;
1104 struct rtable *cand, **candp;
1105 u32 min_score;
1106 int chain_length;
1107 int attempts = !in_softirq();
1109 restart:
1110 chain_length = 0;
1111 min_score = ~(u32)0;
1112 cand = NULL;
1113 candp = NULL;
1114 now = jiffies;
1116 if (!rt_caching(dev_net(rt->u.dst.dev))) {
1118 * If we're not caching, just tell the caller we
1119 * were successful and don't touch the route. The
1120 * caller holds the sole reference to the cache entry, and
1121 * it will be released when the caller is done with it.
1122 * If we drop it here, the callers have no way to resolve routes
1123 * when we're not caching. Instead, just point *rp at rt, so
1124 * the caller gets a single use out of the route
1125 * Note that we do rt_free on this new route entry, so that
1126 * once its refcount hits zero, we are still able to reap it
1127 * (Thanks Alexey)
1128 * Note also that rt_free uses call_rcu. We don't actually
1129 * need rcu protection here, this is just our path to get
1130 * on the route gc list.
1133 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1134 int err = arp_bind_neighbour(&rt->u.dst);
1135 if (err) {
1136 if (net_ratelimit())
1137 printk(KERN_WARNING
1138 "Neighbour table failure & not caching routes.\n");
1139 rt_drop(rt);
1140 return err;
1144 rt_free(rt);
1145 goto skip_hashing;
1148 rthp = &rt_hash_table[hash].chain;
1150 spin_lock_bh(rt_hash_lock_addr(hash));
1151 while ((rth = *rthp) != NULL) {
1152 if (rt_is_expired(rth)) {
1153 *rthp = rth->u.dst.rt_next;
1154 rt_free(rth);
1155 continue;
1157 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1158 /* Put it first */
1159 *rthp = rth->u.dst.rt_next;
1161 * Since lookup is lockfree, the deletion
1162 * must be visible to another weakly ordered CPU before
1163 * the insertion at the start of the hash chain.
1165 rcu_assign_pointer(rth->u.dst.rt_next,
1166 rt_hash_table[hash].chain);
1168 * Since lookup is lockfree, the update writes
1169 * must be ordered for consistency on SMP.
1171 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1173 dst_use(&rth->u.dst, now);
1174 spin_unlock_bh(rt_hash_lock_addr(hash));
1176 rt_drop(rt);
1177 if (rp)
1178 *rp = rth;
1179 else
1180 skb_dst_set(skb, &rth->u.dst);
1181 return 0;
1184 if (!atomic_read(&rth->u.dst.__refcnt)) {
1185 u32 score = rt_score(rth);
1187 if (score <= min_score) {
1188 cand = rth;
1189 candp = rthp;
1190 min_score = score;
1194 chain_length++;
1196 rthp = &rth->u.dst.rt_next;
1199 if (cand) {
1200 /* ip_rt_gc_elasticity used to be the average chain
1201 * length; when exceeded, gc becomes really aggressive.
1203 * The second limit is less certain. At the moment it allows
1204 * only 2 entries per bucket. We will see.
1206 if (chain_length > ip_rt_gc_elasticity) {
1207 *candp = cand->u.dst.rt_next;
1208 rt_free(cand);
1210 } else {
1211 if (chain_length > rt_chain_length_max &&
1212 slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1213 struct net *net = dev_net(rt->u.dst.dev);
1214 int num = ++net->ipv4.current_rt_cache_rebuild_count;
1215 if (!rt_caching(net)) {
1216 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1217 rt->u.dst.dev->name, num);
1219 rt_emergency_hash_rebuild(net);
1220 spin_unlock_bh(rt_hash_lock_addr(hash));
1222 hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1223 ifindex, rt_genid(net));
1224 goto restart;
1228 /* Try to bind the route to arp only if it is an output
1229 route or a unicast forwarding path.
1231 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1232 int err = arp_bind_neighbour(&rt->u.dst);
1233 if (err) {
1234 spin_unlock_bh(rt_hash_lock_addr(hash));
1236 if (err != -ENOBUFS) {
1237 rt_drop(rt);
1238 return err;
1241 /* Neighbour tables are full and nothing
1242 can be released. Try to shrink the route cache;
1243 it most likely holds some neighbour records.
1245 if (attempts-- > 0) {
1246 int saved_elasticity = ip_rt_gc_elasticity;
1247 int saved_int = ip_rt_gc_min_interval;
1248 ip_rt_gc_elasticity = 1;
1249 ip_rt_gc_min_interval = 0;
1250 rt_garbage_collect(&ipv4_dst_ops);
1251 ip_rt_gc_min_interval = saved_int;
1252 ip_rt_gc_elasticity = saved_elasticity;
1253 goto restart;
1256 if (net_ratelimit())
1257 printk(KERN_WARNING "Neighbour table overflow.\n");
1258 rt_drop(rt);
1259 return -ENOBUFS;
1263 rt->u.dst.rt_next = rt_hash_table[hash].chain;
1265 #if RT_CACHE_DEBUG >= 2
1266 if (rt->u.dst.rt_next) {
1267 struct rtable *trt;
1268 printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1269 hash, &rt->rt_dst);
1270 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1271 printk(" . %pI4", &trt->rt_dst);
1272 printk("\n");
1274 #endif
1276 * Since lookup is lockfree, we must make sure
1277 * previous writes to rt are committed to memory
1278 * before making rt visible to other CPUs.
1280 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1282 spin_unlock_bh(rt_hash_lock_addr(hash));
1284 skip_hashing:
1285 if (rp)
1286 *rp = rt;
1287 else
1288 skb_dst_set(skb, &rt->u.dst);
1289 return 0;
1292 void rt_bind_peer(struct rtable *rt, int create)
1294 static DEFINE_SPINLOCK(rt_peer_lock);
1295 struct inet_peer *peer;
1297 peer = inet_getpeer(rt->rt_dst, create);
1299 spin_lock_bh(&rt_peer_lock);
1300 if (rt->peer == NULL) {
1301 rt->peer = peer;
1302 peer = NULL;
1304 spin_unlock_bh(&rt_peer_lock);
1305 if (peer)
1306 inet_putpeer(peer);
1310 * Peer allocation may fail only in serious out-of-memory conditions. However
1311 * we can still generate some output.
1312 * Random ID selection looks a bit dangerous because we have no chance of
1313 * selecting an ID that is unique for a reasonable period of time.
1314 * But a broken packet identifier may be better than no packet at all.
1316 static void ip_select_fb_ident(struct iphdr *iph)
1318 static DEFINE_SPINLOCK(ip_fb_id_lock);
1319 static u32 ip_fallback_id;
1320 u32 salt;
1322 spin_lock_bh(&ip_fb_id_lock);
1323 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1324 iph->id = htons(salt & 0xFFFF);
1325 ip_fallback_id = salt;
1326 spin_unlock_bh(&ip_fb_id_lock);
1329 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1331 struct rtable *rt = (struct rtable *) dst;
1333 if (rt) {
1334 if (rt->peer == NULL)
1335 rt_bind_peer(rt, 1);
1337 /* If peer is attached to destination, it is never detached,
1338 so we do not need to grab a lock to dereference it.
1340 if (rt->peer) {
1341 iph->id = htons(inet_getid(rt->peer, more));
1342 return;
1344 } else
1345 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1346 __builtin_return_address(0));
1348 ip_select_fb_ident(iph);
1351 static void rt_del(unsigned hash, struct rtable *rt)
1353 struct rtable **rthp, *aux;
1355 rthp = &rt_hash_table[hash].chain;
1356 spin_lock_bh(rt_hash_lock_addr(hash));
1357 ip_rt_put(rt);
1358 while ((aux = *rthp) != NULL) {
1359 if (aux == rt || rt_is_expired(aux)) {
1360 *rthp = aux->u.dst.rt_next;
1361 rt_free(aux);
1362 continue;
1364 rthp = &aux->u.dst.rt_next;
1366 spin_unlock_bh(rt_hash_lock_addr(hash));
1369 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1370 __be32 saddr, struct net_device *dev)
1372 int i, k;
1373 struct in_device *in_dev = in_dev_get(dev);
1374 struct rtable *rth, **rthp;
1375 __be32 skeys[2] = { saddr, 0 };
1376 int ikeys[2] = { dev->ifindex, 0 };
1377 struct netevent_redirect netevent;
1378 struct net *net;
1380 if (!in_dev)
1381 return;
1383 net = dev_net(dev);
1384 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1385 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1386 ipv4_is_zeronet(new_gw))
1387 goto reject_redirect;
1389 if (!rt_caching(net))
1390 goto reject_redirect;
1392 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1393 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1394 goto reject_redirect;
1395 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1396 goto reject_redirect;
1397 } else {
1398 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1399 goto reject_redirect;
1402 for (i = 0; i < 2; i++) {
1403 for (k = 0; k < 2; k++) {
1404 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1405 rt_genid(net));
1407 rthp=&rt_hash_table[hash].chain;
1409 rcu_read_lock();
1410 while ((rth = rcu_dereference(*rthp)) != NULL) {
1411 struct rtable *rt;
1413 if (rth->fl.fl4_dst != daddr ||
1414 rth->fl.fl4_src != skeys[i] ||
1415 rth->fl.oif != ikeys[k] ||
1416 rth->fl.iif != 0 ||
1417 rt_is_expired(rth) ||
1418 !net_eq(dev_net(rth->u.dst.dev), net)) {
1419 rthp = &rth->u.dst.rt_next;
1420 continue;
1423 if (rth->rt_dst != daddr ||
1424 rth->rt_src != saddr ||
1425 rth->u.dst.error ||
1426 rth->rt_gateway != old_gw ||
1427 rth->u.dst.dev != dev)
1428 break;
1430 dst_hold(&rth->u.dst);
1431 rcu_read_unlock();
1433 rt = dst_alloc(&ipv4_dst_ops);
1434 if (rt == NULL) {
1435 ip_rt_put(rth);
1436 in_dev_put(in_dev);
1437 return;
1440 /* Copy all the information. */
1441 *rt = *rth;
1442 rt->u.dst.__use = 1;
1443 atomic_set(&rt->u.dst.__refcnt, 1);
1444 rt->u.dst.child = NULL;
1445 if (rt->u.dst.dev)
1446 dev_hold(rt->u.dst.dev);
1447 if (rt->idev)
1448 in_dev_hold(rt->idev);
1449 rt->u.dst.obsolete = -1;
1450 rt->u.dst.lastuse = jiffies;
1451 rt->u.dst.path = &rt->u.dst;
1452 rt->u.dst.neighbour = NULL;
1453 rt->u.dst.hh = NULL;
1454 #ifdef CONFIG_XFRM
1455 rt->u.dst.xfrm = NULL;
1456 #endif
1457 rt->rt_genid = rt_genid(net);
1458 rt->rt_flags |= RTCF_REDIRECTED;
1460 /* Gateway is different ... */
1461 rt->rt_gateway = new_gw;
1463 /* Redirect received -> path was valid */
1464 dst_confirm(&rth->u.dst);
1466 if (rt->peer)
1467 atomic_inc(&rt->peer->refcnt);
1469 if (arp_bind_neighbour(&rt->u.dst) ||
1470 !(rt->u.dst.neighbour->nud_state &
1471 NUD_VALID)) {
1472 if (rt->u.dst.neighbour)
1473 neigh_event_send(rt->u.dst.neighbour, NULL);
1474 ip_rt_put(rth);
1475 rt_drop(rt);
1476 goto do_next;
1479 netevent.old = &rth->u.dst;
1480 netevent.new = &rt->u.dst;
1481 call_netevent_notifiers(NETEVENT_REDIRECT,
1482 &netevent);
1484 rt_del(hash, rth);
1485 if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
1486 ip_rt_put(rt);
1487 goto do_next;
1489 rcu_read_unlock();
1490 do_next:
1494 in_dev_put(in_dev);
1495 return;
1497 reject_redirect:
1498 #ifdef CONFIG_IP_ROUTE_VERBOSE
1499 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1500 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1501 " Advised path = %pI4 -> %pI4\n",
1502 &old_gw, dev->name, &new_gw,
1503 &saddr, &daddr);
1504 #endif
1505 in_dev_put(in_dev);
1508 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1510 struct rtable *rt = (struct rtable *)dst;
1511 struct dst_entry *ret = dst;
1513 if (rt) {
1514 if (dst->obsolete > 0) {
1515 ip_rt_put(rt);
1516 ret = NULL;
1517 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1518 (rt->u.dst.expires &&
1519 time_after_eq(jiffies, rt->u.dst.expires))) {
1520 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1521 rt->fl.oif,
1522 rt_genid(dev_net(dst->dev)));
1523 #if RT_CACHE_DEBUG >= 1
1524 printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1525 &rt->rt_dst, rt->fl.fl4_tos);
1526 #endif
1527 rt_del(hash, rt);
1528 ret = NULL;
1531 return ret;
1535 * Algorithm:
1536 * 1. The first ip_rt_redirect_number redirects are sent
1537 * with exponential backoff, then we stop sending them at all,
1538 * assuming that the host ignores our redirects.
1539 * 2. If we did not see packets requiring redirects
1540 * during ip_rt_redirect_silence, we assume that the host
1541 * forgot the redirected route and we start sending redirects again.
1543 * This algorithm is much cheaper and more intelligent than dumb load limiting
1544 * in icmp.c.
1546 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1547 * and "frag. need" (breaks PMTU discovery) in icmp.c.
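 *
 * With the defaults above (ip_rt_redirect_load = HZ/50, ip_rt_redirect_number
 * = 9, ip_rt_redirect_silence = (HZ/50) << 10) and assuming HZ = 1000,
 * successive redirects to the same destination are spaced at least
 * 40 ms, 80 ms, 160 ms, ... apart (ip_rt_redirect_load << rate_tokens),
 * sending stops entirely after 9 unanswered redirects, and the backoff
 * state resets after roughly 20 seconds without redirect-worthy traffic.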
1550 void ip_rt_send_redirect(struct sk_buff *skb)
1552 struct rtable *rt = skb_rtable(skb);
1553 struct in_device *in_dev;
1554 int log_martians;
1556 rcu_read_lock();
1557 in_dev = __in_dev_get_rcu(rt->u.dst.dev);
1558 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1559 rcu_read_unlock();
1560 return;
1562 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1563 rcu_read_unlock();
1565 /* No redirected packets during ip_rt_redirect_silence;
1566 * reset the algorithm.
1568 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1569 rt->u.dst.rate_tokens = 0;
1571 /* Too many ignored redirects; do not send anything.
1572 * Set u.dst.rate_last to the last seen redirected packet.
1574 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1575 rt->u.dst.rate_last = jiffies;
1576 return;
1579 /* Check for load limit; set rate_last to the latest sent
1580 * redirect.
1582 if (rt->u.dst.rate_tokens == 0 ||
1583 time_after(jiffies,
1584 (rt->u.dst.rate_last +
1585 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1586 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1587 rt->u.dst.rate_last = jiffies;
1588 ++rt->u.dst.rate_tokens;
1589 #ifdef CONFIG_IP_ROUTE_VERBOSE
1590 if (log_martians &&
1591 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1592 net_ratelimit())
1593 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1594 &rt->rt_src, rt->rt_iif,
1595 &rt->rt_dst, &rt->rt_gateway);
1596 #endif
1600 static int ip_error(struct sk_buff *skb)
1602 struct rtable *rt = skb_rtable(skb);
1603 unsigned long now;
1604 int code;
1606 switch (rt->u.dst.error) {
1607 case EINVAL:
1608 default:
1609 goto out;
1610 case EHOSTUNREACH:
1611 code = ICMP_HOST_UNREACH;
1612 break;
1613 case ENETUNREACH:
1614 code = ICMP_NET_UNREACH;
1615 IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
1616 IPSTATS_MIB_INNOROUTES);
1617 break;
1618 case EACCES:
1619 code = ICMP_PKT_FILTERED;
1620 break;
1623 now = jiffies;
1624 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1625 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1626 rt->u.dst.rate_tokens = ip_rt_error_burst;
1627 rt->u.dst.rate_last = now;
1628 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1629 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1630 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1633 out: kfree_skb(skb);
1634 return 0;
1638 * The last two values are not from the RFC but
1639 * are needed for AMPRnet AX.25 paths.
1642 static const unsigned short mtu_plateau[] =
1643 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
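/* guess_mtu() walks this plateau table (cf. the RFC 1191 heuristic) and
 * returns the largest plateau strictly below old_mtu, falling back to 68,
 * the minimum MTU an IPv4 host must accept. */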
1645 static inline unsigned short guess_mtu(unsigned short old_mtu)
1647 int i;
1649 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1650 if (old_mtu > mtu_plateau[i])
1651 return mtu_plateau[i];
1652 return 68;
1655 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1656 unsigned short new_mtu,
1657 struct net_device *dev)
1659 int i, k;
1660 unsigned short old_mtu = ntohs(iph->tot_len);
1661 struct rtable *rth;
1662 int ikeys[2] = { dev->ifindex, 0 };
1663 __be32 skeys[2] = { iph->saddr, 0, };
1664 __be32 daddr = iph->daddr;
1665 unsigned short est_mtu = 0;
1667 for (k = 0; k < 2; k++) {
1668 for (i = 0; i < 2; i++) {
1669 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1670 rt_genid(net));
1672 rcu_read_lock();
1673 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1674 rth = rcu_dereference(rth->u.dst.rt_next)) {
1675 unsigned short mtu = new_mtu;
1677 if (rth->fl.fl4_dst != daddr ||
1678 rth->fl.fl4_src != skeys[i] ||
1679 rth->rt_dst != daddr ||
1680 rth->rt_src != iph->saddr ||
1681 rth->fl.oif != ikeys[k] ||
1682 rth->fl.iif != 0 ||
1683 dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1684 !net_eq(dev_net(rth->u.dst.dev), net) ||
1685 rt_is_expired(rth))
1686 continue;
1688 if (new_mtu < 68 || new_mtu >= old_mtu) {
1690 /* BSD 4.2 compatibility hack :-( */
1691 if (mtu == 0 &&
1692 old_mtu >= dst_mtu(&rth->u.dst) &&
1693 old_mtu >= 68 + (iph->ihl << 2))
1694 old_mtu -= iph->ihl << 2;
1696 mtu = guess_mtu(old_mtu);
1698 if (mtu <= dst_mtu(&rth->u.dst)) {
1699 if (mtu < dst_mtu(&rth->u.dst)) {
1700 dst_confirm(&rth->u.dst);
1701 if (mtu < ip_rt_min_pmtu) {
1702 mtu = ip_rt_min_pmtu;
1703 rth->u.dst.metrics[RTAX_LOCK-1] |=
1704 (1 << RTAX_MTU);
1706 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1707 dst_set_expires(&rth->u.dst,
1708 ip_rt_mtu_expires);
1710 est_mtu = mtu;
1713 rcu_read_unlock();
1716 return est_mtu ? : new_mtu;
1719 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1721 if (dst_mtu(dst) > mtu && mtu >= 68 &&
1722 !(dst_metric_locked(dst, RTAX_MTU))) {
1723 if (mtu < ip_rt_min_pmtu) {
1724 mtu = ip_rt_min_pmtu;
1725 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1727 dst->metrics[RTAX_MTU-1] = mtu;
1728 dst_set_expires(dst, ip_rt_mtu_expires);
1729 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1733 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1735 if (rt_is_expired((struct rtable *)dst))
1736 return NULL;
1737 return dst;
1740 static void ipv4_dst_destroy(struct dst_entry *dst)
1742 struct rtable *rt = (struct rtable *) dst;
1743 struct inet_peer *peer = rt->peer;
1744 struct in_device *idev = rt->idev;
1746 if (peer) {
1747 rt->peer = NULL;
1748 inet_putpeer(peer);
1751 if (idev) {
1752 rt->idev = NULL;
1753 in_dev_put(idev);
1757 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1758 int how)
1760 struct rtable *rt = (struct rtable *) dst;
1761 struct in_device *idev = rt->idev;
1762 if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1763 struct in_device *loopback_idev =
1764 in_dev_get(dev_net(dev)->loopback_dev);
1765 if (loopback_idev) {
1766 rt->idev = loopback_idev;
1767 in_dev_put(idev);
1772 static void ipv4_link_failure(struct sk_buff *skb)
1774 struct rtable *rt;
1776 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1778 rt = skb_rtable(skb);
1779 if (rt)
1780 dst_set_expires(&rt->u.dst, 0);
1783 static int ip_rt_bug(struct sk_buff *skb)
1785 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1786 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1787 skb->dev ? skb->dev->name : "?");
1788 kfree_skb(skb);
1789 return 0;
1793 We do not cache the source address of the outgoing interface,
1794 because it is used only by the IP RR, TS and SRR options,
1795 so it is out of the fast path.
1797 BTW remember: "addr" is allowed to be unaligned
1798 in IP options!
1801 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1803 __be32 src;
1804 struct fib_result res;
1806 if (rt->fl.iif == 0)
1807 src = rt->rt_src;
1808 else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1809 src = FIB_RES_PREFSRC(res);
1810 fib_res_put(&res);
1811 } else
1812 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1813 RT_SCOPE_UNIVERSE);
1814 memcpy(addr, &src, 4);
1817 #ifdef CONFIG_NET_CLS_ROUTE
1818 static void set_class_tag(struct rtable *rt, u32 tag)
1820 if (!(rt->u.dst.tclassid & 0xFFFF))
1821 rt->u.dst.tclassid |= tag & 0xFFFF;
1822 if (!(rt->u.dst.tclassid & 0xFFFF0000))
1823 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1825 #endif
1827 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1829 struct fib_info *fi = res->fi;
1831 if (fi) {
1832 if (FIB_RES_GW(*res) &&
1833 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1834 rt->rt_gateway = FIB_RES_GW(*res);
1835 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1836 sizeof(rt->u.dst.metrics));
1837 if (fi->fib_mtu == 0) {
1838 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1839 if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1840 rt->rt_gateway != rt->rt_dst &&
1841 rt->u.dst.dev->mtu > 576)
1842 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1844 #ifdef CONFIG_NET_CLS_ROUTE
1845 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1846 #endif
1847 } else
1848 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1850 if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1851 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1852 if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
1853 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1854 if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1855 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1856 ip_rt_min_advmss);
1857 if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1858 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1860 #ifdef CONFIG_NET_CLS_ROUTE
1861 #ifdef CONFIG_IP_MULTIPLE_TABLES
1862 set_class_tag(rt, fib_rules_tclass(res));
1863 #endif
1864 set_class_tag(rt, itag);
1865 #endif
1866 rt->rt_type = res->type;
1869 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1870 u8 tos, struct net_device *dev, int our)
1872 unsigned hash;
1873 struct rtable *rth;
1874 __be32 spec_dst;
1875 struct in_device *in_dev = in_dev_get(dev);
1876 u32 itag = 0;
1878 /* Primary sanity checks. */
1880 if (in_dev == NULL)
1881 return -EINVAL;
1883 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1884 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1885 goto e_inval;
1887 if (ipv4_is_zeronet(saddr)) {
1888 if (!ipv4_is_local_multicast(daddr))
1889 goto e_inval;
1890 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1891 } else if (fib_validate_source(saddr, 0, tos, 0,
1892 dev, &spec_dst, &itag, 0) < 0)
1893 goto e_inval;
1895 rth = dst_alloc(&ipv4_dst_ops);
1896 if (!rth)
1897 goto e_nobufs;
1899 rth->u.dst.output = ip_rt_bug;
1900 rth->u.dst.obsolete = -1;
1902 atomic_set(&rth->u.dst.__refcnt, 1);
1903 rth->u.dst.flags= DST_HOST;
1904 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1905 rth->u.dst.flags |= DST_NOPOLICY;
1906 rth->fl.fl4_dst = daddr;
1907 rth->rt_dst = daddr;
1908 rth->fl.fl4_tos = tos;
1909 rth->fl.mark = skb->mark;
1910 rth->fl.fl4_src = saddr;
1911 rth->rt_src = saddr;
1912 #ifdef CONFIG_NET_CLS_ROUTE
1913 rth->u.dst.tclassid = itag;
1914 #endif
1915 rth->rt_iif =
1916 rth->fl.iif = dev->ifindex;
1917 rth->u.dst.dev = init_net.loopback_dev;
1918 dev_hold(rth->u.dst.dev);
1919 rth->idev = in_dev_get(rth->u.dst.dev);
1920 rth->fl.oif = 0;
1921 rth->rt_gateway = daddr;
1922 rth->rt_spec_dst= spec_dst;
1923 rth->rt_genid = rt_genid(dev_net(dev));
1924 rth->rt_flags = RTCF_MULTICAST;
1925 rth->rt_type = RTN_MULTICAST;
1926 if (our) {
1927 rth->u.dst.input= ip_local_deliver;
1928 rth->rt_flags |= RTCF_LOCAL;
1931 #ifdef CONFIG_IP_MROUTE
1932 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1933 rth->u.dst.input = ip_mr_input;
1934 #endif
1935 RT_CACHE_STAT_INC(in_slow_mc);
1937 in_dev_put(in_dev);
1938 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1939 return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);
1941 e_nobufs:
1942 in_dev_put(in_dev);
1943 return -ENOBUFS;
1945 e_inval:
1946 in_dev_put(in_dev);
1947 return -EINVAL;
1951 static void ip_handle_martian_source(struct net_device *dev,
1952 struct in_device *in_dev,
1953 struct sk_buff *skb,
1954 __be32 daddr,
1955 __be32 saddr)
1957 RT_CACHE_STAT_INC(in_martian_src);
1958 #ifdef CONFIG_IP_ROUTE_VERBOSE
1959 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1961 * RFC1812 recommendation: if the source is martian,
1962 * the only hint is the MAC header.
1964 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1965 &daddr, &saddr, dev->name);
1966 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1967 int i;
1968 const unsigned char *p = skb_mac_header(skb);
1969 printk(KERN_WARNING "ll header: ");
1970 for (i = 0; i < dev->hard_header_len; i++, p++) {
1971 printk("%02x", *p);
1972 if (i < (dev->hard_header_len - 1))
1973 printk(":");
1975 printk("\n");
1978 #endif
1981 static int __mkroute_input(struct sk_buff *skb,
1982 struct fib_result *res,
1983 struct in_device *in_dev,
1984 __be32 daddr, __be32 saddr, u32 tos,
1985 struct rtable **result)
1988 struct rtable *rth;
1989 int err;
1990 struct in_device *out_dev;
1991 unsigned flags = 0;
1992 __be32 spec_dst;
1993 u32 itag;
1995 /* get a working reference to the output device */
1996 out_dev = in_dev_get(FIB_RES_DEV(*res));
1997 if (out_dev == NULL) {
1998 if (net_ratelimit())
1999 printk(KERN_CRIT "Bug in ip_route_input" \
2000 "_slow(). Please, report\n");
2001 return -EINVAL;
2005 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
2006 in_dev->dev, &spec_dst, &itag, skb->mark);
2007 if (err < 0) {
2008 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2009 saddr);
2011 err = -EINVAL;
2012 goto cleanup;
2015 if (err)
2016 flags |= RTCF_DIRECTSRC;
2018 if (out_dev == in_dev && err &&
2019 (IN_DEV_SHARED_MEDIA(out_dev) ||
2020 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2021 flags |= RTCF_DOREDIRECT;
2023 if (skb->protocol != htons(ETH_P_IP)) {
2024 /* Not IP (i.e. ARP). Do not create a route if it is
2025 * invalid for proxy arp. DNAT routes are always valid.
2027 * The proxy arp feature has been extended to allow ARP
2028 * replies back on the same interface, to support
2029 * Private VLAN switch technologies. See arp.c.
2031 if (out_dev == in_dev &&
2032 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2033 err = -EINVAL;
2034 goto cleanup;
2039 rth = dst_alloc(&ipv4_dst_ops);
2040 if (!rth) {
2041 err = -ENOBUFS;
2042 goto cleanup;
2045 atomic_set(&rth->u.dst.__refcnt, 1);
2046 rth->u.dst.flags= DST_HOST;
2047 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2048 rth->u.dst.flags |= DST_NOPOLICY;
2049 if (IN_DEV_CONF_GET(out_dev, NOXFRM))
2050 rth->u.dst.flags |= DST_NOXFRM;
2051 rth->fl.fl4_dst = daddr;
2052 rth->rt_dst = daddr;
2053 rth->fl.fl4_tos = tos;
2054 rth->fl.mark = skb->mark;
2055 rth->fl.fl4_src = saddr;
2056 rth->rt_src = saddr;
2057 rth->rt_gateway = daddr;
2058 rth->rt_iif =
2059 rth->fl.iif = in_dev->dev->ifindex;
2060 rth->u.dst.dev = (out_dev)->dev;
2061 dev_hold(rth->u.dst.dev);
2062 rth->idev = in_dev_get(rth->u.dst.dev);
2063 rth->fl.oif = 0;
2064 rth->rt_spec_dst= spec_dst;
2066 rth->u.dst.obsolete = -1;
2067 rth->u.dst.input = ip_forward;
2068 rth->u.dst.output = ip_output;
2069 rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
2071 rt_set_nexthop(rth, res, itag);
2073 rth->rt_flags = flags;
2075 *result = rth;
2076 err = 0;
2077 cleanup:
2078 /* release the working reference to the output device */
2079 in_dev_put(out_dev);
2080 return err;
2083 static int ip_mkroute_input(struct sk_buff *skb,
2084 struct fib_result *res,
2085 const struct flowi *fl,
2086 struct in_device *in_dev,
2087 __be32 daddr, __be32 saddr, u32 tos)
2089 struct rtable* rth = NULL;
2090 int err;
2091 unsigned hash;
2093 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2094 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2095 fib_select_multipath(fl, res);
2096 #endif
2098 /* create a routing cache entry */
2099 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2100 if (err)
2101 return err;
2103 /* put it into the cache */
2104 hash = rt_hash(daddr, saddr, fl->iif,
2105 rt_genid(dev_net(rth->u.dst.dev)));
2106 return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
2110 * NOTE. We drop all packets that have local source
2111 * addresses, because every properly looped-back packet
2112 * must have its correct destination already attached by the output routine.
2114 * Such an approach solves two big problems:
2115 * 1. Non-simplex devices are handled properly.
2116 * 2. IP spoofing attempts are filtered with a 100% guarantee.
2119 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2120 u8 tos, struct net_device *dev)
2122 struct fib_result res;
2123 struct in_device *in_dev = in_dev_get(dev);
2124 struct flowi fl = { .nl_u = { .ip4_u =
2125 { .daddr = daddr,
2126 .saddr = saddr,
2127 .tos = tos,
2128 .scope = RT_SCOPE_UNIVERSE,
2129 } },
2130 .mark = skb->mark,
2131 .iif = dev->ifindex };
2132 unsigned flags = 0;
2133 u32 itag = 0;
2134 struct rtable * rth;
2135 unsigned hash;
2136 __be32 spec_dst;
2137 int err = -EINVAL;
2138 int free_res = 0;
2139 struct net * net = dev_net(dev);
2141 /* IP on this device is disabled. */
2143 if (!in_dev)
2144 goto out;
	/* Check for the weirdest martians, which cannot be detected
	 * by fib_lookup.
	 */
2150 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2151 ipv4_is_loopback(saddr))
2152 goto martian_source;
2154 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
2155 goto brd_input;
	/* Accept zero addresses only for the limited broadcast;
	 * I am not even sure whether to fix this or not. Waiting for complaints :-)
	 */
2160 if (ipv4_is_zeronet(saddr))
2161 goto martian_source;
2163 if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
2164 ipv4_is_loopback(daddr))
2165 goto martian_destination;
	/*
	 *	Now we are ready to route the packet.
	 */
2170 if ((err = fib_lookup(net, &fl, &res)) != 0) {
2171 if (!IN_DEV_FORWARD(in_dev))
2172 goto e_hostunreach;
2173 goto no_route;
2175 free_res = 1;
2177 RT_CACHE_STAT_INC(in_slow_tot);
2179 if (res.type == RTN_BROADCAST)
2180 goto brd_input;
2182 if (res.type == RTN_LOCAL) {
2183 int result;
2184 result = fib_validate_source(saddr, daddr, tos,
2185 net->loopback_dev->ifindex,
2186 dev, &spec_dst, &itag, skb->mark);
2187 if (result < 0)
2188 goto martian_source;
2189 if (result)
2190 flags |= RTCF_DIRECTSRC;
2191 spec_dst = daddr;
2192 goto local_input;
2195 if (!IN_DEV_FORWARD(in_dev))
2196 goto e_hostunreach;
2197 if (res.type != RTN_UNICAST)
2198 goto martian_destination;
2200 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2201 done:
2202 in_dev_put(in_dev);
2203 if (free_res)
2204 fib_res_put(&res);
2205 out: return err;
2207 brd_input:
2208 if (skb->protocol != htons(ETH_P_IP))
2209 goto e_inval;
2211 if (ipv4_is_zeronet(saddr))
2212 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2213 else {
2214 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2215 &itag, skb->mark);
2216 if (err < 0)
2217 goto martian_source;
2218 if (err)
2219 flags |= RTCF_DIRECTSRC;
2221 flags |= RTCF_BROADCAST;
2222 res.type = RTN_BROADCAST;
2223 RT_CACHE_STAT_INC(in_brd);
2225 local_input:
2226 rth = dst_alloc(&ipv4_dst_ops);
2227 if (!rth)
2228 goto e_nobufs;
2230 rth->u.dst.output= ip_rt_bug;
2231 rth->u.dst.obsolete = -1;
2232 rth->rt_genid = rt_genid(net);
2234 atomic_set(&rth->u.dst.__refcnt, 1);
2235 rth->u.dst.flags= DST_HOST;
2236 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2237 rth->u.dst.flags |= DST_NOPOLICY;
2238 rth->fl.fl4_dst = daddr;
2239 rth->rt_dst = daddr;
2240 rth->fl.fl4_tos = tos;
2241 rth->fl.mark = skb->mark;
2242 rth->fl.fl4_src = saddr;
2243 rth->rt_src = saddr;
2244 #ifdef CONFIG_NET_CLS_ROUTE
2245 rth->u.dst.tclassid = itag;
2246 #endif
2247 rth->rt_iif =
2248 rth->fl.iif = dev->ifindex;
2249 rth->u.dst.dev = net->loopback_dev;
2250 dev_hold(rth->u.dst.dev);
2251 rth->idev = in_dev_get(rth->u.dst.dev);
2252 rth->rt_gateway = daddr;
2253 rth->rt_spec_dst= spec_dst;
2254 rth->u.dst.input= ip_local_deliver;
2255 rth->rt_flags = flags|RTCF_LOCAL;
2256 if (res.type == RTN_UNREACHABLE) {
2257 rth->u.dst.input= ip_error;
2258 rth->u.dst.error= -err;
2259 rth->rt_flags &= ~RTCF_LOCAL;
2261 rth->rt_type = res.type;
2262 hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2263 err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
2264 goto done;
2266 no_route:
2267 RT_CACHE_STAT_INC(in_no_route);
2268 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2269 res.type = RTN_UNREACHABLE;
2270 if (err == -ESRCH)
2271 err = -ENETUNREACH;
2272 goto local_input;
	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
2277 martian_destination:
2278 RT_CACHE_STAT_INC(in_martian_dst);
2279 #ifdef CONFIG_IP_ROUTE_VERBOSE
2280 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2281 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2282 &daddr, &saddr, dev->name);
2283 #endif
2285 e_hostunreach:
2286 err = -EHOSTUNREACH;
2287 goto done;
2289 e_inval:
2290 err = -EINVAL;
2291 goto done;
2293 e_nobufs:
2294 err = -ENOBUFS;
2295 goto done;
2297 martian_source:
2298 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto e_inval;
}
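/*
 * ip_route_input() is the entry point for routing received packets.
 * It first probes the route hash cache under RCU; on a miss it either
 * handles multicast destinations directly (see the comment inside) or
 * falls back to ip_route_input_slow().
 *
 * A minimal sketch of how the receive path is expected to call it
 * (illustrative only; see ip_input.c for the real caller):
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	int err = ip_route_input(skb, iph->daddr, iph->saddr,
 *				 iph->tos, skb->dev);
 *	if (err)
 *		goto drop;
 */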
2302 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2303 u8 tos, struct net_device *dev)
2305 struct rtable * rth;
2306 unsigned hash;
2307 int iif = dev->ifindex;
2308 struct net *net;
2310 net = dev_net(dev);
2312 if (!rt_caching(net))
2313 goto skip_cache;
2315 tos &= IPTOS_RT_MASK;
2316 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2318 rcu_read_lock();
2319 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2320 rth = rcu_dereference(rth->u.dst.rt_next)) {
2321 if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
2322 ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
2323 (rth->fl.iif ^ iif) |
2324 rth->fl.oif |
2325 (rth->fl.fl4_tos ^ tos)) == 0 &&
2326 rth->fl.mark == skb->mark &&
2327 net_eq(dev_net(rth->u.dst.dev), net) &&
2328 !rt_is_expired(rth)) {
2329 dst_use(&rth->u.dst, jiffies);
2330 RT_CACHE_STAT_INC(in_hit);
2331 rcu_read_unlock();
2332 skb_dst_set(skb, &rth->u.dst);
2333 return 0;
2335 RT_CACHE_STAT_INC(in_hlist_search);
2337 rcu_read_unlock();
2339 skip_cache:
	/* Multicast recognition logic was moved from the route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As a result, a host on a multicast
	   network acquires a lot of useless route cache entries, e.g. for
	   SDR messages from all over the world.  Now we try to get rid of them.
	   Really, provided the software IP multicast filter is organized
	   reasonably (at least, hashed), it does not cause a slowdown
	   compared with route cache reject entries.
	   Note that multicast routers are not affected, because a
	   route cache entry is created eventually.
	 */
2351 if (ipv4_is_multicast(daddr)) {
2352 struct in_device *in_dev;
2354 rcu_read_lock();
2355 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2356 int our = ip_check_mc(in_dev, daddr, saddr,
2357 ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
2365 rcu_read_unlock();
2366 return ip_route_input_mc(skb, daddr, saddr,
2367 tos, dev, our);
2370 rcu_read_unlock();
2371 return -EINVAL;
	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
}
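/*
 * __mkroute_output() allocates and fills a route cache entry for an
 * output route described by the FIB result and flow keys.  It rejects
 * obviously invalid combinations (e.g. a loopback source on a
 * non-loopback device) and classifies the destination as broadcast,
 * multicast or unicast before wiring up the dst input/output handlers.
 */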
2376 static int __mkroute_output(struct rtable **result,
2377 struct fib_result *res,
2378 const struct flowi *fl,
2379 const struct flowi *oldflp,
2380 struct net_device *dev_out,
2381 unsigned flags)
2383 struct rtable *rth;
2384 struct in_device *in_dev;
2385 u32 tos = RT_FL_TOS(oldflp);
2386 int err = 0;
2388 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2389 return -EINVAL;
2391 if (fl->fl4_dst == htonl(0xFFFFFFFF))
2392 res->type = RTN_BROADCAST;
2393 else if (ipv4_is_multicast(fl->fl4_dst))
2394 res->type = RTN_MULTICAST;
2395 else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2396 return -EINVAL;
2398 if (dev_out->flags & IFF_LOOPBACK)
2399 flags |= RTCF_LOCAL;
	/* get a working reference to the inet device */
2402 in_dev = in_dev_get(dev_out);
2403 if (!in_dev)
2404 return -EINVAL;
2406 if (res->type == RTN_BROADCAST) {
2407 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2408 if (res->fi) {
2409 fib_info_put(res->fi);
2410 res->fi = NULL;
2412 } else if (res->type == RTN_MULTICAST) {
2413 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2414 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2415 oldflp->proto))
2416 flags &= ~RTCF_LOCAL;
		/* If the multicast route does not exist, use the
		   default one, but do not gateway in this case.
		   Yes, it is a hack.
		 */
2421 if (res->fi && res->prefixlen < 4) {
2422 fib_info_put(res->fi);
2423 res->fi = NULL;
2428 rth = dst_alloc(&ipv4_dst_ops);
2429 if (!rth) {
2430 err = -ENOBUFS;
2431 goto cleanup;
2434 atomic_set(&rth->u.dst.__refcnt, 1);
2435 rth->u.dst.flags= DST_HOST;
2436 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2437 rth->u.dst.flags |= DST_NOXFRM;
2438 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2439 rth->u.dst.flags |= DST_NOPOLICY;
2441 rth->fl.fl4_dst = oldflp->fl4_dst;
2442 rth->fl.fl4_tos = tos;
2443 rth->fl.fl4_src = oldflp->fl4_src;
2444 rth->fl.oif = oldflp->oif;
2445 rth->fl.mark = oldflp->mark;
2446 rth->rt_dst = fl->fl4_dst;
2447 rth->rt_src = fl->fl4_src;
2448 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
	/* get references to the devices that are to be held by the routing
	   cache entry */
2451 rth->u.dst.dev = dev_out;
2452 dev_hold(dev_out);
2453 rth->idev = in_dev_get(dev_out);
2454 rth->rt_gateway = fl->fl4_dst;
2455 rth->rt_spec_dst= fl->fl4_src;
2457 rth->u.dst.output=ip_output;
2458 rth->u.dst.obsolete = -1;
2459 rth->rt_genid = rt_genid(dev_net(dev_out));
2461 RT_CACHE_STAT_INC(out_slow_tot);
2463 if (flags & RTCF_LOCAL) {
2464 rth->u.dst.input = ip_local_deliver;
2465 rth->rt_spec_dst = fl->fl4_dst;
2467 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2468 rth->rt_spec_dst = fl->fl4_src;
2469 if (flags & RTCF_LOCAL &&
2470 !(dev_out->flags & IFF_LOOPBACK)) {
2471 rth->u.dst.output = ip_mc_output;
2472 RT_CACHE_STAT_INC(out_slow_mc);
2474 #ifdef CONFIG_IP_MROUTE
2475 if (res->type == RTN_MULTICAST) {
2476 if (IN_DEV_MFORWARD(in_dev) &&
2477 !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2478 rth->u.dst.input = ip_mr_input;
2479 rth->u.dst.output = ip_mc_output;
2482 #endif
2485 rt_set_nexthop(rth, res, 0);
2487 rth->rt_flags = flags;
2489 *result = rth;
2490 cleanup:
	/* release the working reference to the inet device */
2492 in_dev_put(in_dev);
	return err;
}
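/*
 * ip_mkroute_output() is a thin wrapper: build the output route cache
 * entry via __mkroute_output() and, on success, insert it into the
 * route hash table keyed on the original flow.
 */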
2497 static int ip_mkroute_output(struct rtable **rp,
2498 struct fib_result *res,
2499 const struct flowi *fl,
2500 const struct flowi *oldflp,
2501 struct net_device *dev_out,
2502 unsigned flags)
2504 struct rtable *rth = NULL;
2505 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2506 unsigned hash;
2507 if (err == 0) {
2508 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2509 rt_genid(dev_net(dev_out)));
2510 err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
2513 return err;
2517 * Major route resolver routine.
2520 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2521 const struct flowi *oldflp)
2523 u32 tos = RT_FL_TOS(oldflp);
2524 struct flowi fl = { .nl_u = { .ip4_u =
2525 { .daddr = oldflp->fl4_dst,
2526 .saddr = oldflp->fl4_src,
2527 .tos = tos & IPTOS_RT_MASK,
2528 .scope = ((tos & RTO_ONLINK) ?
2529 RT_SCOPE_LINK :
2530 RT_SCOPE_UNIVERSE),
2531 } },
2532 .mark = oldflp->mark,
2533 .iif = net->loopback_dev->ifindex,
2534 .oif = oldflp->oif };
2535 struct fib_result res;
2536 unsigned flags = 0;
2537 struct net_device *dev_out = NULL;
2538 int free_res = 0;
2539 int err;
2542 res.fi = NULL;
2543 #ifdef CONFIG_IP_MULTIPLE_TABLES
2544 res.r = NULL;
2545 #endif
2547 if (oldflp->fl4_src) {
2548 err = -EINVAL;
2549 if (ipv4_is_multicast(oldflp->fl4_src) ||
2550 ipv4_is_lbcast(oldflp->fl4_src) ||
2551 ipv4_is_zeronet(oldflp->fl4_src))
2552 goto out;
		/* I removed a check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return the wrong iface, if saddr
		      is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with an saddr
		      of another iface. --ANK
		 */
2562 if (oldflp->oif == 0 &&
2563 (ipv4_is_multicast(oldflp->fl4_dst) ||
2564 oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2565 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2566 dev_out = ip_dev_find(net, oldflp->fl4_src);
2567 if (dev_out == NULL)
2568 goto out;
			/* Special hack: the user can direct multicasts
			   and limited broadcast via the necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic, vat and friends to work.
			   They bind a socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of the routing cache they are broken,
			   because we are not allowed to build a multicast path
			   with a loopback source addr (the routing cache cannot
			   know that ttl is zero, so the packet will not leave
			   this host and the route is valid).
			   Luckily, this hack is a good workaround.
			 */
2585 fl.oif = dev_out->ifindex;
2586 goto make_route;
2589 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2590 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2591 dev_out = ip_dev_find(net, oldflp->fl4_src);
2592 if (dev_out == NULL)
2593 goto out;
2594 dev_put(dev_out);
2595 dev_out = NULL;
2600 if (oldflp->oif) {
2601 dev_out = dev_get_by_index(net, oldflp->oif);
2602 err = -ENODEV;
2603 if (dev_out == NULL)
2604 goto out;
2606 /* RACE: Check return value of inet_select_addr instead. */
2607 if (__in_dev_get_rtnl(dev_out) == NULL) {
2608 dev_put(dev_out);
2609 goto out; /* Wrong error code */
2612 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2613 oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2614 if (!fl.fl4_src)
2615 fl.fl4_src = inet_select_addr(dev_out, 0,
2616 RT_SCOPE_LINK);
2617 goto make_route;
2619 if (!fl.fl4_src) {
2620 if (ipv4_is_multicast(oldflp->fl4_dst))
2621 fl.fl4_src = inet_select_addr(dev_out, 0,
2622 fl.fl4_scope);
2623 else if (!oldflp->fl4_dst)
2624 fl.fl4_src = inet_select_addr(dev_out, 0,
2625 RT_SCOPE_HOST);
2629 if (!fl.fl4_dst) {
2630 fl.fl4_dst = fl.fl4_src;
2631 if (!fl.fl4_dst)
2632 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2633 if (dev_out)
2634 dev_put(dev_out);
2635 dev_out = net->loopback_dev;
2636 dev_hold(dev_out);
2637 fl.oif = net->loopback_dev->ifindex;
2638 res.type = RTN_LOCAL;
2639 flags |= RTCF_LOCAL;
2640 goto make_route;
2643 if (fib_lookup(net, &fl, &res)) {
2644 res.fi = NULL;
2645 if (oldflp->oif) {
			/* Apparently, the routing tables are wrong. Assume
			   that the destination is on-link.

			   WHY? DW.
			   Because we are allowed to send to an iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, the routing
			   tables are looked up with only one purpose:
			   to check whether the destination is gatewayed,
			   rather than direct. Moreover, if MSG_DONTROUTE is set,
			   we send the packet, ignoring both routing tables
			   and ifaddr state. --ANK

			   We could do this even if oif is unknown,
			   as IPv6 likely does, but we do not.
			 */
2664 if (fl.fl4_src == 0)
2665 fl.fl4_src = inet_select_addr(dev_out, 0,
2666 RT_SCOPE_LINK);
2667 res.type = RTN_UNICAST;
2668 goto make_route;
2670 if (dev_out)
2671 dev_put(dev_out);
2672 err = -ENETUNREACH;
2673 goto out;
2675 free_res = 1;
2677 if (res.type == RTN_LOCAL) {
2678 if (!fl.fl4_src)
2679 fl.fl4_src = fl.fl4_dst;
2680 if (dev_out)
2681 dev_put(dev_out);
2682 dev_out = net->loopback_dev;
2683 dev_hold(dev_out);
2684 fl.oif = dev_out->ifindex;
2685 if (res.fi)
2686 fib_info_put(res.fi);
2687 res.fi = NULL;
2688 flags |= RTCF_LOCAL;
2689 goto make_route;
2692 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2693 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2694 fib_select_multipath(&fl, &res);
2695 else
2696 #endif
2697 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2698 fib_select_default(net, &fl, &res);
2700 if (!fl.fl4_src)
2701 fl.fl4_src = FIB_RES_PREFSRC(res);
2703 if (dev_out)
2704 dev_put(dev_out);
2705 dev_out = FIB_RES_DEV(res);
2706 dev_hold(dev_out);
2707 fl.oif = dev_out->ifindex;
2710 make_route:
2711 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2714 if (free_res)
2715 fib_res_put(&res);
2716 if (dev_out)
2717 dev_put(dev_out);
out:	return err;
}
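/*
 * __ip_route_output_key() is the output-route fast path: look the flow
 * up in the route hash cache (under rcu_read_lock_bh) and fall back to
 * ip_route_output_slow() on a miss or when caching is disabled.
 */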
2721 int __ip_route_output_key(struct net *net, struct rtable **rp,
2722 const struct flowi *flp)
2724 unsigned hash;
2725 struct rtable *rth;
2727 if (!rt_caching(net))
2728 goto slow_output;
2730 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2732 rcu_read_lock_bh();
2733 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2734 rth = rcu_dereference_bh(rth->u.dst.rt_next)) {
2735 if (rth->fl.fl4_dst == flp->fl4_dst &&
2736 rth->fl.fl4_src == flp->fl4_src &&
2737 rth->fl.iif == 0 &&
2738 rth->fl.oif == flp->oif &&
2739 rth->fl.mark == flp->mark &&
2740 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2741 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2742 net_eq(dev_net(rth->u.dst.dev), net) &&
2743 !rt_is_expired(rth)) {
2744 dst_use(&rth->u.dst, jiffies);
2745 RT_CACHE_STAT_INC(out_hit);
2746 rcu_read_unlock_bh();
2747 *rp = rth;
2748 return 0;
2750 RT_CACHE_STAT_INC(out_hlist_search);
2752 rcu_read_unlock_bh();
2754 slow_output:
	return ip_route_output_slow(net, rp, flp);
}
2758 EXPORT_SYMBOL_GPL(__ip_route_output_key);
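/*
 * Blackhole routes deliberately ignore PMTU updates, so the handler
 * below is an empty stub; the dst_ops otherwise reuse the normal IPv4
 * destroy/check callbacks.
 */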
2760 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2764 static struct dst_ops ipv4_dst_blackhole_ops = {
2765 .family = AF_INET,
2766 .protocol = cpu_to_be16(ETH_P_IP),
2767 .destroy = ipv4_dst_destroy,
2768 .check = ipv4_dst_check,
2769 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
	.entries		= ATOMIC_INIT(0),
};
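/*
 * ipv4_dst_blackhole() clones an existing route into a "blackhole"
 * entry whose input/output handlers simply discard packets.  It is
 * used below when __xfrm_lookup() returns -EREMOTE, so that a
 * non-blocking caller still gets a usable (but packet-dropping) dst,
 * typically while IPsec state resolution is pending.
 */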
2774 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2776 struct rtable *ort = *rp;
2777 struct rtable *rt = (struct rtable *)
2778 dst_alloc(&ipv4_dst_blackhole_ops);
2780 if (rt) {
2781 struct dst_entry *new = &rt->u.dst;
2783 atomic_set(&new->__refcnt, 1);
2784 new->__use = 1;
2785 new->input = dst_discard;
2786 new->output = dst_discard;
2787 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2789 new->dev = ort->u.dst.dev;
2790 if (new->dev)
2791 dev_hold(new->dev);
2793 rt->fl = ort->fl;
2795 rt->idev = ort->idev;
2796 if (rt->idev)
2797 in_dev_hold(rt->idev);
2798 rt->rt_genid = rt_genid(net);
2799 rt->rt_flags = ort->rt_flags;
2800 rt->rt_type = ort->rt_type;
2801 rt->rt_dst = ort->rt_dst;
2802 rt->rt_src = ort->rt_src;
2803 rt->rt_iif = ort->rt_iif;
2804 rt->rt_gateway = ort->rt_gateway;
2805 rt->rt_spec_dst = ort->rt_spec_dst;
2806 rt->peer = ort->peer;
2807 if (rt->peer)
2808 atomic_inc(&rt->peer->refcnt);
2810 dst_free(new);
2813 dst_release(&(*rp)->u.dst);
2814 *rp = rt;
	return (rt ? 0 : -ENOMEM);
}
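/*
 * ip_route_output_flow() resolves an output route for @flp and, when a
 * protocol is set, runs the result through the xfrm (IPsec) lookup.
 *
 * A minimal sketch of a typical caller (field values are illustrative):
 *
 *	struct flowi fl = { .oif = 0,
 *			    .nl_u = { .ip4_u = { .daddr = daddr,
 *						 .saddr = saddr,
 *						 .tos = tos } },
 *			    .proto = IPPROTO_UDP };
 *	struct rtable *rt;
 *
 *	if (ip_route_output_flow(net, &rt, &fl, sk, 0))
 *		goto no_route;
 */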
2818 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2819 struct sock *sk, int flags)
2821 int err;
2823 if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2824 return err;
2826 if (flp->proto) {
2827 if (!flp->fl4_src)
2828 flp->fl4_src = (*rp)->rt_src;
2829 if (!flp->fl4_dst)
2830 flp->fl4_dst = (*rp)->rt_dst;
2831 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2832 flags ? XFRM_LOOKUP_WAIT : 0);
2833 if (err == -EREMOTE)
2834 err = ipv4_dst_blackhole(net, rp, flp);
2836 return err;
2839 return 0;
2842 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2844 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
	return ip_route_output_flow(net, rp, flp, NULL, 0);
}
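/*
 * rt_fill_info() dumps one route cache entry into a netlink RTM_NEWROUTE
 * message: flow keys, device, metrics, peer info and, for input routes,
 * possible multicast forwarding data via ipmr_get_route().
 */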
2849 static int rt_fill_info(struct net *net,
2850 struct sk_buff *skb, u32 pid, u32 seq, int event,
2851 int nowait, unsigned int flags)
2853 struct rtable *rt = skb_rtable(skb);
2854 struct rtmsg *r;
2855 struct nlmsghdr *nlh;
2856 long expires;
2857 u32 id = 0, ts = 0, tsage = 0, error;
2859 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2860 if (nlh == NULL)
2861 return -EMSGSIZE;
2863 r = nlmsg_data(nlh);
2864 r->rtm_family = AF_INET;
2865 r->rtm_dst_len = 32;
2866 r->rtm_src_len = 0;
2867 r->rtm_tos = rt->fl.fl4_tos;
2868 r->rtm_table = RT_TABLE_MAIN;
2869 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2870 r->rtm_type = rt->rt_type;
2871 r->rtm_scope = RT_SCOPE_UNIVERSE;
2872 r->rtm_protocol = RTPROT_UNSPEC;
2873 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2874 if (rt->rt_flags & RTCF_NOTIFY)
2875 r->rtm_flags |= RTM_F_NOTIFY;
2877 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2879 if (rt->fl.fl4_src) {
2880 r->rtm_src_len = 32;
2881 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2883 if (rt->u.dst.dev)
2884 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2885 #ifdef CONFIG_NET_CLS_ROUTE
2886 if (rt->u.dst.tclassid)
2887 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2888 #endif
2889 if (rt->fl.iif)
2890 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2891 else if (rt->rt_src != rt->fl.fl4_src)
2892 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2894 if (rt->rt_dst != rt->rt_gateway)
2895 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2897 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2898 goto nla_put_failure;
2900 error = rt->u.dst.error;
2901 expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2902 if (rt->peer) {
2903 id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
2904 if (rt->peer->tcp_ts_stamp) {
2905 ts = rt->peer->tcp_ts;
2906 tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2910 if (rt->fl.iif) {
2911 #ifdef CONFIG_IP_MROUTE
2912 __be32 dst = rt->rt_dst;
2914 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2915 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2916 int err = ipmr_get_route(net, skb, r, nowait);
2917 if (err <= 0) {
2918 if (!nowait) {
2919 if (err == 0)
2920 return 0;
2921 goto nla_put_failure;
2922 } else {
2923 if (err == -EMSGSIZE)
2924 goto nla_put_failure;
2925 error = err;
2928 } else
2929 #endif
2930 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2933 if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2934 expires, error) < 0)
2935 goto nla_put_failure;
2937 return nlmsg_end(skb, nlh);
2939 nla_put_failure:
2940 nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
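/*
 * inet_rtm_getroute() services RTM_GETROUTE requests (e.g. "ip route
 * get"): it builds a dummy skb, resolves the route through either
 * ip_route_input() or ip_route_output_key() depending on whether an
 * input interface was given, and replies with rt_fill_info().
 */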
2944 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2946 struct net *net = sock_net(in_skb->sk);
2947 struct rtmsg *rtm;
2948 struct nlattr *tb[RTA_MAX+1];
2949 struct rtable *rt = NULL;
2950 __be32 dst = 0;
2951 __be32 src = 0;
2952 u32 iif;
2953 int err;
2954 struct sk_buff *skb;
2956 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2957 if (err < 0)
2958 goto errout;
2960 rtm = nlmsg_data(nlh);
2962 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2963 if (skb == NULL) {
2964 err = -ENOBUFS;
2965 goto errout;
	/* Reserve room for dummy headers; this skb can pass
	   through a good chunk of the routing engine.
	 */
2971 skb_reset_mac_header(skb);
2972 skb_reset_network_header(skb);
2974 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2975 ip_hdr(skb)->protocol = IPPROTO_ICMP;
2976 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2978 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2979 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2980 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2982 if (iif) {
2983 struct net_device *dev;
2985 dev = __dev_get_by_index(net, iif);
2986 if (dev == NULL) {
2987 err = -ENODEV;
2988 goto errout_free;
2991 skb->protocol = htons(ETH_P_IP);
2992 skb->dev = dev;
2993 local_bh_disable();
2994 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2995 local_bh_enable();
2997 rt = skb_rtable(skb);
2998 if (err == 0 && rt->u.dst.error)
2999 err = -rt->u.dst.error;
3000 } else {
3001 struct flowi fl = {
3002 .nl_u = {
3003 .ip4_u = {
3004 .daddr = dst,
3005 .saddr = src,
3006 .tos = rtm->rtm_tos,
3009 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3011 err = ip_route_output_key(net, &rt, &fl);
3014 if (err)
3015 goto errout_free;
3017 skb_dst_set(skb, &rt->u.dst);
3018 if (rtm->rtm_flags & RTM_F_NOTIFY)
3019 rt->rt_flags |= RTCF_NOTIFY;
3021 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3022 RTM_NEWROUTE, 0, 0);
3023 if (err <= 0)
3024 goto errout_free;
3026 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3027 errout:
3028 return err;
3030 errout_free:
3031 kfree_skb(skb);
	goto errout;
}
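/*
 * ip_rt_dump() walks the whole route cache hash table for an
 * RTM_GETROUTE dump request, resuming from the bucket/index stored in
 * the netlink callback args.
 */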
3035 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3037 struct rtable *rt;
3038 int h, s_h;
3039 int idx, s_idx;
3040 struct net *net;
3042 net = sock_net(skb->sk);
3044 s_h = cb->args[0];
3045 if (s_h < 0)
3046 s_h = 0;
3047 s_idx = idx = cb->args[1];
3048 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3049 if (!rt_hash_table[h].chain)
3050 continue;
3051 rcu_read_lock_bh();
3052 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3053 rt = rcu_dereference_bh(rt->u.dst.rt_next), idx++) {
3054 if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
3055 continue;
3056 if (rt_is_expired(rt))
3057 continue;
3058 skb_dst_set(skb, dst_clone(&rt->u.dst));
3059 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3060 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3061 1, NLM_F_MULTI) <= 0) {
3062 skb_dst_drop(skb);
3063 rcu_read_unlock_bh();
3064 goto done;
3066 skb_dst_drop(skb);
3068 rcu_read_unlock_bh();
3071 done:
3072 cb->args[0] = h;
3073 cb->args[1] = idx;
3074 return skb->len;
3077 void ip_rt_multicast_event(struct in_device *in_dev)
3079 rt_cache_flush(dev_net(in_dev->dev), 0);
3082 #ifdef CONFIG_SYSCTL
3083 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3084 void __user *buffer,
3085 size_t *lenp, loff_t *ppos)
3087 if (write) {
3088 int flush_delay;
3089 ctl_table ctl;
3090 struct net *net;
3092 memcpy(&ctl, __ctl, sizeof(ctl));
3093 ctl.data = &flush_delay;
3094 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3096 net = (struct net *)__ctl->extra1;
3097 rt_cache_flush(net, flush_delay);
3098 return 0;
	return -EINVAL;
}
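/*
 * rt_secret_reschedule() adjusts the per-namespace secret-rebuild timer
 * after ip_rt_secret_interval has been changed via sysctl: a zero
 * interval stops the timers, otherwise pending expiries are shifted by
 * the difference between the old and new interval.
 */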
3104 static void rt_secret_reschedule(int old)
3106 struct net *net;
3107 int new = ip_rt_secret_interval;
3108 int diff = new - old;
3110 if (!diff)
3111 return;
3113 rtnl_lock();
3114 for_each_net(net) {
3115 int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);
3116 long time;
3118 if (!new)
3119 continue;
3121 if (deleted) {
3122 time = net->ipv4.rt_secret_timer.expires - jiffies;
3124 if (time <= 0 || (time += diff) <= 0)
3125 time = 0;
3126 } else
3127 time = new;
3129 mod_timer(&net->ipv4.rt_secret_timer, jiffies + time);
3131 rtnl_unlock();
3134 static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
3135 void __user *buffer, size_t *lenp,
3136 loff_t *ppos)
3138 int old = ip_rt_secret_interval;
3139 int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);
3141 rt_secret_reschedule(old);
3143 return ret;
3146 static ctl_table ipv4_route_table[] = {
3148 .procname = "gc_thresh",
3149 .data = &ipv4_dst_ops.gc_thresh,
3150 .maxlen = sizeof(int),
3151 .mode = 0644,
3152 .proc_handler = proc_dointvec,
3155 .procname = "max_size",
3156 .data = &ip_rt_max_size,
3157 .maxlen = sizeof(int),
3158 .mode = 0644,
3159 .proc_handler = proc_dointvec,
3162 /* Deprecated. Use gc_min_interval_ms */
3164 .procname = "gc_min_interval",
3165 .data = &ip_rt_gc_min_interval,
3166 .maxlen = sizeof(int),
3167 .mode = 0644,
3168 .proc_handler = proc_dointvec_jiffies,
3171 .procname = "gc_min_interval_ms",
3172 .data = &ip_rt_gc_min_interval,
3173 .maxlen = sizeof(int),
3174 .mode = 0644,
3175 .proc_handler = proc_dointvec_ms_jiffies,
3178 .procname = "gc_timeout",
3179 .data = &ip_rt_gc_timeout,
3180 .maxlen = sizeof(int),
3181 .mode = 0644,
3182 .proc_handler = proc_dointvec_jiffies,
3185 .procname = "gc_interval",
3186 .data = &ip_rt_gc_interval,
3187 .maxlen = sizeof(int),
3188 .mode = 0644,
3189 .proc_handler = proc_dointvec_jiffies,
3192 .procname = "redirect_load",
3193 .data = &ip_rt_redirect_load,
3194 .maxlen = sizeof(int),
3195 .mode = 0644,
3196 .proc_handler = proc_dointvec,
3199 .procname = "redirect_number",
3200 .data = &ip_rt_redirect_number,
3201 .maxlen = sizeof(int),
3202 .mode = 0644,
3203 .proc_handler = proc_dointvec,
3206 .procname = "redirect_silence",
3207 .data = &ip_rt_redirect_silence,
3208 .maxlen = sizeof(int),
3209 .mode = 0644,
3210 .proc_handler = proc_dointvec,
3213 .procname = "error_cost",
3214 .data = &ip_rt_error_cost,
3215 .maxlen = sizeof(int),
3216 .mode = 0644,
3217 .proc_handler = proc_dointvec,
3220 .procname = "error_burst",
3221 .data = &ip_rt_error_burst,
3222 .maxlen = sizeof(int),
3223 .mode = 0644,
3224 .proc_handler = proc_dointvec,
3227 .procname = "gc_elasticity",
3228 .data = &ip_rt_gc_elasticity,
3229 .maxlen = sizeof(int),
3230 .mode = 0644,
3231 .proc_handler = proc_dointvec,
3234 .procname = "mtu_expires",
3235 .data = &ip_rt_mtu_expires,
3236 .maxlen = sizeof(int),
3237 .mode = 0644,
3238 .proc_handler = proc_dointvec_jiffies,
3241 .procname = "min_pmtu",
3242 .data = &ip_rt_min_pmtu,
3243 .maxlen = sizeof(int),
3244 .mode = 0644,
3245 .proc_handler = proc_dointvec,
3248 .procname = "min_adv_mss",
3249 .data = &ip_rt_min_advmss,
3250 .maxlen = sizeof(int),
3251 .mode = 0644,
3252 .proc_handler = proc_dointvec,
3255 .procname = "secret_interval",
3256 .data = &ip_rt_secret_interval,
3257 .maxlen = sizeof(int),
3258 .mode = 0644,
3259 .proc_handler = ipv4_sysctl_rt_secret_interval,
3264 static struct ctl_table empty[1];
3266 static struct ctl_table ipv4_skeleton[] =
3268 { .procname = "route",
3269 .mode = 0555, .child = ipv4_route_table},
3270 { .procname = "neigh",
3271 .mode = 0555, .child = empty},
3275 static __net_initdata struct ctl_path ipv4_path[] = {
3276 { .procname = "net", },
3277 { .procname = "ipv4", },
3278 { },
3281 static struct ctl_table ipv4_route_flush_table[] = {
3283 .procname = "flush",
3284 .maxlen = sizeof(int),
3285 .mode = 0200,
3286 .proc_handler = ipv4_sysctl_rtcache_flush,
3288 { },
3291 static __net_initdata struct ctl_path ipv4_route_path[] = {
3292 { .procname = "net", },
3293 { .procname = "ipv4", },
3294 { .procname = "route", },
	{ },
};
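/*
 * Register the per-namespace "flush" sysctl.  Namespaces other than
 * init_net get their own copy of the table so that extra1 can point at
 * the right struct net for rt_cache_flush().
 */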
3298 static __net_init int sysctl_route_net_init(struct net *net)
3300 struct ctl_table *tbl;
3302 tbl = ipv4_route_flush_table;
3303 if (!net_eq(net, &init_net)) {
3304 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3305 if (tbl == NULL)
3306 goto err_dup;
3308 tbl[0].extra1 = net;
3310 net->ipv4.route_hdr =
3311 register_net_sysctl_table(net, ipv4_route_path, tbl);
3312 if (net->ipv4.route_hdr == NULL)
3313 goto err_reg;
3314 return 0;
3316 err_reg:
3317 if (tbl != ipv4_route_flush_table)
3318 kfree(tbl);
3319 err_dup:
3320 return -ENOMEM;
3323 static __net_exit void sysctl_route_net_exit(struct net *net)
3325 struct ctl_table *tbl;
3327 tbl = net->ipv4.route_hdr->ctl_table_arg;
3328 unregister_net_sysctl_table(net->ipv4.route_hdr);
3329 BUG_ON(tbl == ipv4_route_flush_table);
3330 kfree(tbl);
3333 static __net_initdata struct pernet_operations sysctl_route_ops = {
3334 .init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
3337 #endif
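/*
 * Per-namespace init: seed rt_genid and, if ip_rt_secret_interval is
 * non-zero, start the deferrable timer that periodically rebuilds the
 * route cache hash secret (rt_secret_rebuild).
 */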
3340 static __net_init int rt_secret_timer_init(struct net *net)
3342 atomic_set(&net->ipv4.rt_genid,
3343 (int) ((num_physpages ^ (num_physpages>>8)) ^
3344 (jiffies ^ (jiffies >> 7))));
3346 net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
3347 net->ipv4.rt_secret_timer.data = (unsigned long)net;
3348 init_timer_deferrable(&net->ipv4.rt_secret_timer);
3350 if (ip_rt_secret_interval) {
3351 net->ipv4.rt_secret_timer.expires =
3352 jiffies + net_random() % ip_rt_secret_interval +
3353 ip_rt_secret_interval;
3354 add_timer(&net->ipv4.rt_secret_timer);
3356 return 0;
3359 static __net_exit void rt_secret_timer_exit(struct net *net)
3361 del_timer_sync(&net->ipv4.rt_secret_timer);
3364 static __net_initdata struct pernet_operations rt_secret_timer_ops = {
3365 .init = rt_secret_timer_init,
3366 .exit = rt_secret_timer_exit,
3370 #ifdef CONFIG_NET_CLS_ROUTE
3371 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3372 #endif /* CONFIG_NET_CLS_ROUTE */
3374 static __initdata unsigned long rhash_entries;
3375 static int __init set_rhash_entries(char *str)
3377 if (!str)
3378 return 0;
3379 rhash_entries = simple_strtoul(str, &str, 0);
3380 return 1;
3382 __setup("rhash_entries=", set_rhash_entries);
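/*
 * ip_rt_init() is called at boot: allocate the route cache hash table
 * (sized by the "rhash_entries=" boot parameter when given), set up the
 * dst kmem cache, GC thresholds, periodic workers, per-net subsystems
 * and the RTM_GETROUTE handler.
 */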
3384 int __init ip_rt_init(void)
3386 int rc = 0;
3388 #ifdef CONFIG_NET_CLS_ROUTE
3389 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3390 if (!ip_rt_acct)
3391 panic("IP: failed to allocate ip_rt_acct\n");
3392 #endif
3394 ipv4_dst_ops.kmem_cachep =
3395 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3396 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3398 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3400 rt_hash_table = (struct rt_hash_bucket *)
3401 alloc_large_system_hash("IP route cache",
3402 sizeof(struct rt_hash_bucket),
3403 rhash_entries,
3404 (totalram_pages >= 128 * 1024) ?
3405 15 : 17,
3407 &rt_hash_log,
3408 &rt_hash_mask,
3409 rhash_entries ? 0 : 512 * 1024);
3410 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3411 rt_hash_lock_init();
3413 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3414 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3416 devinet_init();
3417 ip_fib_init();
	/* All the timers started at system startup tend
	 * to synchronize. Perturb them a bit.
	 */
3422 INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3423 expires_ljiffies = jiffies;
3424 schedule_delayed_work(&expires_work,
3425 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3427 if (register_pernet_subsys(&rt_secret_timer_ops))
3428 printk(KERN_ERR "Unable to setup rt_secret_timer\n");
3430 if (ip_rt_proc_init())
3431 printk(KERN_ERR "Unable to create route proc files\n");
3432 #ifdef CONFIG_XFRM
3433 xfrm_init();
3434 xfrm4_init(ip_rt_max_size);
3435 #endif
3436 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3438 #ifdef CONFIG_SYSCTL
3439 register_pernet_subsys(&sysctl_route_ops);
3440 #endif
3441 return rc;
3444 #ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
3449 void __init ip_static_sysctl_init(void)
3451 register_sysctl_paths(ipv4_path, ipv4_skeleton);
3453 #endif
3455 EXPORT_SYMBOL(__ip_select_ident);
3456 EXPORT_SYMBOL(ip_route_input);
3457 EXPORT_SYMBOL(ip_route_output_key);