[NET]: Make the device list and device lookups per namespace.
net/ipv4/route.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
8 * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
16 * Fixes:
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
23 * Alan Cox : Super /proc >4K
24 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
26 * clamper.
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36 * Alan Cox : Aligned routing errors more closely with BSD
37 * our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
42 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Split to fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file
57 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
58 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
59 * Ilia Sotnikov : Removed TOS from hash calculations
61 * This program is free software; you can redistribute it and/or
62 * modify it under the terms of the GNU General Public License
63 * as published by the Free Software Foundation; either version
64 * 2 of the License, or (at your option) any later version.
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/mm.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
79 #include <linux/in.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/skbuff.h>
85 #include <linux/inetdevice.h>
86 #include <linux/igmp.h>
87 #include <linux/pkt_sched.h>
88 #include <linux/mroute.h>
89 #include <linux/netfilter_ipv4.h>
90 #include <linux/random.h>
91 #include <linux/jhash.h>
92 #include <linux/rcupdate.h>
93 #include <linux/times.h>
94 #include <net/net_namespace.h>
95 #include <net/protocol.h>
96 #include <net/ip.h>
97 #include <net/route.h>
98 #include <net/inetpeer.h>
99 #include <net/sock.h>
100 #include <net/ip_fib.h>
101 #include <net/arp.h>
102 #include <net/tcp.h>
103 #include <net/icmp.h>
104 #include <net/xfrm.h>
105 #include <net/netevent.h>
106 #include <net/rtnetlink.h>
107 #ifdef CONFIG_SYSCTL
108 #include <linux/sysctl.h>
109 #endif
111 #define RT_FL_TOS(oldflp) \
112 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114 #define IP_MAX_MTU 0xFFF0
116 #define RT_GC_TIMEOUT (300*HZ)
118 static int ip_rt_min_delay = 2 * HZ;
119 static int ip_rt_max_delay = 10 * HZ;
120 static int ip_rt_max_size;
121 static int ip_rt_gc_timeout = RT_GC_TIMEOUT;
122 static int ip_rt_gc_interval = 60 * HZ;
123 static int ip_rt_gc_min_interval = HZ / 2;
124 static int ip_rt_redirect_number = 9;
125 static int ip_rt_redirect_load = HZ / 50;
126 static int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1));
127 static int ip_rt_error_cost = HZ;
128 static int ip_rt_error_burst = 5 * HZ;
129 static int ip_rt_gc_elasticity = 8;
130 static int ip_rt_mtu_expires = 10 * 60 * HZ;
131 static int ip_rt_min_pmtu = 512 + 20 + 20;
132 static int ip_rt_min_advmss = 256;
133 static int ip_rt_secret_interval = 10 * 60 * HZ;
134 static unsigned long rt_deadline;
136 #define RTprint(a...) printk(KERN_DEBUG a)
138 static struct timer_list rt_flush_timer;
139 static struct timer_list rt_periodic_timer;
140 static struct timer_list rt_secret_timer;
143 * Interface to generic destination cache.
146 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
147 static void ipv4_dst_destroy(struct dst_entry *dst);
148 static void ipv4_dst_ifdown(struct dst_entry *dst,
149 struct net_device *dev, int how);
150 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
151 static void ipv4_link_failure(struct sk_buff *skb);
152 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
153 static int rt_garbage_collect(void);
156 static struct dst_ops ipv4_dst_ops = {
157 .family = AF_INET,
158 .protocol = __constant_htons(ETH_P_IP),
159 .gc = rt_garbage_collect,
160 .check = ipv4_dst_check,
161 .destroy = ipv4_dst_destroy,
162 .ifdown = ipv4_dst_ifdown,
163 .negative_advice = ipv4_negative_advice,
164 .link_failure = ipv4_link_failure,
165 .update_pmtu = ip_rt_update_pmtu,
166 .entry_size = sizeof(struct rtable),
169 #define ECN_OR_COST(class) TC_PRIO_##class
171 const __u8 ip_tos2prio[16] = {
172 TC_PRIO_BESTEFFORT,
173 ECN_OR_COST(FILLER),
174 TC_PRIO_BESTEFFORT,
175 ECN_OR_COST(BESTEFFORT),
176 TC_PRIO_BULK,
177 ECN_OR_COST(BULK),
178 TC_PRIO_BULK,
179 ECN_OR_COST(BULK),
180 TC_PRIO_INTERACTIVE,
181 ECN_OR_COST(INTERACTIVE),
182 TC_PRIO_INTERACTIVE,
183 ECN_OR_COST(INTERACTIVE),
184 TC_PRIO_INTERACTIVE_BULK,
185 ECN_OR_COST(INTERACTIVE_BULK),
186 TC_PRIO_INTERACTIVE_BULK,
187 ECN_OR_COST(INTERACTIVE_BULK)
192 * Route cache.
195 /* The locking scheme is rather straightforward:
197 * 1) Read-Copy Update protects the buckets of the central route hash.
198 * 2) Only writers remove entries, and they hold the lock
199 * as they look at rtable reference counts.
200 * 3) Only readers acquire references to rtable entries,
201 * they do so with atomic increments and with the
202 * lock held.
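/*
 * A minimal sketch (not part of this file) of what that scheme implies,
 * using the helpers defined further down; match() here stands for the
 * flow-key comparison done by the real lookup:
 *
 *	// reader side: RCU walk of a bucket, reference taken with atomics only
 *	rcu_read_lock();
 *	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference(rth->u.dst.rt_next))
 *		if (match(rth)) { dst_hold(&rth->u.dst); break; }
 *	rcu_read_unlock();
 *
 *	// writer side: unlink under the per-bucket lock, free via RCU
 *	spin_lock_bh(rt_hash_lock_addr(hash));
 *	*rthp = rth->u.dst.rt_next;
 *	rt_free(rth);			// call_rcu_bh() under the hood
 *	spin_unlock_bh(rt_hash_lock_addr(hash));
 */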
205 struct rt_hash_bucket {
206 struct rtable *chain;
208 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
209 defined(CONFIG_PROVE_LOCKING)
211 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
212 * The size of this table is a power of two and depends on the number of CPUs.
213 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
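/*
 * For example, with RT_HASH_LOCK_SZ = 256 the buckets 5, 261, 517, ... all
 * share rt_hash_locks[5]: the array size is a power of two, so the
 * "slot & (RT_HASH_LOCK_SZ - 1)" in rt_hash_lock_addr() is a cheap modulo.
 */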
215 #ifdef CONFIG_LOCKDEP
216 # define RT_HASH_LOCK_SZ 256
217 #else
218 # if NR_CPUS >= 32
219 # define RT_HASH_LOCK_SZ 4096
220 # elif NR_CPUS >= 16
221 # define RT_HASH_LOCK_SZ 2048
222 # elif NR_CPUS >= 8
223 # define RT_HASH_LOCK_SZ 1024
224 # elif NR_CPUS >= 4
225 # define RT_HASH_LOCK_SZ 512
226 # else
227 # define RT_HASH_LOCK_SZ 256
228 # endif
229 #endif
231 static spinlock_t *rt_hash_locks;
232 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
233 # define rt_hash_lock_init() { \
234 int i; \
235 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
236 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
237 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
238 spin_lock_init(&rt_hash_locks[i]); \
240 #else
241 # define rt_hash_lock_addr(slot) NULL
242 # define rt_hash_lock_init()
243 #endif
245 static struct rt_hash_bucket *rt_hash_table;
246 static unsigned rt_hash_mask;
247 static int rt_hash_log;
248 static unsigned int rt_hash_rnd;
250 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
251 #define RT_CACHE_STAT_INC(field) \
252 (__raw_get_cpu_var(rt_cache_stat).field++)
254 static int rt_intern_hash(unsigned hash, struct rtable *rth,
255 struct rtable **res);
257 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
259 return (jhash_2words(daddr, saddr, rt_hash_rnd)
260 & rt_hash_mask);
263 #define rt_hash(daddr, saddr, idx) \
264 rt_hash_code((__force u32)(__be32)(daddr),\
265 (__force u32)(__be32)(saddr) ^ ((idx) << 5))
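/*
 * Example of use (values are illustrative only): the input path below looks
 * up a packet's bucket with
 *
 *	hash = rt_hash(daddr, saddr, dev->ifindex);
 *
 * which is jhash_2words() over daddr and (saddr ^ (ifindex << 5)), keyed by
 * rt_hash_rnd and masked with rt_hash_mask; the same flow therefore hits the
 * same bucket until rt_hash_rnd is regenerated by rt_run_flush().
 */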
267 #ifdef CONFIG_PROC_FS
268 struct rt_cache_iter_state {
269 int bucket;
272 static struct rtable *rt_cache_get_first(struct seq_file *seq)
274 struct rtable *r = NULL;
275 struct rt_cache_iter_state *st = seq->private;
277 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
278 rcu_read_lock_bh();
279 r = rt_hash_table[st->bucket].chain;
280 if (r)
281 break;
282 rcu_read_unlock_bh();
284 return r;
287 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
289 struct rt_cache_iter_state *st = rcu_dereference(seq->private);
291 r = r->u.dst.rt_next;
292 while (!r) {
293 rcu_read_unlock_bh();
294 if (--st->bucket < 0)
295 break;
296 rcu_read_lock_bh();
297 r = rt_hash_table[st->bucket].chain;
299 return r;
302 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
304 struct rtable *r = rt_cache_get_first(seq);
306 if (r)
307 while (pos && (r = rt_cache_get_next(seq, r)))
308 --pos;
309 return pos ? NULL : r;
312 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
314 return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
317 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
319 struct rtable *r = NULL;
321 if (v == SEQ_START_TOKEN)
322 r = rt_cache_get_first(seq);
323 else
324 r = rt_cache_get_next(seq, v);
325 ++*pos;
326 return r;
329 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
331 if (v && v != SEQ_START_TOKEN)
332 rcu_read_unlock_bh();
335 static int rt_cache_seq_show(struct seq_file *seq, void *v)
337 if (v == SEQ_START_TOKEN)
338 seq_printf(seq, "%-127s\n",
339 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
340 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
341 "HHUptod\tSpecDst");
342 else {
343 struct rtable *r = v;
344 char temp[256];
346 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
347 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
348 r->u.dst.dev ? r->u.dst.dev->name : "*",
349 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
350 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
351 r->u.dst.__use, 0, (unsigned long)r->rt_src,
352 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
353 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
354 dst_metric(&r->u.dst, RTAX_WINDOW),
355 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
356 dst_metric(&r->u.dst, RTAX_RTTVAR)),
357 r->fl.fl4_tos,
358 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
359 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
360 dev_queue_xmit) : 0,
361 r->rt_spec_dst);
362 seq_printf(seq, "%-127s\n", temp);
364 return 0;
367 static const struct seq_operations rt_cache_seq_ops = {
368 .start = rt_cache_seq_start,
369 .next = rt_cache_seq_next,
370 .stop = rt_cache_seq_stop,
371 .show = rt_cache_seq_show,
374 static int rt_cache_seq_open(struct inode *inode, struct file *file)
376 struct seq_file *seq;
377 int rc = -ENOMEM;
378 struct rt_cache_iter_state *s;
380 s = kzalloc(sizeof(*s), GFP_KERNEL);
381 if (!s)
382 goto out;
383 rc = seq_open(file, &rt_cache_seq_ops);
384 if (rc)
385 goto out_kfree;
386 seq = file->private_data;
387 seq->private = s;
388 out:
389 return rc;
390 out_kfree:
391 kfree(s);
392 goto out;
395 static const struct file_operations rt_cache_seq_fops = {
396 .owner = THIS_MODULE,
397 .open = rt_cache_seq_open,
398 .read = seq_read,
399 .llseek = seq_lseek,
400 .release = seq_release_private,
404 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
406 int cpu;
408 if (*pos == 0)
409 return SEQ_START_TOKEN;
411 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
412 if (!cpu_possible(cpu))
413 continue;
414 *pos = cpu+1;
415 return &per_cpu(rt_cache_stat, cpu);
417 return NULL;
420 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
422 int cpu;
424 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
425 if (!cpu_possible(cpu))
426 continue;
427 *pos = cpu+1;
428 return &per_cpu(rt_cache_stat, cpu);
430 return NULL;
434 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
439 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
441 struct rt_cache_stat *st = v;
443 if (v == SEQ_START_TOKEN) {
444 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
445 return 0;
448 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
449 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
450 atomic_read(&ipv4_dst_ops.entries),
451 st->in_hit,
452 st->in_slow_tot,
453 st->in_slow_mc,
454 st->in_no_route,
455 st->in_brd,
456 st->in_martian_dst,
457 st->in_martian_src,
459 st->out_hit,
460 st->out_slow_tot,
461 st->out_slow_mc,
463 st->gc_total,
464 st->gc_ignored,
465 st->gc_goal_miss,
466 st->gc_dst_overflow,
467 st->in_hlist_search,
468 st->out_hlist_search
470 return 0;
473 static const struct seq_operations rt_cpu_seq_ops = {
474 .start = rt_cpu_seq_start,
475 .next = rt_cpu_seq_next,
476 .stop = rt_cpu_seq_stop,
477 .show = rt_cpu_seq_show,
481 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
483 return seq_open(file, &rt_cpu_seq_ops);
486 static const struct file_operations rt_cpu_seq_fops = {
487 .owner = THIS_MODULE,
488 .open = rt_cpu_seq_open,
489 .read = seq_read,
490 .llseek = seq_lseek,
491 .release = seq_release,
494 #endif /* CONFIG_PROC_FS */
496 static __inline__ void rt_free(struct rtable *rt)
498 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
501 static __inline__ void rt_drop(struct rtable *rt)
503 ip_rt_put(rt);
504 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
507 static __inline__ int rt_fast_clean(struct rtable *rth)
509 /* Kill broadcast/multicast entries very aggressively, if they
510 collide in the hash table with more useful entries */
511 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
512 rth->fl.iif && rth->u.dst.rt_next;
515 static __inline__ int rt_valuable(struct rtable *rth)
517 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
518 rth->u.dst.expires;
521 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
523 unsigned long age;
524 int ret = 0;
526 if (atomic_read(&rth->u.dst.__refcnt))
527 goto out;
529 ret = 1;
530 if (rth->u.dst.expires &&
531 time_after_eq(jiffies, rth->u.dst.expires))
532 goto out;
534 age = jiffies - rth->u.dst.lastuse;
535 ret = 0;
536 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
537 (age <= tmo2 && rt_valuable(rth)))
538 goto out;
539 ret = 1;
540 out: return ret;
543 /* Bits of score are:
544 * 31: very valuable
545 * 30: not quite useless
546 * 29..0: usage counter
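/*
 * (So an unreferenced, non-valuable input broadcast/multicast entry that was
 *  last used long ago ends up with the smallest score, and rt_intern_hash()
 *  below picks the minimum-score entry of an over-long chain as its eviction
 *  candidate.)
 */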
548 static inline u32 rt_score(struct rtable *rt)
550 u32 score = jiffies - rt->u.dst.lastuse;
552 score = ~score & ~(3<<30);
554 if (rt_valuable(rt))
555 score |= (1<<31);
557 if (!rt->fl.iif ||
558 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
559 score |= (1<<30);
561 return score;
564 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
566 return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
567 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
568 (fl1->mark ^ fl2->mark) |
569 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
570 *(u16 *)&fl2->nl_u.ip4_u.tos) |
571 (fl1->oif ^ fl2->oif) |
572 (fl1->iif ^ fl2->iif)) == 0;
575 /* This runs via a timer and thus is always in BH context. */
576 static void rt_check_expire(unsigned long dummy)
578 static unsigned int rover;
579 unsigned int i = rover, goal;
580 struct rtable *rth, **rthp;
581 unsigned long now = jiffies;
582 u64 mult;
584 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
585 if (ip_rt_gc_timeout > 1)
586 do_div(mult, ip_rt_gc_timeout);
587 goal = (unsigned int)mult;
588 if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
589 for (; goal > 0; goal--) {
590 unsigned long tmo = ip_rt_gc_timeout;
592 i = (i + 1) & rt_hash_mask;
593 rthp = &rt_hash_table[i].chain;
595 if (*rthp == 0)
596 continue;
597 spin_lock(rt_hash_lock_addr(i));
598 while ((rth = *rthp) != NULL) {
599 if (rth->u.dst.expires) {
600 /* Entry is expired even if it is in use */
601 if (time_before_eq(now, rth->u.dst.expires)) {
602 tmo >>= 1;
603 rthp = &rth->u.dst.rt_next;
604 continue;
606 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
607 tmo >>= 1;
608 rthp = &rth->u.dst.rt_next;
609 continue;
612 /* Cleanup aged off entries. */
613 *rthp = rth->u.dst.rt_next;
614 rt_free(rth);
616 spin_unlock(rt_hash_lock_addr(i));
618 /* Fallback loop breaker. */
619 if (time_after(jiffies, now))
620 break;
622 rover = i;
623 mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
626 /* This can run from both BH and non-BH contexts, the latter
627 * in the case of a forced flush event.
629 static void rt_run_flush(unsigned long dummy)
631 int i;
632 struct rtable *rth, *next;
634 rt_deadline = 0;
636 get_random_bytes(&rt_hash_rnd, 4);
638 for (i = rt_hash_mask; i >= 0; i--) {
639 spin_lock_bh(rt_hash_lock_addr(i));
640 rth = rt_hash_table[i].chain;
641 if (rth)
642 rt_hash_table[i].chain = NULL;
643 spin_unlock_bh(rt_hash_lock_addr(i));
645 for (; rth; rth = next) {
646 next = rth->u.dst.rt_next;
647 rt_free(rth);
652 static DEFINE_SPINLOCK(rt_flush_lock);
654 void rt_cache_flush(int delay)
656 unsigned long now = jiffies;
657 int user_mode = !in_softirq();
659 if (delay < 0)
660 delay = ip_rt_min_delay;
662 spin_lock_bh(&rt_flush_lock);
664 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
665 long tmo = (long)(rt_deadline - now);
667 /* If the flush timer is already running
668 and the flush request is not immediate (delay > 0):
670 if the deadline has not been reached, prolong the timer to "delay",
671 otherwise fire it at the deadline.
674 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
675 tmo = 0;
677 if (delay > tmo)
678 delay = tmo;
681 if (delay <= 0) {
682 spin_unlock_bh(&rt_flush_lock);
683 rt_run_flush(0);
684 return;
687 if (rt_deadline == 0)
688 rt_deadline = now + ip_rt_max_delay;
690 mod_timer(&rt_flush_timer, now+delay);
691 spin_unlock_bh(&rt_flush_lock);
694 static void rt_secret_rebuild(unsigned long dummy)
696 unsigned long now = jiffies;
698 rt_cache_flush(0);
699 mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
703 Short description of GC goals.
705 We want to build an algorithm which keeps the routing cache
706 at an equilibrium point, where the number of aged-off entries
707 stays approximately equal to the number of newly generated ones.
709 The current expiration strength is the variable "expire".
710 We try to adjust it dynamically, so that when the network
711 is idle, expire is large enough to keep enough warm entries,
712 and when load increases it shrinks to limit the cache size.
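/*
 * A rough worked example (illustrative numbers only): with the default
 * ip_rt_gc_elasticity of 8 and 2^rt_hash_log buckets, rt_garbage_collect()
 * first aims for
 *
 *	goal = entries - (8 << rt_hash_log)
 *
 * expired entries; when the cache is below that soft limit, the goal is
 * recomputed against "equilibrium" (at least gc_thresh) instead.  "expire"
 * grows by ip_rt_gc_min_interval after a successful pass and is halved on
 * every pass that misses its goal, which is the dynamic adjustment
 * described above.
 */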
715 static int rt_garbage_collect(void)
717 static unsigned long expire = RT_GC_TIMEOUT;
718 static unsigned long last_gc;
719 static int rover;
720 static int equilibrium;
721 struct rtable *rth, **rthp;
722 unsigned long now = jiffies;
723 int goal;
726 * Garbage collection is pretty expensive,
727 * so do not run it too frequently.
730 RT_CACHE_STAT_INC(gc_total);
732 if (now - last_gc < ip_rt_gc_min_interval &&
733 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
734 RT_CACHE_STAT_INC(gc_ignored);
735 goto out;
738 /* Calculate the number of entries we want to expire now. */
739 goal = atomic_read(&ipv4_dst_ops.entries) -
740 (ip_rt_gc_elasticity << rt_hash_log);
741 if (goal <= 0) {
742 if (equilibrium < ipv4_dst_ops.gc_thresh)
743 equilibrium = ipv4_dst_ops.gc_thresh;
744 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
745 if (goal > 0) {
746 equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
747 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
749 } else {
750 /* We are in a dangerous area. Try to reduce the cache really
751 * aggressively.
753 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
754 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
757 if (now - last_gc >= ip_rt_gc_min_interval)
758 last_gc = now;
760 if (goal <= 0) {
761 equilibrium += goal;
762 goto work_done;
765 do {
766 int i, k;
768 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
769 unsigned long tmo = expire;
771 k = (k + 1) & rt_hash_mask;
772 rthp = &rt_hash_table[k].chain;
773 spin_lock_bh(rt_hash_lock_addr(k));
774 while ((rth = *rthp) != NULL) {
775 if (!rt_may_expire(rth, tmo, expire)) {
776 tmo >>= 1;
777 rthp = &rth->u.dst.rt_next;
778 continue;
780 *rthp = rth->u.dst.rt_next;
781 rt_free(rth);
782 goal--;
784 spin_unlock_bh(rt_hash_lock_addr(k));
785 if (goal <= 0)
786 break;
788 rover = k;
790 if (goal <= 0)
791 goto work_done;
793 /* The goal was not achieved. We stop the process if:
795 - expire has been reduced to zero; otherwise, expire is halved.
796 - the table is not full.
797 - we are called from interrupt context.
798 - the jiffies check is just a fallback/debug loop breaker.
799 We will not spin here for a long time in any case.
802 RT_CACHE_STAT_INC(gc_goal_miss);
804 if (expire == 0)
805 break;
807 expire >>= 1;
808 #if RT_CACHE_DEBUG >= 2
809 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
810 atomic_read(&ipv4_dst_ops.entries), goal, i);
811 #endif
813 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
814 goto out;
815 } while (!in_softirq() && time_before_eq(jiffies, now));
817 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
818 goto out;
819 if (net_ratelimit())
820 printk(KERN_WARNING "dst cache overflow\n");
821 RT_CACHE_STAT_INC(gc_dst_overflow);
822 return 1;
824 work_done:
825 expire += ip_rt_gc_min_interval;
826 if (expire > ip_rt_gc_timeout ||
827 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
828 expire = ip_rt_gc_timeout;
829 #if RT_CACHE_DEBUG >= 2
830 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
831 atomic_read(&ipv4_dst_ops.entries), goal, rover);
832 #endif
833 out: return 0;
836 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
838 struct rtable *rth, **rthp;
839 unsigned long now;
840 struct rtable *cand, **candp;
841 u32 min_score;
842 int chain_length;
843 int attempts = !in_softirq();
845 restart:
846 chain_length = 0;
847 min_score = ~(u32)0;
848 cand = NULL;
849 candp = NULL;
850 now = jiffies;
852 rthp = &rt_hash_table[hash].chain;
854 spin_lock_bh(rt_hash_lock_addr(hash));
855 while ((rth = *rthp) != NULL) {
856 if (compare_keys(&rth->fl, &rt->fl)) {
857 /* Put it first */
858 *rthp = rth->u.dst.rt_next;
860 * Since lookup is lockfree, the deletion
861 * must be visible to another weakly ordered CPU before
862 * the insertion at the start of the hash chain.
864 rcu_assign_pointer(rth->u.dst.rt_next,
865 rt_hash_table[hash].chain);
867 * Since lookup is lockfree, the update writes
868 * must be ordered for consistency on SMP.
870 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
872 rth->u.dst.__use++;
873 dst_hold(&rth->u.dst);
874 rth->u.dst.lastuse = now;
875 spin_unlock_bh(rt_hash_lock_addr(hash));
877 rt_drop(rt);
878 *rp = rth;
879 return 0;
882 if (!atomic_read(&rth->u.dst.__refcnt)) {
883 u32 score = rt_score(rth);
885 if (score <= min_score) {
886 cand = rth;
887 candp = rthp;
888 min_score = score;
892 chain_length++;
894 rthp = &rth->u.dst.rt_next;
897 if (cand) {
898 /* ip_rt_gc_elasticity used to be the average chain length;
899 * when it is exceeded, gc becomes really aggressive.
901 * The second limit is less certain. At the moment it allows
902 * only 2 entries per bucket. We will see.
904 if (chain_length > ip_rt_gc_elasticity) {
905 *candp = cand->u.dst.rt_next;
906 rt_free(cand);
910 /* Try to bind the route to an ARP neighbour only if it is an output
911 route or on the unicast forwarding path.
913 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
914 int err = arp_bind_neighbour(&rt->u.dst);
915 if (err) {
916 spin_unlock_bh(rt_hash_lock_addr(hash));
918 if (err != -ENOBUFS) {
919 rt_drop(rt);
920 return err;
923 /* The neighbour tables are full and nothing
924 can be released. Try to shrink the route cache;
925 it most likely holds some neighbour records.
927 if (attempts-- > 0) {
928 int saved_elasticity = ip_rt_gc_elasticity;
929 int saved_int = ip_rt_gc_min_interval;
930 ip_rt_gc_elasticity = 1;
931 ip_rt_gc_min_interval = 0;
932 rt_garbage_collect();
933 ip_rt_gc_min_interval = saved_int;
934 ip_rt_gc_elasticity = saved_elasticity;
935 goto restart;
938 if (net_ratelimit())
939 printk(KERN_WARNING "Neighbour table overflow.\n");
940 rt_drop(rt);
941 return -ENOBUFS;
945 rt->u.dst.rt_next = rt_hash_table[hash].chain;
946 #if RT_CACHE_DEBUG >= 2
947 if (rt->u.dst.rt_next) {
948 struct rtable *trt;
949 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
950 NIPQUAD(rt->rt_dst));
951 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
952 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
953 printk("\n");
955 #endif
956 rt_hash_table[hash].chain = rt;
957 spin_unlock_bh(rt_hash_lock_addr(hash));
958 *rp = rt;
959 return 0;
962 void rt_bind_peer(struct rtable *rt, int create)
964 static DEFINE_SPINLOCK(rt_peer_lock);
965 struct inet_peer *peer;
967 peer = inet_getpeer(rt->rt_dst, create);
969 spin_lock_bh(&rt_peer_lock);
970 if (rt->peer == NULL) {
971 rt->peer = peer;
972 peer = NULL;
974 spin_unlock_bh(&rt_peer_lock);
975 if (peer)
976 inet_putpeer(peer);
980 * Peer allocation may fail only in serious out-of-memory conditions. However,
981 * we can still generate some output.
982 * Random ID selection looks a bit dangerous because we have no chance of
983 * selecting an ID that is unique within a reasonable period of time.
984 * But a broken packet identifier may be better than no packet at all.
986 static void ip_select_fb_ident(struct iphdr *iph)
988 static DEFINE_SPINLOCK(ip_fb_id_lock);
989 static u32 ip_fallback_id;
990 u32 salt;
992 spin_lock_bh(&ip_fb_id_lock);
993 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
994 iph->id = htons(salt & 0xFFFF);
995 ip_fallback_id = salt;
996 spin_unlock_bh(&ip_fb_id_lock);
999 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1001 struct rtable *rt = (struct rtable *) dst;
1003 if (rt) {
1004 if (rt->peer == NULL)
1005 rt_bind_peer(rt, 1);
1007 /* If a peer is attached to the destination, it is never detached,
1008 so we need not grab a lock to dereference it.
1010 if (rt->peer) {
1011 iph->id = htons(inet_getid(rt->peer, more));
1012 return;
1014 } else
1015 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1016 __builtin_return_address(0));
1018 ip_select_fb_ident(iph);
1021 static void rt_del(unsigned hash, struct rtable *rt)
1023 struct rtable **rthp;
1025 spin_lock_bh(rt_hash_lock_addr(hash));
1026 ip_rt_put(rt);
1027 for (rthp = &rt_hash_table[hash].chain; *rthp;
1028 rthp = &(*rthp)->u.dst.rt_next)
1029 if (*rthp == rt) {
1030 *rthp = rt->u.dst.rt_next;
1031 rt_free(rt);
1032 break;
1034 spin_unlock_bh(rt_hash_lock_addr(hash));
1037 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1038 __be32 saddr, struct net_device *dev)
1040 int i, k;
1041 struct in_device *in_dev = in_dev_get(dev);
1042 struct rtable *rth, **rthp;
1043 __be32 skeys[2] = { saddr, 0 };
1044 int ikeys[2] = { dev->ifindex, 0 };
1045 struct netevent_redirect netevent;
1047 if (!in_dev)
1048 return;
1050 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1051 || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1052 goto reject_redirect;
1054 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1055 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1056 goto reject_redirect;
1057 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1058 goto reject_redirect;
1059 } else {
1060 if (inet_addr_type(new_gw) != RTN_UNICAST)
1061 goto reject_redirect;
1064 for (i = 0; i < 2; i++) {
1065 for (k = 0; k < 2; k++) {
1066 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1068 rthp=&rt_hash_table[hash].chain;
1070 rcu_read_lock();
1071 while ((rth = rcu_dereference(*rthp)) != NULL) {
1072 struct rtable *rt;
1074 if (rth->fl.fl4_dst != daddr ||
1075 rth->fl.fl4_src != skeys[i] ||
1076 rth->fl.oif != ikeys[k] ||
1077 rth->fl.iif != 0) {
1078 rthp = &rth->u.dst.rt_next;
1079 continue;
1082 if (rth->rt_dst != daddr ||
1083 rth->rt_src != saddr ||
1084 rth->u.dst.error ||
1085 rth->rt_gateway != old_gw ||
1086 rth->u.dst.dev != dev)
1087 break;
1089 dst_hold(&rth->u.dst);
1090 rcu_read_unlock();
1092 rt = dst_alloc(&ipv4_dst_ops);
1093 if (rt == NULL) {
1094 ip_rt_put(rth);
1095 in_dev_put(in_dev);
1096 return;
1099 /* Copy all the information. */
1100 *rt = *rth;
1101 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1102 rt->u.dst.__use = 1;
1103 atomic_set(&rt->u.dst.__refcnt, 1);
1104 rt->u.dst.child = NULL;
1105 if (rt->u.dst.dev)
1106 dev_hold(rt->u.dst.dev);
1107 if (rt->idev)
1108 in_dev_hold(rt->idev);
1109 rt->u.dst.obsolete = 0;
1110 rt->u.dst.lastuse = jiffies;
1111 rt->u.dst.path = &rt->u.dst;
1112 rt->u.dst.neighbour = NULL;
1113 rt->u.dst.hh = NULL;
1114 rt->u.dst.xfrm = NULL;
1116 rt->rt_flags |= RTCF_REDIRECTED;
1118 /* Gateway is different ... */
1119 rt->rt_gateway = new_gw;
1121 /* Redirect received -> path was valid */
1122 dst_confirm(&rth->u.dst);
1124 if (rt->peer)
1125 atomic_inc(&rt->peer->refcnt);
1127 if (arp_bind_neighbour(&rt->u.dst) ||
1128 !(rt->u.dst.neighbour->nud_state &
1129 NUD_VALID)) {
1130 if (rt->u.dst.neighbour)
1131 neigh_event_send(rt->u.dst.neighbour, NULL);
1132 ip_rt_put(rth);
1133 rt_drop(rt);
1134 goto do_next;
1137 netevent.old = &rth->u.dst;
1138 netevent.new = &rt->u.dst;
1139 call_netevent_notifiers(NETEVENT_REDIRECT,
1140 &netevent);
1142 rt_del(hash, rth);
1143 if (!rt_intern_hash(hash, rt, &rt))
1144 ip_rt_put(rt);
1145 goto do_next;
1147 rcu_read_unlock();
1148 do_next:
1152 in_dev_put(in_dev);
1153 return;
1155 reject_redirect:
1156 #ifdef CONFIG_IP_ROUTE_VERBOSE
1157 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1158 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1159 "%u.%u.%u.%u ignored.\n"
1160 " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1161 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1162 NIPQUAD(saddr), NIPQUAD(daddr));
1163 #endif
1164 in_dev_put(in_dev);
1167 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1169 struct rtable *rt = (struct rtable*)dst;
1170 struct dst_entry *ret = dst;
1172 if (rt) {
1173 if (dst->obsolete) {
1174 ip_rt_put(rt);
1175 ret = NULL;
1176 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1177 rt->u.dst.expires) {
1178 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1179 rt->fl.oif);
1180 #if RT_CACHE_DEBUG >= 1
1181 printk(KERN_DEBUG "ip_rt_advice: redirect to "
1182 "%u.%u.%u.%u/%02x dropped\n",
1183 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1184 #endif
1185 rt_del(hash, rt);
1186 ret = NULL;
1189 return ret;
1193 * Algorithm:
1194 * 1. The first ip_rt_redirect_number redirects are sent
1195 * with exponential backoff, then we stop sending them at all,
1196 * assuming that the host ignores our redirects.
1197 * 2. If we did not see packets requiring redirects
1198 * during ip_rt_redirect_silence, we assume that the host
1199 * forgot the redirected route, and we start sending redirects again.
1201 * This algorithm is much cheaper and more intelligent than dumb load limiting
1202 * in icmp.c.
1204 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1205 * and "frag. need" (breaks PMTU discovery) in icmp.c.
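/*
 * Worked example with the defaults above: ip_rt_redirect_load = HZ/50 and
 * ip_rt_redirect_number = 9, so redirect number k (k = 0..8) is sent no
 * earlier than rate_last + ((HZ/50) << k); after nine ignored redirects we
 * stay silent until ip_rt_redirect_silence (= (HZ/50) << 10) elapses with no
 * redirect-worthy packets, at which point rate_tokens is reset to zero.
 */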
1208 void ip_rt_send_redirect(struct sk_buff *skb)
1210 struct rtable *rt = (struct rtable*)skb->dst;
1211 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1213 if (!in_dev)
1214 return;
1216 if (!IN_DEV_TX_REDIRECTS(in_dev))
1217 goto out;
1219 /* No redirected packets during ip_rt_redirect_silence;
1220 * reset the algorithm.
1222 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1223 rt->u.dst.rate_tokens = 0;
1225 /* Too many ignored redirects; do not send anything and
1226 * set u.dst.rate_last to the last seen redirected packet.
1228 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1229 rt->u.dst.rate_last = jiffies;
1230 goto out;
1233 /* Check for load limit; set rate_last to the latest sent
1234 * redirect.
1236 if (rt->u.dst.rate_tokens == 0 ||
1237 time_after(jiffies,
1238 (rt->u.dst.rate_last +
1239 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1240 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1241 rt->u.dst.rate_last = jiffies;
1242 ++rt->u.dst.rate_tokens;
1243 #ifdef CONFIG_IP_ROUTE_VERBOSE
1244 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1245 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1246 net_ratelimit())
1247 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1248 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1249 NIPQUAD(rt->rt_src), rt->rt_iif,
1250 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1251 #endif
1253 out:
1254 in_dev_put(in_dev);
1257 static int ip_error(struct sk_buff *skb)
1259 struct rtable *rt = (struct rtable*)skb->dst;
1260 unsigned long now;
1261 int code;
1263 switch (rt->u.dst.error) {
1264 case EINVAL:
1265 default:
1266 goto out;
1267 case EHOSTUNREACH:
1268 code = ICMP_HOST_UNREACH;
1269 break;
1270 case ENETUNREACH:
1271 code = ICMP_NET_UNREACH;
1272 break;
1273 case EACCES:
1274 code = ICMP_PKT_FILTERED;
1275 break;
1278 now = jiffies;
1279 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1280 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1281 rt->u.dst.rate_tokens = ip_rt_error_burst;
1282 rt->u.dst.rate_last = now;
1283 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1284 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1285 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1288 out: kfree_skb(skb);
1289 return 0;
1293 * The last two values are not from the RFC but
1294 * are needed for AMPRnet AX.25 paths.
1297 static const unsigned short mtu_plateau[] =
1298 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1300 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1302 int i;
1304 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1305 if (old_mtu > mtu_plateau[i])
1306 return mtu_plateau[i];
1307 return 68;
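/*
 * Example: guess_mtu(1500) walks the plateau table above and returns 1492,
 * the first plateau strictly below the old MTU; for an old MTU of 128 or
 * less no plateau matches and the IPv4 minimum of 68 is returned.
 */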
1310 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1312 int i;
1313 unsigned short old_mtu = ntohs(iph->tot_len);
1314 struct rtable *rth;
1315 __be32 skeys[2] = { iph->saddr, 0, };
1316 __be32 daddr = iph->daddr;
1317 unsigned short est_mtu = 0;
1319 if (ipv4_config.no_pmtu_disc)
1320 return 0;
1322 for (i = 0; i < 2; i++) {
1323 unsigned hash = rt_hash(daddr, skeys[i], 0);
1325 rcu_read_lock();
1326 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1327 rth = rcu_dereference(rth->u.dst.rt_next)) {
1328 if (rth->fl.fl4_dst == daddr &&
1329 rth->fl.fl4_src == skeys[i] &&
1330 rth->rt_dst == daddr &&
1331 rth->rt_src == iph->saddr &&
1332 rth->fl.iif == 0 &&
1333 !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1334 unsigned short mtu = new_mtu;
1336 if (new_mtu < 68 || new_mtu >= old_mtu) {
1338 /* BSD 4.2 compatibility hack :-( */
1339 if (mtu == 0 &&
1340 old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1341 old_mtu >= 68 + (iph->ihl << 2))
1342 old_mtu -= iph->ihl << 2;
1344 mtu = guess_mtu(old_mtu);
1346 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1347 if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1348 dst_confirm(&rth->u.dst);
1349 if (mtu < ip_rt_min_pmtu) {
1350 mtu = ip_rt_min_pmtu;
1351 rth->u.dst.metrics[RTAX_LOCK-1] |=
1352 (1 << RTAX_MTU);
1354 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1355 dst_set_expires(&rth->u.dst,
1356 ip_rt_mtu_expires);
1358 est_mtu = mtu;
1362 rcu_read_unlock();
1364 return est_mtu ? : new_mtu;
1367 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1369 if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1370 !(dst_metric_locked(dst, RTAX_MTU))) {
1371 if (mtu < ip_rt_min_pmtu) {
1372 mtu = ip_rt_min_pmtu;
1373 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1375 dst->metrics[RTAX_MTU-1] = mtu;
1376 dst_set_expires(dst, ip_rt_mtu_expires);
1377 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1381 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1383 return NULL;
1386 static void ipv4_dst_destroy(struct dst_entry *dst)
1388 struct rtable *rt = (struct rtable *) dst;
1389 struct inet_peer *peer = rt->peer;
1390 struct in_device *idev = rt->idev;
1392 if (peer) {
1393 rt->peer = NULL;
1394 inet_putpeer(peer);
1397 if (idev) {
1398 rt->idev = NULL;
1399 in_dev_put(idev);
1403 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1404 int how)
1406 struct rtable *rt = (struct rtable *) dst;
1407 struct in_device *idev = rt->idev;
1408 if (dev != &loopback_dev && idev && idev->dev == dev) {
1409 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1410 if (loopback_idev) {
1411 rt->idev = loopback_idev;
1412 in_dev_put(idev);
1417 static void ipv4_link_failure(struct sk_buff *skb)
1419 struct rtable *rt;
1421 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1423 rt = (struct rtable *) skb->dst;
1424 if (rt)
1425 dst_set_expires(&rt->u.dst, 0);
1428 static int ip_rt_bug(struct sk_buff *skb)
1430 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1431 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1432 skb->dev ? skb->dev->name : "?");
1433 kfree_skb(skb);
1434 return 0;
1438 We do not cache the source address of the outgoing interface,
1439 because it is used only by the IP RR, TS and SRR options,
1440 so it is out of the fast path.
1442 BTW, remember: "addr" is allowed to be unaligned
1443 in IP options!
1446 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1448 __be32 src;
1449 struct fib_result res;
1451 if (rt->fl.iif == 0)
1452 src = rt->rt_src;
1453 else if (fib_lookup(&rt->fl, &res) == 0) {
1454 src = FIB_RES_PREFSRC(res);
1455 fib_res_put(&res);
1456 } else
1457 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1458 RT_SCOPE_UNIVERSE);
1459 memcpy(addr, &src, 4);
1462 #ifdef CONFIG_NET_CLS_ROUTE
1463 static void set_class_tag(struct rtable *rt, u32 tag)
1465 if (!(rt->u.dst.tclassid & 0xFFFF))
1466 rt->u.dst.tclassid |= tag & 0xFFFF;
1467 if (!(rt->u.dst.tclassid & 0xFFFF0000))
1468 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1470 #endif
1472 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1474 struct fib_info *fi = res->fi;
1476 if (fi) {
1477 if (FIB_RES_GW(*res) &&
1478 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1479 rt->rt_gateway = FIB_RES_GW(*res);
1480 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1481 sizeof(rt->u.dst.metrics));
1482 if (fi->fib_mtu == 0) {
1483 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1484 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1485 rt->rt_gateway != rt->rt_dst &&
1486 rt->u.dst.dev->mtu > 576)
1487 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1489 #ifdef CONFIG_NET_CLS_ROUTE
1490 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1491 #endif
1492 } else
1493 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1495 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1496 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1497 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1498 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1499 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1500 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1501 ip_rt_min_advmss);
1502 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1503 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1505 #ifdef CONFIG_NET_CLS_ROUTE
1506 #ifdef CONFIG_IP_MULTIPLE_TABLES
1507 set_class_tag(rt, fib_rules_tclass(res));
1508 #endif
1509 set_class_tag(rt, itag);
1510 #endif
1511 rt->rt_type = res->type;
1514 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1515 u8 tos, struct net_device *dev, int our)
1517 unsigned hash;
1518 struct rtable *rth;
1519 __be32 spec_dst;
1520 struct in_device *in_dev = in_dev_get(dev);
1521 u32 itag = 0;
1523 /* Primary sanity checks. */
1525 if (in_dev == NULL)
1526 return -EINVAL;
1528 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1529 skb->protocol != htons(ETH_P_IP))
1530 goto e_inval;
1532 if (ZERONET(saddr)) {
1533 if (!LOCAL_MCAST(daddr))
1534 goto e_inval;
1535 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1536 } else if (fib_validate_source(saddr, 0, tos, 0,
1537 dev, &spec_dst, &itag) < 0)
1538 goto e_inval;
1540 rth = dst_alloc(&ipv4_dst_ops);
1541 if (!rth)
1542 goto e_nobufs;
1544 rth->u.dst.output= ip_rt_bug;
1546 atomic_set(&rth->u.dst.__refcnt, 1);
1547 rth->u.dst.flags= DST_HOST;
1548 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1549 rth->u.dst.flags |= DST_NOPOLICY;
1550 rth->fl.fl4_dst = daddr;
1551 rth->rt_dst = daddr;
1552 rth->fl.fl4_tos = tos;
1553 rth->fl.mark = skb->mark;
1554 rth->fl.fl4_src = saddr;
1555 rth->rt_src = saddr;
1556 #ifdef CONFIG_NET_CLS_ROUTE
1557 rth->u.dst.tclassid = itag;
1558 #endif
1559 rth->rt_iif =
1560 rth->fl.iif = dev->ifindex;
1561 rth->u.dst.dev = &loopback_dev;
1562 dev_hold(rth->u.dst.dev);
1563 rth->idev = in_dev_get(rth->u.dst.dev);
1564 rth->fl.oif = 0;
1565 rth->rt_gateway = daddr;
1566 rth->rt_spec_dst= spec_dst;
1567 rth->rt_type = RTN_MULTICAST;
1568 rth->rt_flags = RTCF_MULTICAST;
1569 if (our) {
1570 rth->u.dst.input= ip_local_deliver;
1571 rth->rt_flags |= RTCF_LOCAL;
1574 #ifdef CONFIG_IP_MROUTE
1575 if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1576 rth->u.dst.input = ip_mr_input;
1577 #endif
1578 RT_CACHE_STAT_INC(in_slow_mc);
1580 in_dev_put(in_dev);
1581 hash = rt_hash(daddr, saddr, dev->ifindex);
1582 return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1584 e_nobufs:
1585 in_dev_put(in_dev);
1586 return -ENOBUFS;
1588 e_inval:
1589 in_dev_put(in_dev);
1590 return -EINVAL;
1594 static void ip_handle_martian_source(struct net_device *dev,
1595 struct in_device *in_dev,
1596 struct sk_buff *skb,
1597 __be32 daddr,
1598 __be32 saddr)
1600 RT_CACHE_STAT_INC(in_martian_src);
1601 #ifdef CONFIG_IP_ROUTE_VERBOSE
1602 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1604 * RFC1812 recommendation: if the source is martian,
1605 * the only hint is the MAC header.
1607 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1608 "%u.%u.%u.%u, on dev %s\n",
1609 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1610 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1611 int i;
1612 const unsigned char *p = skb_mac_header(skb);
1613 printk(KERN_WARNING "ll header: ");
1614 for (i = 0; i < dev->hard_header_len; i++, p++) {
1615 printk("%02x", *p);
1616 if (i < (dev->hard_header_len - 1))
1617 printk(":");
1619 printk("\n");
1622 #endif
1625 static inline int __mkroute_input(struct sk_buff *skb,
1626 struct fib_result* res,
1627 struct in_device *in_dev,
1628 __be32 daddr, __be32 saddr, u32 tos,
1629 struct rtable **result)
1632 struct rtable *rth;
1633 int err;
1634 struct in_device *out_dev;
1635 unsigned flags = 0;
1636 __be32 spec_dst;
1637 u32 itag;
1639 /* get a working reference to the output device */
1640 out_dev = in_dev_get(FIB_RES_DEV(*res));
1641 if (out_dev == NULL) {
1642 if (net_ratelimit())
1643 printk(KERN_CRIT "Bug in ip_route_input" \
1644 "_slow(). Please, report\n");
1645 return -EINVAL;
1649 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1650 in_dev->dev, &spec_dst, &itag);
1651 if (err < 0) {
1652 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1653 saddr);
1655 err = -EINVAL;
1656 goto cleanup;
1659 if (err)
1660 flags |= RTCF_DIRECTSRC;
1662 if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1663 (IN_DEV_SHARED_MEDIA(out_dev) ||
1664 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1665 flags |= RTCF_DOREDIRECT;
1667 if (skb->protocol != htons(ETH_P_IP)) {
1668 /* Not IP (i.e. ARP). Do not create a route if it is
1669 * invalid for proxy ARP. DNAT routes are always valid.
1671 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1672 err = -EINVAL;
1673 goto cleanup;
1678 rth = dst_alloc(&ipv4_dst_ops);
1679 if (!rth) {
1680 err = -ENOBUFS;
1681 goto cleanup;
1684 atomic_set(&rth->u.dst.__refcnt, 1);
1685 rth->u.dst.flags= DST_HOST;
1686 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1687 rth->u.dst.flags |= DST_NOPOLICY;
1688 if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1689 rth->u.dst.flags |= DST_NOXFRM;
1690 rth->fl.fl4_dst = daddr;
1691 rth->rt_dst = daddr;
1692 rth->fl.fl4_tos = tos;
1693 rth->fl.mark = skb->mark;
1694 rth->fl.fl4_src = saddr;
1695 rth->rt_src = saddr;
1696 rth->rt_gateway = daddr;
1697 rth->rt_iif =
1698 rth->fl.iif = in_dev->dev->ifindex;
1699 rth->u.dst.dev = (out_dev)->dev;
1700 dev_hold(rth->u.dst.dev);
1701 rth->idev = in_dev_get(rth->u.dst.dev);
1702 rth->fl.oif = 0;
1703 rth->rt_spec_dst= spec_dst;
1705 rth->u.dst.input = ip_forward;
1706 rth->u.dst.output = ip_output;
1708 rt_set_nexthop(rth, res, itag);
1710 rth->rt_flags = flags;
1712 *result = rth;
1713 err = 0;
1714 cleanup:
1715 /* release the working reference to the output device */
1716 in_dev_put(out_dev);
1717 return err;
1720 static inline int ip_mkroute_input(struct sk_buff *skb,
1721 struct fib_result* res,
1722 const struct flowi *fl,
1723 struct in_device *in_dev,
1724 __be32 daddr, __be32 saddr, u32 tos)
1726 struct rtable* rth = NULL;
1727 int err;
1728 unsigned hash;
1730 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1731 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1732 fib_select_multipath(fl, res);
1733 #endif
1735 /* create a routing cache entry */
1736 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1737 if (err)
1738 return err;
1740 /* put it into the cache */
1741 hash = rt_hash(daddr, saddr, fl->iif);
1742 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1746 * NOTE. We drop all packets that have a local source
1747 * address, because every properly looped-back packet
1748 * must have the correct destination already attached by the output routine.
1750 * This approach solves two big problems:
1751 * 1. Non-simplex devices are handled properly.
1752 * 2. IP spoofing attempts are filtered with a 100% guarantee.
1755 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1756 u8 tos, struct net_device *dev)
1758 struct fib_result res;
1759 struct in_device *in_dev = in_dev_get(dev);
1760 struct flowi fl = { .nl_u = { .ip4_u =
1761 { .daddr = daddr,
1762 .saddr = saddr,
1763 .tos = tos,
1764 .scope = RT_SCOPE_UNIVERSE,
1765 } },
1766 .mark = skb->mark,
1767 .iif = dev->ifindex };
1768 unsigned flags = 0;
1769 u32 itag = 0;
1770 struct rtable * rth;
1771 unsigned hash;
1772 __be32 spec_dst;
1773 int err = -EINVAL;
1774 int free_res = 0;
1776 /* IP on this device is disabled. */
1778 if (!in_dev)
1779 goto out;
1781 /* Check for the weirdest martians, which cannot be detected
1782 by fib_lookup.
1785 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1786 goto martian_source;
1788 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1789 goto brd_input;
1791 /* Accept zero addresses only for limited broadcast;
1792 * I do not even know whether to fix it or not. Waiting for complaints :-)
1794 if (ZERONET(saddr))
1795 goto martian_source;
1797 if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1798 goto martian_destination;
1801 * Now we are ready to route the packet.
1803 if ((err = fib_lookup(&fl, &res)) != 0) {
1804 if (!IN_DEV_FORWARD(in_dev))
1805 goto e_hostunreach;
1806 goto no_route;
1808 free_res = 1;
1810 RT_CACHE_STAT_INC(in_slow_tot);
1812 if (res.type == RTN_BROADCAST)
1813 goto brd_input;
1815 if (res.type == RTN_LOCAL) {
1816 int result;
1817 result = fib_validate_source(saddr, daddr, tos,
1818 loopback_dev.ifindex,
1819 dev, &spec_dst, &itag);
1820 if (result < 0)
1821 goto martian_source;
1822 if (result)
1823 flags |= RTCF_DIRECTSRC;
1824 spec_dst = daddr;
1825 goto local_input;
1828 if (!IN_DEV_FORWARD(in_dev))
1829 goto e_hostunreach;
1830 if (res.type != RTN_UNICAST)
1831 goto martian_destination;
1833 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1834 if (err == -ENOBUFS)
1835 goto e_nobufs;
1836 if (err == -EINVAL)
1837 goto e_inval;
1839 done:
1840 in_dev_put(in_dev);
1841 if (free_res)
1842 fib_res_put(&res);
1843 out: return err;
1845 brd_input:
1846 if (skb->protocol != htons(ETH_P_IP))
1847 goto e_inval;
1849 if (ZERONET(saddr))
1850 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1851 else {
1852 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1853 &itag);
1854 if (err < 0)
1855 goto martian_source;
1856 if (err)
1857 flags |= RTCF_DIRECTSRC;
1859 flags |= RTCF_BROADCAST;
1860 res.type = RTN_BROADCAST;
1861 RT_CACHE_STAT_INC(in_brd);
1863 local_input:
1864 rth = dst_alloc(&ipv4_dst_ops);
1865 if (!rth)
1866 goto e_nobufs;
1868 rth->u.dst.output= ip_rt_bug;
1870 atomic_set(&rth->u.dst.__refcnt, 1);
1871 rth->u.dst.flags= DST_HOST;
1872 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1873 rth->u.dst.flags |= DST_NOPOLICY;
1874 rth->fl.fl4_dst = daddr;
1875 rth->rt_dst = daddr;
1876 rth->fl.fl4_tos = tos;
1877 rth->fl.mark = skb->mark;
1878 rth->fl.fl4_src = saddr;
1879 rth->rt_src = saddr;
1880 #ifdef CONFIG_NET_CLS_ROUTE
1881 rth->u.dst.tclassid = itag;
1882 #endif
1883 rth->rt_iif =
1884 rth->fl.iif = dev->ifindex;
1885 rth->u.dst.dev = &loopback_dev;
1886 dev_hold(rth->u.dst.dev);
1887 rth->idev = in_dev_get(rth->u.dst.dev);
1888 rth->rt_gateway = daddr;
1889 rth->rt_spec_dst= spec_dst;
1890 rth->u.dst.input= ip_local_deliver;
1891 rth->rt_flags = flags|RTCF_LOCAL;
1892 if (res.type == RTN_UNREACHABLE) {
1893 rth->u.dst.input= ip_error;
1894 rth->u.dst.error= -err;
1895 rth->rt_flags &= ~RTCF_LOCAL;
1897 rth->rt_type = res.type;
1898 hash = rt_hash(daddr, saddr, fl.iif);
1899 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1900 goto done;
1902 no_route:
1903 RT_CACHE_STAT_INC(in_no_route);
1904 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1905 res.type = RTN_UNREACHABLE;
1906 goto local_input;
1909 * Do not cache martian addresses: they should be logged (RFC1812)
1911 martian_destination:
1912 RT_CACHE_STAT_INC(in_martian_dst);
1913 #ifdef CONFIG_IP_ROUTE_VERBOSE
1914 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1915 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
1916 "%u.%u.%u.%u, dev %s\n",
1917 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1918 #endif
1920 e_hostunreach:
1921 err = -EHOSTUNREACH;
1922 goto done;
1924 e_inval:
1925 err = -EINVAL;
1926 goto done;
1928 e_nobufs:
1929 err = -ENOBUFS;
1930 goto done;
1932 martian_source:
1933 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1934 goto e_inval;
1937 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1938 u8 tos, struct net_device *dev)
1940 struct rtable * rth;
1941 unsigned hash;
1942 int iif = dev->ifindex;
1944 tos &= IPTOS_RT_MASK;
1945 hash = rt_hash(daddr, saddr, iif);
1947 rcu_read_lock();
1948 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1949 rth = rcu_dereference(rth->u.dst.rt_next)) {
1950 if (rth->fl.fl4_dst == daddr &&
1951 rth->fl.fl4_src == saddr &&
1952 rth->fl.iif == iif &&
1953 rth->fl.oif == 0 &&
1954 rth->fl.mark == skb->mark &&
1955 rth->fl.fl4_tos == tos) {
1956 rth->u.dst.lastuse = jiffies;
1957 dst_hold(&rth->u.dst);
1958 rth->u.dst.__use++;
1959 RT_CACHE_STAT_INC(in_hit);
1960 rcu_read_unlock();
1961 skb->dst = (struct dst_entry*)rth;
1962 return 0;
1964 RT_CACHE_STAT_INC(in_hlist_search);
1966 rcu_read_unlock();
1968 /* Multicast recognition logic was moved from the route cache to here.
1969 The problem was that too many Ethernet cards have broken/missing
1970 hardware multicast filters :-( As a result, a host on a multicasting
1971 network acquires a lot of useless route cache entries, e.g. for
1972 SDR messages from all over the world. Now we try to get rid of them.
1973 Really, provided the software IP multicast filter is organized
1974 reasonably (at least, hashed), this does not result in a slowdown
1975 compared with route cache reject entries.
1976 Note that multicast routers are not affected, because a
1977 route cache entry is created eventually.
1979 if (MULTICAST(daddr)) {
1980 struct in_device *in_dev;
1982 rcu_read_lock();
1983 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
1984 int our = ip_check_mc(in_dev, daddr, saddr,
1985 ip_hdr(skb)->protocol);
1986 if (our
1987 #ifdef CONFIG_IP_MROUTE
1988 || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1989 #endif
1991 rcu_read_unlock();
1992 return ip_route_input_mc(skb, daddr, saddr,
1993 tos, dev, our);
1996 rcu_read_unlock();
1997 return -EINVAL;
1999 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2002 static inline int __mkroute_output(struct rtable **result,
2003 struct fib_result* res,
2004 const struct flowi *fl,
2005 const struct flowi *oldflp,
2006 struct net_device *dev_out,
2007 unsigned flags)
2009 struct rtable *rth;
2010 struct in_device *in_dev;
2011 u32 tos = RT_FL_TOS(oldflp);
2012 int err = 0;
2014 if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2015 return -EINVAL;
2017 if (fl->fl4_dst == htonl(0xFFFFFFFF))
2018 res->type = RTN_BROADCAST;
2019 else if (MULTICAST(fl->fl4_dst))
2020 res->type = RTN_MULTICAST;
2021 else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2022 return -EINVAL;
2024 if (dev_out->flags & IFF_LOOPBACK)
2025 flags |= RTCF_LOCAL;
2027 /* get work reference to inet device */
2028 in_dev = in_dev_get(dev_out);
2029 if (!in_dev)
2030 return -EINVAL;
2032 if (res->type == RTN_BROADCAST) {
2033 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2034 if (res->fi) {
2035 fib_info_put(res->fi);
2036 res->fi = NULL;
2038 } else if (res->type == RTN_MULTICAST) {
2039 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2040 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2041 oldflp->proto))
2042 flags &= ~RTCF_LOCAL;
2043 /* If a multicast route does not exist, use the
2044 default one, but do not gateway in this case.
2045 Yes, it is a hack.
2047 if (res->fi && res->prefixlen < 4) {
2048 fib_info_put(res->fi);
2049 res->fi = NULL;
2054 rth = dst_alloc(&ipv4_dst_ops);
2055 if (!rth) {
2056 err = -ENOBUFS;
2057 goto cleanup;
2060 atomic_set(&rth->u.dst.__refcnt, 1);
2061 rth->u.dst.flags= DST_HOST;
2062 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2063 rth->u.dst.flags |= DST_NOXFRM;
2064 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2065 rth->u.dst.flags |= DST_NOPOLICY;
2067 rth->fl.fl4_dst = oldflp->fl4_dst;
2068 rth->fl.fl4_tos = tos;
2069 rth->fl.fl4_src = oldflp->fl4_src;
2070 rth->fl.oif = oldflp->oif;
2071 rth->fl.mark = oldflp->mark;
2072 rth->rt_dst = fl->fl4_dst;
2073 rth->rt_src = fl->fl4_src;
2074 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2075 /* get references to the devices that are to be held by the routing
2076 cache entry */
2077 rth->u.dst.dev = dev_out;
2078 dev_hold(dev_out);
2079 rth->idev = in_dev_get(dev_out);
2080 rth->rt_gateway = fl->fl4_dst;
2081 rth->rt_spec_dst= fl->fl4_src;
2083 rth->u.dst.output=ip_output;
2085 RT_CACHE_STAT_INC(out_slow_tot);
2087 if (flags & RTCF_LOCAL) {
2088 rth->u.dst.input = ip_local_deliver;
2089 rth->rt_spec_dst = fl->fl4_dst;
2091 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2092 rth->rt_spec_dst = fl->fl4_src;
2093 if (flags & RTCF_LOCAL &&
2094 !(dev_out->flags & IFF_LOOPBACK)) {
2095 rth->u.dst.output = ip_mc_output;
2096 RT_CACHE_STAT_INC(out_slow_mc);
2098 #ifdef CONFIG_IP_MROUTE
2099 if (res->type == RTN_MULTICAST) {
2100 if (IN_DEV_MFORWARD(in_dev) &&
2101 !LOCAL_MCAST(oldflp->fl4_dst)) {
2102 rth->u.dst.input = ip_mr_input;
2103 rth->u.dst.output = ip_mc_output;
2106 #endif
2109 rt_set_nexthop(rth, res, 0);
2111 rth->rt_flags = flags;
2113 *result = rth;
2114 cleanup:
2115 /* release work reference to inet device */
2116 in_dev_put(in_dev);
2118 return err;
2121 static inline int ip_mkroute_output(struct rtable **rp,
2122 struct fib_result* res,
2123 const struct flowi *fl,
2124 const struct flowi *oldflp,
2125 struct net_device *dev_out,
2126 unsigned flags)
2128 struct rtable *rth = NULL;
2129 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2130 unsigned hash;
2131 if (err == 0) {
2132 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2133 err = rt_intern_hash(hash, rth, rp);
2136 return err;
2140 * Major route resolver routine.
2143 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2145 u32 tos = RT_FL_TOS(oldflp);
2146 struct flowi fl = { .nl_u = { .ip4_u =
2147 { .daddr = oldflp->fl4_dst,
2148 .saddr = oldflp->fl4_src,
2149 .tos = tos & IPTOS_RT_MASK,
2150 .scope = ((tos & RTO_ONLINK) ?
2151 RT_SCOPE_LINK :
2152 RT_SCOPE_UNIVERSE),
2153 } },
2154 .mark = oldflp->mark,
2155 .iif = loopback_dev.ifindex,
2156 .oif = oldflp->oif };
2157 struct fib_result res;
2158 unsigned flags = 0;
2159 struct net_device *dev_out = NULL;
2160 int free_res = 0;
2161 int err;
2164 res.fi = NULL;
2165 #ifdef CONFIG_IP_MULTIPLE_TABLES
2166 res.r = NULL;
2167 #endif
2169 if (oldflp->fl4_src) {
2170 err = -EINVAL;
2171 if (MULTICAST(oldflp->fl4_src) ||
2172 BADCLASS(oldflp->fl4_src) ||
2173 ZERONET(oldflp->fl4_src))
2174 goto out;
2176 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2177 dev_out = ip_dev_find(oldflp->fl4_src);
2178 if (dev_out == NULL)
2179 goto out;
2181 /* I removed the check for oif == dev_out->oif here.
2182 It was wrong for two reasons:
2183 1. ip_dev_find(saddr) can return the wrong iface, if saddr is
2184 assigned to multiple interfaces.
2185 2. Moreover, we are allowed to send packets with a saddr
2186 of another iface. --ANK
2187 */
2189 if (oldflp->oif == 0
2190 && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2191 /* Special hack: the user can direct multicasts
2192 and limited broadcasts via the necessary interface
2193 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2194 This hack is not just for fun, it allows
2195 vic, vat and friends to work.
2196 They bind the socket to loopback, set the ttl to zero
2197 and expect that it will work.
2198 From the viewpoint of the routing cache they are broken,
2199 because we are not allowed to build a multicast path
2200 with a loopback source addr (the routing cache
2201 cannot know that the ttl is zero, so the packet
2202 will not leave this host and the route is valid).
2203 Luckily, this hack is a good workaround.
2204 */
2206 fl.oif = dev_out->ifindex;
2207 goto make_route;
2208 }
2209 if (dev_out)
2210 dev_put(dev_out);
2211 dev_out = NULL;
2212 }
2215 if (oldflp->oif) {
2216 dev_out = dev_get_by_index(&init_net, oldflp->oif);
2217 err = -ENODEV;
2218 if (dev_out == NULL)
2219 goto out;
2221 /* RACE: Check return value of inet_select_addr instead. */
2222 if (__in_dev_get_rtnl(dev_out) == NULL) {
2223 dev_put(dev_out);
2224 goto out; /* Wrong error code */
2225 }
2227 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2228 if (!fl.fl4_src)
2229 fl.fl4_src = inet_select_addr(dev_out, 0,
2230 RT_SCOPE_LINK);
2231 goto make_route;
2232 }
2233 if (!fl.fl4_src) {
2234 if (MULTICAST(oldflp->fl4_dst))
2235 fl.fl4_src = inet_select_addr(dev_out, 0,
2236 fl.fl4_scope);
2237 else if (!oldflp->fl4_dst)
2238 fl.fl4_src = inet_select_addr(dev_out, 0,
2239 RT_SCOPE_HOST);
2240 }
2241 }
2243 if (!fl.fl4_dst) {
2244 fl.fl4_dst = fl.fl4_src;
2245 if (!fl.fl4_dst)
2246 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2247 if (dev_out)
2248 dev_put(dev_out);
2249 dev_out = &loopback_dev;
2250 dev_hold(dev_out);
2251 fl.oif = loopback_dev.ifindex;
2252 res.type = RTN_LOCAL;
2253 flags |= RTCF_LOCAL;
2254 goto make_route;
2255 }
2257 if (fib_lookup(&fl, &res)) {
2258 res.fi = NULL;
2259 if (oldflp->oif) {
2260 /* Apparently, routing tables are wrong. Assume
2261 that the destination is on link.
2263 WHY? DW.
2264 Because we are allowed to send to iface
2265 even if it has NO routes and NO assigned
2266 addresses. When oif is specified, routing
2267 tables are looked up with only one purpose:
2268 to catch if destination is gatewayed, rather than
2269 direct. Moreover, if MSG_DONTROUTE is set,
2270 we send packet, ignoring both routing tables
2271 and ifaddr state. --ANK
2274 We could make it even if oif is unknown,
2275 likely IPv6, but we do not.
2276 */
2278 if (fl.fl4_src == 0)
2279 fl.fl4_src = inet_select_addr(dev_out, 0,
2280 RT_SCOPE_LINK);
2281 res.type = RTN_UNICAST;
2282 goto make_route;
2283 }
2284 if (dev_out)
2285 dev_put(dev_out);
2286 err = -ENETUNREACH;
2287 goto out;
2288 }
2289 free_res = 1;
2291 if (res.type == RTN_LOCAL) {
2292 if (!fl.fl4_src)
2293 fl.fl4_src = fl.fl4_dst;
2294 if (dev_out)
2295 dev_put(dev_out);
2296 dev_out = &loopback_dev;
2297 dev_hold(dev_out);
2298 fl.oif = dev_out->ifindex;
2299 if (res.fi)
2300 fib_info_put(res.fi);
2301 res.fi = NULL;
2302 flags |= RTCF_LOCAL;
2303 goto make_route;
2304 }
2306 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2307 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2308 fib_select_multipath(&fl, &res);
2309 else
2310 #endif
2311 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2312 fib_select_default(&fl, &res);
2314 if (!fl.fl4_src)
2315 fl.fl4_src = FIB_RES_PREFSRC(res);
2317 if (dev_out)
2318 dev_put(dev_out);
2319 dev_out = FIB_RES_DEV(res);
2320 dev_hold(dev_out);
2321 fl.oif = dev_out->ifindex;
2324 make_route:
2325 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2328 if (free_res)
2329 fib_res_put(&res);
2330 if (dev_out)
2331 dev_put(dev_out);
2332 out: return err;
2333 }
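
The on-link fallback above (FIB lookup fails but an output interface was given) is the behaviour the --ANK comment also attributes to MSG_DONTROUTE: the packet is pushed straight onto the interface, ignoring the tables. A minimal userspace sketch of that path, assuming an ordinary UDP socket; the helper name and port are illustrative only:

/* Illustrative userspace sketch (not part of route.c): MSG_DONTROUTE makes
 * the output path scope the lookup to RT_SCOPE_LINK (see the RTO_ONLINK
 * handling in the flowi built above), so the destination is treated as
 * directly reachable instead of being routed. */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static int send_onlink(const char *dst_ip, const char *buf, size_t len)
{
	struct sockaddr_in sin;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	int ret;

	if (fd < 0)
		return -1;
	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_port = htons(9);	/* discard port, illustrative */
	sin.sin_addr.s_addr = inet_addr(dst_ip);
	/* MSG_DONTROUTE: send even if no route exists to dst_ip */
	ret = sendto(fd, buf, len, MSG_DONTROUTE,
		     (struct sockaddr *)&sin, sizeof(sin));
	close(fd);
	return ret;
}
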
2335 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2336 {
2337 unsigned hash;
2338 struct rtable *rth;
2340 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2342 rcu_read_lock_bh();
2343 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2344 rth = rcu_dereference(rth->u.dst.rt_next)) {
2345 if (rth->fl.fl4_dst == flp->fl4_dst &&
2346 rth->fl.fl4_src == flp->fl4_src &&
2347 rth->fl.iif == 0 &&
2348 rth->fl.oif == flp->oif &&
2349 rth->fl.mark == flp->mark &&
2350 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2351 (IPTOS_RT_MASK | RTO_ONLINK))) {
2352 rth->u.dst.lastuse = jiffies;
2353 dst_hold(&rth->u.dst);
2354 rth->u.dst.__use++;
2355 RT_CACHE_STAT_INC(out_hit);
2356 rcu_read_unlock_bh();
2357 *rp = rth;
2358 return 0;
2359 }
2360 RT_CACHE_STAT_INC(out_hlist_search);
2361 }
2362 rcu_read_unlock_bh();
2364 return ip_route_output_slow(rp, flp);
2365 }
2367 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2369 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2370 {
2371 }
2373 static struct dst_ops ipv4_dst_blackhole_ops = {
2374 .family = AF_INET,
2375 .protocol = __constant_htons(ETH_P_IP),
2376 .destroy = ipv4_dst_destroy,
2377 .check = ipv4_dst_check,
2378 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2379 .entry_size = sizeof(struct rtable),
2380 };
2383 static int ipv4_blackhole_output(struct sk_buff *skb)
2384 {
2385 kfree_skb(skb);
2386 return 0;
2387 }
2389 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2390 {
2391 struct rtable *ort = *rp;
2392 struct rtable *rt = (struct rtable *)
2393 dst_alloc(&ipv4_dst_blackhole_ops);
2395 if (rt) {
2396 struct dst_entry *new = &rt->u.dst;
2398 atomic_set(&new->__refcnt, 1);
2399 new->__use = 1;
2400 new->input = ipv4_blackhole_output;
2401 new->output = ipv4_blackhole_output;
2402 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2404 new->dev = ort->u.dst.dev;
2405 if (new->dev)
2406 dev_hold(new->dev);
2408 rt->fl = ort->fl;
2410 rt->idev = ort->idev;
2411 if (rt->idev)
2412 in_dev_hold(rt->idev);
2413 rt->rt_flags = ort->rt_flags;
2414 rt->rt_type = ort->rt_type;
2415 rt->rt_dst = ort->rt_dst;
2416 rt->rt_src = ort->rt_src;
2417 rt->rt_iif = ort->rt_iif;
2418 rt->rt_gateway = ort->rt_gateway;
2419 rt->rt_spec_dst = ort->rt_spec_dst;
2420 rt->peer = ort->peer;
2421 if (rt->peer)
2422 atomic_inc(&rt->peer->refcnt);
2424 dst_free(new);
2425 }
2427 dst_release(&(*rp)->u.dst);
2428 *rp = rt;
2429 return (rt ? 0 : -ENOMEM);
2430 }
2432 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2433 {
2434 int err;
2436 if ((err = __ip_route_output_key(rp, flp)) != 0)
2437 return err;
2439 if (flp->proto) {
2440 if (!flp->fl4_src)
2441 flp->fl4_src = (*rp)->rt_src;
2442 if (!flp->fl4_dst)
2443 flp->fl4_dst = (*rp)->rt_dst;
2444 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2445 if (err == -EREMOTE)
2446 err = ipv4_dst_blackhole(rp, flp, sk);
2448 return err;
2449 }
2451 return 0;
2452 }
2454 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2456 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2457 {
2458 return ip_route_output_flow(rp, flp, NULL, 0);
2459 }
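
For reference, a typical in-kernel caller only fills in the flow key fields that __ip_route_output_key() compares above (daddr, saddr, tos, oif, mark) and releases the entry with ip_rt_put() when done. A minimal sketch; the helper name is illustrative, not from this file:

/* Illustrative sketch (not part of route.c): resolve an output route and
 * drop the dst reference the lookup took. */
static int example_resolve_output(__be32 daddr, __be32 saddr, int oif)
{
	struct flowi fl = { .oif = oif,
			    .nl_u = { .ip4_u = { .daddr = daddr,
						 .saddr = saddr,
						 .tos = RT_TOS(0) } } };
	struct rtable *rt;
	int err;

	err = ip_route_output_key(&rt, &fl);
	if (err)
		return err;		/* e.g. -ENETUNREACH from the slow path */
	/* ... use rt->rt_gateway, rt->u.dst.dev, dst_mtu(&rt->u.dst) ... */
	ip_rt_put(rt);			/* release the cached entry */
	return 0;
}
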
2461 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2462 int nowait, unsigned int flags)
2463 {
2464 struct rtable *rt = (struct rtable*)skb->dst;
2465 struct rtmsg *r;
2466 struct nlmsghdr *nlh;
2467 long expires;
2468 u32 id = 0, ts = 0, tsage = 0, error;
2470 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2471 if (nlh == NULL)
2472 return -EMSGSIZE;
2474 r = nlmsg_data(nlh);
2475 r->rtm_family = AF_INET;
2476 r->rtm_dst_len = 32;
2477 r->rtm_src_len = 0;
2478 r->rtm_tos = rt->fl.fl4_tos;
2479 r->rtm_table = RT_TABLE_MAIN;
2480 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2481 r->rtm_type = rt->rt_type;
2482 r->rtm_scope = RT_SCOPE_UNIVERSE;
2483 r->rtm_protocol = RTPROT_UNSPEC;
2484 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2485 if (rt->rt_flags & RTCF_NOTIFY)
2486 r->rtm_flags |= RTM_F_NOTIFY;
2488 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2490 if (rt->fl.fl4_src) {
2491 r->rtm_src_len = 32;
2492 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2493 }
2494 if (rt->u.dst.dev)
2495 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2496 #ifdef CONFIG_NET_CLS_ROUTE
2497 if (rt->u.dst.tclassid)
2498 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2499 #endif
2500 if (rt->fl.iif)
2501 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2502 else if (rt->rt_src != rt->fl.fl4_src)
2503 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2505 if (rt->rt_dst != rt->rt_gateway)
2506 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2508 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2509 goto nla_put_failure;
2511 error = rt->u.dst.error;
2512 expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2513 if (rt->peer) {
2514 id = rt->peer->ip_id_count;
2515 if (rt->peer->tcp_ts_stamp) {
2516 ts = rt->peer->tcp_ts;
2517 tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2518 }
2519 }
2521 if (rt->fl.iif) {
2522 #ifdef CONFIG_IP_MROUTE
2523 __be32 dst = rt->rt_dst;
2525 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2526 IPV4_DEVCONF_ALL(MC_FORWARDING)) {
2527 int err = ipmr_get_route(skb, r, nowait);
2528 if (err <= 0) {
2529 if (!nowait) {
2530 if (err == 0)
2531 return 0;
2532 goto nla_put_failure;
2533 } else {
2534 if (err == -EMSGSIZE)
2535 goto nla_put_failure;
2536 error = err;
2537 }
2538 }
2539 } else
2540 #endif
2541 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2542 }
2544 if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2545 expires, error) < 0)
2546 goto nla_put_failure;
2548 return nlmsg_end(skb, nlh);
2550 nla_put_failure:
2551 nlmsg_cancel(skb, nlh);
2552 return -EMSGSIZE;
2553 }
2555 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2556 {
2557 struct rtmsg *rtm;
2558 struct nlattr *tb[RTA_MAX+1];
2559 struct rtable *rt = NULL;
2560 __be32 dst = 0;
2561 __be32 src = 0;
2562 u32 iif;
2563 int err;
2564 struct sk_buff *skb;
2566 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2567 if (err < 0)
2568 goto errout;
2570 rtm = nlmsg_data(nlh);
2572 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2573 if (skb == NULL) {
2574 err = -ENOBUFS;
2575 goto errout;
2576 }
2578 /* Reserve room for dummy headers, this skb can pass
2579 through a good chunk of the routing engine.
2580 */
2581 skb_reset_mac_header(skb);
2582 skb_reset_network_header(skb);
2584 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2585 ip_hdr(skb)->protocol = IPPROTO_ICMP;
2586 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2588 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2589 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2590 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2592 if (iif) {
2593 struct net_device *dev;
2595 dev = __dev_get_by_index(&init_net, iif);
2596 if (dev == NULL) {
2597 err = -ENODEV;
2598 goto errout_free;
2599 }
2601 skb->protocol = htons(ETH_P_IP);
2602 skb->dev = dev;
2603 local_bh_disable();
2604 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2605 local_bh_enable();
2607 rt = (struct rtable*) skb->dst;
2608 if (err == 0 && rt->u.dst.error)
2609 err = -rt->u.dst.error;
2610 } else {
2611 struct flowi fl = {
2612 .nl_u = {
2613 .ip4_u = {
2614 .daddr = dst,
2615 .saddr = src,
2616 .tos = rtm->rtm_tos,
2617 },
2618 },
2619 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2620 };
2621 err = ip_route_output_key(&rt, &fl);
2622 }
2624 if (err)
2625 goto errout_free;
2627 skb->dst = &rt->u.dst;
2628 if (rtm->rtm_flags & RTM_F_NOTIFY)
2629 rt->rt_flags |= RTCF_NOTIFY;
2631 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2632 RTM_NEWROUTE, 0, 0);
2633 if (err <= 0)
2634 goto errout_free;
2636 err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2637 errout:
2638 return err;
2640 errout_free:
2641 kfree_skb(skb);
2642 goto errout;
2643 }
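
inet_rtm_getroute() is what answers an "ip route get" query over rtnetlink. A rough userspace sketch of such a request, assuming a blocking NETLINK_ROUTE socket and omitting full parsing of the RTM_NEWROUTE reply built by rt_fill_info(); the struct and function names below are illustrative:

/* Illustrative userspace sketch (not part of route.c): ask the kernel which
 * route it would pick for a destination. Relies on the request struct having
 * no padding, which holds for these types on common ABIs. */
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <arpa/inet.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

struct route_get_req {
	struct nlmsghdr nlh;
	struct rtmsg	rtm;
	struct rtattr	rta;
	unsigned char	dst[4];		/* RTA_DST payload */
};

static int route_get(const char *dst_ip)
{
	struct route_get_req req;
	char reply[4096];
	int fd, n;

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd < 0)
		return -1;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = sizeof(req);
	req.nlh.nlmsg_type = RTM_GETROUTE;
	req.nlh.nlmsg_flags = NLM_F_REQUEST;
	req.rtm.rtm_family = AF_INET;
	req.rta.rta_type = RTA_DST;
	req.rta.rta_len = RTA_LENGTH(4);
	inet_pton(AF_INET, dst_ip, req.dst);

	if (send(fd, &req, sizeof(req), 0) < 0) {
		close(fd);
		return -1;
	}
	/* Single RTM_NEWROUTE reply, unicast back by rtnl_unicast() above. */
	n = recv(fd, reply, sizeof(reply), 0);
	close(fd);
	return n > 0 ? 0 : -1;
}
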
2645 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2646 {
2647 struct rtable *rt;
2648 int h, s_h;
2649 int idx, s_idx;
2651 s_h = cb->args[0];
2652 s_idx = idx = cb->args[1];
2653 for (h = 0; h <= rt_hash_mask; h++) {
2654 if (h < s_h) continue;
2655 if (h > s_h)
2656 s_idx = 0;
2657 rcu_read_lock_bh();
2658 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2659 rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2660 if (idx < s_idx)
2661 continue;
2662 skb->dst = dst_clone(&rt->u.dst);
2663 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2664 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2665 1, NLM_F_MULTI) <= 0) {
2666 dst_release(xchg(&skb->dst, NULL));
2667 rcu_read_unlock_bh();
2668 goto done;
2669 }
2670 dst_release(xchg(&skb->dst, NULL));
2671 }
2672 rcu_read_unlock_bh();
2673 }
2675 done:
2676 cb->args[0] = h;
2677 cb->args[1] = idx;
2678 return skb->len;
2679 }
2681 void ip_rt_multicast_event(struct in_device *in_dev)
2682 {
2683 rt_cache_flush(0);
2684 }
2686 #ifdef CONFIG_SYSCTL
2687 static int flush_delay;
2689 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2690 struct file *filp, void __user *buffer,
2691 size_t *lenp, loff_t *ppos)
2692 {
2693 if (write) {
2694 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2695 rt_cache_flush(flush_delay);
2696 return 0;
2697 }
2699 return -EINVAL;
2700 }
2702 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2703 int __user *name,
2704 int nlen,
2705 void __user *oldval,
2706 size_t __user *oldlenp,
2707 void __user *newval,
2708 size_t newlen)
2709 {
2710 int delay;
2711 if (newlen != sizeof(int))
2712 return -EINVAL;
2713 if (get_user(delay, (int __user *)newval))
2714 return -EFAULT;
2715 rt_cache_flush(delay);
2716 return 0;
2717 }
2719 ctl_table ipv4_route_table[] = {
2720 {
2721 .ctl_name = NET_IPV4_ROUTE_FLUSH,
2722 .procname = "flush",
2723 .data = &flush_delay,
2724 .maxlen = sizeof(int),
2725 .mode = 0200,
2726 .proc_handler = &ipv4_sysctl_rtcache_flush,
2727 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
2728 },
2729 {
2730 .ctl_name = NET_IPV4_ROUTE_MIN_DELAY,
2731 .procname = "min_delay",
2732 .data = &ip_rt_min_delay,
2733 .maxlen = sizeof(int),
2734 .mode = 0644,
2735 .proc_handler = &proc_dointvec_jiffies,
2736 .strategy = &sysctl_jiffies,
2737 },
2738 {
2739 .ctl_name = NET_IPV4_ROUTE_MAX_DELAY,
2740 .procname = "max_delay",
2741 .data = &ip_rt_max_delay,
2742 .maxlen = sizeof(int),
2743 .mode = 0644,
2744 .proc_handler = &proc_dointvec_jiffies,
2745 .strategy = &sysctl_jiffies,
2746 },
2747 {
2748 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2749 .procname = "gc_thresh",
2750 .data = &ipv4_dst_ops.gc_thresh,
2751 .maxlen = sizeof(int),
2752 .mode = 0644,
2753 .proc_handler = &proc_dointvec,
2754 },
2755 {
2756 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2757 .procname = "max_size",
2758 .data = &ip_rt_max_size,
2759 .maxlen = sizeof(int),
2760 .mode = 0644,
2761 .proc_handler = &proc_dointvec,
2762 },
2763 {
2764 /* Deprecated. Use gc_min_interval_ms */
2766 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2767 .procname = "gc_min_interval",
2768 .data = &ip_rt_gc_min_interval,
2769 .maxlen = sizeof(int),
2770 .mode = 0644,
2771 .proc_handler = &proc_dointvec_jiffies,
2772 .strategy = &sysctl_jiffies,
2773 },
2774 {
2775 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2776 .procname = "gc_min_interval_ms",
2777 .data = &ip_rt_gc_min_interval,
2778 .maxlen = sizeof(int),
2779 .mode = 0644,
2780 .proc_handler = &proc_dointvec_ms_jiffies,
2781 .strategy = &sysctl_ms_jiffies,
2782 },
2783 {
2784 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
2785 .procname = "gc_timeout",
2786 .data = &ip_rt_gc_timeout,
2787 .maxlen = sizeof(int),
2788 .mode = 0644,
2789 .proc_handler = &proc_dointvec_jiffies,
2790 .strategy = &sysctl_jiffies,
2791 },
2792 {
2793 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
2794 .procname = "gc_interval",
2795 .data = &ip_rt_gc_interval,
2796 .maxlen = sizeof(int),
2797 .mode = 0644,
2798 .proc_handler = &proc_dointvec_jiffies,
2799 .strategy = &sysctl_jiffies,
2800 },
2801 {
2802 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
2803 .procname = "redirect_load",
2804 .data = &ip_rt_redirect_load,
2805 .maxlen = sizeof(int),
2806 .mode = 0644,
2807 .proc_handler = &proc_dointvec,
2808 },
2809 {
2810 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2811 .procname = "redirect_number",
2812 .data = &ip_rt_redirect_number,
2813 .maxlen = sizeof(int),
2814 .mode = 0644,
2815 .proc_handler = &proc_dointvec,
2816 },
2817 {
2818 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2819 .procname = "redirect_silence",
2820 .data = &ip_rt_redirect_silence,
2821 .maxlen = sizeof(int),
2822 .mode = 0644,
2823 .proc_handler = &proc_dointvec,
2824 },
2825 {
2826 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
2827 .procname = "error_cost",
2828 .data = &ip_rt_error_cost,
2829 .maxlen = sizeof(int),
2830 .mode = 0644,
2831 .proc_handler = &proc_dointvec,
2832 },
2833 {
2834 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
2835 .procname = "error_burst",
2836 .data = &ip_rt_error_burst,
2837 .maxlen = sizeof(int),
2838 .mode = 0644,
2839 .proc_handler = &proc_dointvec,
2840 },
2841 {
2842 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
2843 .procname = "gc_elasticity",
2844 .data = &ip_rt_gc_elasticity,
2845 .maxlen = sizeof(int),
2846 .mode = 0644,
2847 .proc_handler = &proc_dointvec,
2848 },
2849 {
2850 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
2851 .procname = "mtu_expires",
2852 .data = &ip_rt_mtu_expires,
2853 .maxlen = sizeof(int),
2854 .mode = 0644,
2855 .proc_handler = &proc_dointvec_jiffies,
2856 .strategy = &sysctl_jiffies,
2857 },
2858 {
2859 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
2860 .procname = "min_pmtu",
2861 .data = &ip_rt_min_pmtu,
2862 .maxlen = sizeof(int),
2863 .mode = 0644,
2864 .proc_handler = &proc_dointvec,
2865 },
2866 {
2867 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
2868 .procname = "min_adv_mss",
2869 .data = &ip_rt_min_advmss,
2870 .maxlen = sizeof(int),
2871 .mode = 0644,
2872 .proc_handler = &proc_dointvec,
2873 },
2874 {
2875 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
2876 .procname = "secret_interval",
2877 .data = &ip_rt_secret_interval,
2878 .maxlen = sizeof(int),
2879 .mode = 0644,
2880 .proc_handler = &proc_dointvec_jiffies,
2881 .strategy = &sysctl_jiffies,
2882 },
2883 { .ctl_name = 0 }
2884 };
2885 #endif
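
The table above is what appears under /proc/sys/net/ipv4/route/. For example, writing to the write-only "flush" entry goes through ipv4_sysctl_rtcache_flush() and triggers rt_cache_flush(). A small userspace sketch, assuming only the proc path implied by the procnames above:

/* Illustrative userspace sketch (not part of route.c): request a flush of
 * the routing cache via the sysctl table defined above. */
#include <fcntl.h>
#include <unistd.h>

static int flush_route_cache(void)
{
	int fd = open("/proc/sys/net/ipv4/route/flush", O_WRONLY);
	int err = 0;

	if (fd < 0)
		return -1;
	/* The written integer is the flush delay; 0 asks for "flush now". */
	if (write(fd, "0\n", 2) != 2)
		err = -1;
	close(fd);
	return err;
}
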
2887 #ifdef CONFIG_NET_CLS_ROUTE
2888 struct ip_rt_acct *ip_rt_acct;
2890 /* This code sucks. But you should have seen it before! --RR */
2892 /* IP route accounting ptr for this logical cpu number. */
2893 #define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
2895 #ifdef CONFIG_PROC_FS
2896 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
2897 int length, int *eof, void *data)
2898 {
2899 unsigned int i;
2901 if ((offset & 3) || (length & 3))
2902 return -EIO;
2904 if (offset >= sizeof(struct ip_rt_acct) * 256) {
2905 *eof = 1;
2906 return 0;
2907 }
2909 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
2910 length = sizeof(struct ip_rt_acct) * 256 - offset;
2911 *eof = 1;
2912 }
2914 offset /= sizeof(u32);
2916 if (length > 0) {
2917 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
2918 u32 *dst = (u32 *) buffer;
2920 /* Copy first cpu. */
2921 *start = buffer;
2922 memcpy(dst, src, length);
2924 /* Add the other cpus in, one int at a time */
2925 for_each_possible_cpu(i) {
2926 unsigned int j;
2928 src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
2930 for (j = 0; j < length/4; j++)
2931 dst[j] += src[j];
2932 }
2933 }
2934 return length;
2935 }
2936 #endif /* CONFIG_PROC_FS */
2937 #endif /* CONFIG_NET_CLS_ROUTE */
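
The read handler above sums the per-CPU arrays of 256 struct ip_rt_acct entries (one per routing realm) into the caller's buffer. A hedged userspace sketch of consuming /proc/net/rt_acct; the field layout is an assumption mirroring struct ip_rt_acct in include/net/route.h:

/* Illustrative userspace sketch (not part of route.c): dump the per-realm
 * accounting counters exported by ip_rt_acct_read(). */
#include <stdio.h>

struct rt_acct_entry {			/* assumed mirror of struct ip_rt_acct */
	unsigned int o_bytes, o_packets, i_bytes, i_packets;
};

static void dump_rt_acct(void)
{
	struct rt_acct_entry acct[256];
	FILE *f = fopen("/proc/net/rt_acct", "r");
	int i;

	if (!f)
		return;
	if (fread(acct, sizeof(acct), 1, f) == 1)
		for (i = 0; i < 256; i++)
			if (acct[i].o_packets || acct[i].i_packets)
				printf("realm %d: out %u pkts, in %u pkts\n",
				       i, acct[i].o_packets, acct[i].i_packets);
	fclose(f);
}
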
2939 static __initdata unsigned long rhash_entries;
2940 static int __init set_rhash_entries(char *str)
2941 {
2942 if (!str)
2943 return 0;
2944 rhash_entries = simple_strtoul(str, &str, 0);
2945 return 1;
2946 }
2947 __setup("rhash_entries=", set_rhash_entries);
2949 int __init ip_rt_init(void)
2950 {
2951 int rc = 0;
2953 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
2954 (jiffies ^ (jiffies >> 7)));
2956 #ifdef CONFIG_NET_CLS_ROUTE
2957 {
2958 int order;
2959 for (order = 0;
2960 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
2961 /* NOTHING */;
2962 ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
2963 if (!ip_rt_acct)
2964 panic("IP: failed to allocate ip_rt_acct\n");
2965 memset(ip_rt_acct, 0, PAGE_SIZE << order);
2966 }
2967 #endif
2969 ipv4_dst_ops.kmem_cachep =
2970 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2971 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2973 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2975 rt_hash_table = (struct rt_hash_bucket *)
2976 alloc_large_system_hash("IP route cache",
2977 sizeof(struct rt_hash_bucket),
2978 rhash_entries,
2979 (num_physpages >= 128 * 1024) ?
2980 15 : 17,
2981 0,
2982 &rt_hash_log,
2983 &rt_hash_mask,
2984 0);
2985 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
2986 rt_hash_lock_init();
2988 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
2989 ip_rt_max_size = (rt_hash_mask + 1) * 16;
2991 devinet_init();
2992 ip_fib_init();
2994 init_timer(&rt_flush_timer);
2995 rt_flush_timer.function = rt_run_flush;
2996 init_timer(&rt_periodic_timer);
2997 rt_periodic_timer.function = rt_check_expire;
2998 init_timer(&rt_secret_timer);
2999 rt_secret_timer.function = rt_secret_rebuild;
3001 /* All the timers, started at system startup, tend
3002 to synchronize. Perturb it a bit.
3003 */
3004 rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3005 ip_rt_gc_interval;
3006 add_timer(&rt_periodic_timer);
3008 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3009 ip_rt_secret_interval;
3010 add_timer(&rt_secret_timer);
3012 #ifdef CONFIG_PROC_FS
3013 {
3014 struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3015 if (!proc_net_fops_create(&init_net, "rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3016 !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3017 init_net.proc_net_stat))) {
3018 return -ENOMEM;
3019 }
3020 rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3021 }
3022 #ifdef CONFIG_NET_CLS_ROUTE
3023 create_proc_read_entry("rt_acct", 0, init_net.proc_net, ip_rt_acct_read, NULL);
3024 #endif
3025 #endif
3026 #ifdef CONFIG_XFRM
3027 xfrm_init();
3028 xfrm4_init();
3029 #endif
3030 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3032 return rc;
3033 }
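
Once ip_rt_init() has registered them, the cache contents and its per-CPU statistics are visible under /proc. A small userspace sketch, assuming only the file names created above ("rt_cache" under /proc/net and /proc/net/stat):

/* Illustrative userspace sketch (not part of route.c): print the per-CPU
 * statistics file registered by ip_rt_init() (rt_cpu_seq_fops). */
#include <stdio.h>

static void show_rt_cache_stats(void)
{
	char line[512];
	FILE *f = fopen("/proc/net/stat/rt_cache", "r");

	if (!f)
		return;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
}
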
3035 EXPORT_SYMBOL(__ip_select_ident);
3036 EXPORT_SYMBOL(ip_route_input);
3037 EXPORT_SYMBOL(ip_route_output_key);