Linux-2.6.12-rc2
net/ipv4/route.c (blob 9f91a116d91926df3ba936a80f020a6ab1084d2b)
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
8 * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
16 * Fixes:
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
23 * Alan Cox : Super /proc >4K
24 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
26 * clamper.
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36 * Alan Cox : Aligned routing errors more closely with BSD
37 * our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
42 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Split to fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file
58 * This program is free software; you can redistribute it and/or
59 * modify it under the terms of the GNU General Public License
60 * as published by the Free Software Foundation; either version
61 * 2 of the License, or (at your option) any later version.
64 #include <linux/config.h>
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/sched.h>
72 #include <linux/mm.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/rtnetlink.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <net/protocol.h>
94 #include <net/ip.h>
95 #include <net/route.h>
96 #include <net/inetpeer.h>
97 #include <net/sock.h>
98 #include <net/ip_fib.h>
99 #include <net/arp.h>
100 #include <net/tcp.h>
101 #include <net/icmp.h>
102 #include <net/xfrm.h>
103 #include <net/ip_mp_alg.h>
104 #ifdef CONFIG_SYSCTL
105 #include <linux/sysctl.h>
106 #endif
108 #define RT_FL_TOS(oldflp) \
109 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
111 #define IP_MAX_MTU 0xFFF0
113 #define RT_GC_TIMEOUT (300*HZ)
115 static int ip_rt_min_delay = 2 * HZ;
116 static int ip_rt_max_delay = 10 * HZ;
117 static int ip_rt_max_size;
118 static int ip_rt_gc_timeout = RT_GC_TIMEOUT;
119 static int ip_rt_gc_interval = 60 * HZ;
120 static int ip_rt_gc_min_interval = HZ / 2;
121 static int ip_rt_redirect_number = 9;
122 static int ip_rt_redirect_load = HZ / 50;
123 static int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1));
124 static int ip_rt_error_cost = HZ;
125 static int ip_rt_error_burst = 5 * HZ;
126 static int ip_rt_gc_elasticity = 8;
127 static int ip_rt_mtu_expires = 10 * 60 * HZ;
128 static int ip_rt_min_pmtu = 512 + 20 + 20;
129 static int ip_rt_min_advmss = 256;
130 static int ip_rt_secret_interval = 10 * 60 * HZ;
131 static unsigned long rt_deadline;
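/*
 * For orientation, and assuming HZ = 1000 (HZ is a build-time constant,
 * so the wall-clock values vary between configs), the defaults above
 * work out to: flush delays of 2s/10s, a periodic GC scan every 60s with
 * forced GC rate-limited to roughly one per 500ms, up to 9 redirects per
 * cached route spaced ip_rt_redirect_load = 20ms apart (doubling each
 * time, see ip_rt_send_redirect() below), a redirect silence window of
 * (HZ/50) << 10, about 20.5s, ICMP errors limited to roughly one per
 * second with a burst of 5, learned PMTU values kept for 10 minutes, and
 * the hash secret rekeyed every 10 minutes.
 */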
133 #define RTprint(a...) printk(KERN_DEBUG a)
135 static struct timer_list rt_flush_timer;
136 static struct timer_list rt_periodic_timer;
137 static struct timer_list rt_secret_timer;
140 * Interface to generic destination cache.
143 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
144 static void ipv4_dst_destroy(struct dst_entry *dst);
145 static void ipv4_dst_ifdown(struct dst_entry *dst,
146 struct net_device *dev, int how);
147 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
148 static void ipv4_link_failure(struct sk_buff *skb);
149 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
150 static int rt_garbage_collect(void);
153 static struct dst_ops ipv4_dst_ops = {
154 .family = AF_INET,
155 .protocol = __constant_htons(ETH_P_IP),
156 .gc = rt_garbage_collect,
157 .check = ipv4_dst_check,
158 .destroy = ipv4_dst_destroy,
159 .ifdown = ipv4_dst_ifdown,
160 .negative_advice = ipv4_negative_advice,
161 .link_failure = ipv4_link_failure,
162 .update_pmtu = ip_rt_update_pmtu,
163 .entry_size = sizeof(struct rtable),
166 #define ECN_OR_COST(class) TC_PRIO_##class
168 __u8 ip_tos2prio[16] = {
169 TC_PRIO_BESTEFFORT,
170 ECN_OR_COST(FILLER),
171 TC_PRIO_BESTEFFORT,
172 ECN_OR_COST(BESTEFFORT),
173 TC_PRIO_BULK,
174 ECN_OR_COST(BULK),
175 TC_PRIO_BULK,
176 ECN_OR_COST(BULK),
177 TC_PRIO_INTERACTIVE,
178 ECN_OR_COST(INTERACTIVE),
179 TC_PRIO_INTERACTIVE,
180 ECN_OR_COST(INTERACTIVE),
181 TC_PRIO_INTERACTIVE_BULK,
182 ECN_OR_COST(INTERACTIVE_BULK),
183 TC_PRIO_INTERACTIVE_BULK,
184 ECN_OR_COST(INTERACTIVE_BULK)
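/*
 * A worked example for the table above, which has 16 entries indexed by
 * the four TOS bits (its users, e.g. rt_tos2priority() in
 * include/net/route.h if memory serves, index it as IPTOS_TOS(tos) >> 1).
 * The odd slots correspond to the low "minimise monetary cost" bit,
 * which ECN later reused, hence ECN_OR_COST(): e.g. IPTOS_LOWDELAY
 * (0x10) lands on slot 8, TC_PRIO_INTERACTIVE, while 0x12 lands on
 * slot 9, ECN_OR_COST(INTERACTIVE), which this file defines to the same
 * value.
 */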
189 * Route cache.
192 /* The locking scheme is rather straightforward:
194 * 1) Read-Copy Update protects the buckets of the central route hash.
195 * 2) Only writers remove entries, and they hold the lock
196 * as they look at rtable reference counts.
197 * 3) Only readers acquire references to rtable entries,
198 * they do so with atomic increments and with the
199 * lock held.
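/*
 * Concretely, as a sketch of the pattern the code below follows (no new
 * machinery here): lookups such as ip_route_input() do
 *
 *	rcu_read_lock();
 *	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference(rth->u.rt_next))
 *		... compare keys, dst_hold() on a match ...
 *	rcu_read_unlock();
 *
 * while writers (rt_intern_hash(), rt_del(), the GC paths) take the
 * per-bucket rt_hash_table[hash].lock, use rcu_assign_pointer() where
 * store ordering matters, and free unlinked entries only through
 * call_rcu_bh() in rt_free()/rt_drop().
 */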
202 struct rt_hash_bucket {
203 struct rtable *chain;
204 spinlock_t lock;
205 } __attribute__((__aligned__(8)));
207 static struct rt_hash_bucket *rt_hash_table;
208 static unsigned rt_hash_mask;
209 static int rt_hash_log;
210 static unsigned int rt_hash_rnd;
212 struct rt_cache_stat *rt_cache_stat;
214 static int rt_intern_hash(unsigned hash, struct rtable *rth,
215 struct rtable **res);
217 static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
219 return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
220 & rt_hash_mask);
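/*
 * Note for the hash above: callers perturb the source address with the
 * interface index, e.g.
 *
 *	hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
 *
 * so the same addresses arriving on different devices land in different
 * buckets, and rt_hash_rnd is regenerated on every cache flush (see
 * rt_run_flush() and rt_secret_rebuild(), every ip_rt_secret_interval =
 * 10 minutes by default), which keeps the bucket layout hard to predict
 * from outside.
 */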
223 #ifdef CONFIG_PROC_FS
224 struct rt_cache_iter_state {
225 int bucket;
228 static struct rtable *rt_cache_get_first(struct seq_file *seq)
230 struct rtable *r = NULL;
231 struct rt_cache_iter_state *st = seq->private;
233 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
234 rcu_read_lock_bh();
235 r = rt_hash_table[st->bucket].chain;
236 if (r)
237 break;
238 rcu_read_unlock_bh();
240 return r;
243 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
245 struct rt_cache_iter_state *st = rcu_dereference(seq->private);
247 r = r->u.rt_next;
248 while (!r) {
249 rcu_read_unlock_bh();
250 if (--st->bucket < 0)
251 break;
252 rcu_read_lock_bh();
253 r = rt_hash_table[st->bucket].chain;
255 return r;
258 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
260 struct rtable *r = rt_cache_get_first(seq);
262 if (r)
263 while (pos && (r = rt_cache_get_next(seq, r)))
264 --pos;
265 return pos ? NULL : r;
268 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
270 return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
273 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
275 struct rtable *r = NULL;
277 if (v == SEQ_START_TOKEN)
278 r = rt_cache_get_first(seq);
279 else
280 r = rt_cache_get_next(seq, v);
281 ++*pos;
282 return r;
285 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
287 if (v && v != SEQ_START_TOKEN)
288 rcu_read_unlock_bh();
291 static int rt_cache_seq_show(struct seq_file *seq, void *v)
293 if (v == SEQ_START_TOKEN)
294 seq_printf(seq, "%-127s\n",
295 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
296 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
297 "HHUptod\tSpecDst");
298 else {
299 struct rtable *r = v;
300 char temp[256];
302 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
303 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
304 r->u.dst.dev ? r->u.dst.dev->name : "*",
305 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
306 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
307 r->u.dst.__use, 0, (unsigned long)r->rt_src,
308 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
309 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
310 dst_metric(&r->u.dst, RTAX_WINDOW),
311 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
312 dst_metric(&r->u.dst, RTAX_RTTVAR)),
313 r->fl.fl4_tos,
314 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
315 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
316 dev_queue_xmit) : 0,
317 r->rt_spec_dst);
318 seq_printf(seq, "%-127s\n", temp);
320 return 0;
323 static struct seq_operations rt_cache_seq_ops = {
324 .start = rt_cache_seq_start,
325 .next = rt_cache_seq_next,
326 .stop = rt_cache_seq_stop,
327 .show = rt_cache_seq_show,
330 static int rt_cache_seq_open(struct inode *inode, struct file *file)
332 struct seq_file *seq;
333 int rc = -ENOMEM;
334 struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
336 if (!s)
337 goto out;
338 rc = seq_open(file, &rt_cache_seq_ops);
339 if (rc)
340 goto out_kfree;
341 seq = file->private_data;
342 seq->private = s;
343 memset(s, 0, sizeof(*s));
344 out:
345 return rc;
346 out_kfree:
347 kfree(s);
348 goto out;
351 static struct file_operations rt_cache_seq_fops = {
352 .owner = THIS_MODULE,
353 .open = rt_cache_seq_open,
354 .read = seq_read,
355 .llseek = seq_lseek,
356 .release = seq_release_private,
360 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
362 int cpu;
364 if (*pos == 0)
365 return SEQ_START_TOKEN;
367 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
368 if (!cpu_possible(cpu))
369 continue;
370 *pos = cpu+1;
371 return per_cpu_ptr(rt_cache_stat, cpu);
373 return NULL;
376 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
378 int cpu;
380 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
381 if (!cpu_possible(cpu))
382 continue;
383 *pos = cpu+1;
384 return per_cpu_ptr(rt_cache_stat, cpu);
386 return NULL;
390 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
395 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
397 struct rt_cache_stat *st = v;
399 if (v == SEQ_START_TOKEN) {
400 seq_printf(seq, "entries in_hit in_slow_tot in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
401 return 0;
404 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
405 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
406 atomic_read(&ipv4_dst_ops.entries),
407 st->in_hit,
408 st->in_slow_tot,
409 st->in_slow_mc,
410 st->in_no_route,
411 st->in_brd,
412 st->in_martian_dst,
413 st->in_martian_src,
415 st->out_hit,
416 st->out_slow_tot,
417 st->out_slow_mc,
419 st->gc_total,
420 st->gc_ignored,
421 st->gc_goal_miss,
422 st->gc_dst_overflow,
423 st->in_hlist_search,
424 st->out_hlist_search
426 return 0;
429 static struct seq_operations rt_cpu_seq_ops = {
430 .start = rt_cpu_seq_start,
431 .next = rt_cpu_seq_next,
432 .stop = rt_cpu_seq_stop,
433 .show = rt_cpu_seq_show,
437 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
439 return seq_open(file, &rt_cpu_seq_ops);
442 static struct file_operations rt_cpu_seq_fops = {
443 .owner = THIS_MODULE,
444 .open = rt_cpu_seq_open,
445 .read = seq_read,
446 .llseek = seq_lseek,
447 .release = seq_release,
450 #endif /* CONFIG_PROC_FS */
452 static __inline__ void rt_free(struct rtable *rt)
454 multipath_remove(rt);
455 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
458 static __inline__ void rt_drop(struct rtable *rt)
460 multipath_remove(rt);
461 ip_rt_put(rt);
462 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
465 static __inline__ int rt_fast_clean(struct rtable *rth)
467 /* Kill broadcast/multicast entries very aggressively, if they
468 collide in hash table with more useful entries */
469 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
470 rth->fl.iif && rth->u.rt_next;
473 static __inline__ int rt_valuable(struct rtable *rth)
475 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
476 rth->u.dst.expires;
479 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
481 unsigned long age;
482 int ret = 0;
484 if (atomic_read(&rth->u.dst.__refcnt))
485 goto out;
487 ret = 1;
488 if (rth->u.dst.expires &&
489 time_after_eq(jiffies, rth->u.dst.expires))
490 goto out;
492 age = jiffies - rth->u.dst.lastuse;
493 ret = 0;
494 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
495 (age <= tmo2 && rt_valuable(rth)))
496 goto out;
497 ret = 1;
498 out: return ret;
501 /* Bits of score are:
502 * 31: very valuable
503 * 30: not quite useless
504 * 29..0: usage counter
506 static inline u32 rt_score(struct rtable *rt)
508 u32 score = jiffies - rt->u.dst.lastuse;
510 score = ~score & ~(3<<30);
512 if (rt_valuable(rt))
513 score |= (1<<31);
515 if (!rt->fl.iif ||
516 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
517 score |= (1<<30);
519 return score;
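/*
 * A worked reading of the score above: rt_intern_hash() evicts the
 * entry with the lowest score when a chain grows too long.  An idle
 * input broadcast/multicast/local route sets neither top bit, so it is
 * evicted first; an output route or a plain unicast forward gets bit 30;
 * a redirected route, one flagged RTCF_NOTIFY, or one with a pending
 * expiry gets bit 31 and survives longest.  Within a class the lower
 * 30 bits hold the inverted age, so the more recently used entry is
 * kept.
 */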
522 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
524 return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
525 fl1->oif == fl2->oif &&
526 fl1->iif == fl2->iif;
529 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
530 static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
531 struct rtable *expentry,
532 int *removed_count)
534 int passedexpired = 0;
535 struct rtable **nextstep = NULL;
536 struct rtable **rthp = chain_head;
537 struct rtable *rth;
539 if (removed_count)
540 *removed_count = 0;
542 while ((rth = *rthp) != NULL) {
543 if (rth == expentry)
544 passedexpired = 1;
546 if (((*rthp)->u.dst.flags & DST_BALANCED) != 0 &&
547 compare_keys(&(*rthp)->fl, &expentry->fl)) {
548 if (*rthp == expentry) {
549 *rthp = rth->u.rt_next;
550 continue;
551 } else {
552 *rthp = rth->u.rt_next;
553 rt_free(rth);
554 if (removed_count)
555 ++(*removed_count);
557 } else {
558 if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
559 passedexpired && !nextstep)
560 nextstep = &rth->u.rt_next;
562 rthp = &rth->u.rt_next;
566 rt_free(expentry);
567 if (removed_count)
568 ++(*removed_count);
570 return nextstep;
572 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
575 /* This runs via a timer and thus is always in BH context. */
576 static void rt_check_expire(unsigned long dummy)
578 static int rover;
579 int i = rover, t;
580 struct rtable *rth, **rthp;
581 unsigned long now = jiffies;
583 for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
584 t -= ip_rt_gc_timeout) {
585 unsigned long tmo = ip_rt_gc_timeout;
587 i = (i + 1) & rt_hash_mask;
588 rthp = &rt_hash_table[i].chain;
590 spin_lock(&rt_hash_table[i].lock);
591 while ((rth = *rthp) != NULL) {
592 if (rth->u.dst.expires) {
593 /* Entry is expired even if it is in use */
594 if (time_before_eq(now, rth->u.dst.expires)) {
595 tmo >>= 1;
596 rthp = &rth->u.rt_next;
597 continue;
599 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
600 tmo >>= 1;
601 rthp = &rth->u.rt_next;
602 continue;
605 /* Cleanup aged off entries. */
606 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
607 /* remove all related balanced entries if necessary */
608 if (rth->u.dst.flags & DST_BALANCED) {
609 rthp = rt_remove_balanced_route(
610 &rt_hash_table[i].chain,
611 rth, NULL);
612 if (!rthp)
613 break;
614 } else {
615 *rthp = rth->u.rt_next;
616 rt_free(rth);
618 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
619 *rthp = rth->u.rt_next;
620 rt_free(rth);
621 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
623 spin_unlock(&rt_hash_table[i].lock);
625 /* Fallback loop breaker. */
626 if (time_after(jiffies, now))
627 break;
629 rover = i;
630 mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
633 /* This can run from both BH and non-BH contexts, the latter
634 * in the case of a forced flush event.
636 static void rt_run_flush(unsigned long dummy)
638 int i;
639 struct rtable *rth, *next;
641 rt_deadline = 0;
643 get_random_bytes(&rt_hash_rnd, 4);
645 for (i = rt_hash_mask; i >= 0; i--) {
646 spin_lock_bh(&rt_hash_table[i].lock);
647 rth = rt_hash_table[i].chain;
648 if (rth)
649 rt_hash_table[i].chain = NULL;
650 spin_unlock_bh(&rt_hash_table[i].lock);
652 for (; rth; rth = next) {
653 next = rth->u.rt_next;
654 rt_free(rth);
659 static DEFINE_SPINLOCK(rt_flush_lock);
661 void rt_cache_flush(int delay)
663 unsigned long now = jiffies;
664 int user_mode = !in_softirq();
666 if (delay < 0)
667 delay = ip_rt_min_delay;
669 /* flush existing multipath state*/
670 multipath_flush();
672 spin_lock_bh(&rt_flush_lock);
674 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
675 long tmo = (long)(rt_deadline - now);
677 /* If flush timer is already running
678 and flush request is not immediate (delay > 0):
680 if the deadline has not been reached, prolong the timer to "delay",
681 otherwise fire it at the deadline time.
684 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
685 tmo = 0;
687 if (delay > tmo)
688 delay = tmo;
691 if (delay <= 0) {
692 spin_unlock_bh(&rt_flush_lock);
693 rt_run_flush(0);
694 return;
697 if (rt_deadline == 0)
698 rt_deadline = now + ip_rt_max_delay;
700 mod_timer(&rt_flush_timer, now+delay);
701 spin_unlock_bh(&rt_flush_lock);
704 static void rt_secret_rebuild(unsigned long dummy)
706 unsigned long now = jiffies;
708 rt_cache_flush(0);
709 mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
713 Short description of GC goals.
715 We want to build an algorithm which keeps the routing cache
716 at an equilibrium point, where the number of aged-off entries
717 stays approximately equal to the number of newly generated ones.
719 The current expiration strength is the variable "expire".
720 We try to adjust it dynamically, so that when the network
721 is idle "expire" is large enough to keep plenty of warm entries,
722 and when load increases it shrinks to limit the cache size.
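/*
 * A worked example of the goal computation in rt_garbage_collect()
 * below (illustrative numbers; the hash size is picked at boot from
 * available memory): with rt_hash_log = 15, i.e. 32768 buckets, and
 * ip_rt_gc_elasticity = 8, the cache is comfortable up to
 * 8 << 15 = 262144 entries.  With 300000 cached routes the first goal
 * is 300000 - 262144 = 37856 entries to expire in this pass; below that
 * line the goal is recomputed against the slowly adapting "equilibrium"
 * level instead.
 */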
725 static int rt_garbage_collect(void)
727 static unsigned long expire = RT_GC_TIMEOUT;
728 static unsigned long last_gc;
729 static int rover;
730 static int equilibrium;
731 struct rtable *rth, **rthp;
732 unsigned long now = jiffies;
733 int goal;
736 * Garbage collection is pretty expensive,
737 * do not make it too frequently.
740 RT_CACHE_STAT_INC(gc_total);
742 if (now - last_gc < ip_rt_gc_min_interval &&
743 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
744 RT_CACHE_STAT_INC(gc_ignored);
745 goto out;
748 /* Calculate number of entries, which we want to expire now. */
749 goal = atomic_read(&ipv4_dst_ops.entries) -
750 (ip_rt_gc_elasticity << rt_hash_log);
751 if (goal <= 0) {
752 if (equilibrium < ipv4_dst_ops.gc_thresh)
753 equilibrium = ipv4_dst_ops.gc_thresh;
754 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
755 if (goal > 0) {
756 equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
757 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
759 } else {
760 /* We are in a dangerous area. Try to reduce the cache really
761 * aggressively.
763 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
764 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
767 if (now - last_gc >= ip_rt_gc_min_interval)
768 last_gc = now;
770 if (goal <= 0) {
771 equilibrium += goal;
772 goto work_done;
775 do {
776 int i, k;
778 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
779 unsigned long tmo = expire;
781 k = (k + 1) & rt_hash_mask;
782 rthp = &rt_hash_table[k].chain;
783 spin_lock_bh(&rt_hash_table[k].lock);
784 while ((rth = *rthp) != NULL) {
785 if (!rt_may_expire(rth, tmo, expire)) {
786 tmo >>= 1;
787 rthp = &rth->u.rt_next;
788 continue;
790 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
791 /* remove all related balanced entries
792 * if necessary
794 if (rth->u.dst.flags & DST_BALANCED) {
795 int r;
797 rthp = rt_remove_balanced_route(
798 &rt_hash_table[i].chain,
799 rth,
800 &r);
801 goal -= r;
802 if (!rthp)
803 break;
804 } else {
805 *rthp = rth->u.rt_next;
806 rt_free(rth);
807 goal--;
809 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
810 *rthp = rth->u.rt_next;
811 rt_free(rth);
812 goal--;
813 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
815 spin_unlock_bh(&rt_hash_table[k].lock);
816 if (goal <= 0)
817 break;
819 rover = k;
821 if (goal <= 0)
822 goto work_done;
824 /* The goal was not achieved. We stop the process if:
826 - expire was reduced to zero; otherwise, expire is halved.
827 - the table is not full.
828 - we are called from interrupt context.
829 - the jiffies check is just a fallback/debug loop breaker.
830 We will not spin here for a long time in any case.
833 RT_CACHE_STAT_INC(gc_goal_miss);
835 if (expire == 0)
836 break;
838 expire >>= 1;
839 #if RT_CACHE_DEBUG >= 2
840 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
841 atomic_read(&ipv4_dst_ops.entries), goal, i);
842 #endif
844 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
845 goto out;
846 } while (!in_softirq() && time_before_eq(jiffies, now));
848 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
849 goto out;
850 if (net_ratelimit())
851 printk(KERN_WARNING "dst cache overflow\n");
852 RT_CACHE_STAT_INC(gc_dst_overflow);
853 return 1;
855 work_done:
856 expire += ip_rt_gc_min_interval;
857 if (expire > ip_rt_gc_timeout ||
858 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
859 expire = ip_rt_gc_timeout;
860 #if RT_CACHE_DEBUG >= 2
861 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
862 atomic_read(&ipv4_dst_ops.entries), goal, rover);
863 #endif
864 out: return 0;
867 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
869 struct rtable *rth, **rthp;
870 unsigned long now;
871 struct rtable *cand, **candp;
872 u32 min_score;
873 int chain_length;
874 int attempts = !in_softirq();
876 restart:
877 chain_length = 0;
878 min_score = ~(u32)0;
879 cand = NULL;
880 candp = NULL;
881 now = jiffies;
883 rthp = &rt_hash_table[hash].chain;
885 spin_lock_bh(&rt_hash_table[hash].lock);
886 while ((rth = *rthp) != NULL) {
887 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
888 if (!(rth->u.dst.flags & DST_BALANCED) &&
889 compare_keys(&rth->fl, &rt->fl)) {
890 #else
891 if (compare_keys(&rth->fl, &rt->fl)) {
892 #endif
893 /* Put it first */
894 *rthp = rth->u.rt_next;
896 * Since lookup is lockfree, the deletion
897 * must be visible to another weakly ordered CPU before
898 * the insertion at the start of the hash chain.
900 rcu_assign_pointer(rth->u.rt_next,
901 rt_hash_table[hash].chain);
903 * Since lookup is lockfree, the update writes
904 * must be ordered for consistency on SMP.
906 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
908 rth->u.dst.__use++;
909 dst_hold(&rth->u.dst);
910 rth->u.dst.lastuse = now;
911 spin_unlock_bh(&rt_hash_table[hash].lock);
913 rt_drop(rt);
914 *rp = rth;
915 return 0;
918 if (!atomic_read(&rth->u.dst.__refcnt)) {
919 u32 score = rt_score(rth);
921 if (score <= min_score) {
922 cand = rth;
923 candp = rthp;
924 min_score = score;
928 chain_length++;
930 rthp = &rth->u.rt_next;
933 if (cand) {
934 /* ip_rt_gc_elasticity used to be the average chain
935 * length; when it is exceeded, gc becomes really aggressive.
937 * The second limit is less certain. At the moment it allows
938 * only 2 entries per bucket. We will see.
940 if (chain_length > ip_rt_gc_elasticity) {
941 *candp = cand->u.rt_next;
942 rt_free(cand);
946 /* Try to bind the route to arp only if it is an output
947 route or a unicast forwarding path.
949 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
950 int err = arp_bind_neighbour(&rt->u.dst);
951 if (err) {
952 spin_unlock_bh(&rt_hash_table[hash].lock);
954 if (err != -ENOBUFS) {
955 rt_drop(rt);
956 return err;
959 /* Neighbour tables are full and nothing
960 can be released. Try to shrink route cache,
961 as it most likely holds some neighbour records.
963 if (attempts-- > 0) {
964 int saved_elasticity = ip_rt_gc_elasticity;
965 int saved_int = ip_rt_gc_min_interval;
966 ip_rt_gc_elasticity = 1;
967 ip_rt_gc_min_interval = 0;
968 rt_garbage_collect();
969 ip_rt_gc_min_interval = saved_int;
970 ip_rt_gc_elasticity = saved_elasticity;
971 goto restart;
974 if (net_ratelimit())
975 printk(KERN_WARNING "Neighbour table overflow.\n");
976 rt_drop(rt);
977 return -ENOBUFS;
981 rt->u.rt_next = rt_hash_table[hash].chain;
982 #if RT_CACHE_DEBUG >= 2
983 if (rt->u.rt_next) {
984 struct rtable *trt;
985 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
986 NIPQUAD(rt->rt_dst));
987 for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
988 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
989 printk("\n");
991 #endif
992 rt_hash_table[hash].chain = rt;
993 spin_unlock_bh(&rt_hash_table[hash].lock);
994 *rp = rt;
995 return 0;
998 void rt_bind_peer(struct rtable *rt, int create)
1000 static DEFINE_SPINLOCK(rt_peer_lock);
1001 struct inet_peer *peer;
1003 peer = inet_getpeer(rt->rt_dst, create);
1005 spin_lock_bh(&rt_peer_lock);
1006 if (rt->peer == NULL) {
1007 rt->peer = peer;
1008 peer = NULL;
1010 spin_unlock_bh(&rt_peer_lock);
1011 if (peer)
1012 inet_putpeer(peer);
1016 * Peer allocation may fail only in serious out-of-memory conditions. However
1017 * we still can generate some output.
1018 * Random ID selection looks a bit dangerous because we have no chance of
1019 * selecting an ID that stays unique for a reasonable period of time.
1020 * But broken packet identifier may be better than no packet at all.
1022 static void ip_select_fb_ident(struct iphdr *iph)
1024 static DEFINE_SPINLOCK(ip_fb_id_lock);
1025 static u32 ip_fallback_id;
1026 u32 salt;
1028 spin_lock_bh(&ip_fb_id_lock);
1029 salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
1030 iph->id = htons(salt & 0xFFFF);
1031 ip_fallback_id = salt;
1032 spin_unlock_bh(&ip_fb_id_lock);
1035 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1037 struct rtable *rt = (struct rtable *) dst;
1039 if (rt) {
1040 if (rt->peer == NULL)
1041 rt_bind_peer(rt, 1);
1043 /* If peer is attached to destination, it is never detached,
1044 so we do not need to grab a lock to dereference it.
1046 if (rt->peer) {
1047 iph->id = htons(inet_getid(rt->peer, more));
1048 return;
1050 } else
1051 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", NET_CALLER(iph));
1053 ip_select_fb_ident(iph);
1056 static void rt_del(unsigned hash, struct rtable *rt)
1058 struct rtable **rthp;
1060 spin_lock_bh(&rt_hash_table[hash].lock);
1061 ip_rt_put(rt);
1062 for (rthp = &rt_hash_table[hash].chain; *rthp;
1063 rthp = &(*rthp)->u.rt_next)
1064 if (*rthp == rt) {
1065 *rthp = rt->u.rt_next;
1066 rt_free(rt);
1067 break;
1069 spin_unlock_bh(&rt_hash_table[hash].lock);
1072 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
1073 u32 saddr, u8 tos, struct net_device *dev)
1075 int i, k;
1076 struct in_device *in_dev = in_dev_get(dev);
1077 struct rtable *rth, **rthp;
1078 u32 skeys[2] = { saddr, 0 };
1079 int ikeys[2] = { dev->ifindex, 0 };
1081 tos &= IPTOS_RT_MASK;
1083 if (!in_dev)
1084 return;
1086 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1087 || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1088 goto reject_redirect;
1090 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1091 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1092 goto reject_redirect;
1093 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1094 goto reject_redirect;
1095 } else {
1096 if (inet_addr_type(new_gw) != RTN_UNICAST)
1097 goto reject_redirect;
1100 for (i = 0; i < 2; i++) {
1101 for (k = 0; k < 2; k++) {
1102 unsigned hash = rt_hash_code(daddr,
1103 skeys[i] ^ (ikeys[k] << 5),
1104 tos);
1106 rthp=&rt_hash_table[hash].chain;
1108 rcu_read_lock();
1109 while ((rth = rcu_dereference(*rthp)) != NULL) {
1110 struct rtable *rt;
1112 if (rth->fl.fl4_dst != daddr ||
1113 rth->fl.fl4_src != skeys[i] ||
1114 rth->fl.fl4_tos != tos ||
1115 rth->fl.oif != ikeys[k] ||
1116 rth->fl.iif != 0) {
1117 rthp = &rth->u.rt_next;
1118 continue;
1121 if (rth->rt_dst != daddr ||
1122 rth->rt_src != saddr ||
1123 rth->u.dst.error ||
1124 rth->rt_gateway != old_gw ||
1125 rth->u.dst.dev != dev)
1126 break;
1128 dst_hold(&rth->u.dst);
1129 rcu_read_unlock();
1131 rt = dst_alloc(&ipv4_dst_ops);
1132 if (rt == NULL) {
1133 ip_rt_put(rth);
1134 in_dev_put(in_dev);
1135 return;
1138 /* Copy all the information. */
1139 *rt = *rth;
1140 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1141 rt->u.dst.__use = 1;
1142 atomic_set(&rt->u.dst.__refcnt, 1);
1143 rt->u.dst.child = NULL;
1144 if (rt->u.dst.dev)
1145 dev_hold(rt->u.dst.dev);
1146 if (rt->idev)
1147 in_dev_hold(rt->idev);
1148 rt->u.dst.obsolete = 0;
1149 rt->u.dst.lastuse = jiffies;
1150 rt->u.dst.path = &rt->u.dst;
1151 rt->u.dst.neighbour = NULL;
1152 rt->u.dst.hh = NULL;
1153 rt->u.dst.xfrm = NULL;
1155 rt->rt_flags |= RTCF_REDIRECTED;
1157 /* Gateway is different ... */
1158 rt->rt_gateway = new_gw;
1160 /* Redirect received -> path was valid */
1161 dst_confirm(&rth->u.dst);
1163 if (rt->peer)
1164 atomic_inc(&rt->peer->refcnt);
1166 if (arp_bind_neighbour(&rt->u.dst) ||
1167 !(rt->u.dst.neighbour->nud_state &
1168 NUD_VALID)) {
1169 if (rt->u.dst.neighbour)
1170 neigh_event_send(rt->u.dst.neighbour, NULL);
1171 ip_rt_put(rth);
1172 rt_drop(rt);
1173 goto do_next;
1176 rt_del(hash, rth);
1177 if (!rt_intern_hash(hash, rt, &rt))
1178 ip_rt_put(rt);
1179 goto do_next;
1181 rcu_read_unlock();
1182 do_next:
1186 in_dev_put(in_dev);
1187 return;
1189 reject_redirect:
1190 #ifdef CONFIG_IP_ROUTE_VERBOSE
1191 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1192 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1193 "%u.%u.%u.%u ignored.\n"
1194 " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
1195 "tos %02x\n",
1196 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1197 NIPQUAD(saddr), NIPQUAD(daddr), tos);
1198 #endif
1199 in_dev_put(in_dev);
1202 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1204 struct rtable *rt = (struct rtable*)dst;
1205 struct dst_entry *ret = dst;
1207 if (rt) {
1208 if (dst->obsolete) {
1209 ip_rt_put(rt);
1210 ret = NULL;
1211 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1212 rt->u.dst.expires) {
1213 unsigned hash = rt_hash_code(rt->fl.fl4_dst,
1214 rt->fl.fl4_src ^
1215 (rt->fl.oif << 5),
1216 rt->fl.fl4_tos);
1217 #if RT_CACHE_DEBUG >= 1
1218 printk(KERN_DEBUG "ip_rt_advice: redirect to "
1219 "%u.%u.%u.%u/%02x dropped\n",
1220 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1221 #endif
1222 rt_del(hash, rt);
1223 ret = NULL;
1226 return ret;
1230 * Algorithm:
1231 * 1. The first ip_rt_redirect_number redirects are sent
1232 * with exponential backoff, then we stop sending them at all,
1233 * assuming that the host ignores our redirects.
1234 * 2. If we did not see packets requiring redirects
1235 * during ip_rt_redirect_silence, we assume that the host
1236 * forgot the redirected route, and we start sending redirects again.
1238 * This algorithm is much cheaper and more intelligent than dumb load limiting
1239 * in icmp.c.
1241 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1242 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1245 void ip_rt_send_redirect(struct sk_buff *skb)
1247 struct rtable *rt = (struct rtable*)skb->dst;
1248 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1250 if (!in_dev)
1251 return;
1253 if (!IN_DEV_TX_REDIRECTS(in_dev))
1254 goto out;
1256 /* No redirected packets during ip_rt_redirect_silence;
1257 * reset the algorithm.
1259 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1260 rt->u.dst.rate_tokens = 0;
1262 /* Too many ignored redirects; do not send anything,
1263 * just set u.dst.rate_last to the last seen redirected packet.
1265 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1266 rt->u.dst.rate_last = jiffies;
1267 goto out;
1270 /* Check for load limit; set rate_last to the latest sent
1271 * redirect.
1273 if (time_after(jiffies,
1274 (rt->u.dst.rate_last +
1275 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1276 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1277 rt->u.dst.rate_last = jiffies;
1278 ++rt->u.dst.rate_tokens;
1279 #ifdef CONFIG_IP_ROUTE_VERBOSE
1280 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1281 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1282 net_ratelimit())
1283 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1284 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1285 NIPQUAD(rt->rt_src), rt->rt_iif,
1286 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1287 #endif
1289 out:
1290 in_dev_put(in_dev);
1293 static int ip_error(struct sk_buff *skb)
1295 struct rtable *rt = (struct rtable*)skb->dst;
1296 unsigned long now;
1297 int code;
1299 switch (rt->u.dst.error) {
1300 case EINVAL:
1301 default:
1302 goto out;
1303 case EHOSTUNREACH:
1304 code = ICMP_HOST_UNREACH;
1305 break;
1306 case ENETUNREACH:
1307 code = ICMP_NET_UNREACH;
1308 break;
1309 case EACCES:
1310 code = ICMP_PKT_FILTERED;
1311 break;
1314 now = jiffies;
1315 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1316 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1317 rt->u.dst.rate_tokens = ip_rt_error_burst;
1318 rt->u.dst.rate_last = now;
1319 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1320 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1321 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1324 out: kfree_skb(skb);
1325 return 0;
1329 * The last two values are not from the RFC but
1330 * are needed for AMPRnet AX.25 paths.
1333 static unsigned short mtu_plateau[] =
1334 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1336 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1338 int i;
1340 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1341 if (old_mtu > mtu_plateau[i])
1342 return mtu_plateau[i];
1343 return 68;
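/*
 * Quick examples of the plateau search above: an original packet of
 * 1500 bytes with no usable MTU in the ICMP message steps down to 1492
 * (the largest plateau strictly below 1500), a 296-byte packet steps
 * down to 216, and anything at or below 128 falls through to the
 * 68-byte IPv4 minimum.
 */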
1346 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1348 int i;
1349 unsigned short old_mtu = ntohs(iph->tot_len);
1350 struct rtable *rth;
1351 u32 skeys[2] = { iph->saddr, 0, };
1352 u32 daddr = iph->daddr;
1353 u8 tos = iph->tos & IPTOS_RT_MASK;
1354 unsigned short est_mtu = 0;
1356 if (ipv4_config.no_pmtu_disc)
1357 return 0;
1359 for (i = 0; i < 2; i++) {
1360 unsigned hash = rt_hash_code(daddr, skeys[i], tos);
1362 rcu_read_lock();
1363 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1364 rth = rcu_dereference(rth->u.rt_next)) {
1365 if (rth->fl.fl4_dst == daddr &&
1366 rth->fl.fl4_src == skeys[i] &&
1367 rth->rt_dst == daddr &&
1368 rth->rt_src == iph->saddr &&
1369 rth->fl.fl4_tos == tos &&
1370 rth->fl.iif == 0 &&
1371 !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1372 unsigned short mtu = new_mtu;
1374 if (new_mtu < 68 || new_mtu >= old_mtu) {
1376 /* BSD 4.2 compatibility hack :-( */
1377 if (mtu == 0 &&
1378 old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1379 old_mtu >= 68 + (iph->ihl << 2))
1380 old_mtu -= iph->ihl << 2;
1382 mtu = guess_mtu(old_mtu);
1384 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1385 if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1386 dst_confirm(&rth->u.dst);
1387 if (mtu < ip_rt_min_pmtu) {
1388 mtu = ip_rt_min_pmtu;
1389 rth->u.dst.metrics[RTAX_LOCK-1] |=
1390 (1 << RTAX_MTU);
1392 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1393 dst_set_expires(&rth->u.dst,
1394 ip_rt_mtu_expires);
1396 est_mtu = mtu;
1400 rcu_read_unlock();
1402 return est_mtu ? : new_mtu;
1405 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1407 if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1408 !(dst_metric_locked(dst, RTAX_MTU))) {
1409 if (mtu < ip_rt_min_pmtu) {
1410 mtu = ip_rt_min_pmtu;
1411 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1413 dst->metrics[RTAX_MTU-1] = mtu;
1414 dst_set_expires(dst, ip_rt_mtu_expires);
1418 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1420 return NULL;
1423 static void ipv4_dst_destroy(struct dst_entry *dst)
1425 struct rtable *rt = (struct rtable *) dst;
1426 struct inet_peer *peer = rt->peer;
1427 struct in_device *idev = rt->idev;
1429 if (peer) {
1430 rt->peer = NULL;
1431 inet_putpeer(peer);
1434 if (idev) {
1435 rt->idev = NULL;
1436 in_dev_put(idev);
1440 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1441 int how)
1443 struct rtable *rt = (struct rtable *) dst;
1444 struct in_device *idev = rt->idev;
1445 if (dev != &loopback_dev && idev && idev->dev == dev) {
1446 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1447 if (loopback_idev) {
1448 rt->idev = loopback_idev;
1449 in_dev_put(idev);
1454 static void ipv4_link_failure(struct sk_buff *skb)
1456 struct rtable *rt;
1458 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1460 rt = (struct rtable *) skb->dst;
1461 if (rt)
1462 dst_set_expires(&rt->u.dst, 0);
1465 static int ip_rt_bug(struct sk_buff *skb)
1467 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1468 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1469 skb->dev ? skb->dev->name : "?");
1470 kfree_skb(skb);
1471 return 0;
1475 We do not cache the source address of the outgoing interface,
1476 because it is used only by IP RR, TS and SRR options,
1477 so it is out of the fast path.
1479 BTW remember: "addr" is allowed to be unaligned
1480 in IP options!
1483 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1485 u32 src;
1486 struct fib_result res;
1488 if (rt->fl.iif == 0)
1489 src = rt->rt_src;
1490 else if (fib_lookup(&rt->fl, &res) == 0) {
1491 src = FIB_RES_PREFSRC(res);
1492 fib_res_put(&res);
1493 } else
1494 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1495 RT_SCOPE_UNIVERSE);
1496 memcpy(addr, &src, 4);
1499 #ifdef CONFIG_NET_CLS_ROUTE
1500 static void set_class_tag(struct rtable *rt, u32 tag)
1502 if (!(rt->u.dst.tclassid & 0xFFFF))
1503 rt->u.dst.tclassid |= tag & 0xFFFF;
1504 if (!(rt->u.dst.tclassid & 0xFFFF0000))
1505 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1507 #endif
1509 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1511 struct fib_info *fi = res->fi;
1513 if (fi) {
1514 if (FIB_RES_GW(*res) &&
1515 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1516 rt->rt_gateway = FIB_RES_GW(*res);
1517 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1518 sizeof(rt->u.dst.metrics));
1519 if (fi->fib_mtu == 0) {
1520 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1521 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1522 rt->rt_gateway != rt->rt_dst &&
1523 rt->u.dst.dev->mtu > 576)
1524 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1526 #ifdef CONFIG_NET_CLS_ROUTE
1527 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1528 #endif
1529 } else
1530 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1532 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1533 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1534 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1535 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1536 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1537 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1538 ip_rt_min_advmss);
1539 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1540 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1542 #ifdef CONFIG_NET_CLS_ROUTE
1543 #ifdef CONFIG_IP_MULTIPLE_TABLES
1544 set_class_tag(rt, fib_rules_tclass(res));
1545 #endif
1546 set_class_tag(rt, itag);
1547 #endif
1548 rt->rt_type = res->type;
1551 static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1552 u8 tos, struct net_device *dev, int our)
1554 unsigned hash;
1555 struct rtable *rth;
1556 u32 spec_dst;
1557 struct in_device *in_dev = in_dev_get(dev);
1558 u32 itag = 0;
1560 /* Primary sanity checks. */
1562 if (in_dev == NULL)
1563 return -EINVAL;
1565 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1566 skb->protocol != htons(ETH_P_IP))
1567 goto e_inval;
1569 if (ZERONET(saddr)) {
1570 if (!LOCAL_MCAST(daddr))
1571 goto e_inval;
1572 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1573 } else if (fib_validate_source(saddr, 0, tos, 0,
1574 dev, &spec_dst, &itag) < 0)
1575 goto e_inval;
1577 rth = dst_alloc(&ipv4_dst_ops);
1578 if (!rth)
1579 goto e_nobufs;
1581 rth->u.dst.output= ip_rt_bug;
1583 atomic_set(&rth->u.dst.__refcnt, 1);
1584 rth->u.dst.flags= DST_HOST;
1585 if (in_dev->cnf.no_policy)
1586 rth->u.dst.flags |= DST_NOPOLICY;
1587 rth->fl.fl4_dst = daddr;
1588 rth->rt_dst = daddr;
1589 rth->fl.fl4_tos = tos;
1590 #ifdef CONFIG_IP_ROUTE_FWMARK
1591 rth->fl.fl4_fwmark= skb->nfmark;
1592 #endif
1593 rth->fl.fl4_src = saddr;
1594 rth->rt_src = saddr;
1595 #ifdef CONFIG_NET_CLS_ROUTE
1596 rth->u.dst.tclassid = itag;
1597 #endif
1598 rth->rt_iif =
1599 rth->fl.iif = dev->ifindex;
1600 rth->u.dst.dev = &loopback_dev;
1601 dev_hold(rth->u.dst.dev);
1602 rth->idev = in_dev_get(rth->u.dst.dev);
1603 rth->fl.oif = 0;
1604 rth->rt_gateway = daddr;
1605 rth->rt_spec_dst= spec_dst;
1606 rth->rt_type = RTN_MULTICAST;
1607 rth->rt_flags = RTCF_MULTICAST;
1608 if (our) {
1609 rth->u.dst.input= ip_local_deliver;
1610 rth->rt_flags |= RTCF_LOCAL;
1613 #ifdef CONFIG_IP_MROUTE
1614 if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1615 rth->u.dst.input = ip_mr_input;
1616 #endif
1617 RT_CACHE_STAT_INC(in_slow_mc);
1619 in_dev_put(in_dev);
1620 hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
1621 return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1623 e_nobufs:
1624 in_dev_put(in_dev);
1625 return -ENOBUFS;
1627 e_inval:
1628 in_dev_put(in_dev);
1629 return -EINVAL;
1633 static void ip_handle_martian_source(struct net_device *dev,
1634 struct in_device *in_dev,
1635 struct sk_buff *skb,
1636 u32 daddr,
1637 u32 saddr)
1639 RT_CACHE_STAT_INC(in_martian_src);
1640 #ifdef CONFIG_IP_ROUTE_VERBOSE
1641 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1643 * RFC1812 recommendation: if the source is martian,
1644 * the only hint we can log is the MAC header.
1646 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1647 "%u.%u.%u.%u, on dev %s\n",
1648 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1649 if (dev->hard_header_len) {
1650 int i;
1651 unsigned char *p = skb->mac.raw;
1652 printk(KERN_WARNING "ll header: ");
1653 for (i = 0; i < dev->hard_header_len; i++, p++) {
1654 printk("%02x", *p);
1655 if (i < (dev->hard_header_len - 1))
1656 printk(":");
1658 printk("\n");
1661 #endif
1664 static inline int __mkroute_input(struct sk_buff *skb,
1665 struct fib_result* res,
1666 struct in_device *in_dev,
1667 u32 daddr, u32 saddr, u32 tos,
1668 struct rtable **result)
1671 struct rtable *rth;
1672 int err;
1673 struct in_device *out_dev;
1674 unsigned flags = 0;
1675 u32 spec_dst, itag;
1677 /* get a working reference to the output device */
1678 out_dev = in_dev_get(FIB_RES_DEV(*res));
1679 if (out_dev == NULL) {
1680 if (net_ratelimit())
1681 printk(KERN_CRIT "Bug in ip_route_input" \
1682 "_slow(). Please, report\n");
1683 return -EINVAL;
1687 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1688 in_dev->dev, &spec_dst, &itag);
1689 if (err < 0) {
1690 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1691 saddr);
1693 err = -EINVAL;
1694 goto cleanup;
1697 if (err)
1698 flags |= RTCF_DIRECTSRC;
1700 if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1701 (IN_DEV_SHARED_MEDIA(out_dev) ||
1702 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1703 flags |= RTCF_DOREDIRECT;
1705 if (skb->protocol != htons(ETH_P_IP)) {
1706 /* Not IP (i.e. ARP). Do not create route, if it is
1707 * invalid for proxy arp. DNAT routes are always valid.
1709 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1710 err = -EINVAL;
1711 goto cleanup;
1716 rth = dst_alloc(&ipv4_dst_ops);
1717 if (!rth) {
1718 err = -ENOBUFS;
1719 goto cleanup;
1722 rth->u.dst.flags= DST_HOST;
1723 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1724 if (res->fi->fib_nhs > 1)
1725 rth->u.dst.flags |= DST_BALANCED;
1726 #endif
1727 if (in_dev->cnf.no_policy)
1728 rth->u.dst.flags |= DST_NOPOLICY;
1729 if (in_dev->cnf.no_xfrm)
1730 rth->u.dst.flags |= DST_NOXFRM;
1731 rth->fl.fl4_dst = daddr;
1732 rth->rt_dst = daddr;
1733 rth->fl.fl4_tos = tos;
1734 #ifdef CONFIG_IP_ROUTE_FWMARK
1735 rth->fl.fl4_fwmark= skb->nfmark;
1736 #endif
1737 rth->fl.fl4_src = saddr;
1738 rth->rt_src = saddr;
1739 rth->rt_gateway = daddr;
1740 rth->rt_iif =
1741 rth->fl.iif = in_dev->dev->ifindex;
1742 rth->u.dst.dev = (out_dev)->dev;
1743 dev_hold(rth->u.dst.dev);
1744 rth->idev = in_dev_get(rth->u.dst.dev);
1745 rth->fl.oif = 0;
1746 rth->rt_spec_dst= spec_dst;
1748 rth->u.dst.input = ip_forward;
1749 rth->u.dst.output = ip_output;
1751 rt_set_nexthop(rth, res, itag);
1753 rth->rt_flags = flags;
1755 *result = rth;
1756 err = 0;
1757 cleanup:
1758 /* release the working reference to the output device */
1759 in_dev_put(out_dev);
1760 return err;
1763 static inline int ip_mkroute_input_def(struct sk_buff *skb,
1764 struct fib_result* res,
1765 const struct flowi *fl,
1766 struct in_device *in_dev,
1767 u32 daddr, u32 saddr, u32 tos)
1769 struct rtable* rth;
1770 int err;
1771 unsigned hash;
1773 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1774 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1775 fib_select_multipath(fl, res);
1776 #endif
1778 /* create a routing cache entry */
1779 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1780 if (err)
1781 return err;
1782 atomic_set(&rth->u.dst.__refcnt, 1);
1784 /* put it into the cache */
1785 hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
1786 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1789 static inline int ip_mkroute_input(struct sk_buff *skb,
1790 struct fib_result* res,
1791 const struct flowi *fl,
1792 struct in_device *in_dev,
1793 u32 daddr, u32 saddr, u32 tos)
1795 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1796 struct rtable* rth;
1797 unsigned char hop, hopcount, lasthop;
1798 int err = -EINVAL;
1799 unsigned int hash;
1801 if (res->fi)
1802 hopcount = res->fi->fib_nhs;
1803 else
1804 hopcount = 1;
1806 lasthop = hopcount - 1;
1808 /* distinguish between multipath and singlepath */
1809 if (hopcount < 2)
1810 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1811 saddr, tos);
1813 /* add all alternatives to the routing cache */
1814 for (hop = 0; hop < hopcount; hop++) {
1815 res->nh_sel = hop;
1817 /* create a routing cache entry */
1818 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1819 &rth);
1820 if (err)
1821 return err;
1823 /* put it into the cache */
1824 hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
1825 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1826 if (err)
1827 return err;
1829 /* forward hop information to multipath impl. */
1830 multipath_set_nhinfo(rth,
1831 FIB_RES_NETWORK(*res),
1832 FIB_RES_NETMASK(*res),
1833 res->prefixlen,
1834 &FIB_RES_NH(*res));
1836 /* only for the last hop is the reference count handled
1837 * outside
1839 if (hop == lasthop)
1840 atomic_set(&(skb->dst->__refcnt), 1);
1842 return err;
1843 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
1844 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1845 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
1850 * NOTE. We drop all packets that have a local source
1851 * address, because every properly looped-back packet
1852 * must have the correct destination already attached by the output routine.
1854 * Such an approach solves two big problems:
1855 * 1. Non-simplex devices are handled properly.
1856 * 2. IP spoofing attempts are filtered with a 100% guarantee.
1859 static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1860 u8 tos, struct net_device *dev)
1862 struct fib_result res;
1863 struct in_device *in_dev = in_dev_get(dev);
1864 struct flowi fl = { .nl_u = { .ip4_u =
1865 { .daddr = daddr,
1866 .saddr = saddr,
1867 .tos = tos,
1868 .scope = RT_SCOPE_UNIVERSE,
1869 #ifdef CONFIG_IP_ROUTE_FWMARK
1870 .fwmark = skb->nfmark
1871 #endif
1872 } },
1873 .iif = dev->ifindex };
1874 unsigned flags = 0;
1875 u32 itag = 0;
1876 struct rtable * rth;
1877 unsigned hash;
1878 u32 spec_dst;
1879 int err = -EINVAL;
1880 int free_res = 0;
1882 /* IP on this device is disabled. */
1884 if (!in_dev)
1885 goto out;
1887 /* Check for the most weird martians, which may not be detected
1888 by fib_lookup.
1891 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1892 goto martian_source;
1894 if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1895 goto brd_input;
1897 /* Accept zero addresses only to limited broadcast;
1898 * I do not even know whether to fix it or not. Waiting for complaints :-)
1900 if (ZERONET(saddr))
1901 goto martian_source;
1903 if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1904 goto martian_destination;
1907 * Now we are ready to route packet.
1909 if ((err = fib_lookup(&fl, &res)) != 0) {
1910 if (!IN_DEV_FORWARD(in_dev))
1911 goto e_inval;
1912 goto no_route;
1914 free_res = 1;
1916 RT_CACHE_STAT_INC(in_slow_tot);
1918 if (res.type == RTN_BROADCAST)
1919 goto brd_input;
1921 if (res.type == RTN_LOCAL) {
1922 int result;
1923 result = fib_validate_source(saddr, daddr, tos,
1924 loopback_dev.ifindex,
1925 dev, &spec_dst, &itag);
1926 if (result < 0)
1927 goto martian_source;
1928 if (result)
1929 flags |= RTCF_DIRECTSRC;
1930 spec_dst = daddr;
1931 goto local_input;
1934 if (!IN_DEV_FORWARD(in_dev))
1935 goto e_inval;
1936 if (res.type != RTN_UNICAST)
1937 goto martian_destination;
1939 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1940 if (err == -ENOBUFS)
1941 goto e_nobufs;
1942 if (err == -EINVAL)
1943 goto e_inval;
1945 done:
1946 in_dev_put(in_dev);
1947 if (free_res)
1948 fib_res_put(&res);
1949 out: return err;
1951 brd_input:
1952 if (skb->protocol != htons(ETH_P_IP))
1953 goto e_inval;
1955 if (ZERONET(saddr))
1956 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1957 else {
1958 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1959 &itag);
1960 if (err < 0)
1961 goto martian_source;
1962 if (err)
1963 flags |= RTCF_DIRECTSRC;
1965 flags |= RTCF_BROADCAST;
1966 res.type = RTN_BROADCAST;
1967 RT_CACHE_STAT_INC(in_brd);
1969 local_input:
1970 rth = dst_alloc(&ipv4_dst_ops);
1971 if (!rth)
1972 goto e_nobufs;
1974 rth->u.dst.output= ip_rt_bug;
1976 atomic_set(&rth->u.dst.__refcnt, 1);
1977 rth->u.dst.flags= DST_HOST;
1978 if (in_dev->cnf.no_policy)
1979 rth->u.dst.flags |= DST_NOPOLICY;
1980 rth->fl.fl4_dst = daddr;
1981 rth->rt_dst = daddr;
1982 rth->fl.fl4_tos = tos;
1983 #ifdef CONFIG_IP_ROUTE_FWMARK
1984 rth->fl.fl4_fwmark= skb->nfmark;
1985 #endif
1986 rth->fl.fl4_src = saddr;
1987 rth->rt_src = saddr;
1988 #ifdef CONFIG_NET_CLS_ROUTE
1989 rth->u.dst.tclassid = itag;
1990 #endif
1991 rth->rt_iif =
1992 rth->fl.iif = dev->ifindex;
1993 rth->u.dst.dev = &loopback_dev;
1994 dev_hold(rth->u.dst.dev);
1995 rth->idev = in_dev_get(rth->u.dst.dev);
1996 rth->rt_gateway = daddr;
1997 rth->rt_spec_dst= spec_dst;
1998 rth->u.dst.input= ip_local_deliver;
1999 rth->rt_flags = flags|RTCF_LOCAL;
2000 if (res.type == RTN_UNREACHABLE) {
2001 rth->u.dst.input= ip_error;
2002 rth->u.dst.error= -err;
2003 rth->rt_flags &= ~RTCF_LOCAL;
2005 rth->rt_type = res.type;
2006 hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);
2007 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2008 goto done;
2010 no_route:
2011 RT_CACHE_STAT_INC(in_no_route);
2012 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2013 res.type = RTN_UNREACHABLE;
2014 goto local_input;
2017 * Do not cache martian addresses: they should be logged (RFC1812)
2019 martian_destination:
2020 RT_CACHE_STAT_INC(in_martian_dst);
2021 #ifdef CONFIG_IP_ROUTE_VERBOSE
2022 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2023 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2024 "%u.%u.%u.%u, dev %s\n",
2025 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2026 #endif
2027 e_inval:
2028 err = -EINVAL;
2029 goto done;
2031 e_nobufs:
2032 err = -ENOBUFS;
2033 goto done;
2035 martian_source:
2036 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2037 goto e_inval;
2040 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
2041 u8 tos, struct net_device *dev)
2043 struct rtable * rth;
2044 unsigned hash;
2045 int iif = dev->ifindex;
2047 tos &= IPTOS_RT_MASK;
2048 hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
2050 rcu_read_lock();
2051 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2052 rth = rcu_dereference(rth->u.rt_next)) {
2053 if (rth->fl.fl4_dst == daddr &&
2054 rth->fl.fl4_src == saddr &&
2055 rth->fl.iif == iif &&
2056 rth->fl.oif == 0 &&
2057 #ifdef CONFIG_IP_ROUTE_FWMARK
2058 rth->fl.fl4_fwmark == skb->nfmark &&
2059 #endif
2060 rth->fl.fl4_tos == tos) {
2061 rth->u.dst.lastuse = jiffies;
2062 dst_hold(&rth->u.dst);
2063 rth->u.dst.__use++;
2064 RT_CACHE_STAT_INC(in_hit);
2065 rcu_read_unlock();
2066 skb->dst = (struct dst_entry*)rth;
2067 return 0;
2069 RT_CACHE_STAT_INC(in_hlist_search);
2071 rcu_read_unlock();
2073 /* Multicast recognition logic is moved from route cache to here.
2074 The problem was that too many Ethernet cards have broken/missing
2075 hardware multicast filters :-( As a result, a host on a multicasting
2076 network acquires a lot of useless route cache entries, e.g. for
2077 SDR messages from all over the world. Now we try to get rid of them.
2078 Really, provided the software IP multicast filter is organized
2079 reasonably (at least, hashed), it does not result in a slowdown
2080 compared with route cache reject entries.
2081 Note that multicast routers are not affected, because
2082 route cache entry is created eventually.
2084 if (MULTICAST(daddr)) {
2085 struct in_device *in_dev;
2087 rcu_read_lock();
2088 if ((in_dev = __in_dev_get(dev)) != NULL) {
2089 int our = ip_check_mc(in_dev, daddr, saddr,
2090 skb->nh.iph->protocol);
2091 if (our
2092 #ifdef CONFIG_IP_MROUTE
2093 || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2094 #endif
2096 rcu_read_unlock();
2097 return ip_route_input_mc(skb, daddr, saddr,
2098 tos, dev, our);
2101 rcu_read_unlock();
2102 return -EINVAL;
2104 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2107 static inline int __mkroute_output(struct rtable **result,
2108 struct fib_result* res,
2109 const struct flowi *fl,
2110 const struct flowi *oldflp,
2111 struct net_device *dev_out,
2112 unsigned flags)
2114 struct rtable *rth;
2115 struct in_device *in_dev;
2116 u32 tos = RT_FL_TOS(oldflp);
2117 int err = 0;
2119 if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2120 return -EINVAL;
2122 if (fl->fl4_dst == 0xFFFFFFFF)
2123 res->type = RTN_BROADCAST;
2124 else if (MULTICAST(fl->fl4_dst))
2125 res->type = RTN_MULTICAST;
2126 else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2127 return -EINVAL;
2129 if (dev_out->flags & IFF_LOOPBACK)
2130 flags |= RTCF_LOCAL;
2132 /* get work reference to inet device */
2133 in_dev = in_dev_get(dev_out);
2134 if (!in_dev)
2135 return -EINVAL;
2137 if (res->type == RTN_BROADCAST) {
2138 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2139 if (res->fi) {
2140 fib_info_put(res->fi);
2141 res->fi = NULL;
2143 } else if (res->type == RTN_MULTICAST) {
2144 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2145 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2146 oldflp->proto))
2147 flags &= ~RTCF_LOCAL;
2148 /* If a multicast route does not exist, use the
2149 default one, but do not use a gateway in this case.
2150 Yes, it is a hack.
2152 if (res->fi && res->prefixlen < 4) {
2153 fib_info_put(res->fi);
2154 res->fi = NULL;
2159 rth = dst_alloc(&ipv4_dst_ops);
2160 if (!rth) {
2161 err = -ENOBUFS;
2162 goto cleanup;
2165 rth->u.dst.flags= DST_HOST;
2166 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2167 if (res->fi) {
2168 rth->rt_multipath_alg = res->fi->fib_mp_alg;
2169 if (res->fi->fib_nhs > 1)
2170 rth->u.dst.flags |= DST_BALANCED;
2172 #endif
2173 if (in_dev->cnf.no_xfrm)
2174 rth->u.dst.flags |= DST_NOXFRM;
2175 if (in_dev->cnf.no_policy)
2176 rth->u.dst.flags |= DST_NOPOLICY;
2178 rth->fl.fl4_dst = oldflp->fl4_dst;
2179 rth->fl.fl4_tos = tos;
2180 rth->fl.fl4_src = oldflp->fl4_src;
2181 rth->fl.oif = oldflp->oif;
2182 #ifdef CONFIG_IP_ROUTE_FWMARK
2183 rth->fl.fl4_fwmark = oldflp->fl4_fwmark;
2184 #endif
2185 rth->rt_dst = fl->fl4_dst;
2186 rth->rt_src = fl->fl4_src;
2187 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2188 /* get references to the devices that are to be held by the routing
2189 cache entry */
2190 rth->u.dst.dev = dev_out;
2191 dev_hold(dev_out);
2192 rth->idev = in_dev_get(dev_out);
2193 rth->rt_gateway = fl->fl4_dst;
2194 rth->rt_spec_dst = fl->fl4_src;
2196 rth->u.dst.output = ip_output;
2198 RT_CACHE_STAT_INC(out_slow_tot);
2200 if (flags & RTCF_LOCAL) {
2201 rth->u.dst.input = ip_local_deliver;
2202 rth->rt_spec_dst = fl->fl4_dst;
2204 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2205 rth->rt_spec_dst = fl->fl4_src;
2206 if (flags & RTCF_LOCAL &&
2207 !(dev_out->flags & IFF_LOOPBACK)) {
2208 rth->u.dst.output = ip_mc_output;
2209 RT_CACHE_STAT_INC(out_slow_mc);
2211 #ifdef CONFIG_IP_MROUTE
2212 if (res->type == RTN_MULTICAST) {
2213 if (IN_DEV_MFORWARD(in_dev) &&
2214 !LOCAL_MCAST(oldflp->fl4_dst)) {
2215 rth->u.dst.input = ip_mr_input;
2216 rth->u.dst.output = ip_mc_output;
2219 #endif
2222 rt_set_nexthop(rth, res, 0);
2224 rth->rt_flags = flags;
2226 *result = rth;
2227 cleanup:
2228 /* release work reference to inet device */
2229 in_dev_put(in_dev);
2231 return err;
2234 static inline int ip_mkroute_output_def(struct rtable **rp,
2235 struct fib_result* res,
2236 const struct flowi *fl,
2237 const struct flowi *oldflp,
2238 struct net_device *dev_out,
2239 unsigned flags)
2241 struct rtable *rth;
2242 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2243 unsigned hash;
2244 if (err == 0) {
2245 u32 tos = RT_FL_TOS(oldflp);
2247 atomic_set(&rth->u.dst.__refcnt, 1);
2249 hash = rt_hash_code(oldflp->fl4_dst,
2250 oldflp->fl4_src ^ (oldflp->oif << 5), tos);
2251 err = rt_intern_hash(hash, rth, rp);
2254 return err;
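/* ip_mkroute_output(): with CONFIG_IP_ROUTE_MULTIPATH_CACHED a route is
 * built and interned for every next hop of a multipath fib_info, and each
 * alternative is registered with the multipath algorithm via
 * multipath_set_nhinfo(); otherwise this reduces to
 * ip_mkroute_output_def(), which interns the single route it builds. */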
2257 static inline int ip_mkroute_output(struct rtable** rp,
2258 struct fib_result* res,
2259 const struct flowi *fl,
2260 const struct flowi *oldflp,
2261 struct net_device *dev_out,
2262 unsigned flags)
2264 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2265 u32 tos = RT_FL_TOS(oldflp);
2266 unsigned char hop;
2267 unsigned hash;
2268 int err = -EINVAL;
2269 struct rtable *rth;
2271 if (res->fi && res->fi->fib_nhs > 1) {
2272 unsigned char hopcount = res->fi->fib_nhs;
2274 for (hop = 0; hop < hopcount; hop++) {
2275 struct net_device *dev2nexthop;
2277 res->nh_sel = hop;
2279 /* hold a work reference to the output device */
2280 dev2nexthop = FIB_RES_DEV(*res);
2281 dev_hold(dev2nexthop);
2283 err = __mkroute_output(&rth, res, fl, oldflp,
2284 dev2nexthop, flags);
2286 if (err != 0)
2287 goto cleanup;
2289 hash = rt_hash_code(oldflp->fl4_dst,
2290 oldflp->fl4_src ^
2291 (oldflp->oif << 5), tos);
2292 err = rt_intern_hash(hash, rth, rp);
2294 /* forward hop information to multipath impl. */
2295 multipath_set_nhinfo(rth,
2296 FIB_RES_NETWORK(*res),
2297 FIB_RES_NETMASK(*res),
2298 res->prefixlen,
2299 &FIB_RES_NH(*res));
2300 cleanup:
2301 /* release work reference to output device */
2302 dev_put(dev2nexthop);
2304 if (err != 0)
2305 return err;
2307 atomic_set(&(*rp)->u.dst.__refcnt, 1);
2308 return err;
2309 } else {
2310 return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
2311 flags);
2313 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
2314 return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
2315 #endif
2318 /*
2319 * Major route resolver routine.
2320 */
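/* Resolution proceeds roughly as follows: validate any requested source
 * address, honour an explicit output interface, fall back to a loopback
 * local route for empty keys, consult the FIB (fib_lookup), select among
 * multipath next hops or the default route, pick a preferred source
 * address, and finally hand off to ip_mkroute_output() to build and hash
 * the cache entry. */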
2322 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2324 u32 tos = RT_FL_TOS(oldflp);
2325 struct flowi fl = { .nl_u = { .ip4_u =
2326 { .daddr = oldflp->fl4_dst,
2327 .saddr = oldflp->fl4_src,
2328 .tos = tos & IPTOS_RT_MASK,
2329 .scope = ((tos & RTO_ONLINK) ?
2330 RT_SCOPE_LINK :
2331 RT_SCOPE_UNIVERSE),
2332 #ifdef CONFIG_IP_ROUTE_FWMARK
2333 .fwmark = oldflp->fl4_fwmark
2334 #endif
2335 } },
2336 .iif = loopback_dev.ifindex,
2337 .oif = oldflp->oif };
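/* The FIB is searched with a cleaned-up copy of the caller's key: the tos
 * is masked to the routable bits, RTO_ONLINK narrows the scope to
 * RT_SCOPE_LINK, and iif is set to the loopback index to mark this as a
 * locally generated (output) lookup. */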
2338 struct fib_result res;
2339 unsigned flags = 0;
2340 struct net_device *dev_out = NULL;
2341 int free_res = 0;
2342 int err;
2345 res.fi = NULL;
2346 #ifdef CONFIG_IP_MULTIPLE_TABLES
2347 res.r = NULL;
2348 #endif
2350 if (oldflp->fl4_src) {
2351 err = -EINVAL;
2352 if (MULTICAST(oldflp->fl4_src) ||
2353 BADCLASS(oldflp->fl4_src) ||
2354 ZERONET(oldflp->fl4_src))
2355 goto out;
2357 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2358 dev_out = ip_dev_find(oldflp->fl4_src);
2359 if (dev_out == NULL)
2360 goto out;
2362 /* I removed the check for oif == dev_out->oif here.
2363 It was wrong for two reasons:
2364 1. ip_dev_find(saddr) can return the wrong iface if saddr is
2365 assigned to multiple interfaces.
2366 2. Moreover, we are allowed to send packets with the saddr
2367 of another iface. --ANK
2368 */
2370 if (oldflp->oif == 0
2371 && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
2372 /* Special hack: the user can direct multicasts
2373 and limited broadcast via the necessary interface
2374 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2375 This hack is not just for fun; it allows
2376 vic, vat and friends to work.
2377 They bind the socket to loopback, set the ttl to zero
2378 and expect that it will work.
2379 From the viewpoint of the routing cache they are broken,
2380 because we are not allowed to build a multicast path
2381 with a loopback source addr (look, the routing cache
2382 cannot know that the ttl is zero, so that the packet
2383 will not leave this host and the route is valid).
2384 Luckily, this hack is a good workaround.
2385 */
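/* For example, an illustrative userspace sketch of that pattern:
 *
 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *	unsigned char ttl = 0;
 *	setsockopt(fd, IPPROTO_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl));
 *	// bind() to 127.0.0.1, then sendto() a 224.0.0.0/4 group
 *
 * still has to reach local listeners, which is why fl.oif is forced to
 * the interface owning the bound source address here. */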
2387 fl.oif = dev_out->ifindex;
2388 goto make_route;
2390 if (dev_out)
2391 dev_put(dev_out);
2392 dev_out = NULL;
2396 if (oldflp->oif) {
2397 dev_out = dev_get_by_index(oldflp->oif);
2398 err = -ENODEV;
2399 if (dev_out == NULL)
2400 goto out;
2401 if (__in_dev_get(dev_out) == NULL) {
2402 dev_put(dev_out);
2403 goto out; /* Wrong error code */
2406 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
2407 if (!fl.fl4_src)
2408 fl.fl4_src = inet_select_addr(dev_out, 0,
2409 RT_SCOPE_LINK);
2410 goto make_route;
2412 if (!fl.fl4_src) {
2413 if (MULTICAST(oldflp->fl4_dst))
2414 fl.fl4_src = inet_select_addr(dev_out, 0,
2415 fl.fl4_scope);
2416 else if (!oldflp->fl4_dst)
2417 fl.fl4_src = inet_select_addr(dev_out, 0,
2418 RT_SCOPE_HOST);
2422 if (!fl.fl4_dst) {
2423 fl.fl4_dst = fl.fl4_src;
2424 if (!fl.fl4_dst)
2425 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2426 if (dev_out)
2427 dev_put(dev_out);
2428 dev_out = &loopback_dev;
2429 dev_hold(dev_out);
2430 fl.oif = loopback_dev.ifindex;
2431 res.type = RTN_LOCAL;
2432 flags |= RTCF_LOCAL;
2433 goto make_route;
2436 if (fib_lookup(&fl, &res)) {
2437 res.fi = NULL;
2438 if (oldflp->oif) {
2439 /* Apparently, the routing tables are wrong. Assume
2440 that the destination is on-link.
2442 WHY? DW.
2443 Because we are allowed to send to an iface
2444 even if it has NO routes and NO assigned
2445 addresses. When oif is specified, the routing
2446 tables are looked up with only one purpose:
2447 to catch whether the destination is gatewayed rather than
2448 direct. Moreover, if MSG_DONTROUTE is set,
2449 we send the packet, ignoring both the routing tables
2450 and the ifaddr state. --ANK
2453 We could do this even if oif is unknown,
2454 as IPv6 likely does, but we do not.
2455 */
2457 if (fl.fl4_src == 0)
2458 fl.fl4_src = inet_select_addr(dev_out, 0,
2459 RT_SCOPE_LINK);
2460 res.type = RTN_UNICAST;
2461 goto make_route;
2463 if (dev_out)
2464 dev_put(dev_out);
2465 err = -ENETUNREACH;
2466 goto out;
2468 free_res = 1;
2470 if (res.type == RTN_LOCAL) {
2471 if (!fl.fl4_src)
2472 fl.fl4_src = fl.fl4_dst;
2473 if (dev_out)
2474 dev_put(dev_out);
2475 dev_out = &loopback_dev;
2476 dev_hold(dev_out);
2477 fl.oif = dev_out->ifindex;
2478 if (res.fi)
2479 fib_info_put(res.fi);
2480 res.fi = NULL;
2481 flags |= RTCF_LOCAL;
2482 goto make_route;
2485 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2486 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2487 fib_select_multipath(&fl, &res);
2488 else
2489 #endif
2490 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2491 fib_select_default(&fl, &res);
2493 if (!fl.fl4_src)
2494 fl.fl4_src = FIB_RES_PREFSRC(res);
2496 if (dev_out)
2497 dev_put(dev_out);
2498 dev_out = FIB_RES_DEV(res);
2499 dev_hold(dev_out);
2500 fl.oif = dev_out->ifindex;
2503 make_route:
2504 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2507 if (free_res)
2508 fib_res_put(&res);
2509 if (dev_out)
2510 dev_put(dev_out);
2511 out: return err;
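/* __ip_route_output_key(): fast path for output routing.  The cache is
 * probed under rcu_read_lock_bh() using the same daddr/saddr/oif/tos
 * mixing that is used when entries are interned (see
 * ip_mkroute_output_def()); cached multipath routes are resolved via
 * multipath_select_route().  On a miss we fall back to
 * ip_route_output_slow(). */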
2514 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2516 unsigned hash;
2517 struct rtable *rth;
2519 hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);
2521 rcu_read_lock_bh();
2522 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2523 rth = rcu_dereference(rth->u.rt_next)) {
2524 if (rth->fl.fl4_dst == flp->fl4_dst &&
2525 rth->fl.fl4_src == flp->fl4_src &&
2526 rth->fl.iif == 0 &&
2527 rth->fl.oif == flp->oif &&
2528 #ifdef CONFIG_IP_ROUTE_FWMARK
2529 rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2530 #endif
2531 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2532 (IPTOS_RT_MASK | RTO_ONLINK))) {
2534 /* check for multipath routes and choose one if
2535 * necessary
2536 */
2537 if (multipath_select_route(flp, rth, rp)) {
2538 dst_hold(&(*rp)->u.dst);
2539 RT_CACHE_STAT_INC(out_hit);
2540 rcu_read_unlock_bh();
2541 return 0;
2544 rth->u.dst.lastuse = jiffies;
2545 dst_hold(&rth->u.dst);
2546 rth->u.dst.__use++;
2547 RT_CACHE_STAT_INC(out_hit);
2548 rcu_read_unlock_bh();
2549 *rp = rth;
2550 return 0;
2552 RT_CACHE_STAT_INC(out_hlist_search);
2554 rcu_read_unlock_bh();
2556 return ip_route_output_slow(rp, flp);
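/* ip_route_output_flow(): resolve the route, then, if a transport
 * protocol is given, fill in any still-unspecified addresses from the
 * result and let xfrm_lookup() replace the plain route with an IPsec
 * bundle when policy requires it. */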
2559 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2561 int err;
2563 if ((err = __ip_route_output_key(rp, flp)) != 0)
2564 return err;
2566 if (flp->proto) {
2567 if (!flp->fl4_src)
2568 flp->fl4_src = (*rp)->rt_src;
2569 if (!flp->fl4_dst)
2570 flp->fl4_dst = (*rp)->rt_dst;
2571 return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2574 return 0;
2577 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2579 return ip_route_output_flow(rp, flp, NULL, 0);
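/* rt_fill_info(): translate a cached rtable into an RTM_NEWROUTE netlink
 * message: the rtmsg header, RTA_DST/RTA_SRC/RTA_OIF/RTA_GATEWAY etc.
 * attributes, the metrics and an RTA_CACHEINFO block.  For multicast
 * input routes the answer may instead come from ipmr_get_route(). */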
2582 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2583 int nowait)
2585 struct rtable *rt = (struct rtable*)skb->dst;
2586 struct rtmsg *r;
2587 struct nlmsghdr *nlh;
2588 unsigned char *b = skb->tail;
2589 struct rta_cacheinfo ci;
2590 #ifdef CONFIG_IP_MROUTE
2591 struct rtattr *eptr;
2592 #endif
2593 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
2594 r = NLMSG_DATA(nlh);
2595 nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
2596 r->rtm_family = AF_INET;
2597 r->rtm_dst_len = 32;
2598 r->rtm_src_len = 0;
2599 r->rtm_tos = rt->fl.fl4_tos;
2600 r->rtm_table = RT_TABLE_MAIN;
2601 r->rtm_type = rt->rt_type;
2602 r->rtm_scope = RT_SCOPE_UNIVERSE;
2603 r->rtm_protocol = RTPROT_UNSPEC;
2604 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2605 if (rt->rt_flags & RTCF_NOTIFY)
2606 r->rtm_flags |= RTM_F_NOTIFY;
2607 RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2608 if (rt->fl.fl4_src) {
2609 r->rtm_src_len = 32;
2610 RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
2612 if (rt->u.dst.dev)
2613 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2614 #ifdef CONFIG_NET_CLS_ROUTE
2615 if (rt->u.dst.tclassid)
2616 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2617 #endif
2618 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2619 if (rt->rt_multipath_alg != IP_MP_ALG_NONE) {
2620 __u32 alg = rt->rt_multipath_alg;
2622 RTA_PUT(skb, RTA_MP_ALGO, 4, &alg);
2624 #endif
2625 if (rt->fl.iif)
2626 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2627 else if (rt->rt_src != rt->fl.fl4_src)
2628 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2629 if (rt->rt_dst != rt->rt_gateway)
2630 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2631 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2632 goto rtattr_failure;
2633 ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2634 ci.rta_used = rt->u.dst.__use;
2635 ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
2636 if (rt->u.dst.expires)
2637 ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2638 else
2639 ci.rta_expires = 0;
2640 ci.rta_error = rt->u.dst.error;
2641 ci.rta_id = ci.rta_ts = ci.rta_tsage = 0;
2642 if (rt->peer) {
2643 ci.rta_id = rt->peer->ip_id_count;
2644 if (rt->peer->tcp_ts_stamp) {
2645 ci.rta_ts = rt->peer->tcp_ts;
2646 ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2649 #ifdef CONFIG_IP_MROUTE
2650 eptr = (struct rtattr*)skb->tail;
2651 #endif
2652 RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2653 if (rt->fl.iif) {
2654 #ifdef CONFIG_IP_MROUTE
2655 u32 dst = rt->rt_dst;
2657 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2658 ipv4_devconf.mc_forwarding) {
2659 int err = ipmr_get_route(skb, r, nowait);
2660 if (err <= 0) {
2661 if (!nowait) {
2662 if (err == 0)
2663 return 0;
2664 goto nlmsg_failure;
2665 } else {
2666 if (err == -EMSGSIZE)
2667 goto nlmsg_failure;
2668 ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2671 } else
2672 #endif
2673 RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
2676 nlh->nlmsg_len = skb->tail - b;
2677 return skb->len;
2679 nlmsg_failure:
2680 rtattr_failure:
2681 skb_trim(skb, b - skb->data);
2682 return -1;
2685 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2687 struct rtattr **rta = arg;
2688 struct rtmsg *rtm = NLMSG_DATA(nlh);
2689 struct rtable *rt = NULL;
2690 u32 dst = 0;
2691 u32 src = 0;
2692 int iif = 0;
2693 int err = -ENOBUFS;
2694 struct sk_buff *skb;
2696 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2697 if (!skb)
2698 goto out;
2700 /* Reserve room for dummy headers; this skb can pass
2701 through a good chunk of the routing engine.
2702 */
2703 skb->mac.raw = skb->data;
2704 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2706 if (rta[RTA_SRC - 1])
2707 memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2708 if (rta[RTA_DST - 1])
2709 memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2710 if (rta[RTA_IIF - 1])
2711 memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2713 if (iif) {
2714 struct net_device *dev = __dev_get_by_index(iif);
2715 err = -ENODEV;
2716 if (!dev)
2717 goto out_free;
2718 skb->protocol = htons(ETH_P_IP);
2719 skb->dev = dev;
2720 local_bh_disable();
2721 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2722 local_bh_enable();
2723 rt = (struct rtable*)skb->dst;
2724 if (!err && rt->u.dst.error)
2725 err = -rt->u.dst.error;
2726 } else {
2727 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2728 .saddr = src,
2729 .tos = rtm->rtm_tos } } };
2730 int oif = 0;
2731 if (rta[RTA_OIF - 1])
2732 memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2733 fl.oif = oif;
2734 err = ip_route_output_key(&rt, &fl);
2736 if (err)
2737 goto out_free;
2739 skb->dst = &rt->u.dst;
2740 if (rtm->rtm_flags & RTM_F_NOTIFY)
2741 rt->rt_flags |= RTCF_NOTIFY;
2743 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2745 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2746 RTM_NEWROUTE, 0);
2747 if (!err)
2748 goto out_free;
2749 if (err < 0) {
2750 err = -EMSGSIZE;
2751 goto out_free;
2754 err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2755 if (err > 0)
2756 err = 0;
2757 out: return err;
2759 out_free:
2760 kfree_skb(skb);
2761 goto out;
2764 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2766 struct rtable *rt;
2767 int h, s_h;
2768 int idx, s_idx;
2770 s_h = cb->args[0];
2771 s_idx = idx = cb->args[1];
2772 for (h = 0; h <= rt_hash_mask; h++) {
2773 if (h < s_h) continue;
2774 if (h > s_h)
2775 s_idx = 0;
2776 rcu_read_lock_bh();
2777 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2778 rt = rcu_dereference(rt->u.rt_next), idx++) {
2779 if (idx < s_idx)
2780 continue;
2781 skb->dst = dst_clone(&rt->u.dst);
2782 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2783 cb->nlh->nlmsg_seq,
2784 RTM_NEWROUTE, 1) <= 0) {
2785 dst_release(xchg(&skb->dst, NULL));
2786 rcu_read_unlock_bh();
2787 goto done;
2789 dst_release(xchg(&skb->dst, NULL));
2791 rcu_read_unlock_bh();
2794 done:
2795 cb->args[0] = h;
2796 cb->args[1] = idx;
2797 return skb->len;
2800 void ip_rt_multicast_event(struct in_device *in_dev)
2802 rt_cache_flush(0);
2805 #ifdef CONFIG_SYSCTL
2806 static int flush_delay;
2808 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2809 struct file *filp, void __user *buffer,
2810 size_t *lenp, loff_t *ppos)
2812 if (write) {
2813 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2814 rt_cache_flush(flush_delay);
2815 return 0;
2818 return -EINVAL;
2821 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2822 int __user *name,
2823 int nlen,
2824 void __user *oldval,
2825 size_t __user *oldlenp,
2826 void __user *newval,
2827 size_t newlen,
2828 void **context)
2830 int delay;
2831 if (newlen != sizeof(int))
2832 return -EINVAL;
2833 if (get_user(delay, (int __user *)newval))
2834 return -EFAULT;
2835 rt_cache_flush(delay);
2836 return 0;
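/* The "flush" entry is effectively write-only: writing a value, e.g.
 *
 *	echo -1 > /proc/sys/net/ipv4/route/flush
 *
 * stores it in flush_delay and triggers rt_cache_flush() with that delay;
 * reading the entry returns -EINVAL. */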
2839 ctl_table ipv4_route_table[] = {
2841 .ctl_name = NET_IPV4_ROUTE_FLUSH,
2842 .procname = "flush",
2843 .data = &flush_delay,
2844 .maxlen = sizeof(int),
2845 .mode = 0644,
2846 .proc_handler = &ipv4_sysctl_rtcache_flush,
2847 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
2850 .ctl_name = NET_IPV4_ROUTE_MIN_DELAY,
2851 .procname = "min_delay",
2852 .data = &ip_rt_min_delay,
2853 .maxlen = sizeof(int),
2854 .mode = 0644,
2855 .proc_handler = &proc_dointvec_jiffies,
2856 .strategy = &sysctl_jiffies,
2859 .ctl_name = NET_IPV4_ROUTE_MAX_DELAY,
2860 .procname = "max_delay",
2861 .data = &ip_rt_max_delay,
2862 .maxlen = sizeof(int),
2863 .mode = 0644,
2864 .proc_handler = &proc_dointvec_jiffies,
2865 .strategy = &sysctl_jiffies,
2868 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2869 .procname = "gc_thresh",
2870 .data = &ipv4_dst_ops.gc_thresh,
2871 .maxlen = sizeof(int),
2872 .mode = 0644,
2873 .proc_handler = &proc_dointvec,
2876 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2877 .procname = "max_size",
2878 .data = &ip_rt_max_size,
2879 .maxlen = sizeof(int),
2880 .mode = 0644,
2881 .proc_handler = &proc_dointvec,
2884 /* Deprecated. Use gc_min_interval_ms */
2886 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2887 .procname = "gc_min_interval",
2888 .data = &ip_rt_gc_min_interval,
2889 .maxlen = sizeof(int),
2890 .mode = 0644,
2891 .proc_handler = &proc_dointvec_jiffies,
2892 .strategy = &sysctl_jiffies,
2895 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2896 .procname = "gc_min_interval_ms",
2897 .data = &ip_rt_gc_min_interval,
2898 .maxlen = sizeof(int),
2899 .mode = 0644,
2900 .proc_handler = &proc_dointvec_ms_jiffies,
2901 .strategy = &sysctl_ms_jiffies,
2904 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
2905 .procname = "gc_timeout",
2906 .data = &ip_rt_gc_timeout,
2907 .maxlen = sizeof(int),
2908 .mode = 0644,
2909 .proc_handler = &proc_dointvec_jiffies,
2910 .strategy = &sysctl_jiffies,
2913 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
2914 .procname = "gc_interval",
2915 .data = &ip_rt_gc_interval,
2916 .maxlen = sizeof(int),
2917 .mode = 0644,
2918 .proc_handler = &proc_dointvec_jiffies,
2919 .strategy = &sysctl_jiffies,
2922 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
2923 .procname = "redirect_load",
2924 .data = &ip_rt_redirect_load,
2925 .maxlen = sizeof(int),
2926 .mode = 0644,
2927 .proc_handler = &proc_dointvec,
2930 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2931 .procname = "redirect_number",
2932 .data = &ip_rt_redirect_number,
2933 .maxlen = sizeof(int),
2934 .mode = 0644,
2935 .proc_handler = &proc_dointvec,
2938 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2939 .procname = "redirect_silence",
2940 .data = &ip_rt_redirect_silence,
2941 .maxlen = sizeof(int),
2942 .mode = 0644,
2943 .proc_handler = &proc_dointvec,
2946 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
2947 .procname = "error_cost",
2948 .data = &ip_rt_error_cost,
2949 .maxlen = sizeof(int),
2950 .mode = 0644,
2951 .proc_handler = &proc_dointvec,
2954 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
2955 .procname = "error_burst",
2956 .data = &ip_rt_error_burst,
2957 .maxlen = sizeof(int),
2958 .mode = 0644,
2959 .proc_handler = &proc_dointvec,
2962 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
2963 .procname = "gc_elasticity",
2964 .data = &ip_rt_gc_elasticity,
2965 .maxlen = sizeof(int),
2966 .mode = 0644,
2967 .proc_handler = &proc_dointvec,
2970 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
2971 .procname = "mtu_expires",
2972 .data = &ip_rt_mtu_expires,
2973 .maxlen = sizeof(int),
2974 .mode = 0644,
2975 .proc_handler = &proc_dointvec_jiffies,
2976 .strategy = &sysctl_jiffies,
2979 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
2980 .procname = "min_pmtu",
2981 .data = &ip_rt_min_pmtu,
2982 .maxlen = sizeof(int),
2983 .mode = 0644,
2984 .proc_handler = &proc_dointvec,
2987 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
2988 .procname = "min_adv_mss",
2989 .data = &ip_rt_min_advmss,
2990 .maxlen = sizeof(int),
2991 .mode = 0644,
2992 .proc_handler = &proc_dointvec,
2995 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
2996 .procname = "secret_interval",
2997 .data = &ip_rt_secret_interval,
2998 .maxlen = sizeof(int),
2999 .mode = 0644,
3000 .proc_handler = &proc_dointvec_jiffies,
3001 .strategy = &sysctl_jiffies,
3003 { .ctl_name = 0 }
3005 #endif
3007 #ifdef CONFIG_NET_CLS_ROUTE
3008 struct ip_rt_acct *ip_rt_acct;
3010 /* This code sucks. But you should have seen it before! --RR */
3012 /* IP route accounting ptr for this logical cpu number. */
3013 #define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
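/* Each logical CPU gets a contiguous block of 256 struct ip_rt_acct
 * counters (one per routing realm); the /proc/net/rt_acct read handler
 * below copies CPU 0's block and adds the other CPUs' counters into it. */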
3015 #ifdef CONFIG_PROC_FS
3016 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3017 int length, int *eof, void *data)
3019 unsigned int i;
3021 if ((offset & 3) || (length & 3))
3022 return -EIO;
3024 if (offset >= sizeof(struct ip_rt_acct) * 256) {
3025 *eof = 1;
3026 return 0;
3029 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3030 length = sizeof(struct ip_rt_acct) * 256 - offset;
3031 *eof = 1;
3034 offset /= sizeof(u32);
3036 if (length > 0) {
3037 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3038 u32 *dst = (u32 *) buffer;
3040 /* Copy first cpu. */
3041 *start = buffer;
3042 memcpy(dst, src, length);
3044 /* Add the other cpus in, one int at a time */
3045 for_each_cpu(i) {
3046 unsigned int j;
3048 src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3050 for (j = 0; j < length/4; j++)
3051 dst[j] += src[j];
3054 return length;
3056 #endif /* CONFIG_PROC_FS */
3057 #endif /* CONFIG_NET_CLS_ROUTE */
3059 static __initdata unsigned long rhash_entries;
3060 static int __init set_rhash_entries(char *str)
3062 if (!str)
3063 return 0;
3064 rhash_entries = simple_strtoul(str, &str, 0);
3065 return 1;
3067 __setup("rhash_entries=", set_rhash_entries);
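/* The route cache hash size can be forced on the kernel command line,
 * e.g. rhash_entries=65536; otherwise ip_rt_init() below sizes it from
 * the amount of physical memory. */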
3069 int __init ip_rt_init(void)
3071 int i, order, goal, rc = 0;
3073 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3074 (jiffies ^ (jiffies >> 7)));
3076 #ifdef CONFIG_NET_CLS_ROUTE
3077 for (order = 0;
3078 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3079 /* NOTHING */;
3080 ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3081 if (!ip_rt_acct)
3082 panic("IP: failed to allocate ip_rt_acct\n");
3083 memset(ip_rt_acct, 0, PAGE_SIZE << order);
3084 #endif
3086 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
3087 sizeof(struct rtable),
3088 0, SLAB_HWCACHE_ALIGN,
3089 NULL, NULL);
3091 if (!ipv4_dst_ops.kmem_cachep)
3092 panic("IP: failed to allocate ip_dst_cache\n");
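/* Default goal: roughly one page of hash buckets per 64MB of physical
 * memory (num_physpages >> (26 - PAGE_SHIFT) pages).  rhash_entries=
 * overrides this; the allocation order is then chosen to cover the goal
 * and rt_hash_mask is rounded down to a power of two buckets. */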
3094 goal = num_physpages >> (26 - PAGE_SHIFT);
3095 if (rhash_entries)
3096 goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT;
3097 for (order = 0; (1UL << order) < goal; order++)
3098 /* NOTHING */;
3100 do {
3101 rt_hash_mask = (1UL << order) * PAGE_SIZE /
3102 sizeof(struct rt_hash_bucket);
3103 while (rt_hash_mask & (rt_hash_mask - 1))
3104 rt_hash_mask--;
3105 rt_hash_table = (struct rt_hash_bucket *)
3106 __get_free_pages(GFP_ATOMIC, order);
3107 } while (rt_hash_table == NULL && --order > 0);
3109 if (!rt_hash_table)
3110 panic("Failed to allocate IP route cache hash table\n");
3112 printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
3113 rt_hash_mask,
3114 (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
3116 for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
3117 /* NOTHING */;
3119 rt_hash_mask--;
3120 for (i = 0; i <= rt_hash_mask; i++) {
3121 spin_lock_init(&rt_hash_table[i].lock);
3122 rt_hash_table[i].chain = NULL;
3125 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3126 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3128 rt_cache_stat = alloc_percpu(struct rt_cache_stat);
3129 if (!rt_cache_stat)
3130 return -ENOMEM;
3132 devinet_init();
3133 ip_fib_init();
3135 init_timer(&rt_flush_timer);
3136 rt_flush_timer.function = rt_run_flush;
3137 init_timer(&rt_periodic_timer);
3138 rt_periodic_timer.function = rt_check_expire;
3139 init_timer(&rt_secret_timer);
3140 rt_secret_timer.function = rt_secret_rebuild;
3142 /* All the timers started at system startup tend
3143 to synchronize. Perturb it a bit.
3144 */
3145 rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3146 ip_rt_gc_interval;
3147 add_timer(&rt_periodic_timer);
3149 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3150 ip_rt_secret_interval;
3151 add_timer(&rt_secret_timer);
3153 #ifdef CONFIG_PROC_FS
3155 struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3156 if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3157 !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3158 proc_net_stat))) {
3159 free_percpu(rt_cache_stat);
3160 return -ENOMEM;
3162 rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3164 #ifdef CONFIG_NET_CLS_ROUTE
3165 create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3166 #endif
3167 #endif
3168 #ifdef CONFIG_XFRM
3169 xfrm_init();
3170 xfrm4_init();
3171 #endif
3172 return rc;
3175 EXPORT_SYMBOL(__ip_select_ident);
3176 EXPORT_SYMBOL(ip_route_input);
3177 EXPORT_SYMBOL(ip_route_output_key);