[PATCH] mark struct file_operations const 7
[linux-2.6/sactl.git] / net / ipv4 / route.c
blob9b5e56481d53d1929e8846cc1406faf60300a426
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
8 * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
16 * Fixes:
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
23 * Alan Cox : Super /proc >4K
24 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
26 * clamper.
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36 * Alan Cox : Aligned routing errors more closely with BSD
37 * our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
42 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Split to fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file
57 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
58 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
59 * Ilia Sotnikov : Removed TOS from hash calculations
61 * This program is free software; you can redistribute it and/or
62 * modify it under the terms of the GNU General Public License
63 * as published by the Free Software Foundation; either version
64 * 2 of the License, or (at your option) any later version.
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/sched.h>
74 #include <linux/mm.h>
75 #include <linux/bootmem.h>
76 #include <linux/string.h>
77 #include <linux/socket.h>
78 #include <linux/sockios.h>
79 #include <linux/errno.h>
80 #include <linux/in.h>
81 #include <linux/inet.h>
82 #include <linux/netdevice.h>
83 #include <linux/proc_fs.h>
84 #include <linux/init.h>
85 #include <linux/skbuff.h>
86 #include <linux/rtnetlink.h>
87 #include <linux/inetdevice.h>
88 #include <linux/igmp.h>
89 #include <linux/pkt_sched.h>
90 #include <linux/mroute.h>
91 #include <linux/netfilter_ipv4.h>
92 #include <linux/random.h>
93 #include <linux/jhash.h>
94 #include <linux/rcupdate.h>
95 #include <linux/times.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/ip_mp_alg.h>
107 #include <net/netevent.h>
108 #ifdef CONFIG_SYSCTL
109 #include <linux/sysctl.h>
110 #endif
/*
 * Extract the routing-relevant TOS bits (IPTOS_RT_MASK plus the RTO_ONLINK
 * flag) from the flow key that oldflp points to.
 *
 * The argument is parenthesized so that any pointer-valued expression
 * (for example "base + 1") can be passed without mis-expanding.
 */
#define RT_FL_TOS(oldflp) \
	((u32)((oldflp)->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
115 #define IP_MAX_MTU 0xFFF0
117 #define RT_GC_TIMEOUT (300*HZ)
119 static int ip_rt_min_delay = 2 * HZ;
120 static int ip_rt_max_delay = 10 * HZ;
121 static int ip_rt_max_size;
122 static int ip_rt_gc_timeout = RT_GC_TIMEOUT;
123 static int ip_rt_gc_interval = 60 * HZ;
124 static int ip_rt_gc_min_interval = HZ / 2;
125 static int ip_rt_redirect_number = 9;
126 static int ip_rt_redirect_load = HZ / 50;
127 static int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1));
128 static int ip_rt_error_cost = HZ;
129 static int ip_rt_error_burst = 5 * HZ;
130 static int ip_rt_gc_elasticity = 8;
131 static int ip_rt_mtu_expires = 10 * 60 * HZ;
132 static int ip_rt_min_pmtu = 512 + 20 + 20;
133 static int ip_rt_min_advmss = 256;
134 static int ip_rt_secret_interval = 10 * 60 * HZ;
135 static unsigned long rt_deadline;
137 #define RTprint(a...) printk(KERN_DEBUG a)
139 static struct timer_list rt_flush_timer;
140 static struct timer_list rt_periodic_timer;
141 static struct timer_list rt_secret_timer;
144 * Interface to generic destination cache.
147 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
148 static void ipv4_dst_destroy(struct dst_entry *dst);
149 static void ipv4_dst_ifdown(struct dst_entry *dst,
150 struct net_device *dev, int how);
151 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
152 static void ipv4_link_failure(struct sk_buff *skb);
153 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
154 static int rt_garbage_collect(void);
157 static struct dst_ops ipv4_dst_ops = {
158 .family = AF_INET,
159 .protocol = __constant_htons(ETH_P_IP),
160 .gc = rt_garbage_collect,
161 .check = ipv4_dst_check,
162 .destroy = ipv4_dst_destroy,
163 .ifdown = ipv4_dst_ifdown,
164 .negative_advice = ipv4_negative_advice,
165 .link_failure = ipv4_link_failure,
166 .update_pmtu = ip_rt_update_pmtu,
167 .entry_size = sizeof(struct rtable),
170 #define ECN_OR_COST(class) TC_PRIO_##class
172 __u8 ip_tos2prio[16] = {
173 TC_PRIO_BESTEFFORT,
174 ECN_OR_COST(FILLER),
175 TC_PRIO_BESTEFFORT,
176 ECN_OR_COST(BESTEFFORT),
177 TC_PRIO_BULK,
178 ECN_OR_COST(BULK),
179 TC_PRIO_BULK,
180 ECN_OR_COST(BULK),
181 TC_PRIO_INTERACTIVE,
182 ECN_OR_COST(INTERACTIVE),
183 TC_PRIO_INTERACTIVE,
184 ECN_OR_COST(INTERACTIVE),
185 TC_PRIO_INTERACTIVE_BULK,
186 ECN_OR_COST(INTERACTIVE_BULK),
187 TC_PRIO_INTERACTIVE_BULK,
188 ECN_OR_COST(INTERACTIVE_BULK)
193 * Route cache.
196 /* The locking scheme is rather straightforward:
198 * 1) Read-Copy Update protects the buckets of the central route hash.
199 * 2) Only writers remove entries, and they hold the lock
200 * as they look at rtable reference counts.
201 * 3) Only readers acquire references to rtable entries,
202 * they do so with atomic increments and with the
203 * lock held.
206 struct rt_hash_bucket {
207 struct rtable *chain;
209 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
210 defined(CONFIG_PROVE_LOCKING)
212 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
213 * The size of this table is a power of two and depends on the number of CPUS.
214 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
216 #ifdef CONFIG_LOCKDEP
217 # define RT_HASH_LOCK_SZ 256
218 #else
219 # if NR_CPUS >= 32
220 # define RT_HASH_LOCK_SZ 4096
221 # elif NR_CPUS >= 16
222 # define RT_HASH_LOCK_SZ 2048
223 # elif NR_CPUS >= 8
224 # define RT_HASH_LOCK_SZ 1024
225 # elif NR_CPUS >= 4
226 # define RT_HASH_LOCK_SZ 512
227 # else
228 # define RT_HASH_LOCK_SZ 256
229 # endif
230 #endif
232 static spinlock_t *rt_hash_locks;
233 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
234 # define rt_hash_lock_init() { \
235 int i; \
236 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
237 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
238 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
239 spin_lock_init(&rt_hash_locks[i]); \
241 #else
242 # define rt_hash_lock_addr(slot) NULL
243 # define rt_hash_lock_init()
244 #endif
246 static struct rt_hash_bucket *rt_hash_table;
247 static unsigned rt_hash_mask;
248 static int rt_hash_log;
249 static unsigned int rt_hash_rnd;
251 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
252 #define RT_CACHE_STAT_INC(field) \
253 (__raw_get_cpu_var(rt_cache_stat).field++)
255 static int rt_intern_hash(unsigned hash, struct rtable *rth,
256 struct rtable **res);
258 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
260 return (jhash_2words(daddr, saddr, rt_hash_rnd)
261 & rt_hash_mask);
264 #define rt_hash(daddr, saddr, idx) \
265 rt_hash_code((__force u32)(__be32)(daddr),\
266 (__force u32)(__be32)(saddr) ^ ((idx) << 5))
268 #ifdef CONFIG_PROC_FS
269 struct rt_cache_iter_state {
270 int bucket;
273 static struct rtable *rt_cache_get_first(struct seq_file *seq)
275 struct rtable *r = NULL;
276 struct rt_cache_iter_state *st = seq->private;
278 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
279 rcu_read_lock_bh();
280 r = rt_hash_table[st->bucket].chain;
281 if (r)
282 break;
283 rcu_read_unlock_bh();
285 return r;
288 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
290 struct rt_cache_iter_state *st = rcu_dereference(seq->private);
292 r = r->u.dst.rt_next;
293 while (!r) {
294 rcu_read_unlock_bh();
295 if (--st->bucket < 0)
296 break;
297 rcu_read_lock_bh();
298 r = rt_hash_table[st->bucket].chain;
300 return r;
303 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
305 struct rtable *r = rt_cache_get_first(seq);
307 if (r)
308 while (pos && (r = rt_cache_get_next(seq, r)))
309 --pos;
310 return pos ? NULL : r;
313 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
315 return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
318 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
320 struct rtable *r = NULL;
322 if (v == SEQ_START_TOKEN)
323 r = rt_cache_get_first(seq);
324 else
325 r = rt_cache_get_next(seq, v);
326 ++*pos;
327 return r;
330 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
332 if (v && v != SEQ_START_TOKEN)
333 rcu_read_unlock_bh();
336 static int rt_cache_seq_show(struct seq_file *seq, void *v)
338 if (v == SEQ_START_TOKEN)
339 seq_printf(seq, "%-127s\n",
340 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
341 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
342 "HHUptod\tSpecDst");
343 else {
344 struct rtable *r = v;
345 char temp[256];
347 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
348 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
349 r->u.dst.dev ? r->u.dst.dev->name : "*",
350 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
351 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
352 r->u.dst.__use, 0, (unsigned long)r->rt_src,
353 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
354 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
355 dst_metric(&r->u.dst, RTAX_WINDOW),
356 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
357 dst_metric(&r->u.dst, RTAX_RTTVAR)),
358 r->fl.fl4_tos,
359 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
360 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
361 dev_queue_xmit) : 0,
362 r->rt_spec_dst);
363 seq_printf(seq, "%-127s\n", temp);
365 return 0;
368 static struct seq_operations rt_cache_seq_ops = {
369 .start = rt_cache_seq_start,
370 .next = rt_cache_seq_next,
371 .stop = rt_cache_seq_stop,
372 .show = rt_cache_seq_show,
375 static int rt_cache_seq_open(struct inode *inode, struct file *file)
377 struct seq_file *seq;
378 int rc = -ENOMEM;
379 struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
381 if (!s)
382 goto out;
383 rc = seq_open(file, &rt_cache_seq_ops);
384 if (rc)
385 goto out_kfree;
386 seq = file->private_data;
387 seq->private = s;
388 memset(s, 0, sizeof(*s));
389 out:
390 return rc;
391 out_kfree:
392 kfree(s);
393 goto out;
396 static const struct file_operations rt_cache_seq_fops = {
397 .owner = THIS_MODULE,
398 .open = rt_cache_seq_open,
399 .read = seq_read,
400 .llseek = seq_lseek,
401 .release = seq_release_private,
405 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
407 int cpu;
409 if (*pos == 0)
410 return SEQ_START_TOKEN;
412 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
413 if (!cpu_possible(cpu))
414 continue;
415 *pos = cpu+1;
416 return &per_cpu(rt_cache_stat, cpu);
418 return NULL;
421 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
423 int cpu;
425 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
426 if (!cpu_possible(cpu))
427 continue;
428 *pos = cpu+1;
429 return &per_cpu(rt_cache_stat, cpu);
431 return NULL;
435 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
440 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
442 struct rt_cache_stat *st = v;
444 if (v == SEQ_START_TOKEN) {
445 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
446 return 0;
449 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
450 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
451 atomic_read(&ipv4_dst_ops.entries),
452 st->in_hit,
453 st->in_slow_tot,
454 st->in_slow_mc,
455 st->in_no_route,
456 st->in_brd,
457 st->in_martian_dst,
458 st->in_martian_src,
460 st->out_hit,
461 st->out_slow_tot,
462 st->out_slow_mc,
464 st->gc_total,
465 st->gc_ignored,
466 st->gc_goal_miss,
467 st->gc_dst_overflow,
468 st->in_hlist_search,
469 st->out_hlist_search
471 return 0;
474 static struct seq_operations rt_cpu_seq_ops = {
475 .start = rt_cpu_seq_start,
476 .next = rt_cpu_seq_next,
477 .stop = rt_cpu_seq_stop,
478 .show = rt_cpu_seq_show,
482 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
484 return seq_open(file, &rt_cpu_seq_ops);
487 static const struct file_operations rt_cpu_seq_fops = {
488 .owner = THIS_MODULE,
489 .open = rt_cpu_seq_open,
490 .read = seq_read,
491 .llseek = seq_lseek,
492 .release = seq_release,
495 #endif /* CONFIG_PROC_FS */
497 static __inline__ void rt_free(struct rtable *rt)
499 multipath_remove(rt);
500 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
503 static __inline__ void rt_drop(struct rtable *rt)
505 multipath_remove(rt);
506 ip_rt_put(rt);
507 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
510 static __inline__ int rt_fast_clean(struct rtable *rth)
512 /* Kill broadcast/multicast entries very aggresively, if they
513 collide in hash table with more useful entries */
514 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
515 rth->fl.iif && rth->u.dst.rt_next;
518 static __inline__ int rt_valuable(struct rtable *rth)
520 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
521 rth->u.dst.expires;
524 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
526 unsigned long age;
527 int ret = 0;
529 if (atomic_read(&rth->u.dst.__refcnt))
530 goto out;
532 ret = 1;
533 if (rth->u.dst.expires &&
534 time_after_eq(jiffies, rth->u.dst.expires))
535 goto out;
537 age = jiffies - rth->u.dst.lastuse;
538 ret = 0;
539 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
540 (age <= tmo2 && rt_valuable(rth)))
541 goto out;
542 ret = 1;
543 out: return ret;
546 /* Bits of score are:
547 * 31: very valuable
548 * 30: not quite useless
549 * 29..0: usage counter
551 static inline u32 rt_score(struct rtable *rt)
553 u32 score = jiffies - rt->u.dst.lastuse;
555 score = ~score & ~(3<<30);
557 if (rt_valuable(rt))
558 score |= (1<<31);
560 if (!rt->fl.iif ||
561 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
562 score |= (1<<30);
564 return score;
567 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
569 return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
570 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
571 (fl1->mark ^ fl2->mark) |
572 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
573 *(u16 *)&fl2->nl_u.ip4_u.tos) |
574 (fl1->oif ^ fl2->oif) |
575 (fl1->iif ^ fl2->iif)) == 0;
578 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
579 static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
580 struct rtable *expentry,
581 int *removed_count)
583 int passedexpired = 0;
584 struct rtable **nextstep = NULL;
585 struct rtable **rthp = chain_head;
586 struct rtable *rth;
588 if (removed_count)
589 *removed_count = 0;
591 while ((rth = *rthp) != NULL) {
592 if (rth == expentry)
593 passedexpired = 1;
595 if (((*rthp)->u.dst.flags & DST_BALANCED) != 0 &&
596 compare_keys(&(*rthp)->fl, &expentry->fl)) {
597 if (*rthp == expentry) {
598 *rthp = rth->u.dst.rt_next;
599 continue;
600 } else {
601 *rthp = rth->u.dst.rt_next;
602 rt_free(rth);
603 if (removed_count)
604 ++(*removed_count);
606 } else {
607 if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
608 passedexpired && !nextstep)
609 nextstep = &rth->u.dst.rt_next;
611 rthp = &rth->u.dst.rt_next;
615 rt_free(expentry);
616 if (removed_count)
617 ++(*removed_count);
619 return nextstep;
621 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
624 /* This runs via a timer and thus is always in BH context. */
625 static void rt_check_expire(unsigned long dummy)
627 static unsigned int rover;
628 unsigned int i = rover, goal;
629 struct rtable *rth, **rthp;
630 unsigned long now = jiffies;
631 u64 mult;
633 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
634 if (ip_rt_gc_timeout > 1)
635 do_div(mult, ip_rt_gc_timeout);
636 goal = (unsigned int)mult;
637 if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
638 for (; goal > 0; goal--) {
639 unsigned long tmo = ip_rt_gc_timeout;
641 i = (i + 1) & rt_hash_mask;
642 rthp = &rt_hash_table[i].chain;
644 if (*rthp == 0)
645 continue;
646 spin_lock(rt_hash_lock_addr(i));
647 while ((rth = *rthp) != NULL) {
648 if (rth->u.dst.expires) {
649 /* Entry is expired even if it is in use */
650 if (time_before_eq(now, rth->u.dst.expires)) {
651 tmo >>= 1;
652 rthp = &rth->u.dst.rt_next;
653 continue;
655 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
656 tmo >>= 1;
657 rthp = &rth->u.dst.rt_next;
658 continue;
661 /* Cleanup aged off entries. */
662 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
663 /* remove all related balanced entries if necessary */
664 if (rth->u.dst.flags & DST_BALANCED) {
665 rthp = rt_remove_balanced_route(
666 &rt_hash_table[i].chain,
667 rth, NULL);
668 if (!rthp)
669 break;
670 } else {
671 *rthp = rth->u.dst.rt_next;
672 rt_free(rth);
674 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
675 *rthp = rth->u.dst.rt_next;
676 rt_free(rth);
677 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
679 spin_unlock(rt_hash_lock_addr(i));
681 /* Fallback loop breaker. */
682 if (time_after(jiffies, now))
683 break;
685 rover = i;
686 mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
689 /* This can run from both BH and non-BH contexts, the latter
690 * in the case of a forced flush event.
692 static void rt_run_flush(unsigned long dummy)
694 int i;
695 struct rtable *rth, *next;
697 rt_deadline = 0;
699 get_random_bytes(&rt_hash_rnd, 4);
701 for (i = rt_hash_mask; i >= 0; i--) {
702 spin_lock_bh(rt_hash_lock_addr(i));
703 rth = rt_hash_table[i].chain;
704 if (rth)
705 rt_hash_table[i].chain = NULL;
706 spin_unlock_bh(rt_hash_lock_addr(i));
708 for (; rth; rth = next) {
709 next = rth->u.dst.rt_next;
710 rt_free(rth);
715 static DEFINE_SPINLOCK(rt_flush_lock);
717 void rt_cache_flush(int delay)
719 unsigned long now = jiffies;
720 int user_mode = !in_softirq();
722 if (delay < 0)
723 delay = ip_rt_min_delay;
725 /* flush existing multipath state*/
726 multipath_flush();
728 spin_lock_bh(&rt_flush_lock);
730 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
731 long tmo = (long)(rt_deadline - now);
733 /* If flush timer is already running
734 and flush request is not immediate (delay > 0):
736 if deadline is not achieved, prolongate timer to "delay",
737 otherwise fire it at deadline time.
740 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
741 tmo = 0;
743 if (delay > tmo)
744 delay = tmo;
747 if (delay <= 0) {
748 spin_unlock_bh(&rt_flush_lock);
749 rt_run_flush(0);
750 return;
753 if (rt_deadline == 0)
754 rt_deadline = now + ip_rt_max_delay;
756 mod_timer(&rt_flush_timer, now+delay);
757 spin_unlock_bh(&rt_flush_lock);
760 static void rt_secret_rebuild(unsigned long dummy)
762 unsigned long now = jiffies;
764 rt_cache_flush(0);
765 mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
769 Short description of GC goals.
771 We want to build an algorithm that keeps the routing cache
772 at some equilibrium point, where the number of aged-off entries
773 is kept approximately equal to the number of newly generated ones.
775 The current expiration strength is the variable "expire".
776 We try to adjust it dynamically, so that when networking
777 is idle "expire" is large enough to keep enough warm entries,
778 and when load increases it is reduced to limit the cache size.
781 static int rt_garbage_collect(void)
783 static unsigned long expire = RT_GC_TIMEOUT;
784 static unsigned long last_gc;
785 static int rover;
786 static int equilibrium;
787 struct rtable *rth, **rthp;
788 unsigned long now = jiffies;
789 int goal;
792 * Garbage collection is pretty expensive,
793 * do not make it too frequently.
796 RT_CACHE_STAT_INC(gc_total);
798 if (now - last_gc < ip_rt_gc_min_interval &&
799 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
800 RT_CACHE_STAT_INC(gc_ignored);
801 goto out;
804 /* Calculate number of entries, which we want to expire now. */
805 goal = atomic_read(&ipv4_dst_ops.entries) -
806 (ip_rt_gc_elasticity << rt_hash_log);
807 if (goal <= 0) {
808 if (equilibrium < ipv4_dst_ops.gc_thresh)
809 equilibrium = ipv4_dst_ops.gc_thresh;
810 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
811 if (goal > 0) {
812 equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
813 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
815 } else {
816 /* We are in dangerous area. Try to reduce cache really
817 * aggressively.
819 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
820 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
823 if (now - last_gc >= ip_rt_gc_min_interval)
824 last_gc = now;
826 if (goal <= 0) {
827 equilibrium += goal;
828 goto work_done;
831 do {
832 int i, k;
834 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
835 unsigned long tmo = expire;
837 k = (k + 1) & rt_hash_mask;
838 rthp = &rt_hash_table[k].chain;
839 spin_lock_bh(rt_hash_lock_addr(k));
840 while ((rth = *rthp) != NULL) {
841 if (!rt_may_expire(rth, tmo, expire)) {
842 tmo >>= 1;
843 rthp = &rth->u.dst.rt_next;
844 continue;
846 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
847 /* remove all related balanced entries
848 * if necessary
850 if (rth->u.dst.flags & DST_BALANCED) {
851 int r;
853 rthp = rt_remove_balanced_route(
854 &rt_hash_table[k].chain,
855 rth,
856 &r);
857 goal -= r;
858 if (!rthp)
859 break;
860 } else {
861 *rthp = rth->u.dst.rt_next;
862 rt_free(rth);
863 goal--;
865 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
866 *rthp = rth->u.dst.rt_next;
867 rt_free(rth);
868 goal--;
869 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
871 spin_unlock_bh(rt_hash_lock_addr(k));
872 if (goal <= 0)
873 break;
875 rover = k;
877 if (goal <= 0)
878 goto work_done;
880 /* Goal is not achieved. We stop process if:
882 - if expire reduced to zero. Otherwise, expire is halved.
883 - if table is not full.
884 - if we are called from interrupt.
885 - jiffies check is just fallback/debug loop breaker.
886 We will not spin here for long time in any case.
889 RT_CACHE_STAT_INC(gc_goal_miss);
891 if (expire == 0)
892 break;
894 expire >>= 1;
895 #if RT_CACHE_DEBUG >= 2
896 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
897 atomic_read(&ipv4_dst_ops.entries), goal, i);
898 #endif
900 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
901 goto out;
902 } while (!in_softirq() && time_before_eq(jiffies, now));
904 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
905 goto out;
906 if (net_ratelimit())
907 printk(KERN_WARNING "dst cache overflow\n");
908 RT_CACHE_STAT_INC(gc_dst_overflow);
909 return 1;
911 work_done:
912 expire += ip_rt_gc_min_interval;
913 if (expire > ip_rt_gc_timeout ||
914 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
915 expire = ip_rt_gc_timeout;
916 #if RT_CACHE_DEBUG >= 2
917 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
918 atomic_read(&ipv4_dst_ops.entries), goal, rover);
919 #endif
920 out: return 0;
923 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
925 struct rtable *rth, **rthp;
926 unsigned long now;
927 struct rtable *cand, **candp;
928 u32 min_score;
929 int chain_length;
930 int attempts = !in_softirq();
932 restart:
933 chain_length = 0;
934 min_score = ~(u32)0;
935 cand = NULL;
936 candp = NULL;
937 now = jiffies;
939 rthp = &rt_hash_table[hash].chain;
941 spin_lock_bh(rt_hash_lock_addr(hash));
942 while ((rth = *rthp) != NULL) {
943 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
944 if (!(rth->u.dst.flags & DST_BALANCED) &&
945 compare_keys(&rth->fl, &rt->fl)) {
946 #else
947 if (compare_keys(&rth->fl, &rt->fl)) {
948 #endif
949 /* Put it first */
950 *rthp = rth->u.dst.rt_next;
952 * Since lookup is lockfree, the deletion
953 * must be visible to another weakly ordered CPU before
954 * the insertion at the start of the hash chain.
956 rcu_assign_pointer(rth->u.dst.rt_next,
957 rt_hash_table[hash].chain);
959 * Since lookup is lockfree, the update writes
960 * must be ordered for consistency on SMP.
962 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
964 rth->u.dst.__use++;
965 dst_hold(&rth->u.dst);
966 rth->u.dst.lastuse = now;
967 spin_unlock_bh(rt_hash_lock_addr(hash));
969 rt_drop(rt);
970 *rp = rth;
971 return 0;
974 if (!atomic_read(&rth->u.dst.__refcnt)) {
975 u32 score = rt_score(rth);
977 if (score <= min_score) {
978 cand = rth;
979 candp = rthp;
980 min_score = score;
984 chain_length++;
986 rthp = &rth->u.dst.rt_next;
989 if (cand) {
990 /* ip_rt_gc_elasticity used to be average length of chain
991 * length, when exceeded gc becomes really aggressive.
993 * The second limit is less certain. At the moment it allows
994 * only 2 entries per bucket. We will see.
996 if (chain_length > ip_rt_gc_elasticity) {
997 *candp = cand->u.dst.rt_next;
998 rt_free(cand);
1002 /* Try to bind route to arp only if it is output
1003 route or unicast forwarding path.
1005 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1006 int err = arp_bind_neighbour(&rt->u.dst);
1007 if (err) {
1008 spin_unlock_bh(rt_hash_lock_addr(hash));
1010 if (err != -ENOBUFS) {
1011 rt_drop(rt);
1012 return err;
1015 /* Neighbour tables are full and nothing
1016 can be released. Try to shrink route cache,
1017 it is most likely it holds some neighbour records.
1019 if (attempts-- > 0) {
1020 int saved_elasticity = ip_rt_gc_elasticity;
1021 int saved_int = ip_rt_gc_min_interval;
1022 ip_rt_gc_elasticity = 1;
1023 ip_rt_gc_min_interval = 0;
1024 rt_garbage_collect();
1025 ip_rt_gc_min_interval = saved_int;
1026 ip_rt_gc_elasticity = saved_elasticity;
1027 goto restart;
1030 if (net_ratelimit())
1031 printk(KERN_WARNING "Neighbour table overflow.\n");
1032 rt_drop(rt);
1033 return -ENOBUFS;
1037 rt->u.dst.rt_next = rt_hash_table[hash].chain;
1038 #if RT_CACHE_DEBUG >= 2
1039 if (rt->u.dst.rt_next) {
1040 struct rtable *trt;
1041 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1042 NIPQUAD(rt->rt_dst));
1043 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1044 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1045 printk("\n");
1047 #endif
1048 rt_hash_table[hash].chain = rt;
1049 spin_unlock_bh(rt_hash_lock_addr(hash));
1050 *rp = rt;
1051 return 0;
1054 void rt_bind_peer(struct rtable *rt, int create)
1056 static DEFINE_SPINLOCK(rt_peer_lock);
1057 struct inet_peer *peer;
1059 peer = inet_getpeer(rt->rt_dst, create);
1061 spin_lock_bh(&rt_peer_lock);
1062 if (rt->peer == NULL) {
1063 rt->peer = peer;
1064 peer = NULL;
1066 spin_unlock_bh(&rt_peer_lock);
1067 if (peer)
1068 inet_putpeer(peer);
1072 * Peer allocation may fail only in serious out-of-memory conditions. However
1073 * we still can generate some output.
1074 * Random ID selection looks a bit dangerous because we have no chances to
1075 * select ID being unique in a reasonable period of time.
1076 * But broken packet identifier may be better than no packet at all.
1078 static void ip_select_fb_ident(struct iphdr *iph)
1080 static DEFINE_SPINLOCK(ip_fb_id_lock);
1081 static u32 ip_fallback_id;
1082 u32 salt;
1084 spin_lock_bh(&ip_fb_id_lock);
1085 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1086 iph->id = htons(salt & 0xFFFF);
1087 ip_fallback_id = salt;
1088 spin_unlock_bh(&ip_fb_id_lock);
1091 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1093 struct rtable *rt = (struct rtable *) dst;
1095 if (rt) {
1096 if (rt->peer == NULL)
1097 rt_bind_peer(rt, 1);
1099 /* If peer is attached to destination, it is never detached,
1100 so that we need not to grab a lock to dereference it.
1102 if (rt->peer) {
1103 iph->id = htons(inet_getid(rt->peer, more));
1104 return;
1106 } else
1107 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1108 __builtin_return_address(0));
1110 ip_select_fb_ident(iph);
1113 static void rt_del(unsigned hash, struct rtable *rt)
1115 struct rtable **rthp;
1117 spin_lock_bh(rt_hash_lock_addr(hash));
1118 ip_rt_put(rt);
1119 for (rthp = &rt_hash_table[hash].chain; *rthp;
1120 rthp = &(*rthp)->u.dst.rt_next)
1121 if (*rthp == rt) {
1122 *rthp = rt->u.dst.rt_next;
1123 rt_free(rt);
1124 break;
1126 spin_unlock_bh(rt_hash_lock_addr(hash));
1129 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1130 __be32 saddr, struct net_device *dev)
1132 int i, k;
1133 struct in_device *in_dev = in_dev_get(dev);
1134 struct rtable *rth, **rthp;
1135 __be32 skeys[2] = { saddr, 0 };
1136 int ikeys[2] = { dev->ifindex, 0 };
1137 struct netevent_redirect netevent;
1139 if (!in_dev)
1140 return;
1142 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1143 || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1144 goto reject_redirect;
1146 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1147 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1148 goto reject_redirect;
1149 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1150 goto reject_redirect;
1151 } else {
1152 if (inet_addr_type(new_gw) != RTN_UNICAST)
1153 goto reject_redirect;
1156 for (i = 0; i < 2; i++) {
1157 for (k = 0; k < 2; k++) {
1158 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1160 rthp=&rt_hash_table[hash].chain;
1162 rcu_read_lock();
1163 while ((rth = rcu_dereference(*rthp)) != NULL) {
1164 struct rtable *rt;
1166 if (rth->fl.fl4_dst != daddr ||
1167 rth->fl.fl4_src != skeys[i] ||
1168 rth->fl.oif != ikeys[k] ||
1169 rth->fl.iif != 0) {
1170 rthp = &rth->u.dst.rt_next;
1171 continue;
1174 if (rth->rt_dst != daddr ||
1175 rth->rt_src != saddr ||
1176 rth->u.dst.error ||
1177 rth->rt_gateway != old_gw ||
1178 rth->u.dst.dev != dev)
1179 break;
1181 dst_hold(&rth->u.dst);
1182 rcu_read_unlock();
1184 rt = dst_alloc(&ipv4_dst_ops);
1185 if (rt == NULL) {
1186 ip_rt_put(rth);
1187 in_dev_put(in_dev);
1188 return;
1191 /* Copy all the information. */
1192 *rt = *rth;
1193 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1194 rt->u.dst.__use = 1;
1195 atomic_set(&rt->u.dst.__refcnt, 1);
1196 rt->u.dst.child = NULL;
1197 if (rt->u.dst.dev)
1198 dev_hold(rt->u.dst.dev);
1199 if (rt->idev)
1200 in_dev_hold(rt->idev);
1201 rt->u.dst.obsolete = 0;
1202 rt->u.dst.lastuse = jiffies;
1203 rt->u.dst.path = &rt->u.dst;
1204 rt->u.dst.neighbour = NULL;
1205 rt->u.dst.hh = NULL;
1206 rt->u.dst.xfrm = NULL;
1208 rt->rt_flags |= RTCF_REDIRECTED;
1210 /* Gateway is different ... */
1211 rt->rt_gateway = new_gw;
1213 /* Redirect received -> path was valid */
1214 dst_confirm(&rth->u.dst);
1216 if (rt->peer)
1217 atomic_inc(&rt->peer->refcnt);
1219 if (arp_bind_neighbour(&rt->u.dst) ||
1220 !(rt->u.dst.neighbour->nud_state &
1221 NUD_VALID)) {
1222 if (rt->u.dst.neighbour)
1223 neigh_event_send(rt->u.dst.neighbour, NULL);
1224 ip_rt_put(rth);
1225 rt_drop(rt);
1226 goto do_next;
1229 netevent.old = &rth->u.dst;
1230 netevent.new = &rt->u.dst;
1231 call_netevent_notifiers(NETEVENT_REDIRECT,
1232 &netevent);
1234 rt_del(hash, rth);
1235 if (!rt_intern_hash(hash, rt, &rt))
1236 ip_rt_put(rt);
1237 goto do_next;
1239 rcu_read_unlock();
1240 do_next:
1244 in_dev_put(in_dev);
1245 return;
1247 reject_redirect:
1248 #ifdef CONFIG_IP_ROUTE_VERBOSE
1249 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1250 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1251 "%u.%u.%u.%u ignored.\n"
1252 " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1253 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1254 NIPQUAD(saddr), NIPQUAD(daddr));
1255 #endif
1256 in_dev_put(in_dev);
1259 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1261 struct rtable *rt = (struct rtable*)dst;
1262 struct dst_entry *ret = dst;
1264 if (rt) {
1265 if (dst->obsolete) {
1266 ip_rt_put(rt);
1267 ret = NULL;
1268 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1269 rt->u.dst.expires) {
1270 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1271 rt->fl.oif);
1272 #if RT_CACHE_DEBUG >= 1
1273 printk(KERN_DEBUG "ip_rt_advice: redirect to "
1274 "%u.%u.%u.%u/%02x dropped\n",
1275 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1276 #endif
1277 rt_del(hash, rt);
1278 ret = NULL;
1281 return ret;
1285 * Algorithm:
1286 * 1. The first ip_rt_redirect_number redirects are sent
1287 * with exponential backoff, then we stop sending them at all,
1288 * assuming that the host ignores our redirects.
1289 * 2. If we did not see packets requiring redirects
1290 * during ip_rt_redirect_silence, we assume that the host
1291 * forgot redirected route and start to send redirects again.
1293 * This algorithm is much cheaper and more intelligent than dumb load limiting
1294 * in icmp.c.
1296 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1297 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1300 void ip_rt_send_redirect(struct sk_buff *skb)
1302 struct rtable *rt = (struct rtable*)skb->dst;
1303 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1305 if (!in_dev)
1306 return;
1308 if (!IN_DEV_TX_REDIRECTS(in_dev))
1309 goto out;
1311 /* No redirected packets during ip_rt_redirect_silence;
1312 * reset the algorithm.
1314 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1315 rt->u.dst.rate_tokens = 0;
1317 /* Too many ignored redirects; do not send anything
1318 * set u.dst.rate_last to the last seen redirected packet.
1320 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1321 rt->u.dst.rate_last = jiffies;
1322 goto out;
1325 /* Check for load limit; set rate_last to the latest sent
1326 * redirect.
1328 if (rt->u.dst.rate_tokens == 0 ||
1329 time_after(jiffies,
1330 (rt->u.dst.rate_last +
1331 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1332 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1333 rt->u.dst.rate_last = jiffies;
1334 ++rt->u.dst.rate_tokens;
1335 #ifdef CONFIG_IP_ROUTE_VERBOSE
1336 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1337 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1338 net_ratelimit())
1339 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1340 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1341 NIPQUAD(rt->rt_src), rt->rt_iif,
1342 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1343 #endif
1345 out:
1346 in_dev_put(in_dev);
1349 static int ip_error(struct sk_buff *skb)
1351 struct rtable *rt = (struct rtable*)skb->dst;
1352 unsigned long now;
1353 int code;
1355 switch (rt->u.dst.error) {
1356 case EINVAL:
1357 default:
1358 goto out;
1359 case EHOSTUNREACH:
1360 code = ICMP_HOST_UNREACH;
1361 break;
1362 case ENETUNREACH:
1363 code = ICMP_NET_UNREACH;
1364 break;
1365 case EACCES:
1366 code = ICMP_PKT_FILTERED;
1367 break;
1370 now = jiffies;
1371 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1372 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1373 rt->u.dst.rate_tokens = ip_rt_error_burst;
1374 rt->u.dst.rate_last = now;
1375 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1376 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1377 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1380 out: kfree_skb(skb);
1381 return 0;
/*
 *	Candidate MTU values, in descending order, tried when the next-hop
 *	MTU has to be guessed (see guess_mtu()).
 *
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1392 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1394 int i;
1396 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1397 if (old_mtu > mtu_plateau[i])
1398 return mtu_plateau[i];
1399 return 68;
1402 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1404 int i;
1405 unsigned short old_mtu = ntohs(iph->tot_len);
1406 struct rtable *rth;
1407 __be32 skeys[2] = { iph->saddr, 0, };
1408 __be32 daddr = iph->daddr;
1409 unsigned short est_mtu = 0;
1411 if (ipv4_config.no_pmtu_disc)
1412 return 0;
1414 for (i = 0; i < 2; i++) {
1415 unsigned hash = rt_hash(daddr, skeys[i], 0);
1417 rcu_read_lock();
1418 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1419 rth = rcu_dereference(rth->u.dst.rt_next)) {
1420 if (rth->fl.fl4_dst == daddr &&
1421 rth->fl.fl4_src == skeys[i] &&
1422 rth->rt_dst == daddr &&
1423 rth->rt_src == iph->saddr &&
1424 rth->fl.iif == 0 &&
1425 !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1426 unsigned short mtu = new_mtu;
1428 if (new_mtu < 68 || new_mtu >= old_mtu) {
1430 /* BSD 4.2 compatibility hack :-( */
1431 if (mtu == 0 &&
1432 old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1433 old_mtu >= 68 + (iph->ihl << 2))
1434 old_mtu -= iph->ihl << 2;
1436 mtu = guess_mtu(old_mtu);
1438 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1439 if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1440 dst_confirm(&rth->u.dst);
1441 if (mtu < ip_rt_min_pmtu) {
1442 mtu = ip_rt_min_pmtu;
1443 rth->u.dst.metrics[RTAX_LOCK-1] |=
1444 (1 << RTAX_MTU);
1446 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1447 dst_set_expires(&rth->u.dst,
1448 ip_rt_mtu_expires);
1450 est_mtu = mtu;
1454 rcu_read_unlock();
1456 return est_mtu ? : new_mtu;
1459 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1461 if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1462 !(dst_metric_locked(dst, RTAX_MTU))) {
1463 if (mtu < ip_rt_min_pmtu) {
1464 mtu = ip_rt_min_pmtu;
1465 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1467 dst->metrics[RTAX_MTU-1] = mtu;
1468 dst_set_expires(dst, ip_rt_mtu_expires);
1469 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1473 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1475 return NULL;
1478 static void ipv4_dst_destroy(struct dst_entry *dst)
1480 struct rtable *rt = (struct rtable *) dst;
1481 struct inet_peer *peer = rt->peer;
1482 struct in_device *idev = rt->idev;
1484 if (peer) {
1485 rt->peer = NULL;
1486 inet_putpeer(peer);
1489 if (idev) {
1490 rt->idev = NULL;
1491 in_dev_put(idev);
1495 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1496 int how)
1498 struct rtable *rt = (struct rtable *) dst;
1499 struct in_device *idev = rt->idev;
1500 if (dev != &loopback_dev && idev && idev->dev == dev) {
1501 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1502 if (loopback_idev) {
1503 rt->idev = loopback_idev;
1504 in_dev_put(idev);
1509 static void ipv4_link_failure(struct sk_buff *skb)
1511 struct rtable *rt;
1513 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1515 rt = (struct rtable *) skb->dst;
1516 if (rt)
1517 dst_set_expires(&rt->u.dst, 0);
1520 static int ip_rt_bug(struct sk_buff *skb)
1522 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1523 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1524 skb->dev ? skb->dev->name : "?");
1525 kfree_skb(skb);
1526 return 0;
1530 We do not cache source address of outgoing interface,
1531 because it is used only by IP RR, TS and SRR options,
1532 so that it out of fast path.
1534 BTW remember: "addr" is allowed to be not aligned
1535 in IP options!
1538 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1540 __be32 src;
1541 struct fib_result res;
1543 if (rt->fl.iif == 0)
1544 src = rt->rt_src;
1545 else if (fib_lookup(&rt->fl, &res) == 0) {
1546 src = FIB_RES_PREFSRC(res);
1547 fib_res_put(&res);
1548 } else
1549 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1550 RT_SCOPE_UNIVERSE);
1551 memcpy(addr, &src, 4);
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * set_class_tag - fill in the unset halves of a route's tclassid.
 *
 * The low and high 16-bit halves are handled independently: a half is
 * taken from @tag only when it is currently zero in the route.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->u.dst.tclassid & 0xFFFF))
		rt->u.dst.tclassid |= tag & 0xFFFF;
	if (!(rt->u.dst.tclassid & 0xFFFF0000))
		rt->u.dst.tclassid |= tag & 0xFFFF0000;
}
#endif
1564 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1566 struct fib_info *fi = res->fi;
1568 if (fi) {
1569 if (FIB_RES_GW(*res) &&
1570 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1571 rt->rt_gateway = FIB_RES_GW(*res);
1572 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1573 sizeof(rt->u.dst.metrics));
1574 if (fi->fib_mtu == 0) {
1575 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1576 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1577 rt->rt_gateway != rt->rt_dst &&
1578 rt->u.dst.dev->mtu > 576)
1579 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1581 #ifdef CONFIG_NET_CLS_ROUTE
1582 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1583 #endif
1584 } else
1585 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1587 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1588 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1589 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1590 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1591 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1592 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1593 ip_rt_min_advmss);
1594 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1595 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1597 #ifdef CONFIG_NET_CLS_ROUTE
1598 #ifdef CONFIG_IP_MULTIPLE_TABLES
1599 set_class_tag(rt, fib_rules_tclass(res));
1600 #endif
1601 set_class_tag(rt, itag);
1602 #endif
1603 rt->rt_type = res->type;
1606 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1607 u8 tos, struct net_device *dev, int our)
1609 unsigned hash;
1610 struct rtable *rth;
1611 __be32 spec_dst;
1612 struct in_device *in_dev = in_dev_get(dev);
1613 u32 itag = 0;
1615 /* Primary sanity checks. */
1617 if (in_dev == NULL)
1618 return -EINVAL;
1620 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1621 skb->protocol != htons(ETH_P_IP))
1622 goto e_inval;
1624 if (ZERONET(saddr)) {
1625 if (!LOCAL_MCAST(daddr))
1626 goto e_inval;
1627 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1628 } else if (fib_validate_source(saddr, 0, tos, 0,
1629 dev, &spec_dst, &itag) < 0)
1630 goto e_inval;
1632 rth = dst_alloc(&ipv4_dst_ops);
1633 if (!rth)
1634 goto e_nobufs;
1636 rth->u.dst.output= ip_rt_bug;
1638 atomic_set(&rth->u.dst.__refcnt, 1);
1639 rth->u.dst.flags= DST_HOST;
1640 if (in_dev->cnf.no_policy)
1641 rth->u.dst.flags |= DST_NOPOLICY;
1642 rth->fl.fl4_dst = daddr;
1643 rth->rt_dst = daddr;
1644 rth->fl.fl4_tos = tos;
1645 rth->fl.mark = skb->mark;
1646 rth->fl.fl4_src = saddr;
1647 rth->rt_src = saddr;
1648 #ifdef CONFIG_NET_CLS_ROUTE
1649 rth->u.dst.tclassid = itag;
1650 #endif
1651 rth->rt_iif =
1652 rth->fl.iif = dev->ifindex;
1653 rth->u.dst.dev = &loopback_dev;
1654 dev_hold(rth->u.dst.dev);
1655 rth->idev = in_dev_get(rth->u.dst.dev);
1656 rth->fl.oif = 0;
1657 rth->rt_gateway = daddr;
1658 rth->rt_spec_dst= spec_dst;
1659 rth->rt_type = RTN_MULTICAST;
1660 rth->rt_flags = RTCF_MULTICAST;
1661 if (our) {
1662 rth->u.dst.input= ip_local_deliver;
1663 rth->rt_flags |= RTCF_LOCAL;
1666 #ifdef CONFIG_IP_MROUTE
1667 if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1668 rth->u.dst.input = ip_mr_input;
1669 #endif
1670 RT_CACHE_STAT_INC(in_slow_mc);
1672 in_dev_put(in_dev);
1673 hash = rt_hash(daddr, saddr, dev->ifindex);
1674 return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1676 e_nobufs:
1677 in_dev_put(in_dev);
1678 return -ENOBUFS;
1680 e_inval:
1681 in_dev_put(in_dev);
1682 return -EINVAL;
1686 static void ip_handle_martian_source(struct net_device *dev,
1687 struct in_device *in_dev,
1688 struct sk_buff *skb,
1689 __be32 daddr,
1690 __be32 saddr)
1692 RT_CACHE_STAT_INC(in_martian_src);
1693 #ifdef CONFIG_IP_ROUTE_VERBOSE
1694 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1696 * RFC1812 recommendation, if source is martian,
1697 * the only hint is MAC header.
1699 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1700 "%u.%u.%u.%u, on dev %s\n",
1701 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1702 if (dev->hard_header_len && skb->mac.raw) {
1703 int i;
1704 unsigned char *p = skb->mac.raw;
1705 printk(KERN_WARNING "ll header: ");
1706 for (i = 0; i < dev->hard_header_len; i++, p++) {
1707 printk("%02x", *p);
1708 if (i < (dev->hard_header_len - 1))
1709 printk(":");
1711 printk("\n");
1714 #endif
1717 static inline int __mkroute_input(struct sk_buff *skb,
1718 struct fib_result* res,
1719 struct in_device *in_dev,
1720 __be32 daddr, __be32 saddr, u32 tos,
1721 struct rtable **result)
1724 struct rtable *rth;
1725 int err;
1726 struct in_device *out_dev;
1727 unsigned flags = 0;
1728 __be32 spec_dst;
1729 u32 itag;
1731 /* get a working reference to the output device */
1732 out_dev = in_dev_get(FIB_RES_DEV(*res));
1733 if (out_dev == NULL) {
1734 if (net_ratelimit())
1735 printk(KERN_CRIT "Bug in ip_route_input" \
1736 "_slow(). Please, report\n");
1737 return -EINVAL;
1741 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1742 in_dev->dev, &spec_dst, &itag);
1743 if (err < 0) {
1744 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1745 saddr);
1747 err = -EINVAL;
1748 goto cleanup;
1751 if (err)
1752 flags |= RTCF_DIRECTSRC;
1754 if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1755 (IN_DEV_SHARED_MEDIA(out_dev) ||
1756 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1757 flags |= RTCF_DOREDIRECT;
1759 if (skb->protocol != htons(ETH_P_IP)) {
1760 /* Not IP (i.e. ARP). Do not create route, if it is
1761 * invalid for proxy arp. DNAT routes are always valid.
1763 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1764 err = -EINVAL;
1765 goto cleanup;
1770 rth = dst_alloc(&ipv4_dst_ops);
1771 if (!rth) {
1772 err = -ENOBUFS;
1773 goto cleanup;
1776 atomic_set(&rth->u.dst.__refcnt, 1);
1777 rth->u.dst.flags= DST_HOST;
1778 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1779 if (res->fi->fib_nhs > 1)
1780 rth->u.dst.flags |= DST_BALANCED;
1781 #endif
1782 if (in_dev->cnf.no_policy)
1783 rth->u.dst.flags |= DST_NOPOLICY;
1784 if (out_dev->cnf.no_xfrm)
1785 rth->u.dst.flags |= DST_NOXFRM;
1786 rth->fl.fl4_dst = daddr;
1787 rth->rt_dst = daddr;
1788 rth->fl.fl4_tos = tos;
1789 rth->fl.mark = skb->mark;
1790 rth->fl.fl4_src = saddr;
1791 rth->rt_src = saddr;
1792 rth->rt_gateway = daddr;
1793 rth->rt_iif =
1794 rth->fl.iif = in_dev->dev->ifindex;
1795 rth->u.dst.dev = (out_dev)->dev;
1796 dev_hold(rth->u.dst.dev);
1797 rth->idev = in_dev_get(rth->u.dst.dev);
1798 rth->fl.oif = 0;
1799 rth->rt_spec_dst= spec_dst;
1801 rth->u.dst.input = ip_forward;
1802 rth->u.dst.output = ip_output;
1804 rt_set_nexthop(rth, res, itag);
1806 rth->rt_flags = flags;
1808 *result = rth;
1809 err = 0;
1810 cleanup:
1811 /* release the working reference to the output device */
1812 in_dev_put(out_dev);
1813 return err;
1816 static inline int ip_mkroute_input_def(struct sk_buff *skb,
1817 struct fib_result* res,
1818 const struct flowi *fl,
1819 struct in_device *in_dev,
1820 __be32 daddr, __be32 saddr, u32 tos)
1822 struct rtable* rth = NULL;
1823 int err;
1824 unsigned hash;
1826 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1827 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1828 fib_select_multipath(fl, res);
1829 #endif
1831 /* create a routing cache entry */
1832 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1833 if (err)
1834 return err;
1836 /* put it into the cache */
1837 hash = rt_hash(daddr, saddr, fl->iif);
1838 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1841 static inline int ip_mkroute_input(struct sk_buff *skb,
1842 struct fib_result* res,
1843 const struct flowi *fl,
1844 struct in_device *in_dev,
1845 __be32 daddr, __be32 saddr, u32 tos)
1847 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1848 struct rtable* rth = NULL, *rtres;
1849 unsigned char hop, hopcount;
1850 int err = -EINVAL;
1851 unsigned int hash;
1853 if (res->fi)
1854 hopcount = res->fi->fib_nhs;
1855 else
1856 hopcount = 1;
1858 /* distinguish between multipath and singlepath */
1859 if (hopcount < 2)
1860 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1861 saddr, tos);
1863 /* add all alternatives to the routing cache */
1864 for (hop = 0; hop < hopcount; hop++) {
1865 res->nh_sel = hop;
1867 /* put reference to previous result */
1868 if (hop)
1869 ip_rt_put(rtres);
1871 /* create a routing cache entry */
1872 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1873 &rth);
1874 if (err)
1875 return err;
1877 /* put it into the cache */
1878 hash = rt_hash(daddr, saddr, fl->iif);
1879 err = rt_intern_hash(hash, rth, &rtres);
1880 if (err)
1881 return err;
1883 /* forward hop information to multipath impl. */
1884 multipath_set_nhinfo(rth,
1885 FIB_RES_NETWORK(*res),
1886 FIB_RES_NETMASK(*res),
1887 res->prefixlen,
1888 &FIB_RES_NH(*res));
1890 skb->dst = &rtres->u.dst;
1891 return err;
1892 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
1893 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1894 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
1899 * NOTE. We drop all the packets that has local source
1900 * addresses, because every properly looped back packet
1901 * must have correct destination already attached by output routine.
1903 * Such approach solves two big problems:
1904 * 1. Not simplex devices are handled properly.
1905 * 2. IP spoofing attempts are filtered with 100% of guarantee.
1908 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1909 u8 tos, struct net_device *dev)
1911 struct fib_result res;
1912 struct in_device *in_dev = in_dev_get(dev);
1913 struct flowi fl = { .nl_u = { .ip4_u =
1914 { .daddr = daddr,
1915 .saddr = saddr,
1916 .tos = tos,
1917 .scope = RT_SCOPE_UNIVERSE,
1918 } },
1919 .mark = skb->mark,
1920 .iif = dev->ifindex };
1921 unsigned flags = 0;
1922 u32 itag = 0;
1923 struct rtable * rth;
1924 unsigned hash;
1925 __be32 spec_dst;
1926 int err = -EINVAL;
1927 int free_res = 0;
1929 /* IP on this device is disabled. */
1931 if (!in_dev)
1932 goto out;
1934 /* Check for the most weird martians, which can be not detected
1935 by fib_lookup.
1938 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1939 goto martian_source;
1941 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1942 goto brd_input;
1944 /* Accept zero addresses only to limited broadcast;
1945 * I even do not know to fix it or not. Waiting for complains :-)
1947 if (ZERONET(saddr))
1948 goto martian_source;
1950 if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1951 goto martian_destination;
1954 * Now we are ready to route packet.
1956 if ((err = fib_lookup(&fl, &res)) != 0) {
1957 if (!IN_DEV_FORWARD(in_dev))
1958 goto e_hostunreach;
1959 goto no_route;
1961 free_res = 1;
1963 RT_CACHE_STAT_INC(in_slow_tot);
1965 if (res.type == RTN_BROADCAST)
1966 goto brd_input;
1968 if (res.type == RTN_LOCAL) {
1969 int result;
1970 result = fib_validate_source(saddr, daddr, tos,
1971 loopback_dev.ifindex,
1972 dev, &spec_dst, &itag);
1973 if (result < 0)
1974 goto martian_source;
1975 if (result)
1976 flags |= RTCF_DIRECTSRC;
1977 spec_dst = daddr;
1978 goto local_input;
1981 if (!IN_DEV_FORWARD(in_dev))
1982 goto e_hostunreach;
1983 if (res.type != RTN_UNICAST)
1984 goto martian_destination;
1986 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1987 if (err == -ENOBUFS)
1988 goto e_nobufs;
1989 if (err == -EINVAL)
1990 goto e_inval;
1992 done:
1993 in_dev_put(in_dev);
1994 if (free_res)
1995 fib_res_put(&res);
1996 out: return err;
1998 brd_input:
1999 if (skb->protocol != htons(ETH_P_IP))
2000 goto e_inval;
2002 if (ZERONET(saddr))
2003 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2004 else {
2005 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2006 &itag);
2007 if (err < 0)
2008 goto martian_source;
2009 if (err)
2010 flags |= RTCF_DIRECTSRC;
2012 flags |= RTCF_BROADCAST;
2013 res.type = RTN_BROADCAST;
2014 RT_CACHE_STAT_INC(in_brd);
2016 local_input:
2017 rth = dst_alloc(&ipv4_dst_ops);
2018 if (!rth)
2019 goto e_nobufs;
2021 rth->u.dst.output= ip_rt_bug;
2023 atomic_set(&rth->u.dst.__refcnt, 1);
2024 rth->u.dst.flags= DST_HOST;
2025 if (in_dev->cnf.no_policy)
2026 rth->u.dst.flags |= DST_NOPOLICY;
2027 rth->fl.fl4_dst = daddr;
2028 rth->rt_dst = daddr;
2029 rth->fl.fl4_tos = tos;
2030 rth->fl.mark = skb->mark;
2031 rth->fl.fl4_src = saddr;
2032 rth->rt_src = saddr;
2033 #ifdef CONFIG_NET_CLS_ROUTE
2034 rth->u.dst.tclassid = itag;
2035 #endif
2036 rth->rt_iif =
2037 rth->fl.iif = dev->ifindex;
2038 rth->u.dst.dev = &loopback_dev;
2039 dev_hold(rth->u.dst.dev);
2040 rth->idev = in_dev_get(rth->u.dst.dev);
2041 rth->rt_gateway = daddr;
2042 rth->rt_spec_dst= spec_dst;
2043 rth->u.dst.input= ip_local_deliver;
2044 rth->rt_flags = flags|RTCF_LOCAL;
2045 if (res.type == RTN_UNREACHABLE) {
2046 rth->u.dst.input= ip_error;
2047 rth->u.dst.error= -err;
2048 rth->rt_flags &= ~RTCF_LOCAL;
2050 rth->rt_type = res.type;
2051 hash = rt_hash(daddr, saddr, fl.iif);
2052 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2053 goto done;
2055 no_route:
2056 RT_CACHE_STAT_INC(in_no_route);
2057 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2058 res.type = RTN_UNREACHABLE;
2059 goto local_input;
2062 * Do not cache martian addresses: they should be logged (RFC1812)
2064 martian_destination:
2065 RT_CACHE_STAT_INC(in_martian_dst);
2066 #ifdef CONFIG_IP_ROUTE_VERBOSE
2067 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2068 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2069 "%u.%u.%u.%u, dev %s\n",
2070 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2071 #endif
2073 e_hostunreach:
2074 err = -EHOSTUNREACH;
2075 goto done;
2077 e_inval:
2078 err = -EINVAL;
2079 goto done;
2081 e_nobufs:
2082 err = -ENOBUFS;
2083 goto done;
2085 martian_source:
2086 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2087 goto e_inval;
2090 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2091 u8 tos, struct net_device *dev)
2093 struct rtable * rth;
2094 unsigned hash;
2095 int iif = dev->ifindex;
2097 tos &= IPTOS_RT_MASK;
2098 hash = rt_hash(daddr, saddr, iif);
2100 rcu_read_lock();
2101 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2102 rth = rcu_dereference(rth->u.dst.rt_next)) {
2103 if (rth->fl.fl4_dst == daddr &&
2104 rth->fl.fl4_src == saddr &&
2105 rth->fl.iif == iif &&
2106 rth->fl.oif == 0 &&
2107 rth->fl.mark == skb->mark &&
2108 rth->fl.fl4_tos == tos) {
2109 rth->u.dst.lastuse = jiffies;
2110 dst_hold(&rth->u.dst);
2111 rth->u.dst.__use++;
2112 RT_CACHE_STAT_INC(in_hit);
2113 rcu_read_unlock();
2114 skb->dst = (struct dst_entry*)rth;
2115 return 0;
2117 RT_CACHE_STAT_INC(in_hlist_search);
2119 rcu_read_unlock();
2121 /* Multicast recognition logic is moved from route cache to here.
2122 The problem was that too many Ethernet cards have broken/missing
2123 hardware multicast filters :-( As result the host on multicasting
2124 network acquires a lot of useless route cache entries, sort of
2125 SDR messages from all the world. Now we try to get rid of them.
2126 Really, provided software IP multicast filter is organized
2127 reasonably (at least, hashed), it does not result in a slowdown
2128 comparing with route cache reject entries.
2129 Note, that multicast routers are not affected, because
2130 route cache entry is created eventually.
2132 if (MULTICAST(daddr)) {
2133 struct in_device *in_dev;
2135 rcu_read_lock();
2136 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2137 int our = ip_check_mc(in_dev, daddr, saddr,
2138 skb->nh.iph->protocol);
2139 if (our
2140 #ifdef CONFIG_IP_MROUTE
2141 || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2142 #endif
2144 rcu_read_unlock();
2145 return ip_route_input_mc(skb, daddr, saddr,
2146 tos, dev, our);
2149 rcu_read_unlock();
2150 return -EINVAL;
2152 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2155 static inline int __mkroute_output(struct rtable **result,
2156 struct fib_result* res,
2157 const struct flowi *fl,
2158 const struct flowi *oldflp,
2159 struct net_device *dev_out,
2160 unsigned flags)
2162 struct rtable *rth;
2163 struct in_device *in_dev;
2164 u32 tos = RT_FL_TOS(oldflp);
2165 int err = 0;
2167 if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2168 return -EINVAL;
2170 if (fl->fl4_dst == htonl(0xFFFFFFFF))
2171 res->type = RTN_BROADCAST;
2172 else if (MULTICAST(fl->fl4_dst))
2173 res->type = RTN_MULTICAST;
2174 else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2175 return -EINVAL;
2177 if (dev_out->flags & IFF_LOOPBACK)
2178 flags |= RTCF_LOCAL;
2180 /* get work reference to inet device */
2181 in_dev = in_dev_get(dev_out);
2182 if (!in_dev)
2183 return -EINVAL;
2185 if (res->type == RTN_BROADCAST) {
2186 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2187 if (res->fi) {
2188 fib_info_put(res->fi);
2189 res->fi = NULL;
2191 } else if (res->type == RTN_MULTICAST) {
2192 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2193 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2194 oldflp->proto))
2195 flags &= ~RTCF_LOCAL;
2196 /* If multicast route do not exist use
2197 default one, but do not gateway in this case.
2198 Yes, it is hack.
2200 if (res->fi && res->prefixlen < 4) {
2201 fib_info_put(res->fi);
2202 res->fi = NULL;
2207 rth = dst_alloc(&ipv4_dst_ops);
2208 if (!rth) {
2209 err = -ENOBUFS;
2210 goto cleanup;
2213 atomic_set(&rth->u.dst.__refcnt, 1);
2214 rth->u.dst.flags= DST_HOST;
2215 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2216 if (res->fi) {
2217 rth->rt_multipath_alg = res->fi->fib_mp_alg;
2218 if (res->fi->fib_nhs > 1)
2219 rth->u.dst.flags |= DST_BALANCED;
2221 #endif
2222 if (in_dev->cnf.no_xfrm)
2223 rth->u.dst.flags |= DST_NOXFRM;
2224 if (in_dev->cnf.no_policy)
2225 rth->u.dst.flags |= DST_NOPOLICY;
2227 rth->fl.fl4_dst = oldflp->fl4_dst;
2228 rth->fl.fl4_tos = tos;
2229 rth->fl.fl4_src = oldflp->fl4_src;
2230 rth->fl.oif = oldflp->oif;
2231 rth->fl.mark = oldflp->mark;
2232 rth->rt_dst = fl->fl4_dst;
2233 rth->rt_src = fl->fl4_src;
2234 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2235 /* get references to the devices that are to be hold by the routing
2236 cache entry */
2237 rth->u.dst.dev = dev_out;
2238 dev_hold(dev_out);
2239 rth->idev = in_dev_get(dev_out);
2240 rth->rt_gateway = fl->fl4_dst;
2241 rth->rt_spec_dst= fl->fl4_src;
2243 rth->u.dst.output=ip_output;
2245 RT_CACHE_STAT_INC(out_slow_tot);
2247 if (flags & RTCF_LOCAL) {
2248 rth->u.dst.input = ip_local_deliver;
2249 rth->rt_spec_dst = fl->fl4_dst;
2251 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2252 rth->rt_spec_dst = fl->fl4_src;
2253 if (flags & RTCF_LOCAL &&
2254 !(dev_out->flags & IFF_LOOPBACK)) {
2255 rth->u.dst.output = ip_mc_output;
2256 RT_CACHE_STAT_INC(out_slow_mc);
2258 #ifdef CONFIG_IP_MROUTE
2259 if (res->type == RTN_MULTICAST) {
2260 if (IN_DEV_MFORWARD(in_dev) &&
2261 !LOCAL_MCAST(oldflp->fl4_dst)) {
2262 rth->u.dst.input = ip_mr_input;
2263 rth->u.dst.output = ip_mc_output;
2266 #endif
2269 rt_set_nexthop(rth, res, 0);
2271 rth->rt_flags = flags;
2273 *result = rth;
2274 cleanup:
2275 /* release work reference to inet device */
2276 in_dev_put(in_dev);
2278 return err;
2281 static inline int ip_mkroute_output_def(struct rtable **rp,
2282 struct fib_result* res,
2283 const struct flowi *fl,
2284 const struct flowi *oldflp,
2285 struct net_device *dev_out,
2286 unsigned flags)
2288 struct rtable *rth = NULL;
2289 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2290 unsigned hash;
2291 if (err == 0) {
2292 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2293 err = rt_intern_hash(hash, rth, rp);
2296 return err;
/*
 * ip_mkroute_output - create output cache entries for a resolved flow.
 *
 * Without CONFIG_IP_ROUTE_MULTIPATH_CACHED, or when the FIB result has a
 * single next hop, this is ip_mkroute_output_def().  Otherwise one entry
 * is created and cached for every next hop, all under the same hash key;
 * *@rp ends up referring to the entry for the last hop.
 *
 * Returns 0 on success or the first error from __mkroute_output() /
 * rt_intern_hash().
 */
static inline int ip_mkroute_output(struct rtable** rp,
				    struct fib_result* res,
				    const struct flowi *fl,
				    const struct flowi *oldflp,
				    struct net_device *dev_out,
				    unsigned flags)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	unsigned char hop;
	unsigned hash;
	int err = -EINVAL;
	struct rtable *rth = NULL;

	if (res->fi && res->fi->fib_nhs > 1) {
		unsigned char hopcount = res->fi->fib_nhs;

		for (hop = 0; hop < hopcount; hop++) {
			struct net_device *dev2nexthop;

			res->nh_sel = hop;

			/* hold a work reference to the output device */
			dev2nexthop = FIB_RES_DEV(*res);
			dev_hold(dev2nexthop);

			/* put reference to previous result */
			if (hop)
				ip_rt_put(*rp);

			err = __mkroute_output(&rth, res, fl, oldflp,
					       dev2nexthop, flags);

			if (err != 0)
				goto cleanup;

			hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src,
					oldflp->oif);
			err = rt_intern_hash(hash, rth, rp);

			/*
			 * Do not hand rth to the multipath code when it could
			 * not be cached; ip_mkroute_input() bails out at the
			 * same point.  NOTE(review): rt_intern_hash() is
			 * presumed to own rth once called - confirm.
			 */
			if (err != 0)
				goto cleanup;

			/* forward hop information to multipath impl. */
			multipath_set_nhinfo(rth,
					     FIB_RES_NETWORK(*res),
					     FIB_RES_NETMASK(*res),
					     res->prefixlen,
					     &FIB_RES_NH(*res));
		cleanup:
			/* release work reference to output device */
			dev_put(dev2nexthop);

			if (err != 0)
				return err;
		}
		return err;
	} else {
		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
					     flags);
	}
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
	return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
#endif
}
2362 * Major route resolver routine.
2365 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2367 u32 tos = RT_FL_TOS(oldflp);
2368 struct flowi fl = { .nl_u = { .ip4_u =
2369 { .daddr = oldflp->fl4_dst,
2370 .saddr = oldflp->fl4_src,
2371 .tos = tos & IPTOS_RT_MASK,
2372 .scope = ((tos & RTO_ONLINK) ?
2373 RT_SCOPE_LINK :
2374 RT_SCOPE_UNIVERSE),
2375 } },
2376 .mark = oldflp->mark,
2377 .iif = loopback_dev.ifindex,
2378 .oif = oldflp->oif };
2379 struct fib_result res;
2380 unsigned flags = 0;
2381 struct net_device *dev_out = NULL;
2382 int free_res = 0;
2383 int err;
2386 res.fi = NULL;
2387 #ifdef CONFIG_IP_MULTIPLE_TABLES
2388 res.r = NULL;
2389 #endif
2391 if (oldflp->fl4_src) {
2392 err = -EINVAL;
2393 if (MULTICAST(oldflp->fl4_src) ||
2394 BADCLASS(oldflp->fl4_src) ||
2395 ZERONET(oldflp->fl4_src))
2396 goto out;
2398 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2399 dev_out = ip_dev_find(oldflp->fl4_src);
2400 if (dev_out == NULL)
2401 goto out;
2403 /* I removed check for oif == dev_out->oif here.
2404 It was wrong for two reasons:
2405 1. ip_dev_find(saddr) can return wrong iface, if saddr is
2406 assigned to multiple interfaces.
2407 2. Moreover, we are allowed to send packets with saddr
2408 of another iface. --ANK
2411 if (oldflp->oif == 0
2412 && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2413 /* Special hack: user can direct multicasts
2414 and limited broadcast via necessary interface
2415 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2416 This hack is not just for fun, it allows
2417 vic,vat and friends to work.
2418 They bind socket to loopback, set ttl to zero
2419 and expect that it will work.
2420 From the viewpoint of routing cache they are broken,
2421 because we are not allowed to build multicast path
2422 with loopback source addr (look, routing cache
2423 cannot know, that ttl is zero, so that packet
2424 will not leave this host and route is valid).
2425 Luckily, this hack is good workaround.
2428 fl.oif = dev_out->ifindex;
2429 goto make_route;
2431 if (dev_out)
2432 dev_put(dev_out);
2433 dev_out = NULL;
2437 if (oldflp->oif) {
2438 dev_out = dev_get_by_index(oldflp->oif);
2439 err = -ENODEV;
2440 if (dev_out == NULL)
2441 goto out;
2443 /* RACE: Check return value of inet_select_addr instead. */
2444 if (__in_dev_get_rtnl(dev_out) == NULL) {
2445 dev_put(dev_out);
2446 goto out; /* Wrong error code */
2449 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2450 if (!fl.fl4_src)
2451 fl.fl4_src = inet_select_addr(dev_out, 0,
2452 RT_SCOPE_LINK);
2453 goto make_route;
2455 if (!fl.fl4_src) {
2456 if (MULTICAST(oldflp->fl4_dst))
2457 fl.fl4_src = inet_select_addr(dev_out, 0,
2458 fl.fl4_scope);
2459 else if (!oldflp->fl4_dst)
2460 fl.fl4_src = inet_select_addr(dev_out, 0,
2461 RT_SCOPE_HOST);
2465 if (!fl.fl4_dst) {
2466 fl.fl4_dst = fl.fl4_src;
2467 if (!fl.fl4_dst)
2468 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2469 if (dev_out)
2470 dev_put(dev_out);
2471 dev_out = &loopback_dev;
2472 dev_hold(dev_out);
2473 fl.oif = loopback_dev.ifindex;
2474 res.type = RTN_LOCAL;
2475 flags |= RTCF_LOCAL;
2476 goto make_route;
2479 if (fib_lookup(&fl, &res)) {
2480 res.fi = NULL;
2481 if (oldflp->oif) {
2482 /* Apparently, routing tables are wrong. Assume,
2483 that the destination is on link.
2485 WHY? DW.
2486 Because we are allowed to send to iface
2487 even if it has NO routes and NO assigned
2488 addresses. When oif is specified, routing
2489 tables are looked up with only one purpose:
2490 to catch if destination is gatewayed, rather than
2491 direct. Moreover, if MSG_DONTROUTE is set,
2492 we send packet, ignoring both routing tables
2493 and ifaddr state. --ANK
2496 We could make it even if oif is unknown,
2497 likely IPv6, but we do not.
2500 if (fl.fl4_src == 0)
2501 fl.fl4_src = inet_select_addr(dev_out, 0,
2502 RT_SCOPE_LINK);
2503 res.type = RTN_UNICAST;
2504 goto make_route;
2506 if (dev_out)
2507 dev_put(dev_out);
2508 err = -ENETUNREACH;
2509 goto out;
2511 free_res = 1;
2513 if (res.type == RTN_LOCAL) {
2514 if (!fl.fl4_src)
2515 fl.fl4_src = fl.fl4_dst;
2516 if (dev_out)
2517 dev_put(dev_out);
2518 dev_out = &loopback_dev;
2519 dev_hold(dev_out);
2520 fl.oif = dev_out->ifindex;
2521 if (res.fi)
2522 fib_info_put(res.fi);
2523 res.fi = NULL;
2524 flags |= RTCF_LOCAL;
2525 goto make_route;
2528 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2529 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2530 fib_select_multipath(&fl, &res);
2531 else
2532 #endif
2533 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2534 fib_select_default(&fl, &res);
2536 if (!fl.fl4_src)
2537 fl.fl4_src = FIB_RES_PREFSRC(res);
2539 if (dev_out)
2540 dev_put(dev_out);
2541 dev_out = FIB_RES_DEV(res);
2542 dev_hold(dev_out);
2543 fl.oif = dev_out->ifindex;
2546 make_route:
2547 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2550 if (free_res)
2551 fib_res_put(&res);
2552 if (dev_out)
2553 dev_put(dev_out);
2554 out: return err;
2557 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2559 unsigned hash;
2560 struct rtable *rth;
2562 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2564 rcu_read_lock_bh();
2565 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2566 rth = rcu_dereference(rth->u.dst.rt_next)) {
2567 if (rth->fl.fl4_dst == flp->fl4_dst &&
2568 rth->fl.fl4_src == flp->fl4_src &&
2569 rth->fl.iif == 0 &&
2570 rth->fl.oif == flp->oif &&
2571 rth->fl.mark == flp->mark &&
2572 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2573 (IPTOS_RT_MASK | RTO_ONLINK))) {
2575 /* check for multipath routes and choose one if
2576 * necessary
2578 if (multipath_select_route(flp, rth, rp)) {
2579 dst_hold(&(*rp)->u.dst);
2580 RT_CACHE_STAT_INC(out_hit);
2581 rcu_read_unlock_bh();
2582 return 0;
2585 rth->u.dst.lastuse = jiffies;
2586 dst_hold(&rth->u.dst);
2587 rth->u.dst.__use++;
2588 RT_CACHE_STAT_INC(out_hit);
2589 rcu_read_unlock_bh();
2590 *rp = rth;
2591 return 0;
2593 RT_CACHE_STAT_INC(out_hlist_search);
2595 rcu_read_unlock_bh();
2597 return ip_route_output_slow(rp, flp);
2600 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2602 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2604 int err;
2606 if ((err = __ip_route_output_key(rp, flp)) != 0)
2607 return err;
2609 if (flp->proto) {
2610 if (!flp->fl4_src)
2611 flp->fl4_src = (*rp)->rt_src;
2612 if (!flp->fl4_dst)
2613 flp->fl4_dst = (*rp)->rt_dst;
2614 return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2617 return 0;
2620 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2622 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2624 return ip_route_output_flow(rp, flp, NULL, 0);
2627 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2628 int nowait, unsigned int flags)
2630 struct rtable *rt = (struct rtable*)skb->dst;
2631 struct rtmsg *r;
2632 struct nlmsghdr *nlh;
2633 long expires;
2634 u32 id = 0, ts = 0, tsage = 0, error;
2636 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2637 if (nlh == NULL)
2638 return -EMSGSIZE;
2640 r = nlmsg_data(nlh);
2641 r->rtm_family = AF_INET;
2642 r->rtm_dst_len = 32;
2643 r->rtm_src_len = 0;
2644 r->rtm_tos = rt->fl.fl4_tos;
2645 r->rtm_table = RT_TABLE_MAIN;
2646 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2647 r->rtm_type = rt->rt_type;
2648 r->rtm_scope = RT_SCOPE_UNIVERSE;
2649 r->rtm_protocol = RTPROT_UNSPEC;
2650 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2651 if (rt->rt_flags & RTCF_NOTIFY)
2652 r->rtm_flags |= RTM_F_NOTIFY;
2654 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2656 if (rt->fl.fl4_src) {
2657 r->rtm_src_len = 32;
2658 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2660 if (rt->u.dst.dev)
2661 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2662 #ifdef CONFIG_NET_CLS_ROUTE
2663 if (rt->u.dst.tclassid)
2664 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2665 #endif
2666 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2667 if (rt->rt_multipath_alg != IP_MP_ALG_NONE)
2668 NLA_PUT_U32(skb, RTA_MP_ALGO, rt->rt_multipath_alg);
2669 #endif
2670 if (rt->fl.iif)
2671 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2672 else if (rt->rt_src != rt->fl.fl4_src)
2673 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2675 if (rt->rt_dst != rt->rt_gateway)
2676 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2678 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2679 goto nla_put_failure;
2681 error = rt->u.dst.error;
2682 expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2683 if (rt->peer) {
2684 id = rt->peer->ip_id_count;
2685 if (rt->peer->tcp_ts_stamp) {
2686 ts = rt->peer->tcp_ts;
2687 tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2691 if (rt->fl.iif) {
2692 #ifdef CONFIG_IP_MROUTE
2693 __be32 dst = rt->rt_dst;
2695 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2696 ipv4_devconf.mc_forwarding) {
2697 int err = ipmr_get_route(skb, r, nowait);
2698 if (err <= 0) {
2699 if (!nowait) {
2700 if (err == 0)
2701 return 0;
2702 goto nla_put_failure;
2703 } else {
2704 if (err == -EMSGSIZE)
2705 goto nla_put_failure;
2706 error = err;
2709 } else
2710 #endif
2711 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2714 if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2715 expires, error) < 0)
2716 goto nla_put_failure;
2718 return nlmsg_end(skb, nlh);
2720 nla_put_failure:
2721 nlmsg_cancel(skb, nlh);
2722 return -EMSGSIZE;
2725 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2727 struct rtmsg *rtm;
2728 struct nlattr *tb[RTA_MAX+1];
2729 struct rtable *rt = NULL;
2730 __be32 dst = 0;
2731 __be32 src = 0;
2732 u32 iif;
2733 int err;
2734 struct sk_buff *skb;
2736 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2737 if (err < 0)
2738 goto errout;
2740 rtm = nlmsg_data(nlh);
2742 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2743 if (skb == NULL) {
2744 err = -ENOBUFS;
2745 goto errout;
2748 /* Reserve room for dummy headers, this skb can pass
2749 through good chunk of routing engine.
2751 skb->mac.raw = skb->nh.raw = skb->data;
2753 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2754 skb->nh.iph->protocol = IPPROTO_ICMP;
2755 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2757 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2758 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2759 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2761 if (iif) {
2762 struct net_device *dev;
2764 dev = __dev_get_by_index(iif);
2765 if (dev == NULL) {
2766 err = -ENODEV;
2767 goto errout_free;
2770 skb->protocol = htons(ETH_P_IP);
2771 skb->dev = dev;
2772 local_bh_disable();
2773 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2774 local_bh_enable();
2776 rt = (struct rtable*) skb->dst;
2777 if (err == 0 && rt->u.dst.error)
2778 err = -rt->u.dst.error;
2779 } else {
2780 struct flowi fl = {
2781 .nl_u = {
2782 .ip4_u = {
2783 .daddr = dst,
2784 .saddr = src,
2785 .tos = rtm->rtm_tos,
2788 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2790 err = ip_route_output_key(&rt, &fl);
2793 if (err)
2794 goto errout_free;
2796 skb->dst = &rt->u.dst;
2797 if (rtm->rtm_flags & RTM_F_NOTIFY)
2798 rt->rt_flags |= RTCF_NOTIFY;
2800 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2801 RTM_NEWROUTE, 0, 0);
2802 if (err <= 0)
2803 goto errout_free;
2805 err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2806 errout:
2807 return err;
2809 errout_free:
2810 kfree_skb(skb);
2811 goto errout;
2814 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2816 struct rtable *rt;
2817 int h, s_h;
2818 int idx, s_idx;
2820 s_h = cb->args[0];
2821 s_idx = idx = cb->args[1];
2822 for (h = 0; h <= rt_hash_mask; h++) {
2823 if (h < s_h) continue;
2824 if (h > s_h)
2825 s_idx = 0;
2826 rcu_read_lock_bh();
2827 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2828 rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2829 if (idx < s_idx)
2830 continue;
2831 skb->dst = dst_clone(&rt->u.dst);
2832 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2833 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2834 1, NLM_F_MULTI) <= 0) {
2835 dst_release(xchg(&skb->dst, NULL));
2836 rcu_read_unlock_bh();
2837 goto done;
2839 dst_release(xchg(&skb->dst, NULL));
2841 rcu_read_unlock_bh();
2844 done:
2845 cb->args[0] = h;
2846 cb->args[1] = idx;
2847 return skb->len;
/*
 * Multicast event hook: flushes the route cache with a delay argument
 * of 0.  @in_dev is not used.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	const int delay = 0;

	rt_cache_flush(delay);
}
2855 #ifdef CONFIG_SYSCTL
2856 static int flush_delay;
2858 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2859 struct file *filp, void __user *buffer,
2860 size_t *lenp, loff_t *ppos)
2862 if (write) {
2863 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2864 rt_cache_flush(flush_delay);
2865 return 0;
2868 return -EINVAL;
2871 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2872 int __user *name,
2873 int nlen,
2874 void __user *oldval,
2875 size_t __user *oldlenp,
2876 void __user *newval,
2877 size_t newlen)
2879 int delay;
2880 if (newlen != sizeof(int))
2881 return -EINVAL;
2882 if (get_user(delay, (int __user *)newval))
2883 return -EFAULT;
2884 rt_cache_flush(delay);
2885 return 0;
2888 ctl_table ipv4_route_table[] = {
2890 .ctl_name = NET_IPV4_ROUTE_FLUSH,
2891 .procname = "flush",
2892 .data = &flush_delay,
2893 .maxlen = sizeof(int),
2894 .mode = 0200,
2895 .proc_handler = &ipv4_sysctl_rtcache_flush,
2896 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
2899 .ctl_name = NET_IPV4_ROUTE_MIN_DELAY,
2900 .procname = "min_delay",
2901 .data = &ip_rt_min_delay,
2902 .maxlen = sizeof(int),
2903 .mode = 0644,
2904 .proc_handler = &proc_dointvec_jiffies,
2905 .strategy = &sysctl_jiffies,
2908 .ctl_name = NET_IPV4_ROUTE_MAX_DELAY,
2909 .procname = "max_delay",
2910 .data = &ip_rt_max_delay,
2911 .maxlen = sizeof(int),
2912 .mode = 0644,
2913 .proc_handler = &proc_dointvec_jiffies,
2914 .strategy = &sysctl_jiffies,
2917 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2918 .procname = "gc_thresh",
2919 .data = &ipv4_dst_ops.gc_thresh,
2920 .maxlen = sizeof(int),
2921 .mode = 0644,
2922 .proc_handler = &proc_dointvec,
2925 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2926 .procname = "max_size",
2927 .data = &ip_rt_max_size,
2928 .maxlen = sizeof(int),
2929 .mode = 0644,
2930 .proc_handler = &proc_dointvec,
2933 /* Deprecated. Use gc_min_interval_ms */
2935 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2936 .procname = "gc_min_interval",
2937 .data = &ip_rt_gc_min_interval,
2938 .maxlen = sizeof(int),
2939 .mode = 0644,
2940 .proc_handler = &proc_dointvec_jiffies,
2941 .strategy = &sysctl_jiffies,
2944 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2945 .procname = "gc_min_interval_ms",
2946 .data = &ip_rt_gc_min_interval,
2947 .maxlen = sizeof(int),
2948 .mode = 0644,
2949 .proc_handler = &proc_dointvec_ms_jiffies,
2950 .strategy = &sysctl_ms_jiffies,
2953 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
2954 .procname = "gc_timeout",
2955 .data = &ip_rt_gc_timeout,
2956 .maxlen = sizeof(int),
2957 .mode = 0644,
2958 .proc_handler = &proc_dointvec_jiffies,
2959 .strategy = &sysctl_jiffies,
2962 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
2963 .procname = "gc_interval",
2964 .data = &ip_rt_gc_interval,
2965 .maxlen = sizeof(int),
2966 .mode = 0644,
2967 .proc_handler = &proc_dointvec_jiffies,
2968 .strategy = &sysctl_jiffies,
2971 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
2972 .procname = "redirect_load",
2973 .data = &ip_rt_redirect_load,
2974 .maxlen = sizeof(int),
2975 .mode = 0644,
2976 .proc_handler = &proc_dointvec,
2979 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2980 .procname = "redirect_number",
2981 .data = &ip_rt_redirect_number,
2982 .maxlen = sizeof(int),
2983 .mode = 0644,
2984 .proc_handler = &proc_dointvec,
2987 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2988 .procname = "redirect_silence",
2989 .data = &ip_rt_redirect_silence,
2990 .maxlen = sizeof(int),
2991 .mode = 0644,
2992 .proc_handler = &proc_dointvec,
2995 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
2996 .procname = "error_cost",
2997 .data = &ip_rt_error_cost,
2998 .maxlen = sizeof(int),
2999 .mode = 0644,
3000 .proc_handler = &proc_dointvec,
3003 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
3004 .procname = "error_burst",
3005 .data = &ip_rt_error_burst,
3006 .maxlen = sizeof(int),
3007 .mode = 0644,
3008 .proc_handler = &proc_dointvec,
3011 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
3012 .procname = "gc_elasticity",
3013 .data = &ip_rt_gc_elasticity,
3014 .maxlen = sizeof(int),
3015 .mode = 0644,
3016 .proc_handler = &proc_dointvec,
3019 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
3020 .procname = "mtu_expires",
3021 .data = &ip_rt_mtu_expires,
3022 .maxlen = sizeof(int),
3023 .mode = 0644,
3024 .proc_handler = &proc_dointvec_jiffies,
3025 .strategy = &sysctl_jiffies,
3028 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
3029 .procname = "min_pmtu",
3030 .data = &ip_rt_min_pmtu,
3031 .maxlen = sizeof(int),
3032 .mode = 0644,
3033 .proc_handler = &proc_dointvec,
3036 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
3037 .procname = "min_adv_mss",
3038 .data = &ip_rt_min_advmss,
3039 .maxlen = sizeof(int),
3040 .mode = 0644,
3041 .proc_handler = &proc_dointvec,
3044 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
3045 .procname = "secret_interval",
3046 .data = &ip_rt_secret_interval,
3047 .maxlen = sizeof(int),
3048 .mode = 0644,
3049 .proc_handler = &proc_dointvec_jiffies,
3050 .strategy = &sysctl_jiffies,
3052 { .ctl_name = 0 }
3054 #endif
3056 #ifdef CONFIG_NET_CLS_ROUTE
/* Base of the route accounting area; allocated in ip_rt_init(). */
struct ip_rt_acct *ip_rt_acct;

/* This code sucks.  But you should have seen it before! --RR */

/*
 * IP route accounting ptr for this logical cpu number.
 *
 * Each CPU owns a run of 256 struct ip_rt_acct entries.  The argument
 * is parenthesized so that an expression such as "cpu + 1" is scaled as
 * a whole rather than only its last operand.
 */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + (i) * 256)
3064 #ifdef CONFIG_PROC_FS
3065 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3066 int length, int *eof, void *data)
3068 unsigned int i;
3070 if ((offset & 3) || (length & 3))
3071 return -EIO;
3073 if (offset >= sizeof(struct ip_rt_acct) * 256) {
3074 *eof = 1;
3075 return 0;
3078 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3079 length = sizeof(struct ip_rt_acct) * 256 - offset;
3080 *eof = 1;
3083 offset /= sizeof(u32);
3085 if (length > 0) {
3086 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3087 u32 *dst = (u32 *) buffer;
3089 /* Copy first cpu. */
3090 *start = buffer;
3091 memcpy(dst, src, length);
3093 /* Add the other cpus in, one int at a time */
3094 for_each_possible_cpu(i) {
3095 unsigned int j;
3097 src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3099 for (j = 0; j < length/4; j++)
3100 dst[j] += src[j];
3103 return length;
3105 #endif /* CONFIG_PROC_FS */
3106 #endif /* CONFIG_NET_CLS_ROUTE */
3108 static __initdata unsigned long rhash_entries;
3109 static int __init set_rhash_entries(char *str)
3111 if (!str)
3112 return 0;
3113 rhash_entries = simple_strtoul(str, &str, 0);
3114 return 1;
3116 __setup("rhash_entries=", set_rhash_entries);
3118 int __init ip_rt_init(void)
3120 int rc = 0;
3122 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3123 (jiffies ^ (jiffies >> 7)));
3125 #ifdef CONFIG_NET_CLS_ROUTE
3127 int order;
3128 for (order = 0;
3129 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3130 /* NOTHING */;
3131 ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3132 if (!ip_rt_acct)
3133 panic("IP: failed to allocate ip_rt_acct\n");
3134 memset(ip_rt_acct, 0, PAGE_SIZE << order);
3136 #endif
3138 ipv4_dst_ops.kmem_cachep =
3139 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3140 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
3142 rt_hash_table = (struct rt_hash_bucket *)
3143 alloc_large_system_hash("IP route cache",
3144 sizeof(struct rt_hash_bucket),
3145 rhash_entries,
3146 (num_physpages >= 128 * 1024) ?
3147 15 : 17,
3149 &rt_hash_log,
3150 &rt_hash_mask,
3152 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3153 rt_hash_lock_init();
3155 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3156 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3158 devinet_init();
3159 ip_fib_init();
3161 init_timer(&rt_flush_timer);
3162 rt_flush_timer.function = rt_run_flush;
3163 init_timer(&rt_periodic_timer);
3164 rt_periodic_timer.function = rt_check_expire;
3165 init_timer(&rt_secret_timer);
3166 rt_secret_timer.function = rt_secret_rebuild;
3168 /* All the timers, started at system startup tend
3169 to synchronize. Perturb it a bit.
3171 rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3172 ip_rt_gc_interval;
3173 add_timer(&rt_periodic_timer);
3175 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3176 ip_rt_secret_interval;
3177 add_timer(&rt_secret_timer);
3179 #ifdef CONFIG_PROC_FS
3181 struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3182 if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3183 !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3184 proc_net_stat))) {
3185 return -ENOMEM;
3187 rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3189 #ifdef CONFIG_NET_CLS_ROUTE
3190 create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3191 #endif
3192 #endif
3193 #ifdef CONFIG_XFRM
3194 xfrm_init();
3195 xfrm4_init();
3196 #endif
3197 return rc;
/* Symbols exported to modules. */
EXPORT_SYMBOL(__ip_select_ident);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(ip_route_output_key);