net/ipv4/route.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
8 * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
16 * Fixes:
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
23 * Alan Cox : Super /proc >4K
24 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
26 * clamper.
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36 * Alan Cox : Aligned routing errors more closely with BSD
37 * though our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
42 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Split to fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file
57 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
65 #include <linux/config.h>
66 #include <linux/module.h>
67 #include <asm/uaccess.h>
68 #include <asm/system.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/sched.h>
73 #include <linux/mm.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
79 #include <linux/in.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/skbuff.h>
85 #include <linux/rtnetlink.h>
86 #include <linux/inetdevice.h>
87 #include <linux/igmp.h>
88 #include <linux/pkt_sched.h>
89 #include <linux/mroute.h>
90 #include <linux/netfilter_ipv4.h>
91 #include <linux/random.h>
92 #include <linux/jhash.h>
93 #include <linux/rcupdate.h>
94 #include <linux/times.h>
95 #include <net/protocol.h>
96 #include <net/ip.h>
97 #include <net/route.h>
98 #include <net/inetpeer.h>
99 #include <net/sock.h>
100 #include <net/ip_fib.h>
101 #include <net/arp.h>
102 #include <net/tcp.h>
103 #include <net/icmp.h>
104 #include <net/xfrm.h>
105 #include <net/ip_mp_alg.h>
106 #ifdef CONFIG_SYSCTL
107 #include <linux/sysctl.h>
108 #endif
110 #define RT_FL_TOS(oldflp) \
111 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
113 #define IP_MAX_MTU 0xFFF0
115 #define RT_GC_TIMEOUT (300*HZ)
117 static int ip_rt_min_delay = 2 * HZ;
118 static int ip_rt_max_delay = 10 * HZ;
119 static int ip_rt_max_size;
120 static int ip_rt_gc_timeout = RT_GC_TIMEOUT;
121 static int ip_rt_gc_interval = 60 * HZ;
122 static int ip_rt_gc_min_interval = HZ / 2;
123 static int ip_rt_redirect_number = 9;
124 static int ip_rt_redirect_load = HZ / 50;
125 static int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1));
126 static int ip_rt_error_cost = HZ;
127 static int ip_rt_error_burst = 5 * HZ;
128 static int ip_rt_gc_elasticity = 8;
129 static int ip_rt_mtu_expires = 10 * 60 * HZ;
130 static int ip_rt_min_pmtu = 512 + 20 + 20;
131 static int ip_rt_min_advmss = 256;
132 static int ip_rt_secret_interval = 10 * 60 * HZ;
133 static unsigned long rt_deadline;
135 #define RTprint(a...) printk(KERN_DEBUG a)
137 static struct timer_list rt_flush_timer;
138 static struct timer_list rt_periodic_timer;
139 static struct timer_list rt_secret_timer;
142 * Interface to generic destination cache.
145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
146 static void ipv4_dst_destroy(struct dst_entry *dst);
147 static void ipv4_dst_ifdown(struct dst_entry *dst,
148 struct net_device *dev, int how);
149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150 static void ipv4_link_failure(struct sk_buff *skb);
151 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
152 static int rt_garbage_collect(void);
155 static struct dst_ops ipv4_dst_ops = {
156 .family = AF_INET,
157 .protocol = __constant_htons(ETH_P_IP),
158 .gc = rt_garbage_collect,
159 .check = ipv4_dst_check,
160 .destroy = ipv4_dst_destroy,
161 .ifdown = ipv4_dst_ifdown,
162 .negative_advice = ipv4_negative_advice,
163 .link_failure = ipv4_link_failure,
164 .update_pmtu = ip_rt_update_pmtu,
165 .entry_size = sizeof(struct rtable),
168 #define ECN_OR_COST(class) TC_PRIO_##class
170 __u8 ip_tos2prio[16] = {
171 TC_PRIO_BESTEFFORT,
172 ECN_OR_COST(FILLER),
173 TC_PRIO_BESTEFFORT,
174 ECN_OR_COST(BESTEFFORT),
175 TC_PRIO_BULK,
176 ECN_OR_COST(BULK),
177 TC_PRIO_BULK,
178 ECN_OR_COST(BULK),
179 TC_PRIO_INTERACTIVE,
180 ECN_OR_COST(INTERACTIVE),
181 TC_PRIO_INTERACTIVE,
182 ECN_OR_COST(INTERACTIVE),
183 TC_PRIO_INTERACTIVE_BULK,
184 ECN_OR_COST(INTERACTIVE_BULK),
185 TC_PRIO_INTERACTIVE_BULK,
186 ECN_OR_COST(INTERACTIVE_BULK)
191 * Route cache.
194 /* The locking scheme is rather straightforward:
196 * 1) Read-Copy Update protects the buckets of the central route hash.
197 * 2) Only writers remove entries, and they hold the lock
198 * as they look at rtable reference counts.
199 * 3) Only readers acquire references to rtable entries,
200 * they do so with atomic increments and with the
201 * lock held.
204 struct rt_hash_bucket {
205 struct rtable *chain;
207 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
209 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
210 * The size of this table is a power of two and depends on the number of CPUs.
212 #if NR_CPUS >= 32
213 #define RT_HASH_LOCK_SZ 4096
214 #elif NR_CPUS >= 16
215 #define RT_HASH_LOCK_SZ 2048
216 #elif NR_CPUS >= 8
217 #define RT_HASH_LOCK_SZ 1024
218 #elif NR_CPUS >= 4
219 #define RT_HASH_LOCK_SZ 512
220 #else
221 #define RT_HASH_LOCK_SZ 256
222 #endif
224 static spinlock_t *rt_hash_locks;
225 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
226 # define rt_hash_lock_init() { \
227 int i; \
228 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
229 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
230 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
231 spin_lock_init(&rt_hash_locks[i]); \
233 #else
234 # define rt_hash_lock_addr(slot) NULL
235 # define rt_hash_lock_init()
236 #endif
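/*
 * Descriptive note on the locking above (based on the code in this file,
 * not part of the original comment): rt_hash_lock_addr(slot) simply masks
 * the bucket index with (RT_HASH_LOCK_SZ - 1), so several hash buckets
 * share one spinlock.  Writers take that lock (see rt_check_expire() and
 * rt_intern_hash() below); readers never take it and instead walk the
 * chains under rcu_read_lock()/rcu_read_lock_bh() only.
 */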
238 static struct rt_hash_bucket *rt_hash_table;
239 static unsigned rt_hash_mask;
240 static int rt_hash_log;
241 static unsigned int rt_hash_rnd;
243 struct rt_cache_stat *rt_cache_stat;
245 static int rt_intern_hash(unsigned hash, struct rtable *rth,
246 struct rtable **res);
248 static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
250 return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
251 & rt_hash_mask);
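/*
 * Usage note (derived from the callers in this file): lookups and
 * insertions typically perturb the source key with the interface index
 * before hashing, e.g.
 *
 *	hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
 *
 * (see ip_route_input() and ip_mkroute_output_def()), so that entries for
 * the same address pair but different interfaces (iif on input, oif on
 * output) land in different hash chains.
 */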
254 #ifdef CONFIG_PROC_FS
255 struct rt_cache_iter_state {
256 int bucket;
259 static struct rtable *rt_cache_get_first(struct seq_file *seq)
261 struct rtable *r = NULL;
262 struct rt_cache_iter_state *st = seq->private;
264 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
265 rcu_read_lock_bh();
266 r = rt_hash_table[st->bucket].chain;
267 if (r)
268 break;
269 rcu_read_unlock_bh();
271 return r;
274 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
276 struct rt_cache_iter_state *st = rcu_dereference(seq->private);
278 r = r->u.rt_next;
279 while (!r) {
280 rcu_read_unlock_bh();
281 if (--st->bucket < 0)
282 break;
283 rcu_read_lock_bh();
284 r = rt_hash_table[st->bucket].chain;
286 return r;
289 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
291 struct rtable *r = rt_cache_get_first(seq);
293 if (r)
294 while (pos && (r = rt_cache_get_next(seq, r)))
295 --pos;
296 return pos ? NULL : r;
299 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
301 return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
304 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
306 struct rtable *r = NULL;
308 if (v == SEQ_START_TOKEN)
309 r = rt_cache_get_first(seq);
310 else
311 r = rt_cache_get_next(seq, v);
312 ++*pos;
313 return r;
316 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
318 if (v && v != SEQ_START_TOKEN)
319 rcu_read_unlock_bh();
322 static int rt_cache_seq_show(struct seq_file *seq, void *v)
324 if (v == SEQ_START_TOKEN)
325 seq_printf(seq, "%-127s\n",
326 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
327 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
328 "HHUptod\tSpecDst");
329 else {
330 struct rtable *r = v;
331 char temp[256];
333 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
334 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
335 r->u.dst.dev ? r->u.dst.dev->name : "*",
336 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
337 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
338 r->u.dst.__use, 0, (unsigned long)r->rt_src,
339 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
340 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
341 dst_metric(&r->u.dst, RTAX_WINDOW),
342 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
343 dst_metric(&r->u.dst, RTAX_RTTVAR)),
344 r->fl.fl4_tos,
345 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
346 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
347 dev_queue_xmit) : 0,
348 r->rt_spec_dst);
349 seq_printf(seq, "%-127s\n", temp);
351 return 0;
354 static struct seq_operations rt_cache_seq_ops = {
355 .start = rt_cache_seq_start,
356 .next = rt_cache_seq_next,
357 .stop = rt_cache_seq_stop,
358 .show = rt_cache_seq_show,
361 static int rt_cache_seq_open(struct inode *inode, struct file *file)
363 struct seq_file *seq;
364 int rc = -ENOMEM;
365 struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
367 if (!s)
368 goto out;
369 rc = seq_open(file, &rt_cache_seq_ops);
370 if (rc)
371 goto out_kfree;
372 seq = file->private_data;
373 seq->private = s;
374 memset(s, 0, sizeof(*s));
375 out:
376 return rc;
377 out_kfree:
378 kfree(s);
379 goto out;
382 static struct file_operations rt_cache_seq_fops = {
383 .owner = THIS_MODULE,
384 .open = rt_cache_seq_open,
385 .read = seq_read,
386 .llseek = seq_lseek,
387 .release = seq_release_private,
391 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
393 int cpu;
395 if (*pos == 0)
396 return SEQ_START_TOKEN;
398 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
399 if (!cpu_possible(cpu))
400 continue;
401 *pos = cpu+1;
402 return per_cpu_ptr(rt_cache_stat, cpu);
404 return NULL;
407 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
409 int cpu;
411 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
412 if (!cpu_possible(cpu))
413 continue;
414 *pos = cpu+1;
415 return per_cpu_ptr(rt_cache_stat, cpu);
417 return NULL;
421 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
426 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
428 struct rt_cache_stat *st = v;
430 if (v == SEQ_START_TOKEN) {
431 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
432 return 0;
435 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
436 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
437 atomic_read(&ipv4_dst_ops.entries),
438 st->in_hit,
439 st->in_slow_tot,
440 st->in_slow_mc,
441 st->in_no_route,
442 st->in_brd,
443 st->in_martian_dst,
444 st->in_martian_src,
446 st->out_hit,
447 st->out_slow_tot,
448 st->out_slow_mc,
450 st->gc_total,
451 st->gc_ignored,
452 st->gc_goal_miss,
453 st->gc_dst_overflow,
454 st->in_hlist_search,
455 st->out_hlist_search
457 return 0;
460 static struct seq_operations rt_cpu_seq_ops = {
461 .start = rt_cpu_seq_start,
462 .next = rt_cpu_seq_next,
463 .stop = rt_cpu_seq_stop,
464 .show = rt_cpu_seq_show,
468 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
470 return seq_open(file, &rt_cpu_seq_ops);
473 static struct file_operations rt_cpu_seq_fops = {
474 .owner = THIS_MODULE,
475 .open = rt_cpu_seq_open,
476 .read = seq_read,
477 .llseek = seq_lseek,
478 .release = seq_release,
481 #endif /* CONFIG_PROC_FS */
483 static __inline__ void rt_free(struct rtable *rt)
485 multipath_remove(rt);
486 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
489 static __inline__ void rt_drop(struct rtable *rt)
491 multipath_remove(rt);
492 ip_rt_put(rt);
493 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
496 static __inline__ int rt_fast_clean(struct rtable *rth)
498 /* Kill broadcast/multicast entries very aggressively if they
499 collide in the hash table with more useful entries */
500 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
501 rth->fl.iif && rth->u.rt_next;
504 static __inline__ int rt_valuable(struct rtable *rth)
506 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
507 rth->u.dst.expires;
510 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
512 unsigned long age;
513 int ret = 0;
515 if (atomic_read(&rth->u.dst.__refcnt))
516 goto out;
518 ret = 1;
519 if (rth->u.dst.expires &&
520 time_after_eq(jiffies, rth->u.dst.expires))
521 goto out;
523 age = jiffies - rth->u.dst.lastuse;
524 ret = 0;
525 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
526 (age <= tmo2 && rt_valuable(rth)))
527 goto out;
528 ret = 1;
529 out: return ret;
532 /* Bits of score are:
533 * 31: very valuable
534 * 30: not quite useless
535 * 29..0: usage counter
537 static inline u32 rt_score(struct rtable *rt)
539 u32 score = jiffies - rt->u.dst.lastuse;
541 score = ~score & ~(3<<30);
543 if (rt_valuable(rt))
544 score |= (1<<31);
546 if (!rt->fl.iif ||
547 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
548 score |= (1<<30);
550 return score;
553 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
555 return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
556 fl1->oif == fl2->oif &&
557 fl1->iif == fl2->iif;
560 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
561 static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
562 struct rtable *expentry,
563 int *removed_count)
565 int passedexpired = 0;
566 struct rtable **nextstep = NULL;
567 struct rtable **rthp = chain_head;
568 struct rtable *rth;
570 if (removed_count)
571 *removed_count = 0;
573 while ((rth = *rthp) != NULL) {
574 if (rth == expentry)
575 passedexpired = 1;
577 if (((*rthp)->u.dst.flags & DST_BALANCED) != 0 &&
578 compare_keys(&(*rthp)->fl, &expentry->fl)) {
579 if (*rthp == expentry) {
580 *rthp = rth->u.rt_next;
581 continue;
582 } else {
583 *rthp = rth->u.rt_next;
584 rt_free(rth);
585 if (removed_count)
586 ++(*removed_count);
588 } else {
589 if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
590 passedexpired && !nextstep)
591 nextstep = &rth->u.rt_next;
593 rthp = &rth->u.rt_next;
597 rt_free(expentry);
598 if (removed_count)
599 ++(*removed_count);
601 return nextstep;
603 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
606 /* This runs via a timer and thus is always in BH context. */
607 static void rt_check_expire(unsigned long dummy)
609 static unsigned int rover;
610 unsigned int i = rover, goal;
611 struct rtable *rth, **rthp;
612 unsigned long now = jiffies;
613 u64 mult;
615 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
616 if (ip_rt_gc_timeout > 1)
617 do_div(mult, ip_rt_gc_timeout);
618 goal = (unsigned int)mult;
619 if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
620 for (; goal > 0; goal--) {
621 unsigned long tmo = ip_rt_gc_timeout;
623 i = (i + 1) & rt_hash_mask;
624 rthp = &rt_hash_table[i].chain;
626 if (*rthp == 0)
627 continue;
628 spin_lock(rt_hash_lock_addr(i));
629 while ((rth = *rthp) != NULL) {
630 if (rth->u.dst.expires) {
631 /* Entry is expired even if it is in use */
632 if (time_before_eq(now, rth->u.dst.expires)) {
633 tmo >>= 1;
634 rthp = &rth->u.rt_next;
635 continue;
637 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
638 tmo >>= 1;
639 rthp = &rth->u.rt_next;
640 continue;
643 /* Cleanup aged off entries. */
644 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
645 /* remove all related balanced entries if necessary */
646 if (rth->u.dst.flags & DST_BALANCED) {
647 rthp = rt_remove_balanced_route(
648 &rt_hash_table[i].chain,
649 rth, NULL);
650 if (!rthp)
651 break;
652 } else {
653 *rthp = rth->u.rt_next;
654 rt_free(rth);
656 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
657 *rthp = rth->u.rt_next;
658 rt_free(rth);
659 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
661 spin_unlock(rt_hash_lock_addr(i));
663 /* Fallback loop breaker. */
664 if (time_after(jiffies, now))
665 break;
667 rover = i;
668 mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
671 /* This can run from both BH and non-BH contexts, the latter
672 * in the case of a forced flush event.
674 static void rt_run_flush(unsigned long dummy)
676 int i;
677 struct rtable *rth, *next;
679 rt_deadline = 0;
681 get_random_bytes(&rt_hash_rnd, 4);
683 for (i = rt_hash_mask; i >= 0; i--) {
684 spin_lock_bh(rt_hash_lock_addr(i));
685 rth = rt_hash_table[i].chain;
686 if (rth)
687 rt_hash_table[i].chain = NULL;
688 spin_unlock_bh(rt_hash_lock_addr(i));
690 for (; rth; rth = next) {
691 next = rth->u.rt_next;
692 rt_free(rth);
697 static DEFINE_SPINLOCK(rt_flush_lock);
699 void rt_cache_flush(int delay)
701 unsigned long now = jiffies;
702 int user_mode = !in_softirq();
704 if (delay < 0)
705 delay = ip_rt_min_delay;
707 /* flush existing multipath state*/
708 multipath_flush();
710 spin_lock_bh(&rt_flush_lock);
712 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
713 long tmo = (long)(rt_deadline - now);
715 /* If flush timer is already running
716 and flush request is not immediate (delay > 0):
718 if deadline is not achieved, prolongate timer to "delay",
719 otherwise fire it at deadline time.
722 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
723 tmo = 0;
725 if (delay > tmo)
726 delay = tmo;
729 if (delay <= 0) {
730 spin_unlock_bh(&rt_flush_lock);
731 rt_run_flush(0);
732 return;
735 if (rt_deadline == 0)
736 rt_deadline = now + ip_rt_max_delay;
738 mod_timer(&rt_flush_timer, now+delay);
739 spin_unlock_bh(&rt_flush_lock);
742 static void rt_secret_rebuild(unsigned long dummy)
744 unsigned long now = jiffies;
746 rt_cache_flush(0);
747 mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
751 Short description of GC goals.
753 We want to build an algorithm which keeps the routing cache
754 at an equilibrium point, where the number of aged-off entries
755 stays approximately equal to the number of newly generated ones.
757 The current expiration strength is the variable "expire".
758 We try to adjust it dynamically, so that when networking
759 is idle "expire" is large enough to keep enough warm entries,
760 and when load increases it shrinks to limit the cache size.
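/*
 * Worked example (illustrative numbers only; they depend on the machine):
 * with ip_rt_gc_elasticity = 8 and a 1024-bucket hash table
 * (rt_hash_log = 10), the primary goal computed below is
 *
 *	goal = entries - (8 << 10) = entries - 8192
 *
 * so the aggressive path is only taken once the cache holds more than
 * 8192 entries; below that, GC merely nudges "equilibrium" and expires a
 * smaller number of unused entries.
 */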
763 static int rt_garbage_collect(void)
765 static unsigned long expire = RT_GC_TIMEOUT;
766 static unsigned long last_gc;
767 static int rover;
768 static int equilibrium;
769 struct rtable *rth, **rthp;
770 unsigned long now = jiffies;
771 int goal;
774 * Garbage collection is pretty expensive,
775 * do not make it too frequently.
778 RT_CACHE_STAT_INC(gc_total);
780 if (now - last_gc < ip_rt_gc_min_interval &&
781 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
782 RT_CACHE_STAT_INC(gc_ignored);
783 goto out;
786 /* Calculate number of entries, which we want to expire now. */
787 goal = atomic_read(&ipv4_dst_ops.entries) -
788 (ip_rt_gc_elasticity << rt_hash_log);
789 if (goal <= 0) {
790 if (equilibrium < ipv4_dst_ops.gc_thresh)
791 equilibrium = ipv4_dst_ops.gc_thresh;
792 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
793 if (goal > 0) {
794 equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
795 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
797 } else {
798 /* We are in a dangerous area. Try to reduce the cache really
799 * aggressively.
801 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
802 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
805 if (now - last_gc >= ip_rt_gc_min_interval)
806 last_gc = now;
808 if (goal <= 0) {
809 equilibrium += goal;
810 goto work_done;
813 do {
814 int i, k;
816 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
817 unsigned long tmo = expire;
819 k = (k + 1) & rt_hash_mask;
820 rthp = &rt_hash_table[k].chain;
821 spin_lock_bh(rt_hash_lock_addr(k));
822 while ((rth = *rthp) != NULL) {
823 if (!rt_may_expire(rth, tmo, expire)) {
824 tmo >>= 1;
825 rthp = &rth->u.rt_next;
826 continue;
828 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
829 /* remove all related balanced entries
830 * if necessary
832 if (rth->u.dst.flags & DST_BALANCED) {
833 int r;
835 rthp = rt_remove_balanced_route(
836 &rt_hash_table[i].chain,
837 rth,
838 &r);
839 goal -= r;
840 if (!rthp)
841 break;
842 } else {
843 *rthp = rth->u.rt_next;
844 rt_free(rth);
845 goal--;
847 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
848 *rthp = rth->u.rt_next;
849 rt_free(rth);
850 goal--;
851 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
853 spin_unlock_bh(rt_hash_lock_addr(k));
854 if (goal <= 0)
855 break;
857 rover = k;
859 if (goal <= 0)
860 goto work_done;
862 /* Goal is not achieved. We stop the process if:
864 - expire has been reduced to zero; otherwise, expire is halved.
865 - the table is not full.
866 - we are called from interrupt context.
867 - the jiffies check is just a fallback/debug loop breaker.
868 We will not spin here for a long time in any case.
871 RT_CACHE_STAT_INC(gc_goal_miss);
873 if (expire == 0)
874 break;
876 expire >>= 1;
877 #if RT_CACHE_DEBUG >= 2
878 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
879 atomic_read(&ipv4_dst_ops.entries), goal, i);
880 #endif
882 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
883 goto out;
884 } while (!in_softirq() && time_before_eq(jiffies, now));
886 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
887 goto out;
888 if (net_ratelimit())
889 printk(KERN_WARNING "dst cache overflow\n");
890 RT_CACHE_STAT_INC(gc_dst_overflow);
891 return 1;
893 work_done:
894 expire += ip_rt_gc_min_interval;
895 if (expire > ip_rt_gc_timeout ||
896 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
897 expire = ip_rt_gc_timeout;
898 #if RT_CACHE_DEBUG >= 2
899 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
900 atomic_read(&ipv4_dst_ops.entries), goal, rover);
901 #endif
902 out: return 0;
905 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
907 struct rtable *rth, **rthp;
908 unsigned long now;
909 struct rtable *cand, **candp;
910 u32 min_score;
911 int chain_length;
912 int attempts = !in_softirq();
914 restart:
915 chain_length = 0;
916 min_score = ~(u32)0;
917 cand = NULL;
918 candp = NULL;
919 now = jiffies;
921 rthp = &rt_hash_table[hash].chain;
923 spin_lock_bh(rt_hash_lock_addr(hash));
924 while ((rth = *rthp) != NULL) {
925 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
926 if (!(rth->u.dst.flags & DST_BALANCED) &&
927 compare_keys(&rth->fl, &rt->fl)) {
928 #else
929 if (compare_keys(&rth->fl, &rt->fl)) {
930 #endif
931 /* Put it first */
932 *rthp = rth->u.rt_next;
934 * Since lookup is lockfree, the deletion
935 * must be visible to another weakly ordered CPU before
936 * the insertion at the start of the hash chain.
938 rcu_assign_pointer(rth->u.rt_next,
939 rt_hash_table[hash].chain);
941 * Since lookup is lockfree, the update writes
942 * must be ordered for consistency on SMP.
944 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
946 rth->u.dst.__use++;
947 dst_hold(&rth->u.dst);
948 rth->u.dst.lastuse = now;
949 spin_unlock_bh(rt_hash_lock_addr(hash));
951 rt_drop(rt);
952 *rp = rth;
953 return 0;
956 if (!atomic_read(&rth->u.dst.__refcnt)) {
957 u32 score = rt_score(rth);
959 if (score <= min_score) {
960 cand = rth;
961 candp = rthp;
962 min_score = score;
966 chain_length++;
968 rthp = &rth->u.rt_next;
971 if (cand) {
972 /* ip_rt_gc_elasticity used to be the average chain
973 * length; when it is exceeded, gc becomes really aggressive.
975 * The second limit is less certain. At the moment it allows
976 * only 2 entries per bucket. We will see.
978 if (chain_length > ip_rt_gc_elasticity) {
979 *candp = cand->u.rt_next;
980 rt_free(cand);
984 /* Try to bind the route to arp only if it is an output
985 route or a unicast forwarding path.
987 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
988 int err = arp_bind_neighbour(&rt->u.dst);
989 if (err) {
990 spin_unlock_bh(rt_hash_lock_addr(hash));
992 if (err != -ENOBUFS) {
993 rt_drop(rt);
994 return err;
997 /* Neighbour tables are full and nothing
998 can be released. Try to shrink the route cache;
999 it most likely holds some neighbour records.
1001 if (attempts-- > 0) {
1002 int saved_elasticity = ip_rt_gc_elasticity;
1003 int saved_int = ip_rt_gc_min_interval;
1004 ip_rt_gc_elasticity = 1;
1005 ip_rt_gc_min_interval = 0;
1006 rt_garbage_collect();
1007 ip_rt_gc_min_interval = saved_int;
1008 ip_rt_gc_elasticity = saved_elasticity;
1009 goto restart;
1012 if (net_ratelimit())
1013 printk(KERN_WARNING "Neighbour table overflow.\n");
1014 rt_drop(rt);
1015 return -ENOBUFS;
1019 rt->u.rt_next = rt_hash_table[hash].chain;
1020 #if RT_CACHE_DEBUG >= 2
1021 if (rt->u.rt_next) {
1022 struct rtable *trt;
1023 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1024 NIPQUAD(rt->rt_dst));
1025 for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
1026 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1027 printk("\n");
1029 #endif
1030 rt_hash_table[hash].chain = rt;
1031 spin_unlock_bh(rt_hash_lock_addr(hash));
1032 *rp = rt;
1033 return 0;
1036 void rt_bind_peer(struct rtable *rt, int create)
1038 static DEFINE_SPINLOCK(rt_peer_lock);
1039 struct inet_peer *peer;
1041 peer = inet_getpeer(rt->rt_dst, create);
1043 spin_lock_bh(&rt_peer_lock);
1044 if (rt->peer == NULL) {
1045 rt->peer = peer;
1046 peer = NULL;
1048 spin_unlock_bh(&rt_peer_lock);
1049 if (peer)
1050 inet_putpeer(peer);
1054 * Peer allocation may fail only in serious out-of-memory conditions. However,
1055 * we can still generate some output.
1056 * Random ID selection looks a bit dangerous because we have no chance of
1057 * selecting an ID that is unique within a reasonable period of time.
1058 * But a broken packet identifier may be better than no packet at all.
1060 static void ip_select_fb_ident(struct iphdr *iph)
1062 static DEFINE_SPINLOCK(ip_fb_id_lock);
1063 static u32 ip_fallback_id;
1064 u32 salt;
1066 spin_lock_bh(&ip_fb_id_lock);
1067 salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
1068 iph->id = htons(salt & 0xFFFF);
1069 ip_fallback_id = salt;
1070 spin_unlock_bh(&ip_fb_id_lock);
1073 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1075 struct rtable *rt = (struct rtable *) dst;
1077 if (rt) {
1078 if (rt->peer == NULL)
1079 rt_bind_peer(rt, 1);
1081 /* If a peer is attached to the destination, it is never detached,
1082 so we do not need to grab a lock to dereference it.
1084 if (rt->peer) {
1085 iph->id = htons(inet_getid(rt->peer, more));
1086 return;
1088 } else
1089 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1090 __builtin_return_address(0));
1092 ip_select_fb_ident(iph);
1095 static void rt_del(unsigned hash, struct rtable *rt)
1097 struct rtable **rthp;
1099 spin_lock_bh(rt_hash_lock_addr(hash));
1100 ip_rt_put(rt);
1101 for (rthp = &rt_hash_table[hash].chain; *rthp;
1102 rthp = &(*rthp)->u.rt_next)
1103 if (*rthp == rt) {
1104 *rthp = rt->u.rt_next;
1105 rt_free(rt);
1106 break;
1108 spin_unlock_bh(rt_hash_lock_addr(hash));
1111 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
1112 u32 saddr, u8 tos, struct net_device *dev)
1114 int i, k;
1115 struct in_device *in_dev = in_dev_get(dev);
1116 struct rtable *rth, **rthp;
1117 u32 skeys[2] = { saddr, 0 };
1118 int ikeys[2] = { dev->ifindex, 0 };
1120 tos &= IPTOS_RT_MASK;
1122 if (!in_dev)
1123 return;
1125 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1126 || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1127 goto reject_redirect;
1129 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1130 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1131 goto reject_redirect;
1132 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1133 goto reject_redirect;
1134 } else {
1135 if (inet_addr_type(new_gw) != RTN_UNICAST)
1136 goto reject_redirect;
1139 for (i = 0; i < 2; i++) {
1140 for (k = 0; k < 2; k++) {
1141 unsigned hash = rt_hash_code(daddr,
1142 skeys[i] ^ (ikeys[k] << 5),
1143 tos);
1145 rthp=&rt_hash_table[hash].chain;
1147 rcu_read_lock();
1148 while ((rth = rcu_dereference(*rthp)) != NULL) {
1149 struct rtable *rt;
1151 if (rth->fl.fl4_dst != daddr ||
1152 rth->fl.fl4_src != skeys[i] ||
1153 rth->fl.fl4_tos != tos ||
1154 rth->fl.oif != ikeys[k] ||
1155 rth->fl.iif != 0) {
1156 rthp = &rth->u.rt_next;
1157 continue;
1160 if (rth->rt_dst != daddr ||
1161 rth->rt_src != saddr ||
1162 rth->u.dst.error ||
1163 rth->rt_gateway != old_gw ||
1164 rth->u.dst.dev != dev)
1165 break;
1167 dst_hold(&rth->u.dst);
1168 rcu_read_unlock();
1170 rt = dst_alloc(&ipv4_dst_ops);
1171 if (rt == NULL) {
1172 ip_rt_put(rth);
1173 in_dev_put(in_dev);
1174 return;
1177 /* Copy all the information. */
1178 *rt = *rth;
1179 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1180 rt->u.dst.__use = 1;
1181 atomic_set(&rt->u.dst.__refcnt, 1);
1182 rt->u.dst.child = NULL;
1183 if (rt->u.dst.dev)
1184 dev_hold(rt->u.dst.dev);
1185 if (rt->idev)
1186 in_dev_hold(rt->idev);
1187 rt->u.dst.obsolete = 0;
1188 rt->u.dst.lastuse = jiffies;
1189 rt->u.dst.path = &rt->u.dst;
1190 rt->u.dst.neighbour = NULL;
1191 rt->u.dst.hh = NULL;
1192 rt->u.dst.xfrm = NULL;
1194 rt->rt_flags |= RTCF_REDIRECTED;
1196 /* Gateway is different ... */
1197 rt->rt_gateway = new_gw;
1199 /* Redirect received -> path was valid */
1200 dst_confirm(&rth->u.dst);
1202 if (rt->peer)
1203 atomic_inc(&rt->peer->refcnt);
1205 if (arp_bind_neighbour(&rt->u.dst) ||
1206 !(rt->u.dst.neighbour->nud_state &
1207 NUD_VALID)) {
1208 if (rt->u.dst.neighbour)
1209 neigh_event_send(rt->u.dst.neighbour, NULL);
1210 ip_rt_put(rth);
1211 rt_drop(rt);
1212 goto do_next;
1215 rt_del(hash, rth);
1216 if (!rt_intern_hash(hash, rt, &rt))
1217 ip_rt_put(rt);
1218 goto do_next;
1220 rcu_read_unlock();
1221 do_next:
1225 in_dev_put(in_dev);
1226 return;
1228 reject_redirect:
1229 #ifdef CONFIG_IP_ROUTE_VERBOSE
1230 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1231 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1232 "%u.%u.%u.%u ignored.\n"
1233 " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
1234 "tos %02x\n",
1235 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1236 NIPQUAD(saddr), NIPQUAD(daddr), tos);
1237 #endif
1238 in_dev_put(in_dev);
1241 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1243 struct rtable *rt = (struct rtable*)dst;
1244 struct dst_entry *ret = dst;
1246 if (rt) {
1247 if (dst->obsolete) {
1248 ip_rt_put(rt);
1249 ret = NULL;
1250 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1251 rt->u.dst.expires) {
1252 unsigned hash = rt_hash_code(rt->fl.fl4_dst,
1253 rt->fl.fl4_src ^
1254 (rt->fl.oif << 5),
1255 rt->fl.fl4_tos);
1256 #if RT_CACHE_DEBUG >= 1
1257 printk(KERN_DEBUG "ip_rt_advice: redirect to "
1258 "%u.%u.%u.%u/%02x dropped\n",
1259 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1260 #endif
1261 rt_del(hash, rt);
1262 ret = NULL;
1265 return ret;
1269 * Algorithm:
1270 * 1. The first ip_rt_redirect_number redirects are sent
1271 * with exponential backoff, then we stop sending them at all,
1272 * assuming that the host ignores our redirects.
1273 * 2. If we did not see packets requiring redirects
1274 * during ip_rt_redirect_silence, we assume that the host
1275 * forgot the redirected route, and we start sending redirects again.
1277 * This algorithm is much cheaper and more intelligent than dumb load limiting
1278 * in icmp.c.
1280 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1281 * and "frag. need" (breaks PMTU discovery) in icmp.c.
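/*
 * Illustrative timing, assuming HZ = 1000 and the defaults above
 * (ip_rt_redirect_load = HZ/50, ip_rt_redirect_number = 9,
 * ip_rt_redirect_silence = (HZ/50) << 10):
 *
 * the minimum gap enforced below is ip_rt_redirect_load << rate_tokens,
 * i.e. 20 ms, 40 ms, 80 ms, ... doubling with every redirect sent.  After
 * 9 redirects nothing more is sent, and the token count is reset only once
 * roughly 20.5 seconds (20480 jiffies) have passed since the last packet
 * that would have triggered a redirect.
 */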
1284 void ip_rt_send_redirect(struct sk_buff *skb)
1286 struct rtable *rt = (struct rtable*)skb->dst;
1287 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1289 if (!in_dev)
1290 return;
1292 if (!IN_DEV_TX_REDIRECTS(in_dev))
1293 goto out;
1295 /* No redirected packets during ip_rt_redirect_silence;
1296 * reset the algorithm.
1298 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1299 rt->u.dst.rate_tokens = 0;
1301 /* Too many ignored redirects; do not send anything;
1302 * just set u.dst.rate_last to the time of the last redirect-worthy packet seen.
1304 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1305 rt->u.dst.rate_last = jiffies;
1306 goto out;
1309 /* Check for load limit; set rate_last to the latest sent
1310 * redirect.
1312 if (time_after(jiffies,
1313 (rt->u.dst.rate_last +
1314 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1315 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1316 rt->u.dst.rate_last = jiffies;
1317 ++rt->u.dst.rate_tokens;
1318 #ifdef CONFIG_IP_ROUTE_VERBOSE
1319 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1320 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1321 net_ratelimit())
1322 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1323 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1324 NIPQUAD(rt->rt_src), rt->rt_iif,
1325 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1326 #endif
1328 out:
1329 in_dev_put(in_dev);
1332 static int ip_error(struct sk_buff *skb)
1334 struct rtable *rt = (struct rtable*)skb->dst;
1335 unsigned long now;
1336 int code;
1338 switch (rt->u.dst.error) {
1339 case EINVAL:
1340 default:
1341 goto out;
1342 case EHOSTUNREACH:
1343 code = ICMP_HOST_UNREACH;
1344 break;
1345 case ENETUNREACH:
1346 code = ICMP_NET_UNREACH;
1347 break;
1348 case EACCES:
1349 code = ICMP_PKT_FILTERED;
1350 break;
1353 now = jiffies;
1354 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1355 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1356 rt->u.dst.rate_tokens = ip_rt_error_burst;
1357 rt->u.dst.rate_last = now;
1358 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1359 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1360 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1363 out: kfree_skb(skb);
1364 return 0;
1368 * The last two values are not from the RFC but
1369 * are needed for AMPRnet AX.25 paths.
1372 static unsigned short mtu_plateau[] =
1373 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1375 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1377 int i;
1379 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1380 if (old_mtu > mtu_plateau[i])
1381 return mtu_plateau[i];
1382 return 68;
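/*
 * Illustrative values (derived from the plateau table above): an original
 * datagram length of 1500 maps to the next lower plateau, 1492; 296 maps
 * to 216; and anything at or below the smallest plateau (128) falls back
 * to the minimum of 68.
 */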
1385 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1387 int i;
1388 unsigned short old_mtu = ntohs(iph->tot_len);
1389 struct rtable *rth;
1390 u32 skeys[2] = { iph->saddr, 0, };
1391 u32 daddr = iph->daddr;
1392 u8 tos = iph->tos & IPTOS_RT_MASK;
1393 unsigned short est_mtu = 0;
1395 if (ipv4_config.no_pmtu_disc)
1396 return 0;
1398 for (i = 0; i < 2; i++) {
1399 unsigned hash = rt_hash_code(daddr, skeys[i], tos);
1401 rcu_read_lock();
1402 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1403 rth = rcu_dereference(rth->u.rt_next)) {
1404 if (rth->fl.fl4_dst == daddr &&
1405 rth->fl.fl4_src == skeys[i] &&
1406 rth->rt_dst == daddr &&
1407 rth->rt_src == iph->saddr &&
1408 rth->fl.fl4_tos == tos &&
1409 rth->fl.iif == 0 &&
1410 !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1411 unsigned short mtu = new_mtu;
1413 if (new_mtu < 68 || new_mtu >= old_mtu) {
1415 /* BSD 4.2 compatibility hack :-( */
1416 if (mtu == 0 &&
1417 old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1418 old_mtu >= 68 + (iph->ihl << 2))
1419 old_mtu -= iph->ihl << 2;
1421 mtu = guess_mtu(old_mtu);
1423 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1424 if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1425 dst_confirm(&rth->u.dst);
1426 if (mtu < ip_rt_min_pmtu) {
1427 mtu = ip_rt_min_pmtu;
1428 rth->u.dst.metrics[RTAX_LOCK-1] |=
1429 (1 << RTAX_MTU);
1431 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1432 dst_set_expires(&rth->u.dst,
1433 ip_rt_mtu_expires);
1435 est_mtu = mtu;
1439 rcu_read_unlock();
1441 return est_mtu ? : new_mtu;
1444 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1446 if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1447 !(dst_metric_locked(dst, RTAX_MTU))) {
1448 if (mtu < ip_rt_min_pmtu) {
1449 mtu = ip_rt_min_pmtu;
1450 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1452 dst->metrics[RTAX_MTU-1] = mtu;
1453 dst_set_expires(dst, ip_rt_mtu_expires);
1457 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1459 return NULL;
1462 static void ipv4_dst_destroy(struct dst_entry *dst)
1464 struct rtable *rt = (struct rtable *) dst;
1465 struct inet_peer *peer = rt->peer;
1466 struct in_device *idev = rt->idev;
1468 if (peer) {
1469 rt->peer = NULL;
1470 inet_putpeer(peer);
1473 if (idev) {
1474 rt->idev = NULL;
1475 in_dev_put(idev);
1479 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1480 int how)
1482 struct rtable *rt = (struct rtable *) dst;
1483 struct in_device *idev = rt->idev;
1484 if (dev != &loopback_dev && idev && idev->dev == dev) {
1485 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1486 if (loopback_idev) {
1487 rt->idev = loopback_idev;
1488 in_dev_put(idev);
1493 static void ipv4_link_failure(struct sk_buff *skb)
1495 struct rtable *rt;
1497 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1499 rt = (struct rtable *) skb->dst;
1500 if (rt)
1501 dst_set_expires(&rt->u.dst, 0);
1504 static int ip_rt_bug(struct sk_buff *skb)
1506 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1507 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1508 skb->dev ? skb->dev->name : "?");
1509 kfree_skb(skb);
1510 return 0;
1514 We do not cache the source address of the outgoing interface,
1515 because it is used only by the IP RR, TS and SRR options,
1516 so it is out of the fast path.
1518 BTW remember: "addr" is allowed to be unaligned
1519 in IP options!
1522 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1524 u32 src;
1525 struct fib_result res;
1527 if (rt->fl.iif == 0)
1528 src = rt->rt_src;
1529 else if (fib_lookup(&rt->fl, &res) == 0) {
1530 src = FIB_RES_PREFSRC(res);
1531 fib_res_put(&res);
1532 } else
1533 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1534 RT_SCOPE_UNIVERSE);
1535 memcpy(addr, &src, 4);
1538 #ifdef CONFIG_NET_CLS_ROUTE
1539 static void set_class_tag(struct rtable *rt, u32 tag)
1541 if (!(rt->u.dst.tclassid & 0xFFFF))
1542 rt->u.dst.tclassid |= tag & 0xFFFF;
1543 if (!(rt->u.dst.tclassid & 0xFFFF0000))
1544 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1546 #endif
1548 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1550 struct fib_info *fi = res->fi;
1552 if (fi) {
1553 if (FIB_RES_GW(*res) &&
1554 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1555 rt->rt_gateway = FIB_RES_GW(*res);
1556 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1557 sizeof(rt->u.dst.metrics));
1558 if (fi->fib_mtu == 0) {
1559 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1560 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1561 rt->rt_gateway != rt->rt_dst &&
1562 rt->u.dst.dev->mtu > 576)
1563 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1565 #ifdef CONFIG_NET_CLS_ROUTE
1566 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1567 #endif
1568 } else
1569 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1571 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1572 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1573 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1574 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1575 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1576 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1577 ip_rt_min_advmss);
1578 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1579 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1581 #ifdef CONFIG_NET_CLS_ROUTE
1582 #ifdef CONFIG_IP_MULTIPLE_TABLES
1583 set_class_tag(rt, fib_rules_tclass(res));
1584 #endif
1585 set_class_tag(rt, itag);
1586 #endif
1587 rt->rt_type = res->type;
1590 static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1591 u8 tos, struct net_device *dev, int our)
1593 unsigned hash;
1594 struct rtable *rth;
1595 u32 spec_dst;
1596 struct in_device *in_dev = in_dev_get(dev);
1597 u32 itag = 0;
1599 /* Primary sanity checks. */
1601 if (in_dev == NULL)
1602 return -EINVAL;
1604 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1605 skb->protocol != htons(ETH_P_IP))
1606 goto e_inval;
1608 if (ZERONET(saddr)) {
1609 if (!LOCAL_MCAST(daddr))
1610 goto e_inval;
1611 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1612 } else if (fib_validate_source(saddr, 0, tos, 0,
1613 dev, &spec_dst, &itag) < 0)
1614 goto e_inval;
1616 rth = dst_alloc(&ipv4_dst_ops);
1617 if (!rth)
1618 goto e_nobufs;
1620 rth->u.dst.output= ip_rt_bug;
1622 atomic_set(&rth->u.dst.__refcnt, 1);
1623 rth->u.dst.flags= DST_HOST;
1624 if (in_dev->cnf.no_policy)
1625 rth->u.dst.flags |= DST_NOPOLICY;
1626 rth->fl.fl4_dst = daddr;
1627 rth->rt_dst = daddr;
1628 rth->fl.fl4_tos = tos;
1629 #ifdef CONFIG_IP_ROUTE_FWMARK
1630 rth->fl.fl4_fwmark= skb->nfmark;
1631 #endif
1632 rth->fl.fl4_src = saddr;
1633 rth->rt_src = saddr;
1634 #ifdef CONFIG_NET_CLS_ROUTE
1635 rth->u.dst.tclassid = itag;
1636 #endif
1637 rth->rt_iif =
1638 rth->fl.iif = dev->ifindex;
1639 rth->u.dst.dev = &loopback_dev;
1640 dev_hold(rth->u.dst.dev);
1641 rth->idev = in_dev_get(rth->u.dst.dev);
1642 rth->fl.oif = 0;
1643 rth->rt_gateway = daddr;
1644 rth->rt_spec_dst= spec_dst;
1645 rth->rt_type = RTN_MULTICAST;
1646 rth->rt_flags = RTCF_MULTICAST;
1647 if (our) {
1648 rth->u.dst.input= ip_local_deliver;
1649 rth->rt_flags |= RTCF_LOCAL;
1652 #ifdef CONFIG_IP_MROUTE
1653 if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1654 rth->u.dst.input = ip_mr_input;
1655 #endif
1656 RT_CACHE_STAT_INC(in_slow_mc);
1658 in_dev_put(in_dev);
1659 hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
1660 return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1662 e_nobufs:
1663 in_dev_put(in_dev);
1664 return -ENOBUFS;
1666 e_inval:
1667 in_dev_put(in_dev);
1668 return -EINVAL;
1672 static void ip_handle_martian_source(struct net_device *dev,
1673 struct in_device *in_dev,
1674 struct sk_buff *skb,
1675 u32 daddr,
1676 u32 saddr)
1678 RT_CACHE_STAT_INC(in_martian_src);
1679 #ifdef CONFIG_IP_ROUTE_VERBOSE
1680 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1682 * RFC1812 recommendation: if the source is martian,
1683 * the only hint is the MAC header.
1685 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1686 "%u.%u.%u.%u, on dev %s\n",
1687 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1688 if (dev->hard_header_len) {
1689 int i;
1690 unsigned char *p = skb->mac.raw;
1691 printk(KERN_WARNING "ll header: ");
1692 for (i = 0; i < dev->hard_header_len; i++, p++) {
1693 printk("%02x", *p);
1694 if (i < (dev->hard_header_len - 1))
1695 printk(":");
1697 printk("\n");
1700 #endif
1703 static inline int __mkroute_input(struct sk_buff *skb,
1704 struct fib_result* res,
1705 struct in_device *in_dev,
1706 u32 daddr, u32 saddr, u32 tos,
1707 struct rtable **result)
1710 struct rtable *rth;
1711 int err;
1712 struct in_device *out_dev;
1713 unsigned flags = 0;
1714 u32 spec_dst, itag;
1716 /* get a working reference to the output device */
1717 out_dev = in_dev_get(FIB_RES_DEV(*res));
1718 if (out_dev == NULL) {
1719 if (net_ratelimit())
1720 printk(KERN_CRIT "Bug in ip_route_input" \
1721 "_slow(). Please, report\n");
1722 return -EINVAL;
1726 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1727 in_dev->dev, &spec_dst, &itag);
1728 if (err < 0) {
1729 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1730 saddr);
1732 err = -EINVAL;
1733 goto cleanup;
1736 if (err)
1737 flags |= RTCF_DIRECTSRC;
1739 if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1740 (IN_DEV_SHARED_MEDIA(out_dev) ||
1741 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1742 flags |= RTCF_DOREDIRECT;
1744 if (skb->protocol != htons(ETH_P_IP)) {
1745 /* Not IP (i.e. ARP). Do not create route, if it is
1746 * invalid for proxy arp. DNAT routes are always valid.
1748 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1749 err = -EINVAL;
1750 goto cleanup;
1755 rth = dst_alloc(&ipv4_dst_ops);
1756 if (!rth) {
1757 err = -ENOBUFS;
1758 goto cleanup;
1761 rth->u.dst.flags= DST_HOST;
1762 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1763 if (res->fi->fib_nhs > 1)
1764 rth->u.dst.flags |= DST_BALANCED;
1765 #endif
1766 if (in_dev->cnf.no_policy)
1767 rth->u.dst.flags |= DST_NOPOLICY;
1768 if (in_dev->cnf.no_xfrm)
1769 rth->u.dst.flags |= DST_NOXFRM;
1770 rth->fl.fl4_dst = daddr;
1771 rth->rt_dst = daddr;
1772 rth->fl.fl4_tos = tos;
1773 #ifdef CONFIG_IP_ROUTE_FWMARK
1774 rth->fl.fl4_fwmark= skb->nfmark;
1775 #endif
1776 rth->fl.fl4_src = saddr;
1777 rth->rt_src = saddr;
1778 rth->rt_gateway = daddr;
1779 rth->rt_iif =
1780 rth->fl.iif = in_dev->dev->ifindex;
1781 rth->u.dst.dev = (out_dev)->dev;
1782 dev_hold(rth->u.dst.dev);
1783 rth->idev = in_dev_get(rth->u.dst.dev);
1784 rth->fl.oif = 0;
1785 rth->rt_spec_dst= spec_dst;
1787 rth->u.dst.input = ip_forward;
1788 rth->u.dst.output = ip_output;
1790 rt_set_nexthop(rth, res, itag);
1792 rth->rt_flags = flags;
1794 *result = rth;
1795 err = 0;
1796 cleanup:
1797 /* release the working reference to the output device */
1798 in_dev_put(out_dev);
1799 return err;
1802 static inline int ip_mkroute_input_def(struct sk_buff *skb,
1803 struct fib_result* res,
1804 const struct flowi *fl,
1805 struct in_device *in_dev,
1806 u32 daddr, u32 saddr, u32 tos)
1808 struct rtable* rth = NULL;
1809 int err;
1810 unsigned hash;
1812 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1813 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1814 fib_select_multipath(fl, res);
1815 #endif
1817 /* create a routing cache entry */
1818 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1819 if (err)
1820 return err;
1821 atomic_set(&rth->u.dst.__refcnt, 1);
1823 /* put it into the cache */
1824 hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
1825 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1828 static inline int ip_mkroute_input(struct sk_buff *skb,
1829 struct fib_result* res,
1830 const struct flowi *fl,
1831 struct in_device *in_dev,
1832 u32 daddr, u32 saddr, u32 tos)
1834 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1835 struct rtable* rth = NULL;
1836 unsigned char hop, hopcount, lasthop;
1837 int err = -EINVAL;
1838 unsigned int hash;
1840 if (res->fi)
1841 hopcount = res->fi->fib_nhs;
1842 else
1843 hopcount = 1;
1845 lasthop = hopcount - 1;
1847 /* distinguish between multipath and singlepath */
1848 if (hopcount < 2)
1849 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1850 saddr, tos);
1852 /* add all alternatives to the routing cache */
1853 for (hop = 0; hop < hopcount; hop++) {
1854 res->nh_sel = hop;
1856 /* create a routing cache entry */
1857 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1858 &rth);
1859 if (err)
1860 return err;
1862 /* put it into the cache */
1863 hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
1864 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1865 if (err)
1866 return err;
1868 /* forward hop information to multipath impl. */
1869 multipath_set_nhinfo(rth,
1870 FIB_RES_NETWORK(*res),
1871 FIB_RES_NETMASK(*res),
1872 res->prefixlen,
1873 &FIB_RES_NH(*res));
1875 /* only for the last hop is the reference count handled
1876 * outside
1878 if (hop == lasthop)
1879 atomic_set(&(skb->dst->__refcnt), 1);
1881 return err;
1882 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
1883 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1884 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
1889 * NOTE. We drop all packets that have local source
1890 * addresses, because every properly looped-back packet
1891 * must already have the correct destination attached by the output routine.
1893 * Such an approach solves two big problems:
1894 * 1. Non-simplex devices are handled properly.
1895 * 2. IP spoofing attempts are filtered with a 100% guarantee.
1898 static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1899 u8 tos, struct net_device *dev)
1901 struct fib_result res;
1902 struct in_device *in_dev = in_dev_get(dev);
1903 struct flowi fl = { .nl_u = { .ip4_u =
1904 { .daddr = daddr,
1905 .saddr = saddr,
1906 .tos = tos,
1907 .scope = RT_SCOPE_UNIVERSE,
1908 #ifdef CONFIG_IP_ROUTE_FWMARK
1909 .fwmark = skb->nfmark
1910 #endif
1911 } },
1912 .iif = dev->ifindex };
1913 unsigned flags = 0;
1914 u32 itag = 0;
1915 struct rtable * rth;
1916 unsigned hash;
1917 u32 spec_dst;
1918 int err = -EINVAL;
1919 int free_res = 0;
1921 /* IP on this device is disabled. */
1923 if (!in_dev)
1924 goto out;
1926 /* Check for the most weird martians, which may not be detected
1927 by fib_lookup.
1930 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1931 goto martian_source;
1933 if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1934 goto brd_input;
1936 /* Accept zero addresses only for limited broadcast;
1937 * I do not even know whether to fix it or not. Waiting for complaints :-)
1939 if (ZERONET(saddr))
1940 goto martian_source;
1942 if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1943 goto martian_destination;
1946 * Now we are ready to route packet.
1948 if ((err = fib_lookup(&fl, &res)) != 0) {
1949 if (!IN_DEV_FORWARD(in_dev))
1950 goto e_hostunreach;
1951 goto no_route;
1953 free_res = 1;
1955 RT_CACHE_STAT_INC(in_slow_tot);
1957 if (res.type == RTN_BROADCAST)
1958 goto brd_input;
1960 if (res.type == RTN_LOCAL) {
1961 int result;
1962 result = fib_validate_source(saddr, daddr, tos,
1963 loopback_dev.ifindex,
1964 dev, &spec_dst, &itag);
1965 if (result < 0)
1966 goto martian_source;
1967 if (result)
1968 flags |= RTCF_DIRECTSRC;
1969 spec_dst = daddr;
1970 goto local_input;
1973 if (!IN_DEV_FORWARD(in_dev))
1974 goto e_hostunreach;
1975 if (res.type != RTN_UNICAST)
1976 goto martian_destination;
1978 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1979 if (err == -ENOBUFS)
1980 goto e_nobufs;
1981 if (err == -EINVAL)
1982 goto e_inval;
1984 done:
1985 in_dev_put(in_dev);
1986 if (free_res)
1987 fib_res_put(&res);
1988 out: return err;
1990 brd_input:
1991 if (skb->protocol != htons(ETH_P_IP))
1992 goto e_inval;
1994 if (ZERONET(saddr))
1995 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1996 else {
1997 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1998 &itag);
1999 if (err < 0)
2000 goto martian_source;
2001 if (err)
2002 flags |= RTCF_DIRECTSRC;
2004 flags |= RTCF_BROADCAST;
2005 res.type = RTN_BROADCAST;
2006 RT_CACHE_STAT_INC(in_brd);
2008 local_input:
2009 rth = dst_alloc(&ipv4_dst_ops);
2010 if (!rth)
2011 goto e_nobufs;
2013 rth->u.dst.output= ip_rt_bug;
2015 atomic_set(&rth->u.dst.__refcnt, 1);
2016 rth->u.dst.flags= DST_HOST;
2017 if (in_dev->cnf.no_policy)
2018 rth->u.dst.flags |= DST_NOPOLICY;
2019 rth->fl.fl4_dst = daddr;
2020 rth->rt_dst = daddr;
2021 rth->fl.fl4_tos = tos;
2022 #ifdef CONFIG_IP_ROUTE_FWMARK
2023 rth->fl.fl4_fwmark= skb->nfmark;
2024 #endif
2025 rth->fl.fl4_src = saddr;
2026 rth->rt_src = saddr;
2027 #ifdef CONFIG_NET_CLS_ROUTE
2028 rth->u.dst.tclassid = itag;
2029 #endif
2030 rth->rt_iif =
2031 rth->fl.iif = dev->ifindex;
2032 rth->u.dst.dev = &loopback_dev;
2033 dev_hold(rth->u.dst.dev);
2034 rth->idev = in_dev_get(rth->u.dst.dev);
2035 rth->rt_gateway = daddr;
2036 rth->rt_spec_dst= spec_dst;
2037 rth->u.dst.input= ip_local_deliver;
2038 rth->rt_flags = flags|RTCF_LOCAL;
2039 if (res.type == RTN_UNREACHABLE) {
2040 rth->u.dst.input= ip_error;
2041 rth->u.dst.error= -err;
2042 rth->rt_flags &= ~RTCF_LOCAL;
2044 rth->rt_type = res.type;
2045 hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);
2046 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2047 goto done;
2049 no_route:
2050 RT_CACHE_STAT_INC(in_no_route);
2051 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2052 res.type = RTN_UNREACHABLE;
2053 goto local_input;
2056 * Do not cache martian addresses: they should be logged (RFC1812)
2058 martian_destination:
2059 RT_CACHE_STAT_INC(in_martian_dst);
2060 #ifdef CONFIG_IP_ROUTE_VERBOSE
2061 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2062 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2063 "%u.%u.%u.%u, dev %s\n",
2064 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2065 #endif
2067 e_hostunreach:
2068 err = -EHOSTUNREACH;
2069 goto done;
2071 e_inval:
2072 err = -EINVAL;
2073 goto done;
2075 e_nobufs:
2076 err = -ENOBUFS;
2077 goto done;
2079 martian_source:
2080 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2081 goto e_inval;
2084 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
2085 u8 tos, struct net_device *dev)
2087 struct rtable * rth;
2088 unsigned hash;
2089 int iif = dev->ifindex;
2091 tos &= IPTOS_RT_MASK;
2092 hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
2094 rcu_read_lock();
2095 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2096 rth = rcu_dereference(rth->u.rt_next)) {
2097 if (rth->fl.fl4_dst == daddr &&
2098 rth->fl.fl4_src == saddr &&
2099 rth->fl.iif == iif &&
2100 rth->fl.oif == 0 &&
2101 #ifdef CONFIG_IP_ROUTE_FWMARK
2102 rth->fl.fl4_fwmark == skb->nfmark &&
2103 #endif
2104 rth->fl.fl4_tos == tos) {
2105 rth->u.dst.lastuse = jiffies;
2106 dst_hold(&rth->u.dst);
2107 rth->u.dst.__use++;
2108 RT_CACHE_STAT_INC(in_hit);
2109 rcu_read_unlock();
2110 skb->dst = (struct dst_entry*)rth;
2111 return 0;
2113 RT_CACHE_STAT_INC(in_hlist_search);
2115 rcu_read_unlock();
2117 /* Multicast recognition logic is moved from the route cache to here.
2118 The problem was that too many Ethernet cards have broken/missing
2119 hardware multicast filters :-( As a result a host on a multicast
2120 network acquires a lot of useless route cache entries, e.g. from
2121 SDR messages from all over the world. Now we try to get rid of them.
2122 Really, provided the software IP multicast filter is organized
2123 reasonably (at least, hashed), it does not cause a slowdown
2124 compared with route cache reject entries.
2125 Note that multicast routers are not affected, because a
2126 route cache entry is created eventually.
2128 if (MULTICAST(daddr)) {
2129 struct in_device *in_dev;
2131 rcu_read_lock();
2132 if ((in_dev = __in_dev_get(dev)) != NULL) {
2133 int our = ip_check_mc(in_dev, daddr, saddr,
2134 skb->nh.iph->protocol);
2135 if (our
2136 #ifdef CONFIG_IP_MROUTE
2137 || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2138 #endif
2140 rcu_read_unlock();
2141 return ip_route_input_mc(skb, daddr, saddr,
2142 tos, dev, our);
2145 rcu_read_unlock();
2146 return -EINVAL;
2148 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2151 static inline int __mkroute_output(struct rtable **result,
2152 struct fib_result* res,
2153 const struct flowi *fl,
2154 const struct flowi *oldflp,
2155 struct net_device *dev_out,
2156 unsigned flags)
2158 struct rtable *rth;
2159 struct in_device *in_dev;
2160 u32 tos = RT_FL_TOS(oldflp);
2161 int err = 0;
2163 if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2164 return -EINVAL;
2166 if (fl->fl4_dst == 0xFFFFFFFF)
2167 res->type = RTN_BROADCAST;
2168 else if (MULTICAST(fl->fl4_dst))
2169 res->type = RTN_MULTICAST;
2170 else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2171 return -EINVAL;
2173 if (dev_out->flags & IFF_LOOPBACK)
2174 flags |= RTCF_LOCAL;
2176 /* get work reference to inet device */
2177 in_dev = in_dev_get(dev_out);
2178 if (!in_dev)
2179 return -EINVAL;
2181 if (res->type == RTN_BROADCAST) {
2182 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2183 if (res->fi) {
2184 fib_info_put(res->fi);
2185 res->fi = NULL;
2186 }
2187 } else if (res->type == RTN_MULTICAST) {
2188 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2189 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2190 oldflp->proto))
2191 flags &= ~RTCF_LOCAL;
2192 /* If the multicast route does not exist, use
2193 the default one, but do not gateway in this case.
2194 Yes, it is a hack.
2195 */
2196 if (res->fi && res->prefixlen < 4) {
2197 fib_info_put(res->fi);
2198 res->fi = NULL;
2199 }
2200 }
2203 rth = dst_alloc(&ipv4_dst_ops);
2204 if (!rth) {
2205 err = -ENOBUFS;
2206 goto cleanup;
2207 }
2209 rth->u.dst.flags= DST_HOST;
2210 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2211 if (res->fi) {
2212 rth->rt_multipath_alg = res->fi->fib_mp_alg;
2213 if (res->fi->fib_nhs > 1)
2214 rth->u.dst.flags |= DST_BALANCED;
2215 }
2216 #endif
2217 if (in_dev->cnf.no_xfrm)
2218 rth->u.dst.flags |= DST_NOXFRM;
2219 if (in_dev->cnf.no_policy)
2220 rth->u.dst.flags |= DST_NOPOLICY;
2222 rth->fl.fl4_dst = oldflp->fl4_dst;
2223 rth->fl.fl4_tos = tos;
2224 rth->fl.fl4_src = oldflp->fl4_src;
2225 rth->fl.oif = oldflp->oif;
2226 #ifdef CONFIG_IP_ROUTE_FWMARK
2227 rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
2228 #endif
2229 rth->rt_dst = fl->fl4_dst;
2230 rth->rt_src = fl->fl4_src;
2231 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2232 /* get references to the devices that are to be held by the routing
2233 cache entry */
2234 rth->u.dst.dev = dev_out;
2235 dev_hold(dev_out);
2236 rth->idev = in_dev_get(dev_out);
2237 rth->rt_gateway = fl->fl4_dst;
2238 rth->rt_spec_dst= fl->fl4_src;
2240 rth->u.dst.output=ip_output;
2242 RT_CACHE_STAT_INC(out_slow_tot);
2244 if (flags & RTCF_LOCAL) {
2245 rth->u.dst.input = ip_local_deliver;
2246 rth->rt_spec_dst = fl->fl4_dst;
2247 }
2248 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2249 rth->rt_spec_dst = fl->fl4_src;
2250 if (flags & RTCF_LOCAL &&
2251 !(dev_out->flags & IFF_LOOPBACK)) {
2252 rth->u.dst.output = ip_mc_output;
2253 RT_CACHE_STAT_INC(out_slow_mc);
2254 }
2255 #ifdef CONFIG_IP_MROUTE
2256 if (res->type == RTN_MULTICAST) {
2257 if (IN_DEV_MFORWARD(in_dev) &&
2258 !LOCAL_MCAST(oldflp->fl4_dst)) {
2259 rth->u.dst.input = ip_mr_input;
2260 rth->u.dst.output = ip_mc_output;
2261 }
2262 }
2263 #endif
2264 }
2266 rt_set_nexthop(rth, res, 0);
2268 rth->rt_flags = flags;
2270 *result = rth;
2271 cleanup:
2272 /* release work reference to inet device */
2273 in_dev_put(in_dev);
2275 return err;
2276 }
2278 static inline int ip_mkroute_output_def(struct rtable **rp,
2279 struct fib_result* res,
2280 const struct flowi *fl,
2281 const struct flowi *oldflp,
2282 struct net_device *dev_out,
2283 unsigned flags)
2284 {
2285 struct rtable *rth = NULL;
2286 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2287 unsigned hash;
2288 if (err == 0) {
2289 u32 tos = RT_FL_TOS(oldflp);
2291 atomic_set(&rth->u.dst.__refcnt, 1);
2293 hash = rt_hash_code(oldflp->fl4_dst,
2294 oldflp->fl4_src ^ (oldflp->oif << 5), tos);
2295 err = rt_intern_hash(hash, rth, rp);
2296 }
2298 return err;
2299 }
2301 static inline int ip_mkroute_output(struct rtable** rp,
2302 struct fib_result* res,
2303 const struct flowi *fl,
2304 const struct flowi *oldflp,
2305 struct net_device *dev_out,
2306 unsigned flags)
2307 {
2308 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2309 u32 tos = RT_FL_TOS(oldflp);
2310 unsigned char hop;
2311 unsigned hash;
2312 int err = -EINVAL;
2313 struct rtable *rth = NULL;
2315 if (res->fi && res->fi->fib_nhs > 1) {
2316 unsigned char hopcount = res->fi->fib_nhs;
2318 for (hop = 0; hop < hopcount; hop++) {
2319 struct net_device *dev2nexthop;
2321 res->nh_sel = hop;
2323 /* hold a work reference to the output device */
2324 dev2nexthop = FIB_RES_DEV(*res);
2325 dev_hold(dev2nexthop);
2327 err = __mkroute_output(&rth, res, fl, oldflp,
2328 dev2nexthop, flags);
2330 if (err != 0)
2331 goto cleanup;
2333 hash = rt_hash_code(oldflp->fl4_dst,
2334 oldflp->fl4_src ^
2335 (oldflp->oif << 5), tos);
2336 err = rt_intern_hash(hash, rth, rp);
2338 /* forward hop information to multipath impl. */
2339 multipath_set_nhinfo(rth,
2340 FIB_RES_NETWORK(*res),
2341 FIB_RES_NETMASK(*res),
2342 res->prefixlen,
2343 &FIB_RES_NH(*res));
2344 cleanup:
2345 /* release work reference to output device */
2346 dev_put(dev2nexthop);
2348 if (err != 0)
2349 return err;
2350 }
2351 atomic_set(&(*rp)->u.dst.__refcnt, 1);
2352 return err;
2353 } else {
2354 return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
2355 flags);
2356 }
2357 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
2358 return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
2359 #endif
2360 }
2362 /*
2363 * Major route resolver routine.
2364 */
2366 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2367 {
2368 u32 tos = RT_FL_TOS(oldflp);
2369 struct flowi fl = { .nl_u = { .ip4_u =
2370 { .daddr = oldflp->fl4_dst,
2371 .saddr = oldflp->fl4_src,
2372 .tos = tos & IPTOS_RT_MASK,
2373 .scope = ((tos & RTO_ONLINK) ?
2374 RT_SCOPE_LINK :
2375 RT_SCOPE_UNIVERSE),
2376 #ifdef CONFIG_IP_ROUTE_FWMARK
2377 .fwmark = oldflp->fl4_fwmark
2378 #endif
2379 } },
2380 .iif = loopback_dev.ifindex,
2381 .oif = oldflp->oif };
2382 struct fib_result res;
2383 unsigned flags = 0;
2384 struct net_device *dev_out = NULL;
2385 int free_res = 0;
2386 int err;
2389 res.fi = NULL;
2390 #ifdef CONFIG_IP_MULTIPLE_TABLES
2391 res.r = NULL;
2392 #endif
2394 if (oldflp->fl4_src) {
2395 err = -EINVAL;
2396 if (MULTICAST(oldflp->fl4_src) ||
2397 BADCLASS(oldflp->fl4_src) ||
2398 ZERONET(oldflp->fl4_src))
2399 goto out;
2401 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2402 dev_out = ip_dev_find(oldflp->fl4_src);
2403 if (dev_out == NULL)
2404 goto out;
2406 /* I removed the check for oif == dev_out->oif here.
2407 It was wrong for two reasons:
2408 1. ip_dev_find(saddr) can return the wrong iface, if saddr is
2409 assigned to multiple interfaces.
2410 2. Moreover, we are allowed to send packets with a saddr
2411 of another iface. --ANK
2412 */
2414 if (oldflp->oif == 0
2415 && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
2416 /* Special hack: the user can direct multicasts
2417 and limited broadcasts via the necessary interface
2418 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2419 This hack is not just for fun, it allows
2420 vic, vat and friends to work.
2421 They bind the socket to loopback, set ttl to zero
2422 and expect that it will work.
2423 From the viewpoint of the routing cache they are broken,
2424 because we are not allowed to build a multicast path
2425 with a loopback source addr (look, the routing cache
2426 cannot know that ttl is zero, so that the packet
2427 will not leave this host and the route is valid).
2428 Luckily, this hack is a good workaround.
2429 */
2431 fl.oif = dev_out->ifindex;
2432 goto make_route;
2433 }
2434 if (dev_out)
2435 dev_put(dev_out);
2436 dev_out = NULL;
2437 }
2440 if (oldflp->oif) {
2441 dev_out = dev_get_by_index(oldflp->oif);
2442 err = -ENODEV;
2443 if (dev_out == NULL)
2444 goto out;
2445 if (__in_dev_get(dev_out) == NULL) {
2446 dev_put(dev_out);
2447 goto out; /* Wrong error code */
2448 }
2450 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
2451 if (!fl.fl4_src)
2452 fl.fl4_src = inet_select_addr(dev_out, 0,
2453 RT_SCOPE_LINK);
2454 goto make_route;
2455 }
2456 if (!fl.fl4_src) {
2457 if (MULTICAST(oldflp->fl4_dst))
2458 fl.fl4_src = inet_select_addr(dev_out, 0,
2459 fl.fl4_scope);
2460 else if (!oldflp->fl4_dst)
2461 fl.fl4_src = inet_select_addr(dev_out, 0,
2462 RT_SCOPE_HOST);
2463 }
2464 }
2466 if (!fl.fl4_dst) {
2467 fl.fl4_dst = fl.fl4_src;
2468 if (!fl.fl4_dst)
2469 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2470 if (dev_out)
2471 dev_put(dev_out);
2472 dev_out = &loopback_dev;
2473 dev_hold(dev_out);
2474 fl.oif = loopback_dev.ifindex;
2475 res.type = RTN_LOCAL;
2476 flags |= RTCF_LOCAL;
2477 goto make_route;
2478 }
2480 if (fib_lookup(&fl, &res)) {
2481 res.fi = NULL;
2482 if (oldflp->oif) {
2483 /* Apparently, the routing tables are wrong. Assume
2484 that the destination is on link.
2486 WHY? DW.
2487 Because we are allowed to send to an iface
2488 even if it has NO routes and NO assigned
2489 addresses. When oif is specified, routing
2490 tables are looked up with only one purpose:
2491 to catch if the destination is gatewayed, rather than
2492 direct. Moreover, if MSG_DONTROUTE is set,
2493 we send the packet, ignoring both routing tables
2494 and ifaddr state. --ANK
2497 We could do it even if oif is unknown,
2498 likely IPv6, but we do not.
2499 */
2501 if (fl.fl4_src == 0)
2502 fl.fl4_src = inet_select_addr(dev_out, 0,
2503 RT_SCOPE_LINK);
2504 res.type = RTN_UNICAST;
2505 goto make_route;
2506 }
2507 if (dev_out)
2508 dev_put(dev_out);
2509 err = -ENETUNREACH;
2510 goto out;
2511 }
2512 free_res = 1;
2514 if (res.type == RTN_LOCAL) {
2515 if (!fl.fl4_src)
2516 fl.fl4_src = fl.fl4_dst;
2517 if (dev_out)
2518 dev_put(dev_out);
2519 dev_out = &loopback_dev;
2520 dev_hold(dev_out);
2521 fl.oif = dev_out->ifindex;
2522 if (res.fi)
2523 fib_info_put(res.fi);
2524 res.fi = NULL;
2525 flags |= RTCF_LOCAL;
2526 goto make_route;
2527 }
2529 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2530 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2531 fib_select_multipath(&fl, &res);
2532 else
2533 #endif
2534 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2535 fib_select_default(&fl, &res);
2537 if (!fl.fl4_src)
2538 fl.fl4_src = FIB_RES_PREFSRC(res);
2540 if (dev_out)
2541 dev_put(dev_out);
2542 dev_out = FIB_RES_DEV(res);
2543 dev_hold(dev_out);
2544 fl.oif = dev_out->ifindex;
2547 make_route:
2548 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2551 if (free_res)
2552 fib_res_put(&res);
2553 if (dev_out)
2554 dev_put(dev_out);
2555 out: return err;
2556 }
2558 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2559 {
2560 unsigned hash;
2561 struct rtable *rth;
2563 hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);
2565 rcu_read_lock_bh();
2566 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2567 rth = rcu_dereference(rth->u.rt_next)) {
2568 if (rth->fl.fl4_dst == flp->fl4_dst &&
2569 rth->fl.fl4_src == flp->fl4_src &&
2570 rth->fl.iif == 0 &&
2571 rth->fl.oif == flp->oif &&
2572 #ifdef CONFIG_IP_ROUTE_FWMARK
2573 rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2574 #endif
2575 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2576 (IPTOS_RT_MASK | RTO_ONLINK))) {
2578 /* check for multipath routes and choose one if
2579 * necessary
2580 */
2581 if (multipath_select_route(flp, rth, rp)) {
2582 dst_hold(&(*rp)->u.dst);
2583 RT_CACHE_STAT_INC(out_hit);
2584 rcu_read_unlock_bh();
2585 return 0;
2586 }
2588 rth->u.dst.lastuse = jiffies;
2589 dst_hold(&rth->u.dst);
2590 rth->u.dst.__use++;
2591 RT_CACHE_STAT_INC(out_hit);
2592 rcu_read_unlock_bh();
2593 *rp = rth;
2594 return 0;
2595 }
2596 RT_CACHE_STAT_INC(out_hlist_search);
2597 }
2598 rcu_read_unlock_bh();
2600 return ip_route_output_slow(rp, flp);
2601 }
2603 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2604 {
2605 int err;
2607 if ((err = __ip_route_output_key(rp, flp)) != 0)
2608 return err;
2610 if (flp->proto) {
2611 if (!flp->fl4_src)
2612 flp->fl4_src = (*rp)->rt_src;
2613 if (!flp->fl4_dst)
2614 flp->fl4_dst = (*rp)->rt_dst;
2615 return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2616 }
2618 return 0;
2619 }
2621 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2622 {
2623 return ip_route_output_flow(rp, flp, NULL, 0);
2624 }
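/*
 * Illustrative sketch, not part of the kernel source: an output-path caller
 * fills a flow key and asks ip_route_output_key() for a cached or freshly
 * resolved route (protocols that need xfrm and port information use
 * ip_route_output_flow() instead).  The helper name example_route_outgoing()
 * is hypothetical.
 */
static inline int example_route_outgoing(u32 daddr, u32 saddr, int oif,
					 struct rtable **rtp)
{
	struct flowi fl = { .oif = oif,
			    .nl_u = { .ip4_u = { .daddr = daddr,
						 .saddr = saddr,
						 .tos = 0 } } };

	/* On success *rtp holds a referenced rtable; drop it with ip_rt_put(). */
	return ip_route_output_key(rtp, &fl);
}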
2626 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2627 int nowait, unsigned int flags)
2628 {
2629 struct rtable *rt = (struct rtable*)skb->dst;
2630 struct rtmsg *r;
2631 struct nlmsghdr *nlh;
2632 unsigned char *b = skb->tail;
2633 struct rta_cacheinfo ci;
2634 #ifdef CONFIG_IP_MROUTE
2635 struct rtattr *eptr;
2636 #endif
2637 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
2638 r = NLMSG_DATA(nlh);
2639 r->rtm_family = AF_INET;
2640 r->rtm_dst_len = 32;
2641 r->rtm_src_len = 0;
2642 r->rtm_tos = rt->fl.fl4_tos;
2643 r->rtm_table = RT_TABLE_MAIN;
2644 r->rtm_type = rt->rt_type;
2645 r->rtm_scope = RT_SCOPE_UNIVERSE;
2646 r->rtm_protocol = RTPROT_UNSPEC;
2647 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2648 if (rt->rt_flags & RTCF_NOTIFY)
2649 r->rtm_flags |= RTM_F_NOTIFY;
2650 RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2651 if (rt->fl.fl4_src) {
2652 r->rtm_src_len = 32;
2653 RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
2654 }
2655 if (rt->u.dst.dev)
2656 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2657 #ifdef CONFIG_NET_CLS_ROUTE
2658 if (rt->u.dst.tclassid)
2659 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2660 #endif
2661 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2662 if (rt->rt_multipath_alg != IP_MP_ALG_NONE) {
2663 __u32 alg = rt->rt_multipath_alg;
2665 RTA_PUT(skb, RTA_MP_ALGO, 4, &alg);
2666 }
2667 #endif
2668 if (rt->fl.iif)
2669 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2670 else if (rt->rt_src != rt->fl.fl4_src)
2671 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2672 if (rt->rt_dst != rt->rt_gateway)
2673 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2674 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2675 goto rtattr_failure;
2676 ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2677 ci.rta_used = rt->u.dst.__use;
2678 ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
2679 if (rt->u.dst.expires)
2680 ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2681 else
2682 ci.rta_expires = 0;
2683 ci.rta_error = rt->u.dst.error;
2684 ci.rta_id = ci.rta_ts = ci.rta_tsage = 0;
2685 if (rt->peer) {
2686 ci.rta_id = rt->peer->ip_id_count;
2687 if (rt->peer->tcp_ts_stamp) {
2688 ci.rta_ts = rt->peer->tcp_ts;
2689 ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2690 }
2691 }
2692 #ifdef CONFIG_IP_MROUTE
2693 eptr = (struct rtattr*)skb->tail;
2694 #endif
2695 RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2696 if (rt->fl.iif) {
2697 #ifdef CONFIG_IP_MROUTE
2698 u32 dst = rt->rt_dst;
2700 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2701 ipv4_devconf.mc_forwarding) {
2702 int err = ipmr_get_route(skb, r, nowait);
2703 if (err <= 0) {
2704 if (!nowait) {
2705 if (err == 0)
2706 return 0;
2707 goto nlmsg_failure;
2708 } else {
2709 if (err == -EMSGSIZE)
2710 goto nlmsg_failure;
2711 ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2712 }
2713 }
2714 } else
2715 #endif
2716 RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
2717 }
2719 nlh->nlmsg_len = skb->tail - b;
2720 return skb->len;
2722 nlmsg_failure:
2723 rtattr_failure:
2724 skb_trim(skb, b - skb->data);
2725 return -1;
2726 }
2728 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2729 {
2730 struct rtattr **rta = arg;
2731 struct rtmsg *rtm = NLMSG_DATA(nlh);
2732 struct rtable *rt = NULL;
2733 u32 dst = 0;
2734 u32 src = 0;
2735 int iif = 0;
2736 int err = -ENOBUFS;
2737 struct sk_buff *skb;
2739 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2740 if (!skb)
2741 goto out;
2743 /* Reserve room for dummy headers; this skb can pass
2744 through a good chunk of the routing engine.
2745 */
2746 skb->mac.raw = skb->data;
2747 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2749 if (rta[RTA_SRC - 1])
2750 memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2751 if (rta[RTA_DST - 1])
2752 memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2753 if (rta[RTA_IIF - 1])
2754 memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2756 if (iif) {
2757 struct net_device *dev = __dev_get_by_index(iif);
2758 err = -ENODEV;
2759 if (!dev)
2760 goto out_free;
2761 skb->protocol = htons(ETH_P_IP);
2762 skb->dev = dev;
2763 local_bh_disable();
2764 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2765 local_bh_enable();
2766 rt = (struct rtable*)skb->dst;
2767 if (!err && rt->u.dst.error)
2768 err = -rt->u.dst.error;
2769 } else {
2770 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2771 .saddr = src,
2772 .tos = rtm->rtm_tos } } };
2773 int oif = 0;
2774 if (rta[RTA_OIF - 1])
2775 memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2776 fl.oif = oif;
2777 err = ip_route_output_key(&rt, &fl);
2778 }
2779 if (err)
2780 goto out_free;
2782 skb->dst = &rt->u.dst;
2783 if (rtm->rtm_flags & RTM_F_NOTIFY)
2784 rt->rt_flags |= RTCF_NOTIFY;
2786 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2788 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2789 RTM_NEWROUTE, 0, 0);
2790 if (!err)
2791 goto out_free;
2792 if (err < 0) {
2793 err = -EMSGSIZE;
2794 goto out_free;
2795 }
2797 err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2798 if (err > 0)
2799 err = 0;
2800 out: return err;
2802 out_free:
2803 kfree_skb(skb);
2804 goto out;
2805 }
2807 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2808 {
2809 struct rtable *rt;
2810 int h, s_h;
2811 int idx, s_idx;
2813 s_h = cb->args[0];
2814 s_idx = idx = cb->args[1];
2815 for (h = 0; h <= rt_hash_mask; h++) {
2816 if (h < s_h) continue;
2817 if (h > s_h)
2818 s_idx = 0;
2819 rcu_read_lock_bh();
2820 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2821 rt = rcu_dereference(rt->u.rt_next), idx++) {
2822 if (idx < s_idx)
2823 continue;
2824 skb->dst = dst_clone(&rt->u.dst);
2825 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2826 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2827 1, NLM_F_MULTI) <= 0) {
2828 dst_release(xchg(&skb->dst, NULL));
2829 rcu_read_unlock_bh();
2830 goto done;
2831 }
2832 dst_release(xchg(&skb->dst, NULL));
2833 }
2834 rcu_read_unlock_bh();
2835 }
2837 done:
2838 cb->args[0] = h;
2839 cb->args[1] = idx;
2840 return skb->len;
2841 }
2843 void ip_rt_multicast_event(struct in_device *in_dev)
2844 {
2845 rt_cache_flush(0);
2846 }
2848 #ifdef CONFIG_SYSCTL
2849 static int flush_delay;
2851 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2852 struct file *filp, void __user *buffer,
2853 size_t *lenp, loff_t *ppos)
2854 {
2855 if (write) {
2856 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2857 rt_cache_flush(flush_delay);
2858 return 0;
2859 }
2861 return -EINVAL;
2862 }
2864 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2865 int __user *name,
2866 int nlen,
2867 void __user *oldval,
2868 size_t __user *oldlenp,
2869 void __user *newval,
2870 size_t newlen,
2871 void **context)
2872 {
2873 int delay;
2874 if (newlen != sizeof(int))
2875 return -EINVAL;
2876 if (get_user(delay, (int __user *)newval))
2877 return -EFAULT;
2878 rt_cache_flush(delay);
2879 return 0;
2880 }
2882 ctl_table ipv4_route_table[] = {
2883 {
2884 .ctl_name = NET_IPV4_ROUTE_FLUSH,
2885 .procname = "flush",
2886 .data = &flush_delay,
2887 .maxlen = sizeof(int),
2888 .mode = 0200,
2889 .proc_handler = &ipv4_sysctl_rtcache_flush,
2890 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
2893 .ctl_name = NET_IPV4_ROUTE_MIN_DELAY,
2894 .procname = "min_delay",
2895 .data = &ip_rt_min_delay,
2896 .maxlen = sizeof(int),
2897 .mode = 0644,
2898 .proc_handler = &proc_dointvec_jiffies,
2899 .strategy = &sysctl_jiffies,
2902 .ctl_name = NET_IPV4_ROUTE_MAX_DELAY,
2903 .procname = "max_delay",
2904 .data = &ip_rt_max_delay,
2905 .maxlen = sizeof(int),
2906 .mode = 0644,
2907 .proc_handler = &proc_dointvec_jiffies,
2908 .strategy = &sysctl_jiffies,
2911 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2912 .procname = "gc_thresh",
2913 .data = &ipv4_dst_ops.gc_thresh,
2914 .maxlen = sizeof(int),
2915 .mode = 0644,
2916 .proc_handler = &proc_dointvec,
2919 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2920 .procname = "max_size",
2921 .data = &ip_rt_max_size,
2922 .maxlen = sizeof(int),
2923 .mode = 0644,
2924 .proc_handler = &proc_dointvec,
2927 /* Deprecated. Use gc_min_interval_ms */
2929 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2930 .procname = "gc_min_interval",
2931 .data = &ip_rt_gc_min_interval,
2932 .maxlen = sizeof(int),
2933 .mode = 0644,
2934 .proc_handler = &proc_dointvec_jiffies,
2935 .strategy = &sysctl_jiffies,
2938 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2939 .procname = "gc_min_interval_ms",
2940 .data = &ip_rt_gc_min_interval,
2941 .maxlen = sizeof(int),
2942 .mode = 0644,
2943 .proc_handler = &proc_dointvec_ms_jiffies,
2944 .strategy = &sysctl_ms_jiffies,
2947 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
2948 .procname = "gc_timeout",
2949 .data = &ip_rt_gc_timeout,
2950 .maxlen = sizeof(int),
2951 .mode = 0644,
2952 .proc_handler = &proc_dointvec_jiffies,
2953 .strategy = &sysctl_jiffies,
2956 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
2957 .procname = "gc_interval",
2958 .data = &ip_rt_gc_interval,
2959 .maxlen = sizeof(int),
2960 .mode = 0644,
2961 .proc_handler = &proc_dointvec_jiffies,
2962 .strategy = &sysctl_jiffies,
2965 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
2966 .procname = "redirect_load",
2967 .data = &ip_rt_redirect_load,
2968 .maxlen = sizeof(int),
2969 .mode = 0644,
2970 .proc_handler = &proc_dointvec,
2973 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2974 .procname = "redirect_number",
2975 .data = &ip_rt_redirect_number,
2976 .maxlen = sizeof(int),
2977 .mode = 0644,
2978 .proc_handler = &proc_dointvec,
2981 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2982 .procname = "redirect_silence",
2983 .data = &ip_rt_redirect_silence,
2984 .maxlen = sizeof(int),
2985 .mode = 0644,
2986 .proc_handler = &proc_dointvec,
2989 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
2990 .procname = "error_cost",
2991 .data = &ip_rt_error_cost,
2992 .maxlen = sizeof(int),
2993 .mode = 0644,
2994 .proc_handler = &proc_dointvec,
2997 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
2998 .procname = "error_burst",
2999 .data = &ip_rt_error_burst,
3000 .maxlen = sizeof(int),
3001 .mode = 0644,
3002 .proc_handler = &proc_dointvec,
3005 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
3006 .procname = "gc_elasticity",
3007 .data = &ip_rt_gc_elasticity,
3008 .maxlen = sizeof(int),
3009 .mode = 0644,
3010 .proc_handler = &proc_dointvec,
3013 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
3014 .procname = "mtu_expires",
3015 .data = &ip_rt_mtu_expires,
3016 .maxlen = sizeof(int),
3017 .mode = 0644,
3018 .proc_handler = &proc_dointvec_jiffies,
3019 .strategy = &sysctl_jiffies,
3022 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
3023 .procname = "min_pmtu",
3024 .data = &ip_rt_min_pmtu,
3025 .maxlen = sizeof(int),
3026 .mode = 0644,
3027 .proc_handler = &proc_dointvec,
3030 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
3031 .procname = "min_adv_mss",
3032 .data = &ip_rt_min_advmss,
3033 .maxlen = sizeof(int),
3034 .mode = 0644,
3035 .proc_handler = &proc_dointvec,
3038 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
3039 .procname = "secret_interval",
3040 .data = &ip_rt_secret_interval,
3041 .maxlen = sizeof(int),
3042 .mode = 0644,
3043 .proc_handler = &proc_dointvec_jiffies,
3044 .strategy = &sysctl_jiffies,
3046 { .ctl_name = 0 }
3047 };
3048 #endif
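/*
 * Note (not part of the kernel source): assuming the usual registration of
 * ipv4_route_table under net.ipv4.route, these knobs appear as files in
 * /proc/sys/net/ipv4/route/.  Writing a delay to the "flush" entry runs
 * ipv4_sysctl_rtcache_flush() and therefore rt_cache_flush(), e.g.:
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 */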
3050 #ifdef CONFIG_NET_CLS_ROUTE
3051 struct ip_rt_acct *ip_rt_acct;
3053 /* This code sucks. But you should have seen it before! --RR */
3055 /* IP route accounting ptr for this logical cpu number. */
3056 #define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
3058 #ifdef CONFIG_PROC_FS
3059 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3060 int length, int *eof, void *data)
3061 {
3062 unsigned int i;
3064 if ((offset & 3) || (length & 3))
3065 return -EIO;
3067 if (offset >= sizeof(struct ip_rt_acct) * 256) {
3068 *eof = 1;
3069 return 0;
3070 }
3072 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3073 length = sizeof(struct ip_rt_acct) * 256 - offset;
3074 *eof = 1;
3075 }
3077 offset /= sizeof(u32);
3079 if (length > 0) {
3080 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3081 u32 *dst = (u32 *) buffer;
3083 /* Copy first cpu. */
3084 *start = buffer;
3085 memcpy(dst, src, length);
3087 /* Add the other cpus in, one int at a time */
3088 for_each_cpu(i) {
3089 unsigned int j;
3091 src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3093 for (j = 0; j < length/4; j++)
3094 dst[j] += src[j];
3095 }
3096 }
3097 return length;
3098 }
3099 #endif /* CONFIG_PROC_FS */
3100 #endif /* CONFIG_NET_CLS_ROUTE */
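/*
 * Note (not part of the kernel source): ip_rt_acct_read() backs the
 * /proc/net/rt_acct entry created in ip_rt_init() below; it copies CPU 0's
 * 256 accounting slots and then adds every other CPU's counters on top, so a
 * reader sees one aggregated struct ip_rt_acct table.
 */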
3102 static __initdata unsigned long rhash_entries;
3103 static int __init set_rhash_entries(char *str)
3104 {
3105 if (!str)
3106 return 0;
3107 rhash_entries = simple_strtoul(str, &str, 0);
3108 return 1;
3109 }
3110 __setup("rhash_entries=", set_rhash_entries);
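/*
 * Note (not part of the kernel source): rhash_entries is a boot parameter;
 * for example, booting with "rhash_entries=65536" overrides the automatic
 * sizing of the route-cache hash table that ip_rt_init() below requests from
 * alloc_large_system_hash().
 */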
3112 int __init ip_rt_init(void)
3113 {
3114 int rc = 0;
3116 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3117 (jiffies ^ (jiffies >> 7)));
3119 #ifdef CONFIG_NET_CLS_ROUTE
3120 {
3121 int order;
3122 for (order = 0;
3123 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3124 /* NOTHING */;
3125 ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3126 if (!ip_rt_acct)
3127 panic("IP: failed to allocate ip_rt_acct\n");
3128 memset(ip_rt_acct, 0, PAGE_SIZE << order);
3129 }
3130 #endif
3132 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
3133 sizeof(struct rtable),
3134 0, SLAB_HWCACHE_ALIGN,
3135 NULL, NULL);
3137 if (!ipv4_dst_ops.kmem_cachep)
3138 panic("IP: failed to allocate ip_dst_cache\n");
3140 rt_hash_table = (struct rt_hash_bucket *)
3141 alloc_large_system_hash("IP route cache",
3142 sizeof(struct rt_hash_bucket),
3143 rhash_entries,
3144 (num_physpages >= 128 * 1024) ?
3145 (27 - PAGE_SHIFT) :
3146 (29 - PAGE_SHIFT),
3147 HASH_HIGHMEM,
3148 &rt_hash_log,
3149 &rt_hash_mask,
3150 0);
3151 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3152 rt_hash_lock_init();
3154 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3155 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3157 rt_cache_stat = alloc_percpu(struct rt_cache_stat);
3158 if (!rt_cache_stat)
3159 return -ENOMEM;
3161 devinet_init();
3162 ip_fib_init();
3164 init_timer(&rt_flush_timer);
3165 rt_flush_timer.function = rt_run_flush;
3166 init_timer(&rt_periodic_timer);
3167 rt_periodic_timer.function = rt_check_expire;
3168 init_timer(&rt_secret_timer);
3169 rt_secret_timer.function = rt_secret_rebuild;
3171 /* All the timers started at system startup tend
3172 to synchronize. Perturb it a bit.
3173 */
3174 rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3175 ip_rt_gc_interval;
3176 add_timer(&rt_periodic_timer);
3178 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3179 ip_rt_secret_interval;
3180 add_timer(&rt_secret_timer);
3182 #ifdef CONFIG_PROC_FS
3183 {
3184 struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3185 if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3186 !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3187 proc_net_stat))) {
3188 free_percpu(rt_cache_stat);
3189 return -ENOMEM;
3190 }
3191 rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3192 }
3193 #ifdef CONFIG_NET_CLS_ROUTE
3194 create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3195 #endif
3196 #endif
3197 #ifdef CONFIG_XFRM
3198 xfrm_init();
3199 xfrm4_init();
3200 #endif
3201 return rc;
3202 }
3204 EXPORT_SYMBOL(__ip_select_ident);
3205 EXPORT_SYMBOL(ip_route_input);
3206 EXPORT_SYMBOL(ip_route_output_key);