net/ipv4/route.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
8 * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
16 * Fixes:
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
23 * Alan Cox : Super /proc >4K
24 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
26 * clamper.
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36 * Alan Cox : Aligned routing errors more closely with BSD
37 * our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
42 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Split to fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file
57 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
58 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
59 * Ilia Sotnikov : Removed TOS from hash calculations
61 * This program is free software; you can redistribute it and/or
62 * modify it under the terms of the GNU General Public License
63 * as published by the Free Software Foundation; either version
64 * 2 of the License, or (at your option) any later version.
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/mm.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
79 #include <linux/in.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/workqueue.h>
85 #include <linux/skbuff.h>
86 #include <linux/inetdevice.h>
87 #include <linux/igmp.h>
88 #include <linux/pkt_sched.h>
89 #include <linux/mroute.h>
90 #include <linux/netfilter_ipv4.h>
91 #include <linux/random.h>
92 #include <linux/jhash.h>
93 #include <linux/rcupdate.h>
94 #include <linux/times.h>
95 #include <net/dst.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
98 #include <net/ip.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
103 #include <net/arp.h>
104 #include <net/tcp.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
113 #define RT_FL_TOS(oldflp) \
114 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
116 #define IP_MAX_MTU 0xFFF0
118 #define RT_GC_TIMEOUT (300*HZ)
120 static int ip_rt_min_delay = 2 * HZ;
121 static int ip_rt_max_delay = 10 * HZ;
122 static int ip_rt_max_size;
123 static int ip_rt_gc_timeout = RT_GC_TIMEOUT;
124 static int ip_rt_gc_interval = 60 * HZ;
125 static int ip_rt_gc_min_interval = HZ / 2;
126 static int ip_rt_redirect_number = 9;
127 static int ip_rt_redirect_load = HZ / 50;
128 static int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1));
129 static int ip_rt_error_cost = HZ;
130 static int ip_rt_error_burst = 5 * HZ;
131 static int ip_rt_gc_elasticity = 8;
132 static int ip_rt_mtu_expires = 10 * 60 * HZ;
133 static int ip_rt_min_pmtu = 512 + 20 + 20;
134 static int ip_rt_min_advmss = 256;
135 static int ip_rt_secret_interval = 10 * 60 * HZ;
136 static int ip_rt_flush_expected;
137 static unsigned long rt_deadline;
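/*
 * Note: most of the ip_rt_* tunables above are normally exposed as
 * net.ipv4.route.* sysctls; the values here are only boot-time defaults.
 * The flush timer/worker state below implements the delayed cache flush
 * driven by rt_cache_flush() and the periodic secret rebuild.
 */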
139 #define RTprint(a...) printk(KERN_DEBUG a)
141 static struct timer_list rt_flush_timer;
142 static void rt_worker_func(struct work_struct *work);
143 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
144 static struct timer_list rt_secret_timer;
147 * Interface to generic destination cache.
150 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
151 static void ipv4_dst_destroy(struct dst_entry *dst);
152 static void ipv4_dst_ifdown(struct dst_entry *dst,
153 struct net_device *dev, int how);
154 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
155 static void ipv4_link_failure(struct sk_buff *skb);
156 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
157 static int rt_garbage_collect(void);
160 static struct dst_ops ipv4_dst_ops = {
161 .family = AF_INET,
162 .protocol = __constant_htons(ETH_P_IP),
163 .gc = rt_garbage_collect,
164 .check = ipv4_dst_check,
165 .destroy = ipv4_dst_destroy,
166 .ifdown = ipv4_dst_ifdown,
167 .negative_advice = ipv4_negative_advice,
168 .link_failure = ipv4_link_failure,
169 .update_pmtu = ip_rt_update_pmtu,
170 .local_out = ip_local_out,
171 .entry_size = sizeof(struct rtable),
174 #define ECN_OR_COST(class) TC_PRIO_##class
176 const __u8 ip_tos2prio[16] = {
177 TC_PRIO_BESTEFFORT,
178 ECN_OR_COST(FILLER),
179 TC_PRIO_BESTEFFORT,
180 ECN_OR_COST(BESTEFFORT),
181 TC_PRIO_BULK,
182 ECN_OR_COST(BULK),
183 TC_PRIO_BULK,
184 ECN_OR_COST(BULK),
185 TC_PRIO_INTERACTIVE,
186 ECN_OR_COST(INTERACTIVE),
187 TC_PRIO_INTERACTIVE,
188 ECN_OR_COST(INTERACTIVE),
189 TC_PRIO_INTERACTIVE_BULK,
190 ECN_OR_COST(INTERACTIVE_BULK),
191 TC_PRIO_INTERACTIVE_BULK,
192 ECN_OR_COST(INTERACTIVE_BULK)
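/*
 * ip_tos2prio[] maps the 4-bit IPv4 TOS field to a packet scheduler
 * priority band; ECN_OR_COST(class) currently expands to plain
 * TC_PRIO_##class, so each pair of entries is identical today.  The
 * usual consumer is rt_tos2priority() in <net/route.h>, which (in its
 * customary definition) does something like
 *
 *	prio = ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *
 * i.e. the TOS nibble selects the entry, with the lowest bit of the
 * index corresponding to the "minimize monetary cost"/ECN bit.
 */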
197 * Route cache.
200 /* The locking scheme is rather straightforward:
202 * 1) Read-Copy Update protects the buckets of the central route hash.
203 * 2) Only writers remove entries, and they hold the lock
204 * as they look at rtable reference counts.
205 * 3) Only readers acquire references to rtable entries,
206 * they do so with atomic increments and with the
207 * lock held.
210 struct rt_hash_bucket {
211 struct rtable *chain;
213 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
214 defined(CONFIG_PROVE_LOCKING)
216 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
217 * The size of this table is a power of two and depends on the number of CPUs.
218 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
220 #ifdef CONFIG_LOCKDEP
221 # define RT_HASH_LOCK_SZ 256
222 #else
223 # if NR_CPUS >= 32
224 # define RT_HASH_LOCK_SZ 4096
225 # elif NR_CPUS >= 16
226 # define RT_HASH_LOCK_SZ 2048
227 # elif NR_CPUS >= 8
228 # define RT_HASH_LOCK_SZ 1024
229 # elif NR_CPUS >= 4
230 # define RT_HASH_LOCK_SZ 512
231 # else
232 # define RT_HASH_LOCK_SZ 256
233 # endif
234 #endif
236 static spinlock_t *rt_hash_locks;
237 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
238 # define rt_hash_lock_init() { \
239 int i; \
240 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
241 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
242 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
243 spin_lock_init(&rt_hash_locks[i]); \
245 #else
246 # define rt_hash_lock_addr(slot) NULL
247 # define rt_hash_lock_init()
248 #endif
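/*
 * rt_hash_lock_addr() stripes the hash buckets over RT_HASH_LOCK_SZ
 * spinlocks: several buckets share one lock, chosen by the low bits of
 * the bucket index, which keeps memory use bounded while still spreading
 * writer contention.  On UP kernels without spinlock debugging the whole
 * scheme compiles away (rt_hash_lock_addr() is NULL, init is a no-op).
 */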
250 static struct rt_hash_bucket *rt_hash_table;
251 static unsigned rt_hash_mask;
252 static unsigned int rt_hash_log;
253 static unsigned int rt_hash_rnd;
255 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
256 #define RT_CACHE_STAT_INC(field) \
257 (__raw_get_cpu_var(rt_cache_stat).field++)
259 static int rt_intern_hash(unsigned hash, struct rtable *rth,
260 struct rtable **res);
262 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
264 return (jhash_2words(daddr, saddr, rt_hash_rnd)
265 & rt_hash_mask);
268 #define rt_hash(daddr, saddr, idx) \
269 rt_hash_code((__force u32)(__be32)(daddr),\
270 (__force u32)(__be32)(saddr) ^ ((idx) << 5))
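/*
 * The cache hash mixes daddr, saddr and (for output routes) the interface
 * index with the random key rt_hash_rnd through jhash_2words(), masked
 * down to the table size, e.g. hash = rt_hash(daddr, saddr, fl->oif).
 * The key is re-seeded periodically (see rt_secret_rebuild()), the idea
 * being that an attacker cannot rely on a fixed hash function to build
 * pathologically long collision chains.
 */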
272 #ifdef CONFIG_PROC_FS
273 struct rt_cache_iter_state {
274 int bucket;
277 static struct rtable *rt_cache_get_first(struct seq_file *seq)
279 struct rtable *r = NULL;
280 struct rt_cache_iter_state *st = seq->private;
282 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
283 rcu_read_lock_bh();
284 r = rt_hash_table[st->bucket].chain;
285 if (r)
286 break;
287 rcu_read_unlock_bh();
289 return rcu_dereference(r);
292 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
294 struct rt_cache_iter_state *st = seq->private;
296 r = r->u.dst.rt_next;
297 while (!r) {
298 rcu_read_unlock_bh();
299 if (--st->bucket < 0)
300 break;
301 rcu_read_lock_bh();
302 r = rt_hash_table[st->bucket].chain;
304 return rcu_dereference(r);
307 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
309 struct rtable *r = rt_cache_get_first(seq);
311 if (r)
312 while (pos && (r = rt_cache_get_next(seq, r)))
313 --pos;
314 return pos ? NULL : r;
317 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
319 return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
322 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
324 struct rtable *r = NULL;
326 if (v == SEQ_START_TOKEN)
327 r = rt_cache_get_first(seq);
328 else
329 r = rt_cache_get_next(seq, v);
330 ++*pos;
331 return r;
334 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
336 if (v && v != SEQ_START_TOKEN)
337 rcu_read_unlock_bh();
340 static int rt_cache_seq_show(struct seq_file *seq, void *v)
342 if (v == SEQ_START_TOKEN)
343 seq_printf(seq, "%-127s\n",
344 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
345 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
346 "HHUptod\tSpecDst");
347 else {
348 struct rtable *r = v;
349 char temp[256];
351 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
352 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
353 r->u.dst.dev ? r->u.dst.dev->name : "*",
354 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
355 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
356 r->u.dst.__use, 0, (unsigned long)r->rt_src,
357 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
358 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
359 dst_metric(&r->u.dst, RTAX_WINDOW),
360 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
361 dst_metric(&r->u.dst, RTAX_RTTVAR)),
362 r->fl.fl4_tos,
363 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
364 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
365 dev_queue_xmit) : 0,
366 r->rt_spec_dst);
367 seq_printf(seq, "%-127s\n", temp);
369 return 0;
372 static const struct seq_operations rt_cache_seq_ops = {
373 .start = rt_cache_seq_start,
374 .next = rt_cache_seq_next,
375 .stop = rt_cache_seq_stop,
376 .show = rt_cache_seq_show,
379 static int rt_cache_seq_open(struct inode *inode, struct file *file)
381 return seq_open_private(file, &rt_cache_seq_ops,
382 sizeof(struct rt_cache_iter_state));
385 static const struct file_operations rt_cache_seq_fops = {
386 .owner = THIS_MODULE,
387 .open = rt_cache_seq_open,
388 .read = seq_read,
389 .llseek = seq_lseek,
390 .release = seq_release_private,
394 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
396 int cpu;
398 if (*pos == 0)
399 return SEQ_START_TOKEN;
401 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
402 if (!cpu_possible(cpu))
403 continue;
404 *pos = cpu+1;
405 return &per_cpu(rt_cache_stat, cpu);
407 return NULL;
410 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
412 int cpu;
414 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
415 if (!cpu_possible(cpu))
416 continue;
417 *pos = cpu+1;
418 return &per_cpu(rt_cache_stat, cpu);
420 return NULL;
424 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
429 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
431 struct rt_cache_stat *st = v;
433 if (v == SEQ_START_TOKEN) {
434 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
435 return 0;
438 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
439 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
440 atomic_read(&ipv4_dst_ops.entries),
441 st->in_hit,
442 st->in_slow_tot,
443 st->in_slow_mc,
444 st->in_no_route,
445 st->in_brd,
446 st->in_martian_dst,
447 st->in_martian_src,
449 st->out_hit,
450 st->out_slow_tot,
451 st->out_slow_mc,
453 st->gc_total,
454 st->gc_ignored,
455 st->gc_goal_miss,
456 st->gc_dst_overflow,
457 st->in_hlist_search,
458 st->out_hlist_search
460 return 0;
463 static const struct seq_operations rt_cpu_seq_ops = {
464 .start = rt_cpu_seq_start,
465 .next = rt_cpu_seq_next,
466 .stop = rt_cpu_seq_stop,
467 .show = rt_cpu_seq_show,
471 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
473 return seq_open(file, &rt_cpu_seq_ops);
476 static const struct file_operations rt_cpu_seq_fops = {
477 .owner = THIS_MODULE,
478 .open = rt_cpu_seq_open,
479 .read = seq_read,
480 .llseek = seq_lseek,
481 .release = seq_release,
484 #endif /* CONFIG_PROC_FS */
486 static __inline__ void rt_free(struct rtable *rt)
488 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
491 static __inline__ void rt_drop(struct rtable *rt)
493 ip_rt_put(rt);
494 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
497 static __inline__ int rt_fast_clean(struct rtable *rth)
499 /* Kill broadcast/multicast entries very aggressively, if they
500 collide in hash table with more useful entries */
501 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
502 rth->fl.iif && rth->u.dst.rt_next;
505 static __inline__ int rt_valuable(struct rtable *rth)
507 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
508 rth->u.dst.expires;
511 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
513 unsigned long age;
514 int ret = 0;
516 if (atomic_read(&rth->u.dst.__refcnt))
517 goto out;
519 ret = 1;
520 if (rth->u.dst.expires &&
521 time_after_eq(jiffies, rth->u.dst.expires))
522 goto out;
524 age = jiffies - rth->u.dst.lastuse;
525 ret = 0;
526 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
527 (age <= tmo2 && rt_valuable(rth)))
528 goto out;
529 ret = 1;
530 out: return ret;
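/*
 * rt_may_expire() returns true when an entry may be dropped: never while
 * it is still referenced, always once its hard expiry has passed, and
 * otherwise only when its idle age exceeds tmo1 (broadcast/multicast
 * input entries sharing a hash chain get no such grace) and, for
 * "valuable" entries (redirected/notify/with an expiry), tmo2 as well.
 */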
533 /* Bits of score are:
534 * 31: very valuable
535 * 30: not quite useless
536 * 29..0: usage counter
538 static inline u32 rt_score(struct rtable *rt)
540 u32 score = jiffies - rt->u.dst.lastuse;
542 score = ~score & ~(3<<30);
544 if (rt_valuable(rt))
545 score |= (1<<31);
547 if (!rt->fl.iif ||
548 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
549 score |= (1<<30);
551 return score;
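/*
 * A higher score means more worth keeping: bit 31 marks "valuable"
 * entries, bit 30 output routes (or any entry that is not
 * broadcast/multicast/local), and the low bits encode recency of use.
 * rt_intern_hash() below evicts the unreferenced entry with the lowest
 * score once a chain grows beyond ip_rt_gc_elasticity.
 */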
554 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
556 return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
557 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
558 (fl1->mark ^ fl2->mark) |
559 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
560 *(u16 *)&fl2->nl_u.ip4_u.tos) |
561 (fl1->oif ^ fl2->oif) |
562 (fl1->iif ^ fl2->iif)) == 0;
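/*
 * Branch-free key comparison: XOR each pair of fields and OR the
 * results; the total is zero iff every field matches.  Note that the
 * 16-bit load starting at fl4_tos compares the TOS byte together with
 * the byte that follows it in the flowi (the scope field in this layout).
 */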
566 * Perform a full scan of hash table and free all entries.
567 * Can be called by a softirq or a process.
568 * In the latter case, we want to reschedule if necessary.
570 static void rt_do_flush(int process_context)
572 unsigned int i;
573 struct rtable *rth, *next;
575 for (i = 0; i <= rt_hash_mask; i++) {
576 if (process_context && need_resched())
577 cond_resched();
578 rth = rt_hash_table[i].chain;
579 if (!rth)
580 continue;
582 spin_lock_bh(rt_hash_lock_addr(i));
583 rth = rt_hash_table[i].chain;
584 rt_hash_table[i].chain = NULL;
585 spin_unlock_bh(rt_hash_lock_addr(i));
587 for (; rth; rth = next) {
588 next = rth->u.dst.rt_next;
589 rt_free(rth);
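/*
 * Each chain is detached under its bucket lock and then freed outside
 * the lock; rt_free() defers the actual release through call_rcu_bh(),
 * so lockless readers still walking the old chain remain safe until a
 * grace period has elapsed.
 */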
594 static void rt_check_expire(void)
596 static unsigned int rover;
597 unsigned int i = rover, goal;
598 struct rtable *rth, **rthp;
599 u64 mult;
601 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
602 if (ip_rt_gc_timeout > 1)
603 do_div(mult, ip_rt_gc_timeout);
604 goal = (unsigned int)mult;
605 if (goal > rt_hash_mask)
606 goal = rt_hash_mask + 1;
607 for (; goal > 0; goal--) {
608 unsigned long tmo = ip_rt_gc_timeout;
610 i = (i + 1) & rt_hash_mask;
611 rthp = &rt_hash_table[i].chain;
613 if (need_resched())
614 cond_resched();
616 if (*rthp == NULL)
617 continue;
618 spin_lock_bh(rt_hash_lock_addr(i));
619 while ((rth = *rthp) != NULL) {
620 if (rth->u.dst.expires) {
621 /* Entry is expired even if it is in use */
622 if (time_before_eq(jiffies, rth->u.dst.expires)) {
623 tmo >>= 1;
624 rthp = &rth->u.dst.rt_next;
625 continue;
627 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
628 tmo >>= 1;
629 rthp = &rth->u.dst.rt_next;
630 continue;
633 /* Cleanup aged off entries. */
634 *rthp = rth->u.dst.rt_next;
635 rt_free(rth);
637 spin_unlock_bh(rt_hash_lock_addr(i));
639 rover = i;
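/*
 * Each pass scans roughly (ip_rt_gc_interval / ip_rt_gc_timeout) of the
 * table, so with the worker running every ip_rt_gc_interval the whole
 * table is visited about once per ip_rt_gc_timeout.  "rover" remembers
 * where the last pass stopped, and tmo is halved for every entry kept,
 * so crowded chains are pruned progressively harder towards their tail.
 */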
643 * rt_worker_func() is run in process context.
644 * If a whole flush was scheduled, it is done.
645 * Else, we call rt_check_expire() to scan part of the hash table
647 static void rt_worker_func(struct work_struct *work)
649 if (ip_rt_flush_expected) {
650 ip_rt_flush_expected = 0;
651 rt_do_flush(1);
652 } else
653 rt_check_expire();
654 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
657 /* This can run from both BH and non-BH contexts, the latter
658 * in the case of a forced flush event.
660 static void rt_run_flush(unsigned long process_context)
662 rt_deadline = 0;
664 get_random_bytes(&rt_hash_rnd, 4);
666 rt_do_flush(process_context);
669 static DEFINE_SPINLOCK(rt_flush_lock);
671 void rt_cache_flush(int delay)
673 unsigned long now = jiffies;
674 int user_mode = !in_softirq();
676 if (delay < 0)
677 delay = ip_rt_min_delay;
679 spin_lock_bh(&rt_flush_lock);
681 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
682 long tmo = (long)(rt_deadline - now);
684 /* If flush timer is already running
685 and flush request is not immediate (delay > 0):
687 if the deadline has not been reached, prolong the timer to "delay",
688 otherwise fire it at deadline time.
691 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
692 tmo = 0;
694 if (delay > tmo)
695 delay = tmo;
698 if (delay <= 0) {
699 spin_unlock_bh(&rt_flush_lock);
700 rt_run_flush(user_mode);
701 return;
704 if (rt_deadline == 0)
705 rt_deadline = now + ip_rt_max_delay;
707 mod_timer(&rt_flush_timer, now+delay);
708 spin_unlock_bh(&rt_flush_lock);
712 * We change rt_hash_rnd and ask next rt_worker_func() invocation
713 * to perform a flush in process context
715 static void rt_secret_rebuild(unsigned long dummy)
717 get_random_bytes(&rt_hash_rnd, 4);
718 ip_rt_flush_expected = 1;
719 cancel_delayed_work(&expires_work);
720 schedule_delayed_work(&expires_work, HZ/10);
721 mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
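/*
 * Re-seeding rt_hash_rnd invalidates the placement of every cached
 * entry, so a full flush is requested from the worker (in process
 * context) right after the new seed is chosen.  The periodic re-keying,
 * every ip_rt_secret_interval (10 minutes by default), also limits how
 * long a deliberately crafted set of colliding flows can stay effective.
 */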
725 Short description of GC goals.
727 We want to build an algorithm which keeps the routing cache
728 at an equilibrium point, where the number of aged-off entries
729 stays approximately equal to the number of newly generated ones.
731 The current expiration strength is the variable "expire".
732 We try to adjust it dynamically, so that when the network
733 is idle "expire" is large enough to keep enough warm entries,
734 and when the load increases it shrinks to limit the cache size.
737 static int rt_garbage_collect(void)
739 static unsigned long expire = RT_GC_TIMEOUT;
740 static unsigned long last_gc;
741 static int rover;
742 static int equilibrium;
743 struct rtable *rth, **rthp;
744 unsigned long now = jiffies;
745 int goal;
748 * Garbage collection is pretty expensive,
749 * do not make it too frequently.
752 RT_CACHE_STAT_INC(gc_total);
754 if (now - last_gc < ip_rt_gc_min_interval &&
755 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
756 RT_CACHE_STAT_INC(gc_ignored);
757 goto out;
760 /* Calculate number of entries, which we want to expire now. */
761 goal = atomic_read(&ipv4_dst_ops.entries) -
762 (ip_rt_gc_elasticity << rt_hash_log);
763 if (goal <= 0) {
764 if (equilibrium < ipv4_dst_ops.gc_thresh)
765 equilibrium = ipv4_dst_ops.gc_thresh;
766 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
767 if (goal > 0) {
768 equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
769 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
771 } else {
772 /* We are in dangerous area. Try to reduce cache really
773 * aggressively.
775 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
776 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
779 if (now - last_gc >= ip_rt_gc_min_interval)
780 last_gc = now;
782 if (goal <= 0) {
783 equilibrium += goal;
784 goto work_done;
787 do {
788 int i, k;
790 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
791 unsigned long tmo = expire;
793 k = (k + 1) & rt_hash_mask;
794 rthp = &rt_hash_table[k].chain;
795 spin_lock_bh(rt_hash_lock_addr(k));
796 while ((rth = *rthp) != NULL) {
797 if (!rt_may_expire(rth, tmo, expire)) {
798 tmo >>= 1;
799 rthp = &rth->u.dst.rt_next;
800 continue;
802 *rthp = rth->u.dst.rt_next;
803 rt_free(rth);
804 goal--;
806 spin_unlock_bh(rt_hash_lock_addr(k));
807 if (goal <= 0)
808 break;
810 rover = k;
812 if (goal <= 0)
813 goto work_done;
815 /* The goal was not achieved. We stop the process if:
817 - "expire" has been reduced to zero (otherwise it is halved),
818 - the table is not full,
819 - we are called from interrupt context.
820 The jiffies check is just a fallback/debug loop breaker;
821 we will not spin here for a long time in any case.
824 RT_CACHE_STAT_INC(gc_goal_miss);
826 if (expire == 0)
827 break;
829 expire >>= 1;
830 #if RT_CACHE_DEBUG >= 2
831 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
832 atomic_read(&ipv4_dst_ops.entries), goal, i);
833 #endif
835 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
836 goto out;
837 } while (!in_softirq() && time_before_eq(jiffies, now));
839 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
840 goto out;
841 if (net_ratelimit())
842 printk(KERN_WARNING "dst cache overflow\n");
843 RT_CACHE_STAT_INC(gc_dst_overflow);
844 return 1;
846 work_done:
847 expire += ip_rt_gc_min_interval;
848 if (expire > ip_rt_gc_timeout ||
849 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
850 expire = ip_rt_gc_timeout;
851 #if RT_CACHE_DEBUG >= 2
852 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
853 atomic_read(&ipv4_dst_ops.entries), goal, rover);
854 #endif
855 out: return 0;
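/*
 * Summary of the adaptive behaviour above: GC runs are rate-limited by
 * ip_rt_gc_min_interval unless the cache has exceeded ip_rt_max_size;
 * "expire" is halved whenever a run misses its goal and grows again by
 * ip_rt_gc_min_interval (capped at ip_rt_gc_timeout) after a successful
 * one.  A return of 1 (after the "dst cache overflow" warning) makes
 * dst_alloc() fail the allocation.
 */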
858 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
860 struct rtable *rth, **rthp;
861 unsigned long now;
862 struct rtable *cand, **candp;
863 u32 min_score;
864 int chain_length;
865 int attempts = !in_softirq();
867 restart:
868 chain_length = 0;
869 min_score = ~(u32)0;
870 cand = NULL;
871 candp = NULL;
872 now = jiffies;
874 rthp = &rt_hash_table[hash].chain;
876 spin_lock_bh(rt_hash_lock_addr(hash));
877 while ((rth = *rthp) != NULL) {
878 if (compare_keys(&rth->fl, &rt->fl)) {
879 /* Put it first */
880 *rthp = rth->u.dst.rt_next;
882 * Since lookup is lockfree, the deletion
883 * must be visible to another weakly ordered CPU before
884 * the insertion at the start of the hash chain.
886 rcu_assign_pointer(rth->u.dst.rt_next,
887 rt_hash_table[hash].chain);
889 * Since lookup is lockfree, the update writes
890 * must be ordered for consistency on SMP.
892 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
894 dst_use(&rth->u.dst, now);
895 spin_unlock_bh(rt_hash_lock_addr(hash));
897 rt_drop(rt);
898 *rp = rth;
899 return 0;
902 if (!atomic_read(&rth->u.dst.__refcnt)) {
903 u32 score = rt_score(rth);
905 if (score <= min_score) {
906 cand = rth;
907 candp = rthp;
908 min_score = score;
912 chain_length++;
914 rthp = &rth->u.dst.rt_next;
917 if (cand) {
918 /* ip_rt_gc_elasticity used to be the average chain
919 * length; when it is exceeded, gc becomes really aggressive.
921 * The second limit is less certain. At the moment it allows
922 * only 2 entries per bucket. We will see.
924 if (chain_length > ip_rt_gc_elasticity) {
925 *candp = cand->u.dst.rt_next;
926 rt_free(cand);
930 /* Try to bind route to arp only if it is output
931 route or unicast forwarding path.
933 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
934 int err = arp_bind_neighbour(&rt->u.dst);
935 if (err) {
936 spin_unlock_bh(rt_hash_lock_addr(hash));
938 if (err != -ENOBUFS) {
939 rt_drop(rt);
940 return err;
943 /* Neighbour tables are full and nothing
944 can be released. Try to shrink the route cache,
945 which most likely holds some neighbour records.
947 if (attempts-- > 0) {
948 int saved_elasticity = ip_rt_gc_elasticity;
949 int saved_int = ip_rt_gc_min_interval;
950 ip_rt_gc_elasticity = 1;
951 ip_rt_gc_min_interval = 0;
952 rt_garbage_collect();
953 ip_rt_gc_min_interval = saved_int;
954 ip_rt_gc_elasticity = saved_elasticity;
955 goto restart;
958 if (net_ratelimit())
959 printk(KERN_WARNING "Neighbour table overflow.\n");
960 rt_drop(rt);
961 return -ENOBUFS;
965 rt->u.dst.rt_next = rt_hash_table[hash].chain;
966 #if RT_CACHE_DEBUG >= 2
967 if (rt->u.dst.rt_next) {
968 struct rtable *trt;
969 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
970 NIPQUAD(rt->rt_dst));
971 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
972 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
973 printk("\n");
975 #endif
976 rt_hash_table[hash].chain = rt;
977 spin_unlock_bh(rt_hash_lock_addr(hash));
978 *rp = rt;
979 return 0;
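/*
 * On a key match the existing entry is moved to the head of its chain
 * (with rcu_assign_pointer() providing the ordering that lockless
 * readers rely on) and reused instead of the new route.  Otherwise the
 * lowest-scoring unreferenced entry is remembered and evicted once the
 * chain exceeds ip_rt_gc_elasticity, and an -ENOBUFS from
 * arp_bind_neighbour() triggers (outside softirq context) one emergency
 * garbage-collection pass with relaxed thresholds before giving up with
 * "Neighbour table overflow".
 */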
982 void rt_bind_peer(struct rtable *rt, int create)
984 static DEFINE_SPINLOCK(rt_peer_lock);
985 struct inet_peer *peer;
987 peer = inet_getpeer(rt->rt_dst, create);
989 spin_lock_bh(&rt_peer_lock);
990 if (rt->peer == NULL) {
991 rt->peer = peer;
992 peer = NULL;
994 spin_unlock_bh(&rt_peer_lock);
995 if (peer)
996 inet_putpeer(peer);
1000 * Peer allocation may fail only in serious out-of-memory conditions. However
1001 * we can still generate some output.
1002 * Random ID selection looks a bit dangerous because we have no chance of
1003 * selecting an ID that is unique within a reasonable period of time.
1004 * But a broken packet identifier may be better than no packet at all.
1006 static void ip_select_fb_ident(struct iphdr *iph)
1008 static DEFINE_SPINLOCK(ip_fb_id_lock);
1009 static u32 ip_fallback_id;
1010 u32 salt;
1012 spin_lock_bh(&ip_fb_id_lock);
1013 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1014 iph->id = htons(salt & 0xFFFF);
1015 ip_fallback_id = salt;
1016 spin_unlock_bh(&ip_fb_id_lock);
1019 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1021 struct rtable *rt = (struct rtable *) dst;
1023 if (rt) {
1024 if (rt->peer == NULL)
1025 rt_bind_peer(rt, 1);
1027 /* If peer is attached to destination, it is never detached,
1028 so we need not grab a lock to dereference it.
1030 if (rt->peer) {
1031 iph->id = htons(inet_getid(rt->peer, more));
1032 return;
1034 } else
1035 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1036 __builtin_return_address(0));
1038 ip_select_fb_ident(iph);
1041 static void rt_del(unsigned hash, struct rtable *rt)
1043 struct rtable **rthp;
1045 spin_lock_bh(rt_hash_lock_addr(hash));
1046 ip_rt_put(rt);
1047 for (rthp = &rt_hash_table[hash].chain; *rthp;
1048 rthp = &(*rthp)->u.dst.rt_next)
1049 if (*rthp == rt) {
1050 *rthp = rt->u.dst.rt_next;
1051 rt_free(rt);
1052 break;
1054 spin_unlock_bh(rt_hash_lock_addr(hash));
1057 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1058 __be32 saddr, struct net_device *dev)
1060 int i, k;
1061 struct in_device *in_dev = in_dev_get(dev);
1062 struct rtable *rth, **rthp;
1063 __be32 skeys[2] = { saddr, 0 };
1064 int ikeys[2] = { dev->ifindex, 0 };
1065 struct netevent_redirect netevent;
1067 if (!in_dev)
1068 return;
1070 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1071 || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1072 goto reject_redirect;
1074 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1075 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1076 goto reject_redirect;
1077 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1078 goto reject_redirect;
1079 } else {
1080 if (inet_addr_type(new_gw) != RTN_UNICAST)
1081 goto reject_redirect;
1084 for (i = 0; i < 2; i++) {
1085 for (k = 0; k < 2; k++) {
1086 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1088 rthp=&rt_hash_table[hash].chain;
1090 rcu_read_lock();
1091 while ((rth = rcu_dereference(*rthp)) != NULL) {
1092 struct rtable *rt;
1094 if (rth->fl.fl4_dst != daddr ||
1095 rth->fl.fl4_src != skeys[i] ||
1096 rth->fl.oif != ikeys[k] ||
1097 rth->fl.iif != 0) {
1098 rthp = &rth->u.dst.rt_next;
1099 continue;
1102 if (rth->rt_dst != daddr ||
1103 rth->rt_src != saddr ||
1104 rth->u.dst.error ||
1105 rth->rt_gateway != old_gw ||
1106 rth->u.dst.dev != dev)
1107 break;
1109 dst_hold(&rth->u.dst);
1110 rcu_read_unlock();
1112 rt = dst_alloc(&ipv4_dst_ops);
1113 if (rt == NULL) {
1114 ip_rt_put(rth);
1115 in_dev_put(in_dev);
1116 return;
1119 /* Copy all the information. */
1120 *rt = *rth;
1121 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1122 rt->u.dst.__use = 1;
1123 atomic_set(&rt->u.dst.__refcnt, 1);
1124 rt->u.dst.child = NULL;
1125 if (rt->u.dst.dev)
1126 dev_hold(rt->u.dst.dev);
1127 if (rt->idev)
1128 in_dev_hold(rt->idev);
1129 rt->u.dst.obsolete = 0;
1130 rt->u.dst.lastuse = jiffies;
1131 rt->u.dst.path = &rt->u.dst;
1132 rt->u.dst.neighbour = NULL;
1133 rt->u.dst.hh = NULL;
1134 rt->u.dst.xfrm = NULL;
1136 rt->rt_flags |= RTCF_REDIRECTED;
1138 /* Gateway is different ... */
1139 rt->rt_gateway = new_gw;
1141 /* Redirect received -> path was valid */
1142 dst_confirm(&rth->u.dst);
1144 if (rt->peer)
1145 atomic_inc(&rt->peer->refcnt);
1147 if (arp_bind_neighbour(&rt->u.dst) ||
1148 !(rt->u.dst.neighbour->nud_state &
1149 NUD_VALID)) {
1150 if (rt->u.dst.neighbour)
1151 neigh_event_send(rt->u.dst.neighbour, NULL);
1152 ip_rt_put(rth);
1153 rt_drop(rt);
1154 goto do_next;
1157 netevent.old = &rth->u.dst;
1158 netevent.new = &rt->u.dst;
1159 call_netevent_notifiers(NETEVENT_REDIRECT,
1160 &netevent);
1162 rt_del(hash, rth);
1163 if (!rt_intern_hash(hash, rt, &rt))
1164 ip_rt_put(rt);
1165 goto do_next;
1167 rcu_read_unlock();
1168 do_next:
1172 in_dev_put(in_dev);
1173 return;
1175 reject_redirect:
1176 #ifdef CONFIG_IP_ROUTE_VERBOSE
1177 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1178 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1179 "%u.%u.%u.%u ignored.\n"
1180 " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1181 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1182 NIPQUAD(saddr), NIPQUAD(daddr));
1183 #endif
1184 in_dev_put(in_dev);
1187 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1189 struct rtable *rt = (struct rtable*)dst;
1190 struct dst_entry *ret = dst;
1192 if (rt) {
1193 if (dst->obsolete) {
1194 ip_rt_put(rt);
1195 ret = NULL;
1196 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1197 rt->u.dst.expires) {
1198 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1199 rt->fl.oif);
1200 #if RT_CACHE_DEBUG >= 1
1201 printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1202 "%u.%u.%u.%u/%02x dropped\n",
1203 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1204 #endif
1205 rt_del(hash, rt);
1206 ret = NULL;
1209 return ret;
1213 * Algorithm:
1214 * 1. The first ip_rt_redirect_number redirects are sent
1215 * with exponential backoff, then we stop sending them at all,
1216 * assuming that the host ignores our redirects.
1217 * 2. If we did not see packets requiring redirects
1218 * during ip_rt_redirect_silence, we assume that the host
1219 * has forgotten the redirected route, and we start sending redirects again.
1221 * This algorithm is much cheaper and more intelligent than dumb load limiting
1222 * in icmp.c.
1224 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1225 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1228 void ip_rt_send_redirect(struct sk_buff *skb)
1230 struct rtable *rt = (struct rtable*)skb->dst;
1231 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1233 if (!in_dev)
1234 return;
1236 if (!IN_DEV_TX_REDIRECTS(in_dev))
1237 goto out;
1239 /* No redirected packets during ip_rt_redirect_silence;
1240 * reset the algorithm.
1242 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1243 rt->u.dst.rate_tokens = 0;
1245 /* Too many ignored redirects; do not send anything and
1246 * set u.dst.rate_last to the last seen redirected packet.
1248 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1249 rt->u.dst.rate_last = jiffies;
1250 goto out;
1253 /* Check for load limit; set rate_last to the latest sent
1254 * redirect.
1256 if (rt->u.dst.rate_tokens == 0 ||
1257 time_after(jiffies,
1258 (rt->u.dst.rate_last +
1259 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1260 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1261 rt->u.dst.rate_last = jiffies;
1262 ++rt->u.dst.rate_tokens;
1263 #ifdef CONFIG_IP_ROUTE_VERBOSE
1264 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1265 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1266 net_ratelimit())
1267 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1268 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1269 NIPQUAD(rt->rt_src), rt->rt_iif,
1270 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1271 #endif
1273 out:
1274 in_dev_put(in_dev);
1277 static int ip_error(struct sk_buff *skb)
1279 struct rtable *rt = (struct rtable*)skb->dst;
1280 unsigned long now;
1281 int code;
1283 switch (rt->u.dst.error) {
1284 case EINVAL:
1285 default:
1286 goto out;
1287 case EHOSTUNREACH:
1288 code = ICMP_HOST_UNREACH;
1289 break;
1290 case ENETUNREACH:
1291 code = ICMP_NET_UNREACH;
1292 IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1293 break;
1294 case EACCES:
1295 code = ICMP_PKT_FILTERED;
1296 break;
1299 now = jiffies;
1300 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1301 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1302 rt->u.dst.rate_tokens = ip_rt_error_burst;
1303 rt->u.dst.rate_last = now;
1304 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1305 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1306 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1309 out: kfree_skb(skb);
1310 return 0;
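/*
 * ICMP errors are paced with a token bucket kept in the dst: one token
 * accrues per jiffy since rate_last, capped at ip_rt_error_burst (5*HZ),
 * and each error sent costs ip_rt_error_cost (HZ) tokens, i.e. roughly
 * one destination-unreachable per second sustained, in bursts of about
 * five.
 */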
1314 * The last two values are not from the RFC but
1315 * are needed for AMPRnet AX.25 paths.
1318 static const unsigned short mtu_plateau[] =
1319 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1321 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1323 int i;
1325 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1326 if (old_mtu > mtu_plateau[i])
1327 return mtu_plateau[i];
1328 return 68;
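/*
 * Example: guess_mtu(1500) returns 1492 (the next plateau down),
 * guess_mtu(600) returns 576, and anything not above the smallest
 * plateau (128) falls back to 68, the minimum IPv4 MTU.  This is used
 * below when a Fragmentation Needed message arrives without a usable
 * next-hop MTU.
 */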
1331 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1333 int i;
1334 unsigned short old_mtu = ntohs(iph->tot_len);
1335 struct rtable *rth;
1336 __be32 skeys[2] = { iph->saddr, 0, };
1337 __be32 daddr = iph->daddr;
1338 unsigned short est_mtu = 0;
1340 if (ipv4_config.no_pmtu_disc)
1341 return 0;
1343 for (i = 0; i < 2; i++) {
1344 unsigned hash = rt_hash(daddr, skeys[i], 0);
1346 rcu_read_lock();
1347 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1348 rth = rcu_dereference(rth->u.dst.rt_next)) {
1349 if (rth->fl.fl4_dst == daddr &&
1350 rth->fl.fl4_src == skeys[i] &&
1351 rth->rt_dst == daddr &&
1352 rth->rt_src == iph->saddr &&
1353 rth->fl.iif == 0 &&
1354 !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1355 unsigned short mtu = new_mtu;
1357 if (new_mtu < 68 || new_mtu >= old_mtu) {
1359 /* BSD 4.2 compatibility hack :-( */
1360 if (mtu == 0 &&
1361 old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1362 old_mtu >= 68 + (iph->ihl << 2))
1363 old_mtu -= iph->ihl << 2;
1365 mtu = guess_mtu(old_mtu);
1367 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1368 if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1369 dst_confirm(&rth->u.dst);
1370 if (mtu < ip_rt_min_pmtu) {
1371 mtu = ip_rt_min_pmtu;
1372 rth->u.dst.metrics[RTAX_LOCK-1] |=
1373 (1 << RTAX_MTU);
1375 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1376 dst_set_expires(&rth->u.dst,
1377 ip_rt_mtu_expires);
1379 est_mtu = mtu;
1383 rcu_read_unlock();
1385 return est_mtu ? : new_mtu;
1388 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1390 if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1391 !(dst_metric_locked(dst, RTAX_MTU))) {
1392 if (mtu < ip_rt_min_pmtu) {
1393 mtu = ip_rt_min_pmtu;
1394 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1396 dst->metrics[RTAX_MTU-1] = mtu;
1397 dst_set_expires(dst, ip_rt_mtu_expires);
1398 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1402 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1404 return NULL;
1407 static void ipv4_dst_destroy(struct dst_entry *dst)
1409 struct rtable *rt = (struct rtable *) dst;
1410 struct inet_peer *peer = rt->peer;
1411 struct in_device *idev = rt->idev;
1413 if (peer) {
1414 rt->peer = NULL;
1415 inet_putpeer(peer);
1418 if (idev) {
1419 rt->idev = NULL;
1420 in_dev_put(idev);
1424 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1425 int how)
1427 struct rtable *rt = (struct rtable *) dst;
1428 struct in_device *idev = rt->idev;
1429 if (dev != init_net.loopback_dev && idev && idev->dev == dev) {
1430 struct in_device *loopback_idev = in_dev_get(init_net.loopback_dev);
1431 if (loopback_idev) {
1432 rt->idev = loopback_idev;
1433 in_dev_put(idev);
1438 static void ipv4_link_failure(struct sk_buff *skb)
1440 struct rtable *rt;
1442 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1444 rt = (struct rtable *) skb->dst;
1445 if (rt)
1446 dst_set_expires(&rt->u.dst, 0);
1449 static int ip_rt_bug(struct sk_buff *skb)
1451 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1452 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1453 skb->dev ? skb->dev->name : "?");
1454 kfree_skb(skb);
1455 return 0;
1459 We do not cache the source address of the outgoing interface,
1460 because it is used only by IP RR, TS and SRR options,
1461 so it is out of the fast path.
1463 BTW remember: "addr" is allowed to be unaligned
1464 in IP options!
1467 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1469 __be32 src;
1470 struct fib_result res;
1472 if (rt->fl.iif == 0)
1473 src = rt->rt_src;
1474 else if (fib_lookup(&rt->fl, &res) == 0) {
1475 src = FIB_RES_PREFSRC(res);
1476 fib_res_put(&res);
1477 } else
1478 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1479 RT_SCOPE_UNIVERSE);
1480 memcpy(addr, &src, 4);
1483 #ifdef CONFIG_NET_CLS_ROUTE
1484 static void set_class_tag(struct rtable *rt, u32 tag)
1486 if (!(rt->u.dst.tclassid & 0xFFFF))
1487 rt->u.dst.tclassid |= tag & 0xFFFF;
1488 if (!(rt->u.dst.tclassid & 0xFFFF0000))
1489 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1491 #endif
1493 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1495 struct fib_info *fi = res->fi;
1497 if (fi) {
1498 if (FIB_RES_GW(*res) &&
1499 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1500 rt->rt_gateway = FIB_RES_GW(*res);
1501 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1502 sizeof(rt->u.dst.metrics));
1503 if (fi->fib_mtu == 0) {
1504 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1505 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1506 rt->rt_gateway != rt->rt_dst &&
1507 rt->u.dst.dev->mtu > 576)
1508 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1510 #ifdef CONFIG_NET_CLS_ROUTE
1511 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1512 #endif
1513 } else
1514 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1516 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1517 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1518 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1519 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1520 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1521 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1522 ip_rt_min_advmss);
1523 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1524 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1526 #ifdef CONFIG_NET_CLS_ROUTE
1527 #ifdef CONFIG_IP_MULTIPLE_TABLES
1528 set_class_tag(rt, fib_rules_tclass(res));
1529 #endif
1530 set_class_tag(rt, itag);
1531 #endif
1532 rt->rt_type = res->type;
1535 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1536 u8 tos, struct net_device *dev, int our)
1538 unsigned hash;
1539 struct rtable *rth;
1540 __be32 spec_dst;
1541 struct in_device *in_dev = in_dev_get(dev);
1542 u32 itag = 0;
1544 /* Primary sanity checks. */
1546 if (in_dev == NULL)
1547 return -EINVAL;
1549 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1550 skb->protocol != htons(ETH_P_IP))
1551 goto e_inval;
1553 if (ZERONET(saddr)) {
1554 if (!LOCAL_MCAST(daddr))
1555 goto e_inval;
1556 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1557 } else if (fib_validate_source(saddr, 0, tos, 0,
1558 dev, &spec_dst, &itag) < 0)
1559 goto e_inval;
1561 rth = dst_alloc(&ipv4_dst_ops);
1562 if (!rth)
1563 goto e_nobufs;
1565 rth->u.dst.output= ip_rt_bug;
1567 atomic_set(&rth->u.dst.__refcnt, 1);
1568 rth->u.dst.flags= DST_HOST;
1569 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1570 rth->u.dst.flags |= DST_NOPOLICY;
1571 rth->fl.fl4_dst = daddr;
1572 rth->rt_dst = daddr;
1573 rth->fl.fl4_tos = tos;
1574 rth->fl.mark = skb->mark;
1575 rth->fl.fl4_src = saddr;
1576 rth->rt_src = saddr;
1577 #ifdef CONFIG_NET_CLS_ROUTE
1578 rth->u.dst.tclassid = itag;
1579 #endif
1580 rth->rt_iif =
1581 rth->fl.iif = dev->ifindex;
1582 rth->u.dst.dev = init_net.loopback_dev;
1583 dev_hold(rth->u.dst.dev);
1584 rth->idev = in_dev_get(rth->u.dst.dev);
1585 rth->fl.oif = 0;
1586 rth->rt_gateway = daddr;
1587 rth->rt_spec_dst= spec_dst;
1588 rth->rt_type = RTN_MULTICAST;
1589 rth->rt_flags = RTCF_MULTICAST;
1590 if (our) {
1591 rth->u.dst.input= ip_local_deliver;
1592 rth->rt_flags |= RTCF_LOCAL;
1595 #ifdef CONFIG_IP_MROUTE
1596 if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1597 rth->u.dst.input = ip_mr_input;
1598 #endif
1599 RT_CACHE_STAT_INC(in_slow_mc);
1601 in_dev_put(in_dev);
1602 hash = rt_hash(daddr, saddr, dev->ifindex);
1603 return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1605 e_nobufs:
1606 in_dev_put(in_dev);
1607 return -ENOBUFS;
1609 e_inval:
1610 in_dev_put(in_dev);
1611 return -EINVAL;
1615 static void ip_handle_martian_source(struct net_device *dev,
1616 struct in_device *in_dev,
1617 struct sk_buff *skb,
1618 __be32 daddr,
1619 __be32 saddr)
1621 RT_CACHE_STAT_INC(in_martian_src);
1622 #ifdef CONFIG_IP_ROUTE_VERBOSE
1623 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1625 * RFC1812 recommendation: if the source is martian,
1626 * the only hint is the MAC header.
1628 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1629 "%u.%u.%u.%u, on dev %s\n",
1630 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1631 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1632 int i;
1633 const unsigned char *p = skb_mac_header(skb);
1634 printk(KERN_WARNING "ll header: ");
1635 for (i = 0; i < dev->hard_header_len; i++, p++) {
1636 printk("%02x", *p);
1637 if (i < (dev->hard_header_len - 1))
1638 printk(":");
1640 printk("\n");
1643 #endif
1646 static inline int __mkroute_input(struct sk_buff *skb,
1647 struct fib_result* res,
1648 struct in_device *in_dev,
1649 __be32 daddr, __be32 saddr, u32 tos,
1650 struct rtable **result)
1653 struct rtable *rth;
1654 int err;
1655 struct in_device *out_dev;
1656 unsigned flags = 0;
1657 __be32 spec_dst;
1658 u32 itag;
1660 /* get a working reference to the output device */
1661 out_dev = in_dev_get(FIB_RES_DEV(*res));
1662 if (out_dev == NULL) {
1663 if (net_ratelimit())
1664 printk(KERN_CRIT "Bug in ip_route_input" \
1665 "_slow(). Please, report\n");
1666 return -EINVAL;
1670 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1671 in_dev->dev, &spec_dst, &itag);
1672 if (err < 0) {
1673 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1674 saddr);
1676 err = -EINVAL;
1677 goto cleanup;
1680 if (err)
1681 flags |= RTCF_DIRECTSRC;
1683 if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1684 (IN_DEV_SHARED_MEDIA(out_dev) ||
1685 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1686 flags |= RTCF_DOREDIRECT;
1688 if (skb->protocol != htons(ETH_P_IP)) {
1689 /* Not IP (i.e. ARP). Do not create route, if it is
1690 * invalid for proxy arp. DNAT routes are always valid.
1692 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1693 err = -EINVAL;
1694 goto cleanup;
1699 rth = dst_alloc(&ipv4_dst_ops);
1700 if (!rth) {
1701 err = -ENOBUFS;
1702 goto cleanup;
1705 atomic_set(&rth->u.dst.__refcnt, 1);
1706 rth->u.dst.flags= DST_HOST;
1707 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1708 rth->u.dst.flags |= DST_NOPOLICY;
1709 if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1710 rth->u.dst.flags |= DST_NOXFRM;
1711 rth->fl.fl4_dst = daddr;
1712 rth->rt_dst = daddr;
1713 rth->fl.fl4_tos = tos;
1714 rth->fl.mark = skb->mark;
1715 rth->fl.fl4_src = saddr;
1716 rth->rt_src = saddr;
1717 rth->rt_gateway = daddr;
1718 rth->rt_iif =
1719 rth->fl.iif = in_dev->dev->ifindex;
1720 rth->u.dst.dev = (out_dev)->dev;
1721 dev_hold(rth->u.dst.dev);
1722 rth->idev = in_dev_get(rth->u.dst.dev);
1723 rth->fl.oif = 0;
1724 rth->rt_spec_dst= spec_dst;
1726 rth->u.dst.input = ip_forward;
1727 rth->u.dst.output = ip_output;
1729 rt_set_nexthop(rth, res, itag);
1731 rth->rt_flags = flags;
1733 *result = rth;
1734 err = 0;
1735 cleanup:
1736 /* release the working reference to the output device */
1737 in_dev_put(out_dev);
1738 return err;
1741 static inline int ip_mkroute_input(struct sk_buff *skb,
1742 struct fib_result* res,
1743 const struct flowi *fl,
1744 struct in_device *in_dev,
1745 __be32 daddr, __be32 saddr, u32 tos)
1747 struct rtable* rth = NULL;
1748 int err;
1749 unsigned hash;
1751 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1752 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1753 fib_select_multipath(fl, res);
1754 #endif
1756 /* create a routing cache entry */
1757 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1758 if (err)
1759 return err;
1761 /* put it into the cache */
1762 hash = rt_hash(daddr, saddr, fl->iif);
1763 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1767 * NOTE. We drop all packets that have a local source
1768 * address, because every properly looped-back packet
1769 * must already have the correct destination attached by the output routine.
1771 * This approach solves two big problems:
1772 * 1. Non-simplex devices are handled properly.
1773 * 2. IP spoofing attempts are filtered with a 100% guarantee.
1776 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1777 u8 tos, struct net_device *dev)
1779 struct fib_result res;
1780 struct in_device *in_dev = in_dev_get(dev);
1781 struct flowi fl = { .nl_u = { .ip4_u =
1782 { .daddr = daddr,
1783 .saddr = saddr,
1784 .tos = tos,
1785 .scope = RT_SCOPE_UNIVERSE,
1786 } },
1787 .mark = skb->mark,
1788 .iif = dev->ifindex };
1789 unsigned flags = 0;
1790 u32 itag = 0;
1791 struct rtable * rth;
1792 unsigned hash;
1793 __be32 spec_dst;
1794 int err = -EINVAL;
1795 int free_res = 0;
1797 /* IP on this device is disabled. */
1799 if (!in_dev)
1800 goto out;
1802 /* Check for the most weird martians, which may not be detected
1803 by fib_lookup.
1806 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1807 goto martian_source;
1809 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1810 goto brd_input;
1812 /* Accept zero addresses only to limited broadcast;
1813 * I do not even know whether to fix this or not. Waiting for complaints :-)
1815 if (ZERONET(saddr))
1816 goto martian_source;
1818 if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1819 goto martian_destination;
1822 * Now we are ready to route packet.
1824 if ((err = fib_lookup(&fl, &res)) != 0) {
1825 if (!IN_DEV_FORWARD(in_dev))
1826 goto e_hostunreach;
1827 goto no_route;
1829 free_res = 1;
1831 RT_CACHE_STAT_INC(in_slow_tot);
1833 if (res.type == RTN_BROADCAST)
1834 goto brd_input;
1836 if (res.type == RTN_LOCAL) {
1837 int result;
1838 result = fib_validate_source(saddr, daddr, tos,
1839 init_net.loopback_dev->ifindex,
1840 dev, &spec_dst, &itag);
1841 if (result < 0)
1842 goto martian_source;
1843 if (result)
1844 flags |= RTCF_DIRECTSRC;
1845 spec_dst = daddr;
1846 goto local_input;
1849 if (!IN_DEV_FORWARD(in_dev))
1850 goto e_hostunreach;
1851 if (res.type != RTN_UNICAST)
1852 goto martian_destination;
1854 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1855 done:
1856 in_dev_put(in_dev);
1857 if (free_res)
1858 fib_res_put(&res);
1859 out: return err;
1861 brd_input:
1862 if (skb->protocol != htons(ETH_P_IP))
1863 goto e_inval;
1865 if (ZERONET(saddr))
1866 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1867 else {
1868 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1869 &itag);
1870 if (err < 0)
1871 goto martian_source;
1872 if (err)
1873 flags |= RTCF_DIRECTSRC;
1875 flags |= RTCF_BROADCAST;
1876 res.type = RTN_BROADCAST;
1877 RT_CACHE_STAT_INC(in_brd);
1879 local_input:
1880 rth = dst_alloc(&ipv4_dst_ops);
1881 if (!rth)
1882 goto e_nobufs;
1884 rth->u.dst.output= ip_rt_bug;
1886 atomic_set(&rth->u.dst.__refcnt, 1);
1887 rth->u.dst.flags= DST_HOST;
1888 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1889 rth->u.dst.flags |= DST_NOPOLICY;
1890 rth->fl.fl4_dst = daddr;
1891 rth->rt_dst = daddr;
1892 rth->fl.fl4_tos = tos;
1893 rth->fl.mark = skb->mark;
1894 rth->fl.fl4_src = saddr;
1895 rth->rt_src = saddr;
1896 #ifdef CONFIG_NET_CLS_ROUTE
1897 rth->u.dst.tclassid = itag;
1898 #endif
1899 rth->rt_iif =
1900 rth->fl.iif = dev->ifindex;
1901 rth->u.dst.dev = init_net.loopback_dev;
1902 dev_hold(rth->u.dst.dev);
1903 rth->idev = in_dev_get(rth->u.dst.dev);
1904 rth->rt_gateway = daddr;
1905 rth->rt_spec_dst= spec_dst;
1906 rth->u.dst.input= ip_local_deliver;
1907 rth->rt_flags = flags|RTCF_LOCAL;
1908 if (res.type == RTN_UNREACHABLE) {
1909 rth->u.dst.input= ip_error;
1910 rth->u.dst.error= -err;
1911 rth->rt_flags &= ~RTCF_LOCAL;
1913 rth->rt_type = res.type;
1914 hash = rt_hash(daddr, saddr, fl.iif);
1915 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1916 goto done;
1918 no_route:
1919 RT_CACHE_STAT_INC(in_no_route);
1920 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1921 res.type = RTN_UNREACHABLE;
1922 if (err == -ESRCH)
1923 err = -ENETUNREACH;
1924 goto local_input;
1927 * Do not cache martian addresses: they should be logged (RFC1812)
1929 martian_destination:
1930 RT_CACHE_STAT_INC(in_martian_dst);
1931 #ifdef CONFIG_IP_ROUTE_VERBOSE
1932 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1933 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
1934 "%u.%u.%u.%u, dev %s\n",
1935 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1936 #endif
1938 e_hostunreach:
1939 err = -EHOSTUNREACH;
1940 goto done;
1942 e_inval:
1943 err = -EINVAL;
1944 goto done;
1946 e_nobufs:
1947 err = -ENOBUFS;
1948 goto done;
1950 martian_source:
1951 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1952 goto e_inval;
1955 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1956 u8 tos, struct net_device *dev)
1958 struct rtable * rth;
1959 unsigned hash;
1960 int iif = dev->ifindex;
1962 tos &= IPTOS_RT_MASK;
1963 hash = rt_hash(daddr, saddr, iif);
1965 rcu_read_lock();
1966 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1967 rth = rcu_dereference(rth->u.dst.rt_next)) {
1968 if (rth->fl.fl4_dst == daddr &&
1969 rth->fl.fl4_src == saddr &&
1970 rth->fl.iif == iif &&
1971 rth->fl.oif == 0 &&
1972 rth->fl.mark == skb->mark &&
1973 rth->fl.fl4_tos == tos) {
1974 dst_use(&rth->u.dst, jiffies);
1975 RT_CACHE_STAT_INC(in_hit);
1976 rcu_read_unlock();
1977 skb->dst = (struct dst_entry*)rth;
1978 return 0;
1980 RT_CACHE_STAT_INC(in_hlist_search);
1982 rcu_read_unlock();
1984 /* Multicast recognition logic is moved from route cache to here.
1985 The problem was that too many Ethernet cards have broken/missing
1986 hardware multicast filters :-( As a result, a host on a multicast
1987 network acquires a lot of useless route cache entries, e.g. for
1988 SDR messages from all over the world. Now we try to get rid of them.
1989 Really, provided the software IP multicast filter is organized
1990 reasonably (at least, hashed), this does not result in a slowdown
1991 compared with route cache reject entries.
1992 Note that multicast routers are not affected, because a
1993 route cache entry is created eventually.
1995 if (MULTICAST(daddr)) {
1996 struct in_device *in_dev;
1998 rcu_read_lock();
1999 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2000 int our = ip_check_mc(in_dev, daddr, saddr,
2001 ip_hdr(skb)->protocol);
2002 if (our
2003 #ifdef CONFIG_IP_MROUTE
2004 || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2005 #endif
2007 rcu_read_unlock();
2008 return ip_route_input_mc(skb, daddr, saddr,
2009 tos, dev, our);
2012 rcu_read_unlock();
2013 return -EINVAL;
2015 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2018 static inline int __mkroute_output(struct rtable **result,
2019 struct fib_result* res,
2020 const struct flowi *fl,
2021 const struct flowi *oldflp,
2022 struct net_device *dev_out,
2023 unsigned flags)
2025 struct rtable *rth;
2026 struct in_device *in_dev;
2027 u32 tos = RT_FL_TOS(oldflp);
2028 int err = 0;
2030 if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2031 return -EINVAL;
2033 if (fl->fl4_dst == htonl(0xFFFFFFFF))
2034 res->type = RTN_BROADCAST;
2035 else if (MULTICAST(fl->fl4_dst))
2036 res->type = RTN_MULTICAST;
2037 else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2038 return -EINVAL;
2040 if (dev_out->flags & IFF_LOOPBACK)
2041 flags |= RTCF_LOCAL;
2043 /* get work reference to inet device */
2044 in_dev = in_dev_get(dev_out);
2045 if (!in_dev)
2046 return -EINVAL;
2048 if (res->type == RTN_BROADCAST) {
2049 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2050 if (res->fi) {
2051 fib_info_put(res->fi);
2052 res->fi = NULL;
2054 } else if (res->type == RTN_MULTICAST) {
2055 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2056 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2057 oldflp->proto))
2058 flags &= ~RTCF_LOCAL;
2059 /* If a multicast route does not exist, use
2060 the default one, but do not use a gateway in this case.
2061 Yes, it is a hack.
2063 if (res->fi && res->prefixlen < 4) {
2064 fib_info_put(res->fi);
2065 res->fi = NULL;
2070 rth = dst_alloc(&ipv4_dst_ops);
2071 if (!rth) {
2072 err = -ENOBUFS;
2073 goto cleanup;
2076 atomic_set(&rth->u.dst.__refcnt, 1);
2077 rth->u.dst.flags= DST_HOST;
2078 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2079 rth->u.dst.flags |= DST_NOXFRM;
2080 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2081 rth->u.dst.flags |= DST_NOPOLICY;
2083 rth->fl.fl4_dst = oldflp->fl4_dst;
2084 rth->fl.fl4_tos = tos;
2085 rth->fl.fl4_src = oldflp->fl4_src;
2086 rth->fl.oif = oldflp->oif;
2087 rth->fl.mark = oldflp->mark;
2088 rth->rt_dst = fl->fl4_dst;
2089 rth->rt_src = fl->fl4_src;
2090 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2091 /* get references to the devices that are to be held by the routing
2092 cache entry */
2093 rth->u.dst.dev = dev_out;
2094 dev_hold(dev_out);
2095 rth->idev = in_dev_get(dev_out);
2096 rth->rt_gateway = fl->fl4_dst;
2097 rth->rt_spec_dst= fl->fl4_src;
2099 rth->u.dst.output=ip_output;
2101 RT_CACHE_STAT_INC(out_slow_tot);
2103 if (flags & RTCF_LOCAL) {
2104 rth->u.dst.input = ip_local_deliver;
2105 rth->rt_spec_dst = fl->fl4_dst;
2107 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2108 rth->rt_spec_dst = fl->fl4_src;
2109 if (flags & RTCF_LOCAL &&
2110 !(dev_out->flags & IFF_LOOPBACK)) {
2111 rth->u.dst.output = ip_mc_output;
2112 RT_CACHE_STAT_INC(out_slow_mc);
2114 #ifdef CONFIG_IP_MROUTE
2115 if (res->type == RTN_MULTICAST) {
2116 if (IN_DEV_MFORWARD(in_dev) &&
2117 !LOCAL_MCAST(oldflp->fl4_dst)) {
2118 rth->u.dst.input = ip_mr_input;
2119 rth->u.dst.output = ip_mc_output;
2122 #endif
2125 rt_set_nexthop(rth, res, 0);
2127 rth->rt_flags = flags;
2129 *result = rth;
2130 cleanup:
2131 /* release work reference to inet device */
2132 in_dev_put(in_dev);
2134 return err;
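/*
 * Thin wrapper around __mkroute_output(): on success, hash the original
 * flow key and insert the freshly built entry into the route cache with
 * rt_intern_hash(), returning the cached route through *rp.
 */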
static inline int ip_mkroute_output(struct rtable **rp,
                                    struct fib_result *res,
                                    const struct flowi *fl,
                                    const struct flowi *oldflp,
                                    struct net_device *dev_out,
                                    unsigned flags)
{
        struct rtable *rth = NULL;
        int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
        unsigned hash;
        if (err == 0) {
                hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
                err = rt_intern_hash(hash, rth, rp);
        }

        return err;
}

/*
 * Major route resolver routine.
 */
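/*
 * Roughly, the slow output path below proceeds in stages:
 *   1. validate an explicitly requested source address and, for
 *      multicast or limited broadcast with no oif, route via the
 *      interface owning that source address (the vic/vat hack below);
 *   2. resolve an explicitly requested output interface and select a
 *      source address scoped to it;
 *   3. fall back to the loopback device when no destination is given;
 *   4. consult the FIB; if the lookup fails but an oif was given,
 *      assume the destination is on-link, otherwise fail with
 *      -ENETUNREACH;
 *   5. handle multipath and default-route selection, fill in the
 *      preferred source address, and finally build and intern the
 *      cache entry via ip_mkroute_output().
 */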
static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
{
        u32 tos = RT_FL_TOS(oldflp);
        struct flowi fl = { .nl_u = { .ip4_u =
                                      { .daddr = oldflp->fl4_dst,
                                        .saddr = oldflp->fl4_src,
                                        .tos = tos & IPTOS_RT_MASK,
                                        .scope = ((tos & RTO_ONLINK) ?
                                                  RT_SCOPE_LINK :
                                                  RT_SCOPE_UNIVERSE),
                                      } },
                            .mark = oldflp->mark,
                            .iif = init_net.loopback_dev->ifindex,
                            .oif = oldflp->oif };
        struct fib_result res;
        unsigned flags = 0;
        struct net_device *dev_out = NULL;
        int free_res = 0;
        int err;

        res.fi = NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
        res.r = NULL;
#endif

        if (oldflp->fl4_src) {
                err = -EINVAL;
                if (MULTICAST(oldflp->fl4_src) ||
                    BADCLASS(oldflp->fl4_src) ||
                    ZERONET(oldflp->fl4_src))
                        goto out;

                /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
                dev_out = ip_dev_find(oldflp->fl4_src);
                if (dev_out == NULL)
                        goto out;

                /* I removed the check for oif == dev_out->oif here.
                   It was wrong for two reasons:
                   1. ip_dev_find(saddr) can return the wrong iface, if saddr
                      is assigned to multiple interfaces.
                   2. Moreover, we are allowed to send packets with saddr
                      of another iface. --ANK
                 */

                if (oldflp->oif == 0
                    && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
                        /* Special hack: the user can direct multicasts
                           and limited broadcast via the necessary interface
                           without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
                           This hack is not just for fun, it allows
                           vic, vat and friends to work.
                           They bind a socket to loopback, set ttl to zero
                           and expect that it will work.
                           From the viewpoint of the routing cache they are
                           broken, because we are not allowed to build a
                           multicast path with a loopback source addr (the
                           routing cache cannot know that ttl is zero, so the
                           packet will not leave this host and the route is
                           valid).  Luckily, this hack is a good workaround.
                         */

                        fl.oif = dev_out->ifindex;
                        goto make_route;
                }
                if (dev_out)
                        dev_put(dev_out);
                dev_out = NULL;
        }


        if (oldflp->oif) {
                dev_out = dev_get_by_index(&init_net, oldflp->oif);
                err = -ENODEV;
                if (dev_out == NULL)
                        goto out;

                /* RACE: Check return value of inet_select_addr instead. */
                if (__in_dev_get_rtnl(dev_out) == NULL) {
                        dev_put(dev_out);
                        goto out;       /* Wrong error code */
                }

                if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
                        if (!fl.fl4_src)
                                fl.fl4_src = inet_select_addr(dev_out, 0,
                                                              RT_SCOPE_LINK);
                        goto make_route;
                }
                if (!fl.fl4_src) {
                        if (MULTICAST(oldflp->fl4_dst))
                                fl.fl4_src = inet_select_addr(dev_out, 0,
                                                              fl.fl4_scope);
                        else if (!oldflp->fl4_dst)
                                fl.fl4_src = inet_select_addr(dev_out, 0,
                                                              RT_SCOPE_HOST);
                }
        }

        if (!fl.fl4_dst) {
                fl.fl4_dst = fl.fl4_src;
                if (!fl.fl4_dst)
                        fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
                if (dev_out)
                        dev_put(dev_out);
                dev_out = init_net.loopback_dev;
                dev_hold(dev_out);
                fl.oif = init_net.loopback_dev->ifindex;
                res.type = RTN_LOCAL;
                flags |= RTCF_LOCAL;
                goto make_route;
        }

        if (fib_lookup(&fl, &res)) {
                res.fi = NULL;
                if (oldflp->oif) {
                        /* Apparently, the routing tables are wrong.  Assume
                           that the destination is on-link.

                           WHY? DW.
                           Because we are allowed to send to an iface
                           even if it has NO routes and NO assigned
                           addresses.  When oif is specified, routing
                           tables are looked up with only one purpose:
                           to catch if the destination is gatewayed, rather
                           than direct.  Moreover, if MSG_DONTROUTE is set,
                           we send the packet, ignoring both routing tables
                           and ifaddr state. --ANK

                           We could make it even if oif is unknown,
                           likely IPv6, but we do not.
                         */

                        if (fl.fl4_src == 0)
                                fl.fl4_src = inet_select_addr(dev_out, 0,
                                                              RT_SCOPE_LINK);
                        res.type = RTN_UNICAST;
                        goto make_route;
                }
                if (dev_out)
                        dev_put(dev_out);
                err = -ENETUNREACH;
                goto out;
        }
        free_res = 1;

        if (res.type == RTN_LOCAL) {
                if (!fl.fl4_src)
                        fl.fl4_src = fl.fl4_dst;
                if (dev_out)
                        dev_put(dev_out);
                dev_out = init_net.loopback_dev;
                dev_hold(dev_out);
                fl.oif = dev_out->ifindex;
                if (res.fi)
                        fib_info_put(res.fi);
                res.fi = NULL;
                flags |= RTCF_LOCAL;
                goto make_route;
        }

#ifdef CONFIG_IP_ROUTE_MULTIPATH
        if (res.fi->fib_nhs > 1 && fl.oif == 0)
                fib_select_multipath(&fl, &res);
        else
#endif
        if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
                fib_select_default(&fl, &res);

        if (!fl.fl4_src)
                fl.fl4_src = FIB_RES_PREFSRC(res);

        if (dev_out)
                dev_put(dev_out);
        dev_out = FIB_RES_DEV(res);
        dev_hold(dev_out);
        fl.oif = dev_out->ifindex;


make_route:
        err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);


        if (free_res)
                fib_res_put(&res);
        if (dev_out)
                dev_put(dev_out);
out:    return err;
}

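/*
 * Fast path for output routing: look the flow key up in the route
 * cache, matching daddr, saddr, oif, mark and TOS and requiring
 * iif == 0 so that only output routes match, and fall back to
 * ip_route_output_slow() on a miss.
 */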
int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
{
        unsigned hash;
        struct rtable *rth;

        hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);

        rcu_read_lock_bh();
        for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
             rth = rcu_dereference(rth->u.dst.rt_next)) {
                if (rth->fl.fl4_dst == flp->fl4_dst &&
                    rth->fl.fl4_src == flp->fl4_src &&
                    rth->fl.iif == 0 &&
                    rth->fl.oif == flp->oif &&
                    rth->fl.mark == flp->mark &&
                    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
                      (IPTOS_RT_MASK | RTO_ONLINK))) {
                        dst_use(&rth->u.dst, jiffies);
                        RT_CACHE_STAT_INC(out_hit);
                        rcu_read_unlock_bh();
                        *rp = rth;
                        return 0;
                }
                RT_CACHE_STAT_INC(out_hlist_search);
        }
        rcu_read_unlock_bh();

        return ip_route_output_slow(rp, flp);
}

EXPORT_SYMBOL_GPL(__ip_route_output_key);

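/*
 * "Blackhole" variant of the IPv4 dst ops, used when an xfrm (IPsec)
 * lookup returns -EREMOTE: the copied route discards every packet and
 * ignores PMTU updates.
 */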
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}

static struct dst_ops ipv4_dst_blackhole_ops = {
        .family         = AF_INET,
        .protocol       = __constant_htons(ETH_P_IP),
        .destroy        = ipv4_dst_destroy,
        .check          = ipv4_dst_check,
        .update_pmtu    = ipv4_rt_blackhole_update_pmtu,
        .entry_size     = sizeof(struct rtable),
};

static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
{
        struct rtable *ort = *rp;
        struct rtable *rt = (struct rtable *)
                dst_alloc(&ipv4_dst_blackhole_ops);

        if (rt) {
                struct dst_entry *new = &rt->u.dst;

                atomic_set(&new->__refcnt, 1);
                new->__use = 1;
                new->input = dst_discard;
                new->output = dst_discard;
                memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));

                new->dev = ort->u.dst.dev;
                if (new->dev)
                        dev_hold(new->dev);

                rt->fl = ort->fl;

                rt->idev = ort->idev;
                if (rt->idev)
                        in_dev_hold(rt->idev);
                rt->rt_flags = ort->rt_flags;
                rt->rt_type = ort->rt_type;
                rt->rt_dst = ort->rt_dst;
                rt->rt_src = ort->rt_src;
                rt->rt_iif = ort->rt_iif;
                rt->rt_gateway = ort->rt_gateway;
                rt->rt_spec_dst = ort->rt_spec_dst;
                rt->peer = ort->peer;
                if (rt->peer)
                        atomic_inc(&rt->peer->refcnt);

                dst_free(new);
        }

        dst_release(&(*rp)->u.dst);
        *rp = rt;
        return (rt ? 0 : -ENOMEM);
}

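/*
 * Resolve an output route and, when flp->proto is set, run the result
 * through the xfrm policy lookup; an -EREMOTE answer from
 * __xfrm_lookup() is converted into a blackhole route so the caller
 * still receives a usable dst.
 */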
int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
{
        int err;

        if ((err = __ip_route_output_key(rp, flp)) != 0)
                return err;

        if (flp->proto) {
                if (!flp->fl4_src)
                        flp->fl4_src = (*rp)->rt_src;
                if (!flp->fl4_dst)
                        flp->fl4_dst = (*rp)->rt_dst;
                err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
                if (err == -EREMOTE)
                        err = ipv4_dst_blackhole(rp, flp, sk);

                return err;
        }

        return 0;
}

EXPORT_SYMBOL_GPL(ip_route_output_flow);

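/*
 * Convenience wrapper for callers that have no socket context and no
 * special lookup flags.  A minimal, purely illustrative caller sketch
 * (not taken from this file; "daddr" is a hypothetical destination):
 *
 *      struct flowi fl = { .nl_u = { .ip4_u = { .daddr = daddr } } };
 *      struct rtable *rt;
 *
 *      if (ip_route_output_key(&rt, &fl) == 0) {
 *              ... use rt->u.dst ...
 *              ip_rt_put(rt);
 *      }
 */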
int ip_route_output_key(struct rtable **rp, struct flowi *flp)
{
        return ip_route_output_flow(rp, flp, NULL, 0);
}

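/*
 * Encode one cached route as an RTM_NEWROUTE netlink message: the rtmsg
 * header, the RTA_* attributes (destination, source, oif, gateway,
 * metrics, ...) and the cache info (id, timestamps, expiry, error).
 * Returns the value of nlmsg_end() on success, or -EMSGSIZE if the
 * message does not fit into the skb.
 */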
static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
                        int nowait, unsigned int flags)
{
        struct rtable *rt = (struct rtable *)skb->dst;
        struct rtmsg *r;
        struct nlmsghdr *nlh;
        long expires;
        u32 id = 0, ts = 0, tsage = 0, error;

        nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
        if (nlh == NULL)
                return -EMSGSIZE;

        r = nlmsg_data(nlh);
        r->rtm_family   = AF_INET;
        r->rtm_dst_len  = 32;
        r->rtm_src_len  = 0;
        r->rtm_tos      = rt->fl.fl4_tos;
        r->rtm_table    = RT_TABLE_MAIN;
        NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
        r->rtm_type     = rt->rt_type;
        r->rtm_scope    = RT_SCOPE_UNIVERSE;
        r->rtm_protocol = RTPROT_UNSPEC;
        r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
        if (rt->rt_flags & RTCF_NOTIFY)
                r->rtm_flags |= RTM_F_NOTIFY;

        NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

        if (rt->fl.fl4_src) {
                r->rtm_src_len = 32;
                NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
        }
        if (rt->u.dst.dev)
                NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
        if (rt->u.dst.tclassid)
                NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
#endif
        if (rt->fl.iif)
                NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
        else if (rt->rt_src != rt->fl.fl4_src)
                NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

        if (rt->rt_dst != rt->rt_gateway)
                NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

        if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
                goto nla_put_failure;

        error = rt->u.dst.error;
        expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
        if (rt->peer) {
                id = rt->peer->ip_id_count;
                if (rt->peer->tcp_ts_stamp) {
                        ts = rt->peer->tcp_ts;
                        tsage = get_seconds() - rt->peer->tcp_ts_stamp;
                }
        }

        if (rt->fl.iif) {
#ifdef CONFIG_IP_MROUTE
                __be32 dst = rt->rt_dst;

                if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
                    IPV4_DEVCONF_ALL(MC_FORWARDING)) {
                        int err = ipmr_get_route(skb, r, nowait);
                        if (err <= 0) {
                                if (!nowait) {
                                        if (err == 0)
                                                return 0;
                                        goto nla_put_failure;
                                } else {
                                        if (err == -EMSGSIZE)
                                                goto nla_put_failure;
                                        error = err;
                                }
                        }
                } else
#endif
                        NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
        }

        if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
                               expires, error) < 0)
                goto nla_put_failure;

        return nlmsg_end(skb, nlh);

nla_put_failure:
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}

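/*
 * RTM_GETROUTE handler: build a dummy skb, resolve the route either via
 * ip_route_input() (when RTA_IIF is supplied) or ip_route_output_key(),
 * and report the result back to the requester with rt_fill_info() and
 * rtnl_unicast().
 */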
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
        struct net *net = in_skb->sk->sk_net;
        struct rtmsg *rtm;
        struct nlattr *tb[RTA_MAX+1];
        struct rtable *rt = NULL;
        __be32 dst = 0;
        __be32 src = 0;
        u32 iif;
        int err;
        struct sk_buff *skb;

        if (net != &init_net)
                return -EINVAL;

        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
        if (err < 0)
                goto errout;

        rtm = nlmsg_data(nlh);

        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
        if (skb == NULL) {
                err = -ENOBUFS;
                goto errout;
        }

        /* Reserve room for dummy headers, this skb can pass
           through a good chunk of the routing engine.
         */
        skb_reset_mac_header(skb);
        skb_reset_network_header(skb);

        /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
        ip_hdr(skb)->protocol = IPPROTO_ICMP;
        skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

        src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
        dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
        iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;

        if (iif) {
                struct net_device *dev;

                dev = __dev_get_by_index(&init_net, iif);
                if (dev == NULL) {
                        err = -ENODEV;
                        goto errout_free;
                }

                skb->protocol = htons(ETH_P_IP);
                skb->dev = dev;
                local_bh_disable();
                err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
                local_bh_enable();

                rt = (struct rtable *)skb->dst;
                if (err == 0 && rt->u.dst.error)
                        err = -rt->u.dst.error;
        } else {
                struct flowi fl = {
                        .nl_u = {
                                .ip4_u = {
                                        .daddr = dst,
                                        .saddr = src,
                                        .tos = rtm->rtm_tos,
                                },
                        },
                        .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
                };
                err = ip_route_output_key(&rt, &fl);
        }

        if (err)
                goto errout_free;

        skb->dst = &rt->u.dst;
        if (rtm->rtm_flags & RTM_F_NOTIFY)
                rt->rt_flags |= RTCF_NOTIFY;

        err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
                           RTM_NEWROUTE, 0, 0);
        if (err <= 0)
                goto errout_free;

        err = rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid);
errout:
        return err;

errout_free:
        kfree_skb(skb);
        goto errout;
}

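/*
 * Netlink dump of the whole route cache.  cb->args[0] holds the current
 * hash bucket and cb->args[1] the index within that chain, so a
 * partially filled skb can be resumed on the next dump callback.
 */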
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct rtable *rt;
        int h, s_h;
        int idx, s_idx;

        s_h = cb->args[0];
        if (s_h < 0)
                s_h = 0;
        s_idx = idx = cb->args[1];
        for (h = s_h; h <= rt_hash_mask; h++) {
                rcu_read_lock_bh();
                for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
                     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
                        if (idx < s_idx)
                                continue;
                        skb->dst = dst_clone(&rt->u.dst);
                        if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
                                         cb->nlh->nlmsg_seq, RTM_NEWROUTE,
                                         1, NLM_F_MULTI) <= 0) {
                                dst_release(xchg(&skb->dst, NULL));
                                rcu_read_unlock_bh();
                                goto done;
                        }
                        dst_release(xchg(&skb->dst, NULL));
                }
                rcu_read_unlock_bh();
                s_idx = 0;
        }

done:
        cb->args[0] = h;
        cb->args[1] = idx;
        return skb->len;
}

void ip_rt_multicast_event(struct in_device *in_dev)
{
        rt_cache_flush(0);
}

#ifdef CONFIG_SYSCTL
static int flush_delay;

static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
                                     struct file *filp, void __user *buffer,
                                     size_t *lenp, loff_t *ppos)
{
        if (write) {
                proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
                rt_cache_flush(flush_delay);
                return 0;
        }

        return -EINVAL;
}

static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
                                              int __user *name,
                                              int nlen,
                                              void __user *oldval,
                                              size_t __user *oldlenp,
                                              void __user *newval,
                                              size_t newlen)
{
        int delay;
        if (newlen != sizeof(int))
                return -EINVAL;
        if (get_user(delay, (int __user *)newval))
                return -EFAULT;
        rt_cache_flush(delay);
        return 0;
}

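/*
 * sysctl knobs exported under /proc/sys/net/ipv4/route/.  The "flush"
 * entry is write-only (mode 0200): writing a value calls
 * rt_cache_flush() with that value as the delay, e.g. (illustrative)
 *
 *      echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * for an immediate flush.  The remaining entries are ordinary integer
 * tunables for cache garbage collection, redirects and PMTU handling.
 */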
ctl_table ipv4_route_table[] = {
        {
                .ctl_name     = NET_IPV4_ROUTE_FLUSH,
                .procname     = "flush",
                .data         = &flush_delay,
                .maxlen       = sizeof(int),
                .mode         = 0200,
                .proc_handler = &ipv4_sysctl_rtcache_flush,
                .strategy     = &ipv4_sysctl_rtcache_flush_strategy,
        },
        {
                .ctl_name     = NET_IPV4_ROUTE_MIN_DELAY,
                .procname     = "min_delay",
                .data         = &ip_rt_min_delay,
                .maxlen       = sizeof(int),
                .mode         = 0644,
                .proc_handler = &proc_dointvec_jiffies,
                .strategy     = &sysctl_jiffies,
        },
        {
                .ctl_name     = NET_IPV4_ROUTE_MAX_DELAY,
                .procname     = "max_delay",
                .data         = &ip_rt_max_delay,
                .maxlen       = sizeof(int),
                .mode         = 0644,
                .proc_handler = &proc_dointvec_jiffies,
                .strategy     = &sysctl_jiffies,
        },
        {
                .ctl_name     = NET_IPV4_ROUTE_GC_THRESH,
                .procname     = "gc_thresh",
                .data         = &ipv4_dst_ops.gc_thresh,
                .maxlen       = sizeof(int),
                .mode         = 0644,
                .proc_handler = &proc_dointvec,
        },
        {
                .ctl_name     = NET_IPV4_ROUTE_MAX_SIZE,
                .procname     = "max_size",
                .data         = &ip_rt_max_size,
                .maxlen       = sizeof(int),
                .mode         = 0644,
                .proc_handler = &proc_dointvec,
        },
        {
                /* Deprecated. Use gc_min_interval_ms */

                .ctl_name     = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
                .procname     = "gc_min_interval",
                .data         = &ip_rt_gc_min_interval,
                .maxlen       = sizeof(int),
                .mode         = 0644,
                .proc_handler = &proc_dointvec_jiffies,
                .strategy     = &sysctl_jiffies,
        },
        {
                .ctl_name     = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
                .procname     = "gc_min_interval_ms",
                .data         = &ip_rt_gc_min_interval,
                .maxlen       = sizeof(int),
                .mode         = 0644,
                .proc_handler = &proc_dointvec_ms_jiffies,
                .strategy     = &sysctl_ms_jiffies,
        },
        {
                .ctl_name     = NET_IPV4_ROUTE_GC_TIMEOUT,
                .procname     = "gc_timeout",
                .data         = &ip_rt_gc_timeout,
                .maxlen       = sizeof(int),
                .mode         = 0644,
                .proc_handler = &proc_dointvec_jiffies,
                .strategy     = &sysctl_jiffies,
        },
        {
                .ctl_name     = NET_IPV4_ROUTE_GC_INTERVAL,
                .procname     = "gc_interval",
                .data         = &ip_rt_gc_interval,
                .maxlen       = sizeof(int),
                .mode         = 0644,
                .proc_handler = &proc_dointvec_jiffies,
                .strategy     = &sysctl_jiffies,
        },
        {
                .ctl_name     = NET_IPV4_ROUTE_REDIRECT_LOAD,
                .procname     = "redirect_load",
                .data         = &ip_rt_redirect_load,
                .maxlen       = sizeof(int),
                .mode         = 0644,
                .proc_handler = &proc_dointvec,
        },
        {
                .ctl_name     = NET_IPV4_ROUTE_REDIRECT_NUMBER,
                .procname     = "redirect_number",
                .data         = &ip_rt_redirect_number,
                .maxlen       = sizeof(int),
                .mode         = 0644,
                .proc_handler = &proc_dointvec,
        },
        {
                .ctl_name     = NET_IPV4_ROUTE_REDIRECT_SILENCE,
                .procname     = "redirect_silence",
                .data         = &ip_rt_redirect_silence,
                .maxlen       = sizeof(int),
                .mode         = 0644,
                .proc_handler = &proc_dointvec,
        },
        {
                .ctl_name     = NET_IPV4_ROUTE_ERROR_COST,
                .procname     = "error_cost",
                .data         = &ip_rt_error_cost,
                .maxlen       = sizeof(int),
                .mode         = 0644,
                .proc_handler = &proc_dointvec,
        },
        {
                .ctl_name     = NET_IPV4_ROUTE_ERROR_BURST,
                .procname     = "error_burst",
                .data         = &ip_rt_error_burst,
                .maxlen       = sizeof(int),
                .mode         = 0644,
                .proc_handler = &proc_dointvec,
        },
        {
                .ctl_name     = NET_IPV4_ROUTE_GC_ELASTICITY,
                .procname     = "gc_elasticity",
                .data         = &ip_rt_gc_elasticity,
                .maxlen       = sizeof(int),
                .mode         = 0644,
                .proc_handler = &proc_dointvec,
        },
        {
                .ctl_name     = NET_IPV4_ROUTE_MTU_EXPIRES,
                .procname     = "mtu_expires",
                .data         = &ip_rt_mtu_expires,
                .maxlen       = sizeof(int),
                .mode         = 0644,
                .proc_handler = &proc_dointvec_jiffies,
                .strategy     = &sysctl_jiffies,
        },
        {
                .ctl_name     = NET_IPV4_ROUTE_MIN_PMTU,
                .procname     = "min_pmtu",
                .data         = &ip_rt_min_pmtu,
                .maxlen       = sizeof(int),
                .mode         = 0644,
                .proc_handler = &proc_dointvec,
        },
        {
                .ctl_name     = NET_IPV4_ROUTE_MIN_ADVMSS,
                .procname     = "min_adv_mss",
                .data         = &ip_rt_min_advmss,
                .maxlen       = sizeof(int),
                .mode         = 0644,
                .proc_handler = &proc_dointvec,
        },
        {
                .ctl_name     = NET_IPV4_ROUTE_SECRET_INTERVAL,
                .procname     = "secret_interval",
                .data         = &ip_rt_secret_interval,
                .maxlen       = sizeof(int),
                .mode         = 0644,
                .proc_handler = &proc_dointvec_jiffies,
                .strategy     = &sysctl_jiffies,
        },
        { .ctl_name = 0 }
};
#endif

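/*
 * Per-CPU route accounting used by the routing classifier
 * (CONFIG_NET_CLS_ROUTE).  The proc reader below sums the per-CPU
 * counters into the caller's buffer, so /proc/net/rt_acct presents a
 * system-wide view.
 */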
#ifdef CONFIG_NET_CLS_ROUTE
struct ip_rt_acct *ip_rt_acct __read_mostly;

/* IP route accounting ptr for this logical cpu number. */
#define IP_RT_ACCT_CPU(cpu) (per_cpu_ptr(ip_rt_acct, cpu))

#ifdef CONFIG_PROC_FS
static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
                           int length, int *eof, void *data)
{
        unsigned int i;

        if ((offset & 3) || (length & 3))
                return -EIO;

        if (offset >= sizeof(struct ip_rt_acct) * 256) {
                *eof = 1;
                return 0;
        }

        if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
                length = sizeof(struct ip_rt_acct) * 256 - offset;
                *eof = 1;
        }

        offset /= sizeof(u32);

        if (length > 0) {
                u32 *dst = (u32 *) buffer;

                *start = buffer;
                memset(dst, 0, length);

                for_each_possible_cpu(i) {
                        unsigned int j;
                        u32 *src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;

                        for (j = 0; j < length/4; j++)
                                dst[j] += src[j];
                }
        }
        return length;
}
#endif /* CONFIG_PROC_FS */
#endif /* CONFIG_NET_CLS_ROUTE */

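/*
 * The "rhash_entries=" boot parameter overrides the automatically sized
 * route cache hash table; e.g. (illustrative) booting with
 * rhash_entries=16384 requests a table of roughly that many buckets.
 */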
static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
        if (!str)
                return 0;
        rhash_entries = simple_strtoul(str, &str, 0);
        return 1;
}
__setup("rhash_entries=", set_rhash_entries);

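/*
 * Boot-time initialisation: seed the hash secret, create the dst slab
 * cache and the route hash table, derive the GC thresholds from the
 * table size, start the flush/secret-rotation timers and the periodic
 * GC work, register the proc files and the RTM_GETROUTE handler.
 */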
int __init ip_rt_init(void)
{
        int rc = 0;

        rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
                             (jiffies ^ (jiffies >> 7)));

#ifdef CONFIG_NET_CLS_ROUTE
        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
        if (!ip_rt_acct)
                panic("IP: failed to allocate ip_rt_acct\n");
#endif

        ipv4_dst_ops.kmem_cachep =
                kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

        rt_hash_table = (struct rt_hash_bucket *)
                alloc_large_system_hash("IP route cache",
                                        sizeof(struct rt_hash_bucket),
                                        rhash_entries,
                                        (num_physpages >= 128 * 1024) ?
                                        15 : 17,
                                        0,
                                        &rt_hash_log,
                                        &rt_hash_mask,
                                        0);
        memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
        rt_hash_lock_init();

        ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
        ip_rt_max_size = (rt_hash_mask + 1) * 16;

        devinet_init();
        ip_fib_init();

        setup_timer(&rt_flush_timer, rt_run_flush, 0);
        setup_timer(&rt_secret_timer, rt_secret_rebuild, 0);

        /* All the timers started at system startup tend
           to synchronize. Perturb it a bit.
         */
        schedule_delayed_work(&expires_work,
                net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

        rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
                ip_rt_secret_interval;
        add_timer(&rt_secret_timer);

#ifdef CONFIG_PROC_FS
        {
        struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
        if (!proc_net_fops_create(&init_net, "rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
            !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
                                             init_net.proc_net_stat))) {
                return -ENOMEM;
        }
        rtstat_pde->proc_fops = &rt_cpu_seq_fops;
        }
#ifdef CONFIG_NET_CLS_ROUTE
        create_proc_read_entry("rt_acct", 0, init_net.proc_net, ip_rt_acct_read, NULL);
#endif
#endif
#ifdef CONFIG_XFRM
        xfrm_init();
        xfrm4_init();
#endif
        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);

        return rc;
}

EXPORT_SYMBOL(__ip_select_ident);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(ip_route_output_key);