2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
94 #include <net/net_namespace.h>
95 #include <net/protocol.h>
97 #include <net/route.h>
98 #include <net/inetpeer.h>
100 #include <net/ip_fib.h>
103 #include <net/icmp.h>
104 #include <net/xfrm.h>
105 #include <net/netevent.h>
106 #include <net/rtnetlink.h>
108 #include <linux/sysctl.h>
111 #define RT_FL_TOS(oldflp) \
112 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114 #define IP_MAX_MTU 0xFFF0
116 #define RT_GC_TIMEOUT (300*HZ)
118 static int ip_rt_max_size
;
119 static int ip_rt_gc_timeout __read_mostly
= RT_GC_TIMEOUT
;
120 static int ip_rt_gc_interval __read_mostly
= 60 * HZ
;
121 static int ip_rt_gc_min_interval __read_mostly
= HZ
/ 2;
122 static int ip_rt_redirect_number __read_mostly
= 9;
123 static int ip_rt_redirect_load __read_mostly
= HZ
/ 50;
124 static int ip_rt_redirect_silence __read_mostly
= ((HZ
/ 50) << (9 + 1));
125 static int ip_rt_error_cost __read_mostly
= HZ
;
126 static int ip_rt_error_burst __read_mostly
= 5 * HZ
;
127 static int ip_rt_gc_elasticity __read_mostly
= 8;
128 static int ip_rt_mtu_expires __read_mostly
= 10 * 60 * HZ
;
129 static int ip_rt_min_pmtu __read_mostly
= 512 + 20 + 20;
130 static int ip_rt_min_advmss __read_mostly
= 256;
131 static int ip_rt_secret_interval __read_mostly
= 10 * 60 * HZ
;
132 static int rt_chain_length_max __read_mostly
= 20;
134 static void rt_worker_func(struct work_struct
*work
);
135 static DECLARE_DELAYED_WORK(expires_work
, rt_worker_func
);
138 * Interface to generic destination cache.
141 static struct dst_entry
*ipv4_dst_check(struct dst_entry
*dst
, u32 cookie
);
142 static void ipv4_dst_destroy(struct dst_entry
*dst
);
143 static void ipv4_dst_ifdown(struct dst_entry
*dst
,
144 struct net_device
*dev
, int how
);
145 static struct dst_entry
*ipv4_negative_advice(struct dst_entry
*dst
);
146 static void ipv4_link_failure(struct sk_buff
*skb
);
147 static void ip_rt_update_pmtu(struct dst_entry
*dst
, u32 mtu
);
148 static int rt_garbage_collect(struct dst_ops
*ops
);
149 static void rt_emergency_hash_rebuild(struct net
*net
);
152 static struct dst_ops ipv4_dst_ops
= {
154 .protocol
= cpu_to_be16(ETH_P_IP
),
155 .gc
= rt_garbage_collect
,
156 .check
= ipv4_dst_check
,
157 .destroy
= ipv4_dst_destroy
,
158 .ifdown
= ipv4_dst_ifdown
,
159 .negative_advice
= ipv4_negative_advice
,
160 .link_failure
= ipv4_link_failure
,
161 .update_pmtu
= ip_rt_update_pmtu
,
162 .local_out
= __ip_local_out
,
163 .entries
= ATOMIC_INIT(0),
166 #define ECN_OR_COST(class) TC_PRIO_##class
168 const __u8 ip_tos2prio
[16] = {
172 ECN_OR_COST(BESTEFFORT
),
178 ECN_OR_COST(INTERACTIVE
),
180 ECN_OR_COST(INTERACTIVE
),
181 TC_PRIO_INTERACTIVE_BULK
,
182 ECN_OR_COST(INTERACTIVE_BULK
),
183 TC_PRIO_INTERACTIVE_BULK
,
184 ECN_OR_COST(INTERACTIVE_BULK
)
192 /* The locking scheme is rather straight forward:
194 * 1) Read-Copy Update protects the buckets of the central route hash.
195 * 2) Only writers remove entries, and they hold the lock
196 * as they look at rtable reference counts.
197 * 3) Only readers acquire references to rtable entries,
198 * they do so with atomic increments and with the
202 struct rt_hash_bucket
{
203 struct rtable
*chain
;
206 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
207 defined(CONFIG_PROVE_LOCKING)
209 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
210 * The size of this table is a power of two and depends on the number of CPUS.
211 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
213 #ifdef CONFIG_LOCKDEP
214 # define RT_HASH_LOCK_SZ 256
217 # define RT_HASH_LOCK_SZ 4096
219 # define RT_HASH_LOCK_SZ 2048
221 # define RT_HASH_LOCK_SZ 1024
223 # define RT_HASH_LOCK_SZ 512
225 # define RT_HASH_LOCK_SZ 256
229 static spinlock_t
*rt_hash_locks
;
230 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
232 static __init
void rt_hash_lock_init(void)
236 rt_hash_locks
= kmalloc(sizeof(spinlock_t
) * RT_HASH_LOCK_SZ
,
239 panic("IP: failed to allocate rt_hash_locks\n");
241 for (i
= 0; i
< RT_HASH_LOCK_SZ
; i
++)
242 spin_lock_init(&rt_hash_locks
[i
]);
245 # define rt_hash_lock_addr(slot) NULL
247 static inline void rt_hash_lock_init(void)
252 static struct rt_hash_bucket
*rt_hash_table __read_mostly
;
253 static unsigned rt_hash_mask __read_mostly
;
254 static unsigned int rt_hash_log __read_mostly
;
256 static DEFINE_PER_CPU(struct rt_cache_stat
, rt_cache_stat
);
257 #define RT_CACHE_STAT_INC(field) \
258 (__raw_get_cpu_var(rt_cache_stat).field++)
260 static inline unsigned int rt_hash(__be32 daddr
, __be32 saddr
, int idx
,
263 return jhash_3words((__force u32
)(__be32
)(daddr
),
264 (__force u32
)(__be32
)(saddr
),
269 static inline int rt_genid(struct net
*net
)
271 return atomic_read(&net
->ipv4
.rt_genid
);
274 #ifdef CONFIG_PROC_FS
275 struct rt_cache_iter_state
{
276 struct seq_net_private p
;
281 static struct rtable
*rt_cache_get_first(struct seq_file
*seq
)
283 struct rt_cache_iter_state
*st
= seq
->private;
284 struct rtable
*r
= NULL
;
286 for (st
->bucket
= rt_hash_mask
; st
->bucket
>= 0; --st
->bucket
) {
287 if (!rt_hash_table
[st
->bucket
].chain
)
290 r
= rcu_dereference(rt_hash_table
[st
->bucket
].chain
);
292 if (dev_net(r
->u
.dst
.dev
) == seq_file_net(seq
) &&
293 r
->rt_genid
== st
->genid
)
295 r
= rcu_dereference(r
->u
.dst
.rt_next
);
297 rcu_read_unlock_bh();
302 static struct rtable
*__rt_cache_get_next(struct seq_file
*seq
,
305 struct rt_cache_iter_state
*st
= seq
->private;
307 r
= r
->u
.dst
.rt_next
;
309 rcu_read_unlock_bh();
311 if (--st
->bucket
< 0)
313 } while (!rt_hash_table
[st
->bucket
].chain
);
315 r
= rt_hash_table
[st
->bucket
].chain
;
317 return rcu_dereference(r
);
320 static struct rtable
*rt_cache_get_next(struct seq_file
*seq
,
323 struct rt_cache_iter_state
*st
= seq
->private;
324 while ((r
= __rt_cache_get_next(seq
, r
)) != NULL
) {
325 if (dev_net(r
->u
.dst
.dev
) != seq_file_net(seq
))
327 if (r
->rt_genid
== st
->genid
)
333 static struct rtable
*rt_cache_get_idx(struct seq_file
*seq
, loff_t pos
)
335 struct rtable
*r
= rt_cache_get_first(seq
);
338 while (pos
&& (r
= rt_cache_get_next(seq
, r
)))
340 return pos
? NULL
: r
;
343 static void *rt_cache_seq_start(struct seq_file
*seq
, loff_t
*pos
)
345 struct rt_cache_iter_state
*st
= seq
->private;
347 return rt_cache_get_idx(seq
, *pos
- 1);
348 st
->genid
= rt_genid(seq_file_net(seq
));
349 return SEQ_START_TOKEN
;
352 static void *rt_cache_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
356 if (v
== SEQ_START_TOKEN
)
357 r
= rt_cache_get_first(seq
);
359 r
= rt_cache_get_next(seq
, v
);
364 static void rt_cache_seq_stop(struct seq_file
*seq
, void *v
)
366 if (v
&& v
!= SEQ_START_TOKEN
)
367 rcu_read_unlock_bh();
370 static int rt_cache_seq_show(struct seq_file
*seq
, void *v
)
372 if (v
== SEQ_START_TOKEN
)
373 seq_printf(seq
, "%-127s\n",
374 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
375 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
378 struct rtable
*r
= v
;
381 seq_printf(seq
, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
382 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
383 r
->u
.dst
.dev
? r
->u
.dst
.dev
->name
: "*",
384 (unsigned long)r
->rt_dst
, (unsigned long)r
->rt_gateway
,
385 r
->rt_flags
, atomic_read(&r
->u
.dst
.__refcnt
),
386 r
->u
.dst
.__use
, 0, (unsigned long)r
->rt_src
,
387 (dst_metric(&r
->u
.dst
, RTAX_ADVMSS
) ?
388 (int)dst_metric(&r
->u
.dst
, RTAX_ADVMSS
) + 40 : 0),
389 dst_metric(&r
->u
.dst
, RTAX_WINDOW
),
390 (int)((dst_metric(&r
->u
.dst
, RTAX_RTT
) >> 3) +
391 dst_metric(&r
->u
.dst
, RTAX_RTTVAR
)),
393 r
->u
.dst
.hh
? atomic_read(&r
->u
.dst
.hh
->hh_refcnt
) : -1,
394 r
->u
.dst
.hh
? (r
->u
.dst
.hh
->hh_output
==
396 r
->rt_spec_dst
, &len
);
398 seq_printf(seq
, "%*s\n", 127 - len
, "");
403 static const struct seq_operations rt_cache_seq_ops
= {
404 .start
= rt_cache_seq_start
,
405 .next
= rt_cache_seq_next
,
406 .stop
= rt_cache_seq_stop
,
407 .show
= rt_cache_seq_show
,
410 static int rt_cache_seq_open(struct inode
*inode
, struct file
*file
)
412 return seq_open_net(inode
, file
, &rt_cache_seq_ops
,
413 sizeof(struct rt_cache_iter_state
));
416 static const struct file_operations rt_cache_seq_fops
= {
417 .owner
= THIS_MODULE
,
418 .open
= rt_cache_seq_open
,
421 .release
= seq_release_net
,
425 static void *rt_cpu_seq_start(struct seq_file
*seq
, loff_t
*pos
)
430 return SEQ_START_TOKEN
;
432 for (cpu
= *pos
-1; cpu
< nr_cpu_ids
; ++cpu
) {
433 if (!cpu_possible(cpu
))
436 return &per_cpu(rt_cache_stat
, cpu
);
441 static void *rt_cpu_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
445 for (cpu
= *pos
; cpu
< nr_cpu_ids
; ++cpu
) {
446 if (!cpu_possible(cpu
))
449 return &per_cpu(rt_cache_stat
, cpu
);
455 static void rt_cpu_seq_stop(struct seq_file
*seq
, void *v
)
460 static int rt_cpu_seq_show(struct seq_file
*seq
, void *v
)
462 struct rt_cache_stat
*st
= v
;
464 if (v
== SEQ_START_TOKEN
) {
465 seq_printf(seq
, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
469 seq_printf(seq
,"%08x %08x %08x %08x %08x %08x %08x %08x "
470 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
471 atomic_read(&ipv4_dst_ops
.entries
),
494 static const struct seq_operations rt_cpu_seq_ops
= {
495 .start
= rt_cpu_seq_start
,
496 .next
= rt_cpu_seq_next
,
497 .stop
= rt_cpu_seq_stop
,
498 .show
= rt_cpu_seq_show
,
502 static int rt_cpu_seq_open(struct inode
*inode
, struct file
*file
)
504 return seq_open(file
, &rt_cpu_seq_ops
);
507 static const struct file_operations rt_cpu_seq_fops
= {
508 .owner
= THIS_MODULE
,
509 .open
= rt_cpu_seq_open
,
512 .release
= seq_release
,
515 #ifdef CONFIG_NET_CLS_ROUTE
516 static int ip_rt_acct_read(char *buffer
, char **start
, off_t offset
,
517 int length
, int *eof
, void *data
)
521 if ((offset
& 3) || (length
& 3))
524 if (offset
>= sizeof(struct ip_rt_acct
) * 256) {
529 if (offset
+ length
>= sizeof(struct ip_rt_acct
) * 256) {
530 length
= sizeof(struct ip_rt_acct
) * 256 - offset
;
534 offset
/= sizeof(u32
);
537 u32
*dst
= (u32
*) buffer
;
540 memset(dst
, 0, length
);
542 for_each_possible_cpu(i
) {
546 src
= ((u32
*) per_cpu_ptr(ip_rt_acct
, i
)) + offset
;
547 for (j
= 0; j
< length
/4; j
++)
555 static int __net_init
ip_rt_do_proc_init(struct net
*net
)
557 struct proc_dir_entry
*pde
;
559 pde
= proc_net_fops_create(net
, "rt_cache", S_IRUGO
,
564 pde
= proc_create("rt_cache", S_IRUGO
,
565 net
->proc_net_stat
, &rt_cpu_seq_fops
);
569 #ifdef CONFIG_NET_CLS_ROUTE
570 pde
= create_proc_read_entry("rt_acct", 0, net
->proc_net
,
571 ip_rt_acct_read
, NULL
);
577 #ifdef CONFIG_NET_CLS_ROUTE
579 remove_proc_entry("rt_cache", net
->proc_net_stat
);
582 remove_proc_entry("rt_cache", net
->proc_net
);
587 static void __net_exit
ip_rt_do_proc_exit(struct net
*net
)
589 remove_proc_entry("rt_cache", net
->proc_net_stat
);
590 remove_proc_entry("rt_cache", net
->proc_net
);
591 remove_proc_entry("rt_acct", net
->proc_net
);
594 static struct pernet_operations ip_rt_proc_ops __net_initdata
= {
595 .init
= ip_rt_do_proc_init
,
596 .exit
= ip_rt_do_proc_exit
,
599 static int __init
ip_rt_proc_init(void)
601 return register_pernet_subsys(&ip_rt_proc_ops
);
605 static inline int ip_rt_proc_init(void)
609 #endif /* CONFIG_PROC_FS */
611 static inline void rt_free(struct rtable
*rt
)
613 call_rcu_bh(&rt
->u
.dst
.rcu_head
, dst_rcu_free
);
616 static inline void rt_drop(struct rtable
*rt
)
619 call_rcu_bh(&rt
->u
.dst
.rcu_head
, dst_rcu_free
);
622 static inline int rt_fast_clean(struct rtable
*rth
)
624 /* Kill broadcast/multicast entries very aggresively, if they
625 collide in hash table with more useful entries */
626 return (rth
->rt_flags
& (RTCF_BROADCAST
| RTCF_MULTICAST
)) &&
627 rth
->fl
.iif
&& rth
->u
.dst
.rt_next
;
630 static inline int rt_valuable(struct rtable
*rth
)
632 return (rth
->rt_flags
& (RTCF_REDIRECTED
| RTCF_NOTIFY
)) ||
636 static int rt_may_expire(struct rtable
*rth
, unsigned long tmo1
, unsigned long tmo2
)
641 if (atomic_read(&rth
->u
.dst
.__refcnt
))
645 if (rth
->u
.dst
.expires
&&
646 time_after_eq(jiffies
, rth
->u
.dst
.expires
))
649 age
= jiffies
- rth
->u
.dst
.lastuse
;
651 if ((age
<= tmo1
&& !rt_fast_clean(rth
)) ||
652 (age
<= tmo2
&& rt_valuable(rth
)))
658 /* Bits of score are:
660 * 30: not quite useless
661 * 29..0: usage counter
663 static inline u32
rt_score(struct rtable
*rt
)
665 u32 score
= jiffies
- rt
->u
.dst
.lastuse
;
667 score
= ~score
& ~(3<<30);
673 !(rt
->rt_flags
& (RTCF_BROADCAST
|RTCF_MULTICAST
|RTCF_LOCAL
)))
679 static inline bool rt_caching(const struct net
*net
)
681 return net
->ipv4
.current_rt_cache_rebuild_count
<=
682 net
->ipv4
.sysctl_rt_cache_rebuild_count
;
685 static inline bool compare_hash_inputs(const struct flowi
*fl1
,
686 const struct flowi
*fl2
)
688 return (__force u32
)(((fl1
->nl_u
.ip4_u
.daddr
^ fl2
->nl_u
.ip4_u
.daddr
) |
689 (fl1
->nl_u
.ip4_u
.saddr
^ fl2
->nl_u
.ip4_u
.saddr
) |
690 (fl1
->iif
^ fl2
->iif
)) == 0);
693 static inline int compare_keys(struct flowi
*fl1
, struct flowi
*fl2
)
695 return ((__force u32
)((fl1
->nl_u
.ip4_u
.daddr
^ fl2
->nl_u
.ip4_u
.daddr
) |
696 (fl1
->nl_u
.ip4_u
.saddr
^ fl2
->nl_u
.ip4_u
.saddr
)) |
697 (fl1
->mark
^ fl2
->mark
) |
698 (*(u16
*)&fl1
->nl_u
.ip4_u
.tos
^
699 *(u16
*)&fl2
->nl_u
.ip4_u
.tos
) |
700 (fl1
->oif
^ fl2
->oif
) |
701 (fl1
->iif
^ fl2
->iif
)) == 0;
704 static inline int compare_netns(struct rtable
*rt1
, struct rtable
*rt2
)
706 return dev_net(rt1
->u
.dst
.dev
) == dev_net(rt2
->u
.dst
.dev
);
709 static inline int rt_is_expired(struct rtable
*rth
)
711 return rth
->rt_genid
!= rt_genid(dev_net(rth
->u
.dst
.dev
));
715 * Perform a full scan of hash table and free all entries.
716 * Can be called by a softirq or a process.
717 * In the later case, we want to be reschedule if necessary
719 static void rt_do_flush(int process_context
)
722 struct rtable
*rth
, *next
;
723 struct rtable
* tail
;
725 for (i
= 0; i
<= rt_hash_mask
; i
++) {
726 if (process_context
&& need_resched())
728 rth
= rt_hash_table
[i
].chain
;
732 spin_lock_bh(rt_hash_lock_addr(i
));
735 struct rtable
** prev
, * p
;
737 rth
= rt_hash_table
[i
].chain
;
739 /* defer releasing the head of the list after spin_unlock */
740 for (tail
= rth
; tail
; tail
= tail
->u
.dst
.rt_next
)
741 if (!rt_is_expired(tail
))
744 rt_hash_table
[i
].chain
= tail
;
746 /* call rt_free on entries after the tail requiring flush */
747 prev
= &rt_hash_table
[i
].chain
;
748 for (p
= *prev
; p
; p
= next
) {
749 next
= p
->u
.dst
.rt_next
;
750 if (!rt_is_expired(p
)) {
751 prev
= &p
->u
.dst
.rt_next
;
759 rth
= rt_hash_table
[i
].chain
;
760 rt_hash_table
[i
].chain
= NULL
;
763 spin_unlock_bh(rt_hash_lock_addr(i
));
765 for (; rth
!= tail
; rth
= next
) {
766 next
= rth
->u
.dst
.rt_next
;
773 * While freeing expired entries, we compute average chain length
774 * and standard deviation, using fixed-point arithmetic.
775 * This to have an estimation of rt_chain_length_max
776 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
777 * We use 3 bits for fractional part, and 29 (or 61) for magnitude.
781 #define ONE (1UL << FRACT_BITS)
783 static void rt_check_expire(void)
785 static unsigned int rover
;
786 unsigned int i
= rover
, goal
;
787 struct rtable
*rth
, *aux
, **rthp
;
788 unsigned long samples
= 0;
789 unsigned long sum
= 0, sum2
= 0;
792 mult
= ((u64
)ip_rt_gc_interval
) << rt_hash_log
;
793 if (ip_rt_gc_timeout
> 1)
794 do_div(mult
, ip_rt_gc_timeout
);
795 goal
= (unsigned int)mult
;
796 if (goal
> rt_hash_mask
)
797 goal
= rt_hash_mask
+ 1;
798 for (; goal
> 0; goal
--) {
799 unsigned long tmo
= ip_rt_gc_timeout
;
800 unsigned long length
;
802 i
= (i
+ 1) & rt_hash_mask
;
803 rthp
= &rt_hash_table
[i
].chain
;
813 spin_lock_bh(rt_hash_lock_addr(i
));
814 while ((rth
= *rthp
) != NULL
) {
815 prefetch(rth
->u
.dst
.rt_next
);
816 if (rt_is_expired(rth
)) {
817 *rthp
= rth
->u
.dst
.rt_next
;
821 if (rth
->u
.dst
.expires
) {
822 /* Entry is expired even if it is in use */
823 if (time_before_eq(jiffies
, rth
->u
.dst
.expires
)) {
826 rthp
= &rth
->u
.dst
.rt_next
;
828 * We only count entries on
829 * a chain with equal hash inputs once
830 * so that entries for different QOS
831 * levels, and other non-hash input
832 * attributes don't unfairly skew
833 * the length computation
835 for (aux
= rt_hash_table
[i
].chain
;;) {
840 if (compare_hash_inputs(&aux
->fl
, &rth
->fl
))
842 aux
= aux
->u
.dst
.rt_next
;
846 } else if (!rt_may_expire(rth
, tmo
, ip_rt_gc_timeout
))
849 /* Cleanup aged off entries. */
850 *rthp
= rth
->u
.dst
.rt_next
;
853 spin_unlock_bh(rt_hash_lock_addr(i
));
855 sum2
+= length
*length
;
858 unsigned long avg
= sum
/ samples
;
859 unsigned long sd
= int_sqrt(sum2
/ samples
- avg
*avg
);
860 rt_chain_length_max
= max_t(unsigned long,
862 (avg
+ 4*sd
) >> FRACT_BITS
);
868 * rt_worker_func() is run in process context.
869 * we call rt_check_expire() to scan part of the hash table
871 static void rt_worker_func(struct work_struct
*work
)
874 schedule_delayed_work(&expires_work
, ip_rt_gc_interval
);
878 * Pertubation of rt_genid by a small quantity [1..256]
879 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
880 * many times (2^24) without giving recent rt_genid.
881 * Jenkins hash is strong enough that litle changes of rt_genid are OK.
883 static void rt_cache_invalidate(struct net
*net
)
885 unsigned char shuffle
;
887 get_random_bytes(&shuffle
, sizeof(shuffle
));
888 atomic_add(shuffle
+ 1U, &net
->ipv4
.rt_genid
);
892 * delay < 0 : invalidate cache (fast : entries will be deleted later)
893 * delay >= 0 : invalidate & flush cache (can be long)
895 void rt_cache_flush(struct net
*net
, int delay
)
897 rt_cache_invalidate(net
);
899 rt_do_flush(!in_softirq());
903 * We change rt_genid and let gc do the cleanup
905 static void rt_secret_rebuild(unsigned long __net
)
907 struct net
*net
= (struct net
*)__net
;
908 rt_cache_invalidate(net
);
909 mod_timer(&net
->ipv4
.rt_secret_timer
, jiffies
+ ip_rt_secret_interval
);
912 static void rt_secret_rebuild_oneshot(struct net
*net
)
914 del_timer_sync(&net
->ipv4
.rt_secret_timer
);
915 rt_cache_invalidate(net
);
916 if (ip_rt_secret_interval
) {
917 net
->ipv4
.rt_secret_timer
.expires
+= ip_rt_secret_interval
;
918 add_timer(&net
->ipv4
.rt_secret_timer
);
922 static void rt_emergency_hash_rebuild(struct net
*net
)
924 if (net_ratelimit()) {
925 printk(KERN_WARNING
"Route hash chain too long!\n");
926 printk(KERN_WARNING
"Adjust your secret_interval!\n");
929 rt_secret_rebuild_oneshot(net
);
933 Short description of GC goals.
935 We want to build algorithm, which will keep routing cache
936 at some equilibrium point, when number of aged off entries
937 is kept approximately equal to newly generated ones.
939 Current expiration strength is variable "expire".
940 We try to adjust it dynamically, so that if networking
941 is idle expires is large enough to keep enough of warm entries,
942 and when load increases it reduces to limit cache size.
945 static int rt_garbage_collect(struct dst_ops
*ops
)
947 static unsigned long expire
= RT_GC_TIMEOUT
;
948 static unsigned long last_gc
;
950 static int equilibrium
;
951 struct rtable
*rth
, **rthp
;
952 unsigned long now
= jiffies
;
956 * Garbage collection is pretty expensive,
957 * do not make it too frequently.
960 RT_CACHE_STAT_INC(gc_total
);
962 if (now
- last_gc
< ip_rt_gc_min_interval
&&
963 atomic_read(&ipv4_dst_ops
.entries
) < ip_rt_max_size
) {
964 RT_CACHE_STAT_INC(gc_ignored
);
968 /* Calculate number of entries, which we want to expire now. */
969 goal
= atomic_read(&ipv4_dst_ops
.entries
) -
970 (ip_rt_gc_elasticity
<< rt_hash_log
);
972 if (equilibrium
< ipv4_dst_ops
.gc_thresh
)
973 equilibrium
= ipv4_dst_ops
.gc_thresh
;
974 goal
= atomic_read(&ipv4_dst_ops
.entries
) - equilibrium
;
976 equilibrium
+= min_t(unsigned int, goal
>> 1, rt_hash_mask
+ 1);
977 goal
= atomic_read(&ipv4_dst_ops
.entries
) - equilibrium
;
980 /* We are in dangerous area. Try to reduce cache really
983 goal
= max_t(unsigned int, goal
>> 1, rt_hash_mask
+ 1);
984 equilibrium
= atomic_read(&ipv4_dst_ops
.entries
) - goal
;
987 if (now
- last_gc
>= ip_rt_gc_min_interval
)
998 for (i
= rt_hash_mask
, k
= rover
; i
>= 0; i
--) {
999 unsigned long tmo
= expire
;
1001 k
= (k
+ 1) & rt_hash_mask
;
1002 rthp
= &rt_hash_table
[k
].chain
;
1003 spin_lock_bh(rt_hash_lock_addr(k
));
1004 while ((rth
= *rthp
) != NULL
) {
1005 if (!rt_is_expired(rth
) &&
1006 !rt_may_expire(rth
, tmo
, expire
)) {
1008 rthp
= &rth
->u
.dst
.rt_next
;
1011 *rthp
= rth
->u
.dst
.rt_next
;
1015 spin_unlock_bh(rt_hash_lock_addr(k
));
1024 /* Goal is not achieved. We stop process if:
1026 - if expire reduced to zero. Otherwise, expire is halfed.
1027 - if table is not full.
1028 - if we are called from interrupt.
1029 - jiffies check is just fallback/debug loop breaker.
1030 We will not spin here for long time in any case.
1033 RT_CACHE_STAT_INC(gc_goal_miss
);
1039 #if RT_CACHE_DEBUG >= 2
1040 printk(KERN_DEBUG
"expire>> %u %d %d %d\n", expire
,
1041 atomic_read(&ipv4_dst_ops
.entries
), goal
, i
);
1044 if (atomic_read(&ipv4_dst_ops
.entries
) < ip_rt_max_size
)
1046 } while (!in_softirq() && time_before_eq(jiffies
, now
));
1048 if (atomic_read(&ipv4_dst_ops
.entries
) < ip_rt_max_size
)
1050 if (net_ratelimit())
1051 printk(KERN_WARNING
"dst cache overflow\n");
1052 RT_CACHE_STAT_INC(gc_dst_overflow
);
1056 expire
+= ip_rt_gc_min_interval
;
1057 if (expire
> ip_rt_gc_timeout
||
1058 atomic_read(&ipv4_dst_ops
.entries
) < ipv4_dst_ops
.gc_thresh
)
1059 expire
= ip_rt_gc_timeout
;
1060 #if RT_CACHE_DEBUG >= 2
1061 printk(KERN_DEBUG
"expire++ %u %d %d %d\n", expire
,
1062 atomic_read(&ipv4_dst_ops
.entries
), goal
, rover
);
1067 static int rt_intern_hash(unsigned hash
, struct rtable
*rt
, struct rtable
**rp
)
1069 struct rtable
*rth
, **rthp
;
1071 struct rtable
*cand
, **candp
;
1074 int attempts
= !in_softirq();
1078 min_score
= ~(u32
)0;
1083 if (!rt_caching(dev_net(rt
->u
.dst
.dev
))) {
1085 * If we're not caching, just tell the caller we
1086 * were successful and don't touch the route. The
1087 * caller hold the sole reference to the cache entry, and
1088 * it will be released when the caller is done with it.
1089 * If we drop it here, the callers have no way to resolve routes
1090 * when we're not caching. Instead, just point *rp at rt, so
1091 * the caller gets a single use out of the route
1092 * Note that we do rt_free on this new route entry, so that
1093 * once its refcount hits zero, we are still able to reap it
1095 * Note also the rt_free uses call_rcu. We don't actually
1096 * need rcu protection here, this is just our path to get
1097 * on the route gc list.
1100 if (rt
->rt_type
== RTN_UNICAST
|| rt
->fl
.iif
== 0) {
1101 int err
= arp_bind_neighbour(&rt
->u
.dst
);
1103 if (net_ratelimit())
1105 "Neighbour table failure & not caching routes.\n");
1115 rthp
= &rt_hash_table
[hash
].chain
;
1117 spin_lock_bh(rt_hash_lock_addr(hash
));
1118 while ((rth
= *rthp
) != NULL
) {
1119 if (rt_is_expired(rth
)) {
1120 *rthp
= rth
->u
.dst
.rt_next
;
1124 if (compare_keys(&rth
->fl
, &rt
->fl
) && compare_netns(rth
, rt
)) {
1126 *rthp
= rth
->u
.dst
.rt_next
;
1128 * Since lookup is lockfree, the deletion
1129 * must be visible to another weakly ordered CPU before
1130 * the insertion at the start of the hash chain.
1132 rcu_assign_pointer(rth
->u
.dst
.rt_next
,
1133 rt_hash_table
[hash
].chain
);
1135 * Since lookup is lockfree, the update writes
1136 * must be ordered for consistency on SMP.
1138 rcu_assign_pointer(rt_hash_table
[hash
].chain
, rth
);
1140 dst_use(&rth
->u
.dst
, now
);
1141 spin_unlock_bh(rt_hash_lock_addr(hash
));
1148 if (!atomic_read(&rth
->u
.dst
.__refcnt
)) {
1149 u32 score
= rt_score(rth
);
1151 if (score
<= min_score
) {
1160 rthp
= &rth
->u
.dst
.rt_next
;
1164 /* ip_rt_gc_elasticity used to be average length of chain
1165 * length, when exceeded gc becomes really aggressive.
1167 * The second limit is less certain. At the moment it allows
1168 * only 2 entries per bucket. We will see.
1170 if (chain_length
> ip_rt_gc_elasticity
) {
1171 *candp
= cand
->u
.dst
.rt_next
;
1175 if (chain_length
> rt_chain_length_max
) {
1176 struct net
*net
= dev_net(rt
->u
.dst
.dev
);
1177 int num
= ++net
->ipv4
.current_rt_cache_rebuild_count
;
1178 if (!rt_caching(dev_net(rt
->u
.dst
.dev
))) {
1179 printk(KERN_WARNING
"%s: %d rebuilds is over limit, route caching disabled\n",
1180 rt
->u
.dst
.dev
->name
, num
);
1182 rt_emergency_hash_rebuild(dev_net(rt
->u
.dst
.dev
));
1186 /* Try to bind route to arp only if it is output
1187 route or unicast forwarding path.
1189 if (rt
->rt_type
== RTN_UNICAST
|| rt
->fl
.iif
== 0) {
1190 int err
= arp_bind_neighbour(&rt
->u
.dst
);
1192 spin_unlock_bh(rt_hash_lock_addr(hash
));
1194 if (err
!= -ENOBUFS
) {
1199 /* Neighbour tables are full and nothing
1200 can be released. Try to shrink route cache,
1201 it is most likely it holds some neighbour records.
1203 if (attempts
-- > 0) {
1204 int saved_elasticity
= ip_rt_gc_elasticity
;
1205 int saved_int
= ip_rt_gc_min_interval
;
1206 ip_rt_gc_elasticity
= 1;
1207 ip_rt_gc_min_interval
= 0;
1208 rt_garbage_collect(&ipv4_dst_ops
);
1209 ip_rt_gc_min_interval
= saved_int
;
1210 ip_rt_gc_elasticity
= saved_elasticity
;
1214 if (net_ratelimit())
1215 printk(KERN_WARNING
"Neighbour table overflow.\n");
1221 rt
->u
.dst
.rt_next
= rt_hash_table
[hash
].chain
;
1223 #if RT_CACHE_DEBUG >= 2
1224 if (rt
->u
.dst
.rt_next
) {
1226 printk(KERN_DEBUG
"rt_cache @%02x: %pI4",
1228 for (trt
= rt
->u
.dst
.rt_next
; trt
; trt
= trt
->u
.dst
.rt_next
)
1229 printk(" . %pI4", &trt
->rt_dst
);
1234 * Since lookup is lockfree, we must make sure
1235 * previous writes to rt are comitted to memory
1236 * before making rt visible to other CPUS.
1238 rcu_assign_pointer(rt_hash_table
[hash
].chain
, rt
);
1240 spin_unlock_bh(rt_hash_lock_addr(hash
));
1247 void rt_bind_peer(struct rtable
*rt
, int create
)
1249 static DEFINE_SPINLOCK(rt_peer_lock
);
1250 struct inet_peer
*peer
;
1252 peer
= inet_getpeer(rt
->rt_dst
, create
);
1254 spin_lock_bh(&rt_peer_lock
);
1255 if (rt
->peer
== NULL
) {
1259 spin_unlock_bh(&rt_peer_lock
);
1265 * Peer allocation may fail only in serious out-of-memory conditions. However
1266 * we still can generate some output.
1267 * Random ID selection looks a bit dangerous because we have no chances to
1268 * select ID being unique in a reasonable period of time.
1269 * But broken packet identifier may be better than no packet at all.
1271 static void ip_select_fb_ident(struct iphdr
*iph
)
1273 static DEFINE_SPINLOCK(ip_fb_id_lock
);
1274 static u32 ip_fallback_id
;
1277 spin_lock_bh(&ip_fb_id_lock
);
1278 salt
= secure_ip_id((__force __be32
)ip_fallback_id
^ iph
->daddr
);
1279 iph
->id
= htons(salt
& 0xFFFF);
1280 ip_fallback_id
= salt
;
1281 spin_unlock_bh(&ip_fb_id_lock
);
1284 void __ip_select_ident(struct iphdr
*iph
, struct dst_entry
*dst
, int more
)
1286 struct rtable
*rt
= (struct rtable
*) dst
;
1289 if (rt
->peer
== NULL
)
1290 rt_bind_peer(rt
, 1);
1292 /* If peer is attached to destination, it is never detached,
1293 so that we need not to grab a lock to dereference it.
1296 iph
->id
= htons(inet_getid(rt
->peer
, more
));
1300 printk(KERN_DEBUG
"rt_bind_peer(0) @%p\n",
1301 __builtin_return_address(0));
1303 ip_select_fb_ident(iph
);
1306 static void rt_del(unsigned hash
, struct rtable
*rt
)
1308 struct rtable
**rthp
, *aux
;
1310 rthp
= &rt_hash_table
[hash
].chain
;
1311 spin_lock_bh(rt_hash_lock_addr(hash
));
1313 while ((aux
= *rthp
) != NULL
) {
1314 if (aux
== rt
|| rt_is_expired(aux
)) {
1315 *rthp
= aux
->u
.dst
.rt_next
;
1319 rthp
= &aux
->u
.dst
.rt_next
;
1321 spin_unlock_bh(rt_hash_lock_addr(hash
));
1324 void ip_rt_redirect(__be32 old_gw
, __be32 daddr
, __be32 new_gw
,
1325 __be32 saddr
, struct net_device
*dev
)
1328 struct in_device
*in_dev
= in_dev_get(dev
);
1329 struct rtable
*rth
, **rthp
;
1330 __be32 skeys
[2] = { saddr
, 0 };
1331 int ikeys
[2] = { dev
->ifindex
, 0 };
1332 struct netevent_redirect netevent
;
1339 if (new_gw
== old_gw
|| !IN_DEV_RX_REDIRECTS(in_dev
)
1340 || ipv4_is_multicast(new_gw
) || ipv4_is_lbcast(new_gw
)
1341 || ipv4_is_zeronet(new_gw
))
1342 goto reject_redirect
;
1344 if (!rt_caching(net
))
1345 goto reject_redirect
;
1347 if (!IN_DEV_SHARED_MEDIA(in_dev
)) {
1348 if (!inet_addr_onlink(in_dev
, new_gw
, old_gw
))
1349 goto reject_redirect
;
1350 if (IN_DEV_SEC_REDIRECTS(in_dev
) && ip_fib_check_default(new_gw
, dev
))
1351 goto reject_redirect
;
1353 if (inet_addr_type(net
, new_gw
) != RTN_UNICAST
)
1354 goto reject_redirect
;
1357 for (i
= 0; i
< 2; i
++) {
1358 for (k
= 0; k
< 2; k
++) {
1359 unsigned hash
= rt_hash(daddr
, skeys
[i
], ikeys
[k
],
1362 rthp
=&rt_hash_table
[hash
].chain
;
1365 while ((rth
= rcu_dereference(*rthp
)) != NULL
) {
1368 if (rth
->fl
.fl4_dst
!= daddr
||
1369 rth
->fl
.fl4_src
!= skeys
[i
] ||
1370 rth
->fl
.oif
!= ikeys
[k
] ||
1372 rt_is_expired(rth
) ||
1373 !net_eq(dev_net(rth
->u
.dst
.dev
), net
)) {
1374 rthp
= &rth
->u
.dst
.rt_next
;
1378 if (rth
->rt_dst
!= daddr
||
1379 rth
->rt_src
!= saddr
||
1381 rth
->rt_gateway
!= old_gw
||
1382 rth
->u
.dst
.dev
!= dev
)
1385 dst_hold(&rth
->u
.dst
);
1388 rt
= dst_alloc(&ipv4_dst_ops
);
1395 /* Copy all the information. */
1397 rt
->u
.dst
.__use
= 1;
1398 atomic_set(&rt
->u
.dst
.__refcnt
, 1);
1399 rt
->u
.dst
.child
= NULL
;
1401 dev_hold(rt
->u
.dst
.dev
);
1403 in_dev_hold(rt
->idev
);
1404 rt
->u
.dst
.obsolete
= 0;
1405 rt
->u
.dst
.lastuse
= jiffies
;
1406 rt
->u
.dst
.path
= &rt
->u
.dst
;
1407 rt
->u
.dst
.neighbour
= NULL
;
1408 rt
->u
.dst
.hh
= NULL
;
1410 rt
->u
.dst
.xfrm
= NULL
;
1412 rt
->rt_genid
= rt_genid(net
);
1413 rt
->rt_flags
|= RTCF_REDIRECTED
;
1415 /* Gateway is different ... */
1416 rt
->rt_gateway
= new_gw
;
1418 /* Redirect received -> path was valid */
1419 dst_confirm(&rth
->u
.dst
);
1422 atomic_inc(&rt
->peer
->refcnt
);
1424 if (arp_bind_neighbour(&rt
->u
.dst
) ||
1425 !(rt
->u
.dst
.neighbour
->nud_state
&
1427 if (rt
->u
.dst
.neighbour
)
1428 neigh_event_send(rt
->u
.dst
.neighbour
, NULL
);
1434 netevent
.old
= &rth
->u
.dst
;
1435 netevent
.new = &rt
->u
.dst
;
1436 call_netevent_notifiers(NETEVENT_REDIRECT
,
1440 if (!rt_intern_hash(hash
, rt
, &rt
))
1453 #ifdef CONFIG_IP_ROUTE_VERBOSE
1454 if (IN_DEV_LOG_MARTIANS(in_dev
) && net_ratelimit())
1455 printk(KERN_INFO
"Redirect from %pI4 on %s about %pI4 ignored.\n"
1456 " Advised path = %pI4 -> %pI4\n",
1457 &old_gw
, dev
->name
, &new_gw
,
1463 static struct dst_entry
*ipv4_negative_advice(struct dst_entry
*dst
)
1465 struct rtable
*rt
= (struct rtable
*)dst
;
1466 struct dst_entry
*ret
= dst
;
1469 if (dst
->obsolete
) {
1472 } else if ((rt
->rt_flags
& RTCF_REDIRECTED
) ||
1473 rt
->u
.dst
.expires
) {
1474 unsigned hash
= rt_hash(rt
->fl
.fl4_dst
, rt
->fl
.fl4_src
,
1476 rt_genid(dev_net(dst
->dev
)));
1477 #if RT_CACHE_DEBUG >= 1
1478 printk(KERN_DEBUG
"ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1479 &rt
->rt_dst
, rt
->fl
.fl4_tos
);
/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
1504 void ip_rt_send_redirect(struct sk_buff
*skb
)
1506 struct rtable
*rt
= skb
->rtable
;
1507 struct in_device
*in_dev
= in_dev_get(rt
->u
.dst
.dev
);
1512 if (!IN_DEV_TX_REDIRECTS(in_dev
))
1515 /* No redirected packets during ip_rt_redirect_silence;
1516 * reset the algorithm.
1518 if (time_after(jiffies
, rt
->u
.dst
.rate_last
+ ip_rt_redirect_silence
))
1519 rt
->u
.dst
.rate_tokens
= 0;
1521 /* Too many ignored redirects; do not send anything
1522 * set u.dst.rate_last to the last seen redirected packet.
1524 if (rt
->u
.dst
.rate_tokens
>= ip_rt_redirect_number
) {
1525 rt
->u
.dst
.rate_last
= jiffies
;
1529 /* Check for load limit; set rate_last to the latest sent
1532 if (rt
->u
.dst
.rate_tokens
== 0 ||
1534 (rt
->u
.dst
.rate_last
+
1535 (ip_rt_redirect_load
<< rt
->u
.dst
.rate_tokens
)))) {
1536 icmp_send(skb
, ICMP_REDIRECT
, ICMP_REDIR_HOST
, rt
->rt_gateway
);
1537 rt
->u
.dst
.rate_last
= jiffies
;
1538 ++rt
->u
.dst
.rate_tokens
;
1539 #ifdef CONFIG_IP_ROUTE_VERBOSE
1540 if (IN_DEV_LOG_MARTIANS(in_dev
) &&
1541 rt
->u
.dst
.rate_tokens
== ip_rt_redirect_number
&&
1543 printk(KERN_WARNING
"host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1544 &rt
->rt_src
, rt
->rt_iif
,
1545 &rt
->rt_dst
, &rt
->rt_gateway
);
1552 static int ip_error(struct sk_buff
*skb
)
1554 struct rtable
*rt
= skb
->rtable
;
1558 switch (rt
->u
.dst
.error
) {
1563 code
= ICMP_HOST_UNREACH
;
1566 code
= ICMP_NET_UNREACH
;
1567 IP_INC_STATS_BH(dev_net(rt
->u
.dst
.dev
),
1568 IPSTATS_MIB_INNOROUTES
);
1571 code
= ICMP_PKT_FILTERED
;
1576 rt
->u
.dst
.rate_tokens
+= now
- rt
->u
.dst
.rate_last
;
1577 if (rt
->u
.dst
.rate_tokens
> ip_rt_error_burst
)
1578 rt
->u
.dst
.rate_tokens
= ip_rt_error_burst
;
1579 rt
->u
.dst
.rate_last
= now
;
1580 if (rt
->u
.dst
.rate_tokens
>= ip_rt_error_cost
) {
1581 rt
->u
.dst
.rate_tokens
-= ip_rt_error_cost
;
1582 icmp_send(skb
, ICMP_DEST_UNREACH
, code
, 0);
1585 out
: kfree_skb(skb
);
/*
 *	RFC 1191 plateau table used when a Frag-Needed message arrives
 *	without a usable next-hop MTU.
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

/*
 * Return the largest plateau strictly below @old_mtu, or 68 (the
 * minimum IPv4 MTU) when old_mtu is at or below the smallest plateau.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < (int)(sizeof(mtu_plateau) / sizeof(mtu_plateau[0])); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];

	return 68;
}
1607 unsigned short ip_rt_frag_needed(struct net
*net
, struct iphdr
*iph
,
1608 unsigned short new_mtu
,
1609 struct net_device
*dev
)
1612 unsigned short old_mtu
= ntohs(iph
->tot_len
);
1614 int ikeys
[2] = { dev
->ifindex
, 0 };
1615 __be32 skeys
[2] = { iph
->saddr
, 0, };
1616 __be32 daddr
= iph
->daddr
;
1617 unsigned short est_mtu
= 0;
1619 if (ipv4_config
.no_pmtu_disc
)
1622 for (k
= 0; k
< 2; k
++) {
1623 for (i
= 0; i
< 2; i
++) {
1624 unsigned hash
= rt_hash(daddr
, skeys
[i
], ikeys
[k
],
1628 for (rth
= rcu_dereference(rt_hash_table
[hash
].chain
); rth
;
1629 rth
= rcu_dereference(rth
->u
.dst
.rt_next
)) {
1630 unsigned short mtu
= new_mtu
;
1632 if (rth
->fl
.fl4_dst
!= daddr
||
1633 rth
->fl
.fl4_src
!= skeys
[i
] ||
1634 rth
->rt_dst
!= daddr
||
1635 rth
->rt_src
!= iph
->saddr
||
1636 rth
->fl
.oif
!= ikeys
[k
] ||
1638 dst_metric_locked(&rth
->u
.dst
, RTAX_MTU
) ||
1639 !net_eq(dev_net(rth
->u
.dst
.dev
), net
) ||
1643 if (new_mtu
< 68 || new_mtu
>= old_mtu
) {
1645 /* BSD 4.2 compatibility hack :-( */
1647 old_mtu
>= dst_mtu(&rth
->u
.dst
) &&
1648 old_mtu
>= 68 + (iph
->ihl
<< 2))
1649 old_mtu
-= iph
->ihl
<< 2;
1651 mtu
= guess_mtu(old_mtu
);
1653 if (mtu
<= dst_mtu(&rth
->u
.dst
)) {
1654 if (mtu
< dst_mtu(&rth
->u
.dst
)) {
1655 dst_confirm(&rth
->u
.dst
);
1656 if (mtu
< ip_rt_min_pmtu
) {
1657 mtu
= ip_rt_min_pmtu
;
1658 rth
->u
.dst
.metrics
[RTAX_LOCK
-1] |=
1661 rth
->u
.dst
.metrics
[RTAX_MTU
-1] = mtu
;
1662 dst_set_expires(&rth
->u
.dst
,
1671 return est_mtu
? : new_mtu
;
1674 static void ip_rt_update_pmtu(struct dst_entry
*dst
, u32 mtu
)
1676 if (dst_mtu(dst
) > mtu
&& mtu
>= 68 &&
1677 !(dst_metric_locked(dst
, RTAX_MTU
))) {
1678 if (mtu
< ip_rt_min_pmtu
) {
1679 mtu
= ip_rt_min_pmtu
;
1680 dst
->metrics
[RTAX_LOCK
-1] |= (1 << RTAX_MTU
);
1682 dst
->metrics
[RTAX_MTU
-1] = mtu
;
1683 dst_set_expires(dst
, ip_rt_mtu_expires
);
1684 call_netevent_notifiers(NETEVENT_PMTU_UPDATE
, dst
);
1688 static struct dst_entry
*ipv4_dst_check(struct dst_entry
*dst
, u32 cookie
)
1693 static void ipv4_dst_destroy(struct dst_entry
*dst
)
1695 struct rtable
*rt
= (struct rtable
*) dst
;
1696 struct inet_peer
*peer
= rt
->peer
;
1697 struct in_device
*idev
= rt
->idev
;
1710 static void ipv4_dst_ifdown(struct dst_entry
*dst
, struct net_device
*dev
,
1713 struct rtable
*rt
= (struct rtable
*) dst
;
1714 struct in_device
*idev
= rt
->idev
;
1715 if (dev
!= dev_net(dev
)->loopback_dev
&& idev
&& idev
->dev
== dev
) {
1716 struct in_device
*loopback_idev
=
1717 in_dev_get(dev_net(dev
)->loopback_dev
);
1718 if (loopback_idev
) {
1719 rt
->idev
= loopback_idev
;
1725 static void ipv4_link_failure(struct sk_buff
*skb
)
1729 icmp_send(skb
, ICMP_DEST_UNREACH
, ICMP_HOST_UNREACH
, 0);
1733 dst_set_expires(&rt
->u
.dst
, 0);
1736 static int ip_rt_bug(struct sk_buff
*skb
)
1738 printk(KERN_DEBUG
"ip_rt_bug: %pI4 -> %pI4, %s\n",
1739 &ip_hdr(skb
)->saddr
, &ip_hdr(skb
)->daddr
,
1740 skb
->dev
? skb
->dev
->name
: "?");
1746 We do not cache source address of outgoing interface,
1747 because it is used only by IP RR, TS and SRR options,
1748 so that it out of fast path.
1750 BTW remember: "addr" is allowed to be not aligned
1754 void ip_rt_get_source(u8
*addr
, struct rtable
*rt
)
1757 struct fib_result res
;
1759 if (rt
->fl
.iif
== 0)
1761 else if (fib_lookup(dev_net(rt
->u
.dst
.dev
), &rt
->fl
, &res
) == 0) {
1762 src
= FIB_RES_PREFSRC(res
);
1765 src
= inet_select_addr(rt
->u
.dst
.dev
, rt
->rt_gateway
,
1767 memcpy(addr
, &src
, 4);
#ifdef CONFIG_NET_CLS_ROUTE
/* Merge @tag into the route's traffic-class id, filling only the halves
 * (realm from/to) that are still unset. */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->u.dst.tclassid & 0xFFFF))
		rt->u.dst.tclassid |= tag & 0xFFFF;
	if (!(rt->u.dst.tclassid & 0xFFFF0000))
		rt->u.dst.tclassid |= tag & 0xFFFF0000;
}
#endif
1780 static void rt_set_nexthop(struct rtable
*rt
, struct fib_result
*res
, u32 itag
)
1782 struct fib_info
*fi
= res
->fi
;
1785 if (FIB_RES_GW(*res
) &&
1786 FIB_RES_NH(*res
).nh_scope
== RT_SCOPE_LINK
)
1787 rt
->rt_gateway
= FIB_RES_GW(*res
);
1788 memcpy(rt
->u
.dst
.metrics
, fi
->fib_metrics
,
1789 sizeof(rt
->u
.dst
.metrics
));
1790 if (fi
->fib_mtu
== 0) {
1791 rt
->u
.dst
.metrics
[RTAX_MTU
-1] = rt
->u
.dst
.dev
->mtu
;
1792 if (dst_metric_locked(&rt
->u
.dst
, RTAX_MTU
) &&
1793 rt
->rt_gateway
!= rt
->rt_dst
&&
1794 rt
->u
.dst
.dev
->mtu
> 576)
1795 rt
->u
.dst
.metrics
[RTAX_MTU
-1] = 576;
1797 #ifdef CONFIG_NET_CLS_ROUTE
1798 rt
->u
.dst
.tclassid
= FIB_RES_NH(*res
).nh_tclassid
;
1801 rt
->u
.dst
.metrics
[RTAX_MTU
-1]= rt
->u
.dst
.dev
->mtu
;
1803 if (dst_metric(&rt
->u
.dst
, RTAX_HOPLIMIT
) == 0)
1804 rt
->u
.dst
.metrics
[RTAX_HOPLIMIT
-1] = sysctl_ip_default_ttl
;
1805 if (dst_mtu(&rt
->u
.dst
) > IP_MAX_MTU
)
1806 rt
->u
.dst
.metrics
[RTAX_MTU
-1] = IP_MAX_MTU
;
1807 if (dst_metric(&rt
->u
.dst
, RTAX_ADVMSS
) == 0)
1808 rt
->u
.dst
.metrics
[RTAX_ADVMSS
-1] = max_t(unsigned int, rt
->u
.dst
.dev
->mtu
- 40,
1810 if (dst_metric(&rt
->u
.dst
, RTAX_ADVMSS
) > 65535 - 40)
1811 rt
->u
.dst
.metrics
[RTAX_ADVMSS
-1] = 65535 - 40;
1813 #ifdef CONFIG_NET_CLS_ROUTE
1814 #ifdef CONFIG_IP_MULTIPLE_TABLES
1815 set_class_tag(rt
, fib_rules_tclass(res
));
1817 set_class_tag(rt
, itag
);
1819 rt
->rt_type
= res
->type
;
1822 static int ip_route_input_mc(struct sk_buff
*skb
, __be32 daddr
, __be32 saddr
,
1823 u8 tos
, struct net_device
*dev
, int our
)
1828 struct in_device
*in_dev
= in_dev_get(dev
);
1831 /* Primary sanity checks. */
1836 if (ipv4_is_multicast(saddr
) || ipv4_is_lbcast(saddr
) ||
1837 ipv4_is_loopback(saddr
) || skb
->protocol
!= htons(ETH_P_IP
))
1840 if (ipv4_is_zeronet(saddr
)) {
1841 if (!ipv4_is_local_multicast(daddr
))
1843 spec_dst
= inet_select_addr(dev
, 0, RT_SCOPE_LINK
);
1844 } else if (fib_validate_source(saddr
, 0, tos
, 0,
1845 dev
, &spec_dst
, &itag
) < 0)
1848 rth
= dst_alloc(&ipv4_dst_ops
);
1852 rth
->u
.dst
.output
= ip_rt_bug
;
1854 atomic_set(&rth
->u
.dst
.__refcnt
, 1);
1855 rth
->u
.dst
.flags
= DST_HOST
;
1856 if (IN_DEV_CONF_GET(in_dev
, NOPOLICY
))
1857 rth
->u
.dst
.flags
|= DST_NOPOLICY
;
1858 rth
->fl
.fl4_dst
= daddr
;
1859 rth
->rt_dst
= daddr
;
1860 rth
->fl
.fl4_tos
= tos
;
1861 rth
->fl
.mark
= skb
->mark
;
1862 rth
->fl
.fl4_src
= saddr
;
1863 rth
->rt_src
= saddr
;
1864 #ifdef CONFIG_NET_CLS_ROUTE
1865 rth
->u
.dst
.tclassid
= itag
;
1868 rth
->fl
.iif
= dev
->ifindex
;
1869 rth
->u
.dst
.dev
= init_net
.loopback_dev
;
1870 dev_hold(rth
->u
.dst
.dev
);
1871 rth
->idev
= in_dev_get(rth
->u
.dst
.dev
);
1873 rth
->rt_gateway
= daddr
;
1874 rth
->rt_spec_dst
= spec_dst
;
1875 rth
->rt_genid
= rt_genid(dev_net(dev
));
1876 rth
->rt_flags
= RTCF_MULTICAST
;
1877 rth
->rt_type
= RTN_MULTICAST
;
1879 rth
->u
.dst
.input
= ip_local_deliver
;
1880 rth
->rt_flags
|= RTCF_LOCAL
;
1883 #ifdef CONFIG_IP_MROUTE
1884 if (!ipv4_is_local_multicast(daddr
) && IN_DEV_MFORWARD(in_dev
))
1885 rth
->u
.dst
.input
= ip_mr_input
;
1887 RT_CACHE_STAT_INC(in_slow_mc
);
1890 hash
= rt_hash(daddr
, saddr
, dev
->ifindex
, rt_genid(dev_net(dev
)));
1891 return rt_intern_hash(hash
, rth
, &skb
->rtable
);
1903 static void ip_handle_martian_source(struct net_device
*dev
,
1904 struct in_device
*in_dev
,
1905 struct sk_buff
*skb
,
1909 RT_CACHE_STAT_INC(in_martian_src
);
1910 #ifdef CONFIG_IP_ROUTE_VERBOSE
1911 if (IN_DEV_LOG_MARTIANS(in_dev
) && net_ratelimit()) {
1913 * RFC1812 recommendation, if source is martian,
1914 * the only hint is MAC header.
1916 printk(KERN_WARNING
"martian source %pI4 from %pI4, on dev %s\n",
1917 &daddr
, &saddr
, dev
->name
);
1918 if (dev
->hard_header_len
&& skb_mac_header_was_set(skb
)) {
1920 const unsigned char *p
= skb_mac_header(skb
);
1921 printk(KERN_WARNING
"ll header: ");
1922 for (i
= 0; i
< dev
->hard_header_len
; i
++, p
++) {
1924 if (i
< (dev
->hard_header_len
- 1))
1933 static int __mkroute_input(struct sk_buff
*skb
,
1934 struct fib_result
*res
,
1935 struct in_device
*in_dev
,
1936 __be32 daddr
, __be32 saddr
, u32 tos
,
1937 struct rtable
**result
)
1942 struct in_device
*out_dev
;
1947 /* get a working reference to the output device */
1948 out_dev
= in_dev_get(FIB_RES_DEV(*res
));
1949 if (out_dev
== NULL
) {
1950 if (net_ratelimit())
1951 printk(KERN_CRIT
"Bug in ip_route_input" \
1952 "_slow(). Please, report\n");
1957 err
= fib_validate_source(saddr
, daddr
, tos
, FIB_RES_OIF(*res
),
1958 in_dev
->dev
, &spec_dst
, &itag
);
1960 ip_handle_martian_source(in_dev
->dev
, in_dev
, skb
, daddr
,
1968 flags
|= RTCF_DIRECTSRC
;
1970 if (out_dev
== in_dev
&& err
&&
1971 (IN_DEV_SHARED_MEDIA(out_dev
) ||
1972 inet_addr_onlink(out_dev
, saddr
, FIB_RES_GW(*res
))))
1973 flags
|= RTCF_DOREDIRECT
;
1975 if (skb
->protocol
!= htons(ETH_P_IP
)) {
1976 /* Not IP (i.e. ARP). Do not create route, if it is
1977 * invalid for proxy arp. DNAT routes are always valid.
1979 if (out_dev
== in_dev
) {
1986 rth
= dst_alloc(&ipv4_dst_ops
);
1992 atomic_set(&rth
->u
.dst
.__refcnt
, 1);
1993 rth
->u
.dst
.flags
= DST_HOST
;
1994 if (IN_DEV_CONF_GET(in_dev
, NOPOLICY
))
1995 rth
->u
.dst
.flags
|= DST_NOPOLICY
;
1996 if (IN_DEV_CONF_GET(out_dev
, NOXFRM
))
1997 rth
->u
.dst
.flags
|= DST_NOXFRM
;
1998 rth
->fl
.fl4_dst
= daddr
;
1999 rth
->rt_dst
= daddr
;
2000 rth
->fl
.fl4_tos
= tos
;
2001 rth
->fl
.mark
= skb
->mark
;
2002 rth
->fl
.fl4_src
= saddr
;
2003 rth
->rt_src
= saddr
;
2004 rth
->rt_gateway
= daddr
;
2006 rth
->fl
.iif
= in_dev
->dev
->ifindex
;
2007 rth
->u
.dst
.dev
= (out_dev
)->dev
;
2008 dev_hold(rth
->u
.dst
.dev
);
2009 rth
->idev
= in_dev_get(rth
->u
.dst
.dev
);
2011 rth
->rt_spec_dst
= spec_dst
;
2013 rth
->u
.dst
.input
= ip_forward
;
2014 rth
->u
.dst
.output
= ip_output
;
2015 rth
->rt_genid
= rt_genid(dev_net(rth
->u
.dst
.dev
));
2017 rt_set_nexthop(rth
, res
, itag
);
2019 rth
->rt_flags
= flags
;
2024 /* release the working reference to the output device */
2025 in_dev_put(out_dev
);
2029 static int ip_mkroute_input(struct sk_buff
*skb
,
2030 struct fib_result
*res
,
2031 const struct flowi
*fl
,
2032 struct in_device
*in_dev
,
2033 __be32 daddr
, __be32 saddr
, u32 tos
)
2035 struct rtable
* rth
= NULL
;
2039 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2040 if (res
->fi
&& res
->fi
->fib_nhs
> 1 && fl
->oif
== 0)
2041 fib_select_multipath(fl
, res
);
2044 /* create a routing cache entry */
2045 err
= __mkroute_input(skb
, res
, in_dev
, daddr
, saddr
, tos
, &rth
);
2049 /* put it into the cache */
2050 hash
= rt_hash(daddr
, saddr
, fl
->iif
,
2051 rt_genid(dev_net(rth
->u
.dst
.dev
)));
2052 return rt_intern_hash(hash
, rth
, &skb
->rtable
);
2056 * NOTE. We drop all the packets that has local source
2057 * addresses, because every properly looped back packet
2058 * must have correct destination already attached by output routine.
2060 * Such approach solves two big problems:
2061 * 1. Not simplex devices are handled properly.
2062 * 2. IP spoofing attempts are filtered with 100% of guarantee.
2065 static int ip_route_input_slow(struct sk_buff
*skb
, __be32 daddr
, __be32 saddr
,
2066 u8 tos
, struct net_device
*dev
)
2068 struct fib_result res
;
2069 struct in_device
*in_dev
= in_dev_get(dev
);
2070 struct flowi fl
= { .nl_u
= { .ip4_u
=
2074 .scope
= RT_SCOPE_UNIVERSE
,
2077 .iif
= dev
->ifindex
};
2080 struct rtable
* rth
;
2085 struct net
* net
= dev_net(dev
);
2087 /* IP on this device is disabled. */
2092 /* Check for the most weird martians, which can be not detected
2096 if (ipv4_is_multicast(saddr
) || ipv4_is_lbcast(saddr
) ||
2097 ipv4_is_loopback(saddr
))
2098 goto martian_source
;
2100 if (daddr
== htonl(0xFFFFFFFF) || (saddr
== 0 && daddr
== 0))
2103 /* Accept zero addresses only to limited broadcast;
2104 * I even do not know to fix it or not. Waiting for complains :-)
2106 if (ipv4_is_zeronet(saddr
))
2107 goto martian_source
;
2109 if (ipv4_is_lbcast(daddr
) || ipv4_is_zeronet(daddr
) ||
2110 ipv4_is_loopback(daddr
))
2111 goto martian_destination
;
2114 * Now we are ready to route packet.
2116 if ((err
= fib_lookup(net
, &fl
, &res
)) != 0) {
2117 if (!IN_DEV_FORWARD(in_dev
))
2123 RT_CACHE_STAT_INC(in_slow_tot
);
2125 if (res
.type
== RTN_BROADCAST
)
2128 if (res
.type
== RTN_LOCAL
) {
2130 result
= fib_validate_source(saddr
, daddr
, tos
,
2131 net
->loopback_dev
->ifindex
,
2132 dev
, &spec_dst
, &itag
);
2134 goto martian_source
;
2136 flags
|= RTCF_DIRECTSRC
;
2141 if (!IN_DEV_FORWARD(in_dev
))
2143 if (res
.type
!= RTN_UNICAST
)
2144 goto martian_destination
;
2146 err
= ip_mkroute_input(skb
, &res
, &fl
, in_dev
, daddr
, saddr
, tos
);
2154 if (skb
->protocol
!= htons(ETH_P_IP
))
2157 if (ipv4_is_zeronet(saddr
))
2158 spec_dst
= inet_select_addr(dev
, 0, RT_SCOPE_LINK
);
2160 err
= fib_validate_source(saddr
, 0, tos
, 0, dev
, &spec_dst
,
2163 goto martian_source
;
2165 flags
|= RTCF_DIRECTSRC
;
2167 flags
|= RTCF_BROADCAST
;
2168 res
.type
= RTN_BROADCAST
;
2169 RT_CACHE_STAT_INC(in_brd
);
2172 rth
= dst_alloc(&ipv4_dst_ops
);
2176 rth
->u
.dst
.output
= ip_rt_bug
;
2177 rth
->rt_genid
= rt_genid(net
);
2179 atomic_set(&rth
->u
.dst
.__refcnt
, 1);
2180 rth
->u
.dst
.flags
= DST_HOST
;
2181 if (IN_DEV_CONF_GET(in_dev
, NOPOLICY
))
2182 rth
->u
.dst
.flags
|= DST_NOPOLICY
;
2183 rth
->fl
.fl4_dst
= daddr
;
2184 rth
->rt_dst
= daddr
;
2185 rth
->fl
.fl4_tos
= tos
;
2186 rth
->fl
.mark
= skb
->mark
;
2187 rth
->fl
.fl4_src
= saddr
;
2188 rth
->rt_src
= saddr
;
2189 #ifdef CONFIG_NET_CLS_ROUTE
2190 rth
->u
.dst
.tclassid
= itag
;
2193 rth
->fl
.iif
= dev
->ifindex
;
2194 rth
->u
.dst
.dev
= net
->loopback_dev
;
2195 dev_hold(rth
->u
.dst
.dev
);
2196 rth
->idev
= in_dev_get(rth
->u
.dst
.dev
);
2197 rth
->rt_gateway
= daddr
;
2198 rth
->rt_spec_dst
= spec_dst
;
2199 rth
->u
.dst
.input
= ip_local_deliver
;
2200 rth
->rt_flags
= flags
|RTCF_LOCAL
;
2201 if (res
.type
== RTN_UNREACHABLE
) {
2202 rth
->u
.dst
.input
= ip_error
;
2203 rth
->u
.dst
.error
= -err
;
2204 rth
->rt_flags
&= ~RTCF_LOCAL
;
2206 rth
->rt_type
= res
.type
;
2207 hash
= rt_hash(daddr
, saddr
, fl
.iif
, rt_genid(net
));
2208 err
= rt_intern_hash(hash
, rth
, &skb
->rtable
);
2212 RT_CACHE_STAT_INC(in_no_route
);
2213 spec_dst
= inet_select_addr(dev
, 0, RT_SCOPE_UNIVERSE
);
2214 res
.type
= RTN_UNREACHABLE
;
2220 * Do not cache martian addresses: they should be logged (RFC1812)
2222 martian_destination
:
2223 RT_CACHE_STAT_INC(in_martian_dst
);
2224 #ifdef CONFIG_IP_ROUTE_VERBOSE
2225 if (IN_DEV_LOG_MARTIANS(in_dev
) && net_ratelimit())
2226 printk(KERN_WARNING
"martian destination %pI4 from %pI4, dev %s\n",
2227 &daddr
, &saddr
, dev
->name
);
2231 err
= -EHOSTUNREACH
;
2243 ip_handle_martian_source(dev
, in_dev
, skb
, daddr
, saddr
);
2247 int ip_route_input(struct sk_buff
*skb
, __be32 daddr
, __be32 saddr
,
2248 u8 tos
, struct net_device
*dev
)
2250 struct rtable
* rth
;
2252 int iif
= dev
->ifindex
;
2257 if (!rt_caching(net
))
2260 tos
&= IPTOS_RT_MASK
;
2261 hash
= rt_hash(daddr
, saddr
, iif
, rt_genid(net
));
2264 for (rth
= rcu_dereference(rt_hash_table
[hash
].chain
); rth
;
2265 rth
= rcu_dereference(rth
->u
.dst
.rt_next
)) {
2266 if (((rth
->fl
.fl4_dst
^ daddr
) |
2267 (rth
->fl
.fl4_src
^ saddr
) |
2268 (rth
->fl
.iif
^ iif
) |
2270 (rth
->fl
.fl4_tos
^ tos
)) == 0 &&
2271 rth
->fl
.mark
== skb
->mark
&&
2272 net_eq(dev_net(rth
->u
.dst
.dev
), net
) &&
2273 !rt_is_expired(rth
)) {
2274 dst_use(&rth
->u
.dst
, jiffies
);
2275 RT_CACHE_STAT_INC(in_hit
);
2280 RT_CACHE_STAT_INC(in_hlist_search
);
2285 /* Multicast recognition logic is moved from route cache to here.
2286 The problem was that too many Ethernet cards have broken/missing
2287 hardware multicast filters :-( As result the host on multicasting
2288 network acquires a lot of useless route cache entries, sort of
2289 SDR messages from all the world. Now we try to get rid of them.
2290 Really, provided software IP multicast filter is organized
2291 reasonably (at least, hashed), it does not result in a slowdown
2292 comparing with route cache reject entries.
2293 Note, that multicast routers are not affected, because
2294 route cache entry is created eventually.
2296 if (ipv4_is_multicast(daddr
)) {
2297 struct in_device
*in_dev
;
2300 if ((in_dev
= __in_dev_get_rcu(dev
)) != NULL
) {
2301 int our
= ip_check_mc(in_dev
, daddr
, saddr
,
2302 ip_hdr(skb
)->protocol
);
2304 #ifdef CONFIG_IP_MROUTE
2305 || (!ipv4_is_local_multicast(daddr
) &&
2306 IN_DEV_MFORWARD(in_dev
))
2310 return ip_route_input_mc(skb
, daddr
, saddr
,
2317 return ip_route_input_slow(skb
, daddr
, saddr
, tos
, dev
);
2320 static int __mkroute_output(struct rtable
**result
,
2321 struct fib_result
*res
,
2322 const struct flowi
*fl
,
2323 const struct flowi
*oldflp
,
2324 struct net_device
*dev_out
,
2328 struct in_device
*in_dev
;
2329 u32 tos
= RT_FL_TOS(oldflp
);
2332 if (ipv4_is_loopback(fl
->fl4_src
) && !(dev_out
->flags
&IFF_LOOPBACK
))
2335 if (fl
->fl4_dst
== htonl(0xFFFFFFFF))
2336 res
->type
= RTN_BROADCAST
;
2337 else if (ipv4_is_multicast(fl
->fl4_dst
))
2338 res
->type
= RTN_MULTICAST
;
2339 else if (ipv4_is_lbcast(fl
->fl4_dst
) || ipv4_is_zeronet(fl
->fl4_dst
))
2342 if (dev_out
->flags
& IFF_LOOPBACK
)
2343 flags
|= RTCF_LOCAL
;
2345 /* get work reference to inet device */
2346 in_dev
= in_dev_get(dev_out
);
2350 if (res
->type
== RTN_BROADCAST
) {
2351 flags
|= RTCF_BROADCAST
| RTCF_LOCAL
;
2353 fib_info_put(res
->fi
);
2356 } else if (res
->type
== RTN_MULTICAST
) {
2357 flags
|= RTCF_MULTICAST
|RTCF_LOCAL
;
2358 if (!ip_check_mc(in_dev
, oldflp
->fl4_dst
, oldflp
->fl4_src
,
2360 flags
&= ~RTCF_LOCAL
;
2361 /* If multicast route do not exist use
2362 default one, but do not gateway in this case.
2365 if (res
->fi
&& res
->prefixlen
< 4) {
2366 fib_info_put(res
->fi
);
2372 rth
= dst_alloc(&ipv4_dst_ops
);
2378 atomic_set(&rth
->u
.dst
.__refcnt
, 1);
2379 rth
->u
.dst
.flags
= DST_HOST
;
2380 if (IN_DEV_CONF_GET(in_dev
, NOXFRM
))
2381 rth
->u
.dst
.flags
|= DST_NOXFRM
;
2382 if (IN_DEV_CONF_GET(in_dev
, NOPOLICY
))
2383 rth
->u
.dst
.flags
|= DST_NOPOLICY
;
2385 rth
->fl
.fl4_dst
= oldflp
->fl4_dst
;
2386 rth
->fl
.fl4_tos
= tos
;
2387 rth
->fl
.fl4_src
= oldflp
->fl4_src
;
2388 rth
->fl
.oif
= oldflp
->oif
;
2389 rth
->fl
.mark
= oldflp
->mark
;
2390 rth
->rt_dst
= fl
->fl4_dst
;
2391 rth
->rt_src
= fl
->fl4_src
;
2392 rth
->rt_iif
= oldflp
->oif
? : dev_out
->ifindex
;
2393 /* get references to the devices that are to be hold by the routing
2395 rth
->u
.dst
.dev
= dev_out
;
2397 rth
->idev
= in_dev_get(dev_out
);
2398 rth
->rt_gateway
= fl
->fl4_dst
;
2399 rth
->rt_spec_dst
= fl
->fl4_src
;
2401 rth
->u
.dst
.output
=ip_output
;
2402 rth
->rt_genid
= rt_genid(dev_net(dev_out
));
2404 RT_CACHE_STAT_INC(out_slow_tot
);
2406 if (flags
& RTCF_LOCAL
) {
2407 rth
->u
.dst
.input
= ip_local_deliver
;
2408 rth
->rt_spec_dst
= fl
->fl4_dst
;
2410 if (flags
& (RTCF_BROADCAST
| RTCF_MULTICAST
)) {
2411 rth
->rt_spec_dst
= fl
->fl4_src
;
2412 if (flags
& RTCF_LOCAL
&&
2413 !(dev_out
->flags
& IFF_LOOPBACK
)) {
2414 rth
->u
.dst
.output
= ip_mc_output
;
2415 RT_CACHE_STAT_INC(out_slow_mc
);
2417 #ifdef CONFIG_IP_MROUTE
2418 if (res
->type
== RTN_MULTICAST
) {
2419 if (IN_DEV_MFORWARD(in_dev
) &&
2420 !ipv4_is_local_multicast(oldflp
->fl4_dst
)) {
2421 rth
->u
.dst
.input
= ip_mr_input
;
2422 rth
->u
.dst
.output
= ip_mc_output
;
2428 rt_set_nexthop(rth
, res
, 0);
2430 rth
->rt_flags
= flags
;
2434 /* release work reference to inet device */
2440 static int ip_mkroute_output(struct rtable
**rp
,
2441 struct fib_result
*res
,
2442 const struct flowi
*fl
,
2443 const struct flowi
*oldflp
,
2444 struct net_device
*dev_out
,
2447 struct rtable
*rth
= NULL
;
2448 int err
= __mkroute_output(&rth
, res
, fl
, oldflp
, dev_out
, flags
);
2451 hash
= rt_hash(oldflp
->fl4_dst
, oldflp
->fl4_src
, oldflp
->oif
,
2452 rt_genid(dev_net(dev_out
)));
2453 err
= rt_intern_hash(hash
, rth
, rp
);
2460 * Major route resolver routine.
2463 static int ip_route_output_slow(struct net
*net
, struct rtable
**rp
,
2464 const struct flowi
*oldflp
)
2466 u32 tos
= RT_FL_TOS(oldflp
);
2467 struct flowi fl
= { .nl_u
= { .ip4_u
=
2468 { .daddr
= oldflp
->fl4_dst
,
2469 .saddr
= oldflp
->fl4_src
,
2470 .tos
= tos
& IPTOS_RT_MASK
,
2471 .scope
= ((tos
& RTO_ONLINK
) ?
2475 .mark
= oldflp
->mark
,
2476 .iif
= net
->loopback_dev
->ifindex
,
2477 .oif
= oldflp
->oif
};
2478 struct fib_result res
;
2480 struct net_device
*dev_out
= NULL
;
2486 #ifdef CONFIG_IP_MULTIPLE_TABLES
2490 if (oldflp
->fl4_src
) {
2492 if (ipv4_is_multicast(oldflp
->fl4_src
) ||
2493 ipv4_is_lbcast(oldflp
->fl4_src
) ||
2494 ipv4_is_zeronet(oldflp
->fl4_src
))
2497 /* I removed check for oif == dev_out->oif here.
2498 It was wrong for two reasons:
2499 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2500 is assigned to multiple interfaces.
2501 2. Moreover, we are allowed to send packets with saddr
2502 of another iface. --ANK
2505 if (oldflp
->oif
== 0
2506 && (ipv4_is_multicast(oldflp
->fl4_dst
) ||
2507 oldflp
->fl4_dst
== htonl(0xFFFFFFFF))) {
2508 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2509 dev_out
= ip_dev_find(net
, oldflp
->fl4_src
);
2510 if (dev_out
== NULL
)
2513 /* Special hack: user can direct multicasts
2514 and limited broadcast via necessary interface
2515 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2516 This hack is not just for fun, it allows
2517 vic,vat and friends to work.
2518 They bind socket to loopback, set ttl to zero
2519 and expect that it will work.
2520 From the viewpoint of routing cache they are broken,
2521 because we are not allowed to build multicast path
2522 with loopback source addr (look, routing cache
2523 cannot know, that ttl is zero, so that packet
2524 will not leave this host and route is valid).
2525 Luckily, this hack is good workaround.
2528 fl
.oif
= dev_out
->ifindex
;
2532 if (!(oldflp
->flags
& FLOWI_FLAG_ANYSRC
)) {
2533 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2534 dev_out
= ip_dev_find(net
, oldflp
->fl4_src
);
2535 if (dev_out
== NULL
)
2544 dev_out
= dev_get_by_index(net
, oldflp
->oif
);
2546 if (dev_out
== NULL
)
2549 /* RACE: Check return value of inet_select_addr instead. */
2550 if (__in_dev_get_rtnl(dev_out
) == NULL
) {
2552 goto out
; /* Wrong error code */
2555 if (ipv4_is_local_multicast(oldflp
->fl4_dst
) ||
2556 oldflp
->fl4_dst
== htonl(0xFFFFFFFF)) {
2558 fl
.fl4_src
= inet_select_addr(dev_out
, 0,
2563 if (ipv4_is_multicast(oldflp
->fl4_dst
))
2564 fl
.fl4_src
= inet_select_addr(dev_out
, 0,
2566 else if (!oldflp
->fl4_dst
)
2567 fl
.fl4_src
= inet_select_addr(dev_out
, 0,
2573 fl
.fl4_dst
= fl
.fl4_src
;
2575 fl
.fl4_dst
= fl
.fl4_src
= htonl(INADDR_LOOPBACK
);
2578 dev_out
= net
->loopback_dev
;
2580 fl
.oif
= net
->loopback_dev
->ifindex
;
2581 res
.type
= RTN_LOCAL
;
2582 flags
|= RTCF_LOCAL
;
2586 if (fib_lookup(net
, &fl
, &res
)) {
2589 /* Apparently, routing tables are wrong. Assume,
2590 that the destination is on link.
2593 Because we are allowed to send to iface
2594 even if it has NO routes and NO assigned
2595 addresses. When oif is specified, routing
2596 tables are looked up with only one purpose:
2597 to catch if destination is gatewayed, rather than
2598 direct. Moreover, if MSG_DONTROUTE is set,
2599 we send packet, ignoring both routing tables
2600 and ifaddr state. --ANK
2603 We could make it even if oif is unknown,
2604 likely IPv6, but we do not.
2607 if (fl
.fl4_src
== 0)
2608 fl
.fl4_src
= inet_select_addr(dev_out
, 0,
2610 res
.type
= RTN_UNICAST
;
2620 if (res
.type
== RTN_LOCAL
) {
2622 fl
.fl4_src
= fl
.fl4_dst
;
2625 dev_out
= net
->loopback_dev
;
2627 fl
.oif
= dev_out
->ifindex
;
2629 fib_info_put(res
.fi
);
2631 flags
|= RTCF_LOCAL
;
2635 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2636 if (res
.fi
->fib_nhs
> 1 && fl
.oif
== 0)
2637 fib_select_multipath(&fl
, &res
);
2640 if (!res
.prefixlen
&& res
.type
== RTN_UNICAST
&& !fl
.oif
)
2641 fib_select_default(net
, &fl
, &res
);
2644 fl
.fl4_src
= FIB_RES_PREFSRC(res
);
2648 dev_out
= FIB_RES_DEV(res
);
2650 fl
.oif
= dev_out
->ifindex
;
2654 err
= ip_mkroute_output(rp
, &res
, &fl
, oldflp
, dev_out
, flags
);
2664 int __ip_route_output_key(struct net
*net
, struct rtable
**rp
,
2665 const struct flowi
*flp
)
2670 if (!rt_caching(net
))
2673 hash
= rt_hash(flp
->fl4_dst
, flp
->fl4_src
, flp
->oif
, rt_genid(net
));
2676 for (rth
= rcu_dereference(rt_hash_table
[hash
].chain
); rth
;
2677 rth
= rcu_dereference(rth
->u
.dst
.rt_next
)) {
2678 if (rth
->fl
.fl4_dst
== flp
->fl4_dst
&&
2679 rth
->fl
.fl4_src
== flp
->fl4_src
&&
2681 rth
->fl
.oif
== flp
->oif
&&
2682 rth
->fl
.mark
== flp
->mark
&&
2683 !((rth
->fl
.fl4_tos
^ flp
->fl4_tos
) &
2684 (IPTOS_RT_MASK
| RTO_ONLINK
)) &&
2685 net_eq(dev_net(rth
->u
.dst
.dev
), net
) &&
2686 !rt_is_expired(rth
)) {
2687 dst_use(&rth
->u
.dst
, jiffies
);
2688 RT_CACHE_STAT_INC(out_hit
);
2689 rcu_read_unlock_bh();
2693 RT_CACHE_STAT_INC(out_hlist_search
);
2695 rcu_read_unlock_bh();
2698 return ip_route_output_slow(net
, rp
, flp
);
2701 EXPORT_SYMBOL_GPL(__ip_route_output_key
);
2703 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry
*dst
, u32 mtu
)
2707 static struct dst_ops ipv4_dst_blackhole_ops
= {
2709 .protocol
= cpu_to_be16(ETH_P_IP
),
2710 .destroy
= ipv4_dst_destroy
,
2711 .check
= ipv4_dst_check
,
2712 .update_pmtu
= ipv4_rt_blackhole_update_pmtu
,
2713 .entries
= ATOMIC_INIT(0),
2717 static int ipv4_dst_blackhole(struct net
*net
, struct rtable
**rp
, struct flowi
*flp
)
2719 struct rtable
*ort
= *rp
;
2720 struct rtable
*rt
= (struct rtable
*)
2721 dst_alloc(&ipv4_dst_blackhole_ops
);
2724 struct dst_entry
*new = &rt
->u
.dst
;
2726 atomic_set(&new->__refcnt
, 1);
2728 new->input
= dst_discard
;
2729 new->output
= dst_discard
;
2730 memcpy(new->metrics
, ort
->u
.dst
.metrics
, RTAX_MAX
*sizeof(u32
));
2732 new->dev
= ort
->u
.dst
.dev
;
2738 rt
->idev
= ort
->idev
;
2740 in_dev_hold(rt
->idev
);
2741 rt
->rt_genid
= rt_genid(net
);
2742 rt
->rt_flags
= ort
->rt_flags
;
2743 rt
->rt_type
= ort
->rt_type
;
2744 rt
->rt_dst
= ort
->rt_dst
;
2745 rt
->rt_src
= ort
->rt_src
;
2746 rt
->rt_iif
= ort
->rt_iif
;
2747 rt
->rt_gateway
= ort
->rt_gateway
;
2748 rt
->rt_spec_dst
= ort
->rt_spec_dst
;
2749 rt
->peer
= ort
->peer
;
2751 atomic_inc(&rt
->peer
->refcnt
);
2756 dst_release(&(*rp
)->u
.dst
);
2758 return (rt
? 0 : -ENOMEM
);
/*
 * Full output-route lookup: resolve the flow via __ip_route_output_key(),
 * back-fill the resolved source/destination into the flow, then pass the
 * result through the xfrm (IPsec) lookup.  -EREMOTE from xfrm is handled
 * by substituting a blackhole route (traffic dropped until SAs resolve).
 * NOTE(review): braces, the conditional guarding the fl4_src/fl4_dst
 * back-fill (gaps at original 2767-2772) and the return path are elided
 * by the extraction.
 */
2761 int ip_route_output_flow(struct net
*net
, struct rtable
**rp
, struct flowi
*flp
,
2762 struct sock
*sk
, int flags
)
2766 if ((err
= __ip_route_output_key(net
, rp
, flp
)) != 0)
2771 flp
->fl4_src
= (*rp
)->rt_src
;
2773 flp
->fl4_dst
= (*rp
)->rt_dst
;
2774 err
= __xfrm_lookup(net
, (struct dst_entry
**)rp
, flp
, sk
,
2775 flags
? XFRM_LOOKUP_WAIT
: 0);
2776 if (err
== -EREMOTE
)
2777 err
= ipv4_dst_blackhole(net
, rp
, flp
);
2785 EXPORT_SYMBOL_GPL(ip_route_output_flow
);
/*
 * Convenience wrapper: output-route lookup with no socket context and
 * no xfrm flags.
 */
2787 int ip_route_output_key(struct net
*net
, struct rtable
**rp
, struct flowi
*flp
)
2789 return ip_route_output_flow(net
, rp
, flp
, NULL
, 0);
/*
 * Build an RTM_NEWROUTE netlink message describing skb->rtable into skb:
 * rtmsg header (family/dst_len/tos/table/type/scope/flags), then RTA_*
 * attributes (DST, SRC, OIF, FLOW, PREFSRC, GATEWAY, metrics, IIF) and
 * the cache-info trailer (id / tcp timestamp / expiry / error) taken from
 * the dst and its inet_peer.  Multicast destinations are diverted through
 * ipmr_get_route() when multicast forwarding is enabled.  Returns the
 * nlmsg_end() result; on attribute overflow the NLA_PUT* macros jump to
 * nla_put_failure, which cancels the message.  NOTE(review): braces,
 * some declarations (r, expires at original 2795-2801) and several
 * conditional lines are elided by the extraction.
 */
2792 static int rt_fill_info(struct net
*net
,
2793 struct sk_buff
*skb
, u32 pid
, u32 seq
, int event
,
2794 int nowait
, unsigned int flags
)
2796 struct rtable
*rt
= skb
->rtable
;
2798 struct nlmsghdr
*nlh
;
2800 u32 id
= 0, ts
= 0, tsage
= 0, error
;
2802 nlh
= nlmsg_put(skb
, pid
, seq
, event
, sizeof(*r
), flags
);
2806 r
= nlmsg_data(nlh
);
2807 r
->rtm_family
= AF_INET
;
/* cache entries are always full-host routes: /32 destination */
2808 r
->rtm_dst_len
= 32;
2810 r
->rtm_tos
= rt
->fl
.fl4_tos
;
2811 r
->rtm_table
= RT_TABLE_MAIN
;
2812 NLA_PUT_U32(skb
, RTA_TABLE
, RT_TABLE_MAIN
);
2813 r
->rtm_type
= rt
->rt_type
;
2814 r
->rtm_scope
= RT_SCOPE_UNIVERSE
;
2815 r
->rtm_protocol
= RTPROT_UNSPEC
;
/* low 16 bits of rt_flags are RTCF_* internals; report RTM_F_CLONED */
2816 r
->rtm_flags
= (rt
->rt_flags
& ~0xFFFF) | RTM_F_CLONED
;
2817 if (rt
->rt_flags
& RTCF_NOTIFY
)
2818 r
->rtm_flags
|= RTM_F_NOTIFY
;
2820 NLA_PUT_BE32(skb
, RTA_DST
, rt
->rt_dst
);
2822 if (rt
->fl
.fl4_src
) {
2823 r
->rtm_src_len
= 32;
2824 NLA_PUT_BE32(skb
, RTA_SRC
, rt
->fl
.fl4_src
);
2827 NLA_PUT_U32(skb
, RTA_OIF
, rt
->u
.dst
.dev
->ifindex
);
2828 #ifdef CONFIG_NET_CLS_ROUTE
2829 if (rt
->u
.dst
.tclassid
)
2830 NLA_PUT_U32(skb
, RTA_FLOW
, rt
->u
.dst
.tclassid
);
2833 NLA_PUT_BE32(skb
, RTA_PREFSRC
, rt
->rt_spec_dst
);
2834 else if (rt
->rt_src
!= rt
->fl
.fl4_src
)
2835 NLA_PUT_BE32(skb
, RTA_PREFSRC
, rt
->rt_src
);
2837 if (rt
->rt_dst
!= rt
->rt_gateway
)
2838 NLA_PUT_BE32(skb
, RTA_GATEWAY
, rt
->rt_gateway
);
2840 if (rtnetlink_put_metrics(skb
, rt
->u
.dst
.metrics
) < 0)
2841 goto nla_put_failure
;
2843 error
= rt
->u
.dst
.error
;
2844 expires
= rt
->u
.dst
.expires
? rt
->u
.dst
.expires
- jiffies
: 0;
/* inet_peer supplies the IP id counter and cached TCP timestamps */
2846 id
= rt
->peer
->ip_id_count
;
2847 if (rt
->peer
->tcp_ts_stamp
) {
2848 ts
= rt
->peer
->tcp_ts
;
2849 tsage
= get_seconds() - rt
->peer
->tcp_ts_stamp
;
2854 #ifdef CONFIG_IP_MROUTE
2855 __be32 dst
= rt
->rt_dst
;
2857 if (ipv4_is_multicast(dst
) && !ipv4_is_local_multicast(dst
) &&
2858 IPV4_DEVCONF_ALL(net
, MC_FORWARDING
)) {
2859 int err
= ipmr_get_route(net
, skb
, r
, nowait
);
2864 goto nla_put_failure
;
2866 if (err
== -EMSGSIZE
)
2867 goto nla_put_failure
;
2873 NLA_PUT_U32(skb
, RTA_IIF
, rt
->fl
.iif
);
2876 if (rtnl_put_cacheinfo(skb
, &rt
->u
.dst
, id
, ts
, tsage
,
2877 expires
, error
) < 0)
2878 goto nla_put_failure
;
2880 return nlmsg_end(skb
, nlh
);
/* nla_put_failure: undo the partially-built message */
2883 nlmsg_cancel(skb
, nlh
);
/*
 * RTM_GETROUTE netlink handler: parse RTA_SRC/RTA_DST/RTA_IIF/RTA_OIF
 * from the request, build a dummy skb with a minimal IP header, then run
 * either an input-path lookup (ip_route_input, when an input interface
 * was given) or an output-path lookup (ip_route_output_key), serialize
 * the result with rt_fill_info() and unicast it back to the requester.
 * NOTE(review): braces, several declarations (rtm, src/dst/iif, fl, err)
 * and error-handling lines are elided by the extraction (gaps at e.g.
 * original 2888-2898, 2906-2910, 2929-2933, 2943-2948).
 */
2887 static int inet_rtm_getroute(struct sk_buff
*in_skb
, struct nlmsghdr
* nlh
, void *arg
)
2889 struct net
*net
= sock_net(in_skb
->sk
);
2891 struct nlattr
*tb
[RTA_MAX
+1];
2892 struct rtable
*rt
= NULL
;
2897 struct sk_buff
*skb
;
2899 err
= nlmsg_parse(nlh
, sizeof(*rtm
), tb
, RTA_MAX
, rtm_ipv4_policy
);
2903 rtm
= nlmsg_data(nlh
);
2905 skb
= alloc_skb(NLMSG_GOODSIZE
, GFP_KERNEL
);
2911 /* Reserve room for dummy headers, this skb can pass
2912 through good chunk of routing engine.
*/
2914 skb_reset_mac_header(skb
);
2915 skb_reset_network_header(skb
);
2917 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2918 ip_hdr(skb
)->protocol
= IPPROTO_ICMP
;
2919 skb_reserve(skb
, MAX_HEADER
+ sizeof(struct iphdr
));
2921 src
= tb
[RTA_SRC
] ? nla_get_be32(tb
[RTA_SRC
]) : 0;
2922 dst
= tb
[RTA_DST
] ? nla_get_be32(tb
[RTA_DST
]) : 0;
2923 iif
= tb
[RTA_IIF
] ? nla_get_u32(tb
[RTA_IIF
]) : 0;
/* input interface supplied: simulate reception on that device */
2926 struct net_device
*dev
;
2928 dev
= __dev_get_by_index(net
, iif
);
2934 skb
->protocol
= htons(ETH_P_IP
);
2937 err
= ip_route_input(skb
, dst
, src
, rtm
->rtm_tos
, dev
);
2941 if (err
== 0 && rt
->u
.dst
.error
)
2942 err
= -rt
->u
.dst
.error
;
/* otherwise: output lookup from a flowi built out of the attributes */
2949 .tos
= rtm
->rtm_tos
,
2952 .oif
= tb
[RTA_OIF
] ? nla_get_u32(tb
[RTA_OIF
]) : 0,
2954 err
= ip_route_output_key(net
, &rt
, &fl
);
2961 if (rtm
->rtm_flags
& RTM_F_NOTIFY
)
2962 rt
->rt_flags
|= RTCF_NOTIFY
;
2964 err
= rt_fill_info(net
, skb
, NETLINK_CB(in_skb
).pid
, nlh
->nlmsg_seq
,
2965 RTM_NEWROUTE
, 0, 0);
2969 err
= rtnl_unicast(skb
, net
, NETLINK_CB(in_skb
).pid
);
/*
 * Netlink dump callback: walk the route-cache hash table under
 * rcu_read_lock_bh() and emit one RTM_NEWROUTE message per live entry
 * belonging to the requester's netns.  cb->args[] carries the (bucket,
 * index) resume position so a partially-filled skb can be continued on
 * the next invocation; when rt_fill_info() reports no room the current
 * position is recorded and the dump returns early.  NOTE(review): the
 * declarations of h/s_h/idx/s_idx/net/rt, the rcu_read_lock_bh() call,
 * `continue` statements and the args[] save at the end are elided by
 * the extraction (gaps at original 2979-2989, 2993-2994, 2998-3000,
 * 3007-3008, 3012-3016).
 */
2978 int ip_rt_dump(struct sk_buff
*skb
, struct netlink_callback
*cb
)
2985 net
= sock_net(skb
->sk
);
2990 s_idx
= idx
= cb
->args
[1];
2991 for (h
= s_h
; h
<= rt_hash_mask
; h
++, s_idx
= 0) {
2992 if (!rt_hash_table
[h
].chain
)
2995 for (rt
= rcu_dereference(rt_hash_table
[h
].chain
), idx
= 0; rt
;
2996 rt
= rcu_dereference(rt
->u
.dst
.rt_next
), idx
++) {
/* skip foreign-netns entries and those before the resume index */
2997 if (!net_eq(dev_net(rt
->u
.dst
.dev
), net
) || idx
< s_idx
)
2999 if (rt_is_expired(rt
))
/* hold the dst while rt_fill_info() reads it */
3001 skb
->dst
= dst_clone(&rt
->u
.dst
);
3002 if (rt_fill_info(net
, skb
, NETLINK_CB(cb
->skb
).pid
,
3003 cb
->nlh
->nlmsg_seq
, RTM_NEWROUTE
,
3004 1, NLM_F_MULTI
) <= 0) {
3005 dst_release(xchg(&skb
->dst
, NULL
));
3006 rcu_read_unlock_bh();
3009 dst_release(xchg(&skb
->dst
, NULL
));
3011 rcu_read_unlock_bh();
/*
 * Multicast configuration changed on in_dev's device: flush the route
 * cache for that device's netns immediately (delay 0).
 */
3020 void ip_rt_multicast_event(struct in_device
*in_dev
)
3022 rt_cache_flush(dev_net(in_dev
->dev
), 0);
3025 #ifdef CONFIG_SYSCTL
/*
 * proc handler for /proc/sys/net/ipv4/route/flush (write-only): copy the
 * ctl_table so .data can be pointed at a local flush_delay, parse the
 * written integer with proc_dointvec(), then flush the route cache of
 * the netns stashed in __ctl->extra1 (set up in sysctl_route_net_init).
 * NOTE(review): the declarations of ctl/flush_delay/net, the !write
 * guard and return are elided by the extraction (gaps at original
 * 3029-3034, 3038, 3041-3044).
 */
3026 static int ipv4_sysctl_rtcache_flush(ctl_table
*__ctl
, int write
,
3027 struct file
*filp
, void __user
*buffer
,
3028 size_t *lenp
, loff_t
*ppos
)
3035 memcpy(&ctl
, __ctl
, sizeof(ctl
));
3036 ctl
.data
= &flush_delay
;
3037 proc_dointvec(&ctl
, write
, filp
, buffer
, lenp
, ppos
);
3039 net
= (struct net
*)__ctl
->extra1
;
3040 rt_cache_flush(net
, flush_delay
);
/*
 * Binary-sysctl (sys_sysctl) counterpart of the flush handler above:
 * validate that exactly one int was written, copy it from user space,
 * and flush the route cache of the netns in table->extra1.
 * NOTE(review): the newlen parameter, delay/net declarations, the guard
 * returns and the final return are elided by the extraction.
 */
3047 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table
*table
,
3048 void __user
*oldval
,
3049 size_t __user
*oldlenp
,
3050 void __user
*newval
,
3055 if (newlen
!= sizeof(int))
3057 if (get_user(delay
, (int __user
*)newval
))
3059 net
= (struct net
*)table
->extra1
;
3060 rt_cache_flush(net
, delay
);
/*
 * ip_rt_secret_interval changed from `old` to the current value: adjust
 * each netns's rt_secret_timer accordingly.  A running timer has its
 * expiry shifted by the delta (re-armed immediately if that puts it in
 * the past); a newly-enabled interval arms the timer fresh at
 * jiffies + new.  NOTE(review): the surrounding for_each_net loop, the
 * early-return when nothing changed, rtnl locking and several braces
 * are elided by the extraction (gaps at original 3065-3066, 3069-3074,
 * 3076-3080, 3084-3085, 3087, 3089, 3092-3094).
 */
3064 static void rt_secret_reschedule(int old
)
3067 int new = ip_rt_secret_interval
;
3068 int diff
= new - old
;
3075 int deleted
= del_timer_sync(&net
->ipv4
.rt_secret_timer
);
/* timer was pending: shift its remaining time by the interval delta */
3081 long time
= net
->ipv4
.rt_secret_timer
.expires
- jiffies
;
3083 if (time
<= 0 || (time
+= diff
) <= 0)
3086 net
->ipv4
.rt_secret_timer
.expires
= time
;
3088 net
->ipv4
.rt_secret_timer
.expires
= new;
3090 net
->ipv4
.rt_secret_timer
.expires
+= jiffies
;
3091 add_timer(&net
->ipv4
.rt_secret_timer
);
/*
 * proc handler for /proc/sys/net/ipv4/route/secret_interval: parse the
 * new interval (in jiffies) via proc_dointvec_jiffies(), then reschedule
 * every netns's secret-rebuild timer against the old value.
 * NOTE(review): the filp/ppos parameters, write guard and return ret are
 * elided by the extraction (gaps at original 3097, 3099-3100, 3103, 3105).
 */
3096 static int ipv4_sysctl_rt_secret_interval(ctl_table
*ctl
, int write
,
3098 void __user
*buffer
, size_t *lenp
,
3101 int old
= ip_rt_secret_interval
;
3102 int ret
= proc_dointvec_jiffies(ctl
, write
, filp
, buffer
, lenp
, ppos
);
3104 rt_secret_reschedule(old
);
/*
 * Binary-sysctl counterpart of the handler above: sysctl_jiffies() does
 * the conversion/copy, then the secret timer is rescheduled with the
 * previous interval.  NOTE(review): the newlen parameter and return ret
 * are elided by the extraction.
 */
3109 static int ipv4_sysctl_rt_secret_interval_strategy(ctl_table
*table
,
3110 void __user
*oldval
,
3111 size_t __user
*oldlenp
,
3112 void __user
*newval
,
3115 int old
= ip_rt_secret_interval
;
3116 int ret
= sysctl_jiffies(table
, oldval
, oldlenp
, newval
, newlen
);
3118 rt_secret_reschedule(old
);
/*
 * /proc/sys/net/ipv4/route/* tuning knobs.  Plain ints use
 * proc_dointvec; time-valued entries use the jiffies (or ms-jiffies)
 * handler/strategy pairs; secret_interval routes through the custom
 * handlers above so the rebuild timer is rescheduled on change.
 * NOTE(review): the per-entry "{ … }" braces, the .mode = 0644 lines
 * and the terminating empty entry are elided by the extraction (visible
 * as gaps such as original 3124, 3129, 3131-3132, 3259-3261).
 */
3123 static ctl_table ipv4_route_table
[] = {
3125 .ctl_name
= NET_IPV4_ROUTE_GC_THRESH
,
3126 .procname
= "gc_thresh",
3127 .data
= &ipv4_dst_ops
.gc_thresh
,
3128 .maxlen
= sizeof(int),
3130 .proc_handler
= proc_dointvec
,
3133 .ctl_name
= NET_IPV4_ROUTE_MAX_SIZE
,
3134 .procname
= "max_size",
3135 .data
= &ip_rt_max_size
,
3136 .maxlen
= sizeof(int),
3138 .proc_handler
= proc_dointvec
,
3141 /* Deprecated. Use gc_min_interval_ms */
3143 .ctl_name
= NET_IPV4_ROUTE_GC_MIN_INTERVAL
,
3144 .procname
= "gc_min_interval",
3145 .data
= &ip_rt_gc_min_interval
,
3146 .maxlen
= sizeof(int),
3148 .proc_handler
= proc_dointvec_jiffies
,
3149 .strategy
= sysctl_jiffies
,
3152 .ctl_name
= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS
,
3153 .procname
= "gc_min_interval_ms",
3154 .data
= &ip_rt_gc_min_interval
,
3155 .maxlen
= sizeof(int),
3157 .proc_handler
= proc_dointvec_ms_jiffies
,
3158 .strategy
= sysctl_ms_jiffies
,
3161 .ctl_name
= NET_IPV4_ROUTE_GC_TIMEOUT
,
3162 .procname
= "gc_timeout",
3163 .data
= &ip_rt_gc_timeout
,
3164 .maxlen
= sizeof(int),
3166 .proc_handler
= proc_dointvec_jiffies
,
3167 .strategy
= sysctl_jiffies
,
3170 .ctl_name
= NET_IPV4_ROUTE_GC_INTERVAL
,
3171 .procname
= "gc_interval",
3172 .data
= &ip_rt_gc_interval
,
3173 .maxlen
= sizeof(int),
3175 .proc_handler
= proc_dointvec_jiffies
,
3176 .strategy
= sysctl_jiffies
,
3179 .ctl_name
= NET_IPV4_ROUTE_REDIRECT_LOAD
,
3180 .procname
= "redirect_load",
3181 .data
= &ip_rt_redirect_load
,
3182 .maxlen
= sizeof(int),
3184 .proc_handler
= proc_dointvec
,
3187 .ctl_name
= NET_IPV4_ROUTE_REDIRECT_NUMBER
,
3188 .procname
= "redirect_number",
3189 .data
= &ip_rt_redirect_number
,
3190 .maxlen
= sizeof(int),
3192 .proc_handler
= proc_dointvec
,
3195 .ctl_name
= NET_IPV4_ROUTE_REDIRECT_SILENCE
,
3196 .procname
= "redirect_silence",
3197 .data
= &ip_rt_redirect_silence
,
3198 .maxlen
= sizeof(int),
3200 .proc_handler
= proc_dointvec
,
3203 .ctl_name
= NET_IPV4_ROUTE_ERROR_COST
,
3204 .procname
= "error_cost",
3205 .data
= &ip_rt_error_cost
,
3206 .maxlen
= sizeof(int),
3208 .proc_handler
= proc_dointvec
,
3211 .ctl_name
= NET_IPV4_ROUTE_ERROR_BURST
,
3212 .procname
= "error_burst",
3213 .data
= &ip_rt_error_burst
,
3214 .maxlen
= sizeof(int),
3216 .proc_handler
= proc_dointvec
,
3219 .ctl_name
= NET_IPV4_ROUTE_GC_ELASTICITY
,
3220 .procname
= "gc_elasticity",
3221 .data
= &ip_rt_gc_elasticity
,
3222 .maxlen
= sizeof(int),
3224 .proc_handler
= proc_dointvec
,
3227 .ctl_name
= NET_IPV4_ROUTE_MTU_EXPIRES
,
3228 .procname
= "mtu_expires",
3229 .data
= &ip_rt_mtu_expires
,
3230 .maxlen
= sizeof(int),
3232 .proc_handler
= proc_dointvec_jiffies
,
3233 .strategy
= sysctl_jiffies
,
3236 .ctl_name
= NET_IPV4_ROUTE_MIN_PMTU
,
3237 .procname
= "min_pmtu",
3238 .data
= &ip_rt_min_pmtu
,
3239 .maxlen
= sizeof(int),
3241 .proc_handler
= proc_dointvec
,
3244 .ctl_name
= NET_IPV4_ROUTE_MIN_ADVMSS
,
3245 .procname
= "min_adv_mss",
3246 .data
= &ip_rt_min_advmss
,
3247 .maxlen
= sizeof(int),
3249 .proc_handler
= proc_dointvec
,
3252 .ctl_name
= NET_IPV4_ROUTE_SECRET_INTERVAL
,
3253 .procname
= "secret_interval",
3254 .data
= &ip_rt_secret_interval
,
3255 .maxlen
= sizeof(int),
3257 .proc_handler
= ipv4_sysctl_rt_secret_interval
,
3258 .strategy
= ipv4_sysctl_rt_secret_interval_strategy
,
/*
 * Skeleton tables registered early by ip_static_sysctl_init() so that
 * net.ipv4.route.* and an (initially empty) net.ipv4.neigh.* directory
 * exist before the rest of ipv4 init runs.  ipv4_path is the ctl_path
 * for that registration.  NOTE(review): the array braces and terminating
 * empty entries are elided by the extraction.
 */
3263 static struct ctl_table empty
[1];
3265 static struct ctl_table ipv4_skeleton
[] =
3267 { .procname
= "route", .ctl_name
= NET_IPV4_ROUTE
,
3268 .mode
= 0555, .child
= ipv4_route_table
},
3269 { .procname
= "neigh", .ctl_name
= NET_IPV4_NEIGH
,
3270 .mode
= 0555, .child
= empty
},
3274 static __net_initdata
struct ctl_path ipv4_path
[] = {
3275 { .procname
= "net", .ctl_name
= CTL_NET
, },
3276 { .procname
= "ipv4", .ctl_name
= NET_IPV4
, },
/*
 * Per-netns "flush" sysctl (net.ipv4.route.flush) and the ctl_path it
 * is registered under by sysctl_route_net_init().  The entry has no
 * .data: both handlers fetch the target netns from .extra1 instead.
 * NOTE(review): entry braces, .mode = 0200 and the terminating empty
 * entries are elided by the extraction.
 */
3280 static struct ctl_table ipv4_route_flush_table
[] = {
3282 .ctl_name
= NET_IPV4_ROUTE_FLUSH
,
3283 .procname
= "flush",
3284 .maxlen
= sizeof(int),
3286 .proc_handler
= ipv4_sysctl_rtcache_flush
,
3287 .strategy
= ipv4_sysctl_rtcache_flush_strategy
,
3292 static __net_initdata
struct ctl_path ipv4_route_path
[] = {
3293 { .procname
= "net", .ctl_name
= CTL_NET
, },
3294 { .procname
= "ipv4", .ctl_name
= NET_IPV4
, },
3295 { .procname
= "route", .ctl_name
= NET_IPV4_ROUTE
, },
/*
 * Per-netns sysctl setup: for non-init namespaces, duplicate
 * ipv4_route_flush_table so each netns gets its own copy whose
 * [0].extra1 points at that netns (consumed by the flush handlers);
 * then register it under net/ipv4/route.  NOTE(review): the kmemdup
 * failure check, error unwinding (kfree of the duplicated table) and
 * return statements are elided by the extraction (gaps at original
 * 3306-3308, 3314-3317, 3319-3322).
 */
3299 static __net_init
int sysctl_route_net_init(struct net
*net
)
3301 struct ctl_table
*tbl
;
3303 tbl
= ipv4_route_flush_table
;
3304 if (net
!= &init_net
) {
3305 tbl
= kmemdup(tbl
, sizeof(ipv4_route_flush_table
), GFP_KERNEL
);
3309 tbl
[0].extra1
= net
;
3311 net
->ipv4
.route_hdr
=
3312 register_net_sysctl_table(net
, ipv4_route_path
, tbl
);
3313 if (net
->ipv4
.route_hdr
== NULL
)
3318 if (tbl
!= ipv4_route_flush_table
)
/*
 * Per-netns sysctl teardown: unregister the table registered above and
 * free the duplicated copy.  The BUG_ON asserts we never free the
 * shared init_net template.  NOTE(review): the kfree(tbl) after the
 * BUG_ON is elided by the extraction (gap at original 3331).
 */
3324 static __net_exit
void sysctl_route_net_exit(struct net
*net
)
3326 struct ctl_table
*tbl
;
3328 tbl
= net
->ipv4
.route_hdr
->ctl_table_arg
;
3329 unregister_net_sysctl_table(net
->ipv4
.route_hdr
);
3330 BUG_ON(tbl
== ipv4_route_flush_table
);
/* pernet hooks wiring the per-netns route-sysctl init/exit above */
3334 static __net_initdata
struct pernet_operations sysctl_route_ops
= {
3335 .init
= sysctl_route_net_init
,
3336 .exit
= sysctl_route_net_exit
,
/*
 * Per-netns init: seed rt_genid (the route-cache generation counter)
 * from num_physpages and jiffies, set up the deferrable secret-rebuild
 * timer (rt_secret_rebuild, with the netns as timer data), and arm it
 * at a randomized point past ip_rt_secret_interval when the interval is
 * non-zero.  NOTE(review): braces and the return statement are elided
 * by the extraction (gaps at original 3342, 3346, 3350, 3356-3358).
 */
3341 static __net_init
int rt_secret_timer_init(struct net
*net
)
3343 atomic_set(&net
->ipv4
.rt_genid
,
3344 (int) ((num_physpages
^ (num_physpages
>>8)) ^
3345 (jiffies
^ (jiffies
>> 7))));
3347 net
->ipv4
.rt_secret_timer
.function
= rt_secret_rebuild
;
3348 net
->ipv4
.rt_secret_timer
.data
= (unsigned long)net
;
3349 init_timer_deferrable(&net
->ipv4
.rt_secret_timer
);
3351 if (ip_rt_secret_interval
) {
/* randomize first expiry so all namespaces don't rebuild at once */
3352 net
->ipv4
.rt_secret_timer
.expires
=
3353 jiffies
+ net_random() % ip_rt_secret_interval
+
3354 ip_rt_secret_interval
;
3355 add_timer(&net
->ipv4
.rt_secret_timer
);
/* Per-netns exit: stop the secret-rebuild timer, waiting for a running
 * handler to finish. */
3360 static __net_exit
void rt_secret_timer_exit(struct net
*net
)
3362 del_timer_sync(&net
->ipv4
.rt_secret_timer
);
/* pernet hooks for the secret-rebuild timer lifecycle above */
3365 static __net_initdata
struct pernet_operations rt_secret_timer_ops
= {
3366 .init
= rt_secret_timer_init
,
3367 .exit
= rt_secret_timer_exit
,
/*
 * ip_rt_acct: per-cpu routing-classifier accounting area (allocated in
 * ip_rt_init when CONFIG_NET_CLS_ROUTE is set).  set_rhash_entries()
 * parses the "rhash_entries=" kernel boot parameter, which overrides
 * the route-cache hash size computed in ip_rt_init().  NOTE(review):
 * the early-return for a missing argument and the return value of the
 * __setup handler are elided by the extraction (gaps at original
 * 3377-3379, 3381-3382).
 */
3371 #ifdef CONFIG_NET_CLS_ROUTE
3372 struct ip_rt_acct
*ip_rt_acct __read_mostly
;
3373 #endif /* CONFIG_NET_CLS_ROUTE */
3375 static __initdata
unsigned long rhash_entries
;
3376 static int __init
set_rhash_entries(char *str
)
3380 rhash_entries
= simple_strtoul(str
, &str
, 0);
3383 __setup("rhash_entries=", set_rhash_entries
);
/*
 * Boot-time initialization of the IPv4 routing layer: allocate the
 * per-cpu classifier accounting (CLS_ROUTE), create the dst slab cache
 * (shared with the blackhole ops), size and allocate the route-cache
 * hash table (honoring the rhash_entries= boot param), derive
 * gc_thresh/max_size from the hash size, kick off the periodic GC work
 * with a randomized phase, and register the pernet subsystems, proc
 * files and the RTM_GETROUTE rtnetlink handler.  NOTE(review): several
 * lines — alloc_large_system_hash() arguments, the peer/ICMP init and
 * the return — are elided by the extraction (gaps at e.g. original
 * 3404, 3406-3409, 3416-3419, 3431-3434, 3439-3441).
 */
3385 int __init
ip_rt_init(void)
3389 #ifdef CONFIG_NET_CLS_ROUTE
3390 ip_rt_acct
= __alloc_percpu(256 * sizeof(struct ip_rt_acct
), __alignof__(struct ip_rt_acct
));
3392 panic("IP: failed to allocate ip_rt_acct\n");
3395 ipv4_dst_ops
.kmem_cachep
=
3396 kmem_cache_create("ip_dst_cache", sizeof(struct rtable
), 0,
3397 SLAB_HWCACHE_ALIGN
|SLAB_PANIC
, NULL
);
/* blackhole dsts come from the same slab as regular routes */
3399 ipv4_dst_blackhole_ops
.kmem_cachep
= ipv4_dst_ops
.kmem_cachep
;
3401 rt_hash_table
= (struct rt_hash_bucket
*)
3402 alloc_large_system_hash("IP route cache",
3403 sizeof(struct rt_hash_bucket
),
3405 (num_physpages
>= 128 * 1024) ?
3410 rhash_entries
? 0 : 512 * 1024);
3411 memset(rt_hash_table
, 0, (rt_hash_mask
+ 1) * sizeof(struct rt_hash_bucket
));
3412 rt_hash_lock_init();
/* GC threshold and cache size limit scale with the hash table size */
3414 ipv4_dst_ops
.gc_thresh
= (rt_hash_mask
+ 1);
3415 ip_rt_max_size
= (rt_hash_mask
+ 1) * 16;
3420 /* All the timers, started at system startup tend
3421 to synchronize. Perturb it a bit.
 */
3423 schedule_delayed_work(&expires_work
,
3424 net_random() % ip_rt_gc_interval
+ ip_rt_gc_interval
);
3426 if (register_pernet_subsys(&rt_secret_timer_ops
))
3427 printk(KERN_ERR
"Unable to setup rt_secret_timer\n");
3429 if (ip_rt_proc_init())
3430 printk(KERN_ERR
"Unable to create route proc files\n");
3435 rtnl_register(PF_INET
, RTM_GETROUTE
, inet_rtm_getroute
, NULL
);
3437 #ifdef CONFIG_SYSCTL
3438 register_pernet_subsys(&sysctl_route_ops
);
3443 #ifdef CONFIG_SYSCTL
/*
 * (extraction note: the /* and */ delimiters of this original comment
 * were dropped; restored here)
 *
3445 * We really need to sanitize the damn ipv4 init order, then all
3446 * this nonsense will go away.
 */
/* Register the static net.ipv4 skeleton tables early in boot. */
3448 void __init
ip_static_sysctl_init(void)
3450 register_sysctl_paths(ipv4_path
, ipv4_skeleton
)
;
3454 EXPORT_SYMBOL(__ip_select_ident
);
3455 EXPORT_SYMBOL(ip_route_input
);
3456 EXPORT_SYMBOL(ip_route_output_key
);