2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/netevent.h>
107 #include <net/rtnetlink.h>
109 #include <linux/sysctl.h>
112 #define RT_FL_TOS(oldflp) \
113 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
115 #define IP_MAX_MTU 0xFFF0
117 #define RT_GC_TIMEOUT (300*HZ)
119 static int ip_rt_max_size
;
120 static int ip_rt_gc_timeout __read_mostly
= RT_GC_TIMEOUT
;
121 static int ip_rt_gc_interval __read_mostly
= 60 * HZ
;
122 static int ip_rt_gc_min_interval __read_mostly
= HZ
/ 2;
123 static int ip_rt_redirect_number __read_mostly
= 9;
124 static int ip_rt_redirect_load __read_mostly
= HZ
/ 50;
125 static int ip_rt_redirect_silence __read_mostly
= ((HZ
/ 50) << (9 + 1));
126 static int ip_rt_error_cost __read_mostly
= HZ
;
127 static int ip_rt_error_burst __read_mostly
= 5 * HZ
;
128 static int ip_rt_gc_elasticity __read_mostly
= 8;
129 static int ip_rt_mtu_expires __read_mostly
= 10 * 60 * HZ
;
130 static int ip_rt_min_pmtu __read_mostly
= 512 + 20 + 20;
131 static int ip_rt_min_advmss __read_mostly
= 256;
132 static int ip_rt_secret_interval __read_mostly
= 10 * 60 * HZ
;
133 static int rt_chain_length_max __read_mostly
= 20;
135 static struct delayed_work expires_work
;
136 static unsigned long expires_ljiffies
;
139 * Interface to generic destination cache.
142 static struct dst_entry
*ipv4_dst_check(struct dst_entry
*dst
, u32 cookie
);
143 static void ipv4_dst_destroy(struct dst_entry
*dst
);
144 static void ipv4_dst_ifdown(struct dst_entry
*dst
,
145 struct net_device
*dev
, int how
);
146 static struct dst_entry
*ipv4_negative_advice(struct dst_entry
*dst
);
147 static void ipv4_link_failure(struct sk_buff
*skb
);
148 static void ip_rt_update_pmtu(struct dst_entry
*dst
, u32 mtu
);
149 static int rt_garbage_collect(struct dst_ops
*ops
);
152 static struct dst_ops ipv4_dst_ops
= {
154 .protocol
= cpu_to_be16(ETH_P_IP
),
155 .gc
= rt_garbage_collect
,
156 .check
= ipv4_dst_check
,
157 .destroy
= ipv4_dst_destroy
,
158 .ifdown
= ipv4_dst_ifdown
,
159 .negative_advice
= ipv4_negative_advice
,
160 .link_failure
= ipv4_link_failure
,
161 .update_pmtu
= ip_rt_update_pmtu
,
162 .local_out
= __ip_local_out
,
163 .entries
= ATOMIC_INIT(0),
166 #define ECN_OR_COST(class) TC_PRIO_##class
168 const __u8 ip_tos2prio
[16] = {
172 ECN_OR_COST(BESTEFFORT
),
178 ECN_OR_COST(INTERACTIVE
),
180 ECN_OR_COST(INTERACTIVE
),
181 TC_PRIO_INTERACTIVE_BULK
,
182 ECN_OR_COST(INTERACTIVE_BULK
),
183 TC_PRIO_INTERACTIVE_BULK
,
184 ECN_OR_COST(INTERACTIVE_BULK
)
192 /* The locking scheme is rather straight forward:
194 * 1) Read-Copy Update protects the buckets of the central route hash.
195 * 2) Only writers remove entries, and they hold the lock
196 * as they look at rtable reference counts.
197 * 3) Only readers acquire references to rtable entries,
198 * they do so with atomic increments and with the
202 struct rt_hash_bucket
{
203 struct rtable
*chain
;
206 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
207 defined(CONFIG_PROVE_LOCKING)
209 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
210 * The size of this table is a power of two and depends on the number of CPUS.
211 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
213 #ifdef CONFIG_LOCKDEP
214 # define RT_HASH_LOCK_SZ 256
217 # define RT_HASH_LOCK_SZ 4096
219 # define RT_HASH_LOCK_SZ 2048
221 # define RT_HASH_LOCK_SZ 1024
223 # define RT_HASH_LOCK_SZ 512
225 # define RT_HASH_LOCK_SZ 256
229 static spinlock_t
*rt_hash_locks
;
230 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
232 static __init
void rt_hash_lock_init(void)
236 rt_hash_locks
= kmalloc(sizeof(spinlock_t
) * RT_HASH_LOCK_SZ
,
239 panic("IP: failed to allocate rt_hash_locks\n");
241 for (i
= 0; i
< RT_HASH_LOCK_SZ
; i
++)
242 spin_lock_init(&rt_hash_locks
[i
]);
245 # define rt_hash_lock_addr(slot) NULL
247 static inline void rt_hash_lock_init(void)
252 static struct rt_hash_bucket
*rt_hash_table __read_mostly
;
253 static unsigned rt_hash_mask __read_mostly
;
254 static unsigned int rt_hash_log __read_mostly
;
256 static DEFINE_PER_CPU(struct rt_cache_stat
, rt_cache_stat
);
257 #define RT_CACHE_STAT_INC(field) \
258 (__raw_get_cpu_var(rt_cache_stat).field++)
260 static inline unsigned int rt_hash(__be32 daddr
, __be32 saddr
, int idx
,
263 return jhash_3words((__force u32
)daddr
, (__force u32
)saddr
,
/*
 * Current generation id of the routing cache for this namespace.
 * Cached rtable entries whose rt_genid differs from this value are
 * considered stale (see rt_is_expired()).
 */
static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}
273 #ifdef CONFIG_PROC_FS
274 struct rt_cache_iter_state
{
275 struct seq_net_private p
;
280 static struct rtable
*rt_cache_get_first(struct seq_file
*seq
)
282 struct rt_cache_iter_state
*st
= seq
->private;
283 struct rtable
*r
= NULL
;
285 for (st
->bucket
= rt_hash_mask
; st
->bucket
>= 0; --st
->bucket
) {
286 if (!rt_hash_table
[st
->bucket
].chain
)
289 r
= rcu_dereference_bh(rt_hash_table
[st
->bucket
].chain
);
291 if (dev_net(r
->u
.dst
.dev
) == seq_file_net(seq
) &&
292 r
->rt_genid
== st
->genid
)
294 r
= rcu_dereference_bh(r
->u
.dst
.rt_next
);
296 rcu_read_unlock_bh();
301 static struct rtable
*__rt_cache_get_next(struct seq_file
*seq
,
304 struct rt_cache_iter_state
*st
= seq
->private;
306 r
= r
->u
.dst
.rt_next
;
308 rcu_read_unlock_bh();
310 if (--st
->bucket
< 0)
312 } while (!rt_hash_table
[st
->bucket
].chain
);
314 r
= rt_hash_table
[st
->bucket
].chain
;
316 return rcu_dereference_bh(r
);
319 static struct rtable
*rt_cache_get_next(struct seq_file
*seq
,
322 struct rt_cache_iter_state
*st
= seq
->private;
323 while ((r
= __rt_cache_get_next(seq
, r
)) != NULL
) {
324 if (dev_net(r
->u
.dst
.dev
) != seq_file_net(seq
))
326 if (r
->rt_genid
== st
->genid
)
332 static struct rtable
*rt_cache_get_idx(struct seq_file
*seq
, loff_t pos
)
334 struct rtable
*r
= rt_cache_get_first(seq
);
337 while (pos
&& (r
= rt_cache_get_next(seq
, r
)))
339 return pos
? NULL
: r
;
342 static void *rt_cache_seq_start(struct seq_file
*seq
, loff_t
*pos
)
344 struct rt_cache_iter_state
*st
= seq
->private;
346 return rt_cache_get_idx(seq
, *pos
- 1);
347 st
->genid
= rt_genid(seq_file_net(seq
));
348 return SEQ_START_TOKEN
;
351 static void *rt_cache_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
355 if (v
== SEQ_START_TOKEN
)
356 r
= rt_cache_get_first(seq
);
358 r
= rt_cache_get_next(seq
, v
);
/*
 * seq_file ->stop: drop the BH-disabled RCU read side taken while
 * walking the route cache hash. Not taken for the SEQ_START_TOKEN
 * header row, hence the check.
 */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}
369 static int rt_cache_seq_show(struct seq_file
*seq
, void *v
)
371 if (v
== SEQ_START_TOKEN
)
372 seq_printf(seq
, "%-127s\n",
373 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
374 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
377 struct rtable
*r
= v
;
380 seq_printf(seq
, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
381 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
382 r
->u
.dst
.dev
? r
->u
.dst
.dev
->name
: "*",
383 (__force u32
)r
->rt_dst
,
384 (__force u32
)r
->rt_gateway
,
385 r
->rt_flags
, atomic_read(&r
->u
.dst
.__refcnt
),
386 r
->u
.dst
.__use
, 0, (__force u32
)r
->rt_src
,
387 (dst_metric(&r
->u
.dst
, RTAX_ADVMSS
) ?
388 (int)dst_metric(&r
->u
.dst
, RTAX_ADVMSS
) + 40 : 0),
389 dst_metric(&r
->u
.dst
, RTAX_WINDOW
),
390 (int)((dst_metric(&r
->u
.dst
, RTAX_RTT
) >> 3) +
391 dst_metric(&r
->u
.dst
, RTAX_RTTVAR
)),
393 r
->u
.dst
.hh
? atomic_read(&r
->u
.dst
.hh
->hh_refcnt
) : -1,
394 r
->u
.dst
.hh
? (r
->u
.dst
.hh
->hh_output
==
396 r
->rt_spec_dst
, &len
);
398 seq_printf(seq
, "%*s\n", 127 - len
, "");
/* seq_file iterator for /proc/net/rt_cache (route cache dump). */
static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};
/*
 * open() handler for /proc/net/rt_cache: per-netns seq_file open with
 * iterator state carried in struct rt_cache_iter_state.
 */
static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}
416 static const struct file_operations rt_cache_seq_fops
= {
417 .owner
= THIS_MODULE
,
418 .open
= rt_cache_seq_open
,
421 .release
= seq_release_net
,
425 static void *rt_cpu_seq_start(struct seq_file
*seq
, loff_t
*pos
)
430 return SEQ_START_TOKEN
;
432 for (cpu
= *pos
-1; cpu
< nr_cpu_ids
; ++cpu
) {
433 if (!cpu_possible(cpu
))
436 return &per_cpu(rt_cache_stat
, cpu
);
441 static void *rt_cpu_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
445 for (cpu
= *pos
; cpu
< nr_cpu_ids
; ++cpu
) {
446 if (!cpu_possible(cpu
))
449 return &per_cpu(rt_cache_stat
, cpu
);
455 static void rt_cpu_seq_stop(struct seq_file
*seq
, void *v
)
460 static int rt_cpu_seq_show(struct seq_file
*seq
, void *v
)
462 struct rt_cache_stat
*st
= v
;
464 if (v
== SEQ_START_TOKEN
) {
465 seq_printf(seq
, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
469 seq_printf(seq
,"%08x %08x %08x %08x %08x %08x %08x %08x "
470 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
471 atomic_read(&ipv4_dst_ops
.entries
),
/* seq_file iterator for per-cpu route cache statistics. */
static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};
/* open() handler for the per-cpu rt_cache statistics file. */
static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}
507 static const struct file_operations rt_cpu_seq_fops
= {
508 .owner
= THIS_MODULE
,
509 .open
= rt_cpu_seq_open
,
512 .release
= seq_release
,
515 #ifdef CONFIG_NET_CLS_ROUTE
516 static int rt_acct_proc_show(struct seq_file
*m
, void *v
)
518 struct ip_rt_acct
*dst
, *src
;
521 dst
= kcalloc(256, sizeof(struct ip_rt_acct
), GFP_KERNEL
);
525 for_each_possible_cpu(i
) {
526 src
= (struct ip_rt_acct
*)per_cpu_ptr(ip_rt_acct
, i
);
527 for (j
= 0; j
< 256; j
++) {
528 dst
[j
].o_bytes
+= src
[j
].o_bytes
;
529 dst
[j
].o_packets
+= src
[j
].o_packets
;
530 dst
[j
].i_bytes
+= src
[j
].i_bytes
;
531 dst
[j
].i_packets
+= src
[j
].i_packets
;
535 seq_write(m
, dst
, 256 * sizeof(struct ip_rt_acct
));
/* open() handler for /proc/net/rt_acct (single-shot dump). */
static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}
545 static const struct file_operations rt_acct_proc_fops
= {
546 .owner
= THIS_MODULE
,
547 .open
= rt_acct_proc_open
,
550 .release
= single_release
,
554 static int __net_init
ip_rt_do_proc_init(struct net
*net
)
556 struct proc_dir_entry
*pde
;
558 pde
= proc_net_fops_create(net
, "rt_cache", S_IRUGO
,
563 pde
= proc_create("rt_cache", S_IRUGO
,
564 net
->proc_net_stat
, &rt_cpu_seq_fops
);
568 #ifdef CONFIG_NET_CLS_ROUTE
569 pde
= proc_create("rt_acct", 0, net
->proc_net
, &rt_acct_proc_fops
);
575 #ifdef CONFIG_NET_CLS_ROUTE
577 remove_proc_entry("rt_cache", net
->proc_net_stat
);
580 remove_proc_entry("rt_cache", net
->proc_net
);
585 static void __net_exit
ip_rt_do_proc_exit(struct net
*net
)
587 remove_proc_entry("rt_cache", net
->proc_net_stat
);
588 remove_proc_entry("rt_cache", net
->proc_net
);
589 #ifdef CONFIG_NET_CLS_ROUTE
590 remove_proc_entry("rt_acct", net
->proc_net
);
594 static struct pernet_operations ip_rt_proc_ops __net_initdata
= {
595 .init
= ip_rt_do_proc_init
,
596 .exit
= ip_rt_do_proc_exit
,
/* Register the per-namespace /proc entries for the route cache. */
static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}
605 static inline int ip_rt_proc_init(void)
609 #endif /* CONFIG_PROC_FS */
/*
 * Schedule an rtable for freeing once all BH-RCU readers are done;
 * lookups are lock-free, so the entry cannot be freed synchronously.
 */
static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}
616 static inline void rt_drop(struct rtable
*rt
)
619 call_rcu_bh(&rt
->u
.dst
.rcu_head
, dst_rcu_free
);
/*
 * Cheap eviction test: is this a low-value entry we may reclaim early?
 */
static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rth->fl.iif && rth->u.dst.rt_next;
}
630 static inline int rt_valuable(struct rtable
*rth
)
632 return (rth
->rt_flags
& (RTCF_REDIRECTED
| RTCF_NOTIFY
)) ||
636 static int rt_may_expire(struct rtable
*rth
, unsigned long tmo1
, unsigned long tmo2
)
641 if (atomic_read(&rth
->u
.dst
.__refcnt
))
645 if (rth
->u
.dst
.expires
&&
646 time_after_eq(jiffies
, rth
->u
.dst
.expires
))
649 age
= jiffies
- rth
->u
.dst
.lastuse
;
651 if ((age
<= tmo1
&& !rt_fast_clean(rth
)) ||
652 (age
<= tmo2
&& rt_valuable(rth
)))
658 /* Bits of score are:
660 * 30: not quite useless
661 * 29..0: usage counter
663 static inline u32
rt_score(struct rtable
*rt
)
665 u32 score
= jiffies
- rt
->u
.dst
.lastuse
;
667 score
= ~score
& ~(3<<30);
673 !(rt
->rt_flags
& (RTCF_BROADCAST
|RTCF_MULTICAST
|RTCF_LOCAL
)))
/*
 * Is route caching still enabled for this namespace?  Caching is
 * disabled once the emergency-rebuild count exceeds the sysctl limit.
 */
static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}
/*
 * Compare only the fields that feed the hash function (daddr, saddr,
 * iif) — entries equal here land in the same chain even if they differ
 * on tos/mark/oif.  Returns true when all hash inputs match.
 */
static inline bool compare_hash_inputs(const struct flowi *fl1,
					const struct flowi *fl2)
{
	return ((((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
		((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
		(fl1->iif ^ fl2->iif)) == 0);
}
/*
 * Full flow-key comparison used on cache lookup/insert: daddr, saddr,
 * mark, tos (plus the adjacent byte, compared as a u16), oif and iif
 * must all match.  XOR/OR combination avoids branches; result is
 * non-zero (true) only when every field is equal.
 */
static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
	return (((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
		((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
		(fl1->mark ^ fl2->mark) |
		(*(u16 *)&fl1->nl_u.ip4_u.tos ^ *(u16 *)&fl2->nl_u.ip4_u.tos) |
		(fl1->oif ^ fl2->oif) |
		(fl1->iif ^ fl2->iif)) == 0;
}
/* Do the two cached routes belong to the same network namespace? */
static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->u.dst.dev), dev_net(rt2->u.dst.dev));
}
/*
 * An entry is stale when its generation id no longer matches the
 * namespace's current rt_genid (bumped by rt_cache_invalidate()).
 */
static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
}
714 * Perform a full scan of hash table and free all entries.
715 * Can be called by a softirq or a process.
716 * In the latter case, we want to be rescheduled if necessary
718 static void rt_do_flush(int process_context
)
721 struct rtable
*rth
, *next
;
722 struct rtable
* tail
;
724 for (i
= 0; i
<= rt_hash_mask
; i
++) {
725 if (process_context
&& need_resched())
727 rth
= rt_hash_table
[i
].chain
;
731 spin_lock_bh(rt_hash_lock_addr(i
));
734 struct rtable
** prev
, * p
;
736 rth
= rt_hash_table
[i
].chain
;
738 /* defer releasing the head of the list after spin_unlock */
739 for (tail
= rth
; tail
; tail
= tail
->u
.dst
.rt_next
)
740 if (!rt_is_expired(tail
))
743 rt_hash_table
[i
].chain
= tail
;
745 /* call rt_free on entries after the tail requiring flush */
746 prev
= &rt_hash_table
[i
].chain
;
747 for (p
= *prev
; p
; p
= next
) {
748 next
= p
->u
.dst
.rt_next
;
749 if (!rt_is_expired(p
)) {
750 prev
= &p
->u
.dst
.rt_next
;
758 rth
= rt_hash_table
[i
].chain
;
759 rt_hash_table
[i
].chain
= NULL
;
762 spin_unlock_bh(rt_hash_lock_addr(i
));
764 for (; rth
!= tail
; rth
= next
) {
765 next
= rth
->u
.dst
.rt_next
;
772 * While freeing expired entries, we compute average chain length
773 * and standard deviation, using fixed-point arithmetic.
774 * This to have an estimation of rt_chain_length_max
775 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
776 * We use 3 bits for fractional part, and 29 (or 61) for magnitude.
780 #define ONE (1UL << FRACT_BITS)
783 * Given a hash chain and an item in this hash chain,
784 * find if a previous entry has the same hash_inputs
785 * (but differs on tos, mark or oif)
786 * Returns 0 if an alias is found.
787 * Returns ONE if rth has no alias before itself.
789 static int has_noalias(const struct rtable
*head
, const struct rtable
*rth
)
791 const struct rtable
*aux
= head
;
794 if (compare_hash_inputs(&aux
->fl
, &rth
->fl
))
796 aux
= aux
->u
.dst
.rt_next
;
801 static void rt_check_expire(void)
803 static unsigned int rover
;
804 unsigned int i
= rover
, goal
;
805 struct rtable
*rth
, **rthp
;
806 unsigned long samples
= 0;
807 unsigned long sum
= 0, sum2
= 0;
811 delta
= jiffies
- expires_ljiffies
;
812 expires_ljiffies
= jiffies
;
813 mult
= ((u64
)delta
) << rt_hash_log
;
814 if (ip_rt_gc_timeout
> 1)
815 do_div(mult
, ip_rt_gc_timeout
);
816 goal
= (unsigned int)mult
;
817 if (goal
> rt_hash_mask
)
818 goal
= rt_hash_mask
+ 1;
819 for (; goal
> 0; goal
--) {
820 unsigned long tmo
= ip_rt_gc_timeout
;
821 unsigned long length
;
823 i
= (i
+ 1) & rt_hash_mask
;
824 rthp
= &rt_hash_table
[i
].chain
;
834 spin_lock_bh(rt_hash_lock_addr(i
));
835 while ((rth
= *rthp
) != NULL
) {
836 prefetch(rth
->u
.dst
.rt_next
);
837 if (rt_is_expired(rth
)) {
838 *rthp
= rth
->u
.dst
.rt_next
;
842 if (rth
->u
.dst
.expires
) {
843 /* Entry is expired even if it is in use */
844 if (time_before_eq(jiffies
, rth
->u
.dst
.expires
)) {
847 rthp
= &rth
->u
.dst
.rt_next
;
849 * We only count entries on
850 * a chain with equal hash inputs once
851 * so that entries for different QOS
852 * levels, and other non-hash input
853 * attributes don't unfairly skew
854 * the length computation
856 length
+= has_noalias(rt_hash_table
[i
].chain
, rth
);
859 } else if (!rt_may_expire(rth
, tmo
, ip_rt_gc_timeout
))
862 /* Cleanup aged off entries. */
863 *rthp
= rth
->u
.dst
.rt_next
;
866 spin_unlock_bh(rt_hash_lock_addr(i
));
868 sum2
+= length
*length
;
871 unsigned long avg
= sum
/ samples
;
872 unsigned long sd
= int_sqrt(sum2
/ samples
- avg
*avg
);
873 rt_chain_length_max
= max_t(unsigned long,
875 (avg
+ 4*sd
) >> FRACT_BITS
);
881 * rt_worker_func() is run in process context.
882 * we call rt_check_expire() to scan part of the hash table
884 static void rt_worker_func(struct work_struct
*work
)
887 schedule_delayed_work(&expires_work
, ip_rt_gc_interval
);
891 * Perturbation of rt_genid by a small quantity [1..256]
892 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
893 * many times (2^24) without giving recent rt_genid.
894 * Jenkins hash is strong enough that little changes of rt_genid are OK.
/*
 * Invalidate the whole cache for @net by bumping rt_genid by a random
 * amount in [1..256]; existing entries then fail rt_is_expired() checks
 * and are reaped lazily.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	/* +1 guarantees the genid always changes, even when shuffle == 0 */
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
}
905 * delay < 0 : invalidate cache (fast : entries will be deleted later)
906 * delay >= 0 : invalidate & flush cache (can be long)
908 void rt_cache_flush(struct net
*net
, int delay
)
910 rt_cache_invalidate(net
);
912 rt_do_flush(!in_softirq());
915 /* Flush previous cache invalidated entries from the cache */
/*
 * Flush previously invalidated entries from the cache.  May reschedule
 * itself when called from process context (!in_softirq()).
 */
void rt_cache_flush_batch(void)
{
	rt_do_flush(!in_softirq());
}
922 * We change rt_genid and let gc do the cleanup
/*
 * Timer callback: periodically invalidate the cache (we change rt_genid
 * and let gc do the cleanup) and re-arm the per-netns secret timer.
 * @__net is the struct net pointer smuggled through the timer's
 * unsigned long data argument.
 */
static void rt_secret_rebuild(unsigned long __net)
{
	struct net *net = (struct net *)__net;
	rt_cache_invalidate(net);
	mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
}
/*
 * Immediate, out-of-schedule cache invalidation: stop the pending
 * secret timer, invalidate now, then restart the periodic timer if the
 * interval sysctl is non-zero.
 */
static void rt_secret_rebuild_oneshot(struct net *net)
{
	del_timer_sync(&net->ipv4.rt_secret_timer);
	rt_cache_invalidate(net);
	if (ip_rt_secret_interval)
		mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
}
/*
 * Triggered when a hash chain grows past rt_chain_length_max: warn
 * (rate-limited) and force an immediate genid rebuild so the oversized
 * chains drain.
 */
static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit()) {
		printk(KERN_WARNING "Route hash chain too long!\n");
		printk(KERN_WARNING "Adjust your secret_interval!\n");
	}
	rt_secret_rebuild_oneshot(net);
}
950 Short description of GC goals.
952 We want to build algorithm, which will keep routing cache
953 at some equilibrium point, when number of aged off entries
954 is kept approximately equal to newly generated ones.
956 Current expiration strength is variable "expire".
957 We try to adjust it dynamically, so that if networking
958 is idle expires is large enough to keep enough of warm entries,
959 and when load increases it reduces to limit cache size.
962 static int rt_garbage_collect(struct dst_ops
*ops
)
964 static unsigned long expire
= RT_GC_TIMEOUT
;
965 static unsigned long last_gc
;
967 static int equilibrium
;
968 struct rtable
*rth
, **rthp
;
969 unsigned long now
= jiffies
;
973 * Garbage collection is pretty expensive,
974 * do not make it too frequently.
977 RT_CACHE_STAT_INC(gc_total
);
979 if (now
- last_gc
< ip_rt_gc_min_interval
&&
980 atomic_read(&ipv4_dst_ops
.entries
) < ip_rt_max_size
) {
981 RT_CACHE_STAT_INC(gc_ignored
);
985 /* Calculate number of entries, which we want to expire now. */
986 goal
= atomic_read(&ipv4_dst_ops
.entries
) -
987 (ip_rt_gc_elasticity
<< rt_hash_log
);
989 if (equilibrium
< ipv4_dst_ops
.gc_thresh
)
990 equilibrium
= ipv4_dst_ops
.gc_thresh
;
991 goal
= atomic_read(&ipv4_dst_ops
.entries
) - equilibrium
;
993 equilibrium
+= min_t(unsigned int, goal
>> 1, rt_hash_mask
+ 1);
994 goal
= atomic_read(&ipv4_dst_ops
.entries
) - equilibrium
;
997 /* We are in dangerous area. Try to reduce cache really
1000 goal
= max_t(unsigned int, goal
>> 1, rt_hash_mask
+ 1);
1001 equilibrium
= atomic_read(&ipv4_dst_ops
.entries
) - goal
;
1004 if (now
- last_gc
>= ip_rt_gc_min_interval
)
1008 equilibrium
+= goal
;
1015 for (i
= rt_hash_mask
, k
= rover
; i
>= 0; i
--) {
1016 unsigned long tmo
= expire
;
1018 k
= (k
+ 1) & rt_hash_mask
;
1019 rthp
= &rt_hash_table
[k
].chain
;
1020 spin_lock_bh(rt_hash_lock_addr(k
));
1021 while ((rth
= *rthp
) != NULL
) {
1022 if (!rt_is_expired(rth
) &&
1023 !rt_may_expire(rth
, tmo
, expire
)) {
1025 rthp
= &rth
->u
.dst
.rt_next
;
1028 *rthp
= rth
->u
.dst
.rt_next
;
1032 spin_unlock_bh(rt_hash_lock_addr(k
));
1041 /* Goal is not achieved. We stop process if:
1043 - if expire reduced to zero. Otherwise, expire is halved.
1044 - if table is not full.
1045 - if we are called from interrupt.
1046 - jiffies check is just fallback/debug loop breaker.
1047 We will not spin here for long time in any case.
1050 RT_CACHE_STAT_INC(gc_goal_miss
);
1056 #if RT_CACHE_DEBUG >= 2
1057 printk(KERN_DEBUG
"expire>> %u %d %d %d\n", expire
,
1058 atomic_read(&ipv4_dst_ops
.entries
), goal
, i
);
1061 if (atomic_read(&ipv4_dst_ops
.entries
) < ip_rt_max_size
)
1063 } while (!in_softirq() && time_before_eq(jiffies
, now
));
1065 if (atomic_read(&ipv4_dst_ops
.entries
) < ip_rt_max_size
)
1067 if (net_ratelimit())
1068 printk(KERN_WARNING
"dst cache overflow\n");
1069 RT_CACHE_STAT_INC(gc_dst_overflow
);
1073 expire
+= ip_rt_gc_min_interval
;
1074 if (expire
> ip_rt_gc_timeout
||
1075 atomic_read(&ipv4_dst_ops
.entries
) < ipv4_dst_ops
.gc_thresh
)
1076 expire
= ip_rt_gc_timeout
;
1077 #if RT_CACHE_DEBUG >= 2
1078 printk(KERN_DEBUG
"expire++ %u %d %d %d\n", expire
,
1079 atomic_read(&ipv4_dst_ops
.entries
), goal
, rover
);
1085 * Returns number of entries in a hash chain that have different hash_inputs
1087 static int slow_chain_length(const struct rtable
*head
)
1090 const struct rtable
*rth
= head
;
1093 length
+= has_noalias(head
, rth
);
1094 rth
= rth
->u
.dst
.rt_next
;
1096 return length
>> FRACT_BITS
;
1099 static int rt_intern_hash(unsigned hash
, struct rtable
*rt
,
1100 struct rtable
**rp
, struct sk_buff
*skb
, int ifindex
)
1102 struct rtable
*rth
, **rthp
;
1104 struct rtable
*cand
, **candp
;
1107 int attempts
= !in_softirq();
1111 min_score
= ~(u32
)0;
1116 if (!rt_caching(dev_net(rt
->u
.dst
.dev
))) {
1118 * If we're not caching, just tell the caller we
1119 * were successful and don't touch the route. The
1120 * caller hold the sole reference to the cache entry, and
1121 * it will be released when the caller is done with it.
1122 * If we drop it here, the callers have no way to resolve routes
1123 * when we're not caching. Instead, just point *rp at rt, so
1124 * the caller gets a single use out of the route
1125 * Note that we do rt_free on this new route entry, so that
1126 * once its refcount hits zero, we are still able to reap it
1128 * Note also the rt_free uses call_rcu. We don't actually
1129 * need rcu protection here, this is just our path to get
1130 * on the route gc list.
1133 if (rt
->rt_type
== RTN_UNICAST
|| rt
->fl
.iif
== 0) {
1134 int err
= arp_bind_neighbour(&rt
->u
.dst
);
1136 if (net_ratelimit())
1138 "Neighbour table failure & not caching routes.\n");
1148 rthp
= &rt_hash_table
[hash
].chain
;
1150 spin_lock_bh(rt_hash_lock_addr(hash
));
1151 while ((rth
= *rthp
) != NULL
) {
1152 if (rt_is_expired(rth
)) {
1153 *rthp
= rth
->u
.dst
.rt_next
;
1157 if (compare_keys(&rth
->fl
, &rt
->fl
) && compare_netns(rth
, rt
)) {
1159 *rthp
= rth
->u
.dst
.rt_next
;
1161 * Since lookup is lockfree, the deletion
1162 * must be visible to another weakly ordered CPU before
1163 * the insertion at the start of the hash chain.
1165 rcu_assign_pointer(rth
->u
.dst
.rt_next
,
1166 rt_hash_table
[hash
].chain
);
1168 * Since lookup is lockfree, the update writes
1169 * must be ordered for consistency on SMP.
1171 rcu_assign_pointer(rt_hash_table
[hash
].chain
, rth
);
1173 dst_use(&rth
->u
.dst
, now
);
1174 spin_unlock_bh(rt_hash_lock_addr(hash
));
1180 skb_dst_set(skb
, &rth
->u
.dst
);
1184 if (!atomic_read(&rth
->u
.dst
.__refcnt
)) {
1185 u32 score
= rt_score(rth
);
1187 if (score
<= min_score
) {
1196 rthp
= &rth
->u
.dst
.rt_next
;
1200 /* ip_rt_gc_elasticity used to be average length of chain
1201 * length, when exceeded gc becomes really aggressive.
1203 * The second limit is less certain. At the moment it allows
1204 * only 2 entries per bucket. We will see.
1206 if (chain_length
> ip_rt_gc_elasticity
) {
1207 *candp
= cand
->u
.dst
.rt_next
;
1211 if (chain_length
> rt_chain_length_max
&&
1212 slow_chain_length(rt_hash_table
[hash
].chain
) > rt_chain_length_max
) {
1213 struct net
*net
= dev_net(rt
->u
.dst
.dev
);
1214 int num
= ++net
->ipv4
.current_rt_cache_rebuild_count
;
1215 if (!rt_caching(net
)) {
1216 printk(KERN_WARNING
"%s: %d rebuilds is over limit, route caching disabled\n",
1217 rt
->u
.dst
.dev
->name
, num
);
1219 rt_emergency_hash_rebuild(net
);
1220 spin_unlock_bh(rt_hash_lock_addr(hash
));
1222 hash
= rt_hash(rt
->fl
.fl4_dst
, rt
->fl
.fl4_src
,
1223 ifindex
, rt_genid(net
));
1228 /* Try to bind route to arp only if it is output
1229 route or unicast forwarding path.
1231 if (rt
->rt_type
== RTN_UNICAST
|| rt
->fl
.iif
== 0) {
1232 int err
= arp_bind_neighbour(&rt
->u
.dst
);
1234 spin_unlock_bh(rt_hash_lock_addr(hash
));
1236 if (err
!= -ENOBUFS
) {
1241 /* Neighbour tables are full and nothing
1242 can be released. Try to shrink route cache,
1243 it is most likely it holds some neighbour records.
1245 if (attempts
-- > 0) {
1246 int saved_elasticity
= ip_rt_gc_elasticity
;
1247 int saved_int
= ip_rt_gc_min_interval
;
1248 ip_rt_gc_elasticity
= 1;
1249 ip_rt_gc_min_interval
= 0;
1250 rt_garbage_collect(&ipv4_dst_ops
);
1251 ip_rt_gc_min_interval
= saved_int
;
1252 ip_rt_gc_elasticity
= saved_elasticity
;
1256 if (net_ratelimit())
1257 printk(KERN_WARNING
"Neighbour table overflow.\n");
1263 rt
->u
.dst
.rt_next
= rt_hash_table
[hash
].chain
;
1265 #if RT_CACHE_DEBUG >= 2
1266 if (rt
->u
.dst
.rt_next
) {
1268 printk(KERN_DEBUG
"rt_cache @%02x: %pI4",
1270 for (trt
= rt
->u
.dst
.rt_next
; trt
; trt
= trt
->u
.dst
.rt_next
)
1271 printk(" . %pI4", &trt
->rt_dst
);
1276 * Since lookup is lockfree, we must make sure
1277 * previous writes to rt are committed to memory
1278 * before making rt visible to other CPUS.
1280 rcu_assign_pointer(rt_hash_table
[hash
].chain
, rt
);
1282 spin_unlock_bh(rt_hash_lock_addr(hash
));
1288 skb_dst_set(skb
, &rt
->u
.dst
);
1292 void rt_bind_peer(struct rtable
*rt
, int create
)
1294 static DEFINE_SPINLOCK(rt_peer_lock
);
1295 struct inet_peer
*peer
;
1297 peer
= inet_getpeer(rt
->rt_dst
, create
);
1299 spin_lock_bh(&rt_peer_lock
);
1300 if (rt
->peer
== NULL
) {
1304 spin_unlock_bh(&rt_peer_lock
);
1310 * Peer allocation may fail only in serious out-of-memory conditions. However
1311 * we still can generate some output.
1312 * Random ID selection looks a bit dangerous because we have no chances to
1313 * select ID being unique in a reasonable period of time.
1314 * But broken packet identifier may be better than no packet at all.
/*
 * Fallback IP identification selection, used when no inet_peer could be
 * bound to the route (see __ip_select_ident).  Derives the next id from
 * a keyed hash of the previous value and the destination address.
 * NOTE(review): the "u32 salt;" local was reconstructed — the original
 * declaration line is missing from this view; confirm against upstream.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	/* lock serializes read-modify-write of ip_fallback_id */
	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}
1329 void __ip_select_ident(struct iphdr
*iph
, struct dst_entry
*dst
, int more
)
1331 struct rtable
*rt
= (struct rtable
*) dst
;
1334 if (rt
->peer
== NULL
)
1335 rt_bind_peer(rt
, 1);
1337 /* If peer is attached to destination, it is never detached,
1338 so that we need not to grab a lock to dereference it.
1341 iph
->id
= htons(inet_getid(rt
->peer
, more
));
1345 printk(KERN_DEBUG
"rt_bind_peer(0) @%p\n",
1346 __builtin_return_address(0));
1348 ip_select_fb_ident(iph
);
1351 static void rt_del(unsigned hash
, struct rtable
*rt
)
1353 struct rtable
**rthp
, *aux
;
1355 rthp
= &rt_hash_table
[hash
].chain
;
1356 spin_lock_bh(rt_hash_lock_addr(hash
));
1358 while ((aux
= *rthp
) != NULL
) {
1359 if (aux
== rt
|| rt_is_expired(aux
)) {
1360 *rthp
= aux
->u
.dst
.rt_next
;
1364 rthp
= &aux
->u
.dst
.rt_next
;
1366 spin_unlock_bh(rt_hash_lock_addr(hash
));
1369 void ip_rt_redirect(__be32 old_gw
, __be32 daddr
, __be32 new_gw
,
1370 __be32 saddr
, struct net_device
*dev
)
1373 struct in_device
*in_dev
= in_dev_get(dev
);
1374 struct rtable
*rth
, **rthp
;
1375 __be32 skeys
[2] = { saddr
, 0 };
1376 int ikeys
[2] = { dev
->ifindex
, 0 };
1377 struct netevent_redirect netevent
;
1384 if (new_gw
== old_gw
|| !IN_DEV_RX_REDIRECTS(in_dev
) ||
1385 ipv4_is_multicast(new_gw
) || ipv4_is_lbcast(new_gw
) ||
1386 ipv4_is_zeronet(new_gw
))
1387 goto reject_redirect
;
1389 if (!rt_caching(net
))
1390 goto reject_redirect
;
1392 if (!IN_DEV_SHARED_MEDIA(in_dev
)) {
1393 if (!inet_addr_onlink(in_dev
, new_gw
, old_gw
))
1394 goto reject_redirect
;
1395 if (IN_DEV_SEC_REDIRECTS(in_dev
) && ip_fib_check_default(new_gw
, dev
))
1396 goto reject_redirect
;
1398 if (inet_addr_type(net
, new_gw
) != RTN_UNICAST
)
1399 goto reject_redirect
;
1402 for (i
= 0; i
< 2; i
++) {
1403 for (k
= 0; k
< 2; k
++) {
1404 unsigned hash
= rt_hash(daddr
, skeys
[i
], ikeys
[k
],
1407 rthp
=&rt_hash_table
[hash
].chain
;
1410 while ((rth
= rcu_dereference(*rthp
)) != NULL
) {
1413 if (rth
->fl
.fl4_dst
!= daddr
||
1414 rth
->fl
.fl4_src
!= skeys
[i
] ||
1415 rth
->fl
.oif
!= ikeys
[k
] ||
1417 rt_is_expired(rth
) ||
1418 !net_eq(dev_net(rth
->u
.dst
.dev
), net
)) {
1419 rthp
= &rth
->u
.dst
.rt_next
;
1423 if (rth
->rt_dst
!= daddr
||
1424 rth
->rt_src
!= saddr
||
1426 rth
->rt_gateway
!= old_gw
||
1427 rth
->u
.dst
.dev
!= dev
)
1430 dst_hold(&rth
->u
.dst
);
1433 rt
= dst_alloc(&ipv4_dst_ops
);
1440 /* Copy all the information. */
1442 rt
->u
.dst
.__use
= 1;
1443 atomic_set(&rt
->u
.dst
.__refcnt
, 1);
1444 rt
->u
.dst
.child
= NULL
;
1446 dev_hold(rt
->u
.dst
.dev
);
1448 in_dev_hold(rt
->idev
);
1449 rt
->u
.dst
.obsolete
= -1;
1450 rt
->u
.dst
.lastuse
= jiffies
;
1451 rt
->u
.dst
.path
= &rt
->u
.dst
;
1452 rt
->u
.dst
.neighbour
= NULL
;
1453 rt
->u
.dst
.hh
= NULL
;
1455 rt
->u
.dst
.xfrm
= NULL
;
1457 rt
->rt_genid
= rt_genid(net
);
1458 rt
->rt_flags
|= RTCF_REDIRECTED
;
1460 /* Gateway is different ... */
1461 rt
->rt_gateway
= new_gw
;
1463 /* Redirect received -> path was valid */
1464 dst_confirm(&rth
->u
.dst
);
1467 atomic_inc(&rt
->peer
->refcnt
);
1469 if (arp_bind_neighbour(&rt
->u
.dst
) ||
1470 !(rt
->u
.dst
.neighbour
->nud_state
&
1472 if (rt
->u
.dst
.neighbour
)
1473 neigh_event_send(rt
->u
.dst
.neighbour
, NULL
);
1479 netevent
.old
= &rth
->u
.dst
;
1480 netevent
.new = &rt
->u
.dst
;
1481 call_netevent_notifiers(NETEVENT_REDIRECT
,
1485 if (!rt_intern_hash(hash
, rt
, &rt
, NULL
, rt
->fl
.oif
))
1498 #ifdef CONFIG_IP_ROUTE_VERBOSE
1499 if (IN_DEV_LOG_MARTIANS(in_dev
) && net_ratelimit())
1500 printk(KERN_INFO
"Redirect from %pI4 on %s about %pI4 ignored.\n"
1501 " Advised path = %pI4 -> %pI4\n",
1502 &old_gw
, dev
->name
, &new_gw
,
1508 static struct dst_entry
*ipv4_negative_advice(struct dst_entry
*dst
)
1510 struct rtable
*rt
= (struct rtable
*)dst
;
1511 struct dst_entry
*ret
= dst
;
1514 if (dst
->obsolete
> 0) {
1517 } else if ((rt
->rt_flags
& RTCF_REDIRECTED
) ||
1518 (rt
->u
.dst
.expires
&&
1519 time_after_eq(jiffies
, rt
->u
.dst
.expires
))) {
1520 unsigned hash
= rt_hash(rt
->fl
.fl4_dst
, rt
->fl
.fl4_src
,
1522 rt_genid(dev_net(dst
->dev
)));
1523 #if RT_CACHE_DEBUG >= 1
1524 printk(KERN_DEBUG
"ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1525 &rt
->rt_dst
, rt
->fl
.fl4_tos
);
1536 * 1. The first ip_rt_redirect_number redirects are sent
1537 * with exponential backoff, then we stop sending them at all,
1538 * assuming that the host ignores our redirects.
1539 * 2. If we did not see packets requiring redirects
1540 * during ip_rt_redirect_silence, we assume that the host
1541 * forgot redirected route and start to send redirects again.
1543 * This algorithm is much cheaper and more intelligent than dumb load limiting
1546 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1547 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1550 void ip_rt_send_redirect(struct sk_buff
*skb
)
1552 struct rtable
*rt
= skb_rtable(skb
);
1553 struct in_device
*in_dev
;
1557 in_dev
= __in_dev_get_rcu(rt
->u
.dst
.dev
);
1558 if (!in_dev
|| !IN_DEV_TX_REDIRECTS(in_dev
)) {
1562 log_martians
= IN_DEV_LOG_MARTIANS(in_dev
);
1565 /* No redirected packets during ip_rt_redirect_silence;
1566 * reset the algorithm.
1568 if (time_after(jiffies
, rt
->u
.dst
.rate_last
+ ip_rt_redirect_silence
))
1569 rt
->u
.dst
.rate_tokens
= 0;
1571 /* Too many ignored redirects; do not send anything
1572 * set u.dst.rate_last to the last seen redirected packet.
1574 if (rt
->u
.dst
.rate_tokens
>= ip_rt_redirect_number
) {
1575 rt
->u
.dst
.rate_last
= jiffies
;
1579 /* Check for load limit; set rate_last to the latest sent
1582 if (rt
->u
.dst
.rate_tokens
== 0 ||
1584 (rt
->u
.dst
.rate_last
+
1585 (ip_rt_redirect_load
<< rt
->u
.dst
.rate_tokens
)))) {
1586 icmp_send(skb
, ICMP_REDIRECT
, ICMP_REDIR_HOST
, rt
->rt_gateway
);
1587 rt
->u
.dst
.rate_last
= jiffies
;
1588 ++rt
->u
.dst
.rate_tokens
;
1589 #ifdef CONFIG_IP_ROUTE_VERBOSE
1591 rt
->u
.dst
.rate_tokens
== ip_rt_redirect_number
&&
1593 printk(KERN_WARNING
"host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1594 &rt
->rt_src
, rt
->rt_iif
,
1595 &rt
->rt_dst
, &rt
->rt_gateway
);
1600 static int ip_error(struct sk_buff
*skb
)
1602 struct rtable
*rt
= skb_rtable(skb
);
1606 switch (rt
->u
.dst
.error
) {
1611 code
= ICMP_HOST_UNREACH
;
1614 code
= ICMP_NET_UNREACH
;
1615 IP_INC_STATS_BH(dev_net(rt
->u
.dst
.dev
),
1616 IPSTATS_MIB_INNOROUTES
);
1619 code
= ICMP_PKT_FILTERED
;
1624 rt
->u
.dst
.rate_tokens
+= now
- rt
->u
.dst
.rate_last
;
1625 if (rt
->u
.dst
.rate_tokens
> ip_rt_error_burst
)
1626 rt
->u
.dst
.rate_tokens
= ip_rt_error_burst
;
1627 rt
->u
.dst
.rate_last
= now
;
1628 if (rt
->u
.dst
.rate_tokens
>= ip_rt_error_cost
) {
1629 rt
->u
.dst
.rate_tokens
-= ip_rt_error_cost
;
1630 icmp_send(skb
, ICMP_DEST_UNREACH
, code
, 0);
1633 out
: kfree_skb(skb
);
1638 * The last two values are not from the RFC but
1639 * are needed for AMPRnet AX.25 paths.
1642 static const unsigned short mtu_plateau
[] =
1643 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1645 static inline unsigned short guess_mtu(unsigned short old_mtu
)
1649 for (i
= 0; i
< ARRAY_SIZE(mtu_plateau
); i
++)
1650 if (old_mtu
> mtu_plateau
[i
])
1651 return mtu_plateau
[i
];
1655 unsigned short ip_rt_frag_needed(struct net
*net
, struct iphdr
*iph
,
1656 unsigned short new_mtu
,
1657 struct net_device
*dev
)
1660 unsigned short old_mtu
= ntohs(iph
->tot_len
);
1662 int ikeys
[2] = { dev
->ifindex
, 0 };
1663 __be32 skeys
[2] = { iph
->saddr
, 0, };
1664 __be32 daddr
= iph
->daddr
;
1665 unsigned short est_mtu
= 0;
1667 for (k
= 0; k
< 2; k
++) {
1668 for (i
= 0; i
< 2; i
++) {
1669 unsigned hash
= rt_hash(daddr
, skeys
[i
], ikeys
[k
],
1673 for (rth
= rcu_dereference(rt_hash_table
[hash
].chain
); rth
;
1674 rth
= rcu_dereference(rth
->u
.dst
.rt_next
)) {
1675 unsigned short mtu
= new_mtu
;
1677 if (rth
->fl
.fl4_dst
!= daddr
||
1678 rth
->fl
.fl4_src
!= skeys
[i
] ||
1679 rth
->rt_dst
!= daddr
||
1680 rth
->rt_src
!= iph
->saddr
||
1681 rth
->fl
.oif
!= ikeys
[k
] ||
1683 dst_metric_locked(&rth
->u
.dst
, RTAX_MTU
) ||
1684 !net_eq(dev_net(rth
->u
.dst
.dev
), net
) ||
1688 if (new_mtu
< 68 || new_mtu
>= old_mtu
) {
1690 /* BSD 4.2 compatibility hack :-( */
1692 old_mtu
>= dst_mtu(&rth
->u
.dst
) &&
1693 old_mtu
>= 68 + (iph
->ihl
<< 2))
1694 old_mtu
-= iph
->ihl
<< 2;
1696 mtu
= guess_mtu(old_mtu
);
1698 if (mtu
<= dst_mtu(&rth
->u
.dst
)) {
1699 if (mtu
< dst_mtu(&rth
->u
.dst
)) {
1700 dst_confirm(&rth
->u
.dst
);
1701 if (mtu
< ip_rt_min_pmtu
) {
1702 mtu
= ip_rt_min_pmtu
;
1703 rth
->u
.dst
.metrics
[RTAX_LOCK
-1] |=
1706 rth
->u
.dst
.metrics
[RTAX_MTU
-1] = mtu
;
1707 dst_set_expires(&rth
->u
.dst
,
1716 return est_mtu
? : new_mtu
;
1719 static void ip_rt_update_pmtu(struct dst_entry
*dst
, u32 mtu
)
1721 if (dst_mtu(dst
) > mtu
&& mtu
>= 68 &&
1722 !(dst_metric_locked(dst
, RTAX_MTU
))) {
1723 if (mtu
< ip_rt_min_pmtu
) {
1724 mtu
= ip_rt_min_pmtu
;
1725 dst
->metrics
[RTAX_LOCK
-1] |= (1 << RTAX_MTU
);
1727 dst
->metrics
[RTAX_MTU
-1] = mtu
;
1728 dst_set_expires(dst
, ip_rt_mtu_expires
);
1729 call_netevent_notifiers(NETEVENT_PMTU_UPDATE
, dst
);
1733 static struct dst_entry
*ipv4_dst_check(struct dst_entry
*dst
, u32 cookie
)
1735 if (rt_is_expired((struct rtable
*)dst
))
1740 static void ipv4_dst_destroy(struct dst_entry
*dst
)
1742 struct rtable
*rt
= (struct rtable
*) dst
;
1743 struct inet_peer
*peer
= rt
->peer
;
1744 struct in_device
*idev
= rt
->idev
;
1757 static void ipv4_dst_ifdown(struct dst_entry
*dst
, struct net_device
*dev
,
1760 struct rtable
*rt
= (struct rtable
*) dst
;
1761 struct in_device
*idev
= rt
->idev
;
1762 if (dev
!= dev_net(dev
)->loopback_dev
&& idev
&& idev
->dev
== dev
) {
1763 struct in_device
*loopback_idev
=
1764 in_dev_get(dev_net(dev
)->loopback_dev
);
1765 if (loopback_idev
) {
1766 rt
->idev
= loopback_idev
;
1772 static void ipv4_link_failure(struct sk_buff
*skb
)
1776 icmp_send(skb
, ICMP_DEST_UNREACH
, ICMP_HOST_UNREACH
, 0);
1778 rt
= skb_rtable(skb
);
1780 dst_set_expires(&rt
->u
.dst
, 0);
1783 static int ip_rt_bug(struct sk_buff
*skb
)
1785 printk(KERN_DEBUG
"ip_rt_bug: %pI4 -> %pI4, %s\n",
1786 &ip_hdr(skb
)->saddr
, &ip_hdr(skb
)->daddr
,
1787 skb
->dev
? skb
->dev
->name
: "?");
1793 We do not cache the source address of the outgoing interface,
1794 because it is used only by the IP RR, TS and SRR options,
1795 so it is out of the fast path.
1797 BTW remember: "addr" is allowed to be unaligned.
1801 void ip_rt_get_source(u8
*addr
, struct rtable
*rt
)
1804 struct fib_result res
;
1806 if (rt
->fl
.iif
== 0)
1808 else if (fib_lookup(dev_net(rt
->u
.dst
.dev
), &rt
->fl
, &res
) == 0) {
1809 src
= FIB_RES_PREFSRC(res
);
1812 src
= inet_select_addr(rt
->u
.dst
.dev
, rt
->rt_gateway
,
1814 memcpy(addr
, &src
, 4);
1817 #ifdef CONFIG_NET_CLS_ROUTE
1818 static void set_class_tag(struct rtable
*rt
, u32 tag
)
1820 if (!(rt
->u
.dst
.tclassid
& 0xFFFF))
1821 rt
->u
.dst
.tclassid
|= tag
& 0xFFFF;
1822 if (!(rt
->u
.dst
.tclassid
& 0xFFFF0000))
1823 rt
->u
.dst
.tclassid
|= tag
& 0xFFFF0000;
1827 static void rt_set_nexthop(struct rtable
*rt
, struct fib_result
*res
, u32 itag
)
1829 struct fib_info
*fi
= res
->fi
;
1832 if (FIB_RES_GW(*res
) &&
1833 FIB_RES_NH(*res
).nh_scope
== RT_SCOPE_LINK
)
1834 rt
->rt_gateway
= FIB_RES_GW(*res
);
1835 memcpy(rt
->u
.dst
.metrics
, fi
->fib_metrics
,
1836 sizeof(rt
->u
.dst
.metrics
));
1837 if (fi
->fib_mtu
== 0) {
1838 rt
->u
.dst
.metrics
[RTAX_MTU
-1] = rt
->u
.dst
.dev
->mtu
;
1839 if (dst_metric_locked(&rt
->u
.dst
, RTAX_MTU
) &&
1840 rt
->rt_gateway
!= rt
->rt_dst
&&
1841 rt
->u
.dst
.dev
->mtu
> 576)
1842 rt
->u
.dst
.metrics
[RTAX_MTU
-1] = 576;
1844 #ifdef CONFIG_NET_CLS_ROUTE
1845 rt
->u
.dst
.tclassid
= FIB_RES_NH(*res
).nh_tclassid
;
1848 rt
->u
.dst
.metrics
[RTAX_MTU
-1]= rt
->u
.dst
.dev
->mtu
;
1850 if (dst_metric(&rt
->u
.dst
, RTAX_HOPLIMIT
) == 0)
1851 rt
->u
.dst
.metrics
[RTAX_HOPLIMIT
-1] = sysctl_ip_default_ttl
;
1852 if (dst_mtu(&rt
->u
.dst
) > IP_MAX_MTU
)
1853 rt
->u
.dst
.metrics
[RTAX_MTU
-1] = IP_MAX_MTU
;
1854 if (dst_metric(&rt
->u
.dst
, RTAX_ADVMSS
) == 0)
1855 rt
->u
.dst
.metrics
[RTAX_ADVMSS
-1] = max_t(unsigned int, rt
->u
.dst
.dev
->mtu
- 40,
1857 if (dst_metric(&rt
->u
.dst
, RTAX_ADVMSS
) > 65535 - 40)
1858 rt
->u
.dst
.metrics
[RTAX_ADVMSS
-1] = 65535 - 40;
1860 #ifdef CONFIG_NET_CLS_ROUTE
1861 #ifdef CONFIG_IP_MULTIPLE_TABLES
1862 set_class_tag(rt
, fib_rules_tclass(res
));
1864 set_class_tag(rt
, itag
);
1866 rt
->rt_type
= res
->type
;
1869 static int ip_route_input_mc(struct sk_buff
*skb
, __be32 daddr
, __be32 saddr
,
1870 u8 tos
, struct net_device
*dev
, int our
)
1875 struct in_device
*in_dev
= in_dev_get(dev
);
1878 /* Primary sanity checks. */
1883 if (ipv4_is_multicast(saddr
) || ipv4_is_lbcast(saddr
) ||
1884 ipv4_is_loopback(saddr
) || skb
->protocol
!= htons(ETH_P_IP
))
1887 if (ipv4_is_zeronet(saddr
)) {
1888 if (!ipv4_is_local_multicast(daddr
))
1890 spec_dst
= inet_select_addr(dev
, 0, RT_SCOPE_LINK
);
1891 } else if (fib_validate_source(saddr
, 0, tos
, 0,
1892 dev
, &spec_dst
, &itag
, 0) < 0)
1895 rth
= dst_alloc(&ipv4_dst_ops
);
1899 rth
->u
.dst
.output
= ip_rt_bug
;
1900 rth
->u
.dst
.obsolete
= -1;
1902 atomic_set(&rth
->u
.dst
.__refcnt
, 1);
1903 rth
->u
.dst
.flags
= DST_HOST
;
1904 if (IN_DEV_CONF_GET(in_dev
, NOPOLICY
))
1905 rth
->u
.dst
.flags
|= DST_NOPOLICY
;
1906 rth
->fl
.fl4_dst
= daddr
;
1907 rth
->rt_dst
= daddr
;
1908 rth
->fl
.fl4_tos
= tos
;
1909 rth
->fl
.mark
= skb
->mark
;
1910 rth
->fl
.fl4_src
= saddr
;
1911 rth
->rt_src
= saddr
;
1912 #ifdef CONFIG_NET_CLS_ROUTE
1913 rth
->u
.dst
.tclassid
= itag
;
1916 rth
->fl
.iif
= dev
->ifindex
;
1917 rth
->u
.dst
.dev
= init_net
.loopback_dev
;
1918 dev_hold(rth
->u
.dst
.dev
);
1919 rth
->idev
= in_dev_get(rth
->u
.dst
.dev
);
1921 rth
->rt_gateway
= daddr
;
1922 rth
->rt_spec_dst
= spec_dst
;
1923 rth
->rt_genid
= rt_genid(dev_net(dev
));
1924 rth
->rt_flags
= RTCF_MULTICAST
;
1925 rth
->rt_type
= RTN_MULTICAST
;
1927 rth
->u
.dst
.input
= ip_local_deliver
;
1928 rth
->rt_flags
|= RTCF_LOCAL
;
1931 #ifdef CONFIG_IP_MROUTE
1932 if (!ipv4_is_local_multicast(daddr
) && IN_DEV_MFORWARD(in_dev
))
1933 rth
->u
.dst
.input
= ip_mr_input
;
1935 RT_CACHE_STAT_INC(in_slow_mc
);
1938 hash
= rt_hash(daddr
, saddr
, dev
->ifindex
, rt_genid(dev_net(dev
)));
1939 return rt_intern_hash(hash
, rth
, NULL
, skb
, dev
->ifindex
);
1951 static void ip_handle_martian_source(struct net_device
*dev
,
1952 struct in_device
*in_dev
,
1953 struct sk_buff
*skb
,
1957 RT_CACHE_STAT_INC(in_martian_src
);
1958 #ifdef CONFIG_IP_ROUTE_VERBOSE
1959 if (IN_DEV_LOG_MARTIANS(in_dev
) && net_ratelimit()) {
1961 * RFC1812 recommendation, if source is martian,
1962 * the only hint is MAC header.
1964 printk(KERN_WARNING
"martian source %pI4 from %pI4, on dev %s\n",
1965 &daddr
, &saddr
, dev
->name
);
1966 if (dev
->hard_header_len
&& skb_mac_header_was_set(skb
)) {
1968 const unsigned char *p
= skb_mac_header(skb
);
1969 printk(KERN_WARNING
"ll header: ");
1970 for (i
= 0; i
< dev
->hard_header_len
; i
++, p
++) {
1972 if (i
< (dev
->hard_header_len
- 1))
1981 static int __mkroute_input(struct sk_buff
*skb
,
1982 struct fib_result
*res
,
1983 struct in_device
*in_dev
,
1984 __be32 daddr
, __be32 saddr
, u32 tos
,
1985 struct rtable
**result
)
1990 struct in_device
*out_dev
;
1995 /* get a working reference to the output device */
1996 out_dev
= in_dev_get(FIB_RES_DEV(*res
));
1997 if (out_dev
== NULL
) {
1998 if (net_ratelimit())
1999 printk(KERN_CRIT
"Bug in ip_route_input" \
2000 "_slow(). Please, report\n");
2005 err
= fib_validate_source(saddr
, daddr
, tos
, FIB_RES_OIF(*res
),
2006 in_dev
->dev
, &spec_dst
, &itag
, skb
->mark
);
2008 ip_handle_martian_source(in_dev
->dev
, in_dev
, skb
, daddr
,
2016 flags
|= RTCF_DIRECTSRC
;
2018 if (out_dev
== in_dev
&& err
&&
2019 (IN_DEV_SHARED_MEDIA(out_dev
) ||
2020 inet_addr_onlink(out_dev
, saddr
, FIB_RES_GW(*res
))))
2021 flags
|= RTCF_DOREDIRECT
;
2023 if (skb
->protocol
!= htons(ETH_P_IP
)) {
2024 /* Not IP (i.e. ARP). Do not create route, if it is
2025 * invalid for proxy arp. DNAT routes are always valid.
2027 * Proxy arp feature have been extended to allow, ARP
2028 * replies back to the same interface, to support
2029 * Private VLAN switch technologies. See arp.c.
2031 if (out_dev
== in_dev
&&
2032 IN_DEV_PROXY_ARP_PVLAN(in_dev
) == 0) {
2039 rth
= dst_alloc(&ipv4_dst_ops
);
2045 atomic_set(&rth
->u
.dst
.__refcnt
, 1);
2046 rth
->u
.dst
.flags
= DST_HOST
;
2047 if (IN_DEV_CONF_GET(in_dev
, NOPOLICY
))
2048 rth
->u
.dst
.flags
|= DST_NOPOLICY
;
2049 if (IN_DEV_CONF_GET(out_dev
, NOXFRM
))
2050 rth
->u
.dst
.flags
|= DST_NOXFRM
;
2051 rth
->fl
.fl4_dst
= daddr
;
2052 rth
->rt_dst
= daddr
;
2053 rth
->fl
.fl4_tos
= tos
;
2054 rth
->fl
.mark
= skb
->mark
;
2055 rth
->fl
.fl4_src
= saddr
;
2056 rth
->rt_src
= saddr
;
2057 rth
->rt_gateway
= daddr
;
2059 rth
->fl
.iif
= in_dev
->dev
->ifindex
;
2060 rth
->u
.dst
.dev
= (out_dev
)->dev
;
2061 dev_hold(rth
->u
.dst
.dev
);
2062 rth
->idev
= in_dev_get(rth
->u
.dst
.dev
);
2064 rth
->rt_spec_dst
= spec_dst
;
2066 rth
->u
.dst
.obsolete
= -1;
2067 rth
->u
.dst
.input
= ip_forward
;
2068 rth
->u
.dst
.output
= ip_output
;
2069 rth
->rt_genid
= rt_genid(dev_net(rth
->u
.dst
.dev
));
2071 rt_set_nexthop(rth
, res
, itag
);
2073 rth
->rt_flags
= flags
;
2078 /* release the working reference to the output device */
2079 in_dev_put(out_dev
);
2083 static int ip_mkroute_input(struct sk_buff
*skb
,
2084 struct fib_result
*res
,
2085 const struct flowi
*fl
,
2086 struct in_device
*in_dev
,
2087 __be32 daddr
, __be32 saddr
, u32 tos
)
2089 struct rtable
* rth
= NULL
;
2093 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2094 if (res
->fi
&& res
->fi
->fib_nhs
> 1 && fl
->oif
== 0)
2095 fib_select_multipath(fl
, res
);
2098 /* create a routing cache entry */
2099 err
= __mkroute_input(skb
, res
, in_dev
, daddr
, saddr
, tos
, &rth
);
2103 /* put it into the cache */
2104 hash
= rt_hash(daddr
, saddr
, fl
->iif
,
2105 rt_genid(dev_net(rth
->u
.dst
.dev
)));
2106 return rt_intern_hash(hash
, rth
, NULL
, skb
, fl
->iif
);
2110 * NOTE. We drop all packets that have a local source
2111 * address, because every properly looped-back packet
2112 * must have the correct destination already attached by the output routine.
2114 * Such an approach solves two big problems:
2115 * 1. Non-simplex devices are handled properly.
2116 * 2. IP spoofing attempts are filtered with a 100% guarantee.
2119 static int ip_route_input_slow(struct sk_buff
*skb
, __be32 daddr
, __be32 saddr
,
2120 u8 tos
, struct net_device
*dev
)
2122 struct fib_result res
;
2123 struct in_device
*in_dev
= in_dev_get(dev
);
2124 struct flowi fl
= { .nl_u
= { .ip4_u
=
2128 .scope
= RT_SCOPE_UNIVERSE
,
2131 .iif
= dev
->ifindex
};
2134 struct rtable
* rth
;
2139 struct net
* net
= dev_net(dev
);
2141 /* IP on this device is disabled. */
2146 /* Check for the most weird martians, which cannot be detected
2150 if (ipv4_is_multicast(saddr
) || ipv4_is_lbcast(saddr
) ||
2151 ipv4_is_loopback(saddr
))
2152 goto martian_source
;
2154 if (daddr
== htonl(0xFFFFFFFF) || (saddr
== 0 && daddr
== 0))
2157 /* Accept zero addresses only to limited broadcast;
2158 * I do not even know whether to fix it or not. Waiting for complaints :-)
2160 if (ipv4_is_zeronet(saddr
))
2161 goto martian_source
;
2163 if (ipv4_is_lbcast(daddr
) || ipv4_is_zeronet(daddr
) ||
2164 ipv4_is_loopback(daddr
))
2165 goto martian_destination
;
2168 * Now we are ready to route packet.
2170 if ((err
= fib_lookup(net
, &fl
, &res
)) != 0) {
2171 if (!IN_DEV_FORWARD(in_dev
))
2177 RT_CACHE_STAT_INC(in_slow_tot
);
2179 if (res
.type
== RTN_BROADCAST
)
2182 if (res
.type
== RTN_LOCAL
) {
2184 result
= fib_validate_source(saddr
, daddr
, tos
,
2185 net
->loopback_dev
->ifindex
,
2186 dev
, &spec_dst
, &itag
, skb
->mark
);
2188 goto martian_source
;
2190 flags
|= RTCF_DIRECTSRC
;
2195 if (!IN_DEV_FORWARD(in_dev
))
2197 if (res
.type
!= RTN_UNICAST
)
2198 goto martian_destination
;
2200 err
= ip_mkroute_input(skb
, &res
, &fl
, in_dev
, daddr
, saddr
, tos
);
2208 if (skb
->protocol
!= htons(ETH_P_IP
))
2211 if (ipv4_is_zeronet(saddr
))
2212 spec_dst
= inet_select_addr(dev
, 0, RT_SCOPE_LINK
);
2214 err
= fib_validate_source(saddr
, 0, tos
, 0, dev
, &spec_dst
,
2217 goto martian_source
;
2219 flags
|= RTCF_DIRECTSRC
;
2221 flags
|= RTCF_BROADCAST
;
2222 res
.type
= RTN_BROADCAST
;
2223 RT_CACHE_STAT_INC(in_brd
);
2226 rth
= dst_alloc(&ipv4_dst_ops
);
2230 rth
->u
.dst
.output
= ip_rt_bug
;
2231 rth
->u
.dst
.obsolete
= -1;
2232 rth
->rt_genid
= rt_genid(net
);
2234 atomic_set(&rth
->u
.dst
.__refcnt
, 1);
2235 rth
->u
.dst
.flags
= DST_HOST
;
2236 if (IN_DEV_CONF_GET(in_dev
, NOPOLICY
))
2237 rth
->u
.dst
.flags
|= DST_NOPOLICY
;
2238 rth
->fl
.fl4_dst
= daddr
;
2239 rth
->rt_dst
= daddr
;
2240 rth
->fl
.fl4_tos
= tos
;
2241 rth
->fl
.mark
= skb
->mark
;
2242 rth
->fl
.fl4_src
= saddr
;
2243 rth
->rt_src
= saddr
;
2244 #ifdef CONFIG_NET_CLS_ROUTE
2245 rth
->u
.dst
.tclassid
= itag
;
2248 rth
->fl
.iif
= dev
->ifindex
;
2249 rth
->u
.dst
.dev
= net
->loopback_dev
;
2250 dev_hold(rth
->u
.dst
.dev
);
2251 rth
->idev
= in_dev_get(rth
->u
.dst
.dev
);
2252 rth
->rt_gateway
= daddr
;
2253 rth
->rt_spec_dst
= spec_dst
;
2254 rth
->u
.dst
.input
= ip_local_deliver
;
2255 rth
->rt_flags
= flags
|RTCF_LOCAL
;
2256 if (res
.type
== RTN_UNREACHABLE
) {
2257 rth
->u
.dst
.input
= ip_error
;
2258 rth
->u
.dst
.error
= -err
;
2259 rth
->rt_flags
&= ~RTCF_LOCAL
;
2261 rth
->rt_type
= res
.type
;
2262 hash
= rt_hash(daddr
, saddr
, fl
.iif
, rt_genid(net
));
2263 err
= rt_intern_hash(hash
, rth
, NULL
, skb
, fl
.iif
);
2267 RT_CACHE_STAT_INC(in_no_route
);
2268 spec_dst
= inet_select_addr(dev
, 0, RT_SCOPE_UNIVERSE
);
2269 res
.type
= RTN_UNREACHABLE
;
2275 * Do not cache martian addresses: they should be logged (RFC1812)
2277 martian_destination
:
2278 RT_CACHE_STAT_INC(in_martian_dst
);
2279 #ifdef CONFIG_IP_ROUTE_VERBOSE
2280 if (IN_DEV_LOG_MARTIANS(in_dev
) && net_ratelimit())
2281 printk(KERN_WARNING
"martian destination %pI4 from %pI4, dev %s\n",
2282 &daddr
, &saddr
, dev
->name
);
2286 err
= -EHOSTUNREACH
;
2298 ip_handle_martian_source(dev
, in_dev
, skb
, daddr
, saddr
);
2302 int ip_route_input(struct sk_buff
*skb
, __be32 daddr
, __be32 saddr
,
2303 u8 tos
, struct net_device
*dev
)
2305 struct rtable
* rth
;
2307 int iif
= dev
->ifindex
;
2312 if (!rt_caching(net
))
2315 tos
&= IPTOS_RT_MASK
;
2316 hash
= rt_hash(daddr
, saddr
, iif
, rt_genid(net
));
2319 for (rth
= rcu_dereference(rt_hash_table
[hash
].chain
); rth
;
2320 rth
= rcu_dereference(rth
->u
.dst
.rt_next
)) {
2321 if ((((__force u32
)rth
->fl
.fl4_dst
^ (__force u32
)daddr
) |
2322 ((__force u32
)rth
->fl
.fl4_src
^ (__force u32
)saddr
) |
2323 (rth
->fl
.iif
^ iif
) |
2325 (rth
->fl
.fl4_tos
^ tos
)) == 0 &&
2326 rth
->fl
.mark
== skb
->mark
&&
2327 net_eq(dev_net(rth
->u
.dst
.dev
), net
) &&
2328 !rt_is_expired(rth
)) {
2329 dst_use(&rth
->u
.dst
, jiffies
);
2330 RT_CACHE_STAT_INC(in_hit
);
2332 skb_dst_set(skb
, &rth
->u
.dst
);
2335 RT_CACHE_STAT_INC(in_hlist_search
);
2340 /* Multicast recognition logic is moved from the route cache to here.
2341 The problem was that too many Ethernet cards have broken/missing
2342 hardware multicast filters :-( As a result, a host on a multicast
2343 network acquires a lot of useless route cache entries, sort of
2344 SDR messages from all over the world. Now we try to get rid of them.
2345 Really, provided the software IP multicast filter is organized
2346 reasonably (at least, hashed), it does not result in a slowdown
2347 compared with route cache reject entries.
2348 Note that multicast routers are not affected, because a
2349 route cache entry is created eventually.
2351 if (ipv4_is_multicast(daddr
)) {
2352 struct in_device
*in_dev
;
2355 if ((in_dev
= __in_dev_get_rcu(dev
)) != NULL
) {
2356 int our
= ip_check_mc(in_dev
, daddr
, saddr
,
2357 ip_hdr(skb
)->protocol
);
2359 #ifdef CONFIG_IP_MROUTE
2361 (!ipv4_is_local_multicast(daddr
) &&
2362 IN_DEV_MFORWARD(in_dev
))
2366 return ip_route_input_mc(skb
, daddr
, saddr
,
2373 return ip_route_input_slow(skb
, daddr
, saddr
, tos
, dev
);
2376 static int __mkroute_output(struct rtable
**result
,
2377 struct fib_result
*res
,
2378 const struct flowi
*fl
,
2379 const struct flowi
*oldflp
,
2380 struct net_device
*dev_out
,
2384 struct in_device
*in_dev
;
2385 u32 tos
= RT_FL_TOS(oldflp
);
2388 if (ipv4_is_loopback(fl
->fl4_src
) && !(dev_out
->flags
&IFF_LOOPBACK
))
2391 if (fl
->fl4_dst
== htonl(0xFFFFFFFF))
2392 res
->type
= RTN_BROADCAST
;
2393 else if (ipv4_is_multicast(fl
->fl4_dst
))
2394 res
->type
= RTN_MULTICAST
;
2395 else if (ipv4_is_lbcast(fl
->fl4_dst
) || ipv4_is_zeronet(fl
->fl4_dst
))
2398 if (dev_out
->flags
& IFF_LOOPBACK
)
2399 flags
|= RTCF_LOCAL
;
2401 /* get work reference to inet device */
2402 in_dev
= in_dev_get(dev_out
);
2406 if (res
->type
== RTN_BROADCAST
) {
2407 flags
|= RTCF_BROADCAST
| RTCF_LOCAL
;
2409 fib_info_put(res
->fi
);
2412 } else if (res
->type
== RTN_MULTICAST
) {
2413 flags
|= RTCF_MULTICAST
|RTCF_LOCAL
;
2414 if (!ip_check_mc(in_dev
, oldflp
->fl4_dst
, oldflp
->fl4_src
,
2416 flags
&= ~RTCF_LOCAL
;
2417 /* If multicast route do not exist use
2418 default one, but do not gateway in this case.
2421 if (res
->fi
&& res
->prefixlen
< 4) {
2422 fib_info_put(res
->fi
);
2428 rth
= dst_alloc(&ipv4_dst_ops
);
2434 atomic_set(&rth
->u
.dst
.__refcnt
, 1);
2435 rth
->u
.dst
.flags
= DST_HOST
;
2436 if (IN_DEV_CONF_GET(in_dev
, NOXFRM
))
2437 rth
->u
.dst
.flags
|= DST_NOXFRM
;
2438 if (IN_DEV_CONF_GET(in_dev
, NOPOLICY
))
2439 rth
->u
.dst
.flags
|= DST_NOPOLICY
;
2441 rth
->fl
.fl4_dst
= oldflp
->fl4_dst
;
2442 rth
->fl
.fl4_tos
= tos
;
2443 rth
->fl
.fl4_src
= oldflp
->fl4_src
;
2444 rth
->fl
.oif
= oldflp
->oif
;
2445 rth
->fl
.mark
= oldflp
->mark
;
2446 rth
->rt_dst
= fl
->fl4_dst
;
2447 rth
->rt_src
= fl
->fl4_src
;
2448 rth
->rt_iif
= oldflp
->oif
? : dev_out
->ifindex
;
2449 /* get references to the devices that are to be hold by the routing
2451 rth
->u
.dst
.dev
= dev_out
;
2453 rth
->idev
= in_dev_get(dev_out
);
2454 rth
->rt_gateway
= fl
->fl4_dst
;
2455 rth
->rt_spec_dst
= fl
->fl4_src
;
2457 rth
->u
.dst
.output
=ip_output
;
2458 rth
->u
.dst
.obsolete
= -1;
2459 rth
->rt_genid
= rt_genid(dev_net(dev_out
));
2461 RT_CACHE_STAT_INC(out_slow_tot
);
2463 if (flags
& RTCF_LOCAL
) {
2464 rth
->u
.dst
.input
= ip_local_deliver
;
2465 rth
->rt_spec_dst
= fl
->fl4_dst
;
2467 if (flags
& (RTCF_BROADCAST
| RTCF_MULTICAST
)) {
2468 rth
->rt_spec_dst
= fl
->fl4_src
;
2469 if (flags
& RTCF_LOCAL
&&
2470 !(dev_out
->flags
& IFF_LOOPBACK
)) {
2471 rth
->u
.dst
.output
= ip_mc_output
;
2472 RT_CACHE_STAT_INC(out_slow_mc
);
2474 #ifdef CONFIG_IP_MROUTE
2475 if (res
->type
== RTN_MULTICAST
) {
2476 if (IN_DEV_MFORWARD(in_dev
) &&
2477 !ipv4_is_local_multicast(oldflp
->fl4_dst
)) {
2478 rth
->u
.dst
.input
= ip_mr_input
;
2479 rth
->u
.dst
.output
= ip_mc_output
;
2485 rt_set_nexthop(rth
, res
, 0);
2487 rth
->rt_flags
= flags
;
2491 /* release work reference to inet device */
2497 static int ip_mkroute_output(struct rtable
**rp
,
2498 struct fib_result
*res
,
2499 const struct flowi
*fl
,
2500 const struct flowi
*oldflp
,
2501 struct net_device
*dev_out
,
2504 struct rtable
*rth
= NULL
;
2505 int err
= __mkroute_output(&rth
, res
, fl
, oldflp
, dev_out
, flags
);
2508 hash
= rt_hash(oldflp
->fl4_dst
, oldflp
->fl4_src
, oldflp
->oif
,
2509 rt_genid(dev_net(dev_out
)));
2510 err
= rt_intern_hash(hash
, rth
, rp
, NULL
, oldflp
->oif
);
2517 * Major route resolver routine.
2520 static int ip_route_output_slow(struct net
*net
, struct rtable
**rp
,
2521 const struct flowi
*oldflp
)
2523 u32 tos
= RT_FL_TOS(oldflp
);
2524 struct flowi fl
= { .nl_u
= { .ip4_u
=
2525 { .daddr
= oldflp
->fl4_dst
,
2526 .saddr
= oldflp
->fl4_src
,
2527 .tos
= tos
& IPTOS_RT_MASK
,
2528 .scope
= ((tos
& RTO_ONLINK
) ?
2532 .mark
= oldflp
->mark
,
2533 .iif
= net
->loopback_dev
->ifindex
,
2534 .oif
= oldflp
->oif
};
2535 struct fib_result res
;
2537 struct net_device
*dev_out
= NULL
;
2543 #ifdef CONFIG_IP_MULTIPLE_TABLES
2547 if (oldflp
->fl4_src
) {
2549 if (ipv4_is_multicast(oldflp
->fl4_src
) ||
2550 ipv4_is_lbcast(oldflp
->fl4_src
) ||
2551 ipv4_is_zeronet(oldflp
->fl4_src
))
2554 /* I removed check for oif == dev_out->oif here.
2555 It was wrong for two reasons:
2556 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2557 is assigned to multiple interfaces.
2558 2. Moreover, we are allowed to send packets with saddr
2559 of another iface. --ANK
2562 if (oldflp
->oif
== 0 &&
2563 (ipv4_is_multicast(oldflp
->fl4_dst
) ||
2564 oldflp
->fl4_dst
== htonl(0xFFFFFFFF))) {
2565 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2566 dev_out
= ip_dev_find(net
, oldflp
->fl4_src
);
2567 if (dev_out
== NULL
)
2570 /* Special hack: user can direct multicasts
2571 and limited broadcast via necessary interface
2572 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2573 This hack is not just for fun, it allows
2574 vic,vat and friends to work.
2575 They bind socket to loopback, set ttl to zero
2576 and expect that it will work.
2577 From the viewpoint of routing cache they are broken,
2578 because we are not allowed to build multicast path
2579 with loopback source addr (look, routing cache
2580 cannot know, that ttl is zero, so that packet
2581 will not leave this host and route is valid).
2582 Luckily, this hack is good workaround.
2585 fl
.oif
= dev_out
->ifindex
;
2589 if (!(oldflp
->flags
& FLOWI_FLAG_ANYSRC
)) {
2590 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2591 dev_out
= ip_dev_find(net
, oldflp
->fl4_src
);
2592 if (dev_out
== NULL
)
2601 dev_out
= dev_get_by_index(net
, oldflp
->oif
);
2603 if (dev_out
== NULL
)
2606 /* RACE: Check return value of inet_select_addr instead. */
2607 if (__in_dev_get_rtnl(dev_out
) == NULL
) {
2609 goto out
; /* Wrong error code */
2612 if (ipv4_is_local_multicast(oldflp
->fl4_dst
) ||
2613 oldflp
->fl4_dst
== htonl(0xFFFFFFFF)) {
2615 fl
.fl4_src
= inet_select_addr(dev_out
, 0,
2620 if (ipv4_is_multicast(oldflp
->fl4_dst
))
2621 fl
.fl4_src
= inet_select_addr(dev_out
, 0,
2623 else if (!oldflp
->fl4_dst
)
2624 fl
.fl4_src
= inet_select_addr(dev_out
, 0,
2630 fl
.fl4_dst
= fl
.fl4_src
;
2632 fl
.fl4_dst
= fl
.fl4_src
= htonl(INADDR_LOOPBACK
);
2635 dev_out
= net
->loopback_dev
;
2637 fl
.oif
= net
->loopback_dev
->ifindex
;
2638 res
.type
= RTN_LOCAL
;
2639 flags
|= RTCF_LOCAL
;
2643 if (fib_lookup(net
, &fl
, &res
)) {
2646 /* Apparently, routing tables are wrong. Assume,
2647 that the destination is on link.
2650 Because we are allowed to send to iface
2651 even if it has NO routes and NO assigned
2652 addresses. When oif is specified, routing
2653 tables are looked up with only one purpose:
2654 to catch if destination is gatewayed, rather than
2655 direct. Moreover, if MSG_DONTROUTE is set,
2656 we send packet, ignoring both routing tables
2657 and ifaddr state. --ANK
2660 We could make it even if oif is unknown,
2661 likely IPv6, but we do not.
2664 if (fl
.fl4_src
== 0)
2665 fl
.fl4_src
= inet_select_addr(dev_out
, 0,
2667 res
.type
= RTN_UNICAST
;
2677 if (res
.type
== RTN_LOCAL
) {
2679 fl
.fl4_src
= fl
.fl4_dst
;
2682 dev_out
= net
->loopback_dev
;
2684 fl
.oif
= dev_out
->ifindex
;
2686 fib_info_put(res
.fi
);
2688 flags
|= RTCF_LOCAL
;
2692 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2693 if (res
.fi
->fib_nhs
> 1 && fl
.oif
== 0)
2694 fib_select_multipath(&fl
, &res
);
2697 if (!res
.prefixlen
&& res
.type
== RTN_UNICAST
&& !fl
.oif
)
2698 fib_select_default(net
, &fl
, &res
);
2701 fl
.fl4_src
= FIB_RES_PREFSRC(res
);
2705 dev_out
= FIB_RES_DEV(res
);
2707 fl
.oif
= dev_out
->ifindex
;
2711 err
= ip_mkroute_output(rp
, &res
, &fl
, oldflp
, dev_out
, flags
);
/*
 * __ip_route_output_key - look up an output route, trying the cache first.
 * Hashes the flow key (daddr, saddr, oif, generation id) and walks the
 * matching rt_hash_table chain under BH-disabled RCU; a full key match
 * reuses the cached entry via dst_use().  On a miss (or when caching is
 * disabled) the route is built from the FIB by ip_route_output_slow().
 * NOTE(review): extraction dropped several lines here (locals, braces,
 * rcu_read_lock_bh(), the hit-path "*rp = rth; return 0;") — fragments
 * below are kept verbatim; confirm against the full source.
 */
2721 int __ip_route_output_key(struct net
*net
, struct rtable
**rp
,
2722 const struct flowi
*flp
)
/* Caching disabled: skip straight to the slow path. */
2727 if (!rt_caching(net
))
2730 hash
= rt_hash(flp
->fl4_dst
, flp
->fl4_src
, flp
->oif
, rt_genid(net
));
/* Walk this bucket's chain; rt_next links are RCU-protected. */
2733 for (rth
= rcu_dereference_bh(rt_hash_table
[hash
].chain
); rth
;
2734 rth
= rcu_dereference_bh(rth
->u
.dst
.rt_next
)) {
/*
 * Full flow-key comparison: dst, src, oif, mark, TOS (masked with
 * RTO_ONLINK), owning netns, and not expired by a cache flush.
 */
2735 if (rth
->fl
.fl4_dst
== flp
->fl4_dst
&&
2736 rth
->fl
.fl4_src
== flp
->fl4_src
&&
2738 rth
->fl
.oif
== flp
->oif
&&
2739 rth
->fl
.mark
== flp
->mark
&&
2740 !((rth
->fl
.fl4_tos
^ flp
->fl4_tos
) &
2741 (IPTOS_RT_MASK
| RTO_ONLINK
)) &&
2742 net_eq(dev_net(rth
->u
.dst
.dev
), net
) &&
2743 !rt_is_expired(rth
)) {
/* Cache hit: take a reference, refresh last-use time, account it. */
2744 dst_use(&rth
->u
.dst
, jiffies
);
2745 RT_CACHE_STAT_INC(out_hit
);
2746 rcu_read_unlock_bh();
2750 RT_CACHE_STAT_INC(out_hlist_search
);
2752 rcu_read_unlock_bh();
/* Cache miss: resolve via the FIB (slow path). */
2755 return ip_route_output_slow(net
, rp
, flp
);
2758 EXPORT_SYMBOL_GPL(__ip_route_output_key
);
/*
 * update_pmtu callback for blackhole routes (wired into
 * ipv4_dst_blackhole_ops below).  Body not visible in this extraction;
 * presumably a deliberate no-op so blackhole dsts ignore PMTU updates —
 * TODO confirm against the full source.
 */
2760 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry
*dst
, u32 mtu
)
/*
 * dst_ops for "blackhole" routes returned when an xfrm lookup reports
 * -EREMOTE: packets are accepted but discarded (see ipv4_dst_blackhole()
 * below, which sets input/output to dst_discard).  Reuses the normal
 * IPv4 destroy/check callbacks but ignores PMTU updates.
 */
2764 static struct dst_ops ipv4_dst_blackhole_ops
= {
2766 .protocol
= cpu_to_be16(ETH_P_IP
),
2767 .destroy
= ipv4_dst_destroy
,
2768 .check
= ipv4_dst_check
,
2769 .update_pmtu
= ipv4_rt_blackhole_update_pmtu
,
2770 .entries
= ATOMIC_INIT(0),
/*
 * ipv4_dst_blackhole - replace *rp with a discarding clone of itself.
 * Allocates a new rtable under ipv4_dst_blackhole_ops, copies the
 * original route's metrics, device, idev, flags, addresses and peer
 * (taking references where needed), points input/output at dst_discard,
 * releases the original, and returns 0 — or -ENOMEM if dst_alloc failed.
 * NOTE(review): extraction dropped the "if (rt)" guard lines and some
 * dev_hold/#ifdef lines; fragments kept verbatim.
 */
2774 static int ipv4_dst_blackhole(struct net
*net
, struct rtable
**rp
, struct flowi
*flp
)
2776 struct rtable
*ort
= *rp
;
2777 struct rtable
*rt
= (struct rtable
*)
2778 dst_alloc(&ipv4_dst_blackhole_ops
);
2781 struct dst_entry
*new = &rt
->u
.dst
;
2783 atomic_set(&new->__refcnt
, 1);
/* Both directions discard: this is the whole point of a blackhole. */
2785 new->input
= dst_discard
;
2786 new->output
= dst_discard
;
/* Copy the full metrics array from the original dst. */
2787 memcpy(new->metrics
, ort
->u
.dst
.metrics
, RTAX_MAX
*sizeof(u32
));
2789 new->dev
= ort
->u
.dst
.dev
;
2795 rt
->idev
= ort
->idev
;
2797 in_dev_hold(rt
->idev
);
/* Clone routing identity from the original cached route. */
2798 rt
->rt_genid
= rt_genid(net
);
2799 rt
->rt_flags
= ort
->rt_flags
;
2800 rt
->rt_type
= ort
->rt_type
;
2801 rt
->rt_dst
= ort
->rt_dst
;
2802 rt
->rt_src
= ort
->rt_src
;
2803 rt
->rt_iif
= ort
->rt_iif
;
2804 rt
->rt_gateway
= ort
->rt_gateway
;
2805 rt
->rt_spec_dst
= ort
->rt_spec_dst
;
/* Share the inet_peer; take our own reference on it. */
2806 rt
->peer
= ort
->peer
;
2808 atomic_inc(&rt
->peer
->refcnt
);
/* Drop the caller's original route; *rp is replaced by the clone. */
2813 dst_release(&(*rp
)->u
.dst
);
2815 return (rt
? 0 : -ENOMEM
);
/*
 * ip_route_output_flow - full output-route resolution with xfrm policy.
 * First resolves the plain route via __ip_route_output_key(), back-fills
 * the flow's unspecified source/destination from the result, then runs
 * the xfrm (IPsec) lookup.  -EREMOTE from xfrm is converted into a
 * blackhole route so the caller gets a usable (discarding) dst.
 * @flags non-zero makes the xfrm lookup wait for policy resolution.
 */
2818 int ip_route_output_flow(struct net
*net
, struct rtable
**rp
, struct flowi
*flp
,
2819 struct sock
*sk
, int flags
)
2823 if ((err
= __ip_route_output_key(net
, rp
, flp
)) != 0)
/* Fill in the flow addresses the caller left as wildcards. */
2828 flp
->fl4_src
= (*rp
)->rt_src
;
2830 flp
->fl4_dst
= (*rp
)->rt_dst
;
2831 err
= __xfrm_lookup(net
, (struct dst_entry
**)rp
, flp
, sk
,
2832 flags
? XFRM_LOOKUP_WAIT
: 0);
/* Larval-state answer: hand back a discarding blackhole instead. */
2833 if (err
== -EREMOTE
)
2834 err
= ipv4_dst_blackhole(net
, rp
, flp
);
2842 EXPORT_SYMBOL_GPL(ip_route_output_flow
);
2844 int ip_route_output_key(struct net
*net
, struct rtable
**rp
, struct flowi
*flp
)
2846 return ip_route_output_flow(net
, rp
, flp
, NULL
, 0);
/*
 * rt_fill_info - serialize a cached route into an RTM_NEWROUTE netlink
 * message on @skb.  Fills the rtmsg header from the rtable, appends the
 * RTA_* attributes (table, dst, src, oif, classid, prefsrc, gateway,
 * metrics, iif) and the cache-info block (id, tcp timestamps, expiry,
 * error).  Returns the result of nlmsg_end(), or -EMSGSIZE-style failure
 * via the nla_put_failure path.
 * @nowait is forwarded to ipmr_get_route() for the multicast case.
 * NOTE(review): extraction dropped locals, braces, the nlh NULL check
 * and the peer NULL guard; fragments kept verbatim.
 */
2849 static int rt_fill_info(struct net
*net
,
2850 struct sk_buff
*skb
, u32 pid
, u32 seq
, int event
,
2851 int nowait
, unsigned int flags
)
2853 struct rtable
*rt
= skb_rtable(skb
);
2855 struct nlmsghdr
*nlh
;
2857 u32 id
= 0, ts
= 0, tsage
= 0, error
;
2859 nlh
= nlmsg_put(skb
, pid
, seq
, event
, sizeof(*r
), flags
);
/* Fill the fixed rtmsg header. Cached entries are always /32. */
2863 r
= nlmsg_data(nlh
);
2864 r
->rtm_family
= AF_INET
;
2865 r
->rtm_dst_len
= 32;
2867 r
->rtm_tos
= rt
->fl
.fl4_tos
;
2868 r
->rtm_table
= RT_TABLE_MAIN
;
2869 NLA_PUT_U32(skb
, RTA_TABLE
, RT_TABLE_MAIN
);
2870 r
->rtm_type
= rt
->rt_type
;
2871 r
->rtm_scope
= RT_SCOPE_UNIVERSE
;
2872 r
->rtm_protocol
= RTPROT_UNSPEC
;
/* Cache entries are clones; keep only the upper RTCF_* flag bits. */
2873 r
->rtm_flags
= (rt
->rt_flags
& ~0xFFFF) | RTM_F_CLONED
;
2874 if (rt
->rt_flags
& RTCF_NOTIFY
)
2875 r
->rtm_flags
|= RTM_F_NOTIFY
;
2877 NLA_PUT_BE32(skb
, RTA_DST
, rt
->rt_dst
);
/* A set flow source means the lookup was keyed on it: report /32 src. */
2879 if (rt
->fl
.fl4_src
) {
2880 r
->rtm_src_len
= 32;
2881 NLA_PUT_BE32(skb
, RTA_SRC
, rt
->fl
.fl4_src
);
2884 NLA_PUT_U32(skb
, RTA_OIF
, rt
->u
.dst
.dev
->ifindex
);
2885 #ifdef CONFIG_NET_CLS_ROUTE
2886 if (rt
->u
.dst
.tclassid
)
2887 NLA_PUT_U32(skb
, RTA_FLOW
, rt
->u
.dst
.tclassid
);
/* Preferred source: spec_dst for input routes, else rt_src if rewritten. */
2890 NLA_PUT_BE32(skb
, RTA_PREFSRC
, rt
->rt_spec_dst
);
2891 else if (rt
->rt_src
!= rt
->fl
.fl4_src
)
2892 NLA_PUT_BE32(skb
, RTA_PREFSRC
, rt
->rt_src
);
/* Only emit a gateway when the route is actually gatewayed. */
2894 if (rt
->rt_dst
!= rt
->rt_gateway
)
2895 NLA_PUT_BE32(skb
, RTA_GATEWAY
, rt
->rt_gateway
);
2897 if (rtnetlink_put_metrics(skb
, rt
->u
.dst
.metrics
) < 0)
2898 goto nla_put_failure
;
2900 error
= rt
->u
.dst
.error
;
2901 expires
= rt
->u
.dst
.expires
? rt
->u
.dst
.expires
- jiffies
: 0;
/* Peer-derived data: IP id counter and cached TCP timestamp state. */
2903 id
= atomic_read(&rt
->peer
->ip_id_count
) & 0xffff;
2904 if (rt
->peer
->tcp_ts_stamp
) {
2905 ts
= rt
->peer
->tcp_ts
;
2906 tsage
= get_seconds() - rt
->peer
->tcp_ts_stamp
;
2911 #ifdef CONFIG_IP_MROUTE
/* Forwarded multicast may need a multicast-routing cache lookup. */
2912 __be32 dst
= rt
->rt_dst
;
2914 if (ipv4_is_multicast(dst
) && !ipv4_is_local_multicast(dst
) &&
2915 IPV4_DEVCONF_ALL(net
, MC_FORWARDING
)) {
2916 int err
= ipmr_get_route(net
, skb
, r
, nowait
);
2921 goto nla_put_failure
;
2923 if (err
== -EMSGSIZE
)
2924 goto nla_put_failure
;
2930 NLA_PUT_U32(skb
, RTA_IIF
, rt
->fl
.iif
);
2933 if (rtnl_put_cacheinfo(skb
, &rt
->u
.dst
, id
, ts
, tsage
,
2934 expires
, error
) < 0)
2935 goto nla_put_failure
;
2937 return nlmsg_end(skb
, nlh
);
/* Attribute didn't fit: undo the partially built message. */
2940 nlmsg_cancel(skb
, nlh
);
*in_skb
, struct nlmsghdr
* nlh
, void *arg
)
2946 struct net
*net
= sock_net(in_skb
->sk
);
2948 struct nlattr
*tb
[RTA_MAX
+1];
2949 struct rtable
*rt
= NULL
;
2954 struct sk_buff
*skb
;
2956 err
= nlmsg_parse(nlh
, sizeof(*rtm
), tb
, RTA_MAX
, rtm_ipv4_policy
);
2960 rtm
= nlmsg_data(nlh
);
2962 skb
= alloc_skb(NLMSG_GOODSIZE
, GFP_KERNEL
);
2968 /* Reserve room for dummy headers, this skb can pass
2969 through good chunk of routing engine.
2971 skb_reset_mac_header(skb
);
2972 skb_reset_network_header(skb
);
2974 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2975 ip_hdr(skb
)->protocol
= IPPROTO_ICMP
;
2976 skb_reserve(skb
, MAX_HEADER
+ sizeof(struct iphdr
));
2978 src
= tb
[RTA_SRC
] ? nla_get_be32(tb
[RTA_SRC
]) : 0;
2979 dst
= tb
[RTA_DST
] ? nla_get_be32(tb
[RTA_DST
]) : 0;
2980 iif
= tb
[RTA_IIF
] ? nla_get_u32(tb
[RTA_IIF
]) : 0;
2983 struct net_device
*dev
;
2985 dev
= __dev_get_by_index(net
, iif
);
2991 skb
->protocol
= htons(ETH_P_IP
);
2994 err
= ip_route_input(skb
, dst
, src
, rtm
->rtm_tos
, dev
);
2997 rt
= skb_rtable(skb
);
2998 if (err
== 0 && rt
->u
.dst
.error
)
2999 err
= -rt
->u
.dst
.error
;
3006 .tos
= rtm
->rtm_tos
,
3009 .oif
= tb
[RTA_OIF
] ? nla_get_u32(tb
[RTA_OIF
]) : 0,
3011 err
= ip_route_output_key(net
, &rt
, &fl
);
3017 skb_dst_set(skb
, &rt
->u
.dst
);
3018 if (rtm
->rtm_flags
& RTM_F_NOTIFY
)
3019 rt
->rt_flags
|= RTCF_NOTIFY
;
3021 err
= rt_fill_info(net
, skb
, NETLINK_CB(in_skb
).pid
, nlh
->nlmsg_seq
,
3022 RTM_NEWROUTE
, 0, 0);
3026 err
= rtnl_unicast(skb
, net
, NETLINK_CB(in_skb
).pid
);
/*
 * ip_rt_dump - netlink dump callback: stream the whole route cache.
 * Iterates every hash bucket and chain entry (BH-RCU protected),
 * skipping foreign-netns and expired entries, serializing each with
 * rt_fill_info(NLM_F_MULTI).  Resumability: (s_h, s_idx) from cb->args
 * mark where the previous dump invocation stopped.
 * NOTE(review): extraction dropped locals, `continue`s, the dst_drop
 * after fill, and the args[] save at the end; fragments kept verbatim.
 */
3035 int ip_rt_dump(struct sk_buff
*skb
, struct netlink_callback
*cb
)
3042 net
= sock_net(skb
->sk
);
3047 s_idx
= idx
= cb
->args
[1];
/* Resume from bucket s_h; reset the per-chain index for new buckets. */
3048 for (h
= s_h
; h
<= rt_hash_mask
; h
++, s_idx
= 0) {
3049 if (!rt_hash_table
[h
].chain
)
3052 for (rt
= rcu_dereference_bh(rt_hash_table
[h
].chain
), idx
= 0; rt
;
3053 rt
= rcu_dereference_bh(rt
->u
.dst
.rt_next
), idx
++) {
/* Skip other namespaces and entries already dumped last round. */
3054 if (!net_eq(dev_net(rt
->u
.dst
.dev
), net
) || idx
< s_idx
)
3056 if (rt_is_expired(rt
))
3058 skb_dst_set(skb
, dst_clone(&rt
->u
.dst
));
/* Message full (<= 0): stop here; args record the resume point. */
3059 if (rt_fill_info(net
, skb
, NETLINK_CB(cb
->skb
).pid
,
3060 cb
->nlh
->nlmsg_seq
, RTM_NEWROUTE
,
3061 1, NLM_F_MULTI
) <= 0) {
3063 rcu_read_unlock_bh();
3068 rcu_read_unlock_bh();
3077 void ip_rt_multicast_event(struct in_device
*in_dev
)
3079 rt_cache_flush(dev_net(in_dev
->dev
), 0);
3082 #ifdef CONFIG_SYSCTL
/*
 * Handler for /proc/sys/net/ipv4/route/flush (write-only): reads the
 * requested delay through a stack copy of the ctl_table (so the shared
 * table's .data is never touched), then flushes the route cache of the
 * netns stashed in ->extra1 by sysctl_route_net_init().
 * NOTE(review): locals, the !write early return and the final return
 * were lost in extraction; fragments kept verbatim.
 */
3083 static int ipv4_sysctl_rtcache_flush(ctl_table
*__ctl
, int write
,
3084 void __user
*buffer
,
3085 size_t *lenp
, loff_t
*ppos
)
/* Copy the table so .data can point at a stack variable safely. */
3092 memcpy(&ctl
, __ctl
, sizeof(ctl
));
3093 ctl
.data
= &flush_delay
;
3094 proc_dointvec(&ctl
, write
, buffer
, lenp
, ppos
);
/* Per-netns table: extra1 was set to the owning struct net. */
3096 net
= (struct net
*)__ctl
->extra1
;
3097 rt_cache_flush(net
, flush_delay
);
/*
 * rt_secret_reschedule - adjust every netns's hash-secret rebuild timer
 * after ip_rt_secret_interval changed from @old to its new value.
 * Shifts each pending timer by the interval delta, re-arming expired
 * ones; a zero new interval leaves timers deleted.
 * NOTE(review): the rtnl locking, for_each_net loop header and the
 * new==0 / !deleted branches were lost in extraction; fragments kept
 * verbatim.
 */
3104 static void rt_secret_reschedule(int old
)
3107 int new = ip_rt_secret_interval
;
3108 int diff
= new - old
;
/* del_timer_sync() tells us whether a timer was actually pending. */
3115 int deleted
= del_timer_sync(&net
->ipv4
.rt_secret_timer
);
/* Remaining time under the old interval, then shifted by the delta. */
3122 time
= net
->ipv4
.rt_secret_timer
.expires
- jiffies
;
3124 if (time
<= 0 || (time
+= diff
) <= 0)
3129 mod_timer(&net
->ipv4
.rt_secret_timer
, jiffies
+ time
);
/*
 * Handler for /proc/sys/net/ipv4/route/secret_interval: parse the new
 * jiffies interval, then reschedule all per-netns secret timers based
 * on the delta from the previous value.
 * NOTE(review): the trailing loff_t *ppos parameter line and the
 * write/ret guard were lost in extraction; fragments kept verbatim.
 */
3134 static int ipv4_sysctl_rt_secret_interval(ctl_table
*ctl
, int write
,
3135 void __user
*buffer
, size_t *lenp
,
/* Remember the old interval before proc_dointvec_jiffies overwrites it. */
3138 int old
= ip_rt_secret_interval
;
3139 int ret
= proc_dointvec_jiffies(ctl
, write
, buffer
, lenp
, ppos
);
3141 rt_secret_reschedule(old
);
/*
 * Global (non-per-netns) sysctl knobs under /proc/sys/net/ipv4/route/.
 * Each entry binds a proc name to one of the ip_rt_* tunables; the
 * *_jiffies handlers convert seconds <-> jiffies, *_ms_jiffies converts
 * milliseconds.  (.mode fields were lost in extraction.)
 */
3146 static ctl_table ipv4_route_table
[] = {
3148 .procname
= "gc_thresh",
3149 .data
= &ipv4_dst_ops
.gc_thresh
,
3150 .maxlen
= sizeof(int),
3152 .proc_handler
= proc_dointvec
,
3155 .procname
= "max_size",
3156 .data
= &ip_rt_max_size
,
3157 .maxlen
= sizeof(int),
3159 .proc_handler
= proc_dointvec
,
3162 /* Deprecated. Use gc_min_interval_ms */
3164 .procname
= "gc_min_interval",
3165 .data
= &ip_rt_gc_min_interval
,
3166 .maxlen
= sizeof(int),
3168 .proc_handler
= proc_dointvec_jiffies
,
3171 .procname
= "gc_min_interval_ms",
3172 .data
= &ip_rt_gc_min_interval
,
3173 .maxlen
= sizeof(int),
3175 .proc_handler
= proc_dointvec_ms_jiffies
,
3178 .procname
= "gc_timeout",
3179 .data
= &ip_rt_gc_timeout
,
3180 .maxlen
= sizeof(int),
3182 .proc_handler
= proc_dointvec_jiffies
,
3185 .procname
= "gc_interval",
3186 .data
= &ip_rt_gc_interval
,
3187 .maxlen
= sizeof(int),
3189 .proc_handler
= proc_dointvec_jiffies
,
/* ICMP-redirect rate limiting knobs. */
3192 .procname
= "redirect_load",
3193 .data
= &ip_rt_redirect_load
,
3194 .maxlen
= sizeof(int),
3196 .proc_handler
= proc_dointvec
,
3199 .procname
= "redirect_number",
3200 .data
= &ip_rt_redirect_number
,
3201 .maxlen
= sizeof(int),
3203 .proc_handler
= proc_dointvec
,
3206 .procname
= "redirect_silence",
3207 .data
= &ip_rt_redirect_silence
,
3208 .maxlen
= sizeof(int),
3210 .proc_handler
= proc_dointvec
,
/* ICMP error cost/burst (token-bucket rate limiting). */
3213 .procname
= "error_cost",
3214 .data
= &ip_rt_error_cost
,
3215 .maxlen
= sizeof(int),
3217 .proc_handler
= proc_dointvec
,
3220 .procname
= "error_burst",
3221 .data
= &ip_rt_error_burst
,
3222 .maxlen
= sizeof(int),
3224 .proc_handler
= proc_dointvec
,
3227 .procname
= "gc_elasticity",
3228 .data
= &ip_rt_gc_elasticity
,
3229 .maxlen
= sizeof(int),
3231 .proc_handler
= proc_dointvec
,
/* Path-MTU discovery tunables. */
3234 .procname
= "mtu_expires",
3235 .data
= &ip_rt_mtu_expires
,
3236 .maxlen
= sizeof(int),
3238 .proc_handler
= proc_dointvec_jiffies
,
3241 .procname
= "min_pmtu",
3242 .data
= &ip_rt_min_pmtu
,
3243 .maxlen
= sizeof(int),
3245 .proc_handler
= proc_dointvec
,
3248 .procname
= "min_adv_mss",
3249 .data
= &ip_rt_min_advmss
,
3250 .maxlen
= sizeof(int),
3252 .proc_handler
= proc_dointvec
,
/* Custom handler: changing the interval reschedules all secret timers. */
3255 .procname
= "secret_interval",
3256 .data
= &ip_rt_secret_interval
,
3257 .maxlen
= sizeof(int),
3259 .proc_handler
= ipv4_sysctl_rt_secret_interval
,
/* Zero-terminated placeholder so "neigh" exists as an (empty) directory. */
3264 static struct ctl_table empty
[1];
/*
 * Skeleton registered early by ip_static_sysctl_init() so that
 * /proc/sys/net/ipv4/{route,neigh} directories exist before the rest
 * of the ipv4 init order runs.
 */
3266 static struct ctl_table ipv4_skeleton
[] =
3268 { .procname
= "route",
3269 .mode
= 0555, .child
= ipv4_route_table
},
3270 { .procname
= "neigh",
3271 .mode
= 0555, .child
= empty
},
/* Mount point for the skeleton: /proc/sys/net/ipv4. */
3275 static __net_initdata
struct ctl_path ipv4_path
[] = {
3276 { .procname
= "net", },
3277 { .procname
= "ipv4", },
/*
 * Per-netns "flush" entry (write-only trigger); duplicated per
 * namespace by sysctl_route_net_init(), which also fills .extra1
 * with the owning struct net.  (.mode was lost in extraction.)
 */
3281 static struct ctl_table ipv4_route_flush_table
[] = {
3283 .procname
= "flush",
3284 .maxlen
= sizeof(int),
3286 .proc_handler
= ipv4_sysctl_rtcache_flush
,
/* Mount point for the flush entry: /proc/sys/net/ipv4/route. */
3291 static __net_initdata
struct ctl_path ipv4_route_path
[] = {
3292 { .procname
= "net", },
3293 { .procname
= "ipv4", },
3294 { .procname
= "route", },
/*
 * sysctl_route_net_init - pernet init: register the per-namespace
 * "flush" sysctl.  init_net uses the static table directly; other
 * namespaces get a kmemdup'd copy.  extra1 carries the owning netns so
 * the flush handler knows which cache to flush.
 * NOTE(review): the error-unwind labels (kfree on failure, -ENOMEM
 * returns) were lost in extraction; fragments kept verbatim.
 */
3298 static __net_init
int sysctl_route_net_init(struct net
*net
)
3300 struct ctl_table
*tbl
;
3302 tbl
= ipv4_route_flush_table
;
/* Non-init namespaces must not share the mutable static table. */
3303 if (!net_eq(net
, &init_net
)) {
3304 tbl
= kmemdup(tbl
, sizeof(ipv4_route_flush_table
), GFP_KERNEL
);
/* Let the flush handler find this namespace via ->extra1. */
3308 tbl
[0].extra1
= net
;
3310 net
->ipv4
.route_hdr
=
3311 register_net_sysctl_table(net
, ipv4_route_path
, tbl
);
3312 if (net
->ipv4
.route_hdr
== NULL
)
/* Error path: only free the table if it was our private copy. */
3317 if (tbl
!= ipv4_route_flush_table
)
/*
 * sysctl_route_net_exit - pernet teardown: unregister the namespace's
 * route sysctl header and free the duplicated table.  The BUG_ON
 * asserts we never free the shared static table (init_net's header is
 * not torn down this way).
 * NOTE(review): the trailing kfree(tbl) was lost in extraction.
 */
3323 static __net_exit
void sysctl_route_net_exit(struct net
*net
)
3325 struct ctl_table
*tbl
;
/* Recover the table pointer stashed by register_net_sysctl_table(). */
3327 tbl
= net
->ipv4
.route_hdr
->ctl_table_arg
;
3328 unregister_net_sysctl_table(net
->ipv4
.route_hdr
);
3329 BUG_ON(tbl
== ipv4_route_flush_table
);
/* Pernet lifecycle for the per-namespace route sysctl registration. */
3333 static __net_initdata
struct pernet_operations sysctl_route_ops
= {
3334 .init
= sysctl_route_net_init
,
3335 .exit
= sysctl_route_net_exit
,
/*
 * rt_secret_timer_init - pernet init: seed the namespace's route-cache
 * generation id and arm the periodic hash-secret rebuild timer.
 * The genid seed mixes num_physpages and jiffies for per-boot (and
 * per-netns-creation) variation; the first expiry is randomized within
 * one interval to avoid synchronized flushes across namespaces.
 */
3340 static __net_init
int rt_secret_timer_init(struct net
*net
)
/* Pseudo-random initial generation id (not cryptographic). */
3342 atomic_set(&net
->ipv4
.rt_genid
,
3343 (int) ((num_physpages
^ (num_physpages
>>8)) ^
3344 (jiffies
^ (jiffies
>> 7))));
3346 net
->ipv4
.rt_secret_timer
.function
= rt_secret_rebuild
;
3347 net
->ipv4
.rt_secret_timer
.data
= (unsigned long)net
;
/* Deferrable: the rebuild need not wake an idle CPU. */
3348 init_timer_deferrable(&net
->ipv4
.rt_secret_timer
);
/* Interval 0 means the feature is disabled: leave the timer unarmed. */
3350 if (ip_rt_secret_interval
) {
3351 net
->ipv4
.rt_secret_timer
.expires
=
3352 jiffies
+ net_random() % ip_rt_secret_interval
+
3353 ip_rt_secret_interval
;
3354 add_timer(&net
->ipv4
.rt_secret_timer
);
3359 static __net_exit
void rt_secret_timer_exit(struct net
*net
)
3361 del_timer_sync(&net
->ipv4
.rt_secret_timer
);
/* Pernet lifecycle for the per-namespace secret-rebuild timer. */
3364 static __net_initdata
struct pernet_operations rt_secret_timer_ops
= {
3365 .init
= rt_secret_timer_init
,
3366 .exit
= rt_secret_timer_exit
,
3370 #ifdef CONFIG_NET_CLS_ROUTE
/* Per-CPU route classifier accounting, allocated in ip_rt_init(). */
3371 struct ip_rt_acct __percpu
*ip_rt_acct __read_mostly
;
3372 #endif /* CONFIG_NET_CLS_ROUTE */
/* Boot-time override for the route-cache hash size (0 = auto-size). */
3374 static __initdata
unsigned long rhash_entries
;
/*
 * Parse the "rhash_entries=" kernel command-line parameter.
 * NOTE(review): the !str guard and return values were lost in
 * extraction; fragments kept verbatim.
 */
3375 static int __init
set_rhash_entries(char *str
)
3379 rhash_entries
= simple_strtoul(str
, &str
, 0);
3382 __setup("rhash_entries=", set_rhash_entries
);
/*
 * ip_rt_init - boot-time initialization of the IPv4 routing subsystem:
 * classifier accounting, the rtable slab cache, the route-cache hash
 * table (sized from memory or "rhash_entries="), GC parameters, the
 * periodic GC worker, the pernet secret timer and sysctl subsystems,
 * xfrm4 and the RTM_GETROUTE netlink handler.
 * NOTE(review): several lines (ip_rt_proc_init's #ifdef context,
 * alloc_large_system_hash's middle arguments, final return) were lost
 * in extraction; fragments kept verbatim.
 */
3384 int __init
ip_rt_init(void)
3388 #ifdef CONFIG_NET_CLS_ROUTE
/* 256 counters per CPU for the route classifier (RTA_FLOW). */
3389 ip_rt_acct
= __alloc_percpu(256 * sizeof(struct ip_rt_acct
), __alignof__(struct ip_rt_acct
));
3391 panic("IP: failed to allocate ip_rt_acct\n");
3394 ipv4_dst_ops
.kmem_cachep
=
3395 kmem_cache_create("ip_dst_cache", sizeof(struct rtable
), 0,
3396 SLAB_HWCACHE_ALIGN
|SLAB_PANIC
, NULL
);
/* Blackhole dsts share the normal rtable slab cache. */
3398 ipv4_dst_blackhole_ops
.kmem_cachep
= ipv4_dst_ops
.kmem_cachep
;
/* Hash sized by available RAM unless overridden via rhash_entries=. */
3400 rt_hash_table
= (struct rt_hash_bucket
*)
3401 alloc_large_system_hash("IP route cache",
3402 sizeof(struct rt_hash_bucket
),
3404 (totalram_pages
>= 128 * 1024) ?
3409 rhash_entries
? 0 : 512 * 1024);
3410 memset(rt_hash_table
, 0, (rt_hash_mask
+ 1) * sizeof(struct rt_hash_bucket
));
3411 rt_hash_lock_init();
/* GC thresholds scale with the hash table size. */
3413 ipv4_dst_ops
.gc_thresh
= (rt_hash_mask
+ 1);
3414 ip_rt_max_size
= (rt_hash_mask
+ 1) * 16;
3419 /* All the timers, started at system startup tend
3420 to synchronize. Perturb it a bit.
3422 INIT_DELAYED_WORK_DEFERRABLE(&expires_work
, rt_worker_func
);
3423 expires_ljiffies
= jiffies
;
/* Randomize the first GC run to desynchronize boot-time timers. */
3424 schedule_delayed_work(&expires_work
,
3425 net_random() % ip_rt_gc_interval
+ ip_rt_gc_interval
);
/* Non-fatal setup failures are only logged. */
3427 if (register_pernet_subsys(&rt_secret_timer_ops
))
3428 printk(KERN_ERR
"Unable to setup rt_secret_timer\n");
3430 if (ip_rt_proc_init())
3431 printk(KERN_ERR
"Unable to create route proc files\n");
3434 xfrm4_init(ip_rt_max_size
);
/* Answer RTM_GETROUTE queries; dumps are handled elsewhere. */
3436 rtnl_register(PF_INET
, RTM_GETROUTE
, inet_rtm_getroute
, NULL
);
3438 #ifdef CONFIG_SYSCTL
3439 register_pernet_subsys(&sysctl_route_ops
);
3444 #ifdef CONFIG_SYSCTL
3446 * We really need to sanitize the damn ipv4 init order, then all
3447 * this nonsense will go away.
/*
 * Registers the static /proc/sys/net/ipv4/{route,neigh} skeleton early,
 * before the rest of ipv4 init (see the comment above).
 */
3449 void __init
ip_static_sysctl_init(void)
3451 register_sysctl_paths(ipv4_path
, ipv4_skeleton
);
/* Symbols used by other built-in/ modular network code. */
3455 EXPORT_SYMBOL(__ip_select_ident
);
3456 EXPORT_SYMBOL(ip_route_input
);
3457 EXPORT_SYMBOL(ip_route_output_key
);