/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *		Miquel van Smoorenburg	:	BSD API fixes.
 *		Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD;
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *		Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *		Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *		Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *		Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *		Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *		Robert Olsson	:	Added rt_cache statistics
 *		Arnaldo C. Melo	:	Convert proc stuff to seq_file
 *		Eric Dumazet	:	hashed spinlocks and rt_check_expire() fixes.
 *		Ilia Sotnikov	:	Ignore TOS on PMTUD and Redirect
 *		Ilia Sotnikov	:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#include <linux/sysctl.h>
#define RT_FL_TOS(oldflp) \
	((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int ip_rt_secret_interval __read_mostly	= 10 * 60 * HZ;
static int rt_chain_length_max __read_mostly	= 20;

static struct delayed_work expires_work;
static unsigned long expires_ljiffies;
/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static void		 ipv4_dst_ifdown(struct dst_entry *dst,
					 struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);

static struct dst_ops ipv4_dst_ops = {
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
	.entries =		ATOMIC_INIT(0),
};
#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	ECN_OR_COST(BESTEFFORT),
	ECN_OR_COST(INTERACTIVE),
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
/* The locking scheme is rather straight forward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 */

struct rt_hash_bucket {
	struct rtable	*chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 * The size of this table is a power of two and depends on the number of CPUS.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
				GFP_ATOMIC);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
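/*
 * A minimal standalone sketch of the hashed-lock idea above (userspace C,
 * not kernel code): many hash buckets share a small power-of-two table of
 * locks, and a bucket's lock is found by masking its index, exactly as
 * rt_hash_lock_addr() does.  The sizes, the pthread mutexes standing in for
 * spinlock_t, and the toy bucket payload are illustrative assumptions.
 * Build it separately, e.g. "cc sketch.c -lpthread".
 */
#if 0
#include <pthread.h>
#include <stdio.h>

#define NR_BUCKETS	1024
#define NR_LOCKS	16		/* power of two, much smaller than NR_BUCKETS */

static int bucket_data[NR_BUCKETS];
static pthread_mutex_t locks[NR_LOCKS];

static pthread_mutex_t *bucket_lock(unsigned int slot)
{
	return &locks[slot & (NR_LOCKS - 1)];	/* same masking as rt_hash_lock_addr() */
}

static void bucket_add(unsigned int slot, int val)
{
	pthread_mutex_t *lock = bucket_lock(slot);

	pthread_mutex_lock(lock);	/* writers serialize per lock, not per bucket */
	bucket_data[slot] += val;
	pthread_mutex_unlock(lock);
}

int main(void)
{
	int i;

	for (i = 0; i < NR_LOCKS; i++)
		pthread_mutex_init(&locks[i], NULL);
	bucket_add(5, 1);
	bucket_add(5 + NR_LOCKS, 1);	/* different bucket, but the same lock */
	printf("bucket 5 holds %d\n", bucket_data[5]);
	return 0;
}
#endif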
static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
static unsigned			rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log  __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) \
	(__raw_get_cpu_var(rt_cache_stat).field++)

static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)(__be32)(daddr),
			    (__force u32)(__be32)(saddr),
			    idx, genid) & rt_hash_mask;
}

static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}
274 #ifdef CONFIG_PROC_FS
275 struct rt_cache_iter_state
{
276 struct seq_net_private p
;
281 static struct rtable
*rt_cache_get_first(struct seq_file
*seq
)
283 struct rt_cache_iter_state
*st
= seq
->private;
284 struct rtable
*r
= NULL
;
286 for (st
->bucket
= rt_hash_mask
; st
->bucket
>= 0; --st
->bucket
) {
287 if (!rt_hash_table
[st
->bucket
].chain
)
290 r
= rcu_dereference_bh(rt_hash_table
[st
->bucket
].chain
);
292 if (dev_net(r
->u
.dst
.dev
) == seq_file_net(seq
) &&
293 r
->rt_genid
== st
->genid
)
295 r
= rcu_dereference_bh(r
->u
.dst
.rt_next
);
297 rcu_read_unlock_bh();
302 static struct rtable
*__rt_cache_get_next(struct seq_file
*seq
,
305 struct rt_cache_iter_state
*st
= seq
->private;
307 r
= r
->u
.dst
.rt_next
;
309 rcu_read_unlock_bh();
311 if (--st
->bucket
< 0)
313 } while (!rt_hash_table
[st
->bucket
].chain
);
315 r
= rt_hash_table
[st
->bucket
].chain
;
317 return rcu_dereference_bh(r
);
320 static struct rtable
*rt_cache_get_next(struct seq_file
*seq
,
323 struct rt_cache_iter_state
*st
= seq
->private;
324 while ((r
= __rt_cache_get_next(seq
, r
)) != NULL
) {
325 if (dev_net(r
->u
.dst
.dev
) != seq_file_net(seq
))
327 if (r
->rt_genid
== st
->genid
)
333 static struct rtable
*rt_cache_get_idx(struct seq_file
*seq
, loff_t pos
)
335 struct rtable
*r
= rt_cache_get_first(seq
);
338 while (pos
&& (r
= rt_cache_get_next(seq
, r
)))
340 return pos
? NULL
: r
;
343 static void *rt_cache_seq_start(struct seq_file
*seq
, loff_t
*pos
)
345 struct rt_cache_iter_state
*st
= seq
->private;
347 return rt_cache_get_idx(seq
, *pos
- 1);
348 st
->genid
= rt_genid(seq_file_net(seq
));
349 return SEQ_START_TOKEN
;
352 static void *rt_cache_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
356 if (v
== SEQ_START_TOKEN
)
357 r
= rt_cache_get_first(seq
);
359 r
= rt_cache_get_next(seq
, v
);
364 static void rt_cache_seq_stop(struct seq_file
*seq
, void *v
)
366 if (v
&& v
!= SEQ_START_TOKEN
)
367 rcu_read_unlock_bh();
370 static int rt_cache_seq_show(struct seq_file
*seq
, void *v
)
372 if (v
== SEQ_START_TOKEN
)
373 seq_printf(seq
, "%-127s\n",
374 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
375 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
378 struct rtable
*r
= v
;
381 seq_printf(seq
, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
382 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
383 r
->u
.dst
.dev
? r
->u
.dst
.dev
->name
: "*",
384 (unsigned long)r
->rt_dst
, (unsigned long)r
->rt_gateway
,
385 r
->rt_flags
, atomic_read(&r
->u
.dst
.__refcnt
),
386 r
->u
.dst
.__use
, 0, (unsigned long)r
->rt_src
,
387 (dst_metric(&r
->u
.dst
, RTAX_ADVMSS
) ?
388 (int)dst_metric(&r
->u
.dst
, RTAX_ADVMSS
) + 40 : 0),
389 dst_metric(&r
->u
.dst
, RTAX_WINDOW
),
390 (int)((dst_metric(&r
->u
.dst
, RTAX_RTT
) >> 3) +
391 dst_metric(&r
->u
.dst
, RTAX_RTTVAR
)),
393 r
->u
.dst
.hh
? atomic_read(&r
->u
.dst
.hh
->hh_refcnt
) : -1,
394 r
->u
.dst
.hh
? (r
->u
.dst
.hh
->hh_output
==
396 r
->rt_spec_dst
, &len
);
398 seq_printf(seq
, "%*s\n", 127 - len
, "");
403 static const struct seq_operations rt_cache_seq_ops
= {
404 .start
= rt_cache_seq_start
,
405 .next
= rt_cache_seq_next
,
406 .stop
= rt_cache_seq_stop
,
407 .show
= rt_cache_seq_show
,
410 static int rt_cache_seq_open(struct inode
*inode
, struct file
*file
)
412 return seq_open_net(inode
, file
, &rt_cache_seq_ops
,
413 sizeof(struct rt_cache_iter_state
));
416 static const struct file_operations rt_cache_seq_fops
= {
417 .owner
= THIS_MODULE
,
418 .open
= rt_cache_seq_open
,
421 .release
= seq_release_net
,
425 static void *rt_cpu_seq_start(struct seq_file
*seq
, loff_t
*pos
)
430 return SEQ_START_TOKEN
;
432 for (cpu
= *pos
-1; cpu
< nr_cpu_ids
; ++cpu
) {
433 if (!cpu_possible(cpu
))
436 return &per_cpu(rt_cache_stat
, cpu
);
441 static void *rt_cpu_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
445 for (cpu
= *pos
; cpu
< nr_cpu_ids
; ++cpu
) {
446 if (!cpu_possible(cpu
))
449 return &per_cpu(rt_cache_stat
, cpu
);
455 static void rt_cpu_seq_stop(struct seq_file
*seq
, void *v
)
460 static int rt_cpu_seq_show(struct seq_file
*seq
, void *v
)
462 struct rt_cache_stat
*st
= v
;
464 if (v
== SEQ_START_TOKEN
) {
465 seq_printf(seq
, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
469 seq_printf(seq
,"%08x %08x %08x %08x %08x %08x %08x %08x "
470 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
471 atomic_read(&ipv4_dst_ops
.entries
),
494 static const struct seq_operations rt_cpu_seq_ops
= {
495 .start
= rt_cpu_seq_start
,
496 .next
= rt_cpu_seq_next
,
497 .stop
= rt_cpu_seq_stop
,
498 .show
= rt_cpu_seq_show
,
502 static int rt_cpu_seq_open(struct inode
*inode
, struct file
*file
)
504 return seq_open(file
, &rt_cpu_seq_ops
);
507 static const struct file_operations rt_cpu_seq_fops
= {
508 .owner
= THIS_MODULE
,
509 .open
= rt_cpu_seq_open
,
512 .release
= seq_release
,
515 #ifdef CONFIG_NET_CLS_ROUTE
516 static int rt_acct_proc_show(struct seq_file
*m
, void *v
)
518 struct ip_rt_acct
*dst
, *src
;
521 dst
= kcalloc(256, sizeof(struct ip_rt_acct
), GFP_KERNEL
);
525 for_each_possible_cpu(i
) {
526 src
= (struct ip_rt_acct
*)per_cpu_ptr(ip_rt_acct
, i
);
527 for (j
= 0; j
< 256; j
++) {
528 dst
[j
].o_bytes
+= src
[j
].o_bytes
;
529 dst
[j
].o_packets
+= src
[j
].o_packets
;
530 dst
[j
].i_bytes
+= src
[j
].i_bytes
;
531 dst
[j
].i_packets
+= src
[j
].i_packets
;
535 seq_write(m
, dst
, 256 * sizeof(struct ip_rt_acct
));
540 static int rt_acct_proc_open(struct inode
*inode
, struct file
*file
)
542 return single_open(file
, rt_acct_proc_show
, NULL
);
545 static const struct file_operations rt_acct_proc_fops
= {
546 .owner
= THIS_MODULE
,
547 .open
= rt_acct_proc_open
,
550 .release
= single_release
,
554 static int __net_init
ip_rt_do_proc_init(struct net
*net
)
556 struct proc_dir_entry
*pde
;
558 pde
= proc_net_fops_create(net
, "rt_cache", S_IRUGO
,
563 pde
= proc_create("rt_cache", S_IRUGO
,
564 net
->proc_net_stat
, &rt_cpu_seq_fops
);
568 #ifdef CONFIG_NET_CLS_ROUTE
569 pde
= proc_create("rt_acct", 0, net
->proc_net
, &rt_acct_proc_fops
);
575 #ifdef CONFIG_NET_CLS_ROUTE
577 remove_proc_entry("rt_cache", net
->proc_net_stat
);
580 remove_proc_entry("rt_cache", net
->proc_net
);
585 static void __net_exit
ip_rt_do_proc_exit(struct net
*net
)
587 remove_proc_entry("rt_cache", net
->proc_net_stat
);
588 remove_proc_entry("rt_cache", net
->proc_net
);
589 #ifdef CONFIG_NET_CLS_ROUTE
590 remove_proc_entry("rt_acct", net
->proc_net
);
594 static struct pernet_operations ip_rt_proc_ops __net_initdata
= {
595 .init
= ip_rt_do_proc_init
,
596 .exit
= ip_rt_do_proc_exit
,
599 static int __init
ip_rt_proc_init(void)
601 return register_pernet_subsys(&ip_rt_proc_ops
);
605 static inline int ip_rt_proc_init(void)
609 #endif /* CONFIG_PROC_FS */
611 static inline void rt_free(struct rtable
*rt
)
613 call_rcu_bh(&rt
->u
.dst
.rcu_head
, dst_rcu_free
);
616 static inline void rt_drop(struct rtable
*rt
)
619 call_rcu_bh(&rt
->u
.dst
.rcu_head
, dst_rcu_free
);
622 static inline int rt_fast_clean(struct rtable
*rth
)
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
626 return (rth
->rt_flags
& (RTCF_BROADCAST
| RTCF_MULTICAST
)) &&
627 rth
->fl
.iif
&& rth
->u
.dst
.rt_next
;
630 static inline int rt_valuable(struct rtable
*rth
)
632 return (rth
->rt_flags
& (RTCF_REDIRECTED
| RTCF_NOTIFY
)) ||
636 static int rt_may_expire(struct rtable
*rth
, unsigned long tmo1
, unsigned long tmo2
)
641 if (atomic_read(&rth
->u
.dst
.__refcnt
))
645 if (rth
->u
.dst
.expires
&&
646 time_after_eq(jiffies
, rth
->u
.dst
.expires
))
649 age
= jiffies
- rth
->u
.dst
.lastuse
;
651 if ((age
<= tmo1
&& !rt_fast_clean(rth
)) ||
652 (age
<= tmo2
&& rt_valuable(rth
)))
/* Bits of score are:
 * 30: not quite useless
 * 29..0: usage counter
 */
663 static inline u32
rt_score(struct rtable
*rt
)
665 u32 score
= jiffies
- rt
->u
.dst
.lastuse
;
667 score
= ~score
& ~(3<<30);
673 !(rt
->rt_flags
& (RTCF_BROADCAST
|RTCF_MULTICAST
|RTCF_LOCAL
)))
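/*
 * A standalone sketch (userspace C, not kernel code) of the score packing
 * used by rt_score() and documented in the "Bits of score" comment above:
 * the low 30 bits carry an age-derived usage score (younger entries score
 * higher because of the bitwise NOT), and a higher bit marks entries that
 * are "not quite useless" so they always outrank disposable ones.  The toy
 * flag and helper names are assumptions, not kernel identifiers.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

static uint32_t toy_score(uint32_t age, int valuable)
{
	uint32_t score = ~age & ~(3u << 30);	/* usage counter in bits 29..0 */

	if (valuable)
		score |= 1u << 30;		/* "not quite useless" bit */
	return score;
}

int main(void)
{
	/* An old but valuable entry still outranks a fresh disposable one. */
	printf("valuable(old)=%u  disposable(new)=%u\n",
	       toy_score(1000, 1), toy_score(10, 0));
	return 0;
}
#endif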
679 static inline bool rt_caching(const struct net
*net
)
681 return net
->ipv4
.current_rt_cache_rebuild_count
<=
682 net
->ipv4
.sysctl_rt_cache_rebuild_count
;
685 static inline bool compare_hash_inputs(const struct flowi
*fl1
,
686 const struct flowi
*fl2
)
688 return (__force u32
)(((fl1
->nl_u
.ip4_u
.daddr
^ fl2
->nl_u
.ip4_u
.daddr
) |
689 (fl1
->nl_u
.ip4_u
.saddr
^ fl2
->nl_u
.ip4_u
.saddr
) |
690 (fl1
->iif
^ fl2
->iif
)) == 0);
693 static inline int compare_keys(struct flowi
*fl1
, struct flowi
*fl2
)
695 return ((__force u32
)((fl1
->nl_u
.ip4_u
.daddr
^ fl2
->nl_u
.ip4_u
.daddr
) |
696 (fl1
->nl_u
.ip4_u
.saddr
^ fl2
->nl_u
.ip4_u
.saddr
)) |
697 (fl1
->mark
^ fl2
->mark
) |
698 (*(u16
*)&fl1
->nl_u
.ip4_u
.tos
^
699 *(u16
*)&fl2
->nl_u
.ip4_u
.tos
) |
700 (fl1
->oif
^ fl2
->oif
) |
701 (fl1
->iif
^ fl2
->iif
)) == 0;
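/*
 * A standalone sketch (userspace C, not kernel code) of the branch-free
 * comparison used by compare_hash_inputs() and compare_keys() above: XOR
 * each pair of fields and OR the results together; the keys match iff the
 * accumulated value is zero.  struct toy_key and its fields are illustrative
 * stand-ins for the relevant parts of struct flowi.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

struct toy_key {
	uint32_t daddr;
	uint32_t saddr;
	int	 iif;
	int	 oif;
};

static int toy_compare_keys(const struct toy_key *a, const struct toy_key *b)
{
	/* One OR-chain instead of four conditional branches. */
	return ((a->daddr ^ b->daddr) |
		(a->saddr ^ b->saddr) |
		(uint32_t)(a->iif ^ b->iif) |
		(uint32_t)(a->oif ^ b->oif)) == 0;
}

int main(void)
{
	struct toy_key k1 = { 0x0a000001, 0x0a000002, 3, 0 };
	struct toy_key k2 = k1;

	printf("equal: %d\n", toy_compare_keys(&k1, &k2));
	k2.oif = 7;
	printf("equal after oif change: %d\n", toy_compare_keys(&k1, &k2));
	return 0;
}
#endif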
704 static inline int compare_netns(struct rtable
*rt1
, struct rtable
*rt2
)
706 return net_eq(dev_net(rt1
->u
.dst
.dev
), dev_net(rt2
->u
.dst
.dev
));
709 static inline int rt_is_expired(struct rtable
*rth
)
711 return rth
->rt_genid
!= rt_genid(dev_net(rth
->u
.dst
.dev
));
 * Perform a full scan of the hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to reschedule if necessary.
719 static void rt_do_flush(int process_context
)
722 struct rtable
*rth
, *next
;
723 struct rtable
* tail
;
725 for (i
= 0; i
<= rt_hash_mask
; i
++) {
726 if (process_context
&& need_resched())
728 rth
= rt_hash_table
[i
].chain
;
732 spin_lock_bh(rt_hash_lock_addr(i
));
735 struct rtable
** prev
, * p
;
737 rth
= rt_hash_table
[i
].chain
;
739 /* defer releasing the head of the list after spin_unlock */
740 for (tail
= rth
; tail
; tail
= tail
->u
.dst
.rt_next
)
741 if (!rt_is_expired(tail
))
744 rt_hash_table
[i
].chain
= tail
;
746 /* call rt_free on entries after the tail requiring flush */
747 prev
= &rt_hash_table
[i
].chain
;
748 for (p
= *prev
; p
; p
= next
) {
749 next
= p
->u
.dst
.rt_next
;
750 if (!rt_is_expired(p
)) {
751 prev
= &p
->u
.dst
.rt_next
;
759 rth
= rt_hash_table
[i
].chain
;
760 rt_hash_table
[i
].chain
= NULL
;
763 spin_unlock_bh(rt_hash_lock_addr(i
));
765 for (; rth
!= tail
; rth
= next
) {
766 next
= rth
->u
.dst
.rt_next
;
/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This is to get an estimate of rt_chain_length_max:
 *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for magnitude.
 */
#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)
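/*
 * A standalone sketch (userspace C, not kernel code) of the fixed-point
 * average / standard deviation computation described above.  The sampled
 * chain lengths, the sample count, the elasticity value and the int_sqrt()
 * helper are made-up stand-ins; the kernel accumulates the per-bucket
 * lengths in rt_check_expire() instead.
 */
#if 0
#include <stdio.h>

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)

static unsigned long int_sqrt(unsigned long x)
{
	unsigned long r = 0;

	while ((r + 1) * (r + 1) <= x)
		r++;
	return r;
}

int main(void)
{
	/* chain lengths for a few sampled buckets, already scaled by ONE */
	unsigned long lengths[] = { 2 * ONE, 3 * ONE, 1 * ONE, 5 * ONE };
	unsigned long sum = 0, sum2 = 0, samples = 4, elasticity = 8;
	unsigned long avg, sd, chain_length_max;
	unsigned int i;

	for (i = 0; i < samples; i++) {
		sum  += lengths[i];
		sum2 += lengths[i] * lengths[i];
	}
	avg = sum / samples;
	sd  = int_sqrt(sum2 / samples - avg * avg);
	/* rt_chain_length_max = max(elasticity, AVG + 4*SD), descaled */
	chain_length_max = (avg + 4 * sd) >> FRACT_BITS;
	if (chain_length_max < elasticity)
		chain_length_max = elasticity;
	printf("avg=%lu sd=%lu chain_length_max=%lu\n", avg, sd, chain_length_max);
	return 0;
}
#endif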
/*
 * Given a hash chain and an item in this hash chain,
 * find if a previous entry has the same hash_inputs
 * (but differs on tos, mark or oif).
 * Returns 0 if an alias is found.
 * Returns ONE if rth has no alias before itself.
 */
790 static int has_noalias(const struct rtable
*head
, const struct rtable
*rth
)
792 const struct rtable
*aux
= head
;
795 if (compare_hash_inputs(&aux
->fl
, &rth
->fl
))
797 aux
= aux
->u
.dst
.rt_next
;
802 static void rt_check_expire(void)
804 static unsigned int rover
;
805 unsigned int i
= rover
, goal
;
806 struct rtable
*rth
, **rthp
;
807 unsigned long samples
= 0;
808 unsigned long sum
= 0, sum2
= 0;
812 delta
= jiffies
- expires_ljiffies
;
813 expires_ljiffies
= jiffies
;
814 mult
= ((u64
)delta
) << rt_hash_log
;
815 if (ip_rt_gc_timeout
> 1)
816 do_div(mult
, ip_rt_gc_timeout
);
817 goal
= (unsigned int)mult
;
818 if (goal
> rt_hash_mask
)
819 goal
= rt_hash_mask
+ 1;
820 for (; goal
> 0; goal
--) {
821 unsigned long tmo
= ip_rt_gc_timeout
;
822 unsigned long length
;
824 i
= (i
+ 1) & rt_hash_mask
;
825 rthp
= &rt_hash_table
[i
].chain
;
835 spin_lock_bh(rt_hash_lock_addr(i
));
836 while ((rth
= *rthp
) != NULL
) {
837 prefetch(rth
->u
.dst
.rt_next
);
838 if (rt_is_expired(rth
)) {
839 *rthp
= rth
->u
.dst
.rt_next
;
843 if (rth
->u
.dst
.expires
) {
844 /* Entry is expired even if it is in use */
845 if (time_before_eq(jiffies
, rth
->u
.dst
.expires
)) {
848 rthp
= &rth
->u
.dst
.rt_next
;
850 * We only count entries on
851 * a chain with equal hash inputs once
852 * so that entries for different QOS
853 * levels, and other non-hash input
854 * attributes don't unfairly skew
855 * the length computation
857 length
+= has_noalias(rt_hash_table
[i
].chain
, rth
);
860 } else if (!rt_may_expire(rth
, tmo
, ip_rt_gc_timeout
))
863 /* Cleanup aged off entries. */
864 *rthp
= rth
->u
.dst
.rt_next
;
867 spin_unlock_bh(rt_hash_lock_addr(i
));
869 sum2
+= length
*length
;
872 unsigned long avg
= sum
/ samples
;
873 unsigned long sd
= int_sqrt(sum2
/ samples
- avg
*avg
);
874 rt_chain_length_max
= max_t(unsigned long,
876 (avg
+ 4*sd
) >> FRACT_BITS
);
 * rt_worker_func() is run in process context.
 * We call rt_check_expire() to scan part of the hash table.
885 static void rt_worker_func(struct work_struct
*work
)
888 schedule_delayed_work(&expires_work
, ip_rt_gc_interval
);
 * Perturbation of rt_genid by a small quantity [1..256].
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without giving recent rt_genid.
 * Jenkins hash is strong enough that little changes of rt_genid are OK.
897 static void rt_cache_invalidate(struct net
*net
)
899 unsigned char shuffle
;
901 get_random_bytes(&shuffle
, sizeof(shuffle
));
902 atomic_add(shuffle
+ 1U, &net
->ipv4
.rt_genid
);
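/*
 * A standalone sketch (userspace C, not kernel code) of the generation
 * counter scheme described above rt_cache_invalidate(): every cached entry
 * records the generation it was created in, and bumping the per-namespace
 * counter makes all existing entries test as expired without walking the
 * hash table.  struct toy_entry and toy_is_expired() are illustrative
 * stand-ins for struct rtable and rt_is_expired().
 */
#if 0
#include <stdio.h>

static int cache_genid;			/* stands in for net->ipv4.rt_genid */

struct toy_entry {
	unsigned int daddr;
	int genid;			/* generation the entry was created in */
};

static int toy_is_expired(const struct toy_entry *e)
{
	return e->genid != cache_genid;	/* same test as rt_is_expired() */
}

int main(void)
{
	struct toy_entry e = { 0x0a000001, cache_genid };

	printf("expired before flush: %d\n", toy_is_expired(&e));
	cache_genid += 42;		/* rt_cache_invalidate() bumps the counter */
	printf("expired after flush:  %d\n", toy_is_expired(&e));
	return 0;
}
#endif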
906 * delay < 0 : invalidate cache (fast : entries will be deleted later)
907 * delay >= 0 : invalidate & flush cache (can be long)
909 void rt_cache_flush(struct net
*net
, int delay
)
911 rt_cache_invalidate(net
);
913 rt_do_flush(!in_softirq());
/* Flush previously invalidated entries from the cache */
917 void rt_cache_flush_batch(void)
919 rt_do_flush(!in_softirq());
923 * We change rt_genid and let gc do the cleanup
925 static void rt_secret_rebuild(unsigned long __net
)
927 struct net
*net
= (struct net
*)__net
;
928 rt_cache_invalidate(net
);
929 mod_timer(&net
->ipv4
.rt_secret_timer
, jiffies
+ ip_rt_secret_interval
);
932 static void rt_secret_rebuild_oneshot(struct net
*net
)
934 del_timer_sync(&net
->ipv4
.rt_secret_timer
);
935 rt_cache_invalidate(net
);
936 if (ip_rt_secret_interval
)
937 mod_timer(&net
->ipv4
.rt_secret_timer
, jiffies
+ ip_rt_secret_interval
);
940 static void rt_emergency_hash_rebuild(struct net
*net
)
942 if (net_ratelimit()) {
943 printk(KERN_WARNING
"Route hash chain too long!\n");
944 printk(KERN_WARNING
"Adjust your secret_interval!\n");
947 rt_secret_rebuild_oneshot(net
);
/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when the network
   is idle "expire" is large enough to keep enough warm entries,
   and when load increases it shrinks to limit the cache size.
 */
963 static int rt_garbage_collect(struct dst_ops
*ops
)
965 static unsigned long expire
= RT_GC_TIMEOUT
;
966 static unsigned long last_gc
;
968 static int equilibrium
;
969 struct rtable
*rth
, **rthp
;
970 unsigned long now
= jiffies
;
974 * Garbage collection is pretty expensive,
975 * do not make it too frequently.
978 RT_CACHE_STAT_INC(gc_total
);
980 if (now
- last_gc
< ip_rt_gc_min_interval
&&
981 atomic_read(&ipv4_dst_ops
.entries
) < ip_rt_max_size
) {
982 RT_CACHE_STAT_INC(gc_ignored
);
986 /* Calculate number of entries, which we want to expire now. */
987 goal
= atomic_read(&ipv4_dst_ops
.entries
) -
988 (ip_rt_gc_elasticity
<< rt_hash_log
);
990 if (equilibrium
< ipv4_dst_ops
.gc_thresh
)
991 equilibrium
= ipv4_dst_ops
.gc_thresh
;
992 goal
= atomic_read(&ipv4_dst_ops
.entries
) - equilibrium
;
994 equilibrium
+= min_t(unsigned int, goal
>> 1, rt_hash_mask
+ 1);
995 goal
= atomic_read(&ipv4_dst_ops
.entries
) - equilibrium
;
998 /* We are in dangerous area. Try to reduce cache really
1001 goal
= max_t(unsigned int, goal
>> 1, rt_hash_mask
+ 1);
1002 equilibrium
= atomic_read(&ipv4_dst_ops
.entries
) - goal
;
1005 if (now
- last_gc
>= ip_rt_gc_min_interval
)
1009 equilibrium
+= goal
;
1016 for (i
= rt_hash_mask
, k
= rover
; i
>= 0; i
--) {
1017 unsigned long tmo
= expire
;
1019 k
= (k
+ 1) & rt_hash_mask
;
1020 rthp
= &rt_hash_table
[k
].chain
;
1021 spin_lock_bh(rt_hash_lock_addr(k
));
1022 while ((rth
= *rthp
) != NULL
) {
1023 if (!rt_is_expired(rth
) &&
1024 !rt_may_expire(rth
, tmo
, expire
)) {
1026 rthp
= &rth
->u
.dst
.rt_next
;
1029 *rthp
= rth
->u
.dst
.rt_next
;
1033 spin_unlock_bh(rt_hash_lock_addr(k
));
	/* Goal is not achieved. We stop the process if:

	   - expire has been reduced to zero; otherwise, expire is halved.
	   - the table is not full.
	   - we are called from interrupt context.
	   - the jiffies check is just a fallback/debug loop breaker.
	     We will not spin here for a long time in any case.
	 */
1051 RT_CACHE_STAT_INC(gc_goal_miss
);
1057 #if RT_CACHE_DEBUG >= 2
1058 printk(KERN_DEBUG
"expire>> %u %d %d %d\n", expire
,
1059 atomic_read(&ipv4_dst_ops
.entries
), goal
, i
);
1062 if (atomic_read(&ipv4_dst_ops
.entries
) < ip_rt_max_size
)
1064 } while (!in_softirq() && time_before_eq(jiffies
, now
));
1066 if (atomic_read(&ipv4_dst_ops
.entries
) < ip_rt_max_size
)
1068 if (net_ratelimit())
1069 printk(KERN_WARNING
"dst cache overflow\n");
1070 RT_CACHE_STAT_INC(gc_dst_overflow
);
1074 expire
+= ip_rt_gc_min_interval
;
1075 if (expire
> ip_rt_gc_timeout
||
1076 atomic_read(&ipv4_dst_ops
.entries
) < ipv4_dst_ops
.gc_thresh
)
1077 expire
= ip_rt_gc_timeout
;
1078 #if RT_CACHE_DEBUG >= 2
1079 printk(KERN_DEBUG
"expire++ %u %d %d %d\n", expire
,
1080 atomic_read(&ipv4_dst_ops
.entries
), goal
, rover
);
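/*
 * A standalone sketch (userspace C, not kernel code) of how the expiration
 * strength from the GC-goals comment adapts inside rt_garbage_collect():
 * "expire" is halved while a pass keeps missing its goal, and drifts back
 * toward the base timeout once the cache is under its thresholds again.
 * HZ, the thresholds and the entry counts here are illustrative assumptions.
 */
#if 0
#include <stdio.h>

#define HZ		1000
#define GC_TIMEOUT	(300 * HZ)
#define GC_MIN_INTERVAL	(HZ / 2)

static unsigned long expire = GC_TIMEOUT;

static void gc_goal_missed(void)
{
	expire >>= 1;			/* be more aggressive on the next pass */
}

static void gc_relax(unsigned long entries, unsigned long gc_thresh)
{
	expire += GC_MIN_INTERVAL;	/* cache is healthy: weaken expiry again */
	if (expire > GC_TIMEOUT || entries < gc_thresh)
		expire = GC_TIMEOUT;
}

int main(void)
{
	int i;

	for (i = 0; i < 3; i++)
		gc_goal_missed();
	printf("after three missed goals: %lu\n", expire);
	gc_relax(100, 1000);		/* few entries left: back to the base timeout */
	printf("after relaxing:           %lu\n", expire);
	return 0;
}
#endif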
1086 * Returns number of entries in a hash chain that have different hash_inputs
1088 static int slow_chain_length(const struct rtable
*head
)
1091 const struct rtable
*rth
= head
;
1094 length
+= has_noalias(head
, rth
);
1095 rth
= rth
->u
.dst
.rt_next
;
1097 return length
>> FRACT_BITS
;
1100 static int rt_intern_hash(unsigned hash
, struct rtable
*rt
,
1101 struct rtable
**rp
, struct sk_buff
*skb
, int ifindex
)
1103 struct rtable
*rth
, **rthp
;
1105 struct rtable
*cand
, **candp
;
1108 int attempts
= !in_softirq();
1112 min_score
= ~(u32
)0;
1117 if (!rt_caching(dev_net(rt
->u
.dst
.dev
))) {
 * If we're not caching, just tell the caller we
 * were successful and don't touch the route.  The
 * caller holds the sole reference to the cache entry, and
 * it will be released when the caller is done with it.
 * If we dropped it here, the callers would have no way to resolve routes
 * when we're not caching.  Instead, just point *rp at rt, so
 * the caller gets a single use out of the route.
 * Note that we do rt_free on this new route entry, so that
 * once its refcount hits zero, we are still able to reap it.
 * Note also that rt_free uses call_rcu.  We don't actually
 * need rcu protection here, this is just our path to get
 * on the route gc list.
1134 if (rt
->rt_type
== RTN_UNICAST
|| rt
->fl
.iif
== 0) {
1135 int err
= arp_bind_neighbour(&rt
->u
.dst
);
1137 if (net_ratelimit())
1139 "Neighbour table failure & not caching routes.\n");
1149 rthp
= &rt_hash_table
[hash
].chain
;
1151 spin_lock_bh(rt_hash_lock_addr(hash
));
1152 while ((rth
= *rthp
) != NULL
) {
1153 if (rt_is_expired(rth
)) {
1154 *rthp
= rth
->u
.dst
.rt_next
;
1158 if (compare_keys(&rth
->fl
, &rt
->fl
) && compare_netns(rth
, rt
)) {
1160 *rthp
= rth
->u
.dst
.rt_next
;
1162 * Since lookup is lockfree, the deletion
1163 * must be visible to another weakly ordered CPU before
1164 * the insertion at the start of the hash chain.
1166 rcu_assign_pointer(rth
->u
.dst
.rt_next
,
1167 rt_hash_table
[hash
].chain
);
1169 * Since lookup is lockfree, the update writes
1170 * must be ordered for consistency on SMP.
1172 rcu_assign_pointer(rt_hash_table
[hash
].chain
, rth
);
1174 dst_use(&rth
->u
.dst
, now
);
1175 spin_unlock_bh(rt_hash_lock_addr(hash
));
1181 skb_dst_set(skb
, &rth
->u
.dst
);
1185 if (!atomic_read(&rth
->u
.dst
.__refcnt
)) {
1186 u32 score
= rt_score(rth
);
1188 if (score
<= min_score
) {
1197 rthp
= &rth
->u
.dst
.rt_next
;
	/* ip_rt_gc_elasticity used to be the average chain length;
	 * when it is exceeded, gc becomes really aggressive.
	 *
	 * The second limit is less certain. At the moment it allows
	 * only 2 entries per bucket. We will see.
1207 if (chain_length
> ip_rt_gc_elasticity
) {
1208 *candp
= cand
->u
.dst
.rt_next
;
1212 if (chain_length
> rt_chain_length_max
&&
1213 slow_chain_length(rt_hash_table
[hash
].chain
) > rt_chain_length_max
) {
1214 struct net
*net
= dev_net(rt
->u
.dst
.dev
);
1215 int num
= ++net
->ipv4
.current_rt_cache_rebuild_count
;
1216 if (!rt_caching(net
)) {
1217 printk(KERN_WARNING
"%s: %d rebuilds is over limit, route caching disabled\n",
1218 rt
->u
.dst
.dev
->name
, num
);
1220 rt_emergency_hash_rebuild(net
);
1221 spin_unlock_bh(rt_hash_lock_addr(hash
));
1223 hash
= rt_hash(rt
->fl
.fl4_dst
, rt
->fl
.fl4_src
,
1224 ifindex
, rt_genid(net
));
1229 /* Try to bind route to arp only if it is output
1230 route or unicast forwarding path.
1232 if (rt
->rt_type
== RTN_UNICAST
|| rt
->fl
.iif
== 0) {
1233 int err
= arp_bind_neighbour(&rt
->u
.dst
);
1235 spin_unlock_bh(rt_hash_lock_addr(hash
));
1237 if (err
!= -ENOBUFS
) {
1242 /* Neighbour tables are full and nothing
1243 can be released. Try to shrink route cache,
1244 it is most likely it holds some neighbour records.
1246 if (attempts
-- > 0) {
1247 int saved_elasticity
= ip_rt_gc_elasticity
;
1248 int saved_int
= ip_rt_gc_min_interval
;
1249 ip_rt_gc_elasticity
= 1;
1250 ip_rt_gc_min_interval
= 0;
1251 rt_garbage_collect(&ipv4_dst_ops
);
1252 ip_rt_gc_min_interval
= saved_int
;
1253 ip_rt_gc_elasticity
= saved_elasticity
;
1257 if (net_ratelimit())
1258 printk(KERN_WARNING
"Neighbour table overflow.\n");
1264 rt
->u
.dst
.rt_next
= rt_hash_table
[hash
].chain
;
1266 #if RT_CACHE_DEBUG >= 2
1267 if (rt
->u
.dst
.rt_next
) {
1269 printk(KERN_DEBUG
"rt_cache @%02x: %pI4",
1271 for (trt
= rt
->u
.dst
.rt_next
; trt
; trt
= trt
->u
.dst
.rt_next
)
1272 printk(" . %pI4", &trt
->rt_dst
);
 * Since lookup is lockfree, we must make sure
 * previous writes to rt are committed to memory
 * before making rt visible to other CPUs.
1281 rcu_assign_pointer(rt_hash_table
[hash
].chain
, rt
);
1283 spin_unlock_bh(rt_hash_lock_addr(hash
));
1289 skb_dst_set(skb
, &rt
->u
.dst
);
1293 void rt_bind_peer(struct rtable
*rt
, int create
)
1295 static DEFINE_SPINLOCK(rt_peer_lock
);
1296 struct inet_peer
*peer
;
1298 peer
= inet_getpeer(rt
->rt_dst
, create
);
1300 spin_lock_bh(&rt_peer_lock
);
1301 if (rt
->peer
== NULL
) {
1305 spin_unlock_bh(&rt_peer_lock
);
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * selecting an ID that is unique within a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
1317 static void ip_select_fb_ident(struct iphdr
*iph
)
1319 static DEFINE_SPINLOCK(ip_fb_id_lock
);
1320 static u32 ip_fallback_id
;
1323 spin_lock_bh(&ip_fb_id_lock
);
1324 salt
= secure_ip_id((__force __be32
)ip_fallback_id
^ iph
->daddr
);
1325 iph
->id
= htons(salt
& 0xFFFF);
1326 ip_fallback_id
= salt
;
1327 spin_unlock_bh(&ip_fb_id_lock
);
1330 void __ip_select_ident(struct iphdr
*iph
, struct dst_entry
*dst
, int more
)
1332 struct rtable
*rt
= (struct rtable
*) dst
;
1335 if (rt
->peer
== NULL
)
1336 rt_bind_peer(rt
, 1);
1338 /* If peer is attached to destination, it is never detached,
1339 so that we need not to grab a lock to dereference it.
1342 iph
->id
= htons(inet_getid(rt
->peer
, more
));
1346 printk(KERN_DEBUG
"rt_bind_peer(0) @%p\n",
1347 __builtin_return_address(0));
1349 ip_select_fb_ident(iph
);
1352 static void rt_del(unsigned hash
, struct rtable
*rt
)
1354 struct rtable
**rthp
, *aux
;
1356 rthp
= &rt_hash_table
[hash
].chain
;
1357 spin_lock_bh(rt_hash_lock_addr(hash
));
1359 while ((aux
= *rthp
) != NULL
) {
1360 if (aux
== rt
|| rt_is_expired(aux
)) {
1361 *rthp
= aux
->u
.dst
.rt_next
;
1365 rthp
= &aux
->u
.dst
.rt_next
;
1367 spin_unlock_bh(rt_hash_lock_addr(hash
));
1370 void ip_rt_redirect(__be32 old_gw
, __be32 daddr
, __be32 new_gw
,
1371 __be32 saddr
, struct net_device
*dev
)
1374 struct in_device
*in_dev
= in_dev_get(dev
);
1375 struct rtable
*rth
, **rthp
;
1376 __be32 skeys
[2] = { saddr
, 0 };
1377 int ikeys
[2] = { dev
->ifindex
, 0 };
1378 struct netevent_redirect netevent
;
1385 if (new_gw
== old_gw
|| !IN_DEV_RX_REDIRECTS(in_dev
) ||
1386 ipv4_is_multicast(new_gw
) || ipv4_is_lbcast(new_gw
) ||
1387 ipv4_is_zeronet(new_gw
))
1388 goto reject_redirect
;
1390 if (!rt_caching(net
))
1391 goto reject_redirect
;
1393 if (!IN_DEV_SHARED_MEDIA(in_dev
)) {
1394 if (!inet_addr_onlink(in_dev
, new_gw
, old_gw
))
1395 goto reject_redirect
;
1396 if (IN_DEV_SEC_REDIRECTS(in_dev
) && ip_fib_check_default(new_gw
, dev
))
1397 goto reject_redirect
;
1399 if (inet_addr_type(net
, new_gw
) != RTN_UNICAST
)
1400 goto reject_redirect
;
1403 for (i
= 0; i
< 2; i
++) {
1404 for (k
= 0; k
< 2; k
++) {
1405 unsigned hash
= rt_hash(daddr
, skeys
[i
], ikeys
[k
],
1408 rthp
=&rt_hash_table
[hash
].chain
;
1411 while ((rth
= rcu_dereference(*rthp
)) != NULL
) {
1414 if (rth
->fl
.fl4_dst
!= daddr
||
1415 rth
->fl
.fl4_src
!= skeys
[i
] ||
1416 rth
->fl
.oif
!= ikeys
[k
] ||
1418 rt_is_expired(rth
) ||
1419 !net_eq(dev_net(rth
->u
.dst
.dev
), net
)) {
1420 rthp
= &rth
->u
.dst
.rt_next
;
1424 if (rth
->rt_dst
!= daddr
||
1425 rth
->rt_src
!= saddr
||
1427 rth
->rt_gateway
!= old_gw
||
1428 rth
->u
.dst
.dev
!= dev
)
1431 dst_hold(&rth
->u
.dst
);
1434 rt
= dst_alloc(&ipv4_dst_ops
);
1441 /* Copy all the information. */
1443 rt
->u
.dst
.__use
= 1;
1444 atomic_set(&rt
->u
.dst
.__refcnt
, 1);
1445 rt
->u
.dst
.child
= NULL
;
1447 dev_hold(rt
->u
.dst
.dev
);
1449 in_dev_hold(rt
->idev
);
1450 rt
->u
.dst
.obsolete
= -1;
1451 rt
->u
.dst
.lastuse
= jiffies
;
1452 rt
->u
.dst
.path
= &rt
->u
.dst
;
1453 rt
->u
.dst
.neighbour
= NULL
;
1454 rt
->u
.dst
.hh
= NULL
;
1456 rt
->u
.dst
.xfrm
= NULL
;
1458 rt
->rt_genid
= rt_genid(net
);
1459 rt
->rt_flags
|= RTCF_REDIRECTED
;
1461 /* Gateway is different ... */
1462 rt
->rt_gateway
= new_gw
;
1464 /* Redirect received -> path was valid */
1465 dst_confirm(&rth
->u
.dst
);
1468 atomic_inc(&rt
->peer
->refcnt
);
1470 if (arp_bind_neighbour(&rt
->u
.dst
) ||
1471 !(rt
->u
.dst
.neighbour
->nud_state
&
1473 if (rt
->u
.dst
.neighbour
)
1474 neigh_event_send(rt
->u
.dst
.neighbour
, NULL
);
1480 netevent
.old
= &rth
->u
.dst
;
1481 netevent
.new = &rt
->u
.dst
;
1482 call_netevent_notifiers(NETEVENT_REDIRECT
,
1486 if (!rt_intern_hash(hash
, rt
, &rt
, NULL
, rt
->fl
.oif
))
1499 #ifdef CONFIG_IP_ROUTE_VERBOSE
1500 if (IN_DEV_LOG_MARTIANS(in_dev
) && net_ratelimit())
1501 printk(KERN_INFO
"Redirect from %pI4 on %s about %pI4 ignored.\n"
1502 " Advised path = %pI4 -> %pI4\n",
1503 &old_gw
, dev
->name
, &new_gw
,
1509 static struct dst_entry
*ipv4_negative_advice(struct dst_entry
*dst
)
1511 struct rtable
*rt
= (struct rtable
*)dst
;
1512 struct dst_entry
*ret
= dst
;
1515 if (dst
->obsolete
> 0) {
1518 } else if ((rt
->rt_flags
& RTCF_REDIRECTED
) ||
1519 (rt
->u
.dst
.expires
&&
1520 time_after_eq(jiffies
, rt
->u
.dst
.expires
))) {
1521 unsigned hash
= rt_hash(rt
->fl
.fl4_dst
, rt
->fl
.fl4_src
,
1523 rt_genid(dev_net(dst
->dev
)));
1524 #if RT_CACHE_DEBUG >= 1
1525 printk(KERN_DEBUG
"ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1526 &rt
->rt_dst
, rt
->fl
.fl4_tos
);
 * 1. The first ip_rt_redirect_number redirects are sent
 *    with exponential backoff, then we stop sending them at all,
 *    assuming that the host ignores our redirects.
 * 2. If we did not see packets requiring redirects
 *    during ip_rt_redirect_silence, we assume that the host
 *    forgot the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1551 void ip_rt_send_redirect(struct sk_buff
*skb
)
1553 struct rtable
*rt
= skb_rtable(skb
);
1554 struct in_device
*in_dev
;
1558 in_dev
= __in_dev_get_rcu(rt
->u
.dst
.dev
);
1559 if (!in_dev
|| !IN_DEV_TX_REDIRECTS(in_dev
)) {
1563 log_martians
= IN_DEV_LOG_MARTIANS(in_dev
);
1566 /* No redirected packets during ip_rt_redirect_silence;
1567 * reset the algorithm.
1569 if (time_after(jiffies
, rt
->u
.dst
.rate_last
+ ip_rt_redirect_silence
))
1570 rt
->u
.dst
.rate_tokens
= 0;
1572 /* Too many ignored redirects; do not send anything
1573 * set u.dst.rate_last to the last seen redirected packet.
1575 if (rt
->u
.dst
.rate_tokens
>= ip_rt_redirect_number
) {
1576 rt
->u
.dst
.rate_last
= jiffies
;
1580 /* Check for load limit; set rate_last to the latest sent
1583 if (rt
->u
.dst
.rate_tokens
== 0 ||
1585 (rt
->u
.dst
.rate_last
+
1586 (ip_rt_redirect_load
<< rt
->u
.dst
.rate_tokens
)))) {
1587 icmp_send(skb
, ICMP_REDIRECT
, ICMP_REDIR_HOST
, rt
->rt_gateway
);
1588 rt
->u
.dst
.rate_last
= jiffies
;
1589 ++rt
->u
.dst
.rate_tokens
;
1590 #ifdef CONFIG_IP_ROUTE_VERBOSE
1592 rt
->u
.dst
.rate_tokens
== ip_rt_redirect_number
&&
1594 printk(KERN_WARNING
"host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1595 &rt
->rt_src
, rt
->rt_iif
,
1596 &rt
->rt_dst
, &rt
->rt_gateway
);
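/*
 * A standalone sketch (userspace C, not kernel code) of the back-off
 * described above ip_rt_send_redirect(): the n-th redirect is only sent
 * once ip_rt_redirect_load << n jiffies have passed, after
 * ip_rt_redirect_number redirects we stop entirely, and a quiet period of
 * ip_rt_redirect_silence resets the counter.  Time handling is simplified
 * to plain unsigned arithmetic, and struct toy_dst is an illustrative
 * stand-in for the rate fields kept in the dst entry.
 */
#if 0
#include <stdio.h>

#define HZ			1000
#define REDIRECT_NUMBER		9
#define REDIRECT_LOAD		(HZ / 50)
#define REDIRECT_SILENCE	((HZ / 50) << (9 + 1))

struct toy_dst {
	unsigned long rate_last;	/* time the last redirect was sent */
	unsigned long rate_tokens;	/* redirects sent in the current burst */
};

static int should_send_redirect(struct toy_dst *d, unsigned long now)
{
	if (now > d->rate_last + REDIRECT_SILENCE)
		d->rate_tokens = 0;	/* peer was quiet long enough: start over */

	if (d->rate_tokens >= REDIRECT_NUMBER) {
		d->rate_last = now;	/* give up, the host ignores our redirects */
		return 0;
	}

	if (d->rate_tokens == 0 ||
	    now > d->rate_last + (REDIRECT_LOAD << d->rate_tokens)) {
		d->rate_last = now;
		d->rate_tokens++;	/* exponentially growing gap between sends */
		return 1;
	}
	return 0;
}

int main(void)
{
	struct toy_dst d = { 0, 0 };
	unsigned long now;

	for (now = 0; now < 10 * HZ; now += HZ / 10)
		if (should_send_redirect(&d, now))
			printf("redirect sent at t=%lu ms (token %lu)\n",
			       now, d.rate_tokens);
	return 0;
}
#endif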
1601 static int ip_error(struct sk_buff
*skb
)
1603 struct rtable
*rt
= skb_rtable(skb
);
1607 switch (rt
->u
.dst
.error
) {
1612 code
= ICMP_HOST_UNREACH
;
1615 code
= ICMP_NET_UNREACH
;
1616 IP_INC_STATS_BH(dev_net(rt
->u
.dst
.dev
),
1617 IPSTATS_MIB_INNOROUTES
);
1620 code
= ICMP_PKT_FILTERED
;
1625 rt
->u
.dst
.rate_tokens
+= now
- rt
->u
.dst
.rate_last
;
1626 if (rt
->u
.dst
.rate_tokens
> ip_rt_error_burst
)
1627 rt
->u
.dst
.rate_tokens
= ip_rt_error_burst
;
1628 rt
->u
.dst
.rate_last
= now
;
1629 if (rt
->u
.dst
.rate_tokens
>= ip_rt_error_cost
) {
1630 rt
->u
.dst
.rate_tokens
-= ip_rt_error_cost
;
1631 icmp_send(skb
, ICMP_DEST_UNREACH
, code
, 0);
1634 out
: kfree_skb(skb
);
1639 * The last two values are not from the RFC but
1640 * are needed for AMPRnet AX.25 paths.
1643 static const unsigned short mtu_plateau
[] =
1644 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1646 static inline unsigned short guess_mtu(unsigned short old_mtu
)
1650 for (i
= 0; i
< ARRAY_SIZE(mtu_plateau
); i
++)
1651 if (old_mtu
> mtu_plateau
[i
])
1652 return mtu_plateau
[i
];
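/*
 * A standalone sketch (userspace C, not kernel code) of the plateau-table
 * guess above: when a "fragmentation needed" ICMP arrives without a usable
 * next-hop MTU, fall back to the next plateau strictly below the size of
 * the packet that did not fit.  The 68-byte floor in the fallback return is
 * an assumption (the minimum IPv4 MTU), not copied from this file.
 */
#if 0
#include <stdio.h>

static const unsigned short mtu_plateau[] =
	{ 32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static unsigned short toy_guess_mtu(unsigned short old_mtu)
{
	unsigned int i;

	for (i = 0; i < sizeof(mtu_plateau) / sizeof(mtu_plateau[0]); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;	/* assumed floor when nothing smaller matches */
}

int main(void)
{
	printf("guess_mtu(1500) = %u\n", toy_guess_mtu(1500));	/* 1492 */
	printf("guess_mtu(600)  = %u\n", toy_guess_mtu(600));	/* 576 */
	return 0;
}
#endif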
1656 unsigned short ip_rt_frag_needed(struct net
*net
, struct iphdr
*iph
,
1657 unsigned short new_mtu
,
1658 struct net_device
*dev
)
1661 unsigned short old_mtu
= ntohs(iph
->tot_len
);
1663 int ikeys
[2] = { dev
->ifindex
, 0 };
1664 __be32 skeys
[2] = { iph
->saddr
, 0, };
1665 __be32 daddr
= iph
->daddr
;
1666 unsigned short est_mtu
= 0;
1668 for (k
= 0; k
< 2; k
++) {
1669 for (i
= 0; i
< 2; i
++) {
1670 unsigned hash
= rt_hash(daddr
, skeys
[i
], ikeys
[k
],
1674 for (rth
= rcu_dereference(rt_hash_table
[hash
].chain
); rth
;
1675 rth
= rcu_dereference(rth
->u
.dst
.rt_next
)) {
1676 unsigned short mtu
= new_mtu
;
1678 if (rth
->fl
.fl4_dst
!= daddr
||
1679 rth
->fl
.fl4_src
!= skeys
[i
] ||
1680 rth
->rt_dst
!= daddr
||
1681 rth
->rt_src
!= iph
->saddr
||
1682 rth
->fl
.oif
!= ikeys
[k
] ||
1684 dst_metric_locked(&rth
->u
.dst
, RTAX_MTU
) ||
1685 !net_eq(dev_net(rth
->u
.dst
.dev
), net
) ||
1689 if (new_mtu
< 68 || new_mtu
>= old_mtu
) {
1691 /* BSD 4.2 compatibility hack :-( */
1693 old_mtu
>= dst_mtu(&rth
->u
.dst
) &&
1694 old_mtu
>= 68 + (iph
->ihl
<< 2))
1695 old_mtu
-= iph
->ihl
<< 2;
1697 mtu
= guess_mtu(old_mtu
);
1699 if (mtu
<= dst_mtu(&rth
->u
.dst
)) {
1700 if (mtu
< dst_mtu(&rth
->u
.dst
)) {
1701 dst_confirm(&rth
->u
.dst
);
1702 if (mtu
< ip_rt_min_pmtu
) {
1703 mtu
= ip_rt_min_pmtu
;
1704 rth
->u
.dst
.metrics
[RTAX_LOCK
-1] |=
1707 rth
->u
.dst
.metrics
[RTAX_MTU
-1] = mtu
;
1708 dst_set_expires(&rth
->u
.dst
,
1717 return est_mtu
? : new_mtu
;
1720 static void ip_rt_update_pmtu(struct dst_entry
*dst
, u32 mtu
)
1722 if (dst_mtu(dst
) > mtu
&& mtu
>= 68 &&
1723 !(dst_metric_locked(dst
, RTAX_MTU
))) {
1724 if (mtu
< ip_rt_min_pmtu
) {
1725 mtu
= ip_rt_min_pmtu
;
1726 dst
->metrics
[RTAX_LOCK
-1] |= (1 << RTAX_MTU
);
1728 dst
->metrics
[RTAX_MTU
-1] = mtu
;
1729 dst_set_expires(dst
, ip_rt_mtu_expires
);
1730 call_netevent_notifiers(NETEVENT_PMTU_UPDATE
, dst
);
1734 static struct dst_entry
*ipv4_dst_check(struct dst_entry
*dst
, u32 cookie
)
1736 if (rt_is_expired((struct rtable
*)dst
))
1741 static void ipv4_dst_destroy(struct dst_entry
*dst
)
1743 struct rtable
*rt
= (struct rtable
*) dst
;
1744 struct inet_peer
*peer
= rt
->peer
;
1745 struct in_device
*idev
= rt
->idev
;
1758 static void ipv4_dst_ifdown(struct dst_entry
*dst
, struct net_device
*dev
,
1761 struct rtable
*rt
= (struct rtable
*) dst
;
1762 struct in_device
*idev
= rt
->idev
;
1763 if (dev
!= dev_net(dev
)->loopback_dev
&& idev
&& idev
->dev
== dev
) {
1764 struct in_device
*loopback_idev
=
1765 in_dev_get(dev_net(dev
)->loopback_dev
);
1766 if (loopback_idev
) {
1767 rt
->idev
= loopback_idev
;
1773 static void ipv4_link_failure(struct sk_buff
*skb
)
1777 icmp_send(skb
, ICMP_DEST_UNREACH
, ICMP_HOST_UNREACH
, 0);
1779 rt
= skb_rtable(skb
);
1781 dst_set_expires(&rt
->u
.dst
, 0);
1784 static int ip_rt_bug(struct sk_buff
*skb
)
1786 printk(KERN_DEBUG
"ip_rt_bug: %pI4 -> %pI4, %s\n",
1787 &ip_hdr(skb
)->saddr
, &ip_hdr(skb
)->daddr
,
1788 skb
->dev
? skb
->dev
->name
: "?");
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned.
1802 void ip_rt_get_source(u8
*addr
, struct rtable
*rt
)
1805 struct fib_result res
;
1807 if (rt
->fl
.iif
== 0)
1809 else if (fib_lookup(dev_net(rt
->u
.dst
.dev
), &rt
->fl
, &res
) == 0) {
1810 src
= FIB_RES_PREFSRC(res
);
1813 src
= inet_select_addr(rt
->u
.dst
.dev
, rt
->rt_gateway
,
1815 memcpy(addr
, &src
, 4);
1818 #ifdef CONFIG_NET_CLS_ROUTE
1819 static void set_class_tag(struct rtable
*rt
, u32 tag
)
1821 if (!(rt
->u
.dst
.tclassid
& 0xFFFF))
1822 rt
->u
.dst
.tclassid
|= tag
& 0xFFFF;
1823 if (!(rt
->u
.dst
.tclassid
& 0xFFFF0000))
1824 rt
->u
.dst
.tclassid
|= tag
& 0xFFFF0000;
1828 static void rt_set_nexthop(struct rtable
*rt
, struct fib_result
*res
, u32 itag
)
1830 struct fib_info
*fi
= res
->fi
;
1833 if (FIB_RES_GW(*res
) &&
1834 FIB_RES_NH(*res
).nh_scope
== RT_SCOPE_LINK
)
1835 rt
->rt_gateway
= FIB_RES_GW(*res
);
1836 memcpy(rt
->u
.dst
.metrics
, fi
->fib_metrics
,
1837 sizeof(rt
->u
.dst
.metrics
));
1838 if (fi
->fib_mtu
== 0) {
1839 rt
->u
.dst
.metrics
[RTAX_MTU
-1] = rt
->u
.dst
.dev
->mtu
;
1840 if (dst_metric_locked(&rt
->u
.dst
, RTAX_MTU
) &&
1841 rt
->rt_gateway
!= rt
->rt_dst
&&
1842 rt
->u
.dst
.dev
->mtu
> 576)
1843 rt
->u
.dst
.metrics
[RTAX_MTU
-1] = 576;
1845 #ifdef CONFIG_NET_CLS_ROUTE
1846 rt
->u
.dst
.tclassid
= FIB_RES_NH(*res
).nh_tclassid
;
1849 rt
->u
.dst
.metrics
[RTAX_MTU
-1]= rt
->u
.dst
.dev
->mtu
;
1851 if (dst_metric(&rt
->u
.dst
, RTAX_HOPLIMIT
) == 0)
1852 rt
->u
.dst
.metrics
[RTAX_HOPLIMIT
-1] = sysctl_ip_default_ttl
;
1853 if (dst_mtu(&rt
->u
.dst
) > IP_MAX_MTU
)
1854 rt
->u
.dst
.metrics
[RTAX_MTU
-1] = IP_MAX_MTU
;
1855 if (dst_metric(&rt
->u
.dst
, RTAX_ADVMSS
) == 0)
1856 rt
->u
.dst
.metrics
[RTAX_ADVMSS
-1] = max_t(unsigned int, rt
->u
.dst
.dev
->mtu
- 40,
1858 if (dst_metric(&rt
->u
.dst
, RTAX_ADVMSS
) > 65535 - 40)
1859 rt
->u
.dst
.metrics
[RTAX_ADVMSS
-1] = 65535 - 40;
1861 #ifdef CONFIG_NET_CLS_ROUTE
1862 #ifdef CONFIG_IP_MULTIPLE_TABLES
1863 set_class_tag(rt
, fib_rules_tclass(res
));
1865 set_class_tag(rt
, itag
);
1867 rt
->rt_type
= res
->type
;
1870 static int ip_route_input_mc(struct sk_buff
*skb
, __be32 daddr
, __be32 saddr
,
1871 u8 tos
, struct net_device
*dev
, int our
)
1876 struct in_device
*in_dev
= in_dev_get(dev
);
1879 /* Primary sanity checks. */
1884 if (ipv4_is_multicast(saddr
) || ipv4_is_lbcast(saddr
) ||
1885 ipv4_is_loopback(saddr
) || skb
->protocol
!= htons(ETH_P_IP
))
1888 if (ipv4_is_zeronet(saddr
)) {
1889 if (!ipv4_is_local_multicast(daddr
))
1891 spec_dst
= inet_select_addr(dev
, 0, RT_SCOPE_LINK
);
1892 } else if (fib_validate_source(saddr
, 0, tos
, 0,
1893 dev
, &spec_dst
, &itag
, 0) < 0)
1896 rth
= dst_alloc(&ipv4_dst_ops
);
1900 rth
->u
.dst
.output
= ip_rt_bug
;
1901 rth
->u
.dst
.obsolete
= -1;
1903 atomic_set(&rth
->u
.dst
.__refcnt
, 1);
1904 rth
->u
.dst
.flags
= DST_HOST
;
1905 if (IN_DEV_CONF_GET(in_dev
, NOPOLICY
))
1906 rth
->u
.dst
.flags
|= DST_NOPOLICY
;
1907 rth
->fl
.fl4_dst
= daddr
;
1908 rth
->rt_dst
= daddr
;
1909 rth
->fl
.fl4_tos
= tos
;
1910 rth
->fl
.mark
= skb
->mark
;
1911 rth
->fl
.fl4_src
= saddr
;
1912 rth
->rt_src
= saddr
;
1913 #ifdef CONFIG_NET_CLS_ROUTE
1914 rth
->u
.dst
.tclassid
= itag
;
1917 rth
->fl
.iif
= dev
->ifindex
;
1918 rth
->u
.dst
.dev
= init_net
.loopback_dev
;
1919 dev_hold(rth
->u
.dst
.dev
);
1920 rth
->idev
= in_dev_get(rth
->u
.dst
.dev
);
1922 rth
->rt_gateway
= daddr
;
1923 rth
->rt_spec_dst
= spec_dst
;
1924 rth
->rt_genid
= rt_genid(dev_net(dev
));
1925 rth
->rt_flags
= RTCF_MULTICAST
;
1926 rth
->rt_type
= RTN_MULTICAST
;
1928 rth
->u
.dst
.input
= ip_local_deliver
;
1929 rth
->rt_flags
|= RTCF_LOCAL
;
1932 #ifdef CONFIG_IP_MROUTE
1933 if (!ipv4_is_local_multicast(daddr
) && IN_DEV_MFORWARD(in_dev
))
1934 rth
->u
.dst
.input
= ip_mr_input
;
1936 RT_CACHE_STAT_INC(in_slow_mc
);
1939 hash
= rt_hash(daddr
, saddr
, dev
->ifindex
, rt_genid(dev_net(dev
)));
1940 return rt_intern_hash(hash
, rth
, NULL
, skb
, dev
->ifindex
);
1952 static void ip_handle_martian_source(struct net_device
*dev
,
1953 struct in_device
*in_dev
,
1954 struct sk_buff
*skb
,
1958 RT_CACHE_STAT_INC(in_martian_src
);
1959 #ifdef CONFIG_IP_ROUTE_VERBOSE
1960 if (IN_DEV_LOG_MARTIANS(in_dev
) && net_ratelimit()) {
1962 * RFC1812 recommendation, if source is martian,
1963 * the only hint is MAC header.
1965 printk(KERN_WARNING
"martian source %pI4 from %pI4, on dev %s\n",
1966 &daddr
, &saddr
, dev
->name
);
1967 if (dev
->hard_header_len
&& skb_mac_header_was_set(skb
)) {
1969 const unsigned char *p
= skb_mac_header(skb
);
1970 printk(KERN_WARNING
"ll header: ");
1971 for (i
= 0; i
< dev
->hard_header_len
; i
++, p
++) {
1973 if (i
< (dev
->hard_header_len
- 1))
1982 static int __mkroute_input(struct sk_buff
*skb
,
1983 struct fib_result
*res
,
1984 struct in_device
*in_dev
,
1985 __be32 daddr
, __be32 saddr
, u32 tos
,
1986 struct rtable
**result
)
1991 struct in_device
*out_dev
;
1996 /* get a working reference to the output device */
1997 out_dev
= in_dev_get(FIB_RES_DEV(*res
));
1998 if (out_dev
== NULL
) {
1999 if (net_ratelimit())
2000 printk(KERN_CRIT
"Bug in ip_route_input" \
2001 "_slow(). Please, report\n");
2006 err
= fib_validate_source(saddr
, daddr
, tos
, FIB_RES_OIF(*res
),
2007 in_dev
->dev
, &spec_dst
, &itag
, skb
->mark
);
2009 ip_handle_martian_source(in_dev
->dev
, in_dev
, skb
, daddr
,
2017 flags
|= RTCF_DIRECTSRC
;
2019 if (out_dev
== in_dev
&& err
&&
2020 (IN_DEV_SHARED_MEDIA(out_dev
) ||
2021 inet_addr_onlink(out_dev
, saddr
, FIB_RES_GW(*res
))))
2022 flags
|= RTCF_DOREDIRECT
;
2024 if (skb
->protocol
!= htons(ETH_P_IP
)) {
		/* Not IP (i.e. ARP). Do not create a route if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back on the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
2032 if (out_dev
== in_dev
&&
2033 IN_DEV_PROXY_ARP_PVLAN(in_dev
) == 0) {
2040 rth
= dst_alloc(&ipv4_dst_ops
);
2046 atomic_set(&rth
->u
.dst
.__refcnt
, 1);
2047 rth
->u
.dst
.flags
= DST_HOST
;
2048 if (IN_DEV_CONF_GET(in_dev
, NOPOLICY
))
2049 rth
->u
.dst
.flags
|= DST_NOPOLICY
;
2050 if (IN_DEV_CONF_GET(out_dev
, NOXFRM
))
2051 rth
->u
.dst
.flags
|= DST_NOXFRM
;
2052 rth
->fl
.fl4_dst
= daddr
;
2053 rth
->rt_dst
= daddr
;
2054 rth
->fl
.fl4_tos
= tos
;
2055 rth
->fl
.mark
= skb
->mark
;
2056 rth
->fl
.fl4_src
= saddr
;
2057 rth
->rt_src
= saddr
;
2058 rth
->rt_gateway
= daddr
;
2060 rth
->fl
.iif
= in_dev
->dev
->ifindex
;
2061 rth
->u
.dst
.dev
= (out_dev
)->dev
;
2062 dev_hold(rth
->u
.dst
.dev
);
2063 rth
->idev
= in_dev_get(rth
->u
.dst
.dev
);
2065 rth
->rt_spec_dst
= spec_dst
;
2067 rth
->u
.dst
.obsolete
= -1;
2068 rth
->u
.dst
.input
= ip_forward
;
2069 rth
->u
.dst
.output
= ip_output
;
2070 rth
->rt_genid
= rt_genid(dev_net(rth
->u
.dst
.dev
));
2072 rt_set_nexthop(rth
, res
, itag
);
2074 rth
->rt_flags
= flags
;
2079 /* release the working reference to the output device */
2080 in_dev_put(out_dev
);
2084 static int ip_mkroute_input(struct sk_buff
*skb
,
2085 struct fib_result
*res
,
2086 const struct flowi
*fl
,
2087 struct in_device
*in_dev
,
2088 __be32 daddr
, __be32 saddr
, u32 tos
)
2090 struct rtable
* rth
= NULL
;
2094 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2095 if (res
->fi
&& res
->fi
->fib_nhs
> 1 && fl
->oif
== 0)
2096 fib_select_multipath(fl
, res
);
2099 /* create a routing cache entry */
2100 err
= __mkroute_input(skb
, res
, in_dev
, daddr
, saddr
, tos
, &rth
);
2104 /* put it into the cache */
2105 hash
= rt_hash(daddr
, saddr
, fl
->iif
,
2106 rt_genid(dev_net(rth
->u
.dst
.dev
)));
2107 return rt_intern_hash(hash
, rth
, NULL
, skb
, fl
->iif
);
/*
 *	NOTE. We drop all packets that have a local source
 *	address, because every properly looped-back packet
 *	must already have the correct destination attached by the output routine.
 *
 *	Such an approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 */
2120 static int ip_route_input_slow(struct sk_buff
*skb
, __be32 daddr
, __be32 saddr
,
2121 u8 tos
, struct net_device
*dev
)
2123 struct fib_result res
;
2124 struct in_device
*in_dev
= in_dev_get(dev
);
2125 struct flowi fl
= { .nl_u
= { .ip4_u
=
2129 .scope
= RT_SCOPE_UNIVERSE
,
2132 .iif
= dev
->ifindex
};
2135 struct rtable
* rth
;
2140 struct net
* net
= dev_net(dev
);
2142 /* IP on this device is disabled. */
	/* Check for the most weird martians, which may not be detected
2151 if (ipv4_is_multicast(saddr
) || ipv4_is_lbcast(saddr
) ||
2152 ipv4_is_loopback(saddr
))
2153 goto martian_source
;
2155 if (daddr
== htonl(0xFFFFFFFF) || (saddr
== 0 && daddr
== 0))
	/* Accept zero addresses only to limited broadcast;
	 * I do not even know whether to fix it or not. Waiting for complaints :-)
	 */
2161 if (ipv4_is_zeronet(saddr
))
2162 goto martian_source
;
2164 if (ipv4_is_lbcast(daddr
) || ipv4_is_zeronet(daddr
) ||
2165 ipv4_is_loopback(daddr
))
2166 goto martian_destination
;
2169 * Now we are ready to route packet.
2171 if ((err
= fib_lookup(net
, &fl
, &res
)) != 0) {
2172 if (!IN_DEV_FORWARD(in_dev
))
2178 RT_CACHE_STAT_INC(in_slow_tot
);
2180 if (res
.type
== RTN_BROADCAST
)
2183 if (res
.type
== RTN_LOCAL
) {
2185 result
= fib_validate_source(saddr
, daddr
, tos
,
2186 net
->loopback_dev
->ifindex
,
2187 dev
, &spec_dst
, &itag
, skb
->mark
);
2189 goto martian_source
;
2191 flags
|= RTCF_DIRECTSRC
;
2196 if (!IN_DEV_FORWARD(in_dev
))
2198 if (res
.type
!= RTN_UNICAST
)
2199 goto martian_destination
;
2201 err
= ip_mkroute_input(skb
, &res
, &fl
, in_dev
, daddr
, saddr
, tos
);
2209 if (skb
->protocol
!= htons(ETH_P_IP
))
2212 if (ipv4_is_zeronet(saddr
))
2213 spec_dst
= inet_select_addr(dev
, 0, RT_SCOPE_LINK
);
2215 err
= fib_validate_source(saddr
, 0, tos
, 0, dev
, &spec_dst
,
2218 goto martian_source
;
2220 flags
|= RTCF_DIRECTSRC
;
2222 flags
|= RTCF_BROADCAST
;
2223 res
.type
= RTN_BROADCAST
;
2224 RT_CACHE_STAT_INC(in_brd
);
2227 rth
= dst_alloc(&ipv4_dst_ops
);
2231 rth
->u
.dst
.output
= ip_rt_bug
;
2232 rth
->u
.dst
.obsolete
= -1;
2233 rth
->rt_genid
= rt_genid(net
);
2235 atomic_set(&rth
->u
.dst
.__refcnt
, 1);
2236 rth
->u
.dst
.flags
= DST_HOST
;
2237 if (IN_DEV_CONF_GET(in_dev
, NOPOLICY
))
2238 rth
->u
.dst
.flags
|= DST_NOPOLICY
;
2239 rth
->fl
.fl4_dst
= daddr
;
2240 rth
->rt_dst
= daddr
;
2241 rth
->fl
.fl4_tos
= tos
;
2242 rth
->fl
.mark
= skb
->mark
;
2243 rth
->fl
.fl4_src
= saddr
;
2244 rth
->rt_src
= saddr
;
2245 #ifdef CONFIG_NET_CLS_ROUTE
2246 rth
->u
.dst
.tclassid
= itag
;
2249 rth
->fl
.iif
= dev
->ifindex
;
2250 rth
->u
.dst
.dev
= net
->loopback_dev
;
2251 dev_hold(rth
->u
.dst
.dev
);
2252 rth
->idev
= in_dev_get(rth
->u
.dst
.dev
);
2253 rth
->rt_gateway
= daddr
;
2254 rth
->rt_spec_dst
= spec_dst
;
2255 rth
->u
.dst
.input
= ip_local_deliver
;
2256 rth
->rt_flags
= flags
|RTCF_LOCAL
;
2257 if (res
.type
== RTN_UNREACHABLE
) {
2258 rth
->u
.dst
.input
= ip_error
;
2259 rth
->u
.dst
.error
= -err
;
2260 rth
->rt_flags
&= ~RTCF_LOCAL
;
2262 rth
->rt_type
= res
.type
;
2263 hash
= rt_hash(daddr
, saddr
, fl
.iif
, rt_genid(net
));
2264 err
= rt_intern_hash(hash
, rth
, NULL
, skb
, fl
.iif
);
2268 RT_CACHE_STAT_INC(in_no_route
);
2269 spec_dst
= inet_select_addr(dev
, 0, RT_SCOPE_UNIVERSE
);
2270 res
.type
= RTN_UNREACHABLE
;
2276 * Do not cache martian addresses: they should be logged (RFC1812)
2278 martian_destination
:
2279 RT_CACHE_STAT_INC(in_martian_dst
);
2280 #ifdef CONFIG_IP_ROUTE_VERBOSE
2281 if (IN_DEV_LOG_MARTIANS(in_dev
) && net_ratelimit())
2282 printk(KERN_WARNING
"martian destination %pI4 from %pI4, dev %s\n",
2283 &daddr
, &saddr
, dev
->name
);
2287 err
= -EHOSTUNREACH
;
2299 ip_handle_martian_source(dev
, in_dev
, skb
, daddr
, saddr
);
2303 int ip_route_input(struct sk_buff
*skb
, __be32 daddr
, __be32 saddr
,
2304 u8 tos
, struct net_device
*dev
)
2306 struct rtable
* rth
;
2308 int iif
= dev
->ifindex
;
2313 if (!rt_caching(net
))
2316 tos
&= IPTOS_RT_MASK
;
2317 hash
= rt_hash(daddr
, saddr
, iif
, rt_genid(net
));
2320 for (rth
= rcu_dereference(rt_hash_table
[hash
].chain
); rth
;
2321 rth
= rcu_dereference(rth
->u
.dst
.rt_next
)) {
2322 if (((rth
->fl
.fl4_dst
^ daddr
) |
2323 (rth
->fl
.fl4_src
^ saddr
) |
2324 (rth
->fl
.iif
^ iif
) |
2326 (rth
->fl
.fl4_tos
^ tos
)) == 0 &&
2327 rth
->fl
.mark
== skb
->mark
&&
2328 net_eq(dev_net(rth
->u
.dst
.dev
), net
) &&
2329 !rt_is_expired(rth
)) {
2330 dst_use(&rth
->u
.dst
, jiffies
);
2331 RT_CACHE_STAT_INC(in_hit
);
2333 skb_dst_set(skb
, &rth
->u
.dst
);
2336 RT_CACHE_STAT_INC(in_hlist_search
);
	/* Multicast recognition logic is moved from the route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As a result a host on a multicast
	   network acquires a lot of useless route cache entries, e.g. from
	   SDR messages from all over the world. Now we try to get rid of them.
	   Really, provided the software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   compared with route cache reject entries.
	   Note that multicast routers are not affected, because a
	   route cache entry is created eventually.
	 */
2352 if (ipv4_is_multicast(daddr
)) {
2353 struct in_device
*in_dev
;
2356 if ((in_dev
= __in_dev_get_rcu(dev
)) != NULL
) {
2357 int our
= ip_check_mc(in_dev
, daddr
, saddr
,
2358 ip_hdr(skb
)->protocol
);
2360 #ifdef CONFIG_IP_MROUTE
2362 (!ipv4_is_local_multicast(daddr
) &&
2363 IN_DEV_MFORWARD(in_dev
))
2367 return ip_route_input_mc(skb
, daddr
, saddr
,
2374 return ip_route_input_slow(skb
, daddr
, saddr
, tos
, dev
);
static int __mkroute_output(struct rtable **result,
			    struct fib_result *res,
			    const struct flowi *fl,
			    const struct flowi *oldflp,
			    struct net_device *dev_out,
			    unsigned flags)
{
	struct rtable *rth;
	struct in_device *in_dev;
	u32 tos = RT_FL_TOS(oldflp);
	int err = 0;

	if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
		return -EINVAL;

	if (fl->fl4_dst == htonl(0xFFFFFFFF))
		res->type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl->fl4_dst))
		res->type = RTN_MULTICAST;
	else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
		return -EINVAL;

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	/* get work reference to inet device */
	in_dev = in_dev_get(dev_out);
	if (!in_dev)
		return -EINVAL;

	if (res->type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		if (res->fi) {
			fib_info_put(res->fi);
			res->fi = NULL;
		}
	} else if (res->type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
				 oldflp->proto))
			flags &= ~RTCF_LOCAL;
		/* If a multicast route does not exist, use the
		   default one, but do not gateway in this case.
		 */
		if (res->fi && res->prefixlen < 4) {
			fib_info_put(res->fi);
			res->fi = NULL;
		}
	}

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags = DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
		rth->u.dst.flags |= DST_NOXFRM;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;

	rth->fl.fl4_dst	= oldflp->fl4_dst;
	rth->fl.fl4_tos	= tos;
	rth->fl.fl4_src	= oldflp->fl4_src;
	rth->fl.oif	= oldflp->oif;
	rth->fl.mark	= oldflp->mark;
	rth->rt_dst	= fl->fl4_dst;
	rth->rt_src	= fl->fl4_src;
	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
	/* get references to the devices that are to be held by the routing
	   cache entry */
	rth->u.dst.dev	= dev_out;
	dev_hold(dev_out);
	rth->idev	= in_dev_get(dev_out);
	rth->rt_gateway = fl->fl4_dst;
	rth->rt_spec_dst = fl->fl4_src;

	rth->u.dst.output = ip_output;
	rth->u.dst.obsolete = -1;
	rth->rt_genid = rt_genid(dev_net(dev_out));

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->u.dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl->fl4_dst;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl->fl4_src;
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->u.dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (res->type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(oldflp->fl4_dst)) {
				rth->u.dst.input = ip_mr_input;
				rth->u.dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, res, 0);

	rth->rt_flags = flags;

	*result = rth;
 cleanup:
	/* release work reference to inet device */
	in_dev_put(in_dev);

	return err;
}
static int ip_mkroute_output(struct rtable **rp,
			     struct fib_result *res,
			     const struct flowi *fl,
			     const struct flowi *oldflp,
			     struct net_device *dev_out,
			     unsigned flags)
{
	struct rtable *rth = NULL;
	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
	unsigned hash;

	if (err == 0) {
		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
			       rt_genid(dev_net(dev_out)));
		err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
	}

	return err;
}
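
/*
 * ip_mkroute_output() is a thin wrapper: __mkroute_output() above builds the
 * rtable entry, and on success the entry is inserted into the route cache
 * under a hash of the caller's original flow (oldflp) via rt_intern_hash(),
 * which also hands the cached copy back through *rp.
 */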
/*
 * Major route resolver routine.
 */

static int ip_route_output_slow(struct net *net, struct rtable **rp,
				const struct flowi *oldflp)
{
	u32 tos	= RT_FL_TOS(oldflp);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = oldflp->fl4_dst,
					.saddr = oldflp->fl4_src,
					.tos = tos & IPTOS_RT_MASK,
					.scope = ((tos & RTO_ONLINK) ?
						  RT_SCOPE_LINK :
						  RT_SCOPE_UNIVERSE),
				      } },
			    .mark = oldflp->mark,
			    .iif = net->loopback_dev->ifindex,
			    .oif = oldflp->oif };
	struct fib_result res;
	unsigned flags = 0;
	struct net_device *dev_out = NULL;
	int free_res = 0;
	int err;

	res.fi		= NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r		= NULL;
#endif

	if (oldflp->fl4_src) {
		err = -EINVAL;
		if (ipv4_is_multicast(oldflp->fl4_src) ||
		    ipv4_is_lbcast(oldflp->fl4_src) ||
		    ipv4_is_zeronet(oldflp->fl4_src))
			goto out;

		/* I removed the check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return the wrong iface if
		      saddr is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with saddr
		      of another iface. --ANK
		 */

		if (oldflp->oif == 0 &&
		    (ipv4_is_multicast(oldflp->fl4_dst) ||
		     oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = ip_dev_find(net, oldflp->fl4_src);
			if (dev_out == NULL)
				goto out;

			/* Special hack: user can direct multicasts
			   and limited broadcast via the necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic, vat and friends to work.
			   They bind the socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of the routing cache they are
			   broken, because we are not allowed to build a
			   multicast path with a loopback source address
			   (the routing cache cannot know that the ttl is
			   zero, so the packet will not leave this host and
			   the route is valid).
			   Luckily, this hack is a good workaround.
			 */

			fl.oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = ip_dev_find(net, oldflp->fl4_src);
			if (dev_out == NULL)
				goto out;
			dev_put(dev_out);
			dev_out = NULL;
		}
	}

	if (oldflp->oif) {
		dev_out = dev_get_by_index(net, oldflp->oif);
		err = -ENODEV;
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (__in_dev_get_rtnl(dev_out) == NULL) {
			dev_put(dev_out);
			goto out;	/* Wrong error code */
		}

		if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
		    oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
			if (!fl.fl4_src)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl.fl4_src) {
			if (ipv4_is_multicast(oldflp->fl4_dst))
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      fl.fl4_scope);
			else if (!oldflp->fl4_dst)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl.fl4_dst) {
		fl.fl4_dst = fl.fl4_src;
		if (!fl.fl4_dst)
			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
		if (dev_out)
			dev_put(dev_out);
		dev_out = net->loopback_dev;
		dev_hold(dev_out);
		fl.oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(net, &fl, &res)) {
		res.fi = NULL;
		if (oldflp->oif) {
			/* Apparently, routing tables are wrong. Assume
			   that the destination is on-link.

			   Because we are allowed to send to an iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch if the destination is gatewayed rather
			   than direct. Moreover, if MSG_DONTROUTE is set,
			   we send the packet, ignoring both routing tables
			   and ifaddr state. --ANK

			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl.fl4_src == 0)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		if (dev_out)
			dev_put(dev_out);
		err = -ENETUNREACH;
		goto out;
	}
	free_res = 1;

	if (res.type == RTN_LOCAL) {
		if (!fl.fl4_src)
			fl.fl4_src = fl.fl4_dst;
		if (dev_out)
			dev_put(dev_out);
		dev_out = net->loopback_dev;
		dev_hold(dev_out);
		fl.oif = dev_out->ifindex;
		if (res.fi)
			fib_info_put(res.fi);
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl.oif == 0)
		fib_select_multipath(&fl, &res);
	else
#endif
	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
		fib_select_default(net, &fl, &res);

	if (!fl.fl4_src)
		fl.fl4_src = FIB_RES_PREFSRC(res);

	if (dev_out)
		dev_put(dev_out);
	dev_out = FIB_RES_DEV(res);
	dev_hold(dev_out);
	fl.oif = dev_out->ifindex;

make_route:
	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);

	if (free_res)
		fib_res_put(&res);
	if (dev_out)
		dev_put(dev_out);
out:	return err;
}
int __ip_route_output_key(struct net *net, struct rtable **rp,
			  const struct flowi *flp)
{
	unsigned hash;
	struct rtable *rth;

	if (!rt_caching(net))
		goto slow_output;

	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));

	rcu_read_lock_bh();
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference_bh(rth->u.dst.rt_next)) {
		if (rth->fl.fl4_dst == flp->fl4_dst &&
		    rth->fl.fl4_src == flp->fl4_src &&
		    rth->fl.iif == 0 &&
		    rth->fl.oif == flp->oif &&
		    rth->fl.mark == flp->mark &&
		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->u.dst.dev), net) &&
		    !rt_is_expired(rth)) {
			dst_use(&rth->u.dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			*rp = rth;
			return 0;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

slow_output:
	return ip_route_output_slow(net, rp, flp);
}

EXPORT_SYMBOL_GPL(__ip_route_output_key);
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			=	AF_INET,
	.protocol		=	cpu_to_be16(ETH_P_IP),
	.destroy		=	ipv4_dst_destroy,
	.check			=	ipv4_dst_check,
	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
	.entries		=	ATOMIC_INIT(0),
};
static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
{
	struct rtable *ort = *rp;
	struct rtable *rt = (struct rtable *)
		dst_alloc(&ipv4_dst_blackhole_ops);

	if (rt) {
		struct dst_entry *new = &rt->u.dst;

		atomic_set(&new->__refcnt, 1);
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard;
		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));

		new->dev = ort->u.dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->fl = ort->fl;

		rt->idev = ort->idev;
		if (rt->idev)
			in_dev_hold(rt->idev);
		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_iif = ort->rt_iif;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);

		dst_free(new);
	}

	dst_release(&(*rp)->u.dst);
	*rp = rt;
	return (rt ? 0 : -ENOMEM);
}
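
/*
 * ipv4_dst_blackhole() clones an existing route into a dst whose input and
 * output handlers simply discard packets.  ip_route_output_flow() below
 * switches to it when __xfrm_lookup() returns -EREMOTE (typically a
 * non-blocking lookup whose transformation states are not yet resolved), so
 * the caller still gets a valid, non-transmitting route instead of an error.
 */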
int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
			 struct sock *sk, int flags)
{
	int err;

	if ((err = __ip_route_output_key(net, rp, flp)) != 0)
		return err;

	if (flp->proto) {
		if (!flp->fl4_src)
			flp->fl4_src = (*rp)->rt_src;
		if (!flp->fl4_dst)
			flp->fl4_dst = (*rp)->rt_dst;
		err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
				    flags ? XFRM_LOOKUP_WAIT : 0);
		if (err == -EREMOTE)
			err = ipv4_dst_blackhole(net, rp, flp);

		return err;
	}

	return 0;
}

EXPORT_SYMBOL_GPL(ip_route_output_flow);
int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
{
	return ip_route_output_flow(net, rp, flp, NULL, 0);
}
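
/*
 * Usage sketch: a typical in-kernel caller resolves an output route by
 * filling a flow key and calling ip_route_output_key().  The values below
 * (dip, sip, tos, the protocol and oif) are placeholders for the example.
 *
 *	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dip,
 *						 .saddr = sip,
 *						 .tos   = RT_TOS(tos) } },
 *			    .oif   = 0,
 *			    .proto = IPPROTO_UDP };
 *	struct rtable *rt;
 *
 *	if (ip_route_output_key(net, &rt, &fl))
 *		return;				// no route to host
 *	// ... use rt->u.dst, rt->rt_src, rt->rt_gateway ...
 *	ip_rt_put(rt);				// drop the reference when done
 */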
static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	long expires;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->fl.fl4_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->fl.fl4_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
	}
	if (rt->u.dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
	if (rt->u.dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
#endif
	if (rt->fl.iif)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->fl.fl4_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
		goto nla_put_failure;

	error = rt->u.dst.error;
	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
	if (rt->peer) {
		id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
		if (rt->peer->tcp_ts_stamp) {
			ts = rt->peer->tcp_ts;
			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
		}
	}

	if (rt->fl.iif) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb, r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
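
/*
 * rt_fill_info() serializes one cached route into an RTM_NEWROUTE netlink
 * message; it is used both for single replies from inet_rtm_getroute()
 * below and for full dumps from ip_rt_dump().
 */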
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers; this skb can pass
	   through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		if (err == 0 && rt->u.dst.error)
			err = -rt->u.dst.error;
	} else {
		struct flowi fl = {
			.nl_u = {
				.ip4_u = {
					.daddr = dst,
					.saddr = src,
					.tos = rtm->rtm_tos,
				},
			},
			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
		};
		err = ip_route_output_key(net, &rt, &fl);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->u.dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
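
/*
 * inet_rtm_getroute() answers RTM_GETROUTE requests (e.g. "ip route get"):
 * with RTA_IIF set it simulates input routing on a dummy skb through
 * ip_route_input(); otherwise it performs an output lookup, and the
 * resulting cache entry is reported back with rt_fill_info().
 */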
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		rcu_read_lock_bh();
		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference_bh(rt->u.dst.rt_next), idx++) {
			if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			skb_dst_set(skb, dst_clone(&rt->u.dst));
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}

void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}
#ifdef CONFIG_SYSCTL
static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
				     void __user *buffer,
				     size_t *lenp, loff_t *ppos)
{
	if (write) {
		int flush_delay;
		ctl_table ctl;
		struct net *net;

		memcpy(&ctl, __ctl, sizeof(ctl));
		ctl.data = &flush_delay;
		proc_dointvec(&ctl, write, buffer, lenp, ppos);

		net = (struct net *)__ctl->extra1;
		rt_cache_flush(net, flush_delay);
		return 0;
	}

	return -EINVAL;
}
static void rt_secret_reschedule(int old)
{
	struct net *net;
	int new = ip_rt_secret_interval;
	int diff = new - old;

	if (!diff)
		return;

	rtnl_lock();
	for_each_net(net) {
		int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);
		long time;

		if (!new)
			continue;

		if (deleted) {
			time = net->ipv4.rt_secret_timer.expires - jiffies;

			if (time <= 0 || (time += diff) <= 0)
				time = 0;
		} else
			time = new;

		mod_timer(&net->ipv4.rt_secret_timer, jiffies + time);
	}
	rtnl_unlock();
}

static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
					  void __user *buffer, size_t *lenp,
					  loff_t *ppos)
{
	int old = ip_rt_secret_interval;
	int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);

	rt_secret_reschedule(old);

	return ret;
}
static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */
		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "secret_interval",
		.data		= &ip_rt_secret_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= ipv4_sysctl_rt_secret_interval,
	},
	{ }
};

static struct ctl_table empty[1];

static struct ctl_table ipv4_skeleton[] =
{
	{ .procname = "route",
	  .mode = 0555, .child = ipv4_route_table},
	{ .procname = "neigh",
	  .mode = 0555, .child = empty},
	{ }
};
static __net_initdata struct ctl_path ipv4_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ },
};

static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};

static __net_initdata struct ctl_path ipv4_route_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ .procname = "route", },
	{ },
};
static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr =
		register_net_sysctl_table(net, ipv4_route_path, tbl);
	if (net->ipv4.route_hdr == NULL)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif
static __net_init int rt_secret_timer_init(struct net *net)
{
	atomic_set(&net->ipv4.rt_genid,
			(int) ((num_physpages ^ (num_physpages>>8)) ^
			(jiffies ^ (jiffies >> 7))));

	net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
	net->ipv4.rt_secret_timer.data = (unsigned long)net;
	init_timer_deferrable(&net->ipv4.rt_secret_timer);

	if (ip_rt_secret_interval) {
		net->ipv4.rt_secret_timer.expires =
			jiffies + net_random() % ip_rt_secret_interval +
			ip_rt_secret_interval;
		add_timer(&net->ipv4.rt_secret_timer);
	}
	return 0;
}

static __net_exit void rt_secret_timer_exit(struct net *net)
{
	del_timer_sync(&net->ipv4.rt_secret_timer);
}

static __net_initdata struct pernet_operations rt_secret_timer_ops = {
	.init = rt_secret_timer_init,
	.exit = rt_secret_timer_exit,
};
#ifdef CONFIG_NET_CLS_ROUTE
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */

static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_NET_CLS_ROUTE
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct),
				    __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	/* All the timers started at system startup tend
	   to synchronize. Perturb them a bit.
	 */
	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
	expires_ljiffies = jiffies;
	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	if (register_pernet_subsys(&rt_secret_timer_ops))
		printk(KERN_ERR "Unable to setup rt_secret_timer\n");

	if (ip_rt_proc_init())
		printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	return rc;
}
#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif

EXPORT_SYMBOL(__ip_select_ident);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(ip_route_output_key);
);