2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <linux/prefetch.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
110 #include <linux/sysctl.h>
112 #include <net/secure_seq.h>
114 #define RT_FL_TOS(oldflp4) \
115 ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
117 #define IP_MAX_MTU 0xFFF0
119 #define RT_GC_TIMEOUT (300*HZ)
121 static int ip_rt_max_size
;
122 static int ip_rt_gc_timeout __read_mostly
= RT_GC_TIMEOUT
;
123 static int ip_rt_gc_interval __read_mostly
= 60 * HZ
;
124 static int ip_rt_gc_min_interval __read_mostly
= HZ
/ 2;
125 static int ip_rt_redirect_number __read_mostly
= 9;
126 static int ip_rt_redirect_load __read_mostly
= HZ
/ 50;
127 static int ip_rt_redirect_silence __read_mostly
= ((HZ
/ 50) << (9 + 1));
128 static int ip_rt_error_cost __read_mostly
= HZ
;
129 static int ip_rt_error_burst __read_mostly
= 5 * HZ
;
130 static int ip_rt_gc_elasticity __read_mostly
= 8;
131 static int ip_rt_mtu_expires __read_mostly
= 10 * 60 * HZ
;
132 static int ip_rt_min_pmtu __read_mostly
= 512 + 20 + 20;
133 static int ip_rt_min_advmss __read_mostly
= 256;
134 static int rt_chain_length_max __read_mostly
= 20;
136 static struct delayed_work expires_work
;
137 static unsigned long expires_ljiffies
;
140 * Interface to generic destination cache.
143 static struct dst_entry
*ipv4_dst_check(struct dst_entry
*dst
, u32 cookie
);
144 static unsigned int ipv4_default_advmss(const struct dst_entry
*dst
);
145 static unsigned int ipv4_default_mtu(const struct dst_entry
*dst
);
146 static void ipv4_dst_destroy(struct dst_entry
*dst
);
147 static struct dst_entry
*ipv4_negative_advice(struct dst_entry
*dst
);
148 static void ipv4_link_failure(struct sk_buff
*skb
);
149 static void ip_rt_update_pmtu(struct dst_entry
*dst
, u32 mtu
);
150 static int rt_garbage_collect(struct dst_ops
*ops
);
152 static void ipv4_dst_ifdown(struct dst_entry
*dst
, struct net_device
*dev
,
157 static u32
*ipv4_cow_metrics(struct dst_entry
*dst
, unsigned long old
)
159 struct rtable
*rt
= (struct rtable
*) dst
;
160 struct inet_peer
*peer
;
164 rt_bind_peer(rt
, rt
->rt_dst
, 1);
168 u32
*old_p
= __DST_METRICS_PTR(old
);
169 unsigned long prev
, new;
172 if (inet_metrics_new(peer
))
173 memcpy(p
, old_p
, sizeof(u32
) * RTAX_MAX
);
175 new = (unsigned long) p
;
176 prev
= cmpxchg(&dst
->_metrics
, old
, new);
179 p
= __DST_METRICS_PTR(prev
);
180 if (prev
& DST_METRICS_READ_ONLY
)
184 fib_info_put(rt
->fi
);
192 static struct dst_ops ipv4_dst_ops
= {
194 .protocol
= cpu_to_be16(ETH_P_IP
),
195 .gc
= rt_garbage_collect
,
196 .check
= ipv4_dst_check
,
197 .default_advmss
= ipv4_default_advmss
,
198 .default_mtu
= ipv4_default_mtu
,
199 .cow_metrics
= ipv4_cow_metrics
,
200 .destroy
= ipv4_dst_destroy
,
201 .ifdown
= ipv4_dst_ifdown
,
202 .negative_advice
= ipv4_negative_advice
,
203 .link_failure
= ipv4_link_failure
,
204 .update_pmtu
= ip_rt_update_pmtu
,
205 .local_out
= __ip_local_out
,
208 #define ECN_OR_COST(class) TC_PRIO_##class
210 const __u8 ip_tos2prio
[16] = {
212 ECN_OR_COST(BESTEFFORT
),
214 ECN_OR_COST(BESTEFFORT
),
220 ECN_OR_COST(INTERACTIVE
),
222 ECN_OR_COST(INTERACTIVE
),
223 TC_PRIO_INTERACTIVE_BULK
,
224 ECN_OR_COST(INTERACTIVE_BULK
),
225 TC_PRIO_INTERACTIVE_BULK
,
226 ECN_OR_COST(INTERACTIVE_BULK
)
234 /* The locking scheme is rather straightforward:
236 * 1) Read-Copy Update protects the buckets of the central route hash.
237 * 2) Only writers remove entries, and they hold the lock
238 * as they look at rtable reference counts.
239 * 3) Only readers acquire references to rtable entries,
240 * they do so with atomic increments and with the
244 struct rt_hash_bucket
{
245 struct rtable __rcu
*chain
;
248 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
249 defined(CONFIG_PROVE_LOCKING)
251 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
252 * The size of this table is a power of two and depends on the number of CPUS.
253 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
255 #ifdef CONFIG_LOCKDEP
256 # define RT_HASH_LOCK_SZ 256
259 # define RT_HASH_LOCK_SZ 4096
261 # define RT_HASH_LOCK_SZ 2048
263 # define RT_HASH_LOCK_SZ 1024
265 # define RT_HASH_LOCK_SZ 512
267 # define RT_HASH_LOCK_SZ 256
271 static spinlock_t
*rt_hash_locks
;
272 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
274 static __init
void rt_hash_lock_init(void)
278 rt_hash_locks
= kmalloc(sizeof(spinlock_t
) * RT_HASH_LOCK_SZ
,
281 panic("IP: failed to allocate rt_hash_locks\n");
283 for (i
= 0; i
< RT_HASH_LOCK_SZ
; i
++)
284 spin_lock_init(&rt_hash_locks
[i
]);
287 # define rt_hash_lock_addr(slot) NULL
289 static inline void rt_hash_lock_init(void)
294 static struct rt_hash_bucket
*rt_hash_table __read_mostly
;
295 static unsigned rt_hash_mask __read_mostly
;
296 static unsigned int rt_hash_log __read_mostly
;
298 static DEFINE_PER_CPU(struct rt_cache_stat
, rt_cache_stat
);
299 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
301 static inline unsigned int rt_hash(__be32 daddr
, __be32 saddr
, int idx
,
304 return jhash_3words((__force u32
)daddr
, (__force u32
)saddr
,
309 static inline int rt_genid(struct net
*net
)
311 return atomic_read(&net
->ipv4
.rt_genid
);
314 #ifdef CONFIG_PROC_FS
315 struct rt_cache_iter_state
{
316 struct seq_net_private p
;
321 static struct rtable
*rt_cache_get_first(struct seq_file
*seq
)
323 struct rt_cache_iter_state
*st
= seq
->private;
324 struct rtable
*r
= NULL
;
326 for (st
->bucket
= rt_hash_mask
; st
->bucket
>= 0; --st
->bucket
) {
327 if (!rcu_dereference_raw(rt_hash_table
[st
->bucket
].chain
))
330 r
= rcu_dereference_bh(rt_hash_table
[st
->bucket
].chain
);
332 if (dev_net(r
->dst
.dev
) == seq_file_net(seq
) &&
333 r
->rt_genid
== st
->genid
)
335 r
= rcu_dereference_bh(r
->dst
.rt_next
);
337 rcu_read_unlock_bh();
342 static struct rtable
*__rt_cache_get_next(struct seq_file
*seq
,
345 struct rt_cache_iter_state
*st
= seq
->private;
347 r
= rcu_dereference_bh(r
->dst
.rt_next
);
349 rcu_read_unlock_bh();
351 if (--st
->bucket
< 0)
353 } while (!rcu_dereference_raw(rt_hash_table
[st
->bucket
].chain
));
355 r
= rcu_dereference_bh(rt_hash_table
[st
->bucket
].chain
);
360 static struct rtable
*rt_cache_get_next(struct seq_file
*seq
,
363 struct rt_cache_iter_state
*st
= seq
->private;
364 while ((r
= __rt_cache_get_next(seq
, r
)) != NULL
) {
365 if (dev_net(r
->dst
.dev
) != seq_file_net(seq
))
367 if (r
->rt_genid
== st
->genid
)
373 static struct rtable
*rt_cache_get_idx(struct seq_file
*seq
, loff_t pos
)
375 struct rtable
*r
= rt_cache_get_first(seq
);
378 while (pos
&& (r
= rt_cache_get_next(seq
, r
)))
380 return pos
? NULL
: r
;
383 static void *rt_cache_seq_start(struct seq_file
*seq
, loff_t
*pos
)
385 struct rt_cache_iter_state
*st
= seq
->private;
387 return rt_cache_get_idx(seq
, *pos
- 1);
388 st
->genid
= rt_genid(seq_file_net(seq
));
389 return SEQ_START_TOKEN
;
392 static void *rt_cache_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
396 if (v
== SEQ_START_TOKEN
)
397 r
= rt_cache_get_first(seq
);
399 r
= rt_cache_get_next(seq
, v
);
404 static void rt_cache_seq_stop(struct seq_file
*seq
, void *v
)
406 if (v
&& v
!= SEQ_START_TOKEN
)
407 rcu_read_unlock_bh();
410 static int rt_cache_seq_show(struct seq_file
*seq
, void *v
)
412 if (v
== SEQ_START_TOKEN
)
413 seq_printf(seq
, "%-127s\n",
414 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
415 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
418 struct rtable
*r
= v
;
421 seq_printf(seq
, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
422 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
423 r
->dst
.dev
? r
->dst
.dev
->name
: "*",
424 (__force u32
)r
->rt_dst
,
425 (__force u32
)r
->rt_gateway
,
426 r
->rt_flags
, atomic_read(&r
->dst
.__refcnt
),
427 r
->dst
.__use
, 0, (__force u32
)r
->rt_src
,
428 dst_metric_advmss(&r
->dst
) + 40,
429 dst_metric(&r
->dst
, RTAX_WINDOW
),
430 (int)((dst_metric(&r
->dst
, RTAX_RTT
) >> 3) +
431 dst_metric(&r
->dst
, RTAX_RTTVAR
)),
433 r
->dst
.hh
? atomic_read(&r
->dst
.hh
->hh_refcnt
) : -1,
434 r
->dst
.hh
? (r
->dst
.hh
->hh_output
==
436 r
->rt_spec_dst
, &len
);
438 seq_printf(seq
, "%*s\n", 127 - len
, "");
443 static const struct seq_operations rt_cache_seq_ops
= {
444 .start
= rt_cache_seq_start
,
445 .next
= rt_cache_seq_next
,
446 .stop
= rt_cache_seq_stop
,
447 .show
= rt_cache_seq_show
,
450 static int rt_cache_seq_open(struct inode
*inode
, struct file
*file
)
452 return seq_open_net(inode
, file
, &rt_cache_seq_ops
,
453 sizeof(struct rt_cache_iter_state
));
456 static const struct file_operations rt_cache_seq_fops
= {
457 .owner
= THIS_MODULE
,
458 .open
= rt_cache_seq_open
,
461 .release
= seq_release_net
,
465 static void *rt_cpu_seq_start(struct seq_file
*seq
, loff_t
*pos
)
470 return SEQ_START_TOKEN
;
472 for (cpu
= *pos
-1; cpu
< nr_cpu_ids
; ++cpu
) {
473 if (!cpu_possible(cpu
))
476 return &per_cpu(rt_cache_stat
, cpu
);
481 static void *rt_cpu_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
485 for (cpu
= *pos
; cpu
< nr_cpu_ids
; ++cpu
) {
486 if (!cpu_possible(cpu
))
489 return &per_cpu(rt_cache_stat
, cpu
);
/* Nothing to release: the per-cpu stat walk takes no locks. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}
500 static int rt_cpu_seq_show(struct seq_file
*seq
, void *v
)
502 struct rt_cache_stat
*st
= v
;
504 if (v
== SEQ_START_TOKEN
) {
505 seq_printf(seq
, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
509 seq_printf(seq
,"%08x %08x %08x %08x %08x %08x %08x %08x "
510 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
511 dst_entries_get_slow(&ipv4_dst_ops
),
534 static const struct seq_operations rt_cpu_seq_ops
= {
535 .start
= rt_cpu_seq_start
,
536 .next
= rt_cpu_seq_next
,
537 .stop
= rt_cpu_seq_stop
,
538 .show
= rt_cpu_seq_show
,
542 static int rt_cpu_seq_open(struct inode
*inode
, struct file
*file
)
544 return seq_open(file
, &rt_cpu_seq_ops
);
547 static const struct file_operations rt_cpu_seq_fops
= {
548 .owner
= THIS_MODULE
,
549 .open
= rt_cpu_seq_open
,
552 .release
= seq_release
,
555 #ifdef CONFIG_IP_ROUTE_CLASSID
556 static int rt_acct_proc_show(struct seq_file
*m
, void *v
)
558 struct ip_rt_acct
*dst
, *src
;
561 dst
= kcalloc(256, sizeof(struct ip_rt_acct
), GFP_KERNEL
);
565 for_each_possible_cpu(i
) {
566 src
= (struct ip_rt_acct
*)per_cpu_ptr(ip_rt_acct
, i
);
567 for (j
= 0; j
< 256; j
++) {
568 dst
[j
].o_bytes
+= src
[j
].o_bytes
;
569 dst
[j
].o_packets
+= src
[j
].o_packets
;
570 dst
[j
].i_bytes
+= src
[j
].i_bytes
;
571 dst
[j
].i_packets
+= src
[j
].i_packets
;
575 seq_write(m
, dst
, 256 * sizeof(struct ip_rt_acct
));
580 static int rt_acct_proc_open(struct inode
*inode
, struct file
*file
)
582 return single_open(file
, rt_acct_proc_show
, NULL
);
585 static const struct file_operations rt_acct_proc_fops
= {
586 .owner
= THIS_MODULE
,
587 .open
= rt_acct_proc_open
,
590 .release
= single_release
,
594 static int __net_init
ip_rt_do_proc_init(struct net
*net
)
596 struct proc_dir_entry
*pde
;
598 pde
= proc_net_fops_create(net
, "rt_cache", S_IRUGO
,
603 pde
= proc_create("rt_cache", S_IRUGO
,
604 net
->proc_net_stat
, &rt_cpu_seq_fops
);
608 #ifdef CONFIG_IP_ROUTE_CLASSID
609 pde
= proc_create("rt_acct", 0, net
->proc_net
, &rt_acct_proc_fops
);
615 #ifdef CONFIG_IP_ROUTE_CLASSID
617 remove_proc_entry("rt_cache", net
->proc_net_stat
);
620 remove_proc_entry("rt_cache", net
->proc_net
);
625 static void __net_exit
ip_rt_do_proc_exit(struct net
*net
)
627 remove_proc_entry("rt_cache", net
->proc_net_stat
);
628 remove_proc_entry("rt_cache", net
->proc_net
);
629 #ifdef CONFIG_IP_ROUTE_CLASSID
630 remove_proc_entry("rt_acct", net
->proc_net
);
634 static struct pernet_operations ip_rt_proc_ops __net_initdata
= {
635 .init
= ip_rt_do_proc_init
,
636 .exit
= ip_rt_do_proc_exit
,
639 static int __init
ip_rt_proc_init(void)
641 return register_pernet_subsys(&ip_rt_proc_ops
);
645 static inline int ip_rt_proc_init(void)
649 #endif /* CONFIG_PROC_FS */
651 static inline void rt_free(struct rtable
*rt
)
653 call_rcu_bh(&rt
->dst
.rcu_head
, dst_rcu_free
);
656 static inline void rt_drop(struct rtable
*rt
)
659 call_rcu_bh(&rt
->dst
.rcu_head
, dst_rcu_free
);
662 static inline int rt_fast_clean(struct rtable
*rth
)
664 /* Kill broadcast/multicast entries very aggresively, if they
665 collide in hash table with more useful entries */
666 return (rth
->rt_flags
& (RTCF_BROADCAST
| RTCF_MULTICAST
)) &&
667 rt_is_input_route(rth
) && rth
->dst
.rt_next
;
670 static inline int rt_valuable(struct rtable
*rth
)
672 return (rth
->rt_flags
& (RTCF_REDIRECTED
| RTCF_NOTIFY
)) ||
673 (rth
->peer
&& rth
->peer
->pmtu_expires
);
676 static int rt_may_expire(struct rtable
*rth
, unsigned long tmo1
, unsigned long tmo2
)
681 if (atomic_read(&rth
->dst
.__refcnt
))
684 age
= jiffies
- rth
->dst
.lastuse
;
685 if ((age
<= tmo1
&& !rt_fast_clean(rth
)) ||
686 (age
<= tmo2
&& rt_valuable(rth
)))
692 /* Bits of score are:
694 * 30: not quite useless
695 * 29..0: usage counter
697 static inline u32
rt_score(struct rtable
*rt
)
699 u32 score
= jiffies
- rt
->dst
.lastuse
;
701 score
= ~score
& ~(3<<30);
706 if (rt_is_output_route(rt
) ||
707 !(rt
->rt_flags
& (RTCF_BROADCAST
|RTCF_MULTICAST
|RTCF_LOCAL
)))
713 static inline bool rt_caching(const struct net
*net
)
715 return net
->ipv4
.current_rt_cache_rebuild_count
<=
716 net
->ipv4
.sysctl_rt_cache_rebuild_count
;
719 static inline bool compare_hash_inputs(const struct rtable
*rt1
,
720 const struct rtable
*rt2
)
722 return ((((__force u32
)rt1
->rt_key_dst
^ (__force u32
)rt2
->rt_key_dst
) |
723 ((__force u32
)rt1
->rt_key_src
^ (__force u32
)rt2
->rt_key_src
) |
724 (rt1
->rt_route_iif
^ rt2
->rt_route_iif
)) == 0);
727 static inline int compare_keys(struct rtable
*rt1
, struct rtable
*rt2
)
729 return (((__force u32
)rt1
->rt_key_dst
^ (__force u32
)rt2
->rt_key_dst
) |
730 ((__force u32
)rt1
->rt_key_src
^ (__force u32
)rt2
->rt_key_src
) |
731 (rt1
->rt_mark
^ rt2
->rt_mark
) |
732 (rt1
->rt_key_tos
^ rt2
->rt_key_tos
) |
733 (rt1
->rt_route_iif
^ rt2
->rt_route_iif
) |
734 (rt1
->rt_oif
^ rt2
->rt_oif
)) == 0;
737 static inline int compare_netns(struct rtable
*rt1
, struct rtable
*rt2
)
739 return net_eq(dev_net(rt1
->dst
.dev
), dev_net(rt2
->dst
.dev
));
742 static inline int rt_is_expired(struct rtable
*rth
)
744 return rth
->rt_genid
!= rt_genid(dev_net(rth
->dst
.dev
));
748 * Perform a full scan of hash table and free all entries.
749 * Can be called by a softirq or a process.
750 * In the latter case, we want to be rescheduled if necessary
752 static void rt_do_flush(struct net
*net
, int process_context
)
755 struct rtable
*rth
, *next
;
757 for (i
= 0; i
<= rt_hash_mask
; i
++) {
758 struct rtable __rcu
**pprev
;
761 if (process_context
&& need_resched())
763 rth
= rcu_dereference_raw(rt_hash_table
[i
].chain
);
767 spin_lock_bh(rt_hash_lock_addr(i
));
770 pprev
= &rt_hash_table
[i
].chain
;
771 rth
= rcu_dereference_protected(*pprev
,
772 lockdep_is_held(rt_hash_lock_addr(i
)));
775 next
= rcu_dereference_protected(rth
->dst
.rt_next
,
776 lockdep_is_held(rt_hash_lock_addr(i
)));
779 net_eq(dev_net(rth
->dst
.dev
), net
)) {
780 rcu_assign_pointer(*pprev
, next
);
781 rcu_assign_pointer(rth
->dst
.rt_next
, list
);
784 pprev
= &rth
->dst
.rt_next
;
789 spin_unlock_bh(rt_hash_lock_addr(i
));
791 for (; list
; list
= next
) {
792 next
= rcu_dereference_protected(list
->dst
.rt_next
, 1);
799 * While freeing expired entries, we compute average chain length
800 * and standard deviation, using fixed-point arithmetic.
801 * This to have an estimation of rt_chain_length_max
802 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
803 * We use 3 bits for fractional part, and 29 (or 61) for magnitude.
807 #define ONE (1UL << FRACT_BITS)
810 * Given a hash chain and an item in this hash chain,
811 * find if a previous entry has the same hash_inputs
812 * (but differs on tos, mark or oif)
813 * Returns 0 if an alias is found.
814 * Returns ONE if rth has no alias before itself.
816 static int has_noalias(const struct rtable
*head
, const struct rtable
*rth
)
818 const struct rtable
*aux
= head
;
821 if (compare_hash_inputs(aux
, rth
))
823 aux
= rcu_dereference_protected(aux
->dst
.rt_next
, 1);
828 static void rt_check_expire(void)
830 static unsigned int rover
;
831 unsigned int i
= rover
, goal
;
833 struct rtable __rcu
**rthp
;
834 unsigned long samples
= 0;
835 unsigned long sum
= 0, sum2
= 0;
839 delta
= jiffies
- expires_ljiffies
;
840 expires_ljiffies
= jiffies
;
841 mult
= ((u64
)delta
) << rt_hash_log
;
842 if (ip_rt_gc_timeout
> 1)
843 do_div(mult
, ip_rt_gc_timeout
);
844 goal
= (unsigned int)mult
;
845 if (goal
> rt_hash_mask
)
846 goal
= rt_hash_mask
+ 1;
847 for (; goal
> 0; goal
--) {
848 unsigned long tmo
= ip_rt_gc_timeout
;
849 unsigned long length
;
851 i
= (i
+ 1) & rt_hash_mask
;
852 rthp
= &rt_hash_table
[i
].chain
;
859 if (rcu_dereference_raw(*rthp
) == NULL
)
862 spin_lock_bh(rt_hash_lock_addr(i
));
863 while ((rth
= rcu_dereference_protected(*rthp
,
864 lockdep_is_held(rt_hash_lock_addr(i
)))) != NULL
) {
865 prefetch(rth
->dst
.rt_next
);
866 if (rt_is_expired(rth
)) {
867 *rthp
= rth
->dst
.rt_next
;
871 if (rth
->dst
.expires
) {
872 /* Entry is expired even if it is in use */
873 if (time_before_eq(jiffies
, rth
->dst
.expires
)) {
876 rthp
= &rth
->dst
.rt_next
;
878 * We only count entries on
879 * a chain with equal hash inputs once
880 * so that entries for different QOS
881 * levels, and other non-hash input
882 * attributes don't unfairly skew
883 * the length computation
885 length
+= has_noalias(rt_hash_table
[i
].chain
, rth
);
888 } else if (!rt_may_expire(rth
, tmo
, ip_rt_gc_timeout
))
891 /* Cleanup aged off entries. */
892 *rthp
= rth
->dst
.rt_next
;
895 spin_unlock_bh(rt_hash_lock_addr(i
));
897 sum2
+= length
*length
;
900 unsigned long avg
= sum
/ samples
;
901 unsigned long sd
= int_sqrt(sum2
/ samples
- avg
*avg
);
902 rt_chain_length_max
= max_t(unsigned long,
904 (avg
+ 4*sd
) >> FRACT_BITS
);
910 * rt_worker_func() is run in process context.
911 * we call rt_check_expire() to scan part of the hash table
913 static void rt_worker_func(struct work_struct
*work
)
916 schedule_delayed_work(&expires_work
, ip_rt_gc_interval
);
920 * Perturbation of rt_genid by a small quantity [1..256]
921 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
922 * many times (2^24) without giving recent rt_genid.
923 * Jenkins hash is strong enough that little changes of rt_genid are OK.
925 static void rt_cache_invalidate(struct net
*net
)
927 unsigned char shuffle
;
929 get_random_bytes(&shuffle
, sizeof(shuffle
));
930 atomic_add(shuffle
+ 1U, &net
->ipv4
.rt_genid
);
934 * delay < 0 : invalidate cache (fast : entries will be deleted later)
935 * delay >= 0 : invalidate & flush cache (can be long)
/*
 * delay < 0  : invalidate cache (fast: entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(net, !in_softirq());
}
944 /* Flush previous cache invalidated entries from the cache */
/* Flush previously invalidated entries from the cache. */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}
950 static void rt_emergency_hash_rebuild(struct net
*net
)
953 printk(KERN_WARNING
"Route hash chain too long!\n");
954 rt_cache_invalidate(net
);
958 Short description of GC goals.
960 We want to build algorithm, which will keep routing cache
961 at some equilibrium point, when number of aged off entries
962 is kept approximately equal to newly generated ones.
964 Current expiration strength is variable "expire".
965 We try to adjust it dynamically, so that if networking
966 is idle expires is large enough to keep enough of warm entries,
967 and when load increases it reduces to limit cache size.
970 static int rt_garbage_collect(struct dst_ops
*ops
)
972 static unsigned long expire
= RT_GC_TIMEOUT
;
973 static unsigned long last_gc
;
975 static int equilibrium
;
977 struct rtable __rcu
**rthp
;
978 unsigned long now
= jiffies
;
980 int entries
= dst_entries_get_fast(&ipv4_dst_ops
);
983 * Garbage collection is pretty expensive,
984 * do not make it too frequently.
987 RT_CACHE_STAT_INC(gc_total
);
989 if (now
- last_gc
< ip_rt_gc_min_interval
&&
990 entries
< ip_rt_max_size
) {
991 RT_CACHE_STAT_INC(gc_ignored
);
995 entries
= dst_entries_get_slow(&ipv4_dst_ops
);
996 /* Calculate number of entries, which we want to expire now. */
997 goal
= entries
- (ip_rt_gc_elasticity
<< rt_hash_log
);
999 if (equilibrium
< ipv4_dst_ops
.gc_thresh
)
1000 equilibrium
= ipv4_dst_ops
.gc_thresh
;
1001 goal
= entries
- equilibrium
;
1003 equilibrium
+= min_t(unsigned int, goal
>> 1, rt_hash_mask
+ 1);
1004 goal
= entries
- equilibrium
;
1007 /* We are in dangerous area. Try to reduce cache really
1010 goal
= max_t(unsigned int, goal
>> 1, rt_hash_mask
+ 1);
1011 equilibrium
= entries
- goal
;
1014 if (now
- last_gc
>= ip_rt_gc_min_interval
)
1018 equilibrium
+= goal
;
1025 for (i
= rt_hash_mask
, k
= rover
; i
>= 0; i
--) {
1026 unsigned long tmo
= expire
;
1028 k
= (k
+ 1) & rt_hash_mask
;
1029 rthp
= &rt_hash_table
[k
].chain
;
1030 spin_lock_bh(rt_hash_lock_addr(k
));
1031 while ((rth
= rcu_dereference_protected(*rthp
,
1032 lockdep_is_held(rt_hash_lock_addr(k
)))) != NULL
) {
1033 if (!rt_is_expired(rth
) &&
1034 !rt_may_expire(rth
, tmo
, expire
)) {
1036 rthp
= &rth
->dst
.rt_next
;
1039 *rthp
= rth
->dst
.rt_next
;
1043 spin_unlock_bh(rt_hash_lock_addr(k
));
1052 /* Goal is not achieved. We stop process if:
1054 - if expire reduced to zero. Otherwise, expire is halved.
1055 - if table is not full.
1056 - if we are called from interrupt.
1057 - jiffies check is just fallback/debug loop breaker.
1058 We will not spin here for long time in any case.
1061 RT_CACHE_STAT_INC(gc_goal_miss
);
1068 if (dst_entries_get_fast(&ipv4_dst_ops
) < ip_rt_max_size
)
1070 } while (!in_softirq() && time_before_eq(jiffies
, now
));
1072 if (dst_entries_get_fast(&ipv4_dst_ops
) < ip_rt_max_size
)
1074 if (dst_entries_get_slow(&ipv4_dst_ops
) < ip_rt_max_size
)
1076 if (net_ratelimit())
1077 printk(KERN_WARNING
"dst cache overflow\n");
1078 RT_CACHE_STAT_INC(gc_dst_overflow
);
1082 expire
+= ip_rt_gc_min_interval
;
1083 if (expire
> ip_rt_gc_timeout
||
1084 dst_entries_get_fast(&ipv4_dst_ops
) < ipv4_dst_ops
.gc_thresh
||
1085 dst_entries_get_slow(&ipv4_dst_ops
) < ipv4_dst_ops
.gc_thresh
)
1086 expire
= ip_rt_gc_timeout
;
1091 * Returns number of entries in a hash chain that have different hash_inputs
1093 static int slow_chain_length(const struct rtable
*head
)
1096 const struct rtable
*rth
= head
;
1099 length
+= has_noalias(head
, rth
);
1100 rth
= rcu_dereference_protected(rth
->dst
.rt_next
, 1);
1102 return length
>> FRACT_BITS
;
1105 static struct rtable
*rt_intern_hash(unsigned hash
, struct rtable
*rt
,
1106 struct sk_buff
*skb
, int ifindex
)
1108 struct rtable
*rth
, *cand
;
1109 struct rtable __rcu
**rthp
, **candp
;
1113 int attempts
= !in_softirq();
1117 min_score
= ~(u32
)0;
1122 if (!rt_caching(dev_net(rt
->dst
.dev
))) {
1124 * If we're not caching, just tell the caller we
1125 * were successful and don't touch the route. The
1126 * caller hold the sole reference to the cache entry, and
1127 * it will be released when the caller is done with it.
1128 * If we drop it here, the callers have no way to resolve routes
1129 * when we're not caching. Instead, just point *rp at rt, so
1130 * the caller gets a single use out of the route
1131 * Note that we do rt_free on this new route entry, so that
1132 * once its refcount hits zero, we are still able to reap it
1134 * Note: To avoid expensive rcu stuff for this uncached dst,
1135 * we set DST_NOCACHE so that dst_release() can free dst without
1136 * waiting a grace period.
1139 rt
->dst
.flags
|= DST_NOCACHE
;
1140 if (rt
->rt_type
== RTN_UNICAST
|| rt_is_output_route(rt
)) {
1141 int err
= arp_bind_neighbour(&rt
->dst
);
1143 if (net_ratelimit())
1145 "Neighbour table failure & not caching routes.\n");
1147 return ERR_PTR(err
);
1154 rthp
= &rt_hash_table
[hash
].chain
;
1156 spin_lock_bh(rt_hash_lock_addr(hash
));
1157 while ((rth
= rcu_dereference_protected(*rthp
,
1158 lockdep_is_held(rt_hash_lock_addr(hash
)))) != NULL
) {
1159 if (rt_is_expired(rth
)) {
1160 *rthp
= rth
->dst
.rt_next
;
1164 if (compare_keys(rth
, rt
) && compare_netns(rth
, rt
)) {
1166 *rthp
= rth
->dst
.rt_next
;
1168 * Since lookup is lockfree, the deletion
1169 * must be visible to another weakly ordered CPU before
1170 * the insertion at the start of the hash chain.
1172 rcu_assign_pointer(rth
->dst
.rt_next
,
1173 rt_hash_table
[hash
].chain
);
1175 * Since lookup is lockfree, the update writes
1176 * must be ordered for consistency on SMP.
1178 rcu_assign_pointer(rt_hash_table
[hash
].chain
, rth
);
1180 dst_use(&rth
->dst
, now
);
1181 spin_unlock_bh(rt_hash_lock_addr(hash
));
1185 skb_dst_set(skb
, &rth
->dst
);
1189 if (!atomic_read(&rth
->dst
.__refcnt
)) {
1190 u32 score
= rt_score(rth
);
1192 if (score
<= min_score
) {
1201 rthp
= &rth
->dst
.rt_next
;
1205 /* ip_rt_gc_elasticity used to be average length of chain
1206 * length, when exceeded gc becomes really aggressive.
1208 * The second limit is less certain. At the moment it allows
1209 * only 2 entries per bucket. We will see.
1211 if (chain_length
> ip_rt_gc_elasticity
) {
1212 *candp
= cand
->dst
.rt_next
;
1216 if (chain_length
> rt_chain_length_max
&&
1217 slow_chain_length(rt_hash_table
[hash
].chain
) > rt_chain_length_max
) {
1218 struct net
*net
= dev_net(rt
->dst
.dev
);
1219 int num
= ++net
->ipv4
.current_rt_cache_rebuild_count
;
1220 if (!rt_caching(net
)) {
1221 printk(KERN_WARNING
"%s: %d rebuilds is over limit, route caching disabled\n",
1222 rt
->dst
.dev
->name
, num
);
1224 rt_emergency_hash_rebuild(net
);
1225 spin_unlock_bh(rt_hash_lock_addr(hash
));
1227 hash
= rt_hash(rt
->rt_key_dst
, rt
->rt_key_src
,
1228 ifindex
, rt_genid(net
));
1233 /* Try to bind route to arp only if it is output
1234 route or unicast forwarding path.
1236 if (rt
->rt_type
== RTN_UNICAST
|| rt_is_output_route(rt
)) {
1237 int err
= arp_bind_neighbour(&rt
->dst
);
1239 spin_unlock_bh(rt_hash_lock_addr(hash
));
1241 if (err
!= -ENOBUFS
) {
1243 return ERR_PTR(err
);
1246 /* Neighbour tables are full and nothing
1247 can be released. Try to shrink route cache,
1248 it is most likely it holds some neighbour records.
1250 if (attempts
-- > 0) {
1251 int saved_elasticity
= ip_rt_gc_elasticity
;
1252 int saved_int
= ip_rt_gc_min_interval
;
1253 ip_rt_gc_elasticity
= 1;
1254 ip_rt_gc_min_interval
= 0;
1255 rt_garbage_collect(&ipv4_dst_ops
);
1256 ip_rt_gc_min_interval
= saved_int
;
1257 ip_rt_gc_elasticity
= saved_elasticity
;
1261 if (net_ratelimit())
1262 printk(KERN_WARNING
"ipv4: Neighbour table overflow.\n");
1264 return ERR_PTR(-ENOBUFS
);
1268 rt
->dst
.rt_next
= rt_hash_table
[hash
].chain
;
1271 * Since lookup is lockfree, we must make sure
1272 * previous writes to rt are committed to memory
1273 * before making rt visible to other CPUS.
1275 rcu_assign_pointer(rt_hash_table
[hash
].chain
, rt
);
1277 spin_unlock_bh(rt_hash_lock_addr(hash
));
1281 skb_dst_set(skb
, &rt
->dst
);
1285 static atomic_t __rt_peer_genid
= ATOMIC_INIT(0);
1287 static u32
rt_peer_genid(void)
1289 return atomic_read(&__rt_peer_genid
);
1292 void rt_bind_peer(struct rtable
*rt
, __be32 daddr
, int create
)
1294 struct inet_peer
*peer
;
1296 peer
= inet_getpeer_v4(daddr
, create
);
1298 if (peer
&& cmpxchg(&rt
->peer
, NULL
, peer
) != NULL
)
1301 rt
->rt_peer_genid
= rt_peer_genid();
1305 * Peer allocation may fail only in serious out-of-memory conditions. However
1306 * we still can generate some output.
1307 * Random ID selection looks a bit dangerous because we have no chances to
1308 * select ID being unique in a reasonable period of time.
1309 * But broken packet identifier may be better than no packet at all.
1311 static void ip_select_fb_ident(struct iphdr
*iph
)
1313 static DEFINE_SPINLOCK(ip_fb_id_lock
);
1314 static u32 ip_fallback_id
;
1317 spin_lock_bh(&ip_fb_id_lock
);
1318 salt
= secure_ip_id((__force __be32
)ip_fallback_id
^ iph
->daddr
);
1319 iph
->id
= htons(salt
& 0xFFFF);
1320 ip_fallback_id
= salt
;
1321 spin_unlock_bh(&ip_fb_id_lock
);
1324 void __ip_select_ident(struct iphdr
*iph
, struct dst_entry
*dst
, int more
)
1326 struct rtable
*rt
= (struct rtable
*) dst
;
1329 if (rt
->peer
== NULL
)
1330 rt_bind_peer(rt
, rt
->rt_dst
, 1);
1332 /* If peer is attached to destination, it is never detached,
1333 so that we need not to grab a lock to dereference it.
1336 iph
->id
= htons(inet_getid(rt
->peer
, more
));
1340 printk(KERN_DEBUG
"rt_bind_peer(0) @%p\n",
1341 __builtin_return_address(0));
1343 ip_select_fb_ident(iph
);
1345 EXPORT_SYMBOL(__ip_select_ident
);
1347 static void rt_del(unsigned hash
, struct rtable
*rt
)
1349 struct rtable __rcu
**rthp
;
1352 rthp
= &rt_hash_table
[hash
].chain
;
1353 spin_lock_bh(rt_hash_lock_addr(hash
));
1355 while ((aux
= rcu_dereference_protected(*rthp
,
1356 lockdep_is_held(rt_hash_lock_addr(hash
)))) != NULL
) {
1357 if (aux
== rt
|| rt_is_expired(aux
)) {
1358 *rthp
= aux
->dst
.rt_next
;
1362 rthp
= &aux
->dst
.rt_next
;
1364 spin_unlock_bh(rt_hash_lock_addr(hash
));
1367 /* called in rcu_read_lock() section */
1368 void ip_rt_redirect(__be32 old_gw
, __be32 daddr
, __be32 new_gw
,
1369 __be32 saddr
, struct net_device
*dev
)
1371 struct in_device
*in_dev
= __in_dev_get_rcu(dev
);
1372 struct inet_peer
*peer
;
1379 if (new_gw
== old_gw
|| !IN_DEV_RX_REDIRECTS(in_dev
) ||
1380 ipv4_is_multicast(new_gw
) || ipv4_is_lbcast(new_gw
) ||
1381 ipv4_is_zeronet(new_gw
))
1382 goto reject_redirect
;
1384 if (!IN_DEV_SHARED_MEDIA(in_dev
)) {
1385 if (!inet_addr_onlink(in_dev
, new_gw
, old_gw
))
1386 goto reject_redirect
;
1387 if (IN_DEV_SEC_REDIRECTS(in_dev
) && ip_fib_check_default(new_gw
, dev
))
1388 goto reject_redirect
;
1390 if (inet_addr_type(net
, new_gw
) != RTN_UNICAST
)
1391 goto reject_redirect
;
1394 peer
= inet_getpeer_v4(daddr
, 1);
1396 peer
->redirect_learned
.a4
= new_gw
;
1400 atomic_inc(&__rt_peer_genid
);
1405 #ifdef CONFIG_IP_ROUTE_VERBOSE
1406 if (IN_DEV_LOG_MARTIANS(in_dev
) && net_ratelimit())
1407 printk(KERN_INFO
"Redirect from %pI4 on %s about %pI4 ignored.\n"
1408 " Advised path = %pI4 -> %pI4\n",
1409 &old_gw
, dev
->name
, &new_gw
,
1415 static bool peer_pmtu_expired(struct inet_peer
*peer
)
1417 unsigned long orig
= ACCESS_ONCE(peer
->pmtu_expires
);
1420 time_after_eq(jiffies
, orig
) &&
1421 cmpxchg(&peer
->pmtu_expires
, orig
, 0) == orig
;
1424 static bool peer_pmtu_cleaned(struct inet_peer
*peer
)
1426 unsigned long orig
= ACCESS_ONCE(peer
->pmtu_expires
);
1429 cmpxchg(&peer
->pmtu_expires
, orig
, 0) == orig
;
1432 static struct dst_entry
*ipv4_negative_advice(struct dst_entry
*dst
)
1434 struct rtable
*rt
= (struct rtable
*)dst
;
1435 struct dst_entry
*ret
= dst
;
1438 if (dst
->obsolete
> 0) {
1441 } else if (rt
->rt_flags
& RTCF_REDIRECTED
) {
1442 unsigned hash
= rt_hash(rt
->rt_key_dst
, rt
->rt_key_src
,
1444 rt_genid(dev_net(dst
->dev
)));
1447 } else if (rt
->peer
&& peer_pmtu_expired(rt
->peer
)) {
1448 dst_metric_set(dst
, RTAX_MTU
, rt
->peer
->pmtu_orig
);
1456 * 1. The first ip_rt_redirect_number redirects are sent
1457 * with exponential backoff, then we stop sending them at all,
1458 * assuming that the host ignores our redirects.
1459 * 2. If we did not see packets requiring redirects
1460 * during ip_rt_redirect_silence, we assume that the host
1461 * forgot redirected route and start to send redirects again.
1463 * This algorithm is much cheaper and more intelligent than dumb load limiting
1466 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1467 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1470 void ip_rt_send_redirect(struct sk_buff
*skb
)
1472 struct rtable
*rt
= skb_rtable(skb
);
1473 struct in_device
*in_dev
;
1474 struct inet_peer
*peer
;
1478 in_dev
= __in_dev_get_rcu(rt
->dst
.dev
);
1479 if (!in_dev
|| !IN_DEV_TX_REDIRECTS(in_dev
)) {
1483 log_martians
= IN_DEV_LOG_MARTIANS(in_dev
);
1487 rt_bind_peer(rt
, rt
->rt_dst
, 1);
1490 icmp_send(skb
, ICMP_REDIRECT
, ICMP_REDIR_HOST
, rt
->rt_gateway
);
1494 /* No redirected packets during ip_rt_redirect_silence;
1495 * reset the algorithm.
1497 if (time_after(jiffies
, peer
->rate_last
+ ip_rt_redirect_silence
))
1498 peer
->rate_tokens
= 0;
1500 /* Too many ignored redirects; do not send anything
1501 * set dst.rate_last to the last seen redirected packet.
1503 if (peer
->rate_tokens
>= ip_rt_redirect_number
) {
1504 peer
->rate_last
= jiffies
;
1508 /* Check for load limit; set rate_last to the latest sent
1511 if (peer
->rate_tokens
== 0 ||
1514 (ip_rt_redirect_load
<< peer
->rate_tokens
)))) {
1515 icmp_send(skb
, ICMP_REDIRECT
, ICMP_REDIR_HOST
, rt
->rt_gateway
);
1516 peer
->rate_last
= jiffies
;
1517 ++peer
->rate_tokens
;
1518 #ifdef CONFIG_IP_ROUTE_VERBOSE
1520 peer
->rate_tokens
== ip_rt_redirect_number
&&
1522 printk(KERN_WARNING
"host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1523 &ip_hdr(skb
)->saddr
, rt
->rt_iif
,
1524 &rt
->rt_dst
, &rt
->rt_gateway
);
1529 static int ip_error(struct sk_buff
*skb
)
1531 struct rtable
*rt
= skb_rtable(skb
);
1532 struct inet_peer
*peer
;
1537 switch (rt
->dst
.error
) {
1542 code
= ICMP_HOST_UNREACH
;
1545 code
= ICMP_NET_UNREACH
;
1546 IP_INC_STATS_BH(dev_net(rt
->dst
.dev
),
1547 IPSTATS_MIB_INNOROUTES
);
1550 code
= ICMP_PKT_FILTERED
;
1555 rt_bind_peer(rt
, rt
->rt_dst
, 1);
1561 peer
->rate_tokens
+= now
- peer
->rate_last
;
1562 if (peer
->rate_tokens
> ip_rt_error_burst
)
1563 peer
->rate_tokens
= ip_rt_error_burst
;
1564 peer
->rate_last
= now
;
1565 if (peer
->rate_tokens
>= ip_rt_error_cost
)
1566 peer
->rate_tokens
-= ip_rt_error_cost
;
1571 icmp_send(skb
, ICMP_DEST_UNREACH
, code
, 0);
1573 out
: kfree_skb(skb
);
/* Common link-MTU "plateau" values used to guess a path MTU when the
 * ICMP frag-needed message carries no usable next-hop MTU (RFC 1191).
 * The last two values are not from the RFC but
 * are needed for AMPRnet AX.25 paths.
 */
static const unsigned short mtu_plateau[] =
	{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1585 static inline unsigned short guess_mtu(unsigned short old_mtu
)
1589 for (i
= 0; i
< ARRAY_SIZE(mtu_plateau
); i
++)
1590 if (old_mtu
> mtu_plateau
[i
])
1591 return mtu_plateau
[i
];
1595 unsigned short ip_rt_frag_needed(struct net
*net
, const struct iphdr
*iph
,
1596 unsigned short new_mtu
,
1597 struct net_device
*dev
)
1599 unsigned short old_mtu
= ntohs(iph
->tot_len
);
1600 unsigned short est_mtu
= 0;
1601 struct inet_peer
*peer
;
1603 peer
= inet_getpeer_v4(iph
->daddr
, 1);
1605 unsigned short mtu
= new_mtu
;
1607 if (new_mtu
< 68 || new_mtu
>= old_mtu
) {
1608 /* BSD 4.2 derived systems incorrectly adjust
1609 * tot_len by the IP header length, and report
1610 * a zero MTU in the ICMP message.
1613 old_mtu
>= 68 + (iph
->ihl
<< 2))
1614 old_mtu
-= iph
->ihl
<< 2;
1615 mtu
= guess_mtu(old_mtu
);
1618 if (mtu
< ip_rt_min_pmtu
)
1619 mtu
= ip_rt_min_pmtu
;
1620 if (!peer
->pmtu_expires
|| mtu
< peer
->pmtu_learned
) {
1621 unsigned long pmtu_expires
;
1623 pmtu_expires
= jiffies
+ ip_rt_mtu_expires
;
1628 peer
->pmtu_learned
= mtu
;
1629 peer
->pmtu_expires
= pmtu_expires
;
1634 atomic_inc(&__rt_peer_genid
);
1636 return est_mtu
? : new_mtu
;
1639 static void check_peer_pmtu(struct dst_entry
*dst
, struct inet_peer
*peer
)
1641 unsigned long expires
= ACCESS_ONCE(peer
->pmtu_expires
);
1645 if (time_before(jiffies
, expires
)) {
1646 u32 orig_dst_mtu
= dst_mtu(dst
);
1647 if (peer
->pmtu_learned
< orig_dst_mtu
) {
1648 if (!peer
->pmtu_orig
)
1649 peer
->pmtu_orig
= dst_metric_raw(dst
, RTAX_MTU
);
1650 dst_metric_set(dst
, RTAX_MTU
, peer
->pmtu_learned
);
1652 } else if (cmpxchg(&peer
->pmtu_expires
, expires
, 0) == expires
)
1653 dst_metric_set(dst
, RTAX_MTU
, peer
->pmtu_orig
);
1656 static void ip_rt_update_pmtu(struct dst_entry
*dst
, u32 mtu
)
1658 struct rtable
*rt
= (struct rtable
*) dst
;
1659 struct inet_peer
*peer
;
1664 rt_bind_peer(rt
, rt
->rt_dst
, 1);
1667 unsigned long pmtu_expires
= ACCESS_ONCE(peer
->pmtu_expires
);
1669 if (mtu
< ip_rt_min_pmtu
)
1670 mtu
= ip_rt_min_pmtu
;
1671 if (!pmtu_expires
|| mtu
< peer
->pmtu_learned
) {
1673 pmtu_expires
= jiffies
+ ip_rt_mtu_expires
;
1677 peer
->pmtu_learned
= mtu
;
1678 peer
->pmtu_expires
= pmtu_expires
;
1680 atomic_inc(&__rt_peer_genid
);
1681 rt
->rt_peer_genid
= rt_peer_genid();
1683 check_peer_pmtu(dst
, peer
);
1687 static int check_peer_redir(struct dst_entry
*dst
, struct inet_peer
*peer
)
1689 struct rtable
*rt
= (struct rtable
*) dst
;
1690 __be32 orig_gw
= rt
->rt_gateway
;
1692 dst_confirm(&rt
->dst
);
1694 neigh_release(rt
->dst
.neighbour
);
1695 rt
->dst
.neighbour
= NULL
;
1697 rt
->rt_gateway
= peer
->redirect_learned
.a4
;
1698 if (arp_bind_neighbour(&rt
->dst
) ||
1699 !(rt
->dst
.neighbour
->nud_state
& NUD_VALID
)) {
1700 if (rt
->dst
.neighbour
)
1701 neigh_event_send(rt
->dst
.neighbour
, NULL
);
1702 rt
->rt_gateway
= orig_gw
;
1705 rt
->rt_flags
|= RTCF_REDIRECTED
;
1706 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE
,
1712 static struct dst_entry
*ipv4_dst_check(struct dst_entry
*dst
, u32 cookie
)
1714 struct rtable
*rt
= (struct rtable
*) dst
;
1716 if (rt_is_expired(rt
))
1718 if (rt
->rt_peer_genid
!= rt_peer_genid()) {
1719 struct inet_peer
*peer
;
1722 rt_bind_peer(rt
, rt
->rt_dst
, 0);
1726 check_peer_pmtu(dst
, peer
);
1728 if (peer
->redirect_learned
.a4
&&
1729 peer
->redirect_learned
.a4
!= rt
->rt_gateway
) {
1730 if (check_peer_redir(dst
, peer
))
1735 rt
->rt_peer_genid
= rt_peer_genid();
1740 static void ipv4_dst_destroy(struct dst_entry
*dst
)
1742 struct rtable
*rt
= (struct rtable
*) dst
;
1743 struct inet_peer
*peer
= rt
->peer
;
1746 fib_info_put(rt
->fi
);
1756 static void ipv4_link_failure(struct sk_buff
*skb
)
1760 icmp_send(skb
, ICMP_DEST_UNREACH
, ICMP_HOST_UNREACH
, 0);
1762 rt
= skb_rtable(skb
);
1763 if (rt
&& rt
->peer
&& peer_pmtu_cleaned(rt
->peer
))
1764 dst_metric_set(&rt
->dst
, RTAX_MTU
, rt
->peer
->pmtu_orig
);
1767 static int ip_rt_bug(struct sk_buff
*skb
)
1769 printk(KERN_DEBUG
"ip_rt_bug: %pI4 -> %pI4, %s\n",
1770 &ip_hdr(skb
)->saddr
, &ip_hdr(skb
)->daddr
,
1771 skb
->dev
? skb
->dev
->name
: "?");
1778 We do not cache source address of outgoing interface,
1779 because it is used only by IP RR, TS and SRR options,
1780 so that it out of fast path.
1782 BTW remember: "addr" is allowed to be not aligned
1786 void ip_rt_get_source(u8
*addr
, struct sk_buff
*skb
, struct rtable
*rt
)
1790 if (rt_is_output_route(rt
))
1791 src
= ip_hdr(skb
)->saddr
;
1793 struct fib_result res
;
1799 memset(&fl4
, 0, sizeof(fl4
));
1800 fl4
.daddr
= iph
->daddr
;
1801 fl4
.saddr
= iph
->saddr
;
1802 fl4
.flowi4_tos
= RT_TOS(iph
->tos
);
1803 fl4
.flowi4_oif
= rt
->dst
.dev
->ifindex
;
1804 fl4
.flowi4_iif
= skb
->dev
->ifindex
;
1805 fl4
.flowi4_mark
= skb
->mark
;
1808 if (fib_lookup(dev_net(rt
->dst
.dev
), &fl4
, &res
) == 0)
1809 src
= FIB_RES_PREFSRC(dev_net(rt
->dst
.dev
), res
);
1811 src
= inet_select_addr(rt
->dst
.dev
, rt
->rt_gateway
,
1815 memcpy(addr
, &src
, 4);
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Merge a routing classid into the dst: the low 16 bits and high 16 bits
 * are filled independently, and only if not already set.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif
1828 static unsigned int ipv4_default_advmss(const struct dst_entry
*dst
)
1830 unsigned int advmss
= dst_metric_raw(dst
, RTAX_ADVMSS
);
1833 advmss
= max_t(unsigned int, dst
->dev
->mtu
- 40,
1835 if (advmss
> 65535 - 40)
1836 advmss
= 65535 - 40;
1841 static unsigned int ipv4_default_mtu(const struct dst_entry
*dst
)
1843 unsigned int mtu
= dst
->dev
->mtu
;
1845 if (unlikely(dst_metric_locked(dst
, RTAX_MTU
))) {
1846 const struct rtable
*rt
= (const struct rtable
*) dst
;
1848 if (rt
->rt_gateway
!= rt
->rt_dst
&& mtu
> 576)
1852 if (mtu
> IP_MAX_MTU
)
1858 static void rt_init_metrics(struct rtable
*rt
, const struct flowi4
*fl4
,
1859 struct fib_info
*fi
)
1861 struct inet_peer
*peer
;
1864 /* If a peer entry exists for this destination, we must hook
1865 * it up in order to get at cached metrics.
1867 if (fl4
&& (fl4
->flowi4_flags
& FLOWI_FLAG_PRECOW_METRICS
))
1870 rt
->peer
= peer
= inet_getpeer_v4(rt
->rt_dst
, create
);
1872 rt
->rt_peer_genid
= rt_peer_genid();
1873 if (inet_metrics_new(peer
))
1874 memcpy(peer
->metrics
, fi
->fib_metrics
,
1875 sizeof(u32
) * RTAX_MAX
);
1876 dst_init_metrics(&rt
->dst
, peer
->metrics
, false);
1878 check_peer_pmtu(&rt
->dst
, peer
);
1879 if (peer
->redirect_learned
.a4
&&
1880 peer
->redirect_learned
.a4
!= rt
->rt_gateway
) {
1881 rt
->rt_gateway
= peer
->redirect_learned
.a4
;
1882 rt
->rt_flags
|= RTCF_REDIRECTED
;
1885 if (fi
->fib_metrics
!= (u32
*) dst_default_metrics
) {
1887 atomic_inc(&fi
->fib_clntref
);
1889 dst_init_metrics(&rt
->dst
, fi
->fib_metrics
, true);
1893 static void rt_set_nexthop(struct rtable
*rt
, const struct flowi4
*fl4
,
1894 const struct fib_result
*res
,
1895 struct fib_info
*fi
, u16 type
, u32 itag
)
1897 struct dst_entry
*dst
= &rt
->dst
;
1900 if (FIB_RES_GW(*res
) &&
1901 FIB_RES_NH(*res
).nh_scope
== RT_SCOPE_LINK
)
1902 rt
->rt_gateway
= FIB_RES_GW(*res
);
1903 rt_init_metrics(rt
, fl4
, fi
);
1904 #ifdef CONFIG_IP_ROUTE_CLASSID
1905 dst
->tclassid
= FIB_RES_NH(*res
).nh_tclassid
;
1909 if (dst_mtu(dst
) > IP_MAX_MTU
)
1910 dst_metric_set(dst
, RTAX_MTU
, IP_MAX_MTU
);
1911 if (dst_metric_raw(dst
, RTAX_ADVMSS
) > 65535 - 40)
1912 dst_metric_set(dst
, RTAX_ADVMSS
, 65535 - 40);
1914 #ifdef CONFIG_IP_ROUTE_CLASSID
1915 #ifdef CONFIG_IP_MULTIPLE_TABLES
1916 set_class_tag(rt
, fib_rules_tclass(res
));
1918 set_class_tag(rt
, itag
);
1922 static struct rtable
*rt_dst_alloc(struct net_device
*dev
,
1923 bool nopolicy
, bool noxfrm
)
1925 return dst_alloc(&ipv4_dst_ops
, dev
, 1, -1,
1927 (nopolicy
? DST_NOPOLICY
: 0) |
1928 (noxfrm
? DST_NOXFRM
: 0));
1931 /* called in rcu_read_lock() section */
1932 static int ip_route_input_mc(struct sk_buff
*skb
, __be32 daddr
, __be32 saddr
,
1933 u8 tos
, struct net_device
*dev
, int our
)
1938 struct in_device
*in_dev
= __in_dev_get_rcu(dev
);
1942 /* Primary sanity checks. */
1947 if (ipv4_is_multicast(saddr
) || ipv4_is_lbcast(saddr
) ||
1948 ipv4_is_loopback(saddr
) || skb
->protocol
!= htons(ETH_P_IP
))
1951 if (ipv4_is_zeronet(saddr
)) {
1952 if (!ipv4_is_local_multicast(daddr
))
1954 spec_dst
= inet_select_addr(dev
, 0, RT_SCOPE_LINK
);
1956 err
= fib_validate_source(skb
, saddr
, 0, tos
, 0, dev
, &spec_dst
,
1961 rth
= rt_dst_alloc(init_net
.loopback_dev
,
1962 IN_DEV_CONF_GET(in_dev
, NOPOLICY
), false);
1966 #ifdef CONFIG_IP_ROUTE_CLASSID
1967 rth
->dst
.tclassid
= itag
;
1969 rth
->dst
.output
= ip_rt_bug
;
1971 rth
->rt_key_dst
= daddr
;
1972 rth
->rt_key_src
= saddr
;
1973 rth
->rt_genid
= rt_genid(dev_net(dev
));
1974 rth
->rt_flags
= RTCF_MULTICAST
;
1975 rth
->rt_type
= RTN_MULTICAST
;
1976 rth
->rt_key_tos
= tos
;
1977 rth
->rt_dst
= daddr
;
1978 rth
->rt_src
= saddr
;
1979 rth
->rt_route_iif
= dev
->ifindex
;
1980 rth
->rt_iif
= dev
->ifindex
;
1982 rth
->rt_mark
= skb
->mark
;
1983 rth
->rt_gateway
= daddr
;
1984 rth
->rt_spec_dst
= spec_dst
;
1985 rth
->rt_peer_genid
= 0;
1989 rth
->dst
.input
= ip_local_deliver
;
1990 rth
->rt_flags
|= RTCF_LOCAL
;
1993 #ifdef CONFIG_IP_MROUTE
1994 if (!ipv4_is_local_multicast(daddr
) && IN_DEV_MFORWARD(in_dev
))
1995 rth
->dst
.input
= ip_mr_input
;
1997 RT_CACHE_STAT_INC(in_slow_mc
);
1999 hash
= rt_hash(daddr
, saddr
, dev
->ifindex
, rt_genid(dev_net(dev
)));
2000 rth
= rt_intern_hash(hash
, rth
, skb
, dev
->ifindex
);
2001 return IS_ERR(rth
) ? PTR_ERR(rth
) : 0;
2012 static void ip_handle_martian_source(struct net_device
*dev
,
2013 struct in_device
*in_dev
,
2014 struct sk_buff
*skb
,
2018 RT_CACHE_STAT_INC(in_martian_src
);
2019 #ifdef CONFIG_IP_ROUTE_VERBOSE
2020 if (IN_DEV_LOG_MARTIANS(in_dev
) && net_ratelimit()) {
2022 * RFC1812 recommendation, if source is martian,
2023 * the only hint is MAC header.
2025 printk(KERN_WARNING
"martian source %pI4 from %pI4, on dev %s\n",
2026 &daddr
, &saddr
, dev
->name
);
2027 if (dev
->hard_header_len
&& skb_mac_header_was_set(skb
)) {
2029 const unsigned char *p
= skb_mac_header(skb
);
2030 printk(KERN_WARNING
"ll header: ");
2031 for (i
= 0; i
< dev
->hard_header_len
; i
++, p
++) {
2033 if (i
< (dev
->hard_header_len
- 1))
2042 /* called in rcu_read_lock() section */
2043 static int __mkroute_input(struct sk_buff
*skb
,
2044 const struct fib_result
*res
,
2045 struct in_device
*in_dev
,
2046 __be32 daddr
, __be32 saddr
, u32 tos
,
2047 struct rtable
**result
)
2051 struct in_device
*out_dev
;
2052 unsigned int flags
= 0;
2056 /* get a working reference to the output device */
2057 out_dev
= __in_dev_get_rcu(FIB_RES_DEV(*res
));
2058 if (out_dev
== NULL
) {
2059 if (net_ratelimit())
2060 printk(KERN_CRIT
"Bug in ip_route_input" \
2061 "_slow(). Please, report\n");
2066 err
= fib_validate_source(skb
, saddr
, daddr
, tos
, FIB_RES_OIF(*res
),
2067 in_dev
->dev
, &spec_dst
, &itag
);
2069 ip_handle_martian_source(in_dev
->dev
, in_dev
, skb
, daddr
,
2076 flags
|= RTCF_DIRECTSRC
;
2078 if (out_dev
== in_dev
&& err
&&
2079 (IN_DEV_SHARED_MEDIA(out_dev
) ||
2080 inet_addr_onlink(out_dev
, saddr
, FIB_RES_GW(*res
))))
2081 flags
|= RTCF_DOREDIRECT
;
2083 if (skb
->protocol
!= htons(ETH_P_IP
)) {
2084 /* Not IP (i.e. ARP). Do not create route, if it is
2085 * invalid for proxy arp. DNAT routes are always valid.
2087 * Proxy arp feature have been extended to allow, ARP
2088 * replies back to the same interface, to support
2089 * Private VLAN switch technologies. See arp.c.
2091 if (out_dev
== in_dev
&&
2092 IN_DEV_PROXY_ARP_PVLAN(in_dev
) == 0) {
2098 rth
= rt_dst_alloc(out_dev
->dev
,
2099 IN_DEV_CONF_GET(in_dev
, NOPOLICY
),
2100 IN_DEV_CONF_GET(out_dev
, NOXFRM
));
2106 rth
->rt_key_dst
= daddr
;
2107 rth
->rt_key_src
= saddr
;
2108 rth
->rt_genid
= rt_genid(dev_net(rth
->dst
.dev
));
2109 rth
->rt_flags
= flags
;
2110 rth
->rt_type
= res
->type
;
2111 rth
->rt_key_tos
= tos
;
2112 rth
->rt_dst
= daddr
;
2113 rth
->rt_src
= saddr
;
2114 rth
->rt_route_iif
= in_dev
->dev
->ifindex
;
2115 rth
->rt_iif
= in_dev
->dev
->ifindex
;
2117 rth
->rt_mark
= skb
->mark
;
2118 rth
->rt_gateway
= daddr
;
2119 rth
->rt_spec_dst
= spec_dst
;
2120 rth
->rt_peer_genid
= 0;
2124 rth
->dst
.input
= ip_forward
;
2125 rth
->dst
.output
= ip_output
;
2127 rt_set_nexthop(rth
, NULL
, res
, res
->fi
, res
->type
, itag
);
2135 static int ip_mkroute_input(struct sk_buff
*skb
,
2136 struct fib_result
*res
,
2137 const struct flowi4
*fl4
,
2138 struct in_device
*in_dev
,
2139 __be32 daddr
, __be32 saddr
, u32 tos
)
2141 struct rtable
* rth
= NULL
;
2145 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2146 if (res
->fi
&& res
->fi
->fib_nhs
> 1)
2147 fib_select_multipath(res
);
2150 /* create a routing cache entry */
2151 err
= __mkroute_input(skb
, res
, in_dev
, daddr
, saddr
, tos
, &rth
);
2155 /* put it into the cache */
2156 hash
= rt_hash(daddr
, saddr
, fl4
->flowi4_iif
,
2157 rt_genid(dev_net(rth
->dst
.dev
)));
2158 rth
= rt_intern_hash(hash
, rth
, skb
, fl4
->flowi4_iif
);
2160 return PTR_ERR(rth
);
2165 * NOTE. We drop all the packets that has local source
2166 * addresses, because every properly looped back packet
2167 * must have correct destination already attached by output routine.
2169 * Such approach solves two big problems:
2170 * 1. Not simplex devices are handled properly.
2171 * 2. IP spoofing attempts are filtered with 100% of guarantee.
2172 * called with rcu_read_lock()
2175 static int ip_route_input_slow(struct sk_buff
*skb
, __be32 daddr
, __be32 saddr
,
2176 u8 tos
, struct net_device
*dev
)
2178 struct fib_result res
;
2179 struct in_device
*in_dev
= __in_dev_get_rcu(dev
);
2183 struct rtable
* rth
;
2187 struct net
* net
= dev_net(dev
);
2189 /* IP on this device is disabled. */
2194 /* Check for the most weird martians, which can be not detected
2198 if (ipv4_is_multicast(saddr
) || ipv4_is_lbcast(saddr
) ||
2199 ipv4_is_loopback(saddr
))
2200 goto martian_source
;
2202 if (ipv4_is_lbcast(daddr
) || (saddr
== 0 && daddr
== 0))
2205 /* Accept zero addresses only to limited broadcast;
2206 * I even do not know to fix it or not. Waiting for complains :-)
2208 if (ipv4_is_zeronet(saddr
))
2209 goto martian_source
;
2211 if (ipv4_is_zeronet(daddr
) || ipv4_is_loopback(daddr
))
2212 goto martian_destination
;
2215 * Now we are ready to route packet.
2218 fl4
.flowi4_iif
= dev
->ifindex
;
2219 fl4
.flowi4_mark
= skb
->mark
;
2220 fl4
.flowi4_tos
= tos
;
2221 fl4
.flowi4_scope
= RT_SCOPE_UNIVERSE
;
2224 err
= fib_lookup(net
, &fl4
, &res
);
2226 if (!IN_DEV_FORWARD(in_dev
))
2231 RT_CACHE_STAT_INC(in_slow_tot
);
2233 if (res
.type
== RTN_BROADCAST
)
2236 if (res
.type
== RTN_LOCAL
) {
2237 err
= fib_validate_source(skb
, saddr
, daddr
, tos
,
2238 net
->loopback_dev
->ifindex
,
2239 dev
, &spec_dst
, &itag
);
2241 goto martian_source_keep_err
;
2243 flags
|= RTCF_DIRECTSRC
;
2248 if (!IN_DEV_FORWARD(in_dev
))
2250 if (res
.type
!= RTN_UNICAST
)
2251 goto martian_destination
;
2253 err
= ip_mkroute_input(skb
, &res
, &fl4
, in_dev
, daddr
, saddr
, tos
);
2257 if (skb
->protocol
!= htons(ETH_P_IP
))
2260 if (ipv4_is_zeronet(saddr
))
2261 spec_dst
= inet_select_addr(dev
, 0, RT_SCOPE_LINK
);
2263 err
= fib_validate_source(skb
, saddr
, 0, tos
, 0, dev
, &spec_dst
,
2266 goto martian_source_keep_err
;
2268 flags
|= RTCF_DIRECTSRC
;
2270 flags
|= RTCF_BROADCAST
;
2271 res
.type
= RTN_BROADCAST
;
2272 RT_CACHE_STAT_INC(in_brd
);
2275 rth
= rt_dst_alloc(net
->loopback_dev
,
2276 IN_DEV_CONF_GET(in_dev
, NOPOLICY
), false);
2280 rth
->dst
.input
= ip_local_deliver
;
2281 rth
->dst
.output
= ip_rt_bug
;
2282 #ifdef CONFIG_IP_ROUTE_CLASSID
2283 rth
->dst
.tclassid
= itag
;
2286 rth
->rt_key_dst
= daddr
;
2287 rth
->rt_key_src
= saddr
;
2288 rth
->rt_genid
= rt_genid(net
);
2289 rth
->rt_flags
= flags
|RTCF_LOCAL
;
2290 rth
->rt_type
= res
.type
;
2291 rth
->rt_key_tos
= tos
;
2292 rth
->rt_dst
= daddr
;
2293 rth
->rt_src
= saddr
;
2294 #ifdef CONFIG_IP_ROUTE_CLASSID
2295 rth
->dst
.tclassid
= itag
;
2297 rth
->rt_route_iif
= dev
->ifindex
;
2298 rth
->rt_iif
= dev
->ifindex
;
2300 rth
->rt_mark
= skb
->mark
;
2301 rth
->rt_gateway
= daddr
;
2302 rth
->rt_spec_dst
= spec_dst
;
2303 rth
->rt_peer_genid
= 0;
2306 if (res
.type
== RTN_UNREACHABLE
) {
2307 rth
->dst
.input
= ip_error
;
2308 rth
->dst
.error
= -err
;
2309 rth
->rt_flags
&= ~RTCF_LOCAL
;
2311 hash
= rt_hash(daddr
, saddr
, fl4
.flowi4_iif
, rt_genid(net
));
2312 rth
= rt_intern_hash(hash
, rth
, skb
, fl4
.flowi4_iif
);
2319 RT_CACHE_STAT_INC(in_no_route
);
2320 spec_dst
= inet_select_addr(dev
, 0, RT_SCOPE_UNIVERSE
);
2321 res
.type
= RTN_UNREACHABLE
;
2327 * Do not cache martian addresses: they should be logged (RFC1812)
2329 martian_destination
:
2330 RT_CACHE_STAT_INC(in_martian_dst
);
2331 #ifdef CONFIG_IP_ROUTE_VERBOSE
2332 if (IN_DEV_LOG_MARTIANS(in_dev
) && net_ratelimit())
2333 printk(KERN_WARNING
"martian destination %pI4 from %pI4, dev %s\n",
2334 &daddr
, &saddr
, dev
->name
);
2338 err
= -EHOSTUNREACH
;
2351 martian_source_keep_err
:
2352 ip_handle_martian_source(dev
, in_dev
, skb
, daddr
, saddr
);
2356 int ip_route_input_common(struct sk_buff
*skb
, __be32 daddr
, __be32 saddr
,
2357 u8 tos
, struct net_device
*dev
, bool noref
)
2359 struct rtable
* rth
;
2361 int iif
= dev
->ifindex
;
2369 if (!rt_caching(net
))
2372 tos
&= IPTOS_RT_MASK
;
2373 hash
= rt_hash(daddr
, saddr
, iif
, rt_genid(net
));
2375 for (rth
= rcu_dereference(rt_hash_table
[hash
].chain
); rth
;
2376 rth
= rcu_dereference(rth
->dst
.rt_next
)) {
2377 if ((((__force u32
)rth
->rt_key_dst
^ (__force u32
)daddr
) |
2378 ((__force u32
)rth
->rt_key_src
^ (__force u32
)saddr
) |
2379 (rth
->rt_route_iif
^ iif
) |
2380 (rth
->rt_key_tos
^ tos
)) == 0 &&
2381 rth
->rt_mark
== skb
->mark
&&
2382 net_eq(dev_net(rth
->dst
.dev
), net
) &&
2383 !rt_is_expired(rth
)) {
2385 dst_use_noref(&rth
->dst
, jiffies
);
2386 skb_dst_set_noref(skb
, &rth
->dst
);
2388 dst_use(&rth
->dst
, jiffies
);
2389 skb_dst_set(skb
, &rth
->dst
);
2391 RT_CACHE_STAT_INC(in_hit
);
2395 RT_CACHE_STAT_INC(in_hlist_search
);
2399 /* Multicast recognition logic is moved from route cache to here.
2400 The problem was that too many Ethernet cards have broken/missing
2401 hardware multicast filters :-( As result the host on multicasting
2402 network acquires a lot of useless route cache entries, sort of
2403 SDR messages from all the world. Now we try to get rid of them.
2404 Really, provided software IP multicast filter is organized
2405 reasonably (at least, hashed), it does not result in a slowdown
2406 comparing with route cache reject entries.
2407 Note, that multicast routers are not affected, because
2408 route cache entry is created eventually.
2410 if (ipv4_is_multicast(daddr
)) {
2411 struct in_device
*in_dev
= __in_dev_get_rcu(dev
);
2414 int our
= ip_check_mc_rcu(in_dev
, daddr
, saddr
,
2415 ip_hdr(skb
)->protocol
);
2417 #ifdef CONFIG_IP_MROUTE
2419 (!ipv4_is_local_multicast(daddr
) &&
2420 IN_DEV_MFORWARD(in_dev
))
2423 int res
= ip_route_input_mc(skb
, daddr
, saddr
,
2432 res
= ip_route_input_slow(skb
, daddr
, saddr
, tos
, dev
);
2436 EXPORT_SYMBOL(ip_route_input_common
);
2438 /* called with rcu_read_lock() */
2439 static struct rtable
*__mkroute_output(const struct fib_result
*res
,
2440 const struct flowi4
*fl4
,
2441 __be32 orig_daddr
, __be32 orig_saddr
,
2442 int orig_oif
, struct net_device
*dev_out
,
2445 struct fib_info
*fi
= res
->fi
;
2446 u32 tos
= RT_FL_TOS(fl4
);
2447 struct in_device
*in_dev
;
2448 u16 type
= res
->type
;
2451 if (ipv4_is_loopback(fl4
->saddr
) && !(dev_out
->flags
& IFF_LOOPBACK
))
2452 return ERR_PTR(-EINVAL
);
2454 if (ipv4_is_lbcast(fl4
->daddr
))
2455 type
= RTN_BROADCAST
;
2456 else if (ipv4_is_multicast(fl4
->daddr
))
2457 type
= RTN_MULTICAST
;
2458 else if (ipv4_is_zeronet(fl4
->daddr
))
2459 return ERR_PTR(-EINVAL
);
2461 if (dev_out
->flags
& IFF_LOOPBACK
)
2462 flags
|= RTCF_LOCAL
;
2464 in_dev
= __in_dev_get_rcu(dev_out
);
2466 return ERR_PTR(-EINVAL
);
2468 if (type
== RTN_BROADCAST
) {
2469 flags
|= RTCF_BROADCAST
| RTCF_LOCAL
;
2471 } else if (type
== RTN_MULTICAST
) {
2472 flags
|= RTCF_MULTICAST
| RTCF_LOCAL
;
2473 if (!ip_check_mc_rcu(in_dev
, fl4
->daddr
, fl4
->saddr
,
2475 flags
&= ~RTCF_LOCAL
;
2476 /* If multicast route do not exist use
2477 * default one, but do not gateway in this case.
2480 if (fi
&& res
->prefixlen
< 4)
2484 rth
= rt_dst_alloc(dev_out
,
2485 IN_DEV_CONF_GET(in_dev
, NOPOLICY
),
2486 IN_DEV_CONF_GET(in_dev
, NOXFRM
));
2488 return ERR_PTR(-ENOBUFS
);
2490 rth
->dst
.output
= ip_output
;
2492 rth
->rt_key_dst
= orig_daddr
;
2493 rth
->rt_key_src
= orig_saddr
;
2494 rth
->rt_genid
= rt_genid(dev_net(dev_out
));
2495 rth
->rt_flags
= flags
;
2496 rth
->rt_type
= type
;
2497 rth
->rt_key_tos
= tos
;
2498 rth
->rt_dst
= fl4
->daddr
;
2499 rth
->rt_src
= fl4
->saddr
;
2500 rth
->rt_route_iif
= 0;
2501 rth
->rt_iif
= orig_oif
? : dev_out
->ifindex
;
2502 rth
->rt_oif
= orig_oif
;
2503 rth
->rt_mark
= fl4
->flowi4_mark
;
2504 rth
->rt_gateway
= fl4
->daddr
;
2505 rth
->rt_spec_dst
= fl4
->saddr
;
2506 rth
->rt_peer_genid
= 0;
2510 RT_CACHE_STAT_INC(out_slow_tot
);
2512 if (flags
& RTCF_LOCAL
) {
2513 rth
->dst
.input
= ip_local_deliver
;
2514 rth
->rt_spec_dst
= fl4
->daddr
;
2516 if (flags
& (RTCF_BROADCAST
| RTCF_MULTICAST
)) {
2517 rth
->rt_spec_dst
= fl4
->saddr
;
2518 if (flags
& RTCF_LOCAL
&&
2519 !(dev_out
->flags
& IFF_LOOPBACK
)) {
2520 rth
->dst
.output
= ip_mc_output
;
2521 RT_CACHE_STAT_INC(out_slow_mc
);
2523 #ifdef CONFIG_IP_MROUTE
2524 if (type
== RTN_MULTICAST
) {
2525 if (IN_DEV_MFORWARD(in_dev
) &&
2526 !ipv4_is_local_multicast(fl4
->daddr
)) {
2527 rth
->dst
.input
= ip_mr_input
;
2528 rth
->dst
.output
= ip_mc_output
;
2534 rt_set_nexthop(rth
, fl4
, res
, fi
, type
, 0);
2540 * Major route resolver routine.
2541 * called with rcu_read_lock();
2544 static struct rtable
*ip_route_output_slow(struct net
*net
, struct flowi4
*fl4
)
2546 struct net_device
*dev_out
= NULL
;
2547 u32 tos
= RT_FL_TOS(fl4
);
2548 unsigned int flags
= 0;
2549 struct fib_result res
;
2556 #ifdef CONFIG_IP_MULTIPLE_TABLES
2560 orig_daddr
= fl4
->daddr
;
2561 orig_saddr
= fl4
->saddr
;
2562 orig_oif
= fl4
->flowi4_oif
;
2564 fl4
->flowi4_iif
= net
->loopback_dev
->ifindex
;
2565 fl4
->flowi4_tos
= tos
& IPTOS_RT_MASK
;
2566 fl4
->flowi4_scope
= ((tos
& RTO_ONLINK
) ?
2567 RT_SCOPE_LINK
: RT_SCOPE_UNIVERSE
);
2571 rth
= ERR_PTR(-EINVAL
);
2572 if (ipv4_is_multicast(fl4
->saddr
) ||
2573 ipv4_is_lbcast(fl4
->saddr
) ||
2574 ipv4_is_zeronet(fl4
->saddr
))
2577 /* I removed check for oif == dev_out->oif here.
2578 It was wrong for two reasons:
2579 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2580 is assigned to multiple interfaces.
2581 2. Moreover, we are allowed to send packets with saddr
2582 of another iface. --ANK
2585 if (fl4
->flowi4_oif
== 0 &&
2586 (ipv4_is_multicast(fl4
->daddr
) ||
2587 ipv4_is_lbcast(fl4
->daddr
))) {
2588 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2589 dev_out
= __ip_dev_find(net
, fl4
->saddr
, false);
2590 if (dev_out
== NULL
)
2593 /* Special hack: user can direct multicasts
2594 and limited broadcast via necessary interface
2595 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2596 This hack is not just for fun, it allows
2597 vic,vat and friends to work.
2598 They bind socket to loopback, set ttl to zero
2599 and expect that it will work.
2600 From the viewpoint of routing cache they are broken,
2601 because we are not allowed to build multicast path
2602 with loopback source addr (look, routing cache
2603 cannot know, that ttl is zero, so that packet
2604 will not leave this host and route is valid).
2605 Luckily, this hack is good workaround.
2608 fl4
->flowi4_oif
= dev_out
->ifindex
;
2612 if (!(fl4
->flowi4_flags
& FLOWI_FLAG_ANYSRC
)) {
2613 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2614 if (!__ip_dev_find(net
, fl4
->saddr
, false))
2620 if (fl4
->flowi4_oif
) {
2621 dev_out
= dev_get_by_index_rcu(net
, fl4
->flowi4_oif
);
2622 rth
= ERR_PTR(-ENODEV
);
2623 if (dev_out
== NULL
)
2626 /* RACE: Check return value of inet_select_addr instead. */
2627 if (!(dev_out
->flags
& IFF_UP
) || !__in_dev_get_rcu(dev_out
)) {
2628 rth
= ERR_PTR(-ENETUNREACH
);
2631 if (ipv4_is_local_multicast(fl4
->daddr
) ||
2632 ipv4_is_lbcast(fl4
->daddr
)) {
2634 fl4
->saddr
= inet_select_addr(dev_out
, 0,
2639 if (ipv4_is_multicast(fl4
->daddr
))
2640 fl4
->saddr
= inet_select_addr(dev_out
, 0,
2642 else if (!fl4
->daddr
)
2643 fl4
->saddr
= inet_select_addr(dev_out
, 0,
2649 fl4
->daddr
= fl4
->saddr
;
2651 fl4
->daddr
= fl4
->saddr
= htonl(INADDR_LOOPBACK
);
2652 dev_out
= net
->loopback_dev
;
2653 fl4
->flowi4_oif
= net
->loopback_dev
->ifindex
;
2654 res
.type
= RTN_LOCAL
;
2655 flags
|= RTCF_LOCAL
;
2659 if (fib_lookup(net
, fl4
, &res
)) {
2661 if (fl4
->flowi4_oif
) {
2662 /* Apparently, routing tables are wrong. Assume,
2663 that the destination is on link.
2666 Because we are allowed to send to iface
2667 even if it has NO routes and NO assigned
2668 addresses. When oif is specified, routing
2669 tables are looked up with only one purpose:
2670 to catch if destination is gatewayed, rather than
2671 direct. Moreover, if MSG_DONTROUTE is set,
2672 we send packet, ignoring both routing tables
2673 and ifaddr state. --ANK
2676 We could make it even if oif is unknown,
2677 likely IPv6, but we do not.
2680 if (fl4
->saddr
== 0)
2681 fl4
->saddr
= inet_select_addr(dev_out
, 0,
2683 res
.type
= RTN_UNICAST
;
2686 rth
= ERR_PTR(-ENETUNREACH
);
2690 if (res
.type
== RTN_LOCAL
) {
2692 if (res
.fi
->fib_prefsrc
)
2693 fl4
->saddr
= res
.fi
->fib_prefsrc
;
2695 fl4
->saddr
= fl4
->daddr
;
2697 dev_out
= net
->loopback_dev
;
2698 fl4
->flowi4_oif
= dev_out
->ifindex
;
2700 flags
|= RTCF_LOCAL
;
2704 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2705 if (res
.fi
->fib_nhs
> 1 && fl4
->flowi4_oif
== 0)
2706 fib_select_multipath(&res
);
2709 if (!res
.prefixlen
&&
2710 res
.table
->tb_num_default
> 1 &&
2711 res
.type
== RTN_UNICAST
&& !fl4
->flowi4_oif
)
2712 fib_select_default(&res
);
2715 fl4
->saddr
= FIB_RES_PREFSRC(net
, res
);
2717 dev_out
= FIB_RES_DEV(res
);
2718 fl4
->flowi4_oif
= dev_out
->ifindex
;
2722 rth
= __mkroute_output(&res
, fl4
, orig_daddr
, orig_saddr
, orig_oif
,
2727 hash
= rt_hash(orig_daddr
, orig_saddr
, orig_oif
,
2728 rt_genid(dev_net(dev_out
)));
2729 rth
= rt_intern_hash(hash
, rth
, NULL
, orig_oif
);
2737 struct rtable
*__ip_route_output_key(struct net
*net
, struct flowi4
*flp4
)
2742 if (!rt_caching(net
))
2745 hash
= rt_hash(flp4
->daddr
, flp4
->saddr
, flp4
->flowi4_oif
, rt_genid(net
));
2748 for (rth
= rcu_dereference_bh(rt_hash_table
[hash
].chain
); rth
;
2749 rth
= rcu_dereference_bh(rth
->dst
.rt_next
)) {
2750 if (rth
->rt_key_dst
== flp4
->daddr
&&
2751 rth
->rt_key_src
== flp4
->saddr
&&
2752 rt_is_output_route(rth
) &&
2753 rth
->rt_oif
== flp4
->flowi4_oif
&&
2754 rth
->rt_mark
== flp4
->flowi4_mark
&&
2755 !((rth
->rt_key_tos
^ flp4
->flowi4_tos
) &
2756 (IPTOS_RT_MASK
| RTO_ONLINK
)) &&
2757 net_eq(dev_net(rth
->dst
.dev
), net
) &&
2758 !rt_is_expired(rth
)) {
2759 dst_use(&rth
->dst
, jiffies
);
2760 RT_CACHE_STAT_INC(out_hit
);
2761 rcu_read_unlock_bh();
2763 flp4
->saddr
= rth
->rt_src
;
2765 flp4
->daddr
= rth
->rt_dst
;
2768 RT_CACHE_STAT_INC(out_hlist_search
);
2770 rcu_read_unlock_bh();
2773 return ip_route_output_slow(net
, flp4
);
2775 EXPORT_SYMBOL_GPL(__ip_route_output_key
);
2777 static struct dst_entry
*ipv4_blackhole_dst_check(struct dst_entry
*dst
, u32 cookie
)
/* Blackhole routes carry no usable MTU. */
static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
{
	return 0;
}
2787 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry
*dst
, u32 mtu
)
2791 static u32
*ipv4_rt_blackhole_cow_metrics(struct dst_entry
*dst
,
2797 static struct dst_ops ipv4_dst_blackhole_ops
= {
2799 .protocol
= cpu_to_be16(ETH_P_IP
),
2800 .destroy
= ipv4_dst_destroy
,
2801 .check
= ipv4_blackhole_dst_check
,
2802 .default_mtu
= ipv4_blackhole_default_mtu
,
2803 .default_advmss
= ipv4_default_advmss
,
2804 .update_pmtu
= ipv4_rt_blackhole_update_pmtu
,
2805 .cow_metrics
= ipv4_rt_blackhole_cow_metrics
,
2808 struct dst_entry
*ipv4_blackhole_route(struct net
*net
, struct dst_entry
*dst_orig
)
2810 struct rtable
*rt
= dst_alloc(&ipv4_dst_blackhole_ops
, NULL
, 1, 0, 0);
2811 struct rtable
*ort
= (struct rtable
*) dst_orig
;
2814 struct dst_entry
*new = &rt
->dst
;
2817 new->input
= dst_discard
;
2818 new->output
= dst_discard
;
2819 dst_copy_metrics(new, &ort
->dst
);
2821 new->dev
= ort
->dst
.dev
;
2825 rt
->rt_key_dst
= ort
->rt_key_dst
;
2826 rt
->rt_key_src
= ort
->rt_key_src
;
2827 rt
->rt_key_tos
= ort
->rt_key_tos
;
2828 rt
->rt_route_iif
= ort
->rt_route_iif
;
2829 rt
->rt_iif
= ort
->rt_iif
;
2830 rt
->rt_oif
= ort
->rt_oif
;
2831 rt
->rt_mark
= ort
->rt_mark
;
2833 rt
->rt_genid
= rt_genid(net
);
2834 rt
->rt_flags
= ort
->rt_flags
;
2835 rt
->rt_type
= ort
->rt_type
;
2836 rt
->rt_dst
= ort
->rt_dst
;
2837 rt
->rt_src
= ort
->rt_src
;
2838 rt
->rt_gateway
= ort
->rt_gateway
;
2839 rt
->rt_spec_dst
= ort
->rt_spec_dst
;
2840 rt
->peer
= ort
->peer
;
2842 atomic_inc(&rt
->peer
->refcnt
);
2845 atomic_inc(&rt
->fi
->fib_clntref
);
2850 dst_release(dst_orig
);
2852 return rt
? &rt
->dst
: ERR_PTR(-ENOMEM
);
2855 struct rtable
*ip_route_output_flow(struct net
*net
, struct flowi4
*flp4
,
2858 struct rtable
*rt
= __ip_route_output_key(net
, flp4
);
2863 if (flp4
->flowi4_proto
)
2864 rt
= (struct rtable
*) xfrm_lookup(net
, &rt
->dst
,
2865 flowi4_to_flowi(flp4
),
2870 EXPORT_SYMBOL_GPL(ip_route_output_flow
);
2872 static int rt_fill_info(struct net
*net
,
2873 struct sk_buff
*skb
, u32 pid
, u32 seq
, int event
,
2874 int nowait
, unsigned int flags
)
2876 struct rtable
*rt
= skb_rtable(skb
);
2878 struct nlmsghdr
*nlh
;
2880 const struct inet_peer
*peer
= rt
->peer
;
2881 u32 id
= 0, ts
= 0, tsage
= 0, error
;
2883 nlh
= nlmsg_put(skb
, pid
, seq
, event
, sizeof(*r
), flags
);
2887 r
= nlmsg_data(nlh
);
2888 r
->rtm_family
= AF_INET
;
2889 r
->rtm_dst_len
= 32;
2891 r
->rtm_tos
= rt
->rt_key_tos
;
2892 r
->rtm_table
= RT_TABLE_MAIN
;
2893 NLA_PUT_U32(skb
, RTA_TABLE
, RT_TABLE_MAIN
);
2894 r
->rtm_type
= rt
->rt_type
;
2895 r
->rtm_scope
= RT_SCOPE_UNIVERSE
;
2896 r
->rtm_protocol
= RTPROT_UNSPEC
;
2897 r
->rtm_flags
= (rt
->rt_flags
& ~0xFFFF) | RTM_F_CLONED
;
2898 if (rt
->rt_flags
& RTCF_NOTIFY
)
2899 r
->rtm_flags
|= RTM_F_NOTIFY
;
2901 NLA_PUT_BE32(skb
, RTA_DST
, rt
->rt_dst
);
2903 if (rt
->rt_key_src
) {
2904 r
->rtm_src_len
= 32;
2905 NLA_PUT_BE32(skb
, RTA_SRC
, rt
->rt_key_src
);
2908 NLA_PUT_U32(skb
, RTA_OIF
, rt
->dst
.dev
->ifindex
);
2909 #ifdef CONFIG_IP_ROUTE_CLASSID
2910 if (rt
->dst
.tclassid
)
2911 NLA_PUT_U32(skb
, RTA_FLOW
, rt
->dst
.tclassid
);
2913 if (rt_is_input_route(rt
))
2914 NLA_PUT_BE32(skb
, RTA_PREFSRC
, rt
->rt_spec_dst
);
2915 else if (rt
->rt_src
!= rt
->rt_key_src
)
2916 NLA_PUT_BE32(skb
, RTA_PREFSRC
, rt
->rt_src
);
2918 if (rt
->rt_dst
!= rt
->rt_gateway
)
2919 NLA_PUT_BE32(skb
, RTA_GATEWAY
, rt
->rt_gateway
);
2921 if (rtnetlink_put_metrics(skb
, dst_metrics_ptr(&rt
->dst
)) < 0)
2922 goto nla_put_failure
;
2925 NLA_PUT_BE32(skb
, RTA_MARK
, rt
->rt_mark
);
2927 error
= rt
->dst
.error
;
2929 inet_peer_refcheck(rt
->peer
);
2930 id
= atomic_read(&peer
->ip_id_count
) & 0xffff;
2931 if (peer
->tcp_ts_stamp
) {
2933 tsage
= get_seconds() - peer
->tcp_ts_stamp
;
2935 expires
= ACCESS_ONCE(peer
->pmtu_expires
);
2940 if (rt_is_input_route(rt
)) {
2941 #ifdef CONFIG_IP_MROUTE
2942 __be32 dst
= rt
->rt_dst
;
2944 if (ipv4_is_multicast(dst
) && !ipv4_is_local_multicast(dst
) &&
2945 IPV4_DEVCONF_ALL(net
, MC_FORWARDING
)) {
2946 int err
= ipmr_get_route(net
, skb
,
2947 rt
->rt_src
, rt
->rt_dst
,
2953 goto nla_put_failure
;
2955 if (err
== -EMSGSIZE
)
2956 goto nla_put_failure
;
2962 NLA_PUT_U32(skb
, RTA_IIF
, rt
->rt_iif
);
2965 if (rtnl_put_cacheinfo(skb
, &rt
->dst
, id
, ts
, tsage
,
2966 expires
, error
) < 0)
2967 goto nla_put_failure
;
2969 return nlmsg_end(skb
, nlh
);
2972 nlmsg_cancel(skb
, nlh
);
2976 static int inet_rtm_getroute(struct sk_buff
*in_skb
, struct nlmsghdr
* nlh
, void *arg
)
2978 struct net
*net
= sock_net(in_skb
->sk
);
2980 struct nlattr
*tb
[RTA_MAX
+1];
2981 struct rtable
*rt
= NULL
;
2987 struct sk_buff
*skb
;
2989 err
= nlmsg_parse(nlh
, sizeof(*rtm
), tb
, RTA_MAX
, rtm_ipv4_policy
);
2993 rtm
= nlmsg_data(nlh
);
2995 skb
= alloc_skb(NLMSG_GOODSIZE
, GFP_KERNEL
);
3001 /* Reserve room for dummy headers, this skb can pass
3002 through good chunk of routing engine.
3004 skb_reset_mac_header(skb
);
3005 skb_reset_network_header(skb
);
3007 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3008 ip_hdr(skb
)->protocol
= IPPROTO_ICMP
;
3009 skb_reserve(skb
, MAX_HEADER
+ sizeof(struct iphdr
));
3011 src
= tb
[RTA_SRC
] ? nla_get_be32(tb
[RTA_SRC
]) : 0;
3012 dst
= tb
[RTA_DST
] ? nla_get_be32(tb
[RTA_DST
]) : 0;
3013 iif
= tb
[RTA_IIF
] ? nla_get_u32(tb
[RTA_IIF
]) : 0;
3014 mark
= tb
[RTA_MARK
] ? nla_get_u32(tb
[RTA_MARK
]) : 0;
3017 struct net_device
*dev
;
3019 dev
= __dev_get_by_index(net
, iif
);
3025 skb
->protocol
= htons(ETH_P_IP
);
3029 err
= ip_route_input(skb
, dst
, src
, rtm
->rtm_tos
, dev
);
3032 rt
= skb_rtable(skb
);
3033 if (err
== 0 && rt
->dst
.error
)
3034 err
= -rt
->dst
.error
;
3036 struct flowi4 fl4
= {
3039 .flowi4_tos
= rtm
->rtm_tos
,
3040 .flowi4_oif
= tb
[RTA_OIF
] ? nla_get_u32(tb
[RTA_OIF
]) : 0,
3041 .flowi4_mark
= mark
,
3043 rt
= ip_route_output_key(net
, &fl4
);
3053 skb_dst_set(skb
, &rt
->dst
);
3054 if (rtm
->rtm_flags
& RTM_F_NOTIFY
)
3055 rt
->rt_flags
|= RTCF_NOTIFY
;
3057 err
= rt_fill_info(net
, skb
, NETLINK_CB(in_skb
).pid
, nlh
->nlmsg_seq
,
3058 RTM_NEWROUTE
, 0, 0);
3062 err
= rtnl_unicast(skb
, net
, NETLINK_CB(in_skb
).pid
);
3071 int ip_rt_dump(struct sk_buff
*skb
, struct netlink_callback
*cb
)
3078 net
= sock_net(skb
->sk
);
3083 s_idx
= idx
= cb
->args
[1];
3084 for (h
= s_h
; h
<= rt_hash_mask
; h
++, s_idx
= 0) {
3085 if (!rt_hash_table
[h
].chain
)
3088 for (rt
= rcu_dereference_bh(rt_hash_table
[h
].chain
), idx
= 0; rt
;
3089 rt
= rcu_dereference_bh(rt
->dst
.rt_next
), idx
++) {
3090 if (!net_eq(dev_net(rt
->dst
.dev
), net
) || idx
< s_idx
)
3092 if (rt_is_expired(rt
))
3094 skb_dst_set_noref(skb
, &rt
->dst
);
3095 if (rt_fill_info(net
, skb
, NETLINK_CB(cb
->skb
).pid
,
3096 cb
->nlh
->nlmsg_seq
, RTM_NEWROUTE
,
3097 1, NLM_F_MULTI
) <= 0) {
3099 rcu_read_unlock_bh();
3104 rcu_read_unlock_bh();
3113 void ip_rt_multicast_event(struct in_device
*in_dev
)
3115 rt_cache_flush(dev_net(in_dev
->dev
), 0);
3118 #ifdef CONFIG_SYSCTL
3119 static int ipv4_sysctl_rtcache_flush(ctl_table
*__ctl
, int write
,
3120 void __user
*buffer
,
3121 size_t *lenp
, loff_t
*ppos
)
3128 memcpy(&ctl
, __ctl
, sizeof(ctl
));
3129 ctl
.data
= &flush_delay
;
3130 proc_dointvec(&ctl
, write
, buffer
, lenp
, ppos
);
3132 net
= (struct net
*)__ctl
->extra1
;
3133 rt_cache_flush(net
, flush_delay
);
3140 static ctl_table ipv4_route_table
[] = {
3142 .procname
= "gc_thresh",
3143 .data
= &ipv4_dst_ops
.gc_thresh
,
3144 .maxlen
= sizeof(int),
3146 .proc_handler
= proc_dointvec
,
3149 .procname
= "max_size",
3150 .data
= &ip_rt_max_size
,
3151 .maxlen
= sizeof(int),
3153 .proc_handler
= proc_dointvec
,
3156 /* Deprecated. Use gc_min_interval_ms */
3158 .procname
= "gc_min_interval",
3159 .data
= &ip_rt_gc_min_interval
,
3160 .maxlen
= sizeof(int),
3162 .proc_handler
= proc_dointvec_jiffies
,
3165 .procname
= "gc_min_interval_ms",
3166 .data
= &ip_rt_gc_min_interval
,
3167 .maxlen
= sizeof(int),
3169 .proc_handler
= proc_dointvec_ms_jiffies
,
3172 .procname
= "gc_timeout",
3173 .data
= &ip_rt_gc_timeout
,
3174 .maxlen
= sizeof(int),
3176 .proc_handler
= proc_dointvec_jiffies
,
3179 .procname
= "gc_interval",
3180 .data
= &ip_rt_gc_interval
,
3181 .maxlen
= sizeof(int),
3183 .proc_handler
= proc_dointvec_jiffies
,
3186 .procname
= "gc_interval",
3187 .data
= &ip_rt_gc_interval
,
3188 .maxlen
= sizeof(int),
3190 .proc_handler
= proc_dointvec_jiffies
,
3193 .procname
= "redirect_load",
3194 .data
= &ip_rt_redirect_load
,
3195 .maxlen
= sizeof(int),
3197 .proc_handler
= proc_dointvec
,
3200 .procname
= "redirect_number",
3201 .data
= &ip_rt_redirect_number
,
3202 .maxlen
= sizeof(int),
3204 .proc_handler
= proc_dointvec
,
3207 .procname
= "redirect_silence",
3208 .data
= &ip_rt_redirect_silence
,
3209 .maxlen
= sizeof(int),
3211 .proc_handler
= proc_dointvec
,
3214 .procname
= "error_cost",
3215 .data
= &ip_rt_error_cost
,
3216 .maxlen
= sizeof(int),
3218 .proc_handler
= proc_dointvec
,
3221 .procname
= "error_burst",
3222 .data
= &ip_rt_error_burst
,
3223 .maxlen
= sizeof(int),
3225 .proc_handler
= proc_dointvec
,
3228 .procname
= "gc_elasticity",
3229 .data
= &ip_rt_gc_elasticity
,
3230 .maxlen
= sizeof(int),
3232 .proc_handler
= proc_dointvec
,
3235 .procname
= "mtu_expires",
3236 .data
= &ip_rt_mtu_expires
,
3237 .maxlen
= sizeof(int),
3239 .proc_handler
= proc_dointvec_jiffies
,
3242 .procname
= "min_pmtu",
3243 .data
= &ip_rt_min_pmtu
,
3244 .maxlen
= sizeof(int),
3246 .proc_handler
= proc_dointvec
,
3249 .procname
= "min_adv_mss",
3250 .data
= &ip_rt_min_advmss
,
3251 .maxlen
= sizeof(int),
3253 .proc_handler
= proc_dointvec
,
3258 static struct ctl_table empty
[1];
3260 static struct ctl_table ipv4_skeleton
[] =
3262 { .procname
= "route",
3263 .mode
= 0555, .child
= ipv4_route_table
},
3264 { .procname
= "neigh",
3265 .mode
= 0555, .child
= empty
},
3269 static __net_initdata
struct ctl_path ipv4_path
[] = {
3270 { .procname
= "net", },
3271 { .procname
= "ipv4", },
3275 static struct ctl_table ipv4_route_flush_table
[] = {
3277 .procname
= "flush",
3278 .maxlen
= sizeof(int),
3280 .proc_handler
= ipv4_sysctl_rtcache_flush
,
3285 static __net_initdata
struct ctl_path ipv4_route_path
[] = {
3286 { .procname
= "net", },
3287 { .procname
= "ipv4", },
3288 { .procname
= "route", },
3292 static __net_init
int sysctl_route_net_init(struct net
*net
)
3294 struct ctl_table
*tbl
;
3296 tbl
= ipv4_route_flush_table
;
3297 if (!net_eq(net
, &init_net
)) {
3298 tbl
= kmemdup(tbl
, sizeof(ipv4_route_flush_table
), GFP_KERNEL
);
3302 tbl
[0].extra1
= net
;
3304 net
->ipv4
.route_hdr
=
3305 register_net_sysctl_table(net
, ipv4_route_path
, tbl
);
3306 if (net
->ipv4
.route_hdr
== NULL
)
3311 if (tbl
!= ipv4_route_flush_table
)
3317 static __net_exit
void sysctl_route_net_exit(struct net
*net
)
3319 struct ctl_table
*tbl
;
3321 tbl
= net
->ipv4
.route_hdr
->ctl_table_arg
;
3322 unregister_net_sysctl_table(net
->ipv4
.route_hdr
);
3323 BUG_ON(tbl
== ipv4_route_flush_table
);
3327 static __net_initdata
struct pernet_operations sysctl_route_ops
= {
3328 .init
= sysctl_route_net_init
,
3329 .exit
= sysctl_route_net_exit
,
3333 static __net_init
int rt_genid_init(struct net
*net
)
3335 get_random_bytes(&net
->ipv4
.rt_genid
,
3336 sizeof(net
->ipv4
.rt_genid
));
3337 get_random_bytes(&net
->ipv4
.dev_addr_genid
,
3338 sizeof(net
->ipv4
.dev_addr_genid
));
3342 static __net_initdata
struct pernet_operations rt_genid_ops
= {
3343 .init
= rt_genid_init
,
3347 #ifdef CONFIG_IP_ROUTE_CLASSID
3348 struct ip_rt_acct __percpu
*ip_rt_acct __read_mostly
;
3349 #endif /* CONFIG_IP_ROUTE_CLASSID */
3351 static __initdata
unsigned long rhash_entries
;
3352 static int __init
set_rhash_entries(char *str
)
3356 rhash_entries
= simple_strtoul(str
, &str
, 0);
3359 __setup("rhash_entries=", set_rhash_entries
);
3361 int __init
ip_rt_init(void)
3365 #ifdef CONFIG_IP_ROUTE_CLASSID
3366 ip_rt_acct
= __alloc_percpu(256 * sizeof(struct ip_rt_acct
), __alignof__(struct ip_rt_acct
));
3368 panic("IP: failed to allocate ip_rt_acct\n");
3371 ipv4_dst_ops
.kmem_cachep
=
3372 kmem_cache_create("ip_dst_cache", sizeof(struct rtable
), 0,
3373 SLAB_HWCACHE_ALIGN
|SLAB_PANIC
, NULL
);
3375 ipv4_dst_blackhole_ops
.kmem_cachep
= ipv4_dst_ops
.kmem_cachep
;
3377 if (dst_entries_init(&ipv4_dst_ops
) < 0)
3378 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3380 if (dst_entries_init(&ipv4_dst_blackhole_ops
) < 0)
3381 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3383 rt_hash_table
= (struct rt_hash_bucket
*)
3384 alloc_large_system_hash("IP route cache",
3385 sizeof(struct rt_hash_bucket
),
3387 (totalram_pages
>= 128 * 1024) ?
3392 rhash_entries
? 0 : 512 * 1024);
3393 memset(rt_hash_table
, 0, (rt_hash_mask
+ 1) * sizeof(struct rt_hash_bucket
));
3394 rt_hash_lock_init();
3396 ipv4_dst_ops
.gc_thresh
= (rt_hash_mask
+ 1);
3397 ip_rt_max_size
= (rt_hash_mask
+ 1) * 16;
3402 INIT_DELAYED_WORK_DEFERRABLE(&expires_work
, rt_worker_func
);
3403 expires_ljiffies
= jiffies
;
3404 schedule_delayed_work(&expires_work
,
3405 net_random() % ip_rt_gc_interval
+ ip_rt_gc_interval
);
3407 if (ip_rt_proc_init())
3408 printk(KERN_ERR
"Unable to create route proc files\n");
3411 xfrm4_init(ip_rt_max_size
);
3413 rtnl_register(PF_INET
, RTM_GETROUTE
, inet_rtm_getroute
, NULL
);
3415 #ifdef CONFIG_SYSCTL
3416 register_pernet_subsys(&sysctl_route_ops
);
3418 register_pernet_subsys(&rt_genid_ops
);
3422 #ifdef CONFIG_SYSCTL
3424 * We really need to sanitize the damn ipv4 init order, then all
3425 * this nonsense will go away.
3427 void __init
ip_static_sysctl_init(void)
3429 register_sysctl_paths(ipv4_path
, ipv4_skeleton
);