2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
8 * Version: $Id: route.c,v 1.67 1999/05/08 20:00:20 davem Exp $
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
23 * Alan Cox : Super /proc >4K
24 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36 * Alan Cox : Aligned routing errors more closely with BSD
37 * our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
42 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Split to fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
56 * This program is free software; you can redistribute it and/or
57 * modify it under the terms of the GNU General Public License
58 * as published by the Free Software Foundation; either version
59 * 2 of the License, or (at your option) any later version.
62 #include <linux/config.h>
63 #include <asm/uaccess.h>
64 #include <asm/system.h>
65 #include <asm/bitops.h>
66 #include <linux/types.h>
67 #include <linux/kernel.h>
68 #include <linux/sched.h>
70 #include <linux/string.h>
71 #include <linux/socket.h>
72 #include <linux/sockios.h>
73 #include <linux/errno.h>
75 #include <linux/inet.h>
76 #include <linux/netdevice.h>
77 #include <linux/proc_fs.h>
78 #include <linux/init.h>
79 #include <linux/skbuff.h>
80 #include <linux/rtnetlink.h>
81 #include <linux/inetdevice.h>
82 #include <linux/igmp.h>
83 #include <linux/pkt_sched.h>
84 #include <linux/mroute.h>
85 #include <net/protocol.h>
87 #include <net/route.h>
89 #include <net/ip_fib.h>
94 #include <linux/sysctl.h>
/* Hard ceiling for any path MTU stored in a route; rt_set_nexthop()
 * clamps dst.pmtu to this value. */
97 #define IP_MAX_MTU 0xFFF0
/* Default expiry period for unused route cache entries. */
99 #define RT_GC_TIMEOUT (300*HZ)
/* Minimum/maximum delay (jiffies) before a requested cache flush
 * actually fires; consumed by rt_cache_flush(). */
101 int ip_rt_min_delay
= 2*HZ
;
102 int ip_rt_max_delay
= 10*HZ
;
/* GC pressure threshold; rt_garbage_collect() compares the dst cache
 * entry count against ipv4_dst_ops.gc_thresh. */
103 int ip_rt_gc_thresh
= RT_HASH_DIVISOR
;
/* Cache considered dangerously full beyond this many entries. */
104 int ip_rt_max_size
= RT_HASH_DIVISOR
*16;
/* Base expiry age for cache entries (see rt_check_expire()). */
105 int ip_rt_gc_timeout
= RT_GC_TIMEOUT
;
/* Period of the rt_periodic_timer expiry scan. */
106 int ip_rt_gc_interval
= 60*HZ
;
/* Minimum spacing between two full garbage-collection passes. */
107 int ip_rt_gc_min_interval
= 5*HZ
;
/* ICMP redirect rate limiting (ip_rt_send_redirect()): stop after
 * this many ignored redirects ... */
108 int ip_rt_redirect_number
= 9;
/* ... back off exponentially starting from this load ... */
109 int ip_rt_redirect_load
= HZ
/50;
/* ... and reset the backoff after this long without redirected
 * traffic. */
110 int ip_rt_redirect_silence
= ((HZ
/50) << (9+1));
/* Token cost/bucket size for rate-limiting ICMP errors (ip_error()). */
111 int ip_rt_error_cost
= HZ
;
112 int ip_rt_error_burst
= 5*HZ
;
/* Average permitted chain length before GC treats the cache as
 * oversized (rt_garbage_collect()). */
113 int ip_rt_gc_elasticity
= 8;
/* Lifetime of a learned (lowered) path MTU before it is re-probed. */
114 int ip_rt_mtu_expires
= 10*60*HZ
;
/* Deadline (jiffies) by which a pending delayed flush must run;
 * protected by rt_flush_lock in rt_cache_flush(). */
116 static unsigned long rt_deadline
= 0;
/* Debug helper: emit a routing message at KERN_DEBUG level. */
118 #define RTprint(a...) printk(KERN_DEBUG a)
120 static void rt_run_flush(unsigned long dummy
);
/* One-shot timer armed by rt_cache_flush(); fires rt_run_flush(). */
122 static struct timer_list rt_flush_timer
=
123 { NULL
, NULL
, 0, 0L, rt_run_flush
};
/* Periodic expiry timer, re-armed from rt_check_expire().  Its handler
 * slot is NULL here -- presumably installed during init, which is not
 * visible in this extraction; confirm against the full source. */
124 static struct timer_list rt_periodic_timer
=
125 { NULL
, NULL
, 0, 0L, NULL
};
128 * Interface to generic destination cache.
/* Forward declarations for the ipv4 dst_ops callbacks defined below. */
131 static struct dst_entry
* ipv4_dst_check(struct dst_entry
* dst
, u32
);
132 static struct dst_entry
* ipv4_dst_reroute(struct dst_entry
* dst
,
134 static struct dst_entry
* ipv4_negative_advice(struct dst_entry
*);
135 static void ipv4_link_failure(struct sk_buff
*skb
);
136 static int rt_garbage_collect(void);
/* dst_ops vtable for IPv4 routes.  NOTE(review): most initializer
 * fields (original lines 140-151) are missing from this extraction;
 * only the protocol tag and the negative_advice slot survive here. */
139 struct dst_ops ipv4_dst_ops
=
142 __constant_htons(ETH_P_IP
),
149 ipv4_negative_advice
,
/* IP TOS nibble -> packet scheduler priority map.  NOTE(review): most
 * of the 16 entries (original lines 154-175) are missing from this
 * extraction. */
153 __u8 ip_tos2prio
[16] = {
166 TC_PRIO_INTERACTIVE_BULK
,
168 TC_PRIO_INTERACTIVE_BULK
,
/* The route cache itself: hash buckets of rtable chains, guarded by
 * rt_hash_lock (read side in the lookup fast path, write side for
 * insertion and expiry). */
177 static struct rtable
*rt_hash_table
[RT_HASH_DIVISOR
];
178 static rwlock_t rt_hash_lock
= RW_LOCK_UNLOCKED
;
180 static int rt_intern_hash(unsigned hash
, struct rtable
* rth
, struct rtable
** res
);
182 static __inline__
unsigned rt_hash_code(u32 daddr
, u32 saddr
, u8 tos
)
184 unsigned hash
= ((daddr
&0xF0F0F0F0)>>4)|((daddr
&0x0F0F0F0F)<<4);
185 hash
= hash
^saddr
^tos
;
186 hash
= hash
^(hash
>>16);
187 return (hash
^(hash
>>8)) & 0xFF;
190 #ifdef CONFIG_PROC_FS
192 static int rt_cache_get_info(char *buffer
, char **start
, off_t offset
, int length
, int dummy
)
203 sprintf(buffer
,"%-127s\n", "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\tMetric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\tHHUptod\tSpecDst");
208 read_lock_bh(&rt_hash_lock
);
210 for (i
= 0; i
<RT_HASH_DIVISOR
; i
++) {
211 for (r
= rt_hash_table
[i
]; r
; r
= r
->u
.rt_next
) {
213 * Spin through entries until we are ready
221 sprintf(temp
, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
222 r
->u
.dst
.dev
? r
->u
.dst
.dev
->name
: "*",
223 (unsigned long)r
->rt_dst
,
224 (unsigned long)r
->rt_gateway
,
226 atomic_read(&r
->u
.dst
.use
),
227 atomic_read(&r
->u
.dst
.refcnt
),
229 (unsigned long)r
->rt_src
, (int)r
->u
.dst
.pmtu
,
231 (int)r
->u
.dst
.rtt
, r
->key
.tos
,
232 r
->u
.dst
.hh
? atomic_read(&r
->u
.dst
.hh
->hh_refcnt
) : -1,
233 r
->u
.dst
.hh
? (r
->u
.dst
.hh
->hh_output
== dev_queue_xmit
) : 0,
235 sprintf(buffer
+len
,"%-127s\n",temp
);
237 if (pos
>= offset
+length
)
243 read_unlock_bh(&rt_hash_lock
);
245 *start
= buffer
+len
-(pos
-offset
);
253 static __inline__
void rt_free(struct rtable
*rt
)
255 dst_free(&rt
->u
.dst
);
/* Drop a cache entry and free it.  NOTE(review): original lines
 * 259-260 are missing from this extraction; upstream kernels also
 * release the entry's reference (ip_rt_put) before dst_free() here --
 * confirm against the full source before relying on this text. */
258 static __inline__
void rt_drop(struct rtable
*rt
)
261 dst_free(&rt
->u
.dst
);
264 static __inline__
int rt_fast_clean(struct rtable
*rth
)
266 /* Kill broadcast/multicast entries very aggresively, if they
267 collide in hash table with more useful entries */
268 return ((rth
->rt_flags
&(RTCF_BROADCAST
|RTCF_MULTICAST
))
269 && rth
->key
.iif
&& rth
->u
.rt_next
);
272 static __inline__
int rt_valuable(struct rtable
*rth
)
274 return ((rth
->rt_flags
&(RTCF_REDIRECTED
|RTCF_NOTIFY
))
275 || rth
->u
.dst
.expires
);
/*
 * May this cache entry be expired?  tmo1 is the short age threshold
 * applied to "fast clean" (broadcast/multicast) entries, tmo2 the
 * longer one shielding valuable entries (redirects, pending expiry).
 * NOTE(review): the per-branch return statements (original lines
 * 279-295) are missing from this extraction; confirm the exact
 * results against the full source.
 */
278 static __inline__
int rt_may_expire(struct rtable
*rth
, int tmo1
, int tmo2
)
/* Entry still referenced: must not be expired. */
282 if (atomic_read(&rth
->u
.dst
.use
))
/* Hard expiry already passed: entry may always go. */
285 if (rth
->u
.dst
.expires
&& (long)(rth
->u
.dst
.expires
- jiffies
) <= 0)
/* Otherwise the age since last use decides. */
288 age
= jiffies
- rth
->u
.dst
.lastuse
;
289 if (age
<= tmo1
&& !rt_fast_clean(rth
))
291 if (age
<= tmo2
&& rt_valuable(rth
))
296 static void rt_check_expire(unsigned long dummy
)
300 struct rtable
*rth
, **rthp
;
301 unsigned long now
= jiffies
;
303 for (i
=0; i
<RT_HASH_DIVISOR
/5; i
++) {
304 unsigned tmo
= ip_rt_gc_timeout
;
306 rover
= (rover
+ 1) & (RT_HASH_DIVISOR
-1);
307 rthp
= &rt_hash_table
[rover
];
309 write_lock_bh(&rt_hash_lock
);
310 while ((rth
= *rthp
) != NULL
) {
311 if (rth
->u
.dst
.expires
) {
312 /* Entrie is expired even if it is in use */
313 if ((long)(now
- rth
->u
.dst
.expires
) <= 0) {
315 rthp
= &rth
->u
.rt_next
;
318 } else if (!rt_may_expire(rth
, tmo
, ip_rt_gc_timeout
)) {
320 rthp
= &rth
->u
.rt_next
;
325 * Cleanup aged off entries.
327 *rthp
= rth
->u
.rt_next
;
330 write_unlock_bh(&rt_hash_lock
);
332 /* Fallback loop breaker. */
333 if ((jiffies
- now
) > 0)
336 rt_periodic_timer
.expires
= now
+ ip_rt_gc_interval
;
337 add_timer(&rt_periodic_timer
);
/*
 * rt_flush_timer handler: empty the entire routing cache.  Each bucket
 * chain is detached under rt_hash_lock, then freed with the lock
 * dropped so freeing does not stall other CPUs.  NOTE(review): several
 * interior lines (locals, the per-entry free call, original lines
 * 341-346, 350-351, 354, 358-360, 362) are missing from this
 * extraction.
 */
340 static void rt_run_flush(unsigned long dummy
)
343 struct rtable
* rth
, * next
;
347 write_lock_bh(&rt_hash_lock
);
348 for (i
=0; i
<RT_HASH_DIVISOR
; i
++) {
/* Detach the whole chain, then release the lock while freeing it. */
349 rth
= rt_hash_table
[i
];
352 rt_hash_table
[i
] = NULL
;
353 write_unlock_bh(&rt_hash_lock
);
355 for (; rth
; rth
=next
) {
356 next
= rth
->u
.rt_next
;
357 rth
->u
.rt_next
= NULL
;
361 write_lock_bh(&rt_hash_lock
);
363 write_unlock_bh(&rt_hash_lock
);
/* Serializes rt_flush_timer re-arming and rt_deadline updates. */
366 static spinlock_t rt_flush_lock
= SPIN_LOCK_UNLOCKED
;
/*
 * Request a flush of the routing cache "delay" jiffies from now.
 * The delay is normalized to ip_rt_min_delay (the guarding condition,
 * original line 373, is missing from this extraction), and rt_deadline
 * together with ip_rt_max_delay bounds how far repeated requests may
 * keep postponing the flush.  NOTE(review): the immediate-flush path
 * and parts of the deadline arithmetic are missing here; confirm
 * against the full source.
 */
368 void rt_cache_flush(int delay
)
370 unsigned long now
= jiffies
;
/* Remember whether we were called from process context. */
371 int user_mode
= !in_interrupt();
374 delay
= ip_rt_min_delay
;
376 spin_lock_bh(&rt_flush_lock
);
378 if (del_timer(&rt_flush_timer
) && delay
> 0 && rt_deadline
) {
379 long tmo
= (long)(rt_deadline
- now
);
381 /* If flush timer is already running
382 and flush request is not immediate (delay > 0):
384 if deadline is not achieved, prolong timer to "delay",
385 otherwise fire it at deadline time.
388 if (user_mode
&& tmo
< ip_rt_max_delay
-ip_rt_min_delay
)
396 spin_unlock_bh(&rt_flush_lock
);
/* First delayed request: fix the latest time the flush may run. */
401 if (rt_deadline
== 0)
402 rt_deadline
= now
+ ip_rt_max_delay
;
404 rt_flush_timer
.expires
= now
+ delay
;
405 add_timer(&rt_flush_timer
);
406 spin_unlock_bh(&rt_flush_lock
);
410 Short description of GC goals.
412 We want to build algorithm, which will keep routing cache
413 at some equilibrium point, when number of aged off entries
414 is kept approximately equal to newly generated ones.
416 Current expiration strength is variable "expire".
417 We try to adjust it dynamically, so that if networking
418 is idle expires is large enough to keep enough of warm entries,
419 and when load increases it reduces to limit cache size.
422 static int rt_garbage_collect(void)
424 static unsigned expire
= RT_GC_TIMEOUT
;
425 static unsigned long last_gc
;
427 static int equilibrium
;
428 struct rtable
*rth
, **rthp
;
429 unsigned long now
= jiffies
;
433 * Garbage collection is pretty expensive,
434 * do not make it too frequently.
436 if (now
- last_gc
< ip_rt_gc_min_interval
&&
437 atomic_read(&ipv4_dst_ops
.entries
) < ip_rt_max_size
)
440 /* Calculate number of entries, which we want to expire now. */
441 goal
= atomic_read(&ipv4_dst_ops
.entries
) - RT_HASH_DIVISOR
*ip_rt_gc_elasticity
;
443 if (equilibrium
< ipv4_dst_ops
.gc_thresh
)
444 equilibrium
= ipv4_dst_ops
.gc_thresh
;
445 goal
= atomic_read(&ipv4_dst_ops
.entries
) - equilibrium
;
447 equilibrium
+= min(goal
/2, RT_HASH_DIVISOR
);
448 goal
= atomic_read(&ipv4_dst_ops
.entries
) - equilibrium
;
451 /* We are in dangerous area. Try to reduce cache really
454 goal
= max(goal
/2, RT_HASH_DIVISOR
);
455 equilibrium
= atomic_read(&ipv4_dst_ops
.entries
) - goal
;
458 if (now
- last_gc
>= ip_rt_gc_min_interval
)
469 write_lock_bh(&rt_hash_lock
);
470 for (i
=0, k
=rover
; i
<RT_HASH_DIVISOR
; i
++) {
471 unsigned tmo
= expire
;
473 k
= (k
+ 1) & (RT_HASH_DIVISOR
-1);
474 rthp
= &rt_hash_table
[k
];
475 while ((rth
= *rthp
) != NULL
) {
476 if (!rt_may_expire(rth
, tmo
, expire
)) {
478 rthp
= &rth
->u
.rt_next
;
481 *rthp
= rth
->u
.rt_next
;
482 rth
->u
.rt_next
= NULL
;
490 write_unlock_bh(&rt_hash_lock
);
495 /* Goal is not achieved. We stop process if:
497 - if expire reduced to zero. Otherwise, expire is halfed.
498 - if table is not full.
499 - if we are called from interrupt.
500 - jiffies check is just fallback/debug loop breaker.
501 We will not spin here for long time in any case.
508 #if RT_CACHE_DEBUG >= 2
509 printk(KERN_DEBUG
"expire>> %u %d %d %d\n", expire
, atomic_read(&ipv4_dst_ops
.entries
), goal
, i
);
512 if (atomic_read(&ipv4_dst_ops
.entries
) < ip_rt_max_size
)
514 } while (!in_interrupt() && jiffies
- now
< 1);
516 if (atomic_read(&ipv4_dst_ops
.entries
) < ip_rt_max_size
)
519 printk("dst cache overflow\n");
523 expire
+= ip_rt_gc_min_interval
;
524 if (expire
> ip_rt_gc_timeout
||
525 atomic_read(&ipv4_dst_ops
.entries
) < ipv4_dst_ops
.gc_thresh
)
526 expire
= ip_rt_gc_timeout
;
527 #if RT_CACHE_DEBUG >= 2
528 printk(KERN_DEBUG
"expire++ %u %d %d %d\n", expire
, atomic_read(&ipv4_dst_ops
.entries
), goal
, rover
);
533 static int rt_intern_hash(unsigned hash
, struct rtable
* rt
, struct rtable
** rp
)
535 struct rtable
*rth
, **rthp
;
536 unsigned long now
= jiffies
;
537 int attempts
= !in_interrupt();
540 rthp
= &rt_hash_table
[hash
];
542 write_lock_bh(&rt_hash_lock
);
543 while ((rth
= *rthp
) != NULL
) {
544 if (memcmp(&rth
->key
, &rt
->key
, sizeof(rt
->key
)) == 0) {
546 *rthp
= rth
->u
.rt_next
;
547 rth
->u
.rt_next
= rt_hash_table
[hash
];
548 rt_hash_table
[hash
] = rth
;
550 atomic_inc(&rth
->u
.dst
.refcnt
);
551 atomic_inc(&rth
->u
.dst
.use
);
552 rth
->u
.dst
.lastuse
= now
;
553 write_unlock_bh(&rt_hash_lock
);
560 rthp
= &rth
->u
.rt_next
;
563 /* Try to bind route to arp only if it is output
564 route or unicast forwarding path.
566 if (rt
->rt_type
== RTN_UNICAST
|| rt
->key
.iif
== 0) {
567 if (!arp_bind_neighbour(&rt
->u
.dst
)) {
568 write_unlock_bh(&rt_hash_lock
);
570 /* Neighbour tables are full and nothing
571 can be released. Try to shrink route cache,
572 it is most likely it holds some neighbour records.
574 if (attempts
-- > 0) {
575 int saved_elasticity
= ip_rt_gc_elasticity
;
576 int saved_int
= ip_rt_gc_min_interval
;
577 ip_rt_gc_elasticity
= 1;
578 ip_rt_gc_min_interval
= 0;
579 rt_garbage_collect();
580 ip_rt_gc_min_interval
= saved_int
;
581 ip_rt_gc_elasticity
= saved_elasticity
;
587 printk("neighbour table overflow\n");
592 rt
->u
.rt_next
= rt_hash_table
[hash
];
593 #if RT_CACHE_DEBUG >= 2
596 printk("rt_cache @%02x: %08x", hash
, rt
->rt_dst
);
597 for (trt
=rt
->u
.rt_next
; trt
; trt
=trt
->u
.rt_next
)
598 printk(" . %08x", trt
->rt_dst
);
602 rt_hash_table
[hash
] = rt
;
604 write_unlock_bh(&rt_hash_lock
);
608 void ip_rt_redirect(u32 old_gw
, u32 daddr
, u32 new_gw
,
609 u32 saddr
, u8 tos
, struct device
*dev
)
612 struct in_device
*in_dev
= dev
->ip_ptr
;
613 struct rtable
*rth
, **rthp
;
614 u32 skeys
[2] = { saddr
, 0 };
615 int ikeys
[2] = { dev
->ifindex
, 0 };
617 tos
&= IPTOS_TOS_MASK
;
622 if (new_gw
== old_gw
|| !IN_DEV_RX_REDIRECTS(in_dev
)
623 || MULTICAST(new_gw
) || BADCLASS(new_gw
) || ZERONET(new_gw
))
624 goto reject_redirect
;
626 if (!IN_DEV_SHARED_MEDIA(in_dev
)) {
627 if (!inet_addr_onlink(in_dev
, new_gw
, old_gw
))
628 goto reject_redirect
;
629 if (IN_DEV_SEC_REDIRECTS(in_dev
) && ip_fib_check_default(new_gw
, dev
))
630 goto reject_redirect
;
632 if (inet_addr_type(new_gw
) != RTN_UNICAST
)
633 goto reject_redirect
;
636 for (i
=0; i
<2; i
++) {
637 for (k
=0; k
<2; k
++) {
638 unsigned hash
= rt_hash_code(daddr
, skeys
[i
]^(ikeys
[k
]<<5), tos
);
640 rthp
=&rt_hash_table
[hash
];
642 write_lock_bh(&rt_hash_lock
);
643 while ( (rth
= *rthp
) != NULL
) {
646 if (rth
->key
.dst
!= daddr
||
647 rth
->key
.src
!= skeys
[i
] ||
648 rth
->key
.tos
!= tos
||
649 rth
->key
.oif
!= ikeys
[k
] ||
651 rthp
= &rth
->u
.rt_next
;
655 if (rth
->rt_dst
!= daddr
||
656 rth
->rt_src
!= saddr
||
658 rth
->rt_gateway
!= old_gw
||
659 rth
->u
.dst
.dev
!= dev
)
662 dst_clone(&rth
->u
.dst
);
664 rt
= dst_alloc(sizeof(struct rtable
), &ipv4_dst_ops
);
667 write_unlock_bh(&rt_hash_lock
);
672 * Copy all the information.
675 atomic_set(&rt
->u
.dst
.refcnt
, 1);
676 atomic_set(&rt
->u
.dst
.use
, 1);
677 rt
->u
.dst
.lastuse
= jiffies
;
678 rt
->u
.dst
.neighbour
= NULL
;
681 rt
->rt_flags
|= RTCF_REDIRECTED
;
683 /* Gateway is different ... */
684 rt
->rt_gateway
= new_gw
;
686 /* Redirect received -> path was valid */
687 dst_confirm(&rth
->u
.dst
);
689 if (!arp_bind_neighbour(&rt
->u
.dst
) ||
690 !(rt
->u
.dst
.neighbour
->nud_state
&NUD_VALID
)) {
691 if (rt
->u
.dst
.neighbour
)
692 neigh_event_send(rt
->u
.dst
.neighbour
, NULL
);
698 *rthp
= rth
->u
.rt_next
;
699 if (!rt_intern_hash(hash
, rt
, &rt
))
704 write_unlock_bh(&rt_hash_lock
);
710 #ifdef CONFIG_IP_ROUTE_VERBOSE
711 if (IN_DEV_LOG_MARTIANS(in_dev
) && net_ratelimit())
712 printk(KERN_INFO
"Redirect from %lX/%s to %lX ignored."
713 "Path = %lX -> %lX, tos %02x\n",
714 ntohl(old_gw
), dev
->name
, ntohl(new_gw
),
715 ntohl(saddr
), ntohl(daddr
), tos
);
719 static struct dst_entry
*ipv4_negative_advice(struct dst_entry
*dst
)
721 struct rtable
*rt
= (struct rtable
*)dst
;
728 if ((rt
->rt_flags
&RTCF_REDIRECTED
) || rt
->u
.dst
.expires
) {
729 unsigned hash
= rt_hash_code(rt
->key
.dst
, rt
->key
.src
^(rt
->key
.oif
<<5), rt
->key
.tos
);
730 struct rtable
**rthp
;
731 #if RT_CACHE_DEBUG >= 1
732 printk(KERN_DEBUG
"ip_rt_advice: redirect to %d.%d.%d.%d/%02x dropped\n", NIPQUAD(rt
->rt_dst
), rt
->key
.tos
);
735 write_lock_bh(&rt_hash_lock
);
736 for (rthp
= &rt_hash_table
[hash
]; *rthp
; rthp
= &(*rthp
)->u
.rt_next
) {
738 *rthp
= rt
->u
.rt_next
;
743 write_unlock_bh(&rt_hash_lock
);
752 * 1. The first ip_rt_redirect_number redirects are sent
753 * with exponential backoff, then we stop sending them at all,
754 * assuming that the host ignores our redirects.
755 * 2. If we did not see packets requiring redirects
756 * during ip_rt_redirect_silence, we assume that the host
757 * forgot redirected route and start to send redirects again.
759 * This algorithm is much cheaper and more intelligent than dumb load limiting
762 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
763 * and "frag. need" (breaks PMTU discovery) in icmp.c.
766 void ip_rt_send_redirect(struct sk_buff
*skb
)
768 struct rtable
*rt
= (struct rtable
*)skb
->dst
;
769 struct in_device
*in_dev
= (struct in_device
*)rt
->u
.dst
.dev
->ip_ptr
;
771 if (!in_dev
|| !IN_DEV_TX_REDIRECTS(in_dev
))
774 /* No redirected packets during ip_rt_redirect_silence;
775 * reset the algorithm.
777 if (jiffies
- rt
->u
.dst
.rate_last
> ip_rt_redirect_silence
)
778 rt
->u
.dst
.rate_tokens
= 0;
780 /* Too many ignored redirects; do not send anything
781 * set u.dst.rate_last to the last seen redirected packet.
783 if (rt
->u
.dst
.rate_tokens
>= ip_rt_redirect_number
) {
784 rt
->u
.dst
.rate_last
= jiffies
;
788 /* Check for load limit; set rate_last to the latest sent
791 if (jiffies
- rt
->u
.dst
.rate_last
> (ip_rt_redirect_load
<<rt
->u
.dst
.rate_tokens
)) {
792 icmp_send(skb
, ICMP_REDIRECT
, ICMP_REDIR_HOST
, rt
->rt_gateway
);
793 rt
->u
.dst
.rate_last
= jiffies
;
794 ++rt
->u
.dst
.rate_tokens
;
795 #ifdef CONFIG_IP_ROUTE_VERBOSE
796 if (IN_DEV_LOG_MARTIANS(in_dev
) &&
797 rt
->u
.dst
.rate_tokens
== ip_rt_redirect_number
&& net_ratelimit())
798 printk(KERN_WARNING
"host %08x/if%d ignores redirects for %08x to %08x.\n",
799 rt
->rt_src
, rt
->rt_iif
, rt
->rt_dst
, rt
->rt_gateway
);
804 static int ip_error(struct sk_buff
*skb
)
806 struct rtable
*rt
= (struct rtable
*)skb
->dst
;
810 switch (rt
->u
.dst
.error
) {
816 code
= ICMP_HOST_UNREACH
;
819 code
= ICMP_NET_UNREACH
;
822 code
= ICMP_PKT_FILTERED
;
827 if ((rt
->u
.dst
.rate_tokens
+= (now
- rt
->u
.dst
.rate_last
)) > ip_rt_error_burst
)
828 rt
->u
.dst
.rate_tokens
= ip_rt_error_burst
;
829 rt
->u
.dst
.rate_last
= now
;
830 if (rt
->u
.dst
.rate_tokens
>= ip_rt_error_cost
) {
831 rt
->u
.dst
.rate_tokens
-= ip_rt_error_cost
;
832 icmp_send(skb
, ICMP_DEST_UNREACH
, code
, 0);
840 * The last two values are not from the RFC but
841 * are needed for AMPRnet AX.25 paths.
/* RFC 1191 MTU plateau table, in decreasing order. */
844 static unsigned short mtu_plateau
[] =
845 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
/*
 * Return the largest plateau value strictly below old_mtu -- the next
 * path MTU guess when a Frag-Needed ICMP carries no MTU value.
 * NOTE(review): the declaration of "i" and the final fallback return
 * for old_mtu <= 128 (original lines 848-850, 854-856) are missing
 * from this extraction.
 */
847 static __inline__
unsigned short guess_mtu(unsigned short old_mtu
)
851 for (i
= 0; i
< sizeof(mtu_plateau
)/sizeof(mtu_plateau
[0]); i
++)
852 if (old_mtu
> mtu_plateau
[i
])
853 return mtu_plateau
[i
];
857 unsigned short ip_rt_frag_needed(struct iphdr
*iph
, unsigned short new_mtu
)
860 unsigned short old_mtu
= ntohs(iph
->tot_len
);
862 u32 skeys
[2] = { iph
->saddr
, 0, };
863 u32 daddr
= iph
->daddr
;
864 u8 tos
= iph
->tos
& IPTOS_TOS_MASK
;
865 unsigned short est_mtu
= 0;
867 if (ipv4_config
.no_pmtu_disc
)
870 for (i
=0; i
<2; i
++) {
871 unsigned hash
= rt_hash_code(daddr
, skeys
[i
], tos
);
873 read_lock_bh(&rt_hash_lock
);
874 for (rth
= rt_hash_table
[hash
]; rth
; rth
= rth
->u
.rt_next
) {
875 if (rth
->key
.dst
== daddr
&&
876 rth
->key
.src
== skeys
[i
] &&
877 rth
->rt_dst
== daddr
&&
878 rth
->rt_src
== iph
->saddr
&&
879 rth
->key
.tos
== tos
&&
881 !(rth
->u
.dst
.mxlock
&(1<<RTAX_MTU
))) {
882 unsigned short mtu
= new_mtu
;
884 if (new_mtu
< 68 || new_mtu
>= old_mtu
) {
886 /* BSD 4.2 compatibility hack :-( */
887 if (mtu
== 0 && old_mtu
>= rth
->u
.dst
.pmtu
&&
888 old_mtu
>= 68 + (iph
->ihl
<<2))
889 old_mtu
-= iph
->ihl
<<2;
891 mtu
= guess_mtu(old_mtu
);
893 if (mtu
<= rth
->u
.dst
.pmtu
) {
894 if (mtu
< rth
->u
.dst
.pmtu
) {
895 dst_confirm(&rth
->u
.dst
);
896 rth
->u
.dst
.pmtu
= mtu
;
897 dst_set_expires(&rth
->u
.dst
, ip_rt_mtu_expires
);
903 read_unlock_bh(&rt_hash_lock
);
905 return est_mtu
? : new_mtu
;
/*
 * dst_ops hook: update the cached path MTU.  Only ever lowers the MTU,
 * rejects values below 68 (IPv4 minimum per RFC 791), and respects a
 * metrics lock on RTAX_MTU.  NOTE(review): the assignment of the new
 * mtu (original line 912) is missing from this extraction;
 * dst_set_expires() schedules the entry to re-probe after
 * ip_rt_mtu_expires jiffies.
 */
908 void ip_rt_update_pmtu(struct dst_entry
*dst
, unsigned mtu
)
910 if (dst
->pmtu
> mtu
&& mtu
>= 68 &&
911 !(dst
->mxlock
&(1<<RTAX_MTU
))) {
913 dst_set_expires(dst
, ip_rt_mtu_expires
);
917 static struct dst_entry
* ipv4_dst_check(struct dst_entry
* dst
, u32 cookie
)
923 static struct dst_entry
* ipv4_dst_reroute(struct dst_entry
* dst
,
/*
 * dst_ops link-failure hook: report host-unreachable to the sender and
 * expire the attached route immediately (dst_set_expires(..., 0)).
 * NOTE(review): the declaration of "rt" and the NULL check before the
 * dereference (original lines 930-932, 936) are missing from this
 * extraction.
 */
929 static void ipv4_link_failure(struct sk_buff
*skb
)
933 icmp_send(skb
, ICMP_DEST_UNREACH
, ICMP_HOST_UNREACH
, 0);
935 rt
= (struct rtable
*) skb
->dst
;
937 dst_set_expires(&rt
->u
.dst
, 0);
/*
 * Output handler installed on routes that must never be used for
 * transmission (e.g. multicast input routes, see ip_route_input_mc);
 * logs the offending addresses at KERN_DEBUG.  NOTE(review): the
 * kfree_skb()/return tail (original lines 944-946) is missing from
 * this extraction.
 */
940 static int ip_rt_bug(struct sk_buff
*skb
)
942 printk(KERN_DEBUG
"ip_rt_bug: %08x -> %08x, %s\n", skb
->nh
.iph
->saddr
,
943 skb
->nh
.iph
->daddr
, skb
->dev
? skb
->dev
->name
: "?");
949 We do not cache source address of outgoing interface,
950 because it is used only by IP RR, TS and SRR options,
951 so that it is out of the fast path.
953 BTW remember: "addr" is allowed to be not aligned
/*
 * Copy the preferred source address for this route into *addr (which
 * may be unaligned, hence the memcpy rather than a direct store).
 * For input routes, the FIB lookup's preferred source is used, with a
 * fallback to an address selected on the route's output device.
 * NOTE(review): the branch taken when iif == 0 (original line 963) and
 * the declaration of "src" are missing from this extraction.
 */
957 void ip_rt_get_source(u8
*addr
, struct rtable
*rt
)
960 struct fib_result res
;
962 if (rt
->key
.iif
== 0)
964 else if (fib_lookup(&rt
->key
, &res
) == 0)
965 src
= FIB_RES_PREFSRC(res
);
967 src
= inet_select_addr(rt
->u
.dst
.dev
, rt
->rt_gateway
, RT_SCOPE_UNIVERSE
);
968 memcpy(addr
, &src
, 4);
971 #ifdef CONFIG_NET_CLS_ROUTE
972 static void set_class_tag(struct rtable
*rt
, u32 tag
)
974 if (!(rt
->u
.dst
.tclassid
&0xFFFF))
975 rt
->u
.dst
.tclassid
|= tag
&0xFFFF;
976 if (!(rt
->u
.dst
.tclassid
&0xFFFF0000))
977 rt
->u
.dst
.tclassid
|= tag
&0xFFFF0000;
981 static void rt_set_nexthop(struct rtable
*rt
, struct fib_result
*res
, u32 itag
)
983 struct fib_info
*fi
= res
->fi
;
986 if (FIB_RES_GW(*res
) && FIB_RES_NH(*res
).nh_scope
== RT_SCOPE_LINK
)
987 rt
->rt_gateway
= FIB_RES_GW(*res
);
988 rt
->u
.dst
.mxlock
= fi
->fib_metrics
[RTAX_LOCK
-1];
989 rt
->u
.dst
.pmtu
= fi
->fib_mtu
;
990 if (fi
->fib_mtu
== 0) {
991 rt
->u
.dst
.pmtu
= rt
->u
.dst
.dev
->mtu
;
992 if (rt
->u
.dst
.pmtu
> IP_MAX_MTU
)
993 rt
->u
.dst
.pmtu
= IP_MAX_MTU
;
994 if (rt
->u
.dst
.mxlock
&(1<<RTAX_MTU
) &&
995 rt
->rt_gateway
!= rt
->rt_dst
&&
996 rt
->u
.dst
.pmtu
> 576)
997 rt
->u
.dst
.pmtu
= 576;
999 rt
->u
.dst
.window
= fi
->fib_window
? : 0;
1000 rt
->u
.dst
.rtt
= fi
->fib_rtt
? : TCP_TIMEOUT_INIT
;
1001 #ifdef CONFIG_NET_CLS_ROUTE
1002 rt
->u
.dst
.tclassid
= FIB_RES_NH(*res
).nh_tclassid
;
1005 rt
->u
.dst
.pmtu
= rt
->u
.dst
.dev
->mtu
;
1006 if (rt
->u
.dst
.pmtu
> IP_MAX_MTU
)
1007 rt
->u
.dst
.pmtu
= IP_MAX_MTU
;
1008 rt
->u
.dst
.window
= 0;
1009 rt
->u
.dst
.rtt
= TCP_TIMEOUT_INIT
;
1011 #ifdef CONFIG_NET_CLS_ROUTE
1012 #ifdef CONFIG_IP_MULTIPLE_TABLES
1013 set_class_tag(rt
, fib_rules_tclass(res
));
1015 set_class_tag(rt
, itag
);
1017 rt
->rt_type
= res
->type
;
1021 ip_route_input_mc(struct sk_buff
*skb
, u32 daddr
, u32 saddr
,
1022 u8 tos
, struct device
*dev
, int our
)
1027 struct in_device
*in_dev
= dev
->ip_ptr
;
1030 /* Primary sanity checks. */
1032 if (MULTICAST(saddr
) || BADCLASS(saddr
) || LOOPBACK(saddr
) ||
1033 in_dev
== NULL
|| skb
->protocol
!= __constant_htons(ETH_P_IP
))
1036 if (ZERONET(saddr
)) {
1037 if (!LOCAL_MCAST(daddr
))
1039 spec_dst
= inet_select_addr(dev
, 0, RT_SCOPE_LINK
);
1040 } else if (fib_validate_source(saddr
, 0, tos
, 0, dev
, &spec_dst
, &itag
) < 0)
1043 rth
= dst_alloc(sizeof(struct rtable
), &ipv4_dst_ops
);
1047 rth
->u
.dst
.output
= ip_rt_bug
;
1049 atomic_set(&rth
->u
.dst
.use
, 1);
1050 rth
->key
.dst
= daddr
;
1051 rth
->rt_dst
= daddr
;
1053 #ifdef CONFIG_IP_ROUTE_FWMARK
1054 rth
->key
.fwmark
= skb
->fwmark
;
1056 rth
->key
.src
= saddr
;
1057 rth
->rt_src
= saddr
;
1058 #ifdef CONFIG_IP_ROUTE_NAT
1059 rth
->rt_dst_map
= daddr
;
1060 rth
->rt_src_map
= saddr
;
1062 #ifdef CONFIG_NET_CLS_ROUTE
1063 rth
->u
.dst
.tclassid
= itag
;
1066 rth
->key
.iif
= dev
->ifindex
;
1067 rth
->u
.dst
.dev
= &loopback_dev
;
1069 rth
->rt_gateway
= daddr
;
1070 rth
->rt_spec_dst
= spec_dst
;
1071 rth
->rt_type
= RTN_MULTICAST
;
1072 rth
->rt_flags
= RTCF_MULTICAST
;
1074 rth
->u
.dst
.input
= ip_local_deliver
;
1075 rth
->rt_flags
|= RTCF_LOCAL
;
1078 #ifdef CONFIG_IP_MROUTE
1079 if (!LOCAL_MCAST(daddr
) && IN_DEV_MFORWARD(in_dev
))
1080 rth
->u
.dst
.input
= ip_mr_input
;
1083 hash
= rt_hash_code(daddr
, saddr
^(dev
->ifindex
<<5), tos
);
1084 return rt_intern_hash(hash
, rth
, (struct rtable
**)&skb
->dst
);
1088 * NOTE. We drop all the packets that has local source
1089 * addresses, because every properly looped back packet
1090 * must have correct destination already attached by output routine.
1092 * Such approach solves two big problems:
1093 * 1. Not simplex devices are handled properly.
1094 * 2. IP spoofing attempts are filtered with 100% of guarantee.
1097 int ip_route_input_slow(struct sk_buff
*skb
, u32 daddr
, u32 saddr
,
1098 u8 tos
, struct device
*dev
)
1101 struct fib_result res
;
1102 struct in_device
*in_dev
= dev
->ip_ptr
;
1103 struct in_device
*out_dev
;
1106 struct rtable
* rth
;
1112 * IP on this device is disabled.
1121 #ifdef CONFIG_IP_ROUTE_FWMARK
1122 key
.fwmark
= skb
->fwmark
;
1124 key
.iif
= dev
->ifindex
;
1126 key
.scope
= RT_SCOPE_UNIVERSE
;
1128 hash
= rt_hash_code(daddr
, saddr
^(key
.iif
<<5), tos
);
1130 /* Check for the most weird martians, which can be not detected
1134 if (MULTICAST(saddr
) || BADCLASS(saddr
) || LOOPBACK(saddr
))
1135 goto martian_source
;
1137 if (daddr
== 0xFFFFFFFF || (saddr
== 0 && daddr
== 0))
1140 /* Accept zero addresses only to limited broadcast;
1141 * I even do not know to fix it or not. Waiting for complains :-)
1144 goto martian_source
;
1146 if (BADCLASS(daddr
) || ZERONET(daddr
) || LOOPBACK(daddr
))
1147 goto martian_destination
;
1150 * Now we are ready to route packet.
1152 if ((err
= fib_lookup(&key
, &res
))) {
1153 if (!IN_DEV_FORWARD(in_dev
))
1158 #ifdef CONFIG_IP_ROUTE_NAT
1159 /* Policy is applied before mapping destination,
1160 but rerouting after map should be made with old source.
1164 u32 src_map
= saddr
;
1166 src_map
= fib_rules_policy(saddr
, &res
, &flags
);
1168 if (res
.type
== RTN_NAT
) {
1169 key
.dst
= fib_rules_map_destination(daddr
, &res
);
1170 if (fib_lookup(&key
, &res
) || res
.type
!= RTN_UNICAST
)
1178 if (res
.type
== RTN_BROADCAST
)
1181 if (res
.type
== RTN_LOCAL
) {
1183 result
= fib_validate_source(saddr
, daddr
, tos
, loopback_dev
.ifindex
,
1184 dev
, &spec_dst
, &itag
);
1186 goto martian_source
;
1188 flags
|= RTCF_DIRECTSRC
;
1193 if (!IN_DEV_FORWARD(in_dev
))
1195 if (res
.type
!= RTN_UNICAST
)
1196 goto martian_destination
;
1198 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1199 if (res
.fi
->fib_nhs
> 1 && key
.oif
== 0)
1200 fib_select_multipath(&key
, &res
);
1202 out_dev
= FIB_RES_DEV(res
)->ip_ptr
;
1203 if (out_dev
== NULL
) {
1204 if (net_ratelimit())
1205 printk(KERN_CRIT
"Bug in ip_route_input_slow(). Please, report\n");
1209 err
= fib_validate_source(saddr
, daddr
, tos
, FIB_RES_OIF(res
), dev
, &spec_dst
, &itag
);
1211 goto martian_source
;
1214 flags
|= RTCF_DIRECTSRC
;
1216 if (out_dev
== in_dev
&& err
&& !(flags
&(RTCF_NAT
|RTCF_MASQ
)) &&
1217 (IN_DEV_SHARED_MEDIA(out_dev
)
1218 || inet_addr_onlink(out_dev
, saddr
, FIB_RES_GW(res
))))
1219 flags
|= RTCF_DOREDIRECT
;
1221 if (skb
->protocol
!= __constant_htons(ETH_P_IP
)) {
1222 /* Not IP (i.e. ARP). Do not create route, if it is
1223 * invalid for proxy arp. DNAT routes are always valid.
1225 if (out_dev
== in_dev
&& !(flags
&RTCF_DNAT
))
1229 rth
= dst_alloc(sizeof(struct rtable
), &ipv4_dst_ops
);
1233 atomic_set(&rth
->u
.dst
.use
, 1);
1234 rth
->key
.dst
= daddr
;
1235 rth
->rt_dst
= daddr
;
1237 #ifdef CONFIG_IP_ROUTE_FWMARK
1238 rth
->key
.fwmark
= skb
->fwmark
;
1240 rth
->key
.src
= saddr
;
1241 rth
->rt_src
= saddr
;
1242 rth
->rt_gateway
= daddr
;
1243 #ifdef CONFIG_IP_ROUTE_NAT
1244 rth
->rt_src_map
= key
.src
;
1245 rth
->rt_dst_map
= key
.dst
;
1246 if (flags
&RTCF_DNAT
)
1247 rth
->rt_gateway
= key
.dst
;
1250 rth
->key
.iif
= dev
->ifindex
;
1251 rth
->u
.dst
.dev
= out_dev
->dev
;
1253 rth
->rt_spec_dst
= spec_dst
;
1255 rth
->u
.dst
.input
= ip_forward
;
1256 rth
->u
.dst
.output
= ip_output
;
1258 rt_set_nexthop(rth
, &res
, itag
);
1260 rth
->rt_flags
= flags
;
1262 #ifdef CONFIG_NET_FASTROUTE
1263 if (netdev_fastroute
&& !(flags
&(RTCF_NAT
|RTCF_MASQ
|RTCF_DOREDIRECT
))) {
1264 struct device
*odev
= rth
->u
.dst
.dev
;
1266 dev
->accept_fastpath
&&
1267 odev
->mtu
>= dev
->mtu
&&
1268 dev
->accept_fastpath(dev
, &rth
->u
.dst
) == 0)
1269 rth
->rt_flags
|= RTCF_FAST
;
1273 return rt_intern_hash(hash
, rth
, (struct rtable
**)&skb
->dst
);
1276 if (skb
->protocol
!= __constant_htons(ETH_P_IP
))
1279 if (ZERONET(saddr
)) {
1280 spec_dst
= inet_select_addr(dev
, 0, RT_SCOPE_LINK
);
1282 err
= fib_validate_source(saddr
, 0, tos
, 0, dev
, &spec_dst
, &itag
);
1284 goto martian_source
;
1286 flags
|= RTCF_DIRECTSRC
;
1288 flags
|= RTCF_BROADCAST
;
1289 res
.type
= RTN_BROADCAST
;
1292 rth
= dst_alloc(sizeof(struct rtable
), &ipv4_dst_ops
);
1296 rth
->u
.dst
.output
= ip_rt_bug
;
1298 atomic_set(&rth
->u
.dst
.use
, 1);
1299 rth
->key
.dst
= daddr
;
1300 rth
->rt_dst
= daddr
;
1302 #ifdef CONFIG_IP_ROUTE_FWMARK
1303 rth
->key
.fwmark
= skb
->fwmark
;
1305 rth
->key
.src
= saddr
;
1306 rth
->rt_src
= saddr
;
1307 #ifdef CONFIG_IP_ROUTE_NAT
1308 rth
->rt_dst_map
= key
.dst
;
1309 rth
->rt_src_map
= key
.src
;
1311 #ifdef CONFIG_NET_CLS_ROUTE
1312 rth
->u
.dst
.tclassid
= itag
;
1315 rth
->key
.iif
= dev
->ifindex
;
1316 rth
->u
.dst
.dev
= &loopback_dev
;
1318 rth
->rt_gateway
= daddr
;
1319 rth
->rt_spec_dst
= spec_dst
;
1320 rth
->u
.dst
.input
= ip_local_deliver
;
1321 rth
->rt_flags
= flags
|RTCF_LOCAL
;
1322 if (res
.type
== RTN_UNREACHABLE
) {
1323 rth
->u
.dst
.input
= ip_error
;
1324 rth
->u
.dst
.error
= -err
;
1325 rth
->rt_flags
&= ~RTCF_LOCAL
;
1327 rth
->rt_type
= res
.type
;
1328 return rt_intern_hash(hash
, rth
, (struct rtable
**)&skb
->dst
);
1331 spec_dst
= inet_select_addr(dev
, 0, RT_SCOPE_UNIVERSE
);
1332 res
.type
= RTN_UNREACHABLE
;
1336 * Do not cache martian addresses: they should be logged (RFC1812)
1338 martian_destination
:
1339 #ifdef CONFIG_IP_ROUTE_VERBOSE
1340 if (IN_DEV_LOG_MARTIANS(in_dev
) && net_ratelimit())
1341 printk(KERN_WARNING
"martian destination %08x from %08x, dev %s\n", daddr
, saddr
, dev
->name
);
1346 #ifdef CONFIG_IP_ROUTE_VERBOSE
1347 if (IN_DEV_LOG_MARTIANS(in_dev
) && net_ratelimit()) {
1349 * RFC1812 recommenadtion, if source is martian,
1350 * the only hint is MAC header.
1352 printk(KERN_WARNING
"martian source %08x for %08x, dev %s\n", saddr
, daddr
, dev
->name
);
1353 if (dev
->hard_header_len
) {
1355 unsigned char *p
= skb
->mac
.raw
;
1356 printk(KERN_WARNING
"ll header:");
1357 for (i
=0; i
<dev
->hard_header_len
; i
++, p
++)
1358 printk(" %02x", *p
);
1366 int ip_route_input(struct sk_buff
*skb
, u32 daddr
, u32 saddr
,
1367 u8 tos
, struct device
*dev
)
1369 struct rtable
* rth
;
1371 int iif
= dev
->ifindex
;
1373 tos
&= IPTOS_TOS_MASK
;
1374 hash
= rt_hash_code(daddr
, saddr
^(iif
<<5), tos
);
1376 read_lock_bh(&rt_hash_lock
);
1377 for (rth
=rt_hash_table
[hash
]; rth
; rth
=rth
->u
.rt_next
) {
1378 if (rth
->key
.dst
== daddr
&&
1379 rth
->key
.src
== saddr
&&
1380 rth
->key
.iif
== iif
&&
1381 rth
->key
.oif
== 0 &&
1382 #ifdef CONFIG_IP_ROUTE_FWMARK
1383 rth
->key
.fwmark
== skb
->fwmark
&&
1385 rth
->key
.tos
== tos
) {
1386 rth
->u
.dst
.lastuse
= jiffies
;
1387 atomic_inc(&rth
->u
.dst
.use
);
1388 atomic_inc(&rth
->u
.dst
.refcnt
);
1389 read_unlock_bh(&rt_hash_lock
);
1390 skb
->dst
= (struct dst_entry
*)rth
;
1394 read_unlock_bh(&rt_hash_lock
);
1396 /* Multicast recognition logic is moved from route cache to here.
1397 The problem was that too many Ethernet cards have broken/missing
1398 hardware multicast filters :-( As result the host on multicasting
1399 network acquires a lot of useless route cache entries, sort of
1400 SDR messages from all the world. Now we try to get rid of them.
1401 Really, provided software IP multicast filter is organized
1402 reasonably (at least, hashed), it does not result in a slowdown
1403 comparing with route cache reject entries.
1404 Note, that multicast routers are not affected, because
1405 route cache entry is created eventually.
1407 if (MULTICAST(daddr
)) {
1408 int our
= ip_check_mc(dev
, daddr
);
1410 #ifdef CONFIG_IP_MROUTE
1411 && (LOCAL_MCAST(daddr
) || !dev
->ip_ptr
||
1412 !IN_DEV_MFORWARD((struct in_device
*)dev
->ip_ptr
))
1415 return ip_route_input_mc(skb
, daddr
, saddr
, tos
, dev
, our
);
1417 return ip_route_input_slow(skb
, daddr
, saddr
, tos
, dev
);
/*
 * Major route resolver routine.
 *
 * Builds a route for a locally generated packet (source/destination
 * selection, FIB lookup, flag classification) and inserts the result
 * into the route cache via rt_intern_hash().
 *
 * NOTE(review): this extract is lossy -- declarations (key, rth, flags,
 * err, hash), several closing braces, labels and goto targets were
 * dropped between the surviving lines; only comments are added here.
 */
int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif)
	struct fib_result res;
	struct device *dev_out = NULL;
#ifdef CONFIG_IP_TRANSPARENT_PROXY
	/* RTO_TPROXY bit in tos asks us to skip source validation. */
	u32 nochecksrc = (tos & RTO_TPROXY);
	/* Keep only real TOS bits plus the on-link pseudo flag. */
	tos &= IPTOS_TOS_MASK|RTO_ONLINK;
	key.tos = tos&IPTOS_TOS_MASK;
	key.iif = loopback_dev.ifindex;
	/* RTO_ONLINK restricts the lookup to directly connected scope. */
	key.scope = (tos&RTO_ONLINK) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	/* A caller-supplied source must be plain unicast. */
	if (MULTICAST(saddr) || BADCLASS(saddr) || ZERONET(saddr))
	/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
	dev_out = ip_dev_find(saddr);
#ifdef CONFIG_IP_TRANSPARENT_PROXY
	/* If address is not local, test for transparent proxy flag;
	   if address is local --- clear the flag.
	 */
	if (dev_out == NULL) {
		if (nochecksrc == 0 || inet_addr_type(saddr) != RTN_UNICAST)
		flags |= RTCF_TPROXY;
	if (dev_out == NULL)
	/* I removed check for oif == dev_out->oif here.
	   It was wrong by three reasons:
	   1. ip_dev_find(saddr) can return wrong iface, if saddr is
	      assigned to multiple interfaces.
	   2. Moreover, we are allowed to send packets with saddr
	      of another iface. --ANK
	 */
#ifdef CONFIG_IP_TRANSPARENT_PROXY
	    (MULTICAST(daddr) || daddr == 0xFFFFFFFF)) {
		/* Special hack: user can direct multicasts
		   and limited broadcast via necessary interface
		   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
		   This hack is not just for fun, it allows
		   vic,vat and friends to work.
		   They bind socket to loopback, set ttl to zero
		   and expect that it will work.
		   From the viewpoint of routing cache they are broken,
		   because we are not allowed to build multicast path
		   with loopback source addr (look, routing cache
		   cannot know, that ttl is zero, so that packet
		   will not leave this host and route is valid).
		   Luckily, this hack is good workaround.
		 */
		key.oif = dev_out->ifindex;
	/* oif was specified: pin the output device. */
	dev_out = dev_get_by_index(oif);
	if (dev_out == NULL)
	if (dev_out->ip_ptr == NULL)
		return -ENODEV; /* Wrong error code */
	if (LOCAL_MCAST(daddr) || daddr == 0xFFFFFFFF) {
		key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK);
	if (MULTICAST(daddr))
		key.src = inet_select_addr(dev_out, 0, key.scope);
		key.src = inet_select_addr(dev_out, 0, RT_SCOPE_HOST);
	/* Loopback fallback (surrounding condition elided in this
	 * extract -- presumably the "no destination given" case;
	 * TODO confirm against full source).
	 */
	key.dst = key.src = htonl(INADDR_LOOPBACK);
	dev_out = &loopback_dev;
	key.oif = loopback_dev.ifindex;
	res.type = RTN_LOCAL;
	flags |= RTCF_LOCAL;
	if (fib_lookup(&key, &res)) {
		/* Apparently, routing tables are wrong. Assume,
		   that the destination is on link.

		   Because we are allowed to send to iface
		   even if it has NO routes and NO assigned
		   addresses. When oif is specified, routing
		   tables are looked up with only one purpose:
		   to catch if destination is gatewayed, rather than
		   direct. Moreover, if MSG_DONTROUTE is set,
		   we send packet, ignoring both routing tables
		   and ifaddr state. --ANK

		   We could make it even if oif is unknown,
		   likely IPv6, but we do not.
		 */
		key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK);
		res.type = RTN_UNICAST;
	return -ENETUNREACH;
	if (res.type == RTN_NAT)
	if (res.type == RTN_LOCAL) {
		/* Local delivery: route through loopback. */
		dev_out = &loopback_dev;
		key.oif = dev_out->ifindex;
		flags |= RTCF_LOCAL;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && key.oif == 0)
		fib_select_multipath(&key, &res);
	/* Default route without forced oif: pick among defaults. */
	if (res.prefixlen==0 && res.type == RTN_UNICAST && key.oif == 0)
		fib_select_default(&key, &res);
	key.src = FIB_RES_PREFSRC(res);
	dev_out = FIB_RES_DEV(res);
	key.oif = dev_out->ifindex;
	/* Loopback source is only valid on the loopback device. */
	if (LOOPBACK(key.src) && !(dev_out->flags&IFF_LOOPBACK))
	/* Classify destination to choose flags and handlers. */
	if (key.dst == 0xFFFFFFFF)
		res.type = RTN_BROADCAST;
	else if (MULTICAST(key.dst))
		res.type = RTN_MULTICAST;
	else if (BADCLASS(key.dst) || ZERONET(key.dst))
	if (dev_out->flags&IFF_LOOPBACK)
		flags |= RTCF_LOCAL;
	if (res.type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST|RTCF_LOCAL;
	} else if (res.type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST|RTCF_LOCAL;
		if (!ip_check_mc(dev_out, daddr))
			flags &= ~RTCF_LOCAL;
		/* If multicast route do not exist use
		   default one, but do not gateway in this case.
		 */
		if (res.fi && res.prefixlen < 4)
	/* Allocate and populate the cache entry. */
	rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
	atomic_set(&rth->u.dst.use, 1);
	rth->key.dst = daddr;
	rth->key.src = saddr;
	rth->rt_dst = key.dst;
	rth->rt_src = key.src;
#ifdef CONFIG_IP_ROUTE_NAT
	rth->rt_dst_map = key.dst;
	rth->rt_src_map = key.src;
	rth->rt_iif = oif ? : dev_out->ifindex;
	rth->u.dst.dev = dev_out;
	rth->rt_gateway = key.dst;
	rth->rt_spec_dst = key.src;
	rth->u.dst.output=ip_output;
	if (flags&RTCF_LOCAL) {
		rth->u.dst.input = ip_local_deliver;
		rth->rt_spec_dst = key.dst;
	if (flags&(RTCF_BROADCAST|RTCF_MULTICAST)) {
		rth->rt_spec_dst = key.src;
		if (flags&RTCF_LOCAL && !(dev_out->flags&IFF_LOOPBACK))
			rth->u.dst.output = ip_mc_output;
#ifdef CONFIG_IP_MROUTE
		if (res.type == RTN_MULTICAST && dev_out->ip_ptr) {
			struct in_device *in_dev = dev_out->ip_ptr;
			if (IN_DEV_MFORWARD(in_dev) && !LOCAL_MCAST(daddr)) {
				rth->u.dst.input = ip_mr_input;
				rth->u.dst.output = ip_mc_output;
	rt_set_nexthop(rth, &res, 0);
	rth->rt_flags = flags;
	hash = rt_hash_code(daddr, saddr^(oif<<5), tos);
	return rt_intern_hash(hash, rth, rp);
1667 int ip_route_output(struct rtable
**rp
, u32 daddr
, u32 saddr
, u32 tos
, int oif
)
1672 hash
= rt_hash_code(daddr
, saddr
^(oif
<<5), tos
);
1674 read_lock_bh(&rt_hash_lock
);
1675 for (rth
=rt_hash_table
[hash
]; rth
; rth
=rth
->u
.rt_next
) {
1676 if (rth
->key
.dst
== daddr
&&
1677 rth
->key
.src
== saddr
&&
1678 rth
->key
.iif
== 0 &&
1679 rth
->key
.oif
== oif
&&
1680 #ifndef CONFIG_IP_TRANSPARENT_PROXY
1683 !((rth
->key
.tos
^tos
)&(IPTOS_TOS_MASK
|RTO_ONLINK
)) &&
1684 ((tos
&RTO_TPROXY
) || !(rth
->rt_flags
&RTCF_TPROXY
))
1687 rth
->u
.dst
.lastuse
= jiffies
;
1688 atomic_inc(&rth
->u
.dst
.use
);
1689 atomic_inc(&rth
->u
.dst
.refcnt
);
1690 read_unlock_bh(&rt_hash_lock
);
1695 read_unlock_bh(&rt_hash_lock
);
1697 return ip_route_output_slow(rp
, daddr
, saddr
, tos
, oif
);
1700 #ifdef CONFIG_RTNETLINK
/*
 * Fill one RTM_NEWROUTE netlink message describing the route cache
 * entry attached to skb->dst.
 *
 * NOTE(review): lossy extract -- the `struct rtmsg *r;` and
 * `struct rtattr *mx;` declarations, several conditional guards,
 * and the nlmsg_failure/rtattr_failure labels were dropped.
 */
static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int nowait)
	struct rtable *rt = (struct rtable*)skb->dst;
	struct nlmsghdr *nlh;
	unsigned char *b = skb->tail;	/* start of message, for length/trim */
	struct rta_cacheinfo ci;
#ifdef CONFIG_IP_MROUTE
	struct rtattr *eptr;
	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
	r = NLMSG_DATA(nlh);
	/* NLM_F_MULTI only for dump-style replies (nowait with real pid). */
	nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
	r->rtm_family = AF_INET;
	r->rtm_dst_len = 32;	/* cache entries are host routes */
	r->rtm_tos = rt->key.tos;
	r->rtm_table = RT_TABLE_MAIN;
	r->rtm_type = rt->rt_type;
	r->rtm_scope = RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	/* Keep only the upper flag bits and mark the route as cloned. */
	r->rtm_flags = (rt->rt_flags&~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;
	RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
	r->rtm_src_len = 32;
	RTA_PUT(skb, RTA_SRC, 4, &rt->key.src);
	RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
	if (rt->u.dst.tclassid)
		RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
	RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
	else if (rt->rt_src != rt->key.src)
		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
	if (rt->rt_dst != rt->rt_gateway)
		RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
	/* Nested RTA_METRICS attribute; patch its length afterwards. */
	mx = (struct rtattr*)skb->tail;
	RTA_PUT(skb, RTA_METRICS, 0, NULL);
	if (rt->u.dst.mxlock)
		RTA_PUT(skb, RTAX_LOCK, sizeof(unsigned), &rt->u.dst.mxlock);
	RTA_PUT(skb, RTAX_MTU, sizeof(unsigned), &rt->u.dst.pmtu);
	if (rt->u.dst.window)
		RTA_PUT(skb, RTAX_WINDOW, sizeof(unsigned), &rt->u.dst.window);
	RTA_PUT(skb, RTAX_RTT, sizeof(unsigned), &rt->u.dst.rtt);
	mx->rta_len = skb->tail - (u8*)mx;
	/* Drop the metrics attribute again if it stayed empty. */
	if (mx->rta_len == RTA_LENGTH(0))
		skb_trim(skb, (u8*)mx - skb->data);
	ci.rta_lastuse = jiffies - rt->u.dst.lastuse;
	ci.rta_used = atomic_read(&rt->u.dst.refcnt);
	ci.rta_clntref = atomic_read(&rt->u.dst.use);
	if (rt->u.dst.expires)
		ci.rta_expires = rt->u.dst.expires - jiffies;
	ci.rta_error = rt->u.dst.error;
#ifdef CONFIG_IP_MROUTE
	/* Remember where cacheinfo lands so mroute can patch rta_error. */
	eptr = (struct rtattr*)skb->tail;
	RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
#ifdef CONFIG_IP_MROUTE
	u32 dst = rt->rt_dst;
	/* Non-local multicast with an active mrouter: let ipmr report
	 * the forwarding state instead of the plain iif.
	 */
	if (MULTICAST(dst) && !LOCAL_MCAST(dst) && ipv4_devconf.mc_forwarding) {
		int err = ipmr_get_route(skb, r, nowait);
		if (err == -EMSGSIZE)
		((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
	RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->key.iif);
	nlh->nlmsg_len = skb->tail - b;
	/* Failure path: undo everything written since `b`. */
	skb_trim(skb, b - skb->data);
/*
 * RTM_GETROUTE handler: resolve the route described by the netlink
 * request (input path when RTA_IIF is given, output path otherwise)
 * and answer with a single RTM_NEWROUTE message.
 *
 * NOTE(review): lossy extract -- the dst/src/iif/oif/err/dev
 * declarations, NULL checks, branch structure and gotos were dropped.
 */
int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
	struct rtattr **rta = arg;
	struct rtmsg *rtm = NLMSG_DATA(nlh);
	struct rtable *rt = NULL;
	struct sk_buff *skb;
	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb->mac.raw = skb->data;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
	/* Pull addresses/interfaces out of the request attributes. */
	memcpy(&src, RTA_DATA(rta[RTA_SRC-1]), 4);
	memcpy(&dst, RTA_DATA(rta[RTA_DST-1]), 4);
	memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
	dev = dev_get_by_index(iif);
	skb->protocol = __constant_htons(ETH_P_IP);
	/* Input-path query: emulate a packet arriving on `dev`. */
	err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
	rt = (struct rtable*)skb->dst;
	if (!err && rt->u.dst.error)
		err = -rt->u.dst.error;
	memcpy(&oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
	/* Output-path query. */
	err = ip_route_output(&rt, dst, src, rtm->rtm_tos, oif);
	skb->dst = &rt->u.dst;
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;
	NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, RTM_NEWROUTE, 0);
	/* Unicast the answer back to the requesting socket. */
	err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
/*
 * Dump the whole route cache over netlink (NLM_F_DUMP).  cb->args[]
 * carries resume state between invocations; args[1] is the index
 * within a bucket (args[0] presumably the bucket itself -- the line
 * reading it was dropped by the extract; TODO confirm).
 *
 * NOTE(review): lossy extract -- s_h/h/idx/rt declarations, the code
 * saving resume state back into cb->args[], and the return were dropped.
 */
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
	s_idx = idx = cb->args[1];
	for (h=0; h < RT_HASH_DIVISOR; h++) {
		if (h < s_h) continue;	/* skip buckets already dumped */
		read_lock_bh(&rt_hash_lock);
		for (rt = rt_hash_table[h], idx = 0; rt; rt = rt->u.rt_next, idx++) {
			/* Hold the entry while rt_fill_info runs. */
			skb->dst = dst_clone(&rt->u.dst);
			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1) <= 0) {
				/* Answer skb full: drop our ref and stop. */
				dst_release(xchg(&skb->dst, NULL));
				read_unlock_bh(&rt_hash_lock);
			dst_release(xchg(&skb->dst, NULL));
		read_unlock_bh(&rt_hash_lock);
1908 #endif /* CONFIG_RTNETLINK */
/*
 * Hook invoked when a device's IP multicast state changes.
 * NOTE(review): the function body was elided by this extract --
 * cannot tell from here what it does; confirm against full source.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
#ifdef CONFIG_SYSCTL

/* Delay value written via the "flush" sysctl; consumed below. */
static int flush_delay;

/*
 * Handler for /proc/sys/net/ipv4/route/flush: parse the written
 * integer into flush_delay and flush the route cache with that delay.
 * NOTE(review): lossy extract -- the write-only guard and return
 * statements were dropped.
 */
int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
			      void *buffer, size_t *lenp)
	proc_dointvec(ctl, write, filp, buffer, lenp);
	rt_cache_flush(flush_delay);
/*
 * Sysctl table for /proc/sys/net/ipv4/route/*.
 * Entry layout: binary id, name, data pointer, size, mode, child,
 * proc handler.  Jiffies-valued knobs use proc_dointvec_jiffies.
 * NOTE(review): lossy extract -- several entries lost their handler
 * field/closing brace, and the {0} terminator is not visible.
 */
ctl_table ipv4_route_table[] = {
	/* write-only trigger: flushes the cache via the handler above */
	{NET_IPV4_ROUTE_FLUSH, "flush",
	 &flush_delay, sizeof(int), 0200, NULL,
	 &ipv4_sysctl_rtcache_flush},
	{NET_IPV4_ROUTE_MIN_DELAY, "min_delay",
	 &ip_rt_min_delay, sizeof(int), 0644, NULL,
	 &proc_dointvec_jiffies},
	{NET_IPV4_ROUTE_MAX_DELAY, "max_delay",
	 &ip_rt_max_delay, sizeof(int), 0644, NULL,
	 &proc_dointvec_jiffies},
	{NET_IPV4_ROUTE_GC_THRESH, "gc_thresh",
	 &ipv4_dst_ops.gc_thresh, sizeof(int), 0644, NULL,
	{NET_IPV4_ROUTE_MAX_SIZE, "max_size",
	 &ip_rt_max_size, sizeof(int), 0644, NULL,
	{NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval",
	 &ip_rt_gc_min_interval, sizeof(int), 0644, NULL,
	 &proc_dointvec_jiffies},
	{NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout",
	 &ip_rt_gc_timeout, sizeof(int), 0644, NULL,
	 &proc_dointvec_jiffies},
	{NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval",
	 &ip_rt_gc_interval, sizeof(int), 0644, NULL,
	 &proc_dointvec_jiffies},
	{NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load",
	 &ip_rt_redirect_load, sizeof(int), 0644, NULL,
	{NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number",
	 &ip_rt_redirect_number, sizeof(int), 0644, NULL,
	{NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence",
	 &ip_rt_redirect_silence, sizeof(int), 0644, NULL,
	{NET_IPV4_ROUTE_ERROR_COST, "error_cost",
	 &ip_rt_error_cost, sizeof(int), 0644, NULL,
	{NET_IPV4_ROUTE_ERROR_BURST, "error_burst",
	 &ip_rt_error_burst, sizeof(int), 0644, NULL,
	{NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity",
	 &ip_rt_gc_elasticity, sizeof(int), 0644, NULL,
	{NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires",
	 &ip_rt_mtu_expires, sizeof(int), 0644, NULL,
	 &proc_dointvec_jiffies},
#ifdef CONFIG_NET_CLS_ROUTE
/* Route-classifier accounting table, indexed by classid (0..255). */
struct ip_rt_acct ip_rt_acct[256];

#ifdef CONFIG_PROC_FS
/*
 * /proc/net/rt_acct read handler: copy the raw accounting array
 * into the caller's buffer.
 * NOTE(review): lossy extract -- the *eof/*start bookkeeping and the
 * returned length were dropped.
 */
static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
			   int length, int *eof, void *data)
	/* Clamp the request so it does not run past the table. */
	if (offset + length > sizeof(ip_rt_acct)) {
		length = sizeof(ip_rt_acct) - offset;
	memcpy(buffer, ((u8*)&ip_rt_acct)+offset, length);
2008 __initfunc(void ip_rt_init(void))
2010 #ifdef CONFIG_PROC_FS
2011 #ifdef CONFIG_NET_CLS_ROUTE
2012 struct proc_dir_entry
*ent
;
2017 rt_periodic_timer
.function
= rt_check_expire
;
2018 /* All the timers, started at system startup tend
2019 to synchronize. Perturb it a bit.
2021 rt_periodic_timer
.expires
= jiffies
+ net_random()%ip_rt_gc_interval
2022 + ip_rt_gc_interval
;
2023 add_timer(&rt_periodic_timer
);
2025 #ifdef CONFIG_PROC_FS
2026 proc_net_register(&(struct proc_dir_entry
) {
2027 PROC_NET_RTCACHE
, 8, "rt_cache",
2028 S_IFREG
| S_IRUGO
, 1, 0, 0,
2029 0, &proc_net_inode_operations
,
2032 #ifdef CONFIG_NET_CLS_ROUTE
2033 ent
= create_proc_entry("net/rt_acct", 0, 0);
2034 ent
->read_proc
= ip_rt_acct_read
;