/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Version:	$Id: route.c,v 1.69 1999/06/09 10:11:02 davem Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamping.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD;
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split into fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after a year in a coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#include <linux/config.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/ip_fib.h>
#include <linux/sysctl.h>
#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

int ip_rt_min_delay = 2*HZ;
int ip_rt_max_delay = 10*HZ;
int ip_rt_gc_thresh = RT_HASH_DIVISOR;
int ip_rt_max_size = RT_HASH_DIVISOR*16;
int ip_rt_gc_timeout = RT_GC_TIMEOUT;
int ip_rt_gc_interval = 60*HZ;
int ip_rt_gc_min_interval = 5*HZ;
int ip_rt_redirect_number = 9;
int ip_rt_redirect_load = HZ/50;
int ip_rt_redirect_silence = ((HZ/50) << (9+1));
int ip_rt_error_cost = HZ;
int ip_rt_error_burst = 5*HZ;
int ip_rt_gc_elasticity = 8;
int ip_rt_mtu_expires = 10*60*HZ;

static unsigned long rt_deadline = 0;
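/*
 * Illustrative arithmetic for the defaults above (assuming the usual
 * HZ of 100): ip_rt_gc_thresh starts at one entry per hash bucket
 * (RT_HASH_DIVISOR), and ip_rt_max_size allows sixteen entries per
 * bucket before the cache is considered full.  ip_rt_redirect_silence
 * is the base redirect interval (HZ/50 = 20ms) shifted left by
 * ip_rt_redirect_number+1 = 10 doublings, i.e. roughly 20.5 seconds
 * of quiet before the redirect backoff state is forgotten.
 */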
#define RTprint(a...)	printk(KERN_DEBUG a)

static void rt_run_flush(unsigned long dummy);

static struct timer_list rt_flush_timer =
	{ NULL, NULL, 0, 0L, rt_run_flush };
static struct timer_list rt_periodic_timer =
	{ NULL, NULL, 0, 0L, NULL };
/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry * dst, u32);
static struct dst_entry *ipv4_dst_reroute(struct dst_entry * dst,
					  struct sk_buff *skb);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *);
static void		 ipv4_link_failure(struct sk_buff *skb);
static int rt_garbage_collect(void);

struct dst_ops ipv4_dst_ops =
{
	AF_INET,
	__constant_htons(ETH_P_IP),
	RT_HASH_DIVISOR,

	rt_garbage_collect,
	ipv4_dst_check,
	ipv4_dst_reroute,
	NULL,
	ipv4_negative_advice,
	ipv4_link_failure,
};

__u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	TC_PRIO_FILLER,
	TC_PRIO_BESTEFFORT,
	TC_PRIO_FILLER,
	TC_PRIO_BULK,
	TC_PRIO_FILLER,
	TC_PRIO_BULK,
	TC_PRIO_FILLER,
	TC_PRIO_INTERACTIVE,
	TC_PRIO_FILLER,
	TC_PRIO_INTERACTIVE,
	TC_PRIO_FILLER,
	TC_PRIO_INTERACTIVE_BULK,
	TC_PRIO_FILLER,
	TC_PRIO_INTERACTIVE_BULK,
	TC_PRIO_FILLER
};
/* The locking scheme is rather straightforward:
 *
 * 1) A BH protected rwlock protects the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */

static struct rtable	*rt_hash_table[RT_HASH_DIVISOR];
static rwlock_t		 rt_hash_lock = RW_LOCK_UNLOCKED;

static int rt_intern_hash(unsigned hash, struct rtable *rth, struct rtable **res);
static __inline__ unsigned rt_hash_code(u32 daddr, u32 saddr, u8 tos)
{
	unsigned hash = ((daddr&0xF0F0F0F0)>>4)|((daddr&0x0F0F0F0F)<<4);
	hash = hash^saddr^tos;
	hash = hash^(hash>>16);
	return (hash^(hash>>8)) & 0xFF;
}
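/*
 * Worked example (illustration only): for daddr 10.0.0.1 (0x0A000001),
 * saddr 192.168.1.1 (0xC0A80101) and tos 0, the nibble swap of daddr
 * gives 0xA0000010; xor with saddr yields 0x60A80111; folding in
 * (hash>>16) gives 0x60A861B9; and the final byte fold returns
 * 0xB9^0x61 = 0xD8, i.e. the route lands in bucket 216 of the 256
 * buckets this hash can address.
 */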
#ifdef CONFIG_PROC_FS

static int rt_cache_get_info(char *buffer, char **start, off_t offset,
			     int length, int dummy)
{
	int len = 0;
	off_t pos = 0;
	char temp[129];
	struct rtable *r;
	int i;

	pos = 128;

	if (offset < 128) {
		sprintf(buffer,"%-127s\n", "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\tMetric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\tHHUptod\tSpecDst");
		len = 128;
	}

	read_lock_bh(&rt_hash_lock);

	for (i = 0; i < RT_HASH_DIVISOR; i++) {
		for (r = rt_hash_table[i]; r; r = r->u.rt_next) {
			/*
			 *	Spin through entries until we are ready
			 */
			pos += 128;

			if (pos <= offset) {
				len = 0;
				continue;
			}
			sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
				r->u.dst.dev ? r->u.dst.dev->name : "*",
				(unsigned long)r->rt_dst,
				(unsigned long)r->rt_gateway,
				r->rt_flags,
				atomic_read(&r->u.dst.use),
				atomic_read(&r->u.dst.refcnt),
				0,
				(unsigned long)r->rt_src, (int)r->u.dst.pmtu,
				r->u.dst.window,
				(int)r->u.dst.rtt, r->key.tos,
				r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
				r->u.dst.hh ? (r->u.dst.hh->hh_output == dev_queue_xmit) : 0,
				r->rt_spec_dst);
			sprintf(buffer+len,"%-127s\n",temp);
			len += 128;
			if (pos >= offset+length)
				goto done;
		}
	}

done:
	read_unlock_bh(&rt_hash_lock);

	*start = buffer+len-(pos-offset);
	len = pos-offset;
	if (len > length)
		len = length;
	return len;
}
#endif
static __inline__ void rt_free(struct rtable *rt)
{
	dst_free(&rt->u.dst);
}

static __inline__ void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	dst_free(&rt->u.dst);
}
static __inline__ int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return ((rth->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST))
		&& rth->key.iif && rth->u.rt_next);
}
static __inline__ int rt_valuable(struct rtable *rth)
{
	return ((rth->rt_flags&(RTCF_REDIRECTED|RTCF_NOTIFY))
		|| rth->u.dst.expires);
}
static __inline__ int rt_may_expire(struct rtable *rth, int tmo1, int tmo2)
{
	int age;

	if (atomic_read(&rth->u.dst.use))
		return 0;

	if (rth->u.dst.expires && (long)(rth->u.dst.expires - jiffies) <= 0)
		return 1;

	age = jiffies - rth->u.dst.lastuse;
	if (age <= tmo1 && !rt_fast_clean(rth))
		return 0;
	if (age <= tmo2 && rt_valuable(rth))
		return 0;
	return 1;
}
/* This runs via a timer and thus is always in BH context. */
static void rt_check_expire(unsigned long dummy)
{
	int i;
	static int rover;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;

	for (i = 0; i < RT_HASH_DIVISOR/5; i++) {
		unsigned tmo = ip_rt_gc_timeout;

		rover = (rover + 1) & (RT_HASH_DIVISOR-1);
		rthp = &rt_hash_table[rover];

		write_lock(&rt_hash_lock);
		while ((rth = *rthp) != NULL) {
			if (rth->u.dst.expires) {
				/* Entry is expired even if it is in use */
				if ((long)(now - rth->u.dst.expires) <= 0) {
					tmo >>= 1;
					rthp = &rth->u.rt_next;
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
				tmo >>= 1;
				rthp = &rth->u.rt_next;
				continue;
			}

			/*
			 * Cleanup aged off entries.
			 */
			*rthp = rth->u.rt_next;
			rt_free(rth);
		}
		write_unlock(&rt_hash_lock);

		/* Fallback loop breaker. */
		if ((jiffies - now) > 0)
			break;
	}
	rt_periodic_timer.expires = now + ip_rt_gc_interval;
	add_timer(&rt_periodic_timer);
}
/* This can run from both BH and non-BH contexts, the latter
 * in the case of a forced flush event.
 */
static void rt_run_flush(unsigned long dummy)
{
	int i;
	struct rtable *rth, *next;

	rt_deadline = 0;

	for (i = 0; i < RT_HASH_DIVISOR; i++) {
		write_lock_bh(&rt_hash_lock);
		rth = rt_hash_table[i];
		if (rth)
			rt_hash_table[i] = NULL;
		write_unlock_bh(&rt_hash_lock);

		for (; rth; rth = next) {
			next = rth->u.rt_next;
			rth->u.rt_next = NULL;
			rt_free(rth);
		}
	}
}
static spinlock_t rt_flush_lock = SPIN_LOCK_UNLOCKED;

void rt_cache_flush(int delay)
{
	unsigned long now = jiffies;
	int user_mode = !in_interrupt();

	if (delay < 0)
		delay = ip_rt_min_delay;

	spin_lock_bh(&rt_flush_lock);

	if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
		long tmo = (long)(rt_deadline - now);

		/* If flush timer is already running
		   and flush request is not immediate (delay > 0):

		   if deadline is not achieved, prolong timer to "delay",
		   otherwise fire it at deadline time.
		 */

		if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
			tmo = 0;

		if (delay > tmo)
			delay = tmo;
	}

	if (delay <= 0) {
		spin_unlock_bh(&rt_flush_lock);
		rt_run_flush(0);
		return;
	}

	if (rt_deadline == 0)
		rt_deadline = now + ip_rt_max_delay;

	rt_flush_timer.expires = now + delay;
	add_timer(&rt_flush_timer);
	spin_unlock_bh(&rt_flush_lock);
}
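/*
 * Illustrative timeline (with the default min_delay=2s, max_delay=10s):
 * the first rt_cache_flush(2*HZ) arms the timer and sets
 * rt_deadline = now+10s.  Later requests re-arm the timer, but "delay"
 * is clipped to the time remaining before rt_deadline, so a steady
 * stream of requests cannot postpone the real flush past the deadline.
 * From process context the clipping is stricter: once less than
 * max_delay-min_delay (8s) remains, tmo is zeroed and the flush runs
 * immediately.
 */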
/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when networking
   is idle expire is large enough to keep enough warm entries,
   and when load increases it shrinks to limit the cache size.
 */
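/*
 * Rough numbers for the feedback loop below (assuming the stock
 * RT_HASH_DIVISOR of 256 and the default ip_rt_gc_elasticity of 8):
 * hard GC pressure starts once the cache exceeds 256*8 = 2048 entries;
 * below that, "goal" is computed against the slowly adapting
 * "equilibrium", so an idle machine keeps its warm entries while a
 * busy one ratchets equilibrium upward and expires more aggressively.
 */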
static int rt_garbage_collect(void)
{
	static unsigned expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
	int goal;

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */
	if (now - last_gc < ip_rt_gc_min_interval &&
	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
		return 0;

	/* Calculate number of entries, which we want to expire now. */
	goal = atomic_read(&ipv4_dst_ops.entries) - RT_HASH_DIVISOR*ip_rt_gc_elasticity;
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		if (goal > 0) {
			equilibrium += min(goal/2, RT_HASH_DIVISOR);
			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		}
	} else {
		/* We are in a dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max(goal/2, RT_HASH_DIVISOR);
		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	do {
		int i, k;

		/* The write lock is held during the entire hash
		 * traversal to ensure consistent state of the rover.
		 */
		write_lock_bh(&rt_hash_lock);
		for (i = 0, k = rover; i < RT_HASH_DIVISOR; i++) {
			unsigned tmo = expire;

			k = (k + 1) & (RT_HASH_DIVISOR-1);
			rthp = &rt_hash_table[k];
			while ((rth = *rthp) != NULL) {
				if (!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->u.rt_next;
					continue;
				}
				*rthp = rth->u.rt_next;
				rth->u.rt_next = NULL;
				rt_free(rth);
				goal--;
			}
			if (goal <= 0)
				break;
		}
		rover = k;
		write_unlock_bh(&rt_hash_lock);

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop the process if:

		   - expire reduced to zero. Otherwise, expire is halved.
		   - the table is not full.
		   - we are called from interrupt.
		   - the jiffies check is just a fallback/debug loop breaker.
		     We will not spin here for a long time in any case.
		 */

		if (expire == 0)
			break;

		expire >>= 1;
#if RT_CACHE_DEBUG >= 2
		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire, atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif
		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
			return 0;
	} while (!in_interrupt() && jiffies - now < 1);

	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
		return 0;
	if (net_ratelimit())
		printk("dst cache overflow\n");
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire, atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
	return 0;
}
static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
	int attempts = !in_interrupt();

restart:
	rthp = &rt_hash_table[hash];

	write_lock_bh(&rt_hash_lock);
	while ((rth = *rthp) != NULL) {
		if (memcmp(&rth->key, &rt->key, sizeof(rt->key)) == 0) {
			/* Put it first */
			*rthp = rth->u.rt_next;
			rth->u.rt_next = rt_hash_table[hash];
			rt_hash_table[hash] = rth;

			atomic_inc(&rth->u.dst.refcnt);
			atomic_inc(&rth->u.dst.use);
			rth->u.dst.lastuse = now;
			write_unlock_bh(&rt_hash_lock);

			rt_drop(rt);
			*rp = rth;
			return 0;
		}

		rthp = &rth->u.rt_next;
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0) {
		if (!arp_bind_neighbour(&rt->u.dst)) {
			write_unlock_bh(&rt_hash_lock);

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity = 1;
				ip_rt_gc_min_interval = 0;
				rt_garbage_collect();
				ip_rt_gc_min_interval = saved_int;
				ip_rt_gc_elasticity = saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk("neighbour table overflow\n");
			rt_drop(rt);
			return -ENOBUFS;
		}
	}

	rt->u.rt_next = rt_hash_table[hash];
#if RT_CACHE_DEBUG >= 2
	if (rt->u.rt_next) {
		struct rtable *trt;
		printk("rt_cache @%02x: %08x", hash, rt->rt_dst);
		for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
			printk(" . %08x", trt->rt_dst);
		printk("\n");
	}
#endif
	rt_hash_table[hash] = rt;
	write_unlock_bh(&rt_hash_lock);
	*rp = rt;
	return 0;
}
void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
		    u32 saddr, u8 tos, struct device *dev)
{
	int i, k;
	struct in_device *in_dev = dev->ip_ptr;
	struct rtable *rth, **rthp;
	u32  skeys[2] = { saddr, 0 };
	int  ikeys[2] = { dev->ifindex, 0 };

	tos &= IPTOS_TOS_MASK;

	if (!in_dev)
		return;

	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
	    || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (i = 0; i < 2; i++) {
		for (k = 0; k < 2; k++) {
			unsigned hash = rt_hash_code(daddr, skeys[i]^(ikeys[k]<<5), tos);

			rthp = &rt_hash_table[hash];

			write_lock_bh(&rt_hash_lock);
			while ((rth = *rthp) != NULL) {
				struct rtable *rt;

				if (rth->key.dst != daddr ||
				    rth->key.src != skeys[i] ||
				    rth->key.tos != tos ||
				    rth->key.oif != ikeys[k] ||
				    rth->key.iif != 0) {
					rthp = &rth->u.rt_next;
					continue;
				}

				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->u.dst.error ||
				    rth->rt_gateway != old_gw ||
				    rth->u.dst.dev != dev)
					break;

				dst_clone(&rth->u.dst);

				rt = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
				if (rt == NULL) {
					ip_rt_put(rth);
					write_unlock_bh(&rt_hash_lock);
					return;
				}

				/*
				 * Copy all the information.
				 */
				*rt = *rth;
				atomic_set(&rt->u.dst.refcnt, 1);
				atomic_set(&rt->u.dst.use, 1);
				rt->u.dst.lastuse = jiffies;
				rt->u.dst.neighbour = NULL;
				rt->u.dst.hh = NULL;

				rt->rt_flags |= RTCF_REDIRECTED;

				/* Gateway is different ... */
				rt->rt_gateway = new_gw;

				/* Redirect received -> path was valid */
				dst_confirm(&rth->u.dst);

				if (!arp_bind_neighbour(&rt->u.dst) ||
				    !(rt->u.dst.neighbour->nud_state&NUD_VALID)) {
					if (rt->u.dst.neighbour)
						neigh_event_send(rt->u.dst.neighbour, NULL);
					ip_rt_put(rth);
					rt_drop(rt);
					break;
				}

				*rthp = rth->u.rt_next;
				write_unlock_bh(&rt_hash_lock);
				if (!rt_intern_hash(hash, rt, &rt))
					ip_rt_put(rt);
				rt_drop(rth);
				goto do_next;
			}
			write_unlock_bh(&rt_hash_lock);
		do_next:
			;
		}
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %lX/%s to %lX ignored."
		       "Path = %lX -> %lX, tos %02x\n",
		       ntohl(old_gw), dev->name, ntohl(new_gw),
		       ntohl(saddr), ntohl(daddr), tos);
#endif
}
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable*)dst;

	if (rt != NULL) {
		if (dst->obsolete) {
			ip_rt_put(rt);
			return NULL;
		}
		if ((rt->rt_flags&RTCF_REDIRECTED) || rt->u.dst.expires) {
			unsigned hash = rt_hash_code(rt->key.dst, rt->key.src^(rt->key.oif<<5), rt->key.tos);
			struct rtable **rthp;
#if RT_CACHE_DEBUG >= 1
			printk(KERN_DEBUG "ip_rt_advice: redirect to %d.%d.%d.%d/%02x dropped\n", NIPQUAD(rt->rt_dst), rt->key.tos);
#endif
			ip_rt_put(rt);
			write_lock_bh(&rt_hash_lock);
			for (rthp = &rt_hash_table[hash]; *rthp; rthp = &(*rthp)->u.rt_next) {
				if (*rthp == rt) {
					*rthp = rt->u.rt_next;
					rt_free(rt);
					break;
				}
			}
			write_unlock_bh(&rt_hash_lock);
			return NULL;
		}
	}
	return dst;
}
/* Algorithm:

 * 1. The first ip_rt_redirect_number redirects are sent
 *    with exponential backoff, then we stop sending them at all,
 *    assuming that the host ignores our redirects.
 * 2. If we did not see packets requiring redirects
 *    during ip_rt_redirect_silence, we assume that the host
 *    forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
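/*
 * Worked schedule (illustration, assuming HZ=100): ip_rt_redirect_load
 * is HZ/50 = 20ms, so successive redirects to one destination are
 * spaced 20ms, 40ms, 80ms, ... up to 20ms<<8 ~= 5.1s apart; after
 * ip_rt_redirect_number (9) unanswered redirects we go silent, and
 * only ip_rt_redirect_silence (20ms<<10 ~= 20.5s) without triggering
 * packets resets rate_tokens and re-enables sending.
 */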
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = (struct rtable*)skb->dst;
	struct in_device *in_dev = (struct in_device*)rt->u.dst.dev->ip_ptr;

	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev))
		return;

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (jiffies - rt->u.dst.rate_last > ip_rt_redirect_silence)
		rt->u.dst.rate_tokens = 0;

	/* Too many ignored redirects; do not send anything;
	 * set u.dst.rate_last to the last seen redirected packet.
	 */
	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
		rt->u.dst.rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (jiffies - rt->u.dst.rate_last > (ip_rt_redirect_load<<rt->u.dst.rate_tokens)) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		rt->u.dst.rate_last = jiffies;
		++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (IN_DEV_LOG_MARTIANS(in_dev) &&
		    rt->u.dst.rate_tokens == ip_rt_redirect_number && net_ratelimit())
			printk(KERN_WARNING "host %08x/if%d ignores redirects for %08x to %08x.\n",
			       rt->rt_src, rt->rt_iif, rt->rt_dst, rt->rt_gateway);
#endif
	}
}
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = (struct rtable*)skb->dst;
	unsigned long now;
	int code;

	switch (rt->u.dst.error) {
	case EINVAL:
	default:
		kfree_skb(skb);
		return 0;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	now = jiffies;
	if ((rt->u.dst.rate_tokens += (now - rt->u.dst.rate_last)) > ip_rt_error_burst)
		rt->u.dst.rate_tokens = ip_rt_error_burst;
	rt->u.dst.rate_last = now;
	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
		rt->u.dst.rate_tokens -= ip_rt_error_cost;
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
	}

	kfree_skb(skb);
	return 0;
}
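/*
 * Note on the token bucket above (default values, HZ=100): rate_tokens
 * accumulates elapsed jiffies, capped at ip_rt_error_burst (5*HZ), and
 * each ICMP error costs ip_rt_error_cost (HZ) tokens.  That allows a
 * burst of five ICMP errors followed by a sustained rate of at most
 * one per second for each cached route.
 */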
/*
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < sizeof(mtu_plateau)/sizeof(mtu_plateau[0]); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
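/*
 * Example: this is the RFC 1191 plateau search.  guess_mtu() returns
 * the first plateau strictly below old_mtu: old_mtu 1500 -> 1492,
 * 4352 -> 2002, 576 -> 296; anything at or below the last plateau
 * falls through to the IPv4 minimum of 68.
 */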
unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
{
	int i;
	unsigned short old_mtu = ntohs(iph->tot_len);
	struct rtable *rth;
	u32  skeys[2] = { iph->saddr, 0, };
	u32  daddr = iph->daddr;
	u8   tos = iph->tos & IPTOS_TOS_MASK;
	unsigned short est_mtu = 0;

	if (ipv4_config.no_pmtu_disc)
		return 0;

	for (i = 0; i < 2; i++) {
		unsigned hash = rt_hash_code(daddr, skeys[i], tos);

		read_lock_bh(&rt_hash_lock);
		for (rth = rt_hash_table[hash]; rth; rth = rth->u.rt_next) {
			if (rth->key.dst == daddr &&
			    rth->key.src == skeys[i] &&
			    rth->rt_dst == daddr &&
			    rth->rt_src == iph->saddr &&
			    rth->key.tos == tos &&
			    rth->key.iif == 0 &&
			    !(rth->u.dst.mxlock&(1<<RTAX_MTU))) {
				unsigned short mtu = new_mtu;

				if (new_mtu < 68 || new_mtu >= old_mtu) {

					/* BSD 4.2 compatibility hack :-( */
					if (mtu == 0 && old_mtu >= rth->u.dst.pmtu &&
					    old_mtu >= 68 + (iph->ihl<<2))
						old_mtu -= iph->ihl<<2;

					mtu = guess_mtu(old_mtu);
				}
				if (mtu <= rth->u.dst.pmtu) {
					if (mtu < rth->u.dst.pmtu) {
						dst_confirm(&rth->u.dst);
						rth->u.dst.pmtu = mtu;
						dst_set_expires(&rth->u.dst, ip_rt_mtu_expires);
					}
					est_mtu = mtu;
				}
			}
		}
		read_unlock_bh(&rt_hash_lock);
	}
	return est_mtu ? : new_mtu;
}
void ip_rt_update_pmtu(struct dst_entry *dst, unsigned mtu)
{
	if (dst->pmtu > mtu && mtu >= 68 &&
	    !(dst->mxlock&(1<<RTAX_MTU))) {
		dst->pmtu = mtu;
		dst_set_expires(dst, ip_rt_mtu_expires);
	}
}
static struct dst_entry *ipv4_dst_check(struct dst_entry * dst, u32 cookie)
{
	return NULL;
}

static struct dst_entry *ipv4_dst_reroute(struct dst_entry * dst,
					  struct sk_buff *skb)
{
	return NULL;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = (struct rtable *) skb->dst;
	if (rt)
		dst_set_expires(&rt->u.dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %08x -> %08x, %s\n", skb->nh.iph->saddr,
	       skb->nh.iph->daddr, skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	return 0;
}
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so that it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned.
 */

void ip_rt_get_source(u8 *addr, struct rtable *rt)
{
	u32 src;
	struct fib_result res;

	if (rt->key.iif == 0)
		src = rt->rt_src;
	else if (fib_lookup(&rt->key, &res) == 0)
		src = FIB_RES_PREFSRC(res);
	else
		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE);
	memcpy(addr, &src, 4);
}
#ifdef CONFIG_NET_CLS_ROUTE
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->u.dst.tclassid&0xFFFF))
		rt->u.dst.tclassid |= tag&0xFFFF;
	if (!(rt->u.dst.tclassid&0xFFFF0000))
		rt->u.dst.tclassid |= tag&0xFFFF0000;
}
#endif
static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
	struct fib_info *fi = res->fi;

	if (fi) {
		if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		rt->u.dst.mxlock = fi->fib_metrics[RTAX_LOCK-1];
		rt->u.dst.pmtu = fi->fib_mtu;
		if (fi->fib_mtu == 0) {
			rt->u.dst.pmtu = rt->u.dst.dev->mtu;
			if (rt->u.dst.pmtu > IP_MAX_MTU)
				rt->u.dst.pmtu = IP_MAX_MTU;
			if (rt->u.dst.mxlock&(1<<RTAX_MTU) &&
			    rt->rt_gateway != rt->rt_dst &&
			    rt->u.dst.pmtu > 576)
				rt->u.dst.pmtu = 576;
		}
		rt->u.dst.window = fi->fib_window ? : 0;
		rt->u.dst.rtt = fi->fib_rtt ? : TCP_TIMEOUT_INIT;
#ifdef CONFIG_NET_CLS_ROUTE
		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	} else {
		rt->u.dst.pmtu = rt->u.dst.dev->mtu;
		if (rt->u.dst.pmtu > IP_MAX_MTU)
			rt->u.dst.pmtu = IP_MAX_MTU;
		rt->u.dst.window = 0;
		rt->u.dst.rtt = TCP_TIMEOUT_INIT;
	}
#ifdef CONFIG_NET_CLS_ROUTE
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
	rt->rt_type = res->type;
}
static int
ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
		  u8 tos, struct device *dev, int our)
{
	unsigned hash;
	struct rtable *rth;
	u32 spec_dst;
	struct in_device *in_dev = dev->ip_ptr;
	u32 itag = 0;

	/* Primary sanity checks. */

	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
	    in_dev == NULL || skb->protocol != __constant_htons(ETH_P_IP))
		return -EINVAL;

	if (ZERONET(saddr)) {
		if (!LOCAL_MCAST(daddr))
			return -EINVAL;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else if (fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag) < 0)
		return -EINVAL;

	rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
	if (!rth)
		return -ENOBUFS;

	rth->u.dst.output = ip_rt_bug;

	atomic_set(&rth->u.dst.use, 1);
	rth->key.dst	= daddr;
	rth->rt_dst	= daddr;
	rth->key.tos	= tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->key.fwmark	= skb->fwmark;
#endif
	rth->key.src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_IP_ROUTE_NAT
	rth->rt_dst_map	= daddr;
	rth->rt_src_map	= saddr;
#endif
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->key.iif	= dev->ifindex;
	rth->u.dst.dev	= &loopback_dev;
	rth->key.oif	= 0;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_flags	= RTCF_MULTICAST;
	if (our) {
		rth->u.dst.input = ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->u.dst.input = ip_mr_input;
#endif

	hash = rt_hash_code(daddr, saddr^(dev->ifindex<<5), tos);
	return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
}
/*
 *	NOTE. We drop all the packets that have local source
 *	addresses, because every properly looped back packet
 *	must have correct destination already attached by output routine.
 *
 *	Such an approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 */

int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
			u8 tos, struct device *dev)
{
	struct rt_key	key;
	struct fib_result res;
	struct in_device *in_dev = dev->ip_ptr;
	struct in_device *out_dev;
	unsigned	flags = 0;
	u32		itag = 0;
	struct rtable * rth;
	unsigned	hash;
	u32		spec_dst;
	int		err;

	/*
	 *	IP on this device is disabled.
	 */

	if (!in_dev)
		return -EINVAL;

	key.dst		= daddr;
	key.src		= saddr;
	key.tos		= tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	key.fwmark	= skb->fwmark;
#endif
	key.iif		= dev->ifindex;
	key.oif		= 0;
	key.scope	= RT_SCOPE_UNIVERSE;

	hash = rt_hash_code(daddr, saddr^(key.iif<<5), tos);

	/* Check for the most weird martians, which can be not detected
	   by fib_lookup.
	 */

	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
		goto martian_source;

	if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I do not even know whether to fix it or not. Waiting for complaints :-)
	 */
	if (ZERONET(saddr))
		goto martian_source;

	if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route packet.
	 */
	if ((err = fib_lookup(&key, &res))) {
		if (!IN_DEV_FORWARD(in_dev))
			return -EINVAL;
		goto no_route;
	}

#ifdef CONFIG_IP_ROUTE_NAT
	/* Policy is applied before mapping destination,
	   but rerouting after map should be made with old source.
	 */

	if (1) {
		u32 src_map = saddr;
		if (res.r)
			src_map = fib_rules_policy(saddr, &res, &flags);

		if (res.type == RTN_NAT) {
			key.dst = fib_rules_map_destination(daddr, &res);
			if (fib_lookup(&key, &res) || res.type != RTN_UNICAST)
				return -EINVAL;
		}
		key.src = src_map;
	}
#endif

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		int result;
		result = fib_validate_source(saddr, daddr, tos, loopback_dev.ifindex,
					     dev, &spec_dst, &itag);
		if (result < 0)
			goto martian_source;
		if (result)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		return -EINVAL;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && key.oif == 0)
		fib_select_multipath(&key, &res);
#endif
	out_dev = FIB_RES_DEV(res)->ip_ptr;
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input_slow(). Please, report\n");
		return -EINVAL;
	}

	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev, &spec_dst, &itag);
	if (err < 0)
		goto martian_source;

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err && !(flags&(RTCF_NAT|RTCF_MASQ)) &&
	    (IN_DEV_SHARED_MEDIA(out_dev)
	     || inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != __constant_htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 */
		if (out_dev == in_dev && !(flags&RTCF_DNAT))
			return -EINVAL;
	}

	rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
	if (!rth)
		return -ENOBUFS;

	atomic_set(&rth->u.dst.use, 1);
	rth->key.dst	= daddr;
	rth->rt_dst	= daddr;
	rth->key.tos	= tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->key.fwmark	= skb->fwmark;
#endif
	rth->key.src	= saddr;
	rth->rt_src	= saddr;
	rth->rt_gateway	= daddr;
#ifdef CONFIG_IP_ROUTE_NAT
	rth->rt_src_map	= key.src;
	rth->rt_dst_map	= key.dst;
	if (flags&RTCF_DNAT)
		rth->rt_gateway	= key.dst;
#endif
	rth->rt_iif	=
	rth->key.iif	= dev->ifindex;
	rth->u.dst.dev	= out_dev->dev;
	rth->key.oif	= 0;
	rth->rt_spec_dst= spec_dst;

	rth->u.dst.input = ip_forward;
	rth->u.dst.output = ip_output;

	rt_set_nexthop(rth, &res, itag);

	rth->rt_flags = flags;

#ifdef CONFIG_NET_FASTROUTE
	if (netdev_fastroute && !(flags&(RTCF_NAT|RTCF_MASQ|RTCF_DOREDIRECT))) {
		struct device *odev = rth->u.dst.dev;
		if (odev != dev &&
		    dev->accept_fastpath &&
		    odev->mtu >= dev->mtu &&
		    dev->accept_fastpath(dev, &rth->u.dst) == 0)
			rth->rt_flags |= RTCF_FAST;
	}
#endif

	return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);

brd_input:
	if (skb->protocol != __constant_htons(ETH_P_IP))
		return -EINVAL;

	if (ZERONET(saddr)) {
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag);
		if (err < 0)
			goto martian_source;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;

local_input:
	rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
	if (!rth)
		return -ENOBUFS;

	rth->u.dst.output= ip_rt_bug;

	atomic_set(&rth->u.dst.use, 1);
	rth->key.dst	= daddr;
	rth->rt_dst	= daddr;
	rth->key.tos	= tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->key.fwmark	= skb->fwmark;
#endif
	rth->key.src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_IP_ROUTE_NAT
	rth->rt_dst_map	= key.dst;
	rth->rt_src_map	= key.src;
#endif
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->key.iif	= dev->ifindex;
	rth->u.dst.dev	= &loopback_dev;
	rth->key.oif	= 0;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->u.dst.input= ip_local_deliver;
	rth->rt_flags	= flags|RTCF_LOCAL;
	if (res.type == RTN_UNREACHABLE) {
		rth->u.dst.input= ip_error;
		rth->u.dst.error= -err;
		rth->rt_flags	&= ~RTCF_LOCAL;
	}
	rth->rt_type	= res.type;
	return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);

no_route:
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %08x from %08x, dev %s\n", daddr, saddr, dev->name);
#endif
	return -EINVAL;

martian_source:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation: if the source is martian,
		 *	the only hint is the MAC header.
		 */
		printk(KERN_WARNING "martian source %08x for %08x, dev %s\n", saddr, daddr, dev->name);
		if (dev->hard_header_len) {
			int i;
			unsigned char *p = skb->mac.raw;
			printk(KERN_WARNING "ll header:");
			for (i = 0; i < dev->hard_header_len; i++, p++)
				printk(" %02x", *p);
			printk("\n");
		}
	}
#endif
	return -EINVAL;
}
int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
		   u8 tos, struct device *dev)
{
	struct rtable * rth;
	unsigned hash;
	int iif = dev->ifindex;

	tos &= IPTOS_TOS_MASK;
	hash = rt_hash_code(daddr, saddr^(iif<<5), tos);

	read_lock_bh(&rt_hash_lock);
	for (rth = rt_hash_table[hash]; rth; rth = rth->u.rt_next) {
		if (rth->key.dst == daddr &&
		    rth->key.src == saddr &&
		    rth->key.iif == iif &&
		    rth->key.oif == 0 &&
#ifdef CONFIG_IP_ROUTE_FWMARK
		    rth->key.fwmark == skb->fwmark &&
#endif
		    rth->key.tos == tos) {
			rth->u.dst.lastuse = jiffies;
			atomic_inc(&rth->u.dst.use);
			atomic_inc(&rth->u.dst.refcnt);
			read_unlock_bh(&rt_hash_lock);
			skb->dst = (struct dst_entry*)rth;
			return 0;
		}
	}
	read_unlock_bh(&rt_hash_lock);

	/* Multicast recognition logic is moved from route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As a result, a host on a multicasting
	   network acquires a lot of useless route cache entries, e.g.
	   SDR messages from all over the world. Now we try to get rid of them.
	   Really, provided the software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   compared with route cache reject entries.
	   Note that multicast routers are not affected, because a
	   route cache entry is created eventually.
	 */
	if (MULTICAST(daddr)) {
		int our = ip_check_mc(dev, daddr);
		if (!our
#ifdef CONFIG_IP_MROUTE
		    && (LOCAL_MCAST(daddr) || !dev->ip_ptr ||
			!IN_DEV_MFORWARD((struct in_device*)dev->ip_ptr))
#endif
		    )
			return -EINVAL;
		return ip_route_input_mc(skb, daddr, saddr, tos, dev, our);
	}
	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
}
/*
 * Major route resolver routine.
 */

int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif)
{
	struct rt_key key;
	struct fib_result res;
	unsigned flags = 0;
	struct rtable *rth;
	struct device *dev_out = NULL;
	unsigned hash;
#ifdef CONFIG_IP_TRANSPARENT_PROXY
	u32 nochecksrc = (tos & RTO_TPROXY);
#endif

	tos &= IPTOS_TOS_MASK|RTO_ONLINK;
	key.dst = daddr;
	key.src = saddr;
	key.tos = tos&IPTOS_TOS_MASK;
	key.iif = loopback_dev.ifindex;
	key.oif = oif;
	key.scope = (tos&RTO_ONLINK) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
	res.fi = NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r = NULL;
#endif

	if (saddr) {
		if (MULTICAST(saddr) || BADCLASS(saddr) || ZERONET(saddr))
			return -EINVAL;

		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
		dev_out = ip_dev_find(saddr);
#ifdef CONFIG_IP_TRANSPARENT_PROXY
		/* If address is not local, test for transparent proxy flag;
		   if address is local --- clear the flag.
		 */
		if (dev_out == NULL) {
			if (nochecksrc == 0 || inet_addr_type(saddr) != RTN_UNICAST)
				return -EINVAL;
			flags |= RTCF_TPROXY;
		}
#else
		if (dev_out == NULL)
			return -EINVAL;
#endif

		/* I removed check for oif == dev_out->oif here.
		   It was wrong for several reasons:
		   1. ip_dev_find(saddr) can return wrong iface, if saddr is
		      assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with saddr
		      of another iface. --ANK
		 */

		if (oif == 0 &&
#ifdef CONFIG_IP_TRANSPARENT_PROXY
		    dev_out &&
#endif
		    (MULTICAST(daddr) || daddr == 0xFFFFFFFF)) {
			/* Special hack: user can direct multicasts
			   and limited broadcast via necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic,vat and friends to work.
			   They bind socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of routing cache they are broken,
			   because we are not allowed to build multicast path
			   with loopback source addr (look, routing cache
			   cannot know, that ttl is zero, so that packet
			   will not leave this host and route is valid).
			   Luckily, this hack is a good workaround.
			 */

			key.oif = dev_out->ifindex;
			goto make_route;
		}
		dev_out = NULL;
	}
	if (oif) {
		dev_out = dev_get_by_index(oif);
		if (dev_out == NULL)
			return -ENODEV;
		if (dev_out->ip_ptr == NULL)
			return -ENODEV;	/* Wrong error code */

		if (LOCAL_MCAST(daddr) || daddr == 0xFFFFFFFF) {
			if (!key.src)
				key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK);
			goto make_route;
		}
		if (!key.src) {
			if (MULTICAST(daddr))
				key.src = inet_select_addr(dev_out, 0, key.scope);
			else if (!daddr)
				key.src = inet_select_addr(dev_out, 0, RT_SCOPE_HOST);
		}
	}

	if (!key.dst) {
		key.dst = key.src;
		if (!key.dst)
			key.dst = key.src = htonl(INADDR_LOOPBACK);
		dev_out = &loopback_dev;
		key.oif = loopback_dev.ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(&key, &res)) {
		res.fi = NULL;
		if (oif) {
			/* Apparently, routing tables are wrong. Assume
			   that the destination is on link.

			   Because we are allowed to send to iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch if destination is gatewayed, rather than
			   direct. Moreover, if MSG_DONTROUTE is set,
			   we send packet, ignoring both routing tables
			   and ifaddr state. --ANK

			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (!key.src)
				key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		return -ENETUNREACH;
	}

#ifdef CONFIG_IP_ROUTE_NAT
	if (res.type == RTN_NAT)
		return -EINVAL;
#endif

	if (res.type == RTN_LOCAL) {
		if (!key.src)
			key.src = key.dst;
		dev_out = &loopback_dev;
		key.oif = dev_out->ifindex;
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && key.oif == 0)
		fib_select_multipath(&key, &res);
	else
#endif
	if (res.prefixlen == 0 && res.type == RTN_UNICAST && key.oif == 0)
		fib_select_default(&key, &res);

	if (!key.src)
		key.src = FIB_RES_PREFSRC(res);

	dev_out = FIB_RES_DEV(res);
	key.oif = dev_out->ifindex;

make_route:
	if (LOOPBACK(key.src) && !(dev_out->flags&IFF_LOOPBACK))
		return -EINVAL;

	if (key.dst == 0xFFFFFFFF)
		res.type = RTN_BROADCAST;
	else if (MULTICAST(key.dst))
		res.type = RTN_MULTICAST;
	else if (BADCLASS(key.dst) || ZERONET(key.dst))
		return -EINVAL;

	if (dev_out->flags&IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	if (res.type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST|RTCF_LOCAL;
		res.fi = NULL;
	} else if (res.type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST|RTCF_LOCAL;
		if (!ip_check_mc(dev_out, daddr))
			flags &= ~RTCF_LOCAL;
		/* If a multicast route does not exist, use
		   the default one, but do not gateway in this case.
		 */
		if (res.fi && res.prefixlen < 4)
			res.fi = NULL;
	}

	rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
	if (!rth)
		return -ENOBUFS;

	atomic_set(&rth->u.dst.use, 1);
	rth->key.dst	= daddr;
	rth->key.tos	= tos;
	rth->key.src	= saddr;
	rth->key.iif	= 0;
	rth->key.oif	= oif;
	rth->rt_dst	= key.dst;
	rth->rt_src	= key.src;
#ifdef CONFIG_IP_ROUTE_NAT
	rth->rt_dst_map	= key.dst;
	rth->rt_src_map	= key.src;
#endif
	rth->rt_iif	= oif ? : dev_out->ifindex;
	rth->u.dst.dev	= dev_out;
	rth->rt_gateway = key.dst;
	rth->rt_spec_dst= key.src;

	rth->u.dst.output=ip_output;

	if (flags&RTCF_LOCAL) {
		rth->u.dst.input = ip_local_deliver;
		rth->rt_spec_dst = key.dst;
	}
	if (flags&(RTCF_BROADCAST|RTCF_MULTICAST)) {
		rth->rt_spec_dst = key.src;
		if (flags&RTCF_LOCAL && !(dev_out->flags&IFF_LOOPBACK))
			rth->u.dst.output = ip_mc_output;
#ifdef CONFIG_IP_MROUTE
		if (res.type == RTN_MULTICAST && dev_out->ip_ptr) {
			struct in_device *in_dev = dev_out->ip_ptr;
			if (IN_DEV_MFORWARD(in_dev) && !LOCAL_MCAST(daddr)) {
				rth->u.dst.input = ip_mr_input;
				rth->u.dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, &res, 0);

	rth->rt_flags = flags;

	hash = rt_hash_code(daddr, saddr^(oif<<5), tos);
	return rt_intern_hash(hash, rth, rp);
}
int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif)
{
	unsigned hash;
	struct rtable *rth;

	hash = rt_hash_code(daddr, saddr^(oif<<5), tos);

	read_lock_bh(&rt_hash_lock);
	for (rth = rt_hash_table[hash]; rth; rth = rth->u.rt_next) {
		if (rth->key.dst == daddr &&
		    rth->key.src == saddr &&
		    rth->key.iif == 0 &&
		    rth->key.oif == oif &&
#ifndef CONFIG_IP_TRANSPARENT_PROXY
		    rth->key.tos == tos
#else
		    !((rth->key.tos^tos)&(IPTOS_TOS_MASK|RTO_ONLINK)) &&
		    ((tos&RTO_TPROXY) || !(rth->rt_flags&RTCF_TPROXY))
#endif
		) {
			rth->u.dst.lastuse = jiffies;
			atomic_inc(&rth->u.dst.use);
			atomic_inc(&rth->u.dst.refcnt);
			read_unlock_bh(&rt_hash_lock);
			*rp = rth;
			return 0;
		}
	}
	read_unlock_bh(&rt_hash_lock);

	return ip_route_output_slow(rp, daddr, saddr, tos, oif);
}
#ifdef CONFIG_RTNETLINK

static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int nowait)
{
	struct rtable *rt = (struct rtable*)skb->dst;
	struct rtmsg *r;
	struct nlmsghdr  *nlh;
	unsigned char	 *b = skb->tail;
	struct rta_cacheinfo ci;
#ifdef CONFIG_IP_MROUTE
	struct rtattr *eptr;
#endif
	struct rtattr *mx;

	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
	r = NLMSG_DATA(nlh);
	nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
	r->rtm_family = AF_INET;
	r->rtm_dst_len = 32;
	r->rtm_src_len = 0;
	r->rtm_tos = rt->key.tos;
	r->rtm_table = RT_TABLE_MAIN;
	r->rtm_type = rt->rt_type;
	r->rtm_scope = RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags = (rt->rt_flags&~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;
	RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
	if (rt->key.src) {
		r->rtm_src_len = 32;
		RTA_PUT(skb, RTA_SRC, 4, &rt->key.src);
	}
	if (rt->u.dst.dev)
		RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
	if (rt->u.dst.tclassid)
		RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
#endif
	if (rt->key.iif)
		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
	else if (rt->rt_src != rt->key.src)
		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
	if (rt->rt_dst != rt->rt_gateway)
		RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
	mx = (struct rtattr*)skb->tail;
	RTA_PUT(skb, RTA_METRICS, 0, NULL);
	if (rt->u.dst.mxlock)
		RTA_PUT(skb, RTAX_LOCK, sizeof(unsigned), &rt->u.dst.mxlock);
	if (rt->u.dst.pmtu)
		RTA_PUT(skb, RTAX_MTU, sizeof(unsigned), &rt->u.dst.pmtu);
	if (rt->u.dst.window)
		RTA_PUT(skb, RTAX_WINDOW, sizeof(unsigned), &rt->u.dst.window);
	if (rt->u.dst.rtt)
		RTA_PUT(skb, RTAX_RTT, sizeof(unsigned), &rt->u.dst.rtt);
	mx->rta_len = skb->tail - (u8*)mx;
	if (mx->rta_len == RTA_LENGTH(0))
		skb_trim(skb, (u8*)mx - skb->data);
	ci.rta_lastuse = jiffies - rt->u.dst.lastuse;
	ci.rta_used = atomic_read(&rt->u.dst.refcnt);
	ci.rta_clntref = atomic_read(&rt->u.dst.use);
	if (rt->u.dst.expires)
		ci.rta_expires = rt->u.dst.expires - jiffies;
	else
		ci.rta_expires = 0;
	ci.rta_error = rt->u.dst.error;
#ifdef CONFIG_IP_MROUTE
	eptr = (struct rtattr*)skb->tail;
#endif
	RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
	if (rt->key.iif) {
#ifdef CONFIG_IP_MROUTE
		u32 dst = rt->rt_dst;

		if (MULTICAST(dst) && !LOCAL_MCAST(dst) && ipv4_devconf.mc_forwarding) {
			int err = ipmr_get_route(skb, r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nlmsg_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nlmsg_failure;
					((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
				}
			}
		} else
#endif
			RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->key.iif);
	}

	nlh->nlmsg_len = skb->tail - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}
int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
{
	struct rtattr **rta = arg;
	struct rtmsg *rtm = NLMSG_DATA(nlh);
	struct rtable *rt = NULL;
	u32 dst = 0;
	u32 src = 0;
	int iif = 0;
	int err;
	struct sk_buff *skb;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	/* Reserve room for dummy headers, this skb can pass
	   through a good chunk of the routing engine.
	 */
	skb->mac.raw = skb->data;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	if (rta[RTA_SRC-1])
		memcpy(&src, RTA_DATA(rta[RTA_SRC-1]), 4);
	if (rta[RTA_DST-1])
		memcpy(&dst, RTA_DATA(rta[RTA_DST-1]), 4);
	if (rta[RTA_IIF-1])
		memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));

	if (iif) {
		struct device *dev;
		dev = dev_get_by_index(iif);
		if (!dev)
			return -ENODEV;
		skb->protocol = __constant_htons(ETH_P_IP);
		skb->dev = dev;
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		rt = (struct rtable*)skb->dst;
		if (!err && rt->u.dst.error)
			err = -rt->u.dst.error;
	} else {
		int oif = 0;
		if (rta[RTA_OIF-1])
			memcpy(&oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
		err = ip_route_output(&rt, dst, src, rtm->rtm_tos, oif);
	}
	if (err) {
		kfree_skb(skb);
		return err;
	}

	skb->dst = &rt->u.dst;
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;

	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, RTM_NEWROUTE, 0);
	if (err < 0) {
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
	if (err < 0)
		return err;
	return 0;
}
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;

	s_h = cb->args[0];
	s_idx = idx = cb->args[1];
	for (h = 0; h < RT_HASH_DIVISOR; h++) {
		if (h < s_h) continue;
		if (h > s_h)
			s_idx = 0;
		read_lock_bh(&rt_hash_lock);
		for (rt = rt_hash_table[h], idx = 0; rt; rt = rt->u.rt_next, idx++) {
			if (idx < s_idx)
				continue;
			skb->dst = dst_clone(&rt->u.dst);
			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1) <= 0) {
				dst_release(xchg(&skb->dst, NULL));
				read_unlock_bh(&rt_hash_lock);
				goto done;
			}
			dst_release(xchg(&skb->dst, NULL));
		}
		read_unlock_bh(&rt_hash_lock);
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}

#endif /* CONFIG_RTNETLINK */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(0);
}
#ifdef CONFIG_SYSCTL

static int flush_delay;

static
int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
			      void *buffer, size_t *lenp)
{
	if (write) {
		proc_dointvec(ctl, write, filp, buffer, lenp);
		rt_cache_flush(flush_delay);
		return 0;
	} else
		return -EINVAL;
}
ctl_table ipv4_route_table[] = {
	{NET_IPV4_ROUTE_FLUSH, "flush",
	 &flush_delay, sizeof(int), 0200, NULL,
	 &ipv4_sysctl_rtcache_flush},
	{NET_IPV4_ROUTE_MIN_DELAY, "min_delay",
	 &ip_rt_min_delay, sizeof(int), 0644, NULL,
	 &proc_dointvec_jiffies},
	{NET_IPV4_ROUTE_MAX_DELAY, "max_delay",
	 &ip_rt_max_delay, sizeof(int), 0644, NULL,
	 &proc_dointvec_jiffies},
	{NET_IPV4_ROUTE_GC_THRESH, "gc_thresh",
	 &ipv4_dst_ops.gc_thresh, sizeof(int), 0644, NULL,
	 &proc_dointvec},
	{NET_IPV4_ROUTE_MAX_SIZE, "max_size",
	 &ip_rt_max_size, sizeof(int), 0644, NULL,
	 &proc_dointvec},
	{NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval",
	 &ip_rt_gc_min_interval, sizeof(int), 0644, NULL,
	 &proc_dointvec_jiffies},
	{NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout",
	 &ip_rt_gc_timeout, sizeof(int), 0644, NULL,
	 &proc_dointvec_jiffies},
	{NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval",
	 &ip_rt_gc_interval, sizeof(int), 0644, NULL,
	 &proc_dointvec_jiffies},
	{NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load",
	 &ip_rt_redirect_load, sizeof(int), 0644, NULL,
	 &proc_dointvec},
	{NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number",
	 &ip_rt_redirect_number, sizeof(int), 0644, NULL,
	 &proc_dointvec},
	{NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence",
	 &ip_rt_redirect_silence, sizeof(int), 0644, NULL,
	 &proc_dointvec},
	{NET_IPV4_ROUTE_ERROR_COST, "error_cost",
	 &ip_rt_error_cost, sizeof(int), 0644, NULL,
	 &proc_dointvec},
	{NET_IPV4_ROUTE_ERROR_BURST, "error_burst",
	 &ip_rt_error_burst, sizeof(int), 0644, NULL,
	 &proc_dointvec},
	{NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity",
	 &ip_rt_gc_elasticity, sizeof(int), 0644, NULL,
	 &proc_dointvec},
	{NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires",
	 &ip_rt_mtu_expires, sizeof(int), 0644, NULL,
	 &proc_dointvec_jiffies},
	{0}
};
#endif
#ifdef CONFIG_NET_CLS_ROUTE
struct ip_rt_acct ip_rt_acct[256];
rwlock_t ip_rt_acct_lock = RW_LOCK_UNLOCKED;

#ifdef CONFIG_PROC_FS
static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
			   int length, int *eof, void *data)
{
	*start = buffer;

	if (offset + length > sizeof(ip_rt_acct)) {
		length = sizeof(ip_rt_acct) - offset;
		*eof = 1;
	}
	if (length > 0) {
		read_lock_bh(&ip_rt_acct_lock);
		memcpy(buffer, ((u8*)&ip_rt_acct)+offset, length);
		read_unlock_bh(&ip_rt_acct_lock);
		return length;
	}
	return 0;
}
#endif
#endif
__initfunc(void ip_rt_init(void))
{
#ifdef CONFIG_PROC_FS
#ifdef CONFIG_NET_CLS_ROUTE
	struct proc_dir_entry *ent;
#endif
#endif
	devinet_init();
	ip_fib_init();

	rt_periodic_timer.function = rt_check_expire;
	/* All the timers started at system startup tend
	   to synchronize.  Perturb it a bit.
	 */
	rt_periodic_timer.expires = jiffies + net_random()%ip_rt_gc_interval
				    + ip_rt_gc_interval;
	add_timer(&rt_periodic_timer);

#ifdef CONFIG_PROC_FS
	proc_net_register(&(struct proc_dir_entry) {
		PROC_NET_RTCACHE, 8, "rt_cache",
		S_IFREG | S_IRUGO, 1, 0, 0,
		0, &proc_net_inode_operations,
		rt_cache_get_info
	});
#ifdef CONFIG_NET_CLS_ROUTE
	ent = create_proc_entry("net/rt_acct", 0, 0);
	ent->read_proc = ip_rt_acct_read;
#endif
#endif
}