pre-2.3.4
[davej-history.git] / net / ipv4 / route.c
blob 2589f457d67b69ca1df5e2ab45a32a9abdf8e5fd
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
8 * Version: $Id: route.c,v 1.67 1999/05/08 20:00:20 davem Exp $
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
16 * Fixes:
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
23 * Alan Cox : Super /proc >4K
24 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
26 * clamper.
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36 * Alan Cox : Aligned routing errors more closely with BSD
37 * our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
42 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Splitted to fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
56 * This program is free software; you can redistribute it and/or
57 * modify it under the terms of the GNU General Public License
58 * as published by the Free Software Foundation; either version
59 * 2 of the License, or (at your option) any later version.
62 #include <linux/config.h>
63 #include <asm/uaccess.h>
64 #include <asm/system.h>
65 #include <asm/bitops.h>
66 #include <linux/types.h>
67 #include <linux/kernel.h>
68 #include <linux/sched.h>
69 #include <linux/mm.h>
70 #include <linux/string.h>
71 #include <linux/socket.h>
72 #include <linux/sockios.h>
73 #include <linux/errno.h>
74 #include <linux/in.h>
75 #include <linux/inet.h>
76 #include <linux/netdevice.h>
77 #include <linux/proc_fs.h>
78 #include <linux/init.h>
79 #include <linux/skbuff.h>
80 #include <linux/rtnetlink.h>
81 #include <linux/inetdevice.h>
82 #include <linux/igmp.h>
83 #include <linux/pkt_sched.h>
84 #include <linux/mroute.h>
85 #include <net/protocol.h>
86 #include <net/ip.h>
87 #include <net/route.h>
88 #include <net/sock.h>
89 #include <net/ip_fib.h>
90 #include <net/arp.h>
91 #include <net/tcp.h>
92 #include <net/icmp.h>
93 #ifdef CONFIG_SYSCTL
94 #include <linux/sysctl.h>
95 #endif
97 #define IP_MAX_MTU 0xFFF0
99 #define RT_GC_TIMEOUT (300*HZ)
101 int ip_rt_min_delay = 2*HZ;
102 int ip_rt_max_delay = 10*HZ;
103 int ip_rt_gc_thresh = RT_HASH_DIVISOR;
104 int ip_rt_max_size = RT_HASH_DIVISOR*16;
105 int ip_rt_gc_timeout = RT_GC_TIMEOUT;
106 int ip_rt_gc_interval = 60*HZ;
107 int ip_rt_gc_min_interval = 5*HZ;
108 int ip_rt_redirect_number = 9;
109 int ip_rt_redirect_load = HZ/50;
110 int ip_rt_redirect_silence = ((HZ/50) << (9+1));
111 int ip_rt_error_cost = HZ;
112 int ip_rt_error_burst = 5*HZ;
113 int ip_rt_gc_elasticity = 8;
114 int ip_rt_mtu_expires = 10*60*HZ;
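/* The ip_rt_* values above are start-up defaults for the route cache
 * tunables; most of them are also exposed through the ipv4_route_table
 * sysctl entries at the end of this file (presumably registered under
 * /proc/sys/net/ipv4/route/ by the ipv4 sysctl code). */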
116 static unsigned long rt_deadline = 0;
118 #define RTprint(a...) printk(KERN_DEBUG a)
120 static void rt_run_flush(unsigned long dummy);
122 static struct timer_list rt_flush_timer =
123 { NULL, NULL, 0, 0L, rt_run_flush };
124 static struct timer_list rt_periodic_timer =
125 { NULL, NULL, 0, 0L, NULL };
128 * Interface to generic destination cache.
131 static struct dst_entry * ipv4_dst_check(struct dst_entry * dst, u32);
132 static struct dst_entry * ipv4_dst_reroute(struct dst_entry * dst,
133 struct sk_buff *);
134 static struct dst_entry * ipv4_negative_advice(struct dst_entry *);
135 static void ipv4_link_failure(struct sk_buff *skb);
136 static int rt_garbage_collect(void);
139 struct dst_ops ipv4_dst_ops =
141 AF_INET,
142 __constant_htons(ETH_P_IP),
143 RT_HASH_DIVISOR,
145 rt_garbage_collect,
146 ipv4_dst_check,
147 ipv4_dst_reroute,
148 NULL,
149 ipv4_negative_advice,
150 ipv4_link_failure,
153 __u8 ip_tos2prio[16] = {
154 TC_PRIO_BESTEFFORT,
155 TC_PRIO_FILLER,
156 TC_PRIO_BESTEFFORT,
157 TC_PRIO_FILLER,
158 TC_PRIO_BULK,
159 TC_PRIO_FILLER,
160 TC_PRIO_BULK,
161 TC_PRIO_FILLER,
162 TC_PRIO_INTERACTIVE,
163 TC_PRIO_FILLER,
164 TC_PRIO_INTERACTIVE,
165 TC_PRIO_FILLER,
166 TC_PRIO_INTERACTIVE_BULK,
167 TC_PRIO_FILLER,
168 TC_PRIO_INTERACTIVE_BULK,
169 TC_PRIO_FILLER
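/* ip_tos2prio maps the 4-bit IP TOS value to a packet-scheduler
 * priority band (presumably consumed via rt_tos2priority() in
 * include/net/route.h); the TC_PRIO_FILLER slots cover TOS bit
 * combinations that are not given a distinct priority. */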
174 * Route cache.
177 static struct rtable *rt_hash_table[RT_HASH_DIVISOR];
178 static rwlock_t rt_hash_lock = RW_LOCK_UNLOCKED;
180 static int rt_intern_hash(unsigned hash, struct rtable * rth, struct rtable ** res);
182 static __inline__ unsigned rt_hash_code(u32 daddr, u32 saddr, u8 tos)
184 unsigned hash = ((daddr&0xF0F0F0F0)>>4)|((daddr&0x0F0F0F0F)<<4);
185 hash = hash^saddr^tos;
186 hash = hash^(hash>>16);
187 return (hash^(hash>>8)) & 0xFF;
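/* rt_hash_code() swaps the nibbles of daddr, folds in saddr and the TOS,
 * then folds the result down to 8 bits; the final 0xFF mask yields a
 * bucket index in [0, 255], which assumes RT_HASH_DIVISOR is 256. */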
190 #ifdef CONFIG_PROC_FS
192 static int rt_cache_get_info(char *buffer, char **start, off_t offset, int length, int dummy)
194 int len=0;
195 off_t pos=0;
196 char temp[129];
197 struct rtable *r;
198 int i;
200 pos = 128;
202 if (offset<128) {
203 sprintf(buffer,"%-127s\n", "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\tMetric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\tHHUptod\tSpecDst");
204 len = 128;
208 read_lock_bh(&rt_hash_lock);
210 for (i = 0; i<RT_HASH_DIVISOR; i++) {
211 for (r = rt_hash_table[i]; r; r = r->u.rt_next) {
213 * Spin through entries until we are ready
215 pos += 128;
217 if (pos <= offset) {
218 len = 0;
219 continue;
221 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
222 r->u.dst.dev ? r->u.dst.dev->name : "*",
223 (unsigned long)r->rt_dst,
224 (unsigned long)r->rt_gateway,
225 r->rt_flags,
226 atomic_read(&r->u.dst.use),
227 atomic_read(&r->u.dst.refcnt),
229 (unsigned long)r->rt_src, (int)r->u.dst.pmtu,
230 r->u.dst.window,
231 (int)r->u.dst.rtt, r->key.tos,
232 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
233 r->u.dst.hh ? (r->u.dst.hh->hh_output == dev_queue_xmit) : 0,
234 r->rt_spec_dst);
235 sprintf(buffer+len,"%-127s\n",temp);
236 len += 128;
237 if (pos >= offset+length)
238 goto done;
242 done:
243 read_unlock_bh(&rt_hash_lock);
245 *start = buffer+len-(pos-offset);
246 len = pos-offset;
247 if (len>length)
248 len = length;
249 return len;
251 #endif
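/* Note on the /proc output above: every row, including the header, is
 * padded to exactly 128 bytes ("%-127s\n"), so the offset/length
 * arithmetic can treat the file as an array of fixed-size records. */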
253 static __inline__ void rt_free(struct rtable *rt)
255 dst_free(&rt->u.dst);
258 static __inline__ void rt_drop(struct rtable *rt)
260 ip_rt_put(rt);
261 dst_free(&rt->u.dst);
264 static __inline__ int rt_fast_clean(struct rtable *rth)
266 /* Kill broadcast/multicast entries very aggressively if they
267 collide in the hash table with more useful entries */
268 return ((rth->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST))
269 && rth->key.iif && rth->u.rt_next);
272 static __inline__ int rt_valuable(struct rtable *rth)
274 return ((rth->rt_flags&(RTCF_REDIRECTED|RTCF_NOTIFY))
275 || rth->u.dst.expires);
278 static __inline__ int rt_may_expire(struct rtable *rth, int tmo1, int tmo2)
280 int age;
282 if (atomic_read(&rth->u.dst.use))
283 return 0;
285 if (rth->u.dst.expires && (long)(rth->u.dst.expires - jiffies) <= 0)
286 return 1;
288 age = jiffies - rth->u.dst.lastuse;
289 if (age <= tmo1 && !rt_fast_clean(rth))
290 return 0;
291 if (age <= tmo2 && rt_valuable(rth))
292 return 0;
293 return 1;
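/* Summary of rt_may_expire(): entries still in use never expire here,
 * and entries past their hard expiry always do.  Otherwise tmo1 protects
 * entries younger than tmo1 unless rt_fast_clean() marks them as
 * broadcast/multicast chain colliders, and tmo2 additionally protects
 * "valuable" entries (redirected, RTCF_NOTIFY, or carrying an expiry). */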
296 static void rt_check_expire(unsigned long dummy)
298 int i;
299 static int rover;
300 struct rtable *rth, **rthp;
301 unsigned long now = jiffies;
303 for (i=0; i<RT_HASH_DIVISOR/5; i++) {
304 unsigned tmo = ip_rt_gc_timeout;
306 rover = (rover + 1) & (RT_HASH_DIVISOR-1);
307 rthp = &rt_hash_table[rover];
309 write_lock_bh(&rt_hash_lock);
310 while ((rth = *rthp) != NULL) {
311 if (rth->u.dst.expires) {
312 /* Entry is expired even if it is in use */
313 if ((long)(now - rth->u.dst.expires) <= 0) {
314 tmo >>= 1;
315 rthp = &rth->u.rt_next;
316 continue;
318 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
319 tmo >>= 1;
320 rthp = &rth->u.rt_next;
321 continue;
325 * Clean up aged-off entries.
327 *rthp = rth->u.rt_next;
328 rt_free(rth);
330 write_unlock_bh(&rt_hash_lock);
332 /* Fallback loop breaker. */
333 if ((jiffies - now) > 0)
334 break;
336 rt_periodic_timer.expires = now + ip_rt_gc_interval;
337 add_timer(&rt_periodic_timer);
340 static void rt_run_flush(unsigned long dummy)
342 int i;
343 struct rtable * rth, * next;
345 rt_deadline = 0;
347 write_lock_bh(&rt_hash_lock);
348 for (i=0; i<RT_HASH_DIVISOR; i++) {
349 rth = rt_hash_table[i];
350 if(rth == NULL)
351 continue;
352 rt_hash_table[i] = NULL;
353 write_unlock_bh(&rt_hash_lock);
355 for (; rth; rth=next) {
356 next = rth->u.rt_next;
357 rth->u.rt_next = NULL;
358 rt_free(rth);
361 write_lock_bh(&rt_hash_lock);
363 write_unlock_bh(&rt_hash_lock);
366 static spinlock_t rt_flush_lock = SPIN_LOCK_UNLOCKED;
368 void rt_cache_flush(int delay)
370 unsigned long now = jiffies;
371 int user_mode = !in_interrupt();
373 if (delay < 0)
374 delay = ip_rt_min_delay;
376 spin_lock_bh(&rt_flush_lock);
378 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
379 long tmo = (long)(rt_deadline - now);
381 /* If the flush timer is already running
382 and the flush request is not immediate (delay > 0):
384 if the deadline has not been reached, extend the timer to "delay",
385 otherwise fire it at the deadline.
388 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
389 tmo = 0;
391 if (delay > tmo)
392 delay = tmo;
395 if (delay <= 0) {
396 spin_unlock_bh(&rt_flush_lock);
397 rt_run_flush(0);
398 return;
401 if (rt_deadline == 0)
402 rt_deadline = now + ip_rt_max_delay;
404 rt_flush_timer.expires = now + delay;
405 add_timer(&rt_flush_timer);
406 spin_unlock_bh(&rt_flush_lock);
410 Short description of GC goals.
412 We want to build an algorithm which keeps the routing cache
413 at an equilibrium point, where the number of aged-off entries
414 stays approximately equal to the number of newly generated ones.
416 The current expiration strength is the variable "expire".
417 We try to adjust it dynamically, so that when the network
418 is idle "expire" is large enough to keep plenty of warm entries,
419 and when load increases it shrinks to limit the cache size.
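/* Rough sketch of the feedback used below: while rt_garbage_collect()
 * fails to reach its goal it halves "expire" (expire >>= 1), making
 * eviction more aggressive; after a successful pass, work_done adds
 * ip_rt_gc_min_interval back, and "expire" snaps back to the full
 * ip_rt_gc_timeout once it overflows or the cache drops below gc_thresh. */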
422 static int rt_garbage_collect(void)
424 static unsigned expire = RT_GC_TIMEOUT;
425 static unsigned long last_gc;
426 static int rover;
427 static int equilibrium;
428 struct rtable *rth, **rthp;
429 unsigned long now = jiffies;
430 int goal;
433 * Garbage collection is pretty expensive,
434 * do not run it too frequently.
436 if (now - last_gc < ip_rt_gc_min_interval &&
437 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
438 return 0;
440 /* Calculate the number of entries we want to expire now. */
441 goal = atomic_read(&ipv4_dst_ops.entries) - RT_HASH_DIVISOR*ip_rt_gc_elasticity;
442 if (goal <= 0) {
443 if (equilibrium < ipv4_dst_ops.gc_thresh)
444 equilibrium = ipv4_dst_ops.gc_thresh;
445 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
446 if (goal > 0) {
447 equilibrium += min(goal/2, RT_HASH_DIVISOR);
448 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
450 } else {
451 /* We are in a dangerous area. Try to reduce the cache really
452 * aggressively.
454 goal = max(goal/2, RT_HASH_DIVISOR);
455 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
458 if (now - last_gc >= ip_rt_gc_min_interval)
459 last_gc = now;
461 if (goal <= 0) {
462 equilibrium += goal;
463 goto work_done;
466 do {
467 int i, k;
469 write_lock_bh(&rt_hash_lock);
470 for (i=0, k=rover; i<RT_HASH_DIVISOR; i++) {
471 unsigned tmo = expire;
473 k = (k + 1) & (RT_HASH_DIVISOR-1);
474 rthp = &rt_hash_table[k];
475 while ((rth = *rthp) != NULL) {
476 if (!rt_may_expire(rth, tmo, expire)) {
477 tmo >>= 1;
478 rthp = &rth->u.rt_next;
479 continue;
481 *rthp = rth->u.rt_next;
482 rth->u.rt_next = NULL;
483 rt_free(rth);
484 goal--;
486 if (goal <= 0)
487 break;
489 rover = k;
490 write_unlock_bh(&rt_hash_lock);
492 if (goal <= 0)
493 goto work_done;
495 /* Goal is not achieved. We stop the process if:
497 - expire has been reduced to zero; otherwise expire is halved,
498 - the table is not full,
499 - we are called from interrupt context,
500 - the jiffies check is just a fallback/debug loop breaker.
501 We will not spin here for a long time in any case.
504 if (expire == 0)
505 break;
507 expire >>= 1;
508 #if RT_CACHE_DEBUG >= 2
509 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire, atomic_read(&ipv4_dst_ops.entries), goal, i);
510 #endif
512 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
513 return 0;
514 } while (!in_interrupt() && jiffies - now < 1);
516 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
517 return 0;
518 if (net_ratelimit())
519 printk("dst cache overflow\n");
520 return 1;
522 work_done:
523 expire += ip_rt_gc_min_interval;
524 if (expire > ip_rt_gc_timeout ||
525 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
526 expire = ip_rt_gc_timeout;
527 #if RT_CACHE_DEBUG >= 2
528 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire, atomic_read(&ipv4_dst_ops.entries), goal, rover);
529 #endif
530 return 0;
533 static int rt_intern_hash(unsigned hash, struct rtable * rt, struct rtable ** rp)
535 struct rtable *rth, **rthp;
536 unsigned long now = jiffies;
537 int attempts = !in_interrupt();
539 restart:
540 rthp = &rt_hash_table[hash];
542 write_lock_bh(&rt_hash_lock);
543 while ((rth = *rthp) != NULL) {
544 if (memcmp(&rth->key, &rt->key, sizeof(rt->key)) == 0) {
545 /* Put it first */
546 *rthp = rth->u.rt_next;
547 rth->u.rt_next = rt_hash_table[hash];
548 rt_hash_table[hash] = rth;
550 atomic_inc(&rth->u.dst.refcnt);
551 atomic_inc(&rth->u.dst.use);
552 rth->u.dst.lastuse = now;
553 write_unlock_bh(&rt_hash_lock);
555 rt_drop(rt);
556 *rp = rth;
557 return 0;
560 rthp = &rth->u.rt_next;
563 /* Try to bind the route to an ARP neighbour only if it is an output
564 route or on the unicast forwarding path.
566 if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0) {
567 if (!arp_bind_neighbour(&rt->u.dst)) {
568 write_unlock_bh(&rt_hash_lock);
570 /* Neighbour tables are full and nothing
571 can be released. Try to shrink the route cache,
572 since it most likely holds some neighbour records.
574 if (attempts-- > 0) {
575 int saved_elasticity = ip_rt_gc_elasticity;
576 int saved_int = ip_rt_gc_min_interval;
577 ip_rt_gc_elasticity = 1;
578 ip_rt_gc_min_interval = 0;
579 rt_garbage_collect();
580 ip_rt_gc_min_interval = saved_int;
581 ip_rt_gc_elasticity = saved_elasticity;
582 goto restart;
585 rt_drop(rt);
586 if (net_ratelimit())
587 printk("neighbour table overflow\n");
588 return -ENOBUFS;
592 rt->u.rt_next = rt_hash_table[hash];
593 #if RT_CACHE_DEBUG >= 2
594 if (rt->u.rt_next) {
595 struct rtable * trt;
596 printk("rt_cache @%02x: %08x", hash, rt->rt_dst);
597 for (trt=rt->u.rt_next; trt; trt=trt->u.rt_next)
598 printk(" . %08x", trt->rt_dst);
599 printk("\n");
601 #endif
602 rt_hash_table[hash] = rt;
603 *rp = rt;
604 write_unlock_bh(&rt_hash_lock);
605 return 0;
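/* Behaviour of rt_intern_hash(): if an entry with an identical key is
 * already chained, it is moved to the front of the bucket and returned
 * through *rp while the new route is dropped; otherwise the new route
 * is bound to an ARP neighbour (for output and unicast-forward routes)
 * and inserted at the head of the chain. */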
608 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
609 u32 saddr, u8 tos, struct device *dev)
611 int i, k;
612 struct in_device *in_dev = dev->ip_ptr;
613 struct rtable *rth, **rthp;
614 u32 skeys[2] = { saddr, 0 };
615 int ikeys[2] = { dev->ifindex, 0 };
617 tos &= IPTOS_TOS_MASK;
619 if (!in_dev)
620 return;
622 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
623 || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
624 goto reject_redirect;
626 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
627 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
628 goto reject_redirect;
629 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
630 goto reject_redirect;
631 } else {
632 if (inet_addr_type(new_gw) != RTN_UNICAST)
633 goto reject_redirect;
636 for (i=0; i<2; i++) {
637 for (k=0; k<2; k++) {
638 unsigned hash = rt_hash_code(daddr, skeys[i]^(ikeys[k]<<5), tos);
640 rthp=&rt_hash_table[hash];
642 write_lock_bh(&rt_hash_lock);
643 while ( (rth = *rthp) != NULL) {
644 struct rtable *rt;
646 if (rth->key.dst != daddr ||
647 rth->key.src != skeys[i] ||
648 rth->key.tos != tos ||
649 rth->key.oif != ikeys[k] ||
650 rth->key.iif != 0) {
651 rthp = &rth->u.rt_next;
652 continue;
655 if (rth->rt_dst != daddr ||
656 rth->rt_src != saddr ||
657 rth->u.dst.error ||
658 rth->rt_gateway != old_gw ||
659 rth->u.dst.dev != dev)
660 break;
662 dst_clone(&rth->u.dst);
664 rt = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
665 if (rt == NULL) {
666 ip_rt_put(rth);
667 write_unlock_bh(&rt_hash_lock);
668 return;
672 * Copy all the information.
674 *rt = *rth;
675 atomic_set(&rt->u.dst.refcnt, 1);
676 atomic_set(&rt->u.dst.use, 1);
677 rt->u.dst.lastuse = jiffies;
678 rt->u.dst.neighbour = NULL;
679 rt->u.dst.hh = NULL;
681 rt->rt_flags |= RTCF_REDIRECTED;
683 /* Gateway is different ... */
684 rt->rt_gateway = new_gw;
686 /* Redirect received -> path was valid */
687 dst_confirm(&rth->u.dst);
689 if (!arp_bind_neighbour(&rt->u.dst) ||
690 !(rt->u.dst.neighbour->nud_state&NUD_VALID)) {
691 if (rt->u.dst.neighbour)
692 neigh_event_send(rt->u.dst.neighbour, NULL);
693 ip_rt_put(rth);
694 rt_drop(rt);
695 break;
698 *rthp = rth->u.rt_next;
699 if (!rt_intern_hash(hash, rt, &rt))
700 ip_rt_put(rt);
701 rt_drop(rth);
702 break;
704 write_unlock_bh(&rt_hash_lock);
707 return;
709 reject_redirect:
710 #ifdef CONFIG_IP_ROUTE_VERBOSE
711 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
712 printk(KERN_INFO "Redirect from %lX/%s to %lX ignored."
713 "Path = %lX -> %lX, tos %02x\n",
714 ntohl(old_gw), dev->name, ntohl(new_gw),
715 ntohl(saddr), ntohl(daddr), tos);
716 #endif
719 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
721 struct rtable *rt = (struct rtable*)dst;
723 if (rt != NULL) {
724 if (dst->obsolete) {
725 ip_rt_put(rt);
726 return NULL;
728 if ((rt->rt_flags&RTCF_REDIRECTED) || rt->u.dst.expires) {
729 unsigned hash = rt_hash_code(rt->key.dst, rt->key.src^(rt->key.oif<<5), rt->key.tos);
730 struct rtable **rthp;
731 #if RT_CACHE_DEBUG >= 1
732 printk(KERN_DEBUG "ip_rt_advice: redirect to %d.%d.%d.%d/%02x dropped\n", NIPQUAD(rt->rt_dst), rt->key.tos);
733 #endif
734 ip_rt_put(rt);
735 write_lock_bh(&rt_hash_lock);
736 for (rthp = &rt_hash_table[hash]; *rthp; rthp = &(*rthp)->u.rt_next) {
737 if (*rthp == rt) {
738 *rthp = rt->u.rt_next;
739 rt_free(rt);
740 break;
743 write_unlock_bh(&rt_hash_lock);
744 return NULL;
747 return dst;
751 * Algorithm:
752 * 1. The first ip_rt_redirect_number redirects are sent
753 * with exponential backoff, then we stop sending them at all,
754 * assuming that the host ignores our redirects.
755 * 2. If we did not see packets requiring redirects
756 * during ip_rt_redirect_silence, we assume that the host
757 * forgot the redirected route and start sending redirects again.
759 * This algorithm is much cheaper and more intelligent than dumb load limiting
760 * in icmp.c.
762 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
763 * and "frag. need" (breaks PMTU discovery) in icmp.c.
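/* Rough numbers, assuming HZ=100 and the defaults above: a redirect is
 * sent when the gap since the previous one exceeds
 * ip_rt_redirect_load << rate_tokens, i.e. 20ms, 40ms, ... up to about
 * 5.1s before the ninth and last one; afterwards we stay silent until
 * ip_rt_redirect_silence (about 20.5s) without triggering packets
 * resets the counter. */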
766 void ip_rt_send_redirect(struct sk_buff *skb)
768 struct rtable *rt = (struct rtable*)skb->dst;
769 struct in_device *in_dev = (struct in_device*)rt->u.dst.dev->ip_ptr;
771 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev))
772 return;
774 /* No redirected packets during ip_rt_redirect_silence;
775 * reset the algorithm.
777 if (jiffies - rt->u.dst.rate_last > ip_rt_redirect_silence)
778 rt->u.dst.rate_tokens = 0;
780 /* Too many ignored redirects; do not send anything,
781 * just set u.dst.rate_last to the last seen redirected packet.
783 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
784 rt->u.dst.rate_last = jiffies;
785 return;
788 /* Check for load limit; set rate_last to the latest sent
789 * redirect.
791 if (jiffies - rt->u.dst.rate_last > (ip_rt_redirect_load<<rt->u.dst.rate_tokens)) {
792 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
793 rt->u.dst.rate_last = jiffies;
794 ++rt->u.dst.rate_tokens;
795 #ifdef CONFIG_IP_ROUTE_VERBOSE
796 if (IN_DEV_LOG_MARTIANS(in_dev) &&
797 rt->u.dst.rate_tokens == ip_rt_redirect_number && net_ratelimit())
798 printk(KERN_WARNING "host %08x/if%d ignores redirects for %08x to %08x.\n",
799 rt->rt_src, rt->rt_iif, rt->rt_dst, rt->rt_gateway);
800 #endif
804 static int ip_error(struct sk_buff *skb)
806 struct rtable *rt = (struct rtable*)skb->dst;
807 unsigned long now;
808 int code;
810 switch (rt->u.dst.error) {
811 case EINVAL:
812 default:
813 kfree_skb(skb);
814 return 0;
815 case EHOSTUNREACH:
816 code = ICMP_HOST_UNREACH;
817 break;
818 case ENETUNREACH:
819 code = ICMP_NET_UNREACH;
820 break;
821 case EACCES:
822 code = ICMP_PKT_FILTERED;
823 break;
826 now = jiffies;
827 if ((rt->u.dst.rate_tokens += (now - rt->u.dst.rate_last)) > ip_rt_error_burst)
828 rt->u.dst.rate_tokens = ip_rt_error_burst;
829 rt->u.dst.rate_last = now;
830 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
831 rt->u.dst.rate_tokens -= ip_rt_error_cost;
832 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
835 kfree_skb(skb);
836 return 0;
840 * The last two values are not from the RFC but
841 * are needed for AMPRnet AX.25 paths.
844 static unsigned short mtu_plateau[] =
845 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
847 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
849 int i;
851 for (i = 0; i < sizeof(mtu_plateau)/sizeof(mtu_plateau[0]); i++)
852 if (old_mtu > mtu_plateau[i])
853 return mtu_plateau[i];
854 return 68;
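/* guess_mtu() walks the plateau table (an RFC 1191-style list, with the
 * last two values being the AX.25 additions mentioned above) and returns
 * the largest plateau strictly below the old MTU, falling back to 68,
 * the minimum IPv4 MTU. */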
857 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
859 int i;
860 unsigned short old_mtu = ntohs(iph->tot_len);
861 struct rtable *rth;
862 u32 skeys[2] = { iph->saddr, 0, };
863 u32 daddr = iph->daddr;
864 u8 tos = iph->tos & IPTOS_TOS_MASK;
865 unsigned short est_mtu = 0;
867 if (ipv4_config.no_pmtu_disc)
868 return 0;
870 for (i=0; i<2; i++) {
871 unsigned hash = rt_hash_code(daddr, skeys[i], tos);
873 read_lock_bh(&rt_hash_lock);
874 for (rth = rt_hash_table[hash]; rth; rth = rth->u.rt_next) {
875 if (rth->key.dst == daddr &&
876 rth->key.src == skeys[i] &&
877 rth->rt_dst == daddr &&
878 rth->rt_src == iph->saddr &&
879 rth->key.tos == tos &&
880 rth->key.iif == 0 &&
881 !(rth->u.dst.mxlock&(1<<RTAX_MTU))) {
882 unsigned short mtu = new_mtu;
884 if (new_mtu < 68 || new_mtu >= old_mtu) {
886 /* BSD 4.2 compatibility hack :-( */
887 if (mtu == 0 && old_mtu >= rth->u.dst.pmtu &&
888 old_mtu >= 68 + (iph->ihl<<2))
889 old_mtu -= iph->ihl<<2;
891 mtu = guess_mtu(old_mtu);
893 if (mtu <= rth->u.dst.pmtu) {
894 if (mtu < rth->u.dst.pmtu) {
895 dst_confirm(&rth->u.dst);
896 rth->u.dst.pmtu = mtu;
897 dst_set_expires(&rth->u.dst, ip_rt_mtu_expires);
899 est_mtu = mtu;
903 read_unlock_bh(&rt_hash_lock);
905 return est_mtu ? : new_mtu;
908 void ip_rt_update_pmtu(struct dst_entry *dst, unsigned mtu)
910 if (dst->pmtu > mtu && mtu >= 68 &&
911 !(dst->mxlock&(1<<RTAX_MTU))) {
912 dst->pmtu = mtu;
913 dst_set_expires(dst, ip_rt_mtu_expires);
917 static struct dst_entry * ipv4_dst_check(struct dst_entry * dst, u32 cookie)
919 dst_release(dst);
920 return NULL;
923 static struct dst_entry * ipv4_dst_reroute(struct dst_entry * dst,
924 struct sk_buff *skb)
926 return NULL;
929 static void ipv4_link_failure(struct sk_buff *skb)
931 struct rtable *rt;
933 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
935 rt = (struct rtable *) skb->dst;
936 if (rt)
937 dst_set_expires(&rt->u.dst, 0);
940 static int ip_rt_bug(struct sk_buff *skb)
942 printk(KERN_DEBUG "ip_rt_bug: %08x -> %08x, %s\n", skb->nh.iph->saddr,
943 skb->nh.iph->daddr, skb->dev ? skb->dev->name : "?");
944 kfree_skb(skb);
945 return 0;
949 We do not cache the source address of the outgoing interface,
950 because it is used only by the IP RR, TS and SRR options,
951 so it is out of the fast path.
953 BTW remember: "addr" is allowed to be unaligned
954 in IP options!
957 void ip_rt_get_source(u8 *addr, struct rtable *rt)
959 u32 src;
960 struct fib_result res;
962 if (rt->key.iif == 0)
963 src = rt->rt_src;
964 else if (fib_lookup(&rt->key, &res) == 0)
965 src = FIB_RES_PREFSRC(res);
966 else
967 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE);
968 memcpy(addr, &src, 4);
971 #ifdef CONFIG_NET_CLS_ROUTE
972 static void set_class_tag(struct rtable *rt, u32 tag)
974 if (!(rt->u.dst.tclassid&0xFFFF))
975 rt->u.dst.tclassid |= tag&0xFFFF;
976 if (!(rt->u.dst.tclassid&0xFFFF0000))
977 rt->u.dst.tclassid |= tag&0xFFFF0000;
979 #endif
981 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
983 struct fib_info *fi = res->fi;
985 if (fi) {
986 if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
987 rt->rt_gateway = FIB_RES_GW(*res);
988 rt->u.dst.mxlock = fi->fib_metrics[RTAX_LOCK-1];
989 rt->u.dst.pmtu = fi->fib_mtu;
990 if (fi->fib_mtu == 0) {
991 rt->u.dst.pmtu = rt->u.dst.dev->mtu;
992 if (rt->u.dst.pmtu > IP_MAX_MTU)
993 rt->u.dst.pmtu = IP_MAX_MTU;
994 if (rt->u.dst.mxlock&(1<<RTAX_MTU) &&
995 rt->rt_gateway != rt->rt_dst &&
996 rt->u.dst.pmtu > 576)
997 rt->u.dst.pmtu = 576;
999 rt->u.dst.window= fi->fib_window ? : 0;
1000 rt->u.dst.rtt = fi->fib_rtt ? : TCP_TIMEOUT_INIT;
1001 #ifdef CONFIG_NET_CLS_ROUTE
1002 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1003 #endif
1004 } else {
1005 rt->u.dst.pmtu = rt->u.dst.dev->mtu;
1006 if (rt->u.dst.pmtu > IP_MAX_MTU)
1007 rt->u.dst.pmtu = IP_MAX_MTU;
1008 rt->u.dst.window= 0;
1009 rt->u.dst.rtt = TCP_TIMEOUT_INIT;
1011 #ifdef CONFIG_NET_CLS_ROUTE
1012 #ifdef CONFIG_IP_MULTIPLE_TABLES
1013 set_class_tag(rt, fib_rules_tclass(res));
1014 #endif
1015 set_class_tag(rt, itag);
1016 #endif
1017 rt->rt_type = res->type;
1020 static int
1021 ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1022 u8 tos, struct device *dev, int our)
1024 unsigned hash;
1025 struct rtable *rth;
1026 u32 spec_dst;
1027 struct in_device *in_dev = dev->ip_ptr;
1028 u32 itag = 0;
1030 /* Primary sanity checks. */
1032 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1033 in_dev == NULL || skb->protocol != __constant_htons(ETH_P_IP))
1034 return -EINVAL;
1036 if (ZERONET(saddr)) {
1037 if (!LOCAL_MCAST(daddr))
1038 return -EINVAL;
1039 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1040 } else if (fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag) < 0)
1041 return -EINVAL;
1043 rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
1044 if (!rth)
1045 return -ENOBUFS;
1047 rth->u.dst.output= ip_rt_bug;
1049 atomic_set(&rth->u.dst.use, 1);
1050 rth->key.dst = daddr;
1051 rth->rt_dst = daddr;
1052 rth->key.tos = tos;
1053 #ifdef CONFIG_IP_ROUTE_FWMARK
1054 rth->key.fwmark = skb->fwmark;
1055 #endif
1056 rth->key.src = saddr;
1057 rth->rt_src = saddr;
1058 #ifdef CONFIG_IP_ROUTE_NAT
1059 rth->rt_dst_map = daddr;
1060 rth->rt_src_map = saddr;
1061 #endif
1062 #ifdef CONFIG_NET_CLS_ROUTE
1063 rth->u.dst.tclassid = itag;
1064 #endif
1065 rth->rt_iif =
1066 rth->key.iif = dev->ifindex;
1067 rth->u.dst.dev = &loopback_dev;
1068 rth->key.oif = 0;
1069 rth->rt_gateway = daddr;
1070 rth->rt_spec_dst= spec_dst;
1071 rth->rt_type = RTN_MULTICAST;
1072 rth->rt_flags = RTCF_MULTICAST;
1073 if (our) {
1074 rth->u.dst.input= ip_local_deliver;
1075 rth->rt_flags |= RTCF_LOCAL;
1078 #ifdef CONFIG_IP_MROUTE
1079 if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1080 rth->u.dst.input = ip_mr_input;
1081 #endif
1083 hash = rt_hash_code(daddr, saddr^(dev->ifindex<<5), tos);
1084 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1088 * NOTE. We drop all packets that have local source
1089 * addresses, because every properly looped-back packet
1090 * must already have the correct destination attached by the output routine.
1092 * This approach solves two big problems:
1093 * 1. Non-simplex devices are handled properly.
1094 * 2. IP spoofing attempts are filtered with a 100% guarantee.
1097 int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1098 u8 tos, struct device *dev)
1100 struct rt_key key;
1101 struct fib_result res;
1102 struct in_device *in_dev = dev->ip_ptr;
1103 struct in_device *out_dev;
1104 unsigned flags = 0;
1105 u32 itag = 0;
1106 struct rtable * rth;
1107 unsigned hash;
1108 u32 spec_dst;
1109 int err = -EINVAL;
1112 * IP on this device is disabled.
1115 if (!in_dev)
1116 return -EINVAL;
1118 key.dst = daddr;
1119 key.src = saddr;
1120 key.tos = tos;
1121 #ifdef CONFIG_IP_ROUTE_FWMARK
1122 key.fwmark = skb->fwmark;
1123 #endif
1124 key.iif = dev->ifindex;
1125 key.oif = 0;
1126 key.scope = RT_SCOPE_UNIVERSE;
1128 hash = rt_hash_code(daddr, saddr^(key.iif<<5), tos);
1130 /* Check for the weirdest martians, which may not be detected
1131 by fib_lookup.
1134 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1135 goto martian_source;
1137 if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1138 goto brd_input;
1140 /* Accept zero addresses only for limited broadcast;
1141 * I do not even know whether to fix this or not. Waiting for complaints :-)
1143 if (ZERONET(saddr))
1144 goto martian_source;
1146 if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1147 goto martian_destination;
1150 * Now we are ready to route packet.
1152 if ((err = fib_lookup(&key, &res))) {
1153 if (!IN_DEV_FORWARD(in_dev))
1154 return -EINVAL;
1155 goto no_route;
1158 #ifdef CONFIG_IP_ROUTE_NAT
1159 /* Policy is applied before mapping the destination,
1160 but rerouting after the map should be done with the old source.
1163 if (1) {
1164 u32 src_map = saddr;
1165 if (res.r)
1166 src_map = fib_rules_policy(saddr, &res, &flags);
1168 if (res.type == RTN_NAT) {
1169 key.dst = fib_rules_map_destination(daddr, &res);
1170 if (fib_lookup(&key, &res) || res.type != RTN_UNICAST)
1171 return -EINVAL;
1172 flags |= RTCF_DNAT;
1174 key.src = src_map;
1176 #endif
1178 if (res.type == RTN_BROADCAST)
1179 goto brd_input;
1181 if (res.type == RTN_LOCAL) {
1182 int result;
1183 result = fib_validate_source(saddr, daddr, tos, loopback_dev.ifindex,
1184 dev, &spec_dst, &itag);
1185 if (result < 0)
1186 goto martian_source;
1187 if (result)
1188 flags |= RTCF_DIRECTSRC;
1189 spec_dst = daddr;
1190 goto local_input;
1193 if (!IN_DEV_FORWARD(in_dev))
1194 return -EINVAL;
1195 if (res.type != RTN_UNICAST)
1196 goto martian_destination;
1198 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1199 if (res.fi->fib_nhs > 1 && key.oif == 0)
1200 fib_select_multipath(&key, &res);
1201 #endif
1202 out_dev = FIB_RES_DEV(res)->ip_ptr;
1203 if (out_dev == NULL) {
1204 if (net_ratelimit())
1205 printk(KERN_CRIT "Bug in ip_route_input_slow(). Please, report\n");
1206 return -EINVAL;
1209 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev, &spec_dst, &itag);
1210 if (err < 0)
1211 goto martian_source;
1213 if (err)
1214 flags |= RTCF_DIRECTSRC;
1216 if (out_dev == in_dev && err && !(flags&(RTCF_NAT|RTCF_MASQ)) &&
1217 (IN_DEV_SHARED_MEDIA(out_dev)
1218 || inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res))))
1219 flags |= RTCF_DOREDIRECT;
1221 if (skb->protocol != __constant_htons(ETH_P_IP)) {
1222 /* Not IP (i.e. ARP). Do not create a route if it is
1223 * invalid for proxy arp. DNAT routes are always valid.
1225 if (out_dev == in_dev && !(flags&RTCF_DNAT))
1226 return -EINVAL;
1229 rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
1230 if (!rth)
1231 return -ENOBUFS;
1233 atomic_set(&rth->u.dst.use, 1);
1234 rth->key.dst = daddr;
1235 rth->rt_dst = daddr;
1236 rth->key.tos = tos;
1237 #ifdef CONFIG_IP_ROUTE_FWMARK
1238 rth->key.fwmark = skb->fwmark;
1239 #endif
1240 rth->key.src = saddr;
1241 rth->rt_src = saddr;
1242 rth->rt_gateway = daddr;
1243 #ifdef CONFIG_IP_ROUTE_NAT
1244 rth->rt_src_map = key.src;
1245 rth->rt_dst_map = key.dst;
1246 if (flags&RTCF_DNAT)
1247 rth->rt_gateway = key.dst;
1248 #endif
1249 rth->rt_iif =
1250 rth->key.iif = dev->ifindex;
1251 rth->u.dst.dev = out_dev->dev;
1252 rth->key.oif = 0;
1253 rth->rt_spec_dst= spec_dst;
1255 rth->u.dst.input = ip_forward;
1256 rth->u.dst.output = ip_output;
1258 rt_set_nexthop(rth, &res, itag);
1260 rth->rt_flags = flags;
1262 #ifdef CONFIG_NET_FASTROUTE
1263 if (netdev_fastroute && !(flags&(RTCF_NAT|RTCF_MASQ|RTCF_DOREDIRECT))) {
1264 struct device *odev = rth->u.dst.dev;
1265 if (odev != dev &&
1266 dev->accept_fastpath &&
1267 odev->mtu >= dev->mtu &&
1268 dev->accept_fastpath(dev, &rth->u.dst) == 0)
1269 rth->rt_flags |= RTCF_FAST;
1271 #endif
1273 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1275 brd_input:
1276 if (skb->protocol != __constant_htons(ETH_P_IP))
1277 return -EINVAL;
1279 if (ZERONET(saddr)) {
1280 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1281 } else {
1282 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag);
1283 if (err < 0)
1284 goto martian_source;
1285 if (err)
1286 flags |= RTCF_DIRECTSRC;
1288 flags |= RTCF_BROADCAST;
1289 res.type = RTN_BROADCAST;
1291 local_input:
1292 rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
1293 if (!rth)
1294 return -ENOBUFS;
1296 rth->u.dst.output= ip_rt_bug;
1298 atomic_set(&rth->u.dst.use, 1);
1299 rth->key.dst = daddr;
1300 rth->rt_dst = daddr;
1301 rth->key.tos = tos;
1302 #ifdef CONFIG_IP_ROUTE_FWMARK
1303 rth->key.fwmark = skb->fwmark;
1304 #endif
1305 rth->key.src = saddr;
1306 rth->rt_src = saddr;
1307 #ifdef CONFIG_IP_ROUTE_NAT
1308 rth->rt_dst_map = key.dst;
1309 rth->rt_src_map = key.src;
1310 #endif
1311 #ifdef CONFIG_NET_CLS_ROUTE
1312 rth->u.dst.tclassid = itag;
1313 #endif
1314 rth->rt_iif =
1315 rth->key.iif = dev->ifindex;
1316 rth->u.dst.dev = &loopback_dev;
1317 rth->key.oif = 0;
1318 rth->rt_gateway = daddr;
1319 rth->rt_spec_dst= spec_dst;
1320 rth->u.dst.input= ip_local_deliver;
1321 rth->rt_flags = flags|RTCF_LOCAL;
1322 if (res.type == RTN_UNREACHABLE) {
1323 rth->u.dst.input= ip_error;
1324 rth->u.dst.error= -err;
1325 rth->rt_flags &= ~RTCF_LOCAL;
1327 rth->rt_type = res.type;
1328 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1330 no_route:
1331 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1332 res.type = RTN_UNREACHABLE;
1333 goto local_input;
1336 * Do not cache martian addresses: they should be logged (RFC1812)
1338 martian_destination:
1339 #ifdef CONFIG_IP_ROUTE_VERBOSE
1340 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1341 printk(KERN_WARNING "martian destination %08x from %08x, dev %s\n", daddr, saddr, dev->name);
1342 #endif
1343 return -EINVAL;
1345 martian_source:
1346 #ifdef CONFIG_IP_ROUTE_VERBOSE
1347 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1349 * RFC1812 recommendation: if the source is martian,
1350 * the only hint is the MAC header.
1352 printk(KERN_WARNING "martian source %08x for %08x, dev %s\n", saddr, daddr, dev->name);
1353 if (dev->hard_header_len) {
1354 int i;
1355 unsigned char *p = skb->mac.raw;
1356 printk(KERN_WARNING "ll header:");
1357 for (i=0; i<dev->hard_header_len; i++, p++)
1358 printk(" %02x", *p);
1359 printk("\n");
1362 #endif
1363 return -EINVAL;
1366 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
1367 u8 tos, struct device *dev)
1369 struct rtable * rth;
1370 unsigned hash;
1371 int iif = dev->ifindex;
1373 tos &= IPTOS_TOS_MASK;
1374 hash = rt_hash_code(daddr, saddr^(iif<<5), tos);
1376 read_lock_bh(&rt_hash_lock);
1377 for (rth=rt_hash_table[hash]; rth; rth=rth->u.rt_next) {
1378 if (rth->key.dst == daddr &&
1379 rth->key.src == saddr &&
1380 rth->key.iif == iif &&
1381 rth->key.oif == 0 &&
1382 #ifdef CONFIG_IP_ROUTE_FWMARK
1383 rth->key.fwmark == skb->fwmark &&
1384 #endif
1385 rth->key.tos == tos) {
1386 rth->u.dst.lastuse = jiffies;
1387 atomic_inc(&rth->u.dst.use);
1388 atomic_inc(&rth->u.dst.refcnt);
1389 read_unlock_bh(&rt_hash_lock);
1390 skb->dst = (struct dst_entry*)rth;
1391 return 0;
1394 read_unlock_bh(&rt_hash_lock);
1396 /* Multicast recognition logic has been moved from the route cache to here.
1397 The problem was that too many Ethernet cards have broken/missing
1398 hardware multicast filters :-( As a result, a host on a multicast
1399 network acquires a lot of useless route cache entries, e.g. from
1400 SDR messages from all over the world. Now we try to get rid of them.
1401 Really, provided the software IP multicast filter is organized
1402 reasonably (at least, hashed), it does not result in a slowdown
1403 compared with route cache reject entries.
1404 Note that multicast routers are not affected, because a
1405 route cache entry is created for them eventually.
1407 if (MULTICAST(daddr)) {
1408 int our = ip_check_mc(dev, daddr);
1409 if (!our
1410 #ifdef CONFIG_IP_MROUTE
1411 && (LOCAL_MCAST(daddr) || !dev->ip_ptr ||
1412 !IN_DEV_MFORWARD((struct in_device*)dev->ip_ptr))
1413 #endif
1414 ) return -EINVAL;
1415 return ip_route_input_mc(skb, daddr, saddr, tos, dev, our);
1417 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
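/* ip_route_input() is the input fast path: a single hash lookup keyed on
 * (daddr, saddr, iif, oif == 0, tos[, fwmark]).  On a miss, multicast
 * destinations are screened (see the comment above) before everything
 * else falls through to ip_route_input_slow(). */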
1421 * Major route resolver routine.
1424 int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif)
1426 struct rt_key key;
1427 struct fib_result res;
1428 unsigned flags = 0;
1429 struct rtable *rth;
1430 struct device *dev_out = NULL;
1431 unsigned hash;
1432 #ifdef CONFIG_IP_TRANSPARENT_PROXY
1433 u32 nochecksrc = (tos & RTO_TPROXY);
1434 #endif
1436 tos &= IPTOS_TOS_MASK|RTO_ONLINK;
1437 key.dst = daddr;
1438 key.src = saddr;
1439 key.tos = tos&IPTOS_TOS_MASK;
1440 key.iif = loopback_dev.ifindex;
1441 key.oif = oif;
1442 key.scope = (tos&RTO_ONLINK) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
1443 res.fi = NULL;
1444 #ifdef CONFIG_IP_MULTIPLE_TABLES
1445 res.r = NULL;
1446 #endif
1448 if (saddr) {
1449 if (MULTICAST(saddr) || BADCLASS(saddr) || ZERONET(saddr))
1450 return -EINVAL;
1452 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1453 dev_out = ip_dev_find(saddr);
1454 #ifdef CONFIG_IP_TRANSPARENT_PROXY
1455 /* If the address is not local, test for the transparent proxy flag;
1456 if the address is local, clear the flag.
1458 if (dev_out == NULL) {
1459 if (nochecksrc == 0 || inet_addr_type(saddr) != RTN_UNICAST)
1460 return -EINVAL;
1461 flags |= RTCF_TPROXY;
1463 #else
1464 if (dev_out == NULL)
1465 return -EINVAL;
1466 #endif
1468 /* I removed the check for oif == dev_out->oif here.
1469 It was wrong for two reasons:
1470 1. ip_dev_find(saddr) can return the wrong iface, if saddr is
1471 assigned to multiple interfaces.
1472 2. Moreover, we are allowed to send packets with the saddr
1473 of another iface. --ANK
1476 if (oif == 0 &&
1477 #ifdef CONFIG_IP_TRANSPARENT_PROXY
1478 dev_out &&
1479 #endif
1480 (MULTICAST(daddr) || daddr == 0xFFFFFFFF)) {
1481 /* Special hack: the user can direct multicasts
1482 and limited broadcast via the desired interface
1483 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1484 This hack is not just for fun, it allows
1485 vic, vat and friends to work.
1486 They bind the socket to loopback, set ttl to zero
1487 and expect that it will work.
1488 From the viewpoint of the routing cache they are broken,
1489 because we are not allowed to build a multicast path
1490 with a loopback source addr (look, the routing cache
1491 cannot know that ttl is zero, so that the packet
1492 will not leave this host and the route is valid).
1493 Luckily, this hack is a good workaround.
1496 key.oif = dev_out->ifindex;
1497 goto make_route;
1499 dev_out = NULL;
1501 if (oif) {
1502 dev_out = dev_get_by_index(oif);
1503 if (dev_out == NULL)
1504 return -ENODEV;
1505 if (dev_out->ip_ptr == NULL)
1506 return -ENODEV; /* Wrong error code */
1508 if (LOCAL_MCAST(daddr) || daddr == 0xFFFFFFFF) {
1509 if (!key.src)
1510 key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK);
1511 goto make_route;
1513 if (!key.src) {
1514 if (MULTICAST(daddr))
1515 key.src = inet_select_addr(dev_out, 0, key.scope);
1516 else if (!daddr)
1517 key.src = inet_select_addr(dev_out, 0, RT_SCOPE_HOST);
1521 if (!key.dst) {
1522 key.dst = key.src;
1523 if (!key.dst)
1524 key.dst = key.src = htonl(INADDR_LOOPBACK);
1525 dev_out = &loopback_dev;
1526 key.oif = loopback_dev.ifindex;
1527 res.type = RTN_LOCAL;
1528 flags |= RTCF_LOCAL;
1529 goto make_route;
1532 if (fib_lookup(&key, &res)) {
1533 res.fi = NULL;
1534 if (oif) {
1535 /* Apparently, the routing tables are wrong. Assume
1536 that the destination is on-link.
1538 WHY? DW.
1539 Because we are allowed to send to an iface
1540 even if it has NO routes and NO assigned
1541 addresses. When oif is specified, the routing
1542 tables are looked up with only one purpose:
1543 to catch whether the destination is gatewayed, rather than
1544 direct. Moreover, if MSG_DONTROUTE is set,
1545 we send the packet, ignoring both routing tables
1546 and ifaddr state. --ANK
1549 We could make it even if oif is unknown,
1550 likely IPv6, but we do not.
1553 if (key.src == 0)
1554 key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK);
1555 res.type = RTN_UNICAST;
1556 goto make_route;
1558 return -ENETUNREACH;
1561 if (res.type == RTN_NAT)
1562 return -EINVAL;
1564 if (res.type == RTN_LOCAL) {
1565 if (!key.src)
1566 key.src = key.dst;
1567 dev_out = &loopback_dev;
1568 key.oif = dev_out->ifindex;
1569 res.fi = NULL;
1570 flags |= RTCF_LOCAL;
1571 goto make_route;
1574 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1575 if (res.fi->fib_nhs > 1 && key.oif == 0)
1576 fib_select_multipath(&key, &res);
1577 else
1578 #endif
1579 if (res.prefixlen==0 && res.type == RTN_UNICAST && key.oif == 0)
1580 fib_select_default(&key, &res);
1582 if (!key.src)
1583 key.src = FIB_RES_PREFSRC(res);
1585 dev_out = FIB_RES_DEV(res);
1586 key.oif = dev_out->ifindex;
1588 make_route:
1589 if (LOOPBACK(key.src) && !(dev_out->flags&IFF_LOOPBACK))
1590 return -EINVAL;
1592 if (key.dst == 0xFFFFFFFF)
1593 res.type = RTN_BROADCAST;
1594 else if (MULTICAST(key.dst))
1595 res.type = RTN_MULTICAST;
1596 else if (BADCLASS(key.dst) || ZERONET(key.dst))
1597 return -EINVAL;
1599 if (dev_out->flags&IFF_LOOPBACK)
1600 flags |= RTCF_LOCAL;
1602 if (res.type == RTN_BROADCAST) {
1603 flags |= RTCF_BROADCAST|RTCF_LOCAL;
1604 res.fi = NULL;
1605 } else if (res.type == RTN_MULTICAST) {
1606 flags |= RTCF_MULTICAST|RTCF_LOCAL;
1607 if (!ip_check_mc(dev_out, daddr))
1608 flags &= ~RTCF_LOCAL;
1609 /* If a multicast route does not exist, use the
1610 default one, but do not gateway in this case.
1611 Yes, it is a hack.
1613 if (res.fi && res.prefixlen < 4)
1614 res.fi = NULL;
1617 rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
1618 if (!rth)
1619 return -ENOBUFS;
1621 atomic_set(&rth->u.dst.use, 1);
1622 rth->key.dst = daddr;
1623 rth->key.tos = tos;
1624 rth->key.src = saddr;
1625 rth->key.iif = 0;
1626 rth->key.oif = oif;
1627 rth->rt_dst = key.dst;
1628 rth->rt_src = key.src;
1629 #ifdef CONFIG_IP_ROUTE_NAT
1630 rth->rt_dst_map = key.dst;
1631 rth->rt_src_map = key.src;
1632 #endif
1633 rth->rt_iif = oif ? : dev_out->ifindex;
1634 rth->u.dst.dev = dev_out;
1635 rth->rt_gateway = key.dst;
1636 rth->rt_spec_dst= key.src;
1638 rth->u.dst.output=ip_output;
1640 if (flags&RTCF_LOCAL) {
1641 rth->u.dst.input = ip_local_deliver;
1642 rth->rt_spec_dst = key.dst;
1644 if (flags&(RTCF_BROADCAST|RTCF_MULTICAST)) {
1645 rth->rt_spec_dst = key.src;
1646 if (flags&RTCF_LOCAL && !(dev_out->flags&IFF_LOOPBACK))
1647 rth->u.dst.output = ip_mc_output;
1648 #ifdef CONFIG_IP_MROUTE
1649 if (res.type == RTN_MULTICAST && dev_out->ip_ptr) {
1650 struct in_device *in_dev = dev_out->ip_ptr;
1651 if (IN_DEV_MFORWARD(in_dev) && !LOCAL_MCAST(daddr)) {
1652 rth->u.dst.input = ip_mr_input;
1653 rth->u.dst.output = ip_mc_output;
1656 #endif
1659 rt_set_nexthop(rth, &res, 0);
1661 rth->rt_flags = flags;
1663 hash = rt_hash_code(daddr, saddr^(oif<<5), tos);
1664 return rt_intern_hash(hash, rth, rp);
1667 int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif)
1669 unsigned hash;
1670 struct rtable *rth;
1672 hash = rt_hash_code(daddr, saddr^(oif<<5), tos);
1674 read_lock_bh(&rt_hash_lock);
1675 for (rth=rt_hash_table[hash]; rth; rth=rth->u.rt_next) {
1676 if (rth->key.dst == daddr &&
1677 rth->key.src == saddr &&
1678 rth->key.iif == 0 &&
1679 rth->key.oif == oif &&
1680 #ifndef CONFIG_IP_TRANSPARENT_PROXY
1681 rth->key.tos == tos
1682 #else
1683 !((rth->key.tos^tos)&(IPTOS_TOS_MASK|RTO_ONLINK)) &&
1684 ((tos&RTO_TPROXY) || !(rth->rt_flags&RTCF_TPROXY))
1685 #endif
1687 rth->u.dst.lastuse = jiffies;
1688 atomic_inc(&rth->u.dst.use);
1689 atomic_inc(&rth->u.dst.refcnt);
1690 read_unlock_bh(&rt_hash_lock);
1691 *rp = rth;
1692 return 0;
1695 read_unlock_bh(&rt_hash_lock);
1697 return ip_route_output_slow(rp, daddr, saddr, tos, oif);
1700 #ifdef CONFIG_RTNETLINK
1702 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int nowait)
1704 struct rtable *rt = (struct rtable*)skb->dst;
1705 struct rtmsg *r;
1706 struct nlmsghdr *nlh;
1707 unsigned char *b = skb->tail;
1708 struct rta_cacheinfo ci;
1709 #ifdef CONFIG_IP_MROUTE
1710 struct rtattr *eptr;
1711 #endif
1712 struct rtattr *mx;
1714 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
1715 r = NLMSG_DATA(nlh);
1716 nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
1717 r->rtm_family = AF_INET;
1718 r->rtm_dst_len = 32;
1719 r->rtm_src_len = 0;
1720 r->rtm_tos = rt->key.tos;
1721 r->rtm_table = RT_TABLE_MAIN;
1722 r->rtm_type = rt->rt_type;
1723 r->rtm_scope = RT_SCOPE_UNIVERSE;
1724 r->rtm_protocol = RTPROT_UNSPEC;
1725 r->rtm_flags = (rt->rt_flags&~0xFFFF) | RTM_F_CLONED;
1726 if (rt->rt_flags & RTCF_NOTIFY)
1727 r->rtm_flags |= RTM_F_NOTIFY;
1728 RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
1729 if (rt->key.src) {
1730 r->rtm_src_len = 32;
1731 RTA_PUT(skb, RTA_SRC, 4, &rt->key.src);
1733 if (rt->u.dst.dev)
1734 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
1735 #ifdef CONFIG_NET_CLS_ROUTE
1736 if (rt->u.dst.tclassid)
1737 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
1738 #endif
1739 if (rt->key.iif)
1740 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
1741 else if (rt->rt_src != rt->key.src)
1742 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
1743 if (rt->rt_dst != rt->rt_gateway)
1744 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
1745 mx = (struct rtattr*)skb->tail;
1746 RTA_PUT(skb, RTA_METRICS, 0, NULL);
1747 if (rt->u.dst.mxlock)
1748 RTA_PUT(skb, RTAX_LOCK, sizeof(unsigned), &rt->u.dst.mxlock);
1749 if (rt->u.dst.pmtu)
1750 RTA_PUT(skb, RTAX_MTU, sizeof(unsigned), &rt->u.dst.pmtu);
1751 if (rt->u.dst.window)
1752 RTA_PUT(skb, RTAX_WINDOW, sizeof(unsigned), &rt->u.dst.window);
1753 if (rt->u.dst.rtt)
1754 RTA_PUT(skb, RTAX_RTT, sizeof(unsigned), &rt->u.dst.rtt);
1755 mx->rta_len = skb->tail - (u8*)mx;
1756 if (mx->rta_len == RTA_LENGTH(0))
1757 skb_trim(skb, (u8*)mx - skb->data);
1758 ci.rta_lastuse = jiffies - rt->u.dst.lastuse;
1759 ci.rta_used = atomic_read(&rt->u.dst.refcnt);
1760 ci.rta_clntref = atomic_read(&rt->u.dst.use);
1761 if (rt->u.dst.expires)
1762 ci.rta_expires = rt->u.dst.expires - jiffies;
1763 else
1764 ci.rta_expires = 0;
1765 ci.rta_error = rt->u.dst.error;
1766 #ifdef CONFIG_IP_MROUTE
1767 eptr = (struct rtattr*)skb->tail;
1768 #endif
1769 RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
1770 if (rt->key.iif) {
1771 #ifdef CONFIG_IP_MROUTE
1772 u32 dst = rt->rt_dst;
1774 if (MULTICAST(dst) && !LOCAL_MCAST(dst) && ipv4_devconf.mc_forwarding) {
1775 int err = ipmr_get_route(skb, r, nowait);
1776 if (err <= 0) {
1777 if (!nowait) {
1778 if (err == 0)
1779 return 0;
1780 goto nlmsg_failure;
1781 } else {
1782 if (err == -EMSGSIZE)
1783 goto nlmsg_failure;
1784 ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
1787 } else
1788 #endif
1790 RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->key.iif);
1794 nlh->nlmsg_len = skb->tail - b;
1795 return skb->len;
1797 nlmsg_failure:
1798 rtattr_failure:
1799 skb_trim(skb, b - skb->data);
1800 return -1;
1803 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1805 struct rtattr **rta = arg;
1806 struct rtmsg *rtm = NLMSG_DATA(nlh);
1807 struct rtable *rt = NULL;
1808 u32 dst = 0;
1809 u32 src = 0;
1810 int iif = 0;
1811 int err;
1812 struct sk_buff *skb;
1814 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1815 if (skb == NULL)
1816 return -ENOBUFS;
1818 /* Reserve room for dummy headers; this skb can pass
1819 through a good chunk of the routing engine.
1821 skb->mac.raw = skb->data;
1822 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
1824 if (rta[RTA_SRC-1])
1825 memcpy(&src, RTA_DATA(rta[RTA_SRC-1]), 4);
1826 if (rta[RTA_DST-1])
1827 memcpy(&dst, RTA_DATA(rta[RTA_DST-1]), 4);
1828 if (rta[RTA_IIF-1])
1829 memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
1831 if (iif) {
1832 struct device *dev;
1833 dev = dev_get_by_index(iif);
1834 if (!dev)
1835 return -ENODEV;
1836 skb->protocol = __constant_htons(ETH_P_IP);
1837 skb->dev = dev;
1838 start_bh_atomic();
1839 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
1840 end_bh_atomic();
1841 rt = (struct rtable*)skb->dst;
1842 if (!err && rt->u.dst.error)
1843 err = -rt->u.dst.error;
1844 } else {
1845 int oif = 0;
1846 if (rta[RTA_OIF-1])
1847 memcpy(&oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1848 err = ip_route_output(&rt, dst, src, rtm->rtm_tos, oif);
1850 if (err) {
1851 kfree_skb(skb);
1852 return err;
1855 skb->dst = &rt->u.dst;
1856 if (rtm->rtm_flags & RTM_F_NOTIFY)
1857 rt->rt_flags |= RTCF_NOTIFY;
1859 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
1861 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, RTM_NEWROUTE, 0);
1862 if (err == 0)
1863 return 0;
1864 if (err < 0)
1865 return -EMSGSIZE;
1867 err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
1868 if (err < 0)
1869 return err;
1870 return 0;
1874 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
1876 struct rtable *rt;
1877 int h, s_h;
1878 int idx, s_idx;
1880 s_h = cb->args[0];
1881 s_idx = idx = cb->args[1];
1882 for (h=0; h < RT_HASH_DIVISOR; h++) {
1883 if (h < s_h) continue;
1884 if (h > s_h)
1885 s_idx = 0;
1886 read_lock_bh(&rt_hash_lock);
1887 for (rt = rt_hash_table[h], idx = 0; rt; rt = rt->u.rt_next, idx++) {
1888 if (idx < s_idx)
1889 continue;
1890 skb->dst = dst_clone(&rt->u.dst);
1891 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
1892 cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1) <= 0) {
1893 dst_release(xchg(&skb->dst, NULL));
1894 read_unlock_bh(&rt_hash_lock);
1895 goto done;
1897 dst_release(xchg(&skb->dst, NULL));
1899 read_unlock_bh(&rt_hash_lock);
1902 done:
1903 cb->args[0] = h;
1904 cb->args[1] = idx;
1905 return skb->len;
1908 #endif /* CONFIG_RTNETLINK */
1910 void ip_rt_multicast_event(struct in_device *in_dev)
1912 rt_cache_flush(0);
1917 #ifdef CONFIG_SYSCTL
1919 static int flush_delay;
1921 static
1922 int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
1923 void *buffer, size_t *lenp)
1925 if (write) {
1926 proc_dointvec(ctl, write, filp, buffer, lenp);
1927 rt_cache_flush(flush_delay);
1928 return 0;
1929 } else
1930 return -EINVAL;
1933 ctl_table ipv4_route_table[] = {
1934 {NET_IPV4_ROUTE_FLUSH, "flush",
1935 &flush_delay, sizeof(int), 0200, NULL,
1936 &ipv4_sysctl_rtcache_flush},
1937 {NET_IPV4_ROUTE_MIN_DELAY, "min_delay",
1938 &ip_rt_min_delay, sizeof(int), 0644, NULL,
1939 &proc_dointvec_jiffies},
1940 {NET_IPV4_ROUTE_MAX_DELAY, "max_delay",
1941 &ip_rt_max_delay, sizeof(int), 0644, NULL,
1942 &proc_dointvec_jiffies},
1943 {NET_IPV4_ROUTE_GC_THRESH, "gc_thresh",
1944 &ipv4_dst_ops.gc_thresh, sizeof(int), 0644, NULL,
1945 &proc_dointvec},
1946 {NET_IPV4_ROUTE_MAX_SIZE, "max_size",
1947 &ip_rt_max_size, sizeof(int), 0644, NULL,
1948 &proc_dointvec},
1949 {NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval",
1950 &ip_rt_gc_min_interval, sizeof(int), 0644, NULL,
1951 &proc_dointvec_jiffies},
1952 {NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout",
1953 &ip_rt_gc_timeout, sizeof(int), 0644, NULL,
1954 &proc_dointvec_jiffies},
1955 {NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval",
1956 &ip_rt_gc_interval, sizeof(int), 0644, NULL,
1957 &proc_dointvec_jiffies},
1958 {NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load",
1959 &ip_rt_redirect_load, sizeof(int), 0644, NULL,
1960 &proc_dointvec},
1961 {NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number",
1962 &ip_rt_redirect_number, sizeof(int), 0644, NULL,
1963 &proc_dointvec},
1964 {NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence",
1965 &ip_rt_redirect_silence, sizeof(int), 0644, NULL,
1966 &proc_dointvec},
1967 {NET_IPV4_ROUTE_ERROR_COST, "error_cost",
1968 &ip_rt_error_cost, sizeof(int), 0644, NULL,
1969 &proc_dointvec},
1970 {NET_IPV4_ROUTE_ERROR_BURST, "error_burst",
1971 &ip_rt_error_burst, sizeof(int), 0644, NULL,
1972 &proc_dointvec},
1973 {NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity",
1974 &ip_rt_gc_elasticity, sizeof(int), 0644, NULL,
1975 &proc_dointvec},
1976 {NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires",
1977 &ip_rt_mtu_expires, sizeof(int), 0644, NULL,
1978 &proc_dointvec_jiffies},
1981 #endif
1983 #ifdef CONFIG_NET_CLS_ROUTE
1984 struct ip_rt_acct ip_rt_acct[256];
1986 #ifdef CONFIG_PROC_FS
1987 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
1988 int length, int *eof, void *data)
1990 *start=buffer;
1992 if (offset + length > sizeof(ip_rt_acct)) {
1993 length = sizeof(ip_rt_acct) - offset;
1994 *eof = 1;
1996 if (length > 0) {
1997 start_bh_atomic();
1998 memcpy(buffer, ((u8*)&ip_rt_acct)+offset, length);
1999 end_bh_atomic();
2000 return length;
2002 return 0;
2004 #endif
2005 #endif
2008 __initfunc(void ip_rt_init(void))
2010 #ifdef CONFIG_PROC_FS
2011 #ifdef CONFIG_NET_CLS_ROUTE
2012 struct proc_dir_entry *ent;
2013 #endif
2014 #endif
2015 devinet_init();
2016 ip_fib_init();
2017 rt_periodic_timer.function = rt_check_expire;
2018 /* All the timers started at system startup tend
2019 to synchronize. Perturb this one a bit.
2021 rt_periodic_timer.expires = jiffies + net_random()%ip_rt_gc_interval
2022 + ip_rt_gc_interval;
2023 add_timer(&rt_periodic_timer);
2025 #ifdef CONFIG_PROC_FS
2026 proc_net_register(&(struct proc_dir_entry) {
2027 PROC_NET_RTCACHE, 8, "rt_cache",
2028 S_IFREG | S_IRUGO, 1, 0, 0,
2029 0, &proc_net_inode_operations,
2030 rt_cache_get_info
2032 #ifdef CONFIG_NET_CLS_ROUTE
2033 ent = create_proc_entry("net/rt_acct", 0, 0);
2034 ent->read_proc = ip_rt_acct_read;
2035 #endif
2036 #endif