Import 2.3.10pre5
[davej-history.git] / net / ipv4 / route.c
blob: 3d9e87de33f296768d9cf45887c2e258d95bd4e3
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
8 * Version: $Id: route.c,v 1.69 1999/06/09 10:11:02 davem Exp $
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
16 * Fixes:
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
23 * Alan Cox : Super /proc >4K
24 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
26 * clamper.
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36 * Alan Cox : Aligned routing errors more closely with BSD
37 * our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
42 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Split into fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
56 * This program is free software; you can redistribute it and/or
57 * modify it under the terms of the GNU General Public License
58 * as published by the Free Software Foundation; either version
59 * 2 of the License, or (at your option) any later version.
62 #include <linux/config.h>
63 #include <asm/uaccess.h>
64 #include <asm/system.h>
65 #include <asm/bitops.h>
66 #include <linux/types.h>
67 #include <linux/kernel.h>
68 #include <linux/sched.h>
69 #include <linux/mm.h>
70 #include <linux/string.h>
71 #include <linux/socket.h>
72 #include <linux/sockios.h>
73 #include <linux/errno.h>
74 #include <linux/in.h>
75 #include <linux/inet.h>
76 #include <linux/netdevice.h>
77 #include <linux/proc_fs.h>
78 #include <linux/init.h>
79 #include <linux/skbuff.h>
80 #include <linux/rtnetlink.h>
81 #include <linux/inetdevice.h>
82 #include <linux/igmp.h>
83 #include <linux/pkt_sched.h>
84 #include <linux/mroute.h>
85 #include <net/protocol.h>
86 #include <net/ip.h>
87 #include <net/route.h>
88 #include <net/sock.h>
89 #include <net/ip_fib.h>
90 #include <net/arp.h>
91 #include <net/tcp.h>
92 #include <net/icmp.h>
93 #ifdef CONFIG_SYSCTL
94 #include <linux/sysctl.h>
95 #endif
97 #define IP_MAX_MTU 0xFFF0
99 #define RT_GC_TIMEOUT (300*HZ)
101 int ip_rt_min_delay = 2*HZ;
102 int ip_rt_max_delay = 10*HZ;
103 int ip_rt_gc_thresh = RT_HASH_DIVISOR;
104 int ip_rt_max_size = RT_HASH_DIVISOR*16;
105 int ip_rt_gc_timeout = RT_GC_TIMEOUT;
106 int ip_rt_gc_interval = 60*HZ;
107 int ip_rt_gc_min_interval = 5*HZ;
108 int ip_rt_redirect_number = 9;
109 int ip_rt_redirect_load = HZ/50;
110 int ip_rt_redirect_silence = ((HZ/50) << (9+1));
111 int ip_rt_error_cost = HZ;
112 int ip_rt_error_burst = 5*HZ;
113 int ip_rt_gc_elasticity = 8;
114 int ip_rt_mtu_expires = 10*60*HZ;
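/* Units: the values above initialized with HZ factors (flush delays, GC
 * timeout/intervals, redirect load/silence, error cost/burst, mtu_expires)
 * are measured in jiffies; ip_rt_gc_thresh and ip_rt_max_size are entry
 * counts, while ip_rt_gc_elasticity and ip_rt_redirect_number are plain
 * counters.
 */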
116 static unsigned long rt_deadline = 0;
118 #define RTprint(a...) printk(KERN_DEBUG a)
120 static void rt_run_flush(unsigned long dummy);
122 static struct timer_list rt_flush_timer =
123 { NULL, NULL, 0, 0L, rt_run_flush };
124 static struct timer_list rt_periodic_timer =
125 { NULL, NULL, 0, 0L, NULL };
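/* The brace initializers above fill the timer_list fields positionally
 * (next, prev, expires, data, function in this kernel generation).
 * rt_periodic_timer is left without a handler here; its function pointer
 * (rt_check_expire) and first expiry are assigned in ip_rt_init() below.
 */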
128 * Interface to generic destination cache.
131 static struct dst_entry * ipv4_dst_check(struct dst_entry * dst, u32);
132 static struct dst_entry * ipv4_dst_reroute(struct dst_entry * dst,
133 struct sk_buff *);
134 static struct dst_entry * ipv4_negative_advice(struct dst_entry *);
135 static void ipv4_link_failure(struct sk_buff *skb);
136 static int rt_garbage_collect(void);
139 struct dst_ops ipv4_dst_ops =
141 AF_INET,
142 __constant_htons(ETH_P_IP),
143 RT_HASH_DIVISOR,
145 rt_garbage_collect,
146 ipv4_dst_check,
147 ipv4_dst_reroute,
148 NULL,
149 ipv4_negative_advice,
150 ipv4_link_failure,
153 __u8 ip_tos2prio[16] = {
154 TC_PRIO_BESTEFFORT,
155 TC_PRIO_FILLER,
156 TC_PRIO_BESTEFFORT,
157 TC_PRIO_FILLER,
158 TC_PRIO_BULK,
159 TC_PRIO_FILLER,
160 TC_PRIO_BULK,
161 TC_PRIO_FILLER,
162 TC_PRIO_INTERACTIVE,
163 TC_PRIO_FILLER,
164 TC_PRIO_INTERACTIVE,
165 TC_PRIO_FILLER,
166 TC_PRIO_INTERACTIVE_BULK,
167 TC_PRIO_FILLER,
168 TC_PRIO_INTERACTIVE_BULK,
169 TC_PRIO_FILLER
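/* Worked example (illustrative): the table is indexed by the 4-bit TOS
 * field shifted right by one, i.e. rt_tos2priority() in <net/route.h>,
 * assuming its usual definition ip_tos2prio[IPTOS_TOS(tos)>>1].  Thus
 * IPTOS_LOWDELAY (0x10) maps to index 8, TC_PRIO_INTERACTIVE, and
 * IPTOS_THROUGHPUT (0x08) maps to index 4, TC_PRIO_BULK.
 */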
174 * Route cache.
177 /* The locking scheme is rather straightforward:
179 * 1) A BH protected rwlock protects the central route hash.
180 * 2) Only writers remove entries, and they hold the lock
181 * as they look at rtable reference counts.
182 * 3) Only readers acquire references to rtable entries,
183 * they do so with atomic increments and with the
184 * lock held.
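 *
 * Reader-side sketch (illustrative; this is the pattern ip_route_input()
 * and ip_route_output() below follow):
 *
 *	read_lock_bh(&rt_hash_lock);
 *	for (rth = rt_hash_table[hash]; rth; rth = rth->u.rt_next)
 *		if (keys match)
 *			atomic_inc(&rth->u.dst.use);	<- reference taken under the lock
 *	read_unlock_bh(&rt_hash_lock);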
187 static struct rtable *rt_hash_table[RT_HASH_DIVISOR];
188 static rwlock_t rt_hash_lock = RW_LOCK_UNLOCKED;
190 static int rt_intern_hash(unsigned hash, struct rtable * rth, struct rtable ** res);
192 static __inline__ unsigned rt_hash_code(u32 daddr, u32 saddr, u8 tos)
194 unsigned hash = ((daddr&0xF0F0F0F0)>>4)|((daddr&0x0F0F0F0F)<<4);
195 hash = hash^saddr^tos;
196 hash = hash^(hash>>16);
197 return (hash^(hash>>8)) & 0xFF;
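/* Worked example (illustrative): for daddr = 0xC0A80001, saddr = 0, tos = 0,
 * the nibble swap yields 0x0C8A0010; folding the high bits down (hash>>16,
 * then hash>>8) gives (0x9A ^ 0x0C) & 0xFF = 0x96, an 8-bit bucket index
 * (consistent with RT_HASH_DIVISOR being 256 in this configuration).
 */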
200 #ifdef CONFIG_PROC_FS
202 static int rt_cache_get_info(char *buffer, char **start, off_t offset, int length, int dummy)
204 int len=0;
205 off_t pos=0;
206 char temp[129];
207 struct rtable *r;
208 int i;
210 pos = 128;
212 if (offset<128) {
213 sprintf(buffer,"%-127s\n", "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\tMetric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\tHHUptod\tSpecDst");
214 len = 128;
218 read_lock_bh(&rt_hash_lock);
220 for (i = 0; i<RT_HASH_DIVISOR; i++) {
221 for (r = rt_hash_table[i]; r; r = r->u.rt_next) {
223 * Spin through entries until we are ready
225 pos += 128;
227 if (pos <= offset) {
228 len = 0;
229 continue;
231 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
232 r->u.dst.dev ? r->u.dst.dev->name : "*",
233 (unsigned long)r->rt_dst,
234 (unsigned long)r->rt_gateway,
235 r->rt_flags,
236 atomic_read(&r->u.dst.use),
237 atomic_read(&r->u.dst.refcnt),
239 (unsigned long)r->rt_src, (int)r->u.dst.pmtu,
240 r->u.dst.window,
241 (int)r->u.dst.rtt, r->key.tos,
242 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
243 r->u.dst.hh ? (r->u.dst.hh->hh_output == dev_queue_xmit) : 0,
244 r->rt_spec_dst);
245 sprintf(buffer+len,"%-127s\n",temp);
246 len += 128;
247 if (pos >= offset+length)
248 goto done;
252 done:
253 read_unlock_bh(&rt_hash_lock);
255 *start = buffer+len-(pos-offset);
256 len = pos-offset;
257 if (len>length)
258 len = length;
259 return len;
261 #endif
263 static __inline__ void rt_free(struct rtable *rt)
265 dst_free(&rt->u.dst);
268 static __inline__ void rt_drop(struct rtable *rt)
270 ip_rt_put(rt);
271 dst_free(&rt->u.dst);
274 static __inline__ int rt_fast_clean(struct rtable *rth)
276 /* Kill broadcast/multicast entries very aggressively, if they
277 collide in hash table with more useful entries */
278 return ((rth->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST))
279 && rth->key.iif && rth->u.rt_next);
282 static __inline__ int rt_valuable(struct rtable *rth)
284 return ((rth->rt_flags&(RTCF_REDIRECTED|RTCF_NOTIFY))
285 || rth->u.dst.expires);
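/* rt_may_expire() below: an entry that is still referenced is never
 * expired, and one past its hard expiry always is.  Otherwise an ordinary
 * entry is protected for tmo1, a "valuable" one (see rt_valuable()) for
 * tmo2, while colliding broadcast/multicast input entries (rt_fast_clean())
 * get no tmo1 grace at all.
 */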
288 static __inline__ int rt_may_expire(struct rtable *rth, int tmo1, int tmo2)
290 int age;
292 if (atomic_read(&rth->u.dst.use))
293 return 0;
295 if (rth->u.dst.expires && (long)(rth->u.dst.expires - jiffies) <= 0)
296 return 1;
298 age = jiffies - rth->u.dst.lastuse;
299 if (age <= tmo1 && !rt_fast_clean(rth))
300 return 0;
301 if (age <= tmo2 && rt_valuable(rth))
302 return 0;
303 return 1;
306 /* This runs via a timer and thus is always in BH context. */
307 static void rt_check_expire(unsigned long dummy)
309 int i;
310 static int rover;
311 struct rtable *rth, **rthp;
312 unsigned long now = jiffies;
314 for (i=0; i<RT_HASH_DIVISOR/5; i++) {
315 unsigned tmo = ip_rt_gc_timeout;
317 rover = (rover + 1) & (RT_HASH_DIVISOR-1);
318 rthp = &rt_hash_table[rover];
320 write_lock(&rt_hash_lock);
321 while ((rth = *rthp) != NULL) {
322 if (rth->u.dst.expires) {
323 /* Entry is expired even if it is in use */
324 if ((long)(now - rth->u.dst.expires) <= 0) {
325 tmo >>= 1;
326 rthp = &rth->u.rt_next;
327 continue;
329 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
330 tmo >>= 1;
331 rthp = &rth->u.rt_next;
332 continue;
336 * Cleanup aged off entries.
338 *rthp = rth->u.rt_next;
339 rt_free(rth);
341 write_unlock(&rt_hash_lock);
343 /* Fallback loop breaker. */
344 if ((jiffies - now) > 0)
345 break;
347 rt_periodic_timer.expires = now + ip_rt_gc_interval;
348 add_timer(&rt_periodic_timer);
351 /* This can run from both BH and non-BH contexts, the latter
352 * in the case of a forced flush event.
354 static void rt_run_flush(unsigned long dummy)
356 int i;
357 struct rtable * rth, * next;
359 rt_deadline = 0;
361 for (i=0; i<RT_HASH_DIVISOR; i++) {
362 write_lock_bh(&rt_hash_lock);
363 rth = rt_hash_table[i];
364 if(rth != NULL)
365 rt_hash_table[i] = NULL;
366 write_unlock_bh(&rt_hash_lock);
368 for (; rth; rth=next) {
369 next = rth->u.rt_next;
370 rth->u.rt_next = NULL;
371 rt_free(rth);
376 static spinlock_t rt_flush_lock = SPIN_LOCK_UNLOCKED;
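/* rt_cache_flush(delay) in brief: delay < 0 means use ip_rt_min_delay,
 * delay == 0 flushes synchronously right here, and delay > 0 arms
 * rt_flush_timer, never pushing the flush past rt_deadline (at most
 * ip_rt_max_delay after the first pending request).  For example,
 * ip_rt_multicast_event() below simply calls rt_cache_flush(0).
 */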
378 void rt_cache_flush(int delay)
380 unsigned long now = jiffies;
381 int user_mode = !in_interrupt();
383 if (delay < 0)
384 delay = ip_rt_min_delay;
386 spin_lock_bh(&rt_flush_lock);
388 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
389 long tmo = (long)(rt_deadline - now);
391 /* If the flush timer is already running
392 and the flush request is not immediate (delay > 0):
394 if the deadline has not been reached, prolong the timer to "delay",
395 otherwise fire it at the deadline time.
398 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
399 tmo = 0;
401 if (delay > tmo)
402 delay = tmo;
405 if (delay <= 0) {
406 spin_unlock_bh(&rt_flush_lock);
407 rt_run_flush(0);
408 return;
411 if (rt_deadline == 0)
412 rt_deadline = now + ip_rt_max_delay;
414 rt_flush_timer.expires = now + delay;
415 add_timer(&rt_flush_timer);
416 spin_unlock_bh(&rt_flush_lock);
420 Short description of GC goals.
422 We want to build an algorithm which keeps the routing cache
423 at some equilibrium point, where the number of aged-off entries
424 is kept approximately equal to the number of newly generated ones.
426 The current expiration strength is the variable "expire".
427 We try to adjust it dynamically, so that when networking
428 is idle "expire" is large enough to keep enough warm entries,
429 and when load increases it shrinks to limit the cache size.
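Rough numbers (illustrative, assuming RT_HASH_DIVISOR is 256): with
ip_rt_gc_elasticity = 8, the cache is treated as dangerously large above
256*8 = 2048 entries and the collector turns aggressive, trying to free at
least half of the excess; below that it only trims back towards the
gc_thresh equilibrium.  rt_garbage_collect() also refuses to run more often
than ip_rt_gc_min_interval (5*HZ) unless the cache has already reached
ip_rt_max_size (256*16 = 4096 entries), beyond which new route allocations
can start to fail ("dst cache overflow").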
432 static int rt_garbage_collect(void)
434 static unsigned expire = RT_GC_TIMEOUT;
435 static unsigned long last_gc;
436 static int rover;
437 static int equilibrium;
438 struct rtable *rth, **rthp;
439 unsigned long now = jiffies;
440 int goal;
443 * Garbage collection is pretty expensive,
444 * so do not run it too frequently.
446 if (now - last_gc < ip_rt_gc_min_interval &&
447 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
448 return 0;
450 /* Calculate the number of entries we want to expire now. */
451 goal = atomic_read(&ipv4_dst_ops.entries) - RT_HASH_DIVISOR*ip_rt_gc_elasticity;
452 if (goal <= 0) {
453 if (equilibrium < ipv4_dst_ops.gc_thresh)
454 equilibrium = ipv4_dst_ops.gc_thresh;
455 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
456 if (goal > 0) {
457 equilibrium += min(goal/2, RT_HASH_DIVISOR);
458 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
460 } else {
461 /* We are in a dangerous area. Try to reduce the cache really
462 * aggressively.
464 goal = max(goal/2, RT_HASH_DIVISOR);
465 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
468 if (now - last_gc >= ip_rt_gc_min_interval)
469 last_gc = now;
471 if (goal <= 0) {
472 equilibrium += goal;
473 goto work_done;
476 do {
477 int i, k;
479 /* The write lock is held during the entire hash
480 * traversal to ensure consistent state of the rover.
482 write_lock_bh(&rt_hash_lock);
483 for (i=0, k=rover; i<RT_HASH_DIVISOR; i++) {
484 unsigned tmo = expire;
486 k = (k + 1) & (RT_HASH_DIVISOR-1);
487 rthp = &rt_hash_table[k];
488 while ((rth = *rthp) != NULL) {
489 if (!rt_may_expire(rth, tmo, expire)) {
490 tmo >>= 1;
491 rthp = &rth->u.rt_next;
492 continue;
494 *rthp = rth->u.rt_next;
495 rth->u.rt_next = NULL;
496 rt_free(rth);
497 goal--;
499 if (goal <= 0)
500 break;
502 rover = k;
503 write_unlock_bh(&rt_hash_lock);
505 if (goal <= 0)
506 goto work_done;
508 /* Goal is not achieved. We stop the process if:
510 - expire is reduced to zero; otherwise, expire is halved.
511 - the table is not full.
512 - we are called from interrupt context.
513 - the jiffies check is just a fallback/debug loop breaker.
514 We will not spin here for a long time in any case.
517 if (expire == 0)
518 break;
520 expire >>= 1;
521 #if RT_CACHE_DEBUG >= 2
522 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire, atomic_read(&ipv4_dst_ops.entries), goal, i);
523 #endif
525 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
526 return 0;
527 } while (!in_interrupt() && jiffies - now < 1);
529 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
530 return 0;
531 if (net_ratelimit())
532 printk("dst cache overflow\n");
533 return 1;
535 work_done:
536 expire += ip_rt_gc_min_interval;
537 if (expire > ip_rt_gc_timeout ||
538 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
539 expire = ip_rt_gc_timeout;
540 #if RT_CACHE_DEBUG >= 2
541 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire, atomic_read(&ipv4_dst_ops.entries), goal, rover);
542 #endif
543 return 0;
546 static int rt_intern_hash(unsigned hash, struct rtable * rt, struct rtable ** rp)
548 struct rtable *rth, **rthp;
549 unsigned long now = jiffies;
550 int attempts = !in_interrupt();
552 restart:
553 rthp = &rt_hash_table[hash];
555 write_lock_bh(&rt_hash_lock);
556 while ((rth = *rthp) != NULL) {
557 if (memcmp(&rth->key, &rt->key, sizeof(rt->key)) == 0) {
558 /* Put it first */
559 *rthp = rth->u.rt_next;
560 rth->u.rt_next = rt_hash_table[hash];
561 rt_hash_table[hash] = rth;
563 atomic_inc(&rth->u.dst.refcnt);
564 atomic_inc(&rth->u.dst.use);
565 rth->u.dst.lastuse = now;
566 write_unlock_bh(&rt_hash_lock);
568 rt_drop(rt);
569 *rp = rth;
570 return 0;
573 rthp = &rth->u.rt_next;
576 /* Try to bind the route to ARP only if it is an output
577 route or on the unicast forwarding path.
579 if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0) {
580 if (!arp_bind_neighbour(&rt->u.dst)) {
581 write_unlock_bh(&rt_hash_lock);
583 /* Neighbour tables are full and nothing
584 can be released. Try to shrink the route cache;
585 most likely it holds some neighbour records.
587 if (attempts-- > 0) {
588 int saved_elasticity = ip_rt_gc_elasticity;
589 int saved_int = ip_rt_gc_min_interval;
590 ip_rt_gc_elasticity = 1;
591 ip_rt_gc_min_interval = 0;
592 rt_garbage_collect();
593 ip_rt_gc_min_interval = saved_int;
594 ip_rt_gc_elasticity = saved_elasticity;
595 goto restart;
598 rt_drop(rt);
599 if (net_ratelimit())
600 printk("neighbour table overflow\n");
601 return -ENOBUFS;
605 rt->u.rt_next = rt_hash_table[hash];
606 #if RT_CACHE_DEBUG >= 2
607 if (rt->u.rt_next) {
608 struct rtable * trt;
609 printk("rt_cache @%02x: %08x", hash, rt->rt_dst);
610 for (trt=rt->u.rt_next; trt; trt=trt->u.rt_next)
611 printk(" . %08x", trt->rt_dst);
612 printk("\n");
614 #endif
615 rt_hash_table[hash] = rt;
616 write_unlock_bh(&rt_hash_lock);
617 *rp = rt;
618 return 0;
621 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
622 u32 saddr, u8 tos, struct device *dev)
624 int i, k;
625 struct in_device *in_dev = dev->ip_ptr;
626 struct rtable *rth, **rthp;
627 u32 skeys[2] = { saddr, 0 };
628 int ikeys[2] = { dev->ifindex, 0 };
630 tos &= IPTOS_TOS_MASK;
632 if (!in_dev)
633 return;
635 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
636 || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
637 goto reject_redirect;
639 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
640 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
641 goto reject_redirect;
642 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
643 goto reject_redirect;
644 } else {
645 if (inet_addr_type(new_gw) != RTN_UNICAST)
646 goto reject_redirect;
649 for (i=0; i<2; i++) {
650 for (k=0; k<2; k++) {
651 unsigned hash = rt_hash_code(daddr, skeys[i]^(ikeys[k]<<5), tos);
653 rthp=&rt_hash_table[hash];
655 write_lock_bh(&rt_hash_lock);
656 while ( (rth = *rthp) != NULL) {
657 struct rtable *rt;
659 if (rth->key.dst != daddr ||
660 rth->key.src != skeys[i] ||
661 rth->key.tos != tos ||
662 rth->key.oif != ikeys[k] ||
663 rth->key.iif != 0) {
664 rthp = &rth->u.rt_next;
665 continue;
668 if (rth->rt_dst != daddr ||
669 rth->rt_src != saddr ||
670 rth->u.dst.error ||
671 rth->rt_gateway != old_gw ||
672 rth->u.dst.dev != dev)
673 break;
675 dst_clone(&rth->u.dst);
677 rt = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
678 if (rt == NULL) {
679 ip_rt_put(rth);
680 write_unlock_bh(&rt_hash_lock);
681 return;
685 * Copy all the information.
687 *rt = *rth;
688 atomic_set(&rt->u.dst.refcnt, 1);
689 atomic_set(&rt->u.dst.use, 1);
690 rt->u.dst.lastuse = jiffies;
691 rt->u.dst.neighbour = NULL;
692 rt->u.dst.hh = NULL;
694 rt->rt_flags |= RTCF_REDIRECTED;
696 /* Gateway is different ... */
697 rt->rt_gateway = new_gw;
699 /* Redirect received -> path was valid */
700 dst_confirm(&rth->u.dst);
702 if (!arp_bind_neighbour(&rt->u.dst) ||
703 !(rt->u.dst.neighbour->nud_state&NUD_VALID)) {
704 if (rt->u.dst.neighbour)
705 neigh_event_send(rt->u.dst.neighbour, NULL);
706 ip_rt_put(rth);
707 rt_drop(rt);
708 break;
711 *rthp = rth->u.rt_next;
712 write_unlock_bh(&rt_hash_lock);
713 if (!rt_intern_hash(hash, rt, &rt))
714 ip_rt_put(rt);
715 rt_drop(rth);
716 goto do_next;
718 write_unlock_bh(&rt_hash_lock);
719 do_next:
723 return;
725 reject_redirect:
726 #ifdef CONFIG_IP_ROUTE_VERBOSE
727 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
728 printk(KERN_INFO "Redirect from %lX/%s to %lX ignored. "
729 "Path = %lX -> %lX, tos %02x\n",
730 ntohl(old_gw), dev->name, ntohl(new_gw),
731 ntohl(saddr), ntohl(daddr), tos);
732 #endif
735 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
737 struct rtable *rt = (struct rtable*)dst;
739 if (rt != NULL) {
740 if (dst->obsolete) {
741 ip_rt_put(rt);
742 return NULL;
744 if ((rt->rt_flags&RTCF_REDIRECTED) || rt->u.dst.expires) {
745 unsigned hash = rt_hash_code(rt->key.dst, rt->key.src^(rt->key.oif<<5), rt->key.tos);
746 struct rtable **rthp;
747 #if RT_CACHE_DEBUG >= 1
748 printk(KERN_DEBUG "ip_rt_advice: redirect to %d.%d.%d.%d/%02x dropped\n", NIPQUAD(rt->rt_dst), rt->key.tos);
749 #endif
750 ip_rt_put(rt);
751 write_lock_bh(&rt_hash_lock);
752 for (rthp = &rt_hash_table[hash]; *rthp; rthp = &(*rthp)->u.rt_next) {
753 if (*rthp == rt) {
754 *rthp = rt->u.rt_next;
755 rt_free(rt);
756 break;
759 write_unlock_bh(&rt_hash_lock);
760 return NULL;
763 return dst;
767 * Algorithm:
768 * 1. The first ip_rt_redirect_number redirects are sent
769 * with exponential backoff, then we stop sending them at all,
770 * assuming that the host ignores our redirects.
771 * 2. If we did not see packets requiring redirects
772 * during ip_rt_redirect_silence, we assume that the host
773 * forgot the redirected route, and we start sending redirects again.
775 * This algorithm is much cheaper and more intelligent than dumb load limiting
776 * in icmp.c.
778 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
779 * and "frag. need" (breaks PMTU discovery) in icmp.c.
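 * Worked numbers (illustrative, assuming HZ=100): with the defaults above,
 * ip_rt_redirect_load = HZ/50 = 2 jiffies, so successive redirects for the
 * same destination are spaced 2, 4, 8, ... jiffies apart (load << tokens),
 * up to ip_rt_redirect_number = 9 of them; after that we stay silent until
 * ip_rt_redirect_silence = (HZ/50) << 10 = 2048 jiffies (about 20 s) have
 * passed without seeing a packet that would need a redirect.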
782 void ip_rt_send_redirect(struct sk_buff *skb)
784 struct rtable *rt = (struct rtable*)skb->dst;
785 struct in_device *in_dev = (struct in_device*)rt->u.dst.dev->ip_ptr;
787 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev))
788 return;
790 /* No redirected packets during ip_rt_redirect_silence;
791 * reset the algorithm.
793 if (jiffies - rt->u.dst.rate_last > ip_rt_redirect_silence)
794 rt->u.dst.rate_tokens = 0;
796 /* Too many ignored redirects; do not send anything,
797 * just set u.dst.rate_last to the last seen redirected packet.
799 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
800 rt->u.dst.rate_last = jiffies;
801 return;
804 /* Check for load limit; set rate_last to the latest sent
805 * redirect.
807 if (jiffies - rt->u.dst.rate_last > (ip_rt_redirect_load<<rt->u.dst.rate_tokens)) {
808 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
809 rt->u.dst.rate_last = jiffies;
810 ++rt->u.dst.rate_tokens;
811 #ifdef CONFIG_IP_ROUTE_VERBOSE
812 if (IN_DEV_LOG_MARTIANS(in_dev) &&
813 rt->u.dst.rate_tokens == ip_rt_redirect_number && net_ratelimit())
814 printk(KERN_WARNING "host %08x/if%d ignores redirects for %08x to %08x.\n",
815 rt->rt_src, rt->rt_iif, rt->rt_dst, rt->rt_gateway);
816 #endif
820 static int ip_error(struct sk_buff *skb)
822 struct rtable *rt = (struct rtable*)skb->dst;
823 unsigned long now;
824 int code;
826 switch (rt->u.dst.error) {
827 case EINVAL:
828 default:
829 kfree_skb(skb);
830 return 0;
831 case EHOSTUNREACH:
832 code = ICMP_HOST_UNREACH;
833 break;
834 case ENETUNREACH:
835 code = ICMP_NET_UNREACH;
836 break;
837 case EACCES:
838 code = ICMP_PKT_FILTERED;
839 break;
842 now = jiffies;
843 if ((rt->u.dst.rate_tokens += (now - rt->u.dst.rate_last)) > ip_rt_error_burst)
844 rt->u.dst.rate_tokens = ip_rt_error_burst;
845 rt->u.dst.rate_last = now;
846 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
847 rt->u.dst.rate_tokens -= ip_rt_error_cost;
848 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
851 kfree_skb(skb);
852 return 0;
856 * The last two values are not from the RFC but
857 * are needed for AMPRnet AX.25 paths.
860 static unsigned short mtu_plateau[] =
861 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
863 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
865 int i;
867 for (i = 0; i < sizeof(mtu_plateau)/sizeof(mtu_plateau[0]); i++)
868 if (old_mtu > mtu_plateau[i])
869 return mtu_plateau[i];
870 return 68;
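/* Examples (illustrative): guess_mtu(1500) returns 1492, the next plateau
 * strictly below the old MTU; guess_mtu(600) returns 576; and anything at
 * or below 128 falls through to 68, the minimum MTU an IPv4 host must accept.
 */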
873 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
875 int i;
876 unsigned short old_mtu = ntohs(iph->tot_len);
877 struct rtable *rth;
878 u32 skeys[2] = { iph->saddr, 0, };
879 u32 daddr = iph->daddr;
880 u8 tos = iph->tos & IPTOS_TOS_MASK;
881 unsigned short est_mtu = 0;
883 if (ipv4_config.no_pmtu_disc)
884 return 0;
886 for (i=0; i<2; i++) {
887 unsigned hash = rt_hash_code(daddr, skeys[i], tos);
889 read_lock_bh(&rt_hash_lock);
890 for (rth = rt_hash_table[hash]; rth; rth = rth->u.rt_next) {
891 if (rth->key.dst == daddr &&
892 rth->key.src == skeys[i] &&
893 rth->rt_dst == daddr &&
894 rth->rt_src == iph->saddr &&
895 rth->key.tos == tos &&
896 rth->key.iif == 0 &&
897 !(rth->u.dst.mxlock&(1<<RTAX_MTU))) {
898 unsigned short mtu = new_mtu;
900 if (new_mtu < 68 || new_mtu >= old_mtu) {
902 /* BSD 4.2 compatibility hack :-( */
903 if (mtu == 0 && old_mtu >= rth->u.dst.pmtu &&
904 old_mtu >= 68 + (iph->ihl<<2))
905 old_mtu -= iph->ihl<<2;
907 mtu = guess_mtu(old_mtu);
909 if (mtu <= rth->u.dst.pmtu) {
910 if (mtu < rth->u.dst.pmtu) {
911 dst_confirm(&rth->u.dst);
912 rth->u.dst.pmtu = mtu;
913 dst_set_expires(&rth->u.dst, ip_rt_mtu_expires);
915 est_mtu = mtu;
919 read_unlock_bh(&rt_hash_lock);
921 return est_mtu ? : new_mtu;
924 void ip_rt_update_pmtu(struct dst_entry *dst, unsigned mtu)
926 if (dst->pmtu > mtu && mtu >= 68 &&
927 !(dst->mxlock&(1<<RTAX_MTU))) {
928 dst->pmtu = mtu;
929 dst_set_expires(dst, ip_rt_mtu_expires);
933 static struct dst_entry * ipv4_dst_check(struct dst_entry * dst, u32 cookie)
935 dst_release(dst);
936 return NULL;
939 static struct dst_entry * ipv4_dst_reroute(struct dst_entry * dst,
940 struct sk_buff *skb)
942 return NULL;
945 static void ipv4_link_failure(struct sk_buff *skb)
947 struct rtable *rt;
949 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
951 rt = (struct rtable *) skb->dst;
952 if (rt)
953 dst_set_expires(&rt->u.dst, 0);
956 static int ip_rt_bug(struct sk_buff *skb)
958 printk(KERN_DEBUG "ip_rt_bug: %08x -> %08x, %s\n", skb->nh.iph->saddr,
959 skb->nh.iph->daddr, skb->dev ? skb->dev->name : "?");
960 kfree_skb(skb);
961 return 0;
965 We do not cache the source address of the outgoing interface,
966 because it is used only by the IP RR, TS and SRR options,
967 so it is out of the fast path.
969 BTW remember: "addr" is allowed to be unaligned
970 in IP options!
973 void ip_rt_get_source(u8 *addr, struct rtable *rt)
975 u32 src;
976 struct fib_result res;
978 if (rt->key.iif == 0)
979 src = rt->rt_src;
980 else if (fib_lookup(&rt->key, &res) == 0)
981 src = FIB_RES_PREFSRC(res);
982 else
983 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE);
984 memcpy(addr, &src, 4);
987 #ifdef CONFIG_NET_CLS_ROUTE
988 static void set_class_tag(struct rtable *rt, u32 tag)
990 if (!(rt->u.dst.tclassid&0xFFFF))
991 rt->u.dst.tclassid |= tag&0xFFFF;
992 if (!(rt->u.dst.tclassid&0xFFFF0000))
993 rt->u.dst.tclassid |= tag&0xFFFF0000;
995 #endif
997 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
999 struct fib_info *fi = res->fi;
1001 if (fi) {
1002 if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1003 rt->rt_gateway = FIB_RES_GW(*res);
1004 rt->u.dst.mxlock = fi->fib_metrics[RTAX_LOCK-1];
1005 rt->u.dst.pmtu = fi->fib_mtu;
1006 if (fi->fib_mtu == 0) {
1007 rt->u.dst.pmtu = rt->u.dst.dev->mtu;
1008 if (rt->u.dst.pmtu > IP_MAX_MTU)
1009 rt->u.dst.pmtu = IP_MAX_MTU;
1010 if (rt->u.dst.mxlock&(1<<RTAX_MTU) &&
1011 rt->rt_gateway != rt->rt_dst &&
1012 rt->u.dst.pmtu > 576)
1013 rt->u.dst.pmtu = 576;
1015 rt->u.dst.window= fi->fib_window ? : 0;
1016 rt->u.dst.rtt = fi->fib_rtt ? : TCP_TIMEOUT_INIT;
1017 #ifdef CONFIG_NET_CLS_ROUTE
1018 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1019 #endif
1020 } else {
1021 rt->u.dst.pmtu = rt->u.dst.dev->mtu;
1022 if (rt->u.dst.pmtu > IP_MAX_MTU)
1023 rt->u.dst.pmtu = IP_MAX_MTU;
1024 rt->u.dst.window= 0;
1025 rt->u.dst.rtt = TCP_TIMEOUT_INIT;
1027 #ifdef CONFIG_NET_CLS_ROUTE
1028 #ifdef CONFIG_IP_MULTIPLE_TABLES
1029 set_class_tag(rt, fib_rules_tclass(res));
1030 #endif
1031 set_class_tag(rt, itag);
1032 #endif
1033 rt->rt_type = res->type;
1036 static int
1037 ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1038 u8 tos, struct device *dev, int our)
1040 unsigned hash;
1041 struct rtable *rth;
1042 u32 spec_dst;
1043 struct in_device *in_dev = dev->ip_ptr;
1044 u32 itag = 0;
1046 /* Primary sanity checks. */
1048 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1049 in_dev == NULL || skb->protocol != __constant_htons(ETH_P_IP))
1050 return -EINVAL;
1052 if (ZERONET(saddr)) {
1053 if (!LOCAL_MCAST(daddr))
1054 return -EINVAL;
1055 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1056 } else if (fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag) < 0)
1057 return -EINVAL;
1059 rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
1060 if (!rth)
1061 return -ENOBUFS;
1063 rth->u.dst.output= ip_rt_bug;
1065 atomic_set(&rth->u.dst.use, 1);
1066 rth->key.dst = daddr;
1067 rth->rt_dst = daddr;
1068 rth->key.tos = tos;
1069 #ifdef CONFIG_IP_ROUTE_FWMARK
1070 rth->key.fwmark = skb->fwmark;
1071 #endif
1072 rth->key.src = saddr;
1073 rth->rt_src = saddr;
1074 #ifdef CONFIG_IP_ROUTE_NAT
1075 rth->rt_dst_map = daddr;
1076 rth->rt_src_map = saddr;
1077 #endif
1078 #ifdef CONFIG_NET_CLS_ROUTE
1079 rth->u.dst.tclassid = itag;
1080 #endif
1081 rth->rt_iif =
1082 rth->key.iif = dev->ifindex;
1083 rth->u.dst.dev = &loopback_dev;
1084 rth->key.oif = 0;
1085 rth->rt_gateway = daddr;
1086 rth->rt_spec_dst= spec_dst;
1087 rth->rt_type = RTN_MULTICAST;
1088 rth->rt_flags = RTCF_MULTICAST;
1089 if (our) {
1090 rth->u.dst.input= ip_local_deliver;
1091 rth->rt_flags |= RTCF_LOCAL;
1094 #ifdef CONFIG_IP_MROUTE
1095 if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1096 rth->u.dst.input = ip_mr_input;
1097 #endif
1099 hash = rt_hash_code(daddr, saddr^(dev->ifindex<<5), tos);
1100 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1104 * NOTE. We drop all packets that have local source
1105 * addresses, because every properly looped-back packet
1106 * must already have the correct destination attached by the output routine.
1108 * This approach solves two big problems:
1109 * 1. Non-simplex devices are handled properly.
1110 * 2. IP spoofing attempts are filtered with a 100% guarantee.
1113 int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1114 u8 tos, struct device *dev)
1116 struct rt_key key;
1117 struct fib_result res;
1118 struct in_device *in_dev = dev->ip_ptr;
1119 struct in_device *out_dev;
1120 unsigned flags = 0;
1121 u32 itag = 0;
1122 struct rtable * rth;
1123 unsigned hash;
1124 u32 spec_dst;
1125 int err = -EINVAL;
1128 * IP on this device is disabled.
1131 if (!in_dev)
1132 return -EINVAL;
1134 key.dst = daddr;
1135 key.src = saddr;
1136 key.tos = tos;
1137 #ifdef CONFIG_IP_ROUTE_FWMARK
1138 key.fwmark = skb->fwmark;
1139 #endif
1140 key.iif = dev->ifindex;
1141 key.oif = 0;
1142 key.scope = RT_SCOPE_UNIVERSE;
1144 hash = rt_hash_code(daddr, saddr^(key.iif<<5), tos);
1146 /* Check for the most weird martians, which cannot be detected
1147 by fib_lookup.
1150 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1151 goto martian_source;
1153 if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1154 goto brd_input;
1156 /* Accept zero addresses only for limited broadcast;
1157 * I do not even know whether to fix this or not. Waiting for complaints :-)
1159 if (ZERONET(saddr))
1160 goto martian_source;
1162 if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1163 goto martian_destination;
1166 * Now we are ready to route packet.
1168 if ((err = fib_lookup(&key, &res))) {
1169 if (!IN_DEV_FORWARD(in_dev))
1170 return -EINVAL;
1171 goto no_route;
1174 #ifdef CONFIG_IP_ROUTE_NAT
1175 /* Policy is applied before mapping the destination,
1176 but rerouting after the mapping must be done with the old source.
1179 if (1) {
1180 u32 src_map = saddr;
1181 if (res.r)
1182 src_map = fib_rules_policy(saddr, &res, &flags);
1184 if (res.type == RTN_NAT) {
1185 key.dst = fib_rules_map_destination(daddr, &res);
1186 if (fib_lookup(&key, &res) || res.type != RTN_UNICAST)
1187 return -EINVAL;
1188 flags |= RTCF_DNAT;
1190 key.src = src_map;
1192 #endif
1194 if (res.type == RTN_BROADCAST)
1195 goto brd_input;
1197 if (res.type == RTN_LOCAL) {
1198 int result;
1199 result = fib_validate_source(saddr, daddr, tos, loopback_dev.ifindex,
1200 dev, &spec_dst, &itag);
1201 if (result < 0)
1202 goto martian_source;
1203 if (result)
1204 flags |= RTCF_DIRECTSRC;
1205 spec_dst = daddr;
1206 goto local_input;
1209 if (!IN_DEV_FORWARD(in_dev))
1210 return -EINVAL;
1211 if (res.type != RTN_UNICAST)
1212 goto martian_destination;
1214 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1215 if (res.fi->fib_nhs > 1 && key.oif == 0)
1216 fib_select_multipath(&key, &res);
1217 #endif
1218 out_dev = FIB_RES_DEV(res)->ip_ptr;
1219 if (out_dev == NULL) {
1220 if (net_ratelimit())
1221 printk(KERN_CRIT "Bug in ip_route_input_slow(). Please, report\n");
1222 return -EINVAL;
1225 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev, &spec_dst, &itag);
1226 if (err < 0)
1227 goto martian_source;
1229 if (err)
1230 flags |= RTCF_DIRECTSRC;
1232 if (out_dev == in_dev && err && !(flags&(RTCF_NAT|RTCF_MASQ)) &&
1233 (IN_DEV_SHARED_MEDIA(out_dev)
1234 || inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res))))
1235 flags |= RTCF_DOREDIRECT;
1237 if (skb->protocol != __constant_htons(ETH_P_IP)) {
1238 /* Not IP (i.e. ARP). Do not create route, if it is
1239 * invalid for proxy arp. DNAT routes are always valid.
1241 if (out_dev == in_dev && !(flags&RTCF_DNAT))
1242 return -EINVAL;
1245 rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
1246 if (!rth)
1247 return -ENOBUFS;
1249 atomic_set(&rth->u.dst.use, 1);
1250 rth->key.dst = daddr;
1251 rth->rt_dst = daddr;
1252 rth->key.tos = tos;
1253 #ifdef CONFIG_IP_ROUTE_FWMARK
1254 rth->key.fwmark = skb->fwmark;
1255 #endif
1256 rth->key.src = saddr;
1257 rth->rt_src = saddr;
1258 rth->rt_gateway = daddr;
1259 #ifdef CONFIG_IP_ROUTE_NAT
1260 rth->rt_src_map = key.src;
1261 rth->rt_dst_map = key.dst;
1262 if (flags&RTCF_DNAT)
1263 rth->rt_gateway = key.dst;
1264 #endif
1265 rth->rt_iif =
1266 rth->key.iif = dev->ifindex;
1267 rth->u.dst.dev = out_dev->dev;
1268 rth->key.oif = 0;
1269 rth->rt_spec_dst= spec_dst;
1271 rth->u.dst.input = ip_forward;
1272 rth->u.dst.output = ip_output;
1274 rt_set_nexthop(rth, &res, itag);
1276 rth->rt_flags = flags;
1278 #ifdef CONFIG_NET_FASTROUTE
1279 if (netdev_fastroute && !(flags&(RTCF_NAT|RTCF_MASQ|RTCF_DOREDIRECT))) {
1280 struct device *odev = rth->u.dst.dev;
1281 if (odev != dev &&
1282 dev->accept_fastpath &&
1283 odev->mtu >= dev->mtu &&
1284 dev->accept_fastpath(dev, &rth->u.dst) == 0)
1285 rth->rt_flags |= RTCF_FAST;
1287 #endif
1289 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1291 brd_input:
1292 if (skb->protocol != __constant_htons(ETH_P_IP))
1293 return -EINVAL;
1295 if (ZERONET(saddr)) {
1296 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1297 } else {
1298 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag);
1299 if (err < 0)
1300 goto martian_source;
1301 if (err)
1302 flags |= RTCF_DIRECTSRC;
1304 flags |= RTCF_BROADCAST;
1305 res.type = RTN_BROADCAST;
1307 local_input:
1308 rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
1309 if (!rth)
1310 return -ENOBUFS;
1312 rth->u.dst.output= ip_rt_bug;
1314 atomic_set(&rth->u.dst.use, 1);
1315 rth->key.dst = daddr;
1316 rth->rt_dst = daddr;
1317 rth->key.tos = tos;
1318 #ifdef CONFIG_IP_ROUTE_FWMARK
1319 rth->key.fwmark = skb->fwmark;
1320 #endif
1321 rth->key.src = saddr;
1322 rth->rt_src = saddr;
1323 #ifdef CONFIG_IP_ROUTE_NAT
1324 rth->rt_dst_map = key.dst;
1325 rth->rt_src_map = key.src;
1326 #endif
1327 #ifdef CONFIG_NET_CLS_ROUTE
1328 rth->u.dst.tclassid = itag;
1329 #endif
1330 rth->rt_iif =
1331 rth->key.iif = dev->ifindex;
1332 rth->u.dst.dev = &loopback_dev;
1333 rth->key.oif = 0;
1334 rth->rt_gateway = daddr;
1335 rth->rt_spec_dst= spec_dst;
1336 rth->u.dst.input= ip_local_deliver;
1337 rth->rt_flags = flags|RTCF_LOCAL;
1338 if (res.type == RTN_UNREACHABLE) {
1339 rth->u.dst.input= ip_error;
1340 rth->u.dst.error= -err;
1341 rth->rt_flags &= ~RTCF_LOCAL;
1343 rth->rt_type = res.type;
1344 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1346 no_route:
1347 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1348 res.type = RTN_UNREACHABLE;
1349 goto local_input;
1352 * Do not cache martian addresses: they should be logged (RFC1812)
1354 martian_destination:
1355 #ifdef CONFIG_IP_ROUTE_VERBOSE
1356 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1357 printk(KERN_WARNING "martian destination %08x from %08x, dev %s\n", daddr, saddr, dev->name);
1358 #endif
1359 return -EINVAL;
1361 martian_source:
1362 #ifdef CONFIG_IP_ROUTE_VERBOSE
1363 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1365 * RFC1812 recommendation: if the source is martian,
1366 * the only hint is the MAC header.
1368 printk(KERN_WARNING "martian source %08x for %08x, dev %s\n", saddr, daddr, dev->name);
1369 if (dev->hard_header_len) {
1370 int i;
1371 unsigned char *p = skb->mac.raw;
1372 printk(KERN_WARNING "ll header:");
1373 for (i=0; i<dev->hard_header_len; i++, p++)
1374 printk(" %02x", *p);
1375 printk("\n");
1378 #endif
1379 return -EINVAL;
1382 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
1383 u8 tos, struct device *dev)
1385 struct rtable * rth;
1386 unsigned hash;
1387 int iif = dev->ifindex;
1389 tos &= IPTOS_TOS_MASK;
1390 hash = rt_hash_code(daddr, saddr^(iif<<5), tos);
1392 read_lock_bh(&rt_hash_lock);
1393 for (rth=rt_hash_table[hash]; rth; rth=rth->u.rt_next) {
1394 if (rth->key.dst == daddr &&
1395 rth->key.src == saddr &&
1396 rth->key.iif == iif &&
1397 rth->key.oif == 0 &&
1398 #ifdef CONFIG_IP_ROUTE_FWMARK
1399 rth->key.fwmark == skb->fwmark &&
1400 #endif
1401 rth->key.tos == tos) {
1402 rth->u.dst.lastuse = jiffies;
1403 atomic_inc(&rth->u.dst.use);
1404 atomic_inc(&rth->u.dst.refcnt);
1405 read_unlock_bh(&rt_hash_lock);
1406 skb->dst = (struct dst_entry*)rth;
1407 return 0;
1410 read_unlock_bh(&rt_hash_lock);
1412 /* Multicast recognition logic was moved from the route cache to here.
1413 The problem was that too many Ethernet cards have broken/missing
1414 hardware multicast filters :-( As a result, a host on a multicast
1415 network acquires a lot of useless route cache entries, e.g. from
1416 SDR messages from all over the world. Now we try to get rid of them.
1417 Really, provided the software IP multicast filter is organized
1418 reasonably (at least, hashed), it does not result in a slowdown
1419 compared with route cache reject entries.
1420 Note that multicast routers are not affected, because a
1421 route cache entry is created eventually.
1423 if (MULTICAST(daddr)) {
1424 int our = ip_check_mc(dev, daddr);
1425 if (!our
1426 #ifdef CONFIG_IP_MROUTE
1427 && (LOCAL_MCAST(daddr) || !dev->ip_ptr ||
1428 !IN_DEV_MFORWARD((struct in_device*)dev->ip_ptr))
1429 #endif
1430 ) return -EINVAL;
1431 return ip_route_input_mc(skb, daddr, saddr, tos, dev, our);
1433 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
1437 * Major route resolver routine.
1440 int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif)
1442 struct rt_key key;
1443 struct fib_result res;
1444 unsigned flags = 0;
1445 struct rtable *rth;
1446 struct device *dev_out = NULL;
1447 unsigned hash;
1448 #ifdef CONFIG_IP_TRANSPARENT_PROXY
1449 u32 nochecksrc = (tos & RTO_TPROXY);
1450 #endif
1452 tos &= IPTOS_TOS_MASK|RTO_ONLINK;
1453 key.dst = daddr;
1454 key.src = saddr;
1455 key.tos = tos&IPTOS_TOS_MASK;
1456 key.iif = loopback_dev.ifindex;
1457 key.oif = oif;
1458 key.scope = (tos&RTO_ONLINK) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
1459 res.fi = NULL;
1460 #ifdef CONFIG_IP_MULTIPLE_TABLES
1461 res.r = NULL;
1462 #endif
1464 if (saddr) {
1465 if (MULTICAST(saddr) || BADCLASS(saddr) || ZERONET(saddr))
1466 return -EINVAL;
1468 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1469 dev_out = ip_dev_find(saddr);
1470 #ifdef CONFIG_IP_TRANSPARENT_PROXY
1471 /* If address is not local, test for transparent proxy flag;
1472 if address is local --- clear the flag.
1474 if (dev_out == NULL) {
1475 if (nochecksrc == 0 || inet_addr_type(saddr) != RTN_UNICAST)
1476 return -EINVAL;
1477 flags |= RTCF_TPROXY;
1479 #else
1480 if (dev_out == NULL)
1481 return -EINVAL;
1482 #endif
1484 /* I removed the check for oif == dev_out->oif here.
1485 It was wrong for two reasons:
1486 1. ip_dev_find(saddr) can return the wrong iface, if saddr is
1487 assigned to multiple interfaces.
1488 2. Moreover, we are allowed to send packets with saddr
1489 of another iface. --ANK
1492 if (oif == 0 &&
1493 #ifdef CONFIG_IP_TRANSPARENT_PROXY
1494 dev_out &&
1495 #endif
1496 (MULTICAST(daddr) || daddr == 0xFFFFFFFF)) {
1497 /* Special hack: the user can direct multicasts
1498 and limited broadcast via the necessary interface
1499 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1500 This hack is not just for fun, it allows
1501 vic, vat and friends to work.
1502 They bind the socket to loopback, set ttl to zero
1503 and expect that it will work.
1504 From the viewpoint of the routing cache they are broken,
1505 because we are not allowed to build a multicast path
1506 with a loopback source addr (look, the routing cache
1507 cannot know that ttl is zero, so the packet
1508 will not leave this host and the route is valid).
1509 Luckily, this hack is a good workaround.
1512 key.oif = dev_out->ifindex;
1513 goto make_route;
1515 dev_out = NULL;
1517 if (oif) {
1518 dev_out = dev_get_by_index(oif);
1519 if (dev_out == NULL)
1520 return -ENODEV;
1521 if (dev_out->ip_ptr == NULL)
1522 return -ENODEV; /* Wrong error code */
1524 if (LOCAL_MCAST(daddr) || daddr == 0xFFFFFFFF) {
1525 if (!key.src)
1526 key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK);
1527 goto make_route;
1529 if (!key.src) {
1530 if (MULTICAST(daddr))
1531 key.src = inet_select_addr(dev_out, 0, key.scope);
1532 else if (!daddr)
1533 key.src = inet_select_addr(dev_out, 0, RT_SCOPE_HOST);
1537 if (!key.dst) {
1538 key.dst = key.src;
1539 if (!key.dst)
1540 key.dst = key.src = htonl(INADDR_LOOPBACK);
1541 dev_out = &loopback_dev;
1542 key.oif = loopback_dev.ifindex;
1543 res.type = RTN_LOCAL;
1544 flags |= RTCF_LOCAL;
1545 goto make_route;
1548 if (fib_lookup(&key, &res)) {
1549 res.fi = NULL;
1550 if (oif) {
1551 /* Apparently, the routing tables are wrong. Assume
1552 that the destination is on-link.
1554 WHY? DW.
1555 Because we are allowed to send to an iface
1556 even if it has NO routes and NO assigned
1557 addresses. When oif is specified, routing
1558 tables are looked up with only one purpose:
1559 to catch whether the destination is gatewayed, rather than
1560 direct. Moreover, if MSG_DONTROUTE is set,
1561 we send the packet, ignoring both routing tables
1562 and ifaddr state. --ANK
1565 We could make it even if oif is unknown,
1566 likely IPv6, but we do not.
1569 if (key.src == 0)
1570 key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK);
1571 res.type = RTN_UNICAST;
1572 goto make_route;
1574 return -ENETUNREACH;
1577 if (res.type == RTN_NAT)
1578 return -EINVAL;
1580 if (res.type == RTN_LOCAL) {
1581 if (!key.src)
1582 key.src = key.dst;
1583 dev_out = &loopback_dev;
1584 key.oif = dev_out->ifindex;
1585 res.fi = NULL;
1586 flags |= RTCF_LOCAL;
1587 goto make_route;
1590 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1591 if (res.fi->fib_nhs > 1 && key.oif == 0)
1592 fib_select_multipath(&key, &res);
1593 else
1594 #endif
1595 if (res.prefixlen==0 && res.type == RTN_UNICAST && key.oif == 0)
1596 fib_select_default(&key, &res);
1598 if (!key.src)
1599 key.src = FIB_RES_PREFSRC(res);
1601 dev_out = FIB_RES_DEV(res);
1602 key.oif = dev_out->ifindex;
1604 make_route:
1605 if (LOOPBACK(key.src) && !(dev_out->flags&IFF_LOOPBACK))
1606 return -EINVAL;
1608 if (key.dst == 0xFFFFFFFF)
1609 res.type = RTN_BROADCAST;
1610 else if (MULTICAST(key.dst))
1611 res.type = RTN_MULTICAST;
1612 else if (BADCLASS(key.dst) || ZERONET(key.dst))
1613 return -EINVAL;
1615 if (dev_out->flags&IFF_LOOPBACK)
1616 flags |= RTCF_LOCAL;
1618 if (res.type == RTN_BROADCAST) {
1619 flags |= RTCF_BROADCAST|RTCF_LOCAL;
1620 res.fi = NULL;
1621 } else if (res.type == RTN_MULTICAST) {
1622 flags |= RTCF_MULTICAST|RTCF_LOCAL;
1623 if (!ip_check_mc(dev_out, daddr))
1624 flags &= ~RTCF_LOCAL;
1625 /* If a multicast route does not exist, use the
1626 default one, but do not gateway in this case.
1627 Yes, it is a hack.
1629 if (res.fi && res.prefixlen < 4)
1630 res.fi = NULL;
1633 rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
1634 if (!rth)
1635 return -ENOBUFS;
1637 atomic_set(&rth->u.dst.use, 1);
1638 rth->key.dst = daddr;
1639 rth->key.tos = tos;
1640 rth->key.src = saddr;
1641 rth->key.iif = 0;
1642 rth->key.oif = oif;
1643 rth->rt_dst = key.dst;
1644 rth->rt_src = key.src;
1645 #ifdef CONFIG_IP_ROUTE_NAT
1646 rth->rt_dst_map = key.dst;
1647 rth->rt_src_map = key.src;
1648 #endif
1649 rth->rt_iif = oif ? : dev_out->ifindex;
1650 rth->u.dst.dev = dev_out;
1651 rth->rt_gateway = key.dst;
1652 rth->rt_spec_dst= key.src;
1654 rth->u.dst.output=ip_output;
1656 if (flags&RTCF_LOCAL) {
1657 rth->u.dst.input = ip_local_deliver;
1658 rth->rt_spec_dst = key.dst;
1660 if (flags&(RTCF_BROADCAST|RTCF_MULTICAST)) {
1661 rth->rt_spec_dst = key.src;
1662 if (flags&RTCF_LOCAL && !(dev_out->flags&IFF_LOOPBACK))
1663 rth->u.dst.output = ip_mc_output;
1664 #ifdef CONFIG_IP_MROUTE
1665 if (res.type == RTN_MULTICAST && dev_out->ip_ptr) {
1666 struct in_device *in_dev = dev_out->ip_ptr;
1667 if (IN_DEV_MFORWARD(in_dev) && !LOCAL_MCAST(daddr)) {
1668 rth->u.dst.input = ip_mr_input;
1669 rth->u.dst.output = ip_mc_output;
1672 #endif
1675 rt_set_nexthop(rth, &res, 0);
1677 rth->rt_flags = flags;
1679 hash = rt_hash_code(daddr, saddr^(oif<<5), tos);
1680 return rt_intern_hash(hash, rth, rp);
1683 int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif)
1685 unsigned hash;
1686 struct rtable *rth;
1688 hash = rt_hash_code(daddr, saddr^(oif<<5), tos);
1690 read_lock_bh(&rt_hash_lock);
1691 for (rth=rt_hash_table[hash]; rth; rth=rth->u.rt_next) {
1692 if (rth->key.dst == daddr &&
1693 rth->key.src == saddr &&
1694 rth->key.iif == 0 &&
1695 rth->key.oif == oif &&
1696 #ifndef CONFIG_IP_TRANSPARENT_PROXY
1697 rth->key.tos == tos
1698 #else
1699 !((rth->key.tos^tos)&(IPTOS_TOS_MASK|RTO_ONLINK)) &&
1700 ((tos&RTO_TPROXY) || !(rth->rt_flags&RTCF_TPROXY))
1701 #endif
1703 rth->u.dst.lastuse = jiffies;
1704 atomic_inc(&rth->u.dst.use);
1705 atomic_inc(&rth->u.dst.refcnt);
1706 read_unlock_bh(&rt_hash_lock);
1707 *rp = rth;
1708 return 0;
1711 read_unlock_bh(&rt_hash_lock);
1713 return ip_route_output_slow(rp, daddr, saddr, tos, oif);
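/*
 * Minimal caller sketch (illustrative only, hence disabled): the usual
 * pattern for resolving and releasing an output route.  Assumes RT_TOS()
 * and ip_rt_put() from <net/route.h>.
 */
#if 0
static int example_output_route(u32 daddr, u32 saddr, u8 tos, int oif)
{
	struct rtable *rt;
	int err = ip_route_output(&rt, daddr, saddr, RT_TOS(tos), oif);

	if (err)
		return err;		/* e.g. -ENETUNREACH */
	/* ... transmit via rt->u.dst.output() ... */
	ip_rt_put(rt);			/* drop the reference taken by the lookup */
	return 0;
}
#endif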
1716 #ifdef CONFIG_RTNETLINK
1718 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int nowait)
1720 struct rtable *rt = (struct rtable*)skb->dst;
1721 struct rtmsg *r;
1722 struct nlmsghdr *nlh;
1723 unsigned char *b = skb->tail;
1724 struct rta_cacheinfo ci;
1725 #ifdef CONFIG_IP_MROUTE
1726 struct rtattr *eptr;
1727 #endif
1728 struct rtattr *mx;
1730 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
1731 r = NLMSG_DATA(nlh);
1732 nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
1733 r->rtm_family = AF_INET;
1734 r->rtm_dst_len = 32;
1735 r->rtm_src_len = 0;
1736 r->rtm_tos = rt->key.tos;
1737 r->rtm_table = RT_TABLE_MAIN;
1738 r->rtm_type = rt->rt_type;
1739 r->rtm_scope = RT_SCOPE_UNIVERSE;
1740 r->rtm_protocol = RTPROT_UNSPEC;
1741 r->rtm_flags = (rt->rt_flags&~0xFFFF) | RTM_F_CLONED;
1742 if (rt->rt_flags & RTCF_NOTIFY)
1743 r->rtm_flags |= RTM_F_NOTIFY;
1744 RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
1745 if (rt->key.src) {
1746 r->rtm_src_len = 32;
1747 RTA_PUT(skb, RTA_SRC, 4, &rt->key.src);
1749 if (rt->u.dst.dev)
1750 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
1751 #ifdef CONFIG_NET_CLS_ROUTE
1752 if (rt->u.dst.tclassid)
1753 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
1754 #endif
1755 if (rt->key.iif)
1756 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
1757 else if (rt->rt_src != rt->key.src)
1758 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
1759 if (rt->rt_dst != rt->rt_gateway)
1760 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
1761 mx = (struct rtattr*)skb->tail;
1762 RTA_PUT(skb, RTA_METRICS, 0, NULL);
1763 if (rt->u.dst.mxlock)
1764 RTA_PUT(skb, RTAX_LOCK, sizeof(unsigned), &rt->u.dst.mxlock);
1765 if (rt->u.dst.pmtu)
1766 RTA_PUT(skb, RTAX_MTU, sizeof(unsigned), &rt->u.dst.pmtu);
1767 if (rt->u.dst.window)
1768 RTA_PUT(skb, RTAX_WINDOW, sizeof(unsigned), &rt->u.dst.window);
1769 if (rt->u.dst.rtt)
1770 RTA_PUT(skb, RTAX_RTT, sizeof(unsigned), &rt->u.dst.rtt);
1771 mx->rta_len = skb->tail - (u8*)mx;
1772 if (mx->rta_len == RTA_LENGTH(0))
1773 skb_trim(skb, (u8*)mx - skb->data);
1774 ci.rta_lastuse = jiffies - rt->u.dst.lastuse;
1775 ci.rta_used = atomic_read(&rt->u.dst.refcnt);
1776 ci.rta_clntref = atomic_read(&rt->u.dst.use);
1777 if (rt->u.dst.expires)
1778 ci.rta_expires = rt->u.dst.expires - jiffies;
1779 else
1780 ci.rta_expires = 0;
1781 ci.rta_error = rt->u.dst.error;
1782 #ifdef CONFIG_IP_MROUTE
1783 eptr = (struct rtattr*)skb->tail;
1784 #endif
1785 RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
1786 if (rt->key.iif) {
1787 #ifdef CONFIG_IP_MROUTE
1788 u32 dst = rt->rt_dst;
1790 if (MULTICAST(dst) && !LOCAL_MCAST(dst) && ipv4_devconf.mc_forwarding) {
1791 int err = ipmr_get_route(skb, r, nowait);
1792 if (err <= 0) {
1793 if (!nowait) {
1794 if (err == 0)
1795 return 0;
1796 goto nlmsg_failure;
1797 } else {
1798 if (err == -EMSGSIZE)
1799 goto nlmsg_failure;
1800 ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
1803 } else
1804 #endif
1806 RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->key.iif);
1810 nlh->nlmsg_len = skb->tail - b;
1811 return skb->len;
1813 nlmsg_failure:
1814 rtattr_failure:
1815 skb_trim(skb, b - skb->data);
1816 return -1;
1819 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1821 struct rtattr **rta = arg;
1822 struct rtmsg *rtm = NLMSG_DATA(nlh);
1823 struct rtable *rt = NULL;
1824 u32 dst = 0;
1825 u32 src = 0;
1826 int iif = 0;
1827 int err;
1828 struct sk_buff *skb;
1830 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1831 if (skb == NULL)
1832 return -ENOBUFS;
1834 /* Reserve room for dummy headers; this skb can pass
1835 through a good chunk of the routing engine.
1837 skb->mac.raw = skb->data;
1838 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
1840 if (rta[RTA_SRC-1])
1841 memcpy(&src, RTA_DATA(rta[RTA_SRC-1]), 4);
1842 if (rta[RTA_DST-1])
1843 memcpy(&dst, RTA_DATA(rta[RTA_DST-1]), 4);
1844 if (rta[RTA_IIF-1])
1845 memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
1847 if (iif) {
1848 struct device *dev;
1849 dev = dev_get_by_index(iif);
1850 if (!dev)
1851 return -ENODEV;
1852 skb->protocol = __constant_htons(ETH_P_IP);
1853 skb->dev = dev;
1854 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
1855 rt = (struct rtable*)skb->dst;
1856 if (!err && rt->u.dst.error)
1857 err = -rt->u.dst.error;
1858 } else {
1859 int oif = 0;
1860 if (rta[RTA_OIF-1])
1861 memcpy(&oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1862 err = ip_route_output(&rt, dst, src, rtm->rtm_tos, oif);
1864 if (err) {
1865 kfree_skb(skb);
1866 return err;
1869 skb->dst = &rt->u.dst;
1870 if (rtm->rtm_flags & RTM_F_NOTIFY)
1871 rt->rt_flags |= RTCF_NOTIFY;
1873 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
1875 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, RTM_NEWROUTE, 0);
1876 if (err == 0)
1877 return 0;
1878 if (err < 0)
1879 return -EMSGSIZE;
1881 err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
1882 if (err < 0)
1883 return err;
1884 return 0;
1888 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
1890 struct rtable *rt;
1891 int h, s_h;
1892 int idx, s_idx;
1894 s_h = cb->args[0];
1895 s_idx = idx = cb->args[1];
1896 for (h=0; h < RT_HASH_DIVISOR; h++) {
1897 if (h < s_h) continue;
1898 if (h > s_h)
1899 s_idx = 0;
1900 read_lock_bh(&rt_hash_lock);
1901 for (rt = rt_hash_table[h], idx = 0; rt; rt = rt->u.rt_next, idx++) {
1902 if (idx < s_idx)
1903 continue;
1904 skb->dst = dst_clone(&rt->u.dst);
1905 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
1906 cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1) <= 0) {
1907 dst_release(xchg(&skb->dst, NULL));
1908 read_unlock_bh(&rt_hash_lock);
1909 goto done;
1911 dst_release(xchg(&skb->dst, NULL));
1913 read_unlock_bh(&rt_hash_lock);
1916 done:
1917 cb->args[0] = h;
1918 cb->args[1] = idx;
1919 return skb->len;
1922 #endif /* CONFIG_RTNETLINK */
1924 void ip_rt_multicast_event(struct in_device *in_dev)
1926 rt_cache_flush(0);
1931 #ifdef CONFIG_SYSCTL
1933 static int flush_delay;
1935 static
1936 int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
1937 void *buffer, size_t *lenp)
1939 if (write) {
1940 proc_dointvec(ctl, write, filp, buffer, lenp);
1941 rt_cache_flush(flush_delay);
1942 return 0;
1943 } else
1944 return -EINVAL;
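/* The "flush" entry below is write-only (mode 0200): writing a delay value,
 * e.g. "echo 0 > /proc/sys/net/ipv4/route/flush" assuming the usual sysctl
 * mount point, triggers rt_cache_flush(); reads fail with -EINVAL.
 */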
1947 ctl_table ipv4_route_table[] = {
1948 {NET_IPV4_ROUTE_FLUSH, "flush",
1949 &flush_delay, sizeof(int), 0200, NULL,
1950 &ipv4_sysctl_rtcache_flush},
1951 {NET_IPV4_ROUTE_MIN_DELAY, "min_delay",
1952 &ip_rt_min_delay, sizeof(int), 0644, NULL,
1953 &proc_dointvec_jiffies},
1954 {NET_IPV4_ROUTE_MAX_DELAY, "max_delay",
1955 &ip_rt_max_delay, sizeof(int), 0644, NULL,
1956 &proc_dointvec_jiffies},
1957 {NET_IPV4_ROUTE_GC_THRESH, "gc_thresh",
1958 &ipv4_dst_ops.gc_thresh, sizeof(int), 0644, NULL,
1959 &proc_dointvec},
1960 {NET_IPV4_ROUTE_MAX_SIZE, "max_size",
1961 &ip_rt_max_size, sizeof(int), 0644, NULL,
1962 &proc_dointvec},
1963 {NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval",
1964 &ip_rt_gc_min_interval, sizeof(int), 0644, NULL,
1965 &proc_dointvec_jiffies},
1966 {NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout",
1967 &ip_rt_gc_timeout, sizeof(int), 0644, NULL,
1968 &proc_dointvec_jiffies},
1969 {NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval",
1970 &ip_rt_gc_interval, sizeof(int), 0644, NULL,
1971 &proc_dointvec_jiffies},
1972 {NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load",
1973 &ip_rt_redirect_load, sizeof(int), 0644, NULL,
1974 &proc_dointvec},
1975 {NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number",
1976 &ip_rt_redirect_number, sizeof(int), 0644, NULL,
1977 &proc_dointvec},
1978 {NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence",
1979 &ip_rt_redirect_silence, sizeof(int), 0644, NULL,
1980 &proc_dointvec},
1981 {NET_IPV4_ROUTE_ERROR_COST, "error_cost",
1982 &ip_rt_error_cost, sizeof(int), 0644, NULL,
1983 &proc_dointvec},
1984 {NET_IPV4_ROUTE_ERROR_BURST, "error_burst",
1985 &ip_rt_error_burst, sizeof(int), 0644, NULL,
1986 &proc_dointvec},
1987 {NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity",
1988 &ip_rt_gc_elasticity, sizeof(int), 0644, NULL,
1989 &proc_dointvec},
1990 {NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires",
1991 &ip_rt_mtu_expires, sizeof(int), 0644, NULL,
1992 &proc_dointvec_jiffies},
1995 #endif
1997 #ifdef CONFIG_NET_CLS_ROUTE
1998 struct ip_rt_acct ip_rt_acct[256];
1999 rwlock_t ip_rt_acct_lock = RW_LOCK_UNLOCKED;
2001 #ifdef CONFIG_PROC_FS
2002 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
2003 int length, int *eof, void *data)
2005 *start=buffer;
2007 if (offset + length > sizeof(ip_rt_acct)) {
2008 length = sizeof(ip_rt_acct) - offset;
2009 *eof = 1;
2011 if (length > 0) {
2012 read_lock_bh(&ip_rt_acct_lock);
2013 memcpy(buffer, ((u8*)&ip_rt_acct)+offset, length);
2014 read_unlock_bh(&ip_rt_acct_lock);
2015 return length;
2017 return 0;
2019 #endif
2020 #endif
2023 __initfunc(void ip_rt_init(void))
2025 #ifdef CONFIG_PROC_FS
2026 #ifdef CONFIG_NET_CLS_ROUTE
2027 struct proc_dir_entry *ent;
2028 #endif
2029 #endif
2030 devinet_init();
2031 ip_fib_init();
2032 rt_periodic_timer.function = rt_check_expire;
2033 /* All the timers started at system startup tend
2034 to synchronize. Perturb this one a bit.
2036 rt_periodic_timer.expires = jiffies + net_random()%ip_rt_gc_interval
2037 + ip_rt_gc_interval;
2038 add_timer(&rt_periodic_timer);
2040 #ifdef CONFIG_PROC_FS
2041 proc_net_register(&(struct proc_dir_entry) {
2042 PROC_NET_RTCACHE, 8, "rt_cache",
2043 S_IFREG | S_IRUGO, 1, 0, 0,
2044 0, &proc_net_inode_operations,
2045 rt_cache_get_info
2047 #ifdef CONFIG_NET_CLS_ROUTE
2048 ent = create_proc_entry("net/rt_acct", 0, 0);
2049 ent->read_proc = ip_rt_acct_read;
2050 #endif
2051 #endif