1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
8 * Version: $Id: route.c,v 1.91 2000/10/03 07:29:00 anton Exp $
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
16 * Fixes:
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
23 * Alan Cox : Super /proc >4K
24 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
26 * clamper.
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36 * Alan Cox : Aligned routing errors more closely with BSD
37 * our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
42 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Splitted to fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
56 * This program is free software; you can redistribute it and/or
57 * modify it under the terms of the GNU General Public License
58 * as published by the Free Software Foundation; either version
59 * 2 of the License, or (at your option) any later version.
62 #include <linux/config.h>
63 #include <asm/uaccess.h>
64 #include <asm/system.h>
65 #include <asm/bitops.h>
66 #include <linux/types.h>
67 #include <linux/kernel.h>
68 #include <linux/sched.h>
69 #include <linux/mm.h>
70 #include <linux/string.h>
71 #include <linux/socket.h>
72 #include <linux/sockios.h>
73 #include <linux/errno.h>
74 #include <linux/in.h>
75 #include <linux/inet.h>
76 #include <linux/netdevice.h>
77 #include <linux/proc_fs.h>
78 #include <linux/init.h>
79 #include <linux/skbuff.h>
80 #include <linux/rtnetlink.h>
81 #include <linux/inetdevice.h>
82 #include <linux/igmp.h>
83 #include <linux/pkt_sched.h>
84 #include <linux/mroute.h>
85 #include <linux/netfilter_ipv4.h>
86 #include <linux/random.h>
87 #include <net/protocol.h>
88 #include <net/ip.h>
89 #include <net/route.h>
90 #include <net/inetpeer.h>
91 #include <net/sock.h>
92 #include <net/ip_fib.h>
93 #include <net/arp.h>
94 #include <net/tcp.h>
95 #include <net/icmp.h>
96 #ifdef CONFIG_SYSCTL
97 #include <linux/sysctl.h>
98 #endif
100 #define IP_MAX_MTU 0xFFF0
102 #define RT_GC_TIMEOUT (300*HZ)
104 int ip_rt_min_delay = 2*HZ;
105 int ip_rt_max_delay = 10*HZ;
106 int ip_rt_max_size;
107 int ip_rt_gc_timeout = RT_GC_TIMEOUT;
108 int ip_rt_gc_interval = 60*HZ;
109 int ip_rt_gc_min_interval = 5*HZ;
110 int ip_rt_redirect_number = 9;
111 int ip_rt_redirect_load = HZ/50;
112 int ip_rt_redirect_silence = ((HZ/50) << (9+1));
113 int ip_rt_error_cost = HZ;
114 int ip_rt_error_burst = 5*HZ;
115 int ip_rt_gc_elasticity = 8;
116 int ip_rt_mtu_expires = 10*60*HZ;
117 int ip_rt_min_pmtu = 512+20+20;
118 int ip_rt_min_advmss = 536;
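/*
 * A worked reading of the rate-limit defaults above (all HZ-relative, so the
 * absolute times hold for any HZ): ip_rt_redirect_load is HZ/50 = 20 ms, and
 * ip_rt_send_redirect() doubles the required gap after every redirect it
 * sends (20 ms, 40 ms, 80 ms, ...), giving up after ip_rt_redirect_number = 9
 * redirects in a row; ip_rt_redirect_silence = (HZ/50) << 10, roughly 20.5
 * seconds of quiet, re-arms the counter.  ip_error() uses a token bucket:
 * tokens accrue with time up to ip_rt_error_burst (5 seconds' worth) and each
 * ICMP error costs ip_rt_error_cost (1 second), i.e. a burst of five errors,
 * then at most one per second.  ip_rt_min_pmtu = 512+20+20 = 552 bytes.
 */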
120 static unsigned long rt_deadline;
122 #define RTprint(a...) printk(KERN_DEBUG a)
124 static struct timer_list rt_flush_timer;
125 static struct timer_list rt_periodic_timer;
128 * Interface to generic destination cache.
131 static struct dst_entry * ipv4_dst_check(struct dst_entry * dst, u32);
132 static struct dst_entry * ipv4_dst_reroute(struct dst_entry * dst,
133 struct sk_buff *);
134 static void ipv4_dst_destroy(struct dst_entry * dst);
135 static struct dst_entry * ipv4_negative_advice(struct dst_entry *);
136 static void ipv4_link_failure(struct sk_buff *skb);
137 static int rt_garbage_collect(void);
140 struct dst_ops ipv4_dst_ops =
142 AF_INET,
143 __constant_htons(ETH_P_IP),
146 rt_garbage_collect,
147 ipv4_dst_check,
148 ipv4_dst_reroute,
149 ipv4_dst_destroy,
150 ipv4_negative_advice,
151 ipv4_link_failure,
152 sizeof(struct rtable),
155 #ifdef CONFIG_INET_ECN
156 #define ECN_OR_COST(class) TC_PRIO_##class
157 #else
158 #define ECN_OR_COST(class) TC_PRIO_FILLER
159 #endif
161 __u8 ip_tos2prio[16] = {
162 TC_PRIO_BESTEFFORT,
163 ECN_OR_COST(FILLER),
164 TC_PRIO_BESTEFFORT,
165 ECN_OR_COST(BESTEFFORT),
166 TC_PRIO_BULK,
167 ECN_OR_COST(BULK),
168 TC_PRIO_BULK,
169 ECN_OR_COST(BULK),
170 TC_PRIO_INTERACTIVE,
171 ECN_OR_COST(INTERACTIVE),
172 TC_PRIO_INTERACTIVE,
173 ECN_OR_COST(INTERACTIVE),
174 TC_PRIO_INTERACTIVE_BULK,
175 ECN_OR_COST(INTERACTIVE_BULK),
176 TC_PRIO_INTERACTIVE_BULK,
177 ECN_OR_COST(INTERACTIVE_BULK)
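/*
 * ip_tos2prio maps the four IPv4 TOS bits to a packet scheduler priority.
 * The table is indexed by (tos & IPTOS_TOS_MASK) >> 1, normally through the
 * rt_tos2priority() helper in <net/route.h>.  The odd entries come from
 * ECN_OR_COST(): with CONFIG_INET_ECN they keep the priority of their
 * traffic class, otherwise they fall back to TC_PRIO_FILLER.
 */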
182 * Route cache.
185 /* The locking scheme is rather straightforward:
187 * 1) BH-protected rwlocks protect the buckets of the central route hash.
188 * 2) Only writers remove entries, and they hold the lock
189 * as they look at rtable reference counts.
190 * 3) Only readers acquire references to rtable entries;
191 * they do so with atomic increments and with the
192 * lock held.
195 struct rt_hash_bucket {
196 struct rtable *chain;
197 rwlock_t lock;
198 } __attribute__((__aligned__(8)));
200 static struct rt_hash_bucket *rt_hash_table;
201 static unsigned rt_hash_mask;
202 static int rt_hash_log;
204 static int rt_intern_hash(unsigned hash, struct rtable * rth, struct rtable ** res);
206 static __inline__ unsigned rt_hash_code(u32 daddr, u32 saddr, u8 tos)
208 unsigned hash = ((daddr&0xF0F0F0F0)>>4)|((daddr&0x0F0F0F0F)<<4);
209 hash ^= saddr^tos;
210 hash ^= (hash>>16);
211 return (hash^(hash>>8)) & rt_hash_mask;
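/*
 * rt_hash_code() above swaps adjacent nibbles of the destination address,
 * XORs in the source address and TOS, then folds the high bits down
 * (hash >> 16, then hash >> 8) before masking with rt_hash_mask.
 * rt_hash_mask is always one less than a power of two (2^rt_hash_log - 1),
 * set up in ip_rt_init() below, so the mask selects a valid bucket index.
 */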
214 static int rt_cache_get_info(char *buffer, char **start, off_t offset, int length)
216 int len=0;
217 off_t pos=0;
218 char temp[129];
219 struct rtable *r;
220 int i;
222 pos = 128;
224 if (offset<128) {
225 sprintf(buffer,"%-127s\n", "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\tMetric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\tHHUptod\tSpecDst");
226 len = 128;
229 for (i = rt_hash_mask; i>=0; i--) {
230 read_lock_bh(&rt_hash_table[i].lock);
231 for (r = rt_hash_table[i].chain; r; r = r->u.rt_next) {
233 * Spin through entries until we are ready
235 pos += 128;
237 if (pos <= offset) {
238 len = 0;
239 continue;
241 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
242 r->u.dst.dev ? r->u.dst.dev->name : "*",
243 (unsigned long)r->rt_dst,
244 (unsigned long)r->rt_gateway,
245 r->rt_flags,
246 atomic_read(&r->u.dst.__refcnt),
247 r->u.dst.__use,
248 0,
249 (unsigned long)r->rt_src, (int)r->u.dst.advmss + 40,
250 r->u.dst.window,
251 (int)((r->u.dst.rtt>>3) + r->u.dst.rttvar),
252 r->key.tos,
253 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
254 r->u.dst.hh ? (r->u.dst.hh->hh_output == dev_queue_xmit) : 0,
255 r->rt_spec_dst);
256 sprintf(buffer+len,"%-127s\n",temp);
257 len += 128;
258 if (pos >= offset+length) {
259 read_unlock_bh(&rt_hash_table[i].lock);
260 goto done;
263 read_unlock_bh(&rt_hash_table[i].lock);
266 done:
267 *start = buffer+len-(pos-offset);
268 len = pos-offset;
269 if (len>length)
270 len = length;
271 return len;
274 static __inline__ void rt_free(struct rtable *rt)
276 dst_free(&rt->u.dst);
279 static __inline__ void rt_drop(struct rtable *rt)
281 ip_rt_put(rt);
282 dst_free(&rt->u.dst);
285 static __inline__ int rt_fast_clean(struct rtable *rth)
287 /* Kill broadcast/multicast entries very aggressively, if they
288 collide in the hash table with more useful entries */
289 return ((rth->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST))
290 && rth->key.iif && rth->u.rt_next);
293 static __inline__ int rt_valuable(struct rtable *rth)
295 return ((rth->rt_flags&(RTCF_REDIRECTED|RTCF_NOTIFY))
296 || rth->u.dst.expires);
299 static __inline__ int rt_may_expire(struct rtable *rth, int tmo1, int tmo2)
301 int age;
303 if (atomic_read(&rth->u.dst.__refcnt))
304 return 0;
306 if (rth->u.dst.expires && (long)(rth->u.dst.expires - jiffies) <= 0)
307 return 1;
309 age = jiffies - rth->u.dst.lastuse;
310 if (age <= tmo1 && !rt_fast_clean(rth))
311 return 0;
312 if (age <= tmo2 && rt_valuable(rth))
313 return 0;
314 return 1;
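/*
 * In short: an entry is never dropped while its refcount is held, and a hard
 * expiry (dst.expires in the past) always makes it droppable.  Otherwise two
 * age thresholds apply: entries younger than tmo1 survive unless
 * rt_fast_clean() marks them cheap to rebuild (broadcast/multicast input
 * routes colliding in a chain), and entries younger than tmo2 survive if
 * rt_valuable() marks them worth keeping (redirected routes, or routes
 * carrying an expiry such as a learned PMTU).
 */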
317 /* This runs via a timer and thus is always in BH context. */
318 static void SMP_TIMER_NAME(rt_check_expire)(unsigned long dummy)
320 int i, t;
321 static int rover;
322 struct rtable *rth, **rthp;
323 unsigned long now = jiffies;
325 i = rover;
327 for (t=(ip_rt_gc_interval<<rt_hash_log); t>=0; t -= ip_rt_gc_timeout) {
328 unsigned tmo = ip_rt_gc_timeout;
330 i = (i + 1) & rt_hash_mask;
331 rthp = &rt_hash_table[i].chain;
333 write_lock(&rt_hash_table[i].lock);
334 while ((rth = *rthp) != NULL) {
335 if (rth->u.dst.expires) {
336 /* Entry is expired even if it is in use */
337 if ((long)(now - rth->u.dst.expires) <= 0) {
338 tmo >>= 1;
339 rthp = &rth->u.rt_next;
340 continue;
342 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
343 tmo >>= 1;
344 rthp = &rth->u.rt_next;
345 continue;
349 * Cleanup aged off entries.
351 *rthp = rth->u.rt_next;
352 rt_free(rth);
354 write_unlock(&rt_hash_table[i].lock);
356 /* Fallback loop breaker. */
357 if ((jiffies - now) > 0)
358 break;
360 rover = i;
361 mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
364 SMP_TIMER_DEFINE(rt_check_expire, rt_gc_task);
366 /* This can run from both BH and non-BH contexts, the latter
367 * in the case of a forced flush event.
369 static void SMP_TIMER_NAME(rt_run_flush)(unsigned long dummy)
371 int i;
372 struct rtable * rth, * next;
374 rt_deadline = 0;
376 for (i=rt_hash_mask; i>=0; i--) {
377 write_lock_bh(&rt_hash_table[i].lock);
378 rth = rt_hash_table[i].chain;
379 if (rth)
380 rt_hash_table[i].chain = NULL;
381 write_unlock_bh(&rt_hash_table[i].lock);
383 for (; rth; rth=next) {
384 next = rth->u.rt_next;
385 rt_free(rth);
390 SMP_TIMER_DEFINE(rt_run_flush, rt_cache_flush_task);
392 static spinlock_t rt_flush_lock = SPIN_LOCK_UNLOCKED;
394 void rt_cache_flush(int delay)
396 unsigned long now = jiffies;
397 int user_mode = !in_softirq();
399 if (delay < 0)
400 delay = ip_rt_min_delay;
402 spin_lock_bh(&rt_flush_lock);
404 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
405 long tmo = (long)(rt_deadline - now);
407 /* If flush timer is already running
408 and flush request is not immediate (delay > 0):
410 if deadline is not achieved, prolongate timer to "delay",
411 otherwise fire it at deadline time.
414 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
415 tmo = 0;
417 if (delay > tmo)
418 delay = tmo;
421 if (delay <= 0) {
422 spin_unlock_bh(&rt_flush_lock);
423 SMP_TIMER_NAME(rt_run_flush)(0);
424 return;
427 if (rt_deadline == 0)
428 rt_deadline = now + ip_rt_max_delay;
430 mod_timer(&rt_flush_timer, now+delay);
431 spin_unlock_bh(&rt_flush_lock);
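/*
 * rt_cache_flush() semantics, by example: a negative delay means "use the
 * default" ip_rt_min_delay (2 seconds); delay == 0 flushes synchronously via
 * rt_run_flush(); a positive delay arms rt_flush_timer, but the flush is
 * never pushed past rt_deadline, which is at most ip_rt_max_delay
 * (10 seconds) after the first pending request.
 */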
435 Short description of GC goals.
437 We want an algorithm which keeps the routing cache at an
438 equilibrium point, where the number of aged-off entries stays
439 approximately equal to the number of newly generated ones.
441 The current expiration strength is the variable "expire".
442 We try to adjust it dynamically, so that when the network is
443 idle, expire is large enough to keep plenty of warm entries,
444 and when load increases it shrinks to limit the cache size.
447 static int rt_garbage_collect(void)
449 static unsigned expire = RT_GC_TIMEOUT;
450 static unsigned long last_gc;
451 static int rover;
452 static int equilibrium;
453 struct rtable *rth, **rthp;
454 unsigned long now = jiffies;
455 int goal;
458 * Garbage collection is pretty expensive,
459 * do not make it too frequently.
461 if (now - last_gc < ip_rt_gc_min_interval &&
462 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
463 return 0;
465 /* Calculate number of entries, which we want to expire now. */
466 goal = atomic_read(&ipv4_dst_ops.entries) - (ip_rt_gc_elasticity<<rt_hash_log);
467 if (goal <= 0) {
468 if (equilibrium < ipv4_dst_ops.gc_thresh)
469 equilibrium = ipv4_dst_ops.gc_thresh;
470 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
471 if (goal > 0) {
472 equilibrium += min(goal/2, rt_hash_mask+1);
473 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
475 } else {
476 /* We are in dangerous area. Try to reduce cache really
477 * aggressively.
479 goal = max(goal/2, rt_hash_mask+1);
480 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
483 if (now - last_gc >= ip_rt_gc_min_interval)
484 last_gc = now;
486 if (goal <= 0) {
487 equilibrium += goal;
488 goto work_done;
491 do {
492 int i, k;
494 for (i=rt_hash_mask, k=rover; i>=0; i--) {
495 unsigned tmo = expire;
497 k = (k + 1) & rt_hash_mask;
498 rthp = &rt_hash_table[k].chain;
499 write_lock_bh(&rt_hash_table[k].lock);
500 while ((rth = *rthp) != NULL) {
501 if (!rt_may_expire(rth, tmo, expire)) {
502 tmo >>= 1;
503 rthp = &rth->u.rt_next;
504 continue;
506 *rthp = rth->u.rt_next;
507 rt_free(rth);
508 goal--;
510 write_unlock_bh(&rt_hash_table[k].lock);
511 if (goal <= 0)
512 break;
514 rover = k;
516 if (goal <= 0)
517 goto work_done;
519 /* Goal is not achieved. We stop the process if:
521 - expire has been reduced to zero; otherwise, expire is halved.
522 - the table is not full.
523 - we are called from interrupt context.
524 - the jiffies check is just a fallback/debug loop breaker.
525 We will not spin here for a long time in any case.
528 if (expire == 0)
529 break;
531 expire >>= 1;
532 #if RT_CACHE_DEBUG >= 2
533 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire, atomic_read(&ipv4_dst_ops.entries), goal, i);
534 #endif
536 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
537 return 0;
538 } while (!in_softirq() && jiffies - now < 1);
540 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
541 return 0;
542 if (net_ratelimit())
543 printk("dst cache overflow\n");
544 return 1;
546 work_done:
547 expire += ip_rt_gc_min_interval;
548 if (expire > ip_rt_gc_timeout ||
549 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
550 expire = ip_rt_gc_timeout;
551 #if RT_CACHE_DEBUG >= 2
552 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire, atomic_read(&ipv4_dst_ops.entries), goal, rover);
553 #endif
554 return 0;
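/*
 * To summarize the collector above: "goal" is how many entries this pass
 * should free (the cache size minus a moving "equilibrium"), and "expire" is
 * the age threshold handed to rt_may_expire().  A pass that misses its goal
 * halves expire, making the next sweep more aggressive; a pass that reaches
 * work_done relaxes expire by ip_rt_gc_min_interval again (capped at
 * ip_rt_gc_timeout), so under steady load evictions roughly track insertions.
 */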
557 static int rt_intern_hash(unsigned hash, struct rtable * rt, struct rtable ** rp)
559 struct rtable *rth, **rthp;
560 unsigned long now = jiffies;
561 int attempts = !in_softirq();
563 restart:
564 rthp = &rt_hash_table[hash].chain;
566 write_lock_bh(&rt_hash_table[hash].lock);
567 while ((rth = *rthp) != NULL) {
568 if (memcmp(&rth->key, &rt->key, sizeof(rt->key)) == 0) {
569 /* Put it first */
570 *rthp = rth->u.rt_next;
571 rth->u.rt_next = rt_hash_table[hash].chain;
572 rt_hash_table[hash].chain = rth;
574 rth->u.dst.__use++;
575 dst_hold(&rth->u.dst);
576 rth->u.dst.lastuse = now;
577 write_unlock_bh(&rt_hash_table[hash].lock);
579 rt_drop(rt);
580 *rp = rth;
581 return 0;
584 rthp = &rth->u.rt_next;
587 /* Try to bind route to arp only if it is output
588 route or unicast forwarding path.
590 if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0) {
591 int err = arp_bind_neighbour(&rt->u.dst);
592 if (err) {
593 write_unlock_bh(&rt_hash_table[hash].lock);
595 if (err != -ENOBUFS) {
596 rt_drop(rt);
597 return err;
600 /* Neighbour tables are full and nothing
601 can be released. Try to shrink the route cache,
602 as it most likely holds some neighbour records.
604 if (attempts-- > 0) {
605 int saved_elasticity = ip_rt_gc_elasticity;
606 int saved_int = ip_rt_gc_min_interval;
607 ip_rt_gc_elasticity = 1;
608 ip_rt_gc_min_interval = 0;
609 rt_garbage_collect();
610 ip_rt_gc_min_interval = saved_int;
611 ip_rt_gc_elasticity = saved_elasticity;
612 goto restart;
615 if (net_ratelimit())
616 printk("Neighbour table overflow.\n");
617 rt_drop(rt);
618 return -ENOBUFS;
622 rt->u.rt_next = rt_hash_table[hash].chain;
623 #if RT_CACHE_DEBUG >= 2
624 if (rt->u.rt_next) {
625 struct rtable * trt;
626 printk("rt_cache @%02x: %u.%u.%u.%u", hash, NIPQUAD(rt->rt_dst));
627 for (trt=rt->u.rt_next; trt; trt=trt->u.rt_next)
628 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
629 printk("\n");
631 #endif
632 rt_hash_table[hash].chain = rt;
633 write_unlock_bh(&rt_hash_table[hash].lock);
634 *rp = rt;
635 return 0;
638 void rt_bind_peer(struct rtable *rt, int create)
640 static spinlock_t rt_peer_lock = SPIN_LOCK_UNLOCKED;
641 struct inet_peer *peer;
643 peer = inet_getpeer(rt->rt_dst, create);
645 spin_lock_bh(&rt_peer_lock);
646 if (rt->peer == NULL) {
647 rt->peer = peer;
648 peer = NULL;
650 spin_unlock_bh(&rt_peer_lock);
651 if (peer)
652 inet_putpeer(peer);
656 * Peer allocation may fail only in serious out-of-memory conditions. However,
657 * we can still generate some output.
658 * Random ID selection looks a bit dangerous because we have no chance of
659 * selecting an ID that stays unique over a reasonable period of time.
660 * But a broken packet identifier may be better than no packet at all.
662 static void ip_select_fb_ident(struct iphdr *iph)
664 static spinlock_t ip_fb_id_lock = SPIN_LOCK_UNLOCKED;
665 static u32 ip_fallback_id;
666 u32 salt;
668 spin_lock_bh(&ip_fb_id_lock);
669 salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
670 iph->id = salt & 0xFFFF;
671 ip_fallback_id = salt;
672 spin_unlock_bh(&ip_fb_id_lock);
675 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst)
677 struct rtable *rt = (struct rtable *) dst;
679 if (rt) {
680 if (rt->peer == NULL)
681 rt_bind_peer(rt, 1);
683 /* If a peer is attached to a destination, it is never detached,
684 so we do not need to grab a lock to dereference it.
686 if (rt->peer) {
687 iph->id = inet_getid(rt->peer);
688 return;
690 } else {
691 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", NET_CALLER(iph));
694 ip_select_fb_ident(iph);
697 static void rt_del(unsigned hash, struct rtable *rt)
699 struct rtable **rthp;
701 write_lock_bh(&rt_hash_table[hash].lock);
702 ip_rt_put(rt);
703 for (rthp = &rt_hash_table[hash].chain; *rthp; rthp = &(*rthp)->u.rt_next) {
704 if (*rthp == rt) {
705 *rthp = rt->u.rt_next;
706 rt_free(rt);
707 break;
710 write_unlock_bh(&rt_hash_table[hash].lock);
713 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
714 u32 saddr, u8 tos, struct net_device *dev)
716 int i, k;
717 struct in_device *in_dev = in_dev_get(dev);
718 struct rtable *rth, **rthp;
719 u32 skeys[2] = { saddr, 0 };
720 int ikeys[2] = { dev->ifindex, 0 };
722 tos &= IPTOS_RT_MASK;
724 if (!in_dev)
725 return;
727 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
728 || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
729 goto reject_redirect;
731 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
732 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
733 goto reject_redirect;
734 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
735 goto reject_redirect;
736 } else {
737 if (inet_addr_type(new_gw) != RTN_UNICAST)
738 goto reject_redirect;
741 for (i=0; i<2; i++) {
742 for (k=0; k<2; k++) {
743 unsigned hash = rt_hash_code(daddr, skeys[i]^(ikeys[k]<<5), tos);
745 rthp=&rt_hash_table[hash].chain;
747 read_lock(&rt_hash_table[hash].lock);
748 while ( (rth = *rthp) != NULL) {
749 struct rtable *rt;
751 if (rth->key.dst != daddr ||
752 rth->key.src != skeys[i] ||
753 rth->key.tos != tos ||
754 rth->key.oif != ikeys[k] ||
755 rth->key.iif != 0) {
756 rthp = &rth->u.rt_next;
757 continue;
760 if (rth->rt_dst != daddr ||
761 rth->rt_src != saddr ||
762 rth->u.dst.error ||
763 rth->rt_gateway != old_gw ||
764 rth->u.dst.dev != dev)
765 break;
767 dst_clone(&rth->u.dst);
768 read_unlock(&rt_hash_table[hash].lock);
770 rt = dst_alloc(&ipv4_dst_ops);
771 if (rt == NULL) {
772 ip_rt_put(rth);
773 in_dev_put(in_dev);
774 return;
778 * Copy all the information.
780 *rt = *rth;
781 rt->u.dst.__use = 1;
782 atomic_set(&rt->u.dst.__refcnt, 1);
783 if (rt->u.dst.dev)
784 dev_hold(rt->u.dst.dev);
785 rt->u.dst.lastuse = jiffies;
786 rt->u.dst.neighbour = NULL;
787 rt->u.dst.hh = NULL;
788 rt->u.dst.obsolete = 0;
790 rt->rt_flags |= RTCF_REDIRECTED;
792 /* Gateway is different ... */
793 rt->rt_gateway = new_gw;
795 /* Redirect received -> path was valid */
796 dst_confirm(&rth->u.dst);
798 if (rt->peer)
799 atomic_inc(&rt->peer->refcnt);
801 if (arp_bind_neighbour(&rt->u.dst) ||
802 !(rt->u.dst.neighbour->nud_state&NUD_VALID)) {
803 if (rt->u.dst.neighbour)
804 neigh_event_send(rt->u.dst.neighbour, NULL);
805 ip_rt_put(rth);
806 rt_drop(rt);
807 goto do_next;
810 rt_del(hash, rth);
811 if (!rt_intern_hash(hash, rt, &rt))
812 ip_rt_put(rt);
813 goto do_next;
815 read_unlock(&rt_hash_table[hash].lock);
816 do_next:
820 in_dev_put(in_dev);
821 return;
823 reject_redirect:
824 #ifdef CONFIG_IP_ROUTE_VERBOSE
825 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
826 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about %u.%u.%u.%u ignored.\n"
827 " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, tos %02x\n",
828 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
829 NIPQUAD(saddr), NIPQUAD(daddr), tos);
830 #endif
831 in_dev_put(in_dev);
834 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
836 struct rtable *rt = (struct rtable*)dst;
838 if (rt != NULL) {
839 if (dst->obsolete) {
840 ip_rt_put(rt);
841 return NULL;
843 if ((rt->rt_flags&RTCF_REDIRECTED) || rt->u.dst.expires) {
844 unsigned hash = rt_hash_code(rt->key.dst, rt->key.src^(rt->key.oif<<5), rt->key.tos);
845 #if RT_CACHE_DEBUG >= 1
846 printk(KERN_DEBUG "ip_rt_advice: redirect to %u.%u.%u.%u/%02x dropped\n",
847 NIPQUAD(rt->rt_dst), rt->key.tos);
848 #endif
849 rt_del(hash, rt);
850 return NULL;
853 return dst;
857 * Algorithm:
858 * 1. The first ip_rt_redirect_number redirects are sent
859 * with exponential backoff, then we stop sending them at all,
860 * assuming that the host ignores our redirects.
861 * 2. If we did not see packets requiring redirects
862 * during ip_rt_redirect_silence, we assume that the host
863 * forgot redirected route and start to send redirects again.
865 * This algorithm is much cheaper and more intelligent than dumb load limiting
866 * in icmp.c.
868 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
869 * and "frag. need" (breaks PMTU discovery) in icmp.c.
872 void ip_rt_send_redirect(struct sk_buff *skb)
874 struct rtable *rt = (struct rtable*)skb->dst;
875 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
877 if (!in_dev)
878 return;
880 if (!IN_DEV_TX_REDIRECTS(in_dev))
881 goto out;
883 /* No redirected packets during ip_rt_redirect_silence;
884 * reset the algorithm.
886 if (jiffies - rt->u.dst.rate_last > ip_rt_redirect_silence)
887 rt->u.dst.rate_tokens = 0;
889 /* Too many ignored redirects; do not send anything,
890 * just set u.dst.rate_last to the last seen redirected packet.
892 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
893 rt->u.dst.rate_last = jiffies;
894 goto out;
897 /* Check for load limit; set rate_last to the latest sent
898 * redirect.
900 if (jiffies - rt->u.dst.rate_last > (ip_rt_redirect_load<<rt->u.dst.rate_tokens)) {
901 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
902 rt->u.dst.rate_last = jiffies;
903 ++rt->u.dst.rate_tokens;
904 #ifdef CONFIG_IP_ROUTE_VERBOSE
905 if (IN_DEV_LOG_MARTIANS(in_dev) &&
906 rt->u.dst.rate_tokens == ip_rt_redirect_number && net_ratelimit())
907 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores redirects for "
908 "%u.%u.%u.%u to %u.%u.%u.%u.\n",
909 NIPQUAD(rt->rt_src), rt->rt_iif,
910 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
911 #endif
913 out:
914 in_dev_put(in_dev);
917 static int ip_error(struct sk_buff *skb)
919 struct rtable *rt = (struct rtable*)skb->dst;
920 unsigned long now;
921 int code;
923 switch (rt->u.dst.error) {
924 case EINVAL:
925 default:
926 kfree_skb(skb);
927 return 0;
928 case EHOSTUNREACH:
929 code = ICMP_HOST_UNREACH;
930 break;
931 case ENETUNREACH:
932 code = ICMP_NET_UNREACH;
933 break;
934 case EACCES:
935 code = ICMP_PKT_FILTERED;
936 break;
939 now = jiffies;
940 if ((rt->u.dst.rate_tokens += (now - rt->u.dst.rate_last)) > ip_rt_error_burst)
941 rt->u.dst.rate_tokens = ip_rt_error_burst;
942 rt->u.dst.rate_last = now;
943 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
944 rt->u.dst.rate_tokens -= ip_rt_error_cost;
945 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
948 kfree_skb(skb);
949 return 0;
953 * The last two values are not from the RFC but
954 * are needed for AMPRnet AX.25 paths.
957 static unsigned short mtu_plateau[] =
958 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
960 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
962 int i;
964 for (i = 0; i < sizeof(mtu_plateau)/sizeof(mtu_plateau[0]); i++)
965 if (old_mtu > mtu_plateau[i])
966 return mtu_plateau[i];
967 return 68;
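/*
 * Example: an ICMP "fragmentation needed" quoting a 1500-byte datagram but
 * carrying no next-hop MTU makes guess_mtu(1500) return 1492, the highest
 * plateau strictly below 1500.  If old_mtu is not above even the smallest
 * plateau (128), the 68-byte minimum from RFC 791 is returned.
 */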
970 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
972 int i;
973 unsigned short old_mtu = ntohs(iph->tot_len);
974 struct rtable *rth;
975 u32 skeys[2] = { iph->saddr, 0, };
976 u32 daddr = iph->daddr;
977 u8 tos = iph->tos & IPTOS_RT_MASK;
978 unsigned short est_mtu = 0;
980 if (ipv4_config.no_pmtu_disc)
981 return 0;
983 for (i=0; i<2; i++) {
984 unsigned hash = rt_hash_code(daddr, skeys[i], tos);
986 read_lock(&rt_hash_table[hash].lock);
987 for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
988 if (rth->key.dst == daddr &&
989 rth->key.src == skeys[i] &&
990 rth->rt_dst == daddr &&
991 rth->rt_src == iph->saddr &&
992 rth->key.tos == tos &&
993 rth->key.iif == 0 &&
994 !(rth->u.dst.mxlock&(1<<RTAX_MTU))) {
995 unsigned short mtu = new_mtu;
997 if (new_mtu < 68 || new_mtu >= old_mtu) {
999 /* BSD 4.2 compatibility hack :-( */
1000 if (mtu == 0 && old_mtu >= rth->u.dst.pmtu &&
1001 old_mtu >= 68 + (iph->ihl<<2))
1002 old_mtu -= iph->ihl<<2;
1004 mtu = guess_mtu(old_mtu);
1006 if (mtu <= rth->u.dst.pmtu) {
1007 if (mtu < rth->u.dst.pmtu) {
1008 dst_confirm(&rth->u.dst);
1009 if (mtu < ip_rt_min_pmtu) {
1010 mtu = ip_rt_min_pmtu;
1011 rth->u.dst.mxlock |= (1<<RTAX_MTU);
1013 rth->u.dst.pmtu = mtu;
1014 dst_set_expires(&rth->u.dst, ip_rt_mtu_expires);
1016 est_mtu = mtu;
1020 read_unlock(&rt_hash_table[hash].lock);
1022 return est_mtu ? : new_mtu;
1025 void ip_rt_update_pmtu(struct dst_entry *dst, unsigned mtu)
1027 if (dst->pmtu > mtu && mtu >= 68 &&
1028 !(dst->mxlock&(1<<RTAX_MTU))) {
1029 if (mtu < ip_rt_min_pmtu) {
1030 mtu = ip_rt_min_pmtu;
1031 dst->mxlock |= (1<<RTAX_MTU);
1033 dst->pmtu = mtu;
1034 dst_set_expires(dst, ip_rt_mtu_expires);
1038 static struct dst_entry * ipv4_dst_check(struct dst_entry * dst, u32 cookie)
1040 dst_release(dst);
1041 return NULL;
1044 static struct dst_entry * ipv4_dst_reroute(struct dst_entry * dst,
1045 struct sk_buff *skb)
1047 return NULL;
1050 static void ipv4_dst_destroy(struct dst_entry * dst)
1052 struct rtable *rt = (struct rtable *) dst;
1053 struct inet_peer *peer = rt->peer;
1055 if (peer) {
1056 rt->peer = NULL;
1057 inet_putpeer(peer);
1061 static void ipv4_link_failure(struct sk_buff *skb)
1063 struct rtable *rt;
1065 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1067 rt = (struct rtable *) skb->dst;
1068 if (rt)
1069 dst_set_expires(&rt->u.dst, 0);
1072 static int ip_rt_bug(struct sk_buff *skb)
1074 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1075 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1076 skb->dev ? skb->dev->name : "?");
1077 kfree_skb(skb);
1078 return 0;
1082 We do not cache the source address of the outgoing interface,
1083 because it is used only by the IP RR, TS and SRR options,
1084 so it is out of the fast path.
1086 BTW remember: "addr" is allowed to be unaligned
1087 in IP options!
1090 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1092 u32 src;
1093 struct fib_result res;
1095 if (rt->key.iif == 0)
1096 src = rt->rt_src;
1097 else if (fib_lookup(&rt->key, &res) == 0) {
1098 #ifdef CONFIG_IP_ROUTE_NAT
1099 if (res.type == RTN_NAT)
1100 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE);
1101 else
1102 #endif
1103 src = FIB_RES_PREFSRC(res);
1104 fib_res_put(&res);
1105 } else
1106 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE);
1107 memcpy(addr, &src, 4);
1110 #ifdef CONFIG_NET_CLS_ROUTE
1111 static void set_class_tag(struct rtable *rt, u32 tag)
1113 if (!(rt->u.dst.tclassid&0xFFFF))
1114 rt->u.dst.tclassid |= tag&0xFFFF;
1115 if (!(rt->u.dst.tclassid&0xFFFF0000))
1116 rt->u.dst.tclassid |= tag&0xFFFF0000;
1118 #endif
1120 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1122 struct fib_info *fi = res->fi;
1124 if (fi) {
1125 if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1126 rt->rt_gateway = FIB_RES_GW(*res);
1127 memcpy(&rt->u.dst.mxlock, fi->fib_metrics, sizeof(fi->fib_metrics));
1128 if (fi->fib_mtu == 0) {
1129 rt->u.dst.pmtu = rt->u.dst.dev->mtu;
1130 if (rt->u.dst.mxlock&(1<<RTAX_MTU) &&
1131 rt->rt_gateway != rt->rt_dst &&
1132 rt->u.dst.pmtu > 576)
1133 rt->u.dst.pmtu = 576;
1135 #ifdef CONFIG_NET_CLS_ROUTE
1136 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1137 #endif
1138 } else {
1139 rt->u.dst.pmtu = rt->u.dst.dev->mtu;
1141 if (rt->u.dst.pmtu > IP_MAX_MTU)
1142 rt->u.dst.pmtu = IP_MAX_MTU;
1143 if (rt->u.dst.advmss == 0)
1144 rt->u.dst.advmss = max(rt->u.dst.dev->mtu-40, ip_rt_min_advmss);
1145 if (rt->u.dst.advmss > 65535-40)
1146 rt->u.dst.advmss = 65535-40;
1148 #ifdef CONFIG_NET_CLS_ROUTE
1149 #ifdef CONFIG_IP_MULTIPLE_TABLES
1150 set_class_tag(rt, fib_rules_tclass(res));
1151 #endif
1152 set_class_tag(rt, itag);
1153 #endif
1154 rt->rt_type = res->type;
1157 static int
1158 ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1159 u8 tos, struct net_device *dev, int our)
1161 unsigned hash;
1162 struct rtable *rth;
1163 u32 spec_dst;
1164 struct in_device *in_dev = in_dev_get(dev);
1165 u32 itag = 0;
1167 /* Primary sanity checks. */
1169 if (in_dev == NULL)
1170 return -EINVAL;
1172 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1173 skb->protocol != __constant_htons(ETH_P_IP))
1174 goto e_inval;
1176 if (ZERONET(saddr)) {
1177 if (!LOCAL_MCAST(daddr))
1178 goto e_inval;
1179 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1180 } else if (fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag) < 0)
1181 goto e_inval;
1183 rth = dst_alloc(&ipv4_dst_ops);
1184 if (!rth)
1185 goto e_nobufs;
1187 rth->u.dst.output= ip_rt_bug;
1189 atomic_set(&rth->u.dst.__refcnt, 1);
1190 rth->u.dst.flags= DST_HOST;
1191 rth->key.dst = daddr;
1192 rth->rt_dst = daddr;
1193 rth->key.tos = tos;
1194 #ifdef CONFIG_IP_ROUTE_FWMARK
1195 rth->key.fwmark = skb->nfmark;
1196 #endif
1197 rth->key.src = saddr;
1198 rth->rt_src = saddr;
1199 #ifdef CONFIG_IP_ROUTE_NAT
1200 rth->rt_dst_map = daddr;
1201 rth->rt_src_map = saddr;
1202 #endif
1203 #ifdef CONFIG_NET_CLS_ROUTE
1204 rth->u.dst.tclassid = itag;
1205 #endif
1206 rth->rt_iif =
1207 rth->key.iif = dev->ifindex;
1208 rth->u.dst.dev = &loopback_dev;
1209 dev_hold(rth->u.dst.dev);
1210 rth->key.oif = 0;
1211 rth->rt_gateway = daddr;
1212 rth->rt_spec_dst= spec_dst;
1213 rth->rt_type = RTN_MULTICAST;
1214 rth->rt_flags = RTCF_MULTICAST;
1215 if (our) {
1216 rth->u.dst.input= ip_local_deliver;
1217 rth->rt_flags |= RTCF_LOCAL;
1220 #ifdef CONFIG_IP_MROUTE
1221 if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1222 rth->u.dst.input = ip_mr_input;
1223 #endif
1225 in_dev_put(in_dev);
1226 hash = rt_hash_code(daddr, saddr^(dev->ifindex<<5), tos);
1227 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1229 e_nobufs:
1230 in_dev_put(in_dev);
1231 return -ENOBUFS;
1233 e_inval:
1234 in_dev_put(in_dev);
1235 return -EINVAL;
1239 * NOTE. We drop all packets that have local source
1240 * addresses, because every properly looped-back packet
1241 * must already have the correct destination attached by the output routine.
1243 * This approach solves two big problems:
1244 * 1. Non-simplex devices are handled properly.
1245 * 2. IP spoofing attempts are filtered with a 100% guarantee.
1248 int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1249 u8 tos, struct net_device *dev)
1251 struct rt_key key;
1252 struct fib_result res;
1253 struct in_device *in_dev = in_dev_get(dev);
1254 struct in_device *out_dev = NULL;
1255 unsigned flags = 0;
1256 u32 itag = 0;
1257 struct rtable * rth;
1258 unsigned hash;
1259 u32 spec_dst;
1260 int err = -EINVAL;
1261 int free_res = 0;
1264 * IP on this device is disabled.
1267 if (!in_dev)
1268 return -EINVAL;
1270 key.dst = daddr;
1271 key.src = saddr;
1272 key.tos = tos;
1273 #ifdef CONFIG_IP_ROUTE_FWMARK
1274 key.fwmark = skb->nfmark;
1275 #endif
1276 key.iif = dev->ifindex;
1277 key.oif = 0;
1278 key.scope = RT_SCOPE_UNIVERSE;
1280 hash = rt_hash_code(daddr, saddr^(key.iif<<5), tos);
1282 /* Check for the most weird martians, which cannot be detected
1283 by fib_lookup.
1286 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1287 goto martian_source;
1289 if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1290 goto brd_input;
1292 /* Accept zero addresses only for limited broadcast;
1293 * I do not even know whether to fix this or not. Waiting for complaints :-)
1295 if (ZERONET(saddr))
1296 goto martian_source;
1298 if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1299 goto martian_destination;
1302 * Now we are ready to route packet.
1304 if ((err = fib_lookup(&key, &res)) != 0) {
1305 if (!IN_DEV_FORWARD(in_dev))
1306 goto e_inval;
1307 goto no_route;
1309 free_res = 1;
1311 #ifdef CONFIG_IP_ROUTE_NAT
1312 /* Policy is applied before mapping the destination,
1313 but rerouting after the mapping must be done with the old source.
1316 if (1) {
1317 u32 src_map = saddr;
1318 if (res.r)
1319 src_map = fib_rules_policy(saddr, &res, &flags);
1321 if (res.type == RTN_NAT) {
1322 key.dst = fib_rules_map_destination(daddr, &res);
1323 fib_res_put(&res);
1324 free_res = 0;
1325 if (fib_lookup(&key, &res))
1326 goto e_inval;
1327 free_res = 1;
1328 if (res.type != RTN_UNICAST)
1329 goto e_inval;
1330 flags |= RTCF_DNAT;
1332 key.src = src_map;
1334 #endif
1336 if (res.type == RTN_BROADCAST)
1337 goto brd_input;
1339 if (res.type == RTN_LOCAL) {
1340 int result;
1341 result = fib_validate_source(saddr, daddr, tos, loopback_dev.ifindex,
1342 dev, &spec_dst, &itag);
1343 if (result < 0)
1344 goto martian_source;
1345 if (result)
1346 flags |= RTCF_DIRECTSRC;
1347 spec_dst = daddr;
1348 goto local_input;
1351 if (!IN_DEV_FORWARD(in_dev))
1352 goto e_inval;
1353 if (res.type != RTN_UNICAST)
1354 goto martian_destination;
1356 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1357 if (res.fi->fib_nhs > 1 && key.oif == 0)
1358 fib_select_multipath(&key, &res);
1359 #endif
1360 out_dev = in_dev_get(FIB_RES_DEV(res));
1361 if (out_dev == NULL) {
1362 if (net_ratelimit())
1363 printk(KERN_CRIT "Bug in ip_route_input_slow(). Please, report\n");
1364 goto e_inval;
1367 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev, &spec_dst, &itag);
1368 if (err < 0)
1369 goto martian_source;
1371 if (err)
1372 flags |= RTCF_DIRECTSRC;
1374 if (out_dev == in_dev && err && !(flags&(RTCF_NAT|RTCF_MASQ)) &&
1375 (IN_DEV_SHARED_MEDIA(out_dev)
1376 || inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res))))
1377 flags |= RTCF_DOREDIRECT;
1379 if (skb->protocol != __constant_htons(ETH_P_IP)) {
1380 /* Not IP (i.e. ARP). Do not create route, if it is
1381 * invalid for proxy arp. DNAT routes are always valid.
1383 if (out_dev == in_dev && !(flags&RTCF_DNAT))
1384 goto e_inval;
1387 rth = dst_alloc(&ipv4_dst_ops);
1388 if (!rth)
1389 goto e_nobufs;
1391 atomic_set(&rth->u.dst.__refcnt, 1);
1392 rth->u.dst.flags= DST_HOST;
1393 rth->key.dst = daddr;
1394 rth->rt_dst = daddr;
1395 rth->key.tos = tos;
1396 #ifdef CONFIG_IP_ROUTE_FWMARK
1397 rth->key.fwmark = skb->nfmark;
1398 #endif
1399 rth->key.src = saddr;
1400 rth->rt_src = saddr;
1401 rth->rt_gateway = daddr;
1402 #ifdef CONFIG_IP_ROUTE_NAT
1403 rth->rt_src_map = key.src;
1404 rth->rt_dst_map = key.dst;
1405 if (flags&RTCF_DNAT)
1406 rth->rt_gateway = key.dst;
1407 #endif
1408 rth->rt_iif =
1409 rth->key.iif = dev->ifindex;
1410 rth->u.dst.dev = out_dev->dev;
1411 dev_hold(rth->u.dst.dev);
1412 rth->key.oif = 0;
1413 rth->rt_spec_dst= spec_dst;
1415 rth->u.dst.input = ip_forward;
1416 rth->u.dst.output = ip_output;
1418 rt_set_nexthop(rth, &res, itag);
1420 rth->rt_flags = flags;
1422 #ifdef CONFIG_NET_FASTROUTE
1423 if (netdev_fastroute && !(flags&(RTCF_NAT|RTCF_MASQ|RTCF_DOREDIRECT))) {
1424 struct net_device *odev = rth->u.dst.dev;
1425 if (odev != dev &&
1426 dev->accept_fastpath &&
1427 odev->mtu >= dev->mtu &&
1428 dev->accept_fastpath(dev, &rth->u.dst) == 0)
1429 rth->rt_flags |= RTCF_FAST;
1431 #endif
1433 intern:
1434 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1435 done:
1436 in_dev_put(in_dev);
1437 if (out_dev)
1438 in_dev_put(out_dev);
1439 if (free_res)
1440 fib_res_put(&res);
1441 return err;
1443 brd_input:
1444 if (skb->protocol != __constant_htons(ETH_P_IP))
1445 goto e_inval;
1447 if (ZERONET(saddr)) {
1448 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1449 } else {
1450 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag);
1451 if (err < 0)
1452 goto martian_source;
1453 if (err)
1454 flags |= RTCF_DIRECTSRC;
1456 flags |= RTCF_BROADCAST;
1457 res.type = RTN_BROADCAST;
1459 local_input:
1460 rth = dst_alloc(&ipv4_dst_ops);
1461 if (!rth)
1462 goto e_nobufs;
1464 rth->u.dst.output= ip_rt_bug;
1466 atomic_set(&rth->u.dst.__refcnt, 1);
1467 rth->u.dst.flags= DST_HOST;
1468 rth->key.dst = daddr;
1469 rth->rt_dst = daddr;
1470 rth->key.tos = tos;
1471 #ifdef CONFIG_IP_ROUTE_FWMARK
1472 rth->key.fwmark = skb->nfmark;
1473 #endif
1474 rth->key.src = saddr;
1475 rth->rt_src = saddr;
1476 #ifdef CONFIG_IP_ROUTE_NAT
1477 rth->rt_dst_map = key.dst;
1478 rth->rt_src_map = key.src;
1479 #endif
1480 #ifdef CONFIG_NET_CLS_ROUTE
1481 rth->u.dst.tclassid = itag;
1482 #endif
1483 rth->rt_iif =
1484 rth->key.iif = dev->ifindex;
1485 rth->u.dst.dev = &loopback_dev;
1486 dev_hold(rth->u.dst.dev);
1487 rth->key.oif = 0;
1488 rth->rt_gateway = daddr;
1489 rth->rt_spec_dst= spec_dst;
1490 rth->u.dst.input= ip_local_deliver;
1491 rth->rt_flags = flags|RTCF_LOCAL;
1492 if (res.type == RTN_UNREACHABLE) {
1493 rth->u.dst.input= ip_error;
1494 rth->u.dst.error= -err;
1495 rth->rt_flags &= ~RTCF_LOCAL;
1497 rth->rt_type = res.type;
1498 goto intern;
1500 no_route:
1501 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1502 res.type = RTN_UNREACHABLE;
1503 goto local_input;
1506 * Do not cache martian addresses: they should be logged (RFC1812)
1508 martian_destination:
1509 #ifdef CONFIG_IP_ROUTE_VERBOSE
1510 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1511 printk(KERN_WARNING "martian destination %u.%u.%u.%u from %u.%u.%u.%u, dev %s\n",
1512 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1513 #endif
1514 e_inval:
1515 err = -EINVAL;
1516 goto done;
1518 e_nobufs:
1519 err = -ENOBUFS;
1520 goto done;
1522 martian_source:
1523 #ifdef CONFIG_IP_ROUTE_VERBOSE
1524 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1526 * RFC1812 recommendation: if the source is martian,
1527 * the only hint is the MAC header.
1529 printk(KERN_WARNING "martian source %u.%u.%u.%u from %u.%u.%u.%u, on dev %s\n",
1530 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1531 if (dev->hard_header_len) {
1532 int i;
1533 unsigned char *p = skb->mac.raw;
1534 printk(KERN_WARNING "ll header: ");
1535 for (i=0; i<dev->hard_header_len; i++, p++) {
1536 printk("%02x", *p);
1537 if(i<(dev->hard_header_len-1))
1538 printk(":");
1540 printk("\n");
1543 #endif
1544 goto e_inval;
1547 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
1548 u8 tos, struct net_device *dev)
1550 struct rtable * rth;
1551 unsigned hash;
1552 int iif = dev->ifindex;
1554 tos &= IPTOS_RT_MASK;
1555 hash = rt_hash_code(daddr, saddr^(iif<<5), tos);
1557 read_lock(&rt_hash_table[hash].lock);
1558 for (rth=rt_hash_table[hash].chain; rth; rth=rth->u.rt_next) {
1559 if (rth->key.dst == daddr &&
1560 rth->key.src == saddr &&
1561 rth->key.iif == iif &&
1562 rth->key.oif == 0 &&
1563 #ifdef CONFIG_IP_ROUTE_FWMARK
1564 rth->key.fwmark == skb->nfmark &&
1565 #endif
1566 rth->key.tos == tos) {
1567 rth->u.dst.lastuse = jiffies;
1568 dst_hold(&rth->u.dst);
1569 rth->u.dst.__use++;
1570 read_unlock(&rt_hash_table[hash].lock);
1571 skb->dst = (struct dst_entry*)rth;
1572 return 0;
1575 read_unlock(&rt_hash_table[hash].lock);
1577 /* Multicast recognition logic has been moved from the route cache to here.
1578 The problem was that too many Ethernet cards have broken/missing
1579 hardware multicast filters :-( As a result, a host on a multicast
1580 network acquires a lot of useless route cache entries, e.g. from
1581 SDR messages from all over the world. Now we try to get rid of them.
1582 Really, provided the software IP multicast filter is organized
1583 reasonably (at least, hashed), it does not result in a slowdown
1584 compared with route cache reject entries.
1585 Note that multicast routers are not affected, because
1586 a route cache entry is created for them eventually.
1588 if (MULTICAST(daddr)) {
1589 struct in_device *in_dev;
1591 read_lock(&inetdev_lock);
1592 if ((in_dev = __in_dev_get(dev)) != NULL) {
1593 int our = ip_check_mc(in_dev, daddr);
1594 if (our
1595 #ifdef CONFIG_IP_MROUTE
1596 || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1597 #endif
1599 read_unlock(&inetdev_lock);
1600 return ip_route_input_mc(skb, daddr, saddr, tos, dev, our);
1603 read_unlock(&inetdev_lock);
1604 return -EINVAL;
1606 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
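/*
 * ip_route_input() is the per-packet fast path: one hash lookup under the
 * bucket's read lock, keyed on (daddr, saddr, iif, tos) plus the firewall
 * mark when CONFIG_IP_ROUTE_FWMARK is set.  Only a miss falls through to the
 * multicast check and then to ip_route_input_slow(), which consults the FIB
 * and inserts the freshly built entry via rt_intern_hash().
 */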
1610 * Major route resolver routine.
1613 int ip_route_output_slow(struct rtable **rp, const struct rt_key *oldkey)
1615 struct rt_key key;
1616 struct fib_result res;
1617 unsigned flags = 0;
1618 struct rtable *rth;
1619 struct net_device *dev_out = NULL;
1620 unsigned hash;
1621 int free_res = 0;
1622 int err;
1623 u32 tos;
1625 tos = oldkey->tos & (IPTOS_RT_MASK|RTO_ONLINK);
1626 key.dst = oldkey->dst;
1627 key.src = oldkey->src;
1628 key.tos = tos&IPTOS_RT_MASK;
1629 key.iif = loopback_dev.ifindex;
1630 key.oif = oldkey->oif;
1631 #ifdef CONFIG_IP_ROUTE_FWMARK
1632 key.fwmark = oldkey->fwmark;
1633 #endif
1634 key.scope = (tos&RTO_ONLINK) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
1635 res.fi = NULL;
1636 #ifdef CONFIG_IP_MULTIPLE_TABLES
1637 res.r = NULL;
1638 #endif
1640 if (oldkey->src) {
1641 if (MULTICAST(oldkey->src)
1642 || BADCLASS(oldkey->src)
1643 || ZERONET(oldkey->src))
1644 return -EINVAL;
1646 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1647 dev_out = ip_dev_find(oldkey->src);
1648 if (dev_out == NULL)
1649 return -EINVAL;
1651 /* I removed the check for oif == dev_out->oif here.
1652 It was wrong for two reasons:
1653 1. ip_dev_find(saddr) can return the wrong iface, if saddr is
1654 assigned to multiple interfaces.
1655 2. Moreover, we are allowed to send packets with the saddr
1656 of another iface. --ANK
1659 if (oldkey->oif == 0
1660 && (MULTICAST(oldkey->dst) || oldkey->dst == 0xFFFFFFFF)) {
1661 /* Special hack: the user can direct multicasts
1662 and limited broadcast via the necessary interface
1663 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1664 This hack is not just for fun, it allows
1665 vic, vat and friends to work.
1666 They bind a socket to loopback, set ttl to zero
1667 and expect that it will work.
1668 From the viewpoint of the routing cache they are broken,
1669 because we are not allowed to build a multicast path
1670 with a loopback source addr (look, the routing cache
1671 cannot know that ttl is zero, so that the packet
1672 will not leave this host and the route is valid).
1673 Luckily, this hack is a good workaround.
1676 key.oif = dev_out->ifindex;
1677 goto make_route;
1679 if (dev_out)
1680 dev_put(dev_out);
1681 dev_out = NULL;
1683 if (oldkey->oif) {
1684 dev_out = dev_get_by_index(oldkey->oif);
1685 if (dev_out == NULL)
1686 return -ENODEV;
1687 if (__in_dev_get(dev_out) == NULL) {
1688 dev_put(dev_out);
1689 return -ENODEV; /* Wrong error code */
1692 if (LOCAL_MCAST(oldkey->dst) || oldkey->dst == 0xFFFFFFFF) {
1693 if (!key.src)
1694 key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK);
1695 goto make_route;
1697 if (!key.src) {
1698 if (MULTICAST(oldkey->dst))
1699 key.src = inet_select_addr(dev_out, 0, key.scope);
1700 else if (!oldkey->dst)
1701 key.src = inet_select_addr(dev_out, 0, RT_SCOPE_HOST);
1705 if (!key.dst) {
1706 key.dst = key.src;
1707 if (!key.dst)
1708 key.dst = key.src = htonl(INADDR_LOOPBACK);
1709 if (dev_out)
1710 dev_put(dev_out);
1711 dev_out = &loopback_dev;
1712 dev_hold(dev_out);
1713 key.oif = loopback_dev.ifindex;
1714 res.type = RTN_LOCAL;
1715 flags |= RTCF_LOCAL;
1716 goto make_route;
1719 if (fib_lookup(&key, &res)) {
1720 res.fi = NULL;
1721 if (oldkey->oif) {
1722 /* Apparently, routing tables are wrong. Assume,
1723 that the destination is on link.
1725 WHY? DW.
1726 Because we are allowed to send to iface
1727 even if it has NO routes and NO assigned
1728 addresses. When oif is specified, routing
1729 tables are looked up with only one purpose:
1730 to catch if destination is gatewayed, rather than
1731 direct. Moreover, if MSG_DONTROUTE is set,
1732 we send packet, ignoring both routing tables
1733 and ifaddr state. --ANK
1736 We could make it even if oif is unknown,
1737 likely IPv6, but we do not.
1740 if (key.src == 0)
1741 key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK);
1742 res.type = RTN_UNICAST;
1743 goto make_route;
1745 if (dev_out)
1746 dev_put(dev_out);
1747 return -ENETUNREACH;
1749 free_res = 1;
1751 if (res.type == RTN_NAT)
1752 goto e_inval;
1754 if (res.type == RTN_LOCAL) {
1755 if (!key.src)
1756 key.src = key.dst;
1757 if (dev_out)
1758 dev_put(dev_out);
1759 dev_out = &loopback_dev;
1760 dev_hold(dev_out);
1761 key.oif = dev_out->ifindex;
1762 if (res.fi)
1763 fib_info_put(res.fi);
1764 res.fi = NULL;
1765 flags |= RTCF_LOCAL;
1766 goto make_route;
1769 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1770 if (res.fi->fib_nhs > 1 && key.oif == 0)
1771 fib_select_multipath(&key, &res);
1772 else
1773 #endif
1774 if (res.prefixlen==0 && res.type == RTN_UNICAST && key.oif == 0)
1775 fib_select_default(&key, &res);
1777 if (!key.src)
1778 key.src = FIB_RES_PREFSRC(res);
1780 if (dev_out)
1781 dev_put(dev_out);
1782 dev_out = FIB_RES_DEV(res);
1783 dev_hold(dev_out);
1784 key.oif = dev_out->ifindex;
1786 make_route:
1787 if (LOOPBACK(key.src) && !(dev_out->flags&IFF_LOOPBACK))
1788 goto e_inval;
1790 if (key.dst == 0xFFFFFFFF)
1791 res.type = RTN_BROADCAST;
1792 else if (MULTICAST(key.dst))
1793 res.type = RTN_MULTICAST;
1794 else if (BADCLASS(key.dst) || ZERONET(key.dst))
1795 goto e_inval;
1797 if (dev_out->flags&IFF_LOOPBACK)
1798 flags |= RTCF_LOCAL;
1800 if (res.type == RTN_BROADCAST) {
1801 flags |= RTCF_BROADCAST|RTCF_LOCAL;
1802 if (res.fi) {
1803 fib_info_put(res.fi);
1804 res.fi = NULL;
1806 } else if (res.type == RTN_MULTICAST) {
1807 flags |= RTCF_MULTICAST|RTCF_LOCAL;
1808 read_lock(&inetdev_lock);
1809 if (!__in_dev_get(dev_out) || !ip_check_mc(__in_dev_get(dev_out), oldkey->dst))
1810 flags &= ~RTCF_LOCAL;
1811 read_unlock(&inetdev_lock);
1812 /* If a multicast route does not exist, use the
1813 default one, but do not gateway in this case.
1814 Yes, it is a hack.
1816 if (res.fi && res.prefixlen < 4) {
1817 fib_info_put(res.fi);
1818 res.fi = NULL;
1822 rth = dst_alloc(&ipv4_dst_ops);
1823 if (!rth)
1824 goto e_nobufs;
1826 atomic_set(&rth->u.dst.__refcnt, 1);
1827 rth->u.dst.flags= DST_HOST;
1828 rth->key.dst = oldkey->dst;
1829 rth->key.tos = tos;
1830 rth->key.src = oldkey->src;
1831 rth->key.iif = 0;
1832 rth->key.oif = oldkey->oif;
1833 #ifdef CONFIG_IP_ROUTE_FWMARK
1834 rth->key.fwmark = oldkey->fwmark;
1835 #endif
1836 rth->rt_dst = key.dst;
1837 rth->rt_src = key.src;
1838 #ifdef CONFIG_IP_ROUTE_NAT
1839 rth->rt_dst_map = key.dst;
1840 rth->rt_src_map = key.src;
1841 #endif
1842 rth->rt_iif = oldkey->oif ? : dev_out->ifindex;
1843 rth->u.dst.dev = dev_out;
1844 dev_hold(dev_out);
1845 rth->rt_gateway = key.dst;
1846 rth->rt_spec_dst= key.src;
1848 rth->u.dst.output=ip_output;
1850 if (flags&RTCF_LOCAL) {
1851 rth->u.dst.input = ip_local_deliver;
1852 rth->rt_spec_dst = key.dst;
1854 if (flags&(RTCF_BROADCAST|RTCF_MULTICAST)) {
1855 rth->rt_spec_dst = key.src;
1856 if (flags&RTCF_LOCAL && !(dev_out->flags&IFF_LOOPBACK))
1857 rth->u.dst.output = ip_mc_output;
1858 #ifdef CONFIG_IP_MROUTE
1859 if (res.type == RTN_MULTICAST) {
1860 struct in_device *in_dev = in_dev_get(dev_out);
1861 if (in_dev) {
1862 if (IN_DEV_MFORWARD(in_dev) && !LOCAL_MCAST(oldkey->dst)) {
1863 rth->u.dst.input = ip_mr_input;
1864 rth->u.dst.output = ip_mc_output;
1866 in_dev_put(in_dev);
1869 #endif
1872 rt_set_nexthop(rth, &res, 0);
1874 rth->rt_flags = flags;
1876 hash = rt_hash_code(oldkey->dst, oldkey->src^(oldkey->oif<<5), tos);
1877 err = rt_intern_hash(hash, rth, rp);
1878 done:
1879 if (free_res)
1880 fib_res_put(&res);
1881 if (dev_out)
1882 dev_put(dev_out);
1883 return err;
1885 e_inval:
1886 err = -EINVAL;
1887 goto done;
1888 e_nobufs:
1889 err = -ENOBUFS;
1890 goto done;
1893 int ip_route_output_key(struct rtable **rp, const struct rt_key *key)
1895 unsigned hash;
1896 struct rtable *rth;
1898 hash = rt_hash_code(key->dst, key->src^(key->oif<<5), key->tos);
1900 read_lock_bh(&rt_hash_table[hash].lock);
1901 for (rth=rt_hash_table[hash].chain; rth; rth=rth->u.rt_next) {
1902 if (rth->key.dst == key->dst &&
1903 rth->key.src == key->src &&
1904 rth->key.iif == 0 &&
1905 rth->key.oif == key->oif &&
1906 #ifdef CONFIG_IP_ROUTE_FWMARK
1907 rth->key.fwmark == key->fwmark &&
1908 #endif
1909 !((rth->key.tos^key->tos)&(IPTOS_RT_MASK|RTO_ONLINK)) &&
1910 ((key->tos&RTO_TPROXY) || !(rth->rt_flags&RTCF_TPROXY))
1912 rth->u.dst.lastuse = jiffies;
1913 dst_hold(&rth->u.dst);
1914 rth->u.dst.__use++;
1915 read_unlock_bh(&rt_hash_table[hash].lock);
1916 *rp = rth;
1917 return 0;
1920 read_unlock_bh(&rt_hash_table[hash].lock);
1922 return ip_route_output_slow(rp, key);
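/*
 * Typical caller pattern, as a sketch (the ip_route_output() wrapper that
 * fills in the rt_key lives in <net/route.h>):
 *
 *	struct rtable *rt;
 *	int err = ip_route_output(&rt, daddr, saddr, RT_TOS(tos), oif);
 *	if (err)
 *		return err;
 *	...use rt->u.dst...
 *	ip_rt_put(rt);
 *
 * The reference taken by the cache hit above (or by rt_intern_hash() on the
 * slow path) must be released with ip_rt_put() when the caller is done.
 */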
1925 #ifdef CONFIG_RTNETLINK
1927 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int nowait)
1929 struct rtable *rt = (struct rtable*)skb->dst;
1930 struct rtmsg *r;
1931 struct nlmsghdr *nlh;
1932 unsigned char *b = skb->tail;
1933 struct rta_cacheinfo ci;
1934 #ifdef CONFIG_IP_MROUTE
1935 struct rtattr *eptr;
1936 #endif
1938 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
1939 r = NLMSG_DATA(nlh);
1940 nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
1941 r->rtm_family = AF_INET;
1942 r->rtm_dst_len = 32;
1943 r->rtm_src_len = 0;
1944 r->rtm_tos = rt->key.tos;
1945 r->rtm_table = RT_TABLE_MAIN;
1946 r->rtm_type = rt->rt_type;
1947 r->rtm_scope = RT_SCOPE_UNIVERSE;
1948 r->rtm_protocol = RTPROT_UNSPEC;
1949 r->rtm_flags = (rt->rt_flags&~0xFFFF) | RTM_F_CLONED;
1950 if (rt->rt_flags & RTCF_NOTIFY)
1951 r->rtm_flags |= RTM_F_NOTIFY;
1952 RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
1953 if (rt->key.src) {
1954 r->rtm_src_len = 32;
1955 RTA_PUT(skb, RTA_SRC, 4, &rt->key.src);
1957 if (rt->u.dst.dev)
1958 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
1959 #ifdef CONFIG_NET_CLS_ROUTE
1960 if (rt->u.dst.tclassid)
1961 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
1962 #endif
1963 if (rt->key.iif)
1964 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
1965 else if (rt->rt_src != rt->key.src)
1966 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
1967 if (rt->rt_dst != rt->rt_gateway)
1968 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
1969 if (rtnetlink_put_metrics(skb, &rt->u.dst.mxlock) < 0)
1970 goto rtattr_failure;
1971 ci.rta_lastuse = jiffies - rt->u.dst.lastuse;
1972 ci.rta_used = rt->u.dst.__use;
1973 ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
1974 if (rt->u.dst.expires)
1975 ci.rta_expires = rt->u.dst.expires - jiffies;
1976 else
1977 ci.rta_expires = 0;
1978 ci.rta_error = rt->u.dst.error;
1979 ci.rta_id = 0;
1980 ci.rta_ts = 0;
1981 ci.rta_tsage = 0;
1982 if (rt->peer) {
1983 ci.rta_id = rt->peer->ip_id_count;
1984 if (rt->peer->tcp_ts_stamp) {
1985 ci.rta_ts = rt->peer->tcp_ts;
1986 ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
1989 #ifdef CONFIG_IP_MROUTE
1990 eptr = (struct rtattr*)skb->tail;
1991 #endif
1992 RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
1993 if (rt->key.iif) {
1994 #ifdef CONFIG_IP_MROUTE
1995 u32 dst = rt->rt_dst;
1997 if (MULTICAST(dst) && !LOCAL_MCAST(dst) && ipv4_devconf.mc_forwarding) {
1998 int err = ipmr_get_route(skb, r, nowait);
1999 if (err <= 0) {
2000 if (!nowait) {
2001 if (err == 0)
2002 return 0;
2003 goto nlmsg_failure;
2004 } else {
2005 if (err == -EMSGSIZE)
2006 goto nlmsg_failure;
2007 ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2010 } else
2011 #endif
2013 RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->key.iif);
2017 nlh->nlmsg_len = skb->tail - b;
2018 return skb->len;
2020 nlmsg_failure:
2021 rtattr_failure:
2022 skb_trim(skb, b - skb->data);
2023 return -1;
2026 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2028 struct rtattr **rta = arg;
2029 struct rtmsg *rtm = NLMSG_DATA(nlh);
2030 struct rtable *rt = NULL;
2031 u32 dst = 0;
2032 u32 src = 0;
2033 int iif = 0;
2034 int err;
2035 struct sk_buff *skb;
2037 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2038 if (skb == NULL)
2039 return -ENOBUFS;
2041 /* Reserve room for dummy headers; this skb can pass
2042 through a good chunk of the routing engine.
2044 skb->mac.raw = skb->data;
2045 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2047 if (rta[RTA_SRC-1])
2048 memcpy(&src, RTA_DATA(rta[RTA_SRC-1]), 4);
2049 if (rta[RTA_DST-1])
2050 memcpy(&dst, RTA_DATA(rta[RTA_DST-1]), 4);
2051 if (rta[RTA_IIF-1])
2052 memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
2054 if (iif) {
2055 struct net_device *dev;
2056 dev = __dev_get_by_index(iif);
2057 if (!dev)
2058 return -ENODEV;
2059 skb->protocol = __constant_htons(ETH_P_IP);
2060 skb->dev = dev;
2061 local_bh_disable();
2062 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2063 local_bh_enable();
2064 rt = (struct rtable*)skb->dst;
2065 if (!err && rt->u.dst.error)
2066 err = -rt->u.dst.error;
2067 } else {
2068 int oif = 0;
2069 if (rta[RTA_OIF-1])
2070 memcpy(&oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
2071 err = ip_route_output(&rt, dst, src, rtm->rtm_tos, oif);
2073 if (err) {
2074 kfree_skb(skb);
2075 return err;
2078 skb->dst = &rt->u.dst;
2079 if (rtm->rtm_flags & RTM_F_NOTIFY)
2080 rt->rt_flags |= RTCF_NOTIFY;
2082 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2084 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, RTM_NEWROUTE, 0);
2085 if (err == 0)
2086 return 0;
2087 if (err < 0)
2088 return -EMSGSIZE;
2090 err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2091 if (err < 0)
2092 return err;
2093 return 0;
2097 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2099 struct rtable *rt;
2100 int h, s_h;
2101 int idx, s_idx;
2103 s_h = cb->args[0];
2104 s_idx = idx = cb->args[1];
2105 for (h=0; h <= rt_hash_mask; h++) {
2106 if (h < s_h) continue;
2107 if (h > s_h)
2108 s_idx = 0;
2109 read_lock_bh(&rt_hash_table[h].lock);
2110 for (rt = rt_hash_table[h].chain, idx = 0; rt; rt = rt->u.rt_next, idx++) {
2111 if (idx < s_idx)
2112 continue;
2113 skb->dst = dst_clone(&rt->u.dst);
2114 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2115 cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1) <= 0) {
2116 dst_release(xchg(&skb->dst, NULL));
2117 read_unlock_bh(&rt_hash_table[h].lock);
2118 goto done;
2120 dst_release(xchg(&skb->dst, NULL));
2122 read_unlock_bh(&rt_hash_table[h].lock);
2125 done:
2126 cb->args[0] = h;
2127 cb->args[1] = idx;
2128 return skb->len;
2131 #endif /* CONFIG_RTNETLINK */
2133 void ip_rt_multicast_event(struct in_device *in_dev)
2135 rt_cache_flush(0);
2140 #ifdef CONFIG_SYSCTL
2142 static int flush_delay;
2144 static
2145 int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2146 void *buffer, size_t *lenp)
2148 if (write) {
2149 proc_dointvec(ctl, write, filp, buffer, lenp);
2150 rt_cache_flush(flush_delay);
2151 return 0;
2152 } else
2153 return -EINVAL;
2156 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, int *name, int nlen,
2157 void *oldval, size_t *oldlenp,
2158 void *newval, size_t newlen,
2159 void **context)
2161 int delay;
2162 if (newlen != sizeof(int))
2163 return -EINVAL;
2164 if (get_user(delay,(int *)newval))
2165 return -EFAULT;
2166 rt_cache_flush(delay);
2167 return 0;
2170 ctl_table ipv4_route_table[] = {
2171 {NET_IPV4_ROUTE_FLUSH, "flush",
2172 &flush_delay, sizeof(int), 0644, NULL,
2173 &ipv4_sysctl_rtcache_flush, &ipv4_sysctl_rtcache_flush_strategy },
2174 {NET_IPV4_ROUTE_MIN_DELAY, "min_delay",
2175 &ip_rt_min_delay, sizeof(int), 0644, NULL,
2176 &proc_dointvec_jiffies, &sysctl_jiffies},
2177 {NET_IPV4_ROUTE_MAX_DELAY, "max_delay",
2178 &ip_rt_max_delay, sizeof(int), 0644, NULL,
2179 &proc_dointvec_jiffies, &sysctl_jiffies},
2180 {NET_IPV4_ROUTE_GC_THRESH, "gc_thresh",
2181 &ipv4_dst_ops.gc_thresh, sizeof(int), 0644, NULL,
2182 &proc_dointvec},
2183 {NET_IPV4_ROUTE_MAX_SIZE, "max_size",
2184 &ip_rt_max_size, sizeof(int), 0644, NULL,
2185 &proc_dointvec},
2186 {NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval",
2187 &ip_rt_gc_min_interval, sizeof(int), 0644, NULL,
2188 &proc_dointvec_jiffies, &sysctl_jiffies},
2189 {NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout",
2190 &ip_rt_gc_timeout, sizeof(int), 0644, NULL,
2191 &proc_dointvec_jiffies, &sysctl_jiffies},
2192 {NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval",
2193 &ip_rt_gc_interval, sizeof(int), 0644, NULL,
2194 &proc_dointvec_jiffies, &sysctl_jiffies},
2195 {NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load",
2196 &ip_rt_redirect_load, sizeof(int), 0644, NULL,
2197 &proc_dointvec},
2198 {NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number",
2199 &ip_rt_redirect_number, sizeof(int), 0644, NULL,
2200 &proc_dointvec},
2201 {NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence",
2202 &ip_rt_redirect_silence, sizeof(int), 0644, NULL,
2203 &proc_dointvec},
2204 {NET_IPV4_ROUTE_ERROR_COST, "error_cost",
2205 &ip_rt_error_cost, sizeof(int), 0644, NULL,
2206 &proc_dointvec},
2207 {NET_IPV4_ROUTE_ERROR_BURST, "error_burst",
2208 &ip_rt_error_burst, sizeof(int), 0644, NULL,
2209 &proc_dointvec},
2210 {NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity",
2211 &ip_rt_gc_elasticity, sizeof(int), 0644, NULL,
2212 &proc_dointvec},
2213 {NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires",
2214 &ip_rt_mtu_expires, sizeof(int), 0644, NULL,
2215 &proc_dointvec_jiffies, &sysctl_jiffies},
2216 {NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu",
2217 &ip_rt_min_pmtu, sizeof(int), 0644, NULL,
2218 &proc_dointvec},
2219 {NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss",
2220 &ip_rt_min_advmss, sizeof(int), 0644, NULL,
2221 &proc_dointvec},
2224 #endif
2226 #ifdef CONFIG_NET_CLS_ROUTE
2227 struct ip_rt_acct *ip_rt_acct;
2229 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
2230 int length, int *eof, void *data)
2232 *start=buffer;
2234 if ((offset&3) || (length&3))
2235 return -EIO;
2237 if (offset + length >= sizeof(struct ip_rt_acct)*256) {
2238 length = sizeof(struct ip_rt_acct)*256 - offset;
2239 *eof = 1;
2241 if (length > 0) {
2242 u32 *dst = (u32*)buffer;
2243 u32 *src = (u32*)(((u8*)ip_rt_acct) + offset);
2245 memcpy(dst, src, length);
2247 #ifdef CONFIG_SMP
2248 if (smp_num_cpus > 1 || cpu_logical_map(0) != 0) {
2249 int i;
2250 int cnt = length/4;
2252 for (i=0; i<smp_num_cpus; i++) {
2253 int cpu = cpu_logical_map(i);
2254 int k;
2256 if (cpu == 0)
2257 continue;
2259 src = (u32*)(((u8*)ip_rt_acct) + offset +
2260 cpu*256*sizeof(struct ip_rt_acct));
2262 for (k=0; k<cnt; k++)
2263 dst[k] += src[k];
2266 #endif
2267 return length;
2269 return 0;
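/*
 * ip_rt_acct is laid out as 256 struct ip_rt_acct slots per CPU (one slot
 * per routing realm), allocated in ip_rt_init() below.  The read handler
 * above copies the first CPU's slots and then adds in the other CPUs'
 * counters word by word, so /proc/net/rt_acct shows per-realm totals.
 */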
2271 #endif
2273 void __init ip_rt_init(void)
2275 int i, order, goal;
2277 #ifdef CONFIG_NET_CLS_ROUTE
2278 for (order=0;
2279 (PAGE_SIZE<<order) < 256*sizeof(struct ip_rt_acct)*NR_CPUS; order++)
2280 /* NOTHING */;
2281 ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
2282 if (!ip_rt_acct)
2283 panic("IP: failed to allocate ip_rt_acct\n");
2284 memset(ip_rt_acct, 0, PAGE_SIZE<<order);
2285 #endif
2287 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
2288 sizeof(struct rtable),
2289 0, SLAB_HWCACHE_ALIGN,
2290 NULL, NULL);
2292 if (!ipv4_dst_ops.kmem_cachep)
2293 panic("IP: failed to allocate ip_dst_cache\n");
2295 goal = num_physpages >> (26 - PAGE_SHIFT);
2297 for (order = 0; (1UL << order) < goal; order++)
2298 /* NOTHING */;
2300 do {
2301 rt_hash_mask = (1UL << order) * PAGE_SIZE /
2302 sizeof(struct rt_hash_bucket);
2303 while (rt_hash_mask & (rt_hash_mask-1))
2304 rt_hash_mask--;
2305 rt_hash_table = (struct rt_hash_bucket *)
2306 __get_free_pages(GFP_ATOMIC, order);
2307 } while (rt_hash_table == NULL && --order > 0);
2309 if (!rt_hash_table)
2310 panic("Failed to allocate IP route cache hash table\n");
2312 printk("IP: routing cache hash table of %u buckets, %ldKbytes\n",
2313 rt_hash_mask,
2314 (long) (rt_hash_mask*sizeof(struct rt_hash_bucket))/1024);
2316 for (rt_hash_log=0; (1<<rt_hash_log) != rt_hash_mask; rt_hash_log++)
2317 /* NOTHING */;
2319 rt_hash_mask--;
2320 for (i = 0; i <= rt_hash_mask; i++) {
2321 rt_hash_table[i].lock = RW_LOCK_UNLOCKED;
2322 rt_hash_table[i].chain = NULL;
2325 ipv4_dst_ops.gc_thresh = (rt_hash_mask+1);
2326 ip_rt_max_size = (rt_hash_mask+1)*16;
2328 devinet_init();
2329 ip_fib_init();
2331 rt_flush_timer.function = rt_run_flush;
2332 rt_periodic_timer.function = rt_check_expire;
2334 /* All the timers started at system startup tend
2335 to synchronize. Perturb this one a bit.
2337 rt_periodic_timer.expires = jiffies + net_random()%ip_rt_gc_interval
2338 + ip_rt_gc_interval;
2339 add_timer(&rt_periodic_timer);
2341 proc_net_create ("rt_cache", 0, rt_cache_get_info);
2342 #ifdef CONFIG_NET_CLS_ROUTE
2343 create_proc_read_entry("net/rt_acct", 0, 0, ip_rt_acct_read, NULL);
2344 #endif