/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 Forwarding Information Base: semantics.
 *
 * Version:	$Id: fib_semantics.c,v 1.19 2002/01/12 07:54:56 davem Exp $
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#include <linux/config.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/init.h>

#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>

#include "fib_lookup.h"

#define FSprintk(a...)
static rwlock_t fib_info_lock = RW_LOCK_UNLOCKED;
static struct hlist_head *fib_info_hash;
static struct hlist_head *fib_info_laddrhash;
static unsigned int fib_hash_size;
static unsigned int fib_info_cnt;

#define DEVINDEX_HASHBITS 8
#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
#ifdef CONFIG_IP_ROUTE_MULTIPATH

static spinlock_t fib_multipath_lock = SPIN_LOCK_UNLOCKED;

#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)

#define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \
for (nhsel=0, nh = (struct fib_nh*)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/* Hope that gcc will optimize this to get rid of the dummy loop */

#define for_nexthops(fi) { int nhsel=0; const struct fib_nh * nh = (fi)->fib_nh; \
for (nhsel=0; nhsel < 1; nhsel++)

#define change_nexthops(fi) { int nhsel=0; struct fib_nh * nh = (struct fib_nh*)((fi)->fib_nh); \
for (nhsel=0; nhsel < 1; nhsel++)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

#define endfor_nexthops(fi) }
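/*
 * Usage sketch (illustrative, not part of the original file): the
 * iteration macros open a scope in for_nexthops()/change_nexthops()
 * and close it in endfor_nexthops(), so callers write:
 *
 *	for_nexthops(fi) {
 *		if (nh->nh_flags & RTNH_F_DEAD)
 *			continue;
 *		use_nexthop(nh, nhsel);	// use_nexthop() is hypothetical
 *	} endfor_nexthops(fi);
 *
 * Both `nh' and `nhsel' are declared by the opening macro itself; with
 * CONFIG_IP_ROUTE_MULTIPATH unset the body runs exactly once.
 */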
static struct
{
	int	error;
	u8	scope;
} fib_props[RTA_MAX + 1] = {
	{
		.error	= 0,
		.scope	= RT_SCOPE_NOWHERE,
	},	/* RTN_UNSPEC */
	{
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_UNICAST */
	{
		.error	= 0,
		.scope	= RT_SCOPE_HOST,
	},	/* RTN_LOCAL */
	{
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
	},	/* RTN_BROADCAST */
	{
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
	},	/* RTN_ANYCAST */
	{
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_MULTICAST */
	{
		.error	= -EINVAL,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_BLACKHOLE */
	{
		.error	= -EHOSTUNREACH,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_UNREACHABLE */
	{
		.error	= -EACCES,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_PROHIBIT */
	{
		.error	= -EAGAIN,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_THROW */
	{
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
	},	/* RTN_NAT */
	{
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
	},	/* RTN_XRESOLVE */
};
/* Release a nexthop info record */

void free_fib_info(struct fib_info *fi)
{
	if (fi->fib_dead == 0) {
		printk("Freeing alive fib_info %p\n", fi);
		return;
	}
	change_nexthops(fi) {
		if (nh->nh_dev)
			dev_put(nh->nh_dev);
		nh->nh_dev = NULL;
	} endfor_nexthops(fi);
	fib_info_cnt--;
	kfree(fi);
}
void fib_release_info(struct fib_info *fi)
{
	write_lock(&fib_info_lock);
	if (fi && --fi->fib_treeref == 0) {
		hlist_del(&fi->fib_hash);
		if (fi->fib_prefsrc)
			hlist_del(&fi->fib_lhash);
		change_nexthops(fi) {
			hlist_del(&nh->nh_hash);
		} endfor_nexthops(fi)
		fi->fib_dead = 1;
		fib_info_put(fi);
	}
	write_unlock(&fib_info_lock);
}
static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
{
	const struct fib_nh *onh = ofi->fib_nh;

	for_nexthops(fi) {
		if (nh->nh_oif != onh->nh_oif ||
		    nh->nh_gw != onh->nh_gw ||
		    nh->nh_scope != onh->nh_scope ||
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		    nh->nh_weight != onh->nh_weight ||
#endif
#ifdef CONFIG_NET_CLS_ROUTE
		    nh->nh_tclassid != onh->nh_tclassid ||
#endif
		    ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
			return -1;
		onh++;
	} endfor_nexthops(fi);
	return 0;
}
static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
{
	unsigned int mask = (fib_hash_size - 1);
	unsigned int val = fi->fib_nhs;

	val ^= fi->fib_protocol;
	val ^= fi->fib_prefsrc;
	val ^= fi->fib_priority;

	return (val ^ (val >> 7) ^ (val >> 12)) & mask;
}
static struct fib_info *fib_find_info(const struct fib_info *nfi)
{
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_info *fi;
	unsigned int hash;

	hash = fib_info_hashfn(nfi);
	head = &fib_info_hash[hash];

	hlist_for_each_entry(fi, node, head, fib_hash) {
		if (fi->fib_nhs != nfi->fib_nhs)
			continue;
		if (nfi->fib_protocol == fi->fib_protocol &&
		    nfi->fib_prefsrc == fi->fib_prefsrc &&
		    nfi->fib_priority == fi->fib_priority &&
		    memcmp(nfi->fib_metrics, fi->fib_metrics,
			   sizeof(fi->fib_metrics)) == 0 &&
		    ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
		    (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
			return fi;
	}

	return NULL;
}
static inline unsigned int fib_devindex_hashfn(unsigned int val)
{
	unsigned int mask = DEVINDEX_HASHSIZE - 1;

	return (val ^
		(val >> DEVINDEX_HASHBITS) ^
		(val >> (DEVINDEX_HASHBITS * 2))) & mask;
}
/* Check that the gateway is already configured.
   Used only by the redirect acceptance routine.
 */

int ip_fib_check_default(u32 gw, struct net_device *dev)
{
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_nh *nh;
	unsigned int hash;

	read_lock(&fib_info_lock);

	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
	hlist_for_each_entry(nh, node, head, nh_hash) {
		if (nh->nh_dev == dev &&
		    nh->nh_gw == gw &&
		    !(nh->nh_flags&RTNH_F_DEAD)) {
			read_unlock(&fib_info_lock);
			return 0;
		}
	}

	read_unlock(&fib_info_lock);

	return -1;
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH

static u32 fib_get_attr32(struct rtattr *attr, int attrlen, int type)
{
	while (RTA_OK(attr,attrlen)) {
		if (attr->rta_type == type)
			return *(u32*)RTA_DATA(attr);
		attr = RTA_NEXT(attr, attrlen);
	}
	return 0;
}

static int
fib_count_nexthops(struct rtattr *rta)
{
	int nhs = 0;
	struct rtnexthop *nhp = RTA_DATA(rta);
	int nhlen = RTA_PAYLOAD(rta);

	while (nhlen >= (int)sizeof(struct rtnexthop)) {
		if ((nhlen -= nhp->rtnh_len) < 0)
			return 0;
		nhs++;
		nhp = RTNH_NEXT(nhp);
	}
	return nhs;
}

static int
fib_get_nhs(struct fib_info *fi, const struct rtattr *rta, const struct rtmsg *r)
{
	struct rtnexthop *nhp = RTA_DATA(rta);
	int nhlen = RTA_PAYLOAD(rta);

	change_nexthops(fi) {
		int attrlen = nhlen - sizeof(struct rtnexthop);
		if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
			return -EINVAL;
		nh->nh_flags = (r->rtm_flags&~0xFF) | nhp->rtnh_flags;
		nh->nh_oif = nhp->rtnh_ifindex;
		nh->nh_weight = nhp->rtnh_hops + 1;
		if (attrlen) {
			nh->nh_gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY);
#ifdef CONFIG_NET_CLS_ROUTE
			nh->nh_tclassid = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW);
#endif
		}
		nhp = RTNH_NEXT(nhp);
	} endfor_nexthops(fi);
	return 0;
}

#endif
int fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct kern_rta *rta,
		 struct fib_info *fi)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	struct rtnexthop *nhp;
	int nhlen;
#endif

	if (rta->rta_priority &&
	    *rta->rta_priority != fi->fib_priority)
		return 1;

	if (rta->rta_oif || rta->rta_gw) {
		if ((!rta->rta_oif || *rta->rta_oif == fi->fib_nh->nh_oif) &&
		    (!rta->rta_gw || memcmp(rta->rta_gw, &fi->fib_nh->nh_gw, 4) == 0))
			return 0;
		return 1;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (rta->rta_mp == NULL)
		return 0;
	nhp = RTA_DATA(rta->rta_mp);
	nhlen = RTA_PAYLOAD(rta->rta_mp);

	for_nexthops(fi) {
		int attrlen = nhlen - sizeof(struct rtnexthop);
		u32 gw;

		if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
			return -EINVAL;
		if (nhp->rtnh_ifindex && nhp->rtnh_ifindex != nh->nh_oif)
			return 1;
		if (attrlen) {
			gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY);
			if (gw && gw != nh->nh_gw)
				return 1;
#ifdef CONFIG_NET_CLS_ROUTE
			gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW);
			if (gw && gw != nh->nh_tclassid)
				return 1;
#endif
		}
		nhp = RTNH_NEXT(nhp);
	} endfor_nexthops(fi);
#endif
	return 0;
}
/*
   Picture
   -------

   The semantics of nexthops are very messy, for historical reasons.
   We have to take into account that:

   a) the gateway can actually be a local interface address,
      so that the gatewayed route is direct.
   b) the gateway must be an on-link address, possibly
      described not by an ifaddr, but also by a direct route.
   c) if both gateway and interface are specified, they must not
      contradict each other.
   d) if we use tunnel routes, the gateway may be off-link.

   Attempting to reconcile all of these (alas, self-contradictory)
   conditions results in pretty ugly and hairy code with obscure logic.

   I chose to generalize it instead, so that the size of the code
   hardly increases, but it becomes much more general.

   Every prefix is assigned a "scope" value: "host" is a local address,
   "link" is a direct route,
   [ ... "site" ... "interior" ... ]
   and "universe" is a true gateway route with global meaning.

   Every prefix refers to a set of "nexthop"s (gw, oif),
   where gw must have narrower scope. This recursion stops
   when gw has LOCAL scope or when "nexthop" is declared ONLINK,
   which means that gw is forced to be on-link.

   The code is still hairy, but now it is apparently logically
   consistent and very flexible. E.g. as a by-product it allows
   independent exterior and interior routing processes to coexist
   in peace.

   Normally it looks like the following:

   {universe prefix}  -> (gw, oif) [scope link]
			  |
			  |-> {link prefix} -> (gw, oif) [scope local]
						|
						|-> {local prefix} (terminal node)
 */
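/*
 * Worked example (illustrative addresses, not from the original file).
 * Suppose the table contains:
 *
 *	local 10.0.0.1 dev eth0            scope host    (terminal)
 *	10.0.0.0/24 dev eth0               scope link    (direct route)
 *	default via 10.0.0.254 dev eth0    scope universe
 *
 * Validating the default route's nexthop looks up 10.0.0.254 at a
 * narrower scope (link): that lookup hits 10.0.0.0/24, whose own
 * nexthop has local scope, so the recursion terminates one level down.
 */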
static int fib_check_nh(const struct rtmsg *r, struct fib_info *fi, struct fib_nh *nh)
{
	int err;

	if (nh->nh_gw) {
		struct fib_result res;

#ifdef CONFIG_IP_ROUTE_PERVASIVE
		if (nh->nh_flags&RTNH_F_PERVASIVE)
			return 0;
#endif
		if (nh->nh_flags&RTNH_F_ONLINK) {
			struct net_device *dev;

			if (r->rtm_scope >= RT_SCOPE_LINK)
				return -EINVAL;
			if (inet_addr_type(nh->nh_gw) != RTN_UNICAST)
				return -EINVAL;
			if ((dev = __dev_get_by_index(nh->nh_oif)) == NULL)
				return -ENODEV;
			if (!(dev->flags&IFF_UP))
				return -ENETDOWN;
			nh->nh_dev = dev;
			dev_hold(dev);
			nh->nh_scope = RT_SCOPE_LINK;
			return 0;
		}
		{
			struct flowi fl = { .nl_u = { .ip4_u =
						      { .daddr = nh->nh_gw,
							.scope = r->rtm_scope + 1 } },
					    .oif = nh->nh_oif };

			/* It is not necessary, but requires a bit of thinking */
			if (fl.fl4_scope < RT_SCOPE_LINK)
				fl.fl4_scope = RT_SCOPE_LINK;
			if ((err = fib_lookup(&fl, &res)) != 0)
				return err;
		}
		err = -EINVAL;
		if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
			goto out;
		nh->nh_scope = res.scope;
		nh->nh_oif = FIB_RES_OIF(res);
		if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
			goto out;
		dev_hold(nh->nh_dev);
		err = -ENETDOWN;
		if (!(nh->nh_dev->flags & IFF_UP))
			goto out;
		err = 0;
out:
		fib_res_put(&res);
		return err;
	} else {
		struct in_device *in_dev;

		if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
			return -EINVAL;

		in_dev = inetdev_by_index(nh->nh_oif);
		if (in_dev == NULL)
			return -ENODEV;
		if (!(in_dev->dev->flags&IFF_UP)) {
			in_dev_put(in_dev);
			return -ENETDOWN;
		}
		nh->nh_dev = in_dev->dev;
		dev_hold(nh->nh_dev);
		nh->nh_scope = RT_SCOPE_HOST;
		in_dev_put(in_dev);
	}
	return 0;
}
static inline unsigned int fib_laddr_hashfn(u32 val)
{
	unsigned int mask = (fib_hash_size - 1);

	return (val ^ (val >> 7) ^ (val >> 14)) & mask;
}
static struct hlist_head *fib_hash_alloc(int bytes)
{
	if (bytes <= PAGE_SIZE)
		return kmalloc(bytes, GFP_KERNEL);
	else
		return (struct hlist_head *)
			__get_free_pages(GFP_KERNEL, get_order(bytes));
}

static void fib_hash_free(struct hlist_head *hash, int bytes)
{
	if (!hash)
		return;

	if (bytes <= PAGE_SIZE)
		kfree(hash);
	else
		free_pages((unsigned long) hash, get_order(bytes));
}
static void fib_hash_move(struct hlist_head *new_info_hash,
			  struct hlist_head *new_laddrhash,
			  unsigned int new_size)
{
	unsigned int old_size = fib_hash_size;
	unsigned int i;

	write_lock(&fib_info_lock);
	fib_hash_size = new_size;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *head = &fib_info_hash[i];
		struct hlist_node *node, *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
			struct hlist_head *dest;
			unsigned int new_hash;

			hlist_del(&fi->fib_hash);

			new_hash = fib_info_hashfn(fi);
			dest = &new_info_hash[new_hash];
			hlist_add_head(&fi->fib_hash, dest);
		}
	}
	fib_info_hash = new_info_hash;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *lhead = &fib_info_laddrhash[i];
		struct hlist_node *node, *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
			struct hlist_head *ldest;
			unsigned int new_hash;

			hlist_del(&fi->fib_lhash);

			new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
			ldest = &new_laddrhash[new_hash];
			hlist_add_head(&fi->fib_lhash, ldest);
		}
	}
	fib_info_laddrhash = new_laddrhash;

	write_unlock(&fib_info_lock);
}
struct fib_info *
fib_create_info(const struct rtmsg *r, struct kern_rta *rta,
		const struct nlmsghdr *nlh, int *errp)
{
	int err;
	struct fib_info *fi = NULL;
	struct fib_info *ofi;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	int nhs = 1;
#else
	const int nhs = 1;
#endif

	/* Fast check to catch the most weird cases */
	if (fib_props[r->rtm_type].scope > r->rtm_scope)
		goto err_inval;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (rta->rta_mp) {
		nhs = fib_count_nexthops(rta->rta_mp);
		if (nhs == 0)
			goto err_inval;
	}
#endif

	err = -ENOBUFS;
	if (fib_info_cnt >= fib_hash_size) {
		unsigned int new_size = fib_hash_size << 1;
		struct hlist_head *new_info_hash;
		struct hlist_head *new_laddrhash;
		unsigned int bytes;

		if (!new_size)
			new_size = 1;
		bytes = new_size * sizeof(struct hlist_head *);
		new_info_hash = fib_hash_alloc(bytes);
		new_laddrhash = fib_hash_alloc(bytes);
		if (!new_info_hash || !new_laddrhash) {
			fib_hash_free(new_info_hash, bytes);
			fib_hash_free(new_laddrhash, bytes);
		} else {
			memset(new_info_hash, 0, bytes);
			memset(new_laddrhash, 0, bytes);

			fib_hash_move(new_info_hash, new_laddrhash, new_size);
		}

		if (!fib_hash_size)
			goto failure;
	}

	fi = kmalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
	if (fi == NULL)
		goto failure;
	fib_info_cnt++;
	memset(fi, 0, sizeof(*fi)+nhs*sizeof(struct fib_nh));

	fi->fib_protocol = r->rtm_protocol;

	fi->fib_nhs = nhs;
	change_nexthops(fi) {
		nh->nh_parent = fi;
	} endfor_nexthops(fi)

	fi->fib_flags = r->rtm_flags;
	if (rta->rta_priority)
		fi->fib_priority = *rta->rta_priority;
	if (rta->rta_mx) {
		int attrlen = RTA_PAYLOAD(rta->rta_mx);
		struct rtattr *attr = RTA_DATA(rta->rta_mx);

		while (RTA_OK(attr, attrlen)) {
			unsigned flavor = attr->rta_type;
			if (flavor) {
				if (flavor > RTAX_MAX)
					goto err_inval;
				fi->fib_metrics[flavor-1] = *(unsigned*)RTA_DATA(attr);
			}
			attr = RTA_NEXT(attr, attrlen);
		}
	}
	if (rta->rta_prefsrc)
		memcpy(&fi->fib_prefsrc, rta->rta_prefsrc, 4);

	if (rta->rta_mp) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		if ((err = fib_get_nhs(fi, rta->rta_mp, r)) != 0)
			goto failure;
		if (rta->rta_oif && fi->fib_nh->nh_oif != *rta->rta_oif)
			goto err_inval;
		if (rta->rta_gw && memcmp(&fi->fib_nh->nh_gw, rta->rta_gw, 4))
			goto err_inval;
#ifdef CONFIG_NET_CLS_ROUTE
		if (rta->rta_flow && memcmp(&fi->fib_nh->nh_tclassid, rta->rta_flow, 4))
			goto err_inval;
#endif
#else
		goto err_inval;
#endif
	} else {
		struct fib_nh *nh = fi->fib_nh;
		if (rta->rta_oif)
			nh->nh_oif = *rta->rta_oif;
		if (rta->rta_gw)
			memcpy(&nh->nh_gw, rta->rta_gw, 4);
#ifdef CONFIG_NET_CLS_ROUTE
		if (rta->rta_flow)
			memcpy(&nh->nh_tclassid, rta->rta_flow, 4);
#endif
		nh->nh_flags = r->rtm_flags;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		nh->nh_weight = 1;
#endif
	}

	if (fib_props[r->rtm_type].error) {
		if (rta->rta_gw || rta->rta_oif || rta->rta_mp)
			goto err_inval;
		goto link_it;
	}

	if (r->rtm_scope > RT_SCOPE_HOST)
		goto err_inval;

	if (r->rtm_scope == RT_SCOPE_HOST) {
		struct fib_nh *nh = fi->fib_nh;

		/* Local address is added. */
		if (nhs != 1 || nh->nh_gw)
			goto err_inval;
		nh->nh_scope = RT_SCOPE_NOWHERE;
		nh->nh_dev = dev_get_by_index(fi->fib_nh->nh_oif);
		err = -ENODEV;
		if (nh->nh_dev == NULL)
			goto failure;
	} else {
		change_nexthops(fi) {
			if ((err = fib_check_nh(r, fi, nh)) != 0)
				goto failure;
		} endfor_nexthops(fi)
	}

	if (fi->fib_prefsrc) {
		if (r->rtm_type != RTN_LOCAL || rta->rta_dst == NULL ||
		    memcmp(&fi->fib_prefsrc, rta->rta_dst, 4))
			if (inet_addr_type(fi->fib_prefsrc) != RTN_LOCAL)
				goto err_inval;
	}

link_it:
	if ((ofi = fib_find_info(fi)) != NULL) {
		fi->fib_dead = 1;
		free_fib_info(fi);
		ofi->fib_treeref++;
		return ofi;
	}

	fi->fib_treeref++;
	atomic_inc(&fi->fib_clntref);
	write_lock(&fib_info_lock);
	hlist_add_head(&fi->fib_hash,
		       &fib_info_hash[fib_info_hashfn(fi)]);
	if (fi->fib_prefsrc) {
		struct hlist_head *head;

		head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
		hlist_add_head(&fi->fib_lhash, head);
	}
	change_nexthops(fi) {
		struct hlist_head *head;
		unsigned int hash;

		if (!nh->nh_dev)
			continue;
		hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
		head = &fib_info_devhash[hash];
		hlist_add_head(&nh->nh_hash, head);
	} endfor_nexthops(fi)
	write_unlock(&fib_info_lock);
	return fi;

err_inval:
	err = -EINVAL;

failure:
	*errp = err;
	if (fi) {
		fi->fib_dead = 1;
		free_fib_info(fi);
	}
	return NULL;
}
int fib_semantic_match(struct list_head *head, const struct flowi *flp,
		       struct fib_result *res, int prefixlen)
{
	struct fib_alias *fa;
	int nh_sel = 0;

	list_for_each_entry(fa, head, fa_list) {
		int err;

		if (fa->fa_tos &&
		    fa->fa_tos != flp->fl4_tos)
			continue;

		if (fa->fa_scope < flp->fl4_scope)
			continue;

		fa->fa_state |= FA_S_ACCESSED;

		err = fib_props[fa->fa_type].error;
		if (err == 0) {
			struct fib_info *fi = fa->fa_info;

			if (fi->fib_flags & RTNH_F_DEAD)
				continue;

			switch (fa->fa_type) {
			case RTN_UNICAST:
			case RTN_LOCAL:
			case RTN_BROADCAST:
			case RTN_ANYCAST:
			case RTN_MULTICAST:
				for_nexthops(fi) {
					if (nh->nh_flags&RTNH_F_DEAD)
						continue;
					if (!flp->oif || flp->oif == nh->nh_oif)
						break;
				}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
				if (nhsel < fi->fib_nhs) {
					nh_sel = nhsel;
					goto out_fill_res;
				}
#else
				if (nhsel < 1) {
					goto out_fill_res;
				}
#endif
				endfor_nexthops(fi);
				continue;

			default:
				printk(KERN_DEBUG "impossible 102\n");
				return -EINVAL;
			}
		}
		return err;
	}
	return 1;

out_fill_res:
	res->prefixlen = prefixlen;
	res->nh_sel = nh_sel;
	res->type = fa->fa_type;
	res->scope = fa->fa_scope;
	res->fi = fa->fa_info;
	atomic_inc(&res->fi->fib_clntref);
	return 0;
}
/* Find the appropriate source address for this destination */

u32 __fib_res_prefsrc(struct fib_result *res)
{
	return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
}
int
fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
	      u8 tb_id, u8 type, u8 scope, void *dst, int dst_len, u8 tos,
	      struct fib_info *fi)
{
	struct rtmsg *rtm;
	struct nlmsghdr *nlh;
	unsigned char *b = skb->tail;

	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*rtm));
	rtm = NLMSG_DATA(nlh);
	rtm->rtm_family = AF_INET;
	rtm->rtm_dst_len = dst_len;
	rtm->rtm_src_len = 0;
	rtm->rtm_tos = tos;
	rtm->rtm_table = tb_id;
	rtm->rtm_type = type;
	rtm->rtm_flags = fi->fib_flags;
	rtm->rtm_scope = scope;
	if (rtm->rtm_dst_len)
		RTA_PUT(skb, RTA_DST, 4, dst);
	rtm->rtm_protocol = fi->fib_protocol;
	if (fi->fib_priority)
		RTA_PUT(skb, RTA_PRIORITY, 4, &fi->fib_priority);
#ifdef CONFIG_NET_CLS_ROUTE
	if (fi->fib_nh[0].nh_tclassid)
		RTA_PUT(skb, RTA_FLOW, 4, &fi->fib_nh[0].nh_tclassid);
#endif
	if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
		goto rtattr_failure;
	if (fi->fib_prefsrc)
		RTA_PUT(skb, RTA_PREFSRC, 4, &fi->fib_prefsrc);
	if (fi->fib_nhs == 1) {
		if (fi->fib_nh->nh_gw)
			RTA_PUT(skb, RTA_GATEWAY, 4, &fi->fib_nh->nh_gw);
		if (fi->fib_nh->nh_oif)
			RTA_PUT(skb, RTA_OIF, sizeof(int), &fi->fib_nh->nh_oif);
	}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (fi->fib_nhs > 1) {
		struct rtnexthop *nhp;
		struct rtattr *mp_head;
		if (skb_tailroom(skb) <= RTA_SPACE(0))
			goto rtattr_failure;
		mp_head = (struct rtattr*)skb_put(skb, RTA_SPACE(0));

		for_nexthops(fi) {
			if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
				goto rtattr_failure;
			nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
			nhp->rtnh_flags = nh->nh_flags & 0xFF;
			nhp->rtnh_hops = nh->nh_weight-1;
			nhp->rtnh_ifindex = nh->nh_oif;
			if (nh->nh_gw)
				RTA_PUT(skb, RTA_GATEWAY, 4, &nh->nh_gw);
			nhp->rtnh_len = skb->tail - (unsigned char*)nhp;
		} endfor_nexthops(fi);
		mp_head->rta_type = RTA_MULTIPATH;
		mp_head->rta_len = skb->tail - (u8*)mp_head;
	}
#endif
	nlh->nlmsg_len = skb->tail - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}
#ifndef CONFIG_IP_NOSIOCRT

int
fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm,
		    struct kern_rta *rta, struct rtentry *r)
{
	int plen;
	u32 *ptr;

	memset(rtm, 0, sizeof(*rtm));
	memset(rta, 0, sizeof(*rta));

	if (r->rt_dst.sa_family != AF_INET)
		return -EAFNOSUPPORT;

	/* Check the mask for validity:
	   a) it must be contiguous.
	   b) the destination must have all host bits clear.
	   c) if the application forgot to set the correct family (AF_INET),
	      reject the request unless it is absolutely clear, i.e.
	      both family and mask are zero.
	 */
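	/* A minimal sketch (an assumption about bad_mask(), which is
	   defined elsewhere) of the contiguity test it performs: a
	   netmask is contiguous iff, in host byte order, its complement
	   is an all-ones suffix, i.e.

		m = ntohl(mask);
		ok = ((~m + 1) & ~m) == 0;

	   So 255.255.255.0 passes while 255.0.255.0 fails; a destination
	   with host bits set under the mask is rejected as well.
	 */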
	plen = 32;
	ptr = &((struct sockaddr_in*)&r->rt_dst)->sin_addr.s_addr;
	if (!(r->rt_flags&RTF_HOST)) {
		u32 mask = ((struct sockaddr_in*)&r->rt_genmask)->sin_addr.s_addr;
		if (r->rt_genmask.sa_family != AF_INET) {
			if (mask || r->rt_genmask.sa_family)
				return -EAFNOSUPPORT;
		}
		if (bad_mask(mask, *ptr))
			return -EINVAL;
		plen = inet_mask_len(mask);
	}

	nl->nlmsg_flags = NLM_F_REQUEST;
	nl->nlmsg_pid = 0;
	nl->nlmsg_seq = 0;
	nl->nlmsg_len = NLMSG_LENGTH(sizeof(*rtm));
	if (cmd == SIOCDELRT) {
		nl->nlmsg_type = RTM_DELROUTE;
		nl->nlmsg_flags = 0;
	} else {
		nl->nlmsg_type = RTM_NEWROUTE;
		nl->nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE;
		rtm->rtm_protocol = RTPROT_BOOT;
	}

	rtm->rtm_dst_len = plen;
	rta->rta_dst = ptr;

	if (r->rt_metric) {
		*(u32*)&r->rt_pad3 = r->rt_metric - 1;
		rta->rta_priority = (u32*)&r->rt_pad3;
	}
	if (r->rt_flags&RTF_REJECT) {
		rtm->rtm_scope = RT_SCOPE_HOST;
		rtm->rtm_type = RTN_UNREACHABLE;
		return 0;
	}
	rtm->rtm_scope = RT_SCOPE_NOWHERE;
	rtm->rtm_type = RTN_UNICAST;

	if (r->rt_dev) {
		char *colon;
		struct net_device *dev;
		char devname[IFNAMSIZ];

		if (copy_from_user(devname, r->rt_dev, IFNAMSIZ-1))
			return -EFAULT;
		devname[IFNAMSIZ-1] = 0;
		colon = strchr(devname, ':');
		if (colon)
			*colon = 0;
		dev = __dev_get_by_name(devname);
		if (!dev)
			return -ENODEV;
		rta->rta_oif = &dev->ifindex;
		if (colon) {
			struct in_ifaddr *ifa;
			struct in_device *in_dev = __in_dev_get(dev);
			if (!in_dev)
				return -ENODEV;
			*colon = ':';
			for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
				if (strcmp(ifa->ifa_label, devname) == 0)
					break;
			if (ifa == NULL)
				return -ENODEV;
			rta->rta_prefsrc = &ifa->ifa_local;
		}
	}

	ptr = &((struct sockaddr_in*)&r->rt_gateway)->sin_addr.s_addr;
	if (r->rt_gateway.sa_family == AF_INET && *ptr) {
		rta->rta_gw = ptr;
		if (r->rt_flags&RTF_GATEWAY && inet_addr_type(*ptr) == RTN_UNICAST)
			rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	}

	if (cmd == SIOCDELRT)
		return 0;

	if (r->rt_flags&RTF_GATEWAY && rta->rta_gw == NULL)
		return -EINVAL;

	if (rtm->rtm_scope == RT_SCOPE_NOWHERE)
		rtm->rtm_scope = RT_SCOPE_LINK;

	if (r->rt_flags&(RTF_MTU|RTF_WINDOW|RTF_IRTT)) {
		struct rtattr *rec;
		struct rtattr *mx = kmalloc(RTA_LENGTH(3*RTA_LENGTH(4)), GFP_KERNEL);
		if (mx == NULL)
			return -ENOMEM;
		rta->rta_mx = mx;
		mx->rta_type = RTA_METRICS;
		mx->rta_len = RTA_LENGTH(0);
		if (r->rt_flags&RTF_MTU) {
			rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
			rec->rta_type = RTAX_ADVMSS;
			rec->rta_len = RTA_LENGTH(4);
			mx->rta_len += RTA_LENGTH(4);
			*(u32*)RTA_DATA(rec) = r->rt_mtu - 40;
		}
		if (r->rt_flags&RTF_WINDOW) {
			rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
			rec->rta_type = RTAX_WINDOW;
			rec->rta_len = RTA_LENGTH(4);
			mx->rta_len += RTA_LENGTH(4);
			*(u32*)RTA_DATA(rec) = r->rt_window;
		}
		if (r->rt_flags&RTF_IRTT) {
			rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
			rec->rta_type = RTAX_RTT;
			rec->rta_len = RTA_LENGTH(4);
			mx->rta_len += RTA_LENGTH(4);
			*(u32*)RTA_DATA(rec) = r->rt_irtt<<3;
		}
	}
	return 0;
}

#endif
/*
   Update the FIB if:
   - a local address disappeared -> we must delete all the entries
     referring to it.
   - a device went down -> we must shut down all nexthops going via it.
 */

int fib_sync_down(u32 local, struct net_device *dev, int force)
{
	int ret = 0;
	int scope = RT_SCOPE_NOWHERE;

	if (force)
		scope = -1;

	if (local && fib_info_laddrhash) {
		unsigned int hash = fib_laddr_hashfn(local);
		struct hlist_head *head = &fib_info_laddrhash[hash];
		struct hlist_node *node;
		struct fib_info *fi;

		hlist_for_each_entry(fi, node, head, fib_lhash) {
			if (fi->fib_prefsrc == local) {
				fi->fib_flags |= RTNH_F_DEAD;
				ret++;
			}
		}
	}

	if (dev) {
		struct fib_info *prev_fi = NULL;
		unsigned int hash = fib_devindex_hashfn(dev->ifindex);
		struct hlist_head *head = &fib_info_devhash[hash];
		struct hlist_node *node;
		struct fib_nh *nh;

		hlist_for_each_entry(nh, node, head, nh_hash) {
			struct fib_info *fi = nh->nh_parent;
			int dead;

			BUG_ON(!fi->fib_nhs);
			if (nh->nh_dev != dev || fi == prev_fi)
				continue;
			prev_fi = fi;
			dead = 0;
			change_nexthops(fi) {
				if (nh->nh_flags&RTNH_F_DEAD)
					dead++;
				else if (nh->nh_dev == dev &&
					 nh->nh_scope != scope) {
					nh->nh_flags |= RTNH_F_DEAD;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
					spin_lock_bh(&fib_multipath_lock);
					fi->fib_power -= nh->nh_power;
					nh->nh_power = 0;
					spin_unlock_bh(&fib_multipath_lock);
#endif
					dead++;
				}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
				if (force > 1 && nh->nh_dev == dev) {
					dead = fi->fib_nhs;
					break;
				}
#endif
			} endfor_nexthops(fi)
			if (dead == fi->fib_nhs) {
				fi->fib_flags |= RTNH_F_DEAD;
				ret++;
			}
		}
	}

	return ret;
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH

/*
   A dead device goes up. We wake up dead nexthops.
   This makes sense only for multipath routes.
 */

int fib_sync_up(struct net_device *dev)
{
	struct fib_info *prev_fi;
	unsigned int hash;
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_nh *nh;
	int ret;

	if (!(dev->flags&IFF_UP))
		return 0;

	prev_fi = NULL;
	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
	ret = 0;

	hlist_for_each_entry(nh, node, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int alive;

		BUG_ON(!fi->fib_nhs);
		if (nh->nh_dev != dev || fi == prev_fi)
			continue;

		prev_fi = fi;
		alive = 0;
		change_nexthops(fi) {
			if (!(nh->nh_flags&RTNH_F_DEAD)) {
				alive++;
				continue;
			}
			if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
				continue;
			if (nh->nh_dev != dev || __in_dev_get(dev) == NULL)
				continue;
			alive++;
			spin_lock_bh(&fib_multipath_lock);
			nh->nh_power = 0;
			nh->nh_flags &= ~RTNH_F_DEAD;
			spin_unlock_bh(&fib_multipath_lock);
		} endfor_nexthops(fi)

		if (alive > 0) {
			fi->fib_flags &= ~RTNH_F_DEAD;
			ret++;
		}
	}

	return ret;
}
/*
   The algorithm is suboptimal, but it provides really
   fair weighted route distribution.
 */
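/*
 * Worked example (illustrative, not part of the original file): two
 * live nexthops with weights 2 and 1.  When fib_power reaches 0 it is
 * recharged to 3 and each nh_power is reset to its weight, (2, 1).
 * Each selection draws w in [0..fib_power-1] and walks the nexthops,
 * subtracting nh_power until w drops to 0 or below, then decrements
 * the winner:
 *
 *	w=1 -> nh0 picked, powers become (1, 1), fib_power 2
 *	w=1 -> nh0 picked, powers become (0, 1), fib_power 1
 *	w=0 -> nh1 picked, powers become (0, 0), fib_power 0 (recharge)
 *
 * Over a full cycle traffic therefore splits 2:1, as configured.
 */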
void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
{
	struct fib_info *fi = res->fi;
	int w;

	spin_lock_bh(&fib_multipath_lock);
	if (fi->fib_power <= 0) {
		int power = 0;
		change_nexthops(fi) {
			if (!(nh->nh_flags&RTNH_F_DEAD)) {
				power += nh->nh_weight;
				nh->nh_power = nh->nh_weight;
			}
		} endfor_nexthops(fi);
		fi->fib_power = power;
		if (power <= 0) {
			spin_unlock_bh(&fib_multipath_lock);
			/* Race condition: route has just become dead. */
			res->nh_sel = 0;
			return;
		}
	}

	/* w should be a random number in [0..fi->fib_power-1];
	   jiffies is a pretty bad approximation.
	 */
	w = jiffies % fi->fib_power;

	change_nexthops(fi) {
		if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
			if ((w -= nh->nh_power) <= 0) {
				nh->nh_power--;
				fi->fib_power--;
				res->nh_sel = nhsel;
				spin_unlock_bh(&fib_multipath_lock);
				return;
			}
		}
	} endfor_nexthops(fi);

	/* Race condition: route has just become dead. */
	res->nh_sel = 0;
	spin_unlock_bh(&fib_multipath_lock);
}

#endif