[IPV4]: Fix nexthop realm dumping for multipath routes
[linux-2.6.22.y-op.git] / net / ipv4 / fib_semantics.c
blob9be53a8e72c338d7865dadcc747d6003cd1c8b20
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 Forwarding Information Base: semantics.
 *
 * Version:	$Id: fib_semantics.c,v 1.19 2002/01/12 07:54:56 davem Exp $
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
18 #include <asm/uaccess.h>
19 #include <asm/system.h>
20 #include <linux/bitops.h>
21 #include <linux/types.h>
22 #include <linux/kernel.h>
23 #include <linux/jiffies.h>
24 #include <linux/mm.h>
25 #include <linux/string.h>
26 #include <linux/socket.h>
27 #include <linux/sockios.h>
28 #include <linux/errno.h>
29 #include <linux/in.h>
30 #include <linux/inet.h>
31 #include <linux/inetdevice.h>
32 #include <linux/netdevice.h>
33 #include <linux/if_arp.h>
34 #include <linux/proc_fs.h>
35 #include <linux/skbuff.h>
36 #include <linux/netlink.h>
37 #include <linux/init.h>
39 #include <net/arp.h>
40 #include <net/ip.h>
41 #include <net/protocol.h>
42 #include <net/route.h>
43 #include <net/tcp.h>
44 #include <net/sock.h>
45 #include <net/ip_fib.h>
46 #include <net/ip_mp_alg.h>
48 #include "fib_lookup.h"
50 #define FSprintk(a...)
52 static DEFINE_RWLOCK(fib_info_lock);
53 static struct hlist_head *fib_info_hash;
54 static struct hlist_head *fib_info_laddrhash;
55 static unsigned int fib_hash_size;
56 static unsigned int fib_info_cnt;
58 #define DEVINDEX_HASHBITS 8
59 #define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
60 static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
/*
 * Nexthop iteration helpers.
 *
 * for_nexthops(fi) / change_nexthops(fi) open a brace, declare the loop
 * variables `nhsel` (index) and `nh` (current nexthop; const for
 * for_nexthops, writable for change_nexthops) and start a for loop.  The
 * caller supplies the loop body and must close the construct with
 * endfor_nexthops(fi), which supplies the matching closing brace.
 */
#ifdef CONFIG_IP_ROUTE_MULTIPATH

static DEFINE_SPINLOCK(fib_multipath_lock);

#define for_nexthops(fi) { int nhsel; const struct fib_nh *nh; \
	for (nhsel = 0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)

#define change_nexthops(fi) { int nhsel; struct fib_nh *nh; \
	for (nhsel = 0, nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/* Hope, that gcc will optimize it to get rid of dummy loop */

#define for_nexthops(fi) { int nhsel = 0; const struct fib_nh *nh = (fi)->fib_nh; \
	for (nhsel = 0; nhsel < 1; nhsel++)

#define change_nexthops(fi) { int nhsel = 0; struct fib_nh *nh = (struct fib_nh *)((fi)->fib_nh); \
	for (nhsel = 0; nhsel < 1; nhsel++)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

#define endfor_nexthops(fi) }
87 static const struct
89 int error;
90 u8 scope;
91 } fib_props[RTA_MAX + 1] = {
93 .error = 0,
94 .scope = RT_SCOPE_NOWHERE,
95 }, /* RTN_UNSPEC */
97 .error = 0,
98 .scope = RT_SCOPE_UNIVERSE,
99 }, /* RTN_UNICAST */
101 .error = 0,
102 .scope = RT_SCOPE_HOST,
103 }, /* RTN_LOCAL */
105 .error = 0,
106 .scope = RT_SCOPE_LINK,
107 }, /* RTN_BROADCAST */
109 .error = 0,
110 .scope = RT_SCOPE_LINK,
111 }, /* RTN_ANYCAST */
113 .error = 0,
114 .scope = RT_SCOPE_UNIVERSE,
115 }, /* RTN_MULTICAST */
117 .error = -EINVAL,
118 .scope = RT_SCOPE_UNIVERSE,
119 }, /* RTN_BLACKHOLE */
121 .error = -EHOSTUNREACH,
122 .scope = RT_SCOPE_UNIVERSE,
123 }, /* RTN_UNREACHABLE */
125 .error = -EACCES,
126 .scope = RT_SCOPE_UNIVERSE,
127 }, /* RTN_PROHIBIT */
129 .error = -EAGAIN,
130 .scope = RT_SCOPE_UNIVERSE,
131 }, /* RTN_THROW */
133 .error = -EINVAL,
134 .scope = RT_SCOPE_NOWHERE,
135 }, /* RTN_NAT */
137 .error = -EINVAL,
138 .scope = RT_SCOPE_NOWHERE,
139 }, /* RTN_XRESOLVE */
143 /* Release a nexthop info record */
145 void free_fib_info(struct fib_info *fi)
147 if (fi->fib_dead == 0) {
148 printk("Freeing alive fib_info %p\n", fi);
149 return;
151 change_nexthops(fi) {
152 if (nh->nh_dev)
153 dev_put(nh->nh_dev);
154 nh->nh_dev = NULL;
155 } endfor_nexthops(fi);
156 fib_info_cnt--;
157 kfree(fi);
160 void fib_release_info(struct fib_info *fi)
162 write_lock(&fib_info_lock);
163 if (fi && --fi->fib_treeref == 0) {
164 hlist_del(&fi->fib_hash);
165 if (fi->fib_prefsrc)
166 hlist_del(&fi->fib_lhash);
167 change_nexthops(fi) {
168 if (!nh->nh_dev)
169 continue;
170 hlist_del(&nh->nh_hash);
171 } endfor_nexthops(fi)
172 fi->fib_dead = 1;
173 fib_info_put(fi);
175 write_unlock(&fib_info_lock);
178 static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
180 const struct fib_nh *onh = ofi->fib_nh;
182 for_nexthops(fi) {
183 if (nh->nh_oif != onh->nh_oif ||
184 nh->nh_gw != onh->nh_gw ||
185 nh->nh_scope != onh->nh_scope ||
186 #ifdef CONFIG_IP_ROUTE_MULTIPATH
187 nh->nh_weight != onh->nh_weight ||
188 #endif
189 #ifdef CONFIG_NET_CLS_ROUTE
190 nh->nh_tclassid != onh->nh_tclassid ||
191 #endif
192 ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
193 return -1;
194 onh++;
195 } endfor_nexthops(fi);
196 return 0;
199 static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
201 unsigned int mask = (fib_hash_size - 1);
202 unsigned int val = fi->fib_nhs;
204 val ^= fi->fib_protocol;
205 val ^= fi->fib_prefsrc;
206 val ^= fi->fib_priority;
208 return (val ^ (val >> 7) ^ (val >> 12)) & mask;
211 static struct fib_info *fib_find_info(const struct fib_info *nfi)
213 struct hlist_head *head;
214 struct hlist_node *node;
215 struct fib_info *fi;
216 unsigned int hash;
218 hash = fib_info_hashfn(nfi);
219 head = &fib_info_hash[hash];
221 hlist_for_each_entry(fi, node, head, fib_hash) {
222 if (fi->fib_nhs != nfi->fib_nhs)
223 continue;
224 if (nfi->fib_protocol == fi->fib_protocol &&
225 nfi->fib_prefsrc == fi->fib_prefsrc &&
226 nfi->fib_priority == fi->fib_priority &&
227 memcmp(nfi->fib_metrics, fi->fib_metrics,
228 sizeof(fi->fib_metrics)) == 0 &&
229 ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
230 (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
231 return fi;
234 return NULL;
237 static inline unsigned int fib_devindex_hashfn(unsigned int val)
239 unsigned int mask = DEVINDEX_HASHSIZE - 1;
241 return (val ^
242 (val >> DEVINDEX_HASHBITS) ^
243 (val >> (DEVINDEX_HASHBITS * 2))) & mask;
246 /* Check, that the gateway is already configured.
247 Used only by redirect accept routine.
250 int ip_fib_check_default(u32 gw, struct net_device *dev)
252 struct hlist_head *head;
253 struct hlist_node *node;
254 struct fib_nh *nh;
255 unsigned int hash;
257 read_lock(&fib_info_lock);
259 hash = fib_devindex_hashfn(dev->ifindex);
260 head = &fib_info_devhash[hash];
261 hlist_for_each_entry(nh, node, head, nh_hash) {
262 if (nh->nh_dev == dev &&
263 nh->nh_gw == gw &&
264 !(nh->nh_flags&RTNH_F_DEAD)) {
265 read_unlock(&fib_info_lock);
266 return 0;
270 read_unlock(&fib_info_lock);
272 return -1;
275 void rtmsg_fib(int event, u32 key, struct fib_alias *fa,
276 int z, int tb_id,
277 struct nlmsghdr *n, struct netlink_skb_parms *req)
279 struct sk_buff *skb;
280 u32 pid = req ? req->pid : n->nlmsg_pid;
281 int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
283 skb = alloc_skb(size, GFP_KERNEL);
284 if (!skb)
285 return;
287 if (fib_dump_info(skb, pid, n->nlmsg_seq, event, tb_id,
288 fa->fa_type, fa->fa_scope, &key, z,
289 fa->fa_tos,
290 fa->fa_info, 0) < 0) {
291 kfree_skb(skb);
292 return;
294 NETLINK_CB(skb).dst_group = RTNLGRP_IPV4_ROUTE;
295 if (n->nlmsg_flags&NLM_F_ECHO)
296 atomic_inc(&skb->users);
297 netlink_broadcast(rtnl, skb, pid, RTNLGRP_IPV4_ROUTE, GFP_KERNEL);
298 if (n->nlmsg_flags&NLM_F_ECHO)
299 netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT);
302 /* Return the first fib alias matching TOS with
303 * priority less than or equal to PRIO.
305 struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
307 if (fah) {
308 struct fib_alias *fa;
309 list_for_each_entry(fa, fah, fa_list) {
310 if (fa->fa_tos > tos)
311 continue;
312 if (fa->fa_info->fib_priority >= prio ||
313 fa->fa_tos < tos)
314 return fa;
317 return NULL;
320 int fib_detect_death(struct fib_info *fi, int order,
321 struct fib_info **last_resort, int *last_idx, int *dflt)
323 struct neighbour *n;
324 int state = NUD_NONE;
326 n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
327 if (n) {
328 state = n->nud_state;
329 neigh_release(n);
331 if (state==NUD_REACHABLE)
332 return 0;
333 if ((state&NUD_VALID) && order != *dflt)
334 return 0;
335 if ((state&NUD_VALID) ||
336 (*last_idx<0 && order > *dflt)) {
337 *last_resort = fi;
338 *last_idx = order;
340 return 1;
343 #ifdef CONFIG_IP_ROUTE_MULTIPATH
345 static u32 fib_get_attr32(struct rtattr *attr, int attrlen, int type)
347 while (RTA_OK(attr,attrlen)) {
348 if (attr->rta_type == type)
349 return *(u32*)RTA_DATA(attr);
350 attr = RTA_NEXT(attr, attrlen);
352 return 0;
355 static int
356 fib_count_nexthops(struct rtattr *rta)
358 int nhs = 0;
359 struct rtnexthop *nhp = RTA_DATA(rta);
360 int nhlen = RTA_PAYLOAD(rta);
362 while (nhlen >= (int)sizeof(struct rtnexthop)) {
363 if ((nhlen -= nhp->rtnh_len) < 0)
364 return 0;
365 nhs++;
366 nhp = RTNH_NEXT(nhp);
368 return nhs;
371 static int
372 fib_get_nhs(struct fib_info *fi, const struct rtattr *rta, const struct rtmsg *r)
374 struct rtnexthop *nhp = RTA_DATA(rta);
375 int nhlen = RTA_PAYLOAD(rta);
377 change_nexthops(fi) {
378 int attrlen = nhlen - sizeof(struct rtnexthop);
379 if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
380 return -EINVAL;
381 nh->nh_flags = (r->rtm_flags&~0xFF) | nhp->rtnh_flags;
382 nh->nh_oif = nhp->rtnh_ifindex;
383 nh->nh_weight = nhp->rtnh_hops + 1;
384 if (attrlen) {
385 nh->nh_gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY);
386 #ifdef CONFIG_NET_CLS_ROUTE
387 nh->nh_tclassid = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW);
388 #endif
390 nhp = RTNH_NEXT(nhp);
391 } endfor_nexthops(fi);
392 return 0;
395 #endif
397 int fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct kern_rta *rta,
398 struct fib_info *fi)
400 #ifdef CONFIG_IP_ROUTE_MULTIPATH
401 struct rtnexthop *nhp;
402 int nhlen;
403 #endif
405 if (rta->rta_priority &&
406 *rta->rta_priority != fi->fib_priority)
407 return 1;
409 if (rta->rta_oif || rta->rta_gw) {
410 if ((!rta->rta_oif || *rta->rta_oif == fi->fib_nh->nh_oif) &&
411 (!rta->rta_gw || memcmp(rta->rta_gw, &fi->fib_nh->nh_gw, 4) == 0))
412 return 0;
413 return 1;
416 #ifdef CONFIG_IP_ROUTE_MULTIPATH
417 if (rta->rta_mp == NULL)
418 return 0;
419 nhp = RTA_DATA(rta->rta_mp);
420 nhlen = RTA_PAYLOAD(rta->rta_mp);
422 for_nexthops(fi) {
423 int attrlen = nhlen - sizeof(struct rtnexthop);
424 u32 gw;
426 if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
427 return -EINVAL;
428 if (nhp->rtnh_ifindex && nhp->rtnh_ifindex != nh->nh_oif)
429 return 1;
430 if (attrlen) {
431 gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY);
432 if (gw && gw != nh->nh_gw)
433 return 1;
434 #ifdef CONFIG_NET_CLS_ROUTE
435 gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW);
436 if (gw && gw != nh->nh_tclassid)
437 return 1;
438 #endif
440 nhp = RTNH_NEXT(nhp);
441 } endfor_nexthops(fi);
442 #endif
443 return 0;
/*
   Picture
   -------

   Semantics of nexthop is very messy for historical reasons.
   We have to take into account that:
   a) gateway can actually be a local interface address,
      so that a gatewayed route is direct.
   b) gateway must be an on-link address, possibly
      described not by an ifaddr, but also by a direct route.
   c) If both gateway and interface are specified, they should not
      contradict.
   d) If we use tunnel routes, gateway could be not on-link.

   An attempt to reconcile all of these (alas, self-contradictory)
   conditions results in pretty ugly and hairy code with obscure logic.

   I chose to generalize it instead, so that the size
   of the code does not increase practically, but it becomes
   much more general.
   Every prefix is assigned a "scope" value: "host" is a local address,
   "link" is a direct route,
   [ ... "site" ... "interior" ... ]
   and "universe" is a true gateway route with global meaning.

   Every prefix refers to a set of "nexthop"s (gw, oif),
   where gw must have narrower scope. This recursion stops
   when gw has LOCAL scope or if "nexthop" is declared ONLINK,
   which means that gw is forced to be on link.

   Code is still hairy, but now it is apparently logically
   consistent and very flexible. E.g. as a by-product it allows
   independent exterior and interior routing processes
   to co-exist in peace.

   Normally it looks as follows.

   {universe prefix}  -> (gw, oif) [scope link]
                          |
                          |-> {link prefix} -> (gw, oif) [scope local]
                                                |
                                                |-> {local prefix} (terminal node)
 */
491 static int fib_check_nh(const struct rtmsg *r, struct fib_info *fi, struct fib_nh *nh)
493 int err;
495 if (nh->nh_gw) {
496 struct fib_result res;
498 #ifdef CONFIG_IP_ROUTE_PERVASIVE
499 if (nh->nh_flags&RTNH_F_PERVASIVE)
500 return 0;
501 #endif
502 if (nh->nh_flags&RTNH_F_ONLINK) {
503 struct net_device *dev;
505 if (r->rtm_scope >= RT_SCOPE_LINK)
506 return -EINVAL;
507 if (inet_addr_type(nh->nh_gw) != RTN_UNICAST)
508 return -EINVAL;
509 if ((dev = __dev_get_by_index(nh->nh_oif)) == NULL)
510 return -ENODEV;
511 if (!(dev->flags&IFF_UP))
512 return -ENETDOWN;
513 nh->nh_dev = dev;
514 dev_hold(dev);
515 nh->nh_scope = RT_SCOPE_LINK;
516 return 0;
519 struct flowi fl = { .nl_u = { .ip4_u =
520 { .daddr = nh->nh_gw,
521 .scope = r->rtm_scope + 1 } },
522 .oif = nh->nh_oif };
524 /* It is not necessary, but requires a bit of thinking */
525 if (fl.fl4_scope < RT_SCOPE_LINK)
526 fl.fl4_scope = RT_SCOPE_LINK;
527 if ((err = fib_lookup(&fl, &res)) != 0)
528 return err;
530 err = -EINVAL;
531 if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
532 goto out;
533 nh->nh_scope = res.scope;
534 nh->nh_oif = FIB_RES_OIF(res);
535 if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
536 goto out;
537 dev_hold(nh->nh_dev);
538 err = -ENETDOWN;
539 if (!(nh->nh_dev->flags & IFF_UP))
540 goto out;
541 err = 0;
542 out:
543 fib_res_put(&res);
544 return err;
545 } else {
546 struct in_device *in_dev;
548 if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
549 return -EINVAL;
551 in_dev = inetdev_by_index(nh->nh_oif);
552 if (in_dev == NULL)
553 return -ENODEV;
554 if (!(in_dev->dev->flags&IFF_UP)) {
555 in_dev_put(in_dev);
556 return -ENETDOWN;
558 nh->nh_dev = in_dev->dev;
559 dev_hold(nh->nh_dev);
560 nh->nh_scope = RT_SCOPE_HOST;
561 in_dev_put(in_dev);
563 return 0;
566 static inline unsigned int fib_laddr_hashfn(u32 val)
568 unsigned int mask = (fib_hash_size - 1);
570 return (val ^ (val >> 7) ^ (val >> 14)) & mask;
573 static struct hlist_head *fib_hash_alloc(int bytes)
575 if (bytes <= PAGE_SIZE)
576 return kmalloc(bytes, GFP_KERNEL);
577 else
578 return (struct hlist_head *)
579 __get_free_pages(GFP_KERNEL, get_order(bytes));
582 static void fib_hash_free(struct hlist_head *hash, int bytes)
584 if (!hash)
585 return;
587 if (bytes <= PAGE_SIZE)
588 kfree(hash);
589 else
590 free_pages((unsigned long) hash, get_order(bytes));
593 static void fib_hash_move(struct hlist_head *new_info_hash,
594 struct hlist_head *new_laddrhash,
595 unsigned int new_size)
597 struct hlist_head *old_info_hash, *old_laddrhash;
598 unsigned int old_size = fib_hash_size;
599 unsigned int i, bytes;
601 write_lock(&fib_info_lock);
602 old_info_hash = fib_info_hash;
603 old_laddrhash = fib_info_laddrhash;
604 fib_hash_size = new_size;
606 for (i = 0; i < old_size; i++) {
607 struct hlist_head *head = &fib_info_hash[i];
608 struct hlist_node *node, *n;
609 struct fib_info *fi;
611 hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
612 struct hlist_head *dest;
613 unsigned int new_hash;
615 hlist_del(&fi->fib_hash);
617 new_hash = fib_info_hashfn(fi);
618 dest = &new_info_hash[new_hash];
619 hlist_add_head(&fi->fib_hash, dest);
622 fib_info_hash = new_info_hash;
624 for (i = 0; i < old_size; i++) {
625 struct hlist_head *lhead = &fib_info_laddrhash[i];
626 struct hlist_node *node, *n;
627 struct fib_info *fi;
629 hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
630 struct hlist_head *ldest;
631 unsigned int new_hash;
633 hlist_del(&fi->fib_lhash);
635 new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
636 ldest = &new_laddrhash[new_hash];
637 hlist_add_head(&fi->fib_lhash, ldest);
640 fib_info_laddrhash = new_laddrhash;
642 write_unlock(&fib_info_lock);
644 bytes = old_size * sizeof(struct hlist_head *);
645 fib_hash_free(old_info_hash, bytes);
646 fib_hash_free(old_laddrhash, bytes);
649 struct fib_info *
650 fib_create_info(const struct rtmsg *r, struct kern_rta *rta,
651 const struct nlmsghdr *nlh, int *errp)
653 int err;
654 struct fib_info *fi = NULL;
655 struct fib_info *ofi;
656 #ifdef CONFIG_IP_ROUTE_MULTIPATH
657 int nhs = 1;
658 #else
659 const int nhs = 1;
660 #endif
661 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
662 u32 mp_alg = IP_MP_ALG_NONE;
663 #endif
665 /* Fast check to catch the most weird cases */
666 if (fib_props[r->rtm_type].scope > r->rtm_scope)
667 goto err_inval;
669 #ifdef CONFIG_IP_ROUTE_MULTIPATH
670 if (rta->rta_mp) {
671 nhs = fib_count_nexthops(rta->rta_mp);
672 if (nhs == 0)
673 goto err_inval;
675 #endif
676 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
677 if (rta->rta_mp_alg) {
678 mp_alg = *rta->rta_mp_alg;
680 if (mp_alg < IP_MP_ALG_NONE ||
681 mp_alg > IP_MP_ALG_MAX)
682 goto err_inval;
684 #endif
686 err = -ENOBUFS;
687 if (fib_info_cnt >= fib_hash_size) {
688 unsigned int new_size = fib_hash_size << 1;
689 struct hlist_head *new_info_hash;
690 struct hlist_head *new_laddrhash;
691 unsigned int bytes;
693 if (!new_size)
694 new_size = 1;
695 bytes = new_size * sizeof(struct hlist_head *);
696 new_info_hash = fib_hash_alloc(bytes);
697 new_laddrhash = fib_hash_alloc(bytes);
698 if (!new_info_hash || !new_laddrhash) {
699 fib_hash_free(new_info_hash, bytes);
700 fib_hash_free(new_laddrhash, bytes);
701 } else {
702 memset(new_info_hash, 0, bytes);
703 memset(new_laddrhash, 0, bytes);
705 fib_hash_move(new_info_hash, new_laddrhash, new_size);
708 if (!fib_hash_size)
709 goto failure;
712 fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
713 if (fi == NULL)
714 goto failure;
715 fib_info_cnt++;
717 fi->fib_protocol = r->rtm_protocol;
719 fi->fib_nhs = nhs;
720 change_nexthops(fi) {
721 nh->nh_parent = fi;
722 } endfor_nexthops(fi)
724 fi->fib_flags = r->rtm_flags;
725 if (rta->rta_priority)
726 fi->fib_priority = *rta->rta_priority;
727 if (rta->rta_mx) {
728 int attrlen = RTA_PAYLOAD(rta->rta_mx);
729 struct rtattr *attr = RTA_DATA(rta->rta_mx);
731 while (RTA_OK(attr, attrlen)) {
732 unsigned flavor = attr->rta_type;
733 if (flavor) {
734 if (flavor > RTAX_MAX)
735 goto err_inval;
736 fi->fib_metrics[flavor-1] = *(unsigned*)RTA_DATA(attr);
738 attr = RTA_NEXT(attr, attrlen);
741 if (rta->rta_prefsrc)
742 memcpy(&fi->fib_prefsrc, rta->rta_prefsrc, 4);
744 if (rta->rta_mp) {
745 #ifdef CONFIG_IP_ROUTE_MULTIPATH
746 if ((err = fib_get_nhs(fi, rta->rta_mp, r)) != 0)
747 goto failure;
748 if (rta->rta_oif && fi->fib_nh->nh_oif != *rta->rta_oif)
749 goto err_inval;
750 if (rta->rta_gw && memcmp(&fi->fib_nh->nh_gw, rta->rta_gw, 4))
751 goto err_inval;
752 #ifdef CONFIG_NET_CLS_ROUTE
753 if (rta->rta_flow && memcmp(&fi->fib_nh->nh_tclassid, rta->rta_flow, 4))
754 goto err_inval;
755 #endif
756 #else
757 goto err_inval;
758 #endif
759 } else {
760 struct fib_nh *nh = fi->fib_nh;
761 if (rta->rta_oif)
762 nh->nh_oif = *rta->rta_oif;
763 if (rta->rta_gw)
764 memcpy(&nh->nh_gw, rta->rta_gw, 4);
765 #ifdef CONFIG_NET_CLS_ROUTE
766 if (rta->rta_flow)
767 memcpy(&nh->nh_tclassid, rta->rta_flow, 4);
768 #endif
769 nh->nh_flags = r->rtm_flags;
770 #ifdef CONFIG_IP_ROUTE_MULTIPATH
771 nh->nh_weight = 1;
772 #endif
775 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
776 fi->fib_mp_alg = mp_alg;
777 #endif
779 if (fib_props[r->rtm_type].error) {
780 if (rta->rta_gw || rta->rta_oif || rta->rta_mp)
781 goto err_inval;
782 goto link_it;
785 if (r->rtm_scope > RT_SCOPE_HOST)
786 goto err_inval;
788 if (r->rtm_scope == RT_SCOPE_HOST) {
789 struct fib_nh *nh = fi->fib_nh;
791 /* Local address is added. */
792 if (nhs != 1 || nh->nh_gw)
793 goto err_inval;
794 nh->nh_scope = RT_SCOPE_NOWHERE;
795 nh->nh_dev = dev_get_by_index(fi->fib_nh->nh_oif);
796 err = -ENODEV;
797 if (nh->nh_dev == NULL)
798 goto failure;
799 } else {
800 change_nexthops(fi) {
801 if ((err = fib_check_nh(r, fi, nh)) != 0)
802 goto failure;
803 } endfor_nexthops(fi)
806 if (fi->fib_prefsrc) {
807 if (r->rtm_type != RTN_LOCAL || rta->rta_dst == NULL ||
808 memcmp(&fi->fib_prefsrc, rta->rta_dst, 4))
809 if (inet_addr_type(fi->fib_prefsrc) != RTN_LOCAL)
810 goto err_inval;
813 link_it:
814 if ((ofi = fib_find_info(fi)) != NULL) {
815 fi->fib_dead = 1;
816 free_fib_info(fi);
817 ofi->fib_treeref++;
818 return ofi;
821 fi->fib_treeref++;
822 atomic_inc(&fi->fib_clntref);
823 write_lock(&fib_info_lock);
824 hlist_add_head(&fi->fib_hash,
825 &fib_info_hash[fib_info_hashfn(fi)]);
826 if (fi->fib_prefsrc) {
827 struct hlist_head *head;
829 head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
830 hlist_add_head(&fi->fib_lhash, head);
832 change_nexthops(fi) {
833 struct hlist_head *head;
834 unsigned int hash;
836 if (!nh->nh_dev)
837 continue;
838 hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
839 head = &fib_info_devhash[hash];
840 hlist_add_head(&nh->nh_hash, head);
841 } endfor_nexthops(fi)
842 write_unlock(&fib_info_lock);
843 return fi;
845 err_inval:
846 err = -EINVAL;
848 failure:
849 *errp = err;
850 if (fi) {
851 fi->fib_dead = 1;
852 free_fib_info(fi);
854 return NULL;
857 /* Note! fib_semantic_match intentionally uses RCU list functions. */
858 int fib_semantic_match(struct list_head *head, const struct flowi *flp,
859 struct fib_result *res, __u32 zone, __u32 mask,
860 int prefixlen)
862 struct fib_alias *fa;
863 int nh_sel = 0;
865 list_for_each_entry_rcu(fa, head, fa_list) {
866 int err;
868 if (fa->fa_tos &&
869 fa->fa_tos != flp->fl4_tos)
870 continue;
872 if (fa->fa_scope < flp->fl4_scope)
873 continue;
875 fa->fa_state |= FA_S_ACCESSED;
877 err = fib_props[fa->fa_type].error;
878 if (err == 0) {
879 struct fib_info *fi = fa->fa_info;
881 if (fi->fib_flags & RTNH_F_DEAD)
882 continue;
884 switch (fa->fa_type) {
885 case RTN_UNICAST:
886 case RTN_LOCAL:
887 case RTN_BROADCAST:
888 case RTN_ANYCAST:
889 case RTN_MULTICAST:
890 for_nexthops(fi) {
891 if (nh->nh_flags&RTNH_F_DEAD)
892 continue;
893 if (!flp->oif || flp->oif == nh->nh_oif)
894 break;
896 #ifdef CONFIG_IP_ROUTE_MULTIPATH
897 if (nhsel < fi->fib_nhs) {
898 nh_sel = nhsel;
899 goto out_fill_res;
901 #else
902 if (nhsel < 1) {
903 goto out_fill_res;
905 #endif
906 endfor_nexthops(fi);
907 continue;
909 default:
910 printk(KERN_DEBUG "impossible 102\n");
911 return -EINVAL;
914 return err;
916 return 1;
918 out_fill_res:
919 res->prefixlen = prefixlen;
920 res->nh_sel = nh_sel;
921 res->type = fa->fa_type;
922 res->scope = fa->fa_scope;
923 res->fi = fa->fa_info;
924 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
925 res->netmask = mask;
926 res->network = zone &
927 (0xFFFFFFFF >> (32 - prefixlen));
928 #endif
929 atomic_inc(&res->fi->fib_clntref);
930 return 0;
933 /* Find appropriate source address to this destination */
935 u32 __fib_res_prefsrc(struct fib_result *res)
937 return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
941 fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
942 u8 tb_id, u8 type, u8 scope, void *dst, int dst_len, u8 tos,
943 struct fib_info *fi, unsigned int flags)
945 struct rtmsg *rtm;
946 struct nlmsghdr *nlh;
947 unsigned char *b = skb->tail;
949 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*rtm), flags);
950 rtm = NLMSG_DATA(nlh);
951 rtm->rtm_family = AF_INET;
952 rtm->rtm_dst_len = dst_len;
953 rtm->rtm_src_len = 0;
954 rtm->rtm_tos = tos;
955 rtm->rtm_table = tb_id;
956 rtm->rtm_type = type;
957 rtm->rtm_flags = fi->fib_flags;
958 rtm->rtm_scope = scope;
959 if (rtm->rtm_dst_len)
960 RTA_PUT(skb, RTA_DST, 4, dst);
961 rtm->rtm_protocol = fi->fib_protocol;
962 if (fi->fib_priority)
963 RTA_PUT(skb, RTA_PRIORITY, 4, &fi->fib_priority);
964 if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
965 goto rtattr_failure;
966 if (fi->fib_prefsrc)
967 RTA_PUT(skb, RTA_PREFSRC, 4, &fi->fib_prefsrc);
968 if (fi->fib_nhs == 1) {
969 if (fi->fib_nh->nh_gw)
970 RTA_PUT(skb, RTA_GATEWAY, 4, &fi->fib_nh->nh_gw);
971 if (fi->fib_nh->nh_oif)
972 RTA_PUT(skb, RTA_OIF, sizeof(int), &fi->fib_nh->nh_oif);
973 #ifdef CONFIG_NET_CLS_ROUTE
974 if (fi->fib_nh[0].nh_tclassid)
975 RTA_PUT(skb, RTA_FLOW, 4, &fi->fib_nh[0].nh_tclassid);
976 #endif
978 #ifdef CONFIG_IP_ROUTE_MULTIPATH
979 if (fi->fib_nhs > 1) {
980 struct rtnexthop *nhp;
981 struct rtattr *mp_head;
982 if (skb_tailroom(skb) <= RTA_SPACE(0))
983 goto rtattr_failure;
984 mp_head = (struct rtattr*)skb_put(skb, RTA_SPACE(0));
986 for_nexthops(fi) {
987 if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
988 goto rtattr_failure;
989 nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
990 nhp->rtnh_flags = nh->nh_flags & 0xFF;
991 nhp->rtnh_hops = nh->nh_weight-1;
992 nhp->rtnh_ifindex = nh->nh_oif;
993 if (nh->nh_gw)
994 RTA_PUT(skb, RTA_GATEWAY, 4, &nh->nh_gw);
995 #ifdef CONFIG_NET_CLS_ROUTE
996 if (nh->nh_tclassid)
997 RTA_PUT(skb, RTA_FLOW, 4, &nh->nh_tclassid);
998 #endif
999 nhp->rtnh_len = skb->tail - (unsigned char*)nhp;
1000 } endfor_nexthops(fi);
1001 mp_head->rta_type = RTA_MULTIPATH;
1002 mp_head->rta_len = skb->tail - (u8*)mp_head;
1004 #endif
1005 nlh->nlmsg_len = skb->tail - b;
1006 return skb->len;
1008 nlmsg_failure:
1009 rtattr_failure:
1010 skb_trim(skb, b - skb->data);
1011 return -1;
1014 #ifndef CONFIG_IP_NOSIOCRT
1017 fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm,
1018 struct kern_rta *rta, struct rtentry *r)
1020 int plen;
1021 u32 *ptr;
1023 memset(rtm, 0, sizeof(*rtm));
1024 memset(rta, 0, sizeof(*rta));
1026 if (r->rt_dst.sa_family != AF_INET)
1027 return -EAFNOSUPPORT;
1029 /* Check mask for validity:
1030 a) it must be contiguous.
1031 b) destination must have all host bits clear.
1032 c) if application forgot to set correct family (AF_INET),
1033 reject request unless it is absolutely clear i.e.
1034 both family and mask are zero.
1036 plen = 32;
1037 ptr = &((struct sockaddr_in*)&r->rt_dst)->sin_addr.s_addr;
1038 if (!(r->rt_flags&RTF_HOST)) {
1039 u32 mask = ((struct sockaddr_in*)&r->rt_genmask)->sin_addr.s_addr;
1040 if (r->rt_genmask.sa_family != AF_INET) {
1041 if (mask || r->rt_genmask.sa_family)
1042 return -EAFNOSUPPORT;
1044 if (bad_mask(mask, *ptr))
1045 return -EINVAL;
1046 plen = inet_mask_len(mask);
1049 nl->nlmsg_flags = NLM_F_REQUEST;
1050 nl->nlmsg_pid = 0;
1051 nl->nlmsg_seq = 0;
1052 nl->nlmsg_len = NLMSG_LENGTH(sizeof(*rtm));
1053 if (cmd == SIOCDELRT) {
1054 nl->nlmsg_type = RTM_DELROUTE;
1055 nl->nlmsg_flags = 0;
1056 } else {
1057 nl->nlmsg_type = RTM_NEWROUTE;
1058 nl->nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE;
1059 rtm->rtm_protocol = RTPROT_BOOT;
1062 rtm->rtm_dst_len = plen;
1063 rta->rta_dst = ptr;
1065 if (r->rt_metric) {
1066 *(u32*)&r->rt_pad3 = r->rt_metric - 1;
1067 rta->rta_priority = (u32*)&r->rt_pad3;
1069 if (r->rt_flags&RTF_REJECT) {
1070 rtm->rtm_scope = RT_SCOPE_HOST;
1071 rtm->rtm_type = RTN_UNREACHABLE;
1072 return 0;
1074 rtm->rtm_scope = RT_SCOPE_NOWHERE;
1075 rtm->rtm_type = RTN_UNICAST;
1077 if (r->rt_dev) {
1078 char *colon;
1079 struct net_device *dev;
1080 char devname[IFNAMSIZ];
1082 if (copy_from_user(devname, r->rt_dev, IFNAMSIZ-1))
1083 return -EFAULT;
1084 devname[IFNAMSIZ-1] = 0;
1085 colon = strchr(devname, ':');
1086 if (colon)
1087 *colon = 0;
1088 dev = __dev_get_by_name(devname);
1089 if (!dev)
1090 return -ENODEV;
1091 rta->rta_oif = &dev->ifindex;
1092 if (colon) {
1093 struct in_ifaddr *ifa;
1094 struct in_device *in_dev = __in_dev_get_rtnl(dev);
1095 if (!in_dev)
1096 return -ENODEV;
1097 *colon = ':';
1098 for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
1099 if (strcmp(ifa->ifa_label, devname) == 0)
1100 break;
1101 if (ifa == NULL)
1102 return -ENODEV;
1103 rta->rta_prefsrc = &ifa->ifa_local;
1107 ptr = &((struct sockaddr_in*)&r->rt_gateway)->sin_addr.s_addr;
1108 if (r->rt_gateway.sa_family == AF_INET && *ptr) {
1109 rta->rta_gw = ptr;
1110 if (r->rt_flags&RTF_GATEWAY && inet_addr_type(*ptr) == RTN_UNICAST)
1111 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
1114 if (cmd == SIOCDELRT)
1115 return 0;
1117 if (r->rt_flags&RTF_GATEWAY && rta->rta_gw == NULL)
1118 return -EINVAL;
1120 if (rtm->rtm_scope == RT_SCOPE_NOWHERE)
1121 rtm->rtm_scope = RT_SCOPE_LINK;
1123 if (r->rt_flags&(RTF_MTU|RTF_WINDOW|RTF_IRTT)) {
1124 struct rtattr *rec;
1125 struct rtattr *mx = kmalloc(RTA_LENGTH(3*RTA_LENGTH(4)), GFP_KERNEL);
1126 if (mx == NULL)
1127 return -ENOMEM;
1128 rta->rta_mx = mx;
1129 mx->rta_type = RTA_METRICS;
1130 mx->rta_len = RTA_LENGTH(0);
1131 if (r->rt_flags&RTF_MTU) {
1132 rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
1133 rec->rta_type = RTAX_ADVMSS;
1134 rec->rta_len = RTA_LENGTH(4);
1135 mx->rta_len += RTA_LENGTH(4);
1136 *(u32*)RTA_DATA(rec) = r->rt_mtu - 40;
1138 if (r->rt_flags&RTF_WINDOW) {
1139 rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
1140 rec->rta_type = RTAX_WINDOW;
1141 rec->rta_len = RTA_LENGTH(4);
1142 mx->rta_len += RTA_LENGTH(4);
1143 *(u32*)RTA_DATA(rec) = r->rt_window;
1145 if (r->rt_flags&RTF_IRTT) {
1146 rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
1147 rec->rta_type = RTAX_RTT;
1148 rec->rta_len = RTA_LENGTH(4);
1149 mx->rta_len += RTA_LENGTH(4);
1150 *(u32*)RTA_DATA(rec) = r->rt_irtt<<3;
1153 return 0;
1156 #endif
1159 Update FIB if:
1160 - local address disappeared -> we must delete all the entries
1161 referring to it.
1162 - device went down -> we must shutdown all nexthops going via it.
1165 int fib_sync_down(u32 local, struct net_device *dev, int force)
1167 int ret = 0;
1168 int scope = RT_SCOPE_NOWHERE;
1170 if (force)
1171 scope = -1;
1173 if (local && fib_info_laddrhash) {
1174 unsigned int hash = fib_laddr_hashfn(local);
1175 struct hlist_head *head = &fib_info_laddrhash[hash];
1176 struct hlist_node *node;
1177 struct fib_info *fi;
1179 hlist_for_each_entry(fi, node, head, fib_lhash) {
1180 if (fi->fib_prefsrc == local) {
1181 fi->fib_flags |= RTNH_F_DEAD;
1182 ret++;
1187 if (dev) {
1188 struct fib_info *prev_fi = NULL;
1189 unsigned int hash = fib_devindex_hashfn(dev->ifindex);
1190 struct hlist_head *head = &fib_info_devhash[hash];
1191 struct hlist_node *node;
1192 struct fib_nh *nh;
1194 hlist_for_each_entry(nh, node, head, nh_hash) {
1195 struct fib_info *fi = nh->nh_parent;
1196 int dead;
1198 BUG_ON(!fi->fib_nhs);
1199 if (nh->nh_dev != dev || fi == prev_fi)
1200 continue;
1201 prev_fi = fi;
1202 dead = 0;
1203 change_nexthops(fi) {
1204 if (nh->nh_flags&RTNH_F_DEAD)
1205 dead++;
1206 else if (nh->nh_dev == dev &&
1207 nh->nh_scope != scope) {
1208 nh->nh_flags |= RTNH_F_DEAD;
1209 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1210 spin_lock_bh(&fib_multipath_lock);
1211 fi->fib_power -= nh->nh_power;
1212 nh->nh_power = 0;
1213 spin_unlock_bh(&fib_multipath_lock);
1214 #endif
1215 dead++;
1217 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1218 if (force > 1 && nh->nh_dev == dev) {
1219 dead = fi->fib_nhs;
1220 break;
1222 #endif
1223 } endfor_nexthops(fi)
1224 if (dead == fi->fib_nhs) {
1225 fi->fib_flags |= RTNH_F_DEAD;
1226 ret++;
1231 return ret;
1234 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1237 Dead device goes up. We wake up dead nexthops.
1238 It takes sense only on multipath routes.
1241 int fib_sync_up(struct net_device *dev)
1243 struct fib_info *prev_fi;
1244 unsigned int hash;
1245 struct hlist_head *head;
1246 struct hlist_node *node;
1247 struct fib_nh *nh;
1248 int ret;
1250 if (!(dev->flags&IFF_UP))
1251 return 0;
1253 prev_fi = NULL;
1254 hash = fib_devindex_hashfn(dev->ifindex);
1255 head = &fib_info_devhash[hash];
1256 ret = 0;
1258 hlist_for_each_entry(nh, node, head, nh_hash) {
1259 struct fib_info *fi = nh->nh_parent;
1260 int alive;
1262 BUG_ON(!fi->fib_nhs);
1263 if (nh->nh_dev != dev || fi == prev_fi)
1264 continue;
1266 prev_fi = fi;
1267 alive = 0;
1268 change_nexthops(fi) {
1269 if (!(nh->nh_flags&RTNH_F_DEAD)) {
1270 alive++;
1271 continue;
1273 if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
1274 continue;
1275 if (nh->nh_dev != dev || !__in_dev_get_rtnl(dev))
1276 continue;
1277 alive++;
1278 spin_lock_bh(&fib_multipath_lock);
1279 nh->nh_power = 0;
1280 nh->nh_flags &= ~RTNH_F_DEAD;
1281 spin_unlock_bh(&fib_multipath_lock);
1282 } endfor_nexthops(fi)
1284 if (alive > 0) {
1285 fi->fib_flags &= ~RTNH_F_DEAD;
1286 ret++;
1290 return ret;
1294 The algorithm is suboptimal, but it provides really
1295 fair weighted route distribution.
1298 void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1300 struct fib_info *fi = res->fi;
1301 int w;
1303 spin_lock_bh(&fib_multipath_lock);
1304 if (fi->fib_power <= 0) {
1305 int power = 0;
1306 change_nexthops(fi) {
1307 if (!(nh->nh_flags&RTNH_F_DEAD)) {
1308 power += nh->nh_weight;
1309 nh->nh_power = nh->nh_weight;
1311 } endfor_nexthops(fi);
1312 fi->fib_power = power;
1313 if (power <= 0) {
1314 spin_unlock_bh(&fib_multipath_lock);
1315 /* Race condition: route has just become dead. */
1316 res->nh_sel = 0;
1317 return;
1322 /* w should be random number [0..fi->fib_power-1],
1323 it is pretty bad approximation.
1326 w = jiffies % fi->fib_power;
1328 change_nexthops(fi) {
1329 if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
1330 if ((w -= nh->nh_power) <= 0) {
1331 nh->nh_power--;
1332 fi->fib_power--;
1333 res->nh_sel = nhsel;
1334 spin_unlock_bh(&fib_multipath_lock);
1335 return;
1338 } endfor_nexthops(fi);
1340 /* Race condition: route has just become dead. */
1341 res->nh_sel = 0;
1342 spin_unlock_bh(&fib_multipath_lock);
1344 #endif