2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * IPv4 Forwarding Information Base: semantics.
8 * Version: $Id: fib_semantics.c,v 1.19 2002/01/12 07:54:56 davem Exp $
10 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
18 #include <asm/uaccess.h>
19 #include <asm/system.h>
20 #include <linux/bitops.h>
21 #include <linux/types.h>
22 #include <linux/kernel.h>
23 #include <linux/jiffies.h>
25 #include <linux/string.h>
26 #include <linux/socket.h>
27 #include <linux/sockios.h>
28 #include <linux/errno.h>
30 #include <linux/inet.h>
31 #include <linux/inetdevice.h>
32 #include <linux/netdevice.h>
33 #include <linux/if_arp.h>
34 #include <linux/proc_fs.h>
35 #include <linux/skbuff.h>
36 #include <linux/init.h>
40 #include <net/protocol.h>
41 #include <net/route.h>
44 #include <net/ip_fib.h>
45 #include <net/netlink.h>
46 #include <net/nexthop.h>
48 #include "fib_lookup.h"
50 static DEFINE_SPINLOCK(fib_info_lock
);
51 static struct hlist_head
*fib_info_hash
;
52 static struct hlist_head
*fib_info_laddrhash
;
53 static unsigned int fib_hash_size
;
54 static unsigned int fib_info_cnt
;
56 #define DEVINDEX_HASHBITS 8
57 #define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
58 static struct hlist_head fib_info_devhash
[DEVINDEX_HASHSIZE
];
#ifdef CONFIG_IP_ROUTE_MULTIPATH

/* Taken in this file around updates of fib_power / nh_power. */
static DEFINE_SPINLOCK(fib_multipath_lock);

/*
 * Nexthop iteration helpers.
 *
 * for_nexthops(fi) and change_nexthops(fi) open a new block, declare
 * `nhsel` (nexthop index) and `nh` (current nexthop: const for
 * for_nexthops, writable for change_nexthops) and start a loop over
 * fi->fib_nh[0 .. fi->fib_nhs - 1].  The loop body follows the macro and
 * the block MUST be closed with endfor_nexthops(fi).  `nhsel` remains in
 * scope between the end of the loop body and endfor_nexthops().
 */
#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)

#define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \
for (nhsel=0, nh = (struct fib_nh*)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/*
 * Without multipath support only the first nexthop is visited: the loop
 * runs exactly once with nh pointing at (fi)->fib_nh.  The hope is that
 * gcc optimizes the dummy loop away.
 */

#define for_nexthops(fi) { int nhsel=0; const struct fib_nh * nh = (fi)->fib_nh; \
for (nhsel=0; nhsel < 1; nhsel++)

#define change_nexthops(fi) { int nhsel=0; struct fib_nh * nh = (struct fib_nh*)((fi)->fib_nh); \
for (nhsel=0; nhsel < 1; nhsel++)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

/* Closes the block opened by for_nexthops() / change_nexthops(). */
#define endfor_nexthops(fi) }
89 } fib_props
[RTN_MAX
+ 1] = {
92 .scope
= RT_SCOPE_NOWHERE
,
96 .scope
= RT_SCOPE_UNIVERSE
,
100 .scope
= RT_SCOPE_HOST
,
104 .scope
= RT_SCOPE_LINK
,
105 }, /* RTN_BROADCAST */
108 .scope
= RT_SCOPE_LINK
,
112 .scope
= RT_SCOPE_UNIVERSE
,
113 }, /* RTN_MULTICAST */
116 .scope
= RT_SCOPE_UNIVERSE
,
117 }, /* RTN_BLACKHOLE */
119 .error
= -EHOSTUNREACH
,
120 .scope
= RT_SCOPE_UNIVERSE
,
121 }, /* RTN_UNREACHABLE */
124 .scope
= RT_SCOPE_UNIVERSE
,
125 }, /* RTN_PROHIBIT */
128 .scope
= RT_SCOPE_UNIVERSE
,
132 .scope
= RT_SCOPE_NOWHERE
,
136 .scope
= RT_SCOPE_NOWHERE
,
137 }, /* RTN_XRESOLVE */
141 /* Release a nexthop info record */
143 void free_fib_info(struct fib_info
*fi
)
145 if (fi
->fib_dead
== 0) {
146 printk(KERN_WARNING
"Freeing alive fib_info %p\n", fi
);
149 change_nexthops(fi
) {
153 } endfor_nexthops(fi
);
155 release_net(fi
->fib_net
);
159 void fib_release_info(struct fib_info
*fi
)
161 spin_lock_bh(&fib_info_lock
);
162 if (fi
&& --fi
->fib_treeref
== 0) {
163 hlist_del(&fi
->fib_hash
);
165 hlist_del(&fi
->fib_lhash
);
166 change_nexthops(fi
) {
169 hlist_del(&nh
->nh_hash
);
170 } endfor_nexthops(fi
)
174 spin_unlock_bh(&fib_info_lock
);
177 static __inline__
int nh_comp(const struct fib_info
*fi
, const struct fib_info
*ofi
)
179 const struct fib_nh
*onh
= ofi
->fib_nh
;
182 if (nh
->nh_oif
!= onh
->nh_oif
||
183 nh
->nh_gw
!= onh
->nh_gw
||
184 nh
->nh_scope
!= onh
->nh_scope
||
185 #ifdef CONFIG_IP_ROUTE_MULTIPATH
186 nh
->nh_weight
!= onh
->nh_weight
||
188 #ifdef CONFIG_NET_CLS_ROUTE
189 nh
->nh_tclassid
!= onh
->nh_tclassid
||
191 ((nh
->nh_flags
^onh
->nh_flags
)&~RTNH_F_DEAD
))
194 } endfor_nexthops(fi
);
198 static inline unsigned int fib_devindex_hashfn(unsigned int val
)
200 unsigned int mask
= DEVINDEX_HASHSIZE
- 1;
203 (val
>> DEVINDEX_HASHBITS
) ^
204 (val
>> (DEVINDEX_HASHBITS
* 2))) & mask
;
207 static inline unsigned int fib_info_hashfn(const struct fib_info
*fi
)
209 unsigned int mask
= (fib_hash_size
- 1);
210 unsigned int val
= fi
->fib_nhs
;
212 val
^= fi
->fib_protocol
;
213 val
^= (__force u32
)fi
->fib_prefsrc
;
214 val
^= fi
->fib_priority
;
216 val
^= fib_devindex_hashfn(nh
->nh_oif
);
217 } endfor_nexthops(fi
)
219 return (val
^ (val
>> 7) ^ (val
>> 12)) & mask
;
222 static struct fib_info
*fib_find_info(const struct fib_info
*nfi
)
224 struct hlist_head
*head
;
225 struct hlist_node
*node
;
229 hash
= fib_info_hashfn(nfi
);
230 head
= &fib_info_hash
[hash
];
232 hlist_for_each_entry(fi
, node
, head
, fib_hash
) {
233 if (fi
->fib_net
!= nfi
->fib_net
)
235 if (fi
->fib_nhs
!= nfi
->fib_nhs
)
237 if (nfi
->fib_protocol
== fi
->fib_protocol
&&
238 nfi
->fib_prefsrc
== fi
->fib_prefsrc
&&
239 nfi
->fib_priority
== fi
->fib_priority
&&
240 memcmp(nfi
->fib_metrics
, fi
->fib_metrics
,
241 sizeof(fi
->fib_metrics
)) == 0 &&
242 ((nfi
->fib_flags
^fi
->fib_flags
)&~RTNH_F_DEAD
) == 0 &&
243 (nfi
->fib_nhs
== 0 || nh_comp(fi
, nfi
) == 0))
250 /* Check that the gateway is already configured.
251 Used only by the redirect accept routine.
254 int ip_fib_check_default(__be32 gw
, struct net_device
*dev
)
256 struct hlist_head
*head
;
257 struct hlist_node
*node
;
261 spin_lock(&fib_info_lock
);
263 hash
= fib_devindex_hashfn(dev
->ifindex
);
264 head
= &fib_info_devhash
[hash
];
265 hlist_for_each_entry(nh
, node
, head
, nh_hash
) {
266 if (nh
->nh_dev
== dev
&&
268 !(nh
->nh_flags
&RTNH_F_DEAD
)) {
269 spin_unlock(&fib_info_lock
);
274 spin_unlock(&fib_info_lock
);
279 static inline size_t fib_nlmsg_size(struct fib_info
*fi
)
281 size_t payload
= NLMSG_ALIGN(sizeof(struct rtmsg
))
282 + nla_total_size(4) /* RTA_TABLE */
283 + nla_total_size(4) /* RTA_DST */
284 + nla_total_size(4) /* RTA_PRIORITY */
285 + nla_total_size(4); /* RTA_PREFSRC */
287 /* space for nested metrics */
288 payload
+= nla_total_size((RTAX_MAX
* nla_total_size(4)));
291 /* Also handles the special case fib_nhs == 1 */
293 /* each nexthop is packed in an attribute */
294 size_t nhsize
= nla_total_size(sizeof(struct rtnexthop
));
296 /* may contain flow and gateway attribute */
297 nhsize
+= 2 * nla_total_size(4);
299 /* all nexthops are packed in a nested attribute */
300 payload
+= nla_total_size(fi
->fib_nhs
* nhsize
);
306 void rtmsg_fib(int event
, __be32 key
, struct fib_alias
*fa
,
307 int dst_len
, u32 tb_id
, struct nl_info
*info
,
308 unsigned int nlm_flags
)
311 u32 seq
= info
->nlh
? info
->nlh
->nlmsg_seq
: 0;
314 skb
= nlmsg_new(fib_nlmsg_size(fa
->fa_info
), GFP_KERNEL
);
318 err
= fib_dump_info(skb
, info
->pid
, seq
, event
, tb_id
,
319 fa
->fa_type
, fa
->fa_scope
, key
, dst_len
,
320 fa
->fa_tos
, fa
->fa_info
, nlm_flags
);
322 /* -EMSGSIZE implies BUG in fib_nlmsg_size() */
323 WARN_ON(err
== -EMSGSIZE
);
327 err
= rtnl_notify(skb
, info
->nl_net
, info
->pid
, RTNLGRP_IPV4_ROUTE
,
328 info
->nlh
, GFP_KERNEL
);
331 rtnl_set_sk_err(info
->nl_net
, RTNLGRP_IPV4_ROUTE
, err
);
334 /* Return the first fib alias matching TOS with
335 * priority less than or equal to PRIO.
337 struct fib_alias
*fib_find_alias(struct list_head
*fah
, u8 tos
, u32 prio
)
340 struct fib_alias
*fa
;
341 list_for_each_entry(fa
, fah
, fa_list
) {
342 if (fa
->fa_tos
> tos
)
344 if (fa
->fa_info
->fib_priority
>= prio
||
352 int fib_detect_death(struct fib_info
*fi
, int order
,
353 struct fib_info
**last_resort
, int *last_idx
, int dflt
)
356 int state
= NUD_NONE
;
358 n
= neigh_lookup(&arp_tbl
, &fi
->fib_nh
[0].nh_gw
, fi
->fib_dev
);
360 state
= n
->nud_state
;
363 if (state
==NUD_REACHABLE
)
365 if ((state
&NUD_VALID
) && order
!= dflt
)
367 if ((state
&NUD_VALID
) ||
368 (*last_idx
<0 && order
> dflt
)) {
375 #ifdef CONFIG_IP_ROUTE_MULTIPATH
377 static int fib_count_nexthops(struct rtnexthop
*rtnh
, int remaining
)
381 while (rtnh_ok(rtnh
, remaining
)) {
383 rtnh
= rtnh_next(rtnh
, &remaining
);
386 /* leftover implies invalid nexthop configuration, discard it */
387 return remaining
> 0 ? 0 : nhs
;
390 static int fib_get_nhs(struct fib_info
*fi
, struct rtnexthop
*rtnh
,
391 int remaining
, struct fib_config
*cfg
)
393 change_nexthops(fi
) {
396 if (!rtnh_ok(rtnh
, remaining
))
399 nh
->nh_flags
= (cfg
->fc_flags
& ~0xFF) | rtnh
->rtnh_flags
;
400 nh
->nh_oif
= rtnh
->rtnh_ifindex
;
401 nh
->nh_weight
= rtnh
->rtnh_hops
+ 1;
403 attrlen
= rtnh_attrlen(rtnh
);
405 struct nlattr
*nla
, *attrs
= rtnh_attrs(rtnh
);
407 nla
= nla_find(attrs
, attrlen
, RTA_GATEWAY
);
408 nh
->nh_gw
= nla
? nla_get_be32(nla
) : 0;
409 #ifdef CONFIG_NET_CLS_ROUTE
410 nla
= nla_find(attrs
, attrlen
, RTA_FLOW
);
411 nh
->nh_tclassid
= nla
? nla_get_u32(nla
) : 0;
415 rtnh
= rtnh_next(rtnh
, &remaining
);
416 } endfor_nexthops(fi
);
423 int fib_nh_match(struct fib_config
*cfg
, struct fib_info
*fi
)
425 #ifdef CONFIG_IP_ROUTE_MULTIPATH
426 struct rtnexthop
*rtnh
;
430 if (cfg
->fc_priority
&& cfg
->fc_priority
!= fi
->fib_priority
)
433 if (cfg
->fc_oif
|| cfg
->fc_gw
) {
434 if ((!cfg
->fc_oif
|| cfg
->fc_oif
== fi
->fib_nh
->nh_oif
) &&
435 (!cfg
->fc_gw
|| cfg
->fc_gw
== fi
->fib_nh
->nh_gw
))
440 #ifdef CONFIG_IP_ROUTE_MULTIPATH
441 if (cfg
->fc_mp
== NULL
)
445 remaining
= cfg
->fc_mp_len
;
450 if (!rtnh_ok(rtnh
, remaining
))
453 if (rtnh
->rtnh_ifindex
&& rtnh
->rtnh_ifindex
!= nh
->nh_oif
)
456 attrlen
= rtnh_attrlen(rtnh
);
458 struct nlattr
*nla
, *attrs
= rtnh_attrs(rtnh
);
460 nla
= nla_find(attrs
, attrlen
, RTA_GATEWAY
);
461 if (nla
&& nla_get_be32(nla
) != nh
->nh_gw
)
463 #ifdef CONFIG_NET_CLS_ROUTE
464 nla
= nla_find(attrs
, attrlen
, RTA_FLOW
);
465 if (nla
&& nla_get_u32(nla
) != nh
->nh_tclassid
)
470 rtnh
= rtnh_next(rtnh
, &remaining
);
471 } endfor_nexthops(fi
);
481 The semantics of nexthops are very messy for historical reasons.
482 We have to take into account, that:
483 a) gateway can be actually local interface address,
484 so that gatewayed route is direct.
485 b) gateway must be on-link address, possibly
486 described not by an ifaddr, but also by a direct route.
487 c) If both gateway and interface are specified, they should not
489 d) If we use tunnel routes, gateway could be not on-link.
491 Attempt to reconcile all of these (alas, self-contradictory) conditions
492 results in pretty ugly and hairy code with obscure logic.
494 I chose to generalize it instead, so that the size
495 of code does not increase practically, but it becomes
497 Every prefix is assigned a "scope" value: "host" is local address,
498 "link" is direct route,
499 [ ... "site" ... "interior" ... ]
500 and "universe" is true gateway route with global meaning.
502 Every prefix refers to a set of "nexthop"s (gw, oif),
503 where gw must have narrower scope. This recursion stops
504 when gw has LOCAL scope or if "nexthop" is declared ONLINK,
505 which means that gw is forced to be on link.
507 Code is still hairy, but now it is apparently logically
508 consistent and very flexible. F.e. as by-product it allows
509 to co-exists in peace independent exterior and interior
512 Normally it looks as following.
514 {universe prefix} -> (gw, oif) [scope link]
516 |-> {link prefix} -> (gw, oif) [scope local]
518 |-> {local prefix} (terminal node)
521 static int fib_check_nh(struct fib_config
*cfg
, struct fib_info
*fi
,
527 net
= cfg
->fc_nlinfo
.nl_net
;
529 struct fib_result res
;
531 #ifdef CONFIG_IP_ROUTE_PERVASIVE
532 if (nh
->nh_flags
&RTNH_F_PERVASIVE
)
535 if (nh
->nh_flags
&RTNH_F_ONLINK
) {
536 struct net_device
*dev
;
538 if (cfg
->fc_scope
>= RT_SCOPE_LINK
)
540 if (inet_addr_type(net
, nh
->nh_gw
) != RTN_UNICAST
)
542 if ((dev
= __dev_get_by_index(net
, nh
->nh_oif
)) == NULL
)
544 if (!(dev
->flags
&IFF_UP
))
548 nh
->nh_scope
= RT_SCOPE_LINK
;
556 .scope
= cfg
->fc_scope
+ 1,
562 /* It is not necessary, but requires a bit of thinking */
563 if (fl
.fl4_scope
< RT_SCOPE_LINK
)
564 fl
.fl4_scope
= RT_SCOPE_LINK
;
565 if ((err
= fib_lookup(net
, &fl
, &res
)) != 0)
569 if (res
.type
!= RTN_UNICAST
&& res
.type
!= RTN_LOCAL
)
571 nh
->nh_scope
= res
.scope
;
572 nh
->nh_oif
= FIB_RES_OIF(res
);
573 if ((nh
->nh_dev
= FIB_RES_DEV(res
)) == NULL
)
575 dev_hold(nh
->nh_dev
);
577 if (!(nh
->nh_dev
->flags
& IFF_UP
))
584 struct in_device
*in_dev
;
586 if (nh
->nh_flags
&(RTNH_F_PERVASIVE
|RTNH_F_ONLINK
))
589 in_dev
= inetdev_by_index(net
, nh
->nh_oif
);
592 if (!(in_dev
->dev
->flags
&IFF_UP
)) {
596 nh
->nh_dev
= in_dev
->dev
;
597 dev_hold(nh
->nh_dev
);
598 nh
->nh_scope
= RT_SCOPE_HOST
;
604 static inline unsigned int fib_laddr_hashfn(__be32 val
)
606 unsigned int mask
= (fib_hash_size
- 1);
608 return ((__force u32
)val
^ ((__force u32
)val
>> 7) ^ ((__force u32
)val
>> 14)) & mask
;
611 static struct hlist_head
*fib_hash_alloc(int bytes
)
613 if (bytes
<= PAGE_SIZE
)
614 return kzalloc(bytes
, GFP_KERNEL
);
616 return (struct hlist_head
*)
617 __get_free_pages(GFP_KERNEL
| __GFP_ZERO
, get_order(bytes
));
620 static void fib_hash_free(struct hlist_head
*hash
, int bytes
)
625 if (bytes
<= PAGE_SIZE
)
628 free_pages((unsigned long) hash
, get_order(bytes
));
631 static void fib_hash_move(struct hlist_head
*new_info_hash
,
632 struct hlist_head
*new_laddrhash
,
633 unsigned int new_size
)
635 struct hlist_head
*old_info_hash
, *old_laddrhash
;
636 unsigned int old_size
= fib_hash_size
;
637 unsigned int i
, bytes
;
639 spin_lock_bh(&fib_info_lock
);
640 old_info_hash
= fib_info_hash
;
641 old_laddrhash
= fib_info_laddrhash
;
642 fib_hash_size
= new_size
;
644 for (i
= 0; i
< old_size
; i
++) {
645 struct hlist_head
*head
= &fib_info_hash
[i
];
646 struct hlist_node
*node
, *n
;
649 hlist_for_each_entry_safe(fi
, node
, n
, head
, fib_hash
) {
650 struct hlist_head
*dest
;
651 unsigned int new_hash
;
653 hlist_del(&fi
->fib_hash
);
655 new_hash
= fib_info_hashfn(fi
);
656 dest
= &new_info_hash
[new_hash
];
657 hlist_add_head(&fi
->fib_hash
, dest
);
660 fib_info_hash
= new_info_hash
;
662 for (i
= 0; i
< old_size
; i
++) {
663 struct hlist_head
*lhead
= &fib_info_laddrhash
[i
];
664 struct hlist_node
*node
, *n
;
667 hlist_for_each_entry_safe(fi
, node
, n
, lhead
, fib_lhash
) {
668 struct hlist_head
*ldest
;
669 unsigned int new_hash
;
671 hlist_del(&fi
->fib_lhash
);
673 new_hash
= fib_laddr_hashfn(fi
->fib_prefsrc
);
674 ldest
= &new_laddrhash
[new_hash
];
675 hlist_add_head(&fi
->fib_lhash
, ldest
);
678 fib_info_laddrhash
= new_laddrhash
;
680 spin_unlock_bh(&fib_info_lock
);
682 bytes
= old_size
* sizeof(struct hlist_head
*);
683 fib_hash_free(old_info_hash
, bytes
);
684 fib_hash_free(old_laddrhash
, bytes
);
687 struct fib_info
*fib_create_info(struct fib_config
*cfg
)
690 struct fib_info
*fi
= NULL
;
691 struct fib_info
*ofi
;
693 struct net
*net
= cfg
->fc_nlinfo
.nl_net
;
695 /* Fast check to catch the most weird cases */
696 if (fib_props
[cfg
->fc_type
].scope
> cfg
->fc_scope
)
699 #ifdef CONFIG_IP_ROUTE_MULTIPATH
701 nhs
= fib_count_nexthops(cfg
->fc_mp
, cfg
->fc_mp_len
);
708 if (fib_info_cnt
>= fib_hash_size
) {
709 unsigned int new_size
= fib_hash_size
<< 1;
710 struct hlist_head
*new_info_hash
;
711 struct hlist_head
*new_laddrhash
;
716 bytes
= new_size
* sizeof(struct hlist_head
*);
717 new_info_hash
= fib_hash_alloc(bytes
);
718 new_laddrhash
= fib_hash_alloc(bytes
);
719 if (!new_info_hash
|| !new_laddrhash
) {
720 fib_hash_free(new_info_hash
, bytes
);
721 fib_hash_free(new_laddrhash
, bytes
);
723 fib_hash_move(new_info_hash
, new_laddrhash
, new_size
);
729 fi
= kzalloc(sizeof(*fi
)+nhs
*sizeof(struct fib_nh
), GFP_KERNEL
);
734 fi
->fib_net
= hold_net(net
);
735 fi
->fib_protocol
= cfg
->fc_protocol
;
736 fi
->fib_flags
= cfg
->fc_flags
;
737 fi
->fib_priority
= cfg
->fc_priority
;
738 fi
->fib_prefsrc
= cfg
->fc_prefsrc
;
741 change_nexthops(fi
) {
743 } endfor_nexthops(fi
)
749 nla_for_each_attr(nla
, cfg
->fc_mx
, cfg
->fc_mx_len
, remaining
) {
750 int type
= nla_type(nla
);
755 fi
->fib_metrics
[type
- 1] = nla_get_u32(nla
);
761 #ifdef CONFIG_IP_ROUTE_MULTIPATH
762 err
= fib_get_nhs(fi
, cfg
->fc_mp
, cfg
->fc_mp_len
, cfg
);
765 if (cfg
->fc_oif
&& fi
->fib_nh
->nh_oif
!= cfg
->fc_oif
)
767 if (cfg
->fc_gw
&& fi
->fib_nh
->nh_gw
!= cfg
->fc_gw
)
769 #ifdef CONFIG_NET_CLS_ROUTE
770 if (cfg
->fc_flow
&& fi
->fib_nh
->nh_tclassid
!= cfg
->fc_flow
)
777 struct fib_nh
*nh
= fi
->fib_nh
;
779 nh
->nh_oif
= cfg
->fc_oif
;
780 nh
->nh_gw
= cfg
->fc_gw
;
781 nh
->nh_flags
= cfg
->fc_flags
;
782 #ifdef CONFIG_NET_CLS_ROUTE
783 nh
->nh_tclassid
= cfg
->fc_flow
;
785 #ifdef CONFIG_IP_ROUTE_MULTIPATH
790 if (fib_props
[cfg
->fc_type
].error
) {
791 if (cfg
->fc_gw
|| cfg
->fc_oif
|| cfg
->fc_mp
)
796 if (cfg
->fc_scope
> RT_SCOPE_HOST
)
799 if (cfg
->fc_scope
== RT_SCOPE_HOST
) {
800 struct fib_nh
*nh
= fi
->fib_nh
;
802 /* Local address is added. */
803 if (nhs
!= 1 || nh
->nh_gw
)
805 nh
->nh_scope
= RT_SCOPE_NOWHERE
;
806 nh
->nh_dev
= dev_get_by_index(net
, fi
->fib_nh
->nh_oif
);
808 if (nh
->nh_dev
== NULL
)
811 change_nexthops(fi
) {
812 if ((err
= fib_check_nh(cfg
, fi
, nh
)) != 0)
814 } endfor_nexthops(fi
)
817 if (fi
->fib_prefsrc
) {
818 if (cfg
->fc_type
!= RTN_LOCAL
|| !cfg
->fc_dst
||
819 fi
->fib_prefsrc
!= cfg
->fc_dst
)
820 if (inet_addr_type(net
, fi
->fib_prefsrc
) != RTN_LOCAL
)
825 if ((ofi
= fib_find_info(fi
)) != NULL
) {
833 atomic_inc(&fi
->fib_clntref
);
834 spin_lock_bh(&fib_info_lock
);
835 hlist_add_head(&fi
->fib_hash
,
836 &fib_info_hash
[fib_info_hashfn(fi
)]);
837 if (fi
->fib_prefsrc
) {
838 struct hlist_head
*head
;
840 head
= &fib_info_laddrhash
[fib_laddr_hashfn(fi
->fib_prefsrc
)];
841 hlist_add_head(&fi
->fib_lhash
, head
);
843 change_nexthops(fi
) {
844 struct hlist_head
*head
;
849 hash
= fib_devindex_hashfn(nh
->nh_dev
->ifindex
);
850 head
= &fib_info_devhash
[hash
];
851 hlist_add_head(&nh
->nh_hash
, head
);
852 } endfor_nexthops(fi
)
853 spin_unlock_bh(&fib_info_lock
);
868 /* Note! fib_semantic_match intentionally uses RCU list functions. */
869 int fib_semantic_match(struct list_head
*head
, const struct flowi
*flp
,
870 struct fib_result
*res
, __be32 zone
, __be32 mask
,
873 struct fib_alias
*fa
;
876 list_for_each_entry_rcu(fa
, head
, fa_list
) {
880 fa
->fa_tos
!= flp
->fl4_tos
)
883 if (fa
->fa_scope
< flp
->fl4_scope
)
886 fa
->fa_state
|= FA_S_ACCESSED
;
888 err
= fib_props
[fa
->fa_type
].error
;
890 struct fib_info
*fi
= fa
->fa_info
;
892 if (fi
->fib_flags
& RTNH_F_DEAD
)
895 switch (fa
->fa_type
) {
902 if (nh
->nh_flags
&RTNH_F_DEAD
)
904 if (!flp
->oif
|| flp
->oif
== nh
->nh_oif
)
907 #ifdef CONFIG_IP_ROUTE_MULTIPATH
908 if (nhsel
< fi
->fib_nhs
) {
921 printk(KERN_WARNING
"fib_semantic_match bad type %#x\n",
931 res
->prefixlen
= prefixlen
;
932 res
->nh_sel
= nh_sel
;
933 res
->type
= fa
->fa_type
;
934 res
->scope
= fa
->fa_scope
;
935 res
->fi
= fa
->fa_info
;
936 atomic_inc(&res
->fi
->fib_clntref
);
940 /* Find appropriate source address to this destination */
942 __be32
__fib_res_prefsrc(struct fib_result
*res
)
944 return inet_select_addr(FIB_RES_DEV(*res
), FIB_RES_GW(*res
), res
->scope
);
947 int fib_dump_info(struct sk_buff
*skb
, u32 pid
, u32 seq
, int event
,
948 u32 tb_id
, u8 type
, u8 scope
, __be32 dst
, int dst_len
, u8 tos
,
949 struct fib_info
*fi
, unsigned int flags
)
951 struct nlmsghdr
*nlh
;
954 nlh
= nlmsg_put(skb
, pid
, seq
, event
, sizeof(*rtm
), flags
);
958 rtm
= nlmsg_data(nlh
);
959 rtm
->rtm_family
= AF_INET
;
960 rtm
->rtm_dst_len
= dst_len
;
961 rtm
->rtm_src_len
= 0;
964 rtm
->rtm_table
= tb_id
;
966 rtm
->rtm_table
= RT_TABLE_COMPAT
;
967 NLA_PUT_U32(skb
, RTA_TABLE
, tb_id
);
968 rtm
->rtm_type
= type
;
969 rtm
->rtm_flags
= fi
->fib_flags
;
970 rtm
->rtm_scope
= scope
;
971 rtm
->rtm_protocol
= fi
->fib_protocol
;
973 if (rtm
->rtm_dst_len
)
974 NLA_PUT_BE32(skb
, RTA_DST
, dst
);
976 if (fi
->fib_priority
)
977 NLA_PUT_U32(skb
, RTA_PRIORITY
, fi
->fib_priority
);
979 if (rtnetlink_put_metrics(skb
, fi
->fib_metrics
) < 0)
980 goto nla_put_failure
;
983 NLA_PUT_BE32(skb
, RTA_PREFSRC
, fi
->fib_prefsrc
);
985 if (fi
->fib_nhs
== 1) {
986 if (fi
->fib_nh
->nh_gw
)
987 NLA_PUT_BE32(skb
, RTA_GATEWAY
, fi
->fib_nh
->nh_gw
);
989 if (fi
->fib_nh
->nh_oif
)
990 NLA_PUT_U32(skb
, RTA_OIF
, fi
->fib_nh
->nh_oif
);
991 #ifdef CONFIG_NET_CLS_ROUTE
992 if (fi
->fib_nh
[0].nh_tclassid
)
993 NLA_PUT_U32(skb
, RTA_FLOW
, fi
->fib_nh
[0].nh_tclassid
);
996 #ifdef CONFIG_IP_ROUTE_MULTIPATH
997 if (fi
->fib_nhs
> 1) {
998 struct rtnexthop
*rtnh
;
1001 mp
= nla_nest_start(skb
, RTA_MULTIPATH
);
1003 goto nla_put_failure
;
1006 rtnh
= nla_reserve_nohdr(skb
, sizeof(*rtnh
));
1008 goto nla_put_failure
;
1010 rtnh
->rtnh_flags
= nh
->nh_flags
& 0xFF;
1011 rtnh
->rtnh_hops
= nh
->nh_weight
- 1;
1012 rtnh
->rtnh_ifindex
= nh
->nh_oif
;
1015 NLA_PUT_BE32(skb
, RTA_GATEWAY
, nh
->nh_gw
);
1016 #ifdef CONFIG_NET_CLS_ROUTE
1017 if (nh
->nh_tclassid
)
1018 NLA_PUT_U32(skb
, RTA_FLOW
, nh
->nh_tclassid
);
1020 /* length of rtnetlink header + attributes */
1021 rtnh
->rtnh_len
= nlmsg_get_pos(skb
) - (void *) rtnh
;
1022 } endfor_nexthops(fi
);
1024 nla_nest_end(skb
, mp
);
1027 return nlmsg_end(skb
, nlh
);
1030 nlmsg_cancel(skb
, nlh
);
1036 - local address disappeared -> we must delete all the entries
1038 - device went down -> we must shutdown all nexthops going via it.
1040 int fib_sync_down_addr(struct net
*net
, __be32 local
)
1043 unsigned int hash
= fib_laddr_hashfn(local
);
1044 struct hlist_head
*head
= &fib_info_laddrhash
[hash
];
1045 struct hlist_node
*node
;
1046 struct fib_info
*fi
;
1048 if (fib_info_laddrhash
== NULL
|| local
== 0)
1051 hlist_for_each_entry(fi
, node
, head
, fib_lhash
) {
1052 if (fi
->fib_net
!= net
)
1054 if (fi
->fib_prefsrc
== local
) {
1055 fi
->fib_flags
|= RTNH_F_DEAD
;
1062 int fib_sync_down_dev(struct net_device
*dev
, int force
)
1065 int scope
= RT_SCOPE_NOWHERE
;
1066 struct fib_info
*prev_fi
= NULL
;
1067 unsigned int hash
= fib_devindex_hashfn(dev
->ifindex
);
1068 struct hlist_head
*head
= &fib_info_devhash
[hash
];
1069 struct hlist_node
*node
;
1075 hlist_for_each_entry(nh
, node
, head
, nh_hash
) {
1076 struct fib_info
*fi
= nh
->nh_parent
;
1079 BUG_ON(!fi
->fib_nhs
);
1080 if (nh
->nh_dev
!= dev
|| fi
== prev_fi
)
1084 change_nexthops(fi
) {
1085 if (nh
->nh_flags
&RTNH_F_DEAD
)
1087 else if (nh
->nh_dev
== dev
&&
1088 nh
->nh_scope
!= scope
) {
1089 nh
->nh_flags
|= RTNH_F_DEAD
;
1090 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1091 spin_lock_bh(&fib_multipath_lock
);
1092 fi
->fib_power
-= nh
->nh_power
;
1094 spin_unlock_bh(&fib_multipath_lock
);
1098 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1099 if (force
> 1 && nh
->nh_dev
== dev
) {
1104 } endfor_nexthops(fi
)
1105 if (dead
== fi
->fib_nhs
) {
1106 fi
->fib_flags
|= RTNH_F_DEAD
;
1114 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1117 Dead device goes up. We wake up dead nexthops.
1118 It makes sense only on multipath routes.
1121 int fib_sync_up(struct net_device
*dev
)
1123 struct fib_info
*prev_fi
;
1125 struct hlist_head
*head
;
1126 struct hlist_node
*node
;
1130 if (!(dev
->flags
&IFF_UP
))
1134 hash
= fib_devindex_hashfn(dev
->ifindex
);
1135 head
= &fib_info_devhash
[hash
];
1138 hlist_for_each_entry(nh
, node
, head
, nh_hash
) {
1139 struct fib_info
*fi
= nh
->nh_parent
;
1142 BUG_ON(!fi
->fib_nhs
);
1143 if (nh
->nh_dev
!= dev
|| fi
== prev_fi
)
1148 change_nexthops(fi
) {
1149 if (!(nh
->nh_flags
&RTNH_F_DEAD
)) {
1153 if (nh
->nh_dev
== NULL
|| !(nh
->nh_dev
->flags
&IFF_UP
))
1155 if (nh
->nh_dev
!= dev
|| !__in_dev_get_rtnl(dev
))
1158 spin_lock_bh(&fib_multipath_lock
);
1160 nh
->nh_flags
&= ~RTNH_F_DEAD
;
1161 spin_unlock_bh(&fib_multipath_lock
);
1162 } endfor_nexthops(fi
)
1165 fi
->fib_flags
&= ~RTNH_F_DEAD
;
1174 The algorithm is suboptimal, but it provides really
1175 fair weighted route distribution.
1178 void fib_select_multipath(const struct flowi
*flp
, struct fib_result
*res
)
1180 struct fib_info
*fi
= res
->fi
;
1183 spin_lock_bh(&fib_multipath_lock
);
1184 if (fi
->fib_power
<= 0) {
1186 change_nexthops(fi
) {
1187 if (!(nh
->nh_flags
&RTNH_F_DEAD
)) {
1188 power
+= nh
->nh_weight
;
1189 nh
->nh_power
= nh
->nh_weight
;
1191 } endfor_nexthops(fi
);
1192 fi
->fib_power
= power
;
1194 spin_unlock_bh(&fib_multipath_lock
);
1195 /* Race condition: route has just become dead. */
1202 /* w should be a random number in [0..fi->fib_power-1];
1203 this is a pretty bad approximation of one.
1206 w
= jiffies
% fi
->fib_power
;
1208 change_nexthops(fi
) {
1209 if (!(nh
->nh_flags
&RTNH_F_DEAD
) && nh
->nh_power
) {
1210 if ((w
-= nh
->nh_power
) <= 0) {
1213 res
->nh_sel
= nhsel
;
1214 spin_unlock_bh(&fib_multipath_lock
);
1218 } endfor_nexthops(fi
);
1220 /* Race condition: route has just become dead. */
1222 spin_unlock_bh(&fib_multipath_lock
);