/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 Forwarding Information Base: semantics.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/skbuff.h>
#include <linux/init.h>

#include <net/arp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/netlink.h>
#include <net/nexthop.h>

#include "fib_lookup.h"
static DEFINE_SPINLOCK(fib_info_lock);
static struct hlist_head *fib_info_hash;
static struct hlist_head *fib_info_laddrhash;
static unsigned int fib_hash_size;
static unsigned int fib_info_cnt;
#define DEVINDEX_HASHBITS 8
#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
#ifdef CONFIG_IP_ROUTE_MULTIPATH

static DEFINE_SPINLOCK(fib_multipath_lock);

#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)

#define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \
for (nhsel=0, nh = (struct fib_nh*)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/* Hope that gcc will optimize this to get rid of the dummy loop */

#define for_nexthops(fi) { int nhsel=0; const struct fib_nh * nh = (fi)->fib_nh; \
for (nhsel=0; nhsel < 1; nhsel++)

#define change_nexthops(fi) { int nhsel=0; struct fib_nh * nh = (struct fib_nh*)((fi)->fib_nh); \
for (nhsel=0; nhsel < 1; nhsel++)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

#define endfor_nexthops(fi) }
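/*
 * Usage sketch for the iterator macros above: for_nexthops() walks the
 * nexthop array read-only, change_nexthops() allows modification, and
 * both must be closed with endfor_nexthops(), which supplies the brace
 * the opening macro leaves dangling:
 *
 *	for_nexthops(fi) {
 *		if (nh->nh_oif == oif)	// nh and nhsel are in scope here
 *			break;
 *	} endfor_nexthops(fi);
 */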
static const struct
{
	int	error;
	u8	scope;
} fib_props[RTN_MAX + 1] = {
	{
		.error	= 0,
		.scope	= RT_SCOPE_NOWHERE,
	},	/* RTN_UNSPEC */
	{
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_UNICAST */
	{
		.error	= 0,
		.scope	= RT_SCOPE_HOST,
	},	/* RTN_LOCAL */
	{
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
	},	/* RTN_BROADCAST */
	{
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
	},	/* RTN_ANYCAST */
	{
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_MULTICAST */
	{
		.error	= -EINVAL,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_BLACKHOLE */
	{
		.error	= -EHOSTUNREACH,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_UNREACHABLE */
	{
		.error	= -EACCES,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_PROHIBIT */
	{
		.error	= -EAGAIN,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_THROW */
	{
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
	},	/* RTN_NAT */
	{
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
	},	/* RTN_XRESOLVE */
};
/* Release a nexthop info record */

void free_fib_info(struct fib_info *fi)
{
	if (fi->fib_dead == 0) {
		printk(KERN_WARNING "Freeing alive fib_info %p\n", fi);
		return;
	}
	change_nexthops(fi) {
		if (nh->nh_dev)
			dev_put(nh->nh_dev);
		nh->nh_dev = NULL;
	} endfor_nexthops(fi);
	fib_info_cnt--;
	release_net(fi->fib_net);
	kfree(fi);
}
void fib_release_info(struct fib_info *fi)
{
	spin_lock_bh(&fib_info_lock);
	if (fi && --fi->fib_treeref == 0) {
		hlist_del(&fi->fib_hash);
		if (fi->fib_prefsrc)
			hlist_del(&fi->fib_lhash);
		change_nexthops(fi) {
			if (!nh->nh_dev)
				continue;
			hlist_del(&nh->nh_hash);
		} endfor_nexthops(fi)
		fi->fib_dead = 1;
		fib_info_put(fi);
	}
	spin_unlock_bh(&fib_info_lock);
}
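/*
 * Note on reference counting (an observation, not an original comment):
 * fib_treeref counts references from FIB tree nodes and is manipulated
 * under fib_info_lock, while fib_clntref is an atomic count taken by
 * lookup results; fib_release_info() drops the tree reference, and
 * free_fib_info() performs the final destruction.
 */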
static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
{
	const struct fib_nh *onh = ofi->fib_nh;

	for_nexthops(fi) {
		if (nh->nh_oif != onh->nh_oif ||
		    nh->nh_gw  != onh->nh_gw ||
		    nh->nh_scope != onh->nh_scope ||
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		    nh->nh_weight != onh->nh_weight ||
#endif
#ifdef CONFIG_NET_CLS_ROUTE
		    nh->nh_tclassid != onh->nh_tclassid ||
#endif
		    ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
			return -1;
		onh++;
	} endfor_nexthops(fi);
	return 0;
}
static inline unsigned int fib_devindex_hashfn(unsigned int val)
{
	unsigned int mask = DEVINDEX_HASHSIZE - 1;

	return (val ^
		(val >> DEVINDEX_HASHBITS) ^
		(val >> (DEVINDEX_HASHBITS * 2))) & mask;
}
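/*
 * The ifindex is folded from 32 bits down to DEVINDEX_HASHBITS by
 * XOR-ing shifted copies of itself, so consecutive ifindexes spread
 * evenly across the DEVINDEX_HASHSIZE buckets of fib_info_devhash.
 */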
static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
{
	unsigned int mask = (fib_hash_size - 1);
	unsigned int val = fi->fib_nhs;

	val ^= fi->fib_protocol;
	val ^= (__force u32)fi->fib_prefsrc;
	val ^= fi->fib_priority;
	for_nexthops(fi) {
		val ^= fib_devindex_hashfn(nh->nh_oif);
	} endfor_nexthops(fi)

	return (val ^ (val >> 7) ^ (val >> 12)) & mask;
}
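/*
 * Since the mask above is derived from fib_hash_size, every entry must
 * be rehashed when the table is resized; fib_hash_move() below relies
 * on recomputing this function for each entry.
 */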
static struct fib_info *fib_find_info(const struct fib_info *nfi)
{
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_info *fi;
	unsigned int hash;

	hash = fib_info_hashfn(nfi);
	head = &fib_info_hash[hash];

	hlist_for_each_entry(fi, node, head, fib_hash) {
		if (fi->fib_net != nfi->fib_net)
			continue;
		if (fi->fib_nhs != nfi->fib_nhs)
			continue;
		if (nfi->fib_protocol == fi->fib_protocol &&
		    nfi->fib_prefsrc == fi->fib_prefsrc &&
		    nfi->fib_priority == fi->fib_priority &&
		    memcmp(nfi->fib_metrics, fi->fib_metrics,
			   sizeof(fi->fib_metrics)) == 0 &&
		    ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
		    (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
			return fi;
	}

	return NULL;
}
/* Check that the gateway is already configured.
   Used only by the redirect acceptance routine.
 */

int ip_fib_check_default(__be32 gw, struct net_device *dev)
{
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_nh *nh;
	unsigned int hash;

	spin_lock(&fib_info_lock);

	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
	hlist_for_each_entry(nh, node, head, nh_hash) {
		if (nh->nh_dev == dev &&
		    nh->nh_gw == gw &&
		    !(nh->nh_flags&RTNH_F_DEAD)) {
			spin_unlock(&fib_info_lock);
			return 0;
		}
	}

	spin_unlock(&fib_info_lock);

	return -1;
}
static inline size_t fib_nlmsg_size(struct fib_info *fi)
{
	size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
			 + nla_total_size(4) /* RTA_TABLE */
			 + nla_total_size(4) /* RTA_DST */
			 + nla_total_size(4) /* RTA_PRIORITY */
			 + nla_total_size(4); /* RTA_PREFSRC */

	/* space for nested metrics */
	payload += nla_total_size((RTAX_MAX * nla_total_size(4)));

	if (fi->fib_nhs) {
		/* Also handles the special case fib_nhs == 1 */

		/* each nexthop is packed in an attribute */
		size_t nhsize = nla_total_size(sizeof(struct rtnexthop));

		/* may contain flow and gateway attribute */
		nhsize += 2 * nla_total_size(4);

		/* all nexthops are packed in a nested attribute */
		payload += nla_total_size(fi->fib_nhs * nhsize);
	}

	return payload;
}
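/*
 * This is a worst-case estimate: it always reserves room for every
 * optional attribute, so the skb allocated from it can only be too big,
 * never too small. That is why fib_dump_info() returning -EMSGSIZE in
 * rtmsg_fib() below is treated as a bug rather than a normal failure.
 */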
void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
	       int dst_len, u32 tb_id, struct nl_info *info,
	       unsigned int nlm_flags)
{
	struct sk_buff *skb;
	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
	int err = -ENOBUFS;

	skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
	if (skb == NULL)
		goto errout;

	err = fib_dump_info(skb, info->pid, seq, event, tb_id,
			    fa->fa_type, fa->fa_scope, key, dst_len,
			    fa->fa_tos, fa->fa_info, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in fib_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	err = rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
			  info->nlh, GFP_KERNEL);
errout:
	if (err < 0)
		rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
}
/* Return the first fib alias matching TOS with
 * priority less than or equal to PRIO.
 */
struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
{
	if (fah) {
		struct fib_alias *fa;
		list_for_each_entry(fa, fah, fa_list) {
			if (fa->fa_tos > tos)
				continue;
			if (fa->fa_info->fib_priority >= prio ||
			    fa->fa_tos < tos)
				return fa;
		}
	}
	return NULL;
}
int fib_detect_death(struct fib_info *fi, int order,
		     struct fib_info **last_resort, int *last_idx, int dflt)
{
	struct neighbour *n;
	int state = NUD_NONE;

	n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
	if (n) {
		state = n->nud_state;
		neigh_release(n);
	}
	if (state == NUD_REACHABLE)
		return 0;
	if ((state&NUD_VALID) && order != dflt)
		return 0;
	if ((state&NUD_VALID) ||
	    (*last_idx < 0 && order > dflt)) {
		*last_resort = fi;
		*last_idx = order;
	}
	return 1;
}
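/*
 * Rough summary: a gateway with a REACHABLE neighbour entry is treated
 * as alive unconditionally, a merely VALID one only in some cases, and
 * candidates are remembered in *last_resort / *last_idx so the caller
 * can fall back to the best previously seen default route.
 */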
#ifdef CONFIG_IP_ROUTE_MULTIPATH

static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
{
	int nhs = 0;

	while (rtnh_ok(rtnh, remaining)) {
		nhs++;
		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* leftover implies invalid nexthop configuration, discard it */
	return remaining > 0 ? 0 : nhs;
}
static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
		       int remaining, struct fib_config *cfg)
{
	change_nexthops(fi) {
		int attrlen;

		if (!rtnh_ok(rtnh, remaining))
			return -EINVAL;

		nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
		nh->nh_oif = rtnh->rtnh_ifindex;
		nh->nh_weight = rtnh->rtnh_hops + 1;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			nh->nh_gw = nla ? nla_get_be32(nla) : 0;
#ifdef CONFIG_NET_CLS_ROUTE
			nla = nla_find(attrs, attrlen, RTA_FLOW);
			nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
#endif
		}

		rtnh = rtnh_next(rtnh, &remaining);
	} endfor_nexthops(fi);

	return 0;
}

#endif
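/*
 * Wire layout assumed by the parsers in this file: an RTA_MULTIPATH
 * payload is a sequence of struct rtnexthop headers, each optionally
 * followed by its own nested attributes (RTA_GATEWAY, and RTA_FLOW
 * under CONFIG_NET_CLS_ROUTE); rtnh_next() steps from one header to
 * the next while decrementing the remaining byte count.
 */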
int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	struct rtnexthop *rtnh;
	int remaining;
#endif

	if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
		return 1;

	if (cfg->fc_oif || cfg->fc_gw) {
		if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
		    (!cfg->fc_gw || cfg->fc_gw == fi->fib_nh->nh_gw))
			return 0;
		return 1;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (cfg->fc_mp == NULL)
		return 0;

	rtnh = cfg->fc_mp;
	remaining = cfg->fc_mp_len;

	for_nexthops(fi) {
		int attrlen;

		if (!rtnh_ok(rtnh, remaining))
			return -EINVAL;

		if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
			return 1;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla && nla_get_be32(nla) != nh->nh_gw)
				return 1;
#ifdef CONFIG_NET_CLS_ROUTE
			nla = nla_find(attrs, attrlen, RTA_FLOW);
			if (nla && nla_get_u32(nla) != nh->nh_tclassid)
				return 1;
#endif
		}

		rtnh = rtnh_next(rtnh, &remaining);
	} endfor_nexthops(fi);
#endif

	return 0;
}
/*
   Picture
   -------

   The semantics of nexthops are very messy for historical reasons.
   We have to take into account that:
   a) the gateway can actually be a local interface address,
      so that a gatewayed route is direct.
   b) the gateway must be an on-link address, possibly
      described not by an ifaddr, but also by a direct route.
   c) if both gateway and interface are specified, they should not
      contradict each other.
   d) if we use tunnel routes, the gateway may not be on-link.

   Attempting to reconcile all of these (alas, self-contradictory)
   conditions results in pretty ugly and hairy code with obscure logic.

   I chose to generalize it instead, so that the size of the code
   does not increase practically, but it becomes much more general.
   Every prefix is assigned a "scope" value: "host" is a local address,
   "link" is a direct route,
   [ ... "site" ... "interior" ... ]
   and "universe" is a true gateway route with global meaning.

   Every prefix refers to a set of "nexthop"s (gw, oif),
   where gw must have narrower scope. This recursion stops
   when gw has LOCAL scope or when "nexthop" is declared ONLINK,
   which means that gw is forced to be on link.

   The code is still hairy, but now it is apparently logically
   consistent and very flexible. F.e. as a by-product it allows
   independent exterior and interior routing protocols to coexist
   in peace.

   Normally it looks as follows (see the worked example below).

   {universe prefix}  -> (gw, oif) [scope link]
			  |
			  |-> {link prefix} -> (gw, oif) [scope local]
						|
						|-> {local prefix} (terminal node)
 */
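/*
 * Worked example (hypothetical addresses, for illustration only):
 * a route "default via 192.0.2.1" has universe scope, so its gateway
 * must resolve through a strictly narrower prefix, e.g. the link-scope
 * route "192.0.2.0/24 dev eth0"; that route's resolution in turn ends
 * at the host-scope local address of eth0, which terminates the
 * recursion performed by fib_check_nh() below.
 */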
static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
			struct fib_nh *nh)
{
	int err;
	struct net *net;

	net = cfg->fc_nlinfo.nl_net;
	if (nh->nh_gw) {
		struct fib_result res;

#ifdef CONFIG_IP_ROUTE_PERVASIVE
		if (nh->nh_flags&RTNH_F_PERVASIVE)
			return 0;
#endif
		if (nh->nh_flags&RTNH_F_ONLINK) {
			struct net_device *dev;

			if (cfg->fc_scope >= RT_SCOPE_LINK)
				return -EINVAL;
			if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
				return -EINVAL;
			if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL)
				return -ENODEV;
			if (!(dev->flags&IFF_UP))
				return -ENETDOWN;
			nh->nh_dev = dev;
			dev_hold(dev);
			nh->nh_scope = RT_SCOPE_LINK;
			return 0;
		}
		{
			struct flowi fl = {
				.nl_u = {
					.ip4_u = {
						.daddr = nh->nh_gw,
						.scope = cfg->fc_scope + 1,
					},
				},
				.oif = nh->nh_oif,
			};

			/* It is not necessary, but requires a bit of thinking */
			if (fl.fl4_scope < RT_SCOPE_LINK)
				fl.fl4_scope = RT_SCOPE_LINK;
			if ((err = fib_lookup(net, &fl, &res)) != 0)
				return err;
		}
		err = -EINVAL;
		if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
			goto out;
		nh->nh_scope = res.scope;
		nh->nh_oif = FIB_RES_OIF(res);
		if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
			goto out;
		dev_hold(nh->nh_dev);
		err = -ENETDOWN;
		if (!(nh->nh_dev->flags & IFF_UP))
			goto out;
		err = 0;
out:
		fib_res_put(&res);
		return err;
	} else {
		struct in_device *in_dev;

		if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
			return -EINVAL;

		in_dev = inetdev_by_index(net, nh->nh_oif);
		if (in_dev == NULL)
			return -ENODEV;
		if (!(in_dev->dev->flags&IFF_UP)) {
			in_dev_put(in_dev);
			return -ENETDOWN;
		}
		nh->nh_dev = in_dev->dev;
		dev_hold(nh->nh_dev);
		nh->nh_scope = RT_SCOPE_HOST;
		in_dev_put(in_dev);
	}
	return 0;
}
static inline unsigned int fib_laddr_hashfn(__be32 val)
{
	unsigned int mask = (fib_hash_size - 1);

	return ((__force u32)val ^
		((__force u32)val >> 7) ^
		((__force u32)val >> 14)) & mask;
}
static struct hlist_head *fib_hash_alloc(int bytes)
{
	if (bytes <= PAGE_SIZE)
		return kzalloc(bytes, GFP_KERNEL);
	else
		return (struct hlist_head *)
			__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(bytes));
}
static void fib_hash_free(struct hlist_head *hash, int bytes)
{
	if (!hash)
		return;

	if (bytes <= PAGE_SIZE)
		kfree(hash);
	else
		free_pages((unsigned long) hash, get_order(bytes));
}
static void fib_hash_move(struct hlist_head *new_info_hash,
			  struct hlist_head *new_laddrhash,
			  unsigned int new_size)
{
	struct hlist_head *old_info_hash, *old_laddrhash;
	unsigned int old_size = fib_hash_size;
	unsigned int i, bytes;

	spin_lock_bh(&fib_info_lock);
	old_info_hash = fib_info_hash;
	old_laddrhash = fib_info_laddrhash;
	fib_hash_size = new_size;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *head = &fib_info_hash[i];
		struct hlist_node *node, *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
			struct hlist_head *dest;
			unsigned int new_hash;

			hlist_del(&fi->fib_hash);

			new_hash = fib_info_hashfn(fi);
			dest = &new_info_hash[new_hash];
			hlist_add_head(&fi->fib_hash, dest);
		}
	}
	fib_info_hash = new_info_hash;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *lhead = &fib_info_laddrhash[i];
		struct hlist_node *node, *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
			struct hlist_head *ldest;
			unsigned int new_hash;

			hlist_del(&fi->fib_lhash);

			new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
			ldest = &new_laddrhash[new_hash];
			hlist_add_head(&fi->fib_lhash, ldest);
		}
	}
	fib_info_laddrhash = new_laddrhash;

	spin_unlock_bh(&fib_info_lock);

	bytes = old_size * sizeof(struct hlist_head *);
	fib_hash_free(old_info_hash, bytes);
	fib_hash_free(old_laddrhash, bytes);
}
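/*
 * Both rehash loops run under fib_info_lock with BHs disabled, so no
 * writer can observe the tables half-moved; the old arrays are only
 * freed after the lock has been dropped.
 */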
struct fib_info *fib_create_info(struct fib_config *cfg)
{
	int err;
	struct fib_info *fi = NULL;
	struct fib_info *ofi;
	int nhs = 1;
	struct net *net = cfg->fc_nlinfo.nl_net;

	/* Fast check to catch the most weird cases */
	if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
		goto err_inval;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (cfg->fc_mp) {
		nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
		if (nhs == 0)
			goto err_inval;
	}
#endif

	err = -ENOBUFS;
	if (fib_info_cnt >= fib_hash_size) {
		unsigned int new_size = fib_hash_size << 1;
		struct hlist_head *new_info_hash;
		struct hlist_head *new_laddrhash;
		unsigned int bytes;

		if (!new_size)
			new_size = 1;
		bytes = new_size * sizeof(struct hlist_head *);
		new_info_hash = fib_hash_alloc(bytes);
		new_laddrhash = fib_hash_alloc(bytes);
		if (!new_info_hash || !new_laddrhash) {
			fib_hash_free(new_info_hash, bytes);
			fib_hash_free(new_laddrhash, bytes);
		} else
			fib_hash_move(new_info_hash, new_laddrhash, new_size);

		if (!fib_hash_size)
			goto failure;
	}

	fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
	if (fi == NULL)
		goto failure;
	fib_info_cnt++;

	fi->fib_net = hold_net(net);
	fi->fib_protocol = cfg->fc_protocol;
	fi->fib_flags = cfg->fc_flags;
	fi->fib_priority = cfg->fc_priority;
	fi->fib_prefsrc = cfg->fc_prefsrc;

	fi->fib_nhs = nhs;
	change_nexthops(fi) {
		nh->nh_parent = fi;
	} endfor_nexthops(fi)

	if (cfg->fc_mx) {
		struct nlattr *nla;
		int remaining;

		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
			int type = nla_type(nla);

			if (type) {
				if (type > RTAX_MAX)
					goto err_inval;
				fi->fib_metrics[type - 1] = nla_get_u32(nla);
			}
		}
	}

	if (cfg->fc_mp) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
		if (err != 0)
			goto failure;
		if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
			goto err_inval;
		if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
			goto err_inval;
#ifdef CONFIG_NET_CLS_ROUTE
		if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
			goto err_inval;
#endif
#else
		goto err_inval;
#endif
	} else {
		struct fib_nh *nh = fi->fib_nh;

		nh->nh_oif = cfg->fc_oif;
		nh->nh_gw = cfg->fc_gw;
		nh->nh_flags = cfg->fc_flags;
#ifdef CONFIG_NET_CLS_ROUTE
		nh->nh_tclassid = cfg->fc_flow;
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		nh->nh_weight = 1;
#endif
	}

	if (fib_props[cfg->fc_type].error) {
		if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
			goto err_inval;
		goto link_it;
	}

	if (cfg->fc_scope > RT_SCOPE_HOST)
		goto err_inval;

	if (cfg->fc_scope == RT_SCOPE_HOST) {
		struct fib_nh *nh = fi->fib_nh;

		/* Local address is added. */
		if (nhs != 1 || nh->nh_gw)
			goto err_inval;
		nh->nh_scope = RT_SCOPE_NOWHERE;
		nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif);
		err = -ENODEV;
		if (nh->nh_dev == NULL)
			goto failure;
	} else {
		change_nexthops(fi) {
			if ((err = fib_check_nh(cfg, fi, nh)) != 0)
				goto failure;
		} endfor_nexthops(fi)
	}

	if (fi->fib_prefsrc) {
		if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
		    fi->fib_prefsrc != cfg->fc_dst)
			if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL)
				goto err_inval;
	}

link_it:
	if ((ofi = fib_find_info(fi)) != NULL) {
		fi->fib_dead = 1;
		free_fib_info(fi);
		ofi->fib_treeref++;
		return ofi;
	}

	fi->fib_treeref++;
	atomic_inc(&fi->fib_clntref);
	spin_lock_bh(&fib_info_lock);
	hlist_add_head(&fi->fib_hash,
		       &fib_info_hash[fib_info_hashfn(fi)]);
	if (fi->fib_prefsrc) {
		struct hlist_head *head;

		head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
		hlist_add_head(&fi->fib_lhash, head);
	}
	change_nexthops(fi) {
		struct hlist_head *head;
		unsigned int hash;

		if (!nh->nh_dev)
			continue;
		hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
		head = &fib_info_devhash[hash];
		hlist_add_head(&nh->nh_hash, head);
	} endfor_nexthops(fi)
	spin_unlock_bh(&fib_info_lock);
	return fi;

err_inval:
	err = -EINVAL;

failure:
	if (fi) {
		fi->fib_dead = 1;
		free_fib_info(fi);
	}

	return ERR_PTR(err);
}
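/*
 * fib_create_info() thus either returns an existing shared fib_info
 * (found via fib_find_info()) with its tree refcount bumped, a freshly
 * linked one, or an ERR_PTR() on failure; callers never see a
 * half-initialized record.
 */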
/* Note! fib_semantic_match intentionally uses RCU list functions. */
int fib_semantic_match(struct list_head *head, const struct flowi *flp,
		       struct fib_result *res, __be32 zone, __be32 mask,
		       int prefixlen)
{
	struct fib_alias *fa;
	int nh_sel = 0;

	list_for_each_entry_rcu(fa, head, fa_list) {
		int err;

		if (fa->fa_tos &&
		    fa->fa_tos != flp->fl4_tos)
			continue;

		if (fa->fa_scope < flp->fl4_scope)
			continue;

		fa->fa_state |= FA_S_ACCESSED;

		err = fib_props[fa->fa_type].error;
		if (err == 0) {
			struct fib_info *fi = fa->fa_info;

			if (fi->fib_flags & RTNH_F_DEAD)
				continue;

			switch (fa->fa_type) {
			case RTN_UNICAST:
			case RTN_LOCAL:
			case RTN_BROADCAST:
			case RTN_ANYCAST:
			case RTN_MULTICAST:
				for_nexthops(fi) {
					if (nh->nh_flags&RTNH_F_DEAD)
						continue;
					if (!flp->oif || flp->oif == nh->nh_oif)
						break;
				}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
				if (nhsel < fi->fib_nhs) {
					nh_sel = nhsel;
					goto out_fill_res;
				}
#else
				if (nhsel < 1)
					goto out_fill_res;
#endif
				endfor_nexthops(fi);
				continue;

			default:
				printk(KERN_WARNING "fib_semantic_match bad type %#x\n",
				       fa->fa_type);
				return -EINVAL;
			}
		}
		return err;
	}
	return 1;

out_fill_res:
	res->prefixlen = prefixlen;
	res->nh_sel = nh_sel;
	res->type = fa->fa_type;
	res->scope = fa->fa_scope;
	res->fi = fa->fa_info;
	atomic_inc(&res->fi->fib_clntref);
	return 0;
}
/* Find the appropriate source address for this destination */

__be32 __fib_res_prefsrc(struct fib_result *res)
{
	return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
}
int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
		  u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos,
		  struct fib_info *fi, unsigned int flags)
{
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET;
	rtm->rtm_dst_len = dst_len;
	rtm->rtm_src_len = 0;
	rtm->rtm_tos = tos;
	if (tb_id < 256)
		rtm->rtm_table = tb_id;
	else
		rtm->rtm_table = RT_TABLE_COMPAT;
	NLA_PUT_U32(skb, RTA_TABLE, tb_id);
	rtm->rtm_type = type;
	rtm->rtm_flags = fi->fib_flags;
	rtm->rtm_scope = scope;
	rtm->rtm_protocol = fi->fib_protocol;

	if (rtm->rtm_dst_len)
		NLA_PUT_BE32(skb, RTA_DST, dst);

	if (fi->fib_priority)
		NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);

	if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
		goto nla_put_failure;

	if (fi->fib_prefsrc)
		NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);

	if (fi->fib_nhs == 1) {
		if (fi->fib_nh->nh_gw)
			NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw);

		if (fi->fib_nh->nh_oif)
			NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
#ifdef CONFIG_NET_CLS_ROUTE
		if (fi->fib_nh[0].nh_tclassid)
			NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
#endif
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (fi->fib_nhs > 1) {
		struct rtnexthop *rtnh;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (mp == NULL)
			goto nla_put_failure;

		for_nexthops(fi) {
			rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
			if (rtnh == NULL)
				goto nla_put_failure;

			rtnh->rtnh_flags = nh->nh_flags & 0xFF;
			rtnh->rtnh_hops = nh->nh_weight - 1;
			rtnh->rtnh_ifindex = nh->nh_oif;

			if (nh->nh_gw)
				NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
#ifdef CONFIG_NET_CLS_ROUTE
			if (nh->nh_tclassid)
				NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
#endif
			/* length of rtnetlink header + attributes */
			rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
		} endfor_nexthops(fi);

		nla_nest_end(skb, mp);
	}
#endif

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
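/*
 * The NLA_PUT_* macros used above hide a "goto nla_put_failure" on
 * overflow, which is why the function ends with that label even though
 * no explicit jump to it appears for most attributes.
 */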
/*
   Update FIB if:
   - local address disappeared -> we must delete all the entries
     referring to it.
   - device went down -> we must shut down all nexthops going via it.
 */
int fib_sync_down_addr(struct net *net, __be32 local)
{
	int ret = 0;
	unsigned int hash = fib_laddr_hashfn(local);
	struct hlist_head *head = &fib_info_laddrhash[hash];
	struct hlist_node *node;
	struct fib_info *fi;

	if (fib_info_laddrhash == NULL || local == 0)
		return 0;

	hlist_for_each_entry(fi, node, head, fib_lhash) {
		if (fi->fib_net != net)
			continue;
		if (fi->fib_prefsrc == local) {
			fi->fib_flags |= RTNH_F_DEAD;
			ret++;
		}
	}
	return ret;
}
int fib_sync_down_dev(struct net_device *dev, int force)
{
	int ret = 0;
	int scope = RT_SCOPE_NOWHERE;
	struct fib_info *prev_fi = NULL;
	unsigned int hash = fib_devindex_hashfn(dev->ifindex);
	struct hlist_head *head = &fib_info_devhash[hash];
	struct hlist_node *node;
	struct fib_nh *nh;

	if (force)
		scope = -1;

	hlist_for_each_entry(nh, node, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int dead;

		BUG_ON(!fi->fib_nhs);
		if (nh->nh_dev != dev || fi == prev_fi)
			continue;
		prev_fi = fi;
		dead = 0;
		change_nexthops(fi) {
			if (nh->nh_flags&RTNH_F_DEAD)
				dead++;
			else if (nh->nh_dev == dev &&
				 nh->nh_scope != scope) {
				nh->nh_flags |= RTNH_F_DEAD;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
				spin_lock_bh(&fib_multipath_lock);
				fi->fib_power -= nh->nh_power;
				nh->nh_power = 0;
				spin_unlock_bh(&fib_multipath_lock);
#endif
				dead++;
			}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
			if (force > 1 && nh->nh_dev == dev) {
				dead = fi->fib_nhs;
				break;
			}
#endif
		} endfor_nexthops(fi)
		if (dead == fi->fib_nhs) {
			fi->fib_flags |= RTNH_F_DEAD;
			ret++;
		}
	}

	return ret;
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH

/*
   A dead device goes up. We wake up dead nexthops.
   This makes sense only on multipath routes.
 */

int fib_sync_up(struct net_device *dev)
{
	struct fib_info *prev_fi;
	unsigned int hash;
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_nh *nh;
	int ret;

	if (!(dev->flags&IFF_UP))
		return 0;

	prev_fi = NULL;
	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
	ret = 0;

	hlist_for_each_entry(nh, node, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int alive;

		BUG_ON(!fi->fib_nhs);
		if (nh->nh_dev != dev || fi == prev_fi)
			continue;

		prev_fi = fi;
		alive = 0;
		change_nexthops(fi) {
			if (!(nh->nh_flags&RTNH_F_DEAD)) {
				alive++;
				continue;
			}
			if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
				continue;
			if (nh->nh_dev != dev || !__in_dev_get_rtnl(dev))
				continue;
			alive++;
			spin_lock_bh(&fib_multipath_lock);
			nh->nh_power = 0;
			nh->nh_flags &= ~RTNH_F_DEAD;
			spin_unlock_bh(&fib_multipath_lock);
		} endfor_nexthops(fi)

		if (alive > 0) {
			fi->fib_flags &= ~RTNH_F_DEAD;
			ret++;
		}
	}

	return ret;
}
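/*
 * Worked example for fib_select_multipath() below (illustrative
 * numbers): with two live nexthops of weights 2 and 1, fib_power is
 * recharged to 3; each selection debits one unit of nh_power from the
 * chosen nexthop, so within every recharge cycle the first nexthop is
 * picked twice and the second once, giving a 2:1 weighted distribution.
 */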
/*
   The algorithm is suboptimal, but it provides really
   fair weighted route distribution.
 */

void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
{
	struct fib_info *fi = res->fi;
	int w;

	spin_lock_bh(&fib_multipath_lock);
	if (fi->fib_power <= 0) {
		int power = 0;
		change_nexthops(fi) {
			if (!(nh->nh_flags&RTNH_F_DEAD)) {
				power += nh->nh_weight;
				nh->nh_power = nh->nh_weight;
			}
		} endfor_nexthops(fi);
		fi->fib_power = power;
		if (power <= 0) {
			spin_unlock_bh(&fib_multipath_lock);
			/* Race condition: route has just become dead. */
			res->nh_sel = 0;
			return;
		}
	}

	/* w should be a random number in [0..fi->fib_power-1];
	   jiffies is a pretty bad approximation.
	 */

	w = jiffies % fi->fib_power;

	change_nexthops(fi) {
		if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
			if ((w -= nh->nh_power) <= 0) {
				nh->nh_power--;
				fi->fib_power--;
				res->nh_sel = nhsel;
				spin_unlock_bh(&fib_multipath_lock);
				return;
			}
		}
	} endfor_nexthops(fi);

	/* Race condition: route has just become dead. */
	res->nh_sel = 0;
	spin_unlock_bh(&fib_multipath_lock);
}
#endif