5 * Kazunori MIYAZAWA @USAGI
6 * YOSHIFUJI Hideaki @USAGI
7 * Split up af-specific portion
11 #include <linux/err.h>
12 #include <linux/kernel.h>
13 #include <linux/inetdevice.h>
18 static struct dst_ops xfrm4_dst_ops
;
19 static struct xfrm_policy_afinfo xfrm4_policy_afinfo
;
21 static struct dst_entry
*xfrm4_dst_lookup(struct net
*net
, int tos
,
22 xfrm_address_t
*saddr
,
23 xfrm_address_t
*daddr
)
33 struct dst_entry
*dst
;
38 fl
.fl4_src
= saddr
->a4
;
40 err
= __ip_route_output_key(net
, &rt
, &fl
);
47 static int xfrm4_get_saddr(struct net
*net
,
48 xfrm_address_t
*saddr
, xfrm_address_t
*daddr
)
50 struct dst_entry
*dst
;
53 dst
= xfrm4_dst_lookup(net
, 0, NULL
, daddr
);
57 rt
= (struct rtable
*)dst
;
58 saddr
->a4
= rt
->rt_src
;
63 static struct dst_entry
*
64 __xfrm4_find_bundle(struct flowi
*fl
, struct xfrm_policy
*policy
)
66 struct dst_entry
*dst
;
68 read_lock_bh(&policy
->lock
);
69 for (dst
= policy
->bundles
; dst
; dst
= dst
->next
) {
70 struct xfrm_dst
*xdst
= (struct xfrm_dst
*)dst
;
71 if (xdst
->u
.rt
.fl
.oif
== fl
->oif
&& /*XXX*/
72 xdst
->u
.rt
.fl
.fl4_dst
== fl
->fl4_dst
&&
73 xdst
->u
.rt
.fl
.fl4_src
== fl
->fl4_src
&&
74 xdst
->u
.rt
.fl
.fl4_tos
== fl
->fl4_tos
&&
75 xfrm_bundle_ok(policy
, xdst
, fl
, AF_INET
, 0)) {
80 read_unlock_bh(&policy
->lock
);
84 static int xfrm4_get_tos(struct flowi
*fl
)
89 static int xfrm4_init_path(struct xfrm_dst
*path
, struct dst_entry
*dst
,
95 static int xfrm4_fill_dst(struct xfrm_dst
*xdst
, struct net_device
*dev
)
97 struct rtable
*rt
= (struct rtable
*)xdst
->route
;
99 xdst
->u
.rt
.fl
= rt
->fl
;
101 xdst
->u
.dst
.dev
= dev
;
104 xdst
->u
.rt
.idev
= in_dev_get(dev
);
105 if (!xdst
->u
.rt
.idev
)
108 xdst
->u
.rt
.peer
= rt
->peer
;
110 atomic_inc(&rt
->peer
->refcnt
);
112 /* Sheit... I remember I did this right. Apparently,
113 * it was magically lost, so this code needs audit */
114 xdst
->u
.rt
.rt_flags
= rt
->rt_flags
& (RTCF_BROADCAST
| RTCF_MULTICAST
|
116 xdst
->u
.rt
.rt_type
= rt
->rt_type
;
117 xdst
->u
.rt
.rt_src
= rt
->rt_src
;
118 xdst
->u
.rt
.rt_dst
= rt
->rt_dst
;
119 xdst
->u
.rt
.rt_gateway
= rt
->rt_gateway
;
120 xdst
->u
.rt
.rt_spec_dst
= rt
->rt_spec_dst
;
126 _decode_session4(struct sk_buff
*skb
, struct flowi
*fl
, int reverse
)
128 struct iphdr
*iph
= ip_hdr(skb
);
129 u8
*xprth
= skb_network_header(skb
) + iph
->ihl
* 4;
131 memset(fl
, 0, sizeof(struct flowi
));
132 if (!(iph
->frag_off
& htons(IP_MF
| IP_OFFSET
))) {
133 switch (iph
->protocol
) {
135 case IPPROTO_UDPLITE
:
139 if (xprth
+ 4 < skb
->data
||
140 pskb_may_pull(skb
, xprth
+ 4 - skb
->data
)) {
141 __be16
*ports
= (__be16
*)xprth
;
143 fl
->fl_ip_sport
= ports
[!!reverse
];
144 fl
->fl_ip_dport
= ports
[!reverse
];
149 if (pskb_may_pull(skb
, xprth
+ 2 - skb
->data
)) {
152 fl
->fl_icmp_type
= icmp
[0];
153 fl
->fl_icmp_code
= icmp
[1];
158 if (pskb_may_pull(skb
, xprth
+ 4 - skb
->data
)) {
159 __be32
*ehdr
= (__be32
*)xprth
;
161 fl
->fl_ipsec_spi
= ehdr
[0];
166 if (pskb_may_pull(skb
, xprth
+ 8 - skb
->data
)) {
167 __be32
*ah_hdr
= (__be32
*)xprth
;
169 fl
->fl_ipsec_spi
= ah_hdr
[1];
174 if (pskb_may_pull(skb
, xprth
+ 4 - skb
->data
)) {
175 __be16
*ipcomp_hdr
= (__be16
*)xprth
;
177 fl
->fl_ipsec_spi
= htonl(ntohs(ipcomp_hdr
[1]));
181 fl
->fl_ipsec_spi
= 0;
185 fl
->proto
= iph
->protocol
;
186 fl
->fl4_dst
= reverse
? iph
->saddr
: iph
->daddr
;
187 fl
->fl4_src
= reverse
? iph
->daddr
: iph
->saddr
;
188 fl
->fl4_tos
= iph
->tos
;
191 static inline int xfrm4_garbage_collect(struct dst_ops
*ops
)
193 xfrm4_policy_afinfo
.garbage_collect(&init_net
);
194 return (atomic_read(&xfrm4_dst_ops
.entries
) > xfrm4_dst_ops
.gc_thresh
*2);
197 static void xfrm4_update_pmtu(struct dst_entry
*dst
, u32 mtu
)
199 struct xfrm_dst
*xdst
= (struct xfrm_dst
*)dst
;
200 struct dst_entry
*path
= xdst
->route
;
202 path
->ops
->update_pmtu(path
, mtu
);
205 static void xfrm4_dst_destroy(struct dst_entry
*dst
)
207 struct xfrm_dst
*xdst
= (struct xfrm_dst
*)dst
;
209 if (likely(xdst
->u
.rt
.idev
))
210 in_dev_put(xdst
->u
.rt
.idev
);
211 if (likely(xdst
->u
.rt
.peer
))
212 inet_putpeer(xdst
->u
.rt
.peer
);
213 xfrm_dst_destroy(xdst
);
216 static void xfrm4_dst_ifdown(struct dst_entry
*dst
, struct net_device
*dev
,
219 struct xfrm_dst
*xdst
;
224 xdst
= (struct xfrm_dst
*)dst
;
225 if (xdst
->u
.rt
.idev
->dev
== dev
) {
226 struct in_device
*loopback_idev
=
227 in_dev_get(dev_net(dev
)->loopback_dev
);
228 BUG_ON(!loopback_idev
);
231 in_dev_put(xdst
->u
.rt
.idev
);
232 xdst
->u
.rt
.idev
= loopback_idev
;
233 in_dev_hold(loopback_idev
);
234 xdst
= (struct xfrm_dst
*)xdst
->u
.dst
.child
;
235 } while (xdst
->u
.dst
.xfrm
);
237 __in_dev_put(loopback_idev
);
240 xfrm_dst_ifdown(dst
, dev
);
243 static struct dst_ops xfrm4_dst_ops
= {
245 .protocol
= cpu_to_be16(ETH_P_IP
),
246 .gc
= xfrm4_garbage_collect
,
247 .update_pmtu
= xfrm4_update_pmtu
,
248 .destroy
= xfrm4_dst_destroy
,
249 .ifdown
= xfrm4_dst_ifdown
,
250 .local_out
= __ip_local_out
,
252 .entries
= ATOMIC_INIT(0),
255 static struct xfrm_policy_afinfo xfrm4_policy_afinfo
= {
257 .dst_ops
= &xfrm4_dst_ops
,
258 .dst_lookup
= xfrm4_dst_lookup
,
259 .get_saddr
= xfrm4_get_saddr
,
260 .find_bundle
= __xfrm4_find_bundle
,
261 .decode_session
= _decode_session4
,
262 .get_tos
= xfrm4_get_tos
,
263 .init_path
= xfrm4_init_path
,
264 .fill_dst
= xfrm4_fill_dst
,
268 static struct ctl_table xfrm4_policy_table
[] = {
270 .ctl_name
= CTL_UNNUMBERED
,
271 .procname
= "xfrm4_gc_thresh",
272 .data
= &xfrm4_dst_ops
.gc_thresh
,
273 .maxlen
= sizeof(int),
275 .proc_handler
= proc_dointvec
,
280 static struct ctl_table_header
*sysctl_hdr
;
283 static void __init
xfrm4_policy_init(void)
285 xfrm_policy_register_afinfo(&xfrm4_policy_afinfo
);
288 static void __exit
xfrm4_policy_fini(void)
292 unregister_net_sysctl_table(sysctl_hdr
);
294 xfrm_policy_unregister_afinfo(&xfrm4_policy_afinfo
);
297 void __init
xfrm4_init(int rt_max_size
)
302 * Select a default value for the gc_thresh based on the main route
303 * table hash size. It seems to me the worst case scenario is when
304 * we have ipsec operating in transport mode, in which we create a
305 * dst_entry per socket. The xfrm gc algorithm starts trying to remove
306 * entries at gc_thresh, and prevents new allocations at 2*gc_thresh,
307 * so let's set an initial xfrm gc_thresh value at rt_max_size/2.
308 * That will let us store an ipsec connection per route table entry,
309 * and start cleaning when we're 1/2 full.
311 xfrm4_dst_ops
.gc_thresh
= rt_max_size
/2;
313 sysctl_hdr
= register_net_sysctl_table(&init_net
, net_ipv4_ctl_path
,