/*
 * xfrm4_policy.c
 *
 * Changes:
 *	Kazunori MIYAZAWA @USAGI
 *	YOSHIFUJI Hideaki @USAGI
 *		Split up af-specific portion
 *
 */
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/inetdevice.h>
#include <linux/if_tunnel.h>
#include <net/dst.h>
#include <net/xfrm.h>
#include <net/ip.h>
static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
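/* Core route lookup: resolve an IPv4 route for @daddr (and @saddr, if
 * given) with the requested TOS, filling in @fl4 as a side effect so
 * callers can read back the source address the routing table chose. */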
static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
					    int tos,
					    const xfrm_address_t *saddr,
					    const xfrm_address_t *daddr)
{
	struct rtable *rt;

	memset(fl4, 0, sizeof(*fl4));
	fl4->daddr = daddr->a4;
	fl4->flowi4_tos = tos;
	if (saddr)
		fl4->saddr = saddr->a4;

	rt = __ip_route_output_key(net, fl4);
	if (!IS_ERR(rt))
		return &rt->dst;

	return ERR_CAST(rt);
}
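/* xfrm_policy_afinfo dst_lookup hook: uses a throwaway flowi4 on the
 * stack since callers here only need the resulting dst_entry. */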
static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos,
					  const xfrm_address_t *saddr,
					  const xfrm_address_t *daddr)
{
	struct flowi4 fl4;

	return __xfrm4_dst_lookup(net, &fl4, tos, saddr, daddr);
}
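/* Choose a local source address for reaching @daddr by doing a route
 * lookup with an unspecified source and reading back the routing
 * code's pick from the flowi4. */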
static int xfrm4_get_saddr(struct net *net,
			   xfrm_address_t *saddr, xfrm_address_t *daddr)
{
	struct dst_entry *dst;
	struct flowi4 fl4;

	dst = __xfrm4_dst_lookup(net, &fl4, 0, NULL, daddr);
	if (IS_ERR(dst))
		return -EHOSTUNREACH;

	saddr->a4 = fl4.saddr;
	dst_release(dst);
	return 0;
}
static int xfrm4_get_tos(const struct flowi *fl)
{
	return IPTOS_RT_MASK & fl->u.ip4.flowi4_tos; /* Strip ECN bits */
}
static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst,
			   int nfheader_len)
{
	return 0;
}
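/* Populate the bundle's embedded rtable from the inner IPv4 route so
 * the xfrm dst can stand in for an ordinary cached route: the
 * routing-cache keys come from the flow, the rest from the route. */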
static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
			  const struct flowi *fl)
{
	struct rtable *rt = (struct rtable *)xdst->route;
	const struct flowi4 *fl4 = &fl->u.ip4;

	xdst->u.rt.rt_key_dst = fl4->daddr;
	xdst->u.rt.rt_key_src = fl4->saddr;
	xdst->u.rt.rt_key_tos = fl4->flowi4_tos;
	xdst->u.rt.rt_route_iif = fl4->flowi4_iif;
	xdst->u.rt.rt_iif = fl4->flowi4_iif;
	xdst->u.rt.rt_oif = fl4->flowi4_oif;
	xdst->u.rt.rt_mark = fl4->flowi4_mark;

	xdst->u.dst.dev = dev;
	dev_hold(dev);

	xdst->u.rt.peer = rt->peer;
	if (rt->peer)
		atomic_inc(&rt->peer->refcnt);

	/* Sheit... I remember I did this right. Apparently,
	 * it was magically lost, so this code needs audit */
	xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST |
					      RTCF_LOCAL);
	xdst->u.rt.rt_type = rt->rt_type;
	xdst->u.rt.rt_src = rt->rt_src;
	xdst->u.rt.rt_dst = rt->rt_dst;
	xdst->u.rt.rt_gateway = rt->rt_gateway;
	xdst->u.rt.rt_spec_dst = rt->rt_spec_dst;

	return 0;
}
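/* Build a flow selector from a packet for xfrm policy lookup.  The
 * transport header is parsed per protocol to pick up ports, ICMP
 * type/code, the IPsec SPI or the GRE key; @reverse swaps the src/dst
 * orientation so the same helper serves both directions. */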
static void
_decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
{
	const struct iphdr *iph = ip_hdr(skb);
	u8 *xprth = skb_network_header(skb) + iph->ihl * 4;
	struct flowi4 *fl4 = &fl->u.ip4;

	memset(fl4, 0, sizeof(struct flowi4));
	fl4->flowi4_mark = skb->mark;

	if (!ip_is_fragment(iph)) {
		switch (iph->protocol) {
		case IPPROTO_UDP:
		case IPPROTO_UDPLITE:
		case IPPROTO_TCP:
		case IPPROTO_SCTP:
		case IPPROTO_DCCP:
			if (xprth + 4 < skb->data ||
			    pskb_may_pull(skb, xprth + 4 - skb->data)) {
				__be16 *ports = (__be16 *)xprth;

				fl4->fl4_sport = ports[!!reverse];
				fl4->fl4_dport = ports[!reverse];
			}
			break;

		case IPPROTO_ICMP:
			if (pskb_may_pull(skb, xprth + 2 - skb->data)) {
				u8 *icmp = xprth;

				fl4->fl4_icmp_type = icmp[0];
				fl4->fl4_icmp_code = icmp[1];
			}
			break;

		case IPPROTO_ESP:
			if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
				__be32 *ehdr = (__be32 *)xprth;

				fl4->fl4_ipsec_spi = ehdr[0];
			}
			break;

		case IPPROTO_AH:
			if (pskb_may_pull(skb, xprth + 8 - skb->data)) {
				__be32 *ah_hdr = (__be32 *)xprth;

				fl4->fl4_ipsec_spi = ah_hdr[1];
			}
			break;

		case IPPROTO_COMP:
			if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
				__be16 *ipcomp_hdr = (__be16 *)xprth;

				fl4->fl4_ipsec_spi = htonl(ntohs(ipcomp_hdr[1]));
			}
			break;

		case IPPROTO_GRE:
			if (pskb_may_pull(skb, xprth + 12 - skb->data)) {
				__be16 *greflags = (__be16 *)xprth;
				__be32 *gre_hdr = (__be32 *)xprth;

				if (greflags[0] & GRE_KEY) {
					if (greflags[0] & GRE_CSUM)
						gre_hdr++;
					fl4->fl4_gre_key = gre_hdr[1];
				}
			}
			break;

		default:
			fl4->fl4_ipsec_spi = 0;
			break;
		}
	}
	fl4->flowi4_proto = iph->protocol;
	fl4->daddr = reverse ? iph->saddr : iph->daddr;
	fl4->saddr = reverse ? iph->daddr : iph->saddr;
	fl4->flowi4_tos = iph->tos;
}
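/* Called from dst_alloc() under cache pressure: run the generic xfrm
 * garbage collector, then report whether allocation should still fail
 * (cached entries remain above 2 * gc_thresh after the purge). */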
static inline int xfrm4_garbage_collect(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops);

	xfrm4_policy_afinfo.garbage_collect(net);
	return (dst_entries_get_slow(ops) > ops->gc_thresh * 2);
}
static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
	struct dst_entry *path = xdst->route;

	path->ops->update_pmtu(path, mtu);
}
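/* Drop everything the bundle pinned: generic metrics, the inet_peer
 * reference taken in xfrm4_fill_dst(), then the xfrm dst state itself. */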
static void xfrm4_dst_destroy(struct dst_entry *dst)
{
	struct xfrm_dst *xdst = (struct xfrm_dst *)dst;

	dst_destroy_metrics_generic(dst);

	if (likely(xdst->u.rt.peer))
		inet_putpeer(xdst->u.rt.peer);

	xfrm_dst_destroy(xdst);
}
static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			     int unregister)
{
	if (!unregister)
		return;

	xfrm_dst_ifdown(dst, dev);
}
static struct dst_ops xfrm4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			xfrm4_garbage_collect,
	.update_pmtu =		xfrm4_update_pmtu,
	.cow_metrics =		dst_cow_metrics_generic,
	.destroy =		xfrm4_dst_destroy,
	.ifdown =		xfrm4_dst_ifdown,
	.local_out =		__ip_local_out,
	.gc_thresh =		1024,
};
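/* Glue table handing the IPv4-specific operations above to the generic
 * xfrm policy layer. */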
static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
	.family =		AF_INET,
	.dst_ops =		&xfrm4_dst_ops,
	.dst_lookup =		xfrm4_dst_lookup,
	.get_saddr =		xfrm4_get_saddr,
	.decode_session =	_decode_session4,
	.get_tos =		xfrm4_get_tos,
	.init_path =		xfrm4_init_path,
	.fill_dst =		xfrm4_fill_dst,
	.blackhole_route =	ipv4_blackhole_route,
};
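/* Expose gc_thresh of the initial namespace as the net.ipv4
 * "xfrm4_gc_thresh" sysctl, writable at runtime. */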
#ifdef CONFIG_SYSCTL
static struct ctl_table xfrm4_policy_table[] = {
	{
		.procname	= "xfrm4_gc_thresh",
		.data		= &init_net.xfrm.xfrm4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static struct ctl_table_header *sysctl_hdr;
#endif
static void __init xfrm4_policy_init(void)
{
	xfrm_policy_register_afinfo(&xfrm4_policy_afinfo);
}
static void __exit xfrm4_policy_fini(void)
{
#ifdef CONFIG_SYSCTL
	if (sysctl_hdr)
		unregister_net_sysctl_table(sysctl_hdr);
#endif
	xfrm_policy_unregister_afinfo(&xfrm4_policy_afinfo);
}
void __init xfrm4_init(int rt_max_size)
{
	/*
	 * Select a default value for the gc_thresh based on the main route
	 * table hash size.  The worst case scenario is when we have ipsec
	 * operating in transport mode, in which case we create a dst_entry
	 * per socket.  The xfrm gc algorithm starts trying to remove entries
	 * at gc_thresh and prevents new allocations at 2*gc_thresh, so let's
	 * set the initial xfrm gc_thresh value at rt_max_size/2.  That lets
	 * us store one ipsec connection per route table entry and start
	 * cleaning when we're 1/2 full.
	 */
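	/* Worked example: with rt_max_size == 1048576 route entries,
	 * gc_thresh becomes 524288, so garbage collection starts once
	 * 524288 bundles are cached and new allocations fail beyond
	 * 2 * gc_thresh == 1048576. */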
	xfrm4_dst_ops.gc_thresh = rt_max_size/2;
	dst_entries_init(&xfrm4_dst_ops);

	xfrm4_state_init();
	xfrm4_policy_init();
#ifdef CONFIG_SYSCTL
	sysctl_hdr = register_net_sysctl_table(&init_net, net_ipv4_ctl_path,
					       xfrm4_policy_table);
#endif
}