2 * ip_vs_xmit.c: various packet transmitters for IPVS
4 * Version: $Id: ip_vs_xmit.c,v 1.2 2002/11/30 01:50:35 wensong Exp $
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 * Julian Anastasov <ja@ssi.bg>
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
18 #include <linux/config.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/compiler.h>
23 #include <linux/tcp.h> /* for tcphdr */
24 #include <net/tcp.h> /* for csum_tcpudp_magic */
26 #include <net/icmp.h> /* for icmp_send */
27 #include <net/route.h> /* for ip_route_output */
28 #include <linux/netfilter.h>
29 #include <linux/netfilter_ipv4.h>
31 #include <net/ip_vs.h>
35 * Destination cache to speed up outgoing route lookup
/*
 * Replace the route cached on `dest` with `dst` and record the TOS value
 * (`rtos`) that the new entry belongs to.  The previous cache entry is
 * captured in old_dst before it is overwritten.
 *
 * NOTE(review): this listing is missing lines (the embedded numbering jumps
 * from 44 to 48), so what is finally done with old_dst is not visible here
 * -- presumably it is released; confirm against the complete file.
 */
38 __ip_vs_dst_set(struct ip_vs_dest
*dest
, u32 rtos
, struct dst_entry
*dst
)
40 struct dst_entry
*old_dst
;
42 old_dst
= dest
->dst_cache
;
43 dest
->dst_cache
= dst
;
44 dest
->dst_rtos
= rtos
;
/*
 * Look at the route cached on `dest` and decide whether it is still usable
 * for the requested TOS (`rtos`).
 *
 * The entry is re-validated through dst->ops->check(dst, cookie) only when
 * it is flagged obsolete or was cached for a different TOS; the branch taken
 * when that check yields NULL is not visible in this listing.
 *
 * NOTE(review): lines are missing between the declaration and the test
 * (embedded numbering 51 -> 55).  dst is dereferenced in the test, so a
 * NULL guard presumably sits in the gap -- confirm against the full file.
 */
48 static inline struct dst_entry
*
49 __ip_vs_dst_check(struct ip_vs_dest
*dest
, u32 rtos
, u32 cookie
)
51 struct dst_entry
*dst
= dest
->dst_cache
;
/* Only consult ops->check() for an obsolete entry or a TOS mismatch. */
55 if ((dst
->obsolete
|| rtos
!= dest
->dst_rtos
) &&
56 dst
->ops
->check(dst
, cookie
) == NULL
) {
/*
 * Obtain the output route for connection `cp` at the given TOS (`rtos`).
 *
 * Two lookup sequences are visible:
 *  - with dest->dst_lock held, the per-destination cache is tried first via
 *    __ip_vs_dst_check(); on a miss ip_route_output_key() is called, and on
 *    success a dst_clone() of the new route is stored with
 *    __ip_vs_dst_set();
 *  - a second ip_route_output_key() call with no locking around it, whose
 *    error message reports cp->daddr.
 *
 * NOTE(review): the flow-key initialisers, the condition that selects
 * between the two sequences, and all return statements are missing from
 * this listing; presumably the second sequence is used when the connection
 * has no destination -- confirm against the full file.
 */
64 static inline struct rtable
*
65 __ip_vs_get_out_rt(struct ip_vs_conn
*cp
, u32 rtos
)
67 struct rtable
*rt
; /* Route to the other host */
68 struct ip_vs_dest
*dest
= cp
->dest
;
/* The cache lookup and any refill happen under the destination's lock. */
71 spin_lock(&dest
->dst_lock
);
72 if (!(rt
= (struct rtable
*)
73 __ip_vs_dst_check(dest
, rtos
, 0))) {
81 .proto
= cp
->protocol
,
84 if (ip_route_output_key(&rt
, &fl
)) {
/* Lookup failed: drop the lock before logging. */
85 spin_unlock(&dest
->dst_lock
);
86 IP_VS_DBG_RL("ip_route_output error, "
87 "dest: %u.%u.%u.%u\n",
/* Cache a cloned reference to the freshly looked-up route. */
91 __ip_vs_dst_set(dest
, rtos
, dst_clone(&rt
->u
.dst
));
92 IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n",
94 atomic_read(&rt
->u
.dst
.__refcnt
), rtos
);
96 spin_unlock(&dest
->dst_lock
);
105 .proto
= cp
->protocol
,
108 if (ip_route_output_key(&rt
, &fl
)) {
109 IP_VS_DBG_RL("ip_route_output error, dest: "
110 "%u.%u.%u.%u\n", NIPQUAD(cp
->daddr
));
120 * Release dest->dst_cache before a dest is removed
/*
 * Detach the cached route from `dest`: the current dest->dst_cache is
 * remembered, the field is cleared to NULL, and the old entry is then
 * passed to dst_release().
 */
123 ip_vs_dst_reset(struct ip_vs_dest
*dest
)
125 struct dst_entry
*old_dst
;
127 old_dst
= dest
->dst_cache
;
128 dest
->dst_cache
= NULL
;
129 dst_release(old_dst
);
/*
 * Make sure `skb` has room in front of its data and can be modified, then
 * refresh the caller's header pointers.
 *
 * delta is the shortfall between the wanted headroom (at least 16 bytes,
 * or `headroom` if larger) and what the skb currently has.  When delta is
 * non-zero or the skb is cloned, the head is reallocated with
 * pskb_expand_head(); afterwards *iph_p is reloaded from skb->nh.iph and
 * *t_p is set to the byte that follows the IP header (ihl * 4 bytes in).
 *
 * NOTE(review): lines are missing between the delta computation and the
 * test (embedded numbering 137 -> 142), as are the return statements, so
 * any adjustment of delta and the returned values are not visible here.
 */
134 ip_vs_skb_cow(struct sk_buff
*skb
, unsigned int headroom
,
135 struct iphdr
**iph_p
, unsigned char **t_p
)
137 int delta
= (headroom
> 16 ? headroom
: 16) - skb_headroom(skb
);
/* (delta+15)&~15 rounds the extra headroom up to a multiple of 16. */
142 if (delta
||skb_cloned(skb
)) {
143 if (pskb_expand_head(skb
, (delta
+15)&~15, 0, GFP_ATOMIC
))
146 /* skb data changed, update pointers */
147 *iph_p
= skb
->nh
.iph
;
148 *t_p
= (char*) (*iph_p
) + (*iph_p
)->ihl
* 4;
/*
 * Common transmit step: sets the NFC_IPVS_PROPERTY bit in skb->nfcache and
 * then passes the packet through NF_HOOK() at NF_IP_LOCAL_OUT with the
 * route's device as output device and dst_output as the continuation.
 *
 * NOTE(review): the lines that open and close the macro body are missing
 * from this listing.
 */
154 #define IP_VS_XMIT(skb, rt) \
156 skb->nfcache |= NFC_IPVS_PROPERTY; \
157 NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, \
158 rt->u.dst.dev, dst_output); \
163 * NULL transmitter (do nothing except return NF_ACCEPT)
/*
 * Transmitter with the same (skb, cp, pp) parameter list as the other
 * *_xmit functions in this file.
 *
 * NOTE(review): only the signature is present in this listing; according
 * to the banner comment preceding it, the function does nothing except
 * return NF_ACCEPT -- confirm against the full file.
 */
166 ip_vs_null_xmit(struct sk_buff
*skb
, struct ip_vs_conn
*cp
,
167 struct ip_vs_protocol
*pp
)
175 * Let packets bypass the destination when the destination is not
176 * available; this may only be used in a transparent cache cluster.
/*
 * Bypass transmitter.  Visible steps:
 *  - perform a route lookup with ip_route_output_key() using a flow key
 *    built from the packet's own IP header (TOS and protocol are visible;
 *    the error message reports iph->daddr);
 *  - if the packet is longer than the route's path MTU and has DF set,
 *    send ICMP "fragmentation needed" carrying that MTU;
 *  - make sure there is headroom for the output device's link-layer header
 *    (skb_cow), logging "no memory" on failure;
 *  - release the skb's old dst and attach the new route.
 *
 * NOTE(review): the action taken for a non-linear skb that fits the MTU,
 * the final transmit call, the return values and the error labels are
 * missing from this listing; dst_link_failure() presumably belongs to an
 * error path -- confirm against the full file.
 */
179 ip_vs_bypass_xmit(struct sk_buff
*skb
, struct ip_vs_conn
*cp
,
180 struct ip_vs_protocol
*pp
)
182 struct rtable
*rt
; /* Route to the other host */
183 struct iphdr
*iph
= skb
->nh
.iph
;
192 .tos
= RT_TOS(tos
), } },
193 .proto
= iph
->protocol
,
198 if (ip_route_output_key(&rt
, &fl
)) {
199 IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, "
200 "dest: %u.%u.%u.%u\n", NIPQUAD(iph
->daddr
));
/* Path-MTU check: an oversized packet with DF set gets an ICMP error. */
205 mtu
= dst_pmtu(&rt
->u
.dst
);
206 if ((skb
->len
> mtu
) && (iph
->frag_off
&__constant_htons(IP_DF
))) {
208 icmp_send(skb
, ICMP_DEST_UNREACH
,ICMP_FRAG_NEEDED
, htonl(mtu
));
209 IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n");
213 if (skb_is_nonlinear(skb
) && skb
->len
<= mtu
)
/* Guarantee room for the outgoing device's hardware header. */
216 if (unlikely(skb_headroom(skb
) < rt
->u
.dst
.dev
->hard_header_len
)) {
217 if (skb_cow(skb
, rt
->u
.dst
.dev
->hard_header_len
)) {
219 IP_VS_ERR_RL("ip_vs_bypass_xmit(): no memory\n");
/* Swap the packet's route for the one just looked up. */
225 dst_release(skb
->dst
);
226 skb
->dst
= &rt
->u
.dst
;
228 #ifdef CONFIG_NETFILTER_DEBUG
230 #endif /* CONFIG_NETFILTER_DEBUG */
237 dst_link_failure(skb
);
245 * NAT transmitter (only for outside-to-inside nat forwarding)
246 * Not used for related ICMP
/*
 * NAT transmitter.  Visible steps:
 *  - when the connection has an application helper (cp->app) and the
 *    protocol is not a slave, a non-linear skb is linearized and, if the
 *    protocol supplies csum_check, the transport checksum is verified;
 *  - for connections flagged IP_VS_CONN_F_NO_CPORT the client port is
 *    filled in from the first port field of the transport header;
 *  - the output route is fetched with __ip_vs_get_out_rt(); an oversized
 *    packet with DF set triggers ICMP "fragmentation needed";
 *  - the skb's dst is replaced, the packet is made writable with
 *    ip_vs_skb_cow(), iph->daddr is rewritten to cp->daddr and the
 *    protocol's dnat_handler (if any) is invoked, after which h.raw is
 *    recomputed from iph and ihl.
 *
 * NOTE(review): the debug line after ip_vs_conn_fill_cport() is labelled
 * "filled cport" but prints cp->dport -- looks like the wrong field;
 * confirm.
 * NOTE(review): declarations, return statements, error labels and the
 * final transmit call are missing from this listing.
 */
249 ip_vs_nat_xmit(struct sk_buff
*skb
, struct ip_vs_conn
*cp
,
250 struct ip_vs_protocol
*pp
)
252 struct rtable
*rt
; /* Route to the other host */
262 * If it has ip_vs_app helper, the helper may change the payload,
263 * so it needs full checksum checking and checksum calculation.
264 * If not, only the header (such as IP address and port number)
265 * will be changed, so it is fast to do incremental checksum update,
266 * and let the destination host do final checksum checking.
269 if (unlikely(cp
->app
&& !pp
->slave
)) {
270 if (skb_is_nonlinear(skb
) &&
271 skb_linearize(skb
, GFP_ATOMIC
) != 0)
277 h
.raw
= (char*) iph
+ ihl
;
278 size
= ntohs(iph
->tot_len
) - ihl
;
280 /* do TCP/UDP checksum checking if it has application helper */
281 if (unlikely(cp
->app
&& pp
->csum_check
&& !pp
->slave
)) {
282 if (!pp
->csum_check(skb
, pp
, iph
, h
, size
))
287 * Check if it is no clinet port connection ...
289 if (unlikely(cp
->flags
& IP_VS_CONN_F_NO_CPORT
)) {
290 ip_vs_conn_fill_cport(cp
, h
.portp
[0]);
291 IP_VS_DBG(10, "filled cport=%d\n", ntohs(cp
->dport
));
294 if (!(rt
= __ip_vs_get_out_rt(cp
, RT_TOS(iph
->tos
))))
/* Path-MTU check: an oversized packet with DF set gets an ICMP error. */
298 mtu
= dst_pmtu(&rt
->u
.dst
);
299 if ((skb
->len
> mtu
) && (iph
->frag_off
&__constant_htons(IP_DF
))) {
301 icmp_send(skb
, ICMP_DEST_UNREACH
,ICMP_FRAG_NEEDED
, htonl(mtu
));
302 IP_VS_DBG_RL_PKT(0, pp
, iph
, "ip_vs_nat_xmit(): frag needed for");
/* Swap the packet's route for the one chosen for this connection. */
307 dst_release(skb
->dst
);
308 skb
->dst
= &rt
->u
.dst
;
310 /* copy-on-write the packet before mangling it */
311 if (ip_vs_skb_cow(skb
, rt
->u
.dst
.dev
->hard_header_len
, &iph
, &h
.raw
))
314 /* mangle the packet */
315 iph
->daddr
= cp
->daddr
;
316 if (pp
->dnat_handler
) {
317 pp
->dnat_handler(skb
, pp
, cp
, iph
, h
, size
);
319 h
.raw
= (char*) iph
+ ihl
;
323 IP_VS_DBG_PKT(10, pp
, iph
, "After DNAT");
325 /* FIXME: when application helper enlarges the packet and the length
326 is larger than the MTU of outgoing device, there will be still
329 #ifdef CONFIG_NETFILTER_DEBUG
331 #endif /* CONFIG_NETFILTER_DEBUG */
338 dst_link_failure(skb
);
346 * IP Tunneling transmitter
348 * This function encapsulates the packet in a new IP packet, its
349 * destination will be set to cp->daddr. Most code of this function
350 * is taken from ipip.c.
352 * It is used in VS/TUN cluster. The load balancer selects a real
353 * server from a cluster based on a scheduling algorithm,
354 * encapsulates the request packet and forwards it to the selected
355 * server. For example, all real servers are configured with
356 * "ifconfig tunl0 <Virtual IP Address> up". When the server receives
357 * the encapsulated packet, it will decapsulate the packet, process
358 * the request and return the response packets directly to the client
359 * without passing the load balancer. This can greatly increase the
360 * scalability of virtual server.
362 * Used for ANY protocol
/*
 * IP-in-IP tunnel transmitter.  Visible steps:
 *  - reject (with a rate-limited debug message) any skb whose protocol is
 *    not ETH_P_IP;
 *  - fetch the output route with __ip_vs_get_out_rt() and take its device
 *    as the tunnel device;
 *  - compute the usable MTU as the path MTU minus one struct iphdr, pass
 *    it to the incoming dst's update_pmtu(), and send ICMP "fragmentation
 *    needed" if the original packet has DF set and exceeds that MTU;
 *  - for a non-linear skb, recompute the original header checksum with
 *    ip_send_check();
 *  - ensure headroom for the device header (rounded up to 16) plus the new
 *    IP header, reallocating when the skb is short, cloned or shared;
 *  - push the outer header, clear the IP control-block options, swap the
 *    dst, and fill the outer header: protocol IPPROTO_IPIP, addresses from
 *    the route, TTL copied from the inner header, tot_len from skb->len.
 *
 * NOTE(review): several statements (the mtu < 68 test, handling of the
 * reallocated skb, remaining header fields, the transmit call, returns and
 * error labels) are missing from this listing.
 */
365 ip_vs_tunnel_xmit(struct sk_buff
*skb
, struct ip_vs_conn
*cp
,
366 struct ip_vs_protocol
*pp
)
368 struct rtable
*rt
; /* Route to the other host */
369 struct net_device
*tdev
; /* Device to other host */
370 struct iphdr
*old_iph
= skb
->nh
.iph
;
371 u8 tos
= old_iph
->tos
;
372 u16 df
= old_iph
->frag_off
;
373 struct iphdr
*iph
; /* Our new IP header */
374 int max_headroom
; /* The extra header space needed */
379 if (skb
->protocol
!= __constant_htons(ETH_P_IP
)) {
380 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, "
381 "ETH_P_IP: %d, skb protocol: %d\n",
382 __constant_htons(ETH_P_IP
), skb
->protocol
);
386 if (!(rt
= __ip_vs_get_out_rt(cp
, RT_TOS(tos
))))
389 tdev
= rt
->u
.dst
.dev
;
/* The outer IP header consumes part of the path MTU. */
391 mtu
= dst_pmtu(&rt
->u
.dst
) - sizeof(struct iphdr
);
394 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n");
398 skb
->dst
->ops
->update_pmtu(skb
->dst
, mtu
);
400 df
|= (old_iph
->frag_off
&__constant_htons(IP_DF
));
402 if ((old_iph
->frag_off
&__constant_htons(IP_DF
))
403 && mtu
< ntohs(old_iph
->tot_len
)) {
404 icmp_send(skb
, ICMP_DEST_UNREACH
,ICMP_FRAG_NEEDED
, htonl(mtu
));
406 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n");
410 if (skb_is_nonlinear(skb
))
411 ip_send_check(old_iph
);
/* The original IP header becomes the payload of the new packet. */
413 skb
->h
.raw
= skb
->nh
.raw
;
416 * Okay, now see if we can stuff it in the buffer as-is.
418 max_headroom
= (((tdev
->hard_header_len
+15)&~15)+sizeof(struct iphdr
));
420 if (skb_headroom(skb
) < max_headroom
421 || skb_cloned(skb
) || skb_shared(skb
)) {
422 struct sk_buff
*new_skb
=
423 skb_realloc_headroom(skb
, max_headroom
);
427 IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n");
434 skb
->nh
.raw
= skb_push(skb
, sizeof(struct iphdr
));
435 memset(&(IPCB(skb
)->opt
), 0, sizeof(IPCB(skb
)->opt
));
/* Swap the packet's route for the tunnel route. */
438 dst_release(skb
->dst
);
439 skb
->dst
= &rt
->u
.dst
;
442 * Push down and install the IPIP header.
446 iph
->ihl
= sizeof(struct iphdr
)>>2;
448 iph
->protocol
= IPPROTO_IPIP
;
450 iph
->daddr
= rt
->rt_dst
;
451 iph
->saddr
= rt
->rt_src
;
452 iph
->ttl
= old_iph
->ttl
;
453 iph
->tot_len
= htons(skb
->len
);
454 ip_select_ident(iph
, &rt
->u
.dst
, NULL
);
457 skb
->ip_summed
= CHECKSUM_NONE
;
458 #ifdef CONFIG_NETFILTER_DEBUG
460 #endif /* CONFIG_NETFILTER_DEBUG */
468 dst_link_failure(skb
);
476 * Direct Routing transmitter
477 * Used for ANY protocol
/*
 * Direct-routing transmitter.  Visible steps:
 *  - fetch the output route with __ip_vs_get_out_rt() using the packet's
 *    TOS;
 *  - if the packet has DF set and is longer than the route's path MTU,
 *    send ICMP "fragmentation needed" carrying that MTU;
 *  - make sure there is headroom for the output device's link-layer header
 *    (skb_cow), logging "no memory" on failure;
 *  - release the skb's old dst and attach the route.
 *
 * No header field of the packet is written in the visible code.
 *
 * NOTE(review): the action taken for a non-linear skb that fits the MTU,
 * the final transmit call, the return values and the error labels are
 * missing from this listing.
 */
480 ip_vs_dr_xmit(struct sk_buff
*skb
, struct ip_vs_conn
*cp
,
481 struct ip_vs_protocol
*pp
)
483 struct rtable
*rt
; /* Route to the other host */
484 struct iphdr
*iph
= skb
->nh
.iph
;
489 if (!(rt
= __ip_vs_get_out_rt(cp
, RT_TOS(iph
->tos
))))
/* Path-MTU check: an oversized packet with DF set gets an ICMP error. */
493 mtu
= dst_pmtu(&rt
->u
.dst
);
494 if ((iph
->frag_off
&__constant_htons(IP_DF
)) && skb
->len
> mtu
) {
495 icmp_send(skb
, ICMP_DEST_UNREACH
,ICMP_FRAG_NEEDED
, htonl(mtu
));
497 IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n");
501 if (skb_is_nonlinear(skb
) && skb
->len
<= mtu
)
/* Guarantee room for the outgoing device's hardware header. */
504 if (unlikely(skb_headroom(skb
) < rt
->u
.dst
.dev
->hard_header_len
)) {
505 if (skb_cow(skb
, rt
->u
.dst
.dev
->hard_header_len
)) {
507 IP_VS_ERR_RL("ip_vs_dr_xmit(): no memory\n");
/* Swap the packet's route for the one chosen for this connection. */
513 dst_release(skb
->dst
);
514 skb
->dst
= &rt
->u
.dst
;
516 #ifdef CONFIG_NETFILTER_DEBUG
518 #endif /* CONFIG_NETFILTER_DEBUG */
525 dst_link_failure(skb
);
533 * ICMP packet transmitter
534 * called by the ip_vs_in_icmp
/*
 * ICMP transmitter.
 *
 * For connections whose forwarding method is not IP_VS_CONN_F_MASQ the
 * packet is handed to cp->packet_xmit() unchanged; the visible statements
 * that follow bump cp->in_pkts and call __ip_vs_conn_put(cp).
 *
 * For the masquerading (NAT) case the visible code:
 *  - locates the ICMP header right after the outer IP header and computes
 *    the ICMP length from tot_len;
 *  - fetches the output route and applies the DF / path-MTU check;
 *  - swaps the skb's dst and makes the packet writable (ip_vs_skb_cow);
 *  - rewrites the outer destination address and the embedded packet's
 *    source address to cp->daddr, and for embedded TCP/UDP sets the first
 *    port field to cp->dport;
 *  - recomputes the ICMP checksum over the whole ICMP message and marks
 *    the skb CHECKSUM_UNNECESSARY.
 *
 * NOTE(review): the "frag needed" debug message names ip_vs_in_icmp()
 * although it is emitted from this function -- confirm whether that is
 * intended.
 * NOTE(review): declarations, the outer-header checksum update, the
 * transmit call, return values and error labels are missing from this
 * listing.
 */
537 ip_vs_icmp_xmit(struct sk_buff
*skb
, struct ip_vs_conn
*cp
,
538 struct ip_vs_protocol
*pp
)
540 struct rtable
*rt
; /* Route to the other host */
542 struct icmphdr
*icmph
;
543 struct iphdr
*ciph
; /* The ip header contained within the ICMP */
551 /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
552 forwarded directly here, because there is no need to
553 translate address/port back */
554 if (IP_VS_FWD_METHOD(cp
) != IP_VS_CONN_F_MASQ
) {
556 rc
= cp
->packet_xmit(skb
, cp
, pp
);
559 atomic_inc(&cp
->in_pkts
);
560 __ip_vs_conn_put(cp
);
565 icmph
= (struct icmphdr
*)((char *)iph
+(iph
->ihl
<<2));
566 len
= ntohs(iph
->tot_len
) - (iph
->ihl
<<2);
569 * mangle and send the packet here (only for VS/NAT)
572 if (!(rt
= __ip_vs_get_out_rt(cp
, RT_TOS(iph
->tos
))))
576 mtu
= dst_pmtu(&rt
->u
.dst
);
577 if ((skb
->len
> mtu
) && (iph
->frag_off
&__constant_htons(IP_DF
))) {
579 icmp_send(skb
, ICMP_DEST_UNREACH
,ICMP_FRAG_NEEDED
, htonl(mtu
));
580 IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n");
585 dst_release(skb
->dst
);
586 skb
->dst
= &rt
->u
.dst
;
588 /* copy-on-write the packet before mangling it */
589 if (ip_vs_skb_cow(skb
, rt
->u
.dst
.dev
->hard_header_len
,
590 &iph
, (unsigned char**)&icmph
)) {
/* The embedded IP header starts right after the ICMP header. */
594 ciph
= (struct iphdr
*) (icmph
+ 1);
595 h
.raw
= (char *) ciph
+ (ciph
->ihl
<< 2);
597 /* The ICMP packet for VS/NAT must be written to correct addresses
598 before being forwarded to the right server */
600 /* First change the dest IP address, and recalc checksum */
601 iph
->daddr
= cp
->daddr
;
604 /* Now change the *source* address in the contained IP */
605 ciph
->saddr
= cp
->daddr
;
608 /* the TCP/UDP source port - cannot redo check */
609 if (IPPROTO_TCP
== ciph
->protocol
|| IPPROTO_UDP
== ciph
->protocol
)
610 h
.portp
[0] = cp
->dport
;
612 /* And finally the ICMP checksum */
614 icmph
->checksum
= ip_compute_csum((unsigned char *) icmph
, len
);
615 skb
->ip_summed
= CHECKSUM_UNNECESSARY
;
617 IP_VS_DBG_PKT(11, pp
, ciph
, "Forwarding incoming ICMP");
619 #ifdef CONFIG_NETFILTER_DEBUG
621 #endif /* CONFIG_NETFILTER_DEBUG */
628 dst_link_failure(skb
);