Merge with Linux 2.6.0-test1.
[linux-2.6/linux-mips.git] / net / ipv4 / ipvs / ip_vs_xmit.c
blob0207a3f3f8d03d01ac65c3ddb13cfb1321c688f2
1 /*
2 * ip_vs_xmit.c: various packet transmitters for IPVS
4 * Version: $Id: ip_vs_xmit.c,v 1.2 2002/11/30 01:50:35 wensong Exp $
6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
7 * Julian Anastasov <ja@ssi.bg>
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
14 * Changes:
18 #include <linux/config.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/compiler.h>
22 #include <linux/ip.h>
23 #include <linux/tcp.h> /* for tcphdr */
24 #include <net/tcp.h> /* for csum_tcpudp_magic */
25 #include <net/udp.h>
26 #include <net/icmp.h> /* for icmp_send */
27 #include <net/route.h> /* for ip_route_output */
28 #include <linux/netfilter.h>
29 #include <linux/netfilter_ipv4.h>
31 #include <net/ip_vs.h>
35 * Destination cache to speed up outgoing route lookup
37 static inline void
38 __ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
40 struct dst_entry *old_dst;
42 old_dst = dest->dst_cache;
43 dest->dst_cache = dst;
44 dest->dst_rtos = rtos;
45 dst_release(old_dst);
48 static inline struct dst_entry *
49 __ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
51 struct dst_entry *dst = dest->dst_cache;
53 if (!dst)
54 return NULL;
55 if ((dst->obsolete || rtos != dest->dst_rtos) &&
56 dst->ops->check(dst, cookie) == NULL) {
57 dest->dst_cache = 0;
58 return NULL;
60 dst_hold(dst);
61 return dst;
64 static inline struct rtable *
65 __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
67 struct rtable *rt; /* Route to the other host */
68 struct ip_vs_dest *dest = cp->dest;
70 if (dest) {
71 spin_lock(&dest->dst_lock);
72 if (!(rt = (struct rtable *)
73 __ip_vs_dst_check(dest, rtos, 0))) {
74 struct flowi fl = {
75 .oif = 0,
76 .nl_u = {
77 .ip4_u = {
78 .daddr = dest->addr,
79 .saddr = 0,
80 .tos = rtos, } },
81 .proto = cp->protocol,
84 if (ip_route_output_key(&rt, &fl)) {
85 spin_unlock(&dest->dst_lock);
86 IP_VS_DBG_RL("ip_route_output error, "
87 "dest: %u.%u.%u.%u\n",
88 NIPQUAD(dest->addr));
89 return NULL;
91 __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst));
92 IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n",
93 NIPQUAD(dest->addr),
94 atomic_read(&rt->u.dst.__refcnt), rtos);
96 spin_unlock(&dest->dst_lock);
97 } else {
98 struct flowi fl = {
99 .oif = 0,
100 .nl_u = {
101 .ip4_u = {
102 .daddr = dest->addr,
103 .saddr = 0,
104 .tos = rtos, } },
105 .proto = cp->protocol,
108 if (ip_route_output_key(&rt, &fl)) {
109 IP_VS_DBG_RL("ip_route_output error, dest: "
110 "%u.%u.%u.%u\n", NIPQUAD(cp->daddr));
111 return NULL;
115 return rt;
120 * Release dest->dst_cache before a dest is removed
122 void
123 ip_vs_dst_reset(struct ip_vs_dest *dest)
125 struct dst_entry *old_dst;
127 old_dst = dest->dst_cache;
128 dest->dst_cache = NULL;
129 dst_release(old_dst);
133 static inline int
134 ip_vs_skb_cow(struct sk_buff *skb, unsigned int headroom,
135 struct iphdr **iph_p, unsigned char **t_p)
137 int delta = (headroom > 16 ? headroom : 16) - skb_headroom(skb);
139 if (delta < 0)
140 delta = 0;
142 if (delta ||skb_cloned(skb)) {
143 if (pskb_expand_head(skb, (delta+15)&~15, 0, GFP_ATOMIC))
144 return -ENOMEM;
146 /* skb data changed, update pointers */
147 *iph_p = skb->nh.iph;
148 *t_p = (char*) (*iph_p) + (*iph_p)->ihl * 4;
150 return 0;
/*
 *	Tag the packet as IPVS property and hand it to the LOCAL_OUT
 *	netfilter hook for transmission over the route's device.
 *	Arguments are parenthesized in the expansion for macro hygiene.
 */
#define IP_VS_XMIT(skb, rt)				\
do {							\
	(skb)->nfcache |= NFC_IPVS_PROPERTY;		\
	NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL,	\
		(rt)->u.dst.dev, dst_output);		\
} while (0)
163 * NULL transmitter (do nothing except return NF_ACCEPT)
166 ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
167 struct ip_vs_protocol *pp)
169 return NF_ACCEPT;
174 * Bypass transmitter
175 * Let packets bypass the destination when the destination is not
176 * available, it may be only used in transparent cache cluster.
179 ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
180 struct ip_vs_protocol *pp)
182 struct rtable *rt; /* Route to the other host */
183 struct iphdr *iph = skb->nh.iph;
184 u8 tos = iph->tos;
185 int mtu;
186 struct flowi fl = {
187 .oif = 0,
188 .nl_u = {
189 .ip4_u = {
190 .daddr = iph->daddr,
191 .saddr = 0,
192 .tos = RT_TOS(tos), } },
193 .proto = iph->protocol,
196 EnterFunction(10);
198 if (ip_route_output_key(&rt, &fl)) {
199 IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, "
200 "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr));
201 goto tx_error_icmp;
204 /* MTU checking */
205 mtu = dst_pmtu(&rt->u.dst);
206 if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
207 ip_rt_put(rt);
208 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
209 IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n");
210 goto tx_error;
213 if (skb_is_nonlinear(skb) && skb->len <= mtu)
214 ip_send_check(iph);
216 if (unlikely(skb_headroom(skb) < rt->u.dst.dev->hard_header_len)) {
217 if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) {
218 ip_rt_put(rt);
219 IP_VS_ERR_RL("ip_vs_bypass_xmit(): no memory\n");
220 goto tx_error;
224 /* drop old route */
225 dst_release(skb->dst);
226 skb->dst = &rt->u.dst;
228 #ifdef CONFIG_NETFILTER_DEBUG
229 skb->nf_debug = 0;
230 #endif /* CONFIG_NETFILTER_DEBUG */
231 IP_VS_XMIT(skb, rt);
233 LeaveFunction(10);
234 return NF_STOLEN;
236 tx_error_icmp:
237 dst_link_failure(skb);
238 tx_error:
239 kfree_skb(skb);
240 return NF_STOLEN;
/*
 *      NAT transmitter (only for outside-to-inside nat forwarding)
 *      Not used for related ICMP
 *
 *      Rewrites the destination address (and, via the protocol's DNAT
 *      handler, the transport header) to the real server bound to @cp,
 *      then transmits over the cached/looked-up route.  Returns an
 *      NF_* verdict; NF_STOLEN means the skb has been consumed.
 */
int
ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
	       struct ip_vs_protocol *pp)
{
	struct rtable *rt;		/* Route to the other host */
	struct iphdr *iph;
	union ip_vs_tphdr h;		/* transport header (TCP/UDP view) */
	int ihl;
	unsigned short size;		/* transport payload length */
	int mtu;

	EnterFunction(10);

	/*
	 * If it has ip_vs_app helper, the helper may change the payload,
	 * so it needs full checksum checking and checksum calculation.
	 * If not, only the header (such as IP address and port number)
	 * will be changed, so it is fast to do incremental checksum update,
	 * and let the destination host do final checksum checking.
	 */
	if (unlikely(cp->app && !pp->slave)) {
		/* helper needs linear data to inspect/modify the payload */
		if (skb_is_nonlinear(skb) &&
		    skb_linearize(skb, GFP_ATOMIC) != 0)
			return NF_DROP;
	}

	iph = skb->nh.iph;
	ihl = iph->ihl << 2;
	h.raw = (char*) iph + ihl;
	size = ntohs(iph->tot_len) - ihl;

	/* do TCP/UDP checksum checking if it has application helper */
	if (unlikely(cp->app && pp->csum_check && !pp->slave)) {
		if (!pp->csum_check(skb, pp, iph, h, size))
			goto tx_error;
	}

	/*
	 *  Check if it is no client port connection ...
	 *  (learn the client port from the first packet seen)
	 */
	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
		ip_vs_conn_fill_cport(cp, h.portp[0]);
		IP_VS_DBG(10, "filled cport=%d\n", ntohs(cp->dport));
	}

	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
		goto tx_error_icmp;

	/* MTU checking: DF-marked packets above the path MTU bounce back */
	mtu = dst_pmtu(&rt->u.dst);
	if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
		ip_rt_put(rt);
		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
		IP_VS_DBG_RL_PKT(0, pp, iph, "ip_vs_nat_xmit(): frag needed for");
		goto tx_error;
	}

	/* drop old route; the rt reference is now owned by the skb */
	dst_release(skb->dst);
	skb->dst = &rt->u.dst;

	/* copy-on-write the packet before mangling it
	 * (iph and h.raw are refreshed if the data moved) */
	if (ip_vs_skb_cow(skb, rt->u.dst.dev->hard_header_len, &iph, &h.raw))
		return NF_DROP;

	/* mangle the packet */
	iph->daddr = cp->daddr;
	if (pp->dnat_handler) {
		pp->dnat_handler(skb, pp, cp, iph, h, size);
		/* the handler may have reallocated the data: re-derive */
		iph = skb->nh.iph;
		h.raw = (char*) iph + ihl;
	}
	ip_send_check(iph);

	IP_VS_DBG_PKT(10, pp, iph, "After DNAT");

	/* FIXME: when application helper enlarges the packet and the length
	   is larger than the MTU of outgoing device, there will be still
	   MTU problem. */

#ifdef CONFIG_NETFILTER_DEBUG
	skb->nf_debug = 0;
#endif /* CONFIG_NETFILTER_DEBUG */
	IP_VS_XMIT(skb, rt);

	LeaveFunction(10);
	return NF_STOLEN;

  tx_error_icmp:
	dst_link_failure(skb);
  tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}
346 * IP Tunneling transmitter
348 * This function encapsulates the packet in a new IP packet, its
349 * destination will be set to cp->daddr. Most code of this function
350 * is taken from ipip.c.
352 * It is used in VS/TUN cluster. The load balancer selects a real
353 * server from a cluster based on a scheduling algorithm,
354 * encapsulates the request packet and forwards it to the selected
355 * server. For example, all real servers are configured with
356 * "ifconfig tunl0 <Virtual IP Address> up". When the server receives
357 * the encapsulated packet, it will decapsulate the packet, processe
358 * the request and return the response packets directly to the client
359 * without passing the load balancer. This can greatly increase the
360 * scalability of virtual server.
362 * Used for ANY protocol
365 ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
366 struct ip_vs_protocol *pp)
368 struct rtable *rt; /* Route to the other host */
369 struct net_device *tdev; /* Device to other host */
370 struct iphdr *old_iph = skb->nh.iph;
371 u8 tos = old_iph->tos;
372 u16 df = old_iph->frag_off;
373 struct iphdr *iph; /* Our new IP header */
374 int max_headroom; /* The extra header space needed */
375 int mtu;
377 EnterFunction(10);
379 if (skb->protocol != __constant_htons(ETH_P_IP)) {
380 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, "
381 "ETH_P_IP: %d, skb protocol: %d\n",
382 __constant_htons(ETH_P_IP), skb->protocol);
383 goto tx_error;
386 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
387 goto tx_error_icmp;
389 tdev = rt->u.dst.dev;
391 mtu = dst_pmtu(&rt->u.dst) - sizeof(struct iphdr);
392 if (mtu < 68) {
393 ip_rt_put(rt);
394 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n");
395 goto tx_error;
397 if (skb->dst)
398 skb->dst->ops->update_pmtu(skb->dst, mtu);
400 df |= (old_iph->frag_off&__constant_htons(IP_DF));
402 if ((old_iph->frag_off&__constant_htons(IP_DF))
403 && mtu < ntohs(old_iph->tot_len)) {
404 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
405 ip_rt_put(rt);
406 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n");
407 goto tx_error;
410 if (skb_is_nonlinear(skb))
411 ip_send_check(old_iph);
413 skb->h.raw = skb->nh.raw;
416 * Okay, now see if we can stuff it in the buffer as-is.
418 max_headroom = (((tdev->hard_header_len+15)&~15)+sizeof(struct iphdr));
420 if (skb_headroom(skb) < max_headroom
421 || skb_cloned(skb) || skb_shared(skb)) {
422 struct sk_buff *new_skb =
423 skb_realloc_headroom(skb, max_headroom);
424 if (!new_skb) {
425 ip_rt_put(rt);
426 kfree_skb(skb);
427 IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n");
428 return -EINVAL;
430 kfree_skb(skb);
431 skb = new_skb;
434 skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
435 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
437 /* drop old route */
438 dst_release(skb->dst);
439 skb->dst = &rt->u.dst;
442 * Push down and install the IPIP header.
444 iph = skb->nh.iph;
445 iph->version = 4;
446 iph->ihl = sizeof(struct iphdr)>>2;
447 iph->frag_off = df;
448 iph->protocol = IPPROTO_IPIP;
449 iph->tos = tos;
450 iph->daddr = rt->rt_dst;
451 iph->saddr = rt->rt_src;
452 iph->ttl = old_iph->ttl;
453 iph->tot_len = htons(skb->len);
454 ip_select_ident(iph, &rt->u.dst, NULL);
455 ip_send_check(iph);
457 skb->ip_summed = CHECKSUM_NONE;
458 #ifdef CONFIG_NETFILTER_DEBUG
459 skb->nf_debug = 0;
460 #endif /* CONFIG_NETFILTER_DEBUG */
461 IP_VS_XMIT(skb, rt);
463 LeaveFunction(10);
465 return NF_STOLEN;
467 tx_error_icmp:
468 dst_link_failure(skb);
469 tx_error:
470 kfree_skb(skb);
471 return NF_STOLEN;
476 * Direct Routing transmitter
477 * Used for ANY protocol
480 ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
481 struct ip_vs_protocol *pp)
483 struct rtable *rt; /* Route to the other host */
484 struct iphdr *iph = skb->nh.iph;
485 int mtu;
487 EnterFunction(10);
489 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
490 goto tx_error_icmp;
492 /* MTU checking */
493 mtu = dst_pmtu(&rt->u.dst);
494 if ((iph->frag_off&__constant_htons(IP_DF)) && skb->len > mtu) {
495 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
496 ip_rt_put(rt);
497 IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n");
498 goto tx_error;
501 if (skb_is_nonlinear(skb) && skb->len <= mtu)
502 ip_send_check(iph);
504 if (unlikely(skb_headroom(skb) < rt->u.dst.dev->hard_header_len)) {
505 if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) {
506 ip_rt_put(rt);
507 IP_VS_ERR_RL("ip_vs_dr_xmit(): no memory\n");
508 goto tx_error;
512 /* drop old route */
513 dst_release(skb->dst);
514 skb->dst = &rt->u.dst;
516 #ifdef CONFIG_NETFILTER_DEBUG
517 skb->nf_debug = 0;
518 #endif /* CONFIG_NETFILTER_DEBUG */
519 IP_VS_XMIT(skb, rt);
521 LeaveFunction(10);
522 return NF_STOLEN;
524 tx_error_icmp:
525 dst_link_failure(skb);
526 tx_error:
527 kfree_skb(skb);
528 return NF_STOLEN;
533 * ICMP packet transmitter
534 * called by the ip_vs_in_icmp
537 ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
538 struct ip_vs_protocol *pp)
540 struct rtable *rt; /* Route to the other host */
541 struct iphdr *iph;
542 struct icmphdr *icmph;
543 struct iphdr *ciph; /* The ip header contained within the ICMP */
544 unsigned short len;
545 union ip_vs_tphdr h;
546 int mtu;
547 int rc;
549 EnterFunction(10);
551 /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
552 forwarded directly here, because there is no need to
553 translate address/port back */
554 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
555 if (cp->packet_xmit)
556 rc = cp->packet_xmit(skb, cp, pp);
557 else
558 rc = NF_ACCEPT;
559 atomic_inc(&cp->in_pkts);
560 __ip_vs_conn_put(cp);
561 goto out;
564 iph = skb->nh.iph;
565 icmph = (struct icmphdr *)((char *)iph+(iph->ihl<<2));
566 len = ntohs(iph->tot_len) - (iph->ihl<<2);
569 * mangle and send the packet here (only for VS/NAT)
572 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
573 goto tx_error_icmp;
575 /* MTU checking */
576 mtu = dst_pmtu(&rt->u.dst);
577 if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
578 ip_rt_put(rt);
579 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
580 IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n");
581 goto tx_error;
584 /* drop old route */
585 dst_release(skb->dst);
586 skb->dst = &rt->u.dst;
588 /* copy-on-write the packet before mangling it */
589 if (ip_vs_skb_cow(skb, rt->u.dst.dev->hard_header_len,
590 &iph, (unsigned char**)&icmph)) {
591 rc = NF_DROP;
592 goto out;
594 ciph = (struct iphdr *) (icmph + 1);
595 h.raw = (char *) ciph + (ciph->ihl << 2);
597 /* The ICMP packet for VS/NAT must be written to correct addresses
598 before being forwarded to the right server */
600 /* First change the dest IP address, and recalc checksum */
601 iph->daddr = cp->daddr;
602 ip_send_check(iph);
604 /* Now change the *source* address in the contained IP */
605 ciph->saddr = cp->daddr;
606 ip_send_check(ciph);
608 /* the TCP/UDP source port - cannot redo check */
609 if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol)
610 h.portp[0] = cp->dport;
612 /* And finally the ICMP checksum */
613 icmph->checksum = 0;
614 icmph->checksum = ip_compute_csum((unsigned char *) icmph, len);
615 skb->ip_summed = CHECKSUM_UNNECESSARY;
617 IP_VS_DBG_PKT(11, pp, ciph, "Forwarding incoming ICMP");
619 #ifdef CONFIG_NETFILTER_DEBUG
620 skb->nf_debug = 0;
621 #endif /* CONFIG_NETFILTER_DEBUG */
622 IP_VS_XMIT(skb, rt);
624 rc = NF_STOLEN;
625 goto out;
627 tx_error_icmp:
628 dst_link_failure(skb);
629 tx_error:
630 dev_kfree_skb(skb);
631 rc = NF_STOLEN;
632 out:
633 LeaveFunction(10);
634 return rc;