2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the Netfilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
8 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
9 * Peter Kese <peter.kese@ijs.si>
10 * Julian Anastasov <ja@ssi.bg>
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
17 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
18 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
22 * Paul `Rusty' Russell properly handle non-linear skbs
23 * Harald Welte don't use nfcache
27 #define KMSG_COMPONENT "IPVS"
28 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
30 #include <linux/module.h>
31 #include <linux/kernel.h>
33 #include <linux/tcp.h>
34 #include <linux/sctp.h>
35 #include <linux/icmp.h>
36 #include <linux/slab.h>
41 #include <net/icmp.h> /* for icmp_send */
42 #include <net/route.h>
43 #include <net/ip6_checksum.h>
45 #include <linux/netfilter.h>
46 #include <linux/netfilter_ipv4.h>
48 #ifdef CONFIG_IP_VS_IPV6
50 #include <linux/netfilter_ipv6.h>
51 #include <net/ip6_route.h>
54 #include <net/ip_vs.h>
57 EXPORT_SYMBOL(register_ip_vs_scheduler
);
58 EXPORT_SYMBOL(unregister_ip_vs_scheduler
);
59 EXPORT_SYMBOL(ip_vs_proto_name
);
60 EXPORT_SYMBOL(ip_vs_conn_new
);
61 EXPORT_SYMBOL(ip_vs_conn_in_get
);
62 EXPORT_SYMBOL(ip_vs_conn_out_get
);
63 #ifdef CONFIG_IP_VS_PROTO_TCP
64 EXPORT_SYMBOL(ip_vs_tcp_conn_listen
);
66 EXPORT_SYMBOL(ip_vs_conn_put
);
67 #ifdef CONFIG_IP_VS_DEBUG
68 EXPORT_SYMBOL(ip_vs_get_debug_level
);
72 /* ID used in ICMP lookups */
73 #define icmp_id(icmph) (((icmph)->un).echo.id)
74 #define icmpv6_id(icmph) (icmph->icmp6_dataun.u_echo.identifier)
76 const char *ip_vs_proto_name(unsigned proto
)
91 #ifdef CONFIG_IP_VS_IPV6
96 sprintf(buf
, "IP_%d", proto
);
101 void ip_vs_init_hash_table(struct list_head
*table
, int rows
)
104 INIT_LIST_HEAD(&table
[rows
]);
108 ip_vs_in_stats(struct ip_vs_conn
*cp
, struct sk_buff
*skb
)
110 struct ip_vs_dest
*dest
= cp
->dest
;
111 if (dest
&& (dest
->flags
& IP_VS_DEST_F_AVAILABLE
)) {
112 spin_lock(&dest
->stats
.lock
);
113 dest
->stats
.ustats
.inpkts
++;
114 dest
->stats
.ustats
.inbytes
+= skb
->len
;
115 spin_unlock(&dest
->stats
.lock
);
117 spin_lock(&dest
->svc
->stats
.lock
);
118 dest
->svc
->stats
.ustats
.inpkts
++;
119 dest
->svc
->stats
.ustats
.inbytes
+= skb
->len
;
120 spin_unlock(&dest
->svc
->stats
.lock
);
122 spin_lock(&ip_vs_stats
.lock
);
123 ip_vs_stats
.ustats
.inpkts
++;
124 ip_vs_stats
.ustats
.inbytes
+= skb
->len
;
125 spin_unlock(&ip_vs_stats
.lock
);
131 ip_vs_out_stats(struct ip_vs_conn
*cp
, struct sk_buff
*skb
)
133 struct ip_vs_dest
*dest
= cp
->dest
;
134 if (dest
&& (dest
->flags
& IP_VS_DEST_F_AVAILABLE
)) {
135 spin_lock(&dest
->stats
.lock
);
136 dest
->stats
.ustats
.outpkts
++;
137 dest
->stats
.ustats
.outbytes
+= skb
->len
;
138 spin_unlock(&dest
->stats
.lock
);
140 spin_lock(&dest
->svc
->stats
.lock
);
141 dest
->svc
->stats
.ustats
.outpkts
++;
142 dest
->svc
->stats
.ustats
.outbytes
+= skb
->len
;
143 spin_unlock(&dest
->svc
->stats
.lock
);
145 spin_lock(&ip_vs_stats
.lock
);
146 ip_vs_stats
.ustats
.outpkts
++;
147 ip_vs_stats
.ustats
.outbytes
+= skb
->len
;
148 spin_unlock(&ip_vs_stats
.lock
);
154 ip_vs_conn_stats(struct ip_vs_conn
*cp
, struct ip_vs_service
*svc
)
156 spin_lock(&cp
->dest
->stats
.lock
);
157 cp
->dest
->stats
.ustats
.conns
++;
158 spin_unlock(&cp
->dest
->stats
.lock
);
160 spin_lock(&svc
->stats
.lock
);
161 svc
->stats
.ustats
.conns
++;
162 spin_unlock(&svc
->stats
.lock
);
164 spin_lock(&ip_vs_stats
.lock
);
165 ip_vs_stats
.ustats
.conns
++;
166 spin_unlock(&ip_vs_stats
.lock
);
171 ip_vs_set_state(struct ip_vs_conn
*cp
, int direction
,
172 const struct sk_buff
*skb
,
173 struct ip_vs_protocol
*pp
)
175 if (unlikely(!pp
->state_transition
))
177 return pp
->state_transition(cp
, direction
, skb
, pp
);
181 ip_vs_conn_fill_param_persist(const struct ip_vs_service
*svc
,
182 struct sk_buff
*skb
, int protocol
,
183 const union nf_inet_addr
*caddr
, __be16 cport
,
184 const union nf_inet_addr
*vaddr
, __be16 vport
,
185 struct ip_vs_conn_param
*p
)
187 ip_vs_conn_fill_param(svc
->af
, protocol
, caddr
, cport
, vaddr
, vport
, p
);
189 if (p
->pe
&& p
->pe
->fill_param
)
190 return p
->pe
->fill_param(p
, skb
);
196 * IPVS persistent scheduling function
197 * It creates a connection entry according to its template if exists,
198 * or selects a server and creates a connection entry plus a template.
199 * Locking: we are svc user (svc->refcnt), so we hold all dests too
200 * Protocols supported: TCP, UDP
202 static struct ip_vs_conn
*
203 ip_vs_sched_persist(struct ip_vs_service
*svc
,
205 __be16 src_port
, __be16 dst_port
, int *ignored
)
207 struct ip_vs_conn
*cp
= NULL
;
208 struct ip_vs_iphdr iph
;
209 struct ip_vs_dest
*dest
;
210 struct ip_vs_conn
*ct
;
211 __be16 dport
= 0; /* destination port to forward */
213 struct ip_vs_conn_param param
;
214 union nf_inet_addr snet
; /* source network of the client,
217 ip_vs_fill_iphdr(svc
->af
, skb_network_header(skb
), &iph
);
219 /* Mask saddr with the netmask to adjust template granularity */
220 #ifdef CONFIG_IP_VS_IPV6
221 if (svc
->af
== AF_INET6
)
222 ipv6_addr_prefix(&snet
.in6
, &iph
.saddr
.in6
, svc
->netmask
);
225 snet
.ip
= iph
.saddr
.ip
& svc
->netmask
;
227 IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
229 IP_VS_DBG_ADDR(svc
->af
, &iph
.saddr
), ntohs(src_port
),
230 IP_VS_DBG_ADDR(svc
->af
, &iph
.daddr
), ntohs(dst_port
),
231 IP_VS_DBG_ADDR(svc
->af
, &snet
));
234 * As far as we know, FTP is a very complicated network protocol, and
235 * it uses a control connection and data connections. For active FTP,
236 * the FTP server initiates the data connection to the client, and its
237 * source port is often 20. For passive FTP, the FTP server tells the
238 * client the port that it passively listens on, and the client opens
239 * the data connection. In tunneling or direct routing mode, the load
240 * balancer is on the client-to-server half of the connection, so the
241 * port number is unknown to the load balancer. So, a conn template like
242 * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP
243 * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
244 * is created for other persistent services.
247 int protocol
= iph
.protocol
;
248 const union nf_inet_addr
*vaddr
= &iph
.daddr
;
249 const union nf_inet_addr fwmark
= { .ip
= htonl(svc
->fwmark
) };
252 if (dst_port
== svc
->port
) {
254 * <protocol, caddr, 0, vaddr, vport, daddr, dport>
256 * <protocol, caddr, 0, vaddr, 0, daddr, 0>
258 if (svc
->port
!= FTPPORT
)
261 /* Note: persistent fwmark-based services and
262 * persistent port zero service are handled here.
264 * <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
265 * port zero template:
266 * <protocol,caddr,0,vaddr,0,daddr,0>
269 protocol
= IPPROTO_IP
;
273 /* return *ignored = -1 so NF_DROP can be used */
274 if (ip_vs_conn_fill_param_persist(svc
, skb
, protocol
, &snet
, 0,
275 vaddr
, vport
, ¶m
) < 0) {
281 /* Check if a template already exists */
282 ct
= ip_vs_ct_in_get(¶m
);
283 if (!ct
|| !ip_vs_check_template(ct
)) {
285 * No template found or the dest of the connection
286 * template is not available.
287 * return *ignored=0 i.e. ICMP and NF_DROP
289 dest
= svc
->scheduler
->schedule(svc
, skb
);
291 IP_VS_DBG(1, "p-schedule: no dest found.\n");
292 kfree(param
.pe_data
);
297 if (dst_port
== svc
->port
&& svc
->port
!= FTPPORT
)
301 * This adds param.pe_data to the template,
302 * and thus param.pe_data will be destroyed
303 * when the template expires */
304 ct
= ip_vs_conn_new(¶m
, &dest
->addr
, dport
,
305 IP_VS_CONN_F_TEMPLATE
, dest
, skb
->mark
);
307 kfree(param
.pe_data
);
312 ct
->timeout
= svc
->timeout
;
314 /* set destination with the found template */
316 kfree(param
.pe_data
);
320 if (dport
== svc
->port
&& dest
->port
)
323 flags
= (svc
->flags
& IP_VS_SVC_F_ONEPACKET
324 && iph
.protocol
== IPPROTO_UDP
)?
325 IP_VS_CONN_F_ONE_PACKET
: 0;
328 * Create a new connection according to the template
330 ip_vs_conn_fill_param(svc
->af
, iph
.protocol
, &iph
.saddr
, src_port
,
331 &iph
.daddr
, dst_port
, ¶m
);
333 cp
= ip_vs_conn_new(¶m
, &dest
->addr
, dport
, flags
, dest
, skb
->mark
);
343 ip_vs_control_add(cp
, ct
);
346 ip_vs_conn_stats(cp
, svc
);
352 * IPVS main scheduling function
353 * It selects a server according to the virtual service, and
354 * creates a connection entry.
355 * Protocols supported: TCP, UDP
359 * 1 : protocol tried to schedule (eg. on SYN), found svc but the
360 * svc/scheduler decides that this packet should be accepted with
361 * NF_ACCEPT because it must not be scheduled.
363 * 0 : scheduler can not find destination, so try bypass or
364 * return ICMP and then NF_DROP (ip_vs_leave).
366 * -1 : scheduler tried to schedule but fatal error occurred, eg.
367 * ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param
368 * failure such as missing Call-ID, ENOMEM on skb_linearize
369 * or pe_data. In this case we should return NF_DROP without
370 * any attempts to send ICMP with ip_vs_leave.
373 ip_vs_schedule(struct ip_vs_service
*svc
, struct sk_buff
*skb
,
374 struct ip_vs_protocol
*pp
, int *ignored
)
376 struct ip_vs_conn
*cp
= NULL
;
377 struct ip_vs_iphdr iph
;
378 struct ip_vs_dest
*dest
;
379 __be16 _ports
[2], *pptr
;
383 ip_vs_fill_iphdr(svc
->af
, skb_network_header(skb
), &iph
);
384 pptr
= skb_header_pointer(skb
, iph
.len
, sizeof(_ports
), _ports
);
389 * FTPDATA needs this check when using local real server.
390 * Never schedule Active FTPDATA connections from real server.
391 * For LVS-NAT they must be already created. For other methods
392 * with persistence the connection is created on SYN+ACK.
394 if (pptr
[0] == FTPDATA
) {
395 IP_VS_DBG_PKT(12, svc
->af
, pp
, skb
, 0,
396 "Not scheduling FTPDATA");
401 * Do not schedule replies from local real server.
403 if ((!skb
->dev
|| skb
->dev
->flags
& IFF_LOOPBACK
) &&
404 (cp
= pp
->conn_in_get(svc
->af
, skb
, pp
, &iph
, iph
.len
, 1))) {
405 IP_VS_DBG_PKT(12, svc
->af
, pp
, skb
, 0,
406 "Not scheduling reply for existing connection");
407 __ip_vs_conn_put(cp
);
414 if (svc
->flags
& IP_VS_SVC_F_PERSISTENT
)
415 return ip_vs_sched_persist(svc
, skb
, pptr
[0], pptr
[1], ignored
);
420 * Non-persistent service
422 if (!svc
->fwmark
&& pptr
[1] != svc
->port
) {
424 pr_err("Schedule: port zero only supported "
425 "in persistent services, "
426 "check your ipvs configuration\n");
430 dest
= svc
->scheduler
->schedule(svc
, skb
);
432 IP_VS_DBG(1, "Schedule: no dest found.\n");
436 flags
= (svc
->flags
& IP_VS_SVC_F_ONEPACKET
437 && iph
.protocol
== IPPROTO_UDP
)?
438 IP_VS_CONN_F_ONE_PACKET
: 0;
441 * Create a connection entry.
444 struct ip_vs_conn_param p
;
445 ip_vs_conn_fill_param(svc
->af
, iph
.protocol
, &iph
.saddr
,
446 pptr
[0], &iph
.daddr
, pptr
[1], &p
);
447 cp
= ip_vs_conn_new(&p
, &dest
->addr
,
448 dest
->port
? dest
->port
: pptr
[1],
449 flags
, dest
, skb
->mark
);
456 IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
457 "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
459 IP_VS_DBG_ADDR(svc
->af
, &cp
->caddr
), ntohs(cp
->cport
),
460 IP_VS_DBG_ADDR(svc
->af
, &cp
->vaddr
), ntohs(cp
->vport
),
461 IP_VS_DBG_ADDR(svc
->af
, &cp
->daddr
), ntohs(cp
->dport
),
462 cp
->flags
, atomic_read(&cp
->refcnt
));
464 ip_vs_conn_stats(cp
, svc
);
470 * Pass or drop the packet.
471 * Called by ip_vs_in, when the virtual service is available but
472 * no destination is available for a new connection.
474 int ip_vs_leave(struct ip_vs_service
*svc
, struct sk_buff
*skb
,
475 struct ip_vs_protocol
*pp
)
477 __be16 _ports
[2], *pptr
;
478 struct ip_vs_iphdr iph
;
480 ip_vs_fill_iphdr(svc
->af
, skb_network_header(skb
), &iph
);
482 pptr
= skb_header_pointer(skb
, iph
.len
, sizeof(_ports
), _ports
);
484 ip_vs_service_put(svc
);
488 #ifdef CONFIG_IP_VS_IPV6
489 if (svc
->af
== AF_INET6
)
490 unicast
= ipv6_addr_type(&iph
.daddr
.in6
) & IPV6_ADDR_UNICAST
;
493 unicast
= (inet_addr_type(&init_net
, iph
.daddr
.ip
) == RTN_UNICAST
);
495 /* if it is fwmark-based service, the cache_bypass sysctl is up
496 and the destination is a non-local unicast, then create
497 a cache_bypass connection entry */
498 if (sysctl_ip_vs_cache_bypass
&& svc
->fwmark
&& unicast
) {
500 struct ip_vs_conn
*cp
;
501 unsigned int flags
= (svc
->flags
& IP_VS_SVC_F_ONEPACKET
&&
502 iph
.protocol
== IPPROTO_UDP
)?
503 IP_VS_CONN_F_ONE_PACKET
: 0;
504 union nf_inet_addr daddr
= { .all
= { 0, 0, 0, 0 } };
506 ip_vs_service_put(svc
);
508 /* create a new connection entry */
509 IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__
);
511 struct ip_vs_conn_param p
;
512 ip_vs_conn_fill_param(svc
->af
, iph
.protocol
,
514 &iph
.daddr
, pptr
[1], &p
);
515 cp
= ip_vs_conn_new(&p
, &daddr
, 0,
516 IP_VS_CONN_F_BYPASS
| flags
,
523 ip_vs_in_stats(cp
, skb
);
526 cs
= ip_vs_set_state(cp
, IP_VS_DIR_INPUT
, skb
, pp
);
528 /* transmit the first SYN packet */
529 ret
= cp
->packet_xmit(skb
, cp
, pp
);
530 /* do not touch skb anymore */
532 atomic_inc(&cp
->in_pkts
);
538 * When the virtual ftp service is presented, packets destined
539 * for other services on the VIP may get here (except services
540 * listed in the ipvs table), pass the packets, because it is
541 * not ipvs job to decide to drop the packets.
543 if ((svc
->port
== FTPPORT
) && (pptr
[1] != FTPPORT
)) {
544 ip_vs_service_put(svc
);
548 ip_vs_service_put(svc
);
551 * Notify the client that the destination is unreachable, and
552 * release the socket buffer.
553 * Since this happens in the IP layer, no TCP socket is actually
554 * created, so a TCP RST packet cannot be sent; instead,
555 * ICMP_PORT_UNREACH is sent here whether it is TCP or UDP. --WZ
557 #ifdef CONFIG_IP_VS_IPV6
558 if (svc
->af
== AF_INET6
) {
560 struct net
*net
= dev_net(skb_dst(skb
)->dev
);
562 skb
->dev
= net
->loopback_dev
;
564 icmpv6_send(skb
, ICMPV6_DEST_UNREACH
, ICMPV6_PORT_UNREACH
, 0);
567 icmp_send(skb
, ICMP_DEST_UNREACH
, ICMP_PORT_UNREACH
, 0);
572 __sum16
ip_vs_checksum_complete(struct sk_buff
*skb
, int offset
)
574 return csum_fold(skb_checksum(skb
, offset
, skb
->len
- offset
, 0));
577 static inline enum ip_defrag_users
ip_vs_defrag_user(unsigned int hooknum
)
579 if (NF_INET_LOCAL_IN
== hooknum
)
580 return IP_DEFRAG_VS_IN
;
581 if (NF_INET_FORWARD
== hooknum
)
582 return IP_DEFRAG_VS_FWD
;
583 return IP_DEFRAG_VS_OUT
;
586 static inline int ip_vs_gather_frags(struct sk_buff
*skb
, u_int32_t user
)
588 int err
= ip_defrag(skb
, user
);
591 ip_send_check(ip_hdr(skb
));
596 #ifdef CONFIG_IP_VS_IPV6
597 static inline int ip_vs_gather_frags_v6(struct sk_buff
*skb
, u_int32_t user
)
599 /* TODO IPv6: Find out what to do here for IPv6 */
605 * Packet has been made sufficiently writable in caller
606 * - inout: 1=in->out, 0=out->in
608 void ip_vs_nat_icmp(struct sk_buff
*skb
, struct ip_vs_protocol
*pp
,
609 struct ip_vs_conn
*cp
, int inout
)
611 struct iphdr
*iph
= ip_hdr(skb
);
612 unsigned int icmp_offset
= iph
->ihl
*4;
613 struct icmphdr
*icmph
= (struct icmphdr
*)(skb_network_header(skb
) +
615 struct iphdr
*ciph
= (struct iphdr
*)(icmph
+ 1);
618 iph
->saddr
= cp
->vaddr
.ip
;
620 ciph
->daddr
= cp
->vaddr
.ip
;
623 iph
->daddr
= cp
->daddr
.ip
;
625 ciph
->saddr
= cp
->daddr
.ip
;
629 /* the TCP/UDP/SCTP port */
630 if (IPPROTO_TCP
== ciph
->protocol
|| IPPROTO_UDP
== ciph
->protocol
||
631 IPPROTO_SCTP
== ciph
->protocol
) {
632 __be16
*ports
= (void *)ciph
+ ciph
->ihl
*4;
635 ports
[1] = cp
->vport
;
637 ports
[0] = cp
->dport
;
640 /* And finally the ICMP checksum */
642 icmph
->checksum
= ip_vs_checksum_complete(skb
, icmp_offset
);
643 skb
->ip_summed
= CHECKSUM_UNNECESSARY
;
646 IP_VS_DBG_PKT(11, AF_INET
, pp
, skb
, (void *)ciph
- (void *)iph
,
647 "Forwarding altered outgoing ICMP");
649 IP_VS_DBG_PKT(11, AF_INET
, pp
, skb
, (void *)ciph
- (void *)iph
,
650 "Forwarding altered incoming ICMP");
653 #ifdef CONFIG_IP_VS_IPV6
654 void ip_vs_nat_icmp_v6(struct sk_buff
*skb
, struct ip_vs_protocol
*pp
,
655 struct ip_vs_conn
*cp
, int inout
)
657 struct ipv6hdr
*iph
= ipv6_hdr(skb
);
658 unsigned int icmp_offset
= sizeof(struct ipv6hdr
);
659 struct icmp6hdr
*icmph
= (struct icmp6hdr
*)(skb_network_header(skb
) +
661 struct ipv6hdr
*ciph
= (struct ipv6hdr
*)(icmph
+ 1);
664 iph
->saddr
= cp
->vaddr
.in6
;
665 ciph
->daddr
= cp
->vaddr
.in6
;
667 iph
->daddr
= cp
->daddr
.in6
;
668 ciph
->saddr
= cp
->daddr
.in6
;
671 /* the TCP/UDP/SCTP port */
672 if (IPPROTO_TCP
== ciph
->nexthdr
|| IPPROTO_UDP
== ciph
->nexthdr
||
673 IPPROTO_SCTP
== ciph
->nexthdr
) {
674 __be16
*ports
= (void *)ciph
+ sizeof(struct ipv6hdr
);
677 ports
[1] = cp
->vport
;
679 ports
[0] = cp
->dport
;
682 /* And finally the ICMP checksum */
683 icmph
->icmp6_cksum
= ~csum_ipv6_magic(&iph
->saddr
, &iph
->daddr
,
684 skb
->len
- icmp_offset
,
686 skb
->csum_start
= skb_network_header(skb
) - skb
->head
+ icmp_offset
;
687 skb
->csum_offset
= offsetof(struct icmp6hdr
, icmp6_cksum
);
688 skb
->ip_summed
= CHECKSUM_PARTIAL
;
691 IP_VS_DBG_PKT(11, AF_INET6
, pp
, skb
,
692 (void *)ciph
- (void *)iph
,
693 "Forwarding altered outgoing ICMPv6");
695 IP_VS_DBG_PKT(11, AF_INET6
, pp
, skb
,
696 (void *)ciph
- (void *)iph
,
697 "Forwarding altered incoming ICMPv6");
701 /* Handle relevant response ICMP messages - forward to the right
702 * destination host. Used for NAT and local client.
704 static int handle_response_icmp(int af
, struct sk_buff
*skb
,
705 union nf_inet_addr
*snet
,
706 __u8 protocol
, struct ip_vs_conn
*cp
,
707 struct ip_vs_protocol
*pp
,
708 unsigned int offset
, unsigned int ihl
)
710 unsigned int verdict
= NF_DROP
;
712 if (IP_VS_FWD_METHOD(cp
) != 0) {
713 pr_err("shouldn't reach here, because the box is on the "
714 "half connection in the tun/dr module.\n");
717 /* Ensure the checksum is correct */
718 if (!skb_csum_unnecessary(skb
) && ip_vs_checksum_complete(skb
, ihl
)) {
719 /* Failed checksum! */
720 IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n",
721 IP_VS_DBG_ADDR(af
, snet
));
725 if (IPPROTO_TCP
== protocol
|| IPPROTO_UDP
== protocol
||
726 IPPROTO_SCTP
== protocol
)
727 offset
+= 2 * sizeof(__u16
);
728 if (!skb_make_writable(skb
, offset
))
731 #ifdef CONFIG_IP_VS_IPV6
733 ip_vs_nat_icmp_v6(skb
, pp
, cp
, 1);
736 ip_vs_nat_icmp(skb
, pp
, cp
, 1);
738 #ifdef CONFIG_IP_VS_IPV6
739 if (af
== AF_INET6
) {
740 if (sysctl_ip_vs_snat_reroute
&& ip6_route_me_harder(skb
) != 0)
744 if ((sysctl_ip_vs_snat_reroute
||
745 skb_rtable(skb
)->rt_flags
& RTCF_LOCAL
) &&
746 ip_route_me_harder(skb
, RTN_LOCAL
) != 0)
749 /* do the statistics and put it back */
750 ip_vs_out_stats(cp
, skb
);
752 skb
->ipvs_property
= 1;
753 if (!(cp
->flags
& IP_VS_CONN_F_NFCT
))
756 ip_vs_update_conntrack(skb
, cp
, 0);
760 __ip_vs_conn_put(cp
);
766 * Handle ICMP messages in the inside-to-outside direction (outgoing).
767 * Find any that might be relevant, check against existing connections.
768 * Currently handles error types - unreachable, quench, ttl exceeded.
770 static int ip_vs_out_icmp(struct sk_buff
*skb
, int *related
,
771 unsigned int hooknum
)
774 struct icmphdr _icmph
, *ic
;
775 struct iphdr _ciph
, *cih
; /* The ip header contained within the ICMP */
776 struct ip_vs_iphdr ciph
;
777 struct ip_vs_conn
*cp
;
778 struct ip_vs_protocol
*pp
;
779 unsigned int offset
, ihl
;
780 union nf_inet_addr snet
;
784 /* reassemble IP fragments */
785 if (ip_hdr(skb
)->frag_off
& htons(IP_MF
| IP_OFFSET
)) {
786 if (ip_vs_gather_frags(skb
, ip_vs_defrag_user(hooknum
)))
791 offset
= ihl
= iph
->ihl
* 4;
792 ic
= skb_header_pointer(skb
, offset
, sizeof(_icmph
), &_icmph
);
796 IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %pI4->%pI4\n",
797 ic
->type
, ntohs(icmp_id(ic
)),
798 &iph
->saddr
, &iph
->daddr
);
801 * Work through seeing if this is for us.
802 * These checks are supposed to be in an order that means easy
803 * things are checked first to speed up processing.... however
804 * this means that some packets will manage to get a long way
805 * down this stack and then be rejected, but that's life.
807 if ((ic
->type
!= ICMP_DEST_UNREACH
) &&
808 (ic
->type
!= ICMP_SOURCE_QUENCH
) &&
809 (ic
->type
!= ICMP_TIME_EXCEEDED
)) {
814 /* Now find the contained IP header */
815 offset
+= sizeof(_icmph
);
816 cih
= skb_header_pointer(skb
, offset
, sizeof(_ciph
), &_ciph
);
818 return NF_ACCEPT
; /* The packet looks wrong, ignore */
820 pp
= ip_vs_proto_get(cih
->protocol
);
824 /* Is the embedded protocol header present? */
825 if (unlikely(cih
->frag_off
& htons(IP_OFFSET
) &&
829 IP_VS_DBG_PKT(11, AF_INET
, pp
, skb
, offset
,
830 "Checking outgoing ICMP for");
832 offset
+= cih
->ihl
* 4;
834 ip_vs_fill_iphdr(AF_INET
, cih
, &ciph
);
835 /* The embedded headers contain source and dest in reverse order */
836 cp
= pp
->conn_out_get(AF_INET
, skb
, pp
, &ciph
, offset
, 1);
840 snet
.ip
= iph
->saddr
;
841 return handle_response_icmp(AF_INET
, skb
, &snet
, cih
->protocol
, cp
,
845 #ifdef CONFIG_IP_VS_IPV6
846 static int ip_vs_out_icmp_v6(struct sk_buff
*skb
, int *related
,
847 unsigned int hooknum
)
850 struct icmp6hdr _icmph
, *ic
;
851 struct ipv6hdr _ciph
, *cih
; /* The ip header contained
853 struct ip_vs_iphdr ciph
;
854 struct ip_vs_conn
*cp
;
855 struct ip_vs_protocol
*pp
;
857 union nf_inet_addr snet
;
861 /* reassemble IP fragments */
862 if (ipv6_hdr(skb
)->nexthdr
== IPPROTO_FRAGMENT
) {
863 if (ip_vs_gather_frags_v6(skb
, ip_vs_defrag_user(hooknum
)))
868 offset
= sizeof(struct ipv6hdr
);
869 ic
= skb_header_pointer(skb
, offset
, sizeof(_icmph
), &_icmph
);
873 IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) %pI6->%pI6\n",
874 ic
->icmp6_type
, ntohs(icmpv6_id(ic
)),
875 &iph
->saddr
, &iph
->daddr
);
878 * Work through seeing if this is for us.
879 * These checks are supposed to be in an order that means easy
880 * things are checked first to speed up processing.... however
881 * this means that some packets will manage to get a long way
882 * down this stack and then be rejected, but that's life.
884 if ((ic
->icmp6_type
!= ICMPV6_DEST_UNREACH
) &&
885 (ic
->icmp6_type
!= ICMPV6_PKT_TOOBIG
) &&
886 (ic
->icmp6_type
!= ICMPV6_TIME_EXCEED
)) {
891 /* Now find the contained IP header */
892 offset
+= sizeof(_icmph
);
893 cih
= skb_header_pointer(skb
, offset
, sizeof(_ciph
), &_ciph
);
895 return NF_ACCEPT
; /* The packet looks wrong, ignore */
897 pp
= ip_vs_proto_get(cih
->nexthdr
);
901 /* Is the embedded protocol header present? */
902 /* TODO: we don't support fragmentation at the moment anyways */
903 if (unlikely(cih
->nexthdr
== IPPROTO_FRAGMENT
&& pp
->dont_defrag
))
906 IP_VS_DBG_PKT(11, AF_INET6
, pp
, skb
, offset
,
907 "Checking outgoing ICMPv6 for");
909 offset
+= sizeof(struct ipv6hdr
);
911 ip_vs_fill_iphdr(AF_INET6
, cih
, &ciph
);
912 /* The embedded headers contain source and dest in reverse order */
913 cp
= pp
->conn_out_get(AF_INET6
, skb
, pp
, &ciph
, offset
, 1);
917 ipv6_addr_copy(&snet
.in6
, &iph
->saddr
);
918 return handle_response_icmp(AF_INET6
, skb
, &snet
, cih
->nexthdr
, cp
,
919 pp
, offset
, sizeof(struct ipv6hdr
));
924 * Check if the SCTP chunk is an ABORT chunk
926 static inline int is_sctp_abort(const struct sk_buff
*skb
, int nh_len
)
928 sctp_chunkhdr_t
*sch
, schunk
;
929 sch
= skb_header_pointer(skb
, nh_len
+ sizeof(sctp_sctphdr_t
),
930 sizeof(schunk
), &schunk
);
933 if (sch
->type
== SCTP_CID_ABORT
)
938 static inline int is_tcp_reset(const struct sk_buff
*skb
, int nh_len
)
940 struct tcphdr _tcph
, *th
;
942 th
= skb_header_pointer(skb
, nh_len
, sizeof(_tcph
), &_tcph
);
948 /* Handle response packets: rewrite addresses and send away...
949 * Used for NAT and local client.
952 handle_response(int af
, struct sk_buff
*skb
, struct ip_vs_protocol
*pp
,
953 struct ip_vs_conn
*cp
, int ihl
)
955 IP_VS_DBG_PKT(11, af
, pp
, skb
, 0, "Outgoing packet");
957 if (!skb_make_writable(skb
, ihl
))
960 /* mangle the packet */
961 if (pp
->snat_handler
&& !pp
->snat_handler(skb
, pp
, cp
))
964 #ifdef CONFIG_IP_VS_IPV6
966 ipv6_hdr(skb
)->saddr
= cp
->vaddr
.in6
;
970 ip_hdr(skb
)->saddr
= cp
->vaddr
.ip
;
971 ip_send_check(ip_hdr(skb
));
975 * nf_iterate does not expect change in the skb->dst->dev.
976 * It looks like it is not fatal to enable this code for hooks
977 * where our handlers are at the end of the chain list and
978 * when all next handlers use skb->dst->dev and not outdev.
979 * It will definitely route properly the inout NAT traffic
980 * when multiple paths are used.
983 /* For policy routing, packets originating from this
984 * machine itself may be routed differently to packets
985 * passing through. We want this packet to be routed as
986 * if it came from this machine itself. So re-compute
987 * the routing information.
989 #ifdef CONFIG_IP_VS_IPV6
990 if (af
== AF_INET6
) {
991 if (sysctl_ip_vs_snat_reroute
&& ip6_route_me_harder(skb
) != 0)
995 if ((sysctl_ip_vs_snat_reroute
||
996 skb_rtable(skb
)->rt_flags
& RTCF_LOCAL
) &&
997 ip_route_me_harder(skb
, RTN_LOCAL
) != 0)
1000 IP_VS_DBG_PKT(10, af
, pp
, skb
, 0, "After SNAT");
1002 ip_vs_out_stats(cp
, skb
);
1003 ip_vs_set_state(cp
, IP_VS_DIR_OUTPUT
, skb
, pp
);
1004 skb
->ipvs_property
= 1;
1005 if (!(cp
->flags
& IP_VS_CONN_F_NFCT
))
1008 ip_vs_update_conntrack(skb
, cp
, 0);
1022 * Check if outgoing packet belongs to the established ip_vs_conn.
1025 ip_vs_out(unsigned int hooknum
, struct sk_buff
*skb
, int af
)
1027 struct ip_vs_iphdr iph
;
1028 struct ip_vs_protocol
*pp
;
1029 struct ip_vs_conn
*cp
;
1033 /* Already marked as IPVS request or reply? */
1034 if (skb
->ipvs_property
)
1037 /* Bad... Do not break raw sockets */
1038 if (unlikely(skb
->sk
!= NULL
&& hooknum
== NF_INET_LOCAL_OUT
&&
1040 struct sock
*sk
= skb
->sk
;
1041 struct inet_sock
*inet
= inet_sk(skb
->sk
);
1043 if (inet
&& sk
->sk_family
== PF_INET
&& inet
->nodefrag
)
1047 if (unlikely(!skb_dst(skb
)))
1050 ip_vs_fill_iphdr(af
, skb_network_header(skb
), &iph
);
1051 #ifdef CONFIG_IP_VS_IPV6
1052 if (af
== AF_INET6
) {
1053 if (unlikely(iph
.protocol
== IPPROTO_ICMPV6
)) {
1055 int verdict
= ip_vs_out_icmp_v6(skb
, &related
,
1060 ip_vs_fill_iphdr(af
, skb_network_header(skb
), &iph
);
1064 if (unlikely(iph
.protocol
== IPPROTO_ICMP
)) {
1066 int verdict
= ip_vs_out_icmp(skb
, &related
, hooknum
);
1070 ip_vs_fill_iphdr(af
, skb_network_header(skb
), &iph
);
1073 pp
= ip_vs_proto_get(iph
.protocol
);
1077 /* reassemble IP fragments */
1078 #ifdef CONFIG_IP_VS_IPV6
1079 if (af
== AF_INET6
) {
1080 if (ipv6_hdr(skb
)->nexthdr
== IPPROTO_FRAGMENT
) {
1081 if (ip_vs_gather_frags_v6(skb
,
1082 ip_vs_defrag_user(hooknum
)))
1086 ip_vs_fill_iphdr(af
, skb_network_header(skb
), &iph
);
1089 if (unlikely(ip_hdr(skb
)->frag_off
& htons(IP_MF
|IP_OFFSET
) &&
1090 !pp
->dont_defrag
)) {
1091 if (ip_vs_gather_frags(skb
,
1092 ip_vs_defrag_user(hooknum
)))
1095 ip_vs_fill_iphdr(af
, skb_network_header(skb
), &iph
);
1099 * Check if the packet belongs to an existing entry
1101 cp
= pp
->conn_out_get(af
, skb
, pp
, &iph
, iph
.len
, 0);
1104 return handle_response(af
, skb
, pp
, cp
, iph
.len
);
1105 if (sysctl_ip_vs_nat_icmp_send
&&
1106 (pp
->protocol
== IPPROTO_TCP
||
1107 pp
->protocol
== IPPROTO_UDP
||
1108 pp
->protocol
== IPPROTO_SCTP
)) {
1109 __be16 _ports
[2], *pptr
;
1111 pptr
= skb_header_pointer(skb
, iph
.len
,
1112 sizeof(_ports
), _ports
);
1114 return NF_ACCEPT
; /* Not for me */
1115 if (ip_vs_lookup_real_service(af
, iph
.protocol
,
1119 * Notify the real server: there is no
1120 * existing entry if it is not RST
1121 * packet or not TCP packet.
1123 if ((iph
.protocol
!= IPPROTO_TCP
&&
1124 iph
.protocol
!= IPPROTO_SCTP
)
1125 || ((iph
.protocol
== IPPROTO_TCP
1126 && !is_tcp_reset(skb
, iph
.len
))
1127 || (iph
.protocol
== IPPROTO_SCTP
1128 && !is_sctp_abort(skb
,
1130 #ifdef CONFIG_IP_VS_IPV6
1131 if (af
== AF_INET6
) {
1133 dev_net(skb_dst(skb
)->dev
);
1136 skb
->dev
= net
->loopback_dev
;
1138 ICMPV6_DEST_UNREACH
,
1139 ICMPV6_PORT_UNREACH
,
1145 ICMP_PORT_UNREACH
, 0);
1150 IP_VS_DBG_PKT(12, af
, pp
, skb
, 0,
1151 "ip_vs_out: packet continues traversal as normal");
1156 * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
1157 * used only for VS/NAT.
1158 * Check if packet is reply for established ip_vs_conn.
1161 ip_vs_reply4(unsigned int hooknum
, struct sk_buff
*skb
,
1162 const struct net_device
*in
, const struct net_device
*out
,
1163 int (*okfn
)(struct sk_buff
*))
1165 return ip_vs_out(hooknum
, skb
, AF_INET
);
1169 * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
1170 * Check if packet is reply for established ip_vs_conn.
1173 ip_vs_local_reply4(unsigned int hooknum
, struct sk_buff
*skb
,
1174 const struct net_device
*in
, const struct net_device
*out
,
1175 int (*okfn
)(struct sk_buff
*))
1177 unsigned int verdict
;
1179 /* Disable BH in LOCAL_OUT until all places are fixed */
1181 verdict
= ip_vs_out(hooknum
, skb
, AF_INET
);
1186 #ifdef CONFIG_IP_VS_IPV6
1189 * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
1190 * used only for VS/NAT.
1191 * Check if packet is reply for established ip_vs_conn.
1194 ip_vs_reply6(unsigned int hooknum
, struct sk_buff
*skb
,
1195 const struct net_device
*in
, const struct net_device
*out
,
1196 int (*okfn
)(struct sk_buff
*))
1198 return ip_vs_out(hooknum
, skb
, AF_INET6
);
1202 * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
1203 * Check if packet is reply for established ip_vs_conn.
1206 ip_vs_local_reply6(unsigned int hooknum
, struct sk_buff
*skb
,
1207 const struct net_device
*in
, const struct net_device
*out
,
1208 int (*okfn
)(struct sk_buff
*))
1210 unsigned int verdict
;
1212 /* Disable BH in LOCAL_OUT until all places are fixed */
1214 verdict
= ip_vs_out(hooknum
, skb
, AF_INET6
);
1222 * Handle ICMP messages in the outside-to-inside direction (incoming).
1223 * Find any that might be relevant, check against existing connections,
1224 * forward to the right destination host if relevant.
1225 * Currently handles error types - unreachable, quench, ttl exceeded.
1228 ip_vs_in_icmp(struct sk_buff
*skb
, int *related
, unsigned int hooknum
)
1231 struct icmphdr _icmph
, *ic
;
1232 struct iphdr _ciph
, *cih
; /* The ip header contained within the ICMP */
1233 struct ip_vs_iphdr ciph
;
1234 struct ip_vs_conn
*cp
;
1235 struct ip_vs_protocol
*pp
;
1236 unsigned int offset
, ihl
, verdict
;
1237 union nf_inet_addr snet
;
1241 /* reassemble IP fragments */
1242 if (ip_hdr(skb
)->frag_off
& htons(IP_MF
| IP_OFFSET
)) {
1243 if (ip_vs_gather_frags(skb
, ip_vs_defrag_user(hooknum
)))
1248 offset
= ihl
= iph
->ihl
* 4;
1249 ic
= skb_header_pointer(skb
, offset
, sizeof(_icmph
), &_icmph
);
1253 IP_VS_DBG(12, "Incoming ICMP (%d,%d) %pI4->%pI4\n",
1254 ic
->type
, ntohs(icmp_id(ic
)),
1255 &iph
->saddr
, &iph
->daddr
);
1258 * Work through seeing if this is for us.
1259 * These checks are supposed to be in an order that means easy
1260 * things are checked first to speed up processing.... however
1261 * this means that some packets will manage to get a long way
1262 * down this stack and then be rejected, but that's life.
1264 if ((ic
->type
!= ICMP_DEST_UNREACH
) &&
1265 (ic
->type
!= ICMP_SOURCE_QUENCH
) &&
1266 (ic
->type
!= ICMP_TIME_EXCEEDED
)) {
1271 /* Now find the contained IP header */
1272 offset
+= sizeof(_icmph
);
1273 cih
= skb_header_pointer(skb
, offset
, sizeof(_ciph
), &_ciph
);
1275 return NF_ACCEPT
; /* The packet looks wrong, ignore */
1277 pp
= ip_vs_proto_get(cih
->protocol
);
1281 /* Is the embedded protocol header present? */
1282 if (unlikely(cih
->frag_off
& htons(IP_OFFSET
) &&
1286 IP_VS_DBG_PKT(11, AF_INET
, pp
, skb
, offset
,
1287 "Checking incoming ICMP for");
1289 offset
+= cih
->ihl
* 4;
1291 ip_vs_fill_iphdr(AF_INET
, cih
, &ciph
);
1292 /* The embedded headers contain source and dest in reverse order */
1293 cp
= pp
->conn_in_get(AF_INET
, skb
, pp
, &ciph
, offset
, 1);
1295 /* The packet could also belong to a local client */
1296 cp
= pp
->conn_out_get(AF_INET
, skb
, pp
, &ciph
, offset
, 1);
1298 snet
.ip
= iph
->saddr
;
1299 return handle_response_icmp(AF_INET
, skb
, &snet
,
1300 cih
->protocol
, cp
, pp
,
1308 /* Ensure the checksum is correct */
1309 if (!skb_csum_unnecessary(skb
) && ip_vs_checksum_complete(skb
, ihl
)) {
1310 /* Failed checksum! */
1311 IP_VS_DBG(1, "Incoming ICMP: failed checksum from %pI4!\n",
1316 /* do the statistics and put it back */
1317 ip_vs_in_stats(cp
, skb
);
1318 if (IPPROTO_TCP
== cih
->protocol
|| IPPROTO_UDP
== cih
->protocol
)
1319 offset
+= 2 * sizeof(__u16
);
1320 verdict
= ip_vs_icmp_xmit(skb
, cp
, pp
, offset
);
1321 /* LOCALNODE from FORWARD hook is not supported */
1322 if (verdict
== NF_ACCEPT
&& hooknum
== NF_INET_FORWARD
&&
1323 skb_rtable(skb
)->rt_flags
& RTCF_LOCAL
) {
1324 IP_VS_DBG(1, "%s(): "
1325 "local delivery to %pI4 but in FORWARD\n",
1326 __func__
, &skb_rtable(skb
)->rt_dst
);
1331 __ip_vs_conn_put(cp
);
#ifdef CONFIG_IP_VS_IPV6
/*
 *	IPv6 counterpart of ip_vs_in_icmp(): inspect an incoming ICMPv6
 *	error, look up the connection named by the embedded header and,
 *	if one exists, hand the packet to the ICMP transmitter.
 *
 *	@skb:     packet carrying the ICMPv6 message
 *	@related: out parameter; set to 1 on entry and reset to 0 only when
 *	          the ICMPv6 type is not dest-unreach, pkt-too-big or
 *	          time-exceeded
 *	@hooknum: netfilter hook the packet was seen on
 *
 *	Returns a netfilter verdict.
 */
static int
ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
{
	struct ipv6hdr *outer;
	struct icmp6hdr icmp_buf, *icmp6;
	struct ipv6hdr inner_buf, *inner;	/* header quoted inside the ICMPv6 */
	struct ip_vs_iphdr inner_iph;
	struct ip_vs_conn *conn;
	struct ip_vs_protocol *proto;
	unsigned int off, verdict;
	union nf_inet_addr src;
	struct rt6_info *rt;

	*related = 1;

	/* reassemble IP fragments */
	if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT &&
	    ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum)))
		return NF_STOLEN;

	outer = ipv6_hdr(skb);
	off = sizeof(struct ipv6hdr);
	icmp6 = skb_header_pointer(skb, off, sizeof(icmp_buf), &icmp_buf);
	if (icmp6 == NULL)
		return NF_DROP;

	IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) %pI6->%pI6\n",
		  icmp6->icmp6_type, ntohs(icmpv6_id(icmp6)),
		  &outer->saddr, &outer->daddr);

	/*
	 * Cheap checks come first; anything that is not one of the
	 * handled error types is reported as unrelated and passed on.
	 */
	switch (icmp6->icmp6_type) {
	case ICMPV6_DEST_UNREACH:
	case ICMPV6_PKT_TOOBIG:
	case ICMPV6_TIME_EXCEED:
		break;
	default:
		*related = 0;
		return NF_ACCEPT;
	}

	/* Now find the contained IP header */
	off += sizeof(icmp_buf);
	inner = skb_header_pointer(skb, off, sizeof(inner_buf), &inner_buf);
	if (inner == NULL)
		return NF_ACCEPT; /* The packet looks wrong, ignore */

	proto = ip_vs_proto_get(inner->nexthdr);
	if (!proto)
		return NF_ACCEPT;

	/* Is the embedded protocol header present? */
	/* TODO: we don't support fragmentation at the moment anyways */
	if (unlikely(inner->nexthdr == IPPROTO_FRAGMENT && proto->dont_defrag))
		return NF_ACCEPT;

	IP_VS_DBG_PKT(11, AF_INET6, proto, skb, off,
		      "Checking incoming ICMPv6 for");

	off += sizeof(struct ipv6hdr);

	ip_vs_fill_iphdr(AF_INET6, inner, &inner_iph);
	/* The embedded headers contain source and dest in reverse order */
	conn = proto->conn_in_get(AF_INET6, skb, proto, &inner_iph, off, 1);
	if (!conn) {
		/* The packet could also belong to a local client */
		conn = proto->conn_out_get(AF_INET6, skb, proto, &inner_iph,
					   off, 1);
		if (!conn)
			return NF_ACCEPT;

		ipv6_addr_copy(&src.in6, &outer->saddr);
		return handle_response_icmp(AF_INET6, skb, &src,
					    inner->nexthdr,
					    conn, proto, off,
					    sizeof(struct ipv6hdr));
	}

	/* do the statistics and put it back */
	ip_vs_in_stats(conn, skb);
	switch (inner->nexthdr) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
	case IPPROTO_SCTP:
		/* step over the two 16-bit port fields */
		off += 2 * sizeof(__u16);
		break;
	default:
		break;
	}
	verdict = ip_vs_icmp_xmit_v6(skb, conn, proto, off);

	/* LOCALNODE from FORWARD hook is not supported */
	if (verdict == NF_ACCEPT && hooknum == NF_INET_FORWARD &&
	    (rt = (struct rt6_info *) skb_dst(skb)) &&
	    rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK)) {
		IP_VS_DBG(1, "%s(): "
			  "local delivery to %pI6 but in FORWARD\n",
			  __func__, &rt->rt6i_dst);
		verdict = NF_DROP;
	}

	__ip_vs_conn_put(conn);

	return verdict;
}
#endif
1445 * Check if it's for virtual services, look it up,
1446 * and send it on its way...
1449 ip_vs_in(unsigned int hooknum
, struct sk_buff
*skb
, int af
)
1451 struct ip_vs_iphdr iph
;
1452 struct ip_vs_protocol
*pp
;
1453 struct ip_vs_conn
*cp
;
1454 int ret
, restart
, pkts
;
1456 /* Already marked as IPVS request or reply? */
1457 if (skb
->ipvs_property
)
1462 * - remote client: only PACKET_HOST
1463 * - route: used for struct net when skb->dev is unset
1465 if (unlikely((skb
->pkt_type
!= PACKET_HOST
&&
1466 hooknum
!= NF_INET_LOCAL_OUT
) ||
1468 ip_vs_fill_iphdr(af
, skb_network_header(skb
), &iph
);
1469 IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s"
1470 " ignored in hook %u\n",
1471 skb
->pkt_type
, iph
.protocol
,
1472 IP_VS_DBG_ADDR(af
, &iph
.daddr
), hooknum
);
1475 ip_vs_fill_iphdr(af
, skb_network_header(skb
), &iph
);
1477 /* Bad... Do not break raw sockets */
1478 if (unlikely(skb
->sk
!= NULL
&& hooknum
== NF_INET_LOCAL_OUT
&&
1480 struct sock
*sk
= skb
->sk
;
1481 struct inet_sock
*inet
= inet_sk(skb
->sk
);
1483 if (inet
&& sk
->sk_family
== PF_INET
&& inet
->nodefrag
)
1487 #ifdef CONFIG_IP_VS_IPV6
1488 if (af
== AF_INET6
) {
1489 if (unlikely(iph
.protocol
== IPPROTO_ICMPV6
)) {
1491 int verdict
= ip_vs_in_icmp_v6(skb
, &related
, hooknum
);
1495 ip_vs_fill_iphdr(af
, skb_network_header(skb
), &iph
);
1499 if (unlikely(iph
.protocol
== IPPROTO_ICMP
)) {
1501 int verdict
= ip_vs_in_icmp(skb
, &related
, hooknum
);
1505 ip_vs_fill_iphdr(af
, skb_network_header(skb
), &iph
);
1508 /* Protocol supported? */
1509 pp
= ip_vs_proto_get(iph
.protocol
);
1514 * Check if the packet belongs to an existing connection entry
1516 cp
= pp
->conn_in_get(af
, skb
, pp
, &iph
, iph
.len
, 0);
1518 if (unlikely(!cp
)) {
1521 if (!pp
->conn_schedule(af
, skb
, pp
, &v
, &cp
))
1525 if (unlikely(!cp
)) {
1526 /* sorry, all this trouble for a no-hit :) */
1527 IP_VS_DBG_PKT(12, af
, pp
, skb
, 0,
1528 "ip_vs_in: packet continues traversal as normal");
1532 IP_VS_DBG_PKT(11, af
, pp
, skb
, 0, "Incoming packet");
1534 /* Check the server status */
1535 if (cp
->dest
&& !(cp
->dest
->flags
& IP_VS_DEST_F_AVAILABLE
)) {
1536 /* the destination server is not available */
1538 if (sysctl_ip_vs_expire_nodest_conn
) {
1539 /* try to expire the connection immediately */
1540 ip_vs_conn_expire_now(cp
);
1542 /* don't restart its timer, and silently
1544 __ip_vs_conn_put(cp
);
1548 ip_vs_in_stats(cp
, skb
);
1549 restart
= ip_vs_set_state(cp
, IP_VS_DIR_INPUT
, skb
, pp
);
1550 if (cp
->packet_xmit
)
1551 ret
= cp
->packet_xmit(skb
, cp
, pp
);
1552 /* do not touch skb anymore */
1554 IP_VS_DBG_RL("warning: packet_xmit is null");
1558 /* Increase its packet counter and check if it is needed
1559 * to be synchronized
1561 * Sync connection if it is about to close to
1562 * encorage the standby servers to update the connections timeout
1564 pkts
= atomic_add_return(1, &cp
->in_pkts
);
1565 if (af
== AF_INET
&& (ip_vs_sync_state
& IP_VS_STATE_MASTER
) &&
1566 cp
->protocol
== IPPROTO_SCTP
) {
1567 if ((cp
->state
== IP_VS_SCTP_S_ESTABLISHED
&&
1568 (pkts
% sysctl_ip_vs_sync_threshold
[1]
1569 == sysctl_ip_vs_sync_threshold
[0])) ||
1570 (cp
->old_state
!= cp
->state
&&
1571 ((cp
->state
== IP_VS_SCTP_S_CLOSED
) ||
1572 (cp
->state
== IP_VS_SCTP_S_SHUT_ACK_CLI
) ||
1573 (cp
->state
== IP_VS_SCTP_S_SHUT_ACK_SER
)))) {
1574 ip_vs_sync_conn(cp
);
1579 /* Keep this block last: TCP and others with pp->num_states <= 1 */
1580 else if (af
== AF_INET
&&
1581 (ip_vs_sync_state
& IP_VS_STATE_MASTER
) &&
1582 (((cp
->protocol
!= IPPROTO_TCP
||
1583 cp
->state
== IP_VS_TCP_S_ESTABLISHED
) &&
1584 (pkts
% sysctl_ip_vs_sync_threshold
[1]
1585 == sysctl_ip_vs_sync_threshold
[0])) ||
1586 ((cp
->protocol
== IPPROTO_TCP
) && (cp
->old_state
!= cp
->state
) &&
1587 ((cp
->state
== IP_VS_TCP_S_FIN_WAIT
) ||
1588 (cp
->state
== IP_VS_TCP_S_CLOSE
) ||
1589 (cp
->state
== IP_VS_TCP_S_CLOSE_WAIT
) ||
1590 (cp
->state
== IP_VS_TCP_S_TIME_WAIT
)))))
1591 ip_vs_sync_conn(cp
);
1593 cp
->old_state
= cp
->state
;
1600 * AF_INET handler in NF_INET_LOCAL_IN chain
1601 * Schedule and forward packets from remote clients
1604 ip_vs_remote_request4(unsigned int hooknum
, struct sk_buff
*skb
,
1605 const struct net_device
*in
,
1606 const struct net_device
*out
,
1607 int (*okfn
)(struct sk_buff
*))
1609 return ip_vs_in(hooknum
, skb
, AF_INET
);
1613 * AF_INET handler in NF_INET_LOCAL_OUT chain
1614 * Schedule and forward packets from local clients
1617 ip_vs_local_request4(unsigned int hooknum
, struct sk_buff
*skb
,
1618 const struct net_device
*in
, const struct net_device
*out
,
1619 int (*okfn
)(struct sk_buff
*))
1621 unsigned int verdict
;
1623 /* Disable BH in LOCAL_OUT until all places are fixed */
1625 verdict
= ip_vs_in(hooknum
, skb
, AF_INET
);
#ifdef CONFIG_IP_VS_IPV6
/*
 *	IPv6 hook for the NF_INET_LOCAL_IN chain: requests arriving from
 *	remote clients are scheduled and forwarded by ip_vs_in().
 *	The in, out and okfn arguments are not used.
 */
static unsigned int
ip_vs_remote_request6(unsigned int hooknum, struct sk_buff *skb,
		      const struct net_device *in,
		      const struct net_device *out,
		      int (*okfn)(struct sk_buff *))
{
	return ip_vs_in(hooknum, skb, AF_INET6);
}
#endif
#ifdef CONFIG_IP_VS_IPV6
/*
 *	IPv6 hook for the NF_INET_LOCAL_OUT chain: requests issued by
 *	local clients are scheduled and forwarded by ip_vs_in(), with
 *	bottom halves disabled around the call.
 *	The in, out and okfn arguments are not used.
 */
static unsigned int
ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb,
		     const struct net_device *in, const struct net_device *out,
		     int (*okfn)(struct sk_buff *))
{
	unsigned int nf_verdict;

	/* Disable BH in LOCAL_OUT until all places are fixed */
	local_bh_disable();
	nf_verdict = ip_vs_in(hooknum, skb, AF_INET6);
	local_bh_enable();

	return nf_verdict;
}
#endif
1667 * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
1668 * related packets destined for 0.0.0.0/0.
1669 * When fwmark-based virtual service is used, such as transparent
1670 * cache cluster, TCP packets can be marked and routed to ip_vs_in,
1671 * but ICMP destined for 0.0.0.0/0 cannot not be easily marked and
1672 * sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain
1673 * and send them to ip_vs_in_icmp.
1676 ip_vs_forward_icmp(unsigned int hooknum
, struct sk_buff
*skb
,
1677 const struct net_device
*in
, const struct net_device
*out
,
1678 int (*okfn
)(struct sk_buff
*))
1682 if (ip_hdr(skb
)->protocol
!= IPPROTO_ICMP
)
1685 return ip_vs_in_icmp(skb
, &r
, hooknum
);
#ifdef CONFIG_IP_VS_IPV6
/*
 *	IPv6 variant of ip_vs_forward_icmp(): on the NF_INET_FORWARD chain,
 *	pass packets whose next header is ICMPv6 to ip_vs_in_icmp_v6() and
 *	accept everything else untouched.  The in, out and okfn arguments
 *	are not used.
 */
static unsigned int
ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
		      const struct net_device *in, const struct net_device *out,
		      int (*okfn)(struct sk_buff *))
{
	int related;	/* required by ip_vs_in_icmp_v6(), ignored here */

	if (ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6)
		return ip_vs_in_icmp_v6(skb, &related, hooknum);

	return NF_ACCEPT;
}
#endif
1704 static struct nf_hook_ops ip_vs_ops
[] __read_mostly
= {
1705 /* After packet filtering, change source only for VS/NAT */
1707 .hook
= ip_vs_reply4
,
1708 .owner
= THIS_MODULE
,
1710 .hooknum
= NF_INET_LOCAL_IN
,
1713 /* After packet filtering, forward packet through VS/DR, VS/TUN,
1714 * or VS/NAT(change destination), so that filtering rules can be
1715 * applied to IPVS. */
1717 .hook
= ip_vs_remote_request4
,
1718 .owner
= THIS_MODULE
,
1720 .hooknum
= NF_INET_LOCAL_IN
,
1723 /* Before ip_vs_in, change source only for VS/NAT */
1725 .hook
= ip_vs_local_reply4
,
1726 .owner
= THIS_MODULE
,
1728 .hooknum
= NF_INET_LOCAL_OUT
,
1731 /* After mangle, schedule and forward local requests */
1733 .hook
= ip_vs_local_request4
,
1734 .owner
= THIS_MODULE
,
1736 .hooknum
= NF_INET_LOCAL_OUT
,
1739 /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1740 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
1742 .hook
= ip_vs_forward_icmp
,
1743 .owner
= THIS_MODULE
,
1745 .hooknum
= NF_INET_FORWARD
,
1748 /* After packet filtering, change source only for VS/NAT */
1750 .hook
= ip_vs_reply4
,
1751 .owner
= THIS_MODULE
,
1753 .hooknum
= NF_INET_FORWARD
,
1756 #ifdef CONFIG_IP_VS_IPV6
1757 /* After packet filtering, change source only for VS/NAT */
1759 .hook
= ip_vs_reply6
,
1760 .owner
= THIS_MODULE
,
1762 .hooknum
= NF_INET_LOCAL_IN
,
1765 /* After packet filtering, forward packet through VS/DR, VS/TUN,
1766 * or VS/NAT(change destination), so that filtering rules can be
1767 * applied to IPVS. */
1769 .hook
= ip_vs_remote_request6
,
1770 .owner
= THIS_MODULE
,
1772 .hooknum
= NF_INET_LOCAL_IN
,
1775 /* Before ip_vs_in, change source only for VS/NAT */
1777 .hook
= ip_vs_local_reply6
,
1778 .owner
= THIS_MODULE
,
1780 .hooknum
= NF_INET_LOCAL_OUT
,
1783 /* After mangle, schedule and forward local requests */
1785 .hook
= ip_vs_local_request6
,
1786 .owner
= THIS_MODULE
,
1788 .hooknum
= NF_INET_LOCAL_OUT
,
1791 /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1792 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
1794 .hook
= ip_vs_forward_icmp_v6
,
1795 .owner
= THIS_MODULE
,
1797 .hooknum
= NF_INET_FORWARD
,
1800 /* After packet filtering, change source only for VS/NAT */
1802 .hook
= ip_vs_reply6
,
1803 .owner
= THIS_MODULE
,
1805 .hooknum
= NF_INET_FORWARD
,
1813 * Initialize IP Virtual Server
1815 static int __init
ip_vs_init(void)
1819 ip_vs_estimator_init();
1821 ret
= ip_vs_control_init();
1823 pr_err("can't setup control.\n");
1824 goto cleanup_estimator
;
1827 ip_vs_protocol_init();
1829 ret
= ip_vs_app_init();
1831 pr_err("can't setup application helper.\n");
1832 goto cleanup_protocol
;
1835 ret
= ip_vs_conn_init();
1837 pr_err("can't setup connection table.\n");
1841 ret
= nf_register_hooks(ip_vs_ops
, ARRAY_SIZE(ip_vs_ops
));
1843 pr_err("can't register hooks.\n");
1847 pr_info("ipvs loaded.\n");
1851 ip_vs_conn_cleanup();
1853 ip_vs_app_cleanup();
1855 ip_vs_protocol_cleanup();
1856 ip_vs_control_cleanup();
1858 ip_vs_estimator_cleanup();
1862 static void __exit
ip_vs_cleanup(void)
1864 nf_unregister_hooks(ip_vs_ops
, ARRAY_SIZE(ip_vs_ops
));
1865 ip_vs_conn_cleanup();
1866 ip_vs_app_cleanup();
1867 ip_vs_protocol_cleanup();
1868 ip_vs_control_cleanup();
1869 ip_vs_estimator_cleanup();
1870 pr_info("ipvs unloaded.\n");
1873 module_init(ip_vs_init
);
1874 module_exit(ip_vs_cleanup
);
1875 MODULE_LICENSE("GPL");