mlxsw: spectrum: Add support for access cable info via ethtool
[linux-2.6/btrfs-unstable.git] / net / netfilter / ipvs / ip_vs_proto_tcp.c
blob12dc8d5bc37d7ea03ba8448514a6d3caf03a62b0
/*
 * ip_vs_proto_tcp.c:	TCP load balancing support for IPVS
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *              Julian Anastasov <ja@ssi.bg>
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Changes:     Hans Schillstrom <hans.schillstrom@ericsson.com>
 *
 *              Network name space (netns) aware.
 *              Global data moved to netns, i.e. struct netns_ipvs.
 *              The tcp_timeouts table has a copy per netns in a hash table
 *              per protocol (ip_vs_proto_data) and is handled by netns.
 */
20 #define KMSG_COMPONENT "IPVS"
21 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
23 #include <linux/kernel.h>
24 #include <linux/ip.h>
25 #include <linux/tcp.h> /* for tcphdr */
26 #include <net/ip.h>
27 #include <net/tcp.h> /* for csum_tcpudp_magic */
28 #include <net/ip6_checksum.h>
29 #include <linux/netfilter.h>
30 #include <linux/netfilter_ipv4.h>
32 #include <net/ip_vs.h>
34 static int
35 tcp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
36 struct ip_vs_proto_data *pd,
37 int *verdict, struct ip_vs_conn **cpp,
38 struct ip_vs_iphdr *iph)
40 struct ip_vs_service *svc;
41 struct tcphdr _tcph, *th;
42 __be16 _ports[2], *ports = NULL;
44 /* In the event of icmp, we're only guaranteed to have the first 8
45 * bytes of the transport header, so we only check the rest of the
46 * TCP packet for non-ICMP packets
48 if (likely(!ip_vs_iph_icmp(iph))) {
49 th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
50 if (th) {
51 if (th->rst || !(sysctl_sloppy_tcp(ipvs) || th->syn))
52 return 1;
53 ports = &th->source;
55 } else {
56 ports = skb_header_pointer(
57 skb, iph->len, sizeof(_ports), &_ports);
60 if (!ports) {
61 *verdict = NF_DROP;
62 return 0;
65 /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
66 rcu_read_lock();
68 if (likely(!ip_vs_iph_inverse(iph)))
69 svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol,
70 &iph->daddr, ports[1]);
71 else
72 svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol,
73 &iph->saddr, ports[0]);
75 if (svc) {
76 int ignored;
78 if (ip_vs_todrop(ipvs)) {
80 * It seems that we are very loaded.
81 * We have to drop this packet :(
83 rcu_read_unlock();
84 *verdict = NF_DROP;
85 return 0;
89 * Let the virtual server select a real server for the
90 * incoming connection, and create a connection entry.
92 *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
93 if (!*cpp && ignored <= 0) {
94 if (!ignored)
95 *verdict = ip_vs_leave(svc, skb, pd, iph);
96 else
97 *verdict = NF_DROP;
98 rcu_read_unlock();
99 return 0;
102 rcu_read_unlock();
103 /* NF_ACCEPT */
104 return 1;
108 static inline void
109 tcp_fast_csum_update(int af, struct tcphdr *tcph,
110 const union nf_inet_addr *oldip,
111 const union nf_inet_addr *newip,
112 __be16 oldport, __be16 newport)
114 #ifdef CONFIG_IP_VS_IPV6
115 if (af == AF_INET6)
116 tcph->check =
117 csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
118 ip_vs_check_diff2(oldport, newport,
119 ~csum_unfold(tcph->check))));
120 else
121 #endif
122 tcph->check =
123 csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
124 ip_vs_check_diff2(oldport, newport,
125 ~csum_unfold(tcph->check))));
129 static inline void
130 tcp_partial_csum_update(int af, struct tcphdr *tcph,
131 const union nf_inet_addr *oldip,
132 const union nf_inet_addr *newip,
133 __be16 oldlen, __be16 newlen)
135 #ifdef CONFIG_IP_VS_IPV6
136 if (af == AF_INET6)
137 tcph->check =
138 ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
139 ip_vs_check_diff2(oldlen, newlen,
140 csum_unfold(tcph->check))));
141 else
142 #endif
143 tcph->check =
144 ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
145 ip_vs_check_diff2(oldlen, newlen,
146 csum_unfold(tcph->check))));
150 static int
151 tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
152 struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
154 struct tcphdr *tcph;
155 unsigned int tcphoff = iph->len;
156 int oldlen;
157 int payload_csum = 0;
159 #ifdef CONFIG_IP_VS_IPV6
160 if (cp->af == AF_INET6 && iph->fragoffs)
161 return 1;
162 #endif
163 oldlen = skb->len - tcphoff;
165 /* csum_check requires unshared skb */
166 if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
167 return 0;
169 if (unlikely(cp->app != NULL)) {
170 int ret;
172 /* Some checks before mangling */
173 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
174 return 0;
176 /* Call application helper if needed */
177 if (!(ret = ip_vs_app_pkt_out(cp, skb)))
178 return 0;
179 /* ret=2: csum update is needed after payload mangling */
180 if (ret == 1)
181 oldlen = skb->len - tcphoff;
182 else
183 payload_csum = 1;
186 tcph = (void *)skb_network_header(skb) + tcphoff;
187 tcph->source = cp->vport;
189 /* Adjust TCP checksums */
190 if (skb->ip_summed == CHECKSUM_PARTIAL) {
191 tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
192 htons(oldlen),
193 htons(skb->len - tcphoff));
194 } else if (!payload_csum) {
195 /* Only port and addr are changed, do fast csum update */
196 tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
197 cp->dport, cp->vport);
198 if (skb->ip_summed == CHECKSUM_COMPLETE)
199 skb->ip_summed = (cp->app && pp->csum_check) ?
200 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
201 } else {
202 /* full checksum calculation */
203 tcph->check = 0;
204 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
205 #ifdef CONFIG_IP_VS_IPV6
206 if (cp->af == AF_INET6)
207 tcph->check = csum_ipv6_magic(&cp->vaddr.in6,
208 &cp->caddr.in6,
209 skb->len - tcphoff,
210 cp->protocol, skb->csum);
211 else
212 #endif
213 tcph->check = csum_tcpudp_magic(cp->vaddr.ip,
214 cp->caddr.ip,
215 skb->len - tcphoff,
216 cp->protocol,
217 skb->csum);
218 skb->ip_summed = CHECKSUM_UNNECESSARY;
220 IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
221 pp->name, tcph->check,
222 (char*)&(tcph->check) - (char*)tcph);
224 return 1;
228 static int
229 tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
230 struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
232 struct tcphdr *tcph;
233 unsigned int tcphoff = iph->len;
234 int oldlen;
235 int payload_csum = 0;
237 #ifdef CONFIG_IP_VS_IPV6
238 if (cp->af == AF_INET6 && iph->fragoffs)
239 return 1;
240 #endif
241 oldlen = skb->len - tcphoff;
243 /* csum_check requires unshared skb */
244 if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
245 return 0;
247 if (unlikely(cp->app != NULL)) {
248 int ret;
250 /* Some checks before mangling */
251 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
252 return 0;
255 * Attempt ip_vs_app call.
256 * It will fix ip_vs_conn and iph ack_seq stuff
258 if (!(ret = ip_vs_app_pkt_in(cp, skb)))
259 return 0;
260 /* ret=2: csum update is needed after payload mangling */
261 if (ret == 1)
262 oldlen = skb->len - tcphoff;
263 else
264 payload_csum = 1;
267 tcph = (void *)skb_network_header(skb) + tcphoff;
268 tcph->dest = cp->dport;
271 * Adjust TCP checksums
273 if (skb->ip_summed == CHECKSUM_PARTIAL) {
274 tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
275 htons(oldlen),
276 htons(skb->len - tcphoff));
277 } else if (!payload_csum) {
278 /* Only port and addr are changed, do fast csum update */
279 tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
280 cp->vport, cp->dport);
281 if (skb->ip_summed == CHECKSUM_COMPLETE)
282 skb->ip_summed = (cp->app && pp->csum_check) ?
283 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
284 } else {
285 /* full checksum calculation */
286 tcph->check = 0;
287 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
288 #ifdef CONFIG_IP_VS_IPV6
289 if (cp->af == AF_INET6)
290 tcph->check = csum_ipv6_magic(&cp->caddr.in6,
291 &cp->daddr.in6,
292 skb->len - tcphoff,
293 cp->protocol, skb->csum);
294 else
295 #endif
296 tcph->check = csum_tcpudp_magic(cp->caddr.ip,
297 cp->daddr.ip,
298 skb->len - tcphoff,
299 cp->protocol,
300 skb->csum);
301 skb->ip_summed = CHECKSUM_UNNECESSARY;
303 return 1;
307 static int
308 tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
310 unsigned int tcphoff;
312 #ifdef CONFIG_IP_VS_IPV6
313 if (af == AF_INET6)
314 tcphoff = sizeof(struct ipv6hdr);
315 else
316 #endif
317 tcphoff = ip_hdrlen(skb);
319 switch (skb->ip_summed) {
320 case CHECKSUM_NONE:
321 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
322 case CHECKSUM_COMPLETE:
323 #ifdef CONFIG_IP_VS_IPV6
324 if (af == AF_INET6) {
325 if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
326 &ipv6_hdr(skb)->daddr,
327 skb->len - tcphoff,
328 ipv6_hdr(skb)->nexthdr,
329 skb->csum)) {
330 IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
331 "Failed checksum for");
332 return 0;
334 } else
335 #endif
336 if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
337 ip_hdr(skb)->daddr,
338 skb->len - tcphoff,
339 ip_hdr(skb)->protocol,
340 skb->csum)) {
341 IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
342 "Failed checksum for");
343 return 0;
345 break;
346 default:
347 /* No need to checksum. */
348 break;
351 return 1;
355 #define TCP_DIR_INPUT 0
356 #define TCP_DIR_OUTPUT 4
357 #define TCP_DIR_INPUT_ONLY 8
359 static const int tcp_state_off[IP_VS_DIR_LAST] = {
360 [IP_VS_DIR_INPUT] = TCP_DIR_INPUT,
361 [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT,
362 [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY,
366 * Timeout table[state]
368 static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
369 [IP_VS_TCP_S_NONE] = 2*HZ,
370 [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ,
371 [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ,
372 [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ,
373 [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ,
374 [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ,
375 [IP_VS_TCP_S_CLOSE] = 10*HZ,
376 [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ,
377 [IP_VS_TCP_S_LAST_ACK] = 30*HZ,
378 [IP_VS_TCP_S_LISTEN] = 2*60*HZ,
379 [IP_VS_TCP_S_SYNACK] = 120*HZ,
380 [IP_VS_TCP_S_LAST] = 2*HZ,
383 static const char *const tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
384 [IP_VS_TCP_S_NONE] = "NONE",
385 [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED",
386 [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT",
387 [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV",
388 [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT",
389 [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT",
390 [IP_VS_TCP_S_CLOSE] = "CLOSE",
391 [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT",
392 [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK",
393 [IP_VS_TCP_S_LISTEN] = "LISTEN",
394 [IP_VS_TCP_S_SYNACK] = "SYNACK",
395 [IP_VS_TCP_S_LAST] = "BUG!",
398 static const bool tcp_state_active_table[IP_VS_TCP_S_LAST] = {
399 [IP_VS_TCP_S_NONE] = false,
400 [IP_VS_TCP_S_ESTABLISHED] = true,
401 [IP_VS_TCP_S_SYN_SENT] = true,
402 [IP_VS_TCP_S_SYN_RECV] = true,
403 [IP_VS_TCP_S_FIN_WAIT] = false,
404 [IP_VS_TCP_S_TIME_WAIT] = false,
405 [IP_VS_TCP_S_CLOSE] = false,
406 [IP_VS_TCP_S_CLOSE_WAIT] = false,
407 [IP_VS_TCP_S_LAST_ACK] = false,
408 [IP_VS_TCP_S_LISTEN] = false,
409 [IP_VS_TCP_S_SYNACK] = true,
/* Short aliases that keep the state transition tables readable. */
#define sNO	IP_VS_TCP_S_NONE
#define sES	IP_VS_TCP_S_ESTABLISHED
#define sSS	IP_VS_TCP_S_SYN_SENT
#define sSR	IP_VS_TCP_S_SYN_RECV
#define sFW	IP_VS_TCP_S_FIN_WAIT
#define sTW	IP_VS_TCP_S_TIME_WAIT
#define sCL	IP_VS_TCP_S_CLOSE
#define sCW	IP_VS_TCP_S_CLOSE_WAIT
#define sLA	IP_VS_TCP_S_LAST_ACK
#define sLI	IP_VS_TCP_S_LISTEN
#define sSA	IP_VS_TCP_S_SYNACK
424 struct tcp_states_t {
425 int next_state[IP_VS_TCP_S_LAST];
428 static const char * tcp_state_name(int state)
430 if (state >= IP_VS_TCP_S_LAST)
431 return "ERR!";
432 return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";
435 static bool tcp_state_active(int state)
437 if (state >= IP_VS_TCP_S_LAST)
438 return false;
439 return tcp_state_active_table[state];
442 static struct tcp_states_t tcp_states [] = {
443 /* INPUT */
444 /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
445 /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
446 /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
447 /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
448 /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
450 /* OUTPUT */
451 /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
452 /*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
453 /*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
454 /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
455 /*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
457 /* INPUT-ONLY */
458 /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
459 /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
460 /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
461 /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
462 /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
465 static struct tcp_states_t tcp_states_dos [] = {
466 /* INPUT */
467 /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
468 /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
469 /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
470 /*ack*/ {{sES, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
471 /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
473 /* OUTPUT */
474 /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
475 /*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
476 /*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
477 /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
478 /*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
480 /* INPUT-ONLY */
481 /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
482 /*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
483 /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
484 /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
485 /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
488 static void tcp_timeout_change(struct ip_vs_proto_data *pd, int flags)
490 int on = (flags & 1); /* secure_tcp */
493 ** FIXME: change secure_tcp to independent sysctl var
494 ** or make it per-service or per-app because it is valid
495 ** for most if not for all of the applications. Something
496 ** like "capabilities" (flags) for each object.
498 pd->tcp_state_table = (on ? tcp_states_dos : tcp_states);
501 static inline int tcp_state_idx(struct tcphdr *th)
503 if (th->rst)
504 return 3;
505 if (th->syn)
506 return 0;
507 if (th->fin)
508 return 1;
509 if (th->ack)
510 return 2;
511 return -1;
514 static inline void
515 set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
516 int direction, struct tcphdr *th)
518 int state_idx;
519 int new_state = IP_VS_TCP_S_CLOSE;
520 int state_off = tcp_state_off[direction];
523 * Update state offset to INPUT_ONLY if necessary
524 * or delete NO_OUTPUT flag if output packet detected
526 if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
527 if (state_off == TCP_DIR_OUTPUT)
528 cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
529 else
530 state_off = TCP_DIR_INPUT_ONLY;
533 if ((state_idx = tcp_state_idx(th)) < 0) {
534 IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
535 goto tcp_state_out;
538 new_state =
539 pd->tcp_state_table[state_off+state_idx].next_state[cp->state];
541 tcp_state_out:
542 if (new_state != cp->state) {
543 struct ip_vs_dest *dest = cp->dest;
545 IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->"
546 "%s:%d state: %s->%s conn->refcnt:%d\n",
547 pd->pp->name,
548 ((state_off == TCP_DIR_OUTPUT) ?
549 "output " : "input "),
550 th->syn ? 'S' : '.',
551 th->fin ? 'F' : '.',
552 th->ack ? 'A' : '.',
553 th->rst ? 'R' : '.',
554 IP_VS_DBG_ADDR(cp->daf, &cp->daddr),
555 ntohs(cp->dport),
556 IP_VS_DBG_ADDR(cp->af, &cp->caddr),
557 ntohs(cp->cport),
558 tcp_state_name(cp->state),
559 tcp_state_name(new_state),
560 refcount_read(&cp->refcnt));
562 if (dest) {
563 if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
564 !tcp_state_active(new_state)) {
565 atomic_dec(&dest->activeconns);
566 atomic_inc(&dest->inactconns);
567 cp->flags |= IP_VS_CONN_F_INACTIVE;
568 } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
569 tcp_state_active(new_state)) {
570 atomic_inc(&dest->activeconns);
571 atomic_dec(&dest->inactconns);
572 cp->flags &= ~IP_VS_CONN_F_INACTIVE;
577 if (likely(pd))
578 cp->timeout = pd->timeout_table[cp->state = new_state];
579 else /* What to do ? */
580 cp->timeout = tcp_timeouts[cp->state = new_state];
584 * Handle state transitions
586 static void
587 tcp_state_transition(struct ip_vs_conn *cp, int direction,
588 const struct sk_buff *skb,
589 struct ip_vs_proto_data *pd)
591 struct tcphdr _tcph, *th;
593 #ifdef CONFIG_IP_VS_IPV6
594 int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
595 #else
596 int ihl = ip_hdrlen(skb);
597 #endif
599 th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph);
600 if (th == NULL)
601 return;
603 spin_lock_bh(&cp->lock);
604 set_tcp_state(pd, cp, direction, th);
605 spin_unlock_bh(&cp->lock);
608 static inline __u16 tcp_app_hashkey(__be16 port)
610 return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port)
611 & TCP_APP_TAB_MASK;
615 static int tcp_register_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc)
617 struct ip_vs_app *i;
618 __u16 hash;
619 __be16 port = inc->port;
620 int ret = 0;
621 struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
623 hash = tcp_app_hashkey(port);
625 list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) {
626 if (i->port == port) {
627 ret = -EEXIST;
628 goto out;
631 list_add_rcu(&inc->p_list, &ipvs->tcp_apps[hash]);
632 atomic_inc(&pd->appcnt);
634 out:
635 return ret;
639 static void
640 tcp_unregister_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc)
642 struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
644 atomic_dec(&pd->appcnt);
645 list_del_rcu(&inc->p_list);
649 static int
650 tcp_app_conn_bind(struct ip_vs_conn *cp)
652 struct netns_ipvs *ipvs = cp->ipvs;
653 int hash;
654 struct ip_vs_app *inc;
655 int result = 0;
657 /* Default binding: bind app only for NAT */
658 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
659 return 0;
661 /* Lookup application incarnations and bind the right one */
662 hash = tcp_app_hashkey(cp->vport);
664 rcu_read_lock();
665 list_for_each_entry_rcu(inc, &ipvs->tcp_apps[hash], p_list) {
666 if (inc->port == cp->vport) {
667 if (unlikely(!ip_vs_app_inc_get(inc)))
668 break;
669 rcu_read_unlock();
671 IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
672 "%s:%u to app %s on port %u\n",
673 __func__,
674 IP_VS_DBG_ADDR(cp->af, &cp->caddr),
675 ntohs(cp->cport),
676 IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
677 ntohs(cp->vport),
678 inc->name, ntohs(inc->port));
680 cp->app = inc;
681 if (inc->init_conn)
682 result = inc->init_conn(inc, cp);
683 goto out;
686 rcu_read_unlock();
688 out:
689 return result;
694 * Set LISTEN timeout. (ip_vs_conn_put will setup timer)
696 void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
698 struct ip_vs_proto_data *pd = ip_vs_proto_data_get(cp->ipvs, IPPROTO_TCP);
700 spin_lock_bh(&cp->lock);
701 cp->state = IP_VS_TCP_S_LISTEN;
702 cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN]
703 : tcp_timeouts[IP_VS_TCP_S_LISTEN]);
704 spin_unlock_bh(&cp->lock);
707 /* ---------------------------------------------
708 * timeouts is netns related now.
709 * ---------------------------------------------
711 static int __ip_vs_tcp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd)
713 ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE);
714 pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts,
715 sizeof(tcp_timeouts));
716 if (!pd->timeout_table)
717 return -ENOMEM;
718 pd->tcp_state_table = tcp_states;
719 return 0;
722 static void __ip_vs_tcp_exit(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd)
724 kfree(pd->timeout_table);
728 struct ip_vs_protocol ip_vs_protocol_tcp = {
729 .name = "TCP",
730 .protocol = IPPROTO_TCP,
731 .num_states = IP_VS_TCP_S_LAST,
732 .dont_defrag = 0,
733 .init = NULL,
734 .exit = NULL,
735 .init_netns = __ip_vs_tcp_init,
736 .exit_netns = __ip_vs_tcp_exit,
737 .register_app = tcp_register_app,
738 .unregister_app = tcp_unregister_app,
739 .conn_schedule = tcp_conn_schedule,
740 .conn_in_get = ip_vs_conn_in_get_proto,
741 .conn_out_get = ip_vs_conn_out_get_proto,
742 .snat_handler = tcp_snat_handler,
743 .dnat_handler = tcp_dnat_handler,
744 .csum_check = tcp_csum_check,
745 .state_name = tcp_state_name,
746 .state_transition = tcp_state_transition,
747 .app_conn_bind = tcp_app_conn_bind,
748 .debug_packet = ip_vs_tcpudp_debug_packet,
749 .timeout_change = tcp_timeout_change,