4492 zone hung in down state, stuck in ilb_stack_fini
[unleashed.git] / usr / src / uts / common / inet / ilb / ilb_conn.c
blob7f79d41dd61e9256dd93a34e7b639129e2cf31e7
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 * Copyright 2014 Joyent, Inc. All rights reserved.
28 #include <sys/sysmacros.h>
29 #include <sys/types.h>
30 #include <sys/conf.h>
31 #include <sys/time.h>
32 #include <sys/taskq.h>
33 #include <sys/cmn_err.h>
34 #include <sys/sdt.h>
35 #include <sys/atomic.h>
36 #include <netinet/in.h>
37 #include <inet/ip.h>
38 #include <inet/ip6.h>
39 #include <inet/tcp.h>
40 #include <inet/udp_impl.h>
41 #include <inet/ilb.h>
43 #include "ilb_stack.h"
44 #include "ilb_impl.h"
45 #include "ilb_conn.h"
46 #include "ilb_nat.h"
49 * Timer struct for ilb_conn_t and ilb_sticky_t garbage collection
51 * start: starting index into the hash table to do gc
52 * end: ending index into the hash table to do gc
53 * ilbs: pointer to the ilb_stack_t of the IP stack
54 * tid_lock: mutex to protect the timer id.
55 * tid: timer id of the timer
57 typedef struct ilb_timer_s {
58 uint32_t start;
59 uint32_t end;
60 ilb_stack_t *ilbs;
61 kmutex_t tid_lock;
62 timeout_id_t tid;
63 } ilb_timer_t;
/*
 * Hash macro for finding the index into the conn hash table.  The low
 * four bytes of source and destination address are XOR-combined and
 * weighted by powers of 37 (37^3 = 50653, 37^2 = 1369, 37), and the two
 * ports are mixed in.  hash_size must be a power of 2 so the final mask
 * produces a valid index.
 */
#define	ILB_CONN_HASH(saddr, sport, daddr, dport, hash_size)	\
	(((*((saddr) + 3) ^ *((daddr) + 3)) * 50653 +		\
	(*((saddr) + 2) ^ *((daddr) + 2)) * 1369 +		\
	(*((saddr) + 1) ^ *((daddr) + 1)) * 37 +		\
	(*(saddr) ^ *(daddr)) + (sport) * 37 + (dport)) &	\
	((hash_size) - 1))
73 /* Kmem cache for the conn hash entry */
74 static struct kmem_cache *ilb_conn_cache = NULL;
77 * There are 60 timers running to do conn cache garbage collection. Each
78 * gc thread is responsible for 1/60 of the conn hash table.
80 static int ilb_conn_timer_size = 60;
82 /* Each of the above gc timers wake up every 15s to do the gc. */
83 static int ilb_conn_cache_timeout = 15;
/*
 * Hash macro for the sticky (persistence) table: mixes the low four
 * bytes of the client address with the four bytes of the rule hash
 * value, weighted by powers of 31 (31^3 = 29791, 31^2 = 961, 31).
 * hash_size must be a power of 2.
 */
#define	ILB_STICKY_HASH(saddr, rule, hash_size)			\
	(((*((saddr) + 3) ^ ((rule) >> 24)) * 29791 +		\
	(*((saddr) + 2) ^ ((rule) >> 16)) * 961 +		\
	(*((saddr) + 1) ^ ((rule) >> 8)) * 31 +			\
	(*(saddr) ^ (rule))) & ((hash_size) - 1))
91 static struct kmem_cache *ilb_sticky_cache = NULL;
94 * There are 60 timers running to do sticky cache garbage collection. Each
95 * gc thread is responsible for 1/60 of the sticky hash table.
97 static int ilb_sticky_timer_size = 60;
99 /* Each of the above gc timers wake up every 15s to do the gc. */
100 static int ilb_sticky_timeout = 15;
/*
 * Drop a reference on an ilb_sticky_t and record the time of last use
 * (used by the sticky gc timer to expire idle entries).  The entry's
 * bucket lock serializes refcnt/atime updates.
 *
 * Hygiene fixes over the original: the body is wrapped in
 * do { } while (0) so the macro behaves as a single statement in all
 * contexts (e.g. unbraced if/else), and (s) is parenthesized in every
 * expansion so expression arguments expand safely.
 */
#define	ILB_STICKY_REFRELE(s)				\
	do {						\
		mutex_enter(&(s)->hash->sticky_lock);	\
		(s)->refcnt--;				\
		(s)->atime = ddi_get_lbolt64();		\
		mutex_exit(&(s)->hash->sticky_lock);	\
	} while (0)
111 static void
112 ilb_conn_cache_init(void)
114 ilb_conn_cache = kmem_cache_create("ilb_conn_cache",
115 sizeof (ilb_conn_t), 0, NULL, NULL, NULL, NULL, NULL,
116 ilb_kmem_flags);
119 void
120 ilb_conn_cache_fini(void)
122 if (ilb_conn_cache != NULL) {
123 kmem_cache_destroy(ilb_conn_cache);
124 ilb_conn_cache = NULL;
128 static void
129 ilb_conn_remove_common(ilb_conn_t *connp, boolean_t c2s)
131 ilb_conn_hash_t *hash;
132 ilb_conn_t **next, **prev;
133 ilb_conn_t **next_prev, **prev_next;
135 if (c2s) {
136 hash = connp->conn_c2s_hash;
137 ASSERT(MUTEX_HELD(&hash->ilb_conn_hash_lock));
138 next = &connp->conn_c2s_next;
139 prev = &connp->conn_c2s_prev;
140 if (*next != NULL)
141 next_prev = &(*next)->conn_c2s_prev;
142 if (*prev != NULL)
143 prev_next = &(*prev)->conn_c2s_next;
144 } else {
145 hash = connp->conn_s2c_hash;
146 ASSERT(MUTEX_HELD(&hash->ilb_conn_hash_lock));
147 next = &connp->conn_s2c_next;
148 prev = &connp->conn_s2c_prev;
149 if (*next != NULL)
150 next_prev = &(*next)->conn_s2c_prev;
151 if (*prev != NULL)
152 prev_next = &(*prev)->conn_s2c_next;
155 if (hash->ilb_connp == connp) {
156 hash->ilb_connp = *next;
157 if (*next != NULL)
158 *next_prev = NULL;
159 } else {
160 if (*prev != NULL)
161 *prev_next = *next;
162 if (*next != NULL)
163 *next_prev = *prev;
165 ASSERT(hash->ilb_conn_cnt > 0);
166 hash->ilb_conn_cnt--;
168 *next = NULL;
169 *prev = NULL;
172 static void
173 ilb_conn_remove(ilb_conn_t *connp)
175 ASSERT(MUTEX_HELD(&connp->conn_c2s_hash->ilb_conn_hash_lock));
176 ilb_conn_remove_common(connp, B_TRUE);
177 ASSERT(MUTEX_HELD(&connp->conn_s2c_hash->ilb_conn_hash_lock));
178 ilb_conn_remove_common(connp, B_FALSE);
180 if (connp->conn_rule_cache.topo == ILB_TOPO_IMPL_NAT) {
181 in_port_t port;
183 port = ntohs(connp->conn_rule_cache.info.nat_sport);
184 vmem_free(connp->conn_rule_cache.info.src_ent->nse_port_arena,
185 (void *)(uintptr_t)port, 1);
188 if (connp->conn_sticky != NULL)
189 ILB_STICKY_REFRELE(connp->conn_sticky);
190 ILB_SERVER_REFRELE(connp->conn_server);
191 kmem_cache_free(ilb_conn_cache, connp);
195 * Routine to do periodic garbage collection of conn hash entries. When
196 * a conn hash timer fires, it dispatches a taskq to call this function
197 * to do the gc. Note that each taskq is responisble for a portion of
198 * the table. The portion is stored in timer->start, timer->end.
200 static void
201 ilb_conn_cleanup(void *arg)
203 ilb_timer_t *timer = (ilb_timer_t *)arg;
204 uint32_t i;
205 ilb_stack_t *ilbs;
206 ilb_conn_hash_t *c2s_hash, *s2c_hash;
207 ilb_conn_t *connp, *nxt_connp;
208 int64_t now;
209 int64_t expiry;
210 boolean_t die_now;
212 ilbs = timer->ilbs;
213 c2s_hash = ilbs->ilbs_c2s_conn_hash;
214 ASSERT(c2s_hash != NULL);
216 now = ddi_get_lbolt64();
217 for (i = timer->start; i < timer->end; i++) {
218 mutex_enter(&c2s_hash[i].ilb_conn_hash_lock);
219 if ((connp = c2s_hash[i].ilb_connp) == NULL) {
220 ASSERT(c2s_hash[i].ilb_conn_cnt == 0);
221 mutex_exit(&c2s_hash[i].ilb_conn_hash_lock);
222 continue;
224 do {
225 ASSERT(c2s_hash[i].ilb_conn_cnt > 0);
226 ASSERT(connp->conn_c2s_hash == &c2s_hash[i]);
227 nxt_connp = connp->conn_c2s_next;
228 expiry = now - SEC_TO_TICK(connp->conn_expiry);
229 if (connp->conn_server->iser_die_time != 0 &&
230 connp->conn_server->iser_die_time < now)
231 die_now = B_TRUE;
232 else
233 die_now = B_FALSE;
234 s2c_hash = connp->conn_s2c_hash;
235 mutex_enter(&s2c_hash->ilb_conn_hash_lock);
237 if (connp->conn_gc || die_now ||
238 (connp->conn_c2s_atime < expiry &&
239 connp->conn_s2c_atime < expiry)) {
240 /* Need to update the nat list cur_connp */
241 if (connp == ilbs->ilbs_conn_list_connp) {
242 ilbs->ilbs_conn_list_connp =
243 connp->conn_c2s_next;
245 ilb_conn_remove(connp);
246 goto nxt_connp;
249 if (connp->conn_l4 != IPPROTO_TCP)
250 goto nxt_connp;
252 /* Update and check TCP related conn info */
253 if (connp->conn_c2s_tcp_fin_sent &&
254 SEQ_GT(connp->conn_s2c_tcp_ack,
255 connp->conn_c2s_tcp_fss)) {
256 connp->conn_c2s_tcp_fin_acked = B_TRUE;
258 if (connp->conn_s2c_tcp_fin_sent &&
259 SEQ_GT(connp->conn_c2s_tcp_ack,
260 connp->conn_s2c_tcp_fss)) {
261 connp->conn_s2c_tcp_fin_acked = B_TRUE;
263 if (connp->conn_c2s_tcp_fin_acked &&
264 connp->conn_s2c_tcp_fin_acked) {
265 ilb_conn_remove(connp);
267 nxt_connp:
268 mutex_exit(&s2c_hash->ilb_conn_hash_lock);
269 connp = nxt_connp;
270 } while (connp != NULL);
271 mutex_exit(&c2s_hash[i].ilb_conn_hash_lock);
275 /* Conn hash timer routine. It dispatches a taskq and restart the timer */
276 static void
277 ilb_conn_timer(void *arg)
279 ilb_timer_t *timer = (ilb_timer_t *)arg;
281 (void) taskq_dispatch(timer->ilbs->ilbs_conn_taskq, ilb_conn_cleanup,
282 arg, TQ_SLEEP);
283 mutex_enter(&timer->tid_lock);
284 if (timer->tid == 0) {
285 mutex_exit(&timer->tid_lock);
286 } else {
287 timer->tid = timeout(ilb_conn_timer, arg,
288 SEC_TO_TICK(ilb_conn_cache_timeout));
289 mutex_exit(&timer->tid_lock);
293 void
294 ilb_conn_hash_init(ilb_stack_t *ilbs)
296 extern pri_t minclsyspri;
297 int i, part;
298 ilb_timer_t *tm;
299 char tq_name[TASKQ_NAMELEN];
302 * If ilbs->ilbs_conn_hash_size is not a power of 2, bump it up to
303 * the next power of 2.
305 if (!ISP2(ilbs->ilbs_conn_hash_size)) {
306 for (i = 0; i < 31; i++) {
307 if (ilbs->ilbs_conn_hash_size < (1 << i))
308 break;
310 ilbs->ilbs_conn_hash_size = 1 << i;
314 * Can sleep since this should be called when a rule is being added,
315 * hence we are not in interrupt context.
317 ilbs->ilbs_c2s_conn_hash = kmem_zalloc(sizeof (ilb_conn_hash_t) *
318 ilbs->ilbs_conn_hash_size, KM_SLEEP);
319 ilbs->ilbs_s2c_conn_hash = kmem_zalloc(sizeof (ilb_conn_hash_t) *
320 ilbs->ilbs_conn_hash_size, KM_SLEEP);
322 for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) {
323 mutex_init(&ilbs->ilbs_c2s_conn_hash[i].ilb_conn_hash_lock,
324 NULL, MUTEX_DEFAULT, NULL);
326 for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) {
327 mutex_init(&ilbs->ilbs_s2c_conn_hash[i].ilb_conn_hash_lock,
328 NULL, MUTEX_DEFAULT, NULL);
331 if (ilb_conn_cache == NULL)
332 ilb_conn_cache_init();
334 (void) snprintf(tq_name, sizeof (tq_name), "ilb_conn_taskq_%p",
335 (void *)ilbs->ilbs_netstack);
336 ASSERT(ilbs->ilbs_conn_taskq == NULL);
337 ilbs->ilbs_conn_taskq = taskq_create(tq_name,
338 ilb_conn_timer_size * 2, minclsyspri, ilb_conn_timer_size,
339 ilb_conn_timer_size * 2, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
341 ASSERT(ilbs->ilbs_conn_timer_list == NULL);
342 ilbs->ilbs_conn_timer_list = kmem_zalloc(sizeof (ilb_timer_t) *
343 ilb_conn_timer_size, KM_SLEEP);
346 * The hash table is divided in equal partition for those timers
347 * to do garbage collection.
349 part = ilbs->ilbs_conn_hash_size / ilb_conn_timer_size + 1;
350 for (i = 0; i < ilb_conn_timer_size; i++) {
351 tm = ilbs->ilbs_conn_timer_list + i;
352 tm->start = i * part;
353 tm->end = i * part + part;
354 if (tm->end > ilbs->ilbs_conn_hash_size)
355 tm->end = ilbs->ilbs_conn_hash_size;
356 tm->ilbs = ilbs;
357 mutex_init(&tm->tid_lock, NULL, MUTEX_DEFAULT, NULL);
358 /* Spread out the starting execution time of all the timers. */
359 tm->tid = timeout(ilb_conn_timer, tm,
360 SEC_TO_TICK(ilb_conn_cache_timeout + i));
364 void
365 ilb_conn_hash_fini(ilb_stack_t *ilbs)
367 uint32_t i;
368 ilb_conn_t *connp;
369 ilb_conn_hash_t *hash;
371 if (ilbs->ilbs_c2s_conn_hash == NULL) {
372 ASSERT(ilbs->ilbs_s2c_conn_hash == NULL);
373 return;
376 /* Stop all the timers first. */
377 for (i = 0; i < ilb_conn_timer_size; i++) {
378 timeout_id_t tid;
380 /* Setting tid to 0 tells the timer handler not to restart. */
381 mutex_enter(&ilbs->ilbs_conn_timer_list[i].tid_lock);
382 tid = ilbs->ilbs_conn_timer_list[i].tid;
383 ilbs->ilbs_conn_timer_list[i].tid = 0;
384 mutex_exit(&ilbs->ilbs_conn_timer_list[i].tid_lock);
385 (void) untimeout(tid);
387 kmem_free(ilbs->ilbs_conn_timer_list, sizeof (ilb_timer_t) *
388 ilb_conn_timer_size);
389 taskq_destroy(ilbs->ilbs_conn_taskq);
390 ilbs->ilbs_conn_taskq = NULL;
392 /* Then remove all the conns. */
393 hash = ilbs->ilbs_s2c_conn_hash;
394 for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) {
395 while ((connp = hash[i].ilb_connp) != NULL) {
396 hash[i].ilb_connp = connp->conn_s2c_next;
397 ILB_SERVER_REFRELE(connp->conn_server);
398 if (connp->conn_rule_cache.topo == ILB_TOPO_IMPL_NAT) {
399 ilb_nat_src_entry_t *ent;
400 in_port_t port;
403 * src_ent will be freed in ilb_nat_src_fini().
405 port = ntohs(
406 connp->conn_rule_cache.info.nat_sport);
407 ent = connp->conn_rule_cache.info.src_ent;
408 vmem_free(ent->nse_port_arena,
409 (void *)(uintptr_t)port, 1);
411 kmem_cache_free(ilb_conn_cache, connp);
414 kmem_free(ilbs->ilbs_c2s_conn_hash, sizeof (ilb_conn_hash_t) *
415 ilbs->ilbs_conn_hash_size);
416 kmem_free(ilbs->ilbs_s2c_conn_hash, sizeof (ilb_conn_hash_t) *
417 ilbs->ilbs_conn_hash_size);
/*
 * Internet checksum adjustment calculation routines.  We pre-calculate
 * the checksum adjustment so that we don't need to recompute the
 * checksum over the whole packet when we change an address/port in it.
 */

/*
 * Half-NAT IPv4 adjustment: fold the one's-complement sum of the old
 * address halfwords (and optional port), complement it, and add in the
 * new values.  Ports of 0 reduce this to an address-only adjustment.
 */
static void
hnat_cksum_v4(uint16_t *oaddr, uint16_t *naddr, in_port_t old_port,
    in_port_t new_port, uint32_t *adj_sum)
{
	uint32_t sum;

	sum = oaddr[0] + oaddr[1] + old_port;
	/* Fold carries back into the low 16 bits. */
	while ((sum >> 16) != 0)
		sum = (sum & 0xffff) + (sum >> 16);
	*adj_sum = (uint16_t)~sum + naddr[0] + naddr[1] + new_port;
}
/*
 * Half-NAT IPv6 adjustment: same scheme as hnat_cksum_v4() but over the
 * eight 16-bit halfwords of an IPv6 address.  The loops sum exactly the
 * same terms as the original expression; uint32_t cannot overflow here
 * (at most 18 sixteen-bit terms).
 */
static void
hnat_cksum_v6(uint16_t *oaddr, uint16_t *naddr, in_port_t old_port,
    in_port_t new_port, uint32_t *adj_sum)
{
	uint32_t sum = old_port;
	int i;

	for (i = 0; i < 8; i++)
		sum += oaddr[i];
	/* Fold carries back into the low 16 bits. */
	while ((sum >> 16) != 0)
		sum = (sum & 0xffff) + (sum >> 16);
	sum = (uint16_t)~sum + new_port;
	for (i = 0; i < 8; i++)
		sum += naddr[i];
	*adj_sum = sum;
}
/*
 * Full-NAT IPv4 adjustment: both source and destination address/port
 * pairs are rewritten, so both old pairs are folded out and both new
 * pairs are added in.
 */
static void
fnat_cksum_v4(uint16_t *oaddr1, uint16_t *oaddr2, uint16_t *naddr1,
    uint16_t *naddr2, in_port_t old_port1, in_port_t old_port2,
    in_port_t new_port1, in_port_t new_port2, uint32_t *adj_sum)
{
	uint32_t sum;

	sum = oaddr1[0] + oaddr1[1] + old_port1 +
	    oaddr2[0] + oaddr2[1] + old_port2;
	/* Fold carries back into the low 16 bits. */
	while ((sum >> 16) != 0)
		sum = (sum & 0xffff) + (sum >> 16);
	*adj_sum = (uint16_t)~sum + naddr1[0] + naddr1[1] + new_port1 +
	    naddr2[0] + naddr2[1] + new_port2;
}
/*
 * Full-NAT IPv6 adjustment: the IPv6 analogue of fnat_cksum_v4().  The
 * loops sum the same terms as the original unrolled expression; the
 * uint32_t accumulator cannot overflow (at most 34 sixteen-bit terms).
 */
static void
fnat_cksum_v6(uint16_t *oaddr1, uint16_t *oaddr2, uint16_t *naddr1,
    uint16_t *naddr2, in_port_t old_port1, in_port_t old_port2,
    in_port_t new_port1, in_port_t new_port2, uint32_t *adj_sum)
{
	uint32_t sum = old_port1 + old_port2;
	int i;

	for (i = 0; i < 8; i++)
		sum += oaddr1[i] + oaddr2[i];
	/* Fold carries back into the low 16 bits. */
	while ((sum >> 16) != 0)
		sum = (sum & 0xffff) + (sum >> 16);
	sum = (uint16_t)~sum + new_port1 + new_port2;
	for (i = 0; i < 8; i++)
		sum += naddr1[i] + naddr2[i];
	*adj_sum = sum;
}
493 * Add a conn hash entry to the tables. Note that a conn hash entry
494 * (ilb_conn_t) contains info on both directions. And there are two hash
495 * tables, one for client to server and the other for server to client.
496 * So the same entry is added to both tables and can be ccessed by two
497 * thread simultaneously. But each thread will only access data on one
498 * direction, so there is no conflict.
501 ilb_conn_add(ilb_stack_t *ilbs, ilb_rule_t *rule, ilb_server_t *server,
502 in6_addr_t *src, in_port_t sport, in6_addr_t *dst, in_port_t dport,
503 ilb_nat_info_t *info, uint32_t *ip_sum, uint32_t *tp_sum, ilb_sticky_t *s)
505 ilb_conn_t *connp;
506 ilb_conn_hash_t *hash;
507 int i;
509 connp = kmem_cache_alloc(ilb_conn_cache, KM_NOSLEEP);
510 if (connp == NULL) {
511 if (s != NULL) {
512 if (rule->ir_topo == ILB_TOPO_IMPL_NAT) {
513 ilb_nat_src_entry_t **entry;
515 entry = s->server->iser_nat_src->src_list;
516 vmem_free(entry[s->nat_src_idx]->nse_port_arena,
517 (void *)(uintptr_t)ntohs(info->nat_sport),
520 ILB_STICKY_REFRELE(s);
522 return (ENOMEM);
525 connp->conn_l4 = rule->ir_proto;
527 connp->conn_server = server;
528 ILB_SERVER_REFHOLD(server);
529 connp->conn_sticky = s;
531 connp->conn_rule_cache.topo = rule->ir_topo;
532 connp->conn_rule_cache.info = *info;
534 connp->conn_gc = B_FALSE;
536 connp->conn_expiry = rule->ir_nat_expiry;
537 connp->conn_cr_time = ddi_get_lbolt64();
539 /* Client to server info. */
540 connp->conn_c2s_saddr = *src;
541 connp->conn_c2s_sport = sport;
542 connp->conn_c2s_daddr = *dst;
543 connp->conn_c2s_dport = dport;
545 connp->conn_c2s_atime = ddi_get_lbolt64();
546 /* The packet ths triggers this creation should be counted */
547 connp->conn_c2s_pkt_cnt = 1;
548 connp->conn_c2s_tcp_fin_sent = B_FALSE;
549 connp->conn_c2s_tcp_fin_acked = B_FALSE;
551 /* Server to client info, before NAT */
552 switch (rule->ir_topo) {
553 case ILB_TOPO_IMPL_HALF_NAT:
554 connp->conn_s2c_saddr = info->nat_dst;
555 connp->conn_s2c_sport = info->nat_dport;
556 connp->conn_s2c_daddr = *src;
557 connp->conn_s2c_dport = sport;
559 /* Pre-calculate checksum changes for both directions */
560 if (rule->ir_ipver == IPPROTO_IP) {
561 hnat_cksum_v4((uint16_t *)&dst->s6_addr32[3],
562 (uint16_t *)&info->nat_dst.s6_addr32[3], 0, 0,
563 &connp->conn_c2s_ip_sum);
564 hnat_cksum_v4((uint16_t *)&dst->s6_addr32[3],
565 (uint16_t *)&info->nat_dst.s6_addr32[3], dport,
566 info->nat_dport, &connp->conn_c2s_tp_sum);
567 *ip_sum = connp->conn_c2s_ip_sum;
568 *tp_sum = connp->conn_c2s_tp_sum;
570 hnat_cksum_v4(
571 (uint16_t *)&info->nat_dst.s6_addr32[3],
572 (uint16_t *)&dst->s6_addr32[3], 0, 0,
573 &connp->conn_s2c_ip_sum);
574 hnat_cksum_v4(
575 (uint16_t *)&info->nat_dst.s6_addr32[3],
576 (uint16_t *)&dst->s6_addr32[3],
577 info->nat_dport, dport,
578 &connp->conn_s2c_tp_sum);
579 } else {
580 connp->conn_c2s_ip_sum = 0;
581 hnat_cksum_v6((uint16_t *)dst,
582 (uint16_t *)&info->nat_dst, dport,
583 info->nat_dport, &connp->conn_c2s_tp_sum);
584 *ip_sum = 0;
585 *tp_sum = connp->conn_c2s_tp_sum;
587 connp->conn_s2c_ip_sum = 0;
588 hnat_cksum_v6((uint16_t *)&info->nat_dst,
589 (uint16_t *)dst, info->nat_dport, dport,
590 &connp->conn_s2c_tp_sum);
592 break;
593 case ILB_TOPO_IMPL_NAT:
594 connp->conn_s2c_saddr = info->nat_dst;
595 connp->conn_s2c_sport = info->nat_dport;
596 connp->conn_s2c_daddr = info->nat_src;
597 connp->conn_s2c_dport = info->nat_sport;
599 if (rule->ir_ipver == IPPROTO_IP) {
600 fnat_cksum_v4((uint16_t *)&src->s6_addr32[3],
601 (uint16_t *)&dst->s6_addr32[3],
602 (uint16_t *)&info->nat_src.s6_addr32[3],
603 (uint16_t *)&info->nat_dst.s6_addr32[3],
604 0, 0, 0, 0, &connp->conn_c2s_ip_sum);
605 fnat_cksum_v4((uint16_t *)&src->s6_addr32[3],
606 (uint16_t *)&dst->s6_addr32[3],
607 (uint16_t *)&info->nat_src.s6_addr32[3],
608 (uint16_t *)&info->nat_dst.s6_addr32[3],
609 sport, dport, info->nat_sport,
610 info->nat_dport, &connp->conn_c2s_tp_sum);
611 *ip_sum = connp->conn_c2s_ip_sum;
612 *tp_sum = connp->conn_c2s_tp_sum;
614 fnat_cksum_v4(
615 (uint16_t *)&info->nat_src.s6_addr32[3],
616 (uint16_t *)&info->nat_dst.s6_addr32[3],
617 (uint16_t *)&src->s6_addr32[3],
618 (uint16_t *)&dst->s6_addr32[3],
619 0, 0, 0, 0, &connp->conn_s2c_ip_sum);
620 fnat_cksum_v4(
621 (uint16_t *)&info->nat_src.s6_addr32[3],
622 (uint16_t *)&info->nat_dst.s6_addr32[3],
623 (uint16_t *)&src->s6_addr32[3],
624 (uint16_t *)&dst->s6_addr32[3],
625 info->nat_sport, info->nat_dport,
626 sport, dport, &connp->conn_s2c_tp_sum);
627 } else {
628 fnat_cksum_v6((uint16_t *)src, (uint16_t *)dst,
629 (uint16_t *)&info->nat_src,
630 (uint16_t *)&info->nat_dst,
631 sport, dport, info->nat_sport,
632 info->nat_dport, &connp->conn_c2s_tp_sum);
633 connp->conn_c2s_ip_sum = 0;
634 *ip_sum = 0;
635 *tp_sum = connp->conn_c2s_tp_sum;
637 fnat_cksum_v6((uint16_t *)&info->nat_src,
638 (uint16_t *)&info->nat_dst, (uint16_t *)src,
639 (uint16_t *)dst, info->nat_sport,
640 info->nat_dport, sport, dport,
641 &connp->conn_s2c_tp_sum);
642 connp->conn_s2c_ip_sum = 0;
644 break;
647 connp->conn_s2c_atime = ddi_get_lbolt64();
648 connp->conn_s2c_pkt_cnt = 1;
649 connp->conn_s2c_tcp_fin_sent = B_FALSE;
650 connp->conn_s2c_tcp_fin_acked = B_FALSE;
652 /* Add it to the s2c hash table. */
653 hash = ilbs->ilbs_s2c_conn_hash;
654 i = ILB_CONN_HASH((uint8_t *)&connp->conn_s2c_saddr.s6_addr32[3],
655 ntohs(connp->conn_s2c_sport),
656 (uint8_t *)&connp->conn_s2c_daddr.s6_addr32[3],
657 ntohs(connp->conn_s2c_dport), ilbs->ilbs_conn_hash_size);
658 connp->conn_s2c_hash = &hash[i];
659 DTRACE_PROBE2(ilb__conn__hash__add__s2c, ilb_conn_t *, connp, int, i);
661 mutex_enter(&hash[i].ilb_conn_hash_lock);
662 hash[i].ilb_conn_cnt++;
663 connp->conn_s2c_next = hash[i].ilb_connp;
664 if (hash[i].ilb_connp != NULL)
665 hash[i].ilb_connp->conn_s2c_prev = connp;
666 connp->conn_s2c_prev = NULL;
667 hash[i].ilb_connp = connp;
668 mutex_exit(&hash[i].ilb_conn_hash_lock);
670 /* Add it to the c2s hash table. */
671 hash = ilbs->ilbs_c2s_conn_hash;
672 i = ILB_CONN_HASH((uint8_t *)&src->s6_addr32[3], ntohs(sport),
673 (uint8_t *)&dst->s6_addr32[3], ntohs(dport),
674 ilbs->ilbs_conn_hash_size);
675 connp->conn_c2s_hash = &hash[i];
676 DTRACE_PROBE2(ilb__conn__hash__add__c2s, ilb_conn_t *, connp, int, i);
678 mutex_enter(&hash[i].ilb_conn_hash_lock);
679 hash[i].ilb_conn_cnt++;
680 connp->conn_c2s_next = hash[i].ilb_connp;
681 if (hash[i].ilb_connp != NULL)
682 hash[i].ilb_connp->conn_c2s_prev = connp;
683 connp->conn_c2s_prev = NULL;
684 hash[i].ilb_connp = connp;
685 mutex_exit(&hash[i].ilb_conn_hash_lock);
687 return (0);
691 * If a connection is using TCP, we keep track of simple TCP state transition
692 * so that we know when to clean up an entry.
694 static boolean_t
695 update_conn_tcp(ilb_conn_t *connp, void *iph, tcpha_t *tcpha, int32_t pkt_len,
696 boolean_t c2s)
698 uint32_t ack, seq;
699 int32_t seg_len;
701 if (tcpha->tha_flags & TH_RST)
702 return (B_FALSE);
704 seg_len = pkt_len - ((uint8_t *)tcpha - (uint8_t *)iph) -
705 TCP_HDR_LENGTH((tcph_t *)tcpha);
707 if (tcpha->tha_flags & TH_ACK)
708 ack = ntohl(tcpha->tha_ack);
709 seq = ntohl(tcpha->tha_seq);
710 if (c2s) {
711 ASSERT(MUTEX_HELD(&connp->conn_c2s_hash->ilb_conn_hash_lock));
712 if (tcpha->tha_flags & TH_FIN) {
713 connp->conn_c2s_tcp_fss = seq + seg_len;
714 connp->conn_c2s_tcp_fin_sent = B_TRUE;
716 connp->conn_c2s_tcp_ack = ack;
718 /* Port reuse by the client, restart the conn. */
719 if (connp->conn_c2s_tcp_fin_sent &&
720 SEQ_GT(seq, connp->conn_c2s_tcp_fss + 1)) {
721 connp->conn_c2s_tcp_fin_sent = B_FALSE;
722 connp->conn_c2s_tcp_fin_acked = B_FALSE;
724 } else {
725 ASSERT(MUTEX_HELD(&connp->conn_s2c_hash->ilb_conn_hash_lock));
726 if (tcpha->tha_flags & TH_FIN) {
727 connp->conn_s2c_tcp_fss = seq + seg_len;
728 connp->conn_s2c_tcp_fin_sent = B_TRUE;
730 connp->conn_s2c_tcp_ack = ack;
732 /* Port reuse by the client, restart the conn. */
733 if (connp->conn_s2c_tcp_fin_sent &&
734 SEQ_GT(seq, connp->conn_s2c_tcp_fss + 1)) {
735 connp->conn_s2c_tcp_fin_sent = B_FALSE;
736 connp->conn_s2c_tcp_fin_acked = B_FALSE;
740 return (B_TRUE);
744 * Helper routint to find conn hash entry given some packet information and
745 * the traffic direction (c2s, client to server?)
747 static boolean_t
748 ilb_find_conn(ilb_stack_t *ilbs, void *iph, void *tph, int l4, in6_addr_t *src,
749 in_port_t sport, in6_addr_t *dst, in_port_t dport,
750 ilb_rule_info_t *rule_cache, uint32_t *ip_sum, uint32_t *tp_sum,
751 int32_t pkt_len, boolean_t c2s)
753 ilb_conn_hash_t *hash;
754 uint_t i;
755 ilb_conn_t *connp;
756 boolean_t tcp_alive;
757 boolean_t ret = B_FALSE;
759 i = ILB_CONN_HASH((uint8_t *)&src->s6_addr32[3], ntohs(sport),
760 (uint8_t *)&dst->s6_addr32[3], ntohs(dport),
761 ilbs->ilbs_conn_hash_size);
762 if (c2s) {
763 hash = ilbs->ilbs_c2s_conn_hash;
764 mutex_enter(&hash[i].ilb_conn_hash_lock);
765 for (connp = hash[i].ilb_connp; connp != NULL;
766 connp = connp->conn_c2s_next) {
767 if (connp->conn_l4 == l4 &&
768 connp->conn_c2s_dport == dport &&
769 connp->conn_c2s_sport == sport &&
770 IN6_ARE_ADDR_EQUAL(src, &connp->conn_c2s_saddr) &&
771 IN6_ARE_ADDR_EQUAL(dst, &connp->conn_c2s_daddr)) {
772 connp->conn_c2s_atime = ddi_get_lbolt64();
773 connp->conn_c2s_pkt_cnt++;
774 *rule_cache = connp->conn_rule_cache;
775 *ip_sum = connp->conn_c2s_ip_sum;
776 *tp_sum = connp->conn_c2s_tp_sum;
777 ret = B_TRUE;
778 break;
781 } else {
782 hash = ilbs->ilbs_s2c_conn_hash;
783 mutex_enter(&hash[i].ilb_conn_hash_lock);
784 for (connp = hash[i].ilb_connp; connp != NULL;
785 connp = connp->conn_s2c_next) {
786 if (connp->conn_l4 == l4 &&
787 connp->conn_s2c_dport == dport &&
788 connp->conn_s2c_sport == sport &&
789 IN6_ARE_ADDR_EQUAL(src, &connp->conn_s2c_saddr) &&
790 IN6_ARE_ADDR_EQUAL(dst, &connp->conn_s2c_daddr)) {
791 connp->conn_s2c_atime = ddi_get_lbolt64();
792 connp->conn_s2c_pkt_cnt++;
793 *rule_cache = connp->conn_rule_cache;
794 *ip_sum = connp->conn_s2c_ip_sum;
795 *tp_sum = connp->conn_s2c_tp_sum;
796 ret = B_TRUE;
797 break;
801 if (ret) {
802 ILB_S_KSTAT(connp->conn_server, pkt_processed);
803 ILB_S_KSTAT_UPDATE(connp->conn_server, bytes_processed,
804 pkt_len);
806 switch (l4) {
807 case (IPPROTO_TCP):
808 tcp_alive = update_conn_tcp(connp, iph, tph, pkt_len,
809 c2s);
810 if (!tcp_alive) {
811 connp->conn_gc = B_TRUE;
813 break;
814 default:
815 break;
818 mutex_exit(&hash[i].ilb_conn_hash_lock);
820 return (ret);
824 * To check if a give packet matches an existing conn hash entry. If it
825 * does, return the information about this entry so that the caller can
826 * do the proper NAT.
828 boolean_t
829 ilb_check_conn(ilb_stack_t *ilbs, int l3, void *iph, int l4, void *tph,
830 in6_addr_t *src, in6_addr_t *dst, in_port_t sport, in_port_t dport,
831 uint32_t pkt_len, in6_addr_t *lb_dst)
833 ilb_rule_info_t rule_cache;
834 uint32_t adj_ip_sum, adj_tp_sum;
835 boolean_t ret;
837 /* Check the incoming hash table. */
838 if (ilb_find_conn(ilbs, iph, tph, l4, src, sport, dst, dport,
839 &rule_cache, &adj_ip_sum, &adj_tp_sum, pkt_len, B_TRUE)) {
840 switch (rule_cache.topo) {
841 case ILB_TOPO_IMPL_NAT:
842 *lb_dst = rule_cache.info.nat_dst;
843 ilb_full_nat(l3, iph, l4, tph, &rule_cache.info,
844 adj_ip_sum, adj_tp_sum, B_TRUE);
845 ret = B_TRUE;
846 break;
847 case ILB_TOPO_IMPL_HALF_NAT:
848 *lb_dst = rule_cache.info.nat_dst;
849 ilb_half_nat(l3, iph, l4, tph, &rule_cache.info,
850 adj_ip_sum, adj_tp_sum, B_TRUE);
851 ret = B_TRUE;
852 break;
853 default:
854 ret = B_FALSE;
855 break;
857 return (ret);
859 if (ilb_find_conn(ilbs, iph, tph, l4, src, sport, dst, dport,
860 &rule_cache, &adj_ip_sum, &adj_tp_sum, pkt_len, B_FALSE)) {
861 switch (rule_cache.topo) {
862 case ILB_TOPO_IMPL_NAT:
863 *lb_dst = rule_cache.info.src;
864 ilb_full_nat(l3, iph, l4, tph, &rule_cache.info,
865 adj_ip_sum, adj_tp_sum, B_FALSE);
866 ret = B_TRUE;
867 break;
868 case ILB_TOPO_IMPL_HALF_NAT:
869 *lb_dst = *dst;
870 ilb_half_nat(l3, iph, l4, tph, &rule_cache.info,
871 adj_ip_sum, adj_tp_sum, B_FALSE);
872 ret = B_TRUE;
873 break;
874 default:
875 ret = B_FALSE;
876 break;
878 return (ret);
881 return (B_FALSE);
885 * To check if an ICMP packet belongs to a connection in one of the conn
886 * hash entries.
888 boolean_t
889 ilb_check_icmp_conn(ilb_stack_t *ilbs, mblk_t *mp, int l3, void *out_iph,
890 void *icmph, in6_addr_t *lb_dst)
892 ilb_conn_hash_t *hash;
893 ipha_t *in_iph4;
894 ip6_t *in_iph6;
895 icmph_t *icmph4;
896 icmp6_t *icmph6;
897 in6_addr_t *in_src_p, *in_dst_p;
898 in_port_t *sport, *dport;
899 int l4;
900 uint_t i;
901 ilb_conn_t *connp;
902 ilb_rule_info_t rule_cache;
903 uint32_t adj_ip_sum;
904 boolean_t full_nat;
906 if (l3 == IPPROTO_IP) {
907 in6_addr_t in_src, in_dst;
909 icmph4 = (icmph_t *)icmph;
910 in_iph4 = (ipha_t *)&icmph4[1];
912 if ((uint8_t *)in_iph4 + IPH_HDR_LENGTH(in_iph4) +
913 ICMP_MIN_TP_HDR_LEN > mp->b_wptr) {
914 return (B_FALSE);
917 IN6_IPADDR_TO_V4MAPPED(in_iph4->ipha_src, &in_src);
918 in_src_p = &in_src;
919 IN6_IPADDR_TO_V4MAPPED(in_iph4->ipha_dst, &in_dst);
920 in_dst_p = &in_dst;
922 l4 = in_iph4->ipha_protocol;
923 if (l4 != IPPROTO_TCP && l4 != IPPROTO_UDP)
924 return (B_FALSE);
926 sport = (in_port_t *)((char *)in_iph4 +
927 IPH_HDR_LENGTH(in_iph4));
928 dport = sport + 1;
930 DTRACE_PROBE4(ilb__chk__icmp__conn__v4, uint32_t,
931 in_iph4->ipha_src, uint32_t, in_iph4->ipha_dst, uint16_t,
932 ntohs(*sport), uint16_t, ntohs(*dport));
933 } else {
934 ASSERT(l3 == IPPROTO_IPV6);
936 icmph6 = (icmp6_t *)icmph;
937 in_iph6 = (ip6_t *)&icmph6[1];
938 in_src_p = &in_iph6->ip6_src;
939 in_dst_p = &in_iph6->ip6_dst;
941 if ((uint8_t *)in_iph6 + sizeof (ip6_t) +
942 ICMP_MIN_TP_HDR_LEN > mp->b_wptr) {
943 return (B_FALSE);
946 l4 = in_iph6->ip6_nxt;
947 /* We don't go deep inside an IPv6 packet yet. */
948 if (l4 != IPPROTO_TCP && l4 != IPPROTO_UDP)
949 return (B_FALSE);
951 sport = (in_port_t *)&in_iph6[1];
952 dport = sport + 1;
954 DTRACE_PROBE4(ilb__chk__icmp__conn__v6, in6_addr_t *,
955 &in_iph6->ip6_src, in6_addr_t *, &in_iph6->ip6_dst,
956 uint16_t, ntohs(*sport), uint16_t, ntohs(*dport));
959 i = ILB_CONN_HASH((uint8_t *)&in_dst_p->s6_addr32[3], ntohs(*dport),
960 (uint8_t *)&in_src_p->s6_addr32[3], ntohs(*sport),
961 ilbs->ilbs_conn_hash_size);
962 hash = ilbs->ilbs_c2s_conn_hash;
964 mutex_enter(&hash[i].ilb_conn_hash_lock);
965 for (connp = hash[i].ilb_connp; connp != NULL;
966 connp = connp->conn_c2s_next) {
967 if (connp->conn_l4 == l4 &&
968 connp->conn_c2s_dport == *sport &&
969 connp->conn_c2s_sport == *dport &&
970 IN6_ARE_ADDR_EQUAL(in_dst_p, &connp->conn_c2s_saddr) &&
971 IN6_ARE_ADDR_EQUAL(in_src_p, &connp->conn_c2s_daddr)) {
972 connp->conn_c2s_atime = ddi_get_lbolt64();
973 connp->conn_c2s_pkt_cnt++;
974 rule_cache = connp->conn_rule_cache;
975 adj_ip_sum = connp->conn_c2s_ip_sum;
976 break;
979 mutex_exit(&hash[i].ilb_conn_hash_lock);
981 if (connp == NULL) {
982 DTRACE_PROBE(ilb__chk__icmp__conn__failed);
983 return (B_FALSE);
986 switch (rule_cache.topo) {
987 case ILB_TOPO_IMPL_NAT:
988 full_nat = B_TRUE;
989 break;
990 case ILB_TOPO_IMPL_HALF_NAT:
991 full_nat = B_FALSE;
992 break;
993 default:
994 return (B_FALSE);
997 *lb_dst = rule_cache.info.nat_dst;
998 if (l3 == IPPROTO_IP) {
999 ilb_nat_icmpv4(mp, out_iph, icmph4, in_iph4, sport, dport,
1000 &rule_cache.info, adj_ip_sum, full_nat);
1001 } else {
1002 ilb_nat_icmpv6(mp, out_iph, icmph6, in_iph6, sport, dport,
1003 &rule_cache.info, full_nat);
1005 return (B_TRUE);
1009 * This routine sends up the conn hash table to user land. Note that the
1010 * request is an ioctl, hence we cannot really differentiate requests
1011 * from different clients. There is no context shared between different
1012 * ioctls. Here we make the assumption that the user land ilbd will
1013 * only allow one client to show the conn hash table at any time.
1014 * Otherwise, the results will be "very" inconsistent.
1016 * In each ioctl, a flag (ILB_LIST_BEGIN) indicates whether the client wants
1017 * to read from the beginning of the able. After a certain entries
1018 * are reported, the kernel remembers the position of the last returned
1019 * entry. When the next ioctl comes in with the ILB_LIST_BEGIN flag,
1020 * it will return entries starting from where it was left off. When
1021 * the end of table is reached, a flag (ILB_LIST_END) is set to tell
1022 * the client that there is no more entry.
1024 * It is assumed that the caller has checked the size of nat so that it
1025 * can hold num entries.
1027 /* ARGSUSED */
1029 ilb_list_nat(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_nat_entry_t *nat,
1030 uint32_t *num, uint32_t *flags)
1032 ilb_conn_hash_t *hash;
1033 ilb_conn_t *cur_connp;
1034 uint32_t i, j;
1035 int ret = 0;
1037 mutex_enter(&ilbs->ilbs_conn_list_lock);
1038 while (ilbs->ilbs_conn_list_busy) {
1039 if (cv_wait_sig(&ilbs->ilbs_conn_list_cv,
1040 &ilbs->ilbs_conn_list_lock) == 0) {
1041 mutex_exit(&ilbs->ilbs_conn_list_lock);
1042 return (EINTR);
1045 if ((hash = ilbs->ilbs_c2s_conn_hash) == NULL) {
1046 ASSERT(ilbs->ilbs_s2c_conn_hash == NULL);
1047 mutex_exit(&ilbs->ilbs_conn_list_lock);
1048 *num = 0;
1049 *flags |= ILB_LIST_END;
1050 return (0);
1052 ilbs->ilbs_conn_list_busy = B_TRUE;
1053 mutex_exit(&ilbs->ilbs_conn_list_lock);
1055 if (*flags & ILB_LIST_BEGIN) {
1056 i = 0;
1057 mutex_enter(&hash[0].ilb_conn_hash_lock);
1058 cur_connp = hash[0].ilb_connp;
1059 } else if (*flags & ILB_LIST_CONT) {
1060 if (ilbs->ilbs_conn_list_cur == ilbs->ilbs_conn_hash_size) {
1061 *num = 0;
1062 *flags |= ILB_LIST_END;
1063 goto done;
1065 i = ilbs->ilbs_conn_list_cur;
1066 mutex_enter(&hash[i].ilb_conn_hash_lock);
1067 cur_connp = ilbs->ilbs_conn_list_connp;
1068 } else {
1069 ret = EINVAL;
1070 goto done;
1073 j = 0;
1074 while (j < *num) {
1075 if (cur_connp == NULL) {
1076 mutex_exit(&hash[i].ilb_conn_hash_lock);
1077 if (++i == ilbs->ilbs_conn_hash_size) {
1078 *flags |= ILB_LIST_END;
1079 break;
1081 mutex_enter(&hash[i].ilb_conn_hash_lock);
1082 cur_connp = hash[i].ilb_connp;
1083 continue;
1085 nat[j].proto = cur_connp->conn_l4;
1087 nat[j].in_global = cur_connp->conn_c2s_daddr;
1088 nat[j].in_global_port = cur_connp->conn_c2s_dport;
1089 nat[j].out_global = cur_connp->conn_c2s_saddr;
1090 nat[j].out_global_port = cur_connp->conn_c2s_sport;
1092 nat[j].in_local = cur_connp->conn_s2c_saddr;
1093 nat[j].in_local_port = cur_connp->conn_s2c_sport;
1094 nat[j].out_local = cur_connp->conn_s2c_daddr;
1095 nat[j].out_local_port = cur_connp->conn_s2c_dport;
1097 nat[j].create_time = TICK_TO_MSEC(cur_connp->conn_cr_time);
1098 nat[j].last_access_time =
1099 TICK_TO_MSEC(cur_connp->conn_c2s_atime);
1102 * The conn_s2c_pkt_cnt may not be accurate since we are not
1103 * holding the s2c hash lock.
1105 nat[j].pkt_cnt = cur_connp->conn_c2s_pkt_cnt +
1106 cur_connp->conn_s2c_pkt_cnt;
1107 j++;
1109 cur_connp = cur_connp->conn_c2s_next;
1111 ilbs->ilbs_conn_list_connp = cur_connp;
1112 if (j == *num)
1113 mutex_exit(&hash[i].ilb_conn_hash_lock);
1115 ilbs->ilbs_conn_list_cur = i;
1117 *num = j;
1118 done:
1119 mutex_enter(&ilbs->ilbs_conn_list_lock);
1120 ilbs->ilbs_conn_list_busy = B_FALSE;
1121 cv_signal(&ilbs->ilbs_conn_list_cv);
1122 mutex_exit(&ilbs->ilbs_conn_list_lock);
1124 return (ret);
1129 * Stickiness (persistence) handling routines.
1133 static void
1134 ilb_sticky_cache_init(void)
1136 ilb_sticky_cache = kmem_cache_create("ilb_sticky_cache",
1137 sizeof (ilb_sticky_t), 0, NULL, NULL, NULL, NULL, NULL,
1138 ilb_kmem_flags);
1141 void
1142 ilb_sticky_cache_fini(void)
1144 if (ilb_sticky_cache != NULL) {
1145 kmem_cache_destroy(ilb_sticky_cache);
1146 ilb_sticky_cache = NULL;
1150 void
1151 ilb_sticky_refrele(ilb_sticky_t *s)
1153 ILB_STICKY_REFRELE(s);
1156 static ilb_sticky_t *
1157 ilb_sticky_lookup(ilb_sticky_hash_t *hash, ilb_rule_t *rule, in6_addr_t *src)
1159 ilb_sticky_t *s;
1161 ASSERT(mutex_owned(&hash->sticky_lock));
1163 for (s = list_head(&hash->sticky_head); s != NULL;
1164 s = list_next(&hash->sticky_head, s)) {
1165 if (s->rule_instance == rule->ir_ks_instance) {
1166 if (IN6_ARE_ADDR_EQUAL(src, &s->src))
1167 return (s);
1170 return (NULL);
1173 static ilb_sticky_t *
1174 ilb_sticky_add(ilb_sticky_hash_t *hash, ilb_rule_t *rule, ilb_server_t *server,
1175 in6_addr_t *src)
1177 ilb_sticky_t *s;
1179 ASSERT(mutex_owned(&hash->sticky_lock));
1181 if ((s = kmem_cache_alloc(ilb_sticky_cache, KM_NOSLEEP)) == NULL)
1182 return (NULL);
1185 * The rule instance is for handling the scenario when the same
1186 * client talks to different rules at the same time. Stickiness
1187 * is per rule so we can use the rule instance to differentiate
1188 * the client's request.
1190 s->rule_instance = rule->ir_ks_instance;
1192 * Copy the rule name for listing all sticky cache entry. ir_name
1193 * is guaranteed to be NULL terminated.
1195 (void) strcpy(s->rule_name, rule->ir_name);
1196 s->server = server;
1199 * Grab a ref cnt on the server so that it won't go away while
1200 * it is still in the sticky table.
1202 ILB_SERVER_REFHOLD(server);
1203 s->src = *src;
1204 s->expiry = rule->ir_sticky_expiry;
1205 s->refcnt = 1;
1206 s->hash = hash;
1209 * There is no need to set atime here since the refcnt is not
1210 * zero. A sticky entry is removed only when the refcnt is
1211 * zero. But just set it here for debugging purpose. The
1212 * atime is set when a refrele is done on a sticky entry.
1214 s->atime = ddi_get_lbolt64();
1216 list_insert_head(&hash->sticky_head, s);
1217 hash->sticky_cnt++;
1218 return (s);
1222 * This routine checks if there is an existing sticky entry which matches
1223 * a given packet. If there is one, return it. If there is not, create
1224 * a sticky entry using the packet's info.
1226 ilb_server_t *
1227 ilb_sticky_find_add(ilb_stack_t *ilbs, ilb_rule_t *rule, in6_addr_t *src,
1228 ilb_server_t *server, ilb_sticky_t **res, uint16_t *src_ent_idx)
1230 int i;
1231 ilb_sticky_hash_t *hash;
1232 ilb_sticky_t *s;
1234 ASSERT(server != NULL);
1236 *res = NULL;
1238 i = ILB_STICKY_HASH((uint8_t *)&src->s6_addr32[3],
1239 (uint32_t)(uintptr_t)rule, ilbs->ilbs_sticky_hash_size);
1240 hash = &ilbs->ilbs_sticky_hash[i];
1242 /* First check if there is already an entry. */
1243 mutex_enter(&hash->sticky_lock);
1244 s = ilb_sticky_lookup(hash, rule, src);
1246 /* No sticky entry, add one. */
1247 if (s == NULL) {
1248 add_new_entry:
1249 s = ilb_sticky_add(hash, rule, server, src);
1250 if (s == NULL) {
1251 mutex_exit(&hash->sticky_lock);
1252 return (NULL);
1255 * Find a source for this server. All subseqent requests from
1256 * the same client matching this sticky entry will use this
1257 * source address in doing NAT. The current algorithm is
1258 * simple, rotate the source address. Note that the
1259 * source address array does not change after it's created, so
1260 * it is OK to just increment the cur index.
1262 if (server->iser_nat_src != NULL) {
1263 /* It is a hint, does not need to be atomic. */
1264 *src_ent_idx = (server->iser_nat_src->cur++ %
1265 server->iser_nat_src->num_src);
1266 s->nat_src_idx = *src_ent_idx;
1268 mutex_exit(&hash->sticky_lock);
1269 *res = s;
1270 return (server);
1274 * We don't hold any lock accessing iser_enabled. Refer to the
1275 * comment in ilb_server_add() about iser_lock.
1277 if (!s->server->iser_enabled) {
1279 * s->server == server can only happen if there is a race in
1280 * toggling the iser_enabled flag (we don't hold a lock doing
1281 * that) so that the load balance algorithm still returns a
1282 * disabled server. In this case, just drop the packet...
1284 if (s->server == server) {
1285 mutex_exit(&hash->sticky_lock);
1286 return (NULL);
1290 * The old server is disabled and there is a new server, use
1291 * the new one to create a sticky entry. Since we will
1292 * add the entry at the beginning, subsequent lookup will
1293 * find this new entry instead of the old one.
1295 goto add_new_entry;
1298 s->refcnt++;
1299 *res = s;
1300 mutex_exit(&hash->sticky_lock);
1301 if (server->iser_nat_src != NULL)
1302 *src_ent_idx = s->nat_src_idx;
1303 return (s->server);
1306 static void
1307 ilb_sticky_cleanup(void *arg)
1309 ilb_timer_t *timer = (ilb_timer_t *)arg;
1310 uint32_t i;
1311 ilb_stack_t *ilbs;
1312 ilb_sticky_hash_t *hash;
1313 ilb_sticky_t *s, *nxt_s;
1314 int64_t now, expiry;
1316 ilbs = timer->ilbs;
1317 hash = ilbs->ilbs_sticky_hash;
1318 ASSERT(hash != NULL);
1320 now = ddi_get_lbolt64();
1321 for (i = timer->start; i < timer->end; i++) {
1322 mutex_enter(&hash[i].sticky_lock);
1323 for (s = list_head(&hash[i].sticky_head); s != NULL;
1324 s = nxt_s) {
1325 nxt_s = list_next(&hash[i].sticky_head, s);
1326 if (s->refcnt != 0)
1327 continue;
1328 expiry = now - SEC_TO_TICK(s->expiry);
1329 if (s->atime < expiry) {
1330 ILB_SERVER_REFRELE(s->server);
1331 list_remove(&hash[i].sticky_head, s);
1332 kmem_cache_free(ilb_sticky_cache, s);
1333 hash[i].sticky_cnt--;
1336 mutex_exit(&hash[i].sticky_lock);
1340 static void
1341 ilb_sticky_timer(void *arg)
1343 ilb_timer_t *timer = (ilb_timer_t *)arg;
1345 (void) taskq_dispatch(timer->ilbs->ilbs_sticky_taskq,
1346 ilb_sticky_cleanup, arg, TQ_SLEEP);
1347 mutex_enter(&timer->tid_lock);
1348 if (timer->tid == 0) {
1349 mutex_exit(&timer->tid_lock);
1350 } else {
1351 timer->tid = timeout(ilb_sticky_timer, arg,
1352 SEC_TO_TICK(ilb_sticky_timeout));
1353 mutex_exit(&timer->tid_lock);
1357 void
1358 ilb_sticky_hash_init(ilb_stack_t *ilbs)
1360 extern pri_t minclsyspri;
1361 int i, part;
1362 char tq_name[TASKQ_NAMELEN];
1363 ilb_timer_t *tm;
1365 if (!ISP2(ilbs->ilbs_sticky_hash_size)) {
1366 for (i = 0; i < 31; i++) {
1367 if (ilbs->ilbs_sticky_hash_size < (1 << i))
1368 break;
1370 ilbs->ilbs_sticky_hash_size = 1 << i;
1373 ilbs->ilbs_sticky_hash = kmem_zalloc(sizeof (ilb_sticky_hash_t) *
1374 ilbs->ilbs_sticky_hash_size, KM_SLEEP);
1375 for (i = 0; i < ilbs->ilbs_sticky_hash_size; i++) {
1376 mutex_init(&ilbs->ilbs_sticky_hash[i].sticky_lock, NULL,
1377 MUTEX_DEFAULT, NULL);
1378 list_create(&ilbs->ilbs_sticky_hash[i].sticky_head,
1379 sizeof (ilb_sticky_t),
1380 offsetof(ilb_sticky_t, list));
1383 if (ilb_sticky_cache == NULL)
1384 ilb_sticky_cache_init();
1386 (void) snprintf(tq_name, sizeof (tq_name), "ilb_sticky_taskq_%p",
1387 (void *)ilbs->ilbs_netstack);
1388 ASSERT(ilbs->ilbs_sticky_taskq == NULL);
1389 ilbs->ilbs_sticky_taskq = taskq_create(tq_name,
1390 ilb_sticky_timer_size * 2, minclsyspri, ilb_sticky_timer_size,
1391 ilb_sticky_timer_size * 2, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
1393 ASSERT(ilbs->ilbs_sticky_timer_list == NULL);
1394 ilbs->ilbs_sticky_timer_list = kmem_zalloc(sizeof (ilb_timer_t) *
1395 ilb_sticky_timer_size, KM_SLEEP);
1396 part = ilbs->ilbs_sticky_hash_size / ilb_sticky_timer_size + 1;
1397 for (i = 0; i < ilb_sticky_timer_size; i++) {
1398 tm = ilbs->ilbs_sticky_timer_list + i;
1399 tm->start = i * part;
1400 tm->end = i * part + part;
1401 if (tm->end > ilbs->ilbs_sticky_hash_size)
1402 tm->end = ilbs->ilbs_sticky_hash_size;
1403 tm->ilbs = ilbs;
1404 mutex_init(&tm->tid_lock, NULL, MUTEX_DEFAULT, NULL);
1405 /* Spread out the starting execution time of all the timers. */
1406 tm->tid = timeout(ilb_sticky_timer, tm,
1407 SEC_TO_TICK(ilb_sticky_timeout + i));
1411 void
1412 ilb_sticky_hash_fini(ilb_stack_t *ilbs)
1414 int i;
1415 ilb_sticky_t *s;
1417 if (ilbs->ilbs_sticky_hash == NULL)
1418 return;
1420 /* Stop all the timers first. */
1421 for (i = 0; i < ilb_sticky_timer_size; i++) {
1422 timeout_id_t tid;
1424 /* Setting tid to 0 tells the timer handler not to restart. */
1425 mutex_enter(&ilbs->ilbs_sticky_timer_list[i].tid_lock);
1426 tid = ilbs->ilbs_sticky_timer_list[i].tid;
1427 ilbs->ilbs_sticky_timer_list[i].tid = 0;
1428 mutex_exit(&ilbs->ilbs_sticky_timer_list[i].tid_lock);
1429 (void) untimeout(tid);
1431 kmem_free(ilbs->ilbs_sticky_timer_list, sizeof (ilb_timer_t) *
1432 ilb_sticky_timer_size);
1433 taskq_destroy(ilbs->ilbs_sticky_taskq);
1434 ilbs->ilbs_sticky_taskq = NULL;
1436 for (i = 0; i < ilbs->ilbs_sticky_hash_size; i++) {
1437 while ((s = list_head(&ilbs->ilbs_sticky_hash[i].sticky_head))
1438 != NULL) {
1439 list_remove(&ilbs->ilbs_sticky_hash[i].sticky_head, s);
1440 ILB_SERVER_REFRELE(s->server);
1441 kmem_free(s, sizeof (ilb_sticky_t));
1444 kmem_free(ilbs->ilbs_sticky_hash, ilbs->ilbs_sticky_hash_size *
1445 sizeof (ilb_sticky_hash_t));
1449 * This routine sends up the sticky hash table to user land. Refer to
1450 * the comments before ilb_list_nat(). Both routines assume similar
1451 * conditions.
1453 * It is assumed that the caller has checked the size of st so that it
1454 * can hold num entries.
1456 /* ARGSUSED */
1458 ilb_list_sticky(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_sticky_entry_t *st,
1459 uint32_t *num, uint32_t *flags)
1461 ilb_sticky_hash_t *hash;
1462 ilb_sticky_t *curp;
1463 uint32_t i, j;
1464 int ret = 0;
1466 mutex_enter(&ilbs->ilbs_sticky_list_lock);
1467 while (ilbs->ilbs_sticky_list_busy) {
1468 if (cv_wait_sig(&ilbs->ilbs_sticky_list_cv,
1469 &ilbs->ilbs_sticky_list_lock) == 0) {
1470 mutex_exit(&ilbs->ilbs_sticky_list_lock);
1471 return (EINTR);
1474 if ((hash = ilbs->ilbs_sticky_hash) == NULL) {
1475 mutex_exit(&ilbs->ilbs_sticky_list_lock);
1476 *num = 0;
1477 *flags |= ILB_LIST_END;
1478 return (0);
1480 ilbs->ilbs_sticky_list_busy = B_TRUE;
1481 mutex_exit(&ilbs->ilbs_sticky_list_lock);
1483 if (*flags & ILB_LIST_BEGIN) {
1484 i = 0;
1485 mutex_enter(&hash[0].sticky_lock);
1486 curp = list_head(&hash[0].sticky_head);
1487 } else if (*flags & ILB_LIST_CONT) {
1488 if (ilbs->ilbs_sticky_list_cur == ilbs->ilbs_sticky_hash_size) {
1489 *num = 0;
1490 *flags |= ILB_LIST_END;
1491 goto done;
1493 i = ilbs->ilbs_sticky_list_cur;
1494 mutex_enter(&hash[i].sticky_lock);
1495 curp = ilbs->ilbs_sticky_list_curp;
1496 } else {
1497 ret = EINVAL;
1498 goto done;
1501 j = 0;
1502 while (j < *num) {
1503 if (curp == NULL) {
1504 mutex_exit(&hash[i].sticky_lock);
1505 if (++i == ilbs->ilbs_sticky_hash_size) {
1506 *flags |= ILB_LIST_END;
1507 break;
1509 mutex_enter(&hash[i].sticky_lock);
1510 curp = list_head(&hash[i].sticky_head);
1511 continue;
1513 (void) strcpy(st[j].rule_name, curp->rule_name);
1514 st[j].req_addr = curp->src;
1515 st[j].srv_addr = curp->server->iser_addr_v6;
1516 st[j].expiry_time = TICK_TO_MSEC(curp->expiry);
1517 j++;
1518 curp = list_next(&hash[i].sticky_head, curp);
1520 ilbs->ilbs_sticky_list_curp = curp;
1521 if (j == *num)
1522 mutex_exit(&hash[i].sticky_lock);
1524 ilbs->ilbs_sticky_list_cur = i;
1526 *num = j;
1527 done:
1528 mutex_enter(&ilbs->ilbs_sticky_list_lock);
1529 ilbs->ilbs_sticky_list_busy = B_FALSE;
1530 cv_signal(&ilbs->ilbs_sticky_list_cv);
1531 mutex_exit(&ilbs->ilbs_sticky_list_lock);
1533 return (ret);