4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 * Copyright 2014 Joyent, Inc. All rights reserved.
28 #include <sys/sysmacros.h>
29 #include <sys/types.h>
32 #include <sys/taskq.h>
33 #include <sys/cmn_err.h>
35 #include <sys/atomic.h>
36 #include <netinet/in.h>
40 #include <inet/udp_impl.h>
43 #include "ilb_stack.h"
49 * Timer struct for ilb_conn_t and ilb_sticky_t garbage collection
51 * start: starting index into the hash table to do gc
52 * end: ending index into the hash table to do gc
53 * ilbs: pointer to the ilb_stack_t of the IP stack
54 * tid_lock: mutex to protect the timer id.
55 * tid: timer id of the timer
57 typedef struct ilb_timer_s
{
65 /* Hash macro for finding the index to the conn hash table */
66 #define ILB_CONN_HASH(saddr, sport, daddr, dport, hash_size) \
67 (((*((saddr) + 3) ^ *((daddr) + 3)) * 50653 + \
68 (*((saddr) + 2) ^ *((daddr) + 2)) * 1369 + \
69 (*((saddr) + 1) ^ *((daddr) + 1)) * 37 + \
70 (*(saddr) ^ *(daddr)) + (sport) * 37 + (dport)) & \
73 /* Kmem cache for the conn hash entry */
74 static struct kmem_cache
*ilb_conn_cache
= NULL
;
77 * There are 60 timers running to do conn cache garbage collection. Each
78 * gc thread is responsible for 1/60 of the conn hash table.
80 static int ilb_conn_timer_size
= 60;
82 /* Each of the above gc timers wakes up every 15s to do the gc. */
83 static int ilb_conn_cache_timeout
= 15;
85 #define ILB_STICKY_HASH(saddr, rule, hash_size) \
86 (((*((saddr) + 3) ^ ((rule) >> 24)) * 29791 + \
87 (*((saddr) + 2) ^ ((rule) >> 16)) * 961 + \
88 (*((saddr) + 1) ^ ((rule) >> 8)) * 31 + \
89 (*(saddr) ^ (rule))) & ((hash_size) - 1))
91 static struct kmem_cache
*ilb_sticky_cache
= NULL
;
94 * There are 60 timers running to do sticky cache garbage collection. Each
95 * gc thread is responsible for 1/60 of the sticky hash table.
97 static int ilb_sticky_timer_size
= 60;
99 /* Each of the above gc timers wakes up every 15s to do the gc. */
100 static int ilb_sticky_timeout
= 15;
102 #define ILB_STICKY_REFRELE(s) \
104 mutex_enter(&(s)->hash->sticky_lock); \
106 (s)->atime = ddi_get_lbolt64(); \
107 mutex_exit(&s->hash->sticky_lock); \
112 ilb_conn_cache_init(void)
114 ilb_conn_cache
= kmem_cache_create("ilb_conn_cache",
115 sizeof (ilb_conn_t
), 0, NULL
, NULL
, NULL
, NULL
, NULL
,
120 ilb_conn_cache_fini(void)
122 if (ilb_conn_cache
!= NULL
) {
123 kmem_cache_destroy(ilb_conn_cache
);
124 ilb_conn_cache
= NULL
;
129 ilb_conn_remove_common(ilb_conn_t
*connp
, boolean_t c2s
)
131 ilb_conn_hash_t
*hash
;
132 ilb_conn_t
**next
, **prev
;
133 ilb_conn_t
**next_prev
, **prev_next
;
136 hash
= connp
->conn_c2s_hash
;
137 ASSERT(MUTEX_HELD(&hash
->ilb_conn_hash_lock
));
138 next
= &connp
->conn_c2s_next
;
139 prev
= &connp
->conn_c2s_prev
;
141 next_prev
= &(*next
)->conn_c2s_prev
;
143 prev_next
= &(*prev
)->conn_c2s_next
;
145 hash
= connp
->conn_s2c_hash
;
146 ASSERT(MUTEX_HELD(&hash
->ilb_conn_hash_lock
));
147 next
= &connp
->conn_s2c_next
;
148 prev
= &connp
->conn_s2c_prev
;
150 next_prev
= &(*next
)->conn_s2c_prev
;
152 prev_next
= &(*prev
)->conn_s2c_next
;
155 if (hash
->ilb_connp
== connp
) {
156 hash
->ilb_connp
= *next
;
165 ASSERT(hash
->ilb_conn_cnt
> 0);
166 hash
->ilb_conn_cnt
--;
173 ilb_conn_remove(ilb_conn_t
*connp
)
175 ASSERT(MUTEX_HELD(&connp
->conn_c2s_hash
->ilb_conn_hash_lock
));
176 ilb_conn_remove_common(connp
, B_TRUE
);
177 ASSERT(MUTEX_HELD(&connp
->conn_s2c_hash
->ilb_conn_hash_lock
));
178 ilb_conn_remove_common(connp
, B_FALSE
);
180 if (connp
->conn_rule_cache
.topo
== ILB_TOPO_IMPL_NAT
) {
183 port
= ntohs(connp
->conn_rule_cache
.info
.nat_sport
);
184 vmem_free(connp
->conn_rule_cache
.info
.src_ent
->nse_port_arena
,
185 (void *)(uintptr_t)port
, 1);
188 if (connp
->conn_sticky
!= NULL
)
189 ILB_STICKY_REFRELE(connp
->conn_sticky
);
190 ILB_SERVER_REFRELE(connp
->conn_server
);
191 kmem_cache_free(ilb_conn_cache
, connp
);
195 * Routine to do periodic garbage collection of conn hash entries. When
196 * a conn hash timer fires, it dispatches a taskq to call this function
197 * to do the gc. Note that each taskq is responsible for a portion of
198 * the table. The portion is stored in timer->start, timer->end.
201 ilb_conn_cleanup(void *arg
)
203 ilb_timer_t
*timer
= (ilb_timer_t
*)arg
;
206 ilb_conn_hash_t
*c2s_hash
, *s2c_hash
;
207 ilb_conn_t
*connp
, *nxt_connp
;
213 c2s_hash
= ilbs
->ilbs_c2s_conn_hash
;
214 ASSERT(c2s_hash
!= NULL
);
216 now
= ddi_get_lbolt64();
217 for (i
= timer
->start
; i
< timer
->end
; i
++) {
218 mutex_enter(&c2s_hash
[i
].ilb_conn_hash_lock
);
219 if ((connp
= c2s_hash
[i
].ilb_connp
) == NULL
) {
220 ASSERT(c2s_hash
[i
].ilb_conn_cnt
== 0);
221 mutex_exit(&c2s_hash
[i
].ilb_conn_hash_lock
);
225 ASSERT(c2s_hash
[i
].ilb_conn_cnt
> 0);
226 ASSERT(connp
->conn_c2s_hash
== &c2s_hash
[i
]);
227 nxt_connp
= connp
->conn_c2s_next
;
228 expiry
= now
- SEC_TO_TICK(connp
->conn_expiry
);
229 if (connp
->conn_server
->iser_die_time
!= 0 &&
230 connp
->conn_server
->iser_die_time
< now
)
234 s2c_hash
= connp
->conn_s2c_hash
;
235 mutex_enter(&s2c_hash
->ilb_conn_hash_lock
);
237 if (connp
->conn_gc
|| die_now
||
238 (connp
->conn_c2s_atime
< expiry
&&
239 connp
->conn_s2c_atime
< expiry
)) {
240 /* Need to update the nat list cur_connp */
241 if (connp
== ilbs
->ilbs_conn_list_connp
) {
242 ilbs
->ilbs_conn_list_connp
=
243 connp
->conn_c2s_next
;
245 ilb_conn_remove(connp
);
249 if (connp
->conn_l4
!= IPPROTO_TCP
)
252 /* Update and check TCP related conn info */
253 if (connp
->conn_c2s_tcp_fin_sent
&&
254 SEQ_GT(connp
->conn_s2c_tcp_ack
,
255 connp
->conn_c2s_tcp_fss
)) {
256 connp
->conn_c2s_tcp_fin_acked
= B_TRUE
;
258 if (connp
->conn_s2c_tcp_fin_sent
&&
259 SEQ_GT(connp
->conn_c2s_tcp_ack
,
260 connp
->conn_s2c_tcp_fss
)) {
261 connp
->conn_s2c_tcp_fin_acked
= B_TRUE
;
263 if (connp
->conn_c2s_tcp_fin_acked
&&
264 connp
->conn_s2c_tcp_fin_acked
) {
265 ilb_conn_remove(connp
);
268 mutex_exit(&s2c_hash
->ilb_conn_hash_lock
);
270 } while (connp
!= NULL
);
271 mutex_exit(&c2s_hash
[i
].ilb_conn_hash_lock
);
275 /* Conn hash timer routine. It dispatches a taskq and restarts the timer */
277 ilb_conn_timer(void *arg
)
279 ilb_timer_t
*timer
= (ilb_timer_t
*)arg
;
281 (void) taskq_dispatch(timer
->ilbs
->ilbs_conn_taskq
, ilb_conn_cleanup
,
283 mutex_enter(&timer
->tid_lock
);
284 if (timer
->tid
== 0) {
285 mutex_exit(&timer
->tid_lock
);
287 timer
->tid
= timeout(ilb_conn_timer
, arg
,
288 SEC_TO_TICK(ilb_conn_cache_timeout
));
289 mutex_exit(&timer
->tid_lock
);
294 ilb_conn_hash_init(ilb_stack_t
*ilbs
)
296 extern pri_t minclsyspri
;
299 char tq_name
[TASKQ_NAMELEN
];
302 * If ilbs->ilbs_conn_hash_size is not a power of 2, bump it up to
303 * the next power of 2.
305 if (!ISP2(ilbs
->ilbs_conn_hash_size
)) {
306 for (i
= 0; i
< 31; i
++) {
307 if (ilbs
->ilbs_conn_hash_size
< (1 << i
))
310 ilbs
->ilbs_conn_hash_size
= 1 << i
;
314 * Can sleep since this should be called when a rule is being added,
315 * hence we are not in interrupt context.
317 ilbs
->ilbs_c2s_conn_hash
= kmem_zalloc(sizeof (ilb_conn_hash_t
) *
318 ilbs
->ilbs_conn_hash_size
, KM_SLEEP
);
319 ilbs
->ilbs_s2c_conn_hash
= kmem_zalloc(sizeof (ilb_conn_hash_t
) *
320 ilbs
->ilbs_conn_hash_size
, KM_SLEEP
);
322 for (i
= 0; i
< ilbs
->ilbs_conn_hash_size
; i
++) {
323 mutex_init(&ilbs
->ilbs_c2s_conn_hash
[i
].ilb_conn_hash_lock
,
324 NULL
, MUTEX_DEFAULT
, NULL
);
326 for (i
= 0; i
< ilbs
->ilbs_conn_hash_size
; i
++) {
327 mutex_init(&ilbs
->ilbs_s2c_conn_hash
[i
].ilb_conn_hash_lock
,
328 NULL
, MUTEX_DEFAULT
, NULL
);
331 if (ilb_conn_cache
== NULL
)
332 ilb_conn_cache_init();
334 (void) snprintf(tq_name
, sizeof (tq_name
), "ilb_conn_taskq_%p",
335 (void *)ilbs
->ilbs_netstack
);
336 ASSERT(ilbs
->ilbs_conn_taskq
== NULL
);
337 ilbs
->ilbs_conn_taskq
= taskq_create(tq_name
,
338 ilb_conn_timer_size
* 2, minclsyspri
, ilb_conn_timer_size
,
339 ilb_conn_timer_size
* 2, TASKQ_PREPOPULATE
|TASKQ_DYNAMIC
);
341 ASSERT(ilbs
->ilbs_conn_timer_list
== NULL
);
342 ilbs
->ilbs_conn_timer_list
= kmem_zalloc(sizeof (ilb_timer_t
) *
343 ilb_conn_timer_size
, KM_SLEEP
);
346 * The hash table is divided into equal partitions for those timers
347 * to do garbage collection.
349 part
= ilbs
->ilbs_conn_hash_size
/ ilb_conn_timer_size
+ 1;
350 for (i
= 0; i
< ilb_conn_timer_size
; i
++) {
351 tm
= ilbs
->ilbs_conn_timer_list
+ i
;
352 tm
->start
= i
* part
;
353 tm
->end
= i
* part
+ part
;
354 if (tm
->end
> ilbs
->ilbs_conn_hash_size
)
355 tm
->end
= ilbs
->ilbs_conn_hash_size
;
357 mutex_init(&tm
->tid_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
358 /* Spread out the starting execution time of all the timers. */
359 tm
->tid
= timeout(ilb_conn_timer
, tm
,
360 SEC_TO_TICK(ilb_conn_cache_timeout
+ i
));
365 ilb_conn_hash_fini(ilb_stack_t
*ilbs
)
369 ilb_conn_hash_t
*hash
;
371 if (ilbs
->ilbs_c2s_conn_hash
== NULL
) {
372 ASSERT(ilbs
->ilbs_s2c_conn_hash
== NULL
);
376 /* Stop all the timers first. */
377 for (i
= 0; i
< ilb_conn_timer_size
; i
++) {
380 /* Setting tid to 0 tells the timer handler not to restart. */
381 mutex_enter(&ilbs
->ilbs_conn_timer_list
[i
].tid_lock
);
382 tid
= ilbs
->ilbs_conn_timer_list
[i
].tid
;
383 ilbs
->ilbs_conn_timer_list
[i
].tid
= 0;
384 mutex_exit(&ilbs
->ilbs_conn_timer_list
[i
].tid_lock
);
385 (void) untimeout(tid
);
387 kmem_free(ilbs
->ilbs_conn_timer_list
, sizeof (ilb_timer_t
) *
388 ilb_conn_timer_size
);
389 taskq_destroy(ilbs
->ilbs_conn_taskq
);
390 ilbs
->ilbs_conn_taskq
= NULL
;
392 /* Then remove all the conns. */
393 hash
= ilbs
->ilbs_s2c_conn_hash
;
394 for (i
= 0; i
< ilbs
->ilbs_conn_hash_size
; i
++) {
395 while ((connp
= hash
[i
].ilb_connp
) != NULL
) {
396 hash
[i
].ilb_connp
= connp
->conn_s2c_next
;
397 ILB_SERVER_REFRELE(connp
->conn_server
);
398 if (connp
->conn_rule_cache
.topo
== ILB_TOPO_IMPL_NAT
) {
399 ilb_nat_src_entry_t
*ent
;
403 * src_ent will be freed in ilb_nat_src_fini().
406 connp
->conn_rule_cache
.info
.nat_sport
);
407 ent
= connp
->conn_rule_cache
.info
.src_ent
;
408 vmem_free(ent
->nse_port_arena
,
409 (void *)(uintptr_t)port
, 1);
411 kmem_cache_free(ilb_conn_cache
, connp
);
414 kmem_free(ilbs
->ilbs_c2s_conn_hash
, sizeof (ilb_conn_hash_t
) *
415 ilbs
->ilbs_conn_hash_size
);
416 kmem_free(ilbs
->ilbs_s2c_conn_hash
, sizeof (ilb_conn_hash_t
) *
417 ilbs
->ilbs_conn_hash_size
);
421 * Internet checksum adjustment calculation routines. We pre-calculate
422 * checksum adjustment so that we don't need to compute the checksum on
423 * the whole packet when we change address/port in the packet.
427 hnat_cksum_v4(uint16_t *oaddr
, uint16_t *naddr
, in_port_t old_port
,
428 in_port_t new_port
, uint32_t *adj_sum
)
432 sum
= *oaddr
+ *(oaddr
+ 1) + old_port
;
433 while ((sum
>> 16) != 0)
434 sum
= (sum
& 0xffff) + (sum
>> 16);
435 *adj_sum
= (uint16_t)~sum
+ *naddr
+ *(naddr
+ 1) + new_port
;
439 hnat_cksum_v6(uint16_t *oaddr
, uint16_t *naddr
, in_port_t old_port
,
440 in_port_t new_port
, uint32_t *adj_sum
)
444 sum
= *oaddr
+ *(oaddr
+ 1) + *(oaddr
+ 2) + *(oaddr
+ 3) +
445 *(oaddr
+ 4) + *(oaddr
+ 5) + *(oaddr
+ 6) + *(oaddr
+ 7) +
447 while ((sum
>> 16) != 0)
448 sum
= (sum
& 0xffff) + (sum
>> 16);
449 *adj_sum
= (uint16_t)~sum
+ *naddr
+ *(naddr
+ 1) +
450 *(naddr
+ 2) + *(naddr
+ 3) + *(naddr
+ 4) + *(naddr
+ 5) +
451 *(naddr
+ 6) + *(naddr
+ 7) + new_port
;
455 fnat_cksum_v4(uint16_t *oaddr1
, uint16_t *oaddr2
, uint16_t *naddr1
,
456 uint16_t *naddr2
, in_port_t old_port1
, in_port_t old_port2
,
457 in_port_t new_port1
, in_port_t new_port2
, uint32_t *adj_sum
)
461 sum
= *oaddr1
+ *(oaddr1
+ 1) + old_port1
+ *oaddr2
+ *(oaddr2
+ 1) +
463 while ((sum
>> 16) != 0)
464 sum
= (sum
& 0xffff) + (sum
>> 16);
465 *adj_sum
= (uint16_t)~sum
+ *naddr1
+ *(naddr1
+ 1) + new_port1
+
466 *naddr2
+ *(naddr2
+ 1) + new_port2
;
470 fnat_cksum_v6(uint16_t *oaddr1
, uint16_t *oaddr2
, uint16_t *naddr1
,
471 uint16_t *naddr2
, in_port_t old_port1
, in_port_t old_port2
,
472 in_port_t new_port1
, in_port_t new_port2
, uint32_t *adj_sum
)
476 sum
= *oaddr1
+ *(oaddr1
+ 1) + *(oaddr1
+ 2) + *(oaddr1
+ 3) +
477 *(oaddr1
+ 4) + *(oaddr1
+ 5) + *(oaddr1
+ 6) + *(oaddr1
+ 7) +
479 sum
+= *oaddr2
+ *(oaddr2
+ 1) + *(oaddr2
+ 2) + *(oaddr2
+ 3) +
480 *(oaddr2
+ 4) + *(oaddr2
+ 5) + *(oaddr2
+ 6) + *(oaddr2
+ 7) +
482 while ((sum
>> 16) != 0)
483 sum
= (sum
& 0xffff) + (sum
>> 16);
484 sum
= (uint16_t)~sum
+ *naddr1
+ *(naddr1
+ 1) + *(naddr1
+ 2) +
485 *(naddr1
+ 3) + *(naddr1
+ 4) + *(naddr1
+ 5) + *(naddr1
+ 6) +
486 *(naddr1
+ 7) + new_port1
;
487 *adj_sum
= sum
+ *naddr2
+ *(naddr2
+ 1) + *(naddr2
+ 2) +
488 *(naddr2
+ 3) + *(naddr2
+ 4) + *(naddr2
+ 5) + *(naddr2
+ 6) +
489 *(naddr2
+ 7) + new_port2
;
493 * Add a conn hash entry to the tables. Note that a conn hash entry
494 * (ilb_conn_t) contains info on both directions. And there are two hash
495 * tables, one for client to server and the other for server to client.
496 * So the same entry is added to both tables and can be accessed by two
497 * threads simultaneously. But each thread will only access data on one
498 * direction, so there is no conflict.
501 ilb_conn_add(ilb_stack_t
*ilbs
, ilb_rule_t
*rule
, ilb_server_t
*server
,
502 in6_addr_t
*src
, in_port_t sport
, in6_addr_t
*dst
, in_port_t dport
,
503 ilb_nat_info_t
*info
, uint32_t *ip_sum
, uint32_t *tp_sum
, ilb_sticky_t
*s
)
506 ilb_conn_hash_t
*hash
;
509 connp
= kmem_cache_alloc(ilb_conn_cache
, KM_NOSLEEP
);
512 if (rule
->ir_topo
== ILB_TOPO_IMPL_NAT
) {
513 ilb_nat_src_entry_t
**entry
;
515 entry
= s
->server
->iser_nat_src
->src_list
;
516 vmem_free(entry
[s
->nat_src_idx
]->nse_port_arena
,
517 (void *)(uintptr_t)ntohs(info
->nat_sport
),
520 ILB_STICKY_REFRELE(s
);
525 connp
->conn_l4
= rule
->ir_proto
;
527 connp
->conn_server
= server
;
528 ILB_SERVER_REFHOLD(server
);
529 connp
->conn_sticky
= s
;
531 connp
->conn_rule_cache
.topo
= rule
->ir_topo
;
532 connp
->conn_rule_cache
.info
= *info
;
534 connp
->conn_gc
= B_FALSE
;
536 connp
->conn_expiry
= rule
->ir_nat_expiry
;
537 connp
->conn_cr_time
= ddi_get_lbolt64();
539 /* Client to server info. */
540 connp
->conn_c2s_saddr
= *src
;
541 connp
->conn_c2s_sport
= sport
;
542 connp
->conn_c2s_daddr
= *dst
;
543 connp
->conn_c2s_dport
= dport
;
545 connp
->conn_c2s_atime
= ddi_get_lbolt64();
546 /* The packet that triggers this creation should be counted */
547 connp
->conn_c2s_pkt_cnt
= 1;
548 connp
->conn_c2s_tcp_fin_sent
= B_FALSE
;
549 connp
->conn_c2s_tcp_fin_acked
= B_FALSE
;
551 /* Server to client info, before NAT */
552 switch (rule
->ir_topo
) {
553 case ILB_TOPO_IMPL_HALF_NAT
:
554 connp
->conn_s2c_saddr
= info
->nat_dst
;
555 connp
->conn_s2c_sport
= info
->nat_dport
;
556 connp
->conn_s2c_daddr
= *src
;
557 connp
->conn_s2c_dport
= sport
;
559 /* Pre-calculate checksum changes for both directions */
560 if (rule
->ir_ipver
== IPPROTO_IP
) {
561 hnat_cksum_v4((uint16_t *)&dst
->s6_addr32
[3],
562 (uint16_t *)&info
->nat_dst
.s6_addr32
[3], 0, 0,
563 &connp
->conn_c2s_ip_sum
);
564 hnat_cksum_v4((uint16_t *)&dst
->s6_addr32
[3],
565 (uint16_t *)&info
->nat_dst
.s6_addr32
[3], dport
,
566 info
->nat_dport
, &connp
->conn_c2s_tp_sum
);
567 *ip_sum
= connp
->conn_c2s_ip_sum
;
568 *tp_sum
= connp
->conn_c2s_tp_sum
;
571 (uint16_t *)&info
->nat_dst
.s6_addr32
[3],
572 (uint16_t *)&dst
->s6_addr32
[3], 0, 0,
573 &connp
->conn_s2c_ip_sum
);
575 (uint16_t *)&info
->nat_dst
.s6_addr32
[3],
576 (uint16_t *)&dst
->s6_addr32
[3],
577 info
->nat_dport
, dport
,
578 &connp
->conn_s2c_tp_sum
);
580 connp
->conn_c2s_ip_sum
= 0;
581 hnat_cksum_v6((uint16_t *)dst
,
582 (uint16_t *)&info
->nat_dst
, dport
,
583 info
->nat_dport
, &connp
->conn_c2s_tp_sum
);
585 *tp_sum
= connp
->conn_c2s_tp_sum
;
587 connp
->conn_s2c_ip_sum
= 0;
588 hnat_cksum_v6((uint16_t *)&info
->nat_dst
,
589 (uint16_t *)dst
, info
->nat_dport
, dport
,
590 &connp
->conn_s2c_tp_sum
);
593 case ILB_TOPO_IMPL_NAT
:
594 connp
->conn_s2c_saddr
= info
->nat_dst
;
595 connp
->conn_s2c_sport
= info
->nat_dport
;
596 connp
->conn_s2c_daddr
= info
->nat_src
;
597 connp
->conn_s2c_dport
= info
->nat_sport
;
599 if (rule
->ir_ipver
== IPPROTO_IP
) {
600 fnat_cksum_v4((uint16_t *)&src
->s6_addr32
[3],
601 (uint16_t *)&dst
->s6_addr32
[3],
602 (uint16_t *)&info
->nat_src
.s6_addr32
[3],
603 (uint16_t *)&info
->nat_dst
.s6_addr32
[3],
604 0, 0, 0, 0, &connp
->conn_c2s_ip_sum
);
605 fnat_cksum_v4((uint16_t *)&src
->s6_addr32
[3],
606 (uint16_t *)&dst
->s6_addr32
[3],
607 (uint16_t *)&info
->nat_src
.s6_addr32
[3],
608 (uint16_t *)&info
->nat_dst
.s6_addr32
[3],
609 sport
, dport
, info
->nat_sport
,
610 info
->nat_dport
, &connp
->conn_c2s_tp_sum
);
611 *ip_sum
= connp
->conn_c2s_ip_sum
;
612 *tp_sum
= connp
->conn_c2s_tp_sum
;
615 (uint16_t *)&info
->nat_src
.s6_addr32
[3],
616 (uint16_t *)&info
->nat_dst
.s6_addr32
[3],
617 (uint16_t *)&src
->s6_addr32
[3],
618 (uint16_t *)&dst
->s6_addr32
[3],
619 0, 0, 0, 0, &connp
->conn_s2c_ip_sum
);
621 (uint16_t *)&info
->nat_src
.s6_addr32
[3],
622 (uint16_t *)&info
->nat_dst
.s6_addr32
[3],
623 (uint16_t *)&src
->s6_addr32
[3],
624 (uint16_t *)&dst
->s6_addr32
[3],
625 info
->nat_sport
, info
->nat_dport
,
626 sport
, dport
, &connp
->conn_s2c_tp_sum
);
628 fnat_cksum_v6((uint16_t *)src
, (uint16_t *)dst
,
629 (uint16_t *)&info
->nat_src
,
630 (uint16_t *)&info
->nat_dst
,
631 sport
, dport
, info
->nat_sport
,
632 info
->nat_dport
, &connp
->conn_c2s_tp_sum
);
633 connp
->conn_c2s_ip_sum
= 0;
635 *tp_sum
= connp
->conn_c2s_tp_sum
;
637 fnat_cksum_v6((uint16_t *)&info
->nat_src
,
638 (uint16_t *)&info
->nat_dst
, (uint16_t *)src
,
639 (uint16_t *)dst
, info
->nat_sport
,
640 info
->nat_dport
, sport
, dport
,
641 &connp
->conn_s2c_tp_sum
);
642 connp
->conn_s2c_ip_sum
= 0;
647 connp
->conn_s2c_atime
= ddi_get_lbolt64();
648 connp
->conn_s2c_pkt_cnt
= 1;
649 connp
->conn_s2c_tcp_fin_sent
= B_FALSE
;
650 connp
->conn_s2c_tcp_fin_acked
= B_FALSE
;
652 /* Add it to the s2c hash table. */
653 hash
= ilbs
->ilbs_s2c_conn_hash
;
654 i
= ILB_CONN_HASH((uint8_t *)&connp
->conn_s2c_saddr
.s6_addr32
[3],
655 ntohs(connp
->conn_s2c_sport
),
656 (uint8_t *)&connp
->conn_s2c_daddr
.s6_addr32
[3],
657 ntohs(connp
->conn_s2c_dport
), ilbs
->ilbs_conn_hash_size
);
658 connp
->conn_s2c_hash
= &hash
[i
];
659 DTRACE_PROBE2(ilb__conn__hash__add__s2c
, ilb_conn_t
*, connp
, int, i
);
661 mutex_enter(&hash
[i
].ilb_conn_hash_lock
);
662 hash
[i
].ilb_conn_cnt
++;
663 connp
->conn_s2c_next
= hash
[i
].ilb_connp
;
664 if (hash
[i
].ilb_connp
!= NULL
)
665 hash
[i
].ilb_connp
->conn_s2c_prev
= connp
;
666 connp
->conn_s2c_prev
= NULL
;
667 hash
[i
].ilb_connp
= connp
;
668 mutex_exit(&hash
[i
].ilb_conn_hash_lock
);
670 /* Add it to the c2s hash table. */
671 hash
= ilbs
->ilbs_c2s_conn_hash
;
672 i
= ILB_CONN_HASH((uint8_t *)&src
->s6_addr32
[3], ntohs(sport
),
673 (uint8_t *)&dst
->s6_addr32
[3], ntohs(dport
),
674 ilbs
->ilbs_conn_hash_size
);
675 connp
->conn_c2s_hash
= &hash
[i
];
676 DTRACE_PROBE2(ilb__conn__hash__add__c2s
, ilb_conn_t
*, connp
, int, i
);
678 mutex_enter(&hash
[i
].ilb_conn_hash_lock
);
679 hash
[i
].ilb_conn_cnt
++;
680 connp
->conn_c2s_next
= hash
[i
].ilb_connp
;
681 if (hash
[i
].ilb_connp
!= NULL
)
682 hash
[i
].ilb_connp
->conn_c2s_prev
= connp
;
683 connp
->conn_c2s_prev
= NULL
;
684 hash
[i
].ilb_connp
= connp
;
685 mutex_exit(&hash
[i
].ilb_conn_hash_lock
);
691 * If a connection is using TCP, we keep track of simple TCP state transition
692 * so that we know when to clean up an entry.
695 update_conn_tcp(ilb_conn_t
*connp
, void *iph
, tcpha_t
*tcpha
, int32_t pkt_len
,
701 if (tcpha
->tha_flags
& TH_RST
)
704 seg_len
= pkt_len
- ((uint8_t *)tcpha
- (uint8_t *)iph
) -
705 TCP_HDR_LENGTH((tcph_t
*)tcpha
);
707 if (tcpha
->tha_flags
& TH_ACK
)
708 ack
= ntohl(tcpha
->tha_ack
);
709 seq
= ntohl(tcpha
->tha_seq
);
711 ASSERT(MUTEX_HELD(&connp
->conn_c2s_hash
->ilb_conn_hash_lock
));
712 if (tcpha
->tha_flags
& TH_FIN
) {
713 connp
->conn_c2s_tcp_fss
= seq
+ seg_len
;
714 connp
->conn_c2s_tcp_fin_sent
= B_TRUE
;
716 connp
->conn_c2s_tcp_ack
= ack
;
718 /* Port reuse by the client, restart the conn. */
719 if (connp
->conn_c2s_tcp_fin_sent
&&
720 SEQ_GT(seq
, connp
->conn_c2s_tcp_fss
+ 1)) {
721 connp
->conn_c2s_tcp_fin_sent
= B_FALSE
;
722 connp
->conn_c2s_tcp_fin_acked
= B_FALSE
;
725 ASSERT(MUTEX_HELD(&connp
->conn_s2c_hash
->ilb_conn_hash_lock
));
726 if (tcpha
->tha_flags
& TH_FIN
) {
727 connp
->conn_s2c_tcp_fss
= seq
+ seg_len
;
728 connp
->conn_s2c_tcp_fin_sent
= B_TRUE
;
730 connp
->conn_s2c_tcp_ack
= ack
;
732 /* Port reuse by the client, restart the conn. */
733 if (connp
->conn_s2c_tcp_fin_sent
&&
734 SEQ_GT(seq
, connp
->conn_s2c_tcp_fss
+ 1)) {
735 connp
->conn_s2c_tcp_fin_sent
= B_FALSE
;
736 connp
->conn_s2c_tcp_fin_acked
= B_FALSE
;
744 * Helper routine to find conn hash entry given some packet information and
745 * the traffic direction (c2s, client to server?)
748 ilb_find_conn(ilb_stack_t
*ilbs
, void *iph
, void *tph
, int l4
, in6_addr_t
*src
,
749 in_port_t sport
, in6_addr_t
*dst
, in_port_t dport
,
750 ilb_rule_info_t
*rule_cache
, uint32_t *ip_sum
, uint32_t *tp_sum
,
751 int32_t pkt_len
, boolean_t c2s
)
753 ilb_conn_hash_t
*hash
;
757 boolean_t ret
= B_FALSE
;
759 i
= ILB_CONN_HASH((uint8_t *)&src
->s6_addr32
[3], ntohs(sport
),
760 (uint8_t *)&dst
->s6_addr32
[3], ntohs(dport
),
761 ilbs
->ilbs_conn_hash_size
);
763 hash
= ilbs
->ilbs_c2s_conn_hash
;
764 mutex_enter(&hash
[i
].ilb_conn_hash_lock
);
765 for (connp
= hash
[i
].ilb_connp
; connp
!= NULL
;
766 connp
= connp
->conn_c2s_next
) {
767 if (connp
->conn_l4
== l4
&&
768 connp
->conn_c2s_dport
== dport
&&
769 connp
->conn_c2s_sport
== sport
&&
770 IN6_ARE_ADDR_EQUAL(src
, &connp
->conn_c2s_saddr
) &&
771 IN6_ARE_ADDR_EQUAL(dst
, &connp
->conn_c2s_daddr
)) {
772 connp
->conn_c2s_atime
= ddi_get_lbolt64();
773 connp
->conn_c2s_pkt_cnt
++;
774 *rule_cache
= connp
->conn_rule_cache
;
775 *ip_sum
= connp
->conn_c2s_ip_sum
;
776 *tp_sum
= connp
->conn_c2s_tp_sum
;
782 hash
= ilbs
->ilbs_s2c_conn_hash
;
783 mutex_enter(&hash
[i
].ilb_conn_hash_lock
);
784 for (connp
= hash
[i
].ilb_connp
; connp
!= NULL
;
785 connp
= connp
->conn_s2c_next
) {
786 if (connp
->conn_l4
== l4
&&
787 connp
->conn_s2c_dport
== dport
&&
788 connp
->conn_s2c_sport
== sport
&&
789 IN6_ARE_ADDR_EQUAL(src
, &connp
->conn_s2c_saddr
) &&
790 IN6_ARE_ADDR_EQUAL(dst
, &connp
->conn_s2c_daddr
)) {
791 connp
->conn_s2c_atime
= ddi_get_lbolt64();
792 connp
->conn_s2c_pkt_cnt
++;
793 *rule_cache
= connp
->conn_rule_cache
;
794 *ip_sum
= connp
->conn_s2c_ip_sum
;
795 *tp_sum
= connp
->conn_s2c_tp_sum
;
802 ILB_S_KSTAT(connp
->conn_server
, pkt_processed
);
803 ILB_S_KSTAT_UPDATE(connp
->conn_server
, bytes_processed
,
808 tcp_alive
= update_conn_tcp(connp
, iph
, tph
, pkt_len
,
811 connp
->conn_gc
= B_TRUE
;
818 mutex_exit(&hash
[i
].ilb_conn_hash_lock
);
824 * To check if a given packet matches an existing conn hash entry. If it
825 * does, return the information about this entry so that the caller can
829 ilb_check_conn(ilb_stack_t
*ilbs
, int l3
, void *iph
, int l4
, void *tph
,
830 in6_addr_t
*src
, in6_addr_t
*dst
, in_port_t sport
, in_port_t dport
,
831 uint32_t pkt_len
, in6_addr_t
*lb_dst
)
833 ilb_rule_info_t rule_cache
;
834 uint32_t adj_ip_sum
, adj_tp_sum
;
837 /* Check the incoming hash table. */
838 if (ilb_find_conn(ilbs
, iph
, tph
, l4
, src
, sport
, dst
, dport
,
839 &rule_cache
, &adj_ip_sum
, &adj_tp_sum
, pkt_len
, B_TRUE
)) {
840 switch (rule_cache
.topo
) {
841 case ILB_TOPO_IMPL_NAT
:
842 *lb_dst
= rule_cache
.info
.nat_dst
;
843 ilb_full_nat(l3
, iph
, l4
, tph
, &rule_cache
.info
,
844 adj_ip_sum
, adj_tp_sum
, B_TRUE
);
847 case ILB_TOPO_IMPL_HALF_NAT
:
848 *lb_dst
= rule_cache
.info
.nat_dst
;
849 ilb_half_nat(l3
, iph
, l4
, tph
, &rule_cache
.info
,
850 adj_ip_sum
, adj_tp_sum
, B_TRUE
);
859 if (ilb_find_conn(ilbs
, iph
, tph
, l4
, src
, sport
, dst
, dport
,
860 &rule_cache
, &adj_ip_sum
, &adj_tp_sum
, pkt_len
, B_FALSE
)) {
861 switch (rule_cache
.topo
) {
862 case ILB_TOPO_IMPL_NAT
:
863 *lb_dst
= rule_cache
.info
.src
;
864 ilb_full_nat(l3
, iph
, l4
, tph
, &rule_cache
.info
,
865 adj_ip_sum
, adj_tp_sum
, B_FALSE
);
868 case ILB_TOPO_IMPL_HALF_NAT
:
870 ilb_half_nat(l3
, iph
, l4
, tph
, &rule_cache
.info
,
871 adj_ip_sum
, adj_tp_sum
, B_FALSE
);
885 * To check if an ICMP packet belongs to a connection in one of the conn
889 ilb_check_icmp_conn(ilb_stack_t
*ilbs
, mblk_t
*mp
, int l3
, void *out_iph
,
890 void *icmph
, in6_addr_t
*lb_dst
)
892 ilb_conn_hash_t
*hash
;
897 in6_addr_t
*in_src_p
, *in_dst_p
;
898 in_port_t
*sport
, *dport
;
902 ilb_rule_info_t rule_cache
;
906 if (l3
== IPPROTO_IP
) {
907 in6_addr_t in_src
, in_dst
;
909 icmph4
= (icmph_t
*)icmph
;
910 in_iph4
= (ipha_t
*)&icmph4
[1];
912 if ((uint8_t *)in_iph4
+ IPH_HDR_LENGTH(in_iph4
) +
913 ICMP_MIN_TP_HDR_LEN
> mp
->b_wptr
) {
917 IN6_IPADDR_TO_V4MAPPED(in_iph4
->ipha_src
, &in_src
);
919 IN6_IPADDR_TO_V4MAPPED(in_iph4
->ipha_dst
, &in_dst
);
922 l4
= in_iph4
->ipha_protocol
;
923 if (l4
!= IPPROTO_TCP
&& l4
!= IPPROTO_UDP
)
926 sport
= (in_port_t
*)((char *)in_iph4
+
927 IPH_HDR_LENGTH(in_iph4
));
930 DTRACE_PROBE4(ilb__chk__icmp__conn__v4
, uint32_t,
931 in_iph4
->ipha_src
, uint32_t, in_iph4
->ipha_dst
, uint16_t,
932 ntohs(*sport
), uint16_t, ntohs(*dport
));
934 ASSERT(l3
== IPPROTO_IPV6
);
936 icmph6
= (icmp6_t
*)icmph
;
937 in_iph6
= (ip6_t
*)&icmph6
[1];
938 in_src_p
= &in_iph6
->ip6_src
;
939 in_dst_p
= &in_iph6
->ip6_dst
;
941 if ((uint8_t *)in_iph6
+ sizeof (ip6_t
) +
942 ICMP_MIN_TP_HDR_LEN
> mp
->b_wptr
) {
946 l4
= in_iph6
->ip6_nxt
;
947 /* We don't go deep inside an IPv6 packet yet. */
948 if (l4
!= IPPROTO_TCP
&& l4
!= IPPROTO_UDP
)
951 sport
= (in_port_t
*)&in_iph6
[1];
954 DTRACE_PROBE4(ilb__chk__icmp__conn__v6
, in6_addr_t
*,
955 &in_iph6
->ip6_src
, in6_addr_t
*, &in_iph6
->ip6_dst
,
956 uint16_t, ntohs(*sport
), uint16_t, ntohs(*dport
));
959 i
= ILB_CONN_HASH((uint8_t *)&in_dst_p
->s6_addr32
[3], ntohs(*dport
),
960 (uint8_t *)&in_src_p
->s6_addr32
[3], ntohs(*sport
),
961 ilbs
->ilbs_conn_hash_size
);
962 hash
= ilbs
->ilbs_c2s_conn_hash
;
964 mutex_enter(&hash
[i
].ilb_conn_hash_lock
);
965 for (connp
= hash
[i
].ilb_connp
; connp
!= NULL
;
966 connp
= connp
->conn_c2s_next
) {
967 if (connp
->conn_l4
== l4
&&
968 connp
->conn_c2s_dport
== *sport
&&
969 connp
->conn_c2s_sport
== *dport
&&
970 IN6_ARE_ADDR_EQUAL(in_dst_p
, &connp
->conn_c2s_saddr
) &&
971 IN6_ARE_ADDR_EQUAL(in_src_p
, &connp
->conn_c2s_daddr
)) {
972 connp
->conn_c2s_atime
= ddi_get_lbolt64();
973 connp
->conn_c2s_pkt_cnt
++;
974 rule_cache
= connp
->conn_rule_cache
;
975 adj_ip_sum
= connp
->conn_c2s_ip_sum
;
979 mutex_exit(&hash
[i
].ilb_conn_hash_lock
);
982 DTRACE_PROBE(ilb__chk__icmp__conn__failed
);
986 switch (rule_cache
.topo
) {
987 case ILB_TOPO_IMPL_NAT
:
990 case ILB_TOPO_IMPL_HALF_NAT
:
997 *lb_dst
= rule_cache
.info
.nat_dst
;
998 if (l3
== IPPROTO_IP
) {
999 ilb_nat_icmpv4(mp
, out_iph
, icmph4
, in_iph4
, sport
, dport
,
1000 &rule_cache
.info
, adj_ip_sum
, full_nat
);
1002 ilb_nat_icmpv6(mp
, out_iph
, icmph6
, in_iph6
, sport
, dport
,
1003 &rule_cache
.info
, full_nat
);
1009 * This routine sends up the conn hash table to user land. Note that the
1010 * request is an ioctl, hence we cannot really differentiate requests
1011 * from different clients. There is no context shared between different
1012 * ioctls. Here we make the assumption that the user land ilbd will
1013 * only allow one client to show the conn hash table at any time.
1014 * Otherwise, the results will be "very" inconsistent.
1016 * In each ioctl, a flag (ILB_LIST_BEGIN) indicates whether the client wants
1017 * to read from the beginning of the table. After a certain number of
1018 * entries are reported, the kernel remembers the position of the last
1019 * returned entry. When the next ioctl comes in with the ILB_LIST_CONT flag,
1020 * it will return entries starting from where it was left off. When
1021 * the end of table is reached, a flag (ILB_LIST_END) is set to tell
1022 * the client that there is no more entry.
1024 * It is assumed that the caller has checked the size of nat so that it
1025 * can hold num entries.
1029 ilb_list_nat(ilb_stack_t
*ilbs
, zoneid_t zoneid
, ilb_nat_entry_t
*nat
,
1030 uint32_t *num
, uint32_t *flags
)
1032 ilb_conn_hash_t
*hash
;
1033 ilb_conn_t
*cur_connp
;
1037 mutex_enter(&ilbs
->ilbs_conn_list_lock
);
1038 while (ilbs
->ilbs_conn_list_busy
) {
1039 if (cv_wait_sig(&ilbs
->ilbs_conn_list_cv
,
1040 &ilbs
->ilbs_conn_list_lock
) == 0) {
1041 mutex_exit(&ilbs
->ilbs_conn_list_lock
);
1045 if ((hash
= ilbs
->ilbs_c2s_conn_hash
) == NULL
) {
1046 ASSERT(ilbs
->ilbs_s2c_conn_hash
== NULL
);
1047 mutex_exit(&ilbs
->ilbs_conn_list_lock
);
1049 *flags
|= ILB_LIST_END
;
1052 ilbs
->ilbs_conn_list_busy
= B_TRUE
;
1053 mutex_exit(&ilbs
->ilbs_conn_list_lock
);
1055 if (*flags
& ILB_LIST_BEGIN
) {
1057 mutex_enter(&hash
[0].ilb_conn_hash_lock
);
1058 cur_connp
= hash
[0].ilb_connp
;
1059 } else if (*flags
& ILB_LIST_CONT
) {
1060 if (ilbs
->ilbs_conn_list_cur
== ilbs
->ilbs_conn_hash_size
) {
1062 *flags
|= ILB_LIST_END
;
1065 i
= ilbs
->ilbs_conn_list_cur
;
1066 mutex_enter(&hash
[i
].ilb_conn_hash_lock
);
1067 cur_connp
= ilbs
->ilbs_conn_list_connp
;
1075 if (cur_connp
== NULL
) {
1076 mutex_exit(&hash
[i
].ilb_conn_hash_lock
);
1077 if (++i
== ilbs
->ilbs_conn_hash_size
) {
1078 *flags
|= ILB_LIST_END
;
1081 mutex_enter(&hash
[i
].ilb_conn_hash_lock
);
1082 cur_connp
= hash
[i
].ilb_connp
;
1085 nat
[j
].proto
= cur_connp
->conn_l4
;
1087 nat
[j
].in_global
= cur_connp
->conn_c2s_daddr
;
1088 nat
[j
].in_global_port
= cur_connp
->conn_c2s_dport
;
1089 nat
[j
].out_global
= cur_connp
->conn_c2s_saddr
;
1090 nat
[j
].out_global_port
= cur_connp
->conn_c2s_sport
;
1092 nat
[j
].in_local
= cur_connp
->conn_s2c_saddr
;
1093 nat
[j
].in_local_port
= cur_connp
->conn_s2c_sport
;
1094 nat
[j
].out_local
= cur_connp
->conn_s2c_daddr
;
1095 nat
[j
].out_local_port
= cur_connp
->conn_s2c_dport
;
1097 nat
[j
].create_time
= TICK_TO_MSEC(cur_connp
->conn_cr_time
);
1098 nat
[j
].last_access_time
=
1099 TICK_TO_MSEC(cur_connp
->conn_c2s_atime
);
1102 * The conn_s2c_pkt_cnt may not be accurate since we are not
1103 * holding the s2c hash lock.
1105 nat
[j
].pkt_cnt
= cur_connp
->conn_c2s_pkt_cnt
+
1106 cur_connp
->conn_s2c_pkt_cnt
;
1109 cur_connp
= cur_connp
->conn_c2s_next
;
1111 ilbs
->ilbs_conn_list_connp
= cur_connp
;
1113 mutex_exit(&hash
[i
].ilb_conn_hash_lock
);
1115 ilbs
->ilbs_conn_list_cur
= i
;
1119 mutex_enter(&ilbs
->ilbs_conn_list_lock
);
1120 ilbs
->ilbs_conn_list_busy
= B_FALSE
;
1121 cv_signal(&ilbs
->ilbs_conn_list_cv
);
1122 mutex_exit(&ilbs
->ilbs_conn_list_lock
);
1129 * Stickiness (persistence) handling routines.
1134 ilb_sticky_cache_init(void)
1136 ilb_sticky_cache
= kmem_cache_create("ilb_sticky_cache",
1137 sizeof (ilb_sticky_t
), 0, NULL
, NULL
, NULL
, NULL
, NULL
,
1142 ilb_sticky_cache_fini(void)
1144 if (ilb_sticky_cache
!= NULL
) {
1145 kmem_cache_destroy(ilb_sticky_cache
);
1146 ilb_sticky_cache
= NULL
;
1151 ilb_sticky_refrele(ilb_sticky_t
*s
)
1153 ILB_STICKY_REFRELE(s
);
1156 static ilb_sticky_t
*
1157 ilb_sticky_lookup(ilb_sticky_hash_t
*hash
, ilb_rule_t
*rule
, in6_addr_t
*src
)
1161 ASSERT(mutex_owned(&hash
->sticky_lock
));
1163 for (s
= list_head(&hash
->sticky_head
); s
!= NULL
;
1164 s
= list_next(&hash
->sticky_head
, s
)) {
1165 if (s
->rule_instance
== rule
->ir_ks_instance
) {
1166 if (IN6_ARE_ADDR_EQUAL(src
, &s
->src
))
/*
 * Allocate a sticky entry from ilb_sticky_cache, fill it in from the rule,
 * take a reference on the server and insert the entry at the head of the
 * bucket's list.
 *
 * The caller must hold hash->sticky_lock (asserted below).  The allocation
 * uses KM_NOSLEEP, so it does not block and may fail; the NULL result is
 * checked right after the allocation.
 */
1173 static ilb_sticky_t
*
1174 ilb_sticky_add(ilb_sticky_hash_t
*hash
, ilb_rule_t
*rule
, ilb_server_t
*server
,
1179 ASSERT(mutex_owned(&hash
->sticky_lock
));
/* Non-blocking allocation; may return NULL under memory pressure. */
1181 if ((s
= kmem_cache_alloc(ilb_sticky_cache
, KM_NOSLEEP
)) == NULL
)
1185 * The rule instance is for handling the scenario when the same
1186 * client talks to different rules at the same time. Stickiness
1187 * is per rule so we can use the rule instance to differentiate
1188 * the client's request.
1190 s
->rule_instance
= rule
->ir_ks_instance
;
1192 * Copy the rule name for listing all sticky cache entry. ir_name
1193 * is guaranteed to be NULL terminated.
1195 (void) strcpy(s
->rule_name
, rule
->ir_name
);
1199 * Grab a ref cnt on the server so that it won't go away while
1200 * it is still in the sticky table.
1202 ILB_SERVER_REFHOLD(server
);
/* ilb_sticky_cleanup() converts this value with SEC_TO_TICK(). */
1204 s
->expiry
= rule
->ir_sticky_expiry
;
1209 * There is no need to set atime here since the refcnt is not
1210 * zero. A sticky entry is removed only when the refcnt is
1211 * zero. But just set it here for debugging purpose. The
1212 * atime is set when a refrele is done on a sticky entry.
1214 s
->atime
= ddi_get_lbolt64();
/*
 * Insert at the head: ilb_sticky_lookup() walks from the head, so a newly
 * added entry is found before any older entry for the same client and rule.
 */
1216 list_insert_head(&hash
->sticky_head
, s
);
1222 * This routine checks if there is an existing sticky entry which matches
1223 * a given packet. If there is one, return it. If there is not, create
1224 * a sticky entry using the packet's info.
/*
 * ilbs:	IP stack instance whose sticky hash table is searched
 * rule:	rule the packet matched; its pointer value feeds the hash and
 *		ilb_sticky_lookup() matches entries on its ir_ks_instance
 * src:		client address; its last 32-bit word feeds the hash
 * server:	server picked by the caller, must not be NULL (asserted)
 * res:		pointer through which the sticky entry is handed back --
 *		NOTE(review): confirm against the callers
 * src_ent_idx:	written only when server->iser_nat_src is not NULL; gets
 *		the NAT source index associated with the sticky entry
 *
 * The lookup and any insertion are done under the bucket's sticky_lock.
 */
1227 ilb_sticky_find_add(ilb_stack_t
*ilbs
, ilb_rule_t
*rule
, in6_addr_t
*src
,
1228 ilb_server_t
*server
, ilb_sticky_t
**res
, uint16_t *src_ent_idx
)
1231 ilb_sticky_hash_t
*hash
;
1234 ASSERT(server
!= NULL
);
/*
 * Pick the bucket from the last 32-bit word of the client address and the
 * rule pointer (truncated to 32 bits).
 */
1238 i
= ILB_STICKY_HASH((uint8_t *)&src
->s6_addr32
[3],
1239 (uint32_t)(uintptr_t)rule
, ilbs
->ilbs_sticky_hash_size
);
1240 hash
= &ilbs
->ilbs_sticky_hash
[i
];
1242 /* First check if there is already an entry. */
1243 mutex_enter(&hash
->sticky_lock
);
1244 s
= ilb_sticky_lookup(hash
, rule
, src
);
1246 /* No sticky entry, add one. */
/* ilb_sticky_add() allocates with KM_NOSLEEP and so can fail. */
1249 s
= ilb_sticky_add(hash
, rule
, server
, src
);
1251 mutex_exit(&hash
->sticky_lock
);
1255 * Find a source for this server. All subseqent requests from
1256 * the same client matching this sticky entry will use this
1257 * source address in doing NAT. The current algorithm is
1258 * simple, rotate the source address. Note that the
1259 * source address array does not change after it's created, so
1260 * it is OK to just increment the cur index.
1262 if (server
->iser_nat_src
!= NULL
) {
1263 /* It is a hint, does not need to be atomic. */
1264 *src_ent_idx
= (server
->iser_nat_src
->cur
++ %
1265 server
->iser_nat_src
->num_src
);
/* Remember the choice in the entry so later packets reuse it. */
1266 s
->nat_src_idx
= *src_ent_idx
;
1268 mutex_exit(&hash
->sticky_lock
);
1274 * We don't hold any lock accessing iser_enabled. Refer to the
1275 * comment in ilb_server_add() about iser_lock.
1277 if (!s
->server
->iser_enabled
) {
1279 * s->server == server can only happen if there is a race in
1280 * toggling the iser_enabled flag (we don't hold a lock doing
1281 * that) so that the load balance algorithm still returns a
1282 * disabled server. In this case, just drop the packet...
1284 if (s
->server
== server
) {
1285 mutex_exit(&hash
->sticky_lock
);
1290 * The old server is disabled and there is a new server, use
1291 * the new one to create a sticky entry. Since we will
1292 * add the entry at the beginning, subsequent lookup will
1293 * find this new entry instead of the old one.
1300 mutex_exit(&hash
->sticky_lock
);
/* Existing entry: hand back the NAT source index stored at creation. */
1301 if (server
->iser_nat_src
!= NULL
)
1302 *src_ent_idx
= s
->nat_src_idx
;
/*
 * Garbage-collect sticky entries.  This is the function ilb_sticky_timer()
 * dispatches onto the sticky taskq; arg is the ilb_timer_t describing which
 * slice of the hash table to scan.
 *
 * Buckets timer->start up to, but not including, timer->end are visited,
 * each under its own sticky_lock.  An entry whose atime is older than
 * "now - SEC_TO_TICK(s->expiry)" is removed: the server reference taken in
 * ilb_sticky_add() is released, the entry is unlinked and returned to
 * ilb_sticky_cache, and the bucket's sticky_cnt is decremented.
 */
1307 ilb_sticky_cleanup(void *arg
)
1309 ilb_timer_t
*timer
= (ilb_timer_t
*)arg
;
1312 ilb_sticky_hash_t
*hash
;
1313 ilb_sticky_t
*s
, *nxt_s
;
1314 int64_t now
, expiry
;
1317 hash
= ilbs
->ilbs_sticky_hash
;
1318 ASSERT(hash
!= NULL
);
/* One timestamp is taken up front and used for the whole scan. */
1320 now
= ddi_get_lbolt64();
1321 for (i
= timer
->start
; i
< timer
->end
; i
++) {
1322 mutex_enter(&hash
[i
].sticky_lock
);
1323 for (s
= list_head(&hash
[i
].sticky_head
); s
!= NULL
;
/* Fetch the successor before s can be removed and freed below. */
1325 nxt_s
= list_next(&hash
[i
].sticky_head
, s
);
/* s->expiry is in seconds; convert to ticks to compare with atime. */
1328 expiry
= now
- SEC_TO_TICK(s
->expiry
);
1329 if (s
->atime
< expiry
) {
1330 ILB_SERVER_REFRELE(s
->server
);
1331 list_remove(&hash
[i
].sticky_head
, s
);
1332 kmem_cache_free(ilb_sticky_cache
, s
);
1333 hash
[i
].sticky_cnt
--;
1336 mutex_exit(&hash
[i
].sticky_lock
);
1341 ilb_sticky_timer(void *arg
)
1343 ilb_timer_t
*timer
= (ilb_timer_t
*)arg
;
1345 (void) taskq_dispatch(timer
->ilbs
->ilbs_sticky_taskq
,
1346 ilb_sticky_cleanup
, arg
, TQ_SLEEP
);
1347 mutex_enter(&timer
->tid_lock
);
1348 if (timer
->tid
== 0) {
1349 mutex_exit(&timer
->tid_lock
);
1351 timer
->tid
= timeout(ilb_sticky_timer
, arg
,
1352 SEC_TO_TICK(ilb_sticky_timeout
));
1353 mutex_exit(&timer
->tid_lock
);
/*
 * Set up sticky entry handling for one IP stack instance:
 *
 *  - round ilbs_sticky_hash_size up to a power of two if it is not one;
 *  - allocate the zeroed bucket array and initialize each bucket's lock
 *    and list;
 *  - create the sticky kmem cache if it does not exist yet;
 *  - create the taskq that runs ilb_sticky_cleanup();
 *  - allocate ilb_sticky_timer_size timers, give each a contiguous slice
 *    of the bucket array, and start them.
 *
 * Both allocations use KM_SLEEP.
 */
1358 ilb_sticky_hash_init(ilb_stack_t
*ilbs
)
1360 extern pri_t minclsyspri
;
1362 char tq_name
[TASKQ_NAMELEN
];
/* Look for the first power of two larger than the configured size. */
1365 if (!ISP2(ilbs
->ilbs_sticky_hash_size
)) {
1366 for (i
= 0; i
< 31; i
++) {
1367 if (ilbs
->ilbs_sticky_hash_size
< (1 << i
))
1370 ilbs
->ilbs_sticky_hash_size
= 1 << i
;
1373 ilbs
->ilbs_sticky_hash
= kmem_zalloc(sizeof (ilb_sticky_hash_t
) *
1374 ilbs
->ilbs_sticky_hash_size
, KM_SLEEP
);
1375 for (i
= 0; i
< ilbs
->ilbs_sticky_hash_size
; i
++) {
1376 mutex_init(&ilbs
->ilbs_sticky_hash
[i
].sticky_lock
, NULL
,
1377 MUTEX_DEFAULT
, NULL
);
1378 list_create(&ilbs
->ilbs_sticky_hash
[i
].sticky_head
,
1379 sizeof (ilb_sticky_t
),
1380 offsetof(ilb_sticky_t
, list
));
/* The cache is created only when the pointer is still NULL. */
1383 if (ilb_sticky_cache
== NULL
)
1384 ilb_sticky_cache_init();
/* The netstack pointer is embedded in the taskq name. */
1386 (void) snprintf(tq_name
, sizeof (tq_name
), "ilb_sticky_taskq_%p",
1387 (void *)ilbs
->ilbs_netstack
);
1388 ASSERT(ilbs
->ilbs_sticky_taskq
== NULL
);
1389 ilbs
->ilbs_sticky_taskq
= taskq_create(tq_name
,
1390 ilb_sticky_timer_size
* 2, minclsyspri
, ilb_sticky_timer_size
,
1391 ilb_sticky_timer_size
* 2, TASKQ_PREPOPULATE
|TASKQ_DYNAMIC
);
1393 ASSERT(ilbs
->ilbs_sticky_timer_list
== NULL
);
1394 ilbs
->ilbs_sticky_timer_list
= kmem_zalloc(sizeof (ilb_timer_t
) *
1395 ilb_sticky_timer_size
, KM_SLEEP
);
/*
 * Each timer covers "part" consecutive buckets.  The + 1 rounds up so the
 * slices cover the whole table; slices that would run past the end of the
 * table are clamped below.
 */
1396 part
= ilbs
->ilbs_sticky_hash_size
/ ilb_sticky_timer_size
+ 1;
1397 for (i
= 0; i
< ilb_sticky_timer_size
; i
++) {
1398 tm
= ilbs
->ilbs_sticky_timer_list
+ i
;
1399 tm
->start
= i
* part
;
1400 tm
->end
= i
* part
+ part
;
1401 if (tm
->end
> ilbs
->ilbs_sticky_hash_size
)
1402 tm
->end
= ilbs
->ilbs_sticky_hash_size
;
1404 mutex_init(&tm
->tid_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
1405 /* Spread out the starting execution time of all the timers. */
/* Timer i first fires ilb_sticky_timeout + i seconds from now. */
1406 tm
->tid
= timeout(ilb_sticky_timer
, tm
,
1407 SEC_TO_TICK(ilb_sticky_timeout
+ i
));
1412 ilb_sticky_hash_fini(ilb_stack_t
*ilbs
)
1417 if (ilbs
->ilbs_sticky_hash
== NULL
)
1420 /* Stop all the timers first. */
1421 for (i
= 0; i
< ilb_sticky_timer_size
; i
++) {
1424 /* Setting tid to 0 tells the timer handler not to restart. */
1425 mutex_enter(&ilbs
->ilbs_sticky_timer_list
[i
].tid_lock
);
1426 tid
= ilbs
->ilbs_sticky_timer_list
[i
].tid
;
1427 ilbs
->ilbs_sticky_timer_list
[i
].tid
= 0;
1428 mutex_exit(&ilbs
->ilbs_sticky_timer_list
[i
].tid_lock
);
1429 (void) untimeout(tid
);
1431 kmem_free(ilbs
->ilbs_sticky_timer_list
, sizeof (ilb_timer_t
) *
1432 ilb_sticky_timer_size
);
1433 taskq_destroy(ilbs
->ilbs_sticky_taskq
);
1434 ilbs
->ilbs_sticky_taskq
= NULL
;
1436 for (i
= 0; i
< ilbs
->ilbs_sticky_hash_size
; i
++) {
1437 while ((s
= list_head(&ilbs
->ilbs_sticky_hash
[i
].sticky_head
))
1439 list_remove(&ilbs
->ilbs_sticky_hash
[i
].sticky_head
, s
);
1440 ILB_SERVER_REFRELE(s
->server
);
1441 kmem_free(s
, sizeof (ilb_sticky_t
));
1444 kmem_free(ilbs
->ilbs_sticky_hash
, ilbs
->ilbs_sticky_hash_size
*
1445 sizeof (ilb_sticky_hash_t
));
1449 * This routine sends up the sticky hash table to user land. Refer to
1450 * the comments before ilb_list_nat(). Both routines assume similar
1453 * It is assumed that the caller has checked the size of st so that it
1454 * can hold num entries.
1458 ilb_list_sticky(ilb_stack_t
*ilbs
, zoneid_t zoneid
, ilb_sticky_entry_t
*st
,
1459 uint32_t *num
, uint32_t *flags
)
1461 ilb_sticky_hash_t
*hash
;
1466 mutex_enter(&ilbs
->ilbs_sticky_list_lock
);
1467 while (ilbs
->ilbs_sticky_list_busy
) {
1468 if (cv_wait_sig(&ilbs
->ilbs_sticky_list_cv
,
1469 &ilbs
->ilbs_sticky_list_lock
) == 0) {
1470 mutex_exit(&ilbs
->ilbs_sticky_list_lock
);
1474 if ((hash
= ilbs
->ilbs_sticky_hash
) == NULL
) {
1475 mutex_exit(&ilbs
->ilbs_sticky_list_lock
);
1477 *flags
|= ILB_LIST_END
;
1480 ilbs
->ilbs_sticky_list_busy
= B_TRUE
;
1481 mutex_exit(&ilbs
->ilbs_sticky_list_lock
);
1483 if (*flags
& ILB_LIST_BEGIN
) {
1485 mutex_enter(&hash
[0].sticky_lock
);
1486 curp
= list_head(&hash
[0].sticky_head
);
1487 } else if (*flags
& ILB_LIST_CONT
) {
1488 if (ilbs
->ilbs_sticky_list_cur
== ilbs
->ilbs_sticky_hash_size
) {
1490 *flags
|= ILB_LIST_END
;
1493 i
= ilbs
->ilbs_sticky_list_cur
;
1494 mutex_enter(&hash
[i
].sticky_lock
);
1495 curp
= ilbs
->ilbs_sticky_list_curp
;
1504 mutex_exit(&hash
[i
].sticky_lock
);
1505 if (++i
== ilbs
->ilbs_sticky_hash_size
) {
1506 *flags
|= ILB_LIST_END
;
1509 mutex_enter(&hash
[i
].sticky_lock
);
1510 curp
= list_head(&hash
[i
].sticky_head
);
1513 (void) strcpy(st
[j
].rule_name
, curp
->rule_name
);
1514 st
[j
].req_addr
= curp
->src
;
1515 st
[j
].srv_addr
= curp
->server
->iser_addr_v6
;
1516 st
[j
].expiry_time
= TICK_TO_MSEC(curp
->expiry
);
1518 curp
= list_next(&hash
[i
].sticky_head
, curp
);
1520 ilbs
->ilbs_sticky_list_curp
= curp
;
1522 mutex_exit(&hash
[i
].sticky_lock
);
1524 ilbs
->ilbs_sticky_list_cur
= i
;
1528 mutex_enter(&ilbs
->ilbs_sticky_list_lock
);
1529 ilbs
->ilbs_sticky_list_busy
= B_FALSE
;
1530 cv_signal(&ilbs
->ilbs_sticky_list_cv
);
1531 mutex_exit(&ilbs
->ilbs_sticky_list_lock
);