1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
26 * IP PACKET CLASSIFIER
 28  * The IP packet classifier provides a mapping between IP packets and persistent
 29  * connection state for connection-oriented protocols. It also provides an
 30  * interface for managing connection state.
32 * The connection state is kept in conn_t data structure and contains, among
33 * other things:
35 * o local/remote address and ports
36 * o Transport protocol
37 * o squeue for the connection (for TCP only)
38 * o reference counter
39 * o Connection state
40 * o hash table linkage
41 * o interface/ire information
42 * o credentials
43 * o ipsec policy
44 * o send and receive functions.
45 * o mutex lock.
47 * Connections use a reference counting scheme. They are freed when the
 48  * reference counter drops to zero. A reference is incremented when a connection
 49  * is placed in a list or table, when an incoming packet for the connection arrives
 50  * and when the connection is processed via an squeue (squeue processing may be
51 * asynchronous and the reference protects the connection from being destroyed
52 * before its processing is finished).
54 * conn_recv is used to pass up packets to the ULP.
 55  * For TCP, conn_recv changes. It is tcp_input_listener_unbound initially for
 56  * a listener, and changes to tcp_input_listener once the listener has picked a
57 * good squeue. For other cases it is set to tcp_input_data.
59 * conn_recvicmp is used to pass up ICMP errors to the ULP.
61 * Classifier uses several hash tables:
63 * ipcl_conn_fanout: contains all TCP connections in CONNECTED state
64 * ipcl_bind_fanout: contains all connections in BOUND state
65 * ipcl_proto_fanout: IPv4 protocol fanout
66 * ipcl_proto_fanout_v6: IPv6 protocol fanout
67 * ipcl_udp_fanout: contains all UDP connections
68 * ipcl_iptun_fanout: contains all IP tunnel connections
69 * ipcl_globalhash_fanout: contains all connections
71 * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
72 * which need to view all existing connections.
74 * All tables are protected by per-bucket locks. When both per-bucket lock and
75 * connection lock need to be held, the per-bucket lock should be acquired
76 * first, followed by the connection lock.
78 * All functions doing search in one of these tables increment a reference
79 * counter on the connection found (if any). This reference should be dropped
80 * when the caller has finished processing the connection.
83 * INTERFACES:
84 * ===========
86 * Connection Lookup:
87 * ------------------
89 * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, ira, ip_stack)
90 * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, ira, ip_stack)
92 * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
93 * it can't find any associated connection. If the connection is found, its
94 * reference counter is incremented.
 96  *      mp:     mblock containing the packet header. The full header should fit
 97  *              into a single mblock. It should also contain at least the full IP
 98  *              header and the TCP or UDP header.
100 * protocol: Either IPPROTO_TCP or IPPROTO_UDP.
 102  *      hdr_len: The size of the IP header. It is used to find the TCP or UDP
 103  *              header in the packet.
105 * ira->ira_zoneid: The zone in which the returned connection must be; the
106 * zoneid corresponding to the ire_zoneid on the IRE located for
107 * the packet's destination address.
109 * ira->ira_flags: Contains the IRAF_TX_MAC_EXEMPTABLE and
110 * IRAF_TX_SHARED_ADDR flags
112 * For TCP connections, the lookup order is as follows:
113 * 5-tuple {src, dst, protocol, local port, remote port}
114 * lookup in ipcl_conn_fanout table.
115 * 3-tuple {dst, remote port, protocol} lookup in
116 * ipcl_bind_fanout table.
118 * For UDP connections, a 5-tuple {src, dst, protocol, local port,
 119  *      remote port} lookup is done on ipcl_udp_fanout. Note that
 120  *      these interfaces do not handle cases where a packet belongs
121 * to multiple UDP clients, which is handled in IP itself.
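 *
 *      A minimal caller-side sketch (added for illustration only; it assumes
 *      the IP header length has already been computed into hdr_len and that
 *      mp, ira and ipst are as described above):
 *
 *              conn_t *connp;
 *
 *              connp = ipcl_classify_v4(mp, IPPROTO_TCP, hdr_len, ira, ipst);
 *              if (connp != NULL) {
 *                      ... hand mp to the ULP via the conn ...
 *                      CONN_DEC_REF(connp);    drop the lookup reference
 *              }
 *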
123 * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcpha_t *, int, ip_stack);
124 * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
125 * ip_stack);
 127  *      Lookup routine to find an exact match for {src, dst, local port,
 128  *      remote port} for TCP connections in ipcl_conn_fanout. The address and
129 * ports are read from the IP and TCP header respectively.
131 * conn_t *ipcl_lookup_listener_v4(lport, laddr, protocol,
132 * zoneid, ip_stack);
133 * conn_t *ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
134 * zoneid, ip_stack);
136 * Lookup routine to find a listener with the tuple {lport, laddr,
137 * protocol} in the ipcl_bind_fanout table. For IPv6, an additional
 138  *      parameter, the interface index, is also compared.
140 * void ipcl_walk(func, arg, ip_stack)
142 * Apply 'func' to every connection available. The 'func' is called as
143 * (*func)(connp, arg). The walk is non-atomic so connections may be
144 * created and destroyed during the walk. The CONN_CONDEMNED and
145 * CONN_INCIPIENT flags ensure that connections which are newly created
146 * or being destroyed are not selected by the walker.
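 *
 *      Walker sketch (illustrative only; the callback name and the counter
 *      are hypothetical, and a cast to the walker's declared function-pointer
 *      type may be needed):
 *
 *              static void
 *              count_conn(conn_t *connp, void *arg)
 *              {
 *                      (*(uint_t *)arg)++;
 *              }
 *
 *              uint_t nconns = 0;
 *              ipcl_walk(count_conn, &nconns, ipst);
 *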
148 * Table Updates
149 * -------------
151 * int ipcl_conn_insert(connp);
152 * int ipcl_conn_insert_v4(connp);
153 * int ipcl_conn_insert_v6(connp);
155 * Insert 'connp' in the ipcl_conn_fanout.
 156  *      Arguments :
157 * connp conn_t to be inserted
159 * Return value :
160 * 0 if connp was inserted
161 * EADDRINUSE if the connection with the same tuple
162 * already exists.
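 *
 *      Typical caller-side check (sketch only):
 *
 *              if (ipcl_conn_insert(connp) != 0) {
 *                      ... the 5-tuple is already in use; back out and
 *                          report EADDRINUSE to the caller ...
 *              }
 *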
164 * int ipcl_bind_insert(connp);
165 * int ipcl_bind_insert_v4(connp);
166 * int ipcl_bind_insert_v6(connp);
168 * Insert 'connp' in ipcl_bind_fanout.
 169  *      Arguments :
170 * connp conn_t to be inserted
173 * void ipcl_hash_remove(connp);
175 * Removes the 'connp' from the connection fanout table.
177 * Connection Creation/Destruction
178 * -------------------------------
180 * conn_t *ipcl_conn_create(type, sleep, netstack_t *)
 182  *      Creates a new conn based on the type flag and inserts it into the
 183  *      global hash table.
185 * type: This flag determines the type of conn_t which needs to be
186 * created i.e., which kmem_cache it comes from.
187 * IPCL_TCPCONN indicates a TCP connection
188 * IPCL_SCTPCONN indicates a SCTP connection
189 * IPCL_UDPCONN indicates a UDP conn_t.
190 * IPCL_RAWIPCONN indicates a RAWIP/ICMP conn_t.
191 * IPCL_RTSCONN indicates a RTS conn_t.
192 * IPCL_IPCCONN indicates all other connections.
194 * void ipcl_conn_destroy(connp)
196 * Destroys the connection state, removes it from the global
197 * connection hash table and frees its memory.
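 *
 *      Lifecycle sketch (illustrative only; error handling elided):
 *
 *              connp = ipcl_conn_create(IPCL_UDPCONN, KM_SLEEP, ns);
 *              ... use the conn, taking and dropping references as needed ...
 *              CONN_DEC_REF(connp);    when the last reference is dropped,
 *                                      ipcl_conn_destroy() tears the conn down
 *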
200 #include <sys/types.h>
201 #include <sys/stream.h>
202 #include <sys/stropts.h>
203 #include <sys/sysmacros.h>
204 #include <sys/strsubr.h>
205 #include <sys/strsun.h>
206 #define _SUN_TPI_VERSION 2
207 #include <sys/ddi.h>
208 #include <sys/cmn_err.h>
209 #include <sys/debug.h>
211 #include <sys/systm.h>
212 #include <sys/param.h>
213 #include <sys/kmem.h>
214 #include <sys/isa_defs.h>
215 #include <inet/common.h>
216 #include <netinet/ip6.h>
217 #include <netinet/icmp6.h>
219 #include <inet/ip.h>
220 #include <inet/ip_if.h>
221 #include <inet/ip_ire.h>
222 #include <inet/ip6.h>
223 #include <inet/ip_ndp.h>
224 #include <inet/ip_impl.h>
225 #include <inet/udp_impl.h>
226 #include <inet/sctp_ip.h>
227 #include <inet/sctp/sctp_impl.h>
228 #include <inet/rawip_impl.h>
229 #include <inet/rts_impl.h>
230 #include <inet/iptun/iptun_impl.h>
232 #include <sys/cpuvar.h>
234 #include <inet/ipclassifier.h>
235 #include <inet/tcp.h>
236 #include <inet/ipsec_impl.h>
238 #include <sys/sockio.h>
 240 /* Old value for compatibility. Settable in /etc/system */
241 uint_t tcp_conn_hash_size = 0;
 243 /* New value. Zero means choose automatically. Settable in /etc/system */
244 uint_t ipcl_conn_hash_size = 0;
245 uint_t ipcl_conn_hash_memfactor = 8192;
246 uint_t ipcl_conn_hash_maxsize = 82500;
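/*
 * For example, the conn fanout size can be pinned from /etc/system
 * (illustrative only; the right value is workload dependent):
 *
 *	set ip:ipcl_conn_hash_size = 65536
 */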
248 /* bind/udp fanout table size */
249 uint_t ipcl_bind_fanout_size = 512;
250 uint_t ipcl_udp_fanout_size = 16384;
252 /* Raw socket fanout size. Must be a power of 2. */
253 uint_t ipcl_raw_fanout_size = 256;
256 * The IPCL_IPTUN_HASH() function works best with a prime table size. We
257 * expect that most large deployments would have hundreds of tunnels, and
258 * thousands in the extreme case.
260 uint_t ipcl_iptun_fanout_size = 6143;
 263  * Primes useful for hashing, one per power of two for N of 0-28; each
 264  * entry is the nearest prime <= 2^N - 2^(N-2).
267 #define P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067, \
268 6143, 12281, 24571, 49139, 98299, 196597, 393209, \
269 786431, 1572853, 3145721, 6291449, 12582893, 25165813, \
270 50331599, 100663291, 201326557, 0}
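/*
 * For example (added for clarity): if the memory-derived estimate computed
 * in ipcl_init() below comes out to roughly 10000 buckets, the selection
 * loop picks 12281, the first entry in this table that is not smaller than
 * the estimate.
 */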
273 * wrapper structure to ensure that conn and what follows it (tcp_t, etc)
274 * are aligned on cache lines.
276 typedef union itc_s {
277 conn_t itc_conn;
278 char itcu_filler[CACHE_ALIGN(conn_s)];
279 } itc_t;
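/*
 * Layout note (added for clarity): the per-protocol caches created in
 * ipcl_g_init() allocate sizeof (itc_t) + sizeof (<proto>_t), so the
 * protocol-specific state (tcp_t, udp_t, icmp_t, rts_t) starts on a
 * cache-line boundary immediately after the padded conn_t; the constructors
 * below reach it as (<proto>_t *)&itc[1].
 */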
281 struct kmem_cache *tcp_conn_cache;
282 struct kmem_cache *ip_conn_cache;
283 extern struct kmem_cache *sctp_conn_cache;
284 struct kmem_cache *udp_conn_cache;
285 struct kmem_cache *rawip_conn_cache;
286 struct kmem_cache *rts_conn_cache;
288 extern void tcp_timermp_free(tcp_t *);
289 extern mblk_t *tcp_timermp_alloc(int);
291 static int ip_conn_constructor(void *, void *, int);
292 static void ip_conn_destructor(void *, void *);
294 static int tcp_conn_constructor(void *, void *, int);
295 static void tcp_conn_destructor(void *, void *);
297 static int udp_conn_constructor(void *, void *, int);
298 static void udp_conn_destructor(void *, void *);
300 static int rawip_conn_constructor(void *, void *, int);
301 static void rawip_conn_destructor(void *, void *);
303 static int rts_conn_constructor(void *, void *, int);
304 static void rts_conn_destructor(void *, void *);
307 * Global (for all stack instances) init routine
309 void
310 ipcl_g_init(void)
312 ip_conn_cache = kmem_cache_create("ip_conn_cache",
313 sizeof (conn_t), CACHE_ALIGN_SIZE,
314 ip_conn_constructor, ip_conn_destructor,
315 NULL, NULL, NULL, 0);
317 tcp_conn_cache = kmem_cache_create("tcp_conn_cache",
318 sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE,
319 tcp_conn_constructor, tcp_conn_destructor,
320 tcp_conn_reclaim, NULL, NULL, 0);
322 udp_conn_cache = kmem_cache_create("udp_conn_cache",
323 sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE,
324 udp_conn_constructor, udp_conn_destructor,
325 NULL, NULL, NULL, 0);
327 rawip_conn_cache = kmem_cache_create("rawip_conn_cache",
328 sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE,
329 rawip_conn_constructor, rawip_conn_destructor,
330 NULL, NULL, NULL, 0);
332 rts_conn_cache = kmem_cache_create("rts_conn_cache",
333 sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE,
334 rts_conn_constructor, rts_conn_destructor,
335 NULL, NULL, NULL, 0);
 339  * ipclassifier initialization routine, sets up hash tables.
341 void
342 ipcl_init(ip_stack_t *ipst)
344 int i;
345 int sizes[] = P2Ps();
348 * Calculate size of conn fanout table from /etc/system settings
350 if (ipcl_conn_hash_size != 0) {
351 ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size;
352 } else if (tcp_conn_hash_size != 0) {
353 ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size;
354 } else {
355 extern pgcnt_t freemem;
357 ipst->ips_ipcl_conn_fanout_size =
358 (freemem * PAGESIZE) / ipcl_conn_hash_memfactor;
360 if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) {
361 ipst->ips_ipcl_conn_fanout_size =
362 ipcl_conn_hash_maxsize;
366 for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) {
367 if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) {
368 break;
371 if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) {
372 /* Out of range, use the 2^16 value */
373 ipst->ips_ipcl_conn_fanout_size = sizes[16];
376 /* Take values from /etc/system */
377 ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size;
378 ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size;
379 ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size;
380 ipst->ips_ipcl_iptun_fanout_size = ipcl_iptun_fanout_size;
382 ASSERT(ipst->ips_ipcl_conn_fanout == NULL);
384 ipst->ips_ipcl_conn_fanout = kmem_zalloc(
385 ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP);
387 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
388 mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL,
389 MUTEX_DEFAULT, NULL);
392 ipst->ips_ipcl_bind_fanout = kmem_zalloc(
393 ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP);
395 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
396 mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL,
397 MUTEX_DEFAULT, NULL);
400 ipst->ips_ipcl_proto_fanout_v4 = kmem_zalloc(IPPROTO_MAX *
401 sizeof (connf_t), KM_SLEEP);
402 for (i = 0; i < IPPROTO_MAX; i++) {
403 mutex_init(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock, NULL,
404 MUTEX_DEFAULT, NULL);
407 ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX *
408 sizeof (connf_t), KM_SLEEP);
409 for (i = 0; i < IPPROTO_MAX; i++) {
410 mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL,
411 MUTEX_DEFAULT, NULL);
414 ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP);
415 mutex_init(&ipst->ips_rts_clients->connf_lock,
416 NULL, MUTEX_DEFAULT, NULL);
418 ipst->ips_ipcl_udp_fanout = kmem_zalloc(
419 ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP);
420 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
421 mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL,
422 MUTEX_DEFAULT, NULL);
425 ipst->ips_ipcl_iptun_fanout = kmem_zalloc(
426 ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t), KM_SLEEP);
427 for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
428 mutex_init(&ipst->ips_ipcl_iptun_fanout[i].connf_lock, NULL,
429 MUTEX_DEFAULT, NULL);
432 ipst->ips_ipcl_raw_fanout = kmem_zalloc(
433 ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP);
434 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
435 mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL,
436 MUTEX_DEFAULT, NULL);
439 ipst->ips_ipcl_globalhash_fanout = kmem_zalloc(
440 sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP);
441 for (i = 0; i < CONN_G_HASH_SIZE; i++) {
442 mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock,
443 NULL, MUTEX_DEFAULT, NULL);
447 void
448 ipcl_g_destroy(void)
450 kmem_cache_destroy(ip_conn_cache);
451 kmem_cache_destroy(tcp_conn_cache);
452 kmem_cache_destroy(udp_conn_cache);
453 kmem_cache_destroy(rawip_conn_cache);
454 kmem_cache_destroy(rts_conn_cache);
458 * All user-level and kernel use of the stack must be gone
459 * by now.
461 void
462 ipcl_destroy(ip_stack_t *ipst)
464 int i;
466 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
467 ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL);
468 mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock);
470 kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size *
471 sizeof (connf_t));
472 ipst->ips_ipcl_conn_fanout = NULL;
474 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
475 ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL);
476 mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock);
478 kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size *
479 sizeof (connf_t));
480 ipst->ips_ipcl_bind_fanout = NULL;
482 for (i = 0; i < IPPROTO_MAX; i++) {
483 ASSERT(ipst->ips_ipcl_proto_fanout_v4[i].connf_head == NULL);
484 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock);
486 kmem_free(ipst->ips_ipcl_proto_fanout_v4,
487 IPPROTO_MAX * sizeof (connf_t));
488 ipst->ips_ipcl_proto_fanout_v4 = NULL;
490 for (i = 0; i < IPPROTO_MAX; i++) {
491 ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL);
492 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock);
494 kmem_free(ipst->ips_ipcl_proto_fanout_v6,
495 IPPROTO_MAX * sizeof (connf_t));
496 ipst->ips_ipcl_proto_fanout_v6 = NULL;
498 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
499 ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL);
500 mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock);
502 kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size *
503 sizeof (connf_t));
504 ipst->ips_ipcl_udp_fanout = NULL;
506 for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
507 ASSERT(ipst->ips_ipcl_iptun_fanout[i].connf_head == NULL);
508 mutex_destroy(&ipst->ips_ipcl_iptun_fanout[i].connf_lock);
510 kmem_free(ipst->ips_ipcl_iptun_fanout,
511 ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t));
512 ipst->ips_ipcl_iptun_fanout = NULL;
514 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
515 ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL);
516 mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock);
518 kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size *
519 sizeof (connf_t));
520 ipst->ips_ipcl_raw_fanout = NULL;
522 for (i = 0; i < CONN_G_HASH_SIZE; i++) {
523 ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL);
524 mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
526 kmem_free(ipst->ips_ipcl_globalhash_fanout,
527 sizeof (connf_t) * CONN_G_HASH_SIZE);
528 ipst->ips_ipcl_globalhash_fanout = NULL;
530 ASSERT(ipst->ips_rts_clients->connf_head == NULL);
531 mutex_destroy(&ipst->ips_rts_clients->connf_lock);
532 kmem_free(ipst->ips_rts_clients, sizeof (connf_t));
533 ipst->ips_rts_clients = NULL;
 537  * conn creation routine. Initializes the conn, sets the initial reference
 538  * count and inserts it in the global hash table.
540 conn_t *
541 ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns)
543 conn_t *connp;
544 struct kmem_cache *conn_cache;
546 switch (type) {
547 case IPCL_SCTPCONN:
548 if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL)
549 return (NULL);
550 sctp_conn_init(connp);
551 netstack_hold(ns);
552 connp->conn_netstack = ns;
553 connp->conn_ixa->ixa_ipst = ns->netstack_ip;
554 connp->conn_ixa->ixa_conn_id = (long)connp;
555 ipcl_globalhash_insert(connp);
556 return (connp);
558 case IPCL_TCPCONN:
559 conn_cache = tcp_conn_cache;
560 break;
562 case IPCL_UDPCONN:
563 conn_cache = udp_conn_cache;
564 break;
566 case IPCL_RAWIPCONN:
567 conn_cache = rawip_conn_cache;
568 break;
570 case IPCL_RTSCONN:
571 conn_cache = rts_conn_cache;
572 break;
574 case IPCL_IPCCONN:
575 conn_cache = ip_conn_cache;
576 break;
578 default:
579 connp = NULL;
580 ASSERT(0);
583 if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL)
584 return (NULL);
586 connp->conn_ref = 1;
587 netstack_hold(ns);
588 connp->conn_netstack = ns;
589 connp->conn_ixa->ixa_ipst = ns->netstack_ip;
590 connp->conn_ixa->ixa_conn_id = (long)connp;
591 ipcl_globalhash_insert(connp);
592 return (connp);
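/*
 * Usage sketch for ipcl_conn_create() (illustrative only; a transport-level
 * caller would normally wrap this, and error handling is elided):
 *
 *	conn_t *connp;
 *
 *	connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP, ns);
 *	if (connp == NULL)
 *		return (ENOMEM);
 *	... set up transport state, insert into the fanout tables ...
 *	CONN_DEC_REF(connp);	drops the initial reference set above;
 *				ipcl_conn_destroy() runs when it reaches zero
 */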
595 void
596 ipcl_conn_destroy(conn_t *connp)
598 mblk_t *mp;
599 netstack_t *ns = connp->conn_netstack;
601 ASSERT(!MUTEX_HELD(&connp->conn_lock));
602 ASSERT(connp->conn_ref == 0);
603 ASSERT(connp->conn_ioctlref == 0);
605 DTRACE_PROBE1(conn__destroy, conn_t *, connp);
607 if (connp->conn_cred != NULL) {
608 crfree(connp->conn_cred);
609 connp->conn_cred = NULL;
610 /* ixa_cred done in ipcl_conn_cleanup below */
613 if (connp->conn_ht_iphc != NULL) {
614 kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
615 connp->conn_ht_iphc = NULL;
616 connp->conn_ht_iphc_allocated = 0;
617 connp->conn_ht_iphc_len = 0;
618 connp->conn_ht_ulp = NULL;
619 connp->conn_ht_ulp_len = 0;
621 ip_pkt_free(&connp->conn_xmit_ipp);
623 ipcl_globalhash_remove(connp);
625 if (connp->conn_latch != NULL) {
626 IPLATCH_REFRELE(connp->conn_latch);
627 connp->conn_latch = NULL;
629 if (connp->conn_latch_in_policy != NULL) {
630 IPPOL_REFRELE(connp->conn_latch_in_policy);
631 connp->conn_latch_in_policy = NULL;
633 if (connp->conn_latch_in_action != NULL) {
634 IPACT_REFRELE(connp->conn_latch_in_action);
635 connp->conn_latch_in_action = NULL;
637 if (connp->conn_policy != NULL) {
638 IPPH_REFRELE(connp->conn_policy, ns);
639 connp->conn_policy = NULL;
642 if (connp->conn_ipsec_opt_mp != NULL) {
643 freemsg(connp->conn_ipsec_opt_mp);
644 connp->conn_ipsec_opt_mp = NULL;
647 if (connp->conn_flags & IPCL_TCPCONN) {
648 tcp_t *tcp = connp->conn_tcp;
650 tcp_free(tcp);
651 mp = tcp->tcp_timercache;
653 tcp->tcp_tcps = NULL;
656 * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate
657 * the mblk.
659 if (tcp->tcp_rsrv_mp != NULL) {
660 freeb(tcp->tcp_rsrv_mp);
661 tcp->tcp_rsrv_mp = NULL;
662 mutex_destroy(&tcp->tcp_rsrv_mp_lock);
665 ipcl_conn_cleanup(connp);
666 connp->conn_flags = IPCL_TCPCONN;
667 if (ns != NULL) {
668 ASSERT(tcp->tcp_tcps == NULL);
669 connp->conn_netstack = NULL;
670 connp->conn_ixa->ixa_ipst = NULL;
671 netstack_rele(ns);
674 bzero(tcp, sizeof (tcp_t));
676 tcp->tcp_timercache = mp;
677 tcp->tcp_connp = connp;
678 kmem_cache_free(tcp_conn_cache, connp);
679 return;
682 if (connp->conn_flags & IPCL_SCTPCONN) {
683 ASSERT(ns != NULL);
684 sctp_free(connp);
685 return;
688 ipcl_conn_cleanup(connp);
689 if (ns != NULL) {
690 connp->conn_netstack = NULL;
691 connp->conn_ixa->ixa_ipst = NULL;
692 netstack_rele(ns);
695 /* leave conn_priv aka conn_udp, conn_icmp, etc in place. */
696 if (connp->conn_flags & IPCL_UDPCONN) {
697 connp->conn_flags = IPCL_UDPCONN;
698 kmem_cache_free(udp_conn_cache, connp);
699 } else if (connp->conn_flags & IPCL_RAWIPCONN) {
700 connp->conn_flags = IPCL_RAWIPCONN;
701 connp->conn_proto = IPPROTO_ICMP;
702 connp->conn_ixa->ixa_protocol = connp->conn_proto;
703 kmem_cache_free(rawip_conn_cache, connp);
704 } else if (connp->conn_flags & IPCL_RTSCONN) {
705 connp->conn_flags = IPCL_RTSCONN;
706 kmem_cache_free(rts_conn_cache, connp);
707 } else {
708 connp->conn_flags = IPCL_IPCCONN;
709 ASSERT(connp->conn_flags & IPCL_IPCCONN);
710 ASSERT(connp->conn_priv == NULL);
711 kmem_cache_free(ip_conn_cache, connp);
 716  * We set the IPCL_REMOVED flag (instead of clearing the flag indicating
 717  * which table the conn belonged to) so that, for debugging, we can see
 718  * which hash table this connection was in.
720 #define IPCL_HASH_REMOVE(connp) { \
721 connf_t *connfp = (connp)->conn_fanout; \
722 ASSERT(!MUTEX_HELD(&((connp)->conn_lock))); \
723 if (connfp != NULL) { \
724 mutex_enter(&connfp->connf_lock); \
725 if ((connp)->conn_next != NULL) \
726 (connp)->conn_next->conn_prev = \
727 (connp)->conn_prev; \
728 if ((connp)->conn_prev != NULL) \
729 (connp)->conn_prev->conn_next = \
730 (connp)->conn_next; \
731 else \
732 connfp->connf_head = (connp)->conn_next; \
733 (connp)->conn_fanout = NULL; \
734 (connp)->conn_next = NULL; \
735 (connp)->conn_prev = NULL; \
736 (connp)->conn_flags |= IPCL_REMOVED; \
737 CONN_DEC_REF((connp)); \
738 mutex_exit(&connfp->connf_lock); \
742 void
743 ipcl_hash_remove(conn_t *connp)
745 uint8_t protocol = connp->conn_proto;
747 IPCL_HASH_REMOVE(connp);
748 if (protocol == IPPROTO_RSVP)
749 ill_set_inputfn_all(connp->conn_netstack->netstack_ip);
 753  * The whole purpose of this function is to allow removal of
 754  * a conn_t from the connected hash for timewait reclaim.
 755  * This is essentially a TW reclaim fastpath where the timewait
 756  * collector checks under the fanout lock (so no one else can
 757  * get access to the conn_t) that the refcnt is 2, i.e. one for
 758  * TCP and one for the classifier hash list. If the ref count
759 * is indeed 2, we can just remove the conn under lock and
760 * avoid cleaning up the conn under squeue. This gives us
761 * improved performance.
763 void
764 ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp)
766 ASSERT(MUTEX_HELD(&connfp->connf_lock));
767 ASSERT(MUTEX_HELD(&connp->conn_lock));
769 if ((connp)->conn_next != NULL) {
770 (connp)->conn_next->conn_prev = (connp)->conn_prev;
772 if ((connp)->conn_prev != NULL) {
773 (connp)->conn_prev->conn_next = (connp)->conn_next;
774 } else {
775 connfp->connf_head = (connp)->conn_next;
777 (connp)->conn_fanout = NULL;
778 (connp)->conn_next = NULL;
779 (connp)->conn_prev = NULL;
780 (connp)->conn_flags |= IPCL_REMOVED;
781 ASSERT((connp)->conn_ref == 2);
782 (connp)->conn_ref--;
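/*
 * Caller-side sketch of the fastpath described above (illustrative only;
 * the real check lives in the TCP time-wait collector):
 *
 *	mutex_enter(&connfp->connf_lock);
 *	mutex_enter(&connp->conn_lock);
 *	if (connp->conn_ref == 2) {
 *		ipcl_hash_remove_locked(connp, connfp);
 *		... tear the conn down directly, no squeue needed ...
 *	}
 *	mutex_exit(&connp->conn_lock);
 *	mutex_exit(&connfp->connf_lock);
 */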
785 #define IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) { \
786 ASSERT((connp)->conn_fanout == NULL); \
787 ASSERT((connp)->conn_next == NULL); \
788 ASSERT((connp)->conn_prev == NULL); \
789 if ((connfp)->connf_head != NULL) { \
790 (connfp)->connf_head->conn_prev = (connp); \
791 (connp)->conn_next = (connfp)->connf_head; \
793 (connp)->conn_fanout = (connfp); \
794 (connfp)->connf_head = (connp); \
795 (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \
796 IPCL_CONNECTED; \
797 CONN_INC_REF(connp); \
800 #define IPCL_HASH_INSERT_CONNECTED(connfp, connp) { \
801 IPCL_HASH_REMOVE((connp)); \
802 mutex_enter(&(connfp)->connf_lock); \
803 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); \
804 mutex_exit(&(connfp)->connf_lock); \
807 #define IPCL_HASH_INSERT_BOUND(connfp, connp) { \
808 conn_t *pconnp = NULL, *nconnp; \
809 IPCL_HASH_REMOVE((connp)); \
810 mutex_enter(&(connfp)->connf_lock); \
811 nconnp = (connfp)->connf_head; \
812 while (nconnp != NULL && \
813 !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) { \
814 pconnp = nconnp; \
815 nconnp = nconnp->conn_next; \
817 if (pconnp != NULL) { \
818 pconnp->conn_next = (connp); \
819 (connp)->conn_prev = pconnp; \
820 } else { \
821 (connfp)->connf_head = (connp); \
823 if (nconnp != NULL) { \
824 (connp)->conn_next = nconnp; \
825 nconnp->conn_prev = (connp); \
827 (connp)->conn_fanout = (connfp); \
828 (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \
829 IPCL_BOUND; \
830 CONN_INC_REF(connp); \
831 mutex_exit(&(connfp)->connf_lock); \
834 #define IPCL_HASH_INSERT_WILDCARD(connfp, connp) { \
835 conn_t **list, *prev, *next; \
836 boolean_t isv4mapped = \
837 IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6); \
838 IPCL_HASH_REMOVE((connp)); \
839 mutex_enter(&(connfp)->connf_lock); \
840 list = &(connfp)->connf_head; \
841 prev = NULL; \
842 while ((next = *list) != NULL) { \
843 if (isv4mapped && \
844 IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) && \
845 connp->conn_zoneid == next->conn_zoneid) { \
846 (connp)->conn_next = next; \
847 if (prev != NULL) \
848 prev = next->conn_prev; \
849 next->conn_prev = (connp); \
850 break; \
852 list = &next->conn_next; \
853 prev = next; \
855 (connp)->conn_prev = prev; \
856 *list = (connp); \
857 (connp)->conn_fanout = (connfp); \
858 (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \
859 IPCL_BOUND; \
860 CONN_INC_REF((connp)); \
861 mutex_exit(&(connfp)->connf_lock); \
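/*
 * Note on bucket ordering (added for clarity): IPCL_HASH_INSERT_BOUND places
 * a conn bound to a specific address ahead of any IPv4 wildcard entries in
 * the chain, and IPCL_HASH_INSERT_WILDCARD places a v4-mapped wildcard ahead
 * of an unspecified-address (in6addr_any) wildcard in the same zone, so a
 * lookup walking the chain sees the more specific binding first.
 */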
864 void
865 ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
867 IPCL_HASH_INSERT_WILDCARD(connfp, connp);
871 * Because the classifier is used to classify inbound packets, the destination
872 * address is meant to be our local tunnel address (tunnel source), and the
873 * source the remote tunnel address (tunnel destination).
875 * Note that conn_proto can't be used for fanout since the upper protocol
876 * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel.
878 conn_t *
879 ipcl_iptun_classify_v4(ipaddr_t *src, ipaddr_t *dst, ip_stack_t *ipst)
881 connf_t *connfp;
882 conn_t *connp;
884 /* first look for IPv4 tunnel links */
885 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, *src)];
886 mutex_enter(&connfp->connf_lock);
887 for (connp = connfp->connf_head; connp != NULL;
888 connp = connp->conn_next) {
889 if (IPCL_IPTUN_MATCH(connp, *dst, *src))
890 break;
892 if (connp != NULL)
893 goto done;
895 mutex_exit(&connfp->connf_lock);
897 /* We didn't find an IPv4 tunnel, try a 6to4 tunnel */
898 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst,
899 INADDR_ANY)];
900 mutex_enter(&connfp->connf_lock);
901 for (connp = connfp->connf_head; connp != NULL;
902 connp = connp->conn_next) {
903 if (IPCL_IPTUN_MATCH(connp, *dst, INADDR_ANY))
904 break;
906 done:
907 if (connp != NULL)
908 CONN_INC_REF(connp);
909 mutex_exit(&connfp->connf_lock);
910 return (connp);
913 conn_t *
914 ipcl_iptun_classify_v6(in6_addr_t *src, in6_addr_t *dst, ip_stack_t *ipst)
916 connf_t *connfp;
917 conn_t *connp;
919 /* Look for an IPv6 tunnel link */
920 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(dst, src)];
921 mutex_enter(&connfp->connf_lock);
922 for (connp = connfp->connf_head; connp != NULL;
923 connp = connp->conn_next) {
924 if (IPCL_IPTUN_MATCH_V6(connp, dst, src)) {
925 CONN_INC_REF(connp);
926 break;
929 mutex_exit(&connfp->connf_lock);
930 return (connp);
 934  * This function is used only for inserting SCTP raw sockets for now.
935 * This may change later.
937 * Note that only one raw socket can be bound to a port. The param
938 * lport is in network byte order.
940 static int
941 ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
943 connf_t *connfp;
944 conn_t *oconnp;
945 ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
947 connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
949 /* Check for existing raw socket already bound to the port. */
950 mutex_enter(&connfp->connf_lock);
951 for (oconnp = connfp->connf_head; oconnp != NULL;
952 oconnp = oconnp->conn_next) {
953 if (oconnp->conn_lport == lport &&
954 oconnp->conn_zoneid == connp->conn_zoneid &&
955 oconnp->conn_family == connp->conn_family &&
956 ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
957 IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_laddr_v6) ||
958 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6) ||
959 IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_laddr_v6)) ||
960 IN6_ARE_ADDR_EQUAL(&oconnp->conn_laddr_v6,
961 &connp->conn_laddr_v6))) {
962 break;
965 mutex_exit(&connfp->connf_lock);
966 if (oconnp != NULL)
967 return (EADDRNOTAVAIL);
969 if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) ||
970 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
971 if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
972 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) {
973 IPCL_HASH_INSERT_WILDCARD(connfp, connp);
974 } else {
975 IPCL_HASH_INSERT_BOUND(connfp, connp);
977 } else {
978 IPCL_HASH_INSERT_CONNECTED(connfp, connp);
980 return (0);
983 static int
984 ipcl_iptun_hash_insert(conn_t *connp, ip_stack_t *ipst)
986 connf_t *connfp;
987 conn_t *tconnp;
988 ipaddr_t laddr = connp->conn_laddr_v4;
989 ipaddr_t faddr = connp->conn_faddr_v4;
991 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(laddr, faddr)];
992 mutex_enter(&connfp->connf_lock);
993 for (tconnp = connfp->connf_head; tconnp != NULL;
994 tconnp = tconnp->conn_next) {
995 if (IPCL_IPTUN_MATCH(tconnp, laddr, faddr)) {
996 /* A tunnel is already bound to these addresses. */
997 mutex_exit(&connfp->connf_lock);
998 return (EADDRINUSE);
1001 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1002 mutex_exit(&connfp->connf_lock);
1003 return (0);
1006 static int
1007 ipcl_iptun_hash_insert_v6(conn_t *connp, ip_stack_t *ipst)
1009 connf_t *connfp;
1010 conn_t *tconnp;
1011 in6_addr_t *laddr = &connp->conn_laddr_v6;
1012 in6_addr_t *faddr = &connp->conn_faddr_v6;
1014 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(laddr, faddr)];
1015 mutex_enter(&connfp->connf_lock);
1016 for (tconnp = connfp->connf_head; tconnp != NULL;
1017 tconnp = tconnp->conn_next) {
1018 if (IPCL_IPTUN_MATCH_V6(tconnp, laddr, faddr)) {
1019 /* A tunnel is already bound to these addresses. */
1020 mutex_exit(&connfp->connf_lock);
1021 return (EADDRINUSE);
1024 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1025 mutex_exit(&connfp->connf_lock);
1026 return (0);
1030 * (v4, v6) bind hash insertion routines
 1031  * The caller has already set up the conn (conn_proto, conn_laddr_v6, conn_lport).
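 *
 * Caller-side sketch (illustrative only; the field values are examples):
 *
 *	connp->conn_ipversion = IPV4_VERSION;
 *	connp->conn_proto = IPPROTO_UDP;
 *	connp->conn_laddr_v4 = INADDR_ANY;	wildcard local address
 *	connp->conn_lport = htons(53);
 *	error = ipcl_bind_insert(connp);
 *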
1035 ipcl_bind_insert(conn_t *connp)
1037 if (connp->conn_ipversion == IPV6_VERSION)
1038 return (ipcl_bind_insert_v6(connp));
1039 else
1040 return (ipcl_bind_insert_v4(connp));
1044 ipcl_bind_insert_v4(conn_t *connp)
1046 connf_t *connfp;
1047 int ret = 0;
1048 ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
1049 uint16_t lport = connp->conn_lport;
1050 uint8_t protocol = connp->conn_proto;
1052 if (IPCL_IS_IPTUN(connp))
1053 return (ipcl_iptun_hash_insert(connp, ipst));
1055 switch (protocol) {
1056 default:
1057 case IPPROTO_UDP:
1058 if (protocol == IPPROTO_UDP) {
1059 connfp = &ipst->ips_ipcl_udp_fanout[
1060 IPCL_UDP_HASH(lport, ipst)];
1061 } else {
1062 connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
1065 if (connp->conn_faddr_v4 != INADDR_ANY) {
1066 IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1067 } else if (connp->conn_laddr_v4 != INADDR_ANY) {
1068 IPCL_HASH_INSERT_BOUND(connfp, connp);
1069 } else {
1070 IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1072 if (protocol == IPPROTO_RSVP)
1073 ill_set_inputfn_all(ipst);
1074 break;
1076 case IPPROTO_TCP:
1077 /* Insert it in the Bind Hash */
1078 ASSERT(connp->conn_zoneid != ALL_ZONES);
1079 connfp = &ipst->ips_ipcl_bind_fanout[
1080 IPCL_BIND_HASH(lport, ipst)];
1081 if (connp->conn_laddr_v4 != INADDR_ANY) {
1082 IPCL_HASH_INSERT_BOUND(connfp, connp);
1083 } else {
1084 IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1086 break;
1088 case IPPROTO_SCTP:
1089 ret = ipcl_sctp_hash_insert(connp, lport);
1090 break;
1093 return (ret);
1097 ipcl_bind_insert_v6(conn_t *connp)
1099 connf_t *connfp;
1100 int ret = 0;
1101 ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
1102 uint16_t lport = connp->conn_lport;
1103 uint8_t protocol = connp->conn_proto;
1105 if (IPCL_IS_IPTUN(connp)) {
1106 return (ipcl_iptun_hash_insert_v6(connp, ipst));
1109 switch (protocol) {
1110 default:
1111 case IPPROTO_UDP:
1112 if (protocol == IPPROTO_UDP) {
1113 connfp = &ipst->ips_ipcl_udp_fanout[
1114 IPCL_UDP_HASH(lport, ipst)];
1115 } else {
1116 connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
1119 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
1120 IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1121 } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
1122 IPCL_HASH_INSERT_BOUND(connfp, connp);
1123 } else {
1124 IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1126 break;
1128 case IPPROTO_TCP:
1129 /* Insert it in the Bind Hash */
1130 ASSERT(connp->conn_zoneid != ALL_ZONES);
1131 connfp = &ipst->ips_ipcl_bind_fanout[
1132 IPCL_BIND_HASH(lport, ipst)];
1133 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
1134 IPCL_HASH_INSERT_BOUND(connfp, connp);
1135 } else {
1136 IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1138 break;
1140 case IPPROTO_SCTP:
1141 ret = ipcl_sctp_hash_insert(connp, lport);
1142 break;
1145 return (ret);
1149 * ipcl_conn_hash insertion routines.
1150 * The caller has already set conn_proto and the addresses/ports in the conn_t.
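 *
 * Caller-side sketch (illustrative only): a TCP connect path would fill in
 * the full 5-tuple before publishing the conn, roughly
 *
 *	connp->conn_faddr_v4 = <remote address>;
 *	connp->conn_fport = <remote port>;
 *	connp->conn_laddr_v4 = <local address>;
 *	connp->conn_lport = <local port>;
 *	if (ipcl_conn_insert(connp) != 0)
 *		... the 5-tuple is already in use (EADDRINUSE) ...
 *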
1154 ipcl_conn_insert(conn_t *connp)
1156 if (connp->conn_ipversion == IPV6_VERSION)
1157 return (ipcl_conn_insert_v6(connp));
1158 else
1159 return (ipcl_conn_insert_v4(connp));
1163 ipcl_conn_insert_v4(conn_t *connp)
1165 connf_t *connfp;
1166 conn_t *tconnp;
1167 int ret = 0;
1168 ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
1169 uint16_t lport = connp->conn_lport;
1170 uint8_t protocol = connp->conn_proto;
1172 if (IPCL_IS_IPTUN(connp))
1173 return (ipcl_iptun_hash_insert(connp, ipst));
1175 switch (protocol) {
1176 case IPPROTO_TCP:
1178 * For TCP, we check whether the connection tuple already
1179 * exists before allowing the connection to proceed. We
1180 * also allow indexing on the zoneid. This is to allow
1181 * multiple shared stack zones to have the same tcp
1182 * connection tuple. In practice this only happens for
1183 * INADDR_LOOPBACK as it's the only local address which
1184 * doesn't have to be unique.
1186 connfp = &ipst->ips_ipcl_conn_fanout[
1187 IPCL_CONN_HASH(connp->conn_faddr_v4,
1188 connp->conn_ports, ipst)];
1189 mutex_enter(&connfp->connf_lock);
1190 for (tconnp = connfp->connf_head; tconnp != NULL;
1191 tconnp = tconnp->conn_next) {
1192 if (IPCL_CONN_MATCH(tconnp, connp->conn_proto,
1193 connp->conn_faddr_v4, connp->conn_laddr_v4,
1194 connp->conn_ports) &&
1195 IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
1196 /* Already have a conn. bail out */
1197 mutex_exit(&connfp->connf_lock);
1198 return (EADDRINUSE);
1201 if (connp->conn_fanout != NULL) {
1203 * Probably a XTI/TLI application trying to do a
1204 * rebind. Let it happen.
1206 mutex_exit(&connfp->connf_lock);
1207 IPCL_HASH_REMOVE(connp);
1208 mutex_enter(&connfp->connf_lock);
1211 ASSERT(connp->conn_recv != NULL);
1212 ASSERT(connp->conn_recvicmp != NULL);
1214 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1215 mutex_exit(&connfp->connf_lock);
1216 break;
1218 case IPPROTO_SCTP:
1220 * The raw socket may have already been bound, remove it
1221 * from the hash first.
1223 IPCL_HASH_REMOVE(connp);
1224 ret = ipcl_sctp_hash_insert(connp, lport);
1225 break;
1227 default:
1228 case IPPROTO_UDP:
1229 if (protocol == IPPROTO_UDP) {
1230 connfp = &ipst->ips_ipcl_udp_fanout[
1231 IPCL_UDP_HASH(lport, ipst)];
1232 } else {
1233 connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
1236 if (connp->conn_faddr_v4 != INADDR_ANY) {
1237 IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1238 } else if (connp->conn_laddr_v4 != INADDR_ANY) {
1239 IPCL_HASH_INSERT_BOUND(connfp, connp);
1240 } else {
1241 IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1243 break;
1246 return (ret);
1250 ipcl_conn_insert_v6(conn_t *connp)
1252 connf_t *connfp;
1253 conn_t *tconnp;
1254 int ret = 0;
1255 ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
1256 uint16_t lport = connp->conn_lport;
1257 uint8_t protocol = connp->conn_proto;
1258 uint_t ifindex = connp->conn_bound_if;
1260 if (IPCL_IS_IPTUN(connp))
1261 return (ipcl_iptun_hash_insert_v6(connp, ipst));
1263 switch (protocol) {
1264 case IPPROTO_TCP:
1267 * For tcp, we check whether the connection tuple already
1268 * exists before allowing the connection to proceed. We
1269 * also allow indexing on the zoneid. This is to allow
1270 * multiple shared stack zones to have the same tcp
1271 * connection tuple. In practice this only happens for
1272 * ipv6_loopback as it's the only local address which
1273 * doesn't have to be unique.
1275 connfp = &ipst->ips_ipcl_conn_fanout[
1276 IPCL_CONN_HASH_V6(connp->conn_faddr_v6, connp->conn_ports,
1277 ipst)];
1278 mutex_enter(&connfp->connf_lock);
1279 for (tconnp = connfp->connf_head; tconnp != NULL;
1280 tconnp = tconnp->conn_next) {
1281 /* NOTE: need to match zoneid. Bug in onnv-gate */
1282 if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_proto,
1283 connp->conn_faddr_v6, connp->conn_laddr_v6,
1284 connp->conn_ports) &&
1285 (tconnp->conn_bound_if == 0 ||
1286 tconnp->conn_bound_if == ifindex) &&
1287 IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
1288 /* Already have a conn. bail out */
1289 mutex_exit(&connfp->connf_lock);
1290 return (EADDRINUSE);
1293 if (connp->conn_fanout != NULL) {
1295 * Probably a XTI/TLI application trying to do a
1296 * rebind. Let it happen.
1298 mutex_exit(&connfp->connf_lock);
1299 IPCL_HASH_REMOVE(connp);
1300 mutex_enter(&connfp->connf_lock);
1302 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1303 mutex_exit(&connfp->connf_lock);
1304 break;
1306 case IPPROTO_SCTP:
1307 IPCL_HASH_REMOVE(connp);
1308 ret = ipcl_sctp_hash_insert(connp, lport);
1309 break;
1311 default:
1312 case IPPROTO_UDP:
1313 if (protocol == IPPROTO_UDP) {
1314 connfp = &ipst->ips_ipcl_udp_fanout[
1315 IPCL_UDP_HASH(lport, ipst)];
1316 } else {
1317 connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
1320 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
1321 IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1322 } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
1323 IPCL_HASH_INSERT_BOUND(connfp, connp);
1324 } else {
1325 IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1327 break;
1330 return (ret);
 1334  * IPv4 packet classifying function. Looks up the fanout tables to
 1335  * find the conn that the packet belongs to. Returns the conn with
 1336  * a reference held, NULL otherwise.
1338 * If zoneid is ALL_ZONES, then the search rules described in the "Connection
1339 * Lookup" comment block are applied. Labels are also checked as described
1340 * above.
1342 conn_t *
1343 ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
1344 ip_recv_attr_t *ira, ip_stack_t *ipst)
1346 ipha_t *ipha;
1347 connf_t *connfp, *bind_connfp;
1348 uint16_t lport;
1349 uint16_t fport;
1350 uint32_t ports;
1351 conn_t *connp;
1352 uint16_t *up;
1353 zoneid_t zoneid = ira->ira_zoneid;
1355 ipha = (ipha_t *)mp->b_rptr;
1356 up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);
1358 switch (protocol) {
1359 case IPPROTO_TCP:
1360 ports = *(uint32_t *)up;
1361 connfp =
1362 &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src,
1363 ports, ipst)];
1364 mutex_enter(&connfp->connf_lock);
1365 for (connp = connfp->connf_head; connp != NULL;
1366 connp = connp->conn_next) {
1367 if (IPCL_CONN_MATCH(connp, protocol,
1368 ipha->ipha_src, ipha->ipha_dst, ports) &&
1369 (connp->conn_zoneid == zoneid ||
1370 connp->conn_allzones))
1371 break;
1374 if (connp != NULL) {
1375 /* We have a fully-bound TCP connection. */
1376 CONN_INC_REF(connp);
1377 mutex_exit(&connfp->connf_lock);
1378 return (connp);
1381 mutex_exit(&connfp->connf_lock);
1382 lport = up[1];
1383 bind_connfp =
1384 &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
1385 mutex_enter(&bind_connfp->connf_lock);
1386 for (connp = bind_connfp->connf_head; connp != NULL;
1387 connp = connp->conn_next) {
1388 if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
1389 lport) &&
1390 (connp->conn_zoneid == zoneid ||
1391 connp->conn_allzones))
1392 break;
1395 if (connp != NULL) {
1396 /* Have a listener at least */
1397 CONN_INC_REF(connp);
1398 mutex_exit(&bind_connfp->connf_lock);
1399 return (connp);
1402 mutex_exit(&bind_connfp->connf_lock);
1403 break;
1405 case IPPROTO_UDP:
1406 lport = up[1];
1407 fport = up[0];
1408 connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
1409 mutex_enter(&connfp->connf_lock);
1410 for (connp = connfp->connf_head; connp != NULL;
1411 connp = connp->conn_next) {
1412 if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
1413 fport, ipha->ipha_src) &&
1414 (connp->conn_zoneid == zoneid ||
1415 connp->conn_allzones))
1416 break;
1419 if (connp != NULL) {
1420 CONN_INC_REF(connp);
1421 mutex_exit(&connfp->connf_lock);
1422 return (connp);
1426 * We shouldn't come here for multicast/broadcast packets
1428 mutex_exit(&connfp->connf_lock);
1430 break;
1432 case IPPROTO_ENCAP:
1433 case IPPROTO_IPV6:
1434 return (ipcl_iptun_classify_v4(&ipha->ipha_src,
1435 &ipha->ipha_dst, ipst));
1438 return (NULL);
1441 conn_t *
1442 ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
1443 ip_recv_attr_t *ira, ip_stack_t *ipst)
1445 ip6_t *ip6h;
1446 connf_t *connfp, *bind_connfp;
1447 uint16_t lport;
1448 uint16_t fport;
1449 tcpha_t *tcpha;
1450 uint32_t ports;
1451 conn_t *connp;
1452 uint16_t *up;
1453 zoneid_t zoneid = ira->ira_zoneid;
1455 ip6h = (ip6_t *)mp->b_rptr;
1457 switch (protocol) {
1458 case IPPROTO_TCP:
1459 tcpha = (tcpha_t *)&mp->b_rptr[hdr_len];
1460 up = &tcpha->tha_lport;
1461 ports = *(uint32_t *)up;
1463 connfp =
1464 &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src,
1465 ports, ipst)];
1466 mutex_enter(&connfp->connf_lock);
1467 for (connp = connfp->connf_head; connp != NULL;
1468 connp = connp->conn_next) {
1469 if (IPCL_CONN_MATCH_V6(connp, protocol,
1470 ip6h->ip6_src, ip6h->ip6_dst, ports) &&
1471 (connp->conn_zoneid == zoneid ||
1472 connp->conn_allzones))
1473 break;
1476 if (connp != NULL) {
1477 /* We have a fully-bound TCP connection. */
1478 CONN_INC_REF(connp);
1479 mutex_exit(&connfp->connf_lock);
1480 return (connp);
1483 mutex_exit(&connfp->connf_lock);
1485 lport = up[1];
1486 bind_connfp =
1487 &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
1488 mutex_enter(&bind_connfp->connf_lock);
1489 for (connp = bind_connfp->connf_head; connp != NULL;
1490 connp = connp->conn_next) {
1491 if (IPCL_BIND_MATCH_V6(connp, protocol,
1492 ip6h->ip6_dst, lport) &&
1493 (connp->conn_zoneid == zoneid ||
1494 connp->conn_allzones))
1495 break;
1498 if (connp != NULL) {
 1499 			/* Have a listener at least */
1500 CONN_INC_REF(connp);
1501 mutex_exit(&bind_connfp->connf_lock);
1502 return (connp);
1505 mutex_exit(&bind_connfp->connf_lock);
1506 break;
1508 case IPPROTO_UDP:
1509 up = (uint16_t *)&mp->b_rptr[hdr_len];
1510 lport = up[1];
1511 fport = up[0];
1512 connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
1513 mutex_enter(&connfp->connf_lock);
1514 for (connp = connfp->connf_head; connp != NULL;
1515 connp = connp->conn_next) {
1516 if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
1517 fport, ip6h->ip6_src) &&
1518 (connp->conn_zoneid == zoneid ||
1519 connp->conn_allzones))
1520 break;
1523 if (connp != NULL) {
1524 CONN_INC_REF(connp);
1525 mutex_exit(&connfp->connf_lock);
1526 return (connp);
1530 * We shouldn't come here for multicast/broadcast packets
1532 mutex_exit(&connfp->connf_lock);
1533 break;
1534 case IPPROTO_ENCAP:
1535 case IPPROTO_IPV6:
1536 return (ipcl_iptun_classify_v6(&ip6h->ip6_src,
1537 &ip6h->ip6_dst, ipst));
1540 return (NULL);
1544 * wrapper around ipcl_classify_(v4,v6) routines.
1546 conn_t *
1547 ipcl_classify(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst)
1549 if (ira->ira_flags & IRAF_IS_IPV4) {
1550 return (ipcl_classify_v4(mp, ira->ira_protocol,
1551 ira->ira_ip_hdr_length, ira, ipst));
1552 } else {
1553 return (ipcl_classify_v6(mp, ira->ira_protocol,
1554 ira->ira_ip_hdr_length, ira, ipst));
1559 * Only used to classify SCTP RAW sockets
1561 conn_t *
1562 ipcl_classify_raw(mblk_t *mp, uint8_t protocol, uint32_t ports,
1563 ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira, ip_stack_t *ipst)
1565 connf_t *connfp;
1566 conn_t *connp;
1567 in_port_t lport;
1568 int ipversion;
1569 const void *dst;
1570 zoneid_t zoneid = ira->ira_zoneid;
1572 lport = ((uint16_t *)&ports)[1];
1573 if (ira->ira_flags & IRAF_IS_IPV4) {
1574 dst = (const void *)&ipha->ipha_dst;
1575 ipversion = IPV4_VERSION;
1576 } else {
1577 dst = (const void *)&ip6h->ip6_dst;
1578 ipversion = IPV6_VERSION;
1581 connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
1582 mutex_enter(&connfp->connf_lock);
1583 for (connp = connfp->connf_head; connp != NULL;
1584 connp = connp->conn_next) {
1585 /* We don't allow v4 fallback for v6 raw socket. */
1586 if (ipversion != connp->conn_ipversion)
1587 continue;
1588 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1589 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1590 if (ipversion == IPV4_VERSION) {
1591 if (!IPCL_CONN_MATCH(connp, protocol,
1592 ipha->ipha_src, ipha->ipha_dst, ports))
1593 continue;
1594 } else {
1595 if (!IPCL_CONN_MATCH_V6(connp, protocol,
1596 ip6h->ip6_src, ip6h->ip6_dst, ports))
1597 continue;
1599 } else {
1600 if (ipversion == IPV4_VERSION) {
1601 if (!IPCL_BIND_MATCH(connp, protocol,
1602 ipha->ipha_dst, lport))
1603 continue;
1604 } else {
1605 if (!IPCL_BIND_MATCH_V6(connp, protocol,
1606 ip6h->ip6_dst, lport))
1607 continue;
1611 if (connp->conn_zoneid == zoneid || connp->conn_allzones)
1612 break;
1615 if (connp != NULL)
1616 goto found;
1617 mutex_exit(&connfp->connf_lock);
1619 /* Try to look for a wildcard SCTP RAW socket match. */
1620 connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)];
1621 mutex_enter(&connfp->connf_lock);
1622 for (connp = connfp->connf_head; connp != NULL;
1623 connp = connp->conn_next) {
1624 /* We don't allow v4 fallback for v6 raw socket. */
1625 if (ipversion != connp->conn_ipversion)
1626 continue;
1627 if (!IPCL_ZONE_MATCH(connp, zoneid))
1628 continue;
1630 if (ipversion == IPV4_VERSION) {
1631 if (IPCL_RAW_MATCH(connp, protocol, ipha->ipha_dst))
1632 break;
1633 } else {
1634 if (IPCL_RAW_MATCH_V6(connp, protocol, ip6h->ip6_dst)) {
1635 break;
1640 if (connp != NULL)
1641 goto found;
1643 mutex_exit(&connfp->connf_lock);
1644 return (NULL);
1646 found:
1647 ASSERT(connp != NULL);
1648 CONN_INC_REF(connp);
1649 mutex_exit(&connfp->connf_lock);
1650 return (connp);
1653 /* ARGSUSED */
1654 static int
1655 tcp_conn_constructor(void *buf, void *cdrarg, int kmflags)
1657 itc_t *itc = (itc_t *)buf;
1658 conn_t *connp = &itc->itc_conn;
1659 tcp_t *tcp = (tcp_t *)&itc[1];
1661 bzero(connp, sizeof (conn_t));
1662 bzero(tcp, sizeof (tcp_t));
1664 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
1665 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
1666 cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL);
1667 tcp->tcp_timercache = tcp_timermp_alloc(kmflags);
1668 if (tcp->tcp_timercache == NULL)
1669 return (ENOMEM);
1670 connp->conn_tcp = tcp;
1671 connp->conn_flags = IPCL_TCPCONN;
1672 connp->conn_proto = IPPROTO_TCP;
1673 tcp->tcp_connp = connp;
1674 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
1676 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
1677 if (connp->conn_ixa == NULL) {
1678 tcp_timermp_free(tcp);
1679 return (ENOMEM);
1681 connp->conn_ixa->ixa_refcnt = 1;
1682 connp->conn_ixa->ixa_protocol = connp->conn_proto;
1683 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
1684 return (0);
1687 /* ARGSUSED */
1688 static void
1689 tcp_conn_destructor(void *buf, void *cdrarg)
1691 itc_t *itc = (itc_t *)buf;
1692 conn_t *connp = &itc->itc_conn;
1693 tcp_t *tcp = (tcp_t *)&itc[1];
1695 ASSERT(connp->conn_flags & IPCL_TCPCONN);
1696 ASSERT(tcp->tcp_connp == connp);
1697 ASSERT(connp->conn_tcp == tcp);
1698 tcp_timermp_free(tcp);
1699 mutex_destroy(&connp->conn_lock);
1700 cv_destroy(&connp->conn_cv);
1701 cv_destroy(&connp->conn_sq_cv);
1702 rw_destroy(&connp->conn_ilg_lock);
1704 /* Can be NULL if constructor failed */
1705 if (connp->conn_ixa != NULL) {
1706 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
1707 ASSERT(connp->conn_ixa->ixa_ire == NULL);
1708 ASSERT(connp->conn_ixa->ixa_nce == NULL);
1709 ixa_refrele(connp->conn_ixa);
1713 /* ARGSUSED */
1714 static int
1715 ip_conn_constructor(void *buf, void *cdrarg, int kmflags)
1717 itc_t *itc = (itc_t *)buf;
1718 conn_t *connp = &itc->itc_conn;
1720 bzero(connp, sizeof (conn_t));
1721 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
1722 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
1723 connp->conn_flags = IPCL_IPCCONN;
1724 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
1726 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
1727 if (connp->conn_ixa == NULL)
1728 return (ENOMEM);
1729 connp->conn_ixa->ixa_refcnt = 1;
1730 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
1731 return (0);
1734 /* ARGSUSED */
1735 static void
1736 ip_conn_destructor(void *buf, void *cdrarg)
1738 itc_t *itc = (itc_t *)buf;
1739 conn_t *connp = &itc->itc_conn;
1741 ASSERT(connp->conn_flags & IPCL_IPCCONN);
1742 ASSERT(connp->conn_priv == NULL);
1743 mutex_destroy(&connp->conn_lock);
1744 cv_destroy(&connp->conn_cv);
1745 rw_destroy(&connp->conn_ilg_lock);
1747 /* Can be NULL if constructor failed */
1748 if (connp->conn_ixa != NULL) {
1749 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
1750 ASSERT(connp->conn_ixa->ixa_ire == NULL);
1751 ASSERT(connp->conn_ixa->ixa_nce == NULL);
1752 ixa_refrele(connp->conn_ixa);
1756 /* ARGSUSED */
1757 static int
1758 udp_conn_constructor(void *buf, void *cdrarg, int kmflags)
1760 itc_t *itc = (itc_t *)buf;
1761 conn_t *connp = &itc->itc_conn;
1762 udp_t *udp = (udp_t *)&itc[1];
1764 bzero(connp, sizeof (conn_t));
1765 bzero(udp, sizeof (udp_t));
1767 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
1768 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
1769 connp->conn_udp = udp;
1770 connp->conn_flags = IPCL_UDPCONN;
1771 connp->conn_proto = IPPROTO_UDP;
1772 udp->udp_connp = connp;
1773 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
1774 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
1775 if (connp->conn_ixa == NULL)
1776 return (ENOMEM);
1777 connp->conn_ixa->ixa_refcnt = 1;
1778 connp->conn_ixa->ixa_protocol = connp->conn_proto;
1779 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
1780 return (0);
1783 /* ARGSUSED */
1784 static void
1785 udp_conn_destructor(void *buf, void *cdrarg)
1787 itc_t *itc = (itc_t *)buf;
1788 conn_t *connp = &itc->itc_conn;
1789 udp_t *udp = (udp_t *)&itc[1];
1791 ASSERT(connp->conn_flags & IPCL_UDPCONN);
1792 ASSERT(udp->udp_connp == connp);
1793 ASSERT(connp->conn_udp == udp);
1794 mutex_destroy(&connp->conn_lock);
1795 cv_destroy(&connp->conn_cv);
1796 rw_destroy(&connp->conn_ilg_lock);
1798 /* Can be NULL if constructor failed */
1799 if (connp->conn_ixa != NULL) {
1800 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
1801 ASSERT(connp->conn_ixa->ixa_ire == NULL);
1802 ASSERT(connp->conn_ixa->ixa_nce == NULL);
1803 ixa_refrele(connp->conn_ixa);
1807 /* ARGSUSED */
1808 static int
1809 rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
1811 itc_t *itc = (itc_t *)buf;
1812 conn_t *connp = &itc->itc_conn;
1813 icmp_t *icmp = (icmp_t *)&itc[1];
1815 bzero(connp, sizeof (conn_t));
1816 bzero(icmp, sizeof (icmp_t));
1818 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
1819 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
1820 connp->conn_icmp = icmp;
1821 connp->conn_flags = IPCL_RAWIPCONN;
1822 connp->conn_proto = IPPROTO_ICMP;
1823 icmp->icmp_connp = connp;
1824 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
1825 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
1826 if (connp->conn_ixa == NULL)
1827 return (ENOMEM);
1828 connp->conn_ixa->ixa_refcnt = 1;
1829 connp->conn_ixa->ixa_protocol = connp->conn_proto;
1830 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
1831 return (0);
1834 /* ARGSUSED */
1835 static void
1836 rawip_conn_destructor(void *buf, void *cdrarg)
1838 itc_t *itc = (itc_t *)buf;
1839 conn_t *connp = &itc->itc_conn;
1840 icmp_t *icmp = (icmp_t *)&itc[1];
1842 ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
1843 ASSERT(icmp->icmp_connp == connp);
1844 ASSERT(connp->conn_icmp == icmp);
1845 mutex_destroy(&connp->conn_lock);
1846 cv_destroy(&connp->conn_cv);
1847 rw_destroy(&connp->conn_ilg_lock);
1849 /* Can be NULL if constructor failed */
1850 if (connp->conn_ixa != NULL) {
1851 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
1852 ASSERT(connp->conn_ixa->ixa_ire == NULL);
1853 ASSERT(connp->conn_ixa->ixa_nce == NULL);
1854 ixa_refrele(connp->conn_ixa);
1858 /* ARGSUSED */
1859 static int
1860 rts_conn_constructor(void *buf, void *cdrarg, int kmflags)
1862 itc_t *itc = (itc_t *)buf;
1863 conn_t *connp = &itc->itc_conn;
1864 rts_t *rts = (rts_t *)&itc[1];
1866 bzero(connp, sizeof (conn_t));
1867 bzero(rts, sizeof (rts_t));
1869 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
1870 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
1871 connp->conn_rts = rts;
1872 connp->conn_flags = IPCL_RTSCONN;
1873 rts->rts_connp = connp;
1874 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
1875 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
1876 if (connp->conn_ixa == NULL)
1877 return (ENOMEM);
1878 connp->conn_ixa->ixa_refcnt = 1;
1879 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
1880 return (0);
1883 /* ARGSUSED */
1884 static void
1885 rts_conn_destructor(void *buf, void *cdrarg)
1887 itc_t *itc = (itc_t *)buf;
1888 conn_t *connp = &itc->itc_conn;
1889 rts_t *rts = (rts_t *)&itc[1];
1891 ASSERT(connp->conn_flags & IPCL_RTSCONN);
1892 ASSERT(rts->rts_connp == connp);
1893 ASSERT(connp->conn_rts == rts);
1894 mutex_destroy(&connp->conn_lock);
1895 cv_destroy(&connp->conn_cv);
1896 rw_destroy(&connp->conn_ilg_lock);
1898 /* Can be NULL if constructor failed */
1899 if (connp->conn_ixa != NULL) {
1900 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
1901 ASSERT(connp->conn_ixa->ixa_ire == NULL);
1902 ASSERT(connp->conn_ixa->ixa_nce == NULL);
1903 ixa_refrele(connp->conn_ixa);
1908 * Called as part of ipcl_conn_destroy to assert and clear any pointers
1909 * in the conn_t.
1911 * Below we list all the pointers in the conn_t as a documentation aid.
 1912  * The ones that we cannot ASSERT to be NULL are #ifdef'ed out.
 1913  * If you add any pointers to the conn_t, please add an ASSERT here
 1914  * and #ifdef it out if it can't actually be asserted to be NULL.
1915 * In any case, we bzero most of the conn_t at the end of the function.
1917 void
1918 ipcl_conn_cleanup(conn_t *connp)
1920 ip_xmit_attr_t *ixa;
1922 ASSERT(connp->conn_latch == NULL);
1923 ASSERT(connp->conn_latch_in_policy == NULL);
1924 ASSERT(connp->conn_latch_in_action == NULL);
1925 #ifdef notdef
1926 ASSERT(connp->conn_rq == NULL);
1927 ASSERT(connp->conn_wq == NULL);
1928 #endif
1929 ASSERT(connp->conn_cred == NULL);
1930 ASSERT(connp->conn_g_fanout == NULL);
1931 ASSERT(connp->conn_g_next == NULL);
1932 ASSERT(connp->conn_g_prev == NULL);
1933 ASSERT(connp->conn_policy == NULL);
1934 ASSERT(connp->conn_fanout == NULL);
1935 ASSERT(connp->conn_next == NULL);
1936 ASSERT(connp->conn_prev == NULL);
1937 ASSERT(connp->conn_oper_pending_ill == NULL);
1938 ASSERT(connp->conn_ilg == NULL);
1939 ASSERT(connp->conn_drain_next == NULL);
1940 ASSERT(connp->conn_drain_prev == NULL);
1941 #ifdef notdef
1942 /* conn_idl is not cleared when removed from idl list */
1943 ASSERT(connp->conn_idl == NULL);
1944 #endif
1945 ASSERT(connp->conn_ipsec_opt_mp == NULL);
1946 #ifdef notdef
1947 /* conn_netstack is cleared by the caller; needed by ixa_cleanup */
1948 ASSERT(connp->conn_netstack == NULL);
1949 #endif
1951 ASSERT(connp->conn_helper_info == NULL);
1952 ASSERT(connp->conn_ixa != NULL);
1953 ixa = connp->conn_ixa;
1954 ASSERT(ixa->ixa_refcnt == 1);
1955 /* Need to preserve ixa_protocol */
1956 ixa_cleanup(ixa);
1957 ixa->ixa_flags = 0;
1959 /* Clear out the conn_t fields that are not preserved */
1960 bzero(&connp->conn_start_clr,
1961 sizeof (conn_t) -
1962 ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp));
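
/*
 * Illustrative sketch (not part of the original source): the partial bzero
 * above uses the "clear from a marker field to the end of the structure"
 * idiom.  The toy structure, field names, and helper below are assumptions
 * used purely to illustrate that idiom; they are not real conn_t layout.
 */
#ifdef notdef
struct example_state {
	kmutex_t	ex_lock;	/* preserved across reuse */
	uint32_t	ex_start_clr;	/* marker: clearing starts here */
	uint32_t	ex_counter;	/* cleared */
};

static void
example_state_cleanup(struct example_state *exp)
{
	/* Zero everything from the marker field to the end of the struct */
	bzero(&exp->ex_start_clr, sizeof (struct example_state) -
	    ((uchar_t *)&exp->ex_start_clr - (uchar_t *)exp));
}
#endif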

/*
 * All conns are inserted in a global multi-list for the benefit of
 * walkers. The walk is guaranteed to walk all open conns at the time
 * of the start of the walk exactly once. This property is needed to
 * achieve some cleanups during unplumb of interfaces. This is achieved
 * as follows.
 *
 * ipcl_conn_create and ipcl_conn_destroy are the only functions that
 * call the insert and delete functions below at creation and deletion
 * time respectively. The conn never moves or changes its position in this
 * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt
 * won't increase due to walkers, once the conn deletion has started. Note
 * that we can't remove the conn from the global list and then wait for
 * the refcnt to drop to zero, since walkers would then see a truncated
 * list. CONN_INCIPIENT ensures that walkers don't start looking at
 * conns until ip_open is ready to make them globally visible.
 * The global round-robin multi-list locks are held only to get the
 * next member/insertion/deletion, and contention should be negligible
 * if the number of lists is much greater than the number of CPUs.
 */
void
ipcl_globalhash_insert(conn_t *connp)
{
	int	index;
	struct connf_s	*connfp;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	/*
	 * No need for atomic here. Approximate even distribution
	 * in the global lists is sufficient.
	 */
	ipst->ips_conn_g_index++;
	index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1);

	connp->conn_g_prev = NULL;
	/*
	 * Mark as INCIPIENT, so that walkers will ignore this
	 * for now, till ip_open is ready to make it visible globally.
	 */
	connp->conn_state_flags |= CONN_INCIPIENT;

	connfp = &ipst->ips_ipcl_globalhash_fanout[index];
	/* Insert at the head of the list */
	mutex_enter(&connfp->connf_lock);
	connp->conn_g_next = connfp->connf_head;
	if (connp->conn_g_next != NULL)
		connp->conn_g_next->conn_g_prev = connp;
	connfp->connf_head = connp;

	/* The fanout bucket this conn points to */
	connp->conn_g_fanout = connfp;

	mutex_exit(&connfp->connf_lock);
}

void
ipcl_globalhash_remove(conn_t *connp)
{
	struct connf_s	*connfp;

	/*
	 * If we were never inserted in the global multi-list, there is
	 * nothing to remove.  The IPCL_NONE variety is never inserted,
	 * since it is presumed to not need any cleanup and is transient.
	 */
	if (connp->conn_g_fanout == NULL)
		return;

	connfp = connp->conn_g_fanout;
	mutex_enter(&connfp->connf_lock);
	if (connp->conn_g_prev != NULL)
		connp->conn_g_prev->conn_g_next = connp->conn_g_next;
	else
		connfp->connf_head = connp->conn_g_next;
	if (connp->conn_g_next != NULL)
		connp->conn_g_next->conn_g_prev = connp->conn_g_prev;
	mutex_exit(&connfp->connf_lock);

	/* Better to stumble on a null pointer than to corrupt memory */
	connp->conn_g_next = NULL;
	connp->conn_g_prev = NULL;
	connp->conn_g_fanout = NULL;
}

/*
 * Walk the list of all conn_t's in the system, calling the function
 * provided with the specified argument for each.
 * Applies to both IPv4 and IPv6.
 *
 * CONNs may hold pointers to ills (conn_dhcpinit_ill and
 * conn_oper_pending_ill). To guard against stale pointers, ipcl_walk()
 * is called to clean up the conn_t's, typically when an interface is
 * unplumbed or removed. New conn_t's that are created while we are walking
 * may be missed by this walk, because they are not necessarily inserted
 * at the tail of the list. They are new conn_t's and thus don't have any
 * stale pointers. The CONN_CLOSING flag ensures that no new reference
 * is created to the struct that is going away.
 */
void
ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst)
{
	int	i;
	conn_t	*connp;
	conn_t	*prev_connp;

	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
		mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
		prev_connp = NULL;
		connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head;
		while (connp != NULL) {
			mutex_enter(&connp->conn_lock);
			if (connp->conn_state_flags &
			    (CONN_CONDEMNED | CONN_INCIPIENT)) {
				mutex_exit(&connp->conn_lock);
				connp = connp->conn_g_next;
				continue;
			}
			CONN_INC_REF_LOCKED(connp);
			mutex_exit(&connp->conn_lock);
			mutex_exit(
			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
			(*func)(connp, arg);
			/*
			 * The reference on the previously visited conn is
			 * dropped only while the bucket lock is not held,
			 * since the final release may need to unlink the
			 * conn from this very list.
			 */
			if (prev_connp != NULL)
				CONN_DEC_REF(prev_connp);
			mutex_enter(
			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
			prev_connp = connp;
			connp = connp->conn_g_next;
		}
		mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
		if (prev_connp != NULL)
			CONN_DEC_REF(prev_connp);
	}
}
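
/*
 * Illustrative sketch (not part of the original source): a hypothetical
 * callback as it might be passed to ipcl_walk(), possibly via a cast to
 * pfv_t.  The function name is an assumption; only the (conn_t *, void *)
 * calling convention is taken from the (*func)(connp, arg) call above.
 */
#ifdef notdef
static void
example_conn_walker(conn_t *connp, void *arg)
{
	/*
	 * ipcl_walk() holds a reference on connp for the duration of
	 * this call; do not cache the pointer beyond it without taking
	 * an additional reference.
	 */
}
#endif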

/*
 * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
 * the {src, dst, lport, fport} quadruplet. Returns with conn reference
 * held; caller must call CONN_DEC_REF. Only checks for connected entries
 * (peer tcp in ESTABLISHED state).
 */
conn_t *
ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcpha_t *tcpha,
    ip_stack_t *ipst)
{
	uint32_t ports;
	uint16_t *pports = (uint16_t *)&ports;
	connf_t	*connfp;
	conn_t	*tconnp;
	boolean_t zone_chk;

	/*
	 * If either the source or destination address is loopback, then
	 * both endpoints must be in the same Zone. Otherwise, both of
	 * the addresses are system-wide unique (tcp is in ESTABLISHED
	 * state) and the endpoints may reside in different Zones.
	 */
	zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
	    ipha->ipha_dst == htonl(INADDR_LOOPBACK));

	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
		    ipha->ipha_dst, ipha->ipha_src, ports) &&
		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {

			ASSERT(tconnp != connp);
			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}
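
/*
 * Illustrative sketch (not part of the original source): the lookups above
 * and below build their 32-bit "ports" key by aliasing it as two uint16_t
 * slots, foreign port first, local port second.  The helper below merely
 * restates that packing; its name is an assumption for illustration only.
 */
#ifdef notdef
static uint32_t
example_pack_ports(in_port_t fport, in_port_t lport)
{
	uint32_t ports;
	uint16_t *pports = (uint16_t *)&ports;

	pports[0] = fport;	/* foreign (remote) port */
	pports[1] = lport;	/* local port */
	return (ports);
}
#endif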

/*
 * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
 * the {src, dst, lport, fport} quadruplet. Returns with conn reference
 * held; caller must call CONN_DEC_REF. Only checks for connected entries
 * (peer tcp in ESTABLISHED state).
 */
conn_t *
ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcpha_t *tcpha,
    ip_stack_t *ipst)
{
	uint32_t ports;
	uint16_t *pports = (uint16_t *)&ports;
	connf_t	*connfp;
	conn_t	*tconnp;
	boolean_t zone_chk;

	/*
	 * If either the source or destination address is loopback, then
	 * both endpoints must be in the same Zone. Otherwise, both of
	 * the addresses are system-wide unique (tcp is in ESTABLISHED
	 * state) and the endpoints may reside in different Zones. We
	 * don't do a Zone check for link local address(es) because the
	 * current Zone implementation treats each link local address as
	 * being unique per system node, i.e., they belong to the global Zone.
	 */
	zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
	    IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));

	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		/* We skip conn_bound_if check here as this is loopback tcp */
		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {

			ASSERT(tconnp != connp);
			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}

/*
 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
 * Returns with conn reference held. Caller must call CONN_DEC_REF.
 * Only checks for connected entries i.e. no INADDR_ANY checks.
 */
conn_t *
ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcpha_t *tcpha, int min_state,
    ip_stack_t *ipst)
{
	uint32_t ports;
	uint16_t *pports;
	connf_t	*connfp;
	conn_t	*tconnp;

	pports = (uint16_t *)&ports;
	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
		    ipha->ipha_dst, ipha->ipha_src, ports) &&
		    tconnp->conn_tcp->tcp_state >= min_state) {

			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}

/*
 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
 * Returns with conn reference held. Caller must call CONN_DEC_REF.
 * Only checks for connected entries i.e. no INADDR_ANY checks.
 * Match on ifindex in addition to addresses.
 */
conn_t *
ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
    uint_t ifindex, ip_stack_t *ipst)
{
	tcp_t	*tcp;
	uint32_t ports;
	uint16_t *pports;
	connf_t	*connfp;
	conn_t	*tconnp;

	pports = (uint16_t *)&ports;
	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		tcp = tconnp->conn_tcp;
		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
		    tcp->tcp_state >= min_state &&
		    (tconnp->conn_bound_if == 0 ||
		    tconnp->conn_bound_if == ifindex)) {

			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}

/*
 * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate
 * a listener when changing state.
 */
conn_t *
ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	connf_t		*bind_connfp;
	conn_t		*connp;
	tcp_t		*tcp;

	/*
	 * Avoid false matches for packets sent to an IP destination of
	 * all zeros.
	 */
	if (laddr == 0)
		return (NULL);

	ASSERT(zoneid != ALL_ZONES);

	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
	mutex_enter(&bind_connfp->connf_lock);
	for (connp = bind_connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		tcp = connp->conn_tcp;
		if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
		    IPCL_ZONE_MATCH(connp, zoneid) &&
		    (tcp->tcp_listener == NULL)) {
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}
	}
	mutex_exit(&bind_connfp->connf_lock);
	return (NULL);
}

/*
 * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate
 * a listener when changing state.
 */
conn_t *
ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
    zoneid_t zoneid, ip_stack_t *ipst)
{
	connf_t		*bind_connfp;
	conn_t		*connp = NULL;
	tcp_t		*tcp;

	/*
	 * Avoid false matches for packets sent to an IP destination of
	 * all zeros.
	 */
	if (IN6_IS_ADDR_UNSPECIFIED(laddr))
		return (NULL);

	ASSERT(zoneid != ALL_ZONES);

	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
	mutex_enter(&bind_connfp->connf_lock);
	for (connp = bind_connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		tcp = connp->conn_tcp;
		if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
		    IPCL_ZONE_MATCH(connp, zoneid) &&
		    (connp->conn_bound_if == 0 ||
		    connp->conn_bound_if == ifindex) &&
		    tcp->tcp_listener == NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}
	}
	mutex_exit(&bind_connfp->connf_lock);
	return (NULL);
}

/*
 * ipcl_get_next_conn
 *	Get the next entry in the conn global list, put a reference on
 *	the next conn, and drop the reference on the current conn.
 *
 * This is an iterator-based walker function that also provides for
 * some selection by the caller. It walks through the conn_hash bucket
 * searching for the next valid connp in the list, and selects connections
 * that are neither closed nor condemned. It also REFHOLDs the conn,
 * thus ensuring that the conn exists when the caller uses it.
 */
conn_t *
ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
{
	conn_t	*next_connp;

	if (connfp == NULL)
		return (NULL);

	mutex_enter(&connfp->connf_lock);

	next_connp = (connp == NULL) ?
	    connfp->connf_head : connp->conn_g_next;

	while (next_connp != NULL) {
		mutex_enter(&next_connp->conn_lock);
		if (!(next_connp->conn_flags & conn_flags) ||
		    (next_connp->conn_state_flags &
		    (CONN_CONDEMNED | CONN_INCIPIENT))) {
			/*
			 * This conn has been condemned or
			 * is closing, or the flags don't match
			 */
			mutex_exit(&next_connp->conn_lock);
			next_connp = next_connp->conn_g_next;
			continue;
		}
		CONN_INC_REF_LOCKED(next_connp);
		mutex_exit(&next_connp->conn_lock);
		break;
	}

	mutex_exit(&connfp->connf_lock);

	if (connp != NULL)
		CONN_DEC_REF(connp);

	return (next_connp);
}
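
/*
 * Illustrative sketch (not part of the original source): one way a caller
 * might drive ipcl_get_next_conn() over a single global-hash bucket.  The
 * surrounding function name and the use of IPCL_TCPCONN as the selection
 * flag are assumptions for illustration only.
 */
#ifdef notdef
static void
example_bucket_scan(connf_t *connfp)
{
	conn_t	*connp = NULL;

	/*
	 * Each call returns the next matching conn with a reference held
	 * and drops the reference on the conn passed in, so no explicit
	 * CONN_DEC_REF is needed inside the loop.  A caller that breaks
	 * out early must CONN_DEC_REF the conn it is still holding.
	 */
	while ((connp = ipcl_get_next_conn(connfp, connp,
	    IPCL_TCPCONN)) != NULL) {
		/* inspect connp here */
	}
}
#endif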

#ifdef CONN_DEBUG
/*
 * Ring-buffer trace of the most recent refhold/refrele operations on a
 * conn (up to CONN_TRACE_MAX entries).
 */
int
conn_trace_ref(conn_t *connp)
{
	int	last;
	conn_trace_t	*ctb;

	ASSERT(MUTEX_HELD(&connp->conn_lock));
	last = connp->conn_trace_last;
	last++;
	if (last == CONN_TRACE_MAX)
		last = 0;

	ctb = &connp->conn_trace_buf[last];
	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
	connp->conn_trace_last = last;
	return (1);
}

int
conn_untrace_ref(conn_t *connp)
{
	int	last;
	conn_trace_t	*ctb;

	ASSERT(MUTEX_HELD(&connp->conn_lock));
	last = connp->conn_trace_last;
	last++;
	if (last == CONN_TRACE_MAX)
		last = 0;

	ctb = &connp->conn_trace_buf[last];
	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
	connp->conn_trace_last = last;
	return (1);
}
#endif