/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
 */
#include <sys/types.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/sunddi.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/atomic.h>
#include <sys/callb.h>
#define	_SUN_TPI_VERSION 2
#include <sys/tihdr.h>

#include <inet/common.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/mib2.h>
#include <inet/snmpcom.h>

#include <netinet/ip6.h>
#include <netinet/icmp6.h>

#include <inet/ip_impl.h>
#include <inet/ip6_asp.h>
#include <inet/ip_multi.h>
#include <inet/ip_if.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/ip_ndp.h>
#include <inet/ipclassifier.h>
#include <inet/ip_listutils.h>

#include <sys/sunddi.h>
/*
 * Routines for handling destination cache entries.
 * There is always one DCEF_DEFAULT for each ip_stack_t created at init time.
 * That entry holds both the IP ident value and the dce generation number.
 *
 * Any time a DCE is changed significantly (different path MTU, but NOT
 * different ULP info!), the dce_generation number is increased.
 * Also, when a new DCE is created, the dce_generation number in the default
 * DCE is bumped. That allows the dce_t information to be cached efficiently
 * as long as the entity caching the dce_t also caches the dce_generation,
 * and compares the cached generation to detect any changes.
 * Furthermore, when a DCE is deleted, if there are any outstanding references
 * to the DCE it will be marked as condemned. The condemned mark is
 * a designated generation number which is never otherwise used, hence
 * the single comparison with the generation number captures that as well.
 *
 * An example of code which caches is as follows:
 *
 *	if (mystruct->my_dce_generation != mystruct->my_dce->dce_generation) {
 *		The DCE has changed
 *		mystruct->my_dce = dce_lookup_pkt(mp, ixa,
 *		    &mystruct->my_dce_generation);
 *		Not needed in practice, since we have the default DCE:
 *		if (DCE_IS_CONDEMNED(mystruct->my_dce))
 *			return failure;
 *	}
 *
 * Note that for IPv6 link-local addresses we record the ifindex since the
 * link-locals are not globally unique.
 */
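/*
 * A slightly more complete sketch of the caching pattern above (mystruct and
 * its members are hypothetical and only for illustration): the caller caches
 * both the dce_t pointer and the generation observed at lookup time, and
 * revalidates by comparing generations before each use.
 *
 *	dce_t	*dce;
 *	uint_t	generation;
 *
 *	dce = dce_lookup_pkt(mp, ixa, &generation);
 *	mystruct->my_dce = dce;
 *	mystruct->my_dce_generation = generation;
 *	...
 *	if (mystruct->my_dce_generation != mystruct->my_dce->dce_generation) {
 *		dce_refrele(mystruct->my_dce);
 *		mystruct->my_dce = dce_lookup_pkt(mp, ixa,
 *		    &mystruct->my_dce_generation);
 *	}
 */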
/*
 * Hash bucket structure for DCEs
 */
typedef struct dcb_s {
	krwlock_t	dcb_lock;
	uint32_t	dcb_cnt;
	dce_t		*dcb_dce;
} dcb_t;
static void	dce_delete_locked(dcb_t *, dce_t *);
static void	dce_make_condemned(dce_t *);

static kmem_cache_t *dce_cache;
static kthread_t *dce_reclaim_thread;
static kmutex_t dce_reclaim_lock;
static kcondvar_t dce_reclaim_cv;
static int dce_reclaim_shutdown;
/* Global so it can be tuned in /etc/system. This must be a power of two. */
uint_t ip_dce_hash_size = 1024;

/* The time in seconds between executions of the IP DCE reclaim worker. */
uint_t ip_dce_reclaim_interval = 60;

/* The factor of the DCE threshold at which to start hard reclaims */
uint_t ip_dce_reclaim_threshold_hard = 2;
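/*
 * Tuning note: these globals are normally adjusted with module-scoped "set"
 * directives in /etc/system, taking effect at the next boot. The values below
 * are arbitrary illustrations, not recommendations:
 *
 *	set ip:ip_dce_hash_size = 2048
 *	set ip:ip_dce_reclaim_interval = 30
 *
 * Remember that ip_dce_hash_size must remain a power of two.
 */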
/* Operates on a uint64_t */
#define	RANDOM_HASH(p) ((p) ^ ((p)>>16) ^ ((p)>>32) ^ ((p)>>48))
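/*
 * RANDOM_HASH() folds the four 16-bit quarters of a 64-bit value together,
 * which is enough to spread the low bits used by the modulo tests in
 * dcb_reclaim() below. As a worked example (the fraction value is
 * illustrative only): with a reclaim fraction of 8, roughly 1 in 8 entries
 * without DCEF_PMTU (hash % 8 == 0) and 1 in 32 entries with DCEF_PMTU
 * (hash % 32 == 0) are deleted per pass, so entries with path MTU state are
 * retained with higher probability.
 */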
/*
 * Reclaim a fraction of dce's in the dcb.
 * For now we have a higher probability to delete DCEs without DCE_PMTU.
 */
static void
dcb_reclaim(dcb_t *dcb, ip_stack_t *ipst, uint_t fraction)
{
	uint_t	fraction_pmtu = fraction*4;
	uint_t	hash;
	dce_t	*dce, *nextdce;
	hrtime_t seed = gethrtime();
	uint_t	retained = 0;
	uint_t	max = ipst->ips_ip_dce_reclaim_threshold;

	max *= ip_dce_reclaim_threshold_hard;

	rw_enter(&dcb->dcb_lock, RW_WRITER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
		nextdce = dce->dce_next;
		/* Clear DCEF_PMTU if the pmtu is too old */
		mutex_enter(&dce->dce_lock);
		if ((dce->dce_flags & DCEF_PMTU) &&
		    TICK_TO_SEC(ddi_get_lbolt64()) - dce->dce_last_change_time >
		    ipst->ips_ip_pathmtu_interval) {
			dce->dce_flags &= ~DCEF_PMTU;
			mutex_exit(&dce->dce_lock);
			dce_increment_generation(dce);
		} else {
			mutex_exit(&dce->dce_lock);
		}

		if (max == 0 || retained < max) {
			hash = RANDOM_HASH((uint64_t)((uintptr_t)dce | seed));

			if (dce->dce_flags & DCEF_PMTU) {
				if (hash % fraction_pmtu != 0) {
					retained++;
					continue;
				}
			} else {
				if (hash % fraction != 0) {
					retained++;
					continue;
				}
			}
		}

		IP_STAT(ipst, ip_dce_reclaim_deleted);
		dce_delete_locked(dcb, dce);
		dce_refrele(dce);
	}
	rw_exit(&dcb->dcb_lock);
}
/*
 * kmem_cache callback to free up memory.
 */
static void
ip_dce_reclaim_stack(ip_stack_t *ipst)
{
	int	i;

	IP_STAT(ipst, ip_dce_reclaim_calls);
	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb_reclaim(&ipst->ips_dce_hash_v4[i], ipst,
		    ipst->ips_ip_dce_reclaim_fraction);

		dcb_reclaim(&ipst->ips_dce_hash_v6[i], ipst,
		    ipst->ips_ip_dce_reclaim_fraction);
	}

	/*
	 * Walk all CONNs that can have a reference on an ire, nce or dce.
	 * Get them to update any stale references to drop any refholds they
	 * have.
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
}
/*
 * Called by dce_reclaim_worker() below, and no one else. Typically this will
 * mean that the number of entries in the hash buckets has exceeded a tunable
 * threshold.
 */
static void
ip_dce_reclaim(void)
{
	netstack_handle_t nh;
	netstack_t *ns;
	ip_stack_t *ipst;

	ASSERT(curthread == dce_reclaim_thread);

	netstack_next_init(&nh);
	while ((ns = netstack_next(&nh)) != NULL) {
		/*
		 * netstack_next() can return a netstack_t with a NULL
		 * netstack_ip at boot time.
		 */
		if ((ipst = ns->netstack_ip) == NULL) {
			netstack_rele(ns);
			continue;
		}
		if (atomic_swap_uint(&ipst->ips_dce_reclaim_needed, 0) != 0)
			ip_dce_reclaim_stack(ipst);
		netstack_rele(ns);
	}
	netstack_next_fini(&nh);
}
/* ARGSUSED */
static void
dce_reclaim_worker(void *arg)
{
	callb_cpr_t	cprinfo;

	CALLB_CPR_INIT(&cprinfo, &dce_reclaim_lock, callb_generic_cpr,
	    "dce_reclaim_worker");
	mutex_enter(&dce_reclaim_lock);
	while (!dce_reclaim_shutdown) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		(void) cv_timedwait(&dce_reclaim_cv, &dce_reclaim_lock,
		    ddi_get_lbolt() + ip_dce_reclaim_interval * hz);
		CALLB_CPR_SAFE_END(&cprinfo, &dce_reclaim_lock);

		if (dce_reclaim_shutdown)
			break;

		mutex_exit(&dce_reclaim_lock);
		ip_dce_reclaim();
		mutex_enter(&dce_reclaim_lock);
	}

	ASSERT(MUTEX_HELD(&dce_reclaim_lock));
	dce_reclaim_thread = NULL;
	dce_reclaim_shutdown = 0;
	cv_broadcast(&dce_reclaim_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops the lock */

	thread_exit();
}
void
dce_g_init(void)
{
	dce_cache = kmem_cache_create("dce_cache",
	    sizeof (dce_t), 0, NULL, NULL, NULL, NULL, NULL, 0);

	mutex_init(&dce_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&dce_reclaim_cv, NULL, CV_DEFAULT, NULL);

	dce_reclaim_thread = thread_create(NULL, 0, dce_reclaim_worker,
	    NULL, 0, &p0, TS_RUN, minclsyspri);
}
void
dce_g_destroy(void)
{
	mutex_enter(&dce_reclaim_lock);
	dce_reclaim_shutdown = 1;
	cv_signal(&dce_reclaim_cv);
	while (dce_reclaim_thread != NULL)
		cv_wait(&dce_reclaim_cv, &dce_reclaim_lock);
	mutex_exit(&dce_reclaim_lock);

	cv_destroy(&dce_reclaim_cv);
	mutex_destroy(&dce_reclaim_lock);

	kmem_cache_destroy(dce_cache);
}
/*
 * Allocate a default DCE and a hash table for per-IP address DCEs
 */
void
dce_stack_init(ip_stack_t *ipst)
{
	int	i;

	ipst->ips_dce_default = kmem_cache_alloc(dce_cache, KM_SLEEP);
	bzero(ipst->ips_dce_default, sizeof (dce_t));
	ipst->ips_dce_default->dce_flags = DCEF_DEFAULT;
	ipst->ips_dce_default->dce_generation = DCE_GENERATION_INITIAL;
	ipst->ips_dce_default->dce_last_change_time =
	    TICK_TO_SEC(ddi_get_lbolt64());
	ipst->ips_dce_default->dce_refcnt = 1;	/* Should never go away */
	ipst->ips_dce_default->dce_ipst = ipst;

	/* This must be a power of two since we are using IRE_ADDR_HASH macro */
	ipst->ips_dce_hashsize = ip_dce_hash_size;
	ipst->ips_dce_hash_v4 = kmem_zalloc(ipst->ips_dce_hashsize *
	    sizeof (dcb_t), KM_SLEEP);
	ipst->ips_dce_hash_v6 = kmem_zalloc(ipst->ips_dce_hashsize *
	    sizeof (dcb_t), KM_SLEEP);
	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		rw_init(&ipst->ips_dce_hash_v4[i].dcb_lock, NULL, RW_DEFAULT,
		    NULL);
		rw_init(&ipst->ips_dce_hash_v6[i].dcb_lock, NULL, RW_DEFAULT,
		    NULL);
	}
}
void
dce_stack_destroy(ip_stack_t *ipst)
{
	int	i;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		rw_destroy(&ipst->ips_dce_hash_v4[i].dcb_lock);
		rw_destroy(&ipst->ips_dce_hash_v6[i].dcb_lock);
	}
	kmem_free(ipst->ips_dce_hash_v4,
	    ipst->ips_dce_hashsize * sizeof (dcb_t));
	ipst->ips_dce_hash_v4 = NULL;
	kmem_free(ipst->ips_dce_hash_v6,
	    ipst->ips_dce_hashsize * sizeof (dcb_t));
	ipst->ips_dce_hash_v6 = NULL;
	ipst->ips_dce_hashsize = 0;

	ASSERT(ipst->ips_dce_default->dce_refcnt == 1);
	kmem_cache_free(dce_cache, ipst->ips_dce_default);
	ipst->ips_dce_default = NULL;
}
/* When any DCE is good enough */
dce_t *
dce_get_default(ip_stack_t *ipst)
{
	dce_t		*dce;

	dce = ipst->ips_dce_default;
	ASSERT(dce != NULL);
	dce_refhold(dce);
	return (dce);
}
/*
 * Generic for IPv4 and IPv6.
 *
 * Used by callers that need to cache e.g., the datapath
 * Returns the generation number in the last argument.
 */
dce_t *
dce_lookup_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp)
{
	if (ixa->ixa_flags & IXAF_IS_IPV4) {
		/*
		 * If we have a source route we need to look for the final
		 * destination in the source route option.
		 */
		ipaddr_t final_dst;
		ipha_t *ipha = (ipha_t *)mp->b_rptr;

		final_dst = ip_get_dst(ipha);
		return (dce_lookup_v4(final_dst, ixa->ixa_ipst, generationp));
	} else {
		uint_t ifindex;
		/*
		 * If we have a routing header we need to look for the final
		 * destination in the routing extension header.
		 */
		in6_addr_t final_dst;
		ip6_t *ip6h = (ip6_t *)mp->b_rptr;

		final_dst = ip_get_dst_v6(ip6h, mp, NULL);
		ifindex = 0;
		if (IN6_IS_ADDR_LINKSCOPE(&final_dst) && ixa->ixa_nce != NULL) {
			ifindex = ixa->ixa_nce->nce_common->ncec_ill->
			    ill_phyint->phyint_ifindex;
		}
		return (dce_lookup_v6(&final_dst, ifindex, ixa->ixa_ipst,
		    generationp));
	}
}
/*
 * Used by callers that need to cache e.g., the datapath
 * Returns the generation number in the last argument.
 */
dce_t *
dce_lookup_v4(ipaddr_t dst, ip_stack_t *ipst, uint_t *generationp)
{
	uint_t		hash;
	dcb_t		*dcb;
	dce_t		*dce;

	/* Set *generationp before dropping the lock(s) that allow additions */
	if (generationp != NULL)
		*generationp = ipst->ips_dce_default->dce_generation;

	hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v4[hash];
	rw_enter(&dcb->dcb_lock, RW_READER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (dce->dce_v4addr == dst) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				if (generationp != NULL)
					*generationp = dce->dce_generation;
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}
	rw_exit(&dcb->dcb_lock);
	/* Not found */
	dce = ipst->ips_dce_default;
	dce_refhold(dce);
	return (dce);
}
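/*
 * Usage sketch (illustrative only; "my_dce" is a hypothetical caller-side
 * variable): every dce_t returned by dce_lookup_v4()/dce_lookup_v6(),
 * including the default DCE returned when no per-destination entry exists,
 * carries a reference for the caller and must eventually be released:
 *
 *	uint_t	gen;
 *	dce_t	*my_dce = dce_lookup_v4(dst, ipst, &gen);
 *
 *	... use my_dce->dce_pmtu, my_dce->dce_ident, etc. ...
 *
 *	dce_refrele(my_dce);
 */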
/*
 * Used by callers that need to cache e.g., the datapath
 * Returns the generation number in the last argument.
 * ifindex should only be set for link-locals
 */
dce_t *
dce_lookup_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst,
    uint_t *generationp)
{
	uint_t		hash;
	dcb_t		*dcb;
	dce_t		*dce;

	/* Set *generationp before dropping the lock(s) that allow additions */
	if (generationp != NULL)
		*generationp = ipst->ips_dce_default->dce_generation;

	hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v6[hash];
	rw_enter(&dcb->dcb_lock, RW_READER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
		    dce->dce_ifindex == ifindex) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				if (generationp != NULL)
					*generationp = dce->dce_generation;
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}
	rw_exit(&dcb->dcb_lock);
	/* Not found */
	dce = ipst->ips_dce_default;
	dce_refhold(dce);
	return (dce);
}
/*
 * Atomically looks for a non-default DCE, and if not found tries to create
 * one. If there is no memory it returns NULL.
 * When an entry is created we increase the generation number on
 * the default DCE so that conn_ip_output will detect there is a new DCE.
 */
dce_t *
dce_lookup_and_add_v4(ipaddr_t dst, ip_stack_t *ipst)
{
	uint_t		hash;
	dcb_t		*dcb;
	dce_t		*dce;

	hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v4[hash];
	/*
	 * Assuming that we get fairly even distribution across all of the
	 * buckets, once one bucket is overly full, prune the whole cache.
	 */
	if (dcb->dcb_cnt > ipst->ips_ip_dce_reclaim_threshold)
		atomic_or_uint(&ipst->ips_dce_reclaim_needed, 1);
	rw_enter(&dcb->dcb_lock, RW_WRITER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (dce->dce_v4addr == dst) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}
	dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
	if (dce == NULL) {
		rw_exit(&dcb->dcb_lock);
		return (NULL);
	}
	bzero(dce, sizeof (dce_t));
	dce->dce_ipst = ipst;	/* No netstack_hold */
	dce->dce_v4addr = dst;
	dce->dce_generation = DCE_GENERATION_INITIAL;
	dce->dce_ipversion = IPV4_VERSION;
	dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
	dce_refhold(dce);	/* For the hash list */

	/* Link into list */
	if (dcb->dcb_dce != NULL)
		dcb->dcb_dce->dce_ptpn = &dce->dce_next;
	dce->dce_next = dcb->dcb_dce;
	dce->dce_ptpn = &dcb->dcb_dce;
	dcb->dcb_dce = dce;
	dce->dce_bucket = dcb;
	atomic_add_32(&dcb->dcb_cnt, 1);
	dce_refhold(dce);	/* For the caller */
	rw_exit(&dcb->dcb_lock);

	/* Initialize dce_ident to be different than for the last packet */
	dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;

	dce_increment_generation(ipst->ips_dce_default);
	return (dce);
}
/*
 * Atomically looks for a non-default DCE, and if not found tries to create
 * one. If there is no memory it returns NULL.
 * When an entry is created we increase the generation number on
 * the default DCE so that conn_ip_output will detect there is a new DCE.
 * ifindex should only be used with link-local addresses.
 */
dce_t *
dce_lookup_and_add_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst)
{
	uint_t		hash;
	dcb_t		*dcb;
	dce_t		*dce;

	/* We should not create entries for link-locals w/o an ifindex */
	ASSERT(!(IN6_IS_ADDR_LINKSCOPE(dst)) || ifindex != 0);

	hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v6[hash];
	/*
	 * Assuming that we get fairly even distribution across all of the
	 * buckets, once one bucket is overly full, prune the whole cache.
	 */
	if (dcb->dcb_cnt > ipst->ips_ip_dce_reclaim_threshold)
		atomic_or_uint(&ipst->ips_dce_reclaim_needed, 1);
	rw_enter(&dcb->dcb_lock, RW_WRITER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
		    dce->dce_ifindex == ifindex) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}

	dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
	if (dce == NULL) {
		rw_exit(&dcb->dcb_lock);
		return (NULL);
	}
	bzero(dce, sizeof (dce_t));
	dce->dce_ipst = ipst;	/* No netstack_hold */
	dce->dce_v6addr = *dst;
	dce->dce_ifindex = ifindex;
	dce->dce_generation = DCE_GENERATION_INITIAL;
	dce->dce_ipversion = IPV6_VERSION;
	dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
	dce_refhold(dce);	/* For the hash list */

	/* Link into list */
	if (dcb->dcb_dce != NULL)
		dcb->dcb_dce->dce_ptpn = &dce->dce_next;
	dce->dce_next = dcb->dcb_dce;
	dce->dce_ptpn = &dcb->dcb_dce;
	dcb->dcb_dce = dce;
	dce->dce_bucket = dcb;
	atomic_add_32(&dcb->dcb_cnt, 1);
	dce_refhold(dce);	/* For the caller */
	rw_exit(&dcb->dcb_lock);

	/* Initialize dce_ident to be different than for the last packet */
	dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;
	dce_increment_generation(ipst->ips_dce_default);
	return (dce);
}
/*
 * Set/update uinfo. Creates a per-destination dce if none exists.
 *
 * Note that we do not bump the generation number here.
 * New connections will find the new uinfo.
 *
 * The only use of this (tcp, sctp using iulp_t) is to set rtt+rtt_sd.
 */
static void
dce_setuinfo(dce_t *dce, iulp_t *uinfo)
{
	/*
	 * Update the round trip time estimate and/or the max frag size
	 * and/or the slow start threshold.
	 *
	 * We serialize multiple advises using dce_lock.
	 */
	mutex_enter(&dce->dce_lock);
	/* Guard against setting to zero */
	if (uinfo->iulp_rtt != 0) {
		/*
		 * If there are no old cached values, initialize them
		 * conservatively. Set them to be (1.5 * new value).
		 */
		if (dce->dce_uinfo.iulp_rtt != 0) {
			dce->dce_uinfo.iulp_rtt = (dce->dce_uinfo.iulp_rtt +
			    uinfo->iulp_rtt) >> 1;
		} else {
			dce->dce_uinfo.iulp_rtt = uinfo->iulp_rtt +
			    (uinfo->iulp_rtt >> 1);
		}
		if (dce->dce_uinfo.iulp_rtt_sd != 0) {
			dce->dce_uinfo.iulp_rtt_sd =
			    (dce->dce_uinfo.iulp_rtt_sd +
			    uinfo->iulp_rtt_sd) >> 1;
		} else {
			dce->dce_uinfo.iulp_rtt_sd = uinfo->iulp_rtt_sd +
			    (uinfo->iulp_rtt_sd >> 1);
		}
	}
	if (uinfo->iulp_mtu != 0) {
		if (dce->dce_flags & DCEF_PMTU) {
			dce->dce_pmtu = MIN(uinfo->iulp_mtu, dce->dce_pmtu);
		} else {
			dce->dce_pmtu = MIN(uinfo->iulp_mtu, IP_MAXPACKET);
			dce->dce_flags |= DCEF_PMTU;
		}
		dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
	}
	if (uinfo->iulp_ssthresh != 0) {
		if (dce->dce_uinfo.iulp_ssthresh != 0)
			dce->dce_uinfo.iulp_ssthresh =
			    (uinfo->iulp_ssthresh +
			    dce->dce_uinfo.iulp_ssthresh) >> 1;
		else
			dce->dce_uinfo.iulp_ssthresh = uinfo->iulp_ssthresh;
	}
	/* We have uinfo for sure */
	dce->dce_flags |= DCEF_UINFO;
	mutex_exit(&dce->dce_lock);
}
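/*
 * A worked example of the arithmetic above (sample values are illustrative
 * only): the shift-by-one expressions implement simple averaging and a
 * conservative first estimate.
 *
 *	cached rtt already set:	new = (old + sample) >> 1
 *				e.g. (40 + 20) >> 1 = 30
 *	no cached rtt yet:	new = sample + (sample >> 1), i.e. 1.5 * sample
 *				e.g. 20 + 10 = 30
 *
 * iulp_rtt_sd follows the same pattern; iulp_ssthresh is averaged when a
 * cached value exists and taken as-is otherwise.
 */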
int
dce_update_uinfo_v4(ipaddr_t dst, iulp_t *uinfo, ip_stack_t *ipst)
{
	dce_t *dce;

	dce = dce_lookup_and_add_v4(dst, ipst);
	if (dce == NULL)
		return (ENOMEM);

	dce_setuinfo(dce, uinfo);
	dce_refrele(dce);
	return (0);
}
int
dce_update_uinfo_v6(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
    ip_stack_t *ipst)
{
	dce_t *dce;

	dce = dce_lookup_and_add_v6(dst, ifindex, ipst);
	if (dce == NULL)
		return (ENOMEM);

	dce_setuinfo(dce, uinfo);
	dce_refrele(dce);
	return (0);
}
/* Common routine for IPv4 and IPv6 */
int
dce_update_uinfo(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
    ip_stack_t *ipst)
{
	ipaddr_t dst4;

	if (IN6_IS_ADDR_V4MAPPED_ANY(dst)) {
		IN6_V4MAPPED_TO_IPADDR(dst, dst4);
		return (dce_update_uinfo_v4(dst4, uinfo, ipst));
	} else {
		return (dce_update_uinfo_v6(dst, ifindex, uinfo, ipst));
	}
}
static void
dce_make_condemned(dce_t *dce)
{
	ip_stack_t	*ipst = dce->dce_ipst;

	mutex_enter(&dce->dce_lock);
	ASSERT(!DCE_IS_CONDEMNED(dce));
	dce->dce_generation = DCE_GENERATION_CONDEMNED;
	mutex_exit(&dce->dce_lock);

	/* Count how many condemned dces for kmem_cache callback */
	atomic_add_32(&ipst->ips_num_dce_condemned, 1);
}
/*
 * Increment the generation avoiding the special condemned value
 */
void
dce_increment_generation(dce_t *dce)
{
	uint_t generation;

	mutex_enter(&dce->dce_lock);
	if (!DCE_IS_CONDEMNED(dce)) {
		generation = dce->dce_generation + 1;
		if (generation == DCE_GENERATION_CONDEMNED)
			generation = DCE_GENERATION_INITIAL;
		ASSERT(generation != DCE_GENERATION_VERIFY);
		dce->dce_generation = generation;
	}
	mutex_exit(&dce->dce_lock);
}
/*
 * Increment the generation number on all dces that have a path MTU and
 * the default DCE. Used when ill_mtu or ill_mc_mtu changes.
 */
void
dce_increment_all_generations(boolean_t isv6, ip_stack_t *ipst)
{
	int		i;
	dcb_t		*dcb;
	dce_t		*dce;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		if (isv6)
			dcb = &ipst->ips_dce_hash_v6[i];
		else
			dcb = &ipst->ips_dce_hash_v4[i];
		rw_enter(&dcb->dcb_lock, RW_WRITER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			if (DCE_IS_CONDEMNED(dce))
				continue;
			dce_increment_generation(dce);
		}
		rw_exit(&dcb->dcb_lock);
	}
	dce_increment_generation(ipst->ips_dce_default);
}
/*
 * Caller needs to do a dce_refrele since we can't do the
 * dce_refrele under dcb_lock.
 */
static void
dce_delete_locked(dcb_t *dcb, dce_t *dce)
{
	dce->dce_bucket = NULL;
	*dce->dce_ptpn = dce->dce_next;
	if (dce->dce_next != NULL)
		dce->dce_next->dce_ptpn = dce->dce_ptpn;
	dce->dce_ptpn = NULL;
	dce->dce_next = NULL;
	atomic_add_32(&dcb->dcb_cnt, -1);
	dce_make_condemned(dce);
}
static void
dce_inactive(dce_t *dce)
{
	ip_stack_t	*ipst = dce->dce_ipst;

	ASSERT(!(dce->dce_flags & DCEF_DEFAULT));
	ASSERT(dce->dce_ptpn == NULL);
	ASSERT(dce->dce_bucket == NULL);

	/* Count how many condemned dces for kmem_cache callback */
	if (DCE_IS_CONDEMNED(dce))
		atomic_add_32(&ipst->ips_num_dce_condemned, -1);

	kmem_cache_free(dce_cache, dce);
}
void
dce_refrele(dce_t *dce)
{
	ASSERT(dce->dce_refcnt != 0);
	if (atomic_add_32_nv(&dce->dce_refcnt, -1) == 0)
		dce_inactive(dce);
}

void
dce_refhold(dce_t *dce)
{
	atomic_add_32(&dce->dce_refcnt, 1);
	ASSERT(dce->dce_refcnt != 0);
}

/* No tracing support yet hence the same as the above functions */
void
dce_refrele_notr(dce_t *dce)
{
	ASSERT(dce->dce_refcnt != 0);
	if (atomic_add_32_nv(&dce->dce_refcnt, -1) == 0)
		dce_inactive(dce);
}

void
dce_refhold_notr(dce_t *dce)
{
	atomic_add_32(&dce->dce_refcnt, 1);
	ASSERT(dce->dce_refcnt != 0);
}
/* Report both the IPv4 and IPv6 DCEs. */
mblk_t *
ip_snmp_get_mib2_ip_dce(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;
	dest_cache_entry_t	dest_cache;
	mblk_t			*mp_tail = NULL;
	dce_t			*dce;
	dcb_t			*dcb;
	int			i;
	uint64_t		current_time;

	current_time = TICK_TO_SEC(ddi_get_lbolt64());

	/*
	 * make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);

	/* First we do IPv4 entries */
	optp = (struct opthdr *)&mpctl->b_rptr[
	    sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP;
	optp->name = EXPER_IP_DCE;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb = &ipst->ips_dce_hash_v4[i];
		rw_enter(&dcb->dcb_lock, RW_READER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			dest_cache.DestIpv4Address = dce->dce_v4addr;
			dest_cache.DestFlags = dce->dce_flags;
			if (dce->dce_flags & DCEF_PMTU)
				dest_cache.DestPmtu = dce->dce_pmtu;
			else
				dest_cache.DestPmtu = 0;
			dest_cache.DestIdent = dce->dce_ident;
			dest_cache.DestIfindex = 0;
			dest_cache.DestAge = current_time -
			    dce->dce_last_change_time;
			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
			    (char *)&dest_cache, (int)sizeof (dest_cache))) {
				ip1dbg(("ip_snmp_get_mib2_ip_dce: "
				    "failed to allocate %u bytes\n",
				    (uint_t)sizeof (dest_cache)));
			}
		}
		rw_exit(&dcb->dcb_lock);
	}
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);

	if (mp2ctl == NULL) {
		/* Copymsg failed above */
		return (NULL);
	}

	/* Now for the IPv6 entries */
	mpctl = mp2ctl;
	mp_tail = NULL;
	mp2ctl = copymsg(mpctl);
	optp = (struct opthdr *)&mpctl->b_rptr[
	    sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP6;
	optp->name = EXPER_IP_DCE;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb = &ipst->ips_dce_hash_v6[i];
		rw_enter(&dcb->dcb_lock, RW_READER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			dest_cache.DestIpv6Address = dce->dce_v6addr;
			dest_cache.DestFlags = dce->dce_flags;
			if (dce->dce_flags & DCEF_PMTU)
				dest_cache.DestPmtu = dce->dce_pmtu;
			else
				dest_cache.DestPmtu = 0;
			dest_cache.DestIdent = dce->dce_ident;
			if (IN6_IS_ADDR_LINKSCOPE(&dce->dce_v6addr))
				dest_cache.DestIfindex = dce->dce_ifindex;
			else
				dest_cache.DestIfindex = 0;
			dest_cache.DestAge = current_time -
			    dce->dce_last_change_time;
			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
			    (char *)&dest_cache, (int)sizeof (dest_cache))) {
				ip1dbg(("ip_snmp_get_mib2_ip_dce: "
				    "failed to allocate %u bytes\n",
				    (uint_t)sizeof (dest_cache)));
			}
		}
		rw_exit(&dcb->dcb_lock);
	}
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);

	return (mp2ctl);
}
/*
 * Remove IPv6 DCEs which refer to an ifindex that is going away.
 * This is not required for correctness, but it avoids netstat -d
 * showing stale stuff that will never be used.
 */
void
dce_cleanup(uint_t ifindex, ip_stack_t *ipst)
{
	uint_t	i;
	dcb_t	*dcb;
	dce_t	*dce, *nextdce;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb = &ipst->ips_dce_hash_v6[i];
		rw_enter(&dcb->dcb_lock, RW_WRITER);

		for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
			nextdce = dce->dce_next;
			if (dce->dce_ifindex == ifindex) {
				dce_delete_locked(dcb, dce);
				dce_refrele(dce);
			}
		}
		rw_exit(&dcb->dcb_lock);
	}
}