/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
 * Copyright 2017, OmniTI Computer Consulting, Inc. All rights reserved.
 */
#include <sys/types.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/zone.h>
#include <sys/ddi.h>
#include <sys/disp.h>
#include <sys/sunddi.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/atomic.h>
#include <sys/callb.h>
#define	_SUN_TPI_VERSION 2
#include <sys/tihdr.h>

#include <inet/common.h>
#include <inet/mi.h>
#include <inet/mib2.h>
#include <inet/snmpcom.h>

#include <netinet/ip6.h>
#include <netinet/icmp6.h>

#include <inet/ip.h>
#include <inet/ip_impl.h>
#include <inet/ip6.h>
#include <inet/ip6_asp.h>
#include <inet/ip_multi.h>
#include <inet/ip_if.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/ip_ndp.h>
#include <inet/ipclassifier.h>
#include <inet/ip_listutils.h>

#include <sys/sunddi.h>
/*
 * Routines for handling destination cache entries.
 * There is always one DCEF_DEFAULT for each ip_stack_t created at init time.
 * That entry holds both the IP ident value and the dce generation number.
 *
 * Any time a DCE is changed significantly (different path MTU, but NOT
 * different ULP info!), the dce_generation number is increased.
 * Also, when a new DCE is created, the dce_generation number in the default
 * DCE is bumped. That allows the dce_t information to be cached efficiently
 * as long as the entity caching the dce_t also caches the dce_generation,
 * and compares the cached generation to detect any changes.
 * Furthermore, when a DCE is deleted, if there are any outstanding references
 * to the DCE it will be marked as condemned. The condemned mark is
 * a designated generation number which is never otherwise used, hence
 * the single comparison with the generation number captures that as well.
 *
 * An example of code which caches is as follows:
 *
 *	if (mystruct->my_dce_generation != mystruct->my_dce->dce_generation) {
 *		The DCE has changed
 *		mystruct->my_dce = dce_lookup_pkt(mp, ixa,
 *		    &mystruct->my_dce_generation);
 *		Not needed in practice, since we have the default DCE:
 *		if (DCE_IS_CONDEMNED(mystruct->my_dce))
 *			return failure;
 *	}
 *
 * Note that for IPv6 link-local addresses we record the ifindex since the
 * link-locals are not globally unique.
 *
 * DCEs can remain for an arbitrarily long time, until memory pressure or
 * too-deep hash buckets (see dce_lookup_and_add*()) enable the reclaim thread
 * to actually remove DCEs from the cache.
 */
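
/*
 * A minimal caller-side sketch of the same pattern (the "mystate" struct and
 * its ms_dce/ms_dce_generation fields are hypothetical, for illustration
 * only).  The lookup functions return a held dce_t, so a caller replacing a
 * cached entry must also release its hold on the old one:
 *
 *	dce = dce_lookup_pkt(mp, ixa, &mystate->ms_dce_generation);
 *	if (mystate->ms_dce != NULL)
 *		dce_refrele(mystate->ms_dce);
 *	mystate->ms_dce = dce;
 *
 * and before each later use, ms_dce_generation is compared against
 * ms_dce->dce_generation as shown above to detect changes (including the
 * condemned case).
 */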
/*
 * Hash bucket structure for DCEs
 */
typedef struct dcb_s {
	krwlock_t	dcb_lock;	/* Protects the chain of DCEs below */
	uint32_t	dcb_cnt;	/* Number of DCEs in this bucket */
	dce_t		*dcb_dce;	/* Head of the bucket's DCE chain */
} dcb_t;

static void	dce_delete_locked(dcb_t *, dce_t *);
static void	dce_make_condemned(dce_t *);

static kmem_cache_t *dce_cache;
static kthread_t *dce_reclaim_thread;
static kmutex_t dce_reclaim_lock;
static kcondvar_t dce_reclaim_cv;
static int dce_reclaim_shutdown;
/* Global so it can be tuned in /etc/system. This must be a power of two. */
uint_t ip_dce_hash_size = 1024;

/* The time in seconds between executions of the IP DCE reclaim worker. */
uint_t ip_dce_reclaim_interval = 60;

/* The factor of the DCE threshold at which to start hard reclaims */
uint_t ip_dce_reclaim_threshold_hard = 2;
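
/*
 * For example, assuming the usual /etc/system "set <module>:<variable>"
 * syntax and that these symbols live in the "ip" module, the hash size and
 * the reclaim interval could be tuned with lines such as:
 *
 *	set ip:ip_dce_hash_size = 2048
 *	set ip:ip_dce_reclaim_interval = 30
 */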
/* Operates on a uint64_t */
#define	RANDOM_HASH(p) ((p) ^ ((p)>>16) ^ ((p)>>32) ^ ((p)>>48))

/*
 * Reclaim a fraction of dce's in the dcb.
 * For now we have a higher probability to delete DCEs without DCE_PMTU.
 */
static void
dcb_reclaim(dcb_t *dcb, ip_stack_t *ipst, uint_t fraction)
{
	uint_t	fraction_pmtu = fraction*4;
	uint_t	hash;
	dce_t	*dce, *nextdce;
	hrtime_t seed = gethrtime();
	uint_t	retained = 0;
	uint_t	max = ipst->ips_ip_dce_reclaim_threshold;

	max *= ip_dce_reclaim_threshold_hard;

	rw_enter(&dcb->dcb_lock, RW_WRITER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
		nextdce = dce->dce_next;
		/* Clear DCEF_PMTU if the pmtu is too old */
		mutex_enter(&dce->dce_lock);
		if ((dce->dce_flags & DCEF_PMTU) &&
		    TICK_TO_SEC(ddi_get_lbolt64()) - dce->dce_last_change_time >
		    ipst->ips_ip_pathmtu_interval) {
			dce->dce_flags &= ~DCEF_PMTU;
			mutex_exit(&dce->dce_lock);
			dce_increment_generation(dce);
		} else {
			mutex_exit(&dce->dce_lock);
		}

		if (max == 0 || retained < max) {
			hash = RANDOM_HASH((uint64_t)((uintptr_t)dce | seed));

			if (dce->dce_flags & DCEF_PMTU) {
				if (hash % fraction_pmtu != 0) {
					retained++;
					continue;
				}
			} else {
				if (hash % fraction != 0) {
					retained++;
					continue;
				}
			}
		}

		IP_STAT(ipst, ip_dce_reclaim_deleted);
		dce_delete_locked(dcb, dce);
		dce_refrele(dce);
	}
	rw_exit(&dcb->dcb_lock);
}
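
/*
 * With the victim selection above, roughly 1 in "fraction" of the entries
 * without DCEF_PMTU and 1 in "4 * fraction" of the entries with DCEF_PMTU
 * are deleted on each pass; once "retained" reaches the hard-reclaim cap
 * (ips_ip_dce_reclaim_threshold * ip_dce_reclaim_threshold_hard), every
 * remaining entry in the bucket is deleted regardless of the hash.
 */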
/*
 * Reclaim memory from an IP stack's DCE hash buckets.  Called only from
 * ip_dce_reclaim() below, on behalf of the reclaim worker thread.
 */
static void
ip_dce_reclaim_stack(ip_stack_t *ipst)
{
	int	i;

	IP_STAT(ipst, ip_dce_reclaim_calls);
	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb_reclaim(&ipst->ips_dce_hash_v4[i], ipst,
		    ipst->ips_ip_dce_reclaim_fraction);

		dcb_reclaim(&ipst->ips_dce_hash_v6[i], ipst,
		    ipst->ips_ip_dce_reclaim_fraction);
	}

	/*
	 * Walk all CONNs that can have a reference on an ire, nce or dce.
	 * Get them to update any stale references to drop any refholds they
	 * have.
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
}
/*
 * Called by dce_reclaim_worker() below, and no one else.  Typically this will
 * mean that the number of entries in the hash buckets has exceeded a tunable
 * threshold.
 */
static void
ip_dce_reclaim(void)
{
	netstack_handle_t nh;
	netstack_t *ns;
	ip_stack_t *ipst;

	ASSERT(curthread == dce_reclaim_thread);

	netstack_next_init(&nh);
	while ((ns = netstack_next(&nh)) != NULL) {
		/*
		 * netstack_next() can return a netstack_t with a NULL
		 * netstack_ip at boot time.
		 */
		if ((ipst = ns->netstack_ip) == NULL) {
			netstack_rele(ns);
			continue;
		}

		if (atomic_swap_uint(&ipst->ips_dce_reclaim_needed, 0) != 0)
			ip_dce_reclaim_stack(ipst);

		netstack_rele(ns);
	}
	netstack_next_fini(&nh);
}
/* ARGSUSED */
static void
dce_reclaim_worker(void *arg)
{
	callb_cpr_t	cprinfo;

	CALLB_CPR_INIT(&cprinfo, &dce_reclaim_lock, callb_generic_cpr,
	    "dce_reclaim_worker");

	mutex_enter(&dce_reclaim_lock);
	while (!dce_reclaim_shutdown) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		(void) cv_timedwait(&dce_reclaim_cv, &dce_reclaim_lock,
		    ddi_get_lbolt() + ip_dce_reclaim_interval * hz);
		CALLB_CPR_SAFE_END(&cprinfo, &dce_reclaim_lock);

		if (dce_reclaim_shutdown)
			break;

		mutex_exit(&dce_reclaim_lock);
		ip_dce_reclaim();
		mutex_enter(&dce_reclaim_lock);
	}

	ASSERT(MUTEX_HELD(&dce_reclaim_lock));
	dce_reclaim_thread = NULL;
	dce_reclaim_shutdown = 0;
	cv_broadcast(&dce_reclaim_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops the lock */

	thread_exit();
}
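
/*
 * dce_g_init() and dce_g_destroy() below bracket the lifetime of the single
 * global reclaim worker: dce_g_init() creates the dce kmem cache, the reclaim
 * lock/cv pair and the worker thread; dce_g_destroy() sets
 * dce_reclaim_shutdown, wakes the worker, and then waits on dce_reclaim_cv
 * until the worker has cleared dce_reclaim_thread before destroying the
 * synchronization objects and the cache.
 */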
void
dce_g_init(void)
{
	dce_cache = kmem_cache_create("dce_cache",
	    sizeof (dce_t), 0, NULL, NULL, NULL, NULL, NULL, 0);

	mutex_init(&dce_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&dce_reclaim_cv, NULL, CV_DEFAULT, NULL);

	dce_reclaim_thread = thread_create(NULL, 0, dce_reclaim_worker,
	    NULL, 0, &p0, TS_RUN, minclsyspri);
}

void
dce_g_destroy(void)
{
	mutex_enter(&dce_reclaim_lock);
	dce_reclaim_shutdown = 1;
	cv_signal(&dce_reclaim_cv);
	while (dce_reclaim_thread != NULL)
		cv_wait(&dce_reclaim_cv, &dce_reclaim_lock);
	mutex_exit(&dce_reclaim_lock);

	cv_destroy(&dce_reclaim_cv);
	mutex_destroy(&dce_reclaim_lock);

	kmem_cache_destroy(dce_cache);
}
/*
 * Allocate a default DCE and a hash table for per-IP address DCEs
 */
void
dce_stack_init(ip_stack_t *ipst)
{
	int	i;

	ipst->ips_dce_default = kmem_cache_alloc(dce_cache, KM_SLEEP);
	bzero(ipst->ips_dce_default, sizeof (dce_t));
	ipst->ips_dce_default->dce_flags = DCEF_DEFAULT;
	ipst->ips_dce_default->dce_generation = DCE_GENERATION_INITIAL;
	ipst->ips_dce_default->dce_last_change_time =
	    TICK_TO_SEC(ddi_get_lbolt64());
	ipst->ips_dce_default->dce_refcnt = 1;	/* Should never go away */
	ipst->ips_dce_default->dce_ipst = ipst;

	/* This must be a power of two since we are using IRE_ADDR_HASH macro */
	ipst->ips_dce_hashsize = ip_dce_hash_size;
	ipst->ips_dce_hash_v4 = kmem_zalloc(ipst->ips_dce_hashsize *
	    sizeof (dcb_t), KM_SLEEP);
	ipst->ips_dce_hash_v6 = kmem_zalloc(ipst->ips_dce_hashsize *
	    sizeof (dcb_t), KM_SLEEP);
	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		rw_init(&ipst->ips_dce_hash_v4[i].dcb_lock, NULL, RW_DEFAULT,
		    NULL);
		rw_init(&ipst->ips_dce_hash_v6[i].dcb_lock, NULL, RW_DEFAULT,
		    NULL);
	}
}
/*
 * Given a DCE hash bucket, unlink DCE entries from it. Some callers need
 * ifindex-specific matching, others don't. Don't overload ifindex to indicate
 * specificity, just indicate so explicitly.
 */
static void
dce_bucket_clean(dcb_t *dcb, boolean_t specific_ifindex, uint_t ifindex)
{
	dce_t	*dce, *nextdce;

	rw_enter(&dcb->dcb_lock, RW_WRITER);

	for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
		nextdce = dce->dce_next;
		if ((!specific_ifindex) || dce->dce_ifindex == ifindex) {
			dce_delete_locked(dcb, dce);
			dce_refrele(dce);
		}
	}

	rw_exit(&dcb->dcb_lock);
}

void
dce_stack_destroy(ip_stack_t *ipst)
{
	int i;
	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dce_bucket_clean(&ipst->ips_dce_hash_v4[i], B_FALSE, 0);
		rw_destroy(&ipst->ips_dce_hash_v4[i].dcb_lock);
		dce_bucket_clean(&ipst->ips_dce_hash_v6[i], B_FALSE, 0);
		rw_destroy(&ipst->ips_dce_hash_v6[i].dcb_lock);
	}
	kmem_free(ipst->ips_dce_hash_v4,
	    ipst->ips_dce_hashsize * sizeof (dcb_t));
	ipst->ips_dce_hash_v4 = NULL;
	kmem_free(ipst->ips_dce_hash_v6,
	    ipst->ips_dce_hashsize * sizeof (dcb_t));
	ipst->ips_dce_hash_v6 = NULL;
	ipst->ips_dce_hashsize = 0;

	ASSERT(ipst->ips_dce_default->dce_refcnt == 1);
	kmem_cache_free(dce_cache, ipst->ips_dce_default);
	ipst->ips_dce_default = NULL;
}
/* When any DCE is good enough */
dce_t *
dce_get_default(ip_stack_t *ipst)
{
	dce_t		*dce;

	dce = ipst->ips_dce_default;
	dce_refhold(dce);
	return (dce);
}
/*
 * Generic for IPv4 and IPv6.
 *
 * Used by callers that need to cache, e.g., the datapath.
 * Returns the generation number in the last argument.
 */
dce_t *
dce_lookup_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp)
{
	if (ixa->ixa_flags & IXAF_IS_IPV4) {
		/*
		 * If we have a source route we need to look for the final
		 * destination in the source route option.
		 */
		ipaddr_t final_dst;
		ipha_t *ipha = (ipha_t *)mp->b_rptr;

		final_dst = ip_get_dst(ipha);
		return (dce_lookup_v4(final_dst, ixa->ixa_ipst, generationp));
	} else {
		uint_t ifindex;
		/*
		 * If we have a routing header we need to look for the final
		 * destination in the routing extension header.
		 */
		in6_addr_t final_dst;
		ip6_t *ip6h = (ip6_t *)mp->b_rptr;

		final_dst = ip_get_dst_v6(ip6h, mp, NULL);
		ifindex = 0;
		if (IN6_IS_ADDR_LINKSCOPE(&final_dst) && ixa->ixa_nce != NULL) {
			ifindex = ixa->ixa_nce->nce_common->ncec_ill->
			    ill_phyint->phyint_ifindex;
		}
		return (dce_lookup_v6(&final_dst, ifindex, ixa->ixa_ipst,
		    generationp));
	}
}
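
/*
 * A sketch of how the returned generation is typically consumed (the local
 * names are hypothetical).  When the lookup misses and falls back to the
 * default DCE, the saved generation is the default DCE's generation, so the
 * later creation of a per-destination entry (which bumps the default
 * generation in dce_lookup_and_add_v4()/_v6()) also invalidates the cache:
 *
 *	uint_t gen;
 *	dce_t *dce = dce_lookup_pkt(mp, ixa, &gen);
 *
 *	... use dce->dce_pmtu, dce->dce_ident, etc. ...
 *
 *	if (gen != dce->dce_generation) {
 *		... the entry (or the default) changed; look it up again ...
 *	}
 *	dce_refrele(dce);
 */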
/*
 * Used by callers that need to cache, e.g., the datapath.
 * Returns the generation number in the last argument.
 */
dce_t *
dce_lookup_v4(ipaddr_t dst, ip_stack_t *ipst, uint_t *generationp)
{
	uint_t		hash;
	dcb_t		*dcb;
	dce_t		*dce;

	/* Set *generationp before dropping the lock(s) that allow additions */
	if (generationp != NULL)
		*generationp = ipst->ips_dce_default->dce_generation;

	hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v4[hash];
	rw_enter(&dcb->dcb_lock, RW_READER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (dce->dce_v4addr == dst) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				if (generationp != NULL)
					*generationp = dce->dce_generation;
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}
	rw_exit(&dcb->dcb_lock);
	/* Not found */
	dce = ipst->ips_dce_default;
	dce_refhold(dce);
	return (dce);
}
/*
 * Used by callers that need to cache, e.g., the datapath.
 * Returns the generation number in the last argument.
 * ifindex should only be set for link-locals
 */
dce_t *
dce_lookup_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst,
    uint_t *generationp)
{
	uint_t		hash;
	dcb_t		*dcb;
	dce_t		*dce;

	/* Set *generationp before dropping the lock(s) that allow additions */
	if (generationp != NULL)
		*generationp = ipst->ips_dce_default->dce_generation;

	hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v6[hash];
	rw_enter(&dcb->dcb_lock, RW_READER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
		    dce->dce_ifindex == ifindex) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				if (generationp != NULL)
					*generationp = dce->dce_generation;
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}
	rw_exit(&dcb->dcb_lock);
	/* Not found */
	dce = ipst->ips_dce_default;
	dce_refhold(dce);
	return (dce);
}
/*
 * Atomically looks for a non-default DCE, and if not found tries to create one.
 * If there is no memory it returns NULL.
 * When an entry is created we increase the generation number on
 * the default DCE so that conn_ip_output will detect there is a new DCE.
 */
dce_t *
dce_lookup_and_add_v4(ipaddr_t dst, ip_stack_t *ipst)
{
	uint_t		hash;
	dcb_t		*dcb;
	dce_t		*dce;

	hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v4[hash];
	/*
	 * Assuming that we get fairly even distribution across all of the
	 * buckets, once one bucket is overly full, prune the whole cache.
	 */
	if (dcb->dcb_cnt > ipst->ips_ip_dce_reclaim_threshold)
		atomic_or_uint(&ipst->ips_dce_reclaim_needed, 1);
	rw_enter(&dcb->dcb_lock, RW_WRITER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (dce->dce_v4addr == dst) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}
	dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
	if (dce == NULL) {
		rw_exit(&dcb->dcb_lock);
		return (NULL);
	}
	bzero(dce, sizeof (dce_t));
	dce->dce_ipst = ipst;	/* No netstack_hold */
	dce->dce_v4addr = dst;
	dce->dce_generation = DCE_GENERATION_INITIAL;
	dce->dce_ipversion = IPV4_VERSION;
	dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
	dce_refhold(dce);	/* For the hash list */

	/* Link into list */
	if (dcb->dcb_dce != NULL)
		dcb->dcb_dce->dce_ptpn = &dce->dce_next;
	dce->dce_next = dcb->dcb_dce;
	dce->dce_ptpn = &dcb->dcb_dce;
	dcb->dcb_dce = dce;
	dce->dce_bucket = dcb;
	atomic_inc_32(&dcb->dcb_cnt);
	dce_refhold(dce);	/* For the caller */
	rw_exit(&dcb->dcb_lock);

	/* Initialize dce_ident to be different than for the last packet */
	dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;

	dce_increment_generation(ipst->ips_dce_default);
	return (dce);
}
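
/*
 * A sketch of the creation path from a caller's perspective (the caller and
 * the "new_mtu" local are hypothetical, for illustration only).  Because the
 * entry is allocated KM_NOSLEEP, callers must tolerate a NULL return and
 * simply skip recording per-destination state in that case.  A caller
 * learning a new path MTU might do roughly:
 *
 *	dce = dce_lookup_and_add_v4(dst, ipst);
 *	if (dce == NULL)
 *		return;			nothing cached; not fatal
 *	mutex_enter(&dce->dce_lock);
 *	dce->dce_pmtu = new_mtu;
 *	dce->dce_flags |= DCEF_PMTU;
 *	dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
 *	mutex_exit(&dce->dce_lock);
 *	dce_increment_generation(dce);
 *	dce_refrele(dce);
 */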
/*
 * Atomically looks for a non-default DCE, and if not found tries to create one.
 * If there is no memory it returns NULL.
 * When an entry is created we increase the generation number on
 * the default DCE so that conn_ip_output will detect there is a new DCE.
 * ifindex should only be used with link-local addresses.
 */
dce_t *
dce_lookup_and_add_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst)
{
	uint_t		hash;
	dcb_t		*dcb;
	dce_t		*dce;

	/* We should not create entries for link-locals w/o an ifindex */
	ASSERT(!(IN6_IS_ADDR_LINKSCOPE(dst)) || ifindex != 0);

	hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v6[hash];
	/*
	 * Assuming that we get fairly even distribution across all of the
	 * buckets, once one bucket is overly full, prune the whole cache.
	 */
	if (dcb->dcb_cnt > ipst->ips_ip_dce_reclaim_threshold)
		atomic_or_uint(&ipst->ips_dce_reclaim_needed, 1);
	rw_enter(&dcb->dcb_lock, RW_WRITER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
		    dce->dce_ifindex == ifindex) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}

	dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
	if (dce == NULL) {
		rw_exit(&dcb->dcb_lock);
		return (NULL);
	}
	bzero(dce, sizeof (dce_t));
	dce->dce_ipst = ipst;	/* No netstack_hold */
	dce->dce_v6addr = *dst;
	dce->dce_ifindex = ifindex;
	dce->dce_generation = DCE_GENERATION_INITIAL;
	dce->dce_ipversion = IPV6_VERSION;
	dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
	dce_refhold(dce);	/* For the hash list */

	/* Link into list */
	if (dcb->dcb_dce != NULL)
		dcb->dcb_dce->dce_ptpn = &dce->dce_next;
	dce->dce_next = dcb->dcb_dce;
	dce->dce_ptpn = &dcb->dcb_dce;
	dcb->dcb_dce = dce;
	dce->dce_bucket = dcb;
	atomic_inc_32(&dcb->dcb_cnt);
	dce_refhold(dce);	/* For the caller */
	rw_exit(&dcb->dcb_lock);

	/* Initialize dce_ident to be different than for the last packet */
	dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;
	dce_increment_generation(ipst->ips_dce_default);
	return (dce);
}
/*
 * Set/update uinfo. Creates a per-destination dce if none exists.
 *
 * Note that we do not bump the generation number here.
 * New connections will find the new uinfo.
 *
 * The only use of this (tcp, sctp using iulp_t) is to set rtt+rtt_sd.
 */
static void
dce_setuinfo(dce_t *dce, iulp_t *uinfo)
{
	/*
	 * Update the round trip time estimate and/or the max frag size
	 * and/or the slow start threshold.
	 *
	 * We serialize multiple advises using dce_lock.
	 */
	mutex_enter(&dce->dce_lock);
	/* Guard against setting to zero */
	if (uinfo->iulp_rtt != 0) {
		/*
		 * If there are no old cached values, initialize them
		 * conservatively. Set them to be (1.5 * new value).
		 */
		if (dce->dce_uinfo.iulp_rtt != 0) {
			dce->dce_uinfo.iulp_rtt = (dce->dce_uinfo.iulp_rtt +
			    uinfo->iulp_rtt) >> 1;
		} else {
			dce->dce_uinfo.iulp_rtt = uinfo->iulp_rtt +
			    (uinfo->iulp_rtt >> 1);
		}
		if (dce->dce_uinfo.iulp_rtt_sd != 0) {
			dce->dce_uinfo.iulp_rtt_sd =
			    (dce->dce_uinfo.iulp_rtt_sd +
			    uinfo->iulp_rtt_sd) >> 1;
		} else {
			dce->dce_uinfo.iulp_rtt_sd = uinfo->iulp_rtt_sd +
			    (uinfo->iulp_rtt_sd >> 1);
		}
	}
	if (uinfo->iulp_mtu != 0) {
		if (dce->dce_flags & DCEF_PMTU) {
			dce->dce_pmtu = MIN(uinfo->iulp_mtu, dce->dce_pmtu);
		} else {
			dce->dce_pmtu = MIN(uinfo->iulp_mtu, IP_MAXPACKET);
			dce->dce_flags |= DCEF_PMTU;
		}
		dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
	}
	if (uinfo->iulp_ssthresh != 0) {
		if (dce->dce_uinfo.iulp_ssthresh != 0)
			dce->dce_uinfo.iulp_ssthresh =
			    (uinfo->iulp_ssthresh +
			    dce->dce_uinfo.iulp_ssthresh) >> 1;
		else
			dce->dce_uinfo.iulp_ssthresh = uinfo->iulp_ssthresh;
	}
	/* We have uinfo for sure */
	dce->dce_flags |= DCEF_UINFO;
	mutex_exit(&dce->dce_lock);
}
int
dce_update_uinfo_v4(ipaddr_t dst, iulp_t *uinfo, ip_stack_t *ipst)
{
	dce_t *dce;

	dce = dce_lookup_and_add_v4(dst, ipst);
	if (dce == NULL)
		return (ENOMEM);

	dce_setuinfo(dce, uinfo);
	dce_refrele(dce);
	return (0);
}

int
dce_update_uinfo_v6(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
    ip_stack_t *ipst)
{
	dce_t *dce;

	dce = dce_lookup_and_add_v6(dst, ifindex, ipst);
	if (dce == NULL)
		return (ENOMEM);

	dce_setuinfo(dce, uinfo);
	dce_refrele(dce);
	return (0);
}
/* Common routine for IPv4 and IPv6 */
int
dce_update_uinfo(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
    ip_stack_t *ipst)
{
	ipaddr_t dst4;

	if (IN6_IS_ADDR_V4MAPPED_ANY(dst)) {
		IN6_V4MAPPED_TO_IPADDR(dst, dst4);
		return (dce_update_uinfo_v4(dst4, uinfo, ipst));
	} else {
		return (dce_update_uinfo_v6(dst, ifindex, uinfo, ipst));
	}
}
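
/*
 * A sketch of how a ULP might feed measurements back through this interface
 * (the "measured_*" locals are hypothetical).  dce_setuinfo() treats zero
 * iulp_rtt, iulp_mtu and iulp_ssthresh values as "no information", so a
 * caller only fills in what it has actually measured:
 *
 *	iulp_t uinfo;
 *
 *	bzero(&uinfo, sizeof (uinfo));
 *	uinfo.iulp_rtt = measured_rtt;
 *	uinfo.iulp_rtt_sd = measured_rtt_sd;
 *	(void) dce_update_uinfo(&dst6, ifindex, &uinfo, ipst);
 */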
static void
dce_make_condemned(dce_t *dce)
{
	ip_stack_t	*ipst = dce->dce_ipst;

	mutex_enter(&dce->dce_lock);
	ASSERT(!DCE_IS_CONDEMNED(dce));
	dce->dce_generation = DCE_GENERATION_CONDEMNED;
	mutex_exit(&dce->dce_lock);
	/* Count how many condemned dces for kmem_cache callback */
	atomic_inc_32(&ipst->ips_num_dce_condemned);
}

/*
 * Increment the generation avoiding the special condemned value
 */
void
dce_increment_generation(dce_t *dce)
{
	uint_t generation;

	mutex_enter(&dce->dce_lock);
	if (!DCE_IS_CONDEMNED(dce)) {
		generation = dce->dce_generation + 1;
		if (generation == DCE_GENERATION_CONDEMNED)
			generation = DCE_GENERATION_INITIAL;
		ASSERT(generation != DCE_GENERATION_VERIFY);
		dce->dce_generation = generation;
	}
	mutex_exit(&dce->dce_lock);
}
/*
 * Increment the generation number on all dces that have a path MTU and
 * the default DCE. Used when ill_mtu or ill_mc_mtu changes.
 */
void
dce_increment_all_generations(boolean_t isv6, ip_stack_t *ipst)
{
	int		i;
	dcb_t		*dcb;
	dce_t		*dce;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		if (isv6)
			dcb = &ipst->ips_dce_hash_v6[i];
		else
			dcb = &ipst->ips_dce_hash_v4[i];
		rw_enter(&dcb->dcb_lock, RW_WRITER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			if (DCE_IS_CONDEMNED(dce))
				continue;
			dce_increment_generation(dce);
		}
		rw_exit(&dcb->dcb_lock);
	}
	dce_increment_generation(ipst->ips_dce_default);
}
/*
 * Caller needs to do a dce_refrele since we can't do the
 * dce_refrele under dcb_lock.
 */
static void
dce_delete_locked(dcb_t *dcb, dce_t *dce)
{
	dce->dce_bucket = NULL;
	*dce->dce_ptpn = dce->dce_next;
	if (dce->dce_next != NULL)
		dce->dce_next->dce_ptpn = dce->dce_ptpn;
	dce->dce_ptpn = NULL;
	dce->dce_next = NULL;
	atomic_dec_32(&dcb->dcb_cnt);
	dce_make_condemned(dce);
}

static void
dce_inactive(dce_t *dce)
{
	ip_stack_t	*ipst = dce->dce_ipst;

	ASSERT(!(dce->dce_flags & DCEF_DEFAULT));
	ASSERT(dce->dce_ptpn == NULL);
	ASSERT(dce->dce_bucket == NULL);

	/* Count how many condemned dces for kmem_cache callback */
	if (DCE_IS_CONDEMNED(dce))
		atomic_dec_32(&ipst->ips_num_dce_condemned);

	kmem_cache_free(dce_cache, dce);
}
void
dce_refrele(dce_t *dce)
{
	ASSERT(dce->dce_refcnt != 0);
	if (atomic_dec_32_nv(&dce->dce_refcnt) == 0)
		dce_inactive(dce);
}

void
dce_refhold(dce_t *dce)
{
	atomic_inc_32(&dce->dce_refcnt);
	ASSERT(dce->dce_refcnt != 0);
}

/* No tracing support yet hence the same as the above functions */
void
dce_refrele_notr(dce_t *dce)
{
	ASSERT(dce->dce_refcnt != 0);
	if (atomic_dec_32_nv(&dce->dce_refcnt) == 0)
		dce_inactive(dce);
}

void
dce_refhold_notr(dce_t *dce)
{
	atomic_inc_32(&dce->dce_refcnt);
	ASSERT(dce->dce_refcnt != 0);
}
/* Report both the IPv4 and IPv6 DCEs. */
mblk_t *
ip_snmp_get_mib2_ip_dce(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;
	dest_cache_entry_t	dest_cache;
	mblk_t			*mp_tail = NULL;
	dce_t			*dce;
	dcb_t			*dcb;
	int			i;
	uint64_t		current_time;

	current_time = TICK_TO_SEC(ddi_get_lbolt64());

	/*
	 * make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);

	/* First we do IPv4 entries */
	optp = (struct opthdr *)&mpctl->b_rptr[
	    sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP;
	optp->name = EXPER_IP_DCE;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb = &ipst->ips_dce_hash_v4[i];
		rw_enter(&dcb->dcb_lock, RW_READER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			dest_cache.DestIpv4Address = dce->dce_v4addr;
			dest_cache.DestFlags = dce->dce_flags;
			if (dce->dce_flags & DCEF_PMTU)
				dest_cache.DestPmtu = dce->dce_pmtu;
			else
				dest_cache.DestPmtu = 0;
			dest_cache.DestIdent = dce->dce_ident;
			dest_cache.DestIfindex = 0;
			dest_cache.DestAge = current_time -
			    dce->dce_last_change_time;
			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
			    (char *)&dest_cache, (int)sizeof (dest_cache))) {
				ip1dbg(("ip_snmp_get_mib2_ip_dce: "
				    "failed to allocate %u bytes\n",
				    (uint_t)sizeof (dest_cache)));
			}
		}
		rw_exit(&dcb->dcb_lock);
	}
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);

	if (mp2ctl == NULL) {
		/* Copymsg failed above */
		return (NULL);
	}

	/* Now for IPv6 */
	mpctl = mp2ctl;
	mp_tail = NULL;
	mp2ctl = copymsg(mpctl);
	optp = (struct opthdr *)&mpctl->b_rptr[
	    sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP6;
	optp->name = EXPER_IP_DCE;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb = &ipst->ips_dce_hash_v6[i];
		rw_enter(&dcb->dcb_lock, RW_READER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			dest_cache.DestIpv6Address = dce->dce_v6addr;
			dest_cache.DestFlags = dce->dce_flags;
			if (dce->dce_flags & DCEF_PMTU)
				dest_cache.DestPmtu = dce->dce_pmtu;
			else
				dest_cache.DestPmtu = 0;
			dest_cache.DestIdent = dce->dce_ident;
			if (IN6_IS_ADDR_LINKSCOPE(&dce->dce_v6addr))
				dest_cache.DestIfindex = dce->dce_ifindex;
			else
				dest_cache.DestIfindex = 0;
			dest_cache.DestAge = current_time -
			    dce->dce_last_change_time;
			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
			    (char *)&dest_cache, (int)sizeof (dest_cache))) {
				ip1dbg(("ip_snmp_get_mib2_ip_dce: "
				    "failed to allocate %u bytes\n",
				    (uint_t)sizeof (dest_cache)));
			}
		}
		rw_exit(&dcb->dcb_lock);
	}
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);

	return (mp2ctl);
}
/*
 * Remove IPv6 DCEs which refer to an ifindex that is going away.
 * This is not required for correctness, but it avoids netstat -d
 * showing stale stuff that will never be used.
 */
void
dce_cleanup(uint_t ifindex, ip_stack_t *ipst)
{
	uint_t	i;

	for (i = 0; i < ipst->ips_dce_hashsize; i++)
		dce_bucket_clean(&ipst->ips_dce_hash_v6[i], B_TRUE, ifindex);
}