Merge commit 'ea01a15a654b9e1c7b37d958f4d1911882ed7781'
[unleashed.git] / kernel / net / ip / ip_ndp.c
blobe5490764830b42c3cf1f1e575739d2af9459524f
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
25 #include <sys/types.h>
26 #include <sys/stream.h>
27 #include <sys/stropts.h>
28 #include <sys/strsun.h>
29 #include <sys/sysmacros.h>
30 #include <sys/errno.h>
31 #include <sys/dlpi.h>
32 #include <sys/socket.h>
33 #include <sys/ddi.h>
34 #include <sys/sunddi.h>
35 #include <sys/cmn_err.h>
36 #include <sys/debug.h>
37 #include <sys/vtrace.h>
38 #include <sys/kmem.h>
39 #include <sys/zone.h>
40 #include <sys/ethernet.h>
41 #include <sys/sdt.h>
42 #include <sys/mac.h>
44 #include <net/if.h>
45 #include <net/if_types.h>
46 #include <net/if_dl.h>
47 #include <net/route.h>
48 #include <netinet/in.h>
49 #include <netinet/ip6.h>
50 #include <netinet/icmp6.h>
52 #include <inet/common.h>
53 #include <inet/mi.h>
54 #include <inet/mib2.h>
55 #include <inet/nd.h>
56 #include <inet/ip.h>
57 #include <inet/ip_impl.h>
58 #include <inet/ipclassifier.h>
59 #include <inet/ip_if.h>
60 #include <inet/ip_ire.h>
61 #include <inet/ip_rts.h>
62 #include <inet/ip6.h>
63 #include <inet/ip_ndp.h>
64 #include <inet/sctp_ip.h>
65 #include <inet/ip_arp.h>
66 #include <inet/ip2mac_impl.h>
68 #define ANNOUNCE_INTERVAL(isv6) \
69 (isv6 ? ipst->ips_ip_ndp_unsolicit_interval : \
70 ipst->ips_ip_arp_publish_interval)
72 #define DEFENSE_INTERVAL(isv6) \
73 (isv6 ? ipst->ips_ndp_defend_interval : \
74 ipst->ips_arp_defend_interval)
76 /* Non-tunable probe interval, based on link capabilities */
77 #define ILL_PROBE_INTERVAL(ill) ((ill)->ill_note_link ? 150 : 1500)
80 * The IPv4 Link Local address space is special; we do extra duplicate checking
81 * there, as the entire assignment mechanism rests on random numbers.
83 #define IS_IPV4_LL_SPACE(ptr) (((uchar_t *)ptr)[0] == 169 && \
84 ((uchar_t *)ptr)[1] == 254)
87 * NCE_EXTERNAL_FLAGS_MASK defines the set of ncec_flags that may be passed
88 * in to the ncec*add* functions.
90 * NCE_F_AUTHORITY means that we ignore any incoming adverts for that
91 * mapping (though DAD is performed for the mapping). NCE_F_PUBLISH means
92 * that we will respond to requests for the protocol address.
94 #define NCE_EXTERNAL_FLAGS_MASK \
95 (NCE_F_MYADDR | NCE_F_ISROUTER | NCE_F_NONUD | \
96 NCE_F_ANYCAST | NCE_F_UNSOL_ADV | NCE_F_BCAST | NCE_F_MCAST | \
97 NCE_F_AUTHORITY | NCE_F_PUBLISH | NCE_F_STATIC)
100 * Lock ordering:
102 * ndp_g_lock -> ill_lock -> ncec_lock
104 * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
105 * ncec_next. ncec_lock protects the contents of the NCE (particularly
106 * ncec_refcnt).
109 static void nce_cleanup_list(ncec_t *ncec);
110 static void nce_set_ll(ncec_t *ncec, uchar_t *ll_addr);
111 static ncec_t *ncec_lookup_illgrp(ill_t *, const in6_addr_t *,
112 ncec_t *);
113 static nce_t *nce_lookup_addr(ill_t *, const in6_addr_t *);
114 static int nce_set_multicast_v6(ill_t *ill, const in6_addr_t *addr,
115 uint16_t ncec_flags, nce_t **newnce);
116 static int nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
117 uint16_t ncec_flags, nce_t **newnce);
118 static boolean_t ndp_xmit(ill_t *ill, uint32_t operation,
119 uint8_t *hwaddr, uint_t hwaddr_len, const in6_addr_t *sender,
120 const in6_addr_t *target, int flag);
121 static void ncec_refhold_locked(ncec_t *);
122 static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *);
123 static void nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t);
124 static int nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
125 uint16_t, uint16_t, nce_t **);
126 static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *);
127 static nce_t *nce_add(ill_t *, ncec_t *);
128 static void nce_inactive(nce_t *);
129 extern nce_t *nce_lookup(ill_t *, const in6_addr_t *);
130 static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *);
131 static int nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
132 uint16_t, uint16_t, nce_t **);
133 static int nce_add_v4(ill_t *, uchar_t *, uint_t, const in_addr_t *,
134 uint16_t, uint16_t, nce_t **);
135 static int nce_add_v6_postprocess(nce_t *);
136 static int nce_add_v4_postprocess(nce_t *);
137 static ill_t *nce_resolve_src(ncec_t *, in6_addr_t *);
138 static clock_t nce_fuzz_interval(clock_t, boolean_t);
139 static void nce_resolv_ipmp_ok(ncec_t *);
140 static void nce_walk_common(ill_t *, pfi_t, void *);
141 static void nce_start_timer(ncec_t *, uint_t);
142 static nce_t *nce_fastpath_create(ill_t *, ncec_t *);
143 static void nce_fastpath_trigger(nce_t *);
144 static nce_t *nce_fastpath(ncec_t *, boolean_t, nce_t *);
146 #ifdef DEBUG
147 static void ncec_trace_cleanup(const ncec_t *);
148 #endif
150 #define NCE_HASH_PTR_V4(ipst, addr) \
151 (&((ipst)->ips_ndp4->nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)]))
153 #define NCE_HASH_PTR_V6(ipst, addr) \
154 (&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \
155 NCE_TABLE_SIZE)]))
157 extern kmem_cache_t *ncec_cache;
158 extern kmem_cache_t *nce_cache;
161 * Send out a IPv6 (unicast) or IPv4 (broadcast) DAD probe
162 * If src_ill is not null, the ncec_addr is bound to src_ill. The
163 * src_ill is ignored by nce_dad for IPv4 Neighbor Cache entries where
164 * the probe is sent on the ncec_ill (in the non-IPMP case) or the
165 * IPMP cast_ill (in the IPMP case).
167 * Note that the probe interval is based on the src_ill for IPv6, and
168 * the ncec_xmit_interval for IPv4.
170 static void
171 nce_dad(ncec_t *ncec, ill_t *src_ill, boolean_t send_probe)
173 boolean_t dropped;
174 uint32_t probe_interval;
176 ASSERT(!(ncec->ncec_flags & NCE_F_MCAST));
177 ASSERT(!(ncec->ncec_flags & NCE_F_BCAST));
178 if (ncec->ncec_ipversion == IPV6_VERSION) {
179 dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
180 ncec->ncec_lladdr, ncec->ncec_lladdr_length,
181 &ipv6_all_zeros, &ncec->ncec_addr, NDP_PROBE);
182 probe_interval = ILL_PROBE_INTERVAL(src_ill);
183 } else {
184 /* IPv4 DAD delay the initial probe. */
185 if (send_probe)
186 dropped = arp_probe(ncec);
187 else
188 dropped = B_TRUE;
189 probe_interval = nce_fuzz_interval(ncec->ncec_xmit_interval,
190 !send_probe);
192 if (!dropped) {
193 mutex_enter(&ncec->ncec_lock);
194 ncec->ncec_pcnt--;
195 mutex_exit(&ncec->ncec_lock);
197 nce_restart_timer(ncec, probe_interval);
201 * Compute default flags to use for an advertisement of this ncec's address.
203 static int
204 nce_advert_flags(const ncec_t *ncec)
206 int flag = 0;
208 if (ncec->ncec_flags & NCE_F_ISROUTER)
209 flag |= NDP_ISROUTER;
210 if (!(ncec->ncec_flags & NCE_F_ANYCAST))
211 flag |= NDP_ORIDE;
213 return (flag);
217 * NDP Cache Entry creation routine.
218 * This routine must always be called with ndp6->ndp_g_lock held.
221 nce_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
222 const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
224 int err;
225 nce_t *nce;
227 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
228 ASSERT(ill != NULL && ill->ill_isv6);
230 err = nce_add_common(ill, hw_addr, hw_addr_len, addr, flags, state,
231 &nce);
232 if (err != 0)
233 return (err);
234 ASSERT(newnce != NULL);
235 *newnce = nce;
236 return (err);
240 * Post-processing routine to be executed after nce_add_v6(). This function
241 * triggers fastpath (if appropriate) and DAD on the newly added nce entry
242 * and must be called without any locks held.
245 nce_add_v6_postprocess(nce_t *nce)
247 ncec_t *ncec = nce->nce_common;
248 boolean_t dropped = B_FALSE;
249 uchar_t *hw_addr = ncec->ncec_lladdr;
250 uint_t hw_addr_len = ncec->ncec_lladdr_length;
251 ill_t *ill = ncec->ncec_ill;
252 int err = 0;
253 uint16_t flags = ncec->ncec_flags;
254 ip_stack_t *ipst = ill->ill_ipst;
255 boolean_t trigger_fastpath = B_TRUE;
258 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
259 * we call nce_fastpath as soon as the ncec is resolved in nce_process.
260 * We call nce_fastpath from nce_update if the link layer address of
261 * the peer changes from nce_update
263 if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) ||
264 (hw_addr == NULL && ill->ill_net_type != IRE_IF_NORESOLVER))
265 trigger_fastpath = B_FALSE;
267 if (trigger_fastpath)
268 nce_fastpath_trigger(nce);
269 if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
270 ill_t *hwaddr_ill;
272 * Unicast entry that needs DAD.
274 if (IS_IPMP(ill)) {
275 hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
276 hw_addr, hw_addr_len);
277 } else {
278 hwaddr_ill = ill;
280 nce_dad(ncec, hwaddr_ill, B_TRUE);
281 err = EINPROGRESS;
282 } else if (flags & NCE_F_UNSOL_ADV) {
284 * We account for the transmit below by assigning one
285 * less than the ndd variable. Subsequent decrements
286 * are done in nce_timer.
288 mutex_enter(&ncec->ncec_lock);
289 ncec->ncec_unsolicit_count =
290 ipst->ips_ip_ndp_unsolicit_count - 1;
291 mutex_exit(&ncec->ncec_lock);
292 dropped = ndp_xmit(ill,
293 ND_NEIGHBOR_ADVERT,
294 hw_addr,
295 hw_addr_len,
296 &ncec->ncec_addr, /* Source and target of the adv */
297 &ipv6_all_hosts_mcast, /* Destination of the packet */
298 nce_advert_flags(ncec));
299 mutex_enter(&ncec->ncec_lock);
300 if (dropped)
301 ncec->ncec_unsolicit_count++;
302 else
303 ncec->ncec_last_time_defended = ddi_get_lbolt();
304 if (ncec->ncec_unsolicit_count != 0) {
305 nce_start_timer(ncec,
306 ipst->ips_ip_ndp_unsolicit_interval);
308 mutex_exit(&ncec->ncec_lock);
310 return (err);
314 * Atomically lookup and add (if needed) Neighbor Cache information for
315 * an address.
317 * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses
318 * are always added pointing at the ipmp_ill. Thus, when the ill passed
319 * to nce_add_v6 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
320 * entries will be created, both pointing at the same ncec_t. The nce_t
321 * entries will have their nce_ill set to the ipmp_ill and the under_ill
322 * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
323 * Local addresses are always created on the ill passed to nce_add_v6.
326 nce_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
327 const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
329 int err = 0;
330 ip_stack_t *ipst = ill->ill_ipst;
331 nce_t *nce, *upper_nce = NULL;
332 ill_t *in_ill = ill;
333 boolean_t need_ill_refrele = B_FALSE;
335 if (flags & NCE_F_MCAST) {
337 * hw_addr will be figured out in nce_set_multicast_v6;
338 * caller has to select the cast_ill
340 ASSERT(hw_addr == NULL);
341 ASSERT(!IS_IPMP(ill));
342 err = nce_set_multicast_v6(ill, addr, flags, newnce);
343 return (err);
345 ASSERT(ill->ill_isv6);
346 if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
347 ill = ipmp_ill_hold_ipmp_ill(ill);
348 if (ill == NULL)
349 return (ENXIO);
350 need_ill_refrele = B_TRUE;
353 mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
354 nce = nce_lookup_addr(ill, addr);
355 if (nce == NULL) {
356 err = nce_add_v6(ill, hw_addr, hw_addr_len, addr, flags, state,
357 &nce);
358 } else {
359 err = EEXIST;
361 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
362 if (err == 0)
363 err = nce_add_v6_postprocess(nce);
364 if (in_ill != ill && nce != NULL) {
365 nce_t *under_nce = NULL;
368 * in_ill was the under_ill. Try to create the under_nce.
369 * Hold the ill_g_lock to prevent changes to group membership
370 * until we are done.
372 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
373 if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
374 DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
375 ill_t *, ill);
376 rw_exit(&ipst->ips_ill_g_lock);
377 err = ENXIO;
378 nce_refrele(nce);
379 nce = NULL;
380 goto bail;
382 under_nce = nce_fastpath_create(in_ill, nce->nce_common);
383 if (under_nce == NULL) {
384 rw_exit(&ipst->ips_ill_g_lock);
385 err = EINVAL;
386 nce_refrele(nce);
387 nce = NULL;
388 goto bail;
390 rw_exit(&ipst->ips_ill_g_lock);
391 upper_nce = nce;
392 nce = under_nce; /* will be returned to caller */
393 if (NCE_ISREACHABLE(nce->nce_common))
394 nce_fastpath_trigger(under_nce);
396 /* nce_refrele is deferred until the lock is dropped */
397 if (nce != NULL) {
398 if (newnce != NULL)
399 *newnce = nce;
400 else
401 nce_refrele(nce);
403 bail:
404 if (upper_nce != NULL)
405 nce_refrele(upper_nce);
406 if (need_ill_refrele)
407 ill_refrele(ill);
408 return (err);
412 * Remove all the CONDEMNED nces from the appropriate hash table.
413 * We create a private list of NCEs, these may have ires pointing
414 * to them, so the list will be passed through to clean up dependent
415 * ires and only then we can do ncec_refrele() which can make NCE inactive.
417 static void
418 nce_remove(ndp_g_t *ndp, ncec_t *ncec, ncec_t **free_nce_list)
420 ncec_t *ncec1;
421 ncec_t **ptpn;
423 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
424 ASSERT(ndp->ndp_g_walker == 0);
425 for (; ncec; ncec = ncec1) {
426 ncec1 = ncec->ncec_next;
427 mutex_enter(&ncec->ncec_lock);
428 if (NCE_ISCONDEMNED(ncec)) {
429 ptpn = ncec->ncec_ptpn;
430 ncec1 = ncec->ncec_next;
431 if (ncec1 != NULL)
432 ncec1->ncec_ptpn = ptpn;
433 *ptpn = ncec1;
434 ncec->ncec_ptpn = NULL;
435 ncec->ncec_next = NULL;
436 ncec->ncec_next = *free_nce_list;
437 *free_nce_list = ncec;
439 mutex_exit(&ncec->ncec_lock);
444 * 1. Mark the entry CONDEMNED. This ensures that no new nce_lookup()
445 * will return this NCE. Also no new timeouts will
446 * be started (See nce_restart_timer).
447 * 2. Cancel any currently running timeouts.
448 * 3. If there is an ndp walker, return. The walker will do the cleanup.
449 * This ensures that walkers see a consistent list of NCEs while walking.
450 * 4. Otherwise remove the NCE from the list of NCEs
452 void
453 ncec_delete(ncec_t *ncec)
455 ncec_t **ptpn;
456 ncec_t *ncec1;
457 int ipversion = ncec->ncec_ipversion;
458 ndp_g_t *ndp;
459 ip_stack_t *ipst = ncec->ncec_ipst;
461 if (ipversion == IPV4_VERSION)
462 ndp = ipst->ips_ndp4;
463 else
464 ndp = ipst->ips_ndp6;
466 /* Serialize deletes */
467 mutex_enter(&ncec->ncec_lock);
468 if (NCE_ISCONDEMNED(ncec)) {
469 /* Some other thread is doing the delete */
470 mutex_exit(&ncec->ncec_lock);
471 return;
474 * Caller has a refhold. Also 1 ref for being in the list. Thus
475 * refcnt has to be >= 2
477 ASSERT(ncec->ncec_refcnt >= 2);
478 ncec->ncec_flags |= NCE_F_CONDEMNED;
479 mutex_exit(&ncec->ncec_lock);
481 /* Count how many condemned ires for kmem_cache callback */
482 atomic_inc_32(&ipst->ips_num_nce_condemned);
483 nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
485 /* Complete any waiting callbacks */
486 ncec_cb_dispatch(ncec);
489 * Cancel any running timer. Timeout can't be restarted
490 * since CONDEMNED is set. Can't hold ncec_lock across untimeout.
491 * Passing invalid timeout id is fine.
493 if (ncec->ncec_timeout_id != 0) {
494 (void) untimeout(ncec->ncec_timeout_id);
495 ncec->ncec_timeout_id = 0;
498 mutex_enter(&ndp->ndp_g_lock);
499 if (ncec->ncec_ptpn == NULL) {
501 * The last ndp walker has already removed this ncec from
502 * the list after we marked the ncec CONDEMNED and before
503 * we grabbed the global lock.
505 mutex_exit(&ndp->ndp_g_lock);
506 return;
508 if (ndp->ndp_g_walker > 0) {
510 * Can't unlink. The walker will clean up
512 ndp->ndp_g_walker_cleanup = B_TRUE;
513 mutex_exit(&ndp->ndp_g_lock);
514 return;
518 * Now remove the ncec from the list. nce_restart_timer won't restart
519 * the timer since it is marked CONDEMNED.
521 ptpn = ncec->ncec_ptpn;
522 ncec1 = ncec->ncec_next;
523 if (ncec1 != NULL)
524 ncec1->ncec_ptpn = ptpn;
525 *ptpn = ncec1;
526 ncec->ncec_ptpn = NULL;
527 ncec->ncec_next = NULL;
528 mutex_exit(&ndp->ndp_g_lock);
530 /* Removed from ncec_ptpn/ncec_next list */
531 ncec_refrele_notr(ncec);
534 void
535 ncec_inactive(ncec_t *ncec)
537 mblk_t **mpp;
538 ill_t *ill = ncec->ncec_ill;
539 ip_stack_t *ipst = ncec->ncec_ipst;
541 ASSERT(ncec->ncec_refcnt == 0);
542 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
544 /* Count how many condemned nces for kmem_cache callback */
545 if (NCE_ISCONDEMNED(ncec))
546 atomic_add_32(&ipst->ips_num_nce_condemned, -1);
548 /* Free all allocated messages */
549 mpp = &ncec->ncec_qd_mp;
550 while (*mpp != NULL) {
551 mblk_t *mp;
553 mp = *mpp;
554 *mpp = mp->b_next;
556 inet_freemsg(mp);
559 * must have been cleaned up in ncec_delete
561 ASSERT(list_is_empty(&ncec->ncec_cb));
562 list_destroy(&ncec->ncec_cb);
564 * free the ncec_lladdr if one was allocated in nce_add_common()
566 if (ncec->ncec_lladdr_length > 0)
567 kmem_free(ncec->ncec_lladdr, ncec->ncec_lladdr_length);
569 #ifdef DEBUG
570 ncec_trace_cleanup(ncec);
571 #endif
573 mutex_enter(&ill->ill_lock);
574 DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
575 (char *), "ncec", (void *), ncec);
576 ill->ill_ncec_cnt--;
577 ncec->ncec_ill = NULL;
579 * If the number of ncec's associated with this ill have dropped
580 * to zero, check whether we need to restart any operation that
581 * is waiting for this to happen.
583 if (ILL_DOWN_OK(ill)) {
584 /* ipif_ill_refrele_tail drops the ill_lock */
585 ipif_ill_refrele_tail(ill);
586 } else {
587 mutex_exit(&ill->ill_lock);
590 mutex_destroy(&ncec->ncec_lock);
591 kmem_cache_free(ncec_cache, ncec);
595 * ncec_walk routine. Delete the ncec if it is associated with the ill
596 * that is going away. Always called as a writer.
598 void
599 ncec_delete_per_ill(ncec_t *ncec, uchar_t *arg)
601 if ((ncec != NULL) && ncec->ncec_ill == (ill_t *)arg) {
602 ncec_delete(ncec);
607 * Neighbor Cache cleanup logic for a list of ncec_t entries.
609 static void
610 nce_cleanup_list(ncec_t *ncec)
612 ncec_t *ncec_next;
614 ASSERT(ncec != NULL);
615 while (ncec != NULL) {
616 ncec_next = ncec->ncec_next;
617 ncec->ncec_next = NULL;
620 * It is possible for the last ndp walker (this thread)
621 * to come here after ncec_delete has marked the ncec CONDEMNED
622 * and before it has removed the ncec from the fastpath list
623 * or called untimeout. So we need to do it here. It is safe
624 * for both ncec_delete and this thread to do it twice or
625 * even simultaneously since each of the threads has a
626 * reference on the ncec.
628 nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
630 * Cancel any running timer. Timeout can't be restarted
631 * since CONDEMNED is set. The ncec_lock can't be
632 * held across untimeout though passing invalid timeout
633 * id is fine.
635 if (ncec->ncec_timeout_id != 0) {
636 (void) untimeout(ncec->ncec_timeout_id);
637 ncec->ncec_timeout_id = 0;
639 /* Removed from ncec_ptpn/ncec_next list */
640 ncec_refrele_notr(ncec);
641 ncec = ncec_next;
646 * Restart DAD on given NCE. Returns B_TRUE if DAD has been restarted.
648 boolean_t
649 nce_restart_dad(ncec_t *ncec)
651 boolean_t started;
652 ill_t *ill, *hwaddr_ill;
654 if (ncec == NULL)
655 return (B_FALSE);
656 ill = ncec->ncec_ill;
657 mutex_enter(&ncec->ncec_lock);
658 if (ncec->ncec_state == ND_PROBE) {
659 mutex_exit(&ncec->ncec_lock);
660 started = B_TRUE;
661 } else if (ncec->ncec_state == ND_REACHABLE) {
662 ASSERT(ncec->ncec_lladdr != NULL);
663 ncec->ncec_state = ND_PROBE;
664 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
666 * Slight cheat here: we don't use the initial probe delay
667 * for IPv4 in this obscure case.
669 mutex_exit(&ncec->ncec_lock);
670 if (IS_IPMP(ill)) {
671 hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
672 ncec->ncec_lladdr, ncec->ncec_lladdr_length);
673 } else {
674 hwaddr_ill = ill;
676 nce_dad(ncec, hwaddr_ill, B_TRUE);
677 started = B_TRUE;
678 } else {
679 mutex_exit(&ncec->ncec_lock);
680 started = B_FALSE;
682 return (started);
686 * IPv6 Cache entry lookup. Try to find an ncec matching the parameters passed.
687 * If one is found, the refcnt on the ncec will be incremented.
689 ncec_t *
690 ncec_lookup_illgrp_v6(ill_t *ill, const in6_addr_t *addr)
692 ncec_t *ncec;
693 ip_stack_t *ipst = ill->ill_ipst;
695 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
696 mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
698 /* Get head of v6 hash table */
699 ncec = *((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
700 ncec = ncec_lookup_illgrp(ill, addr, ncec);
701 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
702 rw_exit(&ipst->ips_ill_g_lock);
703 return (ncec);
706 * IPv4 Cache entry lookup. Try to find an ncec matching the parameters passed.
707 * If one is found, the refcnt on the ncec will be incremented.
709 ncec_t *
710 ncec_lookup_illgrp_v4(ill_t *ill, const in_addr_t *addr)
712 ncec_t *ncec = NULL;
713 in6_addr_t addr6;
714 ip_stack_t *ipst = ill->ill_ipst;
716 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
717 mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
719 /* Get head of v4 hash table */
720 ncec = *((ncec_t **)NCE_HASH_PTR_V4(ipst, *addr));
721 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
722 ncec = ncec_lookup_illgrp(ill, &addr6, ncec);
723 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
724 rw_exit(&ipst->ips_ill_g_lock);
725 return (ncec);
729 * Cache entry lookup. Try to find an ncec matching the parameters passed.
730 * If an ncec is found, increment the hold count on that ncec.
731 * The caller passes in the start of the appropriate hash table, and must
732 * be holding the appropriate global lock (ndp_g_lock). In addition, since
733 * this function matches ncec_t entries across the illgrp, the ips_ill_g_lock
734 * must be held as reader.
736 * This function always matches across the ipmp group.
738 ncec_t *
739 ncec_lookup_illgrp(ill_t *ill, const in6_addr_t *addr, ncec_t *ncec)
741 ndp_g_t *ndp;
742 ip_stack_t *ipst = ill->ill_ipst;
744 if (ill->ill_isv6)
745 ndp = ipst->ips_ndp6;
746 else
747 ndp = ipst->ips_ndp4;
749 ASSERT(ill != NULL);
750 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
751 if (IN6_IS_ADDR_UNSPECIFIED(addr))
752 return (NULL);
753 for (; ncec != NULL; ncec = ncec->ncec_next) {
754 if (ncec->ncec_ill == ill ||
755 IS_IN_SAME_ILLGRP(ill, ncec->ncec_ill)) {
756 if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
757 mutex_enter(&ncec->ncec_lock);
758 if (!NCE_ISCONDEMNED(ncec)) {
759 ncec_refhold_locked(ncec);
760 mutex_exit(&ncec->ncec_lock);
761 break;
763 mutex_exit(&ncec->ncec_lock);
767 return (ncec);
771 * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
772 * entries for ill only, i.e., when ill is part of an ipmp group,
773 * nce_lookup_v4 will never try to match across the group.
775 nce_t *
776 nce_lookup_v4(ill_t *ill, const in_addr_t *addr)
778 nce_t *nce;
779 in6_addr_t addr6;
780 ip_stack_t *ipst = ill->ill_ipst;
782 mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
783 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
784 nce = nce_lookup_addr(ill, &addr6);
785 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
786 return (nce);
790 * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
791 * entries for ill only, i.e., when ill is part of an ipmp group,
792 * nce_lookup_v6 will never try to match across the group.
794 nce_t *
795 nce_lookup_v6(ill_t *ill, const in6_addr_t *addr6)
797 nce_t *nce;
798 ip_stack_t *ipst = ill->ill_ipst;
800 mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
801 nce = nce_lookup_addr(ill, addr6);
802 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
803 return (nce);
806 static nce_t *
807 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr)
809 nce_t *nce;
811 ASSERT(ill != NULL);
812 #ifdef DEBUG
813 if (ill->ill_isv6)
814 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
815 else
816 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
817 #endif
818 mutex_enter(&ill->ill_lock);
819 nce = nce_lookup(ill, addr);
820 mutex_exit(&ill->ill_lock);
821 return (nce);
826 * Router turned to host. We need to make sure that cached copies of the ncec
827 * are not used for forwarding packets if they were derived from the default
828 * route, and that the default route itself is removed, as required by
829 * section 7.2.5 of RFC 2461.
831 * Note that the ncec itself probably has valid link-layer information for the
832 * nexthop, so that there is no reason to delete the ncec, as long as the
833 * ISROUTER flag is turned off.
835 static void
836 ncec_router_to_host(ncec_t *ncec)
838 ire_t *ire;
839 ip_stack_t *ipst = ncec->ncec_ipst;
841 mutex_enter(&ncec->ncec_lock);
842 ncec->ncec_flags &= ~NCE_F_ISROUTER;
843 mutex_exit(&ncec->ncec_lock);
845 ire = ire_ftable_lookup_v6(&ipv6_all_zeros, &ipv6_all_zeros,
846 &ncec->ncec_addr, IRE_DEFAULT, ncec->ncec_ill, ALL_ZONES,
847 MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW, 0, ipst, NULL);
848 if (ire != NULL) {
849 ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
850 ire_delete(ire);
851 ire_refrele(ire);
856 * Process passed in parameters either from an incoming packet or via
857 * user ioctl.
859 void
860 nce_process(ncec_t *ncec, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
862 ill_t *ill = ncec->ncec_ill;
863 uint32_t hw_addr_len = ill->ill_phys_addr_length;
864 boolean_t ll_updated = B_FALSE;
865 boolean_t ll_changed;
866 nce_t *nce;
868 ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
870 * No updates of link layer address or the neighbor state is
871 * allowed, when the cache is in NONUD state. This still
872 * allows for responding to reachability solicitation.
874 mutex_enter(&ncec->ncec_lock);
875 if (ncec->ncec_state == ND_INCOMPLETE) {
876 if (hw_addr == NULL) {
877 mutex_exit(&ncec->ncec_lock);
878 return;
880 nce_set_ll(ncec, hw_addr);
882 * Update ncec state and send the queued packets
883 * back to ip this time ire will be added.
885 if (flag & ND_NA_FLAG_SOLICITED) {
886 nce_update(ncec, ND_REACHABLE, NULL);
887 } else {
888 nce_update(ncec, ND_STALE, NULL);
890 mutex_exit(&ncec->ncec_lock);
891 nce = nce_fastpath(ncec, B_TRUE, NULL);
892 nce_resolv_ok(ncec);
893 if (nce != NULL)
894 nce_refrele(nce);
895 return;
897 ll_changed = nce_cmp_ll_addr(ncec, hw_addr, hw_addr_len);
898 if (!is_adv) {
899 /* If this is a SOLICITATION request only */
900 if (ll_changed)
901 nce_update(ncec, ND_STALE, hw_addr);
902 mutex_exit(&ncec->ncec_lock);
903 ncec_cb_dispatch(ncec);
904 return;
906 if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
907 /* If in any other state than REACHABLE, ignore */
908 if (ncec->ncec_state == ND_REACHABLE) {
909 nce_update(ncec, ND_STALE, NULL);
911 mutex_exit(&ncec->ncec_lock);
912 ncec_cb_dispatch(ncec);
913 return;
914 } else {
915 if (ll_changed) {
916 nce_update(ncec, ND_UNCHANGED, hw_addr);
917 ll_updated = B_TRUE;
919 if (flag & ND_NA_FLAG_SOLICITED) {
920 nce_update(ncec, ND_REACHABLE, NULL);
921 } else {
922 if (ll_updated) {
923 nce_update(ncec, ND_STALE, NULL);
926 mutex_exit(&ncec->ncec_lock);
927 if (!(flag & ND_NA_FLAG_ROUTER) && (ncec->ncec_flags &
928 NCE_F_ISROUTER)) {
929 ncec_router_to_host(ncec);
930 } else {
931 ncec_cb_dispatch(ncec);
937 * Pass arg1 to the pfi supplied, along with each ncec in existence.
938 * ncec_walk() places a REFHOLD on the ncec and drops the lock when
939 * walking the hash list.
941 void
942 ncec_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1,
943 boolean_t trace)
945 ncec_t *ncec;
946 ncec_t *ncec1;
947 ncec_t **ncep;
948 ncec_t *free_nce_list = NULL;
950 mutex_enter(&ndp->ndp_g_lock);
951 /* Prevent ncec_delete from unlink and free of NCE */
952 ndp->ndp_g_walker++;
953 mutex_exit(&ndp->ndp_g_lock);
954 for (ncep = ndp->nce_hash_tbl;
955 ncep < A_END(ndp->nce_hash_tbl); ncep++) {
956 for (ncec = *ncep; ncec != NULL; ncec = ncec1) {
957 ncec1 = ncec->ncec_next;
958 if (ill == NULL || ncec->ncec_ill == ill) {
959 if (trace) {
960 ncec_refhold(ncec);
961 (*pfi)(ncec, arg1);
962 ncec_refrele(ncec);
963 } else {
964 ncec_refhold_notr(ncec);
965 (*pfi)(ncec, arg1);
966 ncec_refrele_notr(ncec);
971 mutex_enter(&ndp->ndp_g_lock);
972 ndp->ndp_g_walker--;
973 if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
974 /* Time to delete condemned entries */
975 for (ncep = ndp->nce_hash_tbl;
976 ncep < A_END(ndp->nce_hash_tbl); ncep++) {
977 ncec = *ncep;
978 if (ncec != NULL) {
979 nce_remove(ndp, ncec, &free_nce_list);
982 ndp->ndp_g_walker_cleanup = B_FALSE;
985 mutex_exit(&ndp->ndp_g_lock);
987 if (free_nce_list != NULL) {
988 nce_cleanup_list(free_nce_list);
993 * Walk everything.
994 * Note that ill can be NULL hence can't derive the ipst from it.
996 void
997 ncec_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst)
999 ncec_walk_common(ipst->ips_ndp4, ill, pfi, arg1, B_TRUE);
1000 ncec_walk_common(ipst->ips_ndp6, ill, pfi, arg1, B_TRUE);
1004 * For each interface an entry is added for the unspecified multicast group.
1005 * Here that mapping is used to form the multicast cache entry for a particular
1006 * multicast destination.
1008 static int
1009 nce_set_multicast_v6(ill_t *ill, const in6_addr_t *dst,
1010 uint16_t flags, nce_t **newnce)
1012 uchar_t *hw_addr;
1013 int err = 0;
1014 ip_stack_t *ipst = ill->ill_ipst;
1015 nce_t *nce;
1017 ASSERT(ill != NULL);
1018 ASSERT(ill->ill_isv6);
1019 ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
1021 mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1022 nce = nce_lookup_addr(ill, dst);
1023 if (nce != NULL) {
1024 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1025 goto done;
1027 if (ill->ill_net_type == IRE_IF_RESOLVER) {
1029 * For IRE_IF_RESOLVER a hardware mapping can be
1030 * generated.
1032 hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1033 if (hw_addr == NULL) {
1034 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1035 return (ENOMEM);
1037 ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
1038 } else {
1039 /* No hw_addr is needed for IRE_IF_NORESOLVER. */
1040 hw_addr = NULL;
1042 ASSERT((flags & NCE_F_MCAST) != 0);
1043 ASSERT((flags & NCE_F_NONUD) != 0);
1044 /* nce_state will be computed by nce_add_common() */
1045 err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
1046 ND_UNCHANGED, &nce);
1047 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1048 if (err == 0)
1049 err = nce_add_v6_postprocess(nce);
1050 if (hw_addr != NULL)
1051 kmem_free(hw_addr, ill->ill_nd_lla_len);
1052 if (err != 0) {
1053 ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err));
1054 return (err);
1056 done:
1057 ASSERT(nce->nce_common->ncec_state == ND_REACHABLE);
1058 if (newnce != NULL)
1059 *newnce = nce;
1060 else
1061 nce_refrele(nce);
1062 return (0);
/*
 * ndp_query: copy the link-layer address and NDP flags of the neighbor
 * cache entry matching lnr->lnr_addr into the caller's lif_nd_req.
 * Returns 0 on success; ESRCH if no entry exists or if the entry has
 * no usable link-layer address yet (not reachable).
 * Caller must pass an IPv6 ill (asserted below).
 */
1066 * Return the link layer address, and any flags of a ncec.
1069 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
1071 ncec_t *ncec;
1072 in6_addr_t *addr;
1073 sin6_t *sin6;
1075 ASSERT(ill != NULL && ill->ill_isv6);
/* lnr_addr is a sockaddr_storage; reinterpret as sin6 to get the v6 addr */
1076 sin6 = (sin6_t *)&lnr->lnr_addr;
1077 addr = &sin6->sin6_addr;
1080 * NOTE: if the ill is an IPMP interface, then match against the whole
1081 * illgrp. This e.g. allows in.ndpd to retrieve the link layer
1082 * addresses for the data addresses on an IPMP interface even though
1083 * ipif_ndp_up() created them with an ncec_ill of ipif_bound_ill.
/* Lookup returns a held reference; released on every exit path below. */
1085 ncec = ncec_lookup_illgrp_v6(ill, addr);
1086 if (ncec == NULL)
1087 return (ESRCH);
1088 /* If no link layer address is available yet, return ESRCH */
1089 if (!NCE_ISREACHABLE(ncec)) {
1090 ncec_refrele(ncec);
1091 return (ESRCH);
1093 lnr->lnr_hdw_len = ill->ill_phys_addr_length;
1094 bcopy(ncec->ncec_lladdr, (uchar_t *)&lnr->lnr_hdw_addr,
1095 lnr->lnr_hdw_len);
/*
 * Note the first flag is assigned (=) and the rest are OR'ed (|=):
 * lnr_flags is (re)initialized only when NCE_F_ISROUTER is set, so any
 * pre-existing caller value survives otherwise — NOTE(review): verify
 * callers zero lnr_flags beforehand.
 */
1096 if (ncec->ncec_flags & NCE_F_ISROUTER)
1097 lnr->lnr_flags = NDF_ISROUTER_ON;
1098 if (ncec->ncec_flags & NCE_F_ANYCAST)
1099 lnr->lnr_flags |= NDF_ANYCAST_ON;
1100 if (ncec->ncec_flags & NCE_F_STATIC)
1101 lnr->lnr_flags |= NDF_STATIC;
1102 ncec_refrele(ncec);
1103 return (0);
/*
 * ndp_mcastreq: fill in the hardware (link-layer) multicast address that
 * corresponds to the IP multicast group 'v6group', at the given offset
 * inside message 'mp' (an Enable/Disable multicast request headed to the
 * driver). Handles both native IPv6 groups and IPv4-mapped groups.
 * Returns mp on success; frees mp and returns NULL if the offset/length
 * do not fit inside the message.
 */
1107 * Finish setting up the Enable/Disable multicast for the driver.
1109 mblk_t *
1110 ndp_mcastreq(ill_t *ill, const in6_addr_t *v6group, uint32_t hw_addr_len,
1111 uint32_t hw_addr_offset, mblk_t *mp)
1113 uchar_t *hw_addr;
1114 ipaddr_t v4group;
1115 uchar_t *addr;
1117 ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1118 if (IN6_IS_ADDR_V4MAPPED(v6group)) {
/* IPv4 group carried as a v4-mapped v6 address; extract and sanity-check */
1119 IN6_V4MAPPED_TO_IPADDR(v6group, v4group);
1121 ASSERT(CLASSD(v4group));
1122 ASSERT(!(ill->ill_isv6));
1124 addr = (uchar_t *)&v4group;
1125 } else {
1126 ASSERT(IN6_IS_ADDR_MULTICAST(v6group));
1127 ASSERT(ill->ill_isv6);
1129 addr = (uchar_t *)v6group;
/* Locate the hw-address field inside mp; NULL means out of bounds */
1131 hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
1132 if (hw_addr == NULL) {
1133 ip0dbg(("ndp_mcastreq NULL hw_addr\n"));
1134 freemsg(mp);
1135 return (NULL);
/* Map the IP multicast group to its link-layer multicast address in place */
1138 ip_mcast_mapping(ill, addr, hw_addr);
1139 return (mp);
/*
 * ip_ndp_resolve: drive one round of address resolution for 'ncec',
 * for either IPv4 (ARP) or IPv6 (NDP). Picks a source address/ill via
 * nce_resolve_src(), transmits a solicitation/request, and either
 * restarts the retransmit timer or — when the transmit path says no
 * more retries remain (ms == 0) — declares resolution failed and
 * deletes the entry.
 */
1142 void
1143 ip_ndp_resolve(ncec_t *ncec)
1145 in_addr_t sender4 = INADDR_ANY;
1146 in6_addr_t sender6 = ipv6_all_zeros;
1147 ill_t *src_ill;
1148 uint32_t ms;
1150 src_ill = nce_resolve_src(ncec, &sender6);
1151 if (src_ill == NULL) {
1152 /* Make sure we try again later */
1153 ms = ncec->ncec_ill->ill_reachable_retrans_time;
1154 nce_restart_timer(ncec, (clock_t)ms);
1155 return;
/* For v4, the source chosen above is v4-mapped; unwrap it */
1157 if (ncec->ncec_ipversion == IPV4_VERSION)
1158 IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
1159 mutex_enter(&ncec->ncec_lock);
1160 if (ncec->ncec_ipversion == IPV6_VERSION)
1161 ms = ndp_solicit(ncec, sender6, src_ill);
1162 else
1163 ms = arp_request(ncec, sender4, src_ill);
1164 mutex_exit(&ncec->ncec_lock);
1165 if (ms == 0) {
/*
 * Retransmit budget exhausted. Unless the entry became reachable
 * in the meantime, report failure and tear the entry down.
 */
1166 if (ncec->ncec_state != ND_REACHABLE) {
1167 if (ncec->ncec_ipversion == IPV6_VERSION)
1168 ndp_resolv_failed(ncec);
1169 else
1170 arp_resolv_failed(ncec);
1171 ASSERT((ncec->ncec_flags & NCE_F_STATIC) == 0);
1172 nce_make_unreachable(ncec);
1173 ncec_delete(ncec);
1175 } else {
1176 nce_restart_timer(ncec, (clock_t)ms);
/* Common exit: drop the hold nce_resolve_src() gave us on src_ill */
1178 done:
1179 ill_refrele(src_ill);
1183 * Send an IPv6 neighbor solicitation.
1184 * Returns number of milliseconds after which we should either rexmit or abort.
1185 * Return of zero means we should abort.
1186 * The caller holds the ncec_lock to protect ncec_qd_mp and ncec_rcnt.
1187 * The optional source address is used as a hint to ndp_solicit for
1188 * which source to use in the packet.
1190 * NOTE: This routine drops ncec_lock (and later reacquires it) when sending
1191 * the packet.
1193 uint32_t
1194 ndp_solicit(ncec_t *ncec, in6_addr_t src, ill_t *ill)
1196 in6_addr_t dst;
1197 boolean_t dropped = B_FALSE;
1199 ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
1200 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
/* No retransmits left: tell the caller to abort resolution */
1202 if (ncec->ncec_rcnt == 0)
1203 return (0);
/* Consume one retransmit; charge it back below if the send was dropped */
1205 dst = ncec->ncec_addr;
1206 ncec->ncec_rcnt--;
/* Drop ncec_lock across the (potentially blocking) transmit */
1207 mutex_exit(&ncec->ncec_lock);
1208 dropped = ndp_xmit(ill, ND_NEIGHBOR_SOLICIT, ill->ill_phys_addr,
1209 ill->ill_phys_addr_length, &src, &dst, 0);
1210 mutex_enter(&ncec->ncec_lock);
1211 if (dropped)
1212 ncec->ncec_rcnt++;
/* Caller rearms its timer with the interface's retransmit interval */
1213 return (ncec->ncec_ill->ill_reachable_retrans_time);
1217 * Attempt to recover an address on an interface that's been marked as a
1218 * duplicate. Because NCEs are destroyed when the interface goes down, there's
1219 * no easy way to just probe the address and have the right thing happen if
1220 * it's no longer in use. Instead, we just bring it up normally and allow the
1221 * regular interface start-up logic to probe for a remaining duplicate and take
1222 * us back down if necessary.
1223 * Neither DHCP nor temporary addresses arrive here; they're excluded by
1224 * ip_ndp_excl.
1226 /* ARGSUSED */
1227 void
1228 ip_addr_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1230 ill_t *ill = rq->q_ptr;
1231 ipif_t *ipif;
/*
 * mp carries the address being recovered as raw bytes; interpret it as
 * v6 or v4 depending on ill->ill_isv6 (both aliases point at b_rptr).
 */
1232 in6_addr_t *addr6 = (in6_addr_t *)mp->b_rptr;
1233 in_addr_t *addr4 = (in_addr_t *)mp->b_rptr;
1234 boolean_t addr_equal;
/* Walk every ipif on the ill looking for the duplicate-marked address */
1236 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1238 * We do not support recovery of proxy ARP'd interfaces,
1239 * because the system lacks a complete proxy ARP mechanism.
1241 if (ill->ill_isv6) {
1242 addr_equal = IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
1243 addr6);
1244 } else {
1245 addr_equal = (ipif->ipif_lcl_addr == *addr4);
1248 if ((ipif->ipif_flags & IPIF_POINTOPOINT) || !addr_equal)
1249 continue;
1252 * If we have already recovered or if the interface is going
1253 * away, then ignore.
1255 mutex_enter(&ill->ill_lock);
1256 if (!(ipif->ipif_flags & IPIF_DUPLICATE) ||
1257 (ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1258 mutex_exit(&ill->ill_lock);
1259 continue;
/* Clear the duplicate state but remember we were a dup (ipif_was_dup) */
1262 ipif->ipif_flags &= ~IPIF_DUPLICATE;
1263 ill->ill_ipif_dup_count--;
1264 mutex_exit(&ill->ill_lock);
1265 ipif->ipif_was_dup = B_TRUE;
/* Bring the ipif back up; DAD/ARP probing will re-detect any conflict */
1267 if (ill->ill_isv6) {
1268 VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS);
1269 (void) ipif_up_done_v6(ipif);
1270 } else {
1271 VERIFY(ipif_arp_up(ipif, Res_act_initial, B_TRUE) !=
1272 EINPROGRESS);
1273 (void) ipif_up_done(ipif);
/* The address mblk was a single block allocated by ipif_do_recovery() */
1276 freeb(mp);
1280 * Attempt to recover an IPv6 interface that's been shut down as a duplicate.
1281 * As long as someone else holds the address, the interface will stay down.
1282 * When that conflict goes away, the interface is brought back up. This is
1283 * done so that accidental shutdowns of addresses aren't made permanent. Your
1284 * server will recover from a failure.
1286 * For DHCP and temporary addresses, recovery is not done in the kernel.
1287 * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
1289 * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
1291 void
1292 ipif_dup_recovery(void *arg)
1294 ipif_t *ipif = arg;
/* Our timeout has fired; clear the stored timer id before anything else */
1296 ipif->ipif_recovery_id = 0;
/* Nothing to do if the duplicate condition has already been cleared */
1297 if (!(ipif->ipif_flags & IPIF_DUPLICATE))
1298 return;
1301 * No lock, because this is just an optimization.
1303 if (ipif->ipif_state_flags & IPIF_CONDEMNED)
1304 return;
1306 /* If the link is down, we'll retry this later */
1307 if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
1308 return;
1310 ipif_do_recovery(ipif);
1314 * Perform interface recovery by forcing the duplicate interfaces up and
1315 * allowing the system to determine which ones should stay up.
1317 * Called both by recovery timer expiry and link-up notification.
1319 void
1320 ipif_do_recovery(ipif_t *ipif)
1322 ill_t *ill = ipif->ipif_ill;
1323 mblk_t *mp;
1324 ip_stack_t *ipst = ill->ill_ipst;
1325 size_t mp_size;
/* Allocate an mblk big enough to carry the (v4 or v6) address */
1327 if (ipif->ipif_isv6)
1328 mp_size = sizeof (ipif->ipif_v6lcl_addr);
1329 else
1330 mp_size = sizeof (ipif->ipif_lcl_addr);
1331 mp = allocb(mp_size, BPRI_MED);
1332 if (mp == NULL) {
/*
 * Allocation failed: rather than recovering now, rearm the
 * recovery timer (if enabled and not already pending) so we
 * retry later.
 */
1333 mutex_enter(&ill->ill_lock);
1334 if (ipst->ips_ip_dup_recovery > 0 &&
1335 ipif->ipif_recovery_id == 0 &&
1336 !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1337 ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
1338 ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1340 mutex_exit(&ill->ill_lock);
1341 } else {
1343 * A recovery timer may still be running if we got here from
1344 * ill_restart_dad(); cancel that timer.
1346 if (ipif->ipif_recovery_id != 0)
1347 (void) untimeout(ipif->ipif_recovery_id);
1348 ipif->ipif_recovery_id = 0;
/* Copy the address into the mblk for ip_addr_recover() to consume */
1350 if (ipif->ipif_isv6) {
1351 bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
1352 sizeof (ipif->ipif_v6lcl_addr));
1353 } else {
1354 bcopy(&ipif->ipif_lcl_addr, mp->b_rptr,
1355 sizeof (ipif->ipif_lcl_addr));
/* Hand off to ip_addr_recover as an exclusive (writer) operation */
1357 ill_refhold(ill);
1358 qwriter_ip(ill, ill->ill_rq, mp, ip_addr_recover, NEW_OP,
1359 B_FALSE);
/*
 * ip_ndp_find_addresses: extract from a received NS/NA message the
 * sender's link-layer (MAC) address (via ira_l2src) and the ND target
 * address. Outputs: *targp = target address, *haddr/*haddrlenp = MAC
 * address and length (NULL/0 when the medium has no link-layer address).
 */
1364 * Find the MAC and IP addresses in an NA/NS message.
1366 static void
1367 ip_ndp_find_addresses(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill,
1368 in6_addr_t *targp, uchar_t **haddr, uint_t *haddrlenp)
1370 icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1371 nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
1372 uchar_t *addr;
1373 int alen;
1375 /* icmp_inbound_v6 ensures this */
1376 ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
/* The datalink source address was recorded at receive time */
1378 addr = ira->ira_l2src;
1379 alen = ill->ill_phys_addr_length;
1380 if (alen > 0) {
1381 *haddr = addr;
1382 *haddrlenp = alen;
1383 } else {
1384 *haddr = NULL;
1385 *haddrlenp = 0;
1388 /* nd_ns_target and nd_na_target are at the same offset, so we cheat */
1389 *targp = ns->nd_ns_target;
1393 * This is for exclusive changes due to NDP duplicate address detection
1394 * failure.
/*
 * Runs as an exclusive (qwriter) operation. 'mp' arrives as an attribute
 * mblk (serialized ip_recv_attr_t) with the original packet chained on
 * b_cont; see ndp_failure() for the producer side. Marks the conflicting
 * ipif IPIF_DUPLICATE, takes it down, and optionally arms the recovery
 * timer.
 */
1396 /* ARGSUSED */
1397 static void
1398 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1400 ill_t *ill = rq->q_ptr;
1401 ipif_t *ipif;
1402 uchar_t *haddr;
1403 uint_t haddrlen;
1404 ip_stack_t *ipst = ill->ill_ipst;
1405 in6_addr_t targ;
1406 ip_recv_attr_t iras;
1407 mblk_t *attrmp;
/* Split the chain: attrmp = serialized attributes, mp = the packet */
1409 attrmp = mp;
1410 mp = mp->b_cont;
1411 attrmp->b_cont = NULL;
1412 if (!ip_recv_attr_from_mblk(attrmp, &iras)) {
1413 /* The ill or ip_stack_t disappeared on us */
1414 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1415 ip_drop_input("ip_recv_attr_from_mblk", mp, ill);
1416 freemsg(mp);
1417 ira_cleanup(&iras, B_TRUE);
1418 return;
1421 ASSERT(ill == iras.ira_rill);
1423 ip_ndp_find_addresses(mp, &iras, ill, &targ, &haddr, &haddrlen);
1424 if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) {
1426 * Ignore conflicts generated by misbehaving switches that
1427 * just reflect our own messages back to us. For IPMP, we may
1428 * see reflections across any ill in the illgrp.
1430 * RFC2462 and revisions tried to detect both the case
1431 * when a statically configured IPv6 address is a duplicate,
1432 * and the case when the L2 address itself is a duplicate. The
1433 * latter is important because, with stateless address autoconf,
1434 * if the L2 address is a duplicate, the resulting IPv6
1435 * address(es) would also be duplicates. We rely on DAD of the
1436 * IPv6 address itself to detect the latter case.
1438 /* For an under ill_grp can change under lock */
1439 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1440 if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 ||
1441 IS_UNDER_IPMP(ill) &&
1442 ipmp_illgrp_find_ill(ill->ill_grp, haddr,
1443 haddrlen) != NULL) {
1444 rw_exit(&ipst->ips_ill_g_lock);
1445 goto ignore_conflict;
1447 rw_exit(&ipst->ips_ill_g_lock);
1451 * Look up the appropriate ipif.
1453 ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, ipst);
1454 if (ipif == NULL)
1455 goto ignore_conflict;
1457 /* Reload the ill to match the ipif */
1458 ill = ipif->ipif_ill;
1460 /* If it's already duplicate or ineligible, then don't do anything. */
1461 if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) {
1462 ipif_refrele(ipif);
1463 goto ignore_conflict;
1467 * If this is a failure during duplicate recovery, then don't
1468 * complain. It may take a long time to recover.
1470 if (!ipif->ipif_was_dup) {
1471 char ibuf[LIFNAMSIZ];
1472 char hbuf[MAC_STR_LEN];
1473 char sbuf[INET6_ADDRSTRLEN];
1475 ipif_get_name(ipif, ibuf, sizeof (ibuf));
1476 cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);"
1477 " disabled", ibuf,
1478 inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)),
1479 mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)));
/* Mark the ipif as duplicate and take it down */
1481 mutex_enter(&ill->ill_lock);
1482 ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
1483 ipif->ipif_flags |= IPIF_DUPLICATE;
1484 ill->ill_ipif_dup_count++;
1485 mutex_exit(&ill->ill_lock);
1486 (void) ipif_down(ipif, NULL, NULL);
1487 (void) ipif_down_tail(ipif);
1488 mutex_enter(&ill->ill_lock);
/*
 * Kernel-driven recovery applies only to static addresses on
 * resolver interfaces (DHCP/temporary addresses are handled in
 * user space) and only if the recovery interval is configured.
 */
1489 if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
1490 ill->ill_net_type == IRE_IF_RESOLVER &&
1491 !(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
1492 ipst->ips_ip_dup_recovery > 0) {
1493 ASSERT(ipif->ipif_recovery_id == 0);
1494 ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
1495 ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1497 mutex_exit(&ill->ill_lock);
1498 ipif_refrele(ipif);
1500 ignore_conflict:
1501 freemsg(mp);
1502 ira_cleanup(&iras, B_TRUE);
1506 * Handle failure by tearing down the ipifs with the specified address. Note
1507 * that tearing down the ipif also means deleting the ncec through ipif_down, so
1508 * it's not possible to do recovery by just restarting the ncec timer. Instead,
1509 * we start a timer on the ipif.
1510 * Caller has to free mp;
/*
 * Copies the offending packet, prepends the serialized receive
 * attributes, and queues the pair to ip_ndp_excl() as an exclusive
 * operation. Allocation failures fall through silently (best effort).
 */
1512 static void
1513 ndp_failure(mblk_t *mp, ip_recv_attr_t *ira)
1515 const uchar_t *haddr;
1516 ill_t *ill = ira->ira_rill;
1519 * Ignore conflicts generated by misbehaving switches that just
1520 * reflect our own messages back to us.
1523 /* icmp_inbound_v6 ensures this */
1524 ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
1525 haddr = ira->ira_l2src;
1526 if (haddr != NULL &&
1527 bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) {
1528 return;
/* Work on a copy; the caller still owns (and frees) the original mp */
1531 if ((mp = copymsg(mp)) != NULL) {
1532 mblk_t *attrmp;
1534 attrmp = ip_recv_attr_to_mblk(ira);
1535 if (attrmp == NULL) {
1536 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1537 ip_drop_input("ipIfStatsInDiscards", mp, ill);
1538 freemsg(mp);
1539 } else {
/* Chain: attributes first, packet on b_cont — ip_ndp_excl() unsplits */
1540 ASSERT(attrmp->b_cont == NULL);
1541 attrmp->b_cont = mp;
1542 mp = attrmp;
1543 ill_refhold(ill);
1544 qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_excl, NEW_OP,
1545 B_FALSE);
1551 * Handle a discovered conflict: some other system is advertising that it owns
1552 * one of our IP addresses. We need to defend ourselves, or just shut down the
1553 * interface.
1555 * Handles both IPv4 and IPv6
/*
 * Returns B_TRUE when the caller must defend the address (announce it);
 * B_FALSE when nothing further is needed (no matching ipif, or we gave
 * up and tore the interface down via ndp_failure()/arp_failure()).
 */
1557 boolean_t
1558 ip_nce_conflict(mblk_t *mp, ip_recv_attr_t *ira, ncec_t *ncec)
1560 ipif_t *ipif;
1561 clock_t now;
1562 uint_t maxdefense;
1563 uint_t defs;
1564 ill_t *ill = ira->ira_ill;
1565 ip_stack_t *ipst = ill->ill_ipst;
1566 uint32_t elapsed;
1567 boolean_t isv6 = ill->ill_isv6;
1568 ipaddr_t ncec_addr;
1570 if (isv6) {
1571 ipif = ipif_lookup_addr_v6(&ncec->ncec_addr, ill, ALL_ZONES,
1572 ipst);
1573 } else {
/* arp_no_defense is a global tunable disabling IPv4 defense entirely */
1574 if (arp_no_defense) {
1576 * Yes, there is a conflict, but no, we do not
1577 * defend ourself.
1579 return (B_TRUE);
1581 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
1582 ipif = ipif_lookup_addr(ncec_addr, ill, ALL_ZONES,
1583 ipst);
1585 if (ipif == NULL)
1586 return (B_FALSE);
1589 * First, figure out if this address is disposable.
1591 if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY))
1592 maxdefense = ipst->ips_ip_max_temp_defend;
1593 else
1594 maxdefense = ipst->ips_ip_max_defend;
1597 * Now figure out how many times we've defended ourselves. Ignore
1598 * defenses that happened long in the past.
/* elapsed = seconds since the last defense (lbolt ticks -> usec -> sec) */
1600 now = ddi_get_lbolt();
1601 elapsed = (drv_hztousec(now - ncec->ncec_last_time_defended))/1000000;
1602 mutex_enter(&ncec->ncec_lock);
1603 if ((defs = ncec->ncec_defense_count) > 0 &&
1604 elapsed > ipst->ips_ip_defend_interval) {
1606 * ip_defend_interval has elapsed.
1607 * reset the defense count.
1609 ncec->ncec_defense_count = defs = 0;
1611 ncec->ncec_defense_count++;
1612 ncec->ncec_last_time_defended = now;
1613 mutex_exit(&ncec->ncec_lock);
1614 ipif_refrele(ipif);
1617 * If we've defended ourselves too many times already, then give up and
1618 * tear down the interface(s) using this address.
1619 * Otherwise, caller has to defend by sending out an announce.
1621 if (defs >= maxdefense) {
1622 if (isv6)
1623 ndp_failure(mp, ira);
1624 else
1625 arp_failure(mp, ira);
1626 } else {
1627 return (B_TRUE); /* caller must defend this address */
1629 return (B_FALSE);
1633 * Handle reception of Neighbor Solicitation messages.
/*
 * Validates the NS per RFC 4861 (martian target, option lengths, DAD
 * rules), records/updates the soliciting node's NCE where appropriate,
 * and transmits a Neighbor Advertisement in response. Bad packets bump
 * ipv6IfIcmpInBadNeighborSolicitations. mp is freed by the caller
 * (ndp_input), not here.
 */
1635 static void
1636 ndp_input_solicit(mblk_t *mp, ip_recv_attr_t *ira)
1638 ill_t *ill = ira->ira_ill, *under_ill;
1639 nd_neighbor_solicit_t *ns;
1640 uint32_t hlen = ill->ill_phys_addr_length;
1641 uchar_t *haddr = NULL;
1642 icmp6_t *icmp_nd;
1643 ip6_t *ip6h;
1644 ncec_t *our_ncec = NULL;
1645 in6_addr_t target;
1646 in6_addr_t src;
1647 int len;
1648 int flag = 0;
1649 nd_opt_hdr_t *opt = NULL;
1650 boolean_t bad_solicit = B_FALSE;
1651 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib;
1652 boolean_t need_ill_refrele = B_FALSE;
1654 ip6h = (ip6_t *)mp->b_rptr;
1655 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1656 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1657 src = ip6h->ip6_src;
1658 ns = (nd_neighbor_solicit_t *)icmp_nd;
1659 target = ns->nd_ns_target;
/* RFC 4861 7.1.1: the target must be a unicast/anycast address */
1660 if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) ||
1661 IN6_IS_ADDR_LOOPBACK(&target)) {
1662 if (ip_debug > 2) {
1663 /* ip1dbg */
1664 pr_addr_dbg("ndp_input_solicit: Martian Target %s\n",
1665 AF_INET6, &target);
1667 bad_solicit = B_TRUE;
1668 goto done;
1670 if (len > sizeof (nd_neighbor_solicit_t)) {
1671 /* Options present */
1672 opt = (nd_opt_hdr_t *)&ns[1];
1673 len -= sizeof (nd_neighbor_solicit_t);
1674 if (!ndp_verify_optlen(opt, len)) {
1675 ip1dbg(("ndp_input_solicit: Bad opt len\n"));
1676 bad_solicit = B_TRUE;
1677 goto done;
1680 if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
1681 /* Check to see if this is a valid DAD solicitation */
1682 if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
1683 if (ip_debug > 2) {
1684 /* ip1dbg */
1685 pr_addr_dbg("ndp_input_solicit: IPv6 "
1686 "Destination is not solicited node "
1687 "multicast %s\n", AF_INET6,
1688 &ip6h->ip6_dst);
1690 bad_solicit = B_TRUE;
1691 goto done;
1696 * NOTE: with IPMP, it's possible the nominated multicast ill (which
1697 * received this packet if it's multicast) is not the ill tied to
1698 * e.g. the IPMP ill's data link-local. So we match across the illgrp
1699 * to ensure we find the associated NCE.
1701 our_ncec = ncec_lookup_illgrp_v6(ill, &target);
1703 * If this is a valid Solicitation for an address we are publishing,
1704 * then a PUBLISH entry should exist in the cache
1706 if (our_ncec == NULL || !NCE_PUBLISH(our_ncec)) {
1707 ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
1708 "ifname=%s ", ill->ill_name));
1709 if (ip_debug > 2) {
1710 /* ip1dbg */
1711 pr_addr_dbg(" dst %s\n", AF_INET6, &target);
1713 if (our_ncec == NULL)
1714 bad_solicit = B_TRUE;
1715 goto done;
1718 /* At this point we should have a verified NS per spec */
1719 if (opt != NULL) {
1720 opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
1721 if (opt != NULL) {
1722 haddr = (uchar_t *)&opt[1];
/*
 * SLLA option must be long enough to hold the medium's
 * link-layer address. NOTE(review): the debug string on the
 * next ip1dbg says "ndp_input_advert" but this is the
 * solicit path — message text looks copy-pasted.
 */
1723 if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1724 hlen == 0) {
1725 ip1dbg(("ndp_input_advert: bad SLLA\n"));
1726 bad_solicit = B_TRUE;
1727 goto done;
1732 /* If sending directly to peer, set the unicast flag */
1733 if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
1734 flag |= NDP_UNICAST;
1737 * Create/update the entry for the soliciting node on the ipmp_ill.
1738 * or respond to outstanding queries, don't if
1739 * the source is unspecified address.
1741 if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1742 int err;
1743 nce_t *nnce;
1745 ASSERT(ill->ill_isv6);
1747 * Regular solicitations *must* include the Source Link-Layer
1748 * Address option. Ignore messages that do not.
1750 if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
1751 ip1dbg(("ndp_input_solicit: source link-layer address "
1752 "option missing with a specified source.\n"));
1753 bad_solicit = B_TRUE;
1754 goto done;
1758 * This is a regular solicitation. If we're still in the
1759 * process of verifying the address, then don't respond at all
1760 * and don't keep track of the sender.
1762 if (our_ncec->ncec_state == ND_PROBE)
1763 goto done;
1766 * If the solicitation doesn't have sender hardware address
1767 * (legal for unicast solicitation), then process without
1768 * installing the return NCE. Either we already know it, or
1769 * we'll be forced to look it up when (and if) we reply to the
1770 * packet.
1772 if (haddr == NULL)
1773 goto no_source;
/* On an IPMP under-ill, install the sender's NCE on the ipmp ill */
1775 under_ill = ill;
1776 if (IS_UNDER_IPMP(under_ill)) {
1777 ill = ipmp_ill_hold_ipmp_ill(under_ill);
1778 if (ill == NULL)
1779 ill = under_ill;
1780 else
1781 need_ill_refrele = B_TRUE;
1783 err = nce_lookup_then_add_v6(ill,
1784 haddr, hlen,
1785 &src, /* Soliciting nodes address */
1787 ND_STALE,
1788 &nnce);
1790 if (need_ill_refrele) {
1791 ill_refrele(ill);
1792 ill = under_ill;
1793 need_ill_refrele = B_FALSE;
1795 switch (err) {
1796 case 0:
1797 /* done with this entry */
1798 nce_refrele(nnce);
1799 break;
1800 case EEXIST:
1802 * B_FALSE indicates this is not an advertisement.
1804 nce_process(nnce->nce_common, haddr, 0, B_FALSE);
1805 nce_refrele(nnce);
1806 break;
1807 default:
1808 ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
1809 err));
1810 goto done;
1812 no_source:
1813 flag |= NDP_SOLICITED;
1814 } else {
1816 * No source link layer address option should be present in a
1817 * valid DAD request.
1819 if (haddr != NULL) {
1820 ip1dbg(("ndp_input_solicit: source link-layer address "
1821 "option present with an unspecified source.\n"));
1822 bad_solicit = B_TRUE;
1823 goto done;
1825 if (our_ncec->ncec_state == ND_PROBE) {
1827 * Internally looped-back probes will have
1828 * IRAF_L2SRC_LOOPBACK set so we can ignore our own
1829 * transmissions.
1831 if (!(ira->ira_flags & IRAF_L2SRC_LOOPBACK)) {
1833 * If someone else is probing our address, then
1834 * we've crossed wires. Declare failure.
1836 ndp_failure(mp, ira);
1838 goto done;
1841 * This is a DAD probe. Multicast the advertisement to the
1842 * all-nodes address.
1844 src = ipv6_all_hosts_mcast;
1846 flag |= nce_advert_flags(our_ncec);
1847 (void) ndp_xmit(ill,
1848 ND_NEIGHBOR_ADVERT,
1849 our_ncec->ncec_lladdr,
1850 our_ncec->ncec_lladdr_length,
1851 &target, /* Source and target of the advertisement pkt */
1852 &src, /* IP Destination (source of original pkt) */
1853 flag);
1854 done:
1855 if (bad_solicit)
1856 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
1857 if (our_ncec != NULL)
1858 ncec_refrele(our_ncec);
1862 * Handle reception of Neighbor Advertisement messages
/*
 * (The original one-line summary above says "Solicitation"; this routine
 * in fact processes Neighbor Advertisements.) Validates the NA per
 * RFC 4861, detects conflicts with addresses we publish (DAD failure or
 * address defense per RFC 4862), and otherwise feeds the advertisement
 * into nce_process() to update the neighbor cache.
 */
1864 void
1865 ndp_input_advert(mblk_t *mp, ip_recv_attr_t *ira)
1867 ill_t *ill = ira->ira_ill;
1868 nd_neighbor_advert_t *na;
1869 uint32_t hlen = ill->ill_phys_addr_length;
1870 uchar_t *haddr = NULL;
1871 icmp6_t *icmp_nd;
1872 ip6_t *ip6h;
1873 ncec_t *dst_ncec = NULL;
1874 in6_addr_t target;
1875 nd_opt_hdr_t *opt = NULL;
1876 int len;
1877 ip_stack_t *ipst = ill->ill_ipst;
1878 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib;
1880 ip6h = (ip6_t *)mp->b_rptr;
1881 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1882 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1883 na = (nd_neighbor_advert_t *)icmp_nd;
/* RFC 4861 7.1.2: solicited NAs must not be sent to a multicast dest */
1885 if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
1886 (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
1887 ip1dbg(("ndp_input_advert: Target is multicast but the "
1888 "solicited flag is not zero\n"));
1889 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1890 return;
1892 target = na->nd_na_target;
/*
 * Martian target check. NOTE(review): the pr_addr_dbg string below
 * says "ndp_input_solicit" but this is the advert path — message text
 * looks copy-pasted.
 */
1893 if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) ||
1894 IN6_IS_ADDR_LOOPBACK(&target)) {
1895 if (ip_debug > 2) {
1896 /* ip1dbg */
1897 pr_addr_dbg("ndp_input_solicit: Martian Target %s\n",
1898 AF_INET6, &target);
1900 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1901 return;
1903 if (len > sizeof (nd_neighbor_advert_t)) {
1904 opt = (nd_opt_hdr_t *)&na[1];
1905 if (!ndp_verify_optlen(opt,
1906 len - sizeof (nd_neighbor_advert_t))) {
1907 ip1dbg(("ndp_input_advert: cannot verify SLLA\n"));
1908 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1909 return;
1911 /* At this point we have a verified NA per spec */
1912 len -= sizeof (nd_neighbor_advert_t);
1913 opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
1914 if (opt != NULL) {
1915 haddr = (uchar_t *)&opt[1];
1916 if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1917 hlen == 0) {
1918 ip1dbg(("ndp_input_advert: bad SLLA\n"));
1919 BUMP_MIB(mib,
1920 ipv6IfIcmpInBadNeighborAdvertisements);
1921 return;
1927 * NOTE: we match across the illgrp since we need to do DAD for all of
1928 * our local addresses, and those are spread across all the active
1929 * ills in the group.
1931 if ((dst_ncec = ncec_lookup_illgrp_v6(ill, &target)) == NULL)
1932 return;
1934 if (NCE_PUBLISH(dst_ncec)) {
1936 * Someone just advertised an address that we publish. First,
1937 * check if it was us -- if so, we can safely ignore it.
1938 * We don't get the haddr from the ira_l2src because, in the
1939 * case that the packet originated from us, on an IPMP group,
1940 * the ira_l2src would be the link-layer address of the
1941 * cast_ill used to send the packet, which may not be the same
1942 * as the dst_ncec->ncec_lladdr of the address.
1944 if (haddr != NULL) {
1945 if (ira->ira_flags & IRAF_L2SRC_LOOPBACK)
1946 goto out;
1948 if (!nce_cmp_ll_addr(dst_ncec, haddr, hlen))
1949 goto out; /* from us -- no conflict */
1952 * If we're in an IPMP group, check if this is an echo
1953 * from another ill in the group. Use the double-
1954 * checked locking pattern to avoid grabbing
1955 * ill_g_lock in the non-IPMP case.
1957 if (IS_UNDER_IPMP(ill)) {
1958 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1959 if (IS_UNDER_IPMP(ill) && ipmp_illgrp_find_ill(
1960 ill->ill_grp, haddr, hlen) != NULL) {
1961 rw_exit(&ipst->ips_ill_g_lock);
1962 goto out;
1964 rw_exit(&ipst->ips_ill_g_lock);
1969 * This appears to be a real conflict. If we're trying to
1970 * configure this NCE (ND_PROBE), then shut it down.
1971 * Otherwise, handle the discovered conflict.
1973 if (dst_ncec->ncec_state == ND_PROBE) {
1974 ndp_failure(mp, ira);
1975 } else {
1976 if (ip_nce_conflict(mp, ira, dst_ncec)) {
1977 char hbuf[MAC_STR_LEN];
1978 char sbuf[INET6_ADDRSTRLEN];
1980 cmn_err(CE_WARN,
1981 "node '%s' is using %s on %s",
1982 inet_ntop(AF_INET6, &target, sbuf,
1983 sizeof (sbuf)),
1984 haddr == NULL ? "<none>" :
1985 mac_colon_addr(haddr, hlen, hbuf,
1986 sizeof (hbuf)), ill->ill_name);
1988 * RFC 4862, Section 5.4.4 does not mandate
1989 * any specific behavior when an NA matches
1990 * a non-tentative address assigned to the
1991 * receiver. We make the choice of defending
1992 * our address, based on the assumption that
1993 * the sender has not detected the Duplicate.
1995 * ncec_last_time_defended has been adjusted
1996 * in ip_nce_conflict()
1998 (void) ndp_announce(dst_ncec);
2001 } else {
/* Ordinary advertisement for a neighbor we track: update the cache */
2002 if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER)
2003 dst_ncec->ncec_flags |= NCE_F_ISROUTER;
2005 /* B_TRUE indicates this an advertisement */
2006 nce_process(dst_ncec, haddr, na->nd_na_flags_reserved, B_TRUE);
2008 out:
2009 ncec_refrele(dst_ncec);
2013 * Process NDP neighbor solicitation/advertisement messages.
2014 * The checksum has already checked o.k before reaching here.
2015 * Information about the datalink header is contained in ira_l2src, but
2016 * that should be ignored for loopback packets.
/*
 * Common entry point for inbound NS/NA: performs the RFC 4861 sanity
 * checks shared by both message types (hop limit 255, no extension
 * headers, code 0, minimum length), then dispatches to
 * ndp_input_solicit() or ndp_input_advert(). Always frees mp.
 */
2018 void
2019 ndp_input(mblk_t *mp, ip_recv_attr_t *ira)
2021 ill_t *ill = ira->ira_rill;
2022 icmp6_t *icmp_nd;
2023 ip6_t *ip6h;
2024 int len;
2025 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib;
2026 ill_t *orig_ill = NULL;
2029 * Since ira_ill is where the IRE_LOCAL was hosted we use ira_rill
2030 * and make it be the IPMP upper so avoid being confused by a packet
2031 * addressed to a unicast address on a different ill.
2033 if (IS_UNDER_IPMP(ill)) {
2034 orig_ill = ill;
2035 ill = ipmp_ill_hold_ipmp_ill(orig_ill);
2036 if (ill == NULL) {
2037 ill = orig_ill;
2038 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2039 ip_drop_input("ipIfStatsInDiscards - IPMP ill",
2040 mp, ill);
2041 freemsg(mp);
2042 return;
2044 ASSERT(ill != orig_ill);
/* Swap ira_ill to the IPMP upper for the duration; restored at done: */
2045 orig_ill = ira->ira_ill;
2046 ira->ira_ill = ill;
2047 mib = ill->ill_icmp6_mib;
/* Linearize the message so header fields can be read directly */
2049 if (!pullupmsg(mp, -1)) {
2050 ip1dbg(("ndp_input: pullupmsg failed\n"));
2051 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2052 ip_drop_input("ipIfStatsInDiscards - pullupmsg", mp, ill);
2053 goto done;
2055 ip6h = (ip6_t *)mp->b_rptr;
/* RFC 4861: hop limit must be 255, proving the packet was not routed */
2056 if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
2057 ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
2058 ip_drop_input("ipv6IfIcmpBadHoplimit", mp, ill);
2059 BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
2060 goto done;
2063 * NDP does not accept any extension headers between the
2064 * IP header and the ICMP header since e.g. a routing
2065 * header could be dangerous.
2066 * This assumes that any AH or ESP headers are removed
2067 * by ip prior to passing the packet to ndp_input.
2069 if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2070 ip1dbg(("ndp_input: Wrong next header 0x%x\n",
2071 ip6h->ip6_nxt));
2072 ip_drop_input("Wrong next header", mp, ill);
2073 BUMP_MIB(mib, ipv6IfIcmpInErrors);
2074 goto done;
2076 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2077 ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
2078 icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
2079 if (icmp_nd->icmp6_code != 0) {
2080 ip1dbg(("ndp_input: icmp6 code != 0 \n"));
2081 ip_drop_input("code non-zero", mp, ill);
2082 BUMP_MIB(mib, ipv6IfIcmpInErrors);
2083 goto done;
2085 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2087 * Make sure packet length is large enough for either
2088 * a NS or a NA icmp packet.
2090 if (len < sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
2091 ip1dbg(("ndp_input: packet too short\n"));
2092 ip_drop_input("packet too short", mp, ill);
2093 BUMP_MIB(mib, ipv6IfIcmpInErrors);
2094 goto done;
2096 if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
2097 ndp_input_solicit(mp, ira);
2098 } else {
2099 ndp_input_advert(mp, ira);
2101 done:
2102 freemsg(mp);
2103 if (orig_ill != NULL) {
2104 ill_refrele(ill);
2105 ira->ira_ill = orig_ill;
2110 * ndp_xmit is called to form and transmit a ND solicitation or
2111 * advertisement ICMP packet.
2113 * If the source address is unspecified and this isn't a probe (used for
2114 * duplicate address detection), an appropriate source address and link layer
2115 * address will be chosen here. The link layer address option is included if
2116 * the source is specified (i.e., all non-probe packets), and omitted (per the
2117 * specification) otherwise.
2119 * It returns B_FALSE only if it does a successful put() to the
2120 * corresponding ill's ill_wq otherwise returns B_TRUE.
2122 static boolean_t
2123 ndp_xmit(ill_t *ill, uint32_t operation, uint8_t *hw_addr, uint_t hw_addr_len,
2124 const in6_addr_t *sender, const in6_addr_t *target, int flag)
2126 uint32_t len;
2127 icmp6_t *icmp6;
2128 mblk_t *mp;
2129 ip6_t *ip6h;
2130 nd_opt_hdr_t *opt;
2131 uint_t plen;
2132 zoneid_t zoneid = GLOBAL_ZONEID;
2133 ill_t *hwaddr_ill = ill;
2134 ip_xmit_attr_t ixas;
2135 ip_stack_t *ipst = ill->ill_ipst;
2136 boolean_t need_refrele = B_FALSE;
2137 boolean_t probe = B_FALSE;
2139 if (IS_UNDER_IPMP(ill)) {
2140 probe = ipif_lookup_testaddr_v6(ill, sender, NULL);
2142 * We send non-probe packets on the upper IPMP interface.
2143 * ip_output_simple() will use cast_ill for sending any
2144 * multicast packets. Note that we can't follow the same
2145 * logic for probe packets because all interfaces in the ipmp
2146 * group may have failed, so that we really want to only try
2147 * to send the ND packet on the ill corresponding to the src
2148 * address.
2150 if (!probe) {
2151 ill = ipmp_ill_hold_ipmp_ill(ill);
2152 if (ill != NULL)
2153 need_refrele = B_TRUE;
2154 else
2155 ill = hwaddr_ill;
2160 * If we have a unspecified source(sender) address, select a
2161 * proper source address for the solicitation here itself so
2162 * that we can initialize the h/w address correctly.
2164 * If the sender is specified then we use this address in order
2165 * to lookup the zoneid before calling ip_output_v6(). This is to
2166 * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly
2167 * by IP (we cannot guarantee that the global zone has an interface
2168 * route to the destination).
2170 * Note that the NA never comes here with the unspecified source
2171 * address.
2175 * Probes will have unspec src at this point.
2177 if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) {
2178 zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ipst);
2180 * It's possible for ipif_lookup_addr_zoneid_v6() to return
2181 * ALL_ZONES if it cannot find a matching ipif for the address
2182 * we are trying to use. In this case we err on the side of
2183 * trying to send the packet by defaulting to the GLOBAL_ZONEID.
2185 if (zoneid == ALL_ZONES)
2186 zoneid = GLOBAL_ZONEID;
2189 plen = (sizeof (nd_opt_hdr_t) + hw_addr_len + 7) / 8;
2190 len = IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t) + plen * 8;
2191 mp = allocb(len, BPRI_LO);
2192 if (mp == NULL) {
2193 if (need_refrele)
2194 ill_refrele(ill);
2195 return (B_TRUE);
2198 bzero((char *)mp->b_rptr, len);
2199 mp->b_wptr = mp->b_rptr + len;
2201 bzero(&ixas, sizeof (ixas));
2202 ixas.ixa_flags = IXAF_SET_ULP_CKSUM | IXAF_NO_HW_CKSUM;
2204 ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex;
2205 ixas.ixa_ipst = ipst;
2206 ixas.ixa_cred = kcred;
2207 ixas.ixa_cpid = NOPID;
2208 ixas.ixa_zoneid = zoneid;
2210 ip6h = (ip6_t *)mp->b_rptr;
2211 ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2212 ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2213 ip6h->ip6_nxt = IPPROTO_ICMPV6;
2214 ip6h->ip6_hops = IPV6_MAX_HOPS;
2215 ixas.ixa_multicast_ttl = ip6h->ip6_hops;
2216 ip6h->ip6_dst = *target;
2217 icmp6 = (icmp6_t *)&ip6h[1];
2219 if (hw_addr_len != 0) {
2220 opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
2221 sizeof (nd_neighbor_advert_t));
2222 } else {
2223 opt = NULL;
2225 if (operation == ND_NEIGHBOR_SOLICIT) {
2226 nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
2228 if (opt != NULL && !(flag & NDP_PROBE)) {
2230 * Note that we don't send out SLLA for ND probes
2231 * per RFC 4862, even though we do send out the src
2232 * haddr for IPv4 DAD probes, even though both IPv4
2233 * and IPv6 go out with the unspecified/INADDR_ANY
2234 * src IP addr.
2236 opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
2238 ip6h->ip6_src = *sender;
2239 ns->nd_ns_target = *target;
2240 if (!(flag & NDP_UNICAST)) {
2241 /* Form multicast address of the target */
2242 ip6h->ip6_dst = ipv6_solicited_node_mcast;
2243 ip6h->ip6_dst.s6_addr32[3] |=
2244 ns->nd_ns_target.s6_addr32[3];
2246 } else {
2247 nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
2249 ASSERT(!(flag & NDP_PROBE));
2250 if (opt != NULL)
2251 opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
2252 ip6h->ip6_src = *sender;
2253 na->nd_na_target = *sender;
2254 if (flag & NDP_ISROUTER)
2255 na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
2256 if (flag & NDP_SOLICITED)
2257 na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
2258 if (flag & NDP_ORIDE)
2259 na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
2262 if (!(flag & NDP_PROBE)) {
2263 if (hw_addr != NULL && opt != NULL) {
2264 /* Fill in link layer address and option len */
2265 opt->nd_opt_len = (uint8_t)plen;
2266 bcopy(hw_addr, &opt[1], hw_addr_len);
2269 if (opt != NULL && opt->nd_opt_type == 0) {
2270 /* If there's no link layer address option, then strip it. */
2271 len -= plen * 8;
2272 mp->b_wptr = mp->b_rptr + len;
2273 ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2276 icmp6->icmp6_type = (uint8_t)operation;
2277 icmp6->icmp6_code = 0;
2279 * Prepare for checksum by putting icmp length in the icmp
2280 * checksum field. The checksum is calculated in ip_output.c.
2282 icmp6->icmp6_cksum = ip6h->ip6_plen;
2284 (void) ip_output_simple(mp, &ixas);
2285 ixa_cleanup(&ixas);
2286 if (need_refrele)
2287 ill_refrele(ill);
2288 return (B_FALSE);
2292 * Used to set ND_UNREACHBLE before ncec_delete sets it NCE_F_CONDEMNED.
2293 * The datapath uses this as an indication that there
2294 * is a problem (as opposed to a NCE that was just
2295 * reclaimed due to lack of memory.
2296 * Note that static ARP entries never become unreachable.
2298 void
2299 nce_make_unreachable(ncec_t *ncec)
2301 mutex_enter(&ncec->ncec_lock);
2302 ncec->ncec_state = ND_UNREACHABLE;
2303 mutex_exit(&ncec->ncec_lock);
2307 * NCE retransmit timer. Common to IPv4 and IPv6.
2308 * This timer goes off when:
2309 * a. It is time to retransmit a resolution for resolver.
2310 * b. It is time to send reachability probes.
2312 void
2313 nce_timer(void *arg)
2315 ncec_t *ncec = arg;
2316 ill_t *ill = ncec->ncec_ill, *src_ill;
2317 char addrbuf[INET6_ADDRSTRLEN];
2318 boolean_t dropped = B_FALSE;
2319 ip_stack_t *ipst = ncec->ncec_ipst;
2320 boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
2321 in_addr_t sender4 = INADDR_ANY;
2322 in6_addr_t sender6 = ipv6_all_zeros;
2325 * The timer has to be cancelled by ncec_delete before doing the final
2326 * refrele. So the NCE is guaranteed to exist when the timer runs
2327 * until it clears the timeout_id. Before clearing the timeout_id
2328 * bump up the refcnt so that we can continue to use the ncec
2330 ASSERT(ncec != NULL);
2331 mutex_enter(&ncec->ncec_lock);
2332 ncec_refhold_locked(ncec);
2333 ncec->ncec_timeout_id = 0;
2334 mutex_exit(&ncec->ncec_lock);
2336 src_ill = nce_resolve_src(ncec, &sender6);
2337 /* if we could not find a sender address, return */
2338 if (src_ill == NULL) {
2339 if (!isv6) {
2340 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, sender4);
2341 ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET,
2342 &sender4, addrbuf, sizeof (addrbuf))));
2343 } else {
2344 ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET6,
2345 &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2347 nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
2348 ncec_refrele(ncec);
2349 return;
2351 if (!isv6)
2352 IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
2354 mutex_enter(&ncec->ncec_lock);
2356 * Check the reachability state.
2358 switch (ncec->ncec_state) {
2359 case ND_DELAY:
2360 ASSERT(ncec->ncec_lladdr != NULL);
2361 ncec->ncec_state = ND_PROBE;
2362 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
2363 if (isv6) {
2364 mutex_exit(&ncec->ncec_lock);
2365 dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
2366 src_ill->ill_phys_addr,
2367 src_ill->ill_phys_addr_length,
2368 &sender6, &ncec->ncec_addr,
2369 NDP_UNICAST);
2370 } else {
2371 dropped = (arp_request(ncec, sender4, src_ill) == 0);
2372 mutex_exit(&ncec->ncec_lock);
2374 if (!dropped) {
2375 mutex_enter(&ncec->ncec_lock);
2376 ncec->ncec_pcnt--;
2377 mutex_exit(&ncec->ncec_lock);
2379 if (ip_debug > 3) {
2380 /* ip2dbg */
2381 pr_addr_dbg("nce_timer: state for %s changed "
2382 "to PROBE\n", AF_INET6, &ncec->ncec_addr);
2384 nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
2385 break;
2386 case ND_PROBE:
2387 /* must be retransmit timer */
2388 ASSERT(ncec->ncec_pcnt >= -1);
2389 if (ncec->ncec_pcnt > 0) {
2391 * As per RFC2461, the ncec gets deleted after
2392 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
2393 * Note that the first unicast solicitation is sent
2394 * during the DELAY state.
2396 ip2dbg(("nce_timer: pcount=%x dst %s\n",
2397 ncec->ncec_pcnt,
2398 inet_ntop((isv6? AF_INET6 : AF_INET),
2399 &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2400 if (NCE_PUBLISH(ncec)) {
2401 mutex_exit(&ncec->ncec_lock);
2403 * send out a probe; note that src_ill
2404 * is ignored by nce_dad() for all
2405 * DAD message types other than IPv6
2406 * unicast probes
2408 nce_dad(ncec, src_ill, B_TRUE);
2409 } else {
2410 ASSERT(src_ill != NULL);
2411 if (isv6) {
2412 mutex_exit(&ncec->ncec_lock);
2413 dropped = ndp_xmit(src_ill,
2414 ND_NEIGHBOR_SOLICIT,
2415 src_ill->ill_phys_addr,
2416 src_ill->ill_phys_addr_length,
2417 &sender6, &ncec->ncec_addr,
2418 NDP_UNICAST);
2419 } else {
2421 * since the nce is REACHABLE,
2422 * the ARP request will be sent out
2423 * as a link-layer unicast.
2425 dropped = (arp_request(ncec, sender4,
2426 src_ill) == 0);
2427 mutex_exit(&ncec->ncec_lock);
2429 if (!dropped) {
2430 mutex_enter(&ncec->ncec_lock);
2431 ncec->ncec_pcnt--;
2432 mutex_exit(&ncec->ncec_lock);
2434 nce_restart_timer(ncec,
2435 ill->ill_reachable_retrans_time);
2437 } else if (ncec->ncec_pcnt < 0) {
2438 /* No hope, delete the ncec */
2439 /* Tell datapath it went bad */
2440 ncec->ncec_state = ND_UNREACHABLE;
2441 mutex_exit(&ncec->ncec_lock);
2442 if (ip_debug > 2) {
2443 /* ip1dbg */
2444 pr_addr_dbg("nce_timer: Delete NCE for"
2445 " dst %s\n", (isv6? AF_INET6: AF_INET),
2446 &ncec->ncec_addr);
2448 /* if static ARP can't delete. */
2449 if ((ncec->ncec_flags & NCE_F_STATIC) == 0)
2450 ncec_delete(ncec);
2452 } else if (!NCE_PUBLISH(ncec)) {
2454 * Probe count is 0 for a dynamic entry (one that we
2455 * ourselves are not publishing). We should never get
2456 * here if NONUD was requested, hence the ASSERT below.
2458 ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0);
2459 ip2dbg(("nce_timer: pcount=%x dst %s\n",
2460 ncec->ncec_pcnt, inet_ntop(AF_INET6,
2461 &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2462 ncec->ncec_pcnt--;
2463 mutex_exit(&ncec->ncec_lock);
2464 /* Wait one interval before killing */
2465 nce_restart_timer(ncec,
2466 ill->ill_reachable_retrans_time);
2467 } else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
2468 ipif_t *ipif;
2469 ipaddr_t ncec_addr;
2472 * We're done probing, and we can now declare this
2473 * address to be usable. Let IP know that it's ok to
2474 * use.
2476 ncec->ncec_state = ND_REACHABLE;
2477 ncec->ncec_flags &= ~NCE_F_UNVERIFIED;
2478 mutex_exit(&ncec->ncec_lock);
2479 if (isv6) {
2480 ipif = ipif_lookup_addr_exact_v6(
2481 &ncec->ncec_addr, ill, ipst);
2482 } else {
2483 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
2484 ncec_addr);
2485 ipif = ipif_lookup_addr_exact(ncec_addr, ill,
2486 ipst);
2488 if (ipif != NULL) {
2489 if (ipif->ipif_was_dup) {
2490 char ibuf[LIFNAMSIZ];
2491 char sbuf[INET6_ADDRSTRLEN];
2493 ipif->ipif_was_dup = B_FALSE;
2494 (void) inet_ntop(AF_INET6,
2495 &ipif->ipif_v6lcl_addr,
2496 sbuf, sizeof (sbuf));
2497 ipif_get_name(ipif, ibuf,
2498 sizeof (ibuf));
2499 cmn_err(CE_NOTE, "recovered address "
2500 "%s on %s", sbuf, ibuf);
2502 if ((ipif->ipif_flags & IPIF_UP) &&
2503 !ipif->ipif_addr_ready)
2504 ipif_up_notify(ipif);
2505 ipif->ipif_addr_ready = 1;
2506 ipif_refrele(ipif);
2508 if (!isv6 && arp_no_defense)
2509 break;
2510 /* Begin defending our new address */
2511 if (ncec->ncec_unsolicit_count > 0) {
2512 ncec->ncec_unsolicit_count--;
2513 if (isv6) {
2514 dropped = ndp_announce(ncec);
2515 } else {
2516 dropped = arp_announce(ncec);
2519 if (dropped)
2520 ncec->ncec_unsolicit_count++;
2521 else
2522 ncec->ncec_last_time_defended =
2523 ddi_get_lbolt();
2525 if (ncec->ncec_unsolicit_count > 0) {
2526 nce_restart_timer(ncec,
2527 ANNOUNCE_INTERVAL(isv6));
2528 } else if (DEFENSE_INTERVAL(isv6) != 0) {
2529 nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
2531 } else {
2533 * This is an address we're probing to be our own, but
2534 * the ill is down. Wait until it comes back before
2535 * doing anything, but switch to reachable state so
2536 * that the restart will work.
2538 ncec->ncec_state = ND_REACHABLE;
2539 mutex_exit(&ncec->ncec_lock);
2541 break;
2542 case ND_INCOMPLETE: {
2543 mblk_t *mp, *nextmp;
2544 mblk_t **prevmpp;
2547 * Per case (2) in the nce_queue_mp() comments, scan ncec_qd_mp
2548 * for any IPMP probe packets, and toss them. IPMP probe
2549 * packets will always be at the head of ncec_qd_mp, so that
2550 * we can stop at the first queued ND packet that is
2551 * not a probe packet.
2553 prevmpp = &ncec->ncec_qd_mp;
2554 for (mp = ncec->ncec_qd_mp; mp != NULL; mp = nextmp) {
2555 nextmp = mp->b_next;
2557 if (IS_UNDER_IPMP(ill) && ncec->ncec_nprobes > 0) {
2558 inet_freemsg(mp);
2559 ncec->ncec_nprobes--;
2560 *prevmpp = nextmp;
2561 } else {
2562 prevmpp = &mp->b_next;
2567 * Must be resolver's retransmit timer.
2569 mutex_exit(&ncec->ncec_lock);
2570 ip_ndp_resolve(ncec);
2571 break;
2573 case ND_REACHABLE:
2574 if (((ncec->ncec_flags & NCE_F_UNSOL_ADV) &&
2575 ncec->ncec_unsolicit_count != 0) ||
2576 (NCE_PUBLISH(ncec) && DEFENSE_INTERVAL(isv6) != 0)) {
2577 if (ncec->ncec_unsolicit_count > 0) {
2578 ncec->ncec_unsolicit_count--;
2579 mutex_exit(&ncec->ncec_lock);
2581 * When we get to zero announcements left,
2582 * switch to address defense
2584 } else {
2585 boolean_t rate_limit;
2587 mutex_exit(&ncec->ncec_lock);
2588 rate_limit = ill_defend_rate_limit(ill, ncec);
2589 if (rate_limit) {
2590 nce_restart_timer(ncec,
2591 DEFENSE_INTERVAL(isv6));
2592 break;
2595 if (isv6) {
2596 dropped = ndp_announce(ncec);
2597 } else {
2598 dropped = arp_announce(ncec);
2600 mutex_enter(&ncec->ncec_lock);
2601 if (dropped) {
2602 ncec->ncec_unsolicit_count++;
2603 } else {
2604 ncec->ncec_last_time_defended =
2605 ddi_get_lbolt();
2607 mutex_exit(&ncec->ncec_lock);
2608 if (ncec->ncec_unsolicit_count != 0) {
2609 nce_restart_timer(ncec,
2610 ANNOUNCE_INTERVAL(isv6));
2611 } else {
2612 nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
2614 } else {
2615 mutex_exit(&ncec->ncec_lock);
2617 break;
2618 default:
2619 mutex_exit(&ncec->ncec_lock);
2620 break;
2622 done:
2623 ncec_refrele(ncec);
2624 ill_refrele(src_ill);
2628 * Set a link layer address from the ll_addr passed in.
2629 * Copy SAP from ill.
2631 static void
2632 nce_set_ll(ncec_t *ncec, uchar_t *ll_addr)
2634 ill_t *ill = ncec->ncec_ill;
2636 ASSERT(ll_addr != NULL);
2637 if (ill->ill_phys_addr_length > 0) {
2639 * The bcopy() below used to be called for the physical address
2640 * length rather than the link layer address length. For
2641 * ethernet and many other media, the phys_addr and lla are
2642 * identical.
2644 * The phys_addr and lla may not be the same for devices that
2645 * support DL_IPV6_LINK_LAYER_ADDR, though there are currently
2646 * no known instances of these.
2648 * For PPP or other interfaces with a zero length
2649 * physical address, don't do anything here.
2650 * The bcopy() with a zero phys_addr length was previously
2651 * a no-op for interfaces with a zero-length physical address.
2652 * Using the lla for them would change the way they operate.
2653 * Doing nothing in such cases preserves expected behavior.
2655 bcopy(ll_addr, ncec->ncec_lladdr, ill->ill_nd_lla_len);
2659 boolean_t
2660 nce_cmp_ll_addr(const ncec_t *ncec, const uchar_t *ll_addr,
2661 uint32_t ll_addr_len)
2663 ASSERT(ncec->ncec_lladdr != NULL);
2664 if (ll_addr == NULL)
2665 return (B_FALSE);
2666 if (bcmp(ll_addr, ncec->ncec_lladdr, ll_addr_len) != 0)
2667 return (B_TRUE);
2668 return (B_FALSE);
2672 * Updates the link layer address or the reachability state of
2673 * a cache entry. Reset probe counter if needed.
2675 void
2676 nce_update(ncec_t *ncec, uint16_t new_state, uchar_t *new_ll_addr)
2678 ill_t *ill = ncec->ncec_ill;
2679 boolean_t need_stop_timer = B_FALSE;
2680 boolean_t need_fastpath_update = B_FALSE;
2681 nce_t *nce = NULL;
2682 timeout_id_t tid;
2684 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2686 * If this interface does not do NUD, there is no point
2687 * in allowing an update to the cache entry. Although
2688 * we will respond to NS.
2689 * The only time we accept an update for a resolver when
2690 * NUD is turned off is when it has just been created.
2691 * Non-Resolvers will always be created as REACHABLE.
2693 if (new_state != ND_UNCHANGED) {
2694 if ((ncec->ncec_flags & NCE_F_NONUD) &&
2695 (ncec->ncec_state != ND_INCOMPLETE))
2696 return;
2697 ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
2698 ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
2699 need_stop_timer = B_TRUE;
2700 if (new_state == ND_REACHABLE)
2701 ncec->ncec_last = TICK_TO_MSEC(ddi_get_lbolt64());
2702 else {
2703 /* We force NUD in this case */
2704 ncec->ncec_last = 0;
2706 ncec->ncec_state = new_state;
2707 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
2708 ASSERT(ncec->ncec_lladdr != NULL || new_state == ND_INITIAL ||
2709 new_state == ND_INCOMPLETE);
2711 if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
2712 tid = ncec->ncec_timeout_id;
2713 ncec->ncec_timeout_id = 0;
2716 * Re-trigger fastpath probe and
2717 * overwrite the DL_UNITDATA_REQ data, noting we'll lose
2718 * whatever packets that happens to be transmitting at the time.
2720 if (new_ll_addr != NULL) {
2721 bcopy(new_ll_addr, ncec->ncec_lladdr,
2722 ill->ill_phys_addr_length);
2723 need_fastpath_update = B_TRUE;
2725 mutex_exit(&ncec->ncec_lock);
2726 if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
2727 if (tid != 0)
2728 (void) untimeout(tid);
2730 if (need_fastpath_update) {
2732 * Delete any existing existing dlur_mp and fp_mp information.
2733 * For IPMP interfaces, all underlying ill's must be checked
2734 * and purged.
2736 nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
2738 * add the new dlur_mp and fp_mp
2740 nce = nce_fastpath(ncec, B_TRUE, NULL);
2741 if (nce != NULL)
2742 nce_refrele(nce);
2744 mutex_enter(&ncec->ncec_lock);
2747 static void
2748 nce_queue_mp_common(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
2750 uint_t count = 0;
2751 mblk_t **mpp, *tmp;
2753 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2755 for (mpp = &ncec->ncec_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) {
2756 if (++count > ncec->ncec_ill->ill_max_buf) {
2757 tmp = ncec->ncec_qd_mp->b_next;
2758 ncec->ncec_qd_mp->b_next = NULL;
2760 * if we never create data addrs on the under_ill
2761 * does this matter?
2763 BUMP_MIB(ncec->ncec_ill->ill_ip_mib,
2764 ipIfStatsOutDiscards);
2765 ip_drop_output("ipIfStatsOutDiscards", ncec->ncec_qd_mp,
2766 ncec->ncec_ill);
2767 freemsg(ncec->ncec_qd_mp);
2768 ncec->ncec_qd_mp = tmp;
2772 if (head_insert) {
2773 ncec->ncec_nprobes++;
2774 mp->b_next = ncec->ncec_qd_mp;
2775 ncec->ncec_qd_mp = mp;
2776 } else {
2777 *mpp = mp;
2782 * nce_queue_mp will queue the packet into the ncec_qd_mp. The packet will be
2783 * queued at the head or tail of the queue based on the input argument
2784 * 'head_insert'. The caller should specify this argument as B_TRUE if this
2785 * packet is an IPMP probe packet, in which case the following happens:
2787 * 1. Insert it at the head of the ncec_qd_mp list. Consider the normal
2788 * (non-ipmp_probe) load-speading case where the source address of the ND
2789 * packet is not tied to ncec_ill. If the ill bound to the source address
2790 * cannot receive, the response to the ND packet will not be received.
2791 * However, if ND packets for ncec_ill's probes are queued behind that ND
2792 * packet, those probes will also fail to be sent, and thus in.mpathd will
2793 * erroneously conclude that ncec_ill has also failed.
2795 * 2. Drop the ipmp_probe packet in ndp_timer() if the ND did not succeed on
2796 * the first attempt. This ensures that ND problems do not manifest as
2797 * probe RTT spikes.
2799 * We achieve this by inserting ipmp_probe() packets at the head of the
2800 * nce_queue.
2802 * The ncec for the probe target is created with ncec_ill set to the ipmp_ill,
2803 * but the caller needs to set head_insert to B_TRUE if this is a probe packet.
2805 void
2806 nce_queue_mp(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
2808 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2809 nce_queue_mp_common(ncec, mp, head_insert);
2813 * Called when address resolution failed due to a timeout.
2814 * Send an ICMP unreachable in response to all queued packets.
2816 void
2817 ndp_resolv_failed(ncec_t *ncec)
2819 mblk_t *mp, *nxt_mp;
2820 char buf[INET6_ADDRSTRLEN];
2821 ill_t *ill = ncec->ncec_ill;
2822 ip_recv_attr_t iras;
2824 bzero(&iras, sizeof (iras));
2825 iras.ira_flags = 0;
2827 * we are setting the ira_rill to the ipmp_ill (instead of
2828 * the actual ill on which the packet was received), but this
2829 * is ok because we don't actually need the real ira_rill.
2830 * to send the icmp unreachable to the sender.
2832 iras.ira_ill = iras.ira_rill = ill;
2833 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
2834 iras.ira_rifindex = iras.ira_ruifindex;
2836 ip1dbg(("ndp_resolv_failed: dst %s\n",
2837 inet_ntop(AF_INET6, (char *)&ncec->ncec_addr, buf, sizeof (buf))));
2838 mutex_enter(&ncec->ncec_lock);
2839 mp = ncec->ncec_qd_mp;
2840 ncec->ncec_qd_mp = NULL;
2841 ncec->ncec_nprobes = 0;
2842 mutex_exit(&ncec->ncec_lock);
2843 while (mp != NULL) {
2844 nxt_mp = mp->b_next;
2845 mp->b_next = NULL;
2847 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2848 ip_drop_output("ipIfStatsOutDiscards - address unreachable",
2849 mp, ill);
2850 icmp_unreachable_v6(mp,
2851 ICMP6_DST_UNREACH_ADDR, B_FALSE, &iras);
2852 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2853 mp = nxt_mp;
2855 ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
2859 * Handle the completion of NDP and ARP resolution.
2861 void
2862 nce_resolv_ok(ncec_t *ncec)
2864 mblk_t *mp;
2865 uint_t pkt_len;
2866 iaflags_t ixaflags = IXAF_NO_TRACE;
2867 nce_t *nce;
2868 ill_t *ill = ncec->ncec_ill;
2869 boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
2870 ip_stack_t *ipst = ill->ill_ipst;
2872 if (IS_IPMP(ncec->ncec_ill)) {
2873 nce_resolv_ipmp_ok(ncec);
2874 return;
2876 /* non IPMP case */
2878 mutex_enter(&ncec->ncec_lock);
2879 ASSERT(ncec->ncec_nprobes == 0);
2880 mp = ncec->ncec_qd_mp;
2881 ncec->ncec_qd_mp = NULL;
2882 mutex_exit(&ncec->ncec_lock);
2884 while (mp != NULL) {
2885 mblk_t *nxt_mp;
2887 if (ill->ill_isv6) {
2888 ip6_t *ip6h = (ip6_t *)mp->b_rptr;
2890 pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
2891 } else {
2892 ipha_t *ipha = (ipha_t *)mp->b_rptr;
2894 ixaflags |= IXAF_IS_IPV4;
2895 pkt_len = ntohs(ipha->ipha_length);
2897 nxt_mp = mp->b_next;
2898 mp->b_next = NULL;
2900 * IXAF_NO_DEV_FLOW_CTL information for TCP packets is no
2901 * longer available, but it's ok to drop this flag because TCP
2902 * has its own flow-control in effect, so TCP packets
2903 * are not likely to get here when flow-control is in effect.
2905 mutex_enter(&ill->ill_lock);
2906 nce = nce_lookup(ill, &ncec->ncec_addr);
2907 mutex_exit(&ill->ill_lock);
2909 if (nce == NULL) {
2910 if (isv6) {
2911 BUMP_MIB(&ipst->ips_ip6_mib,
2912 ipIfStatsOutDiscards);
2913 } else {
2914 BUMP_MIB(&ipst->ips_ip_mib,
2915 ipIfStatsOutDiscards);
2917 ip_drop_output("ipIfStatsOutDiscards - no nce",
2918 mp, NULL);
2919 freemsg(mp);
2920 } else {
2922 * We don't know the zoneid, but
2923 * ip_xmit does not care since IXAF_NO_TRACE
2924 * is set. (We traced the packet the first
2925 * time through ip_xmit.)
2927 (void) ip_xmit(mp, nce, ixaflags, pkt_len, 0,
2928 ALL_ZONES, 0, NULL);
2929 nce_refrele(nce);
2931 mp = nxt_mp;
2934 ncec_cb_dispatch(ncec); /* complete callbacks */
2938 * Called by SIOCSNDP* ioctl to add/change an ncec entry
2939 * and the corresponding attributes.
2940 * Disallow states other than ND_REACHABLE or ND_STALE.
2943 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
2945 sin6_t *sin6;
2946 in6_addr_t *addr;
2947 ncec_t *ncec;
2948 nce_t *nce;
2949 int err = 0;
2950 uint16_t new_flags = 0;
2951 uint16_t old_flags = 0;
2952 int inflags = lnr->lnr_flags;
2953 ip_stack_t *ipst = ill->ill_ipst;
2954 boolean_t do_postprocess = B_FALSE;
2956 ASSERT(ill->ill_isv6);
2957 if ((lnr->lnr_state_create != ND_REACHABLE) &&
2958 (lnr->lnr_state_create != ND_STALE))
2959 return (EINVAL);
2961 sin6 = (sin6_t *)&lnr->lnr_addr;
2962 addr = &sin6->sin6_addr;
2964 mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
2965 ASSERT(!IS_UNDER_IPMP(ill));
2966 nce = nce_lookup_addr(ill, addr);
2967 if (nce != NULL)
2968 new_flags = nce->nce_common->ncec_flags;
2970 switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
2971 case NDF_ISROUTER_ON:
2972 new_flags |= NCE_F_ISROUTER;
2973 break;
2974 case NDF_ISROUTER_OFF:
2975 new_flags &= ~NCE_F_ISROUTER;
2976 break;
2977 case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
2978 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
2979 if (nce != NULL)
2980 nce_refrele(nce);
2981 return (EINVAL);
2983 if (inflags & NDF_STATIC)
2984 new_flags |= NCE_F_STATIC;
2986 switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
2987 case NDF_ANYCAST_ON:
2988 new_flags |= NCE_F_ANYCAST;
2989 break;
2990 case NDF_ANYCAST_OFF:
2991 new_flags &= ~NCE_F_ANYCAST;
2992 break;
2993 case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
2994 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
2995 if (nce != NULL)
2996 nce_refrele(nce);
2997 return (EINVAL);
3000 if (nce == NULL) {
3001 err = nce_add_v6(ill,
3002 (uchar_t *)lnr->lnr_hdw_addr,
3003 ill->ill_phys_addr_length,
3004 addr,
3005 new_flags,
3006 lnr->lnr_state_create,
3007 &nce);
3008 if (err != 0) {
3009 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3010 ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
3011 return (err);
3012 } else {
3013 do_postprocess = B_TRUE;
3016 ncec = nce->nce_common;
3017 old_flags = ncec->ncec_flags;
3018 if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
3019 ncec_router_to_host(ncec);
3020 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3021 if (do_postprocess)
3022 err = nce_add_v6_postprocess(nce);
3023 nce_refrele(nce);
3024 return (0);
3026 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3028 if (do_postprocess)
3029 err = nce_add_v6_postprocess(nce);
3031 * err cannot be anything other than 0 because we don't support
3032 * proxy arp of static addresses.
3034 ASSERT(err == 0);
3036 mutex_enter(&ncec->ncec_lock);
3037 ncec->ncec_flags = new_flags;
3038 mutex_exit(&ncec->ncec_lock);
3040 * Note that we ignore the state at this point, which
3041 * should be either STALE or REACHABLE. Instead we let
3042 * the link layer address passed in to determine the state
3043 * much like incoming packets.
3045 nce_process(ncec, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
3046 nce_refrele(nce);
3047 return (0);
3051 * Create an nce_t structure for ill using the ncec->ncec_lladdr to set up
3052 * the nce_dlur_mp. If ill != ncec->ncec_ill, then the ips_ill_g_lock must
3053 * be held to ensure that they are in the same group.
3055 static nce_t *
3056 nce_fastpath_create(ill_t *ill, ncec_t *ncec)
3059 nce_t *nce;
3061 nce = nce_ill_lookup_then_add(ill, ncec);
3063 if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
3064 return (nce);
3067 * hold the ncec_lock to synchronize with nce_update() so that,
3068 * at the end of this function, the contents of nce_dlur_mp are
3069 * consistent with ncec->ncec_lladdr, even though some intermediate
3070 * packet may have been sent out with a mangled address, which would
3071 * only be a transient condition.
3073 mutex_enter(&ncec->ncec_lock);
3074 if (ncec->ncec_lladdr != NULL) {
3075 bcopy(ncec->ncec_lladdr, nce->nce_dlur_mp->b_rptr +
3076 NCE_LL_ADDR_OFFSET(ill), ill->ill_phys_addr_length);
3077 } else {
3078 nce->nce_dlur_mp = ill_dlur_gen(NULL, 0, ill->ill_sap,
3079 ill->ill_sap_length);
3081 mutex_exit(&ncec->ncec_lock);
3082 return (nce);
3086 * we make nce_fp_mp to have an M_DATA prepend.
3087 * The caller ensures there is hold on ncec for this function.
3088 * Note that since ill_fastpath_probe() copies the mblk there is
3089 * no need to hold the nce or ncec beyond this function.
3091 * If the caller has passed in a non-null ncec_nce to nce_fastpath() that
3092 * ncec_nce must correspond to the nce for ncec with nce_ill == ncec->ncec_ill
3093 * and will be returned back by this function, so that no extra nce_refrele
3094 * is required for the caller. The calls from nce_add_common() use this
3095 * method. All other callers (that pass in NULL ncec_nce) will have to do a
3096 * nce_refrele of the returned nce (when it is non-null).
3098 nce_t *
3099 nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce)
3101 nce_t *nce;
3102 ill_t *ill = ncec->ncec_ill;
3104 ASSERT(ill != NULL);
3106 if (IS_IPMP(ill) && trigger_fp_req) {
3107 trigger_fp_req = B_FALSE;
3108 ipmp_ncec_refresh_nce(ncec);
3112 * If the caller already has the nce corresponding to the ill, use
3113 * that one. Otherwise we have to lookup/add the nce. Calls from
3114 * nce_add_common() fall in the former category, and have just done
3115 * the nce lookup/add that can be reused.
3117 if (ncec_nce == NULL)
3118 nce = nce_fastpath_create(ill, ncec);
3119 else
3120 nce = ncec_nce;
3122 if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
3123 return (nce);
3125 if (trigger_fp_req)
3126 nce_fastpath_trigger(nce);
3127 return (nce);
3131 * Trigger fastpath on nce. No locks may be held.
3133 static void
3134 nce_fastpath_trigger(nce_t *nce)
3136 int res;
3137 ill_t *ill = nce->nce_ill;
3138 ncec_t *ncec = nce->nce_common;
3140 res = ill_fastpath_probe(ill, nce->nce_dlur_mp);
3142 * EAGAIN is an indication of a transient error
3143 * i.e. allocation failure etc. leave the ncec in the list it
3144 * will be updated when another probe happens for another ire
3145 * if not it will be taken out of the list when the ire is
3146 * deleted.
3148 if (res != 0 && res != EAGAIN && res != ENOTSUP)
3149 nce_fastpath_list_delete(ill, ncec, NULL);
3153 * Add ncec to the nce fastpath list on ill.
3155 static nce_t *
3156 nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec)
3158 nce_t *nce = NULL;
3160 ASSERT(MUTEX_HELD(&ill->ill_lock));
3162 * Atomically ensure that the ill is not CONDEMNED and is not going
3163 * down, before adding the NCE.
3165 if (ill->ill_state_flags & ILL_CONDEMNED)
3166 return (NULL);
3167 mutex_enter(&ncec->ncec_lock);
3169 * if ncec has not been deleted and
3170 * is not already in the list add it.
3172 if (!NCE_ISCONDEMNED(ncec)) {
3173 nce = nce_lookup(ill, &ncec->ncec_addr);
3174 if (nce != NULL)
3175 goto done;
3176 nce = nce_add(ill, ncec);
3178 done:
3179 mutex_exit(&ncec->ncec_lock);
3180 return (nce);
3183 nce_t *
3184 nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec)
3186 nce_t *nce;
3188 mutex_enter(&ill->ill_lock);
3189 nce = nce_ill_lookup_then_add_locked(ill, ncec);
3190 mutex_exit(&ill->ill_lock);
3191 return (nce);
3196 * remove ncec from the ill_nce list. If 'dead' is non-null, the deleted
3197 * nce is added to the 'dead' list, and the caller must nce_refrele() the
3198 * entry after all locks have been dropped.
3200 void
3201 nce_fastpath_list_delete(ill_t *ill, ncec_t *ncec, list_t *dead)
3203 nce_t *nce;
3205 ASSERT(ill != NULL);
3207 /* delete any nces referencing the ncec from underlying ills */
3208 if (IS_IPMP(ill))
3209 ipmp_ncec_delete_nce(ncec);
3211 /* now the ill itself */
3212 mutex_enter(&ill->ill_lock);
3213 for (nce = list_head(&ill->ill_nce); nce != NULL;
3214 nce = list_next(&ill->ill_nce, nce)) {
3215 if (nce->nce_common == ncec) {
3216 nce_refhold(nce);
3217 nce_delete(nce);
3218 break;
3221 mutex_exit(&ill->ill_lock);
3222 if (nce != NULL) {
3223 if (dead == NULL)
3224 nce_refrele(nce);
3225 else
3226 list_insert_tail(dead, nce);
3231 * when the fastpath response does not fit in the datab
3232 * associated with the existing nce_fp_mp, we delete and
3233 * add the nce to retrigger fastpath based on the information
3234 * in the ncec_t.
3236 static nce_t *
3237 nce_delete_then_add(nce_t *nce)
3239 ill_t *ill = nce->nce_ill;
3240 nce_t *newnce = NULL;
3242 ip0dbg(("nce_delete_then_add nce %p ill %s\n",
3243 (void *)nce, ill->ill_name));
3244 mutex_enter(&ill->ill_lock);
3245 mutex_enter(&nce->nce_common->ncec_lock);
3246 nce_delete(nce);
3248 * Make sure that ncec is not condemned before adding. We hold the
3249 * ill_lock and ncec_lock to synchronize with ncec_delete() and
3250 * ipmp_ncec_delete_nce()
3252 if (!NCE_ISCONDEMNED(nce->nce_common))
3253 newnce = nce_add(ill, nce->nce_common);
3254 mutex_exit(&nce->nce_common->ncec_lock);
3255 mutex_exit(&ill->ill_lock);
3256 nce_refrele(nce);
3257 return (newnce); /* could be null if nomem */
3260 typedef struct nce_fp_match_s {
3261 nce_t *nce_fp_match_res;
3262 mblk_t *nce_fp_match_ack_mp;
3263 } nce_fp_match_t;
3265 /* ARGSUSED */
3266 static int
3267 nce_fastpath_match_dlur(ill_t *ill, nce_t *nce, void *arg)
3269 nce_fp_match_t *nce_fp_marg = arg;
3270 ncec_t *ncec = nce->nce_common;
3271 mblk_t *mp = nce_fp_marg->nce_fp_match_ack_mp;
3272 uchar_t *mp_rptr, *ud_mp_rptr;
3273 mblk_t *ud_mp = nce->nce_dlur_mp;
3274 ptrdiff_t cmplen;
3277 * mp is the mp associated with the fastpath ack.
3278 * ud_mp is the outstanding DL_UNITDATA_REQ on the nce_t
3279 * under consideration. If the contents match, then the
3280 * fastpath ack is used to update the nce.
3282 if (ud_mp == NULL)
3283 return (0);
3284 mp_rptr = mp->b_rptr;
3285 cmplen = mp->b_wptr - mp_rptr;
3286 ASSERT(cmplen >= 0);
3288 ud_mp_rptr = ud_mp->b_rptr;
3290 * The ncec is locked here to prevent any other threads from accessing
3291 * and changing nce_dlur_mp when the address becomes resolved to an
3292 * lla while we're in the middle of looking at and comparing the
3293 * hardware address (lla). It is also locked to prevent multiple
3294 * threads in nce_fastpath() from examining nce_dlur_mp at the same
3295 * time.
3297 mutex_enter(&ncec->ncec_lock);
3298 if (ud_mp->b_wptr - ud_mp_rptr != cmplen ||
3299 bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) == 0) {
3300 nce_fp_marg->nce_fp_match_res = nce;
3301 mutex_exit(&ncec->ncec_lock);
3302 nce_refhold(nce);
3303 return (1);
3305 mutex_exit(&ncec->ncec_lock);
3306 return (0);
/*
 * Update all NCE's that are not in fastpath mode and
 * have an nce_fp_mp that matches mp. mp->b_cont contains
 * the fastpath header.
 *
 * Install mp->b_cont as the fastpath header of the (single) nce whose
 * outstanding DL_UNITDATA_REQ matches the fastpath ack `mp'. If the
 * existing nce_fp_mp buffer is too small for the new header, the nce is
 * deleted and re-added to obtain a freshly-sized buffer first.
 */
void
nce_fastpath_update(ill_t *ill, mblk_t *mp)
{
	nce_fp_match_t	nce_fp_marg;
	nce_t		*nce;
	mblk_t		*nce_fp_mp, *fp_mp;

	nce_fp_marg.nce_fp_match_res = NULL;
	nce_fp_marg.nce_fp_match_ack_mp = mp;

	/* Locate the nce this ack belongs to; it comes back refheld. */
	nce_walk(ill, nce_fastpath_match_dlur, &nce_fp_marg);

	if ((nce = nce_fp_marg.nce_fp_match_res) == NULL)
		return;

	mutex_enter(&nce->nce_lock);
	nce_fp_mp = nce->nce_fp_mp;

	if (nce_fp_mp != NULL) {
		fp_mp = mp->b_cont;
		/*
		 * New header won't fit in the existing buffer: replace the
		 * nce (nce_delete_then_add) to get a new nce_fp_mp. The
		 * nce_lock must be dropped across that call.
		 */
		if (nce_fp_mp->b_rptr + MBLKL(fp_mp) >
		    nce_fp_mp->b_datap->db_lim) {
			mutex_exit(&nce->nce_lock);
			nce = nce_delete_then_add(nce);
			if (nce == NULL) {
				/* re-add failed (e.g. nomem/condemned) */
				return;
			}
			mutex_enter(&nce->nce_lock);
			nce_fp_mp = nce->nce_fp_mp;
		}
	}

	/* Matched - install mp as the fastpath mp */
	if (nce_fp_mp == NULL) {
		/* No existing header: dup the ack's (may be NULL on nomem) */
		fp_mp = dupb(mp->b_cont);
		nce->nce_fp_mp = fp_mp;
	} else {
		/* Overwrite the existing header in place. */
		fp_mp = mp->b_cont;
		bcopy(fp_mp->b_rptr, nce_fp_mp->b_rptr, MBLKL(fp_mp));
		nce->nce_fp_mp->b_wptr = nce->nce_fp_mp->b_rptr
		    + MBLKL(fp_mp);
	}
	mutex_exit(&nce->nce_lock);
	nce_refrele(nce);	/* drop the hold taken by the match walk */
}
3363 * Return a pointer to a given option in the packet.
3364 * Assumes that option part of the packet have already been validated.
3366 nd_opt_hdr_t *
3367 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
3369 while (optlen > 0) {
3370 if (opt->nd_opt_type == opt_type)
3371 return (opt);
3372 optlen -= 8 * opt->nd_opt_len;
3373 opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3375 return (NULL);
3379 * Verify all option lengths present are > 0, also check to see
3380 * if the option lengths and packet length are consistent.
3382 boolean_t
3383 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
3385 ASSERT(opt != NULL);
3386 while (optlen > 0) {
3387 if (opt->nd_opt_len == 0)
3388 return (B_FALSE);
3389 optlen -= 8 * opt->nd_opt_len;
3390 if (optlen < 0)
3391 return (B_FALSE);
3392 opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3394 return (B_TRUE);
3398 * ncec_walk function.
3399 * Free a fraction of the NCE cache entries.
3401 * A possible optimization here would be to use ncec_last where possible, and
3402 * delete the least-frequently used entry, which would require more complex
3403 * computation as we walk through the ncec's (e.g., track ncec entries by
3404 * order of ncec_last and/or maintain state)
3406 static void
3407 ncec_cache_reclaim(ncec_t *ncec, char *arg)
3409 ip_stack_t *ipst = ncec->ncec_ipst;
3410 uint_t fraction = *(uint_t *)arg;
3411 uint_t rand;
3413 if ((ncec->ncec_flags &
3414 (NCE_F_MYADDR | NCE_F_STATIC | NCE_F_BCAST)) != 0) {
3415 return;
3418 rand = (uint_t)ddi_get_lbolt() +
3419 NCE_ADDR_HASH_V6(ncec->ncec_addr, NCE_TABLE_SIZE);
3420 if ((rand/fraction)*fraction == rand) {
3421 IP_STAT(ipst, ip_nce_reclaim_deleted);
3422 ncec_delete(ncec);
/*
 * kmem_cache callback to free up memory.
 *
 * For now we just delete a fixed fraction.
 */
static void
ip_nce_reclaim_stack(ip_stack_t *ipst)
{
	/* delete roughly 1 in ips_ip_nce_reclaim_fraction eligible entries */
	uint_t		fraction = ipst->ips_ip_nce_reclaim_fraction;

	IP_STAT(ipst, ip_nce_reclaim_calls);

	ncec_walk(NULL, (pfi_t)ncec_cache_reclaim, (uchar_t *)&fraction, ipst);

	/*
	 * Walk all CONNs that can have a reference on an ire, ncec or dce.
	 * Get them to update any stale references to drop any refholds they
	 * have.
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
}
3449 * Called by the memory allocator subsystem directly, when the system
3450 * is running low on memory.
3452 /* ARGSUSED */
3453 void
3454 ip_nce_reclaim(void *args)
3456 netstack_handle_t nh;
3457 netstack_t *ns;
3458 ip_stack_t *ipst;
3460 netstack_next_init(&nh);
3461 while ((ns = netstack_next(&nh)) != NULL) {
3463 * netstack_next() can return a netstack_t with a NULL
3464 * netstack_ip at boot time.
3466 if ((ipst = ns->netstack_ip) == NULL) {
3467 netstack_rele(ns);
3468 continue;
3470 ip_nce_reclaim_stack(ipst);
3471 netstack_rele(ns);
3473 netstack_next_fini(&nh);
#ifdef DEBUG
/*
 * Record a refcount-trace entry for `ncec'. If the trace allocation
 * fails, tracing is disabled for this ncec and the data gathered so far
 * is discarded. Caller must hold ncec_lock.
 */
void
ncec_trace_ref(ncec_t *ncec)
{
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));

	if (ncec->ncec_trace_disable)
		return;

	if (!th_trace_ref(ncec, ncec->ncec_ipst)) {
		ncec->ncec_trace_disable = B_TRUE;
		ncec_trace_cleanup(ncec);
	}
}

/*
 * Remove a refcount-trace entry; no-op once tracing has been disabled
 * for this ncec. Caller must hold ncec_lock.
 */
void
ncec_untrace_ref(ncec_t *ncec)
{
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));

	if (!ncec->ncec_trace_disable)
		th_trace_unref(ncec);
}

/* Discard all refcount-trace data accumulated for `ncec'. */
static void
ncec_trace_cleanup(const ncec_t *ncec)
{
	th_trace_cleanup(ncec, ncec->ncec_trace_disable);
}
#endif /* DEBUG */
/*
 * Called when address resolution fails due to a timeout.
 * Send an ICMP unreachable in response to all queued packets.
 */
void
arp_resolv_failed(ncec_t *ncec)
{
	mblk_t		*mp, *nxt_mp;
	char		buf[INET6_ADDRSTRLEN];
	struct in_addr	ipv4addr;
	ill_t		*ill = ncec->ncec_ill;
	ip_stack_t	*ipst = ncec->ncec_ipst;
	ip_recv_attr_t	iras;

	/* Build minimal receive attributes for icmp_unreachable(). */
	bzero(&iras, sizeof (iras));
	iras.ira_flags = IRAF_IS_IPV4;
	/*
	 * we are setting the ira_rill to the ipmp_ill (instead of
	 * the actual ill on which the packet was received), but this
	 * is ok because we don't actually need the real ira_rill
	 * to send the icmp unreachable to the sender.
	 */
	iras.ira_ill = iras.ira_rill = ill;
	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
	iras.ira_rifindex = iras.ira_ruifindex;

	IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ipv4addr);
	ip3dbg(("arp_resolv_failed: dst %s\n",
	    inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
	/* Detach the queued packet chain under the lock, then drop it. */
	mutex_enter(&ncec->ncec_lock);
	mp = ncec->ncec_qd_mp;
	ncec->ncec_qd_mp = NULL;
	ncec->ncec_nprobes = 0;
	mutex_exit(&ncec->ncec_lock);
	while (mp != NULL) {
		nxt_mp = mp->b_next;
		mp->b_next = NULL;

		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
		ip_drop_output("ipIfStatsOutDiscards - address unreachable",
		    mp, ill);
		/* ip_arp_icmp_error is tunable: bounce vs. silently drop */
		if (ipst->ips_ip_arp_icmp_error) {
			ip3dbg(("arp_resolv_failed: "
			    "Calling icmp_unreachable\n"));
			icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras);
		} else {
			freemsg(mp);
		}
		ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
		mp = nxt_mp;
	}
	ncec_cb_dispatch(ncec);	/* finish off waiting callbacks */
}
3562 * if ill is an under_ill, translate it to the ipmp_ill and add the
3563 * nce on the ipmp_ill. Two nce_t entries (one on the ipmp_ill, and
3564 * one on the underlying in_ill) will be created for the
3565 * ncec_t in this case. The ncec_t itself will be created on the ipmp_ill.
3568 nce_lookup_then_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
3569 const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
3571 int err;
3572 in6_addr_t addr6;
3573 ip_stack_t *ipst = ill->ill_ipst;
3574 nce_t *nce, *upper_nce = NULL;
3575 ill_t *in_ill = ill, *under = NULL;
3576 boolean_t need_ill_refrele = B_FALSE;
3578 if (flags & NCE_F_MCAST) {
3580 * hw_addr will be figured out in nce_set_multicast_v4;
3581 * caller needs to pass in the cast_ill for ipmp
3583 ASSERT(hw_addr == NULL);
3584 ASSERT(!IS_IPMP(ill));
3585 err = nce_set_multicast_v4(ill, addr, flags, newnce);
3586 return (err);
3589 if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
3590 ill = ipmp_ill_hold_ipmp_ill(ill);
3591 if (ill == NULL)
3592 return (ENXIO);
3593 need_ill_refrele = B_TRUE;
3595 if ((flags & NCE_F_BCAST) != 0) {
3597 * IPv4 broadcast ncec: compute the hwaddr.
3599 if (IS_IPMP(ill)) {
3600 under = ipmp_ill_hold_xmit_ill(ill, B_FALSE);
3601 if (under == NULL) {
3602 if (need_ill_refrele)
3603 ill_refrele(ill);
3604 return (ENETDOWN);
3606 hw_addr = under->ill_bcast_mp->b_rptr +
3607 NCE_LL_ADDR_OFFSET(under);
3608 hw_addr_len = under->ill_phys_addr_length;
3609 } else {
3610 hw_addr = ill->ill_bcast_mp->b_rptr +
3611 NCE_LL_ADDR_OFFSET(ill),
3612 hw_addr_len = ill->ill_phys_addr_length;
3616 mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3617 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3618 nce = nce_lookup_addr(ill, &addr6);
3619 if (nce == NULL) {
3620 err = nce_add_v4(ill, hw_addr, hw_addr_len, addr, flags,
3621 state, &nce);
3622 } else {
3623 err = EEXIST;
3625 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3626 if (err == 0)
3627 err = nce_add_v4_postprocess(nce);
3629 if (in_ill != ill && nce != NULL) {
3630 nce_t *under_nce = NULL;
3633 * in_ill was the under_ill. Try to create the under_nce.
3634 * Hold the ill_g_lock to prevent changes to group membership
3635 * until we are done.
3637 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
3638 if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
3639 DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
3640 ill_t *, ill);
3641 rw_exit(&ipst->ips_ill_g_lock);
3642 err = ENXIO;
3643 nce_refrele(nce);
3644 nce = NULL;
3645 goto bail;
3647 under_nce = nce_fastpath_create(in_ill, nce->nce_common);
3648 if (under_nce == NULL) {
3649 rw_exit(&ipst->ips_ill_g_lock);
3650 err = EINVAL;
3651 nce_refrele(nce);
3652 nce = NULL;
3653 goto bail;
3655 rw_exit(&ipst->ips_ill_g_lock);
3656 upper_nce = nce;
3657 nce = under_nce; /* will be returned to caller */
3658 if (NCE_ISREACHABLE(nce->nce_common))
3659 nce_fastpath_trigger(under_nce);
3661 if (nce != NULL) {
3662 if (newnce != NULL)
3663 *newnce = nce;
3664 else
3665 nce_refrele(nce);
3667 bail:
3668 if (under != NULL)
3669 ill_refrele(under);
3670 if (upper_nce != NULL)
3671 nce_refrele(upper_nce);
3672 if (need_ill_refrele)
3673 ill_refrele(ill);
3675 return (err);
/*
 * NDP Cache Entry creation routine for IPv4.
 * This routine must always be called with ndp4->ndp_g_lock held.
 * Prior to return, ncec_refcnt is incremented.
 *
 * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec)) addresses
 * are always added pointing at the ipmp_ill. Thus, when the ill passed
 * to nce_add_v4 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
 * entries will be created, both pointing at the same ncec_t. The nce_t
 * entries will have their nce_ill set to the ipmp_ill and the under_ill
 * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
 * Local addresses are always created on the ill passed to nce_add_v4.
 *
 * Caller must follow up with nce_add_v4_postprocess() (without locks held)
 * to trigger fastpath/DAD on the new entry.
 */
int
nce_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
    const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
{
	int		err;
	boolean_t	is_multicast = (flags & NCE_F_MCAST);
	struct in6_addr	addr6;
	nce_t		*nce;

	ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
	ASSERT(!ill->ill_isv6);
	ASSERT(!IN_MULTICAST(htonl(*addr)) || is_multicast);

	/* common code stores IPv4 addresses as v4-mapped IPv6 */
	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
	err = nce_add_common(ill, hw_addr, hw_addr_len, &addr6, flags, state,
	    &nce);
	ASSERT(newnce != NULL);
	*newnce = nce;
	return (err);
}
3713 * Post-processing routine to be executed after nce_add_v4(). This function
3714 * triggers fastpath (if appropriate) and DAD on the newly added nce entry
3715 * and must be called without any locks held.
3717 * Always returns 0, but we return an int to keep this symmetric with the
3718 * IPv6 counter-part.
3721 nce_add_v4_postprocess(nce_t *nce)
3723 ncec_t *ncec = nce->nce_common;
3724 uint16_t flags = ncec->ncec_flags;
3725 boolean_t ndp_need_dad = B_FALSE;
3726 boolean_t dropped;
3727 clock_t delay;
3728 ip_stack_t *ipst = ncec->ncec_ill->ill_ipst;
3729 uchar_t *hw_addr = ncec->ncec_lladdr;
3730 boolean_t trigger_fastpath = B_TRUE;
3733 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
3734 * we call nce_fastpath as soon as the ncec is resolved in nce_process.
3735 * We call nce_fastpath from nce_update if the link layer address of
3736 * the peer changes from nce_update
3738 if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || (hw_addr == NULL &&
3739 ncec->ncec_ill->ill_net_type != IRE_IF_NORESOLVER))
3740 trigger_fastpath = B_FALSE;
3742 if (trigger_fastpath)
3743 nce_fastpath_trigger(nce);
3745 if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
3747 * Either the caller (by passing in ND_PROBE)
3748 * or nce_add_common() (by the internally computed state
3749 * based on ncec_addr and ill_net_type) has determined
3750 * that this unicast entry needs DAD. Trigger DAD.
3752 ndp_need_dad = B_TRUE;
3753 } else if (flags & NCE_F_UNSOL_ADV) {
3755 * We account for the transmit below by assigning one
3756 * less than the ndd variable. Subsequent decrements
3757 * are done in nce_timer.
3759 mutex_enter(&ncec->ncec_lock);
3760 ncec->ncec_unsolicit_count =
3761 ipst->ips_ip_arp_publish_count - 1;
3762 mutex_exit(&ncec->ncec_lock);
3763 dropped = arp_announce(ncec);
3764 mutex_enter(&ncec->ncec_lock);
3765 if (dropped)
3766 ncec->ncec_unsolicit_count++;
3767 else
3768 ncec->ncec_last_time_defended = ddi_get_lbolt();
3769 if (ncec->ncec_unsolicit_count != 0) {
3770 nce_start_timer(ncec,
3771 ipst->ips_ip_arp_publish_interval);
3773 mutex_exit(&ncec->ncec_lock);
3777 * If ncec_xmit_interval is 0, user has configured us to send the first
3778 * probe right away. Do so, and set up for the subsequent probes.
3780 if (ndp_need_dad) {
3781 mutex_enter(&ncec->ncec_lock);
3782 if (ncec->ncec_pcnt == 0) {
3784 * DAD probes and announce can be
3785 * administratively disabled by setting the
3786 * probe_count to zero. Restart the timer in
3787 * this case to mark the ipif as ready.
3789 ncec->ncec_unsolicit_count = 0;
3790 mutex_exit(&ncec->ncec_lock);
3791 nce_restart_timer(ncec, 0);
3792 } else {
3793 mutex_exit(&ncec->ncec_lock);
3794 delay = ((ncec->ncec_flags & NCE_F_FAST) ?
3795 ipst->ips_arp_probe_delay :
3796 ipst->ips_arp_fastprobe_delay);
3797 nce_dad(ncec, NULL, (delay == 0 ? B_TRUE : B_FALSE));
3800 return (0);
3804 * ncec_walk routine to update all entries that have a given destination or
3805 * gateway address and cached link layer (MAC) address. This is used when ARP
3806 * informs us that a network-to-link-layer mapping may have changed.
3808 void
3809 nce_update_hw_changed(ncec_t *ncec, void *arg)
3811 nce_hw_map_t *hwm = arg;
3812 ipaddr_t ncec_addr;
3814 if (ncec->ncec_state != ND_REACHABLE)
3815 return;
3817 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
3818 if (ncec_addr != hwm->hwm_addr)
3819 return;
3821 mutex_enter(&ncec->ncec_lock);
3822 if (hwm->hwm_flags != 0)
3823 ncec->ncec_flags = hwm->hwm_flags;
3824 nce_update(ncec, ND_STALE, hwm->hwm_hwaddr);
3825 mutex_exit(&ncec->ncec_lock);
/* Take a traced reference on `ncec'. */
void
ncec_refhold(ncec_t *ncec)
{
	mutex_enter(&(ncec)->ncec_lock);
	(ncec)->ncec_refcnt++;
	ASSERT((ncec)->ncec_refcnt != 0);
#ifdef DEBUG
	ncec_trace_ref(ncec);
#endif
	mutex_exit(&(ncec)->ncec_lock);
}

/* Take a reference on `ncec' without recording a trace entry. */
void
ncec_refhold_notr(ncec_t *ncec)
{
	mutex_enter(&(ncec)->ncec_lock);
	(ncec)->ncec_refcnt++;
	ASSERT((ncec)->ncec_refcnt != 0);
	mutex_exit(&(ncec)->ncec_lock);
}

/* Take a traced reference; caller already holds ncec_lock. */
static void
ncec_refhold_locked(ncec_t *ncec)
{
	ASSERT(MUTEX_HELD(&(ncec)->ncec_lock));
	(ncec)->ncec_refcnt++;
#ifdef DEBUG
	ncec_trace_ref(ncec);
#endif
}

/* ncec_inactive destroys the mutex thus no mutex_exit is needed */
void
ncec_refrele(ncec_t *ncec)
{
	mutex_enter(&(ncec)->ncec_lock);
#ifdef DEBUG
	ncec_untrace_ref(ncec);
#endif
	ASSERT((ncec)->ncec_refcnt != 0);
	if (--(ncec)->ncec_refcnt == 0) {
		/* last reference: tear the entry down (frees the lock too) */
		ncec_inactive(ncec);
	} else {
		mutex_exit(&(ncec)->ncec_lock);
	}
}

/* Untraced counterpart of ncec_refrele(). */
void
ncec_refrele_notr(ncec_t *ncec)
{
	mutex_enter(&(ncec)->ncec_lock);
	ASSERT((ncec)->ncec_refcnt != 0);
	if (--(ncec)->ncec_refcnt == 0) {
		ncec_inactive(ncec);
	} else {
		mutex_exit(&(ncec)->ncec_lock);
	}
}
/*
 * Common to IPv4 and IPv6.
 * Cancel any pending ncec timer and start a new one `ms' milliseconds out.
 * Must be called without ncec_lock held (untimeout() may need to wait for
 * a running timer handler, which itself takes the lock).
 */
void
nce_restart_timer(ncec_t *ncec, uint_t ms)
{
	timeout_id_t tid;

	ASSERT(!MUTEX_HELD(&(ncec)->ncec_lock));

	/* First cancel any running timer */
	mutex_enter(&ncec->ncec_lock);
	tid = ncec->ncec_timeout_id;
	ncec->ncec_timeout_id = 0;
	if (tid != 0) {
		/* drop the lock across untimeout() to avoid deadlock */
		mutex_exit(&ncec->ncec_lock);
		(void) untimeout(tid);
		mutex_enter(&ncec->ncec_lock);
	}

	/* Restart timer */
	nce_start_timer(ncec, ms);
	mutex_exit(&ncec->ncec_lock);
}
/*
 * Arm the ncec timer to fire nce_timer() in `ms' milliseconds.
 * Caller must hold ncec_lock.
 */
static void
nce_start_timer(ncec_t *ncec, uint_t ms)
{
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
	/*
	 * Don't start the timer if the ncec has been deleted, or if the timer
	 * is already running
	 */
	if (!NCE_ISCONDEMNED(ncec) && ncec->ncec_timeout_id == 0) {
		/* timeout() requires at least 1 tick */
		ncec->ncec_timeout_id = timeout(nce_timer, ncec,
		    MSEC_TO_TICK(ms) == 0 ? 1 : MSEC_TO_TICK(ms));
	}
}
/*
 * Find or create the nce for the IPv4 multicast address *dst on ill.
 * On success the nce is returned via *newnce (refheld), or released if
 * the caller passed newnce == NULL. Flags must include NCE_F_MCAST and
 * NCE_F_NONUD.
 */
static int
nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
    uint16_t flags, nce_t **newnce)
{
	uchar_t		*hw_addr;
	int		err = 0;
	ip_stack_t	*ipst = ill->ill_ipst;
	in6_addr_t	dst6;
	nce_t		*nce;

	ASSERT(!ill->ill_isv6);

	IN6_IPADDR_TO_V4MAPPED(*dst, &dst6);
	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
	/* an existing entry is simply returned */
	if ((nce = nce_lookup_addr(ill, &dst6)) != NULL) {
		mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
		goto done;
	}
	if (ill->ill_net_type == IRE_IF_RESOLVER) {
		/*
		 * For IRE_IF_RESOLVER a hardware mapping can be
		 * generated, for IRE_IF_NORESOLVER, resolution cookie
		 * in the ill is copied in nce_add_v4().
		 */
		hw_addr = kmem_alloc(ill->ill_phys_addr_length, KM_NOSLEEP);
		if (hw_addr == NULL) {
			mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
			return (ENOMEM);
		}
		ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
	} else {
		/*
		 * IRE_IF_NORESOLVER type simply copies the resolution
		 * cookie passed in. So no hw_addr is needed.
		 */
		hw_addr = NULL;
	}
	ASSERT(flags & NCE_F_MCAST);
	ASSERT(flags & NCE_F_NONUD);
	/* nce_state will be computed by nce_add_common() */
	err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
	    ND_UNCHANGED, &nce);
	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
	/* fastpath/DAD post-processing runs without the ndp_g_lock */
	if (err == 0)
		err = nce_add_v4_postprocess(nce);
	/* nce_add_v4() copied the mapping, so our local buffer can go */
	if (hw_addr != NULL)
		kmem_free(hw_addr, ill->ill_phys_addr_length);
	if (err != 0) {
		ip1dbg(("nce_set_multicast_v4: create failed" "%d\n", err));
		return (err);
	}
done:
	if (newnce != NULL)
		*newnce = nce;
	else
		nce_refrele(nce);
	return (0);
}
/*
 * This is used when scanning for "old" (least recently broadcast) NCEs. We
 * don't want to have to walk the list for every single one, so we gather up
 * batches at a time.
 */
#define	NCE_RESCHED_LIST_LEN	8

typedef struct {
	ill_t	*ncert_ill;	/* ill whose nce list is being scanned */
	uint_t	ncert_num;	/* number of candidates collected so far */
	ncec_t	*ncert_nces[NCE_RESCHED_LIST_LEN];	/* refheld candidates */
} nce_resched_t;
/*
 * Pick the longest waiting NCEs for defense.
 * Walk callback: keeps the NCE_RESCHED_LIST_LEN entries with the oldest
 * ncec_last_time_defended in ncert->ncert_nces (each refheld), displacing
 * newer ones as it goes.
 */
/* ARGSUSED */
static int
ncec_reschedule(ill_t *ill, nce_t *nce, void *arg)
{
	nce_resched_t	*ncert = arg;
	ncec_t		**ncecs;
	ncec_t		**ncec_max;
	ncec_t		*ncec_temp;
	ncec_t		*ncec = nce->nce_common;

	ASSERT(ncec->ncec_ill == ncert->ncert_ill);
	/*
	 * Only reachable entries that are ready for announcement are eligible.
	 */
	if (!NCE_MYADDR(ncec) || ncec->ncec_state != ND_REACHABLE)
		return (0);
	if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) {
		/* list not yet full: just append */
		ncec_refhold(ncec);
		ncert->ncert_nces[ncert->ncert_num++] = ncec;
	} else {
		/*
		 * List full: swap this entry into the list wherever it is
		 * older (smaller last_time_defended) than the current
		 * occupant; whatever ends up displaced is released.
		 */
		ncecs = ncert->ncert_nces;
		ncec_max = ncecs + NCE_RESCHED_LIST_LEN;
		ncec_refhold(ncec);
		for (; ncecs < ncec_max; ncecs++) {
			ASSERT(ncec != NULL);
			if ((*ncecs)->ncec_last_time_defended >
			    ncec->ncec_last_time_defended) {
				ncec_temp = *ncecs;
				*ncecs = ncec;
				ncec = ncec_temp;
			}
		}
		ncec_refrele(ncec);
	}
	return (0);
}
/*
 * Reschedule the ARP defense of any long-waiting NCEs. It's assumed that this
 * doesn't happen very often (if at all), and thus it needn't be highly
 * optimized. (Note, though, that it's actually O(N) complexity, because the
 * outer loop is bounded by a constant rather than by the length of the list.)
 *
 * Called from ill_defend_rate_limit() with ill_lock held.
 */
static void
nce_ill_reschedule(ill_t *ill, nce_resched_t *ncert)
{
	ncec_t		*ncec;
	ip_stack_t	*ipst = ill->ill_ipst;
	uint_t		i, defend_rate;

	i = ill->ill_defend_count;
	ill->ill_defend_count = 0;
	if (ill->ill_isv6)
		defend_rate = ipst->ips_ndp_defend_rate;
	else
		defend_rate = ipst->ips_arp_defend_rate;
	/* If none could be sitting around, then don't reschedule */
	if (i < defend_rate) {
		DTRACE_PROBE1(reschedule_none, ill_t *, ill);
		return;
	}
	ncert->ncert_ill = ill;
	while (ill->ill_defend_count < defend_rate) {
		/* gather up to NCE_RESCHED_LIST_LEN oldest candidates */
		nce_walk_common(ill, ncec_reschedule, ncert);
		for (i = 0; i < ncert->ncert_num; i++) {

			ncec = ncert->ncert_nces[i];
			mutex_enter(&ncec->ncec_lock);
			ncec->ncec_flags |= NCE_F_DELAYED;
			mutex_exit(&ncec->ncec_lock);
			/*
			 * we plan to schedule this ncec, so incr the
			 * defend_count in anticipation.
			 */
			if (++ill->ill_defend_count >= defend_rate)
				break;
		}
		/* a short batch means the walk found no more candidates */
		if (ncert->ncert_num < NCE_RESCHED_LIST_LEN)
			break;
	}
}
/*
 * Check if the current rate-limiting parameters permit the sending
 * of another address defense announcement for both IPv4 and IPv6.
 * Returns B_TRUE if rate-limiting is in effect (i.e., send is not
 * permitted), and B_FALSE otherwise. The `defend_rate' parameter
 * determines how many address defense announcements are permitted
 * in any `defend_period' interval.
 */
static boolean_t
ill_defend_rate_limit(ill_t *ill, ncec_t *ncec)
{
	clock_t		now = ddi_get_lbolt();
	ip_stack_t	*ipst = ill->ill_ipst;
	clock_t		start = ill->ill_defend_start;
	uint32_t	elapsed, defend_period, defend_rate;
	nce_resched_t	ncert;
	boolean_t	ret;
	int		i;

	if (ill->ill_isv6) {
		defend_period = ipst->ips_ndp_defend_period;
		defend_rate = ipst->ips_ndp_defend_rate;
	} else {
		defend_period = ipst->ips_arp_defend_period;
		defend_rate = ipst->ips_arp_defend_rate;
	}
	/* rate of zero means defense announcements are disabled entirely */
	if (defend_rate == 0)
		return (B_TRUE);
	bzero(&ncert, sizeof (ncert));
	mutex_enter(&ill->ill_lock);
	if (start > 0) {
		elapsed = now - start;
		if (elapsed > SEC_TO_TICK(defend_period)) {
			/* period expired: reset the window */
			ill->ill_defend_start = now;
			/*
			 * nce_ill_reschedule will attempt to
			 * prevent starvation by rescheduling the
			 * oldest entries, which are marked with
			 * the NCE_F_DELAYED flag.
			 */
			nce_ill_reschedule(ill, &ncert);
		}
	} else {
		ill->ill_defend_start = now;
	}
	ASSERT(ill->ill_defend_count <= defend_rate);
	mutex_enter(&ncec->ncec_lock);
	if (ncec->ncec_flags & NCE_F_DELAYED) {
		/*
		 * This ncec was rescheduled as one of the really old
		 * entries needing on-going defense. The
		 * ill_defend_count was already incremented in
		 * nce_ill_reschedule. Go ahead and send the announce.
		 */
		ncec->ncec_flags &= ~NCE_F_DELAYED;
		mutex_exit(&ncec->ncec_lock);
		ret = B_FALSE;
		goto done;
	}
	mutex_exit(&ncec->ncec_lock);
	if (ill->ill_defend_count < defend_rate)
		ill->ill_defend_count++;
	if (ill->ill_defend_count == defend_rate) {
		/*
		 * we are no longer allowed to send unbidden defense
		 * messages. Wait for rescheduling.
		 */
		ret = B_TRUE;
	} else {
		ret = B_FALSE;
	}
done:
	mutex_exit(&ill->ill_lock);
	/*
	 * After all the locks have been dropped we can restart nce timer,
	 * and refrele the delayed ncecs
	 */
	for (i = 0; i < ncert.ncert_num; i++) {
		clock_t	xmit_interval;
		ncec_t	*tmp;

		tmp = ncert.ncert_nces[i];
		xmit_interval = nce_fuzz_interval(tmp->ncec_xmit_interval,
		    B_FALSE);
		nce_restart_timer(tmp, xmit_interval);
		ncec_refrele(tmp);
	}
	return (ret);
}
/*
 * Send an unsolicited Neighbor Advertisement for ncec_addr, with our
 * link-layer address, to the all-hosts multicast group.
 * NOTE(review): return value is passed straight through from ndp_xmit();
 * it appears to indicate whether the transmit was dropped -- confirm
 * against ndp_xmit().
 */
boolean_t
ndp_announce(ncec_t *ncec)
{
	return (ndp_xmit(ncec->ncec_ill, ND_NEIGHBOR_ADVERT, ncec->ncec_lladdr,
	    ncec->ncec_lladdr_length, &ncec->ncec_addr, &ipv6_all_hosts_mcast,
	    nce_advert_flags(ncec)));
}
/*
 * Pick a source address for the NS/ARP probe we are about to send for
 * ncec_addr, and return (refheld) the ill on which that source resides;
 * *src (which must be passed in as the unspecified address) is set to the
 * chosen address. Returns NULL if no usable source can be found, or if
 * the candidate ipif has not yet completed DAD (in which case the caller
 * is expected to retry later).
 */
ill_t *
nce_resolve_src(ncec_t *ncec, in6_addr_t *src)
{
	mblk_t		*mp;
	in6_addr_t	src6;
	ipaddr_t	src4;
	ill_t		*ill = ncec->ncec_ill;
	ill_t		*src_ill = NULL;
	ipif_t		*ipif = NULL;
	boolean_t	is_myaddr = NCE_MYADDR(ncec);
	boolean_t	isv6 = (ncec->ncec_ipversion == IPV6_VERSION);

	ASSERT(src != NULL);
	ASSERT(IN6_IS_ADDR_UNSPECIFIED(src));
	src6 = *src;
	if (is_myaddr) {
		/* DAD probes for our own address use that address itself */
		src6 = ncec->ncec_addr;
		if (!isv6)
			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, src4);
	} else {
		/*
		 * try to find one from the outgoing packet.
		 */
		mutex_enter(&ncec->ncec_lock);
		mp = ncec->ncec_qd_mp;
		if (mp != NULL) {
			if (isv6) {
				ip6_t	*ip6h = (ip6_t *)mp->b_rptr;

				src6 = ip6h->ip6_src;
			} else {
				ipha_t	*ipha = (ipha_t *)mp->b_rptr;

				src4 = ipha->ipha_src;
				IN6_IPADDR_TO_V4MAPPED(src4, &src6);
			}
		}
		mutex_exit(&ncec->ncec_lock);
	}

	/*
	 * For outgoing packets, if the src of outgoing packet is one
	 * of the assigned interface addresses use it, otherwise we
	 * will pick the source address below.
	 * For local addresses (is_myaddr) doing DAD, NDP announce
	 * messages are mcast. So we use the (IPMP) cast_ill or the
	 * (non-IPMP) ncec_ill for these message types. The only case
	 * of unicast DAD messages are for IPv6 ND probes, for which
	 * we find the ipif_bound_ill corresponding to the ncec_addr.
	 */
	if (!IN6_IS_ADDR_UNSPECIFIED(&src6) || is_myaddr) {
		if (isv6) {
			ipif = ipif_lookup_addr_nondup_v6(&src6, ill, ALL_ZONES,
			    ill->ill_ipst);
		} else {
			ipif = ipif_lookup_addr_nondup(src4, ill, ALL_ZONES,
			    ill->ill_ipst);
		}

		/*
		 * If no relevant ipif can be found, then it's not one of our
		 * addresses. Reset to :: and try to find a src for the NS or
		 * ARP request using ipif_select_source_v[4,6] below.
		 * If an ipif can be found, but it's not yet done with
		 * DAD verification, and we are not being invoked for
		 * DAD (i.e., !is_myaddr), then just postpone this
		 * transmission until later.
		 */
		if (ipif == NULL) {
			src6 = ipv6_all_zeros;
			src4 = INADDR_ANY;
		} else if (!ipif->ipif_addr_ready && !is_myaddr) {
			DTRACE_PROBE2(nce__resolve__ipif__not__ready,
			    ncec_t *, ncec, ipif_t *, ipif);
			ipif_refrele(ipif);
			return (NULL);
		}
	}

	if (IN6_IS_ADDR_UNSPECIFIED(&src6) && !is_myaddr) {
		/*
		 * Pick a source address for this solicitation, but
		 * restrict the selection to addresses assigned to the
		 * output interface. We do this because the destination will
		 * create a neighbor cache entry for the source address of
		 * this packet, so the source address had better be a valid
		 * neighbor.
		 */
		if (isv6) {
			ipif = ipif_select_source_v6(ill, &ncec->ncec_addr,
			    B_TRUE, IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
			    B_FALSE, NULL);
		} else {
			ipaddr_t nce_addr;

			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, nce_addr);
			ipif = ipif_select_source_v4(ill, nce_addr, ALL_ZONES,
			    B_FALSE, NULL);
		}
		/* for IPMP, fall back to the group's xmit ill */
		if (ipif == NULL && IS_IPMP(ill)) {
			ill_t *send_ill = ipmp_ill_hold_xmit_ill(ill, B_TRUE);

			if (send_ill != NULL) {
				if (isv6) {
					ipif = ipif_select_source_v6(send_ill,
					    &ncec->ncec_addr, B_TRUE,
					    IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
					    B_FALSE, NULL);
				} else {
					IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
					    src4);
					ipif = ipif_select_source_v4(send_ill,
					    src4, ALL_ZONES, B_TRUE, NULL);
				}
				ill_refrele(send_ill);
			}
		}

		if (ipif == NULL) {
			char buf[INET6_ADDRSTRLEN];

			ip1dbg(("nce_resolve_src: No source ipif for dst %s\n",
			    inet_ntop((isv6 ? AF_INET6 : AF_INET),
			    (char *)&ncec->ncec_addr, buf, sizeof (buf))));
			DTRACE_PROBE1(nce__resolve__no__ipif, ncec_t *, ncec);
			return (NULL);
		}
		src6 = ipif->ipif_v6lcl_addr;
	}
	*src = src6;
	if (ipif != NULL) {
		/* return the (refheld) ill that owns the chosen source */
		src_ill = ipif->ipif_ill;
		if (IS_IPMP(src_ill))
			src_ill = ipmp_ipif_hold_bound_ill(ipif);
		else
			ill_refhold(src_ill);
		ipif_refrele(ipif);
		DTRACE_PROBE2(nce__resolve__src__ill, ncec_t *, ncec,
		    ill_t *, src_ill);
	}
	return (src_ill);
}
/*
 * Update the nce(s) for `addr' with a new hardware address and flags.
 * If `ipif' is non-NULL, only the nce on that ipif's ill is updated;
 * otherwise every ncec in the stack matching `addr' is updated via
 * ncec_walk_common() / nce_update_hw_changed().
 */
void
ip_nce_lookup_and_update(ipaddr_t *addr, ipif_t *ipif, ip_stack_t *ipst,
    uchar_t *hwaddr, int hwaddr_len, int flags)
{
	ill_t	*ill;
	ncec_t	*ncec;
	nce_t	*nce;
	uint16_t new_state;

	ill = (ipif ? ipif->ipif_ill : NULL);
	if (ill != NULL) {
		/*
		 * only one ncec is possible
		 */
		nce = nce_lookup_v4(ill, addr);
		if (nce != NULL) {
			ncec = nce->nce_common;
			mutex_enter(&ncec->ncec_lock);
			/* unreachable entries go STALE for re-verification */
			if (NCE_ISREACHABLE(ncec))
				new_state = ND_UNCHANGED;
			else
				new_state = ND_STALE;
			ncec->ncec_flags = flags;
			nce_update(ncec, new_state, hwaddr);
			mutex_exit(&ncec->ncec_lock);
			nce_refrele(nce);
			return;
		}
	} else {
		/*
		 * ill is wildcard; clean up all ncec's and ire's
		 * that match on addr.
		 */
		nce_hw_map_t hwm;

		hwm.hwm_addr = *addr;
		hwm.hwm_hwlen = hwaddr_len;
		hwm.hwm_hwaddr = hwaddr;
		hwm.hwm_flags = flags;

		ncec_walk_common(ipst->ips_ndp4, NULL,
		    (pfi_t)nce_update_hw_changed, (uchar_t *)&hwm, B_TRUE);
	}
}
4370 * Common function to add ncec entries.
4371 * we always add the ncec with ncec_ill == ill, and always create
4372 * nce_t on ncec_ill. A dlpi fastpath message may be triggered if the
4373 * ncec is !reachable.
4375 * When the caller passes in an nce_state of ND_UNCHANGED,
4376 * nce_add_common() will determine the state of the created nce based
4377 * on the ill_net_type and nce_flags used. Otherwise, the nce will
4378 * be created with state set to the passed in nce_state.
4380 static int
4381 nce_add_common(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
4382 const in6_addr_t *addr, uint16_t flags, uint16_t nce_state, nce_t **retnce)
4384 static ncec_t nce_nil;
4385 uchar_t *template = NULL;
4386 int err;
4387 ncec_t *ncec;
4388 ncec_t **ncep;
4389 ip_stack_t *ipst = ill->ill_ipst;
4390 uint16_t state;
4391 boolean_t fastprobe = B_FALSE;
4392 struct ndp_g_s *ndp;
4393 nce_t *nce = NULL;
4394 mblk_t *dlur_mp = NULL;
4396 if (ill->ill_isv6)
4397 ndp = ill->ill_ipst->ips_ndp6;
4398 else
4399 ndp = ill->ill_ipst->ips_ndp4;
4401 *retnce = NULL;
4403 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
4405 if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
4406 ip0dbg(("nce_add_common: no addr\n"));
4407 return (EINVAL);
4409 if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
4410 ip0dbg(("nce_add_common: flags = %x\n", (int)flags));
4411 return (EINVAL);
4414 if (ill->ill_isv6) {
4415 ncep = ((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
4416 } else {
4417 ipaddr_t v4addr;
4419 IN6_V4MAPPED_TO_IPADDR(addr, v4addr);
4420 ncep = ((ncec_t **)NCE_HASH_PTR_V4(ipst, v4addr));
4424 * The caller has ensured that there is no nce on ill, but there could
4425 * still be an nce_common_t for the address, so that we find exisiting
4426 * ncec_t strucutures first, and atomically add a new nce_t if
4427 * one is found. The ndp_g_lock ensures that we don't cross threads
4428 * with an ncec_delete(). Unlike ncec_lookup_illgrp() we do not
4429 * compare for matches across the illgrp because this function is
4430 * called via nce_lookup_then_add_v* -> nce_add_v* -> nce_add_common,
4431 * with the nce_lookup_then_add_v* passing in the ipmp_ill where
4432 * appropriate.
4434 ncec = *ncep;
4435 for (; ncec != NULL; ncec = ncec->ncec_next) {
4436 if (ncec->ncec_ill == ill) {
4437 if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
4439 * We should never find *retnce to be
4440 * MYADDR, since the caller may then
4441 * incorrectly restart a DAD timer that's
4442 * already running. However, if we are in
4443 * forwarding mode, and the interface is
4444 * moving in/out of groups, the data
4445 * path ire lookup (e.g., ire_revalidate_nce)
4446 * may have determined that some destination
4447 * is offlink while the control path is adding
4448 * that address as a local address.
4449 * Recover from this case by failing the
4450 * lookup
4452 if (NCE_MYADDR(ncec))
4453 return (ENXIO);
4454 *retnce = nce_ill_lookup_then_add(ill, ncec);
4455 if (*retnce != NULL)
4456 break;
4460 if (*retnce != NULL) /* caller must trigger fastpath on nce */
4461 return (0);
4463 ncec = kmem_cache_alloc(ncec_cache, KM_NOSLEEP);
4464 if (ncec == NULL)
4465 return (ENOMEM);
4466 *ncec = nce_nil;
4467 ncec->ncec_ill = ill;
4468 ncec->ncec_ipversion = (ill->ill_isv6 ? IPV6_VERSION : IPV4_VERSION);
4469 ncec->ncec_flags = flags;
4470 ncec->ncec_ipst = ipst; /* No netstack_hold */
4472 if (!ill->ill_isv6) {
4473 ipaddr_t addr4;
4476 * DAD probe interval and probe count are set based on
4477 * fast/slow probe settings. If the underlying link doesn't
4478 * have reliably up/down notifications or if we're working
4479 * with IPv4 169.254.0.0/16 Link Local Address space, then
4480 * don't use the fast timers. Otherwise, use them.
4482 ASSERT(IN6_IS_ADDR_V4MAPPED(addr));
4483 IN6_V4MAPPED_TO_IPADDR(addr, addr4);
4484 if (ill->ill_note_link && !IS_IPV4_LL_SPACE(&addr4)) {
4485 fastprobe = B_TRUE;
4486 } else if (IS_IPMP(ill) && NCE_PUBLISH(ncec) &&
4487 !IS_IPV4_LL_SPACE(&addr4)) {
4488 ill_t *hwaddr_ill;
4490 hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, hw_addr,
4491 hw_addr_len);
4492 if (hwaddr_ill != NULL && hwaddr_ill->ill_note_link)
4493 fastprobe = B_TRUE;
4495 if (fastprobe) {
4496 ncec->ncec_xmit_interval =
4497 ipst->ips_arp_fastprobe_interval;
4498 ncec->ncec_pcnt =
4499 ipst->ips_arp_fastprobe_count;
4500 ncec->ncec_flags |= NCE_F_FAST;
4501 } else {
4502 ncec->ncec_xmit_interval =
4503 ipst->ips_arp_probe_interval;
4504 ncec->ncec_pcnt =
4505 ipst->ips_arp_probe_count;
4507 if (NCE_PUBLISH(ncec)) {
4508 ncec->ncec_unsolicit_count =
4509 ipst->ips_ip_arp_publish_count;
4511 } else {
4513 * probe interval is constant: ILL_PROBE_INTERVAL
4514 * probe count is constant: ND_MAX_UNICAST_SOLICIT
4516 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
4517 if (NCE_PUBLISH(ncec)) {
4518 ncec->ncec_unsolicit_count =
4519 ipst->ips_ip_ndp_unsolicit_count;
4522 ncec->ncec_rcnt = ill->ill_xmit_count;
4523 ncec->ncec_addr = *addr;
4524 ncec->ncec_qd_mp = NULL;
4525 ncec->ncec_refcnt = 1; /* for ncec getting created */
4526 mutex_init(&ncec->ncec_lock, NULL, MUTEX_DEFAULT, NULL);
4527 ncec->ncec_trace_disable = B_FALSE;
4530 * ncec_lladdr holds link layer address
4532 if (hw_addr_len > 0) {
4533 template = kmem_alloc(hw_addr_len, KM_NOSLEEP);
4534 if (template == NULL) {
4535 err = ENOMEM;
4536 goto err_ret;
4538 ncec->ncec_lladdr = template;
4539 ncec->ncec_lladdr_length = hw_addr_len;
4540 bzero(ncec->ncec_lladdr, hw_addr_len);
4542 if ((flags & NCE_F_BCAST) != 0) {
4543 state = ND_REACHABLE;
4544 ASSERT(hw_addr_len > 0);
4545 } else if (ill->ill_net_type == IRE_IF_RESOLVER) {
4546 state = ND_INITIAL;
4547 } else if (ill->ill_net_type == IRE_IF_NORESOLVER) {
4549 * NORESOLVER entries are always created in the REACHABLE
4550 * state.
4552 state = ND_REACHABLE;
4553 if (ill->ill_phys_addr_length == IP_ADDR_LEN &&
4554 ill->ill_mactype != DL_IPV4 &&
4555 ill->ill_mactype != DL_6TO4) {
4557 * We create a nce_res_mp with the IP nexthop address
4558 * as the destination address if the physical length
4559 * is exactly 4 bytes for point-to-multipoint links
4560 * that do their own resolution from IP to link-layer
4561 * address (e.g. IP over X.25).
4563 bcopy((uchar_t *)addr,
4564 ncec->ncec_lladdr, ill->ill_phys_addr_length);
4566 if (ill->ill_phys_addr_length == IPV6_ADDR_LEN &&
4567 ill->ill_mactype != DL_IPV6) {
4569 * We create a nce_res_mp with the IP nexthop address
4570  * as the destination address if the physical length
4571 * is exactly 16 bytes for point-to-multipoint links
4572 * that do their own resolution from IP to link-layer
4573 * address.
4575 bcopy((uchar_t *)addr,
4576 ncec->ncec_lladdr, ill->ill_phys_addr_length);
4579 * Since NUD is not part of the base IPv4 protocol definition,
4580 * IPv4 neighbor entries on NORESOLVER interfaces will never
4581 * age, and are marked NCE_F_NONUD.
4583 if (!ill->ill_isv6)
4584 ncec->ncec_flags |= NCE_F_NONUD;
4585 } else if (ill->ill_net_type == IRE_LOOPBACK) {
4586 state = ND_REACHABLE;
4589 if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER) {
4591 * We are adding an ncec with a deterministic hw_addr,
4592 * so the state can only be one of {REACHABLE, STALE, PROBE}.
4594 * if we are adding a unicast ncec for the local address
4595 * it would be REACHABLE; we would be adding a ND_STALE entry
4596 * for the requestor of an ARP_REQUEST/ND_SOLICIT. Our own
4597 * addresses are added in PROBE to trigger DAD.
4599 if ((flags & (NCE_F_MCAST|NCE_F_BCAST)) ||
4600 ill->ill_net_type == IRE_IF_NORESOLVER)
4601 state = ND_REACHABLE;
4602 else if (!NCE_PUBLISH(ncec))
4603 state = ND_STALE;
4604 else
4605 state = ND_PROBE;
4606 if (hw_addr != NULL)
4607 nce_set_ll(ncec, hw_addr);
4609 /* caller overrides internally computed state */
4610 if (nce_state != ND_UNCHANGED)
4611 state = nce_state;
4613 if (state == ND_PROBE)
4614 ncec->ncec_flags |= NCE_F_UNVERIFIED;
4616 ncec->ncec_state = state;
4618 if (state == ND_REACHABLE) {
4619 ncec->ncec_last = ncec->ncec_init_time =
4620 TICK_TO_MSEC(ddi_get_lbolt64());
4621 } else {
4622 ncec->ncec_last = 0;
4623 if (state == ND_INITIAL)
4624 ncec->ncec_init_time = TICK_TO_MSEC(ddi_get_lbolt64());
4626 list_create(&ncec->ncec_cb, sizeof (ncec_cb_t),
4627 offsetof(ncec_cb_t, ncec_cb_node));
4629 * have all the memory allocations out of the way before taking locks
4630 * and adding the nce.
4632 nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
4633 if (nce == NULL) {
4634 err = ENOMEM;
4635 goto err_ret;
4637 if (ncec->ncec_lladdr != NULL ||
4638 ill->ill_net_type == IRE_IF_NORESOLVER) {
4639 dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
4640 ill->ill_phys_addr_length, ill->ill_sap,
4641 ill->ill_sap_length);
4642 if (dlur_mp == NULL) {
4643 err = ENOMEM;
4644 goto err_ret;
4649 * Atomically ensure that the ill is not CONDEMNED, before
4650 * adding the NCE.
4652 mutex_enter(&ill->ill_lock);
4653 if (ill->ill_state_flags & ILL_CONDEMNED) {
4654 mutex_exit(&ill->ill_lock);
4655 err = EINVAL;
4656 goto err_ret;
4658 if (!NCE_MYADDR(ncec) &&
4659 (ill->ill_state_flags & ILL_DOWN_IN_PROGRESS)) {
4660 mutex_exit(&ill->ill_lock);
4661 DTRACE_PROBE1(nce__add__on__down__ill, ncec_t *, ncec);
4662 err = EINVAL;
4663 goto err_ret;
4666 * Acquire the ncec_lock even before adding the ncec to the list
4667 * so that it cannot get deleted after the ncec is added, but
4668 * before we add the nce.
4670 mutex_enter(&ncec->ncec_lock);
4671 if ((ncec->ncec_next = *ncep) != NULL)
4672 ncec->ncec_next->ncec_ptpn = &ncec->ncec_next;
4673 *ncep = ncec;
4674 ncec->ncec_ptpn = ncep;
4676 /* Bump up the number of ncec's referencing this ill */
4677 DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
4678 (char *), "ncec", (void *), ncec);
4679 ill->ill_ncec_cnt++;
4681 * Since we hold the ncec_lock at this time, the ncec cannot be
4682 * condemned, and we can safely add the nce.
4684 *retnce = nce_add_impl(ill, ncec, nce, dlur_mp);
4685 mutex_exit(&ncec->ncec_lock);
4686 mutex_exit(&ill->ill_lock);
4688 /* caller must trigger fastpath on *retnce */
4689 return (0);
4691 err_ret:
4692 if (ncec != NULL)
4693 kmem_cache_free(ncec_cache, ncec);
4694 if (nce != NULL)
4695 kmem_cache_free(nce_cache, nce);
4696 freemsg(dlur_mp);
4697 if (template != NULL)
4698 kmem_free(template, ill->ill_phys_addr_length);
4699 return (err);
4703 * take a ref on the nce
4705 void
4706 nce_refhold(nce_t *nce)
4708 mutex_enter(&nce->nce_lock);
4709 nce->nce_refcnt++;
4710 ASSERT((nce)->nce_refcnt != 0);
4711 mutex_exit(&nce->nce_lock);
4715 * release a ref on the nce; In general, this
4716 * cannot be called with locks held because nce_inactive
4717 * may result in nce_inactive which will take the ill_lock,
4718 * do ipif_ill_refrele_tail etc. Thus the one exception
4719 * where this can be called with locks held is when the caller
4720 * is certain that the nce_refcnt is sufficient to prevent
4721 * the invocation of nce_inactive.
4723 void
4724 nce_refrele(nce_t *nce)
4726 ASSERT((nce)->nce_refcnt != 0);
4727 mutex_enter(&nce->nce_lock);
4728 if (--nce->nce_refcnt == 0)
4729 nce_inactive(nce); /* destroys the mutex */
4730 else
4731 mutex_exit(&nce->nce_lock);
/*
 * Free the nce after all refs have gone away. Reached only from
 * nce_refrele() once nce_refcnt hits zero; nce_lock is destroyed
 * here, so the caller must not touch the nce afterwards.
 */
static void
nce_inactive(nce_t *nce)
{
	ill_t *ill = nce->nce_ill;

	ASSERT(nce->nce_refcnt == 0);

	/* drop this nce's hold on the shared ncec_t */
	ncec_refrele_notr(nce->nce_common);
	nce->nce_common = NULL;
	/* release the fastpath and DL_UNITDATA_REQ template mblks */
	freemsg(nce->nce_fp_mp);
	freemsg(nce->nce_dlur_mp);

	mutex_enter(&ill->ill_lock);
	DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
	    (char *), "nce", (void *), nce);
	ill->ill_nce_cnt--;
	nce->nce_ill = NULL;
	/*
	 * If the number of ncec's associated with this ill have dropped
	 * to zero, check whether we need to restart any operation that
	 * is waiting for this to happen.
	 */
	if (ILL_DOWN_OK(ill)) {
		/* ipif_ill_refrele_tail drops the ill_lock */
		ipif_ill_refrele_tail(ill);
	} else {
		mutex_exit(&ill->ill_lock);
	}

	mutex_destroy(&nce->nce_lock);
	kmem_cache_free(nce_cache, nce);
}
4771 * Add an nce to the ill_nce list.
4773 static nce_t *
4774 nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp)
4776 bzero(nce, sizeof (*nce));
4777 mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
4778 nce->nce_common = ncec;
4779 nce->nce_addr = ncec->ncec_addr;
4780 nce->nce_ill = ill;
4781 DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
4782 (char *), "nce", (void *), nce);
4783 ill->ill_nce_cnt++;
4785 nce->nce_refcnt = 1; /* for the thread */
4786 ncec->ncec_refcnt++; /* want ncec_refhold_locked_notr(ncec) */
4787 nce->nce_dlur_mp = dlur_mp;
4789 /* add nce to the ill's fastpath list. */
4790 nce->nce_refcnt++; /* for the list */
4791 list_insert_head(&ill->ill_nce, nce);
4792 return (nce);
4795 static nce_t *
4796 nce_add(ill_t *ill, ncec_t *ncec)
4798 nce_t *nce;
4799 mblk_t *dlur_mp = NULL;
4801 ASSERT(MUTEX_HELD(&ill->ill_lock));
4802 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
4804 nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
4805 if (nce == NULL)
4806 return (NULL);
4807 if (ncec->ncec_lladdr != NULL ||
4808 ill->ill_net_type == IRE_IF_NORESOLVER) {
4809 dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
4810 ill->ill_phys_addr_length, ill->ill_sap,
4811 ill->ill_sap_length);
4812 if (dlur_mp == NULL) {
4813 kmem_cache_free(nce_cache, nce);
4814 return (NULL);
4817 return (nce_add_impl(ill, ncec, nce, dlur_mp));
4821 * remove the nce from the ill_faspath list
4823 void
4824 nce_delete(nce_t *nce)
4826 ill_t *ill = nce->nce_ill;
4828 ASSERT(MUTEX_HELD(&ill->ill_lock));
4830 mutex_enter(&nce->nce_lock);
4831 if (nce->nce_is_condemned) {
4833 * some other thread has removed this nce from the ill_nce list
4835 mutex_exit(&nce->nce_lock);
4836 return;
4838 nce->nce_is_condemned = B_TRUE;
4839 mutex_exit(&nce->nce_lock);
4841 list_remove(&ill->ill_nce, nce);
4843 * even though we are holding the ill_lock, it is ok to
4844 * call nce_refrele here because we know that we should have
4845 * at least 2 refs on the nce: one for the thread, and one
4846 * for the list. The refrele below will release the one for
4847 * the list.
4849 nce_refrele(nce);
4852 nce_t *
4853 nce_lookup(ill_t *ill, const in6_addr_t *addr)
4855 nce_t *nce = NULL;
4857 ASSERT(ill != NULL);
4858 ASSERT(MUTEX_HELD(&ill->ill_lock));
4860 for (nce = list_head(&ill->ill_nce); nce != NULL;
4861 nce = list_next(&ill->ill_nce, nce)) {
4862 if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr))
4863 break;
4867 * if we found the nce on the ill_nce list while holding
4868 * the ill_lock, then it cannot be condemned yet.
4870 if (nce != NULL) {
4871 ASSERT(!nce->nce_is_condemned);
4872 nce_refhold(nce);
4874 return (nce);
4878 * Walk the ill_nce list on ill. The callback function func() cannot perform
4879 * any destructive actions.
4881 static void
4882 nce_walk_common(ill_t *ill, pfi_t func, void *arg)
4884 nce_t *nce = NULL, *nce_next;
4886 ASSERT(MUTEX_HELD(&ill->ill_lock));
4887 for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
4888 nce_next = list_next(&ill->ill_nce, nce);
4889 if (func(ill, nce, arg) != 0)
4890 break;
4891 nce = nce_next;
4895 void
4896 nce_walk(ill_t *ill, pfi_t func, void *arg)
4898 mutex_enter(&ill->ill_lock);
4899 nce_walk_common(ill, func, arg);
4900 mutex_exit(&ill->ill_lock);
4903 void
4904 nce_flush(ill_t *ill, boolean_t flushall)
4906 nce_t *nce, *nce_next;
4907 list_t dead;
4909 list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node));
4910 mutex_enter(&ill->ill_lock);
4911 for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
4912 nce_next = list_next(&ill->ill_nce, nce);
4913 if (!flushall && NCE_PUBLISH(nce->nce_common)) {
4914 nce = nce_next;
4915 continue;
4918 * nce_delete requires that the caller should either not
4919 * be holding locks, or should hold a ref to ensure that
4920 * we wont hit ncec_inactive. So take a ref and clean up
4921 * after the list is flushed.
4923 nce_refhold(nce);
4924 nce_delete(nce);
4925 list_insert_tail(&dead, nce);
4926 nce = nce_next;
4928 mutex_exit(&ill->ill_lock);
4929 while ((nce = list_head(&dead)) != NULL) {
4930 list_remove(&dead, nce);
4931 nce_refrele(nce);
4933 ASSERT(list_is_empty(&dead));
4934 list_destroy(&dead);
4937 /* Return an interval that is anywhere in the [1 .. intv] range */
4938 static clock_t
4939 nce_fuzz_interval(clock_t intv, boolean_t initial_time)
4941 clock_t rnd, frac;
4943 (void) random_get_pseudo_bytes((uint8_t *)&rnd, sizeof (rnd));
4944 /* Note that clock_t is signed; must chop off bits */
4945 rnd &= (1ul << (NBBY * sizeof (rnd) - 1)) - 1;
4946 if (initial_time) {
4947 if (intv <= 0)
4948 intv = 1;
4949 else
4950 intv = (rnd % intv) + 1;
4951 } else {
4952 /* Compute 'frac' as 20% of the configured interval */
4953 if ((frac = intv / 5) <= 1)
4954 frac = 2;
4955 /* Set intv randomly in the range [intv-frac .. intv+frac] */
4956 if ((intv = intv - frac + rnd % (2 * frac + 1)) <= 0)
4957 intv = 1;
4959 return (intv);
/*
 * Address resolution has completed on an IPMP meta-interface ncec, so
 * the packets queued on it can now be transmitted. For each queued
 * packet we select a "cast" ill in the group -- the ill owning the
 * packet's source address for probes, otherwise one chosen by the
 * group's transmit rotor -- create a fastpath nce on that under-ill,
 * and send. Packets for which no under-ill/under-nce can be set up are
 * dropped and counted in ipIfStatsOutDiscards. Any callbacks
 * registered on the ncec are dispatched at the end.
 */
void
nce_resolv_ipmp_ok(ncec_t *ncec)
{
	mblk_t *mp;
	uint_t pkt_len;
	iaflags_t ixaflags = IXAF_NO_TRACE;
	nce_t *under_nce;
	ill_t *ill = ncec->ncec_ill;
	boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
	ipif_t *src_ipif = NULL;
	ip_stack_t *ipst = ill->ill_ipst;
	ill_t *send_ill;
	uint_t nprobes;

	ASSERT(IS_IPMP(ill));

	/* atomically detach the queued chain and the probe count */
	mutex_enter(&ncec->ncec_lock);
	nprobes = ncec->ncec_nprobes;
	mp = ncec->ncec_qd_mp;
	ncec->ncec_qd_mp = NULL;
	ncec->ncec_nprobes = 0;
	mutex_exit(&ncec->ncec_lock);

	while (mp != NULL) {
		mblk_t *nxt_mp;

		/* unlink this packet from the b_next chain */
		nxt_mp = mp->b_next;
		mp->b_next = NULL;
		if (isv6) {
			ip6_t *ip6h = (ip6_t *)mp->b_rptr;

			pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
			src_ipif = ipif_lookup_addr_nondup_v6(&ip6h->ip6_src,
			    ill, ALL_ZONES, ipst);
		} else {
			ipha_t *ipha = (ipha_t *)mp->b_rptr;

			ixaflags |= IXAF_IS_IPV4;
			pkt_len = ntohs(ipha->ipha_length);
			src_ipif = ipif_lookup_addr_nondup(ipha->ipha_src,
			    ill, ALL_ZONES, ipst);
		}

		/*
		 * find a new nce based on an under_ill. The first IPMP probe
		 * packet gets queued, so we could still find a src_ipif that
		 * matches an IPMP test address.
		 */
		if (src_ipif == NULL || IS_IPMP(src_ipif->ipif_ill)) {
			/*
			 * if src_ipif is null, this could be either a
			 * forwarded packet or a probe whose src got deleted.
			 * We identify the former case by looking at
			 * ncec_nprobes: the first ncec_nprobes packets are
			 * probes;
			 */
			if (src_ipif == NULL && nprobes > 0)
				goto drop_pkt;

			/*
			 * For forwarded packets, we use the ipmp rotor
			 * to find send_ill.
			 */
			send_ill = ipmp_ill_hold_xmit_ill(ncec->ncec_ill,
			    B_TRUE);
		} else {
			send_ill = src_ipif->ipif_ill;
			ill_refhold(send_ill);
		}

		DTRACE_PROBE4(nce__resolve__ipmp, (mblk_t *), mp,
		    (ncec_t *), ncec, (ipif_t *),
		    src_ipif, (ill_t *), send_ill);

		if (send_ill == NULL) {
			if (src_ipif != NULL)
				ipif_refrele(src_ipif);
			goto drop_pkt;
		}
		/* create an under_nce on send_ill */
		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
		if (IS_IN_SAME_ILLGRP(send_ill, ncec->ncec_ill))
			under_nce = nce_fastpath_create(send_ill, ncec);
		else
			under_nce = NULL;
		rw_exit(&ipst->ips_ill_g_lock);
		if (under_nce != NULL && NCE_ISREACHABLE(ncec))
			nce_fastpath_trigger(under_nce);

		ill_refrele(send_ill);
		if (src_ipif != NULL)
			ipif_refrele(src_ipif);

		if (under_nce != NULL) {
			(void) ip_xmit(mp, under_nce, ixaflags, pkt_len, 0,
			    ALL_ZONES, 0, NULL);
			nce_refrele(under_nce);
			/* one fewer queued probe remains to account for */
			if (nprobes > 0)
				nprobes--;
			mp = nxt_mp;
			continue;
		}
drop_pkt:
		if (isv6) {
			BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
		} else {
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
		}
		ip_drop_output("ipIfStatsOutDiscards - no under_ill", mp, NULL);
		freemsg(mp);
		if (nprobes > 0)
			nprobes--;
		mp = nxt_mp;
	}
	ncec_cb_dispatch(ncec);	/* complete callbacks */
}