9903 qinfo: add typed members
[unleashed.git] / usr / src / uts / common / inet / ipnet / ipnet.c
blob5220236dfb97e2483cad24b4d33ea202c84771e2
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 * Copyright (c) 2016 by Delphix. All rights reserved.
29 * Copyright (c) 2016, Joyent, Inc. All rights reserved.
33 * The ipnet device defined here provides access to packets at the IP layer. To
34 * provide access to packets at this layer it registers a callback function in
35 * the ip module and when there are open instances of the device ip will pass
36 * packets into the device. Packets from ip are passed on the input, output and
37 * loopback paths. Internally the module returns to ip as soon as possible by
38 * deferring processing using a taskq.
40 * Management of the devices in /dev/ipnet/ is handled by the devname
41 * filesystem and use of the neti interfaces. This module registers for NIC
42 * events using the neti framework so that when IP interfaces are brought up,
43 * taken down etc. the ipnet module is notified and its view of the interfaces
44 * configured on the system adjusted. On attach, the module gets an initial
45 * view of the system again using the neti framework but as it has already
46 * registered for IP interface events, it is still up-to-date with any changes.
49 #include <sys/types.h>
50 #include <sys/conf.h>
51 #include <sys/cred.h>
52 #include <sys/stat.h>
53 #include <sys/ddi.h>
54 #include <sys/sunddi.h>
55 #include <sys/modctl.h>
56 #include <sys/dlpi.h>
57 #include <sys/strsun.h>
58 #include <sys/id_space.h>
59 #include <sys/kmem.h>
60 #include <sys/mkdev.h>
61 #include <sys/neti.h>
62 #include <net/if.h>
63 #include <sys/errno.h>
64 #include <sys/list.h>
65 #include <sys/ksynch.h>
66 #include <sys/hook_event.h>
67 #include <sys/sdt.h>
68 #include <sys/stropts.h>
69 #include <sys/sysmacros.h>
70 #include <inet/ip.h>
71 #include <inet/ip_if.h>
72 #include <inet/ip_multi.h>
73 #include <inet/ip6.h>
74 #include <inet/ipnet.h>
75 #include <net/bpf.h>
76 #include <net/bpfdesc.h>
77 #include <net/dlt.h>
/*
 * STREAMS module information for the ipnet driver.  The same structure is
 * referenced by both the read-side and write-side qinit tables.
 */
static struct module_info ipnet_minfo = {
	1,		/* mi_idnum */
	"ipnet",	/* mi_idname */
	0,		/* mi_minpsz */
	INFPSZ,		/* mi_maxpsz */
	2048,		/* mi_hiwat */
	0		/* mi_lowat */
};
/*
 * List to hold static view of ipnetif_t's on the system. This is needed to
 * avoid holding the lock protecting the avl tree of ipnetif's over the
 * callback into the dev filesystem.
 */
typedef struct ipnetif_cbdata {
	char		ic_ifname[LIFNAMSIZ];	/* interface name */
	dev_t		ic_dev;			/* device number */
	list_node_t	ic_next;		/* list linkage */
} ipnetif_cbdata_t;
/*
 * Convenience enumerated type for ipnet_accept().  It describes the
 * properties of a given ipnet_addrp_t relative to a single ipnet_t
 * client stream.  The values represent whether the address is ...
 */
typedef enum {
	IPNETADDR_MYADDR,	/* an address on my ipnetif_t. */
	IPNETADDR_MBCAST,	/* a multicast or broadcast address. */
	IPNETADDR_UNKNOWN	/* none of the above. */
} ipnet_addrtype_t;
/*
 * Argument used for the ipnet_nicevent_taskq callback.  One of these
 * describes a single NIC event: which event it was, the protocol handle and
 * stack it was seen on, and the interface (physical and logical index plus
 * name) it concerns.
 */
typedef struct ipnet_nicevent_s {
	nic_event_t		ipne_event;
	net_handle_t		ipne_protocol;
	netstackid_t		ipne_stackid;
	uint64_t		ipne_ifindex;
	uint64_t		ipne_lifindex;
	char			ipne_ifname[LIFNAMSIZ];
} ipnet_nicevent_t;
/*
 * Driver-global state.  ipnet_dip is set in ipnet_attach() and cleared in
 * ipnet_detach(); the major number, minor id space and both taskqs are set
 * up in _init().
 */
static dev_info_t	*ipnet_dip;
static major_t		ipnet_major;
static ddi_taskq_t	*ipnet_taskq;		/* taskq for packets */
static ddi_taskq_t	*ipnet_nicevent_taskq;	/* taskq for NIC events */
static id_space_t	*ipnet_minor_space;
static const int	IPNET_MINOR_LO = 1;	/* minor number for /dev/lo0 */
static const int	IPNET_MINOR_MIN = 2;	/* start of dynamic minors */
static dl_info_ack_t	ipnet_infoack = IPNET_INFO_ACK_INIT;
static ipnet_acceptfn_t	ipnet_accept, ipnet_loaccept;
static bpf_itap_fn_t	ipnet_itap;
/*
 * Forward declarations for the file-local functions of this driver.
 */
static void	ipnet_input(mblk_t *);
static int	ipnet_wput(queue_t *, mblk_t *);
static int	ipnet_rsrv(queue_t *);
static int	ipnet_open(queue_t *, dev_t *, int, int, cred_t *);
static int	ipnet_close(queue_t *, int, cred_t *);
static void	ipnet_ioctl(queue_t *, mblk_t *);
static void	ipnet_iocdata(queue_t *, mblk_t *);
static void	ipnet_wputnondata(queue_t *, mblk_t *);
static int	ipnet_attach(dev_info_t *, ddi_attach_cmd_t);
static int	ipnet_detach(dev_info_t *, ddi_detach_cmd_t);
static int	ipnet_devinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static void	ipnet_inforeq(queue_t *q, mblk_t *mp);
static void	ipnet_bindreq(queue_t *q, mblk_t *mp);
static void	ipnet_unbindreq(queue_t *q, mblk_t *mp);
static void	ipnet_dlpromisconreq(queue_t *q, mblk_t *mp);
static void	ipnet_dlpromiscoffreq(queue_t *q, mblk_t *mp);
static int	ipnet_join_allmulti(ipnetif_t *, ipnet_stack_t *);
static void	ipnet_leave_allmulti(ipnetif_t *, ipnet_stack_t *);
static int	ipnet_nicevent_cb(hook_event_token_t, hook_data_t, void *);
static void	ipnet_nicevent_task(void *);
static ipnetif_t *ipnetif_create(const char *, uint64_t, ipnet_stack_t *,
    uint64_t);
static void	ipnetif_remove(ipnetif_t *, ipnet_stack_t *);
static ipnetif_addr_t *ipnet_match_lif(ipnetif_t *, lif_if_t, boolean_t);
static ipnetif_t *ipnetif_getby_index(uint64_t, ipnet_stack_t *);
static ipnetif_t *ipnetif_getby_dev(dev_t, ipnet_stack_t *);
static boolean_t ipnetif_in_zone(ipnetif_t *, zoneid_t, ipnet_stack_t *);
static void	ipnetif_zonecheck(ipnetif_t *, ipnet_stack_t *);
static int	ipnet_populate_if(net_handle_t, ipnet_stack_t *, boolean_t);
static int	ipnetif_compare_name(const void *, const void *);
static int	ipnetif_compare_name_zone(const void *, const void *);
static int	ipnetif_compare_index(const void *, const void *);
static void	ipnet_add_ifaddr(uint64_t, ipnetif_t *, net_handle_t);
static void	ipnet_delete_ifaddr(ipnetif_addr_t *, ipnetif_t *, boolean_t);
static void	ipnetif_refhold(ipnetif_t *);
static void	ipnetif_refrele(ipnetif_t *);
static void	ipnet_walkers_inc(ipnet_stack_t *);
static void	ipnet_walkers_dec(ipnet_stack_t *);
static void	ipnet_register_netihook(ipnet_stack_t *);
static void	*ipnet_stack_init(netstackid_t, netstack_t *);
static void	ipnet_stack_fini(netstackid_t, void *);
static void	ipnet_dispatch(void *);
static int	ipobs_bounce_func(hook_event_token_t, hook_data_t, void *);
static int	ipnet_bpf_bounce(hook_event_token_t, hook_data_t, void *);
static ipnetif_t *ipnetif_clone_create(ipnetif_t *, zoneid_t);
static void	ipnetif_clone_release(ipnetif_t *);
/*
 * Read-side queue initialization: no put procedure, ipnet_rsrv() as the
 * service procedure, and the driver's open/close entry points.
 */
static struct qinit ipnet_rinit = {
	NULL,		/* qi_putp */
	ipnet_rsrv,	/* qi_srvp */
	ipnet_open,	/* qi_qopen */
	ipnet_close,	/* qi_qclose */
	NULL,		/* qi_qadmin */
	&ipnet_minfo,	/* qi_minfo */
};
/*
 * Write-side queue initialization: ipnet_wput() is the put procedure and
 * there is no service procedure.
 */
static struct qinit ipnet_winit = {
	ipnet_wput,	/* qi_putp */
	NULL,		/* qi_srvp */
	NULL,		/* qi_qopen */
	NULL,		/* qi_qclose */
	NULL,		/* qi_qadmin */
	&ipnet_minfo,	/* qi_minfo */
};
/* Stream table tying the read and write qinit structures together. */
static struct streamtab ipnet_info = {
	&ipnet_rinit, &ipnet_winit
};

/*
 * Device operations for the driver: attach/detach/getinfo entry points,
 * the stream table above, and no quiesce support.
 */
DDI_DEFINE_STREAM_OPS(ipnet_ops, nulldev, nulldev, ipnet_attach,
    ipnet_detach, nodev, ipnet_devinfo, D_MP | D_MTPERMOD, &ipnet_info,
    ddi_quiesce_not_supported);
/* Loadable-driver linkage: description string and the dev_ops above. */
static struct modldrv modldrv = {
	&mod_driverops,
	"STREAMS ipnet driver",
	&ipnet_ops
};

/* Module linkage handed to mod_install(), mod_remove() and mod_info(). */
static struct modlinkage modlinkage = {
	MODREV_1, &modldrv, NULL
};
/*
 * This structure contains the template data (names and type) that is
 * copied, in bulk, into the new kstats structure created by net_kstat_create.
 * No actual statistical information is stored in this instance of the
 * ipnet_kstats_t structure.
 */
static ipnet_kstats_t stats_template = {
	{ "duplicationFail",	KSTAT_DATA_UINT64 },
	{ "dispatchOk",		KSTAT_DATA_UINT64 },
	{ "dispatchFail",	KSTAT_DATA_UINT64 },
	{ "dispatchHeaderDrop",	KSTAT_DATA_UINT64 },
	{ "dispatchDupDrop",	KSTAT_DATA_UINT64 },
	{ "dispatchDeliver",	KSTAT_DATA_UINT64 },
	{ "acceptOk",		KSTAT_DATA_UINT64 },
	{ "acceptFail",		KSTAT_DATA_UINT64 }
};
232 * Walk the list of physical interfaces on the machine, for each
233 * interface create a new ipnetif_t and add any addresses to it. We
234 * need to do the walk twice, once for IPv4 and once for IPv6.
236 * The interfaces are destroyed as part of ipnet_stack_fini() for each
237 * stack. Note that we cannot do this initialization in
238 * ipnet_stack_init(), since ipnet_stack_init() cannot fail.
240 static int
241 ipnetif_init(void)
243 netstack_handle_t nh;
244 netstack_t *ns;
245 ipnet_stack_t *ips;
246 int ret = 0;
248 netstack_next_init(&nh);
249 while ((ns = netstack_next(&nh)) != NULL) {
250 ips = ns->netstack_ipnet;
251 if ((ret = ipnet_populate_if(ips->ips_ndv4, ips, B_FALSE)) == 0)
252 ret = ipnet_populate_if(ips->ips_ndv6, ips, B_TRUE);
253 netstack_rele(ns);
254 if (ret != 0)
255 break;
257 netstack_next_fini(&nh);
258 return (ret);
262 * Standard module entry points.
265 _init(void)
267 int ret;
268 boolean_t netstack_registered = B_FALSE;
270 if ((ipnet_major = ddi_name_to_major("ipnet")) == (major_t)-1)
271 return (ENODEV);
272 ipnet_minor_space = id_space_create("ipnet_minor_space",
273 IPNET_MINOR_MIN, MAXMIN32);
276 * We call ddi_taskq_create() with nthread == 1 to ensure in-order
277 * delivery of packets to clients. Note that we need to create the
278 * taskqs before calling netstack_register() since ipnet_stack_init()
279 * registers callbacks that use 'em.
281 ipnet_taskq = ddi_taskq_create(NULL, "ipnet", 1, TASKQ_DEFAULTPRI, 0);
282 ipnet_nicevent_taskq = ddi_taskq_create(NULL, "ipnet_nic_event_queue",
283 1, TASKQ_DEFAULTPRI, 0);
284 if (ipnet_taskq == NULL || ipnet_nicevent_taskq == NULL) {
285 ret = ENOMEM;
286 goto done;
289 netstack_register(NS_IPNET, ipnet_stack_init, NULL, ipnet_stack_fini);
290 netstack_registered = B_TRUE;
292 if ((ret = ipnetif_init()) == 0)
293 ret = mod_install(&modlinkage);
294 done:
295 if (ret != 0) {
296 if (ipnet_taskq != NULL)
297 ddi_taskq_destroy(ipnet_taskq);
298 if (ipnet_nicevent_taskq != NULL)
299 ddi_taskq_destroy(ipnet_nicevent_taskq);
300 if (netstack_registered)
301 netstack_unregister(NS_IPNET);
302 id_space_destroy(ipnet_minor_space);
304 return (ret);
308 _fini(void)
310 int err;
312 if ((err = mod_remove(&modlinkage)) != 0)
313 return (err);
315 netstack_unregister(NS_IPNET);
316 ddi_taskq_destroy(ipnet_nicevent_taskq);
317 ddi_taskq_destroy(ipnet_taskq);
318 id_space_destroy(ipnet_minor_space);
319 return (0);
323 _info(struct modinfo *modinfop)
325 return (mod_info(&modlinkage, modinfop));
328 static void
329 ipnet_register_netihook(ipnet_stack_t *ips)
331 int ret;
332 zoneid_t zoneid;
333 netid_t netid;
335 HOOK_INIT(ips->ips_nicevents, ipnet_nicevent_cb, "ipnet_nicevents",
336 ips);
339 * It is possible for an exclusive stack to be in the process of
340 * shutting down here, and the netid and protocol lookups could fail
341 * in that case.
343 zoneid = netstackid_to_zoneid(ips->ips_netstack->netstack_stackid);
344 if ((netid = net_zoneidtonetid(zoneid)) == -1)
345 return;
347 if ((ips->ips_ndv4 = net_protocol_lookup(netid, NHF_INET)) != NULL) {
348 if ((ret = net_hook_register(ips->ips_ndv4, NH_NIC_EVENTS,
349 ips->ips_nicevents)) != 0) {
350 VERIFY(net_protocol_release(ips->ips_ndv4) == 0);
351 ips->ips_ndv4 = NULL;
352 cmn_err(CE_WARN, "unable to register IPv4 netinfo hooks"
353 " in zone %d: %d", zoneid, ret);
356 if ((ips->ips_ndv6 = net_protocol_lookup(netid, NHF_INET6)) != NULL) {
357 if ((ret = net_hook_register(ips->ips_ndv6, NH_NIC_EVENTS,
358 ips->ips_nicevents)) != 0) {
359 VERIFY(net_protocol_release(ips->ips_ndv6) == 0);
360 ips->ips_ndv6 = NULL;
361 cmn_err(CE_WARN, "unable to register IPv6 netinfo hooks"
362 " in zone %d: %d", zoneid, ret);
367 * Create a local set of kstats for each zone.
369 ips->ips_kstatp = net_kstat_create(netid, "ipnet", 0, "ipnet_stats",
370 "misc", KSTAT_TYPE_NAMED,
371 sizeof (ipnet_kstats_t) / sizeof (kstat_named_t), 0);
372 if (ips->ips_kstatp != NULL) {
373 bcopy(&stats_template, &ips->ips_stats,
374 sizeof (ips->ips_stats));
375 ips->ips_kstatp->ks_data = &ips->ips_stats;
376 ips->ips_kstatp->ks_private =
377 (void *)(uintptr_t)ips->ips_netstack->netstack_stackid;
378 kstat_install(ips->ips_kstatp);
379 } else {
380 cmn_err(CE_WARN, "net_kstat_create(%s,%s,%s) failed",
381 "ipnet", "ipnet_stats", "misc");
386 * This function is called on attach to build an initial view of the
387 * interfaces on the system. It will be called once for IPv4 and once
388 * for IPv6, although there is only one ipnet interface for both IPv4
389 * and IPv6 there are separate address lists.
391 static int
392 ipnet_populate_if(net_handle_t nd, ipnet_stack_t *ips, boolean_t isv6)
394 phy_if_t phyif;
395 lif_if_t lif;
396 ipnetif_t *ipnetif;
397 char name[LIFNAMSIZ];
398 boolean_t new_if = B_FALSE;
399 uint64_t ifflags;
400 int ret = 0;
403 * If ipnet_register_netihook() was unable to initialize this
404 * stack's net_handle_t, then we cannot populate any interface
405 * information. This usually happens when we attempted to
406 * grab a net_handle_t as a stack was shutting down. We don't
407 * want to fail the entire _init() operation because of a
408 * stack shutdown (other stacks will continue to work just
409 * fine), so we silently return success here.
411 if (nd == NULL)
412 return (0);
415 * Make sure we're not processing NIC events during the
416 * population of our interfaces and address lists.
418 mutex_enter(&ips->ips_event_lock);
420 for (phyif = net_phygetnext(nd, 0); phyif != 0;
421 phyif = net_phygetnext(nd, phyif)) {
422 if (net_getifname(nd, phyif, name, LIFNAMSIZ) != 0)
423 continue;
424 ifflags = 0;
425 (void) net_getlifflags(nd, phyif, 0, &ifflags);
426 if ((ipnetif = ipnetif_getby_index(phyif, ips)) == NULL) {
427 ipnetif = ipnetif_create(name, phyif, ips, ifflags);
428 if (ipnetif == NULL) {
429 ret = ENOMEM;
430 goto done;
432 new_if = B_TRUE;
434 ipnetif->if_flags |=
435 isv6 ? IPNETIF_IPV6PLUMBED : IPNETIF_IPV4PLUMBED;
437 for (lif = net_lifgetnext(nd, phyif, 0); lif != 0;
438 lif = net_lifgetnext(nd, phyif, lif)) {
440 * Skip addresses that aren't up. We'll add
441 * them when we receive an NE_LIF_UP event.
443 if (net_getlifflags(nd, phyif, lif, &ifflags) != 0 ||
444 !(ifflags & IFF_UP))
445 continue;
446 /* Don't add it if we already have it. */
447 if (ipnet_match_lif(ipnetif, lif, isv6) != NULL)
448 continue;
449 ipnet_add_ifaddr(lif, ipnetif, nd);
451 if (!new_if)
452 ipnetif_refrele(ipnetif);
455 done:
456 mutex_exit(&ips->ips_event_lock);
457 return (ret);
460 static int
461 ipnet_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
463 if (cmd != DDI_ATTACH)
464 return (DDI_FAILURE);
466 if (ddi_create_minor_node(dip, "lo0", S_IFCHR, IPNET_MINOR_LO,
467 DDI_PSEUDO, 0) == DDI_FAILURE)
468 return (DDI_FAILURE);
470 ipnet_dip = dip;
471 return (DDI_SUCCESS);
474 static int
475 ipnet_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
477 if (cmd != DDI_DETACH)
478 return (DDI_FAILURE);
480 ASSERT(dip == ipnet_dip);
481 ddi_remove_minor_node(ipnet_dip, NULL);
482 ipnet_dip = NULL;
483 return (DDI_SUCCESS);
486 /* ARGSUSED */
487 static int
488 ipnet_devinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
490 int error = DDI_FAILURE;
492 switch (infocmd) {
493 case DDI_INFO_DEVT2INSTANCE:
494 *result = (void *)0;
495 error = DDI_SUCCESS;
496 break;
497 case DDI_INFO_DEVT2DEVINFO:
498 if (ipnet_dip != NULL) {
499 *result = ipnet_dip;
500 error = DDI_SUCCESS;
502 break;
504 return (error);
507 /* ARGSUSED */
508 static int
509 ipnet_open(queue_t *rq, dev_t *dev, int oflag, int sflag, cred_t *crp)
511 ipnet_t *ipnet;
512 netstack_t *ns = NULL;
513 ipnet_stack_t *ips;
514 int err = 0;
515 zoneid_t zoneid = crgetzoneid(crp);
518 * If the system is labeled, only the global zone is allowed to open
519 * IP observability nodes.
521 if (is_system_labeled() && zoneid != GLOBAL_ZONEID)
522 return (EACCES);
524 /* We don't support open as a module */
525 if (sflag & MODOPEN)
526 return (ENOTSUP);
528 /* This driver is self-cloning, we don't support re-open. */
529 if (rq->q_ptr != NULL)
530 return (EBUSY);
532 if ((ipnet = kmem_zalloc(sizeof (*ipnet), KM_NOSLEEP)) == NULL)
533 return (ENOMEM);
535 VERIFY((ns = netstack_find_by_cred(crp)) != NULL);
536 ips = ns->netstack_ipnet;
538 rq->q_ptr = WR(rq)->q_ptr = ipnet;
539 ipnet->ipnet_rq = rq;
540 ipnet->ipnet_minor = (minor_t)id_alloc(ipnet_minor_space);
541 ipnet->ipnet_zoneid = zoneid;
542 ipnet->ipnet_dlstate = DL_UNBOUND;
543 ipnet->ipnet_ns = ns;
546 * We need to hold ips_event_lock here as any NE_LIF_DOWN events need
547 * to be processed after ipnet_if is set and the ipnet_t has been
548 * inserted in the ips_str_list.
550 mutex_enter(&ips->ips_event_lock);
551 if (getminor(*dev) == IPNET_MINOR_LO) {
552 ipnet->ipnet_flags |= IPNET_LOMODE;
553 ipnet->ipnet_acceptfn = ipnet_loaccept;
554 } else {
555 ipnet->ipnet_acceptfn = ipnet_accept;
556 ipnet->ipnet_if = ipnetif_getby_dev(*dev, ips);
557 if (ipnet->ipnet_if == NULL ||
558 !ipnetif_in_zone(ipnet->ipnet_if, zoneid, ips)) {
559 err = ENODEV;
560 goto done;
564 mutex_enter(&ips->ips_walkers_lock);
565 while (ips->ips_walkers_cnt != 0)
566 cv_wait(&ips->ips_walkers_cv, &ips->ips_walkers_lock);
567 list_insert_head(&ips->ips_str_list, ipnet);
568 *dev = makedevice(getmajor(*dev), ipnet->ipnet_minor);
569 qprocson(rq);
572 * Only register our callback if we're the first open client; we call
573 * unregister in close() for the last open client.
575 if (list_head(&ips->ips_str_list) == list_tail(&ips->ips_str_list))
576 ips->ips_hook = ipobs_register_hook(ns, ipnet_input);
577 mutex_exit(&ips->ips_walkers_lock);
579 done:
580 mutex_exit(&ips->ips_event_lock);
581 if (err != 0) {
582 netstack_rele(ns);
583 id_free(ipnet_minor_space, ipnet->ipnet_minor);
584 if (ipnet->ipnet_if != NULL)
585 ipnetif_refrele(ipnet->ipnet_if);
586 kmem_free(ipnet, sizeof (*ipnet));
588 return (err);
591 /* ARGSUSED */
592 static int
593 ipnet_close(queue_t *rq, int flags __unused, cred_t *credp __unused)
595 ipnet_t *ipnet = rq->q_ptr;
596 ipnet_stack_t *ips = ipnet->ipnet_ns->netstack_ipnet;
598 if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS)
599 ipnet_leave_allmulti(ipnet->ipnet_if, ips);
600 if (ipnet->ipnet_flags & IPNET_PROMISC_MULTI)
601 ipnet_leave_allmulti(ipnet->ipnet_if, ips);
603 mutex_enter(&ips->ips_walkers_lock);
604 while (ips->ips_walkers_cnt != 0)
605 cv_wait(&ips->ips_walkers_cv, &ips->ips_walkers_lock);
607 qprocsoff(rq);
609 list_remove(&ips->ips_str_list, ipnet);
610 if (ipnet->ipnet_if != NULL)
611 ipnetif_refrele(ipnet->ipnet_if);
612 id_free(ipnet_minor_space, ipnet->ipnet_minor);
614 if (list_is_empty(&ips->ips_str_list)) {
615 ipobs_unregister_hook(ips->ips_netstack, ips->ips_hook);
616 ips->ips_hook = NULL;
619 kmem_free(ipnet, sizeof (*ipnet));
621 mutex_exit(&ips->ips_walkers_lock);
622 netstack_rele(ips->ips_netstack);
623 return (0);
626 static int
627 ipnet_wput(queue_t *q, mblk_t *mp)
629 switch (mp->b_datap->db_type) {
630 case M_FLUSH:
631 if (*mp->b_rptr & FLUSHW) {
632 flushq(q, FLUSHDATA);
633 *mp->b_rptr &= ~FLUSHW;
635 if (*mp->b_rptr & FLUSHR)
636 qreply(q, mp);
637 else
638 freemsg(mp);
639 break;
640 case M_PROTO:
641 case M_PCPROTO:
642 ipnet_wputnondata(q, mp);
643 break;
644 case M_IOCTL:
645 ipnet_ioctl(q, mp);
646 break;
647 case M_IOCDATA:
648 ipnet_iocdata(q, mp);
649 break;
650 default:
651 freemsg(mp);
652 break;
654 return (0);
657 static int
658 ipnet_rsrv(queue_t *q)
660 mblk_t *mp;
662 while ((mp = getq(q)) != NULL) {
663 ASSERT(DB_TYPE(mp) == M_DATA);
664 if (canputnext(q)) {
665 putnext(q, mp);
666 } else {
667 (void) putbq(q, mp);
668 break;
671 return (0);
674 static void
675 ipnet_ioctl(queue_t *q, mblk_t *mp)
677 struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
679 switch (iocp->ioc_cmd) {
680 case DLIOCRAW:
681 miocack(q, mp, 0, 0);
682 break;
683 case DLIOCIPNETINFO:
684 if (iocp->ioc_count == TRANSPARENT) {
685 mcopyin(mp, NULL, sizeof (uint_t), NULL);
686 qreply(q, mp);
687 break;
689 /* We don't support I_STR with DLIOCIPNETINFO. */
690 /* FALLTHROUGH */
691 default:
692 miocnak(q, mp, 0, EINVAL);
693 break;
697 static void
698 ipnet_iocdata(queue_t *q, mblk_t *mp)
700 struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
701 ipnet_t *ipnet = q->q_ptr;
703 switch (iocp->ioc_cmd) {
704 case DLIOCIPNETINFO:
705 if (*(int *)mp->b_cont->b_rptr == 1)
706 ipnet->ipnet_flags |= IPNET_INFO;
707 else if (*(int *)mp->b_cont->b_rptr == 0)
708 ipnet->ipnet_flags &= ~IPNET_INFO;
709 else
710 goto iocnak;
711 miocack(q, mp, 0, DL_IPNETINFO_VERSION);
712 break;
713 default:
714 iocnak:
715 miocnak(q, mp, 0, EINVAL);
716 break;
720 static void
721 ipnet_wputnondata(queue_t *q, mblk_t *mp)
723 union DL_primitives *dlp = (union DL_primitives *)mp->b_rptr;
724 t_uscalar_t prim = dlp->dl_primitive;
726 switch (prim) {
727 case DL_INFO_REQ:
728 ipnet_inforeq(q, mp);
729 break;
730 case DL_UNBIND_REQ:
731 ipnet_unbindreq(q, mp);
732 break;
733 case DL_BIND_REQ:
734 ipnet_bindreq(q, mp);
735 break;
736 case DL_PROMISCON_REQ:
737 ipnet_dlpromisconreq(q, mp);
738 break;
739 case DL_PROMISCOFF_REQ:
740 ipnet_dlpromiscoffreq(q, mp);
741 break;
742 case DL_UNITDATA_REQ:
743 case DL_DETACH_REQ:
744 case DL_PHYS_ADDR_REQ:
745 case DL_SET_PHYS_ADDR_REQ:
746 case DL_ENABMULTI_REQ:
747 case DL_DISABMULTI_REQ:
748 case DL_ATTACH_REQ:
749 dlerrorack(q, mp, prim, DL_UNSUPPORTED, 0);
750 break;
751 default:
752 dlerrorack(q, mp, prim, DL_BADPRIM, 0);
753 break;
757 static void
758 ipnet_inforeq(queue_t *q, mblk_t *mp)
760 dl_info_ack_t *dlip;
761 size_t size = sizeof (dl_info_ack_t) + sizeof (ushort_t);
763 if (MBLKL(mp) < DL_INFO_REQ_SIZE) {
764 dlerrorack(q, mp, DL_INFO_REQ, DL_BADPRIM, 0);
765 return;
768 if ((mp = mexchange(q, mp, size, M_PCPROTO, DL_INFO_ACK)) == NULL)
769 return;
771 dlip = (dl_info_ack_t *)mp->b_rptr;
772 *dlip = ipnet_infoack;
773 qreply(q, mp);
776 static void
777 ipnet_bindreq(queue_t *q, mblk_t *mp)
779 union DL_primitives *dlp = (union DL_primitives *)mp->b_rptr;
780 ipnet_t *ipnet = q->q_ptr;
782 if (MBLKL(mp) < DL_BIND_REQ_SIZE) {
783 dlerrorack(q, mp, DL_BIND_REQ, DL_BADPRIM, 0);
784 return;
787 switch (dlp->bind_req.dl_sap) {
788 case 0 :
789 ipnet->ipnet_family = AF_UNSPEC;
790 break;
791 case IPV4_VERSION :
792 ipnet->ipnet_family = AF_INET;
793 break;
794 case IPV6_VERSION :
795 ipnet->ipnet_family = AF_INET6;
796 break;
797 default :
798 dlerrorack(q, mp, DL_BIND_REQ, DL_BADSAP, 0);
799 return;
800 /*NOTREACHED*/
803 ipnet->ipnet_dlstate = DL_IDLE;
804 dlbindack(q, mp, dlp->bind_req.dl_sap, 0, 0, 0, 0);
807 static void
808 ipnet_unbindreq(queue_t *q, mblk_t *mp)
810 ipnet_t *ipnet = q->q_ptr;
812 if (MBLKL(mp) < DL_UNBIND_REQ_SIZE) {
813 dlerrorack(q, mp, DL_UNBIND_REQ, DL_BADPRIM, 0);
814 return;
817 if (ipnet->ipnet_dlstate != DL_IDLE) {
818 dlerrorack(q, mp, DL_UNBIND_REQ, DL_OUTSTATE, 0);
819 } else {
820 ipnet->ipnet_dlstate = DL_UNBOUND;
821 ipnet->ipnet_family = AF_UNSPEC;
822 dlokack(q, mp, DL_UNBIND_REQ);
826 static void
827 ipnet_dlpromisconreq(queue_t *q, mblk_t *mp)
829 ipnet_t *ipnet = q->q_ptr;
830 t_uscalar_t level;
831 int err;
833 if (MBLKL(mp) < DL_PROMISCON_REQ_SIZE) {
834 dlerrorack(q, mp, DL_PROMISCON_REQ, DL_BADPRIM, 0);
835 return;
838 if (ipnet->ipnet_flags & IPNET_LOMODE) {
839 dlokack(q, mp, DL_PROMISCON_REQ);
840 return;
843 level = ((dl_promiscon_req_t *)mp->b_rptr)->dl_level;
844 if (level == DL_PROMISC_PHYS || level == DL_PROMISC_MULTI) {
845 if ((err = ipnet_join_allmulti(ipnet->ipnet_if,
846 ipnet->ipnet_ns->netstack_ipnet)) != 0) {
847 dlerrorack(q, mp, DL_PROMISCON_REQ, DL_SYSERR, err);
848 return;
852 switch (level) {
853 case DL_PROMISC_PHYS:
854 ipnet->ipnet_flags |= IPNET_PROMISC_PHYS;
855 break;
856 case DL_PROMISC_SAP:
857 ipnet->ipnet_flags |= IPNET_PROMISC_SAP;
858 break;
859 case DL_PROMISC_MULTI:
860 ipnet->ipnet_flags |= IPNET_PROMISC_MULTI;
861 break;
862 default:
863 dlerrorack(q, mp, DL_PROMISCON_REQ, DL_BADPRIM, 0);
864 return;
867 dlokack(q, mp, DL_PROMISCON_REQ);
870 static void
871 ipnet_dlpromiscoffreq(queue_t *q, mblk_t *mp)
873 ipnet_t *ipnet = q->q_ptr;
874 t_uscalar_t level;
875 uint16_t orig_ipnet_flags = ipnet->ipnet_flags;
877 if (MBLKL(mp) < DL_PROMISCOFF_REQ_SIZE) {
878 dlerrorack(q, mp, DL_PROMISCOFF_REQ, DL_BADPRIM, 0);
879 return;
882 if (ipnet->ipnet_flags & IPNET_LOMODE) {
883 dlokack(q, mp, DL_PROMISCOFF_REQ);
884 return;
887 level = ((dl_promiscon_req_t *)mp->b_rptr)->dl_level;
888 switch (level) {
889 case DL_PROMISC_PHYS:
890 if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS)
891 ipnet->ipnet_flags &= ~IPNET_PROMISC_PHYS;
892 break;
893 case DL_PROMISC_SAP:
894 if (ipnet->ipnet_flags & IPNET_PROMISC_SAP)
895 ipnet->ipnet_flags &= ~IPNET_PROMISC_SAP;
896 break;
897 case DL_PROMISC_MULTI:
898 if (ipnet->ipnet_flags & IPNET_PROMISC_MULTI)
899 ipnet->ipnet_flags &= ~IPNET_PROMISC_MULTI;
900 break;
901 default:
902 dlerrorack(q, mp, DL_PROMISCOFF_REQ, DL_BADPRIM, 0);
903 return;
906 if (orig_ipnet_flags == ipnet->ipnet_flags) {
907 dlerrorack(q, mp, DL_PROMISCOFF_REQ, DL_NOTENAB, 0);
908 return;
911 if (level == DL_PROMISC_PHYS || level == DL_PROMISC_MULTI) {
912 ipnet_leave_allmulti(ipnet->ipnet_if,
913 ipnet->ipnet_ns->netstack_ipnet);
916 dlokack(q, mp, DL_PROMISCOFF_REQ);
919 static int
920 ipnet_join_allmulti(ipnetif_t *ipnetif, ipnet_stack_t *ips)
922 int err = 0;
923 ip_stack_t *ipst = ips->ips_netstack->netstack_ip;
924 uint64_t index = ipnetif->if_index;
926 mutex_enter(&ips->ips_event_lock);
927 if (ipnetif->if_multicnt == 0) {
928 ASSERT((ipnetif->if_flags &
929 (IPNETIF_IPV4ALLMULTI | IPNETIF_IPV6ALLMULTI)) == 0);
930 if (ipnetif->if_flags & IPNETIF_IPV4PLUMBED) {
931 err = ip_join_allmulti(index, B_FALSE, ipst);
932 if (err != 0)
933 goto done;
934 ipnetif->if_flags |= IPNETIF_IPV4ALLMULTI;
936 if (ipnetif->if_flags & IPNETIF_IPV6PLUMBED) {
937 err = ip_join_allmulti(index, B_TRUE, ipst);
938 if (err != 0 &&
939 (ipnetif->if_flags & IPNETIF_IPV4ALLMULTI)) {
940 (void) ip_leave_allmulti(index, B_FALSE, ipst);
941 ipnetif->if_flags &= ~IPNETIF_IPV4ALLMULTI;
942 goto done;
944 ipnetif->if_flags |= IPNETIF_IPV6ALLMULTI;
947 ipnetif->if_multicnt++;
949 done:
950 mutex_exit(&ips->ips_event_lock);
951 return (err);
954 static void
955 ipnet_leave_allmulti(ipnetif_t *ipnetif, ipnet_stack_t *ips)
957 int err;
958 ip_stack_t *ipst = ips->ips_netstack->netstack_ip;
959 uint64_t index = ipnetif->if_index;
961 mutex_enter(&ips->ips_event_lock);
962 ASSERT(ipnetif->if_multicnt != 0);
963 if (--ipnetif->if_multicnt == 0) {
964 if (ipnetif->if_flags & IPNETIF_IPV4ALLMULTI) {
965 err = ip_leave_allmulti(index, B_FALSE, ipst);
966 ASSERT(err == 0 || err == ENODEV);
967 ipnetif->if_flags &= ~IPNETIF_IPV4ALLMULTI;
969 if (ipnetif->if_flags & IPNETIF_IPV6ALLMULTI) {
970 err = ip_leave_allmulti(index, B_TRUE, ipst);
971 ASSERT(err == 0 || err == ENODEV);
972 ipnetif->if_flags &= ~IPNETIF_IPV6ALLMULTI;
975 mutex_exit(&ips->ips_event_lock);
979 * Allocate a new mblk_t and put a dl_ipnetinfo_t in it.
980 * The structure it copies the header information from,
981 * hook_pkt_observe_t, is constructed using network byte
982 * order in ipobs_hook(), so there is no conversion here.
984 static mblk_t *
985 ipnet_addheader(hook_pkt_observe_t *hdr, mblk_t *mp)
987 mblk_t *dlhdr;
988 dl_ipnetinfo_t *dl;
990 if ((dlhdr = allocb(sizeof (dl_ipnetinfo_t), BPRI_HI)) == NULL) {
991 freemsg(mp);
992 return (NULL);
994 dl = (dl_ipnetinfo_t *)dlhdr->b_rptr;
995 dl->dli_version = DL_IPNETINFO_VERSION;
996 dl->dli_family = hdr->hpo_family;
997 dl->dli_htype = hdr->hpo_htype;
998 dl->dli_pktlen = hdr->hpo_pktlen;
999 dl->dli_ifindex = hdr->hpo_ifindex;
1000 dl->dli_grifindex = hdr->hpo_grifindex;
1001 dl->dli_zsrc = hdr->hpo_zsrc;
1002 dl->dli_zdst = hdr->hpo_zdst;
1003 dlhdr->b_wptr += sizeof (*dl);
1004 dlhdr->b_cont = mp;
1006 return (dlhdr);
1009 static ipnet_addrtype_t
1010 ipnet_get_addrtype(ipnet_t *ipnet, ipnet_addrp_t *addr)
1012 list_t *list;
1013 ipnetif_t *ipnetif = ipnet->ipnet_if;
1014 ipnetif_addr_t *ifaddr;
1015 ipnet_addrtype_t addrtype = IPNETADDR_UNKNOWN;
1017 /* First check if the address is multicast or limited broadcast. */
1018 switch (addr->iap_family) {
1019 case AF_INET:
1020 if (CLASSD(*(addr->iap_addr4)) ||
1021 *(addr->iap_addr4) == INADDR_BROADCAST)
1022 return (IPNETADDR_MBCAST);
1023 break;
1024 case AF_INET6:
1025 if (IN6_IS_ADDR_MULTICAST(addr->iap_addr6))
1026 return (IPNETADDR_MBCAST);
1027 break;
1031 * Walk the address list to see if the address belongs to our
1032 * interface or is one of our subnet broadcast addresses.
1034 mutex_enter(&ipnetif->if_addr_lock);
1035 list = (addr->iap_family == AF_INET) ?
1036 &ipnetif->if_ip4addr_list : &ipnetif->if_ip6addr_list;
1037 for (ifaddr = list_head(list);
1038 ifaddr != NULL && addrtype == IPNETADDR_UNKNOWN;
1039 ifaddr = list_next(list, ifaddr)) {
1041 * If we're not in the global zone, then only look at
1042 * addresses in our zone.
1044 if (ipnet->ipnet_zoneid != GLOBAL_ZONEID &&
1045 ipnet->ipnet_zoneid != ifaddr->ifa_zone)
1046 continue;
1047 switch (addr->iap_family) {
1048 case AF_INET:
1049 if (ifaddr->ifa_ip4addr != INADDR_ANY &&
1050 *(addr->iap_addr4) == ifaddr->ifa_ip4addr)
1051 addrtype = IPNETADDR_MYADDR;
1052 else if (ifaddr->ifa_brdaddr != INADDR_ANY &&
1053 *(addr->iap_addr4) == ifaddr->ifa_brdaddr)
1054 addrtype = IPNETADDR_MBCAST;
1055 break;
1056 case AF_INET6:
1057 if (IN6_ARE_ADDR_EQUAL(addr->iap_addr6,
1058 &ifaddr->ifa_ip6addr))
1059 addrtype = IPNETADDR_MYADDR;
1060 break;
1063 mutex_exit(&ipnetif->if_addr_lock);
1065 return (addrtype);
1069 * Verify if the packet contained in hdr should be passed up to the
1070 * ipnet client stream.
1072 static boolean_t
1073 ipnet_accept(ipnet_t *ipnet, hook_pkt_observe_t *hdr, ipnet_addrp_t *src,
1074 ipnet_addrp_t *dst)
1076 boolean_t obsif;
1077 uint64_t ifindex = ipnet->ipnet_if->if_index;
1078 ipnet_addrtype_t srctype;
1079 ipnet_addrtype_t dsttype;
1081 srctype = ipnet_get_addrtype(ipnet, src);
1082 dsttype = ipnet_get_addrtype(ipnet, dst);
1085 * If the packet's ifindex matches ours, or the packet's group ifindex
1086 * matches ours, it's on the interface we're observing. (Thus,
1087 * observing on the group ifindex matches all ifindexes in the group.)
1089 obsif = (ntohl(hdr->hpo_ifindex) == ifindex ||
1090 ntohl(hdr->hpo_grifindex) == ifindex);
1092 DTRACE_PROBE5(ipnet_accept__addr,
1093 ipnet_addrtype_t, srctype, ipnet_addrp_t *, src,
1094 ipnet_addrtype_t, dsttype, ipnet_addrp_t *, dst,
1095 boolean_t, obsif);
1098 * Do not allow an ipnet stream to see packets that are not from or to
1099 * its zone. The exception is when zones are using the shared stack
1100 * model. In this case, streams in the global zone have visibility
1101 * into other shared-stack zones, and broadcast and multicast traffic
1102 * is visible by all zones in the stack.
1104 if (ipnet->ipnet_zoneid != GLOBAL_ZONEID &&
1105 dsttype != IPNETADDR_MBCAST) {
1106 if (ipnet->ipnet_zoneid != ntohl(hdr->hpo_zsrc) &&
1107 ipnet->ipnet_zoneid != ntohl(hdr->hpo_zdst))
1108 return (B_FALSE);
1112 * If DL_PROMISC_SAP isn't enabled, then the bound SAP must match the
1113 * packet's IP version.
1115 if (!(ipnet->ipnet_flags & IPNET_PROMISC_SAP) &&
1116 ipnet->ipnet_family != hdr->hpo_family)
1117 return (B_FALSE);
1119 /* If the destination address is ours, then accept the packet. */
1120 if (dsttype == IPNETADDR_MYADDR)
1121 return (B_TRUE);
1124 * If DL_PROMISC_PHYS is enabled, then we can see all packets that are
1125 * sent or received on the interface we're observing, or packets that
1126 * have our source address (this allows us to see packets we send).
1128 if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS) {
1129 if (srctype == IPNETADDR_MYADDR || obsif)
1130 return (B_TRUE);
1134 * We accept multicast and broadcast packets transmitted or received
1135 * on the interface we're observing.
1137 if (dsttype == IPNETADDR_MBCAST && obsif)
1138 return (B_TRUE);
1140 return (B_FALSE);
1144 * Verify if the packet contained in hdr should be passed up to the ipnet
1145 * client stream that's in IPNET_LOMODE.
1147 /* ARGSUSED */
1148 static boolean_t
1149 ipnet_loaccept(ipnet_t *ipnet, hook_pkt_observe_t *hdr, ipnet_addrp_t *src,
1150 ipnet_addrp_t *dst)
1152 if (hdr->hpo_htype != htons(IPOBS_HOOK_LOCAL)) {
1154 * ipnet_if is only NULL for IPNET_MINOR_LO devices.
1156 if (ipnet->ipnet_if == NULL)
1157 return (B_FALSE);
1161 * An ipnet stream must not see packets that are not from/to its zone.
1163 if (ipnet->ipnet_zoneid != GLOBAL_ZONEID) {
1164 if (ipnet->ipnet_zoneid != ntohl(hdr->hpo_zsrc) &&
1165 ipnet->ipnet_zoneid != ntohl(hdr->hpo_zdst))
1166 return (B_FALSE);
1169 return (ipnet->ipnet_family == AF_UNSPEC ||
1170 ipnet->ipnet_family == hdr->hpo_family);
1173 static void
1174 ipnet_dispatch(void *arg)
1176 mblk_t *mp = arg;
1177 hook_pkt_observe_t *hdr = (hook_pkt_observe_t *)mp->b_rptr;
1178 ipnet_t *ipnet;
1179 mblk_t *netmp;
1180 list_t *list;
1181 ipnet_stack_t *ips;
1182 ipnet_addrp_t src;
1183 ipnet_addrp_t dst;
1185 ips = ((netstack_t *)hdr->hpo_ctx)->netstack_ipnet;
1187 netmp = hdr->hpo_pkt->b_cont;
1188 src.iap_family = hdr->hpo_family;
1189 dst.iap_family = hdr->hpo_family;
1191 if (hdr->hpo_family == AF_INET) {
1192 src.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_src;
1193 dst.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_dst;
1194 } else {
1195 src.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_src;
1196 dst.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_dst;
1199 ipnet_walkers_inc(ips);
1201 list = &ips->ips_str_list;
1202 for (ipnet = list_head(list); ipnet != NULL;
1203 ipnet = list_next(list, ipnet)) {
1204 if (!(*ipnet->ipnet_acceptfn)(ipnet, hdr, &src, &dst)) {
1205 IPSK_BUMP(ips, ik_acceptFail);
1206 continue;
1208 IPSK_BUMP(ips, ik_acceptOk);
1210 if (list_next(list, ipnet) == NULL) {
1211 netmp = hdr->hpo_pkt->b_cont;
1212 hdr->hpo_pkt->b_cont = NULL;
1213 } else {
1214 if ((netmp = dupmsg(hdr->hpo_pkt->b_cont)) == NULL &&
1215 (netmp = copymsg(hdr->hpo_pkt->b_cont)) == NULL) {
1216 IPSK_BUMP(ips, ik_duplicationFail);
1217 continue;
1221 if (ipnet->ipnet_flags & IPNET_INFO) {
1222 if ((netmp = ipnet_addheader(hdr, netmp)) == NULL) {
1223 IPSK_BUMP(ips, ik_dispatchHeaderDrop);
1224 continue;
1228 if (ipnet->ipnet_rq->q_first == NULL &&
1229 canputnext(ipnet->ipnet_rq)) {
1230 putnext(ipnet->ipnet_rq, netmp);
1231 IPSK_BUMP(ips, ik_dispatchDeliver);
1232 } else if (canput(ipnet->ipnet_rq)) {
1233 (void) putq(ipnet->ipnet_rq, netmp);
1234 IPSK_BUMP(ips, ik_dispatchDeliver);
1235 } else {
1236 freemsg(netmp);
1237 IPSK_BUMP(ips, ik_dispatchPutDrop);
1241 ipnet_walkers_dec(ips);
1243 freemsg(mp);
1246 static void
1247 ipnet_input(mblk_t *mp)
1249 hook_pkt_observe_t *hdr = (hook_pkt_observe_t *)mp->b_rptr;
1250 ipnet_stack_t *ips;
1252 ips = ((netstack_t *)hdr->hpo_ctx)->netstack_ipnet;
1254 if (ddi_taskq_dispatch(ipnet_taskq, ipnet_dispatch, mp, DDI_NOSLEEP) !=
1255 DDI_SUCCESS) {
1256 IPSK_BUMP(ips, ik_dispatchFail);
1257 freemsg(mp);
1258 } else {
1259 IPSK_BUMP(ips, ik_dispatchOk);
1263 static ipnetif_t *
1264 ipnet_alloc_if(ipnet_stack_t *ips)
1266 ipnetif_t *ipnetif;
1268 if ((ipnetif = kmem_zalloc(sizeof (*ipnetif), KM_NOSLEEP)) == NULL)
1269 return (NULL);
1271 mutex_init(&ipnetif->if_addr_lock, NULL, MUTEX_DEFAULT, 0);
1272 list_create(&ipnetif->if_ip4addr_list, sizeof (ipnetif_addr_t),
1273 offsetof(ipnetif_addr_t, ifa_link));
1274 list_create(&ipnetif->if_ip6addr_list, sizeof (ipnetif_addr_t),
1275 offsetof(ipnetif_addr_t, ifa_link));
1276 mutex_init(&ipnetif->if_reflock, NULL, MUTEX_DEFAULT, 0);
1278 ipnetif->if_stackp = ips;
1280 return (ipnetif);
1284 * Create a new ipnetif_t and new minor node for it. If creation is
1285 * successful the new ipnetif_t is inserted into an avl_tree
1286 * containing ipnetif's for this stack instance.
1288 static ipnetif_t *
1289 ipnetif_create(const char *name, uint64_t index, ipnet_stack_t *ips,
1290 uint64_t ifflags)
1292 ipnetif_t *ipnetif;
1293 avl_index_t where = 0;
1294 minor_t ifminor;
1297 * Because ipnetif_create() can be called from a NIC event
1298 * callback, it should not block.
1300 ifminor = (minor_t)id_alloc_nosleep(ipnet_minor_space);
1301 if (ifminor == (minor_t)-1)
1302 return (NULL);
1303 if ((ipnetif = ipnet_alloc_if(ips)) == NULL) {
1304 id_free(ipnet_minor_space, ifminor);
1305 return (NULL);
1308 (void) strlcpy(ipnetif->if_name, name, LIFNAMSIZ);
1309 ipnetif->if_index = (uint_t)index;
1310 ipnetif->if_zoneid = netstack_get_zoneid(ips->ips_netstack);
1311 ipnetif->if_dev = makedevice(ipnet_major, ifminor);
1313 ipnetif->if_refcnt = 1;
1314 if ((ifflags & IFF_LOOPBACK) != 0)
1315 ipnetif->if_flags = IPNETIF_LOOPBACK;
1317 mutex_enter(&ips->ips_avl_lock);
1318 VERIFY(avl_find(&ips->ips_avl_by_index, &index, &where) == NULL);
1319 avl_insert(&ips->ips_avl_by_index, ipnetif, where);
1320 VERIFY(avl_find(&ips->ips_avl_by_name, (void *)name, &where) == NULL);
1321 avl_insert(&ips->ips_avl_by_name, ipnetif, where);
1322 mutex_exit(&ips->ips_avl_lock);
1324 return (ipnetif);
1327 static void
1328 ipnetif_remove(ipnetif_t *ipnetif, ipnet_stack_t *ips)
1330 ipnet_t *ipnet;
1332 ipnet_walkers_inc(ips);
1333 /* Send a SIGHUP to all open streams associated with this ipnetif. */
1334 for (ipnet = list_head(&ips->ips_str_list); ipnet != NULL;
1335 ipnet = list_next(&ips->ips_str_list, ipnet)) {
1336 if (ipnet->ipnet_if == ipnetif)
1337 (void) putnextctl(ipnet->ipnet_rq, M_HANGUP);
1339 ipnet_walkers_dec(ips);
1340 mutex_enter(&ips->ips_avl_lock);
1341 avl_remove(&ips->ips_avl_by_index, ipnetif);
1342 avl_remove(&ips->ips_avl_by_name, ipnetif);
1343 mutex_exit(&ips->ips_avl_lock);
1345 * Release the reference we implicitly held in ipnetif_create().
1347 ipnetif_refrele(ipnetif);
1350 static void
1351 ipnet_purge_addrlist(list_t *addrlist)
1353 ipnetif_addr_t *ifa;
1355 while ((ifa = list_head(addrlist)) != NULL) {
1356 list_remove(addrlist, ifa);
1357 if (ifa->ifa_shared != NULL)
1358 ipnetif_clone_release(ifa->ifa_shared);
1359 kmem_free(ifa, sizeof (*ifa));
1363 static void
1364 ipnetif_free(ipnetif_t *ipnetif)
1366 ASSERT(ipnetif->if_refcnt == 0);
1367 ASSERT(ipnetif->if_sharecnt == 0);
1369 /* Remove IPv4/v6 address lists from the ipnetif */
1370 ipnet_purge_addrlist(&ipnetif->if_ip4addr_list);
1371 list_destroy(&ipnetif->if_ip4addr_list);
1372 ipnet_purge_addrlist(&ipnetif->if_ip6addr_list);
1373 list_destroy(&ipnetif->if_ip6addr_list);
1374 mutex_destroy(&ipnetif->if_addr_lock);
1375 mutex_destroy(&ipnetif->if_reflock);
1376 if (ipnetif->if_dev != 0)
1377 id_free(ipnet_minor_space, getminor(ipnetif->if_dev));
1378 kmem_free(ipnetif, sizeof (*ipnetif));
1382 * Create an ipnetif_addr_t with the given logical interface id (lif)
1383 * and add it to the supplied ipnetif. The lif is the netinfo
1384 * representation of logical interface id, and we use this id to match
1385 * incoming netinfo events against our lists of addresses.
1387 static void
1388 ipnet_add_ifaddr(uint64_t lif, ipnetif_t *ipnetif, net_handle_t nd)
1390 ipnetif_addr_t *ifaddr;
1391 zoneid_t zoneid;
1392 struct sockaddr_in bcast;
1393 struct sockaddr_storage addr;
1394 net_ifaddr_t type = NA_ADDRESS;
1395 uint64_t phyif = ipnetif->if_index;
1397 if (net_getlifaddr(nd, phyif, lif, 1, &type, &addr) != 0 ||
1398 net_getlifzone(nd, phyif, lif, &zoneid) != 0)
1399 return;
1401 if ((ifaddr = kmem_alloc(sizeof (*ifaddr), KM_NOSLEEP)) == NULL)
1402 return;
1403 ifaddr->ifa_zone = zoneid;
1404 ifaddr->ifa_id = lif;
1405 ifaddr->ifa_shared = NULL;
1407 switch (addr.ss_family) {
1408 case AF_INET:
1409 ifaddr->ifa_ip4addr =
1410 ((struct sockaddr_in *)&addr)->sin_addr.s_addr;
1412 * Try and get the broadcast address. Note that it's okay for
1413 * an interface to not have a broadcast address, so we don't
1414 * fail the entire operation if net_getlifaddr() fails here.
1416 type = NA_BROADCAST;
1417 if (net_getlifaddr(nd, phyif, lif, 1, &type, &bcast) == 0)
1418 ifaddr->ifa_brdaddr = bcast.sin_addr.s_addr;
1419 break;
1420 case AF_INET6:
1421 ifaddr->ifa_ip6addr = ((struct sockaddr_in6 *)&addr)->sin6_addr;
1422 break;
1426 * The zoneid stored in ipnetif_t needs to correspond to the actual
1427 * zone the address is being used in. This facilitates finding the
1428 * correct netstack_t pointer, amongst other things, later.
1430 if (zoneid == ALL_ZONES)
1431 zoneid = GLOBAL_ZONEID;
1433 mutex_enter(&ipnetif->if_addr_lock);
1434 if (zoneid != ipnetif->if_zoneid) {
1435 ipnetif_t *ifp2;
1437 ifp2 = ipnetif_clone_create(ipnetif, zoneid);
1438 ifaddr->ifa_shared = ifp2;
1440 list_insert_tail(addr.ss_family == AF_INET ?
1441 &ipnetif->if_ip4addr_list : &ipnetif->if_ip6addr_list, ifaddr);
1442 mutex_exit(&ipnetif->if_addr_lock);
1445 static void
1446 ipnet_delete_ifaddr(ipnetif_addr_t *ifaddr, ipnetif_t *ipnetif, boolean_t isv6)
1448 mutex_enter(&ipnetif->if_addr_lock);
1449 if (ifaddr->ifa_shared != NULL)
1450 ipnetif_clone_release(ifaddr->ifa_shared);
1452 list_remove(isv6 ?
1453 &ipnetif->if_ip6addr_list : &ipnetif->if_ip4addr_list, ifaddr);
1454 mutex_exit(&ipnetif->if_addr_lock);
1455 kmem_free(ifaddr, sizeof (*ifaddr));
1458 static void
1459 ipnet_plumb_ev(ipnet_nicevent_t *ipne, ipnet_stack_t *ips, boolean_t isv6)
1461 ipnetif_t *ipnetif;
1462 boolean_t refrele_needed = B_TRUE;
1463 uint64_t ifflags;
1464 uint64_t ifindex;
1465 char *ifname;
1467 ifflags = 0;
1468 ifname = ipne->ipne_ifname;
1469 ifindex = ipne->ipne_ifindex;
1471 (void) net_getlifflags(ipne->ipne_protocol, ifindex, 0, &ifflags);
1473 if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL) {
1474 ipnetif = ipnetif_create(ifname, ifindex, ips, ifflags);
1475 refrele_needed = B_FALSE;
1477 if (ipnetif != NULL) {
1478 ipnetif->if_flags |=
1479 isv6 ? IPNETIF_IPV6PLUMBED : IPNETIF_IPV4PLUMBED;
1482 if (ipnetif->if_multicnt != 0) {
1483 if (ip_join_allmulti(ifindex, isv6,
1484 ips->ips_netstack->netstack_ip) == 0) {
1485 ipnetif->if_flags |=
1486 isv6 ? IPNETIF_IPV6ALLMULTI : IPNETIF_IPV4ALLMULTI;
1490 if (refrele_needed)
1491 ipnetif_refrele(ipnetif);
1494 static void
1495 ipnet_unplumb_ev(uint64_t ifindex, ipnet_stack_t *ips, boolean_t isv6)
1497 ipnetif_t *ipnetif;
1499 if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL)
1500 return;
1502 mutex_enter(&ipnetif->if_addr_lock);
1503 ipnet_purge_addrlist(isv6 ?
1504 &ipnetif->if_ip6addr_list : &ipnetif->if_ip4addr_list);
1505 mutex_exit(&ipnetif->if_addr_lock);
1508 * Note that we have one ipnetif for both IPv4 and IPv6, but we receive
1509 * separate NE_UNPLUMB events for IPv4 and IPv6. We remove the ipnetif
1510 * if both IPv4 and IPv6 interfaces have been unplumbed.
1512 ipnetif->if_flags &= isv6 ? ~IPNETIF_IPV6PLUMBED : ~IPNETIF_IPV4PLUMBED;
1513 if (!(ipnetif->if_flags & (IPNETIF_IPV4PLUMBED | IPNETIF_IPV6PLUMBED)))
1514 ipnetif_remove(ipnetif, ips);
1515 ipnetif_refrele(ipnetif);
1518 static void
1519 ipnet_lifup_ev(uint64_t ifindex, uint64_t lifindex, net_handle_t nd,
1520 ipnet_stack_t *ips, boolean_t isv6)
1522 ipnetif_t *ipnetif;
1523 ipnetif_addr_t *ifaddr;
1525 if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL)
1526 return;
1527 if ((ifaddr = ipnet_match_lif(ipnetif, lifindex, isv6)) != NULL) {
1529 * We must have missed a NE_LIF_DOWN event. Delete this
1530 * ifaddr and re-create it.
1532 ipnet_delete_ifaddr(ifaddr, ipnetif, isv6);
1535 ipnet_add_ifaddr(lifindex, ipnetif, nd);
1536 ipnetif_refrele(ipnetif);
1539 static void
1540 ipnet_lifdown_ev(uint64_t ifindex, uint64_t lifindex, ipnet_stack_t *ips,
1541 boolean_t isv6)
1543 ipnetif_t *ipnetif;
1544 ipnetif_addr_t *ifaddr;
1546 if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL)
1547 return;
1548 if ((ifaddr = ipnet_match_lif(ipnetif, lifindex, isv6)) != NULL)
1549 ipnet_delete_ifaddr(ifaddr, ipnetif, isv6);
1550 ipnetif_refrele(ipnetif);
1552 * Make sure that open streams on this ipnetif are still allowed to
1553 * have it open.
1555 ipnetif_zonecheck(ipnetif, ips);
1559 * This callback from the NIC event framework dispatches a taskq as the event
1560 * handlers may block.
1562 /* ARGSUSED */
1563 static int
1564 ipnet_nicevent_cb(hook_event_token_t token, hook_data_t info, void *arg)
1566 ipnet_stack_t *ips = arg;
1567 hook_nic_event_t *hn = (hook_nic_event_t *)info;
1568 ipnet_nicevent_t *ipne;
1570 if ((ipne = kmem_alloc(sizeof (ipnet_nicevent_t), KM_NOSLEEP)) == NULL)
1571 return (0);
1572 ipne->ipne_event = hn->hne_event;
1573 ipne->ipne_protocol = hn->hne_protocol;
1574 ipne->ipne_stackid = ips->ips_netstack->netstack_stackid;
1575 ipne->ipne_ifindex = hn->hne_nic;
1576 ipne->ipne_lifindex = hn->hne_lif;
1577 if (hn->hne_datalen != 0) {
1578 (void) strlcpy(ipne->ipne_ifname, hn->hne_data,
1579 sizeof (ipne->ipne_ifname));
1581 (void) ddi_taskq_dispatch(ipnet_nicevent_taskq, ipnet_nicevent_task,
1582 ipne, DDI_NOSLEEP);
1583 return (0);
1586 static void
1587 ipnet_nicevent_task(void *arg)
1589 ipnet_nicevent_t *ipne = arg;
1590 netstack_t *ns;
1591 ipnet_stack_t *ips;
1592 boolean_t isv6;
1594 if ((ns = netstack_find_by_stackid(ipne->ipne_stackid)) == NULL)
1595 goto done;
1596 ips = ns->netstack_ipnet;
1597 isv6 = (ipne->ipne_protocol == ips->ips_ndv6);
1599 mutex_enter(&ips->ips_event_lock);
1600 switch (ipne->ipne_event) {
1601 case NE_PLUMB:
1602 ipnet_plumb_ev(ipne, ips, isv6);
1603 break;
1604 case NE_UNPLUMB:
1605 ipnet_unplumb_ev(ipne->ipne_ifindex, ips, isv6);
1606 break;
1607 case NE_LIF_UP:
1608 ipnet_lifup_ev(ipne->ipne_ifindex, ipne->ipne_lifindex,
1609 ipne->ipne_protocol, ips, isv6);
1610 break;
1611 case NE_LIF_DOWN:
1612 ipnet_lifdown_ev(ipne->ipne_ifindex, ipne->ipne_lifindex, ips,
1613 isv6);
1614 break;
1615 default:
1616 break;
1618 mutex_exit(&ips->ips_event_lock);
1619 done:
1620 if (ns != NULL)
1621 netstack_rele(ns);
1622 kmem_free(ipne, sizeof (ipnet_nicevent_t));
1625 dev_t
1626 ipnet_if_getdev(char *name, zoneid_t zoneid)
1628 netstack_t *ns;
1629 ipnet_stack_t *ips;
1630 ipnetif_t *ipnetif;
1631 dev_t dev = (dev_t)-1;
1633 if (is_system_labeled() && zoneid != GLOBAL_ZONEID)
1634 return (dev);
1635 if ((ns = netstack_find_by_zoneid(zoneid)) == NULL)
1636 return (dev);
1638 ips = ns->netstack_ipnet;
1639 mutex_enter(&ips->ips_avl_lock);
1640 if ((ipnetif = avl_find(&ips->ips_avl_by_name, name, NULL)) != NULL) {
1641 if (ipnetif_in_zone(ipnetif, zoneid, ips))
1642 dev = ipnetif->if_dev;
1644 mutex_exit(&ips->ips_avl_lock);
1645 netstack_rele(ns);
1647 return (dev);
1650 static ipnetif_t *
1651 ipnetif_getby_index(uint64_t id, ipnet_stack_t *ips)
1653 ipnetif_t *ipnetif;
1655 mutex_enter(&ips->ips_avl_lock);
1656 if ((ipnetif = avl_find(&ips->ips_avl_by_index, &id, NULL)) != NULL)
1657 ipnetif_refhold(ipnetif);
1658 mutex_exit(&ips->ips_avl_lock);
1659 return (ipnetif);
1662 static ipnetif_t *
1663 ipnetif_getby_dev(dev_t dev, ipnet_stack_t *ips)
1665 ipnetif_t *ipnetif;
1666 avl_tree_t *tree;
1668 mutex_enter(&ips->ips_avl_lock);
1669 tree = &ips->ips_avl_by_index;
1670 for (ipnetif = avl_first(tree); ipnetif != NULL;
1671 ipnetif = avl_walk(tree, ipnetif, AVL_AFTER)) {
1672 if (ipnetif->if_dev == dev) {
1673 ipnetif_refhold(ipnetif);
1674 break;
1677 mutex_exit(&ips->ips_avl_lock);
1678 return (ipnetif);
1681 static ipnetif_addr_t *
1682 ipnet_match_lif(ipnetif_t *ipnetif, lif_if_t lid, boolean_t isv6)
1684 ipnetif_addr_t *ifaddr;
1685 list_t *list;
1687 mutex_enter(&ipnetif->if_addr_lock);
1688 list = isv6 ? &ipnetif->if_ip6addr_list : &ipnetif->if_ip4addr_list;
1689 for (ifaddr = list_head(list); ifaddr != NULL;
1690 ifaddr = list_next(list, ifaddr)) {
1691 if (lid == ifaddr->ifa_id)
1692 break;
1694 mutex_exit(&ipnetif->if_addr_lock);
1695 return (ifaddr);
1698 /* ARGSUSED */
1699 static void *
1700 ipnet_stack_init(netstackid_t stackid, netstack_t *ns)
1702 ipnet_stack_t *ips;
1704 ips = kmem_zalloc(sizeof (*ips), KM_SLEEP);
1705 ips->ips_netstack = ns;
1706 mutex_init(&ips->ips_avl_lock, NULL, MUTEX_DEFAULT, 0);
1707 avl_create(&ips->ips_avl_by_index, ipnetif_compare_index,
1708 sizeof (ipnetif_t), offsetof(ipnetif_t, if_avl_by_index));
1709 avl_create(&ips->ips_avl_by_name, ipnetif_compare_name,
1710 sizeof (ipnetif_t), offsetof(ipnetif_t, if_avl_by_name));
1711 avl_create(&ips->ips_avl_by_shared, ipnetif_compare_name_zone,
1712 sizeof (ipnetif_t), offsetof(ipnetif_t, if_avl_by_shared));
1713 mutex_init(&ips->ips_walkers_lock, NULL, MUTEX_DEFAULT, NULL);
1714 cv_init(&ips->ips_walkers_cv, NULL, CV_DRIVER, NULL);
1715 list_create(&ips->ips_str_list, sizeof (ipnet_t),
1716 offsetof(ipnet_t, ipnet_next));
1717 ipnet_register_netihook(ips);
1718 return (ips);
1721 /* ARGSUSED */
1722 static void
1723 ipnet_stack_fini(netstackid_t stackid, void *arg)
1725 ipnet_stack_t *ips = arg;
1726 ipnetif_t *ipnetif, *nipnetif;
1728 if (ips->ips_kstatp != NULL) {
1729 zoneid_t zoneid;
1731 zoneid = netstackid_to_zoneid(stackid);
1732 net_kstat_delete(net_zoneidtonetid(zoneid), ips->ips_kstatp);
1734 if (ips->ips_ndv4 != NULL) {
1735 VERIFY(net_hook_unregister(ips->ips_ndv4, NH_NIC_EVENTS,
1736 ips->ips_nicevents) == 0);
1737 VERIFY(net_protocol_release(ips->ips_ndv4) == 0);
1739 if (ips->ips_ndv6 != NULL) {
1740 VERIFY(net_hook_unregister(ips->ips_ndv6, NH_NIC_EVENTS,
1741 ips->ips_nicevents) == 0);
1742 VERIFY(net_protocol_release(ips->ips_ndv6) == 0);
1744 hook_free(ips->ips_nicevents);
1746 for (ipnetif = avl_first(&ips->ips_avl_by_index); ipnetif != NULL;
1747 ipnetif = nipnetif) {
1748 nipnetif = AVL_NEXT(&ips->ips_avl_by_index, ipnetif);
1749 ipnetif_remove(ipnetif, ips);
1751 avl_destroy(&ips->ips_avl_by_shared);
1752 avl_destroy(&ips->ips_avl_by_index);
1753 avl_destroy(&ips->ips_avl_by_name);
1754 mutex_destroy(&ips->ips_avl_lock);
1755 mutex_destroy(&ips->ips_walkers_lock);
1756 cv_destroy(&ips->ips_walkers_cv);
1757 list_destroy(&ips->ips_str_list);
1758 kmem_free(ips, sizeof (*ips));
1761 /* Do any of the addresses in addrlist belong the supplied zoneid? */
1762 static boolean_t
1763 ipnet_addrs_in_zone(list_t *addrlist, zoneid_t zoneid)
1765 ipnetif_addr_t *ifa;
1767 for (ifa = list_head(addrlist); ifa != NULL;
1768 ifa = list_next(addrlist, ifa)) {
1769 if (ifa->ifa_zone == zoneid)
1770 return (B_TRUE);
1772 return (B_FALSE);
1775 /* Should the supplied ipnetif be visible from the supplied zoneid? */
1776 static boolean_t
1777 ipnetif_in_zone(ipnetif_t *ipnetif, zoneid_t zoneid, ipnet_stack_t *ips)
1779 int ret;
1782 * The global zone has visibility into all interfaces in the global
1783 * stack, and exclusive stack zones have visibility into all
1784 * interfaces in their stack.
1786 if (zoneid == GLOBAL_ZONEID ||
1787 ips->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID)
1788 return (B_TRUE);
1791 * Shared-stack zones only have visibility for interfaces that have
1792 * addresses in their zone.
1794 mutex_enter(&ipnetif->if_addr_lock);
1795 ret = ipnet_addrs_in_zone(&ipnetif->if_ip4addr_list, zoneid) ||
1796 ipnet_addrs_in_zone(&ipnetif->if_ip6addr_list, zoneid);
1797 mutex_exit(&ipnetif->if_addr_lock);
1798 return (ret);
1802 * Verify that any ipnet_t that has a reference to the supplied ipnetif should
1803 * still be allowed to have it open. A given ipnet_t may no longer be allowed
1804 * to have an ipnetif open if there are no longer any addresses that belong to
1805 * the ipnetif in the ipnet_t's non-global shared-stack zoneid. If that's the
1806 * case, send the ipnet_t an M_HANGUP.
1808 static void
1809 ipnetif_zonecheck(ipnetif_t *ipnetif, ipnet_stack_t *ips)
1811 list_t *strlist = &ips->ips_str_list;
1812 ipnet_t *ipnet;
1814 ipnet_walkers_inc(ips);
1815 for (ipnet = list_head(strlist); ipnet != NULL;
1816 ipnet = list_next(strlist, ipnet)) {
1817 if (ipnet->ipnet_if != ipnetif)
1818 continue;
1819 if (!ipnetif_in_zone(ipnetif, ipnet->ipnet_zoneid, ips))
1820 (void) putnextctl(ipnet->ipnet_rq, M_HANGUP);
1822 ipnet_walkers_dec(ips);
1825 void
1826 ipnet_walk_if(ipnet_walkfunc_t *cb, void *arg, zoneid_t zoneid)
1828 ipnetif_t *ipnetif;
1829 list_t cbdata;
1830 ipnetif_cbdata_t *cbnode;
1831 netstack_t *ns;
1832 ipnet_stack_t *ips;
1835 * On labeled systems, non-global zones shouldn't see anything
1836 * in /dev/ipnet.
1838 if (is_system_labeled() && zoneid != GLOBAL_ZONEID)
1839 return;
1841 if ((ns = netstack_find_by_zoneid(zoneid)) == NULL)
1842 return;
1844 ips = ns->netstack_ipnet;
1845 list_create(&cbdata, sizeof (ipnetif_cbdata_t),
1846 offsetof(ipnetif_cbdata_t, ic_next));
1848 mutex_enter(&ips->ips_avl_lock);
1849 for (ipnetif = avl_first(&ips->ips_avl_by_index); ipnetif != NULL;
1850 ipnetif = avl_walk(&ips->ips_avl_by_index, ipnetif, AVL_AFTER)) {
1851 if (!ipnetif_in_zone(ipnetif, zoneid, ips))
1852 continue;
1853 cbnode = kmem_zalloc(sizeof (ipnetif_cbdata_t), KM_SLEEP);
1854 (void) strlcpy(cbnode->ic_ifname, ipnetif->if_name, LIFNAMSIZ);
1855 cbnode->ic_dev = ipnetif->if_dev;
1856 list_insert_head(&cbdata, cbnode);
1858 mutex_exit(&ips->ips_avl_lock);
1860 while ((cbnode = list_head(&cbdata)) != NULL) {
1861 cb(cbnode->ic_ifname, arg, cbnode->ic_dev);
1862 list_remove(&cbdata, cbnode);
1863 kmem_free(cbnode, sizeof (ipnetif_cbdata_t));
1865 list_destroy(&cbdata);
1866 netstack_rele(ns);
1869 static int
1870 ipnetif_compare_index(const void *index_ptr, const void *ipnetifp)
1872 int64_t index1 = *((int64_t *)index_ptr);
1873 int64_t index2 = (int64_t)((ipnetif_t *)ipnetifp)->if_index;
1875 return (SIGNOF(index2 - index1));
1878 static int
1879 ipnetif_compare_name(const void *name_ptr, const void *ipnetifp)
1881 int res;
1883 res = strcmp(((ipnetif_t *)ipnetifp)->if_name, name_ptr);
1884 return (SIGNOF(res));
1887 static int
1888 ipnetif_compare_name_zone(const void *key_ptr, const void *ipnetifp)
1890 const uintptr_t *ptr = key_ptr;
1891 const ipnetif_t *ifp;
1892 int res;
1894 ifp = ipnetifp;
1895 res = ifp->if_zoneid - ptr[0];
1896 if (res != 0)
1897 return (SIGNOF(res));
1898 res = strcmp(ifp->if_name, (char *)ptr[1]);
1899 return (SIGNOF(res));
1902 static void
1903 ipnetif_refhold(ipnetif_t *ipnetif)
1905 mutex_enter(&ipnetif->if_reflock);
1906 ipnetif->if_refcnt++;
1907 mutex_exit(&ipnetif->if_reflock);
1910 static void
1911 ipnetif_refrele(ipnetif_t *ipnetif)
1913 mutex_enter(&ipnetif->if_reflock);
1914 ASSERT(ipnetif->if_refcnt > 0);
1915 if (--ipnetif->if_refcnt == 0)
1916 ipnetif_free(ipnetif);
1917 else
1918 mutex_exit(&ipnetif->if_reflock);
1921 static void
1922 ipnet_walkers_inc(ipnet_stack_t *ips)
1924 mutex_enter(&ips->ips_walkers_lock);
1925 ips->ips_walkers_cnt++;
1926 mutex_exit(&ips->ips_walkers_lock);
1929 static void
1930 ipnet_walkers_dec(ipnet_stack_t *ips)
1932 mutex_enter(&ips->ips_walkers_lock);
1933 ASSERT(ips->ips_walkers_cnt != 0);
1934 if (--ips->ips_walkers_cnt == 0)
1935 cv_broadcast(&ips->ips_walkers_cv);
1936 mutex_exit(&ips->ips_walkers_lock);
1939 /*ARGSUSED*/
1940 static int
1941 ipobs_bounce_func(hook_event_token_t token, hook_data_t info, void *arg)
1943 hook_pkt_observe_t *hdr;
1944 pfv_t func = (pfv_t)arg;
1945 mblk_t *mp;
1947 hdr = (hook_pkt_observe_t *)info;
1949 * Code in ip_input() expects that it is the only one accessing the
1950 * packet.
1952 mp = copymsg(hdr->hpo_pkt);
1953 if (mp == NULL) {
1954 netstack_t *ns = hdr->hpo_ctx;
1955 ipnet_stack_t *ips = ns->netstack_ipnet;
1957 IPSK_BUMP(ips, ik_dispatchDupDrop);
1958 return (0);
1961 hdr = (hook_pkt_observe_t *)mp->b_rptr;
1962 hdr->hpo_pkt = mp;
1964 func(mp);
1966 return (0);
1969 hook_t *
1970 ipobs_register_hook(netstack_t *ns, pfv_t func)
1972 ip_stack_t *ipst = ns->netstack_ip;
1973 char name[32];
1974 hook_t *hook;
1976 HOOK_INIT(hook, ipobs_bounce_func, "", (void *)func);
1977 VERIFY(hook != NULL);
1980 * To register multiple hooks with the same callback function,
1981 * a unique name is needed.
1983 (void) snprintf(name, sizeof (name), "ipobserve_%p", (void *)hook);
1984 hook->h_name = strdup(name);
1986 (void) net_hook_register(ipst->ips_ip4_observe_pr, NH_OBSERVE, hook);
1987 (void) net_hook_register(ipst->ips_ip6_observe_pr, NH_OBSERVE, hook);
1989 return (hook);
1992 void
1993 ipobs_unregister_hook(netstack_t *ns, hook_t *hook)
1995 ip_stack_t *ipst = ns->netstack_ip;
1997 (void) net_hook_unregister(ipst->ips_ip4_observe_pr, NH_OBSERVE, hook);
1999 (void) net_hook_unregister(ipst->ips_ip6_observe_pr, NH_OBSERVE, hook);
2001 strfree(hook->h_name);
2003 hook_free(hook);
2006 /* ******************************************************************** */
2007 /* BPF Functions below */
2008 /* ******************************************************************** */
2011 * Convenience function to make mapping a zoneid to an ipnet_stack_t easy.
2013 ipnet_stack_t *
2014 ipnet_find_by_zoneid(zoneid_t zoneid)
2016 netstack_t *ns;
2018 VERIFY((ns = netstack_find_by_zoneid(zoneid)) != NULL);
2019 return (ns->netstack_ipnet);
2023 * Functions, such as the above ipnet_find_by_zoneid(), will return a
2024 * pointer to ipnet_stack_t by calling a netstack lookup function.
2025 * The netstack_find_*() functions return a pointer after doing a "hold"
2026 * on the data structure and thereby require a "release" when the caller
2027 * is finished with it. We need to mirror that API here and thus a caller
2028 * of ipnet_find_by_zoneid() is required to call ipnet_rele().
2030 void
2031 ipnet_rele(ipnet_stack_t *ips)
2033 netstack_rele(ips->ips_netstack);
2038 void
2039 ipnet_set_itap(bpf_itap_fn_t tapfunc)
2041 ipnet_itap = tapfunc;
2045 * The list of interfaces available via ipnet is private for each zone,
2046 * so the AVL tree of each zone must be searched for a given name, even
2047 * if all names are unique.
2050 ipnet_open_byname(const char *name, ipnetif_t **ptr, zoneid_t zoneid)
2052 ipnet_stack_t *ips;
2053 ipnetif_t *ipnetif;
2055 ASSERT(ptr != NULL);
2056 VERIFY((ips = ipnet_find_by_zoneid(zoneid)) != NULL);
2058 mutex_enter(&ips->ips_avl_lock);
2061 * Shared instance zone?
2063 if (netstackid_to_zoneid(zoneid_to_netstackid(zoneid)) != zoneid) {
2064 uintptr_t key[2] = { zoneid, (uintptr_t)name };
2066 ipnetif = avl_find(&ips->ips_avl_by_shared, (void *)key, NULL);
2067 } else {
2068 ipnetif = avl_find(&ips->ips_avl_by_name, (void *)name, NULL);
2070 if (ipnetif != NULL)
2071 ipnetif_refhold(ipnetif);
2072 mutex_exit(&ips->ips_avl_lock);
2074 *ptr = ipnetif;
2075 ipnet_rele(ips);
2077 if (ipnetif == NULL)
2078 return (ESRCH);
2079 return (0);
2082 void
2083 ipnet_close_byhandle(ipnetif_t *ifp)
2085 ASSERT(ifp != NULL);
2086 ipnetif_refrele(ifp);
2089 const char *
2090 ipnet_name(ipnetif_t *ifp)
2092 ASSERT(ifp != NULL);
2093 return (ifp->if_name);
2097 * To find the linkid for a given name, it is necessary to know which zone
2098 * the interface name belongs to and to search the avl tree for that zone
2099 * as there is no master list of all interfaces and which zone they belong
2100 * to. It is assumed that the caller of this function is somehow already
2101 * working with the ipnet interfaces and hence the ips_event_lock is held.
2102 * When BPF calls into this function, it is doing so because of an event
2103 * in ipnet, and thus ipnet holds the ips_event_lock. Thus the datalink id
2104 * value returned has meaning without the need for grabbing a hold on the
2105 * owning structure.
2108 ipnet_get_linkid_byname(const char *name, uint_t *idp, zoneid_t zoneid)
2110 ipnet_stack_t *ips;
2111 ipnetif_t *ifp;
2113 VERIFY((ips = ipnet_find_by_zoneid(zoneid)) != NULL);
2114 ASSERT(mutex_owned(&ips->ips_event_lock));
2116 mutex_enter(&ips->ips_avl_lock);
2117 ifp = avl_find(&ips->ips_avl_by_name, (void *)name, NULL);
2118 if (ifp != NULL)
2119 *idp = (uint_t)ifp->if_index;
2122 * Shared instance zone?
2124 if (netstackid_to_zoneid(zoneid_to_netstackid(zoneid)) != zoneid) {
2125 uintptr_t key[2] = { zoneid, (uintptr_t)name };
2127 ifp = avl_find(&ips->ips_avl_by_shared, (void *)key, NULL);
2128 if (ifp != NULL)
2129 *idp = (uint_t)ifp->if_index;
2132 mutex_exit(&ips->ips_avl_lock);
2133 ipnet_rele(ips);
2135 if (ifp == NULL)
2136 return (ESRCH);
2137 return (0);
2141 * Strictly speaking, there is no such thing as a "client" in ipnet, like
2142 * there is in mac. BPF only needs to have this because it is required as
2143 * part of interfacing correctly with mac. The reuse of the original
2144 * ipnetif_t as a client poses no danger, so long as it is done with its
2145 * own ref-count'd hold that is given up on close.
2148 ipnet_client_open(ipnetif_t *ptr, ipnetif_t **result)
2150 ASSERT(ptr != NULL);
2151 ASSERT(result != NULL);
2152 ipnetif_refhold(ptr);
2153 *result = ptr;
2155 return (0);
2158 void
2159 ipnet_client_close(ipnetif_t *ptr)
2161 ASSERT(ptr != NULL);
2162 ipnetif_refrele(ptr);
2166 * This is called from BPF when it needs to start receiving packets
2167 * from ipnet.
2169 * The use of the ipnet_t structure here is somewhat lightweight when
2170 * compared to how it is used elsewhere but it already has all of the
2171 * right fields in it, so reuse here doesn't seem out of order. Its
2172 * primary purpose here is to provide the means to store pointers for
2173 * use when ipnet_promisc_remove() needs to be called.
2175 * This should never be called for the IPNET_MINOR_LO device as it is
2176 * never created via ipnetif_create.
2178 /*ARGSUSED*/
2180 ipnet_promisc_add(void *handle, uint_t how, void *data, uintptr_t *mhandle,
2181 int flags)
2183 ip_stack_t *ipst;
2184 netstack_t *ns;
2185 ipnetif_t *ifp;
2186 ipnet_t *ipnet;
2187 char name[32];
2188 int error;
2190 ifp = (ipnetif_t *)handle;
2192 if (how != DL_PROMISC_PHYS && how != DL_PROMISC_MULTI)
2193 return (EINVAL);
2195 ns = netstack_find_by_zoneid(ifp->if_zoneid);
2197 if ((error = ipnet_join_allmulti(ifp, ns->netstack_ipnet)) != 0) {
2198 netstack_rele(ns);
2199 return (error);
2202 ipnet = kmem_zalloc(sizeof (*ipnet), KM_SLEEP);
2203 ipnet->ipnet_if = ifp;
2204 ipnet->ipnet_ns = ns;
2205 ipnet->ipnet_flags = flags;
2207 if ((ifp->if_flags & IPNETIF_LOOPBACK) != 0) {
2208 ipnet->ipnet_acceptfn = ipnet_loaccept;
2209 } else {
2210 ipnet->ipnet_acceptfn = ipnet_accept;
2214 * To register multiple hooks with the same callback function,
2215 * a unique name is needed.
2217 HOOK_INIT(ipnet->ipnet_hook, ipnet_bpf_bounce, "", ipnet);
2218 (void) snprintf(name, sizeof (name), "ipnet_promisc_%p",
2219 (void *)ipnet->ipnet_hook);
2220 ipnet->ipnet_hook->h_name = strdup(name);
2221 ipnet->ipnet_data = data;
2222 ipnet->ipnet_zoneid = ifp->if_zoneid;
2224 ipst = ns->netstack_ip;
2226 error = net_hook_register(ipst->ips_ip4_observe_pr, NH_OBSERVE,
2227 ipnet->ipnet_hook);
2228 if (error != 0)
2229 goto regfail;
2231 error = net_hook_register(ipst->ips_ip6_observe_pr, NH_OBSERVE,
2232 ipnet->ipnet_hook);
2233 if (error != 0) {
2234 (void) net_hook_unregister(ipst->ips_ip4_observe_pr,
2235 NH_OBSERVE, ipnet->ipnet_hook);
2236 goto regfail;
2239 *mhandle = (uintptr_t)ipnet;
2240 netstack_rele(ns);
2242 return (0);
2244 regfail:
2245 cmn_err(CE_WARN, "net_hook_register failed: %d", error);
2246 strfree(ipnet->ipnet_hook->h_name);
2247 hook_free(ipnet->ipnet_hook);
2248 netstack_rele(ns);
2249 return (error);
2252 void
2253 ipnet_promisc_remove(void *data)
2255 ip_stack_t *ipst;
2256 ipnet_t *ipnet;
2257 hook_t *hook;
2259 ipnet = data;
2260 ipst = ipnet->ipnet_ns->netstack_ip;
2261 hook = ipnet->ipnet_hook;
2263 VERIFY(net_hook_unregister(ipst->ips_ip4_observe_pr, NH_OBSERVE,
2264 hook) == 0);
2266 VERIFY(net_hook_unregister(ipst->ips_ip6_observe_pr, NH_OBSERVE,
2267 hook) == 0);
2269 strfree(hook->h_name);
2271 hook_free(hook);
2273 kmem_free(ipnet, sizeof (*ipnet));
2277 * arg here comes from the ipnet_t allocated in ipnet_promisc_add.
2278 * An important field from that structure is "ipnet_data" that
2279 * contains the "data" pointer passed into ipnet_promisc_add: it needs
2280 * to be passed back to bpf when we call into ipnet_itap.
2282 * ipnet_itap is set by ipnet_set_bpfattach, which in turn is called
2283 * from BPF.
2285 /*ARGSUSED*/
2286 static int
2287 ipnet_bpf_bounce(hook_event_token_t token, hook_data_t info, void *arg)
2289 hook_pkt_observe_t *hdr;
2290 ipnet_addrp_t src;
2291 ipnet_addrp_t dst;
2292 ipnet_stack_t *ips;
2293 ipnet_t *ipnet;
2294 mblk_t *netmp;
2295 mblk_t *mp;
2297 hdr = (hook_pkt_observe_t *)info;
2298 mp = hdr->hpo_pkt;
2299 ipnet = (ipnet_t *)arg;
2300 ips = ((netstack_t *)hdr->hpo_ctx)->netstack_ipnet;
2302 netmp = hdr->hpo_pkt->b_cont;
2303 src.iap_family = hdr->hpo_family;
2304 dst.iap_family = hdr->hpo_family;
2306 if (hdr->hpo_family == AF_INET) {
2307 src.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_src;
2308 dst.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_dst;
2309 } else {
2310 src.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_src;
2311 dst.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_dst;
2314 if (!(*ipnet->ipnet_acceptfn)(ipnet, hdr, &src, &dst)) {
2315 IPSK_BUMP(ips, ik_acceptFail);
2316 return (0);
2318 IPSK_BUMP(ips, ik_acceptOk);
2320 ipnet_itap(ipnet->ipnet_data, mp,
2321 hdr->hpo_htype == htons(IPOBS_HOOK_OUTBOUND),
2322 ntohl(hdr->hpo_pktlen) + MBLKL(mp));
2324 return (0);
2328 * clone'd ipnetif_t's are created when a shared IP instance zone comes
2329 * to life and configures an IP address. The model that BPF uses is that
2330 * each interface must have a unique pointer and each interface must be
2331 * representative of what it can capture. They are limited to one DLT
2332 * per interface and one zone per interface. Thus every interface that
2333 * can be seen in a zone must be announced via an attach to bpf. For
2334 * shared instance zones, this means the ipnet driver needs to detect
2335 * when an address is added to an interface in a zone for the first
2336 * time (and also when the last address is removed.)
2338 static ipnetif_t *
2339 ipnetif_clone_create(ipnetif_t *ifp, zoneid_t zoneid)
2341 uintptr_t key[2] = { zoneid, (uintptr_t)ifp->if_name };
2342 ipnet_stack_t *ips = ifp->if_stackp;
2343 avl_index_t where = 0;
2344 ipnetif_t *newif;
2346 mutex_enter(&ips->ips_avl_lock);
2347 newif = avl_find(&ips->ips_avl_by_shared, (void *)key, &where);
2348 if (newif != NULL) {
2349 ipnetif_refhold(newif);
2350 newif->if_sharecnt++;
2351 mutex_exit(&ips->ips_avl_lock);
2352 return (newif);
2355 newif = ipnet_alloc_if(ips);
2356 if (newif == NULL) {
2357 mutex_exit(&ips->ips_avl_lock);
2358 return (NULL);
2361 newif->if_refcnt = 1;
2362 newif->if_sharecnt = 1;
2363 newif->if_zoneid = zoneid;
2364 (void) strlcpy(newif->if_name, ifp->if_name, LIFNAMSIZ);
2365 newif->if_flags = ifp->if_flags & IPNETIF_LOOPBACK;
2366 newif->if_index = ifp->if_index;
2368 avl_insert(&ips->ips_avl_by_shared, newif, where);
2369 mutex_exit(&ips->ips_avl_lock);
2371 return (newif);
2374 static void
2375 ipnetif_clone_release(ipnetif_t *ipnetif)
2377 boolean_t dofree = B_FALSE;
2378 boolean_t doremove = B_FALSE;
2379 ipnet_stack_t *ips = ipnetif->if_stackp;
2381 mutex_enter(&ipnetif->if_reflock);
2382 ASSERT(ipnetif->if_refcnt > 0);
2383 if (--ipnetif->if_refcnt == 0)
2384 dofree = B_TRUE;
2385 ASSERT(ipnetif->if_sharecnt > 0);
2386 if (--ipnetif->if_sharecnt == 0)
2387 doremove = B_TRUE;
2388 mutex_exit(&ipnetif->if_reflock);
2389 if (doremove) {
2390 mutex_enter(&ips->ips_avl_lock);
2391 avl_remove(&ips->ips_avl_by_shared, ipnetif);
2392 mutex_exit(&ips->ips_avl_lock);
2394 if (dofree) {
2395 ASSERT(ipnetif->if_sharecnt == 0);
2396 ipnetif_free(ipnetif);