Merge commit 'ea01a15a654b9e1c7b37d958f4d1911882ed7781'
[unleashed.git] / kernel / net / ipnet / ipnet.c
blob98e97f7951945ae70df7b3a3471e9ef1b418b712
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 * Copyright (c) 2016 by Delphix. All rights reserved.
29 * Copyright (c) 2016, Joyent, Inc. All rights reserved.
33 * The ipnet device defined here provides access to packets at the IP layer. To
34 * provide access to packets at this layer it registers a callback function in
35 * the ip module and when there are open instances of the device ip will pass
36 * packets into the device. Packets from ip are passed on the input, output and
37 * loopback paths. Internally the module returns to ip as soon as possible by
38 * deferring processing using a taskq.
40 * Management of the devices in /dev/ipnet/ is handled by the devname
41 * filesystem and use of the neti interfaces. This module registers for NIC
42 * events using the neti framework so that when IP interfaces are brought up,
43 * taken down etc. the ipnet module is notified and its view of the interfaces
44 * configured on the system adjusted. On attach, the module gets an initial
45 * view of the system again using the neti framework but as it has already
46 * registered for IP interface events, it is still up-to-date with any changes.
49 #include <sys/types.h>
50 #include <sys/conf.h>
51 #include <sys/cred.h>
52 #include <sys/stat.h>
53 #include <sys/ddi.h>
54 #include <sys/sunddi.h>
55 #include <sys/modctl.h>
56 #include <sys/dlpi.h>
57 #include <sys/strsun.h>
58 #include <sys/id_space.h>
59 #include <sys/kmem.h>
60 #include <sys/mkdev.h>
61 #include <sys/neti.h>
62 #include <net/if.h>
63 #include <sys/errno.h>
64 #include <sys/list.h>
65 #include <sys/ksynch.h>
66 #include <sys/hook_event.h>
67 #include <sys/sdt.h>
68 #include <sys/stropts.h>
69 #include <sys/sysmacros.h>
70 #include <inet/ip.h>
71 #include <inet/ip_if.h>
72 #include <inet/ip_multi.h>
73 #include <inet/ip6.h>
74 #include <inet/ipnet.h>
75 #include <net/bpf.h>
76 #include <net/bpfdesc.h>
77 #include <net/dlt.h>
79 static struct module_info ipnet_minfo = {
80 1, /* mi_idnum */
81 "ipnet", /* mi_idname */
82 0, /* mi_minpsz */
83 INFPSZ, /* mi_maxpsz */
84 2048, /* mi_hiwat */
85 0 /* mi_lowat */
89 * List to hold static view of ipnetif_t's on the system. This is needed to
90 * avoid holding the lock protecting the avl tree of ipnetif's over the
91 * callback into the dev filesystem.
93 typedef struct ipnetif_cbdata {
94 char ic_ifname[LIFNAMSIZ];
95 dev_t ic_dev;
96 list_node_t ic_next;
97 } ipnetif_cbdata_t;
/*
 * Convenience enumerated type for ipnet_accept(). It classifies a given
 * ipnet_addrp_t relative to a single ipnet_t client stream. The values
 * represent whether the address is ...
 */
typedef enum {
	IPNETADDR_MYADDR,	/* an address on my ipnetif_t. */
	IPNETADDR_MBCAST,	/* a multicast or broadcast address. */
	IPNETADDR_UNKNOWN	/* none of the above. */
} ipnet_addrtype_t;
110 /* Argument used for the ipnet_nicevent_taskq callback. */
111 typedef struct ipnet_nicevent_s {
112 nic_event_t ipne_event;
113 net_handle_t ipne_protocol;
114 netstackid_t ipne_stackid;
115 uint64_t ipne_ifindex;
116 uint64_t ipne_lifindex;
117 char ipne_ifname[LIFNAMSIZ];
118 } ipnet_nicevent_t;
120 static dev_info_t *ipnet_dip;
121 static major_t ipnet_major;
122 static ddi_taskq_t *ipnet_taskq; /* taskq for packets */
123 static ddi_taskq_t *ipnet_nicevent_taskq; /* taskq for NIC events */
124 static id_space_t *ipnet_minor_space;
125 static const int IPNET_MINOR_LO = 1; /* minor number for /dev/lo0 */
126 static const int IPNET_MINOR_MIN = 2; /* start of dynamic minors */
127 static dl_info_ack_t ipnet_infoack = IPNET_INFO_ACK_INIT;
128 static ipnet_acceptfn_t ipnet_accept, ipnet_loaccept;
129 static bpf_itap_fn_t ipnet_itap;
131 static void ipnet_input(mblk_t *);
132 static int ipnet_wput(queue_t *, mblk_t *);
133 static int ipnet_rsrv(queue_t *);
134 static int ipnet_open(queue_t *, dev_t *, int, int, cred_t *);
135 static int ipnet_close(queue_t *);
136 static void ipnet_ioctl(queue_t *, mblk_t *);
137 static void ipnet_iocdata(queue_t *, mblk_t *);
138 static void ipnet_wputnondata(queue_t *, mblk_t *);
139 static int ipnet_attach(dev_info_t *, ddi_attach_cmd_t);
140 static int ipnet_detach(dev_info_t *, ddi_detach_cmd_t);
141 static int ipnet_devinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
142 static void ipnet_inforeq(queue_t *q, mblk_t *mp);
143 static void ipnet_bindreq(queue_t *q, mblk_t *mp);
144 static void ipnet_unbindreq(queue_t *q, mblk_t *mp);
145 static void ipnet_dlpromisconreq(queue_t *q, mblk_t *mp);
146 static void ipnet_dlpromiscoffreq(queue_t *q, mblk_t *mp);
147 static int ipnet_join_allmulti(ipnetif_t *, ipnet_stack_t *);
148 static void ipnet_leave_allmulti(ipnetif_t *, ipnet_stack_t *);
149 static int ipnet_nicevent_cb(hook_event_token_t, hook_data_t, void *);
150 static void ipnet_nicevent_task(void *);
151 static ipnetif_t *ipnetif_create(const char *, uint64_t, ipnet_stack_t *,
152 uint64_t);
153 static void ipnetif_remove(ipnetif_t *, ipnet_stack_t *);
154 static ipnetif_addr_t *ipnet_match_lif(ipnetif_t *, lif_if_t, boolean_t);
155 static ipnetif_t *ipnetif_getby_index(uint64_t, ipnet_stack_t *);
156 static ipnetif_t *ipnetif_getby_dev(dev_t, ipnet_stack_t *);
157 static boolean_t ipnetif_in_zone(ipnetif_t *, zoneid_t, ipnet_stack_t *);
158 static void ipnetif_zonecheck(ipnetif_t *, ipnet_stack_t *);
159 static int ipnet_populate_if(net_handle_t, ipnet_stack_t *, boolean_t);
160 static int ipnetif_compare_name(const void *, const void *);
161 static int ipnetif_compare_name_zone(const void *, const void *);
162 static int ipnetif_compare_index(const void *, const void *);
163 static void ipnet_add_ifaddr(uint64_t, ipnetif_t *, net_handle_t);
164 static void ipnet_delete_ifaddr(ipnetif_addr_t *, ipnetif_t *, boolean_t);
165 static void ipnetif_refhold(ipnetif_t *);
166 static void ipnetif_refrele(ipnetif_t *);
167 static void ipnet_walkers_inc(ipnet_stack_t *);
168 static void ipnet_walkers_dec(ipnet_stack_t *);
169 static void ipnet_register_netihook(ipnet_stack_t *);
170 static void *ipnet_stack_init(netstackid_t, netstack_t *);
171 static void ipnet_stack_fini(netstackid_t, void *);
172 static void ipnet_dispatch(void *);
173 static int ipobs_bounce_func(hook_event_token_t, hook_data_t, void *);
174 static int ipnet_bpf_bounce(hook_event_token_t, hook_data_t, void *);
175 static ipnetif_t *ipnetif_clone_create(ipnetif_t *, zoneid_t);
176 static void ipnetif_clone_release(ipnetif_t *);
178 static struct qinit ipnet_rinit = {
179 NULL, /* qi_putp */
180 ipnet_rsrv, /* qi_srvp */
181 ipnet_open, /* qi_qopen */
182 ipnet_close, /* qi_qclose */
183 NULL, /* qi_qadmin */
184 &ipnet_minfo, /* qi_minfo */
187 static struct qinit ipnet_winit = {
188 ipnet_wput, /* qi_putp */
189 NULL, /* qi_srvp */
190 NULL, /* qi_qopen */
191 NULL, /* qi_qclose */
192 NULL, /* qi_qadmin */
193 &ipnet_minfo, /* qi_minfo */
196 static struct streamtab ipnet_info = {
197 &ipnet_rinit, &ipnet_winit
200 DDI_DEFINE_STREAM_OPS(ipnet_ops, nulldev, nulldev, ipnet_attach,
201 ipnet_detach, nodev, ipnet_devinfo, D_MP | D_MTPERMOD, &ipnet_info,
202 ddi_quiesce_not_supported);
204 static struct modldrv modldrv = {
205 &mod_driverops,
206 "STREAMS ipnet driver",
207 &ipnet_ops
210 static struct modlinkage modlinkage = {
211 MODREV_1, &modldrv, NULL
215 * This structure contains the template data (names and type) that is
216 * copied, in bulk, into the new kstats structure created by net_kstat_create.
217 * No actual statistical information is stored in this instance of the
218 * ipnet_kstats_t structure.
220 static ipnet_kstats_t stats_template = {
221 { "duplicationFail", KSTAT_DATA_UINT64 },
222 { "dispatchOk", KSTAT_DATA_UINT64 },
223 { "dispatchFail", KSTAT_DATA_UINT64 },
224 { "dispatchHeaderDrop", KSTAT_DATA_UINT64 },
225 { "dispatchDupDrop", KSTAT_DATA_UINT64 },
226 { "dispatchDeliver", KSTAT_DATA_UINT64 },
227 { "acceptOk", KSTAT_DATA_UINT64 },
228 { "acceptFail", KSTAT_DATA_UINT64 }
232 * Walk the list of physical interfaces on the machine, for each
233 * interface create a new ipnetif_t and add any addresses to it. We
234 * need to do the walk twice, once for IPv4 and once for IPv6.
236 * The interfaces are destroyed as part of ipnet_stack_fini() for each
237 * stack. Note that we cannot do this initialization in
238 * ipnet_stack_init(), since ipnet_stack_init() cannot fail.
240 static int
241 ipnetif_init(void)
243 netstack_handle_t nh;
244 netstack_t *ns;
245 ipnet_stack_t *ips;
246 int ret = 0;
248 netstack_next_init(&nh);
249 while ((ns = netstack_next(&nh)) != NULL) {
250 ips = ns->netstack_ipnet;
251 if ((ret = ipnet_populate_if(ips->ips_ndv4, ips, B_FALSE)) == 0)
252 ret = ipnet_populate_if(ips->ips_ndv6, ips, B_TRUE);
253 netstack_rele(ns);
254 if (ret != 0)
255 break;
257 netstack_next_fini(&nh);
258 return (ret);
262 * Standard module entry points.
265 _init(void)
267 int ret;
268 boolean_t netstack_registered = B_FALSE;
270 if ((ipnet_major = ddi_name_to_major("ipnet")) == (major_t)-1)
271 return (ENODEV);
272 ipnet_minor_space = id_space_create("ipnet_minor_space",
273 IPNET_MINOR_MIN, MAXMIN32);
276 * We call ddi_taskq_create() with nthread == 1 to ensure in-order
277 * delivery of packets to clients. Note that we need to create the
278 * taskqs before calling netstack_register() since ipnet_stack_init()
279 * registers callbacks that use 'em.
281 ipnet_taskq = ddi_taskq_create(NULL, "ipnet", 1, TASKQ_DEFAULTPRI, 0);
282 ipnet_nicevent_taskq = ddi_taskq_create(NULL, "ipnet_nic_event_queue",
283 1, TASKQ_DEFAULTPRI, 0);
284 if (ipnet_taskq == NULL || ipnet_nicevent_taskq == NULL) {
285 ret = ENOMEM;
286 goto done;
289 netstack_register(NS_IPNET, ipnet_stack_init, NULL, ipnet_stack_fini);
290 netstack_registered = B_TRUE;
292 if ((ret = ipnetif_init()) == 0)
293 ret = mod_install(&modlinkage);
294 done:
295 if (ret != 0) {
296 if (ipnet_taskq != NULL)
297 ddi_taskq_destroy(ipnet_taskq);
298 if (ipnet_nicevent_taskq != NULL)
299 ddi_taskq_destroy(ipnet_nicevent_taskq);
300 if (netstack_registered)
301 netstack_unregister(NS_IPNET);
302 id_space_destroy(ipnet_minor_space);
304 return (ret);
308 _fini(void)
310 int err;
312 if ((err = mod_remove(&modlinkage)) != 0)
313 return (err);
315 netstack_unregister(NS_IPNET);
316 ddi_taskq_destroy(ipnet_nicevent_taskq);
317 ddi_taskq_destroy(ipnet_taskq);
318 id_space_destroy(ipnet_minor_space);
319 return (0);
323 _info(struct modinfo *modinfop)
325 return (mod_info(&modlinkage, modinfop));
328 static void
329 ipnet_register_netihook(ipnet_stack_t *ips)
331 int ret;
332 zoneid_t zoneid;
333 netid_t netid;
335 HOOK_INIT(ips->ips_nicevents, ipnet_nicevent_cb, "ipnet_nicevents",
336 ips);
339 * It is possible for an exclusive stack to be in the process of
340 * shutting down here, and the netid and protocol lookups could fail
341 * in that case.
343 zoneid = netstackid_to_zoneid(ips->ips_netstack->netstack_stackid);
344 if ((netid = net_zoneidtonetid(zoneid)) == -1)
345 return;
347 if ((ips->ips_ndv4 = net_protocol_lookup(netid, NHF_INET)) != NULL) {
348 if ((ret = net_hook_register(ips->ips_ndv4, NH_NIC_EVENTS,
349 ips->ips_nicevents)) != 0) {
350 VERIFY(net_protocol_release(ips->ips_ndv4) == 0);
351 ips->ips_ndv4 = NULL;
352 cmn_err(CE_WARN, "unable to register IPv4 netinfo hooks"
353 " in zone %d: %d", zoneid, ret);
356 if ((ips->ips_ndv6 = net_protocol_lookup(netid, NHF_INET6)) != NULL) {
357 if ((ret = net_hook_register(ips->ips_ndv6, NH_NIC_EVENTS,
358 ips->ips_nicevents)) != 0) {
359 VERIFY(net_protocol_release(ips->ips_ndv6) == 0);
360 ips->ips_ndv6 = NULL;
361 cmn_err(CE_WARN, "unable to register IPv6 netinfo hooks"
362 " in zone %d: %d", zoneid, ret);
367 * Create a local set of kstats for each zone.
369 ips->ips_kstatp = net_kstat_create(netid, "ipnet", 0, "ipnet_stats",
370 "misc", KSTAT_TYPE_NAMED,
371 sizeof (ipnet_kstats_t) / sizeof (kstat_named_t), 0);
372 if (ips->ips_kstatp != NULL) {
373 bcopy(&stats_template, &ips->ips_stats,
374 sizeof (ips->ips_stats));
375 ips->ips_kstatp->ks_data = &ips->ips_stats;
376 ips->ips_kstatp->ks_private =
377 (void *)(uintptr_t)ips->ips_netstack->netstack_stackid;
378 kstat_install(ips->ips_kstatp);
379 } else {
380 cmn_err(CE_WARN, "net_kstat_create(%s,%s,%s) failed",
381 "ipnet", "ipnet_stats", "misc");
386 * This function is called on attach to build an initial view of the
387 * interfaces on the system. It will be called once for IPv4 and once
388 * for IPv6, although there is only one ipnet interface for both IPv4
389 * and IPv6 there are separate address lists.
391 static int
392 ipnet_populate_if(net_handle_t nd, ipnet_stack_t *ips, boolean_t isv6)
394 phy_if_t phyif;
395 lif_if_t lif;
396 ipnetif_t *ipnetif;
397 char name[LIFNAMSIZ];
398 boolean_t new_if = B_FALSE;
399 uint64_t ifflags;
400 int ret = 0;
403 * If ipnet_register_netihook() was unable to initialize this
404 * stack's net_handle_t, then we cannot populate any interface
405 * information. This usually happens when we attempted to
406 * grab a net_handle_t as a stack was shutting down. We don't
407 * want to fail the entire _init() operation because of a
408 * stack shutdown (other stacks will continue to work just
409 * fine), so we silently return success here.
411 if (nd == NULL)
412 return (0);
415 * Make sure we're not processing NIC events during the
416 * population of our interfaces and address lists.
418 mutex_enter(&ips->ips_event_lock);
420 for (phyif = net_phygetnext(nd, 0); phyif != 0;
421 phyif = net_phygetnext(nd, phyif)) {
422 if (net_getifname(nd, phyif, name, LIFNAMSIZ) != 0)
423 continue;
424 ifflags = 0;
425 (void) net_getlifflags(nd, phyif, 0, &ifflags);
426 if ((ipnetif = ipnetif_getby_index(phyif, ips)) == NULL) {
427 ipnetif = ipnetif_create(name, phyif, ips, ifflags);
428 if (ipnetif == NULL) {
429 ret = ENOMEM;
430 goto done;
432 new_if = B_TRUE;
434 ipnetif->if_flags |=
435 isv6 ? IPNETIF_IPV6PLUMBED : IPNETIF_IPV4PLUMBED;
437 for (lif = net_lifgetnext(nd, phyif, 0); lif != 0;
438 lif = net_lifgetnext(nd, phyif, lif)) {
440 * Skip addresses that aren't up. We'll add
441 * them when we receive an NE_LIF_UP event.
443 if (net_getlifflags(nd, phyif, lif, &ifflags) != 0 ||
444 !(ifflags & IFF_UP))
445 continue;
446 /* Don't add it if we already have it. */
447 if (ipnet_match_lif(ipnetif, lif, isv6) != NULL)
448 continue;
449 ipnet_add_ifaddr(lif, ipnetif, nd);
451 if (!new_if)
452 ipnetif_refrele(ipnetif);
455 done:
456 mutex_exit(&ips->ips_event_lock);
457 return (ret);
460 static int
461 ipnet_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
463 if (cmd != DDI_ATTACH)
464 return (DDI_FAILURE);
466 if (ddi_create_minor_node(dip, "lo0", S_IFCHR, IPNET_MINOR_LO,
467 DDI_PSEUDO, 0) == DDI_FAILURE)
468 return (DDI_FAILURE);
470 ipnet_dip = dip;
471 return (DDI_SUCCESS);
474 static int
475 ipnet_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
477 if (cmd != DDI_DETACH)
478 return (DDI_FAILURE);
480 ASSERT(dip == ipnet_dip);
481 ddi_remove_minor_node(ipnet_dip, NULL);
482 ipnet_dip = NULL;
483 return (DDI_SUCCESS);
486 /* ARGSUSED */
487 static int
488 ipnet_devinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
490 int error = DDI_FAILURE;
492 switch (infocmd) {
493 case DDI_INFO_DEVT2INSTANCE:
494 *result = NULL;
495 error = DDI_SUCCESS;
496 break;
497 case DDI_INFO_DEVT2DEVINFO:
498 if (ipnet_dip != NULL) {
499 *result = ipnet_dip;
500 error = DDI_SUCCESS;
502 break;
504 return (error);
507 /* ARGSUSED */
508 static int
509 ipnet_open(queue_t *rq, dev_t *dev, int oflag, int sflag, cred_t *crp)
511 ipnet_t *ipnet;
512 netstack_t *ns = NULL;
513 ipnet_stack_t *ips;
514 int err = 0;
515 zoneid_t zoneid = crgetzoneid(crp);
517 /* We don't support open as a module */
518 if (sflag & MODOPEN)
519 return (ENOTSUP);
521 /* This driver is self-cloning, we don't support re-open. */
522 if (rq->q_ptr != NULL)
523 return (EBUSY);
525 if ((ipnet = kmem_zalloc(sizeof (*ipnet), KM_NOSLEEP)) == NULL)
526 return (ENOMEM);
528 VERIFY((ns = netstack_find_by_cred(crp)) != NULL);
529 ips = ns->netstack_ipnet;
531 rq->q_ptr = WR(rq)->q_ptr = ipnet;
532 ipnet->ipnet_rq = rq;
533 ipnet->ipnet_minor = (minor_t)id_alloc(ipnet_minor_space);
534 ipnet->ipnet_zoneid = zoneid;
535 ipnet->ipnet_dlstate = DL_UNBOUND;
536 ipnet->ipnet_ns = ns;
539 * We need to hold ips_event_lock here as any NE_LIF_DOWN events need
540 * to be processed after ipnet_if is set and the ipnet_t has been
541 * inserted in the ips_str_list.
543 mutex_enter(&ips->ips_event_lock);
544 if (getminor(*dev) == IPNET_MINOR_LO) {
545 ipnet->ipnet_flags |= IPNET_LOMODE;
546 ipnet->ipnet_acceptfn = ipnet_loaccept;
547 } else {
548 ipnet->ipnet_acceptfn = ipnet_accept;
549 ipnet->ipnet_if = ipnetif_getby_dev(*dev, ips);
550 if (ipnet->ipnet_if == NULL ||
551 !ipnetif_in_zone(ipnet->ipnet_if, zoneid, ips)) {
552 err = ENODEV;
553 goto done;
557 mutex_enter(&ips->ips_walkers_lock);
558 while (ips->ips_walkers_cnt != 0)
559 cv_wait(&ips->ips_walkers_cv, &ips->ips_walkers_lock);
560 list_insert_head(&ips->ips_str_list, ipnet);
561 *dev = makedevice(getmajor(*dev), ipnet->ipnet_minor);
562 qprocson(rq);
565 * Only register our callback if we're the first open client; we call
566 * unregister in close() for the last open client.
568 if (list_head(&ips->ips_str_list) == list_tail(&ips->ips_str_list))
569 ips->ips_hook = ipobs_register_hook(ns, ipnet_input);
570 mutex_exit(&ips->ips_walkers_lock);
572 done:
573 mutex_exit(&ips->ips_event_lock);
574 if (err != 0) {
575 netstack_rele(ns);
576 id_free(ipnet_minor_space, ipnet->ipnet_minor);
577 if (ipnet->ipnet_if != NULL)
578 ipnetif_refrele(ipnet->ipnet_if);
579 kmem_free(ipnet, sizeof (*ipnet));
581 return (err);
584 static int
585 ipnet_close(queue_t *rq)
587 ipnet_t *ipnet = rq->q_ptr;
588 ipnet_stack_t *ips = ipnet->ipnet_ns->netstack_ipnet;
590 if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS)
591 ipnet_leave_allmulti(ipnet->ipnet_if, ips);
592 if (ipnet->ipnet_flags & IPNET_PROMISC_MULTI)
593 ipnet_leave_allmulti(ipnet->ipnet_if, ips);
595 mutex_enter(&ips->ips_walkers_lock);
596 while (ips->ips_walkers_cnt != 0)
597 cv_wait(&ips->ips_walkers_cv, &ips->ips_walkers_lock);
599 qprocsoff(rq);
601 list_remove(&ips->ips_str_list, ipnet);
602 if (ipnet->ipnet_if != NULL)
603 ipnetif_refrele(ipnet->ipnet_if);
604 id_free(ipnet_minor_space, ipnet->ipnet_minor);
606 if (list_is_empty(&ips->ips_str_list)) {
607 ipobs_unregister_hook(ips->ips_netstack, ips->ips_hook);
608 ips->ips_hook = NULL;
611 kmem_free(ipnet, sizeof (*ipnet));
613 mutex_exit(&ips->ips_walkers_lock);
614 netstack_rele(ips->ips_netstack);
615 return (0);
618 static int
619 ipnet_wput(queue_t *q, mblk_t *mp)
621 switch (mp->b_datap->db_type) {
622 case M_FLUSH:
623 if (*mp->b_rptr & FLUSHW) {
624 flushq(q, FLUSHDATA);
625 *mp->b_rptr &= ~FLUSHW;
627 if (*mp->b_rptr & FLUSHR)
628 qreply(q, mp);
629 else
630 freemsg(mp);
631 break;
632 case M_PROTO:
633 case M_PCPROTO:
634 ipnet_wputnondata(q, mp);
635 break;
636 case M_IOCTL:
637 ipnet_ioctl(q, mp);
638 break;
639 case M_IOCDATA:
640 ipnet_iocdata(q, mp);
641 break;
642 default:
643 freemsg(mp);
644 break;
646 return (0);
649 static int
650 ipnet_rsrv(queue_t *q)
652 mblk_t *mp;
654 while ((mp = getq(q)) != NULL) {
655 ASSERT(DB_TYPE(mp) == M_DATA);
656 if (canputnext(q)) {
657 putnext(q, mp);
658 } else {
659 (void) putbq(q, mp);
660 break;
663 return (0);
666 static void
667 ipnet_ioctl(queue_t *q, mblk_t *mp)
669 struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
671 switch (iocp->ioc_cmd) {
672 case DLIOCRAW:
673 miocack(q, mp, 0, 0);
674 break;
675 case DLIOCIPNETINFO:
676 if (iocp->ioc_count == TRANSPARENT) {
677 mcopyin(mp, NULL, sizeof (uint_t), NULL);
678 qreply(q, mp);
679 break;
681 /* We don't support I_STR with DLIOCIPNETINFO. */
682 /* FALLTHROUGH */
683 default:
684 miocnak(q, mp, 0, EINVAL);
685 break;
689 static void
690 ipnet_iocdata(queue_t *q, mblk_t *mp)
692 struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
693 ipnet_t *ipnet = q->q_ptr;
695 switch (iocp->ioc_cmd) {
696 case DLIOCIPNETINFO:
697 if (*(int *)mp->b_cont->b_rptr == 1)
698 ipnet->ipnet_flags |= IPNET_INFO;
699 else if (*(int *)mp->b_cont->b_rptr == 0)
700 ipnet->ipnet_flags &= ~IPNET_INFO;
701 else
702 goto iocnak;
703 miocack(q, mp, 0, DL_IPNETINFO_VERSION);
704 break;
705 default:
706 iocnak:
707 miocnak(q, mp, 0, EINVAL);
708 break;
712 static void
713 ipnet_wputnondata(queue_t *q, mblk_t *mp)
715 union DL_primitives *dlp = (union DL_primitives *)mp->b_rptr;
716 t_uscalar_t prim = dlp->dl_primitive;
718 switch (prim) {
719 case DL_INFO_REQ:
720 ipnet_inforeq(q, mp);
721 break;
722 case DL_UNBIND_REQ:
723 ipnet_unbindreq(q, mp);
724 break;
725 case DL_BIND_REQ:
726 ipnet_bindreq(q, mp);
727 break;
728 case DL_PROMISCON_REQ:
729 ipnet_dlpromisconreq(q, mp);
730 break;
731 case DL_PROMISCOFF_REQ:
732 ipnet_dlpromiscoffreq(q, mp);
733 break;
734 case DL_UNITDATA_REQ:
735 case DL_DETACH_REQ:
736 case DL_PHYS_ADDR_REQ:
737 case DL_SET_PHYS_ADDR_REQ:
738 case DL_ENABMULTI_REQ:
739 case DL_DISABMULTI_REQ:
740 case DL_ATTACH_REQ:
741 dlerrorack(q, mp, prim, DL_UNSUPPORTED, 0);
742 break;
743 default:
744 dlerrorack(q, mp, prim, DL_BADPRIM, 0);
745 break;
749 static void
750 ipnet_inforeq(queue_t *q, mblk_t *mp)
752 dl_info_ack_t *dlip;
753 size_t size = sizeof (dl_info_ack_t) + sizeof (ushort_t);
755 if (MBLKL(mp) < DL_INFO_REQ_SIZE) {
756 dlerrorack(q, mp, DL_INFO_REQ, DL_BADPRIM, 0);
757 return;
760 if ((mp = mexchange(q, mp, size, M_PCPROTO, DL_INFO_ACK)) == NULL)
761 return;
763 dlip = (dl_info_ack_t *)mp->b_rptr;
764 *dlip = ipnet_infoack;
765 qreply(q, mp);
768 static void
769 ipnet_bindreq(queue_t *q, mblk_t *mp)
771 union DL_primitives *dlp = (union DL_primitives *)mp->b_rptr;
772 ipnet_t *ipnet = q->q_ptr;
774 if (MBLKL(mp) < DL_BIND_REQ_SIZE) {
775 dlerrorack(q, mp, DL_BIND_REQ, DL_BADPRIM, 0);
776 return;
779 switch (dlp->bind_req.dl_sap) {
780 case 0 :
781 ipnet->ipnet_family = AF_UNSPEC;
782 break;
783 case IPV4_VERSION :
784 ipnet->ipnet_family = AF_INET;
785 break;
786 case IPV6_VERSION :
787 ipnet->ipnet_family = AF_INET6;
788 break;
789 default :
790 dlerrorack(q, mp, DL_BIND_REQ, DL_BADSAP, 0);
791 return;
792 /*NOTREACHED*/
795 ipnet->ipnet_dlstate = DL_IDLE;
796 dlbindack(q, mp, dlp->bind_req.dl_sap, 0, 0, 0, 0);
799 static void
800 ipnet_unbindreq(queue_t *q, mblk_t *mp)
802 ipnet_t *ipnet = q->q_ptr;
804 if (MBLKL(mp) < DL_UNBIND_REQ_SIZE) {
805 dlerrorack(q, mp, DL_UNBIND_REQ, DL_BADPRIM, 0);
806 return;
809 if (ipnet->ipnet_dlstate != DL_IDLE) {
810 dlerrorack(q, mp, DL_UNBIND_REQ, DL_OUTSTATE, 0);
811 } else {
812 ipnet->ipnet_dlstate = DL_UNBOUND;
813 ipnet->ipnet_family = AF_UNSPEC;
814 dlokack(q, mp, DL_UNBIND_REQ);
818 static void
819 ipnet_dlpromisconreq(queue_t *q, mblk_t *mp)
821 ipnet_t *ipnet = q->q_ptr;
822 t_uscalar_t level;
823 int err;
825 if (MBLKL(mp) < DL_PROMISCON_REQ_SIZE) {
826 dlerrorack(q, mp, DL_PROMISCON_REQ, DL_BADPRIM, 0);
827 return;
830 if (ipnet->ipnet_flags & IPNET_LOMODE) {
831 dlokack(q, mp, DL_PROMISCON_REQ);
832 return;
835 level = ((dl_promiscon_req_t *)mp->b_rptr)->dl_level;
836 if (level == DL_PROMISC_PHYS || level == DL_PROMISC_MULTI) {
837 if ((err = ipnet_join_allmulti(ipnet->ipnet_if,
838 ipnet->ipnet_ns->netstack_ipnet)) != 0) {
839 dlerrorack(q, mp, DL_PROMISCON_REQ, DL_SYSERR, err);
840 return;
844 switch (level) {
845 case DL_PROMISC_PHYS:
846 ipnet->ipnet_flags |= IPNET_PROMISC_PHYS;
847 break;
848 case DL_PROMISC_SAP:
849 ipnet->ipnet_flags |= IPNET_PROMISC_SAP;
850 break;
851 case DL_PROMISC_MULTI:
852 ipnet->ipnet_flags |= IPNET_PROMISC_MULTI;
853 break;
854 default:
855 dlerrorack(q, mp, DL_PROMISCON_REQ, DL_BADPRIM, 0);
856 return;
859 dlokack(q, mp, DL_PROMISCON_REQ);
862 static void
863 ipnet_dlpromiscoffreq(queue_t *q, mblk_t *mp)
865 ipnet_t *ipnet = q->q_ptr;
866 t_uscalar_t level;
867 uint16_t orig_ipnet_flags = ipnet->ipnet_flags;
869 if (MBLKL(mp) < DL_PROMISCOFF_REQ_SIZE) {
870 dlerrorack(q, mp, DL_PROMISCOFF_REQ, DL_BADPRIM, 0);
871 return;
874 if (ipnet->ipnet_flags & IPNET_LOMODE) {
875 dlokack(q, mp, DL_PROMISCOFF_REQ);
876 return;
879 level = ((dl_promiscon_req_t *)mp->b_rptr)->dl_level;
880 switch (level) {
881 case DL_PROMISC_PHYS:
882 if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS)
883 ipnet->ipnet_flags &= ~IPNET_PROMISC_PHYS;
884 break;
885 case DL_PROMISC_SAP:
886 if (ipnet->ipnet_flags & IPNET_PROMISC_SAP)
887 ipnet->ipnet_flags &= ~IPNET_PROMISC_SAP;
888 break;
889 case DL_PROMISC_MULTI:
890 if (ipnet->ipnet_flags & IPNET_PROMISC_MULTI)
891 ipnet->ipnet_flags &= ~IPNET_PROMISC_MULTI;
892 break;
893 default:
894 dlerrorack(q, mp, DL_PROMISCOFF_REQ, DL_BADPRIM, 0);
895 return;
898 if (orig_ipnet_flags == ipnet->ipnet_flags) {
899 dlerrorack(q, mp, DL_PROMISCOFF_REQ, DL_NOTENAB, 0);
900 return;
903 if (level == DL_PROMISC_PHYS || level == DL_PROMISC_MULTI) {
904 ipnet_leave_allmulti(ipnet->ipnet_if,
905 ipnet->ipnet_ns->netstack_ipnet);
908 dlokack(q, mp, DL_PROMISCOFF_REQ);
911 static int
912 ipnet_join_allmulti(ipnetif_t *ipnetif, ipnet_stack_t *ips)
914 int err = 0;
915 ip_stack_t *ipst = ips->ips_netstack->netstack_ip;
916 uint64_t index = ipnetif->if_index;
918 mutex_enter(&ips->ips_event_lock);
919 if (ipnetif->if_multicnt == 0) {
920 ASSERT((ipnetif->if_flags &
921 (IPNETIF_IPV4ALLMULTI | IPNETIF_IPV6ALLMULTI)) == 0);
922 if (ipnetif->if_flags & IPNETIF_IPV4PLUMBED) {
923 err = ip_join_allmulti(index, B_FALSE, ipst);
924 if (err != 0)
925 goto done;
926 ipnetif->if_flags |= IPNETIF_IPV4ALLMULTI;
928 if (ipnetif->if_flags & IPNETIF_IPV6PLUMBED) {
929 err = ip_join_allmulti(index, B_TRUE, ipst);
930 if (err != 0 &&
931 (ipnetif->if_flags & IPNETIF_IPV4ALLMULTI)) {
932 (void) ip_leave_allmulti(index, B_FALSE, ipst);
933 ipnetif->if_flags &= ~IPNETIF_IPV4ALLMULTI;
934 goto done;
936 ipnetif->if_flags |= IPNETIF_IPV6ALLMULTI;
939 ipnetif->if_multicnt++;
941 done:
942 mutex_exit(&ips->ips_event_lock);
943 return (err);
946 static void
947 ipnet_leave_allmulti(ipnetif_t *ipnetif, ipnet_stack_t *ips)
949 int err;
950 ip_stack_t *ipst = ips->ips_netstack->netstack_ip;
951 uint64_t index = ipnetif->if_index;
953 mutex_enter(&ips->ips_event_lock);
954 ASSERT(ipnetif->if_multicnt != 0);
955 if (--ipnetif->if_multicnt == 0) {
956 if (ipnetif->if_flags & IPNETIF_IPV4ALLMULTI) {
957 err = ip_leave_allmulti(index, B_FALSE, ipst);
958 ASSERT(err == 0 || err == ENODEV);
959 ipnetif->if_flags &= ~IPNETIF_IPV4ALLMULTI;
961 if (ipnetif->if_flags & IPNETIF_IPV6ALLMULTI) {
962 err = ip_leave_allmulti(index, B_TRUE, ipst);
963 ASSERT(err == 0 || err == ENODEV);
964 ipnetif->if_flags &= ~IPNETIF_IPV6ALLMULTI;
967 mutex_exit(&ips->ips_event_lock);
971 * Allocate a new mblk_t and put a dl_ipnetinfo_t in it.
972 * The structure it copies the header information from,
973 * hook_pkt_observe_t, is constructed using network byte
974 * order in ipobs_hook(), so there is no conversion here.
976 static mblk_t *
977 ipnet_addheader(hook_pkt_observe_t *hdr, mblk_t *mp)
979 mblk_t *dlhdr;
980 dl_ipnetinfo_t *dl;
982 if ((dlhdr = allocb(sizeof (dl_ipnetinfo_t), BPRI_HI)) == NULL) {
983 freemsg(mp);
984 return (NULL);
986 dl = (dl_ipnetinfo_t *)dlhdr->b_rptr;
987 dl->dli_version = DL_IPNETINFO_VERSION;
988 dl->dli_family = hdr->hpo_family;
989 dl->dli_htype = hdr->hpo_htype;
990 dl->dli_pktlen = hdr->hpo_pktlen;
991 dl->dli_ifindex = hdr->hpo_ifindex;
992 dl->dli_grifindex = hdr->hpo_grifindex;
993 dl->dli_zsrc = hdr->hpo_zsrc;
994 dl->dli_zdst = hdr->hpo_zdst;
995 dlhdr->b_wptr += sizeof (*dl);
996 dlhdr->b_cont = mp;
998 return (dlhdr);
1001 static ipnet_addrtype_t
1002 ipnet_get_addrtype(ipnet_t *ipnet, ipnet_addrp_t *addr)
1004 list_t *list;
1005 ipnetif_t *ipnetif = ipnet->ipnet_if;
1006 ipnetif_addr_t *ifaddr;
1007 ipnet_addrtype_t addrtype = IPNETADDR_UNKNOWN;
1009 /* First check if the address is multicast or limited broadcast. */
1010 switch (addr->iap_family) {
1011 case AF_INET:
1012 if (CLASSD(*(addr->iap_addr4)) ||
1013 *(addr->iap_addr4) == INADDR_BROADCAST)
1014 return (IPNETADDR_MBCAST);
1015 break;
1016 case AF_INET6:
1017 if (IN6_IS_ADDR_MULTICAST(addr->iap_addr6))
1018 return (IPNETADDR_MBCAST);
1019 break;
1023 * Walk the address list to see if the address belongs to our
1024 * interface or is one of our subnet broadcast addresses.
1026 mutex_enter(&ipnetif->if_addr_lock);
1027 list = (addr->iap_family == AF_INET) ?
1028 &ipnetif->if_ip4addr_list : &ipnetif->if_ip6addr_list;
1029 for (ifaddr = list_head(list);
1030 ifaddr != NULL && addrtype == IPNETADDR_UNKNOWN;
1031 ifaddr = list_next(list, ifaddr)) {
1033 * If we're not in the global zone, then only look at
1034 * addresses in our zone.
1036 if (ipnet->ipnet_zoneid != GLOBAL_ZONEID &&
1037 ipnet->ipnet_zoneid != ifaddr->ifa_zone)
1038 continue;
1039 switch (addr->iap_family) {
1040 case AF_INET:
1041 if (ifaddr->ifa_ip4addr != INADDR_ANY &&
1042 *(addr->iap_addr4) == ifaddr->ifa_ip4addr)
1043 addrtype = IPNETADDR_MYADDR;
1044 else if (ifaddr->ifa_brdaddr != INADDR_ANY &&
1045 *(addr->iap_addr4) == ifaddr->ifa_brdaddr)
1046 addrtype = IPNETADDR_MBCAST;
1047 break;
1048 case AF_INET6:
1049 if (IN6_ARE_ADDR_EQUAL(addr->iap_addr6,
1050 &ifaddr->ifa_ip6addr))
1051 addrtype = IPNETADDR_MYADDR;
1052 break;
1055 mutex_exit(&ipnetif->if_addr_lock);
1057 return (addrtype);
1061 * Verify if the packet contained in hdr should be passed up to the
1062 * ipnet client stream.
1064 static boolean_t
1065 ipnet_accept(ipnet_t *ipnet, hook_pkt_observe_t *hdr, ipnet_addrp_t *src,
1066 ipnet_addrp_t *dst)
1068 boolean_t obsif;
1069 uint64_t ifindex = ipnet->ipnet_if->if_index;
1070 ipnet_addrtype_t srctype;
1071 ipnet_addrtype_t dsttype;
1073 srctype = ipnet_get_addrtype(ipnet, src);
1074 dsttype = ipnet_get_addrtype(ipnet, dst);
1077 * If the packet's ifindex matches ours, or the packet's group ifindex
1078 * matches ours, it's on the interface we're observing. (Thus,
1079 * observing on the group ifindex matches all ifindexes in the group.)
1081 obsif = (ntohl(hdr->hpo_ifindex) == ifindex ||
1082 ntohl(hdr->hpo_grifindex) == ifindex);
1084 DTRACE_PROBE5(ipnet_accept__addr,
1085 ipnet_addrtype_t, srctype, ipnet_addrp_t *, src,
1086 ipnet_addrtype_t, dsttype, ipnet_addrp_t *, dst,
1087 boolean_t, obsif);
1090 * Do not allow an ipnet stream to see packets that are not from or to
1091 * its zone. The exception is when zones are using the shared stack
1092 * model. In this case, streams in the global zone have visibility
1093 * into other shared-stack zones, and broadcast and multicast traffic
1094 * is visible by all zones in the stack.
1096 if (ipnet->ipnet_zoneid != GLOBAL_ZONEID &&
1097 dsttype != IPNETADDR_MBCAST) {
1098 if (ipnet->ipnet_zoneid != ntohl(hdr->hpo_zsrc) &&
1099 ipnet->ipnet_zoneid != ntohl(hdr->hpo_zdst))
1100 return (B_FALSE);
1104 * If DL_PROMISC_SAP isn't enabled, then the bound SAP must match the
1105 * packet's IP version.
1107 if (!(ipnet->ipnet_flags & IPNET_PROMISC_SAP) &&
1108 ipnet->ipnet_family != hdr->hpo_family)
1109 return (B_FALSE);
1111 /* If the destination address is ours, then accept the packet. */
1112 if (dsttype == IPNETADDR_MYADDR)
1113 return (B_TRUE);
1116 * If DL_PROMISC_PHYS is enabled, then we can see all packets that are
1117 * sent or received on the interface we're observing, or packets that
1118 * have our source address (this allows us to see packets we send).
1120 if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS) {
1121 if (srctype == IPNETADDR_MYADDR || obsif)
1122 return (B_TRUE);
1126 * We accept multicast and broadcast packets transmitted or received
1127 * on the interface we're observing.
1129 if (dsttype == IPNETADDR_MBCAST && obsif)
1130 return (B_TRUE);
1132 return (B_FALSE);
1136 * Verify if the packet contained in hdr should be passed up to the ipnet
1137 * client stream that's in IPNET_LOMODE.
1139 /* ARGSUSED */
1140 static boolean_t
1141 ipnet_loaccept(ipnet_t *ipnet, hook_pkt_observe_t *hdr, ipnet_addrp_t *src,
1142 ipnet_addrp_t *dst)
1144 if (hdr->hpo_htype != htons(IPOBS_HOOK_LOCAL)) {
1146 * ipnet_if is only NULL for IPNET_MINOR_LO devices.
1148 if (ipnet->ipnet_if == NULL)
1149 return (B_FALSE);
1153 * An ipnet stream must not see packets that are not from/to its zone.
1155 if (ipnet->ipnet_zoneid != GLOBAL_ZONEID) {
1156 if (ipnet->ipnet_zoneid != ntohl(hdr->hpo_zsrc) &&
1157 ipnet->ipnet_zoneid != ntohl(hdr->hpo_zdst))
1158 return (B_FALSE);
1161 return (ipnet->ipnet_family == AF_UNSPEC ||
1162 ipnet->ipnet_family == hdr->hpo_family);
1165 static void
1166 ipnet_dispatch(void *arg)
1168 mblk_t *mp = arg;
1169 hook_pkt_observe_t *hdr = (hook_pkt_observe_t *)mp->b_rptr;
1170 ipnet_t *ipnet;
1171 mblk_t *netmp;
1172 list_t *list;
1173 ipnet_stack_t *ips;
1174 ipnet_addrp_t src;
1175 ipnet_addrp_t dst;
1177 ips = ((netstack_t *)hdr->hpo_ctx)->netstack_ipnet;
1179 netmp = hdr->hpo_pkt->b_cont;
1180 src.iap_family = hdr->hpo_family;
1181 dst.iap_family = hdr->hpo_family;
1183 if (hdr->hpo_family == AF_INET) {
1184 src.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_src;
1185 dst.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_dst;
1186 } else {
1187 src.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_src;
1188 dst.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_dst;
1191 ipnet_walkers_inc(ips);
1193 list = &ips->ips_str_list;
1194 for (ipnet = list_head(list); ipnet != NULL;
1195 ipnet = list_next(list, ipnet)) {
1196 if (!(*ipnet->ipnet_acceptfn)(ipnet, hdr, &src, &dst)) {
1197 IPSK_BUMP(ips, ik_acceptFail);
1198 continue;
1200 IPSK_BUMP(ips, ik_acceptOk);
1202 if (list_next(list, ipnet) == NULL) {
1203 netmp = hdr->hpo_pkt->b_cont;
1204 hdr->hpo_pkt->b_cont = NULL;
1205 } else {
1206 if ((netmp = dupmsg(hdr->hpo_pkt->b_cont)) == NULL &&
1207 (netmp = copymsg(hdr->hpo_pkt->b_cont)) == NULL) {
1208 IPSK_BUMP(ips, ik_duplicationFail);
1209 continue;
1213 if (ipnet->ipnet_flags & IPNET_INFO) {
1214 if ((netmp = ipnet_addheader(hdr, netmp)) == NULL) {
1215 IPSK_BUMP(ips, ik_dispatchHeaderDrop);
1216 continue;
1220 if (ipnet->ipnet_rq->q_first == NULL &&
1221 canputnext(ipnet->ipnet_rq)) {
1222 putnext(ipnet->ipnet_rq, netmp);
1223 IPSK_BUMP(ips, ik_dispatchDeliver);
1224 } else if (canput(ipnet->ipnet_rq)) {
1225 (void) putq(ipnet->ipnet_rq, netmp);
1226 IPSK_BUMP(ips, ik_dispatchDeliver);
1227 } else {
1228 freemsg(netmp);
1229 IPSK_BUMP(ips, ik_dispatchPutDrop);
1233 ipnet_walkers_dec(ips);
1235 freemsg(mp);
1238 static void
1239 ipnet_input(mblk_t *mp)
1241 hook_pkt_observe_t *hdr = (hook_pkt_observe_t *)mp->b_rptr;
1242 ipnet_stack_t *ips;
1244 ips = ((netstack_t *)hdr->hpo_ctx)->netstack_ipnet;
1246 if (ddi_taskq_dispatch(ipnet_taskq, ipnet_dispatch, mp, DDI_NOSLEEP) !=
1247 DDI_SUCCESS) {
1248 IPSK_BUMP(ips, ik_dispatchFail);
1249 freemsg(mp);
1250 } else {
1251 IPSK_BUMP(ips, ik_dispatchOk);
1255 static ipnetif_t *
1256 ipnet_alloc_if(ipnet_stack_t *ips)
1258 ipnetif_t *ipnetif;
1260 if ((ipnetif = kmem_zalloc(sizeof (*ipnetif), KM_NOSLEEP)) == NULL)
1261 return (NULL);
1263 mutex_init(&ipnetif->if_addr_lock, NULL, MUTEX_DEFAULT, 0);
1264 list_create(&ipnetif->if_ip4addr_list, sizeof (ipnetif_addr_t),
1265 offsetof(ipnetif_addr_t, ifa_link));
1266 list_create(&ipnetif->if_ip6addr_list, sizeof (ipnetif_addr_t),
1267 offsetof(ipnetif_addr_t, ifa_link));
1268 mutex_init(&ipnetif->if_reflock, NULL, MUTEX_DEFAULT, 0);
1270 ipnetif->if_stackp = ips;
1272 return (ipnetif);
1276 * Create a new ipnetif_t and new minor node for it. If creation is
1277 * successful the new ipnetif_t is inserted into an avl_tree
1278 * containing ipnetif's for this stack instance.
1280 static ipnetif_t *
1281 ipnetif_create(const char *name, uint64_t index, ipnet_stack_t *ips,
1282 uint64_t ifflags)
1284 ipnetif_t *ipnetif;
1285 avl_index_t where = 0;
1286 minor_t ifminor;
1289 * Because ipnetif_create() can be called from a NIC event
1290 * callback, it should not block.
1292 ifminor = (minor_t)id_alloc_nosleep(ipnet_minor_space);
1293 if (ifminor == (minor_t)-1)
1294 return (NULL);
1295 if ((ipnetif = ipnet_alloc_if(ips)) == NULL) {
1296 id_free(ipnet_minor_space, ifminor);
1297 return (NULL);
1300 (void) strlcpy(ipnetif->if_name, name, LIFNAMSIZ);
1301 ipnetif->if_index = (uint_t)index;
1302 ipnetif->if_zoneid = netstack_get_zoneid(ips->ips_netstack);
1303 ipnetif->if_dev = makedevice(ipnet_major, ifminor);
1305 ipnetif->if_refcnt = 1;
1306 if ((ifflags & IFF_LOOPBACK) != 0)
1307 ipnetif->if_flags = IPNETIF_LOOPBACK;
1309 mutex_enter(&ips->ips_avl_lock);
1310 VERIFY(avl_find(&ips->ips_avl_by_index, &index, &where) == NULL);
1311 avl_insert(&ips->ips_avl_by_index, ipnetif, where);
1312 VERIFY(avl_find(&ips->ips_avl_by_name, (void *)name, &where) == NULL);
1313 avl_insert(&ips->ips_avl_by_name, ipnetif, where);
1314 mutex_exit(&ips->ips_avl_lock);
1316 return (ipnetif);
1319 static void
1320 ipnetif_remove(ipnetif_t *ipnetif, ipnet_stack_t *ips)
1322 ipnet_t *ipnet;
1324 ipnet_walkers_inc(ips);
1325 /* Send a SIGHUP to all open streams associated with this ipnetif. */
1326 for (ipnet = list_head(&ips->ips_str_list); ipnet != NULL;
1327 ipnet = list_next(&ips->ips_str_list, ipnet)) {
1328 if (ipnet->ipnet_if == ipnetif)
1329 (void) putnextctl(ipnet->ipnet_rq, M_HANGUP);
1331 ipnet_walkers_dec(ips);
1332 mutex_enter(&ips->ips_avl_lock);
1333 avl_remove(&ips->ips_avl_by_index, ipnetif);
1334 avl_remove(&ips->ips_avl_by_name, ipnetif);
1335 mutex_exit(&ips->ips_avl_lock);
1337 * Release the reference we implicitly held in ipnetif_create().
1339 ipnetif_refrele(ipnetif);
1342 static void
1343 ipnet_purge_addrlist(list_t *addrlist)
1345 ipnetif_addr_t *ifa;
1347 while ((ifa = list_head(addrlist)) != NULL) {
1348 list_remove(addrlist, ifa);
1349 if (ifa->ifa_shared != NULL)
1350 ipnetif_clone_release(ifa->ifa_shared);
1351 kmem_free(ifa, sizeof (*ifa));
1355 static void
1356 ipnetif_free(ipnetif_t *ipnetif)
1358 ASSERT(ipnetif->if_refcnt == 0);
1359 ASSERT(ipnetif->if_sharecnt == 0);
1361 /* Remove IPv4/v6 address lists from the ipnetif */
1362 ipnet_purge_addrlist(&ipnetif->if_ip4addr_list);
1363 list_destroy(&ipnetif->if_ip4addr_list);
1364 ipnet_purge_addrlist(&ipnetif->if_ip6addr_list);
1365 list_destroy(&ipnetif->if_ip6addr_list);
1366 mutex_destroy(&ipnetif->if_addr_lock);
1367 mutex_destroy(&ipnetif->if_reflock);
1368 if (ipnetif->if_dev != 0)
1369 id_free(ipnet_minor_space, getminor(ipnetif->if_dev));
1370 kmem_free(ipnetif, sizeof (*ipnetif));
1374 * Create an ipnetif_addr_t with the given logical interface id (lif)
1375 * and add it to the supplied ipnetif. The lif is the netinfo
1376 * representation of logical interface id, and we use this id to match
1377 * incoming netinfo events against our lists of addresses.
1379 static void
1380 ipnet_add_ifaddr(uint64_t lif, ipnetif_t *ipnetif, net_handle_t nd)
1382 ipnetif_addr_t *ifaddr;
1383 zoneid_t zoneid;
1384 struct sockaddr_in bcast;
1385 struct sockaddr_storage addr;
1386 net_ifaddr_t type = NA_ADDRESS;
1387 uint64_t phyif = ipnetif->if_index;
1389 if (net_getlifaddr(nd, phyif, lif, 1, &type, &addr) != 0 ||
1390 net_getlifzone(nd, phyif, lif, &zoneid) != 0)
1391 return;
1393 if ((ifaddr = kmem_alloc(sizeof (*ifaddr), KM_NOSLEEP)) == NULL)
1394 return;
1395 ifaddr->ifa_zone = zoneid;
1396 ifaddr->ifa_id = lif;
1397 ifaddr->ifa_shared = NULL;
1399 switch (addr.ss_family) {
1400 case AF_INET:
1401 ifaddr->ifa_ip4addr =
1402 ((struct sockaddr_in *)&addr)->sin_addr.s_addr;
1404 * Try and get the broadcast address. Note that it's okay for
1405 * an interface to not have a broadcast address, so we don't
1406 * fail the entire operation if net_getlifaddr() fails here.
1408 type = NA_BROADCAST;
1409 if (net_getlifaddr(nd, phyif, lif, 1, &type, &bcast) == 0)
1410 ifaddr->ifa_brdaddr = bcast.sin_addr.s_addr;
1411 break;
1412 case AF_INET6:
1413 ifaddr->ifa_ip6addr = ((struct sockaddr_in6 *)&addr)->sin6_addr;
1414 break;
1418 * The zoneid stored in ipnetif_t needs to correspond to the actual
1419 * zone the address is being used in. This facilitates finding the
1420 * correct netstack_t pointer, amongst other things, later.
1422 if (zoneid == ALL_ZONES)
1423 zoneid = GLOBAL_ZONEID;
1425 mutex_enter(&ipnetif->if_addr_lock);
1426 if (zoneid != ipnetif->if_zoneid) {
1427 ipnetif_t *ifp2;
1429 ifp2 = ipnetif_clone_create(ipnetif, zoneid);
1430 ifaddr->ifa_shared = ifp2;
1432 list_insert_tail(addr.ss_family == AF_INET ?
1433 &ipnetif->if_ip4addr_list : &ipnetif->if_ip6addr_list, ifaddr);
1434 mutex_exit(&ipnetif->if_addr_lock);
1437 static void
1438 ipnet_delete_ifaddr(ipnetif_addr_t *ifaddr, ipnetif_t *ipnetif, boolean_t isv6)
1440 mutex_enter(&ipnetif->if_addr_lock);
1441 if (ifaddr->ifa_shared != NULL)
1442 ipnetif_clone_release(ifaddr->ifa_shared);
1444 list_remove(isv6 ?
1445 &ipnetif->if_ip6addr_list : &ipnetif->if_ip4addr_list, ifaddr);
1446 mutex_exit(&ipnetif->if_addr_lock);
1447 kmem_free(ifaddr, sizeof (*ifaddr));
1450 static void
1451 ipnet_plumb_ev(ipnet_nicevent_t *ipne, ipnet_stack_t *ips, boolean_t isv6)
1453 ipnetif_t *ipnetif;
1454 boolean_t refrele_needed = B_TRUE;
1455 uint64_t ifflags;
1456 uint64_t ifindex;
1457 char *ifname;
1459 ifflags = 0;
1460 ifname = ipne->ipne_ifname;
1461 ifindex = ipne->ipne_ifindex;
1463 (void) net_getlifflags(ipne->ipne_protocol, ifindex, 0, &ifflags);
1465 if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL) {
1466 ipnetif = ipnetif_create(ifname, ifindex, ips, ifflags);
1467 refrele_needed = B_FALSE;
1469 if (ipnetif != NULL) {
1470 ipnetif->if_flags |=
1471 isv6 ? IPNETIF_IPV6PLUMBED : IPNETIF_IPV4PLUMBED;
1474 if (ipnetif->if_multicnt != 0) {
1475 if (ip_join_allmulti(ifindex, isv6,
1476 ips->ips_netstack->netstack_ip) == 0) {
1477 ipnetif->if_flags |=
1478 isv6 ? IPNETIF_IPV6ALLMULTI : IPNETIF_IPV4ALLMULTI;
1482 if (refrele_needed)
1483 ipnetif_refrele(ipnetif);
1486 static void
1487 ipnet_unplumb_ev(uint64_t ifindex, ipnet_stack_t *ips, boolean_t isv6)
1489 ipnetif_t *ipnetif;
1491 if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL)
1492 return;
1494 mutex_enter(&ipnetif->if_addr_lock);
1495 ipnet_purge_addrlist(isv6 ?
1496 &ipnetif->if_ip6addr_list : &ipnetif->if_ip4addr_list);
1497 mutex_exit(&ipnetif->if_addr_lock);
1500 * Note that we have one ipnetif for both IPv4 and IPv6, but we receive
1501 * separate NE_UNPLUMB events for IPv4 and IPv6. We remove the ipnetif
1502 * if both IPv4 and IPv6 interfaces have been unplumbed.
1504 ipnetif->if_flags &= isv6 ? ~IPNETIF_IPV6PLUMBED : ~IPNETIF_IPV4PLUMBED;
1505 if (!(ipnetif->if_flags & (IPNETIF_IPV4PLUMBED | IPNETIF_IPV6PLUMBED)))
1506 ipnetif_remove(ipnetif, ips);
1507 ipnetif_refrele(ipnetif);
1510 static void
1511 ipnet_lifup_ev(uint64_t ifindex, uint64_t lifindex, net_handle_t nd,
1512 ipnet_stack_t *ips, boolean_t isv6)
1514 ipnetif_t *ipnetif;
1515 ipnetif_addr_t *ifaddr;
1517 if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL)
1518 return;
1519 if ((ifaddr = ipnet_match_lif(ipnetif, lifindex, isv6)) != NULL) {
1521 * We must have missed a NE_LIF_DOWN event. Delete this
1522 * ifaddr and re-create it.
1524 ipnet_delete_ifaddr(ifaddr, ipnetif, isv6);
1527 ipnet_add_ifaddr(lifindex, ipnetif, nd);
1528 ipnetif_refrele(ipnetif);
1531 static void
1532 ipnet_lifdown_ev(uint64_t ifindex, uint64_t lifindex, ipnet_stack_t *ips,
1533 boolean_t isv6)
1535 ipnetif_t *ipnetif;
1536 ipnetif_addr_t *ifaddr;
1538 if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL)
1539 return;
1540 if ((ifaddr = ipnet_match_lif(ipnetif, lifindex, isv6)) != NULL)
1541 ipnet_delete_ifaddr(ifaddr, ipnetif, isv6);
1542 ipnetif_refrele(ipnetif);
1544 * Make sure that open streams on this ipnetif are still allowed to
1545 * have it open.
1547 ipnetif_zonecheck(ipnetif, ips);
1551 * This callback from the NIC event framework dispatches a taskq as the event
1552 * handlers may block.
1554 /* ARGSUSED */
1555 static int
1556 ipnet_nicevent_cb(hook_event_token_t token, hook_data_t info, void *arg)
1558 ipnet_stack_t *ips = arg;
1559 hook_nic_event_t *hn = (hook_nic_event_t *)info;
1560 ipnet_nicevent_t *ipne;
1562 if ((ipne = kmem_alloc(sizeof (ipnet_nicevent_t), KM_NOSLEEP)) == NULL)
1563 return (0);
1564 ipne->ipne_event = hn->hne_event;
1565 ipne->ipne_protocol = hn->hne_protocol;
1566 ipne->ipne_stackid = ips->ips_netstack->netstack_stackid;
1567 ipne->ipne_ifindex = hn->hne_nic;
1568 ipne->ipne_lifindex = hn->hne_lif;
1569 if (hn->hne_datalen != 0) {
1570 (void) strlcpy(ipne->ipne_ifname, hn->hne_data,
1571 sizeof (ipne->ipne_ifname));
1573 (void) ddi_taskq_dispatch(ipnet_nicevent_taskq, ipnet_nicevent_task,
1574 ipne, DDI_NOSLEEP);
1575 return (0);
1578 static void
1579 ipnet_nicevent_task(void *arg)
1581 ipnet_nicevent_t *ipne = arg;
1582 netstack_t *ns;
1583 ipnet_stack_t *ips;
1584 boolean_t isv6;
1586 if ((ns = netstack_find_by_stackid(ipne->ipne_stackid)) == NULL)
1587 goto done;
1588 ips = ns->netstack_ipnet;
1589 isv6 = (ipne->ipne_protocol == ips->ips_ndv6);
1591 mutex_enter(&ips->ips_event_lock);
1592 switch (ipne->ipne_event) {
1593 case NE_PLUMB:
1594 ipnet_plumb_ev(ipne, ips, isv6);
1595 break;
1596 case NE_UNPLUMB:
1597 ipnet_unplumb_ev(ipne->ipne_ifindex, ips, isv6);
1598 break;
1599 case NE_LIF_UP:
1600 ipnet_lifup_ev(ipne->ipne_ifindex, ipne->ipne_lifindex,
1601 ipne->ipne_protocol, ips, isv6);
1602 break;
1603 case NE_LIF_DOWN:
1604 ipnet_lifdown_ev(ipne->ipne_ifindex, ipne->ipne_lifindex, ips,
1605 isv6);
1606 break;
1607 default:
1608 break;
1610 mutex_exit(&ips->ips_event_lock);
1611 done:
1612 if (ns != NULL)
1613 netstack_rele(ns);
1614 kmem_free(ipne, sizeof (ipnet_nicevent_t));
1617 dev_t
1618 ipnet_if_getdev(char *name, zoneid_t zoneid)
1620 netstack_t *ns;
1621 ipnet_stack_t *ips;
1622 ipnetif_t *ipnetif;
1623 dev_t dev = (dev_t)-1;
1625 if ((ns = netstack_find_by_zoneid(zoneid)) == NULL)
1626 return (dev);
1628 ips = ns->netstack_ipnet;
1629 mutex_enter(&ips->ips_avl_lock);
1630 if ((ipnetif = avl_find(&ips->ips_avl_by_name, name, NULL)) != NULL) {
1631 if (ipnetif_in_zone(ipnetif, zoneid, ips))
1632 dev = ipnetif->if_dev;
1634 mutex_exit(&ips->ips_avl_lock);
1635 netstack_rele(ns);
1637 return (dev);
1640 static ipnetif_t *
1641 ipnetif_getby_index(uint64_t id, ipnet_stack_t *ips)
1643 ipnetif_t *ipnetif;
1645 mutex_enter(&ips->ips_avl_lock);
1646 if ((ipnetif = avl_find(&ips->ips_avl_by_index, &id, NULL)) != NULL)
1647 ipnetif_refhold(ipnetif);
1648 mutex_exit(&ips->ips_avl_lock);
1649 return (ipnetif);
1652 static ipnetif_t *
1653 ipnetif_getby_dev(dev_t dev, ipnet_stack_t *ips)
1655 ipnetif_t *ipnetif;
1656 avl_tree_t *tree;
1658 mutex_enter(&ips->ips_avl_lock);
1659 tree = &ips->ips_avl_by_index;
1660 for (ipnetif = avl_first(tree); ipnetif != NULL;
1661 ipnetif = avl_walk(tree, ipnetif, AVL_AFTER)) {
1662 if (ipnetif->if_dev == dev) {
1663 ipnetif_refhold(ipnetif);
1664 break;
1667 mutex_exit(&ips->ips_avl_lock);
1668 return (ipnetif);
1671 static ipnetif_addr_t *
1672 ipnet_match_lif(ipnetif_t *ipnetif, lif_if_t lid, boolean_t isv6)
1674 ipnetif_addr_t *ifaddr;
1675 list_t *list;
1677 mutex_enter(&ipnetif->if_addr_lock);
1678 list = isv6 ? &ipnetif->if_ip6addr_list : &ipnetif->if_ip4addr_list;
1679 for (ifaddr = list_head(list); ifaddr != NULL;
1680 ifaddr = list_next(list, ifaddr)) {
1681 if (lid == ifaddr->ifa_id)
1682 break;
1684 mutex_exit(&ipnetif->if_addr_lock);
1685 return (ifaddr);
1688 /* ARGSUSED */
1689 static void *
1690 ipnet_stack_init(netstackid_t stackid, netstack_t *ns)
1692 ipnet_stack_t *ips;
1694 ips = kmem_zalloc(sizeof (*ips), KM_SLEEP);
1695 ips->ips_netstack = ns;
1696 mutex_init(&ips->ips_avl_lock, NULL, MUTEX_DEFAULT, 0);
1697 avl_create(&ips->ips_avl_by_index, ipnetif_compare_index,
1698 sizeof (ipnetif_t), offsetof(ipnetif_t, if_avl_by_index));
1699 avl_create(&ips->ips_avl_by_name, ipnetif_compare_name,
1700 sizeof (ipnetif_t), offsetof(ipnetif_t, if_avl_by_name));
1701 avl_create(&ips->ips_avl_by_shared, ipnetif_compare_name_zone,
1702 sizeof (ipnetif_t), offsetof(ipnetif_t, if_avl_by_shared));
1703 mutex_init(&ips->ips_walkers_lock, NULL, MUTEX_DEFAULT, NULL);
1704 cv_init(&ips->ips_walkers_cv, NULL, CV_DRIVER, NULL);
1705 list_create(&ips->ips_str_list, sizeof (ipnet_t),
1706 offsetof(ipnet_t, ipnet_next));
1707 ipnet_register_netihook(ips);
1708 return (ips);
1711 /* ARGSUSED */
1712 static void
1713 ipnet_stack_fini(netstackid_t stackid, void *arg)
1715 ipnet_stack_t *ips = arg;
1716 ipnetif_t *ipnetif, *nipnetif;
1718 if (ips->ips_kstatp != NULL) {
1719 zoneid_t zoneid;
1721 zoneid = netstackid_to_zoneid(stackid);
1722 net_kstat_delete(net_zoneidtonetid(zoneid), ips->ips_kstatp);
1724 if (ips->ips_ndv4 != NULL) {
1725 VERIFY(net_hook_unregister(ips->ips_ndv4, NH_NIC_EVENTS,
1726 ips->ips_nicevents) == 0);
1727 VERIFY(net_protocol_release(ips->ips_ndv4) == 0);
1729 if (ips->ips_ndv6 != NULL) {
1730 VERIFY(net_hook_unregister(ips->ips_ndv6, NH_NIC_EVENTS,
1731 ips->ips_nicevents) == 0);
1732 VERIFY(net_protocol_release(ips->ips_ndv6) == 0);
1734 hook_free(ips->ips_nicevents);
1736 for (ipnetif = avl_first(&ips->ips_avl_by_index); ipnetif != NULL;
1737 ipnetif = nipnetif) {
1738 nipnetif = AVL_NEXT(&ips->ips_avl_by_index, ipnetif);
1739 ipnetif_remove(ipnetif, ips);
1741 avl_destroy(&ips->ips_avl_by_shared);
1742 avl_destroy(&ips->ips_avl_by_index);
1743 avl_destroy(&ips->ips_avl_by_name);
1744 mutex_destroy(&ips->ips_avl_lock);
1745 mutex_destroy(&ips->ips_walkers_lock);
1746 cv_destroy(&ips->ips_walkers_cv);
1747 list_destroy(&ips->ips_str_list);
1748 kmem_free(ips, sizeof (*ips));
1751 /* Do any of the addresses in addrlist belong the supplied zoneid? */
1752 static boolean_t
1753 ipnet_addrs_in_zone(list_t *addrlist, zoneid_t zoneid)
1755 ipnetif_addr_t *ifa;
1757 for (ifa = list_head(addrlist); ifa != NULL;
1758 ifa = list_next(addrlist, ifa)) {
1759 if (ifa->ifa_zone == zoneid)
1760 return (B_TRUE);
1762 return (B_FALSE);
1765 /* Should the supplied ipnetif be visible from the supplied zoneid? */
1766 static boolean_t
1767 ipnetif_in_zone(ipnetif_t *ipnetif, zoneid_t zoneid, ipnet_stack_t *ips)
1769 int ret;
1772 * The global zone has visibility into all interfaces in the global
1773 * stack, and exclusive stack zones have visibility into all
1774 * interfaces in their stack.
1776 if (zoneid == GLOBAL_ZONEID ||
1777 ips->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID)
1778 return (B_TRUE);
1781 * Shared-stack zones only have visibility for interfaces that have
1782 * addresses in their zone.
1784 mutex_enter(&ipnetif->if_addr_lock);
1785 ret = ipnet_addrs_in_zone(&ipnetif->if_ip4addr_list, zoneid) ||
1786 ipnet_addrs_in_zone(&ipnetif->if_ip6addr_list, zoneid);
1787 mutex_exit(&ipnetif->if_addr_lock);
1788 return (ret);
1792 * Verify that any ipnet_t that has a reference to the supplied ipnetif should
1793 * still be allowed to have it open. A given ipnet_t may no longer be allowed
1794 * to have an ipnetif open if there are no longer any addresses that belong to
1795 * the ipnetif in the ipnet_t's non-global shared-stack zoneid. If that's the
1796 * case, send the ipnet_t an M_HANGUP.
1798 static void
1799 ipnetif_zonecheck(ipnetif_t *ipnetif, ipnet_stack_t *ips)
1801 list_t *strlist = &ips->ips_str_list;
1802 ipnet_t *ipnet;
1804 ipnet_walkers_inc(ips);
1805 for (ipnet = list_head(strlist); ipnet != NULL;
1806 ipnet = list_next(strlist, ipnet)) {
1807 if (ipnet->ipnet_if != ipnetif)
1808 continue;
1809 if (!ipnetif_in_zone(ipnetif, ipnet->ipnet_zoneid, ips))
1810 (void) putnextctl(ipnet->ipnet_rq, M_HANGUP);
1812 ipnet_walkers_dec(ips);
1815 void
1816 ipnet_walk_if(ipnet_walkfunc_t *cb, void *arg, zoneid_t zoneid)
1818 ipnetif_t *ipnetif;
1819 list_t cbdata;
1820 ipnetif_cbdata_t *cbnode;
1821 netstack_t *ns;
1822 ipnet_stack_t *ips;
1824 if ((ns = netstack_find_by_zoneid(zoneid)) == NULL)
1825 return;
1827 ips = ns->netstack_ipnet;
1828 list_create(&cbdata, sizeof (ipnetif_cbdata_t),
1829 offsetof(ipnetif_cbdata_t, ic_next));
1831 mutex_enter(&ips->ips_avl_lock);
1832 for (ipnetif = avl_first(&ips->ips_avl_by_index); ipnetif != NULL;
1833 ipnetif = avl_walk(&ips->ips_avl_by_index, ipnetif, AVL_AFTER)) {
1834 if (!ipnetif_in_zone(ipnetif, zoneid, ips))
1835 continue;
1836 cbnode = kmem_zalloc(sizeof (ipnetif_cbdata_t), KM_SLEEP);
1837 (void) strlcpy(cbnode->ic_ifname, ipnetif->if_name, LIFNAMSIZ);
1838 cbnode->ic_dev = ipnetif->if_dev;
1839 list_insert_head(&cbdata, cbnode);
1841 mutex_exit(&ips->ips_avl_lock);
1843 while ((cbnode = list_head(&cbdata)) != NULL) {
1844 cb(cbnode->ic_ifname, arg, cbnode->ic_dev);
1845 list_remove(&cbdata, cbnode);
1846 kmem_free(cbnode, sizeof (ipnetif_cbdata_t));
1848 list_destroy(&cbdata);
1849 netstack_rele(ns);
1852 static int
1853 ipnetif_compare_index(const void *index_ptr, const void *ipnetifp)
1855 int64_t index1 = *((int64_t *)index_ptr);
1856 int64_t index2 = (int64_t)((ipnetif_t *)ipnetifp)->if_index;
1858 return (SIGNOF(index2 - index1));
1861 static int
1862 ipnetif_compare_name(const void *name_ptr, const void *ipnetifp)
1864 int res;
1866 res = strcmp(((ipnetif_t *)ipnetifp)->if_name, name_ptr);
1867 return (SIGNOF(res));
1870 static int
1871 ipnetif_compare_name_zone(const void *key_ptr, const void *ipnetifp)
1873 const uintptr_t *ptr = key_ptr;
1874 const ipnetif_t *ifp;
1875 int res;
1877 ifp = ipnetifp;
1878 res = ifp->if_zoneid - ptr[0];
1879 if (res != 0)
1880 return (SIGNOF(res));
1881 res = strcmp(ifp->if_name, (char *)ptr[1]);
1882 return (SIGNOF(res));
1885 static void
1886 ipnetif_refhold(ipnetif_t *ipnetif)
1888 mutex_enter(&ipnetif->if_reflock);
1889 ipnetif->if_refcnt++;
1890 mutex_exit(&ipnetif->if_reflock);
1893 static void
1894 ipnetif_refrele(ipnetif_t *ipnetif)
1896 mutex_enter(&ipnetif->if_reflock);
1897 ASSERT(ipnetif->if_refcnt > 0);
1898 if (--ipnetif->if_refcnt == 0)
1899 ipnetif_free(ipnetif);
1900 else
1901 mutex_exit(&ipnetif->if_reflock);
1904 static void
1905 ipnet_walkers_inc(ipnet_stack_t *ips)
1907 mutex_enter(&ips->ips_walkers_lock);
1908 ips->ips_walkers_cnt++;
1909 mutex_exit(&ips->ips_walkers_lock);
1912 static void
1913 ipnet_walkers_dec(ipnet_stack_t *ips)
1915 mutex_enter(&ips->ips_walkers_lock);
1916 ASSERT(ips->ips_walkers_cnt != 0);
1917 if (--ips->ips_walkers_cnt == 0)
1918 cv_broadcast(&ips->ips_walkers_cv);
1919 mutex_exit(&ips->ips_walkers_lock);
1922 /*ARGSUSED*/
1923 static int
1924 ipobs_bounce_func(hook_event_token_t token, hook_data_t info, void *arg)
1926 hook_pkt_observe_t *hdr;
1927 pfv_t func = (pfv_t)arg;
1928 mblk_t *mp;
1930 hdr = (hook_pkt_observe_t *)info;
1932 * Code in ip_input() expects that it is the only one accessing the
1933 * packet.
1935 mp = copymsg(hdr->hpo_pkt);
1936 if (mp == NULL) {
1937 netstack_t *ns = hdr->hpo_ctx;
1938 ipnet_stack_t *ips = ns->netstack_ipnet;
1940 IPSK_BUMP(ips, ik_dispatchDupDrop);
1941 return (0);
1944 hdr = (hook_pkt_observe_t *)mp->b_rptr;
1945 hdr->hpo_pkt = mp;
1947 func(mp);
1949 return (0);
1952 hook_t *
1953 ipobs_register_hook(netstack_t *ns, pfv_t func)
1955 ip_stack_t *ipst = ns->netstack_ip;
1956 char name[32];
1957 hook_t *hook;
1959 HOOK_INIT(hook, ipobs_bounce_func, "", (void *)func);
1960 VERIFY(hook != NULL);
1963 * To register multiple hooks with the same callback function,
1964 * a unique name is needed.
1966 (void) snprintf(name, sizeof (name), "ipobserve_%p", (void *)hook);
1967 hook->h_name = strdup(name);
1969 (void) net_hook_register(ipst->ips_ip4_observe_pr, NH_OBSERVE, hook);
1970 (void) net_hook_register(ipst->ips_ip6_observe_pr, NH_OBSERVE, hook);
1972 return (hook);
1975 void
1976 ipobs_unregister_hook(netstack_t *ns, hook_t *hook)
1978 ip_stack_t *ipst = ns->netstack_ip;
1980 (void) net_hook_unregister(ipst->ips_ip4_observe_pr, NH_OBSERVE, hook);
1982 (void) net_hook_unregister(ipst->ips_ip6_observe_pr, NH_OBSERVE, hook);
1984 strfree(hook->h_name);
1986 hook_free(hook);
1989 /* ******************************************************************** */
1990 /* BPF Functions below */
1991 /* ******************************************************************** */
1994 * Convenience function to make mapping a zoneid to an ipnet_stack_t easy.
1996 ipnet_stack_t *
1997 ipnet_find_by_zoneid(zoneid_t zoneid)
1999 netstack_t *ns;
2001 VERIFY((ns = netstack_find_by_zoneid(zoneid)) != NULL);
2002 return (ns->netstack_ipnet);
2006 * Functions, such as the above ipnet_find_by_zoneid(), will return a
2007 * pointer to ipnet_stack_t by calling a netstack lookup function.
2008 * The netstack_find_*() functions return a pointer after doing a "hold"
2009 * on the data structure and thereby require a "release" when the caller
2010 * is finished with it. We need to mirror that API here and thus a caller
2011 * of ipnet_find_by_zoneid() is required to call ipnet_rele().
2013 void
2014 ipnet_rele(ipnet_stack_t *ips)
2016 netstack_rele(ips->ips_netstack);
2021 void
2022 ipnet_set_itap(bpf_itap_fn_t tapfunc)
2024 ipnet_itap = tapfunc;
2028 * The list of interfaces available via ipnet is private for each zone,
2029 * so the AVL tree of each zone must be searched for a given name, even
2030 * if all names are unique.
2033 ipnet_open_byname(const char *name, ipnetif_t **ptr, zoneid_t zoneid)
2035 ipnet_stack_t *ips;
2036 ipnetif_t *ipnetif;
2038 ASSERT(ptr != NULL);
2039 VERIFY((ips = ipnet_find_by_zoneid(zoneid)) != NULL);
2041 mutex_enter(&ips->ips_avl_lock);
2044 * Shared instance zone?
2046 if (netstackid_to_zoneid(zoneid_to_netstackid(zoneid)) != zoneid) {
2047 uintptr_t key[2] = { zoneid, (uintptr_t)name };
2049 ipnetif = avl_find(&ips->ips_avl_by_shared, (void *)key, NULL);
2050 } else {
2051 ipnetif = avl_find(&ips->ips_avl_by_name, (void *)name, NULL);
2053 if (ipnetif != NULL)
2054 ipnetif_refhold(ipnetif);
2055 mutex_exit(&ips->ips_avl_lock);
2057 *ptr = ipnetif;
2058 ipnet_rele(ips);
2060 if (ipnetif == NULL)
2061 return (ESRCH);
2062 return (0);
2065 void
2066 ipnet_close_byhandle(ipnetif_t *ifp)
2068 ASSERT(ifp != NULL);
2069 ipnetif_refrele(ifp);
2072 const char *
2073 ipnet_name(ipnetif_t *ifp)
2075 ASSERT(ifp != NULL);
2076 return (ifp->if_name);
2080 * To find the linkid for a given name, it is necessary to know which zone
2081 * the interface name belongs to and to search the avl tree for that zone
2082 * as there is no master list of all interfaces and which zone they belong
2083 * to. It is assumed that the caller of this function is somehow already
2084 * working with the ipnet interfaces and hence the ips_event_lock is held.
2085 * When BPF calls into this function, it is doing so because of an event
2086 * in ipnet, and thus ipnet holds the ips_event_lock. Thus the datalink id
2087 * value returned has meaning without the need for grabbing a hold on the
2088 * owning structure.
2091 ipnet_get_linkid_byname(const char *name, uint_t *idp, zoneid_t zoneid)
2093 ipnet_stack_t *ips;
2094 ipnetif_t *ifp;
2096 VERIFY((ips = ipnet_find_by_zoneid(zoneid)) != NULL);
2097 ASSERT(mutex_owned(&ips->ips_event_lock));
2099 mutex_enter(&ips->ips_avl_lock);
2100 ifp = avl_find(&ips->ips_avl_by_name, (void *)name, NULL);
2101 if (ifp != NULL)
2102 *idp = (uint_t)ifp->if_index;
2105 * Shared instance zone?
2107 if (netstackid_to_zoneid(zoneid_to_netstackid(zoneid)) != zoneid) {
2108 uintptr_t key[2] = { zoneid, (uintptr_t)name };
2110 ifp = avl_find(&ips->ips_avl_by_shared, (void *)key, NULL);
2111 if (ifp != NULL)
2112 *idp = (uint_t)ifp->if_index;
2115 mutex_exit(&ips->ips_avl_lock);
2116 ipnet_rele(ips);
2118 if (ifp == NULL)
2119 return (ESRCH);
2120 return (0);
2124 * Strictly speaking, there is no such thing as a "client" in ipnet, like
2125 * there is in mac. BPF only needs to have this because it is required as
2126 * part of interfacing correctly with mac. The reuse of the original
2127 * ipnetif_t as a client poses no danger, so long as it is done with its
2128 * own ref-count'd hold that is given up on close.
2131 ipnet_client_open(ipnetif_t *ptr, ipnetif_t **result)
2133 ASSERT(ptr != NULL);
2134 ASSERT(result != NULL);
2135 ipnetif_refhold(ptr);
2136 *result = ptr;
2138 return (0);
2141 void
2142 ipnet_client_close(ipnetif_t *ptr)
2144 ASSERT(ptr != NULL);
2145 ipnetif_refrele(ptr);
2149 * This is called from BPF when it needs to start receiving packets
2150 * from ipnet.
2152 * The use of the ipnet_t structure here is somewhat lightweight when
2153 * compared to how it is used elsewhere but it already has all of the
2154 * right fields in it, so reuse here doesn't seem out of order. Its
2155 * primary purpose here is to provide the means to store pointers for
2156 * use when ipnet_promisc_remove() needs to be called.
2158 * This should never be called for the IPNET_MINOR_LO device as it is
2159 * never created via ipnetif_create.
2161 /*ARGSUSED*/
2163 ipnet_promisc_add(void *handle, uint_t how, void *data, uintptr_t *mhandle,
2164 int flags)
2166 ip_stack_t *ipst;
2167 netstack_t *ns;
2168 ipnetif_t *ifp;
2169 ipnet_t *ipnet;
2170 char name[32];
2171 int error;
2173 ifp = (ipnetif_t *)handle;
2175 if (how != DL_PROMISC_PHYS && how != DL_PROMISC_MULTI)
2176 return (EINVAL);
2178 ns = netstack_find_by_zoneid(ifp->if_zoneid);
2180 if ((error = ipnet_join_allmulti(ifp, ns->netstack_ipnet)) != 0) {
2181 netstack_rele(ns);
2182 return (error);
2185 ipnet = kmem_zalloc(sizeof (*ipnet), KM_SLEEP);
2186 ipnet->ipnet_if = ifp;
2187 ipnet->ipnet_ns = ns;
2188 ipnet->ipnet_flags = flags;
2190 if ((ifp->if_flags & IPNETIF_LOOPBACK) != 0) {
2191 ipnet->ipnet_acceptfn = ipnet_loaccept;
2192 } else {
2193 ipnet->ipnet_acceptfn = ipnet_accept;
2197 * To register multiple hooks with the same callback function,
2198 * a unique name is needed.
2200 HOOK_INIT(ipnet->ipnet_hook, ipnet_bpf_bounce, "", ipnet);
2201 (void) snprintf(name, sizeof (name), "ipnet_promisc_%p",
2202 (void *)ipnet->ipnet_hook);
2203 ipnet->ipnet_hook->h_name = strdup(name);
2204 ipnet->ipnet_data = data;
2205 ipnet->ipnet_zoneid = ifp->if_zoneid;
2207 ipst = ns->netstack_ip;
2209 error = net_hook_register(ipst->ips_ip4_observe_pr, NH_OBSERVE,
2210 ipnet->ipnet_hook);
2211 if (error != 0)
2212 goto regfail;
2214 error = net_hook_register(ipst->ips_ip6_observe_pr, NH_OBSERVE,
2215 ipnet->ipnet_hook);
2216 if (error != 0) {
2217 (void) net_hook_unregister(ipst->ips_ip4_observe_pr,
2218 NH_OBSERVE, ipnet->ipnet_hook);
2219 goto regfail;
2222 *mhandle = (uintptr_t)ipnet;
2223 netstack_rele(ns);
2225 return (0);
2227 regfail:
2228 cmn_err(CE_WARN, "net_hook_register failed: %d", error);
2229 strfree(ipnet->ipnet_hook->h_name);
2230 hook_free(ipnet->ipnet_hook);
2231 netstack_rele(ns);
2232 return (error);
2235 void
2236 ipnet_promisc_remove(void *data)
2238 ip_stack_t *ipst;
2239 ipnet_t *ipnet;
2240 hook_t *hook;
2242 ipnet = data;
2243 ipst = ipnet->ipnet_ns->netstack_ip;
2244 hook = ipnet->ipnet_hook;
2246 VERIFY(net_hook_unregister(ipst->ips_ip4_observe_pr, NH_OBSERVE,
2247 hook) == 0);
2249 VERIFY(net_hook_unregister(ipst->ips_ip6_observe_pr, NH_OBSERVE,
2250 hook) == 0);
2252 strfree(hook->h_name);
2254 hook_free(hook);
2256 kmem_free(ipnet, sizeof (*ipnet));
2260 * arg here comes from the ipnet_t allocated in ipnet_promisc_add.
2261 * An important field from that structure is "ipnet_data" that
2262 * contains the "data" pointer passed into ipnet_promisc_add: it needs
2263 * to be passed back to bpf when we call into ipnet_itap.
2265 * ipnet_itap is set by ipnet_set_bpfattach, which in turn is called
2266 * from BPF.
2268 /*ARGSUSED*/
2269 static int
2270 ipnet_bpf_bounce(hook_event_token_t token, hook_data_t info, void *arg)
2272 hook_pkt_observe_t *hdr;
2273 ipnet_addrp_t src;
2274 ipnet_addrp_t dst;
2275 ipnet_stack_t *ips;
2276 ipnet_t *ipnet;
2277 mblk_t *netmp;
2278 mblk_t *mp;
2280 hdr = (hook_pkt_observe_t *)info;
2281 mp = hdr->hpo_pkt;
2282 ipnet = (ipnet_t *)arg;
2283 ips = ((netstack_t *)hdr->hpo_ctx)->netstack_ipnet;
2285 netmp = hdr->hpo_pkt->b_cont;
2286 src.iap_family = hdr->hpo_family;
2287 dst.iap_family = hdr->hpo_family;
2289 if (hdr->hpo_family == AF_INET) {
2290 src.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_src;
2291 dst.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_dst;
2292 } else {
2293 src.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_src;
2294 dst.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_dst;
2297 if (!(*ipnet->ipnet_acceptfn)(ipnet, hdr, &src, &dst)) {
2298 IPSK_BUMP(ips, ik_acceptFail);
2299 return (0);
2301 IPSK_BUMP(ips, ik_acceptOk);
2303 ipnet_itap(ipnet->ipnet_data, mp,
2304 hdr->hpo_htype == htons(IPOBS_HOOK_OUTBOUND),
2305 ntohl(hdr->hpo_pktlen) + MBLKL(mp));
2307 return (0);
2311 * clone'd ipnetif_t's are created when a shared IP instance zone comes
2312 * to life and configures an IP address. The model that BPF uses is that
2313 * each interface must have a unique pointer and each interface must be
2314 * representative of what it can capture. They are limited to one DLT
2315 * per interface and one zone per interface. Thus every interface that
2316 * can be seen in a zone must be announced via an attach to bpf. For
2317 * shared instance zones, this means the ipnet driver needs to detect
2318 * when an address is added to an interface in a zone for the first
2319 * time (and also when the last address is removed.)
2321 static ipnetif_t *
2322 ipnetif_clone_create(ipnetif_t *ifp, zoneid_t zoneid)
2324 uintptr_t key[2] = { zoneid, (uintptr_t)ifp->if_name };
2325 ipnet_stack_t *ips = ifp->if_stackp;
2326 avl_index_t where = 0;
2327 ipnetif_t *newif;
2329 mutex_enter(&ips->ips_avl_lock);
2330 newif = avl_find(&ips->ips_avl_by_shared, (void *)key, &where);
2331 if (newif != NULL) {
2332 ipnetif_refhold(newif);
2333 newif->if_sharecnt++;
2334 mutex_exit(&ips->ips_avl_lock);
2335 return (newif);
2338 newif = ipnet_alloc_if(ips);
2339 if (newif == NULL) {
2340 mutex_exit(&ips->ips_avl_lock);
2341 return (NULL);
2344 newif->if_refcnt = 1;
2345 newif->if_sharecnt = 1;
2346 newif->if_zoneid = zoneid;
2347 (void) strlcpy(newif->if_name, ifp->if_name, LIFNAMSIZ);
2348 newif->if_flags = ifp->if_flags & IPNETIF_LOOPBACK;
2349 newif->if_index = ifp->if_index;
2351 avl_insert(&ips->ips_avl_by_shared, newif, where);
2352 mutex_exit(&ips->ips_avl_lock);
2354 return (newif);
2357 static void
2358 ipnetif_clone_release(ipnetif_t *ipnetif)
2360 boolean_t dofree = B_FALSE;
2361 boolean_t doremove = B_FALSE;
2362 ipnet_stack_t *ips = ipnetif->if_stackp;
2364 mutex_enter(&ipnetif->if_reflock);
2365 ASSERT(ipnetif->if_refcnt > 0);
2366 if (--ipnetif->if_refcnt == 0)
2367 dofree = B_TRUE;
2368 ASSERT(ipnetif->if_sharecnt > 0);
2369 if (--ipnetif->if_sharecnt == 0)
2370 doremove = B_TRUE;
2371 mutex_exit(&ipnetif->if_reflock);
2372 if (doremove) {
2373 mutex_enter(&ips->ips_avl_lock);
2374 avl_remove(&ips->ips_avl_by_shared, ipnetif);
2375 mutex_exit(&ips->ips_avl_lock);
2377 if (dofree) {
2378 ASSERT(ipnetif->if_sharecnt == 0);
2379 ipnetif_free(ipnetif);