/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/param.h>
#include <sys/callb.h>
#include <sys/stream.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/ksynch.h>
#include <sys/stat.h>
#include <sys/modctl.h>
#include <sys/modhash.h>
#include <sys/debug.h>
#include <sys/ethernet.h>
#include <sys/dlpi.h>
#include <net/if.h>
#include <sys/mac_provider.h>
#include <sys/mac_client.h>
#include <sys/mac_client_priv.h>
#include <sys/mac_ether.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/strsun.h>
#include <sys/note.h>
#include <sys/atomic.h>
#include <sys/vnet.h>
#include <sys/vlan.h>
#include <sys/vnet_mailbox.h>
#include <sys/vnet_common.h>
#include <sys/dds.h>
#include <sys/strsubr.h>
#include <sys/taskq.h>
/*
 * Function prototypes.
 */

/* DDI entrypoints */
static int vnetdevinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int vnetattach(dev_info_t *, ddi_attach_cmd_t);
static int vnetdetach(dev_info_t *, ddi_detach_cmd_t);

/* MAC entrypoints */
static int vnet_m_stat(void *, uint_t, uint64_t *);
static int vnet_m_start(void *);
static void vnet_m_stop(void *);
static int vnet_m_promisc(void *, boolean_t);
static int vnet_m_multicst(void *, boolean_t, const uint8_t *);
static int vnet_m_unicst(void *, const uint8_t *);
mblk_t *vnet_m_tx(void *, mblk_t *);
static void vnet_m_ioctl(void *arg, queue_t *q, mblk_t *mp);
#ifdef	VNET_IOC_DEBUG
static void vnet_force_link_state(vnet_t *vnetp, queue_t *q, mblk_t *mp);
#endif
static boolean_t vnet_m_capab(void *arg, mac_capab_t cap, void *cap_data);
static void vnet_get_ring(void *arg, mac_ring_type_t rtype, const int g_index,
    const int r_index, mac_ring_info_t *infop, mac_ring_handle_t r_handle);
static void vnet_get_group(void *arg, mac_ring_type_t type, const int index,
    mac_group_info_t *infop, mac_group_handle_t handle);
static int vnet_rx_ring_start(mac_ring_driver_t rdriver, uint64_t mr_gen_num);
static void vnet_rx_ring_stop(mac_ring_driver_t rdriver);
static int vnet_rx_ring_stat(mac_ring_driver_t rdriver, uint_t stat,
    uint64_t *val);
static int vnet_tx_ring_start(mac_ring_driver_t rdriver, uint64_t mr_gen_num);
static void vnet_tx_ring_stop(mac_ring_driver_t rdriver);
static int vnet_tx_ring_stat(mac_ring_driver_t rdriver, uint_t stat,
    uint64_t *val);
static int vnet_ring_enable_intr(void *arg);
static int vnet_ring_disable_intr(void *arg);
static mblk_t *vnet_rx_poll(void *arg, int bytes_to_pickup);
static int vnet_addmac(void *arg, const uint8_t *mac_addr);
static int vnet_remmac(void *arg, const uint8_t *mac_addr);

/* vnet internal functions */
static int vnet_unattach(vnet_t *vnetp);
static void vnet_ring_grp_init(vnet_t *vnetp);
static void vnet_ring_grp_uninit(vnet_t *vnetp);
static int vnet_mac_register(vnet_t *);
static int vnet_read_mac_address(vnet_t *vnetp);
static int vnet_bind_vgenring(vnet_res_t *vresp);
static void vnet_unbind_vgenring(vnet_res_t *vresp);
static int vnet_bind_hwrings(vnet_t *vnetp);
static void vnet_unbind_hwrings(vnet_t *vnetp);
static int vnet_bind_rings(vnet_res_t *vresp);
static void vnet_unbind_rings(vnet_res_t *vresp);
static int vnet_hio_stat(void *, uint_t, uint64_t *);
static int vnet_hio_start(void *);
static void vnet_hio_stop(void *);
mblk_t *vnet_hio_tx(void *, mblk_t *);

/* Forwarding database (FDB) routines */
static void vnet_fdb_create(vnet_t *vnetp);
static void vnet_fdb_destroy(vnet_t *vnetp);
static vnet_res_t *vnet_fdbe_find(vnet_t *vnetp, struct ether_addr *addrp);
static void vnet_fdbe_find_cb(mod_hash_key_t key, mod_hash_val_t val);
void vnet_fdbe_add(vnet_t *vnetp, vnet_res_t *vresp);
static void vnet_fdbe_del(vnet_t *vnetp, vnet_res_t *vresp);
static void vnet_rx_frames_untag(uint16_t pvid, mblk_t **mp);
static void vnet_rx(vio_net_handle_t vrh, mblk_t *mp);
static void vnet_tx_update(vio_net_handle_t vrh);
static void vnet_res_start_task(void *arg);
static void vnet_start_resources(vnet_t *vnetp);
static void vnet_stop_resources(vnet_t *vnetp);
static void vnet_dispatch_res_task(vnet_t *vnetp);
static void vnet_handle_res_err(vio_net_handle_t vrh, vio_net_err_val_t err);
static void vnet_add_resource(vnet_t *vnetp, vnet_res_t *vresp);
static vnet_res_t *vnet_rem_resource(vnet_t *vnetp, vnet_res_t *vresp);
static void vnet_tx_notify_thread(void *);
/* Exported to vnet_gen */
int vnet_mtu_update(vnet_t *vnetp, uint32_t mtu);
void vnet_link_update(vnet_t *vnetp, link_state_t link_state);
void vnet_dds_cleanup_hio(vnet_t *vnetp);

static kstat_t *vnet_hio_setup_kstats(char *ks_mod, char *ks_name,
    vnet_res_t *vresp);
static int vnet_hio_update_kstats(kstat_t *ksp, int rw);
static void vnet_hio_get_stats(vnet_res_t *vresp, vnet_hio_stats_t *statsp);
static void vnet_hio_destroy_kstats(kstat_t *ksp);
/* Exported to vnet_dds */
int vnet_send_dds_msg(vnet_t *vnetp, void *dmsg);
int vnet_hio_mac_init(vnet_t *vnetp, char *ifname);
void vnet_hio_mac_cleanup(vnet_t *vnetp);
/* Externs that are imported from vnet_gen */
extern int vgen_init(void *vnetp, uint64_t regprop, dev_info_t *vnetdip,
    const uint8_t *macaddr, void **vgenhdl);
extern int vgen_init_mdeg(void *arg);
extern void vgen_uninit(void *arg);
extern int vgen_dds_tx(void *arg, void *dmsg);
extern int vgen_enable_intr(void *arg);
extern int vgen_disable_intr(void *arg);
extern mblk_t *vgen_rx_poll(void *arg, int bytes_to_pickup);

/* Externs that are imported from vnet_dds */
extern void vdds_mod_init(void);
extern void vdds_mod_fini(void);
extern int vdds_init(vnet_t *vnetp);
extern void vdds_cleanup(vnet_t *vnetp);
extern void vdds_process_dds_msg(vnet_t *vnetp, vio_dds_msg_t *dmsg);
extern void vdds_cleanup_hybrid_res(void *arg);
extern void vdds_cleanup_hio(vnet_t *vnetp);

extern pri_t minclsyspri;
#define	DRV_NAME	"vnet"

#define	VNET_FDBE_REFHOLD(p)					\
{								\
	atomic_inc_32(&(p)->refcnt);				\
	ASSERT((p)->refcnt != 0);				\
}

#define	VNET_FDBE_REFRELE(p)					\
{								\
	ASSERT((p)->refcnt != 0);				\
	atomic_dec_32(&(p)->refcnt);				\
}
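/*
 * Illustrative usage of the refcnt macros above (a sketch mirroring
 * vnet_m_multicst() later in this file): look the entry up under the
 * lock, take a hold before dropping the lock, release the hold when done.
 *
 *	READ_ENTER(&vnetp->vsw_fp_rw);
 *	if (vnetp->vsw_fp == NULL) {
 *		RW_EXIT(&vnetp->vsw_fp_rw);
 *		return (EAGAIN);
 *	}
 *	VNET_FDBE_REFHOLD(vnetp->vsw_fp);
 *	RW_EXIT(&vnetp->vsw_fp_rw);
 *	(... use the entry ...)
 *	VNET_FDBE_REFRELE(vnetp->vsw_fp);
 */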
#ifdef	VNET_IOC_DEBUG
#define	VNET_M_CALLBACK_FLAGS	(MC_IOCTL | MC_GETCAPAB)
#else
#define	VNET_M_CALLBACK_FLAGS	(MC_GETCAPAB)
#endif

static mac_callbacks_t vnet_m_callbacks = {
	VNET_M_CALLBACK_FLAGS,
	vnet_m_stat,
	vnet_m_start,
	vnet_m_stop,
	vnet_m_promisc,
	vnet_m_multicst,
	NULL,	/* m_unicst entry must be NULL while rx rings are exposed */
	NULL,	/* m_tx entry must be NULL while tx rings are exposed */
	NULL,
	vnet_m_ioctl,
	vnet_m_capab,
	NULL
};
static mac_callbacks_t vnet_hio_res_callbacks = {
	0,
	vnet_hio_stat,
	vnet_hio_start,
	vnet_hio_stop,
	NULL,
	NULL,
	NULL,
	vnet_hio_tx,
	NULL,
	NULL,
	NULL
};
/*
 * Linked list of "vnet_t" structures - one per instance.
 */
static vnet_t	*vnet_headp = NULL;
static krwlock_t vnet_rw;

/* Tunables */
uint32_t vnet_num_descriptors = VNET_NUM_DESCRIPTORS;

/*
 * Configure tx serialization in mac layer for the vnet device. This tunable
 * should be enabled to improve performance only if HybridIO is configured for
 * the vnet device.
 */
boolean_t vnet_mac_tx_serialize = B_FALSE;

/* Configure enqueing at Rx soft rings in mac layer for the vnet device */
boolean_t vnet_mac_rx_queuing = B_TRUE;

/*
 * Set this to non-zero to enable additional internal receive buffer pools
 * based on the MTU of the device for better performance at the cost of more
 * memory consumption. This is turned off by default, to use allocb(9F) for
 * receive buffer allocations of sizes > 2K.
 */
boolean_t vnet_jumbo_rxpools = B_FALSE;

/* # of chains in fdb hash table */
uint32_t vnet_fdb_nchains = VNET_NFDB_HASH;

/* Internal tunables */
uint32_t vnet_ethermtu = 1500;	/* mtu of the device */

/*
 * Default vlan id. This is only used internally when the "default-vlan-id"
 * property is not present in the MD device node. Therefore, this should not
 * be used as a tunable; if this value is changed, the corresponding variable
 * should be updated to the same value in vsw and also in other vnets
 * connected to the same vsw.
 */
uint16_t vnet_default_vlan_id = 1;

/* delay in usec to wait for all references on a fdb entry to be dropped */
uint32_t vnet_fdbe_refcnt_delay = 10;

static struct ether_addr etherbroadcastaddr = {
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};

/* mac_open() retry delay in usec */
uint32_t vnet_mac_open_delay = 100;	/* 0.1 ms */

/* max # of mac_open() retries */
uint32_t vnet_mac_open_retries = 100;
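/*
 * With the defaults above, a mac_open() retry loop bounded by these two
 * tunables waits at most vnet_mac_open_retries * vnet_mac_open_delay =
 * 100 * 100 usec = 10 ms in total before giving up.
 */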
/*
 * Property names
 */
static char macaddr_propname[] = "local-mac-address";

/*
 * This is the string displayed by modinfo(1m).
 */
static char vnet_ident[] = "vnet driver";

extern struct mod_ops mod_driverops;

static struct cb_ops cb_vnetops = {
	nulldev,		/* cb_open */
	nulldev,		/* cb_close */
	nodev,			/* cb_strategy */
	nodev,			/* cb_print */
	nodev,			/* cb_dump */
	nodev,			/* cb_read */
	nodev,			/* cb_write */
	nodev,			/* cb_ioctl */
	nodev,			/* cb_devmap */
	nodev,			/* cb_mmap */
	nodev,			/* cb_segmap */
	nochpoll,		/* cb_chpoll */
	ddi_prop_op,		/* cb_prop_op */
	NULL,			/* cb_stream */
	(int)(D_MP)		/* cb_flag */
};

static struct dev_ops vnetops = {
	DEVO_REV,		/* devo_rev */
	0,			/* devo_refcnt */
	NULL,			/* devo_getinfo */
	nulldev,		/* devo_identify */
	nulldev,		/* devo_probe */
	vnetattach,		/* devo_attach */
	vnetdetach,		/* devo_detach */
	nodev,			/* devo_reset */
	&cb_vnetops,		/* devo_cb_ops */
	NULL,			/* devo_bus_ops */
	NULL,			/* devo_power */
	ddi_quiesce_not_supported,	/* devo_quiesce */
};

static struct modldrv modldrv = {
	&mod_driverops,		/* Type of module. This one is a driver */
	vnet_ident,		/* ID string */
	&vnetops		/* driver specific ops */
};

static struct modlinkage modlinkage = {
	MODREV_1, (void *)&modldrv, NULL
};
#ifdef DEBUG

#define	DEBUG_PRINTF	debug_printf

/*
 * Print debug messages - set to 0xf to enable all msgs
 */
int vnet_dbglevel = 0x8;

static void
debug_printf(const char *fname, void *arg, const char *fmt, ...)
{
	char	buf[512];
	va_list	ap;
	vnet_t	*vnetp = (vnet_t *)arg;
	char	*bufp = buf;

	if (vnetp == NULL) {
		(void) sprintf(bufp, "%s: ", fname);
		bufp += strlen(bufp);
	} else {
		(void) sprintf(bufp, "vnet%d:%s: ", vnetp->instance, fname);
		bufp += strlen(bufp);
	}
	va_start(ap, fmt);
	(void) vsprintf(bufp, fmt, ap);
	va_end(ap);
	cmn_err(CE_CONT, "%s\n", buf);
}

#endif
/* _init(9E): initialize the loadable module */
int
_init(void)
{
	int status;

	DBG1(NULL, "enter\n");

	mac_init_ops(&vnetops, "vnet");
	status = mod_install(&modlinkage);
	if (status != 0) {
		mac_fini_ops(&vnetops);
	}
	vdds_mod_init();
	DBG1(NULL, "exit(%d)\n", status);
	return (status);
}
/* _fini(9E): prepare the module for unloading. */
int
_fini(void)
{
	int status;

	DBG1(NULL, "enter\n");

	status = mod_remove(&modlinkage);
	if (status != 0)
		return (status);
	mac_fini_ops(&vnetops);
	vdds_mod_fini();

	DBG1(NULL, "exit(%d)\n", status);
	return (status);
}
/* _info(9E): return information about the loadable module */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}
/*
 * attach(9E): attach a device to the system.
 * called once for each instance of the device on the system.
 */
static int
vnetattach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	vnet_t			*vnetp;
	int			status;
	int			instance;
	uint64_t		reg;
	char			qname[TASKQ_NAMELEN];
	vnet_attach_progress_t	attach_progress;

	attach_progress = AST_init;

	switch (cmd) {
	case DDI_ATTACH:
		break;
	case DDI_RESUME:
	case DDI_PM_RESUME:
	default:
		/* no vnet_t has been allocated yet; fail directly */
		return (DDI_FAILURE);
	}

	instance = ddi_get_instance(dip);
	DBG1(NULL, "instance(%d) enter\n", instance);

	/* allocate vnet_t and mac_t structures */
	vnetp = kmem_zalloc(sizeof (vnet_t), KM_SLEEP);
	vnetp->dip = dip;
	vnetp->instance = instance;
	rw_init(&vnetp->vrwlock, NULL, RW_DRIVER, NULL);
	rw_init(&vnetp->vsw_fp_rw, NULL, RW_DRIVER, NULL);
	attach_progress |= AST_vnet_alloc;

	vnet_ring_grp_init(vnetp);
	attach_progress |= AST_ring_init;

	status = vdds_init(vnetp);
	if (status != 0) {
		goto vnet_attach_fail;
	}
	attach_progress |= AST_vdds_init;

	/* setup links to vnet_t from both devinfo and mac_t */
	ddi_set_driver_private(dip, (caddr_t)vnetp);

	/* read the mac address */
	status = vnet_read_mac_address(vnetp);
	if (status != DDI_SUCCESS) {
		goto vnet_attach_fail;
	}
	attach_progress |= AST_read_macaddr;

	reg = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "reg", -1);
	if (reg == -1) {
		goto vnet_attach_fail;
	}
	vnetp->reg = reg;

	vnet_fdb_create(vnetp);
	attach_progress |= AST_fdbh_alloc;

	(void) snprintf(qname, TASKQ_NAMELEN, "vres_taskq%d", instance);
	if ((vnetp->taskqp = ddi_taskq_create(dip, qname, 1,
	    TASKQ_DEFAULTPRI, 0)) == NULL) {
		cmn_err(CE_WARN, "!vnet%d: Unable to create task queue",
		    instance);
		goto vnet_attach_fail;
	}
	attach_progress |= AST_taskq_create;

	/* add to the list of vnet devices */
	WRITE_ENTER(&vnet_rw);
	vnetp->nextp = vnet_headp;
	vnet_headp = vnetp;
	RW_EXIT(&vnet_rw);

	attach_progress |= AST_vnet_list;

	/*
	 * Initialize the generic vnet plugin which provides communication via
	 * sun4v LDC (logical domain channel) based resources. This involves 2
	 * steps; first, vgen_init() is invoked to read the various properties
	 * of the vnet device from its MD node (including its mtu which is
	 * needed to mac_register()) and obtain a handle to the vgen layer.
	 * After mac_register() is done and we have a mac handle, we then
	 * invoke vgen_init_mdeg() which registers with the MD event
	 * generator (mdeg) framework to allow LDC resource notifications.
	 * Note: this sequence also allows us to report the correct default #
	 * of pseudo rings (2TX and 3RX) in vnet_m_capab() which gets invoked
	 * in the context of mac_register(); and avoids conflicting with
	 * dynamic pseudo rx rings which get added/removed as a result of mdeg
	 * events in vgen.
	 */
	status = vgen_init(vnetp, reg, vnetp->dip,
	    (uint8_t *)vnetp->curr_macaddr, &vnetp->vgenhdl);
	if (status != DDI_SUCCESS) {
		DERR(vnetp, "vgen_init() failed\n");
		goto vnet_attach_fail;
	}
	attach_progress |= AST_vgen_init;

	status = vnet_mac_register(vnetp);
	if (status != DDI_SUCCESS) {
		goto vnet_attach_fail;
	}
	vnetp->link_state = LINK_STATE_UNKNOWN;
	attach_progress |= AST_macreg;

	status = vgen_init_mdeg(vnetp->vgenhdl);
	if (status != DDI_SUCCESS) {
		goto vnet_attach_fail;
	}
	attach_progress |= AST_init_mdeg;

	vnetp->attach_progress = attach_progress;

	DBG1(NULL, "instance(%d) exit\n", instance);
	return (DDI_SUCCESS);

vnet_attach_fail:
	vnetp->attach_progress = attach_progress;
	status = vnet_unattach(vnetp);
	ASSERT(status == 0);
	return (DDI_FAILURE);
}
/*
 * detach(9E): detach a device from the system.
 */
static int
vnetdetach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	vnet_t	*vnetp;
	int	instance;

	instance = ddi_get_instance(dip);
	DBG1(NULL, "instance(%d) enter\n", instance);

	vnetp = ddi_get_driver_private(dip);
	if (vnetp == NULL) {
		goto vnet_detach_fail;
	}

	switch (cmd) {
	case DDI_DETACH:
		break;
	case DDI_SUSPEND:
	case DDI_PM_SUSPEND:
	default:
		goto vnet_detach_fail;
	}

	if (vnet_unattach(vnetp) != 0) {
		goto vnet_detach_fail;
	}

	return (DDI_SUCCESS);

vnet_detach_fail:
	return (DDI_FAILURE);
}
/*
 * Common routine to handle vnetattach() failure and vnetdetach(). Note that
 * the only reason this function could fail is if mac_unregister() fails.
 * Otherwise, this function must ensure that all resources are freed and
 * return success.
 */
static int
vnet_unattach(vnet_t *vnetp)
{
	vnet_attach_progress_t	attach_progress;

	attach_progress = vnetp->attach_progress;

	/*
	 * Disable the mac device in the gldv3 subsystem. This can fail, in
	 * particular if there are still any open references to this mac
	 * device; in which case we just return failure without continuing to
	 * detach further.
	 * If it succeeds, we then invoke vgen_uninit() which should unregister
	 * any pseudo rings registered with the mac layer. Note we keep the
	 * AST_macreg flag on, so we can unregister with the mac layer at
	 * the end of this routine.
	 */
	if (attach_progress & AST_macreg) {
		if (mac_disable(vnetp->mh) != 0) {
			return (1);
		}
	}

	/*
	 * Now that we have disabled the device, we must finish all other steps
	 * and successfully return from this function; otherwise we will end up
	 * leaving the device in a broken/unusable state.
	 *
	 * First, release any hybrid resources assigned to this vnet device.
	 */
	if (attach_progress & AST_vdds_init) {
		vdds_cleanup(vnetp);
		attach_progress &= ~AST_vdds_init;
	}

	/*
	 * Uninit vgen. This stops further mdeg callbacks to this vnet
	 * device and/or its ports; and detaches any existing ports.
	 */
	if (attach_progress & (AST_vgen_init|AST_init_mdeg)) {
		vgen_uninit(vnetp->vgenhdl);
		attach_progress &= ~AST_vgen_init;
		attach_progress &= ~AST_init_mdeg;
	}

	/* Destroy the taskq. */
	if (attach_progress & AST_taskq_create) {
		ddi_taskq_destroy(vnetp->taskqp);
		attach_progress &= ~AST_taskq_create;
	}

	/* Destroy fdb. */
	if (attach_progress & AST_fdbh_alloc) {
		vnet_fdb_destroy(vnetp);
		attach_progress &= ~AST_fdbh_alloc;
	}

	/* Remove from the device list */
	if (attach_progress & AST_vnet_list) {
		vnet_t	**vnetpp;
		/* unlink from instance(vnet_t) list */
		WRITE_ENTER(&vnet_rw);
		for (vnetpp = &vnet_headp; *vnetpp;
		    vnetpp = &(*vnetpp)->nextp) {
			if (*vnetpp == vnetp) {
				*vnetpp = vnetp->nextp;
				break;
			}
		}
		RW_EXIT(&vnet_rw);
		attach_progress &= ~AST_vnet_list;
	}

	if (attach_progress & AST_ring_init) {
		vnet_ring_grp_uninit(vnetp);
		attach_progress &= ~AST_ring_init;
	}

	if (attach_progress & AST_macreg) {
		VERIFY(mac_unregister(vnetp->mh) == 0);
		vnetp->mh = NULL;
		attach_progress &= ~AST_macreg;
	}

	if (attach_progress & AST_vnet_alloc) {
		rw_destroy(&vnetp->vrwlock);
		rw_destroy(&vnetp->vsw_fp_rw);
		attach_progress &= ~AST_vnet_alloc;
		KMEM_FREE(vnetp);
	}

	return (0);
}
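/*
 * Teardown order implemented above (summary of the code, for reference):
 * mac_disable() -> vdds_cleanup() -> vgen_uninit() -> ddi_taskq_destroy()
 * -> vnet_fdb_destroy() -> unlink from the vnet list ->
 * vnet_ring_grp_uninit() -> mac_unregister() -> rw_destroy()/kmem_free().
 * Each step is gated on the matching attach_progress bit, so this routine
 * can unwind a partially completed vnetattach() as well as a full detach.
 */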
/* enable the device for transmit/receive */
static int
vnet_m_start(void *arg)
{
	vnet_t	*vnetp = arg;

	DBG1(vnetp, "enter\n");

	WRITE_ENTER(&vnetp->vrwlock);
	vnetp->flags |= VNET_STARTED;
	vnet_start_resources(vnetp);
	RW_EXIT(&vnetp->vrwlock);

	DBG1(vnetp, "exit\n");
	return (VNET_SUCCESS);
}
/* stop transmit/receive for the device */
static void
vnet_m_stop(void *arg)
{
	vnet_t	*vnetp = arg;

	DBG1(vnetp, "enter\n");

	WRITE_ENTER(&vnetp->vrwlock);
	if (vnetp->flags & VNET_STARTED) {
		/*
		 * Set the flags appropriately; this should prevent starting of
		 * any new resources that are added (see vnet_res_start_task()),
		 * while we release the vrwlock in vnet_stop_resources() before
		 * stopping each resource.
		 */
		vnetp->flags &= ~VNET_STARTED;
		vnetp->flags |= VNET_STOPPING;
		vnet_stop_resources(vnetp);
		vnetp->flags &= ~VNET_STOPPING;
	}
	RW_EXIT(&vnetp->vrwlock);

	DBG1(vnetp, "exit\n");
}
/* set the unicast mac address of the device */
static int
vnet_m_unicst(void *arg, const uint8_t *macaddr)
{
	_NOTE(ARGUNUSED(macaddr))

	vnet_t	*vnetp = arg;

	DBG1(vnetp, "enter\n");
	/*
	 * NOTE: setting mac address dynamically is not supported.
	 */
	DBG1(vnetp, "exit\n");

	return (VNET_FAILURE);
}
/* enable/disable a multicast address */
static int
vnet_m_multicst(void *arg, boolean_t add, const uint8_t *mca)
{
	_NOTE(ARGUNUSED(add, mca))

	vnet_t		*vnetp = arg;
	vnet_res_t	*vresp;
	mac_register_t	*macp;
	mac_callbacks_t	*cbp;
	int		rv = VNET_SUCCESS;

	DBG1(vnetp, "enter\n");

	READ_ENTER(&vnetp->vsw_fp_rw);
	if (vnetp->vsw_fp == NULL) {
		RW_EXIT(&vnetp->vsw_fp_rw);
		return (EAGAIN);
	}
	VNET_FDBE_REFHOLD(vnetp->vsw_fp);
	RW_EXIT(&vnetp->vsw_fp_rw);

	vresp = vnetp->vsw_fp;
	macp = &vresp->macreg;
	cbp = macp->m_callbacks;
	rv = cbp->mc_multicst(macp->m_driver, add, mca);

	VNET_FDBE_REFRELE(vnetp->vsw_fp);

	DBG1(vnetp, "exit(%d)\n", rv);
	return (rv);
}
/* set or clear promiscuous mode on the device */
static int
vnet_m_promisc(void *arg, boolean_t on)
{
	_NOTE(ARGUNUSED(on))

	vnet_t	*vnetp = arg;
	DBG1(vnetp, "enter\n");
	/*
	 * NOTE: setting promiscuous mode is not supported, just return
	 * success.
	 */
	DBG1(vnetp, "exit\n");
	return (VNET_SUCCESS);
}
/*
 * Transmit a chain of packets. This function provides switching functionality
 * based on the destination mac address to reach other guests (within ldoms)
 * or external hosts.
 */
mblk_t *
vnet_tx_ring_send(void *arg, mblk_t *mp)
{
	vnet_pseudo_tx_ring_t	*tx_ringp;
	vnet_tx_ring_stats_t	*statsp;
	vnet_t			*vnetp;
	vnet_res_t		*vresp;
	mblk_t			*next;
	mblk_t			*resid_mp;
	mac_register_t		*macp;
	struct ether_header	*ehp;
	boolean_t		is_unicast;
	boolean_t		is_pvid;	/* non-default pvid ? */
	boolean_t		hres;		/* Hybrid resource ? */
	void			*tx_arg;
	size_t			size;

	tx_ringp = (vnet_pseudo_tx_ring_t *)arg;
	statsp = &tx_ringp->tx_ring_stats;
	vnetp = (vnet_t *)tx_ringp->vnetp;
	DBG1(vnetp, "enter\n");
	ASSERT(mp != NULL);

	is_pvid = (vnetp->pvid != vnetp->default_vlan_id) ? B_TRUE : B_FALSE;

	while (mp != NULL) {

		next = mp->b_next;
		mp->b_next = NULL;

		/* update stats */
		size = msgsize(mp);

		/*
		 * Find fdb entry for the destination
		 * and hold a reference to it.
		 */
		ehp = (struct ether_header *)mp->b_rptr;
		vresp = vnet_fdbe_find(vnetp, &ehp->ether_dhost);
		if (vresp != NULL) {
			/*
			 * Destination found in FDB.
			 * The destination is a vnet device within ldoms
			 * and directly reachable, invoke the tx function
			 * in the fdb entry.
			 */
			macp = &vresp->macreg;
			resid_mp = macp->m_callbacks->mc_tx(macp->m_driver, mp);

			/* tx done; now release ref on fdb entry */
			VNET_FDBE_REFRELE(vresp);

			if (resid_mp != NULL) {
				/* m_tx failed */
				mp->b_next = next;
				break;
			}
		} else {
			is_unicast = !(IS_BROADCAST(ehp) ||
			    (IS_MULTICAST(ehp)));
			/*
			 * Destination is not in FDB.
			 * If the destination is broadcast or multicast,
			 * then forward the packet to vswitch.
			 * If a Hybrid resource is available, then send the
			 * unicast packet via the hybrid resource, otherwise
			 * forward it to vswitch.
			 */
			READ_ENTER(&vnetp->vsw_fp_rw);

			if ((is_unicast) && (vnetp->hio_fp != NULL)) {
				vresp = vnetp->hio_fp;
				hres = B_TRUE;
			} else {
				vresp = vnetp->vsw_fp;
				hres = B_FALSE;
			}
			if (vresp == NULL) {
				/*
				 * no fdb entry to vsw? drop the packet.
				 */
				RW_EXIT(&vnetp->vsw_fp_rw);
				freemsg(mp);
				mp = next;
				continue;
			}

			/* ref hold the fdb entry to vsw */
			VNET_FDBE_REFHOLD(vresp);

			RW_EXIT(&vnetp->vsw_fp_rw);

			/*
			 * In the case of a hybrid resource we need to insert
			 * the tag for the pvid case here; unlike packets that
			 * are destined to a vnet/vsw in which case the vgen
			 * layer does the tagging before sending it over ldc.
			 */
			if (hres == B_TRUE) {
				/*
				 * Determine if the frame being transmitted
				 * over the hybrid resource is untagged. If so,
				 * insert the tag before transmitting.
				 */
				if (is_pvid == B_TRUE &&
				    ehp->ether_type != htons(ETHERTYPE_VLAN)) {

					mp = vnet_vlan_insert_tag(mp,
					    vnetp->pvid);
					if (mp == NULL) {
						VNET_FDBE_REFRELE(vresp);
						mp = next;
						continue;
					}

				}

				macp = &vresp->macreg;
				tx_arg = tx_ringp;
			} else {
				macp = &vresp->macreg;
				tx_arg = macp->m_driver;
			}
			resid_mp = macp->m_callbacks->mc_tx(tx_arg, mp);

			/* tx done; now release ref on fdb entry */
			VNET_FDBE_REFRELE(vresp);

			if (resid_mp != NULL) {
				/* m_tx failed */
				mp->b_next = next;
				break;
			}
		}

		statsp->obytes += size;
		statsp->opackets++;
		mp = next;
	}

	DBG1(vnetp, "exit\n");
	return (mp);
}
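/*
 * Switching decision implemented above (summary of the code, for reference):
 *	dest found in FDB              -> tx directly to that vnet/vsw entry
 *	not in FDB, unicast + Hybrid   -> tx over the Hybrid (hw) resource
 *	not in FDB, otherwise          -> tx to the vswitch (vsw_fp)
 *	no vsw/hybrid entry cached     -> drop the frame
 */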
/* get statistics from the device */
static int
vnet_m_stat(void *arg, uint_t stat, uint64_t *val)
{
	vnet_t		*vnetp = arg;
	vnet_res_t	*vresp;
	mac_register_t	*macp;
	mac_callbacks_t	*cbp;
	uint64_t	val_total = 0;

	DBG1(vnetp, "enter\n");

	/*
	 * get the specified statistic from each transport and return the
	 * aggregate val. This obviously only works for counters.
	 */
	if ((IS_MAC_STAT(stat) && !MAC_STAT_ISACOUNTER(stat)) ||
	    (IS_MACTYPE_STAT(stat) && !ETHER_STAT_ISACOUNTER(stat))) {
		return (ENOTSUP);
	}

	READ_ENTER(&vnetp->vrwlock);
	for (vresp = vnetp->vres_list; vresp != NULL; vresp = vresp->nextp) {
		macp = &vresp->macreg;
		cbp = macp->m_callbacks;
		if (cbp->mc_getstat(macp->m_driver, stat, val) == 0)
			val_total += *val;
	}
	RW_EXIT(&vnetp->vrwlock);

	*val = val_total;

	DBG1(vnetp, "exit\n");
	return (0);
}
static void
vnet_ring_grp_init(vnet_t *vnetp)
{
	vnet_pseudo_rx_group_t	*rx_grp;
	vnet_pseudo_rx_ring_t	*rx_ringp;
	vnet_pseudo_tx_group_t	*tx_grp;
	vnet_pseudo_tx_ring_t	*tx_ringp;
	int			i;

	tx_grp = &vnetp->tx_grp[0];
	tx_ringp = kmem_zalloc(sizeof (vnet_pseudo_tx_ring_t) *
	    VNET_NUM_PSEUDO_TXRINGS, KM_SLEEP);
	for (i = 0; i < VNET_NUM_PSEUDO_TXRINGS; i++) {
		tx_ringp[i].state |= VNET_TXRING_SHARED;
	}
	tx_grp->rings = tx_ringp;
	tx_grp->ring_cnt = VNET_NUM_PSEUDO_TXRINGS;
	mutex_init(&tx_grp->flowctl_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&tx_grp->flowctl_cv, NULL, CV_DRIVER, NULL);
	tx_grp->flowctl_thread = thread_create(NULL, 0,
	    vnet_tx_notify_thread, tx_grp, 0, &p0, TS_RUN, minclsyspri);

	rx_grp = &vnetp->rx_grp[0];
	rx_grp->max_ring_cnt = MAX_RINGS_PER_GROUP;
	rw_init(&rx_grp->lock, NULL, RW_DRIVER, NULL);
	rx_ringp = kmem_zalloc(sizeof (vnet_pseudo_rx_ring_t) *
	    rx_grp->max_ring_cnt, KM_SLEEP);

	/*
	 * Setup the first 3 Pseudo RX Rings that are reserved;
	 * 1 for LDC resource to vswitch + 2 for RX rings of Hybrid resource.
	 */
	rx_ringp[0].state |= VNET_RXRING_INUSE|VNET_RXRING_LDC_SERVICE;
	rx_ringp[0].index = 0;
	rx_ringp[1].state |= VNET_RXRING_INUSE|VNET_RXRING_HYBRID;
	rx_ringp[1].index = 1;
	rx_ringp[2].state |= VNET_RXRING_INUSE|VNET_RXRING_HYBRID;
	rx_ringp[2].index = 2;

	rx_grp->ring_cnt = VNET_NUM_PSEUDO_RXRINGS_DEFAULT;
	rx_grp->rings = rx_ringp;

	for (i = VNET_NUM_PSEUDO_RXRINGS_DEFAULT;
	    i < rx_grp->max_ring_cnt; i++) {
		rx_ringp = &rx_grp->rings[i];
		rx_ringp->state = VNET_RXRING_FREE;
		rx_ringp->index = i;
	}
}
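/*
 * Resulting default rx pseudo-ring layout after the init above (sketch):
 *	index 0            - reserved for the LDC resource to the vswitch
 *	index 1, 2         - reserved for the rx rings of a Hybrid resource
 *	index 3 .. max-1   - free; allocated dynamically for peer-vnet LDCs
 *			     by vnet_alloc_pseudo_rx_ring() below
 */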
static void
vnet_ring_grp_uninit(vnet_t *vnetp)
{
	vnet_pseudo_rx_group_t	*rx_grp;
	vnet_pseudo_tx_group_t	*tx_grp;
	kt_did_t		tid = 0;

	tx_grp = &vnetp->tx_grp[0];

	/* Inform tx_notify_thread to exit */
	mutex_enter(&tx_grp->flowctl_lock);
	if (tx_grp->flowctl_thread != NULL) {
		tid = tx_grp->flowctl_thread->t_did;
		tx_grp->flowctl_done = B_TRUE;
		cv_signal(&tx_grp->flowctl_cv);
	}
	mutex_exit(&tx_grp->flowctl_lock);
	if (tid != 0)
		thread_join(tid);

	if (tx_grp->rings != NULL) {
		ASSERT(tx_grp->ring_cnt == VNET_NUM_PSEUDO_TXRINGS);
		kmem_free(tx_grp->rings, sizeof (vnet_pseudo_tx_ring_t) *
		    tx_grp->ring_cnt);
		tx_grp->rings = NULL;
	}

	rx_grp = &vnetp->rx_grp[0];
	if (rx_grp->rings != NULL) {
		ASSERT(rx_grp->max_ring_cnt == MAX_RINGS_PER_GROUP);
		ASSERT(rx_grp->ring_cnt == VNET_NUM_PSEUDO_RXRINGS_DEFAULT);
		kmem_free(rx_grp->rings, sizeof (vnet_pseudo_rx_ring_t) *
		    rx_grp->max_ring_cnt);
		rx_grp->rings = NULL;
	}
}
static vnet_pseudo_rx_ring_t *
vnet_alloc_pseudo_rx_ring(vnet_t *vnetp)
{
	vnet_pseudo_rx_group_t	*rx_grp;
	vnet_pseudo_rx_ring_t	*rx_ringp;
	int			index;

	rx_grp = &vnetp->rx_grp[0];
	WRITE_ENTER(&rx_grp->lock);

	if (rx_grp->ring_cnt == rx_grp->max_ring_cnt) {
		/* no rings available */
		RW_EXIT(&rx_grp->lock);
		return (NULL);
	}

	for (index = VNET_NUM_PSEUDO_RXRINGS_DEFAULT;
	    index < rx_grp->max_ring_cnt; index++) {
		rx_ringp = &rx_grp->rings[index];
		if (rx_ringp->state == VNET_RXRING_FREE) {
			rx_ringp->state |= VNET_RXRING_INUSE;
			rx_grp->ring_cnt++;
			break;
		}
	}

	RW_EXIT(&rx_grp->lock);
	return (rx_ringp);
}
static void
vnet_free_pseudo_rx_ring(vnet_t *vnetp, vnet_pseudo_rx_ring_t *ringp)
{
	vnet_pseudo_rx_group_t	*rx_grp;

	ASSERT(ringp->index >= VNET_NUM_PSEUDO_RXRINGS_DEFAULT);
	rx_grp = &vnetp->rx_grp[0];
	WRITE_ENTER(&rx_grp->lock);

	if (ringp->state != VNET_RXRING_FREE) {
		ringp->state = VNET_RXRING_FREE;
		ringp->handle = NULL;
		rx_grp->ring_cnt--;
	}

	RW_EXIT(&rx_grp->lock);
}
/* wrapper function for mac_register() */
static int
vnet_mac_register(vnet_t *vnetp)
{
	mac_register_t	*macp;
	int		err;

	if ((macp = mac_alloc(MAC_VERSION)) == NULL)
		return (DDI_FAILURE);
	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
	macp->m_driver = vnetp;
	macp->m_dip = vnetp->dip;
	macp->m_src_addr = vnetp->curr_macaddr;
	macp->m_callbacks = &vnet_m_callbacks;
	macp->m_min_sdu = 0;
	macp->m_max_sdu = vnetp->mtu;
	macp->m_margin = VLAN_TAGSZ;

	macp->m_v12n = MAC_VIRT_LEVEL1;

	/*
	 * Finally, we're ready to register ourselves with the MAC layer
	 * interface; if this succeeds, we're all ready to start()
	 */
	err = mac_register(macp, &vnetp->mh);
	mac_free(macp);
	return (err == 0 ? DDI_SUCCESS : DDI_FAILURE);
}
/* read the mac address of the device */
static int
vnet_read_mac_address(vnet_t *vnetp)
{
	uchar_t		*macaddr;
	uint32_t	size;
	int		rv;

	rv = ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, vnetp->dip,
	    DDI_PROP_DONTPASS, macaddr_propname, &macaddr, &size);
	if ((rv != DDI_PROP_SUCCESS) || (size != ETHERADDRL)) {
		DWARN(vnetp, "prop_lookup failed(%s) err(%d)\n",
		    macaddr_propname, rv);
		return (DDI_FAILURE);
	}
	bcopy(macaddr, (caddr_t)vnetp->vendor_addr, ETHERADDRL);
	bcopy(macaddr, (caddr_t)vnetp->curr_macaddr, ETHERADDRL);
	ddi_prop_free(macaddr);

	return (DDI_SUCCESS);
}
static void
vnet_fdb_create(vnet_t *vnetp)
{
	char hashname[MAXNAMELEN];

	(void) snprintf(hashname, MAXNAMELEN, "vnet%d-fdbhash",
	    vnetp->instance);
	vnetp->fdb_nchains = vnet_fdb_nchains;
	vnetp->fdb_hashp = mod_hash_create_ptrhash(hashname,
	    vnetp->fdb_nchains, mod_hash_null_valdtor, sizeof (void *));
}

static void
vnet_fdb_destroy(vnet_t *vnetp)
{
	/* destroy fdb-hash-table */
	if (vnetp->fdb_hashp != NULL) {
		mod_hash_destroy_hash(vnetp->fdb_hashp);
		vnetp->fdb_hashp = NULL;
		vnetp->fdb_nchains = 0;
	}
}
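/*
 * The fdb routines below key the hash table with KEY_HASH(), which is
 * defined with the vnet data structures (outside this file); it is assumed
 * here to condense the six bytes of an Ethernet address into a single
 * uint64_t key (presumably packed into the low 48 bits), which is then used
 * as the mod_hash key for the ptrhash table created above.
 */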
/*
 * Add an entry into the fdb.
 */
void
vnet_fdbe_add(vnet_t *vnetp, vnet_res_t *vresp)
{
	uint64_t	addr = 0;
	int		rv;

	KEY_HASH(addr, vresp->rem_macaddr);

	/*
	 * If the entry being added corresponds to LDC_SERVICE resource,
	 * that is, vswitch connection, it is added to the hash and also
	 * the entry is cached, an additional reference count reflects
	 * this. The HYBRID resource is not added to the hash, but only
	 * cached, as it is only used for sending out packets for unknown
	 * unicast destinations.
	 */
	vresp->refcnt = (vresp->type == VIO_NET_RES_LDC_SERVICE) ? 1 : 0;

	/*
	 * Note: duplicate keys will be rejected by mod_hash.
	 */
	if (vresp->type != VIO_NET_RES_HYBRID) {
		rv = mod_hash_insert(vnetp->fdb_hashp, (mod_hash_key_t)addr,
		    (mod_hash_val_t)vresp);
		if (rv != 0) {
			DWARN(vnetp, "Duplicate macaddr key(%lx)\n", addr);
			return;
		}
	}

	if (vresp->type == VIO_NET_RES_LDC_SERVICE) {
		/* Cache the fdb entry to vsw-port */
		WRITE_ENTER(&vnetp->vsw_fp_rw);
		if (vnetp->vsw_fp == NULL)
			vnetp->vsw_fp = vresp;
		RW_EXIT(&vnetp->vsw_fp_rw);
	} else if (vresp->type == VIO_NET_RES_HYBRID) {
		/* Cache the fdb entry to hybrid resource */
		WRITE_ENTER(&vnetp->vsw_fp_rw);
		if (vnetp->hio_fp == NULL)
			vnetp->hio_fp = vresp;
		RW_EXIT(&vnetp->vsw_fp_rw);
	}
}
/*
 * Remove an entry from fdb.
 */
static void
vnet_fdbe_del(vnet_t *vnetp, vnet_res_t *vresp)
{
	uint64_t	addr = 0;
	int		rv;
	uint32_t	refcnt;
	vnet_res_t	*tmp;

	KEY_HASH(addr, vresp->rem_macaddr);

	/*
	 * Remove the entry from fdb hash table.
	 * This prevents further references to this fdb entry.
	 */
	if (vresp->type != VIO_NET_RES_HYBRID) {
		rv = mod_hash_remove(vnetp->fdb_hashp, (mod_hash_key_t)addr,
		    (mod_hash_val_t *)&tmp);
		if (rv != 0) {
			/*
			 * As the resources are added to the hash only
			 * after they are started, this can occur if
			 * a resource unregisters before it is ever started.
			 */
			return;
		}
	}

	if (vresp->type == VIO_NET_RES_LDC_SERVICE) {
		WRITE_ENTER(&vnetp->vsw_fp_rw);

		ASSERT(tmp == vnetp->vsw_fp);
		vnetp->vsw_fp = NULL;

		RW_EXIT(&vnetp->vsw_fp_rw);
	} else if (vresp->type == VIO_NET_RES_HYBRID) {
		WRITE_ENTER(&vnetp->vsw_fp_rw);

		vnetp->hio_fp = NULL;

		RW_EXIT(&vnetp->vsw_fp_rw);
	}

	/*
	 * If there are threads already ref holding before the entry was
	 * removed from hash table, then wait for ref count to drop to zero.
	 */
	refcnt = (vresp->type == VIO_NET_RES_LDC_SERVICE) ? 1 : 0;
	while (vresp->refcnt > refcnt) {
		delay(drv_usectohz(vnet_fdbe_refcnt_delay));
	}
}
/*
 * Search fdb for a given mac address. If an entry is found, hold
 * a reference to it and return the entry; else returns NULL.
 */
static vnet_res_t *
vnet_fdbe_find(vnet_t *vnetp, struct ether_addr *addrp)
{
	uint64_t	key = 0;
	vnet_res_t	*vresp;
	int		rv;

	KEY_HASH(key, addrp->ether_addr_octet);

	rv = mod_hash_find_cb(vnetp->fdb_hashp, (mod_hash_key_t)key,
	    (mod_hash_val_t *)&vresp, vnet_fdbe_find_cb);

	if (rv != 0)
		return (NULL);

	return (vresp);
}
/*
 * Callback function provided to mod_hash_find_cb(). After finding the fdb
 * entry corresponding to the key (macaddr), this callback will be invoked by
 * mod_hash_find_cb() to atomically increment the reference count on the fdb
 * entry before returning the found entry.
 */
static void
vnet_fdbe_find_cb(mod_hash_key_t key, mod_hash_val_t val)
{
	_NOTE(ARGUNUSED(key))
	VNET_FDBE_REFHOLD((vnet_res_t *)val);
}
/*
 * Frames received that are tagged with the pvid of the vnet device must be
 * untagged before sending up the stack. This function walks the chain of rx
 * frames, untags any such frames and returns the updated chain.
 *
 * Arguments:
 *	pvid:	pvid of the vnet device for which packets are being received
 *	mp:	head of pkt chain to be validated and untagged
 *
 * Returns:
 *	mp:	head of updated chain of packets
 */
static void
vnet_rx_frames_untag(uint16_t pvid, mblk_t **mp)
{
	struct ether_vlan_header	*evhp;
	mblk_t				*bp;
	mblk_t				*bpt;
	mblk_t				*bph;
	mblk_t				*bpn;

	bpn = bph = bpt = NULL;

	for (bp = *mp; bp != NULL; bp = bpn) {

		bpn = bp->b_next;
		bp->b_next = bp->b_prev = NULL;

		evhp = (struct ether_vlan_header *)bp->b_rptr;

		if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN &&
		    VLAN_ID(ntohs(evhp->ether_tci)) == pvid) {

			bp = vnet_vlan_remove_tag(bp);
			if (bp == NULL) {
				continue;
			}

		}

		/* build a chain of processed packets */
		if (bph == NULL) {
			bph = bpt = bp;
		} else {
			bpt->b_next = bp;
			bpt = bp;
		}

	}

	*mp = bph;
}
static void
vnet_rx(vio_net_handle_t vrh, mblk_t *mp)
{
	vnet_res_t		*vresp = (vnet_res_t *)vrh;
	vnet_t			*vnetp = vresp->vnetp;
	vnet_pseudo_rx_ring_t	*ringp;

	if ((vnetp == NULL) || (vnetp->mh == NULL)) {
		freemsgchain(mp);
		return;
	}

	ringp = vresp->rx_ringp;
	mac_rx_ring(vnetp->mh, ringp->handle, mp, ringp->gen_num);
}
void
vnet_tx_update(vio_net_handle_t vrh)
{
	vnet_res_t		*vresp = (vnet_res_t *)vrh;
	vnet_t			*vnetp = vresp->vnetp;
	vnet_pseudo_tx_ring_t	*tx_ringp;
	vnet_pseudo_tx_group_t	*tx_grp;
	int			i;

	if (vnetp == NULL || vnetp->mh == NULL) {
		return;
	}

	/*
	 * Currently, the tx hwring API (used to access rings that belong to
	 * a Hybrid IO resource) does not provide us a per ring flow ctrl
	 * update; also the pseudo rings are shared by the ports/ldcs in the
	 * vgen layer. Thus we can't figure out which pseudo ring is being
	 * re-enabled for transmits. To work around this, when we get a tx
	 * restart notification from below, we simply propagate that to all
	 * the tx pseudo rings registered with the mac layer above.
	 *
	 * There are a couple of side effects with this approach, but they are
	 * not harmful, as outlined below:
	 *
	 * A) We might send an invalid ring_update() for a ring that is not
	 * really flow controlled. This will not have any effect in the mac
	 * layer and packets will continue to be transmitted on that ring.
	 *
	 * B) We might end up clearing the flow control in the mac layer for
	 * a ring that is still flow controlled in the underlying resource.
	 * This will result in the mac layer restarting transmit, only to be
	 * flow controlled again on that ring.
	 */
	tx_grp = &vnetp->tx_grp[0];
	for (i = 0; i < tx_grp->ring_cnt; i++) {
		tx_ringp = &tx_grp->rings[i];
		mac_tx_ring_update(vnetp->mh, tx_ringp->handle);
	}
}
/*
 * vnet_tx_notify_thread:
 *
 * The vnet_tx_ring_update() callback function wakes up this thread when
 * it gets called. This thread will call mac_tx_ring_update() to
 * notify the upper mac of flow control getting relieved. Note that
 * vnet_tx_ring_update() cannot call mac_tx_ring_update() directly
 * because vnet_tx_ring_update() is called from the lower mac with
 * mi_rw_lock held and mac_tx_ring_update() would also try to grab
 * the same lock.
 */
static void
vnet_tx_notify_thread(void *arg)
{
	callb_cpr_t		cprinfo;
	vnet_pseudo_tx_group_t	*tx_grp = (vnet_pseudo_tx_group_t *)arg;
	vnet_pseudo_tx_ring_t	*tx_ringp;
	vnet_t			*vnetp;
	int			i;

	CALLB_CPR_INIT(&cprinfo, &tx_grp->flowctl_lock, callb_generic_cpr,
	    "vnet_tx_notify_thread");

	mutex_enter(&tx_grp->flowctl_lock);
	while (!tx_grp->flowctl_done) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(&tx_grp->flowctl_cv, &tx_grp->flowctl_lock);
		CALLB_CPR_SAFE_END(&cprinfo, &tx_grp->flowctl_lock);

		for (i = 0; i < tx_grp->ring_cnt; i++) {
			tx_ringp = &tx_grp->rings[i];
			if (tx_ringp->woken_up) {
				tx_ringp->woken_up = B_FALSE;
				vnetp = tx_ringp->vnetp;
				mac_tx_ring_update(vnetp->mh, tx_ringp->handle);
			}
		}
	}
	/*
	 * The tx_grp is being destroyed, exit the thread.
	 */
	tx_grp->flowctl_thread = NULL;
	CALLB_CPR_EXIT(&cprinfo);
	thread_exit();
}
void
vnet_tx_ring_update(void *arg1, uintptr_t arg2)
{
	vnet_t			*vnetp = (vnet_t *)arg1;
	vnet_pseudo_tx_group_t	*tx_grp;
	vnet_pseudo_tx_ring_t	*tx_ringp;
	int			i;

	tx_grp = &vnetp->tx_grp[0];
	for (i = 0; i < tx_grp->ring_cnt; i++) {
		tx_ringp = &tx_grp->rings[i];
		if (tx_ringp->hw_rh == (mac_ring_handle_t)arg2) {
			mutex_enter(&tx_grp->flowctl_lock);
			tx_ringp->woken_up = B_TRUE;
			cv_signal(&tx_grp->flowctl_cv);
			mutex_exit(&tx_grp->flowctl_lock);
			break;
		}
	}
}
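/*
 * Handshake between the two routines above (summary): vnet_tx_ring_update()
 * runs in lower-mac context with mi_rw_lock held, so it only marks the
 * matching pseudo tx ring woken_up and signals flowctl_cv;
 * vnet_tx_notify_thread() then wakes, clears the flag and calls
 * mac_tx_ring_update() safely outside that lock.
 */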
/*
 * Update the new mtu of vnet into the mac layer. First check if the device
 * has been plumbed and if so fail the mtu update. Returns 0 on success.
 */
int
vnet_mtu_update(vnet_t *vnetp, uint32_t mtu)
{
	int	rv;

	if (vnetp == NULL || vnetp->mh == NULL) {
		return (EINVAL);
	}

	WRITE_ENTER(&vnetp->vrwlock);

	if (vnetp->flags & VNET_STARTED) {
		RW_EXIT(&vnetp->vrwlock);
		cmn_err(CE_NOTE, "!vnet%d: Unable to process mtu "
		    "update as the device is plumbed\n",
		    vnetp->instance);
		return (EBUSY);
	}

	/* update mtu in the mac layer */
	rv = mac_maxsdu_update(vnetp->mh, mtu);
	if (rv != 0) {
		RW_EXIT(&vnetp->vrwlock);
		cmn_err(CE_NOTE,
		    "!vnet%d: Unable to update mtu with mac layer\n",
		    vnetp->instance);
		return (EIO);
	}

	vnetp->mtu = mtu;

	RW_EXIT(&vnetp->vrwlock);

	return (0);
}
/*
 * Update the link state of vnet to the mac layer.
 */
void
vnet_link_update(vnet_t *vnetp, link_state_t link_state)
{
	if (vnetp == NULL || vnetp->mh == NULL) {
		return;
	}

	WRITE_ENTER(&vnetp->vrwlock);
	if (vnetp->link_state == link_state) {
		RW_EXIT(&vnetp->vrwlock);
		return;
	}
	vnetp->link_state = link_state;
	RW_EXIT(&vnetp->vrwlock);

	mac_link_update(vnetp->mh, link_state);
}
/*
 * vio_net_resource_reg -- An interface called to register a resource
 *	with vnet.
 *	macp -- a GLDv3 mac_register that has all the details of
 *		a resource and its callbacks etc.
 *	type -- resource type.
 *	local_macaddr -- resource's MAC address. This is used to
 *			 associate a resource with a corresponding vnet.
 *	remote_macaddr -- remote side MAC address. This is ignored for
 *			  the Hybrid resources.
 *	vhp -- A handle returned to the caller.
 *	vcb -- A set of callbacks provided to the callers.
 */
int vio_net_resource_reg(mac_register_t *macp, vio_net_res_type_t type,
    ether_addr_t local_macaddr, ether_addr_t rem_macaddr, vio_net_handle_t *vhp,
    vio_net_callbacks_t *vcb)
{
	vnet_t		*vnetp;
	vnet_res_t	*vresp;

	vresp = kmem_zalloc(sizeof (vnet_res_t), KM_SLEEP);
	ether_copy(local_macaddr, vresp->local_macaddr);
	ether_copy(rem_macaddr, vresp->rem_macaddr);
	vresp->type = type;
	bcopy(macp, &vresp->macreg, sizeof (mac_register_t));

	DBG1(NULL, "Resource Registering type=0x%X\n", type);

	READ_ENTER(&vnet_rw);
	vnetp = vnet_headp;
	while (vnetp != NULL) {
		if (VNET_MATCH_RES(vresp, vnetp)) {
			vresp->vnetp = vnetp;

			/* Setup kstats for hio resource */
			if (vresp->type == VIO_NET_RES_HYBRID) {
				vresp->ksp = vnet_hio_setup_kstats(DRV_NAME,
				    "hio", vresp);
				if (vresp->ksp == NULL) {
					cmn_err(CE_NOTE, "!vnet%d: Cannot "
					    "create kstats for hio resource",
					    vnetp->instance);
				}
			}
			vnet_add_resource(vnetp, vresp);
			break;
		}
		vnetp = vnetp->nextp;
	}
	RW_EXIT(&vnet_rw);
	if (vresp->vnetp == NULL) {
		DWARN(NULL, "No vnet instance");
		kmem_free(vresp, sizeof (vnet_res_t));
		return (ENXIO);
	}

	*vhp = vresp;
	vcb->vio_net_rx_cb = vnet_rx;
	vcb->vio_net_tx_update = vnet_tx_update;
	vcb->vio_net_report_err = vnet_handle_res_err;

	/* Bind the resource to pseudo ring(s) */
	if (vnet_bind_rings(vresp) != 0) {
		(void) vnet_rem_resource(vnetp, vresp);
		vnet_hio_destroy_kstats(vresp->ksp);
		KMEM_FREE(vresp);
		return (1);
	}

	/* Dispatch a task to start resources */
	vnet_dispatch_res_task(vnetp);
	return (0);
}
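/*
 * Typical provider flow (sketch; vgen is the in-tree caller): fill in a
 * mac_register_t with the resource's driver handle and callbacks, call
 * vio_net_resource_reg() with the resource type and MAC addresses, keep the
 * returned vio_net_handle_t, and deliver rx packets / tx updates / errors
 * through the callbacks returned in vcb. On teardown, call
 * vio_net_resource_unreg() below with the saved handle.
 */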
/*
 * vio_net_resource_unreg -- An interface to unregister a resource.
 */
void
vio_net_resource_unreg(vio_net_handle_t vhp)
{
	vnet_res_t	*vresp = (vnet_res_t *)vhp;
	vnet_t		*vnetp = vresp->vnetp;

	DBG1(NULL, "Resource Unregistering hdl=0x%p", vhp);

	ASSERT(vnetp != NULL);
	/*
	 * Remove the resource from fdb; this ensures
	 * there are no references to the resource.
	 */
	vnet_fdbe_del(vnetp, vresp);

	vnet_unbind_rings(vresp);

	/* Now remove the resource from the list */
	(void) vnet_rem_resource(vnetp, vresp);

	vnet_hio_destroy_kstats(vresp->ksp);
	KMEM_FREE(vresp);
}
static void
vnet_add_resource(vnet_t *vnetp, vnet_res_t *vresp)
{
	WRITE_ENTER(&vnetp->vrwlock);
	vresp->nextp = vnetp->vres_list;
	vnetp->vres_list = vresp;
	RW_EXIT(&vnetp->vrwlock);
}
static vnet_res_t *
vnet_rem_resource(vnet_t *vnetp, vnet_res_t *vresp)
{
	vnet_res_t	*vrp;

	WRITE_ENTER(&vnetp->vrwlock);
	if (vresp == vnetp->vres_list) {
		vnetp->vres_list = vresp->nextp;
	} else {
		vrp = vnetp->vres_list;
		while (vrp->nextp != NULL) {
			if (vrp->nextp == vresp) {
				vrp->nextp = vresp->nextp;
				break;
			}
			vrp = vrp->nextp;
		}
	}
	vresp->vnetp = NULL;
	vresp->nextp = NULL;

	RW_EXIT(&vnetp->vrwlock);

	return (vresp);
}
/*
 * vnet_dds_rx -- an interface called by vgen to pass received DDS
 *	messages to vdds for processing.
 */
void
vnet_dds_rx(void *arg, void *dmsg)
{
	vnet_t *vnetp = arg;
	vdds_process_dds_msg(vnetp, dmsg);
}
/*
 * vnet_send_dds_msg -- An interface provided to DDS to send
 *	DDS messages. This simply sends messages via vgen.
 */
int
vnet_send_dds_msg(vnet_t *vnetp, void *dmsg)
{
	int rv = EINVAL;	/* fail if vgen is not initialized yet */

	if (vnetp->vgenhdl != NULL) {
		rv = vgen_dds_tx(vnetp->vgenhdl, dmsg);
	}
	return (rv);
}
/*
 * vnet_dds_cleanup_hio -- an interface called by vgen to cleanup hio
 *	resources.
 */
void
vnet_dds_cleanup_hio(vnet_t *vnetp)
{
	vdds_cleanup_hio(vnetp);
}
/*
 * vnet_handle_res_err -- A callback function called by a resource
 *	to report an error. For example, vgen can call to report
 *	an LDC down/reset event. This will trigger cleanup of associated
 *	Hybrid resource.
 */
/* ARGSUSED */
static void
vnet_handle_res_err(vio_net_handle_t vrh, vio_net_err_val_t err)
{
	vnet_res_t	*vresp = (vnet_res_t *)vrh;
	vnet_t		*vnetp = vresp->vnetp;

	if (vnetp == NULL) {
		return;
	}
	if ((vresp->type != VIO_NET_RES_LDC_SERVICE) &&
	    (vresp->type != VIO_NET_RES_HYBRID)) {
		return;
	}

	vdds_cleanup_hio(vnetp);
}
/*
 * vnet_dispatch_res_task -- A function to dispatch a task that starts
 *	resources.
 */
static void
vnet_dispatch_res_task(vnet_t *vnetp)
{
	int rv;

	/*
	 * Dispatch the task. It could be the case that vnetp->flags does
	 * not have VNET_STARTED set. This is ok as vnet_res_start_task()
	 * can abort the task when the task is started. See related comments
	 * in vnet_m_stop() and vnet_stop_resources().
	 */
	rv = ddi_taskq_dispatch(vnetp->taskqp, vnet_res_start_task,
	    vnetp, DDI_NOSLEEP);
	if (rv != DDI_SUCCESS) {
		cmn_err(CE_WARN,
		    "vnet%d:Can't dispatch start resource task",
		    vnetp->instance);
	}
}
/*
 * vnet_res_start_task -- A taskq callback function that starts a resource.
 */
static void
vnet_res_start_task(void *arg)
{
	vnet_t *vnetp = arg;

	WRITE_ENTER(&vnetp->vrwlock);
	if (vnetp->flags & VNET_STARTED) {
		vnet_start_resources(vnetp);
	}
	RW_EXIT(&vnetp->vrwlock);
}
/*
 * vnet_start_resources -- starts all resources associated with
 *	a vnet.
 */
static void
vnet_start_resources(vnet_t *vnetp)
{
	mac_register_t	*macp;
	mac_callbacks_t	*cbp;
	vnet_res_t	*vresp;
	int		rv;

	DBG1(vnetp, "enter\n");

	ASSERT(RW_WRITE_HELD(&vnetp->vrwlock));

	for (vresp = vnetp->vres_list; vresp != NULL; vresp = vresp->nextp) {
		/* skip if it is already started */
		if (vresp->flags & VNET_STARTED) {
			continue;
		}
		macp = &vresp->macreg;
		cbp = macp->m_callbacks;
		rv = cbp->mc_start(macp->m_driver);
		if (rv == 0) {
			/*
			 * Successfully started the resource, so now
			 * add it to the fdb.
			 */
			vresp->flags |= VNET_STARTED;
			vnet_fdbe_add(vnetp, vresp);
		}
	}

	DBG1(vnetp, "exit\n");
}
/*
 * vnet_stop_resources -- stop all resources associated with a vnet.
 */
static void
vnet_stop_resources(vnet_t *vnetp)
{
	vnet_res_t	*vresp;
	mac_register_t	*macp;
	mac_callbacks_t	*cbp;

	DBG1(vnetp, "enter\n");

	ASSERT(RW_WRITE_HELD(&vnetp->vrwlock));

	for (vresp = vnetp->vres_list; vresp != NULL; ) {
		if (vresp->flags & VNET_STARTED) {
			/*
			 * Release the lock while invoking mc_stop() of the
			 * underlying resource. We hold a reference to this
			 * resource to prevent being removed from the list in
			 * vio_net_resource_unreg(). Note that new resources
			 * can be added to the head of the list while the lock
			 * is released, but they won't be started, as
			 * VNET_STARTED flag has been cleared for the vnet
			 * device in vnet_m_stop(). Also, while the lock is
			 * released a resource could be removed from the list
			 * in vio_net_resource_unreg(); but that is ok, as we
			 * re-acquire the lock and only then access the forward
			 * link (vresp->nextp) to continue with the next
			 * resource.
			 */
			vresp->flags &= ~VNET_STARTED;
			vresp->flags |= VNET_STOPPING;
			macp = &vresp->macreg;
			cbp = macp->m_callbacks;
			VNET_FDBE_REFHOLD(vresp);
			RW_EXIT(&vnetp->vrwlock);

			cbp->mc_stop(macp->m_driver);

			WRITE_ENTER(&vnetp->vrwlock);
			vresp->flags &= ~VNET_STOPPING;
			VNET_FDBE_REFRELE(vresp);
		}
		vresp = vresp->nextp;
	}
	DBG1(vnetp, "exit\n");
}
/*
 * Setup kstats for the HIO statistics.
 * NOTE: the synchronization for the statistics is the
 * responsibility of the caller.
 */
kstat_t *
vnet_hio_setup_kstats(char *ks_mod, char *ks_name, vnet_res_t *vresp)
{
	kstat_t			*ksp;
	vnet_t			*vnetp = vresp->vnetp;
	vnet_hio_kstats_t	*hiokp;
	size_t			size;

	ASSERT(vnetp != NULL);
	size = sizeof (vnet_hio_kstats_t) / sizeof (kstat_named_t);
	ksp = kstat_create(ks_mod, vnetp->instance, ks_name, "net",
	    KSTAT_TYPE_NAMED, size, 0);
	if (ksp == NULL) {
		return (NULL);
	}

	hiokp = (vnet_hio_kstats_t *)ksp->ks_data;
	kstat_named_init(&hiokp->ipackets,	"ipackets",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&hiokp->ierrors,	"ierrors",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&hiokp->opackets,	"opackets",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&hiokp->oerrors,	"oerrors",
	    KSTAT_DATA_ULONG);

	/* MIB II kstat variables */
	kstat_named_init(&hiokp->rbytes,	"rbytes",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&hiokp->obytes,	"obytes",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&hiokp->multircv,	"multircv",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&hiokp->multixmt,	"multixmt",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&hiokp->brdcstrcv,	"brdcstrcv",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&hiokp->brdcstxmt,	"brdcstxmt",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&hiokp->norcvbuf,	"norcvbuf",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&hiokp->noxmtbuf,	"noxmtbuf",
	    KSTAT_DATA_ULONG);

	ksp->ks_update = vnet_hio_update_kstats;
	ksp->ks_private = (void *)vresp;
	kstat_install(ksp);
	return (ksp);
}
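/*
 * The resulting kstat can be inspected from userland, e.g. (illustrative):
 *	# kstat -m vnet -n hio
 * which reports the counters named above (ipackets, ierrors, opackets,
 * oerrors, rbytes, obytes, multircv, multixmt, brdcstrcv, brdcstxmt,
 * norcvbuf, noxmtbuf), refreshed on each read by vnet_hio_update_kstats().
 */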
/*
 * Destroy kstats.
 */
static void
vnet_hio_destroy_kstats(kstat_t *ksp)
{
	if (ksp != NULL)
		kstat_delete(ksp);
}
/*
 * Update the kstats.
 */
static int
vnet_hio_update_kstats(kstat_t *ksp, int rw)
{
	vnet_t			*vnetp;
	vnet_res_t		*vresp;
	vnet_hio_stats_t	statsp;
	vnet_hio_kstats_t	*hiokp;

	vresp = (vnet_res_t *)ksp->ks_private;
	vnetp = vresp->vnetp;

	bzero(&statsp, sizeof (vnet_hio_stats_t));

	READ_ENTER(&vnetp->vsw_fp_rw);
	if (vnetp->hio_fp == NULL) {
		/* not using hio resources, just return */
		RW_EXIT(&vnetp->vsw_fp_rw);
		return (0);
	}
	VNET_FDBE_REFHOLD(vnetp->hio_fp);
	RW_EXIT(&vnetp->vsw_fp_rw);
	vnet_hio_get_stats(vnetp->hio_fp, &statsp);
	VNET_FDBE_REFRELE(vnetp->hio_fp);

	hiokp = (vnet_hio_kstats_t *)ksp->ks_data;

	if (rw == KSTAT_READ) {
		/* Link Input/Output stats */
		hiokp->ipackets.value.ul	= (uint32_t)statsp.ipackets;
		hiokp->ipackets64.value.ull	= statsp.ipackets;
		hiokp->ierrors.value.ul		= statsp.ierrors;
		hiokp->opackets.value.ul	= (uint32_t)statsp.opackets;
		hiokp->opackets64.value.ull	= statsp.opackets;
		hiokp->oerrors.value.ul		= statsp.oerrors;

		/* MIB II kstat variables */
		hiokp->rbytes.value.ul		= (uint32_t)statsp.rbytes;
		hiokp->rbytes64.value.ull	= statsp.rbytes;
		hiokp->obytes.value.ul		= (uint32_t)statsp.obytes;
		hiokp->obytes64.value.ull	= statsp.obytes;
		hiokp->multircv.value.ul	= statsp.multircv;
		hiokp->multixmt.value.ul	= statsp.multixmt;
		hiokp->brdcstrcv.value.ul	= statsp.brdcstrcv;
		hiokp->brdcstxmt.value.ul	= statsp.brdcstxmt;
		hiokp->norcvbuf.value.ul	= statsp.norcvbuf;
		hiokp->noxmtbuf.value.ul	= statsp.noxmtbuf;
	} else {
		return (EACCES);
	}

	return (0);
}
static void
vnet_hio_get_stats(vnet_res_t *vresp, vnet_hio_stats_t *statsp)
{
	mac_register_t	*macp;
	mac_callbacks_t	*cbp;
	uint64_t	val;
	int		stat;

	/*
	 * get the specified statistics from the underlying nxge.
	 */
	macp = &vresp->macreg;
	cbp = macp->m_callbacks;
	for (stat = MAC_STAT_MIN; stat < MAC_STAT_OVERFLOWS; stat++) {
		if (cbp->mc_getstat(macp->m_driver, stat, &val) == 0) {
			switch (stat) {
			case MAC_STAT_IPACKETS:
				statsp->ipackets = val;
				break;

			case MAC_STAT_IERRORS:
				statsp->ierrors = val;
				break;

			case MAC_STAT_OPACKETS:
				statsp->opackets = val;
				break;

			case MAC_STAT_OERRORS:
				statsp->oerrors = val;
				break;

			case MAC_STAT_RBYTES:
				statsp->rbytes = val;
				break;

			case MAC_STAT_OBYTES:
				statsp->obytes = val;
				break;

			case MAC_STAT_MULTIRCV:
				statsp->multircv = val;
				break;

			case MAC_STAT_MULTIXMT:
				statsp->multixmt = val;
				break;

			case MAC_STAT_BRDCSTRCV:
				statsp->brdcstrcv = val;
				break;

			case MAC_STAT_BRDCSTXMT:
				statsp->brdcstxmt = val;
				break;

			case MAC_STAT_NOXMTBUF:
				statsp->noxmtbuf = val;
				break;

			case MAC_STAT_NORCVBUF:
				statsp->norcvbuf = val;
				break;

			default:
				/*
				 * parameters we are not interested in.
				 */
				break;
			}
		}
	}
}
2069 static boolean_t
2070 vnet_m_capab(void *arg, mac_capab_t cap, void *cap_data)
2072 vnet_t *vnetp = (vnet_t *)arg;
2074 if (vnetp == NULL) {
2075 return (0);
2078 switch (cap) {
2080 case MAC_CAPAB_RINGS: {
2082 mac_capab_rings_t *cap_rings = cap_data;
2084 * Rings Capability Notes:
2085 * We advertise rings to make use of the rings framework in
2086 * gldv3 mac layer, to improve the performance. This is
2087 * specifically needed when a Hybrid resource (with multiple
2088 * tx/rx hardware rings) is assigned to a vnet device. We also
2089 * leverage this for the normal case when no Hybrid resource is
2090 * assigned.
2092 * Ring Allocation:
2093 * - TX path:
2094 * We expose a pseudo ring group with 2 pseudo tx rings (as
2095 * currently HybridIO exports only 2 rings) In the normal case,
2096 * transmit traffic that comes down to the driver through the
2097 * mri_tx (vnet_tx_ring_send()) entry point goes through the
2098 * distributed switching algorithm in vnet and gets transmitted
2099 * over a port/LDC in the vgen layer to either the vswitch or a
2100 * peer vnet. If and when a Hybrid resource is assigned to the
2101 * vnet, we obtain the tx ring information of the Hybrid device
2102 * (nxge) and map the pseudo rings 1:1 to the 2 hw tx rings.
2103 * Traffic being sent over the Hybrid resource by the mac layer
2104 * gets spread across both hw rings, as they are mapped to the
2105 * 2 pseudo tx rings in vnet.
2107 * - RX path:
2108 * We expose a pseudo ring group with 3 pseudo rx rings (static
2109 * rings) initially. The first (default) pseudo rx ring is
2110 * reserved for the resource that connects to the vswitch
2111 * service. The next 2 rings are reserved for a Hybrid resource
2112 * that may be assigned to the vnet device. If and when a
2113 * Hybrid resource is assigned to the vnet, we obtain the rx
2114 * ring information of the Hybrid device (nxge) and map these
2115 * pseudo rings 1:1 to the 2 hw rx rings. For each additional
2116 * resource that connects to a peer vnet, we dynamically
2117 * allocate a pseudo rx ring and map it to that resource, when
2118 * the resource gets added; and the pseudo rx ring is
2119 * dynamically registered with the upper mac layer. We do the
2120 * reverse and unregister the ring with the mac layer when
2121 * the resource gets removed.
2123 * Synchronization notes:
2124 * We don't need any lock to protect members of the ring structure,
2125 * specifically ringp->hw_rh, in either the TX or the RX ring,
2126 * as explained below.
2127 * - TX ring:
2128 * ring->hw_rh is initialized only when a Hybrid resource is
2129 * associated; and gets referenced only in vnet_hio_tx(). The
2130 * Hybrid resource itself is available in fdb only after tx
2131 * hwrings are found and mapped; i.e., in vio_net_resource_reg()
2132 * we call vnet_bind_rings() first and then call
2133 * vnet_start_resources() which adds an entry to fdb. For
2134 * traffic going over LDC resources, we don't reference
2135 * ring->hw_rh at all.
2136 * - RX ring:
2137 * For rings mapped to Hybrid resource ring->hw_rh is
2138 * initialized and only then do we add the rx callback for
2139 * the underlying Hybrid resource; we disable callbacks before
2140 * we unmap ring->hw_rh. For rings mapped to LDC resources, we
2141 * stop the rx callbacks (in vgen) before we remove ring->hw_rh
2142 * (vio_net_resource_unreg()).
2143 * Also, we access ring->hw_rh in vnet_rx_ring_stat().
2144 * Note that for rings mapped to Hybrid resource, though the
2145 * rings are statically registered with the mac layer, their
2146 * hardware ring mapping (ringp->hw_rh) can be torn down in
2147 * vnet_unbind_hwrings() while the kstat operation is in
2148 * progress. To protect against this, we hold a reference to
2149 * the resource in FDB; this ensures that the thread in
2150 * vio_net_resource_unreg() waits for the reference to be
2151 * dropped before unbinding the ring.
2153 * We don't need to do this for rings mapped to LDC resources.
2154 * These rings are registered/unregistered dynamically with
2155 * the mac layer and so any attempt to unregister the ring
2156 * while a kstat operation is in progress will block in
2157 * mac_group_rem_ring(); this implicitly protects the
2158 * resource (ringp->hw_rh) from disappearing.
2159 */
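/*
 * For illustration, the pseudo ring layout described above (a sketch;
 * the actual indices are assigned in vnet_ring_grp_init()):
 *
 *	RX group 0:	ring 0		reserved for the LDC resource to
 *					the vswitch service
 *			rings 1-2	reserved for the Hybrid (nxge)
 *					hw rx rings
 *			rings 3..n	allocated dynamically, one per LDC
 *					resource to a peer vnet
 *	TX group 0:	rings 0-1	shared by all resources; mapped 1:1
 *					to the Hybrid hw tx rings when
 *					HybridIO is active
 */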
2161 if (cap_rings->mr_type == MAC_RING_TYPE_RX) {
2162 cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2164 /*
2165 * The ring_cnt for rx grp is initialized in
2166 * vnet_ring_grp_init(). Later, the ring_cnt gets
2167 * updated dynamically whenever LDC resources are added
2168 * or removed.
2169 */
2170 cap_rings->mr_rnum = vnetp->rx_grp[0].ring_cnt;
2171 cap_rings->mr_rget = vnet_get_ring;
2173 cap_rings->mr_gnum = VNET_NUM_PSEUDO_GROUPS;
2174 cap_rings->mr_gget = vnet_get_group;
2175 cap_rings->mr_gaddring = NULL;
2176 cap_rings->mr_gremring = NULL;
2177 } else {
2178 cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2180 /*
2181 * The ring_cnt for tx grp is initialized in
2182 * vnet_ring_grp_init() and remains constant, as we
2183 * do not support dynamic tx rings for now.
2184 */
2185 cap_rings->mr_rnum = vnetp->tx_grp[0].ring_cnt;
2186 cap_rings->mr_rget = vnet_get_ring;
2188 /*
2189 * Transmit rings are not grouped; i.e., the number of
2190 * transmit ring groups advertised should be set to 0.
2191 */
2192 cap_rings->mr_gnum = 0;
2194 cap_rings->mr_gget = vnet_get_group;
2195 cap_rings->mr_gaddring = NULL;
2196 cap_rings->mr_gremring = NULL;
2197 }
2198 return (B_TRUE);
2199 }
2202 default:
2203 break;
2204 }
2207 return (B_FALSE);
2208 }
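/*
 * For context, a sketch of how the mac layer consumes this capability
 * (the GLDv3 contract as used here, not an exhaustive description):
 * mc_getcapab is invoked with MAC_CAPAB_RINGS once per ring type, and
 * the mac layer then calls back through the mr_rget/mr_gget functions
 * set above (vnet_get_ring()/vnet_get_group()) for each advertised
 * ring and group.
 */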
2210 /*
2211 * Callback function for the MAC layer to get ring information.
2212 */
2213 static void
2214 vnet_get_ring(void *arg, mac_ring_type_t rtype, const int g_index,
2215 const int r_index, mac_ring_info_t *infop, mac_ring_handle_t r_handle)
2216 {
2217 vnet_t *vnetp = arg;
2219 switch (rtype) {
2221 case MAC_RING_TYPE_RX: {
2223 vnet_pseudo_rx_group_t *rx_grp;
2224 vnet_pseudo_rx_ring_t *rx_ringp;
2225 mac_intr_t *mintr;
2227 /* We advertised only one RX group */
2228 ASSERT(g_index == 0);
2229 rx_grp = &vnetp->rx_grp[g_index];
2231 /* Check the current # of rings in the rx group */
2232 ASSERT((r_index >= 0) && (r_index < rx_grp->max_ring_cnt));
2234 /* Get the ring based on the index */
2235 rx_ringp = &rx_grp->rings[r_index];
2237 rx_ringp->handle = r_handle;
2238 /*
2239 * Note: we don't need to save the incoming r_index in rx_ring,
2240 * as vnet_ring_grp_init() would have initialized the index for
2241 * each ring in the array.
2242 */
2243 rx_ringp->grp = rx_grp;
2244 rx_ringp->vnetp = vnetp;
2246 mintr = &infop->mri_intr;
2247 mintr->mi_handle = (mac_intr_handle_t)rx_ringp;
2248 mintr->mi_enable = (mac_intr_enable_t)vnet_ring_enable_intr;
2249 mintr->mi_disable = (mac_intr_disable_t)vnet_ring_disable_intr;
2251 infop->mri_driver = (mac_ring_driver_t)rx_ringp;
2252 infop->mri_start = vnet_rx_ring_start;
2253 infop->mri_stop = vnet_rx_ring_stop;
2254 infop->mri_stat = vnet_rx_ring_stat;
2256 /* Set the poll function, as this is an rx ring */
2257 infop->mri_poll = vnet_rx_poll;
2258 /*
2259 * The MAC_RING_RX_ENQUEUE bit needs to be set for nxge,
2260 * which was not sending packet chains in interrupt
2261 * context. For such drivers, packets are queued in
2262 * Rx soft rings so that we get a chance to switch
2263 * into polling mode under backlog. This bug (not
2264 * sending packet chains) has since been fixed; once
2265 * the performance impact is measured, this workaround
2266 * will be removed.
2267 */
2268 infop->mri_flags = (vnet_mac_rx_queuing ?
2269 MAC_RING_RX_ENQUEUE : 0);
2270 break;
2271 }
2273 case MAC_RING_TYPE_TX: {
2274 vnet_pseudo_tx_group_t *tx_grp;
2275 vnet_pseudo_tx_ring_t *tx_ringp;
2277 /*
2278 * No need to check grp index; mac layer passes -1 for it.
2279 */
2280 tx_grp = &vnetp->tx_grp[0];
2282 /* Check the # of rings in the tx group */
2283 ASSERT((r_index >= 0) && (r_index < tx_grp->ring_cnt));
2285 /* Get the ring based on the index */
2286 tx_ringp = &tx_grp->rings[r_index];
2288 tx_ringp->handle = r_handle;
2289 tx_ringp->index = r_index;
2290 tx_ringp->grp = tx_grp;
2291 tx_ringp->vnetp = vnetp;
2293 infop->mri_driver = (mac_ring_driver_t)tx_ringp;
2294 infop->mri_start = vnet_tx_ring_start;
2295 infop->mri_stop = vnet_tx_ring_stop;
2296 infop->mri_stat = vnet_tx_ring_stat;
2298 /* Set the transmit function, as this is a tx ring */
2299 infop->mri_tx = vnet_tx_ring_send;
2300 /*
2301 * The MAC_RING_TX_SERIALIZE bit needs to be set while
2302 * HybridIO is enabled, to work around tx lock
2303 * contention issues in nxge.
2304 */
2305 infop->mri_flags = (vnet_mac_tx_serialize ?
2306 MAC_RING_TX_SERIALIZE : 0);
2307 break;
2308 }
2310 default:
2311 break;
2312 }
2313 }
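/*
 * vnet_mac_rx_queuing and vnet_mac_tx_serialize, referenced in the
 * mri_flags settings above, are tunables defined earlier in this file;
 * presumably they exist so that either nxge workaround can be disabled
 * without recompiling once it is no longer needed.
 */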
2315 /*
2316 * Callback function for the MAC layer to get group information.
2317 */
2318 static void
2319 vnet_get_group(void *arg, mac_ring_type_t type, const int index,
2320 mac_group_info_t *infop, mac_group_handle_t handle)
2321 {
2322 vnet_t *vnetp = (vnet_t *)arg;
2324 switch (type) {
2326 case MAC_RING_TYPE_RX:
2327 {
2328 vnet_pseudo_rx_group_t *rx_grp;
2330 /* We advertised only one RX group */
2331 ASSERT(index == 0);
2333 rx_grp = &vnetp->rx_grp[index];
2334 rx_grp->handle = handle;
2335 rx_grp->index = index;
2336 rx_grp->vnetp = vnetp;
2338 infop->mgi_driver = (mac_group_driver_t)rx_grp;
2339 infop->mgi_start = NULL;
2340 infop->mgi_stop = NULL;
2341 infop->mgi_addmac = vnet_addmac;
2342 infop->mgi_remmac = vnet_remmac;
2343 infop->mgi_count = rx_grp->ring_cnt;
2345 break;
2346 }
2348 case MAC_RING_TYPE_TX:
2349 {
2350 vnet_pseudo_tx_group_t *tx_grp;
2352 /* We advertised only one TX group */
2353 ASSERT(index == 0);
2355 tx_grp = &vnetp->tx_grp[index];
2356 tx_grp->handle = handle;
2357 tx_grp->index = index;
2358 tx_grp->vnetp = vnetp;
2360 infop->mgi_driver = (mac_group_driver_t)tx_grp;
2361 infop->mgi_start = NULL;
2362 infop->mgi_stop = NULL;
2363 infop->mgi_addmac = NULL;
2364 infop->mgi_remmac = NULL;
2365 infop->mgi_count = VNET_NUM_PSEUDO_TXRINGS;
2367 break;
2368 }
2370 default:
2371 break;
2372 }
2373 }
2376 static int
2377 vnet_rx_ring_start(mac_ring_driver_t arg, uint64_t mr_gen_num)
2378 {
2379 vnet_pseudo_rx_ring_t *rx_ringp = (vnet_pseudo_rx_ring_t *)arg;
2380 int err;
2382 /*
2383 * If this ring is mapped to an LDC resource, simply mark the state to
2384 * indicate the ring is started and return.
2385 */
2386 if ((rx_ringp->state &
2387 (VNET_RXRING_LDC_SERVICE|VNET_RXRING_LDC_GUEST)) != 0) {
2388 rx_ringp->gen_num = mr_gen_num;
2389 rx_ringp->state |= VNET_RXRING_STARTED;
2390 return (0);
2391 }
2393 ASSERT((rx_ringp->state & VNET_RXRING_HYBRID) != 0);
2395 /*
2396 * This must be a ring reserved for a hwring. If the hwring is not
2397 * bound yet, simply mark the state to indicate the ring is started and
2398 * return. If and when a hybrid resource is activated for this vnet
2399 * device, we will bind the hwring and start it then. If a hwring is
2400 * already bound, start it now.
2401 */
2402 if (rx_ringp->hw_rh == NULL) {
2403 rx_ringp->gen_num = mr_gen_num;
2404 rx_ringp->state |= VNET_RXRING_STARTED;
2405 return (0);
2406 }
2408 err = mac_hwring_start(rx_ringp->hw_rh);
2409 if (err == 0) {
2410 rx_ringp->gen_num = mr_gen_num;
2411 rx_ringp->state |= VNET_RXRING_STARTED;
2412 } else {
2413 err = ENXIO;
2414 }
2416 return (err);
2417 }
2419 static void
2420 vnet_rx_ring_stop(mac_ring_driver_t arg)
2421 {
2422 vnet_pseudo_rx_ring_t *rx_ringp = (vnet_pseudo_rx_ring_t *)arg;
2424 /*
2425 * If this ring is mapped to an LDC resource, simply mark the state to
2426 * indicate the ring is now stopped and return.
2427 */
2428 if ((rx_ringp->state &
2429 (VNET_RXRING_LDC_SERVICE|VNET_RXRING_LDC_GUEST)) != 0) {
2430 rx_ringp->state &= ~VNET_RXRING_STARTED;
2431 return;
2432 }
2434 ASSERT((rx_ringp->state & VNET_RXRING_HYBRID) != 0);
2436 /*
2437 * This must be a ring reserved for a hwring. If the hwring is not
2438 * bound yet, simply mark the state to indicate the ring is stopped and
2439 * return. If a hwring is already bound, stop it now.
2440 */
2441 if (rx_ringp->hw_rh == NULL) {
2442 rx_ringp->state &= ~VNET_RXRING_STARTED;
2443 return;
2444 }
2446 mac_hwring_stop(rx_ringp->hw_rh);
2447 rx_ringp->state &= ~VNET_RXRING_STARTED;
2448 }
2450 static int
2451 vnet_rx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
2452 {
2453 vnet_pseudo_rx_ring_t *rx_ringp = (vnet_pseudo_rx_ring_t *)rdriver;
2454 vnet_t *vnetp = (vnet_t *)rx_ringp->vnetp;
2455 vnet_res_t *vresp;
2456 mac_register_t *macp;
2457 mac_callbacks_t *cbp;
2459 /*
2460 * Refer to vnet_m_capab() for detailed comments on ring
2461 * synchronization.
2462 */
2463 if ((rx_ringp->state & VNET_RXRING_HYBRID) != 0) {
2464 READ_ENTER(&vnetp->vsw_fp_rw);
2465 if (vnetp->hio_fp == NULL) {
2466 RW_EXIT(&vnetp->vsw_fp_rw);
2467 return (0);
2468 }
2470 VNET_FDBE_REFHOLD(vnetp->hio_fp);
2471 RW_EXIT(&vnetp->vsw_fp_rw);
2472 (void) mac_hwring_getstat(rx_ringp->hw_rh, stat, val);
2473 VNET_FDBE_REFRELE(vnetp->hio_fp);
2474 return (0);
2475 }
2477 ASSERT((rx_ringp->state &
2478 (VNET_RXRING_LDC_SERVICE|VNET_RXRING_LDC_GUEST)) != 0);
2479 vresp = (vnet_res_t *)rx_ringp->hw_rh;
2480 macp = &vresp->macreg;
2481 cbp = macp->m_callbacks;
2483 cbp->mc_getstat(macp->m_driver, stat, val);
2485 return (0);
2486 }
2488 /* ARGSUSED */
2489 static int
2490 vnet_tx_ring_start(mac_ring_driver_t arg, uint64_t mr_gen_num)
2491 {
2492 vnet_pseudo_tx_ring_t *tx_ringp = (vnet_pseudo_tx_ring_t *)arg;
2494 tx_ringp->state |= VNET_TXRING_STARTED;
2495 return (0);
2496 }
2498 static void
2499 vnet_tx_ring_stop(mac_ring_driver_t arg)
2500 {
2501 vnet_pseudo_tx_ring_t *tx_ringp = (vnet_pseudo_tx_ring_t *)arg;
2503 tx_ringp->state &= ~VNET_TXRING_STARTED;
2504 }
2506 static int
2507 vnet_tx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
2508 {
2509 vnet_pseudo_tx_ring_t *tx_ringp = (vnet_pseudo_tx_ring_t *)rdriver;
2510 vnet_tx_ring_stats_t *statsp;
2512 statsp = &tx_ringp->tx_ring_stats;
2514 switch (stat) {
2515 case MAC_STAT_OPACKETS:
2516 *val = statsp->opackets;
2517 break;
2519 case MAC_STAT_OBYTES:
2520 *val = statsp->obytes;
2521 break;
2523 default:
2524 *val = 0;
2525 return (ENOTSUP);
2526 }
2528 return (0);
2529 }
2531 /*
2532 * Disable polling for a ring and enable its interrupt.
2533 */
2534 static int
2535 vnet_ring_enable_intr(void *arg)
2536 {
2537 vnet_pseudo_rx_ring_t *rx_ringp = (vnet_pseudo_rx_ring_t *)arg;
2538 vnet_res_t *vresp;
2540 if (rx_ringp->hw_rh == NULL) {
2541 /*
2542 * The ring enable intr entry point was invoked, but the ring
2543 * is not bound to any underlying resource. This must be a ring
2544 * reserved for a Hybrid resource, and no such resource has been
2545 * assigned to this vnet device yet; simply return success.
2546 */
2547 ASSERT((rx_ringp->state & VNET_RXRING_HYBRID) != 0);
2548 return (0);
2549 }
2551 /*
2552 * The rx ring has been bound to either an LDC or a Hybrid resource.
2553 * Call the appropriate function to enable interrupts for the ring.
2554 */
2555 if (rx_ringp->state & VNET_RXRING_HYBRID) {
2556 return (mac_hwring_enable_intr(rx_ringp->hw_rh));
2557 } else {
2558 vresp = (vnet_res_t *)rx_ringp->hw_rh;
2559 return (vgen_enable_intr(vresp->macreg.m_driver));
2560 }
2561 }
2563 /*
2564 * Enable polling for a ring and disable its interrupt.
2565 */
2566 static int
2567 vnet_ring_disable_intr(void *arg)
2568 {
2569 vnet_pseudo_rx_ring_t *rx_ringp = (vnet_pseudo_rx_ring_t *)arg;
2570 vnet_res_t *vresp;
2572 if (rx_ringp->hw_rh == NULL) {
2573 /*
2574 * The ring disable intr entry point was invoked, but the ring
2575 * is not bound to any underlying resource. This must be a ring
2576 * reserved for a Hybrid resource, and no such resource has been
2577 * assigned to this vnet device yet; simply return success.
2578 */
2579 ASSERT((rx_ringp->state & VNET_RXRING_HYBRID) != 0);
2580 return (0);
2581 }
2583 /*
2584 * The rx ring has been bound to either an LDC or a Hybrid resource.
2585 * Call the appropriate function to disable interrupts for the ring.
2586 */
2587 if (rx_ringp->state & VNET_RXRING_HYBRID) {
2588 return (mac_hwring_disable_intr(rx_ringp->hw_rh));
2589 } else {
2590 vresp = (vnet_res_t *)rx_ringp->hw_rh;
2591 return (vgen_disable_intr(vresp->macreg.m_driver));
2592 }
2593 }
2595 /*
2596 * Poll for up to 'bytes_to_pickup' bytes of packets from the rx ring.
2597 */
2598 static mblk_t *
2599 vnet_rx_poll(void *arg, int bytes_to_pickup)
2600 {
2601 vnet_pseudo_rx_ring_t *rx_ringp = (vnet_pseudo_rx_ring_t *)arg;
2602 mblk_t *mp = NULL;
2603 vnet_res_t *vresp;
2604 vnet_t *vnetp = rx_ringp->vnetp;
2606 if (rx_ringp->hw_rh == NULL) {
2607 return (NULL);
2608 }
2610 if (rx_ringp->state & VNET_RXRING_HYBRID) {
2611 mp = mac_hwring_poll(rx_ringp->hw_rh, bytes_to_pickup);
2612 /*
2613 * Packets received over a Hybrid resource need additional
2614 * processing to remove the tag, for the pvid case. The
2615 * underlying resource is not aware of the vnet's pvid, and thus
2616 * packets are received with the vlan tag in the header; unlike
2617 * packets received over an LDC channel, in which case the
2618 * peer vnet/vsw would have already removed the tag.
2619 */
2620 if (vnetp->pvid != vnetp->default_vlan_id) {
2621 vnet_rx_frames_untag(vnetp->pvid, &mp);
2622 }
2623 } else {
2624 vresp = (vnet_res_t *)rx_ringp->hw_rh;
2625 mp = vgen_rx_poll(vresp->macreg.m_driver, bytes_to_pickup);
2626 }
2627 return (mp);
2628 }
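/*
 * A sketch of how the mac layer is expected to drive the entry points
 * above when a ring moves between interrupt and poll mode (the exact
 * sequencing is owned by the mac SRS, not this driver):
 *
 *	vnet_ring_disable_intr(ringp);		switch ring to poll mode
 *	mp = vnet_rx_poll(ringp, budget);	SRS pulls queued packets
 *	...					repeated while under backlog
 *	vnet_ring_enable_intr(ringp);		resume interrupt delivery
 */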
2630 /* ARGSUSED */
2631 void
2632 vnet_hio_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
2633 boolean_t loopback)
2634 {
2635 vnet_t *vnetp = (vnet_t *)arg;
2636 vnet_pseudo_rx_ring_t *ringp = (vnet_pseudo_rx_ring_t *)mrh;
2638 /*
2639 * Packets received over a Hybrid resource need additional processing
2640 * to remove the tag, for the pvid case. The underlying resource is
2641 * not aware of the vnet's pvid, and thus packets are received with the
2642 * vlan tag in the header; unlike packets received over an LDC
2643 * channel, in which case the peer vnet/vsw would have already removed
2644 * the tag.
2645 */
2646 if (vnetp->pvid != vnetp->default_vlan_id) {
2647 vnet_rx_frames_untag(vnetp->pvid, &mp);
2648 if (mp == NULL) {
2649 return;
2650 }
2651 }
2652 mac_rx_ring(vnetp->mh, ringp->handle, mp, ringp->gen_num);
2653 }
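/*
 * mac_rx_ring() above delivers the chain against the pseudo rx ring's
 * handle; passing ringp->gen_num (saved in vnet_rx_ring_start()) lets
 * the mac layer recognize and drop packets that belong to a stale ring
 * generation after the ring has been restarted.
 */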
2655 static int
2656 vnet_addmac(void *arg, const uint8_t *mac_addr)
2657 {
2658 vnet_pseudo_rx_group_t *rx_grp = (vnet_pseudo_rx_group_t *)arg;
2659 vnet_t *vnetp;
2661 vnetp = rx_grp->vnetp;
2663 if (bcmp(mac_addr, vnetp->curr_macaddr, ETHERADDRL) == 0) {
2664 return (0);
2665 }
2667 cmn_err(CE_CONT, "!vnet%d: %s: Multiple macaddr unsupported\n",
2668 vnetp->instance, __func__);
2669 return (EINVAL);
2670 }
2672 static int
2673 vnet_remmac(void *arg, const uint8_t *mac_addr)
2674 {
2675 vnet_pseudo_rx_group_t *rx_grp = (vnet_pseudo_rx_group_t *)arg;
2676 vnet_t *vnetp;
2678 vnetp = rx_grp->vnetp;
2680 if (bcmp(mac_addr, vnetp->curr_macaddr, ETHERADDRL) == 0) {
2681 return (0);
2682 }
2684 cmn_err(CE_CONT, "!vnet%d: %s: Invalid macaddr: %s\n",
2685 vnetp->instance, __func__, ether_sprintf((void *)mac_addr));
2686 return (EINVAL);
2687 }
2689 static int
2690 vnet_hio_mac_init(vnet_t *vnetp, char *ifname)
2691 {
2692 mac_handle_t mh;
2693 mac_client_handle_t mch = NULL;
2694 mac_unicast_handle_t muh = NULL;
2695 mac_diag_t diag;
2696 mac_register_t *macp;
2697 char client_name[MAXNAMELEN];
2698 int rv;
2699 uint16_t mac_flags = MAC_UNICAST_TAG_DISABLE |
2700 MAC_UNICAST_STRIP_DISABLE | MAC_UNICAST_PRIMARY;
2701 vio_net_callbacks_t vcb;
2702 ether_addr_t rem_addr =
2703 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
2704 uint32_t retries = 0;
2706 if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
2707 return (EAGAIN);
2708 }
2710 do {
2711 rv = mac_open_by_linkname(ifname, &mh);
2712 if (rv == 0) {
2713 break;
2714 }
2715 if (rv != ENOENT || (retries++ >= vnet_mac_open_retries)) {
2716 mac_free(macp);
2717 return (rv);
2718 }
2719 drv_usecwait(vnet_mac_open_delay);
2720 } while (rv == ENOENT);
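/*
 * The ENOENT retry loop above covers the window where the Hybrid
 * device (e.g. nxge) has not yet finished attaching when we try to
 * open it by link name; vnet_mac_open_retries and vnet_mac_open_delay
 * are tunables defined earlier in this file.
 */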
2722 vnetp->hio_mh = mh;
2724 (void) snprintf(client_name, MAXNAMELEN, "vnet%d-%s", vnetp->instance,
2725 ifname);
2726 rv = mac_client_open(mh, &mch, client_name, MAC_OPEN_FLAGS_EXCLUSIVE);
2727 if (rv != 0) {
2728 goto fail;
2729 }
2730 vnetp->hio_mch = mch;
2732 rv = mac_unicast_add(mch, vnetp->curr_macaddr, mac_flags, &muh, 0,
2733 &diag);
2734 if (rv != 0) {
2735 goto fail;
2736 }
2737 vnetp->hio_muh = muh;
2739 macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
2740 macp->m_driver = vnetp;
2741 macp->m_dip = NULL;
2742 macp->m_src_addr = NULL;
2743 macp->m_callbacks = &vnet_hio_res_callbacks;
2744 macp->m_min_sdu = 0;
2745 macp->m_max_sdu = ETHERMTU;
2747 rv = vio_net_resource_reg(macp, VIO_NET_RES_HYBRID,
2748 vnetp->curr_macaddr, rem_addr, &vnetp->hio_vhp, &vcb);
2749 if (rv != 0) {
2750 goto fail;
2751 }
2752 mac_free(macp);
2754 /* add the recv callback */
2755 mac_rx_set(vnetp->hio_mch, vnet_hio_rx_cb, vnetp);
2757 return (0);
2759 fail:
2760 mac_free(macp);
2761 vnet_hio_mac_cleanup(vnetp);
2762 return (1);
2763 }
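/*
 * Every error path above funnels into the fail label, which relies on
 * vnet_hio_mac_cleanup() below tolerating partially initialized state:
 * each handle is checked against NULL before being torn down, so the
 * goto fail can be taken at any stage of the setup.
 */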
2765 void
2766 vnet_hio_mac_cleanup(vnet_t *vnetp)
2767 {
2768 if (vnetp->hio_vhp != NULL) {
2769 vio_net_resource_unreg(vnetp->hio_vhp);
2770 vnetp->hio_vhp = NULL;
2771 }
2773 if (vnetp->hio_muh != NULL) {
2774 (void) mac_unicast_remove(vnetp->hio_mch, vnetp->hio_muh);
2775 vnetp->hio_muh = NULL;
2776 }
2778 if (vnetp->hio_mch != NULL) {
2779 mac_client_close(vnetp->hio_mch, 0);
2780 vnetp->hio_mch = NULL;
2781 }
2783 if (vnetp->hio_mh != NULL) {
2784 mac_close(vnetp->hio_mh);
2785 vnetp->hio_mh = NULL;
2786 }
2787 }
2789 /* Bind pseudo rings to hwrings */
2790 static int
2791 vnet_bind_hwrings(vnet_t *vnetp)
2792 {
2793 mac_ring_handle_t hw_rh[VNET_NUM_HYBRID_RINGS];
2794 mac_perim_handle_t mph1;
2795 vnet_pseudo_rx_group_t *rx_grp;
2796 vnet_pseudo_rx_ring_t *rx_ringp;
2797 vnet_pseudo_tx_group_t *tx_grp;
2798 vnet_pseudo_tx_ring_t *tx_ringp;
2799 int hw_ring_cnt;
2800 int i;
2801 int rv;
2803 mac_perim_enter_by_mh(vnetp->hio_mh, &mph1);
2805 /* Get the list of the underlying RX rings. */
2806 hw_ring_cnt = mac_hwrings_get(vnetp->hio_mch, &vnetp->rx_hwgh, hw_rh,
2807 MAC_RING_TYPE_RX);
2809 /* We expect the # of hw rx rings to match VNET_NUM_HYBRID_RINGS */
2810 if (hw_ring_cnt != VNET_NUM_HYBRID_RINGS) {
2811 cmn_err(CE_WARN,
2812 "!vnet%d: vnet_bind_hwrings: bad rx hw_ring_cnt(%d)\n",
2813 vnetp->instance, hw_ring_cnt);
2814 goto fail;
2815 }
2817 if (vnetp->rx_hwgh != NULL) {
2818 /*
2819 * Quiesce the HW ring and the mac srs on the ring. Note
2820 * that the HW ring will be restarted when the pseudo ring
2821 * is started. At that time all the packets will be
2822 * directly passed up to the pseudo RX ring and handled
2823 * by the mac srs created over the pseudo RX ring.
2824 */
2825 mac_rx_client_quiesce(vnetp->hio_mch);
2826 mac_srs_perm_quiesce(vnetp->hio_mch, B_TRUE);
2827 }
2829 /*
2830 * Bind the pseudo rings to the hwrings and start the hwrings.
2831 * Note we don't need to register these with the upper mac, as we have
2832 * statically exported these pseudo rxrings which are reserved for
2833 * the rxrings of the Hybrid resource.
2834 */
2835 rx_grp = &vnetp->rx_grp[0];
2836 for (i = 0; i < VNET_NUM_HYBRID_RINGS; i++) {
2837 /* Pick the rxrings reserved for Hybrid resource */
2838 rx_ringp = &rx_grp->rings[i + VNET_HYBRID_RXRING_INDEX];
2840 /* Store the hw ring handle */
2841 rx_ringp->hw_rh = hw_rh[i];
2843 /* Bind the pseudo ring to the underlying hwring */
2844 mac_hwring_setup(rx_ringp->hw_rh,
2845 (mac_resource_handle_t)rx_ringp, NULL);
2847 /* Start the hwring if needed */
2848 if (rx_ringp->state & VNET_RXRING_STARTED) {
2849 rv = mac_hwring_start(rx_ringp->hw_rh);
2850 if (rv != 0) {
2851 mac_hwring_teardown(rx_ringp->hw_rh);
2852 rx_ringp->hw_rh = NULL;
2853 goto fail;
2854 }
2855 }
2856 }
2858 /* Get the list of the underlying TX rings. */
2859 hw_ring_cnt = mac_hwrings_get(vnetp->hio_mch, &vnetp->tx_hwgh, hw_rh,
2860 MAC_RING_TYPE_TX);
2862 /* We expect the # of hw tx rings to match VNET_NUM_HYBRID_RINGS */
2863 if (hw_ring_cnt != VNET_NUM_HYBRID_RINGS) {
2864 cmn_err(CE_WARN,
2865 "!vnet%d: vnet_bind_hwrings: bad tx hw_ring_cnt(%d)\n",
2866 vnetp->instance, hw_ring_cnt);
2867 goto fail;
2868 }
2870 /*
2871 * Now map the pseudo txrings to the hw txrings. Note we don't need
2872 * to register these with the upper mac, as we have statically exported
2873 * these rings. Note that these rings will continue to be used for LDC
2874 * resources to peer vnets and the vswitch (shared ring).
2875 */
2876 tx_grp = &vnetp->tx_grp[0];
2877 for (i = 0; i < tx_grp->ring_cnt; i++) {
2878 tx_ringp = &tx_grp->rings[i];
2879 tx_ringp->hw_rh = hw_rh[i];
2880 tx_ringp->state |= VNET_TXRING_HYBRID;
2881 }
2882 tx_grp->tx_notify_handle =
2883 mac_client_tx_notify(vnetp->hio_mch, vnet_tx_ring_update, vnetp);
2885 mac_perim_exit(mph1);
2886 return (0);
2888 fail:
2889 mac_perim_exit(mph1);
2890 vnet_unbind_hwrings(vnetp);
2891 return (1);
2892 }
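/*
 * As in vnet_hio_mac_init(), the fail path depends on the unbind
 * routine being safe on partial state: vnet_unbind_hwrings() below
 * checks each ring's binding before stopping and tearing down the
 * corresponding hwring.
 */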
2894 /* Unbind pseudo rings from hwrings */
2895 static void
2896 vnet_unbind_hwrings(vnet_t *vnetp)
2897 {
2898 mac_perim_handle_t mph1;
2899 vnet_pseudo_rx_ring_t *rx_ringp;
2900 vnet_pseudo_rx_group_t *rx_grp;
2901 vnet_pseudo_tx_group_t *tx_grp;
2902 vnet_pseudo_tx_ring_t *tx_ringp;
2903 int i;
2905 mac_perim_enter_by_mh(vnetp->hio_mh, &mph1);
2907 tx_grp = &vnetp->tx_grp[0];
2908 for (i = 0; i < VNET_NUM_HYBRID_RINGS; i++) {
2909 tx_ringp = &tx_grp->rings[i];
2910 if (tx_ringp->state & VNET_TXRING_HYBRID) {
2911 tx_ringp->state &= ~VNET_TXRING_HYBRID;
2912 tx_ringp->hw_rh = NULL;
2913 }
2914 }
2915 (void) mac_client_tx_notify(vnetp->hio_mch, NULL,
2916 tx_grp->tx_notify_handle);
2918 rx_grp = &vnetp->rx_grp[0];
2919 for (i = 0; i < VNET_NUM_HYBRID_RINGS; i++) {
2920 rx_ringp = &rx_grp->rings[i + VNET_HYBRID_RXRING_INDEX];
2921 if (rx_ringp->hw_rh != NULL) {
2922 /* Stop the hwring */
2923 mac_hwring_stop(rx_ringp->hw_rh);
2925 /* Teardown the hwring */
2926 mac_hwring_teardown(rx_ringp->hw_rh);
2927 rx_ringp->hw_rh = NULL;
2928 }
2929 }
2931 if (vnetp->rx_hwgh != NULL) {
2932 vnetp->rx_hwgh = NULL;
2933 /*
2934 * First clear the permanent-quiesced flag of the RX srs then
2935 * restart the HW ring and the mac srs on the ring.
2936 */
2937 mac_srs_perm_quiesce(vnetp->hio_mch, B_FALSE);
2938 mac_rx_client_restart(vnetp->hio_mch);
2939 }
2941 mac_perim_exit(mph1);
2942 }
2944 /* Bind a pseudo ring to an LDC resource */
2945 static int
2946 vnet_bind_vgenring(vnet_res_t *vresp)
2947 {
2948 vnet_t *vnetp;
2949 vnet_pseudo_rx_group_t *rx_grp;
2950 vnet_pseudo_rx_ring_t *rx_ringp;
2951 mac_perim_handle_t mph1;
2952 int rv;
2953 int type;
2955 vnetp = vresp->vnetp;
2956 type = vresp->type;
2957 rx_grp = &vnetp->rx_grp[0];
2959 if (type == VIO_NET_RES_LDC_SERVICE) {
2960 /*
2961 * Ring Index 0 is the default ring in the group and is
2962 * reserved for LDC_SERVICE in vnet_ring_grp_init(). This ring
2963 * is allocated statically and is reported to the mac layer
2964 * in vnet_m_capab(). So all we need to do here is save a
2965 * reference to the associated vresp.
2966 */
2967 rx_ringp = &rx_grp->rings[0];
2968 rx_ringp->hw_rh = (mac_ring_handle_t)vresp;
2969 vresp->rx_ringp = (void *)rx_ringp;
2970 return (0);
2971 }
2972 ASSERT(type == VIO_NET_RES_LDC_GUEST);
2974 mac_perim_enter_by_mh(vnetp->mh, &mph1);
2976 rx_ringp = vnet_alloc_pseudo_rx_ring(vnetp);
2977 if (rx_ringp == NULL) {
2978 cmn_err(CE_WARN, "!vnet%d: Failed to allocate pseudo rx ring",
2979 vnetp->instance);
2980 goto fail;
2981 }
2983 /* Store the LDC resource itself as the ring handle */
2984 rx_ringp->hw_rh = (mac_ring_handle_t)vresp;
2986 /*
2987 * Save a reference to the ring in the resource for lookup during
2988 * unbind. Note this is only done for LDC resources. We don't need this
2989 * in the case of a Hybrid resource (see vnet_bind_hwrings()), as its
2990 * rx rings are mapped to reserved pseudo rx rings (index 1 and 2).
2991 */
2992 vresp->rx_ringp = (void *)rx_ringp;
2993 rx_ringp->state |= VNET_RXRING_LDC_GUEST;
2995 /* Register the pseudo ring with upper-mac */
2996 rv = mac_group_add_ring(rx_grp->handle, rx_ringp->index);
2997 if (rv != 0) {
2998 rx_ringp->state &= ~VNET_RXRING_LDC_GUEST;
2999 rx_ringp->hw_rh = NULL;
3000 vnet_free_pseudo_rx_ring(vnetp, rx_ringp);
3001 goto fail;
3002 }
3004 mac_perim_exit(mph1);
3005 return (0);
3006 fail:
3007 mac_perim_exit(mph1);
3008 return (1);
3009 }
3011 /* Unbind a pseudo ring from an LDC resource */
3012 static void
3013 vnet_unbind_vgenring(vnet_res_t *vresp)
3014 {
3015 vnet_t *vnetp;
3016 vnet_pseudo_rx_group_t *rx_grp;
3017 vnet_pseudo_rx_ring_t *rx_ringp;
3018 mac_perim_handle_t mph1;
3019 int type;
3021 vnetp = vresp->vnetp;
3022 type = vresp->type;
3023 rx_grp = &vnetp->rx_grp[0];
3025 if (vresp->rx_ringp == NULL) {
3026 return;
3027 }
3029 if (type == VIO_NET_RES_LDC_SERVICE) {
3030 /*
3031 * Ring Index 0 is the default ring in the group and is
3032 * reserved for LDC_SERVICE in vnet_ring_grp_init(). This ring
3033 * is allocated statically and is reported to the mac layer
3034 * in vnet_m_capab(). So all we need to do here is remove its
3035 * reference to the associated vresp.
3036 */
3037 rx_ringp = &rx_grp->rings[0];
3038 rx_ringp->hw_rh = NULL;
3039 vresp->rx_ringp = NULL;
3040 return;
3041 }
3042 ASSERT(type == VIO_NET_RES_LDC_GUEST);
3044 mac_perim_enter_by_mh(vnetp->mh, &mph1);
3046 rx_ringp = (vnet_pseudo_rx_ring_t *)vresp->rx_ringp;
3047 vresp->rx_ringp = NULL;
3049 if (rx_ringp != NULL && (rx_ringp->state & VNET_RXRING_LDC_GUEST)) {
3050 /* Unregister the pseudo ring with upper-mac */
3051 mac_group_rem_ring(rx_grp->handle, rx_ringp->handle);
3053 rx_ringp->hw_rh = NULL;
3054 rx_ringp->state &= ~VNET_RXRING_LDC_GUEST;
3056 /* Free the pseudo rx ring */
3057 vnet_free_pseudo_rx_ring(vnetp, rx_ringp);
3058 }
3060 mac_perim_exit(mph1);
3061 }
3063 static void
3064 vnet_unbind_rings(vnet_res_t *vresp)
3065 {
3066 switch (vresp->type) {
3068 case VIO_NET_RES_LDC_SERVICE:
3069 case VIO_NET_RES_LDC_GUEST:
3070 vnet_unbind_vgenring(vresp);
3071 break;
3073 case VIO_NET_RES_HYBRID:
3074 vnet_unbind_hwrings(vresp->vnetp);
3075 break;
3077 default:
3078 break;
3079 }
3080 }
3083 static int
3084 vnet_bind_rings(vnet_res_t *vresp)
3085 {
3086 int rv;
3088 switch (vresp->type) {
3090 case VIO_NET_RES_LDC_SERVICE:
3091 case VIO_NET_RES_LDC_GUEST:
3092 rv = vnet_bind_vgenring(vresp);
3093 break;
3095 case VIO_NET_RES_HYBRID:
3096 rv = vnet_bind_hwrings(vresp->vnetp);
3097 break;
3099 default:
3100 rv = 1;
3101 break;
3102 }
3105 return (rv);
3106 }
3108 /* ARGSUSED */
3109 static int
3110 vnet_hio_stat(void *arg, uint_t stat, uint64_t *val)
3111 {
3112 vnet_t *vnetp = (vnet_t *)arg;
3114 *val = mac_stat_get(vnetp->hio_mh, stat);
3115 return (0);
3116 }
3118 /*
3119 * The start() and stop() routines for the Hybrid resource below are just
3120 * dummy functions. They are provided to avoid resource-type-specific code in
3121 * vnet_start_resources() and vnet_stop_resources(). The starting and stopping
3122 * of the Hybrid resource happens in the context of the mac_client interfaces
3123 * that are invoked in vnet_hio_mac_init() and vnet_hio_mac_cleanup().
3124 */
3125 /* ARGSUSED */
3126 static int
3127 vnet_hio_start(void *arg)
3128 {
3129 return (0);
3130 }
3132 /* ARGSUSED */
3133 static void
3134 vnet_hio_stop(void *arg)
3135 {
3136 }
3138 mblk_t *
3139 vnet_hio_tx(void *arg, mblk_t *mp)
3140 {
3141 vnet_pseudo_tx_ring_t *tx_ringp;
3142 mblk_t *nextp;
3143 mblk_t *ret_mp;
3145 tx_ringp = (vnet_pseudo_tx_ring_t *)arg;
3146 for (;;) {
3147 nextp = mp->b_next;
3148 mp->b_next = NULL;
3150 ret_mp = mac_hwring_tx(tx_ringp->hw_rh, mp);
3151 if (ret_mp != NULL) {
3152 ret_mp->b_next = nextp;
3153 mp = ret_mp;
3154 break;
3155 }
3157 if ((mp = nextp) == NULL)
3158 break;
3159 }
3160 return (mp);
3161 }
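/*
 * The loop above implements the usual GLDv3 tx backpressure contract
 * for the Hybrid path: mac_hwring_tx() returns NULL when a packet is
 * accepted, or the unsent mblk under flow control, in which case we
 * reattach the remainder of the chain and return it. The mac layer
 * then holds the chain and resumes once the tx notify callback
 * (vnet_tx_ring_update, registered in vnet_bind_hwrings()) signals
 * that the ring can accept more packets.
 */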
3163 #ifdef VNET_IOC_DEBUG
3165 /*
3166 * The ioctl entry point is currently used only for debugging; the ioctl
3167 * commands can be used to force the link state of the channel connected to vsw.
3168 */
3169 static void
3170 vnet_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
3171 {
3172 struct iocblk *iocp;
3173 vnet_t *vnetp;
3175 iocp = (struct iocblk *)(uintptr_t)mp->b_rptr;
3176 iocp->ioc_error = 0;
3177 vnetp = (vnet_t *)arg;
3179 if (vnetp == NULL) {
3180 miocnak(q, mp, 0, EINVAL);
3181 return;
3182 }
3184 switch (iocp->ioc_cmd) {
3186 case VNET_FORCE_LINK_DOWN:
3187 case VNET_FORCE_LINK_UP:
3188 vnet_force_link_state(vnetp, q, mp);
3189 break;
3191 default:
3192 iocp->ioc_error = EINVAL;
3193 miocnak(q, mp, 0, iocp->ioc_error);
3194 break;
3195 }
3196 }
3199 static void
3200 vnet_force_link_state(vnet_t *vnetp, queue_t *q, mblk_t *mp)
3201 {
3202 mac_register_t *macp;
3203 mac_callbacks_t *cbp;
3204 vnet_res_t *vresp;
3206 READ_ENTER(&vnetp->vsw_fp_rw);
3208 vresp = vnetp->vsw_fp;
3209 if (vresp == NULL) {
3210 RW_EXIT(&vnetp->vsw_fp_rw);
3211 return;
3212 }
3214 macp = &vresp->macreg;
3215 cbp = macp->m_callbacks;
3216 cbp->mc_ioctl(macp->m_driver, q, mp);
3218 RW_EXIT(&vnetp->vsw_fp_rw);
3219 }
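/*
 * vnet_force_link_state() forwards the ioctl to the vgen layer through
 * the vswitch resource's registered callbacks, holding vsw_fp_rw as a
 * reader so the resource cannot be unregistered mid-call.
 */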
3221 #else
3223 static void
3224 vnet_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
3225 {
3226 vnet_t *vnetp;
3228 vnetp = (vnet_t *)arg;
3230 if (vnetp == NULL) {
3231 miocnak(q, mp, 0, EINVAL);
3232 return;
3233 }
3235 /* ioctl support is provided only for debugging */
3236 miocnak(q, mp, 0, ENOTSUP);
3237 }
3239 #endif