kernel: remove unused utsname_set_machine()
[unleashed.git] / usr / src / uts / sun4v / io / vsw_ldc.c
blobee651e692bb210f6c72d1b60d9df77a532b45dee
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
26 #include <sys/types.h>
27 #include <sys/errno.h>
28 #include <sys/debug.h>
29 #include <sys/time.h>
30 #include <sys/sysmacros.h>
31 #include <sys/systm.h>
32 #include <sys/user.h>
33 #include <sys/stropts.h>
34 #include <sys/stream.h>
35 #include <sys/strlog.h>
36 #include <sys/strsubr.h>
37 #include <sys/cmn_err.h>
38 #include <sys/cpu.h>
39 #include <sys/kmem.h>
40 #include <sys/conf.h>
41 #include <sys/ddi.h>
42 #include <sys/sunddi.h>
43 #include <sys/ksynch.h>
44 #include <sys/stat.h>
45 #include <sys/kstat.h>
46 #include <sys/vtrace.h>
47 #include <sys/strsun.h>
48 #include <sys/dlpi.h>
49 #include <sys/ethernet.h>
50 #include <net/if.h>
51 #include <sys/varargs.h>
52 #include <sys/machsystm.h>
53 #include <sys/modctl.h>
54 #include <sys/modhash.h>
55 #include <sys/mac.h>
56 #include <sys/mac_ether.h>
57 #include <sys/taskq.h>
58 #include <sys/note.h>
59 #include <sys/mach_descrip.h>
60 #include <sys/mdeg.h>
61 #include <sys/ldc.h>
62 #include <sys/vsw_fdb.h>
63 #include <sys/vsw.h>
64 #include <sys/vio_mailbox.h>
65 #include <sys/vnet_mailbox.h>
66 #include <sys/vnet_common.h>
67 #include <sys/vio_util.h>
68 #include <sys/sdt.h>
69 #include <sys/atomic.h>
70 #include <sys/callb.h>
71 #include <sys/vlan.h>
73 /* Port add/deletion/etc routines */
74 static void vsw_port_delete(vsw_port_t *port);
75 static int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id);
76 static void vsw_ldc_detach(vsw_ldc_t *ldcp);
77 static int vsw_ldc_init(vsw_ldc_t *ldcp);
78 static void vsw_ldc_uninit(vsw_ldc_t *ldcp);
79 static void vsw_ldc_drain(vsw_ldc_t *ldcp);
80 static void vsw_drain_port_taskq(vsw_port_t *port);
81 static void vsw_marker_task(void *);
82 static int vsw_plist_del_node(vsw_t *, vsw_port_t *port);
83 void vsw_detach_ports(vsw_t *vswp);
84 int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
85 mcst_addr_t *vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr);
86 int vsw_port_detach(vsw_t *vswp, int p_instance);
87 int vsw_portsend(vsw_port_t *port, mblk_t *mp);
88 int vsw_port_attach(vsw_port_t *portp);
89 vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
90 void vsw_vlan_unaware_port_reset(vsw_port_t *portp);
91 void vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate);
92 void vsw_reset_ports(vsw_t *vswp);
93 void vsw_port_reset(vsw_port_t *portp);
94 void vsw_physlink_update_ports(vsw_t *vswp);
95 static void vsw_port_physlink_update(vsw_port_t *portp);
97 /* Interrupt routines */
98 static uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg);
100 /* Handshake routines */
101 static void vsw_ldc_reinit(vsw_ldc_t *);
102 static void vsw_conn_task(void *);
103 static int vsw_check_flag(vsw_ldc_t *, int, uint64_t);
104 static void vsw_next_milestone(vsw_ldc_t *);
105 static int vsw_supported_version(vio_ver_msg_t *);
106 static void vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp);
107 static void vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp);
108 void vsw_process_conn_evt(vsw_ldc_t *, uint16_t);
110 /* Data processing routines */
111 void vsw_process_pkt(void *);
112 static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t *, int);
113 static void vsw_process_ctrl_pkt(void *);
114 static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
115 static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
116 static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
117 static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
118 static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
119 static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
120 static void vsw_process_physlink_msg(vsw_ldc_t *, void *);
121 static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *,
122 uint32_t);
123 static void vsw_process_pkt_data_nop(void *, void *, uint32_t);
124 static void vsw_process_pkt_data(void *, void *, uint32_t);
125 static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
126 static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *);
127 static void vsw_process_evt_read(vsw_ldc_t *ldcp);
128 static void vsw_ldc_rcv(vsw_ldc_t *ldcp);
130 /* Switching/data transmit routines */
131 static int vsw_descrsend(vsw_ldc_t *, mblk_t *);
132 static void vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp);
133 static int vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries);
134 static int vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);
135 static int vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);
137 /* Packet creation routines */
138 static void vsw_send_ver(void *);
139 static void vsw_send_attr(vsw_ldc_t *);
140 static void vsw_send_dring_info(vsw_ldc_t *);
141 static void vsw_send_rdx(vsw_ldc_t *);
142 static void vsw_send_physlink_msg(vsw_ldc_t *ldcp, link_state_t plink_state);
144 /* Dring routines */
145 static void vsw_create_privring(vsw_ldc_t *);
146 static dring_info_t *vsw_map_dring(vsw_ldc_t *ldcp, void *pkt);
147 static void vsw_unmap_dring(vsw_ldc_t *ldcp);
148 static void vsw_destroy_dring(vsw_ldc_t *ldcp);
149 static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
150 static int vsw_map_data(vsw_ldc_t *ldcp, dring_info_t *dp, void *pkt);
151 static void vsw_set_lane_attr(vsw_t *, lane_t *);
152 dring_info_t *vsw_map_dring_cmn(vsw_ldc_t *ldcp,
153 vio_dring_reg_msg_t *dring_pkt);
154 static int vsw_mapin_avail(vsw_ldc_t *ldcp);
156 /* tx/msg/rcv thread routines */
157 static void vsw_stop_tx_thread(vsw_ldc_t *ldcp);
158 static void vsw_ldc_tx_worker(void *arg);
160 /* Misc support routines */
161 static void vsw_save_lmacaddr(vsw_t *vswp, uint64_t macaddr);
162 static int vsw_get_same_dest_list(struct ether_header *ehp,
163 mblk_t **rhead, mblk_t **rtail, mblk_t **mpp);
164 static mblk_t *vsw_dupmsgchain(mblk_t *mp);
166 /* Debugging routines */
167 static void dump_flags(uint64_t);
168 static void display_state(void);
169 static void display_lane(lane_t *);
170 static void display_ring(dring_info_t *);
173 * Functions imported from other files.
175 extern int vsw_set_hw(vsw_t *, vsw_port_t *, int);
176 extern void vsw_unset_hw(vsw_t *, vsw_port_t *, int);
177 extern int vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port);
178 extern void vsw_del_mcst_port(vsw_port_t *port);
179 extern int vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
180 extern int vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
181 extern void vsw_fdbe_add(vsw_t *vswp, void *port);
182 extern void vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr);
183 extern void vsw_create_vlans(void *arg, int type);
184 extern void vsw_destroy_vlans(void *arg, int type);
185 extern void vsw_vlan_add_ids(void *arg, int type);
186 extern void vsw_vlan_remove_ids(void *arg, int type);
187 extern boolean_t vsw_frame_lookup_vid(void *arg, int caller,
188 struct ether_header *ehp, uint16_t *vidp);
189 extern mblk_t *vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp);
190 extern uint32_t vsw_vlan_frame_untag(void *arg, int type, mblk_t **np,
191 mblk_t **npt);
192 extern boolean_t vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid);
193 extern void vsw_hio_start(vsw_t *vswp, vsw_ldc_t *ldcp);
194 extern void vsw_hio_stop(vsw_t *vswp, vsw_ldc_t *ldcp);
195 extern void vsw_process_dds_msg(vsw_t *vswp, vsw_ldc_t *ldcp, void *msg);
196 extern void vsw_hio_stop_port(vsw_port_t *portp);
197 extern void vsw_publish_macaddr(vsw_t *vswp, vsw_port_t *portp);
198 extern int vsw_mac_client_init(vsw_t *vswp, vsw_port_t *port, int type);
199 extern void vsw_mac_client_cleanup(vsw_t *vswp, vsw_port_t *port, int type);
200 extern void vsw_destroy_rxpools(void *arg);
201 extern void vsw_stop_msg_thread(vsw_ldc_t *ldcp);
202 extern int vsw_send_msg(vsw_ldc_t *, void *, int, boolean_t);
203 extern int vsw_dringsend(vsw_ldc_t *, mblk_t *);
204 extern int vsw_reclaim_dring(dring_info_t *dp, int start);
205 extern int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
206 int *);
207 extern vio_dring_reg_msg_t *vsw_create_tx_dring_info(vsw_ldc_t *);
208 extern int vsw_setup_tx_dring(vsw_ldc_t *ldcp, dring_info_t *dp);
209 extern void vsw_destroy_tx_dring(vsw_ldc_t *ldcp);
210 extern dring_info_t *vsw_map_rx_dring(vsw_ldc_t *ldcp, void *pkt);
211 extern void vsw_unmap_rx_dring(vsw_ldc_t *ldcp);
212 extern void vsw_ldc_msg_worker(void *arg);
213 extern void vsw_process_dringdata(void *, void *);
214 extern vio_dring_reg_msg_t *vsw_create_rx_dring_info(vsw_ldc_t *);
215 extern void vsw_destroy_rx_dring(vsw_ldc_t *ldcp);
216 extern dring_info_t *vsw_map_tx_dring(vsw_ldc_t *ldcp, void *pkt);
217 extern void vsw_unmap_tx_dring(vsw_ldc_t *ldcp);
218 extern void vsw_ldc_rcv_worker(void *arg);
219 extern void vsw_stop_rcv_thread(vsw_ldc_t *ldcp);
220 extern int vsw_dringsend_shm(vsw_ldc_t *, mblk_t *);
221 extern void vsw_process_dringdata_shm(void *, void *);
224 * Tunables used in this file.
226 extern int vsw_num_handshakes;
227 extern int vsw_ldc_tx_delay;
228 extern int vsw_ldc_tx_retries;
229 extern int vsw_ldc_retries;
230 extern int vsw_ldc_delay;
231 extern boolean_t vsw_ldc_rxthr_enabled;
232 extern boolean_t vsw_ldc_txthr_enabled;
233 extern uint32_t vsw_num_descriptors;
234 extern uint8_t vsw_dring_mode;
235 extern uint32_t vsw_max_tx_qcount;
236 extern boolean_t vsw_obp_ver_proto_workaround;
237 extern uint32_t vsw_publish_macaddr_count;
238 extern uint32_t vsw_nrbufs_factor;
/*
 * Acquire / release the three per-channel locks as a unit.
 *
 * The locks are taken in the order callback -> rx -> tx and dropped in the
 * reverse order.  Each macro is wrapped in do { } while (0) so that it
 * expands to exactly one statement and is safe under an unbraced if/else;
 * always invoke it with a trailing semicolon.
 */
#define	LDC_ENTER_LOCK(ldcp)	\
	do {	\
		mutex_enter(&((ldcp)->ldc_cblock));	\
		mutex_enter(&((ldcp)->ldc_rxlock));	\
		mutex_enter(&((ldcp)->ldc_txlock));	\
	} while (0)
#define	LDC_EXIT_LOCK(ldcp)	\
	do {	\
		mutex_exit(&((ldcp)->ldc_txlock));	\
		mutex_exit(&((ldcp)->ldc_rxlock));	\
		mutex_exit(&((ldcp)->ldc_cblock));	\
	} while (0)
/*
 * Compare the protocol version negotiated on a channel's outbound lane
 * against (major, minor).  The major number is compared first; the minor
 * number only decides the result when the major numbers are equal.
 * Arguments may be evaluated more than once, so pass side-effect free
 * expressions.
 */
#define	VSW_VER_EQ(ldcp, major, minor)	\
	(!((ldcp)->lane_out.ver_major != (major) ||	\
	(ldcp)->lane_out.ver_minor != (minor)))

#define	VSW_VER_LT(ldcp, major, minor)	\
	(((ldcp)->lane_out.ver_major != (major)) ?	\
	((ldcp)->lane_out.ver_major < (major)) :	\
	((ldcp)->lane_out.ver_minor < (minor)))

#define	VSW_VER_GTEQ(ldcp, major, minor)	\
	(((ldcp)->lane_out.ver_major != (major)) ?	\
	((ldcp)->lane_out.ver_major > (major)) :	\
	((ldcp)->lane_out.ver_minor >= (minor)))

#define	VSW_VER_LTEQ(ldcp, major, minor)	\
	(((ldcp)->lane_out.ver_major != (major)) ?	\
	((ldcp)->lane_out.ver_major < (major)) :	\
	((ldcp)->lane_out.ver_minor <= (minor)))
269 * VIO Protocol Version Info:
271 * The version specified below represents the version of protocol currently
272 * supported in the driver. It means the driver can negotiate with peers with
273 * versions <= this version. Here is a summary of the feature(s) that are
274 * supported at each version of the protocol:
276 * 1.0 Basic VIO protocol.
277 * 1.1 vDisk protocol update (no virtual network update).
278 * 1.2 Support for priority frames (priority-ether-types).
279 * 1.3 VLAN and HybridIO support.
280 * 1.4 Jumbo Frame support.
281 * 1.5 Link State Notification support with optional support
282 * for Physical Link information.
283 * 1.6 Support for RxDringData mode.
285 static ver_sup_t vsw_versions[] = { {1, 6} };
/*
 * The state dump routines are controlled by their own private flag.
 * With DUMP_STATE set to 0 all of the dump macros expand to nothing.
 */
#define	DUMP_STATE	0

#if DUMP_STATE

/* Print the three header fields of a message tag passed by value. */
#define	DUMP_TAG(tag)	\
{	\
	D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype);	\
	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype);	\
	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env);	\
}

/* Same as DUMP_TAG(), but for a pointer to a message tag. */
#define	DUMP_TAG_PTR(tag)	\
{	\
	D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype);	\
	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype);	\
	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env);	\
}

#define	DUMP_FLAGS(flags)	dump_flags(flags);
#define	DISPLAY_STATE()		display_state()

#else	/* !DUMP_STATE */

#define	DUMP_TAG(tag)
#define	DUMP_TAG_PTR(tag)
#define	DUMP_FLAGS(flags)
#define	DISPLAY_STATE()

#endif	/* DUMP_STATE */
322 * Attach the specified port.
324 * Returns 0 on success, 1 on failure.
327 vsw_port_attach(vsw_port_t *port)
329 vsw_t *vswp = port->p_vswp;
330 vsw_port_list_t *plist = &vswp->plist;
331 vsw_port_t *p, **pp;
332 int nids = port->num_ldcs;
333 uint64_t *ldcids;
334 int rv;
336 D1(vswp, "%s: enter : port %d", __func__, port->p_instance);
338 /* port already exists? */
339 READ_ENTER(&plist->lockrw);
340 for (p = plist->head; p != NULL; p = p->p_next) {
341 if (p->p_instance == port->p_instance) {
342 DWARN(vswp, "%s: port instance %d already attached",
343 __func__, p->p_instance);
344 RW_EXIT(&plist->lockrw);
345 return (1);
348 RW_EXIT(&plist->lockrw);
350 mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL);
351 mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL);
352 rw_init(&port->maccl_rwlock, NULL, RW_DRIVER, NULL);
354 mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL);
355 cv_init(&port->state_cv, NULL, CV_DRIVER, NULL);
356 port->state = VSW_PORT_INIT;
358 D2(vswp, "%s: %d nids", __func__, nids);
359 ldcids = port->ldc_ids;
360 D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[0]);
361 if (vsw_ldc_attach(port, (uint64_t)ldcids[0]) != 0) {
362 DERR(vswp, "%s: ldc_attach failed", __func__);
363 goto exit_error;
366 if (vswp->switching_setup_done == B_TRUE) {
368 * If the underlying network device has been setup,
369 * then open a mac client and porgram the mac address
370 * for this port.
372 rv = vsw_mac_client_init(vswp, port, VSW_VNETPORT);
373 if (rv != 0) {
374 goto exit_error;
378 /* create the fdb entry for this port/mac address */
379 vsw_fdbe_add(vswp, port);
381 vsw_create_vlans(port, VSW_VNETPORT);
383 WRITE_ENTER(&plist->lockrw);
385 /* link it into the list of ports for this vsw instance */
386 pp = (vsw_port_t **)(&plist->head);
387 port->p_next = *pp;
388 *pp = port;
389 plist->num_ports++;
391 RW_EXIT(&plist->lockrw);
394 * Initialise the port and any ldc's under it.
396 (void) vsw_ldc_init(port->ldcp);
398 /* announce macaddr of vnet to the physical switch */
399 if (vsw_publish_macaddr_count != 0) { /* enabled */
400 vsw_publish_macaddr(vswp, port);
403 D1(vswp, "%s: exit", __func__);
404 return (0);
406 exit_error:
408 cv_destroy(&port->state_cv);
409 mutex_destroy(&port->state_lock);
411 rw_destroy(&port->maccl_rwlock);
412 mutex_destroy(&port->tx_lock);
413 mutex_destroy(&port->mca_lock);
414 kmem_free(port, sizeof (vsw_port_t));
415 return (1);
419 * Detach the specified port.
421 * Returns 0 on success, 1 on failure.
424 vsw_port_detach(vsw_t *vswp, int p_instance)
426 vsw_port_t *port = NULL;
427 vsw_port_list_t *plist = &vswp->plist;
429 D1(vswp, "%s: enter: port id %d", __func__, p_instance);
431 WRITE_ENTER(&plist->lockrw);
433 if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) {
434 RW_EXIT(&plist->lockrw);
435 return (1);
438 if (vsw_plist_del_node(vswp, port)) {
439 RW_EXIT(&plist->lockrw);
440 return (1);
443 /* cleanup any HybridIO for this port */
444 vsw_hio_stop_port(port);
447 * No longer need to hold writer lock on port list now
448 * that we have unlinked the target port from the list.
450 RW_EXIT(&plist->lockrw);
452 /* Cleanup and close the mac client */
453 vsw_mac_client_cleanup(vswp, port, VSW_VNETPORT);
455 /* Remove the fdb entry for this port/mac address */
456 vsw_fdbe_del(vswp, &(port->p_macaddr));
457 vsw_destroy_vlans(port, VSW_VNETPORT);
459 /* Remove any multicast addresses.. */
460 vsw_del_mcst_port(port);
462 vsw_port_delete(port);
464 D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance);
465 return (0);
469 * Detach all active ports.
471 void
472 vsw_detach_ports(vsw_t *vswp)
474 vsw_port_list_t *plist = &vswp->plist;
475 vsw_port_t *port = NULL;
477 D1(vswp, "%s: enter", __func__);
479 WRITE_ENTER(&plist->lockrw);
481 while ((port = plist->head) != NULL) {
482 (void) vsw_plist_del_node(vswp, port);
484 /* cleanup any HybridIO for this port */
485 vsw_hio_stop_port(port);
487 /* Cleanup and close the mac client */
488 vsw_mac_client_cleanup(vswp, port, VSW_VNETPORT);
490 /* Remove the fdb entry for this port/mac address */
491 vsw_fdbe_del(vswp, &(port->p_macaddr));
492 vsw_destroy_vlans(port, VSW_VNETPORT);
494 /* Remove any multicast addresses.. */
495 vsw_del_mcst_port(port);
498 * No longer need to hold the lock on the port list
499 * now that we have unlinked the target port from the
500 * list.
502 RW_EXIT(&plist->lockrw);
503 vsw_port_delete(port);
504 WRITE_ENTER(&plist->lockrw);
506 RW_EXIT(&plist->lockrw);
508 D1(vswp, "%s: exit", __func__);
512 * Delete the specified port.
514 static void
515 vsw_port_delete(vsw_port_t *port)
517 vsw_t *vswp = port->p_vswp;
519 D1(vswp, "%s: enter : port id %d", __func__, port->p_instance);
521 vsw_ldc_uninit(port->ldcp);
524 * Wait for any pending ctrl msg tasks which reference this
525 * port to finish.
527 vsw_drain_port_taskq(port);
530 * Wait for any active callbacks to finish
532 vsw_ldc_drain(port->ldcp);
534 vsw_ldc_detach(port->ldcp);
536 rw_destroy(&port->maccl_rwlock);
537 mutex_destroy(&port->mca_lock);
538 mutex_destroy(&port->tx_lock);
540 cv_destroy(&port->state_cv);
541 mutex_destroy(&port->state_lock);
543 if (port->num_ldcs != 0) {
544 kmem_free(port->ldc_ids, port->num_ldcs * sizeof (uint64_t));
545 port->num_ldcs = 0;
548 if (port->nvids != 0) {
549 kmem_free(port->vids, sizeof (vsw_vlanid_t) * port->nvids);
552 kmem_free(port, sizeof (vsw_port_t));
554 D1(vswp, "%s: exit", __func__);
558 * Attach a logical domain channel (ldc) under a specified port.
560 * Returns 0 on success, 1 on failure.
562 static int
563 vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id)
565 vsw_t *vswp = port->p_vswp;
566 vsw_ldc_t *ldcp = NULL;
567 ldc_attr_t attr;
568 ldc_status_t istatus;
569 int status = DDI_FAILURE;
570 char kname[MAXNAMELEN];
571 enum { PROG_init = 0x0,
572 PROG_callback = 0x1,
573 PROG_tx_thread = 0x2}
574 progress;
576 progress = PROG_init;
578 D1(vswp, "%s: enter", __func__);
580 ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP);
581 if (ldcp == NULL) {
582 DERR(vswp, "%s: kmem_zalloc failed", __func__);
583 return (1);
585 ldcp->ldc_id = ldc_id;
587 mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL);
588 mutex_init(&ldcp->ldc_rxlock, NULL, MUTEX_DRIVER, NULL);
589 mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL);
590 ldcp->msg_thr_flags = 0;
591 mutex_init(&ldcp->msg_thr_lock, NULL, MUTEX_DRIVER, NULL);
592 cv_init(&ldcp->msg_thr_cv, NULL, CV_DRIVER, NULL);
593 ldcp->rcv_thr_flags = 0;
594 mutex_init(&ldcp->rcv_thr_lock, NULL, MUTEX_DRIVER, NULL);
595 cv_init(&ldcp->rcv_thr_cv, NULL, CV_DRIVER, NULL);
596 mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL);
597 cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL);
599 /* required for handshake with peer */
600 ldcp->local_session = (uint64_t)ddi_get_lbolt();
601 ldcp->peer_session = 0;
602 ldcp->session_status = 0;
603 ldcp->hss_id = 1; /* Initial handshake session id */
604 ldcp->hphase = VSW_MILESTONE0;
606 (void) atomic_swap_32(&port->p_hio_capable, B_FALSE);
608 /* only set for outbound lane, inbound set by peer */
609 vsw_set_lane_attr(vswp, &ldcp->lane_out);
611 attr.devclass = LDC_DEV_NT_SVC;
612 attr.instance = ddi_get_instance(vswp->dip);
613 attr.mode = LDC_MODE_UNRELIABLE;
614 attr.mtu = VSW_LDC_MTU;
615 status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle);
616 if (status != 0) {
617 DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)",
618 __func__, ldc_id, status);
619 goto ldc_attach_fail;
622 if (vsw_ldc_txthr_enabled) {
623 ldcp->tx_thr_flags = 0;
624 ldcp->tx_mhead = ldcp->tx_mtail = NULL;
626 mutex_init(&ldcp->tx_thr_lock, NULL, MUTEX_DRIVER, NULL);
627 cv_init(&ldcp->tx_thr_cv, NULL, CV_DRIVER, NULL);
628 ldcp->tx_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
629 vsw_ldc_tx_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
631 progress |= PROG_tx_thread;
632 if (ldcp->tx_thread == NULL) {
633 DWARN(vswp, "%s(%lld): Failed to create worker thread",
634 __func__, ldc_id);
635 goto ldc_attach_fail;
639 status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp);
640 if (status != 0) {
641 DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)",
642 __func__, ldc_id, status);
643 (void) ldc_fini(ldcp->ldc_handle);
644 goto ldc_attach_fail;
647 * allocate a message for ldc_read()s, big enough to hold ctrl and
648 * data msgs, including raw data msgs used to recv priority frames.
650 ldcp->msglen = VIO_PKT_DATA_HDRSIZE + vswp->max_frame_size;
651 ldcp->ldcmsg = kmem_alloc(ldcp->msglen, KM_SLEEP);
653 progress |= PROG_callback;
655 mutex_init(&ldcp->status_lock, NULL, MUTEX_DRIVER, NULL);
657 if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
658 DERR(vswp, "%s: ldc_status failed", __func__);
659 mutex_destroy(&ldcp->status_lock);
660 goto ldc_attach_fail;
663 ldcp->ldc_status = istatus;
664 ldcp->ldc_port = port;
665 ldcp->ldc_vswp = vswp;
667 vsw_reset_vnet_proto_ops(ldcp);
669 (void) sprintf(kname, "%sldc0x%lx", DRV_NAME, ldcp->ldc_id);
670 ldcp->ksp = vgen_setup_kstats(DRV_NAME, vswp->instance,
671 kname, &ldcp->ldc_stats);
672 if (ldcp->ksp == NULL) {
673 DERR(vswp, "%s: kstats setup failed", __func__);
674 goto ldc_attach_fail;
677 /* link it into this port */
678 port->ldcp = ldcp;
680 D1(vswp, "%s: exit", __func__);
681 return (0);
683 ldc_attach_fail:
685 if (progress & PROG_callback) {
686 (void) ldc_unreg_callback(ldcp->ldc_handle);
687 kmem_free(ldcp->ldcmsg, ldcp->msglen);
690 if (progress & PROG_tx_thread) {
691 if (ldcp->tx_thread != NULL) {
692 vsw_stop_tx_thread(ldcp);
694 mutex_destroy(&ldcp->tx_thr_lock);
695 cv_destroy(&ldcp->tx_thr_cv);
697 if (ldcp->ksp != NULL) {
698 vgen_destroy_kstats(ldcp->ksp);
700 mutex_destroy(&ldcp->msg_thr_lock);
701 mutex_destroy(&ldcp->rcv_thr_lock);
702 mutex_destroy(&ldcp->ldc_txlock);
703 mutex_destroy(&ldcp->ldc_rxlock);
704 mutex_destroy(&ldcp->ldc_cblock);
705 mutex_destroy(&ldcp->drain_cv_lock);
706 cv_destroy(&ldcp->msg_thr_cv);
707 cv_destroy(&ldcp->rcv_thr_cv);
708 cv_destroy(&ldcp->drain_cv);
710 kmem_free(ldcp, sizeof (vsw_ldc_t));
712 return (1);
716 * Detach a logical domain channel (ldc) belonging to a
717 * particular port.
719 static void
720 vsw_ldc_detach(vsw_ldc_t *ldcp)
722 int rv;
723 vsw_t *vswp = ldcp->ldc_port->p_vswp;
724 int retries = 0;
726 D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id);
728 /* Stop msg/rcv thread */
729 if (ldcp->rcv_thread != NULL) {
730 vsw_stop_rcv_thread(ldcp);
731 } else if (ldcp->msg_thread != NULL) {
732 vsw_stop_msg_thread(ldcp);
734 kmem_free(ldcp->ldcmsg, ldcp->msglen);
736 /* Stop the tx thread */
737 if (ldcp->tx_thread != NULL) {
738 vsw_stop_tx_thread(ldcp);
739 mutex_destroy(&ldcp->tx_thr_lock);
740 cv_destroy(&ldcp->tx_thr_cv);
741 if (ldcp->tx_mhead != NULL) {
742 freemsgchain(ldcp->tx_mhead);
743 ldcp->tx_mhead = ldcp->tx_mtail = NULL;
744 ldcp->tx_cnt = 0;
748 /* Destory kstats */
749 vgen_destroy_kstats(ldcp->ksp);
752 * Before we can close the channel we must release any mapped
753 * resources (e.g. drings).
755 vsw_free_lane_resources(ldcp, INBOUND);
756 vsw_free_lane_resources(ldcp, OUTBOUND);
759 * Close the channel, retry on EAAGIN.
761 while ((rv = ldc_close(ldcp->ldc_handle)) == EAGAIN) {
762 if (++retries > vsw_ldc_retries) {
763 break;
765 drv_usecwait(vsw_ldc_delay);
767 if (rv != 0) {
768 cmn_err(CE_NOTE,
769 "!vsw%d: Error(%d) closing the channel(0x%lx)\n",
770 vswp->instance, rv, ldcp->ldc_id);
773 (void) ldc_fini(ldcp->ldc_handle);
775 ldcp->ldc_status = LDC_INIT;
776 ldcp->ldc_handle = NULL;
777 ldcp->ldc_vswp = NULL;
779 mutex_destroy(&ldcp->msg_thr_lock);
780 mutex_destroy(&ldcp->rcv_thr_lock);
781 mutex_destroy(&ldcp->ldc_txlock);
782 mutex_destroy(&ldcp->ldc_rxlock);
783 mutex_destroy(&ldcp->ldc_cblock);
784 mutex_destroy(&ldcp->drain_cv_lock);
785 mutex_destroy(&ldcp->status_lock);
786 cv_destroy(&ldcp->msg_thr_cv);
787 cv_destroy(&ldcp->rcv_thr_cv);
788 cv_destroy(&ldcp->drain_cv);
790 kmem_free(ldcp, sizeof (vsw_ldc_t));
794 * Open and attempt to bring up the channel. Note that channel
795 * can only be brought up if peer has also opened channel.
797 * Returns 0 if can open and bring up channel, otherwise
798 * returns 1.
800 static int
801 vsw_ldc_init(vsw_ldc_t *ldcp)
803 vsw_t *vswp = ldcp->ldc_vswp;
804 ldc_status_t istatus = 0;
805 int rv;
807 D1(vswp, "%s: enter", __func__);
809 LDC_ENTER_LOCK(ldcp);
811 /* don't start at 0 in case clients don't like that */
812 ldcp->next_ident = 1;
814 rv = ldc_open(ldcp->ldc_handle);
815 if (rv != 0) {
816 DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)",
817 __func__, ldcp->ldc_id, rv);
818 LDC_EXIT_LOCK(ldcp);
819 return (1);
822 if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
823 DERR(vswp, "%s: unable to get status", __func__);
824 LDC_EXIT_LOCK(ldcp);
825 return (1);
827 } else if (istatus != LDC_OPEN && istatus != LDC_READY) {
828 DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY",
829 __func__, ldcp->ldc_id, istatus);
830 LDC_EXIT_LOCK(ldcp);
831 return (1);
834 mutex_enter(&ldcp->status_lock);
835 ldcp->ldc_status = istatus;
836 mutex_exit(&ldcp->status_lock);
838 rv = ldc_up(ldcp->ldc_handle);
839 if (rv != 0) {
841 * Not a fatal error for ldc_up() to fail, as peer
842 * end point may simply not be ready yet.
844 D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__,
845 ldcp->ldc_id, rv);
846 LDC_EXIT_LOCK(ldcp);
847 return (1);
851 * ldc_up() call is non-blocking so need to explicitly
852 * check channel status to see if in fact the channel
853 * is UP.
855 mutex_enter(&ldcp->status_lock);
856 if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) {
857 DERR(vswp, "%s: unable to get status", __func__);
858 mutex_exit(&ldcp->status_lock);
859 LDC_EXIT_LOCK(ldcp);
860 return (1);
864 if (ldcp->ldc_status == LDC_UP) {
865 D2(vswp, "%s: channel %ld now UP (%ld)", __func__,
866 ldcp->ldc_id, istatus);
867 mutex_exit(&ldcp->status_lock);
868 LDC_EXIT_LOCK(ldcp);
870 vsw_process_conn_evt(ldcp, VSW_CONN_UP);
871 return (0);
874 mutex_exit(&ldcp->status_lock);
875 LDC_EXIT_LOCK(ldcp);
877 D1(vswp, "%s: exit", __func__);
878 return (0);
881 /* disable callbacks on the channel */
882 static void
883 vsw_ldc_uninit(vsw_ldc_t *ldcp)
885 vsw_t *vswp = ldcp->ldc_vswp;
886 int rv;
888 D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id);
890 LDC_ENTER_LOCK(ldcp);
892 rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE);
893 if (rv != 0) {
894 cmn_err(CE_NOTE, "!vsw_ldc_uninit(%ld): error disabling "
895 "interrupts (rv = %d)\n", ldcp->ldc_id, rv);
898 mutex_enter(&ldcp->status_lock);
899 ldcp->ldc_status = LDC_INIT;
900 mutex_exit(&ldcp->status_lock);
902 LDC_EXIT_LOCK(ldcp);
904 D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id);
908 * Wait until the callback(s) associated with the ldcs under the specified
909 * port have completed.
911 * Prior to this function being invoked each channel under this port
912 * should have been quiesced via ldc_set_cb_mode(DISABLE).
914 * A short explaination of what we are doing below..
916 * The simplest approach would be to have a reference counter in
917 * the ldc structure which is increment/decremented by the callbacks as
918 * they use the channel. The drain function could then simply disable any
919 * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately
920 * there is a tiny window here - before the callback is able to get the lock
921 * on the channel it is interrupted and this function gets to execute. It
922 * sees that the ref count is zero and believes its free to delete the
923 * associated data structures.
925 * We get around this by taking advantage of the fact that before the ldc
926 * framework invokes a callback it sets a flag to indicate that there is a
927 * callback active (or about to become active). If when we attempt to
928 * unregister a callback when this active flag is set then the unregister
929 * will fail with EWOULDBLOCK.
931 * If the unregister fails we do a cv_timedwait. We will either be signaled
932 * by the callback as it is exiting (note we have to wait a short period to
933 * allow the callback to return fully to the ldc framework and it to clear
934 * the active flag), or by the timer expiring. In either case we again attempt
935 * the unregister. We repeat this until we can succesfully unregister the
936 * callback.
938 * The reason we use a cv_timedwait rather than a simple cv_wait is to catch
939 * the case where the callback has finished but the ldc framework has not yet
940 * cleared the active flag. In this case we would never get a cv_signal.
942 static void
943 vsw_ldc_drain(vsw_ldc_t *ldcp)
945 vsw_t *vswp = ldcp->ldc_port->p_vswp;
947 D1(vswp, "%s: enter", __func__);
950 * If we can unregister the channel callback then we
951 * know that there is no callback either running or
952 * scheduled to run for this channel so move on to next
953 * channel in the list.
955 mutex_enter(&ldcp->drain_cv_lock);
957 /* prompt active callbacks to quit */
958 ldcp->drain_state = VSW_LDC_DRAINING;
960 if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) {
961 D2(vswp, "%s: unreg callback for chan %ld", __func__,
962 ldcp->ldc_id);
963 mutex_exit(&ldcp->drain_cv_lock);
964 } else {
966 * If we end up here we know that either 1) a callback
967 * is currently executing, 2) is about to start (i.e.
968 * the ldc framework has set the active flag but
969 * has not actually invoked the callback yet, or 3)
970 * has finished and has returned to the ldc framework
971 * but the ldc framework has not yet cleared the
972 * active bit.
974 * Wait for it to finish.
976 while (ldc_unreg_callback(ldcp->ldc_handle) == EWOULDBLOCK) {
977 (void) cv_timedwait(&ldcp->drain_cv,
978 &ldcp->drain_cv_lock, ddi_get_lbolt() + hz);
981 mutex_exit(&ldcp->drain_cv_lock);
982 D2(vswp, "%s: unreg callback for chan %ld after "
983 "timeout", __func__, ldcp->ldc_id);
986 D1(vswp, "%s: exit", __func__);
990 * Wait until all tasks which reference this port have completed.
992 * Prior to this function being invoked each channel under this port
993 * should have been quiesced via ldc_set_cb_mode(DISABLE).
995 static void
996 vsw_drain_port_taskq(vsw_port_t *port)
998 vsw_t *vswp = port->p_vswp;
1000 D1(vswp, "%s: enter", __func__);
1003 * Mark the port as in the process of being detached, and
1004 * dispatch a marker task to the queue so we know when all
1005 * relevant tasks have completed.
1007 mutex_enter(&port->state_lock);
1008 port->state = VSW_PORT_DETACHING;
1010 if ((vswp->taskq_p == NULL) ||
1011 (ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task,
1012 port, DDI_NOSLEEP) != DDI_SUCCESS)) {
1013 cmn_err(CE_NOTE, "!vsw%d: unable to dispatch marker task",
1014 vswp->instance);
1015 mutex_exit(&port->state_lock);
1016 return;
1020 * Wait for the marker task to finish.
1022 while (port->state != VSW_PORT_DETACHABLE)
1023 cv_wait(&port->state_cv, &port->state_lock);
1025 mutex_exit(&port->state_lock);
1027 D1(vswp, "%s: exit", __func__);
1030 static void
1031 vsw_marker_task(void *arg)
1033 vsw_port_t *port = arg;
1034 vsw_t *vswp = port->p_vswp;
1036 D1(vswp, "%s: enter", __func__);
1038 mutex_enter(&port->state_lock);
1041 * No further tasks should be dispatched which reference
1042 * this port so ok to mark it as safe to detach.
1044 port->state = VSW_PORT_DETACHABLE;
1046 cv_signal(&port->state_cv);
1048 mutex_exit(&port->state_lock);
1050 D1(vswp, "%s: exit", __func__);
1053 vsw_port_t *
1054 vsw_lookup_port(vsw_t *vswp, int p_instance)
1056 vsw_port_list_t *plist = &vswp->plist;
1057 vsw_port_t *port;
1059 for (port = plist->head; port != NULL; port = port->p_next) {
1060 if (port->p_instance == p_instance) {
1061 D2(vswp, "vsw_lookup_port: found p_instance\n");
1062 return (port);
1066 return (NULL);
1069 void
1070 vsw_vlan_unaware_port_reset(vsw_port_t *portp)
1072 vsw_ldc_t *ldcp = portp->ldcp;
1074 mutex_enter(&ldcp->ldc_cblock);
1077 * If the peer is vlan_unaware(ver < 1.3), reset channel and terminate
1078 * the connection. See comments in vsw_set_vnet_proto_ops().
1080 if (ldcp->hphase == VSW_MILESTONE4 && VSW_VER_LT(ldcp, 1, 3) &&
1081 portp->nvids != 0) {
1082 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1085 mutex_exit(&ldcp->ldc_cblock);
1088 void
1089 vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate)
1091 vsw_ldc_t *ldcp = portp->ldcp;
1093 mutex_enter(&ldcp->ldc_cblock);
1096 * If the peer is HybridIO capable (ver >= 1.3), reset channel
1097 * to trigger re-negotiation, which inturn trigger HybridIO
1098 * setup/cleanup.
1100 if ((ldcp->hphase == VSW_MILESTONE4) &&
1101 (portp->p_hio_capable == B_TRUE)) {
1102 if (immediate == B_TRUE) {
1103 (void) ldc_down(ldcp->ldc_handle);
1104 } else {
1105 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1109 mutex_exit(&ldcp->ldc_cblock);
1112 void
1113 vsw_port_reset(vsw_port_t *portp)
1115 vsw_ldc_t *ldcp = portp->ldcp;
1117 mutex_enter(&ldcp->ldc_cblock);
1120 * reset channel and terminate the connection.
1122 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1124 mutex_exit(&ldcp->ldc_cblock);
1127 void
1128 vsw_reset_ports(vsw_t *vswp)
1130 vsw_port_list_t *plist = &vswp->plist;
1131 vsw_port_t *portp;
1133 READ_ENTER(&plist->lockrw);
1134 for (portp = plist->head; portp != NULL; portp = portp->p_next) {
1135 if ((portp->p_hio_capable) && (portp->p_hio_enabled)) {
1136 vsw_hio_stop_port(portp);
1138 vsw_port_reset(portp);
1140 RW_EXIT(&plist->lockrw);
1143 static void
1144 vsw_send_physlink_msg(vsw_ldc_t *ldcp, link_state_t plink_state)
1146 vnet_physlink_msg_t msg;
1147 vnet_physlink_msg_t *msgp = &msg;
1148 uint32_t physlink_info = 0;
1150 if (plink_state == LINK_STATE_UP) {
1151 physlink_info |= VNET_PHYSLINK_STATE_UP;
1152 } else {
1153 physlink_info |= VNET_PHYSLINK_STATE_DOWN;
1156 msgp->tag.vio_msgtype = VIO_TYPE_CTRL;
1157 msgp->tag.vio_subtype = VIO_SUBTYPE_INFO;
1158 msgp->tag.vio_subtype_env = VNET_PHYSLINK_INFO;
1159 msgp->tag.vio_sid = ldcp->local_session;
1160 msgp->physlink_info = physlink_info;
1162 (void) vsw_send_msg(ldcp, msgp, sizeof (msg), B_TRUE);
1165 static void
1166 vsw_port_physlink_update(vsw_port_t *portp)
1168 vsw_ldc_t *ldcp;
1169 vsw_t *vswp;
1171 vswp = portp->p_vswp;
1172 ldcp = portp->ldcp;
1174 mutex_enter(&ldcp->ldc_cblock);
1177 * If handshake has completed successfully and if the vnet device
1178 * has negotiated to get physical link state updates, send a message
1179 * with the current state.
1181 if (ldcp->hphase == VSW_MILESTONE4 && ldcp->pls_negotiated == B_TRUE) {
1182 vsw_send_physlink_msg(ldcp, vswp->phys_link_state);
1185 mutex_exit(&ldcp->ldc_cblock);
1188 void
1189 vsw_physlink_update_ports(vsw_t *vswp)
1191 vsw_port_list_t *plist = &vswp->plist;
1192 vsw_port_t *portp;
1194 READ_ENTER(&plist->lockrw);
1195 for (portp = plist->head; portp != NULL; portp = portp->p_next) {
1196 vsw_port_physlink_update(portp);
1198 RW_EXIT(&plist->lockrw);
1202 * Search for and remove the specified port from the port
1203 * list. Returns 0 if able to locate and remove port, otherwise
1204 * returns 1.
1206 static int
1207 vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port)
1209 vsw_port_list_t *plist = &vswp->plist;
1210 vsw_port_t *curr_p, *prev_p;
1212 if (plist->head == NULL)
1213 return (1);
1215 curr_p = prev_p = plist->head;
1217 while (curr_p != NULL) {
1218 if (curr_p == port) {
1219 if (prev_p == curr_p) {
1220 plist->head = curr_p->p_next;
1221 } else {
1222 prev_p->p_next = curr_p->p_next;
1224 plist->num_ports--;
1225 break;
1226 } else {
1227 prev_p = curr_p;
1228 curr_p = curr_p->p_next;
1231 return (0);
1235 * Interrupt handler for ldc messages.
1237 static uint_t
1238 vsw_ldc_cb(uint64_t event, caddr_t arg)
1240 vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
1241 vsw_t *vswp = ldcp->ldc_vswp;
1243 D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
1245 mutex_enter(&ldcp->ldc_cblock);
1246 ldcp->ldc_stats.callbacks++;
1248 mutex_enter(&ldcp->status_lock);
1249 if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) {
1250 mutex_exit(&ldcp->status_lock);
1251 mutex_exit(&ldcp->ldc_cblock);
1252 return (LDC_SUCCESS);
1254 mutex_exit(&ldcp->status_lock);
1256 if (event & LDC_EVT_UP) {
1258 * Channel has come up.
1260 D2(vswp, "%s: id(%ld) event(%llx) UP: status(%ld)",
1261 __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1263 vsw_process_conn_evt(ldcp, VSW_CONN_UP);
1265 ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
1268 if (event & LDC_EVT_READ) {
1270 * Data available for reading.
1272 D2(vswp, "%s: id(ld) event(%llx) data READ",
1273 __func__, ldcp->ldc_id, event);
1275 vsw_process_evt_read(ldcp);
1277 ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
1279 goto vsw_cb_exit;
1282 if (event & (LDC_EVT_DOWN | LDC_EVT_RESET)) {
1283 D2(vswp, "%s: id(%ld) event (%lx) DOWN/RESET: status(%ld)",
1284 __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1286 vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
1290 * Catch either LDC_EVT_WRITE which we don't support or any
1291 * unknown event.
1293 if (event &
1294 ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ)) {
1295 DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)",
1296 __func__, ldcp->ldc_id, event, ldcp->ldc_status);
1299 vsw_cb_exit:
1300 mutex_exit(&ldcp->ldc_cblock);
1303 * Let the drain function know we are finishing if it
1304 * is waiting.
1306 mutex_enter(&ldcp->drain_cv_lock);
1307 if (ldcp->drain_state == VSW_LDC_DRAINING)
1308 cv_signal(&ldcp->drain_cv);
1309 mutex_exit(&ldcp->drain_cv_lock);
1311 return (LDC_SUCCESS);
1315 * Reinitialise data structures associated with the channel.
1317 static void
1318 vsw_ldc_reinit(vsw_ldc_t *ldcp)
1320 vsw_t *vswp = ldcp->ldc_vswp;
1321 vsw_port_t *port;
1323 D1(vswp, "%s: enter", __func__);
1325 port = ldcp->ldc_port;
1327 D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__,
1328 ldcp->lane_in.lstate, ldcp->lane_out.lstate);
1330 vsw_free_lane_resources(ldcp, INBOUND);
1331 vsw_free_lane_resources(ldcp, OUTBOUND);
1333 ldcp->lane_in.lstate = 0;
1334 ldcp->lane_out.lstate = 0;
1337 * Remove parent port from any multicast groups
1338 * it may have registered with. Client must resend
1339 * multicast add command after handshake completes.
1341 vsw_del_mcst_port(port);
1343 ldcp->peer_session = 0;
1344 ldcp->session_status = 0;
1345 ldcp->hcnt = 0;
1346 ldcp->hphase = VSW_MILESTONE0;
1348 vsw_reset_vnet_proto_ops(ldcp);
1350 D1(vswp, "%s: exit", __func__);
1354 * Process a connection event.
1356 void
1357 vsw_process_conn_evt(vsw_ldc_t *ldcp, uint16_t evt)
1359 vsw_t *vswp = ldcp->ldc_vswp;
1360 vsw_conn_evt_t *conn = NULL;
1362 D1(vswp, "%s: enter", __func__);
1365 * Check if either a reset or restart event is pending
1366 * or in progress. If so just return.
1368 * A VSW_CONN_RESET event originates either with a LDC_RESET_EVT
1369 * being received by the callback handler, or a ECONNRESET error
1370 * code being returned from a ldc_read() or ldc_write() call.
1372 * A VSW_CONN_RESTART event occurs when some error checking code
1373 * decides that there is a problem with data from the channel,
1374 * and that the handshake should be restarted.
1376 if (((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART)) &&
1377 (ldstub((uint8_t *)&ldcp->reset_active)))
1378 return;
1381 * If it is an LDC_UP event we first check the recorded
1382 * state of the channel. If this is UP then we know that
1383 * the channel moving to the UP state has already been dealt
1384 * with and don't need to dispatch a new task.
1386 * The reason for this check is that when we do a ldc_up(),
1387 * depending on the state of the peer, we may or may not get
1388 * a LDC_UP event. As we can't depend on getting a LDC_UP evt
1389 * every time we do ldc_up() we explicitly check the channel
1390 * status to see has it come up (ldc_up() is asynch and will
1391 * complete at some undefined time), and take the appropriate
1392 * action.
1394 * The flip side of this is that we may get a LDC_UP event
1395 * when we have already seen that the channel is up and have
1396 * dealt with that.
1398 mutex_enter(&ldcp->status_lock);
1399 if (evt == VSW_CONN_UP) {
1400 if ((ldcp->ldc_status == LDC_UP) || (ldcp->reset_active != 0)) {
1401 mutex_exit(&ldcp->status_lock);
1402 return;
1405 mutex_exit(&ldcp->status_lock);
1408 * The transaction group id allows us to identify and discard
1409 * any tasks which are still pending on the taskq and refer
1410 * to the handshake session we are about to restart or reset.
1411 * These stale messages no longer have any real meaning.
1413 (void) atomic_inc_32(&ldcp->hss_id);
1415 ASSERT(vswp->taskq_p != NULL);
1417 if ((conn = kmem_zalloc(sizeof (vsw_conn_evt_t), KM_NOSLEEP)) == NULL) {
1418 cmn_err(CE_WARN, "!vsw%d: unable to allocate memory for"
1419 " connection event", vswp->instance);
1420 goto err_exit;
1423 conn->evt = evt;
1424 conn->ldcp = ldcp;
1426 if (ddi_taskq_dispatch(vswp->taskq_p, vsw_conn_task, conn,
1427 DDI_NOSLEEP) != DDI_SUCCESS) {
1428 cmn_err(CE_WARN, "!vsw%d: Can't dispatch connection task",
1429 vswp->instance);
1431 kmem_free(conn, sizeof (vsw_conn_evt_t));
1432 goto err_exit;
1435 D1(vswp, "%s: exit", __func__);
1436 return;
1438 err_exit:
1440 * Have mostly likely failed due to memory shortage. Clear the flag so
1441 * that future requests will at least be attempted and will hopefully
1442 * succeed.
1444 if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
1445 ldcp->reset_active = 0;
1449 * Deal with events relating to a connection. Invoked from a taskq.
1451 static void
1452 vsw_conn_task(void *arg)
1454 vsw_conn_evt_t *conn = (vsw_conn_evt_t *)arg;
1455 vsw_ldc_t *ldcp = NULL;
1456 vsw_port_t *portp;
1457 vsw_t *vswp = NULL;
1458 uint16_t evt;
1459 ldc_status_t curr_status;
1461 ldcp = conn->ldcp;
1462 evt = conn->evt;
1463 vswp = ldcp->ldc_vswp;
1464 portp = ldcp->ldc_port;
1466 D1(vswp, "%s: enter", __func__);
1468 /* can safely free now have copied out data */
1469 kmem_free(conn, sizeof (vsw_conn_evt_t));
1471 if (ldcp->rcv_thread != NULL) {
1472 vsw_stop_rcv_thread(ldcp);
1473 } else if (ldcp->msg_thread != NULL) {
1474 vsw_stop_msg_thread(ldcp);
1477 mutex_enter(&ldcp->status_lock);
1478 if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
1479 cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
1480 "channel %ld", vswp->instance, ldcp->ldc_id);
1481 mutex_exit(&ldcp->status_lock);
1482 return;
1486 * If we wish to restart the handshake on this channel, then if
1487 * the channel is UP we bring it DOWN to flush the underlying
1488 * ldc queue.
1490 if ((evt == VSW_CONN_RESTART) && (curr_status == LDC_UP))
1491 (void) ldc_down(ldcp->ldc_handle);
1493 if ((portp->p_hio_capable) && (portp->p_hio_enabled)) {
1494 vsw_hio_stop(vswp, ldcp);
1498 * re-init all the associated data structures.
1500 vsw_ldc_reinit(ldcp);
1503 * Bring the channel back up (note it does no harm to
1504 * do this even if the channel is already UP, Just
1505 * becomes effectively a no-op).
1507 (void) ldc_up(ldcp->ldc_handle);
1510 * Check if channel is now UP. This will only happen if
1511 * peer has also done a ldc_up().
1513 if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
1514 cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
1515 "channel %ld", vswp->instance, ldcp->ldc_id);
1516 mutex_exit(&ldcp->status_lock);
1517 return;
1520 ldcp->ldc_status = curr_status;
1522 /* channel UP so restart handshake by sending version info */
1523 if (curr_status == LDC_UP) {
1524 if (ldcp->hcnt++ > vsw_num_handshakes) {
1525 cmn_err(CE_WARN, "!vsw%d: exceeded number of permitted"
1526 " handshake attempts (%d) on channel %ld",
1527 vswp->instance, ldcp->hcnt, ldcp->ldc_id);
1528 mutex_exit(&ldcp->status_lock);
1529 return;
1532 if (vsw_obp_ver_proto_workaround == B_FALSE &&
1533 (ddi_taskq_dispatch(vswp->taskq_p, vsw_send_ver, ldcp,
1534 DDI_NOSLEEP) != DDI_SUCCESS)) {
1535 cmn_err(CE_WARN, "!vsw%d: Can't dispatch version task",
1536 vswp->instance);
1539 * Don't count as valid restart attempt if couldn't
1540 * send version msg.
1542 if (ldcp->hcnt > 0)
1543 ldcp->hcnt--;
1548 * Mark that the process is complete by clearing the flag.
1550 * Note is it possible that the taskq dispatch above may have failed,
1551 * most likely due to memory shortage. We still clear the flag so
1552 * future attempts will at least be attempted and will hopefully
1553 * succeed.
1555 if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
1556 ldcp->reset_active = 0;
1558 mutex_exit(&ldcp->status_lock);
1560 D1(vswp, "%s: exit", __func__);
1564 * returns 0 if legal for event signified by flag to have
1565 * occured at the time it did. Otherwise returns 1.
1568 vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag)
1570 vsw_t *vswp = ldcp->ldc_vswp;
1571 uint64_t state;
1572 uint64_t phase;
1574 if (dir == INBOUND)
1575 state = ldcp->lane_in.lstate;
1576 else
1577 state = ldcp->lane_out.lstate;
1579 phase = ldcp->hphase;
1581 switch (flag) {
1582 case VSW_VER_INFO_RECV:
1583 if (phase > VSW_MILESTONE0) {
1584 DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV"
1585 " when in state %d\n", ldcp->ldc_id, phase);
1586 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1587 return (1);
1589 break;
1591 case VSW_VER_ACK_RECV:
1592 case VSW_VER_NACK_RECV:
1593 if (!(state & VSW_VER_INFO_SENT)) {
1594 DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK or "
1595 "VER_NACK when in state %d\n", ldcp->ldc_id, phase);
1596 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1597 return (1);
1598 } else
1599 state &= ~VSW_VER_INFO_SENT;
1600 break;
1602 case VSW_ATTR_INFO_RECV:
1603 if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) {
1604 DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV"
1605 " when in state %d\n", ldcp->ldc_id, phase);
1606 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1607 return (1);
1609 break;
1611 case VSW_ATTR_ACK_RECV:
1612 case VSW_ATTR_NACK_RECV:
1613 if (!(state & VSW_ATTR_INFO_SENT)) {
1614 DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK"
1615 " or ATTR_NACK when in state %d\n",
1616 ldcp->ldc_id, phase);
1617 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1618 return (1);
1619 } else
1620 state &= ~VSW_ATTR_INFO_SENT;
1621 break;
1623 case VSW_DRING_INFO_RECV:
1624 if (phase < VSW_MILESTONE1) {
1625 DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV"
1626 " when in state %d\n", ldcp->ldc_id, phase);
1627 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1628 return (1);
1630 break;
1632 case VSW_DRING_ACK_RECV:
1633 case VSW_DRING_NACK_RECV:
1634 if (!(state & VSW_DRING_INFO_SENT)) {
1635 DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK "
1636 " or DRING_NACK when in state %d\n",
1637 ldcp->ldc_id, phase);
1638 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1639 return (1);
1640 } else
1641 state &= ~VSW_DRING_INFO_SENT;
1642 break;
1644 case VSW_RDX_INFO_RECV:
1645 if (phase < VSW_MILESTONE3) {
1646 DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV"
1647 " when in state %d\n", ldcp->ldc_id, phase);
1648 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1649 return (1);
1651 break;
1653 case VSW_RDX_ACK_RECV:
1654 case VSW_RDX_NACK_RECV:
1655 if (!(state & VSW_RDX_INFO_SENT)) {
1656 DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK or "
1657 "RDX_NACK when in state %d\n", ldcp->ldc_id, phase);
1658 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1659 return (1);
1660 } else
1661 state &= ~VSW_RDX_INFO_SENT;
1662 break;
1664 case VSW_MCST_INFO_RECV:
1665 if (phase < VSW_MILESTONE3) {
1666 DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV"
1667 " when in state %d\n", ldcp->ldc_id, phase);
1668 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
1669 return (1);
1671 break;
1673 default:
1674 DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)",
1675 ldcp->ldc_id, flag);
1676 return (1);
1679 if (dir == INBOUND)
1680 ldcp->lane_in.lstate = state;
1681 else
1682 ldcp->lane_out.lstate = state;
1684 D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id);
1686 return (0);
1689 void
1690 vsw_next_milestone(vsw_ldc_t *ldcp)
1692 vsw_t *vswp = ldcp->ldc_vswp;
1693 vsw_port_t *portp = ldcp->ldc_port;
1694 lane_t *lane_out = &ldcp->lane_out;
1695 lane_t *lane_in = &ldcp->lane_in;
1697 D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__,
1698 ldcp->ldc_id, ldcp->hphase);
1700 DUMP_FLAGS(lane_in->lstate);
1701 DUMP_FLAGS(lane_out->lstate);
1703 switch (ldcp->hphase) {
1705 case VSW_MILESTONE0:
1707 * If we haven't started to handshake with our peer,
1708 * start to do so now.
1710 if (lane_out->lstate == 0) {
1711 D2(vswp, "%s: (chan %lld) starting handshake "
1712 "with peer", __func__, ldcp->ldc_id);
1713 vsw_process_conn_evt(ldcp, VSW_CONN_UP);
1717 * Only way to pass this milestone is to have successfully
1718 * negotiated version info.
1720 if ((lane_in->lstate & VSW_VER_ACK_SENT) &&
1721 (lane_out->lstate & VSW_VER_ACK_RECV)) {
1723 D2(vswp, "%s: (chan %lld) leaving milestone 0",
1724 __func__, ldcp->ldc_id);
1726 vsw_set_vnet_proto_ops(ldcp);
1729 * Next milestone is passed when attribute
1730 * information has been successfully exchanged.
1732 ldcp->hphase = VSW_MILESTONE1;
1733 vsw_send_attr(ldcp);
1736 break;
1738 case VSW_MILESTONE1:
1740 * Only way to pass this milestone is to have successfully
1741 * negotiated attribute information, in both directions.
1743 if (!((lane_in->lstate & VSW_ATTR_ACK_SENT) &&
1744 (lane_out->lstate & VSW_ATTR_ACK_RECV))) {
1745 break;
1748 ldcp->hphase = VSW_MILESTONE2;
1751 * If the peer device has said it wishes to
1752 * use descriptor rings then we send it our ring
1753 * info, otherwise we just set up a private ring
1754 * which we use an internal buffer
1756 if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
1757 (lane_in->xfer_mode & VIO_DRING_MODE_V1_2)) ||
1758 (VSW_VER_LT(ldcp, 1, 2) &&
1759 (lane_in->xfer_mode == VIO_DRING_MODE_V1_0))) {
1760 vsw_send_dring_info(ldcp);
1761 break;
1765 * The peer doesn't operate in dring mode; we
1766 * can simply fallthru to the RDX phase from
1767 * here.
1769 /*FALLTHRU*/
1771 case VSW_MILESTONE2:
1773 * If peer has indicated in its attribute message that
1774 * it wishes to use descriptor rings then the only way
1775 * to pass this milestone is for us to have received
1776 * valid dring info.
1778 * If peer is not using descriptor rings then just fall
1779 * through.
1781 if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
1782 (lane_in->xfer_mode & VIO_DRING_MODE_V1_2)) ||
1783 (VSW_VER_LT(ldcp, 1, 2) &&
1784 (lane_in->xfer_mode ==
1785 VIO_DRING_MODE_V1_0))) {
1786 if (!(lane_in->lstate & VSW_DRING_ACK_SENT))
1787 break;
1790 D2(vswp, "%s: (chan %lld) leaving milestone 2",
1791 __func__, ldcp->ldc_id);
1793 ldcp->hphase = VSW_MILESTONE3;
1794 vsw_send_rdx(ldcp);
1795 break;
1797 case VSW_MILESTONE3:
1799 * Pass this milestone when all paramaters have been
1800 * successfully exchanged and RDX sent in both directions.
1802 * Mark the relevant lane as available to transmit data. In
1803 * RxDringData mode, lane_in is associated with transmit and
1804 * lane_out is associated with receive. It is the reverse in
1805 * TxDring mode.
1807 if ((lane_out->lstate & VSW_RDX_ACK_SENT) &&
1808 (lane_in->lstate & VSW_RDX_ACK_RECV)) {
1810 D2(vswp, "%s: (chan %lld) leaving milestone 3",
1811 __func__, ldcp->ldc_id);
1812 D2(vswp, "%s: ** handshake complete (0x%llx : "
1813 "0x%llx) **", __func__, lane_in->lstate,
1814 lane_out->lstate);
1815 if (lane_out->dring_mode == VIO_RX_DRING_DATA) {
1816 lane_in->lstate |= VSW_LANE_ACTIVE;
1817 } else {
1818 lane_out->lstate |= VSW_LANE_ACTIVE;
1820 ldcp->hphase = VSW_MILESTONE4;
1821 ldcp->hcnt = 0;
1822 DISPLAY_STATE();
1823 /* Start HIO if enabled and capable */
1824 if ((portp->p_hio_enabled) && (portp->p_hio_capable)) {
1825 D2(vswp, "%s: start HybridIO setup", __func__);
1826 vsw_hio_start(vswp, ldcp);
1829 if (ldcp->pls_negotiated == B_TRUE) {
1831 * The vnet device has negotiated to get phys
1832 * link updates. Now that the handshake with
1833 * the vnet device is complete, send an initial
1834 * update with the current physical link state.
1836 vsw_send_physlink_msg(ldcp,
1837 vswp->phys_link_state);
1840 } else {
1841 D2(vswp, "%s: still in milestone 3 (0x%llx : 0x%llx)",
1842 __func__, lane_in->lstate,
1843 lane_out->lstate);
1845 break;
1847 case VSW_MILESTONE4:
1848 D2(vswp, "%s: (chan %lld) in milestone 4", __func__,
1849 ldcp->ldc_id);
1850 break;
1852 default:
1853 DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__,
1854 ldcp->ldc_id, ldcp->hphase);
1857 D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id,
1858 ldcp->hphase);
1862 * Check if major version is supported.
1864 * Returns 0 if finds supported major number, and if necessary
1865 * adjusts the minor field.
1867 * Returns 1 if can't match major number exactly. Sets mjor/minor
1868 * to next lowest support values, or to zero if no other values possible.
1870 static int
1871 vsw_supported_version(vio_ver_msg_t *vp)
1873 int i;
1875 D1(NULL, "vsw_supported_version: enter");
1877 for (i = 0; i < VSW_NUM_VER; i++) {
1878 if (vsw_versions[i].ver_major == vp->ver_major) {
1880 * Matching or lower major version found. Update
1881 * minor number if necessary.
1883 if (vp->ver_minor > vsw_versions[i].ver_minor) {
1884 D2(NULL, "%s: adjusting minor value from %d "
1885 "to %d", __func__, vp->ver_minor,
1886 vsw_versions[i].ver_minor);
1887 vp->ver_minor = vsw_versions[i].ver_minor;
1890 return (0);
1894 * If the message contains a higher major version number, set
1895 * the message's major/minor versions to the current values
1896 * and return false, so this message will get resent with
1897 * these values.
1899 if (vsw_versions[i].ver_major < vp->ver_major) {
1900 D2(NULL, "%s: adjusting major and minor "
1901 "values to %d, %d\n",
1902 __func__, vsw_versions[i].ver_major,
1903 vsw_versions[i].ver_minor);
1904 vp->ver_major = vsw_versions[i].ver_major;
1905 vp->ver_minor = vsw_versions[i].ver_minor;
1906 return (1);
1910 /* No match was possible, zero out fields */
1911 vp->ver_major = 0;
1912 vp->ver_minor = 0;
1914 D1(NULL, "vsw_supported_version: exit");
1916 return (1);
1920 * Set vnet-protocol-version dependent functions based on version.
1922 static void
1923 vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp)
1925 vsw_t *vswp = ldcp->ldc_vswp;
1926 lane_t *lp = &ldcp->lane_out;
1929 * Setup the appropriate dring data processing routine and any
1930 * associated thread based on the version.
1932 * In versions < 1.6, we support only TxDring mode. In this mode, the
1933 * msg worker thread processes all types of VIO msgs (ctrl and data).
1935 * In versions >= 1.6, we also support RxDringData mode. In this mode,
1936 * the rcv worker thread processes dring data messages (msgtype:
1937 * VIO_TYPE_DATA, subtype: VIO_SUBTYPE_INFO, env: VIO_DRING_DATA). The
1938 * rest of the data messages (including acks) and ctrl messages are
1939 * handled directly by the callback (intr) thread.
1941 * However, for versions >= 1.6, we could still fallback to TxDring
1942 * mode. This could happen if RxDringData mode has been disabled (see
1943 * below) on this guest or on the peer guest. This info is determined
1944 * as part of attr exchange phase of handshake. Hence, we setup these
1945 * pointers for v1.6 after attr msg phase completes during handshake.
1947 if (VSW_VER_GTEQ(ldcp, 1, 6)) {
1949 * Set data dring mode for vsw_send_attr(). We setup msg worker
1950 * thread in TxDring mode or rcv worker thread in RxDringData
1951 * mode when attr phase of handshake completes.
1953 if (vsw_mapin_avail(ldcp) == B_TRUE) {
1954 lp->dring_mode = (VIO_RX_DRING_DATA | VIO_TX_DRING);
1955 } else {
1956 lp->dring_mode = VIO_TX_DRING;
1958 } else {
1959 lp->dring_mode = VIO_TX_DRING;
1963 * Setup the MTU for attribute negotiation based on the version.
1965 if (VSW_VER_GTEQ(ldcp, 1, 4)) {
1967 * If the version negotiated with peer is >= 1.4(Jumbo Frame
1968 * Support), set the mtu in our attributes to max_frame_size.
1970 lp->mtu = vswp->max_frame_size;
1971 } else if (VSW_VER_EQ(ldcp, 1, 3)) {
1973 * If the version negotiated with peer is == 1.3 (Vlan Tag
1974 * Support) set the attr.mtu to ETHERMAX + VLAN_TAGSZ.
1976 lp->mtu = ETHERMAX + VLAN_TAGSZ;
1977 } else {
1978 vsw_port_t *portp = ldcp->ldc_port;
1980 * Pre-1.3 peers expect max frame size of ETHERMAX.
1981 * We can negotiate that size with those peers provided only
1982 * pvid is defined for our peer and there are no vids. Then we
1983 * can send/recv only untagged frames of max size ETHERMAX.
1984 * Note that pvid of the peer can be different, as vsw has to
1985 * serve the vnet in that vlan even if itself is not assigned
1986 * to that vlan.
1988 if (portp->nvids == 0) {
1989 lp->mtu = ETHERMAX;
1994 * Setup version dependent data processing functions.
1996 if (VSW_VER_GTEQ(ldcp, 1, 2)) {
1997 /* Versions >= 1.2 */
1999 if (VSW_PRI_ETH_DEFINED(vswp)) {
2001 * enable priority routines and pkt mode only if
2002 * at least one pri-eth-type is specified in MD.
2004 ldcp->tx = vsw_ldctx_pri;
2005 ldcp->rx_pktdata = vsw_process_pkt_data;
2007 /* set xfer mode for vsw_send_attr() */
2008 lp->xfer_mode = VIO_PKT_MODE | VIO_DRING_MODE_V1_2;
2009 } else {
2010 /* no priority eth types defined in MD */
2012 ldcp->tx = vsw_ldctx;
2013 ldcp->rx_pktdata = vsw_process_pkt_data_nop;
2015 /* set xfer mode for vsw_send_attr() */
2016 lp->xfer_mode = VIO_DRING_MODE_V1_2;
2019 } else {
2020 /* Versions prior to 1.2 */
2022 vsw_reset_vnet_proto_ops(ldcp);
2027 * Reset vnet-protocol-version dependent functions to v1.0.
2029 static void
2030 vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp)
2032 lane_t *lp = &ldcp->lane_out;
2034 ldcp->tx = vsw_ldctx;
2035 ldcp->rx_pktdata = vsw_process_pkt_data_nop;
2037 /* set xfer mode for vsw_send_attr() */
2038 lp->xfer_mode = VIO_DRING_MODE_V1_0;
2041 static void
2042 vsw_process_evt_read(vsw_ldc_t *ldcp)
2044 if (ldcp->msg_thread != NULL) {
2046 * TxDring mode; wakeup message worker
2047 * thread to process the VIO messages.
2049 mutex_exit(&ldcp->ldc_cblock);
2050 mutex_enter(&ldcp->msg_thr_lock);
2051 if (!(ldcp->msg_thr_flags & VSW_WTHR_DATARCVD)) {
2052 ldcp->msg_thr_flags |= VSW_WTHR_DATARCVD;
2053 cv_signal(&ldcp->msg_thr_cv);
2055 mutex_exit(&ldcp->msg_thr_lock);
2056 mutex_enter(&ldcp->ldc_cblock);
2057 } else {
2059 * We invoke vsw_process_pkt() in the context of the LDC
2060 * callback (vsw_ldc_cb()) during handshake, until the dring
2061 * mode is negotiated. After the dring mode is negotiated, the
2062 * msgs are processed by the msg worker thread (above case) if
2063 * the dring mode is TxDring. Otherwise (in RxDringData mode)
2064 * we continue to process the msgs directly in the callback
2065 * context.
2067 vsw_process_pkt(ldcp);
2072 * Main routine for processing messages received over LDC.
2074 void
2075 vsw_process_pkt(void *arg)
2077 vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
2078 vsw_t *vswp = ldcp->ldc_vswp;
2079 size_t msglen;
2080 vio_msg_tag_t *tagp;
2081 uint64_t *ldcmsg;
2082 int rv = 0;
2085 D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
2087 ASSERT(MUTEX_HELD(&ldcp->ldc_cblock));
2089 ldcmsg = ldcp->ldcmsg;
2091 * If channel is up read messages until channel is empty.
2093 do {
2094 msglen = ldcp->msglen;
2095 rv = ldc_read(ldcp->ldc_handle, (caddr_t)ldcmsg, &msglen);
2097 if (rv != 0) {
2098 DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) len(%d)\n",
2099 __func__, ldcp->ldc_id, rv, msglen);
2102 /* channel has been reset */
2103 if (rv == ECONNRESET) {
2104 vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
2105 break;
2108 if (msglen == 0) {
2109 D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__,
2110 ldcp->ldc_id);
2111 break;
2114 D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__,
2115 ldcp->ldc_id, msglen);
2118 * Figure out what sort of packet we have gotten by
2119 * examining the msg tag, and then switch it appropriately.
2121 tagp = (vio_msg_tag_t *)ldcmsg;
2123 switch (tagp->vio_msgtype) {
2124 case VIO_TYPE_CTRL:
2125 vsw_dispatch_ctrl_task(ldcp, ldcmsg, tagp, msglen);
2126 break;
2127 case VIO_TYPE_DATA:
2128 vsw_process_data_pkt(ldcp, ldcmsg, tagp, msglen);
2129 break;
2130 case VIO_TYPE_ERR:
2131 vsw_process_err_pkt(ldcp, ldcmsg, tagp);
2132 break;
2133 default:
2134 DERR(vswp, "%s: Unknown tag(%lx) ", __func__,
2135 "id(%lx)\n", tagp->vio_msgtype, ldcp->ldc_id);
2136 break;
2138 } while (msglen);
2140 D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id);
2144 * Dispatch a task to process a VIO control message.
2146 static void
2147 vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t *tagp,
2148 int msglen)
2150 vsw_ctrl_task_t *ctaskp = NULL;
2151 vsw_port_t *port = ldcp->ldc_port;
2152 vsw_t *vswp = port->p_vswp;
2154 D1(vswp, "%s: enter", __func__);
2157 * We need to handle RDX ACK messages in-band as once they
2158 * are exchanged it is possible that we will get an
2159 * immediate (legitimate) data packet.
2161 if ((tagp->vio_subtype_env == VIO_RDX) &&
2162 (tagp->vio_subtype == VIO_SUBTYPE_ACK)) {
2164 if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_ACK_RECV))
2165 return;
2167 ldcp->lane_in.lstate |= VSW_RDX_ACK_RECV;
2168 D2(vswp, "%s (%ld) handling RDX_ACK in place "
2169 "(ostate 0x%llx : hphase %d)", __func__,
2170 ldcp->ldc_id, ldcp->lane_in.lstate, ldcp->hphase);
2171 vsw_next_milestone(ldcp);
2172 return;
2175 ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP);
2177 if (ctaskp == NULL) {
2178 DERR(vswp, "%s: unable to alloc space for ctrl msg", __func__);
2179 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2180 return;
2183 ctaskp->ldcp = ldcp;
2184 bcopy((def_msg_t *)cpkt, &ctaskp->pktp, msglen);
2185 ctaskp->hss_id = ldcp->hss_id;
2188 * Dispatch task to processing taskq if port is not in
2189 * the process of being detached.
2191 mutex_enter(&port->state_lock);
2192 if (port->state == VSW_PORT_INIT) {
2193 if ((vswp->taskq_p == NULL) ||
2194 (ddi_taskq_dispatch(vswp->taskq_p, vsw_process_ctrl_pkt,
2195 ctaskp, DDI_NOSLEEP) != DDI_SUCCESS)) {
2196 mutex_exit(&port->state_lock);
2197 DERR(vswp, "%s: unable to dispatch task to taskq",
2198 __func__);
2199 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2200 kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2201 return;
2203 } else {
2204 kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2205 DWARN(vswp, "%s: port %d detaching, not dispatching "
2206 "task", __func__, port->p_instance);
2209 mutex_exit(&port->state_lock);
2211 D2(vswp, "%s: dispatched task to taskq for chan %d", __func__,
2212 ldcp->ldc_id);
2213 D1(vswp, "%s: exit", __func__);
2217 * Process a VIO ctrl message. Invoked from taskq.
2219 static void
2220 vsw_process_ctrl_pkt(void *arg)
2222 vsw_ctrl_task_t *ctaskp = (vsw_ctrl_task_t *)arg;
2223 vsw_ldc_t *ldcp = ctaskp->ldcp;
2224 vsw_t *vswp = ldcp->ldc_vswp;
2225 vio_msg_tag_t tag;
2226 uint16_t env;
2228 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2230 bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t));
2231 env = tag.vio_subtype_env;
2233 /* stale pkt check */
2234 if (ctaskp->hss_id < ldcp->hss_id) {
2235 DWARN(vswp, "%s: discarding stale packet belonging to earlier"
2236 " (%ld) handshake session", __func__, ctaskp->hss_id);
2237 kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2238 return;
2241 /* session id check */
2242 if (ldcp->session_status & VSW_PEER_SESSION) {
2243 if (ldcp->peer_session != tag.vio_sid) {
2244 DERR(vswp, "%s (chan %d): invalid session id (%llx)",
2245 __func__, ldcp->ldc_id, tag.vio_sid);
2246 kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2247 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
2248 return;
2253 * Switch on vio_subtype envelope, then let lower routines
2254 * decide if its an INFO, ACK or NACK packet.
2256 switch (env) {
2257 case VIO_VER_INFO:
2258 vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp);
2259 break;
2260 case VIO_DRING_REG:
2261 vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp);
2262 break;
2263 case VIO_DRING_UNREG:
2264 vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp);
2265 break;
2266 case VIO_ATTR_INFO:
2267 vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp);
2268 break;
2269 case VNET_MCAST_INFO:
2270 vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp);
2271 break;
2272 case VIO_RDX:
2273 vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp);
2274 break;
2275 case VIO_DDS_INFO:
2276 vsw_process_dds_msg(vswp, ldcp, &ctaskp->pktp);
2277 break;
2279 case VNET_PHYSLINK_INFO:
2280 vsw_process_physlink_msg(ldcp, &ctaskp->pktp);
2281 break;
2282 default:
2283 DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
2286 kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
2287 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
2291 * Version negotiation. We can end up here either because our peer
2292 * has responded to a handshake message we have sent it, or our peer
2293 * has initiated a handshake with us. If its the former then can only
2294 * be ACK or NACK, if its the later can only be INFO.
2296 * If its an ACK we move to the next stage of the handshake, namely
2297 * attribute exchange. If its a NACK we see if we can specify another
2298 * version, if we can't we stop.
2300 * If it is an INFO we reset all params associated with communication
2301 * in that direction over this channel (remember connection is
2302 * essentially 2 independent simplex channels).
2304 void
2305 vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt)
2307 vio_ver_msg_t *ver_pkt;
2308 vsw_t *vswp = ldcp->ldc_vswp;
2310 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
2313 * We know this is a ctrl/version packet so
2314 * cast it into the correct structure.
2316 ver_pkt = (vio_ver_msg_t *)pkt;
2318 switch (ver_pkt->tag.vio_subtype) {
2319 case VIO_SUBTYPE_INFO:
2320 D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n");
2323 * Record the session id, which we will use from now
2324 * until we see another VER_INFO msg. Even then the
2325 * session id in most cases will be unchanged, execpt
2326 * if channel was reset.
2328 if ((ldcp->session_status & VSW_PEER_SESSION) &&
2329 (ldcp->peer_session != ver_pkt->tag.vio_sid)) {
2330 DERR(vswp, "%s: updating session id for chan %lld "
2331 "from %llx to %llx", __func__, ldcp->ldc_id,
2332 ldcp->peer_session, ver_pkt->tag.vio_sid);
2335 ldcp->peer_session = ver_pkt->tag.vio_sid;
2336 ldcp->session_status |= VSW_PEER_SESSION;
2338 /* Legal message at this time ? */
2339 if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV))
2340 return;
2343 * First check the device class. Currently only expect
2344 * to be talking to a network device. In the future may
2345 * also talk to another switch.
2347 if (ver_pkt->dev_class != VDEV_NETWORK) {
2348 DERR(vswp, "%s: illegal device class %d", __func__,
2349 ver_pkt->dev_class);
2351 ver_pkt->tag.vio_sid = ldcp->local_session;
2352 ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2354 DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2356 (void) vsw_send_msg(ldcp, (void *)ver_pkt,
2357 sizeof (vio_ver_msg_t), B_TRUE);
2359 ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
2360 vsw_next_milestone(ldcp);
2361 return;
2362 } else {
2363 ldcp->dev_class = ver_pkt->dev_class;
2367 * Now check the version.
2369 if (vsw_supported_version(ver_pkt) == 0) {
2371 * Support this major version and possibly
2372 * adjusted minor version.
2375 D2(vswp, "%s: accepted ver %d:%d", __func__,
2376 ver_pkt->ver_major, ver_pkt->ver_minor);
2378 /* Store accepted values */
2379 ldcp->lane_in.ver_major = ver_pkt->ver_major;
2380 ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
2382 ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2384 ldcp->lane_in.lstate |= VSW_VER_ACK_SENT;
2386 if (vsw_obp_ver_proto_workaround == B_TRUE) {
2388 * Send a version info message
2389 * using the accepted version that
2390 * we are about to ack. Also note that
2391 * we send our ver info before we ack.
2392 * Otherwise, as soon as receiving the
2393 * ack, obp sends attr info msg, which
2394 * breaks vsw_check_flag() invoked
2395 * from vsw_process_ctrl_attr_pkt();
2396 * as we also need VSW_VER_ACK_RECV to
2397 * be set in lane_out.lstate, before
2398 * we can receive attr info.
2400 vsw_send_ver(ldcp);
2402 } else {
2404 * NACK back with the next lower major/minor
2405 * pairing we support (if don't suuport any more
2406 * versions then they will be set to zero.
2409 D2(vswp, "%s: replying with ver %d:%d", __func__,
2410 ver_pkt->ver_major, ver_pkt->ver_minor);
2412 /* Store updated values */
2413 ldcp->lane_in.ver_major = ver_pkt->ver_major;
2414 ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
2416 ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2418 ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
2421 DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2422 ver_pkt->tag.vio_sid = ldcp->local_session;
2423 (void) vsw_send_msg(ldcp, (void *)ver_pkt,
2424 sizeof (vio_ver_msg_t), B_TRUE);
2426 vsw_next_milestone(ldcp);
2427 break;
2429 case VIO_SUBTYPE_ACK:
2430 D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__);
2432 if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV))
2433 return;
2435 /* Store updated values */
2436 ldcp->lane_out.ver_major = ver_pkt->ver_major;
2437 ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
2439 ldcp->lane_out.lstate |= VSW_VER_ACK_RECV;
2440 vsw_next_milestone(ldcp);
2442 break;
2444 case VIO_SUBTYPE_NACK:
2445 D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__);
2447 if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV))
2448 return;
2451 * If our peer sent us a NACK with the ver fields set to
2452 * zero then there is nothing more we can do. Otherwise see
2453 * if we support either the version suggested, or a lesser
2454 * one.
2456 if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
2457 DERR(vswp, "%s: peer unable to negotiate any "
2458 "further.", __func__);
2459 ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
2460 vsw_next_milestone(ldcp);
2461 return;
2465 * Check to see if we support this major version or
2466 * a lower one. If we don't then maj/min will be set
2467 * to zero.
2469 (void) vsw_supported_version(ver_pkt);
2470 if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
2471 /* Nothing more we can do */
2472 DERR(vswp, "%s: version negotiation failed.\n",
2473 __func__);
2474 ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
2475 vsw_next_milestone(ldcp);
2476 } else {
2477 /* found a supported major version */
2478 ldcp->lane_out.ver_major = ver_pkt->ver_major;
2479 ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
2481 D2(vswp, "%s: resending with updated values (%x, %x)",
2482 __func__, ver_pkt->ver_major, ver_pkt->ver_minor);
2484 ldcp->lane_out.lstate |= VSW_VER_INFO_SENT;
2485 ver_pkt->tag.vio_sid = ldcp->local_session;
2486 ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
2488 DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
2490 (void) vsw_send_msg(ldcp, (void *)ver_pkt,
2491 sizeof (vio_ver_msg_t), B_TRUE);
2493 vsw_next_milestone(ldcp);
2496 break;
2498 default:
2499 DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
2500 ver_pkt->tag.vio_subtype);
2503 D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
2506 static int
2507 vsw_process_attr_info(vsw_ldc_t *ldcp, vnet_attr_msg_t *msg)
2509 vsw_t *vswp = ldcp->ldc_vswp;
2510 vsw_port_t *port = ldcp->ldc_port;
2511 struct ether_addr ea;
2512 uint64_t macaddr = 0;
2513 lane_t *lane_out = &ldcp->lane_out;
2514 lane_t *lane_in = &ldcp->lane_in;
2515 uint32_t mtu;
2516 int i;
2517 uint8_t dring_mode;
2519 D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2521 if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV)) {
2522 return (1);
2525 if ((msg->xfer_mode != VIO_DESC_MODE) &&
2526 (msg->xfer_mode != lane_out->xfer_mode)) {
2527 D2(NULL, "%s: unknown mode %x\n", __func__, msg->xfer_mode);
2528 return (1);
2531 /* Only support MAC addresses at moment. */
2532 if ((msg->addr_type != ADDR_TYPE_MAC) || (msg->addr == 0)) {
2533 D2(NULL, "%s: invalid addr_type %x, or address 0x%llx\n",
2534 __func__, msg->addr_type, msg->addr);
2535 return (1);
2539 * MAC address supplied by device should match that stored
2540 * in the vsw-port OBP node. Need to decide what to do if they
2541 * don't match, for the moment just warn but don't fail.
2543 vnet_macaddr_ultostr(msg->addr, ea.ether_addr_octet);
2544 if (ether_cmp(&ea, &port->p_macaddr) != 0) {
2545 DERR(NULL, "%s: device supplied address "
2546 "0x%llx doesn't match node address 0x%llx\n",
2547 __func__, msg->addr, port->p_macaddr);
2551 * Ack freq only makes sense in pkt mode, in shared
2552 * mode the ring descriptors say whether or not to
2553 * send back an ACK.
2555 if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
2556 (msg->xfer_mode & VIO_DRING_MODE_V1_2)) ||
2557 (VSW_VER_LT(ldcp, 1, 2) &&
2558 (msg->xfer_mode == VIO_DRING_MODE_V1_0))) {
2559 if (msg->ack_freq > 0) {
2560 D2(NULL, "%s: non zero ack freq in SHM mode\n",
2561 __func__);
2562 return (1);
2567 * Process dring mode attribute.
2569 if (VSW_VER_GTEQ(ldcp, 1, 6)) {
2571 * Versions >= 1.6:
2572 * Though we are operating in v1.6 mode, it is possible that
2573 * RxDringData mode has been disabled either on this guest or
2574 * on the peer guest. If so, we revert to pre v1.6 behavior of
2575 * TxDring mode. But this must be agreed upon in both
2576 * directions of attr exchange. We first determine the mode
2577 * that can be negotiated.
2579 if ((msg->options & VIO_RX_DRING_DATA) != 0 &&
2580 vsw_mapin_avail(ldcp) == B_TRUE) {
2582 * The peer is capable of handling RxDringData AND we
2583 * are also capable of it; we enable RxDringData mode
2584 * on this channel.
2586 dring_mode = VIO_RX_DRING_DATA;
2587 } else if ((msg->options & VIO_TX_DRING) != 0) {
2589 * If the peer is capable of TxDring mode, we
2590 * negotiate TxDring mode on this channel.
2592 dring_mode = VIO_TX_DRING;
2593 } else {
2595 * We support only VIO_TX_DRING and VIO_RX_DRING_DATA
2596 * modes. We don't support VIO_RX_DRING mode.
2598 return (1);
2602 * If we have received an ack for the attr info that we sent,
2603 * then check if the dring mode matches what the peer had ack'd
2604 * (saved in lane_out). If they don't match, we fail the
2605 * handshake.
2607 if (lane_out->lstate & VSW_ATTR_ACK_RECV) {
2608 if (msg->options != lane_out->dring_mode) {
2609 /* send NACK */
2610 return (1);
2612 } else {
2614 * Save the negotiated dring mode in our attr
2615 * parameters, so it gets sent in the attr info from us
2616 * to the peer.
2618 lane_out->dring_mode = dring_mode;
2621 /* save the negotiated dring mode in the msg to be replied */
2622 msg->options = dring_mode;
2626 * Process MTU attribute.
2628 if (VSW_VER_GTEQ(ldcp, 1, 4)) {
2630 * Versions >= 1.4:
2631 * Validate mtu of the peer is at least ETHERMAX. Then, the mtu
2632 * is negotiated down to the minimum of our mtu and peer's mtu.
2634 if (msg->mtu < ETHERMAX) {
2635 return (1);
2638 mtu = MIN(msg->mtu, vswp->max_frame_size);
2641 * If we have received an ack for the attr info
2642 * that we sent, then check if the mtu computed
2643 * above matches the mtu that the peer had ack'd
2644 * (saved in local hparams). If they don't
2645 * match, we fail the handshake.
2647 if (lane_out->lstate & VSW_ATTR_ACK_RECV) {
2648 if (mtu != lane_out->mtu) {
2649 /* send NACK */
2650 return (1);
2652 } else {
2654 * Save the mtu computed above in our
2655 * attr parameters, so it gets sent in
2656 * the attr info from us to the peer.
2658 lane_out->mtu = mtu;
2661 /* save the MIN mtu in the msg to be replied */
2662 msg->mtu = mtu;
2663 } else {
2664 /* Versions < 1.4, mtu must match */
2665 if (msg->mtu != lane_out->mtu) {
2666 D2(NULL, "%s: invalid MTU (0x%llx)\n",
2667 __func__, msg->mtu);
2668 return (1);
2673 * Otherwise store attributes for this lane and update
2674 * lane state.
2676 lane_in->mtu = msg->mtu;
2677 lane_in->addr = msg->addr;
2678 lane_in->addr_type = msg->addr_type;
2679 lane_in->xfer_mode = msg->xfer_mode;
2680 lane_in->ack_freq = msg->ack_freq;
2681 lane_in->physlink_update = msg->physlink_update;
2682 lane_in->dring_mode = msg->options;
2685 * Check if the client has requested physlink state updates.
2686 * If there is a physical device bound to this vswitch (L2
2687 * mode), set the ack bits to indicate it is supported.
2688 * Otherwise, set the nack bits.
2690 if (VSW_VER_GTEQ(ldcp, 1, 5)) { /* Protocol ver >= 1.5 */
2692 /* Does the vnet need phys link state updates ? */
2693 if ((lane_in->physlink_update &
2694 PHYSLINK_UPDATE_STATE_MASK) ==
2695 PHYSLINK_UPDATE_STATE) {
2697 if (vswp->smode & VSW_LAYER2) {
2698 /* is a net-dev assigned to us ? */
2699 msg->physlink_update =
2700 PHYSLINK_UPDATE_STATE_ACK;
2701 ldcp->pls_negotiated = B_TRUE;
2702 } else {
2703 /* not in L2 mode */
2704 msg->physlink_update =
2705 PHYSLINK_UPDATE_STATE_NACK;
2706 ldcp->pls_negotiated = B_FALSE;
2709 } else {
2710 msg->physlink_update =
2711 PHYSLINK_UPDATE_NONE;
2712 ldcp->pls_negotiated = B_FALSE;
2715 } else {
2717 * physlink_update bits are ignored
2718 * if set by clients < v1.5 protocol.
2720 msg->physlink_update = PHYSLINK_UPDATE_NONE;
2721 ldcp->pls_negotiated = B_FALSE;
2724 macaddr = lane_in->addr;
2725 for (i = ETHERADDRL - 1; i >= 0; i--) {
2726 port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF;
2727 macaddr >>= 8;
2731 * Setup device specific xmit routines. Note this could be changed
2732 * further in vsw_send_dring_info() for versions >= 1.6 if operating in
2733 * RxDringData mode.
2735 mutex_enter(&port->tx_lock);
2737 if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
2738 (lane_in->xfer_mode & VIO_DRING_MODE_V1_2)) ||
2739 (VSW_VER_LT(ldcp, 1, 2) &&
2740 (lane_in->xfer_mode == VIO_DRING_MODE_V1_0))) {
2741 D2(vswp, "%s: mode = VIO_DRING_MODE", __func__);
2742 port->transmit = vsw_dringsend;
2743 } else if (lane_in->xfer_mode == VIO_DESC_MODE) {
2744 D2(vswp, "%s: mode = VIO_DESC_MODE", __func__);
2745 vsw_create_privring(ldcp);
2746 port->transmit = vsw_descrsend;
2747 lane_out->xfer_mode = VIO_DESC_MODE;
2751 * HybridIO is supported only vnet, not by OBP.
2752 * So, set hio_capable to true only when in DRING mode.
2754 if (VSW_VER_GTEQ(ldcp, 1, 3) &&
2755 (lane_in->xfer_mode != VIO_DESC_MODE)) {
2756 (void) atomic_swap_32(&port->p_hio_capable, B_TRUE);
2757 } else {
2758 (void) atomic_swap_32(&port->p_hio_capable, B_FALSE);
2761 mutex_exit(&port->tx_lock);
2763 return (0);
2766 static int
2767 vsw_process_attr_ack(vsw_ldc_t *ldcp, vnet_attr_msg_t *msg)
2769 vsw_t *vswp = ldcp->ldc_vswp;
2770 lane_t *lane_out = &ldcp->lane_out;
2771 lane_t *lane_in = &ldcp->lane_in;
2773 D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
2775 if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV)) {
2776 return (1);
2780 * Process dring mode attribute.
2782 if (VSW_VER_GTEQ(ldcp, 1, 6)) {
2784 * Versions >= 1.6:
2785 * The ack msg sent by the peer contains the negotiated dring
2786 * mode between our capability (that we had sent in our attr
2787 * info) and the peer's capability.
2789 if (lane_in->lstate & VSW_ATTR_ACK_SENT) {
2791 * If we have sent an ack for the attr info msg from
2792 * the peer, check if the dring mode that was
2793 * negotiated then (saved in lane_out) matches the
2794 * mode that the peer has ack'd. If they don't match,
2795 * we fail the handshake.
2797 if (lane_out->dring_mode != msg->options) {
2798 return (1);
2800 } else {
2801 if ((msg->options & lane_out->dring_mode) == 0) {
2803 * Peer ack'd with a mode that we don't
2804 * support; we fail the handshake.
2806 return (1);
2808 if ((msg->options & (VIO_TX_DRING|VIO_RX_DRING_DATA))
2809 == (VIO_TX_DRING|VIO_RX_DRING_DATA)) {
2811 * Peer must ack with only one negotiated mode.
2812 * Otherwise fail handshake.
2814 return (1);
2818 * Save the negotiated mode, so we can validate it when
2819 * we receive attr info from the peer.
2821 lane_out->dring_mode = msg->options;
2826 * Process MTU attribute.
2828 if (VSW_VER_GTEQ(ldcp, 1, 4)) {
2830 * Versions >= 1.4:
2831 * The ack msg sent by the peer contains the minimum of
2832 * our mtu (that we had sent in our attr info) and the
2833 * peer's mtu.
2835 * If we have sent an ack for the attr info msg from
2836 * the peer, check if the mtu that was computed then
2837 * (saved in lane_out params) matches the mtu that the
2838 * peer has ack'd. If they don't match, we fail the
2839 * handshake.
2841 if (lane_in->lstate & VSW_ATTR_ACK_SENT) {
2842 if (lane_out->mtu != msg->mtu) {
2843 return (1);
2845 } else {
2847 * If the mtu ack'd by the peer is > our mtu
2848 * fail handshake. Otherwise, save the mtu, so
2849 * we can validate it when we receive attr info
2850 * from our peer.
2852 if (msg->mtu <= lane_out->mtu) {
2853 lane_out->mtu = msg->mtu;
2854 } else {
2855 return (1);
2860 return (0);
2864 * Process an attribute packet. We can end up here either because our peer
2865 * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or our
2866 * peer has sent us an attribute INFO message
2868 * If its an ACK we then move to the next stage of the handshake which
2869 * is to send our descriptor ring info to our peer. If its a NACK then
2870 * there is nothing more we can (currently) do.
2872 * If we get a valid/acceptable INFO packet (and we have already negotiated
2873 * a version) we ACK back and set channel state to ATTR_RECV, otherwise we
2874 * NACK back and reset channel state to INACTIV.
2876 * FUTURE: in time we will probably negotiate over attributes, but for
2877 * the moment unacceptable attributes are regarded as a fatal error.
2880 void
2881 vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt)
2883 vnet_attr_msg_t *attr_pkt;
2884 vsw_t *vswp = ldcp->ldc_vswp;
2885 lane_t *lane_out = &ldcp->lane_out;
2886 lane_t *lane_in = &ldcp->lane_in;
2887 int rv;
2889 D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
2892 * We know this is a ctrl/attr packet so
2893 * cast it into the correct structure.
2895 attr_pkt = (vnet_attr_msg_t *)pkt;
2897 switch (attr_pkt->tag.vio_subtype) {
2898 case VIO_SUBTYPE_INFO:
2900 rv = vsw_process_attr_info(ldcp, attr_pkt);
2901 if (rv != 0) {
2902 vsw_free_lane_resources(ldcp, INBOUND);
2903 attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
2904 ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT;
2905 } else {
2906 attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
2907 lane_in->lstate |= VSW_ATTR_ACK_SENT;
2909 attr_pkt->tag.vio_sid = ldcp->local_session;
2910 DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
2911 (void) vsw_send_msg(ldcp, (void *)attr_pkt,
2912 sizeof (vnet_attr_msg_t), B_TRUE);
2913 vsw_next_milestone(ldcp);
2914 break;
2916 case VIO_SUBTYPE_ACK:
2918 rv = vsw_process_attr_ack(ldcp, attr_pkt);
2919 if (rv != 0) {
2920 return;
2922 lane_out->lstate |= VSW_ATTR_ACK_RECV;
2923 vsw_next_milestone(ldcp);
2924 break;
2926 case VIO_SUBTYPE_NACK:
2927 D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
2929 if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV))
2930 return;
2932 lane_out->lstate |= VSW_ATTR_NACK_RECV;
2933 vsw_next_milestone(ldcp);
2934 break;
2936 default:
2937 DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
2938 attr_pkt->tag.vio_subtype);
2941 D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
2944 static int
2945 vsw_process_dring_reg_info(vsw_ldc_t *ldcp, vio_msg_tag_t *tagp)
2947 int rv;
2948 vsw_t *vswp = ldcp->ldc_vswp;
2949 lane_t *lp = &ldcp->lane_out;
2950 dring_info_t *dp = NULL;
2952 D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
2954 rv = vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV);
2955 if (rv != 0) {
2956 return (1);
2959 if (VSW_VER_GTEQ(ldcp, 1, 6) &&
2960 (lp->dring_mode != ((vio_dring_reg_msg_t *)tagp)->options)) {
2962 * The earlier version of Solaris vnet driver doesn't set the
2963 * option (VIO_TX_DRING in its case) correctly in its dring reg
2964 * message. We workaround that here by doing the check only
2965 * for versions >= v1.6.
2967 DWARN(vswp, "%s(%lld): Rcvd dring reg option (%d), "
2968 "negotiated mode (%d)\n", __func__, ldcp->ldc_id,
2969 ((vio_dring_reg_msg_t *)tagp)->options, lp->dring_mode);
2970 return (1);
2974 * Map dring exported by the peer.
2976 dp = vsw_map_dring(ldcp, (void *)tagp);
2977 if (dp == NULL) {
2978 return (1);
2982 * Map data buffers exported by the peer if we are in RxDringData mode.
2984 if (lp->dring_mode == VIO_RX_DRING_DATA) {
2985 rv = vsw_map_data(ldcp, dp, (void *)tagp);
2986 if (rv != 0) {
2987 vsw_unmap_dring(ldcp);
2988 return (1);
2992 return (0);
2995 static int
2996 vsw_process_dring_reg_ack(vsw_ldc_t *ldcp, vio_msg_tag_t *tagp)
2998 vsw_t *vswp = ldcp->ldc_vswp;
2999 dring_info_t *dp;
3001 D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3003 if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV)) {
3004 return (1);
3007 dp = ldcp->lane_out.dringp;
3009 /* save dring_ident acked by peer */
3010 dp->ident = ((vio_dring_reg_msg_t *)tagp)->dring_ident;
3012 return (0);
3016 * Process a dring info packet. We can end up here either because our peer
3017 * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our
3018 * peer has sent us a dring INFO message.
3020 * If we get a valid/acceptable INFO packet (and we have already negotiated
3021 * a version) we ACK back and update the lane state, otherwise we NACK back.
3023 * FUTURE: nothing to stop client from sending us info on multiple dring's
3024 * but for the moment we will just use the first one we are given.
3027 void
3028 vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt)
3030 int rv;
3031 int msgsize;
3032 dring_info_t *dp;
3033 vio_msg_tag_t *tagp = (vio_msg_tag_t *)pkt;
3034 vsw_t *vswp = ldcp->ldc_vswp;
3035 lane_t *lane_out = &ldcp->lane_out;
3036 lane_t *lane_in = &ldcp->lane_in;
3038 D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
3040 switch (tagp->vio_subtype) {
3041 case VIO_SUBTYPE_INFO:
3042 rv = vsw_process_dring_reg_info(ldcp, tagp);
3043 if (rv != 0) {
3044 vsw_free_lane_resources(ldcp, INBOUND);
3045 tagp->vio_subtype = VIO_SUBTYPE_NACK;
3046 lane_in->lstate |= VSW_DRING_NACK_SENT;
3047 } else {
3048 tagp->vio_subtype = VIO_SUBTYPE_ACK;
3049 lane_in->lstate |= VSW_DRING_ACK_SENT;
3051 tagp->vio_sid = ldcp->local_session;
3052 DUMP_TAG_PTR(tagp);
3053 if (lane_out->dring_mode == VIO_RX_DRING_DATA) {
3054 dp = lane_in->dringp;
3055 msgsize =
3056 VNET_DRING_REG_EXT_MSG_SIZE(dp->data_ncookies);
3057 } else {
3058 msgsize = sizeof (vio_dring_reg_msg_t);
3060 (void) vsw_send_msg(ldcp, (void *)tagp, msgsize, B_TRUE);
3061 vsw_next_milestone(ldcp);
3062 break;
3064 case VIO_SUBTYPE_ACK:
3065 rv = vsw_process_dring_reg_ack(ldcp, tagp);
3066 if (rv != 0) {
3067 return;
3069 lane_out->lstate |= VSW_DRING_ACK_RECV;
3070 vsw_next_milestone(ldcp);
3071 break;
3073 case VIO_SUBTYPE_NACK:
3074 D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3076 if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV))
3077 return;
3079 lane_out->lstate |= VSW_DRING_NACK_RECV;
3080 vsw_next_milestone(ldcp);
3081 break;
3083 default:
3084 DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3085 tagp->vio_subtype);
3088 D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3092 * Process a request from peer to unregister a dring.
3094 * For the moment we just restart the handshake if our
3095 * peer endpoint attempts to unregister a dring.
3097 void
3098 vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt)
3100 vsw_t *vswp = ldcp->ldc_vswp;
3101 vio_dring_unreg_msg_t *dring_pkt;
3104 * We know this is a ctrl/dring packet so
3105 * cast it into the correct structure.
3107 dring_pkt = (vio_dring_unreg_msg_t *)pkt;
3109 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3111 switch (dring_pkt->tag.vio_subtype) {
3112 case VIO_SUBTYPE_INFO:
3113 D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3115 DWARN(vswp, "%s: restarting handshake..", __func__);
3116 break;
3118 case VIO_SUBTYPE_ACK:
3119 D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3121 DWARN(vswp, "%s: restarting handshake..", __func__);
3122 break;
3124 case VIO_SUBTYPE_NACK:
3125 D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3127 DWARN(vswp, "%s: restarting handshake..", __func__);
3128 break;
3130 default:
3131 DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3132 dring_pkt->tag.vio_subtype);
3135 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3137 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
/*
 * Turn the multicast message 'pkt' into a NACK and send it back over
 * channel 'ldcp'.  Wrapped in do/while (0) so that the macro behaves as
 * a single statement (safe in unbraced if/else); invoke it with a
 * trailing semicolon.
 */
#define	SND_MCST_NACK(ldcp, pkt) \
	do { \
		(pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
		(pkt)->tag.vio_sid = (ldcp)->local_session; \
		(void) vsw_send_msg((ldcp), (void *)(pkt), \
		    sizeof (vnet_mcast_msg_t), B_TRUE); \
	} while (0)
3147 * Process a multicast request from a vnet.
3149 * Vnet's specify a multicast address that they are interested in. This
3150 * address is used as a key into the hash table which forms the multicast
3151 * forwarding database (mFDB).
3153 * The table keys are the multicast addresses, while the table entries
3154 * are pointers to lists of ports which wish to receive packets for the
3155 * specified multicast address.
3157 * When a multicast packet is being switched we use the address as a key
3158 * into the hash table, and then walk the appropriate port list forwarding
3159 * the pkt to each port in turn.
3161 * If a vnet is no longer interested in a particular multicast grouping
3162 * we simply find the correct location in the hash table and then delete
3163 * the relevant port from the port list.
3165 * To deal with the case whereby a port is being deleted without first
3166 * removing itself from the lists in the hash table, we maintain a list
3167 * of multicast addresses the port has registered an interest in, within
3168 * the port structure itself. We then simply walk that list of addresses
3169 * using them as keys into the hash table and remove the port from the
3170 * appropriate lists.
3172 static void
3173 vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt)
3175 vnet_mcast_msg_t *mcst_pkt;
3176 vsw_port_t *port = ldcp->ldc_port;
3177 vsw_t *vswp = ldcp->ldc_vswp;
3178 int i;
3180 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3183 * We know this is a ctrl/mcast packet so
3184 * cast it into the correct structure.
3186 mcst_pkt = (vnet_mcast_msg_t *)pkt;
3188 switch (mcst_pkt->tag.vio_subtype) {
3189 case VIO_SUBTYPE_INFO:
3190 D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3193 * Check if in correct state to receive a multicast
3194 * message (i.e. handshake complete). If not reset
3195 * the handshake.
3197 if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV))
3198 return;
3201 * Before attempting to add or remove address check
3202 * that they are valid multicast addresses.
3203 * If not, then NACK back.
3205 for (i = 0; i < mcst_pkt->count; i++) {
3206 if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) {
3207 DERR(vswp, "%s: invalid multicast address",
3208 __func__);
3209 SND_MCST_NACK(ldcp, mcst_pkt);
3210 return;
3215 * Now add/remove the addresses. If this fails we
3216 * NACK back.
3218 if (vsw_add_rem_mcst(mcst_pkt, port) != 0) {
3219 SND_MCST_NACK(ldcp, mcst_pkt);
3220 return;
3223 mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3224 mcst_pkt->tag.vio_sid = ldcp->local_session;
3226 DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt);
3228 (void) vsw_send_msg(ldcp, (void *)mcst_pkt,
3229 sizeof (vnet_mcast_msg_t), B_TRUE);
3230 break;
3232 case VIO_SUBTYPE_ACK:
3233 DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3236 * We shouldn't ever get a multicast ACK message as
3237 * at the moment we never request multicast addresses
3238 * to be set on some other device. This may change in
3239 * the future if we have cascading switches.
3241 if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV))
3242 return;
3244 /* Do nothing */
3245 break;
3247 case VIO_SUBTYPE_NACK:
3248 DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3251 * We shouldn't get a multicast NACK packet for the
3252 * same reasons as we shouldn't get a ACK packet.
3254 if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV))
3255 return;
3257 /* Do nothing */
3258 break;
3260 default:
3261 DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
3262 mcst_pkt->tag.vio_subtype);
3265 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3268 static void
3269 vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt)
3271 vio_rdx_msg_t *rdx_pkt;
3272 vsw_t *vswp = ldcp->ldc_vswp;
3275 * We know this is a ctrl/rdx packet so
3276 * cast it into the correct structure.
3278 rdx_pkt = (vio_rdx_msg_t *)pkt;
3280 D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
3282 switch (rdx_pkt->tag.vio_subtype) {
3283 case VIO_SUBTYPE_INFO:
3284 D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3286 if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_INFO_RECV))
3287 return;
3289 rdx_pkt->tag.vio_sid = ldcp->local_session;
3290 rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
3292 DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt);
3294 ldcp->lane_out.lstate |= VSW_RDX_ACK_SENT;
3296 (void) vsw_send_msg(ldcp, (void *)rdx_pkt,
3297 sizeof (vio_rdx_msg_t), B_TRUE);
3299 vsw_next_milestone(ldcp);
3300 break;
3302 case VIO_SUBTYPE_ACK:
3304 * Should be handled in-band by callback handler.
3306 DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__);
3307 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3308 break;
3310 case VIO_SUBTYPE_NACK:
3311 D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3313 if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_NACK_RECV))
3314 return;
3316 ldcp->lane_in.lstate |= VSW_RDX_NACK_RECV;
3317 vsw_next_milestone(ldcp);
3318 break;
3320 default:
3321 DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3322 rdx_pkt->tag.vio_subtype);
3325 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3328 static void
3329 vsw_process_physlink_msg(vsw_ldc_t *ldcp, void *pkt)
3331 vnet_physlink_msg_t *msgp;
3332 vsw_t *vswp = ldcp->ldc_vswp;
3334 msgp = (vnet_physlink_msg_t *)pkt;
3336 D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
3338 switch (msgp->tag.vio_subtype) {
3339 case VIO_SUBTYPE_INFO:
3341 /* vsw shouldn't recv physlink info */
3342 DWARN(vswp, "%s: Unexpected VIO_SUBTYPE_INFO", __func__);
3343 break;
3345 case VIO_SUBTYPE_ACK:
3347 D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3348 break;
3350 case VIO_SUBTYPE_NACK:
3352 D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3353 break;
3355 default:
3356 DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
3357 msgp->tag.vio_subtype);
3360 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3363 static void
3364 vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t *tagp,
3365 uint32_t msglen)
3367 uint16_t env = tagp->vio_subtype_env;
3368 vsw_t *vswp = ldcp->ldc_vswp;
3369 lane_t *lp = &ldcp->lane_out;
3370 uint8_t dring_mode = lp->dring_mode;
3372 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3374 /* session id check */
3375 if (ldcp->session_status & VSW_PEER_SESSION) {
3376 if (ldcp->peer_session != tagp->vio_sid) {
3377 DERR(vswp, "%s (chan %d): invalid session id (%llx)",
3378 __func__, ldcp->ldc_id, tagp->vio_sid);
3379 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3380 return;
3385 * It is an error for us to be getting data packets
3386 * before the handshake has completed.
3388 if (ldcp->hphase != VSW_MILESTONE4) {
3389 DERR(vswp, "%s: got data packet before handshake complete "
3390 "hphase %d (%x: %x)", __func__, ldcp->hphase,
3391 ldcp->lane_in.lstate, ldcp->lane_out.lstate);
3392 DUMP_FLAGS(ldcp->lane_in.lstate);
3393 DUMP_FLAGS(ldcp->lane_out.lstate);
3394 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
3395 return;
3397 if (dring_mode == VIO_TX_DRING) {
3399 * To reduce the locking contention, release the ldc_cblock
3400 * here and re-acquire it once we are done receiving packets.
3401 * We do this only in TxDring mode to allow further callbaks to
3402 * continue while the msg worker thread processes the messages.
3403 * In RxDringData mode, we process the messages in the callback
3404 * itself and wake up rcv worker thread to process only data
3405 * info messages.
3407 mutex_exit(&ldcp->ldc_cblock);
3408 mutex_enter(&ldcp->ldc_rxlock);
3412 * Switch on vio_subtype envelope, then let lower routines
3413 * decide if its an INFO, ACK or NACK packet.
3415 if (env == VIO_DRING_DATA) {
3416 ldcp->rx_dringdata(ldcp, dpkt);
3417 } else if (env == VIO_PKT_DATA) {
3418 ldcp->rx_pktdata(ldcp, dpkt, msglen);
3419 } else if (env == VIO_DESC_DATA) {
3420 vsw_process_data_ibnd_pkt(ldcp, dpkt);
3421 } else {
3422 DERR(vswp, "%s: unknown vio_subtype_env (%x)\n",
3423 __func__, env);
3426 if (dring_mode == VIO_TX_DRING) {
3427 mutex_exit(&ldcp->ldc_rxlock);
3428 mutex_enter(&ldcp->ldc_cblock);
3431 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
3435 * dummy pkt data handler function for vnet protocol version 1.0
3437 static void
3438 vsw_process_pkt_data_nop(void *arg1, void *arg2, uint32_t msglen)
3440 _NOTE(ARGUNUSED(arg1, arg2, msglen))
3444 * This function handles raw pkt data messages received over the channel.
3445 * Currently, only priority-eth-type frames are received through this mechanism.
3446 * In this case, the frame(data) is present within the message itself which
3447 * is copied into an mblk before switching it.
3449 static void
3450 vsw_process_pkt_data(void *arg1, void *arg2, uint32_t msglen)
3452 vsw_ldc_t *ldcp = (vsw_ldc_t *)arg1;
3453 vio_raw_data_msg_t *dpkt = (vio_raw_data_msg_t *)arg2;
3454 uint32_t size;
3455 mblk_t *mp;
3456 vio_mblk_t *vmp;
3457 vsw_t *vswp = ldcp->ldc_vswp;
3458 vgen_stats_t *statsp = &ldcp->ldc_stats;
3459 lane_t *lp = &ldcp->lane_out;
3461 size = msglen - VIO_PKT_DATA_HDRSIZE;
3462 if (size < ETHERMIN || size > lp->mtu) {
3463 (void) atomic_inc_32(&statsp->rx_pri_fail);
3464 DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
3465 ldcp->ldc_id, size);
3466 return;
3469 vmp = vio_multipool_allocb(&ldcp->vmp, size + VLAN_TAGSZ);
3470 if (vmp == NULL) {
3471 mp = allocb(size + VLAN_TAGSZ, BPRI_MED);
3472 if (mp == NULL) {
3473 (void) atomic_inc_32(&statsp->rx_pri_fail);
3474 DWARN(vswp, "%s(%lld) allocb failure, "
3475 "unable to process priority frame\n", __func__,
3476 ldcp->ldc_id);
3477 return;
3479 } else {
3480 mp = vmp->mp;
3483 /* skip over the extra space for vlan tag */
3484 mp->b_rptr += VLAN_TAGSZ;
3486 /* copy the frame from the payload of raw data msg into the mblk */
3487 bcopy(dpkt->data, mp->b_rptr, size);
3488 mp->b_wptr = mp->b_rptr + size;
3490 if (vmp != NULL) {
3491 vmp->state = VIO_MBLK_HAS_DATA;
3494 /* update stats */
3495 (void) atomic_inc_64(&statsp->rx_pri_packets);
3496 (void) atomic_add_64(&statsp->rx_pri_bytes, size);
3499 * VLAN_TAGSZ of extra space has been pre-alloc'd if tag is needed.
3501 (void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp);
3503 /* switch the frame to destination */
3504 vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT, ldcp->ldc_port, NULL);
3508 * Process an in-band descriptor message (most likely from
3509 * OBP).
3511 static void
3512 vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt)
3514 vnet_ibnd_desc_t *ibnd_desc;
3515 dring_info_t *dp = NULL;
3516 vsw_private_desc_t *priv_addr = NULL;
3517 vsw_t *vswp = ldcp->ldc_vswp;
3518 mblk_t *mp = NULL;
3519 size_t nbytes = 0;
3520 size_t off = 0;
3521 uint64_t idx = 0;
3522 uint32_t num = 1, len, datalen = 0;
3523 uint64_t ncookies = 0;
3524 int i, rv;
3525 int j = 0;
3527 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
3529 ibnd_desc = (vnet_ibnd_desc_t *)pkt;
3531 switch (ibnd_desc->hdr.tag.vio_subtype) {
3532 case VIO_SUBTYPE_INFO:
3533 D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
3535 if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
3536 return;
3539 * Data is padded to align on a 8 byte boundary,
3540 * nbytes is actual data length, i.e. minus that
3541 * padding.
3543 datalen = ibnd_desc->nbytes;
3545 D2(vswp, "%s(%lld): processing inband desc : "
3546 ": datalen 0x%lx", __func__, ldcp->ldc_id, datalen);
3548 ncookies = ibnd_desc->ncookies;
3551 * allocb(9F) returns an aligned data block. We
3552 * need to ensure that we ask ldc for an aligned
3553 * number of bytes also.
3555 nbytes = datalen;
3556 if (nbytes & 0x7) {
3557 off = 8 - (nbytes & 0x7);
3558 nbytes += off;
3561 /* alloc extra space for VLAN_TAG */
3562 mp = allocb(datalen + 8, BPRI_MED);
3563 if (mp == NULL) {
3564 DERR(vswp, "%s(%lld): allocb failed",
3565 __func__, ldcp->ldc_id);
3566 ldcp->ldc_stats.rx_allocb_fail++;
3567 return;
3570 /* skip over the extra space for VLAN_TAG */
3571 mp->b_rptr += 8;
3573 rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr,
3574 0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies,
3575 LDC_COPY_IN);
3577 if (rv != 0) {
3578 DERR(vswp, "%s(%d): unable to copy in data from "
3579 "%d cookie(s)", __func__, ldcp->ldc_id, ncookies);
3580 freemsg(mp);
3581 ldcp->ldc_stats.ierrors++;
3582 return;
3585 D2(vswp, "%s(%d): copied in %ld bytes using %d cookies",
3586 __func__, ldcp->ldc_id, nbytes, ncookies);
3588 /* point to the actual end of data */
3589 mp->b_wptr = mp->b_rptr + datalen;
3590 ldcp->ldc_stats.ipackets++;
3591 ldcp->ldc_stats.rbytes += datalen;
3594 * We ACK back every in-band descriptor message we process
3596 ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK;
3597 ibnd_desc->hdr.tag.vio_sid = ldcp->local_session;
3598 (void) vsw_send_msg(ldcp, (void *)ibnd_desc,
3599 sizeof (vnet_ibnd_desc_t), B_TRUE);
3602 * there is extra space alloc'd for VLAN_TAG
3604 (void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp);
3606 /* send the packet to be switched */
3607 vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT,
3608 ldcp->ldc_port, NULL);
3610 break;
3612 case VIO_SUBTYPE_ACK:
3613 D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
3615 /* Verify the ACK is valid */
3616 idx = ibnd_desc->hdr.desc_handle;
3618 if (idx >= vsw_num_descriptors) {
3619 cmn_err(CE_WARN, "!vsw%d: corrupted ACK received "
3620 "(idx %ld)", vswp->instance, idx);
3621 return;
3624 if ((dp = ldcp->lane_out.dringp) == NULL) {
3625 DERR(vswp, "%s: no dring found", __func__);
3626 return;
3629 len = dp->num_descriptors;
3631 * If the descriptor we are being ACK'ed for is not the
3632 * one we expected, then pkts were lost somwhere, either
3633 * when we tried to send a msg, or a previous ACK msg from
3634 * our peer. In either case we now reclaim the descriptors
3635 * in the range from the last ACK we received up to the
3636 * current ACK.
3638 if (idx != dp->last_ack_recv) {
3639 DWARN(vswp, "%s: dropped pkts detected, (%ld, %ld)",
3640 __func__, dp->last_ack_recv, idx);
3641 num = idx >= dp->last_ack_recv ?
3642 idx - dp->last_ack_recv + 1:
3643 (len - dp->last_ack_recv + 1) + idx;
3647 * When we sent the in-band message to our peer we
3648 * marked the copy in our private ring as READY. We now
3649 * check that the descriptor we are being ACK'ed for is in
3650 * fact READY, i.e. it is one we have shared with our peer.
3652 * If its not we flag an error, but still reset the descr
3653 * back to FREE.
3655 for (i = dp->last_ack_recv; j < num; i = (i + 1) % len, j++) {
3656 priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
3657 mutex_enter(&priv_addr->dstate_lock);
3658 if (priv_addr->dstate != VIO_DESC_READY) {
3659 DERR(vswp, "%s: (%ld) desc at index %ld not "
3660 "READY (0x%lx)", __func__,
3661 ldcp->ldc_id, idx, priv_addr->dstate);
3662 DERR(vswp, "%s: bound %d: ncookies %ld : "
3663 "datalen %ld", __func__,
3664 priv_addr->bound, priv_addr->ncookies,
3665 priv_addr->datalen);
3667 D2(vswp, "%s: (%lld) freeing descp at %lld", __func__,
3668 ldcp->ldc_id, idx);
3669 /* release resources associated with sent msg */
3670 priv_addr->datalen = 0;
3671 priv_addr->dstate = VIO_DESC_FREE;
3672 mutex_exit(&priv_addr->dstate_lock);
3674 /* update to next expected value */
3675 dp->last_ack_recv = (idx + 1) % dp->num_descriptors;
3677 break;
3679 case VIO_SUBTYPE_NACK:
3680 DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
3683 * We should only get a NACK if our peer doesn't like
3684 * something about a message we have sent it. If this
3685 * happens we just release the resources associated with
3686 * the message. (We are relying on higher layers to decide
3687 * whether or not to resend.
3690 /* limit check */
3691 idx = ibnd_desc->hdr.desc_handle;
3693 if (idx >= vsw_num_descriptors) {
3694 DERR(vswp, "%s: corrupted NACK received (idx %lld)",
3695 __func__, idx);
3696 return;
3699 if ((dp = ldcp->lane_out.dringp) == NULL) {
3700 DERR(vswp, "%s: no dring found", __func__);
3701 return;
3704 priv_addr = (vsw_private_desc_t *)dp->priv_addr;
3706 /* move to correct location in ring */
3707 priv_addr += idx;
3709 /* release resources associated with sent msg */
3710 mutex_enter(&priv_addr->dstate_lock);
3711 priv_addr->datalen = 0;
3712 priv_addr->dstate = VIO_DESC_FREE;
3713 mutex_exit(&priv_addr->dstate_lock);
3715 break;
3717 default:
3718 DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
3719 ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype);
3722 D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
3725 static void
3726 vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t *tagp)
3728 _NOTE(ARGUNUSED(epkt))
3730 vsw_t *vswp = ldcp->ldc_vswp;
3731 uint16_t env = tagp->vio_subtype_env;
3733 D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
3736 * Error vio_subtypes have yet to be defined. So for
3737 * the moment we can't do anything.
3739 D2(vswp, "%s: (%x) vio_subtype env", __func__, env);
3741 D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
3744 /* transmit the packet over the given port */
3746 vsw_portsend(vsw_port_t *port, mblk_t *mp)
3748 mblk_t *mpt;
3749 int count;
3750 vsw_ldc_t *ldcp = port->ldcp;
3751 int status = 0;
3753 count = vsw_vlan_frame_untag(port, VSW_VNETPORT, &mp, &mpt);
3754 if (count != 0) {
3755 status = ldcp->tx(ldcp, mp, mpt, count);
3757 return (status);
3761 * Break up frames into 2 seperate chains: normal and
3762 * priority, based on the frame type. The number of
3763 * priority frames is also counted and returned.
3765 * Params:
3766 * vswp: pointer to the instance of vsw
3767 * np: head of packet chain to be broken
3768 * npt: tail of packet chain to be broken
3770 * Returns:
3771 * np: head of normal data packets
3772 * npt: tail of normal data packets
3773 * hp: head of high priority packets
3774 * hpt: tail of high priority packets
3776 static uint32_t
3777 vsw_get_pri_packets(vsw_t *vswp, mblk_t **np, mblk_t **npt,
3778 mblk_t **hp, mblk_t **hpt)
3780 mblk_t *tmp = NULL;
3781 mblk_t *smp = NULL;
3782 mblk_t *hmp = NULL; /* high prio pkts head */
3783 mblk_t *hmpt = NULL; /* high prio pkts tail */
3784 mblk_t *nmp = NULL; /* normal pkts head */
3785 mblk_t *nmpt = NULL; /* normal pkts tail */
3786 uint32_t count = 0;
3787 int i;
3788 struct ether_header *ehp;
3789 uint32_t num_types;
3790 uint16_t *types;
3792 tmp = *np;
3793 while (tmp != NULL) {
3795 smp = tmp;
3796 tmp = tmp->b_next;
3797 smp->b_next = NULL;
3798 smp->b_prev = NULL;
3800 ehp = (struct ether_header *)smp->b_rptr;
3801 num_types = vswp->pri_num_types;
3802 types = vswp->pri_types;
3803 for (i = 0; i < num_types; i++) {
3804 if (ehp->ether_type == types[i]) {
3805 /* high priority frame */
3807 if (hmp != NULL) {
3808 hmpt->b_next = smp;
3809 hmpt = smp;
3810 } else {
3811 hmp = hmpt = smp;
3813 count++;
3814 break;
3817 if (i == num_types) {
3818 /* normal data frame */
3820 if (nmp != NULL) {
3821 nmpt->b_next = smp;
3822 nmpt = smp;
3823 } else {
3824 nmp = nmpt = smp;
3829 *hp = hmp;
3830 *hpt = hmpt;
3831 *np = nmp;
3832 *npt = nmpt;
3834 return (count);
3838 * Wrapper function to transmit normal and/or priority frames over the channel.
3840 static int
3841 vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
3843 vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
3844 mblk_t *tmp;
3845 mblk_t *smp;
3846 mblk_t *hmp; /* high prio pkts head */
3847 mblk_t *hmpt; /* high prio pkts tail */
3848 mblk_t *nmp; /* normal pkts head */
3849 mblk_t *nmpt; /* normal pkts tail */
3850 uint32_t n = 0;
3851 vsw_t *vswp = ldcp->ldc_vswp;
3853 ASSERT(VSW_PRI_ETH_DEFINED(vswp));
3854 ASSERT(count != 0);
3856 nmp = mp;
3857 nmpt = mpt;
3859 /* gather any priority frames from the chain of packets */
3860 n = vsw_get_pri_packets(vswp, &nmp, &nmpt, &hmp, &hmpt);
3862 /* transmit priority frames */
3863 tmp = hmp;
3864 while (tmp != NULL) {
3865 smp = tmp;
3866 tmp = tmp->b_next;
3867 smp->b_next = NULL;
3868 vsw_ldcsend_pkt(ldcp, smp);
3871 count -= n;
3873 if (count == 0) {
3874 /* no normal data frames to process */
3875 return (0);
3878 return (vsw_ldctx(ldcp, nmp, nmpt, count));
3882 * Wrapper function to transmit normal frames over the channel.
3884 static int
3885 vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
3887 vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
3888 mblk_t *tmp = NULL;
3890 ASSERT(count != 0);
3892 * If the TX thread is enabled, then queue the
3893 * ordinary frames and signal the tx thread.
3895 if (ldcp->tx_thread != NULL) {
3897 mutex_enter(&ldcp->tx_thr_lock);
3899 if ((ldcp->tx_cnt + count) >= vsw_max_tx_qcount) {
3901 * If we reached queue limit,
3902 * do not queue new packets,
3903 * drop them.
3905 ldcp->ldc_stats.tx_qfull += count;
3906 mutex_exit(&ldcp->tx_thr_lock);
3907 freemsgchain(mp);
3908 goto exit;
3910 if (ldcp->tx_mhead == NULL) {
3911 ldcp->tx_mhead = mp;
3912 ldcp->tx_mtail = mpt;
3913 cv_signal(&ldcp->tx_thr_cv);
3914 } else {
3915 ldcp->tx_mtail->b_next = mp;
3916 ldcp->tx_mtail = mpt;
3918 ldcp->tx_cnt += count;
3919 mutex_exit(&ldcp->tx_thr_lock);
3920 } else {
3921 while (mp != NULL) {
3922 tmp = mp->b_next;
3923 mp->b_next = mp->b_prev = NULL;
3924 (void) vsw_ldcsend(ldcp, mp, 1);
3925 mp = tmp;
3929 exit:
3930 return (0);
3934 * This function transmits the frame in the payload of a raw data
3935 * (VIO_PKT_DATA) message. Thus, it provides an Out-Of-Band path to
3936 * send special frames with high priorities, without going through
3937 * the normal data path which uses descriptor ring mechanism.
3939 static void
3940 vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp)
3942 vio_raw_data_msg_t *pkt;
3943 mblk_t *bp;
3944 mblk_t *nmp = NULL;
3945 vio_mblk_t *vmp;
3946 caddr_t dst;
3947 uint32_t mblksz;
3948 uint32_t size;
3949 uint32_t nbytes;
3950 int rv;
3951 vsw_t *vswp = ldcp->ldc_vswp;
3952 vgen_stats_t *statsp = &ldcp->ldc_stats;
3954 if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
3955 (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
3956 (void) atomic_inc_32(&statsp->tx_pri_fail);
3957 DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
3958 "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
3959 ldcp->lane_out.lstate);
3960 goto send_pkt_exit;
3963 size = msgsize(mp);
3965 /* frame size bigger than available payload len of raw data msg ? */
3966 if (size > (size_t)(ldcp->msglen - VIO_PKT_DATA_HDRSIZE)) {
3967 (void) atomic_inc_32(&statsp->tx_pri_fail);
3968 DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
3969 ldcp->ldc_id, size);
3970 goto send_pkt_exit;
3973 if (size < ETHERMIN)
3974 size = ETHERMIN;
3976 /* alloc space for a raw data message */
3977 vmp = vio_allocb(vswp->pri_tx_vmp);
3978 if (vmp == NULL) {
3979 (void) atomic_inc_32(&statsp->tx_pri_fail);
3980 DWARN(vswp, "vio_allocb failed\n");
3981 goto send_pkt_exit;
3982 } else {
3983 nmp = vmp->mp;
3985 pkt = (vio_raw_data_msg_t *)nmp->b_rptr;
3987 /* copy frame into the payload of raw data message */
3988 dst = (caddr_t)pkt->data;
3989 for (bp = mp; bp != NULL; bp = bp->b_cont) {
3990 mblksz = MBLKL(bp);
3991 bcopy(bp->b_rptr, dst, mblksz);
3992 dst += mblksz;
3995 vmp->state = VIO_MBLK_HAS_DATA;
3997 /* setup the raw data msg */
3998 pkt->tag.vio_msgtype = VIO_TYPE_DATA;
3999 pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
4000 pkt->tag.vio_subtype_env = VIO_PKT_DATA;
4001 pkt->tag.vio_sid = ldcp->local_session;
4002 nbytes = VIO_PKT_DATA_HDRSIZE + size;
4004 /* send the msg over ldc */
4005 rv = vsw_send_msg(ldcp, (void *)pkt, nbytes, B_TRUE);
4006 if (rv != 0) {
4007 (void) atomic_inc_32(&statsp->tx_pri_fail);
4008 DWARN(vswp, "%s(%lld) Error sending priority frame\n", __func__,
4009 ldcp->ldc_id);
4010 goto send_pkt_exit;
4013 /* update stats */
4014 (void) atomic_inc_64(&statsp->tx_pri_packets);
4015 (void) atomic_add_64(&statsp->tx_pri_packets, size);
4017 send_pkt_exit:
4018 if (nmp != NULL)
4019 freemsg(nmp);
4020 freemsg(mp);
4024 * Transmit the packet over the given LDC channel.
4026 * The 'retries' argument indicates how many times a packet
4027 * is retried before it is dropped. Note, the retry is done
4028 * only for a resource related failure, for all other failures
4029 * the packet is dropped immediately.
4031 static int
4032 vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries)
4034 int i;
4035 int rc;
4036 int status = 0;
4037 vsw_port_t *port = ldcp->ldc_port;
4038 dring_info_t *dp = NULL;
4039 lane_t *lp = &ldcp->lane_out;
4041 for (i = 0; i < retries; ) {
4043 * Send the message out using the appropriate
4044 * transmit function which will free mblock when it
4045 * is finished with it.
4047 mutex_enter(&port->tx_lock);
4048 if (port->transmit != NULL) {
4049 status = (*port->transmit)(ldcp, mp);
4051 if (status == LDC_TX_SUCCESS) {
4052 mutex_exit(&port->tx_lock);
4053 break;
4055 i++; /* increment the counter here */
4057 /* If its the last retry, then update the oerror */
4058 if ((i == retries) && (status == LDC_TX_NORESOURCES)) {
4059 ldcp->ldc_stats.oerrors++;
4061 mutex_exit(&port->tx_lock);
4063 if (status != LDC_TX_NORESOURCES) {
4065 * No retrying required for errors un-related
4066 * to resources.
4068 break;
4070 if (((dp = ldcp->lane_out.dringp) != NULL) &&
4071 ((VSW_VER_GTEQ(ldcp, 1, 2) &&
4072 (ldcp->lane_out.xfer_mode & VIO_DRING_MODE_V1_2)) ||
4073 ((VSW_VER_LT(ldcp, 1, 2) &&
4074 (ldcp->lane_out.xfer_mode == VIO_DRING_MODE_V1_0))))) {
4076 /* Need to reclaim in TxDring mode. */
4077 if (lp->dring_mode == VIO_TX_DRING) {
4078 rc = vsw_reclaim_dring(dp, dp->end_idx);
4081 } else {
4083 * If there is no dring or the xfer_mode is
4084 * set to DESC_MODE(ie., OBP), then simply break here.
4086 break;
4090 * Delay only if none were reclaimed
4091 * and its not the last retry.
4093 if ((rc == 0) && (i < retries)) {
4094 delay(drv_usectohz(vsw_ldc_tx_delay));
4097 freemsg(mp);
4098 return (status);
4102 * Send an in-band descriptor message over ldc.
4104 static int
4105 vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
4107 vsw_t *vswp = ldcp->ldc_vswp;
4108 vnet_ibnd_desc_t ibnd_msg;
4109 vsw_private_desc_t *priv_desc = NULL;
4110 dring_info_t *dp = NULL;
4111 size_t n, size = 0;
4112 caddr_t bufp;
4113 mblk_t *bp;
4114 int idx, i;
4115 int status = LDC_TX_SUCCESS;
4116 static int warn_msg = 1;
4117 lane_t *lp = &ldcp->lane_out;
4119 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4121 ASSERT(mp != NULL);
4123 if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
4124 (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
4125 DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt",
4126 __func__, ldcp->ldc_id, ldcp->ldc_status,
4127 ldcp->lane_out.lstate);
4128 ldcp->ldc_stats.oerrors++;
4129 return (LDC_TX_FAILURE);
4133 * The dring here is as an internal buffer,
4134 * rather than a transfer channel.
4136 if ((dp = ldcp->lane_out.dringp) == NULL) {
4137 DERR(vswp, "%s(%lld): no dring for outbound lane",
4138 __func__, ldcp->ldc_id);
4139 DERR(vswp, "%s(%lld) status(%d) state (0x%llx)", __func__,
4140 ldcp->ldc_id, ldcp->ldc_status, ldcp->lane_out.lstate);
4141 ldcp->ldc_stats.oerrors++;
4142 return (LDC_TX_FAILURE);
4145 size = msgsize(mp);
4146 if (size > (size_t)lp->mtu) {
4147 DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
4148 ldcp->ldc_id, size);
4149 ldcp->ldc_stats.oerrors++;
4150 return (LDC_TX_FAILURE);
4154 * Find a free descriptor in our buffer ring
4156 if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
4157 if (warn_msg) {
4158 DERR(vswp, "%s(%lld): no descriptor available for ring "
4159 "at 0x%llx", __func__, ldcp->ldc_id, dp);
4160 warn_msg = 0;
4163 /* nothing more we can do */
4164 status = LDC_TX_NORESOURCES;
4165 goto vsw_descrsend_free_exit;
4166 } else {
4167 D2(vswp, "%s(%lld): free private descriptor found at pos "
4168 "%ld addr 0x%x\n", __func__, ldcp->ldc_id, idx, priv_desc);
4169 warn_msg = 1;
4172 /* copy data into the descriptor */
4173 bufp = priv_desc->datap;
4174 for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
4175 n = MBLKL(bp);
4176 bcopy(bp->b_rptr, bufp, n);
4177 bufp += n;
4180 priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
4182 /* create and send the in-band descp msg */
4183 ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA;
4184 ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO;
4185 ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA;
4186 ibnd_msg.hdr.tag.vio_sid = ldcp->local_session;
4189 * Copy the mem cookies describing the data from the
4190 * private region of the descriptor ring into the inband
4191 * descriptor.
4193 for (i = 0; i < priv_desc->ncookies; i++) {
4194 bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i],
4195 sizeof (ldc_mem_cookie_t));
4198 ibnd_msg.hdr.desc_handle = idx;
4199 ibnd_msg.ncookies = priv_desc->ncookies;
4200 ibnd_msg.nbytes = size;
4202 ldcp->ldc_stats.opackets++;
4203 ldcp->ldc_stats.obytes += size;
4205 (void) vsw_send_msg(ldcp, (void *)&ibnd_msg,
4206 sizeof (vnet_ibnd_desc_t), B_TRUE);
4208 vsw_descrsend_free_exit:
4210 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4211 return (status);
4214 static void
4215 vsw_send_ver(void *arg)
4217 vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
4218 vsw_t *vswp = ldcp->ldc_vswp;
4219 lane_t *lp = &ldcp->lane_out;
4220 vio_ver_msg_t ver_msg;
4222 D1(vswp, "%s enter", __func__);
4224 ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4225 ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4226 ver_msg.tag.vio_subtype_env = VIO_VER_INFO;
4227 ver_msg.tag.vio_sid = ldcp->local_session;
4229 if (vsw_obp_ver_proto_workaround == B_FALSE) {
4230 ver_msg.ver_major = vsw_versions[0].ver_major;
4231 ver_msg.ver_minor = vsw_versions[0].ver_minor;
4232 } else {
4233 /* use the major,minor that we've ack'd */
4234 lane_t *lpi = &ldcp->lane_in;
4235 ver_msg.ver_major = lpi->ver_major;
4236 ver_msg.ver_minor = lpi->ver_minor;
4238 ver_msg.dev_class = VDEV_NETWORK_SWITCH;
4240 lp->lstate |= VSW_VER_INFO_SENT;
4241 lp->ver_major = ver_msg.ver_major;
4242 lp->ver_minor = ver_msg.ver_minor;
4244 DUMP_TAG(ver_msg.tag);
4246 (void) vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t), B_TRUE);
4248 D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id);
4251 static void
4252 vsw_send_attr(vsw_ldc_t *ldcp)
4254 vsw_t *vswp = ldcp->ldc_vswp;
4255 lane_t *lp = &ldcp->lane_out;
4256 vnet_attr_msg_t attr_msg;
4258 D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
4261 * Subtype is set to INFO by default
4263 attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4264 attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4265 attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO;
4266 attr_msg.tag.vio_sid = ldcp->local_session;
4268 /* payload copied from default settings for lane */
4269 attr_msg.mtu = lp->mtu;
4270 attr_msg.addr_type = lp->addr_type;
4271 attr_msg.xfer_mode = lp->xfer_mode;
4272 attr_msg.ack_freq = lp->xfer_mode;
4273 attr_msg.options = lp->dring_mode;
4275 READ_ENTER(&vswp->if_lockrw);
4276 attr_msg.addr = vnet_macaddr_strtoul((vswp->if_addr).ether_addr_octet);
4277 RW_EXIT(&vswp->if_lockrw);
4279 ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT;
4281 DUMP_TAG(attr_msg.tag);
4283 (void) vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t), B_TRUE);
4285 D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
4288 static void
4289 vsw_send_dring_info(vsw_ldc_t *ldcp)
4291 int msgsize;
4292 void *msg;
4293 vsw_t *vswp = ldcp->ldc_vswp;
4294 vsw_port_t *port = ldcp->ldc_port;
4295 lane_t *lp = &ldcp->lane_out;
4296 vgen_stats_t *statsp = &ldcp->ldc_stats;
4298 D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id);
4300 /* dring mode has been negotiated in attr phase; save in stats */
4301 statsp->dring_mode = lp->dring_mode;
4303 if (lp->dring_mode == VIO_RX_DRING_DATA) {
4305 * Change the transmit routine for RxDringData mode.
4307 port->transmit = vsw_dringsend_shm;
4308 msg = (void *) vsw_create_rx_dring_info(ldcp);
4309 if (msg == NULL) {
4310 return;
4312 msgsize =
4313 VNET_DRING_REG_EXT_MSG_SIZE(lp->dringp->data_ncookies);
4314 ldcp->rcv_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
4315 vsw_ldc_rcv_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
4316 ldcp->rx_dringdata = vsw_process_dringdata_shm;
4317 } else {
4318 msg = (void *) vsw_create_tx_dring_info(ldcp);
4319 if (msg == NULL) {
4320 return;
4322 msgsize = sizeof (vio_dring_reg_msg_t);
4323 ldcp->msg_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
4324 vsw_ldc_msg_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
4325 ldcp->rx_dringdata = vsw_process_dringdata;
4328 lp->lstate |= VSW_DRING_INFO_SENT;
4329 DUMP_TAG_PTR((vio_msg_tag_t *)msg);
4330 (void) vsw_send_msg(ldcp, msg, msgsize, B_TRUE);
4331 kmem_free(msg, msgsize);
4333 D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id);
4336 static void
4337 vsw_send_rdx(vsw_ldc_t *ldcp)
4339 vsw_t *vswp = ldcp->ldc_vswp;
4340 vio_rdx_msg_t rdx_msg;
4342 D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
4344 rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
4345 rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
4346 rdx_msg.tag.vio_subtype_env = VIO_RDX;
4347 rdx_msg.tag.vio_sid = ldcp->local_session;
4349 ldcp->lane_in.lstate |= VSW_RDX_INFO_SENT;
4351 DUMP_TAG(rdx_msg.tag);
4353 (void) vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t), B_TRUE);
4355 D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
4359 * Remove the specified address from the list of address maintained
4360 * in this port node.
4362 mcst_addr_t *
4363 vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr)
4365 vsw_t *vswp = NULL;
4366 vsw_port_t *port = NULL;
4367 mcst_addr_t *prev_p = NULL;
4368 mcst_addr_t *curr_p = NULL;
4370 D1(NULL, "%s: enter : devtype %d : addr 0x%llx",
4371 __func__, devtype, addr);
4373 if (devtype == VSW_VNETPORT) {
4374 port = (vsw_port_t *)arg;
4375 mutex_enter(&port->mca_lock);
4376 prev_p = curr_p = port->mcap;
4377 } else {
4378 vswp = (vsw_t *)arg;
4379 mutex_enter(&vswp->mca_lock);
4380 prev_p = curr_p = vswp->mcap;
4383 while (curr_p != NULL) {
4384 if (curr_p->addr == addr) {
4385 D2(NULL, "%s: address found", __func__);
4386 /* match found */
4387 if (prev_p == curr_p) {
4388 /* list head */
4389 if (devtype == VSW_VNETPORT)
4390 port->mcap = curr_p->nextp;
4391 else
4392 vswp->mcap = curr_p->nextp;
4393 } else {
4394 prev_p->nextp = curr_p->nextp;
4396 break;
4397 } else {
4398 prev_p = curr_p;
4399 curr_p = curr_p->nextp;
4403 if (devtype == VSW_VNETPORT)
4404 mutex_exit(&port->mca_lock);
4405 else
4406 mutex_exit(&vswp->mca_lock);
4408 D1(NULL, "%s: exit", __func__);
4410 return (curr_p);
4414 * Create a ring consisting of just a private portion and link
4415 * it into the list of rings for the outbound lane.
4417 * These type of rings are used primarily for temporary data
4418 * storage (i.e. as data buffers).
4420 void
4421 vsw_create_privring(vsw_ldc_t *ldcp)
4423 dring_info_t *dp;
4424 vsw_t *vswp = ldcp->ldc_vswp;
4426 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
4428 dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
4429 mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
4430 mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
4431 ldcp->lane_out.dringp = dp;
4433 /* no public section */
4434 dp->pub_addr = NULL;
4435 dp->priv_addr = kmem_zalloc(
4436 (sizeof (vsw_private_desc_t) * vsw_num_descriptors), KM_SLEEP);
4437 dp->num_descriptors = vsw_num_descriptors;
4439 if (vsw_setup_tx_dring(ldcp, dp)) {
4440 DERR(vswp, "%s: setup of ring failed", __func__);
4441 vsw_destroy_tx_dring(ldcp);
4442 return;
4445 /* haven't used any descriptors yet */
4446 dp->end_idx = 0;
4447 dp->restart_reqd = B_TRUE;
4449 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
4453 * Set the default lane attributes. These are copied into
4454 * the attr msg we send to our peer. If they are not acceptable
4455 * then (currently) the handshake ends.
4457 static void
4458 vsw_set_lane_attr(vsw_t *vswp, lane_t *lp)
4460 bzero(lp, sizeof (lane_t));
4462 READ_ENTER(&vswp->if_lockrw);
4463 ether_copy(&(vswp->if_addr), &(lp->addr));
4464 RW_EXIT(&vswp->if_lockrw);
4466 lp->mtu = vswp->max_frame_size;
4467 lp->addr_type = ADDR_TYPE_MAC;
4468 lp->xfer_mode = VIO_DRING_MODE_V1_0;
4469 lp->ack_freq = 0; /* for shared mode */
4470 lp->seq_num = VNET_ISS;
4474 * Map the descriptor ring exported by the peer.
4476 static dring_info_t *
4477 vsw_map_dring(vsw_ldc_t *ldcp, void *pkt)
4479 dring_info_t *dp = NULL;
4480 lane_t *lp = &ldcp->lane_out;
4482 if (lp->dring_mode == VIO_RX_DRING_DATA) {
4484 * In RxDringData mode, dring that we map in
4485 * becomes our transmit descriptor ring.
4487 dp = vsw_map_tx_dring(ldcp, pkt);
4488 } else {
4490 * In TxDring mode, dring that we map in
4491 * becomes our receive descriptor ring.
4493 dp = vsw_map_rx_dring(ldcp, pkt);
4495 return (dp);
4499 * Common dring mapping function used in both TxDring and RxDringData modes.
4501 dring_info_t *
4502 vsw_map_dring_cmn(vsw_ldc_t *ldcp, vio_dring_reg_msg_t *dring_pkt)
4504 int rv;
4505 dring_info_t *dp;
4506 ldc_mem_info_t minfo;
4507 vsw_t *vswp = ldcp->ldc_vswp;
4510 * If the dring params are unacceptable then we NACK back.
4512 if ((dring_pkt->num_descriptors == 0) ||
4513 (dring_pkt->descriptor_size == 0) ||
4514 (dring_pkt->ncookies != 1)) {
4515 DERR(vswp, "%s (%lld): invalid dring info",
4516 __func__, ldcp->ldc_id);
4517 return (NULL);
4520 dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
4522 dp->num_descriptors = dring_pkt->num_descriptors;
4523 dp->descriptor_size = dring_pkt->descriptor_size;
4524 dp->options = dring_pkt->options;
4525 dp->dring_ncookies = dring_pkt->ncookies;
4528 * Note: should only get one cookie. Enforced in
4529 * the ldc layer.
4531 bcopy(&dring_pkt->cookie[0], &dp->dring_cookie[0],
4532 sizeof (ldc_mem_cookie_t));
4534 rv = ldc_mem_dring_map(ldcp->ldc_handle, &dp->dring_cookie[0],
4535 dp->dring_ncookies, dp->num_descriptors, dp->descriptor_size,
4536 LDC_DIRECT_MAP, &(dp->dring_handle));
4537 if (rv != 0) {
4538 goto fail;
4541 rv = ldc_mem_dring_info(dp->dring_handle, &minfo);
4542 if (rv != 0) {
4543 goto fail;
4545 /* store the address of the ring */
4546 dp->pub_addr = minfo.vaddr;
4548 /* cache the dring mtype */
4549 dp->dring_mtype = minfo.mtype;
4551 /* no private section as we are importing */
4552 dp->priv_addr = NULL;
4555 * Using simple mono increasing int for ident at the moment.
4557 dp->ident = ldcp->next_ident;
4558 ldcp->next_ident++;
4561 * Acknowledge it; we send back a unique dring identifier that
4562 * the sending side will use in future to refer to this
4563 * descriptor ring.
4565 dring_pkt->dring_ident = dp->ident;
4567 return (dp);
4568 fail:
4569 if (dp->dring_handle != NULL) {
4570 (void) ldc_mem_dring_unmap(dp->dring_handle);
4572 kmem_free(dp, sizeof (*dp));
4573 return (NULL);
4577 * Unmap the descriptor ring exported by the peer.
4579 static void
4580 vsw_unmap_dring(vsw_ldc_t *ldcp)
4582 lane_t *lane_out = &ldcp->lane_out;
4584 if (lane_out->dring_mode == VIO_RX_DRING_DATA) {
4585 vsw_unmap_tx_dring(ldcp);
4586 } else {
4587 vsw_unmap_rx_dring(ldcp);
4592 * Map the shared memory data buffer area exported by the peer.
4593 * Used in RxDringData mode only.
4595 static int
4596 vsw_map_data(vsw_ldc_t *ldcp, dring_info_t *dp, void *pkt)
4598 int rv;
4599 vio_dring_reg_ext_msg_t *emsg;
4600 vio_dring_reg_msg_t *msg = pkt;
4601 uint8_t *buf = (uint8_t *)msg->cookie;
4602 vsw_t *vswp = ldcp->ldc_vswp;
4603 ldc_mem_info_t minfo;
4605 /* skip over dring cookies */
4606 ASSERT(msg->ncookies == 1);
4607 buf += (msg->ncookies * sizeof (ldc_mem_cookie_t));
4609 emsg = (vio_dring_reg_ext_msg_t *)buf;
4610 if (emsg->data_ncookies > VNET_DATA_AREA_COOKIES) {
4611 return (1);
4614 /* save # of data area cookies */
4615 dp->data_ncookies = emsg->data_ncookies;
4617 /* save data area size */
4618 dp->data_sz = emsg->data_area_size;
4620 /* allocate ldc mem handle for data area */
4621 rv = ldc_mem_alloc_handle(ldcp->ldc_handle, &dp->data_handle);
4622 if (rv != 0) {
4623 cmn_err(CE_WARN, "ldc_mem_alloc_handle failed\n");
4624 DWARN(vswp, "%s (%lld) ldc_mem_alloc_handle() failed: %d\n",
4625 __func__, ldcp->ldc_id, rv);
4626 return (1);
4629 /* map the data area */
4630 rv = ldc_mem_map(dp->data_handle, emsg->data_cookie,
4631 emsg->data_ncookies, LDC_DIRECT_MAP, LDC_MEM_R,
4632 (caddr_t *)&dp->data_addr, NULL);
4633 if (rv != 0) {
4634 cmn_err(CE_WARN, "ldc_mem_map failed\n");
4635 DWARN(vswp, "%s (%lld) ldc_mem_map() failed: %d\n",
4636 __func__, ldcp->ldc_id, rv);
4637 return (1);
4640 /* get the map info */
4641 rv = ldc_mem_info(dp->data_handle, &minfo);
4642 if (rv != 0) {
4643 cmn_err(CE_WARN, "ldc_mem_info failed\n");
4644 DWARN(vswp, "%s (%lld) ldc_mem_info() failed: %d\n",
4645 __func__, ldcp->ldc_id, rv);
4646 return (1);
4649 if (minfo.mtype != LDC_DIRECT_MAP) {
4650 DWARN(vswp, "%s (%lld) mtype(%d) is not direct map\n",
4651 __func__, ldcp->ldc_id, minfo.mtype);
4652 return (1);
4655 /* allocate memory for data area cookies */
4656 dp->data_cookie = kmem_zalloc(emsg->data_ncookies *
4657 sizeof (ldc_mem_cookie_t), KM_SLEEP);
4659 /* save data area cookies */
4660 bcopy(emsg->data_cookie, dp->data_cookie,
4661 emsg->data_ncookies * sizeof (ldc_mem_cookie_t));
4663 return (0);
4667 * Reset and free all the resources associated with the channel.
4669 static void
4670 vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir)
4672 lane_t *lp;
4674 D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id);
4676 if (dir == INBOUND) {
4677 D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane"
4678 " of channel %lld", __func__, ldcp->ldc_id);
4679 lp = &ldcp->lane_in;
4680 } else {
4681 D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane"
4682 " of channel %lld", __func__, ldcp->ldc_id);
4683 lp = &ldcp->lane_out;
4686 lp->lstate = VSW_LANE_INACTIV;
4687 lp->seq_num = VNET_ISS;
4689 if (dir == INBOUND) {
4690 /* Unmap the remote dring which is imported from the peer */
4691 vsw_unmap_dring(ldcp);
4692 } else {
4693 /* Destroy the local dring which is exported to the peer */
4694 vsw_destroy_dring(ldcp);
4697 D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id);
4701 * Destroy the descriptor ring.
4703 static void
4704 vsw_destroy_dring(vsw_ldc_t *ldcp)
4706 lane_t *lp = &ldcp->lane_out;
4708 if (lp->dring_mode == VIO_RX_DRING_DATA) {
4709 vsw_destroy_rx_dring(ldcp);
4710 } else {
4711 vsw_destroy_tx_dring(ldcp);
4716 * vsw_ldc_tx_worker -- A per LDC worker thread to transmit data.
4717 * This thread is woken up by the vsw_portsend to transmit
4718 * packets.
4720 static void
4721 vsw_ldc_tx_worker(void *arg)
4723 callb_cpr_t cprinfo;
4724 vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
4725 vsw_t *vswp = ldcp->ldc_vswp;
4726 mblk_t *mp;
4727 mblk_t *tmp;
4729 D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
4730 CALLB_CPR_INIT(&cprinfo, &ldcp->tx_thr_lock, callb_generic_cpr,
4731 "vnet_tx_thread");
4732 mutex_enter(&ldcp->tx_thr_lock);
4733 while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP)) {
4735 CALLB_CPR_SAFE_BEGIN(&cprinfo);
4737 * Wait until the data is received or a stop
4738 * request is received.
4740 while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP) &&
4741 (ldcp->tx_mhead == NULL)) {
4742 cv_wait(&ldcp->tx_thr_cv, &ldcp->tx_thr_lock);
4744 CALLB_CPR_SAFE_END(&cprinfo, &ldcp->tx_thr_lock)
4747 * First process the stop request.
4749 if (ldcp->tx_thr_flags & VSW_WTHR_STOP) {
4750 D2(vswp, "%s(%lld):tx thread stopped\n",
4751 __func__, ldcp->ldc_id);
4752 break;
4754 mp = ldcp->tx_mhead;
4755 ldcp->tx_mhead = ldcp->tx_mtail = NULL;
4756 ldcp->tx_cnt = 0;
4757 mutex_exit(&ldcp->tx_thr_lock);
4758 D2(vswp, "%s(%lld):calling vsw_ldcsend\n",
4759 __func__, ldcp->ldc_id);
4760 while (mp != NULL) {
4761 tmp = mp->b_next;
4762 mp->b_next = mp->b_prev = NULL;
4763 (void) vsw_ldcsend(ldcp, mp, vsw_ldc_tx_retries);
4764 mp = tmp;
4766 mutex_enter(&ldcp->tx_thr_lock);
4770 * Update the run status and wakeup the thread that
4771 * has sent the stop request.
4773 ldcp->tx_thr_flags &= ~VSW_WTHR_STOP;
4774 ldcp->tx_thread = NULL;
4775 CALLB_CPR_EXIT(&cprinfo);
4776 D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
4777 thread_exit();
4780 /* vsw_stop_tx_thread -- Co-ordinate with receive thread to stop it */
4781 static void
4782 vsw_stop_tx_thread(vsw_ldc_t *ldcp)
4784 kt_did_t tid = 0;
4785 vsw_t *vswp = ldcp->ldc_vswp;
4787 D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
4789 * Send a stop request by setting the stop flag and
4790 * wait until the receive thread stops.
4792 mutex_enter(&ldcp->tx_thr_lock);
4793 if (ldcp->tx_thread != NULL) {
4794 tid = ldcp->tx_thread->t_did;
4795 ldcp->tx_thr_flags |= VSW_WTHR_STOP;
4796 cv_signal(&ldcp->tx_thr_cv);
4798 mutex_exit(&ldcp->tx_thr_lock);
4800 if (tid != 0) {
4801 thread_join(tid);
4804 D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
4807 static int
4808 vsw_mapin_avail(vsw_ldc_t *ldcp)
4810 int rv;
4811 ldc_info_t info;
4812 uint64_t mapin_sz_req;
4813 uint64_t dblk_sz;
4814 vsw_t *vswp = ldcp->ldc_vswp;
4816 rv = ldc_info(ldcp->ldc_handle, &info);
4817 if (rv != 0) {
4818 return (B_FALSE);
4821 dblk_sz = RXDRING_DBLK_SZ(vswp->max_frame_size);
4822 mapin_sz_req = (VSW_RXDRING_NRBUFS * dblk_sz);
4824 if (info.direct_map_size_max >= mapin_sz_req) {
4825 return (B_TRUE);
4828 return (B_FALSE);
4832 * Debugging routines
4834 static void
4835 display_state(void)
4837 vsw_t *vswp;
4838 vsw_port_list_t *plist;
4839 vsw_port_t *port;
4840 vsw_ldc_t *ldcp;
4841 extern vsw_t *vsw_head;
4843 cmn_err(CE_NOTE, "***** system state *****");
4845 for (vswp = vsw_head; vswp; vswp = vswp->next) {
4846 plist = &vswp->plist;
4847 READ_ENTER(&plist->lockrw);
4848 cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n",
4849 vswp->instance, plist->num_ports);
4851 for (port = plist->head; port != NULL; port = port->p_next) {
4852 cmn_err(CE_CONT, "port %d : %d ldcs attached\n",
4853 port->p_instance, port->num_ldcs);
4854 ldcp = port->ldcp;
4855 cmn_err(CE_CONT, "chan %lu : dev %d : "
4856 "status %d : phase %u\n",
4857 ldcp->ldc_id, ldcp->dev_class,
4858 ldcp->ldc_status, ldcp->hphase);
4859 cmn_err(CE_CONT, "chan %lu : lsession %lu : "
4860 "psession %lu\n", ldcp->ldc_id,
4861 ldcp->local_session, ldcp->peer_session);
4863 cmn_err(CE_CONT, "Inbound lane:\n");
4864 display_lane(&ldcp->lane_in);
4865 cmn_err(CE_CONT, "Outbound lane:\n");
4866 display_lane(&ldcp->lane_out);
4868 RW_EXIT(&plist->lockrw);
4870 cmn_err(CE_NOTE, "***** system state *****");
4873 static void
4874 display_lane(lane_t *lp)
4876 dring_info_t *drp = lp->dringp;
4878 cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n",
4879 lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu);
4880 cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n",
4881 lp->addr_type, lp->addr, lp->xfer_mode);
4882 cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp);
4884 cmn_err(CE_CONT, "Dring info:\n");
4885 cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n",
4886 drp->num_descriptors, drp->descriptor_size);
4887 cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->dring_handle);
4888 cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n",
4889 (uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr);
4890 cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n",
4891 drp->ident, drp->end_idx);
4892 display_ring(drp);
4895 static void
4896 display_ring(dring_info_t *dringp)
4898 uint64_t i;
4899 uint64_t priv_count = 0;
4900 uint64_t pub_count = 0;
4901 vnet_public_desc_t *pub_addr = NULL;
4902 vsw_private_desc_t *priv_addr = NULL;
4904 for (i = 0; i < vsw_num_descriptors; i++) {
4905 if (dringp->pub_addr != NULL) {
4906 pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i;
4908 if (pub_addr->hdr.dstate == VIO_DESC_FREE)
4909 pub_count++;
4912 if (dringp->priv_addr != NULL) {
4913 priv_addr = (vsw_private_desc_t *)dringp->priv_addr + i;
4915 if (priv_addr->dstate == VIO_DESC_FREE)
4916 priv_count++;
4919 cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n",
4920 i, priv_count, pub_count);
4923 static void
4924 dump_flags(uint64_t state)
4926 int i;
4928 typedef struct flag_name {
4929 int flag_val;
4930 char *flag_name;
4931 } flag_name_t;
4933 flag_name_t flags[] = {
4934 VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT",
4935 VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV",
4936 VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV",
4937 VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT",
4938 VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV",
4939 VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT",
4940 VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT",
4941 VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV",
4942 VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT",
4943 VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV",
4944 VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT",
4945 VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV",
4946 VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT",
4947 VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV",
4948 VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT",
4949 VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV",
4950 VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT",
4951 VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV",
4952 VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT",
4953 VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV",
4954 VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT",
4955 VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV",
4956 VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT",
4957 VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV",
4958 VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT",
4959 VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV",
4960 VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT",
4961 VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV",
4962 VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT",
4963 VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV",
4964 VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE"};
4966 DERR(NULL, "DUMP_FLAGS: %llx\n", state);
4967 for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) {
4968 if (state & flags[i].flag_val)
4969 DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name);