Merge illumos-gate
[unleashed.git] / usr / src / uts / common / io / ib / clients / rds / rdsddi.c
blobc9bcede785be8dabcdeb58796afc3c1786c6258a
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2018, Joyent, Inc.
 */
30 #include <sys/types.h>
31 #include <sys/conf.h>
32 #include <sys/modctl.h>
33 #include <sys/stat.h>
34 #include <sys/stream.h>
35 #include <sys/strsun.h>
36 #include <sys/ddi.h>
37 #include <sys/sunddi.h>
38 #include <sys/priv_names.h>
39 #include <inet/common.h>
41 #define _SUN_TPI_VERSION 2
42 #include <sys/tihdr.h>
43 #include <sys/timod.h>
44 #include <sys/tiuser.h>
45 #include <sys/suntpi.h>
46 #include <inet/common.h>
47 #include <inet/ip.h>
48 #include <inet/mi.h>
49 #include <inet/proto_set.h>
50 #include <sys/ib/clients/rds/rds.h>
51 #include <sys/policy.h>
52 #include <inet/ipclassifier.h>
53 #include <sys/ib/clients/rds/rds_kstat.h>
54 #include "sys/random.h"
55 #include <sys/ib/clients/rds/rds_transport.h>
56 #include <sys/ib/ibtl/ibti.h>
/* Driver identity and devops configuration. */
#define	RDS_NAME		"rds"
#define	RDS_STRTAB		rdsinfo
#define	RDS_DEVDESC		"RDS STREAMS driver"
#define	RDS_DEVMINOR		0
/*
 * Parenthesized so the flag set stays a single operand wherever the macro
 * is expanded (e.g. next to '&' or '~').
 */
#define	RDS_DEVMTFLAGS		(D_MP | D_SYNCSTR)
#define	RDS_DEFAULT_PRIV_MODE	0666

/* Inclusive range of port numbers rds_bind() may hand out. */
#define	rds_smallest_port	1
#define	rds_largest_port	65535

/* Default queue watermarks, in bytes. */
#define	RDS_RECV_HIWATER	(56 * 1024)
#define	RDS_RECV_LOWATER	128
#define	RDS_XMIT_HIWATER	(56 * 1024)
#define	RDS_XMIT_LOWATER	1024

/* "0 &&" prefix: whatever expression follows is never evaluated. */
#define	RDS_DPRINTF2	0 &&
/* Prefix used in this file's dprint() messages. */
#define	LABEL	"RDS"
77 typedef struct rdsahdr_s {
78 in_port_t uha_src_port; /* Source port */
79 in_port_t uha_dst_port; /* Destination port */
80 } rdsha_t;
82 #define RDSH_SIZE 4
84 int rds_recv_hiwat = RDS_RECV_HIWATER;
85 int rds_recv_lowat = RDS_RECV_LOWATER;
86 int rds_xmit_hiwat = RDS_XMIT_HIWATER;
87 int rds_xmit_lowat = RDS_XMIT_LOWATER;
89 int rdsdebug;
91 static dev_info_t *rds_dev_info;
93 /* Hint not protected by any lock */
94 static in_port_t rds_next_port_to_try;
96 ldi_ident_t rds_li;
97 static int loopmax = rds_largest_port - rds_smallest_port + 1;
99 /* global configuration variables */
100 uint_t UserBufferSize;
101 uint_t rds_rx_pkts_pending_hwm;
103 extern void rds_ioctl(queue_t *, mblk_t *);
104 extern void rds_ioctl_copyin_done(queue_t *q, mblk_t *mp);
106 int rds_open_transport_driver();
107 int rds_close_transport_driver();
109 #define RDS_CURRENT_PORT_QUOTA() \
110 (rds_rx_pkts_pending_hwm/RDS_GET_NPORT())
112 krwlock_t rds_transport_lock;
113 ldi_handle_t rds_transport_handle = NULL;
114 rds_transport_ops_t *rds_transport_ops = NULL;
116 static int
117 rds_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
119 int ret;
121 if (cmd != DDI_ATTACH)
122 return (DDI_FAILURE);
124 rds_dev_info = devi;
126 ret = ddi_create_minor_node(devi, RDS_NAME, S_IFCHR,
127 RDS_DEVMINOR, DDI_PSEUDO, 0);
128 if (ret != DDI_SUCCESS) {
129 return (ret);
132 return (DDI_SUCCESS);
135 static int
136 rds_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
138 if (cmd != DDI_DETACH)
139 return (DDI_FAILURE);
141 ASSERT(devi == rds_dev_info);
143 ddi_remove_minor_node(devi, NULL);
145 return (DDI_SUCCESS);
148 /* ARGSUSED */
149 static int
150 rds_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
152 int error = DDI_FAILURE;
154 switch (cmd) {
155 case DDI_INFO_DEVT2DEVINFO:
156 if (rds_dev_info != NULL) {
157 *result = (void *)rds_dev_info;
158 error = DDI_SUCCESS;
160 break;
162 case DDI_INFO_DEVT2INSTANCE:
163 *result = NULL;
164 error = DDI_SUCCESS;
165 break;
167 default:
168 break;
171 return (error);
175 /*ARGSUSED*/
176 static int
177 rds_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
179 rds_t *rds;
180 int ret;
182 /* Open the transport driver if IB HW is present */
183 rw_enter(&rds_transport_lock, RW_READER);
184 if (rds_transport_handle == NULL) {
185 rw_exit(&rds_transport_lock);
186 ret = rds_open_transport_driver();
187 rw_enter(&rds_transport_lock, RW_READER);
189 if (ret != 0) {
190 /* Transport driver failed to load */
191 rw_exit(&rds_transport_lock);
192 return (ret);
195 rw_exit(&rds_transport_lock);
197 if (sflag == MODOPEN) {
198 return (EINVAL);
201 /* Reopen not supported */
202 if (q->q_ptr != NULL) {
203 dprint(2, ("%s: Reopen is not supported: %p", LABEL, q->q_ptr));
204 return (0);
207 rds = rds_create(q, credp);
208 if (rds == NULL) {
209 dprint(2, ("%s: rds_create failed", LABEL));
210 return (0);
213 q->q_ptr = WR(q)->q_ptr = rds;
214 rds->rds_state = TS_UNBND;
215 rds->rds_family = AF_INET_OFFLOAD;
217 q->q_hiwat = rds_recv_hiwat;
218 q->q_lowat = rds_recv_lowat;
220 qprocson(q);
222 WR(q)->q_hiwat = rds_xmit_hiwat;
223 WR(q)->q_lowat = rds_xmit_lowat;
225 /* Set the Stream head watermarks */
226 (void) proto_set_rx_hiwat(q, NULL, rds_recv_hiwat);
227 (void) proto_set_rx_lowat(q, NULL, rds_recv_lowat);
229 return (0);
232 /* ARGSUSED */
233 static int
234 rds_close(queue_t *q, int flags __unused, cred_t *credp __unused)
236 rds_t *rdsp = (rds_t *)q->q_ptr;
238 qprocsoff(q);
241 * NPORT should be decremented only if this socket was previously
242 * bound to an RDS port.
244 if (rdsp->rds_state >= TS_IDLE) {
245 RDS_DECR_NPORT();
246 RDS_SET_PORT_QUOTA(RDS_CURRENT_PORT_QUOTA());
247 rds_transport_ops->
248 rds_transport_resume_port(ntohs(rdsp->rds_port));
251 /* close the transport driver if this is the last socket */
252 if (RDS_GET_NPORT() == 1) {
253 (void) rds_close_transport_driver();
257 * We set the flags without holding a lock as this is
258 * just a hint for the fanout lookup to skip this rds.
259 * We dont free the struct until it's out of the hash and
260 * the ref count goes down.
262 rdsp->rds_flags |= RDS_CLOSING;
263 rds_bind_hash_remove(rdsp, B_FALSE);
264 mutex_enter(&rdsp->rds_lock);
265 ASSERT(rdsp->rds_refcnt > 0);
266 if (rdsp->rds_refcnt != 1) {
267 cv_wait(&rdsp->rds_refcv, &rdsp->rds_lock);
269 mutex_exit(&rdsp->rds_lock);
270 RDS_DEC_REF_CNT(rdsp);
271 RD(q)->q_ptr = NULL;
272 WR(q)->q_ptr = NULL;
273 return (0);
277 * Add a new message to the socket
280 rds_deliver_new_msg(mblk_t *mp, ipaddr_t local_addr, ipaddr_t rem_addr,
281 in_port_t local_port, in_port_t rem_port, zoneid_t zoneid)
283 rds_t *rds;
284 struct T_unitdata_ind *tudi;
285 int udi_size; /* Size of T_unitdata_ind */
286 mblk_t *mp1;
287 sin_t *sin;
288 int error = 0;
290 local_port = htons(local_port);
291 rem_port = htons(rem_port);
293 ASSERT(mp->b_datap->db_type == M_DATA);
294 rds = rds_fanout(local_addr, rem_addr, local_port, rem_port, zoneid);
295 if (rds == NULL) {
296 dprint(2, ("%s: rds_fanout failed: (0x%x 0x%x %d %d)", LABEL,
297 local_addr, rem_addr, ntohs(local_port), ntohs(rem_port)));
298 freemsg(mp);
299 return (error);
302 udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t);
304 /* Allocate a message block for the T_UNITDATA_IND structure. */
305 mp1 = allocb(udi_size, BPRI_MED);
306 if (mp1 == NULL) {
307 dprint(2, ("%s: allocb failed", LABEL));
308 freemsg(mp);
309 return (ENOMEM);
312 mp1->b_cont = mp;
313 mp = mp1;
314 mp->b_datap->db_type = M_PROTO;
315 tudi = (struct T_unitdata_ind *)(uintptr_t)mp->b_rptr;
316 mp->b_wptr = (uchar_t *)tudi + udi_size;
317 tudi->PRIM_type = T_UNITDATA_IND;
318 tudi->SRC_length = sizeof (sin_t);
319 tudi->SRC_offset = sizeof (struct T_unitdata_ind);
320 tudi->OPT_offset = sizeof (struct T_unitdata_ind) + sizeof (sin_t);
321 udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin_t));
322 tudi->OPT_length = udi_size;
323 sin = (sin_t *)&tudi[1];
324 sin->sin_addr.s_addr = rem_addr;
325 sin->sin_port = ntohs(rem_port);
326 sin->sin_family = rds->rds_family;
327 *(uint32_t *)(uintptr_t)&sin->sin_zero[0] = 0;
328 *(uint32_t *)(uintptr_t)&sin->sin_zero[4] = 0;
330 putnext(rds->rds_ulpd, mp);
332 /* check port quota */
333 if (RDS_GET_RXPKTS_PEND() > rds_rx_pkts_pending_hwm) {
334 ulong_t current_port_quota = RDS_GET_PORT_QUOTA();
335 if (rds->rds_port_quota > current_port_quota) {
336 /* this may result in stalling the port */
337 rds->rds_port_quota = current_port_quota;
338 (void) proto_set_rx_hiwat(rds->rds_ulpd, NULL,
339 rds->rds_port_quota * UserBufferSize);
340 RDS_INCR_PORT_QUOTA_ADJUSTED();
345 * canputnext() check is done after putnext as the protocol does
346 * not allow dropping any received packet.
348 if (!canputnext(rds->rds_ulpd)) {
349 error = ENOSPC;
352 RDS_DEC_REF_CNT(rds);
353 return (error);
357 /* Default structure copied into T_INFO_ACK messages */
358 static struct T_info_ack rds_g_t_info_ack_ipv4 = {
359 T_INFO_ACK,
360 65535, /* TSDU_size. Excl. headers */
361 T_INVALID, /* ETSU_size. rds does not support expedited data. */
362 T_INVALID, /* CDATA_size. rds does not support connect data. */
363 T_INVALID, /* DDATA_size. rds does not support disconnect data. */
364 sizeof (sin_t), /* ADDR_size. */
365 0, /* OPT_size - not initialized here */
366 65535, /* TIDU_size. Excl. headers */
367 T_CLTS, /* SERV_type. rds supports connection-less. */
368 TS_UNBND, /* CURRENT_state. This is set from rds_state. */
369 (XPG4_1|SENDZERO) /* PROVIDER_flag */
372 static in_port_t
373 rds_update_next_port(in_port_t port)
375 (void) random_get_pseudo_bytes((uint8_t *)&port, sizeof (in_port_t));
376 if (port < rds_smallest_port)
377 port = rds_smallest_port;
378 return (port);
381 /* This routine creates a T_ERROR_ACK message and passes it upstream. */
382 static void
383 rds_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
385 if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
386 qreply(q, mp);
389 static void
390 rds_capability_req(queue_t *q, mblk_t *mp)
392 t_uscalar_t cap_bits1;
393 struct T_capability_ack *tcap;
395 cap_bits1 =
396 ((struct T_capability_req *)(uintptr_t)mp->b_rptr)->CAP_bits1;
398 mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
399 mp->b_datap->db_type, T_CAPABILITY_ACK);
400 if (mp == NULL)
401 return;
402 tcap = (struct T_capability_ack *)(uintptr_t)mp->b_rptr;
403 tcap->CAP_bits1 = 0;
405 if (cap_bits1 & TC1_INFO) {
406 tcap->CAP_bits1 |= TC1_INFO;
407 *(&tcap->INFO_ack) = rds_g_t_info_ack_ipv4;
410 qreply(q, mp);
413 static void
414 rds_info_req(queue_t *q, mblk_t *omp)
416 rds_t *rds = (rds_t *)q->q_ptr;
417 struct T_info_ack *tap;
418 mblk_t *mp;
420 /* Create a T_INFO_ACK message. */
421 mp = tpi_ack_alloc(omp, sizeof (struct T_info_ack), M_PCPROTO,
422 T_INFO_ACK);
423 if (mp == NULL)
424 return;
425 tap = (struct T_info_ack *)(uintptr_t)mp->b_rptr;
426 *tap = rds_g_t_info_ack_ipv4;
427 tap->CURRENT_state = rds->rds_state;
428 tap->OPT_size = 128;
429 qreply(q, mp);
433 * NO locking protection here as sockfs will only send down
434 * one bind operation at a time.
436 static void
437 rds_bind(queue_t *q, mblk_t *mp)
439 sin_t *sin;
440 rds_t *rds;
441 struct T_bind_req *tbr;
442 in_port_t port; /* Host byte order */
443 in_port_t requested_port; /* Host byte order */
444 struct T_bind_ack *tba;
445 int count;
446 rds_bf_t *rdsbf;
447 in_port_t lport; /* Network byte order */
449 rds = (rds_t *)q->q_ptr;
450 if (((uintptr_t)mp->b_wptr - (uintptr_t)mp->b_rptr) < sizeof (*tbr)) {
451 rds_err_ack(q, mp, TPROTO, 0);
452 return;
456 * We don't allow multiple binds
458 if (rds->rds_state != TS_UNBND) {
459 rds_err_ack(q, mp, TOUTSTATE, 0);
460 return;
463 tbr = (struct T_bind_req *)(uintptr_t)mp->b_rptr;
464 switch (tbr->ADDR_length) {
465 case sizeof (sin_t): /* Complete IPv4 address */
466 sin = (sin_t *)(uintptr_t)mi_offset_param(mp, tbr->ADDR_offset,
467 sizeof (sin_t));
468 if (sin == NULL || !OK_32PTR((char *)sin)) {
469 rds_err_ack(q, mp, TSYSERR, EINVAL);
470 return;
472 if (rds->rds_family != AF_INET_OFFLOAD ||
473 sin->sin_family != AF_INET_OFFLOAD) {
474 rds_err_ack(q, mp, TSYSERR, EAFNOSUPPORT);
475 return;
477 if (sin->sin_addr.s_addr == INADDR_ANY) {
478 rds_err_ack(q, mp, TBADADDR, 0);
479 return;
483 * verify that the address is hosted on IB
484 * only exception is the loopback address.
486 if ((sin->sin_addr.s_addr != INADDR_LOOPBACK) &&
487 !rds_verify_bind_address(sin->sin_addr.s_addr)) {
488 rds_err_ack(q, mp, TBADADDR, 0);
489 return;
492 port = ntohs(sin->sin_port);
493 break;
494 default: /* Invalid request */
495 rds_err_ack(q, mp, TBADADDR, 0);
496 return;
499 requested_port = port;
502 * TPI only sends down T_BIND_REQ for AF_INET and AF_INET6
503 * since RDS socket is of type AF_INET_OFFLOAD a O_T_BIND_REQ
504 * will be sent down. Treat O_T_BIND_REQ as T_BIND_REQ
507 if (requested_port == 0) {
509 * If the application passed in zero for the port number, it
510 * doesn't care which port number we bind to. Get one in the
511 * valid range.
513 port = rds_update_next_port(rds_next_port_to_try);
516 ASSERT(port != 0);
517 count = 0;
518 for (;;) {
519 rds_t *rds1;
520 ASSERT(sin->sin_addr.s_addr != INADDR_ANY);
522 * Walk through the list of rds streams bound to
523 * requested port with the same IP address.
525 lport = htons(port);
526 rdsbf = &rds_bind_fanout[RDS_BIND_HASH(lport)];
527 mutex_enter(&rdsbf->rds_bf_lock);
528 for (rds1 = rdsbf->rds_bf_rds; rds1 != NULL;
529 rds1 = rds1->rds_bind_hash) {
530 if (lport != rds1->rds_port ||
531 rds1->rds_src != sin->sin_addr.s_addr ||
532 rds1->rds_zoneid != rds->rds_zoneid)
534 continue;
535 break;
538 if (rds1 == NULL) {
540 * No other stream has this IP address
541 * and port number. We can use it.
543 break;
545 mutex_exit(&rdsbf->rds_bf_lock);
546 if (requested_port != 0) {
548 * We get here only when requested port
549 * is bound (and only first of the for()
550 * loop iteration).
552 * The semantics of this bind request
553 * require it to fail so we return from
554 * the routine (and exit the loop).
557 rds_err_ack(q, mp, TADDRBUSY, 0);
558 return;
561 port = rds_update_next_port(port + 1);
563 if (++count >= loopmax) {
565 * We've tried every possible port number and
566 * there are none available, so send an error
567 * to the user.
569 rds_err_ack(q, mp, TNOADDR, 0);
570 return;
575 * Copy the source address into our rds structure.
577 rds->rds_src = sin->sin_addr.s_addr;
578 rds->rds_port = lport;
581 * reset the next port if we choose the port
583 if (requested_port == 0) {
584 rds_next_port_to_try = port + 1;
587 rds->rds_state = TS_IDLE;
588 rds_bind_hash_insert(rdsbf, rds);
589 mutex_exit(&rdsbf->rds_bf_lock);
591 /* Reset the message type in preparation for shipping it back. */
592 mp->b_datap->db_type = M_PCPROTO;
593 tba = (struct T_bind_ack *)(uintptr_t)mp->b_rptr;
594 tba->PRIM_type = T_BIND_ACK;
596 /* Increment the number of ports and set the port quota */
597 RDS_INCR_NPORT();
598 rds->rds_port_quota = RDS_CURRENT_PORT_QUOTA();
599 RDS_SET_PORT_QUOTA(rds->rds_port_quota);
600 (void) proto_set_rx_hiwat(RD(q), NULL,
601 rds->rds_port_quota * UserBufferSize);
603 qreply(q, mp);
606 static void
607 rds_wput_other(queue_t *q, mblk_t *mp)
609 uchar_t *rptr = mp->b_rptr;
610 struct datab *db;
611 cred_t *cr;
613 db = mp->b_datap;
614 switch (db->db_type) {
615 case M_DATA:
616 /* Not connected */
617 freemsg(mp);
618 return;
619 case M_PROTO:
620 case M_PCPROTO:
621 if ((uintptr_t)mp->b_wptr - (uintptr_t)rptr <
622 sizeof (t_scalar_t)) {
623 freemsg(mp);
624 return;
626 switch (((union T_primitives *)(uintptr_t)rptr)->type) {
627 case T_CAPABILITY_REQ:
628 rds_capability_req(q, mp);
629 return;
631 case T_INFO_REQ:
632 rds_info_req(q, mp);
633 return;
634 case O_T_BIND_REQ:
635 case T_BIND_REQ:
636 rds_bind(q, mp);
637 return;
638 case T_SVR4_OPTMGMT_REQ:
639 case T_OPTMGMT_REQ:
641 * All Solaris components should pass a db_credp
642 * for this TPI message, hence we ASSERT.
643 * But in case there is some other M_PROTO that looks
644 * like a TPI message sent by some other kernel
645 * component, we check and return an error.
647 cr = msg_getcred(mp, NULL);
648 ASSERT(cr != NULL);
649 if (cr == NULL) {
650 rds_err_ack(q, mp, TSYSERR, EINVAL);
651 return;
653 if (((union T_primitives *)(uintptr_t)rptr)->type ==
654 T_SVR4_OPTMGMT_REQ) {
655 svr4_optcom_req(q, mp, cr, &rds_opt_obj);
656 } else {
657 tpi_optcom_req(q, mp, cr, &rds_opt_obj);
659 return;
660 case T_CONN_REQ:
662 * We should not receive T_CONN_REQ as sockfs only
663 * sends down T_CONN_REQ if family == AF_INET/AF_INET6
664 * and type == SOCK_DGRAM/SOCK_RAW. For all others
665 * it simply calls soisconnected. see sotpi_connect()
666 * for details.
668 /* FALLTHRU */
669 default:
670 cmn_err(CE_PANIC, "type %d \n",
671 ((union T_primitives *)(uintptr_t)rptr)->type);
673 break;
674 case M_FLUSH:
675 if (*rptr & FLUSHW)
676 flushq(q, FLUSHDATA);
677 break;
678 case M_IOCTL:
679 rds_ioctl(q, mp);
680 break;
681 case M_IOCDATA:
682 /* IOCTL continuation following copyin or copyout. */
683 if (mi_copy_state(q, mp, NULL) == -1) {
685 * The copy operation failed. mi_copy_state already
686 * cleaned up, so we're out of here.
688 return;
691 * If we just completed a copy in, continue processing
692 * in rds_ioctl_copyin_done. If it was a copy out, we call
693 * mi_copyout again. If there is nothing more to copy out,
694 * it will complete the IOCTL.
697 if (MI_COPY_DIRECTION(mp) == MI_COPY_IN)
698 rds_ioctl_copyin_done(q, mp);
699 else
700 mi_copyout(q, mp);
701 return;
703 default:
704 cmn_err(CE_PANIC, "types %d \n", db->db_type);
708 static int
709 rds_wput(queue_t *q, mblk_t *mp)
711 struct datab *db;
712 uchar_t *rptr = mp->b_rptr;
714 db = mp->b_datap;
715 switch (db->db_type) {
716 case M_PROTO:
717 case M_PCPROTO:
718 ASSERT(((uintptr_t)mp->b_wptr - (uintptr_t)rptr) <=
719 (uintptr_t)INT_MAX);
720 if ((uintptr_t)mp->b_wptr - (uintptr_t)rptr >=
721 sizeof (struct T_unitdata_req)) {
722 if (((union T_primitives *)(uintptr_t)rptr)->type
723 == T_UNITDATA_REQ) {
725 * We should never come here for T_UNITDATA_REQ
727 cmn_err(CE_PANIC, "rds_wput T_UNITDATA_REQ \n");
730 /* FALLTHRU */
731 default:
732 rds_wput_other(q, mp);
733 return (0);
737 static int
738 rds_wput_data(queue_t *q, mblk_t *mp, uio_t *uiop)
740 uchar_t *rptr = mp->b_rptr;
741 rds_t *rds;
742 mblk_t *mp1;
743 sin_t *sin;
744 ipaddr_t dst;
745 uint16_t port;
746 int ret = 0;
748 #define tudr ((struct T_unitdata_req *)(uintptr_t)rptr)
750 rds = (rds_t *)q->q_ptr;
751 /* Handle UNITDATA_REQ messages here */
752 if (rds->rds_state == TS_UNBND) {
753 /* If a port has not been bound to the stream, fail. */
754 dprint(2, ("%s: socket is not bound to a port", LABEL));
755 freemsg(mp);
756 return (EPROTO);
759 mp1 = mp->b_cont;
760 mp->b_cont = NULL;
761 if (mp1 == NULL) {
762 dprint(2, ("%s: No message to send", LABEL));
763 freemsg(mp);
764 return (EPROTO);
768 * No options allowed
770 if (tudr->OPT_length != 0) {
771 ret = EINVAL;
772 goto done;
775 ASSERT(mp1->b_datap->db_ref == 1);
777 if ((rptr + tudr->DEST_offset + tudr->DEST_length) >
778 mp->b_wptr) {
779 ret = EDESTADDRREQ;
780 goto done;
783 sin = (sin_t *)(uintptr_t)&rptr[tudr->DEST_offset];
784 if (!OK_32PTR((char *)sin) || tudr->DEST_length !=
785 sizeof (sin_t) || sin->sin_family != AF_INET_OFFLOAD) {
786 ret = EDESTADDRREQ;
787 goto done;
789 /* Extract port and ipaddr */
790 port = sin->sin_port;
791 dst = sin->sin_addr.s_addr;
793 if (port == 0 || dst == INADDR_ANY) {
794 ret = EDESTADDRREQ;
795 goto done;
798 ASSERT(rds_transport_ops != NULL);
799 ret = rds_transport_ops->rds_transport_sendmsg(uiop, rds->rds_src, dst,
800 ntohs(rds->rds_port), ntohs(port), rds->rds_zoneid);
801 if (ret != 0) {
802 if ((ret != ENOBUFS) && (ret != ENOMEM)) {
803 /* ENOMEM is actually EWOULDBLOCK */
804 dprint(2, ("%s: rds_sendmsg returned %d", LABEL, ret));
805 goto done;
808 done:
809 freemsg(mp1);
810 freemsg(mp);
811 return (ret);
815 * Make sure we dont return EINVAL and EWOULDBLOCK as it has
816 * special meanings for the synchronous streams (rwnext()).
817 * We should return ENOMEM which is changed to EWOULDBLOCK by kstrputmsg()
819 static int
820 rds_wrw(queue_t *q, struiod_t *dp)
822 mblk_t *mp = dp->d_mp;
823 int error = 0;
824 struct datab *db;
825 uchar_t *rptr;
827 db = mp->b_datap;
828 rptr = mp->b_rptr;
829 switch (db->db_type) {
830 case M_PROTO:
831 case M_PCPROTO:
832 ASSERT(((uintptr_t)mp->b_wptr - (uintptr_t)rptr) <=
833 (uintptr_t)INT_MAX);
834 if ((uintptr_t)mp->b_wptr - (uintptr_t)rptr >=
835 sizeof (struct T_unitdata_req)) {
836 /* Detect valid T_UNITDATA_REQ here */
837 if (((union T_primitives *)(uintptr_t)rptr)->type
838 == T_UNITDATA_REQ)
839 break;
841 /* FALLTHRU */
842 default:
844 if (isuioq(q) && (error = struioget(q, mp, dp, 0))) {
846 * Uio error of some sort, so just return the error.
848 goto done;
850 dp->d_mp = 0;
851 rds_wput_other(q, mp);
852 return (0);
855 dp->d_mp = 0;
856 error = rds_wput_data(q, mp, &dp->d_uio);
857 done:
858 if (error == EWOULDBLOCK || error == EINVAL)
859 error = EIO;
861 return (error);
864 static void
865 rds_rsrv(queue_t *q)
867 rds_t *rds = (rds_t *)q->q_ptr;
868 ulong_t current_port_quota;
870 /* update the port quota to the current level */
871 current_port_quota = RDS_GET_PORT_QUOTA();
872 if (rds->rds_port_quota != current_port_quota) {
873 rds->rds_port_quota = current_port_quota;
874 (void) proto_set_rx_hiwat(q, NULL,
875 rds->rds_port_quota * UserBufferSize);
878 /* No more messages in the q, unstall the socket */
879 rds_transport_ops->rds_transport_resume_port(ntohs(rds->rds_port));
883 rds_close_transport_driver()
885 ASSERT(rds_transport_ops != NULL);
887 rw_enter(&rds_transport_lock, RW_WRITER);
888 if (rds_transport_handle != NULL) {
889 rds_transport_ops->rds_transport_close_ib();
890 (void) ldi_close(rds_transport_handle, FNDELAY, kcred);
891 rds_transport_handle = NULL;
893 rw_exit(&rds_transport_lock);
895 return (0);
900 rds_open_transport_driver()
902 int ret = 0;
904 rw_enter(&rds_transport_lock, RW_WRITER);
905 if (rds_transport_handle != NULL) {
907 * Someone beat us to it.
909 goto done;
912 if (ibt_hw_is_present() == 0) {
913 ret = ENODEV;
914 goto done;
917 if (rds_li == NULL) {
918 ret = EPROTONOSUPPORT;
919 goto done;
922 ret = ldi_open_by_name("/devices/ib/rdsib@0:rdsib",
923 FREAD | FWRITE, kcred, &rds_transport_handle, rds_li);
924 if (ret != 0) {
925 ret = EPROTONOSUPPORT;
926 rds_transport_handle = NULL;
927 goto done;
930 ret = rds_transport_ops->rds_transport_open_ib();
931 if (ret != 0) {
932 (void) ldi_close(rds_transport_handle, FNDELAY, kcred);
933 rds_transport_handle = NULL;
935 done:
936 rw_exit(&rds_transport_lock);
937 return (ret);
940 static struct module_info info = {
941 0, "rds", 1, INFPSZ, 65536, 1024
944 static struct qinit rinit = {
945 NULL, (pfi_t)rds_rsrv, rds_open, rds_close, NULL, &info
948 static struct qinit winit = {
949 (pfi_t)rds_wput, NULL, rds_open, rds_close, NULL, &info,
950 NULL, rds_wrw, NULL, STRUIOT_STANDARD
953 struct streamtab rdsinfo = {
954 &rinit, &winit, NULL, NULL
957 DDI_DEFINE_STREAM_OPS(rds_devops, nulldev, nulldev, rds_attach, rds_detach,
958 nulldev, rds_info, RDS_DEVMTFLAGS, &RDS_STRTAB, ddi_quiesce_not_supported);
961 * Module linkage information for the kernel.
963 static struct modldrv modldrv = {
964 &mod_driverops,
965 RDS_DEVDESC,
966 &rds_devops
969 static struct modlinkage modlinkage = {
970 MODREV_1,
971 &modldrv,
972 NULL
976 _init(void)
978 int ret;
980 rds_init();
982 ret = mod_install(&modlinkage);
983 if (ret != 0)
984 goto done;
985 ret = ldi_ident_from_mod(&modlinkage, &rds_li);
986 if (ret != 0)
987 rds_li = NULL;
988 done:
989 return (ret);
993 _fini(void)
995 int ret;
997 ret = mod_remove(&modlinkage);
998 if (ret != 0) {
999 return (ret);
1002 rds_fini();
1004 ldi_ident_release(rds_li);
1005 return (0);
1009 _info(struct modinfo *modinfop)
1011 return (mod_info(&modlinkage, modinfop));