usr/src/uts/common/inet/tcp/tcp_tpi.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25
/* This file contains all TCP TLI/TPI related functions. */
  27
  28 #include <sys/types.h>
  29 #include <sys/stream.h>
  30 #include <sys/strsun.h>
  31 #include <sys/strsubr.h>
  32 #include <sys/stropts.h>
  33 #include <sys/strlog.h>
  34 #define _SUN_TPI_VERSION 2
  35 #include <sys/tihdr.h>
  36 #include <sys/suntpi.h>
  37 #include <sys/xti_inet.h>
  38 #include <sys/squeue_impl.h>
  39 #include <sys/squeue.h>
  40
  41 #include <inet/common.h>
  42 #include <inet/ip.h>
  43 #include <inet/tcp.h>
  44 #include <inet/tcp_impl.h>
  45 #include <inet/proto_set.h>
  46
  47 static void     tcp_accept_swap(tcp_t *, tcp_t *, tcp_t *);
  48 static int      tcp_conprim_opt_process(tcp_t *, mblk_t *, int *, int *, int *);
  49
  50 void
  51 tcp_use_pure_tpi(tcp_t *tcp)
  52 {
  53         conn_t          *connp = tcp->tcp_connp;
  54
  55 #ifdef  _ILP32
  56         tcp->tcp_acceptor_id = (t_uscalar_t)connp->conn_rq;
  57 #else
  58         tcp->tcp_acceptor_id = connp->conn_dev;
  59 #endif
  60         /*
  61          * Insert this socket into the acceptor hash.
  62          * We might need it for T_CONN_RES message
  63          */
  64         tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp);
  65
  66         tcp->tcp_issocket = B_FALSE;
  67         TCP_STAT(tcp->tcp_tcps, tcp_sock_fallback);
  68 }
  69
  70 /* Shorthand to generate and send TPI error acks to our client */
  71 void
  72 tcp_err_ack(tcp_t *tcp, mblk_t *mp, int t_error, int sys_error)
  73 {
  74         if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
  75                 putnext(tcp->tcp_connp->conn_rq, mp);
  76 }
  77
  78 /* Shorthand to generate and send TPI error acks to our client */
  79 void
  80 tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive,
  81     int t_error, int sys_error)
  82 {
  83         struct T_error_ack      *teackp;
  84
  85         if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack),
  86             M_PCPROTO, T_ERROR_ACK)) != NULL) {
  87                 teackp = (struct T_error_ack *)mp->b_rptr;
  88                 teackp->ERROR_prim = primitive;
  89                 teackp->TLI_error = t_error;
  90                 teackp->UNIX_error = sys_error;
  91                 putnext(tcp->tcp_connp->conn_rq, mp);
  92         }
  93 }
  94
  95 /*
  96  * TCP routine to get the values of options.
  97  */
  98 int
  99 tcp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
 100 {
 101         return (tcp_opt_get(Q_TO_CONN(q), level, name, ptr));
 102 }
 103
 104 /* ARGSUSED */
 105 int
 106 tcp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name,
 107     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
 108     void *thisdg_attrs, cred_t *cr)
 109 {
 110         conn_t  *connp =  Q_TO_CONN(q);
 111
 112         return (tcp_opt_set(connp, optset_context, level, name, inlen, invalp,
 113             outlenp, outvalp, thisdg_attrs, cr));
 114 }
 115
 116 static int
 117 tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp, int *do_disconnectp,
 118     int *t_errorp, int *sys_errorp)
 119 {
 120         int error;
 121         int is_absreq_failure;
 122         t_scalar_t *opt_lenp;
 123         t_scalar_t opt_offset;
 124         int prim_type;
 125         struct T_conn_req *tcreqp;
 126         struct T_conn_res *tcresp;
 127         cred_t *cr;
 128
 129         /*
 130          * All Solaris components should pass a db_credp
 131          * for this TPI message, hence we ASSERT.
 132          * But in case there is some other M_PROTO that looks
 133          * like a TPI message sent by some other kernel
 134          * component, we check and return an error.
 135          */
 136         cr = msg_getcred(mp, NULL);
 137         ASSERT(cr != NULL);
 138         if (cr == NULL)
 139                 return (-1);
 140
 141         prim_type = ((union T_primitives *)mp->b_rptr)->type;
 142         ASSERT(prim_type == T_CONN_REQ || prim_type == O_T_CONN_RES ||
 143             prim_type == T_CONN_RES);
 144
 145         switch (prim_type) {
 146         case T_CONN_REQ:
 147                 tcreqp = (struct T_conn_req *)mp->b_rptr;
 148                 opt_offset = tcreqp->OPT_offset;
 149                 opt_lenp = (t_scalar_t *)&tcreqp->OPT_length;
 150                 break;
 151         case O_T_CONN_RES:
 152         case T_CONN_RES:
 153                 tcresp = (struct T_conn_res *)mp->b_rptr;
 154                 opt_offset = tcresp->OPT_offset;
 155                 opt_lenp = (t_scalar_t *)&tcresp->OPT_length;
 156                 break;
 157         }
 158
 159         *t_errorp = 0;
 160         *sys_errorp = 0;
 161         *do_disconnectp = 0;
 162
 163         error = tpi_optcom_buf(tcp->tcp_connp->conn_wq, mp, opt_lenp,
 164             opt_offset, cr, &tcp_opt_obj,
 165             NULL, &is_absreq_failure);
 166
 167         switch (error) {
 168         case  0:                /* no error */
 169                 ASSERT(is_absreq_failure == 0);
 170                 return (0);
 171         case ENOPROTOOPT:
 172                 *t_errorp = TBADOPT;
 173                 break;
 174         case EACCES:
 175                 *t_errorp = TACCES;
 176                 break;
 177         default:
 178                 *t_errorp = TSYSERR; *sys_errorp = error;
 179                 break;
 180         }
 181         if (is_absreq_failure != 0) {
 182                 /*
 183                  * The connection request should get the local ack
 184                  * T_OK_ACK and then a T_DISCON_IND.
 185                  */
 186                 *do_disconnectp = 1;
 187         }
 188         return (-1);
 189 }
 190
 191 void
 192 tcp_tpi_bind(tcp_t *tcp, mblk_t *mp)
 193 {
 194         int     error;
 195         conn_t  *connp = tcp->tcp_connp;
 196         struct sockaddr *sa;
 197         mblk_t  *mp1;
 198         struct T_bind_req *tbr;
 199         int     backlog;
 200         socklen_t       len;
 201         sin_t   *sin;
 202         sin6_t  *sin6;
 203         cred_t          *cr;
 204
 205         /*
 206          * All Solaris components should pass a db_credp
 207          * for this TPI message, hence we ASSERT.
 208          * But in case there is some other M_PROTO that looks
 209          * like a TPI message sent by some other kernel
 210          * component, we check and return an error.
 211          */
 212         cr = msg_getcred(mp, NULL);
 213         ASSERT(cr != NULL);
 214         if (cr == NULL) {
 215                 tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
 216                 return;
 217         }
 218
 219         ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
 220         if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
 221                 if (connp->conn_debug) {
 222                         (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
 223                             "tcp_tpi_bind: bad req, len %u",
 224                             (uint_t)(mp->b_wptr - mp->b_rptr));
 225                 }
 226                 tcp_err_ack(tcp, mp, TPROTO, 0);
 227                 return;
 228         }
 229         /* Make sure the largest address fits */
 230         mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1);
 231         if (mp1 == NULL) {
 232                 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
 233                 return;
 234         }
 235         mp = mp1;
 236         tbr = (struct T_bind_req *)mp->b_rptr;
 237
 238         backlog = tbr->CONIND_number;
 239         len = tbr->ADDR_length;
 240
 241         switch (len) {
 242         case 0:         /* request for a generic port */
 243                 tbr->ADDR_offset = sizeof (struct T_bind_req);
 244                 if (connp->conn_family == AF_INET) {
 245                         tbr->ADDR_length = sizeof (sin_t);
 246                         sin = (sin_t *)&tbr[1];
 247                         *sin = sin_null;
 248                         sin->sin_family = AF_INET;
 249                         sa = (struct sockaddr *)sin;
 250                         len = sizeof (sin_t);
 251                         mp->b_wptr = (uchar_t *)&sin[1];
 252                 } else {
 253                         ASSERT(connp->conn_family == AF_INET6);
 254                         tbr->ADDR_length = sizeof (sin6_t);
 255                         sin6 = (sin6_t *)&tbr[1];
 256                         *sin6 = sin6_null;
 257                         sin6->sin6_family = AF_INET6;
 258                         sa = (struct sockaddr *)sin6;
 259                         len = sizeof (sin6_t);
 260                         mp->b_wptr = (uchar_t *)&sin6[1];
 261                 }
 262                 break;
 263
 264         case sizeof (sin_t):    /* Complete IPv4 address */
 265                 sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset,
 266                     sizeof (sin_t));
 267                 break;
 268
 269         case sizeof (sin6_t): /* Complete IPv6 address */
 270                 sa = (struct sockaddr *)mi_offset_param(mp,
 271                     tbr->ADDR_offset, sizeof (sin6_t));
 272                 break;
 273
 274         default:
 275                 if (connp->conn_debug) {
 276                         (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
 277                             "tcp_tpi_bind: bad address length, %d",
 278                             tbr->ADDR_length);
 279                 }
 280                 tcp_err_ack(tcp, mp, TBADADDR, 0);
 281                 return;
 282         }
 283
 284         if (backlog > 0) {
 285                 error = tcp_do_listen(connp, sa, len, backlog, DB_CRED(mp),
 286                     tbr->PRIM_type != O_T_BIND_REQ);
 287         } else {
 288                 error = tcp_do_bind(connp, sa, len, DB_CRED(mp),
 289                     tbr->PRIM_type != O_T_BIND_REQ);
 290         }
 291 done:
 292         if (error > 0) {
 293                 tcp_err_ack(tcp, mp, TSYSERR, error);
 294         } else if (error < 0) {
 295                 tcp_err_ack(tcp, mp, -error, 0);
 296         } else {
 297                 /*
 298                  * Update port information as sockfs/tpi needs it for checking
 299                  */
 300                 if (connp->conn_family == AF_INET) {
 301                         sin = (sin_t *)sa;
 302                         sin->sin_port = connp->conn_lport;
 303                 } else {
 304                         sin6 = (sin6_t *)sa;
 305                         sin6->sin6_port = connp->conn_lport;
 306                 }
 307                 mp->b_datap->db_type = M_PCPROTO;
 308                 tbr->PRIM_type = T_BIND_ACK;
 309                 putnext(connp->conn_rq, mp);
 310         }
 311 }
 312
 313 /* tcp_unbind is called by tcp_wput_proto to handle T_UNBIND_REQ messages. */
 314 void
 315 tcp_tpi_unbind(tcp_t *tcp, mblk_t *mp)
 316 {
 317         conn_t *connp = tcp->tcp_connp;
 318         int error;
 319
 320         error = tcp_do_unbind(connp);
 321         if (error > 0) {
 322                 tcp_err_ack(tcp, mp, TSYSERR, error);
 323         } else if (error < 0) {
 324                 tcp_err_ack(tcp, mp, -error, 0);
 325         } else {
 326                 /* Send M_FLUSH according to TPI */
 327                 (void) putnextctl1(connp->conn_rq, M_FLUSH, FLUSHRW);
 328
 329                 mp = mi_tpi_ok_ack_alloc(mp);
 330                 if (mp != NULL)
 331                         putnext(connp->conn_rq, mp);
 332         }
 333 }
 334
 335 /* ARGSUSED */
 336 int
 337 tcp_tpi_close(queue_t *q, int flags, cred_t *credp __unused)
 338 {
 339         conn_t          *connp;
 340
 341         ASSERT(WR(q)->q_next == NULL);
 342
 343         if (flags & SO_FALLBACK) {
 344                 /*
 345                  * stream is being closed while in fallback
 346                  * simply free the resources that were allocated
 347                  */
 348                 inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr));
 349                 qprocsoff(q);
 350                 goto done;
 351         }
 352
 353         connp = Q_TO_CONN(q);
 354         /*
 355          * We are being closed as /dev/tcp or /dev/tcp6.
 356          */
 357         tcp_close_common(connp, flags);
 358
 359         qprocsoff(q);
 360         inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
 361
 362         /*
 363          * Drop IP's reference on the conn. This is the last reference
 364          * on the connp if the state was less than established. If the
 365          * connection has gone into timewait state, then we will have
 366          * one ref for the TCP and one more ref (total of two) for the
 367          * classifier connected hash list (a timewait connections stays
 368          * in connected hash till closed).
 369          *
 370          * We can't assert the references because there might be other
 371          * transient reference places because of some walkers or queued
 372          * packets in squeue for the timewait state.
 373          */
 374         CONN_DEC_REF(connp);
 375 done:
 376         q->q_ptr = WR(q)->q_ptr = NULL;
 377         return (0);
 378 }
 379
 380 /* ARGSUSED */
 381 int
 382 tcp_tpi_close_accept(queue_t *q, int flags __unused, cred_t *credp __unused)
 383 {
 384         vmem_t  *minor_arena;
 385         dev_t   conn_dev;
 386         extern struct qinit tcp_acceptor_winit;
 387
 388         ASSERT(WR(q)->q_qinfo == &tcp_acceptor_winit);
 389
 390         /*
 391          * We had opened an acceptor STREAM for sockfs which is
 392          * now being closed due to some error.
 393          */
 394         qprocsoff(q);
 395
 396         minor_arena = (vmem_t *)WR(q)->q_ptr;
 397         conn_dev = (dev_t)RD(q)->q_ptr;
 398         ASSERT(minor_arena != NULL);
 399         ASSERT(conn_dev != 0);
 400         inet_minor_free(minor_arena, conn_dev);
 401         q->q_ptr = WR(q)->q_ptr = NULL;
 402         return (0);
 403 }
 404
 405 /*
 406  * Put a connection confirmation message upstream built from the
 407  * address/flowid information with the conn and iph. Report our success or
 408  * failure.
 409  */
 410 boolean_t
 411 tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, mblk_t *idmp,
 412     mblk_t **defermp, ip_recv_attr_t *ira)
 413 {
 414         sin_t   sin;
 415         sin6_t  sin6;
 416         mblk_t  *mp;
 417         char    *optp = NULL;
 418         int     optlen = 0;
 419         conn_t  *connp = tcp->tcp_connp;
 420
 421         if (defermp != NULL)
 422                 *defermp = NULL;
 423
 424         if (tcp->tcp_conn.tcp_opts_conn_req != NULL) {
 425                 /*
 426                  * Return in T_CONN_CON results of option negotiation through
 427                  * the T_CONN_REQ. Note: If there is an real end-to-end option
 428                  * negotiation, then what is received from remote end needs
 429                  * to be taken into account but there is no such thing (yet?)
 430                  * in our TCP/IP.
 431                  * Note: We do not use mi_offset_param() here as
 432                  * tcp_opts_conn_req contents do not directly come from
 433                  * an application and are either generated in kernel or
 434                  * from user input that was already verified.
 435                  */
 436                 mp = tcp->tcp_conn.tcp_opts_conn_req;
 437                 optp = (char *)(mp->b_rptr +
 438                     ((struct T_conn_req *)mp->b_rptr)->OPT_offset);
 439                 optlen = (int)
 440                     ((struct T_conn_req *)mp->b_rptr)->OPT_length;
 441         }
 442
 443         if (IPH_HDR_VERSION(iphdr) == IPV4_VERSION) {
 444
 445                 /* packet is IPv4 */
 446                 if (connp->conn_family == AF_INET) {
 447                         sin = sin_null;
 448                         sin.sin_addr.s_addr = connp->conn_faddr_v4;
 449                         sin.sin_port = connp->conn_fport;
 450                         sin.sin_family = AF_INET;
 451                         mp = mi_tpi_conn_con(NULL, (char *)&sin,
 452                             (int)sizeof (sin_t), optp, optlen);
 453                 } else {
 454                         sin6 = sin6_null;
 455                         sin6.sin6_addr = connp->conn_faddr_v6;
 456                         sin6.sin6_port = connp->conn_fport;
 457                         sin6.sin6_family = AF_INET6;
 458                         mp = mi_tpi_conn_con(NULL, (char *)&sin6,
 459                             (int)sizeof (sin6_t), optp, optlen);
 460
 461                 }
 462         } else {
 463                 ip6_t   *ip6h = (ip6_t *)iphdr;
 464
 465                 ASSERT(IPH_HDR_VERSION(iphdr) == IPV6_VERSION);
 466                 ASSERT(connp->conn_family == AF_INET6);
 467                 sin6 = sin6_null;
 468                 sin6.sin6_addr = connp->conn_faddr_v6;
 469                 sin6.sin6_port = connp->conn_fport;
 470                 sin6.sin6_family = AF_INET6;
 471                 sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
 472                 mp = mi_tpi_conn_con(NULL, (char *)&sin6,
 473                     (int)sizeof (sin6_t), optp, optlen);
 474         }
 475
 476         if (!mp)
 477                 return (B_FALSE);
 478
 479         mblk_copycred(mp, idmp);
 480
 481         if (defermp == NULL) {
 482                 conn_t *connp = tcp->tcp_connp;
 483                 if (IPCL_IS_NONSTR(connp)) {
 484                         (*connp->conn_upcalls->su_connected)
 485                             (connp->conn_upper_handle, tcp->tcp_connid,
 486                             ira->ira_cred, ira->ira_cpid);
 487                         freemsg(mp);
 488                 } else {
 489                         if (ira->ira_cred != NULL) {
 490                                 /* So that getpeerucred works for TPI sockfs */
 491                                 mblk_setcred(mp, ira->ira_cred, ira->ira_cpid);
 492                         }
 493                         putnext(connp->conn_rq, mp);
 494                 }
 495         } else {
 496                 *defermp = mp;
 497         }
 498
 499         if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
 500                 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
 501         return (B_TRUE);
 502 }
 503
 504 /*
 505  * Successful connect request processing begins when our client passes
 506  * a T_CONN_REQ message into tcp_wput(), which performs function calls into
 507  * IP and the passes a T_OK_ACK (or T_ERROR_ACK upstream).
 508  *
 509  * After various error checks are completed, tcp_tpi_connect() lays
 510  * the target address and port into the composite header template.
 511  * Then we ask IP for information, including a source address if we didn't
 512  * already have one. Finally we prepare to send the SYN packet, and then
 513  * send up the T_OK_ACK reply message.
 514  */
 515 void
 516 tcp_tpi_connect(tcp_t *tcp, mblk_t *mp)
 517 {
 518         sin_t           *sin;
 519         struct T_conn_req       *tcr;
 520         struct sockaddr *sa;
 521         socklen_t       len;
 522         int             error;
 523         cred_t          *cr;
 524         pid_t           cpid;
 525         conn_t          *connp = tcp->tcp_connp;
 526         queue_t         *q = connp->conn_wq;
 527
 528         /*
 529          * All Solaris components should pass a db_credp
 530          * for this TPI message, hence we ASSERT.
 531          * But in case there is some other M_PROTO that looks
 532          * like a TPI message sent by some other kernel
 533          * component, we check and return an error.
 534          */
 535         cr = msg_getcred(mp, &cpid);
 536         ASSERT(cr != NULL);
 537         if (cr == NULL) {
 538                 tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
 539                 return;
 540         }
 541
 542         tcr = (struct T_conn_req *)mp->b_rptr;
 543
 544         ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
 545         if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) {
 546                 tcp_err_ack(tcp, mp, TPROTO, 0);
 547                 return;
 548         }
 549
 550         /*
 551          * Pre-allocate the T_ordrel_ind mblk so that at close time, we
 552          * will always have that to send up.  Otherwise, we need to do
 553          * special handling in case the allocation fails at that time.
 554          * If the end point is TPI, the tcp_t can be reused and the
 555          * tcp_ordrel_mp may be allocated already.
 556          */
 557         if (tcp->tcp_ordrel_mp == NULL) {
 558                 if ((tcp->tcp_ordrel_mp = mi_tpi_ordrel_ind()) == NULL) {
 559                         tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
 560                         return;
 561                 }
 562         }
 563
 564         /*
 565          * Determine packet type based on type of address passed in
 566          * the request should contain an IPv4 or IPv6 address.
 567          * Make sure that address family matches the type of
 568          * family of the address passed down.
 569          */
 570         switch (tcr->DEST_length) {
 571         default:
 572                 tcp_err_ack(tcp, mp, TBADADDR, 0);
 573                 return;
 574
 575         case (sizeof (sin_t) - sizeof (sin->sin_zero)): {
 576                 /*
 577                  * XXX: The check for valid DEST_length was not there
 578                  * in earlier releases and some buggy
 579                  * TLI apps (e.g Sybase) got away with not feeding
 580                  * in sin_zero part of address.
 581                  * We allow that bug to keep those buggy apps humming.
 582                  * Test suites require the check on DEST_length.
 583                  * We construct a new mblk with valid DEST_length
 584                  * free the original so the rest of the code does
 585                  * not have to keep track of this special shorter
 586                  * length address case.
 587                  */
 588                 mblk_t *nmp;
 589                 struct T_conn_req *ntcr;
 590                 sin_t *nsin;
 591
 592                 nmp = allocb(sizeof (struct T_conn_req) + sizeof (sin_t) +
 593                     tcr->OPT_length, BPRI_HI);
 594                 if (nmp == NULL) {
 595                         tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
 596                         return;
 597                 }
 598                 ntcr = (struct T_conn_req *)nmp->b_rptr;
 599                 bzero(ntcr, sizeof (struct T_conn_req)); /* zero fill */
 600                 ntcr->PRIM_type = T_CONN_REQ;
 601                 ntcr->DEST_length = sizeof (sin_t);
 602                 ntcr->DEST_offset = sizeof (struct T_conn_req);
 603
 604                 nsin = (sin_t *)((uchar_t *)ntcr + ntcr->DEST_offset);
 605                 *nsin = sin_null;
 606                 /* Get pointer to shorter address to copy from original mp */
 607                 sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset,
 608                     tcr->DEST_length); /* extract DEST_length worth of sin_t */
 609                 if (sin == NULL || !OK_32PTR((char *)sin)) {
 610                         freemsg(nmp);
 611                         tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
 612                         return;
 613                 }
 614                 nsin->sin_family = sin->sin_family;
 615                 nsin->sin_port = sin->sin_port;
 616                 nsin->sin_addr = sin->sin_addr;
 617                 /* Note:nsin->sin_zero zero-fill with sin_null assign above */
 618                 nmp->b_wptr = (uchar_t *)&nsin[1];
 619                 if (tcr->OPT_length != 0) {
 620                         ntcr->OPT_length = tcr->OPT_length;
 621                         ntcr->OPT_offset = nmp->b_wptr - nmp->b_rptr;
 622                         bcopy((uchar_t *)tcr + tcr->OPT_offset,
 623                             (uchar_t *)ntcr + ntcr->OPT_offset,
 624                             tcr->OPT_length);
 625                         nmp->b_wptr += tcr->OPT_length;
 626                 }
 627                 freemsg(mp);    /* original mp freed */
 628                 mp = nmp;       /* re-initialize original variables */
 629                 tcr = ntcr;
 630         }
 631         /* FALLTHRU */
 632
 633         case sizeof (sin_t):
 634                 sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
 635                     sizeof (sin_t));
 636                 len = sizeof (sin_t);
 637                 break;
 638
 639         case sizeof (sin6_t):
 640                 sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
 641                     sizeof (sin6_t));
 642                 len = sizeof (sin6_t);
 643                 break;
 644         }
 645
 646         error = proto_verify_ip_addr(connp->conn_family, sa, len);
 647         if (error != 0) {
 648                 tcp_err_ack(tcp, mp, TSYSERR, error);
 649                 return;
 650         }
 651
 652         /*
 653          * TODO: If someone in TCPS_TIME_WAIT has this dst/port we
 654          * should key on their sequence number and cut them loose.
 655          */
 656
 657         /*
 658          * If options passed in, feed it for verification and handling
 659          */
 660         if (tcr->OPT_length != 0) {
 661                 mblk_t  *ok_mp;
 662                 mblk_t  *discon_mp;
 663                 mblk_t  *conn_opts_mp;
 664                 int t_error, sys_error, do_disconnect;
 665
 666                 conn_opts_mp = NULL;
 667
 668                 if (tcp_conprim_opt_process(tcp, mp,
 669                     &do_disconnect, &t_error, &sys_error) < 0) {
 670                         if (do_disconnect) {
 671                                 ASSERT(t_error == 0 && sys_error == 0);
 672                                 discon_mp = mi_tpi_discon_ind(NULL,
 673                                     ECONNREFUSED, 0);
 674                                 if (!discon_mp) {
 675                                         tcp_err_ack_prim(tcp, mp, T_CONN_REQ,
 676                                             TSYSERR, ENOMEM);
 677                                         return;
 678                                 }
 679                                 ok_mp = mi_tpi_ok_ack_alloc(mp);
 680                                 if (!ok_mp) {
 681                                         tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
 682                                             TSYSERR, ENOMEM);
 683                                         return;
 684                                 }
 685                                 qreply(q, ok_mp);
 686                                 qreply(q, discon_mp); /* no flush! */
 687                         } else {
 688                                 ASSERT(t_error != 0);
 689                                 tcp_err_ack_prim(tcp, mp, T_CONN_REQ, t_error,
 690                                     sys_error);
 691                         }
 692                         return;
 693                 }
 694                 /*
 695                  * Success in setting options, the mp option buffer represented
 696                  * by OPT_length/offset has been potentially modified and
 697                  * contains results of option processing. We copy it in
 698                  * another mp to save it for potentially influencing returning
 699                  * it in T_CONN_CONN.
 700                  */
 701                 if (tcr->OPT_length != 0) { /* there are resulting options */
 702                         conn_opts_mp = copyb(mp);
 703                         if (!conn_opts_mp) {
 704                                 tcp_err_ack_prim(tcp, mp, T_CONN_REQ,
 705                                     TSYSERR, ENOMEM);
 706                                 return;
 707                         }
 708                         ASSERT(tcp->tcp_conn.tcp_opts_conn_req == NULL);
 709                         tcp->tcp_conn.tcp_opts_conn_req = conn_opts_mp;
 710                         /*
 711                          * Note:
 712                          * These resulting option negotiation can include any
 713                          * end-to-end negotiation options but there no such
 714                          * thing (yet?) in our TCP/IP.
 715                          */
 716                 }
 717         }
 718
 719         /* call the non-TPI version */
 720         error = tcp_do_connect(tcp->tcp_connp, sa, len, cr, cpid);
 721         if (error < 0) {
 722                 mp = mi_tpi_err_ack_alloc(mp, -error, 0);
 723         } else if (error > 0) {
 724                 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, error);
 725         } else {
 726                 mp = mi_tpi_ok_ack_alloc(mp);
 727         }
 728
 729         /*
 730          * Note: Code below is the "failure" case
 731          */
 732         /* return error ack and blow away saved option results if any */
 733 connect_failed:
 734         if (mp != NULL)
 735                 putnext(connp->conn_rq, mp);
 736         else {
 737                 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
 738                     TSYSERR, ENOMEM);
 739         }
 740 }
 741
 742 /*
 743  * Map tcp->tcp_state to the TPI/TLI state reported to the transport user.
 744  * A TCP state not listed below is logged and reported as TS_UNBND.
 745  */
 746 static int
 747 tcp_tpistate(tcp_t *tcp)
 748 {
 749         int     tpi_state = TS_UNBND;
 750
 751         switch (tcp->tcp_state) {
 752         case TCPS_IDLE:
 753                 break;
 754         case TCPS_BOUND:
 755                 tpi_state = TS_IDLE;
 756                 break;
 757         case TCPS_LISTEN:
 758                 /* Only T_CONN_INDs awaiting T_CONN_RES count; skip q0. */
 759                 tpi_state = (tcp->tcp_conn_req_cnt_q > 0) ?
 760                     TS_WRES_CIND : TS_IDLE;
 761                 break;
 762         case TCPS_SYN_SENT:
 763                 tpi_state = TS_WCON_CREQ;
 764                 break;
 765         case TCPS_SYN_RCVD:
 766                 /*
 767                  * Assumed to be an active-open SYN_RCVD; a passive eager
 768                  * is detached here and cannot be asked for a T_info_ack.
 769                  */
 770                 tpi_state = TS_WACK_CRES;
 771                 break;
 772         case TCPS_ESTABLISHED:
 773                 tpi_state = TS_DATA_XFER;
 774                 break;
 775         case TCPS_CLOSE_WAIT:
 776                 tpi_state = TS_WREQ_ORDREL;
 777                 break;
 778         case TCPS_FIN_WAIT_1:
 779         case TCPS_FIN_WAIT_2:
 780                 tpi_state = TS_WIND_ORDREL;
 781                 break;
 782         case TCPS_CLOSING:
 783         case TCPS_LAST_ACK:
 784         case TCPS_TIME_WAIT:
 785         case TCPS_CLOSED:
 786                 /*
 787                  * No TPI state matches these.  TS_WACK_DREQ7 stands for
 788                  * "not yet TS_IDLE" and maps to TLI/XTI TSTATECHNG.
 789                  */
 790                 tpi_state = TS_WACK_DREQ7;
 791                 break;
 792         default:
 793                 cmn_err(CE_WARN, "tcp_tpistate: strange state (%d) %s",
 794                     tcp->tcp_state, tcp_display(tcp, NULL, DISP_PORT_ONLY));
 795                 break;
 796         }
 797         return (tpi_state);
 798 }
 799
 800 /*
 801  * Fill in *tia from the per-family T_info_ack template, then patch in the
 802  * current TPI state, OPT_size (tcp_max_optsize) and the TIDU size (the
 803  * MSS, or the stack's per-IP-version default while tcp_mss is still 0).
 804  */
 805 static void
 806 tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp)
 807 {
 808         extern struct T_info_ack tcp_g_t_info_ack;
 809         extern struct T_info_ack tcp_g_t_info_ack_v6;
 810         conn_t          *connp = tcp->tcp_connp;
 811         tcp_stack_t     *tcps = tcp->tcp_tcps;
 812
 813         *tia = (connp->conn_family == AF_INET6) ?
 814             tcp_g_t_info_ack_v6 : tcp_g_t_info_ack;
 815         tia->CURRENT_state = tcp_tpistate(tcp);
 816         tia->OPT_size = tcp_max_optsize;
 817         if (tcp->tcp_mss != 0)
 818                 tia->TIDU_size = tcp->tcp_mss;
 819         else if (connp->conn_ipversion == IPV4_VERSION)
 820                 tia->TIDU_size = tcps->tcps_mss_def_ipv4;
 821         else
 822                 tia->TIDU_size = tcps->tcps_mss_def_ipv6;
 823         /* TODO: Default ETSDU is 1.  Is that correct for tcp? */
 824 }
 825
 826 /* Build the T_capability_ack body for the TC1_* bits in cap_bits1. */
 827 void
 828 tcp_do_capability_ack(tcp_t *tcp, struct T_capability_ack *tcap,
 829     t_uscalar_t cap_bits1)
 830 {
 831         t_uscalar_t     granted = 0;
 832
 833         if ((cap_bits1 & TC1_INFO) != 0) {
 834                 tcp_copy_info(&tcap->INFO_ack, tcp);
 835                 granted |= TC1_INFO;
 836         }
 837         if ((cap_bits1 & TC1_ACCEPTOR_ID) != 0) {
 838                 tcap->ACCEPTOR_id = tcp->tcp_acceptor_id;
 839                 granted |= TC1_ACCEPTOR_ID;
 840         }
 841         tcap->CAP_bits1 = granted;      /* only what was filled in */
 842 }
 843
 844 /*
 845  * Answer a T_CAPABILITY_REQ.  A request shorter than the fixed header is
 846  * freed without a reply.  Otherwise mp is handed to tpi_ack_alloc(), the
 847  * result is filled in by tcp_do_capability_ack() and sent upstream.
 848  */
 849 void
 850 tcp_capability_req(tcp_t *tcp, mblk_t *mp)
 851 {
 852         struct T_capability_req *req;
 853         t_uscalar_t             wanted;
 854         mblk_t                  *ackmp;
 855
 856         if (MBLKL(mp) < sizeof (*req)) {
 857                 freemsg(mp);
 858                 return;
 859         }
 860         /* Read the requested bits before mp is handed to tpi_ack_alloc(). */
 861         req = (struct T_capability_req *)mp->b_rptr;
 862         wanted = req->CAP_bits1;
 863
 864         ackmp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
 865             mp->b_datap->db_type, T_CAPABILITY_ACK);
 866         if (ackmp == NULL)
 867                 return;         /* no ack could be built; nothing to send */
 868
 869         tcp_do_capability_ack(tcp,
 870             (struct T_capability_ack *)ackmp->b_rptr, wanted);
 871         putnext(tcp->tcp_connp->conn_rq, ackmp);
 872 }
 873
 874 /*
 875  * Reply to a T_INFO_REQ with a T_INFO_ACK filled in by tcp_copy_info().
 876  */
 877 void
 878 tcp_info_req(tcp_t *tcp, mblk_t *mp)
 879 {
 880         mblk_t  *ackmp = tpi_ack_alloc(mp, sizeof (struct T_info_ack),
 881             M_PCPROTO, T_INFO_ACK);
 882
 883         if (ackmp == NULL) {
 884                 /* NOTE(review): tcp_err_ack() is handed a NULL mblk here. */
 885                 tcp_err_ack(tcp, NULL, TSYSERR, ENOMEM);
 886                 return;
 887         }
 888         tcp_copy_info((struct T_info_ack *)ackmp->b_rptr, tcp);
 889         putnext(tcp->tcp_connp->conn_rq, ackmp);
 890 }
 891
 892 /*
 893  * Respond to a TPI T_ADDR_REQ with an M_PCPROTO T_ADDR_ACK.  The ack
 894  * carries the local address once the endpoint is at least TCPS_BOUND and
 895  * also the remote address once it is at least TCPS_SYN_RCVD; fields for
 896  * an address that is not reported stay zero.  If the ack buffer cannot
 897  * be obtained, mp is passed to tcp_err_ack() with TSYSERR/ENOMEM.
 898  */
 899 void
 900 tcp_addr_req(tcp_t *tcp, mblk_t *mp)
 901 {
 902         conn_t                  *connp = tcp->tcp_connp;
 903         struct T_addr_ack       *ack;
 904         struct sockaddr         *addr;
 905         mblk_t                  *ackmp;
 906         uint_t                  addrlen;
 907
 908         /* Room for the header plus two worst-case (sin6_t) addresses. */
 909         ackmp = reallocb(mp, sizeof (struct T_addr_ack) +
 910             2 * sizeof (sin6_t), 1);
 911         if (ackmp == NULL) {
 912                 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
 913                 return;
 914         }
 915         ackmp->b_datap->db_type = M_PCPROTO;
 916
 917         /* Start from an all-zero header; b_wptr sits just past it. */
 918         ack = (struct T_addr_ack *)ackmp->b_rptr;
 919         bzero(ack, sizeof (*ack));
 920         ack->PRIM_type = T_ADDR_ACK;
 921         ackmp->b_wptr = ackmp->b_rptr + sizeof (*ack);
 922
 923         addrlen = (connp->conn_family == AF_INET) ?
 924             sizeof (sin_t) : sizeof (sin6_t);
 925
 926         /*
 927          * Note: the address placement below assumes 32 bit alignment of
 928          * basic data structures like sin_t and struct T_addr_ack.
 929          */
 930         if (tcp->tcp_state >= TCPS_BOUND) {
 931                 /* Local address goes right behind the header. */
 932                 ack->LOCADDR_offset = sizeof (*ack);
 933                 ack->LOCADDR_length = addrlen;
 934                 addr = (struct sockaddr *)(ack + 1);
 935                 (void) conn_getsockname(connp, addr, &addrlen);
 936                 ackmp->b_wptr += addrlen;
 937         }
 938         if (tcp->tcp_state >= TCPS_SYN_RCVD) {
 939                 /* Remote address follows the local one. */
 940                 ack->REMADDR_length = addrlen;
 941                 ack->REMADDR_offset = ack->LOCADDR_offset +
 942                     ack->LOCADDR_length;
 943                 addr = (struct sockaddr *)(ackmp->b_rptr +
 944                     ack->REMADDR_offset);
 945                 (void) conn_getpeername(connp, addr, &addrlen);
 946                 ackmp->b_wptr += addrlen;
 947         }
 948         ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim);
 949         putnext(connp->conn_rq, ackmp);
 950 }
 951
 952 /*
 953  * Swap information between the eager and acceptor for a TLI/XTI client.
 954  * The sockfs accept is done on the acceptor stream and control goes
 955  * through tcp_tli_accept() and tcp_accept()/tcp_accept_swap() is not
 956  * called. In either case, both the eager and listener are in their own
 957  * perimeter (squeue) and the code has to deal with potential race.
 958  *
      * In order, the code below:
      *  - marks the acceptor detached and copies its acceptor id to the eager;
      *  - unlinks the eager from the listener under tcp_eager_lock;
      *  - points the eager's conn at the acceptor's conn_rq/conn_wq and both
      *    queues' q_ptr at the eager's conn, then clears the eager's
      *    tcp_detached behind a membar_producer();
      *  - copies the acceptor's conn_dev, minor arena, cpid, zone and MAC
      *    mode settings to the eager's conn and takes over its cred;
      *  - takes a reference on the eager's conn and drops one on the
      *    acceptor's conn.
      *
 959  * See the block comment on top of tcp_accept() and tcp_tli_accept().
 960  */
 961 static void
 962 tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager)
 963 {
 964         conn_t  *econnp, *aconnp;
 965
             /*
              * Preconditions asserted: the eager still uses the listener's
              * read queue and is detached, the acceptor is not detached, and
              * TCP_IS_SOCKET() is false for all three endpoints.
              */
 966         ASSERT(eager->tcp_connp->conn_rq == listener->tcp_connp->conn_rq);
 967         ASSERT(eager->tcp_detached && !acceptor->tcp_detached);
 968         ASSERT(!TCP_IS_SOCKET(acceptor));
 969         ASSERT(!TCP_IS_SOCKET(eager));
 970         ASSERT(!TCP_IS_SOCKET(listener));
 971
 972         /*
 973          * Trusted Extensions may need to use a security label that is
 974          * different from the acceptor's label on MLP and MAC-Exempt
 975          * sockets. If this is the case, the required security label
 976          * already exists in econnp->conn_ixa->ixa_tsl. Since we make the
 977          * acceptor stream refer to econnp we automatically get that label.
 978          */
 979
 980         acceptor->tcp_detached = B_TRUE;
 981         /*
 982          * To permit stream re-use by TLI/XTI, the eager needs a copy of
 983          * the acceptor id.
 984          */
 985         eager->tcp_acceptor_id = acceptor->tcp_acceptor_id;
 986
 987         /* remove eager from listen list... */
 988         mutex_enter(&listener->tcp_eager_lock);
 989         tcp_eager_unlink(eager);
 990         ASSERT(eager->tcp_eager_next_q == NULL &&
 991             eager->tcp_eager_last_q == NULL);
 992         ASSERT(eager->tcp_eager_next_q0 == NULL &&
 993             eager->tcp_eager_prev_q0 == NULL);
 994         mutex_exit(&listener->tcp_eager_lock);
 995
             /*
              * Give the eager's conn the acceptor's queue pair, and make both
              * queues' q_ptr refer to the eager's conn.
              */
 996         econnp = eager->tcp_connp;
 997         aconnp = acceptor->tcp_connp;
 998         econnp->conn_rq = aconnp->conn_rq;
 999         econnp->conn_wq = aconnp->conn_wq;
1000         econnp->conn_rq->q_ptr = econnp;
1001         econnp->conn_wq->q_ptr = econnp;
1002
1003         /*
1004          * In the TLI/XTI loopback case, we are inside the listener's squeue,
1005          * which might be a different squeue from our peer TCP instance.
1006          * For TCP Fusion, the peer expects that whenever tcp_detached is
1007          * clear, our TCP queues point to the acceptor's queues.  Thus, use
1008          * membar_producer() to ensure that the assignments of conn_rq/conn_wq
1009          * above reach global visibility prior to the clearing of tcp_detached.
1010          */
1011         membar_producer();
1012         eager->tcp_detached = B_FALSE;
1013
1014         ASSERT(eager->tcp_ack_tid == 0);
1015
             /* The eager's conn also gets the acceptor's device and arena. */
1016         econnp->conn_dev = aconnp->conn_dev;
1017         econnp->conn_minor_arena = aconnp->conn_minor_arena;
1018
1019         ASSERT(econnp->conn_minor_arena != NULL);
             /*
              * Cred hand-over: any cred the eager's conn held is freed, the
              * acceptor's cred pointer is copied (also into conn_ixa->ixa_cred)
              * and then cleared in the acceptor's conn; no new hold is taken.
              * NOTE(review): the IXA_FREE_CRED assert presumably checks that the
              * ixa does not own a cred reference of its own - confirm.
              */
1020         if (econnp->conn_cred != NULL)
1021                 crfree(econnp->conn_cred);
1022         econnp->conn_cred = aconnp->conn_cred;
1023         ASSERT(!(econnp->conn_ixa->ixa_free_flags & IXA_FREE_CRED));
1024         econnp->conn_ixa->ixa_cred = econnp->conn_cred;
1025         aconnp->conn_cred = NULL;
1026         econnp->conn_cpid = aconnp->conn_cpid;
1027         ASSERT(econnp->conn_netstack == aconnp->conn_netstack);
1028         ASSERT(eager->tcp_tcps == acceptor->tcp_tcps);
1029
             /* Zone identity follows the acceptor, including the ixa copy. */
1030         econnp->conn_zoneid = aconnp->conn_zoneid;
1031         econnp->conn_allzones = aconnp->conn_allzones;
1032         econnp->conn_ixa->ixa_zoneid = aconnp->conn_ixa->ixa_zoneid;
1033
             /* Copy the MAC mode, then reset the acceptor's to the default. */
1034         econnp->conn_mac_mode = aconnp->conn_mac_mode;
1035         econnp->conn_zone_is_global = aconnp->conn_zone_is_global;
1036         aconnp->conn_mac_mode = CONN_MAC_DEFAULT;
1037
1038         /* Do the IPC initialization */
1039         CONN_INC_REF(econnp);
1040
1041         /* Done with old IPC. Drop its ref on its connp */
1042         CONN_DEC_REF(aconnp);
1043 }
1044
1045 /*
1046  * This runs at the tail end of accept processing on the squeue of the
1047  * new connection.  arg is the new connection's conn_t; mp is a single
1048  * mblk that is always consumed here:
1049  *  - if the eager was torn down before the accept could finish, mp is
1050  *    sent up as a T_DISCON_IND when one is due and freed otherwise;
1051  *  - otherwise mp is sent up as M_SETOPTS, followed by any data and
1052  *    orderly release that were held back while the eager was detached.
1053  */
1054 /* ARGSUSED */
1055 static void
1056 tcp_accept_finish(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
1057 {
1058         conn_t                  *connp = (conn_t *)arg;
1059         tcp_t                   *tcp = connp->conn_tcp;
1060         queue_t                 *q = connp->conn_rq;
1061         tcp_stack_t             *tcps = tcp->tcp_tcps;
1062         struct stroptions       *stropt;
1063         struct sock_proto_props sopp;
1064
1065         /* Should never be called for non-STREAMS sockets */
1066         ASSERT(!IPCL_IS_NONSTR(connp));
1067
1068         /* We should just receive a single mblk that fits a T_discon_ind */
1069         ASSERT(mp->b_cont == NULL);
1070
1071         /*
1072          * Drop the eager's ref on the listener, that was placed when
1073          * this eager began life in tcp_input_listener.
1074          */
1075         CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp);
1076
1077         tcp->tcp_detached = B_FALSE;
1078
1079         if (tcp->tcp_state <= TCPS_BOUND || tcp->tcp_accept_error) {
1080                 /*
1081                  * Someone blew off the eager before we could finish
1082                  * the accept.
1083                  *
1084                  * The only reason the eager exists is because we put in
1085                  * a ref on it when conn ind went up. We need to send
1086                  * a disconnect indication up while the last reference
1087                  * on the eager will be dropped by the squeue when we
1088                  * return.
1089                  */
1090                 ASSERT(tcp->tcp_listener == NULL);
1091                 if (tcp->tcp_issocket || tcp->tcp_send_discon_ind) {
1092                         struct  T_discon_ind    *tdi;
1093
1094                         (void) putnextctl1(q, M_FLUSH, FLUSHRW);
1095                         /*
1096                          * Let us reuse the incoming mblk to avoid
1097                          * memory allocation failure problems. We know
1098                          * that the size of the incoming mblk i.e.
1099                          * stroptions is greater than sizeof
1100                          * T_discon_ind.
1101                          */
1102                         ASSERT(DB_REF(mp) == 1);
1103                         ASSERT(MBLKSIZE(mp) >= sizeof (struct T_discon_ind));
1104
1105                         DB_TYPE(mp) = M_PROTO;
1106                         ((union T_primitives *)mp->b_rptr)->type =
1107                             T_DISCON_IND;
1108                         tdi = (struct T_discon_ind *)mp->b_rptr;
1109                         if (tcp->tcp_issocket) {
1110                                 tdi->DISCON_reason = ECONNREFUSED;
1111                                 tdi->SEQ_number = 0;
1112                         } else {
1113                                 tdi->DISCON_reason = ENOPROTOOPT;
1114                                 tdi->SEQ_number = tcp->tcp_conn_req_seqnum;
1115                         }
1116                         mp->b_wptr = mp->b_rptr + sizeof (struct T_discon_ind);
1117                         putnext(q, mp);
1118                 } else {
1119                         /* No T_DISCON_IND is due, so mp has no taker. */
1120                         freemsg(mp);
1121                 }
1122                 tcp->tcp_hard_binding = B_FALSE;
1123                 return;
1124         }
1125
1126         /*
1127          * This is the first time we run on the correct queue after
1128          * tcp_accept, so fix all the q parameters here.  The incoming
1129          * mblk, known to be at least sizeof (struct stroptions), is
1130          * reused to avoid memory allocation failure problems.
1131          */
1132         tcp_get_proto_props(tcp, &sopp);
1133
1134         ASSERT(DB_REF(mp) == 1);
1135         ASSERT(MBLKSIZE(mp) >= sizeof (struct stroptions));
1136
1137         DB_TYPE(mp) = M_SETOPTS;
1138         stropt = (struct stroptions *)mp->b_rptr;
1139         mp->b_wptr = mp->b_rptr + sizeof (struct stroptions);
1140         ASSERT(sopp.sopp_flags & (SO_HIWAT|SO_WROFF|SO_MAXBLK));
1141         stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK;
1142         stropt->so_hiwat = sopp.sopp_rxhiwat;
1143         stropt->so_wroff = sopp.sopp_wroff;
1144         stropt->so_maxblk = sopp.sopp_maxblk;
1145
1146         /* Send the options up */
1147         putnext(q, mp);
1148
1149         /*
1150          * Pass up any data and/or a fin that has been received.
1151          *
1152          * Adjust receive window in case it had decreased
1153          * (because there is data <=> tcp_rcv_list != NULL)
1154          * while the connection was detached. Note that
1155          * in case the eager was flow-controlled, w/o this
1156          * code, the rwnd may never open up again!
1157          */
1158         if (tcp->tcp_rcv_list != NULL) {
1159                 /* We drain directly in case of fused tcp loopback */
1160
1161                 if (!tcp->tcp_fused && canputnext(q)) {
1162                         tcp->tcp_rwnd = connp->conn_rcvbuf;
1163                         if (tcp->tcp_state >= TCPS_ESTABLISHED &&
1164                             tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) {
1165                                 tcp_xmit_ctl(NULL,
1166                                     tcp, (tcp->tcp_swnd == 0) ?
1167                                     tcp->tcp_suna : tcp->tcp_snxt,
1168                                     tcp->tcp_rnxt, TH_ACK);
1169                         }
1170                 }
1171
1172                 (void) tcp_rcv_drain(tcp);
1173
1174                 /*
1175                  * For fused tcp loopback, back-enable peer endpoint
1176                  * if it's currently flow-controlled.
1177                  */
1178                 if (tcp->tcp_fused) {
1179                         tcp_t *peer_tcp = tcp->tcp_loopback_peer;
1180
1181                         ASSERT(peer_tcp != NULL);
1182                         ASSERT(peer_tcp->tcp_fused);
1183
1184                         mutex_enter(&peer_tcp->tcp_non_sq_lock);
1185                         if (peer_tcp->tcp_flow_stopped) {
1186                                 tcp_clrqfull(peer_tcp);
1187                                 TCP_STAT(tcps, tcp_fusion_backenabled);
1188                         }
1189                         mutex_exit(&peer_tcp->tcp_non_sq_lock);
1190                 }
1191         }
1192         ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg);
1193         if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) {
1194                 tcp->tcp_ordrel_done = B_TRUE;
1195                 mp = tcp->tcp_ordrel_mp;
1196                 tcp->tcp_ordrel_mp = NULL;
1197                 putnext(q, mp);
1198         }
1199         tcp->tcp_hard_binding = B_FALSE;
1200
1201         if (connp->conn_keepalive) {
1202                 tcp->tcp_ka_last_intrvl = 0;
1203                 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer,
1204                     tcp->tcp_ka_interval);
1205         }
1206
1207         /*
1208          * At this point, eager is fully established and will
1209          * have the following references -
1210          *
1211          * 2 references for connection to exist (1 for TCP and 1 for IP).
1212          * 1 reference for the squeue which will be dropped by the squeue as
1213          *      soon as this function returns.
1214          * There will be 1 additional reference for being in classifier
1215          *      hash list provided something bad hasn't happened.
1216          */
1217         ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
1218             (connp->conn_fanout == NULL && connp->conn_ref >= 3));
1219 }
1220
1221 /*
1222  * Dequeue one deferred connection indication from the listener.  The
1223  * caller must have verified, under tcp_eager_lock, that a deferred conn
1224  * ind exists.  The eager at the head of the deferred run on q0 is moved
1225  * to the tail of q, the q0/q counts are adjusted, and the conn ind mblk
1226  * that was stored on that eager is returned.
1227  */
1228 static mblk_t *
1229 tcp_get_def_conn_ind(tcp_t *listener)
1230 {
1231         tcp_t   *eager;
1232         tcp_t   *before;
1233         tcp_t   *after;
1234         mblk_t  *ind_mp;
1235
1236         ASSERT(MUTEX_HELD(&listener->tcp_eager_lock));
1237
1238         /*
1239          * listener->tcp_eager_prev_q0 is the TAIL of the deferred
1240          * T_conn_ind run.  Walk back to its head so that T_conn_inds go
1241          * up in the order in which the 3WHS completed.
1242          */
1243         eager = listener->tcp_eager_prev_q0;
1244         ASSERT(eager->tcp_conn_def_q0);
1245         while (eager != listener &&
1246             eager->tcp_eager_prev_q0->tcp_conn_def_q0) {
1247                 eager = eager->tcp_eager_prev_q0;
1248         }
1249
1250         /* Take the stored conn ind away from the eager. */
1251         ind_mp = eager->tcp_conn.tcp_eager_conn_ind;
1252         eager->tcp_conn.tcp_eager_conn_ind = NULL;
1253
1254         /* Move from q0 to q: fix the counts, then unlink from q0. */
1255         ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
1256         listener->tcp_conn_req_cnt_q0--;
1257         listener->tcp_conn_req_cnt_q++;
1258
1259         before = eager->tcp_eager_prev_q0;
1260         after = eager->tcp_eager_next_q0;
1261         after->tcp_eager_prev_q0 = before;
1262         before->tcp_eager_next_q0 = after;
1263         eager->tcp_eager_prev_q0 = NULL;
1264         eager->tcp_eager_next_q0 = NULL;
1265         eager->tcp_conn_def_q0 = B_FALSE;
1266
1267         /* Make sure the tcp isn't in the list of droppables */
1268         ASSERT(eager->tcp_eager_next_drop_q0 == NULL &&
1269             eager->tcp_eager_prev_drop_q0 == NULL);
1270
1271         /*
1272          * Append to the tail of q: sockfs sends down T_CONN_RES in
1273          * chronological order, so keeping the older conn indications
1274          * at the front of the queue helps reduce search time.
1275          */
1276         if (listener->tcp_eager_last_q == NULL)
1277                 listener->tcp_eager_next_q = eager;
1278         else
1279                 listener->tcp_eager_last_q->tcp_eager_next_q = eager;
1280         listener->tcp_eager_last_q = eager;
1281         eager->tcp_eager_next_q = NULL;
1282
1283         return (ind_mp);
1284 }
1285
1286
1287 /*
1288  * Reply to a client's T_CONN_RES TPI message. This function
1289  * is used only for TLI/XTI listener. Sockfs sends T_CONN_RES
1290  * on the acceptor STREAM and it is processed in tcp_accept_common().
1291  * Read the block comment on top of tcp_input_listener().
1292  */
1293 void
1294 tcp_tli_accept(tcp_t *listener, mblk_t *mp)
1295 {
1296         tcp_t           *acceptor;
1297         tcp_t           *eager;
1298         struct T_conn_res       *tcr;
1299         t_uscalar_t     acceptor_id;
1300         t_scalar_t      seqnum;
1301         mblk_t          *discon_mp = NULL;
1302         mblk_t          *ok_mp;
1303         mblk_t          *mp1;
1304         tcp_stack_t     *tcps = listener->tcp_tcps;
1305         conn_t          *econnp;
1306
1307         if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) {
1308                 tcp_err_ack(listener, mp, TPROTO, 0);
1309                 return;
1310         }
1311         tcr = (struct T_conn_res *)mp->b_rptr;
1312
1313         /*
1314          * Under ILP32 the stream head points tcr->ACCEPTOR_id at the
1315          * read side queue of the streams device underneath us i.e. the
1316          * read side queue of 'ip'. Since we can't deference QUEUE_ptr we
1317          * look it up in the queue_hash.  Under LP64 it sends down the
1318          * minor_t of the accepting endpoint.
1319          *
1320          * Once the acceptor/eager are modified (in tcp_accept_swap) the
1321          * fanout hash lock is held.
1322          * This prevents any thread from entering the acceptor queue from
1323          * below (since it has not been hard bound yet i.e. any inbound
1324          * packets will arrive on the listener conn_t and
1325          * go through the classifier).
1326          * The CONN_INC_REF will prevent the acceptor from closing.
1327          *
1328          * XXX It is still possible for a tli application to send down data
1329          * on the accepting stream while another thread calls t_accept.
1330          * This should not be a problem for well-behaved applications since
1331          * the T_OK_ACK is sent after the queue swapping is completed.
1332          *
1333          * If the accepting fd is the same as the listening fd, avoid
1334          * queue hash lookup since that will return an eager listener in a
1335          * already established state.
1336          */
1337         acceptor_id = tcr->ACCEPTOR_id;
1338         mutex_enter(&listener->tcp_eager_lock);
1339         if (listener->tcp_acceptor_id == acceptor_id) {
1340                 eager = listener->tcp_eager_next_q;
1341                 /* only count how many T_CONN_INDs so don't count q0 */
1342                 if ((listener->tcp_conn_req_cnt_q != 1) ||
1343                     (eager->tcp_conn_req_seqnum != tcr->SEQ_number)) {
1344                         mutex_exit(&listener->tcp_eager_lock);
1345                         tcp_err_ack(listener, mp, TBADF, 0);
1346                         return;
1347                 }
1348                 if (listener->tcp_conn_req_cnt_q0 != 0) {
1349                         /* Throw away all the eagers on q0. */
1350                         tcp_eager_cleanup(listener, 1);
1351                 }
1352                 if (listener->tcp_syn_defense) {
1353                         listener->tcp_syn_defense = B_FALSE;
1354                         if (listener->tcp_ip_addr_cache != NULL) {
1355                                 kmem_free(listener->tcp_ip_addr_cache,
1356                                     IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
1357                                 listener->tcp_ip_addr_cache = NULL;
1358                         }
1359                 }
1360                 /*
1361                  * Transfer tcp_conn_req_max to the eager so that when
1362                  * a disconnect occurs we can revert the endpoint to the
1363                  * listen state.
1364                  */
1365                 eager->tcp_conn_req_max = listener->tcp_conn_req_max;
1366                 ASSERT(listener->tcp_conn_req_cnt_q0 == 0);
1367                 /*
1368                  * Get a reference on the acceptor just like the
1369                  * tcp_acceptor_hash_lookup below.
1370                  */
1371                 acceptor = listener;
1372                 CONN_INC_REF(acceptor->tcp_connp);
1373         } else {
1374                 acceptor = tcp_acceptor_hash_lookup(acceptor_id, tcps);
1375                 if (acceptor == NULL) {
1376                         if (listener->tcp_connp->conn_debug) {
1377                                 (void) strlog(TCP_MOD_ID, 0, 1,
1378                                     SL_ERROR|SL_TRACE,
1379                                     "tcp_accept: did not find acceptor 0x%x\n",
1380                                     acceptor_id);
1381                         }
1382                         mutex_exit(&listener->tcp_eager_lock);
1383                         tcp_err_ack(listener, mp, TPROVMISMATCH, 0);
1384                         return;
1385                 }
1386                 /*
1387                  * Verify acceptor state. The acceptable states for an acceptor
1388                  * include TCPS_IDLE and TCPS_BOUND.
1389                  */
1390                 switch (acceptor->tcp_state) {
1391                 case TCPS_IDLE:
1392                         /* FALLTHRU */
1393                 case TCPS_BOUND:
1394                         break;
1395                 default:
1396                         CONN_DEC_REF(acceptor->tcp_connp);
1397                         mutex_exit(&listener->tcp_eager_lock);
1398                         tcp_err_ack(listener, mp, TOUTSTATE, 0);
1399                         return;
1400                 }
1401         }
1402
1403         /* The listener must be in TCPS_LISTEN */
1404         if (listener->tcp_state != TCPS_LISTEN) {
1405                 CONN_DEC_REF(acceptor->tcp_connp);
1406                 mutex_exit(&listener->tcp_eager_lock);
1407                 tcp_err_ack(listener, mp, TOUTSTATE, 0);
1408                 return;
1409         }
1410
1411         /*
1412          * Rendezvous with an eager connection request packet hanging off
1413          * 'tcp' that has the 'seqnum' tag.  We tagged the detached open
1414          * tcp structure when the connection packet arrived in
1415          * tcp_input_listener().
1416          */
1417         seqnum = tcr->SEQ_number;
1418         eager = listener;
1419         do {
1420                 eager = eager->tcp_eager_next_q;
1421                 if (eager == NULL) {
1422                         CONN_DEC_REF(acceptor->tcp_connp);
1423                         mutex_exit(&listener->tcp_eager_lock);
1424                         tcp_err_ack(listener, mp, TBADSEQ, 0);
1425                         return;
1426                 }
1427         } while (eager->tcp_conn_req_seqnum != seqnum);
1428         mutex_exit(&listener->tcp_eager_lock);
1429
1430         /*
1431          * At this point, both acceptor and listener have 2 ref
1432          * that they begin with. Acceptor has one additional ref
1433          * we placed in lookup while listener has 3 additional
1434          * ref for being behind the squeue (tcp_accept() is
1435          * done on listener's squeue); being in classifier hash;
1436          * and eager's ref on listener.
1437          */
1438         ASSERT(listener->tcp_connp->conn_ref >= 5);
1439         ASSERT(acceptor->tcp_connp->conn_ref >= 3);
1440
1441         /*
1442          * The eager at this point is set in its own squeue and
1443          * could easily have been killed (tcp_accept_finish will
1444          * deal with that) because of a TH_RST so we can only
1445          * ASSERT for a single ref.
1446          */
1447         ASSERT(eager->tcp_connp->conn_ref >= 1);
1448
1449         /*
1450          * Pre allocate the discon_ind mblk also. tcp_accept_finish will
1451          * use it if something failed.
1452          */
1453         discon_mp = allocb(MAX(sizeof (struct T_discon_ind),
1454             sizeof (struct stroptions)), BPRI_HI);
1455         if (discon_mp == NULL) {
1456                 CONN_DEC_REF(acceptor->tcp_connp);
1457                 CONN_DEC_REF(eager->tcp_connp);
1458                 tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
1459                 return;
1460         }
1461
1462         econnp = eager->tcp_connp;
1463
1464         /* Hold a copy of mp, in case reallocb fails */
1465         if ((mp1 = copymsg(mp)) == NULL) {
1466                 CONN_DEC_REF(acceptor->tcp_connp);
1467                 CONN_DEC_REF(eager->tcp_connp);
1468                 freemsg(discon_mp);
1469                 tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
1470                 return;
1471         }
1472
1473         tcr = (struct T_conn_res *)mp1->b_rptr;
1474
1475         /*
1476          * This is an expanded version of mi_tpi_ok_ack_alloc()
1477          * which allocates a larger mblk and appends the new
1478          * local address to the ok_ack.  The address is copied by
1479          * soaccept() for getsockname().
1480          */
1481         {
1482                 int extra;
1483
1484                 extra = (econnp->conn_family == AF_INET) ?
1485                     sizeof (sin_t) : sizeof (sin6_t);
1486
1487                 /*
1488                  * Try to re-use mp, if possible.  Otherwise, allocate
1489                  * an mblk and return it as ok_mp.  In any case, mp
1490                  * is no longer usable upon return.
1491                  */
1492                 if ((ok_mp = mi_tpi_ok_ack_alloc_extra(mp, extra)) == NULL) {
1493                         CONN_DEC_REF(acceptor->tcp_connp);
1494                         CONN_DEC_REF(eager->tcp_connp);
1495                         freemsg(discon_mp);
1496                         /* Original mp has been freed by now, so use mp1 */
1497                         tcp_err_ack(listener, mp1, TSYSERR, ENOMEM);
1498                         return;
1499                 }
1500
1501                 mp = NULL;      /* We should never use mp after this point */
1502
1503                 switch (extra) {
1504                 case sizeof (sin_t): {
1505                         sin_t *sin = (sin_t *)ok_mp->b_wptr;
1506
1507                         ok_mp->b_wptr += extra;
1508                         sin->sin_family = AF_INET;
1509                         sin->sin_port = econnp->conn_lport;
1510                         sin->sin_addr.s_addr = econnp->conn_laddr_v4;
1511                         break;
1512                 }
1513                 case sizeof (sin6_t): {
1514                         sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr;
1515
1516                         ok_mp->b_wptr += extra;
1517                         sin6->sin6_family = AF_INET6;
1518                         sin6->sin6_port = econnp->conn_lport;
1519                         sin6->sin6_addr = econnp->conn_laddr_v6;
1520                         sin6->sin6_flowinfo = econnp->conn_flowinfo;
1521                         if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) &&
1522                             (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) {
1523                                 sin6->sin6_scope_id =
1524                                     econnp->conn_ixa->ixa_scopeid;
1525                         } else {
1526                                 sin6->sin6_scope_id = 0;
1527                         }
1528                         sin6->__sin6_src_id = 0;
1529                         break;
1530                 }
1531                 default:
1532                         break;
1533                 }
1534                 ASSERT(ok_mp->b_wptr <= ok_mp->b_datap->db_lim);
1535         }
1536
1537         /*
1538          * If there are no options we know that the T_CONN_RES will
1539          * succeed. However, we can't send the T_OK_ACK upstream until
1540          * the tcp_accept_swap is done since it would be dangerous to
1541          * let the application start using the new fd prior to the swap.
1542          */
1543         tcp_accept_swap(listener, acceptor, eager);
1544
1545         /*
1546          * tcp_accept_swap unlinks eager from listener but does not drop
1547          * the eager's reference on the listener.
1548          */
1549         ASSERT(eager->tcp_listener == NULL);
1550         ASSERT(listener->tcp_connp->conn_ref >= 5);
1551
1552         /*
1553          * The eager is now associated with its own queue. Insert in
1554          * the hash so that the connection can be reused for a future
1555          * T_CONN_RES.
1556          */
1557         tcp_acceptor_hash_insert(acceptor_id, eager);
1558
1559         /*
1560          * We now do the processing of options with T_CONN_RES.
1561          * We delay till now since we wanted to have queue to pass to
1562          * option processing routines that points back to the right
1563          * instance structure which does not happen until after
1564          * tcp_accept_swap().
1565          *
1566          * Note:
1567          * The sanity of the logic here assumes that whatever options
1568          * are appropriate to inherit from listener=>eager are done
1569          * before this point, and whatever were to be overridden (or not)
1570          * in transfer logic from eager=>acceptor in tcp_accept_swap().
1571          * [ Warning: acceptor endpoint can have T_OPTMGMT_REQ done to it
1572          *   before its ACCEPTOR_id comes down in T_CONN_RES ]
1573          * This may not be true at this point in time but can be fixed
1574          * independently. This option processing code starts with
1575          * the instantiated acceptor instance and the final queue at
1576          * this point.
1577          */
1578
1579         if (tcr->OPT_length != 0) {
1580                 /* Options to process */
1581                 int t_error = 0;
1582                 int sys_error = 0;
1583                 int do_disconnect = 0;
1584
1585                 if (tcp_conprim_opt_process(eager, mp1,
1586                     &do_disconnect, &t_error, &sys_error) < 0) {
1587                         eager->tcp_accept_error = 1;
1588                         if (do_disconnect) {
1589                                 /*
1590                                  * An option failed which does not allow
1591                                  * connection to be accepted.
1592                                  *
1593                                  * We allow T_CONN_RES to succeed and
1594                                  * put a T_DISCON_IND on the eager queue.
1595                                  */
1596                                 ASSERT(t_error == 0 && sys_error == 0);
1597                                 eager->tcp_send_discon_ind = 1;
1598                         } else {
1599                                 ASSERT(t_error != 0);
1600                                 freemsg(ok_mp);
1601                                 /*
1602                                  * Original mp was either freed or set
1603                                  * to ok_mp above, so use mp1 instead.
1604                                  */
1605                                 tcp_err_ack(listener, mp1, t_error, sys_error);
1606                                 goto finish;
1607                         }
1608                 }
1609                 /*
1610                  * Most likely success in setting options (except if
1611                  * eager->tcp_send_discon_ind set).
1612                  * mp1 option buffer represented by OPT_length/offset
1613                  * potentially modified and contains results of setting
1614                  * options at this point
1615                  */
1616         }
1617
1618         /* We no longer need mp1, since all options processing has passed */
1619         freemsg(mp1);
1620
1621         putnext(listener->tcp_connp->conn_rq, ok_mp);
1622
1623         mutex_enter(&listener->tcp_eager_lock);
1624         if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) {
1625                 mblk_t  *conn_ind;
1626
1627                 /*
1628                  * This path should not be executed if listener and
1629                  * acceptor streams are the same.
1630                  */
1631                 ASSERT(listener != acceptor);
1632                 conn_ind = tcp_get_def_conn_ind(listener);
1633                 mutex_exit(&listener->tcp_eager_lock);
1634                 putnext(listener->tcp_connp->conn_rq, conn_ind);
1635         } else {
1636                 mutex_exit(&listener->tcp_eager_lock);
1637         }
1638
1639         /*
1640          * Done with the acceptor - free it
1641          *
1642          * Note: from this point on, no access to listener should be made
1643          * as listener can be equal to acceptor.
1644          */
1645 finish:
1646         ASSERT(acceptor->tcp_detached);
1647         acceptor->tcp_connp->conn_rq = NULL;
1648         ASSERT(!IPCL_IS_NONSTR(acceptor->tcp_connp));
1649         acceptor->tcp_connp->conn_wq = NULL;
1650         (void) tcp_clean_death(acceptor, 0);
1651         CONN_DEC_REF(acceptor->tcp_connp);
1652
1653         /*
1654          * We pass discon_mp to tcp_accept_finish to get on the right squeue.
1655          *
1656          * It will update the setting for sockfs/stream head and also take
1657          * care of any data that arrived before accept() was called.
1658          * In case we already received a FIN then tcp_accept_finish will send up
1659          * the ordrel. It will also send up a window update if the window
1660          * has opened up.
1661          */
1662
1663         /*
1664          * XXX: we currently have a problem if XTI application closes the
1665          * acceptor stream in between. This problem exists in on10-gate also
1666          * and is well known but nothing can be done short of major rewrite
1667          * to fix it. Now it is possible to take care of it by assigning TLI/XTI
1668          * eager same squeue as listener (we can distinguish non socket
1669          * listeners at the time of handling a SYN in tcp_input_listener)
1670          * and do most of the work that tcp_accept_finish does here itself
1671          * and then get behind the acceptor squeue to access the acceptor
1672          * queue.
1673          */
1674         /*
1675          * We already have a ref on tcp so no need to do one before squeue_enter
1676          */
1677         SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, discon_mp,
1678             tcp_accept_finish, eager->tcp_connp, NULL, SQ_FILL,
1679             SQTAG_TCP_ACCEPT_FINISH);
1680 }
1681
1682
1683 /*
1684  * This is the STREAMS entry point for T_CONN_RES coming down on
1685  * Acceptor STREAM when  sockfs listener does accept processing.
1686  * Read the block comment on top of tcp_input_listener().
1687  */
1688 void
1689 tcp_tpi_accept(queue_t *q, mblk_t *mp)
1690 {
1691         queue_t *rq = RD(q);
1692         struct T_conn_res *conn_res;
1693         tcp_t *eager;
1694         tcp_t *listener;
1695         struct T_ok_ack *ok;
1696         t_scalar_t PRIM_type;
1697         mblk_t *discon_mp;
1698         conn_t *econnp;
1699         cred_t *cr;
1700
1701         ASSERT(DB_TYPE(mp) == M_PROTO);
1702
1703         /*
1704          * All Solaris components should pass a db_credp
1705          * for this TPI message, hence we ASSERT.
1706          * But in case there is some other M_PROTO that looks
1707          * like a TPI message sent by some other kernel
1708          * component, we check and return an error.
1709          */
1710         cr = msg_getcred(mp, NULL);
1711         ASSERT(cr != NULL);
1712         if (cr == NULL) {
1713                 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, EINVAL);
1714                 if (mp != NULL)
1715                         putnext(rq, mp);
1716                 return;
1717         }
1718         conn_res = (struct T_conn_res *)mp->b_rptr;
1719         ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
1720         if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_res)) {
1721                 mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0);
1722                 if (mp != NULL)
1723                         putnext(rq, mp);
1724                 return;
1725         }
1726         switch (conn_res->PRIM_type) {
1727         case O_T_CONN_RES:
1728         case T_CONN_RES:
1729                 /*
1730                  * We pass up an err ack if allocb fails. This will
1731                  * cause sockfs to issue a T_DISCON_REQ which will cause
1732                  * tcp_eager_blowoff to be called. sockfs will then call
1733                  * rq->q_qinfo->qi_qclose to cleanup the acceptor stream.
1734                  * we need to do the allocb up here because we have to
1735                  * make sure rq->q_qinfo->qi_qclose still points to the
1736                  * correct function (tcp_tpi_close_accept) in case allocb
1737                  * fails.
1738                  */
1739                 bcopy(mp->b_rptr + conn_res->OPT_offset,
1740                     &eager, conn_res->OPT_length);
1741                 PRIM_type = conn_res->PRIM_type;
1742                 mp->b_datap->db_type = M_PCPROTO;
1743                 mp->b_wptr = mp->b_rptr + sizeof (struct T_ok_ack);
1744                 ok = (struct T_ok_ack *)mp->b_rptr;
1745                 ok->PRIM_type = T_OK_ACK;
1746                 ok->CORRECT_prim = PRIM_type;
1747                 econnp = eager->tcp_connp;
1748                 econnp->conn_dev = (dev_t)RD(q)->q_ptr;
1749                 econnp->conn_minor_arena = (vmem_t *)(WR(q)->q_ptr);
1750                 econnp->conn_rq = rq;
1751                 econnp->conn_wq = q;
1752                 rq->q_ptr = econnp;
1753                 rq->q_qinfo = &tcp_rinitv4;     /* No open - same as rinitv6 */
1754                 q->q_ptr = econnp;
1755                 q->q_qinfo = &tcp_winit;
1756                 listener = eager->tcp_listener;
1757
1758                 /*
1759                  * Pre allocate the discon_ind mblk also. tcp_accept_finish will
1760                  * use it if something failed.
1761                  */
1762                 discon_mp = allocb(MAX(sizeof (struct T_discon_ind),
1763                     sizeof (struct stroptions)), BPRI_HI);
1764
1765                 if (discon_mp == NULL) {
1766                         mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0);
1767                         if (mp != NULL)
1768                                 putnext(rq, mp);
1769                         return;
1770                 }
1771
1772                 eager->tcp_issocket = B_TRUE;
1773
1774                 ASSERT(econnp->conn_netstack ==
1775                     listener->tcp_connp->conn_netstack);
1776                 ASSERT(eager->tcp_tcps == listener->tcp_tcps);
1777
1778                 /* Put the ref for IP */
1779                 CONN_INC_REF(econnp);
1780
1781                 /*
1782                  * We should have minimum of 3 references on the conn
1783                  * at this point. One each for TCP and IP and one for
1784                  * the T_conn_ind that was sent up when the 3-way handshake
1785                  * completed. In the normal case we would also have another
1786                  * reference (making a total of 4) for the conn being in the
1787                  * classifier hash list. However the eager could have received
1788                  * an RST subsequently and tcp_closei_local could have removed
1789                  * the eager from the classifier hash list, hence we can't
1790                  * assert that reference.
1791                  */
1792                 ASSERT(econnp->conn_ref >= 3);
1793
1794                 mutex_enter(&listener->tcp_eager_lock);
1795                 if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) {
1796                         mblk_t *conn_ind = tcp_get_def_conn_ind(listener);
1797
1798                         /* Need to get inside the listener perimeter */
1799                         CONN_INC_REF(listener->tcp_connp);
1800                         SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp,
1801                             conn_ind, tcp_send_pending, listener->tcp_connp,
1802                             NULL, SQ_FILL, SQTAG_TCP_SEND_PENDING);
1803                 }
1804                 tcp_eager_unlink(eager);
1805                 mutex_exit(&listener->tcp_eager_lock);
1806
1807                 /*
1808                  * At this point, the eager is detached from the listener
1809                  * but we still have an extra refs on eager (apart from the
1810                  * usual tcp references). The ref was placed in tcp_input_data
1811                  * before sending the conn_ind in tcp_send_conn_ind.
1812                  * The ref will be dropped in tcp_accept_finish().
1813                  */
1814                 SQUEUE_ENTER_ONE(econnp->conn_sqp, discon_mp, tcp_accept_finish,
1815                     econnp, NULL, SQ_NODRAIN, SQTAG_TCP_ACCEPT_FINISH_Q0);
1816
1817                 /*
1818                  * Send the new local address also up to sockfs. There
1819                  * should already be enough space in the mp that came
1820                  * down from soaccept().
1821                  */
1822                 if (econnp->conn_family == AF_INET) {
1823                         sin_t *sin;
1824
1825                         ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >=
1826                             (sizeof (struct T_ok_ack) + sizeof (sin_t)));
1827                         sin = (sin_t *)mp->b_wptr;
1828                         mp->b_wptr += sizeof (sin_t);
1829                         sin->sin_family = AF_INET;
1830                         sin->sin_port = econnp->conn_lport;
1831                         sin->sin_addr.s_addr = econnp->conn_laddr_v4;
1832                 } else {
1833                         sin6_t *sin6;
1834
1835                         ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >=
1836                             sizeof (struct T_ok_ack) + sizeof (sin6_t));
1837                         sin6 = (sin6_t *)mp->b_wptr;
1838                         mp->b_wptr += sizeof (sin6_t);
1839                         sin6->sin6_family = AF_INET6;
1840                         sin6->sin6_port = econnp->conn_lport;
1841                         sin6->sin6_addr = econnp->conn_laddr_v6;
1842                         if (econnp->conn_ipversion == IPV4_VERSION)
1843                                 sin6->sin6_flowinfo = 0;
1844                         else
1845                                 sin6->sin6_flowinfo = econnp->conn_flowinfo;
1846                         if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) &&
1847                             (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) {
1848                                 sin6->sin6_scope_id =
1849                                     econnp->conn_ixa->ixa_scopeid;
1850                         } else {
1851                                 sin6->sin6_scope_id = 0;
1852                         }
1853                         sin6->__sin6_src_id = 0;
1854                 }
1855
1856                 putnext(rq, mp);
1857                 return;
1858         default:
1859                 mp = mi_tpi_err_ack_alloc(mp, TNOTSUPPORT, 0);
1860                 if (mp != NULL)
1861                         putnext(rq, mp);
1862                 return;
1863         }
1864 }
1865
1866 /*
1867  * The function called through squeue to get behind listener's perimeter to
1868  * send a deferred conn_ind.
1869  */
1870 /* ARGSUSED */
1871 void
1872 tcp_send_pending(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
1873 {
1874         conn_t  *lconnp = (conn_t *)arg;
1875         tcp_t *listener = lconnp->conn_tcp;
1876         struct T_conn_ind *conn_ind;
1877         tcp_t *tcp;
1878
1879         conn_ind = (struct T_conn_ind *)mp->b_rptr;
1880         bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp,
1881             conn_ind->OPT_length);
1882
1883         if (listener->tcp_state != TCPS_LISTEN) {
1884                 /*
1885                  * If listener has closed, it would have caused a
1886                  * a cleanup/blowoff to happen for the eager, so
1887                  * we don't need to do anything more.
1888                  */
1889                 freemsg(mp);
1890                 return;
1891         }
1892
1893         putnext(lconnp->conn_rq, mp);
1894 }
1895
1896 /*
1897  * Sends the T_CONN_IND to the listener. The caller calls this
1898  * functions via squeue to get inside the listener's perimeter
1899  * once the 3 way hand shake is done a T_CONN_IND needs to be
1900  * sent. As an optimization, the caller can call this directly
1901  * if listener's perimeter is same as eager's.
1902  */
1903 /* ARGSUSED */
1904 void
1905 tcp_send_conn_ind(void *arg, mblk_t *mp, void *arg2)
1906 {
1907         conn_t                  *lconnp = (conn_t *)arg;
1908         tcp_t                   *listener = lconnp->conn_tcp;
1909         tcp_t                   *tcp;
1910         struct T_conn_ind       *conn_ind;
1911         ipaddr_t                *addr_cache;
1912         boolean_t               need_send_conn_ind = B_FALSE;
1913         tcp_stack_t             *tcps = listener->tcp_tcps;
1914
1915         /* retrieve the eager */
1916         conn_ind = (struct T_conn_ind *)mp->b_rptr;
1917         ASSERT(conn_ind->OPT_offset != 0 &&
1918             conn_ind->OPT_length == sizeof (intptr_t));
1919         bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp,
1920             conn_ind->OPT_length);
1921
1922         /*
1923          * TLI/XTI applications will get confused by
1924          * sending eager as an option since it violates
1925          * the option semantics. So remove the eager as
1926          * option since TLI/XTI app doesn't need it anyway.
1927          */
1928         if (!TCP_IS_SOCKET(listener)) {
1929                 conn_ind->OPT_length = 0;
1930                 conn_ind->OPT_offset = 0;
1931         }
1932         if (listener->tcp_state != TCPS_LISTEN) {
1933                 /*
1934                  * If listener has closed, it would have caused a
1935                  * a cleanup/blowoff to happen for the eager. We
1936                  * just need to return.
1937                  */
1938                 freemsg(mp);
1939                 return;
1940         }
1941
1942
1943         /*
1944          * if the conn_req_q is full defer passing up the
1945          * T_CONN_IND until space is availabe after t_accept()
1946          * processing
1947          */
1948         mutex_enter(&listener->tcp_eager_lock);
1949
1950         /*
1951          * Take the eager out, if it is in the list of droppable eagers
1952          * as we are here because the 3W handshake is over.
1953          */
1954         MAKE_UNDROPPABLE(tcp);
1955
1956         if (listener->tcp_conn_req_cnt_q < listener->tcp_conn_req_max) {
1957                 tcp_t *tail;
1958
1959                 /*
1960                  * The eager already has an extra ref put in tcp_input_data
1961                  * so that it stays till accept comes back even though it
1962                  * might get into TCPS_CLOSED as a result of a TH_RST etc.
1963                  */
1964                 ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
1965                 listener->tcp_conn_req_cnt_q0--;
1966                 listener->tcp_conn_req_cnt_q++;
1967
1968                 /* Move from SYN_RCVD to ESTABLISHED list  */
1969                 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
1970                     tcp->tcp_eager_prev_q0;
1971                 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
1972                     tcp->tcp_eager_next_q0;
1973                 tcp->tcp_eager_prev_q0 = NULL;
1974                 tcp->tcp_eager_next_q0 = NULL;
1975
1976                 /*
1977                  * Insert at end of the queue because sockfs
1978                  * sends down T_CONN_RES in chronological
1979                  * order. Leaving the older conn indications
1980                  * at front of the queue helps reducing search
1981                  * time.
1982                  */
1983                 tail = listener->tcp_eager_last_q;
1984                 if (tail != NULL)
1985                         tail->tcp_eager_next_q = tcp;
1986                 else
1987                         listener->tcp_eager_next_q = tcp;
1988                 listener->tcp_eager_last_q = tcp;
1989                 tcp->tcp_eager_next_q = NULL;
1990                 /*
1991                  * Delay sending up the T_conn_ind until we are
1992                  * done with the eager. Once we have have sent up
1993                  * the T_conn_ind, the accept can potentially complete
1994                  * any time and release the refhold we have on the eager.
1995                  */
1996                 need_send_conn_ind = B_TRUE;
1997         } else {
1998                 /*
1999                  * Defer connection on q0 and set deferred
2000                  * connection bit true
2001                  */
2002                 tcp->tcp_conn_def_q0 = B_TRUE;
2003
2004                 /* take tcp out of q0 ... */
2005                 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
2006                     tcp->tcp_eager_next_q0;
2007                 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
2008                     tcp->tcp_eager_prev_q0;
2009
2010                 /* ... and place it at the end of q0 */
2011                 tcp->tcp_eager_prev_q0 = listener->tcp_eager_prev_q0;
2012                 tcp->tcp_eager_next_q0 = listener;
2013                 listener->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp;
2014                 listener->tcp_eager_prev_q0 = tcp;
2015                 tcp->tcp_conn.tcp_eager_conn_ind = mp;
2016         }
2017
2018         /* we have timed out before */
2019         if (tcp->tcp_syn_rcvd_timeout != 0) {
2020                 tcp->tcp_syn_rcvd_timeout = 0;
2021                 listener->tcp_syn_rcvd_timeout--;
2022                 if (listener->tcp_syn_defense &&
2023                     listener->tcp_syn_rcvd_timeout <=
2024                     (tcps->tcps_conn_req_max_q0 >> 5) &&
2025                     10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() -
2026                     listener->tcp_last_rcv_lbolt)) {
2027                         /*
2028                          * Turn off the defense mode if we
2029                          * believe the SYN attack is over.
2030                          */
2031                         listener->tcp_syn_defense = B_FALSE;
2032                         if (listener->tcp_ip_addr_cache) {
2033                                 kmem_free((void *)listener->tcp_ip_addr_cache,
2034                                     IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
2035                                 listener->tcp_ip_addr_cache = NULL;
2036                         }
2037                 }
2038         }
2039         addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache);
2040         if (addr_cache != NULL) {
2041                 /*
2042                  * We have finished a 3-way handshake with this
2043                  * remote host. This proves the IP addr is good.
2044                  * Cache it!
2045                  */
2046                 addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] =
2047                     tcp->tcp_connp->conn_faddr_v4;
2048         }
2049         mutex_exit(&listener->tcp_eager_lock);
2050         if (need_send_conn_ind)
2051                 putnext(lconnp->conn_rq, mp);
2052 }