4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
26 /* This file contains all TCP kernel socket related functions. */
28 #include <sys/types.h>
29 #include <sys/strlog.h>
30 #include <sys/policy.h>
31 #include <sys/sockio.h>
32 #include <sys/strsubr.h>
33 #include <sys/strsun.h>
34 #include <sys/squeue_impl.h>
35 #include <sys/squeue.h>
36 #define _SUN_TPI_VERSION 2
37 #include <sys/tihdr.h>
38 #include <sys/timod.h>
39 #include <sys/tpicommon.h>
40 #include <sys/socketvar.h>
42 #include <inet/common.h>
43 #include <inet/proto_set.h>
46 #include <inet/tcp_impl.h>
/*
 * Forward declarations of the static socket entry points that are defined
 * later in this file (tcp_activate through tcp_close), followed by the
 * start of the sock_tcp_downcalls table definition.
 *
 * NOTE(review): the initializer of sock_tcp_downcalls is not visible in
 * this listing; presumably it lists these entry points -- confirm against
 * the complete file.
 */
48 static void tcp_activate(sock_lower_handle_t
, sock_upper_handle_t
,
49 sock_upcalls_t
*, int, cred_t
*);
50 static int tcp_accept(sock_lower_handle_t
, sock_lower_handle_t
,
51 sock_upper_handle_t
, cred_t
*);
52 static int tcp_bind(sock_lower_handle_t
, struct sockaddr
*,
54 static int tcp_listen(sock_lower_handle_t
, int, cred_t
*);
55 static int tcp_connect(sock_lower_handle_t
, const struct sockaddr
*,
56 socklen_t
, sock_connid_t
*, cred_t
*);
57 static int tcp_getpeername(sock_lower_handle_t
, struct sockaddr
*,
58 socklen_t
*, cred_t
*);
59 static int tcp_getsockname(sock_lower_handle_t
, struct sockaddr
*,
60 socklen_t
*, cred_t
*);
61 static int tcp_getsockopt(sock_lower_handle_t
, int, int, void *,
62 socklen_t
*, cred_t
*);
63 static int tcp_setsockopt(sock_lower_handle_t
, int, int, const void *,
65 static int tcp_sendmsg(sock_lower_handle_t
, mblk_t
*, struct msghdr
*,
67 static int tcp_shutdown(sock_lower_handle_t
, int, cred_t
*);
68 static void tcp_clr_flowctrl(sock_lower_handle_t
);
69 static int tcp_ioctl(sock_lower_handle_t
, int, intptr_t, int, int32_t *,
71 static int tcp_close(sock_lower_handle_t
, int, cred_t
*);
73 sock_downcalls_t sock_tcp_downcalls
= {
/*
 * tcp_activate: attach a socket (the upper layer) to a TCP conn.
 *
 * proto_handle is cast to the conn_t. The conn must not have an upper
 * handle yet (asserted). The function fills a sock_proto_props structure
 * with the initial receive-side and packet-size properties, records
 * sock_upcalls and sock_handle in the conn, and then reports the
 * properties to the socket through the su_set_proto_props upcall.
 *
 * `flags' is not referenced in the visible code.
 */
95 tcp_activate(sock_lower_handle_t proto_handle
, sock_upper_handle_t sock_handle
,
96 sock_upcalls_t
*sock_upcalls
, int flags
, cred_t
*cr
)
98 conn_t
*connp
= (conn_t
*)proto_handle
;
99 struct sock_proto_props sopp
;
100 extern struct module_info tcp_rinfo
;
102 ASSERT(connp
->conn_upper_handle
== NULL
);
104 /* All Solaris components should pass a cred for this operation. */
/* sopp_flags names every sopp field that is filled in below. */
107 sopp
.sopp_flags
= SOCKOPT_RCVHIWAT
| SOCKOPT_RCVLOWAT
|
108 SOCKOPT_MAXPSZ
| SOCKOPT_MAXBLK
| SOCKOPT_RCVTIMER
|
109 SOCKOPT_RCVTHRESH
| SOCKOPT_MAXADDRLEN
| SOCKOPT_MINPSZ
;
111 sopp
.sopp_rxhiwat
= SOCKET_RECVHIWATER
;
112 sopp
.sopp_rxlowat
= SOCKET_RECVLOWATER
;
113 sopp
.sopp_maxpsz
= INFPSZ
;
114 sopp
.sopp_maxblk
= INFPSZ
;
115 sopp
.sopp_rcvtimer
= SOCKET_TIMER_INTERVAL
;
/* The receive threshold is one eighth of the receive high-water mark. */
116 sopp
.sopp_rcvthresh
= SOCKET_RECVHIWATER
>> 3;
117 sopp
.sopp_maxaddrlen
= sizeof (sin6_t
);
118 sopp
.sopp_minpsz
= (tcp_rinfo
.mi_minpsz
== 1) ? 0 :
121 connp
->conn_upcalls
= sock_upcalls
;
122 connp
->conn_upper_handle
= sock_handle
;
/* The receive buffer must be sized and equal to the TCP receive window. */
124 ASSERT(connp
->conn_rcvbuf
!= 0 &&
125 connp
->conn_rcvbuf
== connp
->conn_tcp
->tcp_rwnd
);
126 (*sock_upcalls
->su_set_proto_props
)(sock_handle
, &sopp
);
/*
 * tcp_accept: finish accepting an eager connection.
 *
 * The listener is taken from the eager's tcp_listener rather than from
 * lproto_handle. The eager's conn receives sock_handle as its upper handle
 * and the listener's upcalls, the eager is unlinked from the listener
 * while holding tcp_eager_lock, and one reference on the listener's conn
 * is dropped.
 *
 * Returns ECONNABORTED if the eager's state is below TCPS_ESTABLISHED,
 * otherwise 0.
 */
131 tcp_accept(sock_lower_handle_t lproto_handle
,
132 sock_lower_handle_t eproto_handle
, sock_upper_handle_t sock_handle
,
135 conn_t
*lconnp
, *econnp
;
136 tcp_t
*listener
, *eager
;
139 * KSSL can move a socket from one listener to another, in which
140 * case `lproto_handle' points to the new listener. To ensure that
141 * the original listener is used the information is obtained from
144 econnp
= (conn_t
*)eproto_handle
;
145 eager
= econnp
->conn_tcp
;
146 ASSERT(IPCL_IS_NONSTR(econnp
));
147 ASSERT(eager
->tcp_listener
!= NULL
);
148 listener
= eager
->tcp_listener
;
149 lconnp
= (conn_t
*)listener
->tcp_connp
;
150 ASSERT(listener
->tcp_state
== TCPS_LISTEN
);
151 ASSERT(lconnp
->conn_upper_handle
!= NULL
);
154 * It is possible for the accept thread to race with the thread that
155 * made the su_newconn upcall in tcp_newconn_notify. Both
156 * tcp_newconn_notify and tcp_accept require that conn_upper_handle
157 * and conn_upcalls be set before returning, so they both write to
158 * them. However, we're guaranteed that the value written is the same
161 ASSERT(econnp
->conn_upper_handle
== NULL
||
162 econnp
->conn_upper_handle
== sock_handle
);
163 ASSERT(econnp
->conn_upcalls
== NULL
||
164 econnp
->conn_upcalls
== lconnp
->conn_upcalls
);
165 econnp
->conn_upper_handle
= sock_handle
;
166 econnp
->conn_upcalls
= lconnp
->conn_upcalls
;
168 ASSERT(econnp
->conn_netstack
==
169 listener
->tcp_connp
->conn_netstack
);
170 ASSERT(eager
->tcp_tcps
== listener
->tcp_tcps
);
173 * We should have a minimum of 2 references on the conn at this
174 * point. One for TCP and one for the newconn notification
175 * (which is now taken over by IP). In the normal case we would
176 * also have another reference (making a total of 3) for the conn
177 * being in the classifier hash list. However the eager could have
178 * received an RST subsequently and tcp_closei_local could have
179 * removed the eager from the classifier hash list, hence we can't
180 * assert that reference.
182 ASSERT(econnp
->conn_ref
>= 2);
184 mutex_enter(&listener
->tcp_eager_lock
);
186 * Non-STREAMS listeners never defer the notification of new
189 ASSERT(!listener
->tcp_eager_prev_q0
->tcp_conn_def_q0
);
190 tcp_eager_unlink(eager
);
191 mutex_exit(&listener
->tcp_eager_lock
);
192 CONN_DEC_REF(listener
->tcp_connp
);
/* An eager that never reached (or fell out of) ESTABLISHED is reported as aborted. */
194 return ((eager
->tcp_state
< TCPS_ESTABLISHED
) ? ECONNABORTED
: 0);
/*
 * tcp_bind: bind (or unbind) the conn behind proto_handle.
 *
 * The work is done between squeue_synch_enter() and squeue_synch_exit().
 * tcp_do_unbind() is called only while tcp_state is below TCPS_LISTEN;
 * tcp_do_bind() is called with sa, len and cr. Afterwards the error is
 * tested against -TOUTSTATE, and proto_tlitosyserr(-error) is used for
 * conversion.
 *
 * NOTE(review): the conditions that select between the unbind and bind
 * paths, the value substituted for -TOUTSTATE and the return statements
 * are not visible in this listing -- confirm against the complete file.
 */
198 tcp_bind(sock_lower_handle_t proto_handle
, struct sockaddr
*sa
,
199 socklen_t len
, cred_t
*cr
)
202 conn_t
*connp
= (conn_t
*)proto_handle
;
204 /* All Solaris components should pass a cred for this operation. */
206 ASSERT(connp
->conn_upper_handle
!= NULL
);
208 error
= squeue_synch_enter(connp
, NULL
);
210 /* failed to enter */
214 /* binding to a NULL address really means unbind */
216 if (connp
->conn_tcp
->tcp_state
< TCPS_LISTEN
)
217 error
= tcp_do_unbind(connp
);
221 error
= tcp_do_bind(connp
, sa
, len
, cr
, B_TRUE
);
224 squeue_synch_exit(connp
);
227 if (error
== -TOUTSTATE
)
230 error
= proto_tlitosyserr(-error
);
/*
 * tcp_listen: put the conn behind proto_handle into the listening state.
 *
 * Between squeue_synch_enter() and squeue_synch_exit() the function calls
 * tcp_do_listen() with the requested backlog. The su_opctl upcall with
 * SOCK_OPCTL_ENAB_ACCEPT passes sockfs the sum of tcp_conn_req_max and
 * tcps_conn_req_max_q0. In the `error < 0' branch the error is tested
 * against -TOUTSTATE and proto_tlitosyserr(-error) is used for conversion.
 *
 * NOTE(review): the condition guarding the su_opctl upcall and the return
 * statements are not visible in this listing.
 */
238 tcp_listen(sock_lower_handle_t proto_handle
, int backlog
, cred_t
*cr
)
240 conn_t
*connp
= (conn_t
*)proto_handle
;
241 tcp_t
*tcp
= connp
->conn_tcp
;
244 ASSERT(connp
->conn_upper_handle
!= NULL
);
246 /* All Solaris components should pass a cred for this operation. */
249 error
= squeue_synch_enter(connp
, NULL
);
251 /* failed to enter */
255 error
= tcp_do_listen(connp
, NULL
, 0, backlog
, cr
, B_FALSE
);
258 * sockfs needs to know what's the maximum number of socket
259 * that can be queued on the listener.
261 (*connp
->conn_upcalls
->su_opctl
)(connp
->conn_upper_handle
,
262 SOCK_OPCTL_ENAB_ACCEPT
,
263 (uintptr_t)(tcp
->tcp_conn_req_max
+
264 tcp
->tcp_tcps
->tcps_conn_req_max_q0
));
265 } else if (error
< 0) {
266 if (error
== -TOUTSTATE
)
269 error
= proto_tlitosyserr(-error
);
271 squeue_synch_exit(connp
);
/*
 * tcp_connect: start connecting the conn behind proto_handle to `sa'.
 *
 * The address is first checked with proto_verify_ip_addr() against the
 * conn's address family. After squeue_synch_enter(), tcp_do_connect() is
 * called with the calling process id (curproc->p_pid) and *id receives the
 * conn's tcp_connid. In the `error < 0' branch, -TOUTSTATE is examined
 * with a switch on tcp_state and proto_tlitosyserr(-error) is used for
 * conversion. If tcp_loopback is set, the socket is informed through
 * su_set_proto_props with SOCKOPT_LOOPBACK.
 *
 * The final return yields EINPROGRESS when error is 0, otherwise error.
 *
 * NOTE(review): several conditions and early returns are not visible in
 * this listing -- confirm against the complete file.
 */
276 tcp_connect(sock_lower_handle_t proto_handle
, const struct sockaddr
*sa
,
277 socklen_t len
, sock_connid_t
*id
, cred_t
*cr
)
279 conn_t
*connp
= (conn_t
*)proto_handle
;
282 ASSERT(connp
->conn_upper_handle
!= NULL
);
284 /* All Solaris components should pass a cred for this operation. */
287 error
= proto_verify_ip_addr(connp
->conn_family
, sa
, len
);
292 error
= squeue_synch_enter(connp
, NULL
);
294 /* failed to enter */
299 * TCP supports quick connect, so no need to do an implicit bind
301 error
= tcp_do_connect(connp
, sa
, len
, cr
, curproc
->p_pid
);
303 *id
= connp
->conn_tcp
->tcp_connid
;
304 } else if (error
< 0) {
305 if (error
== -TOUTSTATE
) {
306 switch (connp
->conn_tcp
->tcp_state
) {
310 case TCPS_ESTABLISHED
:
321 error
= proto_tlitosyserr(-error
);
325 if (connp
->conn_tcp
->tcp_loopback
) {
326 struct sock_proto_props sopp
;
328 sopp
.sopp_flags
= SOCKOPT_LOOPBACK
;
329 sopp
.sopp_loopback
= B_TRUE
;
331 (*connp
->conn_upcalls
->su_set_proto_props
)(
332 connp
->conn_upper_handle
, &sopp
);
335 squeue_synch_exit(connp
);
337 return ((error
== 0) ? EINPROGRESS
: error
);
/*
 * tcp_getpeername: report the peer address of the connection.
 *
 * The TCP state is compared against TCPS_SYN_RCVD first; the visible
 * return path delegates to conn_getpeername() with addr and addrlenp.
 *
 * NOTE(review): the statement executed when the state is below
 * TCPS_SYN_RCVD is not visible in this listing -- presumably an error
 * return; confirm against the complete file.
 */
342 tcp_getpeername(sock_lower_handle_t proto_handle
, struct sockaddr
*addr
,
343 socklen_t
*addrlenp
, cred_t
*cr
)
345 conn_t
*connp
= (conn_t
*)proto_handle
;
346 tcp_t
*tcp
= connp
->conn_tcp
;
348 /* All Solaris components should pass a cred for this operation. */
352 if (tcp
->tcp_state
< TCPS_SYN_RCVD
)
355 return (conn_getpeername(connp
, addr
, addrlenp
));
/*
 * tcp_getsockname: report the local address of the conn.
 *
 * proto_handle is cast to the conn_t and the request is delegated to
 * conn_getsockname(), whose result is returned unchanged.
 */
360 tcp_getsockname(sock_lower_handle_t proto_handle
, struct sockaddr
*addr
,
361 socklen_t
*addrlenp
, cred_t
*cr
)
363 conn_t
*connp
= (conn_t
*)proto_handle
;
365 /* All Solaris components should pass a cred for this operation. */
368 return (conn_getsockname(connp
, addr
, addrlenp
));
/*
 * tcp_getsockopt: fetch a socket option value for the conn.
 *
 * The request is validated with proto_opt_check() against the tcp_opt_obj
 * option table; that call also yields max_optbuf_len, the size of the
 * temporary buffer. The buffer is allocated with KM_SLEEP before
 * squeue_synch_enter(), tcp_opt_get() fills it, and then MIN(len, *optlen)
 * bytes are copied to optvalp and that size is written back to *optlen.
 * kmem_free() of the temporary buffer appears on the ENOMEM path, after
 * tcp_opt_get(), and after the copy-out.
 *
 * NOTE(review): the conditions around the error paths and the return
 * statements are not visible in this listing.
 */
371 /* returns UNIX error, the optlen is a value-result arg */
373 tcp_getsockopt(sock_lower_handle_t proto_handle
, int level
, int option_name
,
374 void *optvalp
, socklen_t
*optlen
, cred_t
*cr
)
376 conn_t
*connp
= (conn_t
*)proto_handle
;
378 t_uscalar_t max_optbuf_len
;
382 ASSERT(connp
->conn_upper_handle
!= NULL
);
384 error
= proto_opt_check(level
, option_name
, *optlen
, &max_optbuf_len
,
385 tcp_opt_obj
.odb_opt_des_arr
,
386 tcp_opt_obj
.odb_opt_arr_cnt
,
387 B_FALSE
, B_TRUE
, cr
);
390 error
= proto_tlitosyserr(-error
);
395 optvalp_buf
= kmem_alloc(max_optbuf_len
, KM_SLEEP
);
397 error
= squeue_synch_enter(connp
, NULL
);
398 if (error
== ENOMEM
) {
399 kmem_free(optvalp_buf
, max_optbuf_len
);
403 len
= tcp_opt_get(connp
, level
, option_name
, optvalp_buf
);
404 squeue_synch_exit(connp
);
407 kmem_free(optvalp_buf
, max_optbuf_len
);
412 * update optlen and copy option value
/* Never copy more than the caller's buffer (*optlen) can hold. */
414 t_uscalar_t size
= MIN(len
, *optlen
);
416 bcopy(optvalp_buf
, optvalp
, size
);
417 bcopy(&size
, optlen
, sizeof (size
));
419 kmem_free(optvalp_buf
, max_optbuf_len
);
/*
 * tcp_setsockopt: set a socket option on the conn.
 *
 * For level IPPROTO_TCP there is a fast path that does not enter the
 * squeue: after checking that optlen equals sizeof (int32_t) it updates
 * tcp_naglim while holding tcp_non_sq_lock -- to 1 when the supplied value
 * is non-zero, otherwise to tcp_mss.
 * NOTE(review): the case label selecting this fast path is not visible in
 * this listing -- confirm which option it handles.
 *
 * The general path calls squeue_synch_enter(), validates the request with
 * proto_opt_check() against the tcp_opt_obj option table, applies it with
 * tcp_opt_set() using SETFN_OPTCOM_NEGOTIATE, and calls
 * squeue_synch_exit().
 */
424 tcp_setsockopt(sock_lower_handle_t proto_handle
, int level
, int option_name
,
425 const void *optvalp
, socklen_t optlen
, cred_t
*cr
)
427 conn_t
*connp
= (conn_t
*)proto_handle
;
430 ASSERT(connp
->conn_upper_handle
!= NULL
);
432 * Entering the squeue synchronously can result in a context switch,
433 * which can cause a rather sever performance degradation. So we try to
434 * handle whatever options we can without entering the squeue.
436 if (level
== IPPROTO_TCP
) {
437 switch (option_name
) {
439 if (optlen
!= sizeof (int32_t))
441 mutex_enter(&connp
->conn_tcp
->tcp_non_sq_lock
);
442 connp
->conn_tcp
->tcp_naglim
= *(int *)optvalp
? 1 :
443 connp
->conn_tcp
->tcp_mss
;
444 mutex_exit(&connp
->conn_tcp
->tcp_non_sq_lock
);
451 error
= squeue_synch_enter(connp
, NULL
);
452 if (error
== ENOMEM
) {
456 error
= proto_opt_check(level
, option_name
, optlen
, NULL
,
457 tcp_opt_obj
.odb_opt_des_arr
,
458 tcp_opt_obj
.odb_opt_arr_cnt
,
459 B_TRUE
, B_FALSE
, cr
);
463 error
= proto_tlitosyserr(-error
);
465 squeue_synch_exit(connp
);
469 error
= tcp_opt_set(connp
, SETFN_OPTCOM_NEGOTIATE
, level
, option_name
,
470 optlen
, (uchar_t
*)optvalp
, (uint_t
*)&optlen
, (uchar_t
*)optvalp
,
472 squeue_synch_exit(connp
);
/*
 * tcp_sendmsg: hand the data in `mp' to TCP for transmission.
 *
 * Visible steps: msg_controllen is tested first, then the message type
 * (DB_TYPE) is switched on. With the TCP state below TCPS_ESTABLISHED the
 * result is ENOTCONN for TCPS_SYN_SENT or when tcp_connid is not positive,
 * and EPIPE when tcp_connid is positive; states above TCPS_CLOSE_WAIT are
 * tested separately. The message size from msgdsize() is added to
 * tcp_squeue_bytes while holding tcp_non_sq_lock, and TCP_UNSENT_BYTES is
 * compared against conn_sndbuf for flow control. The mblk is queued on the
 * conn's squeue with tcp_output_urgent when MSG_OOB is set in msg_flags;
 * a second SQUEUE_ENTER_ONE uses tcp_output.
 *
 * NOTE(review): the bodies of several branches and the remaining return
 * statements are not visible in this listing.
 */
481 tcp_sendmsg(sock_lower_handle_t proto_handle
, mblk_t
*mp
, struct msghdr
*msg
,
486 conn_t
*connp
= (conn_t
*)proto_handle
;
489 /* All Solaris components should pass a cred for this operation. */
492 ASSERT(connp
->conn_ref
>= 2);
493 ASSERT(connp
->conn_upper_handle
!= NULL
);
495 if (msg
->msg_controllen
!= 0) {
500 switch (DB_TYPE(mp
)) {
502 tcp
= connp
->conn_tcp
;
505 tcpstate
= tcp
->tcp_state
;
506 if (tcpstate
< TCPS_ESTABLISHED
) {
509 * We return ENOTCONN if the endpoint is trying to
510 * connect or has never been connected, and EPIPE if it
511 * has been disconnected. The connection id helps us
512 * distinguish between the last two cases.
514 return ((tcpstate
== TCPS_SYN_SENT
) ? ENOTCONN
:
515 ((tcp
->tcp_connid
> 0) ? EPIPE
: ENOTCONN
));
516 } else if (tcpstate
> TCPS_CLOSE_WAIT
) {
521 msize
= msgdsize(mp
);
523 mutex_enter(&tcp
->tcp_non_sq_lock
);
524 tcp
->tcp_squeue_bytes
+= msize
;
526 * Squeue Flow Control
528 if (TCP_UNSENT_BYTES(tcp
) > connp
->conn_sndbuf
) {
531 mutex_exit(&tcp
->tcp_non_sq_lock
);
534 * The application may pass in an address in the msghdr, but
535 * we ignore the address on connection-oriented sockets.
536 * Just like BSD this code does not generate an error for
537 * TCP (a CONNREQUIRED socket) when sending to an address
538 * passed in with sendto/sendmsg. Instead the data is
539 * delivered on the connection as if no address had been
544 if (msg
->msg_flags
& MSG_OOB
) {
545 SQUEUE_ENTER_ONE(connp
->conn_sqp
, mp
, tcp_output_urgent
,
546 connp
, NULL
, tcp_squeue_flag
, SQTAG_TCP_OUTPUT
);
548 SQUEUE_ENTER_ONE(connp
->conn_sqp
, mp
, tcp_output
,
549 connp
, NULL
, tcp_squeue_flag
, SQTAG_TCP_OUTPUT
);
/*
 * tcp_shutdown: shut down the send and/or receive side of the connection.
 *
 * The TCP state is compared against TCPS_SYN_SENT first. When `how' is not
 * SHUT_RD, a zero-length mblk is obtained with allocb_wait() and
 * tcp_shutdown_output is queued on the conn's squeue with it; sockfs is
 * then told via su_opctl with SOCK_OPCTL_SHUT_SEND. The receive side is
 * reported via su_opctl with SOCK_OPCTL_SHUT_RECV.
 *
 * NOTE(review): the statement taken for states below TCPS_SYN_SENT, the
 * condition guarding the receive-side upcall and the return statements are
 * not visible in this listing.
 */
564 tcp_shutdown(sock_lower_handle_t proto_handle
, int how
, cred_t
*cr
)
566 conn_t
*connp
= (conn_t
*)proto_handle
;
567 tcp_t
*tcp
= connp
->conn_tcp
;
569 ASSERT(connp
->conn_upper_handle
!= NULL
);
571 /* All Solaris components should pass a cred for this operation. */
575 * X/Open requires that we check the connected state.
577 if (tcp
->tcp_state
< TCPS_SYN_SENT
)
580 /* shutdown the send side */
581 if (how
!= SHUT_RD
) {
584 bp
= allocb_wait(0, BPRI_HI
, STR_NOSIG
, NULL
);
586 SQUEUE_ENTER_ONE(connp
->conn_sqp
, bp
, tcp_shutdown_output
,
587 connp
, NULL
, SQ_NODRAIN
, SQTAG_TCP_SHUTDOWN_OUTPUT
);
589 (*connp
->conn_upcalls
->su_opctl
)(connp
->conn_upper_handle
,
590 SOCK_OPCTL_SHUT_SEND
, 0);
593 /* shutdown the recv side */
595 (*connp
->conn_upcalls
->su_opctl
)(connp
->conn_upper_handle
,
596 SOCK_OPCTL_SHUT_RECV
, 0);
/*
 * tcp_clr_flowctrl: reopen the receive window of the conn.
 *
 * The reserved mblk tcp_rsrv_mp is claimed while holding tcp_rsrv_mp_lock
 * (a NULL value means another tcp_clr_flowctrl() is already running) and
 * is passed to squeue_synch_enter(). Afterwards the mblk is stored back
 * into tcp_rsrv_mp under the same lock. For a fused connection
 * tcp_fuse_backenable() is called; the other visible path resets tcp_rwnd
 * to conn_rcvbuf and, when the state is at least TCPS_ESTABLISHED and
 * tcp_rwnd_reopen() returns TH_ACK_NEEDED, sends an ACK with
 * tcp_xmit_ctl(). The function ends with squeue_synch_exit().
 */
602 tcp_clr_flowctrl(sock_lower_handle_t proto_handle
)
604 conn_t
*connp
= (conn_t
*)proto_handle
;
605 tcp_t
*tcp
= connp
->conn_tcp
;
609 ASSERT(connp
->conn_upper_handle
!= NULL
);
612 * If tcp->tcp_rsrv_mp == NULL, it means that tcp_clr_flowctrl()
613 * is currently running.
615 mutex_enter(&tcp
->tcp_rsrv_mp_lock
);
616 if ((mp
= tcp
->tcp_rsrv_mp
) == NULL
) {
617 mutex_exit(&tcp
->tcp_rsrv_mp_lock
);
620 tcp
->tcp_rsrv_mp
= NULL
;
621 mutex_exit(&tcp
->tcp_rsrv_mp_lock
);
623 error
= squeue_synch_enter(connp
, mp
);
/* Put the reserved mblk back so that a later call can claim it. */
626 mutex_enter(&tcp
->tcp_rsrv_mp_lock
);
627 tcp
->tcp_rsrv_mp
= mp
;
628 mutex_exit(&tcp
->tcp_rsrv_mp_lock
);
630 if (tcp
->tcp_fused
) {
631 tcp_fuse_backenable(tcp
);
633 tcp
->tcp_rwnd
= connp
->conn_rcvbuf
;
635 * Send back a window update immediately if TCP is above
636 * ESTABLISHED state and the increase of the rcv window
637 * that the other side knows is at least 1 MSS after flow
640 if (tcp
->tcp_state
>= TCPS_ESTABLISHED
&&
641 tcp_rwnd_reopen(tcp
) == TH_ACK_NEEDED
) {
642 tcp_xmit_ctl(NULL
, tcp
,
643 (tcp
->tcp_swnd
== 0) ? tcp
->tcp_suna
:
644 tcp
->tcp_snxt
, tcp
->tcp_rnxt
, TH_ACK
);
648 squeue_synch_exit(connp
);
/*
 * tcp_ioctl: handle an ioctl issued on a non-STREAMS TCP socket.
 *
 * If the conn has no helper stream yet, one is created with
 * ip_create_helper_stream() using the stack's tcps_ldi_ident; a failure is
 * logged with ip0dbg. _SIOCSOCKFALLBACK and TCP_IOC_ABORT_CONN share a
 * case that logs through ip1dbg. On the pass-through path conn_lock is
 * taken and CONN_CLOSING is checked; otherwise the ioctl reference is
 * bumped, the command is forwarded with ldi_ioctl() on the helper stream's
 * handle, and the reference is dropped again.
 *
 * NOTE(review): the values returned on the rejected and closing paths are
 * not visible in this listing.
 */
653 tcp_ioctl(sock_lower_handle_t proto_handle
, int cmd
, intptr_t arg
,
654 int mode
, int32_t *rvalp
, cred_t
*cr
)
656 conn_t
*connp
= (conn_t
*)proto_handle
;
659 ASSERT(connp
->conn_upper_handle
!= NULL
);
661 /* All Solaris components should pass a cred for this operation. */
665 * If we don't have a helper stream then create one.
666 * ip_create_helper_stream takes care of locking the conn_t,
667 * so this check for NULL is just a performance optimization.
669 if (connp
->conn_helper_info
== NULL
) {
670 tcp_stack_t
*tcps
= connp
->conn_tcp
->tcp_tcps
;
673 * Create a helper stream for non-STREAMS socket.
675 error
= ip_create_helper_stream(connp
, tcps
->tcps_ldi_ident
);
677 ip0dbg(("tcp_ioctl: create of IP helper stream "
678 "failed %d\n", error
));
686 case _SIOCSOCKFALLBACK
:
687 case TCP_IOC_ABORT_CONN
:
690 ip1dbg(("tcp_ioctl: cmd 0x%x on non streams socket",
696 * If the conn is not closing, pass on to IP using
697 * helper stream. Bump the ioctlref to prevent tcp_close
698 * from closing the rq/wq out from underneath the ioctl
699 * if it ends up queued or aborted/interrupted.
701 mutex_enter(&connp
->conn_lock
);
702 if (connp
->conn_state_flags
& (CONN_CLOSING
)) {
703 mutex_exit(&connp
->conn_lock
);
707 CONN_INC_IOCTLREF_LOCKED(connp
);
708 error
= ldi_ioctl(connp
->conn_helper_info
->iphs_handle
,
709 cmd
, arg
, mode
, cr
, rvalp
);
710 CONN_DEC_IOCTLREF(connp
);
/*
 * tcp_close: close the conn behind proto_handle.
 *
 * Calls tcp_close_common() with `flags', releases the helper stream with
 * ip_free_helper_stream(), and returns EINPROGRESS so that sockfs waits
 * for the 'closed' upcall before freeing the socket.
 *
 * NOTE(review): the statement that drops IP's reference, described by the
 * comment below, is not visible in this listing.
 */
718 tcp_close(sock_lower_handle_t proto_handle
, int flags
, cred_t
*cr
)
720 conn_t
*connp
= (conn_t
*)proto_handle
;
722 ASSERT(connp
->conn_upper_handle
!= NULL
);
724 /* All Solaris components should pass a cred for this operation. */
727 tcp_close_common(connp
, flags
);
729 ip_free_helper_stream(connp
);
732 * Drop IP's reference on the conn. This is the last reference
733 * on the connp if the state was less than established. If the
734 * connection has gone into timewait state, then we will have
735 * one ref for the TCP and one more ref (total of two) for the
736 * classifier connected hash list (a timewait connections stays
737 * in connected hash till closed).
739 * We can't assert the references because there might be other
740 * transient reference places because of some walkers or queued
741 * packets in squeue for the timewait state.
746 * EINPROGRESS tells sockfs to wait for a 'closed' upcall before
747 * freeing the socket.
749 return (EINPROGRESS
);
/*
 * tcp_create: create a non-STREAMS TCP endpoint for a new socket.
 *
 * Only SOCK_STREAM over AF_INET or AF_INET6 with proto 0 or IPPROTO_TCP is
 * accepted; any other combination sets *errorp to EPROTONOSUPPORT. The
 * conn is obtained from tcp_create_common(). While holding conn_lock the
 * TCP reference is added (conn_ref is asserted to be 2), CONN_INCIPIENT is
 * cleared and IPCL_NONSTR is set. *sock_downcalls is pointed at
 * sock_tcp_downcalls, *smodep receives the socket mode flags, and the conn
 * is returned as the lower handle.
 *
 * NOTE(review): the handling of a failed tcp_create_common() and the
 * return for the unsupported case are not visible in this listing.
 */
754 tcp_create(int family
, int type
, int proto
, sock_downcalls_t
**sock_downcalls
,
755 uint_t
*smodep
, int *errorp
, int flags
, cred_t
*credp
)
758 boolean_t isv6
= family
== AF_INET6
;
760 if (type
!= SOCK_STREAM
|| (family
!= AF_INET
&& family
!= AF_INET6
) ||
761 (proto
!= 0 && proto
!= IPPROTO_TCP
)) {
762 *errorp
= EPROTONOSUPPORT
;
766 connp
= tcp_create_common(credp
, isv6
, B_TRUE
, errorp
);
772 * Put the ref for TCP. Ref for IP was already put
773 * by ipcl_conn_create. Also make the conn_t globally
776 mutex_enter(&connp
->conn_lock
);
777 CONN_INC_REF_LOCKED(connp
);
778 ASSERT(connp
->conn_ref
== 2);
779 connp
->conn_state_flags
&= ~CONN_INCIPIENT
;
781 connp
->conn_flags
|= IPCL_NONSTR
;
782 mutex_exit(&connp
->conn_lock
);
784 ASSERT(errorp
!= NULL
);
786 *sock_downcalls
= &sock_tcp_downcalls
;
787 *smodep
= SM_CONNREQUIRED
| SM_EXDATA
| SM_ACCEPTSUPP
|
790 return ((sock_lower_handle_t
)connp
);
796 * A direct socket is falling back to using STREAMS. The queue
797 * that is being passed down was created using tcp_open() with
798 * the SO_FALLBACK flag set. As a result, the queue is not
799 * associated with a conn, and the q_ptrs instead contain the
800 * dev and minor arena that should be used.
802 * The 'issocket' flag indicates whether the FireEngine
803 * optimizations should be used. The common case would be that
804 * optimizations are enabled, and they might be subsequently
805 * disabled using the _SIOCSOCKFALLBACK ioctl.
/*
 * tcp_fallback_noneager: fall back to STREAMS/TPI for a conn that is not
 * an eager.
 *
 * Visible steps, in order:
 *  - conn_dev and conn_minor_arena are taken from the q_ptr of the read
 *    and write side of `q', and both q_ptrs are then pointed at the conn;
 *  - conn_rq / conn_wq are set to the two sides of `q';
 *  - the write queue's q_qinfo is set to tcp_sock_winit and
 *    tcp_use_pure_tpi() is called (NOTE(review): presumably alternatives
 *    selected by `issocket' -- the condition is not visible here);
 *  - the helper stream is freed;
 *  - stropt_mp is turned into an M_SETOPTS message carrying the
 *    high-water mark, write offset and maximum block size, and sent up
 *    the read queue with putnext();
 *  - the capability ack and the local and peer addresses are collected;
 *  - quiesced_cb is invoked with that information and returns an mblk
 *    whose b_next is walked through tcp_rcv_list;
 *  - if there are connections in q0 they are discarded with
 *    tcp_eager_cleanup().
 */
809 * An active connection is falling back to TPI. Gather all the information
810 * required by the STREAM head and TPI sonode and send it up.
813 tcp_fallback_noneager(tcp_t
*tcp
, mblk_t
*stropt_mp
, queue_t
*q
,
814 boolean_t issocket
, so_proto_quiesced_cb_t quiesced_cb
,
815 sock_quiesce_arg_t
*arg
)
817 conn_t
*connp
= tcp
->tcp_connp
;
818 struct stroptions
*stropt
;
819 struct T_capability_ack tca
;
820 struct sockaddr_in6 laddr
, faddr
;
821 socklen_t laddrlen
, faddrlen
;
826 connp
->conn_dev
= (dev_t
)RD(q
)->q_ptr
;
827 connp
->conn_minor_arena
= WR(q
)->q_ptr
;
829 RD(q
)->q_ptr
= WR(q
)->q_ptr
= connp
;
831 connp
->conn_rq
= RD(q
);
832 connp
->conn_wq
= WR(q
);
834 WR(q
)->q_qinfo
= &tcp_sock_winit
;
837 tcp_use_pure_tpi(tcp
);
840 * free the helper stream
842 ip_free_helper_stream(connp
);
845 * Notify the STREAM head about options
847 DB_TYPE(stropt_mp
) = M_SETOPTS
;
848 stropt
= (struct stroptions
*)stropt_mp
->b_rptr
;
849 stropt_mp
->b_wptr
+= sizeof (struct stroptions
);
850 stropt
->so_flags
= SO_HIWAT
| SO_WROFF
| SO_MAXBLK
;
852 stropt
->so_wroff
= connp
->conn_ht_iphc_len
+ (tcp
->tcp_loopback
? 0 :
853 tcp
->tcp_tcps
->tcps_wroff_xtra
);
854 if (tcp
->tcp_snd_sack_ok
)
855 stropt
->so_wroff
+= TCPOPT_MAX_SACK_LEN
;
856 stropt
->so_hiwat
= connp
->conn_rcvbuf
;
857 stropt
->so_maxblk
= tcp_maxpsz_set(tcp
, B_FALSE
);
859 putnext(RD(q
), stropt_mp
);
862 * Collect the information needed to sync with the sonode
864 tcp_do_capability_ack(tcp
, &tca
, TC1_INFO
|TC1_ACCEPTOR_ID
);
866 laddrlen
= faddrlen
= sizeof (sin6_t
);
867 (void) tcp_getsockname((sock_lower_handle_t
)connp
,
868 (struct sockaddr
*)&laddr
, &laddrlen
, CRED());
869 error
= tcp_getpeername((sock_lower_handle_t
)connp
,
870 (struct sockaddr
*)&faddr
, &faddrlen
, CRED());
875 if (connp
->conn_oobinline
)
876 opts
|= SO_OOBINLINE
;
877 if (connp
->conn_ixa
->ixa_flags
& IXAF_DONTROUTE
)
878 opts
|= SO_DONTROUTE
;
881 * Notify the socket that the protocol is now quiescent,
882 * and it's therefore safe move data from the socket
883 * to the stream head.
885 mp
= (*quiesced_cb
)(connp
->conn_upper_handle
, arg
, &tca
,
886 (struct sockaddr
*)&laddr
, laddrlen
,
887 (struct sockaddr
*)&faddr
, faddrlen
, opts
);
891 tcp
->tcp_rcv_list
= mp
->b_next
;
896 ASSERT(tcp
->tcp_rcv_last_head
== NULL
);
897 ASSERT(tcp
->tcp_rcv_last_tail
== NULL
);
898 ASSERT(tcp
->tcp_rcv_cnt
== 0);
901 * All eagers in q0 are marked as being non-STREAM, so they will
902 * make su_newconn upcalls when the handshake completes, which
903 * will fail (resulting in the conn being closed). So we just blow
904 * off everything in q0 instead of waiting for the inevitable.
906 if (tcp
->tcp_conn_req_cnt_q0
!= 0)
907 tcp_eager_cleanup(tcp
, B_TRUE
);
/*
 * tcp_fallback_eager: fall back to TPI for an eager that still belongs to
 * a listener (asserted non-NULL).
 *
 * Visible steps, in order:
 *  - quiesced_cb is invoked for the eager's upper handle; the returned
 *    mblk chain becomes the eager's receive list, tcp_rcv_cnt is
 *    accumulated with msgdsize() over the b_next chain, and tcp_rwnd is
 *    reduced by tcp_rcv_cnt after comparing the two;
 *  - tcp_issocket is cleared;
 *  - the eager is marked detached and hard-binding, and borrows the
 *    listener's conn_rq / conn_wq;
 *  - the saved connection indication (tcp_eager_conn_ind) is taken,
 *    OPT_length and OPT_offset of its T_conn_ind are zeroed, and it is
 *    sent up the listener's read queue with putnext().
 *
 * NOTE(review): `issocket' is not referenced in the visible code, and
 * several conditions are not visible in this listing.
 */
911 * An eager is falling back to TPI. All we have to do is send
915 tcp_fallback_eager(tcp_t
*eager
, boolean_t issocket
,
916 so_proto_quiesced_cb_t quiesced_cb
, sock_quiesce_arg_t
*arg
)
918 conn_t
*connp
= eager
->tcp_connp
;
919 tcp_t
*listener
= eager
->tcp_listener
;
922 ASSERT(listener
!= NULL
);
925 * Notify the socket that the protocol is now quiescent,
926 * and it's therefore safe move data from the socket
927 * to tcp's rcv queue.
929 mp
= (*quiesced_cb
)(connp
->conn_upper_handle
, arg
, NULL
, NULL
, 0,
933 ASSERT(eager
->tcp_rcv_cnt
== 0);
935 eager
->tcp_rcv_list
= mp
;
936 eager
->tcp_rcv_cnt
= msgdsize(mp
);
937 while (mp
->b_next
!= NULL
) {
939 eager
->tcp_rcv_cnt
+= msgdsize(mp
);
941 eager
->tcp_rcv_last_head
= mp
;
944 eager
->tcp_rcv_last_tail
= mp
;
945 if (eager
->tcp_rcv_cnt
> eager
->tcp_rwnd
)
948 eager
->tcp_rwnd
-= eager
->tcp_rcv_cnt
;
952 eager
->tcp_issocket
= B_FALSE
;
954 * The stream for this eager does not yet exist, so mark it as
957 eager
->tcp_detached
= B_TRUE
;
958 eager
->tcp_hard_binding
= B_TRUE
;
959 connp
->conn_rq
= listener
->tcp_connp
->conn_rq
;
960 connp
->conn_wq
= listener
->tcp_connp
->conn_wq
;
962 /* Send up the connection indication */
963 mp
= eager
->tcp_conn
.tcp_eager_conn_ind
;
965 eager
->tcp_conn
.tcp_eager_conn_ind
= NULL
;
968 * TLI/XTI applications will get confused by
969 * sending eager as an option since it violates
970 * the option semantics. So remove the eager as
971 * option since TLI/XTI app doesn't need it anyway.
974 struct T_conn_ind
*conn_ind
;
976 conn_ind
= (struct T_conn_ind
*)mp
->b_rptr
;
977 conn_ind
->OPT_length
= 0;
978 conn_ind
->OPT_offset
= 0;
982 * Sockfs guarantees that the listener will not be closed
983 * during fallback. So we can safely use the listener's queue.
985 putnext(listener
->tcp_connp
->conn_rq
, mp
);
/*
 * tcp_fallback: convert a direct (non-STREAMS) TCP socket to STREAMS/TPI.
 *
 * Two mblks are pre-allocated with allocb_wait() before entering the
 * squeue: one sized for a struct stroptions and one T_ordrel_ind message
 * (type M_PROTO, PRIM_type T_ORDREL_IND). After squeue_synch_enter() the
 * work is delegated to tcp_fallback_eager() when the conn has a
 * tcp_listener; tcp_fallback_noneager() is also called in the visible
 * code, presumably for the other case. IPCL_NONSTR is then cleared from
 * conn_flags, the T_ordrel_ind mblk is stored in tcp_ordrel_mp, and the
 * squeue is left with squeue_synch_exit().
 *
 * NOTE(review): the cleanup taken when entering the squeue fails, the
 * unfuse step mentioned in the comment and the return statements are not
 * visible in this listing.
 */
990 tcp_fallback(sock_lower_handle_t proto_handle
, queue_t
*q
,
991 boolean_t direct_sockfs
, so_proto_quiesced_cb_t quiesced_cb
,
992 sock_quiesce_arg_t
*arg
)
995 conn_t
*connp
= (conn_t
*)proto_handle
;
1000 tcp
= connp
->conn_tcp
;
1002 stropt_mp
= allocb_wait(sizeof (struct stroptions
), BPRI_HI
, STR_NOSIG
,
1005 /* Pre-allocate the T_ordrel_ind mblk. */
1006 ASSERT(tcp
->tcp_ordrel_mp
== NULL
);
1007 ordrel_mp
= allocb_wait(sizeof (struct T_ordrel_ind
), BPRI_HI
,
1009 ordrel_mp
->b_datap
->db_type
= M_PROTO
;
1010 ((struct T_ordrel_ind
*)ordrel_mp
->b_rptr
)->PRIM_type
= T_ORDREL_IND
;
1011 ordrel_mp
->b_wptr
+= sizeof (struct T_ordrel_ind
);
1014 * Enter the squeue so that no new packets can come in
1016 error
= squeue_synch_enter(connp
, NULL
);
1018 /* failed to enter, free all the pre-allocated messages. */
1025 * Both endpoints must be of the same type (either STREAMS or
1026 * non-STREAMS) for fusion to be enabled. So if we are fused,
1027 * we have to unfuse.
1032 if (tcp
->tcp_listener
!= NULL
) {
1033 /* The eager will deal with opts when accept() is called */
1035 tcp_fallback_eager(tcp
, direct_sockfs
, quiesced_cb
, arg
);
1037 tcp_fallback_noneager(tcp
, stropt_mp
, q
, direct_sockfs
,
1042 * No longer a direct socket
1044 * Note that we intentionally leave the upper_handle and upcalls
1045 * intact, since eagers may still be using them.
1047 connp
->conn_flags
&= ~IPCL_NONSTR
;
1048 tcp
->tcp_ordrel_mp
= ordrel_mp
;
1051 * There should be atleast two ref's (IP + TCP)
1053 ASSERT(connp
->conn_ref
>= 2);
1054 squeue_synch_exit(connp
);
1060 * Notifies a non-STREAMS based listener about a new connection. This
1061 * function is executed on the *eager*'s squeue once the 3 way handshake
1062 * has completed. Note that the behavior differs from STREAMS, where the
1063 * T_CONN_IND is sent up by tcp_send_conn_ind() while on the *listener*'s
1066 * Returns B_TRUE if the notification succeeded and an upper handle was
1067 * obtained. `tcp' should be closed on failure.
1070 tcp_newconn_notify(tcp_t
*tcp
, ip_recv_attr_t
*ira
)
1072 tcp_t
*listener
= tcp
->tcp_listener
;
1073 conn_t
*lconnp
= listener
->tcp_connp
;
1074 conn_t
*econnp
= tcp
->tcp_connp
;
1076 ipaddr_t
*addr_cache
;
1077 sock_upper_handle_t upper
;
1078 struct sock_proto_props sopp
;
1080 mutex_enter(&listener
->tcp_eager_lock
);
1082 * Take the eager out, if it is in the list of droppable eagers
1083 * as we are here because the 3W handshake is over.
1085 MAKE_UNDROPPABLE(tcp
);
1087 * The eager already has an extra ref put in tcp_input_data
1088 * so that it stays till accept comes back even though it
1089 * might get into TCPS_CLOSED as a result of a TH_RST etc.
1091 ASSERT(listener
->tcp_conn_req_cnt_q0
> 0);
1092 listener
->tcp_conn_req_cnt_q0
--;
1093 listener
->tcp_conn_req_cnt_q
++;
1095 /* Move from SYN_RCVD to ESTABLISHED list */
1096 tcp
->tcp_eager_next_q0
->tcp_eager_prev_q0
= tcp
->tcp_eager_prev_q0
;
1097 tcp
->tcp_eager_prev_q0
->tcp_eager_next_q0
= tcp
->tcp_eager_next_q0
;
1098 tcp
->tcp_eager_prev_q0
= NULL
;
1099 tcp
->tcp_eager_next_q0
= NULL
;
1102 * Insert at end of the queue because connections are accepted
1103 * in chronological order. Leaving the older connections at front
1104 * of the queue helps reduce search time.
1106 tail
= listener
->tcp_eager_last_q
;
1108 tail
->tcp_eager_next_q
= tcp
;
1110 listener
->tcp_eager_next_q
= tcp
;
1111 listener
->tcp_eager_last_q
= tcp
;
1112 tcp
->tcp_eager_next_q
= NULL
;
1114 /* we have timed out before */
1115 if (tcp
->tcp_syn_rcvd_timeout
!= 0) {
1116 tcp
->tcp_syn_rcvd_timeout
= 0;
1117 listener
->tcp_syn_rcvd_timeout
--;
1118 if (listener
->tcp_syn_defense
&&
1119 listener
->tcp_syn_rcvd_timeout
<=
1120 (listener
->tcp_tcps
->tcps_conn_req_max_q0
>> 5) &&
1121 10*MINUTES
< TICK_TO_MSEC(ddi_get_lbolt64() -
1122 listener
->tcp_last_rcv_lbolt
)) {
1124 * Turn off the defense mode if we
1125 * believe the SYN attack is over.
1127 listener
->tcp_syn_defense
= B_FALSE
;
1128 if (listener
->tcp_ip_addr_cache
) {
1129 kmem_free((void *)listener
->tcp_ip_addr_cache
,
1130 IP_ADDR_CACHE_SIZE
* sizeof (ipaddr_t
));
1131 listener
->tcp_ip_addr_cache
= NULL
;
1135 addr_cache
= (ipaddr_t
*)(listener
->tcp_ip_addr_cache
);
1136 if (addr_cache
!= NULL
) {
1138 * We have finished a 3-way handshake with this
1139 * remote host. This proves the IP addr is good.
1142 addr_cache
[IP_ADDR_CACHE_HASH(tcp
->tcp_connp
->conn_faddr_v4
)] =
1143 tcp
->tcp_connp
->conn_faddr_v4
;
1145 mutex_exit(&listener
->tcp_eager_lock
);
1148 * Notify the ULP about the newconn. It is guaranteed that no
1149 * tcp_accept() call will be made for the eager if the
1150 * notification fails.
1152 if ((upper
= (*lconnp
->conn_upcalls
->su_newconn
)
1153 (lconnp
->conn_upper_handle
, (sock_lower_handle_t
)econnp
,
1154 &sock_tcp_downcalls
, ira
->ira_cred
, ira
->ira_cpid
,
1155 &econnp
->conn_upcalls
)) == NULL
) {
1158 econnp
->conn_upper_handle
= upper
;
1160 tcp
->tcp_detached
= B_FALSE
;
1161 tcp
->tcp_hard_binding
= B_FALSE
;
1162 tcp
->tcp_tconnind_started
= B_TRUE
;
1164 if (econnp
->conn_keepalive
) {
1165 tcp
->tcp_ka_last_intrvl
= 0;
1166 tcp
->tcp_ka_tid
= TCP_TIMER(tcp
, tcp_keepalive_timer
,
1167 tcp
->tcp_ka_interval
);
1170 /* Update the necessary parameters */
1171 tcp_get_proto_props(tcp
, &sopp
);
1173 (*econnp
->conn_upcalls
->su_set_proto_props
)
1174 (econnp
->conn_upper_handle
, &sopp
);