4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
26 #include <sys/types.h>
27 #include <sys/stream.h>
28 #define _SUN_TPI_VERSION 2
29 #include <sys/tihdr.h>
30 #include <sys/socket.h>
31 #include <sys/xti_xtiopt.h>
32 #include <sys/xti_inet.h>
33 #include <sys/policy.h>
35 #include <inet/common.h>
36 #include <netinet/ip6.h>
39 #include <netinet/in.h>
40 #include <netinet/tcp.h>
41 #include <inet/optcom.h>
42 #include <inet/proto_set.h>
43 #include <inet/tcp_impl.h>
45 static int tcp_opt_default(queue_t
*, int, int, uchar_t
*);
48 * Table of all known options handled on a TCP protocol stack.
50 * Note: This table contains options processed by both TCP and IP levels
51 * and is the superset of options that can be performed on a TCP over IP stack.
54 opdes_t tcp_opt_arr
[] = {
56 { SO_LINGER
, SOL_SOCKET
, OA_RW
, OA_RW
, OP_NP
, 0,
57 sizeof (struct linger
), 0 },
59 { SO_DEBUG
, SOL_SOCKET
, OA_RW
, OA_RW
, OP_NP
, 0, sizeof (int), 0 },
60 { SO_KEEPALIVE
, SOL_SOCKET
, OA_RW
, OA_RW
, OP_NP
, 0, sizeof (int), 0 },
61 { SO_DONTROUTE
, SOL_SOCKET
, OA_RW
, OA_RW
, OP_NP
, 0, sizeof (int), 0 },
62 { SO_USELOOPBACK
, SOL_SOCKET
, OA_RW
, OA_RW
, OP_NP
, 0, sizeof (int), 0
64 { SO_BROADCAST
, SOL_SOCKET
, OA_RW
, OA_RW
, OP_NP
, 0, sizeof (int), 0 },
65 { SO_REUSEADDR
, SOL_SOCKET
, OA_RW
, OA_RW
, OP_NP
, 0, sizeof (int), 0 },
66 { SO_OOBINLINE
, SOL_SOCKET
, OA_RW
, OA_RW
, OP_NP
, 0, sizeof (int), 0 },
67 { SO_TYPE
, SOL_SOCKET
, OA_R
, OA_R
, OP_NP
, 0, sizeof (int), 0 },
68 { SO_SNDBUF
, SOL_SOCKET
, OA_RW
, OA_RW
, OP_NP
, 0, sizeof (int), 0 },
69 { SO_RCVBUF
, SOL_SOCKET
, OA_RW
, OA_RW
, OP_NP
, 0, sizeof (int), 0 },
70 { SO_SNDTIMEO
, SOL_SOCKET
, OA_RW
, OA_RW
, OP_NP
, 0,
71 sizeof (struct timeval
), 0 },
72 { SO_RCVTIMEO
, SOL_SOCKET
, OA_RW
, OA_RW
, OP_NP
, 0,
73 sizeof (struct timeval
), 0 },
74 { SO_DGRAM_ERRIND
, SOL_SOCKET
, OA_RW
, OA_RW
, OP_NP
, 0, sizeof (int), 0
76 { SO_SND_COPYAVOID
, SOL_SOCKET
, OA_RW
, OA_RW
, OP_NP
, 0, sizeof (int), 0 },
77 { SO_ALLZONES
, SOL_SOCKET
, OA_R
, OA_RW
, OP_CONFIG
, 0, sizeof (int),
79 { SO_EXCLBIND
, SOL_SOCKET
, OA_RW
, OA_RW
, OP_NP
, 0, sizeof (int), 0 },
81 { SO_DOMAIN
, SOL_SOCKET
, OA_R
, OA_R
, OP_NP
, 0, sizeof (int), 0 },
83 { SO_PROTOTYPE
, SOL_SOCKET
, OA_R
, OA_R
, OP_NP
, 0, sizeof (int), 0 },
85 { TCP_NODELAY
, IPPROTO_TCP
, OA_RW
, OA_RW
, OP_NP
, 0, sizeof (int), 0
87 { TCP_MAXSEG
, IPPROTO_TCP
, OA_R
, OA_R
, OP_NP
, 0, sizeof (uint_t
),
90 { TCP_NOTIFY_THRESHOLD
, IPPROTO_TCP
, OA_RW
, OA_RW
, OP_NP
,
91 OP_DEF_FN
, sizeof (int), -1 /* not initialized */ },
93 { TCP_ABORT_THRESHOLD
, IPPROTO_TCP
, OA_RW
, OA_RW
, OP_NP
,
94 OP_DEF_FN
, sizeof (int), -1 /* not initialized */ },
96 { TCP_CONN_NOTIFY_THRESHOLD
, IPPROTO_TCP
, OA_RW
, OA_RW
, OP_NP
,
97 OP_DEF_FN
, sizeof (int), -1 /* not initialized */ },
99 { TCP_CONN_ABORT_THRESHOLD
, IPPROTO_TCP
, OA_RW
, OA_RW
, OP_NP
,
100 OP_DEF_FN
, sizeof (int), -1 /* not initialized */ },
102 { TCP_RECVDSTADDR
, IPPROTO_TCP
, OA_RW
, OA_RW
, OP_NP
, 0, sizeof (int),
105 { TCP_ANONPRIVBIND
, IPPROTO_TCP
, OA_R
, OA_RW
, OP_PRIVPORT
, 0,
108 { TCP_EXCLBIND
, IPPROTO_TCP
, OA_RW
, OA_RW
, OP_NP
, 0, sizeof (int), 0
111 { TCP_INIT_CWND
, IPPROTO_TCP
, OA_RW
, OA_RW
, OP_CONFIG
, 0,
114 { TCP_KEEPALIVE_THRESHOLD
, IPPROTO_TCP
, OA_RW
, OA_RW
, OP_NP
, 0,
117 { TCP_KEEPIDLE
, IPPROTO_TCP
, OA_RW
, OA_RW
, OP_NP
, 0, sizeof (int), 0 },
119 { TCP_KEEPCNT
, IPPROTO_TCP
, OA_RW
, OA_RW
, OP_NP
, 0, sizeof (int), 0 },
121 { TCP_KEEPINTVL
, IPPROTO_TCP
, OA_RW
, OA_RW
, OP_NP
, 0, sizeof (int), 0 },
123 { TCP_KEEPALIVE_ABORT_THRESHOLD
, IPPROTO_TCP
, OA_RW
, OA_RW
, OP_NP
, 0,
126 { TCP_CORK
, IPPROTO_TCP
, OA_RW
, OA_RW
, OP_NP
, 0, sizeof (int), 0 },
128 { TCP_RTO_INITIAL
, IPPROTO_TCP
, OA_RW
, OA_RW
, OP_NP
, 0, sizeof (uint32_t), 0 },
130 { TCP_RTO_MIN
, IPPROTO_TCP
, OA_RW
, OA_RW
, OP_NP
, 0, sizeof (uint32_t), 0 },
132 { TCP_RTO_MAX
, IPPROTO_TCP
, OA_RW
, OA_RW
, OP_NP
, 0, sizeof (uint32_t), 0 },
134 { TCP_LINGER2
, IPPROTO_TCP
, OA_RW
, OA_RW
, OP_NP
, 0, sizeof (int), 0 },
136 { IP_OPTIONS
, IPPROTO_IP
, OA_RW
, OA_RW
, OP_NP
,
137 (OP_VARLEN
|OP_NODEFAULT
),
138 IP_MAX_OPT_LENGTH
+ IP_ADDR_LEN
, -1 /* not initialized */ },
139 { T_IP_OPTIONS
, IPPROTO_IP
, OA_RW
, OA_RW
, OP_NP
,
140 (OP_VARLEN
|OP_NODEFAULT
),
141 IP_MAX_OPT_LENGTH
+ IP_ADDR_LEN
, -1 /* not initialized */ },
143 { IP_TOS
, IPPROTO_IP
, OA_RW
, OA_RW
, OP_NP
, 0, sizeof (int), 0 },
144 { T_IP_TOS
, IPPROTO_IP
, OA_RW
, OA_RW
, OP_NP
, 0, sizeof (int), 0 },
145 { IP_TTL
, IPPROTO_IP
, OA_RW
, OA_RW
, OP_NP
, OP_DEF_FN
,
146 sizeof (int), -1 /* not initialized */ },
148 { IP_SEC_OPT
, IPPROTO_IP
, OA_RW
, OA_RW
, OP_NP
, OP_NODEFAULT
,
149 sizeof (ipsec_req_t
), -1 /* not initialized */ },
151 { IP_BOUND_IF
, IPPROTO_IP
, OA_RW
, OA_RW
, OP_NP
, 0,
152 sizeof (int), 0 /* no ifindex */ },
154 { IP_UNSPEC_SRC
, IPPROTO_IP
, OA_R
, OA_RW
, OP_RAW
, 0,
157 { IPV6_UNICAST_HOPS
, IPPROTO_IPV6
, OA_RW
, OA_RW
, OP_NP
, OP_DEF_FN
,
158 sizeof (int), -1 /* not initialized */ },
160 { IPV6_BOUND_IF
, IPPROTO_IPV6
, OA_RW
, OA_RW
, OP_NP
, 0,
161 sizeof (int), 0 /* no ifindex */ },
163 { IP_DONTFRAG
, IPPROTO_IP
, OA_RW
, OA_RW
, OP_NP
, 0, sizeof (int), 0 },
165 { IP_NEXTHOP
, IPPROTO_IP
, OA_R
, OA_RW
, OP_CONFIG
, 0,
166 sizeof (in_addr_t
), -1 /* not initialized */ },
168 { IPV6_UNSPEC_SRC
, IPPROTO_IPV6
, OA_R
, OA_RW
, OP_RAW
, 0,
171 { IPV6_PKTINFO
, IPPROTO_IPV6
, OA_RW
, OA_RW
, OP_NP
,
172 (OP_NODEFAULT
|OP_VARLEN
),
173 sizeof (struct in6_pktinfo
), -1 /* not initialized */ },
174 { IPV6_NEXTHOP
, IPPROTO_IPV6
, OA_RW
, OA_RW
, OP_NP
,
176 sizeof (sin6_t
), -1 /* not initialized */ },
177 { IPV6_HOPOPTS
, IPPROTO_IPV6
, OA_RW
, OA_RW
, OP_NP
,
178 (OP_VARLEN
|OP_NODEFAULT
), 255*8,
179 -1 /* not initialized */ },
180 { IPV6_DSTOPTS
, IPPROTO_IPV6
, OA_RW
, OA_RW
, OP_NP
,
181 (OP_VARLEN
|OP_NODEFAULT
), 255*8,
182 -1 /* not initialized */ },
183 { IPV6_RTHDRDSTOPTS
, IPPROTO_IPV6
, OA_RW
, OA_RW
, OP_NP
,
184 (OP_VARLEN
|OP_NODEFAULT
), 255*8,
185 -1 /* not initialized */ },
186 { IPV6_RTHDR
, IPPROTO_IPV6
, OA_RW
, OA_RW
, OP_NP
,
187 (OP_VARLEN
|OP_NODEFAULT
), 255*8,
188 -1 /* not initialized */ },
189 { IPV6_TCLASS
, IPPROTO_IPV6
, OA_RW
, OA_RW
, OP_NP
,
191 sizeof (int), -1 /* not initialized */ },
192 { IPV6_PATHMTU
, IPPROTO_IPV6
, OA_RW
, OA_RW
, OP_NP
,
194 sizeof (struct ip6_mtuinfo
), -1 /* not initialized */ },
195 { IPV6_DONTFRAG
, IPPROTO_IPV6
, OA_RW
, OA_RW
, OP_NP
, 0,
197 { IPV6_USE_MIN_MTU
, IPPROTO_IPV6
, OA_RW
, OA_RW
, OP_NP
, 0,
199 { IPV6_V6ONLY
, IPPROTO_IPV6
, OA_RW
, OA_RW
, OP_NP
, 0,
202 /* Enable receipt of ancillary data */
203 { IPV6_RECVPKTINFO
, IPPROTO_IPV6
, OA_RW
, OA_RW
, OP_NP
, 0,
205 { IPV6_RECVHOPLIMIT
, IPPROTO_IPV6
, OA_RW
, OA_RW
, OP_NP
, 0,
207 { IPV6_RECVHOPOPTS
, IPPROTO_IPV6
, OA_RW
, OA_RW
, OP_NP
, 0,
209 { _OLD_IPV6_RECVDSTOPTS
, IPPROTO_IPV6
, OA_RW
, OA_RW
, OP_NP
, 0,
211 { IPV6_RECVDSTOPTS
, IPPROTO_IPV6
, OA_RW
, OA_RW
, OP_NP
, 0,
213 { IPV6_RECVRTHDR
, IPPROTO_IPV6
, OA_RW
, OA_RW
, OP_NP
, 0,
215 { IPV6_RECVRTHDRDSTOPTS
, IPPROTO_IPV6
, OA_RW
, OA_RW
, OP_NP
, 0,
217 { IPV6_RECVTCLASS
, IPPROTO_IPV6
, OA_RW
, OA_RW
, OP_NP
, 0,
220 { IPV6_SEC_OPT
, IPPROTO_IPV6
, OA_RW
, OA_RW
, OP_NP
, OP_NODEFAULT
,
221 sizeof (ipsec_req_t
), -1 /* not initialized */ },
222 { IPV6_SRC_PREFERENCES
, IPPROTO_IPV6
, OA_RW
, OA_RW
, OP_NP
, 0,
223 sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT
},
227 * Table of all supported levels
228 * Note: Some levels (e.g. XTI_GENERIC) may be valid but may not have
229 * any supported options so we need this info separately.
231 * This is needed only for topmost tpi providers and is used only by XTI interfaces.
234 optlevel_t tcp_valid_levels_arr
[] = {
/*
 * Entry counts for tcp_opt_arr and tcp_valid_levels_arr; both values are
 * stored in tcp_opt_obj.  A_CNT() is defined elsewhere (presumably the
 * usual array-element-count macro -- confirm against its header).
 */
#define	TCP_OPT_ARR_CNT		A_CNT(tcp_opt_arr)
#define	TCP_VALID_LEVELS_CNT	A_CNT(tcp_valid_levels_arr)
246 uint_t tcp_max_optsize
; /* initialized when TCP driver is loaded */
249 * Initialize option database object for TCP
251 * This object represents database of options to search passed to
252 * {sock,tpi}optcom_req() interface routine to take care of option
253 * management and associated methods.
256 optdb_obj_t tcp_opt_obj
= {
257 tcp_opt_default
, /* TCP default value function pointer */
258 tcp_tpi_opt_get
, /* TCP get function pointer */
259 tcp_tpi_opt_set
, /* TCP set function pointer */
260 TCP_OPT_ARR_CNT
, /* TCP option database count of entries */
261 tcp_opt_arr
, /* TCP option database */
262 TCP_VALID_LEVELS_CNT
, /* TCP valid level count of entries */
263 tcp_valid_levels_arr
/* TCP valid level array */
266 static int tcp_max_init_cwnd
= TCP_MAX_INIT_CWND
;
269 * Some TCP options can be "set" by requesting them in the option
270 * buffer. This is needed for XTI feature test though we do not
271 * allow it in general. We interpret that this mechanism is more
272 * applicable to OSI protocols and need not be allowed in general.
273 * This routine filters out options for which it is not allowed (most)
274 * and lets through those (few) for which it is. [ The XTI interface
275 * test suite specifics will imply that any XTI_GENERIC level XTI_* if
276 * ever implemented will have to be allowed here ].
279 tcp_allow_connopt_set(int level
, int name
)
298 * This routine gets default values of certain options whose default
299 * values are maintained by protocol specific code
303 tcp_opt_default(queue_t
*q
, int level
, int name
, uchar_t
*ptr
)
305 int32_t *i1
= (int32_t *)ptr
;
306 tcp_stack_t
*tcps
= Q_TO_TCP(q
)->tcp_tcps
;
311 case TCP_NOTIFY_THRESHOLD
:
312 *i1
= tcps
->tcps_ip_notify_interval
;
314 case TCP_ABORT_THRESHOLD
:
315 *i1
= tcps
->tcps_ip_abort_interval
;
317 case TCP_CONN_NOTIFY_THRESHOLD
:
318 *i1
= tcps
->tcps_ip_notify_cinterval
;
320 case TCP_CONN_ABORT_THRESHOLD
:
321 *i1
= tcps
->tcps_ip_abort_cinterval
;
330 *i1
= tcps
->tcps_ipv4_ttl
;
338 case IPV6_UNICAST_HOPS
:
339 *i1
= tcps
->tcps_ipv6_hoplimit
;
348 return (sizeof (int));
352 * TCP routine to get the values of options.
355 tcp_opt_get(conn_t
*connp
, int level
, int name
, uchar_t
*ptr
)
357 int *i1
= (int *)ptr
;
358 tcp_t
*tcp
= connp
->conn_tcp
;
362 coas
.coa_connp
= connp
;
363 coas
.coa_ixa
= connp
->conn_ixa
;
364 coas
.coa_ipp
= &connp
->conn_xmit_ipp
;
365 coas
.coa_ancillary
= B_FALSE
;
366 coas
.coa_changed
= 0;
371 case SO_SND_COPYAVOID
:
372 *i1
= tcp
->tcp_snd_zcopy_on
?
373 SO_SND_COPYAVOID
: 0;
374 return (sizeof (int));
376 *i1
= (tcp
->tcp_state
== TCPS_LISTEN
);
377 return (sizeof (int));
383 *i1
= (tcp
->tcp_naglim
== 1) ? TCP_NODELAY
: 0;
384 return (sizeof (int));
387 return (sizeof (int));
388 case TCP_NOTIFY_THRESHOLD
:
389 *i1
= (int)tcp
->tcp_first_timer_threshold
;
390 return (sizeof (int));
391 case TCP_ABORT_THRESHOLD
:
392 *i1
= tcp
->tcp_second_timer_threshold
;
393 return (sizeof (int));
394 case TCP_CONN_NOTIFY_THRESHOLD
:
395 *i1
= tcp
->tcp_first_ctimer_threshold
;
396 return (sizeof (int));
397 case TCP_CONN_ABORT_THRESHOLD
:
398 *i1
= tcp
->tcp_second_ctimer_threshold
;
399 return (sizeof (int));
401 *i1
= tcp
->tcp_init_cwnd
;
402 return (sizeof (int));
403 case TCP_KEEPALIVE_THRESHOLD
:
404 *i1
= tcp
->tcp_ka_interval
;
405 return (sizeof (int));
408 * TCP_KEEPIDLE expects value in seconds, but
409 * tcp_ka_interval is in milliseconds.
412 *i1
= tcp
->tcp_ka_interval
/ 1000;
413 return (sizeof (int));
415 *i1
= tcp
->tcp_ka_cnt
;
416 return (sizeof (int));
419 * TCP_KEEPINTVL expects value in seconds, but
420 * tcp_ka_rinterval is in milliseconds.
423 *i1
= tcp
->tcp_ka_rinterval
/ 1000;
424 return (sizeof (int));
425 case TCP_KEEPALIVE_ABORT_THRESHOLD
:
426 *i1
= tcp
->tcp_ka_abort_thres
;
427 return (sizeof (int));
430 return (sizeof (int));
431 case TCP_RTO_INITIAL
:
432 *i1
= tcp
->tcp_rto_initial
;
433 return (sizeof (uint32_t));
435 *i1
= tcp
->tcp_rto_min
;
436 return (sizeof (uint32_t));
438 *i1
= tcp
->tcp_rto_max
;
439 return (sizeof (uint32_t));
441 *i1
= tcp
->tcp_fin_wait_2_flush_interval
/ SECONDS
;
442 return (sizeof (int));
446 if (connp
->conn_family
!= AF_INET
)
451 /* Caller ensures enough space */
452 return (ip_opt_get_user(connp
, ptr
));
460 * IPPROTO_IPV6 options are only supported for sockets
461 * that are using IPv6 on the wire.
463 if (connp
->conn_ipversion
!= IPV6_VERSION
) {
468 if (tcp
->tcp_state
< TCPS_ESTABLISHED
)
474 mutex_enter(&connp
->conn_lock
);
475 retval
= conn_opt_get(&coas
, level
, name
, ptr
);
476 mutex_exit(&connp
->conn_lock
);
481 * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements.
482 * Parameters are assumed to be verified by the caller.
486 tcp_opt_set(conn_t
*connp
, uint_t optset_context
, int level
, int name
,
487 uint_t inlen
, uchar_t
*invalp
, uint_t
*outlenp
, uchar_t
*outvalp
,
488 void *thisdg_attrs
, cred_t
*cr
)
490 tcp_t
*tcp
= connp
->conn_tcp
;
491 int *i1
= (int *)invalp
;
492 boolean_t onoff
= (*i1
== 0) ? 0 : 1;
495 tcp_stack_t
*tcps
= tcp
->tcp_tcps
;
497 uint32_t val
= *((uint32_t *)invalp
);
499 coas
.coa_connp
= connp
;
500 coas
.coa_ixa
= connp
->conn_ixa
;
501 coas
.coa_ipp
= &connp
->conn_xmit_ipp
;
502 coas
.coa_ancillary
= B_FALSE
;
503 coas
.coa_changed
= 0;
505 switch (optset_context
) {
506 case SETFN_OPTCOM_CHECKONLY
:
509 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
510 * inlen != 0 implies value supplied and
511 * we have to "pretend" to set it.
512 * inlen == 0 implies that there is no
513 * value part in T_CHECK request and just validation
514 * done elsewhere should be enough, we just return here.
521 case SETFN_OPTCOM_NEGOTIATE
:
524 case SETFN_UD_NEGOTIATE
: /* error on conn-oriented transports ? */
525 case SETFN_CONN_NEGOTIATE
:
528 * Negotiating local and "association-related" options
529 * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ)
530 * primitives is allowed by XTI, but we choose
531 * to not implement this style negotiation for Internet
532 * protocols (We interpret it is a must for OSI world but
533 * optional for Internet protocols) for all options.
534 * [ Will do only for the few options that enable test
535 * suites that our XTI implementation of this feature
536 * works for transports that do allow it ]
538 if (!tcp_allow_connopt_set(level
, name
)) {
545 * We should never get here
551 ASSERT((optset_context
!= SETFN_OPTCOM_CHECKONLY
) ||
552 (optset_context
== SETFN_OPTCOM_CHECKONLY
&& inlen
!= 0));
555 * For TCP, we should have no ancillary data sent down
556 * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs should be NULL.
559 ASSERT(thisdg_attrs
== NULL
);
562 * For fixed length options, no sanity check
563 * of passed in length is done. It is assumed *_optcom_req()
564 * routines do the right thing.
571 /* check only case */
576 if (connp
->conn_keepalive
) {
577 if (tcp
->tcp_ka_tid
!= 0) {
578 (void) TCP_TIMER_CANCEL(tcp
,
582 connp
->conn_keepalive
= 0;
586 if (!connp
->conn_keepalive
) {
587 /* Crank up the keepalive timer */
588 tcp
->tcp_ka_last_intrvl
= 0;
589 tcp
->tcp_ka_tid
= TCP_TIMER(tcp
,
590 tcp_keepalive_timer
, tcp
->tcp_ka_interval
);
591 connp
->conn_keepalive
= 1;
595 if (*i1
> tcps
->tcps_max_buf
) {
602 connp
->conn_sndbuf
= *i1
;
603 if (tcps
->tcps_snd_lowat_fraction
!= 0) {
604 connp
->conn_sndlowat
= connp
->conn_sndbuf
/
605 tcps
->tcps_snd_lowat_fraction
;
607 (void) tcp_maxpsz_set(tcp
, B_TRUE
);
609 * If we are flow-controlled, recheck the condition.
610 * There are apps that increase SO_SNDBUF size when
611 * flow-controlled (EWOULDBLOCK), and expect the flow
612 * control condition to be lifted right away.
614 mutex_enter(&tcp
->tcp_non_sq_lock
);
615 if (tcp
->tcp_flow_stopped
&&
616 TCP_UNSENT_BYTES(tcp
) < connp
->conn_sndbuf
) {
619 mutex_exit(&tcp
->tcp_non_sq_lock
);
624 if (*i1
> tcps
->tcps_max_buf
) {
628 /* Silently ignore zero */
629 if (!checkonly
&& *i1
!= 0) {
630 *i1
= MSS_ROUNDUP(*i1
, tcp
->tcp_mss
);
631 (void) tcp_rwnd_set(tcp
, *i1
);
634 * XXX should we return the rwnd here
639 case SO_SND_COPYAVOID
:
641 if (tcp
->tcp_loopback
||
642 (onoff
!= 1) || !tcp_zcopy_check(tcp
)) {
646 tcp
->tcp_snd_zcopy_aware
= 1;
656 tcp
->tcp_naglim
= *i1
? 1 : tcp
->tcp_mss
;
658 case TCP_NOTIFY_THRESHOLD
:
660 tcp
->tcp_first_timer_threshold
= *i1
;
662 case TCP_ABORT_THRESHOLD
:
664 tcp
->tcp_second_timer_threshold
= *i1
;
666 case TCP_CONN_NOTIFY_THRESHOLD
:
668 tcp
->tcp_first_ctimer_threshold
= *i1
;
670 case TCP_CONN_ABORT_THRESHOLD
:
672 tcp
->tcp_second_ctimer_threshold
= *i1
;
674 case TCP_RECVDSTADDR
:
675 if (tcp
->tcp_state
> TCPS_LISTEN
) {
679 /* Setting done in conn_opt_set */
686 * Only allow socket with network configuration
687 * privilege to set the initial cwnd to be larger
688 * than allowed by RFC 3390.
690 if (val
> MIN(4, MAX(2, 4380 / tcp
->tcp_mss
))) {
691 if ((reterr
= secpolicy_ip_config(cr
, B_TRUE
))
696 if (val
> tcp_max_init_cwnd
) {
702 tcp
->tcp_init_cwnd
= val
;
705 * If the socket is connected, AND no outbound data
706 * has been sent, reset the actual cwnd values.
708 if (tcp
->tcp_state
== TCPS_ESTABLISHED
&&
709 tcp
->tcp_iss
== tcp
->tcp_snxt
- 1) {
711 MIN(tcp
->tcp_rwnd
, val
* tcp
->tcp_mss
);
716 * TCP_KEEPIDLE is in seconds but TCP_KEEPALIVE_THRESHOLD
717 * is in milliseconds. TCP_KEEPIDLE is introduced for
718 * compatibility with other Unix flavors.
719 * We can fall through TCP_KEEPALIVE_THRESHOLD logic after
720 * converting the input to milliseconds.
726 case TCP_KEEPALIVE_THRESHOLD
:
730 if (*i1
< tcps
->tcps_keepalive_interval_low
||
731 *i1
> tcps
->tcps_keepalive_interval_high
) {
735 if (*i1
!= tcp
->tcp_ka_interval
) {
736 tcp
->tcp_ka_interval
= *i1
;
738 * Check if we need to restart the keepalive timer.
741 if (tcp
->tcp_ka_tid
!= 0) {
742 ASSERT(connp
->conn_keepalive
);
743 (void) TCP_TIMER_CANCEL(tcp
,
745 tcp
->tcp_ka_last_intrvl
= 0;
746 tcp
->tcp_ka_tid
= TCP_TIMER(tcp
,
748 tcp
->tcp_ka_interval
);
754 * tcp_ka_abort_thres = tcp_ka_rinterval * tcp_ka_cnt.
755 * So setting TCP_KEEPCNT or TCP_KEEPINTVL can affect all the
756 * three members - tcp_ka_abort_thres, tcp_ka_rinterval and tcp_ka_cnt.
765 } else if (tcp
->tcp_ka_rinterval
== 0) {
767 * When TCP_KEEPCNT is specified without first
768 * specifying a TCP_KEEPINTVL, we infer an
769 * interval based on a tunable specific to our
770 * stack: the tcp_keepalive_abort_interval.
771 * (Or the TCP_KEEPALIVE_ABORT_THRESHOLD, in
772 * the unlikely event that that has been set.)
773 * Given the abort interval's default value of
774 * 480 seconds, low TCP_KEEPCNT values can
775 * result in intervals that exceed the default
776 * maximum RTO of 60 seconds. Rather than
777 * fail in these cases, we (implicitly) clamp
778 * the interval at the maximum RTO; if the
779 * TCP_KEEPCNT is shortly followed by a
780 * TCP_KEEPINTVL (as we expect), the abort
781 * threshold will be recalculated correctly --
782 * and if a TCP_KEEPINTVL is not forthcoming,
783 * keep-alive will at least operate reasonably
784 * given the underconfigured state.
788 interval
= tcp
->tcp_ka_abort_thres
/ *i1
;
790 if (interval
< tcp
->tcp_rto_min
)
791 interval
= tcp
->tcp_rto_min
;
793 if (interval
> tcp
->tcp_rto_max
)
794 interval
= tcp
->tcp_rto_max
;
796 tcp
->tcp_ka_rinterval
= interval
;
798 if ((*i1
* tcp
->tcp_ka_rinterval
) <
799 tcps
->tcps_keepalive_abort_interval_low
||
800 (*i1
* tcp
->tcp_ka_rinterval
) >
801 tcps
->tcps_keepalive_abort_interval_high
)
803 tcp
->tcp_ka_abort_thres
=
804 (*i1
* tcp
->tcp_ka_rinterval
);
806 tcp
->tcp_ka_cnt
= *i1
;
810 * TCP_KEEPINTVL is specified in seconds, but
811 * tcp_ka_rinterval is in milliseconds.
817 if ((*i1
* 1000) < tcp
->tcp_rto_min
||
818 (*i1
* 1000) > tcp
->tcp_rto_max
)
821 if (tcp
->tcp_ka_cnt
== 0) {
823 tcp
->tcp_ka_abort_thres
/ (*i1
* 1000);
825 if ((*i1
* tcp
->tcp_ka_cnt
* 1000) <
826 tcps
->tcps_keepalive_abort_interval_low
||
827 (*i1
* tcp
->tcp_ka_cnt
* 1000) >
828 tcps
->tcps_keepalive_abort_interval_high
)
830 tcp
->tcp_ka_abort_thres
=
831 (*i1
* tcp
->tcp_ka_cnt
* 1000);
833 tcp
->tcp_ka_rinterval
= *i1
* 1000;
835 case TCP_KEEPALIVE_ABORT_THRESHOLD
:
838 tcps
->tcps_keepalive_abort_interval_low
||
840 tcps
->tcps_keepalive_abort_interval_high
) {
844 tcp
->tcp_ka_abort_thres
= *i1
;
846 tcp
->tcp_ka_rinterval
= 0;
852 * if tcp->tcp_cork was set and is now
853 * being unset, we have to make sure that
854 * the remaining data gets sent out. Also
855 * unset tcp->tcp_cork so that tcp_wput_data()
856 * can send data even if it is less than mss
858 if (tcp
->tcp_cork
&& onoff
== 0 &&
859 tcp
->tcp_unsent
> 0) {
860 tcp
->tcp_cork
= B_FALSE
;
861 tcp_wput_data(tcp
, NULL
, B_FALSE
);
863 tcp
->tcp_cork
= onoff
;
866 case TCP_RTO_INITIAL
: {
869 if (checkonly
|| val
== 0)
875 * The initial RTO should be bounded by the minimum
876 * and maximum RTO. And it should also be smaller
877 * than the connect attempt abort timeout. Otherwise,
878 * the connection won't be aborted in a period
879 * reasonably close to that timeout.
881 if (val
< tcp
->tcp_rto_min
|| val
> tcp
->tcp_rto_max
||
882 val
> tcp
->tcp_second_ctimer_threshold
||
883 val
< tcps
->tcps_rexmit_interval_initial_low
||
884 val
> tcps
->tcps_rexmit_interval_initial_high
) {
888 tcp
->tcp_rto_initial
= val
;
891 * If TCP has not sent anything, need to re-calculate
892 * tcp_rto. Otherwise, this option change does not
893 * really affect anything.
895 if (tcp
->tcp_state
>= TCPS_SYN_SENT
)
898 tcp
->tcp_rtt_sa
= tcp
->tcp_rto_initial
<< 2;
899 tcp
->tcp_rtt_sd
= tcp
->tcp_rto_initial
>> 1;
900 rto
= (tcp
->tcp_rtt_sa
>> 3) + tcp
->tcp_rtt_sd
+
901 tcps
->tcps_rexmit_interval_extra
+
902 (tcp
->tcp_rtt_sa
>> 5) +
903 tcps
->tcps_conn_grace_period
;
904 TCP_SET_RTO(tcp
, rto
);
908 if (checkonly
|| val
== 0)
911 if (val
< tcps
->tcps_rexmit_interval_min_low
||
912 val
> tcps
->tcps_rexmit_interval_min_high
||
913 val
> tcp
->tcp_rto_max
) {
917 tcp
->tcp_rto_min
= val
;
918 if (tcp
->tcp_rto
< val
)
922 if (checkonly
|| val
== 0)
928 * The maximum RTO should not be larger than the
929 * connection abort timeout. Otherwise, the
930 * connection won't be aborted in a period reasonably
931 * close to that timeout.
933 if (val
< tcps
->tcps_rexmit_interval_max_low
||
934 val
> tcps
->tcps_rexmit_interval_max_high
||
935 val
< tcp
->tcp_rto_min
||
936 val
> tcp
->tcp_second_timer_threshold
) {
940 tcp
->tcp_rto_max
= val
;
941 if (tcp
->tcp_rto
> val
)
945 if (checkonly
|| *i1
== 0)
949 * Note that the option value's unit is second. And
950 * the value should be bigger than the private
951 * parameter tcp_fin_wait_2_flush_interval's lower
952 * bound and smaller than the current value of that
953 * parameter. It should be smaller than the current
954 * value to avoid an app setting TCP_LINGER2 to a big
955 * value, causing resource to be held up too long in FIN-WAIT-2 state.
959 tcps
->tcps_fin_wait_2_flush_interval_low
/SECONDS
>
961 tcps
->tcps_fin_wait_2_flush_interval
/SECONDS
<
966 tcp
->tcp_fin_wait_2_flush_interval
= *i1
* SECONDS
;
973 if (connp
->conn_family
!= AF_INET
) {
980 * We should not allow policy setting after
981 * we start listening for connections.
983 if (tcp
->tcp_state
== TCPS_LISTEN
) {
991 * IPPROTO_IPV6 options are only supported for sockets
992 * that are using IPv6 on the wire.
994 if (connp
->conn_ipversion
!= IPV6_VERSION
) {
1000 case IPV6_RECVPKTINFO
:
1002 /* Force it to be sent up with the next msg */
1003 tcp
->tcp_recvifindex
= 0;
1006 case IPV6_RECVTCLASS
:
1008 /* Force it to be sent up with the next msg */
1009 tcp
->tcp_recvtclass
= 0xffffffffU
;
1012 case IPV6_RECVHOPLIMIT
:
1014 /* Force it to be sent up with the next msg */
1015 tcp
->tcp_recvhops
= 0xffffffffU
;
1019 /* This is an extra check for TCP */
1020 if (inlen
== sizeof (struct in6_pktinfo
)) {
1021 struct in6_pktinfo
*pkti
;
1023 pkti
= (struct in6_pktinfo
*)invalp
;
1025 * RFC 3542 states that ipi6_addr must be
1026 * the unspecified address when setting the
1027 * IPV6_PKTINFO sticky socket option on a TCP socket.
1030 if (!IN6_IS_ADDR_UNSPECIFIED(&pkti
->ipi6_addr
))
1036 * We should not allow policy setting after
1037 * we start listening for connections.
1039 if (tcp
->tcp_state
== TCPS_LISTEN
) {
1046 reterr
= conn_opt_set(&coas
, level
, name
, inlen
, invalp
,
1054 * Common case of OK return with outval same as inval
1056 if (invalp
!= outvalp
) {
1057 /* don't trust bcopy for identical src/dst */
1058 (void) bcopy(invalp
, outvalp
, inlen
);
1062 if (coas
.coa_changed
& COA_HEADER_CHANGED
) {
1063 /* If we are connected we rebuild the headers */
1064 if (!IN6_IS_ADDR_UNSPECIFIED(&connp
->conn_faddr_v6
) &&
1065 !IN6_IS_ADDR_V4MAPPED_ANY(&connp
->conn_faddr_v6
)) {
1066 reterr
= tcp_build_hdrs(tcp
);
1071 if (coas
.coa_changed
& COA_ROUTE_CHANGED
) {
1075 * If we are connected we re-cache the information.
1076 * We ignore errors to preserve BSD behavior.
1077 * Note that we don't redo IPsec policy lookup here
1078 * since the final destination (or source) didn't change.
1080 ip_attr_nexthop(&connp
->conn_xmit_ipp
, connp
->conn_ixa
,
1081 &connp
->conn_faddr_v6
, &nexthop
);
1083 if (!IN6_IS_ADDR_UNSPECIFIED(&connp
->conn_faddr_v6
) &&
1084 !IN6_IS_ADDR_V4MAPPED_ANY(&connp
->conn_faddr_v6
)) {
1085 (void) ip_attr_connect(connp
, connp
->conn_ixa
,
1086 &connp
->conn_laddr_v6
, &connp
->conn_faddr_v6
,
1087 &nexthop
, connp
->conn_fport
, NULL
, NULL
,
1091 if ((coas
.coa_changed
& COA_SNDBUF_CHANGED
) && !IPCL_IS_NONSTR(connp
)) {
1092 connp
->conn_wq
->q_hiwat
= connp
->conn_sndbuf
;
1094 if (coas
.coa_changed
& COA_WROFF_CHANGED
) {
1095 connp
->conn_wroff
= connp
->conn_ht_iphc_allocated
+
1096 tcps
->tcps_wroff_xtra
;
1097 (void) proto_set_tx_wroff(connp
->conn_rq
, connp
,
1100 if (coas
.coa_changed
& COA_OOBINLINE_CHANGED
) {
1101 if (IPCL_IS_NONSTR(connp
))
1102 proto_set_rx_oob_opt(connp
, onoff
);