lib: remove unused libfru & libfrureg
[unleashed.git] / kernel / net / tcp / tcp_opt_data.c
blob2508217d389f887e40fcff92eddaa2c2937d5c6b
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
26 #include <sys/types.h>
27 #include <sys/stream.h>
28 #define _SUN_TPI_VERSION 2
29 #include <sys/tihdr.h>
30 #include <sys/socket.h>
31 #include <sys/xti_xtiopt.h>
32 #include <sys/xti_inet.h>
33 #include <sys/policy.h>
35 #include <inet/common.h>
36 #include <netinet/ip6.h>
37 #include <inet/ip.h>
39 #include <netinet/in.h>
40 #include <netinet/tcp.h>
41 #include <inet/optcom.h>
42 #include <inet/proto_set.h>
43 #include <inet/tcp_impl.h>
45 static int tcp_opt_default(queue_t *, int, int, uchar_t *);
48 * Table of all known options handled on a TCP protocol stack.
50 * Note: This table contains options processed by both TCP and IP levels
51 * and is the superset of options that can be performed on a TCP over IP
52 * stack.
54 opdes_t tcp_opt_arr[] = {
56 { SO_LINGER, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
57 sizeof (struct linger), 0 },
59 { SO_DEBUG, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
60 { SO_KEEPALIVE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
61 { SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
62 { SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
64 { SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
65 { SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
66 { SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
67 { SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
68 { SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
69 { SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
70 { SO_SNDTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
71 sizeof (struct timeval), 0 },
72 { SO_RCVTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
73 sizeof (struct timeval), 0 },
74 { SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
76 { SO_SND_COPYAVOID, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
77 { SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int),
78 0 },
79 { SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
81 { SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
83 { SO_PROTOTYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
85 { TCP_NODELAY, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
87 { TCP_MAXSEG, IPPROTO_TCP, OA_R, OA_R, OP_NP, 0, sizeof (uint_t),
88 536 },
90 { TCP_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
91 OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
93 { TCP_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
94 OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
96 { TCP_CONN_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
97 OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
99 { TCP_CONN_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
100 OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
102 { TCP_RECVDSTADDR, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
103 0 },
105 { TCP_ANONPRIVBIND, IPPROTO_TCP, OA_R, OA_RW, OP_PRIVPORT, 0,
106 sizeof (int), 0 },
108 { TCP_EXCLBIND, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
111 { TCP_INIT_CWND, IPPROTO_TCP, OA_RW, OA_RW, OP_CONFIG, 0,
112 sizeof (int), 0 },
114 { TCP_KEEPALIVE_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
115 sizeof (int), 0 },
117 { TCP_KEEPIDLE, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
119 { TCP_KEEPCNT, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
121 { TCP_KEEPINTVL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
123 { TCP_KEEPALIVE_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
124 sizeof (int), 0 },
126 { TCP_CORK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
128 { TCP_RTO_INITIAL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
130 { TCP_RTO_MIN, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
132 { TCP_RTO_MAX, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
134 { TCP_LINGER2, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
136 { IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
137 (OP_VARLEN|OP_NODEFAULT),
138 IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
139 { T_IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
140 (OP_VARLEN|OP_NODEFAULT),
141 IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
143 { IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
144 { T_IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
145 { IP_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
146 sizeof (int), -1 /* not initialized */ },
148 { IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
149 sizeof (ipsec_req_t), -1 /* not initialized */ },
151 { IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0,
152 sizeof (int), 0 /* no ifindex */ },
154 { IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0,
155 sizeof (int), 0 },
157 { IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
158 sizeof (int), -1 /* not initialized */ },
160 { IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
161 sizeof (int), 0 /* no ifindex */ },
163 { IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
165 { IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0,
166 sizeof (in_addr_t), -1 /* not initialized */ },
168 { IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0,
169 sizeof (int), 0 },
171 { IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
172 (OP_NODEFAULT|OP_VARLEN),
173 sizeof (struct in6_pktinfo), -1 /* not initialized */ },
174 { IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
175 OP_NODEFAULT,
176 sizeof (sin6_t), -1 /* not initialized */ },
177 { IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
178 (OP_VARLEN|OP_NODEFAULT), 255*8,
179 -1 /* not initialized */ },
180 { IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
181 (OP_VARLEN|OP_NODEFAULT), 255*8,
182 -1 /* not initialized */ },
183 { IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
184 (OP_VARLEN|OP_NODEFAULT), 255*8,
185 -1 /* not initialized */ },
186 { IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
187 (OP_VARLEN|OP_NODEFAULT), 255*8,
188 -1 /* not initialized */ },
189 { IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
190 OP_NODEFAULT,
191 sizeof (int), -1 /* not initialized */ },
192 { IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
193 OP_NODEFAULT,
194 sizeof (struct ip6_mtuinfo), -1 /* not initialized */ },
195 { IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
196 sizeof (int), 0 },
197 { IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
198 sizeof (int), 0 },
199 { IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
200 sizeof (int), 0 },
202 /* Enable receipt of ancillary data */
203 { IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
204 sizeof (int), 0 },
205 { IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
206 sizeof (int), 0 },
207 { IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
208 sizeof (int), 0 },
209 { _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
210 sizeof (int), 0 },
211 { IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
212 sizeof (int), 0 },
213 { IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
214 sizeof (int), 0 },
215 { IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
216 sizeof (int), 0 },
217 { IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
218 sizeof (int), 0 },
220 { IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
221 sizeof (ipsec_req_t), -1 /* not initialized */ },
222 { IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
223 sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT },
227 * Table of all supported levels
228 * Note: Some levels (e.g. XTI_GENERIC) may be valid but may not have
229 * any supported options so we need this info separately.
231 * This is needed only for topmost tpi providers and is used only by
232 * XTI interfaces.
234 optlevel_t tcp_valid_levels_arr[] = {
235 XTI_GENERIC,
236 SOL_SOCKET,
237 IPPROTO_TCP,
238 IPPROTO_IP,
239 IPPROTO_IPV6
243 #define TCP_OPT_ARR_CNT A_CNT(tcp_opt_arr)
244 #define TCP_VALID_LEVELS_CNT A_CNT(tcp_valid_levels_arr)
246 uint_t tcp_max_optsize; /* initialized when TCP driver is loaded */
249 * Initialize option database object for TCP
251 * This object represents database of options to search passed to
252 * {sock,tpi}optcom_req() interface routine to take care of option
253 * management and associated methods.
256 optdb_obj_t tcp_opt_obj = {
257 tcp_opt_default, /* TCP default value function pointer */
258 tcp_tpi_opt_get, /* TCP get function pointer */
259 tcp_tpi_opt_set, /* TCP set function pointer */
260 TCP_OPT_ARR_CNT, /* TCP option database count of entries */
261 tcp_opt_arr, /* TCP option database */
262 TCP_VALID_LEVELS_CNT, /* TCP valid level count of entries */
263 tcp_valid_levels_arr /* TCP valid level array */
266 static int tcp_max_init_cwnd = TCP_MAX_INIT_CWND;
269 * Some TCP options can be "set" by requesting them in the option
270 * buffer. This is needed for XTI feature test though we do not
271 * allow it in general. We interpret that this mechanism is more
272 * applicable to OSI protocols and need not be allowed in general.
273 * This routine filters out options for which it is not allowed (most)
274 * and lets through those (few) for which it is. [ The XTI interface
275 * test suite specifics will imply that any XTI_GENERIC level XTI_* if
276 * ever implemented will have to be allowed here ].
278 static boolean_t
279 tcp_allow_connopt_set(int level, int name)
282 switch (level) {
283 case IPPROTO_TCP:
284 switch (name) {
285 case TCP_NODELAY:
286 return (B_TRUE);
287 default:
288 return (B_FALSE);
290 /*NOTREACHED*/
291 default:
292 return (B_FALSE);
294 /*NOTREACHED*/
298 * This routine gets default values of certain options whose default
299 * values are maintained by protocol specific code
301 /* ARGSUSED */
302 static int
303 tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
305 int32_t *i1 = (int32_t *)ptr;
306 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps;
308 switch (level) {
309 case IPPROTO_TCP:
310 switch (name) {
311 case TCP_NOTIFY_THRESHOLD:
312 *i1 = tcps->tcps_ip_notify_interval;
313 break;
314 case TCP_ABORT_THRESHOLD:
315 *i1 = tcps->tcps_ip_abort_interval;
316 break;
317 case TCP_CONN_NOTIFY_THRESHOLD:
318 *i1 = tcps->tcps_ip_notify_cinterval;
319 break;
320 case TCP_CONN_ABORT_THRESHOLD:
321 *i1 = tcps->tcps_ip_abort_cinterval;
322 break;
323 default:
324 return (-1);
326 break;
327 case IPPROTO_IP:
328 switch (name) {
329 case IP_TTL:
330 *i1 = tcps->tcps_ipv4_ttl;
331 break;
332 default:
333 return (-1);
335 break;
336 case IPPROTO_IPV6:
337 switch (name) {
338 case IPV6_UNICAST_HOPS:
339 *i1 = tcps->tcps_ipv6_hoplimit;
340 break;
341 default:
342 return (-1);
344 break;
345 default:
346 return (-1);
348 return (sizeof (int));
352 * TCP routine to get the values of options.
355 tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
357 int *i1 = (int *)ptr;
358 tcp_t *tcp = connp->conn_tcp;
359 conn_opt_arg_t coas;
360 int retval;
362 coas.coa_connp = connp;
363 coas.coa_ixa = connp->conn_ixa;
364 coas.coa_ipp = &connp->conn_xmit_ipp;
365 coas.coa_ancillary = B_FALSE;
366 coas.coa_changed = 0;
368 switch (level) {
369 case SOL_SOCKET:
370 switch (name) {
371 case SO_SND_COPYAVOID:
372 *i1 = tcp->tcp_snd_zcopy_on ?
373 SO_SND_COPYAVOID : 0;
374 return (sizeof (int));
375 case SO_ACCEPTCONN:
376 *i1 = (tcp->tcp_state == TCPS_LISTEN);
377 return (sizeof (int));
379 break;
380 case IPPROTO_TCP:
381 switch (name) {
382 case TCP_NODELAY:
383 *i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0;
384 return (sizeof (int));
385 case TCP_MAXSEG:
386 *i1 = tcp->tcp_mss;
387 return (sizeof (int));
388 case TCP_NOTIFY_THRESHOLD:
389 *i1 = (int)tcp->tcp_first_timer_threshold;
390 return (sizeof (int));
391 case TCP_ABORT_THRESHOLD:
392 *i1 = tcp->tcp_second_timer_threshold;
393 return (sizeof (int));
394 case TCP_CONN_NOTIFY_THRESHOLD:
395 *i1 = tcp->tcp_first_ctimer_threshold;
396 return (sizeof (int));
397 case TCP_CONN_ABORT_THRESHOLD:
398 *i1 = tcp->tcp_second_ctimer_threshold;
399 return (sizeof (int));
400 case TCP_INIT_CWND:
401 *i1 = tcp->tcp_init_cwnd;
402 return (sizeof (int));
403 case TCP_KEEPALIVE_THRESHOLD:
404 *i1 = tcp->tcp_ka_interval;
405 return (sizeof (int));
408 * TCP_KEEPIDLE expects value in seconds, but
409 * tcp_ka_interval is in milliseconds.
411 case TCP_KEEPIDLE:
412 *i1 = tcp->tcp_ka_interval / 1000;
413 return (sizeof (int));
414 case TCP_KEEPCNT:
415 *i1 = tcp->tcp_ka_cnt;
416 return (sizeof (int));
419 * TCP_KEEPINTVL expects value in seconds, but
420 * tcp_ka_rinterval is in milliseconds.
422 case TCP_KEEPINTVL:
423 *i1 = tcp->tcp_ka_rinterval / 1000;
424 return (sizeof (int));
425 case TCP_KEEPALIVE_ABORT_THRESHOLD:
426 *i1 = tcp->tcp_ka_abort_thres;
427 return (sizeof (int));
428 case TCP_CORK:
429 *i1 = tcp->tcp_cork;
430 return (sizeof (int));
431 case TCP_RTO_INITIAL:
432 *i1 = tcp->tcp_rto_initial;
433 return (sizeof (uint32_t));
434 case TCP_RTO_MIN:
435 *i1 = tcp->tcp_rto_min;
436 return (sizeof (uint32_t));
437 case TCP_RTO_MAX:
438 *i1 = tcp->tcp_rto_max;
439 return (sizeof (uint32_t));
440 case TCP_LINGER2:
441 *i1 = tcp->tcp_fin_wait_2_flush_interval / SECONDS;
442 return (sizeof (int));
444 break;
445 case IPPROTO_IP:
446 if (connp->conn_family != AF_INET)
447 return (-1);
448 switch (name) {
449 case IP_OPTIONS:
450 case T_IP_OPTIONS:
451 /* Caller ensures enough space */
452 return (ip_opt_get_user(connp, ptr));
453 default:
454 break;
456 break;
458 case IPPROTO_IPV6:
460 * IPPROTO_IPV6 options are only supported for sockets
461 * that are using IPv6 on the wire.
463 if (connp->conn_ipversion != IPV6_VERSION) {
464 return (-1);
466 switch (name) {
467 case IPV6_PATHMTU:
468 if (tcp->tcp_state < TCPS_ESTABLISHED)
469 return (-1);
470 break;
472 break;
474 mutex_enter(&connp->conn_lock);
475 retval = conn_opt_get(&coas, level, name, ptr);
476 mutex_exit(&connp->conn_lock);
477 return (retval);
481 * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements.
482 * Parameters are assumed to be verified by the caller.
484 /* ARGSUSED */
486 tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
487 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
488 void *thisdg_attrs, cred_t *cr)
490 tcp_t *tcp = connp->conn_tcp;
491 int *i1 = (int *)invalp;
492 boolean_t onoff = (*i1 == 0) ? 0 : 1;
493 boolean_t checkonly;
494 int reterr;
495 tcp_stack_t *tcps = tcp->tcp_tcps;
496 conn_opt_arg_t coas;
497 uint32_t val = *((uint32_t *)invalp);
499 coas.coa_connp = connp;
500 coas.coa_ixa = connp->conn_ixa;
501 coas.coa_ipp = &connp->conn_xmit_ipp;
502 coas.coa_ancillary = B_FALSE;
503 coas.coa_changed = 0;
505 switch (optset_context) {
506 case SETFN_OPTCOM_CHECKONLY:
507 checkonly = B_TRUE;
509 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
510 * inlen != 0 implies value supplied and
511 * we have to "pretend" to set it.
512 * inlen == 0 implies that there is no
513 * value part in T_CHECK request and just validation
514 * done elsewhere should be enough, we just return here.
516 if (inlen == 0) {
517 *outlenp = 0;
518 return (0);
520 break;
521 case SETFN_OPTCOM_NEGOTIATE:
522 checkonly = B_FALSE;
523 break;
524 case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */
525 case SETFN_CONN_NEGOTIATE:
526 checkonly = B_FALSE;
528 * Negotiating local and "association-related" options
529 * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ)
530 * primitives is allowed by XTI, but we choose
531 * to not implement this style negotiation for Internet
532 * protocols (We interpret it is a must for OSI world but
533 * optional for Internet protocols) for all options.
534 * [ Will do only for the few options that enable test
535 * suites that our XTI implementation of this feature
536 * works for transports that do allow it ]
538 if (!tcp_allow_connopt_set(level, name)) {
539 *outlenp = 0;
540 return (EINVAL);
542 break;
543 default:
545 * We should never get here
547 *outlenp = 0;
548 return (EINVAL);
551 ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
552 (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
555 * For TCP, we should have no ancillary data sent down
556 * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs
557 * has to be zero.
559 ASSERT(thisdg_attrs == NULL);
562 * For fixed length options, no sanity check
563 * of passed in length is done. It is assumed *_optcom_req()
564 * routines do the right thing.
566 switch (level) {
567 case SOL_SOCKET:
568 switch (name) {
569 case SO_KEEPALIVE:
570 if (checkonly) {
571 /* check only case */
572 break;
575 if (!onoff) {
576 if (connp->conn_keepalive) {
577 if (tcp->tcp_ka_tid != 0) {
578 (void) TCP_TIMER_CANCEL(tcp,
579 tcp->tcp_ka_tid);
580 tcp->tcp_ka_tid = 0;
582 connp->conn_keepalive = 0;
584 break;
586 if (!connp->conn_keepalive) {
587 /* Crank up the keepalive timer */
588 tcp->tcp_ka_last_intrvl = 0;
589 tcp->tcp_ka_tid = TCP_TIMER(tcp,
590 tcp_keepalive_timer, tcp->tcp_ka_interval);
591 connp->conn_keepalive = 1;
593 break;
594 case SO_SNDBUF: {
595 if (*i1 > tcps->tcps_max_buf) {
596 *outlenp = 0;
597 return (ENOBUFS);
599 if (checkonly)
600 break;
602 connp->conn_sndbuf = *i1;
603 if (tcps->tcps_snd_lowat_fraction != 0) {
604 connp->conn_sndlowat = connp->conn_sndbuf /
605 tcps->tcps_snd_lowat_fraction;
607 (void) tcp_maxpsz_set(tcp, B_TRUE);
609 * If we are flow-controlled, recheck the condition.
610 * There are apps that increase SO_SNDBUF size when
611 * flow-controlled (EWOULDBLOCK), and expect the flow
612 * control condition to be lifted right away.
614 mutex_enter(&tcp->tcp_non_sq_lock);
615 if (tcp->tcp_flow_stopped &&
616 TCP_UNSENT_BYTES(tcp) < connp->conn_sndbuf) {
617 tcp_clrqfull(tcp);
619 mutex_exit(&tcp->tcp_non_sq_lock);
620 *outlenp = inlen;
621 return (0);
623 case SO_RCVBUF:
624 if (*i1 > tcps->tcps_max_buf) {
625 *outlenp = 0;
626 return (ENOBUFS);
628 /* Silently ignore zero */
629 if (!checkonly && *i1 != 0) {
630 *i1 = MSS_ROUNDUP(*i1, tcp->tcp_mss);
631 (void) tcp_rwnd_set(tcp, *i1);
634 * XXX should we return the rwnd here
635 * and tcp_opt_get ?
637 *outlenp = inlen;
638 return (0);
639 case SO_SND_COPYAVOID:
640 if (!checkonly) {
641 if (tcp->tcp_loopback ||
642 (onoff != 1) || !tcp_zcopy_check(tcp)) {
643 *outlenp = 0;
644 return (EOPNOTSUPP);
646 tcp->tcp_snd_zcopy_aware = 1;
648 *outlenp = inlen;
649 return (0);
651 break;
652 case IPPROTO_TCP:
653 switch (name) {
654 case TCP_NODELAY:
655 if (!checkonly)
656 tcp->tcp_naglim = *i1 ? 1 : tcp->tcp_mss;
657 break;
658 case TCP_NOTIFY_THRESHOLD:
659 if (!checkonly)
660 tcp->tcp_first_timer_threshold = *i1;
661 break;
662 case TCP_ABORT_THRESHOLD:
663 if (!checkonly)
664 tcp->tcp_second_timer_threshold = *i1;
665 break;
666 case TCP_CONN_NOTIFY_THRESHOLD:
667 if (!checkonly)
668 tcp->tcp_first_ctimer_threshold = *i1;
669 break;
670 case TCP_CONN_ABORT_THRESHOLD:
671 if (!checkonly)
672 tcp->tcp_second_ctimer_threshold = *i1;
673 break;
674 case TCP_RECVDSTADDR:
675 if (tcp->tcp_state > TCPS_LISTEN) {
676 *outlenp = 0;
677 return (EOPNOTSUPP);
679 /* Setting done in conn_opt_set */
680 break;
681 case TCP_INIT_CWND:
682 if (checkonly)
683 break;
686 * Only allow socket with network configuration
687 * privilege to set the initial cwnd to be larger
688 * than allowed by RFC 3390.
690 if (val > MIN(4, MAX(2, 4380 / tcp->tcp_mss))) {
691 if ((reterr = secpolicy_ip_config(cr, B_TRUE))
692 != 0) {
693 *outlenp = 0;
694 return (reterr);
696 if (val > tcp_max_init_cwnd) {
697 *outlenp = 0;
698 return (EINVAL);
702 tcp->tcp_init_cwnd = val;
705 * If the socket is connected, AND no outbound data
706 * has been sent, reset the actual cwnd values.
708 if (tcp->tcp_state == TCPS_ESTABLISHED &&
709 tcp->tcp_iss == tcp->tcp_snxt - 1) {
710 tcp->tcp_cwnd =
711 MIN(tcp->tcp_rwnd, val * tcp->tcp_mss);
713 break;
716 * TCP_KEEPIDLE is in seconds but TCP_KEEPALIVE_THRESHOLD
717 * is in milliseconds. TCP_KEEPIDLE is introduced for
718 * compatibility with other Unix flavors.
719 * We can fall through TCP_KEEPALIVE_THRESHOLD logic after
720 * converting the input to milliseconds.
722 case TCP_KEEPIDLE:
723 *i1 *= 1000;
724 /* FALLTHRU */
726 case TCP_KEEPALIVE_THRESHOLD:
727 if (checkonly)
728 break;
730 if (*i1 < tcps->tcps_keepalive_interval_low ||
731 *i1 > tcps->tcps_keepalive_interval_high) {
732 *outlenp = 0;
733 return (EINVAL);
735 if (*i1 != tcp->tcp_ka_interval) {
736 tcp->tcp_ka_interval = *i1;
738 * Check if we need to restart the
739 * keepalive timer.
741 if (tcp->tcp_ka_tid != 0) {
742 ASSERT(connp->conn_keepalive);
743 (void) TCP_TIMER_CANCEL(tcp,
744 tcp->tcp_ka_tid);
745 tcp->tcp_ka_last_intrvl = 0;
746 tcp->tcp_ka_tid = TCP_TIMER(tcp,
747 tcp_keepalive_timer,
748 tcp->tcp_ka_interval);
751 break;
754 * tcp_ka_abort_thres = tcp_ka_rinterval * tcp_ka_cnt.
755 * So setting TCP_KEEPCNT or TCP_KEEPINTVL can affect all the
756 * three members - tcp_ka_abort_thres, tcp_ka_rinterval and
757 * tcp_ka_cnt.
759 case TCP_KEEPCNT:
760 if (checkonly)
761 break;
763 if (*i1 == 0) {
764 return (EINVAL);
765 } else if (tcp->tcp_ka_rinterval == 0) {
767 * When TCP_KEEPCNT is specified without first
768 * specifying a TCP_KEEPINTVL, we infer an
769 * interval based on a tunable specific to our
770 * stack: the tcp_keepalive_abort_interval.
771 * (Or the TCP_KEEPALIVE_ABORT_THRESHOLD, in
772 * the unlikely event that that has been set.)
773 * Given the abort interval's default value of
774 * 480 seconds, low TCP_KEEPCNT values can
775 * result in intervals that exceed the default
776 * maximum RTO of 60 seconds. Rather than
777 * fail in these cases, we (implicitly) clamp
778 * the interval at the maximum RTO; if the
779 * TCP_KEEPCNT is shortly followed by a
780 * TCP_KEEPINTVL (as we expect), the abort
781 * threshold will be recalculated correctly --
782 * and if a TCP_KEEPINTVL is not forthcoming,
783 * keep-alive will at least operate reasonably
784 * given the underconfigured state.
786 uint32_t interval;
788 interval = tcp->tcp_ka_abort_thres / *i1;
790 if (interval < tcp->tcp_rto_min)
791 interval = tcp->tcp_rto_min;
793 if (interval > tcp->tcp_rto_max)
794 interval = tcp->tcp_rto_max;
796 tcp->tcp_ka_rinterval = interval;
797 } else {
798 if ((*i1 * tcp->tcp_ka_rinterval) <
799 tcps->tcps_keepalive_abort_interval_low ||
800 (*i1 * tcp->tcp_ka_rinterval) >
801 tcps->tcps_keepalive_abort_interval_high)
802 return (EINVAL);
803 tcp->tcp_ka_abort_thres =
804 (*i1 * tcp->tcp_ka_rinterval);
806 tcp->tcp_ka_cnt = *i1;
807 break;
808 case TCP_KEEPINTVL:
810 * TCP_KEEPINTVL is specified in seconds, but
811 * tcp_ka_rinterval is in milliseconds.
814 if (checkonly)
815 break;
817 if ((*i1 * 1000) < tcp->tcp_rto_min ||
818 (*i1 * 1000) > tcp->tcp_rto_max)
819 return (EINVAL);
821 if (tcp->tcp_ka_cnt == 0) {
822 tcp->tcp_ka_cnt =
823 tcp->tcp_ka_abort_thres / (*i1 * 1000);
824 } else {
825 if ((*i1 * tcp->tcp_ka_cnt * 1000) <
826 tcps->tcps_keepalive_abort_interval_low ||
827 (*i1 * tcp->tcp_ka_cnt * 1000) >
828 tcps->tcps_keepalive_abort_interval_high)
829 return (EINVAL);
830 tcp->tcp_ka_abort_thres =
831 (*i1 * tcp->tcp_ka_cnt * 1000);
833 tcp->tcp_ka_rinterval = *i1 * 1000;
834 break;
835 case TCP_KEEPALIVE_ABORT_THRESHOLD:
836 if (!checkonly) {
837 if (*i1 <
838 tcps->tcps_keepalive_abort_interval_low ||
839 *i1 >
840 tcps->tcps_keepalive_abort_interval_high) {
841 *outlenp = 0;
842 return (EINVAL);
844 tcp->tcp_ka_abort_thres = *i1;
845 tcp->tcp_ka_cnt = 0;
846 tcp->tcp_ka_rinterval = 0;
848 break;
849 case TCP_CORK:
850 if (!checkonly) {
852 * if tcp->tcp_cork was set and is now
853 * being unset, we have to make sure that
854 * the remaining data gets sent out. Also
855 * unset tcp->tcp_cork so that tcp_wput_data()
856 * can send data even if it is less than mss
858 if (tcp->tcp_cork && onoff == 0 &&
859 tcp->tcp_unsent > 0) {
860 tcp->tcp_cork = B_FALSE;
861 tcp_wput_data(tcp, NULL, B_FALSE);
863 tcp->tcp_cork = onoff;
865 break;
866 case TCP_RTO_INITIAL: {
867 clock_t rto;
869 if (checkonly || val == 0)
870 break;
873 * Sanity checks
875 * The initial RTO should be bounded by the minimum
876 * and maximum RTO. And it should also be smaller
877 * than the connect attempt abort timeout. Otherwise,
878 * the connection won't be aborted in a period
879 * reasonably close to that timeout.
881 if (val < tcp->tcp_rto_min || val > tcp->tcp_rto_max ||
882 val > tcp->tcp_second_ctimer_threshold ||
883 val < tcps->tcps_rexmit_interval_initial_low ||
884 val > tcps->tcps_rexmit_interval_initial_high) {
885 *outlenp = 0;
886 return (EINVAL);
888 tcp->tcp_rto_initial = val;
891 * If TCP has not sent anything, need to re-calculate
892 * tcp_rto. Otherwise, this option change does not
893 * really affect anything.
895 if (tcp->tcp_state >= TCPS_SYN_SENT)
896 break;
898 tcp->tcp_rtt_sa = tcp->tcp_rto_initial << 2;
899 tcp->tcp_rtt_sd = tcp->tcp_rto_initial >> 1;
900 rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
901 tcps->tcps_rexmit_interval_extra +
902 (tcp->tcp_rtt_sa >> 5) +
903 tcps->tcps_conn_grace_period;
904 TCP_SET_RTO(tcp, rto);
905 break;
907 case TCP_RTO_MIN:
908 if (checkonly || val == 0)
909 break;
911 if (val < tcps->tcps_rexmit_interval_min_low ||
912 val > tcps->tcps_rexmit_interval_min_high ||
913 val > tcp->tcp_rto_max) {
914 *outlenp = 0;
915 return (EINVAL);
917 tcp->tcp_rto_min = val;
918 if (tcp->tcp_rto < val)
919 tcp->tcp_rto = val;
920 break;
921 case TCP_RTO_MAX:
922 if (checkonly || val == 0)
923 break;
926 * Sanity checks
928 * The maximum RTO should not be larger than the
929 * connection abort timeout. Otherwise, the
930 * connection won't be aborted in a period reasonably
931 * close to that timeout.
933 if (val < tcps->tcps_rexmit_interval_max_low ||
934 val > tcps->tcps_rexmit_interval_max_high ||
935 val < tcp->tcp_rto_min ||
936 val > tcp->tcp_second_timer_threshold) {
937 *outlenp = 0;
938 return (EINVAL);
940 tcp->tcp_rto_max = val;
941 if (tcp->tcp_rto > val)
942 tcp->tcp_rto = val;
943 break;
944 case TCP_LINGER2:
945 if (checkonly || *i1 == 0)
946 break;
949 * Note that the option value's unit is second. And
950 * the value should be bigger than the private
951 * parameter tcp_fin_wait_2_flush_interval's lower
952 * bound and smaller than the current value of that
953 * parameter. It should be smaller than the current
954 * value to avoid an app setting TCP_LINGER2 to a big
955 * value, causing resource to be held up too long in
956 * FIN-WAIT-2 state.
958 if (*i1 < 0 ||
959 tcps->tcps_fin_wait_2_flush_interval_low/SECONDS >
960 *i1 ||
961 tcps->tcps_fin_wait_2_flush_interval/SECONDS <
962 *i1) {
963 *outlenp = 0;
964 return (EINVAL);
966 tcp->tcp_fin_wait_2_flush_interval = *i1 * SECONDS;
967 break;
968 default:
969 break;
971 break;
972 case IPPROTO_IP:
973 if (connp->conn_family != AF_INET) {
974 *outlenp = 0;
975 return (EINVAL);
977 switch (name) {
978 case IP_SEC_OPT:
980 * We should not allow policy setting after
981 * we start listening for connections.
983 if (tcp->tcp_state == TCPS_LISTEN) {
984 return (EINVAL);
986 break;
988 break;
989 case IPPROTO_IPV6:
991 * IPPROTO_IPV6 options are only supported for sockets
992 * that are using IPv6 on the wire.
994 if (connp->conn_ipversion != IPV6_VERSION) {
995 *outlenp = 0;
996 return (EINVAL);
999 switch (name) {
1000 case IPV6_RECVPKTINFO:
1001 if (!checkonly) {
1002 /* Force it to be sent up with the next msg */
1003 tcp->tcp_recvifindex = 0;
1005 break;
1006 case IPV6_RECVTCLASS:
1007 if (!checkonly) {
1008 /* Force it to be sent up with the next msg */
1009 tcp->tcp_recvtclass = 0xffffffffU;
1011 break;
1012 case IPV6_RECVHOPLIMIT:
1013 if (!checkonly) {
1014 /* Force it to be sent up with the next msg */
1015 tcp->tcp_recvhops = 0xffffffffU;
1017 break;
1018 case IPV6_PKTINFO:
1019 /* This is an extra check for TCP */
1020 if (inlen == sizeof (struct in6_pktinfo)) {
1021 struct in6_pktinfo *pkti;
1023 pkti = (struct in6_pktinfo *)invalp;
1025 * RFC 3542 states that ipi6_addr must be
1026 * the unspecified address when setting the
1027 * IPV6_PKTINFO sticky socket option on a
1028 * TCP socket.
1030 if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr))
1031 return (EINVAL);
1033 break;
1034 case IPV6_SEC_OPT:
1036 * We should not allow policy setting after
1037 * we start listening for connections.
1039 if (tcp->tcp_state == TCPS_LISTEN) {
1040 return (EINVAL);
1042 break;
1044 break;
1046 reterr = conn_opt_set(&coas, level, name, inlen, invalp,
1047 checkonly, cr);
1048 if (reterr != 0) {
1049 *outlenp = 0;
1050 return (reterr);
1054 * Common case of OK return with outval same as inval
1056 if (invalp != outvalp) {
1057 /* don't trust bcopy for identical src/dst */
1058 (void) bcopy(invalp, outvalp, inlen);
1060 *outlenp = inlen;
1062 if (coas.coa_changed & COA_HEADER_CHANGED) {
1063 /* If we are connected we rebuilt the headers */
1064 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1065 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1066 reterr = tcp_build_hdrs(tcp);
1067 if (reterr != 0)
1068 return (reterr);
1071 if (coas.coa_changed & COA_ROUTE_CHANGED) {
1072 in6_addr_t nexthop;
1075 * If we are connected we re-cache the information.
1076 * We ignore errors to preserve BSD behavior.
1077 * Note that we don't redo IPsec policy lookup here
1078 * since the final destination (or source) didn't change.
1080 ip_attr_nexthop(&connp->conn_xmit_ipp, connp->conn_ixa,
1081 &connp->conn_faddr_v6, &nexthop);
1083 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1084 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1085 (void) ip_attr_connect(connp, connp->conn_ixa,
1086 &connp->conn_laddr_v6, &connp->conn_faddr_v6,
1087 &nexthop, connp->conn_fport, NULL, NULL,
1088 IPDF_VERIFY_DST);
1091 if ((coas.coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
1092 connp->conn_wq->q_hiwat = connp->conn_sndbuf;
1094 if (coas.coa_changed & COA_WROFF_CHANGED) {
1095 connp->conn_wroff = connp->conn_ht_iphc_allocated +
1096 tcps->tcps_wroff_xtra;
1097 (void) proto_set_tx_wroff(connp->conn_rq, connp,
1098 connp->conn_wroff);
1100 if (coas.coa_changed & COA_OOBINLINE_CHANGED) {
1101 if (IPCL_IS_NONSTR(connp))
1102 proto_set_rx_oob_opt(connp, onoff);
1104 return (0);