4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2014 by Delphix. All rights reserved.
27 /* This file contains all TCP output processing functions. */
29 #include <sys/types.h>
30 #include <sys/stream.h>
31 #include <sys/strsun.h>
32 #include <sys/strsubr.h>
33 #include <sys/stropts.h>
34 #include <sys/strlog.h>
35 #define _SUN_TPI_VERSION 2
36 #include <sys/tihdr.h>
37 #include <sys/suntpi.h>
38 #include <sys/xti_inet.h>
39 #include <sys/timod.h>
40 #include <sys/pattr.h>
41 #include <sys/squeue_impl.h>
42 #include <sys/squeue.h>
43 #include <sys/sockio.h>
44 #include <sys/tsol/tnet.h>
46 #include <inet/common.h>
49 #include <inet/tcp_impl.h>
50 #include <inet/snmpcom.h>
51 #include <inet/proto_set.h>
52 #include <inet/ipsec_impl.h>
53 #include <inet/ip_ndp.h>
55 static mblk_t
*tcp_get_seg_mp(tcp_t
*, uint32_t, int32_t *);
56 static void tcp_wput_cmdblk(queue_t
*, mblk_t
*);
57 static void tcp_wput_flush(tcp_t
*, mblk_t
*);
58 static void tcp_wput_iocdata(tcp_t
*tcp
, mblk_t
*mp
);
59 static int tcp_xmit_end(tcp_t
*);
60 static int tcp_send(tcp_t
*, const int, const int, const int,
61 const int, int *, uint_t
*, int *, mblk_t
**, mblk_t
*);
62 static void tcp_xmit_early_reset(char *, mblk_t
*, uint32_t, uint32_t,
63 int, ip_recv_attr_t
*, ip_stack_t
*, conn_t
*);
64 static boolean_t
tcp_send_rst_chk(tcp_stack_t
*);
65 static void tcp_process_shrunk_swnd(tcp_t
*, uint32_t);
66 static void tcp_fill_header(tcp_t
*, uchar_t
*, clock_t, int);
69 * Functions called directly via squeue having a prototype of edesc_t.
71 static void tcp_wput_nondata(void *, mblk_t
*, void *, ip_recv_attr_t
*);
72 static void tcp_wput_ioctl(void *, mblk_t
*, void *, ip_recv_attr_t
*);
73 static void tcp_wput_proto(void *, mblk_t
*, void *, ip_recv_attr_t
*);
76 * This controls how tiny a write must be before we try to copy it
77 * into the mblk on the tail of the transmit queue. Not much
78 * speedup is observed for values larger than sixteen. Zero will
79 * disable the optimisation.
81 static int tcp_tx_pull_len
= 16;
84 tcp_wput(queue_t
*q
, mblk_t
*mp
)
86 conn_t
*connp
= Q_TO_CONN(q
);
88 void (*output_proc
)();
94 ASSERT(connp
->conn_ref
>= 2);
96 switch (DB_TYPE(mp
)) {
98 tcp
= connp
->conn_tcp
;
103 mutex_enter(&tcp
->tcp_non_sq_lock
);
104 tcp
->tcp_squeue_bytes
+= size
;
105 if (TCP_UNSENT_BYTES(tcp
) > connp
->conn_sndbuf
) {
108 mutex_exit(&tcp
->tcp_non_sq_lock
);
111 SQUEUE_ENTER_ONE(connp
->conn_sqp
, mp
, tcp_output
, connp
,
112 NULL
, tcp_squeue_flag
, SQTAG_TCP_OUTPUT
);
116 tcp_wput_cmdblk(q
, mp
);
122 * If it is an SNMP message, don't get behind the squeue
124 tcp
= connp
->conn_tcp
;
126 if ((mp
->b_wptr
- rptr
) >= sizeof (t_scalar_t
)) {
127 type
= ((union T_primitives
*)rptr
)->type
;
129 if (connp
->conn_debug
) {
130 (void) strlog(TCP_MOD_ID
, 0, 1,
132 "tcp_wput_proto, dropping one...");
137 if (type
== T_SVR4_OPTMGMT_REQ
) {
139 * All Solaris components should pass a db_credp
140 * for this TPI message, hence we ASSERT.
141 * But in case there is some other M_PROTO that looks
142 * like a TPI message sent by some other kernel
143 * component, we check and return an error.
145 cred_t
*cr
= msg_getcred(mp
, NULL
);
149 tcp_err_ack(tcp
, mp
, TSYSERR
, EINVAL
);
152 if (snmpcom_req(q
, mp
, tcp_snmp_set
, ip_snmp_get
,
155 * This was an SNMP request
159 output_proc
= tcp_wput_proto
;
162 output_proc
= tcp_wput_proto
;
167 * Most ioctls can be processed right away without going via
168 * squeues - process them right here. Those that do require
169 * squeue (currently _SIOCSOCKFALLBACK)
170 * are processed by tcp_wput_ioctl().
172 iocp
= (struct iocblk
*)mp
->b_rptr
;
173 tcp
= connp
->conn_tcp
;
175 switch (iocp
->ioc_cmd
) {
176 case TCP_IOC_ABORT_CONN
:
177 tcp_ioctl_abort_conn(q
, mp
);
181 mi_copyin(q
, mp
, NULL
,
182 SIZEOF_STRUCT(strbuf
, iocp
->ioc_flag
));
186 output_proc
= tcp_wput_ioctl
;
191 output_proc
= tcp_wput_nondata
;
196 SQUEUE_ENTER_ONE(connp
->conn_sqp
, mp
, output_proc
, connp
,
197 NULL
, tcp_squeue_flag
, SQTAG_TCP_WPUT_OTHER
);
201 * The TCP normal data output path.
202 * NOTE: the logic of the fast path is duplicated from this function.
205 tcp_wput_data(tcp_t
*tcp
, mblk_t
*mp
, boolean_t urgent
)
216 int32_t num_sack_blk
= 0;
217 int32_t total_hdr_len
;
220 tcp_stack_t
*tcps
= tcp
->tcp_tcps
;
221 conn_t
*connp
= tcp
->tcp_connp
;
222 clock_t now
= LBOLT_FASTPATH
;
224 tcpstate
= tcp
->tcp_state
;
227 * tcp_wput_data() with NULL mp should only be called when
228 * there is unsent data.
230 ASSERT(tcp
->tcp_unsent
> 0);
231 /* Really tacky... but we need this for detached closes. */
232 len
= tcp
->tcp_unsent
;
236 ASSERT(mp
->b_datap
->db_type
== M_DATA
);
238 * Don't allow data after T_ORDREL_REQ or T_DISCON_REQ,
239 * or before a connection attempt has begun.
241 if (tcpstate
< TCPS_SYN_SENT
|| tcpstate
> TCPS_CLOSE_WAIT
||
242 (tcp
->tcp_valid_bits
& TCP_FSS_VALID
) != 0) {
243 if ((tcp
->tcp_valid_bits
& TCP_FSS_VALID
) != 0) {
246 "tcp_wput_data: data after ordrel, %s",
247 tcp_display(tcp
, NULL
,
248 DISP_ADDR_AND_PORT
));
250 if (connp
->conn_debug
) {
251 (void) strlog(TCP_MOD_ID
, 0, 1,
253 "tcp_wput_data: data after ordrel, %s\n",
254 tcp_display(tcp
, NULL
,
255 DISP_ADDR_AND_PORT
));
259 if (tcp
->tcp_snd_zcopy_aware
&&
260 (mp
->b_datap
->db_struioflag
& STRUIO_ZCNOTIFY
))
261 tcp_zcopy_notify(tcp
);
263 mutex_enter(&tcp
->tcp_non_sq_lock
);
264 if (tcp
->tcp_flow_stopped
&&
265 TCP_UNSENT_BYTES(tcp
) <= connp
->conn_sndlowat
) {
268 mutex_exit(&tcp
->tcp_non_sq_lock
);
274 ASSERT((uintptr_t)(mp
->b_wptr
- mp
->b_rptr
) <=
276 len
= (int)(mp
->b_wptr
- mp
->b_rptr
);
287 /* If we are the first on the list ... */
288 if (tcp
->tcp_xmit_head
== NULL
) {
289 tcp
->tcp_xmit_head
= mp
;
290 tcp
->tcp_xmit_tail
= mp
;
291 tcp
->tcp_xmit_tail_unsent
= len
;
293 /* If tiny tx and room in txq tail, pullup to save mblks. */
296 mp1
= tcp
->tcp_xmit_last
;
297 if (len
< tcp_tx_pull_len
&&
298 (dp
= mp1
->b_datap
)->db_ref
== 1 &&
299 dp
->db_lim
- mp1
->b_wptr
>= len
) {
301 ASSERT(!mp1
->b_cont
);
303 *mp1
->b_wptr
++ = *mp
->b_rptr
;
305 bcopy(mp
->b_rptr
, mp1
->b_wptr
, len
);
308 if (mp1
== tcp
->tcp_xmit_tail
)
309 tcp
->tcp_xmit_tail_unsent
+= len
;
310 mp1
->b_cont
= mp
->b_cont
;
311 if (tcp
->tcp_snd_zcopy_aware
&&
312 (mp
->b_datap
->db_struioflag
& STRUIO_ZCNOTIFY
))
313 mp1
->b_datap
->db_struioflag
|= STRUIO_ZCNOTIFY
;
317 tcp
->tcp_xmit_last
->b_cont
= mp
;
319 len
+= tcp
->tcp_unsent
;
322 /* Tack on however many more positive length mblks we have */
323 if ((mp1
= mp
->b_cont
) != NULL
) {
326 ASSERT((uintptr_t)(mp1
->b_wptr
- mp1
->b_rptr
) <=
328 tlen
= (int)(mp1
->b_wptr
- mp1
->b_rptr
);
330 mp
->b_cont
= mp1
->b_cont
;
336 } while ((mp1
= mp
->b_cont
) != NULL
);
338 tcp
->tcp_xmit_last
= mp
;
339 tcp
->tcp_unsent
= len
;
345 snxt
= tcp
->tcp_snxt
;
346 xmit_tail
= tcp
->tcp_xmit_tail
;
347 tail_unsent
= tcp
->tcp_xmit_tail_unsent
;
350 * Note that tcp_mss has been adjusted to take into account the
351 * timestamp option if applicable. Because SACK options do not
352 * appear in every TCP segment and they are of variable length,
353 * they cannot be included in tcp_mss. Thus we need to calculate
354 * the actual segment length when we need to send a segment which
355 * includes SACK options.
357 if (tcp
->tcp_snd_sack_ok
&& tcp
->tcp_num_sack_blk
> 0) {
360 num_sack_blk
= MIN(tcp
->tcp_max_sack_blk
,
361 tcp
->tcp_num_sack_blk
);
362 opt_len
= num_sack_blk
* sizeof (sack_blk_t
) + TCPOPT_NOP_LEN
*
363 2 + TCPOPT_HEADER_LEN
;
364 mss
= tcp
->tcp_mss
- opt_len
;
365 total_hdr_len
= connp
->conn_ht_iphc_len
+ opt_len
;
366 tcp_hdr_len
= connp
->conn_ht_ulp_len
+ opt_len
;
369 total_hdr_len
= connp
->conn_ht_iphc_len
;
370 tcp_hdr_len
= connp
->conn_ht_ulp_len
;
373 if ((tcp
->tcp_suna
== snxt
) && !tcp
->tcp_localnet
&&
374 (TICK_TO_MSEC(now
- tcp
->tcp_last_recv_time
) >= tcp
->tcp_rto
)) {
375 TCP_SET_INIT_CWND(tcp
, mss
, tcps
->tcps_slow_start_after_idle
);
377 if (tcpstate
== TCPS_SYN_RCVD
) {
379 * The three-way connection establishment handshake is not
380 * complete yet. We want to queue the data for transmission
381 * after entering ESTABLISHED state (RFC793). A jump to
382 * "done" label effectively leaves data on the queue.
389 * In the special case when cwnd is zero, which can only
390 * happen if the connection is ECN capable, return now.
391 * New segments are sent using tcp_timer(). The timer
392 * is set in tcp_input_data().
394 if (tcp
->tcp_cwnd
== 0) {
396 * Note that tcp_cwnd is 0 before 3-way handshake is
399 ASSERT(tcp
->tcp_ecn_ok
||
400 tcp
->tcp_state
< TCPS_ESTABLISHED
);
404 /* NOTE: trouble if xmitting while SYN not acked? */
405 usable_r
= snxt
- tcp
->tcp_suna
;
406 usable_r
= tcp
->tcp_swnd
- usable_r
;
409 * Check if the receiver has shrunk the window. If
410 * tcp_wput_data() with NULL mp is called, tcp_fin_sent
411 * cannot be set as there is unsent data, so FIN cannot
412 * be sent out. Otherwise, we need to take the FIN into
413 * account, as it consumes an "invisible" sequence number.
415 ASSERT(tcp
->tcp_fin_sent
== 0);
418 * The receiver has shrunk the window and we have sent
419 * -usable_r data beyond the window, re-adjust.
421 * If TCP window scaling is enabled, there can be
422 * round down error as the advertised receive window
423 * is actually right shifted n bits. This means that
424 * the lower n bits info is wiped out. It will look
425 * like the window is shrunk. Do a check here to
426 * see if the shrunk amount is actually within the
427 * error in window calculation. If it is, just
428 * return. Note that this check is inside the
429 * shrunk window check. This makes sure that even
430 * though tcp_process_shrunk_swnd() is not called,
431 * we will stop further processing.
433 if ((-usable_r
>> tcp
->tcp_snd_ws
) > 0) {
434 tcp_process_shrunk_swnd(tcp
, -usable_r
);
439 /* usable = MIN(swnd, cwnd) - unacked_bytes */
440 if (tcp
->tcp_swnd
> tcp
->tcp_cwnd
)
441 usable_r
-= tcp
->tcp_swnd
- tcp
->tcp_cwnd
;
443 /* usable = MIN(usable, unsent) */
447 /* usable = MAX(usable, {1 for urgent, 0 for data}) */
451 /* Bypass all other unnecessary processing. */
456 local_time
= (mblk_t
*)now
;
459 * "Our" Nagle Algorithm. This is not the same as in the old
460 * BSD. This is more in line with the true intent of Nagle.
462 * The conditions are:
463 * 1. The amount of unsent data (or amount of data which can be
464 * sent, whichever is smaller) is less than Nagle limit.
465 * 2. The last sent size is also less than Nagle limit.
466 * 3. There is unack'ed data.
467 * 4. Urgent pointer is not set. Send urgent data ignoring the
468 * Nagle algorithm. This reduces the probability that urgent
469 * bytes get "merged" together.
470 * 5. The app has not closed the connection. This eliminates the
471 * wait time of the receiving side waiting for the last piece of
474 * If all are satisfied, exit without sending anything. Note
475 * that Nagle limit can be smaller than 1 MSS. Nagle limit is
476 * the smaller of 1 MSS and global tcp_naglim_def (default to be
479 if (usable
< (int)tcp
->tcp_naglim
&&
480 tcp
->tcp_naglim
> tcp
->tcp_last_sent_len
&&
481 snxt
!= tcp
->tcp_suna
&&
482 !(tcp
->tcp_valid_bits
& TCP_URG_VALID
) &&
483 !(tcp
->tcp_valid_bits
& TCP_FSS_VALID
)) {
488 * If tcp_zero_win_probe is not set and the tcp->tcp_cork option
489 * is set, then we have to force TCP not to send partial segment
490 * (smaller than MSS bytes). We are calculating the usable now
491 * based on full mss and will save the rest of remaining data for
492 * later. When tcp_zero_win_probe is set, TCP needs to send out
493 * something to do zero window probe.
495 if (tcp
->tcp_cork
&& !tcp
->tcp_zero_win_probe
) {
498 usable
= (usable
/ mss
) * mss
;
501 /* Update the latest receive window size in TCP header. */
502 tcp
->tcp_tcpha
->tha_win
= htons(tcp
->tcp_rwnd
>> tcp
->tcp_rcv_ws
);
504 /* Send the packet. */
505 rc
= tcp_send(tcp
, mss
, total_hdr_len
, tcp_hdr_len
,
506 num_sack_blk
, &usable
, &snxt
, &tail_unsent
, &xmit_tail
,
509 /* Pretend that all we were trying to send really got sent */
510 if (rc
< 0 && tail_unsent
< 0) {
512 xmit_tail
= xmit_tail
->b_cont
;
513 xmit_tail
->b_prev
= local_time
;
514 ASSERT((uintptr_t)(xmit_tail
->b_wptr
-
515 xmit_tail
->b_rptr
) <= (uintptr_t)INT_MAX
);
516 tail_unsent
+= (int)(xmit_tail
->b_wptr
-
518 } while (tail_unsent
< 0);
521 tcp
->tcp_xmit_tail
= xmit_tail
;
522 tcp
->tcp_xmit_tail_unsent
= tail_unsent
;
523 len
= tcp
->tcp_snxt
- snxt
;
526 * If new data was sent, need to update the notsack
527 * list, which is, after all, data blocks that have
528 * not been sack'ed by the receiver. New data is
531 if (tcp
->tcp_snd_sack_ok
&& tcp
->tcp_notsack_list
!= NULL
) {
532 /* len is a negative value. */
533 tcp
->tcp_pipe
-= len
;
534 tcp_notsack_update(&(tcp
->tcp_notsack_list
),
536 &(tcp
->tcp_num_notsack_blk
),
537 &(tcp
->tcp_cnt_notsack_list
));
539 tcp
->tcp_snxt
= snxt
+ tcp
->tcp_fin_sent
;
540 tcp
->tcp_rack
= tcp
->tcp_rnxt
;
541 tcp
->tcp_rack_cnt
= 0;
542 if ((snxt
+ len
) == tcp
->tcp_suna
) {
543 TCP_TIMER_RESTART(tcp
, tcp
->tcp_rto
);
545 } else if (snxt
== tcp
->tcp_suna
&& tcp
->tcp_swnd
== 0) {
547 * Didn't send anything. Make sure the timer is running
548 * so that we will probe a zero window.
550 TCP_TIMER_RESTART(tcp
, tcp
->tcp_rto
);
552 /* Note that len is the amount we just sent but with a negative sign */
553 tcp
->tcp_unsent
+= len
;
554 mutex_enter(&tcp
->tcp_non_sq_lock
);
555 if (tcp
->tcp_flow_stopped
) {
556 if (TCP_UNSENT_BYTES(tcp
) <= connp
->conn_sndlowat
) {
559 } else if (TCP_UNSENT_BYTES(tcp
) >= connp
->conn_sndbuf
) {
560 if (!(tcp
->tcp_detached
))
563 mutex_exit(&tcp
->tcp_non_sq_lock
);
567 * Initial STREAMS write side put() procedure for sockets. It tries to
568 * handle the T_CAPABILITY_REQ which sockfs sends down while setting
569 * up the socket without using the squeue. Non T_CAPABILITY_REQ messages
570 * are handled by tcp_wput() as usual.
572 * All further messages will also be handled by tcp_wput() because we cannot
573 * be sure that the above short cut is safe later.
576 tcp_wput_sock(queue_t
*wq
, mblk_t
*mp
)
578 conn_t
*connp
= Q_TO_CONN(wq
);
579 tcp_t
*tcp
= connp
->conn_tcp
;
580 struct T_capability_req
*car
= (struct T_capability_req
*)mp
->b_rptr
;
582 ASSERT(wq
->q_qinfo
== &tcp_sock_winit
);
583 wq
->q_qinfo
= &tcp_winit
;
585 ASSERT(IPCL_IS_TCP(connp
));
586 ASSERT(TCP_IS_SOCKET(tcp
));
588 if (DB_TYPE(mp
) == M_PCPROTO
&&
589 MBLKL(mp
) == sizeof (struct T_capability_req
) &&
590 car
->PRIM_type
== T_CAPABILITY_REQ
) {
591 tcp_capability_req(tcp
, mp
);
600 tcp_wput_fallback(queue_t
*wq
, mblk_t
*mp
)
603 cmn_err(CE_CONT
, "tcp_wput_fallback: Message during fallback \n");
609 * Called by tcp_wput() to handle miscellaneous non-M_DATA messages.
613 tcp_wput_nondata(void *arg
, mblk_t
*mp
, void *arg2
, ip_recv_attr_t
*dummy
)
615 conn_t
*connp
= (conn_t
*)arg
;
616 tcp_t
*tcp
= connp
->conn_tcp
;
618 ASSERT(DB_TYPE(mp
) != M_IOCTL
);
620 * TCP is D_MP and qprocsoff() is done towards the end of the tcp_close.
621 * Once the close starts, streamhead and sockfs will not let any data
622 * packets come down (close ensures that there are no threads using the
623 * queue and no new threads will come down) but since qprocsoff()
624 * hasn't happened yet, a M_FLUSH or some non data message might
625 * get reflected back (in response to our own FLUSHRW) and get
626 * processed after tcp_close() is done. The conn would still be valid
627 * because a ref would have been added, but we need to check the state
628 * before actually processing the packet.
630 if (TCP_IS_DETACHED(tcp
) || (tcp
->tcp_state
== TCPS_CLOSED
)) {
635 switch (DB_TYPE(mp
)) {
637 tcp_wput_iocdata(tcp
, mp
);
640 tcp_wput_flush(tcp
, mp
);
643 ip_wput_nondata(connp
->conn_wq
, mp
);
648 /* tcp_wput_flush is called by tcp_wput_nondata to handle M_FLUSH messages. */
650 tcp_wput_flush(tcp_t
*tcp
, mblk_t
*mp
)
652 uchar_t fval
= *mp
->b_rptr
;
654 conn_t
*connp
= tcp
->tcp_connp
;
655 queue_t
*q
= connp
->conn_wq
;
657 /* TODO: How should flush interact with urgent data? */
658 if ((fval
& FLUSHW
) && tcp
->tcp_xmit_head
!= NULL
&&
659 !(tcp
->tcp_valid_bits
& TCP_URG_VALID
)) {
661 * Flush only data that has not yet been put on the wire. If
662 * we flush data that we have already transmitted, life, as we
663 * know it, may come to an end.
665 tail
= tcp
->tcp_xmit_tail
;
666 tail
->b_wptr
-= tcp
->tcp_xmit_tail_unsent
;
667 tcp
->tcp_xmit_tail_unsent
= 0;
669 if (tail
->b_wptr
!= tail
->b_rptr
)
672 mblk_t
**excess
= &tcp
->tcp_xmit_head
;
674 mblk_t
*mp1
= *excess
;
677 tcp
->tcp_xmit_tail
= mp1
;
678 tcp
->tcp_xmit_last
= mp1
;
679 excess
= &mp1
->b_cont
;
682 tcp_close_mpp(&tail
);
683 if (tcp
->tcp_snd_zcopy_aware
)
684 tcp_zcopy_notify(tcp
);
687 * We have no unsent data, so unsent must be less than
688 * conn_sndlowat, so re-enable flow.
690 mutex_enter(&tcp
->tcp_non_sq_lock
);
691 if (tcp
->tcp_flow_stopped
) {
694 mutex_exit(&tcp
->tcp_non_sq_lock
);
697 * TODO: you can't just flush these, you have to increase rwnd for one
698 * thing. For another, how should urgent data interact?
701 *mp
->b_rptr
= fval
& ~FLUSHW
;
710 * tcp_wput_iocdata is called by tcp_wput_nondata to handle all M_IOCDATA
714 tcp_wput_iocdata(tcp_t
*tcp
, mblk_t
*mp
)
717 struct iocblk
*iocp
= (struct iocblk
*)mp
->b_rptr
;
718 STRUCT_HANDLE(strbuf
, sb
);
720 conn_t
*connp
= tcp
->tcp_connp
;
721 queue_t
*q
= connp
->conn_wq
;
723 /* Make sure it is one of ours. */
724 switch (iocp
->ioc_cmd
) {
730 * If the conn is closing, then error the ioctl here. Otherwise
731 * use the CONN_IOCTLREF_* macros to hold off tcp_close until
734 mutex_enter(&connp
->conn_lock
);
735 if (connp
->conn_state_flags
& CONN_CLOSING
) {
736 mutex_exit(&connp
->conn_lock
);
737 iocp
->ioc_error
= EINVAL
;
738 mp
->b_datap
->db_type
= M_IOCNAK
;
744 CONN_INC_IOCTLREF_LOCKED(connp
);
745 ip_wput_nondata(q
, mp
);
746 CONN_DEC_IOCTLREF(connp
);
749 switch (mi_copy_state(q
, mp
, &mp1
)) {
752 case MI_COPY_CASE(MI_COPY_IN
, 1):
754 case MI_COPY_CASE(MI_COPY_OUT
, 1):
755 /* Copy out the strbuf. */
758 case MI_COPY_CASE(MI_COPY_OUT
, 2):
760 mi_copy_done(q
, mp
, 0);
763 mi_copy_done(q
, mp
, EPROTO
);
766 /* Check alignment of the strbuf */
767 if (!OK_32PTR(mp1
->b_rptr
)) {
768 mi_copy_done(q
, mp
, EINVAL
);
772 STRUCT_SET_HANDLE(sb
, iocp
->ioc_flag
, (void *)mp1
->b_rptr
);
774 if (connp
->conn_family
== AF_INET
)
775 addrlen
= sizeof (sin_t
);
777 addrlen
= sizeof (sin6_t
);
779 if (STRUCT_FGET(sb
, maxlen
) < addrlen
) {
780 mi_copy_done(q
, mp
, EINVAL
);
784 switch (iocp
->ioc_cmd
) {
788 if (tcp
->tcp_state
< TCPS_SYN_RCVD
) {
789 mi_copy_done(q
, mp
, ENOTCONN
);
794 mp1
= mi_copyout_alloc(q
, mp
, STRUCT_FGETP(sb
, buf
), addrlen
, B_TRUE
);
798 STRUCT_FSET(sb
, len
, addrlen
);
799 switch (((struct iocblk
*)mp
->b_rptr
)->ioc_cmd
) {
801 (void) conn_getsockname(connp
, (struct sockaddr
*)mp1
->b_wptr
,
805 (void) conn_getpeername(connp
, (struct sockaddr
*)mp1
->b_wptr
,
809 mp1
->b_wptr
+= addrlen
;
810 /* Copy out the address */
815 * tcp_wput_ioctl is called by tcp_wput(), through the squeue, to handle M_IOCTL
820 tcp_wput_ioctl(void *arg
, mblk_t
*mp
, void *arg2
, ip_recv_attr_t
*dummy
)
822 conn_t
*connp
= (conn_t
*)arg
;
823 tcp_t
*tcp
= connp
->conn_tcp
;
824 queue_t
*q
= connp
->conn_wq
;
827 ASSERT(DB_TYPE(mp
) == M_IOCTL
);
829 * Try and ASSERT the minimum possible references on the
830 * conn early enough. Since we are executing on write side,
831 * the connection is obviously not detached and that means
832 * there is a ref each for TCP and IP. Since we are behind
833 * the squeue, the minimum references needed are 3. If the
834 * conn is in classifier hash list, there should be an
835 * extra ref for that (we check both the possibilities).
837 ASSERT((connp
->conn_fanout
!= NULL
&& connp
->conn_ref
>= 4) ||
838 (connp
->conn_fanout
== NULL
&& connp
->conn_ref
>= 3));
840 iocp
= (struct iocblk
*)mp
->b_rptr
;
841 switch (iocp
->ioc_cmd
) {
842 case _SIOCSOCKFALLBACK
:
844 * Either sockmod is about to be popped and the socket
845 * would now be treated as a plain stream, or a module
846 * is about to be pushed so we could no longer use read-
847 * side synchronous streams for fused loopback tcp.
848 * Drain any queued data and disable direct sockfs
849 * interface from now on.
851 if (!tcp
->tcp_issocket
) {
852 DB_TYPE(mp
) = M_IOCNAK
;
853 iocp
->ioc_error
= EINVAL
;
855 tcp_use_pure_tpi(tcp
);
856 DB_TYPE(mp
) = M_IOCACK
;
866 * If the conn is closing, then error the ioctl here. Otherwise bump the
867 * conn_ioctlref to hold off tcp_close until we're done here.
869 mutex_enter(&(connp
)->conn_lock
);
870 if ((connp
)->conn_state_flags
& CONN_CLOSING
) {
871 mutex_exit(&(connp
)->conn_lock
);
872 iocp
->ioc_error
= EINVAL
;
873 mp
->b_datap
->db_type
= M_IOCNAK
;
879 CONN_INC_IOCTLREF_LOCKED(connp
);
880 ip_wput_nondata(q
, mp
);
881 CONN_DEC_IOCTLREF(connp
);
885 * This routine is called by tcp_wput() to handle all TPI requests.
889 tcp_wput_proto(void *arg
, mblk_t
*mp
, void *arg2
, ip_recv_attr_t
*dummy
)
891 conn_t
*connp
= (conn_t
*)arg
;
892 tcp_t
*tcp
= connp
->conn_tcp
;
893 union T_primitives
*tprim
= (union T_primitives
*)mp
->b_rptr
;
899 * Try and ASSERT the minimum possible references on the
900 * conn early enough. Since we are executing on write side,
901 * the connection is obviously not detached and that means
902 * there is a ref each for TCP and IP. Since we are behind
903 * the squeue, the minimum references needed are 3. If the
904 * conn is in classifier hash list, there should be an
905 * extra ref for that (we check both the possibilities).
907 ASSERT((connp
->conn_fanout
!= NULL
&& connp
->conn_ref
>= 4) ||
908 (connp
->conn_fanout
== NULL
&& connp
->conn_ref
>= 3));
911 ASSERT((uintptr_t)(mp
->b_wptr
- rptr
) <= (uintptr_t)INT_MAX
);
912 if ((mp
->b_wptr
- rptr
) >= sizeof (t_scalar_t
)) {
913 type
= ((union T_primitives
*)rptr
)->type
;
914 if (type
== T_EXDATA_REQ
) {
915 tcp_output_urgent(connp
, mp
, arg2
, NULL
);
916 } else if (type
!= T_DATA_REQ
) {
917 goto non_urgent_data
;
919 /* TODO: options, flags, ... from user */
920 /* Set length to zero for reclamation below */
921 tcp_wput_data(tcp
, mp
->b_cont
, B_TRUE
);
926 if (connp
->conn_debug
) {
927 (void) strlog(TCP_MOD_ID
, 0, 1, SL_ERROR
|SL_TRACE
,
928 "tcp_wput_proto, dropping one...");
936 switch ((int)tprim
->type
) {
937 case O_T_BIND_REQ
: /* bind request */
938 case T_BIND_REQ
: /* new semantics bind request */
939 tcp_tpi_bind(tcp
, mp
);
941 case T_UNBIND_REQ
: /* unbind request */
942 tcp_tpi_unbind(tcp
, mp
);
944 case O_T_CONN_RES
: /* old connection response XXX */
945 case T_CONN_RES
: /* connection response */
946 tcp_tli_accept(tcp
, mp
);
948 case T_CONN_REQ
: /* connection request */
949 tcp_tpi_connect(tcp
, mp
);
951 case T_DISCON_REQ
: /* disconnect request */
952 tcp_disconnect(tcp
, mp
);
954 case T_CAPABILITY_REQ
:
955 tcp_capability_req(tcp
, mp
); /* capability request */
957 case T_INFO_REQ
: /* information request */
958 tcp_info_req(tcp
, mp
);
960 case T_SVR4_OPTMGMT_REQ
: /* manage options req */
963 * Note: no support for snmpcom_req() through new
964 * T_OPTMGMT_REQ. See comments in ip.c
968 * All Solaris components should pass a db_credp
969 * for this TPI message, hence we ASSERT.
970 * But in case there is some other M_PROTO that looks
971 * like a TPI message sent by some other kernel
972 * component, we check and return an error.
974 cr
= msg_getcred(mp
, NULL
);
977 tcp_err_ack(tcp
, mp
, TSYSERR
, EINVAL
);
981 * If EINPROGRESS is returned, the request has been queued
982 * for subsequent processing by ip_restart_optmgmt(), which
983 * will do the CONN_DEC_REF().
985 if ((int)tprim
->type
== T_SVR4_OPTMGMT_REQ
) {
986 svr4_optcom_req(connp
->conn_wq
, mp
, cr
, &tcp_opt_obj
);
988 tpi_optcom_req(connp
->conn_wq
, mp
, cr
, &tcp_opt_obj
);
992 case T_UNITDATA_REQ
: /* unitdata request */
993 tcp_err_ack(tcp
, mp
, TNOTSUPPORT
, 0);
995 case T_ORDREL_REQ
: /* orderly release req */
1001 if (tcp_xmit_end(tcp
) != 0) {
1003 * We were crossing FINs and got a reset from
1004 * the other side. Just ignore it.
1006 if (connp
->conn_debug
) {
1007 (void) strlog(TCP_MOD_ID
, 0, 1,
1009 "tcp_wput_proto, T_ORDREL_REQ out of "
1011 tcp_display(tcp
, NULL
,
1012 DISP_ADDR_AND_PORT
));
1017 tcp_addr_req(tcp
, mp
);
1020 if (connp
->conn_debug
) {
1021 (void) strlog(TCP_MOD_ID
, 0, 1, SL_ERROR
|SL_TRACE
,
1022 "tcp_wput_proto, bogus TPI msg, type %d",
1026 * We used to M_ERROR. Sending TNOTSUPPORT gives the user
1029 tcp_err_ack(tcp
, mp
, TNOTSUPPORT
, 0);
1035 * Handle special out-of-band ioctl requests (see PSARC/2008/265).
1038 tcp_wput_cmdblk(queue_t
*q
, mblk_t
*mp
)
1041 mblk_t
*datamp
= mp
->b_cont
;
1042 conn_t
*connp
= Q_TO_CONN(q
);
1043 tcp_t
*tcp
= connp
->conn_tcp
;
1044 cmdblk_t
*cmdp
= (cmdblk_t
*)mp
->b_rptr
;
1046 if (datamp
== NULL
|| MBLKL(datamp
) < cmdp
->cb_len
) {
1047 cmdp
->cb_error
= EPROTO
;
1052 data
= datamp
->b_rptr
;
1054 switch (cmdp
->cb_cmd
) {
1055 case TI_GETPEERNAME
:
1056 if (tcp
->tcp_state
< TCPS_SYN_RCVD
)
1057 cmdp
->cb_error
= ENOTCONN
;
1059 cmdp
->cb_error
= conn_getpeername(connp
, data
,
1063 cmdp
->cb_error
= conn_getsockname(connp
, data
, &cmdp
->cb_len
);
1066 cmdp
->cb_error
= EINVAL
;
1074 * The TCP fast path write put procedure.
1075 * NOTE: the logic of the fast path is duplicated from tcp_wput_data()
1079 tcp_output(void *arg
, mblk_t
*mp
, void *arg2
, ip_recv_attr_t
*dummy
)
1095 conn_t
*connp
= (conn_t
*)arg
;
1096 tcp_t
*tcp
= connp
->conn_tcp
;
1098 tcp_stack_t
*tcps
= tcp
->tcp_tcps
;
1099 ip_xmit_attr_t
*ixa
;
1103 * Try and ASSERT the minimum possible references on the
1104 * conn early enough. Since we are executing on write side,
1105 * the connection is obviously not detached and that means
1106 * there is a ref each for TCP and IP. Since we are behind
1107 * the squeue, the minimum references needed are 3. If the
1108 * conn is in classifier hash list, there should be an
1109 * extra ref for that (we check both the possibilities).
1111 ASSERT((connp
->conn_fanout
!= NULL
&& connp
->conn_ref
>= 4) ||
1112 (connp
->conn_fanout
== NULL
&& connp
->conn_ref
>= 3));
1114 ASSERT(DB_TYPE(mp
) == M_DATA
);
1115 msize
= (mp
->b_cont
== NULL
) ? MBLKL(mp
) : msgdsize(mp
);
1117 mutex_enter(&tcp
->tcp_non_sq_lock
);
1118 tcp
->tcp_squeue_bytes
-= msize
;
1119 mutex_exit(&tcp
->tcp_non_sq_lock
);
1121 /* Bypass tcp protocol for fused tcp loopback */
1122 if (tcp
->tcp_fused
&& tcp_fuse_output(tcp
, mp
, msize
))
1127 * If ZEROCOPY has been turned off, try not to send any zero-copy message
1128 * down. Do backoff, now.
1130 if (tcp
->tcp_snd_zcopy_aware
&& !tcp
->tcp_snd_zcopy_on
)
1131 mp
= tcp_zcopy_backoff(tcp
, mp
, B_FALSE
);
1134 ASSERT((uintptr_t)(mp
->b_wptr
- mp
->b_rptr
) <= (uintptr_t)INT_MAX
);
1135 len
= (int)(mp
->b_wptr
- mp
->b_rptr
);
1138 * Criteria for fast path:
1141 * 2. single mblk in request
1142 * 3. connection established
1145 * 6. no tcp_valid bits
1147 if ((tcp
->tcp_unsent
!= 0) ||
1149 (mp
->b_cont
!= NULL
) ||
1150 (tcp
->tcp_state
!= TCPS_ESTABLISHED
) ||
1153 (tcp
->tcp_valid_bits
!= 0)) {
1154 tcp_wput_data(tcp
, mp
, B_FALSE
);
1158 ASSERT(tcp
->tcp_xmit_tail_unsent
== 0);
1159 ASSERT(tcp
->tcp_fin_sent
== 0);
1161 /* queue new packet onto retransmission queue */
1162 if (tcp
->tcp_xmit_head
== NULL
) {
1163 tcp
->tcp_xmit_head
= mp
;
1165 tcp
->tcp_xmit_last
->b_cont
= mp
;
1167 tcp
->tcp_xmit_last
= mp
;
1168 tcp
->tcp_xmit_tail
= mp
;
1170 /* find out how much we can send */
1174 * |--------------|-----------------|
1175 * tcp_suna tcp_snxt tcp_suna+tcp_swnd
1179 /* start sending from tcp_snxt */
1180 snxt
= tcp
->tcp_snxt
;
1183 * Check to see if this connection has been idle for some
1184 * time and no ACK is expected. If it is, we need to slow
1185 * start again to get back the connection's "self-clock" as
1186 * described in VJ's paper.
1188 * Reinitialize tcp_cwnd after idle.
1190 now
= LBOLT_FASTPATH
;
1191 if ((tcp
->tcp_suna
== snxt
) && !tcp
->tcp_localnet
&&
1192 (TICK_TO_MSEC(now
- tcp
->tcp_last_recv_time
) >= tcp
->tcp_rto
)) {
1193 TCP_SET_INIT_CWND(tcp
, mss
, tcps
->tcps_slow_start_after_idle
);
1196 usable
= tcp
->tcp_swnd
; /* tcp window size */
1197 if (usable
> tcp
->tcp_cwnd
)
1198 usable
= tcp
->tcp_cwnd
; /* congestion window smaller */
1199 usable
-= snxt
; /* subtract stuff already sent */
1200 suna
= tcp
->tcp_suna
;
1202 /* usable can be < 0 if the congestion window is smaller */
1204 /* Can't send complete M_DATA in one shot */
1208 mutex_enter(&tcp
->tcp_non_sq_lock
);
1209 if (tcp
->tcp_flow_stopped
&&
1210 TCP_UNSENT_BYTES(tcp
) <= connp
->conn_sndlowat
) {
1213 mutex_exit(&tcp
->tcp_non_sq_lock
);
1216 * determine if anything to send (Nagle).
1218 * 1. len < tcp_mss (i.e. small)
1219 * 2. unacknowledged data present
1220 * 3. len < nagle limit
1221 * 4. last packet sent < nagle limit (previous packet sent)
1223 if ((len
< mss
) && (snxt
!= suna
) &&
1224 (len
< (int)tcp
->tcp_naglim
) &&
1225 (tcp
->tcp_last_sent_len
< tcp
->tcp_naglim
)) {
1227 * This was the first unsent packet and normally
1228 * mss < xmit_hiwater so there is no need to worry
1229 * about flow control. The next packet will go
1230 * through the flow control check in tcp_wput_data().
1232 /* leftover work from above */
1233 tcp
->tcp_unsent
= len
;
1234 tcp
->tcp_xmit_tail_unsent
= len
;
1240 * len <= tcp->tcp_mss && len == unsent so no sender silly window. Can
1245 TCP_TIMER_RESTART(tcp
, tcp
->tcp_rto
);
1248 /* we have always sent something */
1249 tcp
->tcp_rack_cnt
= 0;
1251 tcp
->tcp_snxt
= snxt
+ len
;
1252 tcp
->tcp_rack
= tcp
->tcp_rnxt
;
1254 if ((mp1
= dupb(mp
)) == 0)
1256 mp
->b_prev
= (mblk_t
*)(uintptr_t)now
;
1257 mp
->b_next
= (mblk_t
*)(uintptr_t)snxt
;
1259 /* adjust tcp header information */
1260 tcpha
= tcp
->tcp_tcpha
;
1261 tcpha
->tha_flags
= (TH_ACK
|TH_PUSH
);
1263 sum
= len
+ connp
->conn_ht_ulp_len
+ connp
->conn_sum
;
1264 sum
= (sum
>> 16) + (sum
& 0xFFFF);
1265 tcpha
->tha_sum
= htons(sum
);
1267 tcpha
->tha_seq
= htonl(snxt
);
1269 TCPS_BUMP_MIB(tcps
, tcpOutDataSegs
);
1270 TCPS_UPDATE_MIB(tcps
, tcpOutDataBytes
, len
);
1271 BUMP_LOCAL(tcp
->tcp_obsegs
);
1273 /* Update the latest receive window size in TCP header. */
1274 tcpha
->tha_win
= htons(tcp
->tcp_rwnd
>> tcp
->tcp_rcv_ws
);
1276 tcp
->tcp_last_sent_len
= (ushort_t
)len
;
1278 plen
= len
+ connp
->conn_ht_iphc_len
;
1280 ixa
= connp
->conn_ixa
;
1281 ixa
->ixa_pktlen
= plen
;
1283 if (ixa
->ixa_flags
& IXAF_IS_IPV4
) {
1284 tcp
->tcp_ipha
->ipha_length
= htons(plen
);
1286 tcp
->tcp_ip6h
->ip6_plen
= htons(plen
- IPV6_HDR_LEN
);
1289 /* see if we need to allocate a mblk for the headers */
1290 hdrlen
= connp
->conn_ht_iphc_len
;
1291 rptr
= mp1
->b_rptr
- hdrlen
;
1293 if ((db
->db_ref
!= 2) || rptr
< db
->db_base
||
1294 (!OK_32PTR(rptr
))) {
1295 /* NOTE: we assume allocb returns an OK_32PTR */
1296 mp
= allocb(hdrlen
+ tcps
->tcps_wroff_xtra
, BPRI_MED
);
1303 /* Leave room for Link Level header */
1304 rptr
= &mp1
->b_rptr
[tcps
->tcps_wroff_xtra
];
1305 mp1
->b_wptr
= &rptr
[hdrlen
];
1309 /* Fill in the timestamp option. */
1310 if (tcp
->tcp_snd_ts_ok
) {
1311 uint32_t llbolt
= (uint32_t)LBOLT_FASTPATH
;
1314 (char *)tcpha
+ TCP_MIN_HEADER_LENGTH
+4);
1315 U32_TO_BE32(tcp
->tcp_ts_recent
,
1316 (char *)tcpha
+ TCP_MIN_HEADER_LENGTH
+8);
1318 ASSERT(connp
->conn_ht_ulp_len
== TCP_MIN_HEADER_LENGTH
);
1321 /* copy header into outgoing packet */
1322 dst
= (ipaddr_t
*)rptr
;
1323 src
= (ipaddr_t
*)connp
->conn_ht_iphc
;
1344 * Set the ECN info in the TCP header. Note that this
1345 * is not the template header.
1347 if (tcp
->tcp_ecn_ok
) {
1348 TCP_SET_ECT(tcp
, rptr
);
1350 tcpha
= (tcpha_t
*)(rptr
+ ixa
->ixa_ip_hdr_length
);
1351 if (tcp
->tcp_ecn_echo_on
)
1352 tcpha
->tha_flags
|= TH_ECE
;
1353 if (tcp
->tcp_cwr
&& !tcp
->tcp_ecn_cwr_sent
) {
1354 tcpha
->tha_flags
|= TH_CWR
;
1355 tcp
->tcp_ecn_cwr_sent
= B_TRUE
;
1359 if (tcp
->tcp_ip_forward_progress
) {
1360 tcp
->tcp_ip_forward_progress
= B_FALSE
;
1361 connp
->conn_ixa
->ixa_flags
|= IXAF_REACH_CONF
;
1363 connp
->conn_ixa
->ixa_flags
&= ~IXAF_REACH_CONF
;
1365 tcp_send_data(tcp
, mp1
);
1369 * If we ran out of memory, we pretend to have sent the packet
1370 * and that it was lost on the wire.
1376 /* leftover work from above */
1377 tcp
->tcp_unsent
= len
;
1378 tcp
->tcp_xmit_tail_unsent
= len
;
1379 tcp_wput_data(tcp
, NULL
, B_FALSE
);
/*
 * tcp_output_urgent: send path for a message carrying urgent data.
 *
 *   arg   - the conn_t for the connection (cast below); its conn_tcp is used.
 *   mp    - the message to transmit.
 *   arg2, dummy - not referenced in the visible code.
 *
 * Visible behaviour: the message size is taken with msgdsize(); len is
 * advanced by tcp_unsent and then tcp_snxt; TCP_URG_VALID is set in
 * tcp_valid_bits; if the connection is fused, tcp_fuse_output() gets the
 * first chance at the message; otherwise the message goes to
 * tcp_wput_data() with B_TRUE as its last argument.
 *
 * NOTE(review): this text comes from a lossy listing. Several original
 * lines (return type, braces, the declaration and initial value of len,
 * the bodies of the two if statements) are not present here; consult the
 * pristine file before relying on details.
 */
1384 tcp_output_urgent(void *arg
, mblk_t
*mp
, void *arg2
, ip_recv_attr_t
*dummy
)
1388 conn_t
*connp
= (conn_t
*)arg
;
1389 tcp_t
*tcp
= connp
->conn_tcp
;
/* Total data bytes in the message; also passed to tcp_fuse_output(). */
1391 msize
= msgdsize(mp
);
1400 * Try to force urgent data out on the wire. Even if we have unsent
1401 * data this will at least send the urgent flag.
1402 * XXX does not handle more flag correctly.
1404 len
+= tcp
->tcp_unsent
;
1405 len
+= tcp
->tcp_snxt
;
/* Record that urgent state is now valid for this connection. */
1407 tcp
->tcp_valid_bits
|= TCP_URG_VALID
;
1409 /* Bypass tcp protocol for fused tcp loopback */
1410 if (tcp
->tcp_fused
&& tcp_fuse_output(tcp
, mp
, msize
))
1413 /* Strip off the T_EXDATA_REQ if the data is from TPI */
1414 if (DB_TYPE(mp
) != M_DATA
) {
/* A non-M_DATA message is only expected on a STREAMS (TPI) endpoint. */
1416 ASSERT(!IPCL_IS_NONSTR(connp
));
1420 tcp_wput_data(tcp
, mp
, B_TRUE
);
1424 * Called by streams close routine via squeues when our client blows off her
1425 * descriptor, we take this to mean: "close the stream state NOW, close the tcp
1426 * connection politely" When SO_LINGER is set (with a non-zero linger time and
1427 * it is not a nonblocking socket) then this routine sleeps until the FIN is
1430 * NOTE: tcp_close potentially returns error when lingering.
1431 * However, the stream head currently does not pass these errors
1432 * to the application. 4.4BSD only returns EINTR and EWOULDBLOCK
1433 * errors to the application (from tsleep()) and not errors
1434 * like ECONNRESET caused by receiving a reset packet.
/*
 * tcp_close_output: close processing for a connection.
 *
 *   arg   - the conn_t being closed (cast below); its conn_tcp is used.
 *   mp, arg2, dummy - not referenced in the visible code.
 *
 * Visible sequence:
 *   1. Sanity-check conn_ref against conn_fanout and the socket flavour.
 *   2. With tcp_eager_lock held, if the endpoint still has entries counted
 *      in tcp_conn_req_cnt_q0 or tcp_conn_req_cnt_q, call
 *      tcp_eager_cleanup() and set tcp_wait_for_eagers.
 *   3. Clear tcp_lso, then switch on tcp_state. Abort cases choose a msg
 *      string; an unaccepted eager is unlinked from tcp_saved_listener and
 *      the listener reference is dropped.
 *   4. tcp_xmit_end() sends the FIN; if SO_LINGER applies, either
 *      EWOULDBLOCK is recorded or a linger timer is armed.
 *   5. Detach handling: timers are cancelled or stopped and tcp_detached is
 *      set; a TIME_WAIT connection is appended with tcp_time_wait_append().
 *   6. Abort handling: MIB counters are bumped, a TH_RST is sent through
 *      tcp_xmit_ctl(), then tcp_closei_local() and CONN_DEC_REF().
 *   7. Unless waiting for eagers, conn_rq/conn_wq are cleared and, for a
 *      non-STREAMS socket, the su_closed upcall is made.
 *   8. tcp_closed is set and tcp_closecv signalled under tcp_closelock.
 *
 * NOTE(review): this text comes from a lossy listing. Case labels, gotos,
 * braces and some statements are not present here, so the exact control
 * flow between these steps must be confirmed against the pristine file.
 */
1439 tcp_close_output(void *arg
, mblk_t
*mp
, void *arg2
, ip_recv_attr_t
*dummy
)
1442 conn_t
*connp
= (conn_t
*)arg
;
1443 tcp_t
*tcp
= connp
->conn_tcp
;
1445 tcp_stack_t
*tcps
= tcp
->tcp_tcps
;
1448 * When a non-STREAMS socket is being closed, it does not always
1449 * stick around waiting for tcp_close_output to run and can therefore
1450 * have dropped a reference already. So adjust the asserts accordingly.
1452 ASSERT((connp
->conn_fanout
!= NULL
&&
1453 connp
->conn_ref
>= (IPCL_IS_NONSTR(connp
) ? 3 : 4)) ||
1454 (connp
->conn_fanout
== NULL
&&
1455 connp
->conn_ref
>= (IPCL_IS_NONSTR(connp
) ? 2 : 3)));
/* The pending-connection counters are examined with tcp_eager_lock held. */
1457 mutex_enter(&tcp
->tcp_eager_lock
);
1458 if (tcp
->tcp_conn_req_cnt_q0
!= 0 || tcp
->tcp_conn_req_cnt_q
!= 0) {
1460 * Cleanup for listener. For non-STREAM sockets sockfs will
1461 * close all the eagers on 'q', so in that case only deal
1464 tcp_eager_cleanup(tcp
, IPCL_IS_NONSTR(connp
) ? 1 : 0);
1465 tcp
->tcp_wait_for_eagers
= 1;
1467 mutex_exit(&tcp
->tcp_eager_lock
);
1469 tcp
->tcp_lso
= B_FALSE
;
1472 switch (tcp
->tcp_state
) {
1477 if (tcp
->tcp_listener
!= NULL
) {
1478 ASSERT(IPCL_IS_NONSTR(connp
));
1480 * Unlink from the listener and drop the reference
1481 * put on it by the eager. tcp_closei_local will not
1482 * do it because tcp_tconnind_started is TRUE.
1484 mutex_enter(&tcp
->tcp_saved_listener
->tcp_eager_lock
);
1485 tcp_eager_unlink(tcp
);
1486 mutex_exit(&tcp
->tcp_saved_listener
->tcp_eager_lock
);
1487 CONN_DEC_REF(tcp
->tcp_saved_listener
->tcp_connp
);
1493 msg
= "tcp_close, during connect";
1497 * Close during the connect 3-way handshake
1498 * but here there may or may not be pending data
1499 * already on queue. Process almost same as in
1500 * the ESTABLISHED state.
1508 * If SO_LINGER has set a zero linger time, abort the
1509 * connection with a reset.
1511 if (connp
->conn_linger
&& connp
->conn_lingertime
== 0) {
1512 msg
= "tcp_close, zero lingertime";
1517 * Abort connection if there is unread data queued.
1519 if (tcp
->tcp_rcv_list
|| tcp
->tcp_reass_head
) {
1520 msg
= "tcp_close, unread data";
1525 * Abort connection if it is being closed without first
1526 * being accepted. This can happen if a listening non-STREAM
1527 * socket wants to get rid of the socket, for example, if the
1528 * listener is closing.
1530 if (tcp
->tcp_listener
!= NULL
) {
1531 ASSERT(IPCL_IS_NONSTR(connp
));
1532 msg
= "tcp_close, close before accept";
1535 * Unlink from the listener and drop the reference
1536 * put on it by the eager. tcp_closei_local will not
1537 * do it because tcp_tconnind_started is TRUE.
1539 mutex_enter(&tcp
->tcp_saved_listener
->tcp_eager_lock
);
1540 tcp_eager_unlink(tcp
);
1541 mutex_exit(&tcp
->tcp_saved_listener
->tcp_eager_lock
);
1542 CONN_DEC_REF(tcp
->tcp_saved_listener
->tcp_connp
);
1547 * Transmit the FIN before detaching the tcp_t.
1548 * After tcp_detach returns this queue/perimeter
1549 * no longer owns the tcp_t thus others can modify it.
/* The result of tcp_xmit_end() is deliberately discarded here. */
1551 (void) tcp_xmit_end(tcp
);
1554 * If lingering on close then wait until the fin is acked,
1555 * the SO_LINGER time passes, or a reset is sent/received.
1557 if (connp
->conn_linger
&& connp
->conn_lingertime
> 0 &&
1558 !(tcp
->tcp_fin_acked
) &&
1559 tcp
->tcp_state
>= TCPS_ESTABLISHED
) {
1560 if (tcp
->tcp_closeflags
& (FNDELAY
|FNONBLOCK
)) {
1561 tcp
->tcp_client_errno
= EWOULDBLOCK
;
1562 } else if (tcp
->tcp_client_errno
== 0) {
1564 ASSERT(tcp
->tcp_linger_tid
== 0);
1566 /* conn_lingertime is in sec. */
1567 tcp
->tcp_linger_tid
= TCP_TIMER(tcp
,
1568 tcp_close_linger_timeout
,
1569 connp
->conn_lingertime
* MILLISEC
);
1571 /* tcp_close_linger_timeout will finish close */
/* A zero timer id is reported to the closer as ENOSR. */
1572 if (tcp
->tcp_linger_tid
== 0)
1573 tcp
->tcp_client_errno
= ENOSR
;
1579 * Check if we need to detach or just close
1582 if (tcp
->tcp_state
<= TCPS_LISTEN
)
1587 * Make sure that no other thread will access the conn_rq of
1588 * this instance (through lookups etc.) as conn_rq will go
1591 tcp_acceptor_hash_remove(tcp
);
1593 mutex_enter(&tcp
->tcp_non_sq_lock
);
1594 if (tcp
->tcp_flow_stopped
) {
1597 mutex_exit(&tcp
->tcp_non_sq_lock
);
1599 if (tcp
->tcp_timer_tid
!= 0) {
1600 delta
= TCP_TIMER_CANCEL(tcp
, tcp
->tcp_timer_tid
);
1601 tcp
->tcp_timer_tid
= 0;
1604 * Need to cancel those timers which will not be used when
1605 * TCP is detached. This has to be done before the conn_wq
1608 tcp_timers_stop(tcp
);
1610 tcp
->tcp_detached
= B_TRUE
;
1611 if (tcp
->tcp_state
== TCPS_TIME_WAIT
) {
1612 tcp_time_wait_append(tcp
);
1613 TCP_DBGSTAT(tcps
, tcp_detach_time_wait
);
1614 ASSERT(connp
->conn_ref
>=
1615 (IPCL_IS_NONSTR(connp
) ? 2 : 3));
1620 * If delta is zero the timer event wasn't executed and was
1621 * successfully canceled. In this case we need to restart it
1622 * with the minimal delta possible.
1625 tcp
->tcp_timer_tid
= TCP_TIMER(tcp
, tcp_timer
,
1628 ASSERT(connp
->conn_ref
>= (IPCL_IS_NONSTR(connp
) ? 2 : 3));
1632 /* Detach did not complete. Still need to remove q from stream. */
1634 if (tcp
->tcp_state
== TCPS_ESTABLISHED
||
1635 tcp
->tcp_state
== TCPS_CLOSE_WAIT
)
1636 TCPS_BUMP_MIB(tcps
, tcpEstabResets
);
1637 if (tcp
->tcp_state
== TCPS_SYN_SENT
||
1638 tcp
->tcp_state
== TCPS_SYN_RCVD
)
1639 TCPS_BUMP_MIB(tcps
, tcpAttemptFails
);
1640 tcp_xmit_ctl(msg
, tcp
, tcp
->tcp_snxt
, 0, TH_RST
);
1643 tcp_closei_local(tcp
);
1644 CONN_DEC_REF(connp
);
1645 ASSERT(connp
->conn_ref
>= (IPCL_IS_NONSTR(connp
) ? 1 : 2));
1649 * Don't change the queues in the case of a listener that has
1650 * eagers in its q or q0. It could surprise the eagers.
1651 * Instead wait for the eagers outside the squeue.
1653 * For non-STREAMS sockets tcp_wait_for_eagers implies that
1654 * we should delay the su_closed upcall until all eagers have
1655 * dropped their references.
1657 if (!tcp
->tcp_wait_for_eagers
) {
1658 tcp
->tcp_detached
= B_TRUE
;
1659 connp
->conn_rq
= NULL
;
1660 connp
->conn_wq
= NULL
;
1662 /* non-STREAM socket, release the upper handle */
1663 if (IPCL_IS_NONSTR(connp
)) {
1664 ASSERT(connp
->conn_upper_handle
!= NULL
);
1665 (*connp
->conn_upcalls
->su_closed
)
1666 (connp
->conn_upper_handle
);
/* After the upcall the handle and upcall table are cleared. */
1667 connp
->conn_upper_handle
= NULL
;
1668 connp
->conn_upcalls
= NULL
;
1672 /* Signal tcp_close() to finish closing. */
1673 mutex_enter(&tcp
->tcp_closelock
);
1674 tcp
->tcp_closed
= 1;
1675 cv_signal(&tcp
->tcp_closecv
);
1676 mutex_exit(&tcp
->tcp_closelock
);
/*
 * tcp_shutdown_output: shut down the send side of a connection.
 *
 *   arg   - the conn_t (cast below); its conn_tcp is used.
 *   mp, arg2, dummy - not referenced in the visible code.
 *
 * Calls tcp_xmit_end(). A non-zero result is not propagated in the visible
 * code; when conn_debug is set it is only logged through strlog() together
 * with the tcp_display() rendering of the connection.
 *
 * NOTE(review): this text comes from a lossy listing. The return type,
 * braces and part of the strlog() argument list are not present here.
 */
1681 tcp_shutdown_output(void *arg
, mblk_t
*mp
, void *arg2
, ip_recv_attr_t
*dummy
)
1683 conn_t
*connp
= (conn_t
*)arg
;
1684 tcp_t
*tcp
= connp
->conn_tcp
;
/* Non-zero from tcp_xmit_end() is the error case handled below. */
1691 if (tcp_xmit_end(tcp
) != 0) {
1693 * We were crossing FINs and got a reset from
1694 * the other side. Just ignore it.
1696 if (connp
->conn_debug
) {
1697 (void) strlog(TCP_MOD_ID
, 0, 1,
1699 "tcp_shutdown_output() out of state %s",
1700 tcp_display(tcp
, NULL
, DISP_ADDR_AND_PORT
));
1705 #pragma inline(tcp_send_data)
/*
 * tcp_send_data: hand a fully built segment to IP for transmission.
 *
 *   tcp - the connection; its tcp_connp supplies the transmit attributes.
 *   mp  - the segment; b_rptr is treated as the start of the IP header by
 *         the DTrace probe below.
 *
 * Visible behaviour: if the connection is zero-copy aware, zero-copy is
 * currently off and the transmit list is not yet clean, the drop is
 * reported with ip_drop_output(). Otherwise the tcp send probe fires, the
 * notify cookie is asserted to be this connection's tcp_t, and the packet
 * is passed to conn_ip_output(), whose result is discarded.
 *
 * NOTE(review): this text comes from a lossy listing. The return type,
 * braces and whatever follows ip_drop_output() in the drop branch are not
 * present here; confirm against the pristine file.
 */
1708 tcp_send_data(tcp_t
*tcp
, mblk_t
*mp
)
1710 conn_t
*connp
= tcp
->tcp_connp
;
1713 * Check here to avoid sending zero-copy message down to IP when
1714 * ZEROCOPY capability has turned off. We only need to deal with
1715 * the race condition between sockfs and the notification here.
1716 * Since we have tried to backoff the tcp_xmit_head when turning
1717 * zero-copy off and new messages in tcp_output(), we simply drop
1718 * the dup'ed packet here and let tcp retransmit, if tcp_xmit_zc_clean
1721 if (tcp
->tcp_snd_zcopy_aware
&& !tcp
->tcp_snd_zcopy_on
&&
1722 !tcp
->tcp_xmit_zc_clean
) {
1723 ip_drop_output("TCP ZC was disabled but not clean", mp
, NULL
);
1728 DTRACE_TCP5(send
, mblk_t
*, NULL
, ip_xmit_attr_t
*, connp
->conn_ixa
,
1729 __dtrace_tcp_void_ip_t
*, mp
->b_rptr
, tcp_t
*, tcp
,
1730 __dtrace_tcp_tcph_t
*,
1731 &mp
->b_rptr
[connp
->conn_ixa
->ixa_ip_hdr_length
]);
1733 ASSERT(connp
->conn_ixa
->ixa_notify_cookie
== connp
->conn_tcp
);
1734 (void) conn_ip_output(mp
, connp
->conn_ixa
);
/*
 * tcp_send_synack: transmit a previously queued SYN|ACK for an eager.
 *
 *   arg   - the eager's conn_t (cast below); conn_tcp and conn_ixa are used.
 *   mp    - the SYN|ACK message to send.
 *   arg2, dummy - not referenced in the visible code.
 *
 * The connection state is checked for TCPS_CLOSED first. ixa_pktlen is then
 * recomputed from the message with msgdsize() and the packet is passed to
 * conn_ip_output(), whose result is discarded.
 *
 * NOTE(review): this text comes from a lossy listing. The return type,
 * braces and the body of the TCPS_CLOSED branch are not present here.
 */
1739 tcp_send_synack(void *arg
, mblk_t
*mp
, void *arg2
, ip_recv_attr_t
*dummy
)
1741 conn_t
*econnp
= (conn_t
*)arg
;
1742 tcp_t
*tcp
= econnp
->conn_tcp
;
1743 ip_xmit_attr_t
*ixa
= econnp
->conn_ixa
;
1745 /* Guard against a RST having blown it away while on the squeue */
1746 if (tcp
->tcp_state
== TCPS_CLOSED
) {
1752 * In the off-chance that the eager received and responded to
1753 * some other packet while the SYN|ACK was queued, we recalculate
1754 * the ixa_pktlen. It would be better to fix the SYN/accept
1755 * multithreading scheme to avoid this complexity.
1757 ixa
->ixa_pktlen
= msgdsize(mp
);
1758 (void) conn_ip_output(mp
, ixa
);
1762 * tcp_send() is called by tcp_wput_data() and returns one of the following:
1764 * -1 = failed allocation.
1765 * 0 = We've either successfully sent data, or our usable send window is too
1766 * small and we'd rather wait until later before sending again.
/*
 * tcp_send: build and transmit as many data segments as the window allows.
 *
 *   tcp           - the connection.
 *   mss           - segment size used for the length, LSO and silly-window
 *                   decisions below.
 *   total_hdr_len - header length added to len before it is stored in
 *                   ixa_pktlen and in the IP length field.
 *   tcp_hdr_len   - TCP header length folded into the checksum seed.
 *   num_sack_blk  - forwarded unchanged to tcp_fill_header().
 *   usable        - in/out: remaining send window; reduced by each len.
 *   snxt          - in/out: next sequence number; advanced by each len.
 *   tail_unsent   - in/out: unsent byte count in *xmit_tail.
 *   xmit_tail     - in/out: mblk the next data is taken from; walked along
 *                   b_cont as mblks are consumed.
 *   local_time    - stored in b_prev of each transmitted mblk and, cast to
 *                   clock_t, passed to tcp_fill_header().
 *
 * Returns -1 at each visible allocation-failure exit. Other exit paths are
 * not visible in this listing.
 *
 * Segments that need special flags (any tcp_valid_bits other than a
 * not-yet-due TCP_FSS_VALID) are built by tcp_xmit_mp(). Ordinary segments
 * dupb() the tail mblk and either prepend the header in place or allocb() a
 * separate header mblk. When LSO is in use the mblk is tagged with
 * lso_info_set() and ixa_fragsize/ixa_extra_ident are adjusted around the
 * call to tcp_send_data(), then restored.
 *
 * NOTE(review): this text comes from a lossy listing. The enclosing loop,
 * braces, else branches, several declarations and some statements are not
 * present here; confirm control flow against the pristine file.
 */
1769 tcp_send(tcp_t
*tcp
, const int mss
, const int total_hdr_len
,
1770 const int tcp_hdr_len
, const int num_sack_blk
, int *usable
,
1771 uint_t
*snxt
, int *tail_unsent
, mblk_t
**xmit_tail
, mblk_t
*local_time
)
1773 int num_lso_seg
= 1;
1775 boolean_t do_lso_send
= B_FALSE
;
1776 tcp_stack_t
*tcps
= tcp
->tcp_tcps
;
1777 conn_t
*connp
= tcp
->tcp_connp
;
1778 ip_xmit_attr_t
*ixa
= connp
->conn_ixa
;
1781 * Check LSO possibility. The value of tcp->tcp_lso indicates whether
1782 * the underlying connection is LSO capable. Will check whether having
1783 * enough available data to initiate LSO transmission in the for(){}
/* LSO is only attempted when no valid bit other than TCP_FSS_VALID is set. */
1786 if (tcp
->tcp_lso
&& (tcp
->tcp_valid_bits
& ~TCP_FSS_VALID
) == 0)
1787 do_lso_send
= B_TRUE
;
1798 * Calculate the maximum payload length we can send at one
1803 * Determine whether or not it's possible to do LSO,
1804 * and if so, how much data we can send.
1806 if ((*usable
- 1) / mss
>= 1) {
1807 lso_usable
= MIN(tcp
->tcp_lso_max
, *usable
);
1808 num_lso_seg
= lso_usable
/ mss
;
1809 if (lso_usable
% mss
) {
1811 tcp
->tcp_last_sent_len
= (ushort_t
)
1814 tcp
->tcp_last_sent_len
= (ushort_t
)mss
;
1817 do_lso_send
= B_FALSE
;
1823 ASSERT(num_lso_seg
<= IP_MAXPACKET
/ mss
+ 1);
1826 if (len
> *usable
) {
1827 ASSERT(do_lso_send
== B_FALSE
);
1831 /* Terminate the loop */
1832 break; /* success; too small */
1835 * Sender silly-window avoidance.
1836 * Ignore this if we are going to send a
1837 * zero window probe out.
1839 * TODO: force data into microscopic window?
1840 * ==> (!pushed || (unsent > usable))
1842 if (len
< (tcp
->tcp_max_swnd
>> 1) &&
1843 (tcp
->tcp_unsent
- (*snxt
- tcp
->tcp_snxt
)) > len
&&
1844 !((tcp
->tcp_valid_bits
& TCP_URG_VALID
) &&
1845 len
== 1) && (! tcp
->tcp_zero_win_probe
)) {
1847 * If the retransmit timer is not running
1848 * we start it so that we will retransmit
1849 * in the case when the receiver has
1850 * decremented the window.
1852 if (*snxt
== tcp
->tcp_snxt
&&
1853 *snxt
== tcp
->tcp_suna
) {
1855 * We are not supposed to send
1856 * anything. So let's wait a little
1857 * bit longer before breaking SWS
1860 * What should the value be?
1861 * Suggestion: MAX(init rexmit time,
1864 TCP_TIMER_RESTART(tcp
, tcp
->tcp_rto
);
1866 break; /* success; too small */
/* From here on tcpha refers to the connection's template TCP header. */
1870 tcpha
= tcp
->tcp_tcpha
;
1873 * The reason to adjust len here is that we need to set flags
1874 * and calculate checksum.
1879 *usable
-= len
; /* Approximate - can be adjusted later */
1881 tcpha
->tha_flags
= TH_ACK
;
1883 tcpha
->tha_flags
= (TH_ACK
| TH_PUSH
);
1886 * Prime pump for IP's checksumming on our behalf.
1887 * Include the adjustment for a source route if any.
1888 * In case of LSO, the partial pseudo-header checksum should
1889 * exclusive TCP length, so zero tha_sum before IP calculate
1890 * pseudo-header checksum for partial checksum offload.
1895 sum
= len
+ tcp_hdr_len
+ connp
->conn_sum
;
/* Fold the carry out of the upper 16 bits back into the low 16 bits. */
1896 sum
= (sum
>> 16) + (sum
& 0xFFFF);
1898 tcpha
->tha_sum
= htons(sum
);
1899 tcpha
->tha_seq
= htonl(*snxt
);
1902 * Branch off to tcp_xmit_mp() if any of the VALID bits is
1903 * set. For the case when TCP_FSS_VALID is the only valid
1904 * bit (normal active close), branch off only when we think
1905 * that the FIN flag needs to be set. Note for this case,
1906 * that (snxt + len) may not reflect the actual seg_len,
1907 * as len may be further reduced in tcp_xmit_mp(). If len
1908 * gets modified, we will end up here again.
1910 if (tcp
->tcp_valid_bits
!= 0 &&
1911 (tcp
->tcp_valid_bits
!= TCP_FSS_VALID
||
1912 ((*snxt
+ len
) == tcp
->tcp_fss
))) {
1914 uint32_t prev_snxt
= tcp
->tcp_snxt
;
1916 if (*tail_unsent
== 0) {
1917 ASSERT((*xmit_tail
)->b_cont
!= NULL
);
1918 *xmit_tail
= (*xmit_tail
)->b_cont
;
1919 prev_rptr
= (*xmit_tail
)->b_rptr
;
1920 *tail_unsent
= (int)((*xmit_tail
)->b_wptr
-
1921 (*xmit_tail
)->b_rptr
);
1923 prev_rptr
= (*xmit_tail
)->b_rptr
;
1924 (*xmit_tail
)->b_rptr
= (*xmit_tail
)->b_wptr
-
1927 mp
= tcp_xmit_mp(tcp
, *xmit_tail
, len
, NULL
, NULL
,
1928 *snxt
, B_FALSE
, (uint32_t *)&len
, B_FALSE
);
1929 /* Restore tcp_snxt so we get amount sent right. */
1930 tcp
->tcp_snxt
= prev_snxt
;
1931 if (prev_rptr
== (*xmit_tail
)->b_rptr
) {
1933 * If the previous timestamp is still in use,
1934 * don't stomp on it.
1936 if ((*xmit_tail
)->b_next
== NULL
) {
1937 (*xmit_tail
)->b_prev
= local_time
;
1938 (*xmit_tail
)->b_next
=
1939 (mblk_t
*)(uintptr_t)(*snxt
);
1942 (*xmit_tail
)->b_rptr
= prev_rptr
;
1949 if (len
<= mss
) /* LSO is unusable (!do_lso_send) */
1950 tcp
->tcp_last_sent_len
= (ushort_t
)len
;
1951 while (mp1
->b_cont
) {
1952 *xmit_tail
= (*xmit_tail
)->b_cont
;
1953 (*xmit_tail
)->b_prev
= local_time
;
1954 (*xmit_tail
)->b_next
=
1955 (mblk_t
*)(uintptr_t)(*snxt
);
1959 *tail_unsent
= (*xmit_tail
)->b_wptr
- mp1
->b_wptr
;
1960 BUMP_LOCAL(tcp
->tcp_obsegs
);
1961 TCPS_BUMP_MIB(tcps
, tcpOutDataSegs
);
1962 TCPS_UPDATE_MIB(tcps
, tcpOutDataBytes
, len
);
1963 tcp_send_data(tcp
, mp
);
1967 *snxt
+= len
; /* Adjust later if we don't send all of len */
1968 TCPS_BUMP_MIB(tcps
, tcpOutDataSegs
);
1969 TCPS_UPDATE_MIB(tcps
, tcpOutDataBytes
, len
);
1972 /* Are the bytes above us in flight? */
1973 rptr
= (*xmit_tail
)->b_wptr
- *tail_unsent
;
1974 if (rptr
!= (*xmit_tail
)->b_rptr
) {
1975 *tail_unsent
-= len
;
1976 if (len
<= mss
) /* LSO is unusable */
1977 tcp
->tcp_last_sent_len
= (ushort_t
)len
;
1978 len
+= total_hdr_len
;
1979 ixa
->ixa_pktlen
= len
;
1981 if (ixa
->ixa_flags
& IXAF_IS_IPV4
) {
1982 tcp
->tcp_ipha
->ipha_length
= htons(len
);
1984 tcp
->tcp_ip6h
->ip6_plen
=
1985 htons(len
- IPV6_HDR_LEN
);
1988 mp
= dupb(*xmit_tail
);
1990 return (-1); /* out_of_mem */
1994 * If the old timestamp is no longer in use,
1995 * sample a new timestamp now.
1997 if ((*xmit_tail
)->b_next
== NULL
) {
1998 (*xmit_tail
)->b_prev
= local_time
;
1999 (*xmit_tail
)->b_next
=
2000 (mblk_t
*)(uintptr_t)(*snxt
-len
);
2005 *xmit_tail
= (*xmit_tail
)->b_cont
;
2006 ASSERT((uintptr_t)((*xmit_tail
)->b_wptr
-
2007 (*xmit_tail
)->b_rptr
) <= (uintptr_t)INT_MAX
);
2008 *tail_unsent
= (int)((*xmit_tail
)->b_wptr
-
2009 (*xmit_tail
)->b_rptr
);
2012 (*xmit_tail
)->b_prev
= local_time
;
2013 (*xmit_tail
)->b_next
= (mblk_t
*)(uintptr_t)(*snxt
- len
);
2015 *tail_unsent
-= len
;
2016 if (len
<= mss
) /* LSO is unusable (!do_lso_send) */
2017 tcp
->tcp_last_sent_len
= (ushort_t
)len
;
2019 len
+= total_hdr_len
;
2020 ixa
->ixa_pktlen
= len
;
2022 if (ixa
->ixa_flags
& IXAF_IS_IPV4
) {
2023 tcp
->tcp_ipha
->ipha_length
= htons(len
);
2025 tcp
->tcp_ip6h
->ip6_plen
= htons(len
- IPV6_HDR_LEN
);
2028 mp
= dupb(*xmit_tail
);
2030 return (-1); /* out_of_mem */
2033 len
= total_hdr_len
;
2035 * There are four reasons to allocate a new hdr mblk:
2036 * 1) The bytes above us are in use by another packet
2037 * 2) We don't have good alignment
2038 * 3) The mblk is being shared
2039 * 4) We don't have enough room for a header
2041 rptr
= mp
->b_rptr
- len
;
2042 if (!OK_32PTR(rptr
) ||
2043 ((db
= mp
->b_datap
), db
->db_ref
!= 2) ||
2044 rptr
< db
->db_base
) {
2045 /* NOTE: we assume allocb returns an OK_32PTR */
2048 mp1
= allocb(connp
->conn_ht_iphc_allocated
+
2049 tcps
->tcps_wroff_xtra
, BPRI_MED
);
2052 return (-1); /* out_of_mem */
2056 /* Leave room for Link Level header */
2057 len
= total_hdr_len
;
2058 rptr
= &mp
->b_rptr
[tcps
->tcps_wroff_xtra
];
2059 mp
->b_wptr
= &rptr
[len
];
2063 * Fill in the header using the template header, and add
2064 * options such as time-stamp, ECN and/or SACK, as needed.
2066 tcp_fill_header(tcp
, rptr
, (clock_t)local_time
, num_sack_blk
);
2071 int spill
= *tail_unsent
;
2078 * If we're a little short, tack on more mblks until
2079 * there is no more spillover.
2085 nmp
= (*xmit_tail
)->b_cont
;
2089 * Excess data in mblk; can we split it?
2090 * If LSO is enabled for the connection,
2091 * keep on splitting as this is a transient
2094 if (!do_lso_send
&& (spill
+ nmpsz
> 0)) {
2096 * Don't split if stream head was
2097 * told to break up larger writes
2098 * into smaller ones.
2100 if (tcp
->tcp_maxpsz_multiplier
> 0)
2104 * Next mblk is less than SMSS/2
2105 * rounded up to nearest 64-byte;
2106 * let it get sent as part of the
2109 if (tcp
->tcp_localnet
&&
2111 (nmpsz
< roundup((mss
>> 1), 64)))
2116 ASSERT((uintptr_t)nmpsz
<= (uintptr_t)INT_MAX
);
2117 /* Stash for rtt use later */
2118 (*xmit_tail
)->b_prev
= local_time
;
2119 (*xmit_tail
)->b_next
=
2120 (mblk_t
*)(uintptr_t)(*snxt
- len
);
2121 mp1
->b_cont
= dupb(*xmit_tail
);
2126 *tail_unsent
= spill
;
2128 return (-1); /* out_of_mem */
2132 /* Trim back any surplus on the last mblk */
2134 mp1
->b_wptr
-= spill
;
2135 *tail_unsent
= spill
;
2138 * We did not send everything we could in
2139 * order to remain within the b_cont limit.
2143 tcp
->tcp_last_sent_len
+= spill
;
2144 TCPS_UPDATE_MIB(tcps
, tcpOutDataBytes
, spill
);
2146 * Adjust the checksum
2148 tcpha
= (tcpha_t
*)(rptr
+
2149 ixa
->ixa_ip_hdr_length
);
2151 sum
= (sum
>> 16) + (sum
& 0xFFFF);
2152 tcpha
->tha_sum
= htons(sum
);
2153 if (connp
->conn_ipversion
== IPV4_VERSION
) {
2155 ((ipha_t
*)rptr
)->ipha_length
) +
2157 ((ipha_t
*)rptr
)->ipha_length
=
2161 ((ip6_t
*)rptr
)->ip6_plen
) +
2163 ((ip6_t
*)rptr
)->ip6_plen
=
2166 ixa
->ixa_pktlen
+= spill
;
2170 if (tcp
->tcp_ip_forward_progress
) {
2171 tcp
->tcp_ip_forward_progress
= B_FALSE
;
2172 ixa
->ixa_flags
|= IXAF_REACH_CONF
;
2174 ixa
->ixa_flags
&= ~IXAF_REACH_CONF
;
2178 /* Append LSO information to the mp. */
2179 lso_info_set(mp
, mss
, HW_LSO
);
2180 ixa
->ixa_fragsize
= IP_MAXPACKET
;
2181 ixa
->ixa_extra_ident
= num_lso_seg
- 1;
2183 DTRACE_PROBE2(tcp_send_lso
, int, num_lso_seg
,
2186 tcp_send_data(tcp
, mp
);
2189 * Restore values of ixa_fragsize and ixa_extra_ident.
2191 ixa
->ixa_fragsize
= ixa
->ixa_pmtu
;
2192 ixa
->ixa_extra_ident
= 0;
/* One LSO send is accounted as num_lso_seg output segments. */
2193 tcp
->tcp_obsegs
+= num_lso_seg
;
2194 TCP_STAT(tcps
, tcp_lso_times
);
2195 TCP_STAT_UPDATE(tcps
, tcp_lso_pkt_out
, num_lso_seg
);
2198 * Make sure to clean up LSO information. Wherever a
2199 * new mp uses the prepended header room after dupb(),
2200 * lso_info_cleanup() should be called.
2202 lso_info_cleanup(mp
);
2203 tcp_send_data(tcp
, mp
);
2204 BUMP_LOCAL(tcp
->tcp_obsegs
);
2212 * Initiate closedown sequence on an active connection. (May be called as
2213 * writer.) Return value zero for OK return, non-zero for error return.
/*
 * tcp_xmit_end: start the send-side close (FIN) for a connection and
 * refresh the cached RTT information for the peer.
 *
 *   tcp - the connection being closed.
 *
 * Visible behaviour:
 *   - The state must lie between TCPS_SYN_RCVD and TCPS_CLOSE_WAIT
 *     inclusive; the handling of other states is not visible here.
 *   - tcp_fss is set to tcp_snxt + tcp_unsent and TCP_FSS_VALID is raised.
 *   - With nothing unsent, a FIN segment is built with tcp_xmit_mp() and
 *     sent with tcp_send_data(). There is also a path that sets tcp_snxt
 *     to tcp_fss + 1 and restarts the retransmit timer.
 *   - With data still unsent, tcp_cork is cleared and tcp_wput_data() is
 *     called to push it out.
 *   - Finally tcp_rtt_sa / tcp_rtt_sd are copied into a zeroed uinfo and
 *     handed to dce_update_uinfo_v4() or dce_update_uinfo(), depending on
 *     conn_ipversion. The update is skipped when too few RTT samples exist.
 *
 * NOTE(review): this text comes from a lossy listing. Return statements,
 * braces, else branches and several declarations are not present here.
 * The caller-visible contract, taken from the file's own header comment
 * for this function, is zero for OK and non-zero for error.
 */
2216 tcp_xmit_end(tcp_t
*tcp
)
2219 tcp_stack_t
*tcps
= tcp
->tcp_tcps
;
2221 ip_stack_t
*ipst
= tcps
->tcps_netstack
->netstack_ip
;
2222 conn_t
*connp
= tcp
->tcp_connp
;
2224 if (tcp
->tcp_state
< TCPS_SYN_RCVD
||
2225 tcp
->tcp_state
> TCPS_CLOSE_WAIT
) {
2227 * Invalid state, only states TCPS_SYN_RCVD,
2228 * TCPS_ESTABLISHED and TCPS_CLOSE_WAIT are valid
/* The FIN sequence number is the first byte past all queued data. */
2233 tcp
->tcp_fss
= tcp
->tcp_snxt
+ tcp
->tcp_unsent
;
2234 tcp
->tcp_valid_bits
|= TCP_FSS_VALID
;
2236 * If there is nothing more unsent, send the FIN now.
2237 * Otherwise, it will go out with the last segment.
2239 if (tcp
->tcp_unsent
== 0) {
2240 mp
= tcp_xmit_mp(tcp
, NULL
, 0, NULL
, NULL
,
2241 tcp
->tcp_fss
, B_FALSE
, NULL
, B_FALSE
);
2244 tcp_send_data(tcp
, mp
);
2247 * Couldn't allocate msg. Pretend we got it out.
2248 * Wait for rexmit timeout.
2250 tcp
->tcp_snxt
= tcp
->tcp_fss
+ 1;
2251 TCP_TIMER_RESTART(tcp
, tcp
->tcp_rto
);
2255 * If needed, update tcp_rexmit_snxt as tcp_snxt is
2258 if (tcp
->tcp_rexmit
&& tcp
->tcp_rexmit_nxt
== tcp
->tcp_fss
) {
2259 tcp
->tcp_rexmit_nxt
= tcp
->tcp_snxt
;
2263 * If tcp->tcp_cork is set, then the data will not get sent,
2264 * so we have to check that and unset it first.
2267 tcp
->tcp_cork
= B_FALSE
;
2268 tcp_wput_data(tcp
, NULL
, B_FALSE
);
2272 * If TCP does not get enough samples of RTT or tcp_rtt_updates
2273 * is 0, don't update the cache.
2275 if (tcps
->tcps_rtt_updates
== 0 ||
2276 tcp
->tcp_rtt_update
< tcps
->tcps_rtt_updates
)
2280 * We do not have a good algorithm to update ssthresh at this time.
2281 * So don't do any update.
2283 bzero(&uinfo
, sizeof (uinfo
));
2284 uinfo
.iulp_rtt
= tcp
->tcp_rtt_sa
;
2285 uinfo
.iulp_rtt_sd
= tcp
->tcp_rtt_sd
;
2288 * Note that uinfo is kept for conn_faddr in the DCE. Could update even
2289 * if source routed but we don't.
2291 if (connp
->conn_ipversion
== IPV4_VERSION
) {
2292 if (connp
->conn_faddr_v4
!= tcp
->tcp_ipha
->ipha_dst
) {
2295 (void) dce_update_uinfo_v4(connp
->conn_faddr_v4
, &uinfo
, ipst
);
2299 if (!(IN6_ARE_ADDR_EQUAL(&connp
->conn_faddr_v6
,
2300 &tcp
->tcp_ip6h
->ip6_dst
))) {
2304 if (IN6_IS_ADDR_LINKSCOPE(&connp
->conn_faddr_v6
)) {
2305 ip_xmit_attr_t
*ixa
= connp
->conn_ixa
;
2308 * If we are going to create a DCE we'd better have
2311 if (ixa
->ixa_nce
!= NULL
) {
2312 ifindex
= ixa
->ixa_nce
->nce_common
->ncec_ill
->
2313 ill_phyint
->phyint_ifindex
;
2319 (void) dce_update_uinfo(&connp
->conn_faddr_v6
, ifindex
, &uinfo
,
2326 * Send out a control packet on the tcp connection specified. This routine
2327 * is typically called where we need a simple ACK or RST generated.
/*
 * tcp_xmit_ctl: build and send a data-less control segment (the flags in
 * ctl) on an existing connection.
 *
 *   str - optional text; when non-NULL and conn_debug is set it is logged
 *         through strlog() together with seq, ack and ctl.
 *   tcp - the connection.
 *   seq - sequence number written to tha_seq (converted with htonl).
 *   ack - acknowledgement number written to tha_ack; also saved in tcp_rack.
 *   ctl - TCP flag bits; stored into tha_flags truncated to uint8_t.
 *
 * Visible behaviour: a header-only mblk is allocated, leaving
 * tcps_wroff_xtra bytes of headroom. The connection's header template
 * (conn_ht_iphc) is copied in and the IP length fields and ixa_pktlen are
 * set to the header length. One path bumps tcpOutRsts and, when timestamps
 * are in use and the state is past TCPS_SYN_SENT, removes the timestamp
 * option by shortening the packet by TCPOPT_REAL_TS_LEN and reducing the
 * data offset. Another path fills in the timestamp option, updates the
 * advertised window, and records the ack in tcp_rack. The checksum seed is
 * folded to 16 bits and the segment is sent with tcp_send_data().
 *
 * NOTE(review): this text comes from a lossy listing. The tests on ctl
 * that presumably select the RST and ACK paths, the allocation-failure
 * check, braces and several declarations are not present here; confirm
 * against the pristine file.
 */
2330 tcp_xmit_ctl(char *str
, tcp_t
*tcp
, uint32_t seq
, uint32_t ack
, int ctl
)
2334 ipha_t
*ipha
= NULL
;
2340 tcp_stack_t
*tcps
= tcp
->tcp_tcps
;
2341 conn_t
*connp
= tcp
->tcp_connp
;
2342 ip_xmit_attr_t
*ixa
= connp
->conn_ixa
;
2345 * Save sum for use in source route later.
2347 sum
= connp
->conn_ht_ulp_len
+ connp
->conn_sum
;
2348 total_hdr_len
= connp
->conn_ht_iphc_len
;
2349 ip_hdr_len
= ixa
->ixa_ip_hdr_length
;
2351 /* If a text string is passed in with the request, pass it to strlog. */
2352 if (str
!= NULL
&& connp
->conn_debug
) {
2353 (void) strlog(TCP_MOD_ID
, 0, 1, SL_TRACE
,
2354 "tcp_xmit_ctl: '%s', seq 0x%x, ack 0x%x, ctl 0x%x",
2355 str
, seq
, ack
, ctl
);
2357 mp
= allocb(connp
->conn_ht_iphc_allocated
+ tcps
->tcps_wroff_xtra
,
/* Headers start tcps_wroff_xtra bytes into the new mblk. */
2362 rptr
= &mp
->b_rptr
[tcps
->tcps_wroff_xtra
];
2364 mp
->b_wptr
= &rptr
[total_hdr_len
];
2365 bcopy(connp
->conn_ht_iphc
, rptr
, total_hdr_len
);
2367 ixa
->ixa_pktlen
= total_hdr_len
;
2369 if (ixa
->ixa_flags
& IXAF_IS_IPV4
) {
2370 ipha
= (ipha_t
*)rptr
;
2371 ipha
->ipha_length
= htons(total_hdr_len
);
2373 ip6h
= (ip6_t
*)rptr
;
2374 ip6h
->ip6_plen
= htons(total_hdr_len
- IPV6_HDR_LEN
);
2376 tcpha
= (tcpha_t
*)&rptr
[ip_hdr_len
];
2377 tcpha
->tha_flags
= (uint8_t)ctl
;
2379 TCPS_BUMP_MIB(tcps
, tcpOutRsts
);
2380 TCPS_BUMP_MIB(tcps
, tcpOutControl
);
2382 * Don't send TSopt w/ TH_RST packets per RFC 1323.
2384 if (tcp
->tcp_snd_ts_ok
&&
2385 tcp
->tcp_state
> TCPS_SYN_SENT
) {
2386 mp
->b_wptr
= &rptr
[total_hdr_len
- TCPOPT_REAL_TS_LEN
];
2387 *(mp
->b_wptr
) = TCPOPT_EOL
;
2389 ixa
->ixa_pktlen
= total_hdr_len
- TCPOPT_REAL_TS_LEN
;
2391 if (connp
->conn_ipversion
== IPV4_VERSION
) {
2392 ipha
->ipha_length
= htons(total_hdr_len
-
2393 TCPOPT_REAL_TS_LEN
);
2395 ip6h
->ip6_plen
= htons(total_hdr_len
-
2396 IPV6_HDR_LEN
- TCPOPT_REAL_TS_LEN
);
2398 tcpha
->tha_offset_and_reserved
-= (3 << 4);
2399 sum
-= TCPOPT_REAL_TS_LEN
;
2403 if (tcp
->tcp_snd_ts_ok
) {
2404 uint32_t llbolt
= (uint32_t)LBOLT_FASTPATH
;
2407 (char *)tcpha
+ TCP_MIN_HEADER_LENGTH
+4);
2408 U32_TO_BE32(tcp
->tcp_ts_recent
,
2409 (char *)tcpha
+ TCP_MIN_HEADER_LENGTH
+8);
2412 /* Update the latest receive window size in TCP header. */
2413 tcpha
->tha_win
= htons(tcp
->tcp_rwnd
>> tcp
->tcp_rcv_ws
);
2414 /* Track what we sent to the peer */
2415 tcp
->tcp_tcpha
->tha_win
= tcpha
->tha_win
;
2416 tcp
->tcp_rack
= ack
;
2417 tcp
->tcp_rack_cnt
= 0;
2418 TCPS_BUMP_MIB(tcps
, tcpOutAck
);
2420 BUMP_LOCAL(tcp
->tcp_obsegs
);
2421 tcpha
->tha_seq
= htonl(seq
);
2422 tcpha
->tha_ack
= htonl(ack
);
2424 * Include the adjustment for a source route if any.
2426 sum
= (sum
>> 16) + (sum
& 0xFFFF);
2427 tcpha
->tha_sum
= htons(sum
);
2428 tcp_send_data(tcp
, mp
);
2432 * Generate a reset based on an inbound packet, connp is set by caller
2433 * when RST is in response to an unexpected inbound packet for which
2434 * there is active tcp state in the system.
2436 * IPSEC NOTE : Try to send the reply with the same protection as it came
2437 * in. We have the ip_recv_attr_t which is reversed to form the ip_xmit_attr_t.
2438 * That way the packet will go out at the same level of protection as it
2442 tcp_xmit_early_reset(char *str
, mblk_t
*mp
, uint32_t seq
, uint32_t ack
, int ctl
,
2443 ip_recv_attr_t
*ira
, ip_stack_t
*ipst
, conn_t
*connp
)
2445 ipha_t
*ipha
= NULL
;
2452 netstack_t
*ns
= ipst
->ips_netstack
;
2453 tcp_stack_t
*tcps
= ns
->netstack_tcp
;
2454 ip_xmit_attr_t ixas
, *ixa
;
2455 uint_t ip_hdr_len
= ira
->ira_ip_hdr_length
;
2456 boolean_t need_refrele
= B_FALSE
; /* ixa_refrele(ixa) */
2459 if (!tcp_send_rst_chk(tcps
)) {
2460 TCP_STAT(tcps
, tcp_rst_unsent
);
2466 * If connp != NULL we use conn_ixa to keep IP_NEXTHOP and other
2467 * options from the listener. In that case the caller must ensure that
2468 * we are running on the listener = connp squeue.
2470 * We get a safe copy of conn_ixa so we don't need to restore anything
2471 * we or ip_output_simple might change in the ixa.
2473 if (connp
!= NULL
) {
2474 ASSERT(connp
->conn_on_sqp
);
2476 ixa
= conn_get_ixa_exclusive(connp
);
2478 TCP_STAT(tcps
, tcp_rst_unsent
);
2482 need_refrele
= B_TRUE
;
2484 bzero(&ixas
, sizeof (ixas
));
2487 * IXAF_VERIFY_SOURCE is overkill since we know the
2488 * packet was for us.
2490 ixa
->ixa_flags
|= IXAF_SET_ULP_CKSUM
| IXAF_VERIFY_SOURCE
;
2491 ixa
->ixa_protocol
= IPPROTO_TCP
;
2492 ixa
->ixa_zoneid
= ira
->ira_zoneid
;
2493 ixa
->ixa_ifindex
= 0;
2494 ixa
->ixa_ipst
= ipst
;
2495 ixa
->ixa_cred
= kcred
;
2496 ixa
->ixa_cpid
= NOPID
;
2499 if (str
&& tcps
->tcps_dbg
) {
2500 (void) strlog(TCP_MOD_ID
, 0, 1, SL_TRACE
,
2501 "tcp_xmit_early_reset: '%s', seq 0x%x, ack 0x%x, "
2503 str
, seq
, ack
, ctl
);
2505 if (mp
->b_datap
->db_ref
!= 1) {
2506 mblk_t
*mp1
= copyb(mp
);
2511 } else if (mp
->b_cont
) {
2512 freemsg(mp
->b_cont
);
2514 DB_CKSUMFLAGS(mp
) = 0;
2517 * We skip reversing source route here.
2518 * (for now we replace all IP options with EOL)
2520 if (IPH_HDR_VERSION(mp
->b_rptr
) == IPV4_VERSION
) {
2521 ipha
= (ipha_t
*)mp
->b_rptr
;
2522 for (i
= IP_SIMPLE_HDR_LENGTH
; i
< (int)ip_hdr_len
; i
++)
2523 mp
->b_rptr
[i
] = IPOPT_EOL
;
2525 * Make sure that src address isn't flagrantly invalid.
2526 * Not all broadcast address checking for the src address
2527 * is possible, since we don't know the netmask of the src
2528 * addr. No check for destination address is done, since
2529 * IP will not pass up a packet with a broadcast dest
2530 * address to TCP. Similar checks are done below for IPv6.
2532 if (ipha
->ipha_src
== 0 || ipha
->ipha_src
== INADDR_BROADCAST
||
2533 CLASSD(ipha
->ipha_src
)) {
2534 BUMP_MIB(&ipst
->ips_ip_mib
, ipIfStatsInDiscards
);
2535 ip_drop_input("ipIfStatsInDiscards", mp
, NULL
);
2540 ip6h
= (ip6_t
*)mp
->b_rptr
;
2542 if (IN6_IS_ADDR_UNSPECIFIED(&ip6h
->ip6_src
) ||
2543 IN6_IS_ADDR_MULTICAST(&ip6h
->ip6_src
)) {
2544 BUMP_MIB(&ipst
->ips_ip6_mib
, ipIfStatsInDiscards
);
2545 ip_drop_input("ipIfStatsInDiscards", mp
, NULL
);
2550 /* Remove any extension headers assuming partial overlay */
2551 if (ip_hdr_len
> IPV6_HDR_LEN
) {
2554 to
= mp
->b_rptr
+ ip_hdr_len
- IPV6_HDR_LEN
;
2555 ovbcopy(ip6h
, to
, IPV6_HDR_LEN
);
2556 mp
->b_rptr
+= ip_hdr_len
- IPV6_HDR_LEN
;
2557 ip_hdr_len
= IPV6_HDR_LEN
;
2558 ip6h
= (ip6_t
*)mp
->b_rptr
;
2559 ip6h
->ip6_nxt
= IPPROTO_TCP
;
2562 tcpha
= (tcpha_t
*)&mp
->b_rptr
[ip_hdr_len
];
2563 if (tcpha
->tha_flags
& TH_RST
) {
2567 tcpha
->tha_offset_and_reserved
= (5 << 4);
2568 len
= ip_hdr_len
+ sizeof (tcpha_t
);
2569 mp
->b_wptr
= &mp
->b_rptr
[len
];
2570 if (IPH_HDR_VERSION(mp
->b_rptr
) == IPV4_VERSION
) {
2571 ipha
->ipha_length
= htons(len
);
2572 /* Swap addresses */
2573 v4addr
= ipha
->ipha_src
;
2574 ipha
->ipha_src
= ipha
->ipha_dst
;
2575 ipha
->ipha_dst
= v4addr
;
2576 ipha
->ipha_ident
= 0;
2577 ipha
->ipha_ttl
= (uchar_t
)tcps
->tcps_ipv4_ttl
;
2578 ixa
->ixa_flags
|= IXAF_IS_IPV4
;
2579 ixa
->ixa_ip_hdr_length
= ip_hdr_len
;
2581 ip6h
->ip6_plen
= htons(len
- IPV6_HDR_LEN
);
2582 /* Swap addresses */
2583 v6addr
= ip6h
->ip6_src
;
2584 ip6h
->ip6_src
= ip6h
->ip6_dst
;
2585 ip6h
->ip6_dst
= v6addr
;
2586 ip6h
->ip6_hops
= (uchar_t
)tcps
->tcps_ipv6_hoplimit
;
2587 ixa
->ixa_flags
&= ~IXAF_IS_IPV4
;
2589 if (IN6_IS_ADDR_LINKSCOPE(&ip6h
->ip6_dst
)) {
2590 ixa
->ixa_flags
|= IXAF_SCOPEID_SET
;
2591 ixa
->ixa_scopeid
= ira
->ira_ruifindex
;
2593 ixa
->ixa_ip_hdr_length
= IPV6_HDR_LEN
;
2595 ixa
->ixa_pktlen
= len
;
2597 /* Swap the ports */
2598 port
= tcpha
->tha_fport
;
2599 tcpha
->tha_fport
= tcpha
->tha_lport
;
2600 tcpha
->tha_lport
= port
;
2602 tcpha
->tha_ack
= htonl(ack
);
2603 tcpha
->tha_seq
= htonl(seq
);
2605 tcpha
->tha_sum
= htons(sizeof (tcpha_t
));
2606 tcpha
->tha_flags
= (uint8_t)ctl
;
2610 * Probe connection rejection here.
2611 * tcp_xmit_listeners_reset() drops non-SYN segments
2612 * that do not specify TH_ACK in their flags without
2613 * calling this function. As a consequence, if this
2614 * function is called with a TH_RST|TH_ACK ctl argument,
2615 * it is being called in response to a SYN segment
2616 * and thus the tcp:::accept-refused probe point
2619 DTRACE_TCP5(accept__refused
, mblk_t
*, NULL
,
2620 void, NULL
, void_ip_t
*, mp
->b_rptr
, tcp_t
*, NULL
,
2623 TCPS_BUMP_MIB(tcps
, tcpOutRsts
);
2624 TCPS_BUMP_MIB(tcps
, tcpOutControl
);
2627 /* Discard any old label */
2628 if (ixa
->ixa_free_flags
& IXA_FREE_TSL
) {
2629 ASSERT(ixa
->ixa_tsl
!= NULL
);
2630 label_rele(ixa
->ixa_tsl
);
2631 ixa
->ixa_free_flags
&= ~IXA_FREE_TSL
;
2633 ixa
->ixa_tsl
= ira
->ira_tsl
; /* Behave as a multi-level responder */
2635 if (ira
->ira_flags
& IRAF_IPSEC_SECURE
) {
2637 * Apply IPsec based on how IPsec was applied to
2638 * the packet that caused the RST.
2640 if (!ipsec_in_to_out(ira
, ixa
, mp
, ipha
, ip6h
)) {
2641 BUMP_MIB(&ipst
->ips_ip_mib
, ipIfStatsOutDiscards
);
2642 /* Note: mp already consumed and ip_drop_packet done */
2647 * This is in clear. The RST message we are building
2648 * here should go out in clear, independent of our policy.
2650 ixa
->ixa_flags
|= IXAF_NO_IPSEC
;
2653 DTRACE_TCP5(send
, mblk_t
*, NULL
, ip_xmit_attr_t
*, ixa
,
2654 __dtrace_tcp_void_ip_t
*, mp
->b_rptr
, tcp_t
*, NULL
,
2655 __dtrace_tcp_tcph_t
*, tcpha
);
2658 * NOTE: one might consider tracing a TCP packet here, but
2659 * this function has no active TCP state and no tcp structure
2660 * that has a trace buffer. If we traced here, we would have
2661 * to keep a local trace buffer in tcp_record_trace().
2664 (void) ip_output_simple(mp
, ixa
);
2668 ASSERT(ixa
!= &ixas
);
2674 * Generate a "no listener here" RST in response to an "unknown" segment.
2675 * connp is set by caller when RST is in response to an unexpected
2676 * inbound packet for which there is active tcp state in the system.
2677 * Note that we are reusing the incoming mp to construct the outgoing RST.
2680 tcp_xmit_listeners_reset(mblk_t
*mp
, ip_recv_attr_t
*ira
, ip_stack_t
*ipst
,
2691 boolean_t policy_present
;
2692 netstack_t
*ns
= ipst
->ips_netstack
;
2693 tcp_stack_t
*tcps
= ns
->netstack_tcp
;
2694 ipsec_stack_t
*ipss
= tcps
->tcps_netstack
->netstack_ipsec
;
2695 uint_t ip_hdr_len
= ira
->ira_ip_hdr_length
;
2697 TCP_STAT(tcps
, tcp_no_listener
);
2700 * DTrace this "unknown" segment as a tcp:::receive, as we did
2701 * just receive something that was TCP.
2703 DTRACE_TCP5(receive
, mblk_t
*, NULL
, ip_xmit_attr_t
*, NULL
,
2704 __dtrace_tcp_void_ip_t
*, mp
->b_rptr
, tcp_t
*, NULL
,
2705 __dtrace_tcp_tcph_t
*, &mp
->b_rptr
[ip_hdr_len
]);
2707 if (IPH_HDR_VERSION(mp
->b_rptr
) == IPV4_VERSION
) {
2708 policy_present
= ipss
->ipsec_inbound_v4_policy_present
;
2709 ipha
= (ipha_t
*)mp
->b_rptr
;
2712 policy_present
= ipss
->ipsec_inbound_v6_policy_present
;
2714 ip6h
= (ip6_t
*)mp
->b_rptr
;
2717 if (policy_present
) {
2719 * The conn_t parameter is NULL because we already know
2722 mp
= ipsec_check_global_policy(mp
, (conn_t
*)NULL
, ipha
, ip6h
,
2727 if (is_system_labeled() && !tsol_can_reply_error(mp
, ira
)) {
2729 tx__ip__log__error__nolistener__tcp
,
2730 char *, "Could not reply with RST to mp(1)",
2732 ip2dbg(("tcp_xmit_listeners_reset: not permitted to reply\n"));
2739 tcpha
= (tcpha_t
*)&rptr
[ip_hdr_len
];
2740 seg_seq
= ntohl(tcpha
->tha_seq
);
2741 seg_ack
= ntohl(tcpha
->tha_ack
);
2742 flags
= tcpha
->tha_flags
;
2744 seg_len
= msgdsize(mp
) - (TCP_HDR_LENGTH(tcpha
) + ip_hdr_len
);
2745 if (flags
& TH_RST
) {
2747 } else if (flags
& TH_ACK
) {
2748 tcp_xmit_early_reset("no tcp, reset", mp
, seg_ack
, 0, TH_RST
,
2751 if (flags
& TH_SYN
) {
2755 * Here we violate the RFC. Note that a normal
2756 * TCP will never send a segment without the ACK
2757 * flag, except for RST or SYN segment. This
2758 * segment is neither. Just drop it on the
2762 TCP_STAT(tcps
, tcp_rst_unsent
);
2766 tcp_xmit_early_reset("no tcp, reset/ack", mp
, 0,
2767 seg_seq
+ seg_len
, TH_RST
| TH_ACK
, ira
, ipst
, connp
);
2772 * Helper function for tcp_xmit_mp() in handling connection set up flag
2776 tcp_xmit_mp_aux_iss(tcp_t
*tcp
, conn_t
*connp
, tcpha_t
*tcpha
, mblk_t
*mp
,
2780 uint8_t *wptr
= mp
->b_wptr
;
2781 tcp_stack_t
*tcps
= tcp
->tcp_tcps
;
2782 boolean_t add_sack
= B_FALSE
;
2785 * If TCP_ISS_VALID and the seq number is tcp_iss,
2786 * TCP can only be in SYN-SENT, SYN-RCVD or
2787 * FIN-WAIT-1 state. It can be FIN-WAIT-1 if
2788 * our SYN is not ack'ed but the app closes this
2791 ASSERT(tcp
->tcp_state
== TCPS_SYN_SENT
||
2792 tcp
->tcp_state
== TCPS_SYN_RCVD
||
2793 tcp
->tcp_state
== TCPS_FIN_WAIT_1
);
2796 * Tack on the MSS option. It is always needed
2797 * for both active and passive open.
2799 * MSS option value should be interface MTU - MIN
2800 * TCP/IP header according to RFC 793 as it means
2801 * the maximum segment size TCP can receive. But
2802 * to get around some broken middle boxes/end hosts
2803 * out there, we allow the option value to be the
2804 * same as the MSS option size on the peer side.
2805 * In this way, the other side will not send
2806 * anything larger than they can receive.
2808 * Note that for SYN_SENT state, the ndd param
2809 * tcp_use_smss_as_mss_opt has no effect as we
2810 * don't know the peer's MSS option value. So
2811 * the only case we need to take care of is in
2812 * SYN_RCVD state, which is done later.
2814 wptr
[0] = TCPOPT_MAXSEG
;
2815 wptr
[1] = TCPOPT_MAXSEG_LEN
;
2817 u1
= tcp
->tcp_initial_pmtu
- (connp
->conn_ipversion
== IPV4_VERSION
?
2818 IP_SIMPLE_HDR_LENGTH
: IPV6_HDR_LEN
) - TCP_MIN_HEADER_LENGTH
;
2819 U16_TO_BE16(u1
, wptr
);
2822 /* Update the offset to cover the additional word */
2823 tcpha
->tha_offset_and_reserved
+= (1 << 4);
2825 switch (tcp
->tcp_state
) {
2829 if (tcp
->tcp_snd_sack_ok
)
2832 if (tcp
->tcp_snd_ts_ok
) {
2833 uint32_t llbolt
= (uint32_t)LBOLT_FASTPATH
;
2836 wptr
[0] = TCPOPT_SACK_PERMITTED
;
2837 wptr
[1] = TCPOPT_SACK_OK_LEN
;
2840 wptr
[0] = TCPOPT_NOP
;
2841 wptr
[1] = TCPOPT_NOP
;
2843 wptr
[2] = TCPOPT_TSTAMP
;
2844 wptr
[3] = TCPOPT_TSTAMP_LEN
;
2846 U32_TO_BE32(llbolt
, wptr
);
2848 ASSERT(tcp
->tcp_ts_recent
== 0);
2849 U32_TO_BE32(0L, wptr
);
2851 tcpha
->tha_offset_and_reserved
+= (3 << 4);
2855 * Set up all the bits to tell other side
2856 * we are ECN capable.
2858 if (tcp
->tcp_ecn_ok
)
2859 *flags
|= (TH_ECE
| TH_CWR
);
2867 * Reset the MSS option value to be SMSS
2868 * We should probably add back the bytes
2869 * for timestamp option and IPsec. We
2870 * don't do that as this is a workaround
2871 * for broken middle boxes/end hosts, it
2872 * is better for us to be more cautious.
2873 * They may not take these things into
2874 * account in their SMSS calculation. Thus
2875 * the peer's calculated SMSS may be smaller
2876 * than what it can be. This should be OK.
2878 if (tcps
->tcps_use_smss_as_mss_opt
) {
2881 * Note that wptr points just past the MSS
2884 U16_TO_BE16(u1
, wptr
- 2);
2888 * tcp_snd_ts_ok can only be set in TCPS_SYN_RCVD
2889 * when the peer also uses timestamps option. And
2890 * the TCP header template must have already been
2891 * updated to include the timestamps option.
2893 if (tcp
->tcp_snd_sack_ok
) {
2894 if (tcp
->tcp_snd_ts_ok
) {
2898 * Use the NOP in the header just
2899 * before timestamps opton.
2901 tmp_wptr
= (uint8_t *)tcpha
+
2902 TCP_MIN_HEADER_LENGTH
;
2903 ASSERT(tmp_wptr
[0] == TCPOPT_NOP
&&
2904 tmp_wptr
[1] == TCPOPT_NOP
);
2905 tmp_wptr
[0] = TCPOPT_SACK_PERMITTED
;
2906 tmp_wptr
[1] = TCPOPT_SACK_OK_LEN
;
2914 * If the other side is ECN capable, reply
2915 * that we are also ECN capable.
2917 if (tcp
->tcp_ecn_ok
)
2923 * The above ASSERT() makes sure that this
2924 * must be FIN-WAIT-1 state. Our SYN has
2925 * not been ack'ed so retransmit it.
2932 wptr
[0] = TCPOPT_NOP
;
2933 wptr
[1] = TCPOPT_NOP
;
2934 wptr
[2] = TCPOPT_SACK_PERMITTED
;
2935 wptr
[3] = TCPOPT_SACK_OK_LEN
;
2936 wptr
+= TCPOPT_REAL_SACK_OK_LEN
;
2937 tcpha
->tha_offset_and_reserved
+= (1 << 4);
2940 if (tcp
->tcp_snd_ws_ok
) {
2941 wptr
[0] = TCPOPT_NOP
;
2942 wptr
[1] = TCPOPT_WSCALE
;
2943 wptr
[2] = TCPOPT_WS_LEN
;
2944 wptr
[3] = (uchar_t
)tcp
->tcp_rcv_ws
;
2945 wptr
+= TCPOPT_REAL_WS_LEN
;
2946 tcpha
->tha_offset_and_reserved
+= (1 << 4);
2950 u1
= (int)(mp
->b_wptr
- mp
->b_rptr
);
2952 * Get IP set to checksum on our behalf
2953 * Include the adjustment for a source route if any.
2955 u1
+= connp
->conn_sum
;
2956 u1
= (u1
>> 16) + (u1
& 0xFFFF);
2957 tcpha
->tha_sum
= htons(u1
);
2958 TCPS_BUMP_MIB(tcps
, tcpOutControl
);
2962 * Helper function for tcp_xmit_mp() in handling connection tear down
2963 * flag setting and state changes.
2966 tcp_xmit_mp_aux_fss(tcp_t
*tcp
, ip_xmit_attr_t
*ixa
, uint_t
*flags
)
2968 if (!tcp
->tcp_fin_acked
) {
2970 TCPS_BUMP_MIB(tcp
->tcp_tcps
, tcpOutControl
);
2972 if (!tcp
->tcp_fin_sent
) {
2973 tcp
->tcp_fin_sent
= B_TRUE
;
2974 switch (tcp
->tcp_state
) {
2976 tcp
->tcp_state
= TCPS_FIN_WAIT_1
;
2977 DTRACE_TCP6(state__change
, void, NULL
,
2978 ip_xmit_attr_t
*, ixa
, void, NULL
,
2979 tcp_t
*, tcp
, void, NULL
,
2980 int32_t, TCPS_SYN_RCVD
);
2982 case TCPS_ESTABLISHED
:
2983 tcp
->tcp_state
= TCPS_FIN_WAIT_1
;
2984 DTRACE_TCP6(state__change
, void, NULL
,
2985 ip_xmit_attr_t
*, ixa
, void, NULL
,
2986 tcp_t
*, tcp
, void, NULL
,
2987 int32_t, TCPS_ESTABLISHED
);
2989 case TCPS_CLOSE_WAIT
:
2990 tcp
->tcp_state
= TCPS_LAST_ACK
;
2991 DTRACE_TCP6(state__change
, void, NULL
,
2992 ip_xmit_attr_t
*, ixa
, void, NULL
,
2993 tcp_t
*, tcp
, void, NULL
,
2994 int32_t, TCPS_CLOSE_WAIT
);
2997 if (tcp
->tcp_suna
== tcp
->tcp_snxt
)
2998 TCP_TIMER_RESTART(tcp
, tcp
->tcp_rto
);
2999 tcp
->tcp_snxt
= tcp
->tcp_fss
+ 1;
3004 * tcp_xmit_mp is called to return a pointer to an mblk chain complete with
3005 * ip and tcp header ready to pass down to IP. If the mp passed in is
3006 * non-NULL, then up to max_to_send bytes of data will be dup'ed off that
3007 * mblk. (If sendall is not set the dup'ing will stop at an mblk boundary
3008 * otherwise it will dup partial mblks.)
3009 * Otherwise, an appropriate ACK packet will be generated. This
3010 * routine is not usually called to send new data for the first time. It
3011 * is mostly called out of the timer for retransmits, and to generate ACKs.
3013 * If offset is not NULL, the returned mblk chain's first mblk's b_rptr will
3014 * be adjusted by *offset. And after dupb(), the offset and the ending mblk
3015 * of the original mblk chain will be returned in *offset and *end_mp.
3018 tcp_xmit_mp(tcp_t
*tcp
, mblk_t
*mp
, int32_t max_to_send
, int32_t *offset
,
3019 mblk_t
**end_mp
, uint32_t seq
, boolean_t sendall
, uint32_t *seg_len
,
3029 int32_t num_sack_blk
= 0;
3030 int32_t sack_opt_len
= 0;
3031 tcp_stack_t
*tcps
= tcp
->tcp_tcps
;
3032 conn_t
*connp
= tcp
->tcp_connp
;
3033 ip_xmit_attr_t
*ixa
= connp
->conn_ixa
;
3035 /* Allocate for our maximum TCP header + link-level */
3036 mp1
= allocb(connp
->conn_ht_iphc_allocated
+ tcps
->tcps_wroff_xtra
,
3043 * Note that tcp_mss has been adjusted to take into account the
3044 * timestamp option if applicable. Because SACK options do not
3045 * appear in every TCP segments and they are of variable lengths,
3046 * they cannot be included in tcp_mss. Thus we need to calculate
3047 * the actual segment length when we need to send a segment which
3048 * includes SACK options.
3050 if (tcp
->tcp_snd_sack_ok
&& tcp
->tcp_num_sack_blk
> 0) {
3051 num_sack_blk
= MIN(tcp
->tcp_max_sack_blk
,
3052 tcp
->tcp_num_sack_blk
);
3053 sack_opt_len
= num_sack_blk
* sizeof (sack_blk_t
) +
3054 TCPOPT_NOP_LEN
* 2 + TCPOPT_HEADER_LEN
;
3055 if (max_to_send
+ sack_opt_len
> tcp
->tcp_mss
)
3056 max_to_send
-= sack_opt_len
;
3059 if (offset
!= NULL
) {
3061 /* We use offset as an indicator that end_mp is not NULL. */
3064 for (mp2
= mp1
; mp
&& data_length
!= max_to_send
; mp
= mp
->b_cont
) {
3065 /* This could be faster with cooperation from downstream */
3066 if (mp2
!= mp1
&& !sendall
&&
3067 data_length
+ (int)(mp
->b_wptr
- mp
->b_rptr
) >
3070 * Don't send the next mblk since the whole mblk
3074 mp2
->b_cont
= dupb(mp
);
3081 ASSERT((uintptr_t)(mp2
->b_wptr
- mp2
->b_rptr
) <=
3082 (uintptr_t)INT_MAX
);
3084 data_length
+= (int)(mp2
->b_wptr
- mp2
->b_rptr
);
3085 if (data_length
> max_to_send
) {
3086 mp2
->b_wptr
-= data_length
- max_to_send
;
3087 data_length
= max_to_send
;
3088 off
= mp2
->b_wptr
- mp
->b_rptr
;
3094 if (offset
!= NULL
) {
3098 if (seg_len
!= NULL
) {
3099 *seg_len
= data_length
;
3102 /* Update the latest receive window size in TCP header. */
3103 tcp
->tcp_tcpha
->tha_win
= htons(tcp
->tcp_rwnd
>> tcp
->tcp_rcv_ws
);
3105 rptr
= mp1
->b_rptr
+ tcps
->tcps_wroff_xtra
;
3107 mp1
->b_wptr
= rptr
+ connp
->conn_ht_iphc_len
+ sack_opt_len
;
3108 bcopy(connp
->conn_ht_iphc
, rptr
, connp
->conn_ht_iphc_len
);
3109 tcpha
= (tcpha_t
*)&rptr
[ixa
->ixa_ip_hdr_length
];
3110 tcpha
->tha_seq
= htonl(seq
);
3113 * Use tcp_unsent to determine if the PUSH bit should be used assumes
3114 * that this function was called from tcp_wput_data. Thus, when called
3115 * to retransmit data the setting of the PUSH bit may appear some
3116 * what random in that it might get set when it should not. This
3117 * should not pose any performance issues.
3119 if (data_length
!= 0 && (tcp
->tcp_unsent
== 0 ||
3120 tcp
->tcp_unsent
== data_length
)) {
3121 flags
= TH_ACK
| TH_PUSH
;
3126 if (tcp
->tcp_ecn_ok
) {
3127 if (tcp
->tcp_ecn_echo_on
)
3131 * Only set ECT bit and ECN_CWR if a segment contains new data.
3132 * There is no TCP flow control for non-data segments, and
3133 * only data segment is transmitted reliably.
3135 if (data_length
> 0 && !rexmit
) {
3136 TCP_SET_ECT(tcp
, rptr
);
3137 if (tcp
->tcp_cwr
&& !tcp
->tcp_ecn_cwr_sent
) {
3139 tcp
->tcp_ecn_cwr_sent
= B_TRUE
;
3144 /* Check if there is any special processing needs to be done. */
3145 if (tcp
->tcp_valid_bits
) {
3148 /* We don't allow having SYN and FIN in the same segment... */
3149 if ((tcp
->tcp_valid_bits
& TCP_ISS_VALID
) &&
3150 seq
== tcp
->tcp_iss
) {
3151 /* Need to do connection set up processing. */
3152 tcp_xmit_mp_aux_iss(tcp
, connp
, tcpha
, mp1
, &flags
);
3153 } else if ((tcp
->tcp_valid_bits
& TCP_FSS_VALID
) &&
3154 (seq
+ data_length
) == tcp
->tcp_fss
) {
3155 /* Need to do connection tear down processing. */
3156 tcp_xmit_mp_aux_fss(tcp
, ixa
, &flags
);
3160 * Need to do urgent pointer processing.
3162 * Note the trick here. u1 is unsigned. When tcp_urg
3163 * is smaller than seq, u1 will become a very huge value.
3164 * So the comparison will fail. Also note that tcp_urp
3165 * should be positive, see RFC 793 page 17.
3167 u1
= tcp
->tcp_urg
- seq
+ TCP_OLD_URP_INTERPRETATION
;
3168 if ((tcp
->tcp_valid_bits
& TCP_URG_VALID
) && u1
!= 0 &&
3169 u1
< (uint32_t)(64 * 1024)) {
3171 TCPS_BUMP_MIB(tcps
, tcpOutUrg
);
3172 tcpha
->tha_urp
= htons(u1
);
3175 tcpha
->tha_flags
= (uchar_t
)flags
;
3176 tcp
->tcp_rack
= tcp
->tcp_rnxt
;
3177 tcp
->tcp_rack_cnt
= 0;
3179 /* Fill in the current value of timestamps option. */
3180 if (tcp
->tcp_snd_ts_ok
) {
3181 if (tcp
->tcp_state
!= TCPS_SYN_SENT
) {
3182 uint32_t llbolt
= (uint32_t)LBOLT_FASTPATH
;
3185 (char *)tcpha
+ TCP_MIN_HEADER_LENGTH
+4);
3186 U32_TO_BE32(tcp
->tcp_ts_recent
,
3187 (char *)tcpha
+ TCP_MIN_HEADER_LENGTH
+8);
3191 /* Fill in the SACK blocks. */
3192 if (num_sack_blk
> 0) {
3193 uchar_t
*wptr
= (uchar_t
*)tcpha
+ connp
->conn_ht_ulp_len
;
3197 wptr
[0] = TCPOPT_NOP
;
3198 wptr
[1] = TCPOPT_NOP
;
3199 wptr
[2] = TCPOPT_SACK
;
3200 wptr
[3] = TCPOPT_HEADER_LEN
+ num_sack_blk
*
3201 sizeof (sack_blk_t
);
3202 wptr
+= TCPOPT_REAL_SACK_LEN
;
3204 tmp
= tcp
->tcp_sack_list
;
3205 for (i
= 0; i
< num_sack_blk
; i
++) {
3206 U32_TO_BE32(tmp
[i
].begin
, wptr
);
3207 wptr
+= sizeof (tcp_seq
);
3208 U32_TO_BE32(tmp
[i
].end
, wptr
);
3209 wptr
+= sizeof (tcp_seq
);
3211 tcpha
->tha_offset_and_reserved
+= ((num_sack_blk
* 2 + 1) << 4);
3213 ASSERT((uintptr_t)(mp1
->b_wptr
- rptr
) <= (uintptr_t)INT_MAX
);
3214 data_length
+= (int)(mp1
->b_wptr
- rptr
);
3216 ixa
->ixa_pktlen
= data_length
;
3218 if (ixa
->ixa_flags
& IXAF_IS_IPV4
) {
3219 ((ipha_t
*)rptr
)->ipha_length
= htons(data_length
);
3221 ip6_t
*ip6
= (ip6_t
*)rptr
;
3223 ip6
->ip6_plen
= htons(data_length
- IPV6_HDR_LEN
);
3228 * Include the adjustment for a source route if any.
3230 data_length
-= ixa
->ixa_ip_hdr_length
;
3231 data_length
+= connp
->conn_sum
;
3232 data_length
= (data_length
>> 16) + (data_length
& 0xFFFF);
3233 tcpha
->tha_sum
= htons(data_length
);
3234 if (tcp
->tcp_ip_forward_progress
) {
3235 tcp
->tcp_ip_forward_progress
= B_FALSE
;
3236 connp
->conn_ixa
->ixa_flags
|= IXAF_REACH_CONF
;
3238 connp
->conn_ixa
->ixa_flags
&= ~IXAF_REACH_CONF
;
3244 * If this routine returns B_TRUE, TCP can generate a RST in response
3245 * to a segment. If it returns B_FALSE, TCP should not respond.
3248 tcp_send_rst_chk(tcp_stack_t
*tcps
)
3253 * TCP needs to protect itself from generating too many RSTs.
3254 * This can be a DoS attack by sending us random segments
3257 * What we do here is to have a limit of tcp_rst_sent_rate RSTs
3258 * in each 1 second interval. In this way, TCP still generate
3259 * RSTs in normal cases but when under attack, the impact is
3262 if (tcps
->tcps_rst_sent_rate_enabled
!= 0) {
3263 now
= ddi_get_lbolt64();
3264 if (TICK_TO_MSEC(now
- tcps
->tcps_last_rst_intrvl
) >
3266 tcps
->tcps_last_rst_intrvl
= now
;
3267 tcps
->tcps_rst_cnt
= 1;
3268 } else if (++tcps
->tcps_rst_cnt
> tcps
->tcps_rst_sent_rate
) {
3276 * This function handles all retransmissions if SACK is enabled for this
3277 * connection. First it calculates how many segments can be retransmitted
3278 * based on tcp_pipe. Then it goes thru the notsack list to find eligible
3279 * segments. A segment is eligible if sack_cnt for that segment is greater
3280 * than or equal tcp_dupack_fast_retransmit. After it has retransmitted
3281 * all eligible segments, it checks to see if TCP can send some new segments
3282 * (fast recovery). If it can, set the appropriate flag for tcp_input_data().
3285 * tcp_t *tcp: the tcp structure of the connection.
3286 * uint_t *flags: in return, appropriate value will be set for
3290 tcp_sack_rexmit(tcp_t
*tcp
, uint_t
*flags
)
3292 notsack_blk_t
*notsack_blk
;
3293 int32_t usable_swnd
;
3297 tcp_stack_t
*tcps
= tcp
->tcp_tcps
;
3299 ASSERT(tcp
->tcp_notsack_list
!= NULL
);
3300 ASSERT(tcp
->tcp_rexmit
== B_FALSE
);
3302 /* Defensive coding in case there is a bug... */
3303 if (tcp
->tcp_notsack_list
== NULL
) {
3306 notsack_blk
= tcp
->tcp_notsack_list
;
3310 * Limit the num of outstanding data in the network to be
3311 * tcp_cwnd_ssthresh, which is half of the original congestion wnd.
3313 usable_swnd
= tcp
->tcp_cwnd_ssthresh
- tcp
->tcp_pipe
;
3315 /* At least retransmit 1 MSS of data. */
3316 if (usable_swnd
<= 0) {
3320 /* Make sure no new RTT samples will be taken. */
3321 tcp
->tcp_csuna
= tcp
->tcp_snxt
;
3323 notsack_blk
= tcp
->tcp_notsack_list
;
3324 while (usable_swnd
> 0) {
3325 mblk_t
*snxt_mp
, *tmp_mp
;
3326 tcp_seq begin
= tcp
->tcp_sack_snxt
;
3330 for (; notsack_blk
!= NULL
; notsack_blk
= notsack_blk
->next
) {
3331 if (SEQ_GT(notsack_blk
->end
, begin
) &&
3332 (notsack_blk
->sack_cnt
>=
3333 tcps
->tcps_dupack_fast_retransmit
)) {
3334 end
= notsack_blk
->end
;
3335 if (SEQ_LT(begin
, notsack_blk
->begin
)) {
3336 begin
= notsack_blk
->begin
;
3342 * All holes are filled. Manipulate tcp_cwnd to send more
3343 * if we can. Note that after the SACK recovery, tcp_cwnd is
3344 * set to tcp_cwnd_ssthresh.
3346 if (notsack_blk
== NULL
) {
3347 usable_swnd
= tcp
->tcp_cwnd_ssthresh
- tcp
->tcp_pipe
;
3348 if (usable_swnd
<= 0 || tcp
->tcp_unsent
== 0) {
3349 tcp
->tcp_cwnd
= tcp
->tcp_snxt
- tcp
->tcp_suna
;
3350 ASSERT(tcp
->tcp_cwnd
> 0);
3353 usable_swnd
= usable_swnd
/ mss
;
3354 tcp
->tcp_cwnd
= tcp
->tcp_snxt
- tcp
->tcp_suna
+
3355 MAX(usable_swnd
* mss
, mss
);
3356 *flags
|= TH_XMIT_NEEDED
;
3362 * Note that we may send more than usable_swnd allows here
3363 * because of round off, but no more than 1 MSS of data.
3365 seg_len
= end
- begin
;
3368 snxt_mp
= tcp_get_seg_mp(tcp
, begin
, &off
);
3369 ASSERT(snxt_mp
!= NULL
);
3370 /* This should not happen. Defensive coding again... */
3371 if (snxt_mp
== NULL
) {
3375 xmit_mp
= tcp_xmit_mp(tcp
, snxt_mp
, seg_len
, &off
,
3376 &tmp_mp
, begin
, B_TRUE
, &seg_len
, B_TRUE
);
3377 if (xmit_mp
== NULL
)
3380 usable_swnd
-= seg_len
;
3381 tcp
->tcp_pipe
+= seg_len
;
3382 tcp
->tcp_sack_snxt
= begin
+ seg_len
;
3384 tcp_send_data(tcp
, xmit_mp
);
3387 * Update the send timestamp to avoid false retransmission.
3389 snxt_mp
->b_prev
= (mblk_t
*)ddi_get_lbolt();
3391 TCPS_BUMP_MIB(tcps
, tcpRetransSegs
);
3392 TCPS_UPDATE_MIB(tcps
, tcpRetransBytes
, seg_len
);
3393 TCPS_BUMP_MIB(tcps
, tcpOutSackRetransSegs
);
3395 * Update tcp_rexmit_max to extend this SACK recovery phase.
3396 * This happens when new data sent during fast recovery is
3397 * also lost. If TCP retransmits those new data, it needs
3398 * to extend SACK recover phase to avoid starting another
3399 * fast retransmit/recovery unnecessarily.
3401 if (SEQ_GT(tcp
->tcp_sack_snxt
, tcp
->tcp_rexmit_max
)) {
3402 tcp
->tcp_rexmit_max
= tcp
->tcp_sack_snxt
;
3408 * tcp_ss_rexmit() is called to do slow start retransmission after a timeout
3412 tcp_ss_rexmit(tcp_t
*tcp
)
3420 tcp_stack_t
*tcps
= tcp
->tcp_tcps
;
3423 * Note that tcp_rexmit can be set even though TCP has retransmitted
3424 * all unack'ed segments.
3426 if (SEQ_LT(tcp
->tcp_rexmit_nxt
, tcp
->tcp_rexmit_max
)) {
3427 smax
= tcp
->tcp_rexmit_max
;
3428 snxt
= tcp
->tcp_rexmit_nxt
;
3429 if (SEQ_LT(snxt
, tcp
->tcp_suna
)) {
3430 snxt
= tcp
->tcp_suna
;
3432 win
= MIN(tcp
->tcp_cwnd
, tcp
->tcp_swnd
);
3433 win
-= snxt
- tcp
->tcp_suna
;
3435 snxt_mp
= tcp_get_seg_mp(tcp
, snxt
, &off
);
3437 while (SEQ_LT(snxt
, smax
) && (win
> 0) && (snxt_mp
!= NULL
)) {
3439 mblk_t
*old_snxt_mp
= snxt_mp
;
3445 if (SEQ_GT(snxt
+ cnt
, smax
)) {
3448 xmit_mp
= tcp_xmit_mp(tcp
, snxt_mp
, cnt
, &off
,
3449 &snxt_mp
, snxt
, B_TRUE
, &cnt
, B_TRUE
);
3450 if (xmit_mp
== NULL
)
3453 tcp_send_data(tcp
, xmit_mp
);
3458 * Update the send timestamp to avoid false
3461 old_snxt_mp
->b_prev
= (mblk_t
*)ddi_get_lbolt();
3462 TCPS_BUMP_MIB(tcps
, tcpRetransSegs
);
3463 TCPS_UPDATE_MIB(tcps
, tcpRetransBytes
, cnt
);
3465 tcp
->tcp_rexmit_nxt
= snxt
;
3468 * If we have transmitted all we have at the time
3469 * we started the retranmission, we can leave
3470 * the rest of the job to tcp_wput_data(). But we
3471 * need to check the send window first. If the
3472 * win is not 0, go on with tcp_wput_data().
3474 if (SEQ_LT(snxt
, smax
) || win
== 0) {
3478 /* Only call tcp_wput_data() if there is data to be sent. */
3479 if (tcp
->tcp_unsent
) {
3480 tcp_wput_data(tcp
, NULL
, B_FALSE
);
3485 * Do slow start retransmission after ICMP errors of PMTU changes.
3488 tcp_rexmit_after_error(tcp_t
*tcp
)
3491 * All sent data has been acknowledged or no data left to send, just
3494 if (!SEQ_LT(tcp
->tcp_suna
, tcp
->tcp_snxt
) ||
3495 (tcp
->tcp_xmit_head
== NULL
))
3498 if ((tcp
->tcp_valid_bits
& TCP_FSS_VALID
) && (tcp
->tcp_unsent
== 0))
3499 tcp
->tcp_rexmit_max
= tcp
->tcp_fss
;
3501 tcp
->tcp_rexmit_max
= tcp
->tcp_snxt
;
3503 tcp
->tcp_rexmit_nxt
= tcp
->tcp_suna
;
3504 tcp
->tcp_rexmit
= B_TRUE
;
3505 tcp
->tcp_dupack_cnt
= 0;
3510 * tcp_get_seg_mp() is called to get the pointer to a segment in the
3511 * send queue which starts at the given sequence number. If the given
3512 * sequence number is equal to last valid sequence number (tcp_snxt), the
3513 * returned mblk is the last valid mblk, and off is set to the length of
3516 * send queue which starts at the given seq. no.
3519 * tcp_t *tcp: the tcp instance pointer.
3520 * uint32_t seq: the starting seq. no of the requested segment.
3521 * int32_t *off: after the execution, *off will be the offset to
3522 * the returned mblk which points to the requested seq no.
3523 * It is the caller's responsibility to send in a non-null off.
3526 * A mblk_t pointer pointing to the requested segment in send queue.
3529 tcp_get_seg_mp(tcp_t
*tcp
, uint32_t seq
, int32_t *off
)
3534 /* Defensive coding. Make sure we don't send incorrect data. */
3535 if (SEQ_LT(seq
, tcp
->tcp_suna
) || SEQ_GT(seq
, tcp
->tcp_snxt
))
3538 cnt
= seq
- tcp
->tcp_suna
;
3539 mp
= tcp
->tcp_xmit_head
;
3540 while (cnt
> 0 && mp
!= NULL
) {
3541 cnt
-= mp
->b_wptr
- mp
->b_rptr
;
3543 cnt
+= mp
->b_wptr
- mp
->b_rptr
;
3554 * This routine adjusts next-to-send sequence number variables, in the
3555 * case where the reciever has shrunk it's window.
3558 tcp_update_xmit_tail(tcp_t
*tcp
, uint32_t snxt
)
3563 tcp
->tcp_snxt
= snxt
;
3565 /* Get the mblk, and the offset in it, as per the shrunk window */
3566 xmit_tail
= tcp_get_seg_mp(tcp
, snxt
, &offset
);
3567 ASSERT(xmit_tail
!= NULL
);
3568 tcp
->tcp_xmit_tail
= xmit_tail
;
3569 tcp
->tcp_xmit_tail_unsent
= xmit_tail
->b_wptr
-
3570 xmit_tail
->b_rptr
- offset
;
3574 * This handles the case when the receiver has shrunk its win. Per RFC 1122
3575 * if the receiver shrinks the window, i.e. moves the right window to the
3576 * left, the we should not send new data, but should retransmit normally the
3577 * old unacked data between suna and suna + swnd. We might has sent data
3578 * that is now outside the new window, pretend that we didn't send it.
3581 tcp_process_shrunk_swnd(tcp_t
*tcp
, uint32_t shrunk_count
)
3583 uint32_t snxt
= tcp
->tcp_snxt
;
3585 ASSERT(shrunk_count
> 0);
3587 if (!tcp
->tcp_is_wnd_shrnk
) {
3588 tcp
->tcp_snxt_shrunk
= snxt
;
3589 tcp
->tcp_is_wnd_shrnk
= B_TRUE
;
3590 } else if (SEQ_GT(snxt
, tcp
->tcp_snxt_shrunk
)) {
3591 tcp
->tcp_snxt_shrunk
= snxt
;
3594 /* Pretend we didn't send the data outside the window */
3595 snxt
-= shrunk_count
;
3597 /* Reset all the values per the now shrunk window */
3598 tcp_update_xmit_tail(tcp
, snxt
);
3599 tcp
->tcp_unsent
+= shrunk_count
;
3602 * If the SACK option is set, delete the entire list of
3603 * notsack'ed blocks.
3605 TCP_NOTSACK_REMOVE_ALL(tcp
->tcp_notsack_list
, tcp
);
3607 if (tcp
->tcp_suna
== tcp
->tcp_snxt
&& tcp
->tcp_swnd
== 0)
3609 * Make sure the timer is running so that we will probe a zero
3612 TCP_TIMER_RESTART(tcp
, tcp
->tcp_rto
);
3616 * tcp_fill_header is called by tcp_send() to fill the outgoing TCP header
3617 * with the template header, as well as other options such as time-stamp,
3621 tcp_fill_header(tcp_t
*tcp
, uchar_t
*rptr
, clock_t now
, int num_sack_blk
)
3623 tcpha_t
*tcp_tmpl
, *tcpha
;
3624 uint32_t *dst
, *src
;
3626 conn_t
*connp
= tcp
->tcp_connp
;
3628 ASSERT(OK_32PTR(rptr
));
3630 /* Template header */
3631 tcp_tmpl
= tcp
->tcp_tcpha
;
3633 /* Header of outgoing packet */
3634 tcpha
= (tcpha_t
*)(rptr
+ connp
->conn_ixa
->ixa_ip_hdr_length
);
3636 /* dst and src are opaque 32-bit fields, used for copying */
3637 dst
= (uint32_t *)rptr
;
3638 src
= (uint32_t *)connp
->conn_ht_iphc
;
3639 hdrlen
= connp
->conn_ht_iphc_len
;
3641 /* Fill time-stamp option if needed */
3642 if (tcp
->tcp_snd_ts_ok
) {
3643 U32_TO_BE32((uint32_t)now
,
3644 (char *)tcp_tmpl
+ TCP_MIN_HEADER_LENGTH
+ 4);
3645 U32_TO_BE32(tcp
->tcp_ts_recent
,
3646 (char *)tcp_tmpl
+ TCP_MIN_HEADER_LENGTH
+ 8);
3648 ASSERT(connp
->conn_ht_ulp_len
== TCP_MIN_HEADER_LENGTH
);
3652 * Copy the template header; is this really more efficient than
3653 * calling bcopy()? For simple IPv4/TCP, it may be the case,
3654 * but perhaps not for other scenarios.
3676 * Set the ECN info in the TCP header if it is not a zero
3677 * window probe. Zero window probe is only sent in
3678 * tcp_wput_data() and tcp_timer().
3680 if (tcp
->tcp_ecn_ok
&& !tcp
->tcp_zero_win_probe
) {
3681 TCP_SET_ECT(tcp
, rptr
);
3683 if (tcp
->tcp_ecn_echo_on
)
3684 tcpha
->tha_flags
|= TH_ECE
;
3685 if (tcp
->tcp_cwr
&& !tcp
->tcp_ecn_cwr_sent
) {
3686 tcpha
->tha_flags
|= TH_CWR
;
3687 tcp
->tcp_ecn_cwr_sent
= B_TRUE
;
3691 /* Fill in SACK options */
3692 if (num_sack_blk
> 0) {
3693 uchar_t
*wptr
= rptr
+ connp
->conn_ht_iphc_len
;
3697 wptr
[0] = TCPOPT_NOP
;
3698 wptr
[1] = TCPOPT_NOP
;
3699 wptr
[2] = TCPOPT_SACK
;
3700 wptr
[3] = TCPOPT_HEADER_LEN
+ num_sack_blk
*
3701 sizeof (sack_blk_t
);
3702 wptr
+= TCPOPT_REAL_SACK_LEN
;
3704 tmp
= tcp
->tcp_sack_list
;
3705 for (i
= 0; i
< num_sack_blk
; i
++) {
3706 U32_TO_BE32(tmp
[i
].begin
, wptr
);
3707 wptr
+= sizeof (tcp_seq
);
3708 U32_TO_BE32(tmp
[i
].end
, wptr
);
3709 wptr
+= sizeof (tcp_seq
);
3711 tcpha
->tha_offset_and_reserved
+=
3712 ((num_sack_blk
* 2 + 1) << 4);