4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
25 #include <sys/types.h>
26 #include <sys/stream.h>
27 #include <sys/strsun.h>
28 #include <sys/strsubr.h>
29 #include <sys/debug.h>
31 #include <sys/cmn_err.h>
32 #include <sys/tihdr.h>
34 #include <inet/common.h>
35 #include <inet/optcom.h>
37 #include <inet/ip_if.h>
38 #include <inet/ip_impl.h>
40 #include <inet/tcp_impl.h>
41 #include <inet/ipsec_impl.h>
42 #include <inet/ipclassifier.h>
43 #include <inet/ipp_common.h>
44 #include <inet/ip_if.h>
47 * This file implements TCP fusion - a protocol-less data path for TCP
48 * loopback connections. The fusion of two local TCP endpoints occurs
49 * at connection establishment time. Various conditions (see details
50 * in tcp_fuse()) need to be met for fusion to be successful. If it
51 * fails, we fall back to the regular TCP data path; if it succeeds,
52 * both endpoints proceed to use tcp_fuse_output() as the transmit path.
53 * tcp_fuse_output() enqueues application data directly onto the peer's
54 * receive queue; no protocol processing is involved.
56 * Synchronization is handled by squeue and the mutex tcp_non_sq_lock.
57 * One of the requirements for fusion to succeed is that both endpoints
58 * need to be using the same squeue. This ensures that neither side
59 * can disappear while the other side is still sending data. Flow
60 * control information is manipulated outside the squeue, so the
61 * tcp_non_sq_lock must be held when touching tcp_flow_stopped.
65 * Setting this to false means we disable fusion altogether and
66 * loopback connections would go through the protocol paths.
68 boolean_t do_tcp_fusion
= B_TRUE
;
71 * This routine gets called by the eager tcp upon changing state from
72 * SYN_RCVD to ESTABLISHED. It fuses a direct path between itself
73 * and the active connect tcp such that the regular tcp processing
74 * may be bypassed under allowable circumstances. Because the fusion
75 * requires both endpoints to be in the same squeue, it does not work
76 * for simultaneous active connects because there is no easy way to
77 * switch from one squeue to another once the connection is created.
78 * This is different from the eager tcp case where we assign it the
79 * same squeue as the one given to the active connect tcp during open.
82 tcp_fuse(tcp_t
*tcp
, uchar_t
*iphdr
, tcpha_t
*tcpha
)
84 conn_t
*peer_connp
, *connp
= tcp
->tcp_connp
;
86 tcp_stack_t
*tcps
= tcp
->tcp_tcps
;
88 ip_stack_t
*ipst
= tcps
->tcps_netstack
->netstack_ip
;
90 ASSERT(!tcp
->tcp_fused
);
91 ASSERT(tcp
->tcp_loopback
);
92 ASSERT(tcp
->tcp_loopback_peer
== NULL
);
94 * We need to inherit conn_rcvbuf of the listener tcp,
95 * but we can't really use tcp_listener since we get here after
96 * sending up T_CONN_IND and tcp_tli_accept() may be called
97 * independently, at which point tcp_listener is cleared;
98 * this is why we use tcp_saved_listener. The listener itself
99 * is guaranteed to be around until tcp_accept_finish() is called
100 * on this eager -- this won't happen until we're done since we're
101 * inside the eager's perimeter now.
103 ASSERT(tcp
->tcp_saved_listener
!= NULL
);
105 * Lookup peer endpoint; search for the remote endpoint having
106 * the reversed address-port quadruplet in ESTABLISHED state,
107 * which is guaranteed to be unique in the system. Zone check
108 * is applied accordingly for loopback address, but not for
109 * local address since we want fusion to happen across Zones.
111 if (connp
->conn_ipversion
== IPV4_VERSION
) {
112 peer_connp
= ipcl_conn_tcp_lookup_reversed_ipv4(connp
,
113 (ipha_t
*)iphdr
, tcpha
, ipst
);
115 peer_connp
= ipcl_conn_tcp_lookup_reversed_ipv6(connp
,
116 (ip6_t
*)iphdr
, tcpha
, ipst
);
120 * We can only proceed if peer exists, resides in the same squeue
121 * as our conn and is not raw-socket. We also restrict fusion to
122 * endpoints of the same type (STREAMS or non-STREAMS). The squeue
123 * assignment of this eager tcp was done earlier at the time of SYN
124 * processing in ip_fanout_tcp{_v6}. Note that sharing a squeue by
125 * itself doesn't guarantee a safe condition to fuse, hence we perform
126 * additional tests below.
128 ASSERT(peer_connp
== NULL
|| peer_connp
!= connp
);
129 if (peer_connp
== NULL
|| peer_connp
->conn_sqp
!= connp
->conn_sqp
||
130 !IPCL_IS_TCP(peer_connp
) ||
131 IPCL_IS_NONSTR(connp
) != IPCL_IS_NONSTR(peer_connp
)) {
132 if (peer_connp
!= NULL
) {
133 TCP_STAT(tcps
, tcp_fusion_unqualified
);
134 CONN_DEC_REF(peer_connp
);
138 peer_tcp
= peer_connp
->conn_tcp
; /* active connect tcp */
140 ASSERT(peer_tcp
!= NULL
&& peer_tcp
!= tcp
&& !peer_tcp
->tcp_fused
);
141 ASSERT(peer_tcp
->tcp_loopback_peer
== NULL
);
142 ASSERT(peer_connp
->conn_sqp
== connp
->conn_sqp
);
145 * Due to IRE changes the peer and us might not agree on tcp_loopback.
146 * We bail in that case.
148 if (!peer_tcp
->tcp_loopback
) {
149 TCP_STAT(tcps
, tcp_fusion_unqualified
);
150 CONN_DEC_REF(peer_connp
);
154 * Fuse the endpoints; we perform further checks against both
155 * tcp endpoints to ensure that a fusion is allowed to happen.
157 ns
= tcps
->tcps_netstack
;
158 ipst
= ns
->netstack_ip
;
160 if (!tcp
->tcp_unfusable
&& !peer_tcp
->tcp_unfusable
&&
161 tcp
->tcp_xmit_head
== NULL
&& peer_tcp
->tcp_xmit_head
== NULL
) {
163 queue_t
*peer_rq
= peer_connp
->conn_rq
;
165 ASSERT(!TCP_IS_DETACHED(peer_tcp
));
166 ASSERT(tcp
->tcp_fused_sigurg_mp
== NULL
);
167 ASSERT(peer_tcp
->tcp_fused_sigurg_mp
== NULL
);
170 * We need to drain data on both endpoints during unfuse.
171 * If we need to send up SIGURG at the time of draining,
172 * we want to be sure that an mblk is readily available.
173 * This is why we pre-allocate the M_PCSIG mblks for both
174 * endpoints which will only be used during/after unfuse.
175 * The mblk might already exist if we are doing a re-fuse.
177 if (!IPCL_IS_NONSTR(tcp
->tcp_connp
)) {
178 ASSERT(!IPCL_IS_NONSTR(peer_tcp
->tcp_connp
));
180 if (tcp
->tcp_fused_sigurg_mp
== NULL
) {
181 if ((mp
= allocb(1, BPRI_HI
)) == NULL
)
183 tcp
->tcp_fused_sigurg_mp
= mp
;
186 if (peer_tcp
->tcp_fused_sigurg_mp
== NULL
) {
187 if ((mp
= allocb(1, BPRI_HI
)) == NULL
)
189 peer_tcp
->tcp_fused_sigurg_mp
= mp
;
192 if ((mp
= allocb(sizeof (struct stroptions
),
197 /* Fuse both endpoints */
198 peer_tcp
->tcp_loopback_peer
= tcp
;
199 tcp
->tcp_loopback_peer
= peer_tcp
;
200 peer_tcp
->tcp_fused
= tcp
->tcp_fused
= B_TRUE
;
203 * We never use regular tcp paths in fusion and should
204 * therefore clear tcp_unsent on both endpoints. Having
205 * them set to non-zero values means asking for trouble
206 * especially after unfuse, where we may end up sending
207 * through regular tcp paths which expect xmit_list and
208 * friends to be correctly setup.
210 peer_tcp
->tcp_unsent
= tcp
->tcp_unsent
= 0;
212 tcp_timers_stop(tcp
);
213 tcp_timers_stop(peer_tcp
);
216 * Set receive buffer and max packet size for the
218 * eager's values will be set in tcp_accept_finish.
220 (void) tcp_rwnd_set(peer_tcp
, peer_tcp
->tcp_connp
->conn_rcvbuf
);
223 * Set the write offset value to zero since we won't
224 * be needing any room for TCP/IP headers.
226 if (!IPCL_IS_NONSTR(peer_tcp
->tcp_connp
)) {
227 struct stroptions
*stropt
;
229 DB_TYPE(mp
) = M_SETOPTS
;
230 mp
->b_wptr
+= sizeof (*stropt
);
232 stropt
= (struct stroptions
*)mp
->b_rptr
;
233 stropt
->so_flags
= SO_WROFF
| SO_MAXBLK
;
234 stropt
->so_wroff
= 0;
235 stropt
->so_maxblk
= INFPSZ
;
237 /* Send the options up */
238 putnext(peer_rq
, mp
);
240 struct sock_proto_props sopp
;
242 /* The peer is a non-STREAMS end point */
243 ASSERT(IPCL_IS_TCP(peer_connp
));
245 sopp
.sopp_flags
= SOCKOPT_WROFF
| SOCKOPT_MAXBLK
;
247 sopp
.sopp_maxblk
= INFPSZ
;
248 (*peer_connp
->conn_upcalls
->su_set_proto_props
)
249 (peer_connp
->conn_upper_handle
, &sopp
);
252 TCP_STAT(tcps
, tcp_fusion_unqualified
);
254 CONN_DEC_REF(peer_connp
);
258 if (tcp
->tcp_fused_sigurg_mp
!= NULL
) {
259 freeb(tcp
->tcp_fused_sigurg_mp
);
260 tcp
->tcp_fused_sigurg_mp
= NULL
;
262 if (peer_tcp
->tcp_fused_sigurg_mp
!= NULL
) {
263 freeb(peer_tcp
->tcp_fused_sigurg_mp
);
264 peer_tcp
->tcp_fused_sigurg_mp
= NULL
;
266 CONN_DEC_REF(peer_connp
);
270 * Unfuse a previously-fused pair of tcp loopback endpoints.
273 tcp_unfuse(tcp_t
*tcp
)
275 tcp_t
*peer_tcp
= tcp
->tcp_loopback_peer
;
276 tcp_stack_t
*tcps
= tcp
->tcp_tcps
;
278 ASSERT(tcp
->tcp_fused
&& peer_tcp
!= NULL
);
279 ASSERT(peer_tcp
->tcp_fused
&& peer_tcp
->tcp_loopback_peer
== tcp
);
280 ASSERT(tcp
->tcp_connp
->conn_sqp
== peer_tcp
->tcp_connp
->conn_sqp
);
281 ASSERT(tcp
->tcp_unsent
== 0 && peer_tcp
->tcp_unsent
== 0);
284 * Cancel any pending push timers.
286 if (tcp
->tcp_push_tid
!= 0) {
287 (void) TCP_TIMER_CANCEL(tcp
, tcp
->tcp_push_tid
);
288 tcp
->tcp_push_tid
= 0;
290 if (peer_tcp
->tcp_push_tid
!= 0) {
291 (void) TCP_TIMER_CANCEL(peer_tcp
, peer_tcp
->tcp_push_tid
);
292 peer_tcp
->tcp_push_tid
= 0;
296 * Drain any pending data; Note that in case of a detached tcp, the
297 * draining will happen later after the tcp is unfused. For non-
298 * urgent data, this can be handled by the regular tcp_rcv_drain().
299 * If we have urgent data sitting in the receive list, we will
300 * need to send up a SIGURG signal first before draining the data.
301 * All of these will be handled by the code in tcp_fuse_rcv_drain()
302 * when called from tcp_rcv_drain().
304 if (!TCP_IS_DETACHED(tcp
)) {
305 (void) tcp_fuse_rcv_drain(tcp
->tcp_connp
->conn_rq
, tcp
,
306 &tcp
->tcp_fused_sigurg_mp
);
308 if (!TCP_IS_DETACHED(peer_tcp
)) {
309 (void) tcp_fuse_rcv_drain(peer_tcp
->tcp_connp
->conn_rq
,
310 peer_tcp
, &peer_tcp
->tcp_fused_sigurg_mp
);
313 /* Lift up any flow-control conditions */
314 mutex_enter(&tcp
->tcp_non_sq_lock
);
315 if (tcp
->tcp_flow_stopped
) {
317 TCP_STAT(tcps
, tcp_fusion_backenabled
);
319 mutex_exit(&tcp
->tcp_non_sq_lock
);
321 mutex_enter(&peer_tcp
->tcp_non_sq_lock
);
322 if (peer_tcp
->tcp_flow_stopped
) {
323 tcp_clrqfull(peer_tcp
);
324 TCP_STAT(tcps
, tcp_fusion_backenabled
);
326 mutex_exit(&peer_tcp
->tcp_non_sq_lock
);
329 * Update tha_seq and tha_ack in the header template
331 tcp
->tcp_tcpha
->tha_seq
= htonl(tcp
->tcp_snxt
);
332 tcp
->tcp_tcpha
->tha_ack
= htonl(tcp
->tcp_rnxt
);
333 peer_tcp
->tcp_tcpha
->tha_seq
= htonl(peer_tcp
->tcp_snxt
);
334 peer_tcp
->tcp_tcpha
->tha_ack
= htonl(peer_tcp
->tcp_rnxt
);
336 /* Unfuse the endpoints */
337 peer_tcp
->tcp_fused
= tcp
->tcp_fused
= B_FALSE
;
338 peer_tcp
->tcp_loopback_peer
= tcp
->tcp_loopback_peer
= NULL
;
342 * Fusion output routine used to handle urgent data sent by STREAMS based
343 * endpoints. This routine is called by tcp_fuse_output() for handling
347 tcp_fuse_output_urg(tcp_t
*tcp
, mblk_t
*mp
)
350 struct T_exdata_ind
*tei
;
351 tcp_t
*peer_tcp
= tcp
->tcp_loopback_peer
;
352 mblk_t
*head
, *prev_head
= NULL
;
353 tcp_stack_t
*tcps
= tcp
->tcp_tcps
;
355 ASSERT(tcp
->tcp_fused
);
356 ASSERT(peer_tcp
!= NULL
&& peer_tcp
->tcp_loopback_peer
== tcp
);
357 ASSERT(!IPCL_IS_NONSTR(tcp
->tcp_connp
));
358 ASSERT(DB_TYPE(mp
) == M_PROTO
|| DB_TYPE(mp
) == M_PCPROTO
);
359 ASSERT(mp
->b_cont
!= NULL
&& DB_TYPE(mp
->b_cont
) == M_DATA
);
360 ASSERT(MBLKL(mp
) >= sizeof (*tei
) && MBLKL(mp
->b_cont
) > 0);
363 * Urgent data arrives in the form of T_EXDATA_REQ from above.
364 * Each occurrence denotes a new urgent pointer. For each new
365 * urgent pointer we signal (SIGURG) the receiving app to indicate
366 * that it needs to go into urgent mode. This is similar to the
367 * urgent data handling in the regular tcp. We don't need to keep
368 * track of where the urgent pointer is, because each T_EXDATA_REQ
369 * "advances" the urgent pointer for us.
371 * The actual urgent data carried by T_EXDATA_REQ is then prepended
372 * by a T_EXDATA_IND before being enqueued behind any existing data
373 * destined for the receiving app. There is only a single urgent
374 * pointer (out-of-band mark) for a given tcp. If the new urgent
375 * data arrives before the receiving app reads some existing urgent
376 * data, the previous marker is lost. This behavior is emulated
377 * accordingly below, by removing any existing T_EXDATA_IND messages
378 * and essentially converting old urgent data into non-urgent.
380 ASSERT(tcp
->tcp_valid_bits
& TCP_URG_VALID
);
381 /* Let sender get out of urgent mode */
382 tcp
->tcp_valid_bits
&= ~TCP_URG_VALID
;
385 * This flag indicates that a signal needs to be sent up.
386 * This flag will only get cleared once SIGURG is delivered and
387 * is not affected by the tcp_fused flag -- delivery will still
388 * happen even after an endpoint is unfused, to handle the case
389 * where the sending endpoint immediately closes/unfuses after
390 * sending urgent data and the accept is not yet finished.
392 peer_tcp
->tcp_fused_sigurg
= B_TRUE
;
394 /* Reuse T_EXDATA_REQ mblk for T_EXDATA_IND */
395 DB_TYPE(mp
) = M_PROTO
;
396 tei
= (struct T_exdata_ind
*)mp
->b_rptr
;
397 tei
->PRIM_type
= T_EXDATA_IND
;
399 mp
->b_wptr
= (uchar_t
*)&tei
[1];
401 TCP_STAT(tcps
, tcp_fusion_urg
);
402 TCPS_BUMP_MIB(tcps
, tcpOutUrg
);
404 head
= peer_tcp
->tcp_rcv_list
;
405 while (head
!= NULL
) {
407 * Remove existing T_EXDATA_IND, keep the data which follows
408 * it and relink our list. Note that we don't modify the
409 * tcp_rcv_last_tail since it never points to T_EXDATA_IND.
411 if (DB_TYPE(head
) != M_DATA
) {
414 ASSERT(DB_TYPE(mp1
->b_cont
) == M_DATA
);
417 head
->b_next
= mp1
->b_next
;
419 if (prev_head
!= NULL
)
420 prev_head
->b_next
= head
;
421 if (peer_tcp
->tcp_rcv_list
== mp1
)
422 peer_tcp
->tcp_rcv_list
= head
;
423 if (peer_tcp
->tcp_rcv_last_head
== mp1
)
424 peer_tcp
->tcp_rcv_last_head
= head
;
433 * Fusion output routine, called by tcp_output() and tcp_wput_proto().
434 * If we are modifying any member that can be changed outside the squeue,
435 * like tcp_flow_stopped, we need to take tcp_non_sq_lock.
438 tcp_fuse_output(tcp_t
*tcp
, mblk_t
*mp
, uint32_t send_size
)
440 conn_t
*connp
= tcp
->tcp_connp
;
441 tcp_t
*peer_tcp
= tcp
->tcp_loopback_peer
;
442 conn_t
*peer_connp
= peer_tcp
->tcp_connp
;
443 boolean_t flow_stopped
, peer_data_queued
= B_FALSE
;
444 boolean_t urgent
= (DB_TYPE(mp
) != M_DATA
);
445 boolean_t push
= B_TRUE
;
448 uint32_t recv_size
= send_size
;
449 tcp_stack_t
*tcps
= tcp
->tcp_tcps
;
450 netstack_t
*ns
= tcps
->tcps_netstack
;
451 ip_stack_t
*ipst
= ns
->netstack_ip
;
452 ipsec_stack_t
*ipss
= ns
->netstack_ipsec
;
453 iaflags_t ixaflags
= connp
->conn_ixa
->ixa_flags
;
454 boolean_t do_ipsec
, hooks_out
, hooks_in
, ipobs_enabled
;
456 ASSERT(tcp
->tcp_fused
);
457 ASSERT(peer_tcp
!= NULL
&& peer_tcp
->tcp_loopback_peer
== tcp
);
458 ASSERT(connp
->conn_sqp
== peer_connp
->conn_sqp
);
459 ASSERT(DB_TYPE(mp
) == M_DATA
|| DB_TYPE(mp
) == M_PROTO
||
460 DB_TYPE(mp
) == M_PCPROTO
);
462 if (send_size
== 0) {
468 * Handle urgent data; we either send up SIGURG to the peer now
469 * or do it later when we drain, in case the peer is detached
470 * or if we're short of memory for M_PCSIG mblk.
473 tcp_fuse_output_urg(tcp
, mp
);
479 * Check that we are still using an IRE_LOCAL or IRE_LOOPBACK before
482 if (!ip_output_verify_local(connp
->conn_ixa
))
486 * Build IP and TCP header in case we have something that needs the
487 * headers. Those cases are:
492 * If tcp_xmit_mp() fails to dupb() the message, unfuse the connection
493 * and fall back to the regular path.
495 if (ixaflags
& IXAF_IS_IPV4
) {
496 do_ipsec
= (ixaflags
& IXAF_IPSEC_SECURE
) ||
497 CONN_INBOUND_POLICY_PRESENT(peer_connp
, ipss
);
499 hooks_out
= HOOKS4_INTERESTED_LOOPBACK_OUT(ipst
);
500 hooks_in
= HOOKS4_INTERESTED_LOOPBACK_IN(ipst
);
501 ipobs_enabled
= (ipst
->ips_ip4_observe
.he_interested
!= 0);
503 do_ipsec
= (ixaflags
& IXAF_IPSEC_SECURE
) ||
504 CONN_INBOUND_POLICY_PRESENT_V6(peer_connp
, ipss
);
506 hooks_out
= HOOKS6_INTERESTED_LOOPBACK_OUT(ipst
);
507 hooks_in
= HOOKS6_INTERESTED_LOOPBACK_IN(ipst
);
508 ipobs_enabled
= (ipst
->ips_ip6_observe
.he_interested
!= 0);
511 /* We use bitwise 'or' rather than logical '||' for efficiency */
512 if (ipobs_enabled
| do_ipsec
| hooks_in
| hooks_out
) {
513 if ((mp1
= tcp_xmit_mp(tcp
, mp1
, tcp
->tcp_mss
, NULL
, NULL
,
514 tcp
->tcp_snxt
, B_TRUE
, NULL
, B_FALSE
)) == NULL
)
515 /* If tcp_xmit_mp fails, use regular path */
519 * Leave all IP relevant processes to ip_output_process_local(),
520 * which handles IPsec, IPobs, and FW_HOOKS.
522 mp1
= ip_output_process_local(mp1
, connp
->conn_ixa
, hooks_out
,
523 hooks_in
, do_ipsec
? peer_connp
: NULL
);
525 /* If the message is dropped for any reason. */
530 * Data length might have been changed by FW_HOOKS.
531 * We assume that the first mblk contains the TCP/IP headers.
533 if (hooks_in
|| hooks_out
) {
536 ip_hdr_len
= (ixaflags
& IXAF_IS_IPV4
) ?
537 IPH_HDR_LENGTH((ipha_t
*)mp1
->b_rptr
) :
538 ip_hdr_length_v6(mp1
, (ip6_t
*)mp1
->b_rptr
);
540 tcpha
= (tcpha_t
*)&mp1
->b_rptr
[ip_hdr_len
];
541 ASSERT((uchar_t
*)tcpha
+ sizeof (tcpha_t
) <=
543 recv_size
+= htonl(tcpha
->tha_seq
) - tcp
->tcp_snxt
;
548 * The message duplicated by tcp_xmit_mp is freed.
549 * Note: the original message passed in remains unchanged.
555 * Enqueue data into the peer's receive list; we may or may not
556 * drain the contents depending on the conditions below.
558 * For non-STREAMS sockets we normally queue data directly in the
559 * socket by calling the su_recv upcall. However, if the peer is
560 * detached we use tcp_rcv_enqueue() instead. Queued data will be
561 * drained when the accept completes (in tcp_accept_finish()).
563 if (IPCL_IS_NONSTR(peer_connp
) &&
564 !TCP_IS_DETACHED(peer_tcp
)) {
568 if ((tcp
->tcp_valid_bits
& TCP_URG_VALID
) &&
569 (tcp
->tcp_urg
== tcp
->tcp_snxt
)) {
571 (*peer_connp
->conn_upcalls
->su_signal_oob
)
572 (peer_connp
->conn_upper_handle
, 0);
573 tcp
->tcp_valid_bits
&= ~TCP_URG_VALID
;
575 if ((*peer_connp
->conn_upcalls
->su_recv
)(
576 peer_connp
->conn_upper_handle
, mp
, recv_size
,
577 flags
, &error
, &push
) < 0) {
578 ASSERT(error
!= EOPNOTSUPP
);
579 peer_data_queued
= B_TRUE
;
582 if (IPCL_IS_NONSTR(peer_connp
) &&
583 (tcp
->tcp_valid_bits
& TCP_URG_VALID
) &&
584 (tcp
->tcp_urg
== tcp
->tcp_snxt
)) {
586 * Can not deal with urgent pointers
587 * that arrive before the connection has been
590 tcp
->tcp_valid_bits
&= ~TCP_URG_VALID
;
595 tcp_rcv_enqueue(peer_tcp
, mp
, recv_size
,
596 tcp
->tcp_connp
->conn_cred
);
598 /* In case it wrapped around and also to keep it constant */
599 peer_tcp
->tcp_rwnd
+= recv_size
;
603 * Exercise flow-control when needed; we will get back-enabled
604 * in either tcp_accept_finish(), tcp_unfuse(), or when data is
605 * consumed. If peer endpoint is detached, we emulate streams flow
606 * control by checking the peer's queue size and high water mark;
607 * otherwise we simply use canputnext() to decide if we need to stop
610 * Since we are accessing our tcp_flow_stopped and might modify it,
611 * we need to take tcp->tcp_non_sq_lock.
613 mutex_enter(&tcp
->tcp_non_sq_lock
);
614 flow_stopped
= tcp
->tcp_flow_stopped
;
615 if ((TCP_IS_DETACHED(peer_tcp
) &&
616 (peer_tcp
->tcp_rcv_cnt
>= peer_connp
->conn_rcvbuf
)) ||
617 (!TCP_IS_DETACHED(peer_tcp
) &&
618 !IPCL_IS_NONSTR(peer_connp
) && !canputnext(peer_connp
->conn_rq
))) {
619 peer_data_queued
= B_TRUE
;
622 if (!flow_stopped
&& (peer_data_queued
||
623 (TCP_UNSENT_BYTES(tcp
) >= connp
->conn_sndbuf
))) {
625 flow_stopped
= B_TRUE
;
626 TCP_STAT(tcps
, tcp_fusion_flowctl
);
627 DTRACE_PROBE3(tcp__fuse__output__flowctl
, tcp_t
*, tcp
,
628 uint_t
, send_size
, uint_t
, peer_tcp
->tcp_rcv_cnt
);
629 } else if (flow_stopped
&& !peer_data_queued
&&
630 (TCP_UNSENT_BYTES(tcp
) <= connp
->conn_sndlowat
)) {
632 TCP_STAT(tcps
, tcp_fusion_backenabled
);
633 flow_stopped
= B_FALSE
;
635 mutex_exit(&tcp
->tcp_non_sq_lock
);
637 ipst
->ips_loopback_packets
++;
638 tcp
->tcp_last_sent_len
= send_size
;
640 /* Need to adjust the following SNMP MIB-related variables */
641 tcp
->tcp_snxt
+= send_size
;
642 tcp
->tcp_suna
= tcp
->tcp_snxt
;
643 peer_tcp
->tcp_rnxt
+= recv_size
;
644 peer_tcp
->tcp_last_recv_len
= recv_size
;
645 peer_tcp
->tcp_rack
= peer_tcp
->tcp_rnxt
;
647 TCPS_BUMP_MIB(tcps
, tcpOutDataSegs
);
648 TCPS_UPDATE_MIB(tcps
, tcpOutDataBytes
, send_size
);
650 TCPS_BUMP_MIB(tcps
, tcpHCInSegs
);
651 TCPS_BUMP_MIB(tcps
, tcpInDataInorderSegs
);
652 TCPS_UPDATE_MIB(tcps
, tcpInDataInorderBytes
, send_size
);
654 BUMP_LOCAL(tcp
->tcp_obsegs
);
655 BUMP_LOCAL(peer_tcp
->tcp_ibsegs
);
657 DTRACE_TCP5(send
, void, NULL
, ip_xmit_attr_t
*, connp
->conn_ixa
,
658 __dtrace_tcp_void_ip_t
*, NULL
, tcp_t
*, tcp
,
659 __dtrace_tcp_tcph_t
*, NULL
);
660 DTRACE_TCP5(receive
, void, NULL
, ip_xmit_attr_t
*,
661 peer_connp
->conn_ixa
, __dtrace_tcp_void_ip_t
*, NULL
,
662 tcp_t
*, peer_tcp
, __dtrace_tcp_tcph_t
*, NULL
);
664 if (!IPCL_IS_NONSTR(peer_tcp
->tcp_connp
) &&
665 !TCP_IS_DETACHED(peer_tcp
)) {
667 * Drain the peer's receive queue if it has urgent data or if
668 * we're not flow-controlled.
670 if (urgent
|| !flow_stopped
) {
671 ASSERT(peer_tcp
->tcp_rcv_list
!= NULL
);
673 * For TLI-based streams, a thread in tcp_accept_swap()
674 * can race with us. That thread will ensure that the
675 * correct peer_connp->conn_rq is globally visible
676 * before peer_tcp->tcp_detached is visible as clear,
677 * but we must also ensure that the load of conn_rq
678 * cannot be reordered to be before the tcp_detached
682 (void) tcp_fuse_rcv_drain(peer_connp
->conn_rq
, peer_tcp
,
693 * This routine gets called to deliver data upstream on a fused or
694 * previously fused tcp loopback endpoint; the latter happens only
695 * when there is a pending SIGURG signal plus urgent data that couldn't
696 * be sent upstream earlier.
699 tcp_fuse_rcv_drain(queue_t
*q
, tcp_t
*tcp
, mblk_t
**sigurg_mpp
)
702 conn_t
*connp
= tcp
->tcp_connp
;
707 tcp_stack_t
*tcps
= tcp
->tcp_tcps
;
708 tcp_t
*peer_tcp
= tcp
->tcp_loopback_peer
;
710 ASSERT(tcp
->tcp_loopback
);
711 ASSERT(tcp
->tcp_fused
|| tcp
->tcp_fused_sigurg
);
712 ASSERT(!tcp
->tcp_fused
|| tcp
->tcp_loopback_peer
!= NULL
);
713 ASSERT(IPCL_IS_NONSTR(connp
) || sigurg_mpp
!= NULL
|| tcp
->tcp_fused
);
715 /* No need for the push timer now, in case it was scheduled */
716 if (tcp
->tcp_push_tid
!= 0) {
717 (void) TCP_TIMER_CANCEL(tcp
, tcp
->tcp_push_tid
);
718 tcp
->tcp_push_tid
= 0;
721 * If there's urgent data sitting in receive list and we didn't
722 * get a chance to send up a SIGURG signal, make sure we send
723 * it first before draining in order to ensure that SIOCATMARK
726 if (tcp
->tcp_fused_sigurg
) {
727 ASSERT(!IPCL_IS_NONSTR(tcp
->tcp_connp
));
729 tcp
->tcp_fused_sigurg
= B_FALSE
;
731 * sigurg_mpp is normally NULL, i.e. when we're still
732 * fused and didn't get here because of tcp_unfuse().
733 * In this case try hard to allocate the M_PCSIG mblk.
735 if (sigurg_mpp
== NULL
&&
736 (mp
= allocb(1, BPRI_HI
)) == NULL
&&
737 (mp
= allocb_tryhard(1)) == NULL
) {
738 /* Alloc failed; try again next time */
739 tcp
->tcp_push_tid
= TCP_TIMER(tcp
,
740 tcp_push_timer
, tcps
->tcps_push_timer_interval
);
742 } else if (sigurg_mpp
!= NULL
) {
744 * Use the supplied M_PCSIG mblk; it means we're
745 * either unfused or in the process of unfusing,
746 * and the drain must happen now.
753 /* Send up the signal */
754 DB_TYPE(mp
) = M_PCSIG
;
755 *mp
->b_wptr
++ = (uchar_t
)SIGURG
;
759 * Let the regular tcp_rcv_drain() path handle
760 * draining the data if we're no longer fused.
767 while ((mp
= tcp
->tcp_rcv_list
) != NULL
) {
768 tcp
->tcp_rcv_list
= mp
->b_next
;
773 ASSERT(!IPCL_IS_NONSTR(connp
));
775 TCP_STAT(tcps
, tcp_fusion_putnext
);
779 ASSERT(cnt
== tcp
->tcp_rcv_cnt
);
781 tcp
->tcp_rcv_last_head
= NULL
;
782 tcp
->tcp_rcv_last_tail
= NULL
;
783 tcp
->tcp_rcv_cnt
= 0;
784 tcp
->tcp_rwnd
= tcp
->tcp_connp
->conn_rcvbuf
;
786 mutex_enter(&peer_tcp
->tcp_non_sq_lock
);
787 if (peer_tcp
->tcp_flow_stopped
&& (TCP_UNSENT_BYTES(peer_tcp
) <=
788 peer_tcp
->tcp_connp
->conn_sndlowat
)) {
789 tcp_clrqfull(peer_tcp
);
790 TCP_STAT(tcps
, tcp_fusion_backenabled
);
792 mutex_exit(&peer_tcp
->tcp_non_sq_lock
);
798 * Calculate the size of receive buffer for a fused tcp endpoint.
801 tcp_fuse_set_rcv_hiwat(tcp_t
*tcp
, size_t rwnd
)
803 tcp_stack_t
*tcps
= tcp
->tcp_tcps
;
806 ASSERT(tcp
->tcp_fused
);
808 /* Ensure that value is within the maximum upper bound */
809 if (rwnd
> tcps
->tcps_max_buf
)
810 rwnd
= tcps
->tcps_max_buf
;
812 * Round up to system page size in case SO_RCVBUF is modified
813 * after SO_SNDBUF; the latter is also similarly rounded up.
815 rwnd
= P2ROUNDUP_TYPED(rwnd
, PAGESIZE
, size_t);
816 max_win
= TCP_MAXWIN
<< tcp
->tcp_rcv_ws
;
817 if (rwnd
> max_win
) {
818 rwnd
= max_win
- (max_win
% tcp
->tcp_mss
);
819 if (rwnd
< tcp
->tcp_mss
)
824 * Record high water mark, this is used for flow-control
825 * purposes in tcp_fuse_output().
827 tcp
->tcp_connp
->conn_rcvbuf
= rwnd
;
828 tcp
->tcp_rwnd
= rwnd
;
833 * Calculate the maximum outstanding unread data block for a fused tcp endpoint.
836 tcp_fuse_maxpsz(tcp_t
*tcp
)
838 tcp_t
*peer_tcp
= tcp
->tcp_loopback_peer
;
839 conn_t
*connp
= tcp
->tcp_connp
;
840 uint_t sndbuf
= connp
->conn_sndbuf
;
841 uint_t maxpsz
= sndbuf
;
843 ASSERT(tcp
->tcp_fused
);
844 ASSERT(peer_tcp
!= NULL
);
845 ASSERT(peer_tcp
->tcp_connp
->conn_rcvbuf
!= 0);
847 * In the fused loopback case, we want the stream head to split
848 * up larger writes into smaller chunks for a more accurate flow-
849 * control accounting. Our maxpsz is half of the sender's send
850 * buffer or the receiver's receive buffer, whichever is smaller.
851 * We round up the buffer to system page size due to the lack of
852 * TCP MSS concept in Fusion.
854 if (maxpsz
> peer_tcp
->tcp_connp
->conn_rcvbuf
)
855 maxpsz
= peer_tcp
->tcp_connp
->conn_rcvbuf
;
856 maxpsz
= P2ROUNDUP_TYPED(maxpsz
, PAGESIZE
, uint_t
) >> 1;
862 * Called to release flow control.
865 tcp_fuse_backenable(tcp_t
*tcp
)
867 tcp_t
*peer_tcp
= tcp
->tcp_loopback_peer
;
869 ASSERT(tcp
->tcp_fused
);
870 ASSERT(peer_tcp
!= NULL
&& peer_tcp
->tcp_fused
);
871 ASSERT(peer_tcp
->tcp_loopback_peer
== tcp
);
872 ASSERT(!TCP_IS_DETACHED(tcp
));
873 ASSERT(tcp
->tcp_connp
->conn_sqp
==
874 peer_tcp
->tcp_connp
->conn_sqp
);
876 if (tcp
->tcp_rcv_list
!= NULL
)
877 (void) tcp_fuse_rcv_drain(tcp
->tcp_connp
->conn_rq
, tcp
, NULL
);
879 mutex_enter(&peer_tcp
->tcp_non_sq_lock
);
880 if (peer_tcp
->tcp_flow_stopped
&&
881 (TCP_UNSENT_BYTES(peer_tcp
) <=
882 peer_tcp
->tcp_connp
->conn_sndlowat
)) {
883 tcp_clrqfull(peer_tcp
);
885 mutex_exit(&peer_tcp
->tcp_non_sq_lock
);
887 TCP_STAT(tcp
->tcp_tcps
, tcp_fusion_backenabled
);