/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol (TCP).
 *
 * Version:	$Id: tcp_output.c,v 1.125 2000/08/09 11:59:04 davem Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *
 * Changes:	Pedro Roque	:	Retransmit queue handled by TCP.
 *				:	Fragmentation on mtu decrease
 *				:	Segment collapse on retransmit
 *		Linus Torvalds	:	send_delayed_ack
 *		David S. Miller	:	Charge memory using the right skb
 *					during syn/ack processing.
 *		David S. Miller	:	Output engine completely rewritten.
 *		Andrea Arcangeli:	SYNACK carry ts_recent in tsecr.
 *		Cacophonix Gaul	:	draft-minshall-nagle-01
 *		J Hadi Salim	:	ECN support
 */

#include <net/tcp.h>

#include <linux/smp_lock.h>
/* People can turn this off for buggy TCP's found in printers etc. */
int sysctl_tcp_retrans_collapse = 1;

static __inline__
void update_send_head(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb)
{
	tp->send_head = skb->next;
	if (tp->send_head == (struct sk_buff *) &sk->write_queue)
		tp->send_head = NULL;
	tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
	if (tp->packets_out++ == 0)
		tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
}
/* SND.NXT, if window was not shrunk.
 * If window has been shrunk, what should we make? It is not clear at all.
 * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
 * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
 * invalid. OK, let's make this for now:
 */
static __inline__ __u32 tcp_acceptable_seq(struct sock *sk, struct tcp_opt *tp)
{
	if (!before(tp->snd_una+tp->snd_wnd, tp->snd_nxt))
		return tp->snd_nxt;
	else
		return tp->snd_una+tp->snd_wnd;
}
/* Calculate mss to advertise in SYN segment.
 * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
 *
 * 1. It is independent of path mtu.
 * 2. Ideally, it is maximal possible segment size i.e. 65535-40.
 * 3. For IPv4 it is reasonable to calculate it from maximal MTU of
 *    attached devices, because some buggy hosts are confused by
 *    large MTUs.
 * 4. We do not make 3, we advertise MSS, calculated from first
 *    hop device mtu, but allow to raise it to ip_rt_min_advmss.
 *    This may be overridden via information stored in routing table.
 * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
 *    probably even Jumbo".
 */
static __u16 tcp_advertise_mss(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct dst_entry *dst = __sk_dst_get(sk);
	int mss = tp->advmss;

	if (dst && dst->advmss < mss) {
		mss = dst->advmss;
		tp->advmss = mss;
	}

	return (__u16)mss;
}
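
/* For illustration only (not part of the original file): on a plain
 * Ethernet path tp->advmss is typically 1460 (1500 byte MTU minus 20
 * bytes of IPv4 header and 20 bytes of TCP header).  If the route
 * carries a smaller advertised MSS, say dst->advmss == 536, the SYN
 * advertises 536 and tp->advmss is lowered to match.
 */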
/* RFC2861. Reset CWND after idle period longer than RTO to "restart window".
 * This is the first part of cwnd validation mechanism.
 */
static void tcp_cwnd_restart(struct tcp_opt *tp)
{
	s32 delta = tcp_time_stamp - tp->lsndtime;
	u32 restart_cwnd = tcp_init_cwnd(tp);
	u32 cwnd = tp->snd_cwnd;

	tp->snd_ssthresh = tcp_current_ssthresh(tp);
	restart_cwnd = min(restart_cwnd, cwnd);

	/* Halve cwnd once for each full RTO that elapsed while idle. */
	while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd)
		cwnd >>= 1;
	tp->snd_cwnd = max(cwnd, restart_cwnd);
	tp->snd_cwnd_stamp = tcp_time_stamp;
	tp->snd_cwnd_used = 0;
}
static __inline__
void tcp_event_data_sent(struct tcp_opt *tp, struct sk_buff *skb)
{
	u32 now = tcp_time_stamp;

	if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto)
		tcp_cwnd_restart(tp);

	tp->lsndtime = now;

	/* If it is a reply for ato after last received
	 * packet, enter pingpong mode.
	 */
	if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato)
		tp->ack.pingpong = 1;
}
static __inline__
void tcp_event_ack_sent(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	tcp_dec_quickack_mode(tp);
	tcp_clear_xmit_timer(sk, TCP_TIME_DACK);

	/* If we ever saw N>1 small segments from peer, it has
	 * enough of send buffer to send N packets and does not nagle.
	 * Hence, we may delay acks more aggressively.
	 */
	if (tp->ack.rcv_small > tp->ack.rcv_thresh+1)
		tp->ack.rcv_thresh = tp->ack.rcv_small-1;
	tp->ack.rcv_small = 0;
}
/* Choose a new window to advertise, update state in tcp_opt for the
 * socket, and return result with RFC1323 scaling applied.  The return
 * value can be stuffed directly into th->window for an outgoing
 * frame.
 */
static __inline__ u16 tcp_select_window(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	u32 cur_win = tcp_receive_window(tp);
	u32 new_win = __tcp_select_window(sk);

	/* Never shrink the offered window */
	if (new_win < cur_win) {
		/* Danger Will Robinson!
		 * Don't update rcv_wup/rcv_wnd here or else
		 * we will not be able to advertise a zero
		 * window in time.  --DaveM
		 *
		 * Relax Will Robinson.
		 */
		new_win = cur_win;
	}
	tp->rcv_wnd = new_win;
	tp->rcv_wup = tp->rcv_nxt;

	/* RFC1323 scaling applied */
	new_win >>= tp->rcv_wscale;

#ifdef TCP_FORMAL_WINDOW
	if (new_win == 0) {
		/* If we advertise zero window, disable fast path. */
		tp->pred_flags = 0;
	} else if (cur_win == 0 && tp->pred_flags == 0 &&
		   skb_queue_len(&tp->out_of_order_queue) == 0 &&
		   !tp->urg_data) {
		/* If we open zero window, enable fast path.
		   Without this it will be open by the first data packet,
		   it is too late to merge checksumming to copy.
		 */
		tcp_fast_path_on(tp);
	}
#endif

	return new_win;
}
/* This routine actually transmits TCP packets queued in by
 * tcp_do_sendmsg().  This is used by both the initial
 * transmission and possible later retransmissions.
 * All SKB's seen here are completely headerless.  It is our
 * job to build the TCP header, and pass the packet down to
 * IP so it can do the same plus pass the packet off to the
 * device.
 *
 * We are working here with either a clone of the original
 * SKB, or a fresh unique copy made by the retransmit engine.
 */
int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
{
	if (skb != NULL) {
		struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
		int tcp_header_size = tp->tcp_header_len;
		struct tcphdr *th;
		int sysctl_flags;
		int err;

#define SYSCTL_FLAG_TSTAMPS	0x1
#define SYSCTL_FLAG_WSCALE	0x2
#define SYSCTL_FLAG_SACK	0x4

		sysctl_flags = 0;
		if (tcb->flags & TCPCB_FLAG_SYN) {
			tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
			if (sysctl_tcp_timestamps) {
				tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
				sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
			}
			if (sysctl_tcp_window_scaling) {
				tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
				sysctl_flags |= SYSCTL_FLAG_WSCALE;
			}
			if (sysctl_tcp_sack) {
				sysctl_flags |= SYSCTL_FLAG_SACK;
				if (!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
					tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
			}
		} else if (tp->eff_sacks) {
			/* A SACK is 2 pad bytes, a 2 byte header, plus
			 * 2 32-bit sequence numbers for each SACK block.
			 */
			tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
					    (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK));
		}
		th = (struct tcphdr *) skb_push(skb, tcp_header_size);
		skb->h.th = th;
		skb_set_owner_w(skb, sk);

		/* Build TCP header and checksum it. */
		th->source		= sk->sport;
		th->dest		= sk->dport;
		th->seq			= htonl(tcb->seq);
		th->ack_seq		= htonl(tp->rcv_nxt);
		*(((__u16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) | tcb->flags);
		if (tcb->flags & TCPCB_FLAG_SYN) {
			/* RFC1323: The window in SYN & SYN/ACK segments
			 * is never scaled.
			 */
			th->window	= htons(tp->rcv_wnd);
		} else {
			th->window	= htons(tcp_select_window(sk));
		}
		th->check		= 0;
		th->urg_ptr		= ntohs(tcb->urg_ptr);

		if (tcb->flags & TCPCB_FLAG_SYN) {
			tcp_syn_build_options((__u32 *)(th + 1),
					      tcp_advertise_mss(sk),
					      (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
					      (sysctl_flags & SYSCTL_FLAG_SACK),
					      (sysctl_flags & SYSCTL_FLAG_WSCALE),
					      tp->rcv_wscale,
					      tcb->when,
					      tp->ts_recent);
		} else {
			tcp_build_and_update_options((__u32 *)(th + 1),
						     tp, tcb->when);

			TCP_ECN_send(sk, tp, skb, tcp_header_size);
		}
		tp->af_specific->send_check(sk, th, skb->len, skb);

		if (tcb->flags & TCPCB_FLAG_ACK)
			tcp_event_ack_sent(sk);

		if (skb->len != tcp_header_size)
			tcp_event_data_sent(tp, skb);

		TCP_INC_STATS(TcpOutSegs);

		err = tp->af_specific->queue_xmit(skb);
		if (err <= 0)
			return err;

		tcp_enter_cwr(tp);

		/* NET_XMIT_CN is special. It does not guarantee,
		 * that this packet is lost. It tells that device
		 * is about to start to drop packets or already
		 * drops some packets of the same priority and
		 * invokes us to send less aggressively.
		 */
		return err == NET_XMIT_CN ? 0 : err;
	}
	return -ENOBUFS;
#undef SYSCTL_FLAG_TSTAMPS
#undef SYSCTL_FLAG_WSCALE
#undef SYSCTL_FLAG_SACK
}
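
/* Header-size arithmetic, for illustration only (not in the original file):
 * a SYN with timestamps, window scaling and SACK enabled uses
 * sizeof(struct tcphdr) 20 + TCPOLEN_MSS 4 + TCPOLEN_TSTAMP_ALIGNED 12 +
 * TCPOLEN_WSCALE_ALIGNED 4 = 40 bytes; SACK-permitted rides in the NOPs of
 * the timestamp option, so TCPOLEN_SACKPERM_ALIGNED (4) is only added when
 * timestamps are off.  An established-state segment carrying n SACK blocks
 * grows by TCPOLEN_SACK_BASE_ALIGNED 4 + n * TCPOLEN_SACK_PERBLOCK 8.
 */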
/* This is the main buffer sending routine. We queue the buffer
 * and decide whether to queue or transmit now.
 *
 * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
 * otherwise socket can stall.
 */
void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue, unsigned cur_mss)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	/* Advance write_seq and place onto the write_queue. */
	tp->write_seq = TCP_SKB_CB(skb)->end_seq;
	__skb_queue_tail(&sk->write_queue, skb);
	tcp_charge_skb(sk, skb);

	if (!force_queue && tp->send_head == NULL && tcp_snd_test(tp, skb, cur_mss, 1)) {
		/* Send it out now. */
		TCP_SKB_CB(skb)->when = tcp_time_stamp;
		if (tcp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL)) == 0) {
			tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
			tcp_minshall_update(tp, cur_mss, skb);
			if (tp->packets_out++ == 0)
				tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
			return;
		}
	}
	/* Queue it, remembering where we must start sending. */
	if (tp->send_head == NULL)
		tp->send_head = skb;
}
/* Function to create two new TCP segments.  Shrinks the given segment
 * to the specified size and appends a new segment with the rest of the
 * packet to the list.  This won't be called frequently, I hope.
 * Remember, these are still headerless SKBs at this point.
 */
static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	struct sk_buff *buff;
	int nsize = skb->len - len;
	u16 flags;

	/* Get a new skb... force flag on. */
	buff = tcp_alloc_skb(sk, nsize + MAX_TCP_HEADER + 15, GFP_ATOMIC);
	if (buff == NULL)
		return -ENOMEM; /* We'll just try again later. */
	tcp_charge_skb(sk, buff);

	/* Reserve space for headers. */
	skb_reserve(buff, MAX_TCP_HEADER);

	/* Correct the sequence numbers. */
	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;

	/* PSH and FIN should only be set in the second packet. */
	flags = TCP_SKB_CB(skb)->flags;
	TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH);
	if (flags & TCPCB_FLAG_URG) {
		u16 old_urg_ptr = TCP_SKB_CB(skb)->urg_ptr;

		/* Urgent data is always a pain in the ass. */
		if (old_urg_ptr > len) {
			TCP_SKB_CB(skb)->flags &= ~(TCPCB_FLAG_URG);
			TCP_SKB_CB(skb)->urg_ptr = 0;
			TCP_SKB_CB(buff)->urg_ptr = old_urg_ptr - len;
		} else {
			flags &= ~(TCPCB_FLAG_URG);
		}
	}
	if (!(flags & TCPCB_FLAG_URG))
		TCP_SKB_CB(buff)->urg_ptr = 0;
	TCP_SKB_CB(buff)->flags = flags;
	TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_EVER_RETRANS);
	if (TCP_SKB_CB(buff)->sacked & TCPCB_LOST) {
		tp->lost_out++;
		tp->left_out++;
	}

	/* Copy and checksum data tail into the new buffer. */
	buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize),
					       nsize, 0);

	/* This takes care of the FIN sequence number too. */
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
	skb_trim(skb, len);

	/* Rechecksum original buffer. */
	skb->csum = csum_partial(skb->data, skb->len, 0);

	/* Looks stupid, but our code really uses when of
	 * skbs, which it never sent before. --ANK
	 */
	TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;

	/* Link BUFF into the send queue. */
	__skb_append(skb, buff);

	return 0;
}
/* This function synchronizes snd mss to current pmtu/exthdr set.

   tp->user_mss is mss set by user by TCP_MAXSEG. It does NOT count
   for TCP options, but includes only bare TCP header.

   tp->mss_clamp is mss negotiated at connection setup.
   It is minimum of user_mss and mss received with SYN.
   It also does not include TCP options.

   tp->pmtu_cookie is last pmtu, seen by this function.

   tp->mss_cache is current effective sending mss, including
   all tcp options except for SACKs. It is evaluated,
   taking into account current pmtu, but never exceeds
   tp->mss_clamp.

   NOTE1. rfc1122 clearly states that advertised MSS
   DOES NOT include either tcp or ip options.

   NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
   this function.			--ANK (980731)
 */

int tcp_sync_mss(struct sock *sk, u32 pmtu)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	int mss_now;

	/* Calculate base mss without TCP options:
	   It is MMS_S - sizeof(tcphdr) of rfc1122
	 */

	mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);

	/* Clamp it (mss_clamp does not include tcp options) */
	if (mss_now > tp->mss_clamp)
		mss_now = tp->mss_clamp;

	/* Now subtract optional transport overhead */
	mss_now -= tp->ext_header_len;

	/* Then reserve room for full set of TCP options and 8 bytes of data */
	if (mss_now < 48)
		mss_now = 48;

	/* Now subtract TCP options size, not including SACKs */
	mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);

	/* Bound mss with half of window */
	if (tp->max_window && mss_now > (tp->max_window>>1))
		mss_now = max((tp->max_window>>1), 68 - tp->tcp_header_len);

	/* And store cached results */
	tp->pmtu_cookie = pmtu;
	tp->mss_cache = mss_now;

	return mss_now;
}
/* This routine writes packets to the network.  It advances the
 * send_head.  This happens as incoming acks open up the remote
 * window for us.
 *
 * Returns 1, if no segments are in flight and we have queued segments, but
 * cannot send anything now because of SWS or another problem.
 */
int tcp_write_xmit(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	unsigned int mss_now;

	/* If we are closed, the bytes will have to remain here.
	 * In time closedown will finish, we empty the write queue and all
	 * will be happy.
	 */
	if (sk->state != TCP_CLOSE) {
		struct sk_buff *skb;
		int sent_pkts = 0;

		/* Account for SACKS, we may need to fragment due to this.
		 * It is just like the real MSS changing on us midstream.
		 * We also handle things correctly when the user adds some
		 * IP options mid-stream.  Silly to do, but cover it.
		 */
		mss_now = tcp_current_mss(sk);

		while ((skb = tp->send_head) &&
		       tcp_snd_test(tp, skb, mss_now, tcp_skb_is_last(sk, skb))) {
			if (skb->len > mss_now) {
				if (tcp_fragment(sk, skb, mss_now))
					break;
			}

			TCP_SKB_CB(skb)->when = tcp_time_stamp;
			if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
				break;
			/* Advance the send_head.  This one is sent out. */
			update_send_head(sk, tp, skb);
			tcp_minshall_update(tp, mss_now, skb);
			sent_pkts = 1;
		}

		if (sent_pkts) {
			tcp_cwnd_validate(sk, tp);
			return 0;
		}

		return !tp->packets_out && tp->send_head;
	}
	return 0;
}
/* This function returns the amount that we can raise the
 * usable window based on the following constraints
 *
 * 1. The window can never be shrunk once it is offered (RFC 793)
 * 2. We limit memory per socket
 *
 * RFC 1122:
 * "the suggested [SWS] avoidance algorithm for the receiver is to keep
 *  RECV.NEXT + RCV.WIN fixed until:
 *  RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
 *
 * i.e. don't raise the right edge of the window until you can raise
 * it at least MSS bytes.
 *
 * Unfortunately, the recommended algorithm breaks header prediction,
 * since header prediction assumes th->window stays fixed.
 *
 * Strictly speaking, keeping th->window fixed violates the receiver
 * side SWS prevention criteria. The problem is that under this rule
 * a stream of single byte packets will cause the right side of the
 * window to always advance by a single byte.
 *
 * Of course, if the sender implements sender side SWS prevention
 * then this will not be a problem.
 *
 * BSD seems to make the following compromise:
 *
 *	If the free space is less than the 1/4 of the maximum
 *	space available and the free space is less than 1/2 mss,
 *	then set the window to 0.
 *	Otherwise, just prevent the window from shrinking
 *	and from being larger than the largest representable value.
 *
 * This prevents incremental opening of the window in the regime
 * where TCP is limited by the speed of the reader side taking
 * data out of the TCP receive queue. It does nothing about
 * those cases where the window is constrained on the sender side
 * because the pipeline is full.
 *
 * BSD also seems to "accidentally" limit itself to windows that are a
 * multiple of MSS, at least until the free space gets quite small.
 * This would appear to be a side effect of the mbuf implementation.
 * Combining these two algorithms results in the observed behavior
 * of having a fixed window size at almost all times.
 *
 * Below we obtain similar behavior by forcing the offered window to
 * a multiple of the mss when it is feasible to do so.
 *
 * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
 * Regular options like TIMESTAMP are taken into account.
 */
u32 __tcp_select_window(struct sock *sk)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	/* MSS for the peer's data.  Previous versions used mss_clamp
	 * here.  I don't know if the value based on our guesses
	 * of peer's MSS is better for the performance.  It's more correct
	 * but may be worse for the performance because of rcv_mss
	 * fluctuations.  --SAW  1998/11/1
	 */
	unsigned int mss = tp->ack.rcv_mss;
	int free_space;
	u32 window;

	/* Sometimes free_space can be < 0. */
	free_space = tcp_space(sk);
	if (tp->window_clamp < mss)
		mss = tp->window_clamp;

	if (free_space < (int)min(tp->window_clamp, tcp_full_space(sk)) / 2) {
		tp->ack.quick = 0;

		if (tcp_memory_pressure)
			tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4*tp->advmss);

		if (free_space < ((int) (mss/2)))
			return 0;
	}

	if (free_space > tp->rcv_ssthresh)
		free_space = tp->rcv_ssthresh;

	/* Get the largest window that is a nice multiple of mss.
	 * Window clamp already applied above.
	 * If our current window offering is within 1 mss of the
	 * free space we just keep it. This prevents the divide
	 * and multiply from happening most of the time.
	 * We also don't do any window rounding when the free space
	 * is too small.
	 */
	window = tp->rcv_wnd;
	if ((((int) window) <= (free_space - ((int) mss))) ||
	    (((int) window) > free_space))
		window = (((unsigned int) free_space)/mss)*mss;

	return window;
}
/* Attempt to collapse two adjacent SKB's during retransmission. */
static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	struct sk_buff *next_skb = skb->next;

	/* The first test we must make is that neither of these two
	 * SKB's are still referenced by someone else.
	 */
	if (!skb_cloned(skb) && !skb_cloned(next_skb)) {
		int skb_size = skb->len, next_skb_size = next_skb->len;
		u16 flags = TCP_SKB_CB(skb)->flags;

		/* Punt if the first SKB has URG set. */
		if (flags & TCPCB_FLAG_URG)
			return;

		/* Also punt if next skb has been SACK'd. */
		if (TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
			return;

		/* Next skb is out of window. */
		if (after(TCP_SKB_CB(next_skb)->end_seq, tp->snd_una+tp->snd_wnd))
			return;

		/* Punt if not enough space exists in the first SKB for
		 * the data in the second, or the total combined payload
		 * would exceed the MSS.
		 */
		if ((next_skb_size > skb_tailroom(skb)) ||
		    ((skb_size + next_skb_size) > mss_now))
			return;

		/* Ok.  We will be able to collapse the packet. */
		__skb_unlink(next_skb, next_skb->list);

		if (skb->len % 4) {
			/* Must copy and rechecksum all data. */
			memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
			skb->csum = csum_partial(skb->data, skb->len, 0);
		} else {
			/* Optimize, actually we could also combine next_skb->csum
			 * to skb->csum using a single add w/carry operation too.
			 */
			skb->csum = csum_partial_copy_nocheck(next_skb->data,
							      skb_put(skb, next_skb_size),
							      next_skb_size, skb->csum);
		}

		/* Update sequence range on original skb. */
		TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;

		/* Merge over control information. */
		flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
		if (flags & TCPCB_FLAG_URG) {
			u16 urgptr = TCP_SKB_CB(next_skb)->urg_ptr;
			TCP_SKB_CB(skb)->urg_ptr = urgptr + skb_size;
		}
		TCP_SKB_CB(skb)->flags = flags;

		/* All done, get rid of second SKB and account for it so
		 * packet counting does not break.
		 */
		TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked&TCPCB_EVER_RETRANS;
		if (TCP_SKB_CB(next_skb)->sacked&TCPCB_SACKED_RETRANS)
			tp->retrans_out--;
		if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST) {
			tp->lost_out--;
			tp->left_out--;
		}
		if (!tp->sack_ok && tp->sacked_out) {
			/* Reno case is special. Sigh... */
			tp->sacked_out--;
			tp->left_out--;
		}

		tcp_free_skb(sk, next_skb);
		tp->packets_out--;
	}
}
/* Do a simple retransmit without using the backoff mechanisms in
 * tcp_timer. This is used for path mtu discovery.
 * The socket is already locked here.
 */
void tcp_simple_retransmit(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff *skb;
	unsigned int mss = tcp_current_mss(sk);
	int lost = 0;

	for_retrans_queue(skb, sk, tp) {
		if (skb->len > mss &&
		    !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
			if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
				TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
				tp->retrans_out--;
			}
			if (!(TCP_SKB_CB(skb)->sacked&TCPCB_LOST)) {
				TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
				tp->lost_out++;
				lost = 1;
			}
		}
	}

	if (!lost)
		return;

	tp->left_out = tp->sacked_out + tp->lost_out;

	/* Don't muck with the congestion window here.
	 * Reason is that we do not increase amount of _data_
	 * in network, but units changed and effective
	 * cwnd/ssthresh really reduced now.
	 */
	if (tp->ca_state != TCP_CA_Loss) {
		tp->high_seq = tp->snd_nxt;
		tp->snd_ssthresh = tcp_current_ssthresh(tp);
		tp->prior_ssthresh = 0;
		tp->ca_state = TCP_CA_Loss;
	}
	tcp_xmit_retransmit_queue(sk);
}
/* This retransmits one SKB.  Policy decisions and retransmit queue
 * state updates are done by the caller.  Returns non-zero if an
 * error occurred which prevented the send.
 */
int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	unsigned int cur_mss = tcp_current_mss(sk);
	int err;

	/* Do not send more than we queued. 1/4 is reserved for possible
	 * copying overhead: fragmentation, tunneling, mangling etc.
	 */
	if (atomic_read(&sk->wmem_alloc) > min(sk->wmem_queued+(sk->wmem_queued>>2),sk->sndbuf))
		return -EAGAIN;

	if (skb->len > cur_mss) {
		if (tcp_fragment(sk, skb, cur_mss))
			return -ENOMEM; /* We'll try again later. */

		/* New SKB created, account for it. */
		tp->packets_out++;
	}

	/* Collapse two adjacent packets if worthwhile and we can. */
	if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
	    (skb->len < (cur_mss >> 1)) &&
	    (skb->next != tp->send_head) &&
	    (skb->next != (struct sk_buff *)&sk->write_queue) &&
	    (sysctl_tcp_retrans_collapse != 0))
		tcp_retrans_try_collapse(sk, skb, cur_mss);

	if (tp->af_specific->rebuild_header(sk))
		return -EHOSTUNREACH; /* Routing failure or similar. */

	/* Some Solaris stacks overoptimize and ignore the FIN on a
	 * retransmit when old data is attached.  So strip it off
	 * since it is cheap to do so and saves bytes on the network.
	 */
	if (skb->len > 0 &&
	    (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
	    tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
		TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
		skb_trim(skb, 0);
		skb->csum = 0;
	}

	/* Make a copy, if the first transmission SKB clone we made
	 * is still in somebody's hands, else make a clone.
	 */
	TCP_SKB_CB(skb)->when = tcp_time_stamp;

	err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
				    skb_copy(skb, GFP_ATOMIC):
				    skb_clone(skb, GFP_ATOMIC)));

	if (err == 0) {
		/* Update global TCP statistics. */
		TCP_INC_STATS(TcpRetransSegs);

#if FASTRETRANS_DEBUG > 0
		if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
			if (net_ratelimit())
				printk(KERN_DEBUG "retrans_out leaked.\n");
		}
#endif
		TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
		tp->retrans_out++;

		/* Save stamp of the first retransmit. */
		if (!tp->retrans_stamp)
			tp->retrans_stamp = TCP_SKB_CB(skb)->when;

		/* snd_nxt is stored to detect loss of retransmitted segment,
		 * see tcp_input.c tcp_sacktag_write_queue().
		 */
		TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
	}
	return err;
}
/* This gets called after a retransmit timeout, and the initially
 * retransmitted data is acknowledged.  It tries to continue
 * resending the rest of the retransmit queue, until either
 * we've sent it all or the congestion window limit is reached.
 * If doing SACK, the first ACK which comes back for a timeout
 * based retransmit packet might feed us FACK information again.
 * If so, we use it to avoid unnecessary retransmissions.
 */
void tcp_xmit_retransmit_queue(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff *skb;
	int packet_cnt = tp->lost_out;

	/* First pass: retransmit lost packets. */
	if (packet_cnt) {
		for_retrans_queue(skb, sk, tp) {
			__u8 sacked = TCP_SKB_CB(skb)->sacked;

			if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
				return;

			if (sacked&TCPCB_LOST) {
				if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
					if (tcp_retransmit_skb(sk, skb))
						return;
					if (tp->ca_state != TCP_CA_Loss)
						NET_INC_STATS_BH(TCPFastRetrans);
					else
						NET_INC_STATS_BH(TCPSlowStartRetrans);

					if (skb == skb_peek(&sk->write_queue))
						tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
				}

				if (--packet_cnt <= 0)
					break;
			}
		}
	}

	/* OK, demanded retransmission is finished. */

	/* Forward retransmissions are possible only during Recovery. */
	if (tp->ca_state != TCP_CA_Recovery)
		return;

	/* No forward retransmissions in Reno are possible. */
	if (!tp->sack_ok)
		return;

	/* Yeah, we have to make difficult choice between forward transmission
	 * and retransmission... Both ways have their merits...
	 *
	 * For now we do not retransmit anything, while we have some new
	 * segments to send.
	 */

	if (tcp_may_send_now(sk, tp))
		return;

	packet_cnt = 0;

	for_retrans_queue(skb, sk, tp) {
		if (++packet_cnt > tp->fackets_out)
			break;

		if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
			break;

		if (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS)
			continue;

		/* Ok, retransmit it. */
		if (tcp_retransmit_skb(sk, skb))
			break;

		if (skb == skb_peek(&sk->write_queue))
			tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);

		NET_INC_STATS_BH(TCPForwardRetrans);
	}
}
/* Send a fin.  The caller locks the socket for us.  This cannot be
 * allowed to fail queueing a FIN frame under any circumstances.
 */
void tcp_send_fin(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff *skb = skb_peek_tail(&sk->write_queue);
	unsigned int mss_now;

	/* Optimization, tack on the FIN if we have a queue of
	 * unsent frames.  But be careful about outgoing SACKS
	 * and IP options.
	 */
	mss_now = tcp_current_mss(sk);

	/* Please, find seven differences from 2.3.33 and look
	 * what I broke here. 8) --ANK
	 */

	if (tp->send_head != NULL) {
		/* tcp_write_xmit() takes care of the rest. */
		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
		TCP_SKB_CB(skb)->end_seq++;
		tp->write_seq++;

		/* Special case to avoid Nagle bogosity.  If this
		 * segment is the last segment, and it was queued
		 * due to Nagle/SWS-avoidance, send it out now.
		 */
		if (tp->send_head == skb &&
		    !after(tp->write_seq, tp->snd_una + tp->snd_wnd)) {
			TCP_SKB_CB(skb)->when = tcp_time_stamp;
			if (!tcp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL)))
				update_send_head(sk, tp, skb);
			else
				tcp_check_probe_timer(sk, tp);
		}
	} else {
		/* Socket is locked, keep trying until memory is available. */
		for (;;) {
			skb = alloc_skb(MAX_TCP_HEADER + 15, GFP_KERNEL);
			if (skb)
				break;
			current->policy |= SCHED_YIELD;
			schedule();
		}

		/* Reserve space for headers and prepare control bits. */
		skb_reserve(skb, MAX_TCP_HEADER);
		skb->csum = 0;
		TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
		TCP_SKB_CB(skb)->sacked = 0;
		TCP_SKB_CB(skb)->urg_ptr = 0;

		/* FIN eats a sequence byte, write_seq advanced by tcp_send_skb(). */
		TCP_SKB_CB(skb)->seq = tp->write_seq;
		TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
		tcp_send_skb(sk, skb, 0, mss_now);
	}
	__tcp_push_pending_frames(sk, tp, mss_now);
}
/* We get here when a process closes a file descriptor (either due to
 * an explicit close() or as a byproduct of exit()'ing) and there
 * was unread data in the receive queue.  This behavior is recommended
 * by draft-ietf-tcpimpl-prob-03.txt section 3.10.  -DaveM
 */
void tcp_send_active_reset(struct sock *sk, int priority)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff *skb;

	/* NOTE: No TCP options attached and we never retransmit this. */
	skb = alloc_skb(MAX_TCP_HEADER + 15, priority);
	if (!skb) {
		NET_INC_STATS(TCPAbortFailed);
		return;
	}

	/* Reserve space for headers and prepare control bits. */
	skb_reserve(skb, MAX_TCP_HEADER);
	skb->csum = 0;
	TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
	TCP_SKB_CB(skb)->sacked = 0;
	TCP_SKB_CB(skb)->urg_ptr = 0;

	/* Send it off. */
	TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp);
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	if (tcp_transmit_skb(sk, skb))
		NET_INC_STATS(TCPAbortFailed);
}
/* WARNING: This routine must only be called when we have already sent
 * a SYN packet that crossed the incoming SYN that caused this routine
 * to get called. If this assumption fails then the initial rcv_wnd
 * and rcv_wscale values will not be correct.
 */
int tcp_send_synack(struct sock *sk)
{
	struct sk_buff *skb;

	skb = skb_peek(&sk->write_queue);
	if (skb == NULL || !(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_SYN)) {
		printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
		return -EFAULT;
	}
	if (!(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_ACK)) {
		if (skb_cloned(skb)) {
			struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
			if (nskb == NULL)
				return -ENOMEM;
			__skb_unlink(skb, &sk->write_queue);
			__skb_queue_head(&sk->write_queue, nskb);
			tcp_free_skb(sk, skb);
			tcp_charge_skb(sk, nskb);
			skb = nskb;
		}

		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ACK;
		TCP_ECN_send_synack(&sk->tp_pinfo.af_tcp, skb);
	}
	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
}
/*
 * Prepare a SYN-ACK.
 */
struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
				 struct open_request *req)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct tcphdr *th;
	int tcp_header_size;
	struct sk_buff *skb;

	skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
	if (skb == NULL)
		return NULL;

	/* Reserve space for headers. */
	skb_reserve(skb, MAX_TCP_HEADER);

	skb->dst = dst_clone(dst);

	tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
			   (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
			   (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
			   /* SACK_PERM is in the place of NOP NOP of TS */
			   ((req->sack_ok && !req->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
	skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size);

	memset(th, 0, sizeof(struct tcphdr));
	th->syn = 1;
	th->ack = 1;
	TCP_ECN_make_synack(req, th);
	th->source = sk->sport;
	th->dest = req->rmt_port;
	TCP_SKB_CB(skb)->seq = req->snt_isn;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
	th->seq = htonl(TCP_SKB_CB(skb)->seq);
	th->ack_seq = htonl(req->rcv_isn + 1);
	if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
		__u8 rcv_wscale;
		/* Set this up on the first call only */
		req->window_clamp = tp->window_clamp ? : dst->window;
		/* tcp_full_space because it is guaranteed to be the first packet */
		tcp_select_initial_window(tcp_full_space(sk),
					  dst->advmss - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
					  &req->rcv_wnd,
					  &req->window_clamp,
					  req->wscale_ok,
					  &rcv_wscale);
		req->rcv_wscale = rcv_wscale;
	}

	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
	th->window = htons(req->rcv_wnd);

	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	tcp_syn_build_options((__u32 *)(th + 1), dst->advmss, req->tstamp_ok,
			      req->sack_ok, req->wscale_ok, req->rcv_wscale,
			      TCP_SKB_CB(skb)->when,
			      req->ts_recent);

	skb->csum = 0;
	th->doff = (tcp_header_size >> 2);
	TCP_INC_STATS(TcpOutSegs);
	return skb;
}
int tcp_connect(struct sock *sk, struct sk_buff *buff)
{
	struct dst_entry *dst = __sk_dst_get(sk);
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	__u8 rcv_wscale;

	/* Reserve space for headers. */
	skb_reserve(buff, MAX_TCP_HEADER);

	/* We'll fix this up when we get a response from the other end.
	 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
	 */
	tp->tcp_header_len = sizeof(struct tcphdr) +
		(sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);

	/* If user gave his TCP_MAXSEG, record it to clamp */
	if (tp->user_mss)
		tp->mss_clamp = tp->user_mss;
	tcp_sync_mss(sk, dst->pmtu);

	if (!tp->window_clamp)
		tp->window_clamp = dst->window;
	tp->advmss = dst->advmss;
	tcp_initialize_rcv_mss(sk);

	tcp_select_initial_window(tcp_full_space(sk),
				  tp->advmss - (tp->ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
				  &tp->rcv_wnd,
				  &tp->window_clamp,
				  sysctl_tcp_window_scaling,
				  &rcv_wscale);
	tp->rcv_wscale = rcv_wscale;

	tp->rcv_ssthresh = tp->rcv_wnd;

	/* Socket identity change complete, no longer
	 * in TCP_CLOSE, so enter ourselves into the
	 * hash tables.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	if (tp->af_specific->hash_connecting(sk))
		goto err_out;

	tcp_init_wl(tp, tp->write_seq, 0);
	tp->snd_una = tp->write_seq;
	tp->snd_sml = tp->write_seq;

	tp->rto = TCP_TIMEOUT_INIT;
	tcp_init_xmit_timers(sk);
	tp->retransmits = 0;
	tcp_clear_retrans(tp);

	TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
	TCP_ECN_send_syn(tp, buff);
	TCP_SKB_CB(buff)->sacked = 0;
	TCP_SKB_CB(buff)->urg_ptr = 0;
	buff->csum = 0;
	TCP_SKB_CB(buff)->seq = tp->write_seq++;
	TCP_SKB_CB(buff)->end_seq = tp->write_seq;
	tp->snd_nxt = tp->write_seq;
	tp->pushed_seq = tp->write_seq;

	/* Send it off. */
	TCP_SKB_CB(buff)->when = tcp_time_stamp;
	tp->retrans_stamp = TCP_SKB_CB(buff)->when;
	__skb_queue_tail(&sk->write_queue, buff);
	tcp_charge_skb(sk, buff);
	tp->packets_out++;
	tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
	TCP_INC_STATS(TcpActiveOpens);

	/* Timer for repeating the SYN until an answer. */
	tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
	return 0;

err_out:
	tcp_set_state(sk, TCP_CLOSE);
	kfree_skb(buff);
	return -EADDRNOTAVAIL;
}
/* Send out a delayed ack, the caller does the policy checking
 * to see if we should even be here.  See tcp_input.c:tcp_ack_snd_check()
 * for details.
 */
void tcp_send_delayed_ack(struct sock *sk)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	int ato = tp->ack.ato;
	unsigned long timeout;

	if (ato > TCP_DELACK_MIN) {
		int max_ato = (tp->ack.pingpong || tp->ack.rcv_small) ?
			      TCP_DELACK_MAX : (HZ/2);

		/* Slow path, intersegment interval is "high". */

		/* If some rtt estimate is known, use it to bound delayed ack.
		 * Do not use tp->rto here, use results of rtt measurements
		 * directly.
		 */
		if (tp->srtt) {
			int rtt = max(tp->srtt>>3, TCP_DELACK_MIN);

			if (rtt < max_ato)
				max_ato = rtt;
		}

		ato = min(ato, max_ato);
	}

	/* Stay within the limit we were given */
	timeout = jiffies + ato;

	/* Use new timeout only if there wasn't an older one earlier. */
	if (tp->ack.pending&2) {
		/* If delack timer was blocked or is about to expire,
		 * send ACK now.
		 */
		if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies+(ato>>2))) {
			tcp_send_ack(sk);
			return;
		}

		if (!time_before(timeout, tp->ack.timeout))
			timeout = tp->ack.timeout;
	}
	tp->ack.pending = 3;
	tp->ack.timeout = timeout;
	if (!mod_timer(&tp->delack_timer, timeout))
		sock_hold(sk);

#ifdef TCP_FORMAL_WINDOW
	/* Explanation. Header prediction path does not handle
	 * case of zero window. If we send ACK immediately, pred_flags
	 * are reset when sending ACK. If rcv_nxt is advanced and
	 * ack is not sent, then delayed ack is scheduled.
	 * Hence, it is the best place to check for zero window.
	 */
	if (tp->pred_flags) {
		if (tcp_receive_window(tp) == 0)
			tp->pred_flags = 0;
	} else {
		if (skb_queue_len(&tp->out_of_order_queue) == 0 &&
		    !tp->urg_data)
			tcp_fast_path_on(tp);
	}
#endif
}
/* This routine sends an ack and also updates the window. */
void tcp_send_ack(struct sock *sk)
{
	/* If we have been reset, we may not send again. */
	if (sk->state != TCP_CLOSE) {
		struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
		struct sk_buff *buff;

		/* We are not putting this on the write queue, so
		 * tcp_transmit_skb() will set the ownership to this
		 * sock.
		 */
		buff = alloc_skb(MAX_TCP_HEADER + 15, GFP_ATOMIC);
		if (buff == NULL) {
			tcp_schedule_ack(tp);
			tp->ack.ato = TCP_ATO_MIN;
			tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
			return;
		}

		/* Reserve space for headers and prepare control bits. */
		skb_reserve(buff, MAX_TCP_HEADER);
		buff->csum = 0;
		TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
		TCP_SKB_CB(buff)->sacked = 0;
		TCP_SKB_CB(buff)->urg_ptr = 0;

		/* Send it off, this clears delayed acks for us. */
		TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp);
		TCP_SKB_CB(buff)->when = tcp_time_stamp;
		tcp_transmit_skb(sk, buff);
	}
}
/* This routine sends a packet with an out of date sequence
 * number. It assumes the other end will try to ack it.
 */
static int tcp_xmit_probe_skb(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff *skb;

	/* We don't queue it, tcp_transmit_skb() sets ownership. */
	skb = alloc_skb(MAX_TCP_HEADER + 15, GFP_ATOMIC);
	if (skb == NULL)
		return -1;

	/* Reserve space for headers and set control bits. */
	skb_reserve(skb, MAX_TCP_HEADER);
	skb->csum = 0;
	TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
	TCP_SKB_CB(skb)->sacked = 0;
	TCP_SKB_CB(skb)->urg_ptr = 0;

	/* Use a previous sequence.  This should cause the other
	 * end to send an ack.  Don't queue or clone SKB, just
	 * send it.
	 */
	TCP_SKB_CB(skb)->seq = tp->snd_una - 1;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	return tcp_transmit_skb(sk, skb);
}
int tcp_write_wakeup(struct sock *sk)
{
	if (sk->state != TCP_CLOSE) {
		struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
		struct sk_buff *skb;

		if ((skb = tp->send_head) != NULL &&
		    before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) {
			int err;
			int mss = tcp_current_mss(sk);
			int seg_size = tp->snd_una+tp->snd_wnd-TCP_SKB_CB(skb)->seq;

			if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
				tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;

			/* We are probing the opening of a window
			 * but the window size is != 0
			 * must have been a result of SWS avoidance (sender)
			 */
			if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
			    skb->len > mss) {
				seg_size = min(seg_size, mss);
				TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
				if (tcp_fragment(sk, skb, seg_size))
					return -1;
			}
			TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
			TCP_SKB_CB(skb)->when = tcp_time_stamp;
			err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
			if (!err)
				update_send_head(sk, tp, skb);
			return err;
		} else {
			return tcp_xmit_probe_skb(sk);
		}
	}
	return -1;
}
/* A window probe timeout has occurred.  If window is not closed send
 * a partial packet else a zero probe.
 */
void tcp_send_probe0(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	int err;

	err = tcp_write_wakeup(sk);

	if (tp->packets_out || !tp->send_head) {
		/* Cancel probe timer, if it is not required. */
		tp->probes_out = 0;
		tp->backoff = 0;
		return;
	}

	if (err <= 0) {
		tp->backoff++;
		tp->probes_out++;
		tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0,
				     min(tp->rto << tp->backoff, TCP_RTO_MAX));
	} else {
		/* If packet was not sent due to local congestion,
		 * do not backoff and do not remember probes_out.
		 * Let local senders fight for local resources.
		 *
		 * Still use the accumulated backoff, though.
		 */
		if (!tp->probes_out)
			tp->probes_out = 1;
		tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0,
				     min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL));
	}
}