Linux 2.2.0
[davej-history.git] / net / ipv4 / tcp_output.c
blob 3e99d80dbf33cf5f4bda1cdcd65b16dcda9d43b7
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_output.c,v 1.101 1999/01/20 07:20:14 davem Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *
 * Changes:	Pedro Roque	:	Retransmit queue handled by TCP.
 *				:	Fragmentation on mtu decrease
 *				:	Segment collapse on retransmit
 *				:	AF independence
 *
 *		Linus Torvalds	:	send_delayed_ack
 *		David S. Miller	:	Charge memory using the right skb
 *					during syn/ack processing.
 *		David S. Miller	:	Output engine completely rewritten.
 */

#include <net/tcp.h>

extern int sysctl_tcp_timestamps;
extern int sysctl_tcp_window_scaling;
extern int sysctl_tcp_sack;

/* People can turn this off for buggy TCP's found in printers etc. */
int sysctl_tcp_retrans_collapse = 1;

/* Get rid of any delayed acks, we sent one already.. */
static __inline__ void clear_delayed_acks(struct sock * sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

        tp->delayed_acks = 0;
        if(tcp_in_quickack_mode(tp))
                tcp_exit_quickack_mode(tp);
        tcp_clear_xmit_timer(sk, TIME_DACK);
}

static __inline__ void update_send_head(struct sock *sk)
{
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

        tp->send_head = tp->send_head->next;
        if (tp->send_head == (struct sk_buff *) &sk->write_queue)
                tp->send_head = NULL;
}

/* This routine actually transmits TCP packets queued in by
 * tcp_do_sendmsg().  This is used by both the initial
 * transmission and possible later retransmissions.
 * All SKB's seen here are completely headerless.  It is our
 * job to build the TCP header, and pass the packet down to
 * IP so it can do the same plus pass the packet off to the
 * device.
 *
 * We are working here with either a clone of the original
 * SKB, or a fresh unique copy made by the retransmit engine.
 */
void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
{
        if(skb != NULL) {
                struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
                struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
                int tcp_header_size = tp->tcp_header_len;
                struct tcphdr *th;
                int sysctl_flags;

#define SYSCTL_FLAG_TSTAMPS     0x1
#define SYSCTL_FLAG_WSCALE      0x2
#define SYSCTL_FLAG_SACK        0x4

                sysctl_flags = 0;
                if(tcb->flags & TCPCB_FLAG_SYN) {
                        tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
                        if(sysctl_tcp_timestamps) {
                                tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
                                sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
                        }
                        if(sysctl_tcp_window_scaling) {
                                tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
                                sysctl_flags |= SYSCTL_FLAG_WSCALE;
                        }
                        if(sysctl_tcp_sack) {
                                sysctl_flags |= SYSCTL_FLAG_SACK;
                                if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
                                        tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
                        }
                } else if(tp->sack_ok && tp->num_sacks) {
                        /* A SACK is 2 pad bytes, a 2 byte header, plus
                         * 2 32-bit sequence numbers for each SACK block.
                         */
                        tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
                                            (tp->num_sacks * TCPOLEN_SACK_PERBLOCK));
                }
                th = (struct tcphdr *) skb_push(skb, tcp_header_size);
                skb->h.th = th;
                skb_set_owner_w(skb, sk);

                /* Build TCP header and checksum it. */
                th->source              = sk->sport;
                th->dest                = sk->dport;
                th->seq                 = htonl(TCP_SKB_CB(skb)->seq);
                th->ack_seq             = htonl(tp->rcv_nxt);
                th->doff                = (tcp_header_size >> 2);
                th->res1                = 0;
                *(((__u8 *)th) + 13)    = tcb->flags;
                if(!(tcb->flags & TCPCB_FLAG_SYN))
                        th->window      = htons(tcp_select_window(sk));
                th->check               = 0;
                th->urg_ptr             = ntohs(tcb->urg_ptr);
                if(tcb->flags & TCPCB_FLAG_SYN) {
                        /* RFC1323: The window in SYN & SYN/ACK segments
                         * is never scaled.
                         */
                        th->window      = htons(tp->rcv_wnd);
                        tcp_syn_build_options((__u32 *)(th + 1), tp->mss_clamp,
                                              (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
                                              (sysctl_flags & SYSCTL_FLAG_SACK),
                                              (sysctl_flags & SYSCTL_FLAG_WSCALE),
                                              tp->rcv_wscale,
                                              TCP_SKB_CB(skb)->when);
                } else {
                        tcp_build_and_update_options((__u32 *)(th + 1),
                                                     tp, TCP_SKB_CB(skb)->when);
                }
                tp->af_specific->send_check(sk, th, skb->len, skb);

                clear_delayed_acks(sk);
                tp->last_ack_sent = tp->rcv_nxt;
                tcp_statistics.TcpOutSegs++;
                tp->af_specific->queue_xmit(skb);
        }
#undef SYSCTL_FLAG_TSTAMPS
#undef SYSCTL_FLAG_WSCALE
#undef SYSCTL_FLAG_SACK
}

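/* For a rough sense of the header sizing above (illustrative arithmetic,
 * assuming the usual option lengths: TCPOLEN_MSS 4, TCPOLEN_TSTAMP_ALIGNED 12,
 * TCPOLEN_WSCALE_ALIGNED 4, TCPOLEN_SACKPERM_ALIGNED 4): a SYN offering
 * timestamps, window scaling and SACK costs 20 + 4 + 12 + 4 = 40 bytes of
 * TCP header.  The SACK-permitted option rides in the timestamp option's
 * padding, which is why nothing extra is added when timestamps are on.
 */
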
/* This is the main buffer sending routine. We queue the buffer
 * and decide whether to queue or transmit now.
 */
void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

        /* Advance write_seq and place onto the write_queue. */
        tp->write_seq += (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq);
        __skb_queue_tail(&sk->write_queue, skb);

        if (!force_queue && tp->send_head == NULL && tcp_snd_test(sk, skb)) {
                /* Send it out now. */
                TCP_SKB_CB(skb)->when = jiffies;
                tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
                tp->packets_out++;
                tcp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL));
                if(!tcp_timer_is_set(sk, TIME_RETRANS))
                        tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
        } else {
                /* Queue it, remembering where we must start sending. */
                if (tp->send_head == NULL)
                        tp->send_head = skb;
                if (!force_queue && tp->packets_out == 0 && !tp->pending) {
                        tp->pending = TIME_PROBE0;
                        tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto);
                }
        }
}

/* Function to create two new TCP segments.  Shrinks the given segment
 * to the specified size and appends a new segment with the rest of the
 * packet to the list.  This won't be called frequently, I hope.
 * Remember, these are still headerless SKBs at this point.
 */
static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
{
        struct sk_buff *buff;
        int nsize = skb->len - len;
        u16 flags;

        /* Get a new skb... force flag on. */
        buff = sock_wmalloc(sk,
                            (nsize + MAX_HEADER + sk->prot->max_header),
                            1, GFP_ATOMIC);
        if (buff == NULL)
                return -1; /* We'll just try again later. */

        /* Reserve space for headers. */
        skb_reserve(buff, MAX_HEADER + sk->prot->max_header);

        /* Correct the sequence numbers. */
        TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
        TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;

        /* PSH and FIN should only be set in the second packet. */
        flags = TCP_SKB_CB(skb)->flags;
        TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH);
        if(flags & TCPCB_FLAG_URG) {
                u16 old_urg_ptr = TCP_SKB_CB(skb)->urg_ptr;

                /* Urgent data is always a pain in the ass. */
                if(old_urg_ptr > len) {
                        TCP_SKB_CB(skb)->flags &= ~(TCPCB_FLAG_URG);
                        TCP_SKB_CB(skb)->urg_ptr = 0;
                        TCP_SKB_CB(buff)->urg_ptr = old_urg_ptr - len;
                } else {
                        flags &= ~(TCPCB_FLAG_URG);
                }
        }
        if(!(flags & TCPCB_FLAG_URG))
                TCP_SKB_CB(buff)->urg_ptr = 0;
        TCP_SKB_CB(buff)->flags = flags;
        TCP_SKB_CB(buff)->sacked = 0;

        /* Copy and checksum data tail into the new buffer. */
        buff->csum = csum_partial_copy(skb->data + len, skb_put(buff, nsize),
                                       nsize, 0);

        TCP_SKB_CB(skb)->end_seq -= nsize;
        skb_trim(skb, skb->len - nsize);

        /* Rechecksum original buffer. */
        skb->csum = csum_partial(skb->data, skb->len, 0);

        /* Link BUFF into the send queue. */
        __skb_append(skb, buff);

        return 0;
}

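/* Illustrative example of the split above (hypothetical numbers): if the
 * original skb covers sequence range [1000,3000) and len is 1448, the
 * trimmed skb keeps [1000,2448) and the new buff carries [2448,3000),
 * with PSH/FIN (and any urgent pointer beyond the cut) moved to buff.
 */
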
/* This function synchronizes snd mss to current pmtu/exthdr set.

   tp->user_mss is mss set by user by TCP_MAXSEG. It does NOT count
   for TCP options, but includes only bare TCP header.

   tp->mss_clamp is mss negotiated at connection setup.
   It is minimum of user_mss and mss received with SYN.
   It also does not include TCP options.

   tp->pmtu_cookie is last pmtu, seen by this function.

   tp->mss_cache is current effective sending mss, including
   all tcp options except for SACKs. It is evaluated,
   taking into account current pmtu, but never exceeds
   tp->mss_clamp.

   NOTE1. rfc1122 clearly states that advertised MSS
   DOES NOT include either tcp or ip options.

   NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
   this function.			--ANK (980731)
 */
int tcp_sync_mss(struct sock *sk, u32 pmtu)
{
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
        int mss_now;

        /* Calculate base mss without TCP options:
           It is MMS_S - sizeof(tcphdr) of rfc1122
         */
        mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);

        /* Clamp it (mss_clamp does not include tcp options) */
        if (mss_now > tp->mss_clamp)
                mss_now = tp->mss_clamp;

        /* Now subtract TCP options size, not including SACKs */
        mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);

        /* Now subtract optional transport overhead */
        mss_now -= tp->ext_header_len;

        /* If we got too small (or even negative) a value,
           clamp it by 8 from below. Why 8 ?
           Well, it could be 1 with the same success,
           but if IP accepted a segment of length 1,
           it would love 8 even more 8)		--ANK (980731)
         */
        if (mss_now < 8)
                mss_now = 8;

        /* And store cached results */
        tp->pmtu_cookie = pmtu;
        tp->mss_cache = mss_now;
        return mss_now;
}

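/* Worked example (illustrative values only): for IPv4 over Ethernet a pmtu
 * of 1500 with a 20 byte network header and 20 byte TCP header gives a base
 * mss of 1460; with timestamps negotiated, tcp_header_len is 20 + 12, so 12
 * bytes come off again; with no extension headers the cached sending mss
 * ends up at 1448.
 */
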
/* This routine writes packets to the network.  It advances the
 * send_head.  This happens as incoming acks open up the remote
 * window for us.
 */
void tcp_write_xmit(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        unsigned int mss_now;

        /* Account for SACKS, we may need to fragment due to this.
         * It is just like the real MSS changing on us midstream.
         * We also handle things correctly when the user adds some
         * IP options mid-stream.  Silly to do, but cover it.
         */
        mss_now = tcp_current_mss(sk);

        /* If we are zapped, the bytes will have to remain here.
         * In time closedown will empty the write queue and all
         * will be happy.
         */
        if(!sk->zapped) {
                struct sk_buff *skb;
                int sent_pkts = 0;

                /* Anything on the transmit queue that fits the window can
                 * be added providing we are:
                 *
                 * a) following SWS avoidance [and Nagle algorithm]
                 * b) not exceeding our congestion window.
                 * c) not retransmitting [Nagle]
                 */
                while((skb = tp->send_head) && tcp_snd_test(sk, skb)) {
                        if (skb->len > mss_now) {
                                if (tcp_fragment(sk, skb, mss_now))
                                        break;
                        }

                        /* Advance the send_head.  This one is going out. */
                        update_send_head(sk);
                        TCP_SKB_CB(skb)->when = jiffies;
                        tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
                        tp->packets_out++;
                        tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
                        sent_pkts = 1;
                }

                /* If we sent anything, make sure the retransmit
                 * timer is active.
                 */
                if (sent_pkts && !tcp_timer_is_set(sk, TIME_RETRANS))
                        tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
        }
}

/* This function returns the amount that we can raise the
 * usable window based on the following constraints
 *
 * 1. The window can never be shrunk once it is offered (RFC 793)
 * 2. We limit memory per socket
 *
 * RFC 1122:
 * "the suggested [SWS] avoidance algorithm for the receiver is to keep
 *  RECV.NEXT + RCV.WIN fixed until:
 *  RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
 *
 * i.e. don't raise the right edge of the window until you can raise
 * it at least MSS bytes.
 *
 * Unfortunately, the recommended algorithm breaks header prediction,
 * since header prediction assumes th->window stays fixed.
 *
 * Strictly speaking, keeping th->window fixed violates the receiver
 * side SWS prevention criteria. The problem is that under this rule
 * a stream of single byte packets will cause the right side of the
 * window to always advance by a single byte.
 *
 * Of course, if the sender implements sender side SWS prevention
 * then this will not be a problem.
 *
 * BSD seems to make the following compromise:
 *
 *	If the free space is less than the 1/4 of the maximum
 *	space available and the free space is less than 1/2 mss,
 *	then set the window to 0.
 *	Otherwise, just prevent the window from shrinking
 *	and from being larger than the largest representable value.
 *
 * This prevents incremental opening of the window in the regime
 * where TCP is limited by the speed of the reader side taking
 * data out of the TCP receive queue. It does nothing about
 * those cases where the window is constrained on the sender side
 * because the pipeline is full.
 *
 * BSD also seems to "accidentally" limit itself to windows that are a
 * multiple of MSS, at least until the free space gets quite small.
 * This would appear to be a side effect of the mbuf implementation.
 * Combining these two algorithms results in the observed behavior
 * of having a fixed window size at almost all times.
 *
 * Below we obtain similar behavior by forcing the offered window to
 * a multiple of the mss when it is feasible to do so.
 *
 * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
 */
u32 __tcp_select_window(struct sock *sk)
{
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
        unsigned int mss = tp->mss_cache;
        int free_space;
        u32 window;

        /* Sometimes free_space can be < 0. */
        free_space = (sk->rcvbuf - atomic_read(&sk->rmem_alloc)) / 2;
        if (tp->window_clamp) {
                if (free_space > ((int) tp->window_clamp))
                        free_space = tp->window_clamp;
                mss = min(tp->window_clamp, mss);
        } else {
                printk("tcp_select_window: tp->window_clamp == 0.\n");
        }

        if (mss < 1) {
                mss = 1;
                printk("tcp_select_window: sk->mss fell to 0.\n");
        }

        if ((free_space < (sk->rcvbuf/4)) && (free_space < ((int) (mss/2)))) {
                window = 0;
                tp->pred_flags = 0;
        } else {
                /* Get the largest window that is a nice multiple of mss.
                 * Window clamp already applied above.
                 * If our current window offering is within 1 mss of the
                 * free space we just keep it. This prevents the divide
                 * and multiply from happening most of the time.
                 * We also don't do any window rounding when the free space
                 * is too small.
                 */
                window = tp->rcv_wnd;
                if ((((int) window) <= (free_space - ((int) mss))) ||
                    (((int) window) > free_space))
                        window = (((unsigned int) free_space)/mss)*mss;
        }
        return window;
}

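/* Example of the rounding above (hypothetical numbers): with an mss of 1448,
 * free_space of 20000 and a current rcv_wnd of 16000, the offer is recomputed
 * as (20000/1448)*1448 = 18824; had rcv_wnd already been within one mss of
 * free_space (say 19000), it would simply have been kept as is.
 */
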
/* Attempt to collapse two adjacent SKB's during retransmission. */
static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now)
{
        struct sk_buff *next_skb = skb->next;

        /* The first test we must make is that neither of these two
         * SKB's are still referenced by someone else.
         */
        if(!skb_cloned(skb) && !skb_cloned(next_skb)) {
                int skb_size = skb->len, next_skb_size = next_skb->len;
                u16 flags = TCP_SKB_CB(skb)->flags;

                /* Punt if the first SKB has URG set. */
                if(flags & TCPCB_FLAG_URG)
                        return;

                /* Also punt if next skb has been SACK'd. */
                if(TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
                        return;

                /* Punt if not enough space exists in the first SKB for
                 * the data in the second, or the total combined payload
                 * would exceed the MSS.
                 */
                if ((next_skb_size > skb_tailroom(skb)) ||
                    ((skb_size + next_skb_size) > mss_now))
                        return;

                /* Ok.  We will be able to collapse the packet. */
                __skb_unlink(next_skb, next_skb->list);

                if(skb->len % 4) {
                        /* Must copy and rechecksum all data. */
                        memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
                        skb->csum = csum_partial(skb->data, skb->len, 0);
                } else {
                        /* Optimize, actually we could also combine next_skb->csum
                         * to skb->csum using a single add w/carry operation too.
                         */
                        skb->csum = csum_partial_copy(next_skb->data,
                                                      skb_put(skb, next_skb_size),
                                                      next_skb_size, skb->csum);
                }

                /* Update sequence range on original skb. */
                TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;

                /* Merge over control information. */
                flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
                if(flags & TCPCB_FLAG_URG) {
                        u16 urgptr = TCP_SKB_CB(next_skb)->urg_ptr;
                        TCP_SKB_CB(skb)->urg_ptr = urgptr + skb_size;
                }
                TCP_SKB_CB(skb)->flags = flags;

                /* All done, get rid of second SKB and account for it so
                 * packet counting does not break.
                 */
                kfree_skb(next_skb);
                sk->tp_pinfo.af_tcp.packets_out--;
        }
}

/* Do a simple retransmit without using the backoff mechanisms in
 * tcp_timer. This is used for path mtu discovery.
 * The socket is already locked here.
 */
void tcp_simple_retransmit(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        struct sk_buff *skb;
        unsigned int mss = tcp_current_mss(sk);

        /* Don't muck with the congestion window here. */
        tp->dup_acks = 0;
        tp->high_seq = tp->snd_nxt;
        tp->retrans_head = NULL;

        /* Input control flow will see that this was retransmitted
         * and not use it for RTT calculation in the absence of
         * the timestamp option.
         */
        for (skb = skb_peek(&sk->write_queue);
             ((skb != tp->send_head) &&
              (skb != (struct sk_buff *)&sk->write_queue));
             skb = skb->next)
                if (skb->len > mss)
                        tcp_retransmit_skb(sk, skb);
}

static __inline__ void update_retrans_head(struct sock *sk)
{
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

        tp->retrans_head = tp->retrans_head->next;
        if((tp->retrans_head == tp->send_head) ||
           (tp->retrans_head == (struct sk_buff *) &sk->write_queue)) {
                tp->retrans_head = NULL;
                tp->rexmt_done = 1;
        }
}

/* This retransmits one SKB.  Policy decisions and retransmit queue
 * state updates are done by the caller.  Returns non-zero if an
 * error occurred which prevented the send.
 */
int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        unsigned int cur_mss = tcp_current_mss(sk);

        if(skb->len > cur_mss) {
                if(tcp_fragment(sk, skb, cur_mss))
                        return 1; /* We'll try again later. */

                /* New SKB created, account for it. */
                tp->packets_out++;
        }

        /* Collapse two adjacent packets if worthwhile and we can. */
        if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
           (skb->len < (cur_mss >> 1)) &&
           (skb->next != tp->send_head) &&
           (skb->next != (struct sk_buff *)&sk->write_queue) &&
           (sysctl_tcp_retrans_collapse != 0))
                tcp_retrans_try_collapse(sk, skb, cur_mss);

        if(tp->af_specific->rebuild_header(sk))
                return 1; /* Routing failure or similar. */

        /* Ok, we're gonna send it out, update state. */
        TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_RETRANS;
        tp->retrans_out++;

        /* Make a copy, if the first transmission SKB clone we made
         * is still in somebody's hands, else make a clone.
         */
        TCP_SKB_CB(skb)->when = jiffies;
        if(skb_cloned(skb))
                skb = skb_copy(skb, GFP_ATOMIC);
        else
                skb = skb_clone(skb, GFP_ATOMIC);

        tcp_transmit_skb(sk, skb);

        /* Update global TCP statistics and return success. */
        sk->prot->retransmits++;
        tcp_statistics.TcpRetransSegs++;

        return 0;
}

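/* Note on the copy-vs-clone choice above: the clone handed to
 * tcp_transmit_skb() at first transmission may still be held by the device
 * layer, in which case skb_cloned() is true and a private skb_copy() is
 * needed before a fresh header is pushed; otherwise a cheap clone of the
 * queued skb suffices.
 */
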
/* This gets called after a retransmit timeout, and the initially
 * retransmitted data is acknowledged.  It tries to continue
 * resending the rest of the retransmit queue, until either
 * we've sent it all or the congestion window limit is reached.
 * If doing SACK, the first ACK which comes back for a timeout
 * based retransmit packet might feed us FACK information again.
 * If so, we use it to avoid unnecessary retransmissions.
 */
void tcp_xmit_retransmit_queue(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        struct sk_buff *skb;

        if (tp->retrans_head == NULL &&
            tp->rexmt_done == 0)
                tp->retrans_head = skb_peek(&sk->write_queue);
        if (tp->retrans_head == tp->send_head)
                tp->retrans_head = NULL;

        /* Each time, advance the retrans_head if we got
         * a packet out or we skipped one because it was
         * SACK'd.  -DaveM
         */
        while ((skb = tp->retrans_head) != NULL) {
                /* If it has been ack'd by a SACK block, we don't
                 * retransmit it.
                 */
                if(!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
                        /* Send it out, punt if error occurred. */
                        if(tcp_retransmit_skb(sk, skb))
                                break;

                        update_retrans_head(sk);

                        /* Stop retransmitting if we've hit the congestion
                         * window limit.
                         */
                        if (tp->retrans_out >= tp->snd_cwnd)
                                break;
                } else {
                        update_retrans_head(sk);
                }
        }
}

/* Using FACK information, retransmit all missing frames at the receiver
 * up to the forward most SACK'd packet (tp->fackets_out) if the packet
 * has not been retransmitted already.
 */
void tcp_fack_retransmit(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        struct sk_buff *skb = skb_peek(&sk->write_queue);
        int packet_cnt = 0;

        while((skb != NULL) &&
              (skb != tp->send_head) &&
              (skb != (struct sk_buff *)&sk->write_queue)) {
                __u8 sacked = TCP_SKB_CB(skb)->sacked;

                if(sacked & (TCPCB_SACKED_ACKED | TCPCB_SACKED_RETRANS))
                        goto next_packet;

                /* Ok, retransmit it. */
                if(tcp_retransmit_skb(sk, skb))
                        break;

                if(tcp_packets_in_flight(tp) >= tp->snd_cwnd)
                        break;
        next_packet:
                packet_cnt++;
                if(packet_cnt >= tp->fackets_out)
                        break;
                skb = skb->next;
        }
}

/* Send a fin.  The caller locks the socket for us.  This cannot be
 * allowed to fail queueing a FIN frame under any circumstances.
 */
void tcp_send_fin(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        struct sk_buff *skb = skb_peek_tail(&sk->write_queue);
        unsigned int mss_now;

        /* Optimization, tack on the FIN if we have a queue of
         * unsent frames.  But be careful about outgoing SACKS
         * and IP options.
         */
        mss_now = tcp_current_mss(sk);

        if((tp->send_head != NULL) && (skb->len < mss_now)) {
                /* tcp_write_xmit() takes care of the rest. */
                TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
                TCP_SKB_CB(skb)->end_seq++;
                tp->write_seq++;

                /* Special case to avoid Nagle bogosity.  If this
                 * segment is the last segment, and it was queued
                 * due to Nagle/SWS-avoidance, send it out now.
                 */
                if(tp->send_head == skb &&
                   !sk->nonagle &&
                   skb->len < (tp->mss_cache >> 1) &&
                   tp->packets_out &&
                   !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_URG)) {
                        update_send_head(sk);
                        TCP_SKB_CB(skb)->when = jiffies;
                        tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
                        tp->packets_out++;
                        tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
                        if(!tcp_timer_is_set(sk, TIME_RETRANS))
                                tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
                }
        } else {
                /* Socket is locked, keep trying until memory is available. */
                do {
                        skb = sock_wmalloc(sk,
                                           (MAX_HEADER +
                                            sk->prot->max_header),
                                           1, GFP_KERNEL);
                } while (skb == NULL);

                /* Reserve space for headers and prepare control bits. */
                skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
                skb->csum = 0;
                TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
                TCP_SKB_CB(skb)->sacked = 0;
                TCP_SKB_CB(skb)->urg_ptr = 0;

                /* FIN eats a sequence byte, write_seq advanced by tcp_send_skb(). */
                TCP_SKB_CB(skb)->seq = tp->write_seq;
                TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
                tcp_send_skb(sk, skb, 0);
        }
}

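/* Sequence accounting sketch (hypothetical numbers): if the tail skb covers
 * [5000,5200) and the FIN is tacked onto it, it becomes [5000,5201) and
 * tp->write_seq advances to 5201, since the FIN consumes one sequence
 * number just like a byte of data.
 */
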
/* We get here when a process closes a file descriptor (either due to
 * an explicit close() or as a byproduct of exit()'ing) and there
 * was unread data in the receive queue.  This behavior is recommended
 * by draft-ietf-tcpimpl-prob-03.txt section 3.10.  -DaveM
 */
void tcp_send_active_reset(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        struct sk_buff *skb;

        /* NOTE: No TCP options attached and we never retransmit this. */
        skb = alloc_skb(MAX_HEADER + sk->prot->max_header, GFP_KERNEL);
        if (!skb)
                return;

        /* Reserve space for headers and prepare control bits. */
        skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
        skb->csum = 0;
        TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
        TCP_SKB_CB(skb)->sacked = 0;
        TCP_SKB_CB(skb)->urg_ptr = 0;

        /* Send it off. */
        TCP_SKB_CB(skb)->seq = tp->write_seq;
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
        TCP_SKB_CB(skb)->when = jiffies;
        tcp_transmit_skb(sk, skb);
}

/* WARNING: This routine must only be called when we have already sent
 * a SYN packet that crossed the incoming SYN that caused this routine
 * to get called.  If this assumption fails then the initial rcv_wnd
 * and rcv_wscale values will not be correct.
 */
int tcp_send_synack(struct sock *sk)
{
        struct tcp_opt* tp = &(sk->tp_pinfo.af_tcp);
        struct sk_buff* skb;

        skb = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header),
                           1, GFP_ATOMIC);
        if (skb == NULL)
                return -ENOMEM;

        /* Reserve space for headers and prepare control bits. */
        skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
        skb->csum = 0;
        TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_SYN);
        TCP_SKB_CB(skb)->sacked = 0;
        TCP_SKB_CB(skb)->urg_ptr = 0;

        /* SYN eats a sequence byte. */
        TCP_SKB_CB(skb)->seq = tp->snd_una;
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
        __skb_queue_tail(&sk->write_queue, skb);
        TCP_SKB_CB(skb)->when = jiffies;
        tp->packets_out++;
        tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
        return 0;
}

/*
 * Prepare a SYN-ACK.
 */
struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
                                 struct open_request *req, int mss)
{
        struct tcphdr *th;
        int tcp_header_size;
        struct sk_buff *skb;

        skb = sock_wmalloc(sk, MAX_HEADER + sk->prot->max_header, 1, GFP_ATOMIC);
        if (skb == NULL)
                return NULL;

        /* Reserve space for headers. */
        skb_reserve(skb, MAX_HEADER + sk->prot->max_header);

        skb->dst = dst_clone(dst);

        /* Don't offer more than they did.
         * This way we don't have to memorize who said what.
         * FIXME: maybe this should be changed for better performance
         * with syncookies.
         */
        req->mss = min(mss, req->mss);
        if (req->mss < 8) {
                printk(KERN_DEBUG "initial req->mss below 8\n");
                req->mss = 8;
        }

        tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
                           (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
                           (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
                           /* SACK_PERM is in the place of NOP NOP of TS */
                           ((req->sack_ok && !req->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
        skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size);

        memset(th, 0, sizeof(struct tcphdr));
        th->syn = 1;
        th->ack = 1;
        th->source = sk->sport;
        th->dest = req->rmt_port;
        TCP_SKB_CB(skb)->seq = req->snt_isn;
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
        th->seq = htonl(TCP_SKB_CB(skb)->seq);
        th->ack_seq = htonl(req->rcv_isn + 1);
        if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
                __u8 rcv_wscale;
                /* Set this up on the first call only */
                req->window_clamp = skb->dst->window;
                tcp_select_initial_window(sock_rspace(sk)/2,req->mss,
                                          &req->rcv_wnd,
                                          &req->window_clamp,
                                          req->wscale_ok,
                                          &rcv_wscale);
                req->rcv_wscale = rcv_wscale;
        }

        /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
        th->window = htons(req->rcv_wnd);

        TCP_SKB_CB(skb)->when = jiffies;
        tcp_syn_build_options((__u32 *)(th + 1), req->mss, req->tstamp_ok,
                              req->sack_ok, req->wscale_ok, req->rcv_wscale,
                              TCP_SKB_CB(skb)->when);

        skb->csum = 0;
        th->doff = (tcp_header_size >> 2);
        tcp_statistics.TcpOutSegs++;
        return skb;
}

void tcp_connect(struct sock *sk, struct sk_buff *buff, int mtu)
{
        struct dst_entry *dst = sk->dst_cache;
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

        /* Reserve space for headers. */
        skb_reserve(buff, MAX_HEADER + sk->prot->max_header);

        tp->snd_wnd = 0;
        tp->snd_wl1 = 0;
        tp->snd_wl2 = tp->write_seq;
        tp->snd_una = tp->write_seq;
        tp->rcv_nxt = 0;

        sk->err = 0;

        /* We'll fix this up when we get a response from the other end.
         * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
         */
        tp->tcp_header_len = sizeof(struct tcphdr) +
                (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);

        /* If user gave his TCP_MAXSEG, record it to clamp */
        if (tp->user_mss)
                tp->mss_clamp = tp->user_mss;
        tcp_sync_mss(sk, mtu);

        /* Now the unpleasant part: if the initial pmtu is too low,
           set a lower clamp. I am not sure that it is good.
           To be more exact, I do not think that clamping at a value
           which is apparently transient and may improve in the future
           is a good idea.
           It would be better to wait until the peer returns its MSS
           (probably 65535 too) and then advertise something like 65535
           or at least the first hop device mtu. Is it clear what I mean?
           We should tell the peer what maximal mss we expect to RECEIVE,
           it has nothing to do with pmtu.
           I am afraid someone will be confused by such a huge value.
           					--ANK (980731)
         */
        if (tp->mss_cache + tp->tcp_header_len - sizeof(struct tcphdr) < tp->mss_clamp )
                tp->mss_clamp = tp->mss_cache + tp->tcp_header_len - sizeof(struct tcphdr);

        TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
        TCP_SKB_CB(buff)->sacked = 0;
        TCP_SKB_CB(buff)->urg_ptr = 0;
        buff->csum = 0;
        TCP_SKB_CB(buff)->seq = tp->write_seq++;
        TCP_SKB_CB(buff)->end_seq = tp->write_seq;
        tp->snd_nxt = TCP_SKB_CB(buff)->end_seq;

        tp->window_clamp = dst->window;
        tcp_select_initial_window(sock_rspace(sk)/2,tp->mss_clamp,
                                  &tp->rcv_wnd,
                                  &tp->window_clamp,
                                  sysctl_tcp_window_scaling,
                                  &tp->rcv_wscale);

        /* Ok, now lock the socket before we make it visible to
         * the incoming packet engine.
         */
        lock_sock(sk);

        /* Socket identity change complete, no longer
         * in TCP_CLOSE, so enter ourselves into the
         * hash tables.
         */
        tcp_set_state(sk,TCP_SYN_SENT);
        sk->prot->hash(sk);

        tp->rto = dst->rtt;
        tcp_init_xmit_timers(sk);
        tp->retransmits = 0;
        tp->fackets_out = 0;
        tp->retrans_out = 0;

        /* Send it off. */
        __skb_queue_tail(&sk->write_queue, buff);
        TCP_SKB_CB(buff)->when = jiffies;
        tp->packets_out++;
        tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
        tcp_statistics.TcpActiveOpens++;

        /* Timer for repeating the SYN until an answer. */
        tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);

        /* Now, it is safe to release the socket. */
        release_sock(sk);
}

/* Send out a delayed ack, the caller does the policy checking
 * to see if we should even be here.  See tcp_input.c:tcp_ack_snd_check()
 * for details.
 */
void tcp_send_delayed_ack(struct tcp_opt *tp, int max_timeout)
{
        unsigned long timeout;

        /* Stay within the limit we were given */
        timeout = tp->ato;
        if (timeout > max_timeout)
                timeout = max_timeout;
        timeout += jiffies;

        /* Use new timeout only if there wasn't an older one earlier. */
        if (!tp->delack_timer.prev) {
                tp->delack_timer.expires = timeout;
                add_timer(&tp->delack_timer);
        } else {
                if (timeout < tp->delack_timer.expires)
                        mod_timer(&tp->delack_timer, timeout);
        }
}

/* This routine sends an ack and also updates the window. */
void tcp_send_ack(struct sock *sk)
{
        /* If we have been reset, we may not send again. */
        if(!sk->zapped) {
                struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
                struct sk_buff *buff;

                /* We are not putting this on the write queue, so
                 * tcp_transmit_skb() will set the ownership to this
                 * sock.
                 */
                buff = alloc_skb(MAX_HEADER + sk->prot->max_header, GFP_ATOMIC);
                if (buff == NULL) {
                        /* Force it to send an ack.  We don't have to do this
                         * (ACK is unreliable) but it's much better use of
                         * bandwidth on slow links to send a spare ack than
                         * resend packets.
                         *
                         * This is the one possible way that we can delay an
                         * ACK and have tp->ato indicate that we are in
                         * quick ack mode, so clear it.
                         */
                        if(tcp_in_quickack_mode(tp))
                                tcp_exit_quickack_mode(tp);
                        tcp_send_delayed_ack(tp, HZ/2);
                        return;
                }

                /* Reserve space for headers and prepare control bits. */
                skb_reserve(buff, MAX_HEADER + sk->prot->max_header);
                buff->csum = 0;
                TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
                TCP_SKB_CB(buff)->sacked = 0;
                TCP_SKB_CB(buff)->urg_ptr = 0;

                /* Send it off, this clears delayed acks for us. */
                TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tp->snd_nxt;
                TCP_SKB_CB(buff)->when = jiffies;
                tcp_transmit_skb(sk, buff);
        }
}

/* This routine sends a packet with an out of date sequence
 * number.  It assumes the other end will try to ack it.
 */
void tcp_write_wakeup(struct sock *sk)
{
        /* After a valid reset we can send no more. */
        if (!sk->zapped) {
                struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
                struct sk_buff *skb;

                /* Write data can still be transmitted/retransmitted in the
                 * following states.  If any other state is encountered, return.
                 * [listen/close will never occur here anyway]
                 */
                if ((1 << sk->state) &
                    ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1|
                      TCPF_LAST_ACK|TCPF_CLOSING))
                        return;

                if (before(tp->snd_nxt, tp->snd_una + tp->snd_wnd) &&
                    ((skb = tp->send_head) != NULL)) {
                        unsigned long win_size;

                        /* We are probing the opening of a window
                         * but the window size is != 0;
                         * this must have been a result of SWS avoidance (sender side).
                         */
                        win_size = tp->snd_wnd - (tp->snd_nxt - tp->snd_una);
                        if (win_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq) {
                                if (tcp_fragment(sk, skb, win_size))
                                        return; /* Let a retransmit get it. */
                        }
                        update_send_head(sk);
                        TCP_SKB_CB(skb)->when = jiffies;
                        tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
                        tp->packets_out++;
                        tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
                        if (!tcp_timer_is_set(sk, TIME_RETRANS))
                                tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
                } else {
                        /* We don't queue it, tcp_transmit_skb() sets ownership. */
                        skb = alloc_skb(MAX_HEADER + sk->prot->max_header,
                                        GFP_ATOMIC);
                        if (skb == NULL)
                                return;

                        /* Reserve space for headers and set control bits. */
                        skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
                        skb->csum = 0;
                        TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
                        TCP_SKB_CB(skb)->sacked = 0;
                        TCP_SKB_CB(skb)->urg_ptr = 0;

                        /* Use a previous sequence.  This should cause the other
                         * end to send an ack.  Don't queue or clone SKB, just
                         * send it.
                         */
                        TCP_SKB_CB(skb)->seq = tp->snd_nxt - 1;
                        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
                        TCP_SKB_CB(skb)->when = jiffies;
                        tcp_transmit_skb(sk, skb);
                }
        }
}

/* A window probe timeout has occurred.  If window is not closed send
 * a partial packet else a zero probe.
 */
void tcp_send_probe0(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

        tcp_write_wakeup(sk);
        tp->pending = TIME_PROBE0;
        tp->backoff++;
        tp->probes_out++;
        tcp_reset_xmit_timer (sk, TIME_PROBE0,
                              min(tp->rto << tp->backoff, 120*HZ));
}
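
/* The probe interval above backs off exponentially: for example, with an
 * rto of 3*HZ the successive timeouts are 6*HZ, 12*HZ, 24*HZ, 48*HZ, 96*HZ,
 * after which the shifted value exceeds the 120*HZ cap and every further
 * probe goes out 120 seconds apart.  (Illustrative numbers only.)
 */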