net/ipv4/tcp_output.c (Linux 2.3.18pre1, davej-history.git)
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:     $Id: tcp_output.c,v 1.113 1999/09/07 02:31:39 davem Exp $
 *
 * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Mark Evans, <evansmp@uhura.aston.ac.uk>
 *              Corey Minyard <wf-rch!minyard@relay.EU.net>
 *              Florian La Roche, <flla@stud.uni-sb.de>
 *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *              Linus Torvalds, <torvalds@cs.helsinki.fi>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Matthew Dillon, <dillon@apollo.west.oic.com>
 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *              Jorge Cwik, <jorge@laser.satlink.net>
 */

/*
 * Changes:     Pedro Roque     :       Retransmit queue handled by TCP.
 *                              :       Fragmentation on mtu decrease
 *                              :       Segment collapse on retransmit
 *                              :       AF independence
 *
 *              Linus Torvalds  :       send_delayed_ack
 *              David S. Miller :       Charge memory using the right skb
 *                                      during syn/ack processing.
 *              David S. Miller :       Output engine completely rewritten.
 *              Andrea Arcangeli:       SYNACK carry ts_recent in tsecr.
 */
#include <net/tcp.h>

#include <linux/smp_lock.h>

extern int sysctl_tcp_timestamps;
extern int sysctl_tcp_window_scaling;
extern int sysctl_tcp_sack;

/* People can turn this off for buggy TCP's found in printers etc. */
int sysctl_tcp_retrans_collapse = 1;
/* Get rid of any delayed acks, we sent one already.. */
static __inline__ void clear_delayed_acks(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

        tp->delayed_acks = 0;
        if (tcp_in_quickack_mode(tp))
                tcp_exit_quickack_mode(tp);
        tcp_clear_xmit_timer(sk, TIME_DACK);
}

static __inline__ void update_send_head(struct sock *sk)
{
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

        tp->send_head = tp->send_head->next;
        if (tp->send_head == (struct sk_buff *) &sk->write_queue)
                tp->send_head = NULL;
}
/* Calculate mss to advertise in SYN segment.
   RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:

   1. It is independent of path mtu.
   2. Ideally, it is maximal possible segment size i.e. 65535-40.
   3. For IPv4 it is reasonable to calculate it from maximal MTU of
      attached devices, because some buggy hosts are confused by
      large MSS.
   4. We do not do 3; we advertise an MSS calculated from the first
      hop device mtu, but allow it to be raised to ip_rt_min_advmss.
      This may be overridden via information stored in the routing table.
   5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
      probably even Jumbo".
 */
static __u16 tcp_advertise_mss(struct sock *sk)
{
        struct dst_entry *dst = __sk_dst_get(sk);
        int mss;

        if (dst) {
                mss = dst->advmss;
        } else {
                struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

                /* No dst. It is bad. Guess some reasonable value.
                 * Actually, this case should not be possible.
                 * SANITY.
                 */
                BUG_TRAP(dst != NULL);

                mss = tp->mss_cache;
                mss += (tp->tcp_header_len - sizeof(struct tcphdr)) +
                        tp->ext_header_len;

                /* Minimal MSS to include full set of TCP/IP options
                   plus 8 bytes of data. It corresponds to mtu 128.
                 */
                if (mss < 88)
                        mss = 88;
        }

        return (__u16)mss;
}
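
/* tcp_advertise_mss() example: over a first-hop Ethernet device (mtu 1500)
 * dst->advmss typically works out to 1500 - 40 = 1460.  The 88-byte floor
 * above corresponds to mtu 128: 128 - 20 (IP header) - 20 (TCP header) = 88.
 */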
/* This routine actually transmits TCP packets queued in by
 * tcp_do_sendmsg().  This is used by both the initial
 * transmission and possible later retransmissions.
 * All SKB's seen here are completely headerless.  It is our
 * job to build the TCP header, and pass the packet down to
 * IP so it can do the same plus pass the packet off to the
 * device.
 *
 * We are working here with either a clone of the original
 * SKB, or a fresh unique copy made by the retransmit engine.
 */
void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
{
        if (skb != NULL) {
                struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
                struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
                int tcp_header_size = tp->tcp_header_len;
                struct tcphdr *th;
                int sysctl_flags;

#define SYSCTL_FLAG_TSTAMPS     0x1
#define SYSCTL_FLAG_WSCALE      0x2
#define SYSCTL_FLAG_SACK        0x4

                sysctl_flags = 0;
                if (tcb->flags & TCPCB_FLAG_SYN) {
                        tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
                        if (sysctl_tcp_timestamps) {
                                tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
                                sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
                        }
                        if (sysctl_tcp_window_scaling) {
                                tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
                                sysctl_flags |= SYSCTL_FLAG_WSCALE;
                        }
                        if (sysctl_tcp_sack) {
                                sysctl_flags |= SYSCTL_FLAG_SACK;
                                if (!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
                                        tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
                        }
                } else if (tp->sack_ok && tp->num_sacks) {
                        /* A SACK is 2 pad bytes, a 2 byte header, plus
                         * 2 32-bit sequence numbers for each SACK block.
                         */
                        tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
                                            (tp->num_sacks * TCPOLEN_SACK_PERBLOCK));
                }
                th = (struct tcphdr *) skb_push(skb, tcp_header_size);
                skb->h.th = th;
                skb_set_owner_w(skb, sk);

                /* Build TCP header and checksum it. */
                th->source = sk->sport;
                th->dest = sk->dport;
                th->seq = htonl(TCP_SKB_CB(skb)->seq);
                th->ack_seq = htonl(tp->rcv_nxt);
                th->doff = (tcp_header_size >> 2);
                th->res1 = 0;
                *(((__u8 *)th) + 13) = tcb->flags;
                th->check = 0;
                th->urg_ptr = ntohs(tcb->urg_ptr);
                if (tcb->flags & TCPCB_FLAG_SYN) {
                        /* RFC1323: The window in SYN & SYN/ACK segments
                         * is never scaled.
                         */
                        th->window = htons(tp->rcv_wnd);
                        tcp_syn_build_options((__u32 *)(th + 1),
                                              tcp_advertise_mss(sk),
                                              (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
                                              (sysctl_flags & SYSCTL_FLAG_SACK),
                                              (sysctl_flags & SYSCTL_FLAG_WSCALE),
                                              tp->rcv_wscale,
                                              TCP_SKB_CB(skb)->when,
                                              tp->ts_recent);
                } else {
                        th->window = htons(tcp_select_window(sk));
                        tcp_build_and_update_options((__u32 *)(th + 1),
                                                     tp, TCP_SKB_CB(skb)->when);
                }
                tp->af_specific->send_check(sk, th, skb->len, skb);

                clear_delayed_acks(sk);
                tp->last_ack_sent = tp->rcv_nxt;
                tcp_statistics.TcpOutSegs++;
                tp->af_specific->queue_xmit(skb);
        }
#undef SYSCTL_FLAG_TSTAMPS
#undef SYSCTL_FLAG_WSCALE
#undef SYSCTL_FLAG_SACK
}
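
/* Header-size arithmetic in tcp_transmit_skb(), assuming the usual option
 * lengths from <net/tcp.h> (TCPOLEN_MSS = 4, TCPOLEN_TSTAMP_ALIGNED = 12,
 * TCPOLEN_WSCALE_ALIGNED = 4, TCPOLEN_SACKPERM_ALIGNED = 4,
 * TCPOLEN_SACK_BASE_ALIGNED = 4, TCPOLEN_SACK_PERBLOCK = 8): a SYN with
 * timestamps, window scaling and SACK enabled uses 20 + 4 + 12 + 4 = 40
 * header bytes; SACK-permitted costs nothing extra because it sits in place
 * of the timestamp option's two NOPs.  An established-state segment carrying
 * one SACK block adds 4 + 1*8 = 12 bytes on top of tp->tcp_header_len.
 */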
/* This is the main buffer sending routine.  We queue the buffer
 * and decide whether to queue or transmit now.
 */
void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

        /* Advance write_seq and place onto the write_queue. */
        tp->write_seq += (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq);
        __skb_queue_tail(&sk->write_queue, skb);

        if (!force_queue && tp->send_head == NULL && tcp_snd_test(sk, skb)) {
                /* Send it out now. */
                TCP_SKB_CB(skb)->when = tcp_time_stamp;
                tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
                tp->packets_out++;
                tcp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL));
                if (!tcp_timer_is_set(sk, TIME_RETRANS))
                        tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
        } else {
                /* Queue it, remembering where we must start sending. */
                if (tp->send_head == NULL)
                        tp->send_head = skb;
                if (!force_queue && tp->packets_out == 0 && !tp->pending) {
                        tp->pending = TIME_PROBE0;
                        tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto);
                }
        }
}
/* Function to create two new TCP segments.  Shrinks the given segment
 * to the specified size and appends a new segment with the rest of the
 * packet to the list.  This won't be called frequently, I hope.
 * Remember, these are still headerless SKBs at this point.
 */
static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
{
        struct sk_buff *buff;
        int nsize = skb->len - len;
        u16 flags;

        /* Get a new skb... force flag on. */
        buff = sock_wmalloc(sk,
                            (nsize + MAX_HEADER + sk->prot->max_header),
                            1, GFP_ATOMIC);
        if (buff == NULL)
                return -1; /* We'll just try again later. */

        /* Reserve space for headers. */
        skb_reserve(buff, MAX_HEADER + sk->prot->max_header);

        /* Correct the sequence numbers. */
        TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
        TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;

        /* PSH and FIN should only be set in the second packet. */
        flags = TCP_SKB_CB(skb)->flags;
        TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH);
        if (flags & TCPCB_FLAG_URG) {
                u16 old_urg_ptr = TCP_SKB_CB(skb)->urg_ptr;

                /* Urgent data is always a pain in the ass. */
                if (old_urg_ptr > len) {
                        TCP_SKB_CB(skb)->flags &= ~(TCPCB_FLAG_URG);
                        TCP_SKB_CB(skb)->urg_ptr = 0;
                        TCP_SKB_CB(buff)->urg_ptr = old_urg_ptr - len;
                } else {
                        flags &= ~(TCPCB_FLAG_URG);
                }
        }
        if (!(flags & TCPCB_FLAG_URG))
                TCP_SKB_CB(buff)->urg_ptr = 0;
        TCP_SKB_CB(buff)->flags = flags;
        TCP_SKB_CB(buff)->sacked = 0;

        /* Copy and checksum data tail into the new buffer. */
        buff->csum = csum_partial_copy(skb->data + len, skb_put(buff, nsize),
                                       nsize, 0);

        /* This takes care of the FIN sequence number too. */
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
        skb_trim(skb, len);

        /* Rechecksum original buffer. */
        skb->csum = csum_partial(skb->data, skb->len, 0);

        /* Looks stupid, but our code really uses the when field of
         * skbs which it has never sent before. --ANK
         */
        TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;

        /* Link BUFF into the send queue. */
        __skb_append(skb, buff);

        return 0;
}
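
/* tcp_fragment() example: splitting a 2048-byte skb at len = 1460 leaves the
 * original covering [seq, seq+1460) and appends a 588-byte buff covering
 * [seq+1460, end_seq); PSH/FIN (and URG, when the urgent pointer lies in the
 * tail) follow the second piece.
 */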
/* This function synchronizes snd mss to current pmtu/exthdr set.

   tp->user_mss is mss set by user by TCP_MAXSEG. It does NOT count
   for TCP options, but includes only bare TCP header.

   tp->mss_clamp is mss negotiated at connection setup.
   It is minimum of user_mss and mss received with SYN.
   It also does not include TCP options.

   tp->pmtu_cookie is last pmtu, seen by this function.

   tp->mss_cache is current effective sending mss, including
   all tcp options except for SACKs. It is evaluated,
   taking into account current pmtu, but never exceeds
   tp->mss_clamp.

   NOTE1. rfc1122 clearly states that advertised MSS
   DOES NOT include either tcp or ip options.

   NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
   this function.                       --ANK (980731)
 */
int tcp_sync_mss(struct sock *sk, u32 pmtu)
{
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
        int mss_now;

        /* Calculate base mss without TCP options:
           It is MMS_S - sizeof(tcphdr) of rfc1122
         */
        mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);

        /* Clamp it (mss_clamp does not include tcp options) */
        if (mss_now > tp->mss_clamp)
                mss_now = tp->mss_clamp;

        /* Now subtract TCP options size, not including SACKs */
        mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);

        /* Now subtract optional transport overhead */
        mss_now -= tp->ext_header_len;

        /* If we got too small (or even negative) value,
           clamp it by 8 from below. Why 8 ?
           Well, it could be 1 with the same success,
           but if IP accepted segment of length 1,
           it would love 8 even more 8)   --ANK (980731)
         */
        if (mss_now < 8)
                mss_now = 8;

        /* And store cached results */
        tp->pmtu_cookie = pmtu;
        tp->mss_cache = mss_now;

        return mss_now;
}
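
/* tcp_sync_mss() worked example for IPv4 with a 1500-byte path MTU and
 * timestamps enabled: 1500 - 20 (IP) - 20 (TCP) = 1460, clamped by
 * tp->mss_clamp, then minus the 12 bytes of aligned timestamp option already
 * counted in tcp_header_len, giving mss_cache = 1448 when there are no
 * extension headers.
 */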
/* This routine writes packets to the network.  It advances the
 * send_head.  This happens as incoming acks open up the remote
 * window for us.
 */
void tcp_write_xmit(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        unsigned int mss_now;

        /* Account for SACKS, we may need to fragment due to this.
         * It is just like the real MSS changing on us midstream.
         * We also handle things correctly when the user adds some
         * IP options mid-stream.  Silly to do, but cover it.
         */
        mss_now = tcp_current_mss(sk);

        /* If we are zapped, the bytes will have to remain here.
         * In time closedown will empty the write queue and all
         * will be happy.
         */
        if (!sk->zapped) {
                struct sk_buff *skb;
                int sent_pkts = 0;

                /* Anything on the transmit queue that fits the window can
                 * be added providing we are:
                 *
                 * a) following SWS avoidance [and Nagle algorithm]
                 * b) not exceeding our congestion window.
                 * c) not retransmitting [Nagle]
                 */
                while ((skb = tp->send_head) && tcp_snd_test(sk, skb)) {
                        if (skb->len > mss_now) {
                                if (tcp_fragment(sk, skb, mss_now))
                                        break;
                        }

                        /* Advance the send_head.  This one is going out. */
                        update_send_head(sk);
                        TCP_SKB_CB(skb)->when = tcp_time_stamp;
                        tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
                        tp->packets_out++;
                        tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
                        sent_pkts = 1;
                }

                /* If we sent anything, make sure the retransmit
                 * timer is active.
                 */
                if (sent_pkts && !tcp_timer_is_set(sk, TIME_RETRANS))
                        tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
        }
}
/* This function returns the amount that we can raise the
 * usable window based on the following constraints
 *
 * 1. The window can never be shrunk once it is offered (RFC 793)
 * 2. We limit memory per socket
 *
 * RFC 1122:
 * "the suggested [SWS] avoidance algorithm for the receiver is to keep
 *  RECV.NEXT + RCV.WIN fixed until:
 *  RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
 *
 * i.e. don't raise the right edge of the window until you can raise
 * it at least MSS bytes.
 *
 * Unfortunately, the recommended algorithm breaks header prediction,
 * since header prediction assumes th->window stays fixed.
 *
 * Strictly speaking, keeping th->window fixed violates the receiver
 * side SWS prevention criteria.  The problem is that under this rule
 * a stream of single byte packets will cause the right side of the
 * window to always advance by a single byte.
 *
 * Of course, if the sender implements sender side SWS prevention
 * then this will not be a problem.
 *
 * BSD seems to make the following compromise:
 *
 *      If the free space is less than the 1/4 of the maximum
 *      space available and the free space is less than 1/2 mss,
 *      then set the window to 0.
 *      Otherwise, just prevent the window from shrinking
 *      and from being larger than the largest representable value.
 *
 * This prevents incremental opening of the window in the regime
 * where TCP is limited by the speed of the reader side taking
 * data out of the TCP receive queue.  It does nothing about
 * those cases where the window is constrained on the sender side
 * because the pipeline is full.
 *
 * BSD also seems to "accidentally" limit itself to windows that are a
 * multiple of MSS, at least until the free space gets quite small.
 * This would appear to be a side effect of the mbuf implementation.
 * Combining these two algorithms results in the observed behavior
 * of having a fixed window size at almost all times.
 *
 * Below we obtain similar behavior by forcing the offered window to
 * a multiple of the mss when it is feasible to do so.
 *
 * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
 * Regular options like TIMESTAMP are taken into account.
 */
u32 __tcp_select_window(struct sock *sk)
{
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
        /* MSS for the peer's data.  Previous versions used mss_clamp
         * here.  I don't know if the value based on our guesses
         * of peer's MSS is better for the performance.  It's more correct
         * but may be worse for the performance because of rcv_mss
         * fluctuations.  --SAW  1998/11/1
         */
        unsigned int mss = tp->rcv_mss;
        int free_space;
        u32 window;

        /* Sometimes free_space can be < 0. */
        free_space = tcp_space(sk);
        if (free_space > ((int) tp->window_clamp))
                free_space = tp->window_clamp;
        if (tp->window_clamp < mss)
                mss = tp->window_clamp;

        if ((free_space < (tcp_full_space(sk) / 2)) &&
            (free_space < ((int) (mss/2)))) {
                window = 0;
                tp->pred_flags = 0;
        } else {
                /* Get the largest window that is a nice multiple of mss.
                 * Window clamp already applied above.
                 * If our current window offering is within 1 mss of the
                 * free space we just keep it.  This prevents the divide
                 * and multiply from happening most of the time.
                 * We also don't do any window rounding when the free space
                 * is too small.
                 */
                window = tp->rcv_wnd;
                if ((((int) window) <= (free_space - ((int) mss))) ||
                    (((int) window) > free_space))
                        window = (((unsigned int) free_space)/mss)*mss;
        }
        return window;
}
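
/* __tcp_select_window() rounding example, assuming rcv_mss = 1460 and
 * free_space = 16384: a current offer of rcv_wnd = 17520 exceeds free_space,
 * so it is replaced by (16384/1460)*1460 = 16060; an offer already within
 * one mss below free_space is left alone, avoiding the divide and multiply.
 */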
/* Attempt to collapse two adjacent SKB's during retransmission. */
static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now)
{
        struct sk_buff *next_skb = skb->next;

        /* The first test we must make is that neither of these two
         * SKB's are still referenced by someone else.
         */
        if (!skb_cloned(skb) && !skb_cloned(next_skb)) {
                int skb_size = skb->len, next_skb_size = next_skb->len;
                u16 flags = TCP_SKB_CB(skb)->flags;

                /* Punt if the first SKB has URG set. */
                if (flags & TCPCB_FLAG_URG)
                        return;

                /* Also punt if next skb has been SACK'd. */
                if (TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
                        return;

                /* Punt if not enough space exists in the first SKB for
                 * the data in the second, or the total combined payload
                 * would exceed the MSS.
                 */
                if ((next_skb_size > skb_tailroom(skb)) ||
                    ((skb_size + next_skb_size) > mss_now))
                        return;

                /* Ok.  We will be able to collapse the packet. */
                __skb_unlink(next_skb, next_skb->list);

                if (skb->len % 4) {
                        /* Must copy and rechecksum all data. */
                        memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
                        skb->csum = csum_partial(skb->data, skb->len, 0);
                } else {
                        /* Optimize, actually we could also combine next_skb->csum
                         * to skb->csum using a single add w/carry operation too.
                         */
                        skb->csum = csum_partial_copy(next_skb->data,
                                                      skb_put(skb, next_skb_size),
                                                      next_skb_size, skb->csum);
                }

                /* Update sequence range on original skb. */
                TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;

                /* Merge over control information. */
                flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
                if (flags & TCPCB_FLAG_URG) {
                        u16 urgptr = TCP_SKB_CB(next_skb)->urg_ptr;
                        TCP_SKB_CB(skb)->urg_ptr = urgptr + skb_size;
                }
                TCP_SKB_CB(skb)->flags = flags;

                /* All done, get rid of second SKB and account for it so
                 * packet counting does not break.
                 */
                kfree_skb(next_skb);
                sk->tp_pinfo.af_tcp.packets_out--;
        }
}
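
/* Illustration of the collapse above: two queued 512-byte segments can be
 * merged into a single 1024-byte segment when mss_now >= 1024, the first skb
 * has at least 512 bytes of tailroom, neither skb is cloned, the first
 * carries no URG and the second has not been SACK'd; the merged skb then
 * spans both sequence ranges and inherits PSH/FIN from the second.
 */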
/* Do a simple retransmit without using the backoff mechanisms in
 * tcp_timer.  This is used for path mtu discovery.
 * The socket is already locked here.
 */
void tcp_simple_retransmit(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        struct sk_buff *skb, *old_next_skb;
        unsigned int mss = tcp_current_mss(sk);

        /* Don't muck with the congestion window here. */
        tp->dup_acks = 0;
        tp->high_seq = tp->snd_nxt;
        tp->retrans_head = NULL;

        /* Input control flow will see that this was retransmitted
         * and not use it for RTT calculation in the absence of
         * the timestamp option.
         */
        for (old_next_skb = skb = skb_peek(&sk->write_queue);
             ((skb != tp->send_head) &&
              (skb != (struct sk_buff *)&sk->write_queue));
             skb = skb->next) {
                int resend_skb = 0;

                /* Our goal is to push out the packets which we
                 * sent already, but are being chopped up now to
                 * account for the PMTU information we have.
                 *
                 * As we resend the queue, packets are fragmented
                 * into two pieces, and when we try to send the
                 * second piece it may be collapsed together with
                 * a subsequent packet, and so on.  -DaveM
                 */
                if (old_next_skb != skb || skb->len > mss)
                        resend_skb = 1;
                old_next_skb = skb->next;
                if (resend_skb != 0)
                        tcp_retransmit_skb(sk, skb);
        }
}
static __inline__ void update_retrans_head(struct sock *sk)
{
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

        tp->retrans_head = tp->retrans_head->next;
        if ((tp->retrans_head == tp->send_head) ||
            (tp->retrans_head == (struct sk_buff *) &sk->write_queue)) {
                tp->retrans_head = NULL;
                tp->rexmt_done = 1;
        }
}
/* This retransmits one SKB.  Policy decisions and retransmit queue
 * state updates are done by the caller.  Returns non-zero if an
 * error occurred which prevented the send.
 */
int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        unsigned int cur_mss = tcp_current_mss(sk);

        if (skb->len > cur_mss) {
                if (tcp_fragment(sk, skb, cur_mss))
                        return 1; /* We'll try again later. */

                /* New SKB created, account for it. */
                tp->packets_out++;
        }

        /* Collapse two adjacent packets if worthwhile and we can. */
        if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
            (skb->len < (cur_mss >> 1)) &&
            (skb->next != tp->send_head) &&
            (skb->next != (struct sk_buff *)&sk->write_queue) &&
            (sysctl_tcp_retrans_collapse != 0))
                tcp_retrans_try_collapse(sk, skb, cur_mss);

        if (tp->af_specific->rebuild_header(sk))
                return 1; /* Routing failure or similar. */

        /* Some Solaris stacks overoptimize and ignore the FIN on a
         * retransmit when old data is attached.  So strip it off
         * since it is cheap to do so and saves bytes on the network.
         */
        if (skb->len > 0 &&
            (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
            tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
                TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
                skb_trim(skb, 0);
                skb->csum = 0;
        }

        /* Ok, we're gonna send it out, update state. */
        TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_RETRANS;
        tp->retrans_out++;

        /* Make a copy, if the first transmission SKB clone we made
         * is still in somebody's hands, else make a clone.
         */
        TCP_SKB_CB(skb)->when = tcp_time_stamp;
        if (skb_cloned(skb))
                skb = skb_copy(skb, GFP_ATOMIC);
        else
                skb = skb_clone(skb, GFP_ATOMIC);

        tcp_transmit_skb(sk, skb);

        /* Update global TCP statistics and return success. */
        sk->prot->retransmits++;
        tcp_statistics.TcpRetransSegs++;

        return 0;
}
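
/* Note on the FIN-stripping test in tcp_retransmit_skb(): end_seq counts the
 * FIN, so snd_una == end_seq - 1 means every data byte in the skb has
 * already been acknowledged and only the FIN remains outstanding; the skb is
 * therefore trimmed to a bare, zero-length FIN before being retransmitted.
 */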
/* This gets called after a retransmit timeout, and the initially
 * retransmitted data is acknowledged.  It tries to continue
 * resending the rest of the retransmit queue, until either
 * we've sent it all or the congestion window limit is reached.
 * If doing SACK, the first ACK which comes back for a timeout
 * based retransmit packet might feed us FACK information again.
 * If so, we use it to avoid unnecessary retransmissions.
 */
void tcp_xmit_retransmit_queue(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        struct sk_buff *skb;

        if (tp->retrans_head == NULL &&
            tp->rexmt_done == 0)
                tp->retrans_head = skb_peek(&sk->write_queue);
        if (tp->retrans_head == tp->send_head)
                tp->retrans_head = NULL;

        /* Each time, advance the retrans_head if we got
         * a packet out or we skipped one because it was
         * SACK'd.  -DaveM
         */
        while ((skb = tp->retrans_head) != NULL) {
                /* If it has been ack'd by a SACK block, we don't
                 * retransmit it.
                 */
                if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
                        /* Send it out, punt if error occurred. */
                        if (tcp_retransmit_skb(sk, skb))
                                break;

                        update_retrans_head(sk);

                        /* Stop retransmitting if we've hit the congestion
                         * window limit.
                         */
                        if (tp->retrans_out >= tp->snd_cwnd)
                                break;
                } else {
                        update_retrans_head(sk);
                }
        }
}
/* Using FACK information, retransmit all missing frames at the receiver
 * up to the forward most SACK'd packet (tp->fackets_out) if the packet
 * has not been retransmitted already.
 */
void tcp_fack_retransmit(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        struct sk_buff *skb = skb_peek(&sk->write_queue);
        int packet_cnt = 0;

        while ((skb != NULL) &&
               (skb != tp->send_head) &&
               (skb != (struct sk_buff *)&sk->write_queue)) {
                __u8 sacked = TCP_SKB_CB(skb)->sacked;

                if (sacked & (TCPCB_SACKED_ACKED | TCPCB_SACKED_RETRANS))
                        goto next_packet;

                /* Ok, retransmit it. */
                if (tcp_retransmit_skb(sk, skb))
                        break;

                if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
                        break;
        next_packet:
                packet_cnt++;
                if (packet_cnt >= tp->fackets_out)
                        break;
                skb = skb->next;
        }
}
/* Send a fin.  The caller locks the socket for us.  This cannot be
 * allowed to fail queueing a FIN frame under any circumstances.
 */
void tcp_send_fin(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        struct sk_buff *skb = skb_peek_tail(&sk->write_queue);
        unsigned int mss_now;

        /* Optimization, tack on the FIN if we have a queue of
         * unsent frames.  But be careful about outgoing SACKS
         * and IP options.
         */
        mss_now = tcp_current_mss(sk);

        if ((tp->send_head != NULL) && (skb->len < mss_now)) {
                /* tcp_write_xmit() takes care of the rest. */
                TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
                TCP_SKB_CB(skb)->end_seq++;
                tp->write_seq++;

                /* Special case to avoid Nagle bogosity.  If this
                 * segment is the last segment, and it was queued
                 * due to Nagle/SWS-avoidance, send it out now.
                 */
                if (tp->send_head == skb &&
                    !sk->nonagle &&
                    skb->len < (tp->rcv_mss >> 1) &&
                    tp->packets_out &&
                    !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_URG)) {
                        update_send_head(sk);
                        TCP_SKB_CB(skb)->when = tcp_time_stamp;
                        tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
                        tp->packets_out++;
                        tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
                        if (!tcp_timer_is_set(sk, TIME_RETRANS))
                                tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
                }
        } else {
                /* Socket is locked, keep trying until memory is available. */
                do {
                        skb = sock_wmalloc(sk,
                                           (MAX_HEADER +
                                            sk->prot->max_header),
                                           1, GFP_KERNEL);
                } while (skb == NULL);

                /* Reserve space for headers and prepare control bits. */
                skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
                skb->csum = 0;
                TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
                TCP_SKB_CB(skb)->sacked = 0;
                TCP_SKB_CB(skb)->urg_ptr = 0;

                /* FIN eats a sequence byte, write_seq advanced by tcp_send_skb(). */
                TCP_SKB_CB(skb)->seq = tp->write_seq;
                TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
                tcp_send_skb(sk, skb, 0);
        }
}
/* We get here when a process closes a file descriptor (either due to
 * an explicit close() or as a byproduct of exit()'ing) and there
 * was unread data in the receive queue.  This behavior is recommended
 * by draft-ietf-tcpimpl-prob-03.txt section 3.10.  -DaveM
 */
void tcp_send_active_reset(struct sock *sk, int priority)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        struct sk_buff *skb;

        /* NOTE: No TCP options attached and we never retransmit this. */
        skb = alloc_skb(MAX_HEADER + sk->prot->max_header, priority);
        if (!skb)
                return;

        /* Reserve space for headers and prepare control bits. */
        skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
        skb->csum = 0;
        TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
        TCP_SKB_CB(skb)->sacked = 0;
        TCP_SKB_CB(skb)->urg_ptr = 0;

        /* Send it off. */
        TCP_SKB_CB(skb)->seq = tp->write_seq;
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
        TCP_SKB_CB(skb)->when = tcp_time_stamp;
        tcp_transmit_skb(sk, skb);
}
/* WARNING: This routine must only be called when we have already sent
 * a SYN packet that crossed the incoming SYN that caused this routine
 * to get called.  If this assumption fails then the initial rcv_wnd
 * and rcv_wscale values will not be correct.
 */
int tcp_send_synack(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        struct sk_buff *skb;

        skb = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header),
                           1, GFP_ATOMIC);
        if (skb == NULL)
                return -ENOMEM;

        /* Reserve space for headers and prepare control bits. */
        skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
        skb->csum = 0;
        TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_SYN);
        TCP_SKB_CB(skb)->sacked = 0;
        TCP_SKB_CB(skb)->urg_ptr = 0;

        /* SYN eats a sequence byte. */
        TCP_SKB_CB(skb)->seq = tp->snd_una;
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
        __skb_queue_tail(&sk->write_queue, skb);
        TCP_SKB_CB(skb)->when = tcp_time_stamp;
        tp->packets_out++;
        tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
        return 0;
}
/*
 * Prepare a SYN-ACK.
 */
struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
                                 struct open_request *req)
{
        struct tcphdr *th;
        int tcp_header_size;
        struct sk_buff *skb;

        skb = sock_wmalloc(sk, MAX_HEADER + sk->prot->max_header, 1, GFP_ATOMIC);
        if (skb == NULL)
                return NULL;

        /* Reserve space for headers. */
        skb_reserve(skb, MAX_HEADER + sk->prot->max_header);

        skb->dst = dst_clone(dst);

        tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
                           (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
                           (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
                           /* SACK_PERM is in the place of NOP NOP of TS */
                           ((req->sack_ok && !req->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
        skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size);

        memset(th, 0, sizeof(struct tcphdr));
        th->syn = 1;
        th->ack = 1;
        th->source = sk->sport;
        th->dest = req->rmt_port;
        TCP_SKB_CB(skb)->seq = req->snt_isn;
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
        th->seq = htonl(TCP_SKB_CB(skb)->seq);
        th->ack_seq = htonl(req->rcv_isn + 1);
        if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
                __u8 rcv_wscale;
                /* Set this up on the first call only */
                req->window_clamp = skb->dst->window;
                /* tcp_full_space because it is guaranteed to be the first packet */
                tcp_select_initial_window(tcp_full_space(sk),
                        dst->advmss - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
                        &req->rcv_wnd,
                        &req->window_clamp,
                        req->wscale_ok,
                        &rcv_wscale);
                req->rcv_wscale = rcv_wscale;
        }

        /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
        th->window = htons(req->rcv_wnd);

        TCP_SKB_CB(skb)->when = tcp_time_stamp;
        tcp_syn_build_options((__u32 *)(th + 1), dst->advmss, req->tstamp_ok,
                              req->sack_ok, req->wscale_ok, req->rcv_wscale,
                              TCP_SKB_CB(skb)->when,
                              req->ts_recent);

        skb->csum = 0;
        th->doff = (tcp_header_size >> 2);
        tcp_statistics.TcpOutSegs++;
        return skb;
}
int tcp_connect(struct sock *sk, struct sk_buff *buff)
{
        struct dst_entry *dst = __sk_dst_get(sk);
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

        /* Reserve space for headers. */
        skb_reserve(buff, MAX_HEADER + sk->prot->max_header);

        /* We'll fix this up when we get a response from the other end.
         * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
         */
        tp->tcp_header_len = sizeof(struct tcphdr) +
                (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);

        /* If user gave his TCP_MAXSEG, record it to clamp */
        if (tp->user_mss)
                tp->mss_clamp = tp->user_mss;
        tcp_sync_mss(sk, dst->pmtu);

        tp->window_clamp = dst->window;

        tcp_select_initial_window(tcp_full_space(sk),
                dst->advmss - (tp->tcp_header_len - sizeof(struct tcphdr)),
                &tp->rcv_wnd,
                &tp->window_clamp,
                sysctl_tcp_window_scaling,
                &tp->rcv_wscale);

        /* Socket identity change complete, no longer
         * in TCP_CLOSE, so enter ourselves into the
         * hash tables.
         */
        tcp_set_state(sk, TCP_SYN_SENT);
        if (tp->af_specific->hash_connecting(sk))
                goto err_out;

        sk->err = 0;
        tp->snd_wnd = 0;
        tp->snd_wl1 = 0;
        tp->snd_wl2 = tp->write_seq;
        tp->snd_una = tp->write_seq;
        tp->rcv_nxt = 0;
        tp->rcv_wup = 0;
        tp->copied_seq = 0;

        tp->rto = TCP_TIMEOUT_INIT;
        tcp_init_xmit_timers(sk);
        tp->retransmits = 0;
        tp->fackets_out = 0;
        tp->retrans_out = 0;

        TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
        TCP_SKB_CB(buff)->sacked = 0;
        TCP_SKB_CB(buff)->urg_ptr = 0;
        buff->csum = 0;
        TCP_SKB_CB(buff)->seq = tp->write_seq++;
        TCP_SKB_CB(buff)->end_seq = tp->write_seq;
        tp->snd_nxt = tp->write_seq;

        /* Send it off. */
        TCP_SKB_CB(buff)->when = tcp_time_stamp;
        __skb_queue_tail(&sk->write_queue, buff);
        tp->packets_out++;
        tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
        tcp_statistics.TcpActiveOpens++;

        /* Timer for repeating the SYN until an answer. */
        tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
        return 0;

err_out:
        tcp_set_state(sk, TCP_CLOSE);
        kfree_skb(buff);
        return -EADDRNOTAVAIL;
}
/* Send out a delayed ack, the caller does the policy checking
 * to see if we should even be here.  See tcp_input.c:tcp_ack_snd_check()
 * for details.
 */
void tcp_send_delayed_ack(struct sock *sk, int max_timeout)
{
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
        unsigned long timeout;

        /* Stay within the limit we were given */
        timeout = tp->ato;
        if (timeout > max_timeout)
                timeout = max_timeout;
        timeout += jiffies;

        /* Use new timeout only if there wasn't an older one earlier. */
        spin_lock_bh(&sk->timer_lock);
        if (!tp->delack_timer.prev || !del_timer(&tp->delack_timer)) {
                sock_hold(sk);
                tp->delack_timer.expires = timeout;
        } else {
                if (time_before(timeout, tp->delack_timer.expires))
                        tp->delack_timer.expires = timeout;
        }
        add_timer(&tp->delack_timer);
        spin_unlock_bh(&sk->timer_lock);
}
/* This routine sends an ack and also updates the window. */
void tcp_send_ack(struct sock *sk)
{
        /* If we have been reset, we may not send again. */
        if (!sk->zapped) {
                struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
                struct sk_buff *buff;

                /* We are not putting this on the write queue, so
                 * tcp_transmit_skb() will set the ownership to this
                 * sock.
                 */
                buff = alloc_skb(MAX_HEADER + sk->prot->max_header, GFP_ATOMIC);
                if (buff == NULL) {
                        /* Force it to send an ack.  We don't have to do this
                         * (ACK is unreliable) but it's much better use of
                         * bandwidth on slow links to send a spare ack than
                         * resend packets.
                         *
                         * This is the one possible way that we can delay an
                         * ACK and have tp->ato indicate that we are in
                         * quick ack mode, so clear it.
                         */
                        if (tcp_in_quickack_mode(tp))
                                tcp_exit_quickack_mode(tp);
                        tcp_send_delayed_ack(sk, HZ/2);
                        return;
                }

                /* Reserve space for headers and prepare control bits. */
                skb_reserve(buff, MAX_HEADER + sk->prot->max_header);
                buff->csum = 0;
                TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
                TCP_SKB_CB(buff)->sacked = 0;
                TCP_SKB_CB(buff)->urg_ptr = 0;

                /* Send it off, this clears delayed acks for us. */
                TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tp->snd_nxt;
                TCP_SKB_CB(buff)->when = tcp_time_stamp;
                tcp_transmit_skb(sk, buff);
        }
}
/* This routine sends a packet with an out of date sequence
 * number.  It assumes the other end will try to ack it.
 */
void tcp_write_wakeup(struct sock *sk)
{
        /* After a valid reset we can send no more. */
        if (!sk->zapped) {
                struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
                struct sk_buff *skb;

                /* Write data can still be transmitted/retransmitted in the
                 * following states.  If any other state is encountered, return.
                 * [listen/close will never occur here anyway]
                 */
                if ((1 << sk->state) &
                    ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1|
                      TCPF_FIN_WAIT2|TCPF_LAST_ACK|TCPF_CLOSING))
                        return;

                if (before(tp->snd_nxt, tp->snd_una + tp->snd_wnd) &&
                    ((skb = tp->send_head) != NULL)) {
                        unsigned long win_size;

                        /* We are probing the opening of a window
                         * but the window size is != 0; this must have
                         * been the result of SWS avoidance (sender side).
                         */
                        win_size = tp->snd_wnd - (tp->snd_nxt - tp->snd_una);
                        if (win_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq) {
                                if (tcp_fragment(sk, skb, win_size))
                                        return; /* Let a retransmit get it. */
                        }

                        update_send_head(sk);
                        TCP_SKB_CB(skb)->when = tcp_time_stamp;
                        tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
                        tp->packets_out++;
                        tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
                        if (!tcp_timer_is_set(sk, TIME_RETRANS))
                                tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
                } else {
                        /* We don't queue it, tcp_transmit_skb() sets ownership. */
                        skb = alloc_skb(MAX_HEADER + sk->prot->max_header,
                                        GFP_ATOMIC);
                        if (skb == NULL)
                                return;

                        /* Reserve space for headers and set control bits. */
                        skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
                        skb->csum = 0;
                        TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
                        TCP_SKB_CB(skb)->sacked = 0;
                        TCP_SKB_CB(skb)->urg_ptr = 0;

                        /* Use a previous sequence.  This should cause the other
                         * end to send an ack.  Don't queue or clone SKB, just
                         * send it.
                         */
                        TCP_SKB_CB(skb)->seq = tp->snd_nxt - 1;
                        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
                        TCP_SKB_CB(skb)->when = tcp_time_stamp;
                        tcp_transmit_skb(sk, skb);
                }
        }
}
/* A window probe timeout has occurred.  If window is not closed send
 * a partial packet else a zero probe.
 */
void tcp_send_probe0(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

        tcp_write_wakeup(sk);
        tp->pending = TIME_PROBE0;
        tp->backoff++;
        tp->probes_out++;
        tcp_reset_xmit_timer(sk, TIME_PROBE0,
                             min(tp->rto << tp->backoff, 120*HZ));
}
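
/* tcp_send_probe0() backoff example: with rto = 3*HZ the successive probe
 * intervals are 6*HZ, 12*HZ, 24*HZ, ..., i.e. rto << backoff, and they are
 * capped at 120*HZ (two minutes) once the shifted value would exceed it.
 */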