pre-2.3.4..
[davej-history.git] / net / ipv4 / tcp_output.c
blob4d0a4164dd82c06fd743b9badac42b91bf0a7955
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_output.c,v 1.109 1999/05/14 23:10:13 davem Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */

/*
 * Changes:	Pedro Roque	:	Retransmit queue handled by TCP.
 *				:	Fragmentation on mtu decrease
 *				:	Segment collapse on retransmit
 *				:	AF independence
 *
 *		Linus Torvalds	:	send_delayed_ack
 *		David S. Miller	:	Charge memory using the right skb
 *					during syn/ack processing.
 *		David S. Miller	:	Output engine completely rewritten.
 *		Andrea Arcangeli:	SYNACK carry ts_recent in tsecr.
 */
37 #include <net/tcp.h>
39 #include <linux/smp_lock.h>
/* Option-negotiation switches defined elsewhere; consulted when building
 * SYN segments in this file.
 */
extern int sysctl_tcp_timestamps;
extern int sysctl_tcp_window_scaling;
extern int sysctl_tcp_sack;

/* People can turn this off for buggy TCP's found in printers etc. */
int sysctl_tcp_retrans_collapse = 1;
48 /* Get rid of any delayed acks, we sent one already.. */
49 static __inline__ void clear_delayed_acks(struct sock * sk)
51 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
53 tp->delayed_acks = 0;
54 if(tcp_in_quickack_mode(tp))
55 tcp_exit_quickack_mode(tp);
56 tcp_clear_xmit_timer(sk, TIME_DACK);
59 static __inline__ void update_send_head(struct sock *sk)
61 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
63 tp->send_head = tp->send_head->next;
64 if (tp->send_head == (struct sk_buff *) &sk->write_queue)
65 tp->send_head = NULL;
68 /* This routine actually transmits TCP packets queued in by
69 * tcp_do_sendmsg(). This is used by both the initial
70 * transmission and possible later retransmissions.
71 * All SKB's seen here are completely headerless. It is our
72 * job to build the TCP header, and pass the packet down to
73 * IP so it can do the same plus pass the packet off to the
74 * device.
76 * We are working here with either a clone of the original
77 * SKB, or a fresh unique copy made by the retransmit engine.
79 void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
81 if(skb != NULL) {
82 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
83 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
84 int tcp_header_size = tp->tcp_header_len;
85 struct tcphdr *th;
86 int sysctl_flags;
88 #define SYSCTL_FLAG_TSTAMPS 0x1
89 #define SYSCTL_FLAG_WSCALE 0x2
90 #define SYSCTL_FLAG_SACK 0x4
92 sysctl_flags = 0;
93 if(tcb->flags & TCPCB_FLAG_SYN) {
94 tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
95 if(sysctl_tcp_timestamps) {
96 tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
97 sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
99 if(sysctl_tcp_window_scaling) {
100 tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
101 sysctl_flags |= SYSCTL_FLAG_WSCALE;
103 if(sysctl_tcp_sack) {
104 sysctl_flags |= SYSCTL_FLAG_SACK;
105 if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
106 tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
108 } else if(tp->sack_ok && tp->num_sacks) {
109 /* A SACK is 2 pad bytes, a 2 byte header, plus
110 * 2 32-bit sequence numbers for each SACK block.
112 tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
113 (tp->num_sacks * TCPOLEN_SACK_PERBLOCK));
115 th = (struct tcphdr *) skb_push(skb, tcp_header_size);
116 skb->h.th = th;
117 skb_set_owner_w(skb, sk);
119 /* Build TCP header and checksum it. */
120 th->source = sk->sport;
121 th->dest = sk->dport;
122 th->seq = htonl(TCP_SKB_CB(skb)->seq);
123 th->ack_seq = htonl(tp->rcv_nxt);
124 th->doff = (tcp_header_size >> 2);
125 th->res1 = 0;
126 *(((__u8 *)th) + 13) = tcb->flags;
127 if(!(tcb->flags & TCPCB_FLAG_SYN))
128 th->window = htons(tcp_select_window(sk));
129 th->check = 0;
130 th->urg_ptr = ntohs(tcb->urg_ptr);
131 if(tcb->flags & TCPCB_FLAG_SYN) {
132 /* RFC1323: The window in SYN & SYN/ACK segments
133 * is never scaled.
135 th->window = htons(tp->rcv_wnd);
136 tcp_syn_build_options((__u32 *)(th + 1), tp->mss_clamp,
137 (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
138 (sysctl_flags & SYSCTL_FLAG_SACK),
139 (sysctl_flags & SYSCTL_FLAG_WSCALE),
140 tp->rcv_wscale,
141 TCP_SKB_CB(skb)->when,
142 tp->ts_recent);
143 } else {
144 tcp_build_and_update_options((__u32 *)(th + 1),
145 tp, TCP_SKB_CB(skb)->when);
147 tp->af_specific->send_check(sk, th, skb->len, skb);
149 clear_delayed_acks(sk);
150 tp->last_ack_sent = tp->rcv_nxt;
151 tcp_statistics.TcpOutSegs++;
152 tp->af_specific->queue_xmit(skb);
154 #undef SYSCTL_FLAG_TSTAMPS
155 #undef SYSCTL_FLAG_WSCALE
156 #undef SYSCTL_FLAG_SACK
159 /* This is the main buffer sending routine. We queue the buffer
160 * and decide whether to queue or transmit now.
162 void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue)
164 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
166 /* Advance write_seq and place onto the write_queue. */
167 tp->write_seq += (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq);
168 __skb_queue_tail(&sk->write_queue, skb);
170 if (!force_queue && tp->send_head == NULL && tcp_snd_test(sk, skb)) {
171 /* Send it out now. */
172 TCP_SKB_CB(skb)->when = tcp_time_stamp;
173 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
174 tp->packets_out++;
175 tcp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL));
176 if(!tcp_timer_is_set(sk, TIME_RETRANS))
177 tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
178 } else {
179 /* Queue it, remembering where we must start sending. */
180 if (tp->send_head == NULL)
181 tp->send_head = skb;
182 if (!force_queue && tp->packets_out == 0 && !tp->pending) {
183 tp->pending = TIME_PROBE0;
184 tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto);
189 /* Function to create two new TCP segments. Shrinks the given segment
190 * to the specified size and appends a new segment with the rest of the
191 * packet to the list. This won't be called frequently, I hope.
192 * Remember, these are still headerless SKBs at this point.
194 static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
196 struct sk_buff *buff;
197 int nsize = skb->len - len;
198 u16 flags;
200 /* Get a new skb... force flag on. */
201 buff = sock_wmalloc(sk,
202 (nsize + MAX_HEADER + sk->prot->max_header),
203 1, GFP_ATOMIC);
204 if (buff == NULL)
205 return -1; /* We'll just try again later. */
207 /* Reserve space for headers. */
208 skb_reserve(buff, MAX_HEADER + sk->prot->max_header);
210 /* Correct the sequence numbers. */
211 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
212 TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
214 /* PSH and FIN should only be set in the second packet. */
215 flags = TCP_SKB_CB(skb)->flags;
216 TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH);
217 if(flags & TCPCB_FLAG_URG) {
218 u16 old_urg_ptr = TCP_SKB_CB(skb)->urg_ptr;
220 /* Urgent data is always a pain in the ass. */
221 if(old_urg_ptr > len) {
222 TCP_SKB_CB(skb)->flags &= ~(TCPCB_FLAG_URG);
223 TCP_SKB_CB(skb)->urg_ptr = 0;
224 TCP_SKB_CB(buff)->urg_ptr = old_urg_ptr - len;
225 } else {
226 flags &= ~(TCPCB_FLAG_URG);
229 if(!(flags & TCPCB_FLAG_URG))
230 TCP_SKB_CB(buff)->urg_ptr = 0;
231 TCP_SKB_CB(buff)->flags = flags;
232 TCP_SKB_CB(buff)->sacked = 0;
234 /* Copy and checksum data tail into the new buffer. */
235 buff->csum = csum_partial_copy(skb->data + len, skb_put(buff, nsize),
236 nsize, 0);
238 /* This takes care of the FIN sequence number too. */
239 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
240 skb_trim(skb, len);
242 /* Rechecksum original buffer. */
243 skb->csum = csum_partial(skb->data, skb->len, 0);
245 /* Looks stupid, but our code really uses when of
246 * skbs, which it never sent before. --ANK
248 TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
250 /* Link BUFF into the send queue. */
251 __skb_append(skb, buff);
253 return 0;
256 /* This function synchronize snd mss to current pmtu/exthdr set.
258 tp->user_mss is mss set by user by TCP_MAXSEG. It does NOT counts
259 for TCP options, but includes only bare TCP header.
261 tp->mss_clamp is mss negotiated at connection setup.
262 It is minumum of user_mss and mss received with SYN.
263 It also does not include TCP options.
265 tp->pmtu_cookie is last pmtu, seen by this function.
267 tp->mss_cache is current effective sending mss, including
268 all tcp options except for SACKs. It is evaluated,
269 taking into account current pmtu, but never exceeds
270 tp->mss_clamp.
272 NOTE1. rfc1122 clearly states that advertised MSS
273 DOES NOT include either tcp or ip options.
275 NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
276 this function. --ANK (980731)
279 int tcp_sync_mss(struct sock *sk, u32 pmtu)
281 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
282 int mss_now;
284 /* Calculate base mss without TCP options:
285 It is MMS_S - sizeof(tcphdr) of rfc1122
287 mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);
289 /* Clamp it (mss_clamp does not include tcp options) */
290 if (mss_now > tp->mss_clamp)
291 mss_now = tp->mss_clamp;
293 /* Now subtract TCP options size, not including SACKs */
294 mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
296 /* Now subtract optional transport overhead */
297 mss_now -= tp->ext_header_len;
299 /* It we got too small (or even negative) value,
300 clamp it by 8 from below. Why 8 ?
301 Well, it could be 1 with the same success,
302 but if IP accepted segment of length 1,
303 it would love 8 even more 8) --ANK (980731)
305 if (mss_now < 8)
306 mss_now = 8;
308 /* And store cached results */
309 tp->pmtu_cookie = pmtu;
310 tp->mss_cache = mss_now;
311 return mss_now;
315 /* This routine writes packets to the network. It advances the
316 * send_head. This happens as incoming acks open up the remote
317 * window for us.
319 void tcp_write_xmit(struct sock *sk)
321 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
322 unsigned int mss_now;
324 /* Account for SACKS, we may need to fragment due to this.
325 * It is just like the real MSS changing on us midstream.
326 * We also handle things correctly when the user adds some
327 * IP options mid-stream. Silly to do, but cover it.
329 mss_now = tcp_current_mss(sk);
331 /* If we are zapped, the bytes will have to remain here.
332 * In time closedown will empty the write queue and all
333 * will be happy.
335 if(!sk->zapped) {
336 struct sk_buff *skb;
337 int sent_pkts = 0;
339 /* Anything on the transmit queue that fits the window can
340 * be added providing we are:
342 * a) following SWS avoidance [and Nagle algorithm]
343 * b) not exceeding our congestion window.
344 * c) not retransmitting [Nagle]
346 while((skb = tp->send_head) && tcp_snd_test(sk, skb)) {
347 if (skb->len > mss_now) {
348 if (tcp_fragment(sk, skb, mss_now))
349 break;
352 /* Advance the send_head. This one is going out. */
353 update_send_head(sk);
354 TCP_SKB_CB(skb)->when = tcp_time_stamp;
355 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
356 tp->packets_out++;
357 tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
358 sent_pkts = 1;
361 /* If we sent anything, make sure the retransmit
362 * timer is active.
364 if (sent_pkts && !tcp_timer_is_set(sk, TIME_RETRANS))
365 tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
369 /* This function returns the amount that we can raise the
370 * usable window based on the following constraints
372 * 1. The window can never be shrunk once it is offered (RFC 793)
373 * 2. We limit memory per socket
375 * RFC 1122:
376 * "the suggested [SWS] avoidance algorithm for the receiver is to keep
377 * RECV.NEXT + RCV.WIN fixed until:
378 * RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
380 * i.e. don't raise the right edge of the window until you can raise
381 * it at least MSS bytes.
383 * Unfortunately, the recommended algorithm breaks header prediction,
384 * since header prediction assumes th->window stays fixed.
386 * Strictly speaking, keeping th->window fixed violates the receiver
387 * side SWS prevention criteria. The problem is that under this rule
388 * a stream of single byte packets will cause the right side of the
389 * window to always advance by a single byte.
391 * Of course, if the sender implements sender side SWS prevention
392 * then this will not be a problem.
394 * BSD seems to make the following compromise:
396 * If the free space is less than the 1/4 of the maximum
397 * space available and the free space is less than 1/2 mss,
398 * then set the window to 0.
399 * Otherwise, just prevent the window from shrinking
400 * and from being larger than the largest representable value.
402 * This prevents incremental opening of the window in the regime
403 * where TCP is limited by the speed of the reader side taking
404 * data out of the TCP receive queue. It does nothing about
405 * those cases where the window is constrained on the sender side
406 * because the pipeline is full.
408 * BSD also seems to "accidentally" limit itself to windows that are a
409 * multiple of MSS, at least until the free space gets quite small.
410 * This would appear to be a side effect of the mbuf implementation.
411 * Combining these two algorithms results in the observed behavior
412 * of having a fixed window size at almost all times.
414 * Below we obtain similar behavior by forcing the offered window to
415 * a multiple of the mss when it is feasible to do so.
417 * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
419 u32 __tcp_select_window(struct sock *sk)
421 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
422 unsigned int mss = tp->mss_cache;
423 int free_space;
424 u32 window;
426 /* Sometimes free_space can be < 0. */
427 free_space = (sk->rcvbuf - atomic_read(&sk->rmem_alloc)) / 2;
428 if (tp->window_clamp) {
429 if (free_space > ((int) tp->window_clamp))
430 free_space = tp->window_clamp;
431 mss = min(tp->window_clamp, mss);
432 } else {
433 printk("tcp_select_window: tp->window_clamp == 0.\n");
436 if (mss < 1) {
437 mss = 1;
438 printk("tcp_select_window: sk->mss fell to 0.\n");
441 if ((free_space < (sk->rcvbuf/4)) && (free_space < ((int) (mss/2)))) {
442 window = 0;
443 tp->pred_flags = 0;
444 } else {
445 /* Get the largest window that is a nice multiple of mss.
446 * Window clamp already applied above.
447 * If our current window offering is within 1 mss of the
448 * free space we just keep it. This prevents the divide
449 * and multiply from happening most of the time.
450 * We also don't do any window rounding when the free space
451 * is too small.
453 window = tp->rcv_wnd;
454 if ((((int) window) <= (free_space - ((int) mss))) ||
455 (((int) window) > free_space))
456 window = (((unsigned int) free_space)/mss)*mss;
458 return window;
461 /* Attempt to collapse two adjacent SKB's during retransmission. */
462 static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now)
464 struct sk_buff *next_skb = skb->next;
466 /* The first test we must make is that neither of these two
467 * SKB's are still referenced by someone else.
469 if(!skb_cloned(skb) && !skb_cloned(next_skb)) {
470 int skb_size = skb->len, next_skb_size = next_skb->len;
471 u16 flags = TCP_SKB_CB(skb)->flags;
473 /* Punt if the first SKB has URG set. */
474 if(flags & TCPCB_FLAG_URG)
475 return;
477 /* Also punt if next skb has been SACK'd. */
478 if(TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
479 return;
481 /* Punt if not enough space exists in the first SKB for
482 * the data in the second, or the total combined payload
483 * would exceed the MSS.
485 if ((next_skb_size > skb_tailroom(skb)) ||
486 ((skb_size + next_skb_size) > mss_now))
487 return;
489 /* Ok. We will be able to collapse the packet. */
490 __skb_unlink(next_skb, next_skb->list);
492 if(skb->len % 4) {
493 /* Must copy and rechecksum all data. */
494 memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
495 skb->csum = csum_partial(skb->data, skb->len, 0);
496 } else {
497 /* Optimize, actually we could also combine next_skb->csum
498 * to skb->csum using a single add w/carry operation too.
500 skb->csum = csum_partial_copy(next_skb->data,
501 skb_put(skb, next_skb_size),
502 next_skb_size, skb->csum);
505 /* Update sequence range on original skb. */
506 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
508 /* Merge over control information. */
509 flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
510 if(flags & TCPCB_FLAG_URG) {
511 u16 urgptr = TCP_SKB_CB(next_skb)->urg_ptr;
512 TCP_SKB_CB(skb)->urg_ptr = urgptr + skb_size;
514 TCP_SKB_CB(skb)->flags = flags;
516 /* All done, get rid of second SKB and account for it so
517 * packet counting does not break.
519 kfree_skb(next_skb);
520 sk->tp_pinfo.af_tcp.packets_out--;
524 /* Do a simple retransmit without using the backoff mechanisms in
525 * tcp_timer. This is used for path mtu discovery.
526 * The socket is already locked here.
528 void tcp_simple_retransmit(struct sock *sk)
530 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
531 struct sk_buff *skb, *old_next_skb;
532 unsigned int mss = tcp_current_mss(sk);
534 /* Don't muck with the congestion window here. */
535 tp->dup_acks = 0;
536 tp->high_seq = tp->snd_nxt;
537 tp->retrans_head = NULL;
539 /* Input control flow will see that this was retransmitted
540 * and not use it for RTT calculation in the absence of
541 * the timestamp option.
543 for (old_next_skb = skb = skb_peek(&sk->write_queue);
544 ((skb != tp->send_head) &&
545 (skb != (struct sk_buff *)&sk->write_queue));
546 skb = skb->next) {
547 int resend_skb = 0;
549 /* Our goal is to push out the packets which we
550 * sent already, but are being chopped up now to
551 * account for the PMTU information we have.
553 * As we resend the queue, packets are fragmented
554 * into two pieces, and when we try to send the
555 * second piece it may be collapsed together with
556 * a subsequent packet, and so on. -DaveM
558 if (old_next_skb != skb || skb->len > mss)
559 resend_skb = 1;
560 old_next_skb = skb->next;
561 if (resend_skb != 0)
562 tcp_retransmit_skb(sk, skb);
566 static __inline__ void update_retrans_head(struct sock *sk)
568 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
570 tp->retrans_head = tp->retrans_head->next;
571 if((tp->retrans_head == tp->send_head) ||
572 (tp->retrans_head == (struct sk_buff *) &sk->write_queue)) {
573 tp->retrans_head = NULL;
574 tp->rexmt_done = 1;
578 /* This retransmits one SKB. Policy decisions and retransmit queue
579 * state updates are done by the caller. Returns non-zero if an
580 * error occurred which prevented the send.
582 int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
584 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
585 unsigned int cur_mss = tcp_current_mss(sk);
587 if(skb->len > cur_mss) {
588 if(tcp_fragment(sk, skb, cur_mss))
589 return 1; /* We'll try again later. */
591 /* New SKB created, account for it. */
592 tp->packets_out++;
595 /* Collapse two adjacent packets if worthwhile and we can. */
596 if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
597 (skb->len < (cur_mss >> 1)) &&
598 (skb->next != tp->send_head) &&
599 (skb->next != (struct sk_buff *)&sk->write_queue) &&
600 (sysctl_tcp_retrans_collapse != 0))
601 tcp_retrans_try_collapse(sk, skb, cur_mss);
603 if(tp->af_specific->rebuild_header(sk))
604 return 1; /* Routing failure or similar. */
606 /* Some Solaris stacks overoptimize and ignore the FIN on a
607 * retransmit when old data is attached. So strip it off
608 * since it is cheap to do so and saves bytes on the network.
610 if(skb->len > 0 &&
611 (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
612 tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
613 TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
614 skb_trim(skb, 0);
615 skb->csum = 0;
618 /* Ok, we're gonna send it out, update state. */
619 TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_RETRANS;
620 tp->retrans_out++;
622 /* Make a copy, if the first transmission SKB clone we made
623 * is still in somebody's hands, else make a clone.
625 TCP_SKB_CB(skb)->when = tcp_time_stamp;
626 if(skb_cloned(skb))
627 skb = skb_copy(skb, GFP_ATOMIC);
628 else
629 skb = skb_clone(skb, GFP_ATOMIC);
631 tcp_transmit_skb(sk, skb);
633 /* Update global TCP statistics and return success. */
634 sk->prot->retransmits++;
635 tcp_statistics.TcpRetransSegs++;
637 return 0;
640 /* This gets called after a retransmit timeout, and the initially
641 * retransmitted data is acknowledged. It tries to continue
642 * resending the rest of the retransmit queue, until either
643 * we've sent it all or the congestion window limit is reached.
644 * If doing SACK, the first ACK which comes back for a timeout
645 * based retransmit packet might feed us FACK information again.
646 * If so, we use it to avoid unnecessarily retransmissions.
648 void tcp_xmit_retransmit_queue(struct sock *sk)
650 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
651 struct sk_buff *skb;
653 if (tp->retrans_head == NULL &&
654 tp->rexmt_done == 0)
655 tp->retrans_head = skb_peek(&sk->write_queue);
656 if (tp->retrans_head == tp->send_head)
657 tp->retrans_head = NULL;
659 /* Each time, advance the retrans_head if we got
660 * a packet out or we skipped one because it was
661 * SACK'd. -DaveM
663 while ((skb = tp->retrans_head) != NULL) {
664 /* If it has been ack'd by a SACK block, we don't
665 * retransmit it.
667 if(!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
668 /* Send it out, punt if error occurred. */
669 if(tcp_retransmit_skb(sk, skb))
670 break;
672 update_retrans_head(sk);
674 /* Stop retransmitting if we've hit the congestion
675 * window limit.
677 if (tp->retrans_out >= tp->snd_cwnd)
678 break;
679 } else {
680 update_retrans_head(sk);
685 /* Using FACK information, retransmit all missing frames at the receiver
686 * up to the forward most SACK'd packet (tp->fackets_out) if the packet
687 * has not been retransmitted already.
689 void tcp_fack_retransmit(struct sock *sk)
691 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
692 struct sk_buff *skb = skb_peek(&sk->write_queue);
693 int packet_cnt = 0;
695 while((skb != NULL) &&
696 (skb != tp->send_head) &&
697 (skb != (struct sk_buff *)&sk->write_queue)) {
698 __u8 sacked = TCP_SKB_CB(skb)->sacked;
700 if(sacked & (TCPCB_SACKED_ACKED | TCPCB_SACKED_RETRANS))
701 goto next_packet;
703 /* Ok, retransmit it. */
704 if(tcp_retransmit_skb(sk, skb))
705 break;
707 if(tcp_packets_in_flight(tp) >= tp->snd_cwnd)
708 break;
709 next_packet:
710 packet_cnt++;
711 if(packet_cnt >= tp->fackets_out)
712 break;
713 skb = skb->next;
717 /* Send a fin. The caller locks the socket for us. This cannot be
718 * allowed to fail queueing a FIN frame under any circumstances.
720 void tcp_send_fin(struct sock *sk)
722 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
723 struct sk_buff *skb = skb_peek_tail(&sk->write_queue);
724 unsigned int mss_now;
726 /* Optimization, tack on the FIN if we have a queue of
727 * unsent frames. But be careful about outgoing SACKS
728 * and IP options.
730 mss_now = tcp_current_mss(sk);
732 if((tp->send_head != NULL) && (skb->len < mss_now)) {
733 /* tcp_write_xmit() takes care of the rest. */
734 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
735 TCP_SKB_CB(skb)->end_seq++;
736 tp->write_seq++;
738 /* Special case to avoid Nagle bogosity. If this
739 * segment is the last segment, and it was queued
740 * due to Nagle/SWS-avoidance, send it out now.
742 if(tp->send_head == skb &&
743 !sk->nonagle &&
744 skb->len < (tp->mss_cache >> 1) &&
745 tp->packets_out &&
746 !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_URG)) {
747 update_send_head(sk);
748 TCP_SKB_CB(skb)->when = tcp_time_stamp;
749 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
750 tp->packets_out++;
751 tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
752 if(!tcp_timer_is_set(sk, TIME_RETRANS))
753 tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
755 } else {
756 /* Socket is locked, keep trying until memory is available. */
757 do {
758 skb = sock_wmalloc(sk,
759 (MAX_HEADER +
760 sk->prot->max_header),
761 1, GFP_KERNEL);
762 } while (skb == NULL);
764 /* Reserve space for headers and prepare control bits. */
765 skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
766 skb->csum = 0;
767 TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
768 TCP_SKB_CB(skb)->sacked = 0;
769 TCP_SKB_CB(skb)->urg_ptr = 0;
771 /* FIN eats a sequence byte, write_seq advanced by tcp_send_skb(). */
772 TCP_SKB_CB(skb)->seq = tp->write_seq;
773 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
774 tcp_send_skb(sk, skb, 0);
778 /* We get here when a process closes a file descriptor (either due to
779 * an explicit close() or as a byproduct of exit()'ing) and there
780 * was unread data in the receive queue. This behavior is recommended
781 * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM
783 void tcp_send_active_reset(struct sock *sk)
785 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
786 struct sk_buff *skb;
788 /* NOTE: No TCP options attached and we never retransmit this. */
789 skb = alloc_skb(MAX_HEADER + sk->prot->max_header, GFP_KERNEL);
790 if (!skb)
791 return;
793 /* Reserve space for headers and prepare control bits. */
794 skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
795 skb->csum = 0;
796 TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
797 TCP_SKB_CB(skb)->sacked = 0;
798 TCP_SKB_CB(skb)->urg_ptr = 0;
800 /* Send it off. */
801 TCP_SKB_CB(skb)->seq = tp->write_seq;
802 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
803 TCP_SKB_CB(skb)->when = tcp_time_stamp;
804 tcp_transmit_skb(sk, skb);
807 /* WARNING: This routine must only be called when we have already sent
808 * a SYN packet that crossed the incoming SYN that caused this routine
809 * to get called. If this assumption fails then the initial rcv_wnd
810 * and rcv_wscale values will not be correct.
812 int tcp_send_synack(struct sock *sk)
814 struct tcp_opt* tp = &(sk->tp_pinfo.af_tcp);
815 struct sk_buff* skb;
817 skb = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header),
818 1, GFP_ATOMIC);
819 if (skb == NULL)
820 return -ENOMEM;
822 /* Reserve space for headers and prepare control bits. */
823 skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
824 skb->csum = 0;
825 TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_SYN);
826 TCP_SKB_CB(skb)->sacked = 0;
827 TCP_SKB_CB(skb)->urg_ptr = 0;
829 /* SYN eats a sequence byte. */
830 TCP_SKB_CB(skb)->seq = tp->snd_una;
831 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
832 __skb_queue_tail(&sk->write_queue, skb);
833 TCP_SKB_CB(skb)->when = tcp_time_stamp;
834 tp->packets_out++;
835 tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
836 return 0;
840 * Prepare a SYN-ACK.
842 struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
843 struct open_request *req, int mss)
845 struct tcphdr *th;
846 int tcp_header_size;
847 struct sk_buff *skb;
849 skb = sock_wmalloc(sk, MAX_HEADER + sk->prot->max_header, 1, GFP_ATOMIC);
850 if (skb == NULL)
851 return NULL;
853 /* Reserve space for headers. */
854 skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
856 skb->dst = dst_clone(dst);
858 /* Don't offer more than they did.
859 * This way we don't have to memorize who said what.
860 * FIXME: maybe this should be changed for better performance
861 * with syncookies.
863 req->mss = min(mss, req->mss);
864 if (req->mss < 8) {
865 printk(KERN_DEBUG "initial req->mss below 8\n");
866 req->mss = 8;
869 tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
870 (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
871 (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
872 /* SACK_PERM is in the place of NOP NOP of TS */
873 ((req->sack_ok && !req->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
874 skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size);
876 memset(th, 0, sizeof(struct tcphdr));
877 th->syn = 1;
878 th->ack = 1;
879 th->source = sk->sport;
880 th->dest = req->rmt_port;
881 TCP_SKB_CB(skb)->seq = req->snt_isn;
882 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
883 th->seq = htonl(TCP_SKB_CB(skb)->seq);
884 th->ack_seq = htonl(req->rcv_isn + 1);
885 if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
886 __u8 rcv_wscale;
887 /* Set this up on the first call only */
888 req->window_clamp = skb->dst->window;
889 tcp_select_initial_window(sock_rspace(sk)/2,req->mss,
890 &req->rcv_wnd,
891 &req->window_clamp,
892 req->wscale_ok,
893 &rcv_wscale);
894 req->rcv_wscale = rcv_wscale;
897 /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
898 th->window = htons(req->rcv_wnd);
900 TCP_SKB_CB(skb)->when = tcp_time_stamp;
901 tcp_syn_build_options((__u32 *)(th + 1), req->mss, req->tstamp_ok,
902 req->sack_ok, req->wscale_ok, req->rcv_wscale,
903 TCP_SKB_CB(skb)->when,
904 req->ts_recent);
906 skb->csum = 0;
907 th->doff = (tcp_header_size >> 2);
908 tcp_statistics.TcpOutSegs++;
909 return skb;
912 void tcp_connect(struct sock *sk, struct sk_buff *buff, int mtu)
914 struct dst_entry *dst = sk->dst_cache;
915 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
917 /* Reserve space for headers. */
918 skb_reserve(buff, MAX_HEADER + sk->prot->max_header);
920 tp->snd_wnd = 0;
921 tp->snd_wl1 = 0;
922 tp->snd_wl2 = tp->write_seq;
923 tp->snd_una = tp->write_seq;
924 tp->rcv_nxt = 0;
926 sk->err = 0;
928 /* We'll fix this up when we get a response from the other end.
929 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
931 tp->tcp_header_len = sizeof(struct tcphdr) +
932 (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
934 /* If user gave his TCP_MAXSEG, record it to clamp */
935 if (tp->user_mss)
936 tp->mss_clamp = tp->user_mss;
937 tcp_sync_mss(sk, mtu);
939 /* Now unpleasant action: if initial pmtu is too low
940 set lower clamp. I am not sure that it is good.
941 To be more exact, I do not think that clamping at value, which
942 is apparently transient and may improve in future is good idea.
943 It would be better to wait until peer will returns its MSS
944 (probably 65535 too) and now advertise something sort of 65535
945 or at least first hop device mtu. Is it clear, what I mean?
946 We should tell peer what maximal mss we expect to RECEIVE,
947 it has nothing to do with pmtu.
948 I am afraid someone will be confused by such huge value.
949 --ANK (980731)
951 if (tp->mss_cache + tp->tcp_header_len - sizeof(struct tcphdr) < tp->mss_clamp )
952 tp->mss_clamp = tp->mss_cache + tp->tcp_header_len - sizeof(struct tcphdr);
954 TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
955 TCP_SKB_CB(buff)->sacked = 0;
956 TCP_SKB_CB(buff)->urg_ptr = 0;
957 buff->csum = 0;
958 TCP_SKB_CB(buff)->seq = tp->write_seq++;
959 TCP_SKB_CB(buff)->end_seq = tp->write_seq;
960 tp->snd_nxt = TCP_SKB_CB(buff)->end_seq;
962 tp->window_clamp = dst->window;
963 tcp_select_initial_window(sock_rspace(sk)/2,tp->mss_clamp,
964 &tp->rcv_wnd,
965 &tp->window_clamp,
966 sysctl_tcp_window_scaling,
967 &tp->rcv_wscale);
968 /* Ok, now lock the socket before we make it visible to
969 * the incoming packet engine.
971 unlock_kernel();
972 lock_sock(sk);
974 /* Socket identity change complete, no longer
975 * in TCP_CLOSE, so enter ourselves into the
976 * hash tables.
978 tcp_set_state(sk,TCP_SYN_SENT);
979 sk->prot->hash(sk);
981 tp->rto = dst->rtt;
982 tcp_init_xmit_timers(sk);
983 tp->retransmits = 0;
984 tp->fackets_out = 0;
985 tp->retrans_out = 0;
987 /* Send it off. */
988 __skb_queue_tail(&sk->write_queue, buff);
989 TCP_SKB_CB(buff)->when = tcp_time_stamp;
990 tp->packets_out++;
991 tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
992 tcp_statistics.TcpActiveOpens++;
994 /* Timer for repeating the SYN until an answer. */
995 tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
997 /* Now, it is safe to release the socket. */
998 release_sock(sk);
999 lock_kernel();
1002 /* Send out a delayed ack, the caller does the policy checking
1003 * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check()
1004 * for details.
1006 void tcp_send_delayed_ack(struct tcp_opt *tp, int max_timeout)
1008 unsigned long timeout;
1010 /* Stay within the limit we were given */
1011 timeout = tp->ato;
1012 if (timeout > max_timeout)
1013 timeout = max_timeout;
1014 timeout += jiffies;
1016 /* Use new timeout only if there wasn't a older one earlier. */
1017 if (!tp->delack_timer.prev) {
1018 tp->delack_timer.expires = timeout;
1019 add_timer(&tp->delack_timer);
1020 } else {
1021 if (time_before(timeout, tp->delack_timer.expires))
1022 mod_timer(&tp->delack_timer, timeout);
1026 /* This routine sends an ack and also updates the window. */
1027 void tcp_send_ack(struct sock *sk)
1029 /* If we have been reset, we may not send again. */
1030 if(!sk->zapped) {
1031 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1032 struct sk_buff *buff;
1034 /* We are not putting this on the write queue, so
1035 * tcp_transmit_skb() will set the ownership to this
1036 * sock.
1038 buff = alloc_skb(MAX_HEADER + sk->prot->max_header, GFP_ATOMIC);
1039 if (buff == NULL) {
1040 /* Force it to send an ack. We don't have to do this
1041 * (ACK is unreliable) but it's much better use of
1042 * bandwidth on slow links to send a spare ack than
1043 * resend packets.
1045 * This is the one possible way that we can delay an
1046 * ACK and have tp->ato indicate that we are in
1047 * quick ack mode, so clear it.
1049 if(tcp_in_quickack_mode(tp))
1050 tcp_exit_quickack_mode(tp);
1051 tcp_send_delayed_ack(tp, HZ/2);
1052 return;
1055 /* Reserve space for headers and prepare control bits. */
1056 skb_reserve(buff, MAX_HEADER + sk->prot->max_header);
1057 buff->csum = 0;
1058 TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
1059 TCP_SKB_CB(buff)->sacked = 0;
1060 TCP_SKB_CB(buff)->urg_ptr = 0;
1062 /* Send it off, this clears delayed acks for us. */
1063 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tp->snd_nxt;
1064 TCP_SKB_CB(buff)->when = tcp_time_stamp;
1065 tcp_transmit_skb(sk, buff);
1069 /* This routine sends a packet with an out of date sequence
1070 * number. It assumes the other end will try to ack it.
1072 void tcp_write_wakeup(struct sock *sk)
1074 /* After a valid reset we can send no more. */
1075 if (!sk->zapped) {
1076 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1077 struct sk_buff *skb;
1079 /* Write data can still be transmitted/retransmitted in the
1080 * following states. If any other state is encountered, return.
1081 * [listen/close will never occur here anyway]
1083 if ((1 << sk->state) &
1084 ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1|
1085 TCPF_LAST_ACK|TCPF_CLOSING))
1086 return;
1088 if (before(tp->snd_nxt, tp->snd_una + tp->snd_wnd) &&
1089 ((skb = tp->send_head) != NULL)) {
1090 unsigned long win_size;
1092 /* We are probing the opening of a window
1093 * but the window size is != 0
1094 * must have been a result SWS avoidance ( sender )
1096 win_size = tp->snd_wnd - (tp->snd_nxt - tp->snd_una);
1097 if (win_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq) {
1098 if (tcp_fragment(sk, skb, win_size))
1099 return; /* Let a retransmit get it. */
1101 update_send_head(sk);
1102 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1103 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
1104 tp->packets_out++;
1105 tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
1106 if (!tcp_timer_is_set(sk, TIME_RETRANS))
1107 tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
1108 } else {
1109 /* We don't queue it, tcp_transmit_skb() sets ownership. */
1110 skb = alloc_skb(MAX_HEADER + sk->prot->max_header,
1111 GFP_ATOMIC);
1112 if (skb == NULL)
1113 return;
1115 /* Reserve space for headers and set control bits. */
1116 skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
1117 skb->csum = 0;
1118 TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
1119 TCP_SKB_CB(skb)->sacked = 0;
1120 TCP_SKB_CB(skb)->urg_ptr = 0;
1122 /* Use a previous sequence. This should cause the other
1123 * end to send an ack. Don't queue or clone SKB, just
1124 * send it.
1126 TCP_SKB_CB(skb)->seq = tp->snd_nxt - 1;
1127 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
1128 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1129 tcp_transmit_skb(sk, skb);
1134 /* A window probe timeout has occurred. If window is not closed send
1135 * a partial packet else a zero probe.
1137 void tcp_send_probe0(struct sock *sk)
1139 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1141 tcp_write_wakeup(sk);
1142 tp->pending = TIME_PROBE0;
1143 tp->backoff++;
1144 tp->probes_out++;
1145 tcp_reset_xmit_timer (sk, TIME_PROBE0,
1146 min(tp->rto << tp->backoff, 120*HZ));