net/ipv4/tcp_output.c (Linux 2.3.18pre1, davej-history.git)
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:     $Id: tcp_output.c,v 1.113 1999/09/07 02:31:39 davem Exp $
 *
 * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Mark Evans, <evansmp@uhura.aston.ac.uk>
 *              Corey Minyard <wf-rch!minyard@relay.EU.net>
 *              Florian La Roche, <flla@stud.uni-sb.de>
 *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *              Linus Torvalds, <torvalds@cs.helsinki.fi>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Matthew Dillon, <dillon@apollo.west.oic.com>
 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *              Jorge Cwik, <jorge@laser.satlink.net>
 */

/*
 * Changes:     Pedro Roque     :       Retransmit queue handled by TCP.
 *                              :       Fragmentation on mtu decrease
 *                              :       Segment collapse on retransmit
 *                              :       AF independence
 *
 *              Linus Torvalds  :       send_delayed_ack
 *              David S. Miller :       Charge memory using the right skb
 *                                      during syn/ack processing.
 *              David S. Miller :       Output engine completely rewritten.
 *              Andrea Arcangeli:       SYNACK carry ts_recent in tsecr.
 */
#include <net/tcp.h>

#include <linux/smp_lock.h>

extern int sysctl_tcp_timestamps;
extern int sysctl_tcp_window_scaling;
extern int sysctl_tcp_sack;

/* People can turn this off for buggy TCP's found in printers etc. */
int sysctl_tcp_retrans_collapse = 1;
/* Get rid of any delayed acks, we sent one already.. */
static __inline__ void clear_delayed_acks(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

        tp->delayed_acks = 0;
        if (tcp_in_quickack_mode(tp))
                tcp_exit_quickack_mode(tp);
        tcp_clear_xmit_timer(sk, TIME_DACK);
}

static __inline__ void update_send_head(struct sock *sk)
{
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

        tp->send_head = tp->send_head->next;
        if (tp->send_head == (struct sk_buff *) &sk->write_queue)
                tp->send_head = NULL;
}
/* Calculate mss to advertise in SYN segment.
   RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:

   1. It is independent of path mtu.
   2. Ideally, it is maximal possible segment size i.e. 65535-40.
   3. For IPv4 it is reasonable to calculate it from maximal MTU of
      attached devices, because some buggy hosts are confused by
      large MSS.
   4. We do not do 3; we advertise an MSS calculated from the first
      hop device mtu, but allow it to be raised to ip_rt_min_advmss.
      This may be overridden via information stored in the routing table.
   5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
      probably even Jumbo".
 */
static __u16 tcp_advertise_mss(struct sock *sk)
{
        struct dst_entry *dst = __sk_dst_get(sk);
        int mss;

        if (dst) {
                mss = dst->advmss;
        } else {
                struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

                /* No dst. It is bad. Guess some reasonable value.
                 * Actually, this case should not be possible.
                 * SANITY.
                 */
                BUG_TRAP(dst != NULL);

                mss = tp->mss_cache;
                mss += (tp->tcp_header_len - sizeof(struct tcphdr)) +
                        tp->ext_header_len;

                /* Minimal MSS to include full set of TCP/IP options
                   plus 8 bytes of data. It corresponds to mtu 128.
                 */
                if (mss < 88)
                        mss = 88;
        }

        return (__u16)mss;
}
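
/* tcp_advertise_mss() example: over a first-hop Ethernet device (mtu 1500)
 * dst->advmss typically works out to 1500 - 40 = 1460.  The 88-byte floor
 * above corresponds to mtu 128: 128 - 20 (IP header) - 20 (TCP header) = 88.
 */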
/* This routine actually transmits TCP packets queued in by
 * tcp_do_sendmsg().  This is used by both the initial
 * transmission and possible later retransmissions.
 * All SKB's seen here are completely headerless.  It is our
 * job to build the TCP header, and pass the packet down to
 * IP so it can do the same plus pass the packet off to the
 * device.
 *
 * We are working here with either a clone of the original
 * SKB, or a fresh unique copy made by the retransmit engine.
 */
void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
{
        if (skb != NULL) {
                struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
                struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
                int tcp_header_size = tp->tcp_header_len;
                struct tcphdr *th;
                int sysctl_flags;

#define SYSCTL_FLAG_TSTAMPS     0x1
#define SYSCTL_FLAG_WSCALE      0x2
#define SYSCTL_FLAG_SACK        0x4

                sysctl_flags = 0;
                if (tcb->flags & TCPCB_FLAG_SYN) {
                        tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
                        if (sysctl_tcp_timestamps) {
                                tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
                                sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
                        }
                        if (sysctl_tcp_window_scaling) {
                                tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
                                sysctl_flags |= SYSCTL_FLAG_WSCALE;
                        }
                        if (sysctl_tcp_sack) {
                                sysctl_flags |= SYSCTL_FLAG_SACK;
                                if (!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
                                        tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
                        }
                } else if (tp->sack_ok && tp->num_sacks) {
                        /* A SACK is 2 pad bytes, a 2 byte header, plus
                         * 2 32-bit sequence numbers for each SACK block.
                         */
                        tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
                                            (tp->num_sacks * TCPOLEN_SACK_PERBLOCK));
                }
                th = (struct tcphdr *) skb_push(skb, tcp_header_size);
                skb->h.th = th;
                skb_set_owner_w(skb, sk);

                /* Build TCP header and checksum it. */
                th->source = sk->sport;
                th->dest = sk->dport;
                th->seq = htonl(TCP_SKB_CB(skb)->seq);
                th->ack_seq = htonl(tp->rcv_nxt);
                th->doff = (tcp_header_size >> 2);
                th->res1 = 0;
                *(((__u8 *)th) + 13) = tcb->flags;
                th->check = 0;
                th->urg_ptr = ntohs(tcb->urg_ptr);
                if (tcb->flags & TCPCB_FLAG_SYN) {
                        /* RFC1323: The window in SYN & SYN/ACK segments
                         * is never scaled.
                         */
                        th->window = htons(tp->rcv_wnd);
                        tcp_syn_build_options((__u32 *)(th + 1),
                                              tcp_advertise_mss(sk),
                                              (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
                                              (sysctl_flags & SYSCTL_FLAG_SACK),
                                              (sysctl_flags & SYSCTL_FLAG_WSCALE),
                                              tp->rcv_wscale,
                                              TCP_SKB_CB(skb)->when,
                                              tp->ts_recent);
                } else {
                        th->window = htons(tcp_select_window(sk));
                        tcp_build_and_update_options((__u32 *)(th + 1),
                                                     tp, TCP_SKB_CB(skb)->when);
                }
                tp->af_specific->send_check(sk, th, skb->len, skb);

                clear_delayed_acks(sk);
                tp->last_ack_sent = tp->rcv_nxt;
                tcp_statistics.TcpOutSegs++;
                tp->af_specific->queue_xmit(skb);
        }
#undef SYSCTL_FLAG_TSTAMPS
#undef SYSCTL_FLAG_WSCALE
#undef SYSCTL_FLAG_SACK
}
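
/* Header-size arithmetic in tcp_transmit_skb(), assuming the usual option
 * lengths from <net/tcp.h> (TCPOLEN_MSS = 4, TCPOLEN_TSTAMP_ALIGNED = 12,
 * TCPOLEN_WSCALE_ALIGNED = 4, TCPOLEN_SACKPERM_ALIGNED = 4,
 * TCPOLEN_SACK_BASE_ALIGNED = 4, TCPOLEN_SACK_PERBLOCK = 8): a SYN with
 * timestamps, window scaling and SACK enabled uses 20 + 4 + 12 + 4 = 40
 * header bytes; SACK-permitted costs nothing extra because it sits in place
 * of the timestamp option's two NOPs.  An established-state segment carrying
 * one SACK block adds 4 + 1*8 = 12 bytes on top of tp->tcp_header_len.
 */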
/* This is the main buffer sending routine.  We queue the buffer
 * and decide whether to queue or transmit now.
 */
void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

        /* Advance write_seq and place onto the write_queue. */
        tp->write_seq += (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq);
        __skb_queue_tail(&sk->write_queue, skb);

        if (!force_queue && tp->send_head == NULL && tcp_snd_test(sk, skb)) {
                /* Send it out now. */
                TCP_SKB_CB(skb)->when = tcp_time_stamp;
                tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
                tp->packets_out++;
                tcp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL));
                if (!tcp_timer_is_set(sk, TIME_RETRANS))
                        tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
        } else {
                /* Queue it, remembering where we must start sending. */
                if (tp->send_head == NULL)
                        tp->send_head = skb;
                if (!force_queue && tp->packets_out == 0 && !tp->pending) {
                        tp->pending = TIME_PROBE0;
                        tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto);
                }
        }
}
/* Function to create two new TCP segments.  Shrinks the given segment
 * to the specified size and appends a new segment with the rest of the
 * packet to the list.  This won't be called frequently, I hope.
 * Remember, these are still headerless SKBs at this point.
 */
static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
{
        struct sk_buff *buff;
        int nsize = skb->len - len;
        u16 flags;

        /* Get a new skb... force flag on. */
        buff = sock_wmalloc(sk,
                            (nsize + MAX_HEADER + sk->prot->max_header),
                            1, GFP_ATOMIC);
        if (buff == NULL)
                return -1; /* We'll just try again later. */

        /* Reserve space for headers. */
        skb_reserve(buff, MAX_HEADER + sk->prot->max_header);

        /* Correct the sequence numbers. */
        TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
        TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;

        /* PSH and FIN should only be set in the second packet. */
        flags = TCP_SKB_CB(skb)->flags;
        TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH);
        if (flags & TCPCB_FLAG_URG) {
                u16 old_urg_ptr = TCP_SKB_CB(skb)->urg_ptr;

                /* Urgent data is always a pain in the ass. */
                if (old_urg_ptr > len) {
                        TCP_SKB_CB(skb)->flags &= ~(TCPCB_FLAG_URG);
                        TCP_SKB_CB(skb)->urg_ptr = 0;
                        TCP_SKB_CB(buff)->urg_ptr = old_urg_ptr - len;
                } else {
                        flags &= ~(TCPCB_FLAG_URG);
                }
        }
        if (!(flags & TCPCB_FLAG_URG))
                TCP_SKB_CB(buff)->urg_ptr = 0;
        TCP_SKB_CB(buff)->flags = flags;
        TCP_SKB_CB(buff)->sacked = 0;

        /* Copy and checksum data tail into the new buffer. */
        buff->csum = csum_partial_copy(skb->data + len, skb_put(buff, nsize),
                                       nsize, 0);

        /* This takes care of the FIN sequence number too. */
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
        skb_trim(skb, len);

        /* Rechecksum original buffer. */
        skb->csum = csum_partial(skb->data, skb->len, 0);

        /* Looks stupid, but our code really uses the when field of
         * skbs which it has never sent before. --ANK
         */
        TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;

        /* Link BUFF into the send queue. */
        __skb_append(skb, buff);

        return 0;
}
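
/* tcp_fragment() example: splitting a 2048-byte skb at len = 1460 leaves the
 * original covering [seq, seq+1460) and appends a 588-byte buff covering
 * [seq+1460, end_seq); PSH/FIN (and URG, when the urgent pointer lies in the
 * tail) follow the second piece.
 */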
/* This function synchronizes snd mss to current pmtu/exthdr set.

   tp->user_mss is mss set by user by TCP_MAXSEG. It does NOT count
   for TCP options, but includes only bare TCP header.

   tp->mss_clamp is mss negotiated at connection setup.
   It is minimum of user_mss and mss received with SYN.
   It also does not include TCP options.

   tp->pmtu_cookie is last pmtu, seen by this function.

   tp->mss_cache is current effective sending mss, including
   all tcp options except for SACKs. It is evaluated,
   taking into account current pmtu, but never exceeds
   tp->mss_clamp.

   NOTE1. rfc1122 clearly states that advertised MSS
   DOES NOT include either tcp or ip options.

   NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
   this function.                       --ANK (980731)
 */
int tcp_sync_mss(struct sock *sk, u32 pmtu)
{
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
        int mss_now;

        /* Calculate base mss without TCP options:
           It is MMS_S - sizeof(tcphdr) of rfc1122
         */
        mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);

        /* Clamp it (mss_clamp does not include tcp options) */
        if (mss_now > tp->mss_clamp)
                mss_now = tp->mss_clamp;

        /* Now subtract TCP options size, not including SACKs */
        mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);

        /* Now subtract optional transport overhead */
        mss_now -= tp->ext_header_len;

        /* If we got too small (or even negative) value,
           clamp it by 8 from below. Why 8 ?
           Well, it could be 1 with the same success,
           but if IP accepted segment of length 1,
           it would love 8 even more 8)   --ANK (980731)
         */
        if (mss_now < 8)
                mss_now = 8;

        /* And store cached results */
        tp->pmtu_cookie = pmtu;
        tp->mss_cache = mss_now;

        return mss_now;
}
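
/* tcp_sync_mss() worked example for IPv4 with a 1500-byte path MTU and
 * timestamps enabled: 1500 - 20 (IP) - 20 (TCP) = 1460, clamped by
 * tp->mss_clamp, then minus the 12 bytes of aligned timestamp option already
 * counted in tcp_header_len, giving mss_cache = 1448 when there are no
 * extension headers.
 */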
/* This routine writes packets to the network.  It advances the
 * send_head.  This happens as incoming acks open up the remote
 * window for us.
 */
void tcp_write_xmit(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        unsigned int mss_now;

        /* Account for SACKS, we may need to fragment due to this.
         * It is just like the real MSS changing on us midstream.
         * We also handle things correctly when the user adds some
         * IP options mid-stream.  Silly to do, but cover it.
         */
        mss_now = tcp_current_mss(sk);

        /* If we are zapped, the bytes will have to remain here.
         * In time closedown will empty the write queue and all
         * will be happy.
         */
        if (!sk->zapped) {
                struct sk_buff *skb;
                int sent_pkts = 0;

                /* Anything on the transmit queue that fits the window can
                 * be added providing we are:
                 *
                 * a) following SWS avoidance [and Nagle algorithm]
                 * b) not exceeding our congestion window.
                 * c) not retransmitting [Nagle]
                 */
                while ((skb = tp->send_head) && tcp_snd_test(sk, skb)) {
                        if (skb->len > mss_now) {
                                if (tcp_fragment(sk, skb, mss_now))
                                        break;
                        }

                        /* Advance the send_head.  This one is going out. */
                        update_send_head(sk);
                        TCP_SKB_CB(skb)->when = tcp_time_stamp;
                        tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
                        tp->packets_out++;
                        tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
                        sent_pkts = 1;
                }

                /* If we sent anything, make sure the retransmit
                 * timer is active.
                 */
                if (sent_pkts && !tcp_timer_is_set(sk, TIME_RETRANS))
                        tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
        }
}
/* This function returns the amount that we can raise the
 * usable window based on the following constraints
 *
 * 1. The window can never be shrunk once it is offered (RFC 793)
 * 2. We limit memory per socket
 *
 * RFC 1122:
 * "the suggested [SWS] avoidance algorithm for the receiver is to keep
 *  RECV.NEXT + RCV.WIN fixed until:
 *  RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
 *
 * i.e. don't raise the right edge of the window until you can raise
 * it at least MSS bytes.
 *
 * Unfortunately, the recommended algorithm breaks header prediction,
 * since header prediction assumes th->window stays fixed.
 *
 * Strictly speaking, keeping th->window fixed violates the receiver
 * side SWS prevention criteria.  The problem is that under this rule
 * a stream of single byte packets will cause the right side of the
 * window to always advance by a single byte.
 *
 * Of course, if the sender implements sender side SWS prevention
 * then this will not be a problem.
 *
 * BSD seems to make the following compromise:
 *
 *      If the free space is less than the 1/4 of the maximum
 *      space available and the free space is less than 1/2 mss,
 *      then set the window to 0.
 *      Otherwise, just prevent the window from shrinking
 *      and from being larger than the largest representable value.
 *
 * This prevents incremental opening of the window in the regime
 * where TCP is limited by the speed of the reader side taking
 * data out of the TCP receive queue.  It does nothing about
 * those cases where the window is constrained on the sender side
 * because the pipeline is full.
 *
 * BSD also seems to "accidentally" limit itself to windows that are a
 * multiple of MSS, at least until the free space gets quite small.
 * This would appear to be a side effect of the mbuf implementation.
 * Combining these two algorithms results in the observed behavior
 * of having a fixed window size at almost all times.
 *
 * Below we obtain similar behavior by forcing the offered window to
 * a multiple of the mss when it is feasible to do so.
 *
 * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
 * Regular options like TIMESTAMP are taken into account.
 */
u32 __tcp_select_window(struct sock *sk)
{
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
        /* MSS for the peer's data.  Previous versions used mss_clamp
         * here.  I don't know if the value based on our guesses
         * of peer's MSS is better for the performance.  It's more correct
         * but may be worse for the performance because of rcv_mss
         * fluctuations.  --SAW  1998/11/1
         */
        unsigned int mss = tp->rcv_mss;
        int free_space;
        u32 window;

        /* Sometimes free_space can be < 0. */
        free_space = tcp_space(sk);
        if (free_space > ((int) tp->window_clamp))
                free_space = tp->window_clamp;
        if (tp->window_clamp < mss)
                mss = tp->window_clamp;

        if ((free_space < (tcp_full_space(sk) / 2)) &&
            (free_space < ((int) (mss/2)))) {
                window = 0;
                tp->pred_flags = 0;
        } else {
                /* Get the largest window that is a nice multiple of mss.
                 * Window clamp already applied above.
                 * If our current window offering is within 1 mss of the
                 * free space we just keep it.  This prevents the divide
                 * and multiply from happening most of the time.
                 * We also don't do any window rounding when the free space
                 * is too small.
                 */
                window = tp->rcv_wnd;
                if ((((int) window) <= (free_space - ((int) mss))) ||
                    (((int) window) > free_space))
                        window = (((unsigned int) free_space)/mss)*mss;
        }
        return window;
}
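
/* __tcp_select_window() rounding example, assuming rcv_mss = 1460 and
 * free_space = 16384: a current offer of rcv_wnd = 17520 exceeds free_space,
 * so it is replaced by (16384/1460)*1460 = 16060; an offer already within
 * one mss below free_space is left alone, avoiding the divide and multiply.
 */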
/* Attempt to collapse two adjacent SKB's during retransmission. */
static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now)
{
        struct sk_buff *next_skb = skb->next;

        /* The first test we must make is that neither of these two
         * SKB's are still referenced by someone else.
         */
        if (!skb_cloned(skb) && !skb_cloned(next_skb)) {
                int skb_size = skb->len, next_skb_size = next_skb->len;
                u16 flags = TCP_SKB_CB(skb)->flags;

                /* Punt if the first SKB has URG set. */
                if (flags & TCPCB_FLAG_URG)
                        return;

                /* Also punt if next skb has been SACK'd. */
                if (TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
                        return;

                /* Punt if not enough space exists in the first SKB for
                 * the data in the second, or the total combined payload
                 * would exceed the MSS.
                 */
                if ((next_skb_size > skb_tailroom(skb)) ||
                    ((skb_size + next_skb_size) > mss_now))
                        return;

                /* Ok.  We will be able to collapse the packet. */
                __skb_unlink(next_skb, next_skb->list);

                if (skb->len % 4) {
                        /* Must copy and rechecksum all data. */
                        memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
                        skb->csum = csum_partial(skb->data, skb->len, 0);
                } else {
                        /* Optimize, actually we could also combine next_skb->csum
                         * to skb->csum using a single add w/carry operation too.
                         */
                        skb->csum = csum_partial_copy(next_skb->data,
                                                      skb_put(skb, next_skb_size),
                                                      next_skb_size, skb->csum);
                }

                /* Update sequence range on original skb. */
                TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;

                /* Merge over control information. */
                flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
                if (flags & TCPCB_FLAG_URG) {
                        u16 urgptr = TCP_SKB_CB(next_skb)->urg_ptr;
                        TCP_SKB_CB(skb)->urg_ptr = urgptr + skb_size;
                }
                TCP_SKB_CB(skb)->flags = flags;

                /* All done, get rid of second SKB and account for it so
                 * packet counting does not break.
                 */
                kfree_skb(next_skb);
                sk->tp_pinfo.af_tcp.packets_out--;
        }
}
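
/* Illustration of the collapse above: two queued 512-byte segments can be
 * merged into a single 1024-byte segment when mss_now >= 1024, the first skb
 * has at least 512 bytes of tailroom, neither skb is cloned, the first
 * carries no URG and the second has not been SACK'd; the merged skb then
 * spans both sequence ranges and inherits PSH/FIN from the second.
 */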
/* Do a simple retransmit without using the backoff mechanisms in
 * tcp_timer.  This is used for path mtu discovery.
 * The socket is already locked here.
 */
void tcp_simple_retransmit(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        struct sk_buff *skb, *old_next_skb;
        unsigned int mss = tcp_current_mss(sk);

        /* Don't muck with the congestion window here. */
        tp->dup_acks = 0;
        tp->high_seq = tp->snd_nxt;
        tp->retrans_head = NULL;

        /* Input control flow will see that this was retransmitted
         * and not use it for RTT calculation in the absence of
         * the timestamp option.
         */
        for (old_next_skb = skb = skb_peek(&sk->write_queue);
             ((skb != tp->send_head) &&
              (skb != (struct sk_buff *)&sk->write_queue));
             skb = skb->next) {
                int resend_skb = 0;

                /* Our goal is to push out the packets which we
                 * sent already, but are being chopped up now to
                 * account for the PMTU information we have.
                 *
                 * As we resend the queue, packets are fragmented
                 * into two pieces, and when we try to send the
                 * second piece it may be collapsed together with
                 * a subsequent packet, and so on.  -DaveM
                 */
                if (old_next_skb != skb || skb->len > mss)
                        resend_skb = 1;
                old_next_skb = skb->next;
                if (resend_skb != 0)
                        tcp_retransmit_skb(sk, skb);
        }
}
static __inline__ void update_retrans_head(struct sock *sk)
{
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

        tp->retrans_head = tp->retrans_head->next;
        if ((tp->retrans_head == tp->send_head) ||
            (tp->retrans_head == (struct sk_buff *) &sk->write_queue)) {
                tp->retrans_head = NULL;
                tp->rexmt_done = 1;
        }
}
/* This retransmits one SKB.  Policy decisions and retransmit queue
 * state updates are done by the caller.  Returns non-zero if an
 * error occurred which prevented the send.
 */
int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        unsigned int cur_mss = tcp_current_mss(sk);

        if (skb->len > cur_mss) {
                if (tcp_fragment(sk, skb, cur_mss))
                        return 1; /* We'll try again later. */

                /* New SKB created, account for it. */
                tp->packets_out++;
        }

        /* Collapse two adjacent packets if worthwhile and we can. */
        if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
            (skb->len < (cur_mss >> 1)) &&
            (skb->next != tp->send_head) &&
            (skb->next != (struct sk_buff *)&sk->write_queue) &&
            (sysctl_tcp_retrans_collapse != 0))
                tcp_retrans_try_collapse(sk, skb, cur_mss);

        if (tp->af_specific->rebuild_header(sk))
                return 1; /* Routing failure or similar. */

        /* Some Solaris stacks overoptimize and ignore the FIN on a
         * retransmit when old data is attached.  So strip it off
         * since it is cheap to do so and saves bytes on the network.
         */
        if (skb->len > 0 &&
            (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
            tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
                TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
                skb_trim(skb, 0);
                skb->csum = 0;
        }

        /* Ok, we're gonna send it out, update state. */
        TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_RETRANS;
        tp->retrans_out++;

        /* Make a copy, if the first transmission SKB clone we made
         * is still in somebody's hands, else make a clone.
         */
        TCP_SKB_CB(skb)->when = tcp_time_stamp;
        if (skb_cloned(skb))
                skb = skb_copy(skb, GFP_ATOMIC);
        else
                skb = skb_clone(skb, GFP_ATOMIC);

        tcp_transmit_skb(sk, skb);

        /* Update global TCP statistics and return success. */
        sk->prot->retransmits++;
        tcp_statistics.TcpRetransSegs++;

        return 0;
}
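
/* Note on the FIN-stripping test in tcp_retransmit_skb(): end_seq counts the
 * FIN, so snd_una == end_seq - 1 means every data byte in the skb has
 * already been acknowledged and only the FIN remains outstanding; the skb is
 * therefore trimmed to a bare, zero-length FIN before being retransmitted.
 */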
/* This gets called after a retransmit timeout, and the initially
 * retransmitted data is acknowledged.  It tries to continue
 * resending the rest of the retransmit queue, until either
 * we've sent it all or the congestion window limit is reached.
 * If doing SACK, the first ACK which comes back for a timeout
 * based retransmit packet might feed us FACK information again.
 * If so, we use it to avoid unnecessary retransmissions.
 */
void tcp_xmit_retransmit_queue(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        struct sk_buff *skb;

        if (tp->retrans_head == NULL &&
            tp->rexmt_done == 0)
                tp->retrans_head = skb_peek(&sk->write_queue);
        if (tp->retrans_head == tp->send_head)
                tp->retrans_head = NULL;

        /* Each time, advance the retrans_head if we got
         * a packet out or we skipped one because it was
         * SACK'd.  -DaveM
         */
        while ((skb = tp->retrans_head) != NULL) {
                /* If it has been ack'd by a SACK block, we don't
                 * retransmit it.
                 */
                if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
                        /* Send it out, punt if error occurred. */
                        if (tcp_retransmit_skb(sk, skb))
                                break;

                        update_retrans_head(sk);

                        /* Stop retransmitting if we've hit the congestion
                         * window limit.
                         */
                        if (tp->retrans_out >= tp->snd_cwnd)
                                break;
                } else {
                        update_retrans_head(sk);
                }
        }
}
/* Using FACK information, retransmit all missing frames at the receiver
 * up to the forward most SACK'd packet (tp->fackets_out) if the packet
 * has not been retransmitted already.
 */
void tcp_fack_retransmit(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        struct sk_buff *skb = skb_peek(&sk->write_queue);
        int packet_cnt = 0;

        while ((skb != NULL) &&
               (skb != tp->send_head) &&
               (skb != (struct sk_buff *)&sk->write_queue)) {
                __u8 sacked = TCP_SKB_CB(skb)->sacked;

                if (sacked & (TCPCB_SACKED_ACKED | TCPCB_SACKED_RETRANS))
                        goto next_packet;

                /* Ok, retransmit it. */
                if (tcp_retransmit_skb(sk, skb))
                        break;

                if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
                        break;
        next_packet:
                packet_cnt++;
                if (packet_cnt >= tp->fackets_out)
                        break;
                skb = skb->next;
        }
}
/* Send a fin.  The caller locks the socket for us.  This cannot be
 * allowed to fail queueing a FIN frame under any circumstances.
 */
void tcp_send_fin(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        struct sk_buff *skb = skb_peek_tail(&sk->write_queue);
        unsigned int mss_now;

        /* Optimization, tack on the FIN if we have a queue of
         * unsent frames.  But be careful about outgoing SACKS
         * and IP options.
         */
        mss_now = tcp_current_mss(sk);

        if ((tp->send_head != NULL) && (skb->len < mss_now)) {
                /* tcp_write_xmit() takes care of the rest. */
                TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
                TCP_SKB_CB(skb)->end_seq++;
                tp->write_seq++;

                /* Special case to avoid Nagle bogosity.  If this
                 * segment is the last segment, and it was queued
                 * due to Nagle/SWS-avoidance, send it out now.
                 */
                if (tp->send_head == skb &&
                    !sk->nonagle &&
                    skb->len < (tp->rcv_mss >> 1) &&
                    tp->packets_out &&
                    !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_URG)) {
                        update_send_head(sk);
                        TCP_SKB_CB(skb)->when = tcp_time_stamp;
                        tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
                        tp->packets_out++;
                        tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
                        if (!tcp_timer_is_set(sk, TIME_RETRANS))
                                tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
                }
        } else {
                /* Socket is locked, keep trying until memory is available. */
                do {
                        skb = sock_wmalloc(sk,
                                           (MAX_HEADER +
                                            sk->prot->max_header),
                                           1, GFP_KERNEL);
                } while (skb == NULL);

                /* Reserve space for headers and prepare control bits. */
                skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
                skb->csum = 0;
                TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
                TCP_SKB_CB(skb)->sacked = 0;
                TCP_SKB_CB(skb)->urg_ptr = 0;

                /* FIN eats a sequence byte, write_seq advanced by tcp_send_skb(). */
                TCP_SKB_CB(skb)->seq = tp->write_seq;
                TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
                tcp_send_skb(sk, skb, 0);
        }
}
/* We get here when a process closes a file descriptor (either due to
 * an explicit close() or as a byproduct of exit()'ing) and there
 * was unread data in the receive queue.  This behavior is recommended
 * by draft-ietf-tcpimpl-prob-03.txt section 3.10.  -DaveM
 */
void tcp_send_active_reset(struct sock *sk, int priority)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        struct sk_buff *skb;

        /* NOTE: No TCP options attached and we never retransmit this. */
        skb = alloc_skb(MAX_HEADER + sk->prot->max_header, priority);
        if (!skb)
                return;

        /* Reserve space for headers and prepare control bits. */
        skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
        skb->csum = 0;
        TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
        TCP_SKB_CB(skb)->sacked = 0;
        TCP_SKB_CB(skb)->urg_ptr = 0;

        /* Send it off. */
        TCP_SKB_CB(skb)->seq = tp->write_seq;
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
        TCP_SKB_CB(skb)->when = tcp_time_stamp;
        tcp_transmit_skb(sk, skb);
}
/* WARNING: This routine must only be called when we have already sent
 * a SYN packet that crossed the incoming SYN that caused this routine
 * to get called.  If this assumption fails then the initial rcv_wnd
 * and rcv_wscale values will not be correct.
 */
int tcp_send_synack(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        struct sk_buff *skb;

        skb = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header),
                           1, GFP_ATOMIC);
        if (skb == NULL)
                return -ENOMEM;

        /* Reserve space for headers and prepare control bits. */
        skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
        skb->csum = 0;
        TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_SYN);
        TCP_SKB_CB(skb)->sacked = 0;
        TCP_SKB_CB(skb)->urg_ptr = 0;

        /* SYN eats a sequence byte. */
        TCP_SKB_CB(skb)->seq = tp->snd_una;
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
        __skb_queue_tail(&sk->write_queue, skb);
        TCP_SKB_CB(skb)->when = tcp_time_stamp;
        tp->packets_out++;
        tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
        return 0;
}
/*
 * Prepare a SYN-ACK.
 */
struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
                                 struct open_request *req)
{
        struct tcphdr *th;
        int tcp_header_size;
        struct sk_buff *skb;

        skb = sock_wmalloc(sk, MAX_HEADER + sk->prot->max_header, 1, GFP_ATOMIC);
        if (skb == NULL)
                return NULL;

        /* Reserve space for headers. */
        skb_reserve(skb, MAX_HEADER + sk->prot->max_header);

        skb->dst = dst_clone(dst);

        tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
                           (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
                           (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
                           /* SACK_PERM is in the place of NOP NOP of TS */
                           ((req->sack_ok && !req->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
        skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size);

        memset(th, 0, sizeof(struct tcphdr));
        th->syn = 1;
        th->ack = 1;
        th->source = sk->sport;
        th->dest = req->rmt_port;
        TCP_SKB_CB(skb)->seq = req->snt_isn;
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
        th->seq = htonl(TCP_SKB_CB(skb)->seq);
        th->ack_seq = htonl(req->rcv_isn + 1);
        if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
                __u8 rcv_wscale;
                /* Set this up on the first call only */
                req->window_clamp = skb->dst->window;
                /* tcp_full_space because it is guaranteed to be the first packet */
                tcp_select_initial_window(tcp_full_space(sk),
                        dst->advmss - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
                        &req->rcv_wnd,
                        &req->window_clamp,
                        req->wscale_ok,
                        &rcv_wscale);
                req->rcv_wscale = rcv_wscale;
        }

        /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
        th->window = htons(req->rcv_wnd);

        TCP_SKB_CB(skb)->when = tcp_time_stamp;
        tcp_syn_build_options((__u32 *)(th + 1), dst->advmss, req->tstamp_ok,
                              req->sack_ok, req->wscale_ok, req->rcv_wscale,
                              TCP_SKB_CB(skb)->when,
                              req->ts_recent);

        skb->csum = 0;
        th->doff = (tcp_header_size >> 2);
        tcp_statistics.TcpOutSegs++;
        return skb;
}
int tcp_connect(struct sock *sk, struct sk_buff *buff)
{
        struct dst_entry *dst = __sk_dst_get(sk);
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

        /* Reserve space for headers. */
        skb_reserve(buff, MAX_HEADER + sk->prot->max_header);

        /* We'll fix this up when we get a response from the other end.
         * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
         */
        tp->tcp_header_len = sizeof(struct tcphdr) +
                (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);

        /* If user gave his TCP_MAXSEG, record it to clamp */
        if (tp->user_mss)
                tp->mss_clamp = tp->user_mss;
        tcp_sync_mss(sk, dst->pmtu);

        tp->window_clamp = dst->window;

        tcp_select_initial_window(tcp_full_space(sk),
                dst->advmss - (tp->tcp_header_len - sizeof(struct tcphdr)),
                &tp->rcv_wnd,
                &tp->window_clamp,
                sysctl_tcp_window_scaling,
                &tp->rcv_wscale);

        /* Socket identity change complete, no longer
         * in TCP_CLOSE, so enter ourselves into the
         * hash tables.
         */
        tcp_set_state(sk, TCP_SYN_SENT);
        if (tp->af_specific->hash_connecting(sk))
                goto err_out;

        sk->err = 0;
        tp->snd_wnd = 0;
        tp->snd_wl1 = 0;
        tp->snd_wl2 = tp->write_seq;
        tp->snd_una = tp->write_seq;
        tp->rcv_nxt = 0;
        tp->rcv_wup = 0;
        tp->copied_seq = 0;

        tp->rto = TCP_TIMEOUT_INIT;
        tcp_init_xmit_timers(sk);
        tp->retransmits = 0;
        tp->fackets_out = 0;
        tp->retrans_out = 0;

        TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
        TCP_SKB_CB(buff)->sacked = 0;
        TCP_SKB_CB(buff)->urg_ptr = 0;
        buff->csum = 0;
        TCP_SKB_CB(buff)->seq = tp->write_seq++;
        TCP_SKB_CB(buff)->end_seq = tp->write_seq;
        tp->snd_nxt = tp->write_seq;

        /* Send it off. */
        TCP_SKB_CB(buff)->when = tcp_time_stamp;
        __skb_queue_tail(&sk->write_queue, buff);
        tp->packets_out++;
        tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
        tcp_statistics.TcpActiveOpens++;

        /* Timer for repeating the SYN until an answer. */
        tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
        return 0;

err_out:
        tcp_set_state(sk, TCP_CLOSE);
        kfree_skb(buff);
        return -EADDRNOTAVAIL;
}
/* Send out a delayed ack, the caller does the policy checking
 * to see if we should even be here.  See tcp_input.c:tcp_ack_snd_check()
 * for details.
 */
void tcp_send_delayed_ack(struct sock *sk, int max_timeout)
{
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
        unsigned long timeout;

        /* Stay within the limit we were given */
        timeout = tp->ato;
        if (timeout > max_timeout)
                timeout = max_timeout;
        timeout += jiffies;

        /* Use new timeout only if there wasn't an older one earlier. */
        spin_lock_bh(&sk->timer_lock);
        if (!tp->delack_timer.prev || !del_timer(&tp->delack_timer)) {
                sock_hold(sk);
                tp->delack_timer.expires = timeout;
        } else {
                if (time_before(timeout, tp->delack_timer.expires))
                        tp->delack_timer.expires = timeout;
        }
        add_timer(&tp->delack_timer);
        spin_unlock_bh(&sk->timer_lock);
}
/* This routine sends an ack and also updates the window. */
void tcp_send_ack(struct sock *sk)
{
        /* If we have been reset, we may not send again. */
        if (!sk->zapped) {
                struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
                struct sk_buff *buff;

                /* We are not putting this on the write queue, so
                 * tcp_transmit_skb() will set the ownership to this
                 * sock.
                 */
                buff = alloc_skb(MAX_HEADER + sk->prot->max_header, GFP_ATOMIC);
                if (buff == NULL) {
                        /* Force it to send an ack.  We don't have to do this
                         * (ACK is unreliable) but it's much better use of
                         * bandwidth on slow links to send a spare ack than
                         * resend packets.
                         *
                         * This is the one possible way that we can delay an
                         * ACK and have tp->ato indicate that we are in
                         * quick ack mode, so clear it.
                         */
                        if (tcp_in_quickack_mode(tp))
                                tcp_exit_quickack_mode(tp);
                        tcp_send_delayed_ack(sk, HZ/2);
                        return;
                }

                /* Reserve space for headers and prepare control bits. */
                skb_reserve(buff, MAX_HEADER + sk->prot->max_header);
                buff->csum = 0;
                TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
                TCP_SKB_CB(buff)->sacked = 0;
                TCP_SKB_CB(buff)->urg_ptr = 0;

                /* Send it off, this clears delayed acks for us. */
                TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tp->snd_nxt;
                TCP_SKB_CB(buff)->when = tcp_time_stamp;
                tcp_transmit_skb(sk, buff);
        }
}
/* This routine sends a packet with an out of date sequence
 * number.  It assumes the other end will try to ack it.
 */
void tcp_write_wakeup(struct sock *sk)
{
        /* After a valid reset we can send no more. */
        if (!sk->zapped) {
                struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
                struct sk_buff *skb;

                /* Write data can still be transmitted/retransmitted in the
                 * following states.  If any other state is encountered, return.
                 * [listen/close will never occur here anyway]
                 */
                if ((1 << sk->state) &
                    ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1|
                      TCPF_FIN_WAIT2|TCPF_LAST_ACK|TCPF_CLOSING))
                        return;

                if (before(tp->snd_nxt, tp->snd_una + tp->snd_wnd) &&
                    ((skb = tp->send_head) != NULL)) {
                        unsigned long win_size;

                        /* We are probing the opening of a window
                         * but the window size is != 0; this must have
                         * been the result of SWS avoidance (sender side).
                         */
                        win_size = tp->snd_wnd - (tp->snd_nxt - tp->snd_una);
                        if (win_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq) {
                                if (tcp_fragment(sk, skb, win_size))
                                        return; /* Let a retransmit get it. */
                        }

                        update_send_head(sk);
                        TCP_SKB_CB(skb)->when = tcp_time_stamp;
                        tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
                        tp->packets_out++;
                        tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
                        if (!tcp_timer_is_set(sk, TIME_RETRANS))
                                tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
                } else {
                        /* We don't queue it, tcp_transmit_skb() sets ownership. */
                        skb = alloc_skb(MAX_HEADER + sk->prot->max_header,
                                        GFP_ATOMIC);
                        if (skb == NULL)
                                return;

                        /* Reserve space for headers and set control bits. */
                        skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
                        skb->csum = 0;
                        TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
                        TCP_SKB_CB(skb)->sacked = 0;
                        TCP_SKB_CB(skb)->urg_ptr = 0;

                        /* Use a previous sequence.  This should cause the other
                         * end to send an ack.  Don't queue or clone SKB, just
                         * send it.
                         */
                        TCP_SKB_CB(skb)->seq = tp->snd_nxt - 1;
                        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
                        TCP_SKB_CB(skb)->when = tcp_time_stamp;
                        tcp_transmit_skb(sk, skb);
                }
        }
}
/* A window probe timeout has occurred.  If window is not closed send
 * a partial packet else a zero probe.
 */
void tcp_send_probe0(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

        tcp_write_wakeup(sk);
        tp->pending = TIME_PROBE0;
        tp->backoff++;
        tp->probes_out++;
        tcp_reset_xmit_timer(sk, TIME_PROBE0,
                             min(tp->rto << tp->backoff, 120*HZ));
}
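
/* tcp_send_probe0() backoff example: with rto = 3*HZ the successive probe
 * intervals are 6*HZ, 12*HZ, 24*HZ, ..., i.e. rto << backoff, and they are
 * capped at 120*HZ (two minutes) once the shifted value would exceed it.
 */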