/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_output.c,v 1.110 1999/05/27 00:37:45 davem Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */

/*
 * Changes:	Pedro Roque	:	Retransmit queue handled by TCP.
 *				:	Fragmentation on mtu decrease
 *				:	Segment collapse on retransmit
 *				:	AF independence
 *
 *		Linus Torvalds	:	send_delayed_ack
 *		David S. Miller	:	Charge memory using the right skb
 *					during syn/ack processing.
 *		David S. Miller :	Output engine completely rewritten.
 *		Andrea Arcangeli:	SYNACK carry ts_recent in tsecr.
 */

#include <net/tcp.h>

#include <linux/smp_lock.h>

extern int sysctl_tcp_timestamps;
extern int sysctl_tcp_window_scaling;
extern int sysctl_tcp_sack;

/* People can turn this off for buggy TCP's found in printers etc. */
int sysctl_tcp_retrans_collapse = 1;

/* Get rid of any delayed acks, we sent one already.. */
static __inline__ void clear_delayed_acks(struct sock * sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	tp->delayed_acks = 0;
	if(tcp_in_quickack_mode(tp))
		tcp_exit_quickack_mode(tp);
	tcp_clear_xmit_timer(sk, TIME_DACK);
}

static __inline__ void update_send_head(struct sock *sk)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

	tp->send_head = tp->send_head->next;
	if (tp->send_head == (struct sk_buff *) &sk->write_queue)
		tp->send_head = NULL;
}

/* This routine actually transmits TCP packets queued in by
 * tcp_do_sendmsg().  This is used by both the initial
 * transmission and possible later retransmissions.
 * All SKB's seen here are completely headerless.  It is our
 * job to build the TCP header, and pass the packet down to
 * IP so it can do the same plus pass the packet off to the
 * device.
 *
 * We are working here with either a clone of the original
 * SKB, or a fresh unique copy made by the retransmit engine.
 */
void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
{
	if(skb != NULL) {
		struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
		int tcp_header_size = tp->tcp_header_len;
		struct tcphdr *th;
		int sysctl_flags;

#define SYSCTL_FLAG_TSTAMPS	0x1
#define SYSCTL_FLAG_WSCALE	0x2
#define SYSCTL_FLAG_SACK	0x4

		sysctl_flags = 0;
		if(tcb->flags & TCPCB_FLAG_SYN) {
			tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
			if(sysctl_tcp_timestamps) {
				tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
				sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
			}
			if(sysctl_tcp_window_scaling) {
				tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
				sysctl_flags |= SYSCTL_FLAG_WSCALE;
			}
			if(sysctl_tcp_sack) {
				sysctl_flags |= SYSCTL_FLAG_SACK;
				if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
					tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
			}
		} else if(tp->sack_ok && tp->num_sacks) {
			/* A SACK is 2 pad bytes, a 2 byte header, plus
			 * 2 32-bit sequence numbers for each SACK block.
			 */
			tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
					    (tp->num_sacks * TCPOLEN_SACK_PERBLOCK));
		}
		th = (struct tcphdr *) skb_push(skb, tcp_header_size);
		skb->h.th = th;
		skb_set_owner_w(skb, sk);

		/* Build TCP header and checksum it. */
		th->source = sk->sport;
		th->dest = sk->dport;
		th->seq = htonl(TCP_SKB_CB(skb)->seq);
		th->ack_seq = htonl(tp->rcv_nxt);
		th->doff = (tcp_header_size >> 2);
		th->res1 = 0;
		*(((__u8 *)th) + 13) = tcb->flags;
		if(!(tcb->flags & TCPCB_FLAG_SYN))
			th->window = htons(tcp_select_window(sk));
		th->check = 0;
		th->urg_ptr = ntohs(tcb->urg_ptr);
		if(tcb->flags & TCPCB_FLAG_SYN) {
			/* RFC1323: The window in SYN & SYN/ACK segments
			 * is never scaled.
			 */
			th->window = htons(tp->rcv_wnd);
			tcp_syn_build_options((__u32 *)(th + 1), tp->mss_clamp,
					      (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
					      (sysctl_flags & SYSCTL_FLAG_SACK),
					      (sysctl_flags & SYSCTL_FLAG_WSCALE),
					      tp->rcv_wscale,
					      TCP_SKB_CB(skb)->when,
					      tp->ts_recent);
		} else {
			tcp_build_and_update_options((__u32 *)(th + 1),
						     tp, TCP_SKB_CB(skb)->when);
		}
		tp->af_specific->send_check(sk, th, skb->len, skb);

		clear_delayed_acks(sk);
		tp->last_ack_sent = tp->rcv_nxt;
		tcp_statistics.TcpOutSegs++;
		tp->af_specific->queue_xmit(skb);
	}
#undef SYSCTL_FLAG_TSTAMPS
#undef SYSCTL_FLAG_WSCALE
#undef SYSCTL_FLAG_SACK
}

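/* A worked example of the SYN header sizing above, assuming the usual
 * option lengths from net/tcp.h (TCPOLEN_MSS = 4, TCPOLEN_TSTAMP_ALIGNED = 12,
 * TCPOLEN_WSCALE_ALIGNED = 4, TCPOLEN_SACKPERM_ALIGNED = 4).  For a SYN with
 * timestamps, window scaling and SACK all enabled:
 *
 *	tcp_header_size = 20 (struct tcphdr) + 4 (MSS)
 *			+ 12 (timestamps, which also carry SACK-permitted)
 *			+  4 (window scale)
 *			= 40 bytes, so th->doff = 40 >> 2 = 10.
 *
 * With timestamps off but SACK on, the 12-byte timestamp block is replaced
 * by the 4-byte aligned SACK-permitted option: 20 + 4 + 4 + 4 = 32 bytes.
 */
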
/* This is the main buffer sending routine. We queue the buffer
 * and decide whether to queue or transmit now.
 */
void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	/* Advance write_seq and place onto the write_queue. */
	tp->write_seq += (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq);
	__skb_queue_tail(&sk->write_queue, skb);

	if (!force_queue && tp->send_head == NULL && tcp_snd_test(sk, skb)) {
		/* Send it out now. */
		TCP_SKB_CB(skb)->when = tcp_time_stamp;
		tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
		tp->packets_out++;
		tcp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL));
		if(!tcp_timer_is_set(sk, TIME_RETRANS))
			tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
	} else {
		/* Queue it, remembering where we must start sending. */
		if (tp->send_head == NULL)
			tp->send_head = skb;
		if (!force_queue && tp->packets_out == 0 && !tp->pending) {
			tp->pending = TIME_PROBE0;
			tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto);
		}
	}
}

/* Function to create two new TCP segments.  Shrinks the given segment
 * to the specified size and appends a new segment with the rest of the
 * packet to the list.  This won't be called frequently, I hope.
 * Remember, these are still headerless SKBs at this point.
 */
static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
{
	struct sk_buff *buff;
	int nsize = skb->len - len;
	u16 flags;

	/* Get a new skb... force flag on. */
	buff = sock_wmalloc(sk,
			    (nsize + MAX_HEADER + sk->prot->max_header),
			    1, GFP_ATOMIC);
	if (buff == NULL)
		return -1; /* We'll just try again later. */

	/* Reserve space for headers. */
	skb_reserve(buff, MAX_HEADER + sk->prot->max_header);

	/* Correct the sequence numbers. */
	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;

	/* PSH and FIN should only be set in the second packet. */
	flags = TCP_SKB_CB(skb)->flags;
	TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH);
	if(flags & TCPCB_FLAG_URG) {
		u16 old_urg_ptr = TCP_SKB_CB(skb)->urg_ptr;

		/* Urgent data is always a pain in the ass. */
		if(old_urg_ptr > len) {
			TCP_SKB_CB(skb)->flags &= ~(TCPCB_FLAG_URG);
			TCP_SKB_CB(skb)->urg_ptr = 0;
			TCP_SKB_CB(buff)->urg_ptr = old_urg_ptr - len;
		} else {
			flags &= ~(TCPCB_FLAG_URG);
		}
	}
	if(!(flags & TCPCB_FLAG_URG))
		TCP_SKB_CB(buff)->urg_ptr = 0;
	TCP_SKB_CB(buff)->flags = flags;
	TCP_SKB_CB(buff)->sacked = 0;

	/* Copy and checksum data tail into the new buffer. */
	buff->csum = csum_partial_copy(skb->data + len, skb_put(buff, nsize),
				       nsize, 0);

	/* This takes care of the FIN sequence number too. */
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
	skb_trim(skb, len);

	/* Rechecksum original buffer. */
	skb->csum = csum_partial(skb->data, skb->len, 0);

	/* Looks stupid, but our code really uses the `when' field of
	 * skbs which it has never sent before. --ANK
	 */
	TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;

	/* Link BUFF into the send queue. */
	__skb_append(skb, buff);

	return 0;
}

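/* A small worked example of the split above (illustrative numbers only):
 * take an skb with seq = 1000, end_seq = 2400 and PSH set, fragmented at
 * len = 536.  Afterwards:
 *
 *	skb:  seq = 1000, end_seq = 1536, 536 bytes, PSH/FIN cleared
 *	buff: seq = 1536, end_seq = 2400, 864 bytes, inherits PSH/FIN
 *
 * If URG was set with urg_ptr = 100 (inside the first 536 bytes), URG and
 * the pointer stay on skb; had urg_ptr pointed past 536 (say 700), URG
 * would migrate to buff with urg_ptr reduced by len, i.e. 164.
 */
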
/* This function synchronizes the send mss to the current pmtu/exthdr set.

   tp->user_mss is the mss set by the user via TCP_MAXSEG. It does NOT
   account for TCP options; it covers only the bare TCP header.

   tp->mss_clamp is the mss negotiated at connection setup.
   It is the minimum of user_mss and the mss received with the SYN.
   It also does not include TCP options.

   tp->pmtu_cookie is the last pmtu seen by this function.

   tp->mss_cache is the current effective sending mss, including
   all tcp options except for SACKs. It is evaluated taking the
   current pmtu into account, but never exceeds tp->mss_clamp.

   NOTE1. rfc1122 clearly states that the advertised MSS
   DOES NOT include either tcp or ip options.

   NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
   this function.			--ANK (980731)
 */

int tcp_sync_mss(struct sock *sk, u32 pmtu)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	int mss_now;

	/* Calculate base mss without TCP options:
	   It is MMS_S - sizeof(tcphdr) of rfc1122.
	 */
	mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);

	/* Clamp it (mss_clamp does not include tcp options) */
	if (mss_now > tp->mss_clamp)
		mss_now = tp->mss_clamp;

	/* Now subtract TCP options size, not including SACKs */
	mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);

	/* Now subtract optional transport overhead */
	mss_now -= tp->ext_header_len;

	/* If we got a too small (or even negative) value,
	   clamp it to 8 from below. Why 8?
	   Well, it could just as well be 1,
	   but if IP accepted a segment of length 1,
	   it would love 8 even more 8)		--ANK (980731)
	 */
	if (mss_now < 8)
		mss_now = 8;

	/* And store cached results */
	tp->pmtu_cookie = pmtu;
	tp->mss_cache = mss_now;
	return mss_now;
}

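/* A worked example of the computation above, assuming IPv4
 * (net_header_len = 20), no extension headers, and timestamps enabled
 * (tcp_header_len = 20 + 12 = 32):
 *
 *	pmtu = 1500
 *	mss_now = 1500 - 20 - 20              = 1460  (base mss)
 *	clamp against tp->mss_clamp (say 1460)       -> 1460
 *	minus option bytes: 32 - 20 = 12             -> 1448
 *	minus ext_header_len (0)                     -> 1448
 *
 * so tp->mss_cache ends up at 1448 while the advertised MSS stays 1460,
 * consistent with NOTE1 above (the advertised MSS excludes options).
 */
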
/* This routine writes packets to the network.  It advances the
 * send_head.  This happens as incoming acks open up the remote
 * window for us.
 */
void tcp_write_xmit(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	unsigned int mss_now;

	/* Account for SACKS, we may need to fragment due to this.
	 * It is just like the real MSS changing on us midstream.
	 * We also handle things correctly when the user adds some
	 * IP options mid-stream.  Silly to do, but cover it.
	 */
	mss_now = tcp_current_mss(sk);

	/* If we are zapped, the bytes will have to remain here.
	 * In time closedown will empty the write queue and all
	 * will be happy.
	 */
	if(!sk->zapped) {
		struct sk_buff *skb;
		int sent_pkts = 0;

		/* Anything on the transmit queue that fits the window can
		 * be added providing we are:
		 *
		 * a) following SWS avoidance [and Nagle algorithm]
		 * b) not exceeding our congestion window.
		 * c) not retransmitting [Nagle]
		 */
		while((skb = tp->send_head) && tcp_snd_test(sk, skb)) {
			if (skb->len > mss_now) {
				if (tcp_fragment(sk, skb, mss_now))
					break;
			}

			/* Advance the send_head.  This one is going out. */
			update_send_head(sk);
			TCP_SKB_CB(skb)->when = tcp_time_stamp;
			tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
			tp->packets_out++;
			tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
			sent_pkts = 1;
		}

		/* If we sent anything, make sure the retransmit
		 * timer is active.
		 */
		if (sent_pkts && !tcp_timer_is_set(sk, TIME_RETRANS))
			tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
	}
}

/* This function returns the amount that we can raise the
 * usable window based on the following constraints
 *
 * 1. The window can never be shrunk once it is offered (RFC 793)
 * 2. We limit memory per socket
 *
 * RFC 1122:
 * "the suggested [SWS] avoidance algorithm for the receiver is to keep
 *  RECV.NEXT + RCV.WIN fixed until:
 *  RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
 *
 * i.e. don't raise the right edge of the window until you can raise
 * it at least MSS bytes.
 *
 * Unfortunately, the recommended algorithm breaks header prediction,
 * since header prediction assumes th->window stays fixed.
 *
 * Strictly speaking, keeping th->window fixed violates the receiver
 * side SWS prevention criteria.  The problem is that under this rule
 * a stream of single byte packets will cause the right side of the
 * window to always advance by a single byte.
 *
 * Of course, if the sender implements sender side SWS prevention
 * then this will not be a problem.
 *
 * BSD seems to make the following compromise:
 *
 *	If the free space is less than the 1/4 of the maximum
 *	space available and the free space is less than 1/2 mss,
 *	then set the window to 0.
 *	Otherwise, just prevent the window from shrinking
 *	and from being larger than the largest representable value.
 *
 * This prevents incremental opening of the window in the regime
 * where TCP is limited by the speed of the reader side taking
 * data out of the TCP receive queue. It does nothing about
 * those cases where the window is constrained on the sender side
 * because the pipeline is full.
 *
 * BSD also seems to "accidentally" limit itself to windows that are a
 * multiple of MSS, at least until the free space gets quite small.
 * This would appear to be a side effect of the mbuf implementation.
 * Combining these two algorithms results in the observed behavior
 * of having a fixed window size at almost all times.
 *
 * Below we obtain similar behavior by forcing the offered window to
 * a multiple of the mss when it is feasible to do so.
 *
 * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
 */
u32 __tcp_select_window(struct sock *sk)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	unsigned int mss = tp->mss_cache;
	int free_space;
	u32 window;

	/* Sometimes free_space can be < 0. */
	free_space = (sk->rcvbuf - atomic_read(&sk->rmem_alloc)) / 2;
	if (tp->window_clamp) {
		if (free_space > ((int) tp->window_clamp))
			free_space = tp->window_clamp;
		mss = min(tp->window_clamp, mss);
	} else {
		printk("tcp_select_window: tp->window_clamp == 0.\n");
	}

	if (mss < 1) {
		mss = 1;
		printk("tcp_select_window: sk->mss fell to 0.\n");
	}

	if ((free_space < (sk->rcvbuf/4)) && (free_space < ((int) (mss/2)))) {
		window = 0;
		tp->pred_flags = 0;
	} else {
		/* Get the largest window that is a nice multiple of mss.
		 * Window clamp already applied above.
		 * If our current window offering is within 1 mss of the
		 * free space we just keep it. This prevents the divide
		 * and multiply from happening most of the time.
		 * We also don't do any window rounding when the free space
		 * is too small.
		 */
		window = tp->rcv_wnd;
		if ((((int) window) <= (free_space - ((int) mss))) ||
		    (((int) window) > free_space))
			window = (((unsigned int) free_space)/mss)*mss;
	}
	return window;
}

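/* A worked example of the rounding above (illustrative numbers only,
 * assuming tp->window_clamp is large enough not to clip free_space):
 * with sk->rcvbuf = 65536 and 16384 bytes sitting in rmem_alloc,
 * free_space = (65536 - 16384) / 2 = 24576.  Take mss = 1448 and a
 * current offer of tp->rcv_wnd = 26064.  The zero-window test fails
 * (24576 is not below rcvbuf/4 = 16384), and since the current offer
 * exceeds free_space it is recomputed as
 *
 *	window = (24576 / 1448) * 1448 = 16 * 1448 = 23168
 *
 * i.e. the largest multiple of mss that fits in the free space, which
 * is what keeps the advertised window sitting on an mss boundary.
 */
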
/* Attempt to collapse two adjacent SKB's during retransmission. */
static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now)
{
	struct sk_buff *next_skb = skb->next;

	/* The first test we must make is that neither of these two
	 * SKB's are still referenced by someone else.
	 */
	if(!skb_cloned(skb) && !skb_cloned(next_skb)) {
		int skb_size = skb->len, next_skb_size = next_skb->len;
		u16 flags = TCP_SKB_CB(skb)->flags;

		/* Punt if the first SKB has URG set. */
		if(flags & TCPCB_FLAG_URG)
			return;

		/* Also punt if next skb has been SACK'd. */
		if(TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
			return;

		/* Punt if not enough space exists in the first SKB for
		 * the data in the second, or the total combined payload
		 * would exceed the MSS.
		 */
		if ((next_skb_size > skb_tailroom(skb)) ||
		    ((skb_size + next_skb_size) > mss_now))
			return;

		/* Ok.  We will be able to collapse the packet. */
		__skb_unlink(next_skb, next_skb->list);

		if(skb->len % 4) {
			/* Must copy and rechecksum all data. */
			memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
			skb->csum = csum_partial(skb->data, skb->len, 0);
		} else {
			/* Optimize, actually we could also combine next_skb->csum
			 * to skb->csum using a single add w/carry operation too.
			 */
			skb->csum = csum_partial_copy(next_skb->data,
						      skb_put(skb, next_skb_size),
						      next_skb_size, skb->csum);
		}

		/* Update sequence range on original skb. */
		TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;

		/* Merge over control information. */
		flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
		if(flags & TCPCB_FLAG_URG) {
			u16 urgptr = TCP_SKB_CB(next_skb)->urg_ptr;
			TCP_SKB_CB(skb)->urg_ptr = urgptr + skb_size;
		}
		TCP_SKB_CB(skb)->flags = flags;

		/* All done, get rid of second SKB and account for it so
		 * packet counting does not break.
		 */
		kfree_skb(next_skb);
		sk->tp_pinfo.af_tcp.packets_out--;
	}
}

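/* For illustration: two queued 300-byte segments being retransmitted with
 * cur_mss = 1448 pass every check above, assuming neither is cloned or
 * SACK'd and the first skb has 300 bytes of tailroom (600 <= 1448), so they
 * are merged into a single 600-byte segment; end_seq, PSH/FIN (and URG with
 * a re-based urgent pointer) carry over from the second skb, and
 * packets_out drops by one.
 */
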
/* Do a simple retransmit without using the backoff mechanisms in
 * tcp_timer. This is used for path mtu discovery.
 * The socket is already locked here.
 */
void tcp_simple_retransmit(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff *skb, *old_next_skb;
	unsigned int mss = tcp_current_mss(sk);

	/* Don't muck with the congestion window here. */
	tp->dup_acks = 0;
	tp->high_seq = tp->snd_nxt;
	tp->retrans_head = NULL;

	/* Input control flow will see that this was retransmitted
	 * and not use it for RTT calculation in the absence of
	 * the timestamp option.
	 */
	for (old_next_skb = skb = skb_peek(&sk->write_queue);
	     ((skb != tp->send_head) &&
	      (skb != (struct sk_buff *)&sk->write_queue));
	     skb = skb->next) {
		int resend_skb = 0;

		/* Our goal is to push out the packets which we
		 * sent already, but are being chopped up now to
		 * account for the PMTU information we have.
		 *
		 * As we resend the queue, packets are fragmented
		 * into two pieces, and when we try to send the
		 * second piece it may be collapsed together with
		 * a subsequent packet, and so on.  -DaveM
		 */
		if (old_next_skb != skb || skb->len > mss)
			resend_skb = 1;
		old_next_skb = skb->next;
		if (resend_skb != 0)
			tcp_retransmit_skb(sk, skb);
	}
}

static __inline__ void update_retrans_head(struct sock *sk)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

	tp->retrans_head = tp->retrans_head->next;
	if((tp->retrans_head == tp->send_head) ||
	   (tp->retrans_head == (struct sk_buff *) &sk->write_queue)) {
		tp->retrans_head = NULL;
		tp->rexmt_done = 1;
	}
}

/* This retransmits one SKB.  Policy decisions and retransmit queue
 * state updates are done by the caller.  Returns non-zero if an
 * error occurred which prevented the send.
 */
int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	unsigned int cur_mss = tcp_current_mss(sk);

	if(skb->len > cur_mss) {
		if(tcp_fragment(sk, skb, cur_mss))
			return 1; /* We'll try again later. */

		/* New SKB created, account for it. */
		tp->packets_out++;
	}

	/* Collapse two adjacent packets if worthwhile and we can. */
	if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
	   (skb->len < (cur_mss >> 1)) &&
	   (skb->next != tp->send_head) &&
	   (skb->next != (struct sk_buff *)&sk->write_queue) &&
	   (sysctl_tcp_retrans_collapse != 0))
		tcp_retrans_try_collapse(sk, skb, cur_mss);

	if(tp->af_specific->rebuild_header(sk))
		return 1; /* Routing failure or similar. */

	/* Some Solaris stacks overoptimize and ignore the FIN on a
	 * retransmit when old data is attached.  So strip it off
	 * since it is cheap to do so and saves bytes on the network.
	 */
	if(skb->len > 0 &&
	   (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
	   tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
		TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
		skb_trim(skb, 0);
		skb->csum = 0;
	}

	/* Ok, we're gonna send it out, update state. */
	TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_RETRANS;
	tp->retrans_out++;

	/* Make a copy, if the first transmission SKB clone we made
	 * is still in somebody's hands, else make a clone.
	 */
	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	if(skb_cloned(skb))
		skb = skb_copy(skb, GFP_ATOMIC);
	else
		skb = skb_clone(skb, GFP_ATOMIC);

	tcp_transmit_skb(sk, skb);

	/* Update global TCP statistics and return success. */
	sk->prot->retransmits++;
	tcp_statistics.TcpRetransSegs++;

	return 0;
}

/* This gets called after a retransmit timeout, and the initially
 * retransmitted data is acknowledged.  It tries to continue
 * resending the rest of the retransmit queue, until either
 * we've sent it all or the congestion window limit is reached.
 * If doing SACK, the first ACK which comes back for a timeout
 * based retransmit packet might feed us FACK information again.
 * If so, we use it to avoid unnecessary retransmissions.
 */
void tcp_xmit_retransmit_queue(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff *skb;

	if (tp->retrans_head == NULL &&
	    tp->rexmt_done == 0)
		tp->retrans_head = skb_peek(&sk->write_queue);
	if (tp->retrans_head == tp->send_head)
		tp->retrans_head = NULL;

	/* Each time, advance the retrans_head if we got
	 * a packet out or we skipped one because it was
	 * SACK'd.  -DaveM
	 */
	while ((skb = tp->retrans_head) != NULL) {
		/* If it has been ack'd by a SACK block, we don't
		 * retransmit it.
		 */
		if(!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
			/* Send it out, punt if error occurred. */
			if(tcp_retransmit_skb(sk, skb))
				break;

			update_retrans_head(sk);

			/* Stop retransmitting if we've hit the congestion
			 * window limit.
			 */
			if (tp->retrans_out >= tp->snd_cwnd)
				break;
		} else {
			update_retrans_head(sk);
		}
	}
}

/* Using FACK information, retransmit all missing frames at the receiver
 * up to the forward most SACK'd packet (tp->fackets_out) if the packet
 * has not been retransmitted already.
 */
void tcp_fack_retransmit(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff *skb = skb_peek(&sk->write_queue);
	int packet_cnt = 0;

	while((skb != NULL) &&
	      (skb != tp->send_head) &&
	      (skb != (struct sk_buff *)&sk->write_queue)) {
		__u8 sacked = TCP_SKB_CB(skb)->sacked;

		if(sacked & (TCPCB_SACKED_ACKED | TCPCB_SACKED_RETRANS))
			goto next_packet;

		/* Ok, retransmit it. */
		if(tcp_retransmit_skb(sk, skb))
			break;

		if(tcp_packets_in_flight(tp) >= tp->snd_cwnd)
			break;
next_packet:
		packet_cnt++;
		if(packet_cnt >= tp->fackets_out)
			break;
		skb = skb->next;
	}
}

/* Send a fin.  The caller locks the socket for us.  This cannot be
 * allowed to fail queueing a FIN frame under any circumstances.
 */
void tcp_send_fin(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff *skb = skb_peek_tail(&sk->write_queue);
	unsigned int mss_now;

	/* Optimization, tack on the FIN if we have a queue of
	 * unsent frames.  But be careful about outgoing SACKS
	 * and IP options.
	 */
	mss_now = tcp_current_mss(sk);

	if((tp->send_head != NULL) && (skb->len < mss_now)) {
		/* tcp_write_xmit() takes care of the rest. */
		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
		TCP_SKB_CB(skb)->end_seq++;
		tp->write_seq++;

		/* Special case to avoid Nagle bogosity.  If this
		 * segment is the last segment, and it was queued
		 * due to Nagle/SWS-avoidance, send it out now.
		 */
		if(tp->send_head == skb &&
		   !sk->nonagle &&
		   skb->len < (tp->mss_cache >> 1) &&
		   tp->packets_out &&
		   !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_URG)) {
			update_send_head(sk);
			TCP_SKB_CB(skb)->when = tcp_time_stamp;
			tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
			tp->packets_out++;
			tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
			if(!tcp_timer_is_set(sk, TIME_RETRANS))
				tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
		}
	} else {
		/* Socket is locked, keep trying until memory is available. */
		do {
			skb = sock_wmalloc(sk,
					   (MAX_HEADER +
					    sk->prot->max_header),
					   1, GFP_KERNEL);
		} while (skb == NULL);

		/* Reserve space for headers and prepare control bits. */
		skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
		skb->csum = 0;
		TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
		TCP_SKB_CB(skb)->sacked = 0;
		TCP_SKB_CB(skb)->urg_ptr = 0;

		/* FIN eats a sequence byte, write_seq advanced by tcp_send_skb(). */
		TCP_SKB_CB(skb)->seq = tp->write_seq;
		TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
		tcp_send_skb(sk, skb, 0);
	}
}

/* We get here when a process closes a file descriptor (either due to
 * an explicit close() or as a byproduct of exit()'ing) and there
 * was unread data in the receive queue.  This behavior is recommended
 * by draft-ietf-tcpimpl-prob-03.txt section 3.10.  -DaveM
 */
void tcp_send_active_reset(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff *skb;

	/* NOTE: No TCP options attached and we never retransmit this. */
	skb = alloc_skb(MAX_HEADER + sk->prot->max_header, GFP_KERNEL);
	if (!skb)
		return;

	/* Reserve space for headers and prepare control bits. */
	skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
	skb->csum = 0;
	TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
	TCP_SKB_CB(skb)->sacked = 0;
	TCP_SKB_CB(skb)->urg_ptr = 0;

	/* Send it off. */
	TCP_SKB_CB(skb)->seq = tp->write_seq;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	tcp_transmit_skb(sk, skb);
}

/* WARNING: This routine must only be called when we have already sent
 * a SYN packet that crossed the incoming SYN that caused this routine
 * to get called. If this assumption fails then the initial rcv_wnd
 * and rcv_wscale values will not be correct.
 */
int tcp_send_synack(struct sock *sk)
{
	struct tcp_opt* tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff* skb;

	skb = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header),
			   1, GFP_ATOMIC);
	if (skb == NULL)
		return -ENOMEM;

	/* Reserve space for headers and prepare control bits. */
	skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
	skb->csum = 0;
	TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_SYN);
	TCP_SKB_CB(skb)->sacked = 0;
	TCP_SKB_CB(skb)->urg_ptr = 0;

	/* SYN eats a sequence byte. */
	TCP_SKB_CB(skb)->seq = tp->snd_una;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
	__skb_queue_tail(&sk->write_queue, skb);
	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	tp->packets_out++;
	tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
	return 0;
}

/*
 * Prepare a SYN-ACK.
 */
struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
				 struct open_request *req, int mss)
{
	struct tcphdr *th;
	int tcp_header_size;
	struct sk_buff *skb;

	skb = sock_wmalloc(sk, MAX_HEADER + sk->prot->max_header, 1, GFP_ATOMIC);
	if (skb == NULL)
		return NULL;

	/* Reserve space for headers. */
	skb_reserve(skb, MAX_HEADER + sk->prot->max_header);

	skb->dst = dst_clone(dst);

	/* Don't offer more than they did.
	 * This way we don't have to memorize who said what.
	 * FIXME: maybe this should be changed for better performance
	 * with syncookies.
	 */
	req->mss = min(mss, req->mss);
	if (req->mss < 8) {
		printk(KERN_DEBUG "initial req->mss below 8\n");
		req->mss = 8;
	}

	tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
			   (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
			   (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
			   /* SACK_PERM is in the place of NOP NOP of TS */
			   ((req->sack_ok && !req->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
	skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size);

	memset(th, 0, sizeof(struct tcphdr));
	th->syn = 1;
	th->ack = 1;
	th->source = sk->sport;
	th->dest = req->rmt_port;
	TCP_SKB_CB(skb)->seq = req->snt_isn;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
	th->seq = htonl(TCP_SKB_CB(skb)->seq);
	th->ack_seq = htonl(req->rcv_isn + 1);
	if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
		__u8 rcv_wscale;
		/* Set this up on the first call only */
		req->window_clamp = skb->dst->window;
		tcp_select_initial_window(sock_rspace(sk)/2,req->mss,
			&req->rcv_wnd,
			&req->window_clamp,
			req->wscale_ok,
			&rcv_wscale);
		req->rcv_wscale = rcv_wscale;
	}

	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
	th->window = htons(req->rcv_wnd);

	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	tcp_syn_build_options((__u32 *)(th + 1), req->mss, req->tstamp_ok,
			      req->sack_ok, req->wscale_ok, req->rcv_wscale,
			      TCP_SKB_CB(skb)->when,
			      req->ts_recent);

	skb->csum = 0;
	th->doff = (tcp_header_size >> 2);
	tcp_statistics.TcpOutSegs++;
	return skb;
}

void tcp_connect(struct sock *sk, struct sk_buff *buff, int mtu)
{
	struct dst_entry *dst = sk->dst_cache;
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	/* Reserve space for headers. */
	skb_reserve(buff, MAX_HEADER + sk->prot->max_header);

	tp->snd_wnd = 0;
	tp->snd_wl1 = 0;
	tp->snd_wl2 = tp->write_seq;
	tp->snd_una = tp->write_seq;
	tp->rcv_nxt = 0;

	sk->err = 0;

	/* We'll fix this up when we get a response from the other end.
	 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
	 */
	tp->tcp_header_len = sizeof(struct tcphdr) +
		(sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);

	/* If user gave his TCP_MAXSEG, record it to clamp */
	if (tp->user_mss)
		tp->mss_clamp = tp->user_mss;
	tcp_sync_mss(sk, mtu);

	/* Now the unpleasant part: if the initial pmtu is too low,
	   set a lower clamp. I am not sure that this is good.
	   To be more exact, I do not think that clamping at a value which
	   is apparently transient and may improve in the future is a good idea.
	   It would be better to wait until the peer returns its MSS
	   (probably 65535 too) and for now advertise something on the order
	   of 65535, or at least the first hop device mtu. Is it clear what I mean?
	   We should tell the peer what maximal mss we expect to RECEIVE;
	   it has nothing to do with pmtu.
	   I am afraid someone will be confused by such a huge value.
	                                   --ANK (980731)
	 */
	if (tp->mss_cache + tp->tcp_header_len - sizeof(struct tcphdr) < tp->mss_clamp )
		tp->mss_clamp = tp->mss_cache + tp->tcp_header_len - sizeof(struct tcphdr);

	TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
	TCP_SKB_CB(buff)->sacked = 0;
	TCP_SKB_CB(buff)->urg_ptr = 0;
	buff->csum = 0;
	TCP_SKB_CB(buff)->seq = tp->write_seq++;
	TCP_SKB_CB(buff)->end_seq = tp->write_seq;
	tp->snd_nxt = TCP_SKB_CB(buff)->end_seq;

	tp->window_clamp = dst->window;
	tcp_select_initial_window(sock_rspace(sk)/2,tp->mss_clamp,
		&tp->rcv_wnd,
		&tp->window_clamp,
		sysctl_tcp_window_scaling,
		&tp->rcv_wscale);
	/* Ok, now lock the socket before we make it visible to
	 * the incoming packet engine.
	 */
	unlock_kernel();
	lock_sock(sk);

	/* Socket identity change complete, no longer
	 * in TCP_CLOSE, so enter ourselves into the
	 * hash tables.
	 */
	tcp_set_state(sk,TCP_SYN_SENT);
	sk->prot->hash(sk);

	tp->rto = dst->rtt;
	tcp_init_xmit_timers(sk);
	tp->retransmits = 0;
	tp->fackets_out = 0;
	tp->retrans_out = 0;

	/* Send it off. */
	__skb_queue_tail(&sk->write_queue, buff);
	TCP_SKB_CB(buff)->when = tcp_time_stamp;
	tp->packets_out++;
	tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
	tcp_statistics.TcpActiveOpens++;

	/* Timer for repeating the SYN until an answer. */
	tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);

	/* Now, it is safe to release the socket. */
	release_sock(sk);
	lock_kernel();
}

/* Send out a delayed ack, the caller does the policy checking
 * to see if we should even be here.  See tcp_input.c:tcp_ack_snd_check()
 * for details.
 */
void tcp_send_delayed_ack(struct tcp_opt *tp, int max_timeout)
{
	unsigned long timeout;

	/* Stay within the limit we were given */
	timeout = tp->ato;
	if (timeout > max_timeout)
		timeout = max_timeout;
	timeout += jiffies;

	/* Use new timeout only if there wasn't an older one earlier. */
	if (!tp->delack_timer.prev) {
		tp->delack_timer.expires = timeout;
		add_timer(&tp->delack_timer);
	} else {
		if (time_before(timeout, tp->delack_timer.expires))
			mod_timer(&tp->delack_timer, timeout);
	}
}

/* This routine sends an ack and also updates the window. */
void tcp_send_ack(struct sock *sk)
{
	/* If we have been reset, we may not send again. */
	if(!sk->zapped) {
		struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
		struct sk_buff *buff;

		/* We are not putting this on the write queue, so
		 * tcp_transmit_skb() will set the ownership to this
		 * sock.
		 */
		buff = alloc_skb(MAX_HEADER + sk->prot->max_header, GFP_ATOMIC);
		if (buff == NULL) {
			/* Force it to send an ack.  We don't have to do this
			 * (ACK is unreliable) but it's much better use of
			 * bandwidth on slow links to send a spare ack than
			 * resend packets.
			 *
			 * This is the one possible way that we can delay an
			 * ACK and have tp->ato indicate that we are in
			 * quick ack mode, so clear it.
			 */
			if(tcp_in_quickack_mode(tp))
				tcp_exit_quickack_mode(tp);
			tcp_send_delayed_ack(tp, HZ/2);
			return;
		}

		/* Reserve space for headers and prepare control bits. */
		skb_reserve(buff, MAX_HEADER + sk->prot->max_header);
		buff->csum = 0;
		TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
		TCP_SKB_CB(buff)->sacked = 0;
		TCP_SKB_CB(buff)->urg_ptr = 0;

		/* Send it off, this clears delayed acks for us. */
		TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tp->snd_nxt;
		TCP_SKB_CB(buff)->when = tcp_time_stamp;
		tcp_transmit_skb(sk, buff);
	}
}

/* This routine sends a packet with an out of date sequence
 * number. It assumes the other end will try to ack it.
 */
void tcp_write_wakeup(struct sock *sk)
{
	/* After a valid reset we can send no more. */
	if (!sk->zapped) {
		struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
		struct sk_buff *skb;

		/* Write data can still be transmitted/retransmitted in the
		 * following states.  If any other state is encountered, return.
		 * [listen/close will never occur here anyway]
		 */
		if ((1 << sk->state) &
		    ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1|
		      TCPF_LAST_ACK|TCPF_CLOSING))
			return;

		if (before(tp->snd_nxt, tp->snd_una + tp->snd_wnd) &&
		    ((skb = tp->send_head) != NULL)) {
			unsigned long win_size;

			/* We are probing the opening of a window
			 * but the window size is != 0; it must have been
			 * a result of SWS avoidance (sender side).
			 */
			win_size = tp->snd_wnd - (tp->snd_nxt - tp->snd_una);
			if (win_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq) {
				if (tcp_fragment(sk, skb, win_size))
					return; /* Let a retransmit get it. */
			}
			update_send_head(sk);
			TCP_SKB_CB(skb)->when = tcp_time_stamp;
			tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
			tp->packets_out++;
			tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
			if (!tcp_timer_is_set(sk, TIME_RETRANS))
				tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
		} else {
			/* We don't queue it, tcp_transmit_skb() sets ownership. */
			skb = alloc_skb(MAX_HEADER + sk->prot->max_header,
					GFP_ATOMIC);
			if (skb == NULL)
				return;

			/* Reserve space for headers and set control bits. */
			skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
			skb->csum = 0;
			TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
			TCP_SKB_CB(skb)->sacked = 0;
			TCP_SKB_CB(skb)->urg_ptr = 0;

			/* Use a previous sequence.  This should cause the other
			 * end to send an ack.  Don't queue or clone SKB, just
			 * send it.
			 */
			TCP_SKB_CB(skb)->seq = tp->snd_nxt - 1;
			TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
			TCP_SKB_CB(skb)->when = tcp_time_stamp;
			tcp_transmit_skb(sk, skb);
		}
	}
}

/* A window probe timeout has occurred.  If window is not closed send
 * a partial packet else a zero probe.
 */
void tcp_send_probe0(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	tcp_write_wakeup(sk);
	tp->pending = TIME_PROBE0;
	tp->backoff++;
	tp->probes_out++;
	tcp_reset_xmit_timer (sk, TIME_PROBE0,
			      min(tp->rto << tp->backoff, 120*HZ));
}
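
/* A short worked example of the probe backoff above: with tp->rto = 3*HZ
 * and tp->backoff starting at 0, successive unanswered probes are scheduled
 * at 3*HZ << 1 = 6 s, then 12 s, 24 s, 48 s, 96 s, and from there the
 * interval stays pinned at the 120*HZ ceiling, since tcp_reset_xmit_timer()
 * is handed min(tp->rto << tp->backoff, 120*HZ).
 */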