net/ipv4/tcp.c (davej-history.git, Import 2.3.18pre1, blob b8e5d197c344a1c51880619cfdf13185d73f0dd9)
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp.c,v 1.151 1999/09/07 02:31:21 davem Exp $
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
22 * Fixes:
23 * Alan Cox : Numerous verify_area() calls
24 * Alan Cox : Set the ACK bit on a reset
25 * Alan Cox : Stopped it crashing if it closed while
26 * sk->inuse=1 and was trying to connect
27 * (tcp_err()).
28 * Alan Cox : All icmp error handling was broken
29 * pointers passed where wrong and the
30 * socket was looked up backwards. Nobody
31 * tested any icmp error code obviously.
32 * Alan Cox : tcp_err() now handled properly. It
33 * wakes people on errors. poll
34 * behaves and the icmp error race
35 * has gone by moving it into sock.c
36 * Alan Cox : tcp_send_reset() fixed to work for
37 * everything not just packets for
38 * unknown sockets.
39 * Alan Cox : tcp option processing.
40 * Alan Cox : Reset tweaked (still not 100%) [Had
41 * syn rule wrong]
42 * Herp Rosmanith : More reset fixes
43 * Alan Cox : No longer acks invalid rst frames.
44 * Acking any kind of RST is right out.
45 * Alan Cox : Sets an ignore me flag on an rst
46 * receive otherwise odd bits of prattle
47 * escape still
48 * Alan Cox : Fixed another acking RST frame bug.
49 * Should stop LAN workplace lockups.
50 * Alan Cox : Some tidyups using the new skb list
51 * facilities
52 * Alan Cox : sk->keepopen now seems to work
53 * Alan Cox : Pulls options out correctly on accepts
54 * Alan Cox : Fixed assorted sk->rqueue->next errors
55 * Alan Cox : PSH doesn't end a TCP read. Switched a
56 * bit to skb ops.
57 * Alan Cox : Tidied tcp_data to avoid a potential
58 * nasty.
59 * Alan Cox : Added some better commenting, as the
60 * tcp is hard to follow
61 * Alan Cox : Removed incorrect check for 20 * psh
62 * Michael O'Reilly : ack < copied bug fix.
63 * Johannes Stille : Misc tcp fixes (not all in yet).
64 * Alan Cox : FIN with no memory -> CRASH
65 * Alan Cox : Added socket option proto entries.
66 * Also added awareness of them to accept.
67 * Alan Cox : Added TCP options (SOL_TCP)
68 * Alan Cox : Switched wakeup calls to callbacks,
69 * so the kernel can layer network
70 * sockets.
71 * Alan Cox : Use ip_tos/ip_ttl settings.
72 * Alan Cox : Handle FIN (more) properly (we hope).
73 * Alan Cox : RST frames sent on unsynchronised
74 * state ack error.
75 * Alan Cox : Put in missing check for SYN bit.
76 * Alan Cox : Added tcp_select_window() aka NET2E
77 * window non shrink trick.
78 * Alan Cox : Added a couple of small NET2E timer
79 * fixes
80 * Charles Hedrick : TCP fixes
81 * Toomas Tamm : TCP window fixes
82 * Alan Cox : Small URG fix to rlogin ^C ack fight
83 * Charles Hedrick : Rewrote most of it to actually work
84 * Linus : Rewrote tcp_read() and URG handling
85 * completely
86 * Gerhard Koerting: Fixed some missing timer handling
87 * Matthew Dillon : Reworked TCP machine states as per RFC
88 * Gerhard Koerting: PC/TCP workarounds
89 * Adam Caldwell : Assorted timer/timing errors
90 * Matthew Dillon : Fixed another RST bug
91 * Alan Cox : Move to kernel side addressing changes.
92 * Alan Cox : Beginning work on TCP fastpathing
93 * (not yet usable)
94 * Arnt Gulbrandsen: Turbocharged tcp_check() routine.
95 * Alan Cox : TCP fast path debugging
96 * Alan Cox : Window clamping
97 * Michael Riepe : Bug in tcp_check()
98 * Matt Dillon : More TCP improvements and RST bug fixes
99 * Matt Dillon : Yet more small nasties remove from the
100 * TCP code (Be very nice to this man if
101 * tcp finally works 100%) 8)
102 * Alan Cox : BSD accept semantics.
103 * Alan Cox : Reset on closedown bug.
104 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto().
105 * Michael Pall : Handle poll() after URG properly in
106 * all cases.
107 * Michael Pall : Undo the last fix in tcp_read_urg()
108 * (multi URG PUSH broke rlogin).
109 * Michael Pall : Fix the multi URG PUSH problem in
110 * tcp_readable(), poll() after URG
111 * works now.
112 * Michael Pall : recv(...,MSG_OOB) never blocks in the
113 * BSD api.
114 * Alan Cox : Changed the semantics of sk->socket to
115 * fix a race and a signal problem with
116 * accept() and async I/O.
117 * Alan Cox : Relaxed the rules on tcp_sendto().
118 * Yury Shevchuk : Really fixed accept() blocking problem.
119 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for
120 * clients/servers which listen in on
121 * fixed ports.
122 * Alan Cox : Cleaned the above up and shrank it to
123 * a sensible code size.
124 * Alan Cox : Self connect lockup fix.
125 * Alan Cox : No connect to multicast.
126 * Ross Biro : Close unaccepted children on master
127 * socket close.
128 * Alan Cox : Reset tracing code.
129 * Alan Cox : Spurious resets on shutdown.
130 * Alan Cox : Giant 15 minute/60 second timer error
131 * Alan Cox : Small whoops in polling before an
132 * accept.
133 * Alan Cox : Kept the state trace facility since
134 * it's handy for debugging.
135 * Alan Cox : More reset handler fixes.
136 * Alan Cox : Started rewriting the code based on
137 * the RFC's for other useful protocol
138 * references see: Comer, KA9Q NOS, and
139 * for a reference on the difference
140 * between specifications and how BSD
141 * works see the 4.4lite source.
142 * A.N.Kuznetsov : Don't time wait on completion of tidy
143 * close.
144 * Linus Torvalds : Fin/Shutdown & copied_seq changes.
145 * Linus Torvalds : Fixed BSD port reuse to work first syn
146 * Alan Cox : Reimplemented timers as per the RFC
147 * and using multiple timers for sanity.
148 * Alan Cox : Small bug fixes, and a lot of new
149 * comments.
150 * Alan Cox : Fixed dual reader crash by locking
151 * the buffers (much like datagram.c)
152 * Alan Cox : Fixed stuck sockets in probe. A probe
153 * now gets fed up of retrying without
154 * (even a no space) answer.
155 * Alan Cox : Extracted closing code better
156 * Alan Cox : Fixed the closing state machine to
157 * resemble the RFC.
158 * Alan Cox : More 'per spec' fixes.
159 * Jorge Cwik : Even faster checksumming.
160 * Alan Cox : tcp_data() doesn't ack illegal PSH
161 * only frames. At least one pc tcp stack
162 * generates them.
163 * Alan Cox : Cache last socket.
164 * Alan Cox : Per route irtt.
165 * Matt Day : poll()->select() match BSD precisely on error
166 * Alan Cox : New buffers
167 * Marc Tamsky : Various sk->prot->retransmits and
168 * sk->retransmits misupdating fixed.
169 * Fixed tcp_write_timeout: stuck close,
170 * and TCP syn retries gets used now.
171 * Mark Yarvis : In tcp_read_wakeup(), don't send an
172 * ack if state is TCP_CLOSED.
173 * Alan Cox : Look up device on a retransmit - routes may
174 * change. Doesn't yet cope with MSS shrink right
175 * but its a start!
176 * Marc Tamsky : Closing in closing fixes.
177 * Mike Shaver : RFC1122 verifications.
178 * Alan Cox : rcv_saddr errors.
179 * Alan Cox : Block double connect().
180 * Alan Cox : Small hooks for enSKIP.
181 * Alexey Kuznetsov: Path MTU discovery.
182 * Alan Cox : Support soft errors.
183 * Alan Cox : Fix MTU discovery pathological case
184 * when the remote claims no mtu!
185 * Marc Tamsky : TCP_CLOSE fix.
186 * Colin (G3TNE) : Send a reset on syn ack replies in
187 * window but wrong (fixes NT lpd problems)
188 * Pedro Roque : Better TCP window handling, delayed ack.
189 * Joerg Reuter : No modification of locked buffers in
190 * tcp_do_retransmit()
191 * Eric Schenk : Changed receiver side silly window
192 * avoidance algorithm to BSD style
193 * algorithm. This doubles throughput
194 * against machines running Solaris,
195 * and seems to result in general
196 * improvement.
197 * Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD
198 * Willy Konynenberg : Transparent proxying support.
199 * Mike McLagan : Routing by source
200 * Keith Owens : Do proper merging with partial SKB's in
201 * tcp_do_sendmsg to avoid burstiness.
202 * Eric Schenk : Fix fast close down bug with
203 * shutdown() followed by close().
204 * Andi Kleen : Make poll agree with SIGIO
206 * This program is free software; you can redistribute it and/or
207 * modify it under the terms of the GNU General Public License
208 * as published by the Free Software Foundation; either version
209 * 2 of the License, or (at your option) any later version.
211 * Description of States:
213 * TCP_SYN_SENT sent a connection request, waiting for ack
215 * TCP_SYN_RECV received a connection request, sent ack,
216 * waiting for final ack in three-way handshake.
218 * TCP_ESTABLISHED connection established
220 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete
221 * transmission of remaining buffered data
223 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote
224 * to shutdown
226 * TCP_CLOSING both sides have shutdown but we still have
227 * data we have to finish sending
229 * TCP_TIME_WAIT timeout to catch resent junk before entering
230 * closed, can only be entered from FIN_WAIT2
231 * or CLOSING. Required because the other end
232 * may not have gotten our last ACK causing it
233 * to retransmit the data packet (which we ignore)
235 * TCP_CLOSE_WAIT remote side has shutdown and is waiting for
236 * us to finish writing our data and to shutdown
237 * (we have to close() to move on to LAST_ACK)
239 * TCP_LAST_ACK our side has shutdown after remote has
240 * shutdown. There may still be data in our
241 * buffer that we have to finish sending
243 * TCP_CLOSE socket is finished
247 * RFC1122 status:
248 * NOTE: I'm not going to be doing comments in the code for this one except
249 * for violations and the like. tcp.c is just too big... If I say something
250 * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out
251 * with Alan. -- MS 950903
252 * [Note: Most of the TCP code has been rewritten/redesigned since this
253 * RFC1122 check. It is probably not correct anymore. It should be redone
254 * before 2.2. -AK]
256 * Use of PSH (4.2.2.2)
257 * MAY aggregate data sent without the PSH flag. (does)
258 * MAY queue data received without the PSH flag. (does)
259 * SHOULD collapse successive PSH flags when it packetizes data. (doesn't)
260 * MAY implement PSH on send calls. (doesn't, thus:)
261 * MUST NOT buffer data indefinitely (doesn't [1 second])
262 * MUST set PSH on last segment (does)
263 * MAY pass received PSH to application layer (doesn't)
264 * SHOULD send maximum-sized segment whenever possible. (almost always does)
266 * Window Size (4.2.2.3, 4.2.2.16)
267 * MUST treat window size as an unsigned number (does)
268 * SHOULD treat window size as a 32-bit number (does not)
269 * MUST NOT shrink window once it is offered (does not normally)
271 * Urgent Pointer (4.2.2.4)
272 * **MUST point urgent pointer to last byte of urgent data (not right
273 * after). (doesn't, to be like BSD. That's configurable, but defaults
274 * to off)
275 * MUST inform application layer asynchronously of incoming urgent
276 * data. (does)
277 * MUST provide application with means of determining the amount of
278 * urgent data pending. (does)
279 * **MUST support urgent data sequence of arbitrary length. (doesn't, but
280 * it's sort of tricky to fix, as urg_ptr is a 16-bit quantity)
281 * [Follows BSD 1 byte of urgent data]
283 * TCP Options (4.2.2.5)
284 * MUST be able to receive TCP options in any segment. (does)
285 * MUST ignore unsupported options (does)
287 * Maximum Segment Size Option (4.2.2.6)
288 * MUST implement both sending and receiving MSS. (does, but currently
289 * only uses the smaller of both of them)
290 * SHOULD send an MSS with every SYN where receive MSS != 536 (MAY send
291 * it always). (does, even when MSS == 536, which is legal)
292 * MUST assume MSS == 536 if no MSS received at connection setup (does)
293 * MUST calculate "effective send MSS" correctly:
294 * min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts)
295 * (does - but allows operator override)
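 *		(Illustration, not from the RFC text: with a 1500-byte Ethernet
 *		MTU, a received MSS of 1460 and no IP options this comes out to
 *		min(1500, 1460+20) - 20 - 0 = 1460 data bytes per segment.)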
297 * TCP Checksum (4.2.2.7)
298 * MUST generate and check TCP checksum. (does)
300 * Initial Sequence Number Selection (4.2.2.8)
301 * MUST use the RFC 793 clock selection mechanism. (doesn't, but it's
302 * OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is
303 * necessary for 10Mbps networks - and harder than BSD to spoof!
304 * With syncookies we don't)
306 * Simultaneous Open Attempts (4.2.2.10)
307 * MUST support simultaneous open attempts (does)
309 * Recovery from Old Duplicate SYN (4.2.2.11)
310 * MUST keep track of active vs. passive open (does)
312 * RST segment (4.2.2.12)
313 * SHOULD allow an RST segment to contain data (does, but doesn't do
314 * anything with it, which is standard)
316 * Closing a Connection (4.2.2.13)
317 * MUST inform application of whether connection was closed by RST or
318 * normal close. (does)
319 * MAY allow "half-duplex" close (treat connection as closed for the
320 * local app, even before handshake is done). (does)
321 * MUST linger in TIME_WAIT for 2 * MSL (does)
323 * Retransmission Timeout (4.2.2.15)
324 * MUST implement Jacobson's slow start and congestion avoidance
325 * stuff. (does)
327 * Probing Zero Windows (4.2.2.17)
328 * MUST support probing of zero windows. (does)
329 * MAY keep offered window closed indefinitely. (does)
330 * MUST allow remote window to stay closed indefinitely. (does)
332 * Passive Open Calls (4.2.2.18)
333 * MUST NOT let new passive open affect other connections. (doesn't)
334 * MUST support passive opens (LISTENs) concurrently. (does)
336 * Time to Live (4.2.2.19)
337 * MUST make TCP TTL configurable. (does - IP_TTL option)
339 * Event Processing (4.2.2.20)
340 * SHOULD queue out-of-order segments. (does)
341 * MUST aggregate ACK segments whenever possible. (does but badly)
343 * Retransmission Timeout Calculation (4.2.3.1)
344 * MUST implement Karn's algorithm and Jacobson's algorithm for RTO
345 * calculation. (does, or at least explains them in the comments 8*b)
346 * SHOULD initialize RTO to 0 and RTT to 3. (does)
348 * When to Send an ACK Segment (4.2.3.2)
349 * SHOULD implement delayed ACK. (does)
350 * MUST keep ACK delay < 0.5 sec. (does)
352 * When to Send a Window Update (4.2.3.3)
353 * MUST implement receiver-side SWS. (does)
355 * When to Send Data (4.2.3.4)
356 * MUST implement sender-side SWS. (does)
357 * SHOULD implement Nagle algorithm. (does)
359 * TCP Connection Failures (4.2.3.5)
360 * MUST handle excessive retransmissions "properly" (see the RFC). (does)
361 * SHOULD inform application layer of soft errors. (does)
363 * TCP Keep-Alives (4.2.3.6)
364 * MAY provide keep-alives. (does)
365 * MUST make keep-alives configurable on a per-connection basis. (does)
366 * MUST default to no keep-alives. (does)
367 * MUST make keep-alive interval configurable. (does)
368 * MUST make default keep-alive interval > 2 hours. (does)
369 * MUST NOT interpret failure to ACK keep-alive packet as dead
370 * connection. (doesn't)
371 * SHOULD send keep-alive with no data. (does)
373 * TCP Multihoming (4.2.3.7)
374 * MUST get source address from IP layer before sending first
375 * SYN. (does)
376 * MUST use same local address for all segments of a connection. (does)
378 * IP Options (4.2.3.8)
379 * MUST ignore unsupported IP options. (does)
380 * MAY support Time Stamp and Record Route. (does)
381 * MUST allow application to specify a source route. (does)
382 * MUST allow received Source Route option to set route for all future
383 * segments on this connection. (does not (security issues))
385 * ICMP messages (4.2.3.9)
386 * MUST act on ICMP errors. (does)
387 * MUST slow transmission upon receipt of a Source Quench. (doesn't anymore
388 * because that is deprecated now by the IETF, can be turned on)
389 * MUST NOT abort connection upon receipt of soft Destination
390 * Unreachables (0, 1, 5), Time Exceededs and Parameter
391 * Problems. (doesn't)
392 * SHOULD report soft Destination Unreachables etc. to the
393 * application. (does, except during SYN_RECV and may drop messages
394 * in some rare cases before accept() - ICMP is unreliable)
395 * SHOULD abort connection upon receipt of hard Destination Unreachable
396 * messages (2, 3, 4). (does, but see above)
398 * Remote Address Validation (4.2.3.10)
399 * MUST reject as an error OPEN for invalid remote IP address. (does)
400 * MUST ignore SYN with invalid source address. (does)
401 * MUST silently discard incoming SYN for broadcast/multicast
402 * address. (does)
404 * Asynchronous Reports (4.2.4.1)
405 * MUST provide mechanism for reporting soft errors to application
406 * layer. (does)
408 * Type of Service (4.2.4.2)
409 * MUST allow application layer to set Type of Service. (does IP_TOS)
411 * (Whew. -- MS 950903)
412 * (Updated by AK, but not complete yet.)
415 #include <linux/config.h>
416 #include <linux/types.h>
417 #include <linux/fcntl.h>
418 #include <linux/poll.h>
419 #include <linux/init.h>
420 #include <linux/smp_lock.h>
422 #include <net/icmp.h>
423 #include <net/tcp.h>
425 #include <asm/uaccess.h>
427 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
429 struct tcp_mib tcp_statistics;
431 kmem_cache_t *tcp_openreq_cachep;
432 kmem_cache_t *tcp_bucket_cachep;
433 kmem_cache_t *tcp_timewait_cachep;
436 * Find someone to 'accept'. Must be called with
437 * the listening socket locked.
440 static struct open_request *tcp_find_established(struct tcp_opt *tp,
441 struct open_request **prevp)
443 struct open_request *req = tp->syn_wait_queue;
444 struct open_request *prev = (struct open_request *)&tp->syn_wait_queue;
445 while(req) {
446 if (req->sk) {
447 if((1 << req->sk->state) &
448 ~(TCPF_SYN_SENT|TCPF_SYN_RECV))
449 break;
451 prev = req;
452 req = req->dl_next;
454 *prevp = prev;
455 return req;
459 * Walk down the receive queue counting readable data.
461 * Must be called with the socket lock held.
464 static int tcp_readable(struct sock *sk)
466 unsigned long counted;
467 unsigned long amount;
468 struct sk_buff *skb;
469 int sum;
471 SOCK_DEBUG(sk, "tcp_readable: %p - ",sk);
473 skb = skb_peek(&sk->receive_queue);
474 if (skb == NULL) {
475 SOCK_DEBUG(sk, "empty\n");
476 return(0);
479 counted = sk->tp_pinfo.af_tcp.copied_seq; /* Where we are at the moment */
480 amount = 0;
482 /* Do until a push or until we are out of data. */
483 do {
484 /* Found a hole so stops here. */
485 if (before(counted, TCP_SKB_CB(skb)->seq)) /* should not happen */
486 break;
488 /* Length - header but start from where we are up to
489 * avoid overlaps.
491 sum = skb->len - (counted - TCP_SKB_CB(skb)->seq);
492 if (sum >= 0) {
493 /* Add it up, move on. */
494 amount += sum;
495 counted += sum;
496 if (skb->h.th->syn)
497 counted++;
500 /* Don't count urg data ... but do it in the right place!
501 * Consider: "old_data (ptr is here) URG PUSH data"
502 * The old code would stop at the first push because
503 * it counted the urg (amount==1) and then does amount--
504 * *after* the loop. This means tcp_readable() always
505 * returned zero if any URG PUSH was in the queue, even
506 * though there was normal data available. If we subtract
507 * the urg data right here, we even get it to work for more
508 * than one URG PUSH skb without normal data.
509 * This means that poll() finally works now with urg data
510 * in the queue. Note that rlogin was never affected
511 * because it doesn't use poll(); it uses two processes
512 * and a blocking read(). And the queue scan in tcp_read()
513 * was correct. Mike <pall@rz.uni-karlsruhe.de>
516 /* Don't count urg data. */
517 if (skb->h.th->urg)
518 amount--;
519 #if 0
520 if (amount && skb->h.th->psh) break;
521 #endif
522 skb = skb->next;
523 } while(skb != (struct sk_buff *)&sk->receive_queue);
525 SOCK_DEBUG(sk, "got %lu bytes.\n",amount);
526 return(amount);
530 * LISTEN is a special case for poll..
532 static unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait)
534 struct open_request *req, *dummy;
536 lock_sock(sk);
537 req = tcp_find_established(&sk->tp_pinfo.af_tcp, &dummy);
538 release_sock(sk);
539 if (req)
540 return POLLIN | POLLRDNORM;
541 return 0;
545 * Compute minimal free write space needed to queue new packets.
547 #define tcp_min_write_space(__sk) \
548 (atomic_read(&(__sk)->wmem_alloc) / 2)
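/* Both tcp_poll() and tcp_write_space() use this threshold: the socket is
 * treated as writable only while sock_wspace(sk), the unused part of the
 * send buffer, is at least half of the memory already committed to queued
 * packets.
 */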
551 * Wait for a TCP event.
553 * Note that we don't need to lock the socket, as the upper poll layers
554 * take care of normal races (between the test and the event) and we don't
555 * go look at any of the socket buffers directly.
557 unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait)
559 unsigned int mask;
560 struct sock *sk = sock->sk;
561 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
563 poll_wait(file, sk->sleep, wait);
564 if (sk->state == TCP_LISTEN)
565 return tcp_listen_poll(sk, wait);
567 /* Socket is not locked. We are protected from async events
568 by poll logic and correct handling of state changes
569 made by another threads is impossible in any case.
572 mask = 0;
573 if (sk->err)
574 mask = POLLERR;
577 * POLLHUP is certainly not done right. But poll() doesn't
578 * have a notion of HUP in just one direction, and for a
579 * socket the read side is more interesting.
581 * Some poll() documentation says that POLLHUP is incompatible
582 * with the POLLOUT/POLLWR flags, so somebody should check this
583 * all. But careful, it tends to be safer to return too many
584 * bits than too few, and you can easily break real applications
585 * if you don't tell them that something has hung up!
587 * Check-me.
589 if (sk->shutdown & RCV_SHUTDOWN)
590 mask |= POLLHUP;
592 /* Connected? */
593 if ((1 << sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) {
594 if ((tp->rcv_nxt != tp->copied_seq) &&
595 (tp->urg_seq != tp->copied_seq ||
596 tp->rcv_nxt != tp->copied_seq+1 ||
597 sk->urginline || !tp->urg_data))
598 mask |= POLLIN | POLLRDNORM;
600 if (!(sk->shutdown & SEND_SHUTDOWN)) {
601 if (sock_wspace(sk) >= tcp_min_write_space(sk)) {
602 mask |= POLLOUT | POLLWRNORM;
603 } else { /* send SIGIO later */
604 sk->socket->flags |= SO_NOSPACE;
608 if (tp->urg_data & URG_VALID)
609 mask |= POLLPRI;
611 return mask;
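/*
 * Illustrative userspace sketch (not kernel code; hypothetical connected
 * socket fd, <poll.h> assumed) of how the mask computed above is consumed:
 *
 *	struct pollfd pfd = { fd, POLLIN | POLLOUT | POLLPRI, 0 };
 *
 *	if (poll(&pfd, 1, -1) > 0) {
 *		if (pfd.revents & POLLERR)   ... sk->err was set
 *		if (pfd.revents & POLLHUP)   ... receive side has shut down
 *		if (pfd.revents & POLLPRI)   ... urgent data is pending
 *		if (pfd.revents & POLLIN)    ... normal data is readable
 *		if (pfd.revents & POLLOUT)   ... write space is available
 *	}
 */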
615 * Socket write_space callback.
616 * This (or rather the sock_wake_async) should agree with poll.
618 * WARNING. This callback is called from any context (process,
619 * bh or irq). Do not make anything more smart from it.
621 void tcp_write_space(struct sock *sk)
623 read_lock(&sk->callback_lock);
624 if (!sk->dead) {
625 /* Why??!! Does it really not overschedule? --ANK */
626 wake_up_interruptible(sk->sleep);
628 if (sock_wspace(sk) >= tcp_min_write_space(sk))
629 sock_wake_async(sk->socket, 2);
631 read_unlock(&sk->callback_lock);
635 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
637 int answ;
639 switch(cmd) {
640 case TIOCINQ:
641 #ifdef FIXME /* FIXME: */
642 case FIONREAD:
643 #endif
644 if (sk->state == TCP_LISTEN)
645 return(-EINVAL);
646 lock_sock(sk);
647 answ = tcp_readable(sk);
648 release_sock(sk);
649 break;
650 case SIOCATMARK:
652 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
653 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
654 break;
656 case TIOCOUTQ:
657 if (sk->state == TCP_LISTEN)
658 return(-EINVAL);
659 answ = sock_wspace(sk);
660 break;
661 default:
662 return(-ENOIOCTLCMD);
665 return put_user(answ, (int *)arg);
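/*
 * Userspace sketch (hypothetical fd; headers such as <sys/ioctl.h> and
 * <sys/socket.h> assumed) of the ioctls handled above:
 *
 *	int at_mark, unread, wspace;
 *	ioctl(fd, SIOCATMARK, &at_mark);   at_mark != 0: next byte is the urgent mark
 *	ioctl(fd, TIOCINQ,    &unread);    readable bytes, as counted by tcp_readable()
 *	ioctl(fd, TIOCOUTQ,   &wspace);    free send-buffer space (sock_wspace())
 */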
669 * Wait for a socket to get into the connected state
671 * Note: Must be called with the socket locked.
673 static int wait_for_tcp_connect(struct sock * sk, int flags)
675 struct task_struct *tsk = current;
676 DECLARE_WAITQUEUE(wait, tsk);
678 while((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
679 if(sk->err)
680 return sock_error(sk);
681 if((1 << sk->state) &
682 ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
683 if(sk->keepopen && !(flags&MSG_NOSIGNAL))
684 send_sig(SIGPIPE, tsk, 0);
685 return -EPIPE;
687 if(flags & MSG_DONTWAIT)
688 return -EAGAIN;
689 if(signal_pending(tsk))
690 return -ERESTARTSYS;
692 __set_task_state(tsk, TASK_INTERRUPTIBLE);
693 add_wait_queue(sk->sleep, &wait);
694 sk->tp_pinfo.af_tcp.write_pending++;
696 release_sock(sk);
697 schedule();
698 lock_sock(sk);
700 __set_task_state(tsk, TASK_RUNNING);
701 remove_wait_queue(sk->sleep, &wait);
702 sk->tp_pinfo.af_tcp.write_pending--;
704 return 0;
707 static inline int tcp_memory_free(struct sock *sk)
709 return atomic_read(&sk->wmem_alloc) < sk->sndbuf;
713 * Wait for more memory for a socket
715 static void wait_for_tcp_memory(struct sock * sk)
717 if (!tcp_memory_free(sk)) {
718 DECLARE_WAITQUEUE(wait, current);
720 sk->socket->flags &= ~SO_NOSPACE;
721 add_wait_queue(sk->sleep, &wait);
722 for (;;) {
723 set_current_state(TASK_INTERRUPTIBLE);
725 if (signal_pending(current))
726 break;
727 if (tcp_memory_free(sk))
728 break;
729 if (sk->shutdown & SEND_SHUTDOWN)
730 break;
731 if (sk->err)
732 break;
733 release_sock(sk);
734 if (!tcp_memory_free(sk))
735 schedule();
736 lock_sock(sk);
738 current->state = TASK_RUNNING;
739 remove_wait_queue(sk->sleep, &wait);
743 /* When all user supplied data has been queued set the PSH bit */
744 #define PSH_NEEDED (seglen == 0 && iovlen == 0)
747 * This routine copies from a user buffer into a socket,
748 * and starts the transmit system.
750 * Note: must be called with the socket locked.
753 int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg)
755 struct iovec *iov;
756 struct tcp_opt *tp;
757 struct sk_buff *skb;
758 int iovlen, flags;
759 int mss_now;
760 int err, copied;
762 err = 0;
763 tp = &(sk->tp_pinfo.af_tcp);
765 /* Wait for a connection to finish. */
766 flags = msg->msg_flags;
767 if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
768 if((err = wait_for_tcp_connect(sk, flags)) != 0)
769 goto out;
771 /* This should be in poll */
772 sk->socket->flags &= ~SO_NOSPACE; /* clear SIGIO XXX */
774 mss_now = tcp_current_mss(sk);
776 /* Ok commence sending. */
777 iovlen = msg->msg_iovlen;
778 iov = msg->msg_iov;
779 copied = 0;
781 while(--iovlen >= 0) {
782 int seglen=iov->iov_len;
783 unsigned char * from=iov->iov_base;
785 iov++;
787 while(seglen > 0) {
788 int copy, tmp, queue_it, psh;
790 if (err)
791 goto do_fault2;
793 /* Stop on errors. */
794 if (sk->err)
795 goto do_sock_err;
797 /* Make sure that we are established. */
798 if (sk->shutdown & SEND_SHUTDOWN)
799 goto do_shutdown;
801 /* Now we need to check if we have a half
802 * built packet we can tack some data onto.
804 if (tp->send_head && !(flags & MSG_OOB)) {
805 skb = sk->write_queue.prev;
806 copy = skb->len;
807 /* If the remote does SWS avoidance we should
808 * queue the best we can if not we should in
809 * fact send multiple packets...
810 * A method for detecting this would be most
811 * welcome.
813 if (skb_tailroom(skb) > 0 &&
814 (mss_now - copy) > 0 &&
815 tp->snd_nxt < TCP_SKB_CB(skb)->end_seq) {
816 int last_byte_was_odd = (copy % 4);
818 copy = mss_now - copy;
819 if(copy > skb_tailroom(skb))
820 copy = skb_tailroom(skb);
821 if(copy > seglen)
822 copy = seglen;
823 if(last_byte_was_odd) {
824 if(copy_from_user(skb_put(skb, copy),
825 from, copy))
826 err = -EFAULT;
827 skb->csum = csum_partial(skb->data,
828 skb->len, 0);
829 } else {
830 skb->csum =
831 csum_and_copy_from_user(
832 from, skb_put(skb, copy),
833 copy, skb->csum, &err);
836 * FIXME: the *_user functions should
837 * return how much data was
838 * copied before the fault
839 * occurred and then a partial
840 * packet with this data should
841 * be sent. Unfortunately
842 * csum_and_copy_from_user doesn't
843 * return this information.
844 * ATM it might send partly zeroed
845 * data in this case.
847 tp->write_seq += copy;
848 TCP_SKB_CB(skb)->end_seq += copy;
849 from += copy;
850 copied += copy;
851 seglen -= copy;
852 if (PSH_NEEDED)
853 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
854 continue;
858 /* We also need to worry about the window. If
859 * window < 1/2 the maximum window we've seen
860 * from this host, don't use it. This is
861 * sender side silly window prevention, as
862 * specified in RFC1122. (Note that this is
863 * different than earlier versions of SWS
864 * prevention, e.g. RFC813.). What we
865 * actually do is use the whole MSS. Since
866 * this results in the right edge of the packet
867 * being outside the window, it will be queued
868 * for later rather than sent.
870 psh = 0;
871 copy = tp->snd_wnd - (tp->snd_nxt - tp->snd_una);
872 if(copy > (tp->max_window >> 1)) {
873 copy = min(copy, mss_now);
874 psh = 1;
875 } else {
876 copy = mss_now;
878 if(copy > seglen)
879 copy = seglen;
881 /* Determine how large of a buffer to allocate. */
882 tmp = MAX_HEADER + sk->prot->max_header;
883 if (copy < min(mss_now, tp->max_window >> 1) &&
884 !(flags & MSG_OOB)) {
885 tmp += min(mss_now, tp->max_window);
887 /* What is happening here is that we want to
888 * tack on later members of the users iovec
889 * if possible into a single frame. When we
890 * leave this loop our caller checks to see if
891 * we can send queued frames onto the wire.
892 * See tcp_v[46]_sendmsg() for this.
894 queue_it = 1;
895 } else {
896 tmp += copy;
897 queue_it = 0;
899 skb = sock_wmalloc(sk, tmp, 0, GFP_KERNEL);
901 /* If we didn't get any memory, we need to sleep. */
902 if (skb == NULL) {
903 sk->socket->flags |= SO_NOSPACE;
904 if (flags&MSG_DONTWAIT) {
905 err = -EAGAIN;
906 goto do_interrupted;
908 if (signal_pending(current)) {
909 err = -ERESTARTSYS;
910 goto do_interrupted;
912 tcp_push_pending_frames(sk, tp);
913 wait_for_tcp_memory(sk);
915 /* If SACK's were formed or PMTU events happened,
916 * we must find out about it.
918 mss_now = tcp_current_mss(sk);
919 continue;
922 seglen -= copy;
924 /* Prepare control bits for TCP header creation engine. */
925 TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK |
926 ((PSH_NEEDED || psh) ?
927 TCPCB_FLAG_PSH : 0));
928 TCP_SKB_CB(skb)->sacked = 0;
929 if (flags & MSG_OOB) {
930 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_URG;
931 TCP_SKB_CB(skb)->urg_ptr = copy;
932 } else
933 TCP_SKB_CB(skb)->urg_ptr = 0;
935 /* TCP data bytes are SKB_PUT() on top, later
936 * TCP+IP+DEV headers are SKB_PUSH()'d beneath.
937 * Reserve header space and checksum the data.
939 skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
940 skb->csum = csum_and_copy_from_user(from,
941 skb_put(skb, copy), copy, 0, &err);
943 if (err)
944 goto do_fault;
946 from += copy;
947 copied += copy;
949 TCP_SKB_CB(skb)->seq = tp->write_seq;
950 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + copy;
952 /* This advances tp->write_seq for us. */
953 tcp_send_skb(sk, skb, queue_it);
956 sk->err = 0;
957 err = copied;
958 goto out;
960 do_sock_err:
961 if(copied)
962 err = copied;
963 else
964 err = sock_error(sk);
965 goto out;
966 do_shutdown:
967 if(copied)
968 err = copied;
969 else {
970 if (!(flags&MSG_NOSIGNAL))
971 send_sig(SIGPIPE, current, 0);
972 err = -EPIPE;
974 goto out;
975 do_interrupted:
976 if(copied)
977 err = copied;
978 goto out;
979 do_fault:
980 kfree_skb(skb);
981 do_fault2:
982 err = -EFAULT;
983 out:
984 tcp_push_pending_frames(sk, tp);
985 return err;
988 #undef PSH_NEEDED
991 * Send an ack if one is backlogged at this point. Ought to merge
992 * this with tcp_send_ack().
993 * This is called for delayed acks also.
996 void tcp_read_wakeup(struct sock *sk)
998 /* If we're closed, don't send an ack, or we'll get a RST
999 * from the closed destination.
1001 if (sk->state != TCP_CLOSE)
1002 tcp_send_ack(sk);
1006 * Handle reading urgent data. BSD has very simple semantics for
1007 * this, no blocking and very strange errors 8)
1010 static int tcp_recv_urg(struct sock * sk, int nonblock,
1011 struct msghdr *msg, int len, int flags,
1012 int *addr_len)
1014 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1016 /* No URG data to read. */
1017 if (sk->urginline || !tp->urg_data || tp->urg_data == URG_READ)
1018 return -EINVAL; /* Yes this is right ! */
1020 if (sk->done)
1021 return -ENOTCONN;
1023 if (sk->state == TCP_CLOSE || (sk->shutdown & RCV_SHUTDOWN)) {
1024 sk->done = 1;
1025 return 0;
1028 if (tp->urg_data & URG_VALID) {
1029 int err = 0;
1030 char c = tp->urg_data;
1032 if (!(flags & MSG_PEEK))
1033 tp->urg_data = URG_READ;
1035 if(msg->msg_name)
1036 tp->af_specific->addr2sockaddr(sk, (struct sockaddr *)
1037 msg->msg_name);
1039 if(addr_len)
1040 *addr_len = tp->af_specific->sockaddr_len;
1042 /* Read urgent data. */
1043 msg->msg_flags|=MSG_OOB;
1045 if(len>0) {
1046 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1047 len = 1;
1048 } else
1049 msg->msg_flags|=MSG_TRUNC;
1051 return err ? -EFAULT : len;
1054 /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
1055 * the available implementations agree in this case:
1056 * this call should never block, independent of the
1057 * blocking state of the socket.
1058 * Mike <pall@rz.uni-karlsruhe.de>
1060 return -EAGAIN;
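/*
 * Userspace sketch of the BSD-style semantics implemented above
 * (hypothetical fd): recv(..., MSG_OOB) never blocks; it returns the one
 * byte of urgent data if it is pending and otherwise fails with EAGAIN.
 *
 *	char oob;
 *	int n = recv(fd, &oob, 1, MSG_OOB);
 *	if (n == 1)
 *		... got the urgent byte ...
 *	else if (n < 0 && errno == EAGAIN)
 *		... no urgent data pending ...
 */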
1064 * Release a skb if it is no longer needed. This routine
1065 * must be called with interrupts disabled or with the
1066 * socket locked so that the sk_buff queue operation is ok.
1069 static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb)
1071 __skb_unlink(skb, &sk->receive_queue);
1072 __kfree_skb(skb);
1075 /* Clean up the receive buffer for full frames taken by the user,
1076 * then send an ACK if necessary. COPIED is the number of bytes
1077 * tcp_recvmsg has given to the user so far, it speeds up the
1078 * calculation of whether or not we must ACK for the sake of
1079 * a window update.
1081 static void cleanup_rbuf(struct sock *sk, int copied)
1083 struct sk_buff *skb;
1085 /* NOTE! The socket must be locked, so that we don't get
1086 * a messed-up receive queue.
1088 while ((skb=skb_peek(&sk->receive_queue)) != NULL) {
1089 if (!skb->used || atomic_read(&skb->users) > 1)
1090 break;
1091 tcp_eat_skb(sk, skb);
1094 /* We send an ACK if we can now advertise a non-zero window
1095 * which has been raised "significantly".
1097 if(copied > 0) {
1098 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1099 __u32 rcv_window_now = tcp_receive_window(tp);
1100 __u32 new_window = __tcp_select_window(sk);
1102 /* We won't be raising the window any further than
1103 * the window-clamp allows. Our window selection
1104 * also keeps things a nice multiple of MSS. These
1105 * checks are necessary to prevent spurious ACKs
1106 * which don't advertise a larger window.
1108 if((new_window && (new_window >= rcv_window_now * 2)) &&
1109 ((rcv_window_now + tp->mss_cache) <= tp->window_clamp))
1110 tcp_read_wakeup(sk);
1114 /* Now socket state including sk->err is changed only under lock,
1115 hence we should check only pending signals.
1118 static void tcp_data_wait(struct sock *sk)
1120 DECLARE_WAITQUEUE(wait, current);
1122 add_wait_queue(sk->sleep, &wait);
1124 __set_current_state(TASK_INTERRUPTIBLE);
1126 sk->socket->flags |= SO_WAITDATA;
1127 release_sock(sk);
1129 if (skb_queue_empty(&sk->receive_queue))
1130 schedule();
1132 lock_sock(sk);
1133 sk->socket->flags &= ~SO_WAITDATA;
1135 remove_wait_queue(sk->sleep, &wait);
1136 __set_current_state(TASK_RUNNING);
1140 * This routine copies from a sock struct into the user buffer.
1143 int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
1144 int len, int nonblock, int flags, int *addr_len)
1146 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1147 int copied = 0;
1148 u32 peek_seq;
1149 volatile u32 *seq; /* So gcc doesn't overoptimise */
1150 unsigned long used;
1151 int err;
1152 int target = 1; /* Read at least this many bytes */
1154 lock_sock(sk);
1156 if (sk->err)
1157 goto out_err;
1159 err = -ENOTCONN;
1160 if (sk->state == TCP_LISTEN)
1161 goto out;
1163 /* Urgent data needs to be handled specially. */
1164 if (flags & MSG_OOB)
1165 goto recv_urg;
1167 /* Copying sequence to update. This is volatile to handle
1168 * the multi-reader case neatly (memcpy_to/fromfs might be
1169 * inline and thus not flush cached variables otherwise).
1171 peek_seq = tp->copied_seq;
1172 seq = &tp->copied_seq;
1173 if (flags & MSG_PEEK)
1174 seq = &peek_seq;
1176 /* Handle the POSIX bogosity MSG_WAITALL. */
1177 if (flags & MSG_WAITALL)
1178 target=len;
1182 * BUG BUG BUG
1183 * This violates 1003.1g compliance. We must wait for
1184 * data to exist even if we read none!
1187 while (len > 0) {
1188 struct sk_buff * skb;
1189 u32 offset;
1191 /* Are we at urgent data? Stop if we have read anything. */
1192 if (copied && tp->urg_data && tp->urg_seq == *seq)
1193 break;
1195 /* We need to check signals first, to get correct SIGURG
1196 * handling. FIXME: Need to check this doesn't impact 1003.1g
1197 * and move it down to the bottom of the loop
1199 if (signal_pending(current)) {
1200 if (copied)
1201 break;
1202 copied = -ERESTARTSYS;
1203 if (nonblock)
1204 copied = -EAGAIN;
1205 break;
1208 /* Next get a buffer. */
1210 skb = skb_peek(&sk->receive_queue);
1211 do {
1212 if (!skb)
1213 break;
1215 /* Now that we have two receive queues this
1216 * shouldn't happen.
1218 if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1219 printk(KERN_INFO "recvmsg bug: copied %X seq %X\n",
1220 *seq, TCP_SKB_CB(skb)->seq);
1221 break;
1223 offset = *seq - TCP_SKB_CB(skb)->seq;
1224 if (skb->h.th->syn)
1225 offset--;
1226 if (offset < skb->len)
1227 goto found_ok_skb;
1228 if (skb->h.th->fin)
1229 goto found_fin_ok;
1230 if (!(flags & MSG_PEEK))
1231 skb->used = 1;
1232 skb = skb->next;
1233 } while (skb != (struct sk_buff *)&sk->receive_queue);
1235 if (copied >= target)
1236 break;
1238 if (sk->err && !(flags&MSG_PEEK)) {
1239 if (!copied)
1240 copied = sock_error(sk);
1241 break;
1244 if (sk->shutdown & RCV_SHUTDOWN) {
1245 sk->done = 1;
1246 break;
1249 if (sk->state == TCP_CLOSE) {
1250 if (!sk->done) {
1251 sk->done = 1;
1252 break;
1254 if (!copied)
1255 copied = -ENOTCONN;
1256 break;
1259 if (nonblock) {
1260 copied = -EAGAIN;
1261 break;
1264 cleanup_rbuf(sk, copied);
1265 tcp_data_wait(sk);
1266 continue;
1268 found_ok_skb:
1269 /* Lock the buffer. We can be fairly relaxed as
1270 * an interrupt will never steal a buffer we are
1271 * using unless I've missed something serious in
1272 * tcp_data.
1274 atomic_inc(&skb->users);
1276 /* Ok so how much can we use? */
1277 used = skb->len - offset;
1278 if (len < used)
1279 used = len;
1281 /* Do we have urgent data here? */
1282 if (tp->urg_data) {
1283 u32 urg_offset = tp->urg_seq - *seq;
1284 if (urg_offset < used) {
1285 if (!urg_offset) {
1286 if (!sk->urginline) {
1287 ++*seq;
1288 offset++;
1289 used--;
1291 } else
1292 used = urg_offset;
1296 /* Copy it - We _MUST_ update *seq first so that we
1297 * don't ever double read when we have dual readers
1299 *seq += used;
1301 /* This memcpy_toiovec can sleep. If it sleeps and we
1302 * do a second read it relies on the skb->users to avoid
1303 * a crash when cleanup_rbuf() gets called.
1305 err = memcpy_toiovec(msg->msg_iov, ((unsigned char *)skb->h.th) + skb->h.th->doff*4 + offset, used);
1306 if (err) {
1307 /* Exception. Bailout! */
1308 atomic_dec(&skb->users);
1309 copied = -EFAULT;
1310 break;
1313 copied += used;
1314 len -= used;
1316 /* We now will not sleep again until we are finished
1317 * with skb. Sorry if you are doing the SMP port
1318 * but you'll just have to fix it neatly ;)
1320 * Very funny Alan... -DaveM
1322 atomic_dec(&skb->users);
1324 if (after(tp->copied_seq,tp->urg_seq))
1325 tp->urg_data = 0;
1326 if (used + offset < skb->len)
1327 continue;
1329 /* Process the FIN. We may also need to handle PSH
1330 * here and make it break out of MSG_WAITALL.
1332 if (skb->h.th->fin)
1333 goto found_fin_ok;
1334 if (flags & MSG_PEEK)
1335 continue;
1336 skb->used = 1;
1337 if (atomic_read(&skb->users) == 1)
1338 tcp_eat_skb(sk, skb);
1339 continue;
1341 found_fin_ok:
1342 ++*seq;
1343 if (flags & MSG_PEEK)
1344 break;
1346 /* All is done. */
1347 skb->used = 1;
1348 sk->shutdown |= RCV_SHUTDOWN;
1349 break;
1352 if (copied >= 0 && msg->msg_name)
1353 tp->af_specific->addr2sockaddr(sk, (struct sockaddr *)
1354 msg->msg_name);
1356 if(addr_len)
1357 *addr_len = tp->af_specific->sockaddr_len;
1359 /* Clean up data we have read: This will do ACK frames. */
1360 cleanup_rbuf(sk, copied);
1361 release_sock(sk);
1362 return copied;
1364 out_err:
1365 err = sock_error(sk);
1367 out:
1368 release_sock(sk);
1369 return err;
1371 recv_urg:
1372 err = tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len);
1373 goto out;
1377 * Check whether to renew the timer.
1379 static inline void tcp_check_fin_timer(struct sock *sk)
1381 if (sk->state == TCP_FIN_WAIT2)
1382 tcp_reset_keepalive_timer(sk, sysctl_tcp_fin_timeout);
1386 * State processing on a close. This implements the state shift for
1387 * sending our FIN frame. Note that we only send a FIN for some
1388 * states. A shutdown() may have already sent the FIN, or we may be
1389 * closed.
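 * Each entry below is the next state, optionally OR'd with TCP_ACTION_FIN
 * when entering it means a FIN must be transmitted; tcp_close_state()
 * separates the two again using TCP_STATE_MASK.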
1392 static unsigned char new_state[16] = {
1393 /* current state: new state: action: */
1394 /* (Invalid) */ TCP_CLOSE,
1395 /* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1396 /* TCP_SYN_SENT */ TCP_CLOSE,
1397 /* TCP_SYN_RECV */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1398 /* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1,
1399 /* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2,
1400 /* TCP_TIME_WAIT */ TCP_CLOSE,
1401 /* TCP_CLOSE */ TCP_CLOSE,
1402 /* TCP_CLOSE_WAIT */ TCP_LAST_ACK | TCP_ACTION_FIN,
1403 /* TCP_LAST_ACK */ TCP_LAST_ACK,
1404 /* TCP_LISTEN */ TCP_CLOSE,
1405 /* TCP_CLOSING */ TCP_CLOSING,
1408 static int tcp_close_state(struct sock *sk, int dead)
1410 int next = (int) new_state[sk->state];
1411 int ns = (next & TCP_STATE_MASK);
1413 tcp_set_state(sk, ns);
1415 /* This is a (useful) BSD violation of the RFC. There is a
1416 * problem with TCP as specified in that the other end could
1417 * keep a socket open forever with no application left this end.
1418 * We use a 3 minute timeout (about the same as BSD) then kill
1419 * our end. If they send after that then tough - BUT: long enough
1420 * that we won't make the old 4*rto = almost no time - whoops
1421 * reset mistake.
1423 if (dead)
1424 tcp_check_fin_timer(sk);
1426 return (next & TCP_ACTION_FIN);
1430 * Shutdown the sending side of a connection. Much like close except
1431 * that we don't receive shut down or set sk->dead.
1434 void tcp_shutdown(struct sock *sk, int how)
1436 /* We need to grab some memory, and put together a FIN,
1437 * and then put it into the queue to be sent.
1438 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1440 if (!(how & SEND_SHUTDOWN))
1441 return;
1443 /* If we've already sent a FIN, or it's a closed state, skip this. */
1444 if ((1 << sk->state) &
1445 (TCPF_ESTABLISHED|TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE_WAIT)) {
1447 /* Clear out any half completed packets. FIN if needed. */
1448 if (tcp_close_state(sk,0))
1449 tcp_send_fin(sk);
1455 * Return 1 if we still have things to send in our buffers.
1458 static inline int closing(struct sock * sk)
1460 return ((1 << sk->state) & (TCPF_FIN_WAIT1|TCPF_CLOSING|TCPF_LAST_ACK));
1464 * This routine closes sockets which have been at least partially
1465 * opened, but not yet accepted. Currently it is only called by
1466 * tcp_close.
1469 static void tcp_close_pending (struct sock *sk)
1471 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1472 struct open_request *req = tp->syn_wait_queue;
1474 while(req) {
1475 struct open_request *iter;
1477 if (req->sk)
1478 tcp_close(req->sk, 0);
1480 iter = req;
1481 req = req->dl_next;
1483 if (iter->sk) {
1484 sk->ack_backlog--;
1485 } else {
1486 tcp_dec_slow_timer(TCP_SLT_SYNACK);
1487 tp->syn_backlog--;
1489 (*iter->class->destructor)(iter);
1490 tcp_openreq_free(iter);
1492 BUG_TRAP(tp->syn_backlog == 0);
1493 BUG_TRAP(sk->ack_backlog == 0);
1494 tcp_synq_init(tp);
1497 static __inline__ void tcp_kill_sk_queues(struct sock *sk)
1499 /* First the read buffer. */
1500 skb_queue_purge(&sk->receive_queue);
1502 /* Next, the error queue. */
1503 skb_queue_purge(&sk->error_queue);
1505 /* Next, the write queue. */
1506 BUG_TRAP(skb_queue_empty(&sk->write_queue));
1508 /* It is _impossible_ for the backlog to contain anything
1509 * when we get here. All user references to this socket
1510 * have gone away; only the net layer can still touch it.
1515 * At this point, there should be no process reference to this
1516 * socket, and thus no user references at all. Therefore we
1517 * can assume the socket waitqueue is inactive and nobody will
1518 * try to jump onto it.
1520 void tcp_destroy_sock(struct sock *sk)
1522 BUG_TRAP(sk->state==TCP_CLOSE);
1523 BUG_TRAP(sk->dead);
1525 /* It cannot be in hash table! */
1526 BUG_TRAP(sk->pprev==NULL);
1528 * If it has a non-zero sk->num, it must be bound
1529 BUG_TRAP(!sk->num || sk->prev!=NULL);
1531 sk->prot->destroy(sk);
1533 tcp_kill_sk_queues(sk);
1535 #ifdef INET_REFCNT_DEBUG
1536 if (atomic_read(&sk->refcnt) != 1) {
1537 printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n", sk, atomic_read(&sk->refcnt));
1539 #endif
1541 sock_put(sk);
1544 void tcp_close(struct sock *sk, long timeout)
1546 struct sk_buff *skb;
1547 int data_was_unread = 0;
1549 lock_sock(sk);
1550 if(sk->state == TCP_LISTEN) {
1551 tcp_set_state(sk, TCP_CLOSE);
1553 /* Special case. */
1554 tcp_close_pending(sk);
1556 goto adjudge_to_death;
1559 sk->shutdown = SHUTDOWN_MASK;
1561 /* We need to flush the recv. buffs. We do this only on the
1562 * descriptor close, not protocol-sourced closes, because the
1563 * reader process may not have drained the data yet!
1565 while((skb=__skb_dequeue(&sk->receive_queue))!=NULL) {
1566 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - skb->h.th->fin;
1567 data_was_unread += len;
1568 kfree_skb(skb);
1571 /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1572 * 3.10, we send a RST here because data was lost. To
1573 * witness the awful effects of the old behavior of always
1574 * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1575 * a bulk GET in an FTP client, suspend the process, wait
1576 * for the client to advertise a zero window, then kill -9
1577 * the FTP client, wheee... Note: timeout is always zero
1578 * in such a case.
1580 if(data_was_unread != 0) {
1581 /* Unread data was tossed, zap the connection. */
1582 tcp_set_state(sk, TCP_CLOSE);
1583 tcp_send_active_reset(sk, GFP_KERNEL);
1584 } else if (tcp_close_state(sk,1)) {
1585 /* We FIN if the application ate all the data before
1586 * zapping the connection.
1588 tcp_send_fin(sk);
1591 if (timeout) {
1592 struct task_struct *tsk = current;
1593 DECLARE_WAITQUEUE(wait, current);
1595 add_wait_queue(sk->sleep, &wait);
1597 while (1) {
1598 set_current_state(TASK_INTERRUPTIBLE);
1599 if (!closing(sk))
1600 break;
1601 release_sock(sk);
1602 timeout = schedule_timeout(timeout);
1603 lock_sock(sk);
1604 if (!signal_pending(tsk) || timeout)
1605 break;
1608 tsk->state = TASK_RUNNING;
1609 remove_wait_queue(sk->sleep, &wait);
1612 /* Now that the socket is dead, if we are in the FIN_WAIT2 state
1613 * we may need to set up a timer.
1615 tcp_check_fin_timer(sk);
1617 adjudge_to_death:
1618 /* It is the last release_sock in its life. It will remove backlog. */
1619 release_sock(sk);
1622 /* Now socket is owned by kernel and we acquire BH lock
1623 to finish close. No need to check for user refs.
1625 local_bh_disable();
1626 bh_lock_sock(sk);
1627 BUG_TRAP(sk->lock.users==0);
1629 sock_hold(sk);
1631 /* Announce socket dead, detach it from wait queue and inode. */
1632 write_lock_irq(&sk->callback_lock);
1633 sk->dead = 1;
1634 sk->socket = NULL;
1635 sk->sleep = NULL;
1636 write_unlock_irq(&sk->callback_lock);
1638 if (sk->state == TCP_CLOSE)
1639 tcp_destroy_sock(sk);
1640 /* Otherwise, socket is reprieved until protocol close. */
1642 bh_unlock_sock(sk);
1643 local_bh_enable();
1644 sock_put(sk);
1647 int tcp_disconnect(struct sock *sk, int flags)
1649 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1650 int old_state;
1651 int err = 0;
1653 old_state = sk->state;
1654 if (old_state != TCP_CLOSE)
1655 tcp_set_state(sk, TCP_CLOSE);
1657 /* ABORT function of RFC793 */
1658 if (old_state == TCP_LISTEN) {
1659 tcp_close_pending(sk);
1660 } else if (tcp_connected(old_state)) {
1661 tcp_send_active_reset(sk, GFP_KERNEL);
1662 sk->err = ECONNRESET;
1663 } else if (old_state == TCP_SYN_SENT)
1664 sk->err = ECONNRESET;
1666 tcp_clear_xmit_timers(sk);
1667 __skb_queue_purge(&sk->receive_queue);
1668 __skb_queue_purge(&sk->write_queue);
1669 __skb_queue_purge(&tp->out_of_order_queue);
1671 sk->dport = 0;
1673 sk->rcv_saddr = 0;
1674 sk->saddr = 0;
1675 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1676 memset(&sk->net_pinfo.af_inet6.saddr, 0, 16);
1677 memset(&sk->net_pinfo.af_inet6.rcv_saddr, 0, 16);
1678 #endif
1680 sk->zapped = 0;
1681 sk->shutdown = 0;
1682 sk->done = 0;
1683 sk->write_space = tcp_write_space;
1684 tp->srtt = 0;
1685 #ifdef CONFIG_TCP_TW_RECYCLE
1686 if ((tp->write_seq += 2) == 0)
1687 tp->write_seq = 1;
1688 #else
1689 tp->write_seq = 0;
1690 #endif
1691 tp->ato = 0;
1692 tp->backoff = 0;
1693 tp->snd_cwnd = 2;
1694 tp->probes_out = 0;
1695 tp->high_seq = 0;
1696 tp->snd_ssthresh = 0x7fffffff;
1697 tp->snd_cwnd_cnt = 0;
1698 tp->dup_acks = 0;
1699 tp->delayed_acks = 0;
1700 tp->send_head = tp->retrans_head = NULL;
1701 tp->saw_tstamp = 0;
1702 __sk_dst_reset(sk);
1704 BUG_TRAP(!sk->num || sk->prev);
1706 sk->error_report(sk);
1707 return err;
1711 * Wait for an incoming connection, avoid race
1712 * conditions. This must be called with the socket locked,
1713 * and without the kernel lock held.
1715 static struct open_request * wait_for_connect(struct sock * sk,
1716 struct open_request **pprev)
1718 DECLARE_WAITQUEUE(wait, current);
1719 struct open_request *req;
1722 * True wake-one mechanism for incoming connections: only
1723 * one process gets woken up, not the 'whole herd'.
1724 * Since we do not 'race & poll' for established sockets
1725 * anymore, the common case will execute the loop only once.
1727 * Subtle issue: "add_wait_queue_exclusive()" will be added
1728 * after any current non-exclusive waiters, and we know that
1729 * it will always _stay_ after any new non-exclusive waiters
1730 * because all non-exclusive waiters are added at the
1731 * beginning of the wait-queue. As such, it's ok to "drop"
1732 * our exclusiveness temporarily when we get woken up without
1733 * having to remove and re-insert us on the wait queue.
1735 add_wait_queue_exclusive(sk->sleep, &wait);
1736 for (;;) {
1737 current->state = TASK_EXCLUSIVE | TASK_INTERRUPTIBLE;
1738 release_sock(sk);
1739 schedule();
1740 lock_sock(sk);
1741 req = tcp_find_established(&(sk->tp_pinfo.af_tcp), pprev);
1742 if (req)
1743 break;
1744 if (signal_pending(current))
1745 break;
1747 current->state = TASK_RUNNING;
1748 remove_wait_queue(sk->sleep, &wait);
1749 return req;
1753 * This will accept the next outstanding connection.
1755 * Be careful about race conditions here - this is subtle.
1758 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
1760 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1761 struct open_request *req, *prev;
1762 struct sock *newsk;
1763 int error;
1765 lock_sock(sk);
1767 /* We need to make sure that this socket is listening,
1768 * and that it has something pending.
1770 error = -EINVAL;
1771 if (sk->state != TCP_LISTEN)
1772 goto out;
1774 /* Find already established connection */
1775 req = tcp_find_established(tp, &prev);
1776 if (!req) {
1777 /* If this is a non blocking socket don't sleep */
1778 error = -EAGAIN;
1779 if (flags & O_NONBLOCK)
1780 goto out;
1782 error = -ERESTARTSYS;
1783 req = wait_for_connect(sk, &prev);
1784 if (!req)
1785 goto out;
1788 tcp_synq_unlink(tp, req, prev);
1789 newsk = req->sk;
1790 req->class->destructor(req);
1791 tcp_openreq_free(req);
1792 sk->ack_backlog--;
1793 release_sock(sk);
1794 return newsk;
1796 out:
1797 release_sock(sk);
1798 *err = error;
1799 return NULL;
1803 * Socket option code for TCP.
1806 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval,
1807 int optlen)
1809 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1810 int val;
1811 int err = 0;
1813 if (level != SOL_TCP)
1814 return tp->af_specific->setsockopt(sk, level, optname,
1815 optval, optlen);
1817 if(optlen<sizeof(int))
1818 return -EINVAL;
1820 if (get_user(val, (int *)optval))
1821 return -EFAULT;
1823 lock_sock(sk);
1825 switch(optname) {
1826 case TCP_MAXSEG:
1827 /* values greater than interface MTU won't take effect. however at
1828 * the point when this call is done we typically don't yet know
1829 * which interface is going to be used
1831 if(val < 1 || val > MAX_WINDOW) {
1832 err = -EINVAL;
1833 break;
1835 tp->user_mss = val;
1836 break;
1838 case TCP_NODELAY:
1839 /* You cannot try to use this and TCP_CORK in
1840 * tandem, so let the user know.
1842 if (sk->nonagle == 2) {
1843 err = -EINVAL;
1844 break;
1846 sk->nonagle = (val == 0) ? 0 : 1;
1847 break;
1849 case TCP_CORK:
1850 /* When set indicates to always queue non-full frames.
1851 * Later the user clears this option and we transmit
1852 * any pending partial frames in the queue. This is
1853 * meant to be used alongside sendfile() to get properly
1854 * filled frames when the user (for example) must write
1855 * out headers with a write() call first and then use
1856 * sendfile to send out the data parts.
1858 * You cannot try to use TCP_NODELAY and this mechanism
1859 * at the same time, so let the user know.
1861 if (sk->nonagle == 1) {
1862 err = -EINVAL;
1863 break;
1865 if (val != 0) {
1866 sk->nonagle = 2;
1867 } else {
1868 sk->nonagle = 0;
1870 tcp_push_pending_frames(sk, tp);
1872 break;
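/* Userspace sketch of the intended TCP_CORK usage described above
 * (hypothetical fd/filefd/hdr values; <netinet/tcp.h> and sendfile()
 * assumed):
 *
 *	int on = 1, off = 0;
 *	setsockopt(fd, SOL_TCP, TCP_CORK, &on, sizeof(on));
 *	write(fd, hdr, hdrlen);			   queued, not sent as a runt frame
 *	sendfile(fd, filefd, NULL, filelen);	   goes out in full-sized frames
 *	setsockopt(fd, SOL_TCP, TCP_CORK, &off, sizeof(off));	   flush the rest
 */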
1874 case TCP_KEEPIDLE:
1875 if (val < 1 || val > MAX_TCP_KEEPIDLE)
1876 err = -EINVAL;
1877 else {
1878 tp->keepalive_time = val * HZ;
1879 if (sk->keepopen) {
1880 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
1881 if (tp->keepalive_time > elapsed)
1882 elapsed = tp->keepalive_time - elapsed;
1883 else
1884 elapsed = 0;
1885 tcp_reset_keepalive_timer(sk, elapsed);
1888 break;
1889 case TCP_KEEPINTVL:
1890 if (val < 1 || val > MAX_TCP_KEEPINTVL)
1891 err = -EINVAL;
1892 else
1893 tp->keepalive_intvl = val * HZ;
1894 break;
1895 case TCP_KEEPCNT:
1896 if (val < 1 || val > MAX_TCP_KEEPCNT)
1897 err = -EINVAL;
1898 else
1899 tp->keepalive_probes = val;
1900 break;
1901 case TCP_SYNCNT:
1902 if (val < 1 || val > MAX_TCP_SYNCNT)
1903 err = -EINVAL;
1904 else
1905 tp->syn_retries = val;
1906 break;
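/* Userspace sketch (hypothetical values; <netinet/tcp.h> assumed): enable
 * keepalive and tune it per connection with the options handled above.
 *
 *	int on = 1, idle = 600, intvl = 60, cnt = 5;
 *	setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
 *	setsockopt(fd, SOL_TCP, TCP_KEEPIDLE,  &idle,  sizeof(idle));
 *	setsockopt(fd, SOL_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
 *	setsockopt(fd, SOL_TCP, TCP_KEEPCNT,   &cnt,   sizeof(cnt));
 */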
1908 default:
1909 err = -ENOPROTOOPT;
1910 break;
1912 release_sock(sk);
1913 return err;
1916 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,
1917 int *optlen)
1919 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1920 int val, len;
1922 if(level != SOL_TCP)
1923 return tp->af_specific->getsockopt(sk, level, optname,
1924 optval, optlen);
1926 if(get_user(len,optlen))
1927 return -EFAULT;
1929 len = min(len, sizeof(int));
1931 switch(optname) {
1932 case TCP_MAXSEG:
1933 val = tp->user_mss;
1934 break;
1935 case TCP_NODELAY:
1936 val = (sk->nonagle == 1);
1937 break;
1938 case TCP_CORK:
1939 val = (sk->nonagle == 2);
1940 break;
1941 case TCP_KEEPIDLE:
1942 if (tp->keepalive_time)
1943 val = tp->keepalive_time / HZ;
1944 else
1945 val = sysctl_tcp_keepalive_time / HZ;
1946 break;
1947 case TCP_KEEPINTVL:
1948 if (tp->keepalive_intvl)
1949 val = tp->keepalive_intvl / HZ;
1950 else
1951 val = sysctl_tcp_keepalive_intvl / HZ;
1952 break;
1953 case TCP_KEEPCNT:
1954 if (tp->keepalive_probes)
1955 val = tp->keepalive_probes;
1956 else
1957 val = sysctl_tcp_keepalive_probes;
1958 break;
1959 case TCP_SYNCNT:
1960 if (tp->syn_retries)
1961 val = tp->syn_retries;
1962 else
1963 val = sysctl_tcp_syn_retries;
1964 break;
1965 default:
1966 return -ENOPROTOOPT;
1969 if(put_user(len, optlen))
1970 return -EFAULT;
1971 if(copy_to_user(optval, &val,len))
1972 return -EFAULT;
1973 return 0;
1977 extern void __skb_cb_too_small_for_tcp(int, int);
1979 void __init tcp_init(void)
1981 struct sk_buff *skb = NULL;
1982 unsigned long goal;
1983 int order, i;
1985 if(sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
1986 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
1987 sizeof(skb->cb));
1989 tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
1990 sizeof(struct open_request),
1991 0, SLAB_HWCACHE_ALIGN,
1992 NULL, NULL);
1993 if(!tcp_openreq_cachep)
1994 panic("tcp_init: Cannot alloc open_request cache.");
1996 tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
1997 sizeof(struct tcp_bind_bucket),
1998 0, SLAB_HWCACHE_ALIGN,
1999 NULL, NULL);
2000 if(!tcp_bucket_cachep)
2001 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2003 tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2004 sizeof(struct tcp_tw_bucket),
2005 0, SLAB_HWCACHE_ALIGN,
2006 NULL, NULL);
2007 if(!tcp_timewait_cachep)
2008 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2010 /* Size and allocate the main established and bind bucket
2011 * hash tables.
2013 * The methodology is similar to that of the buffer cache.
2015 goal = num_physpages >> (23 - PAGE_SHIFT);
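	/* Illustration (assuming 4 kB pages, i.e. PAGE_SHIFT == 12): a 128 MB
	 * machine has num_physpages == 32768, so goal == 32768 >> 11 == 16
	 * pages, roughly one page of hash table per 8 MB of memory.
	 */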
2017 for(order = 0; (1UL << order) < goal; order++)
2019 do {
2020 tcp_ehash_size = (1UL << order) * PAGE_SIZE /
2021 sizeof(struct tcp_ehash_bucket);
2022 tcp_ehash_size >>= 1;
2023 while (tcp_ehash_size & (tcp_ehash_size-1))
2024 tcp_ehash_size--;
2025 tcp_ehash = (struct tcp_ehash_bucket *)
2026 __get_free_pages(GFP_ATOMIC, order);
2027 } while (tcp_ehash == NULL && --order > 0);
2029 if (!tcp_ehash)
2030 panic("Failed to allocate TCP established hash table\n");
2031 for (i = 0; i < (tcp_ehash_size<<1); i++) {
2032 tcp_ehash[i].lock = RW_LOCK_UNLOCKED;
2033 tcp_ehash[i].chain = NULL;
2036 do {
2037 tcp_bhash_size = (1UL << order) * PAGE_SIZE /
2038 sizeof(struct tcp_bind_hashbucket);
2039 if ((tcp_bhash_size > (64 * 1024)) && order > 0)
2040 continue;
2041 tcp_bhash = (struct tcp_bind_hashbucket *)
2042 __get_free_pages(GFP_ATOMIC, order);
2043 } while (tcp_bhash == NULL && --order >= 0);
2045 if (!tcp_bhash)
2046 panic("Failed to allocate TCP bind hash table\n");
2047 for (i = 0; i < tcp_bhash_size; i++) {
2048 tcp_bhash[i].lock = SPIN_LOCK_UNLOCKED;
2049 tcp_bhash[i].chain = NULL;
2052 if (order > 4) {
2053 sysctl_local_port_range[0] = 32768;
2054 sysctl_local_port_range[1] = 61000;
2055 } else if (order < 3) {
2056 sysctl_local_port_range[0] = 1024*(3-order);
2058 tcp_port_rover = sysctl_local_port_range[0] - 1;
2060 printk("TCP: Hash tables configured (established %d bind %d)\n",
2061 tcp_ehash_size<<1, tcp_bhash_size);