net/ipv4/tcp.c (davej-history.git, import of 2.4.0-test3pre8, blob dbf680233fc4e9a5808f7731f1fb6c7684964f6b)
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol (TCP).
8 * Version: $Id: tcp.c,v 1.170 2000/07/08 00:20:43 davem Exp $
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
22 * Fixes:
23 * Alan Cox : Numerous verify_area() calls
24 * Alan Cox : Set the ACK bit on a reset
25 * Alan Cox : Stopped it crashing if it closed while
26 * sk->inuse=1 and was trying to connect
27 * (tcp_err()).
28 * Alan Cox : All icmp error handling was broken
29 * pointers passed were wrong and the
30 * socket was looked up backwards. Nobody
31 * tested any icmp error code obviously.
32 * Alan Cox : tcp_err() now handled properly. It
33 * wakes people on errors. poll
34 * behaves and the icmp error race
35 * has gone by moving it into sock.c
36 * Alan Cox : tcp_send_reset() fixed to work for
37 * everything not just packets for
38 * unknown sockets.
39 * Alan Cox : tcp option processing.
40 * Alan Cox : Reset tweaked (still not 100%) [Had
41 * syn rule wrong]
42 * Herp Rosmanith : More reset fixes
43 * Alan Cox : No longer acks invalid rst frames.
44 * Acking any kind of RST is right out.
45 * Alan Cox : Sets an ignore me flag on an rst
46 * receive otherwise odd bits of prattle
47 * escape still
48 * Alan Cox : Fixed another acking RST frame bug.
49 * Should stop LAN workplace lockups.
50 * Alan Cox : Some tidyups using the new skb list
51 * facilities
52 * Alan Cox : sk->keepopen now seems to work
53 * Alan Cox : Pulls options out correctly on accepts
54 * Alan Cox : Fixed assorted sk->rqueue->next errors
55 * Alan Cox : PSH doesn't end a TCP read. Switched a
56 * bit to skb ops.
57 * Alan Cox : Tidied tcp_data to avoid a potential
58 * nasty.
59 * Alan Cox : Added some better commenting, as the
60 * tcp is hard to follow
61 * Alan Cox : Removed incorrect check for 20 * psh
62 * Michael O'Reilly : ack < copied bug fix.
63 * Johannes Stille : Misc tcp fixes (not all in yet).
64 * Alan Cox : FIN with no memory -> CRASH
65 * Alan Cox : Added socket option proto entries.
66 * Also added awareness of them to accept.
67 * Alan Cox : Added TCP options (SOL_TCP)
68 * Alan Cox : Switched wakeup calls to callbacks,
69 * so the kernel can layer network
70 * sockets.
71 * Alan Cox : Use ip_tos/ip_ttl settings.
72 * Alan Cox : Handle FIN (more) properly (we hope).
73 * Alan Cox : RST frames sent on unsynchronised
74 * state ack error.
75 * Alan Cox : Put in missing check for SYN bit.
76 * Alan Cox : Added tcp_select_window() aka NET2E
77 * window non shrink trick.
78 * Alan Cox : Added a couple of small NET2E timer
79 * fixes
80 * Charles Hedrick : TCP fixes
81 * Toomas Tamm : TCP window fixes
82 * Alan Cox : Small URG fix to rlogin ^C ack fight
83 * Charles Hedrick : Rewrote most of it to actually work
84 * Linus : Rewrote tcp_read() and URG handling
85 * completely
86 * Gerhard Koerting: Fixed some missing timer handling
87 * Matthew Dillon : Reworked TCP machine states as per RFC
88 * Gerhard Koerting: PC/TCP workarounds
89 * Adam Caldwell : Assorted timer/timing errors
90 * Matthew Dillon : Fixed another RST bug
91 * Alan Cox : Move to kernel side addressing changes.
92 * Alan Cox : Beginning work on TCP fastpathing
93 * (not yet usable)
94 * Arnt Gulbrandsen: Turbocharged tcp_check() routine.
95 * Alan Cox : TCP fast path debugging
96 * Alan Cox : Window clamping
97 * Michael Riepe : Bug in tcp_check()
98 * Matt Dillon : More TCP improvements and RST bug fixes
99 * Matt Dillon : Yet more small nasties removed from the
100 * TCP code (Be very nice to this man if
101 * tcp finally works 100%) 8)
102 * Alan Cox : BSD accept semantics.
103 * Alan Cox : Reset on closedown bug.
104 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto().
105 * Michael Pall : Handle poll() after URG properly in
106 * all cases.
107 * Michael Pall : Undo the last fix in tcp_read_urg()
108 * (multi URG PUSH broke rlogin).
109 * Michael Pall : Fix the multi URG PUSH problem in
110 * tcp_readable(), poll() after URG
111 * works now.
112 * Michael Pall : recv(...,MSG_OOB) never blocks in the
113 * BSD api.
114 * Alan Cox : Changed the semantics of sk->socket to
115 * fix a race and a signal problem with
116 * accept() and async I/O.
117 * Alan Cox : Relaxed the rules on tcp_sendto().
118 * Yury Shevchuk : Really fixed accept() blocking problem.
119 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for
120 * clients/servers which listen in on
121 * fixed ports.
122 * Alan Cox : Cleaned the above up and shrank it to
123 * a sensible code size.
124 * Alan Cox : Self connect lockup fix.
125 * Alan Cox : No connect to multicast.
126 * Ross Biro : Close unaccepted children on master
127 * socket close.
128 * Alan Cox : Reset tracing code.
129 * Alan Cox : Spurious resets on shutdown.
130 * Alan Cox : Giant 15 minute/60 second timer error
131 * Alan Cox : Small whoops in polling before an
132 * accept.
133 * Alan Cox : Kept the state trace facility since
134 * it's handy for debugging.
135 * Alan Cox : More reset handler fixes.
136 * Alan Cox : Started rewriting the code based on
137 * the RFC's for other useful protocol
138 * references see: Comer, KA9Q NOS, and
139 * for a reference on the difference
140 * between specifications and how BSD
141 * works see the 4.4lite source.
142 * A.N.Kuznetsov : Don't time wait on completion of tidy
143 * close.
144 * Linus Torvalds : Fin/Shutdown & copied_seq changes.
145 * Linus Torvalds : Fixed BSD port reuse to work first syn
146 * Alan Cox : Reimplemented timers as per the RFC
147 * and using multiple timers for sanity.
148 * Alan Cox : Small bug fixes, and a lot of new
149 * comments.
150 * Alan Cox : Fixed dual reader crash by locking
151 * the buffers (much like datagram.c)
152 * Alan Cox : Fixed stuck sockets in probe. A probe
153 * now gets fed up of retrying without
154 * (even a no space) answer.
155 * Alan Cox : Extracted closing code better
156 * Alan Cox : Fixed the closing state machine to
157 * resemble the RFC.
158 * Alan Cox : More 'per spec' fixes.
159 * Jorge Cwik : Even faster checksumming.
160 * Alan Cox : tcp_data() doesn't ack illegal PSH
161 * only frames. At least one pc tcp stack
162 * generates them.
163 * Alan Cox : Cache last socket.
164 * Alan Cox : Per route irtt.
165 * Matt Day : poll()->select() match BSD precisely on error
166 * Alan Cox : New buffers
167 * Marc Tamsky : Various sk->prot->retransmits and
168 * sk->retransmits misupdating fixed.
169 * Fixed tcp_write_timeout: stuck close,
170 * and TCP syn retries gets used now.
171 * Mark Yarvis : In tcp_read_wakeup(), don't send an
172 * ack if state is TCP_CLOSED.
173 * Alan Cox : Look up device on a retransmit - routes may
174 * change. Doesn't yet cope with MSS shrink right,
175 * but it's a start!
176 * Marc Tamsky : Closing in closing fixes.
177 * Mike Shaver : RFC1122 verifications.
178 * Alan Cox : rcv_saddr errors.
179 * Alan Cox : Block double connect().
180 * Alan Cox : Small hooks for enSKIP.
181 * Alexey Kuznetsov: Path MTU discovery.
182 * Alan Cox : Support soft errors.
183 * Alan Cox : Fix MTU discovery pathological case
184 * when the remote claims no mtu!
185 * Marc Tamsky : TCP_CLOSE fix.
186 * Colin (G3TNE) : Send a reset on syn ack replies in
187 * window but wrong (fixes NT lpd problems)
188 * Pedro Roque : Better TCP window handling, delayed ack.
189 * Joerg Reuter : No modification of locked buffers in
190 * tcp_do_retransmit()
191 * Eric Schenk : Changed receiver side silly window
192 * avoidance algorithm to BSD style
193 * algorithm. This doubles throughput
194 * against machines running Solaris,
195 * and seems to result in general
196 * improvement.
197 * Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD
198 * Willy Konynenberg : Transparent proxying support.
199 * Mike McLagan : Routing by source
200 * Keith Owens : Do proper merging with partial SKB's in
201 * tcp_do_sendmsg to avoid burstiness.
202 * Eric Schenk : Fix fast close down bug with
203 * shutdown() followed by close().
204 * Andi Kleen : Make poll agree with SIGIO
205 * Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and
206 * lingertime == 0 (RFC 793 ABORT Call)
208 * This program is free software; you can redistribute it and/or
209 * modify it under the terms of the GNU General Public License
210 * as published by the Free Software Foundation; either version
211 * 2 of the License, or (at your option) any later version.
213 * Description of States:
215 * TCP_SYN_SENT sent a connection request, waiting for ack
217 * TCP_SYN_RECV received a connection request, sent ack,
218 * waiting for final ack in three-way handshake.
220 * TCP_ESTABLISHED connection established
222 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete
223 * transmission of remaining buffered data
225 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote
226 * to shutdown
228 * TCP_CLOSING both sides have shutdown but we still have
229 * data we have to finish sending
231 * TCP_TIME_WAIT timeout to catch resent junk before entering
232 * closed, can only be entered from FIN_WAIT2
233 * or CLOSING. Required because the other end
234 * may not have gotten our last ACK causing it
235 * to retransmit the data packet (which we ignore)
237 * TCP_CLOSE_WAIT remote side has shutdown and is waiting for
238 * us to finish writing our data and to shutdown
239 * (we have to close() to move on to LAST_ACK)
241 * TCP_LAST_ACK our side has shutdown after remote has
242 * shutdown. There may still be data in our
243 * buffer that we have to finish sending
245 * TCP_CLOSE socket is finished
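 *
 *	Illustrative note: each TCP_xxx state above has a matching TCPF_xxx
 *	bit, TCPF_xxx == (1 << TCP_xxx), so the code below tests membership
 *	in a set of states with a single mask operation, e.g.
 *
 *		if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
 *			err = wait_for_tcp_connect(sk, flags, &timeo);
 *
 *	This is a sketch of the idiom only; the real call sites also handle
 *	errors and timeouts.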
249 * RFC1122 status:
250 * NOTE: I'm not going to be doing comments in the code for this one except
251 * for violations and the like. tcp.c is just too big... If I say something
252 * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out
253 * with Alan. -- MS 950903
254 * [Note: Most of the TCP code has been rewritten/redesigned since this
255 * RFC1122 check. It is probably not correct anymore. It should be redone
256 * before 2.2. -AK]
258 * Use of PSH (4.2.2.2)
259 * MAY aggregate data sent without the PSH flag. (does)
260 * MAY queue data received without the PSH flag. (does)
261 * SHOULD collapse successive PSH flags when it packetizes data. (doesn't)
262 * MAY implement PSH on send calls. (doesn't, thus:)
263 * MUST NOT buffer data indefinitely (doesn't [1 second])
264 * MUST set PSH on last segment (does)
265 * MAY pass received PSH to application layer (doesn't)
266 * SHOULD send maximum-sized segment whenever possible. (almost always does)
268 * Window Size (4.2.2.3, 4.2.2.16)
269 * MUST treat window size as an unsigned number (does)
270 * SHOULD treat window size as a 32-bit number (does not)
271 * MUST NOT shrink window once it is offered (does not normally)
273 * Urgent Pointer (4.2.2.4)
274 * **MUST point urgent pointer to last byte of urgent data (not right
275 * after). (doesn't, to be like BSD. That's configurable, but defaults
276 * to off)
277 * MUST inform application layer asynchronously of incoming urgent
278 * data. (does)
279 * MUST provide application with means of determining the amount of
280 * urgent data pending. (does)
281 * **MUST support urgent data sequence of arbitrary length. (doesn't, but
282 * it's sort of tricky to fix, as urg_ptr is a 16-bit quantity)
283 * [Follows BSD 1 byte of urgent data]
285 * TCP Options (4.2.2.5)
286 * MUST be able to receive TCP options in any segment. (does)
287 * MUST ignore unsupported options (does)
289 * Maximum Segment Size Option (4.2.2.6)
290 * MUST implement both sending and receiving MSS. (does, but currently
291 * only uses the smaller of both of them)
292 * SHOULD send an MSS with every SYN where receive MSS != 536 (MAY send
293 * it always). (does, even when MSS == 536, which is legal)
294 * MUST assume MSS == 536 if no MSS received at connection setup (does)
295 * MUST calculate "effective send MSS" correctly:
296 * min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts)
297 * (does - but allows operator override)
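 *
 *	Illustration of the "effective send MSS" rule above, as a sketch with
 *	hypothetical variable names rather than the kernel's own:
 *
 *		eff_mss = min(physical_mtu, remote_mss + 20)
 *				- sizeof(struct tcphdr) - ip_options_len;
 *
 *	e.g. for a 1500 byte MTU, remote MSS 1460 and no IP options this is
 *	min(1500, 1480) - 20 - 0 = 1460 bytes of payload per segment.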
299 * TCP Checksum (4.2.2.7)
300 * MUST generate and check TCP checksum. (does)
302 * Initial Sequence Number Selection (4.2.2.8)
303 * MUST use the RFC 793 clock selection mechanism. (doesn't, but it's
304 * OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is
305 * necessary for 10Mbps networks - and harder than BSD to spoof!
306 * With syncookies we don't)
308 * Simultaneous Open Attempts (4.2.2.10)
309 * MUST support simultaneous open attempts (does)
311 * Recovery from Old Duplicate SYN (4.2.2.11)
312 * MUST keep track of active vs. passive open (does)
314 * RST segment (4.2.2.12)
315 * SHOULD allow an RST segment to contain data (does, but doesn't do
316 * anything with it, which is standard)
318 * Closing a Connection (4.2.2.13)
319 * MUST inform application of whether connection was closed by RST or
320 * normal close. (does)
321 * MAY allow "half-duplex" close (treat connection as closed for the
322 * local app, even before handshake is done). (does)
323 * MUST linger in TIME_WAIT for 2 * MSL (does)
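 *
 *	Illustration (user space, assuming a connected TCP socket "fd"):
 *	the RFC 793 ABORT call mentioned in the changelog above maps onto
 *	SO_LINGER with l_onoff = 1, l_linger = 0, which makes close() send
 *	a RST instead of a FIN and skip TIME_WAIT:
 *
 *		struct linger l = { 1, 0 };
 *		setsockopt(fd, SOL_SOCKET, SO_LINGER, &l, sizeof(l));
 *		close(fd);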
325 * Retransmission Timeout (4.2.2.15)
326 * MUST implement Jacobson's slow start and congestion avoidance
327 * stuff. (does)
329 * Probing Zero Windows (4.2.2.17)
330 * MUST support probing of zero windows. (does)
331 * MAY keep offered window closed indefinitely. (does)
332 * MUST allow remote window to stay closed indefinitely. (does)
334 * Passive Open Calls (4.2.2.18)
335 * MUST NOT let new passive open affect other connections. (doesn't)
336 * MUST support passive opens (LISTENs) concurrently. (does)
338 * Time to Live (4.2.2.19)
339 * MUST make TCP TTL configurable. (does - IP_TTL option)
341 * Event Processing (4.2.2.20)
342 * SHOULD queue out-of-order segments. (does)
343 * MUST aggregate ACK segments whenever possible. (does but badly)
345 * Retransmission Timeout Calculation (4.2.3.1)
346 * MUST implement Karn's algorithm and Jacobson's algorithm for RTO
347 * calculation. (does, or at least explains them in the comments 8*b)
348 * SHOULD initialize RTO to 3 and RTT to 0. (does)
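 *
 *	Illustration: a sketch of the classic integer form of Jacobson's
 *	estimator that this requirement refers to (variable names are not
 *	the ones used in tcp_input.c; srtt is kept scaled by 8, mdev by 4):
 *
 *		err   = measured_rtt - (srtt >> 3);
 *		srtt += err;
 *		if (err < 0)
 *			err = -err;
 *		err  -= (mdev >> 2);
 *		mdev += err;
 *		rto   = (srtt >> 3) + mdev;
 *
 *	Karn's algorithm then simply excludes retransmitted segments from
 *	the measured_rtt samples.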
350 * When to Send an ACK Segment (4.2.3.2)
351 * SHOULD implement delayed ACK. (does)
352 * MUST keep ACK delay < 0.5 sec. (does)
354 * When to Send a Window Update (4.2.3.3)
355 * MUST implement receiver-side SWS. (does)
357 * When to Send Data (4.2.3.4)
358 * MUST implement sender-side SWS. (does)
359 * SHOULD implement Nagle algorithm. (does)
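 *
 *	Illustration (user space, assuming a connected TCP socket "fd"):
 *	the Nagle algorithm referred to above can be disabled per socket
 *	with the TCP_NODELAY option:
 *
 *		int on = 1;
 *		setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on));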
361 * TCP Connection Failures (4.2.3.5)
362 * MUST handle excessive retransmissions "properly" (see the RFC). (does)
363 * SHOULD inform application layer of soft errors. (does)
365 * TCP Keep-Alives (4.2.3.6)
366 * MAY provide keep-alives. (does)
367 * MUST make keep-alives configurable on a per-connection basis. (does)
368 * MUST default to no keep-alives. (does)
369 * MUST make keep-alive interval configurable. (does)
370 * MUST make default keep-alive interval > 2 hours. (does)
371 * MUST NOT interpret failure to ACK keep-alive packet as dead
372 * connection. (doesn't)
373 * SHOULD send keep-alive with no data. (does)
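 *
 *	Illustration (user space, assuming a connected TCP socket "fd"):
 *	the per-connection keep-alive switch mentioned above is the
 *	standard SO_KEEPALIVE option, which defaults to off:
 *
 *		int on = 1;
 *		setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));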
375 * TCP Multihoming (4.2.3.7)
376 * MUST get source address from IP layer before sending first
377 * SYN. (does)
378 * MUST use same local address for all segments of a connection. (does)
380 * IP Options (4.2.3.8)
381 * MUST ignore unsupported IP options. (does)
382 * MAY support Time Stamp and Record Route. (does)
383 * MUST allow application to specify a source route. (does)
384 * MUST allow received Source Route option to set route for all future
385 * segments on this connection. (does not (security issues))
387 * ICMP messages (4.2.3.9)
388 * MUST act on ICMP errors. (does)
389 * MUST slow transmission upon receipt of a Source Quench. (doesn't anymore
390 * because that is deprecated now by the IETF, can be turned on)
391 * MUST NOT abort connection upon receipt of soft Destination
392 * Unreachables (0, 1, 5), Time Exceededs and Parameter
393 * Problems. (doesn't)
394 * SHOULD report soft Destination Unreachables etc. to the
395 * application. (does, except during SYN_RECV and may drop messages
396 * in some rare cases before accept() - ICMP is unreliable)
397 * SHOULD abort connection upon receipt of hard Destination Unreachable
398 * messages (2, 3, 4). (does, but see above)
400 * Remote Address Validation (4.2.3.10)
401 * MUST reject as an error OPEN for invalid remote IP address. (does)
402 * MUST ignore SYN with invalid source address. (does)
403 * MUST silently discard incoming SYN for broadcast/multicast
404 * address. (does)
406 * Asynchronous Reports (4.2.4.1)
407 * MUST provide mechanism for reporting soft errors to application
408 * layer. (does)
410 * Type of Service (4.2.4.2)
411 * MUST allow application layer to set Type of Service. (does IP_TOS)
413 * (Whew. -- MS 950903)
414 * (Updated by AK, but not complete yet.)
417 #include <linux/config.h>
418 #include <linux/types.h>
419 #include <linux/fcntl.h>
420 #include <linux/poll.h>
421 #include <linux/init.h>
422 #include <linux/smp_lock.h>
424 #include <net/icmp.h>
425 #include <net/tcp.h>
427 #include <asm/uaccess.h>
429 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
431 struct tcp_mib tcp_statistics[NR_CPUS*2];
433 kmem_cache_t *tcp_openreq_cachep;
434 kmem_cache_t *tcp_bucket_cachep;
435 kmem_cache_t *tcp_timewait_cachep;
437 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
440 * LISTEN is a special case for poll..
442 static __inline__ unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait)
444 return sk->tp_pinfo.af_tcp.accept_queue ? (POLLIN | POLLRDNORM) : 0;
448 * Wait for a TCP event.
450 * Note that we don't need to lock the socket, as the upper poll layers
451 * take care of normal races (between the test and the event) and we don't
452 * go look at any of the socket buffers directly.
454 unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait)
456 unsigned int mask;
457 struct sock *sk = sock->sk;
458 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
460 poll_wait(file, sk->sleep, wait);
461 if (sk->state == TCP_LISTEN)
462 return tcp_listen_poll(sk, wait);
464 /* Socket is not locked. We are protected from async events
465 by poll logic, and correct handling of state changes
466 made by other threads is impossible in any case.
469 mask = 0;
470 if (sk->err)
471 mask = POLLERR;
474 * POLLHUP is certainly not done right. But poll() doesn't
475 * have a notion of HUP in just one direction, and for a
476 * socket the read side is more interesting.
478 * Some poll() documentation says that POLLHUP is incompatible
479 * with the POLLOUT/POLLWR flags, so somebody should check this
480 * all. But careful, it tends to be safer to return too many
481 * bits than too few, and you can easily break real applications
482 * if you don't tell them that something has hung up!
484 * Check-me.
486 * Check number 1. POLLHUP is an _UNMASKABLE_ event (see UNIX98 and
487 * our fs/select.c). It means that after we have received EOF,
488 * poll always returns immediately, making poll() on write() impossible
489 * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
490 * if and only if shutdown has been made in both directions.
491 * Actually, it is interesting to look at how Solaris and DUX
492 * solve this dilemma. I would prefer, if POLLHUP were maskable,
493 * that we could then set it on SEND_SHUTDOWN. BTW the examples given
494 * in Stevens' books assume exactly this behaviour, which explains
495 * why POLLHUP is incompatible with POLLOUT. --ANK
497 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
498 * blocking on fresh not-connected or disconnected socket. --ANK
500 if (sk->shutdown == SHUTDOWN_MASK || sk->state == TCP_CLOSE)
501 mask |= POLLHUP;
502 if (sk->shutdown & RCV_SHUTDOWN)
503 mask |= POLLIN | POLLRDNORM;
505 /* Connected? */
506 if ((1 << sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) {
507 if ((tp->rcv_nxt != tp->copied_seq) &&
508 (tp->urg_seq != tp->copied_seq ||
509 tp->rcv_nxt != tp->copied_seq+1 ||
510 sk->urginline || !tp->urg_data))
511 mask |= POLLIN | POLLRDNORM;
513 if (!(sk->shutdown & SEND_SHUTDOWN)) {
514 if (sock_wspace(sk) >= tcp_min_write_space(sk)) {
515 mask |= POLLOUT | POLLWRNORM;
516 } else { /* send SIGIO later */
517 set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
518 set_bit(SOCK_NOSPACE, &sk->socket->flags);
520 /* Race breaker. If space is freed after
521 * wspace test but before the flags are set,
522 * IO signal will be lost.
524 if (sock_wspace(sk) >= tcp_min_write_space(sk))
525 mask |= POLLOUT | POLLWRNORM;
529 if (tp->urg_data & TCP_URG_VALID)
530 mask |= POLLPRI;
532 return mask;
536 * Socket write_space callback.
537 * This (or rather the sock_wake_async) should agree with poll.
539 * WARNING. This callback is called, when socket is not locked.
541 * This wakeup is used by TCP only as dead-lock breaker, real
542 * wakeup occurs when incoming ack frees some space in buffer.
544 void tcp_write_space(struct sock *sk)
546 struct socket *sock;
548 read_lock(&sk->callback_lock);
549 if ((sock = sk->socket) != NULL && atomic_read(&sk->wmem_alloc) == 0) {
550 if (test_bit(SOCK_NOSPACE, &sock->flags)) {
551 if (sk->sleep && waitqueue_active(sk->sleep)) {
552 clear_bit(SOCK_NOSPACE, &sock->flags);
553 wake_up_interruptible(sk->sleep);
557 if (sock->fasync_list)
558 sock_wake_async(sock, 2, POLL_OUT);
560 read_unlock(&sk->callback_lock);
563 /* Listening TCP sockets never sleep to wait for memory, so
564 * it is completely silly to wake them up on queue space
565 * available events. So we hook them up to this dummy callback.
567 static void tcp_listen_write_space(struct sock *sk)
571 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
573 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
574 int answ;
576 switch(cmd) {
577 case SIOCINQ:
578 if (sk->state == TCP_LISTEN)
579 return(-EINVAL);
581 lock_sock(sk);
582 if ((1<<sk->state) & (TCPF_SYN_SENT|TCPF_SYN_RECV))
583 answ = 0;
584 else if (sk->urginline || !tp->urg_data ||
585 before(tp->urg_seq,tp->copied_seq) ||
586 !before(tp->urg_seq,tp->rcv_nxt)) {
587 answ = tp->rcv_nxt - tp->copied_seq;
589 /* Subtract 1, if FIN is in queue. */
590 if (answ && !skb_queue_empty(&sk->receive_queue))
591 answ -= ((struct sk_buff*)sk->receive_queue.prev)->h.th->fin;
592 } else
593 answ = tp->urg_seq - tp->copied_seq;
594 release_sock(sk);
595 break;
596 case SIOCATMARK:
598 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
599 break;
601 case SIOCOUTQ:
602 if (sk->state == TCP_LISTEN)
603 return(-EINVAL);
605 if ((1<<sk->state) & (TCPF_SYN_SENT|TCPF_SYN_RECV))
606 answ = 0;
607 else
608 answ = tp->write_seq - tp->snd_una;
609 break;
610 default:
611 return(-ENOIOCTLCMD);
614 return put_user(answ, (int *)arg);
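/* Illustrative user-space view of the ioctls handled above; a sketch only,
 * assuming "fd" is a connected TCP socket and ignoring error handling:
 *
 *	int inq, outq, at_mark;
 *	ioctl(fd, SIOCINQ,    &inq);	   (bytes readable right now)
 *	ioctl(fd, SIOCOUTQ,   &outq);	   (bytes still unacknowledged)
 *	ioctl(fd, SIOCATMARK, &at_mark);   (non-zero at the urgent mark)
 *
 * On Linux SIOCINQ has the same value as FIONREAD.
 */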
618 int tcp_listen_start(struct sock *sk)
620 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
621 struct tcp_listen_opt *lopt;
623 sk->max_ack_backlog = 0;
624 sk->ack_backlog = 0;
625 tp->accept_queue = tp->accept_queue_tail = NULL;
626 tp->syn_wait_lock = RW_LOCK_UNLOCKED;
628 lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
629 if (!lopt)
630 return -ENOMEM;
632 memset(lopt, 0, sizeof(struct tcp_listen_opt));
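/* Size the SYN queue limit: pick the smallest max_qlen_log such that
 * 2^max_qlen_log >= sysctl_max_syn_backlog, with a floor of 64 entries.
 */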
633 for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
634 if ((1<<lopt->max_qlen_log) >= sysctl_max_syn_backlog)
635 break;
637 write_lock_bh(&tp->syn_wait_lock);
638 tp->listen_opt = lopt;
639 write_unlock_bh(&tp->syn_wait_lock);
641 /* There is a race window here: we announce ourselves listening,
642 * but this transition is still not validated by get_port().
643 * It is OK, because this socket enters the hash table only
644 * after validation is complete.
646 sk->state = TCP_LISTEN;
647 if (sk->prot->get_port(sk, sk->num) == 0) {
648 sk->sport = htons(sk->num);
650 sk->write_space = tcp_listen_write_space;
651 sk_dst_reset(sk);
652 sk->prot->hash(sk);
654 return 0;
657 sk->state = TCP_CLOSE;
658 write_lock_bh(&tp->syn_wait_lock);
659 tp->listen_opt = NULL;
660 write_unlock_bh(&tp->syn_wait_lock);
661 kfree(lopt);
662 return -EADDRINUSE;
666 * This routine closes sockets which have been at least partially
667 * opened, but not yet accepted.
670 static void tcp_listen_stop (struct sock *sk)
672 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
673 struct tcp_listen_opt *lopt = tp->listen_opt;
674 struct open_request *acc_req = tp->accept_queue;
675 struct open_request *req;
676 int i;
678 tcp_delete_keepalive_timer(sk);
680 /* make all the listen_opt local to us */
681 write_lock_bh(&tp->syn_wait_lock);
682 tp->listen_opt = NULL;
683 write_unlock_bh(&tp->syn_wait_lock);
684 tp->accept_queue = tp->accept_queue_tail = NULL;
686 if (lopt->qlen) {
687 for (i=0; i<TCP_SYNQ_HSIZE; i++) {
688 while ((req = lopt->syn_table[i]) != NULL) {
689 lopt->syn_table[i] = req->dl_next;
690 lopt->qlen--;
691 tcp_openreq_free(req);
693 /* Following specs, it would be better either to send FIN
694 * (and enter FIN-WAIT-1, it is normal close)
695 * or to send active reset (abort).
696 * Certainly, it is pretty dangerous during a synflood, but that is
697 * a bad justification for our negligence 8)
698 * To be honest, we are not able to implement either
699 * of the variants now. --ANK
704 BUG_TRAP(lopt->qlen == 0);
706 kfree(lopt);
708 while ((req=acc_req) != NULL) {
709 struct sock *child = req->sk;
711 acc_req = req->dl_next;
713 local_bh_disable();
714 bh_lock_sock(child);
715 BUG_TRAP(child->lock.users==0);
716 sock_hold(child);
718 tcp_disconnect(child, O_NONBLOCK);
720 sock_orphan(child);
722 atomic_inc(&tcp_orphan_count);
724 tcp_destroy_sock(child);
726 bh_unlock_sock(child);
727 local_bh_enable();
728 sock_put(child);
730 tcp_acceptq_removed(sk);
731 tcp_openreq_fastfree(req);
733 BUG_TRAP(sk->ack_backlog == 0);
737 * Wait for a socket to get into the connected state
739 * Note: Must be called with the socket locked.
741 static int wait_for_tcp_connect(struct sock * sk, int flags, long *timeo_p)
743 struct task_struct *tsk = current;
744 DECLARE_WAITQUEUE(wait, tsk);
746 while((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
747 if(sk->err)
748 return sock_error(sk);
749 if((1 << sk->state) &
750 ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
751 if(sk->keepopen && !(flags&MSG_NOSIGNAL))
752 send_sig(SIGPIPE, tsk, 0);
753 return -EPIPE;
755 if(!*timeo_p)
756 return -EAGAIN;
757 if(signal_pending(tsk))
758 return sock_intr_errno(*timeo_p);
760 __set_task_state(tsk, TASK_INTERRUPTIBLE);
761 add_wait_queue(sk->sleep, &wait);
762 sk->tp_pinfo.af_tcp.write_pending++;
764 release_sock(sk);
765 *timeo_p = schedule_timeout(*timeo_p);
766 lock_sock(sk);
768 __set_task_state(tsk, TASK_RUNNING);
769 remove_wait_queue(sk->sleep, &wait);
770 sk->tp_pinfo.af_tcp.write_pending--;
772 return 0;
775 static inline int tcp_memory_free(struct sock *sk)
777 return atomic_read(&sk->wmem_alloc) < sk->sndbuf;
781 * Wait for more memory for a socket
783 static long wait_for_tcp_memory(struct sock * sk, long timeo)
785 if (!tcp_memory_free(sk)) {
786 DECLARE_WAITQUEUE(wait, current);
788 clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
790 add_wait_queue(sk->sleep, &wait);
791 for (;;) {
792 set_bit(SOCK_NOSPACE, &sk->socket->flags);
794 set_current_state(TASK_INTERRUPTIBLE);
796 if (signal_pending(current))
797 break;
798 if (tcp_memory_free(sk))
799 break;
800 if (sk->shutdown & SEND_SHUTDOWN)
801 break;
802 if (sk->err)
803 break;
804 release_sock(sk);
805 if (!tcp_memory_free(sk))
806 timeo = schedule_timeout(timeo);
807 lock_sock(sk);
809 current->state = TASK_RUNNING;
810 remove_wait_queue(sk->sleep, &wait);
812 return timeo;
815 /* When all user supplied data has been queued set the PSH bit */
816 #define PSH_NEEDED (seglen == 0 && iovlen == 0)
819 * This routine copies from a user buffer into a socket,
820 * and starts the transmit system.
823 int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size)
825 struct iovec *iov;
826 struct tcp_opt *tp;
827 struct sk_buff *skb;
828 int iovlen, flags;
829 int mss_now;
830 int err, copied;
831 long timeo;
833 err = 0;
834 tp = &(sk->tp_pinfo.af_tcp);
836 lock_sock(sk);
837 TCP_CHECK_TIMER(sk);
839 flags = msg->msg_flags;
841 timeo = sock_sndtimeo(sk, flags&MSG_DONTWAIT);
843 /* Wait for a connection to finish. */
844 if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
845 if((err = wait_for_tcp_connect(sk, flags, &timeo)) != 0)
846 goto out_unlock;
848 /* This should be in poll */
849 clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
851 mss_now = tcp_current_mss(sk);
853 /* Ok commence sending. */
854 iovlen = msg->msg_iovlen;
855 iov = msg->msg_iov;
856 copied = 0;
858 while(--iovlen >= 0) {
859 int seglen=iov->iov_len;
860 unsigned char * from=iov->iov_base;
862 iov++;
864 while(seglen > 0) {
865 int copy, tmp, queue_it;
867 if (err)
868 goto do_fault2;
870 /* Stop on errors. */
871 if (sk->err)
872 goto do_sock_err;
874 /* Make sure that we are established. */
875 if (sk->shutdown & SEND_SHUTDOWN)
876 goto do_shutdown;
878 /* Now we need to check if we have a half
879 * built packet we can tack some data onto.
881 if (tp->send_head && !(flags & MSG_OOB)) {
882 skb = sk->write_queue.prev;
883 copy = skb->len;
884 /* If the remote does SWS avoidance we should
885 * queue the best we can if not we should in
886 * fact send multiple packets...
887 * A method for detecting this would be most
888 * welcome.
890 if (skb_tailroom(skb) > 0 &&
891 (mss_now - copy) > 0) {
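/* Note: if the data already in this skb does not end on a 4-byte
 * boundary we cannot fold the checksum of the new user copy into
 * skb->csum, so below we copy first and then recompute the checksum
 * over the whole skb.
 */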
892 int last_byte_was_odd = (copy % 4);
894 copy = mss_now - copy;
895 if(copy > skb_tailroom(skb))
896 copy = skb_tailroom(skb);
897 if(copy > seglen)
898 copy = seglen;
899 if(last_byte_was_odd) {
900 if(copy_from_user(skb_put(skb, copy),
901 from, copy))
902 err = -EFAULT;
903 skb->csum = csum_partial(skb->data,
904 skb->len, 0);
905 } else {
906 skb->csum =
907 csum_and_copy_from_user(
908 from, skb_put(skb, copy),
909 copy, skb->csum, &err);
912 * FIXME: the *_user functions should
913 * return how much data was
914 * copied before the fault
915 * occurred and then a partial
916 * packet with this data should
917 * be sent. Unfortunately
918 * csum_and_copy_from_user doesn't
919 * return this information.
920 * ATM it might send partly zeroed
921 * data in this case.
923 tp->write_seq += copy;
924 TCP_SKB_CB(skb)->end_seq += copy;
925 from += copy;
926 copied += copy;
927 seglen -= copy;
928 if (PSH_NEEDED)
929 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
930 continue;
934 /* A chunk was here doing something strange
935 * with psh etc. It is deleted, because it was
936 * evident non-sense. --ANK
939 copy = min(seglen, mss_now);
941 /* Determine how large of a buffer to allocate. */
942 tmp = MAX_TCP_HEADER + 15;
943 if (copy < mss_now && !(flags & MSG_OOB)) {
944 tmp += mss_now;
946 /* What is happening here is that we want to
947 * tack on later members of the users iovec
948 * if possible into a single frame. When we
949 * leave this loop our caller checks to see if
950 * we can send queued frames onto the wire.
951 * See tcp_v[46]_sendmsg() for this.
953 queue_it = 1;
954 } else {
955 tmp += copy;
956 queue_it = 0;
959 if (tcp_memory_free(sk)) {
960 skb = alloc_skb(tmp, GFP_KERNEL);
961 if (skb == NULL)
962 goto do_oom;
963 skb_set_owner_w(skb, sk);
964 } else {
965 /* If we didn't get any memory, we need to sleep. */
966 set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
967 set_bit(SOCK_NOSPACE, &sk->socket->flags);
969 if (!timeo) {
970 err = -EAGAIN;
971 goto do_interrupted;
973 if (signal_pending(current)) {
974 err = sock_intr_errno(timeo);
975 goto do_interrupted;
977 __tcp_push_pending_frames(sk, tp, mss_now);
978 timeo = wait_for_tcp_memory(sk, timeo);
980 /* If SACK's were formed or PMTU events happened,
981 * we must find out about it.
983 mss_now = tcp_current_mss(sk);
984 continue;
987 seglen -= copy;
989 /* Prepare control bits for TCP header creation engine. */
990 TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK |
991 ((PSH_NEEDED) ?
992 TCPCB_FLAG_PSH : 0));
993 TCP_SKB_CB(skb)->sacked = 0;
994 if (flags & MSG_OOB) {
995 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_URG;
996 TCP_SKB_CB(skb)->urg_ptr = copy;
997 } else
998 TCP_SKB_CB(skb)->urg_ptr = 0;
1000 /* TCP data bytes are SKB_PUT() on top, later
1001 * TCP+IP+DEV headers are SKB_PUSH()'d beneath.
1002 * Reserve header space and checksum the data.
1004 skb_reserve(skb, MAX_TCP_HEADER);
1005 skb->csum = csum_and_copy_from_user(from,
1006 skb_put(skb, copy), copy, 0, &err);
1008 if (err)
1009 goto do_fault;
1011 from += copy;
1012 copied += copy;
1014 TCP_SKB_CB(skb)->seq = tp->write_seq;
1015 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + copy;
1017 /* This advances tp->write_seq for us. */
1018 tcp_send_skb(sk, skb, queue_it, mss_now);
1021 err = copied;
1022 out:
1023 __tcp_push_pending_frames(sk, tp, mss_now);
1024 TCP_CHECK_TIMER(sk);
1025 out_unlock:
1026 release_sock(sk);
1027 return err;
1029 do_sock_err:
1030 if(copied)
1031 err = copied;
1032 else
1033 err = sock_error(sk);
1034 goto out;
1035 do_shutdown:
1036 if(copied)
1037 err = copied;
1038 else {
1039 if (!(flags&MSG_NOSIGNAL))
1040 send_sig(SIGPIPE, current, 0);
1041 err = -EPIPE;
1043 goto out;
1044 do_oom:
1045 err = copied ? : -ENOBUFS;
1046 goto out;
1047 do_interrupted:
1048 if(copied)
1049 err = copied;
1050 goto out;
1051 do_fault:
1052 kfree_skb(skb);
1053 do_fault2:
1054 err = -EFAULT;
1055 goto out;
1058 #undef PSH_NEEDED
1061 * Handle reading urgent data. BSD has very simple semantics for
1062 * this, no blocking and very strange errors 8)
1065 static int tcp_recv_urg(struct sock * sk, long timeo,
1066 struct msghdr *msg, int len, int flags,
1067 int *addr_len)
1069 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1071 /* No URG data to read. */
1072 if (sk->urginline || !tp->urg_data || tp->urg_data == TCP_URG_READ)
1073 return -EINVAL; /* Yes this is right ! */
1075 if (sk->done)
1076 return -ENOTCONN;
1078 if (tp->urg_data & TCP_URG_VALID) {
1079 int err = 0;
1080 char c = tp->urg_data;
1082 if (!(flags & MSG_PEEK))
1083 tp->urg_data = TCP_URG_READ;
1085 /* Read urgent data. */
1086 msg->msg_flags|=MSG_OOB;
1088 if(len>0) {
1089 if (!(flags & MSG_PEEK))
1090 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1091 len = 1;
1092 } else
1093 msg->msg_flags|=MSG_TRUNC;
1095 return err ? -EFAULT : len;
1098 /* Do not set sk->done, it is set only by normal data receive */
1099 if (sk->state == TCP_CLOSE || (sk->shutdown & RCV_SHUTDOWN))
1100 return 0;
1102 /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
1103 * the available implementations agree in this case:
1104 * this call should never block, independent of the
1105 * blocking state of the socket.
1106 * Mike <pall@rz.uni-karlsruhe.de>
1108 return -EAGAIN;
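/* Illustrative user-space counterpart of the above, a sketch only,
 * assuming "fd" is a connected TCP socket with SO_OOBINLINE left off:
 *
 *	char oob;
 *	int n = recv(fd, &oob, 1, MSG_OOB);
 *
 * n == 1 returns the single urgent byte; n == -1 with errno EAGAIN or
 * EINVAL means there is no unread urgent data. As noted above, the call
 * never blocks, regardless of the socket's blocking mode.
 */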
1112 * Release a skb if it is no longer needed. This routine
1113 * must be called with interrupts disabled or with the
1114 * socket locked so that the sk_buff queue operation is ok.
1117 static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb)
1119 __skb_unlink(skb, &sk->receive_queue);
1120 BUG_TRAP(atomic_read(&skb->users) == 1);
1121 /* Well, if I missed something then punishment will be terrible oops. */
1122 __kfree_skb(skb);
1125 /* Clean up the receive buffer for full frames taken by the user,
1126 * then send an ACK if necessary. COPIED is the number of bytes
1127 * tcp_recvmsg has given to the user so far, it speeds up the
1128 * calculation of whether or not we must ACK for the sake of
1129 * a window update.
1131 static void cleanup_rbuf(struct sock *sk, int copied)
1133 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1134 struct sk_buff *skb;
1135 int time_to_ack = 0;
1137 /* NOTE! The socket must be locked, so that we don't get
1138 * a messed-up receive queue.
1140 while ((skb=skb_peek(&sk->receive_queue)) != NULL) {
1141 if (!skb->used)
1142 break;
1143 tcp_eat_skb(sk, skb);
1146 if (tp->ack.pending) {
1147 /* Delayed ACKs frequently hit locked sockets during bulk receive. */
1148 if (tp->ack.blocked
1149 #ifdef TCP_MORE_COARSE_ACKS
1150 /* Once-per-two-segments ACK was not sent by tcp_input.c */
1151 || tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss
1152 #endif
1154 * If this read emptied the read buffer, we send an ACK when:
1156 * -- ATO estimator diverged. In this case it is useless
1157 * to delay ACK, it will miss in any case.
1159 * -- The second condition is triggered when we did not
1160 * ACK 8 segments, regardless of their size.
1161 * Linux senders allocate full-sized frames even for one byte
1162 * packets, so that the default queue for MTU=8K can hold
1163 * only 8 packets. Note that no workarounds other
1164 * than counting packets are possible. If the sender selected
1165 * a small sndbuf or has a larger mtu, the lockup will still
1166 * occur. Well, not a lockup, but a 10-20msec gap.
1167 * It is essentially a dead lockup for 1Gb ethernet
1168 * and loopback :-). The value 8 covers all reasonable
1169 * cases and we may receive packets of any size
1170 * at the maximal possible rate now.
1172 || (copied > 0 &&
1173 (tp->ack.ato >= TCP_DELACK_MAX || tp->ack.rcv_segs > 7) &&
1174 !tp->ack.pingpong &&
1175 atomic_read(&sk->rmem_alloc) == 0)) {
1176 time_to_ack = 1;
1180 /* We send an ACK if we can now advertise a non-zero window
1181 * which has been raised "significantly".
1183 * Even if window raised up to infinity, do not send window open ACK
1184 * in states, where we will not receive more. It is useless.
1186 if(copied > 0 && !time_to_ack && !(sk->shutdown&RCV_SHUTDOWN)) {
1187 __u32 rcv_window_now = tcp_receive_window(tp);
1188 __u32 new_window = __tcp_select_window(sk);
1190 /* Send ACK now, if this read freed lots of space
1191 * in our buffer. Certainly, new_window is new window.
1192 * We can advertise it now, if it is not less than current one.
1193 * "Lots" means "at least twice" here.
1195 if(new_window && new_window >= 2*rcv_window_now)
1196 time_to_ack = 1;
1198 if (time_to_ack)
1199 tcp_send_ack(sk);
1202 /* Now socket state including sk->err is changed only under lock,
1203 * hence we may omit checks after joining wait queue.
1204 * We check the receive queue before schedule() only as an optimization;
1205 * it is very likely that release_sock() added new data.
1208 static long tcp_data_wait(struct sock *sk, long timeo)
1210 DECLARE_WAITQUEUE(wait, current);
1212 add_wait_queue(sk->sleep, &wait);
1214 __set_current_state(TASK_INTERRUPTIBLE);
1216 set_bit(SOCK_ASYNC_WAITDATA, &sk->socket->flags);
1217 release_sock(sk);
1219 if (skb_queue_empty(&sk->receive_queue))
1220 timeo = schedule_timeout(timeo);
1222 lock_sock(sk);
1223 clear_bit(SOCK_ASYNC_WAITDATA, &sk->socket->flags);
1225 remove_wait_queue(sk->sleep, &wait);
1226 __set_current_state(TASK_RUNNING);
1227 return timeo;
1230 static void tcp_prequeue_process(struct sock *sk)
1232 struct sk_buff *skb;
1233 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1235 net_statistics[smp_processor_id()*2+1].TCPPrequeued += skb_queue_len(&tp->ucopy.prequeue);
1237 /* RX process wants to run with disabled BHs, though it is not necessary */
1238 local_bh_disable();
1239 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1240 sk->backlog_rcv(sk, skb);
1241 local_bh_enable();
1243 /* Clear memory counter. */
1244 tp->ucopy.memory = 0;
1248 * This routine copies from a sock struct into the user buffer.
1250 * Technical note: in 2.3 we work on _locked_ socket, so that
1251 * tricks with *seq access order and skb->users are not required.
1252 * Probably, code can be easily improved even more.
1255 int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
1256 int len, int nonblock, int flags, int *addr_len)
1258 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1259 int copied = 0;
1260 u32 peek_seq;
1261 u32 *seq;
1262 unsigned long used;
1263 int err;
1264 int target; /* Read at least this many bytes */
1265 long timeo;
1266 struct task_struct *user_recv = NULL;
1268 lock_sock(sk);
1270 TCP_CHECK_TIMER(sk);
1272 err = -ENOTCONN;
1273 if (sk->state == TCP_LISTEN)
1274 goto out;
1276 timeo = sock_rcvtimeo(sk, nonblock);
1278 /* Urgent data needs to be handled specially. */
1279 if (flags & MSG_OOB)
1280 goto recv_urg;
1282 seq = &tp->copied_seq;
1283 if (flags & MSG_PEEK) {
1284 peek_seq = tp->copied_seq;
1285 seq = &peek_seq;
1288 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1290 do {
1291 struct sk_buff * skb;
1292 u32 offset;
1294 /* Are we at urgent data? Stop if we have read anything. */
1295 if (copied && tp->urg_data && tp->urg_seq == *seq)
1296 break;
1298 /* We need to check signals first, to get correct SIGURG
1299 * handling. FIXME: Need to check this doesn't impact 1003.1g
1300 * and move it down to the bottom of the loop
1302 if (signal_pending(current)) {
1303 if (copied)
1304 break;
1305 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1306 break;
1309 /* Next get a buffer. */
1311 skb = skb_peek(&sk->receive_queue);
1312 do {
1313 if (!skb)
1314 break;
1316 /* Now that we have two receive queues this
1317 * shouldn't happen.
1319 if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1320 printk(KERN_INFO "recvmsg bug: copied %X seq %X\n",
1321 *seq, TCP_SKB_CB(skb)->seq);
1322 break;
1324 offset = *seq - TCP_SKB_CB(skb)->seq;
1325 if (skb->h.th->syn)
1326 offset--;
1327 if (offset < skb->len)
1328 goto found_ok_skb;
1329 if (skb->h.th->fin)
1330 goto found_fin_ok;
1331 if (!(flags & MSG_PEEK))
1332 skb->used = 1;
1333 skb = skb->next;
1334 } while (skb != (struct sk_buff *)&sk->receive_queue);
1336 /* Well, if we have backlog, try to process it now. */
1338 if (copied >= target && sk->backlog.tail == NULL)
1339 break;
1341 if (copied) {
1342 if (sk->err ||
1343 sk->state == TCP_CLOSE ||
1344 (sk->shutdown & RCV_SHUTDOWN) ||
1345 !timeo)
1346 break;
1347 } else {
1348 if (sk->err) {
1349 copied = sock_error(sk);
1350 break;
1353 if (sk->shutdown & RCV_SHUTDOWN) {
1354 if (!(flags&MSG_PEEK))
1355 sk->done = 1;
1356 break;
1359 if (sk->state == TCP_CLOSE) {
1360 if (sk->done) {
1361 copied = -ENOTCONN;
1362 break;
1363 } else if (!(flags&MSG_PEEK))
1364 sk->done = 1;
1365 break;
1368 if (!timeo) {
1369 copied = -EAGAIN;
1370 break;
1374 cleanup_rbuf(sk, copied);
1376 if (tp->ucopy.task == user_recv) {
1377 /* Install new reader */
1378 if (user_recv == NULL && !(flags&(MSG_TRUNC|MSG_PEEK))) {
1379 user_recv = current;
1380 tp->ucopy.task = user_recv;
1381 tp->ucopy.iov = msg->msg_iov;
1384 tp->ucopy.len = len;
1386 BUG_TRAP(tp->copied_seq == tp->rcv_nxt || (flags&(MSG_PEEK|MSG_TRUNC)));
1388 /* Ugly... If the prequeue is not empty, we have to
1389 * process it before releasing the socket, otherwise
1390 * the order will be broken at the second iteration.
1391 * A more elegant solution is required!!!
1393 * Look: we have the following (pseudo)queues:
1395 * 1. packets in flight
1396 * 2. backlog
1397 * 3. prequeue
1398 * 4. receive_queue
1400 * Each queue can be processed only if the next ones
1401 * are empty. At this point we have empty receive_queue.
1402 * But prequeue _can_ be non-empty after the second iteration,
1403 * when we jumped to the start of the loop because backlog
1404 * processing added something to receive_queue.
1405 * We cannot release_sock(), because backlog contains
1406 * packets that arrived _after_ the prequeued ones.
1408 * In short, the algorithm is clear --- process all
1409 * the queues in order. We could do it more directly,
1410 * requeueing packets from backlog to prequeue if it
1411 * is not empty. That is more elegant, but eats cycles,
1412 * unfortunately.
1414 if (skb_queue_len(&tp->ucopy.prequeue))
1415 goto do_prequeue;
1417 /* __ Set realtime policy in scheduler __ */
1420 if (copied >= target) {
1421 /* Do not sleep, just process backlog. */
1422 release_sock(sk);
1423 lock_sock(sk);
1424 } else {
1425 timeo = tcp_data_wait(sk, timeo);
1428 if (user_recv) {
1429 int chunk;
1431 /* __ Restore normal policy in scheduler __ */
1433 if ((chunk = len - tp->ucopy.len) != 0) {
1434 net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromBacklog += chunk;
1435 len -= chunk;
1436 copied += chunk;
1439 if (tp->rcv_nxt == tp->copied_seq &&
1440 skb_queue_len(&tp->ucopy.prequeue)) {
1441 do_prequeue:
1442 tcp_prequeue_process(sk);
1444 if ((chunk = len - tp->ucopy.len) != 0) {
1445 net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromPrequeue += chunk;
1446 len -= chunk;
1447 copied += chunk;
1451 continue;
1453 found_ok_skb:
1454 /* Ok so how much can we use? */
1455 used = skb->len - offset;
1456 if (len < used)
1457 used = len;
1459 /* Do we have urgent data here? */
1460 if (tp->urg_data) {
1461 u32 urg_offset = tp->urg_seq - *seq;
1462 if (urg_offset < used) {
1463 if (!urg_offset) {
1464 if (!sk->urginline) {
1465 ++*seq;
1466 offset++;
1467 used--;
1469 } else
1470 used = urg_offset;
1474 err = 0;
1475 if (!(flags&MSG_TRUNC)) {
1476 err = memcpy_toiovec(msg->msg_iov, ((unsigned char *)skb->h.th) + skb->h.th->doff*4 + offset, used);
1477 if (err) {
1478 /* Exception. Bailout! */
1479 if (!copied)
1480 copied = -EFAULT;
1481 break;
1485 *seq += used;
1486 copied += used;
1487 len -= used;
1489 if (after(tp->copied_seq,tp->urg_seq)) {
1490 tp->urg_data = 0;
1491 if (skb_queue_len(&tp->out_of_order_queue) == 0
1492 #ifdef TCP_FORMAL_WINDOW
1493 && tcp_receive_window(tp)
1494 #endif
1496 tcp_fast_path_on(tp);
1499 if (used + offset < skb->len)
1500 continue;
1502 /* Process the FIN. We may also need to handle PSH
1503 * here and make it break out of MSG_WAITALL.
1505 if (skb->h.th->fin)
1506 goto found_fin_ok;
1507 if (flags & MSG_PEEK)
1508 continue;
1509 skb->used = 1;
1510 tcp_eat_skb(sk, skb);
1511 continue;
1513 found_fin_ok:
1514 ++*seq;
1515 if (flags & MSG_PEEK)
1516 break;
1518 /* All is done. */
1519 skb->used = 1;
1520 break;
1521 } while (len > 0);
1523 if (user_recv) {
1524 if (skb_queue_len(&tp->ucopy.prequeue)) {
1525 int chunk;
1527 tp->ucopy.len = copied > 0 ? len : 0;
1529 tcp_prequeue_process(sk);
1531 if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1532 net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromPrequeue += chunk;
1533 len -= chunk;
1534 copied += chunk;
1538 tp->ucopy.task = NULL;
1539 tp->ucopy.len = 0;
1542 /* According to UNIX98, msg_name/msg_namelen are ignored
1543 * on a connected socket. I was just happy when I found this 8) --ANK
1546 /* Clean up data we have read: This will do ACK frames. */
1547 cleanup_rbuf(sk, copied);
1549 TCP_CHECK_TIMER(sk);
1550 release_sock(sk);
1551 return copied;
1553 out:
1554 TCP_CHECK_TIMER(sk);
1555 release_sock(sk);
1556 return err;
1558 recv_urg:
1559 err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1560 goto out;
1564 * State processing on a close. This implements the state shift for
1565 * sending our FIN frame. Note that we only send a FIN for some
1566 * states. A shutdown() may have already sent the FIN, or we may be
1567 * closed.
1570 static unsigned char new_state[16] = {
1571 /* current state: new state: action: */
1572 /* (Invalid) */ TCP_CLOSE,
1573 /* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1574 /* TCP_SYN_SENT */ TCP_CLOSE,
1575 /* TCP_SYN_RECV */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1576 /* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1,
1577 /* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2,
1578 /* TCP_TIME_WAIT */ TCP_CLOSE,
1579 /* TCP_CLOSE */ TCP_CLOSE,
1580 /* TCP_CLOSE_WAIT */ TCP_LAST_ACK | TCP_ACTION_FIN,
1581 /* TCP_LAST_ACK */ TCP_LAST_ACK,
1582 /* TCP_LISTEN */ TCP_CLOSE,
1583 /* TCP_CLOSING */ TCP_CLOSING,
1586 static int tcp_close_state(struct sock *sk)
1588 int next = (int) new_state[sk->state];
1589 int ns = (next & TCP_STATE_MASK);
1591 tcp_set_state(sk, ns);
1593 return (next & TCP_ACTION_FIN);
1597 * Shutdown the sending side of a connection. Much like close except
1598 * that we don't shut down the receive side or set sk->dead.
1601 void tcp_shutdown(struct sock *sk, int how)
1603 /* We need to grab some memory, and put together a FIN,
1604 * and then put it into the queue to be sent.
1605 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1607 if (!(how & SEND_SHUTDOWN))
1608 return;
1610 /* If we've already sent a FIN, or it's a closed state, skip this. */
1611 if ((1 << sk->state) &
1612 (TCPF_ESTABLISHED|TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE_WAIT)) {
1613 /* Clear out any half completed packets. FIN if needed. */
1614 if (tcp_close_state(sk))
1615 tcp_send_fin(sk);
1621 * Return 1 if we still have things to send in our buffers.
1624 static inline int closing(struct sock * sk)
1626 return ((1 << sk->state) & (TCPF_FIN_WAIT1|TCPF_CLOSING|TCPF_LAST_ACK));
1629 static __inline__ void tcp_kill_sk_queues(struct sock *sk)
1631 /* First the read buffer. */
1632 skb_queue_purge(&sk->receive_queue);
1634 /* Next, the error queue. */
1635 skb_queue_purge(&sk->error_queue);
1637 /* Next, the write queue. */
1638 BUG_TRAP(skb_queue_empty(&sk->write_queue));
1640 /* It is _impossible_ for the backlog to contain anything
1641 * when we get here. All user references to this socket
1642 * have gone away; only the net layer can touch it.
1647 * At this point, there should be no process reference to this
1648 * socket, and thus no user references at all. Therefore we
1649 * can assume the socket waitqueue is inactive and nobody will
1650 * try to jump onto it.
1652 void tcp_destroy_sock(struct sock *sk)
1654 BUG_TRAP(sk->state==TCP_CLOSE);
1655 BUG_TRAP(sk->dead);
1657 /* It cannot be in hash table! */
1658 BUG_TRAP(sk->pprev==NULL);
1660 /* If it has a non-zero sk->num, it must be bound */
1661 BUG_TRAP(!sk->num || sk->prev!=NULL);
1663 #ifdef TCP_DEBUG
1664 if (sk->zapped) {
1665 printk("TCP: double destroy sk=%p\n", sk);
1666 sock_hold(sk);
1668 sk->zapped = 1;
1669 #endif
1671 sk->prot->destroy(sk);
1673 tcp_kill_sk_queues(sk);
1675 #ifdef INET_REFCNT_DEBUG
1676 if (atomic_read(&sk->refcnt) != 1) {
1677 printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n", sk, atomic_read(&sk->refcnt));
1679 #endif
1681 atomic_dec(&tcp_orphan_count);
1682 sock_put(sk);
1685 void tcp_close(struct sock *sk, long timeout)
1687 struct sk_buff *skb;
1688 int data_was_unread = 0;
1690 lock_sock(sk);
1691 sk->shutdown = SHUTDOWN_MASK;
1693 if(sk->state == TCP_LISTEN) {
1694 tcp_set_state(sk, TCP_CLOSE);
1696 /* Special case. */
1697 tcp_listen_stop(sk);
1699 goto adjudge_to_death;
1702 /* We need to flush the recv. buffs. We do this only on the
1703 * descriptor close, not protocol-sourced closes, because the
1704 * reader process may not have drained the data yet!
1706 while((skb=__skb_dequeue(&sk->receive_queue))!=NULL) {
1707 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - skb->h.th->fin;
1708 data_was_unread += len;
1709 kfree_skb(skb);
1712 /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1713 * 3.10, we send a RST here because data was lost. To
1714 * witness the awful effects of the old behavior of always
1715 * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1716 * a bulk GET in an FTP client, suspend the process, wait
1717 * for the client to advertise a zero window, then kill -9
1718 * the FTP client, wheee... Note: timeout is always zero
1719 * in such a case.
1721 if(data_was_unread != 0) {
1722 /* Unread data was tossed, zap the connection. */
1723 tcp_set_state(sk, TCP_CLOSE);
1724 tcp_send_active_reset(sk, GFP_KERNEL);
1725 } else if (sk->linger && sk->lingertime==0) {
1726 /* Check zero linger _after_ checking for unread data. */
1727 sk->prot->disconnect(sk, 0);
1728 } else if (tcp_close_state(sk)) {
1729 /* We FIN if the application ate all the data before
1730 * zapping the connection.
1733 /* RED-PEN. Formally speaking, we have broken TCP state
1734 * machine. State transitions:
1736 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1737 * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1738 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1740 * are legal only when FIN has been sent (i.e. in window),
1741 * rather than queued out of window. Purists blame.
1743 * F.e. "RFC state" is ESTABLISHED,
1744 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
1746 * The visible deviations are that sometimes
1747 * we enter the time-wait state when it is not really required
1748 * (harmless), and do not send active resets when they are
1749 * required by the specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
1750 * they look like CLOSING or LAST_ACK to Linux).
1751 * Probably, I missed some more holelets.
1752 * --ANK
1754 tcp_send_fin(sk);
1757 if (timeout) {
1758 struct task_struct *tsk = current;
1759 DECLARE_WAITQUEUE(wait, current);
1761 add_wait_queue(sk->sleep, &wait);
1763 do {
1764 set_current_state(TASK_INTERRUPTIBLE);
1765 if (!closing(sk))
1766 break;
1767 release_sock(sk);
1768 timeout = schedule_timeout(timeout);
1769 lock_sock(sk);
1770 } while (!signal_pending(tsk) && timeout);
1772 tsk->state = TASK_RUNNING;
1773 remove_wait_queue(sk->sleep, &wait);
1776 adjudge_to_death:
1777 /* It is the last release_sock in its life. It will remove backlog. */
1778 release_sock(sk);
1781 /* Now socket is owned by kernel and we acquire BH lock
1782 to finish close. No need to check for user refs.
1784 local_bh_disable();
1785 bh_lock_sock(sk);
1786 BUG_TRAP(sk->lock.users==0);
1788 sock_hold(sk);
1789 sock_orphan(sk);
1791 /* This is a (useful) BSD violation of the RFC. There is a
1792 * problem with TCP as specified in that the other end could
1793 * keep a socket open forever with no application left at this end.
1794 * We use a 3 minute timeout (about the same as BSD) and then kill
1795 * our end. If they send after that then tough - BUT: long enough
1796 * that we won't make the old 4*rto = almost no time - whoops
1797 * reset mistake.
1799 * Nope, it was not mistake. It is really desired behaviour
1800 * f.e. on http servers, when such sockets are useless, but
1801 * consume significant resources. Let's do it with special
1802 * linger2 option. --ANK
1805 if (sk->state == TCP_FIN_WAIT2) {
1806 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1807 if (tp->linger2 < 0) {
1808 tcp_set_state(sk, TCP_CLOSE);
1809 tcp_send_active_reset(sk, GFP_ATOMIC);
1810 } else {
1811 int tmo = tcp_fin_time(tp);
1813 if (tmo > TCP_TIMEWAIT_LEN) {
1814 tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
1815 } else {
1816 atomic_inc(&tcp_orphan_count);
1817 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1818 goto out;
1822 if (sk->state != TCP_CLOSE &&
1823 atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans) {
1824 if (net_ratelimit())
1825 printk(KERN_INFO "TCP: too many of orphaned sockets\n");
1826 tcp_set_state(sk, TCP_CLOSE);
1827 tcp_send_active_reset(sk, GFP_ATOMIC);
1829 atomic_inc(&tcp_orphan_count);
1831 if (sk->state == TCP_CLOSE)
1832 tcp_destroy_sock(sk);
1833 /* Otherwise, socket is reprieved until protocol close. */
1835 out:
1836 bh_unlock_sock(sk);
1837 local_bh_enable();
1838 sock_put(sk);
1841 /* These states need RST on ABORT according to RFC793 */
1843 extern __inline__ int tcp_need_reset(int state)
1845 return ((1 << state) &
1846 (TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1|
1847 TCPF_FIN_WAIT2|TCPF_SYN_RECV));
1850 int tcp_disconnect(struct sock *sk, int flags)
1852 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1853 int old_state;
1854 int err = 0;
1856 old_state = sk->state;
1857 if (old_state != TCP_CLOSE)
1858 tcp_set_state(sk, TCP_CLOSE);
1860 /* ABORT function of RFC793 */
1861 if (old_state == TCP_LISTEN) {
1862 tcp_listen_stop(sk);
1863 } else if (tcp_need_reset(old_state) ||
1864 (tp->snd_nxt != tp->write_seq &&
1865 (1<<old_state)&(TCPF_CLOSING|TCPF_LAST_ACK))) {
1866 /* The last check adjusts for the discrepancy of Linux wrt. RFC
1867 * states
1869 tcp_send_active_reset(sk, gfp_any());
1870 sk->err = ECONNRESET;
1871 } else if (old_state == TCP_SYN_SENT)
1872 sk->err = ECONNRESET;
1874 tcp_clear_xmit_timers(sk);
1875 __skb_queue_purge(&sk->receive_queue);
1876 __skb_queue_purge(&sk->write_queue);
1877 __skb_queue_purge(&tp->out_of_order_queue);
1879 sk->dport = 0;
1881 sk->rcv_saddr = 0;
1882 sk->saddr = 0;
1883 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1884 memset(&sk->net_pinfo.af_inet6.saddr, 0, 16);
1885 memset(&sk->net_pinfo.af_inet6.rcv_saddr, 0, 16);
1886 #endif
1888 sk->shutdown = 0;
1889 sk->done = 0;
1890 sk->write_space = tcp_write_space;
1891 tp->srtt = 0;
1892 if (sysctl_tcp_tw_recycle) {
1893 if ((tp->write_seq += 2) == 0)
1894 tp->write_seq = 1;
1895 } else {
1896 tp->write_seq = 0;
1898 tp->backoff = 0;
1899 tp->snd_cwnd = 2;
1900 tp->probes_out = 0;
1901 tp->packets_out = 0;
1902 tp->high_seq = 0;
1903 tp->snd_ssthresh = 0x7fffffff;
1904 tp->snd_cwnd_cnt = 0;
1905 tp->dup_acks = 0;
1906 tcp_delack_init(tp);
1907 tp->send_head = tp->retrans_head = NULL;
1908 tp->saw_tstamp = 0;
1909 __sk_dst_reset(sk);
1911 BUG_TRAP(!sk->num || sk->prev);
1913 sk->error_report(sk);
1914 return err;
1915 }
1917 /*
1918 * Wait for an incoming connection, avoid race
1919 * conditions. This must be called with the socket locked,
1920 * and without the kernel lock held.
1921 */
1922 static int wait_for_connect(struct sock * sk, long timeo)
1923 {
1924 DECLARE_WAITQUEUE(wait, current);
1925 int err;
1927 /*
1928 * True wake-one mechanism for incoming connections: only
1929 * one process gets woken up, not the 'whole herd'.
1930 * Since we do not 'race & poll' for established sockets
1931 * anymore, the common case will execute the loop only once.
1933 * Subtle issue: "add_wait_queue_exclusive()" will be added
1934 * after any current non-exclusive waiters, and we know that
1935 * it will always _stay_ after any new non-exclusive waiters
1936 * because all non-exclusive waiters are added at the
1937 * beginning of the wait-queue. As such, it's ok to "drop"
1938 * our exclusiveness temporarily when we get woken up without
1939 * having to remove and re-insert us on the wait queue.
1940 */
1941 add_wait_queue_exclusive(sk->sleep, &wait);
1942 for (;;) {
1943 current->state = TASK_EXCLUSIVE | TASK_INTERRUPTIBLE;
1944 release_sock(sk);
1945 if (sk->tp_pinfo.af_tcp.accept_queue == NULL)
1946 timeo = schedule_timeout(timeo);
1947 lock_sock(sk);
1948 err = 0;
1949 if (sk->tp_pinfo.af_tcp.accept_queue)
1950 break;
1951 err = -EINVAL;
1952 if (sk->state != TCP_LISTEN)
1953 break;
1954 err = sock_intr_errno(timeo);
1955 if (signal_pending(current))
1956 break;
1957 err = -EAGAIN;
1958 if (!timeo)
1959 break;
1960 }
1961 current->state = TASK_RUNNING;
1962 remove_wait_queue(sk->sleep, &wait);
1963 return err;
1964 }
1966 /*
1967 * This will accept the next outstanding connection.
1968 *
1969 * Be careful about race conditions here - this is subtle.
1970 */
1972 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
1973 {
1974 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1975 struct open_request *req;
1976 struct sock *newsk;
1977 int error;
1979 lock_sock(sk);
1981 /* We need to make sure that this socket is listening,
1982 * and that it has something pending.
1983 */
1984 error = -EINVAL;
1985 if (sk->state != TCP_LISTEN)
1986 goto out;
1988 /* Find already established connection */
1989 if (!tp->accept_queue) {
1990 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1992 /* If this is a non-blocking socket, don't sleep */
1993 error = -EAGAIN;
1994 if (!timeo)
1995 goto out;
1997 error = wait_for_connect(sk, timeo);
1998 if (error)
1999 goto out;
2000 }
2002 req = tp->accept_queue;
2003 if ((tp->accept_queue = req->dl_next) == NULL)
2004 tp->accept_queue_tail = NULL;
2006 newsk = req->sk;
2007 tcp_acceptq_removed(sk);
2008 tcp_openreq_fastfree(req);
2009 BUG_TRAP(newsk->state != TCP_SYN_RECV);
2010 release_sock(sk);
2011 return newsk;
2013 out:
2014 release_sock(sk);
2015 *err = error;
2016 return NULL;
2017 }
2019 /*
2020 * Socket option code for TCP.
2021 */
2023 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval,
2024 int optlen)
2025 {
2026 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2027 int val;
2028 int err = 0;
2030 if (level != SOL_TCP)
2031 return tp->af_specific->setsockopt(sk, level, optname,
2032 optval, optlen);
2034 if(optlen<sizeof(int))
2035 return -EINVAL;
2037 if (get_user(val, (int *)optval))
2038 return -EFAULT;
2040 lock_sock(sk);
2042 switch(optname) {
2043 case TCP_MAXSEG:
2044 /* Values greater than the interface MTU won't take effect. However,
2045 * at the point when this call is made we typically don't yet know
2046 * which interface is going to be used.
2047 */
2048 if(val < 8 || val > MAX_TCP_WINDOW) {
2049 err = -EINVAL;
2050 break;
2051 }
2052 tp->user_mss = val;
2053 break;
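/* Illustrative sketch (editor's example, not part of the original file):
 * TCP_MAXSEG is normally set before connect(), since the value only caps
 * the MSS that will be negotiated and, as the comment above notes, anything
 * above the interface MTU cannot take effect.  Assumes "fd" is a newly
 * created, not yet connected TCP socket and "addr" a filled-in sockaddr_in.
 *
 *	int mss = 1200;
 *	setsockopt(fd, SOL_TCP, TCP_MAXSEG, &mss, sizeof(mss));
 *	connect(fd, (struct sockaddr *)&addr, sizeof(addr));
 */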
2055 case TCP_NODELAY:
2056 /* You cannot try to use this and TCP_CORK in
2057 * tandem, so let the user know.
2058 */
2059 if (tp->nonagle == 2) {
2060 err = -EINVAL;
2061 break;
2062 }
2063 tp->nonagle = (val == 0) ? 0 : 1;
2064 if (val)
2065 tcp_push_pending_frames(sk, tp);
2066 break;
2068 case TCP_CORK:
2069 /* When set, this indicates that non-full frames should always be queued.
2070 * Later the user clears this option and we transmit
2071 * any pending partial frames in the queue. This is
2072 * meant to be used alongside sendfile() to get properly
2073 * filled frames when the user (for example) must write
2074 * out headers with a write() call first and then use
2075 * sendfile to send out the data parts.
2077 * You cannot try to use TCP_NODELAY and this mechanism
2078 * at the same time, so let the user know.
2079 */
2080 if (tp->nonagle == 1) {
2081 err = -EINVAL;
2082 break;
2083 }
2084 if (val != 0) {
2085 tp->nonagle = 2;
2086 } else {
2087 tp->nonagle = 0;
2089 tcp_push_pending_frames(sk, tp);
2090 }
2091 break;
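/* Illustrative sketch (editor's example, not part of the original file) of
 * the header-plus-sendfile() pattern described in the comment above.
 * Assumes "sock" is a connected TCP socket, "header"/"header_len" an
 * application buffer and "file_fd" an open file of length "len"; error
 * handling omitted.  Note that in this kernel TCP_CORK and TCP_NODELAY are
 * mutually exclusive.
 *
 *	int on = 1, off = 0;
 *	setsockopt(sock, SOL_TCP, TCP_CORK, &on, sizeof(on));
 *	write(sock, header, header_len);      -- queued while corked
 *	sendfile(sock, file_fd, NULL, len);   -- fills out full-sized frames
 *	setsockopt(sock, SOL_TCP, TCP_CORK, &off, sizeof(off));
 *	                                      -- uncork: push any partial frame
 */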
2093 case TCP_KEEPIDLE:
2094 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2095 err = -EINVAL;
2096 else {
2097 tp->keepalive_time = val * HZ;
2098 if (sk->keepopen) {
2099 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2100 if (tp->keepalive_time > elapsed)
2101 elapsed = tp->keepalive_time - elapsed;
2102 else
2103 elapsed = 0;
2104 tcp_reset_keepalive_timer(sk, elapsed);
2105 }
2106 }
2107 break;
2108 case TCP_KEEPINTVL:
2109 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2110 err = -EINVAL;
2111 else
2112 tp->keepalive_intvl = val * HZ;
2113 break;
2114 case TCP_KEEPCNT:
2115 if (val < 1 || val > MAX_TCP_KEEPCNT)
2116 err = -EINVAL;
2117 else
2118 tp->keepalive_probes = val;
2119 break;
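/* Illustrative sketch (editor's example, not part of the original file):
 * the three keepalive knobs above only take effect once SO_KEEPALIVE is
 * enabled on the socket.  Assumes "fd" is a connected TCP socket; the
 * values below start probing after 10 minutes of idle time, probe every
 * 60 seconds and drop the connection after 5 unanswered probes.
 *
 *	int on = 1, idle = 600, intvl = 60, cnt = 5;
 *	setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
 *	setsockopt(fd, SOL_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
 *	setsockopt(fd, SOL_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
 *	setsockopt(fd, SOL_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));
 */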
2120 case TCP_SYNCNT:
2121 if (val < 1 || val > MAX_TCP_SYNCNT)
2122 err = -EINVAL;
2123 else
2124 tp->syn_retries = val;
2125 break;
2127 case TCP_LINGER2:
2128 if (val < 0)
2129 tp->linger2 = -1;
2130 else if (val > sysctl_tcp_fin_timeout/HZ)
2131 tp->linger2 = 0;
2132 else
2133 tp->linger2 = val*HZ;
2134 break;
2136 case TCP_DEFER_ACCEPT:
2137 tp->defer_accept = 0;
2138 if (val > 0) {
2139 /* Translate value in seconds to number of retransmits */
2140 while (val > ((TCP_TIMEOUT_INIT/HZ)<<tp->defer_accept))
2141 tp->defer_accept++;
2142 tp->defer_accept++;
2143 }
2144 break;
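/* Worked example (editorial, assuming the usual TCP_TIMEOUT_INIT of 3*HZ):
 * a request of val = 10 seconds runs the loop above while 10 > 3 and
 * 10 > 6, stops at 10 <= 12 with defer_accept == 2, and the final
 * increment makes it 3 - i.e. the value in seconds is rounded up to the
 * number of SYN-ACK retransmit intervals (3s, 6s, 12s, ...) that covers it.
 */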
2146 case TCP_WINDOW_CLAMP:
2147 if (val==0) {
2148 if (sk->state != TCP_CLOSE) {
2149 err = -EINVAL;
2150 break;
2151 }
2152 tp->window_clamp = 0;
2153 } else {
2154 tp->window_clamp = val<SOCK_MIN_RCVBUF/2 ?
2155 SOCK_MIN_SNDBUF : val;
2156 }
2157 break;
2159 default:
2160 err = -ENOPROTOOPT;
2161 break;
2162 }
2163 release_sock(sk);
2164 return err;
2165 }
2167 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,
2168 int *optlen)
2169 {
2170 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2171 int val, len;
2173 if(level != SOL_TCP)
2174 return tp->af_specific->getsockopt(sk, level, optname,
2175 optval, optlen);
2177 if(get_user(len,optlen))
2178 return -EFAULT;
2180 len = min(len, sizeof(int));
2182 switch(optname) {
2183 case TCP_MAXSEG:
2184 val = tp->mss_cache;
2185 if (val == 0 && ((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN)))
2186 val = tp->user_mss;
2187 break;
2188 case TCP_NODELAY:
2189 val = (tp->nonagle == 1);
2190 break;
2191 case TCP_CORK:
2192 val = (tp->nonagle == 2);
2193 break;
2194 case TCP_KEEPIDLE:
2195 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time)/HZ;
2196 break;
2197 case TCP_KEEPINTVL:
2198 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl)/HZ;
2199 break;
2200 case TCP_KEEPCNT:
2201 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2202 break;
2203 case TCP_SYNCNT:
2204 val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2205 break;
2206 case TCP_LINGER2:
2207 val = tp->linger2;
2208 if (val > 0)
2209 val = (val ? : sysctl_tcp_fin_timeout)/HZ;
2210 break;
2211 case TCP_DEFER_ACCEPT:
2212 val = tp->defer_accept == 0 ? 0 : (TCP_TIMEOUT_INIT<<(tp->defer_accept-1));
2213 break;
2214 case TCP_WINDOW_CLAMP:
2215 val = tp->window_clamp;
2216 break;
2217 default:
2218 return -ENOPROTOOPT;
2219 }
2221 if(put_user(len, optlen))
2222 return -EFAULT;
2223 if(copy_to_user(optval, &val,len))
2224 return -EFAULT;
2225 return 0;
2226 }
2229 extern void __skb_cb_too_small_for_tcp(int, int);
2231 void __init tcp_init(void)
2232 {
2233 struct sk_buff *skb = NULL;
2234 unsigned long goal;
2235 int order, i;
2237 if(sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2238 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2239 sizeof(skb->cb));
2241 tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
2242 sizeof(struct open_request),
2243 0, SLAB_HWCACHE_ALIGN,
2244 NULL, NULL);
2245 if(!tcp_openreq_cachep)
2246 panic("tcp_init: Cannot alloc open_request cache.");
2248 tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2249 sizeof(struct tcp_bind_bucket),
2250 0, SLAB_HWCACHE_ALIGN,
2251 NULL, NULL);
2252 if(!tcp_bucket_cachep)
2253 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2255 tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2256 sizeof(struct tcp_tw_bucket),
2257 0, SLAB_HWCACHE_ALIGN,
2258 NULL, NULL);
2259 if(!tcp_timewait_cachep)
2260 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2262 /* Size and allocate the main established and bind bucket
2263 * hash tables.
2265 * The methodology is similar to that of the buffer cache.
2266 */
2267 goal = num_physpages >> (23 - PAGE_SHIFT);
2269 for(order = 0; (1UL << order) < goal; order++)
2270 ;
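/* Worked example (editorial): with 4 KB pages (PAGE_SHIFT == 12) and
 * 128 MB of RAM, num_physpages is 32768, so goal = 32768 >> 11 = 16 pages
 * and the loop above ends with order = 4, i.e. the first allocation
 * attempt below asks for a 64 KB established-hash table.
 */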
2271 do {
2272 tcp_ehash_size = (1UL << order) * PAGE_SIZE /
2273 sizeof(struct tcp_ehash_bucket);
2274 tcp_ehash_size >>= 1;
2275 while (tcp_ehash_size & (tcp_ehash_size-1))
2276 tcp_ehash_size--;
2277 tcp_ehash = (struct tcp_ehash_bucket *)
2278 __get_free_pages(GFP_ATOMIC, order);
2279 } while (tcp_ehash == NULL && --order > 0);
2281 if (!tcp_ehash)
2282 panic("Failed to allocate TCP established hash table\n");
2283 for (i = 0; i < (tcp_ehash_size<<1); i++) {
2284 tcp_ehash[i].lock = RW_LOCK_UNLOCKED;
2285 tcp_ehash[i].chain = NULL;
2286 }
2288 do {
2289 tcp_bhash_size = (1UL << order) * PAGE_SIZE /
2290 sizeof(struct tcp_bind_hashbucket);
2291 if ((tcp_bhash_size > (64 * 1024)) && order > 0)
2292 continue;
2293 tcp_bhash = (struct tcp_bind_hashbucket *)
2294 __get_free_pages(GFP_ATOMIC, order);
2295 } while (tcp_bhash == NULL && --order >= 0);
2297 if (!tcp_bhash)
2298 panic("Failed to allocate TCP bind hash table\n");
2299 for (i = 0; i < tcp_bhash_size; i++) {
2300 tcp_bhash[i].lock = SPIN_LOCK_UNLOCKED;
2301 tcp_bhash[i].chain = NULL;
2302 }
2304 /* Try to be a bit smarter and adjust defaults depending
2305 * on available memory.
2306 */
2307 if (order > 4) {
2308 sysctl_local_port_range[0] = 32768;
2309 sysctl_local_port_range[1] = 61000;
2310 sysctl_tcp_max_tw_buckets = 180000;
2311 sysctl_tcp_max_orphans = 4096<<(order-4);
2312 sysctl_max_syn_backlog = 1024;
2313 } else if (order < 3) {
2314 sysctl_local_port_range[0] = 1024*(3-order);
2315 sysctl_tcp_max_tw_buckets >>= (3-order);
2316 sysctl_tcp_max_orphans >>= (3-order);
2317 sysctl_max_syn_backlog = 128;
2318 }
2319 tcp_port_rover = sysctl_local_port_range[0] - 1;
2321 printk("TCP: Hash tables configured (established %d bind %d)\n",
2322 tcp_ehash_size<<1, tcp_bhash_size);
2323 }