Import 2.3.41pre2
[davej-history.git] / net / ipv4 / tcp.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp.c,v 1.158 2000/01/21 23:45:57 davem Exp $
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
22 * Fixes:
23 * Alan Cox : Numerous verify_area() calls
24 * Alan Cox : Set the ACK bit on a reset
25 * Alan Cox : Stopped it crashing if it closed while
26 * sk->inuse=1 and was trying to connect
27 * (tcp_err()).
28 * Alan Cox : All icmp error handling was broken
29 * pointers passed where wrong and the
30 * socket was looked up backwards. Nobody
31 * tested any icmp error code obviously.
32 * Alan Cox : tcp_err() now handled properly. It
33 * wakes people on errors. poll
34 * behaves and the icmp error race
35 * has gone by moving it into sock.c
36 * Alan Cox : tcp_send_reset() fixed to work for
37 * everything not just packets for
38 * unknown sockets.
39 * Alan Cox : tcp option processing.
40 * Alan Cox : Reset tweaked (still not 100%) [Had
41 * syn rule wrong]
42 * Herp Rosmanith : More reset fixes
43 * Alan Cox : No longer acks invalid rst frames.
44 * Acking any kind of RST is right out.
45 * Alan Cox : Sets an ignore me flag on an rst
46 * receive otherwise odd bits of prattle
47 * escape still
48 * Alan Cox : Fixed another acking RST frame bug.
49 * Should stop LAN workplace lockups.
50 * Alan Cox : Some tidyups using the new skb list
51 * facilities
52 * Alan Cox : sk->keepopen now seems to work
53 * Alan Cox : Pulls options out correctly on accepts
54 * Alan Cox : Fixed assorted sk->rqueue->next errors
55 * Alan Cox : PSH doesn't end a TCP read. Switched a
56 * bit to skb ops.
57 * Alan Cox : Tidied tcp_data to avoid a potential
58 * nasty.
59 * Alan Cox : Added some better commenting, as the
60 * tcp is hard to follow
61 * Alan Cox : Removed incorrect check for 20 * psh
62 * Michael O'Reilly : ack < copied bug fix.
63 * Johannes Stille : Misc tcp fixes (not all in yet).
64 * Alan Cox : FIN with no memory -> CRASH
65 * Alan Cox : Added socket option proto entries.
66 * Also added awareness of them to accept.
67 * Alan Cox : Added TCP options (SOL_TCP)
68 * Alan Cox : Switched wakeup calls to callbacks,
69 * so the kernel can layer network
70 * sockets.
71 * Alan Cox : Use ip_tos/ip_ttl settings.
72 * Alan Cox : Handle FIN (more) properly (we hope).
73 * Alan Cox : RST frames sent on unsynchronised
74 * state ack error.
75 * Alan Cox : Put in missing check for SYN bit.
76 * Alan Cox : Added tcp_select_window() aka NET2E
77 * window non shrink trick.
78 * Alan Cox : Added a couple of small NET2E timer
79 * fixes
80 * Charles Hedrick : TCP fixes
81 * Toomas Tamm : TCP window fixes
82 * Alan Cox : Small URG fix to rlogin ^C ack fight
83 * Charles Hedrick : Rewrote most of it to actually work
84 * Linus : Rewrote tcp_read() and URG handling
85 * completely
86 * Gerhard Koerting: Fixed some missing timer handling
87 * Matthew Dillon : Reworked TCP machine states as per RFC
88 * Gerhard Koerting: PC/TCP workarounds
89 * Adam Caldwell : Assorted timer/timing errors
90 * Matthew Dillon : Fixed another RST bug
91 * Alan Cox : Move to kernel side addressing changes.
92 * Alan Cox : Beginning work on TCP fastpathing
93 * (not yet usable)
94 * Arnt Gulbrandsen: Turbocharged tcp_check() routine.
95 * Alan Cox : TCP fast path debugging
96 * Alan Cox : Window clamping
97 * Michael Riepe : Bug in tcp_check()
98 * Matt Dillon : More TCP improvements and RST bug fixes
99 * Matt Dillon : Yet more small nasties removed from the
100 * TCP code (Be very nice to this man if
101 * tcp finally works 100%) 8)
102 * Alan Cox : BSD accept semantics.
103 * Alan Cox : Reset on closedown bug.
104 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto().
105 * Michael Pall : Handle poll() after URG properly in
106 * all cases.
107 * Michael Pall : Undo the last fix in tcp_read_urg()
108 * (multi URG PUSH broke rlogin).
109 * Michael Pall : Fix the multi URG PUSH problem in
110 * tcp_readable(), poll() after URG
111 * works now.
112 * Michael Pall : recv(...,MSG_OOB) never blocks in the
113 * BSD api.
114 * Alan Cox : Changed the semantics of sk->socket to
115 * fix a race and a signal problem with
116 * accept() and async I/O.
117 * Alan Cox : Relaxed the rules on tcp_sendto().
118 * Yury Shevchuk : Really fixed accept() blocking problem.
119 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for
120 * clients/servers which listen in on
121 * fixed ports.
122 * Alan Cox : Cleaned the above up and shrank it to
123 * a sensible code size.
124 * Alan Cox : Self connect lockup fix.
125 * Alan Cox : No connect to multicast.
126 * Ross Biro : Close unaccepted children on master
127 * socket close.
128 * Alan Cox : Reset tracing code.
129 * Alan Cox : Spurious resets on shutdown.
130 * Alan Cox : Giant 15 minute/60 second timer error
131 * Alan Cox : Small whoops in polling before an
132 * accept.
133 * Alan Cox : Kept the state trace facility since
134 * it's handy for debugging.
135 * Alan Cox : More reset handler fixes.
136 * Alan Cox : Started rewriting the code based on
137 * the RFC's for other useful protocol
138 * references see: Comer, KA9Q NOS, and
139 * for a reference on the difference
140 * between specifications and how BSD
141 * works see the 4.4lite source.
142 * A.N.Kuznetsov : Don't time wait on completion of tidy
143 * close.
144 * Linus Torvalds : Fin/Shutdown & copied_seq changes.
145 * Linus Torvalds : Fixed BSD port reuse to work first syn
146 * Alan Cox : Reimplemented timers as per the RFC
147 * and using multiple timers for sanity.
148 * Alan Cox : Small bug fixes, and a lot of new
149 * comments.
150 * Alan Cox : Fixed dual reader crash by locking
151 * the buffers (much like datagram.c)
152 * Alan Cox : Fixed stuck sockets in probe. A probe
153 * now gets fed up of retrying without
154 * (even a no space) answer.
155 * Alan Cox : Extracted closing code better
156 * Alan Cox : Fixed the closing state machine to
157 * resemble the RFC.
158 * Alan Cox : More 'per spec' fixes.
159 * Jorge Cwik : Even faster checksumming.
160 * Alan Cox : tcp_data() doesn't ack illegal PSH
161 * only frames. At least one pc tcp stack
162 * generates them.
163 * Alan Cox : Cache last socket.
164 * Alan Cox : Per route irtt.
165 * Matt Day : poll()->select() match BSD precisely on error
166 * Alan Cox : New buffers
167 * Marc Tamsky : Various sk->prot->retransmits and
168 * sk->retransmits misupdating fixed.
169 * Fixed tcp_write_timeout: stuck close,
170 * and TCP syn retries gets used now.
171 * Mark Yarvis : In tcp_read_wakeup(), don't send an
172 * ack if state is TCP_CLOSED.
173 * Alan Cox : Look up device on a retransmit - routes may
174 * change. Doesn't yet cope with MSS shrink right
175 * but it's a start!
176 * Marc Tamsky : Closing in closing fixes.
177 * Mike Shaver : RFC1122 verifications.
178 * Alan Cox : rcv_saddr errors.
179 * Alan Cox : Block double connect().
180 * Alan Cox : Small hooks for enSKIP.
181 * Alexey Kuznetsov: Path MTU discovery.
182 * Alan Cox : Support soft errors.
183 * Alan Cox : Fix MTU discovery pathological case
184 * when the remote claims no mtu!
185 * Marc Tamsky : TCP_CLOSE fix.
186 * Colin (G3TNE) : Send a reset on syn ack replies in
187 * window but wrong (fixes NT lpd problems)
188 * Pedro Roque : Better TCP window handling, delayed ack.
189 * Joerg Reuter : No modification of locked buffers in
190 * tcp_do_retransmit()
191 * Eric Schenk : Changed receiver side silly window
192 * avoidance algorithm to BSD style
193 * algorithm. This doubles throughput
194 * against machines running Solaris,
195 * and seems to result in general
196 * improvement.
197 * Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD
198 * Willy Konynenberg : Transparent proxying support.
199 * Mike McLagan : Routing by source
200 * Keith Owens : Do proper merging with partial SKB's in
201 * tcp_do_sendmsg to avoid burstiness.
202 * Eric Schenk : Fix fast close down bug with
203 * shutdown() followed by close().
204 * Andi Kleen : Make poll agree with SIGIO
205 * Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and
206 * lingertime == 0 (RFC 793 ABORT Call)
208 * This program is free software; you can redistribute it and/or
209 * modify it under the terms of the GNU General Public License
210 * as published by the Free Software Foundation; either version
211 * 2 of the License, or(at your option) any later version.
213 * Description of States:
215 * TCP_SYN_SENT sent a connection request, waiting for ack
217 * TCP_SYN_RECV received a connection request, sent ack,
218 * waiting for final ack in three-way handshake.
220 * TCP_ESTABLISHED connection established
222 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete
223 * transmission of remaining buffered data
225 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote
226 * to shutdown
228 * TCP_CLOSING both sides have shutdown but we still have
229 * data we have to finish sending
231 * TCP_TIME_WAIT timeout to catch resent junk before entering
232 * closed, can only be entered from FIN_WAIT2
233 * or CLOSING. Required because the other end
234 * may not have gotten our last ACK causing it
235 * to retransmit the data packet (which we ignore)
237 * TCP_CLOSE_WAIT remote side has shutdown and is waiting for
238 * us to finish writing our data and to shutdown
239 * (we have to close() to move on to LAST_ACK)
241 * TCP_LAST_ACK our side has shutdown after remote has
242 * shutdown. There may still be data in our
243 * buffer that we have to finish sending
245 * TCP_CLOSE socket is finished
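 *	For illustration: a normal active close walks
 *	ESTABLISHED -> FIN_WAIT1 (our FIN sent) -> FIN_WAIT2 (our FIN acked)
 *	-> TIME_WAIT (remote FIN received and acked) -> CLOSE, while the
 *	passive side walks ESTABLISHED -> CLOSE_WAIT (remote FIN received)
 *	-> LAST_ACK (our FIN sent after close()) -> CLOSE.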
249 * RFC1122 status:
250 * NOTE: I'm not going to be doing comments in the code for this one except
251 * for violations and the like. tcp.c is just too big... If I say something
252 * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out
253 * with Alan. -- MS 950903
254 * [Note: Most of the TCP code has been rewritten/redesigned since this
255 * RFC1122 check. It is probably not correct anymore. It should be redone
256 * before 2.2. -AK]
258 * Use of PSH (4.2.2.2)
259 * MAY aggregate data sent without the PSH flag. (does)
260 * MAY queue data received without the PSH flag. (does)
261 * SHOULD collapse successive PSH flags when it packetizes data. (doesn't)
262 * MAY implement PSH on send calls. (doesn't, thus:)
263 * MUST NOT buffer data indefinitely (doesn't [1 second])
264 * MUST set PSH on last segment (does)
265 * MAY pass received PSH to application layer (doesn't)
266 * SHOULD send maximum-sized segment whenever possible. (almost always does)
268 * Window Size (4.2.2.3, 4.2.2.16)
269 * MUST treat window size as an unsigned number (does)
270 * SHOULD treat window size as a 32-bit number (does not)
271 * MUST NOT shrink window once it is offered (does not normally)
273 * Urgent Pointer (4.2.2.4)
274 * **MUST point urgent pointer to last byte of urgent data (not right
275 * after). (doesn't, to be like BSD. That's configurable, but defaults
276 * to off)
277 * MUST inform application layer asynchronously of incoming urgent
278 * data. (does)
279 * MUST provide application with means of determining the amount of
280 * urgent data pending. (does)
281 * **MUST support urgent data sequence of arbitrary length. (doesn't, but
282 * it's sort of tricky to fix, as urg_ptr is a 16-bit quantity)
283 * [Follows BSD 1 byte of urgent data]
285 * TCP Options (4.2.2.5)
286 * MUST be able to receive TCP options in any segment. (does)
287 * MUST ignore unsupported options (does)
289 * Maximum Segment Size Option (4.2.2.6)
290 * MUST implement both sending and receiving MSS. (does, but currently
291 * only uses the smaller of both of them)
292 * SHOULD send an MSS with every SYN where receive MSS != 536 (MAY send
293 * it always). (does, even when MSS == 536, which is legal)
294 * MUST assume MSS == 536 if no MSS received at connection setup (does)
295 * MUST calculate "effective send MSS" correctly:
296 * min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts)
297 * (does - but allows operator override)
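 *	For illustration, with an Ethernet MTU of 1500, a received MSS of
 *	1460 and no IP options this gives min(1500, 1460+20) - 20 - 0 = 1460
 *	bytes of TCP payload per segment.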
299 * TCP Checksum (4.2.2.7)
300 * MUST generate and check TCP checksum. (does)
302 * Initial Sequence Number Selection (4.2.2.8)
303 * MUST use the RFC 793 clock selection mechanism. (doesn't, but it's
304 * OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is
305 * necessary for 10Mbps networks - and harder than BSD to spoof!
306 * With syncookies we don't)
308 * Simultaneous Open Attempts (4.2.2.10)
309 * MUST support simultaneous open attempts (does)
311 * Recovery from Old Duplicate SYN (4.2.2.11)
312 * MUST keep track of active vs. passive open (does)
314 * RST segment (4.2.2.12)
315 * SHOULD allow an RST segment to contain data (does, but doesn't do
316 * anything with it, which is standard)
318 * Closing a Connection (4.2.2.13)
319 * MUST inform application of whether connection was closed by RST or
320 * normal close. (does)
321 * MAY allow "half-duplex" close (treat connection as closed for the
322 * local app, even before handshake is done). (does)
323 * MUST linger in TIME_WAIT for 2 * MSL (does)
325 * Retransmission Timeout (4.2.2.15)
326 * MUST implement Jacobson's slow start and congestion avoidance
327 * stuff. (does)
329 * Probing Zero Windows (4.2.2.17)
330 * MUST support probing of zero windows. (does)
331 * MAY keep offered window closed indefinitely. (does)
332 * MUST allow remote window to stay closed indefinitely. (does)
334 * Passive Open Calls (4.2.2.18)
335 * MUST NOT let new passive open affect other connections. (doesn't)
336 * MUST support passive opens (LISTENs) concurrently. (does)
338 * Time to Live (4.2.2.19)
339 * MUST make TCP TTL configurable. (does - IP_TTL option)
341 * Event Processing (4.2.2.20)
342 * SHOULD queue out-of-order segments. (does)
343 * MUST aggregate ACK segments whenever possible. (does but badly)
345 * Retransmission Timeout Calculation (4.2.3.1)
346 * MUST implement Karn's algorithm and Jacobson's algorithm for RTO
347 * calculation. (does, or at least explains them in the comments 8*b)
348 * SHOULD initialize RTO to 0 and RTT to 3. (does)
350 * When to Send an ACK Segment (4.2.3.2)
351 * SHOULD implement delayed ACK. (does)
352 * MUST keep ACK delay < 0.5 sec. (does)
354 * When to Send a Window Update (4.2.3.3)
355 * MUST implement receiver-side SWS. (does)
357 * When to Send Data (4.2.3.4)
358 * MUST implement sender-side SWS. (does)
359 * SHOULD implement Nagle algorithm. (does)
361 * TCP Connection Failures (4.2.3.5)
362 * MUST handle excessive retransmissions "properly" (see the RFC). (does)
363 * SHOULD inform application layer of soft errors. (does)
365 * TCP Keep-Alives (4.2.3.6)
366 * MAY provide keep-alives. (does)
367 * MUST make keep-alives configurable on a per-connection basis. (does)
368 * MUST default to no keep-alives. (does)
369 * MUST make keep-alive interval configurable. (does)
370 * MUST make default keep-alive interval > 2 hours. (does)
371 * MUST NOT interpret failure to ACK keep-alive packet as dead
372 * connection. (doesn't)
373 * SHOULD send keep-alive with no data. (does)
375 * TCP Multihoming (4.2.3.7)
376 * MUST get source address from IP layer before sending first
377 * SYN. (does)
378 * MUST use same local address for all segments of a connection. (does)
380 * IP Options (4.2.3.8)
381 * MUST ignore unsupported IP options. (does)
382 * MAY support Time Stamp and Record Route. (does)
383 * MUST allow application to specify a source route. (does)
384 * MUST allow received Source Route option to set route for all future
385 * segments on this connection. (does not (security issues))
387 * ICMP messages (4.2.3.9)
388 * MUST act on ICMP errors. (does)
389 * MUST slow transmission upon receipt of a Source Quench. (doesn't anymore
390 * because that is deprecated now by the IETF, can be turned on)
391 * MUST NOT abort connection upon receipt of soft Destination
392 * Unreachables (0, 1, 5), Time Exceededs and Parameter
393 * Problems. (doesn't)
394 * SHOULD report soft Destination Unreachables etc. to the
395 * application. (does, except during SYN_RECV and may drop messages
396 * in some rare cases before accept() - ICMP is unreliable)
397 * SHOULD abort connection upon receipt of hard Destination Unreachable
398 * messages (2, 3, 4). (does, but see above)
400 * Remote Address Validation (4.2.3.10)
401 * MUST reject as an error OPEN for invalid remote IP address. (does)
402 * MUST ignore SYN with invalid source address. (does)
403 * MUST silently discard incoming SYN for broadcast/multicast
404 * address. (does)
406 * Asynchronous Reports (4.2.4.1)
407 * MUST provide mechanism for reporting soft errors to application
408 * layer. (does)
410 * Type of Service (4.2.4.2)
411 * MUST allow application layer to set Type of Service. (does IP_TOS)
413 * (Whew. -- MS 950903)
414 * (Updated by AK, but not complete yet.)
417 #include <linux/config.h>
418 #include <linux/types.h>
419 #include <linux/fcntl.h>
420 #include <linux/poll.h>
421 #include <linux/init.h>
422 #include <linux/smp_lock.h>
424 #include <net/icmp.h>
425 #include <net/tcp.h>
427 #include <asm/uaccess.h>
429 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
431 struct tcp_mib tcp_statistics[NR_CPUS*2];
433 kmem_cache_t *tcp_openreq_cachep;
434 kmem_cache_t *tcp_bucket_cachep;
435 kmem_cache_t *tcp_timewait_cachep;
437 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
440 * LISTEN is a special case for poll..
442 static __inline__ unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait)
444 return sk->tp_pinfo.af_tcp.accept_queue ? (POLLIN | POLLRDNORM) : 0;
448 * Compute minimal free write space needed to queue new packets.
450 #define tcp_min_write_space(__sk) \
451 (atomic_read(&(__sk)->wmem_alloc) / 2)
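/* For example, with 16KB currently charged to wmem_alloc the threshold is
 * 8KB: tcp_poll() and tcp_write_space() below report the socket writable
 * only once free send space reaches at least half of what is queued.
 */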
454 * Wait for a TCP event.
456 * Note that we don't need to lock the socket, as the upper poll layers
457 * take care of normal races (between the test and the event) and we don't
458 * go look at any of the socket buffers directly.
460 unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait)
462 unsigned int mask;
463 struct sock *sk = sock->sk;
464 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
466 poll_wait(file, sk->sleep, wait);
467 if (sk->state == TCP_LISTEN)
468 return tcp_listen_poll(sk, wait);
470 /* Socket is not locked. We are protected from async events
471 by poll logic and correct handling of state changes
472 made by other threads is impossible in any case.
475 mask = 0;
476 if (sk->err)
477 mask = POLLERR;
480 * POLLHUP is certainly not done right. But poll() doesn't
481 * have a notion of HUP in just one direction, and for a
482 * socket the read side is more interesting.
484 * Some poll() documentation says that POLLHUP is incompatible
485 * with the POLLOUT/POLLWR flags, so somebody should check this
486 * all. But careful, it tends to be safer to return too many
487 * bits than too few, and you can easily break real applications
488 * if you don't tell them that something has hung up!
490 * Check-me.
492 * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
493 * our fs/select.c). It means that after we received EOF,
494 * poll always returns immediately, making it impossible to poll() for write()
495 * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
496 * if and only if shutdown has been made in both directions.
497 * Actually, it is interesting to look at how Solaris and DUX
498 * solve this dilemma. I would prefer, if POLLHUP were maskable,
499 * then we could set it on SND_SHUTDOWN. BTW the examples given
500 * in Stevens' books assume exactly this behaviour, which explains
501 * why POLLHUP is incompatible with POLLOUT. --ANK
503 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
504 * blocking on fresh not-connected or disconnected socket. --ANK
506 if (sk->shutdown == SHUTDOWN_MASK || sk->state == TCP_CLOSE)
507 mask |= POLLHUP;
508 if (sk->shutdown & RCV_SHUTDOWN)
509 mask |= POLLIN | POLLRDNORM;
511 /* Connected? */
512 if ((1 << sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) {
513 if ((tp->rcv_nxt != tp->copied_seq) &&
514 (tp->urg_seq != tp->copied_seq ||
515 tp->rcv_nxt != tp->copied_seq+1 ||
516 sk->urginline || !tp->urg_data))
517 mask |= POLLIN | POLLRDNORM;
519 if (!(sk->shutdown & SEND_SHUTDOWN)) {
520 if (sock_wspace(sk) >= tcp_min_write_space(sk)) {
521 mask |= POLLOUT | POLLWRNORM;
522 } else { /* send SIGIO later */
523 sk->socket->flags |= SO_NOSPACE;
527 if (tp->urg_data & TCP_URG_VALID)
528 mask |= POLLPRI;
530 return mask;
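/* For illustration: a connection in CLOSE_WAIT (the peer sent FIN, we have
 * not shut down yet) with unread data and enough free send space yields
 * POLLIN|POLLRDNORM (RCV_SHUTDOWN plus pending data) and POLLOUT|POLLWRNORM;
 * POLLHUP is added only once both directions are shut down or the socket
 * reaches TCP_CLOSE, as the checks above show.
 */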
534 * Socket write_space callback.
535 * This (or rather the sock_wake_async) should agree with poll.
537 * WARNING. This callback is called from any context (process,
538 * bh or irq). Do not try to make it any smarter.
540 void tcp_write_space(struct sock *sk)
542 read_lock(&sk->callback_lock);
543 if (!sk->dead) {
544 /* Why??!! Does it really not overschedule? --ANK */
545 wake_up_interruptible(sk->sleep);
547 if (sock_wspace(sk) >= tcp_min_write_space(sk))
548 sock_wake_async(sk->socket, 2, POLL_OUT);
550 read_unlock(&sk->callback_lock);
553 /* Listening TCP sockets never sleep to wait for memory, so
554 * it is completely silly to wake them up on queue space
555 * available events. So we hook them up to this dummy callback.
557 static void tcp_listen_write_space(struct sock *sk)
561 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
563 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
564 int answ;
566 switch(cmd) {
567 case SIOCINQ:
568 if (sk->state == TCP_LISTEN)
569 return(-EINVAL);
571 lock_sock(sk);
572 if ((1<<sk->state) & (TCPF_SYN_SENT|TCPF_SYN_RECV))
573 answ = 0;
574 else if (sk->urginline || !tp->urg_data ||
575 before(tp->urg_seq,tp->copied_seq) ||
576 !before(tp->urg_seq,tp->rcv_nxt))
577 answ = tp->rcv_nxt - tp->copied_seq;
578 else
579 answ = tp->urg_seq - tp->copied_seq;
580 release_sock(sk);
581 break;
582 case SIOCATMARK:
584 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
585 break;
587 case SIOCOUTQ:
588 if (sk->state == TCP_LISTEN)
589 return(-EINVAL);
591 if ((1<<sk->state) & (TCPF_SYN_SENT|TCPF_SYN_RECV))
592 answ = 0;
593 else
594 answ = tp->write_seq - tp->snd_una;
595 break;
596 default:
597 return(-ENOIOCTLCMD);
600 return put_user(answ, (int *)arg);
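/* For illustration, from user space these map onto plain ioctl() calls,
 * e.g. ioctl(fd, SIOCINQ, &queued) to learn how many in-order bytes are
 * waiting to be read, or ioctl(fd, SIOCOUTQ, &unsent) for the number of
 * bytes written but not yet acknowledged by the peer.
 */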
604 int tcp_listen_start(struct sock *sk)
606 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
607 struct tcp_listen_opt *lopt;
609 sk->max_ack_backlog = 0;
610 sk->ack_backlog = 0;
611 tp->accept_queue = NULL;
612 tp->syn_wait_lock = RW_LOCK_UNLOCKED;
614 lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
615 if (!lopt)
616 return -ENOMEM;
618 memset(lopt, 0, sizeof(struct tcp_listen_opt));
619 for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
620 if ((1<<lopt->max_qlen_log) >= sysctl_max_syn_backlog)
621 break;
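/* For example, with sysctl_max_syn_backlog set to 128 the loop above
 * stops at max_qlen_log = 7 (1<<7 == 128): the SYN queue limit is
 * rounded up to a power of two and is never smaller than 1<<6 == 64.
 */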
623 write_lock_bh(&tp->syn_wait_lock);
624 tp->listen_opt = lopt;
625 write_unlock_bh(&tp->syn_wait_lock);
627 sk->state = TCP_LISTEN;
628 if (sk->num == 0) {
629 if (sk->prot->get_port(sk, 0) != 0) {
630 sk->state = TCP_CLOSE;
631 write_lock_bh(&tp->syn_wait_lock);
632 tp->listen_opt = NULL;
633 write_unlock_bh(&tp->syn_wait_lock);
634 kfree(lopt);
635 return -EAGAIN;
637 sk->sport = htons(sk->num);
638 } else {
639 if (sk->prev)
640 ((struct tcp_bind_bucket*)sk->prev)->fastreuse = 0;
643 sk_dst_reset(sk);
644 sk->prot->hash(sk);
645 sk->socket->flags |= SO_ACCEPTCON;
646 sk->write_space = tcp_listen_write_space;
648 return 0;
652 * This routine closes sockets which have been at least partially
653 * opened, but not yet accepted.
656 static void tcp_listen_stop (struct sock *sk)
658 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
659 struct tcp_listen_opt *lopt = tp->listen_opt;
660 struct open_request *acc_req = tp->accept_queue;
661 struct open_request *req;
662 int i;
664 tcp_delete_keepalive_timer(sk);
666 /* make all the listen_opt local to us */
667 write_lock_bh(&tp->syn_wait_lock);
668 tp->listen_opt =NULL;
669 write_unlock_bh(&tp->syn_wait_lock);
670 tp->accept_queue = NULL;
672 if (lopt->qlen) {
673 for (i=0; i<TCP_SYNQ_HSIZE; i++) {
674 while ((req = lopt->syn_table[i]) != NULL) {
675 lopt->syn_table[i] = req->dl_next;
676 lopt->qlen--;
677 tcp_openreq_free(req);
679 /* Following specs, it would be better either to send FIN
680 * (and enter FIN-WAIT-1, it is normal close)
681 * or to send active reset (abort).
682 * Certainly, it is pretty dangerous during a synflood, but it is
683 * a bad justification for our negligence 8)
684 * To be honest, we are not able to make either
685 * of the variants now. --ANK
690 BUG_TRAP(lopt->qlen == 0);
692 kfree(lopt);
694 while ((req=acc_req) != NULL) {
695 struct sock *child = req->sk;
697 acc_req = req->dl_next;
699 local_bh_disable();
700 bh_lock_sock(child);
701 BUG_TRAP(child->lock.users==0);
702 sock_hold(child);
704 tcp_disconnect(child, O_NONBLOCK);
706 sock_orphan(child);
708 atomic_inc(&tcp_orphan_count);
710 tcp_destroy_sock(child);
712 bh_unlock_sock(child);
713 local_bh_enable();
714 sock_put(child);
716 tcp_acceptq_removed(sk);
717 tcp_openreq_fastfree(req);
719 BUG_TRAP(sk->ack_backlog == 0);
723 * Wait for a socket to get into the connected state
725 * Note: Must be called with the socket locked.
727 static int wait_for_tcp_connect(struct sock * sk, int flags, long *timeo_p)
729 struct task_struct *tsk = current;
730 DECLARE_WAITQUEUE(wait, tsk);
732 while((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
733 if(sk->err)
734 return sock_error(sk);
735 if((1 << sk->state) &
736 ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
737 if(sk->keepopen && !(flags&MSG_NOSIGNAL))
738 send_sig(SIGPIPE, tsk, 0);
739 return -EPIPE;
741 if(!*timeo_p)
742 return -EAGAIN;
743 if(signal_pending(tsk))
744 return -ERESTARTSYS;
746 __set_task_state(tsk, TASK_INTERRUPTIBLE);
747 add_wait_queue(sk->sleep, &wait);
748 sk->tp_pinfo.af_tcp.write_pending++;
750 release_sock(sk);
751 *timeo_p = schedule_timeout(*timeo_p);
752 lock_sock(sk);
754 __set_task_state(tsk, TASK_RUNNING);
755 remove_wait_queue(sk->sleep, &wait);
756 sk->tp_pinfo.af_tcp.write_pending--;
758 return 0;
761 static inline int tcp_memory_free(struct sock *sk)
763 return atomic_read(&sk->wmem_alloc) < sk->sndbuf;
767 * Wait for more memory for a socket
769 static long wait_for_tcp_memory(struct sock * sk, long timeo)
771 if (!tcp_memory_free(sk)) {
772 DECLARE_WAITQUEUE(wait, current);
774 sk->socket->flags &= ~SO_NOSPACE;
775 add_wait_queue(sk->sleep, &wait);
776 for (;;) {
777 set_current_state(TASK_INTERRUPTIBLE);
779 if (signal_pending(current))
780 break;
781 if (tcp_memory_free(sk))
782 break;
783 if (sk->shutdown & SEND_SHUTDOWN)
784 break;
785 if (sk->err)
786 break;
787 release_sock(sk);
788 if (!tcp_memory_free(sk))
789 timeo = schedule_timeout(timeo);
790 lock_sock(sk);
792 current->state = TASK_RUNNING;
793 remove_wait_queue(sk->sleep, &wait);
795 return timeo;
798 /* When all user supplied data has been queued set the PSH bit */
799 #define PSH_NEEDED (seglen == 0 && iovlen == 0)
802 * This routine copies from a user buffer into a socket,
803 * and starts the transmit system.
806 int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size)
808 struct iovec *iov;
809 struct tcp_opt *tp;
810 struct sk_buff *skb;
811 int iovlen, flags;
812 int mss_now;
813 int err, copied;
814 long timeo;
816 err = 0;
817 tp = &(sk->tp_pinfo.af_tcp);
819 lock_sock(sk);
820 TCP_CHECK_TIMER(sk);
822 flags = msg->msg_flags;
824 timeo = sock_sndtimeo(sk, flags&MSG_DONTWAIT);
826 /* Wait for a connection to finish. */
827 if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
828 if((err = wait_for_tcp_connect(sk, flags, &timeo)) != 0)
829 goto out_unlock;
831 /* This should be in poll */
832 sk->socket->flags &= ~SO_NOSPACE; /* clear SIGIO XXX */
834 mss_now = tcp_current_mss(sk);
836 /* Ok commence sending. */
837 iovlen = msg->msg_iovlen;
838 iov = msg->msg_iov;
839 copied = 0;
841 while(--iovlen >= 0) {
842 int seglen=iov->iov_len;
843 unsigned char * from=iov->iov_base;
845 iov++;
847 while(seglen > 0) {
848 int copy, tmp, queue_it;
850 if (err)
851 goto do_fault2;
853 /* Stop on errors. */
854 if (sk->err)
855 goto do_sock_err;
857 /* Make sure that we are established. */
858 if (sk->shutdown & SEND_SHUTDOWN)
859 goto do_shutdown;
861 /* Now we need to check if we have a half
862 * built packet we can tack some data onto.
864 if (tp->send_head && !(flags & MSG_OOB)) {
865 skb = sk->write_queue.prev;
866 copy = skb->len;
867 /* If the remote does SWS avoidance we should
868 * queue the best we can; if not, we should in
869 * fact send multiple packets...
870 * A method for detecting this would be most
871 * welcome.
873 if (skb_tailroom(skb) > 0 &&
874 (mss_now - copy) > 0) {
875 int last_byte_was_odd = (copy % 4);
877 copy = mss_now - copy;
878 if(copy > skb_tailroom(skb))
879 copy = skb_tailroom(skb);
880 if(copy > seglen)
881 copy = seglen;
882 if(last_byte_was_odd) {
883 if(copy_from_user(skb_put(skb, copy),
884 from, copy))
885 err = -EFAULT;
886 skb->csum = csum_partial(skb->data,
887 skb->len, 0);
888 } else {
889 skb->csum =
890 csum_and_copy_from_user(
891 from, skb_put(skb, copy),
892 copy, skb->csum, &err);
895 * FIXME: the *_user functions should
896 * return how much data was
897 * copied before the fault
898 * occurred and then a partial
899 * packet with this data should
900 * be sent. Unfortunately
901 * csum_and_copy_from_user doesn't
902 * return this information.
903 * ATM it might send partly zeroed
904 * data in this case.
906 tp->write_seq += copy;
907 TCP_SKB_CB(skb)->end_seq += copy;
908 from += copy;
909 copied += copy;
910 seglen -= copy;
911 if (PSH_NEEDED)
912 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
913 continue;
917 /* A chunk was here doing something strange
918 * with psh etc. It is deleted, because it was
919 * evident non-sense. --ANK
922 copy = min(seglen, mss_now);
924 /* Determine how large of a buffer to allocate. */
925 tmp = MAX_TCP_HEADER + 15;
926 if (copy < mss_now && !(flags & MSG_OOB)) {
927 tmp += mss_now;
929 /* What is happening here is that we want to
930 * tack on later members of the users iovec
931 * if possible into a single frame. When we
932 * leave this loop our caller checks to see if
933 * we can send queued frames onto the wire.
934 * See tcp_v[46]_sendmsg() for this.
936 queue_it = 1;
937 } else {
938 tmp += copy;
939 queue_it = 0;
941 skb = sock_wmalloc(sk, tmp, 0, GFP_KERNEL);
943 /* If we didn't get any memory, we need to sleep. */
944 if (skb == NULL) {
945 sk->socket->flags |= SO_NOSPACE;
946 if (!timeo) {
947 err = -EAGAIN;
948 goto do_interrupted;
950 if (signal_pending(current)) {
951 err = -ERESTARTSYS;
952 goto do_interrupted;
954 __tcp_push_pending_frames(sk, tp, mss_now);
955 timeo = wait_for_tcp_memory(sk, timeo);
957 /* If SACK's were formed or PMTU events happened,
958 * we must find out about it.
960 mss_now = tcp_current_mss(sk);
961 continue;
964 seglen -= copy;
966 /* Prepare control bits for TCP header creation engine. */
967 TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK |
968 ((PSH_NEEDED) ?
969 TCPCB_FLAG_PSH : 0));
970 TCP_SKB_CB(skb)->sacked = 0;
971 if (flags & MSG_OOB) {
972 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_URG;
973 TCP_SKB_CB(skb)->urg_ptr = copy;
974 } else
975 TCP_SKB_CB(skb)->urg_ptr = 0;
977 /* TCP data bytes are SKB_PUT() on top, later
978 * TCP+IP+DEV headers are SKB_PUSH()'d beneath.
979 * Reserve header space and checksum the data.
981 skb_reserve(skb, MAX_TCP_HEADER);
982 skb->csum = csum_and_copy_from_user(from,
983 skb_put(skb, copy), copy, 0, &err);
985 if (err)
986 goto do_fault;
988 from += copy;
989 copied += copy;
991 TCP_SKB_CB(skb)->seq = tp->write_seq;
992 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + copy;
994 /* This advances tp->write_seq for us. */
995 tcp_send_skb(sk, skb, queue_it, mss_now);
998 sk->err = 0;
999 err = copied;
1000 goto out;
1002 do_sock_err:
1003 if(copied)
1004 err = copied;
1005 else
1006 err = sock_error(sk);
1007 goto out;
1008 do_shutdown:
1009 if(copied)
1010 err = copied;
1011 else {
1012 if (!(flags&MSG_NOSIGNAL))
1013 send_sig(SIGPIPE, current, 0);
1014 err = -EPIPE;
1016 goto out;
1017 do_interrupted:
1018 if(copied)
1019 err = copied;
1020 goto out;
1021 do_fault:
1022 kfree_skb(skb);
1023 do_fault2:
1024 err = -EFAULT;
1025 out:
1026 __tcp_push_pending_frames(sk, tp, mss_now);
1027 TCP_CHECK_TIMER(sk);
1028 out_unlock:
1029 release_sock(sk);
1030 tcp_push_pending_frames(sk, tp);
1031 return err;
1034 #undef PSH_NEEDED
1037 * Handle reading urgent data. BSD has very simple semantics for
1038 * this, no blocking and very strange errors 8)
1041 static int tcp_recv_urg(struct sock * sk, long timeo,
1042 struct msghdr *msg, int len, int flags,
1043 int *addr_len)
1045 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1047 /* No URG data to read. */
1048 if (sk->urginline || !tp->urg_data || tp->urg_data == TCP_URG_READ)
1049 return -EINVAL; /* Yes this is right ! */
1051 if (sk->done)
1052 return -ENOTCONN;
1054 if (tp->urg_data & TCP_URG_VALID) {
1055 int err = 0;
1056 char c = tp->urg_data;
1058 if (!(flags & MSG_PEEK))
1059 tp->urg_data = TCP_URG_READ;
1061 /* Read urgent data. */
1062 msg->msg_flags|=MSG_OOB;
1064 if(len>0) {
1065 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1066 len = 1;
1067 } else
1068 msg->msg_flags|=MSG_TRUNC;
1070 return err ? -EFAULT : len;
1073 /* Do not set sk->done, it is set only by normal data receive */
1074 if (sk->state == TCP_CLOSE || (sk->shutdown & RCV_SHUTDOWN))
1075 return 0;
1077 /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
1078 * the available implementations agree in this case:
1079 * this call should never block, independent of the
1080 * blocking state of the socket.
1081 * Mike <pall@rz.uni-karlsruhe.de>
1083 return -EAGAIN;
1087 * Release a skb if it is no longer needed. This routine
1088 * must be called with interrupts disabled or with the
1089 * socket locked so that the sk_buff queue operation is ok.
1092 static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb)
1094 __skb_unlink(skb, &sk->receive_queue);
1095 BUG_TRAP(atomic_read(&skb->users) == 1);
1096 /* Well, if I missed something then the punishment will be a terrible oops. */
1097 __kfree_skb(skb);
1100 /* Clean up the receive buffer for full frames taken by the user,
1101 * then send an ACK if necessary. COPIED is the number of bytes
1102 * tcp_recvmsg has given to the user so far, it speeds up the
1103 * calculation of whether or not we must ACK for the sake of
1104 * a window update.
1106 static void cleanup_rbuf(struct sock *sk, int copied)
1108 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1109 struct sk_buff *skb;
1110 int time_to_ack;
1112 /* NOTE! The socket must be locked, so that we don't get
1113 * a messed-up receive queue.
1115 while ((skb=skb_peek(&sk->receive_queue)) != NULL) {
1116 if (!skb->used)
1117 break;
1118 tcp_eat_skb(sk, skb);
1121 /* Delayed ACKs frequently hit locked sockets during bulk receive. */
1122 time_to_ack = tp->ack.blocked && tp->ack.pending;
1123 #if 1/*def CONFIG_TCP_MORE_COARSE_ACKS*/
1124 if (tp->ack.pending &&
1125 (tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss)
1126 time_to_ack = 1;
1127 #endif
1129 /* We send an ACK if we can now advertise a non-zero window
1130 * which has been raised "significantly".
1132 * Even if window raised up to infinity, do not send window open ACK
1133 * in states, where we will not receive more. It is useless.
1135 if(copied > 0 && !time_to_ack && !(sk->shutdown&RCV_SHUTDOWN)) {
1136 __u32 rcv_window_now = tcp_receive_window(tp);
1137 __u32 new_window = __tcp_select_window(sk);
1139 /* We won't be raising the window any further than
1140 * the window-clamp allows. Our window selection
1141 * also keeps things a nice multiple of MSS. These
1142 * checks are necessary to prevent spurious ACKs
1143 * which don't advertise a larger window.
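 * For illustration: if we currently advertise an 8KB window and
 * __tcp_select_window() now allows 32KB, the first check passes
 * (32KB >= 2 * 8KB); we then ACK only if the old window plus one
 * rcv_mss still fits under the window clamp, so the advertised
 * window never outgrows it.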
1145 if((new_window && (new_window >= rcv_window_now * 2)) &&
1146 ((rcv_window_now + tp->ack.rcv_mss) <= tp->window_clamp))
1147 time_to_ack = 1;
1149 if (time_to_ack)
1150 tcp_send_ack(sk);
1153 /* Now socket state including sk->err is changed only under lock,
1154 * hence we may omit checks after joining wait queue.
1155 * We check receive queue before schedule() only as optimization;
1156 * it is very likely that release_sock() added new data.
1159 static long tcp_data_wait(struct sock *sk, long timeo)
1161 DECLARE_WAITQUEUE(wait, current);
1163 add_wait_queue(sk->sleep, &wait);
1165 __set_current_state(TASK_INTERRUPTIBLE);
1167 sk->socket->flags |= SO_WAITDATA;
1168 release_sock(sk);
1170 if (skb_queue_empty(&sk->receive_queue))
1171 timeo = schedule_timeout(timeo);
1173 lock_sock(sk);
1174 sk->socket->flags &= ~SO_WAITDATA;
1176 remove_wait_queue(sk->sleep, &wait);
1177 __set_current_state(TASK_RUNNING);
1178 return timeo;
1181 static void tcp_prequeue_process(struct sock *sk)
1183 struct sk_buff *skb;
1184 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1186 net_statistics[smp_processor_id()*2+1].TCPPrequeued += skb_queue_len(&tp->ucopy.prequeue);
1188 /* RX process wants to run with disabled BHs, though it is not necessary */
1189 local_bh_disable();
1190 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1191 sk->backlog_rcv(sk, skb);
1192 local_bh_enable();
1194 /* Clear memory counter. */
1195 tp->ucopy.memory = 0;
1199 * This routine copies from a sock struct into the user buffer.
1201 * Technical note: in 2.3 we work on _locked_ socket, so that
1202 * tricks with *seq access order and skb->users are not required.
1203 * Probably, code can be easily improved even more.
1206 int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
1207 int len, int nonblock, int flags, int *addr_len)
1209 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1210 int copied = 0;
1211 u32 peek_seq;
1212 u32 *seq;
1213 unsigned long used;
1214 int err;
1215 int target; /* Read at least this many bytes */
1216 long timeo;
1217 struct task_struct *user_recv = NULL;
1219 lock_sock(sk);
1221 TCP_CHECK_TIMER(sk);
1224 if (sk->err)
1225 goto out_err;
1227 err = -ENOTCONN;
1228 if (sk->state == TCP_LISTEN)
1229 goto out;
1231 timeo = sock_rcvtimeo(sk, nonblock);
1233 /* Urgent data needs to be handled specially. */
1234 if (flags & MSG_OOB)
1235 goto recv_urg;
1237 seq = &tp->copied_seq;
1238 if (flags & MSG_PEEK) {
1239 peek_seq = tp->copied_seq;
1240 seq = &peek_seq;
1243 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1246 * BUG BUG BUG
1247 * This violates 1003.1g compliance. We must wait for
1248 * data to exist even if we read none!
1251 while (len > 0) {
1252 struct sk_buff * skb;
1253 u32 offset;
1255 /* Are we at urgent data? Stop if we have read anything. */
1256 if (copied && tp->urg_data && tp->urg_seq == *seq)
1257 break;
1259 /* We need to check signals first, to get correct SIGURG
1260 * handling. FIXME: Need to check this doesn't impact 1003.1g
1261 * and move it down to the bottom of the loop
1263 if (signal_pending(current)) {
1264 if (copied)
1265 break;
1266 copied = -ERESTARTSYS;
1267 if (!timeo)
1268 copied = -EAGAIN;
1269 break;
1272 /* Next get a buffer. */
1274 skb = skb_peek(&sk->receive_queue);
1275 do {
1276 if (!skb)
1277 break;
1279 /* Now that we have two receive queues this
1280 * shouldn't happen.
1282 if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1283 printk(KERN_INFO "recvmsg bug: copied %X seq %X\n",
1284 *seq, TCP_SKB_CB(skb)->seq);
1285 break;
1287 offset = *seq - TCP_SKB_CB(skb)->seq;
1288 if (skb->h.th->syn)
1289 offset--;
1290 if (offset < skb->len)
1291 goto found_ok_skb;
1292 if (skb->h.th->fin)
1293 goto found_fin_ok;
1294 if (!(flags & MSG_PEEK))
1295 skb->used = 1;
1296 skb = skb->next;
1297 } while (skb != (struct sk_buff *)&sk->receive_queue);
1299 /* Well, if we have backlog, try to process it now.
1301 if (copied >= target && sk->backlog.tail == NULL)
1302 break;
1304 if (copied) {
1305 if (sk->err ||
1306 sk->state == TCP_CLOSE ||
1307 (sk->shutdown & RCV_SHUTDOWN) ||
1308 !timeo)
1309 break;
1310 } else {
1311 if (sk->err) {
1312 copied = sock_error(sk);
1313 break;
1316 if (sk->done) {
1317 copied = -ENOTCONN;
1318 break;
1321 if (sk->state == TCP_CLOSE) {
1322 if (!(flags&MSG_PEEK))
1323 sk->done = 1;
1324 break;
1327 if (sk->shutdown & RCV_SHUTDOWN)
1328 break;
1330 if (!timeo) {
1331 copied = -EAGAIN;
1332 break;
1336 cleanup_rbuf(sk, copied);
1338 if (tp->ucopy.task == user_recv) {
1339 /* Install new reader */
1340 if (user_recv == NULL && !(flags&MSG_PEEK)) {
1341 user_recv = current;
1342 tp->ucopy.task = user_recv;
1343 tp->ucopy.iov = msg->msg_iov;
1346 tp->ucopy.len = len;
1348 BUG_TRAP(tp->copied_seq == tp->rcv_nxt);
1350 /* __ Set realtime policy in scheduler __ */
1353 if (copied >= target) {
1354 /* Do not sleep, just process backlog. */
1355 release_sock(sk);
1356 lock_sock(sk);
1357 } else {
1358 timeo = tcp_data_wait(sk, timeo);
1361 if (user_recv) {
1362 int chunk;
1364 /* __ Restore normal policy in scheduler __ */
1366 if ((chunk = len - tp->ucopy.len) != 0) {
1367 net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromBacklog += chunk;
1368 len -= chunk;
1369 copied += chunk;
1372 if (tp->rcv_nxt == tp->copied_seq &&
1373 skb_queue_len(&tp->ucopy.prequeue)) {
1374 tcp_prequeue_process(sk);
1376 if ((chunk = len - tp->ucopy.len) != 0) {
1377 net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromPrequeue += chunk;
1378 len -= chunk;
1379 copied += chunk;
1382 #if 1/*def CONFIG_TCP_MORE_COARSE_ACKS*/
1383 if (tp->ack.pending &&
1384 (tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss)
1385 tcp_send_ack(sk);
1386 #endif
1388 continue;
1390 found_ok_skb:
1391 /* Ok so how much can we use? */
1392 used = skb->len - offset;
1393 if (len < used)
1394 used = len;
1396 /* Do we have urgent data here? */
1397 if (tp->urg_data) {
1398 u32 urg_offset = tp->urg_seq - *seq;
1399 if (urg_offset < used) {
1400 if (!urg_offset) {
1401 if (!sk->urginline) {
1402 ++*seq;
1403 offset++;
1404 used--;
1406 } else
1407 used = urg_offset;
1411 err = memcpy_toiovec(msg->msg_iov, ((unsigned char *)skb->h.th) + skb->h.th->doff*4 + offset, used);
1412 if (err) {
1413 /* Exception. Bailout! */
1414 if (!copied)
1415 copied = -EFAULT;
1416 break;
1419 *seq += used;
1420 copied += used;
1421 len -= used;
1423 if (after(tp->copied_seq,tp->urg_seq)) {
1424 tp->urg_data = 0;
1425 if (skb_queue_len(&tp->out_of_order_queue) == 0
1426 #ifdef TCP_FORMAL_WINDOW
1427 && tcp_receive_window(tp)
1428 #endif
1430 tcp_fast_path_on(tp);
1433 if (used + offset < skb->len)
1434 continue;
1436 /* Process the FIN. We may also need to handle PSH
1437 * here and make it break out of MSG_WAITALL.
1439 if (skb->h.th->fin)
1440 goto found_fin_ok;
1441 if (flags & MSG_PEEK)
1442 continue;
1443 skb->used = 1;
1444 tcp_eat_skb(sk, skb);
1446 #ifdef CONFIG_TCP_LESS_COARSE_ACKS
1447 /* Possible improvement. When sender is faster than receiver,
1448 * traffic looks like: fill window ... wait for window open ...
1449 * fill window. We lose at least one rtt, because we call
1450 * cleanup_rbuf only once. Probably, if "len" was large
1451 * we should insert several intermediate cleanup_rbuf(s).
1453 * F.e.:
1455 do {
1456 u32 full_space = min(tp->window_clamp, tcp_full_space(sk));
1458 /* Try to ACK if the total buffer length is larger
1459 than the maximal window and if rcv_window has
1460 a chance to at least double. This results
1461 in exponentially decreased ACKing during
1462 reads into a huge (usually mmapped) buffer.
1464 if (len >= full_space && tp->rcv_wnd <= full_space/2)
1465 cleanup_rbuf(sk, copied);
1466 } while (0);
1467 #endif
1468 continue;
1470 found_fin_ok:
1471 ++*seq;
1472 if (flags & MSG_PEEK)
1473 break;
1475 /* All is done. */
1476 skb->used = 1;
1477 break;
1480 if (user_recv) {
1481 if (skb_queue_len(&tp->ucopy.prequeue)) {
1482 int chunk;
1484 tp->ucopy.len = copied > 0 ? len : 0;
1486 tcp_prequeue_process(sk);
1488 if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1489 net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromPrequeue += chunk;
1490 len -= chunk;
1491 copied += chunk;
1495 tp->ucopy.task = NULL;
1496 tp->ucopy.len = 0;
1499 /* According to UNIX98, msg_name/msg_namelen are ignored
1500 * on a connected socket. I was just happy when I found this 8) --ANK
1503 /* Clean up data we have read: This will do ACK frames. */
1504 cleanup_rbuf(sk, copied);
1506 TCP_CHECK_TIMER(sk);
1507 release_sock(sk);
1508 return copied;
1510 out_err:
1511 err = sock_error(sk);
1513 out:
1514 TCP_CHECK_TIMER(sk);
1515 release_sock(sk);
1516 return err;
1518 recv_urg:
1519 err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1520 goto out;
1524 * State processing on a close. This implements the state shift for
1525 * sending our FIN frame. Note that we only send a FIN for some
1526 * states. A shutdown() may have already sent the FIN, or we may be
1527 * closed.
1530 static unsigned char new_state[16] = {
1531 /* current state: new state: action: */
1532 /* (Invalid) */ TCP_CLOSE,
1533 /* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1534 /* TCP_SYN_SENT */ TCP_CLOSE,
1535 /* TCP_SYN_RECV */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1536 /* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1,
1537 /* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2,
1538 /* TCP_TIME_WAIT */ TCP_CLOSE,
1539 /* TCP_CLOSE */ TCP_CLOSE,
1540 /* TCP_CLOSE_WAIT */ TCP_LAST_ACK | TCP_ACTION_FIN,
1541 /* TCP_LAST_ACK */ TCP_LAST_ACK,
1542 /* TCP_LISTEN */ TCP_CLOSE,
1543 /* TCP_CLOSING */ TCP_CLOSING,
1546 static int tcp_close_state(struct sock *sk)
1548 int next = (int) new_state[sk->state];
1549 int ns = (next & TCP_STATE_MASK);
1551 tcp_set_state(sk, ns);
1553 return (next & TCP_ACTION_FIN);
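/* For illustration: closing an ESTABLISHED socket looks up
 * new_state[TCP_ESTABLISHED] == TCP_FIN_WAIT1 | TCP_ACTION_FIN, so
 * tcp_close_state() moves the socket to FIN_WAIT1 and returns non-zero,
 * and the caller (tcp_close() or tcp_shutdown()) then transmits a FIN.
 * A close from CLOSE_WAIT maps to LAST_ACK | TCP_ACTION_FIN the same way.
 */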
1557 * Shutdown the sending side of a connection. Much like close except
1558 * that we don't shut down the receiving side or set sk->dead.
1561 void tcp_shutdown(struct sock *sk, int how)
1563 /* We need to grab some memory, and put together a FIN,
1564 * and then put it into the queue to be sent.
1565 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1567 if (!(how & SEND_SHUTDOWN))
1568 return;
1570 /* If we've already sent a FIN, or it's a closed state, skip this. */
1571 if ((1 << sk->state) &
1572 (TCPF_ESTABLISHED|TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE_WAIT)) {
1573 /* Clear out any half completed packets. FIN if needed. */
1574 if (tcp_close_state(sk))
1575 tcp_send_fin(sk);
1581 * Return 1 if we still have things to send in our buffers.
1584 static inline int closing(struct sock * sk)
1586 return ((1 << sk->state) & (TCPF_FIN_WAIT1|TCPF_CLOSING|TCPF_LAST_ACK));
1589 static __inline__ void tcp_kill_sk_queues(struct sock *sk)
1591 /* First the read buffer. */
1592 skb_queue_purge(&sk->receive_queue);
1594 /* Next, the error queue. */
1595 skb_queue_purge(&sk->error_queue);
1597 /* Next, the write queue. */
1598 BUG_TRAP(skb_queue_empty(&sk->write_queue));
1600 /* It is _impossible_ for the backlog to contain anything
1601 * when we get here. All user references to this socket
1602 * have gone away; only the net layer can still touch it.
1607 * At this point, there should be no process reference to this
1608 * socket, and thus no user references at all. Therefore we
1609 * can assume the socket waitqueue is inactive and nobody will
1610 * try to jump onto it.
1612 void tcp_destroy_sock(struct sock *sk)
1614 BUG_TRAP(sk->state==TCP_CLOSE);
1615 BUG_TRAP(sk->dead);
1617 /* It cannot be in hash table! */
1618 BUG_TRAP(sk->pprev==NULL);
1620 /* If it has a non-zero sk->num, it must be bound */
1621 BUG_TRAP(!sk->num || sk->prev!=NULL);
1623 #ifdef TCP_DEBUG
1624 if (sk->zapped) {
1625 printk("TCP: double destroy sk=%p\n", sk);
1626 sock_hold(sk);
1628 sk->zapped = 1;
1629 #endif
1631 sk->prot->destroy(sk);
1633 tcp_kill_sk_queues(sk);
1635 #ifdef INET_REFCNT_DEBUG
1636 if (atomic_read(&sk->refcnt) != 1) {
1637 printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n", sk, atomic_read(&sk->refcnt));
1639 #endif
1641 atomic_dec(&tcp_orphan_count);
1642 sock_put(sk);
1645 void tcp_close(struct sock *sk, long timeout)
1647 struct sk_buff *skb;
1648 int data_was_unread = 0;
1650 lock_sock(sk);
1651 sk->shutdown = SHUTDOWN_MASK;
1653 if(sk->state == TCP_LISTEN) {
1654 tcp_set_state(sk, TCP_CLOSE);
1656 /* Special case. */
1657 tcp_listen_stop(sk);
1659 goto adjudge_to_death;
1662 /* We need to flush the recv. buffs. We do this only on the
1663 * descriptor close, not protocol-sourced closes, because the
1664 * reader process may not have drained the data yet!
1666 while((skb=__skb_dequeue(&sk->receive_queue))!=NULL) {
1667 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - skb->h.th->fin;
1668 data_was_unread += len;
1669 kfree_skb(skb);
1672 /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1673 * 3.10, we send a RST here because data was lost. To
1674 * witness the awful effects of the old behavior of always
1675 * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1676 * a bulk GET in an FTP client, suspend the process, wait
1677 * for the client to advertise a zero window, then kill -9
1678 * the FTP client, wheee... Note: timeout is always zero
1679 * in such a case.
1681 if(data_was_unread != 0) {
1682 /* Unread data was tossed, zap the connection. */
1683 tcp_set_state(sk, TCP_CLOSE);
1684 tcp_send_active_reset(sk, GFP_KERNEL);
1685 } else if (sk->linger && sk->lingertime==0) {
1686 /* Check zero linger _after_ checking for unread data. */
1687 sk->prot->disconnect(sk, 0);
1688 } else if (tcp_close_state(sk)) {
1689 /* We FIN if the application ate all the data before
1690 * zapping the connection.
1693 /* RED-PEN. Formally speaking, we have broken TCP state
1694 * machine. State transitions:
1696 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1697 * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1698 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1700 * are legal only when FIN has been sent (i.e. in window),
1701 * rather than queued out of window. Purists blame.
1703 * F.e. "RFC state" is ESTABLISHED,
1704 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
1706 * The visible deviations are that sometimes
1707 * we enter time-wait state when it is not really required
1708 * (harmless), and do not send active resets when they are
1709 * required by the specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
1710 * they look like CLOSING or LAST_ACK to Linux).
1711 * Probably, I missed some more holelets.
1712 * --ANK
1714 tcp_send_fin(sk);
1717 if (timeout) {
1718 struct task_struct *tsk = current;
1719 DECLARE_WAITQUEUE(wait, current);
1721 add_wait_queue(sk->sleep, &wait);
1723 do {
1724 set_current_state(TASK_INTERRUPTIBLE);
1725 if (!closing(sk))
1726 break;
1727 release_sock(sk);
1728 timeout = schedule_timeout(timeout);
1729 lock_sock(sk);
1730 } while (!signal_pending(tsk) && timeout);
1732 tsk->state = TASK_RUNNING;
1733 remove_wait_queue(sk->sleep, &wait);
1736 adjudge_to_death:
1737 /* It is the last release_sock in its life. It will remove backlog. */
1738 release_sock(sk);
1741 /* Now socket is owned by kernel and we acquire BH lock
1742 to finish close. No need to check for user refs.
1744 local_bh_disable();
1745 bh_lock_sock(sk);
1746 BUG_TRAP(sk->lock.users==0);
1748 sock_hold(sk);
1749 sock_orphan(sk);
1751 /* This is a (useful) BSD violation of the RFC. There is a
1752 * problem with TCP as specified in that the other end could
1753 * keep a socket open forever with no application left at this end.
1754 * We use a 3 minute timeout (about the same as BSD) then kill
1755 * our end. If they send after that then tough - BUT: long enough
1756 * that we won't make the old 4*rto = almost no time - whoops
1757 * reset mistake.
1759 * Nope, it was not a mistake. It is really desired behaviour
1760 * f.e. on http servers, when such sockets are useless, but
1761 * consume significant resources. Let's do it with special
1762 * linger2 option. --ANK
1765 if (sk->state == TCP_FIN_WAIT2) {
1766 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1767 if (tp->linger2 < 0) {
1768 tcp_set_state(sk, TCP_CLOSE);
1769 tcp_send_active_reset(sk, GFP_ATOMIC);
1770 } else {
1771 int tmo = tcp_fin_time(tp);
1773 if (tmo > TCP_TIMEWAIT_LEN) {
1774 tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
1775 } else {
1776 atomic_inc(&tcp_orphan_count);
1777 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1778 goto out;
1782 if (sk->state != TCP_CLOSE &&
1783 atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans) {
1784 if (net_ratelimit())
1785 printk(KERN_INFO "TCP: too many orphaned sockets\n");
1786 tcp_set_state(sk, TCP_CLOSE);
1787 tcp_send_active_reset(sk, GFP_ATOMIC);
1789 atomic_inc(&tcp_orphan_count);
1791 if (sk->state == TCP_CLOSE)
1792 tcp_destroy_sock(sk);
1793 /* Otherwise, socket is reprieved until protocol close. */
1795 out:
1796 bh_unlock_sock(sk);
1797 local_bh_enable();
1798 sock_put(sk);
1801 /* These states need RST on ABORT according to RFC793 */
1803 extern __inline__ int tcp_need_reset(int state)
1805 return ((1 << state) &
1806 (TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1|
1807 TCPF_FIN_WAIT2|TCPF_SYN_RECV));
1810 int tcp_disconnect(struct sock *sk, int flags)
1812 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1813 int old_state;
1814 int err = 0;
1816 old_state = sk->state;
1817 if (old_state != TCP_CLOSE)
1818 tcp_set_state(sk, TCP_CLOSE);
1820 /* ABORT function of RFC793 */
1821 if (old_state == TCP_LISTEN) {
1822 tcp_listen_stop(sk);
1823 } else if (tcp_need_reset(old_state) ||
1824 (tp->snd_nxt != tp->write_seq &&
1825 (1<<old_state)&(TCPF_CLOSING|TCPF_LAST_ACK))) {
1826 /* The last check adjusts for the discrepancy between Linux and RFC
1827 * states
1829 tcp_send_active_reset(sk, gfp_any());
1830 sk->err = ECONNRESET;
1831 } else if (old_state == TCP_SYN_SENT)
1832 sk->err = ECONNRESET;
1834 tcp_clear_xmit_timers(sk);
1835 __skb_queue_purge(&sk->receive_queue);
1836 __skb_queue_purge(&sk->write_queue);
1837 __skb_queue_purge(&tp->out_of_order_queue);
1839 sk->dport = 0;
1841 sk->rcv_saddr = 0;
1842 sk->saddr = 0;
1843 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1844 memset(&sk->net_pinfo.af_inet6.saddr, 0, 16);
1845 memset(&sk->net_pinfo.af_inet6.rcv_saddr, 0, 16);
1846 #endif
1848 sk->shutdown = 0;
1849 sk->done = 0;
1850 sk->write_space = tcp_write_space;
1851 tp->srtt = 0;
1852 if (sysctl_tcp_tw_recycle) {
1853 if ((tp->write_seq += 2) == 0)
1854 tp->write_seq = 1;
1855 } else {
1856 tp->write_seq = 0;
1858 tp->backoff = 0;
1859 tp->snd_cwnd = 2;
1860 tp->probes_out = 0;
1861 tp->packets_out = 0;
1862 tp->high_seq = 0;
1863 tp->snd_ssthresh = 0x7fffffff;
1864 tp->snd_cwnd_cnt = 0;
1865 tp->dup_acks = 0;
1866 tcp_delack_init(tp);
1867 tp->send_head = tp->retrans_head = NULL;
1868 tp->saw_tstamp = 0;
1869 __sk_dst_reset(sk);
1871 BUG_TRAP(!sk->num || sk->prev);
1873 sk->error_report(sk);
1874 return err;
1878 * Wait for an incoming connection, avoid race
1879 * conditions. This must be called with the socket locked,
1880 * and without the kernel lock held.
1882 static int wait_for_connect(struct sock * sk, long timeo)
1884 DECLARE_WAITQUEUE(wait, current);
1885 int err;
1888 * True wake-one mechanism for incoming connections: only
1889 * one process gets woken up, not the 'whole herd'.
1890 * Since we do not 'race & poll' for established sockets
1891 * anymore, the common case will execute the loop only once.
1893 * Subtle issue: "add_wait_queue_exclusive()" will be added
1894 * after any current non-exclusive waiters, and we know that
1895 * it will always _stay_ after any new non-exclusive waiters
1896 * because all non-exclusive waiters are added at the
1897 * beginning of the wait-queue. As such, it's ok to "drop"
1898 * our exclusiveness temporarily when we get woken up without
1899 * having to remove and re-insert us on the wait queue.
1901 add_wait_queue_exclusive(sk->sleep, &wait);
1902 for (;;) {
1903 current->state = TASK_EXCLUSIVE | TASK_INTERRUPTIBLE;
1904 release_sock(sk);
1905 if (sk->tp_pinfo.af_tcp.accept_queue == NULL)
1906 timeo = schedule_timeout(timeo);
1907 lock_sock(sk);
1908 err = 0;
1909 if (sk->tp_pinfo.af_tcp.accept_queue)
1910 break;
1911 err = -EINVAL;
1912 if (sk->state != TCP_LISTEN)
1913 break;
1914 err = -ERESTARTSYS;
1915 if (signal_pending(current))
1916 break;
1917 err = -EAGAIN;
1918 if (!timeo)
1919 break;
1921 current->state = TASK_RUNNING;
1922 remove_wait_queue(sk->sleep, &wait);
1923 return err;

/*
 *      This will accept the next outstanding connection.
 *
 *      Be careful about race conditions here - this is subtle.
 */
struct sock *tcp_accept(struct sock *sk, int flags, int *err)
{
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
        struct open_request *req;
        struct sock *newsk;
        int error;
        long timeo;

        lock_sock(sk);

        /* We need to make sure that this socket is listening,
         * and that it has something pending.
         */
        error = -EINVAL;
        if (sk->state != TCP_LISTEN)
                goto out;

        timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);

        /* Find already established connection */
        if (!tp->accept_queue) {
                /* If this is a non blocking socket don't sleep */
                error = -EAGAIN;
                if (!timeo)
                        goto out;

                error = wait_for_connect(sk, timeo);
                if (error)
                        goto out;
        }

        req = tp->accept_queue;
        tp->accept_queue = req->dl_next;

        newsk = req->sk;
        tcp_acceptq_removed(sk);
        tcp_openreq_fastfree(req);
        BUG_TRAP(newsk->state != TCP_SYN_RECV);
        release_sock(sk);
        return newsk;

out:
        release_sock(sk);
        *err = error;
        return NULL;
}
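
/*
 * Usage sketch (userspace, illustrative only): the -EAGAIN path above is what
 * a non-blocking listener observes when the accept queue is empty, and
 * sock_rcvtimeo() means SO_RCVTIMEO bounds how long a blocking accept() may
 * sleep in wait_for_connect().  Assuming a listening descriptor lfd that was
 * set up with O_NONBLOCK elsewhere, EAGAIN simply means nothing is queued yet:
 *
 *      struct sockaddr_in peer;
 *      socklen_t plen = sizeof(peer);
 *      int cfd = accept(lfd, (struct sockaddr *)&peer, &plen);
 *
 *      if (cfd < 0 && errno != EAGAIN)
 *              perror("accept");
 */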

/*
 *      Socket option code for TCP.
 */

int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval,
                   int optlen)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        int val;
        int err = 0;

        if (level != SOL_TCP)
                return tp->af_specific->setsockopt(sk, level, optname,
                                                   optval, optlen);

        if (optlen < sizeof(int))
                return -EINVAL;

        if (get_user(val, (int *)optval))
                return -EFAULT;

        lock_sock(sk);

        switch (optname) {
        case TCP_MAXSEG:
                /* Values greater than interface MTU won't take effect.
                 * However, at the point when this call is done we typically
                 * don't yet know which interface is going to be used.
                 */
                if (val < 8 || val > MAX_TCP_WINDOW) {
                        err = -EINVAL;
                        break;
                }
                tp->user_mss = val;
                break;

        case TCP_NODELAY:
                /* You cannot try to use this and TCP_CORK in
                 * tandem, so let the user know.
                 */
                if (tp->nonagle == 2) {
                        err = -EINVAL;
                        break;
                }
                tp->nonagle = (val == 0) ? 0 : 1;
                break;

        case TCP_CORK:
                /* When set indicates to always queue non-full frames.
                 * Later the user clears this option and we transmit
                 * any pending partial frames in the queue. This is
                 * meant to be used alongside sendfile() to get properly
                 * filled frames when the user (for example) must write
                 * out headers with a write() call first and then use
                 * sendfile to send out the data parts.
                 *
                 * You cannot try to use TCP_NODELAY and this mechanism
                 * at the same time, so let the user know.
                 */
                if (tp->nonagle == 1) {
                        err = -EINVAL;
                        break;
                }
                if (val != 0) {
                        tp->nonagle = 2;
                } else {
                        tp->nonagle = 0;

                        tcp_push_pending_frames(sk, tp);
                }
                break;

        case TCP_KEEPIDLE:
                if (val < 1 || val > MAX_TCP_KEEPIDLE)
                        err = -EINVAL;
                else {
                        tp->keepalive_time = val * HZ;
                        if (sk->keepopen) {
                                __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
                                if (tp->keepalive_time > elapsed)
                                        elapsed = tp->keepalive_time - elapsed;
                                else
                                        elapsed = 0;
                                tcp_reset_keepalive_timer(sk, elapsed);
                        }
                }
                break;
        case TCP_KEEPINTVL:
                if (val < 1 || val > MAX_TCP_KEEPINTVL)
                        err = -EINVAL;
                else
                        tp->keepalive_intvl = val * HZ;
                break;
        case TCP_KEEPCNT:
                if (val < 1 || val > MAX_TCP_KEEPCNT)
                        err = -EINVAL;
                else
                        tp->keepalive_probes = val;
                break;
        case TCP_SYNCNT:
                if (val < 1 || val > MAX_TCP_SYNCNT)
                        err = -EINVAL;
                else
                        tp->syn_retries = val;
                break;

        case TCP_LINGER2:
                if (val < 0)
                        tp->linger2 = -1;
                else if (val > sysctl_tcp_fin_timeout/HZ)
                        tp->linger2 = 0;
                else
                        tp->linger2 = val*HZ;
                break;

        case TCP_DEFER_ACCEPT:
                tp->defer_accept = 0;
                if (val > 0) {
                        /* Translate value in seconds to number of retransmits */
                        while (val > ((TCP_TIMEOUT_INIT/HZ)<<tp->defer_accept))
                                tp->defer_accept++;
                        tp->defer_accept++;
                }
                break;
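
        /* Worked example of the seconds-to-retransmits translation in
         * TCP_DEFER_ACCEPT above (hedged: assumes the TCP_TIMEOUT_INIT of
         * 3*HZ defined in this tree): a request of val = 10 seconds walks
         * the thresholds 3 -> 6 -> 12, stops with defer_accept == 2, and
         * the final increment stores 3, i.e. roughly three SYN-ACK
         * retransmission periods.
         */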

        case TCP_WINDOW_CLAMP:
                if (val == 0) {
                        if (sk->state != TCP_CLOSE) {
                                err = -EINVAL;
                                break;
                        }
                        tp->window_clamp = 0;
                } else {
                        tp->window_clamp = val < SOCK_MIN_RCVBUF/2 ?
                                           SOCK_MIN_SNDBUF : val;
                }
                break;

        default:
                err = -ENOPROTOOPT;
                break;
        }

        release_sock(sk);
        return err;
}
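
/*
 * Usage sketch (userspace, illustrative only) of the TCP_CORK pattern that
 * the comment in the TCP_CORK case describes: cork the socket, write the
 * headers, hand the payload to sendfile(), then uncork so that
 * tcp_push_pending_frames() flushes the last partial frame.  The names fd,
 * hdr, hdr_len, in_fd and file_len are assumptions made up for the example.
 *
 *      int on = 1, off = 0;
 *
 *      setsockopt(fd, SOL_TCP, TCP_CORK, &on, sizeof(on));
 *      write(fd, hdr, hdr_len);
 *      sendfile(fd, in_fd, NULL, file_len);
 *      setsockopt(fd, SOL_TCP, TCP_CORK, &off, sizeof(off));
 *
 * Note that TCP_CORK and TCP_NODELAY are mutually exclusive here, so setting
 * one while the other is active fails with EINVAL.
 */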

int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,
                   int *optlen)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        int val, len;

        if (level != SOL_TCP)
                return tp->af_specific->getsockopt(sk, level, optname,
                                                   optval, optlen);

        if (get_user(len, optlen))
                return -EFAULT;

        len = min(len, sizeof(int));

        switch (optname) {
        case TCP_MAXSEG:
                val = tp->mss_cache;
                if (val == 0 && ((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN)))
                        val = tp->user_mss;
                break;
        case TCP_NODELAY:
                val = (tp->nonagle == 1);
                break;
        case TCP_CORK:
                val = (tp->nonagle == 2);
                break;
        case TCP_KEEPIDLE:
                val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time)/HZ;
                break;
        case TCP_KEEPINTVL:
                val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl)/HZ;
                break;
        case TCP_KEEPCNT:
                val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
                break;
        case TCP_SYNCNT:
                val = tp->syn_retries ? : sysctl_tcp_syn_retries;
                break;
        case TCP_LINGER2:
                val = tp->linger2;
                if (val > 0)
                        val = (val ? : sysctl_tcp_fin_timeout)/HZ;
                break;
        case TCP_DEFER_ACCEPT:
                val = tp->defer_accept == 0 ? 0 :
                      (TCP_TIMEOUT_INIT<<(tp->defer_accept-1));
                break;
        case TCP_WINDOW_CLAMP:
                val = tp->window_clamp;
                break;
        default:
                return -ENOPROTOOPT;
        }

        if (put_user(len, optlen))
                return -EFAULT;
        if (copy_to_user(optval, &val, len))
                return -EFAULT;
        return 0;
}
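
/*
 * Usage sketch (userspace, illustrative only) for the keepalive knobs handled
 * by tcp_setsockopt()/tcp_getsockopt() above.  SO_KEEPALIVE arms the timer;
 * TCP_KEEPIDLE, TCP_KEEPINTVL and TCP_KEEPCNT then override the corresponding
 * tcp_keepalive_* sysctls, and getsockopt() reports whichever value is in
 * effect.  The numbers are arbitrary example values.
 *
 *      int on = 1, idle = 60, intvl = 10, cnt = 5, cur;
 *      socklen_t len = sizeof(cur);
 *
 *      setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
 *      setsockopt(fd, SOL_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
 *      setsockopt(fd, SOL_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
 *      setsockopt(fd, SOL_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));
 *      getsockopt(fd, SOL_TCP, TCP_KEEPIDLE, &cur, &len);
 */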

extern void __skb_cb_too_small_for_tcp(int, int);

void __init tcp_init(void)
{
        struct sk_buff *skb = NULL;
        unsigned long goal;
        int order, i;

        if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
                __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
                                           sizeof(skb->cb));

        tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
                                               sizeof(struct open_request),
                                               0, SLAB_HWCACHE_ALIGN,
                                               NULL, NULL);
        if (!tcp_openreq_cachep)
                panic("tcp_init: Cannot alloc open_request cache.");

        tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
                                              sizeof(struct tcp_bind_bucket),
                                              0, SLAB_HWCACHE_ALIGN,
                                              NULL, NULL);
        if (!tcp_bucket_cachep)
                panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");

        tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
                                                sizeof(struct tcp_tw_bucket),
                                                0, SLAB_HWCACHE_ALIGN,
                                                NULL, NULL);
        if (!tcp_timewait_cachep)
                panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");

        /* Size and allocate the main established and bind bucket
         * hash tables.
         *
         * The methodology is similar to that of the buffer cache.
         */
        goal = num_physpages >> (23 - PAGE_SHIFT);

        for (order = 0; (1UL << order) < goal; order++)
                ;
        do {
                tcp_ehash_size = (1UL << order) * PAGE_SIZE /
                        sizeof(struct tcp_ehash_bucket);
                tcp_ehash_size >>= 1;
                while (tcp_ehash_size & (tcp_ehash_size-1))
                        tcp_ehash_size--;
                tcp_ehash = (struct tcp_ehash_bucket *)
                        __get_free_pages(GFP_ATOMIC, order);
        } while (tcp_ehash == NULL && --order > 0);

        if (!tcp_ehash)
                panic("Failed to allocate TCP established hash table\n");
        for (i = 0; i < (tcp_ehash_size<<1); i++) {
                tcp_ehash[i].lock = RW_LOCK_UNLOCKED;
                tcp_ehash[i].chain = NULL;
        }

        do {
                tcp_bhash_size = (1UL << order) * PAGE_SIZE /
                        sizeof(struct tcp_bind_hashbucket);
                if ((tcp_bhash_size > (64 * 1024)) && order > 0)
                        continue;
                tcp_bhash = (struct tcp_bind_hashbucket *)
                        __get_free_pages(GFP_ATOMIC, order);
        } while (tcp_bhash == NULL && --order >= 0);

        if (!tcp_bhash)
                panic("Failed to allocate TCP bind hash table\n");
        for (i = 0; i < tcp_bhash_size; i++) {
                tcp_bhash[i].lock = SPIN_LOCK_UNLOCKED;
                tcp_bhash[i].chain = NULL;
        }

        /* Try to be a bit smarter and adjust defaults depending
         * on available memory.
         */
        if (order > 4) {
                sysctl_local_port_range[0] = 32768;
                sysctl_local_port_range[1] = 61000;
                sysctl_tcp_max_tw_buckets = 180000;
                sysctl_tcp_max_orphans = 4096<<(order-4);
                sysctl_max_syn_backlog = 1024;
        } else if (order < 3) {
                sysctl_local_port_range[0] = 1024*(3-order);
                sysctl_tcp_max_tw_buckets >>= (3-order);
                sysctl_tcp_max_orphans >>= (3-order);
                sysctl_max_syn_backlog = 128;
        }
        tcp_port_rover = sysctl_local_port_range[0] - 1;

        printk("TCP: Hash tables configured (established %d bind %d)\n",
               tcp_ehash_size<<1, tcp_bhash_size);
}
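
/*
 * Sizing example for the tables above (hedged, assuming 4 KB pages and that
 * the allocations succeed on the first try): a 128 MB machine has
 * num_physpages = 32768, so goal = 32768 >> (23 - 12) = 16 pages and the loop
 * leaves order = 4.  The established hash then starts from a 16-page (64 KB)
 * allocation whose entry count is halved and rounded down to a power of two,
 * and since neither order > 4 nor order < 3 holds, the port range, TIME-WAIT
 * bucket and SYN backlog defaults are left untouched.
 */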