net/ipv4/tcp.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
22 * Fixes:
23 * Alan Cox : Numerous verify_area() calls
24 * Alan Cox : Set the ACK bit on a reset
25 * Alan Cox : Stopped it crashing if it closed while
26 * sk->inuse=1 and was trying to connect
27 * (tcp_err()).
28 * Alan Cox : All icmp error handling was broken
29 * pointers passed where wrong and the
30 * socket was looked up backwards. Nobody
31 * tested any icmp error code obviously.
32 * Alan Cox : tcp_err() now handled properly. It
33 * wakes people on errors. poll
34 * behaves and the icmp error race
35 * has gone by moving it into sock.c
36 * Alan Cox : tcp_send_reset() fixed to work for
37 * everything not just packets for
38 * unknown sockets.
39 * Alan Cox : tcp option processing.
40 * Alan Cox : Reset tweaked (still not 100%) [Had
41 * syn rule wrong]
42 * Herp Rosmanith : More reset fixes
43 * Alan Cox : No longer acks invalid rst frames.
44 * Acking any kind of RST is right out.
45 * Alan Cox : Sets an ignore me flag on an rst
46 * receive otherwise odd bits of prattle
47 * escape still
48 * Alan Cox : Fixed another acking RST frame bug.
49 * Should stop LAN workplace lockups.
50 * Alan Cox : Some tidyups using the new skb list
51 * facilities
52 * Alan Cox : sk->keepopen now seems to work
53 * Alan Cox : Pulls options out correctly on accepts
54 * Alan Cox : Fixed assorted sk->rqueue->next errors
55 * Alan Cox : PSH doesn't end a TCP read. Switched a
56 * bit to skb ops.
57 * Alan Cox : Tidied tcp_data to avoid a potential
58 * nasty.
59 * Alan Cox : Added some better commenting, as the
60 * tcp is hard to follow
61 * Alan Cox : Removed incorrect check for 20 * psh
62 * Michael O'Reilly : ack < copied bug fix.
63 * Johannes Stille : Misc tcp fixes (not all in yet).
64 * Alan Cox : FIN with no memory -> CRASH
65 * Alan Cox : Added socket option proto entries.
66 * Also added awareness of them to accept.
67 * Alan Cox : Added TCP options (SOL_TCP)
68 * Alan Cox : Switched wakeup calls to callbacks,
69 * so the kernel can layer network
70 * sockets.
71 * Alan Cox : Use ip_tos/ip_ttl settings.
72 * Alan Cox : Handle FIN (more) properly (we hope).
73 * Alan Cox : RST frames sent on unsynchronised
74 * state ack error.
75 * Alan Cox : Put in missing check for SYN bit.
76 * Alan Cox : Added tcp_select_window() aka NET2E
77 * window non shrink trick.
78 * Alan Cox : Added a couple of small NET2E timer
79 * fixes
80 * Charles Hedrick : TCP fixes
81 * Toomas Tamm : TCP window fixes
82 * Alan Cox : Small URG fix to rlogin ^C ack fight
83 * Charles Hedrick : Rewrote most of it to actually work
84 * Linus : Rewrote tcp_read() and URG handling
85 * completely
86 * Gerhard Koerting: Fixed some missing timer handling
87 * Matthew Dillon : Reworked TCP machine states as per RFC
88 * Gerhard Koerting: PC/TCP workarounds
89 * Adam Caldwell : Assorted timer/timing errors
90 * Matthew Dillon : Fixed another RST bug
91 * Alan Cox : Move to kernel side addressing changes.
92 * Alan Cox : Beginning work on TCP fastpathing
93 * (not yet usable)
94 * Arnt Gulbrandsen: Turbocharged tcp_check() routine.
95 * Alan Cox : TCP fast path debugging
96 * Alan Cox : Window clamping
97 * Michael Riepe : Bug in tcp_check()
98 * Matt Dillon : More TCP improvements and RST bug fixes
 99 * Matt Dillon : Yet more small nasties removed from the
100 * TCP code (Be very nice to this man if
101 * tcp finally works 100%) 8)
102 * Alan Cox : BSD accept semantics.
103 * Alan Cox : Reset on closedown bug.
104 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto().
105 * Michael Pall : Handle poll() after URG properly in
106 * all cases.
107 * Michael Pall : Undo the last fix in tcp_read_urg()
108 * (multi URG PUSH broke rlogin).
109 * Michael Pall : Fix the multi URG PUSH problem in
110 * tcp_readable(), poll() after URG
111 * works now.
112 * Michael Pall : recv(...,MSG_OOB) never blocks in the
113 * BSD api.
114 * Alan Cox : Changed the semantics of sk->socket to
115 * fix a race and a signal problem with
116 * accept() and async I/O.
117 * Alan Cox : Relaxed the rules on tcp_sendto().
118 * Yury Shevchuk : Really fixed accept() blocking problem.
119 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for
120 * clients/servers which listen in on
121 * fixed ports.
122 * Alan Cox : Cleaned the above up and shrank it to
123 * a sensible code size.
124 * Alan Cox : Self connect lockup fix.
125 * Alan Cox : No connect to multicast.
126 * Ross Biro : Close unaccepted children on master
127 * socket close.
128 * Alan Cox : Reset tracing code.
129 * Alan Cox : Spurious resets on shutdown.
130 * Alan Cox : Giant 15 minute/60 second timer error
131 * Alan Cox : Small whoops in polling before an
132 * accept.
133 * Alan Cox : Kept the state trace facility since
134 * it's handy for debugging.
135 * Alan Cox : More reset handler fixes.
136 * Alan Cox : Started rewriting the code based on
137 * the RFC's for other useful protocol
138 * references see: Comer, KA9Q NOS, and
139 * for a reference on the difference
140 * between specifications and how BSD
141 * works see the 4.4lite source.
142 * A.N.Kuznetsov : Don't time wait on completion of tidy
143 * close.
144 * Linus Torvalds : Fin/Shutdown & copied_seq changes.
145 * Linus Torvalds : Fixed BSD port reuse to work first syn
146 * Alan Cox : Reimplemented timers as per the RFC
147 * and using multiple timers for sanity.
148 * Alan Cox : Small bug fixes, and a lot of new
149 * comments.
150 * Alan Cox : Fixed dual reader crash by locking
151 * the buffers (much like datagram.c)
152 * Alan Cox : Fixed stuck sockets in probe. A probe
153 * now gets fed up of retrying without
154 * (even a no space) answer.
155 * Alan Cox : Extracted closing code better
156 * Alan Cox : Fixed the closing state machine to
157 * resemble the RFC.
158 * Alan Cox : More 'per spec' fixes.
159 * Jorge Cwik : Even faster checksumming.
160 * Alan Cox : tcp_data() doesn't ack illegal PSH
161 * only frames. At least one pc tcp stack
162 * generates them.
163 * Alan Cox : Cache last socket.
164 * Alan Cox : Per route irtt.
165 * Matt Day : poll()->select() match BSD precisely on error
166 * Alan Cox : New buffers
167 * Marc Tamsky : Various sk->prot->retransmits and
168 * sk->retransmits misupdating fixed.
169 * Fixed tcp_write_timeout: stuck close,
170 * and TCP syn retries gets used now.
171 * Mark Yarvis : In tcp_read_wakeup(), don't send an
172 * ack if state is TCP_CLOSED.
173 * Alan Cox : Look up device on a retransmit - routes may
174 * change. Doesn't yet cope with MSS shrink right
175 * but it's a start!
176 * Marc Tamsky : Closing in closing fixes.
177 * Mike Shaver : RFC1122 verifications.
178 * Alan Cox : rcv_saddr errors.
179 * Alan Cox : Block double connect().
180 * Alan Cox : Small hooks for enSKIP.
181 * Alexey Kuznetsov: Path MTU discovery.
182 * Alan Cox : Support soft errors.
183 * Alan Cox : Fix MTU discovery pathological case
184 * when the remote claims no mtu!
185 * Marc Tamsky : TCP_CLOSE fix.
186 * Colin (G3TNE) : Send a reset on syn ack replies in
187 * window but wrong (fixes NT lpd problems)
188 * Pedro Roque : Better TCP window handling, delayed ack.
189 * Joerg Reuter : No modification of locked buffers in
190 * tcp_do_retransmit()
191 * Eric Schenk : Changed receiver side silly window
192 * avoidance algorithm to BSD style
193 * algorithm. This doubles throughput
194 * against machines running Solaris,
195 * and seems to result in general
196 * improvement.
197 * Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD
198 * Willy Konynenberg : Transparent proxying support.
199 * Mike McLagan : Routing by source
200 * Keith Owens : Do proper merging with partial SKB's in
201 * tcp_do_sendmsg to avoid burstiness.
202 * Eric Schenk : Fix fast close down bug with
203 * shutdown() followed by close().
204 * Andi Kleen : Make poll agree with SIGIO
205 * Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and
206 * lingertime == 0 (RFC 793 ABORT Call)
207 * Hirokazu Takahashi : Use copy_from_user() instead of
208 * csum_and_copy_from_user() if possible.
210 * This program is free software; you can redistribute it and/or
211 * modify it under the terms of the GNU General Public License
212 * as published by the Free Software Foundation; either version
 213 * 2 of the License, or (at your option) any later version.
215 * Description of States:
217 * TCP_SYN_SENT sent a connection request, waiting for ack
219 * TCP_SYN_RECV received a connection request, sent ack,
220 * waiting for final ack in three-way handshake.
222 * TCP_ESTABLISHED connection established
224 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete
225 * transmission of remaining buffered data
227 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote
228 * to shutdown
230 * TCP_CLOSING both sides have shutdown but we still have
231 * data we have to finish sending
233 * TCP_TIME_WAIT timeout to catch resent junk before entering
234 * closed, can only be entered from FIN_WAIT2
235 * or CLOSING. Required because the other end
236 * may not have gotten our last ACK causing it
237 * to retransmit the data packet (which we ignore)
239 * TCP_CLOSE_WAIT remote side has shutdown and is waiting for
240 * us to finish writing our data and to shutdown
241 * (we have to close() to move on to LAST_ACK)
 243 * TCP_LAST_ACK our side has shutdown after remote has
244 * shutdown. There may still be data in our
245 * buffer that we have to finish sending
247 * TCP_CLOSE socket is finished
250 #include <linux/config.h>
251 #include <linux/types.h>
252 #include <linux/fcntl.h>
253 #include <linux/poll.h>
254 #include <linux/init.h>
255 #include <linux/smp_lock.h>
256 #include <linux/fs.h>
257 #include <linux/random.h>
259 #include <net/icmp.h>
260 #include <net/tcp.h>
261 #include <net/xfrm.h>
262 #include <net/ip.h>
265 #include <asm/uaccess.h>
266 #include <asm/ioctls.h>
268 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
270 DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
272 kmem_cache_t *tcp_openreq_cachep;
273 kmem_cache_t *tcp_bucket_cachep;
274 kmem_cache_t *tcp_timewait_cachep;
276 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
278 int sysctl_tcp_mem[3];
279 int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
280 int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
282 atomic_t tcp_memory_allocated; /* Current allocated memory. */
283 atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
285 /* Pressure flag: try to collapse.
286 * Technical note: it is used by multiple contexts non atomically.
287 * All the tcp_mem_schedule() is of this nature: accounting
288 * is strict, actions are advisory and have some latency. */
289 int tcp_memory_pressure;
291 #define TCP_PAGES(amt) (((amt) + TCP_MEM_QUANTUM - 1) / TCP_MEM_QUANTUM)
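/*
 * A minimal worked example of the accounting above, assuming
 * TCP_MEM_QUANTUM is one page (4096 bytes on many architectures):
 *
 *     charging an skb of size 1500 gives
 *         amt = TCP_PAGES(1500) = (1500 + 4095) / 4096 = 1 quantum,
 *     so tcp_memory_allocated grows by 1 and sk_forward_alloc by 4096;
 *     the unused 4096 - 1500 = 2596 bytes stay in sk_forward_alloc and
 *     can cover later small charges without touching the global counter.
 */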
293 int tcp_mem_schedule(struct sock *sk, int size, int kind)
295 int amt = TCP_PAGES(size);
297 sk->sk_forward_alloc += amt * TCP_MEM_QUANTUM;
298 atomic_add(amt, &tcp_memory_allocated);
300 /* Under limit. */
301 if (atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
302 if (tcp_memory_pressure)
303 tcp_memory_pressure = 0;
304 return 1;
307 /* Over hard limit. */
308 if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2]) {
309 tcp_enter_memory_pressure();
310 goto suppress_allocation;
313 /* Under pressure. */
314 if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[1])
315 tcp_enter_memory_pressure();
317 if (kind) {
318 if (atomic_read(&sk->sk_rmem_alloc) < sysctl_tcp_rmem[0])
319 return 1;
320 } else if (sk->sk_wmem_queued < sysctl_tcp_wmem[0])
321 return 1;
323 if (!tcp_memory_pressure ||
324 sysctl_tcp_mem[2] > atomic_read(&tcp_sockets_allocated) *
325 TCP_PAGES(sk->sk_wmem_queued +
326 atomic_read(&sk->sk_rmem_alloc) +
327 sk->sk_forward_alloc))
328 return 1;
330 suppress_allocation:
332 if (!kind) {
333 tcp_moderate_sndbuf(sk);
335 /* Fail only if socket is _under_ its sndbuf.
336 * In this case we cannot block, so that we have to fail.
338 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
339 return 1;
342 /* Alas. Undo changes. */
343 sk->sk_forward_alloc -= amt * TCP_MEM_QUANTUM;
344 atomic_sub(amt, &tcp_memory_allocated);
345 return 0;
348 void __tcp_mem_reclaim(struct sock *sk)
350 if (sk->sk_forward_alloc >= TCP_MEM_QUANTUM) {
351 atomic_sub(sk->sk_forward_alloc / TCP_MEM_QUANTUM,
352 &tcp_memory_allocated);
353 sk->sk_forward_alloc &= TCP_MEM_QUANTUM - 1;
354 if (tcp_memory_pressure &&
355 atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0])
356 tcp_memory_pressure = 0;
360 void tcp_rfree(struct sk_buff *skb)
362 struct sock *sk = skb->sk;
364 atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
365 sk->sk_forward_alloc += skb->truesize;
369 * LISTEN is a special case for poll..
371 static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
372 poll_table *wait)
374 return tcp_sk(sk)->accept_queue ? (POLLIN | POLLRDNORM) : 0;
378 * Wait for a TCP event.
380 * Note that we don't need to lock the socket, as the upper poll layers
381 * take care of normal races (between the test and the event) and we don't
382 * go look at any of the socket buffers directly.
384 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
386 unsigned int mask;
387 struct sock *sk = sock->sk;
388 struct tcp_opt *tp = tcp_sk(sk);
390 poll_wait(file, sk->sk_sleep, wait);
391 if (sk->sk_state == TCP_LISTEN)
392 return tcp_listen_poll(sk, wait);
394 /* Socket is not locked. We are protected from async events
395 by poll logic and correct handling of state changes
 396 made by other threads is impossible in any case.
399 mask = 0;
400 if (sk->sk_err)
401 mask = POLLERR;
404 * POLLHUP is certainly not done right. But poll() doesn't
405 * have a notion of HUP in just one direction, and for a
406 * socket the read side is more interesting.
408 * Some poll() documentation says that POLLHUP is incompatible
409 * with the POLLOUT/POLLWR flags, so somebody should check this
410 * all. But careful, it tends to be safer to return too many
411 * bits than too few, and you can easily break real applications
412 * if you don't tell them that something has hung up!
414 * Check-me.
 416 * Check number 1. POLLHUP is an _UNMASKABLE_ event (see UNIX98 and
 417 * our fs/select.c). It means that after we received EOF,
 418 * poll always returns immediately, making poll() on write() impossible
419 * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
420 * if and only if shutdown has been made in both directions.
421 * Actually, it is interesting to look how Solaris and DUX
 422 * solve this dilemma. I would prefer, if POLLHUP were maskable,
423 * then we could set it on SND_SHUTDOWN. BTW examples given
424 * in Stevens' books assume exactly this behaviour, it explains
 425 * why POLLHUP is incompatible with POLLOUT. --ANK
427 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
428 * blocking on fresh not-connected or disconnected socket. --ANK
430 if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
431 mask |= POLLHUP;
432 if (sk->sk_shutdown & RCV_SHUTDOWN)
433 mask |= POLLIN | POLLRDNORM;
435 /* Connected? */
436 if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
 437 /* Potential race condition. If the read of tp below
 438 * escapes above sk->sk_state, we can be illegally awakened
439 * in SYN_* states. */
440 if ((tp->rcv_nxt != tp->copied_seq) &&
441 (tp->urg_seq != tp->copied_seq ||
442 tp->rcv_nxt != tp->copied_seq + 1 ||
443 sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
444 mask |= POLLIN | POLLRDNORM;
446 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
447 if (tcp_wspace(sk) >= tcp_min_write_space(sk)) {
448 mask |= POLLOUT | POLLWRNORM;
449 } else { /* send SIGIO later */
450 set_bit(SOCK_ASYNC_NOSPACE,
451 &sk->sk_socket->flags);
452 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
454 /* Race breaker. If space is freed after
455 * wspace test but before the flags are set,
456 * IO signal will be lost.
458 if (tcp_wspace(sk) >= tcp_min_write_space(sk))
459 mask |= POLLOUT | POLLWRNORM;
463 if (tp->urg_data & TCP_URG_VALID)
464 mask |= POLLPRI;
466 return mask;
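/*
 * A small userspace sketch (illustrative only, not part of this file)
 * of how the mask computed above is typically consumed; `tcp_fd` is an
 * assumed, already-connected TCP socket and the handlers are placeholders:
 *
 *     struct pollfd pfd = { .fd = tcp_fd, .events = POLLIN | POLLOUT | POLLPRI };
 *     if (poll(&pfd, 1, -1) > 0) {
 *             if (pfd.revents & POLLPRI)   // urgent data pending (TCP_URG_VALID)
 *                     handle_oob(tcp_fd);
 *             if (pfd.revents & POLLIN)    // data to read, or RCV_SHUTDOWN seen
 *                     drain(tcp_fd);
 *             if (pfd.revents & POLLHUP)   // both directions shut down / TCP_CLOSE
 *                     teardown(tcp_fd);
 *     }
 */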
470 * TCP socket write_space callback.
472 void tcp_write_space(struct sock *sk)
474 struct socket *sock = sk->sk_socket;
476 if (tcp_wspace(sk) >= tcp_min_write_space(sk) && sock) {
477 clear_bit(SOCK_NOSPACE, &sock->flags);
479 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
480 wake_up_interruptible(sk->sk_sleep);
482 if (sock->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
483 sock_wake_async(sock, 2, POLL_OUT);
487 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
489 struct tcp_opt *tp = tcp_sk(sk);
490 int answ;
492 switch (cmd) {
493 case SIOCINQ:
494 if (sk->sk_state == TCP_LISTEN)
495 return -EINVAL;
497 lock_sock(sk);
498 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
499 answ = 0;
500 else if (sock_flag(sk, SOCK_URGINLINE) ||
501 !tp->urg_data ||
502 before(tp->urg_seq, tp->copied_seq) ||
503 !before(tp->urg_seq, tp->rcv_nxt)) {
504 answ = tp->rcv_nxt - tp->copied_seq;
506 /* Subtract 1, if FIN is in queue. */
507 if (answ && !skb_queue_empty(&sk->sk_receive_queue))
508 answ -=
509 ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
510 } else
511 answ = tp->urg_seq - tp->copied_seq;
512 release_sock(sk);
513 break;
514 case SIOCATMARK:
515 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
516 break;
517 case SIOCOUTQ:
518 if (sk->sk_state == TCP_LISTEN)
519 return -EINVAL;
521 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
522 answ = 0;
523 else
524 answ = tp->write_seq - tp->snd_una;
525 break;
526 default:
527 return -ENOIOCTLCMD;
530 return put_user(answ, (int *)arg);
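/*
 * Userspace view of the three ioctls handled above (a sketch under the
 * assumption of a connected TCP socket `tcp_fd`; not part of this file):
 *
 *     int unread, unsent, at_mark;
 *     ioctl(tcp_fd, SIOCINQ,   &unread);    // rcv_nxt - copied_seq, minus a queued FIN
 *     ioctl(tcp_fd, SIOCOUTQ,  &unsent);    // write_seq - snd_una
 *     ioctl(tcp_fd, SIOCATMARK, &at_mark);  // non-zero when the read pointer is at the urgent mark
 */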
534 int tcp_listen_start(struct sock *sk)
536 struct inet_opt *inet = inet_sk(sk);
537 struct tcp_opt *tp = tcp_sk(sk);
538 struct tcp_listen_opt *lopt;
540 sk->sk_max_ack_backlog = 0;
541 sk->sk_ack_backlog = 0;
542 tp->accept_queue = tp->accept_queue_tail = NULL;
543 tp->syn_wait_lock = RW_LOCK_UNLOCKED;
544 tcp_delack_init(tp);
546 lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
547 if (!lopt)
548 return -ENOMEM;
550 memset(lopt, 0, sizeof(struct tcp_listen_opt));
551 for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
552 if ((1 << lopt->max_qlen_log) >= sysctl_max_syn_backlog)
553 break;
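	/*
	 * Example of the loop above: with a hypothetical sysctl_max_syn_backlog
	 * of 256, the loop stops at max_qlen_log = 8 (1 << 8 = 256), i.e. the
	 * SYN queue is sized to the next power of two >= the sysctl, with a
	 * floor of 1 << 6 = 64 entries.
	 */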
554 get_random_bytes(&lopt->hash_rnd, 4);
556 write_lock_bh(&tp->syn_wait_lock);
557 tp->listen_opt = lopt;
558 write_unlock_bh(&tp->syn_wait_lock);
 560 /* There is a race window here: we announce ourselves listening,
 561 * but this transition is still not validated by get_port().
 562 * It is OK, because this socket enters the hash table only
563 * after validation is complete.
565 sk->sk_state = TCP_LISTEN;
566 if (!sk->sk_prot->get_port(sk, inet->num)) {
567 inet->sport = htons(inet->num);
569 sk_dst_reset(sk);
570 sk->sk_prot->hash(sk);
572 return 0;
575 sk->sk_state = TCP_CLOSE;
576 write_lock_bh(&tp->syn_wait_lock);
577 tp->listen_opt = NULL;
578 write_unlock_bh(&tp->syn_wait_lock);
579 kfree(lopt);
580 return -EADDRINUSE;
584 * This routine closes sockets which have been at least partially
585 * opened, but not yet accepted.
588 static void tcp_listen_stop (struct sock *sk)
590 struct tcp_opt *tp = tcp_sk(sk);
591 struct tcp_listen_opt *lopt = tp->listen_opt;
592 struct open_request *acc_req = tp->accept_queue;
593 struct open_request *req;
594 int i;
596 tcp_delete_keepalive_timer(sk);
598 /* make all the listen_opt local to us */
599 write_lock_bh(&tp->syn_wait_lock);
600 tp->listen_opt = NULL;
601 write_unlock_bh(&tp->syn_wait_lock);
602 tp->accept_queue = tp->accept_queue_tail = NULL;
604 if (lopt->qlen) {
605 for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
606 while ((req = lopt->syn_table[i]) != NULL) {
607 lopt->syn_table[i] = req->dl_next;
608 lopt->qlen--;
609 tcp_openreq_free(req);
611 /* Following specs, it would be better either to send FIN
612 * (and enter FIN-WAIT-1, it is normal close)
613 * or to send active reset (abort).
 614 * Certainly, it is pretty dangerous during a synflood, but it is
 615 * a bad justification for our negligence 8)
616 * To be honest, we are not able to make either
617 * of the variants now. --ANK
622 BUG_TRAP(!lopt->qlen);
624 kfree(lopt);
626 while ((req = acc_req) != NULL) {
627 struct sock *child = req->sk;
629 acc_req = req->dl_next;
631 local_bh_disable();
632 bh_lock_sock(child);
633 BUG_TRAP(!sock_owned_by_user(child));
634 sock_hold(child);
636 tcp_disconnect(child, O_NONBLOCK);
638 sock_orphan(child);
640 atomic_inc(&tcp_orphan_count);
642 tcp_destroy_sock(child);
644 bh_unlock_sock(child);
645 local_bh_enable();
646 sock_put(child);
648 tcp_acceptq_removed(sk);
649 tcp_openreq_fastfree(req);
651 BUG_TRAP(!sk->sk_ack_backlog);
655 * Wait for a socket to get into the connected state
657 * Note: Must be called with the socket locked.
659 static int wait_for_tcp_connect(struct sock *sk, int flags, long *timeo_p)
661 struct tcp_opt *tp = tcp_sk(sk);
662 struct task_struct *tsk = current;
663 DEFINE_WAIT(wait);
665 while ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
666 if (sk->sk_err)
667 return sock_error(sk);
668 if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV))
669 return -EPIPE;
670 if (!*timeo_p)
671 return -EAGAIN;
672 if (signal_pending(tsk))
673 return sock_intr_errno(*timeo_p);
675 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
676 tp->write_pending++;
678 release_sock(sk);
679 *timeo_p = schedule_timeout(*timeo_p);
680 lock_sock(sk);
682 finish_wait(sk->sk_sleep, &wait);
683 tp->write_pending--;
685 return 0;
688 static inline int tcp_memory_free(struct sock *sk)
690 return sk->sk_wmem_queued < sk->sk_sndbuf;
694 * Wait for more memory for a socket
696 static int wait_for_tcp_memory(struct sock *sk, long *timeo)
698 struct tcp_opt *tp = tcp_sk(sk);
699 int err = 0;
700 long vm_wait = 0;
701 long current_timeo = *timeo;
702 DEFINE_WAIT(wait);
704 if (tcp_memory_free(sk))
705 current_timeo = vm_wait = (net_random() % (HZ / 5)) + 2;
707 for (;;) {
708 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
710 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
712 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
713 goto do_error;
714 if (!*timeo)
715 goto do_nonblock;
716 if (signal_pending(current))
717 goto do_interrupted;
718 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
719 if (tcp_memory_free(sk) && !vm_wait)
720 break;
722 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
723 tp->write_pending++;
724 release_sock(sk);
725 if (!tcp_memory_free(sk) || vm_wait)
726 current_timeo = schedule_timeout(current_timeo);
727 lock_sock(sk);
728 tp->write_pending--;
730 if (vm_wait) {
731 vm_wait -= current_timeo;
732 current_timeo = *timeo;
733 if (current_timeo != MAX_SCHEDULE_TIMEOUT &&
734 (current_timeo -= vm_wait) < 0)
735 current_timeo = 0;
736 vm_wait = 0;
738 *timeo = current_timeo;
740 out:
741 finish_wait(sk->sk_sleep, &wait);
742 return err;
744 do_error:
745 err = -EPIPE;
746 goto out;
747 do_nonblock:
748 err = -EAGAIN;
749 goto out;
750 do_interrupted:
751 err = sock_intr_errno(*timeo);
752 goto out;
755 ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
756 size_t psize, int flags);
758 static inline int can_coalesce(struct sk_buff *skb, int i, struct page *page,
759 int off)
761 if (i) {
762 skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];
763 return page == frag->page &&
764 off == frag->page_offset + frag->size;
766 return 0;
769 static inline void fill_page_desc(struct sk_buff *skb, int i,
770 struct page *page, int off, int size)
772 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
773 frag->page = page;
774 frag->page_offset = off;
775 frag->size = size;
776 skb_shinfo(skb)->nr_frags = i + 1;
779 static inline void tcp_mark_push(struct tcp_opt *tp, struct sk_buff *skb)
781 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
782 tp->pushed_seq = tp->write_seq;
785 static inline int forced_push(struct tcp_opt *tp)
787 return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
790 static inline void skb_entail(struct sock *sk, struct tcp_opt *tp,
791 struct sk_buff *skb)
793 skb->csum = 0;
794 TCP_SKB_CB(skb)->seq = tp->write_seq;
795 TCP_SKB_CB(skb)->end_seq = tp->write_seq;
796 TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
797 TCP_SKB_CB(skb)->sacked = 0;
798 __skb_queue_tail(&sk->sk_write_queue, skb);
799 tcp_charge_skb(sk, skb);
800 if (!tp->send_head)
801 tp->send_head = skb;
802 else if (tp->nonagle&TCP_NAGLE_PUSH)
803 tp->nonagle &= ~TCP_NAGLE_PUSH;
806 static inline void tcp_mark_urg(struct tcp_opt *tp, int flags,
807 struct sk_buff *skb)
809 if (flags & MSG_OOB) {
810 tp->urg_mode = 1;
811 tp->snd_up = tp->write_seq;
812 TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
816 static inline void tcp_push(struct sock *sk, struct tcp_opt *tp, int flags,
817 int mss_now, int nonagle)
819 if (tp->send_head) {
820 struct sk_buff *skb = sk->sk_write_queue.prev;
821 if (!(flags & MSG_MORE) || forced_push(tp))
822 tcp_mark_push(tp, skb);
823 tcp_mark_urg(tp, flags, skb);
824 __tcp_push_pending_frames(sk, tp, mss_now,
825 (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
829 static int tcp_error(struct sock *sk, int flags, int err)
831 if (err == -EPIPE)
832 err = sock_error(sk) ? : -EPIPE;
833 if (err == -EPIPE && !(flags & MSG_NOSIGNAL))
834 send_sig(SIGPIPE, current, 0);
835 return err;
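/*
 * From userspace, the SIGPIPE raised above can be suppressed per call;
 * a sketch (assumes a connected socket `tcp_fd` whose peer has gone away):
 *
 *     ssize_t n = send(tcp_fd, buf, len, MSG_NOSIGNAL);
 *     if (n < 0 && errno == EPIPE)
 *             ;  // handle the broken pipe without being signalled
 */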
838 ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
839 size_t psize, int flags)
841 struct tcp_opt *tp = tcp_sk(sk);
842 int mss_now;
843 int err;
844 ssize_t copied;
845 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
847 /* Wait for a connection to finish. */
848 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
849 if ((err = wait_for_tcp_connect(sk, 0, &timeo)) != 0)
850 goto out_err;
852 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
854 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
855 copied = 0;
857 err = -EPIPE;
858 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
859 goto do_error;
861 while (psize > 0) {
862 struct sk_buff *skb = sk->sk_write_queue.prev;
863 struct page *page = pages[poffset / PAGE_SIZE];
864 int copy, i;
865 int offset = poffset % PAGE_SIZE;
866 int size = min_t(size_t, psize, PAGE_SIZE - offset);
868 if (!tp->send_head || (copy = mss_now - skb->len) <= 0) {
869 new_segment:
870 if (!tcp_memory_free(sk))
871 goto wait_for_sndbuf;
873 skb = tcp_alloc_pskb(sk, 0, tp->mss_cache,
874 sk->sk_allocation);
875 if (!skb)
876 goto wait_for_memory;
878 skb_entail(sk, tp, skb);
879 copy = mss_now;
882 if (copy > size)
883 copy = size;
885 i = skb_shinfo(skb)->nr_frags;
886 if (can_coalesce(skb, i, page, offset)) {
887 skb_shinfo(skb)->frags[i - 1].size += copy;
888 } else if (i < MAX_SKB_FRAGS) {
889 get_page(page);
890 fill_page_desc(skb, i, page, offset, copy);
891 } else {
892 tcp_mark_push(tp, skb);
893 goto new_segment;
896 skb->len += copy;
897 skb->data_len += copy;
898 skb->ip_summed = CHECKSUM_HW;
899 tp->write_seq += copy;
900 TCP_SKB_CB(skb)->end_seq += copy;
902 if (!copied)
903 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
905 copied += copy;
906 poffset += copy;
907 if (!(psize -= copy))
908 goto out;
910 if (skb->len != mss_now || (flags & MSG_OOB))
911 continue;
913 if (forced_push(tp)) {
914 tcp_mark_push(tp, skb);
915 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
916 } else if (skb == tp->send_head)
917 tcp_push_one(sk, mss_now);
918 continue;
920 wait_for_sndbuf:
921 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
922 wait_for_memory:
923 if (copied)
924 tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
926 if ((err = wait_for_tcp_memory(sk, &timeo)) != 0)
927 goto do_error;
929 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
932 out:
933 if (copied)
934 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
935 return copied;
937 do_error:
938 if (copied)
939 goto out;
940 out_err:
941 return tcp_error(sk, flags, err);
944 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
945 size_t size, int flags)
947 ssize_t res;
948 struct sock *sk = sock->sk;
950 #define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
952 if (!(sk->sk_route_caps & NETIF_F_SG) ||
953 !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
954 return sock_no_sendpage(sock, page, offset, size, flags);
956 #undef TCP_ZC_CSUM_FLAGS
958 lock_sock(sk);
959 TCP_CHECK_TIMER(sk);
960 res = do_tcp_sendpages(sk, &page, offset, size, flags);
961 TCP_CHECK_TIMER(sk);
962 release_sock(sk);
963 return res;
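/*
 * One common way to reach do_tcp_sendpages() is sendfile(2) on a TCP
 * socket; a userspace sketch (illustrative, assumes an open `file_fd`
 * and a connected `tcp_fd`):
 *
 *     off_t off = 0;
 *     ssize_t sent = sendfile(tcp_fd, file_fd, &off, count);
 *
 * When the route lacks SG or hardware checksumming, tcp_sendpage() above
 * falls back to sock_no_sendpage(), which copies instead of mapping the
 * page into skb fragments.
 */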
966 #define TCP_PAGE(sk) (inet_sk(sk)->sndmsg_page)
967 #define TCP_OFF(sk) (inet_sk(sk)->sndmsg_off)
969 static inline int tcp_copy_to_page(struct sock *sk, char *from,
970 struct sk_buff *skb, struct page *page,
971 int off, int copy)
973 int err = 0;
974 unsigned int csum;
976 if (skb->ip_summed == CHECKSUM_NONE) {
977 csum = csum_and_copy_from_user(from, page_address(page) + off,
978 copy, 0, &err);
979 if (err) return err;
980 skb->csum = csum_block_add(skb->csum, csum, skb->len);
981 } else {
982 if (copy_from_user(page_address(page) + off, from, copy))
983 return -EFAULT;
986 skb->len += copy;
987 skb->data_len += copy;
988 skb->truesize += copy;
989 sk->sk_wmem_queued += copy;
990 sk->sk_forward_alloc -= copy;
991 return 0;
994 static inline int skb_add_data(struct sk_buff *skb, char *from, int copy)
996 int err = 0;
997 unsigned int csum;
998 int off = skb->len;
1000 if (skb->ip_summed == CHECKSUM_NONE) {
1001 csum = csum_and_copy_from_user(from, skb_put(skb, copy),
1002 copy, 0, &err);
1003 if (!err) {
1004 skb->csum = csum_block_add(skb->csum, csum, off);
1005 return 0;
1007 } else {
1008 if (!copy_from_user(skb_put(skb, copy), from, copy))
1009 return 0;
1012 __skb_trim(skb, off);
1013 return -EFAULT;
1016 static inline int select_size(struct sock *sk, struct tcp_opt *tp)
1018 int tmp = tp->mss_cache_std;
1020 if (sk->sk_route_caps & NETIF_F_SG) {
1021 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
1023 if (tmp >= pgbreak &&
1024 tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
1025 tmp = pgbreak;
1027 return tmp;
1030 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1031 int size)
1033 struct iovec *iov;
1034 struct tcp_opt *tp = tcp_sk(sk);
1035 struct sk_buff *skb;
1036 int iovlen, flags;
1037 int mss_now;
1038 int err, copied;
1039 long timeo;
1041 lock_sock(sk);
1042 TCP_CHECK_TIMER(sk);
1044 flags = msg->msg_flags;
1045 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
1047 /* Wait for a connection to finish. */
1048 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
1049 if ((err = wait_for_tcp_connect(sk, flags, &timeo)) != 0)
1050 goto out_err;
1052 /* This should be in poll */
1053 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1055 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
1057 /* Ok commence sending. */
1058 iovlen = msg->msg_iovlen;
1059 iov = msg->msg_iov;
1060 copied = 0;
1062 err = -EPIPE;
1063 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
1064 goto do_error;
1066 while (--iovlen >= 0) {
1067 int seglen = iov->iov_len;
1068 unsigned char *from = iov->iov_base;
1070 iov++;
1072 while (seglen > 0) {
1073 int copy;
1075 skb = sk->sk_write_queue.prev;
1077 if (!tp->send_head ||
1078 (copy = mss_now - skb->len) <= 0) {
1080 new_segment:
 1081 /* Allocate a new segment. If the interface is SG,
 1082 * allocate an skb that fits in a single page.
1084 if (!tcp_memory_free(sk))
1085 goto wait_for_sndbuf;
1087 skb = tcp_alloc_pskb(sk, select_size(sk, tp),
1088 0, sk->sk_allocation);
1089 if (!skb)
1090 goto wait_for_memory;
1093 * Check whether we can use HW checksum.
1095 if (sk->sk_route_caps &
1096 (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
1097 NETIF_F_HW_CSUM))
1098 skb->ip_summed = CHECKSUM_HW;
1100 skb_entail(sk, tp, skb);
1101 copy = mss_now;
1104 /* Try to append data to the end of skb. */
1105 if (copy > seglen)
1106 copy = seglen;
1108 /* Where to copy to? */
1109 if (skb_tailroom(skb) > 0) {
1110 /* We have some space in skb head. Superb! */
1111 if (copy > skb_tailroom(skb))
1112 copy = skb_tailroom(skb);
1113 if ((err = skb_add_data(skb, from, copy)) != 0)
1114 goto do_fault;
1115 } else {
1116 int merge = 0;
1117 int i = skb_shinfo(skb)->nr_frags;
1118 struct page *page = TCP_PAGE(sk);
1119 int off = TCP_OFF(sk);
1121 if (can_coalesce(skb, i, page, off) &&
1122 off != PAGE_SIZE) {
1123 /* We can extend the last page
1124 * fragment. */
1125 merge = 1;
1126 } else if (i == MAX_SKB_FRAGS ||
1127 (!i &&
1128 !(sk->sk_route_caps & NETIF_F_SG))) {
1129 /* Need to add new fragment and cannot
1130 * do this because interface is non-SG,
1131 * or because all the page slots are
1132 * busy. */
1133 tcp_mark_push(tp, skb);
1134 goto new_segment;
1135 } else if (page) {
1136 /* If page is cached, align
1137 * offset to L1 cache boundary
1139 off = (off + L1_CACHE_BYTES - 1) &
1140 ~(L1_CACHE_BYTES - 1);
1141 if (off == PAGE_SIZE) {
1142 put_page(page);
1143 TCP_PAGE(sk) = page = NULL;
1147 if (!page) {
1148 /* Allocate new cache page. */
1149 if (!(page = tcp_alloc_page(sk)))
1150 goto wait_for_memory;
1151 off = 0;
1154 if (copy > PAGE_SIZE - off)
1155 copy = PAGE_SIZE - off;
1157 /* Time to copy data. We are close to
1158 * the end! */
1159 err = tcp_copy_to_page(sk, from, skb, page,
1160 off, copy);
1161 if (err) {
1162 /* If this page was new, give it to the
1163 * socket so it does not get leaked.
1165 if (!TCP_PAGE(sk)) {
1166 TCP_PAGE(sk) = page;
1167 TCP_OFF(sk) = 0;
1169 goto do_error;
1172 /* Update the skb. */
1173 if (merge) {
1174 skb_shinfo(skb)->frags[i - 1].size +=
1175 copy;
1176 } else {
1177 fill_page_desc(skb, i, page, off, copy);
1178 if (TCP_PAGE(sk)) {
1179 get_page(page);
1180 } else if (off + copy < PAGE_SIZE) {
1181 get_page(page);
1182 TCP_PAGE(sk) = page;
1186 TCP_OFF(sk) = off + copy;
1189 if (!copied)
1190 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
1192 tp->write_seq += copy;
1193 TCP_SKB_CB(skb)->end_seq += copy;
1195 from += copy;
1196 copied += copy;
1197 if ((seglen -= copy) == 0 && iovlen == 0)
1198 goto out;
1200 if (skb->len != mss_now || (flags & MSG_OOB))
1201 continue;
1203 if (forced_push(tp)) {
1204 tcp_mark_push(tp, skb);
1205 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
1206 } else if (skb == tp->send_head)
1207 tcp_push_one(sk, mss_now);
1208 continue;
1210 wait_for_sndbuf:
1211 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1212 wait_for_memory:
1213 if (copied)
1214 tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
1216 if ((err = wait_for_tcp_memory(sk, &timeo)) != 0)
1217 goto do_error;
1219 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
1223 out:
1224 if (copied)
1225 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
1226 TCP_CHECK_TIMER(sk);
1227 release_sock(sk);
1228 return copied;
1230 do_fault:
1231 if (!skb->len) {
1232 if (tp->send_head == skb)
1233 tp->send_head = NULL;
1234 __skb_unlink(skb, skb->list);
1235 tcp_free_skb(sk, skb);
1238 do_error:
1239 if (copied)
1240 goto out;
1241 out_err:
1242 err = tcp_error(sk, flags, err);
1243 TCP_CHECK_TIMER(sk);
1244 release_sock(sk);
1245 return err;
1249 * Handle reading urgent data. BSD has very simple semantics for
1250 * this, no blocking and very strange errors 8)
1253 static int tcp_recv_urg(struct sock *sk, long timeo,
1254 struct msghdr *msg, int len, int flags,
1255 int *addr_len)
1257 struct tcp_opt *tp = tcp_sk(sk);
1259 /* No URG data to read. */
1260 if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1261 tp->urg_data == TCP_URG_READ)
1262 return -EINVAL; /* Yes this is right ! */
1264 if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1265 return -ENOTCONN;
1267 if (tp->urg_data & TCP_URG_VALID) {
1268 int err = 0;
1269 char c = tp->urg_data;
1271 if (!(flags & MSG_PEEK))
1272 tp->urg_data = TCP_URG_READ;
1274 /* Read urgent data. */
1275 msg->msg_flags |= MSG_OOB;
1277 if (len > 0) {
1278 if (!(flags & MSG_TRUNC))
1279 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1280 len = 1;
1281 } else
1282 msg->msg_flags |= MSG_TRUNC;
1284 return err ? -EFAULT : len;
1287 if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1288 return 0;
1290 /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
1291 * the available implementations agree in this case:
1292 * this call should never block, independent of the
1293 * blocking state of the socket.
1294 * Mike <pall@rz.uni-karlsruhe.de>
1296 return -EAGAIN;
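/*
 * Userspace counterpart of the function above (a sketch, assuming a
 * connected TCP socket `tcp_fd` with SO_OOBINLINE left off):
 *
 *     char c;
 *     ssize_t n = recv(tcp_fd, &c, 1, MSG_OOB);
 *     // n == 1 : the single urgent byte was returned
 *     // EINVAL : no urgent data, it was already consumed, or SO_OOBINLINE is on
 *     // EAGAIN : the urgent pointer is known but the byte has not arrived (never blocks)
 */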
1300 * Release a skb if it is no longer needed. This routine
1301 * must be called with interrupts disabled or with the
1302 * socket locked so that the sk_buff queue operation is ok.
1305 static inline void tcp_eat_skb(struct sock *sk, struct sk_buff *skb)
1307 __skb_unlink(skb, &sk->sk_receive_queue);
1308 __kfree_skb(skb);
1311 /* Clean up the receive buffer for full frames taken by the user,
1312 * then send an ACK if necessary. COPIED is the number of bytes
1313 * tcp_recvmsg has given to the user so far, it speeds up the
1314 * calculation of whether or not we must ACK for the sake of
1315 * a window update.
1317 static void cleanup_rbuf(struct sock *sk, int copied)
1319 struct tcp_opt *tp = tcp_sk(sk);
1320 int time_to_ack = 0;
1322 #if TCP_DEBUG
1323 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1325 BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1326 #endif
1328 if (tcp_ack_scheduled(tp)) {
1329 /* Delayed ACKs frequently hit locked sockets during bulk
1330 * receive. */
1331 if (tp->ack.blocked ||
1332 /* Once-per-two-segments ACK was not sent by tcp_input.c */
1333 tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
 1335 * If this read emptied the read buffer, we send an ACK if
 1336 * the connection is not bidirectional, the user drained the
 1337 * receive buffer, and there was a small segment
 1338 * in the queue.
1340 (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
1341 !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
1342 time_to_ack = 1;
1345 /* We send an ACK if we can now advertise a non-zero window
1346 * which has been raised "significantly".
1348 * Even if window raised up to infinity, do not send window open ACK
1349 * in states, where we will not receive more. It is useless.
1351 if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1352 __u32 rcv_window_now = tcp_receive_window(tp);
1354 /* Optimize, __tcp_select_window() is not cheap. */
1355 if (2*rcv_window_now <= tp->window_clamp) {
1356 __u32 new_window = __tcp_select_window(sk);
1358 /* Send ACK now, if this read freed lots of space
1359 * in our buffer. Certainly, new_window is new window.
1360 * We can advertise it now, if it is not less than current one.
1361 * "Lots" means "at least twice" here.
1363 if (new_window && new_window >= 2 * rcv_window_now)
1364 time_to_ack = 1;
1367 if (time_to_ack)
1368 tcp_send_ack(sk);
1371 /* Now socket state including sk->sk_err is changed only under lock,
1372 * hence we may omit checks after joining wait queue.
1373 * We check receive queue before schedule() only as optimization;
1374 * it is very likely that release_sock() added new data.
1377 static long tcp_data_wait(struct sock *sk, long timeo)
1379 DEFINE_WAIT(wait);
1381 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1383 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1384 release_sock(sk);
1386 if (skb_queue_empty(&sk->sk_receive_queue))
1387 timeo = schedule_timeout(timeo);
1389 lock_sock(sk);
1390 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1392 finish_wait(sk->sk_sleep, &wait);
1393 return timeo;
1396 static void tcp_prequeue_process(struct sock *sk)
1398 struct sk_buff *skb;
1399 struct tcp_opt *tp = tcp_sk(sk);
1401 NET_ADD_STATS_USER(TCPPrequeued, skb_queue_len(&tp->ucopy.prequeue));
1403 /* RX process wants to run with disabled BHs, though it is not
1404 * necessary */
1405 local_bh_disable();
1406 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1407 sk->sk_backlog_rcv(sk, skb);
1408 local_bh_enable();
1410 /* Clear memory counter. */
1411 tp->ucopy.memory = 0;
1414 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1416 struct sk_buff *skb;
1417 u32 offset;
1419 skb_queue_walk(&sk->sk_receive_queue, skb) {
1420 offset = seq - TCP_SKB_CB(skb)->seq;
1421 if (skb->h.th->syn)
1422 offset--;
1423 if (offset < skb->len || skb->h.th->fin) {
1424 *off = offset;
1425 return skb;
1428 return NULL;
1432 * This routine provides an alternative to tcp_recvmsg() for routines
1433 * that would like to handle copying from skbuffs directly in 'sendfile'
1434 * fashion.
1435 * Note:
1436 * - It is assumed that the socket was locked by the caller.
1437 * - The routine does not block.
1438 * - At present, there is no support for reading OOB data
1439 * or for 'peeking' the socket using this routine
1440 * (although both would be easy to implement).
1442 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1443 sk_read_actor_t recv_actor)
1445 struct sk_buff *skb;
1446 struct tcp_opt *tp = tcp_sk(sk);
1447 u32 seq = tp->copied_seq;
1448 u32 offset;
1449 int copied = 0;
1451 if (sk->sk_state == TCP_LISTEN)
1452 return -ENOTCONN;
1453 while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1454 if (offset < skb->len) {
1455 size_t used, len;
1457 len = skb->len - offset;
1458 /* Stop reading if we hit a patch of urgent data */
1459 if (tp->urg_data) {
1460 u32 urg_offset = tp->urg_seq - seq;
1461 if (urg_offset < len)
1462 len = urg_offset;
1463 if (!len)
1464 break;
1466 used = recv_actor(desc, skb, offset, len);
1467 if (used <= len) {
1468 seq += used;
1469 copied += used;
1470 offset += used;
1472 if (offset != skb->len)
1473 break;
1475 if (skb->h.th->fin) {
1476 tcp_eat_skb(sk, skb);
1477 ++seq;
1478 break;
1480 tcp_eat_skb(sk, skb);
1481 if (!desc->count)
1482 break;
1484 tp->copied_seq = seq;
1485 /* Clean up data we have read: This will do ACK frames. */
1486 if (copied)
1487 cleanup_rbuf(sk, copied);
1488 return copied;
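/*
 * The recv_actor callback consumes data in place and reports how much it
 * used; a minimal in-kernel sketch (illustrative only, the names are
 * hypothetical) that simply counts bytes up to desc->count:
 *
 *     static int count_actor(read_descriptor_t *desc, struct sk_buff *skb,
 *                            unsigned int offset, size_t len)
 *     {
 *             size_t used = min(len, (size_t)desc->count);
 *             // a real actor would copy or hand off skb data at `offset` here
 *             desc->count -= used;
 *             return used;
 *     }
 *
 * tcp_read_sock() advances copied_seq by the return value and stops once
 * desc->count reaches zero.
 */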
1492 * This routine copies from a sock struct into the user buffer.
1494 * Technical note: in 2.3 we work on _locked_ socket, so that
1495 * tricks with *seq access order and skb->users are not required.
1496 * Probably, code can be easily improved even more.
1499 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1500 int len, int nonblock, int flags, int *addr_len)
1502 struct tcp_opt *tp = tcp_sk(sk);
1503 int copied = 0;
1504 u32 peek_seq;
1505 u32 *seq;
1506 unsigned long used;
1507 int err;
1508 int target; /* Read at least this many bytes */
1509 long timeo;
1510 struct task_struct *user_recv = NULL;
1512 lock_sock(sk);
1514 TCP_CHECK_TIMER(sk);
1516 err = -ENOTCONN;
1517 if (sk->sk_state == TCP_LISTEN)
1518 goto out;
1520 timeo = sock_rcvtimeo(sk, nonblock);
1522 /* Urgent data needs to be handled specially. */
1523 if (flags & MSG_OOB)
1524 goto recv_urg;
1526 seq = &tp->copied_seq;
1527 if (flags & MSG_PEEK) {
1528 peek_seq = tp->copied_seq;
1529 seq = &peek_seq;
1532 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1534 do {
1535 struct sk_buff *skb;
1536 u32 offset;
1538 /* Are we at urgent data? Stop if we have read anything. */
1539 if (copied && tp->urg_data && tp->urg_seq == *seq)
1540 break;
1542 /* We need to check signals first, to get correct SIGURG
1543 * handling. FIXME: Need to check this doesn't impact 1003.1g
1544 * and move it down to the bottom of the loop
1546 if (signal_pending(current)) {
1547 if (copied)
1548 break;
1549 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1550 break;
1553 /* Next get a buffer. */
1555 skb = skb_peek(&sk->sk_receive_queue);
1556 do {
1557 if (!skb)
1558 break;
1560 /* Now that we have two receive queues this
1561 * shouldn't happen.
1563 if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1564 printk(KERN_INFO "recvmsg bug: copied %X "
1565 "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1566 break;
1568 offset = *seq - TCP_SKB_CB(skb)->seq;
1569 if (skb->h.th->syn)
1570 offset--;
1571 if (offset < skb->len)
1572 goto found_ok_skb;
1573 if (skb->h.th->fin)
1574 goto found_fin_ok;
1575 BUG_TRAP(flags & MSG_PEEK);
1576 skb = skb->next;
1577 } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
 1579 /* Well, if we have backlog, try to process it now. */
1581 if (copied >= target && !sk->sk_backlog.tail)
1582 break;
1584 if (copied) {
1585 if (sk->sk_err ||
1586 sk->sk_state == TCP_CLOSE ||
1587 (sk->sk_shutdown & RCV_SHUTDOWN) ||
1588 !timeo ||
1589 (flags & MSG_PEEK))
1590 break;
1591 } else {
1592 if (sock_flag(sk, SOCK_DONE))
1593 break;
1595 if (sk->sk_err) {
1596 copied = sock_error(sk);
1597 break;
1600 if (sk->sk_shutdown & RCV_SHUTDOWN)
1601 break;
1603 if (sk->sk_state == TCP_CLOSE) {
1604 if (!sock_flag(sk, SOCK_DONE)) {
1605 /* This occurs when user tries to read
1606 * from never connected socket.
1608 copied = -ENOTCONN;
1609 break;
1611 break;
1614 if (!timeo) {
1615 copied = -EAGAIN;
1616 break;
1620 cleanup_rbuf(sk, copied);
1622 if (tp->ucopy.task == user_recv) {
1623 /* Install new reader */
1624 if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1625 user_recv = current;
1626 tp->ucopy.task = user_recv;
1627 tp->ucopy.iov = msg->msg_iov;
1630 tp->ucopy.len = len;
1632 BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1633 (flags & (MSG_PEEK | MSG_TRUNC)));
1635 /* Ugly... If prequeue is not empty, we have to
1636 * process it before releasing socket, otherwise
1637 * order will be broken at second iteration.
1638 * More elegant solution is required!!!
1640 * Look: we have the following (pseudo)queues:
1642 * 1. packets in flight
1643 * 2. backlog
1644 * 3. prequeue
1645 * 4. receive_queue
1647 * Each queue can be processed only if the next ones
1648 * are empty. At this point we have empty receive_queue.
1649 * But prequeue _can_ be not empty after 2nd iteration,
1650 * when we jumped to start of loop because backlog
1651 * processing added something to receive_queue.
1652 * We cannot release_sock(), because backlog contains
1653 * packets arrived _after_ prequeued ones.
1655 * Shortly, algorithm is clear --- to process all
1656 * the queues in order. We could make it more directly,
1657 * requeueing packets from backlog to prequeue, if
 1658 * it is not empty. It is more elegant, but eats cycles,
1659 * unfortunately.
1661 if (skb_queue_len(&tp->ucopy.prequeue))
1662 goto do_prequeue;
1664 /* __ Set realtime policy in scheduler __ */
1667 if (copied >= target) {
1668 /* Do not sleep, just process backlog. */
1669 release_sock(sk);
1670 lock_sock(sk);
1671 } else {
1672 timeo = tcp_data_wait(sk, timeo);
1675 if (user_recv) {
1676 int chunk;
1678 /* __ Restore normal policy in scheduler __ */
1680 if ((chunk = len - tp->ucopy.len) != 0) {
1681 NET_ADD_STATS_USER(TCPDirectCopyFromBacklog, chunk);
1682 len -= chunk;
1683 copied += chunk;
1686 if (tp->rcv_nxt == tp->copied_seq &&
1687 skb_queue_len(&tp->ucopy.prequeue)) {
1688 do_prequeue:
1689 tcp_prequeue_process(sk);
1691 if ((chunk = len - tp->ucopy.len) != 0) {
1692 NET_ADD_STATS_USER(TCPDirectCopyFromPrequeue, chunk);
1693 len -= chunk;
1694 copied += chunk;
1698 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1699 if (net_ratelimit())
1700 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1701 current->comm, current->pid);
1702 peek_seq = tp->copied_seq;
1704 continue;
1706 found_ok_skb:
1707 /* Ok so how much can we use? */
1708 used = skb->len - offset;
1709 if (len < used)
1710 used = len;
1712 /* Do we have urgent data here? */
1713 if (tp->urg_data) {
1714 u32 urg_offset = tp->urg_seq - *seq;
1715 if (urg_offset < used) {
1716 if (!urg_offset) {
1717 if (!sock_flag(sk, SOCK_URGINLINE)) {
1718 ++*seq;
1719 offset++;
1720 used--;
1721 if (!used)
1722 goto skip_copy;
1724 } else
1725 used = urg_offset;
1729 if (!(flags & MSG_TRUNC)) {
1730 err = skb_copy_datagram_iovec(skb, offset,
1731 msg->msg_iov, used);
1732 if (err) {
1733 /* Exception. Bailout! */
1734 if (!copied)
1735 copied = -EFAULT;
1736 break;
1740 *seq += used;
1741 copied += used;
1742 len -= used;
1744 skip_copy:
1745 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1746 tp->urg_data = 0;
1747 tcp_fast_path_check(sk, tp);
1749 if (used + offset < skb->len)
1750 continue;
1752 if (skb->h.th->fin)
1753 goto found_fin_ok;
1754 if (!(flags & MSG_PEEK))
1755 tcp_eat_skb(sk, skb);
1756 continue;
1758 found_fin_ok:
1759 /* Process the FIN. */
1760 ++*seq;
1761 if (!(flags & MSG_PEEK))
1762 tcp_eat_skb(sk, skb);
1763 break;
1764 } while (len > 0);
1766 if (user_recv) {
1767 if (skb_queue_len(&tp->ucopy.prequeue)) {
1768 int chunk;
1770 tp->ucopy.len = copied > 0 ? len : 0;
1772 tcp_prequeue_process(sk);
1774 if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1775 NET_ADD_STATS_USER(TCPDirectCopyFromPrequeue, chunk);
1776 len -= chunk;
1777 copied += chunk;
1781 tp->ucopy.task = NULL;
1782 tp->ucopy.len = 0;
1785 /* According to UNIX98, msg_name/msg_namelen are ignored
 1786 * on a connected socket. I was just happy when I found this 8) --ANK
1789 /* Clean up data we have read: This will do ACK frames. */
1790 cleanup_rbuf(sk, copied);
1792 TCP_CHECK_TIMER(sk);
1793 release_sock(sk);
1794 return copied;
1796 out:
1797 TCP_CHECK_TIMER(sk);
1798 release_sock(sk);
1799 return err;
1801 recv_urg:
1802 err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1803 goto out;
1807 * State processing on a close. This implements the state shift for
1808 * sending our FIN frame. Note that we only send a FIN for some
1809 * states. A shutdown() may have already sent the FIN, or we may be
1810 * closed.
1813 static unsigned char new_state[16] = {
1814 /* current state: new state: action: */
1815 /* (Invalid) */ TCP_CLOSE,
1816 /* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1817 /* TCP_SYN_SENT */ TCP_CLOSE,
1818 /* TCP_SYN_RECV */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1819 /* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1,
1820 /* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2,
1821 /* TCP_TIME_WAIT */ TCP_CLOSE,
1822 /* TCP_CLOSE */ TCP_CLOSE,
1823 /* TCP_CLOSE_WAIT */ TCP_LAST_ACK | TCP_ACTION_FIN,
1824 /* TCP_LAST_ACK */ TCP_LAST_ACK,
1825 /* TCP_LISTEN */ TCP_CLOSE,
1826 /* TCP_CLOSING */ TCP_CLOSING,
1829 static int tcp_close_state(struct sock *sk)
1831 int next = (int)new_state[sk->sk_state];
1832 int ns = next & TCP_STATE_MASK;
1834 tcp_set_state(sk, ns);
1836 return next & TCP_ACTION_FIN;
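/*
 * Example of the table encoding above: new_state[TCP_ESTABLISHED] is
 * TCP_FIN_WAIT1 | TCP_ACTION_FIN, so tcp_close_state() moves the socket
 * to FIN_WAIT1 and tells the caller to emit a FIN, while
 * new_state[TCP_FIN_WAIT1] is plain TCP_FIN_WAIT1: the state is kept and
 * no second FIN is requested.
 */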
1840 * Shutdown the sending side of a connection. Much like close except
1841 * that we don't receive shut down or set_sock_flag(sk, SOCK_DEAD).
1844 void tcp_shutdown(struct sock *sk, int how)
1846 /* We need to grab some memory, and put together a FIN,
1847 * and then put it into the queue to be sent.
1848 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1850 if (!(how & SEND_SHUTDOWN))
1851 return;
1853 /* If we've already sent a FIN, or it's a closed state, skip this. */
1854 if ((1 << sk->sk_state) &
1855 (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1856 TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1857 /* Clear out any half completed packets. FIN if needed. */
1858 if (tcp_close_state(sk))
1859 tcp_send_fin(sk);
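/*
 * A userspace half-close reaches this point via inet_shutdown(); a sketch
 * (assumes a connected TCP socket `tcp_fd`):
 *
 *     shutdown(tcp_fd, SHUT_WR);   // arrives here as SEND_SHUTDOWN: a FIN
 *                                  // is sent, the receive side stays usable
 */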
1865 * Return 1 if we still have things to send in our buffers.
1868 static inline int closing(struct sock *sk)
1870 return (1 << sk->sk_state) &
1871 (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK);
1874 static __inline__ void tcp_kill_sk_queues(struct sock *sk)
1876 /* First the read buffer. */
1877 __skb_queue_purge(&sk->sk_receive_queue);
1879 /* Next, the error queue. */
1880 __skb_queue_purge(&sk->sk_error_queue);
1882 /* Next, the write queue. */
1883 BUG_TRAP(skb_queue_empty(&sk->sk_write_queue));
1885 /* Account for returned memory. */
1886 tcp_mem_reclaim(sk);
1888 BUG_TRAP(!sk->sk_wmem_queued);
1889 BUG_TRAP(!sk->sk_forward_alloc);
1891 /* It is _impossible_ for the backlog to contain anything
1892 * when we get here. All user references to this socket
 1893 * have gone away, only the net layer can touch it.
1898 * At this point, there should be no process reference to this
1899 * socket, and thus no user references at all. Therefore we
1900 * can assume the socket waitqueue is inactive and nobody will
1901 * try to jump onto it.
1903 void tcp_destroy_sock(struct sock *sk)
1905 BUG_TRAP(sk->sk_state == TCP_CLOSE);
1906 BUG_TRAP(sock_flag(sk, SOCK_DEAD));
1908 /* It cannot be in hash table! */
1909 BUG_TRAP(sk_unhashed(sk));
 1911 /* If inet_sk(sk)->num is not 0, the socket must be bound */
1912 BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
1914 #ifdef TCP_DEBUG
1915 if (sk->sk_zapped) {
1916 printk(KERN_DEBUG "TCP: double destroy sk=%p\n", sk);
1917 sock_hold(sk);
1919 sk->sk_zapped = 1;
1920 #endif
1922 sk->sk_prot->destroy(sk);
1924 tcp_kill_sk_queues(sk);
1926 xfrm_sk_free_policy(sk);
1928 #ifdef INET_REFCNT_DEBUG
1929 if (atomic_read(&sk->sk_refcnt) != 1) {
1930 printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
1931 sk, atomic_read(&sk->sk_refcnt));
1933 #endif
1935 atomic_dec(&tcp_orphan_count);
1936 sock_put(sk);
1939 void tcp_close(struct sock *sk, long timeout)
1941 struct sk_buff *skb;
1942 int data_was_unread = 0;
1944 lock_sock(sk);
1945 sk->sk_shutdown = SHUTDOWN_MASK;
1947 if (sk->sk_state == TCP_LISTEN) {
1948 tcp_set_state(sk, TCP_CLOSE);
1950 /* Special case. */
1951 tcp_listen_stop(sk);
1953 goto adjudge_to_death;
1956 /* We need to flush the recv. buffs. We do this only on the
1957 * descriptor close, not protocol-sourced closes, because the
1958 * reader process may not have drained the data yet!
1960 while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1961 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1962 skb->h.th->fin;
1963 data_was_unread += len;
1964 __kfree_skb(skb);
1967 tcp_mem_reclaim(sk);
1969 /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1970 * 3.10, we send a RST here because data was lost. To
1971 * witness the awful effects of the old behavior of always
1972 * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1973 * a bulk GET in an FTP client, suspend the process, wait
1974 * for the client to advertise a zero window, then kill -9
1975 * the FTP client, wheee... Note: timeout is always zero
1976 * in such a case.
1978 if (data_was_unread) {
1979 /* Unread data was tossed, zap the connection. */
1980 NET_INC_STATS_USER(TCPAbortOnClose);
1981 tcp_set_state(sk, TCP_CLOSE);
1982 tcp_send_active_reset(sk, GFP_KERNEL);
1983 } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1984 /* Check zero linger _after_ checking for unread data. */
1985 sk->sk_prot->disconnect(sk, 0);
1986 NET_INC_STATS_USER(TCPAbortOnData);
1987 } else if (tcp_close_state(sk)) {
1988 /* We FIN if the application ate all the data before
1989 * zapping the connection.
1992 /* RED-PEN. Formally speaking, we have broken TCP state
1993 * machine. State transitions:
1995 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1996 * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1997 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1999 * are legal only when FIN has been sent (i.e. in window),
2000 * rather than queued out of window. Purists blame.
2002 * F.e. "RFC state" is ESTABLISHED,
2003 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
 2005 * The visible deviations are that sometimes
 2006 * we enter the time-wait state when it is not really required
 2007 * (harmless), and do not send active resets when they are
 2008 * required by the specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
 2009 * they look like CLOSING or LAST_ACK to Linux)
2010 * Probably, I missed some more holelets.
2011 * --ANK
2013 tcp_send_fin(sk);
2016 if (timeout) {
2017 struct task_struct *tsk = current;
2018 DEFINE_WAIT(wait);
2020 do {
2021 prepare_to_wait(sk->sk_sleep, &wait,
2022 TASK_INTERRUPTIBLE);
2023 if (!closing(sk))
2024 break;
2025 release_sock(sk);
2026 timeout = schedule_timeout(timeout);
2027 lock_sock(sk);
2028 } while (!signal_pending(tsk) && timeout);
2030 finish_wait(sk->sk_sleep, &wait);
2033 adjudge_to_death:
2034 /* It is the last release_sock in its life. It will remove backlog. */
2035 release_sock(sk);
2038 /* Now socket is owned by kernel and we acquire BH lock
2039 to finish close. No need to check for user refs.
2041 local_bh_disable();
2042 bh_lock_sock(sk);
2043 BUG_TRAP(!sock_owned_by_user(sk));
2045 sock_hold(sk);
2046 sock_orphan(sk);
2048 /* This is a (useful) BSD violation of the RFC. There is a
2049 * problem with TCP as specified in that the other end could
2050 * keep a socket open forever with no application left at this end.
2051 * We use a 3 minute timeout (about the same as BSD) then kill
2052 * our end. If they send after that then tough - BUT: long enough
2053 * that we won't make the old 4*rto = almost no time - whoops
2054 * reset mistake.
2056 * Nope, it was not a mistake. It is really desired behaviour,
2057 * f.e. on http servers, where such sockets are useless but
2058 * consume significant resources. Let's handle it with the special
2059 * linger2 option. --ANK
2060 */
2062 if (sk->sk_state == TCP_FIN_WAIT2) {
2063 struct tcp_opt *tp = tcp_sk(sk);
2064 if (tp->linger2 < 0) {
2065 tcp_set_state(sk, TCP_CLOSE);
2066 tcp_send_active_reset(sk, GFP_ATOMIC);
2067 NET_INC_STATS_BH(TCPAbortOnLinger);
2068 } else {
2069 int tmo = tcp_fin_time(tp);
2071 if (tmo > TCP_TIMEWAIT_LEN) {
2072 tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
2073 } else {
2074 atomic_inc(&tcp_orphan_count);
2075 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
2076 goto out;
2077 }
2078 }
2079 }
2080 if (sk->sk_state != TCP_CLOSE) {
2081 tcp_mem_reclaim(sk);
2082 if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
2083 (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
2084 atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
2085 if (net_ratelimit())
2086 printk(KERN_INFO "TCP: too many orphaned "
2087 "sockets\n");
2088 tcp_set_state(sk, TCP_CLOSE);
2089 tcp_send_active_reset(sk, GFP_ATOMIC);
2090 NET_INC_STATS_BH(TCPAbortOnMemory);
2091 }
2092 }
2093 atomic_inc(&tcp_orphan_count);
2095 if (sk->sk_state == TCP_CLOSE)
2096 tcp_destroy_sock(sk);
2097 /* Otherwise, socket is reprieved until protocol close. */
2099 out:
2100 bh_unlock_sock(sk);
2101 local_bh_enable();
2102 sock_put(sk);
2103 }
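/*
 * Usage sketch (userspace, illustrative only): both abort paths handled
 * above are visible to applications. Calling close() while received data
 * is still unread takes the data_was_unread branch and sends a RST, and
 * SO_LINGER with l_linger == 0 takes the zero-linger branch. A minimal,
 * hypothetical helper forcing the latter:
 *
 *    #include <sys/socket.h>
 *    #include <unistd.h>
 *
 *    void abortive_close(int fd)
 *    {
 *        struct linger lg = { .l_onoff = 1, .l_linger = 0 };
 *
 *        setsockopt(fd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));
 *        close(fd);
 *    }
 */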
2105 /* These states need RST on ABORT according to RFC793 */
2107 static inline int tcp_need_reset(int state)
2108 {
2109 return (1 << state) &
2110 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
2111 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
2112 }
2114 int tcp_disconnect(struct sock *sk, int flags)
2115 {
2116 struct inet_opt *inet = inet_sk(sk);
2117 struct tcp_opt *tp = tcp_sk(sk);
2118 int err = 0;
2119 int old_state = sk->sk_state;
2121 if (old_state != TCP_CLOSE)
2122 tcp_set_state(sk, TCP_CLOSE);
2124 /* ABORT function of RFC793 */
2125 if (old_state == TCP_LISTEN) {
2126 tcp_listen_stop(sk);
2127 } else if (tcp_need_reset(old_state) ||
2128 (tp->snd_nxt != tp->write_seq &&
2129 (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
2130 /* The last check adjusts for the discrepancy between Linux
2131 * and RFC states.
2132 */
2133 tcp_send_active_reset(sk, gfp_any());
2134 sk->sk_err = ECONNRESET;
2135 } else if (old_state == TCP_SYN_SENT)
2136 sk->sk_err = ECONNRESET;
2138 tcp_clear_xmit_timers(sk);
2139 __skb_queue_purge(&sk->sk_receive_queue);
2140 tcp_writequeue_purge(sk);
2141 __skb_queue_purge(&tp->out_of_order_queue);
2143 inet->dport = 0;
2145 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
2146 inet_reset_saddr(sk);
2148 sk->sk_shutdown = 0;
2149 sock_reset_flag(sk, SOCK_DONE);
2150 tp->srtt = 0;
2151 if ((tp->write_seq += tp->max_window + 2) == 0)
2152 tp->write_seq = 1;
2153 tp->backoff = 0;
2154 tp->snd_cwnd = 2;
2155 tp->probes_out = 0;
2156 tp->packets_out = 0;
2157 tp->snd_ssthresh = 0x7fffffff;
2158 tp->snd_cwnd_cnt = 0;
2159 tp->ca_state = TCP_CA_Open;
2160 tcp_clear_retrans(tp);
2161 tcp_delack_init(tp);
2162 tp->send_head = NULL;
2163 tp->saw_tstamp = 0;
2164 tcp_sack_reset(tp);
2165 __sk_dst_reset(sk);
2167 BUG_TRAP(!inet->num || tp->bind_hash);
2169 sk->sk_error_report(sk);
2170 return err;
2171 }
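/*
 * Background note (an assumption for illustration, not taken from this
 * file): in kernels of this vintage inet_stream_connect() maps connect()
 * with an AF_UNSPEC address onto sk->sk_prot->disconnect(), i.e. onto
 * tcp_disconnect() above. A hypothetical userspace sketch of that path:
 *
 *    #include <string.h>
 *    #include <sys/socket.h>
 *
 *    int tcp_dissolve(int fd)
 *    {
 *        struct sockaddr sa;
 *
 *        memset(&sa, 0, sizeof(sa));
 *        sa.sa_family = AF_UNSPEC;
 *        return connect(fd, &sa, sizeof(sa));
 *    }
 */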
2173 /*
2174 * Wait for an incoming connection, avoid race
2175 * conditions. This must be called with the socket locked.
2176 */
2177 static int wait_for_connect(struct sock *sk, long timeo)
2178 {
2179 struct tcp_opt *tp = tcp_sk(sk);
2180 DEFINE_WAIT(wait);
2181 int err;
2183 /*
2184 * True wake-one mechanism for incoming connections: only
2185 * one process gets woken up, not the 'whole herd'.
2186 * Since we do not 'race & poll' for established sockets
2187 * anymore, the common case will execute the loop only once.
2189 * Subtle issue: "add_wait_queue_exclusive()" will be added
2190 * after any current non-exclusive waiters, and we know that
2191 * it will always _stay_ after any new non-exclusive waiters
2192 * because all non-exclusive waiters are added at the
2193 * beginning of the wait-queue. As such, it's ok to "drop"
2194 * our exclusiveness temporarily when we get woken up without
2195 * having to remove and re-insert us on the wait queue.
2196 */
2197 for (;;) {
2198 prepare_to_wait_exclusive(sk->sk_sleep, &wait,
2199 TASK_INTERRUPTIBLE);
2200 release_sock(sk);
2201 if (!tp->accept_queue)
2202 timeo = schedule_timeout(timeo);
2203 lock_sock(sk);
2204 err = 0;
2205 if (tp->accept_queue)
2206 break;
2207 err = -EINVAL;
2208 if (sk->sk_state != TCP_LISTEN)
2209 break;
2210 err = sock_intr_errno(timeo);
2211 if (signal_pending(current))
2212 break;
2213 err = -EAGAIN;
2214 if (!timeo)
2215 break;
2216 }
2217 finish_wait(sk->sk_sleep, &wait);
2218 return err;
2219 }
2221 /*
2222 * This will accept the next outstanding connection.
2223 */
2225 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
2226 {
2227 struct tcp_opt *tp = tcp_sk(sk);
2228 struct open_request *req;
2229 struct sock *newsk;
2230 int error;
2232 lock_sock(sk);
2234 /* We need to make sure that this socket is listening,
2235 * and that it has something pending.
2236 */
2237 error = -EINVAL;
2238 if (sk->sk_state != TCP_LISTEN)
2239 goto out;
2241 /* Find already established connection */
2242 if (!tp->accept_queue) {
2243 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
2245 /* If this is a non blocking socket don't sleep */
2246 error = -EAGAIN;
2247 if (!timeo)
2248 goto out;
2250 error = wait_for_connect(sk, timeo);
2251 if (error)
2252 goto out;
2253 }
2255 req = tp->accept_queue;
2256 if ((tp->accept_queue = req->dl_next) == NULL)
2257 tp->accept_queue_tail = NULL;
2259 newsk = req->sk;
2260 tcp_acceptq_removed(sk);
2261 tcp_openreq_fastfree(req);
2262 BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
2263 release_sock(sk);
2264 return newsk;
2266 out:
2267 release_sock(sk);
2268 *err = error;
2269 return NULL;
2270 }
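/*
 * Usage sketch (userspace, illustrative only): the error paths above are
 * the ordinary accept() contract. With the listener marked O_NONBLOCK an
 * empty accept_queue surfaces as EAGAIN (a real program would poll()
 * before retrying); a blocking listener simply sleeps in
 * wait_for_connect(). Hypothetical helper:
 *
 *    #include <errno.h>
 *    #include <sys/socket.h>
 *
 *    int accept_one(int lfd)
 *    {
 *        int cfd;
 *
 *        do {
 *            cfd = accept(lfd, NULL, NULL);
 *        } while (cfd < 0 && errno == EINTR);
 *        return cfd;
 *    }
 */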
2272 /*
2273 * Socket option code for TCP.
2274 */
2275 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval,
2276 int optlen)
2277 {
2278 struct tcp_opt *tp = tcp_sk(sk);
2279 int val;
2280 int err = 0;
2282 if (level != SOL_TCP)
2283 return tp->af_specific->setsockopt(sk, level, optname,
2284 optval, optlen);
2286 if (optlen < sizeof(int))
2287 return -EINVAL;
2289 if (get_user(val, (int *)optval))
2290 return -EFAULT;
2292 lock_sock(sk);
2294 switch (optname) {
2295 case TCP_MAXSEG:
2296 /* Values greater than interface MTU won't take effect. However
2297 * at the point when this call is done we typically don't yet
2298 * know which interface is going to be used */
2299 if (val < 8 || val > MAX_TCP_WINDOW) {
2300 err = -EINVAL;
2301 break;
2302 }
2303 tp->user_mss = val;
2304 break;
2306 case TCP_NODELAY:
2307 if (val) {
2308 /* TCP_NODELAY is weaker than TCP_CORK, so that
2309 * this option on corked socket is remembered, but
2310 * it is not activated until cork is cleared.
2312 * However, when TCP_NODELAY is set we make
2313 * an explicit push, which overrides even TCP_CORK
2314 * for currently queued segments.
2315 */
2316 tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2317 tcp_push_pending_frames(sk, tp);
2318 } else {
2319 tp->nonagle &= ~TCP_NAGLE_OFF;
2320 }
2321 break;
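/*
 * Usage sketch (userspace, illustrative only): disabling Nagle for a
 * latency-sensitive connection maps onto the TCP_NAGLE_OFF path above.
 * Error handling is omitted.
 *
 *    #include <netinet/in.h>
 *    #include <netinet/tcp.h>
 *    #include <sys/socket.h>
 *
 *    int one = 1;
 *
 *    setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));
 */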
2323 case TCP_CORK:
2324 /* When set indicates to always queue non-full frames.
2325 * Later the user clears this option and we transmit
2326 * any pending partial frames in the queue. This is
2327 * meant to be used alongside sendfile() to get properly
2328 * filled frames when the user (for example) must write
2329 * out headers with a write() call first and then use
2330 * sendfile to send out the data parts.
2332 * TCP_CORK can be set together with TCP_NODELAY and it is
2333 * stronger than TCP_NODELAY.
2334 */
2335 if (val) {
2336 tp->nonagle |= TCP_NAGLE_CORK;
2337 } else {
2338 tp->nonagle &= ~TCP_NAGLE_CORK;
2339 if (tp->nonagle&TCP_NAGLE_OFF)
2340 tp->nonagle |= TCP_NAGLE_PUSH;
2341 tcp_push_pending_frames(sk, tp);
2342 }
2343 break;
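/*
 * Usage sketch (userspace, illustrative only) of the write()-headers-
 * then-sendfile() pattern described above: cork, queue the headers,
 * stream the payload, then uncork to flush any remaining partial frame.
 * Descriptor and length names are hypothetical; error handling omitted.
 *
 *    #include <netinet/in.h>
 *    #include <netinet/tcp.h>
 *    #include <sys/sendfile.h>
 *    #include <sys/socket.h>
 *    #include <unistd.h>
 *
 *    void send_response(int sock, int filefd, const char *hdr,
 *                       size_t hdrlen, size_t filelen)
 *    {
 *        int on = 1, off = 0;
 *
 *        setsockopt(sock, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
 *        write(sock, hdr, hdrlen);
 *        sendfile(sock, filefd, NULL, filelen);
 *        setsockopt(sock, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
 *    }
 */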
2345 case TCP_KEEPIDLE:
2346 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2347 err = -EINVAL;
2348 else {
2349 tp->keepalive_time = val * HZ;
2350 if (sock_flag(sk, SOCK_KEEPOPEN) &&
2351 !((1 << sk->sk_state) &
2352 (TCPF_CLOSE | TCPF_LISTEN))) {
2353 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2354 if (tp->keepalive_time > elapsed)
2355 elapsed = tp->keepalive_time - elapsed;
2356 else
2357 elapsed = 0;
2358 tcp_reset_keepalive_timer(sk, elapsed);
2359 }
2360 }
2361 break;
2362 case TCP_KEEPINTVL:
2363 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2364 err = -EINVAL;
2365 else
2366 tp->keepalive_intvl = val * HZ;
2367 break;
2368 case TCP_KEEPCNT:
2369 if (val < 1 || val > MAX_TCP_KEEPCNT)
2370 err = -EINVAL;
2371 else
2372 tp->keepalive_probes = val;
2373 break;
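/*
 * Usage sketch (userspace, illustrative only): the three knobs above are
 * consulted once SO_KEEPALIVE is enabled on the socket; idle time and
 * probe interval are given in seconds, matching the val * HZ conversion
 * here. The numbers below are arbitrary.
 *
 *    #include <netinet/in.h>
 *    #include <netinet/tcp.h>
 *    #include <sys/socket.h>
 *
 *    int on = 1, idle = 60, intvl = 10, cnt = 5;
 *
 *    setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
 *    setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
 *    setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
 *    setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));
 */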
2374 case TCP_SYNCNT:
2375 if (val < 1 || val > MAX_TCP_SYNCNT)
2376 err = -EINVAL;
2377 else
2378 tp->syn_retries = val;
2379 break;
2381 case TCP_LINGER2:
2382 if (val < 0)
2383 tp->linger2 = -1;
2384 else if (val > sysctl_tcp_fin_timeout / HZ)
2385 tp->linger2 = 0;
2386 else
2387 tp->linger2 = val * HZ;
2388 break;
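/*
 * Usage sketch (userspace, illustrative only): TCP_LINGER2 is the
 * "linger2 option" referred to in the FIN_WAIT2 comment in tcp_close()
 * above. A negative value makes an orphaned socket reset instead of
 * lingering in FIN_WAIT2, and values above sysctl_tcp_fin_timeout fall
 * back to that sysctl, as the code above shows.
 *
 *    #include <netinet/in.h>
 *    #include <netinet/tcp.h>
 *    #include <sys/socket.h>
 *
 *    int fin_wait_secs = 10;
 *
 *    setsockopt(fd, IPPROTO_TCP, TCP_LINGER2, &fin_wait_secs,
 *               sizeof(fin_wait_secs));
 */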
2390 case TCP_DEFER_ACCEPT:
2391 tp->defer_accept = 0;
2392 if (val > 0) {
2393 /* Translate value in seconds to number of
2394 * retransmits */
2395 while (tp->defer_accept < 32 &&
2396 val > ((TCP_TIMEOUT_INIT / HZ) <<
2397 tp->defer_accept))
2398 tp->defer_accept++;
2399 tp->defer_accept++;
2400 }
2401 break;
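/*
 * Worked example (illustrative, assuming TCP_TIMEOUT_INIT of 3 * HZ as
 * in kernels of this era): a request of val = 10 seconds walks 3 < 10,
 * 6 < 10, stops once 12 >= 10 with defer_accept == 2, and the trailing
 * increment stores 3 retransmissions; tcp_getsockopt() reports that back
 * as 3 << (3 - 1) = 12 seconds.
 *
 *    #include <netinet/in.h>
 *    #include <netinet/tcp.h>
 *    #include <sys/socket.h>
 *
 *    int defer_secs = 10;
 *
 *    setsockopt(lfd, IPPROTO_TCP, TCP_DEFER_ACCEPT, &defer_secs,
 *               sizeof(defer_secs));
 */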
2403 case TCP_WINDOW_CLAMP:
2404 if (!val) {
2405 if (sk->sk_state != TCP_CLOSE) {
2406 err = -EINVAL;
2407 break;
2408 }
2409 tp->window_clamp = 0;
2410 } else
2411 tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2412 SOCK_MIN_RCVBUF / 2 : val;
2413 break;
2415 case TCP_QUICKACK:
2416 if (!val) {
2417 tp->ack.pingpong = 1;
2418 } else {
2419 tp->ack.pingpong = 0;
2420 if ((1 << sk->sk_state) &
2421 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2422 tcp_ack_scheduled(tp)) {
2423 tp->ack.pending |= TCP_ACK_PUSHED;
2424 cleanup_rbuf(sk, 1);
2425 if (!(val & 1))
2426 tp->ack.pingpong = 1;
2427 }
2428 }
2429 break;
2431 default:
2432 err = -ENOPROTOOPT;
2433 break;
2434 }
2435 release_sock(sk);
2436 return err;
2437 }
2439 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,
2440 int *optlen)
2441 {
2442 struct tcp_opt *tp = tcp_sk(sk);
2443 int val, len;
2445 if (level != SOL_TCP)
2446 return tp->af_specific->getsockopt(sk, level, optname,
2447 optval, optlen);
2449 if (get_user(len, optlen))
2450 return -EFAULT;
2452 len = min_t(unsigned int, len, sizeof(int));
2454 if (len < 0)
2455 return -EINVAL;
2457 switch (optname) {
2458 case TCP_MAXSEG:
2459 val = tp->mss_cache_std;
2460 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2461 val = tp->user_mss;
2462 break;
2463 case TCP_NODELAY:
2464 val = !!(tp->nonagle&TCP_NAGLE_OFF);
2465 break;
2466 case TCP_CORK:
2467 val = !!(tp->nonagle&TCP_NAGLE_CORK);
2468 break;
2469 case TCP_KEEPIDLE:
2470 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2471 break;
2472 case TCP_KEEPINTVL:
2473 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2474 break;
2475 case TCP_KEEPCNT:
2476 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2477 break;
2478 case TCP_SYNCNT:
2479 val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2480 break;
2481 case TCP_LINGER2:
2482 val = tp->linger2;
2483 if (val >= 0)
2484 val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2485 break;
2486 case TCP_DEFER_ACCEPT:
2487 val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
2488 (tp->defer_accept - 1));
2489 break;
2490 case TCP_WINDOW_CLAMP:
2491 val = tp->window_clamp;
2492 break;
2493 case TCP_INFO: {
2494 struct tcp_info info;
2495 u32 now = tcp_time_stamp;
2497 if (get_user(len, optlen))
2498 return -EFAULT;
2499 info.tcpi_state = sk->sk_state;
2500 info.tcpi_ca_state = tp->ca_state;
2501 info.tcpi_retransmits = tp->retransmits;
2502 info.tcpi_probes = tp->probes_out;
2503 info.tcpi_backoff = tp->backoff;
2504 info.tcpi_options = 0;
2505 if (tp->tstamp_ok)
2506 info.tcpi_options |= TCPI_OPT_TIMESTAMPS;
2507 if (tp->sack_ok)
2508 info.tcpi_options |= TCPI_OPT_SACK;
2509 if (tp->wscale_ok) {
2510 info.tcpi_options |= TCPI_OPT_WSCALE;
2511 info.tcpi_snd_wscale = tp->snd_wscale;
2512 info.tcpi_rcv_wscale = tp->rcv_wscale;
2513 } else {
2514 info.tcpi_snd_wscale = 0;
2515 info.tcpi_rcv_wscale = 0;
2516 }
2517 if (tp->ecn_flags & TCP_ECN_OK)
2518 info.tcpi_options |= TCPI_OPT_ECN;
2520 info.tcpi_rto = (1000000 * tp->rto) / HZ;
2521 info.tcpi_ato = (1000000 * tp->ack.ato) / HZ;
2522 info.tcpi_snd_mss = tp->mss_cache_std;
2523 info.tcpi_rcv_mss = tp->ack.rcv_mss;
2525 info.tcpi_unacked = tp->packets_out;
2526 info.tcpi_sacked = tp->sacked_out;
2527 info.tcpi_lost = tp->lost_out;
2528 info.tcpi_retrans = tp->retrans_out;
2529 info.tcpi_fackets = tp->fackets_out;
2531 info.tcpi_last_data_sent = ((now - tp->lsndtime) * 1000) / HZ;
2532 info.tcpi_last_ack_sent = 0;
2533 info.tcpi_last_data_recv = ((now -
2534 tp->ack.lrcvtime) * 1000) / HZ;
2535 info.tcpi_last_ack_recv = ((now - tp->rcv_tstamp) * 1000) / HZ;
2537 info.tcpi_pmtu = tp->pmtu_cookie;
2538 info.tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2539 info.tcpi_rtt = ((1000000 * tp->srtt) / HZ) >> 3;
2540 info.tcpi_rttvar = ((1000000 * tp->mdev) / HZ) >> 2;
2541 info.tcpi_snd_ssthresh = tp->snd_ssthresh;
2542 info.tcpi_snd_cwnd = tp->snd_cwnd;
2543 info.tcpi_advmss = tp->advmss;
2544 info.tcpi_reordering = tp->reordering;
2546 len = min_t(unsigned int, len, sizeof(info));
2547 if (put_user(len, optlen))
2548 return -EFAULT;
2549 if (copy_to_user(optval, &info, len))
2550 return -EFAULT;
2551 return 0;
2552 }
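/*
 * Usage sketch (userspace, illustrative only) for reading the structure
 * filled in above. Whether struct tcp_info is exposed by <netinet/tcp.h>
 * or needs <linux/tcp.h> depends on the libc in use, so treat the include
 * as an assumption; tcpi_rtt is in microseconds, as the scaling above
 * shows.
 *
 *    #include <netinet/in.h>
 *    #include <netinet/tcp.h>
 *    #include <stdio.h>
 *    #include <sys/socket.h>
 *
 *    struct tcp_info ti;
 *    socklen_t len = sizeof(ti);
 *
 *    if (!getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len))
 *        printf("rtt %u us cwnd %u retrans %u\n",
 *               ti.tcpi_rtt, ti.tcpi_snd_cwnd, ti.tcpi_retrans);
 */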
2553 case TCP_QUICKACK:
2554 val = !tp->ack.pingpong;
2555 break;
2556 default:
2557 return -ENOPROTOOPT;
2558 }
2560 if (put_user(len, optlen))
2561 return -EFAULT;
2562 if (copy_to_user(optval, &val, len))
2563 return -EFAULT;
2564 return 0;
2565 }
2568 extern void __skb_cb_too_small_for_tcp(int, int);
2569 extern void tcpdiag_init(void);
2571 void __init tcp_init(void)
2572 {
2573 struct sk_buff *skb = NULL;
2574 unsigned long goal;
2575 int order, i;
2577 if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2578 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2579 sizeof(skb->cb));
2581 tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
2582 sizeof(struct open_request),
2583 0, SLAB_HWCACHE_ALIGN,
2584 NULL, NULL);
2585 if (!tcp_openreq_cachep)
2586 panic("tcp_init: Cannot alloc open_request cache.");
2588 tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2589 sizeof(struct tcp_bind_bucket),
2590 0, SLAB_HWCACHE_ALIGN,
2591 NULL, NULL);
2592 if (!tcp_bucket_cachep)
2593 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2595 tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2596 sizeof(struct tcp_tw_bucket),
2597 0, SLAB_HWCACHE_ALIGN,
2598 NULL, NULL);
2599 if (!tcp_timewait_cachep)
2600 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2602 /* Size and allocate the main established and bind bucket
2603 * hash tables.
2605 * The methodology is similar to that of the buffer cache.
2606 */
2607 if (num_physpages >= (128 * 1024))
2608 goal = num_physpages >> (21 - PAGE_SHIFT);
2609 else
2610 goal = num_physpages >> (23 - PAGE_SHIFT);
2612 for (order = 0; (1UL << order) < goal; order++)
2613 ;
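/*
 * Worked example (illustrative, assuming 4 KB pages, PAGE_SHIFT == 12):
 * a 512 MB machine has num_physpages == 131072 == 128 * 1024, so
 * goal = 131072 >> (21 - 12) = 256 pages (1 MB of hash table), and the
 * order loop above stops at order == 8 since 1UL << 8 == 256. Smaller
 * machines use the (23 - PAGE_SHIFT) shift and get a quarter of that.
 */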
2614 do {
2615 tcp_ehash_size = (1UL << order) * PAGE_SIZE /
2616 sizeof(struct tcp_ehash_bucket);
2617 tcp_ehash_size >>= 1;
2618 while (tcp_ehash_size & (tcp_ehash_size - 1))
2619 tcp_ehash_size--;
2620 tcp_ehash = (struct tcp_ehash_bucket *)
2621 __get_free_pages(GFP_ATOMIC, order);
2622 } while (!tcp_ehash && --order > 0);
2624 if (!tcp_ehash)
2625 panic("Failed to allocate TCP established hash table\n");
2626 for (i = 0; i < (tcp_ehash_size << 1); i++) {
2627 tcp_ehash[i].lock = RW_LOCK_UNLOCKED;
2628 INIT_HLIST_HEAD(&tcp_ehash[i].chain);
2629 }
2631 do {
2632 tcp_bhash_size = (1UL << order) * PAGE_SIZE /
2633 sizeof(struct tcp_bind_hashbucket);
2634 if ((tcp_bhash_size > (64 * 1024)) && order > 0)
2635 continue;
2636 tcp_bhash = (struct tcp_bind_hashbucket *)
2637 __get_free_pages(GFP_ATOMIC, order);
2638 } while (!tcp_bhash && --order >= 0);
2640 if (!tcp_bhash)
2641 panic("Failed to allocate TCP bind hash table\n");
2642 for (i = 0; i < tcp_bhash_size; i++) {
2643 tcp_bhash[i].lock = SPIN_LOCK_UNLOCKED;
2644 INIT_HLIST_HEAD(&tcp_bhash[i].chain);
2645 }
2647 /* Try to be a bit smarter and adjust defaults depending
2648 * on available memory.
2649 */
2650 if (order > 4) {
2651 sysctl_local_port_range[0] = 32768;
2652 sysctl_local_port_range[1] = 61000;
2653 sysctl_tcp_max_tw_buckets = 180000;
2654 sysctl_tcp_max_orphans = 4096 << (order - 4);
2655 sysctl_max_syn_backlog = 1024;
2656 } else if (order < 3) {
2657 sysctl_local_port_range[0] = 1024 * (3 - order);
2658 sysctl_tcp_max_tw_buckets >>= (3 - order);
2659 sysctl_tcp_max_orphans >>= (3 - order);
2660 sysctl_max_syn_backlog = 128;
2661 }
2662 tcp_port_rover = sysctl_local_port_range[0] - 1;
2664 sysctl_tcp_mem[0] = 768 << order;
2665 sysctl_tcp_mem[1] = 1024 << order;
2666 sysctl_tcp_mem[2] = 1536 << order;
2667 if (sysctl_tcp_mem[2] - sysctl_tcp_mem[1] > 512)
2668 sysctl_tcp_mem[1] = sysctl_tcp_mem[2] - 512;
2669 if (sysctl_tcp_mem[1] - sysctl_tcp_mem[0] > 512)
2670 sysctl_tcp_mem[0] = sysctl_tcp_mem[1] - 512;
2672 if (order < 3) {
2673 sysctl_tcp_wmem[2] = 64 * 1024;
2674 sysctl_tcp_rmem[0] = PAGE_SIZE;
2675 sysctl_tcp_rmem[1] = 43689;
2676 sysctl_tcp_rmem[2] = 2 * 43689;
2677 }
2679 printk(KERN_INFO "TCP: Hash tables configured "
2680 "(established %d bind %d)\n",
2681 tcp_ehash_size << 1, tcp_bhash_size);
2683 tcpdiag_init();
2684 }