/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 * Version:	$Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink	:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo :	cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>

#include <asm/uaccess.h>
#include <asm/system.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>

#include <linux/filter.h>

#ifdef CONFIG_INET
#include <net/tcp.h>
#endif
/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family:
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];

#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */
static const char *af_family_key_strings[AF_MAX+1] = {
  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
  "sk_lock-21"       , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
  "sk_lock-AF_RXRPC" , "sk_lock-AF_MAX"
};
static const char *af_family_slock_key_strings[AF_MAX+1] = {
  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
  "slock-21"       , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
  "slock-AF_RXRPC" , "slock-AF_MAX"
};
#endif
/*
 * sk_callback_lock locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];

/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS		256
#define _SK_MEM_OVERHEAD	(sizeof(struct sk_buff) + 256)
#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
	struct timeval tv;

	if (optlen < sizeof(tv))
		return -EINVAL;
	if (copy_from_user(&tv, optval, sizeof(tv)))
		return -EFAULT;
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned = 0;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
			       "tries to set negative timeout\n",
			       current->comm, current->pid);
		}
		return 0;
	}
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
	return 0;
}
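/*
 * Editor's note: a minimal userspace sketch of the path that ends in
 * sock_set_timeout() above: setting a 2.5 second receive timeout with
 * SO_RCVTIMEO and watching recv() fail with EAGAIN once it expires.
 * Illustrative only and not part of this file; the socket details
 * (UDP, no peer) are arbitrary assumptions. Note the kernel rounds
 * tv_usec up to a whole number of jiffies.
 */
#if 0	/* userspace example, excluded from the kernel build */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
	char buf[16];

	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0)
		perror("setsockopt");

	/* No peer ever sends, so this returns -1 with errno == EAGAIN
	 * after roughly 2.5 seconds. */
	if (recv(fd, buf, sizeof(buf), 0) < 0)
		printf("recv: %s\n", strerror(errno));

	close(fd);
	return 0;
}
#endif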
static void sock_warn_obsolete_bsdism(const char *name)
{
	static int warned;
	static char warncomm[TASK_COMM_LEN];
	if (strcmp(warncomm, current->comm) && warned < 5) {
		strcpy(warncomm, current->comm);
		printk(KERN_WARNING "process `%s' is using obsolete "
		       "%s SO_BSDCOMPAT\n", warncomm, name);
		warned++;
	}
}
static void sock_disable_timestamp(struct sock *sk)
{
	if (sock_flag(sk, SOCK_TIMESTAMP)) {
		sock_reset_flag(sk, SOCK_TIMESTAMP);
		net_disable_timestamp();
	}
}
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err = 0;
	int skb_len;

	/* Cast sk->sk_rcvbuf to unsigned... It's pointless, but reduces
	   number of warnings when compiling with -W --ANK
	 */
	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
	    (unsigned)sk->sk_rcvbuf) {
		err = -ENOMEM;
		goto out;
	}

	err = sk_filter(sk, skb);
	if (err)
		goto out;

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* Cache the SKB length before we tack it onto the receive
	 * queue.  Once it is added it no longer belongs to us and
	 * may be freed by other threads of control pulling packets
	 * from the queue.
	 */
	skb_len = skb->len;

	skb_queue_tail(&sk->sk_receive_queue, skb);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk, skb_len);
out:
	return err;
}
EXPORT_SYMBOL(sock_queue_rcv_skb);
int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	skb->dev = NULL;

	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk->sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
	} else
		sk_add_backlog(sk, skb);
	bh_unlock_sock(sk);
out:
	sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(sk_receive_skb);
struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk->sk_dst_cache;

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk->sk_dst_cache = NULL;
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);
static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	char devname[IFNAMSIZ];
	int index;

	/* Sorry... */
	ret = -EPERM;
	if (!capable(CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	ret = -EFAULT;
	if (copy_from_user(devname, optval, optlen))
		goto out;

	if (devname[0] == '\0') {
		index = 0;
	} else {
		struct net_device *dev = dev_get_by_name(devname);

		ret = -ENODEV;
		if (!dev)
			goto out;

		index = dev->ifindex;
		dev_put(dev);
	}

	lock_sock(sk);
	sk->sk_bound_dev_if = index;
	sk_dst_reset(sk);
	release_sock(sk);

	ret = 0;

out:
#endif

	return ret;
}
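/*
 * Editor's note: a hedged userspace sketch of SO_BINDTODEVICE as handled
 * by sock_bindtodevice() above. It requires CAP_NET_RAW (typically root),
 * and the device name "eth0" is an assumption. Per the code above,
 * passing an empty name with length zero unbinds the socket again.
 * Illustrative only, not part of this file.
 */
#if 0	/* userspace example, excluded from the kernel build */
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <net/if.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	char ifname[IFNAMSIZ] = "eth0";		/* hypothetical interface */

	if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
		       ifname, strlen(ifname)) < 0)
		perror("SO_BINDTODEVICE");	/* EPERM without CAP_NET_RAW */

	/* Unbind: an empty name resets sk->sk_bound_dev_if to 0. */
	if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "", 0) < 0)
		perror("unbind");

	close(fd);
	return 0;
}
#endif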
/*
 *	This is meant for all protocols to use and covers goings on
 *	at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int optlen)
{
	struct sock *sk = sock->sk;
	struct sk_filter *filter;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 *	Options without arguments
	 */

#ifdef SO_DONTLINGER		/* Compatibility item... */
	if (optname == SO_DONTLINGER) {
		lock_sock(sk);
		sock_reset_flag(sk, SOCK_LINGER);
		release_sock(sk);
		return 0;
	}
#endif

	if (optname == SO_BINDTODEVICE)
		return sock_bindtodevice(sk, optval, optlen);

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	valbool = val ? 1 : 0;

	lock_sock(sk);

	switch (optname) {
	case SO_DEBUG:
		if (val && !capable(CAP_NET_ADMIN))
			ret = -EACCES;
		else if (valbool)
			sock_set_flag(sk, SOCK_DBG);
		else
			sock_reset_flag(sk, SOCK_DBG);
		break;
	case SO_REUSEADDR:
		sk->sk_reuse = valbool;
		break;
	case SO_TYPE:
	case SO_ERROR:
		ret = -ENOPROTOOPT;
		break;
	case SO_DONTROUTE:
		if (valbool)
			sock_set_flag(sk, SOCK_LOCALROUTE);
		else
			sock_reset_flag(sk, SOCK_LOCALROUTE);
		break;
	case SO_BROADCAST:
		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
		break;
	case SO_SNDBUF:
		/* Don't return an error here; BSD doesn't, and if you
		   think about it this is right. Otherwise apps have to
		   play 'guess the biggest size' games. RCVBUF/SNDBUF
		   are treated in BSD as hints */

		if (val > sysctl_wmem_max)
			val = sysctl_wmem_max;
set_sndbuf:
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
		if ((val * 2) < SOCK_MIN_SNDBUF)
			sk->sk_sndbuf = SOCK_MIN_SNDBUF;
		else
			sk->sk_sndbuf = val * 2;

		/*
		 *	Wake up sending tasks if we
		 *	upped the value.
		 */
		sk->sk_write_space(sk);
		break;

	case SO_SNDBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_sndbuf;

	case SO_RCVBUF:
		/* Don't return an error here; BSD doesn't, and if you
		   think about it this is right. Otherwise apps have to
		   play 'guess the biggest size' games. RCVBUF/SNDBUF
		   are treated in BSD as hints */

		if (val > sysctl_rmem_max)
			val = sysctl_rmem_max;
set_rcvbuf:
		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
		/*
		 * We double it on the way in to account for
		 * "struct sk_buff" etc. overhead.   Applications
		 * assume that the SO_RCVBUF setting they make will
		 * allow that much actual data to be received on that
		 * socket.
		 *
		 * Applications are unaware that "struct sk_buff" and
		 * other overheads allocate from the receive buffer
		 * during socket buffer allocation.
		 *
		 * And after considering the possible alternatives,
		 * returning the value we actually used in getsockopt
		 * is the most desirable behavior.
		 */
		if ((val * 2) < SOCK_MIN_RCVBUF)
			sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
		else
			sk->sk_rcvbuf = val * 2;
		break;

	case SO_RCVBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_rcvbuf;

	case SO_KEEPALIVE:
#ifdef CONFIG_INET
		if (sk->sk_protocol == IPPROTO_TCP)
			tcp_set_keepalive(sk, valbool);
#endif
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check = valbool;
		break;

	case SO_PRIORITY:
		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
			sk->sk_priority = val;
		else
			ret = -EPERM;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_user(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff)
			sock_reset_flag(sk, SOCK_LINGER);
		else {
#if (BITS_PER_LONG == 32)
			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
			else
#endif
				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("setsockopt");
		break;

	case SO_PASSCRED:
		if (valbool)
			set_bit(SOCK_PASSCRED, &sock->flags);
		else
			clear_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_TIMESTAMP:
	case SO_TIMESTAMPNS:
		if (valbool) {
			if (optname == SO_TIMESTAMP)
				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
			else
				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
			sock_set_flag(sk, SOCK_RCVTSTAMP);
			sock_enable_timestamp(sk);
		} else {
			sock_reset_flag(sk, SOCK_RCVTSTAMP);
			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
		}
		break;

	case SO_RCVLOWAT:
		if (val < 0)
			val = INT_MAX;
		sk->sk_rcvlowat = val ? : 1;
		break;

	case SO_RCVTIMEO:
		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
		break;

	case SO_SNDTIMEO:
		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
		break;

	case SO_ATTACH_FILTER:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_attach_filter(&fprog, sk);
		}
		break;

	case SO_DETACH_FILTER:
		rcu_read_lock_bh();
		filter = rcu_dereference(sk->sk_filter);
		if (filter) {
			rcu_assign_pointer(sk->sk_filter, NULL);
			sk_filter_release(sk, filter);
			rcu_read_unlock_bh();
			break;
		}
		rcu_read_unlock_bh();
		ret = -ENONET;
		break;

	case SO_PASSSEC:
		if (valbool)
			set_bit(SOCK_PASSSEC, &sock->flags);
		else
			clear_bit(SOCK_PASSSEC, &sock->flags);
		break;

	/* We implement the SO_SNDLOWAT etc. to not be settable
	 * (1003.1g 5.3) */
	default:
		ret = -ENOPROTOOPT;
		break;
	}
	release_sock(sk);
	return ret;
}
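/*
 * Editor's note: a small userspace demonstration of the buffer-size
 * doubling performed in sock_setsockopt() above: the kernel stores
 * twice the requested SO_RCVBUF value (to cover struct sk_buff
 * overhead), and getsockopt() reports the stored value. Illustrative
 * sketch only; the 64 KB request is an arbitrary assumption and the
 * result may also be clamped by sysctl_rmem_max.
 */
#if 0	/* userspace example, excluded from the kernel build */
#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int req = 65536, got = 0;
	socklen_t len = sizeof(got);

	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &got, &len);

	/* Expect got == 2 * req (131072) unless clamped by rmem_max. */
	printf("requested %d, kernel reports %d\n", req, got);

	close(fd);
	return 0;
}
#endif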
int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	union {
		int val;
		struct linger ling;
		struct timeval tm;
	} v;

	unsigned int lv = sizeof(int);
	int len;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = !!sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = sk->sk_sndbuf;
		break;

	case SO_RCVBUF:
		v.val = sk->sk_rcvbuf;
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_KEEPALIVE:
		v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_ERROR:
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = !!sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check;
		break;

	case SO_PRIORITY:
		v.val = sk->sk_priority;
		break;

	case SO_LINGER:
		lv = sizeof(v.ling);
		v.ling.l_onoff = !!sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger = sk->sk_lingertime / HZ;
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("getsockopt");
		break;

	case SO_TIMESTAMP:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
			!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_RCVTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
		}
		break;

	case SO_SNDTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
		}
		break;

	case SO_RCVLOWAT:
		v.val = sk->sk_rcvlowat;
		break;

	case SO_SNDLOWAT:
		v.val = 1;
		break;

	case SO_PASSCRED:
		v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
		break;

	case SO_PEERCRED:
		if (len > sizeof(sk->sk_peercred))
			len = sizeof(sk->sk_peercred);
		if (copy_to_user(optval, &sk->sk_peercred, len))
			return -EFAULT;
		goto lenout;

	case SO_PEERNAME:
	{
		char address[128];

		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
			return -ENOTCONN;
		if (lv < len)
			return -EINVAL;
		if (copy_to_user(optval, address, len))
			return -EFAULT;
		goto lenout;
	}

	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason... -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = sk->sk_state == TCP_LISTEN;
		break;

	case SO_PASSSEC:
		v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
		break;

	case SO_PEERSEC:
		return security_socket_getpeersec_stream(sock, optval, optlen, len);

	default:
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}
/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 */
static inline void sock_lock_init(struct sock *sk)
{
	sock_lock_init_class_and_name(sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
}
/**
 *	sk_alloc - All socket objects are allocated here
 *	@family: protocol family
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *	@prot: struct proto associated with this new sock instance
 *	@zero_it: if we should zero the newly allocated sock
 */
struct sock *sk_alloc(int family, gfp_t priority,
		      struct proto *prot, int zero_it)
{
	struct sock *sk = NULL;
	struct kmem_cache *slab = prot->slab;

	if (slab != NULL)
		sk = kmem_cache_alloc(slab, priority);
	else
		sk = kmalloc(prot->obj_size, priority);

	if (sk) {
		if (zero_it) {
			memset(sk, 0, prot->obj_size);
			sk->sk_family = family;
			/*
			 * See comment in struct sock definition to understand
			 * why we need sk_prot_creator -acme
			 */
			sk->sk_prot = sk->sk_prot_creator = prot;
			sock_lock_init(sk);
		}

		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free;

		atomic_set(&sk->sk_wmem_alloc, 1);
	}
	return sk;

out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}
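/*
 * Editor's note: a hedged kernel-side sketch of how a protocol family
 * typically pairs sk_alloc() with sock_init_data() (defined later in
 * this file) from its .create handler. The proto instance
 * "example_proto" and the handler name are hypothetical; real callers
 * (e.g. inet_create()) do considerably more setup.
 */
#if 0	/* illustrative sketch, not part of this file */
static struct proto example_proto;	/* hypothetical, registered elsewhere */

static int example_create(struct socket *sock, int protocol)
{
	struct sock *sk;

	/* Allocate and zero a sock backed by example_proto's slab. */
	sk = sk_alloc(PF_INET, GFP_KERNEL, &example_proto, 1);
	if (sk == NULL)
		return -ENOBUFS;

	/* Wire up queues, default callbacks, and sock<->sk pointers. */
	sock_init_data(sock, sk);
	return 0;
}
#endif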
static void __sk_free(struct sock *sk)
{
	struct sk_filter *filter;
	struct module *owner = sk->sk_prot_creator->owner;

	if (sk->sk_destruct)
		sk->sk_destruct(sk);

	filter = rcu_dereference(sk->sk_filter);
	if (filter) {
		sk_filter_release(sk, filter);
		rcu_assign_pointer(sk->sk_filter, NULL);
	}

	sock_disable_timestamp(sk);

	if (atomic_read(&sk->sk_omem_alloc))
		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
		       __FUNCTION__, atomic_read(&sk->sk_omem_alloc));

	security_sk_free(sk);
	if (sk->sk_prot_creator->slab != NULL)
		kmem_cache_free(sk->sk_prot_creator->slab, sk);
	else
		kfree(sk);
	module_put(owner);
}

void sk_free(struct sock *sk)
{
	/*
	 * We subtract one from sk_wmem_alloc so we can tell whether
	 * some packets are still in a tx queue. If the result is not
	 * zero, sock_wfree() will call __sk_free(sk) later.
	 */
	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
		__sk_free(sk);
}
struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
{
	struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0);

	if (newsk != NULL) {
		struct sk_filter *filter;

		sock_copy(newsk, sk);

		/* SANITY */
		sk_node_init(&newsk->sk_node);
		sock_lock_init(newsk);
		bh_lock_sock(newsk);
		newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;

		atomic_set(&newsk->sk_rmem_alloc, 0);
		/*
		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
		 */
		atomic_set(&newsk->sk_wmem_alloc, 1);
		atomic_set(&newsk->sk_omem_alloc, 0);
		skb_queue_head_init(&newsk->sk_receive_queue);
		skb_queue_head_init(&newsk->sk_write_queue);
#ifdef CONFIG_NET_DMA
		skb_queue_head_init(&newsk->sk_async_wait_queue);
#endif

		rwlock_init(&newsk->sk_dst_lock);
		rwlock_init(&newsk->sk_callback_lock);
		lockdep_set_class(&newsk->sk_callback_lock,
				  af_callback_keys + newsk->sk_family);

		newsk->sk_dst_cache = NULL;
		newsk->sk_wmem_queued = 0;
		newsk->sk_forward_alloc = 0;
		newsk->sk_send_head = NULL;
		newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;

		sock_reset_flag(newsk, SOCK_DONE);
		skb_queue_head_init(&newsk->sk_error_queue);

		filter = newsk->sk_filter;
		if (filter != NULL)
			sk_filter_charge(newsk, filter);

		if (unlikely(xfrm_sk_clone_policy(newsk))) {
			/* It is still a raw copy of the parent, so invalidate
			 * the destructor and make a plain sk_free() */
			newsk->sk_destruct = NULL;
			sk_free(newsk);
			newsk = NULL;
			goto out;
		}

		newsk->sk_err = 0;
		newsk->sk_priority = 0;
		atomic_set(&newsk->sk_refcnt, 2);

		/*
		 * Increment the counter in the same struct proto as the master
		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
		 * is the same as sk->sk_prot->socks, as this field was copied
		 * with memcpy).
		 *
		 * This _changes_ the previous behaviour, where
		 * tcp_create_openreq_child always was incrementing the
		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
		 * to be taken into account in all callers. -acme
		 */
		sk_refcnt_debug_inc(newsk);
		newsk->sk_socket = NULL;
		newsk->sk_sleep = NULL;

		if (newsk->sk_prot->sockets_allocated)
			atomic_inc(newsk->sk_prot->sockets_allocated);
	}
out:
	return newsk;
}

EXPORT_SYMBOL_GPL(sk_clone);
void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
	__sk_dst_set(sk, dst);
	sk->sk_route_caps = dst->dev->features;
	if (sk->sk_route_caps & NETIF_F_GSO)
		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
	if (sk_can_gso(sk)) {
		if (dst->header_len)
			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
		else
			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
	}
}
EXPORT_SYMBOL_GPL(sk_setup_caps);
void __init sk_init(void)
{
	if (num_physpages <= 4096) {
		sysctl_wmem_max = 32767;
		sysctl_rmem_max = 32767;
		sysctl_wmem_default = 32767;
		sysctl_rmem_default = 32767;
	} else if (num_physpages >= 131072) {
		sysctl_wmem_max = 131071;
		sysctl_rmem_max = 131071;
	}
}
/*
 *	Simple resource managers for sockets.
 */

/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
		/*
		 * Keep a reference on sk_wmem_alloc, this will be released
		 * after the sk_write_space() call
		 */
		atomic_sub(len - 1, &sk->sk_wmem_alloc);
		sk->sk_write_space(sk);
		len = 1;
	}
	/*
	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
	 * could not do because of in-flight packets
	 */
	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
		__sk_free(sk);
}

/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
}
int sock_i_uid(struct sock *sk)
{
	int uid;

	read_lock(&sk->sk_callback_lock);
	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
	read_unlock(&sk->sk_callback_lock);
	return uid;
}

unsigned long sock_i_ino(struct sock *sk)
{
	unsigned long ino;

	read_lock(&sk->sk_callback_lock);
	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
	read_unlock(&sk->sk_callback_lock);
	return ino;
}
/*
 * Allocate a skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_w(skb, sk);
			return skb;
		}
	}
	return NULL;
}

/*
 * Allocate a skb from the socket's receive buffer.
 */
struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_r(skb, sk);
			return skb;
		}
	}
	return NULL;
}
/*
 * Allocate a memory block from the socket's option memory buffer.
 */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
	if ((unsigned)size <= sysctl_optmem_max &&
	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
		void *mem;
		/* First do the add, to avoid the race if kmalloc
		 * might sleep.
		 */
		atomic_add(size, &sk->sk_omem_alloc);
		mem = kmalloc(size, priority);
		if (mem)
			return mem;
		atomic_sub(size, &sk->sk_omem_alloc);
	}
	return NULL;
}

/*
 * Free an option memory block.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	kfree(mem);
	atomic_sub(size, &sk->sk_omem_alloc);
}
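/*
 * Editor's note: a short sketch of the sock_kmalloc()/sock_kfree_s()
 * pairing above, as a protocol might use it for per-socket option
 * state. Allocations are charged to sk->sk_omem_alloc and bounded by
 * sysctl_optmem_max, so the caller must free with the same size it
 * allocated. The option struct and function are hypothetical.
 */
#if 0	/* illustrative sketch, not part of this file */
struct example_opt {		/* hypothetical per-socket option data */
	u32 flags;
	u32 value;
};

static int example_set_opt(struct sock *sk, u32 flags, u32 value)
{
	struct example_opt *opt;

	opt = sock_kmalloc(sk, sizeof(*opt), GFP_KERNEL);
	if (opt == NULL)
		return -ENOBUFS;	/* optmem limit hit or OOM */

	opt->flags = flags;
	opt->value = value;

	/* ... attach opt to the socket; on teardown it must be
	 * released with the matching size: */
	sock_kfree_s(sk, opt, sizeof(*opt));
	return 0;
}
#endif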
/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
   I think these locks should be removed for datagram sockets.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
	DEFINE_WAIT(wait);

	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
	for (;;) {
		if (!timeo)
			break;
		if (signal_pending(current))
			break;
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
			break;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			break;
		if (sk->sk_err)
			break;
		timeo = schedule_timeout(timeo);
	}
	finish_wait(sk->sk_sleep, &wait);
	return timeo;
}
/*
 *	Generic send/receive buffer handlers
 */

static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
					    unsigned long header_len,
					    unsigned long data_len,
					    int noblock, int *errcode)
{
	struct sk_buff *skb;
	gfp_t gfp_mask;
	long timeo;
	int err;

	gfp_mask = sk->sk_allocation;
	if (gfp_mask & __GFP_WAIT)
		gfp_mask |= __GFP_REPEAT;

	timeo = sock_sndtimeo(sk, noblock);
	while (1) {
		err = sock_error(sk);
		if (err != 0)
			goto failure;

		err = -EPIPE;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			goto failure;

		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
			skb = alloc_skb(header_len, gfp_mask);
			if (skb) {
				int npages;
				int i;

				/* No pages, we're done... */
				if (!data_len)
					break;

				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
				skb->truesize += data_len;
				skb_shinfo(skb)->nr_frags = npages;
				for (i = 0; i < npages; i++) {
					struct page *page;
					skb_frag_t *frag;

					page = alloc_pages(sk->sk_allocation, 0);
					if (!page) {
						err = -ENOBUFS;
						skb_shinfo(skb)->nr_frags = i;
						kfree_skb(skb);
						goto failure;
					}

					frag = &skb_shinfo(skb)->frags[i];
					frag->page = page;
					frag->page_offset = 0;
					frag->size = (data_len >= PAGE_SIZE ?
						      PAGE_SIZE :
						      data_len);
					data_len -= PAGE_SIZE;
				}

				/* Full success... */
				break;
			}
			err = -ENOBUFS;
			goto failure;
		}
		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		err = -EAGAIN;
		if (!timeo)
			goto failure;
		if (signal_pending(current))
			goto interrupted;
		timeo = sock_wait_for_wmem(sk, timeo);
	}

	skb_set_owner_w(skb, sk);
	return skb;

interrupted:
	err = sock_intr_errno(timeo);
failure:
	*errcode = err;
	return NULL;
}

struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
}
static void __lock_sock(struct sock *sk)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
					  TASK_UNINTERRUPTIBLE);
		spin_unlock_bh(&sk->sk_lock.slock);
		schedule();
		spin_lock_bh(&sk->sk_lock.slock);
		if (!sock_owned_by_user(sk))
			break;
	}
	finish_wait(&sk->sk_lock.wq, &wait);
}

static void __release_sock(struct sock *sk)
{
	struct sk_buff *skb = sk->sk_backlog.head;

	do {
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
		bh_unlock_sock(sk);

		do {
			struct sk_buff *next = skb->next;

			skb->next = NULL;
			sk->sk_backlog_rcv(sk, skb);

			/*
			 * We are in process context here with softirqs
			 * disabled, use cond_resched_softirq() to preempt.
			 * This is safe to do because we've taken the backlog
			 * queue private:
			 */
			cond_resched_softirq();

			skb = next;
		} while (skb != NULL);

		bh_lock_sock(sk);
	} while ((skb = sk->sk_backlog.head) != NULL);
}
/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 */
int sk_wait_data(struct sock *sk, long *timeo)
{
	int rc;
	DEFINE_WAIT(wait);

	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	finish_wait(sk->sk_sleep, &wait);
	return rc;
}

EXPORT_SYMBOL(sk_wait_data);
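/*
 * Editor's note: a hedged sketch of the canonical sk_wait_data() call
 * pattern, a blocking receive core that sleeps until the receive queue
 * is non-empty or the timeout/signal cuts it short. The surrounding
 * function is hypothetical; real users (e.g. TCP recvmsg) add error
 * and shutdown handling around this core.
 */
#if 0	/* illustrative sketch, not part of this file */
static struct sk_buff *example_wait_for_skb(struct sock *sk, int noblock,
					    int *err)
{
	long timeo = sock_rcvtimeo(sk, noblock);
	struct sk_buff *skb;

	lock_sock(sk);
	while ((skb = skb_dequeue(&sk->sk_receive_queue)) == NULL) {
		if (!timeo) {
			*err = -EAGAIN;
			break;
		}
		/* sk_wait_event() inside sk_wait_data() drops and retakes
		 * the socket lock around the actual sleep. */
		sk_wait_data(sk, &timeo);
		if (signal_pending(current)) {
			*err = sock_intr_errno(timeo);
			break;
		}
	}
	release_sock(sk);
	return skb;
}
#endif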
/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */

int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}

int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}

int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}

int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	return -EOPNOTSUPP;
}

int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	return -EOPNOTSUPP;
}

unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
{
	return 0;
}

int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}

int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}

int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}

int sock_no_setsockopt(struct socket *sock, int level, int optname,
		       char __user *optval, int optlen)
{
	return -EOPNOTSUPP;
}

int sock_no_getsockopt(struct socket *sock, int level, int optname,
		       char __user *optval, int __user *optlen)
{
	return -EOPNOTSUPP;
}

int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len)
{
	return -EOPNOTSUPP;
}

int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len, int flags)
{
	return -EOPNOTSUPP;
}

int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}

ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
	ssize_t res;
	struct msghdr msg = {.msg_flags = flags};
	struct kvec iov;
	char *kaddr = kmap(page);
	iov.iov_base = kaddr + offset;
	iov.iov_len = size;
	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
	kunmap(page);
	return res;
}
/*
 *	Default Socket Callbacks
 */

static void sock_def_wakeup(struct sock *sk)
{
	read_lock(&sk->sk_callback_lock);
	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
		wake_up_interruptible_all(sk->sk_sleep);
	read_unlock(&sk->sk_callback_lock);
}

static void sock_def_error_report(struct sock *sk)
{
	read_lock(&sk->sk_callback_lock);
	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
		wake_up_interruptible(sk->sk_sleep);
	sk_wake_async(sk, 0, POLL_ERR);
	read_unlock(&sk->sk_callback_lock);
}

static void sock_def_readable(struct sock *sk, int len)
{
	read_lock(&sk->sk_callback_lock);
	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
		wake_up_interruptible(sk->sk_sleep);
	sk_wake_async(sk, 1, POLL_IN);
	read_unlock(&sk->sk_callback_lock);
}

static void sock_def_write_space(struct sock *sk)
{
	read_lock(&sk->sk_callback_lock);

	/* Do not wake up a writer until he can make "significant"
	 * progress.  --DaveM
	 */
	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
		if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
			wake_up_interruptible(sk->sk_sleep);

		/* Should agree with poll, otherwise some programs break */
		if (sock_writeable(sk))
			sk_wake_async(sk, 2, POLL_OUT);
	}

	read_unlock(&sk->sk_callback_lock);
}

static void sock_def_destruct(struct sock *sk)
{
	kfree(sk->sk_protinfo);
}

void sk_send_sigurg(struct sock *sk)
{
	if (sk->sk_socket && sk->sk_socket->file)
		if (send_sigurg(&sk->sk_socket->file->f_owner))
			sk_wake_async(sk, 3, POLL_PRI);
}
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (!mod_timer(timer, expires))
		sock_hold(sk);
}

EXPORT_SYMBOL(sk_reset_timer);

void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (timer_pending(timer) && del_timer(timer))
		__sock_put(sk);
}

EXPORT_SYMBOL(sk_stop_timer);
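/*
 * Editor's note: sk_reset_timer()/sk_stop_timer() above keep a socket
 * reference pinned while a timer is pending. A hedged sketch of the
 * usual pattern follows; the handler and helper names are hypothetical,
 * and the callback must drop the reference the timer was armed with.
 */
#if 0	/* illustrative sketch, not part of this file */
static void example_timer_handler(unsigned long data)
{
	struct sock *sk = (struct sock *)data;

	bh_lock_sock(sk);
	/* ... protocol timeout work ... */
	bh_unlock_sock(sk);
	sock_put(sk);	/* balances the sock_hold() in sk_reset_timer() */
}

static void example_arm(struct sock *sk)
{
	/* Assumes sk->sk_timer.function/data were set up after
	 * sock_init_data()'s init_timer() call. Takes a reference
	 * unless the timer was already pending. */
	sk->sk_timer.function = example_timer_handler;
	sk->sk_timer.data = (unsigned long)sk;
	sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ);
}

static void example_disarm(struct sock *sk)
{
	/* Drops the reference if a pending timer was cancelled. */
	sk_stop_timer(sk, &sk->sk_timer);
}
#endif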
void sock_init_data(struct socket *sock, struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);
#ifdef CONFIG_NET_DMA
	skb_queue_head_init(&sk->sk_async_wait_queue);
#endif

	sk->sk_send_head = NULL;

	init_timer(&sk->sk_timer);

	sk->sk_allocation = GFP_KERNEL;
	sk->sk_rcvbuf = sysctl_rmem_default;
	sk->sk_sndbuf = sysctl_wmem_default;
	sk->sk_state = TCP_CLOSE;
	sk->sk_socket = sock;

	sock_set_flag(sk, SOCK_ZAPPED);

	if (sock) {
		sk->sk_type = sock->type;
		sk->sk_sleep = &sock->wait;
		sock->sk = sk;
	} else
		sk->sk_sleep = NULL;

	rwlock_init(&sk->sk_dst_lock);
	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class(&sk->sk_callback_lock,
			  af_callback_keys + sk->sk_family);

	sk->sk_state_change = sock_def_wakeup;
	sk->sk_data_ready = sock_def_readable;
	sk->sk_write_space = sock_def_write_space;
	sk->sk_error_report = sock_def_error_report;
	sk->sk_destruct = sock_def_destruct;

	sk->sk_sndmsg_page = NULL;
	sk->sk_sndmsg_off = 0;

	sk->sk_peercred.pid = 0;
	sk->sk_peercred.uid = -1;
	sk->sk_peercred.gid = -1;
	sk->sk_write_pending = 0;
	sk->sk_rcvlowat = 1;
	sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = ktime_set(-1L, -1L);

	atomic_set(&sk->sk_refcnt, 1);
}
void fastcall lock_sock_nested(struct sock *sk, int subclass)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_lock.owner)
		__lock_sock(sk);
	sk->sk_lock.owner = (void *)1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
	local_bh_enable();
}

EXPORT_SYMBOL(lock_sock_nested);

void fastcall release_sock(struct sock *sk)
{
	/*
	 * The sk_lock has mutex_unlock() semantics:
	 */
	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);

	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_backlog.tail)
		__release_sock(sk);
	sk->sk_lock.owner = NULL;
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);
int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
{
	struct timeval tv;
	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk);
	tv = ktime_to_timeval(sk->sk_stamp);
	if (tv.tv_sec == -1)
		return -ENOENT;
	if (tv.tv_sec == 0) {
		sk->sk_stamp = ktime_get_real();
		tv = ktime_to_timeval(sk->sk_stamp);
	}
	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestamp);
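/*
 * Editor's note: sock_get_timestamp() above backs the SIOCGSTAMP ioctl.
 * A minimal userspace sketch follows: after receiving a datagram, ask
 * when the kernel stamped it. Illustrative only; the binding details
 * are arbitrary assumptions, and per the code above the call returns
 * -ENOENT if no packet has been stamped yet.
 */
#if 0	/* userspace example, excluded from the kernel build */
#include <stdio.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <sys/time.h>
#include <netinet/in.h>
#include <linux/sockios.h>	/* SIOCGSTAMP */
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	char buf[1500];
	struct timeval tv;
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_port = htons(9999),	/* arbitrary port */
		.sin_addr = { htonl(INADDR_ANY) },
	};

	bind(fd, (struct sockaddr *)&addr, sizeof(addr));
	recv(fd, buf, sizeof(buf), 0);		/* wait for one datagram */

	if (ioctl(fd, SIOCGSTAMP, &tv) == 0)
		printf("stamped at %ld.%06ld\n",
		       (long)tv.tv_sec, (long)tv.tv_usec);

	close(fd);
	return 0;
}
#endif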
int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
{
	struct timespec ts;
	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk);
	ts = ktime_to_timespec(sk->sk_stamp);
	if (ts.tv_sec == -1)
		return -ENOENT;
	if (ts.tv_sec == 0) {
		sk->sk_stamp = ktime_get_real();
		ts = ktime_to_timespec(sk->sk_stamp);
	}
	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestampns);

void sock_enable_timestamp(struct sock *sk)
{
	if (!sock_flag(sk, SOCK_TIMESTAMP)) {
		sock_set_flag(sk, SOCK_TIMESTAMP);
		net_enable_timestamp();
	}
}
EXPORT_SYMBOL(sock_enable_timestamp);
/*
 *	Get a socket option on a socket.
 *
 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
 *	asynchronous errors should be reported by getsockopt. We assume
 *	this means if you specify SO_ERROR (otherwise what's the point of it).
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}

EXPORT_SYMBOL(sock_common_getsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_getsockopt != NULL)
		return sk->sk_prot->compat_getsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_getsockopt);
#endif

int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
			struct msghdr *msg, size_t size, int flags)
{
	struct sock *sk = sock->sk;
	int addr_len = 0;
	int err;

	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
				   flags & ~MSG_DONTWAIT, &addr_len);
	if (err >= 0)
		msg->msg_namelen = addr_len;
	return err;
}

EXPORT_SYMBOL(sock_common_recvmsg);

/*
 *	Set socket options on an inet socket.
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}

EXPORT_SYMBOL(sock_common_setsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, int optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_setsockopt != NULL)
		return sk->sk_prot->compat_setsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_setsockopt);
#endif
void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release is called, processes have
	 * no access to the socket, but the net still has.
	 *
	 * Step one, detach it from networking:
	 *
	 * A. Remove from hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * At this point the socket cannot receive new packets, but it is
	 * possible that some packets are in flight because some CPU runs
	 * the receiver and did a hash table lookup before we unhashed the
	 * socket. They will reach the receive queue and be purged by the
	 * socket destructor.
	 *
	 * Also we still have packets pending on the receive queue and
	 * probably our own packets waiting in device queues. sock_destroy
	 * will drain the receive queue, but transmitted packets will delay
	 * socket destruction until the last reference is released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sk_refcnt_debug_release(sk);
	sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);
static DEFINE_RWLOCK(proto_list_lock);
static LIST_HEAD(proto_list);

int proto_register(struct proto *prot, int alloc_slab)
{
	char *request_sock_slab_name = NULL;
	char *timewait_sock_slab_name;
	int rc = -ENOBUFS;

	if (alloc_slab) {
		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
					       SLAB_HWCACHE_ALIGN, NULL, NULL);

		if (prot->slab == NULL) {
			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
			       prot->name);
			goto out;
		}

		if (prot->rsk_prot != NULL) {
			static const char mask[] = "request_sock_%s";

			request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
			if (request_sock_slab_name == NULL)
				goto out_free_sock_slab;

			sprintf(request_sock_slab_name, mask, prot->name);
			prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
								 prot->rsk_prot->obj_size, 0,
								 SLAB_HWCACHE_ALIGN, NULL, NULL);

			if (prot->rsk_prot->slab == NULL) {
				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
				       prot->name);
				goto out_free_request_sock_slab_name;
			}
		}

		if (prot->twsk_prot != NULL) {
			static const char mask[] = "tw_sock_%s";

			timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);

			if (timewait_sock_slab_name == NULL)
				goto out_free_request_sock_slab;

			sprintf(timewait_sock_slab_name, mask, prot->name);
			prot->twsk_prot->twsk_slab =
				kmem_cache_create(timewait_sock_slab_name,
						  prot->twsk_prot->twsk_obj_size,
						  0, SLAB_HWCACHE_ALIGN,
						  NULL, NULL);
			if (prot->twsk_prot->twsk_slab == NULL)
				goto out_free_timewait_sock_slab_name;
		}
	}

	write_lock(&proto_list_lock);
	list_add(&prot->node, &proto_list);
	write_unlock(&proto_list_lock);
	rc = 0;
out:
	return rc;
out_free_timewait_sock_slab_name:
	kfree(timewait_sock_slab_name);
out_free_request_sock_slab:
	if (prot->rsk_prot && prot->rsk_prot->slab) {
		kmem_cache_destroy(prot->rsk_prot->slab);
		prot->rsk_prot->slab = NULL;
	}
out_free_request_sock_slab_name:
	kfree(request_sock_slab_name);
out_free_sock_slab:
	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;
	goto out;
}

EXPORT_SYMBOL(proto_register);
void proto_unregister(struct proto *prot)
{
	write_lock(&proto_list_lock);
	list_del(&prot->node);
	write_unlock(&proto_list_lock);

	if (prot->slab != NULL) {
		kmem_cache_destroy(prot->slab);
		prot->slab = NULL;
	}

	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
		const char *name = kmem_cache_name(prot->rsk_prot->slab);

		kmem_cache_destroy(prot->rsk_prot->slab);
		kfree(name);
		prot->rsk_prot->slab = NULL;
	}

	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
		const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab);

		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
		kfree(name);
		prot->twsk_prot->twsk_slab = NULL;
	}
}

EXPORT_SYMBOL(proto_unregister);
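/*
 * Editor's note: a hedged sketch of proto_register()/proto_unregister()
 * use from a protocol module, matching the API above. The fields shown
 * are the minimum this file touches (name, owner, obj_size); everything
 * named "example_*" is hypothetical, and a real protocol fills in many
 * more ops and typically calls sock_register() as well.
 */
#if 0	/* illustrative sketch, not part of this file */
static struct proto example_proto = {
	.name	  = "EXAMPLE",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct sock),	/* or a protocol-specific sock */
};

static int __init example_init(void)
{
	/* alloc_slab=1: back sk_alloc() with a dedicated kmem cache. */
	return proto_register(&example_proto, 1);
}

static void __exit example_exit(void)
{
	proto_unregister(&example_proto);
}

module_init(example_init);
module_exit(example_exit);
#endif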
#ifdef CONFIG_PROC_FS
static inline struct proto *__proto_head(void)
{
	return list_entry(proto_list.next, struct proto, node);
}

static inline struct proto *proto_head(void)
{
	return list_empty(&proto_list) ? NULL : __proto_head();
}

static inline struct proto *proto_next(struct proto *proto)
{
	return proto->node.next == &proto_list ? NULL :
		list_entry(proto->node.next, struct proto, node);
}

static inline struct proto *proto_get_idx(loff_t pos)
{
	struct proto *proto;
	loff_t i = 0;

	list_for_each_entry(proto, &proto_list, node)
		if (i++ == pos)
			goto out;

	proto = NULL;
out:
	return proto;
}

static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
{
	read_lock(&proto_list_lock);
	return *pos ? proto_get_idx(*pos - 1) : SEQ_START_TOKEN;
}

static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return v == SEQ_START_TOKEN ? proto_head() : proto_next(v);
}

static void proto_seq_stop(struct seq_file *seq, void *v)
{
	read_unlock(&proto_list_lock);
}

static char proto_method_implemented(const void *method)
{
	return method == NULL ? 'n' : 'y';
}

static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
	seq_printf(seq, "%-9s %4u %6d %6d %-3s %6u %-3s %-10s "
			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
		   proto->name,
		   proto->obj_size,
		   proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
		   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
		   proto->max_header,
		   proto->slab == NULL ? "no" : "yes",
		   module_name(proto->owner),
		   proto_method_implemented(proto->close),
		   proto_method_implemented(proto->connect),
		   proto_method_implemented(proto->disconnect),
		   proto_method_implemented(proto->accept),
		   proto_method_implemented(proto->ioctl),
		   proto_method_implemented(proto->init),
		   proto_method_implemented(proto->destroy),
		   proto_method_implemented(proto->shutdown),
		   proto_method_implemented(proto->setsockopt),
		   proto_method_implemented(proto->getsockopt),
		   proto_method_implemented(proto->sendmsg),
		   proto_method_implemented(proto->recvmsg),
		   proto_method_implemented(proto->sendpage),
		   proto_method_implemented(proto->bind),
		   proto_method_implemented(proto->backlog_rcv),
		   proto_method_implemented(proto->hash),
		   proto_method_implemented(proto->unhash),
		   proto_method_implemented(proto->get_port),
		   proto_method_implemented(proto->enter_memory_pressure));
}

static int proto_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
			   "protocol",
			   "size",
			   "sockets",
			   "memory",
			   "press",
			   "maxhdr",
			   "slab",
			   "module",
			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
	else
		proto_seq_printf(seq, v);
	return 0;
}

static const struct seq_operations proto_seq_ops = {
	.start  = proto_seq_start,
	.next   = proto_seq_next,
	.stop   = proto_seq_stop,
	.show   = proto_seq_show,
};

static int proto_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &proto_seq_ops);
}

static const struct file_operations proto_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= proto_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

static int __init proto_init(void)
{
	/* register /proc/net/protocols */
	return proc_net_fops_create("protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
}

subsys_initcall(proto_init);

#endif /* PROC_FS */
EXPORT_SYMBOL(sk_alloc);
EXPORT_SYMBOL(sk_free);
EXPORT_SYMBOL(sk_send_sigurg);
EXPORT_SYMBOL(sock_alloc_send_skb);
EXPORT_SYMBOL(sock_init_data);
EXPORT_SYMBOL(sock_kfree_s);
EXPORT_SYMBOL(sock_kmalloc);
EXPORT_SYMBOL(sock_no_accept);
EXPORT_SYMBOL(sock_no_bind);
EXPORT_SYMBOL(sock_no_connect);
EXPORT_SYMBOL(sock_no_getname);
EXPORT_SYMBOL(sock_no_getsockopt);
EXPORT_SYMBOL(sock_no_ioctl);
EXPORT_SYMBOL(sock_no_listen);
EXPORT_SYMBOL(sock_no_mmap);
EXPORT_SYMBOL(sock_no_poll);
EXPORT_SYMBOL(sock_no_recvmsg);
EXPORT_SYMBOL(sock_no_sendmsg);
EXPORT_SYMBOL(sock_no_sendpage);
EXPORT_SYMBOL(sock_no_setsockopt);
EXPORT_SYMBOL(sock_no_shutdown);
EXPORT_SYMBOL(sock_no_socketpair);
EXPORT_SYMBOL(sock_rfree);
EXPORT_SYMBOL(sock_setsockopt);
EXPORT_SYMBOL(sock_wfree);
EXPORT_SYMBOL(sock_wmalloc);
EXPORT_SYMBOL(sock_i_uid);
EXPORT_SYMBOL(sock_i_ino);
EXPORT_SYMBOL(sysctl_optmem_max);
#ifdef CONFIG_SYSCTL
EXPORT_SYMBOL(sysctl_rmem_max);
EXPORT_SYMBOL(sysctl_wmem_max);
#endif