ACPI: thinkpad-acpi: fix regression on HKEY LID event handling
[linux-2.6/linux-acpi-2.6/ibm-acpi-2.6.git] / net / core / sock.c
blob190de61cd648db8400ec852e40a652a6897a9e7d
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Generic socket support routines. Memory allocators, socket lock/release
7 * handler for protocols to use and generic option handler.
10 * Version: $Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $
12 * Authors: Ross Biro
13 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Alan Cox, <A.Cox@swansea.ac.uk>
17 * Fixes:
18 * Alan Cox : Numerous verify_area() problems
19 * Alan Cox : Connecting on a connecting socket
20 * now returns an error for tcp.
21 * Alan Cox : sock->protocol is set correctly.
22 * and is not sometimes left as 0.
23 * Alan Cox : connect handles icmp errors on a
24 * connect properly. Unfortunately there
25 * is a restart syscall nasty there. I
26 * can't match BSD without hacking the C
27 * library. Ideas urgently sought!
28 * Alan Cox : Disallow bind() to addresses that are
29 * not ours - especially broadcast ones!!
30 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
31 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
32 * instead they leave that for the DESTROY timer.
33 * Alan Cox : Clean up error flag in accept
34 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
35 * was buggy. Put a remove_sock() in the handler
36 * for memory when we hit 0. Also altered the timer
37 * code. The ACK stuff can wait and needs major
38 * TCP layer surgery.
39 * Alan Cox : Fixed TCP ack bug, removed remove sock
40 * and fixed timer/inet_bh race.
41 * Alan Cox : Added zapped flag for TCP
42 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
43 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
44 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
45 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
46 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
47 * Rick Sladkey : Relaxed UDP rules for matching packets.
48 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
49 * Pauline Middelink : identd support
50 * Alan Cox : Fixed connect() taking signals I think.
51 * Alan Cox : SO_LINGER supported
52 * Alan Cox : Error reporting fixes
53 * Anonymous : inet_create tidied up (sk->reuse setting)
54 * Alan Cox : inet sockets don't set sk->type!
55 * Alan Cox : Split socket option code
56 * Alan Cox : Callbacks
57 * Alan Cox : Nagle flag for Charles & Johannes stuff
58 * Alex : Removed restriction on inet fioctl
59 * Alan Cox : Splitting INET from NET core
60 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
61 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
62 * Alan Cox : Split IP from generic code
63 * Alan Cox : New kfree_skbmem()
64 * Alan Cox : Make SO_DEBUG superuser only.
65 * Alan Cox : Allow anyone to clear SO_DEBUG
66 * (compatibility fix)
67 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
68 * Alan Cox : Allocator for a socket is settable.
69 * Alan Cox : SO_ERROR includes soft errors.
70 * Alan Cox : Allow NULL arguments on some SO_ opts
71 * Alan Cox : Generic socket allocation to make hooks
72 * easier (suggested by Craig Metz).
73 * Michael Pall : SO_ERROR returns positive errno again
74 * Steve Whitehouse: Added default destructor to free
75 * protocol private data.
76 * Steve Whitehouse: Added various other default routines
77 * common to several socket families.
78 * Chris Evans : Call suser() check last on F_SETOWN
79 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
80 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
81 * Andi Kleen : Fix write_space callback
82 * Chris Evans : Security fixes - signedness again
83 * Arnaldo C. Melo : cleanups, use skb_queue_purge
85 * To Fix:
88 * This program is free software; you can redistribute it and/or
89 * modify it under the terms of the GNU General Public License
90 * as published by the Free Software Foundation; either version
91 * 2 of the License, or (at your option) any later version.
94 #include <linux/capability.h>
95 #include <linux/errno.h>
96 #include <linux/types.h>
97 #include <linux/socket.h>
98 #include <linux/in.h>
99 #include <linux/kernel.h>
100 #include <linux/module.h>
101 #include <linux/proc_fs.h>
102 #include <linux/seq_file.h>
103 #include <linux/sched.h>
104 #include <linux/timer.h>
105 #include <linux/string.h>
106 #include <linux/sockios.h>
107 #include <linux/net.h>
108 #include <linux/mm.h>
109 #include <linux/slab.h>
110 #include <linux/interrupt.h>
111 #include <linux/poll.h>
112 #include <linux/tcp.h>
113 #include <linux/init.h>
114 #include <linux/highmem.h>
116 #include <asm/uaccess.h>
117 #include <asm/system.h>
119 #include <linux/netdevice.h>
120 #include <net/protocol.h>
121 #include <linux/skbuff.h>
122 #include <net/request_sock.h>
123 #include <net/sock.h>
124 #include <net/xfrm.h>
125 #include <linux/ipsec.h>
127 #include <linux/filter.h>
129 #ifdef CONFIG_INET
130 #include <net/tcp.h>
131 #endif
134 * Each address family might have different locking rules, so we have
135 * one slock key per address family:
137 static struct lock_class_key af_family_keys[AF_MAX];
138 static struct lock_class_key af_family_slock_keys[AF_MAX];
140 #ifdef CONFIG_DEBUG_LOCK_ALLOC
142 * Make lock validator output more readable. (we pre-construct these
143 * strings build-time, so that runtime initialization of socket
144 * locks is fast):
/*
 * Human-readable lock-class names, one table per lock type, indexed by
 * address family number.  Slots for family numbers without a name in
 * this file carry the bare number instead ("-21", "-27", ...).  The
 * entry at index 32 is spelled "AF_IUCV" in all three tables so the
 * names stay consistent across lock types.
 */
static const char *af_family_key_strings[AF_MAX+1] = {
	"sk_lock-AF_UNSPEC",   "sk_lock-AF_UNIX",      "sk_lock-AF_INET",
	"sk_lock-AF_AX25",     "sk_lock-AF_IPX",       "sk_lock-AF_APPLETALK",
	"sk_lock-AF_NETROM",   "sk_lock-AF_BRIDGE",    "sk_lock-AF_ATMPVC",
	"sk_lock-AF_X25",      "sk_lock-AF_INET6",     "sk_lock-AF_ROSE",
	"sk_lock-AF_DECnet",   "sk_lock-AF_NETBEUI",   "sk_lock-AF_SECURITY",
	"sk_lock-AF_KEY",      "sk_lock-AF_NETLINK",   "sk_lock-AF_PACKET",
	"sk_lock-AF_ASH",      "sk_lock-AF_ECONET",    "sk_lock-AF_ATMSVC",
	"sk_lock-21",          "sk_lock-AF_SNA",       "sk_lock-AF_IRDA",
	"sk_lock-AF_PPPOX",    "sk_lock-AF_WANPIPE",   "sk_lock-AF_LLC",
	"sk_lock-27",          "sk_lock-28",           "sk_lock-29",
	"sk_lock-AF_TIPC",     "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV",
	"sk_lock-AF_RXRPC",    "sk_lock-AF_MAX"
};
static const char *af_family_slock_key_strings[AF_MAX+1] = {
	"slock-AF_UNSPEC",     "slock-AF_UNIX",        "slock-AF_INET",
	"slock-AF_AX25",       "slock-AF_IPX",         "slock-AF_APPLETALK",
	"slock-AF_NETROM",     "slock-AF_BRIDGE",      "slock-AF_ATMPVC",
	"slock-AF_X25",        "slock-AF_INET6",       "slock-AF_ROSE",
	"slock-AF_DECnet",     "slock-AF_NETBEUI",     "slock-AF_SECURITY",
	"slock-AF_KEY",        "slock-AF_NETLINK",     "slock-AF_PACKET",
	"slock-AF_ASH",        "slock-AF_ECONET",      "slock-AF_ATMSVC",
	"slock-21",            "slock-AF_SNA",         "slock-AF_IRDA",
	"slock-AF_PPPOX",      "slock-AF_WANPIPE",     "slock-AF_LLC",
	"slock-27",            "slock-28",             "slock-29",
	"slock-AF_TIPC",       "slock-AF_BLUETOOTH",   "slock-AF_IUCV",
	"slock-AF_RXRPC",      "slock-AF_MAX"
};
static const char *af_family_clock_key_strings[AF_MAX+1] = {
	"clock-AF_UNSPEC",     "clock-AF_UNIX",        "clock-AF_INET",
	"clock-AF_AX25",       "clock-AF_IPX",         "clock-AF_APPLETALK",
	"clock-AF_NETROM",     "clock-AF_BRIDGE",      "clock-AF_ATMPVC",
	"clock-AF_X25",        "clock-AF_INET6",       "clock-AF_ROSE",
	"clock-AF_DECnet",     "clock-AF_NETBEUI",     "clock-AF_SECURITY",
	"clock-AF_KEY",        "clock-AF_NETLINK",     "clock-AF_PACKET",
	"clock-AF_ASH",        "clock-AF_ECONET",      "clock-AF_ATMSVC",
	"clock-21",            "clock-AF_SNA",         "clock-AF_IRDA",
	"clock-AF_PPPOX",      "clock-AF_WANPIPE",     "clock-AF_LLC",
	"clock-27",            "clock-28",             "clock-29",
	"clock-AF_TIPC",       "clock-AF_BLUETOOTH",   "clock-AF_IUCV",
	"clock-AF_RXRPC",      "clock-AF_MAX"
};
188 #endif
191 * sk_callback_lock locking rules are per-address-family,
192 * so split the lock classes by using a per-AF key:
194 static struct lock_class_key af_callback_keys[AF_MAX];
196 /* Take into consideration the size of the struct sk_buff overhead in the
197 * determination of these values, since that is non-constant across
198 * platforms. This makes socket queueing behavior and performance
199 * not depend upon such differences.
201 #define _SK_MEM_PACKETS 256
202 #define _SK_MEM_OVERHEAD (sizeof(struct sk_buff) + 256)
203 #define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
204 #define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
206 /* Run time adjustable parameters. */
207 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
208 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
209 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
210 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
212 /* Maximal space eaten by iovec or ancillary data plus some space */
213 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
215 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
217 struct timeval tv;
219 if (optlen < sizeof(tv))
220 return -EINVAL;
221 if (copy_from_user(&tv, optval, sizeof(tv)))
222 return -EFAULT;
223 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
224 return -EDOM;
226 if (tv.tv_sec < 0) {
227 static int warned __read_mostly;
229 *timeo_p = 0;
230 if (warned < 10 && net_ratelimit())
231 warned++;
232 printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
233 "tries to set negative timeout\n",
234 current->comm, current->pid);
235 return 0;
237 *timeo_p = MAX_SCHEDULE_TIMEOUT;
238 if (tv.tv_sec == 0 && tv.tv_usec == 0)
239 return 0;
240 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
241 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
242 return 0;
245 static void sock_warn_obsolete_bsdism(const char *name)
247 static int warned;
248 static char warncomm[TASK_COMM_LEN];
249 if (strcmp(warncomm, current->comm) && warned < 5) {
250 strcpy(warncomm, current->comm);
251 printk(KERN_WARNING "process `%s' is using obsolete "
252 "%s SO_BSDCOMPAT\n", warncomm, name);
253 warned++;
257 static void sock_disable_timestamp(struct sock *sk)
259 if (sock_flag(sk, SOCK_TIMESTAMP)) {
260 sock_reset_flag(sk, SOCK_TIMESTAMP);
261 net_disable_timestamp();
266 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
268 int err = 0;
269 int skb_len;
271 /* Cast skb->rcvbuf to unsigned... It's pointless, but reduces
272 number of warnings when compiling with -W --ANK
274 if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
275 (unsigned)sk->sk_rcvbuf) {
276 err = -ENOMEM;
277 goto out;
280 err = sk_filter(sk, skb);
281 if (err)
282 goto out;
284 skb->dev = NULL;
285 skb_set_owner_r(skb, sk);
287 /* Cache the SKB length before we tack it onto the receive
288 * queue. Once it is added it no longer belongs to us and
289 * may be freed by other threads of control pulling packets
290 * from the queue.
292 skb_len = skb->len;
294 skb_queue_tail(&sk->sk_receive_queue, skb);
296 if (!sock_flag(sk, SOCK_DEAD))
297 sk->sk_data_ready(sk, skb_len);
298 out:
299 return err;
301 EXPORT_SYMBOL(sock_queue_rcv_skb);
303 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
305 int rc = NET_RX_SUCCESS;
307 if (sk_filter(sk, skb))
308 goto discard_and_relse;
310 skb->dev = NULL;
312 if (nested)
313 bh_lock_sock_nested(sk);
314 else
315 bh_lock_sock(sk);
316 if (!sock_owned_by_user(sk)) {
318 * trylock + unlock semantics:
320 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
322 rc = sk->sk_backlog_rcv(sk, skb);
324 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
325 } else
326 sk_add_backlog(sk, skb);
327 bh_unlock_sock(sk);
328 out:
329 sock_put(sk);
330 return rc;
331 discard_and_relse:
332 kfree_skb(skb);
333 goto out;
335 EXPORT_SYMBOL(sk_receive_skb);
337 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
339 struct dst_entry *dst = sk->sk_dst_cache;
341 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
342 sk->sk_dst_cache = NULL;
343 dst_release(dst);
344 return NULL;
347 return dst;
349 EXPORT_SYMBOL(__sk_dst_check);
351 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
353 struct dst_entry *dst = sk_dst_get(sk);
355 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
356 sk_dst_reset(sk);
357 dst_release(dst);
358 return NULL;
361 return dst;
363 EXPORT_SYMBOL(sk_dst_check);
365 static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
367 int ret = -ENOPROTOOPT;
368 #ifdef CONFIG_NETDEVICES
369 char devname[IFNAMSIZ];
370 int index;
372 /* Sorry... */
373 ret = -EPERM;
374 if (!capable(CAP_NET_RAW))
375 goto out;
377 ret = -EINVAL;
378 if (optlen < 0)
379 goto out;
381 /* Bind this socket to a particular device like "eth0",
382 * as specified in the passed interface name. If the
383 * name is "" or the option length is zero the socket
384 * is not bound.
386 if (optlen > IFNAMSIZ - 1)
387 optlen = IFNAMSIZ - 1;
388 memset(devname, 0, sizeof(devname));
390 ret = -EFAULT;
391 if (copy_from_user(devname, optval, optlen))
392 goto out;
394 if (devname[0] == '\0') {
395 index = 0;
396 } else {
397 struct net_device *dev = dev_get_by_name(devname);
399 ret = -ENODEV;
400 if (!dev)
401 goto out;
403 index = dev->ifindex;
404 dev_put(dev);
407 lock_sock(sk);
408 sk->sk_bound_dev_if = index;
409 sk_dst_reset(sk);
410 release_sock(sk);
412 ret = 0;
414 out:
415 #endif
417 return ret;
421 * This is meant for all protocols to use and covers goings on
422 * at the socket level. Everything here is generic.
425 int sock_setsockopt(struct socket *sock, int level, int optname,
426 char __user *optval, int optlen)
428 struct sock *sk=sock->sk;
429 struct sk_filter *filter;
430 int val;
431 int valbool;
432 struct linger ling;
433 int ret = 0;
436 * Options without arguments
439 #ifdef SO_DONTLINGER /* Compatibility item... */
440 if (optname == SO_DONTLINGER) {
441 lock_sock(sk);
442 sock_reset_flag(sk, SOCK_LINGER);
443 release_sock(sk);
444 return 0;
446 #endif
448 if (optname == SO_BINDTODEVICE)
449 return sock_bindtodevice(sk, optval, optlen);
451 if (optlen < sizeof(int))
452 return -EINVAL;
454 if (get_user(val, (int __user *)optval))
455 return -EFAULT;
457 valbool = val?1:0;
459 lock_sock(sk);
461 switch(optname) {
462 case SO_DEBUG:
463 if (val && !capable(CAP_NET_ADMIN)) {
464 ret = -EACCES;
466 else if (valbool)
467 sock_set_flag(sk, SOCK_DBG);
468 else
469 sock_reset_flag(sk, SOCK_DBG);
470 break;
471 case SO_REUSEADDR:
472 sk->sk_reuse = valbool;
473 break;
474 case SO_TYPE:
475 case SO_ERROR:
476 ret = -ENOPROTOOPT;
477 break;
478 case SO_DONTROUTE:
479 if (valbool)
480 sock_set_flag(sk, SOCK_LOCALROUTE);
481 else
482 sock_reset_flag(sk, SOCK_LOCALROUTE);
483 break;
484 case SO_BROADCAST:
485 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
486 break;
487 case SO_SNDBUF:
488 /* Don't error on this BSD doesn't and if you think
489 about it this is right. Otherwise apps have to
490 play 'guess the biggest size' games. RCVBUF/SNDBUF
491 are treated in BSD as hints */
493 if (val > sysctl_wmem_max)
494 val = sysctl_wmem_max;
495 set_sndbuf:
496 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
497 if ((val * 2) < SOCK_MIN_SNDBUF)
498 sk->sk_sndbuf = SOCK_MIN_SNDBUF;
499 else
500 sk->sk_sndbuf = val * 2;
503 * Wake up sending tasks if we
504 * upped the value.
506 sk->sk_write_space(sk);
507 break;
509 case SO_SNDBUFFORCE:
510 if (!capable(CAP_NET_ADMIN)) {
511 ret = -EPERM;
512 break;
514 goto set_sndbuf;
516 case SO_RCVBUF:
517 /* Don't error on this BSD doesn't and if you think
518 about it this is right. Otherwise apps have to
519 play 'guess the biggest size' games. RCVBUF/SNDBUF
520 are treated in BSD as hints */
522 if (val > sysctl_rmem_max)
523 val = sysctl_rmem_max;
524 set_rcvbuf:
525 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
527 * We double it on the way in to account for
528 * "struct sk_buff" etc. overhead. Applications
529 * assume that the SO_RCVBUF setting they make will
530 * allow that much actual data to be received on that
531 * socket.
533 * Applications are unaware that "struct sk_buff" and
534 * other overheads allocate from the receive buffer
535 * during socket buffer allocation.
537 * And after considering the possible alternatives,
538 * returning the value we actually used in getsockopt
539 * is the most desirable behavior.
541 if ((val * 2) < SOCK_MIN_RCVBUF)
542 sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
543 else
544 sk->sk_rcvbuf = val * 2;
545 break;
547 case SO_RCVBUFFORCE:
548 if (!capable(CAP_NET_ADMIN)) {
549 ret = -EPERM;
550 break;
552 goto set_rcvbuf;
554 case SO_KEEPALIVE:
555 #ifdef CONFIG_INET
556 if (sk->sk_protocol == IPPROTO_TCP)
557 tcp_set_keepalive(sk, valbool);
558 #endif
559 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
560 break;
562 case SO_OOBINLINE:
563 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
564 break;
566 case SO_NO_CHECK:
567 sk->sk_no_check = valbool;
568 break;
570 case SO_PRIORITY:
571 if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
572 sk->sk_priority = val;
573 else
574 ret = -EPERM;
575 break;
577 case SO_LINGER:
578 if (optlen < sizeof(ling)) {
579 ret = -EINVAL; /* 1003.1g */
580 break;
582 if (copy_from_user(&ling,optval,sizeof(ling))) {
583 ret = -EFAULT;
584 break;
586 if (!ling.l_onoff)
587 sock_reset_flag(sk, SOCK_LINGER);
588 else {
589 #if (BITS_PER_LONG == 32)
590 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
591 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
592 else
593 #endif
594 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
595 sock_set_flag(sk, SOCK_LINGER);
597 break;
599 case SO_BSDCOMPAT:
600 sock_warn_obsolete_bsdism("setsockopt");
601 break;
603 case SO_PASSCRED:
604 if (valbool)
605 set_bit(SOCK_PASSCRED, &sock->flags);
606 else
607 clear_bit(SOCK_PASSCRED, &sock->flags);
608 break;
610 case SO_TIMESTAMP:
611 case SO_TIMESTAMPNS:
612 if (valbool) {
613 if (optname == SO_TIMESTAMP)
614 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
615 else
616 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
617 sock_set_flag(sk, SOCK_RCVTSTAMP);
618 sock_enable_timestamp(sk);
619 } else {
620 sock_reset_flag(sk, SOCK_RCVTSTAMP);
621 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
623 break;
625 case SO_RCVLOWAT:
626 if (val < 0)
627 val = INT_MAX;
628 sk->sk_rcvlowat = val ? : 1;
629 break;
631 case SO_RCVTIMEO:
632 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
633 break;
635 case SO_SNDTIMEO:
636 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
637 break;
639 case SO_ATTACH_FILTER:
640 ret = -EINVAL;
641 if (optlen == sizeof(struct sock_fprog)) {
642 struct sock_fprog fprog;
644 ret = -EFAULT;
645 if (copy_from_user(&fprog, optval, sizeof(fprog)))
646 break;
648 ret = sk_attach_filter(&fprog, sk);
650 break;
652 case SO_DETACH_FILTER:
653 rcu_read_lock_bh();
654 filter = rcu_dereference(sk->sk_filter);
655 if (filter) {
656 rcu_assign_pointer(sk->sk_filter, NULL);
657 sk_filter_release(sk, filter);
658 rcu_read_unlock_bh();
659 break;
661 rcu_read_unlock_bh();
662 ret = -ENONET;
663 break;
665 case SO_PASSSEC:
666 if (valbool)
667 set_bit(SOCK_PASSSEC, &sock->flags);
668 else
669 clear_bit(SOCK_PASSSEC, &sock->flags);
670 break;
672 /* We implement the SO_SNDLOWAT etc to
673 not be settable (1003.1g 5.3) */
674 default:
675 ret = -ENOPROTOOPT;
676 break;
678 release_sock(sk);
679 return ret;
683 int sock_getsockopt(struct socket *sock, int level, int optname,
684 char __user *optval, int __user *optlen)
686 struct sock *sk = sock->sk;
688 union {
689 int val;
690 struct linger ling;
691 struct timeval tm;
692 } v;
694 unsigned int lv = sizeof(int);
695 int len;
697 if (get_user(len, optlen))
698 return -EFAULT;
699 if (len < 0)
700 return -EINVAL;
702 switch(optname) {
703 case SO_DEBUG:
704 v.val = sock_flag(sk, SOCK_DBG);
705 break;
707 case SO_DONTROUTE:
708 v.val = sock_flag(sk, SOCK_LOCALROUTE);
709 break;
711 case SO_BROADCAST:
712 v.val = !!sock_flag(sk, SOCK_BROADCAST);
713 break;
715 case SO_SNDBUF:
716 v.val = sk->sk_sndbuf;
717 break;
719 case SO_RCVBUF:
720 v.val = sk->sk_rcvbuf;
721 break;
723 case SO_REUSEADDR:
724 v.val = sk->sk_reuse;
725 break;
727 case SO_KEEPALIVE:
728 v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
729 break;
731 case SO_TYPE:
732 v.val = sk->sk_type;
733 break;
735 case SO_ERROR:
736 v.val = -sock_error(sk);
737 if (v.val==0)
738 v.val = xchg(&sk->sk_err_soft, 0);
739 break;
741 case SO_OOBINLINE:
742 v.val = !!sock_flag(sk, SOCK_URGINLINE);
743 break;
745 case SO_NO_CHECK:
746 v.val = sk->sk_no_check;
747 break;
749 case SO_PRIORITY:
750 v.val = sk->sk_priority;
751 break;
753 case SO_LINGER:
754 lv = sizeof(v.ling);
755 v.ling.l_onoff = !!sock_flag(sk, SOCK_LINGER);
756 v.ling.l_linger = sk->sk_lingertime / HZ;
757 break;
759 case SO_BSDCOMPAT:
760 sock_warn_obsolete_bsdism("getsockopt");
761 break;
763 case SO_TIMESTAMP:
764 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
765 !sock_flag(sk, SOCK_RCVTSTAMPNS);
766 break;
768 case SO_TIMESTAMPNS:
769 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
770 break;
772 case SO_RCVTIMEO:
773 lv=sizeof(struct timeval);
774 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
775 v.tm.tv_sec = 0;
776 v.tm.tv_usec = 0;
777 } else {
778 v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
779 v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
781 break;
783 case SO_SNDTIMEO:
784 lv=sizeof(struct timeval);
785 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
786 v.tm.tv_sec = 0;
787 v.tm.tv_usec = 0;
788 } else {
789 v.tm.tv_sec = sk->sk_sndtimeo / HZ;
790 v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
792 break;
794 case SO_RCVLOWAT:
795 v.val = sk->sk_rcvlowat;
796 break;
798 case SO_SNDLOWAT:
799 v.val=1;
800 break;
802 case SO_PASSCRED:
803 v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
804 break;
806 case SO_PEERCRED:
807 if (len > sizeof(sk->sk_peercred))
808 len = sizeof(sk->sk_peercred);
809 if (copy_to_user(optval, &sk->sk_peercred, len))
810 return -EFAULT;
811 goto lenout;
813 case SO_PEERNAME:
815 char address[128];
817 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
818 return -ENOTCONN;
819 if (lv < len)
820 return -EINVAL;
821 if (copy_to_user(optval, address, len))
822 return -EFAULT;
823 goto lenout;
826 /* Dubious BSD thing... Probably nobody even uses it, but
827 * the UNIX standard wants it for whatever reason... -DaveM
829 case SO_ACCEPTCONN:
830 v.val = sk->sk_state == TCP_LISTEN;
831 break;
833 case SO_PASSSEC:
834 v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
835 break;
837 case SO_PEERSEC:
838 return security_socket_getpeersec_stream(sock, optval, optlen, len);
840 default:
841 return -ENOPROTOOPT;
844 if (len > lv)
845 len = lv;
846 if (copy_to_user(optval, &v, len))
847 return -EFAULT;
848 lenout:
849 if (put_user(len, optlen))
850 return -EFAULT;
851 return 0;
855 * Initialize an sk_lock.
857 * (We also register the sk_lock with the lock validator.)
859 static inline void sock_lock_init(struct sock *sk)
861 sock_lock_init_class_and_name(sk,
862 af_family_slock_key_strings[sk->sk_family],
863 af_family_slock_keys + sk->sk_family,
864 af_family_key_strings[sk->sk_family],
865 af_family_keys + sk->sk_family);
869 * sk_alloc - All socket objects are allocated here
870 * @family: protocol family
871 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
872 * @prot: struct proto associated with this new sock instance
873 * @zero_it: if we should zero the newly allocated sock
875 struct sock *sk_alloc(int family, gfp_t priority,
876 struct proto *prot, int zero_it)
878 struct sock *sk = NULL;
879 struct kmem_cache *slab = prot->slab;
881 if (slab != NULL)
882 sk = kmem_cache_alloc(slab, priority);
883 else
884 sk = kmalloc(prot->obj_size, priority);
886 if (sk) {
887 if (zero_it) {
888 memset(sk, 0, prot->obj_size);
889 sk->sk_family = family;
891 * See comment in struct sock definition to understand
892 * why we need sk_prot_creator -acme
894 sk->sk_prot = sk->sk_prot_creator = prot;
895 sock_lock_init(sk);
898 if (security_sk_alloc(sk, family, priority))
899 goto out_free;
901 if (!try_module_get(prot->owner))
902 goto out_free;
904 return sk;
906 out_free:
907 if (slab != NULL)
908 kmem_cache_free(slab, sk);
909 else
910 kfree(sk);
911 return NULL;
914 void sk_free(struct sock *sk)
916 struct sk_filter *filter;
917 struct module *owner = sk->sk_prot_creator->owner;
919 if (sk->sk_destruct)
920 sk->sk_destruct(sk);
922 filter = rcu_dereference(sk->sk_filter);
923 if (filter) {
924 sk_filter_release(sk, filter);
925 rcu_assign_pointer(sk->sk_filter, NULL);
928 sock_disable_timestamp(sk);
930 if (atomic_read(&sk->sk_omem_alloc))
931 printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
932 __FUNCTION__, atomic_read(&sk->sk_omem_alloc));
934 security_sk_free(sk);
935 if (sk->sk_prot_creator->slab != NULL)
936 kmem_cache_free(sk->sk_prot_creator->slab, sk);
937 else
938 kfree(sk);
939 module_put(owner);
942 struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
944 struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0);
946 if (newsk != NULL) {
947 struct sk_filter *filter;
949 sock_copy(newsk, sk);
951 /* SANITY */
952 sk_node_init(&newsk->sk_node);
953 sock_lock_init(newsk);
954 bh_lock_sock(newsk);
955 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
957 atomic_set(&newsk->sk_rmem_alloc, 0);
958 atomic_set(&newsk->sk_wmem_alloc, 0);
959 atomic_set(&newsk->sk_omem_alloc, 0);
960 skb_queue_head_init(&newsk->sk_receive_queue);
961 skb_queue_head_init(&newsk->sk_write_queue);
962 #ifdef CONFIG_NET_DMA
963 skb_queue_head_init(&newsk->sk_async_wait_queue);
964 #endif
966 rwlock_init(&newsk->sk_dst_lock);
967 rwlock_init(&newsk->sk_callback_lock);
968 lockdep_set_class_and_name(&newsk->sk_callback_lock,
969 af_callback_keys + newsk->sk_family,
970 af_family_clock_key_strings[newsk->sk_family]);
972 newsk->sk_dst_cache = NULL;
973 newsk->sk_wmem_queued = 0;
974 newsk->sk_forward_alloc = 0;
975 newsk->sk_send_head = NULL;
976 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
978 sock_reset_flag(newsk, SOCK_DONE);
979 skb_queue_head_init(&newsk->sk_error_queue);
981 filter = newsk->sk_filter;
982 if (filter != NULL)
983 sk_filter_charge(newsk, filter);
985 if (unlikely(xfrm_sk_clone_policy(newsk))) {
986 /* It is still raw copy of parent, so invalidate
987 * destructor and make plain sk_free() */
988 newsk->sk_destruct = NULL;
989 sk_free(newsk);
990 newsk = NULL;
991 goto out;
994 newsk->sk_err = 0;
995 newsk->sk_priority = 0;
996 atomic_set(&newsk->sk_refcnt, 2);
999 * Increment the counter in the same struct proto as the master
1000 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1001 * is the same as sk->sk_prot->socks, as this field was copied
1002 * with memcpy).
1004 * This _changes_ the previous behaviour, where
1005 * tcp_create_openreq_child always was incrementing the
1006 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1007 * to be taken into account in all callers. -acme
1009 sk_refcnt_debug_inc(newsk);
1010 newsk->sk_socket = NULL;
1011 newsk->sk_sleep = NULL;
1013 if (newsk->sk_prot->sockets_allocated)
1014 atomic_inc(newsk->sk_prot->sockets_allocated);
1016 out:
1017 return newsk;
1020 EXPORT_SYMBOL_GPL(sk_clone);
1022 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1024 __sk_dst_set(sk, dst);
1025 sk->sk_route_caps = dst->dev->features;
1026 if (sk->sk_route_caps & NETIF_F_GSO)
1027 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1028 if (sk_can_gso(sk)) {
1029 if (dst->header_len)
1030 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1031 else
1032 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1035 EXPORT_SYMBOL_GPL(sk_setup_caps);
1037 void __init sk_init(void)
1039 if (num_physpages <= 4096) {
1040 sysctl_wmem_max = 32767;
1041 sysctl_rmem_max = 32767;
1042 sysctl_wmem_default = 32767;
1043 sysctl_rmem_default = 32767;
1044 } else if (num_physpages >= 131072) {
1045 sysctl_wmem_max = 131071;
1046 sysctl_rmem_max = 131071;
1051 * Simple resource managers for sockets.
1056 * Write buffer destructor automatically called from kfree_skb.
1058 void sock_wfree(struct sk_buff *skb)
1060 struct sock *sk = skb->sk;
1062 /* In case it might be waiting for more memory. */
1063 atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
1064 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
1065 sk->sk_write_space(sk);
1066 sock_put(sk);
1070 * Read buffer destructor automatically called from kfree_skb.
1072 void sock_rfree(struct sk_buff *skb)
1074 struct sock *sk = skb->sk;
1076 atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
1080 int sock_i_uid(struct sock *sk)
1082 int uid;
1084 read_lock(&sk->sk_callback_lock);
1085 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1086 read_unlock(&sk->sk_callback_lock);
1087 return uid;
1090 unsigned long sock_i_ino(struct sock *sk)
1092 unsigned long ino;
1094 read_lock(&sk->sk_callback_lock);
1095 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1096 read_unlock(&sk->sk_callback_lock);
1097 return ino;
1101 * Allocate a skb from the socket's send buffer.
1103 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1104 gfp_t priority)
1106 if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1107 struct sk_buff * skb = alloc_skb(size, priority);
1108 if (skb) {
1109 skb_set_owner_w(skb, sk);
1110 return skb;
1113 return NULL;
1117 * Allocate a skb from the socket's receive buffer.
1119 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1120 gfp_t priority)
1122 if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1123 struct sk_buff *skb = alloc_skb(size, priority);
1124 if (skb) {
1125 skb_set_owner_r(skb, sk);
1126 return skb;
1129 return NULL;
1133 * Allocate a memory block from the socket's option memory buffer.
1135 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1137 if ((unsigned)size <= sysctl_optmem_max &&
1138 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1139 void *mem;
1140 /* First do the add, to avoid the race if kmalloc
1141 * might sleep.
1143 atomic_add(size, &sk->sk_omem_alloc);
1144 mem = kmalloc(size, priority);
1145 if (mem)
1146 return mem;
1147 atomic_sub(size, &sk->sk_omem_alloc);
1149 return NULL;
1153 * Free an option memory block.
1155 void sock_kfree_s(struct sock *sk, void *mem, int size)
1157 kfree(mem);
1158 atomic_sub(size, &sk->sk_omem_alloc);
1161 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1162 I think, these locks should be removed for datagram sockets.
1164 static long sock_wait_for_wmem(struct sock * sk, long timeo)
1166 DEFINE_WAIT(wait);
1168 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1169 for (;;) {
1170 if (!timeo)
1171 break;
1172 if (signal_pending(current))
1173 break;
1174 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1175 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1176 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1177 break;
1178 if (sk->sk_shutdown & SEND_SHUTDOWN)
1179 break;
1180 if (sk->sk_err)
1181 break;
1182 timeo = schedule_timeout(timeo);
1184 finish_wait(sk->sk_sleep, &wait);
1185 return timeo;
1190 * Generic send/receive buffer handlers
1193 static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
1194 unsigned long header_len,
1195 unsigned long data_len,
1196 int noblock, int *errcode)
1198 struct sk_buff *skb;
1199 gfp_t gfp_mask;
1200 long timeo;
1201 int err;
1203 gfp_mask = sk->sk_allocation;
1204 if (gfp_mask & __GFP_WAIT)
1205 gfp_mask |= __GFP_REPEAT;
1207 timeo = sock_sndtimeo(sk, noblock);
1208 while (1) {
1209 err = sock_error(sk);
1210 if (err != 0)
1211 goto failure;
1213 err = -EPIPE;
1214 if (sk->sk_shutdown & SEND_SHUTDOWN)
1215 goto failure;
1217 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1218 skb = alloc_skb(header_len, gfp_mask);
1219 if (skb) {
1220 int npages;
1221 int i;
1223 /* No pages, we're done... */
1224 if (!data_len)
1225 break;
1227 npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1228 skb->truesize += data_len;
1229 skb_shinfo(skb)->nr_frags = npages;
1230 for (i = 0; i < npages; i++) {
1231 struct page *page;
1232 skb_frag_t *frag;
1234 page = alloc_pages(sk->sk_allocation, 0);
1235 if (!page) {
1236 err = -ENOBUFS;
1237 skb_shinfo(skb)->nr_frags = i;
1238 kfree_skb(skb);
1239 goto failure;
1242 frag = &skb_shinfo(skb)->frags[i];
1243 frag->page = page;
1244 frag->page_offset = 0;
1245 frag->size = (data_len >= PAGE_SIZE ?
1246 PAGE_SIZE :
1247 data_len);
1248 data_len -= PAGE_SIZE;
1251 /* Full success... */
1252 break;
1254 err = -ENOBUFS;
1255 goto failure;
1257 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1258 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1259 err = -EAGAIN;
1260 if (!timeo)
1261 goto failure;
1262 if (signal_pending(current))
1263 goto interrupted;
1264 timeo = sock_wait_for_wmem(sk, timeo);
1267 skb_set_owner_w(skb, sk);
1268 return skb;
1270 interrupted:
1271 err = sock_intr_errno(timeo);
1272 failure:
1273 *errcode = err;
1274 return NULL;
/*
 * Allocate a send skb with a linear area of @size bytes and no page
 * fragments; a thin wrapper around sock_alloc_send_pskb() with a
 * data length of zero.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	const unsigned long no_paged_data = 0;

	return sock_alloc_send_pskb(sk, size, no_paged_data, noblock, errcode);
}
1283 static void __lock_sock(struct sock *sk)
1285 DEFINE_WAIT(wait);
1287 for (;;) {
1288 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1289 TASK_UNINTERRUPTIBLE);
1290 spin_unlock_bh(&sk->sk_lock.slock);
1291 schedule();
1292 spin_lock_bh(&sk->sk_lock.slock);
1293 if (!sock_owned_by_user(sk))
1294 break;
1296 finish_wait(&sk->sk_lock.wq, &wait);
1299 static void __release_sock(struct sock *sk)
1301 struct sk_buff *skb = sk->sk_backlog.head;
1303 do {
1304 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1305 bh_unlock_sock(sk);
1307 do {
1308 struct sk_buff *next = skb->next;
1310 skb->next = NULL;
1311 sk->sk_backlog_rcv(sk, skb);
1314 * We are in process context here with softirqs
1315 * disabled, use cond_resched_softirq() to preempt.
1316 * This is safe to do because we've taken the backlog
1317 * queue private:
1319 cond_resched_softirq();
1321 skb = next;
1322 } while (skb != NULL);
1324 bh_lock_sock(sk);
1325 } while ((skb = sk->sk_backlog.head) != NULL);
1329 * sk_wait_data - wait for data to arrive at sk_receive_queue
1330 * @sk: sock to wait on
1331 * @timeo: for how long
1333 * Now socket state including sk->sk_err is changed only under lock,
1334 * hence we may omit checks after joining wait queue.
1335 * We check receive queue before schedule() only as optimization;
1336 * it is very likely that release_sock() added new data.
1338 int sk_wait_data(struct sock *sk, long *timeo)
1340 int rc;
1341 DEFINE_WAIT(wait);
1343 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1344 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1345 rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1346 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1347 finish_wait(sk->sk_sleep, &wait);
1348 return rc;
1351 EXPORT_SYMBOL(sk_wait_data);
/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */
/* Default bind handler for protocols without bind support: always -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
/* Default connect handler for protocols without connect support: always -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
/* Default socketpair handler for protocols without socketpair support: always -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
/* Default accept handler for protocols without accept support: always -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
/* Default getname handler for protocols without getname support: always -EOPNOTSUPP. */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
1387 unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
1389 return 0;
/* Default ioctl handler for protocols without ioctl support: always -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
/* Default listen handler for protocols without listen support: always -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
/* Default shutdown handler for protocols without shutdown support: always -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
1407 int sock_no_setsockopt(struct socket *sock, int level, int optname,
1408 char __user *optval, int optlen)
1410 return -EOPNOTSUPP;
1413 int sock_no_getsockopt(struct socket *sock, int level, int optname,
1414 char __user *optval, int __user *optlen)
1416 return -EOPNOTSUPP;
/* Default sendmsg handler for protocols without sendmsg support: always -EOPNOTSUPP. */
int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
/* Default recvmsg handler for protocols without recvmsg support: always -EOPNOTSUPP. */
int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len, int flags)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
/* Default mmap handler for protocols without mmap support: always -ENODEV. */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	const int no_mmap_method = -ENODEV;

	return no_mmap_method;
}
1437 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1439 ssize_t res;
1440 struct msghdr msg = {.msg_flags = flags};
1441 struct kvec iov;
1442 char *kaddr = kmap(page);
1443 iov.iov_base = kaddr + offset;
1444 iov.iov_len = size;
1445 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1446 kunmap(page);
1447 return res;
/*
 *	Default Socket Callbacks
 */
1454 static void sock_def_wakeup(struct sock *sk)
1456 read_lock(&sk->sk_callback_lock);
1457 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1458 wake_up_interruptible_all(sk->sk_sleep);
1459 read_unlock(&sk->sk_callback_lock);
1462 static void sock_def_error_report(struct sock *sk)
1464 read_lock(&sk->sk_callback_lock);
1465 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1466 wake_up_interruptible(sk->sk_sleep);
1467 sk_wake_async(sk,0,POLL_ERR);
1468 read_unlock(&sk->sk_callback_lock);
1471 static void sock_def_readable(struct sock *sk, int len)
1473 read_lock(&sk->sk_callback_lock);
1474 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1475 wake_up_interruptible(sk->sk_sleep);
1476 sk_wake_async(sk,1,POLL_IN);
1477 read_unlock(&sk->sk_callback_lock);
1480 static void sock_def_write_space(struct sock *sk)
1482 read_lock(&sk->sk_callback_lock);
1484 /* Do not wake up a writer until he can make "significant"
1485 * progress. --DaveM
1487 if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1488 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1489 wake_up_interruptible(sk->sk_sleep);
1491 /* Should agree with poll, otherwise some programs break */
1492 if (sock_writeable(sk))
1493 sk_wake_async(sk, 2, POLL_OUT);
1496 read_unlock(&sk->sk_callback_lock);
1499 static void sock_def_destruct(struct sock *sk)
1501 kfree(sk->sk_protinfo);
1504 void sk_send_sigurg(struct sock *sk)
1506 if (sk->sk_socket && sk->sk_socket->file)
1507 if (send_sigurg(&sk->sk_socket->file->f_owner))
1508 sk_wake_async(sk, 3, POLL_PRI);
/*
 * (Re)arm @timer to fire at @expires.  A reference on @sk is taken
 * with sock_hold() when mod_timer() returns zero.
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (mod_timer(timer, expires))
		return;

	sock_hold(sk);
}

EXPORT_SYMBOL(sk_reset_timer);
/*
 * Cancel @timer if it is pending.  When del_timer() reports that it
 * removed the timer, one reference on @sk is dropped with
 * __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (!timer_pending(timer))
		return;

	if (del_timer(timer))
		__sock_put(sk);
}

EXPORT_SYMBOL(sk_stop_timer);
1528 void sock_init_data(struct socket *sock, struct sock *sk)
1530 skb_queue_head_init(&sk->sk_receive_queue);
1531 skb_queue_head_init(&sk->sk_write_queue);
1532 skb_queue_head_init(&sk->sk_error_queue);
1533 #ifdef CONFIG_NET_DMA
1534 skb_queue_head_init(&sk->sk_async_wait_queue);
1535 #endif
1537 sk->sk_send_head = NULL;
1539 init_timer(&sk->sk_timer);
1541 sk->sk_allocation = GFP_KERNEL;
1542 sk->sk_rcvbuf = sysctl_rmem_default;
1543 sk->sk_sndbuf = sysctl_wmem_default;
1544 sk->sk_state = TCP_CLOSE;
1545 sk->sk_socket = sock;
1547 sock_set_flag(sk, SOCK_ZAPPED);
1549 if (sock) {
1550 sk->sk_type = sock->type;
1551 sk->sk_sleep = &sock->wait;
1552 sock->sk = sk;
1553 } else
1554 sk->sk_sleep = NULL;
1556 rwlock_init(&sk->sk_dst_lock);
1557 rwlock_init(&sk->sk_callback_lock);
1558 lockdep_set_class_and_name(&sk->sk_callback_lock,
1559 af_callback_keys + sk->sk_family,
1560 af_family_clock_key_strings[sk->sk_family]);
1562 sk->sk_state_change = sock_def_wakeup;
1563 sk->sk_data_ready = sock_def_readable;
1564 sk->sk_write_space = sock_def_write_space;
1565 sk->sk_error_report = sock_def_error_report;
1566 sk->sk_destruct = sock_def_destruct;
1568 sk->sk_sndmsg_page = NULL;
1569 sk->sk_sndmsg_off = 0;
1571 sk->sk_peercred.pid = 0;
1572 sk->sk_peercred.uid = -1;
1573 sk->sk_peercred.gid = -1;
1574 sk->sk_write_pending = 0;
1575 sk->sk_rcvlowat = 1;
1576 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
1577 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
1579 sk->sk_stamp = ktime_set(-1L, -1L);
1581 atomic_set(&sk->sk_refcnt, 1);
1584 void fastcall lock_sock_nested(struct sock *sk, int subclass)
1586 might_sleep();
1587 spin_lock_bh(&sk->sk_lock.slock);
1588 if (sk->sk_lock.owner)
1589 __lock_sock(sk);
1590 sk->sk_lock.owner = (void *)1;
1591 spin_unlock(&sk->sk_lock.slock);
1593 * The sk_lock has mutex_lock() semantics here:
1595 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
1596 local_bh_enable();
1599 EXPORT_SYMBOL(lock_sock_nested);
1601 void fastcall release_sock(struct sock *sk)
1604 * The sk_lock has mutex_unlock() semantics:
1606 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
1608 spin_lock_bh(&sk->sk_lock.slock);
1609 if (sk->sk_backlog.tail)
1610 __release_sock(sk);
1611 sk->sk_lock.owner = NULL;
1612 if (waitqueue_active(&sk->sk_lock.wq))
1613 wake_up(&sk->sk_lock.wq);
1614 spin_unlock_bh(&sk->sk_lock.slock);
1616 EXPORT_SYMBOL(release_sock);
1618 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
1620 struct timeval tv;
1621 if (!sock_flag(sk, SOCK_TIMESTAMP))
1622 sock_enable_timestamp(sk);
1623 tv = ktime_to_timeval(sk->sk_stamp);
1624 if (tv.tv_sec == -1)
1625 return -ENOENT;
1626 if (tv.tv_sec == 0) {
1627 sk->sk_stamp = ktime_get_real();
1628 tv = ktime_to_timeval(sk->sk_stamp);
1630 return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
1632 EXPORT_SYMBOL(sock_get_timestamp);
1634 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
1636 struct timespec ts;
1637 if (!sock_flag(sk, SOCK_TIMESTAMP))
1638 sock_enable_timestamp(sk);
1639 ts = ktime_to_timespec(sk->sk_stamp);
1640 if (ts.tv_sec == -1)
1641 return -ENOENT;
1642 if (ts.tv_sec == 0) {
1643 sk->sk_stamp = ktime_get_real();
1644 ts = ktime_to_timespec(sk->sk_stamp);
1646 return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
1648 EXPORT_SYMBOL(sock_get_timestampns);
1650 void sock_enable_timestamp(struct sock *sk)
1652 if (!sock_flag(sk, SOCK_TIMESTAMP)) {
1653 sock_set_flag(sk, SOCK_TIMESTAMP);
1654 net_enable_timestamp();
1657 EXPORT_SYMBOL(sock_enable_timestamp);
1660 * Get a socket option on an socket.
1662 * FIX: POSIX 1003.1g is very ambiguous here. It states that
1663 * asynchronous errors should be reported by getsockopt. We assume
1664 * this means if you specify SO_ERROR (otherwise whats the point of it).
1666 int sock_common_getsockopt(struct socket *sock, int level, int optname,
1667 char __user *optval, int __user *optlen)
1669 struct sock *sk = sock->sk;
1671 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1674 EXPORT_SYMBOL(sock_common_getsockopt);
#ifdef CONFIG_COMPAT
/*
 * Compat variant of sock_common_getsockopt(): uses the protocol's
 * compat_getsockopt method when one is provided and falls back to the
 * regular getsockopt method otherwise.
 */
int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_getsockopt == NULL)
		return sk->sk_prot->getsockopt(sk, level, optname,
					       optval, optlen);

	return sk->sk_prot->compat_getsockopt(sk, level, optname,
					      optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_getsockopt);
#endif
1690 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
1691 struct msghdr *msg, size_t size, int flags)
1693 struct sock *sk = sock->sk;
1694 int addr_len = 0;
1695 int err;
1697 err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
1698 flags & ~MSG_DONTWAIT, &addr_len);
1699 if (err >= 0)
1700 msg->msg_namelen = addr_len;
1701 return err;
1704 EXPORT_SYMBOL(sock_common_recvmsg);
1707 * Set socket options on an inet socket.
1709 int sock_common_setsockopt(struct socket *sock, int level, int optname,
1710 char __user *optval, int optlen)
1712 struct sock *sk = sock->sk;
1714 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1717 EXPORT_SYMBOL(sock_common_setsockopt);
#ifdef CONFIG_COMPAT
/*
 * Compat variant of sock_common_setsockopt(): uses the protocol's
 * compat_setsockopt method when one is provided and falls back to the
 * regular setsockopt method otherwise.
 */
int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, int optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_setsockopt == NULL)
		return sk->sk_prot->setsockopt(sk, level, optname,
					       optval, optlen);

	return sk->sk_prot->compat_setsockopt(sk, level, optname,
					      optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_setsockopt);
#endif
1733 void sk_common_release(struct sock *sk)
1735 if (sk->sk_prot->destroy)
1736 sk->sk_prot->destroy(sk);
1739 * Observation: when sock_common_release is called, processes have
1740 * no access to socket. But net still has.
1741 * Step one, detach it from networking:
1743 * A. Remove from hash tables.
1746 sk->sk_prot->unhash(sk);
1749 * In this point socket cannot receive new packets, but it is possible
1750 * that some packets are in flight because some CPU runs receiver and
1751 * did hash table lookup before we unhashed socket. They will achieve
1752 * receive queue and will be purged by socket destructor.
1754 * Also we still have packets pending on receive queue and probably,
1755 * our own packets waiting in device queues. sock_destroy will drain
1756 * receive queue, but transmitted packets will delay socket destruction
1757 * until the last reference will be released.
1760 sock_orphan(sk);
1762 xfrm_sk_free_policy(sk);
1764 sk_refcnt_debug_release(sk);
1765 sock_put(sk);
1768 EXPORT_SYMBOL(sk_common_release);
/* List of registered protocols and the rwlock that guards it. */
static LIST_HEAD(proto_list);
static DEFINE_RWLOCK(proto_list_lock);
1773 int proto_register(struct proto *prot, int alloc_slab)
1775 char *request_sock_slab_name = NULL;
1776 char *timewait_sock_slab_name;
1777 int rc = -ENOBUFS;
1779 if (alloc_slab) {
1780 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
1781 SLAB_HWCACHE_ALIGN, NULL);
1783 if (prot->slab == NULL) {
1784 printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
1785 prot->name);
1786 goto out;
1789 if (prot->rsk_prot != NULL) {
1790 static const char mask[] = "request_sock_%s";
1792 request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1793 if (request_sock_slab_name == NULL)
1794 goto out_free_sock_slab;
1796 sprintf(request_sock_slab_name, mask, prot->name);
1797 prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
1798 prot->rsk_prot->obj_size, 0,
1799 SLAB_HWCACHE_ALIGN, NULL);
1801 if (prot->rsk_prot->slab == NULL) {
1802 printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
1803 prot->name);
1804 goto out_free_request_sock_slab_name;
1808 if (prot->twsk_prot != NULL) {
1809 static const char mask[] = "tw_sock_%s";
1811 timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1813 if (timewait_sock_slab_name == NULL)
1814 goto out_free_request_sock_slab;
1816 sprintf(timewait_sock_slab_name, mask, prot->name);
1817 prot->twsk_prot->twsk_slab =
1818 kmem_cache_create(timewait_sock_slab_name,
1819 prot->twsk_prot->twsk_obj_size,
1820 0, SLAB_HWCACHE_ALIGN,
1821 NULL);
1822 if (prot->twsk_prot->twsk_slab == NULL)
1823 goto out_free_timewait_sock_slab_name;
1827 write_lock(&proto_list_lock);
1828 list_add(&prot->node, &proto_list);
1829 write_unlock(&proto_list_lock);
1830 rc = 0;
1831 out:
1832 return rc;
1833 out_free_timewait_sock_slab_name:
1834 kfree(timewait_sock_slab_name);
1835 out_free_request_sock_slab:
1836 if (prot->rsk_prot && prot->rsk_prot->slab) {
1837 kmem_cache_destroy(prot->rsk_prot->slab);
1838 prot->rsk_prot->slab = NULL;
1840 out_free_request_sock_slab_name:
1841 kfree(request_sock_slab_name);
1842 out_free_sock_slab:
1843 kmem_cache_destroy(prot->slab);
1844 prot->slab = NULL;
1845 goto out;
1848 EXPORT_SYMBOL(proto_register);
1850 void proto_unregister(struct proto *prot)
1852 write_lock(&proto_list_lock);
1853 list_del(&prot->node);
1854 write_unlock(&proto_list_lock);
1856 if (prot->slab != NULL) {
1857 kmem_cache_destroy(prot->slab);
1858 prot->slab = NULL;
1861 if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
1862 const char *name = kmem_cache_name(prot->rsk_prot->slab);
1864 kmem_cache_destroy(prot->rsk_prot->slab);
1865 kfree(name);
1866 prot->rsk_prot->slab = NULL;
1869 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
1870 const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab);
1872 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
1873 kfree(name);
1874 prot->twsk_prot->twsk_slab = NULL;
1878 EXPORT_SYMBOL(proto_unregister);
1880 #ifdef CONFIG_PROC_FS
1881 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
1883 read_lock(&proto_list_lock);
1884 return seq_list_start_head(&proto_list, *pos);
1887 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1889 return seq_list_next(v, &proto_list, pos);
1892 static void proto_seq_stop(struct seq_file *seq, void *v)
1894 read_unlock(&proto_list_lock);
/*
 * Map a method pointer to the flag character printed for it:
 * 'n' when the pointer is NULL, 'y' otherwise.
 */
static char proto_method_implemented(const void *method)
{
	if (method != NULL)
		return 'y';
	return 'n';
}
1902 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
1904 seq_printf(seq, "%-9s %4u %6d %6d %-3s %6u %-3s %-10s "
1905 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
1906 proto->name,
1907 proto->obj_size,
1908 proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
1909 proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
1910 proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
1911 proto->max_header,
1912 proto->slab == NULL ? "no" : "yes",
1913 module_name(proto->owner),
1914 proto_method_implemented(proto->close),
1915 proto_method_implemented(proto->connect),
1916 proto_method_implemented(proto->disconnect),
1917 proto_method_implemented(proto->accept),
1918 proto_method_implemented(proto->ioctl),
1919 proto_method_implemented(proto->init),
1920 proto_method_implemented(proto->destroy),
1921 proto_method_implemented(proto->shutdown),
1922 proto_method_implemented(proto->setsockopt),
1923 proto_method_implemented(proto->getsockopt),
1924 proto_method_implemented(proto->sendmsg),
1925 proto_method_implemented(proto->recvmsg),
1926 proto_method_implemented(proto->sendpage),
1927 proto_method_implemented(proto->bind),
1928 proto_method_implemented(proto->backlog_rcv),
1929 proto_method_implemented(proto->hash),
1930 proto_method_implemented(proto->unhash),
1931 proto_method_implemented(proto->get_port),
1932 proto_method_implemented(proto->enter_memory_pressure));
1935 static int proto_seq_show(struct seq_file *seq, void *v)
1937 if (v == &proto_list)
1938 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
1939 "protocol",
1940 "size",
1941 "sockets",
1942 "memory",
1943 "press",
1944 "maxhdr",
1945 "slab",
1946 "module",
1947 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
1948 else
1949 proto_seq_printf(seq, list_entry(v, struct proto, node));
1950 return 0;
1953 static const struct seq_operations proto_seq_ops = {
1954 .start = proto_seq_start,
1955 .next = proto_seq_next,
1956 .stop = proto_seq_stop,
1957 .show = proto_seq_show,
1960 static int proto_seq_open(struct inode *inode, struct file *file)
1962 return seq_open(file, &proto_seq_ops);
1965 static const struct file_operations proto_seq_fops = {
1966 .owner = THIS_MODULE,
1967 .open = proto_seq_open,
1968 .read = seq_read,
1969 .llseek = seq_lseek,
1970 .release = seq_release,
1973 static int __init proto_init(void)
1975 /* register /proc/net/protocols */
1976 return proc_net_fops_create("protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
1979 subsys_initcall(proto_init);
1981 #endif /* PROC_FS */
/* Symbols from this file that are exported without a per-function EXPORT_SYMBOL. */
EXPORT_SYMBOL(sk_alloc);
EXPORT_SYMBOL(sk_free);
EXPORT_SYMBOL(sk_send_sigurg);
EXPORT_SYMBOL(sock_alloc_send_skb);
EXPORT_SYMBOL(sock_init_data);
EXPORT_SYMBOL(sock_kfree_s);
EXPORT_SYMBOL(sock_kmalloc);

/* Default "not supported" proto_ops handlers. */
EXPORT_SYMBOL(sock_no_accept);
EXPORT_SYMBOL(sock_no_bind);
EXPORT_SYMBOL(sock_no_connect);
EXPORT_SYMBOL(sock_no_getname);
EXPORT_SYMBOL(sock_no_getsockopt);
EXPORT_SYMBOL(sock_no_ioctl);
EXPORT_SYMBOL(sock_no_listen);
EXPORT_SYMBOL(sock_no_mmap);
EXPORT_SYMBOL(sock_no_poll);
EXPORT_SYMBOL(sock_no_recvmsg);
EXPORT_SYMBOL(sock_no_sendmsg);
EXPORT_SYMBOL(sock_no_sendpage);
EXPORT_SYMBOL(sock_no_setsockopt);
EXPORT_SYMBOL(sock_no_shutdown);
EXPORT_SYMBOL(sock_no_socketpair);

EXPORT_SYMBOL(sock_rfree);
EXPORT_SYMBOL(sock_setsockopt);
EXPORT_SYMBOL(sock_wfree);
EXPORT_SYMBOL(sock_wmalloc);
EXPORT_SYMBOL(sock_i_uid);
EXPORT_SYMBOL(sock_i_ino);
EXPORT_SYMBOL(sysctl_optmem_max);
#ifdef CONFIG_SYSCTL
EXPORT_SYMBOL(sysctl_rmem_max);
EXPORT_SYMBOL(sysctl_wmem_max);
#endif