/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 * Version:	$Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink	:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo :	cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>

#include <asm/uaccess.h>
#include <asm/system.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>

#include <linux/filter.h>

#ifdef CONFIG_INET
#include <net/tcp.h>
#endif
/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family:
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];

#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */
static const char *af_family_key_strings[AF_MAX+1] = {
  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
  "sk_lock-21"       , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-29"          ,
  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
  "sk_lock-AF_RXRPC" , "sk_lock-AF_MAX"
};
static const char *af_family_slock_key_strings[AF_MAX+1] = {
  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
  "slock-21"       , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
  "slock-27"       , "slock-28"          , "slock-29"          ,
  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
  "slock-AF_RXRPC" , "slock-AF_MAX"
};
static const char *af_family_clock_key_strings[AF_MAX+1] = {
  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
  "clock-21"       , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
  "clock-27"       , "clock-28"          , "clock-29"          ,
  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
  "clock-AF_RXRPC" , "clock-AF_MAX"
};
#endif
/*
 * sk_callback_lock locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];

/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS		256
#define _SK_MEM_OVERHEAD	(sizeof(struct sk_buff) + 256)
#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
	struct timeval tv;

	if (optlen < sizeof(tv))
		return -EINVAL;
	if (copy_from_user(&tv, optval, sizeof(tv)))
		return -EFAULT;
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
			       "tries to set negative timeout\n",
			       current->comm, current->pid);
		}
		return 0;
	}
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
	return 0;
}
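/*
 * Usage note (an illustrative sketch, not part of the original file):
 * from user space the timeout converted above is set with a struct
 * timeval, e.g.:
 *
 *	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
 *
 *	// Make blocking recv() give up with EAGAIN after ~5 seconds.
 *	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0)
 *		perror("setsockopt");
 *
 * A zero timeval means "wait forever" (MAX_SCHEDULE_TIMEOUT above),
 * and a tv_usec outside [0, USEC_PER_SEC) is rejected with EDOM.
 */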
static void sock_warn_obsolete_bsdism(const char *name)
{
	static int warned;
	static char warncomm[TASK_COMM_LEN];
	if (strcmp(warncomm, current->comm) && warned < 5) {
		strcpy(warncomm, current->comm);
		printk(KERN_WARNING "process `%s' is using obsolete "
		       "%s SO_BSDCOMPAT\n", warncomm, name);
		warned++;
	}
}

static void sock_disable_timestamp(struct sock *sk)
{
	if (sock_flag(sk, SOCK_TIMESTAMP)) {
		sock_reset_flag(sk, SOCK_TIMESTAMP);
		net_disable_timestamp();
	}
}
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err = 0;
	int skb_len;

	/* Cast sk->sk_rcvbuf to unsigned... It's pointless, but reduces
	   the number of warnings when compiling with -W --ANK
	 */
	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
	    (unsigned)sk->sk_rcvbuf) {
		err = -ENOMEM;
		goto out;
	}

	err = sk_filter(sk, skb);
	if (err)
		goto out;

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* Cache the SKB length before we tack it onto the receive
	 * queue.  Once it is added it no longer belongs to us and
	 * may be freed by other threads of control pulling packets
	 * from the queue.
	 */
	skb_len = skb->len;

	skb_queue_tail(&sk->sk_receive_queue, skb);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk, skb_len);
out:
	return err;
}
EXPORT_SYMBOL(sock_queue_rcv_skb);
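/*
 * Typical caller pattern (an illustrative sketch): a datagram
 * protocol's packet handler looks up the destination sock and hands
 * the skb over, letting sock_queue_rcv_skb() enforce sk_rcvbuf and
 * run the socket filter. On failure the skb is NOT consumed, so the
 * caller frees it. my_proto_rcv/my_lookup are hypothetical names.
 *
 *	static int my_proto_rcv(struct sk_buff *skb)
 *	{
 *		struct sock *sk = my_lookup(skb);	// protocol-specific
 *
 *		if (sk == NULL)
 *			goto drop;
 *		if (sock_queue_rcv_skb(sk, skb) < 0)
 *			goto drop;			// over rcvbuf or filtered
 *		return 0;
 *	drop:
 *		kfree_skb(skb);
 *		return -1;
 *	}
 */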
int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	skb->dev = NULL;

	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk->sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
	} else
		sk_add_backlog(sk, skb);
	bh_unlock_sock(sk);
out:
	sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(sk_receive_skb);
struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk->sk_dst_cache;

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk->sk_dst_cache = NULL;
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);
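/*
 * Usage sketch (illustrative): transmit paths revalidate the cached
 * route before using it, falling back to a fresh lookup when the
 * check fails. The my_route_output() helper is a hypothetical name
 * for a protocol's route lookup.
 *
 *	struct dst_entry *dst = __sk_dst_check(sk, 0);
 *
 *	if (dst == NULL) {
 *		dst = my_route_output(sk);
 *		if (dst == NULL)
 *			return -EHOSTUNREACH;
 *		sk_setup_caps(sk, dst);		// cache it again
 *	}
 */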
/*
 *	This is meant for all protocols to use and covers goings on
 *	at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int optlen)
{
	struct sock *sk = sock->sk;
	struct sk_filter *filter;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 *	Options without arguments
	 */

#ifdef SO_DONTLINGER		/* Compatibility item... */
	if (optname == SO_DONTLINGER) {
		lock_sock(sk);
		sock_reset_flag(sk, SOCK_LINGER);
		release_sock(sk);
		return 0;
	}
#endif

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	valbool = val ? 1 : 0;

	lock_sock(sk);

	switch (optname) {
	case SO_DEBUG:
		if (val && !capable(CAP_NET_ADMIN))
			ret = -EACCES;
		else if (valbool)
			sock_set_flag(sk, SOCK_DBG);
		else
			sock_reset_flag(sk, SOCK_DBG);
		break;
	case SO_REUSEADDR:
		sk->sk_reuse = valbool;
		break;
	case SO_TYPE:
	case SO_ERROR:
		ret = -ENOPROTOOPT;
		break;
	case SO_DONTROUTE:
		if (valbool)
			sock_set_flag(sk, SOCK_LOCALROUTE);
		else
			sock_reset_flag(sk, SOCK_LOCALROUTE);
		break;
	case SO_BROADCAST:
		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
		break;
	case SO_SNDBUF:
		/* Don't return an error on this; BSD doesn't, and if you
		   think about it this is right. Otherwise apps have to
		   play 'guess the biggest size' games. RCVBUF/SNDBUF
		   are treated in BSD as hints */

		if (val > sysctl_wmem_max)
			val = sysctl_wmem_max;
set_sndbuf:
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
		if ((val * 2) < SOCK_MIN_SNDBUF)
			sk->sk_sndbuf = SOCK_MIN_SNDBUF;
		else
			sk->sk_sndbuf = val * 2;

		/*
		 *	Wake up sending tasks if we
		 *	upped the value.
		 */
		sk->sk_write_space(sk);
		break;

	case SO_SNDBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_sndbuf;

	case SO_RCVBUF:
		/* Don't return an error on this; BSD doesn't, and if you
		   think about it this is right. Otherwise apps have to
		   play 'guess the biggest size' games. RCVBUF/SNDBUF
		   are treated in BSD as hints */

		if (val > sysctl_rmem_max)
			val = sysctl_rmem_max;
set_rcvbuf:
		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
		/*
		 * We double it on the way in to account for
		 * "struct sk_buff" etc. overhead.   Applications
		 * assume that the SO_RCVBUF setting they make will
		 * allow that much actual data to be received on that
		 * socket.
		 *
		 * Applications are unaware that "struct sk_buff" and
		 * other overheads allocate from the receive buffer
		 * during socket buffer allocation.
		 *
		 * And after considering the possible alternatives,
		 * returning the value we actually used in getsockopt
		 * is the most desirable behavior.
		 */
		if ((val * 2) < SOCK_MIN_RCVBUF)
			sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
		else
			sk->sk_rcvbuf = val * 2;
		break;

	case SO_RCVBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_rcvbuf;

	case SO_KEEPALIVE:
#ifdef CONFIG_INET
		if (sk->sk_protocol == IPPROTO_TCP)
			tcp_set_keepalive(sk, valbool);
#endif
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check = valbool;
		break;

	case SO_PRIORITY:
		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
			sk->sk_priority = val;
		else
			ret = -EPERM;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_user(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff)
			sock_reset_flag(sk, SOCK_LINGER);
		else {
#if (BITS_PER_LONG == 32)
			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
			else
#endif
				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("setsockopt");
		break;

	case SO_PASSCRED:
		if (valbool)
			set_bit(SOCK_PASSCRED, &sock->flags);
		else
			clear_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_TIMESTAMP:
	case SO_TIMESTAMPNS:
		if (valbool) {
			if (optname == SO_TIMESTAMP)
				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
			else
				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
			sock_set_flag(sk, SOCK_RCVTSTAMP);
			sock_enable_timestamp(sk);
		} else {
			sock_reset_flag(sk, SOCK_RCVTSTAMP);
			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
		}
		break;

	case SO_RCVLOWAT:
		if (val < 0)
			val = INT_MAX;
		sk->sk_rcvlowat = val ? : 1;
		break;

	case SO_RCVTIMEO:
		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
		break;

	case SO_SNDTIMEO:
		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
		break;

#ifdef CONFIG_NETDEVICES
	case SO_BINDTODEVICE:
	{
		char devname[IFNAMSIZ];

		/* Sorry... */
		if (!capable(CAP_NET_RAW)) {
			ret = -EPERM;
			break;
		}

		/* Bind this socket to a particular device like "eth0",
		 * as specified in the passed interface name. If the
		 * name is "" or the option length is zero the socket
		 * is not bound.
		 */

		if (!valbool) {
			sk->sk_bound_dev_if = 0;
		} else {
			if (optlen > IFNAMSIZ - 1)
				optlen = IFNAMSIZ - 1;
			memset(devname, 0, sizeof(devname));
			if (copy_from_user(devname, optval, optlen)) {
				ret = -EFAULT;
				break;
			}

			/* Remove any cached route for this socket. */
			sk_dst_reset(sk);

			if (devname[0] == '\0') {
				sk->sk_bound_dev_if = 0;
			} else {
				struct net_device *dev = dev_get_by_name(devname);
				if (!dev) {
					ret = -ENODEV;
					break;
				}
				sk->sk_bound_dev_if = dev->ifindex;
				dev_put(dev);
			}
		}
		break;
	}
#endif

	case SO_ATTACH_FILTER:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_attach_filter(&fprog, sk);
		}
		break;

	case SO_DETACH_FILTER:
		rcu_read_lock_bh();
		filter = rcu_dereference(sk->sk_filter);
		if (filter) {
			rcu_assign_pointer(sk->sk_filter, NULL);
			sk_filter_release(sk, filter);
			rcu_read_unlock_bh();
			break;
		}
		rcu_read_unlock_bh();
		ret = -ENONET;
		break;

	case SO_PASSSEC:
		if (valbool)
			set_bit(SOCK_PASSSEC, &sock->flags);
		else
			clear_bit(SOCK_PASSSEC, &sock->flags);
		break;

		/* We implement the SO_SNDLOWAT etc to
		   not be settable (1003.1g 5.3) */
	default:
		ret = -ENOPROTOOPT;
		break;
	}
	release_sock(sk);
	return ret;
}
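/*
 * Usage note (illustrative): because SO_RCVBUF/SO_SNDBUF values are
 * doubled on the way in to cover struct sk_buff overhead, getsockopt()
 * reports the value the kernel actually uses:
 *
 *	int val = 65536, out;
 *	socklen_t len = sizeof(out);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &out, &len);
 *	// out is now 131072 (2 * val), assuming val <= rmem_max
 *	// and the doubled value is above SOCK_MIN_RCVBUF.
 */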
int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	union {
		int val;
		struct linger ling;
		struct timeval tm;
	} v;

	unsigned int lv = sizeof(int);
	int len;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = !!sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = sk->sk_sndbuf;
		break;

	case SO_RCVBUF:
		v.val = sk->sk_rcvbuf;
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_KEEPALIVE:
		v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_ERROR:
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = !!sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check;
		break;

	case SO_PRIORITY:
		v.val = sk->sk_priority;
		break;

	case SO_LINGER:
		lv		= sizeof(v.ling);
		v.ling.l_onoff	= !!sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger	= sk->sk_lingertime / HZ;
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("getsockopt");
		break;

	case SO_TIMESTAMP:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
				!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_RCVTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
		}
		break;

	case SO_SNDTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
		}
		break;

	case SO_RCVLOWAT:
		v.val = sk->sk_rcvlowat;
		break;

	case SO_SNDLOWAT:
		v.val = 1;
		break;

	case SO_PASSCRED:
		v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
		break;

	case SO_PEERCRED:
		if (len > sizeof(sk->sk_peercred))
			len = sizeof(sk->sk_peercred);
		if (copy_to_user(optval, &sk->sk_peercred, len))
			return -EFAULT;
		goto lenout;

	case SO_PEERNAME:
	{
		char address[128];

		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
			return -ENOTCONN;
		if (lv < len)
			return -EINVAL;
		if (copy_to_user(optval, address, len))
			return -EFAULT;
		goto lenout;
	}

	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason... -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = sk->sk_state == TCP_LISTEN;
		break;

	case SO_PASSSEC:
		v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
		break;

	case SO_PEERSEC:
		return security_socket_getpeersec_stream(sock, optval, optlen, len);

	default:
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}
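/*
 * Usage note (illustrative): SO_ERROR is how user space collects the
 * deferred error from a non-blocking connect() once poll() reports
 * the socket writable:
 *
 *	int err;
 *	socklen_t len = sizeof(err);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) == 0 && err)
 *		fprintf(stderr, "connect failed: %s\n", strerror(err));
 */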
/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 */
static inline void sock_lock_init(struct sock *sk)
{
	sock_lock_init_class_and_name(sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
}

/**
 *	sk_alloc - All socket objects are allocated here
 *	@family: protocol family
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *	@prot: struct proto associated with this new sock instance
 *	@zero_it: if we should zero the newly allocated sock
 */
struct sock *sk_alloc(int family, gfp_t priority,
		      struct proto *prot, int zero_it)
{
	struct sock *sk = NULL;
	struct kmem_cache *slab = prot->slab;

	if (slab != NULL)
		sk = kmem_cache_alloc(slab, priority);
	else
		sk = kmalloc(prot->obj_size, priority);

	if (sk) {
		if (zero_it) {
			memset(sk, 0, prot->obj_size);
			sk->sk_family = family;
			/*
			 * See comment in struct sock definition to understand
			 * why we need sk_prot_creator -acme
			 */
			sk->sk_prot = sk->sk_prot_creator = prot;
			sock_lock_init(sk);
		}

		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free;
	}
	return sk;

out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}
void sk_free(struct sock *sk)
{
	struct sk_filter *filter;
	struct module *owner = sk->sk_prot_creator->owner;

	if (sk->sk_destruct)
		sk->sk_destruct(sk);

	filter = rcu_dereference(sk->sk_filter);
	if (filter) {
		sk_filter_release(sk, filter);
		rcu_assign_pointer(sk->sk_filter, NULL);
	}

	sock_disable_timestamp(sk);

	if (atomic_read(&sk->sk_omem_alloc))
		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
		       __FUNCTION__, atomic_read(&sk->sk_omem_alloc));

	security_sk_free(sk);
	if (sk->sk_prot_creator->slab != NULL)
		kmem_cache_free(sk->sk_prot_creator->slab, sk);
	else
		kfree(sk);
	module_put(owner);
}
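/*
 * Usage sketch (illustrative): a protocol's create hook pairs these
 * two calls, with sock_init_data() in between. my_proto is a
 * hypothetical struct proto.
 *
 *	struct sock *sk = sk_alloc(PF_INET, GFP_KERNEL, &my_proto, 1);
 *
 *	if (sk == NULL)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *	...
 *	// on a later error path:
 *	sk_free(sk);
 */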
struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
{
	struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0);

	if (newsk != NULL) {
		struct sk_filter *filter;

		sock_copy(newsk, sk);

		/* SANITY */
		sk_node_init(&newsk->sk_node);
		sock_lock_init(newsk);
		bh_lock_sock(newsk);
		newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;

		atomic_set(&newsk->sk_rmem_alloc, 0);
		atomic_set(&newsk->sk_wmem_alloc, 0);
		atomic_set(&newsk->sk_omem_alloc, 0);
		skb_queue_head_init(&newsk->sk_receive_queue);
		skb_queue_head_init(&newsk->sk_write_queue);
#ifdef CONFIG_NET_DMA
		skb_queue_head_init(&newsk->sk_async_wait_queue);
#endif

		rwlock_init(&newsk->sk_dst_lock);
		rwlock_init(&newsk->sk_callback_lock);
		lockdep_set_class_and_name(&newsk->sk_callback_lock,
				af_callback_keys + newsk->sk_family,
				af_family_clock_key_strings[newsk->sk_family]);

		newsk->sk_dst_cache	= NULL;
		newsk->sk_wmem_queued	= 0;
		newsk->sk_forward_alloc = 0;
		newsk->sk_send_head	= NULL;
		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;

		sock_reset_flag(newsk, SOCK_DONE);
		skb_queue_head_init(&newsk->sk_error_queue);

		filter = newsk->sk_filter;
		if (filter != NULL)
			sk_filter_charge(newsk, filter);

		if (unlikely(xfrm_sk_clone_policy(newsk))) {
			/* It is still a raw copy of the parent, so
			 * invalidate the destructor and make a plain
			 * sk_free() */
			newsk->sk_destruct = NULL;
			sk_free(newsk);
			newsk = NULL;
			goto out;
		}

		newsk->sk_err	   = 0;
		newsk->sk_priority = 0;
		atomic_set(&newsk->sk_refcnt, 2);

		/*
		 * Increment the counter in the same struct proto as the master
		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
		 * is the same as sk->sk_prot->socks, as this field was copied
		 * with memcpy).
		 *
		 * This _changes_ the previous behaviour, where
		 * tcp_create_openreq_child always was incrementing the
		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
		 * to be taken into account in all callers. -acme
		 */
		sk_refcnt_debug_inc(newsk);
		newsk->sk_socket = NULL;
		newsk->sk_sleep	 = NULL;

		if (newsk->sk_prot->sockets_allocated)
			atomic_inc(newsk->sk_prot->sockets_allocated);
	}
out:
	return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone);
void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
	__sk_dst_set(sk, dst);
	sk->sk_route_caps = dst->dev->features;
	if (sk->sk_route_caps & NETIF_F_GSO)
		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
	if (sk_can_gso(sk)) {
		if (dst->header_len)
			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
		else
			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
	}
}
EXPORT_SYMBOL_GPL(sk_setup_caps);

void __init sk_init(void)
{
	if (num_physpages <= 4096) {
		sysctl_wmem_max = 32767;
		sysctl_rmem_max = 32767;
		sysctl_wmem_default = 32767;
		sysctl_rmem_default = 32767;
	} else if (num_physpages >= 131072) {
		sysctl_wmem_max = 131071;
		sysctl_rmem_max = 131071;
	}
}
/*
 *	Simple resource managers for sockets.
 */

/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	/* In case it might be waiting for more memory. */
	atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
		sk->sk_write_space(sk);
	sock_put(sk);
}

/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
}
int sock_i_uid(struct sock *sk)
{
	int uid;

	read_lock(&sk->sk_callback_lock);
	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
	read_unlock(&sk->sk_callback_lock);
	return uid;
}

unsigned long sock_i_ino(struct sock *sk)
{
	unsigned long ino;

	read_lock(&sk->sk_callback_lock);
	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
	read_unlock(&sk->sk_callback_lock);
	return ino;
}
/*
 * Allocate a skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_w(skb, sk);
			return skb;
		}
	}
	return NULL;
}

/*
 * Allocate a skb from the socket's receive buffer.
 */
struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_r(skb, sk);
			return skb;
		}
	}
	return NULL;
}
/*
 * Allocate a memory block from the socket's option memory buffer.
 */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
	if ((unsigned)size <= sysctl_optmem_max &&
	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
		void *mem;
		/* First do the add, to avoid the race if kmalloc
		 * might sleep.
		 */
		atomic_add(size, &sk->sk_omem_alloc);
		mem = kmalloc(size, priority);
		if (mem)
			return mem;
		atomic_sub(size, &sk->sk_omem_alloc);
	}
	return NULL;
}

/*
 * Free an option memory block.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	kfree(mem);
	atomic_sub(size, &sk->sk_omem_alloc);
}
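/*
 * Usage sketch (illustrative): per-socket option state is charged
 * against sk_omem_alloc so a single socket cannot pin unbounded
 * kernel memory. Allocation and free must pass the same size.
 * struct my_opts is a hypothetical name.
 *
 *	struct my_opts *opts = sock_kmalloc(sk, sizeof(*opts), GFP_KERNEL);
 *
 *	if (opts == NULL)
 *		return -ENOBUFS;	// over sysctl_optmem_max
 *	...
 *	sock_kfree_s(sk, opts, sizeof(*opts));
 */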
/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
   I think these locks should be removed for datagram sockets.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
	DEFINE_WAIT(wait);

	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
	for (;;) {
		if (!timeo)
			break;
		if (signal_pending(current))
			break;
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
			break;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			break;
		if (sk->sk_err)
			break;
		timeo = schedule_timeout(timeo);
	}
	finish_wait(sk->sk_sleep, &wait);
	return timeo;
}
/*
 *	Generic send/receive buffer handlers
 */

static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
					    unsigned long header_len,
					    unsigned long data_len,
					    int noblock, int *errcode)
{
	struct sk_buff *skb;
	gfp_t gfp_mask;
	long timeo;
	int err;

	gfp_mask = sk->sk_allocation;
	if (gfp_mask & __GFP_WAIT)
		gfp_mask |= __GFP_REPEAT;

	timeo = sock_sndtimeo(sk, noblock);
	while (1) {
		err = sock_error(sk);
		if (err != 0)
			goto failure;

		err = -EPIPE;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			goto failure;

		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
			skb = alloc_skb(header_len, gfp_mask);
			if (skb) {
				int npages;
				int i;

				/* No pages, we're done... */
				if (!data_len)
					break;

				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
				skb->truesize += data_len;
				skb_shinfo(skb)->nr_frags = npages;
				for (i = 0; i < npages; i++) {
					struct page *page;
					skb_frag_t *frag;

					page = alloc_pages(sk->sk_allocation, 0);
					if (!page) {
						err = -ENOBUFS;
						skb_shinfo(skb)->nr_frags = i;
						kfree_skb(skb);
						goto failure;
					}

					frag = &skb_shinfo(skb)->frags[i];
					frag->page = page;
					frag->page_offset = 0;
					frag->size = (data_len >= PAGE_SIZE ?
						      PAGE_SIZE :
						      data_len);
					data_len -= PAGE_SIZE;
				}

				/* Full success... */
				break;
			}
			err = -ENOBUFS;
			goto failure;
		}
		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		err = -EAGAIN;
		if (!timeo)
			goto failure;
		if (signal_pending(current))
			goto interrupted;
		timeo = sock_wait_for_wmem(sk, timeo);
	}

	skb_set_owner_w(skb, sk);
	return skb;

interrupted:
	err = sock_intr_errno(timeo);
failure:
	*errcode = err;
	return NULL;
}

struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
}
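/*
 * Usage sketch (illustrative): a datagram sendmsg() typically reserves
 * headroom and copies the payload into an skb obtained here, blocking
 * (subject to SO_SNDTIMEO) until sndbuf space is available:
 *
 *	struct sk_buff *skb;
 *	int err;
 *
 *	skb = sock_alloc_send_skb(sk, len + headroom,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (skb == NULL)
 *		return err;		// -EAGAIN, -EPIPE, signal, ...
 *	skb_reserve(skb, headroom);
 *	err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
 */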
static void __lock_sock(struct sock *sk)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
					TASK_UNINTERRUPTIBLE);
		spin_unlock_bh(&sk->sk_lock.slock);
		schedule();
		spin_lock_bh(&sk->sk_lock.slock);
		if (!sock_owned_by_user(sk))
			break;
	}
	finish_wait(&sk->sk_lock.wq, &wait);
}

static void __release_sock(struct sock *sk)
{
	struct sk_buff *skb = sk->sk_backlog.head;

	do {
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
		bh_unlock_sock(sk);

		do {
			struct sk_buff *next = skb->next;

			skb->next = NULL;
			sk->sk_backlog_rcv(sk, skb);

			/*
			 * We are in process context here with softirqs
			 * disabled, use cond_resched_softirq() to preempt.
			 * This is safe to do because we've taken the backlog
			 * queue private:
			 */
			cond_resched_softirq();

			skb = next;
		} while (skb != NULL);

		bh_lock_sock(sk);
	} while ((skb = sk->sk_backlog.head) != NULL);
}
/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 */
int sk_wait_data(struct sock *sk, long *timeo)
{
	int rc;
	DEFINE_WAIT(wait);

	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	finish_wait(sk->sk_sleep, &wait);
	return rc;
}
EXPORT_SYMBOL(sk_wait_data);
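/*
 * Usage sketch (illustrative, simplified; a real recvmsg also checks
 * signals and sk->sk_err): a blocking receive loops on the queue,
 * sleeping in sk_wait_data() between attempts. The socket lock must
 * be held; sk_wait_event() drops and retakes it around the sleep.
 *
 *	lock_sock(sk);
 *	while ((skb = skb_dequeue(&sk->sk_receive_queue)) == NULL) {
 *		if (!timeo) {
 *			err = -EAGAIN;
 *			goto out;
 *		}
 *		sk_wait_data(sk, &timeo);
 *	}
 *	...
 *	out:
 *	release_sock(sk);
 */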
/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */

int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}

int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}

int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}

int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	return -EOPNOTSUPP;
}

int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	return -EOPNOTSUPP;
}

unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
{
	return 0;
}

int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}

int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}

int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}

int sock_no_setsockopt(struct socket *sock, int level, int optname,
		       char __user *optval, int optlen)
{
	return -EOPNOTSUPP;
}

int sock_no_getsockopt(struct socket *sock, int level, int optname,
		       char __user *optval, int __user *optlen)
{
	return -EOPNOTSUPP;
}

int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len)
{
	return -EOPNOTSUPP;
}

int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len, int flags)
{
	return -EOPNOTSUPP;
}

int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}

ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
	ssize_t res;
	struct msghdr msg = {.msg_flags = flags};
	struct kvec iov;
	char *kaddr = kmap(page);
	iov.iov_base = kaddr + offset;
	iov.iov_len = size;
	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
	kunmap(page);
	return res;
}
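/*
 * Usage sketch (illustrative): a connectionless family fills the
 * unsupported slots of its proto_ops with these stubs instead of
 * leaving NULL pointers. my_dgram_ops, PF_MYPROTO and the my_*
 * handlers are hypothetical names.
 *
 *	static const struct proto_ops my_dgram_ops = {
 *		.family		= PF_MYPROTO,
 *		.owner		= THIS_MODULE,
 *		.bind		= my_bind,
 *		.sendmsg	= my_sendmsg,
 *		.recvmsg	= my_recvmsg,
 *		.connect	= sock_no_connect,
 *		.accept		= sock_no_accept,
 *		.listen		= sock_no_listen,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *	};
 */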
/*
 *	Default Socket Callbacks
 */

static void sock_def_wakeup(struct sock *sk)
{
	read_lock(&sk->sk_callback_lock);
	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
		wake_up_interruptible_all(sk->sk_sleep);
	read_unlock(&sk->sk_callback_lock);
}

static void sock_def_error_report(struct sock *sk)
{
	read_lock(&sk->sk_callback_lock);
	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
		wake_up_interruptible(sk->sk_sleep);
	sk_wake_async(sk, 0, POLL_ERR);
	read_unlock(&sk->sk_callback_lock);
}

static void sock_def_readable(struct sock *sk, int len)
{
	read_lock(&sk->sk_callback_lock);
	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
		wake_up_interruptible(sk->sk_sleep);
	sk_wake_async(sk, 1, POLL_IN);
	read_unlock(&sk->sk_callback_lock);
}

static void sock_def_write_space(struct sock *sk)
{
	read_lock(&sk->sk_callback_lock);

	/* Do not wake up a writer until he can make "significant"
	 * progress.  --DaveM
	 */
	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
		if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
			wake_up_interruptible(sk->sk_sleep);

		/* Should agree with poll, otherwise some programs break */
		if (sock_writeable(sk))
			sk_wake_async(sk, 2, POLL_OUT);
	}

	read_unlock(&sk->sk_callback_lock);
}

static void sock_def_destruct(struct sock *sk)
{
	kfree(sk->sk_protinfo);
}

void sk_send_sigurg(struct sock *sk)
{
	if (sk->sk_socket && sk->sk_socket->file)
		if (send_sigurg(&sk->sk_socket->file->f_owner))
			sk_wake_async(sk, 3, POLL_PRI);
}
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (!mod_timer(timer, expires))
		sock_hold(sk);
}

EXPORT_SYMBOL(sk_reset_timer);

void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (timer_pending(timer) && del_timer(timer))
		__sock_put(sk);
}

EXPORT_SYMBOL(sk_stop_timer);
void sock_init_data(struct socket *sock, struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);
#ifdef CONFIG_NET_DMA
	skb_queue_head_init(&sk->sk_async_wait_queue);
#endif

	sk->sk_send_head	=	NULL;

	init_timer(&sk->sk_timer);

	sk->sk_allocation	=	GFP_KERNEL;
	sk->sk_rcvbuf		=	sysctl_rmem_default;
	sk->sk_sndbuf		=	sysctl_wmem_default;
	sk->sk_state		=	TCP_CLOSE;
	sk->sk_socket		=	sock;

	sock_set_flag(sk, SOCK_ZAPPED);

	if (sock) {
		sk->sk_type	=	sock->type;
		sk->sk_sleep	=	&sock->wait;
		sock->sk	=	sk;
	} else
		sk->sk_sleep	=	NULL;

	rwlock_init(&sk->sk_dst_lock);
	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class_and_name(&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family,
			af_family_clock_key_strings[sk->sk_family]);

	sk->sk_state_change	=	sock_def_wakeup;
	sk->sk_data_ready	=	sock_def_readable;
	sk->sk_write_space	=	sock_def_write_space;
	sk->sk_error_report	=	sock_def_error_report;
	sk->sk_destruct		=	sock_def_destruct;

	sk->sk_sndmsg_page	=	NULL;
	sk->sk_sndmsg_off	=	0;

	sk->sk_peercred.pid	=	0;
	sk->sk_peercred.uid	=	-1;
	sk->sk_peercred.gid	=	-1;
	sk->sk_write_pending	=	0;
	sk->sk_rcvlowat		=	1;
	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = ktime_set(-1L, -1L);

	atomic_set(&sk->sk_refcnt, 1);
}
void fastcall lock_sock_nested(struct sock *sk, int subclass)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_lock.owner)
		__lock_sock(sk);
	sk->sk_lock.owner = (void *)1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
	local_bh_enable();
}

EXPORT_SYMBOL(lock_sock_nested);

void fastcall release_sock(struct sock *sk)
{
	/*
	 * The sk_lock has mutex_unlock() semantics:
	 */
	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);

	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_backlog.tail)
		__release_sock(sk);
	sk->sk_lock.owner = NULL;
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);
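/*
 * Usage sketch (illustrative): process-context code brackets socket
 * state changes with this pair; packets that arrived in softirq
 * context meanwhile sit on the backlog and are replayed by
 * __release_sock() before the lock is handed back:
 *
 *	lock_sock(sk);
 *	sk->sk_userlocks |= SOCK_SNDBUF_LOCK;	// any state change
 *	release_sock(sk);			// also drains the backlog
 */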
int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
{
	struct timeval tv;
	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk);
	tv = ktime_to_timeval(sk->sk_stamp);
	if (tv.tv_sec == -1)
		return -ENOENT;
	if (tv.tv_sec == 0) {
		sk->sk_stamp = ktime_get_real();
		tv = ktime_to_timeval(sk->sk_stamp);
	}
	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestamp);

int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
{
	struct timespec ts;
	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk);
	ts = ktime_to_timespec(sk->sk_stamp);
	if (ts.tv_sec == -1)
		return -ENOENT;
	if (ts.tv_sec == 0) {
		sk->sk_stamp = ktime_get_real();
		ts = ktime_to_timespec(sk->sk_stamp);
	}
	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestampns);

void sock_enable_timestamp(struct sock *sk)
{
	if (!sock_flag(sk, SOCK_TIMESTAMP)) {
		sock_set_flag(sk, SOCK_TIMESTAMP);
		net_enable_timestamp();
	}
}
EXPORT_SYMBOL(sock_enable_timestamp);
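/*
 * Usage note (illustrative): these back the SIOCGSTAMP/SIOCGSTAMPNS
 * ioctls, which user space uses to fetch the arrival time of the last
 * packet read from the socket:
 *
 *	#include <sys/ioctl.h>
 *	#include <linux/sockios.h>
 *
 *	struct timeval tv;
 *
 *	if (ioctl(fd, SIOCGSTAMP, &tv) == 0)
 *		printf("last packet at %ld.%06ld\n", tv.tv_sec, tv.tv_usec);
 */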
/*
 *	Get a socket option on a socket.
 *
 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
 *	asynchronous errors should be reported by getsockopt. We assume
 *	this means if you specify SO_ERROR (otherwise what's the point of it).
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}

EXPORT_SYMBOL(sock_common_getsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_getsockopt != NULL)
		return sk->sk_prot->compat_getsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_getsockopt);
#endif

int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
			struct msghdr *msg, size_t size, int flags)
{
	struct sock *sk = sock->sk;
	int addr_len = 0;
	int err;

	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
				   flags & ~MSG_DONTWAIT, &addr_len);
	if (err >= 0)
		msg->msg_namelen = addr_len;
	return err;
}

EXPORT_SYMBOL(sock_common_recvmsg);

/*
 *	Set socket options on an inet socket.
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}

EXPORT_SYMBOL(sock_common_setsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, int optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_setsockopt != NULL)
		return sk->sk_prot->compat_setsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_setsockopt);
#endif
void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release is called, processes have
	 * no access to the socket. But the net still has.
	 * Step one, detach it from networking:
	 *
	 * A. Remove from hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * At this point the socket cannot receive new packets, but it is
	 * possible that some packets are in flight because some CPU runs
	 * the receiver and did a hash table lookup before we unhashed the
	 * socket. They will reach the receive queue and will be purged by
	 * the socket destructor.
	 *
	 * Also we still have packets pending on the receive queue and
	 * probably our own packets waiting in device queues. sock_destroy
	 * will drain the receive queue, but transmitted packets will delay
	 * socket destruction until the last reference is released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sk_refcnt_debug_release(sk);
	sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);
static DEFINE_RWLOCK(proto_list_lock);
static LIST_HEAD(proto_list);

int proto_register(struct proto *prot, int alloc_slab)
{
	char *request_sock_slab_name = NULL;
	char *timewait_sock_slab_name;
	int rc = -ENOBUFS;

	if (alloc_slab) {
		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
					       SLAB_HWCACHE_ALIGN, NULL);

		if (prot->slab == NULL) {
			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
			       prot->name);
			goto out;
		}

		if (prot->rsk_prot != NULL) {
			static const char mask[] = "request_sock_%s";

			request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
			if (request_sock_slab_name == NULL)
				goto out_free_sock_slab;

			sprintf(request_sock_slab_name, mask, prot->name);
			prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
								 prot->rsk_prot->obj_size, 0,
								 SLAB_HWCACHE_ALIGN, NULL);

			if (prot->rsk_prot->slab == NULL) {
				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
				       prot->name);
				goto out_free_request_sock_slab_name;
			}
		}

		if (prot->twsk_prot != NULL) {
			static const char mask[] = "tw_sock_%s";

			timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);

			if (timewait_sock_slab_name == NULL)
				goto out_free_request_sock_slab;

			sprintf(timewait_sock_slab_name, mask, prot->name);
			prot->twsk_prot->twsk_slab =
				kmem_cache_create(timewait_sock_slab_name,
						  prot->twsk_prot->twsk_obj_size,
						  0, SLAB_HWCACHE_ALIGN,
						  NULL);
			if (prot->twsk_prot->twsk_slab == NULL)
				goto out_free_timewait_sock_slab_name;
		}
	}

	write_lock(&proto_list_lock);
	list_add(&prot->node, &proto_list);
	write_unlock(&proto_list_lock);
	rc = 0;
out:
	return rc;
out_free_timewait_sock_slab_name:
	kfree(timewait_sock_slab_name);
out_free_request_sock_slab:
	if (prot->rsk_prot && prot->rsk_prot->slab) {
		kmem_cache_destroy(prot->rsk_prot->slab);
		prot->rsk_prot->slab = NULL;
	}
out_free_request_sock_slab_name:
	kfree(request_sock_slab_name);
out_free_sock_slab:
	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;
	goto out;
}
EXPORT_SYMBOL(proto_register);
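/*
 * Usage sketch (illustrative): a protocol module registers its struct
 * proto at init time so sk_alloc() can carve socks out of a dedicated
 * slab. my_proto/my_sock/my_init/my_exit are hypothetical names.
 *
 *	static struct proto my_proto = {
 *		.name	  = "MYPROTO",
 *		.owner	  = THIS_MODULE,
 *		.obj_size = sizeof(struct my_sock),
 *	};
 *
 *	static int __init my_init(void)
 *	{
 *		return proto_register(&my_proto, 1);	// 1 = alloc_slab
 *	}
 *
 *	static void __exit my_exit(void)
 *	{
 *		proto_unregister(&my_proto);
 *	}
 */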
void proto_unregister(struct proto *prot)
{
	write_lock(&proto_list_lock);
	list_del(&prot->node);
	write_unlock(&proto_list_lock);

	if (prot->slab != NULL) {
		kmem_cache_destroy(prot->slab);
		prot->slab = NULL;
	}

	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
		const char *name = kmem_cache_name(prot->rsk_prot->slab);

		kmem_cache_destroy(prot->rsk_prot->slab);
		kfree(name);
		prot->rsk_prot->slab = NULL;
	}

	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
		const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab);

		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
		kfree(name);
		prot->twsk_prot->twsk_slab = NULL;
	}
}
EXPORT_SYMBOL(proto_unregister);
#ifdef CONFIG_PROC_FS
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
{
	read_lock(&proto_list_lock);
	return seq_list_start_head(&proto_list, *pos);
}

static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return seq_list_next(v, &proto_list, pos);
}

static void proto_seq_stop(struct seq_file *seq, void *v)
{
	read_unlock(&proto_list_lock);
}

static char proto_method_implemented(const void *method)
{
	return method == NULL ? 'n' : 'y';
}

static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
	seq_printf(seq, "%-9s %4u %6d %6d %-3s %6u %-3s %-10s "
			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
		   proto->name,
		   proto->obj_size,
		   proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
		   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
		   proto->max_header,
		   proto->slab == NULL ? "no" : "yes",
		   module_name(proto->owner),
		   proto_method_implemented(proto->close),
		   proto_method_implemented(proto->connect),
		   proto_method_implemented(proto->disconnect),
		   proto_method_implemented(proto->accept),
		   proto_method_implemented(proto->ioctl),
		   proto_method_implemented(proto->init),
		   proto_method_implemented(proto->destroy),
		   proto_method_implemented(proto->shutdown),
		   proto_method_implemented(proto->setsockopt),
		   proto_method_implemented(proto->getsockopt),
		   proto_method_implemented(proto->sendmsg),
		   proto_method_implemented(proto->recvmsg),
		   proto_method_implemented(proto->sendpage),
		   proto_method_implemented(proto->bind),
		   proto_method_implemented(proto->backlog_rcv),
		   proto_method_implemented(proto->hash),
		   proto_method_implemented(proto->unhash),
		   proto_method_implemented(proto->get_port),
		   proto_method_implemented(proto->enter_memory_pressure));
}

static int proto_seq_show(struct seq_file *seq, void *v)
{
	if (v == &proto_list)
		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
			   "protocol",
			   "size",
			   "sockets",
			   "memory",
			   "press",
			   "maxhdr",
			   "slab",
			   "module",
			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
	else
		proto_seq_printf(seq, list_entry(v, struct proto, node));
	return 0;
}

static const struct seq_operations proto_seq_ops = {
	.start  = proto_seq_start,
	.next   = proto_seq_next,
	.stop   = proto_seq_stop,
	.show   = proto_seq_show,
};

static int proto_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &proto_seq_ops);
}

static const struct file_operations proto_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= proto_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

static int __init proto_init(void)
{
	/* register /proc/net/protocols */
	return proc_net_fops_create("protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
}

subsys_initcall(proto_init);

#endif /* PROC_FS */
EXPORT_SYMBOL(sk_alloc);
EXPORT_SYMBOL(sk_free);
EXPORT_SYMBOL(sk_send_sigurg);
EXPORT_SYMBOL(sock_alloc_send_skb);
EXPORT_SYMBOL(sock_init_data);
EXPORT_SYMBOL(sock_kfree_s);
EXPORT_SYMBOL(sock_kmalloc);
EXPORT_SYMBOL(sock_no_accept);
EXPORT_SYMBOL(sock_no_bind);
EXPORT_SYMBOL(sock_no_connect);
EXPORT_SYMBOL(sock_no_getname);
EXPORT_SYMBOL(sock_no_getsockopt);
EXPORT_SYMBOL(sock_no_ioctl);
EXPORT_SYMBOL(sock_no_listen);
EXPORT_SYMBOL(sock_no_mmap);
EXPORT_SYMBOL(sock_no_poll);
EXPORT_SYMBOL(sock_no_recvmsg);
EXPORT_SYMBOL(sock_no_sendmsg);
EXPORT_SYMBOL(sock_no_sendpage);
EXPORT_SYMBOL(sock_no_setsockopt);
EXPORT_SYMBOL(sock_no_shutdown);
EXPORT_SYMBOL(sock_no_socketpair);
EXPORT_SYMBOL(sock_rfree);
EXPORT_SYMBOL(sock_setsockopt);
EXPORT_SYMBOL(sock_wfree);
EXPORT_SYMBOL(sock_wmalloc);
EXPORT_SYMBOL(sock_i_uid);
EXPORT_SYMBOL(sock_i_ino);
EXPORT_SYMBOL(sysctl_optmem_max);
#ifdef CONFIG_SYSCTL
EXPORT_SYMBOL(sysctl_rmem_max);
EXPORT_SYMBOL(sysctl_wmem_max);
#endif