net/core/sock.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Generic socket support routines. Memory allocators, socket lock/release
7 * handler for protocols to use and generic option handler.
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
15 * Fixes:
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
35 * code. The ACK stuff can wait and needs major
36 * TCP layer surgery.
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
64 * (compatibility fix)
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
83 * To Fix:
86 * This program is free software; you can redistribute it and/or
87 * modify it under the terms of the GNU General Public License
88 * as published by the Free Software Foundation; either version
89 * 2 of the License, or (at your option) any later version.
92 #include <linux/capability.h>
93 #include <linux/errno.h>
94 #include <linux/types.h>
95 #include <linux/socket.h>
96 #include <linux/in.h>
97 #include <linux/kernel.h>
98 #include <linux/module.h>
99 #include <linux/proc_fs.h>
100 #include <linux/seq_file.h>
101 #include <linux/sched.h>
102 #include <linux/timer.h>
103 #include <linux/string.h>
104 #include <linux/sockios.h>
105 #include <linux/net.h>
106 #include <linux/mm.h>
107 #include <linux/slab.h>
108 #include <linux/interrupt.h>
109 #include <linux/poll.h>
110 #include <linux/tcp.h>
111 #include <linux/init.h>
112 #include <linux/highmem.h>
114 #include <asm/uaccess.h>
115 #include <asm/system.h>
117 #include <linux/netdevice.h>
118 #include <net/protocol.h>
119 #include <linux/skbuff.h>
120 #include <net/net_namespace.h>
121 #include <net/request_sock.h>
122 #include <net/sock.h>
123 #include <linux/net_tstamp.h>
124 #include <net/xfrm.h>
125 #include <linux/ipsec.h>
127 #include <linux/filter.h>
129 #ifdef CONFIG_INET
130 #include <net/tcp.h>
131 #endif
134 * Each address family might have different locking rules, so we have
135 * one slock key per address family:
137 static struct lock_class_key af_family_keys[AF_MAX];
138 static struct lock_class_key af_family_slock_keys[AF_MAX];
141 * Make lock validator output more readable. (we pre-construct these
142 * strings build-time, so that runtime initialization of socket
143 * locks is fast):
145 static const char *const af_family_key_strings[AF_MAX+1] = {
146 "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX" , "sk_lock-AF_INET" ,
147 "sk_lock-AF_AX25" , "sk_lock-AF_IPX" , "sk_lock-AF_APPLETALK",
148 "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE" , "sk_lock-AF_ATMPVC" ,
149 "sk_lock-AF_X25" , "sk_lock-AF_INET6" , "sk_lock-AF_ROSE" ,
150 "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI" , "sk_lock-AF_SECURITY" ,
151 "sk_lock-AF_KEY" , "sk_lock-AF_NETLINK" , "sk_lock-AF_PACKET" ,
152 "sk_lock-AF_ASH" , "sk_lock-AF_ECONET" , "sk_lock-AF_ATMSVC" ,
153 "sk_lock-AF_RDS" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" ,
154 "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE" , "sk_lock-AF_LLC" ,
155 "sk_lock-27" , "sk_lock-28" , "sk_lock-AF_CAN" ,
156 	"sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
157 "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" ,
158 "sk_lock-AF_IEEE802154",
159 "sk_lock-AF_MAX"
161 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
162 "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" ,
163 "slock-AF_AX25" , "slock-AF_IPX" , "slock-AF_APPLETALK",
164 "slock-AF_NETROM", "slock-AF_BRIDGE" , "slock-AF_ATMPVC" ,
165 "slock-AF_X25" , "slock-AF_INET6" , "slock-AF_ROSE" ,
166 "slock-AF_DECnet", "slock-AF_NETBEUI" , "slock-AF_SECURITY" ,
167 "slock-AF_KEY" , "slock-AF_NETLINK" , "slock-AF_PACKET" ,
168 "slock-AF_ASH" , "slock-AF_ECONET" , "slock-AF_ATMSVC" ,
169 "slock-AF_RDS" , "slock-AF_SNA" , "slock-AF_IRDA" ,
170 "slock-AF_PPPOX" , "slock-AF_WANPIPE" , "slock-AF_LLC" ,
171 "slock-27" , "slock-28" , "slock-AF_CAN" ,
172 "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" ,
173 "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" ,
174 "slock-AF_IEEE802154",
175 "slock-AF_MAX"
177 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
178 "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" ,
179 "clock-AF_AX25" , "clock-AF_IPX" , "clock-AF_APPLETALK",
180 "clock-AF_NETROM", "clock-AF_BRIDGE" , "clock-AF_ATMPVC" ,
181 "clock-AF_X25" , "clock-AF_INET6" , "clock-AF_ROSE" ,
182 "clock-AF_DECnet", "clock-AF_NETBEUI" , "clock-AF_SECURITY" ,
183 "clock-AF_KEY" , "clock-AF_NETLINK" , "clock-AF_PACKET" ,
184 "clock-AF_ASH" , "clock-AF_ECONET" , "clock-AF_ATMSVC" ,
185 "clock-AF_RDS" , "clock-AF_SNA" , "clock-AF_IRDA" ,
186 "clock-AF_PPPOX" , "clock-AF_WANPIPE" , "clock-AF_LLC" ,
187 "clock-27" , "clock-28" , "clock-AF_CAN" ,
188 "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" ,
189 "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" ,
190 "clock-AF_IEEE802154",
191 "clock-AF_MAX"
195 * sk_callback_lock locking rules are per-address-family,
196 * so split the lock classes by using a per-AF key:
198 static struct lock_class_key af_callback_keys[AF_MAX];
200 /* Take into consideration the size of the struct sk_buff overhead in the
201 * determination of these values, since that is non-constant across
202 * platforms. This makes socket queueing behavior and performance
203 * not depend upon such differences.
205 #define _SK_MEM_PACKETS 256
206 #define _SK_MEM_OVERHEAD (sizeof(struct sk_buff) + 256)
207 #define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
208 #define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
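/* Illustrative arithmetic (a sketch; the sk_buff size below is an
 * assumption, not a value taken from this file): with a hypothetical
 * sizeof(struct sk_buff) of roughly 200 bytes, _SK_MEM_OVERHEAD would be
 * about 456 bytes and SK_WMEM_MAX/SK_RMEM_MAX about 256 * 456 = 116736
 * bytes (~114 KB). The exact figure varies by platform and kernel
 * configuration, which is why the overhead is expressed via sizeof()
 * rather than hard-coded.
 */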
210 /* Run time adjustable parameters. */
211 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
212 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
213 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
214 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
216 /* Maximal space eaten by iovec or ancillary data plus some space */
217 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
218 EXPORT_SYMBOL(sysctl_optmem_max);
220 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
222 struct timeval tv;
224 if (optlen < sizeof(tv))
225 return -EINVAL;
226 if (copy_from_user(&tv, optval, sizeof(tv)))
227 return -EFAULT;
228 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
229 return -EDOM;
231 if (tv.tv_sec < 0) {
232 static int warned __read_mostly;
234 *timeo_p = 0;
235 if (warned < 10 && net_ratelimit()) {
236 warned++;
237 printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
238 "tries to set negative timeout\n",
239 current->comm, task_pid_nr(current));
241 return 0;
243 *timeo_p = MAX_SCHEDULE_TIMEOUT;
244 if (tv.tv_sec == 0 && tv.tv_usec == 0)
245 return 0;
246 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
247 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
248 return 0;
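/* Worked example (a sketch, assuming HZ == 1000): SO_RCVTIMEO with
 * {tv_sec = 1, tv_usec = 500000} yields
 * *timeo_p = 1 * HZ + (500000 + 999) / 1000 = 1500 jiffies, i.e. the
 * microseconds are rounded up to the next tick. A zero timeval selects
 * MAX_SCHEDULE_TIMEOUT, which the rest of the stack treats as
 * "block forever".
 */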
251 static void sock_warn_obsolete_bsdism(const char *name)
253 static int warned;
254 static char warncomm[TASK_COMM_LEN];
255 if (strcmp(warncomm, current->comm) && warned < 5) {
256 strcpy(warncomm, current->comm);
257 printk(KERN_WARNING "process `%s' is using obsolete "
258 "%s SO_BSDCOMPAT\n", warncomm, name);
259 warned++;
263 static void sock_disable_timestamp(struct sock *sk, int flag)
265 if (sock_flag(sk, flag)) {
266 sock_reset_flag(sk, flag);
267 if (!sock_flag(sk, SOCK_TIMESTAMP) &&
268 !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) {
269 net_disable_timestamp();
275 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
277 int err = 0;
278 int skb_len;
279 unsigned long flags;
280 struct sk_buff_head *list = &sk->sk_receive_queue;
282 /* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
283 number of warnings when compiling with -W --ANK
285 if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
286 (unsigned)sk->sk_rcvbuf) {
287 err = -ENOMEM;
288 goto out;
291 err = sk_filter(sk, skb);
292 if (err)
293 goto out;
295 if (!sk_rmem_schedule(sk, skb->truesize)) {
296 err = -ENOBUFS;
297 goto out;
300 skb->dev = NULL;
301 skb_set_owner_r(skb, sk);
303 /* Cache the SKB length before we tack it onto the receive
304 * queue. Once it is added it no longer belongs to us and
305 * may be freed by other threads of control pulling packets
306 * from the queue.
308 skb_len = skb->len;
310 spin_lock_irqsave(&list->lock, flags);
311 skb->dropcount = atomic_read(&sk->sk_drops);
312 __skb_queue_tail(list, skb);
313 spin_unlock_irqrestore(&list->lock, flags);
315 if (!sock_flag(sk, SOCK_DEAD))
316 sk->sk_data_ready(sk, skb_len);
317 out:
318 return err;
320 EXPORT_SYMBOL(sock_queue_rcv_skb);
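/* Typical caller pattern (a sketch, not taken from this file): datagram
 * protocols such as UDP hand a fully built skb to this helper from their
 * receive path, roughly:
 *
 *	if (sock_queue_rcv_skb(sk, skb) < 0) {
 *		kfree_skb(skb);		/* caller still owns the skb on failure */
 *		return -1;
 *	}
 *
 * On success the skb belongs to the receive queue and must not be touched
 * again by the caller.
 */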
322 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
324 int rc = NET_RX_SUCCESS;
326 if (sk_filter(sk, skb))
327 goto discard_and_relse;
329 skb->dev = NULL;
331 if (nested)
332 bh_lock_sock_nested(sk);
333 else
334 bh_lock_sock(sk);
335 if (!sock_owned_by_user(sk)) {
337 * trylock + unlock semantics:
339 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
341 rc = sk_backlog_rcv(sk, skb);
343 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
344 } else
345 sk_add_backlog(sk, skb);
346 bh_unlock_sock(sk);
347 out:
348 sock_put(sk);
349 return rc;
350 discard_and_relse:
351 kfree_skb(skb);
352 goto out;
354 EXPORT_SYMBOL(sk_receive_skb);
356 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
358 struct dst_entry *dst = sk->sk_dst_cache;
360 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
361 sk->sk_dst_cache = NULL;
362 dst_release(dst);
363 return NULL;
366 return dst;
368 EXPORT_SYMBOL(__sk_dst_check);
370 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
372 struct dst_entry *dst = sk_dst_get(sk);
374 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
375 sk_dst_reset(sk);
376 dst_release(dst);
377 return NULL;
380 return dst;
382 EXPORT_SYMBOL(sk_dst_check);
384 static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
386 int ret = -ENOPROTOOPT;
387 #ifdef CONFIG_NETDEVICES
388 struct net *net = sock_net(sk);
389 char devname[IFNAMSIZ];
390 int index;
392 /* Sorry... */
393 ret = -EPERM;
394 if (!capable(CAP_NET_RAW))
395 goto out;
397 ret = -EINVAL;
398 if (optlen < 0)
399 goto out;
401 /* Bind this socket to a particular device like "eth0",
402 * as specified in the passed interface name. If the
403 * name is "" or the option length is zero the socket
404 * is not bound.
406 if (optlen > IFNAMSIZ - 1)
407 optlen = IFNAMSIZ - 1;
408 memset(devname, 0, sizeof(devname));
410 ret = -EFAULT;
411 if (copy_from_user(devname, optval, optlen))
412 goto out;
414 if (devname[0] == '\0') {
415 index = 0;
416 } else {
417 struct net_device *dev = dev_get_by_name(net, devname);
419 ret = -ENODEV;
420 if (!dev)
421 goto out;
423 index = dev->ifindex;
424 dev_put(dev);
427 lock_sock(sk);
428 sk->sk_bound_dev_if = index;
429 sk_dst_reset(sk);
430 release_sock(sk);
432 ret = 0;
434 out:
435 #endif
437 return ret;
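/* Userspace view (an illustrative sketch): binding a socket to eth0
 * requires CAP_NET_RAW and looks like
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0") + 1);
 *
 * while an empty name (or optlen == 0) clears the binding by setting
 * sk->sk_bound_dev_if back to 0.
 */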
440 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
442 if (valbool)
443 sock_set_flag(sk, bit);
444 else
445 sock_reset_flag(sk, bit);
449 * This is meant for all protocols to use and covers goings on
450 * at the socket level. Everything here is generic.
453 int sock_setsockopt(struct socket *sock, int level, int optname,
454 char __user *optval, unsigned int optlen)
456 struct sock *sk = sock->sk;
457 int val;
458 int valbool;
459 struct linger ling;
460 int ret = 0;
463 * Options without arguments
466 if (optname == SO_BINDTODEVICE)
467 return sock_bindtodevice(sk, optval, optlen);
469 if (optlen < sizeof(int))
470 return -EINVAL;
472 if (get_user(val, (int __user *)optval))
473 return -EFAULT;
475 valbool = val ? 1 : 0;
477 lock_sock(sk);
479 switch (optname) {
480 case SO_DEBUG:
481 if (val && !capable(CAP_NET_ADMIN))
482 ret = -EACCES;
483 else
484 sock_valbool_flag(sk, SOCK_DBG, valbool);
485 break;
486 case SO_REUSEADDR:
487 sk->sk_reuse = valbool;
488 break;
489 case SO_TYPE:
490 case SO_PROTOCOL:
491 case SO_DOMAIN:
492 case SO_ERROR:
493 ret = -ENOPROTOOPT;
494 break;
495 case SO_DONTROUTE:
496 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
497 break;
498 case SO_BROADCAST:
499 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
500 break;
501 case SO_SNDBUF:
502 		/* Don't error on this; BSD doesn't, and if you think
503 about it this is right. Otherwise apps have to
504 play 'guess the biggest size' games. RCVBUF/SNDBUF
505 are treated in BSD as hints */
507 if (val > sysctl_wmem_max)
508 val = sysctl_wmem_max;
509 set_sndbuf:
510 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
511 if ((val * 2) < SOCK_MIN_SNDBUF)
512 sk->sk_sndbuf = SOCK_MIN_SNDBUF;
513 else
514 sk->sk_sndbuf = val * 2;
517 * Wake up sending tasks if we
518 * upped the value.
520 sk->sk_write_space(sk);
521 break;
523 case SO_SNDBUFFORCE:
524 if (!capable(CAP_NET_ADMIN)) {
525 ret = -EPERM;
526 break;
528 goto set_sndbuf;
530 case SO_RCVBUF:
531 		/* Don't error on this; BSD doesn't, and if you think
532 about it this is right. Otherwise apps have to
533 play 'guess the biggest size' games. RCVBUF/SNDBUF
534 are treated in BSD as hints */
536 if (val > sysctl_rmem_max)
537 val = sysctl_rmem_max;
538 set_rcvbuf:
539 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
541 * We double it on the way in to account for
542 * "struct sk_buff" etc. overhead. Applications
543 * assume that the SO_RCVBUF setting they make will
544 * allow that much actual data to be received on that
545 * socket.
547 * Applications are unaware that "struct sk_buff" and
548 * other overheads allocate from the receive buffer
549 * during socket buffer allocation.
551 * And after considering the possible alternatives,
552 * returning the value we actually used in getsockopt
553 * is the most desirable behavior.
555 if ((val * 2) < SOCK_MIN_RCVBUF)
556 sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
557 else
558 sk->sk_rcvbuf = val * 2;
559 break;
561 case SO_RCVBUFFORCE:
562 if (!capable(CAP_NET_ADMIN)) {
563 ret = -EPERM;
564 break;
566 goto set_rcvbuf;
568 case SO_KEEPALIVE:
569 #ifdef CONFIG_INET
570 if (sk->sk_protocol == IPPROTO_TCP)
571 tcp_set_keepalive(sk, valbool);
572 #endif
573 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
574 break;
576 case SO_OOBINLINE:
577 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
578 break;
580 case SO_NO_CHECK:
581 sk->sk_no_check = valbool;
582 break;
584 case SO_PRIORITY:
585 if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
586 sk->sk_priority = val;
587 else
588 ret = -EPERM;
589 break;
591 case SO_LINGER:
592 if (optlen < sizeof(ling)) {
593 ret = -EINVAL; /* 1003.1g */
594 break;
596 if (copy_from_user(&ling, optval, sizeof(ling))) {
597 ret = -EFAULT;
598 break;
600 if (!ling.l_onoff)
601 sock_reset_flag(sk, SOCK_LINGER);
602 else {
603 #if (BITS_PER_LONG == 32)
604 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
605 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
606 else
607 #endif
608 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
609 sock_set_flag(sk, SOCK_LINGER);
611 break;
613 case SO_BSDCOMPAT:
614 sock_warn_obsolete_bsdism("setsockopt");
615 break;
617 case SO_PASSCRED:
618 if (valbool)
619 set_bit(SOCK_PASSCRED, &sock->flags);
620 else
621 clear_bit(SOCK_PASSCRED, &sock->flags);
622 break;
624 case SO_TIMESTAMP:
625 case SO_TIMESTAMPNS:
626 if (valbool) {
627 if (optname == SO_TIMESTAMP)
628 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
629 else
630 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
631 sock_set_flag(sk, SOCK_RCVTSTAMP);
632 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
633 } else {
634 sock_reset_flag(sk, SOCK_RCVTSTAMP);
635 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
637 break;
639 case SO_TIMESTAMPING:
640 if (val & ~SOF_TIMESTAMPING_MASK) {
641 ret = -EINVAL;
642 break;
644 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
645 val & SOF_TIMESTAMPING_TX_HARDWARE);
646 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
647 val & SOF_TIMESTAMPING_TX_SOFTWARE);
648 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
649 val & SOF_TIMESTAMPING_RX_HARDWARE);
650 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
651 sock_enable_timestamp(sk,
652 SOCK_TIMESTAMPING_RX_SOFTWARE);
653 else
654 sock_disable_timestamp(sk,
655 SOCK_TIMESTAMPING_RX_SOFTWARE);
656 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
657 val & SOF_TIMESTAMPING_SOFTWARE);
658 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
659 val & SOF_TIMESTAMPING_SYS_HARDWARE);
660 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
661 val & SOF_TIMESTAMPING_RAW_HARDWARE);
662 break;
664 case SO_RCVLOWAT:
665 if (val < 0)
666 val = INT_MAX;
667 sk->sk_rcvlowat = val ? : 1;
668 break;
670 case SO_RCVTIMEO:
671 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
672 break;
674 case SO_SNDTIMEO:
675 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
676 break;
678 case SO_ATTACH_FILTER:
679 ret = -EINVAL;
680 if (optlen == sizeof(struct sock_fprog)) {
681 struct sock_fprog fprog;
683 ret = -EFAULT;
684 if (copy_from_user(&fprog, optval, sizeof(fprog)))
685 break;
687 ret = sk_attach_filter(&fprog, sk);
689 break;
691 case SO_DETACH_FILTER:
692 ret = sk_detach_filter(sk);
693 break;
695 case SO_PASSSEC:
696 if (valbool)
697 set_bit(SOCK_PASSSEC, &sock->flags);
698 else
699 clear_bit(SOCK_PASSSEC, &sock->flags);
700 break;
701 case SO_MARK:
702 if (!capable(CAP_NET_ADMIN))
703 ret = -EPERM;
704 else
705 sk->sk_mark = val;
706 break;
708 /* We implement the SO_SNDLOWAT etc to
709 not be settable (1003.1g 5.3) */
710 case SO_RXQ_OVFL:
711 if (valbool)
712 sock_set_flag(sk, SOCK_RXQ_OVFL);
713 else
714 sock_reset_flag(sk, SOCK_RXQ_OVFL);
715 break;
716 default:
717 ret = -ENOPROTOOPT;
718 break;
720 release_sock(sk);
721 return ret;
723 EXPORT_SYMBOL(sock_setsockopt);
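/* Illustrative userspace round trip (a sketch with default sysctls and a
 * hypothetical socket fd):
 *
 *	int val = 65536;
 *	socklen_t len = sizeof(val);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, &len);
 *
 * getsockopt() will typically report 131072, twice the requested value,
 * because SO_RCVBUF/SO_SNDBUF are doubled to cover struct sk_buff and
 * other bookkeeping overhead (see the SO_RCVBUF comment above). Requests
 * above sysctl_rmem_max are clamped unless SO_RCVBUFFORCE is used by a
 * CAP_NET_ADMIN process.
 */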
726 int sock_getsockopt(struct socket *sock, int level, int optname,
727 char __user *optval, int __user *optlen)
729 struct sock *sk = sock->sk;
731 union {
732 int val;
733 struct linger ling;
734 struct timeval tm;
735 } v;
737 unsigned int lv = sizeof(int);
738 int len;
740 if (get_user(len, optlen))
741 return -EFAULT;
742 if (len < 0)
743 return -EINVAL;
745 memset(&v, 0, sizeof(v));
747 switch (optname) {
748 case SO_DEBUG:
749 v.val = sock_flag(sk, SOCK_DBG);
750 break;
752 case SO_DONTROUTE:
753 v.val = sock_flag(sk, SOCK_LOCALROUTE);
754 break;
756 case SO_BROADCAST:
757 v.val = !!sock_flag(sk, SOCK_BROADCAST);
758 break;
760 case SO_SNDBUF:
761 v.val = sk->sk_sndbuf;
762 break;
764 case SO_RCVBUF:
765 v.val = sk->sk_rcvbuf;
766 break;
768 case SO_REUSEADDR:
769 v.val = sk->sk_reuse;
770 break;
772 case SO_KEEPALIVE:
773 v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
774 break;
776 case SO_TYPE:
777 v.val = sk->sk_type;
778 break;
780 case SO_PROTOCOL:
781 v.val = sk->sk_protocol;
782 break;
784 case SO_DOMAIN:
785 v.val = sk->sk_family;
786 break;
788 case SO_ERROR:
789 v.val = -sock_error(sk);
790 if (v.val == 0)
791 v.val = xchg(&sk->sk_err_soft, 0);
792 break;
794 case SO_OOBINLINE:
795 v.val = !!sock_flag(sk, SOCK_URGINLINE);
796 break;
798 case SO_NO_CHECK:
799 v.val = sk->sk_no_check;
800 break;
802 case SO_PRIORITY:
803 v.val = sk->sk_priority;
804 break;
806 case SO_LINGER:
807 lv = sizeof(v.ling);
808 v.ling.l_onoff = !!sock_flag(sk, SOCK_LINGER);
809 v.ling.l_linger = sk->sk_lingertime / HZ;
810 break;
812 case SO_BSDCOMPAT:
813 sock_warn_obsolete_bsdism("getsockopt");
814 break;
816 case SO_TIMESTAMP:
817 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
818 !sock_flag(sk, SOCK_RCVTSTAMPNS);
819 break;
821 case SO_TIMESTAMPNS:
822 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
823 break;
825 case SO_TIMESTAMPING:
826 v.val = 0;
827 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
828 v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
829 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
830 v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
831 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
832 v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
833 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
834 v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
835 if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
836 v.val |= SOF_TIMESTAMPING_SOFTWARE;
837 if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
838 v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
839 if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
840 v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
841 break;
843 case SO_RCVTIMEO:
844 lv = sizeof(struct timeval);
845 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
846 v.tm.tv_sec = 0;
847 v.tm.tv_usec = 0;
848 } else {
849 v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
850 v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
852 break;
854 case SO_SNDTIMEO:
855 lv = sizeof(struct timeval);
856 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
857 v.tm.tv_sec = 0;
858 v.tm.tv_usec = 0;
859 } else {
860 v.tm.tv_sec = sk->sk_sndtimeo / HZ;
861 v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
863 break;
865 case SO_RCVLOWAT:
866 v.val = sk->sk_rcvlowat;
867 break;
869 case SO_SNDLOWAT:
870 v.val = 1;
871 break;
873 case SO_PASSCRED:
874 v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
875 break;
877 case SO_PEERCRED:
878 if (len > sizeof(sk->sk_peercred))
879 len = sizeof(sk->sk_peercred);
880 if (copy_to_user(optval, &sk->sk_peercred, len))
881 return -EFAULT;
882 goto lenout;
884 case SO_PEERNAME:
886 char address[128];
888 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
889 return -ENOTCONN;
890 if (lv < len)
891 return -EINVAL;
892 if (copy_to_user(optval, address, len))
893 return -EFAULT;
894 goto lenout;
897 /* Dubious BSD thing... Probably nobody even uses it, but
898 * the UNIX standard wants it for whatever reason... -DaveM
900 case SO_ACCEPTCONN:
901 v.val = sk->sk_state == TCP_LISTEN;
902 break;
904 case SO_PASSSEC:
905 v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
906 break;
908 case SO_PEERSEC:
909 return security_socket_getpeersec_stream(sock, optval, optlen, len);
911 case SO_MARK:
912 v.val = sk->sk_mark;
913 break;
915 case SO_RXQ_OVFL:
916 v.val = !!sock_flag(sk, SOCK_RXQ_OVFL);
917 break;
919 default:
920 return -ENOPROTOOPT;
923 if (len > lv)
924 len = lv;
925 if (copy_to_user(optval, &v, len))
926 return -EFAULT;
927 lenout:
928 if (put_user(len, optlen))
929 return -EFAULT;
930 return 0;
934 * Initialize an sk_lock.
936 * (We also register the sk_lock with the lock validator.)
938 static inline void sock_lock_init(struct sock *sk)
940 sock_lock_init_class_and_name(sk,
941 af_family_slock_key_strings[sk->sk_family],
942 af_family_slock_keys + sk->sk_family,
943 af_family_key_strings[sk->sk_family],
944 af_family_keys + sk->sk_family);
948 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
949  * even temporarily, because of RCU lookups. sk_node should also be left as is.
951 static void sock_copy(struct sock *nsk, const struct sock *osk)
953 #ifdef CONFIG_SECURITY_NETWORK
954 void *sptr = nsk->sk_security;
955 #endif
956 BUILD_BUG_ON(offsetof(struct sock, sk_copy_start) !=
957 sizeof(osk->sk_node) + sizeof(osk->sk_refcnt));
958 memcpy(&nsk->sk_copy_start, &osk->sk_copy_start,
959 osk->sk_prot->obj_size - offsetof(struct sock, sk_copy_start));
960 #ifdef CONFIG_SECURITY_NETWORK
961 nsk->sk_security = sptr;
962 security_sk_clone(osk, nsk);
963 #endif
966 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
967 int family)
969 struct sock *sk;
970 struct kmem_cache *slab;
972 slab = prot->slab;
973 if (slab != NULL) {
974 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
975 if (!sk)
976 return sk;
977 if (priority & __GFP_ZERO) {
979 			 * caches using SLAB_DESTROY_BY_RCU should leave
980 			 * sk_node.next unmodified. Special care is taken
981 * when initializing object to zero.
983 if (offsetof(struct sock, sk_node.next) != 0)
984 memset(sk, 0, offsetof(struct sock, sk_node.next));
985 memset(&sk->sk_node.pprev, 0,
986 prot->obj_size - offsetof(struct sock,
987 sk_node.pprev));
990 else
991 sk = kmalloc(prot->obj_size, priority);
993 if (sk != NULL) {
994 kmemcheck_annotate_bitfield(sk, flags);
996 if (security_sk_alloc(sk, family, priority))
997 goto out_free;
999 if (!try_module_get(prot->owner))
1000 goto out_free_sec;
1003 return sk;
1005 out_free_sec:
1006 security_sk_free(sk);
1007 out_free:
1008 if (slab != NULL)
1009 kmem_cache_free(slab, sk);
1010 else
1011 kfree(sk);
1012 return NULL;
1015 static void sk_prot_free(struct proto *prot, struct sock *sk)
1017 struct kmem_cache *slab;
1018 struct module *owner;
1020 owner = prot->owner;
1021 slab = prot->slab;
1023 security_sk_free(sk);
1024 if (slab != NULL)
1025 kmem_cache_free(slab, sk);
1026 else
1027 kfree(sk);
1028 module_put(owner);
1032 * sk_alloc - All socket objects are allocated here
1033 * @net: the applicable net namespace
1034 * @family: protocol family
1035 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1036 * @prot: struct proto associated with this new sock instance
1038 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1039 struct proto *prot)
1041 struct sock *sk;
1043 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1044 if (sk) {
1045 sk->sk_family = family;
1047 * See comment in struct sock definition to understand
1048 * why we need sk_prot_creator -acme
1050 sk->sk_prot = sk->sk_prot_creator = prot;
1051 sock_lock_init(sk);
1052 sock_net_set(sk, get_net(net));
1053 atomic_set(&sk->sk_wmem_alloc, 1);
1056 return sk;
1058 EXPORT_SYMBOL(sk_alloc);
1060 static void __sk_free(struct sock *sk)
1062 struct sk_filter *filter;
1064 if (sk->sk_destruct)
1065 sk->sk_destruct(sk);
1067 filter = rcu_dereference(sk->sk_filter);
1068 if (filter) {
1069 sk_filter_uncharge(sk, filter);
1070 rcu_assign_pointer(sk->sk_filter, NULL);
1073 sock_disable_timestamp(sk, SOCK_TIMESTAMP);
1074 sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE);
1076 if (atomic_read(&sk->sk_omem_alloc))
1077 printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
1078 __func__, atomic_read(&sk->sk_omem_alloc));
1080 put_net(sock_net(sk));
1081 sk_prot_free(sk->sk_prot_creator, sk);
1084 void sk_free(struct sock *sk)
1087 	 * We subtract one from sk_wmem_alloc so we can tell whether
1088 	 * some packets are still in some tx queue.
1089 	 * If it is not zero, sock_wfree() will call __sk_free(sk) later.
1091 if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1092 __sk_free(sk);
1094 EXPORT_SYMBOL(sk_free);
1097  * The last sock_put should drop the reference to sk->sk_net. It has already
1098  * been dropped in sk_change_net. Taking a reference to the stopping namespace
1099  * is not an option.
1100  * Take a reference to the socket to remove it from the hash _alive_ and after
1101  * that destroy it in the context of init_net.
1103 void sk_release_kernel(struct sock *sk)
1105 if (sk == NULL || sk->sk_socket == NULL)
1106 return;
1108 sock_hold(sk);
1109 sock_release(sk->sk_socket);
1110 release_net(sock_net(sk));
1111 sock_net_set(sk, get_net(&init_net));
1112 sock_put(sk);
1114 EXPORT_SYMBOL(sk_release_kernel);
1116 struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
1118 struct sock *newsk;
1120 newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1121 if (newsk != NULL) {
1122 struct sk_filter *filter;
1124 sock_copy(newsk, sk);
1126 /* SANITY */
1127 get_net(sock_net(newsk));
1128 sk_node_init(&newsk->sk_node);
1129 sock_lock_init(newsk);
1130 bh_lock_sock(newsk);
1131 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
1133 atomic_set(&newsk->sk_rmem_alloc, 0);
1135 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1137 atomic_set(&newsk->sk_wmem_alloc, 1);
1138 atomic_set(&newsk->sk_omem_alloc, 0);
1139 skb_queue_head_init(&newsk->sk_receive_queue);
1140 skb_queue_head_init(&newsk->sk_write_queue);
1141 #ifdef CONFIG_NET_DMA
1142 skb_queue_head_init(&newsk->sk_async_wait_queue);
1143 #endif
1145 rwlock_init(&newsk->sk_dst_lock);
1146 rwlock_init(&newsk->sk_callback_lock);
1147 lockdep_set_class_and_name(&newsk->sk_callback_lock,
1148 af_callback_keys + newsk->sk_family,
1149 af_family_clock_key_strings[newsk->sk_family]);
1151 newsk->sk_dst_cache = NULL;
1152 newsk->sk_wmem_queued = 0;
1153 newsk->sk_forward_alloc = 0;
1154 newsk->sk_send_head = NULL;
1155 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1157 sock_reset_flag(newsk, SOCK_DONE);
1158 skb_queue_head_init(&newsk->sk_error_queue);
1160 filter = newsk->sk_filter;
1161 if (filter != NULL)
1162 sk_filter_charge(newsk, filter);
1164 if (unlikely(xfrm_sk_clone_policy(newsk))) {
1165 /* It is still raw copy of parent, so invalidate
1166 * destructor and make plain sk_free() */
1167 newsk->sk_destruct = NULL;
1168 sk_free(newsk);
1169 newsk = NULL;
1170 goto out;
1173 newsk->sk_err = 0;
1174 newsk->sk_priority = 0;
1176 * Before updating sk_refcnt, we must commit prior changes to memory
1177 * (Documentation/RCU/rculist_nulls.txt for details)
1179 smp_wmb();
1180 atomic_set(&newsk->sk_refcnt, 2);
1183 * Increment the counter in the same struct proto as the master
1184 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1185 * is the same as sk->sk_prot->socks, as this field was copied
1186 * with memcpy).
1188 * This _changes_ the previous behaviour, where
1189 * tcp_create_openreq_child always was incrementing the
1190 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1191 * to be taken into account in all callers. -acme
1193 sk_refcnt_debug_inc(newsk);
1194 sk_set_socket(newsk, NULL);
1195 newsk->sk_sleep = NULL;
1197 if (newsk->sk_prot->sockets_allocated)
1198 percpu_counter_inc(newsk->sk_prot->sockets_allocated);
1200 out:
1201 return newsk;
1203 EXPORT_SYMBOL_GPL(sk_clone);
1205 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1207 __sk_dst_set(sk, dst);
1208 sk->sk_route_caps = dst->dev->features;
1209 if (sk->sk_route_caps & NETIF_F_GSO)
1210 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1211 if (sk_can_gso(sk)) {
1212 if (dst->header_len) {
1213 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1214 } else {
1215 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1216 sk->sk_gso_max_size = dst->dev->gso_max_size;
1220 EXPORT_SYMBOL_GPL(sk_setup_caps);
1222 void __init sk_init(void)
1224 if (totalram_pages <= 4096) {
1225 sysctl_wmem_max = 32767;
1226 sysctl_rmem_max = 32767;
1227 sysctl_wmem_default = 32767;
1228 sysctl_rmem_default = 32767;
1229 } else if (totalram_pages >= 131072) {
1230 sysctl_wmem_max = 131071;
1231 sysctl_rmem_max = 131071;
1236 * Simple resource managers for sockets.
1241 * Write buffer destructor automatically called from kfree_skb.
1243 void sock_wfree(struct sk_buff *skb)
1245 struct sock *sk = skb->sk;
1246 unsigned int len = skb->truesize;
1248 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1250 * Keep a reference on sk_wmem_alloc, this will be released
1251 * after sk_write_space() call
1253 atomic_sub(len - 1, &sk->sk_wmem_alloc);
1254 sk->sk_write_space(sk);
1255 len = 1;
1258 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1259 * could not do because of in-flight packets
1261 if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1262 __sk_free(sk);
1264 EXPORT_SYMBOL(sock_wfree);
1267 * Read buffer destructor automatically called from kfree_skb.
1269 void sock_rfree(struct sk_buff *skb)
1271 struct sock *sk = skb->sk;
1273 atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
1274 sk_mem_uncharge(skb->sk, skb->truesize);
1276 EXPORT_SYMBOL(sock_rfree);
1279 int sock_i_uid(struct sock *sk)
1281 int uid;
1283 read_lock(&sk->sk_callback_lock);
1284 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1285 read_unlock(&sk->sk_callback_lock);
1286 return uid;
1288 EXPORT_SYMBOL(sock_i_uid);
1290 unsigned long sock_i_ino(struct sock *sk)
1292 unsigned long ino;
1294 read_lock(&sk->sk_callback_lock);
1295 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1296 read_unlock(&sk->sk_callback_lock);
1297 return ino;
1299 EXPORT_SYMBOL(sock_i_ino);
1302 * Allocate a skb from the socket's send buffer.
1304 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1305 gfp_t priority)
1307 if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1308 struct sk_buff *skb = alloc_skb(size, priority);
1309 if (skb) {
1310 skb_set_owner_w(skb, sk);
1311 return skb;
1314 return NULL;
1316 EXPORT_SYMBOL(sock_wmalloc);
1319 * Allocate a skb from the socket's receive buffer.
1321 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1322 gfp_t priority)
1324 if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1325 struct sk_buff *skb = alloc_skb(size, priority);
1326 if (skb) {
1327 skb_set_owner_r(skb, sk);
1328 return skb;
1331 return NULL;
1335 * Allocate a memory block from the socket's option memory buffer.
1337 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1339 if ((unsigned)size <= sysctl_optmem_max &&
1340 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1341 void *mem;
1342 /* First do the add, to avoid the race if kmalloc
1343 * might sleep.
1345 atomic_add(size, &sk->sk_omem_alloc);
1346 mem = kmalloc(size, priority);
1347 if (mem)
1348 return mem;
1349 atomic_sub(size, &sk->sk_omem_alloc);
1351 return NULL;
1353 EXPORT_SYMBOL(sock_kmalloc);
1356 * Free an option memory block.
1358 void sock_kfree_s(struct sock *sk, void *mem, int size)
1360 kfree(mem);
1361 atomic_sub(size, &sk->sk_omem_alloc);
1363 EXPORT_SYMBOL(sock_kfree_s);
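/* Usage sketch (the struct name below is hypothetical): sock_kmalloc()
 * and sock_kfree_s() are meant to be used as a pair so that per-socket
 * option memory stays accounted in sk_omem_alloc and bounded by
 * sysctl_optmem_max:
 *
 *	struct my_opt *opt = sock_kmalloc(sk, sizeof(*opt), GFP_KERNEL);
 *	if (!opt)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, sizeof(*opt));
 *
 * The size passed to sock_kfree_s() must match the size charged by
 * sock_kmalloc(), otherwise sk_omem_alloc drifts.
 */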
1365 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1366    I think these locks should be removed for datagram sockets.
1368 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1370 DEFINE_WAIT(wait);
1372 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1373 for (;;) {
1374 if (!timeo)
1375 break;
1376 if (signal_pending(current))
1377 break;
1378 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1379 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1380 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1381 break;
1382 if (sk->sk_shutdown & SEND_SHUTDOWN)
1383 break;
1384 if (sk->sk_err)
1385 break;
1386 timeo = schedule_timeout(timeo);
1388 finish_wait(sk->sk_sleep, &wait);
1389 return timeo;
1394 * Generic send/receive buffer handlers
1397 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1398 unsigned long data_len, int noblock,
1399 int *errcode)
1401 struct sk_buff *skb;
1402 gfp_t gfp_mask;
1403 long timeo;
1404 int err;
1406 gfp_mask = sk->sk_allocation;
1407 if (gfp_mask & __GFP_WAIT)
1408 gfp_mask |= __GFP_REPEAT;
1410 timeo = sock_sndtimeo(sk, noblock);
1411 while (1) {
1412 err = sock_error(sk);
1413 if (err != 0)
1414 goto failure;
1416 err = -EPIPE;
1417 if (sk->sk_shutdown & SEND_SHUTDOWN)
1418 goto failure;
1420 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1421 skb = alloc_skb(header_len, gfp_mask);
1422 if (skb) {
1423 int npages;
1424 int i;
1426 /* No pages, we're done... */
1427 if (!data_len)
1428 break;
1430 npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1431 skb->truesize += data_len;
1432 skb_shinfo(skb)->nr_frags = npages;
1433 for (i = 0; i < npages; i++) {
1434 struct page *page;
1435 skb_frag_t *frag;
1437 page = alloc_pages(sk->sk_allocation, 0);
1438 if (!page) {
1439 err = -ENOBUFS;
1440 skb_shinfo(skb)->nr_frags = i;
1441 kfree_skb(skb);
1442 goto failure;
1445 frag = &skb_shinfo(skb)->frags[i];
1446 frag->page = page;
1447 frag->page_offset = 0;
1448 frag->size = (data_len >= PAGE_SIZE ?
1449 PAGE_SIZE :
1450 data_len);
1451 data_len -= PAGE_SIZE;
1454 /* Full success... */
1455 break;
1457 err = -ENOBUFS;
1458 goto failure;
1460 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1461 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1462 err = -EAGAIN;
1463 if (!timeo)
1464 goto failure;
1465 if (signal_pending(current))
1466 goto interrupted;
1467 timeo = sock_wait_for_wmem(sk, timeo);
1470 skb_set_owner_w(skb, sk);
1471 return skb;
1473 interrupted:
1474 err = sock_intr_errno(timeo);
1475 failure:
1476 *errcode = err;
1477 return NULL;
1479 EXPORT_SYMBOL(sock_alloc_send_pskb);
1481 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1482 int noblock, int *errcode)
1484 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1486 EXPORT_SYMBOL(sock_alloc_send_skb);
1488 static void __lock_sock(struct sock *sk)
1490 DEFINE_WAIT(wait);
1492 for (;;) {
1493 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1494 TASK_UNINTERRUPTIBLE);
1495 spin_unlock_bh(&sk->sk_lock.slock);
1496 schedule();
1497 spin_lock_bh(&sk->sk_lock.slock);
1498 if (!sock_owned_by_user(sk))
1499 break;
1501 finish_wait(&sk->sk_lock.wq, &wait);
1504 static void __release_sock(struct sock *sk)
1506 struct sk_buff *skb = sk->sk_backlog.head;
1508 do {
1509 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1510 bh_unlock_sock(sk);
1512 do {
1513 struct sk_buff *next = skb->next;
1515 skb->next = NULL;
1516 sk_backlog_rcv(sk, skb);
1519 * We are in process context here with softirqs
1520 * disabled, use cond_resched_softirq() to preempt.
1521 * This is safe to do because we've taken the backlog
1522 * queue private:
1524 cond_resched_softirq();
1526 skb = next;
1527 } while (skb != NULL);
1529 bh_lock_sock(sk);
1530 } while ((skb = sk->sk_backlog.head) != NULL);
1534 * sk_wait_data - wait for data to arrive at sk_receive_queue
1535 * @sk: sock to wait on
1536 * @timeo: for how long
1538 * Now socket state including sk->sk_err is changed only under lock,
1539 * hence we may omit checks after joining wait queue.
1540 * We check receive queue before schedule() only as optimization;
1541 * it is very likely that release_sock() added new data.
1543 int sk_wait_data(struct sock *sk, long *timeo)
1545 int rc;
1546 DEFINE_WAIT(wait);
1548 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1549 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1550 rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1551 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1552 finish_wait(sk->sk_sleep, &wait);
1553 return rc;
1555 EXPORT_SYMBOL(sk_wait_data);
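/* Typical receive-path pattern (a sketch, not a verbatim caller): a
 * protocol's recvmsg() blocks for data roughly like
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *
 *	while (skb_queue_empty(&sk->sk_receive_queue)) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		if (signal_pending(current))
 *			return sock_intr_errno(timeo);
 *		sk_wait_data(sk, &timeo);
 *	}
 *
 * sk_wait_data() must be called with the socket lock held; the wait in
 * sk_wait_event() drops and re-takes it around the schedule.
 */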
1558 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1559 * @sk: socket
1560 * @size: memory size to allocate
1561 * @kind: allocation type
1563 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1564 * rmem allocation. This function assumes that protocols which have
1565 * memory_pressure use sk_wmem_queued as write buffer accounting.
1567 int __sk_mem_schedule(struct sock *sk, int size, int kind)
1569 struct proto *prot = sk->sk_prot;
1570 int amt = sk_mem_pages(size);
1571 int allocated;
1573 sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1574 allocated = atomic_add_return(amt, prot->memory_allocated);
1576 /* Under limit. */
1577 if (allocated <= prot->sysctl_mem[0]) {
1578 if (prot->memory_pressure && *prot->memory_pressure)
1579 *prot->memory_pressure = 0;
1580 return 1;
1583 /* Under pressure. */
1584 if (allocated > prot->sysctl_mem[1])
1585 if (prot->enter_memory_pressure)
1586 prot->enter_memory_pressure(sk);
1588 /* Over hard limit. */
1589 if (allocated > prot->sysctl_mem[2])
1590 goto suppress_allocation;
1592 /* guarantee minimum buffer size under pressure */
1593 if (kind == SK_MEM_RECV) {
1594 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1595 return 1;
1596 } else { /* SK_MEM_SEND */
1597 if (sk->sk_type == SOCK_STREAM) {
1598 if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1599 return 1;
1600 } else if (atomic_read(&sk->sk_wmem_alloc) <
1601 prot->sysctl_wmem[0])
1602 return 1;
1605 if (prot->memory_pressure) {
1606 int alloc;
1608 if (!*prot->memory_pressure)
1609 return 1;
1610 alloc = percpu_counter_read_positive(prot->sockets_allocated);
1611 if (prot->sysctl_mem[2] > alloc *
1612 sk_mem_pages(sk->sk_wmem_queued +
1613 atomic_read(&sk->sk_rmem_alloc) +
1614 sk->sk_forward_alloc))
1615 return 1;
1618 suppress_allocation:
1620 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1621 sk_stream_moderate_sndbuf(sk);
1623 /* Fail only if socket is _under_ its sndbuf.
1624 * In this case we cannot block, so that we have to fail.
1626 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1627 return 1;
1630 /* Alas. Undo changes. */
1631 sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1632 atomic_sub(amt, prot->memory_allocated);
1633 return 0;
1635 EXPORT_SYMBOL(__sk_mem_schedule);
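/* Worked example (a sketch, assuming SK_MEM_QUANTUM == PAGE_SIZE == 4096):
 * scheduling memory for a 3000 byte skb uses sk_mem_pages(3000) == 1, so
 * sk_forward_alloc grows by 4096 and prot->memory_allocated by one page.
 * After sk_mem_charge() subtracts the 3000 bytes, later small charges can
 * be served from the 1096 bytes left in sk_forward_alloc without touching
 * the global counter, which is the point of the forward allocation.
 */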
1638  *	__sk_mem_reclaim - reclaim memory_allocated
1639 * @sk: socket
1641 void __sk_mem_reclaim(struct sock *sk)
1643 struct proto *prot = sk->sk_prot;
1645 atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
1646 prot->memory_allocated);
1647 sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1649 if (prot->memory_pressure && *prot->memory_pressure &&
1650 (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
1651 *prot->memory_pressure = 0;
1653 EXPORT_SYMBOL(__sk_mem_reclaim);
1657 * Set of default routines for initialising struct proto_ops when
1658 * the protocol does not support a particular function. In certain
1659 * cases where it makes no sense for a protocol to have a "do nothing"
1660 * function, some default processing is provided.
1663 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1665 return -EOPNOTSUPP;
1667 EXPORT_SYMBOL(sock_no_bind);
1669 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1670 int len, int flags)
1672 return -EOPNOTSUPP;
1674 EXPORT_SYMBOL(sock_no_connect);
1676 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1678 return -EOPNOTSUPP;
1680 EXPORT_SYMBOL(sock_no_socketpair);
1682 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1684 return -EOPNOTSUPP;
1686 EXPORT_SYMBOL(sock_no_accept);
1688 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1689 int *len, int peer)
1691 return -EOPNOTSUPP;
1693 EXPORT_SYMBOL(sock_no_getname);
1695 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
1697 return 0;
1699 EXPORT_SYMBOL(sock_no_poll);
1701 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1703 return -EOPNOTSUPP;
1705 EXPORT_SYMBOL(sock_no_ioctl);
1707 int sock_no_listen(struct socket *sock, int backlog)
1709 return -EOPNOTSUPP;
1711 EXPORT_SYMBOL(sock_no_listen);
1713 int sock_no_shutdown(struct socket *sock, int how)
1715 return -EOPNOTSUPP;
1717 EXPORT_SYMBOL(sock_no_shutdown);
1719 int sock_no_setsockopt(struct socket *sock, int level, int optname,
1720 char __user *optval, unsigned int optlen)
1722 return -EOPNOTSUPP;
1724 EXPORT_SYMBOL(sock_no_setsockopt);
1726 int sock_no_getsockopt(struct socket *sock, int level, int optname,
1727 char __user *optval, int __user *optlen)
1729 return -EOPNOTSUPP;
1731 EXPORT_SYMBOL(sock_no_getsockopt);
1733 int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1734 size_t len)
1736 return -EOPNOTSUPP;
1738 EXPORT_SYMBOL(sock_no_sendmsg);
1740 int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1741 size_t len, int flags)
1743 return -EOPNOTSUPP;
1745 EXPORT_SYMBOL(sock_no_recvmsg);
1747 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1749 /* Mirror missing mmap method error code */
1750 return -ENODEV;
1752 EXPORT_SYMBOL(sock_no_mmap);
1754 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1756 ssize_t res;
1757 struct msghdr msg = {.msg_flags = flags};
1758 struct kvec iov;
1759 char *kaddr = kmap(page);
1760 iov.iov_base = kaddr + offset;
1761 iov.iov_len = size;
1762 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1763 kunmap(page);
1764 return res;
1766 EXPORT_SYMBOL(sock_no_sendpage);
1769 * Default Socket Callbacks
1772 static void sock_def_wakeup(struct sock *sk)
1774 read_lock(&sk->sk_callback_lock);
1775 if (sk_has_sleeper(sk))
1776 wake_up_interruptible_all(sk->sk_sleep);
1777 read_unlock(&sk->sk_callback_lock);
1780 static void sock_def_error_report(struct sock *sk)
1782 read_lock(&sk->sk_callback_lock);
1783 if (sk_has_sleeper(sk))
1784 wake_up_interruptible_poll(sk->sk_sleep, POLLERR);
1785 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
1786 read_unlock(&sk->sk_callback_lock);
1789 static void sock_def_readable(struct sock *sk, int len)
1791 read_lock(&sk->sk_callback_lock);
1792 if (sk_has_sleeper(sk))
1793 wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN |
1794 POLLRDNORM | POLLRDBAND);
1795 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
1796 read_unlock(&sk->sk_callback_lock);
1799 static void sock_def_write_space(struct sock *sk)
1801 read_lock(&sk->sk_callback_lock);
1803 /* Do not wake up a writer until he can make "significant"
1804 * progress. --DaveM
1806 if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1807 if (sk_has_sleeper(sk))
1808 wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT |
1809 POLLWRNORM | POLLWRBAND);
1811 /* Should agree with poll, otherwise some programs break */
1812 if (sock_writeable(sk))
1813 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
1816 read_unlock(&sk->sk_callback_lock);
1819 static void sock_def_destruct(struct sock *sk)
1821 kfree(sk->sk_protinfo);
1824 void sk_send_sigurg(struct sock *sk)
1826 if (sk->sk_socket && sk->sk_socket->file)
1827 if (send_sigurg(&sk->sk_socket->file->f_owner))
1828 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
1830 EXPORT_SYMBOL(sk_send_sigurg);
1832 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
1833 unsigned long expires)
1835 if (!mod_timer(timer, expires))
1836 sock_hold(sk);
1838 EXPORT_SYMBOL(sk_reset_timer);
1840 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
1842 if (timer_pending(timer) && del_timer(timer))
1843 __sock_put(sk);
1845 EXPORT_SYMBOL(sk_stop_timer);
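/* Reference-counting sketch: sk_reset_timer() takes a sock_hold() only
 * when it arms a timer that was not already pending, and the matching
 * __sock_put() comes either from sk_stop_timer() or from the timer
 * callback once it has fired. A protocol using these helpers therefore
 * keeps the sock alive exactly as long as a timer may still reference it.
 */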
1847 void sock_init_data(struct socket *sock, struct sock *sk)
1849 skb_queue_head_init(&sk->sk_receive_queue);
1850 skb_queue_head_init(&sk->sk_write_queue);
1851 skb_queue_head_init(&sk->sk_error_queue);
1852 #ifdef CONFIG_NET_DMA
1853 skb_queue_head_init(&sk->sk_async_wait_queue);
1854 #endif
1856 sk->sk_send_head = NULL;
1858 init_timer(&sk->sk_timer);
1860 sk->sk_allocation = GFP_KERNEL;
1861 sk->sk_rcvbuf = sysctl_rmem_default;
1862 sk->sk_sndbuf = sysctl_wmem_default;
1863 sk->sk_state = TCP_CLOSE;
1864 sk_set_socket(sk, sock);
1866 sock_set_flag(sk, SOCK_ZAPPED);
1868 if (sock) {
1869 sk->sk_type = sock->type;
1870 sk->sk_sleep = &sock->wait;
1871 sock->sk = sk;
1872 } else
1873 sk->sk_sleep = NULL;
1875 rwlock_init(&sk->sk_dst_lock);
1876 rwlock_init(&sk->sk_callback_lock);
1877 lockdep_set_class_and_name(&sk->sk_callback_lock,
1878 af_callback_keys + sk->sk_family,
1879 af_family_clock_key_strings[sk->sk_family]);
1881 sk->sk_state_change = sock_def_wakeup;
1882 sk->sk_data_ready = sock_def_readable;
1883 sk->sk_write_space = sock_def_write_space;
1884 sk->sk_error_report = sock_def_error_report;
1885 sk->sk_destruct = sock_def_destruct;
1887 sk->sk_sndmsg_page = NULL;
1888 sk->sk_sndmsg_off = 0;
1890 sk->sk_peercred.pid = 0;
1891 sk->sk_peercred.uid = -1;
1892 sk->sk_peercred.gid = -1;
1893 sk->sk_write_pending = 0;
1894 sk->sk_rcvlowat = 1;
1895 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
1896 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
1898 sk->sk_stamp = ktime_set(-1L, 0);
1901 * Before updating sk_refcnt, we must commit prior changes to memory
1902 * (Documentation/RCU/rculist_nulls.txt for details)
1904 smp_wmb();
1905 atomic_set(&sk->sk_refcnt, 1);
1906 atomic_set(&sk->sk_drops, 0);
1908 EXPORT_SYMBOL(sock_init_data);
1910 void lock_sock_nested(struct sock *sk, int subclass)
1912 might_sleep();
1913 spin_lock_bh(&sk->sk_lock.slock);
1914 if (sk->sk_lock.owned)
1915 __lock_sock(sk);
1916 sk->sk_lock.owned = 1;
1917 spin_unlock(&sk->sk_lock.slock);
1919 * The sk_lock has mutex_lock() semantics here:
1921 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
1922 local_bh_enable();
1924 EXPORT_SYMBOL(lock_sock_nested);
1926 void release_sock(struct sock *sk)
1929 * The sk_lock has mutex_unlock() semantics:
1931 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
1933 spin_lock_bh(&sk->sk_lock.slock);
1934 if (sk->sk_backlog.tail)
1935 __release_sock(sk);
1936 sk->sk_lock.owned = 0;
1937 if (waitqueue_active(&sk->sk_lock.wq))
1938 wake_up(&sk->sk_lock.wq);
1939 spin_unlock_bh(&sk->sk_lock.slock);
1941 EXPORT_SYMBOL(release_sock);
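/* Canonical locking pattern (a sketch): process-context code brackets
 * socket state changes with
 *
 *	lock_sock(sk);
 *	... modify sk state, possibly sleeping ...
 *	release_sock(sk);
 *
 * while softirq context uses bh_lock_sock()/bh_unlock_sock() and, if the
 * socket is owned by a process, queues the packet on sk->sk_backlog so
 * that release_sock() can process it via __release_sock().
 */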
1943 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
1945 struct timeval tv;
1946 if (!sock_flag(sk, SOCK_TIMESTAMP))
1947 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
1948 tv = ktime_to_timeval(sk->sk_stamp);
1949 if (tv.tv_sec == -1)
1950 return -ENOENT;
1951 if (tv.tv_sec == 0) {
1952 sk->sk_stamp = ktime_get_real();
1953 tv = ktime_to_timeval(sk->sk_stamp);
1955 return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
1957 EXPORT_SYMBOL(sock_get_timestamp);
1959 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
1961 struct timespec ts;
1962 if (!sock_flag(sk, SOCK_TIMESTAMP))
1963 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
1964 ts = ktime_to_timespec(sk->sk_stamp);
1965 if (ts.tv_sec == -1)
1966 return -ENOENT;
1967 if (ts.tv_sec == 0) {
1968 sk->sk_stamp = ktime_get_real();
1969 ts = ktime_to_timespec(sk->sk_stamp);
1971 return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
1973 EXPORT_SYMBOL(sock_get_timestampns);
1975 void sock_enable_timestamp(struct sock *sk, int flag)
1977 if (!sock_flag(sk, flag)) {
1978 sock_set_flag(sk, flag);
1980 * we just set one of the two flags which require net
1981 * time stamping, but time stamping might have been on
1982 * already because of the other one
1984 if (!sock_flag(sk,
1985 flag == SOCK_TIMESTAMP ?
1986 SOCK_TIMESTAMPING_RX_SOFTWARE :
1987 SOCK_TIMESTAMP))
1988 net_enable_timestamp();
1993  *	Get a socket option on a socket.
1995 * FIX: POSIX 1003.1g is very ambiguous here. It states that
1996 * asynchronous errors should be reported by getsockopt. We assume
1997  *	this means if you specify SO_ERROR (otherwise what's the point of it).
1999 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2000 char __user *optval, int __user *optlen)
2002 struct sock *sk = sock->sk;
2004 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2006 EXPORT_SYMBOL(sock_common_getsockopt);
2008 #ifdef CONFIG_COMPAT
2009 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2010 char __user *optval, int __user *optlen)
2012 struct sock *sk = sock->sk;
2014 if (sk->sk_prot->compat_getsockopt != NULL)
2015 return sk->sk_prot->compat_getsockopt(sk, level, optname,
2016 optval, optlen);
2017 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2019 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2020 #endif
2022 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2023 struct msghdr *msg, size_t size, int flags)
2025 struct sock *sk = sock->sk;
2026 int addr_len = 0;
2027 int err;
2029 err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2030 flags & ~MSG_DONTWAIT, &addr_len);
2031 if (err >= 0)
2032 msg->msg_namelen = addr_len;
2033 return err;
2035 EXPORT_SYMBOL(sock_common_recvmsg);
2038 * Set socket options on an inet socket.
2040 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2041 char __user *optval, unsigned int optlen)
2043 struct sock *sk = sock->sk;
2045 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2047 EXPORT_SYMBOL(sock_common_setsockopt);
2049 #ifdef CONFIG_COMPAT
2050 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2051 char __user *optval, unsigned int optlen)
2053 struct sock *sk = sock->sk;
2055 if (sk->sk_prot->compat_setsockopt != NULL)
2056 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2057 optval, optlen);
2058 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2060 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2061 #endif
2063 void sk_common_release(struct sock *sk)
2065 if (sk->sk_prot->destroy)
2066 sk->sk_prot->destroy(sk);
2069 	 * Observation: when sk_common_release is called, processes have
2070 	 * no access to the socket, but the network stack still does.
2071 * Step one, detach it from networking:
2073 * A. Remove from hash tables.
2076 sk->sk_prot->unhash(sk);
2079 	 * At this point the socket cannot receive new packets, but it is possible
2080 	 * that some packets are in flight because some CPU runs the receiver and
2081 	 * did the hash table lookup before we unhashed the socket. They will reach
2082 	 * the receive queue and will be purged by the socket destructor.
2084 	 * Also we still have packets pending on the receive queue and probably
2085 	 * our own packets waiting in device queues. sock_destroy will drain the
2086 	 * receive queue, but transmitted packets will delay socket destruction
2087 	 * until the last reference is released.
2090 sock_orphan(sk);
2092 xfrm_sk_free_policy(sk);
2094 sk_refcnt_debug_release(sk);
2095 sock_put(sk);
2097 EXPORT_SYMBOL(sk_common_release);
2099 static DEFINE_RWLOCK(proto_list_lock);
2100 static LIST_HEAD(proto_list);
2102 #ifdef CONFIG_PROC_FS
2103 #define PROTO_INUSE_NR 64 /* should be enough for the first time */
2104 struct prot_inuse {
2105 int val[PROTO_INUSE_NR];
2108 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
#ifdef CONFIG_NET_NS
void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
        int cpu = smp_processor_id();
        per_cpu_ptr(net->core.inuse, cpu)->val[prot->inuse_idx] += val;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
        int cpu, idx = prot->inuse_idx;
        int res = 0;

        for_each_possible_cpu(cpu)
                res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];

        return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);

static int sock_inuse_init_net(struct net *net)
{
        net->core.inuse = alloc_percpu(struct prot_inuse);
        return net->core.inuse ? 0 : -ENOMEM;
}

static void sock_inuse_exit_net(struct net *net)
{
        free_percpu(net->core.inuse);
}

static struct pernet_operations net_inuse_ops = {
        .init = sock_inuse_init_net,
        .exit = sock_inuse_exit_net,
};

static __init int net_inuse_init(void)
{
        if (register_pernet_subsys(&net_inuse_ops))
                panic("Cannot initialize net inuse counters");

        return 0;
}

core_initcall(net_inuse_init);
#else
static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);

void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
        __get_cpu_var(prot_inuse).val[prot->inuse_idx] += val;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
        int cpu, idx = prot->inuse_idx;
        int res = 0;

        for_each_possible_cpu(cpu)
                res += per_cpu(prot_inuse, cpu).val[idx];

        return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
#endif
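Protocols feed these per-cpu counters from their hash/unhash callbacks, which is what lets /proc/net/protocols report the number of live sockets per protocol. A hedged sketch of the usual pattern (hash-table details omitted, function names hypothetical; real callers do this while holding the relevant lock or with BHs disabled, so smp_processor_id() in sock_prot_inuse_add() is stable):

static void my_proto_hash(struct sock *sk)
{
        /* ... insert sk into the protocol's lookup table ... */
        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
}

static void my_proto_unhash(struct sock *sk)
{
        /* ... remove sk from the lookup table ... */
        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
}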
static void assign_proto_idx(struct proto *prot)
{
        prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);

        if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
                printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
                return;
        }

        set_bit(prot->inuse_idx, proto_inuse_idx);
}

static void release_proto_idx(struct proto *prot)
{
        if (prot->inuse_idx != PROTO_INUSE_NR - 1)
                clear_bit(prot->inuse_idx, proto_inuse_idx);
}
#else
static inline void assign_proto_idx(struct proto *prot)
{
}

static inline void release_proto_idx(struct proto *prot)
{
}
#endif
int proto_register(struct proto *prot, int alloc_slab)
{
        if (alloc_slab) {
                prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
                                        SLAB_HWCACHE_ALIGN | prot->slab_flags,
                                        NULL);

                if (prot->slab == NULL) {
                        printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
                               prot->name);
                        goto out;
                }

                if (prot->rsk_prot != NULL) {
                        static const char mask[] = "request_sock_%s";

                        prot->rsk_prot->slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
                        if (prot->rsk_prot->slab_name == NULL)
                                goto out_free_sock_slab;

                        sprintf(prot->rsk_prot->slab_name, mask, prot->name);
                        prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
                                                                 prot->rsk_prot->obj_size, 0,
                                                                 SLAB_HWCACHE_ALIGN, NULL);

                        if (prot->rsk_prot->slab == NULL) {
                                printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
                                       prot->name);
                                goto out_free_request_sock_slab_name;
                        }
                }

                if (prot->twsk_prot != NULL) {
                        static const char mask[] = "tw_sock_%s";

                        prot->twsk_prot->twsk_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);

                        if (prot->twsk_prot->twsk_slab_name == NULL)
                                goto out_free_request_sock_slab;

                        sprintf(prot->twsk_prot->twsk_slab_name, mask, prot->name);
                        prot->twsk_prot->twsk_slab =
                                kmem_cache_create(prot->twsk_prot->twsk_slab_name,
                                                  prot->twsk_prot->twsk_obj_size,
                                                  0,
                                                  SLAB_HWCACHE_ALIGN |
                                                        prot->slab_flags,
                                                  NULL);
                        if (prot->twsk_prot->twsk_slab == NULL)
                                goto out_free_timewait_sock_slab_name;
                }
        }

        write_lock(&proto_list_lock);
        list_add(&prot->node, &proto_list);
        assign_proto_idx(prot);
        write_unlock(&proto_list_lock);
        return 0;

out_free_timewait_sock_slab_name:
        kfree(prot->twsk_prot->twsk_slab_name);
out_free_request_sock_slab:
        if (prot->rsk_prot && prot->rsk_prot->slab) {
                kmem_cache_destroy(prot->rsk_prot->slab);
                prot->rsk_prot->slab = NULL;
        }
out_free_request_sock_slab_name:
        kfree(prot->rsk_prot->slab_name);
out_free_sock_slab:
        kmem_cache_destroy(prot->slab);
        prot->slab = NULL;
out:
        return -ENOBUFS;
}
EXPORT_SYMBOL(proto_register);
void proto_unregister(struct proto *prot)
{
        write_lock(&proto_list_lock);
        release_proto_idx(prot);
        list_del(&prot->node);
        write_unlock(&proto_list_lock);

        if (prot->slab != NULL) {
                kmem_cache_destroy(prot->slab);
                prot->slab = NULL;
        }

        if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
                kmem_cache_destroy(prot->rsk_prot->slab);
                kfree(prot->rsk_prot->slab_name);
                prot->rsk_prot->slab = NULL;
        }

        if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
                kmem_cache_destroy(prot->twsk_prot->twsk_slab);
                kfree(prot->twsk_prot->twsk_slab_name);
                prot->twsk_prot->twsk_slab = NULL;
        }
}
EXPORT_SYMBOL(proto_unregister);
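A protocol module typically pairs these two calls in its init/exit path: register the struct proto (letting the core create its slab caches) before exposing any socket interface, and unregister it last on unload. A minimal, illustrative sketch under those assumptions (my_proto and struct my_sock are hypothetical):

static struct proto my_proto = {
        .name     = "MYPROTO",
        .owner    = THIS_MODULE,
        .obj_size = sizeof(struct my_sock),     /* hypothetical per-socket struct */
        /* .close, .recvmsg, .hash, .unhash, ... supplied by the protocol */
};

static int __init my_proto_module_init(void)
{
        int rc = proto_register(&my_proto, 1);  /* 1: allocate a slab cache for my_sock */
        if (rc)
                return rc;
        /* ... register proto_ops / socket family here ... */
        return 0;
}

static void __exit my_proto_module_exit(void)
{
        /* ... unregister the socket family first ... */
        proto_unregister(&my_proto);
}

module_init(my_proto_module_init);
module_exit(my_proto_module_exit);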
#ifdef CONFIG_PROC_FS
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
        __acquires(proto_list_lock)
{
        read_lock(&proto_list_lock);
        return seq_list_start_head(&proto_list, *pos);
}

static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        return seq_list_next(v, &proto_list, pos);
}

static void proto_seq_stop(struct seq_file *seq, void *v)
        __releases(proto_list_lock)
{
        read_unlock(&proto_list_lock);
}

static char proto_method_implemented(const void *method)
{
        return method == NULL ? 'n' : 'y';
}
static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
        seq_printf(seq, "%-9s %4u %6d %6d %-3s %6u %-3s %-10s "
                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
                   proto->name,
                   proto->obj_size,
                   sock_prot_inuse_get(seq_file_net(seq), proto),
                   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
                   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
                   proto->max_header,
                   proto->slab == NULL ? "no" : "yes",
                   module_name(proto->owner),
                   proto_method_implemented(proto->close),
                   proto_method_implemented(proto->connect),
                   proto_method_implemented(proto->disconnect),
                   proto_method_implemented(proto->accept),
                   proto_method_implemented(proto->ioctl),
                   proto_method_implemented(proto->init),
                   proto_method_implemented(proto->destroy),
                   proto_method_implemented(proto->shutdown),
                   proto_method_implemented(proto->setsockopt),
                   proto_method_implemented(proto->getsockopt),
                   proto_method_implemented(proto->sendmsg),
                   proto_method_implemented(proto->recvmsg),
                   proto_method_implemented(proto->sendpage),
                   proto_method_implemented(proto->bind),
                   proto_method_implemented(proto->backlog_rcv),
                   proto_method_implemented(proto->hash),
                   proto_method_implemented(proto->unhash),
                   proto_method_implemented(proto->get_port),
                   proto_method_implemented(proto->enter_memory_pressure));
}
static int proto_seq_show(struct seq_file *seq, void *v)
{
        if (v == &proto_list)
                seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
                           "protocol",
                           "size",
                           "sockets",
                           "memory",
                           "press",
                           "maxhdr",
                           "slab",
                           "module",
                           "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
        else
                proto_seq_printf(seq, list_entry(v, struct proto, node));
        return 0;
}
static const struct seq_operations proto_seq_ops = {
        .start  = proto_seq_start,
        .next   = proto_seq_next,
        .stop   = proto_seq_stop,
        .show   = proto_seq_show,
};

static int proto_seq_open(struct inode *inode, struct file *file)
{
        return seq_open_net(inode, file, &proto_seq_ops,
                            sizeof(struct seq_net_private));
}

static const struct file_operations proto_seq_fops = {
        .owner          = THIS_MODULE,
        .open           = proto_seq_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = seq_release_net,
};

static __net_init int proto_init_net(struct net *net)
{
        if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
                return -ENOMEM;

        return 0;
}

static __net_exit void proto_exit_net(struct net *net)
{
        proc_net_remove(net, "protocols");
}

static __net_initdata struct pernet_operations proto_net_ops = {
        .init = proto_init_net,
        .exit = proto_exit_net,
};

static int __init proto_init(void)
{
        return register_pernet_subsys(&proto_net_ops);
}

subsys_initcall(proto_init);

#endif /* PROC_FS */