2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Generic socket support routines. Memory allocators, socket lock/release
7 * handler for protocols to use and generic option handler.
10 * Version: $Id: sock.c,v 1.78 1999/03/25 10:03:55 davem Exp $
12 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
13 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Alan Cox, <A.Cox@swansea.ac.uk>
18 * Alan Cox : Numerous verify_area() problems
19 * Alan Cox : Connecting on a connecting socket
20 * now returns an error for tcp.
21 * Alan Cox : sock->protocol is set correctly.
22 * and is not sometimes left as 0.
23 * Alan Cox : connect handles icmp errors on a
24 * connect properly. Unfortunately there
25 * is a restart syscall nasty there. I
26 * can't match BSD without hacking the C
27 * library. Ideas urgently sought!
28 * Alan Cox : Disallow bind() to addresses that are
29 * not ours - especially broadcast ones!!
30 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
31 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
32 * instead they leave that for the DESTROY timer.
33 * Alan Cox : Clean up error flag in accept
34 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
35 * was buggy. Put a remove_sock() in the handler
36 * for memory when we hit 0. Also altered the timer
37 * code. The ACK stuff can wait and needs major
39 * Alan Cox : Fixed TCP ack bug, removed remove sock
40 * and fixed timer/inet_bh race.
41 * Alan Cox : Added zapped flag for TCP
42 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
43 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
44 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
45 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
46 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
47 * Rick Sladkey : Relaxed UDP rules for matching packets.
48 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
49 * Pauline Middelink : identd support
50 * Alan Cox : Fixed connect() taking signals I think.
51 * Alan Cox : SO_LINGER supported
52 * Alan Cox : Error reporting fixes
53 * Anonymous : inet_create tidied up (sk->reuse setting)
54 * Alan Cox : inet sockets don't set sk->type!
55 * Alan Cox : Split socket option code
56 * Alan Cox : Callbacks
57 * Alan Cox : Nagle flag for Charles & Johannes stuff
58 * Alex : Removed restriction on inet fioctl
59 * Alan Cox : Splitting INET from NET core
60 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
61 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
62 * Alan Cox : Split IP from generic code
63 * Alan Cox : New kfree_skbmem()
64 * Alan Cox : Make SO_DEBUG superuser only.
65 * Alan Cox : Allow anyone to clear SO_DEBUG
67 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
68 * Alan Cox : Allocator for a socket is settable.
69 * Alan Cox : SO_ERROR includes soft errors.
70 * Alan Cox : Allow NULL arguments on some SO_ opts
71 * Alan Cox : Generic socket allocation to make hooks
72 * easier (suggested by Craig Metz).
73 * Michael Pall : SO_ERROR returns positive errno again
74 * Steve Whitehouse: Added default destructor to free
75 * protocol private data.
76 * Steve Whitehouse: Added various other default routines
77 * common to several socket families.
78 * Chris Evans : Call suser() check last on F_SETOWN
79 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
80 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
81 * Andi Kleen : Fix write_space callback
86 * This program is free software; you can redistribute it and/or
87 * modify it under the terms of the GNU General Public License
88 * as published by the Free Software Foundation; either version
89 * 2 of the License, or (at your option) any later version.
92 #include <linux/config.h>
93 #include <linux/errno.h>
94 #include <linux/types.h>
95 #include <linux/socket.h>
97 #include <linux/kernel.h>
98 #include <linux/major.h>
99 #include <linux/sched.h>
100 #include <linux/timer.h>
101 #include <linux/string.h>
102 #include <linux/sockios.h>
103 #include <linux/net.h>
104 #include <linux/fcntl.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/init.h>
111 #include <asm/uaccess.h>
112 #include <asm/system.h>
114 #include <linux/inet.h>
115 #include <linux/netdevice.h>
117 #include <net/protocol.h>
119 #include <net/rarp.h>
120 #include <net/route.h>
123 #include <linux/skbuff.h>
124 #include <net/sock.h>
126 #include <net/icmp.h>
127 #include <linux/ipsec.h>
130 #include <linux/filter.h>
/* Smaller of two values.
 * NOTE: evaluates both arguments twice -- do not pass expressions
 * with side effects. */
#define min(a,b) ((a)<(b)?(a):(b))
135 /* Run time adjustable parameters. */
136 __u32 sysctl_wmem_max
= SK_WMEM_MAX
;
137 __u32 sysctl_rmem_max
= SK_RMEM_MAX
;
138 __u32 sysctl_wmem_default
= SK_WMEM_MAX
;
139 __u32 sysctl_rmem_default
= SK_RMEM_MAX
;
141 /* Maximal space eaten by iovec or ancilliary data plus some space */
142 int sysctl_optmem_max
= sizeof(unsigned long)*(2*UIO_MAXIOV
+ 512);
145 * This is meant for all protocols to use and covers goings on
146 * at the socket level. Everything here is generic.
149 int sock_setsockopt(struct socket
*sock
, int level
, int optname
,
150 char *optval
, int optlen
)
152 struct sock
*sk
=sock
->sk
;
160 * Options without arguments
163 #ifdef SO_DONTLINGER /* Compatibility item... */
172 if(optlen
<sizeof(int))
175 err
= get_user(val
, (int *)optval
);
184 if(val
&& !capable(CAP_NET_ADMIN
))
199 sk
->localroute
=valbool
;
202 sk
->broadcast
=valbool
;
205 /* Don't error on this BSD doesn't and if you think
206 about it this is right. Otherwise apps have to
207 play 'guess the biggest size' games. RCVBUF/SNDBUF
208 are treated in BSD as hints */
210 if (val
> sysctl_wmem_max
)
211 val
= sysctl_wmem_max
;
213 sk
->sndbuf
= max(val
*2,2048);
216 * Wake up sending tasks if we
223 /* Don't error on this BSD doesn't and if you think
224 about it this is right. Otherwise apps have to
225 play 'guess the biggest size' games. RCVBUF/SNDBUF
226 are treated in BSD as hints */
228 if (val
> sysctl_rmem_max
)
229 val
= sysctl_rmem_max
;
231 /* FIXME: is this lower bound the right one? */
232 sk
->rcvbuf
= max(val
*2,256);
237 if (sk
->protocol
== IPPROTO_TCP
)
239 tcp_set_keepalive(sk
, valbool
);
242 sk
->keepopen
= valbool
;
246 sk
->urginline
= valbool
;
250 sk
->no_check
= valbool
;
254 if ((val
>= 0 && val
<= 6) || capable(CAP_NET_ADMIN
))
261 if(optlen
<sizeof(ling
))
262 return -EINVAL
; /* 1003.1g */
263 err
= copy_from_user(&ling
,optval
,sizeof(ling
));
273 sk
->lingertime
=ling
.l_linger
;
279 sk
->bsdism
= valbool
;
283 sock
->passcred
= valbool
;
287 #ifdef CONFIG_NETDEVICES
288 case SO_BINDTODEVICE
:
290 char devname
[IFNAMSIZ
];
293 if (!capable(CAP_NET_RAW
))
296 /* Bind this socket to a particular device like "eth0",
297 * as specified in the passed interface name. If the
298 * name is "" or the option length is zero the socket
303 sk
->bound_dev_if
= 0;
305 if (optlen
> IFNAMSIZ
)
307 if (copy_from_user(devname
, optval
, optlen
))
310 /* Remove any cached route for this socket. */
312 dst_release(xchg(&sk
->dst_cache
, NULL
));
315 if (devname
[0] == '\0') {
316 sk
->bound_dev_if
= 0;
318 struct device
*dev
= dev_get(devname
);
321 sk
->bound_dev_if
= dev
->ifindex
;
330 case SO_ATTACH_FILTER
:
332 if (optlen
== sizeof(struct sock_fprog
)) {
333 struct sock_fprog fprog
;
336 if (copy_from_user(&fprog
, optval
, sizeof(fprog
)))
339 ret
= sk_attach_filter(&fprog
, sk
);
343 case SO_DETACH_FILTER
:
345 struct sk_filter
*filter
;
353 sk_filter_release(sk
, filter
);
358 /* We implement the SO_SNDLOWAT etc to
359 not be settable (1003.1g 5.3) */
361 return(-ENOPROTOOPT
);
367 int sock_getsockopt(struct socket
*sock
, int level
, int optname
,
368 char *optval
, int *optlen
)
370 struct sock
*sk
= sock
->sk
;
379 int lv
=sizeof(int),len
;
381 if(get_user(len
,optlen
))
391 v
.val
= sk
->localroute
;
395 v
.val
= sk
->broadcast
;
411 v
.val
= sk
->keepopen
;
419 v
.val
= -sock_error(sk
);
421 v
.val
=xchg(&sk
->err_soft
,0);
425 v
.val
= sk
->urginline
;
429 v
.val
= sk
->no_check
;
433 v
.val
= sk
->priority
;
438 v
.ling
.l_onoff
=sk
->linger
;
439 v
.ling
.l_linger
=sk
->lingertime
;
448 lv
=sizeof(struct timeval
);
459 v
.val
= sock
->passcred
;
463 lv
=sizeof(sk
->peercred
);
465 if(copy_to_user((void*)optval
, &sk
->peercred
, len
))
470 return(-ENOPROTOOPT
);
473 if(copy_to_user(optval
,&v
,len
))
476 if(put_user(len
, optlen
))
481 static kmem_cache_t
*sk_cachep
;
484 * All socket objects are allocated here. This is for future
488 struct sock
*sk_alloc(int family
, int priority
, int zero_it
)
490 struct sock
*sk
= kmem_cache_alloc(sk_cachep
, priority
);
494 memset(sk
, 0, sizeof(struct sock
));
501 void sk_free(struct sock
*sk
)
508 sk_filter_release(sk
, sk
->filter
);
513 if (atomic_read(&sk
->omem_alloc
))
514 printk(KERN_DEBUG
"sk_free: optmem leakage (%d bytes) detected.\n", atomic_read(&sk
->omem_alloc
));
518 sk_filter_release(sk
, sk
->filter
);
523 if (atomic_read(&sk
->omem_alloc
))
524 printk(KERN_DEBUG
"sk_free: optmem leakage (%d bytes) detected.\n", atomic_read(&sk
->omem_alloc
));
526 kmem_cache_free(sk_cachep
, sk
);
529 void __init
sk_init(void)
531 sk_cachep
= kmem_cache_create("sock", sizeof(struct sock
), 0,
532 SLAB_HWCACHE_ALIGN
, 0, 0);
537 * Simple resource managers for sockets.
542 * Write buffer destructor automatically called from kfree_skb.
544 void sock_wfree(struct sk_buff
*skb
)
546 struct sock
*sk
= skb
->sk
;
548 /* In case it might be waiting for more memory. */
549 atomic_sub(skb
->truesize
, &sk
->wmem_alloc
);
554 * Read buffer destructor automatically called from kfree_skb.
556 void sock_rfree(struct sk_buff
*skb
)
558 struct sock
*sk
= skb
->sk
;
560 atomic_sub(skb
->truesize
, &sk
->rmem_alloc
);
565 * Allocate a skb from the socket's send buffer.
567 struct sk_buff
*sock_wmalloc(struct sock
*sk
, unsigned long size
, int force
, int priority
)
569 if (force
|| atomic_read(&sk
->wmem_alloc
) < sk
->sndbuf
) {
570 struct sk_buff
* skb
= alloc_skb(size
, priority
);
572 atomic_add(skb
->truesize
, &sk
->wmem_alloc
);
573 skb
->destructor
= sock_wfree
;
582 * Allocate a skb from the socket's receive buffer.
584 struct sk_buff
*sock_rmalloc(struct sock
*sk
, unsigned long size
, int force
, int priority
)
586 if (force
|| atomic_read(&sk
->rmem_alloc
) < sk
->rcvbuf
) {
587 struct sk_buff
*skb
= alloc_skb(size
, priority
);
589 atomic_add(skb
->truesize
, &sk
->rmem_alloc
);
590 skb
->destructor
= sock_rfree
;
599 * Allocate a memory block from the socket's option memory buffer.
601 void *sock_kmalloc(struct sock
*sk
, int size
, int priority
)
603 if (atomic_read(&sk
->omem_alloc
)+size
< sysctl_optmem_max
) {
605 /* First do the add, to avoid the race if kmalloc
608 atomic_add(size
, &sk
->omem_alloc
);
609 mem
= kmalloc(size
, priority
);
612 atomic_sub(size
, &sk
->omem_alloc
);
618 * Free an option memory block.
620 void sock_kfree_s(struct sock
*sk
, void *mem
, int size
)
623 atomic_sub(size
, &sk
->omem_alloc
);
626 /* FIXME: this is insane. We are trying suppose to be controlling how
627 * how much space we have for data bytes, not packet headers.
628 * This really points out that we need a better system for doing the
629 * receive buffer. -- erics
630 * WARNING: This is currently ONLY used in tcp. If you need it else where
631 * this will probably not be what you want. Possibly these two routines
632 * should move over to the ipv4 directory.
634 unsigned long sock_rspace(struct sock
*sk
)
639 /* This used to have some bizarre complications that
640 * to attempt to reserve some amount of space. This doesn't
641 * make sense, since the number returned here does not
642 * actually reflect allocated space, but rather the amount
643 * of space we committed to. We gamble that we won't
644 * run out of memory, and returning a smaller number does
645 * not change the gamble. If we lose the gamble tcp still
646 * works, it may just slow down for retransmissions.
648 amt
= sk
->rcvbuf
- atomic_read(&sk
->rmem_alloc
);
656 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
657 I think, these locks should be removed for datagram sockets.
659 static void sock_wait_for_wmem(struct sock
* sk
)
661 struct wait_queue wait
= { current
, NULL
};
663 sk
->socket
->flags
&= ~SO_NOSPACE
;
664 add_wait_queue(sk
->sleep
, &wait
);
666 if (signal_pending(current
))
668 current
->state
= TASK_INTERRUPTIBLE
;
669 if (atomic_read(&sk
->wmem_alloc
) < sk
->sndbuf
)
671 if (sk
->shutdown
& SEND_SHUTDOWN
)
677 current
->state
= TASK_RUNNING
;
678 remove_wait_queue(sk
->sleep
, &wait
);
683 * Generic send/receive buffer handlers
686 struct sk_buff
*sock_alloc_send_skb(struct sock
*sk
, unsigned long size
,
687 unsigned long fallback
, int noblock
, int *errcode
)
693 unsigned long try_size
= size
;
695 err
= sock_error(sk
);
700 * We should send SIGPIPE in these cases according to
701 * 1003.1g draft 6.4. If we (the user) did a shutdown()
702 * call however we should not.
704 * Note: This routine isnt just used for datagrams and
705 * anyway some datagram protocols have a notion of
710 if (sk
->shutdown
&SEND_SHUTDOWN
)
714 /* The buffer get won't block, or use the atomic queue.
715 * It does produce annoying no free page messages still.
717 skb
= sock_wmalloc(sk
, size
, 0, GFP_BUFFER
);
722 skb
= sock_wmalloc(sk
, try_size
, 0, sk
->allocation
);
727 * This means we have too many buffers for this socket already.
730 sk
->socket
->flags
|= SO_NOSPACE
;
735 if (signal_pending(current
))
737 sock_wait_for_wmem(sk
);
748 void __release_sock(struct sock
*sk
)
751 if (!sk
->prot
|| !sk
->backlog_rcv
)
754 /* See if we have any packets built up. */
756 while (!skb_queue_empty(&sk
->back_log
)) {
757 struct sk_buff
* skb
= sk
->back_log
.next
;
758 __skb_unlink(skb
, &sk
->back_log
);
759 sk
->backlog_rcv(sk
, skb
);
767 * Generic socket manager library. Most simpler socket families
768 * use this to manage their socket lists. At some point we should
769 * hash these. By making this generic we get the lot hashed for free.
772 void sklist_remove_socket(struct sock
**list
, struct sock
*sk
)
797 void sklist_insert_socket(struct sock
**list
, struct sock
*sk
)
806 * This is only called from user mode. Thus it protects itself against
807 * interrupt users but doesn't worry about being called during work.
808 * Once it is removed from the queue no interrupt or bottom half will
809 * touch it and we are (fairly 8-) ) safe.
812 void sklist_destroy_socket(struct sock
**list
, struct sock
*sk
);
815 * Handler for deferred kills.
818 static void sklist_destroy_timer(unsigned long data
)
820 struct sock
*sk
=(struct sock
*)data
;
821 sklist_destroy_socket(NULL
,sk
);
825 * Destroy a socket. We pass NULL for a list if we know the
826 * socket is not on a list.
829 void sklist_destroy_socket(struct sock
**list
,struct sock
*sk
)
833 sklist_remove_socket(list
, sk
);
835 while((skb
=skb_dequeue(&sk
->receive_queue
))!=NULL
)
840 if(atomic_read(&sk
->wmem_alloc
) == 0 &&
841 atomic_read(&sk
->rmem_alloc
) == 0 &&
849 * Someone is using our buffers still.. defer
851 init_timer(&sk
->timer
);
852 sk
->timer
.expires
=jiffies
+SOCK_DESTROY_TIME
;
853 sk
->timer
.function
=sklist_destroy_timer
;
854 sk
->timer
.data
= (unsigned long)sk
;
855 add_timer(&sk
->timer
);
860 * Set of default routines for initialising struct proto_ops when
861 * the protocol does not support a particular function. In certain
862 * cases where it makes no sense for a protocol to have a "do nothing"
863 * function, some default processing is provided.
866 int sock_no_dup(struct socket
*newsock
, struct socket
*oldsock
)
868 struct sock
*sk
= oldsock
->sk
;
870 return net_families
[sk
->family
]->create(newsock
, sk
->protocol
);
873 int sock_no_release(struct socket
*sock
, struct socket
*peersock
)
878 int sock_no_bind(struct socket
*sock
, struct sockaddr
*saddr
, int len
)
883 int sock_no_connect(struct socket
*sock
, struct sockaddr
*saddr
,
889 int sock_no_socketpair(struct socket
*sock1
, struct socket
*sock2
)
894 int sock_no_accept(struct socket
*sock
, struct socket
*newsock
, int flags
)
899 int sock_no_getname(struct socket
*sock
, struct sockaddr
*saddr
,
905 unsigned int sock_no_poll(struct file
* file
, struct socket
*sock
, poll_table
*pt
)
910 int sock_no_ioctl(struct socket
*sock
, unsigned int cmd
, unsigned long arg
)
915 int sock_no_listen(struct socket
*sock
, int backlog
)
920 int sock_no_shutdown(struct socket
*sock
, int how
)
925 int sock_no_setsockopt(struct socket
*sock
, int level
, int optname
,
926 char *optval
, int optlen
)
931 int sock_no_getsockopt(struct socket
*sock
, int level
, int optname
,
932 char *optval
, int *optlen
)
938 * Note: if you add something that sleeps here then change sock_fcntl()
939 * to do proper fd locking.
941 int sock_no_fcntl(struct socket
*sock
, unsigned int cmd
, unsigned long arg
)
943 struct sock
*sk
= sock
->sk
;
949 * This is a little restrictive, but it's the only
950 * way to make sure that you can't send a sigurg to
953 if (current
->pgrp
!= -arg
&&
954 current
->pid
!= arg
&&
955 !capable(CAP_NET_ADMIN
)) return(-EPERM
);
965 int sock_no_sendmsg(struct socket
*sock
, struct msghdr
*m
, int flags
,
966 struct scm_cookie
*scm
)
971 int sock_no_recvmsg(struct socket
*sock
, struct msghdr
*m
, int flags
,
972 struct scm_cookie
*scm
)
980 * Default Socket Callbacks
983 void sock_def_wakeup(struct sock
*sk
)
986 wake_up_interruptible(sk
->sleep
);
989 void sock_def_error_report(struct sock
*sk
)
992 wake_up_interruptible(sk
->sleep
);
993 sock_wake_async(sk
->socket
,0);
997 void sock_def_readable(struct sock
*sk
, int len
)
1000 wake_up_interruptible(sk
->sleep
);
1001 sock_wake_async(sk
->socket
,1);
1005 void sock_def_write_space(struct sock
*sk
)
1007 /* Do not wake up a writer until he can make "significant"
1011 ((atomic_read(&sk
->wmem_alloc
) << 1) <= sk
->sndbuf
)) {
1012 wake_up_interruptible(sk
->sleep
);
1014 /* Should agree with poll, otherwise some programs break */
1015 if (sock_writeable(sk
))
1016 sock_wake_async(sk
->socket
, 2);
1020 void sock_def_destruct(struct sock
*sk
)
1022 if (sk
->protinfo
.destruct_hook
)
1023 kfree(sk
->protinfo
.destruct_hook
);
1026 void sock_init_data(struct socket
*sock
, struct sock
*sk
)
1028 skb_queue_head_init(&sk
->receive_queue
);
1029 skb_queue_head_init(&sk
->write_queue
);
1030 skb_queue_head_init(&sk
->back_log
);
1031 skb_queue_head_init(&sk
->error_queue
);
1033 init_timer(&sk
->timer
);
1035 sk
->allocation
= GFP_KERNEL
;
1036 sk
->rcvbuf
= sysctl_rmem_default
;
1037 sk
->sndbuf
= sysctl_wmem_default
;
1038 sk
->state
= TCP_CLOSE
;
1044 sk
->type
= sock
->type
;
1045 sk
->sleep
= &sock
->wait
;
1049 sk
->state_change
= sock_def_wakeup
;
1050 sk
->data_ready
= sock_def_readable
;
1051 sk
->write_space
= sock_def_write_space
;
1052 sk
->error_report
= sock_def_error_report
;
1053 sk
->destruct
= sock_def_destruct
;
1055 sk
->peercred
.pid
= 0;
1056 sk
->peercred
.uid
= -1;
1057 sk
->peercred
.gid
= -1;