2 * NET3: Implementation of BSD Unix domain sockets.
4 * Authors: Alan Cox, <alan.cox@linux.org>
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
11 * Version: $Id: af_unix.c,v 1.88 2000/01/18 08:24:28 davem Exp $
14 * Linus Torvalds : Assorted bug cures.
15 * Niibe Yutaka : async I/O support.
16 * Carsten Paeth : PF_UNIX check, address fixes.
17 * Alan Cox : Limit size of allocated blocks.
18 * Alan Cox : Fixed the stupid socketpair bug.
19 * Alan Cox : BSD compatibility fine tuning.
20 * Alan Cox : Fixed a bug in connect when interrupted.
21 * Alan Cox : Sorted out a proper draft version of
22 * file descriptor passing hacked up from
24 * Marty Leisner : Fixes to fd passing
25 * Nick Nevin : recvmsg bugfix.
26 * Alan Cox : Started proper garbage collector
27 * Heiko EiBfeldt : Missing verify_area check
28 * Alan Cox : Started POSIXisms
29 * Andreas Schwab : Replace inode by dentry for proper
31 * Kirk Petersen : Made this a module
32 * Christoph Rohland : Elegant non-blocking accept/connect algorithm.
34 * Alexey Kuznetsov : Repaired (I hope) bugs introduced
35 * by the above two patches.
36 * Andrea Arcangeli : If possible we block in connect(2)
37 * if the max backlog of the listen socket
38 * has been reached. This won't break
39 * old apps and it will avoid huge amount
40 * of socks hashed (this for unix_gc()
41 * performances reasons).
42 * Security fix that limits the max
43 * number of socks to 2*max_files and
44 * the number of skb queueable in the
46 * Artur Skawina : Hash function optimizations
47 * Alexey Kuznetsov : Full scale SMP. Lot of bugs are introduced 8)
50 * Known differences from reference BSD that was tested:
53 * ECONNREFUSED is not returned from one end of a connected() socket to the
54 * other the moment one end closes.
55 * fstat() doesn't return st_dev=NODEV, and give the blksize as high water mark
56 * and a fake inode identifier (nor the BSD first socket fstat twice bug).
58 * accept() returns a path name even if the connecting socket has closed
59 * in the meantime (BSD loses the path and gives up).
60 * accept() returns 0 length path for an unbound connector. BSD returns 16
61 * and a null first byte in the path (but not for gethost/peername - BSD bug ??)
62 * socketpair(...SOCK_RAW..) doesn't panic the kernel.
63 * BSD af_unix apparently has connect forgetting to block properly.
64 * (need to check this with the POSIX spec in detail)
66 * Differences from 2.0.0-11-... (ANK)
67 * Bug fixes and improvements.
68 * - client shutdown killed server socket.
69 * - removed all useless cli/sti pairs.
71 * Semantic changes/extensions.
72 * - generic control message passing.
73 * - SCM_CREDENTIALS control message.
74 * - "Abstract" (not FS based) socket bindings.
75 * Abstract names are sequences of bytes (not zero terminated)
76 * started by 0, so that this name space does not intersect
80 #include <linux/module.h>
81 #include <linux/config.h>
82 #include <linux/kernel.h>
83 #include <linux/major.h>
84 #include <linux/signal.h>
85 #include <linux/sched.h>
86 #include <linux/errno.h>
87 #include <linux/string.h>
88 #include <linux/stat.h>
89 #include <linux/socket.h>
91 #include <linux/fcntl.h>
92 #include <linux/termios.h>
93 #include <linux/sockios.h>
94 #include <linux/net.h>
97 #include <linux/malloc.h>
98 #include <asm/uaccess.h>
99 #include <linux/skbuff.h>
100 #include <linux/netdevice.h>
101 #include <net/sock.h>
103 #include <net/af_unix.h>
104 #include <linux/proc_fs.h>
106 #include <linux/init.h>
107 #include <linux/poll.h>
108 #include <linux/smp_lock.h>
110 #include <asm/checksum.h>
112 #define min(a,b) (((a)<(b))?(a):(b))
114 int sysctl_unix_max_dgram_qlen
= 10;
116 unix_socket
*unix_socket_table
[UNIX_HASH_SIZE
+1];
117 rwlock_t unix_table_lock
= RW_LOCK_UNLOCKED
;
118 static atomic_t unix_nr_socks
= ATOMIC_INIT(0);
120 #define unix_sockets_unbound (unix_socket_table[UNIX_HASH_SIZE])
122 #define UNIX_ABSTRACT(sk) ((sk)->protinfo.af_unix.addr->hash!=UNIX_HASH_SIZE)
125 SMP locking strategy.
126 * hash table is protected with rwlock unix_table_lock
127 * each socket state is protected by separate rwlock.
131 extern __inline__
unsigned unix_hash_fold(unsigned hash
)
135 return hash
&(UNIX_HASH_SIZE
-1);
138 #define unix_peer(sk) ((sk)->pair)
140 extern __inline__
int unix_our_peer(unix_socket
*sk
, unix_socket
*osk
)
142 return unix_peer(osk
) == sk
;
145 extern __inline__
int unix_may_send(unix_socket
*sk
, unix_socket
*osk
)
147 return (unix_peer(osk
) == NULL
|| unix_our_peer(sk
, osk
));
150 static __inline__ unix_socket
* unix_peer_get(unix_socket
*s
)
158 unix_state_runlock(s
);
162 extern __inline__
void unix_release_addr(struct unix_address
*addr
)
164 if (atomic_dec_and_test(&addr
->refcnt
))
169 * Check unix socket name:
170 * - should be not zero length.
171 * - if started by not zero, should be NULL terminated (FS object)
172 * - if started by zero, it is abstract name.
175 static int unix_mkname(struct sockaddr_un
* sunaddr
, int len
, unsigned *hashp
)
177 if (len
<= sizeof(short) || len
> sizeof(*sunaddr
))
179 if (!sunaddr
|| sunaddr
->sun_family
!= AF_UNIX
)
181 if (sunaddr
->sun_path
[0])
184 * This may look like an off by one error but it is
185 * a bit more subtle. 108 is the longest valid AF_UNIX
186 * path for a binding. sun_path[108] doesnt as such
187 * exist. However in kernel space we are guaranteed that
188 * it is a valid memory location in our kernel
191 if (len
> sizeof(*sunaddr
))
192 len
= sizeof(*sunaddr
);
193 ((char *)sunaddr
)[len
]=0;
194 len
= strlen(sunaddr
->sun_path
)+1+sizeof(short);
198 *hashp
= unix_hash_fold(csum_partial((char*)sunaddr
, len
, 0));
202 static void __unix_remove_socket(unix_socket
*sk
)
204 unix_socket
**list
= sk
->protinfo
.af_unix
.list
;
207 sk
->next
->prev
= sk
->prev
;
209 sk
->prev
->next
= sk
->next
;
212 sk
->protinfo
.af_unix
.list
= NULL
;
219 static void __unix_insert_socket(unix_socket
**list
, unix_socket
*sk
)
221 BUG_TRAP(sk
->protinfo
.af_unix
.list
==NULL
);
223 sk
->protinfo
.af_unix
.list
= list
;
232 static __inline__
void unix_remove_socket(unix_socket
*sk
)
234 write_lock(&unix_table_lock
);
235 __unix_remove_socket(sk
);
236 write_unlock(&unix_table_lock
);
239 static __inline__
void unix_insert_socket(unix_socket
**list
, unix_socket
*sk
)
241 write_lock(&unix_table_lock
);
242 __unix_insert_socket(list
, sk
);
243 write_unlock(&unix_table_lock
);
246 static unix_socket
*__unix_find_socket_byname(struct sockaddr_un
*sunname
,
247 int len
, int type
, unsigned hash
)
251 for (s
=unix_socket_table
[hash
^type
]; s
; s
=s
->next
) {
252 if(s
->protinfo
.af_unix
.addr
->len
==len
&&
253 memcmp(s
->protinfo
.af_unix
.addr
->name
, sunname
, len
) == 0)
259 static __inline__ unix_socket
*
260 unix_find_socket_byname(struct sockaddr_un
*sunname
,
261 int len
, int type
, unsigned hash
)
265 read_lock(&unix_table_lock
);
266 s
= __unix_find_socket_byname(sunname
, len
, type
, hash
);
269 read_unlock(&unix_table_lock
);
273 static unix_socket
*unix_find_socket_byinode(struct inode
*i
)
277 read_lock(&unix_table_lock
);
278 for (s
=unix_socket_table
[i
->i_ino
& (UNIX_HASH_SIZE
-1)]; s
; s
=s
->next
)
280 struct dentry
*dentry
= s
->protinfo
.af_unix
.dentry
;
282 if(dentry
&& dentry
->d_inode
== i
)
288 read_unlock(&unix_table_lock
);
292 static __inline__
int unix_writable(struct sock
*sk
)
294 return ((atomic_read(&sk
->wmem_alloc
)<<2) <= sk
->sndbuf
);
297 static void unix_write_space(struct sock
*sk
)
299 read_lock(&sk
->callback_lock
);
300 if (!sk
->dead
&& unix_writable(sk
)) {
301 wake_up_interruptible(sk
->sleep
);
302 sock_wake_async(sk
->socket
, 2, POLL_OUT
);
304 read_unlock(&sk
->callback_lock
);
307 static void unix_sock_destructor(struct sock
*sk
)
309 skb_queue_purge(&sk
->receive_queue
);
311 BUG_TRAP(atomic_read(&sk
->wmem_alloc
) == 0);
312 BUG_TRAP(sk
->protinfo
.af_unix
.list
==NULL
);
313 BUG_TRAP(sk
->socket
==NULL
);
315 printk("Attempt to release alive unix socket: %p\n", sk
);
319 if (sk
->protinfo
.af_unix
.addr
)
320 unix_release_addr(sk
->protinfo
.af_unix
.addr
);
322 atomic_dec(&unix_nr_socks
);
323 #ifdef UNIX_REFCNT_DEBUG
324 printk(KERN_DEBUG
"UNIX %p is destroyed, %d are still alive.\n", sk
, atomic_read(&unix_nr_socks
));
329 static int unix_release_sock (unix_socket
*sk
, int embrion
)
331 struct dentry
*dentry
;
336 unix_remove_socket(sk
);
339 unix_state_wlock(sk
);
341 sk
->shutdown
= SHUTDOWN_MASK
;
342 dentry
= sk
->protinfo
.af_unix
.dentry
;
343 sk
->protinfo
.af_unix
.dentry
=NULL
;
345 sk
->state
= TCP_CLOSE
;
346 unix_state_wunlock(sk
);
348 wake_up_interruptible_all(&sk
->protinfo
.af_unix
.peer_wait
);
350 skpair
=unix_peer(sk
);
353 if (sk
->type
==SOCK_STREAM
) {
354 unix_state_wlock(skpair
);
355 skpair
->shutdown
=SHUTDOWN_MASK
; /* No more writes*/
356 if (!skb_queue_empty(&sk
->receive_queue
) || embrion
)
357 skpair
->err
= ECONNRESET
;
358 unix_state_wunlock(skpair
);
359 sk
->state_change(skpair
);
360 sock_wake_async(sk
->socket
,1,POLL_HUP
);
362 sock_put(skpair
); /* It may now die */
363 unix_peer(sk
) = NULL
;
366 /* Try to flush out this socket. Throw out buffers at least */
368 while((skb
=skb_dequeue(&sk
->receive_queue
))!=NULL
)
370 if (state
==TCP_LISTEN
)
371 unix_release_sock(skb
->sk
, 1);
372 /* passed fds are erased in the kfree_skb hook */
384 /* ---- Socket is dead now and most probably destroyed ---- */
387 * Fixme: BSD difference: In BSD all sockets connected to use get
388 * ECONNRESET and we die on the spot. In Linux we behave
389 * like files and pipes do and wait for the last
392 * Can't we simply set sock->err?
394 * What the above comment does talk about? --ANK(980817)
397 if (atomic_read(&unix_tot_inflight
))
398 unix_gc(); /* Garbage collect fds */
403 static int unix_listen(struct socket
*sock
, int backlog
)
406 struct sock
*sk
= sock
->sk
;
409 if (sock
->type
!=SOCK_STREAM
)
410 goto out
; /* Only stream sockets accept */
412 if (!sk
->protinfo
.af_unix
.addr
)
413 goto out
; /* No listens on an unbound socket */
414 unix_state_wlock(sk
);
415 if (sk
->state
!= TCP_CLOSE
&& sk
->state
!= TCP_LISTEN
)
417 if (backlog
> sk
->max_ack_backlog
)
418 wake_up_interruptible_all(&sk
->protinfo
.af_unix
.peer_wait
);
419 sk
->max_ack_backlog
=backlog
;
420 sk
->state
=TCP_LISTEN
;
421 sock
->flags
|= SO_ACCEPTCON
;
422 /* set credentials so connect can copy them */
423 sk
->peercred
.pid
= current
->pid
;
424 sk
->peercred
.uid
= current
->euid
;
425 sk
->peercred
.gid
= current
->egid
;
429 unix_state_wunlock(sk
);
434 extern struct proto_ops unix_stream_ops
;
435 extern struct proto_ops unix_dgram_ops
;
437 static struct sock
* unix_create1(struct socket
*sock
)
441 if (atomic_read(&unix_nr_socks
) >= 2*max_files
)
445 sk
= sk_alloc(PF_UNIX
, GFP_KERNEL
, 1);
451 atomic_inc(&unix_nr_socks
);
453 sock_init_data(sock
,sk
);
455 sk
->write_space
= unix_write_space
;
457 sk
->max_ack_backlog
= sysctl_unix_max_dgram_qlen
;
458 sk
->destruct
= unix_sock_destructor
;
459 sk
->protinfo
.af_unix
.dentry
=NULL
;
460 sk
->protinfo
.af_unix
.lock
= RW_LOCK_UNLOCKED
;
461 atomic_set(&sk
->protinfo
.af_unix
.inflight
, 0);
462 init_MUTEX(&sk
->protinfo
.af_unix
.readsem
);/* single task reading lock */
463 init_waitqueue_head(&sk
->protinfo
.af_unix
.peer_wait
);
464 sk
->protinfo
.af_unix
.list
=NULL
;
465 unix_insert_socket(&unix_sockets_unbound
, sk
);
470 static int unix_create(struct socket
*sock
, int protocol
)
472 if (protocol
&& protocol
!= PF_UNIX
)
473 return -EPROTONOSUPPORT
;
475 sock
->state
= SS_UNCONNECTED
;
477 switch (sock
->type
) {
479 sock
->ops
= &unix_stream_ops
;
482 * Believe it or not BSD has AF_UNIX, SOCK_RAW though
486 sock
->type
=SOCK_DGRAM
;
488 sock
->ops
= &unix_dgram_ops
;
491 return -ESOCKTNOSUPPORT
;
494 return unix_create1(sock
) ? 0 : -ENOMEM
;
497 static int unix_release(struct socket
*sock
)
499 unix_socket
*sk
= sock
->sk
;
506 return unix_release_sock (sk
, 0);
509 static int unix_autobind(struct socket
*sock
)
511 struct sock
*sk
= sock
->sk
;
512 static u32 ordernum
= 1;
513 struct unix_address
* addr
;
516 down(&sk
->protinfo
.af_unix
.readsem
);
519 if (sk
->protinfo
.af_unix
.addr
)
523 addr
= kmalloc(sizeof(*addr
) + sizeof(short) + 16, GFP_KERNEL
);
527 memset(addr
, 0, sizeof(*addr
) + sizeof(short) + 16);
528 addr
->name
->sun_family
= AF_UNIX
;
529 atomic_set(&addr
->refcnt
, 1);
532 addr
->len
= sprintf(addr
->name
->sun_path
+1, "%05x", ordernum
) + 1 + sizeof(short);
533 addr
->hash
= unix_hash_fold(csum_partial((void*)addr
->name
, addr
->len
, 0));
535 write_lock(&unix_table_lock
);
536 ordernum
= (ordernum
+1)&0xFFFFF;
538 if (__unix_find_socket_byname(addr
->name
, addr
->len
, sock
->type
,
540 write_unlock(&unix_table_lock
);
541 /* Sanity yield. It is unusual case, but yet... */
542 if (!(ordernum
&0xFF)) {
543 current
->policy
|= SCHED_YIELD
;
548 addr
->hash
^= sk
->type
;
550 __unix_remove_socket(sk
);
551 sk
->protinfo
.af_unix
.addr
= addr
;
552 __unix_insert_socket(&unix_socket_table
[addr
->hash
], sk
);
553 write_unlock(&unix_table_lock
);
557 up(&sk
->protinfo
.af_unix
.readsem
);
561 static unix_socket
*unix_find_other(struct sockaddr_un
*sunname
, int len
,
562 int type
, unsigned hash
, int *error
)
566 if (sunname
->sun_path
[0])
568 struct dentry
*dentry
;
570 /* Do not believe to VFS, grab kernel lock */
572 dentry
= open_namei(sunname
->sun_path
, 2|O_NOFOLLOW
, S_IFSOCK
);
573 if (IS_ERR(dentry
)) {
574 *error
= PTR_ERR(dentry
);
578 u
=unix_find_socket_byinode(dentry
->d_inode
);
582 if (u
&& u
->type
!= type
)
590 u
=unix_find_socket_byname(sunname
, len
, type
, hash
);
594 *error
=-ECONNREFUSED
;
601 static int unix_bind(struct socket
*sock
, struct sockaddr
*uaddr
, int addr_len
)
603 struct sock
*sk
= sock
->sk
;
604 struct sockaddr_un
*sunaddr
=(struct sockaddr_un
*)uaddr
;
605 struct dentry
* dentry
= NULL
;
608 struct unix_address
*addr
;
612 if (sunaddr
->sun_family
!= AF_UNIX
)
615 if (addr_len
==sizeof(short)) {
616 err
= unix_autobind(sock
);
620 err
= unix_mkname(sunaddr
, addr_len
, &hash
);
625 down(&sk
->protinfo
.af_unix
.readsem
);
628 if (sk
->protinfo
.af_unix
.addr
)
632 addr
= kmalloc(sizeof(*addr
)+addr_len
, GFP_KERNEL
);
636 memcpy(addr
->name
, sunaddr
, addr_len
);
637 addr
->len
= addr_len
;
638 addr
->hash
= hash
^sk
->type
;
639 atomic_set(&addr
->refcnt
, 1);
641 if (sunaddr
->sun_path
[0]) {
643 dentry
= do_mknod(sunaddr
->sun_path
, S_IFSOCK
|sock
->inode
->i_mode
, 0);
644 if (IS_ERR(dentry
)) {
645 err
= PTR_ERR(dentry
);
649 unix_release_addr(addr
);
654 addr
->hash
= UNIX_HASH_SIZE
;
657 write_lock(&unix_table_lock
);
659 if (!sunaddr
->sun_path
[0]) {
661 if (__unix_find_socket_byname(sunaddr
, addr_len
,
663 unix_release_addr(addr
);
667 list
= &unix_socket_table
[addr
->hash
];
669 list
= &unix_socket_table
[dentry
->d_inode
->i_ino
& (UNIX_HASH_SIZE
-1)];
670 sk
->protinfo
.af_unix
.dentry
= dentry
;
674 __unix_remove_socket(sk
);
675 sk
->protinfo
.af_unix
.addr
= addr
;
676 __unix_insert_socket(list
, sk
);
679 write_unlock(&unix_table_lock
);
681 up(&sk
->protinfo
.af_unix
.readsem
);
686 static int unix_dgram_connect(struct socket
*sock
, struct sockaddr
*addr
,
689 struct sock
*sk
= sock
->sk
;
690 struct sockaddr_un
*sunaddr
=(struct sockaddr_un
*)addr
;
695 if (addr
->sa_family
!= AF_UNSPEC
) {
696 err
= unix_mkname(sunaddr
, alen
, &hash
);
701 if (sock
->passcred
&& !sk
->protinfo
.af_unix
.addr
&&
702 (err
= unix_autobind(sock
)) != 0)
705 other
=unix_find_other(sunaddr
, alen
, sock
->type
, hash
, &err
);
709 unix_state_wlock(sk
);
712 if (!unix_may_send(sk
, other
))
716 * 1003.1g breaking connected state with AF_UNSPEC
719 unix_state_wlock(sk
);
723 * If it was connected, reconnect.
726 sock_put(unix_peer(sk
));
730 unix_state_wunlock(sk
);
734 unix_state_wunlock(sk
);
740 static long unix_wait_for_peer(unix_socket
*other
, long timeo
)
743 DECLARE_WAITQUEUE(wait
, current
);
745 __set_current_state(TASK_INTERRUPTIBLE
|TASK_EXCLUSIVE
);
746 add_wait_queue_exclusive(&other
->protinfo
.af_unix
.peer_wait
, &wait
);
748 sched
= (!other
->dead
&&
749 !(other
->shutdown
&RCV_SHUTDOWN
) &&
750 skb_queue_len(&other
->receive_queue
) > other
->max_ack_backlog
);
752 unix_state_runlock(other
);
755 timeo
= schedule_timeout(timeo
);
757 __set_current_state(TASK_RUNNING
);
758 remove_wait_queue(&other
->protinfo
.af_unix
.peer_wait
, &wait
);
762 static int unix_stream_connect(struct socket
*sock
, struct sockaddr
*uaddr
,
763 int addr_len
, int flags
)
765 struct sockaddr_un
*sunaddr
=(struct sockaddr_un
*)uaddr
;
766 struct sock
*sk
= sock
->sk
;
767 struct sock
*newsk
= NULL
;
768 unix_socket
*other
= NULL
;
769 struct sk_buff
*skb
= NULL
;
775 err
= unix_mkname(sunaddr
, addr_len
, &hash
);
780 if (sock
->passcred
&& !sk
->protinfo
.af_unix
.addr
&&
781 (err
= unix_autobind(sock
)) != 0)
784 timeo
= sock_sndtimeo(sk
, flags
& O_NONBLOCK
);
786 /* First of all allocate resources.
787 If we will make it after state is locked,
788 we will have to recheck all again in any case.
793 /* create new sock for complete connection */
794 newsk
= unix_create1(NULL
);
798 /* Allocate skb for sending to listening sock */
799 skb
= sock_wmalloc(newsk
, 1, 0, GFP_KERNEL
);
804 /* Find listening sock. */
805 other
=unix_find_other(sunaddr
, addr_len
, sk
->type
, hash
, &err
);
809 /* Latch state of peer */
810 unix_state_rlock(other
);
812 /* Apparently VFS overslept socket death. Retry. */
814 unix_state_runlock(other
);
820 if (other
->state
!= TCP_LISTEN
)
823 if (skb_queue_len(&other
->receive_queue
) > other
->max_ack_backlog
) {
828 timeo
= unix_wait_for_peer(other
, timeo
);
831 if (signal_pending(current
))
839 It is tricky place. We need to grab write lock and cannot
840 drop lock on peer. It is dangerous because deadlock is
841 possible. Connect to self case and simultaneous
842 attempt to connect are eliminated by checking socket
843 state. other is TCP_LISTEN, if sk is TCP_LISTEN we
844 check this before attempt to grab lock.
846 Well, and we have to recheck the state after socket locked.
852 /* This is ok... continue with connect */
854 case TCP_ESTABLISHED
:
855 /* Socket is already connected */
863 unix_state_wlock(sk
);
865 if (sk
->state
!= st
) {
866 unix_state_wunlock(sk
);
867 unix_state_runlock(other
);
872 /* The way is open! Fastly set all the necessary fields... */
876 newsk
->state
=TCP_ESTABLISHED
;
877 newsk
->type
=SOCK_STREAM
;
878 newsk
->peercred
.pid
= current
->pid
;
879 newsk
->peercred
.uid
= current
->euid
;
880 newsk
->peercred
.gid
= current
->egid
;
881 newsk
->sleep
= &newsk
->protinfo
.af_unix
.peer_wait
;
883 /* copy address information from listening to new sock*/
884 if (other
->protinfo
.af_unix
.addr
)
886 atomic_inc(&other
->protinfo
.af_unix
.addr
->refcnt
);
887 newsk
->protinfo
.af_unix
.addr
=other
->protinfo
.af_unix
.addr
;
889 if (other
->protinfo
.af_unix
.dentry
) {
890 /* Damn, even dget is not SMP safe. It becomes ridiculous... */
892 newsk
->protinfo
.af_unix
.dentry
=dget(other
->protinfo
.af_unix
.dentry
);
896 /* Set credentials */
897 sk
->peercred
= other
->peercred
;
901 sock
->state
=SS_CONNECTED
;
902 sk
->state
=TCP_ESTABLISHED
;
904 unix_state_wunlock(sk
);
906 /* take ten and and send info to listening sock */
907 skb_queue_tail(&other
->receive_queue
,skb
);
908 unix_state_runlock(other
);
909 other
->data_ready(other
, 0);
915 unix_state_runlock(other
);
921 unix_release_sock(newsk
, 0);
927 static int unix_socketpair(struct socket
*socka
, struct socket
*sockb
)
929 struct sock
*ska
=socka
->sk
, *skb
= sockb
->sk
;
931 /* Join our sockets back to back */
937 if (ska
->type
!= SOCK_DGRAM
)
939 ska
->state
=TCP_ESTABLISHED
;
940 skb
->state
=TCP_ESTABLISHED
;
941 socka
->state
=SS_CONNECTED
;
942 sockb
->state
=SS_CONNECTED
;
947 static int unix_accept(struct socket
*sock
, struct socket
*newsock
, int flags
)
949 unix_socket
*sk
= sock
->sk
;
955 if (sock
->type
!=SOCK_STREAM
)
959 if (sk
->state
!=TCP_LISTEN
)
962 /* If socket state is TCP_LISTEN it cannot change (for now...),
963 * so that no locks are necessary.
966 skb
= skb_recv_datagram(sk
, 0, flags
&O_NONBLOCK
, &err
);
971 skb_free_datagram(sk
, skb
);
972 wake_up_interruptible(&sk
->protinfo
.af_unix
.peer_wait
);
974 /* attach accepted sock to socket */
975 unix_state_wlock(tsk
);
976 newsock
->state
= SS_CONNECTED
;
977 sock_graft(tsk
, newsock
);
978 unix_state_wunlock(tsk
);
986 static int unix_getname(struct socket
*sock
, struct sockaddr
*uaddr
, int *uaddr_len
, int peer
)
988 struct sock
*sk
= sock
->sk
;
989 struct sockaddr_un
*sunaddr
=(struct sockaddr_un
*)uaddr
;
993 sk
= unix_peer_get(sk
);
1003 unix_state_rlock(sk
);
1004 if (!sk
->protinfo
.af_unix
.addr
) {
1005 sunaddr
->sun_family
= AF_UNIX
;
1006 sunaddr
->sun_path
[0] = 0;
1007 *uaddr_len
= sizeof(short);
1009 struct unix_address
*addr
= sk
->protinfo
.af_unix
.addr
;
1011 *uaddr_len
= addr
->len
;
1012 memcpy(sunaddr
, addr
->name
, *uaddr_len
);
1014 unix_state_runlock(sk
);
1020 static void unix_detach_fds(struct scm_cookie
*scm
, struct sk_buff
*skb
)
1024 scm
->fp
= UNIXCB(skb
).fp
;
1025 skb
->destructor
= sock_wfree
;
1026 UNIXCB(skb
).fp
= NULL
;
1028 for (i
=scm
->fp
->count
-1; i
>=0; i
--)
1029 unix_notinflight(scm
->fp
->fp
[i
]);
1032 static void unix_destruct_fds(struct sk_buff
*skb
)
1034 struct scm_cookie scm
;
1035 memset(&scm
, 0, sizeof(scm
));
1036 unix_detach_fds(&scm
, skb
);
1038 /* Alas, it calls VFS */
1045 static void unix_attach_fds(struct scm_cookie
*scm
, struct sk_buff
*skb
)
1048 for (i
=scm
->fp
->count
-1; i
>=0; i
--)
1049 unix_inflight(scm
->fp
->fp
[i
]);
1050 UNIXCB(skb
).fp
= scm
->fp
;
1051 skb
->destructor
= unix_destruct_fds
;
1056 * Send AF_UNIX data.
1059 static int unix_dgram_sendmsg(struct socket
*sock
, struct msghdr
*msg
, int len
,
1060 struct scm_cookie
*scm
)
1062 struct sock
*sk
= sock
->sk
;
1063 struct sockaddr_un
*sunaddr
=msg
->msg_name
;
1064 unix_socket
*other
= NULL
;
1065 int namelen
= 0; /* fake GCC */
1068 struct sk_buff
*skb
;
1072 if (msg
->msg_flags
&MSG_OOB
)
1075 if (msg
->msg_namelen
) {
1076 err
= unix_mkname(sunaddr
, msg
->msg_namelen
, &hash
);
1083 other
= unix_peer_get(sk
);
1088 if (sock
->passcred
&& !sk
->protinfo
.af_unix
.addr
&&
1089 (err
= unix_autobind(sock
)) != 0)
1093 skb
= sock_alloc_send_skb(sk
, len
, 0, msg
->msg_flags
&MSG_DONTWAIT
, &err
);
1097 memcpy(UNIXCREDS(skb
), &scm
->creds
, sizeof(struct ucred
));
1099 unix_attach_fds(scm
, skb
);
1101 skb
->h
.raw
= skb
->data
;
1102 err
= memcpy_fromiovec(skb_put(skb
,len
), msg
->msg_iov
, len
);
1106 timeo
= sock_sndtimeo(sk
, msg
->msg_flags
& MSG_DONTWAIT
);
1111 if (sunaddr
== NULL
)
1114 other
= unix_find_other(sunaddr
, namelen
, sk
->type
, hash
, &err
);
1119 unix_state_rlock(other
);
1121 if (!unix_may_send(sk
, other
))
1126 * Check with 1003.1g - what should
1129 unix_state_runlock(other
);
1133 unix_state_wlock(sk
);
1134 if (unix_peer(sk
) == other
) {
1137 err
= -ECONNREFUSED
;
1139 unix_state_wunlock(sk
);
1148 if (other
->shutdown
&RCV_SHUTDOWN
)
1151 if (skb_queue_len(&other
->receive_queue
) > other
->max_ack_backlog
) {
1157 timeo
= unix_wait_for_peer(other
, timeo
);
1160 if (signal_pending(current
))
1166 skb_queue_tail(&other
->receive_queue
, skb
);
1167 unix_state_runlock(other
);
1168 other
->data_ready(other
, len
);
1173 unix_state_runlock(other
);
1183 static int unix_stream_sendmsg(struct socket
*sock
, struct msghdr
*msg
, int len
,
1184 struct scm_cookie
*scm
)
1186 struct sock
*sk
= sock
->sk
;
1187 unix_socket
*other
= NULL
;
1188 struct sockaddr_un
*sunaddr
=msg
->msg_name
;
1190 struct sk_buff
*skb
;
1195 if (msg
->msg_flags
&MSG_OOB
)
1198 if (msg
->msg_namelen
) {
1199 err
= (sk
->state
==TCP_ESTABLISHED
? -EISCONN
: -EOPNOTSUPP
);
1204 other
= unix_peer_get(sk
);
1209 if (sk
->shutdown
&SEND_SHUTDOWN
)
1215 * Optimisation for the fact that under 0.01% of X messages typically
1221 /* Keep two messages in the pipe so it schedules better */
1222 if (size
> sk
->sndbuf
/2 - 16)
1223 size
= sk
->sndbuf
/2 - 16;
1226 * Keep to page sized kmalloc()'s as various people
1227 * have suggested. Big mallocs stress the vm too
1232 limit
= 4096-16; /* Fall back to a page if we can't grab a big buffer this instant */
1234 limit
= 0; /* Otherwise just grab and wait */
1240 skb
=sock_alloc_send_skb(sk
,size
,limit
,msg
->msg_flags
&MSG_DONTWAIT
, &err
);
1246 * If you pass two values to the sock_alloc_send_skb
1247 * it tries to grab the large buffer with GFP_BUFFER
1248 * (which can fail easily), and if it fails grab the
1249 * fallback size buffer which is under a page and will
1252 size
= min(size
, skb_tailroom(skb
));
1254 memcpy(UNIXCREDS(skb
), &scm
->creds
, sizeof(struct ucred
));
1256 unix_attach_fds(scm
, skb
);
1258 if ((err
= memcpy_fromiovec(skb_put(skb
,size
), msg
->msg_iov
, size
)) != 0) {
1263 unix_state_rlock(other
);
1265 if (other
->dead
|| (other
->shutdown
& RCV_SHUTDOWN
))
1268 skb_queue_tail(&other
->receive_queue
, skb
);
1269 unix_state_runlock(other
);
1270 other
->data_ready(other
, size
);
1278 unix_state_runlock(other
);
1280 if (sent
==0 && !(msg
->msg_flags
&MSG_NOSIGNAL
))
1281 send_sig(SIGPIPE
,current
,0);
1286 return sent
? : err
;
1289 static void unix_copy_addr(struct msghdr
*msg
, struct sock
*sk
)
1291 msg
->msg_namelen
= sizeof(short);
1292 if (sk
->protinfo
.af_unix
.addr
) {
1293 msg
->msg_namelen
=sk
->protinfo
.af_unix
.addr
->len
;
1294 memcpy(msg
->msg_name
,
1295 sk
->protinfo
.af_unix
.addr
->name
,
1296 sk
->protinfo
.af_unix
.addr
->len
);
1300 static int unix_dgram_recvmsg(struct socket
*sock
, struct msghdr
*msg
, int size
,
1301 int flags
, struct scm_cookie
*scm
)
1303 struct sock
*sk
= sock
->sk
;
1304 int noblock
= flags
& MSG_DONTWAIT
;
1305 struct sk_buff
*skb
;
1312 msg
->msg_namelen
= 0;
1314 skb
= skb_recv_datagram(sk
, flags
, noblock
, &err
);
1318 wake_up_interruptible(&sk
->protinfo
.af_unix
.peer_wait
);
1321 unix_copy_addr(msg
, skb
->sk
);
1323 if (size
> skb
->len
)
1325 else if (size
< skb
->len
)
1326 msg
->msg_flags
|= MSG_TRUNC
;
1328 err
= skb_copy_datagram_iovec(skb
, 0, msg
->msg_iov
, size
);
1332 scm
->creds
= *UNIXCREDS(skb
);
1334 if (!(flags
& MSG_PEEK
))
1337 unix_detach_fds(scm
, skb
);
1341 /* It is questionable: on PEEK we could:
1342 - do not return fds - good, but too simple 8)
1343 - return fds, and do not return them on read (old strategy,
1345 - clone fds (I choosed it for now, it is the most universal
1348 POSIX 1003.1g does not actually define this clearly
1349 at all. POSIX 1003.1g doesn't define a lot of things
1354 scm
->fp
= scm_fp_dup(UNIXCB(skb
).fp
);
1359 skb_free_datagram(sk
,skb
);
1365 * Sleep until data has arrive. But check for races..
1368 static long unix_stream_data_wait(unix_socket
* sk
, long timeo
)
1370 DECLARE_WAITQUEUE(wait
, current
);
1372 unix_state_rlock(sk
);
1374 add_wait_queue(sk
->sleep
, &wait
);
1377 set_current_state(TASK_INTERRUPTIBLE
);
1379 if (skb_queue_len(&sk
->receive_queue
) ||
1381 (sk
->shutdown
& RCV_SHUTDOWN
) ||
1382 signal_pending(current
) ||
1386 sk
->socket
->flags
|= SO_WAITDATA
;
1387 unix_state_runlock(sk
);
1388 timeo
= schedule_timeout(timeo
);
1389 unix_state_rlock(sk
);
1390 sk
->socket
->flags
&= ~SO_WAITDATA
;
1393 __set_current_state(TASK_RUNNING
);
1394 remove_wait_queue(sk
->sleep
, &wait
);
1395 unix_state_runlock(sk
);
1401 static int unix_stream_recvmsg(struct socket
*sock
, struct msghdr
*msg
, int size
,
1402 int flags
, struct scm_cookie
*scm
)
1404 struct sock
*sk
= sock
->sk
;
1405 struct sockaddr_un
*sunaddr
=msg
->msg_name
;
1407 int check_creds
= 0;
1413 if (sk
->state
!= TCP_ESTABLISHED
)
1420 target
= sock_rcvlowat(sk
, flags
&MSG_WAITALL
, size
);
1421 timeo
= sock_rcvtimeo(sk
, flags
&MSG_DONTWAIT
);
1423 msg
->msg_namelen
= 0;
1425 /* Lock the socket to prevent queue disordering
1426 * while sleeps in memcpy_tomsg
1429 down(&sk
->protinfo
.af_unix
.readsem
);
1434 struct sk_buff
*skb
;
1436 skb
=skb_dequeue(&sk
->receive_queue
);
1439 if (copied
>= target
)
1443 * POSIX 1003.1g mandates this order.
1446 if ((err
= sock_error(sk
)) != 0)
1448 if (sk
->shutdown
& RCV_SHUTDOWN
)
1453 up(&sk
->protinfo
.af_unix
.readsem
);
1455 timeo
= unix_stream_data_wait(sk
, timeo
);
1457 if (signal_pending(current
)) {
1461 down(&sk
->protinfo
.af_unix
.readsem
);
1466 /* Never glue messages from different writers */
1467 if (memcmp(UNIXCREDS(skb
), &scm
->creds
, sizeof(scm
->creds
)) != 0) {
1468 skb_queue_head(&sk
->receive_queue
, skb
);
1472 /* Copy credentials */
1473 scm
->creds
= *UNIXCREDS(skb
);
1477 /* Copy address just once */
1480 unix_copy_addr(msg
, skb
->sk
);
1484 chunk
= min(skb
->len
, size
);
1485 if (memcpy_toiovec(msg
->msg_iov
, skb
->data
, chunk
)) {
1486 skb_queue_head(&sk
->receive_queue
, skb
);
1494 /* Mark read part of skb as used */
1495 if (!(flags
& MSG_PEEK
))
1497 skb_pull(skb
, chunk
);
1500 unix_detach_fds(scm
, skb
);
1502 /* put the skb back if we didn't use it up.. */
1505 skb_queue_head(&sk
->receive_queue
, skb
);
1516 /* It is questionable, see note in unix_dgram_recvmsg.
1519 scm
->fp
= scm_fp_dup(UNIXCB(skb
).fp
);
1521 /* put message back and return */
1522 skb_queue_head(&sk
->receive_queue
, skb
);
1527 up(&sk
->protinfo
.af_unix
.readsem
);
1529 return copied
? : err
;
1532 static int unix_shutdown(struct socket
*sock
, int mode
)
1534 struct sock
*sk
= sock
->sk
;
1537 mode
= (mode
+1)&(RCV_SHUTDOWN
|SEND_SHUTDOWN
);
1540 unix_state_wlock(sk
);
1541 sk
->shutdown
|= mode
;
1542 other
=unix_peer(sk
);
1545 unix_state_wunlock(sk
);
1546 sk
->state_change(sk
);
1548 if (other
&& sk
->type
== SOCK_STREAM
) {
1551 if (mode
&RCV_SHUTDOWN
)
1552 peer_mode
|= SEND_SHUTDOWN
;
1553 if (mode
&SEND_SHUTDOWN
)
1554 peer_mode
|= RCV_SHUTDOWN
;
1555 unix_state_wlock(other
);
1556 other
->shutdown
|= peer_mode
;
1557 unix_state_wunlock(other
);
1558 other
->state_change(other
);
1559 if (peer_mode
&RCV_SHUTDOWN
)
1560 sock_wake_async(sk
->socket
,1,POLL_HUP
);
1569 static int unix_ioctl(struct socket
*sock
, unsigned int cmd
, unsigned long arg
)
1571 struct sock
*sk
= sock
->sk
;
1578 amount
= atomic_read(&sk
->wmem_alloc
);
1579 err
= put_user(amount
, (int *)arg
);
1583 struct sk_buff
*skb
;
1584 if (sk
->state
==TCP_LISTEN
) {
1589 spin_lock(&sk
->receive_queue
.lock
);
1590 if((skb
=skb_peek(&sk
->receive_queue
))!=NULL
)
1592 spin_unlock(&sk
->receive_queue
.lock
);
1593 err
= put_user(amount
, (int *)arg
);
1604 static unsigned int unix_poll(struct file
* file
, struct socket
*sock
, poll_table
*wait
)
1606 struct sock
*sk
= sock
->sk
;
1609 poll_wait(file
, sk
->sleep
, wait
);
1612 /* exceptional events? */
1615 if (sk
->shutdown
== SHUTDOWN_MASK
)
1619 if (!skb_queue_empty(&sk
->receive_queue
) || (sk
->shutdown
&RCV_SHUTDOWN
))
1620 mask
|= POLLIN
| POLLRDNORM
;
1622 /* Connection-based need to check for termination and startup */
1623 if (sk
->type
== SOCK_STREAM
&& sk
->state
==TCP_CLOSE
)
1627 * we set writable also when the other side has shut down the
1628 * connection. This prevents stuck sockets.
1630 if (unix_writable(sk
))
1631 mask
|= POLLOUT
| POLLWRNORM
| POLLWRBAND
;
1637 #ifdef CONFIG_PROC_FS
1638 static int unix_read_proc(char *buffer
, char **start
, off_t offset
,
1639 int length
, int *eof
, void *data
)
1647 len
+= sprintf(buffer
,"Num RefCount Protocol Flags Type St "
1650 read_lock(&unix_table_lock
);
1651 forall_unix_sockets (i
,s
)
1653 unix_state_rlock(s
);
1655 len
+=sprintf(buffer
+len
,"%p: %08X %08X %08X %04X %02X %5ld",
1657 atomic_read(&s
->refcnt
),
1659 s
->state
== TCP_LISTEN
? SO_ACCEPTCON
: 0,
1662 (s
->state
== TCP_ESTABLISHED
? SS_CONNECTED
: SS_UNCONNECTED
) :
1663 (s
->state
== TCP_ESTABLISHED
? SS_CONNECTING
: SS_DISCONNECTING
),
1664 s
->socket
? s
->socket
->inode
->i_ino
: 0);
1666 if (s
->protinfo
.af_unix
.addr
)
1668 buffer
[len
++] = ' ';
1669 memcpy(buffer
+len
, s
->protinfo
.af_unix
.addr
->name
->sun_path
,
1670 s
->protinfo
.af_unix
.addr
->len
-sizeof(short));
1671 if (!UNIX_ABSTRACT(s
))
1675 len
+= s
->protinfo
.af_unix
.addr
->len
- sizeof(short);
1677 unix_state_runlock(s
);
1687 if(pos
>offset
+length
)
1692 read_unlock(&unix_table_lock
);
1693 *start
=buffer
+(offset
-begin
);
1694 len
-=(offset
-begin
);
1703 struct proto_ops unix_stream_ops
= {
1708 unix_stream_connect
,
1719 unix_stream_sendmsg
,
1720 unix_stream_recvmsg
,
1724 struct proto_ops unix_dgram_ops
= {
1745 struct net_proto_family unix_family_ops
= {
1751 #ifdef CONFIG_SYSCTL
1752 extern void unix_sysctl_register(void);
1753 extern void unix_sysctl_unregister(void);
1756 int init_module(void)
1758 void __init
unix_proto_init(struct net_proto
*pro
)
1761 struct sk_buff
*dummy_skb
;
1763 printk(KERN_INFO
"NET4: Unix domain sockets 1.0/SMP for Linux NET4.0.\n");
1764 if (sizeof(struct unix_skb_parms
) > sizeof(dummy_skb
->cb
))
1766 printk(KERN_CRIT
"unix_proto_init: panic\n");
1773 sock_register(&unix_family_ops
);
1774 #ifdef CONFIG_PROC_FS
1775 create_proc_read_entry("net/unix", 0, 0, unix_read_proc
, NULL
);
1779 #ifdef CONFIG_SYSCTL
1780 unix_sysctl_register();
1788 void cleanup_module(void)
1790 sock_unregister(PF_UNIX
);
1791 #ifdef CONFIG_SYSCTL
1792 unix_sysctl_unregister();
1794 #ifdef CONFIG_PROC_FS
1795 remove_proc_entry("net/unix", 0);
1802 * compile-command: "gcc -g -D__KERNEL__ -Wall -O6 -I/usr/src/linux/include -c af_unix.c"