2 * NET3: Implementation of BSD Unix domain sockets.
4 * Authors: Alan Cox, <alan.cox@linux.org>
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
11 * Version: $Id: af_unix.c,v 1.108 2000/11/10 04:02:04 davem Exp $
14 * Linus Torvalds : Assorted bug cures.
15 * Niibe Yutaka : async I/O support.
16 * Carsten Paeth : PF_UNIX check, address fixes.
17 * Alan Cox : Limit size of allocated blocks.
18 * Alan Cox : Fixed the stupid socketpair bug.
19 * Alan Cox : BSD compatibility fine tuning.
20 * Alan Cox : Fixed a bug in connect when interrupted.
21 * Alan Cox : Sorted out a proper draft version of
22 * file descriptor passing hacked up from
24 * Marty Leisner : Fixes to fd passing
25 * Nick Nevin : recvmsg bugfix.
26 * Alan Cox : Started proper garbage collector
27 * Heiko EiBfeldt : Missing verify_area check
28 * Alan Cox : Started POSIXisms
29 * Andreas Schwab : Replace inode by dentry for proper
31 * Kirk Petersen : Made this a module
32 * Christoph Rohland : Elegant non-blocking accept/connect algorithm.
34 * Alexey Kuznetosv : Repaired (I hope) bugs introduced
35 * by the above two patches.
36 * Andrea Arcangeli : If possible we block in connect(2)
37 * if the max backlog of the listen socket
38 * has been reached. This won't break
39 * old apps and it will avoid a huge amount
40 * of socks hashed (this is for unix_gc()
41 * performance reasons).
42 * Security fix that limits the max
43 * number of socks to 2*max_files and
44 * the number of skb queueable in the
46 * Artur Skawina : Hash function optimizations
47 * Alexey Kuznetsov : Full scale SMP. Lot of bugs are introduced 8)
48 * Malcolm Beattie : Set peercred for socketpair
49 * Michal Ostrowski : Module initialization cleanup.
52 * Known differences from reference BSD that was tested:
55 * ECONNREFUSED is not returned from one end of a connected() socket to the
56 * other the moment one end closes.
57 * fstat() doesn't return st_dev=NODEV, and give the blksize as high water mark
58 * and a fake inode identifier (nor the BSD first socket fstat twice bug).
60 * accept() returns a path name even if the connecting socket has closed
61 * in the meantime (BSD loses the path and gives up).
62 * accept() returns 0 length path for an unbound connector. BSD returns 16
63 * and a null first byte in the path (but not for gethost/peername - BSD bug ??)
64 * socketpair(...SOCK_RAW..) doesn't panic the kernel.
65 * BSD af_unix apparently has connect forgetting to block properly.
66 * (need to check this with the POSIX spec in detail)
68 * Differences from 2.0.0-11-... (ANK)
69 * Bug fixes and improvements.
70 * - client shutdown killed server socket.
71 * - removed all useless cli/sti pairs.
73 * Semantic changes/extensions.
74 * - generic control message passing.
75 * - SCM_CREDENTIALS control message.
76 * - "Abstract" (not FS based) socket bindings.
77 * Abstract names are sequences of bytes (not zero terminated)
78 * started by 0, so that this name space does not intersect
82 #include <linux/module.h>
83 #include <linux/config.h>
84 #include <linux/kernel.h>
85 #include <linux/major.h>
86 #include <linux/signal.h>
87 #include <linux/sched.h>
88 #include <linux/errno.h>
89 #include <linux/string.h>
90 #include <linux/stat.h>
91 #include <linux/socket.h>
93 #include <linux/fcntl.h>
94 #include <linux/termios.h>
95 #include <linux/sockios.h>
96 #include <linux/net.h>
99 #include <linux/malloc.h>
100 #include <asm/uaccess.h>
101 #include <linux/skbuff.h>
102 #include <linux/netdevice.h>
103 #include <net/sock.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
108 #include <linux/init.h>
109 #include <linux/poll.h>
110 #include <linux/smp_lock.h>
112 #include <asm/checksum.h>
114 #define min(a,b) (((a)<(b))?(a):(b))
116 int sysctl_unix_max_dgram_qlen
= 10;
118 unix_socket
*unix_socket_table
[UNIX_HASH_SIZE
+1];
119 rwlock_t unix_table_lock
= RW_LOCK_UNLOCKED
;
120 static atomic_t unix_nr_socks
= ATOMIC_INIT(0);
122 #define unix_sockets_unbound (unix_socket_table[UNIX_HASH_SIZE])
124 #define UNIX_ABSTRACT(sk) ((sk)->protinfo.af_unix.addr->hash!=UNIX_HASH_SIZE)
/*
 * SMP locking strategy:
 *   - the hash table is protected by the rwlock unix_table_lock;
 *   - each socket's state is protected by a separate rwlock.
 */
133 extern __inline__
unsigned unix_hash_fold(unsigned hash
)
137 return hash
&(UNIX_HASH_SIZE
-1);
140 #define unix_peer(sk) ((sk)->pair)
142 extern __inline__
int unix_our_peer(unix_socket
*sk
, unix_socket
*osk
)
144 return unix_peer(osk
) == sk
;
147 extern __inline__
int unix_may_send(unix_socket
*sk
, unix_socket
*osk
)
149 return (unix_peer(osk
) == NULL
|| unix_our_peer(sk
, osk
));
152 static __inline__ unix_socket
* unix_peer_get(unix_socket
*s
)
160 unix_state_runlock(s
);
164 extern __inline__
void unix_release_addr(struct unix_address
*addr
)
166 if (atomic_dec_and_test(&addr
->refcnt
))
171 * Check unix socket name:
172 * - should be not zero length.
173 * - if started by not zero, should be NULL terminated (FS object)
174 * - if started by zero, it is abstract name.
177 static int unix_mkname(struct sockaddr_un
* sunaddr
, int len
, unsigned *hashp
)
179 if (len
<= sizeof(short) || len
> sizeof(*sunaddr
))
181 if (!sunaddr
|| sunaddr
->sun_family
!= AF_UNIX
)
183 if (sunaddr
->sun_path
[0])
186 * This may look like an off by one error but it is
187 * a bit more subtle. 108 is the longest valid AF_UNIX
188 * path for a binding. sun_path[108] doesnt as such
189 * exist. However in kernel space we are guaranteed that
190 * it is a valid memory location in our kernel
193 if (len
> sizeof(*sunaddr
))
194 len
= sizeof(*sunaddr
);
195 ((char *)sunaddr
)[len
]=0;
196 len
= strlen(sunaddr
->sun_path
)+1+sizeof(short);
200 *hashp
= unix_hash_fold(csum_partial((char*)sunaddr
, len
, 0));
204 static void __unix_remove_socket(unix_socket
*sk
)
206 unix_socket
**list
= sk
->protinfo
.af_unix
.list
;
209 sk
->next
->prev
= sk
->prev
;
211 sk
->prev
->next
= sk
->next
;
214 sk
->protinfo
.af_unix
.list
= NULL
;
221 static void __unix_insert_socket(unix_socket
**list
, unix_socket
*sk
)
223 BUG_TRAP(sk
->protinfo
.af_unix
.list
==NULL
);
225 sk
->protinfo
.af_unix
.list
= list
;
234 static __inline__
void unix_remove_socket(unix_socket
*sk
)
236 write_lock(&unix_table_lock
);
237 __unix_remove_socket(sk
);
238 write_unlock(&unix_table_lock
);
241 static __inline__
void unix_insert_socket(unix_socket
**list
, unix_socket
*sk
)
243 write_lock(&unix_table_lock
);
244 __unix_insert_socket(list
, sk
);
245 write_unlock(&unix_table_lock
);
248 static unix_socket
*__unix_find_socket_byname(struct sockaddr_un
*sunname
,
249 int len
, int type
, unsigned hash
)
253 for (s
=unix_socket_table
[hash
^type
]; s
; s
=s
->next
) {
254 if(s
->protinfo
.af_unix
.addr
->len
==len
&&
255 memcmp(s
->protinfo
.af_unix
.addr
->name
, sunname
, len
) == 0)
261 static __inline__ unix_socket
*
262 unix_find_socket_byname(struct sockaddr_un
*sunname
,
263 int len
, int type
, unsigned hash
)
267 read_lock(&unix_table_lock
);
268 s
= __unix_find_socket_byname(sunname
, len
, type
, hash
);
271 read_unlock(&unix_table_lock
);
275 static unix_socket
*unix_find_socket_byinode(struct inode
*i
)
279 read_lock(&unix_table_lock
);
280 for (s
=unix_socket_table
[i
->i_ino
& (UNIX_HASH_SIZE
-1)]; s
; s
=s
->next
)
282 struct dentry
*dentry
= s
->protinfo
.af_unix
.dentry
;
284 if(dentry
&& dentry
->d_inode
== i
)
290 read_unlock(&unix_table_lock
);
294 static __inline__
int unix_writable(struct sock
*sk
)
296 return ((atomic_read(&sk
->wmem_alloc
)<<2) <= sk
->sndbuf
);
299 static void unix_write_space(struct sock
*sk
)
301 read_lock(&sk
->callback_lock
);
302 if (unix_writable(sk
)) {
303 if (sk
->sleep
&& waitqueue_active(sk
->sleep
))
304 wake_up_interruptible(sk
->sleep
);
305 sk_wake_async(sk
, 2, POLL_OUT
);
307 read_unlock(&sk
->callback_lock
);
310 /* When dgram socket disconnects (or changes its peer), we clear its receive
311 * queue of packets arrived from previous peer. First, it allows to do
312 * flow control based only on wmem_alloc; second, sk connected to peer
313 * may receive messages only from that peer. */
314 static void unix_dgram_disconnected(struct sock
*sk
, struct sock
*other
)
316 if (skb_queue_len(&sk
->receive_queue
)) {
317 skb_queue_purge(&sk
->receive_queue
);
318 wake_up_interruptible_all(&sk
->protinfo
.af_unix
.peer_wait
);
320 /* If one link of bidirectional dgram pipe is disconnected,
321 * we signal error. Messages are lost. Do not make this,
322 * when peer was not connected to us.
324 if (!other
->dead
&& unix_peer(other
) == sk
) {
325 other
->err
= ECONNRESET
;
326 other
->error_report(other
);
331 static void unix_sock_destructor(struct sock
*sk
)
333 skb_queue_purge(&sk
->receive_queue
);
335 BUG_TRAP(atomic_read(&sk
->wmem_alloc
) == 0);
336 BUG_TRAP(sk
->protinfo
.af_unix
.list
==NULL
);
337 BUG_TRAP(sk
->socket
==NULL
);
339 printk("Attempt to release alive unix socket: %p\n", sk
);
343 if (sk
->protinfo
.af_unix
.addr
)
344 unix_release_addr(sk
->protinfo
.af_unix
.addr
);
346 atomic_dec(&unix_nr_socks
);
347 #ifdef UNIX_REFCNT_DEBUG
348 printk(KERN_DEBUG
"UNIX %p is destroyed, %d are still alive.\n", sk
, atomic_read(&unix_nr_socks
));
353 static int unix_release_sock (unix_socket
*sk
, int embrion
)
355 struct dentry
*dentry
;
356 struct vfsmount
*mnt
;
361 unix_remove_socket(sk
);
364 unix_state_wlock(sk
);
366 sk
->shutdown
= SHUTDOWN_MASK
;
367 dentry
= sk
->protinfo
.af_unix
.dentry
;
368 sk
->protinfo
.af_unix
.dentry
=NULL
;
369 mnt
= sk
->protinfo
.af_unix
.mnt
;
370 sk
->protinfo
.af_unix
.mnt
=NULL
;
372 sk
->state
= TCP_CLOSE
;
373 unix_state_wunlock(sk
);
375 wake_up_interruptible_all(&sk
->protinfo
.af_unix
.peer_wait
);
377 skpair
=unix_peer(sk
);
380 if (sk
->type
==SOCK_STREAM
) {
381 unix_state_wlock(skpair
);
382 skpair
->shutdown
=SHUTDOWN_MASK
; /* No more writes*/
383 if (!skb_queue_empty(&sk
->receive_queue
) || embrion
)
384 skpair
->err
= ECONNRESET
;
385 unix_state_wunlock(skpair
);
386 skpair
->state_change(skpair
);
387 read_lock(&skpair
->callback_lock
);
388 sk_wake_async(skpair
,1,POLL_HUP
);
389 read_unlock(&skpair
->callback_lock
);
391 sock_put(skpair
); /* It may now die */
392 unix_peer(sk
) = NULL
;
395 /* Try to flush out this socket. Throw out buffers at least */
397 while((skb
=skb_dequeue(&sk
->receive_queue
))!=NULL
)
399 if (state
==TCP_LISTEN
)
400 unix_release_sock(skb
->sk
, 1);
401 /* passed fds are erased in the kfree_skb hook */
412 /* ---- Socket is dead now and most probably destroyed ---- */
415 * Fixme: BSD difference: In BSD all sockets connected to use get
416 * ECONNRESET and we die on the spot. In Linux we behave
417 * like files and pipes do and wait for the last
420 * Can't we simply set sock->err?
422 * What the above comment does talk about? --ANK(980817)
425 if (atomic_read(&unix_tot_inflight
))
426 unix_gc(); /* Garbage collect fds */
431 static int unix_listen(struct socket
*sock
, int backlog
)
434 struct sock
*sk
= sock
->sk
;
437 if (sock
->type
!=SOCK_STREAM
)
438 goto out
; /* Only stream sockets accept */
440 if (!sk
->protinfo
.af_unix
.addr
)
441 goto out
; /* No listens on an unbound socket */
442 unix_state_wlock(sk
);
443 if (sk
->state
!= TCP_CLOSE
&& sk
->state
!= TCP_LISTEN
)
445 if (backlog
> sk
->max_ack_backlog
)
446 wake_up_interruptible_all(&sk
->protinfo
.af_unix
.peer_wait
);
447 sk
->max_ack_backlog
=backlog
;
448 sk
->state
=TCP_LISTEN
;
449 /* set credentials so connect can copy them */
450 sk
->peercred
.pid
= current
->pid
;
451 sk
->peercred
.uid
= current
->euid
;
452 sk
->peercred
.gid
= current
->egid
;
456 unix_state_wunlock(sk
);
/*
 * Forward declarations of the two operation tables; unix_create()
 * stores the address of one of them in sock->ops.
 */
extern struct proto_ops unix_stream_ops;
extern struct proto_ops unix_dgram_ops;
464 static struct sock
* unix_create1(struct socket
*sock
)
468 if (atomic_read(&unix_nr_socks
) >= 2*files_stat
.max_files
)
472 sk
= sk_alloc(PF_UNIX
, GFP_KERNEL
, 1);
478 atomic_inc(&unix_nr_socks
);
480 sock_init_data(sock
,sk
);
482 sk
->write_space
= unix_write_space
;
484 sk
->max_ack_backlog
= sysctl_unix_max_dgram_qlen
;
485 sk
->destruct
= unix_sock_destructor
;
486 sk
->protinfo
.af_unix
.dentry
=NULL
;
487 sk
->protinfo
.af_unix
.mnt
=NULL
;
488 sk
->protinfo
.af_unix
.lock
= RW_LOCK_UNLOCKED
;
489 atomic_set(&sk
->protinfo
.af_unix
.inflight
, 0);
490 init_MUTEX(&sk
->protinfo
.af_unix
.readsem
);/* single task reading lock */
491 init_waitqueue_head(&sk
->protinfo
.af_unix
.peer_wait
);
492 sk
->protinfo
.af_unix
.list
=NULL
;
493 unix_insert_socket(&unix_sockets_unbound
, sk
);
498 static int unix_create(struct socket
*sock
, int protocol
)
500 if (protocol
&& protocol
!= PF_UNIX
)
501 return -EPROTONOSUPPORT
;
503 sock
->state
= SS_UNCONNECTED
;
505 switch (sock
->type
) {
507 sock
->ops
= &unix_stream_ops
;
510 * Believe it or not BSD has AF_UNIX, SOCK_RAW though
514 sock
->type
=SOCK_DGRAM
;
516 sock
->ops
= &unix_dgram_ops
;
519 return -ESOCKTNOSUPPORT
;
522 return unix_create1(sock
) ? 0 : -ENOMEM
;
525 static int unix_release(struct socket
*sock
)
527 unix_socket
*sk
= sock
->sk
;
534 return unix_release_sock (sk
, 0);
537 static int unix_autobind(struct socket
*sock
)
539 struct sock
*sk
= sock
->sk
;
540 static u32 ordernum
= 1;
541 struct unix_address
* addr
;
544 down(&sk
->protinfo
.af_unix
.readsem
);
547 if (sk
->protinfo
.af_unix
.addr
)
551 addr
= kmalloc(sizeof(*addr
) + sizeof(short) + 16, GFP_KERNEL
);
555 memset(addr
, 0, sizeof(*addr
) + sizeof(short) + 16);
556 addr
->name
->sun_family
= AF_UNIX
;
557 atomic_set(&addr
->refcnt
, 1);
560 addr
->len
= sprintf(addr
->name
->sun_path
+1, "%05x", ordernum
) + 1 + sizeof(short);
561 addr
->hash
= unix_hash_fold(csum_partial((void*)addr
->name
, addr
->len
, 0));
563 write_lock(&unix_table_lock
);
564 ordernum
= (ordernum
+1)&0xFFFFF;
566 if (__unix_find_socket_byname(addr
->name
, addr
->len
, sock
->type
,
568 write_unlock(&unix_table_lock
);
569 /* Sanity yield. It is unusual case, but yet... */
570 if (!(ordernum
&0xFF)) {
571 current
->policy
|= SCHED_YIELD
;
576 addr
->hash
^= sk
->type
;
578 __unix_remove_socket(sk
);
579 sk
->protinfo
.af_unix
.addr
= addr
;
580 __unix_insert_socket(&unix_socket_table
[addr
->hash
], sk
);
581 write_unlock(&unix_table_lock
);
585 up(&sk
->protinfo
.af_unix
.readsem
);
589 static unix_socket
*unix_find_other(struct sockaddr_un
*sunname
, int len
,
590 int type
, unsigned hash
, int *error
)
596 if (sunname
->sun_path
[0]) {
597 if (path_init(sunname
->sun_path
,
598 LOOKUP_POSITIVE
|LOOKUP_FOLLOW
, &nd
))
599 err
= path_walk(sunname
->sun_path
, &nd
);
602 err
= permission(nd
.dentry
->d_inode
,MAY_WRITE
);
607 if (!S_ISSOCK(nd
.dentry
->d_inode
->i_mode
))
609 u
=unix_find_socket_byinode(nd
.dentry
->d_inode
);
616 if (u
->type
!= type
) {
622 u
=unix_find_socket_byname(sunname
, len
, type
, hash
);
636 static int unix_bind(struct socket
*sock
, struct sockaddr
*uaddr
, int addr_len
)
638 struct sock
*sk
= sock
->sk
;
639 struct sockaddr_un
*sunaddr
=(struct sockaddr_un
*)uaddr
;
640 struct dentry
* dentry
= NULL
;
644 struct unix_address
*addr
;
648 if (sunaddr
->sun_family
!= AF_UNIX
)
651 if (addr_len
==sizeof(short)) {
652 err
= unix_autobind(sock
);
656 err
= unix_mkname(sunaddr
, addr_len
, &hash
);
661 down(&sk
->protinfo
.af_unix
.readsem
);
664 if (sk
->protinfo
.af_unix
.addr
)
668 addr
= kmalloc(sizeof(*addr
)+addr_len
, GFP_KERNEL
);
672 memcpy(addr
->name
, sunaddr
, addr_len
);
673 addr
->len
= addr_len
;
674 addr
->hash
= hash
^sk
->type
;
675 atomic_set(&addr
->refcnt
, 1);
677 if (sunaddr
->sun_path
[0]) {
680 * Get the parent directory, calculate the hash for last
683 if (path_init(sunaddr
->sun_path
, LOOKUP_PARENT
, &nd
))
684 err
= path_walk(sunaddr
->sun_path
, &nd
);
686 goto out_mknod_parent
;
688 * Yucky last component or no last component at all?
689 * (foo/., foo/.., /////)
692 if (nd
.last_type
!= LAST_NORM
)
695 * Lock the directory.
697 down(&nd
.dentry
->d_inode
->i_sem
);
699 * Do the final lookup.
701 dentry
= lookup_hash(&nd
.last
, nd
.dentry
);
702 err
= PTR_ERR(dentry
);
704 goto out_mknod_unlock
;
707 * Special case - lookup gave negative, but... we had foo/bar/
708 * From the vfs_mknod() POV we just have a negative dentry -
709 * all is fine. Let's be bastards - you had / on the end, you've
710 * been asking for (non-existent) directory. -ENOENT for you.
712 if (nd
.last
.name
[nd
.last
.len
] && !dentry
->d_inode
)
715 * All right, let's create it.
717 err
= vfs_mknod(nd
.dentry
->d_inode
, dentry
,
718 S_IFSOCK
|sock
->inode
->i_mode
, 0);
721 up(&nd
.dentry
->d_inode
->i_sem
);
725 addr
->hash
= UNIX_HASH_SIZE
;
728 write_lock(&unix_table_lock
);
730 if (!sunaddr
->sun_path
[0]) {
732 if (__unix_find_socket_byname(sunaddr
, addr_len
,
734 unix_release_addr(addr
);
738 list
= &unix_socket_table
[addr
->hash
];
740 list
= &unix_socket_table
[dentry
->d_inode
->i_ino
& (UNIX_HASH_SIZE
-1)];
741 sk
->protinfo
.af_unix
.dentry
= nd
.dentry
;
742 sk
->protinfo
.af_unix
.mnt
= nd
.mnt
;
746 __unix_remove_socket(sk
);
747 sk
->protinfo
.af_unix
.addr
= addr
;
748 __unix_insert_socket(list
, sk
);
751 write_unlock(&unix_table_lock
);
753 up(&sk
->protinfo
.af_unix
.readsem
);
760 up(&nd
.dentry
->d_inode
->i_sem
);
766 unix_release_addr(addr
);
770 static int unix_dgram_connect(struct socket
*sock
, struct sockaddr
*addr
,
773 struct sock
*sk
= sock
->sk
;
774 struct sockaddr_un
*sunaddr
=(struct sockaddr_un
*)addr
;
779 if (addr
->sa_family
!= AF_UNSPEC
) {
780 err
= unix_mkname(sunaddr
, alen
, &hash
);
785 if (sock
->passcred
&& !sk
->protinfo
.af_unix
.addr
&&
786 (err
= unix_autobind(sock
)) != 0)
789 other
=unix_find_other(sunaddr
, alen
, sock
->type
, hash
, &err
);
793 unix_state_wlock(sk
);
796 if (!unix_may_send(sk
, other
))
800 * 1003.1g breaking connected state with AF_UNSPEC
803 unix_state_wlock(sk
);
807 * If it was connected, reconnect.
810 struct sock
*old_peer
= unix_peer(sk
);
812 unix_state_wunlock(sk
);
814 if (other
!= old_peer
)
815 unix_dgram_disconnected(sk
, old_peer
);
819 unix_state_wunlock(sk
);
824 unix_state_wunlock(sk
);
830 static long unix_wait_for_peer(unix_socket
*other
, long timeo
)
833 DECLARE_WAITQUEUE(wait
, current
);
835 __set_current_state(TASK_INTERRUPTIBLE
);
836 add_wait_queue_exclusive(&other
->protinfo
.af_unix
.peer_wait
, &wait
);
838 sched
= (!other
->dead
&&
839 !(other
->shutdown
&RCV_SHUTDOWN
) &&
840 skb_queue_len(&other
->receive_queue
) > other
->max_ack_backlog
);
842 unix_state_runlock(other
);
845 timeo
= schedule_timeout(timeo
);
847 __set_current_state(TASK_RUNNING
);
848 remove_wait_queue(&other
->protinfo
.af_unix
.peer_wait
, &wait
);
852 static int unix_stream_connect(struct socket
*sock
, struct sockaddr
*uaddr
,
853 int addr_len
, int flags
)
855 struct sockaddr_un
*sunaddr
=(struct sockaddr_un
*)uaddr
;
856 struct sock
*sk
= sock
->sk
;
857 struct sock
*newsk
= NULL
;
858 unix_socket
*other
= NULL
;
859 struct sk_buff
*skb
= NULL
;
865 err
= unix_mkname(sunaddr
, addr_len
, &hash
);
870 if (sock
->passcred
&& !sk
->protinfo
.af_unix
.addr
&&
871 (err
= unix_autobind(sock
)) != 0)
874 timeo
= sock_sndtimeo(sk
, flags
& O_NONBLOCK
);
876 /* First of all allocate resources.
877 If we will make it after state is locked,
878 we will have to recheck all again in any case.
883 /* create new sock for complete connection */
884 newsk
= unix_create1(NULL
);
888 /* Allocate skb for sending to listening sock */
889 skb
= sock_wmalloc(newsk
, 1, 0, GFP_KERNEL
);
894 /* Find listening sock. */
895 other
=unix_find_other(sunaddr
, addr_len
, sk
->type
, hash
, &err
);
899 /* Latch state of peer */
900 unix_state_rlock(other
);
902 /* Apparently VFS overslept socket death. Retry. */
904 unix_state_runlock(other
);
910 if (other
->state
!= TCP_LISTEN
)
913 if (skb_queue_len(&other
->receive_queue
) > other
->max_ack_backlog
) {
918 timeo
= unix_wait_for_peer(other
, timeo
);
920 err
= sock_intr_errno(timeo
);
921 if (signal_pending(current
))
929 It is tricky place. We need to grab write lock and cannot
930 drop lock on peer. It is dangerous because deadlock is
931 possible. Connect to self case and simultaneous
932 attempt to connect are eliminated by checking socket
933 state. other is TCP_LISTEN, if sk is TCP_LISTEN we
934 check this before attempt to grab lock.
936 Well, and we have to recheck the state after socket locked.
942 /* This is ok... continue with connect */
944 case TCP_ESTABLISHED
:
945 /* Socket is already connected */
953 unix_state_wlock(sk
);
955 if (sk
->state
!= st
) {
956 unix_state_wunlock(sk
);
957 unix_state_runlock(other
);
962 /* The way is open! Fastly set all the necessary fields... */
966 newsk
->state
=TCP_ESTABLISHED
;
967 newsk
->type
=SOCK_STREAM
;
968 newsk
->peercred
.pid
= current
->pid
;
969 newsk
->peercred
.uid
= current
->euid
;
970 newsk
->peercred
.gid
= current
->egid
;
971 newsk
->sleep
= &newsk
->protinfo
.af_unix
.peer_wait
;
973 /* copy address information from listening to new sock*/
974 if (other
->protinfo
.af_unix
.addr
)
976 atomic_inc(&other
->protinfo
.af_unix
.addr
->refcnt
);
977 newsk
->protinfo
.af_unix
.addr
=other
->protinfo
.af_unix
.addr
;
979 if (other
->protinfo
.af_unix
.dentry
) {
980 newsk
->protinfo
.af_unix
.dentry
=dget(other
->protinfo
.af_unix
.dentry
);
981 newsk
->protinfo
.af_unix
.mnt
=mntget(other
->protinfo
.af_unix
.mnt
);
984 /* Set credentials */
985 sk
->peercred
= other
->peercred
;
989 sock
->state
=SS_CONNECTED
;
990 sk
->state
=TCP_ESTABLISHED
;
992 unix_state_wunlock(sk
);
994 /* take ten and and send info to listening sock */
995 skb_queue_tail(&other
->receive_queue
,skb
);
996 unix_state_runlock(other
);
997 other
->data_ready(other
, 0);
1003 unix_state_runlock(other
);
1009 unix_release_sock(newsk
, 0);
1015 static int unix_socketpair(struct socket
*socka
, struct socket
*sockb
)
1017 struct sock
*ska
=socka
->sk
, *skb
= sockb
->sk
;
1019 /* Join our sockets back to back */
1024 ska
->peercred
.pid
= skb
->peercred
.pid
= current
->pid
;
1025 ska
->peercred
.uid
= skb
->peercred
.uid
= current
->euid
;
1026 ska
->peercred
.gid
= skb
->peercred
.gid
= current
->egid
;
1028 if (ska
->type
!= SOCK_DGRAM
)
1030 ska
->state
=TCP_ESTABLISHED
;
1031 skb
->state
=TCP_ESTABLISHED
;
1032 socka
->state
=SS_CONNECTED
;
1033 sockb
->state
=SS_CONNECTED
;
1038 static int unix_accept(struct socket
*sock
, struct socket
*newsock
, int flags
)
1040 unix_socket
*sk
= sock
->sk
;
1042 struct sk_buff
*skb
;
1046 if (sock
->type
!=SOCK_STREAM
)
1050 if (sk
->state
!=TCP_LISTEN
)
1053 /* If socket state is TCP_LISTEN it cannot change (for now...),
1054 * so that no locks are necessary.
1057 skb
= skb_recv_datagram(sk
, 0, flags
&O_NONBLOCK
, &err
);
1062 skb_free_datagram(sk
, skb
);
1063 wake_up_interruptible(&sk
->protinfo
.af_unix
.peer_wait
);
1065 /* attach accepted sock to socket */
1066 unix_state_wlock(tsk
);
1067 newsock
->state
= SS_CONNECTED
;
1068 sock_graft(tsk
, newsock
);
1069 unix_state_wunlock(tsk
);
1077 static int unix_getname(struct socket
*sock
, struct sockaddr
*uaddr
, int *uaddr_len
, int peer
)
1079 struct sock
*sk
= sock
->sk
;
1080 struct sockaddr_un
*sunaddr
=(struct sockaddr_un
*)uaddr
;
1084 sk
= unix_peer_get(sk
);
1094 unix_state_rlock(sk
);
1095 if (!sk
->protinfo
.af_unix
.addr
) {
1096 sunaddr
->sun_family
= AF_UNIX
;
1097 sunaddr
->sun_path
[0] = 0;
1098 *uaddr_len
= sizeof(short);
1100 struct unix_address
*addr
= sk
->protinfo
.af_unix
.addr
;
1102 *uaddr_len
= addr
->len
;
1103 memcpy(sunaddr
, addr
->name
, *uaddr_len
);
1105 unix_state_runlock(sk
);
1111 static void unix_detach_fds(struct scm_cookie
*scm
, struct sk_buff
*skb
)
1115 scm
->fp
= UNIXCB(skb
).fp
;
1116 skb
->destructor
= sock_wfree
;
1117 UNIXCB(skb
).fp
= NULL
;
1119 for (i
=scm
->fp
->count
-1; i
>=0; i
--)
1120 unix_notinflight(scm
->fp
->fp
[i
]);
1123 static void unix_destruct_fds(struct sk_buff
*skb
)
1125 struct scm_cookie scm
;
1126 memset(&scm
, 0, sizeof(scm
));
1127 unix_detach_fds(&scm
, skb
);
1129 /* Alas, it calls VFS */
1130 /* So fscking what? fput() had been SMP-safe since the last Summer */
1135 static void unix_attach_fds(struct scm_cookie
*scm
, struct sk_buff
*skb
)
1138 for (i
=scm
->fp
->count
-1; i
>=0; i
--)
1139 unix_inflight(scm
->fp
->fp
[i
]);
1140 UNIXCB(skb
).fp
= scm
->fp
;
1141 skb
->destructor
= unix_destruct_fds
;
1146 * Send AF_UNIX data.
1149 static int unix_dgram_sendmsg(struct socket
*sock
, struct msghdr
*msg
, int len
,
1150 struct scm_cookie
*scm
)
1152 struct sock
*sk
= sock
->sk
;
1153 struct sockaddr_un
*sunaddr
=msg
->msg_name
;
1154 unix_socket
*other
= NULL
;
1155 int namelen
= 0; /* fake GCC */
1158 struct sk_buff
*skb
;
1162 if (msg
->msg_flags
&MSG_OOB
)
1165 if (msg
->msg_namelen
) {
1166 err
= unix_mkname(sunaddr
, msg
->msg_namelen
, &hash
);
1173 other
= unix_peer_get(sk
);
1178 if (sock
->passcred
&& !sk
->protinfo
.af_unix
.addr
&&
1179 (err
= unix_autobind(sock
)) != 0)
1183 if ((unsigned)len
> sk
->sndbuf
- 32)
1186 skb
= sock_alloc_send_skb(sk
, len
, 0, msg
->msg_flags
&MSG_DONTWAIT
, &err
);
1190 memcpy(UNIXCREDS(skb
), &scm
->creds
, sizeof(struct ucred
));
1192 unix_attach_fds(scm
, skb
);
1194 skb
->h
.raw
= skb
->data
;
1195 err
= memcpy_fromiovec(skb_put(skb
,len
), msg
->msg_iov
, len
);
1199 timeo
= sock_sndtimeo(sk
, msg
->msg_flags
& MSG_DONTWAIT
);
1204 if (sunaddr
== NULL
)
1207 other
= unix_find_other(sunaddr
, namelen
, sk
->type
, hash
, &err
);
1212 unix_state_rlock(other
);
1214 if (!unix_may_send(sk
, other
))
1219 * Check with 1003.1g - what should
1222 unix_state_runlock(other
);
1226 unix_state_wlock(sk
);
1227 if (unix_peer(sk
) == other
) {
1229 unix_state_wunlock(sk
);
1231 unix_dgram_disconnected(sk
, other
);
1233 err
= -ECONNREFUSED
;
1235 unix_state_wunlock(sk
);
1245 if (other
->shutdown
&RCV_SHUTDOWN
)
1248 if (unix_peer(other
) != sk
&&
1249 skb_queue_len(&other
->receive_queue
) > other
->max_ack_backlog
) {
1255 timeo
= unix_wait_for_peer(other
, timeo
);
1257 err
= sock_intr_errno(timeo
);
1258 if (signal_pending(current
))
1264 skb_queue_tail(&other
->receive_queue
, skb
);
1265 unix_state_runlock(other
);
1266 other
->data_ready(other
, len
);
1271 unix_state_runlock(other
);
1281 static int unix_stream_sendmsg(struct socket
*sock
, struct msghdr
*msg
, int len
,
1282 struct scm_cookie
*scm
)
1284 struct sock
*sk
= sock
->sk
;
1285 unix_socket
*other
= NULL
;
1286 struct sockaddr_un
*sunaddr
=msg
->msg_name
;
1288 struct sk_buff
*skb
;
1293 if (msg
->msg_flags
&MSG_OOB
)
1296 if (msg
->msg_namelen
) {
1297 err
= (sk
->state
==TCP_ESTABLISHED
? -EISCONN
: -EOPNOTSUPP
);
1302 other
= unix_peer_get(sk
);
1307 if (sk
->shutdown
&SEND_SHUTDOWN
)
1313 * Optimisation for the fact that under 0.01% of X messages typically
1319 /* Keep two messages in the pipe so it schedules better */
1320 if (size
> sk
->sndbuf
/2 - 16)
1321 size
= sk
->sndbuf
/2 - 16;
1324 * Keep to page sized kmalloc()'s as various people
1325 * have suggested. Big mallocs stress the vm too
1329 if (size
> PAGE_SIZE
-16)
1330 limit
= PAGE_SIZE
-16; /* Fall back to a page if we can't grab a big buffer this instant */
1332 limit
= 0; /* Otherwise just grab and wait */
1338 skb
=sock_alloc_send_skb(sk
,size
,limit
,msg
->msg_flags
&MSG_DONTWAIT
, &err
);
1344 * If you pass two values to the sock_alloc_send_skb
1345 * it tries to grab the large buffer with GFP_BUFFER
1346 * (which can fail easily), and if it fails grab the
1347 * fallback size buffer which is under a page and will
1350 size
= min(size
, skb_tailroom(skb
));
1352 memcpy(UNIXCREDS(skb
), &scm
->creds
, sizeof(struct ucred
));
1354 unix_attach_fds(scm
, skb
);
1356 if ((err
= memcpy_fromiovec(skb_put(skb
,size
), msg
->msg_iov
, size
)) != 0) {
1361 unix_state_rlock(other
);
1363 if (other
->dead
|| (other
->shutdown
& RCV_SHUTDOWN
))
1366 skb_queue_tail(&other
->receive_queue
, skb
);
1367 unix_state_runlock(other
);
1368 other
->data_ready(other
, size
);
1375 unix_state_runlock(other
);
1378 if (sent
==0 && !(msg
->msg_flags
&MSG_NOSIGNAL
))
1379 send_sig(SIGPIPE
,current
,0);
1384 return sent
? : err
;
1387 static void unix_copy_addr(struct msghdr
*msg
, struct sock
*sk
)
1389 msg
->msg_namelen
= sizeof(short);
1390 if (sk
->protinfo
.af_unix
.addr
) {
1391 msg
->msg_namelen
=sk
->protinfo
.af_unix
.addr
->len
;
1392 memcpy(msg
->msg_name
,
1393 sk
->protinfo
.af_unix
.addr
->name
,
1394 sk
->protinfo
.af_unix
.addr
->len
);
1398 static int unix_dgram_recvmsg(struct socket
*sock
, struct msghdr
*msg
, int size
,
1399 int flags
, struct scm_cookie
*scm
)
1401 struct sock
*sk
= sock
->sk
;
1402 int noblock
= flags
& MSG_DONTWAIT
;
1403 struct sk_buff
*skb
;
1410 msg
->msg_namelen
= 0;
1412 skb
= skb_recv_datagram(sk
, flags
, noblock
, &err
);
1416 wake_up_interruptible(&sk
->protinfo
.af_unix
.peer_wait
);
1419 unix_copy_addr(msg
, skb
->sk
);
1421 if (size
> skb
->len
)
1423 else if (size
< skb
->len
)
1424 msg
->msg_flags
|= MSG_TRUNC
;
1426 err
= skb_copy_datagram_iovec(skb
, 0, msg
->msg_iov
, size
);
1430 scm
->creds
= *UNIXCREDS(skb
);
1432 if (!(flags
& MSG_PEEK
))
1435 unix_detach_fds(scm
, skb
);
1439 /* It is questionable: on PEEK we could:
1440 - do not return fds - good, but too simple 8)
1441 - return fds, and do not return them on read (old strategy,
1443 - clone fds (I choosed it for now, it is the most universal
1446 POSIX 1003.1g does not actually define this clearly
1447 at all. POSIX 1003.1g doesn't define a lot of things
1452 scm
->fp
= scm_fp_dup(UNIXCB(skb
).fp
);
1457 skb_free_datagram(sk
,skb
);
1463 * Sleep until data has arrive. But check for races..
1466 static long unix_stream_data_wait(unix_socket
* sk
, long timeo
)
1468 DECLARE_WAITQUEUE(wait
, current
);
1470 unix_state_rlock(sk
);
1472 add_wait_queue(sk
->sleep
, &wait
);
1475 set_current_state(TASK_INTERRUPTIBLE
);
1477 if (skb_queue_len(&sk
->receive_queue
) ||
1479 (sk
->shutdown
& RCV_SHUTDOWN
) ||
1480 signal_pending(current
) ||
1484 set_bit(SOCK_ASYNC_WAITDATA
, &sk
->socket
->flags
);
1485 unix_state_runlock(sk
);
1486 timeo
= schedule_timeout(timeo
);
1487 unix_state_rlock(sk
);
1488 clear_bit(SOCK_ASYNC_WAITDATA
, &sk
->socket
->flags
);
1491 __set_current_state(TASK_RUNNING
);
1492 remove_wait_queue(sk
->sleep
, &wait
);
1493 unix_state_runlock(sk
);
/*
 * unix_stream_recvmsg - receive path for stream sockets (installed as the
 * recvmsg member of unix_stream_ops).
 *
 * sock:  socket being read; sock->sk supplies the struct sock.
 * msg:   destination; msg_iov receives the data, msg_name (if set) is
 *        handed to unix_copy_addr(), msg_namelen is cleared first.
 * size:  number of bytes requested; each copy is capped by it.
 * flags: MSG_WAITALL, MSG_DONTWAIT and MSG_PEEK are examined here.
 * scm:   receives the sender credentials (scm->creds) and, on the path
 *        that calls scm_fp_dup(), a value for scm->fp.
 *
 * Returns copied if it is non-zero, otherwise err.
 *
 * What the visible lines do: the socket state is compared against
 * TCP_ESTABLISHED, then skbs are taken from sk->receive_queue while the
 * readsem semaphore is held.  The semaphore is released around
 * unix_stream_data_wait() and taken again afterwards.  Credentials of an
 * skb are compared with scm->creds, and an skb is pushed back on the
 * head of the queue from four places: after the credential compare,
 * after a non-zero memcpy_toiovec() result, after skb_pull(), and after
 * scm_fp_dup().
 *
 * NOTE(review): this listing looks incomplete.  copied, target, err,
 * timeo and chunk are used without a visible declaration, and no braces,
 * loop construct or exit statements appear, so the exact control flow
 * cannot be read from here -- confirm against the upstream file.
 */
1499 static int unix_stream_recvmsg(struct socket
*sock
, struct msghdr
*msg
, int size
,
1500 int flags
, struct scm_cookie
*scm
)
1502 struct sock
*sk
= sock
->sk
;
1503 struct sockaddr_un
*sunaddr
=msg
->msg_name
;
1505 int check_creds
= 0;
1511 if (sk
->state
!= TCP_ESTABLISHED
)
/*
 * target and timeo come from the sock_rcvlowat() / sock_rcvtimeo()
 * helpers; the MSG_WAITALL and MSG_DONTWAIT bits of flags are passed
 * through as their respective flag argument.
 */
1518 target
= sock_rcvlowat(sk
, flags
&MSG_WAITALL
, size
);
1519 timeo
= sock_rcvtimeo(sk
, flags
&MSG_DONTWAIT
);
1521 msg
->msg_namelen
= 0;
1523 /* Lock the socket to prevent queue disordering
1524 * while sleeps in memcpy_tomsg
1527 down(&sk
->protinfo
.af_unix
.readsem
);
1532 struct sk_buff
*skb
;
1534 skb
=skb_dequeue(&sk
->receive_queue
);
1537 if (copied
>= target
)
1541 * POSIX 1003.1g mandates this order.
1544 if ((err
= sock_error(sk
)) != 0)
1546 if (sk
->shutdown
& RCV_SHUTDOWN
)
1551 up(&sk
->protinfo
.af_unix
.readsem
);
1553 timeo
= unix_stream_data_wait(sk
, timeo
);
1555 if (signal_pending(current
)) {
1556 err
= sock_intr_errno(timeo
);
1559 down(&sk
->protinfo
.af_unix
.readsem
);
1564 /* Never glue messages from different writers */
1565 if (memcmp(UNIXCREDS(skb
), &scm
->creds
, sizeof(scm
->creds
)) != 0) {
1566 skb_queue_head(&sk
->receive_queue
, skb
);
1570 /* Copy credentials */
1571 scm
->creds
= *UNIXCREDS(skb
);
1575 /* Copy address just once */
1578 unix_copy_addr(msg
, skb
->sk
);
/* Take no more from this skb than the caller still asked for. */
1582 chunk
= min(skb
->len
, size
);
/*
 * A non-zero result from memcpy_toiovec() puts the skb back at the head
 * of the receive queue, so its data stays queued.
 */
1583 if (memcpy_toiovec(msg
->msg_iov
, skb
->data
, chunk
)) {
1584 skb_queue_head(&sk
->receive_queue
, skb
);
1592 /* Mark read part of skb as used */
1593 if (!(flags
& MSG_PEEK
))
1595 skb_pull(skb
, chunk
);
1598 unix_detach_fds(scm
, skb
);
1600 /* put the skb back if we didn't use it up.. */
1603 skb_queue_head(&sk
->receive_queue
, skb
);
1614 /* It is questionable, see note in unix_dgram_recvmsg.
1617 scm
->fp
= scm_fp_dup(UNIXCB(skb
).fp
);
1619 /* put message back and return */
1620 skb_queue_head(&sk
->receive_queue
, skb
);
1625 up(&sk
->protinfo
.af_unix
.readsem
);
/* GNU conditional with omitted middle operand: copied if non-zero, else err. */
1627 return copied
? : err
;
/*
 * unix_shutdown - shutdown handler shared by unix_stream_ops and
 * unix_dgram_ops.
 *
 * sock: socket being shut down.
 * mode: caller's "how" value.  It is incremented and masked with
 *       RCV_SHUTDOWN|SEND_SHUTDOWN, which presumably maps the 0/1/2
 *       user values onto the two direction bits -- TODO confirm.
 *
 * The resulting bits are OR-ed into sk->shutdown under
 * unix_state_wlock(sk), the peer is looked up with unix_peer(sk) under
 * the same lock, and sk->state_change(sk) is called after unlocking.
 *
 * When there is a peer and the socket is SOCK_STREAM, the mirrored bits
 * (our receive side closes the peer's send side and vice versa) are
 * OR-ed into other->shutdown under the peer's own lock.  The peer's
 * state_change callback is then called, and under
 * read_lock(&other->callback_lock) sk_wake_async() is called with
 * POLL_HUP when both directions are closed, or POLL_IN when only the
 * peer's receive side is closed.
 *
 * NOTE(review): the declarations of other and peer_mode, the braces and
 * the return statement are not visible in this listing.  No call that
 * takes a reference on the peer is visible between dropping sk's lock
 * and using other -- confirm against the upstream file.
 */
1630 static int unix_shutdown(struct socket
*sock
, int mode
)
1632 struct sock
*sk
= sock
->sk
;
1635 mode
= (mode
+1)&(RCV_SHUTDOWN
|SEND_SHUTDOWN
);
1638 unix_state_wlock(sk
);
1639 sk
->shutdown
|= mode
;
1640 other
=unix_peer(sk
);
1643 unix_state_wunlock(sk
);
1644 sk
->state_change(sk
);
1646 if (other
&& sk
->type
== SOCK_STREAM
) {
/* Mirror the directions: what we stop receiving, the peer stops sending. */
1649 if (mode
&RCV_SHUTDOWN
)
1650 peer_mode
|= SEND_SHUTDOWN
;
1651 if (mode
&SEND_SHUTDOWN
)
1652 peer_mode
|= RCV_SHUTDOWN
;
1653 unix_state_wlock(other
);
1654 other
->shutdown
|= peer_mode
;
1655 unix_state_wunlock(other
);
1656 other
->state_change(other
);
/* Async notification for the peer, done under its callback_lock. */
1657 read_lock(&other
->callback_lock
);
1658 if (peer_mode
== SHUTDOWN_MASK
)
1659 sk_wake_async(other
,1,POLL_HUP
);
1660 else if (peer_mode
& RCV_SHUTDOWN
)
1661 sk_wake_async(other
,1,POLL_IN
);
1662 read_unlock(&other
->callback_lock
);
/*
 * unix_ioctl - ioctl handler for AF_UNIX sockets.
 *
 * sock: socket the request is issued on.
 * cmd:  request code; forwarded to dev_ioctl() on one path.
 * arg:  user-space argument, used as an int pointer for put_user() and
 *       as a void pointer for dev_ioctl().
 *
 * Three paths are visible:
 *  - amount is read from sk->wmem_alloc and stored to user space;
 *  - after a check of sk->state against TCP_LISTEN, the head of
 *    sk->receive_queue is peeked under the queue spinlock and amount is
 *    stored to user space;
 *  - cmd and arg are passed to dev_ioctl().
 * In each case the result is assigned to err.
 *
 * NOTE(review): the switch statement, its case labels, the declarations
 * of amount and err, the statement that follows the skb_peek() test and
 * the return are not visible in this listing -- confirm against the
 * upstream file which request codes select which path.
 */
1670 static int unix_ioctl(struct socket
*sock
, unsigned int cmd
, unsigned long arg
)
1672 struct sock
*sk
= sock
->sk
;
1679 amount
= atomic_read(&sk
->wmem_alloc
);
1680 err
= put_user(amount
, (int *)arg
);
1684 struct sk_buff
*skb
;
1685 if (sk
->state
==TCP_LISTEN
) {
/* Only the first queued skb is examined, with the queue lock held. */
1690 spin_lock(&sk
->receive_queue
.lock
);
1691 if((skb
=skb_peek(&sk
->receive_queue
))!=NULL
)
1693 spin_unlock(&sk
->receive_queue
.lock
);
1694 err
= put_user(amount
, (int *)arg
);
1699 err
= dev_ioctl(cmd
, (void *)arg
);
/*
 * unix_poll - compute the poll event mask for an AF_UNIX socket.
 *
 * file: file the poll was issued on; passed to poll_wait().
 * sock: socket being polled.
 * wait: poll table; the caller is registered on sk->sleep through
 *       poll_wait().
 *
 * Visible tests, in order:
 *  - sk->shutdown equal to SHUTDOWN_MASK (both directions closed);
 *  - readable: the receive queue is not empty or RCV_SHUTDOWN is set,
 *    which adds POLLIN | POLLRDNORM;
 *  - a SOCK_STREAM socket in state TCP_CLOSE;
 *  - writable according to unix_writable(sk), which adds
 *    POLLOUT | POLLWRNORM | POLLWRBAND.
 *
 * NOTE(review): the declaration and initial value of mask, the bits set
 * by the first and third tests, and the return statement are not visible
 * in this listing -- confirm against the upstream file.
 */
1705 static unsigned int unix_poll(struct file
* file
, struct socket
*sock
, poll_table
*wait
)
1707 struct sock
*sk
= sock
->sk
;
1710 poll_wait(file
, sk
->sleep
, wait
);
1713 /* exceptional events? */
1716 if (sk
->shutdown
== SHUTDOWN_MASK
)
1720 if (!skb_queue_empty(&sk
->receive_queue
) || (sk
->shutdown
&RCV_SHUTDOWN
))
1721 mask
|= POLLIN
| POLLRDNORM
;
1723 /* Connection-based need to check for termination and startup */
1724 if (sk
->type
== SOCK_STREAM
&& sk
->state
==TCP_CLOSE
)
1728 * we set writable also when the other side has shut down the
1729 * connection. This prevents stuck sockets.
1731 if (unix_writable(sk
))
1732 mask
|= POLLOUT
| POLLWRNORM
| POLLWRBAND
;
1738 #ifdef CONFIG_PROC_FS
/*
 * unix_read_proc - read callback registered for the net/unix proc entry
 * (see the create_proc_read_entry() call in af_unix_init).
 *
 * buffer: output area the text is formatted into.
 * start:  set to buffer + (offset - begin) before returning.
 * offset: position in the virtual file the reader asked for.
 * length: number of bytes the reader asked for.
 * eof:    end-of-file indicator (its assignment is not visible here).
 * data:   not referenced in the visible lines.
 *
 * A header line is written first.  Then, with unix_table_lock held for
 * reading, forall_unix_sockets() visits each socket s.  With
 * unix_state_rlock(s) held, one formatted record is appended, and if the
 * socket has an address, a space followed by addr->len - sizeof(short)
 * bytes of its sun_path.  The walk stops once pos exceeds
 * offset + length.
 *
 * NOTE(review): the declarations of pos, begin, len, i and s, several
 * sprintf arguments, the outer condition of the nested conditional
 * expression, the statements that follow the UNIX_ABSTRACT() test and
 * the return are not visible in this listing.  No check of len against
 * the size of buffer is visible around sprintf()/memcpy() -- confirm
 * against the upstream file how overflow is prevented.
 */
1739 static int unix_read_proc(char *buffer
, char **start
, off_t offset
,
1740 int length
, int *eof
, void *data
)
1748 len
+= sprintf(buffer
,"Num RefCount Protocol Flags Type St "
1751 read_lock(&unix_table_lock
);
1752 forall_unix_sockets (i
,s
)
1754 unix_state_rlock(s
);
1756 len
+=sprintf(buffer
+len
,"%p: %08X %08X %08X %04X %02X %5ld",
1758 atomic_read(&s
->refcnt
),
1760 s
->state
== TCP_LISTEN
? __SO_ACCEPTCON
: 0,
1763 (s
->state
== TCP_ESTABLISHED
? SS_CONNECTED
: SS_UNCONNECTED
) :
1764 (s
->state
== TCP_ESTABLISHED
? SS_CONNECTING
: SS_DISCONNECTING
),
1767 if (s
->protinfo
.af_unix
.addr
)
1769 buffer
[len
++] = ' ';
/* The path bytes are copied raw; the length excludes a leading short. */
1770 memcpy(buffer
+len
, s
->protinfo
.af_unix
.addr
->name
->sun_path
,
1771 s
->protinfo
.af_unix
.addr
->len
-sizeof(short));
1772 if (!UNIX_ABSTRACT(s
))
1776 len
+= s
->protinfo
.af_unix
.addr
->len
- sizeof(short);
1778 unix_state_runlock(s
);
1788 if(pos
>offset
+length
)
1793 read_unlock(&unix_table_lock
);
/* Point the caller at the requested offset within what was generated. */
1794 *start
=buffer
+(offset
-begin
);
1795 len
-=(offset
-begin
);
/*
 * unix_stream_ops - proto_ops table for stream sockets.
 *
 * The entries use the GNU "field: value" initializer form.  The handlers
 * visible here are the unix_* routines for release, socketpair, accept,
 * getname, listen and shutdown, and the stream-specific routines for
 * connect, sendmsg and recvmsg.  setsockopt and getsockopt point at the
 * generic sock_no_* stubs.
 *
 * NOTE(review): some members of the initializer and its closing brace
 * are not visible in this listing -- confirm against the upstream file.
 */
1804 struct proto_ops unix_stream_ops
= {
1807 release
: unix_release
,
1809 connect
: unix_stream_connect
,
1810 socketpair
: unix_socketpair
,
1811 accept
: unix_accept
,
1812 getname
: unix_getname
,
1815 listen
: unix_listen
,
1816 shutdown
: unix_shutdown
,
1817 setsockopt
: sock_no_setsockopt
,
1818 getsockopt
: sock_no_getsockopt
,
1819 sendmsg
: unix_stream_sendmsg
,
1820 recvmsg
: unix_stream_recvmsg
,
/*
 * unix_dgram_ops - proto_ops table for datagram sockets.
 *
 * Shares release, socketpair, getname and shutdown with the stream
 * table.  It uses the datagram-specific routines for connect, sendmsg
 * and recvmsg, and datagram_poll for poll.  accept, listen, setsockopt
 * and getsockopt point at the generic sock_no_* stubs.
 *
 * NOTE(review): some members of the initializer and its closing brace
 * are not visible in this listing -- confirm against the upstream file.
 */
1824 struct proto_ops unix_dgram_ops
= {
1827 release
: unix_release
,
1829 connect
: unix_dgram_connect
,
1830 socketpair
: unix_socketpair
,
1831 accept
: sock_no_accept
,
1832 getname
: unix_getname
,
1833 poll
: datagram_poll
,
1835 listen
: sock_no_listen
,
1836 shutdown
: unix_shutdown
,
1837 setsockopt
: sock_no_setsockopt
,
1838 getsockopt
: sock_no_getsockopt
,
1839 sendmsg
: unix_dgram_sendmsg
,
1840 recvmsg
: unix_dgram_recvmsg
,
/*
 * unix_family_ops - protocol family descriptor; its address is what
 * af_unix_init() passes to sock_register().
 *
 * NOTE(review): the initializer body is not visible in this listing.
 */
1844 struct net_proto_family unix_family_ops
= {
/*
 * Sysctl hooks defined outside this file.  They are declared only when
 * CONFIG_SYSCTL is set and are called from af_unix_init() and
 * af_unix_exit() under the same condition.
 */
1849 #ifdef CONFIG_SYSCTL
1850 extern void unix_sysctl_register(void);
1851 extern void unix_sysctl_unregister(void);
/*
 * af_unix_init - module/boot-time initialisation (registered through
 * module_init()).
 *
 * Prints a banner, then checks that struct unix_skb_parms fits into the
 * cb area of an sk_buff; a KERN_CRIT message is printed when it does not.
 * dummy_skb exists only as an operand of sizeof in the visible lines.
 * After that the protocol family is registered with sock_register().
 * With CONFIG_PROC_FS the net/unix proc entry is created with
 * unix_read_proc as its read callback, and with CONFIG_SYSCTL
 * unix_sysctl_register() is called.
 *
 * NOTE(review): the braces, the statement following the KERN_CRIT
 * message, the #endif lines and the return are not visible in this
 * listing.  The results of sock_register() and create_proc_read_entry()
 * are not used in the visible lines.
 */
1854 static int __init
af_unix_init(void)
1856 struct sk_buff
*dummy_skb
;
1858 printk(KERN_INFO
"NET4: Unix domain sockets 1.0/SMP for Linux NET4.0.\n");
1859 if (sizeof(struct unix_skb_parms
) > sizeof(dummy_skb
->cb
))
1861 printk(KERN_CRIT
"unix_proto_init: panic\n");
1864 sock_register(&unix_family_ops
);
1865 #ifdef CONFIG_PROC_FS
1866 create_proc_read_entry("net/unix", 0, 0, unix_read_proc
, NULL
);
1869 #ifdef CONFIG_SYSCTL
1870 unix_sysctl_register();
/*
 * af_unix_exit - module unload hook (registered through module_exit()).
 *
 * Reverses what af_unix_init() set up: unregisters the PF_UNIX family,
 * then, depending on configuration, calls unix_sysctl_unregister() and
 * removes the net/unix proc entry.
 *
 * NOTE(review): the braces and #endif lines are not visible in this
 * listing.
 */
1876 static void __exit
af_unix_exit(void)
1878 sock_unregister(PF_UNIX
);
1879 #ifdef CONFIG_SYSCTL
1880 unix_sysctl_unregister();
1882 #ifdef CONFIG_PROC_FS
1883 remove_proc_entry("net/unix", 0);
/* Hook the init and exit routines above into the module machinery. */
1887 module_init(af_unix_init
);
1888 module_exit(af_unix_exit
);
1892 * compile-command: "gcc -g -D__KERNEL__ -Wall -O6 -I/usr/src/linux/include -c af_unix.c"