net/unix/af_unix.c

   1 /*
   2  * NET4:        Implementation of BSD Unix domain sockets.
   3  *
   4  * Authors:     Alan Cox, <alan@lxorguk.ukuu.org.uk>
   5  *
   6  *              This program is free software; you can redistribute it and/or
   7  *              modify it under the terms of the GNU General Public License
   8  *              as published by the Free Software Foundation; either version
   9  *              2 of the License, or (at your option) any later version.
  10  *
  11  * Fixes:
  12  *              Linus Torvalds  :       Assorted bug cures.
  13  *              Niibe Yutaka    :       async I/O support.
  14  *              Carsten Paeth   :       PF_UNIX check, address fixes.
  15  *              Alan Cox        :       Limit size of allocated blocks.
  16  *              Alan Cox        :       Fixed the stupid socketpair bug.
  17  *              Alan Cox        :       BSD compatibility fine tuning.
  18  *              Alan Cox        :       Fixed a bug in connect when interrupted.
  19  *              Alan Cox        :       Sorted out a proper draft version of
  20  *                                      file descriptor passing hacked up from
  21  *                                      Mike Shaver's work.
  22  *              Marty Leisner   :       Fixes to fd passing
  23  *              Nick Nevin      :       recvmsg bugfix.
  24  *              Alan Cox        :       Started proper garbage collector
  25  *              Heiko EiBfeldt  :       Missing verify_area check
  26  *              Alan Cox        :       Started POSIXisms
  27  *              Andreas Schwab  :       Replace inode by dentry for proper
  28  *                                      reference counting
  29  *              Kirk Petersen   :       Made this a module
  30  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
  31  *                                      Lots of bug fixes.
  32  *           Alexey Kuznetosv   :       Repaired (I hope) bugs introduces
  33  *                                      by above two patches.
  34  *           Andrea Arcangeli   :       If possible we block in connect(2)
  35  *                                      if the max backlog of the listen socket
  36  *                                      is been reached. This won't break
  37  *                                      old apps and it will avoid huge amount
  38  *                                      of socks hashed (this for unix_gc()
  39  *                                      performances reasons).
  40  *                                      Security fix that limits the max
  41  *                                      number of socks to 2*max_files and
  42  *                                      the number of skb queueable in the
  43  *                                      dgram receiver.
  44  *              Artur Skawina   :       Hash function optimizations
  45  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
  46  *            Malcolm Beattie   :       Set peercred for socketpair
  47  *           Michal Ostrowski   :       Module initialization cleanup.
  48  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
  49  *                                      the core infrastructure is doing that
  50  *                                      for all net proto families now (2.5.69+)
  51  *
  52  *
  53  * Known differences from reference BSD that was tested:
  54  *
  55  *      [TO FIX]
  56  *      ECONNREFUSED is not returned from one end of a connected() socket to the
  57  *              other the moment one end closes.
  58  *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
  59  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
  60  *      [NOT TO FIX]
  61  *      accept() returns a path name even if the connecting socket has closed
  62  *              in the meantime (BSD loses the path and gives up).
  63  *      accept() returns 0 length path for an unbound connector. BSD returns 16
  64  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
  65  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
  66  *      BSD af_unix apparently has connect forgetting to block properly.
  67  *              (need to check this with the POSIX spec in detail)
  68  *
  69  * Differences from 2.0.0-11-... (ANK)
  70  *      Bug fixes and improvements.
  71  *              - client shutdown killed server socket.
  72  *              - removed all useless cli/sti pairs.
  73  *
  74  *      Semantic changes/extensions.
  75  *              - generic control message passing.
  76  *              - SCM_CREDENTIALS control message.
  77  *              - "Abstract" (not FS based) socket bindings.
  78  *                Abstract names are sequences of bytes (not zero terminated)
  79  *                started by 0, so that this name space does not intersect
  80  *                with BSD names.
  81  */
  82
  83 #include <linux/module.h>
  84 #include <linux/kernel.h>
  85 #include <linux/signal.h>
  86 #include <linux/sched.h>
  87 #include <linux/errno.h>
  88 #include <linux/string.h>
  89 #include <linux/stat.h>
  90 #include <linux/dcache.h>
  91 #include <linux/namei.h>
  92 #include <linux/socket.h>
  93 #include <linux/un.h>
  94 #include <linux/fcntl.h>
  95 #include <linux/termios.h>
  96 #include <linux/sockios.h>
  97 #include <linux/net.h>
  98 #include <linux/in.h>
  99 #include <linux/fs.h>
 100 #include <linux/slab.h>
 101 #include <asm/uaccess.h>
 102 #include <linux/skbuff.h>
 103 #include <linux/netdevice.h>
 104 #include <net/net_namespace.h>
 105 #include <net/sock.h>
 106 #include <net/tcp_states.h>
 107 #include <net/af_unix.h>
 108 #include <linux/proc_fs.h>
 109 #include <linux/seq_file.h>
 110 #include <net/scm.h>
 111 #include <linux/init.h>
 112 #include <linux/poll.h>
 113 #include <linux/rtnetlink.h>
 114 #include <linux/mount.h>
 115 #include <net/checksum.h>
 116 #include <linux/security.h>
 117 #include <linux/freezer.h>
 118
 119 struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
 120 EXPORT_SYMBOL_GPL(unix_socket_table);
 121 DEFINE_SPINLOCK(unix_table_lock);
 122 EXPORT_SYMBOL_GPL(unix_table_lock);
 123 static atomic_long_t unix_nr_socks;
 124
 125
 126 static struct hlist_head *unix_sockets_unbound(void *addr)
 127 {
 128         unsigned long hash = (unsigned long)addr;
 129
 130         hash ^= hash >> 16;
 131         hash ^= hash >> 8;
 132         hash %= UNIX_HASH_SIZE;
 133         return &unix_socket_table[UNIX_HASH_SIZE + hash];
 134 }
 135
 136 #define UNIX_ABSTRACT(sk)       (unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
 137
 138 #ifdef CONFIG_SECURITY_NETWORK
 139 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 140 {
 141         memcpy(UNIXSID(skb), &scm->secid, sizeof(u32));
 142 }
 143
 144 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 145 {
 146         scm->secid = *UNIXSID(skb);
 147 }
 148 #else
 149 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 150 { }
 151
 152 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 153 { }
 154 #endif /* CONFIG_SECURITY_NETWORK */
 155
 156 /*
 157  *  SMP locking strategy:
 158  *    hash table is protected with spinlock unix_table_lock
 159  *    each socket state is protected by separate spin lock.
 160  */
 161
 162 static inline unsigned int unix_hash_fold(__wsum n)
 163 {
 164         unsigned int hash = (__force unsigned int)n;
 165
 166         hash ^= hash>>16;
 167         hash ^= hash>>8;
 168         return hash&(UNIX_HASH_SIZE-1);
 169 }
 170
 171 #define unix_peer(sk) (unix_sk(sk)->peer)
 172
 173 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
 174 {
 175         return unix_peer(osk) == sk;
 176 }
 177
 178 static inline int unix_may_send(struct sock *sk, struct sock *osk)
 179 {
 180         return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
 181 }
 182
 183 static inline int unix_recvq_full(struct sock const *sk)
 184 {
 185         return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
 186 }
 187
 188 struct sock *unix_peer_get(struct sock *s)
 189 {
 190         struct sock *peer;
 191
 192         unix_state_lock(s);
 193         peer = unix_peer(s);
 194         if (peer)
 195                 sock_hold(peer);
 196         unix_state_unlock(s);
 197         return peer;
 198 }
 199 EXPORT_SYMBOL_GPL(unix_peer_get);
 200
 201 static inline void unix_release_addr(struct unix_address *addr)
 202 {
 203         if (atomic_dec_and_test(&addr->refcnt))
 204                 kfree(addr);
 205 }
 206
 207 /*
 208  *      Check unix socket name:
 209  *              - should be not zero length.
 210  *              - if started by not zero, should be NULL terminated (FS object)
 211  *              - if started by zero, it is abstract name.
 212  */
 213
 214 static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
 215 {
 216         if (len <= sizeof(short) || len > sizeof(*sunaddr))
 217                 return -EINVAL;
 218         if (!sunaddr || sunaddr->sun_family != AF_UNIX)
 219                 return -EINVAL;
 220         if (sunaddr->sun_path[0]) {
 221                 /*
 222                  * This may look like an off by one error but it is a bit more
 223                  * subtle. 108 is the longest valid AF_UNIX path for a binding.
 224                  * sun_path[108] doesn't as such exist.  However in kernel space
 225                  * we are guaranteed that it is a valid memory location in our
 226                  * kernel address buffer.
 227                  */
 228                 ((char *)sunaddr)[len] = 0;
 229                 len = strlen(sunaddr->sun_path)+1+sizeof(short);
 230                 return len;
 231         }
 232
 233         *hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
 234         return len;
 235 }
 236
 237 static void __unix_remove_socket(struct sock *sk)
 238 {
 239         sk_del_node_init(sk);
 240 }
 241
 242 static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
 243 {
 244         WARN_ON(!sk_unhashed(sk));
 245         sk_add_node(sk, list);
 246 }
 247
 248 static inline void unix_remove_socket(struct sock *sk)
 249 {
 250         spin_lock(&unix_table_lock);
 251         __unix_remove_socket(sk);
 252         spin_unlock(&unix_table_lock);
 253 }
 254
 255 static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
 256 {
 257         spin_lock(&unix_table_lock);
 258         __unix_insert_socket(list, sk);
 259         spin_unlock(&unix_table_lock);
 260 }
 261
 262 static struct sock *__unix_find_socket_byname(struct net *net,
 263                                               struct sockaddr_un *sunname,
 264                                               int len, int type, unsigned int hash)
 265 {
 266         struct sock *s;
 267
 268         sk_for_each(s, &unix_socket_table[hash ^ type]) {
 269                 struct unix_sock *u = unix_sk(s);
 270
 271                 if (!net_eq(sock_net(s), net))
 272                         continue;
 273
 274                 if (u->addr->len == len &&
 275                     !memcmp(u->addr->name, sunname, len))
 276                         goto found;
 277         }
 278         s = NULL;
 279 found:
 280         return s;
 281 }
 282
 283 static inline struct sock *unix_find_socket_byname(struct net *net,
 284                                                    struct sockaddr_un *sunname,
 285                                                    int len, int type,
 286                                                    unsigned int hash)
 287 {
 288         struct sock *s;
 289
 290         spin_lock(&unix_table_lock);
 291         s = __unix_find_socket_byname(net, sunname, len, type, hash);
 292         if (s)
 293                 sock_hold(s);
 294         spin_unlock(&unix_table_lock);
 295         return s;
 296 }
 297
 298 static struct sock *unix_find_socket_byinode(struct inode *i)
 299 {
 300         struct sock *s;
 301
 302         spin_lock(&unix_table_lock);
 303         sk_for_each(s,
 304                     &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
 305                 struct dentry *dentry = unix_sk(s)->path.dentry;
 306
 307                 if (dentry && dentry->d_inode == i) {
 308                         sock_hold(s);
 309                         goto found;
 310                 }
 311         }
 312         s = NULL;
 313 found:
 314         spin_unlock(&unix_table_lock);
 315         return s;
 316 }
 317
 318 static inline int unix_writable(struct sock *sk)
 319 {
 320         return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
 321 }
 322
 323 static void unix_write_space(struct sock *sk)
 324 {
 325         struct socket_wq *wq;
 326
 327         rcu_read_lock();
 328         if (unix_writable(sk)) {
 329                 wq = rcu_dereference(sk->sk_wq);
 330                 if (wq_has_sleeper(wq))
 331                         wake_up_interruptible_sync_poll(&wq->wait,
 332                                 POLLOUT | POLLWRNORM | POLLWRBAND);
 333                 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 334         }
 335         rcu_read_unlock();
 336 }
 337
 338 /* When dgram socket disconnects (or changes its peer), we clear its receive
 339  * queue of packets arrived from previous peer. First, it allows to do
 340  * flow control based only on wmem_alloc; second, sk connected to peer
 341  * may receive messages only from that peer. */
 342 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
 343 {
 344         if (!skb_queue_empty(&sk->sk_receive_queue)) {
 345                 skb_queue_purge(&sk->sk_receive_queue);
 346                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
 347
 348                 /* If one link of bidirectional dgram pipe is disconnected,
 349                  * we signal error. Messages are lost. Do not make this,
 350                  * when peer was not connected to us.
 351                  */
 352                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
 353                         other->sk_err = ECONNRESET;
 354                         other->sk_error_report(other);
 355                 }
 356         }
 357 }
 358
 359 static void unix_sock_destructor(struct sock *sk)
 360 {
 361         struct unix_sock *u = unix_sk(sk);
 362
 363         skb_queue_purge(&sk->sk_receive_queue);
 364
 365         WARN_ON(atomic_read(&sk->sk_wmem_alloc));
 366         WARN_ON(!sk_unhashed(sk));
 367         WARN_ON(sk->sk_socket);
 368         if (!sock_flag(sk, SOCK_DEAD)) {
 369                 printk(KERN_INFO "Attempt to release alive unix socket: %p\n", sk);
 370                 return;
 371         }
 372
 373         if (u->addr)
 374                 unix_release_addr(u->addr);
 375
 376         atomic_long_dec(&unix_nr_socks);
 377         local_bh_disable();
 378         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 379         local_bh_enable();
 380 #ifdef UNIX_REFCNT_DEBUG
 381         printk(KERN_DEBUG "UNIX %p is destroyed, %ld are still alive.\n", sk,
 382                 atomic_long_read(&unix_nr_socks));
 383 #endif
 384 }
 385
 386 static void unix_release_sock(struct sock *sk, int embrion)
 387 {
 388         struct unix_sock *u = unix_sk(sk);
 389         struct path path;
 390         struct sock *skpair;
 391         struct sk_buff *skb;
 392         int state;
 393
 394         unix_remove_socket(sk);
 395
 396         /* Clear state */
 397         unix_state_lock(sk);
 398         sock_orphan(sk);
 399         sk->sk_shutdown = SHUTDOWN_MASK;
 400         path         = u->path;
 401         u->path.dentry = NULL;
 402         u->path.mnt = NULL;
 403         state = sk->sk_state;
 404         sk->sk_state = TCP_CLOSE;
 405         unix_state_unlock(sk);
 406
 407         wake_up_interruptible_all(&u->peer_wait);
 408
 409         skpair = unix_peer(sk);
 410
 411         if (skpair != NULL) {
 412                 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
 413                         unix_state_lock(skpair);
 414                         /* No more writes */
 415                         skpair->sk_shutdown = SHUTDOWN_MASK;
 416                         if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
 417                                 skpair->sk_err = ECONNRESET;
 418                         unix_state_unlock(skpair);
 419                         skpair->sk_state_change(skpair);
 420                         sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
 421                 }
 422                 sock_put(skpair); /* It may now die */
 423                 unix_peer(sk) = NULL;
 424         }
 425
 426         /* Try to flush out this socket. Throw out buffers at least */
 427
 428         while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
 429                 if (state == TCP_LISTEN)
 430                         unix_release_sock(skb->sk, 1);
 431                 /* passed fds are erased in the kfree_skb hook        */
 432                 kfree_skb(skb);
 433         }
 434
 435         if (path.dentry)
 436                 path_put(&path);
 437
 438         sock_put(sk);
 439
 440         /* ---- Socket is dead now and most probably destroyed ---- */
 441
 442         /*
 443          * Fixme: BSD difference: In BSD all sockets connected to us get
 444          *        ECONNRESET and we die on the spot. In Linux we behave
 445          *        like files and pipes do and wait for the last
 446          *        dereference.
 447          *
 448          * Can't we simply set sock->err?
 449          *
 450          *        What the above comment does talk about? --ANK(980817)
 451          */
 452
 453         if (unix_tot_inflight)
 454                 unix_gc();              /* Garbage collect fds */
 455 }
 456
 457 static void init_peercred(struct sock *sk)
 458 {
 459         put_pid(sk->sk_peer_pid);
 460         if (sk->sk_peer_cred)
 461                 put_cred(sk->sk_peer_cred);
 462         sk->sk_peer_pid  = get_pid(task_tgid(current));
 463         sk->sk_peer_cred = get_current_cred();
 464 }
 465
 466 static void copy_peercred(struct sock *sk, struct sock *peersk)
 467 {
 468         put_pid(sk->sk_peer_pid);
 469         if (sk->sk_peer_cred)
 470                 put_cred(sk->sk_peer_cred);
 471         sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
 472         sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
 473 }
 474
 475 static int unix_listen(struct socket *sock, int backlog)
 476 {
 477         int err;
 478         struct sock *sk = sock->sk;
 479         struct unix_sock *u = unix_sk(sk);
 480         struct pid *old_pid = NULL;
 481
 482         err = -EOPNOTSUPP;
 483         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
 484                 goto out;       /* Only stream/seqpacket sockets accept */
 485         err = -EINVAL;
 486         if (!u->addr)
 487                 goto out;       /* No listens on an unbound socket */
 488         unix_state_lock(sk);
 489         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
 490                 goto out_unlock;
 491         if (backlog > sk->sk_max_ack_backlog)
 492                 wake_up_interruptible_all(&u->peer_wait);
 493         sk->sk_max_ack_backlog  = backlog;
 494         sk->sk_state            = TCP_LISTEN;
 495         /* set credentials so connect can copy them */
 496         init_peercred(sk);
 497         err = 0;
 498
 499 out_unlock:
 500         unix_state_unlock(sk);
 501         put_pid(old_pid);
 502 out:
 503         return err;
 504 }
 505
 506 static int unix_release(struct socket *);
 507 static int unix_bind(struct socket *, struct sockaddr *, int);
 508 static int unix_stream_connect(struct socket *, struct sockaddr *,
 509                                int addr_len, int flags);
 510 static int unix_socketpair(struct socket *, struct socket *);
 511 static int unix_accept(struct socket *, struct socket *, int);
 512 static int unix_getname(struct socket *, struct sockaddr *, int *, int);
 513 static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
 514 static unsigned int unix_dgram_poll(struct file *, struct socket *,
 515                                     poll_table *);
 516 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
 517 static int unix_shutdown(struct socket *, int);
 518 static int unix_stream_sendmsg(struct kiocb *, struct socket *,
 519                                struct msghdr *, size_t);
 520 static int unix_stream_recvmsg(struct kiocb *, struct socket *,
 521                                struct msghdr *, size_t, int);
 522 static int unix_dgram_sendmsg(struct kiocb *, struct socket *,
 523                               struct msghdr *, size_t);
 524 static int unix_dgram_recvmsg(struct kiocb *, struct socket *,
 525                               struct msghdr *, size_t, int);
 526 static int unix_dgram_connect(struct socket *, struct sockaddr *,
 527                               int, int);
 528 static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
 529                                   struct msghdr *, size_t);
 530 static int unix_seqpacket_recvmsg(struct kiocb *, struct socket *,
 531                                   struct msghdr *, size_t, int);
 532
 533 static void unix_set_peek_off(struct sock *sk, int val)
 534 {
 535         struct unix_sock *u = unix_sk(sk);
 536
 537         mutex_lock(&u->readlock);
 538         sk->sk_peek_off = val;
 539         mutex_unlock(&u->readlock);
 540 }
 541
 542
 543 static const struct proto_ops unix_stream_ops = {
 544         .family =       PF_UNIX,
 545         .owner =        THIS_MODULE,
 546         .release =      unix_release,
 547         .bind =         unix_bind,
 548         .connect =      unix_stream_connect,
 549         .socketpair =   unix_socketpair,
 550         .accept =       unix_accept,
 551         .getname =      unix_getname,
 552         .poll =         unix_poll,
 553         .ioctl =        unix_ioctl,
 554         .listen =       unix_listen,
 555         .shutdown =     unix_shutdown,
 556         .setsockopt =   sock_no_setsockopt,
 557         .getsockopt =   sock_no_getsockopt,
 558         .sendmsg =      unix_stream_sendmsg,
 559         .recvmsg =      unix_stream_recvmsg,
 560         .mmap =         sock_no_mmap,
 561         .sendpage =     sock_no_sendpage,
 562         .set_peek_off = unix_set_peek_off,
 563 };
 564
 565 static const struct proto_ops unix_dgram_ops = {
 566         .family =       PF_UNIX,
 567         .owner =        THIS_MODULE,
 568         .release =      unix_release,
 569         .bind =         unix_bind,
 570         .connect =      unix_dgram_connect,
 571         .socketpair =   unix_socketpair,
 572         .accept =       sock_no_accept,
 573         .getname =      unix_getname,
 574         .poll =         unix_dgram_poll,
 575         .ioctl =        unix_ioctl,
 576         .listen =       sock_no_listen,
 577         .shutdown =     unix_shutdown,
 578         .setsockopt =   sock_no_setsockopt,
 579         .getsockopt =   sock_no_getsockopt,
 580         .sendmsg =      unix_dgram_sendmsg,
 581         .recvmsg =      unix_dgram_recvmsg,
 582         .mmap =         sock_no_mmap,
 583         .sendpage =     sock_no_sendpage,
 584         .set_peek_off = unix_set_peek_off,
 585 };
 586
 587 static const struct proto_ops unix_seqpacket_ops = {
 588         .family =       PF_UNIX,
 589         .owner =        THIS_MODULE,
 590         .release =      unix_release,
 591         .bind =         unix_bind,
 592         .connect =      unix_stream_connect,
 593         .socketpair =   unix_socketpair,
 594         .accept =       unix_accept,
 595         .getname =      unix_getname,
 596         .poll =         unix_dgram_poll,
 597         .ioctl =        unix_ioctl,
 598         .listen =       unix_listen,
 599         .shutdown =     unix_shutdown,
 600         .setsockopt =   sock_no_setsockopt,
 601         .getsockopt =   sock_no_getsockopt,
 602         .sendmsg =      unix_seqpacket_sendmsg,
 603         .recvmsg =      unix_seqpacket_recvmsg,
 604         .mmap =         sock_no_mmap,
 605         .sendpage =     sock_no_sendpage,
 606         .set_peek_off = unix_set_peek_off,
 607 };
 608
 609 static struct proto unix_proto = {
 610         .name                   = "UNIX",
 611         .owner                  = THIS_MODULE,
 612         .obj_size               = sizeof(struct unix_sock),
 613 };
 614
 615 /*
 616  * AF_UNIX sockets do not interact with hardware, hence they
 617  * dont trigger interrupts - so it's safe for them to have
 618  * bh-unsafe locking for their sk_receive_queue.lock. Split off
 619  * this special lock-class by reinitializing the spinlock key:
 620  */
 621 static struct lock_class_key af_unix_sk_receive_queue_lock_key;
 622
 623 static struct sock *unix_create1(struct net *net, struct socket *sock)
 624 {
 625         struct sock *sk = NULL;
 626         struct unix_sock *u;
 627
 628         atomic_long_inc(&unix_nr_socks);
 629         if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
 630                 goto out;
 631
 632         sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto);
 633         if (!sk)
 634                 goto out;
 635
 636         sock_init_data(sock, sk);
 637         lockdep_set_class(&sk->sk_receive_queue.lock,
 638                                 &af_unix_sk_receive_queue_lock_key);
 639
 640         sk->sk_write_space      = unix_write_space;
 641         sk->sk_max_ack_backlog  = net->unx.sysctl_max_dgram_qlen;
 642         sk->sk_destruct         = unix_sock_destructor;
 643         u         = unix_sk(sk);
 644         u->path.dentry = NULL;
 645         u->path.mnt = NULL;
 646         spin_lock_init(&u->lock);
 647         atomic_long_set(&u->inflight, 0);
 648         INIT_LIST_HEAD(&u->link);
 649         mutex_init(&u->readlock); /* single task reading lock */
 650         init_waitqueue_head(&u->peer_wait);
 651         unix_insert_socket(unix_sockets_unbound(sk), sk);
 652 out:
 653         if (sk == NULL)
 654                 atomic_long_dec(&unix_nr_socks);
 655         else {
 656                 local_bh_disable();
 657                 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 658                 local_bh_enable();
 659         }
 660         return sk;
 661 }
 662
 663 static int unix_create(struct net *net, struct socket *sock, int protocol,
 664                        int kern)
 665 {
 666         if (protocol && protocol != PF_UNIX)
 667                 return -EPROTONOSUPPORT;
 668
 669         sock->state = SS_UNCONNECTED;
 670
 671         switch (sock->type) {
 672         case SOCK_STREAM:
 673                 sock->ops = &unix_stream_ops;
 674                 break;
 675                 /*
 676                  *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
 677                  *      nothing uses it.
 678                  */
 679         case SOCK_RAW:
 680                 sock->type = SOCK_DGRAM;
 681         case SOCK_DGRAM:
 682                 sock->ops = &unix_dgram_ops;
 683                 break;
 684         case SOCK_SEQPACKET:
 685                 sock->ops = &unix_seqpacket_ops;
 686                 break;
 687         default:
 688                 return -ESOCKTNOSUPPORT;
 689         }
 690
 691         return unix_create1(net, sock) ? 0 : -ENOMEM;
 692 }
 693
 694 static int unix_release(struct socket *sock)
 695 {
 696         struct sock *sk = sock->sk;
 697
 698         if (!sk)
 699                 return 0;
 700
 701         unix_release_sock(sk, 0);
 702         sock->sk = NULL;
 703
 704         return 0;
 705 }
 706
 707 static int unix_autobind(struct socket *sock)
 708 {
 709         struct sock *sk = sock->sk;
 710         struct net *net = sock_net(sk);
 711         struct unix_sock *u = unix_sk(sk);
 712         static u32 ordernum = 1;
 713         struct unix_address *addr;
 714         int err;
 715         unsigned int retries = 0;
 716
 717         mutex_lock(&u->readlock);
 718
 719         err = 0;
 720         if (u->addr)
 721                 goto out;
 722
 723         err = -ENOMEM;
 724         addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
 725         if (!addr)
 726                 goto out;
 727
 728         addr->name->sun_family = AF_UNIX;
 729         atomic_set(&addr->refcnt, 1);
 730
 731 retry:
 732         addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
 733         addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
 734
 735         spin_lock(&unix_table_lock);
 736         ordernum = (ordernum+1)&0xFFFFF;
 737
 738         if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
 739                                       addr->hash)) {
 740                 spin_unlock(&unix_table_lock);
 741                 /*
 742                  * __unix_find_socket_byname() may take long time if many names
 743                  * are already in use.
 744                  */
 745                 cond_resched();
 746                 /* Give up if all names seems to be in use. */
 747                 if (retries++ == 0xFFFFF) {
 748                         err = -ENOSPC;
 749                         kfree(addr);
 750                         goto out;
 751                 }
 752                 goto retry;
 753         }
 754         addr->hash ^= sk->sk_type;
 755
 756         __unix_remove_socket(sk);
 757         u->addr = addr;
 758         __unix_insert_socket(&unix_socket_table[addr->hash], sk);
 759         spin_unlock(&unix_table_lock);
 760         err = 0;
 761
 762 out:    mutex_unlock(&u->readlock);
 763         return err;
 764 }
 765
 766 static struct sock *unix_find_other(struct net *net,
 767                                     struct sockaddr_un *sunname, int len,
 768                                     int type, unsigned int hash, int *error)
 769 {
 770         struct sock *u;
 771         struct path path;
 772         int err = 0;
 773
 774         if (sunname->sun_path[0]) {
 775                 struct inode *inode;
 776                 err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
 777                 if (err)
 778                         goto fail;
 779                 inode = path.dentry->d_inode;
 780                 err = inode_permission(inode, MAY_WRITE);
 781                 if (err)
 782                         goto put_fail;
 783
 784                 err = -ECONNREFUSED;
 785                 if (!S_ISSOCK(inode->i_mode))
 786                         goto put_fail;
 787                 u = unix_find_socket_byinode(inode);
 788                 if (!u)
 789                         goto put_fail;
 790
 791                 if (u->sk_type == type)
 792                         touch_atime(&path);
 793
 794                 path_put(&path);
 795
 796                 err = -EPROTOTYPE;
 797                 if (u->sk_type != type) {
 798                         sock_put(u);
 799                         goto fail;
 800                 }
 801         } else {
 802                 err = -ECONNREFUSED;
 803                 u = unix_find_socket_byname(net, sunname, len, type, hash);
 804                 if (u) {
 805                         struct dentry *dentry;
 806                         dentry = unix_sk(u)->path.dentry;
 807                         if (dentry)
 808                                 touch_atime(&unix_sk(u)->path);
 809                 } else
 810                         goto fail;
 811         }
 812         return u;
 813
 814 put_fail:
 815         path_put(&path);
 816 fail:
 817         *error = err;
 818         return NULL;
 819 }
 820
 821 static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
 822 {
 823         struct dentry *dentry;
 824         struct path path;
 825         int err = 0;
 826         /*
 827          * Get the parent directory, calculate the hash for last
 828          * component.
 829          */
 830         dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
 831         err = PTR_ERR(dentry);
 832         if (IS_ERR(dentry))
 833                 return err;
 834
 835         /*
 836          * All right, let's create it.
 837          */
 838         err = security_path_mknod(&path, dentry, mode, 0);
 839         if (!err) {
 840                 err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0);
 841                 if (!err) {
 842                         res->mnt = mntget(path.mnt);
 843                         res->dentry = dget(dentry);
 844                 }
 845         }
 846         done_path_create(&path, dentry);
 847         return err;
 848 }
 849
 850 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 851 {
 852         struct sock *sk = sock->sk;
 853         struct net *net = sock_net(sk);
 854         struct unix_sock *u = unix_sk(sk);
 855         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
 856         char *sun_path = sunaddr->sun_path;
 857         int err;
 858         unsigned int hash;
 859         struct unix_address *addr;
 860         struct hlist_head *list;
 861
 862         err = -EINVAL;
 863         if (sunaddr->sun_family != AF_UNIX)
 864                 goto out;
 865
 866         if (addr_len == sizeof(short)) {
 867                 err = unix_autobind(sock);
 868                 goto out;
 869         }
 870
 871         err = unix_mkname(sunaddr, addr_len, &hash);
 872         if (err < 0)
 873                 goto out;
 874         addr_len = err;
 875
 876         mutex_lock(&u->readlock);
 877
 878         err = -EINVAL;
 879         if (u->addr)
 880                 goto out_up;
 881
 882         err = -ENOMEM;
 883         addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
 884         if (!addr)
 885                 goto out_up;
 886
 887         memcpy(addr->name, sunaddr, addr_len);
 888         addr->len = addr_len;
 889         addr->hash = hash ^ sk->sk_type;
 890         atomic_set(&addr->refcnt, 1);
 891
 892         if (sun_path[0]) {
 893                 struct path path;
 894                 umode_t mode = S_IFSOCK |
 895                        (SOCK_INODE(sock)->i_mode & ~current_umask());
 896                 err = unix_mknod(sun_path, mode, &path);
 897                 if (err) {
 898                         if (err == -EEXIST)
 899                                 err = -EADDRINUSE;
 900                         unix_release_addr(addr);
 901                         goto out_up;
 902                 }
 903                 addr->hash = UNIX_HASH_SIZE;
 904                 hash = path.dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1);
 905                 spin_lock(&unix_table_lock);
 906                 u->path = path;
 907                 list = &unix_socket_table[hash];
 908         } else {
 909                 spin_lock(&unix_table_lock);
 910                 err = -EADDRINUSE;
 911                 if (__unix_find_socket_byname(net, sunaddr, addr_len,
 912                                               sk->sk_type, hash)) {
 913                         unix_release_addr(addr);
 914                         goto out_unlock;
 915                 }
 916
 917                 list = &unix_socket_table[addr->hash];
 918         }
 919
 920         err = 0;
 921         __unix_remove_socket(sk);
 922         u->addr = addr;
 923         __unix_insert_socket(list, sk);
 924
 925 out_unlock:
 926         spin_unlock(&unix_table_lock);
 927 out_up:
 928         mutex_unlock(&u->readlock);
 929 out:
 930         return err;
 931 }
 932
 933 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
 934 {
 935         if (unlikely(sk1 == sk2) || !sk2) {
 936                 unix_state_lock(sk1);
 937                 return;
 938         }
 939         if (sk1 < sk2) {
 940                 unix_state_lock(sk1);
 941                 unix_state_lock_nested(sk2);
 942         } else {
 943                 unix_state_lock(sk2);
 944                 unix_state_lock_nested(sk1);
 945         }
 946 }
 947
 948 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
 949 {
 950         if (unlikely(sk1 == sk2) || !sk2) {
 951                 unix_state_unlock(sk1);
 952                 return;
 953         }
 954         unix_state_unlock(sk1);
 955         unix_state_unlock(sk2);
 956 }
 957
 958 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
 959                               int alen, int flags)
 960 {
 961         struct sock *sk = sock->sk;
 962         struct net *net = sock_net(sk);
 963         struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
 964         struct sock *other;
 965         unsigned int hash;
 966         int err;
 967
 968         if (addr->sa_family != AF_UNSPEC) {
 969                 err = unix_mkname(sunaddr, alen, &hash);
 970                 if (err < 0)
 971                         goto out;
 972                 alen = err;
 973
 974                 if (test_bit(SOCK_PASSCRED, &sock->flags) &&
 975                     !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
 976                         goto out;
 977
 978 restart:
 979                 other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
 980                 if (!other)
 981                         goto out;
 982
 983                 unix_state_double_lock(sk, other);
 984
 985                 /* Apparently VFS overslept socket death. Retry. */
 986                 if (sock_flag(other, SOCK_DEAD)) {
 987                         unix_state_double_unlock(sk, other);
 988                         sock_put(other);
 989                         goto restart;
 990                 }
 991
 992                 err = -EPERM;
 993                 if (!unix_may_send(sk, other))
 994                         goto out_unlock;
 995
 996                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
 997                 if (err)
 998                         goto out_unlock;
 999
1000         } else {
1001                 /*
1002                  *      1003.1g breaking connected state with AF_UNSPEC
1003                  */
1004                 other = NULL;
1005                 unix_state_double_lock(sk, other);
1006         }
1007
1008         /*
1009          * If it was connected, reconnect.
1010          */
1011         if (unix_peer(sk)) {
1012                 struct sock *old_peer = unix_peer(sk);
1013                 unix_peer(sk) = other;
1014                 unix_state_double_unlock(sk, other);
1015
1016                 if (other != old_peer)
1017                         unix_dgram_disconnected(sk, old_peer);
1018                 sock_put(old_peer);
1019         } else {
1020                 unix_peer(sk) = other;
1021                 unix_state_double_unlock(sk, other);
1022         }
1023         return 0;
1024
1025 out_unlock:
1026         unix_state_double_unlock(sk, other);
1027         sock_put(other);
1028 out:
1029         return err;
1030 }
1031
1032 static long unix_wait_for_peer(struct sock *other, long timeo)
1033 {
1034         struct unix_sock *u = unix_sk(other);
1035         int sched;
1036         DEFINE_WAIT(wait);
1037
1038         prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1039
1040         sched = !sock_flag(other, SOCK_DEAD) &&
1041                 !(other->sk_shutdown & RCV_SHUTDOWN) &&
1042                 unix_recvq_full(other);
1043
1044         unix_state_unlock(other);
1045
1046         if (sched)
1047                 timeo = schedule_timeout(timeo);
1048
1049         finish_wait(&u->peer_wait, &wait);
1050         return timeo;
1051 }
1052
1053 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1054                                int addr_len, int flags)
1055 {
1056         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1057         struct sock *sk = sock->sk;
1058         struct net *net = sock_net(sk);
1059         struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1060         struct sock *newsk = NULL;
1061         struct sock *other = NULL;
1062         struct sk_buff *skb = NULL;
1063         unsigned int hash;
1064         int st;
1065         int err;
1066         long timeo;
1067
1068         err = unix_mkname(sunaddr, addr_len, &hash);
1069         if (err < 0)
1070                 goto out;
1071         addr_len = err;
1072
1073         if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
1074             (err = unix_autobind(sock)) != 0)
1075                 goto out;
1076
1077         timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1078
1079         /* First of all allocate resources.
1080            If we will make it after state is locked,
1081            we will have to recheck all again in any case.
1082          */
1083
1084         err = -ENOMEM;
1085
1086         /* create new sock for complete connection */
1087         newsk = unix_create1(sock_net(sk), NULL);
1088         if (newsk == NULL)
1089                 goto out;
1090
1091         /* Allocate skb for sending to listening sock */
1092         skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1093         if (skb == NULL)
1094                 goto out;
1095
1096 restart:
1097         /*  Find listening sock. */
1098         other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
1099         if (!other)
1100                 goto out;
1101
1102         /* Latch state of peer */
1103         unix_state_lock(other);
1104
1105         /* Apparently VFS overslept socket death. Retry. */
1106         if (sock_flag(other, SOCK_DEAD)) {
1107                 unix_state_unlock(other);
1108                 sock_put(other);
1109                 goto restart;
1110         }
1111
1112         err = -ECONNREFUSED;
1113         if (other->sk_state != TCP_LISTEN)
1114                 goto out_unlock;
1115         if (other->sk_shutdown & RCV_SHUTDOWN)
1116                 goto out_unlock;
1117
1118         if (unix_recvq_full(other)) {
1119                 err = -EAGAIN;
1120                 if (!timeo)
1121                         goto out_unlock;
1122
1123                 timeo = unix_wait_for_peer(other, timeo);
1124
1125                 err = sock_intr_errno(timeo);
1126                 if (signal_pending(current))
1127                         goto out;
1128                 sock_put(other);
1129                 goto restart;
1130         }
1131
1132         /* Latch our state.
1133
1134            It is tricky place. We need to grab our state lock and cannot
1135            drop lock on peer. It is dangerous because deadlock is
1136            possible. Connect to self case and simultaneous
1137            attempt to connect are eliminated by checking socket
1138            state. other is TCP_LISTEN, if sk is TCP_LISTEN we
1139            check this before attempt to grab lock.
1140
1141            Well, and we have to recheck the state after socket locked.
1142          */
1143         st = sk->sk_state;
1144
1145         switch (st) {
1146         case TCP_CLOSE:
1147                 /* This is ok... continue with connect */
1148                 break;
1149         case TCP_ESTABLISHED:
1150                 /* Socket is already connected */
1151                 err = -EISCONN;
1152                 goto out_unlock;
1153         default:
1154                 err = -EINVAL;
1155                 goto out_unlock;
1156         }
1157
1158         unix_state_lock_nested(sk);
1159
1160         if (sk->sk_state != st) {
1161                 unix_state_unlock(sk);
1162                 unix_state_unlock(other);
1163                 sock_put(other);
1164                 goto restart;
1165         }
1166
1167         err = security_unix_stream_connect(sk, other, newsk);
1168         if (err) {
1169                 unix_state_unlock(sk);
1170                 goto out_unlock;
1171         }
1172
1173         /* The way is open! Fastly set all the necessary fields... */
1174
1175         sock_hold(sk);
1176         unix_peer(newsk)        = sk;
1177         newsk->sk_state         = TCP_ESTABLISHED;
1178         newsk->sk_type          = sk->sk_type;
1179         init_peercred(newsk);
1180         newu = unix_sk(newsk);
1181         RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1182         otheru = unix_sk(other);
1183
1184         /* copy address information from listening to new sock*/
1185         if (otheru->addr) {
1186                 atomic_inc(&otheru->addr->refcnt);
1187                 newu->addr = otheru->addr;
1188         }
1189         if (otheru->path.dentry) {
1190                 path_get(&otheru->path);
1191                 newu->path = otheru->path;
1192         }
1193
1194         /* Set credentials */
1195         copy_peercred(sk, other);
1196
1197         sock->state     = SS_CONNECTED;
1198         sk->sk_state    = TCP_ESTABLISHED;
1199         sock_hold(newsk);
1200
1201         smp_mb__after_atomic_inc();     /* sock_hold() does an atomic_inc() */
1202         unix_peer(sk)   = newsk;
1203
1204         unix_state_unlock(sk);
1205
1206         /* take ten and and send info to listening sock */
1207         spin_lock(&other->sk_receive_queue.lock);
1208         __skb_queue_tail(&other->sk_receive_queue, skb);
1209         spin_unlock(&other->sk_receive_queue.lock);
1210         unix_state_unlock(other);
1211         other->sk_data_ready(other, 0);
1212         sock_put(other);
1213         return 0;
1214
1215 out_unlock:
1216         if (other)
1217                 unix_state_unlock(other);
1218
1219 out:
1220         kfree_skb(skb);
1221         if (newsk)
1222                 unix_release_sock(newsk, 0);
1223         if (other)
1224                 sock_put(other);
1225         return err;
1226 }
1227
1228 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1229 {
1230         struct sock *ska = socka->sk, *skb = sockb->sk;
1231
1232         /* Join our sockets back to back */
1233         sock_hold(ska);
1234         sock_hold(skb);
1235         unix_peer(ska) = skb;
1236         unix_peer(skb) = ska;
1237         init_peercred(ska);
1238         init_peercred(skb);
1239
1240         if (ska->sk_type != SOCK_DGRAM) {
1241                 ska->sk_state = TCP_ESTABLISHED;
1242                 skb->sk_state = TCP_ESTABLISHED;
1243                 socka->state  = SS_CONNECTED;
1244                 sockb->state  = SS_CONNECTED;
1245         }
1246         return 0;
1247 }
1248
1249 static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
1250 {
1251         struct sock *sk = sock->sk;
1252         struct sock *tsk;
1253         struct sk_buff *skb;
1254         int err;
1255
1256         err = -EOPNOTSUPP;
1257         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1258                 goto out;
1259
1260         err = -EINVAL;
1261         if (sk->sk_state != TCP_LISTEN)
1262                 goto out;
1263
1264         /* If socket state is TCP_LISTEN it cannot change (for now...),
1265          * so that no locks are necessary.
1266          */
1267
1268         skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1269         if (!skb) {
1270                 /* This means receive shutdown. */
1271                 if (err == 0)
1272                         err = -EINVAL;
1273                 goto out;
1274         }
1275
1276         tsk = skb->sk;
1277         skb_free_datagram(sk, skb);
1278         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1279
1280         /* attach accepted sock to socket */
1281         unix_state_lock(tsk);
1282         newsock->state = SS_CONNECTED;
1283         sock_graft(tsk, newsock);
1284         unix_state_unlock(tsk);
1285         return 0;
1286
1287 out:
1288         return err;
1289 }
1290
1291
1292 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1293 {
1294         struct sock *sk = sock->sk;
1295         struct unix_sock *u;
1296         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1297         int err = 0;
1298
1299         if (peer) {
1300                 sk = unix_peer_get(sk);
1301
1302                 err = -ENOTCONN;
1303                 if (!sk)
1304                         goto out;
1305                 err = 0;
1306         } else {
1307                 sock_hold(sk);
1308         }
1309
1310         u = unix_sk(sk);
1311         unix_state_lock(sk);
1312         if (!u->addr) {
1313                 sunaddr->sun_family = AF_UNIX;
1314                 sunaddr->sun_path[0] = 0;
1315                 *uaddr_len = sizeof(short);
1316         } else {
1317                 struct unix_address *addr = u->addr;
1318
1319                 *uaddr_len = addr->len;
1320                 memcpy(sunaddr, addr->name, *uaddr_len);
1321         }
1322         unix_state_unlock(sk);
1323         sock_put(sk);
1324 out:
1325         return err;
1326 }
1327
1328 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1329 {
1330         int i;
1331
1332         scm->fp = UNIXCB(skb).fp;
1333         UNIXCB(skb).fp = NULL;
1334
1335         for (i = scm->fp->count-1; i >= 0; i--)
1336                 unix_notinflight(scm->fp->fp[i]);
1337 }
1338
1339 static void unix_destruct_scm(struct sk_buff *skb)
1340 {
1341         struct scm_cookie scm;
1342         memset(&scm, 0, sizeof(scm));
1343         scm.pid  = UNIXCB(skb).pid;
1344         if (UNIXCB(skb).fp)
1345                 unix_detach_fds(&scm, skb);
1346
1347         /* Alas, it calls VFS */
1348         /* So fscking what? fput() had been SMP-safe since the last Summer */
1349         scm_destroy(&scm);
1350         sock_wfree(skb);
1351 }
1352
1353 #define MAX_RECURSION_LEVEL 4
1354
1355 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1356 {
1357         int i;
1358         unsigned char max_level = 0;
1359         int unix_sock_count = 0;
1360
1361         for (i = scm->fp->count - 1; i >= 0; i--) {
1362                 struct sock *sk = unix_get_socket(scm->fp->fp[i]);
1363
1364                 if (sk) {
1365                         unix_sock_count++;
1366                         max_level = max(max_level,
1367                                         unix_sk(sk)->recursion_level);
1368                 }
1369         }
1370         if (unlikely(max_level > MAX_RECURSION_LEVEL))
1371                 return -ETOOMANYREFS;
1372
1373         /*
1374          * Need to duplicate file references for the sake of garbage
1375          * collection.  Otherwise a socket in the fps might become a
1376          * candidate for GC while the skb is not yet queued.
1377          */
1378         UNIXCB(skb).fp = scm_fp_dup(scm->fp);
1379         if (!UNIXCB(skb).fp)
1380                 return -ENOMEM;
1381
1382         if (unix_sock_count) {
1383                 for (i = scm->fp->count - 1; i >= 0; i--)
1384                         unix_inflight(scm->fp->fp[i]);
1385         }
1386         return max_level;
1387 }
1388
1389 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1390 {
1391         int err = 0;
1392
1393         UNIXCB(skb).pid  = get_pid(scm->pid);
1394         UNIXCB(skb).uid = scm->creds.uid;
1395         UNIXCB(skb).gid = scm->creds.gid;
1396         UNIXCB(skb).fp = NULL;
1397         if (scm->fp && send_fds)
1398                 err = unix_attach_fds(scm, skb);
1399
1400         skb->destructor = unix_destruct_scm;
1401         return err;
1402 }
1403
1404 /*
1405  * Some apps rely on write() giving SCM_CREDENTIALS
1406  * We include credentials if source or destination socket
1407  * asserted SOCK_PASSCRED.
1408  */
1409 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1410                             const struct sock *other)
1411 {
1412         if (UNIXCB(skb).pid)
1413                 return;
1414         if (test_bit(SOCK_PASSCRED, &sock->flags) ||
1415             !other->sk_socket ||
1416             test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) {
1417                 UNIXCB(skb).pid  = get_pid(task_tgid(current));
1418                 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1419         }
1420 }
1421
1422 /*
1423  *      Send AF_UNIX data.
1424  */
1425
1426 static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
1427                               struct msghdr *msg, size_t len)
1428 {
1429         struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1430         struct sock *sk = sock->sk;
1431         struct net *net = sock_net(sk);
1432         struct unix_sock *u = unix_sk(sk);
1433         struct sockaddr_un *sunaddr = msg->msg_name;
1434         struct sock *other = NULL;
1435         int namelen = 0; /* fake GCC */
1436         int err;
1437         unsigned int hash;
1438         struct sk_buff *skb;
1439         long timeo;
1440         struct scm_cookie tmp_scm;
1441         int max_level;
1442         int data_len = 0;
1443
1444         if (NULL == siocb->scm)
1445                 siocb->scm = &tmp_scm;
1446         wait_for_unix_gc();
1447         err = scm_send(sock, msg, siocb->scm, false);
1448         if (err < 0)
1449                 return err;
1450
1451         err = -EOPNOTSUPP;
1452         if (msg->msg_flags&MSG_OOB)
1453                 goto out;
1454
1455         if (msg->msg_namelen) {
1456                 err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1457                 if (err < 0)
1458                         goto out;
1459                 namelen = err;
1460         } else {
1461                 sunaddr = NULL;
1462                 err = -ENOTCONN;
1463                 other = unix_peer_get(sk);
1464                 if (!other)
1465                         goto out;
1466         }
1467
1468         if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
1469             && (err = unix_autobind(sock)) != 0)
1470                 goto out;
1471
1472         err = -EMSGSIZE;
1473         if (len > sk->sk_sndbuf - 32)
1474                 goto out;
1475
1476         if (len > SKB_MAX_ALLOC)
1477                 data_len = min_t(size_t,
1478                                  len - SKB_MAX_ALLOC,
1479                                  MAX_SKB_FRAGS * PAGE_SIZE);
1480
1481         skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1482                                    msg->msg_flags & MSG_DONTWAIT, &err,
1483                                    PAGE_ALLOC_COSTLY_ORDER);
1484         if (skb == NULL)
1485                 goto out;
1486
1487         err = unix_scm_to_skb(siocb->scm, skb, true);
1488         if (err < 0)
1489                 goto out_free;
1490         max_level = err + 1;
1491         unix_get_secdata(siocb->scm, skb);
1492
1493         skb_put(skb, len - data_len);
1494         skb->data_len = data_len;
1495         skb->len = len;
1496         err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov, 0, len);
1497         if (err)
1498                 goto out_free;
1499
1500         timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1501
1502 restart:
1503         if (!other) {
1504                 err = -ECONNRESET;
1505                 if (sunaddr == NULL)
1506                         goto out_free;
1507
1508                 other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
1509                                         hash, &err);
1510                 if (other == NULL)
1511                         goto out_free;
1512         }
1513
1514         if (sk_filter(other, skb) < 0) {
1515                 /* Toss the packet but do not return any error to the sender */
1516                 err = len;
1517                 goto out_free;
1518         }
1519
1520         unix_state_lock(other);
1521         err = -EPERM;
1522         if (!unix_may_send(sk, other))
1523                 goto out_unlock;
1524
1525         if (sock_flag(other, SOCK_DEAD)) {
1526                 /*
1527                  *      Check with 1003.1g - what should
1528                  *      datagram error
1529                  */
1530                 unix_state_unlock(other);
1531                 sock_put(other);
1532
1533                 err = 0;
1534                 unix_state_lock(sk);
1535                 if (unix_peer(sk) == other) {
1536                         unix_peer(sk) = NULL;
1537                         unix_state_unlock(sk);
1538
1539                         unix_dgram_disconnected(sk, other);
1540                         sock_put(other);
1541                         err = -ECONNREFUSED;
1542                 } else {
1543                         unix_state_unlock(sk);
1544                 }
1545
1546                 other = NULL;
1547                 if (err)
1548                         goto out_free;
1549                 goto restart;
1550         }
1551
1552         err = -EPIPE;
1553         if (other->sk_shutdown & RCV_SHUTDOWN)
1554                 goto out_unlock;
1555
1556         if (sk->sk_type != SOCK_SEQPACKET) {
1557                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1558                 if (err)
1559                         goto out_unlock;
1560         }
1561
1562         if (unix_peer(other) != sk && unix_recvq_full(other)) {
1563                 if (!timeo) {
1564                         err = -EAGAIN;
1565                         goto out_unlock;
1566                 }
1567
1568                 timeo = unix_wait_for_peer(other, timeo);
1569
1570                 err = sock_intr_errno(timeo);
1571                 if (signal_pending(current))
1572                         goto out_free;
1573
1574                 goto restart;
1575         }
1576
1577         if (sock_flag(other, SOCK_RCVTSTAMP))
1578                 __net_timestamp(skb);
1579         maybe_add_creds(skb, sock, other);
1580         skb_queue_tail(&other->sk_receive_queue, skb);
1581         if (max_level > unix_sk(other)->recursion_level)
1582                 unix_sk(other)->recursion_level = max_level;
1583         unix_state_unlock(other);
1584         other->sk_data_ready(other, len);
1585         sock_put(other);
1586         scm_destroy(siocb->scm);
1587         return len;
1588
1589 out_unlock:
1590         unix_state_unlock(other);
1591 out_free:
1592         kfree_skb(skb);
1593 out:
1594         if (other)
1595                 sock_put(other);
1596         scm_destroy(siocb->scm);
1597         return err;
1598 }
1599
1600 /* We use paged skbs for stream sockets, and limit occupancy to 32768
1601  * bytes, and a minimun of a full page.
1602  */
1603 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
1604
1605 static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
1606                                struct msghdr *msg, size_t len)
1607 {
1608         struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1609         struct sock *sk = sock->sk;
1610         struct sock *other = NULL;
1611         int err, size;
1612         struct sk_buff *skb;
1613         int sent = 0;
1614         struct scm_cookie tmp_scm;
1615         bool fds_sent = false;
1616         int max_level;
1617         int data_len;
1618
1619         if (NULL == siocb->scm)
1620                 siocb->scm = &tmp_scm;
1621         wait_for_unix_gc();
1622         err = scm_send(sock, msg, siocb->scm, false);
1623         if (err < 0)
1624                 return err;
1625
1626         err = -EOPNOTSUPP;
1627         if (msg->msg_flags&MSG_OOB)
1628                 goto out_err;
1629
1630         if (msg->msg_namelen) {
1631                 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1632                 goto out_err;
1633         } else {
1634                 err = -ENOTCONN;
1635                 other = unix_peer(sk);
1636                 if (!other)
1637                         goto out_err;
1638         }
1639
1640         if (sk->sk_shutdown & SEND_SHUTDOWN)
1641                 goto pipe_err;
1642
1643         while (sent < len) {
1644                 size = len - sent;
1645
1646                 /* Keep two messages in the pipe so it schedules better */
1647                 size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
1648
1649                 /* allow fallback to order-0 allocations */
1650                 size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
1651
1652                 data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
1653
1654                 skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
1655                                            msg->msg_flags & MSG_DONTWAIT, &err,
1656                                            get_order(UNIX_SKB_FRAGS_SZ));
1657                 if (!skb)
1658                         goto out_err;
1659
1660                 /* Only send the fds in the first buffer */
1661                 err = unix_scm_to_skb(siocb->scm, skb, !fds_sent);
1662                 if (err < 0) {
1663                         kfree_skb(skb);
1664                         goto out_err;
1665                 }
1666                 max_level = err + 1;
1667                 fds_sent = true;
1668
1669                 skb_put(skb, size - data_len);
1670                 skb->data_len = data_len;
1671                 skb->len = size;
1672                 err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov,
1673                                                    sent, size);
1674                 if (err) {
1675                         kfree_skb(skb);
1676                         goto out_err;
1677                 }
1678
1679                 unix_state_lock(other);
1680
1681                 if (sock_flag(other, SOCK_DEAD) ||
1682                     (other->sk_shutdown & RCV_SHUTDOWN))
1683                         goto pipe_err_free;
1684
1685                 maybe_add_creds(skb, sock, other);
1686                 skb_queue_tail(&other->sk_receive_queue, skb);
1687                 if (max_level > unix_sk(other)->recursion_level)
1688                         unix_sk(other)->recursion_level = max_level;
1689                 unix_state_unlock(other);
1690                 other->sk_data_ready(other, size);
1691                 sent += size;
1692         }
1693
1694         scm_destroy(siocb->scm);
1695         siocb->scm = NULL;
1696
1697         return sent;
1698
1699 pipe_err_free:
1700         unix_state_unlock(other);
1701         kfree_skb(skb);
1702 pipe_err:
1703         if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
1704                 send_sig(SIGPIPE, current, 0);
1705         err = -EPIPE;
1706 out_err:
1707         scm_destroy(siocb->scm);
1708         siocb->scm = NULL;
1709         return sent ? : err;
1710 }
1711
1712 static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock,
1713                                   struct msghdr *msg, size_t len)
1714 {
1715         int err;
1716         struct sock *sk = sock->sk;
1717
1718         err = sock_error(sk);
1719         if (err)
1720                 return err;
1721
1722         if (sk->sk_state != TCP_ESTABLISHED)
1723                 return -ENOTCONN;
1724
1725         if (msg->msg_namelen)
1726                 msg->msg_namelen = 0;
1727
1728         return unix_dgram_sendmsg(kiocb, sock, msg, len);
1729 }
1730
1731 static int unix_seqpacket_recvmsg(struct kiocb *iocb, struct socket *sock,
1732                               struct msghdr *msg, size_t size,
1733                               int flags)
1734 {
1735         struct sock *sk = sock->sk;
1736
1737         if (sk->sk_state != TCP_ESTABLISHED)
1738                 return -ENOTCONN;
1739
1740         return unix_dgram_recvmsg(iocb, sock, msg, size, flags);
1741 }
1742
1743 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
1744 {
1745         struct unix_sock *u = unix_sk(sk);
1746
1747         msg->msg_namelen = 0;
1748         if (u->addr) {
1749                 msg->msg_namelen = u->addr->len;
1750                 memcpy(msg->msg_name, u->addr->name, u->addr->len);
1751         }
1752 }
1753
1754 static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
1755                               struct msghdr *msg, size_t size,
1756                               int flags)
1757 {
1758         struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1759         struct scm_cookie tmp_scm;
1760         struct sock *sk = sock->sk;
1761         struct unix_sock *u = unix_sk(sk);
1762         int noblock = flags & MSG_DONTWAIT;
1763         struct sk_buff *skb;
1764         int err;
1765         int peeked, skip;
1766
1767         err = -EOPNOTSUPP;
1768         if (flags&MSG_OOB)
1769                 goto out;
1770
1771         msg->msg_namelen = 0;
1772
1773         err = mutex_lock_interruptible(&u->readlock);
1774         if (err) {
1775                 err = sock_intr_errno(sock_rcvtimeo(sk, noblock));
1776                 goto out;
1777         }
1778
1779         skip = sk_peek_offset(sk, flags);
1780
1781         skb = __skb_recv_datagram(sk, flags, &peeked, &skip, &err);
1782         if (!skb) {
1783                 unix_state_lock(sk);
1784                 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
1785                 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
1786                     (sk->sk_shutdown & RCV_SHUTDOWN))
1787                         err = 0;
1788                 unix_state_unlock(sk);
1789                 goto out_unlock;
1790         }
1791
1792         wake_up_interruptible_sync_poll(&u->peer_wait,
1793                                         POLLOUT | POLLWRNORM | POLLWRBAND);
1794
1795         if (msg->msg_name)
1796                 unix_copy_addr(msg, skb->sk);
1797
1798         if (size > skb->len - skip)
1799                 size = skb->len - skip;
1800         else if (size < skb->len - skip)
1801                 msg->msg_flags |= MSG_TRUNC;
1802
1803         err = skb_copy_datagram_iovec(skb, skip, msg->msg_iov, size);
1804         if (err)
1805                 goto out_free;
1806
1807         if (sock_flag(sk, SOCK_RCVTSTAMP))
1808                 __sock_recv_timestamp(msg, sk, skb);
1809
1810         if (!siocb->scm) {
1811                 siocb->scm = &tmp_scm;
1812                 memset(&tmp_scm, 0, sizeof(tmp_scm));
1813         }
1814         scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
1815         unix_set_secdata(siocb->scm, skb);
1816
1817         if (!(flags & MSG_PEEK)) {
1818                 if (UNIXCB(skb).fp)
1819                         unix_detach_fds(siocb->scm, skb);
1820
1821                 sk_peek_offset_bwd(sk, skb->len);
1822         } else {
1823                 /* It is questionable: on PEEK we could:
1824                    - do not return fds - good, but too simple 8)
1825                    - return fds, and do not return them on read (old strategy,
1826                      apparently wrong)
1827                    - clone fds (I chose it for now, it is the most universal
1828                      solution)
1829
1830                    POSIX 1003.1g does not actually define this clearly
1831                    at all. POSIX 1003.1g doesn't define a lot of things
1832                    clearly however!
1833
1834                 */
1835
1836                 sk_peek_offset_fwd(sk, size);
1837
1838                 if (UNIXCB(skb).fp)
1839                         siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1840         }
1841         err = (flags & MSG_TRUNC) ? skb->len - skip : size;
1842
1843         scm_recv(sock, msg, siocb->scm, flags);
1844
1845 out_free:
1846         skb_free_datagram(sk, skb);
1847 out_unlock:
1848         mutex_unlock(&u->readlock);
1849 out:
1850         return err;
1851 }
1852
1853 /*
1854  *      Sleep until more data has arrived. But check for races..
1855  */
1856 static long unix_stream_data_wait(struct sock *sk, long timeo,
1857                                   struct sk_buff *last)
1858 {
1859         DEFINE_WAIT(wait);
1860
1861         unix_state_lock(sk);
1862
1863         for (;;) {
1864                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1865
1866                 if (skb_peek_tail(&sk->sk_receive_queue) != last ||
1867                     sk->sk_err ||
1868                     (sk->sk_shutdown & RCV_SHUTDOWN) ||
1869                     signal_pending(current) ||
1870                     !timeo)
1871                         break;
1872
1873                 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1874                 unix_state_unlock(sk);
1875                 timeo = freezable_schedule_timeout(timeo);
1876                 unix_state_lock(sk);
1877                 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1878         }
1879
1880         finish_wait(sk_sleep(sk), &wait);
1881         unix_state_unlock(sk);
1882         return timeo;
1883 }
1884
1885 static unsigned int unix_skb_len(const struct sk_buff *skb)
1886 {
1887         return skb->len - UNIXCB(skb).consumed;
1888 }
1889
1890 static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
1891                                struct msghdr *msg, size_t size,
1892                                int flags)
1893 {
1894         struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1895         struct scm_cookie tmp_scm;
1896         struct sock *sk = sock->sk;
1897         struct unix_sock *u = unix_sk(sk);
1898         struct sockaddr_un *sunaddr = msg->msg_name;
1899         int copied = 0;
1900         int check_creds = 0;
1901         int target;
1902         int err = 0;
1903         long timeo;
1904         int skip;
1905
1906         err = -EINVAL;
1907         if (sk->sk_state != TCP_ESTABLISHED)
1908                 goto out;
1909
1910         err = -EOPNOTSUPP;
1911         if (flags&MSG_OOB)
1912                 goto out;
1913
1914         target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
1915         timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT);
1916
1917         msg->msg_namelen = 0;
1918
1919         /* Lock the socket to prevent queue disordering
1920          * while sleeps in memcpy_tomsg
1921          */
1922
1923         if (!siocb->scm) {
1924                 siocb->scm = &tmp_scm;
1925                 memset(&tmp_scm, 0, sizeof(tmp_scm));
1926         }
1927
1928         err = mutex_lock_interruptible(&u->readlock);
1929         if (err) {
1930                 err = sock_intr_errno(timeo);
1931                 goto out;
1932         }
1933
1934         do {
1935                 int chunk;
1936                 struct sk_buff *skb, *last;
1937
1938                 unix_state_lock(sk);
1939                 last = skb = skb_peek(&sk->sk_receive_queue);
1940 again:
1941                 if (skb == NULL) {
1942                         unix_sk(sk)->recursion_level = 0;
1943                         if (copied >= target)
1944                                 goto unlock;
1945
1946                         /*
1947                          *      POSIX 1003.1g mandates this order.
1948                          */
1949
1950                         err = sock_error(sk);
1951                         if (err)
1952                                 goto unlock;
1953                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1954                                 goto unlock;
1955
1956                         unix_state_unlock(sk);
1957                         err = -EAGAIN;
1958                         if (!timeo)
1959                                 break;
1960                         mutex_unlock(&u->readlock);
1961
1962                         timeo = unix_stream_data_wait(sk, timeo, last);
1963
1964                         if (signal_pending(current)
1965                             ||  mutex_lock_interruptible(&u->readlock)) {
1966                                 err = sock_intr_errno(timeo);
1967                                 goto out;
1968                         }
1969
1970                         continue;
1971  unlock:
1972                         unix_state_unlock(sk);
1973                         break;
1974                 }
1975
1976                 skip = sk_peek_offset(sk, flags);
1977                 while (skip >= unix_skb_len(skb)) {
1978                         skip -= unix_skb_len(skb);
1979                         last = skb;
1980                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
1981                         if (!skb)
1982                                 goto again;
1983                 }
1984
1985                 unix_state_unlock(sk);
1986
1987                 if (check_creds) {
1988                         /* Never glue messages from different writers */
1989                         if ((UNIXCB(skb).pid  != siocb->scm->pid) ||
1990                             !uid_eq(UNIXCB(skb).uid, siocb->scm->creds.uid) ||
1991                             !gid_eq(UNIXCB(skb).gid, siocb->scm->creds.gid))
1992                                 break;
1993                 } else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
1994                         /* Copy credentials */
1995                         scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
1996                         check_creds = 1;
1997                 }
1998
1999                 /* Copy address just once */
2000                 if (sunaddr) {
2001                         unix_copy_addr(msg, skb->sk);
2002                         sunaddr = NULL;
2003                 }
2004
2005                 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2006                 if (skb_copy_datagram_iovec(skb, UNIXCB(skb).consumed + skip,
2007                                             msg->msg_iov, chunk)) {
2008                         if (copied == 0)
2009                                 copied = -EFAULT;
2010                         break;
2011                 }
2012                 copied += chunk;
2013                 size -= chunk;
2014
2015                 /* Mark read part of skb as used */
2016                 if (!(flags & MSG_PEEK)) {
2017                         UNIXCB(skb).consumed += chunk;
2018
2019                         sk_peek_offset_bwd(sk, chunk);
2020
2021                         if (UNIXCB(skb).fp)
2022                                 unix_detach_fds(siocb->scm, skb);
2023
2024                         if (unix_skb_len(skb))
2025                                 break;
2026
2027                         skb_unlink(skb, &sk->sk_receive_queue);
2028                         consume_skb(skb);
2029
2030                         if (siocb->scm->fp)
2031                                 break;
2032                 } else {
2033                         /* It is questionable, see note in unix_dgram_recvmsg.
2034                          */
2035                         if (UNIXCB(skb).fp)
2036                                 siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
2037
2038                         sk_peek_offset_fwd(sk, chunk);
2039
2040                         break;
2041                 }
2042         } while (size);
2043
2044         mutex_unlock(&u->readlock);
2045         scm_recv(sock, msg, siocb->scm, flags);
2046 out:
2047         return copied ? : err;
2048 }
2049
2050 static int unix_shutdown(struct socket *sock, int mode)
2051 {
2052         struct sock *sk = sock->sk;
2053         struct sock *other;
2054
2055         if (mode < SHUT_RD || mode > SHUT_RDWR)
2056                 return -EINVAL;
2057         /* This maps:
2058          * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2059          * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2060          * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2061          */
2062         ++mode;
2063
2064         unix_state_lock(sk);
2065         sk->sk_shutdown |= mode;
2066         other = unix_peer(sk);
2067         if (other)
2068                 sock_hold(other);
2069         unix_state_unlock(sk);
2070         sk->sk_state_change(sk);
2071
2072         if (other &&
2073                 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2074
2075                 int peer_mode = 0;
2076
2077                 if (mode&RCV_SHUTDOWN)
2078                         peer_mode |= SEND_SHUTDOWN;
2079                 if (mode&SEND_SHUTDOWN)
2080                         peer_mode |= RCV_SHUTDOWN;
2081                 unix_state_lock(other);
2082                 other->sk_shutdown |= peer_mode;
2083                 unix_state_unlock(other);
2084                 other->sk_state_change(other);
2085                 if (peer_mode == SHUTDOWN_MASK)
2086                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2087                 else if (peer_mode & RCV_SHUTDOWN)
2088                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2089         }
2090         if (other)
2091                 sock_put(other);
2092
2093         return 0;
2094 }
2095
2096 long unix_inq_len(struct sock *sk)
2097 {
2098         struct sk_buff *skb;
2099         long amount = 0;
2100
2101         if (sk->sk_state == TCP_LISTEN)
2102                 return -EINVAL;
2103
2104         spin_lock(&sk->sk_receive_queue.lock);
2105         if (sk->sk_type == SOCK_STREAM ||
2106             sk->sk_type == SOCK_SEQPACKET) {
2107                 skb_queue_walk(&sk->sk_receive_queue, skb)
2108                         amount += unix_skb_len(skb);
2109         } else {
2110                 skb = skb_peek(&sk->sk_receive_queue);
2111                 if (skb)
2112                         amount = skb->len;
2113         }
2114         spin_unlock(&sk->sk_receive_queue.lock);
2115
2116         return amount;
2117 }
2118 EXPORT_SYMBOL_GPL(unix_inq_len);
2119
2120 long unix_outq_len(struct sock *sk)
2121 {
2122         return sk_wmem_alloc_get(sk);
2123 }
2124 EXPORT_SYMBOL_GPL(unix_outq_len);
2125
2126 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2127 {
2128         struct sock *sk = sock->sk;
2129         long amount = 0;
2130         int err;
2131
2132         switch (cmd) {
2133         case SIOCOUTQ:
2134                 amount = unix_outq_len(sk);
2135                 err = put_user(amount, (int __user *)arg);
2136                 break;
2137         case SIOCINQ:
2138                 amount = unix_inq_len(sk);
2139                 if (amount < 0)
2140                         err = amount;
2141                 else
2142                         err = put_user(amount, (int __user *)arg);
2143                 break;
2144         default:
2145                 err = -ENOIOCTLCMD;
2146                 break;
2147         }
2148         return err;
2149 }
2150
2151 static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table *wait)
2152 {
2153         struct sock *sk = sock->sk;
2154         unsigned int mask;
2155
2156         sock_poll_wait(file, sk_sleep(sk), wait);
2157         mask = 0;
2158
2159         /* exceptional events? */
2160         if (sk->sk_err)
2161                 mask |= POLLERR;
2162         if (sk->sk_shutdown == SHUTDOWN_MASK)
2163                 mask |= POLLHUP;
2164         if (sk->sk_shutdown & RCV_SHUTDOWN)
2165                 mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2166
2167         /* readable? */
2168         if (!skb_queue_empty(&sk->sk_receive_queue))
2169                 mask |= POLLIN | POLLRDNORM;
2170
2171         /* Connection-based need to check for termination and startup */
2172         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
2173             sk->sk_state == TCP_CLOSE)
2174                 mask |= POLLHUP;
2175
2176         /*
2177          * we set writable also when the other side has shut down the
2178          * connection. This prevents stuck sockets.
2179          */
2180         if (unix_writable(sk))
2181                 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2182
2183         return mask;
2184 }
2185
2186 static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
2187                                     poll_table *wait)
2188 {
2189         struct sock *sk = sock->sk, *other;
2190         unsigned int mask, writable;
2191
2192         sock_poll_wait(file, sk_sleep(sk), wait);
2193         mask = 0;
2194
2195         /* exceptional events? */
2196         if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
2197                 mask |= POLLERR |
2198                         (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
2199
2200         if (sk->sk_shutdown & RCV_SHUTDOWN)
2201                 mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2202         if (sk->sk_shutdown == SHUTDOWN_MASK)
2203                 mask |= POLLHUP;
2204
2205         /* readable? */
2206         if (!skb_queue_empty(&sk->sk_receive_queue))
2207                 mask |= POLLIN | POLLRDNORM;
2208
2209         /* Connection-based need to check for termination and startup */
2210         if (sk->sk_type == SOCK_SEQPACKET) {
2211                 if (sk->sk_state == TCP_CLOSE)
2212                         mask |= POLLHUP;
2213                 /* connection hasn't started yet? */
2214                 if (sk->sk_state == TCP_SYN_SENT)
2215                         return mask;
2216         }
2217
2218         /* No write status requested, avoid expensive OUT tests. */
2219         if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
2220                 return mask;
2221
2222         writable = unix_writable(sk);
2223         other = unix_peer_get(sk);
2224         if (other) {
2225                 if (unix_peer(other) != sk) {
2226                         sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
2227                         if (unix_recvq_full(other))
2228                                 writable = 0;
2229                 }
2230                 sock_put(other);
2231         }
2232
2233         if (writable)
2234                 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2235         else
2236                 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
2237
2238         return mask;
2239 }
2240
2241 #ifdef CONFIG_PROC_FS
2242
2243 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
2244
2245 #define get_bucket(x) ((x) >> BUCKET_SPACE)
2246 #define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
2247 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
2248
2249 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
2250 {
2251         unsigned long offset = get_offset(*pos);
2252         unsigned long bucket = get_bucket(*pos);
2253         struct sock *sk;
2254         unsigned long count = 0;
2255
2256         for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
2257                 if (sock_net(sk) != seq_file_net(seq))
2258                         continue;
2259                 if (++count == offset)
2260                         break;
2261         }
2262
2263         return sk;
2264 }
2265
2266 static struct sock *unix_next_socket(struct seq_file *seq,
2267                                      struct sock *sk,
2268                                      loff_t *pos)
2269 {
2270         unsigned long bucket;
2271
2272         while (sk > (struct sock *)SEQ_START_TOKEN) {
2273                 sk = sk_next(sk);
2274                 if (!sk)
2275                         goto next_bucket;
2276                 if (sock_net(sk) == seq_file_net(seq))
2277                         return sk;
2278         }
2279
2280         do {
2281                 sk = unix_from_bucket(seq, pos);
2282                 if (sk)
2283                         return sk;
2284
2285 next_bucket:
2286                 bucket = get_bucket(*pos) + 1;
2287                 *pos = set_bucket_offset(bucket, 1);
2288         } while (bucket < ARRAY_SIZE(unix_socket_table));
2289
2290         return NULL;
2291 }
2292
2293 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
2294         __acquires(unix_table_lock)
2295 {
2296         spin_lock(&unix_table_lock);
2297
2298         if (!*pos)
2299                 return SEQ_START_TOKEN;
2300
2301         if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
2302                 return NULL;
2303
2304         return unix_next_socket(seq, NULL, pos);
2305 }
2306
2307 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2308 {
2309         ++*pos;
2310         return unix_next_socket(seq, v, pos);
2311 }
2312
2313 static void unix_seq_stop(struct seq_file *seq, void *v)
2314         __releases(unix_table_lock)
2315 {
2316         spin_unlock(&unix_table_lock);
2317 }
2318
2319 static int unix_seq_show(struct seq_file *seq, void *v)
2320 {
2321
2322         if (v == SEQ_START_TOKEN)
2323                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
2324                          "Inode Path\n");
2325         else {
2326                 struct sock *s = v;
2327                 struct unix_sock *u = unix_sk(s);
2328                 unix_state_lock(s);
2329
2330                 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
2331                         s,
2332                         atomic_read(&s->sk_refcnt),
2333                         0,
2334                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
2335                         s->sk_type,
2336                         s->sk_socket ?
2337                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
2338                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
2339                         sock_i_ino(s));
2340
2341                 if (u->addr) {
2342                         int i, len;
2343                         seq_putc(seq, ' ');
2344
2345                         i = 0;
2346                         len = u->addr->len - sizeof(short);
2347                         if (!UNIX_ABSTRACT(s))
2348                                 len--;
2349                         else {
2350                                 seq_putc(seq, '@');
2351                                 i++;
2352                         }
2353                         for ( ; i < len; i++)
2354                                 seq_putc(seq, u->addr->name->sun_path[i]);
2355                 }
2356                 unix_state_unlock(s);
2357                 seq_putc(seq, '\n');
2358         }
2359
2360         return 0;
2361 }
2362
2363 static const struct seq_operations unix_seq_ops = {
2364         .start  = unix_seq_start,
2365         .next   = unix_seq_next,
2366         .stop   = unix_seq_stop,
2367         .show   = unix_seq_show,
2368 };
2369
2370 static int unix_seq_open(struct inode *inode, struct file *file)
2371 {
2372         return seq_open_net(inode, file, &unix_seq_ops,
2373                             sizeof(struct seq_net_private));
2374 }
2375
2376 static const struct file_operations unix_seq_fops = {
2377         .owner          = THIS_MODULE,
2378         .open           = unix_seq_open,
2379         .read           = seq_read,
2380         .llseek         = seq_lseek,
2381         .release        = seq_release_net,
2382 };
2383
2384 #endif
2385
2386 static const struct net_proto_family unix_family_ops = {
2387         .family = PF_UNIX,
2388         .create = unix_create,
2389         .owner  = THIS_MODULE,
2390 };
2391
2392
2393 static int __net_init unix_net_init(struct net *net)
2394 {
2395         int error = -ENOMEM;
2396
2397         net->unx.sysctl_max_dgram_qlen = 10;
2398         if (unix_sysctl_register(net))
2399                 goto out;
2400
2401 #ifdef CONFIG_PROC_FS
2402         if (!proc_create("unix", 0, net->proc_net, &unix_seq_fops)) {
2403                 unix_sysctl_unregister(net);
2404                 goto out;
2405         }
2406 #endif
2407         error = 0;
2408 out:
2409         return error;
2410 }
2411
2412 static void __net_exit unix_net_exit(struct net *net)
2413 {
2414         unix_sysctl_unregister(net);
2415         remove_proc_entry("unix", net->proc_net);
2416 }
2417
2418 static struct pernet_operations unix_net_ops = {
2419         .init = unix_net_init,
2420         .exit = unix_net_exit,
2421 };
2422
2423 static int __init af_unix_init(void)
2424 {
2425         int rc = -1;
2426
2427         BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));
2428
2429         rc = proto_register(&unix_proto, 1);
2430         if (rc != 0) {
2431                 printk(KERN_CRIT "%s: Cannot create unix_sock SLAB cache!\n",
2432                        __func__);
2433                 goto out;
2434         }
2435
2436         sock_register(&unix_family_ops);
2437         register_pernet_subsys(&unix_net_ops);
2438 out:
2439         return rc;
2440 }
2441
2442 static void __exit af_unix_exit(void)
2443 {
2444         sock_unregister(PF_UNIX);
2445         proto_unregister(&unix_proto);
2446         unregister_pernet_subsys(&unix_net_ops);
2447 }
2448
2449 /* Earlier than device_initcall() so that other drivers invoking
2450    request_module() don't end up in a loop when modprobe tries
2451    to use a UNIX socket. But later than subsys_initcall() because
2452    we depend on stuff initialised there */
2453 fs_initcall(af_unix_init);
2454 module_exit(af_unix_exit);
2455
2456 MODULE_LICENSE("GPL");
2457 MODULE_ALIAS_NETPROTO(PF_UNIX);