net/unix/af_unix.c

   1 /*
   2  * NET4:        Implementation of BSD Unix domain sockets.
   3  *
   4  * Authors:     Alan Cox, <alan@lxorguk.ukuu.org.uk>
   5  *
   6  *              This program is free software; you can redistribute it and/or
   7  *              modify it under the terms of the GNU General Public License
   8  *              as published by the Free Software Foundation; either version
   9  *              2 of the License, or (at your option) any later version.
  10  *
  11  * Fixes:
  12  *              Linus Torvalds  :       Assorted bug cures.
  13  *              Niibe Yutaka    :       async I/O support.
  14  *              Carsten Paeth   :       PF_UNIX check, address fixes.
  15  *              Alan Cox        :       Limit size of allocated blocks.
  16  *              Alan Cox        :       Fixed the stupid socketpair bug.
  17  *              Alan Cox        :       BSD compatibility fine tuning.
  18  *              Alan Cox        :       Fixed a bug in connect when interrupted.
  19  *              Alan Cox        :       Sorted out a proper draft version of
  20  *                                      file descriptor passing hacked up from
  21  *                                      Mike Shaver's work.
  22  *              Marty Leisner   :       Fixes to fd passing
  23  *              Nick Nevin      :       recvmsg bugfix.
  24  *              Alan Cox        :       Started proper garbage collector
  25  *              Heiko EiBfeldt  :       Missing verify_area check
  26  *              Alan Cox        :       Started POSIXisms
  27  *              Andreas Schwab  :       Replace inode by dentry for proper
  28  *                                      reference counting
  29  *              Kirk Petersen   :       Made this a module
  30  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
  31  *                                      Lots of bug fixes.
  32  *           Alexey Kuznetsov   :       Repaired (I hope) bugs introduced
  33  *                                      by above two patches.
  34  *           Andrea Arcangeli   :       If possible we block in connect(2)
  35  *                                      if the max backlog of the listen socket
  36  *                                      has been reached. This won't break
  37  *                                      old apps and it will avoid a huge number
  38  *                                      of socks hashed (this for unix_gc()
  39  *                                      performance reasons).
  40  *                                      Security fix that limits the max
  41  *                                      number of socks to 2*max_files and
  42  *                                      the number of skb queueable in the
  43  *                                      dgram receiver.
  44  *              Artur Skawina   :       Hash function optimizations
  45  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
  46  *            Malcolm Beattie   :       Set peercred for socketpair
  47  *           Michal Ostrowski   :       Module initialization cleanup.
  48  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
  49  *                                      the core infrastructure is doing that
  50  *                                      for all net proto families now (2.5.69+)
  51  *
  52  *
  53  * Known differences from reference BSD that was tested:
  54  *
  55  *      [TO FIX]
  56  *      ECONNREFUSED is not returned from one end of a connected() socket to the
  57  *              other the moment one end closes.
  58  *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
  59  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
  60  *      [NOT TO FIX]
  61  *      accept() returns a path name even if the connecting socket has closed
  62  *              in the meantime (BSD loses the path and gives up).
  63  *      accept() returns 0 length path for an unbound connector. BSD returns 16
  64  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
  65  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
  66  *      BSD af_unix apparently has connect forgetting to block properly.
  67  *              (need to check this with the POSIX spec in detail)
  68  *
  69  * Differences from 2.0.0-11-... (ANK)
  70  *      Bug fixes and improvements.
  71  *              - client shutdown killed server socket.
  72  *              - removed all useless cli/sti pairs.
  73  *
  74  *      Semantic changes/extensions.
  75  *              - generic control message passing.
  76  *              - SCM_CREDENTIALS control message.
  77  *              - "Abstract" (not FS based) socket bindings.
  78  *                Abstract names are sequences of bytes (not zero terminated)
  79  *                started by 0, so that this name space does not intersect
  80  *                with BSD names.
  81  */
  82
  83 #include <linux/module.h>
  84 #include <linux/kernel.h>
  85 #include <linux/signal.h>
  86 #include <linux/sched.h>
  87 #include <linux/errno.h>
  88 #include <linux/string.h>
  89 #include <linux/stat.h>
  90 #include <linux/dcache.h>
  91 #include <linux/namei.h>
  92 #include <linux/socket.h>
  93 #include <linux/un.h>
  94 #include <linux/fcntl.h>
  95 #include <linux/termios.h>
  96 #include <linux/sockios.h>
  97 #include <linux/net.h>
  98 #include <linux/in.h>
  99 #include <linux/fs.h>
 100 #include <linux/slab.h>
 101 #include <asm/uaccess.h>
 102 #include <linux/skbuff.h>
 103 #include <linux/netdevice.h>
 104 #include <net/net_namespace.h>
 105 #include <net/sock.h>
 106 #include <net/tcp_states.h>
 107 #include <net/af_unix.h>
 108 #include <linux/proc_fs.h>
 109 #include <linux/seq_file.h>
 110 #include <net/scm.h>
 111 #include <linux/init.h>
 112 #include <linux/poll.h>
 113 #include <linux/rtnetlink.h>
 114 #include <linux/mount.h>
 115 #include <net/checksum.h>
 116 #include <linux/security.h>
 117 #include <linux/freezer.h>
 118
 119 struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
 120 EXPORT_SYMBOL_GPL(unix_socket_table);
 121 DEFINE_SPINLOCK(unix_table_lock);
 122 EXPORT_SYMBOL_GPL(unix_table_lock);
 123 static atomic_long_t unix_nr_socks;
 124
 125
 126 static struct hlist_head *unix_sockets_unbound(void *addr)
 127 {
 128         unsigned long hash = (unsigned long)addr;
 129
 130         hash ^= hash >> 16;
 131         hash ^= hash >> 8;
 132         hash %= UNIX_HASH_SIZE;
 133         return &unix_socket_table[UNIX_HASH_SIZE + hash];
 134 }
 135
 136 #define UNIX_ABSTRACT(sk)       (unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
 137
 138 #ifdef CONFIG_SECURITY_NETWORK
 139 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 140 {
 141         memcpy(UNIXSID(skb), &scm->secid, sizeof(u32));
 142 }
 143
 144 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 145 {
 146         scm->secid = *UNIXSID(skb);
 147 }
 148 #else
 149 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 150 { }
 151
 152 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 153 { }
 154 #endif /* CONFIG_SECURITY_NETWORK */
 155
 156 /*
 157  *  SMP locking strategy:
 158  *    hash table is protected with spinlock unix_table_lock
 159  *    each socket state is protected by separate spin lock.
 160  */
 161
 162 static inline unsigned int unix_hash_fold(__wsum n)
 163 {
 164         unsigned int hash = (__force unsigned int)n;
 165
 166         hash ^= hash>>16;
 167         hash ^= hash>>8;
 168         return hash&(UNIX_HASH_SIZE-1);
 169 }
 170
 171 #define unix_peer(sk) (unix_sk(sk)->peer)
 172
 173 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
 174 {
 175         return unix_peer(osk) == sk;
 176 }
 177
 178 static inline int unix_may_send(struct sock *sk, struct sock *osk)
 179 {
 180         return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
 181 }
 182
 183 static inline int unix_recvq_full(struct sock const *sk)
 184 {
 185         return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
 186 }
 187
 188 struct sock *unix_peer_get(struct sock *s)
 189 {
 190         struct sock *peer;
 191
 192         unix_state_lock(s);
 193         peer = unix_peer(s);
 194         if (peer)
 195                 sock_hold(peer);
 196         unix_state_unlock(s);
 197         return peer;
 198 }
 199 EXPORT_SYMBOL_GPL(unix_peer_get);
 200
 201 static inline void unix_release_addr(struct unix_address *addr)
 202 {
 203         if (atomic_dec_and_test(&addr->refcnt))
 204                 kfree(addr);
 205 }
 206
 207 /*
 208  *      Check unix socket name:
 209  *              - should be not zero length.
 210  *              - if started by not zero, should be NULL terminated (FS object)
 211  *              - if started by zero, it is abstract name.
 212  */
 213
 214 static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
 215 {
 216         if (len <= sizeof(short) || len > sizeof(*sunaddr))
 217                 return -EINVAL;
 218         if (!sunaddr || sunaddr->sun_family != AF_UNIX)
 219                 return -EINVAL;
 220         if (sunaddr->sun_path[0]) {
 221                 /*
 222                  * This may look like an off by one error but it is a bit more
 223                  * subtle. 108 is the longest valid AF_UNIX path for a binding.
 224                  * sun_path[108] doesn't as such exist.  However in kernel space
 225                  * we are guaranteed that it is a valid memory location in our
 226                  * kernel address buffer.
 227                  */
 228                 ((char *)sunaddr)[len] = 0;
 229                 len = strlen(sunaddr->sun_path)+1+sizeof(short);
 230                 return len;
 231         }
 232
 233         *hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
 234         return len;
 235 }
 236
 237 static void __unix_remove_socket(struct sock *sk)
 238 {
 239         sk_del_node_init(sk);
 240 }
 241
 242 static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
 243 {
 244         WARN_ON(!sk_unhashed(sk));
 245         sk_add_node(sk, list);
 246 }
 247
 248 static inline void unix_remove_socket(struct sock *sk)
 249 {
 250         spin_lock(&unix_table_lock);
 251         __unix_remove_socket(sk);
 252         spin_unlock(&unix_table_lock);
 253 }
 254
 255 static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
 256 {
 257         spin_lock(&unix_table_lock);
 258         __unix_insert_socket(list, sk);
 259         spin_unlock(&unix_table_lock);
 260 }
 261
 262 static struct sock *__unix_find_socket_byname(struct net *net,
 263                                               struct sockaddr_un *sunname,
 264                                               int len, int type, unsigned int hash)
 265 {
 266         struct sock *s;
 267
 268         sk_for_each(s, &unix_socket_table[hash ^ type]) {
 269                 struct unix_sock *u = unix_sk(s);
 270
 271                 if (!net_eq(sock_net(s), net))
 272                         continue;
 273
 274                 if (u->addr->len == len &&
 275                     !memcmp(u->addr->name, sunname, len))
 276                         goto found;
 277         }
 278         s = NULL;
 279 found:
 280         return s;
 281 }
 282
 283 static inline struct sock *unix_find_socket_byname(struct net *net,
 284                                                    struct sockaddr_un *sunname,
 285                                                    int len, int type,
 286                                                    unsigned int hash)
 287 {
 288         struct sock *s;
 289
 290         spin_lock(&unix_table_lock);
 291         s = __unix_find_socket_byname(net, sunname, len, type, hash);
 292         if (s)
 293                 sock_hold(s);
 294         spin_unlock(&unix_table_lock);
 295         return s;
 296 }
 297
 298 static struct sock *unix_find_socket_byinode(struct inode *i)
 299 {
 300         struct sock *s;
 301
 302         spin_lock(&unix_table_lock);
 303         sk_for_each(s,
 304                     &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
 305                 struct dentry *dentry = unix_sk(s)->path.dentry;
 306
 307                 if (dentry && dentry->d_inode == i) {
 308                         sock_hold(s);
 309                         goto found;
 310                 }
 311         }
 312         s = NULL;
 313 found:
 314         spin_unlock(&unix_table_lock);
 315         return s;
 316 }
 317
 318 static inline int unix_writable(struct sock *sk)
 319 {
 320         return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
 321 }
 322
 323 static void unix_write_space(struct sock *sk)
 324 {
 325         struct socket_wq *wq;
 326
 327         rcu_read_lock();
 328         if (unix_writable(sk)) {
 329                 wq = rcu_dereference(sk->sk_wq);
 330                 if (wq_has_sleeper(wq))
 331                         wake_up_interruptible_sync_poll(&wq->wait,
 332                                 POLLOUT | POLLWRNORM | POLLWRBAND);
 333                 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 334         }
 335         rcu_read_unlock();
 336 }
 337
 338 /* When dgram socket disconnects (or changes its peer), we clear its receive
 339  * queue of packets arrived from previous peer. First, it allows to do
 340  * flow control based only on wmem_alloc; second, sk connected to peer
 341  * may receive messages only from that peer. */
 342 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
 343 {
 344         if (!skb_queue_empty(&sk->sk_receive_queue)) {
 345                 skb_queue_purge(&sk->sk_receive_queue);
 346                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
 347
 348                 /* If one link of bidirectional dgram pipe is disconnected,
 349                  * we signal error. Messages are lost. Do not make this,
 350                  * when peer was not connected to us.
 351                  */
 352                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
 353                         other->sk_err = ECONNRESET;
 354                         other->sk_error_report(other);
 355                 }
 356         }
 357 }
 358
 359 static void unix_sock_destructor(struct sock *sk)
 360 {
 361         struct unix_sock *u = unix_sk(sk);
 362
 363         skb_queue_purge(&sk->sk_receive_queue);
 364
 365         WARN_ON(atomic_read(&sk->sk_wmem_alloc));
 366         WARN_ON(!sk_unhashed(sk));
 367         WARN_ON(sk->sk_socket);
 368         if (!sock_flag(sk, SOCK_DEAD)) {
 369                 printk(KERN_INFO "Attempt to release alive unix socket: %p\n", sk);
 370                 return;
 371         }
 372
 373         if (u->addr)
 374                 unix_release_addr(u->addr);
 375
 376         atomic_long_dec(&unix_nr_socks);
 377         local_bh_disable();
 378         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 379         local_bh_enable();
 380 #ifdef UNIX_REFCNT_DEBUG
 381         printk(KERN_DEBUG "UNIX %p is destroyed, %ld are still alive.\n", sk,
 382                 atomic_long_read(&unix_nr_socks));
 383 #endif
 384 }
 385
 386 static void unix_release_sock(struct sock *sk, int embrion)
 387 {
 388         struct unix_sock *u = unix_sk(sk);
 389         struct path path;
 390         struct sock *skpair;
 391         struct sk_buff *skb;
 392         int state;
 393
 394         unix_remove_socket(sk);
 395
 396         /* Clear state */
 397         unix_state_lock(sk);
 398         sock_orphan(sk);
 399         sk->sk_shutdown = SHUTDOWN_MASK;
 400         path         = u->path;
 401         u->path.dentry = NULL;
 402         u->path.mnt = NULL;
 403         state = sk->sk_state;
 404         sk->sk_state = TCP_CLOSE;
 405         unix_state_unlock(sk);
 406
 407         wake_up_interruptible_all(&u->peer_wait);
 408
 409         skpair = unix_peer(sk);
 410
 411         if (skpair != NULL) {
 412                 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
 413                         unix_state_lock(skpair);
 414                         /* No more writes */
 415                         skpair->sk_shutdown = SHUTDOWN_MASK;
 416                         if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
 417                                 skpair->sk_err = ECONNRESET;
 418                         unix_state_unlock(skpair);
 419                         skpair->sk_state_change(skpair);
 420                         sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
 421                 }
 422                 sock_put(skpair); /* It may now die */
 423                 unix_peer(sk) = NULL;
 424         }
 425
 426         /* Try to flush out this socket. Throw out buffers at least */
 427
 428         while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
 429                 if (state == TCP_LISTEN)
 430                         unix_release_sock(skb->sk, 1);
 431                 /* passed fds are erased in the kfree_skb hook        */
 432                 kfree_skb(skb);
 433         }
 434
 435         if (path.dentry)
 436                 path_put(&path);
 437
 438         sock_put(sk);
 439
 440         /* ---- Socket is dead now and most probably destroyed ---- */
 441
 442         /*
 443          * Fixme: BSD difference: In BSD all sockets connected to us get
 444          *        ECONNRESET and we die on the spot. In Linux we behave
 445          *        like files and pipes do and wait for the last
 446          *        dereference.
 447          *
 448          * Can't we simply set sock->err?
 449          *
 450          *        What the above comment does talk about? --ANK(980817)
 451          */
 452
 453         if (unix_tot_inflight)
 454                 unix_gc();              /* Garbage collect fds */
 455 }
 456
 457 static void init_peercred(struct sock *sk)
 458 {
 459         put_pid(sk->sk_peer_pid);
 460         if (sk->sk_peer_cred)
 461                 put_cred(sk->sk_peer_cred);
 462         sk->sk_peer_pid  = get_pid(task_tgid(current));
 463         sk->sk_peer_cred = get_current_cred();
 464 }
 465
 466 static void copy_peercred(struct sock *sk, struct sock *peersk)
 467 {
 468         put_pid(sk->sk_peer_pid);
 469         if (sk->sk_peer_cred)
 470                 put_cred(sk->sk_peer_cred);
 471         sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
 472         sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
 473 }
 474
 475 static int unix_listen(struct socket *sock, int backlog)
 476 {
 477         int err;
 478         struct sock *sk = sock->sk;
 479         struct unix_sock *u = unix_sk(sk);
 480         struct pid *old_pid = NULL;
 481
 482         err = -EOPNOTSUPP;
 483         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
 484                 goto out;       /* Only stream/seqpacket sockets accept */
 485         err = -EINVAL;
 486         if (!u->addr)
 487                 goto out;       /* No listens on an unbound socket */
 488         unix_state_lock(sk);
 489         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
 490                 goto out_unlock;
 491         if (backlog > sk->sk_max_ack_backlog)
 492                 wake_up_interruptible_all(&u->peer_wait);
 493         sk->sk_max_ack_backlog  = backlog;
 494         sk->sk_state            = TCP_LISTEN;
 495         /* set credentials so connect can copy them */
 496         init_peercred(sk);
 497         err = 0;
 498
 499 out_unlock:
 500         unix_state_unlock(sk);
 501         put_pid(old_pid);
 502 out:
 503         return err;
 504 }
 505
 506 static int unix_release(struct socket *);
 507 static int unix_bind(struct socket *, struct sockaddr *, int);
 508 static int unix_stream_connect(struct socket *, struct sockaddr *,
 509                                int addr_len, int flags);
 510 static int unix_socketpair(struct socket *, struct socket *);
 511 static int unix_accept(struct socket *, struct socket *, int);
 512 static int unix_getname(struct socket *, struct sockaddr *, int *, int);
 513 static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
 514 static unsigned int unix_dgram_poll(struct file *, struct socket *,
 515                                     poll_table *);
 516 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
 517 static int unix_shutdown(struct socket *, int);
 518 static int unix_stream_sendmsg(struct kiocb *, struct socket *,
 519                                struct msghdr *, size_t);
 520 static int unix_stream_recvmsg(struct kiocb *, struct socket *,
 521                                struct msghdr *, size_t, int);
 522 static int unix_dgram_sendmsg(struct kiocb *, struct socket *,
 523                               struct msghdr *, size_t);
 524 static int unix_dgram_recvmsg(struct kiocb *, struct socket *,
 525                               struct msghdr *, size_t, int);
 526 static int unix_dgram_connect(struct socket *, struct sockaddr *,
 527                               int, int);
 528 static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
 529                                   struct msghdr *, size_t);
 530 static int unix_seqpacket_recvmsg(struct kiocb *, struct socket *,
 531                                   struct msghdr *, size_t, int);
 532
 533 static void unix_set_peek_off(struct sock *sk, int val)
 534 {
 535         struct unix_sock *u = unix_sk(sk);
 536
 537         mutex_lock(&u->readlock);
 538         sk->sk_peek_off = val;
 539         mutex_unlock(&u->readlock);
 540 }
 541
 542
 543 static const struct proto_ops unix_stream_ops = {
 544         .family =       PF_UNIX,
 545         .owner =        THIS_MODULE,
 546         .release =      unix_release,
 547         .bind =         unix_bind,
 548         .connect =      unix_stream_connect,
 549         .socketpair =   unix_socketpair,
 550         .accept =       unix_accept,
 551         .getname =      unix_getname,
 552         .poll =         unix_poll,
 553         .ioctl =        unix_ioctl,
 554         .listen =       unix_listen,
 555         .shutdown =     unix_shutdown,
 556         .setsockopt =   sock_no_setsockopt,
 557         .getsockopt =   sock_no_getsockopt,
 558         .sendmsg =      unix_stream_sendmsg,
 559         .recvmsg =      unix_stream_recvmsg,
 560         .mmap =         sock_no_mmap,
 561         .sendpage =     sock_no_sendpage,
 562         .set_peek_off = unix_set_peek_off,
 563 };
 564
 565 static const struct proto_ops unix_dgram_ops = {
 566         .family =       PF_UNIX,
 567         .owner =        THIS_MODULE,
 568         .release =      unix_release,
 569         .bind =         unix_bind,
 570         .connect =      unix_dgram_connect,
 571         .socketpair =   unix_socketpair,
 572         .accept =       sock_no_accept,
 573         .getname =      unix_getname,
 574         .poll =         unix_dgram_poll,
 575         .ioctl =        unix_ioctl,
 576         .listen =       sock_no_listen,
 577         .shutdown =     unix_shutdown,
 578         .setsockopt =   sock_no_setsockopt,
 579         .getsockopt =   sock_no_getsockopt,
 580         .sendmsg =      unix_dgram_sendmsg,
 581         .recvmsg =      unix_dgram_recvmsg,
 582         .mmap =         sock_no_mmap,
 583         .sendpage =     sock_no_sendpage,
 584         .set_peek_off = unix_set_peek_off,
 585 };
 586
 587 static const struct proto_ops unix_seqpacket_ops = {
 588         .family =       PF_UNIX,
 589         .owner =        THIS_MODULE,
 590         .release =      unix_release,
 591         .bind =         unix_bind,
 592         .connect =      unix_stream_connect,
 593         .socketpair =   unix_socketpair,
 594         .accept =       unix_accept,
 595         .getname =      unix_getname,
 596         .poll =         unix_dgram_poll,
 597         .ioctl =        unix_ioctl,
 598         .listen =       unix_listen,
 599         .shutdown =     unix_shutdown,
 600         .setsockopt =   sock_no_setsockopt,
 601         .getsockopt =   sock_no_getsockopt,
 602         .sendmsg =      unix_seqpacket_sendmsg,
 603         .recvmsg =      unix_seqpacket_recvmsg,
 604         .mmap =         sock_no_mmap,
 605         .sendpage =     sock_no_sendpage,
 606         .set_peek_off = unix_set_peek_off,
 607 };
 608
 609 static struct proto unix_proto = {
 610         .name                   = "UNIX",
 611         .owner                  = THIS_MODULE,
 612         .obj_size               = sizeof(struct unix_sock),
 613 };
 614
 615 /*
 616  * AF_UNIX sockets do not interact with hardware, hence they
 617  * dont trigger interrupts - so it's safe for them to have
 618  * bh-unsafe locking for their sk_receive_queue.lock. Split off
 619  * this special lock-class by reinitializing the spinlock key:
 620  */
 621 static struct lock_class_key af_unix_sk_receive_queue_lock_key;
 622
 623 static struct sock *unix_create1(struct net *net, struct socket *sock)
 624 {
 625         struct sock *sk = NULL;
 626         struct unix_sock *u;
 627
 628         atomic_long_inc(&unix_nr_socks);
 629         if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
 630                 goto out;
 631
 632         sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto);
 633         if (!sk)
 634                 goto out;
 635
 636         sock_init_data(sock, sk);
 637         lockdep_set_class(&sk->sk_receive_queue.lock,
 638                                 &af_unix_sk_receive_queue_lock_key);
 639
 640         sk->sk_write_space      = unix_write_space;
 641         sk->sk_max_ack_backlog  = net->unx.sysctl_max_dgram_qlen;
 642         sk->sk_destruct         = unix_sock_destructor;
 643         u         = unix_sk(sk);
 644         u->path.dentry = NULL;
 645         u->path.mnt = NULL;
 646         spin_lock_init(&u->lock);
 647         atomic_long_set(&u->inflight, 0);
 648         INIT_LIST_HEAD(&u->link);
 649         mutex_init(&u->readlock); /* single task reading lock */
 650         init_waitqueue_head(&u->peer_wait);
 651         unix_insert_socket(unix_sockets_unbound(sk), sk);
 652 out:
 653         if (sk == NULL)
 654                 atomic_long_dec(&unix_nr_socks);
 655         else {
 656                 local_bh_disable();
 657                 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 658                 local_bh_enable();
 659         }
 660         return sk;
 661 }
 662
 663 static int unix_create(struct net *net, struct socket *sock, int protocol,
 664                        int kern)
 665 {
 666         if (protocol && protocol != PF_UNIX)
 667                 return -EPROTONOSUPPORT;
 668
 669         sock->state = SS_UNCONNECTED;
 670
 671         switch (sock->type) {
 672         case SOCK_STREAM:
 673                 sock->ops = &unix_stream_ops;
 674                 break;
 675                 /*
 676                  *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
 677                  *      nothing uses it.
 678                  */
 679         case SOCK_RAW:
 680                 sock->type = SOCK_DGRAM;
 681         case SOCK_DGRAM:
 682                 sock->ops = &unix_dgram_ops;
 683                 break;
 684         case SOCK_SEQPACKET:
 685                 sock->ops = &unix_seqpacket_ops;
 686                 break;
 687         default:
 688                 return -ESOCKTNOSUPPORT;
 689         }
 690
 691         return unix_create1(net, sock) ? 0 : -ENOMEM;
 692 }
 693
 694 static int unix_release(struct socket *sock)
 695 {
 696         struct sock *sk = sock->sk;
 697
 698         if (!sk)
 699                 return 0;
 700
 701         unix_release_sock(sk, 0);
 702         sock->sk = NULL;
 703
 704         return 0;
 705 }
 706
 707 static int unix_autobind(struct socket *sock)
 708 {
 709         struct sock *sk = sock->sk;
 710         struct net *net = sock_net(sk);
 711         struct unix_sock *u = unix_sk(sk);
 712         static u32 ordernum = 1;
 713         struct unix_address *addr;
 714         int err;
 715         unsigned int retries = 0;
 716
 717         mutex_lock(&u->readlock);
 718
 719         err = 0;
 720         if (u->addr)
 721                 goto out;
 722
 723         err = -ENOMEM;
 724         addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
 725         if (!addr)
 726                 goto out;
 727
 728         addr->name->sun_family = AF_UNIX;
 729         atomic_set(&addr->refcnt, 1);
 730
 731 retry:
 732         addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
 733         addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
 734
 735         spin_lock(&unix_table_lock);
 736         ordernum = (ordernum+1)&0xFFFFF;
 737
 738         if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
 739                                       addr->hash)) {
 740                 spin_unlock(&unix_table_lock);
 741                 /*
 742                  * __unix_find_socket_byname() may take long time if many names
 743                  * are already in use.
 744                  */
 745                 cond_resched();
 746                 /* Give up if all names seems to be in use. */
 747                 if (retries++ == 0xFFFFF) {
 748                         err = -ENOSPC;
 749                         kfree(addr);
 750                         goto out;
 751                 }
 752                 goto retry;
 753         }
 754         addr->hash ^= sk->sk_type;
 755
 756         __unix_remove_socket(sk);
 757         u->addr = addr;
 758         __unix_insert_socket(&unix_socket_table[addr->hash], sk);
 759         spin_unlock(&unix_table_lock);
 760         err = 0;
 761
 762 out:    mutex_unlock(&u->readlock);
 763         return err;
 764 }
 765
 766 static struct sock *unix_find_other(struct net *net,
 767                                     struct sockaddr_un *sunname, int len,
 768                                     int type, unsigned int hash, int *error)
 769 {
 770         struct sock *u;
 771         struct path path;
 772         int err = 0;
 773
 774         if (sunname->sun_path[0]) {
 775                 struct inode *inode;
 776                 err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
 777                 if (err)
 778                         goto fail;
 779                 inode = path.dentry->d_inode;
 780                 err = inode_permission(inode, MAY_WRITE);
 781                 if (err)
 782                         goto put_fail;
 783
 784                 err = -ECONNREFUSED;
 785                 if (!S_ISSOCK(inode->i_mode))
 786                         goto put_fail;
 787                 u = unix_find_socket_byinode(inode);
 788                 if (!u)
 789                         goto put_fail;
 790
 791                 if (u->sk_type == type)
 792                         touch_atime(&path);
 793
 794                 path_put(&path);
 795
 796                 err = -EPROTOTYPE;
 797                 if (u->sk_type != type) {
 798                         sock_put(u);
 799                         goto fail;
 800                 }
 801         } else {
 802                 err = -ECONNREFUSED;
 803                 u = unix_find_socket_byname(net, sunname, len, type, hash);
 804                 if (u) {
 805                         struct dentry *dentry;
 806                         dentry = unix_sk(u)->path.dentry;
 807                         if (dentry)
 808                                 touch_atime(&unix_sk(u)->path);
 809                 } else
 810                         goto fail;
 811         }
 812         return u;
 813
 814 put_fail:
 815         path_put(&path);
 816 fail:
 817         *error = err;
 818         return NULL;
 819 }
 820
 821 static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
 822 {
 823         struct dentry *dentry;
 824         struct path path;
 825         int err = 0;
 826         /*
 827          * Get the parent directory, calculate the hash for last
 828          * component.
 829          */
 830         dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
 831         err = PTR_ERR(dentry);
 832         if (IS_ERR(dentry))
 833                 return err;
 834
 835         /*
 836          * All right, let's create it.
 837          */
 838         err = security_path_mknod(&path, dentry, mode, 0);
 839         if (!err) {
 840                 err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0);
 841                 if (!err) {
 842                         res->mnt = mntget(path.mnt);
 843                         res->dentry = dget(dentry);
 844                 }
 845         }
 846         done_path_create(&path, dentry);
 847         return err;
 848 }
 849
 850 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 851 {
 852         struct sock *sk = sock->sk;
 853         struct net *net = sock_net(sk);
 854         struct unix_sock *u = unix_sk(sk);
 855         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
 856         char *sun_path = sunaddr->sun_path;
 857         int err;
 858         unsigned int hash;
 859         struct unix_address *addr;
 860         struct hlist_head *list;
 861
 862         err = -EINVAL;
 863         if (sunaddr->sun_family != AF_UNIX)
 864                 goto out;
 865
 866         if (addr_len == sizeof(short)) {
 867                 err = unix_autobind(sock);
 868                 goto out;
 869         }
 870
 871         err = unix_mkname(sunaddr, addr_len, &hash);
 872         if (err < 0)
 873                 goto out;
 874         addr_len = err;
 875
 876         mutex_lock(&u->readlock);
 877
 878         err = -EINVAL;
 879         if (u->addr)
 880                 goto out_up;
 881
 882         err = -ENOMEM;
 883         addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
 884         if (!addr)
 885                 goto out_up;
 886
 887         memcpy(addr->name, sunaddr, addr_len);
 888         addr->len = addr_len;
 889         addr->hash = hash ^ sk->sk_type;
 890         atomic_set(&addr->refcnt, 1);
 891
 892         if (sun_path[0]) {
 893                 struct path path;
 894                 umode_t mode = S_IFSOCK |
 895                        (SOCK_INODE(sock)->i_mode & ~current_umask());
 896                 err = unix_mknod(sun_path, mode, &path);
 897                 if (err) {
 898                         if (err == -EEXIST)
 899                                 err = -EADDRINUSE;
 900                         unix_release_addr(addr);
 901                         goto out_up;
 902                 }
 903                 addr->hash = UNIX_HASH_SIZE;
 904                 hash = path.dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1);
 905                 spin_lock(&unix_table_lock);
 906                 u->path = path;
 907                 list = &unix_socket_table[hash];
 908         } else {
 909                 spin_lock(&unix_table_lock);
 910                 err = -EADDRINUSE;
 911                 if (__unix_find_socket_byname(net, sunaddr, addr_len,
 912                                               sk->sk_type, hash)) {
 913                         unix_release_addr(addr);
 914                         goto out_unlock;
 915                 }
 916
 917                 list = &unix_socket_table[addr->hash];
 918         }
 919
 920         err = 0;
 921         __unix_remove_socket(sk);
 922         u->addr = addr;
 923         __unix_insert_socket(list, sk);
 924
 925 out_unlock:
 926         spin_unlock(&unix_table_lock);
 927 out_up:
 928         mutex_unlock(&u->readlock);
 929 out:
 930         return err;
 931 }
 932
 933 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
 934 {
 935         if (unlikely(sk1 == sk2) || !sk2) {
 936                 unix_state_lock(sk1);
 937                 return;
 938         }
 939         if (sk1 < sk2) {
 940                 unix_state_lock(sk1);
 941                 unix_state_lock_nested(sk2);
 942         } else {
 943                 unix_state_lock(sk2);
 944                 unix_state_lock_nested(sk1);
 945         }
 946 }
 947
 948 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
 949 {
 950         if (unlikely(sk1 == sk2) || !sk2) {
 951                 unix_state_unlock(sk1);
 952                 return;
 953         }
 954         unix_state_unlock(sk1);
 955         unix_state_unlock(sk2);
 956 }
 957
 958 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
 959                               int alen, int flags)
 960 {
 961         struct sock *sk = sock->sk;
 962         struct net *net = sock_net(sk);
 963         struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
 964         struct sock *other;
 965         unsigned int hash;
 966         int err;
 967
 968         if (addr->sa_family != AF_UNSPEC) {
 969                 err = unix_mkname(sunaddr, alen, &hash);
 970                 if (err < 0)
 971                         goto out;
 972                 alen = err;
 973
 974                 if (test_bit(SOCK_PASSCRED, &sock->flags) &&
 975                     !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
 976                         goto out;
 977
 978 restart:
 979                 other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
 980                 if (!other)
 981                         goto out;
 982
 983                 unix_state_double_lock(sk, other);
 984
 985                 /* Apparently VFS overslept socket death. Retry. */
 986                 if (sock_flag(other, SOCK_DEAD)) {
 987                         unix_state_double_unlock(sk, other);
 988                         sock_put(other);
 989                         goto restart;
 990                 }
 991
 992                 err = -EPERM;
 993                 if (!unix_may_send(sk, other))
 994                         goto out_unlock;
 995
 996                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
 997                 if (err)
 998                         goto out_unlock;
 999
1000         } else {
1001                 /*
1002                  *      1003.1g breaking connected state with AF_UNSPEC
1003                  */
1004                 other = NULL;
1005                 unix_state_double_lock(sk, other);
1006         }
1007
1008         /*
1009          * If it was connected, reconnect.
1010          */
1011         if (unix_peer(sk)) {
1012                 struct sock *old_peer = unix_peer(sk);
1013                 unix_peer(sk) = other;
1014                 unix_state_double_unlock(sk, other);
1015
1016                 if (other != old_peer)
1017                         unix_dgram_disconnected(sk, old_peer);
1018                 sock_put(old_peer);
1019         } else {
1020                 unix_peer(sk) = other;
1021                 unix_state_double_unlock(sk, other);
1022         }
1023         return 0;
1024
1025 out_unlock:
1026         unix_state_double_unlock(sk, other);
1027         sock_put(other);
1028 out:
1029         return err;
1030 }
1031
1032 static long unix_wait_for_peer(struct sock *other, long timeo)
1033 {
1034         struct unix_sock *u = unix_sk(other);
1035         int sched;
1036         DEFINE_WAIT(wait);
1037
1038         prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1039
1040         sched = !sock_flag(other, SOCK_DEAD) &&
1041                 !(other->sk_shutdown & RCV_SHUTDOWN) &&
1042                 unix_recvq_full(other);
1043
1044         unix_state_unlock(other);
1045
1046         if (sched)
1047                 timeo = schedule_timeout(timeo);
1048
1049         finish_wait(&u->peer_wait, &wait);
1050         return timeo;
1051 }
1052
/*
 * connect() for connection-oriented AF_UNIX sockets.
 *
 * Allocates the socket that will become the accepted end (newsk) plus a
 * one-byte skb, finds the listening socket, and - with both state locks held -
 * wires sk and newsk together as peers.  The skb is then queued on the
 * listener's receive queue.
 *
 * Returns 0 on success or a negative errno (-ENOMEM, -ECONNREFUSED, -EAGAIN,
 * -EISCONN, -EINVAL, a signal-related errno from sock_intr_errno(), or an
 * error from unix_mkname() / unix_autobind() / unix_find_other() / the
 * security hook).
 */
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	struct unix_sock *child_u, *listener_u;
	struct sock *newsk = NULL;
	struct sock *other = NULL;
	struct sk_buff *skb = NULL;
	unsigned int hash;
	long timeo;
	int own_state;
	int err;

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto release;
	addr_len = err;

	/* SOCK_PASSCRED on an unbound socket: autobind it first. */
	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) {
		err = unix_autobind(sock);
		if (err != 0)
			goto release;
	}

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/*
	 * Allocate everything up front.  Doing it after the state is locked
	 * would force all checks to be repeated anyway.
	 */
	err = -ENOMEM;

	/* The socket that will represent the completed connection. */
	newsk = unix_create1(sock_net(sk), NULL);
	if (!newsk)
		goto release;

	/* The skb that gets queued on the listening socket. */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (!skb)
		goto release;

restart:
	/* Find the listening socket. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
	if (!other)
		goto release;

	/* Latch the peer's state. */
	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto unlock_other;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto unlock_other;

	if (unix_recvq_full(other)) {
		if (!timeo) {
			err = -EAGAIN;
			goto unlock_other;
		}

		/* unix_wait_for_peer() releases other's state lock. */
		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto release;
		sock_put(other);
		goto restart;
	}

	/*
	 * Latch our own state.
	 *
	 * This is the tricky part: our state lock is needed while the peer's
	 * lock stays held, which could deadlock.  Connecting to ourselves and
	 * simultaneous connect attempts are ruled out by the state check:
	 * other is TCP_LISTEN, and if sk were TCP_LISTEN it is rejected here
	 * before its lock is taken.  The state is re-checked once locked.
	 */
	own_state = sk->sk_state;
	if (own_state != TCP_CLOSE) {
		/* Already connected, or in a state that cannot connect. */
		err = (own_state == TCP_ESTABLISHED) ? -EISCONN : -EINVAL;
		goto unlock_other;
	}

	unix_state_lock_nested(sk);

	if (sk->sk_state != own_state) {
		unix_state_unlock(sk);
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto unlock_other;
	}

	/* The way is open: fill in the new socket. */
	sock_hold(sk);
	unix_peer(newsk) = sk;
	newsk->sk_state = TCP_ESTABLISHED;
	newsk->sk_type = sk->sk_type;
	init_peercred(newsk);
	child_u = unix_sk(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, &child_u->peer_wq);
	listener_u = unix_sk(other);

	/* Share the listener's address and path with the new socket. */
	if (listener_u->addr) {
		atomic_inc(&listener_u->addr->refcnt);
		child_u->addr = listener_u->addr;
	}
	if (listener_u->path.dentry) {
		path_get(&listener_u->path);
		child_u->path = listener_u->path;
	}

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state = SS_CONNECTED;
	sk->sk_state = TCP_ESTABLISHED;
	sock_hold(newsk);

	smp_mb__after_atomic_inc();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk) = newsk;

	unix_state_unlock(sk);

	/* Take ten and send info to the listening sock. */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other, 0);
	sock_put(other);
	return 0;

unlock_other:
	if (other)
		unix_state_unlock(other);

release:
	kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}
1227
1228 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1229 {
1230         struct sock *ska = socka->sk, *skb = sockb->sk;
1231
1232         /* Join our sockets back to back */
1233         sock_hold(ska);
1234         sock_hold(skb);
1235         unix_peer(ska) = skb;
1236         unix_peer(skb) = ska;
1237         init_peercred(ska);
1238         init_peercred(skb);
1239
1240         if (ska->sk_type != SOCK_DGRAM) {
1241                 ska->sk_state = TCP_ESTABLISHED;
1242                 skb->sk_state = TCP_ESTABLISHED;
1243                 socka->state  = SS_CONNECTED;
1244                 sockb->state  = SS_CONNECTED;
1245         }
1246         return 0;
1247 }
1248
1249 static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
1250 {
1251         struct sock *sk = sock->sk;
1252         struct sock *tsk;
1253         struct sk_buff *skb;
1254         int err;
1255
1256         err = -EOPNOTSUPP;
1257         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1258                 goto out;
1259
1260         err = -EINVAL;
1261         if (sk->sk_state != TCP_LISTEN)
1262                 goto out;
1263
1264         /* If socket state is TCP_LISTEN it cannot change (for now...),
1265          * so that no locks are necessary.
1266          */
1267
1268         skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1269         if (!skb) {
1270                 /* This means receive shutdown. */
1271                 if (err == 0)
1272                         err = -EINVAL;
1273                 goto out;
1274         }
1275
1276         tsk = skb->sk;
1277         skb_free_datagram(sk, skb);
1278         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1279
1280         /* attach accepted sock to socket */
1281         unix_state_lock(tsk);
1282         newsock->state = SS_CONNECTED;
1283         sock_graft(tsk, newsock);
1284         unix_state_unlock(tsk);
1285         return 0;
1286
1287 out:
1288         return err;
1289 }
1290
1291
1292 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1293 {
1294         struct sock *sk = sock->sk;
1295         struct unix_sock *u;
1296         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1297         int err = 0;
1298
1299         if (peer) {
1300                 sk = unix_peer_get(sk);
1301
1302                 err = -ENOTCONN;
1303                 if (!sk)
1304                         goto out;
1305                 err = 0;
1306         } else {
1307                 sock_hold(sk);
1308         }
1309
1310         u = unix_sk(sk);
1311         unix_state_lock(sk);
1312         if (!u->addr) {
1313                 sunaddr->sun_family = AF_UNIX;
1314                 sunaddr->sun_path[0] = 0;
1315                 *uaddr_len = sizeof(short);
1316         } else {
1317                 struct unix_address *addr = u->addr;
1318
1319                 *uaddr_len = addr->len;
1320                 memcpy(sunaddr, addr->name, *uaddr_len);
1321         }
1322         unix_state_unlock(sk);
1323         sock_put(sk);
1324 out:
1325         return err;
1326 }
1327
1328 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1329 {
1330         int i;
1331
1332         scm->fp = UNIXCB(skb).fp;
1333         UNIXCB(skb).fp = NULL;
1334
1335         for (i = scm->fp->count-1; i >= 0; i--)
1336                 unix_notinflight(scm->fp->fp[i]);
1337 }
1338
1339 static void unix_destruct_scm(struct sk_buff *skb)
1340 {
1341         struct scm_cookie scm;
1342         memset(&scm, 0, sizeof(scm));
1343         scm.pid  = UNIXCB(skb).pid;
1344         if (UNIXCB(skb).fp)
1345                 unix_detach_fds(&scm, skb);
1346
1347         /* Alas, it calls VFS */
1348         /* So fscking what? fput() had been SMP-safe since the last Summer */
1349         scm_destroy(&scm);
1350         sock_wfree(skb);
1351 }
1352
1353 #define MAX_RECURSION_LEVEL 4
1354
1355 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1356 {
1357         int i;
1358         unsigned char max_level = 0;
1359         int unix_sock_count = 0;
1360
1361         for (i = scm->fp->count - 1; i >= 0; i--) {
1362                 struct sock *sk = unix_get_socket(scm->fp->fp[i]);
1363
1364                 if (sk) {
1365                         unix_sock_count++;
1366                         max_level = max(max_level,
1367                                         unix_sk(sk)->recursion_level);
1368                 }
1369         }
1370         if (unlikely(max_level > MAX_RECURSION_LEVEL))
1371                 return -ETOOMANYREFS;
1372
1373         /*
1374          * Need to duplicate file references for the sake of garbage
1375          * collection.  Otherwise a socket in the fps might become a
1376          * candidate for GC while the skb is not yet queued.
1377          */
1378         UNIXCB(skb).fp = scm_fp_dup(scm->fp);
1379         if (!UNIXCB(skb).fp)
1380                 return -ENOMEM;
1381
1382         if (unix_sock_count) {
1383                 for (i = scm->fp->count - 1; i >= 0; i--)
1384                         unix_inflight(scm->fp->fp[i]);
1385         }
1386         return max_level;
1387 }
1388
1389 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1390 {
1391         int err = 0;
1392
1393         UNIXCB(skb).pid  = get_pid(scm->pid);
1394         UNIXCB(skb).uid = scm->creds.uid;
1395         UNIXCB(skb).gid = scm->creds.gid;
1396         UNIXCB(skb).fp = NULL;
1397         if (scm->fp && send_fds)
1398                 err = unix_attach_fds(scm, skb);
1399
1400         skb->destructor = unix_destruct_scm;
1401         return err;
1402 }
1403
1404 /*
1405  * Some apps rely on write() giving SCM_CREDENTIALS
1406  * We include credentials if source or destination socket
1407  * asserted SOCK_PASSCRED.
1408  */
1409 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1410                             const struct sock *other)
1411 {
1412         if (UNIXCB(skb).pid)
1413                 return;
1414         if (test_bit(SOCK_PASSCRED, &sock->flags) ||
1415             !other->sk_socket ||
1416             test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) {
1417                 UNIXCB(skb).pid  = get_pid(task_tgid(current));
1418                 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1419         }
1420 }
1421
1422 /*
1423  *      Send AF_UNIX data.
1424  */
1425
1426 static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
1427                               struct msghdr *msg, size_t len)
1428 {
1429         struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1430         struct sock *sk = sock->sk;
1431         struct net *net = sock_net(sk);
1432         struct unix_sock *u = unix_sk(sk);
1433         struct sockaddr_un *sunaddr = msg->msg_name;
1434         struct sock *other = NULL;
1435         int namelen = 0; /* fake GCC */
1436         int err;
1437         unsigned int hash;
1438         struct sk_buff *skb;
1439         long timeo;
1440         struct scm_cookie tmp_scm;
1441         int max_level;
1442         int data_len = 0;
1443
1444         if (NULL == siocb->scm)
1445                 siocb->scm = &tmp_scm;
1446         wait_for_unix_gc();
1447         err = scm_send(sock, msg, siocb->scm, false);
1448         if (err < 0)
1449                 return err;
1450
1451         err = -EOPNOTSUPP;
1452         if (msg->msg_flags&MSG_OOB)
1453                 goto out;
1454
1455         if (msg->msg_namelen) {
1456                 err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1457                 if (err < 0)
1458                         goto out;
1459                 namelen = err;
1460         } else {
1461                 sunaddr = NULL;
1462                 err = -ENOTCONN;
1463                 other = unix_peer_get(sk);
1464                 if (!other)
1465                         goto out;
1466         }
1467
1468         if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
1469             && (err = unix_autobind(sock)) != 0)
1470                 goto out;
1471
1472         err = -EMSGSIZE;
1473         if (len > sk->sk_sndbuf - 32)
1474                 goto out;
1475
1476         if (len > SKB_MAX_ALLOC)
1477                 data_len = min_t(size_t,
1478                                  len - SKB_MAX_ALLOC,
1479                                  MAX_SKB_FRAGS * PAGE_SIZE);
1480
1481         skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1482                                    msg->msg_flags & MSG_DONTWAIT, &err);
1483         if (skb == NULL)
1484                 goto out;
1485
1486         err = unix_scm_to_skb(siocb->scm, skb, true);
1487         if (err < 0)
1488                 goto out_free;
1489         max_level = err + 1;
1490         unix_get_secdata(siocb->scm, skb);
1491
1492         skb_put(skb, len - data_len);
1493         skb->data_len = data_len;
1494         skb->len = len;
1495         err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov, 0, len);
1496         if (err)
1497                 goto out_free;
1498
1499         timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1500
1501 restart:
1502         if (!other) {
1503                 err = -ECONNRESET;
1504                 if (sunaddr == NULL)
1505                         goto out_free;
1506
1507                 other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
1508                                         hash, &err);
1509                 if (other == NULL)
1510                         goto out_free;
1511         }
1512
1513         if (sk_filter(other, skb) < 0) {
1514                 /* Toss the packet but do not return any error to the sender */
1515                 err = len;
1516                 goto out_free;
1517         }
1518
1519         unix_state_lock(other);
1520         err = -EPERM;
1521         if (!unix_may_send(sk, other))
1522                 goto out_unlock;
1523
1524         if (sock_flag(other, SOCK_DEAD)) {
1525                 /*
1526                  *      Check with 1003.1g - what should
1527                  *      datagram error
1528                  */
1529                 unix_state_unlock(other);
1530                 sock_put(other);
1531
1532                 err = 0;
1533                 unix_state_lock(sk);
1534                 if (unix_peer(sk) == other) {
1535                         unix_peer(sk) = NULL;
1536                         unix_state_unlock(sk);
1537
1538                         unix_dgram_disconnected(sk, other);
1539                         sock_put(other);
1540                         err = -ECONNREFUSED;
1541                 } else {
1542                         unix_state_unlock(sk);
1543                 }
1544
1545                 other = NULL;
1546                 if (err)
1547                         goto out_free;
1548                 goto restart;
1549         }
1550
1551         err = -EPIPE;
1552         if (other->sk_shutdown & RCV_SHUTDOWN)
1553                 goto out_unlock;
1554
1555         if (sk->sk_type != SOCK_SEQPACKET) {
1556                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1557                 if (err)
1558                         goto out_unlock;
1559         }
1560
1561         if (unix_peer(other) != sk && unix_recvq_full(other)) {
1562                 if (!timeo) {
1563                         err = -EAGAIN;
1564                         goto out_unlock;
1565                 }
1566
1567                 timeo = unix_wait_for_peer(other, timeo);
1568
1569                 err = sock_intr_errno(timeo);
1570                 if (signal_pending(current))
1571                         goto out_free;
1572
1573                 goto restart;
1574         }
1575
1576         if (sock_flag(other, SOCK_RCVTSTAMP))
1577                 __net_timestamp(skb);
1578         maybe_add_creds(skb, sock, other);
1579         skb_queue_tail(&other->sk_receive_queue, skb);
1580         if (max_level > unix_sk(other)->recursion_level)
1581                 unix_sk(other)->recursion_level = max_level;
1582         unix_state_unlock(other);
1583         other->sk_data_ready(other, len);
1584         sock_put(other);
1585         scm_destroy(siocb->scm);
1586         return len;
1587
1588 out_unlock:
1589         unix_state_unlock(other);
1590 out_free:
1591         kfree_skb(skb);
1592 out:
1593         if (other)
1594                 sock_put(other);
1595         scm_destroy(siocb->scm);
1596         return err;
1597 }
1598
1599
1600 static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
1601                                struct msghdr *msg, size_t len)
1602 {
1603         struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1604         struct sock *sk = sock->sk;
1605         struct sock *other = NULL;
1606         int err, size;
1607         struct sk_buff *skb;
1608         int sent = 0;
1609         struct scm_cookie tmp_scm;
1610         bool fds_sent = false;
1611         int max_level;
1612
1613         if (NULL == siocb->scm)
1614                 siocb->scm = &tmp_scm;
1615         wait_for_unix_gc();
1616         err = scm_send(sock, msg, siocb->scm, false);
1617         if (err < 0)
1618                 return err;
1619
1620         err = -EOPNOTSUPP;
1621         if (msg->msg_flags&MSG_OOB)
1622                 goto out_err;
1623
1624         if (msg->msg_namelen) {
1625                 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1626                 goto out_err;
1627         } else {
1628                 err = -ENOTCONN;
1629                 other = unix_peer(sk);
1630                 if (!other)
1631                         goto out_err;
1632         }
1633
1634         if (sk->sk_shutdown & SEND_SHUTDOWN)
1635                 goto pipe_err;
1636
1637         while (sent < len) {
1638                 /*
1639                  *      Optimisation for the fact that under 0.01% of X
1640                  *      messages typically need breaking up.
1641                  */
1642
1643                 size = len-sent;
1644
1645                 /* Keep two messages in the pipe so it schedules better */
1646                 if (size > ((sk->sk_sndbuf >> 1) - 64))
1647                         size = (sk->sk_sndbuf >> 1) - 64;
1648
1649                 if (size > SKB_MAX_ALLOC)
1650                         size = SKB_MAX_ALLOC;
1651
1652                 /*
1653                  *      Grab a buffer
1654                  */
1655
1656                 skb = sock_alloc_send_skb(sk, size, msg->msg_flags&MSG_DONTWAIT,
1657                                           &err);
1658
1659                 if (skb == NULL)
1660                         goto out_err;
1661
1662                 /*
1663                  *      If you pass two values to the sock_alloc_send_skb
1664                  *      it tries to grab the large buffer with GFP_NOFS
1665                  *      (which can fail easily), and if it fails grab the
1666                  *      fallback size buffer which is under a page and will
1667                  *      succeed. [Alan]
1668                  */
1669                 size = min_t(int, size, skb_tailroom(skb));
1670
1671
1672                 /* Only send the fds in the first buffer */
1673                 err = unix_scm_to_skb(siocb->scm, skb, !fds_sent);
1674                 if (err < 0) {
1675                         kfree_skb(skb);
1676                         goto out_err;
1677                 }
1678                 max_level = err + 1;
1679                 fds_sent = true;
1680
1681                 err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size);
1682                 if (err) {
1683                         kfree_skb(skb);
1684                         goto out_err;
1685                 }
1686
1687                 unix_state_lock(other);
1688
1689                 if (sock_flag(other, SOCK_DEAD) ||
1690                     (other->sk_shutdown & RCV_SHUTDOWN))
1691                         goto pipe_err_free;
1692
1693                 maybe_add_creds(skb, sock, other);
1694                 skb_queue_tail(&other->sk_receive_queue, skb);
1695                 if (max_level > unix_sk(other)->recursion_level)
1696                         unix_sk(other)->recursion_level = max_level;
1697                 unix_state_unlock(other);
1698                 other->sk_data_ready(other, size);
1699                 sent += size;
1700         }
1701
1702         scm_destroy(siocb->scm);
1703         siocb->scm = NULL;
1704
1705         return sent;
1706
1707 pipe_err_free:
1708         unix_state_unlock(other);
1709         kfree_skb(skb);
1710 pipe_err:
1711         if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
1712                 send_sig(SIGPIPE, current, 0);
1713         err = -EPIPE;
1714 out_err:
1715         scm_destroy(siocb->scm);
1716         siocb->scm = NULL;
1717         return sent ? : err;
1718 }
1719
1720 static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock,
1721                                   struct msghdr *msg, size_t len)
1722 {
1723         int err;
1724         struct sock *sk = sock->sk;
1725
1726         err = sock_error(sk);
1727         if (err)
1728                 return err;
1729
1730         if (sk->sk_state != TCP_ESTABLISHED)
1731                 return -ENOTCONN;
1732
1733         if (msg->msg_namelen)
1734                 msg->msg_namelen = 0;
1735
1736         return unix_dgram_sendmsg(kiocb, sock, msg, len);
1737 }
1738
1739 static int unix_seqpacket_recvmsg(struct kiocb *iocb, struct socket *sock,
1740                               struct msghdr *msg, size_t size,
1741                               int flags)
1742 {
1743         struct sock *sk = sock->sk;
1744
1745         if (sk->sk_state != TCP_ESTABLISHED)
1746                 return -ENOTCONN;
1747
1748         return unix_dgram_recvmsg(iocb, sock, msg, size, flags);
1749 }
1750
1751 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
1752 {
1753         struct unix_sock *u = unix_sk(sk);
1754
1755         msg->msg_namelen = 0;
1756         if (u->addr) {
1757                 msg->msg_namelen = u->addr->len;
1758                 memcpy(msg->msg_name, u->addr->name, u->addr->len);
1759         }
1760 }
1761
/*
 * Receive a single datagram from the socket's receive queue.
 *
 * unix_seqpacket_recvmsg() also ends up here once it has checked the
 * connection state.
 *
 * Flow:
 *  - MSG_OOB is rejected with -EOPNOTSUPP.
 *  - u->readlock is taken interruptibly; if that is interrupted the result
 *    of sock_intr_errno() for the receive timeout is returned.
 *  - One skb is obtained via __skb_recv_datagram(), starting at the current
 *    peek offset.  If none is returned, its error is passed back, except
 *    that a SOCK_SEQPACKET socket with RCV_SHUTDOWN set turns -EAGAIN
 *    into 0.
 *  - Waiters on u->peer_wait are woken with POLLOUT-type events.
 *  - If msg->msg_name is set, the address of the skb's owning socket is
 *    copied with unix_copy_addr().
 *  - At most @size bytes are copied to the iovec; MSG_TRUNC is set in
 *    msg->msg_flags when the datagram holds more than was requested.
 *  - Credentials, security data and passed file descriptors are attached
 *    to the scm cookie and delivered through scm_recv().
 *
 * Returns the number of bytes copied, or the remaining datagram length
 * (skb->len - skip) when the caller passed MSG_TRUNC in @flags, or a
 * negative errno.
 */
1762 static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
1763                               struct msghdr *msg, size_t size,
1764                               int flags)
1765 {
1766         struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1767         struct scm_cookie tmp_scm;
1768         struct sock *sk = sock->sk;
1769         struct unix_sock *u = unix_sk(sk);
1770         int noblock = flags & MSG_DONTWAIT;
1771         struct sk_buff *skb;
1772         int err;
1773         int peeked, skip;
1774
1775         err = -EOPNOTSUPP;
1776         if (flags&MSG_OOB)
1777                 goto out;
1778
1779         msg->msg_namelen = 0;
1780
1781         err = mutex_lock_interruptible(&u->readlock);
1782         if (err) {
1783                 err = sock_intr_errno(sock_rcvtimeo(sk, noblock));
1784                 goto out;
1785         }
1786
        /* Byte offset at which a MSG_PEEK-aware read should start. */
1787         skip = sk_peek_offset(sk, flags);
1788
1789         skb = __skb_recv_datagram(sk, flags, &peeked, &skip, &err);
1790         if (!skb) {
1791                 unix_state_lock(sk);
1792                 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
1793                 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
1794                     (sk->sk_shutdown & RCV_SHUTDOWN))
1795                         err = 0;
1796                 unix_state_unlock(sk);
1797                 goto out_unlock;
1798         }
1799
        /* Wake anything waiting on this socket's peer_wait queue for write readiness. */
1800         wake_up_interruptible_sync_poll(&u->peer_wait,
1801                                         POLLOUT | POLLWRNORM | POLLWRBAND);
1802
1803         if (msg->msg_name)
1804                 unix_copy_addr(msg, skb->sk);
1805
        /* Clamp the copy to what is left in the skb; flag truncation otherwise. */
1806         if (size > skb->len - skip)
1807                 size = skb->len - skip;
1808         else if (size < skb->len - skip)
1809                 msg->msg_flags |= MSG_TRUNC;
1810
1811         err = skb_copy_datagram_iovec(skb, skip, msg->msg_iov, size);
1812         if (err)
1813                 goto out_free;
1814
1815         if (sock_flag(sk, SOCK_RCVTSTAMP))
1816                 __sock_recv_timestamp(msg, sk, skb);
1817
        /* Use an on-stack cookie when the caller did not provide one. */
1818         if (!siocb->scm) {
1819                 siocb->scm = &tmp_scm;
1820                 memset(&tmp_scm, 0, sizeof(tmp_scm));
1821         }
1822         scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
1823         unix_set_secdata(siocb->scm, skb);
1824
1825         if (!(flags & MSG_PEEK)) {
                /* Real read: move any passed fds from the skb to the cookie. */
1826                 if (UNIXCB(skb).fp)
1827                         unix_detach_fds(siocb->scm, skb);
1828
1829                 sk_peek_offset_bwd(sk, skb->len);
1830         } else {
1831                 /* It is questionable: on PEEK we could:
1832                    - do not return fds - good, but too simple 8)
1833                    - return fds, and do not return them on read (old strategy,
1834                      apparently wrong)
1835                    - clone fds (I chose it for now, it is the most universal
1836                      solution)
1837
1838                    POSIX 1003.1g does not actually define this clearly
1839                    at all. POSIX 1003.1g doesn't define a lot of things
1840                    clearly however!
1841
1842                 */
1843
1844                 sk_peek_offset_fwd(sk, size);
1845
1846                 if (UNIXCB(skb).fp)
1847                         siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1848         }
        /* With MSG_TRUNC requested, report the remaining datagram length instead. */
1849         err = (flags & MSG_TRUNC) ? skb->len - skip : size;
1850
1851         scm_recv(sock, msg, siocb->scm, flags);
1852
1853 out_free:
1854         skb_free_datagram(sk, skb);
1855 out_unlock:
1856         mutex_unlock(&u->readlock);
1857 out:
1858         return err;
1859 }
1860
1861 /*
1862  *      Sleep until more data has arrived. But check for races..
 *
 *      @sk:    socket whose receive queue is being waited on
 *      @timeo: remaining timeout; 0 means do not sleep at all
 *      @last:  tail of the receive queue as last seen by the caller (may be
 *              NULL); a different tail means the queue has changed
 *
 *      The wake-up conditions are evaluated under unix_state_lock() after
 *      the task has been put on the socket's wait queue, and the lock is
 *      dropped only around the actual sleep.  The loop ends when the queue
 *      tail differs from @last, sk_err is set, RCV_SHUTDOWN is set, a
 *      signal is pending or the timeout has run out.
 *
 *      Returns the timeout value left over from the last sleep.
1863  */
1864 static long unix_stream_data_wait(struct sock *sk, long timeo,
1865                                   struct sk_buff *last)
1866 {
1867         DEFINE_WAIT(wait);
1868
1869         unix_state_lock(sk);
1870
1871         for (;;) {
1872                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1873
1874                 if (skb_peek_tail(&sk->sk_receive_queue) != last ||
1875                     sk->sk_err ||
1876                     (sk->sk_shutdown & RCV_SHUTDOWN) ||
1877                     signal_pending(current) ||
1878                     !timeo)
1879                         break;
1880
                /* SOCK_ASYNC_WAITDATA is set only for the duration of the sleep. */
1881                 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1882                 unix_state_unlock(sk);
1883                 timeo = freezable_schedule_timeout(timeo);
1884                 unix_state_lock(sk);
1885                 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1886         }
1887
1888         finish_wait(sk_sleep(sk), &wait);
1889         unix_state_unlock(sk);
1890         return timeo;
1891 }
1892
1893 static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
1894                                struct msghdr *msg, size_t size,
1895                                int flags)
1896 {
1897         struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1898         struct scm_cookie tmp_scm;
1899         struct sock *sk = sock->sk;
1900         struct unix_sock *u = unix_sk(sk);
1901         struct sockaddr_un *sunaddr = msg->msg_name;
1902         int copied = 0;
1903         int check_creds = 0;
1904         int target;
1905         int err = 0;
1906         long timeo;
1907         int skip;
1908
1909         err = -EINVAL;
1910         if (sk->sk_state != TCP_ESTABLISHED)
1911                 goto out;
1912
1913         err = -EOPNOTSUPP;
1914         if (flags&MSG_OOB)
1915                 goto out;
1916
1917         target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
1918         timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT);
1919
1920         msg->msg_namelen = 0;
1921
1922         /* Lock the socket to prevent queue disordering
1923          * while sleeps in memcpy_tomsg
1924          */
1925
1926         if (!siocb->scm) {
1927                 siocb->scm = &tmp_scm;
1928                 memset(&tmp_scm, 0, sizeof(tmp_scm));
1929         }
1930
1931         err = mutex_lock_interruptible(&u->readlock);
1932         if (err) {
1933                 err = sock_intr_errno(timeo);
1934                 goto out;
1935         }
1936
1937         do {
1938                 int chunk;
1939                 struct sk_buff *skb, *last;
1940
1941                 unix_state_lock(sk);
1942                 last = skb = skb_peek(&sk->sk_receive_queue);
1943 again:
1944                 if (skb == NULL) {
1945                         unix_sk(sk)->recursion_level = 0;
1946                         if (copied >= target)
1947                                 goto unlock;
1948
1949                         /*
1950                          *      POSIX 1003.1g mandates this order.
1951                          */
1952
1953                         err = sock_error(sk);
1954                         if (err)
1955                                 goto unlock;
1956                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1957                                 goto unlock;
1958
1959                         unix_state_unlock(sk);
1960                         err = -EAGAIN;
1961                         if (!timeo)
1962                                 break;
1963                         mutex_unlock(&u->readlock);
1964
1965                         timeo = unix_stream_data_wait(sk, timeo, last);
1966
1967                         if (signal_pending(current)
1968                             ||  mutex_lock_interruptible(&u->readlock)) {
1969                                 err = sock_intr_errno(timeo);
1970                                 goto out;
1971                         }
1972
1973                         continue;
1974  unlock:
1975                         unix_state_unlock(sk);
1976                         break;
1977                 }
1978
1979                 skip = sk_peek_offset(sk, flags);
1980                 while (skip >= skb->len) {
1981                         skip -= skb->len;
1982                         last = skb;
1983                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
1984                         if (!skb)
1985                                 goto again;
1986                 }
1987
1988                 unix_state_unlock(sk);
1989
1990                 if (check_creds) {
1991                         /* Never glue messages from different writers */
1992                         if ((UNIXCB(skb).pid  != siocb->scm->pid) ||
1993                             !uid_eq(UNIXCB(skb).uid, siocb->scm->creds.uid) ||
1994                             !gid_eq(UNIXCB(skb).gid, siocb->scm->creds.gid))
1995                                 break;
1996                 } else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
1997                         /* Copy credentials */
1998                         scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
1999                         check_creds = 1;
2000                 }
2001
2002                 /* Copy address just once */
2003                 if (sunaddr) {
2004                         unix_copy_addr(msg, skb->sk);
2005                         sunaddr = NULL;
2006                 }
2007
2008                 chunk = min_t(unsigned int, skb->len - skip, size);
2009                 if (memcpy_toiovec(msg->msg_iov, skb->data + skip, chunk)) {
2010                         if (copied == 0)
2011                                 copied = -EFAULT;
2012                         break;
2013                 }
2014                 copied += chunk;
2015                 size -= chunk;
2016
2017                 /* Mark read part of skb as used */
2018                 if (!(flags & MSG_PEEK)) {
2019                         skb_pull(skb, chunk);
2020
2021                         sk_peek_offset_bwd(sk, chunk);
2022
2023                         if (UNIXCB(skb).fp)
2024                                 unix_detach_fds(siocb->scm, skb);
2025
2026                         if (skb->len)
2027                                 break;
2028
2029                         skb_unlink(skb, &sk->sk_receive_queue);
2030                         consume_skb(skb);
2031
2032                         if (siocb->scm->fp)
2033                                 break;
2034                 } else {
2035                         /* It is questionable, see note in unix_dgram_recvmsg.
2036                          */
2037                         if (UNIXCB(skb).fp)
2038                                 siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
2039
2040                         sk_peek_offset_fwd(sk, chunk);
2041
2042                         break;
2043                 }
2044         } while (size);
2045
2046         mutex_unlock(&u->readlock);
2047         scm_recv(sock, msg, siocb->scm, flags);
2048 out:
2049         return copied ? : err;
2050 }
2051
2052 static int unix_shutdown(struct socket *sock, int mode)
2053 {
2054         struct sock *sk = sock->sk;
2055         struct sock *other;
2056
2057         if (mode < SHUT_RD || mode > SHUT_RDWR)
2058                 return -EINVAL;
2059         /* This maps:
2060          * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2061          * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2062          * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2063          */
2064         ++mode;
2065
2066         unix_state_lock(sk);
2067         sk->sk_shutdown |= mode;
2068         other = unix_peer(sk);
2069         if (other)
2070                 sock_hold(other);
2071         unix_state_unlock(sk);
2072         sk->sk_state_change(sk);
2073
2074         if (other &&
2075                 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2076
2077                 int peer_mode = 0;
2078
2079                 if (mode&RCV_SHUTDOWN)
2080                         peer_mode |= SEND_SHUTDOWN;
2081                 if (mode&SEND_SHUTDOWN)
2082                         peer_mode |= RCV_SHUTDOWN;
2083                 unix_state_lock(other);
2084                 other->sk_shutdown |= peer_mode;
2085                 unix_state_unlock(other);
2086                 other->sk_state_change(other);
2087                 if (peer_mode == SHUTDOWN_MASK)
2088                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2089                 else if (peer_mode & RCV_SHUTDOWN)
2090                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2091         }
2092         if (other)
2093                 sock_put(other);
2094
2095         return 0;
2096 }
2097
2098 long unix_inq_len(struct sock *sk)
2099 {
2100         struct sk_buff *skb;
2101         long amount = 0;
2102
2103         if (sk->sk_state == TCP_LISTEN)
2104                 return -EINVAL;
2105
2106         spin_lock(&sk->sk_receive_queue.lock);
2107         if (sk->sk_type == SOCK_STREAM ||
2108             sk->sk_type == SOCK_SEQPACKET) {
2109                 skb_queue_walk(&sk->sk_receive_queue, skb)
2110                         amount += skb->len;
2111         } else {
2112                 skb = skb_peek(&sk->sk_receive_queue);
2113                 if (skb)
2114                         amount = skb->len;
2115         }
2116         spin_unlock(&sk->sk_receive_queue.lock);
2117
2118         return amount;
2119 }
2120 EXPORT_SYMBOL_GPL(unix_inq_len);
2121
/*
 * Report the send-side memory currently charged to @sk, as returned by
 * sk_wmem_alloc_get().
 */
long unix_outq_len(struct sock *sk)
{
        long queued = sk_wmem_alloc_get(sk);

        return queued;
}
EXPORT_SYMBOL_GPL(unix_outq_len);
2127
2128 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2129 {
2130         struct sock *sk = sock->sk;
2131         long amount = 0;
2132         int err;
2133
2134         switch (cmd) {
2135         case SIOCOUTQ:
2136                 amount = unix_outq_len(sk);
2137                 err = put_user(amount, (int __user *)arg);
2138                 break;
2139         case SIOCINQ:
2140                 amount = unix_inq_len(sk);
2141                 if (amount < 0)
2142                         err = amount;
2143                 else
2144                         err = put_user(amount, (int __user *)arg);
2145                 break;
2146         default:
2147                 err = -ENOIOCTLCMD;
2148                 break;
2149         }
2150         return err;
2151 }
2152
2153 static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table *wait)
2154 {
2155         struct sock *sk = sock->sk;
2156         unsigned int mask;
2157
2158         sock_poll_wait(file, sk_sleep(sk), wait);
2159         mask = 0;
2160
2161         /* exceptional events? */
2162         if (sk->sk_err)
2163                 mask |= POLLERR;
2164         if (sk->sk_shutdown == SHUTDOWN_MASK)
2165                 mask |= POLLHUP;
2166         if (sk->sk_shutdown & RCV_SHUTDOWN)
2167                 mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2168
2169         /* readable? */
2170         if (!skb_queue_empty(&sk->sk_receive_queue))
2171                 mask |= POLLIN | POLLRDNORM;
2172
2173         /* Connection-based need to check for termination and startup */
2174         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
2175             sk->sk_state == TCP_CLOSE)
2176                 mask |= POLLHUP;
2177
2178         /*
2179          * we set writable also when the other side has shut down the
2180          * connection. This prevents stuck sockets.
2181          */
2182         if (unix_writable(sk))
2183                 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2184
2185         return mask;
2186 }
2187
2188 static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
2189                                     poll_table *wait)
2190 {
2191         struct sock *sk = sock->sk, *other;
2192         unsigned int mask, writable;
2193
2194         sock_poll_wait(file, sk_sleep(sk), wait);
2195         mask = 0;
2196
2197         /* exceptional events? */
2198         if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
2199                 mask |= POLLERR |
2200                         (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
2201
2202         if (sk->sk_shutdown & RCV_SHUTDOWN)
2203                 mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2204         if (sk->sk_shutdown == SHUTDOWN_MASK)
2205                 mask |= POLLHUP;
2206
2207         /* readable? */
2208         if (!skb_queue_empty(&sk->sk_receive_queue))
2209                 mask |= POLLIN | POLLRDNORM;
2210
2211         /* Connection-based need to check for termination and startup */
2212         if (sk->sk_type == SOCK_SEQPACKET) {
2213                 if (sk->sk_state == TCP_CLOSE)
2214                         mask |= POLLHUP;
2215                 /* connection hasn't started yet? */
2216                 if (sk->sk_state == TCP_SYN_SENT)
2217                         return mask;
2218         }
2219
2220         /* No write status requested, avoid expensive OUT tests. */
2221         if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
2222                 return mask;
2223
2224         writable = unix_writable(sk);
2225         other = unix_peer_get(sk);
2226         if (other) {
2227                 if (unix_peer(other) != sk) {
2228                         sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
2229                         if (unix_recvq_full(other))
2230                                 writable = 0;
2231                 }
2232                 sock_put(other);
2233         }
2234
2235         if (writable)
2236                 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2237         else
2238                 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
2239
2240         return mask;
2241 }
2242
2243 #ifdef CONFIG_PROC_FS
2244
/*
 * A seq_file position packs two values: the hash bucket index in the bits
 * above BUCKET_SPACE and the offset of the socket within that bucket in
 * the low BUCKET_SPACE bits.
 */
#define BUCKET_SPACE (BITS_PER_LONG - UNIX_HASH_BITS - 2)

#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o) (((b) << BUCKET_SPACE) | (o))
2250
2251 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
2252 {
2253         unsigned long offset = get_offset(*pos);
2254         unsigned long bucket = get_bucket(*pos);
2255         struct sock *sk;
2256         unsigned long count = 0;
2257
2258         for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
2259                 if (sock_net(sk) != seq_file_net(seq))
2260                         continue;
2261                 if (++count == offset)
2262                         break;
2263         }
2264
2265         return sk;
2266 }
2267
/*
 * Return the next socket to show for this seq_file, or NULL at the end.
 *
 * @sk is the previously returned element: a real socket, SEQ_START_TOKEN
 * or NULL.  For a real socket the rest of its hash chain is searched for
 * the next socket in the seq_file's network namespace.  Otherwise, or once
 * the chain is exhausted, sockets are looked up by position with
 * unix_from_bucket(), and *pos is advanced to offset 1 of the following
 * bucket each time a bucket has nothing (more) to offer.
 */
2268 static struct sock *unix_next_socket(struct seq_file *seq,
2269                                      struct sock *sk,
2270                                      loff_t *pos)
2271 {
2272         unsigned long bucket;
2273
        /* Only entered for a real socket pointer, not NULL or the start token. */
2274         while (sk > (struct sock *)SEQ_START_TOKEN) {
2275                 sk = sk_next(sk);
2276                 if (!sk)
2277                         goto next_bucket;
2278                 if (sock_net(sk) == seq_file_net(seq))
2279                         return sk;
2280         }
2281
2282         do {
2283                 sk = unix_from_bucket(seq, pos);
2284                 if (sk)
2285                         return sk;
2286
        /* Also reached by the goto above, i.e. by jumping into this loop body. */
2287 next_bucket:
2288                 bucket = get_bucket(*pos) + 1;
2289                 *pos = set_bucket_offset(bucket, 1);
2290         } while (bucket < ARRAY_SIZE(unix_socket_table));
2291
2292         return NULL;
2293 }
2294
2295 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
2296         __acquires(unix_table_lock)
2297 {
2298         spin_lock(&unix_table_lock);
2299
2300         if (!*pos)
2301                 return SEQ_START_TOKEN;
2302
2303         if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
2304                 return NULL;
2305
2306         return unix_next_socket(seq, NULL, pos);
2307 }
2308
2309 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2310 {
2311         ++*pos;
2312         return unix_next_socket(seq, v, pos);
2313 }
2314
2315 static void unix_seq_stop(struct seq_file *seq, void *v)
2316         __releases(unix_table_lock)
2317 {
2318         spin_unlock(&unix_table_lock);
2319 }
2320
2321 static int unix_seq_show(struct seq_file *seq, void *v)
2322 {
2323
2324         if (v == SEQ_START_TOKEN)
2325                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
2326                          "Inode Path\n");
2327         else {
2328                 struct sock *s = v;
2329                 struct unix_sock *u = unix_sk(s);
2330                 unix_state_lock(s);
2331
2332                 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
2333                         s,
2334                         atomic_read(&s->sk_refcnt),
2335                         0,
2336                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
2337                         s->sk_type,
2338                         s->sk_socket ?
2339                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
2340                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
2341                         sock_i_ino(s));
2342
2343                 if (u->addr) {
2344                         int i, len;
2345                         seq_putc(seq, ' ');
2346
2347                         i = 0;
2348                         len = u->addr->len - sizeof(short);
2349                         if (!UNIX_ABSTRACT(s))
2350                                 len--;
2351                         else {
2352                                 seq_putc(seq, '@');
2353                                 i++;
2354                         }
2355                         for ( ; i < len; i++)
2356                                 seq_putc(seq, u->addr->name->sun_path[i]);
2357                 }
2358                 unix_state_unlock(s);
2359                 seq_putc(seq, '\n');
2360         }
2361
2362         return 0;
2363 }
2364
2365 static const struct seq_operations unix_seq_ops = {
2366         .start  = unix_seq_start,
2367         .next   = unix_seq_next,
2368         .stop   = unix_seq_stop,
2369         .show   = unix_seq_show,
2370 };
2371
2372 static int unix_seq_open(struct inode *inode, struct file *file)
2373 {
2374         return seq_open_net(inode, file, &unix_seq_ops,
2375                             sizeof(struct seq_net_private));
2376 }
2377
2378 static const struct file_operations unix_seq_fops = {
2379         .owner          = THIS_MODULE,
2380         .open           = unix_seq_open,
2381         .read           = seq_read,
2382         .llseek         = seq_lseek,
2383         .release        = seq_release_net,
2384 };
2385
2386 #endif
2387
2388 static const struct net_proto_family unix_family_ops = {
2389         .family = PF_UNIX,
2390         .create = unix_create,
2391         .owner  = THIS_MODULE,
2392 };
2393
2394
2395 static int __net_init unix_net_init(struct net *net)
2396 {
2397         int error = -ENOMEM;
2398
2399         net->unx.sysctl_max_dgram_qlen = 10;
2400         if (unix_sysctl_register(net))
2401                 goto out;
2402
2403 #ifdef CONFIG_PROC_FS
2404         if (!proc_create("unix", 0, net->proc_net, &unix_seq_fops)) {
2405                 unix_sysctl_unregister(net);
2406                 goto out;
2407         }
2408 #endif
2409         error = 0;
2410 out:
2411         return error;
2412 }
2413
2414 static void __net_exit unix_net_exit(struct net *net)
2415 {
2416         unix_sysctl_unregister(net);
2417         remove_proc_entry("unix", net->proc_net);
2418 }
2419
2420 static struct pernet_operations unix_net_ops = {
2421         .init = unix_net_init,
2422         .exit = unix_net_exit,
2423 };
2424
2425 static int __init af_unix_init(void)
2426 {
2427         int rc = -1;
2428
2429         BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));
2430
2431         rc = proto_register(&unix_proto, 1);
2432         if (rc != 0) {
2433                 printk(KERN_CRIT "%s: Cannot create unix_sock SLAB cache!\n",
2434                        __func__);
2435                 goto out;
2436         }
2437
2438         sock_register(&unix_family_ops);
2439         register_pernet_subsys(&unix_net_ops);
2440 out:
2441         return rc;
2442 }
2443
2444 static void __exit af_unix_exit(void)
2445 {
2446         sock_unregister(PF_UNIX);
2447         proto_unregister(&unix_proto);
2448         unregister_pernet_subsys(&unix_net_ops);
2449 }
2450
2451 /* Earlier than device_initcall() so that other drivers invoking
2452    request_module() don't end up in a loop when modprobe tries
2453    to use a UNIX socket. But later than subsys_initcall() because
2454    we depend on stuff initialised there */
2455 fs_initcall(af_unix_init);
2456 module_exit(af_unix_exit);
2457
2458 MODULE_LICENSE("GPL");
2459 MODULE_ALIAS_NETPROTO(PF_UNIX);