net/unix/af_unix.c

   1 /*
   2  * NET4:        Implementation of BSD Unix domain sockets.
   3  *
   4  * Authors:     Alan Cox, <alan@lxorguk.ukuu.org.uk>
   5  *
   6  *              This program is free software; you can redistribute it and/or
   7  *              modify it under the terms of the GNU General Public License
   8  *              as published by the Free Software Foundation; either version
   9  *              2 of the License, or (at your option) any later version.
  10  *
  11  * Fixes:
  12  *              Linus Torvalds  :       Assorted bug cures.
  13  *              Niibe Yutaka    :       async I/O support.
  14  *              Carsten Paeth   :       PF_UNIX check, address fixes.
  15  *              Alan Cox        :       Limit size of allocated blocks.
  16  *              Alan Cox        :       Fixed the stupid socketpair bug.
  17  *              Alan Cox        :       BSD compatibility fine tuning.
  18  *              Alan Cox        :       Fixed a bug in connect when interrupted.
  19  *              Alan Cox        :       Sorted out a proper draft version of
  20  *                                      file descriptor passing hacked up from
  21  *                                      Mike Shaver's work.
  22  *              Marty Leisner   :       Fixes to fd passing
  23  *              Nick Nevin      :       recvmsg bugfix.
  24  *              Alan Cox        :       Started proper garbage collector
  25  *              Heiko EiBfeldt  :       Missing verify_area check
  26  *              Alan Cox        :       Started POSIXisms
  27  *              Andreas Schwab  :       Replace inode by dentry for proper
  28  *                                      reference counting
  29  *              Kirk Petersen   :       Made this a module
  30  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
  31  *                                      Lots of bug fixes.
  32  *           Alexey Kuznetsov   :       Repaired (I hope) bugs introduced
  33  *                                      by the above two patches.
  34  *           Andrea Arcangeli   :       If possible we block in connect(2)
  35  *                                      if the max backlog of the listen socket
  36  *                                      has been reached. This won't break
  37  *                                      old apps and it will avoid a huge number
  38  *                                      of socks hashed (this is for unix_gc()
  39  *                                      performance reasons).
  40  *                                      Security fix that limits the max
  41  *                                      number of socks to 2*max_files and
  42  *                                      the number of skb queueable in the
  43  *                                      dgram receiver.
  44  *              Artur Skawina   :       Hash function optimizations
  45  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
  46  *            Malcolm Beattie   :       Set peercred for socketpair
  47  *           Michal Ostrowski   :       Module initialization cleanup.
  48  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
  49  *                                      the core infrastructure is doing that
  50  *                                      for all net proto families now (2.5.69+)
  51  *
  52  *
  53  * Known differences from reference BSD that was tested:
  54  *
  55  *      [TO FIX]
  56  *      ECONNREFUSED is not returned from one end of a connected() socket to the
  57  *              other the moment one end closes.
  58  *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
  59  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
  60  *      [NOT TO FIX]
  61  *      accept() returns a path name even if the connecting socket has closed
  62  *              in the meantime (BSD loses the path and gives up).
  63  *      accept() returns 0 length path for an unbound connector. BSD returns 16
  64  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
  65  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
  66  *      BSD af_unix apparently has connect forgetting to block properly.
  67  *              (need to check this with the POSIX spec in detail)
  68  *
  69  * Differences from 2.0.0-11-... (ANK)
  70  *      Bug fixes and improvements.
  71  *              - client shutdown killed server socket.
  72  *              - removed all useless cli/sti pairs.
  73  *
  74  *      Semantic changes/extensions.
  75  *              - generic control message passing.
  76  *              - SCM_CREDENTIALS control message.
  77  *              - "Abstract" (not FS based) socket bindings.
  78  *                Abstract names are sequences of bytes (not zero terminated)
  79  *                started by 0, so that this name space does not intersect
  80  *                with BSD names.
  81  */
  82
  83 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  84
  85 #include <linux/module.h>
  86 #include <linux/kernel.h>
  87 #include <linux/signal.h>
  88 #include <linux/sched/signal.h>
  89 #include <linux/errno.h>
  90 #include <linux/string.h>
  91 #include <linux/stat.h>
  92 #include <linux/dcache.h>
  93 #include <linux/namei.h>
  94 #include <linux/socket.h>
  95 #include <linux/un.h>
  96 #include <linux/fcntl.h>
  97 #include <linux/termios.h>
  98 #include <linux/sockios.h>
  99 #include <linux/net.h>
 100 #include <linux/in.h>
 101 #include <linux/fs.h>
 102 #include <linux/slab.h>
 103 #include <linux/uaccess.h>
 104 #include <linux/skbuff.h>
 105 #include <linux/netdevice.h>
 106 #include <net/net_namespace.h>
 107 #include <net/sock.h>
 108 #include <net/tcp_states.h>
 109 #include <net/af_unix.h>
 110 #include <linux/proc_fs.h>
 111 #include <linux/seq_file.h>
 112 #include <net/scm.h>
 113 #include <linux/init.h>
 114 #include <linux/poll.h>
 115 #include <linux/rtnetlink.h>
 116 #include <linux/mount.h>
 117 #include <net/checksum.h>
 118 #include <linux/security.h>
 119 #include <linux/freezer.h>
 120 #include <linux/file.h>
 121
/*
 * Global socket hash table with 2 * UNIX_HASH_SIZE buckets.
 * unix_sockets_unbound() only ever returns buckets from the upper half
 * (index >= UNIX_HASH_SIZE); the by-inode lookup masks its index with
 * UNIX_HASH_SIZE - 1 and therefore stays in the lower half.
 */
struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
EXPORT_SYMBOL_GPL(unix_socket_table);
/* Taken around every insertion, removal and lookup on unix_socket_table. */
DEFINE_SPINLOCK(unix_table_lock);
EXPORT_SYMBOL_GPL(unix_table_lock);
/*
 * Number of AF_UNIX sockets currently allocated: bumped in unix_create1(),
 * dropped in unix_sock_destructor(), and capped at 2 * get_max_files().
 */
static atomic_long_t unix_nr_socks;
 127
 128
 129 static struct hlist_head *unix_sockets_unbound(void *addr)
 130 {
 131         unsigned long hash = (unsigned long)addr;
 132
 133         hash ^= hash >> 16;
 134         hash ^= hash >> 8;
 135         hash %= UNIX_HASH_SIZE;
 136         return &unix_socket_table[UNIX_HASH_SIZE + hash];
 137 }
 138
/*
 * True when the hash stored in the socket's bound address is below
 * UNIX_HASH_SIZE.  ->addr is dereferenced unconditionally, so this may
 * only be used on a socket that already has an address.
 * NOTE(review): presumably filesystem-bound sockets are given a hash
 * >= UNIX_HASH_SIZE by the bind path (not visible here) -- confirm.
 */
#define UNIX_ABSTRACT(sk)       (unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
 140
 141 #ifdef CONFIG_SECURITY_NETWORK
 142 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 143 {
 144         UNIXCB(skb).secid = scm->secid;
 145 }
 146
 147 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 148 {
 149         scm->secid = UNIXCB(skb).secid;
 150 }
 151
 152 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
 153 {
 154         return (scm->secid == UNIXCB(skb).secid);
 155 }
 156 #else
 157 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 158 { }
 159
 160 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 161 { }
 162
 163 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
 164 {
 165         return true;
 166 }
 167 #endif /* CONFIG_SECURITY_NETWORK */
 168
 169 /*
 170  *  SMP locking strategy:
 171  *    hash table is protected with spinlock unix_table_lock
 172  *    each socket state is protected by separate spin lock.
 173  */
 174
 175 static inline unsigned int unix_hash_fold(__wsum n)
 176 {
 177         unsigned int hash = (__force unsigned int)csum_fold(n);
 178
 179         hash ^= hash>>8;
 180         return hash&(UNIX_HASH_SIZE-1);
 181 }
 182
/*
 * Peer pointer of an AF_UNIX socket.  This expands to the struct member
 * itself, so it can be assigned through as well as read (see
 * unix_release_sock(), which clears it with "unix_peer(sk) = NULL").
 */
#define unix_peer(sk) (unix_sk(sk)->peer)
 184
 185 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
 186 {
 187         return unix_peer(osk) == sk;
 188 }
 189
 190 static inline int unix_may_send(struct sock *sk, struct sock *osk)
 191 {
 192         return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
 193 }
 194
 195 static inline int unix_recvq_full(struct sock const *sk)
 196 {
 197         return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
 198 }
 199
 200 struct sock *unix_peer_get(struct sock *s)
 201 {
 202         struct sock *peer;
 203
 204         unix_state_lock(s);
 205         peer = unix_peer(s);
 206         if (peer)
 207                 sock_hold(peer);
 208         unix_state_unlock(s);
 209         return peer;
 210 }
 211 EXPORT_SYMBOL_GPL(unix_peer_get);
 212
 213 static inline void unix_release_addr(struct unix_address *addr)
 214 {
 215         if (refcount_dec_and_test(&addr->refcnt))
 216                 kfree(addr);
 217 }
 218
 219 /*
 220  *      Check unix socket name:
 221  *              - should be not zero length.
 222  *              - if started by not zero, should be NULL terminated (FS object)
 223  *              - if started by zero, it is abstract name.
 224  */
 225
 226 static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
 227 {
 228         if (len <= sizeof(short) || len > sizeof(*sunaddr))
 229                 return -EINVAL;
 230         if (!sunaddr || sunaddr->sun_family != AF_UNIX)
 231                 return -EINVAL;
 232         if (sunaddr->sun_path[0]) {
 233                 /*
 234                  * This may look like an off by one error but it is a bit more
 235                  * subtle. 108 is the longest valid AF_UNIX path for a binding.
 236                  * sun_path[108] doesn't as such exist.  However in kernel space
 237                  * we are guaranteed that it is a valid memory location in our
 238                  * kernel address buffer.
 239                  */
 240                 ((char *)sunaddr)[len] = 0;
 241                 len = strlen(sunaddr->sun_path)+1+sizeof(short);
 242                 return len;
 243         }
 244
 245         *hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
 246         return len;
 247 }
 248
 249 static void __unix_remove_socket(struct sock *sk)
 250 {
 251         sk_del_node_init(sk);
 252 }
 253
 254 static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
 255 {
 256         WARN_ON(!sk_unhashed(sk));
 257         sk_add_node(sk, list);
 258 }
 259
 260 static inline void unix_remove_socket(struct sock *sk)
 261 {
 262         spin_lock(&unix_table_lock);
 263         __unix_remove_socket(sk);
 264         spin_unlock(&unix_table_lock);
 265 }
 266
 267 static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
 268 {
 269         spin_lock(&unix_table_lock);
 270         __unix_insert_socket(list, sk);
 271         spin_unlock(&unix_table_lock);
 272 }
 273
 274 static struct sock *__unix_find_socket_byname(struct net *net,
 275                                               struct sockaddr_un *sunname,
 276                                               int len, int type, unsigned int hash)
 277 {
 278         struct sock *s;
 279
 280         sk_for_each(s, &unix_socket_table[hash ^ type]) {
 281                 struct unix_sock *u = unix_sk(s);
 282
 283                 if (!net_eq(sock_net(s), net))
 284                         continue;
 285
 286                 if (u->addr->len == len &&
 287                     !memcmp(u->addr->name, sunname, len))
 288                         goto found;
 289         }
 290         s = NULL;
 291 found:
 292         return s;
 293 }
 294
 295 static inline struct sock *unix_find_socket_byname(struct net *net,
 296                                                    struct sockaddr_un *sunname,
 297                                                    int len, int type,
 298                                                    unsigned int hash)
 299 {
 300         struct sock *s;
 301
 302         spin_lock(&unix_table_lock);
 303         s = __unix_find_socket_byname(net, sunname, len, type, hash);
 304         if (s)
 305                 sock_hold(s);
 306         spin_unlock(&unix_table_lock);
 307         return s;
 308 }
 309
 310 static struct sock *unix_find_socket_byinode(struct inode *i)
 311 {
 312         struct sock *s;
 313
 314         spin_lock(&unix_table_lock);
 315         sk_for_each(s,
 316                     &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
 317                 struct dentry *dentry = unix_sk(s)->path.dentry;
 318
 319                 if (dentry && d_backing_inode(dentry) == i) {
 320                         sock_hold(s);
 321                         goto found;
 322                 }
 323         }
 324         s = NULL;
 325 found:
 326         spin_unlock(&unix_table_lock);
 327         return s;
 328 }
 329
 330 /* Support code for asymmetrically connected dgram sockets
 331  *
 332  * If a datagram socket is connected to a socket not itself connected
 333  * to the first socket (eg, /dev/log), clients may only enqueue more
 334  * messages if the present receive queue of the server socket is not
 335  * "too large". This means there's a second writeability condition
 336  * poll and sendmsg need to test. The dgram recv code will do a wake
 337  * up on the peer_wait wait queue of a socket upon reception of a
 338  * datagram which needs to be propagated to sleeping would-be writers
 339  * since these might not have sent anything so far. This can't be
 340  * accomplished via poll_wait because the lifetime of the server
 341  * socket might be less than that of its clients if these break their
 342  * association with it or if the server socket is closed while clients
 343  * are still connected to it and there's no way to inform "a polling
 344  * implementation" that it should let go of a certain wait queue
 345  *
 346  * In order to propagate a wake up, a wait_queue_entry_t of the client
 347  * socket is enqueued on the peer_wait queue of the server socket
 348  * whose wake function does a wake_up on the ordinary client socket
 349  * wait queue. This connection is established whenever a write (or
 350  * poll for write) hit the flow control condition and broken when the
 351  * association to the server socket is dissolved or after a wake up
 352  * was relayed.
 353  */
 354
 355 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
 356                                       void *key)
 357 {
 358         struct unix_sock *u;
 359         wait_queue_head_t *u_sleep;
 360
 361         u = container_of(q, struct unix_sock, peer_wake);
 362
 363         __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
 364                             q);
 365         u->peer_wake.private = NULL;
 366
 367         /* relaying can only happen while the wq still exists */
 368         u_sleep = sk_sleep(&u->sk);
 369         if (u_sleep)
 370                 wake_up_interruptible_poll(u_sleep, key);
 371
 372         return 0;
 373 }
 374
 375 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
 376 {
 377         struct unix_sock *u, *u_other;
 378         int rc;
 379
 380         u = unix_sk(sk);
 381         u_other = unix_sk(other);
 382         rc = 0;
 383         spin_lock(&u_other->peer_wait.lock);
 384
 385         if (!u->peer_wake.private) {
 386                 u->peer_wake.private = other;
 387                 __add_wait_queue(&u_other->peer_wait, &u->peer_wake);
 388
 389                 rc = 1;
 390         }
 391
 392         spin_unlock(&u_other->peer_wait.lock);
 393         return rc;
 394 }
 395
 396 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
 397                                             struct sock *other)
 398 {
 399         struct unix_sock *u, *u_other;
 400
 401         u = unix_sk(sk);
 402         u_other = unix_sk(other);
 403         spin_lock(&u_other->peer_wait.lock);
 404
 405         if (u->peer_wake.private == other) {
 406                 __remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
 407                 u->peer_wake.private = NULL;
 408         }
 409
 410         spin_unlock(&u_other->peer_wait.lock);
 411 }
 412
 413 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
 414                                                    struct sock *other)
 415 {
 416         unix_dgram_peer_wake_disconnect(sk, other);
 417         wake_up_interruptible_poll(sk_sleep(sk),
 418                                    POLLOUT |
 419                                    POLLWRNORM |
 420                                    POLLWRBAND);
 421 }
 422
 423 /* preconditions:
 424  *      - unix_peer(sk) == other
 425  *      - association is stable
 426  */
 427 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
 428 {
 429         int connected;
 430
 431         connected = unix_dgram_peer_wake_connect(sk, other);
 432
 433         if (unix_recvq_full(other))
 434                 return 1;
 435
 436         if (connected)
 437                 unix_dgram_peer_wake_disconnect(sk, other);
 438
 439         return 0;
 440 }
 441
 442 static int unix_writable(const struct sock *sk)
 443 {
 444         return sk->sk_state != TCP_LISTEN &&
 445                (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
 446 }
 447
 448 static void unix_write_space(struct sock *sk)
 449 {
 450         struct socket_wq *wq;
 451
 452         rcu_read_lock();
 453         if (unix_writable(sk)) {
 454                 wq = rcu_dereference(sk->sk_wq);
 455                 if (skwq_has_sleeper(wq))
 456                         wake_up_interruptible_sync_poll(&wq->wait,
 457                                 POLLOUT | POLLWRNORM | POLLWRBAND);
 458                 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 459         }
 460         rcu_read_unlock();
 461 }
 462
 463 /* When dgram socket disconnects (or changes its peer), we clear its receive
 464  * queue of packets arrived from previous peer. First, it allows to do
 465  * flow control based only on wmem_alloc; second, sk connected to peer
 466  * may receive messages only from that peer. */
 467 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
 468 {
 469         if (!skb_queue_empty(&sk->sk_receive_queue)) {
 470                 skb_queue_purge(&sk->sk_receive_queue);
 471                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
 472
 473                 /* If one link of bidirectional dgram pipe is disconnected,
 474                  * we signal error. Messages are lost. Do not make this,
 475                  * when peer was not connected to us.
 476                  */
 477                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
 478                         other->sk_err = ECONNRESET;
 479                         other->sk_error_report(other);
 480                 }
 481         }
 482 }
 483
 484 static void unix_sock_destructor(struct sock *sk)
 485 {
 486         struct unix_sock *u = unix_sk(sk);
 487
 488         skb_queue_purge(&sk->sk_receive_queue);
 489
 490         WARN_ON(refcount_read(&sk->sk_wmem_alloc));
 491         WARN_ON(!sk_unhashed(sk));
 492         WARN_ON(sk->sk_socket);
 493         if (!sock_flag(sk, SOCK_DEAD)) {
 494                 pr_info("Attempt to release alive unix socket: %p\n", sk);
 495                 return;
 496         }
 497
 498         if (u->addr)
 499                 unix_release_addr(u->addr);
 500
 501         atomic_long_dec(&unix_nr_socks);
 502         local_bh_disable();
 503         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 504         local_bh_enable();
 505 #ifdef UNIX_REFCNT_DEBUG
 506         pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
 507                 atomic_long_read(&unix_nr_socks));
 508 #endif
 509 }
 510
 511 static void unix_release_sock(struct sock *sk, int embrion)
 512 {
 513         struct unix_sock *u = unix_sk(sk);
 514         struct path path;
 515         struct sock *skpair;
 516         struct sk_buff *skb;
 517         int state;
 518
 519         unix_remove_socket(sk);
 520
 521         /* Clear state */
 522         unix_state_lock(sk);
 523         sock_orphan(sk);
 524         sk->sk_shutdown = SHUTDOWN_MASK;
 525         path         = u->path;
 526         u->path.dentry = NULL;
 527         u->path.mnt = NULL;
 528         state = sk->sk_state;
 529         sk->sk_state = TCP_CLOSE;
 530         unix_state_unlock(sk);
 531
 532         wake_up_interruptible_all(&u->peer_wait);
 533
 534         skpair = unix_peer(sk);
 535
 536         if (skpair != NULL) {
 537                 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
 538                         unix_state_lock(skpair);
 539                         /* No more writes */
 540                         skpair->sk_shutdown = SHUTDOWN_MASK;
 541                         if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
 542                                 skpair->sk_err = ECONNRESET;
 543                         unix_state_unlock(skpair);
 544                         skpair->sk_state_change(skpair);
 545                         sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
 546                 }
 547
 548                 unix_dgram_peer_wake_disconnect(sk, skpair);
 549                 sock_put(skpair); /* It may now die */
 550                 unix_peer(sk) = NULL;
 551         }
 552
 553         /* Try to flush out this socket. Throw out buffers at least */
 554
 555         while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
 556                 if (state == TCP_LISTEN)
 557                         unix_release_sock(skb->sk, 1);
 558                 /* passed fds are erased in the kfree_skb hook        */
 559                 UNIXCB(skb).consumed = skb->len;
 560                 kfree_skb(skb);
 561         }
 562
 563         if (path.dentry)
 564                 path_put(&path);
 565
 566         sock_put(sk);
 567
 568         /* ---- Socket is dead now and most probably destroyed ---- */
 569
 570         /*
 571          * Fixme: BSD difference: In BSD all sockets connected to us get
 572          *        ECONNRESET and we die on the spot. In Linux we behave
 573          *        like files and pipes do and wait for the last
 574          *        dereference.
 575          *
 576          * Can't we simply set sock->err?
 577          *
 578          *        What the above comment does talk about? --ANK(980817)
 579          */
 580
 581         if (unix_tot_inflight)
 582                 unix_gc();              /* Garbage collect fds */
 583 }
 584
 585 static void init_peercred(struct sock *sk)
 586 {
 587         put_pid(sk->sk_peer_pid);
 588         if (sk->sk_peer_cred)
 589                 put_cred(sk->sk_peer_cred);
 590         sk->sk_peer_pid  = get_pid(task_tgid(current));
 591         sk->sk_peer_cred = get_current_cred();
 592 }
 593
 594 static void copy_peercred(struct sock *sk, struct sock *peersk)
 595 {
 596         put_pid(sk->sk_peer_pid);
 597         if (sk->sk_peer_cred)
 598                 put_cred(sk->sk_peer_cred);
 599         sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
 600         sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
 601 }
 602
 603 static int unix_listen(struct socket *sock, int backlog)
 604 {
 605         int err;
 606         struct sock *sk = sock->sk;
 607         struct unix_sock *u = unix_sk(sk);
 608         struct pid *old_pid = NULL;
 609
 610         err = -EOPNOTSUPP;
 611         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
 612                 goto out;       /* Only stream/seqpacket sockets accept */
 613         err = -EINVAL;
 614         if (!u->addr)
 615                 goto out;       /* No listens on an unbound socket */
 616         unix_state_lock(sk);
 617         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
 618                 goto out_unlock;
 619         if (backlog > sk->sk_max_ack_backlog)
 620                 wake_up_interruptible_all(&u->peer_wait);
 621         sk->sk_max_ack_backlog  = backlog;
 622         sk->sk_state            = TCP_LISTEN;
 623         /* set credentials so connect can copy them */
 624         init_peercred(sk);
 625         err = 0;
 626
 627 out_unlock:
 628         unix_state_unlock(sk);
 629         put_pid(old_pid);
 630 out:
 631         return err;
 632 }
 633
 634 static int unix_release(struct socket *);
 635 static int unix_bind(struct socket *, struct sockaddr *, int);
 636 static int unix_stream_connect(struct socket *, struct sockaddr *,
 637                                int addr_len, int flags);
 638 static int unix_socketpair(struct socket *, struct socket *);
 639 static int unix_accept(struct socket *, struct socket *, int, bool);
 640 static int unix_getname(struct socket *, struct sockaddr *, int *, int);
 641 static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
 642 static unsigned int unix_dgram_poll(struct file *, struct socket *,
 643                                     poll_table *);
 644 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
 645 static int unix_shutdown(struct socket *, int);
 646 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
 647 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
 648 static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset,
 649                                     size_t size, int flags);
 650 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
 651                                        struct pipe_inode_info *, size_t size,
 652                                        unsigned int flags);
 653 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
 654 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
 655 static int unix_dgram_connect(struct socket *, struct sockaddr *,
 656                               int, int);
 657 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
 658 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
 659                                   int);
 660
 661 static int unix_set_peek_off(struct sock *sk, int val)
 662 {
 663         struct unix_sock *u = unix_sk(sk);
 664
 665         if (mutex_lock_interruptible(&u->iolock))
 666                 return -EINTR;
 667
 668         sk->sk_peek_off = val;
 669         mutex_unlock(&u->iolock);
 670
 671         return 0;
 672 }
 673
 674
 675 static const struct proto_ops unix_stream_ops = {
 676         .family =       PF_UNIX,
 677         .owner =        THIS_MODULE,
 678         .release =      unix_release,
 679         .bind =         unix_bind,
 680         .connect =      unix_stream_connect,
 681         .socketpair =   unix_socketpair,
 682         .accept =       unix_accept,
 683         .getname =      unix_getname,
 684         .poll =         unix_poll,
 685         .ioctl =        unix_ioctl,
 686         .listen =       unix_listen,
 687         .shutdown =     unix_shutdown,
 688         .setsockopt =   sock_no_setsockopt,
 689         .getsockopt =   sock_no_getsockopt,
 690         .sendmsg =      unix_stream_sendmsg,
 691         .recvmsg =      unix_stream_recvmsg,
 692         .mmap =         sock_no_mmap,
 693         .sendpage =     unix_stream_sendpage,
 694         .splice_read =  unix_stream_splice_read,
 695         .set_peek_off = unix_set_peek_off,
 696 };
 697
 698 static const struct proto_ops unix_dgram_ops = {
 699         .family =       PF_UNIX,
 700         .owner =        THIS_MODULE,
 701         .release =      unix_release,
 702         .bind =         unix_bind,
 703         .connect =      unix_dgram_connect,
 704         .socketpair =   unix_socketpair,
 705         .accept =       sock_no_accept,
 706         .getname =      unix_getname,
 707         .poll =         unix_dgram_poll,
 708         .ioctl =        unix_ioctl,
 709         .listen =       sock_no_listen,
 710         .shutdown =     unix_shutdown,
 711         .setsockopt =   sock_no_setsockopt,
 712         .getsockopt =   sock_no_getsockopt,
 713         .sendmsg =      unix_dgram_sendmsg,
 714         .recvmsg =      unix_dgram_recvmsg,
 715         .mmap =         sock_no_mmap,
 716         .sendpage =     sock_no_sendpage,
 717         .set_peek_off = unix_set_peek_off,
 718 };
 719
 720 static const struct proto_ops unix_seqpacket_ops = {
 721         .family =       PF_UNIX,
 722         .owner =        THIS_MODULE,
 723         .release =      unix_release,
 724         .bind =         unix_bind,
 725         .connect =      unix_stream_connect,
 726         .socketpair =   unix_socketpair,
 727         .accept =       unix_accept,
 728         .getname =      unix_getname,
 729         .poll =         unix_dgram_poll,
 730         .ioctl =        unix_ioctl,
 731         .listen =       unix_listen,
 732         .shutdown =     unix_shutdown,
 733         .setsockopt =   sock_no_setsockopt,
 734         .getsockopt =   sock_no_getsockopt,
 735         .sendmsg =      unix_seqpacket_sendmsg,
 736         .recvmsg =      unix_seqpacket_recvmsg,
 737         .mmap =         sock_no_mmap,
 738         .sendpage =     sock_no_sendpage,
 739         .set_peek_off = unix_set_peek_off,
 740 };
 741
 742 static struct proto unix_proto = {
 743         .name                   = "UNIX",
 744         .owner                  = THIS_MODULE,
 745         .obj_size               = sizeof(struct unix_sock),
 746 };
 747
/*
 * AF_UNIX sockets do not interact with hardware, hence they
 * don't trigger interrupts - so it's safe for them to have
 * bh-unsafe locking for their sk_receive_queue.lock. Split off
 * this special lock-class by reinitializing the spinlock key
 * (applied to each new socket in unix_create1()):
 */
static struct lock_class_key af_unix_sk_receive_queue_lock_key;
 755
 756 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
 757 {
 758         struct sock *sk = NULL;
 759         struct unix_sock *u;
 760
 761         atomic_long_inc(&unix_nr_socks);
 762         if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
 763                 goto out;
 764
 765         sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern);
 766         if (!sk)
 767                 goto out;
 768
 769         sock_init_data(sock, sk);
 770         lockdep_set_class(&sk->sk_receive_queue.lock,
 771                                 &af_unix_sk_receive_queue_lock_key);
 772
 773         sk->sk_allocation       = GFP_KERNEL_ACCOUNT;
 774         sk->sk_write_space      = unix_write_space;
 775         sk->sk_max_ack_backlog  = net->unx.sysctl_max_dgram_qlen;
 776         sk->sk_destruct         = unix_sock_destructor;
 777         u         = unix_sk(sk);
 778         u->path.dentry = NULL;
 779         u->path.mnt = NULL;
 780         spin_lock_init(&u->lock);
 781         atomic_long_set(&u->inflight, 0);
 782         INIT_LIST_HEAD(&u->link);
 783         mutex_init(&u->iolock); /* single task reading lock */
 784         mutex_init(&u->bindlock); /* single task binding lock */
 785         init_waitqueue_head(&u->peer_wait);
 786         init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
 787         unix_insert_socket(unix_sockets_unbound(sk), sk);
 788 out:
 789         if (sk == NULL)
 790                 atomic_long_dec(&unix_nr_socks);
 791         else {
 792                 local_bh_disable();
 793                 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 794                 local_bh_enable();
 795         }
 796         return sk;
 797 }
 798
 799 static int unix_create(struct net *net, struct socket *sock, int protocol,
 800                        int kern)
 801 {
 802         if (protocol && protocol != PF_UNIX)
 803                 return -EPROTONOSUPPORT;
 804
 805         sock->state = SS_UNCONNECTED;
 806
 807         switch (sock->type) {
 808         case SOCK_STREAM:
 809                 sock->ops = &unix_stream_ops;
 810                 break;
 811                 /*
 812                  *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
 813                  *      nothing uses it.
 814                  */
 815         case SOCK_RAW:
 816                 sock->type = SOCK_DGRAM;
 817         case SOCK_DGRAM:
 818                 sock->ops = &unix_dgram_ops;
 819                 break;
 820         case SOCK_SEQPACKET:
 821                 sock->ops = &unix_seqpacket_ops;
 822                 break;
 823         default:
 824                 return -ESOCKTNOSUPPORT;
 825         }
 826
 827         return unix_create1(net, sock, kern) ? 0 : -ENOMEM;
 828 }
 829
 830 static int unix_release(struct socket *sock)
 831 {
 832         struct sock *sk = sock->sk;
 833
 834         if (!sk)
 835                 return 0;
 836
 837         unix_release_sock(sk, 0);
 838         sock->sk = NULL;
 839
 840         return 0;
 841 }
 842
 843 static int unix_autobind(struct socket *sock)
 844 {
 845         struct sock *sk = sock->sk;
 846         struct net *net = sock_net(sk);
 847         struct unix_sock *u = unix_sk(sk);
 848         static u32 ordernum = 1;
 849         struct unix_address *addr;
 850         int err;
 851         unsigned int retries = 0;
 852
 853         err = mutex_lock_interruptible(&u->bindlock);
 854         if (err)
 855                 return err;
 856
 857         err = 0;
 858         if (u->addr)
 859                 goto out;
 860
 861         err = -ENOMEM;
 862         addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
 863         if (!addr)
 864                 goto out;
 865
 866         addr->name->sun_family = AF_UNIX;
 867         refcount_set(&addr->refcnt, 1);
 868
 869 retry:
 870         addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
 871         addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
 872
 873         spin_lock(&unix_table_lock);
 874         ordernum = (ordernum+1)&0xFFFFF;
 875
 876         if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
 877                                       addr->hash)) {
 878                 spin_unlock(&unix_table_lock);
 879                 /*
 880                  * __unix_find_socket_byname() may take long time if many names
 881                  * are already in use.
 882                  */
 883                 cond_resched();
 884                 /* Give up if all names seems to be in use. */
 885                 if (retries++ == 0xFFFFF) {
 886                         err = -ENOSPC;
 887                         kfree(addr);
 888                         goto out;
 889                 }
 890                 goto retry;
 891         }
 892         addr->hash ^= sk->sk_type;
 893
 894         __unix_remove_socket(sk);
 895         u->addr = addr;
 896         __unix_insert_socket(&unix_socket_table[addr->hash], sk);
 897         spin_unlock(&unix_table_lock);
 898         err = 0;
 899
 900 out:    mutex_unlock(&u->bindlock);
 901         return err;
 902 }
 903
 904 static struct sock *unix_find_other(struct net *net,
 905                                     struct sockaddr_un *sunname, int len,
 906                                     int type, unsigned int hash, int *error)
 907 {
 908         struct sock *u;
 909         struct path path;
 910         int err = 0;
 911
 912         if (sunname->sun_path[0]) {
 913                 struct inode *inode;
 914                 err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
 915                 if (err)
 916                         goto fail;
 917                 inode = d_backing_inode(path.dentry);
 918                 err = inode_permission(inode, MAY_WRITE);
 919                 if (err)
 920                         goto put_fail;
 921
 922                 err = -ECONNREFUSED;
 923                 if (!S_ISSOCK(inode->i_mode))
 924                         goto put_fail;
 925                 u = unix_find_socket_byinode(inode);
 926                 if (!u)
 927                         goto put_fail;
 928
 929                 if (u->sk_type == type)
 930                         touch_atime(&path);
 931
 932                 path_put(&path);
 933
 934                 err = -EPROTOTYPE;
 935                 if (u->sk_type != type) {
 936                         sock_put(u);
 937                         goto fail;
 938                 }
 939         } else {
 940                 err = -ECONNREFUSED;
 941                 u = unix_find_socket_byname(net, sunname, len, type, hash);
 942                 if (u) {
 943                         struct dentry *dentry;
 944                         dentry = unix_sk(u)->path.dentry;
 945                         if (dentry)
 946                                 touch_atime(&unix_sk(u)->path);
 947                 } else
 948                         goto fail;
 949         }
 950         return u;
 951
 952 put_fail:
 953         path_put(&path);
 954 fail:
 955         *error = err;
 956         return NULL;
 957 }
 958
 959 static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
 960 {
 961         struct dentry *dentry;
 962         struct path path;
 963         int err = 0;
 964         /*
 965          * Get the parent directory, calculate the hash for last
 966          * component.
 967          */
 968         dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
 969         err = PTR_ERR(dentry);
 970         if (IS_ERR(dentry))
 971                 return err;
 972
 973         /*
 974          * All right, let's create it.
 975          */
 976         err = security_path_mknod(&path, dentry, mode, 0);
 977         if (!err) {
 978                 err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0);
 979                 if (!err) {
 980                         res->mnt = mntget(path.mnt);
 981                         res->dentry = dget(dentry);
 982                 }
 983         }
 984         done_path_create(&path, dentry);
 985         return err;
 986 }
 987
 /*
  * Bind an AF_UNIX socket to the address in @uaddr.
  *
  * Three cases are handled:
  *  - addr_len == sizeof(short) (family only): delegate to unix_autobind().
  *  - sun_path[0] != 0: a socket inode is created with unix_mknod() and the
  *    socket is hashed by that inode's number.
  *  - sun_path[0] == 0: the name is checked for uniqueness with
  *    __unix_find_socket_byname() and the socket is hashed by addr->hash.
  *
  * Returns 0 on success or a negative errno: -EINVAL for a too-short address,
  * a family other than AF_UNIX, or a socket that already has an address;
  * -EADDRINUSE when the name is taken (including -EEXIST from unix_mknod());
  * -ENOMEM when the address copy cannot be allocated; otherwise whatever
  * unix_autobind(), unix_mkname(), unix_mknod() or
  * mutex_lock_interruptible() reported.
  */
 988 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 989 {
 990         struct sock *sk = sock->sk;
 991         struct net *net = sock_net(sk);
 992         struct unix_sock *u = unix_sk(sk);
 993         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
 994         char *sun_path = sunaddr->sun_path;
 995         int err;
 996         unsigned int hash;
 997         struct unix_address *addr;
 998         struct hlist_head *list;
 999         struct path path = { };
1000
1001         err = -EINVAL;
1002         if (addr_len < offsetofend(struct sockaddr_un, sun_family) ||
1003             sunaddr->sun_family != AF_UNIX)
1004                 goto out;
1005
             /* Only the family was supplied: let the kernel pick a name. */
1006         if (addr_len == sizeof(short)) {
1007                 err = unix_autobind(sock);
1008                 goto out;
1009         }
1010
             /* On success the return value is used as the address length. */
1011         err = unix_mkname(sunaddr, addr_len, &hash);
1012         if (err < 0)
1013                 goto out;
1014         addr_len = err;
1015
             /*
              * Filesystem name: create the socket inode before taking
              * bindlock.  The mode is the socket inode's mode with the
              * caller's umask applied.
              */
1016         if (sun_path[0]) {
1017                 umode_t mode = S_IFSOCK |
1018                        (SOCK_INODE(sock)->i_mode & ~current_umask());
1019                 err = unix_mknod(sun_path, mode, &path);
1020                 if (err) {
1021                         if (err == -EEXIST)
1022                                 err = -EADDRINUSE;
1023                         goto out;
1024                 }
1025         }
1026
1027         err = mutex_lock_interruptible(&u->bindlock);
1028         if (err)
1029                 goto out_put;
1030
             /* A socket that already has an address cannot be bound again. */
1031         err = -EINVAL;
1032         if (u->addr)
1033                 goto out_up;
1034
1035         err = -ENOMEM;
1036         addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
1037         if (!addr)
1038                 goto out_up;
1039
1040         memcpy(addr->name, sunaddr, addr_len);
1041         addr->len = addr_len;
1042         addr->hash = hash ^ sk->sk_type;
1043         refcount_set(&addr->refcnt, 1);
1044
1045         if (sun_path[0]) {
                     /*
                      * Filesystem sockets are chained by inode number rather
                      * than by name hash; addr->hash is set to UNIX_HASH_SIZE
                      * (presumably an out-of-range marker - confirm).
                      */
1046                 addr->hash = UNIX_HASH_SIZE;
1047                 hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1);
1048                 spin_lock(&unix_table_lock);
                     /* The socket takes over the references held in path. */
1049                 u->path = path;
1050                 list = &unix_socket_table[hash];
1051         } else {
1052                 spin_lock(&unix_table_lock);
1053                 err = -EADDRINUSE;
1054                 if (__unix_find_socket_byname(net, sunaddr, addr_len,
1055                                               sk->sk_type, hash)) {
1056                         unix_release_addr(addr);
1057                         goto out_unlock;
1058                 }
1059
1060                 list = &unix_socket_table[addr->hash];
1061         }
1062
             /* Move the socket from its current chain to the new one. */
1063         err = 0;
1064         __unix_remove_socket(sk);
1065         u->addr = addr;
1066         __unix_insert_socket(list, sk);
1067
1068 out_unlock:
1069         spin_unlock(&unix_table_lock);
1070 out_up:
1071         mutex_unlock(&u->bindlock);
1072 out_put:
             /*
              * NOTE(review): on failure only the path references are dropped
              * here; the inode created by unix_mknod() above is not removed
              * in this function - confirm whether that is intended.
              */
1073         if (err)
1074                 path_put(&path);
1075 out:
1076         return err;
1077 }
1078
/*
 * Take the state locks of @sk1 and @sk2.
 *
 * When @sk2 is NULL or identical to @sk1 only @sk1 is locked.  Otherwise
 * the socket with the lower address is locked first and the other one is
 * taken with the nested variant.
 */
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	struct sock *first, *second;

	if (!sk2 || unlikely(sk1 == sk2)) {
		unix_state_lock(sk1);
		return;
	}

	if (sk1 < sk2) {
		first = sk1;
		second = sk2;
	} else {
		first = sk2;
		second = sk1;
	}

	unix_state_lock(first);
	unix_state_lock_nested(second);
}
1093
/*
 * Release the state locks taken by unix_state_double_lock(): @sk1 always,
 * @sk2 only when it is non-NULL and different from @sk1.
 */
static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	unix_state_unlock(sk1);

	if (!sk2 || unlikely(sk1 == sk2))
		return;

	unix_state_unlock(sk2);
}
1103
/*
 * Set, replace or clear the peer of a datagram socket.
 *
 * With a family other than AF_UNSPEC the target is looked up with
 * unix_find_other() and becomes the new peer, provided unix_may_send() and
 * security_unix_may_send() allow it.  With AF_UNSPEC the peer is cleared.
 * A previously set peer is detached and its reference dropped.
 *
 * Returns 0 on success or a negative errno: -EINVAL when @alen does not
 * cover sa_family, -EPERM when unix_may_send() refuses, or the error from
 * unix_mkname(), unix_autobind(), unix_find_other() or
 * security_unix_may_send().  @flags is not used.
 */
1104 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1105                               int alen, int flags)
1106 {
1107         struct sock *sk = sock->sk;
1108         struct net *net = sock_net(sk);
1109         struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1110         struct sock *other;
1111         unsigned int hash;
1112         int err;
1113
1114         err = -EINVAL;
1115         if (alen < offsetofend(struct sockaddr, sa_family))
1116                 goto out;
1117
1118         if (addr->sa_family != AF_UNSPEC) {
1119                 err = unix_mkname(sunaddr, alen, &hash);
1120                 if (err < 0)
1121                         goto out;
1122                 alen = err;
1123
                     /* With SOCK_PASSCRED set, an unbound socket is autobound first. */
1124                 if (test_bit(SOCK_PASSCRED, &sock->flags) &&
1125                     !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
1126                         goto out;
1127
1128 restart:
                     /* On success we own a reference on other. */
1129                 other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
1130                 if (!other)
1131                         goto out;
1132
1133                 unix_state_double_lock(sk, other);
1134
1135                 /* Apparently VFS overslept socket death. Retry. */
1136                 if (sock_flag(other, SOCK_DEAD)) {
1137                         unix_state_double_unlock(sk, other);
1138                         sock_put(other);
1139                         goto restart;
1140                 }
1141
1142                 err = -EPERM;
1143                 if (!unix_may_send(sk, other))
1144                         goto out_unlock;
1145
1146                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1147                 if (err)
1148                         goto out_unlock;
1149
1150         } else {
1151                 /*
1152                  *      1003.1g breaking connected state with AF_UNSPEC
1153                  */
1154                 other = NULL;
                     /* With other == NULL only sk's state lock is taken. */
1155                 unix_state_double_lock(sk, other);
1156         }
1157
1158         /*
1159          * If it was connected, reconnect.
1160          */
1161         if (unix_peer(sk)) {
1162                 struct sock *old_peer = unix_peer(sk);
1163                 unix_peer(sk) = other;
1164                 unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1165
1166                 unix_state_double_unlock(sk, other);
1167
                     /* Run after the locks are dropped, and only if the peer changed. */
1168                 if (other != old_peer)
1169                         unix_dgram_disconnected(sk, old_peer);
                     /* Drop the reference that went with the old peer pointer. */
1170                 sock_put(old_peer);
1171         } else {
                     /* The reference from unix_find_other() stays with unix_peer(sk). */
1172                 unix_peer(sk) = other;
1173                 unix_state_double_unlock(sk, other);
1174         }
1175         return 0;
1176
1177 out_unlock:
1178         unix_state_double_unlock(sk, other);
1179         sock_put(other);
1180 out:
1181         return err;
1182 }
1183
1184 static long unix_wait_for_peer(struct sock *other, long timeo)
1185 {
1186         struct unix_sock *u = unix_sk(other);
1187         int sched;
1188         DEFINE_WAIT(wait);
1189
1190         prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1191
1192         sched = !sock_flag(other, SOCK_DEAD) &&
1193                 !(other->sk_shutdown & RCV_SHUTDOWN) &&
1194                 unix_recvq_full(other);
1195
1196         unix_state_unlock(other);
1197
1198         if (sched)
1199                 timeo = schedule_timeout(timeo);
1200
1201         finish_wait(&u->peer_wait, &wait);
1202         return timeo;
1203 }
1204
/*
 * Connect @sock to the listening socket named by @uaddr.
 *
 * A new sock (newsk) and an skb of size 1 charged to it are allocated
 * before any lock is taken.  Once the listener has been found and both
 * state locks are held, newsk is set up as the peer of sk and inherits the
 * listener's address and path; sk is marked established with newsk as its
 * peer; and the skb is queued on the listener's receive queue, where
 * unix_accept() later picks the sock up through skb->sk (presumably newsk,
 * set by sock_wmalloc() - confirm).
 *
 * Returns 0 on success or a negative errno: the error from unix_mkname(),
 * unix_autobind(), unix_find_other() or security_unix_stream_connect();
 * -ENOMEM on allocation failure; -ECONNREFUSED when the target is not
 * listening or is shut down for receive; -EAGAIN when its queue is full and
 * the send timeout is zero; sock_intr_errno() when a signal is pending
 * after waiting; -EISCONN when sk is already established; -EINVAL for any
 * other state of sk.
 */
1205 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1206                                int addr_len, int flags)
1207 {
1208         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1209         struct sock *sk = sock->sk;
1210         struct net *net = sock_net(sk);
1211         struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1212         struct sock *newsk = NULL;
1213         struct sock *other = NULL;
1214         struct sk_buff *skb = NULL;
1215         unsigned int hash;
1216         int st;
1217         int err;
1218         long timeo;
1219
1220         err = unix_mkname(sunaddr, addr_len, &hash);
1221         if (err < 0)
1222                 goto out;
1223         addr_len = err;
1224
             /* With SOCK_PASSCRED set, an unbound socket is autobound first. */
1225         if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
1226             (err = unix_autobind(sock)) != 0)
1227                 goto out;
1228
1229         timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1230
1231         /* First of all allocate resources.
1232            If we will make it after state is locked,
1233            we will have to recheck all again in any case.
1234          */
1235
1236         err = -ENOMEM;
1237
1238         /* create new sock for complete connection */
1239         newsk = unix_create1(sock_net(sk), NULL, 0);
1240         if (newsk == NULL)
1241                 goto out;
1242
1243         /* Allocate skb for sending to listening sock */
1244         skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1245         if (skb == NULL)
1246                 goto out;
1247
1248 restart:
1249         /*  Find listening sock. */
1250         other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
1251         if (!other)
1252                 goto out;
1253
1254         /* Latch state of peer */
1255         unix_state_lock(other);
1256
1257         /* Apparently VFS overslept socket death. Retry. */
1258         if (sock_flag(other, SOCK_DEAD)) {
1259                 unix_state_unlock(other);
1260                 sock_put(other);
1261                 goto restart;
1262         }
1263
1264         err = -ECONNREFUSED;
1265         if (other->sk_state != TCP_LISTEN)
1266                 goto out_unlock;
1267         if (other->sk_shutdown & RCV_SHUTDOWN)
1268                 goto out_unlock;
1269
1270         if (unix_recvq_full(other)) {
1271                 err = -EAGAIN;
1272                 if (!timeo)
1273                         goto out_unlock;
1274
                     /* unix_wait_for_peer() releases other's state lock. */
1275                 timeo = unix_wait_for_peer(other, timeo);
1276
1277                 err = sock_intr_errno(timeo);
                     /* The "out" path drops the reference on other. */
1278                 if (signal_pending(current))
1279                         goto out;
1280                 sock_put(other);
1281                 goto restart;
1282         }
1283
1284         /* Latch our state.
1285
1286            It is tricky place. We need to grab our state lock and cannot
1287            drop lock on peer. It is dangerous because deadlock is
1288            possible. Connect to self case and simultaneous
1289            attempt to connect are eliminated by checking socket
1290            state. other is TCP_LISTEN, if sk is TCP_LISTEN we
1291            check this before attempt to grab lock.
1292
1293            Well, and we have to recheck the state after socket locked.
1294          */
1295         st = sk->sk_state;
1296
1297         switch (st) {
1298         case TCP_CLOSE:
1299                 /* This is ok... continue with connect */
1300                 break;
1301         case TCP_ESTABLISHED:
1302                 /* Socket is already connected */
1303                 err = -EISCONN;
1304                 goto out_unlock;
1305         default:
1306                 err = -EINVAL;
1307                 goto out_unlock;
1308         }
1309
1310         unix_state_lock_nested(sk);
1311
             /* State changed between the unlocked read and the lock: start over. */
1312         if (sk->sk_state != st) {
1313                 unix_state_unlock(sk);
1314                 unix_state_unlock(other);
1315                 sock_put(other);
1316                 goto restart;
1317         }
1318
1319         err = security_unix_stream_connect(sk, other, newsk);
1320         if (err) {
1321                 unix_state_unlock(sk);
1322                 goto out_unlock;
1323         }
1324
1325         /* The way is open! Fastly set all the necessary fields... */
1326
             /* newsk's peer pointer holds a reference on sk. */
1327         sock_hold(sk);
1328         unix_peer(newsk)        = sk;
1329         newsk->sk_state         = TCP_ESTABLISHED;
1330         newsk->sk_type          = sk->sk_type;
1331         init_peercred(newsk);
1332         newu = unix_sk(newsk);
1333         RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1334         otheru = unix_sk(other);
1335
1336         /* copy address information from listening to new sock*/
1337         if (otheru->addr) {
1338                 refcount_inc(&otheru->addr->refcnt);
1339                 newu->addr = otheru->addr;
1340         }
1341         if (otheru->path.dentry) {
1342                 path_get(&otheru->path);
1343                 newu->path = otheru->path;
1344         }
1345
1346         /* Set credentials */
1347         copy_peercred(sk, other);
1348
1349         sock->state     = SS_CONNECTED;
1350         sk->sk_state    = TCP_ESTABLISHED;
             /* sk's peer pointer holds a reference on newsk. */
1351         sock_hold(newsk);
1352
1353         smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
1354         unix_peer(sk)   = newsk;
1355
1356         unix_state_unlock(sk);
1357
1358         /* take ten and send info to listening sock */
1359         spin_lock(&other->sk_receive_queue.lock);
1360         __skb_queue_tail(&other->sk_receive_queue, skb);
1361         spin_unlock(&other->sk_receive_queue.lock);
1362         unix_state_unlock(other);
1363         other->sk_data_ready(other);
1364         sock_put(other);
1365         return 0;
1366
1367 out_unlock:
1368         if (other)
1369                 unix_state_unlock(other);
1370
1371 out:
             /* Undo the up-front allocations and drop the lookup reference. */
1372         kfree_skb(skb);
1373         if (newsk)
1374                 unix_release_sock(newsk, 0);
1375         if (other)
1376                 sock_put(other);
1377         return err;
1378 }
1379
1380 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1381 {
1382         struct sock *ska = socka->sk, *skb = sockb->sk;
1383
1384         /* Join our sockets back to back */
1385         sock_hold(ska);
1386         sock_hold(skb);
1387         unix_peer(ska) = skb;
1388         unix_peer(skb) = ska;
1389         init_peercred(ska);
1390         init_peercred(skb);
1391
1392         if (ska->sk_type != SOCK_DGRAM) {
1393                 ska->sk_state = TCP_ESTABLISHED;
1394                 skb->sk_state = TCP_ESTABLISHED;
1395                 socka->state  = SS_CONNECTED;
1396                 sockb->state  = SS_CONNECTED;
1397         }
1398         return 0;
1399 }
1400
1401 static void unix_sock_inherit_flags(const struct socket *old,
1402                                     struct socket *new)
1403 {
1404         if (test_bit(SOCK_PASSCRED, &old->flags))
1405                 set_bit(SOCK_PASSCRED, &new->flags);
1406         if (test_bit(SOCK_PASSSEC, &old->flags))
1407                 set_bit(SOCK_PASSSEC, &new->flags);
1408 }
1409
1410 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1411                        bool kern)
1412 {
1413         struct sock *sk = sock->sk;
1414         struct sock *tsk;
1415         struct sk_buff *skb;
1416         int err;
1417
1418         err = -EOPNOTSUPP;
1419         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1420                 goto out;
1421
1422         err = -EINVAL;
1423         if (sk->sk_state != TCP_LISTEN)
1424                 goto out;
1425
1426         /* If socket state is TCP_LISTEN it cannot change (for now...),
1427          * so that no locks are necessary.
1428          */
1429
1430         skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1431         if (!skb) {
1432                 /* This means receive shutdown. */
1433                 if (err == 0)
1434                         err = -EINVAL;
1435                 goto out;
1436         }
1437
1438         tsk = skb->sk;
1439         skb_free_datagram(sk, skb);
1440         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1441
1442         /* attach accepted sock to socket */
1443         unix_state_lock(tsk);
1444         newsock->state = SS_CONNECTED;
1445         unix_sock_inherit_flags(sock, newsock);
1446         sock_graft(tsk, newsock);
1447         unix_state_unlock(tsk);
1448         return 0;
1449
1450 out:
1451         return err;
1452 }
1453
1454
1455 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1456 {
1457         struct sock *sk = sock->sk;
1458         struct unix_sock *u;
1459         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1460         int err = 0;
1461
1462         if (peer) {
1463                 sk = unix_peer_get(sk);
1464
1465                 err = -ENOTCONN;
1466                 if (!sk)
1467                         goto out;
1468                 err = 0;
1469         } else {
1470                 sock_hold(sk);
1471         }
1472
1473         u = unix_sk(sk);
1474         unix_state_lock(sk);
1475         if (!u->addr) {
1476                 sunaddr->sun_family = AF_UNIX;
1477                 sunaddr->sun_path[0] = 0;
1478                 *uaddr_len = sizeof(short);
1479         } else {
1480                 struct unix_address *addr = u->addr;
1481
1482                 *uaddr_len = addr->len;
1483                 memcpy(sunaddr, addr->name, *uaddr_len);
1484         }
1485         unix_state_unlock(sk);
1486         sock_put(sk);
1487 out:
1488         return err;
1489 }
1490
1491 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1492 {
1493         int i;
1494
1495         scm->fp = UNIXCB(skb).fp;
1496         UNIXCB(skb).fp = NULL;
1497
1498         for (i = scm->fp->count-1; i >= 0; i--)
1499                 unix_notinflight(scm->fp->user, scm->fp->fp[i]);
1500 }
1501
1502 static void unix_destruct_scm(struct sk_buff *skb)
1503 {
1504         struct scm_cookie scm;
1505         memset(&scm, 0, sizeof(scm));
1506         scm.pid  = UNIXCB(skb).pid;
1507         if (UNIXCB(skb).fp)
1508                 unix_detach_fds(&scm, skb);
1509
1510         /* Alas, it calls VFS */
1511         /* So fscking what? fput() had been SMP-safe since the last Summer */
1512         scm_destroy(&scm);
1513         sock_wfree(skb);
1514 }
1515
1516 /*
1517  * The "user->unix_inflight" variable is protected by the garbage
1518  * collection lock, and we just read it locklessly here. If you go
1519  * over the limit, there might be a tiny race in actually noticing
1520  * it across threads. Tough.
1521  */
1522 static inline bool too_many_unix_fds(struct task_struct *p)
1523 {
1524         struct user_struct *user = current_user();
1525
1526         if (unlikely(user->unix_inflight > task_rlimit(p, RLIMIT_NOFILE)))
1527                 return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
1528         return false;
1529 }
1530
1531 #define MAX_RECURSION_LEVEL 4
1532
1533 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1534 {
1535         int i;
1536         unsigned char max_level = 0;
1537
1538         if (too_many_unix_fds(current))
1539                 return -ETOOMANYREFS;
1540
1541         for (i = scm->fp->count - 1; i >= 0; i--) {
1542                 struct sock *sk = unix_get_socket(scm->fp->fp[i]);
1543
1544                 if (sk)
1545                         max_level = max(max_level,
1546                                         unix_sk(sk)->recursion_level);
1547         }
1548         if (unlikely(max_level > MAX_RECURSION_LEVEL))
1549                 return -ETOOMANYREFS;
1550
1551         /*
1552          * Need to duplicate file references for the sake of garbage
1553          * collection.  Otherwise a socket in the fps might become a
1554          * candidate for GC while the skb is not yet queued.
1555          */
1556         UNIXCB(skb).fp = scm_fp_dup(scm->fp);
1557         if (!UNIXCB(skb).fp)
1558                 return -ENOMEM;
1559
1560         for (i = scm->fp->count - 1; i >= 0; i--)
1561                 unix_inflight(scm->fp->user, scm->fp->fp[i]);
1562         return max_level;
1563 }
1564
1565 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1566 {
1567         int err = 0;
1568
1569         UNIXCB(skb).pid  = get_pid(scm->pid);
1570         UNIXCB(skb).uid = scm->creds.uid;
1571         UNIXCB(skb).gid = scm->creds.gid;
1572         UNIXCB(skb).fp = NULL;
1573         unix_get_secdata(scm, skb);
1574         if (scm->fp && send_fds)
1575                 err = unix_attach_fds(scm, skb);
1576
1577         skb->destructor = unix_destruct_scm;
1578         return err;
1579 }
1580
1581 static bool unix_passcred_enabled(const struct socket *sock,
1582                                   const struct sock *other)
1583 {
1584         return test_bit(SOCK_PASSCRED, &sock->flags) ||
1585                !other->sk_socket ||
1586                test_bit(SOCK_PASSCRED, &other->sk_socket->flags);
1587 }
1588
1589 /*
1590  * Some apps rely on write() giving SCM_CREDENTIALS
1591  * We include credentials if source or destination socket
1592  * asserted SOCK_PASSCRED.
1593  */
1594 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1595                             const struct sock *other)
1596 {
1597         if (UNIXCB(skb).pid)
1598                 return;
1599         if (unix_passcred_enabled(sock, other)) {
1600                 UNIXCB(skb).pid  = get_pid(task_tgid(current));
1601                 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1602         }
1603 }
1604
1605 static int maybe_init_creds(struct scm_cookie *scm,
1606                             struct socket *socket,
1607                             const struct sock *other)
1608 {
1609         int err;
1610         struct msghdr msg = { .msg_controllen = 0 };
1611
1612         err = scm_send(socket, &msg, scm, false);
1613         if (err)
1614                 return err;
1615
1616         if (unix_passcred_enabled(socket, other)) {
1617                 scm->pid = get_pid(task_tgid(current));
1618                 current_uid_gid(&scm->creds.uid, &scm->creds.gid);
1619         }
1620         return err;
1621 }
1622
1623 static bool unix_skb_scm_eq(struct sk_buff *skb,
1624                             struct scm_cookie *scm)
1625 {
1626         const struct unix_skb_parms *u = &UNIXCB(skb);
1627
1628         return u->pid == scm->pid &&
1629                uid_eq(u->uid, scm->creds.uid) &&
1630                gid_eq(u->gid, scm->creds.gid) &&
1631                unix_secdata_eq(scm, skb);
1632 }
1633
1634 /*
1635  *      Send AF_UNIX data.
1636  */
1637
1638 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1639                               size_t len)
1640 {
1641         struct sock *sk = sock->sk;
1642         struct net *net = sock_net(sk);
1643         struct unix_sock *u = unix_sk(sk);
1644         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1645         struct sock *other = NULL;
1646         int namelen = 0; /* fake GCC */
1647         int err;
1648         unsigned int hash;
1649         struct sk_buff *skb;
1650         long timeo;
1651         struct scm_cookie scm;
1652         int max_level;
1653         int data_len = 0;
1654         int sk_locked;
1655
1656         wait_for_unix_gc();
1657         err = scm_send(sock, msg, &scm, false);
1658         if (err < 0)
1659                 return err;
1660
1661         err = -EOPNOTSUPP;
1662         if (msg->msg_flags&MSG_OOB)
1663                 goto out;
1664
1665         if (msg->msg_namelen) {
1666                 err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1667                 if (err < 0)
1668                         goto out;
1669                 namelen = err;
1670         } else {
1671                 sunaddr = NULL;
1672                 err = -ENOTCONN;
1673                 other = unix_peer_get(sk);
1674                 if (!other)
1675                         goto out;
1676         }
1677
1678         if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
1679             && (err = unix_autobind(sock)) != 0)
1680                 goto out;
1681
1682         err = -EMSGSIZE;
1683         if (len > sk->sk_sndbuf - 32)
1684                 goto out;
1685
1686         if (len > SKB_MAX_ALLOC) {
1687                 data_len = min_t(size_t,
1688                                  len - SKB_MAX_ALLOC,
1689                                  MAX_SKB_FRAGS * PAGE_SIZE);
1690                 data_len = PAGE_ALIGN(data_len);
1691
1692                 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1693         }
1694
1695         skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1696                                    msg->msg_flags & MSG_DONTWAIT, &err,
1697                                    PAGE_ALLOC_COSTLY_ORDER);
1698         if (skb == NULL)
1699                 goto out;
1700
1701         err = unix_scm_to_skb(&scm, skb, true);
1702         if (err < 0)
1703                 goto out_free;
1704         max_level = err + 1;
1705
1706         skb_put(skb, len - data_len);
1707         skb->data_len = data_len;
1708         skb->len = len;
1709         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1710         if (err)
1711                 goto out_free;
1712
1713         timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1714
1715 restart:
1716         if (!other) {
1717                 err = -ECONNRESET;
1718                 if (sunaddr == NULL)
1719                         goto out_free;
1720
1721                 other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
1722                                         hash, &err);
1723                 if (other == NULL)
1724                         goto out_free;
1725         }
1726
1727         if (sk_filter(other, skb) < 0) {
1728                 /* Toss the packet but do not return any error to the sender */
1729                 err = len;
1730                 goto out_free;
1731         }
1732
1733         sk_locked = 0;
1734         unix_state_lock(other);
1735 restart_locked:
1736         err = -EPERM;
1737         if (!unix_may_send(sk, other))
1738                 goto out_unlock;
1739
1740         if (unlikely(sock_flag(other, SOCK_DEAD))) {
1741                 /*
1742                  *      Check with 1003.1g - what should
1743                  *      datagram error
1744                  */
1745                 unix_state_unlock(other);
1746                 sock_put(other);
1747
1748                 if (!sk_locked)
1749                         unix_state_lock(sk);
1750
1751                 err = 0;
1752                 if (unix_peer(sk) == other) {
1753                         unix_peer(sk) = NULL;
1754                         unix_dgram_peer_wake_disconnect_wakeup(sk, other);
1755
1756                         unix_state_unlock(sk);
1757
1758                         unix_dgram_disconnected(sk, other);
1759                         sock_put(other);
1760                         err = -ECONNREFUSED;
1761                 } else {
1762                         unix_state_unlock(sk);
1763                 }
1764
1765                 other = NULL;
1766                 if (err)
1767                         goto out_free;
1768                 goto restart;
1769         }
1770
1771         err = -EPIPE;
1772         if (other->sk_shutdown & RCV_SHUTDOWN)
1773                 goto out_unlock;
1774
1775         if (sk->sk_type != SOCK_SEQPACKET) {
1776                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1777                 if (err)
1778                         goto out_unlock;
1779         }
1780
1781         /* other == sk && unix_peer(other) != sk if
1782          * - unix_peer(sk) == NULL, destination address bound to sk
1783          * - unix_peer(sk) == sk by time of get but disconnected before lock
1784          */
1785         if (other != sk &&
1786             unlikely(unix_peer(other) != sk && unix_recvq_full(other))) {
1787                 if (timeo) {
1788                         timeo = unix_wait_for_peer(other, timeo);
1789
1790                         err = sock_intr_errno(timeo);
1791                         if (signal_pending(current))
1792                                 goto out_free;
1793
1794                         goto restart;
1795                 }
1796
1797                 if (!sk_locked) {
1798                         unix_state_unlock(other);
1799                         unix_state_double_lock(sk, other);
1800                 }
1801
1802                 if (unix_peer(sk) != other ||
1803                     unix_dgram_peer_wake_me(sk, other)) {
1804                         err = -EAGAIN;
1805                         sk_locked = 1;
1806                         goto out_unlock;
1807                 }
1808
1809                 if (!sk_locked) {
1810                         sk_locked = 1;
1811                         goto restart_locked;
1812                 }
1813         }
1814
1815         if (unlikely(sk_locked))
1816                 unix_state_unlock(sk);
1817
1818         if (sock_flag(other, SOCK_RCVTSTAMP))
1819                 __net_timestamp(skb);
1820         maybe_add_creds(skb, sock, other);
1821         skb_queue_tail(&other->sk_receive_queue, skb);
1822         if (max_level > unix_sk(other)->recursion_level)
1823                 unix_sk(other)->recursion_level = max_level;
1824         unix_state_unlock(other);
1825         other->sk_data_ready(other);
1826         sock_put(other);
1827         scm_destroy(&scm);
1828         return len;
1829
1830 out_unlock:
1831         if (sk_locked)
1832                 unix_state_unlock(sk);
1833         unix_state_unlock(other);
1834 out_free:
1835         kfree_skb(skb);
1836 out:
1837         if (other)
1838                 sock_put(other);
1839         scm_destroy(&scm);
1840         return err;
1841 }
1842
1843 /* We use paged skbs for stream sockets, and limit occupancy to 32768
1844  * bytes, and a minimum of a full page.
1845  */
1846 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
1847
/*
 * Stream-socket sendmsg: queue up to @len bytes from @msg on the connected
 * peer's receive queue.
 *
 * MSG_OOB is refused with -EOPNOTSUPP.  A destination address is refused
 * with -EISCONN on an established socket and -EOPNOTSUPP otherwise; without
 * a peer the call fails with -ENOTCONN.  The payload is cut into several
 * skbs, each appended to the peer's queue under the peer's state lock.
 *
 * Returns the number of bytes queued.  On an error path, partial progress
 * is returned in preference to the negative errno.  An -EPIPE failure with
 * nothing sent also raises SIGPIPE unless MSG_NOSIGNAL is set.
 */
1848 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
1849                                size_t len)
1850 {
1851         struct sock *sk = sock->sk;
1852         struct sock *other = NULL;
1853         int err, size;
1854         struct sk_buff *skb;
1855         int sent = 0;
1856         struct scm_cookie scm;
1857         bool fds_sent = false;
1858         int max_level;
1859         int data_len;
1860
1861         wait_for_unix_gc();
1862         err = scm_send(sock, msg, &scm, false);
1863         if (err < 0)
1864                 return err;
1865
1866         err = -EOPNOTSUPP;
1867         if (msg->msg_flags&MSG_OOB)
1868                 goto out_err;
1869
             /* An explicit destination is never accepted on a stream socket. */
1870         if (msg->msg_namelen) {
1871                 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1872                 goto out_err;
1873         } else {
1874                 err = -ENOTCONN;
1875                 other = unix_peer(sk);
1876                 if (!other)
1877                         goto out_err;
1878         }
1879
             /* Our own sending side is shut down: report -EPIPE. */
1880         if (sk->sk_shutdown & SEND_SHUTDOWN)
1881                 goto pipe_err;
1882
1883         while (sent < len) {
1884                 size = len - sent;
1885
1886                 /* Keep two messages in the pipe so it schedules better */
1887                 size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
1888
1889                 /* allow fallback to order-0 allocations */
1890                 size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
1891
                     /* Whatever exceeds SKB_MAX_HEAD(0) becomes data_len:
                      * rounded up to a page, but never more than size.
                      */
1892                 data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
1893
1894                 data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
1895
1896                 skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
1897                                            msg->msg_flags & MSG_DONTWAIT, &err,
1898                                            get_order(UNIX_SKB_FRAGS_SZ));
1899                 if (!skb)
1900                         goto out_err;
1901
1902                 /* Only send the fds in the first buffer */
1903                 err = unix_scm_to_skb(&scm, skb, !fds_sent);
1904                 if (err < 0) {
1905                         kfree_skb(skb);
1906                         goto out_err;
1907                 }
                     /* The non-negative result, plus one, is compared with the
                      * peer's recursion_level once the skb has been queued.
                      */
1908                 max_level = err + 1;
1909                 fds_sent = true;
1910
1911                 skb_put(skb, size - data_len);
1912                 skb->data_len = data_len;
1913                 skb->len = size;
1914                 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
1915                 if (err) {
1916                         kfree_skb(skb);
1917                         goto out_err;
1918                 }
1919
1920                 unix_state_lock(other);
1921
                     /* Peer is dead or no longer receiving: fail with -EPIPE. */
1922                 if (sock_flag(other, SOCK_DEAD) ||
1923                     (other->sk_shutdown & RCV_SHUTDOWN))
1924                         goto pipe_err_free;
1925
1926                 maybe_add_creds(skb, sock, other);
1927                 skb_queue_tail(&other->sk_receive_queue, skb);
1928                 if (max_level > unix_sk(other)->recursion_level)
1929                         unix_sk(other)->recursion_level = max_level;
1930                 unix_state_unlock(other);
1931                 other->sk_data_ready(other);
1932                 sent += size;
1933         }
1934
1935         scm_destroy(&scm);
1936
1937         return sent;
1938
             /* Reached with the peer's state lock held and skb not yet queued. */
1939 pipe_err_free:
1940         unix_state_unlock(other);
1941         kfree_skb(skb);
1942 pipe_err:
             /* SIGPIPE is raised only when nothing at all was sent. */
1943         if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
1944                 send_sig(SIGPIPE, current, 0);
1945         err = -EPIPE;
1946 out_err:
1947         scm_destroy(&scm);
             /* Partial progress takes precedence over the error code. */
1948         return sent ? : err;
1949 }
1950
/*
 * sendpage for stream sockets: add @size bytes of @page, starting at
 * @offset, to the connected peer's receive queue.
 *
 * The page is appended (skb_append_pagefrags()) to the skb at the tail of
 * the peer's queue when unix_skb_scm_eq() accepts that skb against the
 * sender's scm; otherwise an empty skb is allocated, filled and queued.
 * The allocation is done with both the peer's state lock and its iolock
 * released, after which all checks are repeated.
 *
 * Returns @size on success or a negative errno.  MSG_OOB yields
 * -EOPNOTSUPP, a missing peer or non-established socket -ENOTCONN.  The
 * -EPIPE paths raise SIGPIPE unless MSG_NOSIGNAL is set.
 */
1951 static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
1952                                     int offset, size_t size, int flags)
1953 {
1954         int err;
1955         bool send_sigpipe = false;
1956         bool init_scm = true;
1957         struct scm_cookie scm;
1958         struct sock *other, *sk = socket->sk;
1959         struct sk_buff *skb, *newskb = NULL, *tail = NULL;
1960
1961         if (flags & MSG_OOB)
1962                 return -EOPNOTSUPP;
1963
1964         other = unix_peer(sk);
1965         if (!other || sk->sk_state != TCP_ESTABLISHED)
1966                 return -ENOTCONN;
1967
             /* Never entered from the top.  The block is reached only through
              * "goto alloc_skb", which arrives with the peer's state lock and
              * iolock held; both are dropped before allocating.
              */
1968         if (false) {
1969 alloc_skb:
1970                 unix_state_unlock(other);
1971                 mutex_unlock(&unix_sk(other)->iolock);
1972                 newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
1973                                               &err, 0);
1974                 if (!newskb)
1975                         goto err;
1976         }
1977
1978         /* we must acquire iolock as we modify already present
1979          * skbs in the sk_receive_queue and mess with skb->len
1980          */
1981         err = mutex_lock_interruptible(&unix_sk(other)->iolock);
1982         if (err) {
1983                 err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
1984                 goto err;
1985         }
1986
1987         if (sk->sk_shutdown & SEND_SHUTDOWN) {
1988                 err = -EPIPE;
1989                 send_sigpipe = true;
1990                 goto err_unlock;
1991         }
1992
1993         unix_state_lock(other);
1994
1995         if (sock_flag(other, SOCK_DEAD) ||
1996             other->sk_shutdown & RCV_SHUTDOWN) {
1997                 err = -EPIPE;
1998                 send_sigpipe = true;
1999                 goto err_state_unlock;
2000         }
2001
             /* Done once per call, even when the code loops via alloc_skb. */
2002         if (init_scm) {
2003                 err = maybe_init_creds(&scm, socket, other);
2004                 if (err)
2005                         goto err_state_unlock;
2006                 init_scm = false;
2007         }
2008
             /* tail is the queue tail recorded before the last allocation.
              * If it is still the tail, the reason for allocating still
              * holds, so the new skb is used.
              */
2009         skb = skb_peek_tail(&other->sk_receive_queue);
2010         if (tail && tail == skb) {
2011                 skb = newskb;
2012         } else if (!skb || !unix_skb_scm_eq(skb, &scm)) {
2013                 if (newskb) {
2014                         skb = newskb;
2015                 } else {
2016                         tail = skb;
2017                         goto alloc_skb;
2018                 }
2019         } else if (newskb) {
2020                 /* this is fast path, we don't necessarily need to
2021                  * call to kfree_skb even though with newskb == NULL
2022                  * this - does no harm
2023                  */
2024                 consume_skb(newskb);
2025                 newskb = NULL;
2026         }
2027
             /* The chosen skb could not take the page: remember it and retry
              * with a freshly allocated one.
              */
2028         if (skb_append_pagefrags(skb, page, offset, size)) {
2029                 tail = skb;
2030                 goto alloc_skb;
2031         }
2032
             /* Account the added bytes on the skb and in the sender's
              * sk_wmem_alloc.
              */
2033         skb->len += size;
2034         skb->data_len += size;
2035         skb->truesize += size;
2036         refcount_add(size, &sk->sk_wmem_alloc);
2037
             /* A new skb still has to be given the scm data and queued; an
              * existing tail skb is already on the queue.
              */
2038         if (newskb) {
2039                 err = unix_scm_to_skb(&scm, skb, false);
2040                 if (err)
2041                         goto err_state_unlock;
2042                 spin_lock(&other->sk_receive_queue.lock);
2043                 __skb_queue_tail(&other->sk_receive_queue, newskb);
2044                 spin_unlock(&other->sk_receive_queue.lock);
2045         }
2046
2047         unix_state_unlock(other);
2048         mutex_unlock(&unix_sk(other)->iolock);
2049
2050         other->sk_data_ready(other);
2051         scm_destroy(&scm);
2052         return size;
2053
2054 err_state_unlock:
2055         unix_state_unlock(other);
2056 err_unlock:
2057         mutex_unlock(&unix_sk(other)->iolock);
2058 err:
             /* newskb may still be NULL on this path. */
2059         kfree_skb(newskb);
2060         if (send_sigpipe && !(flags & MSG_NOSIGNAL))
2061                 send_sig(SIGPIPE, current, 0);
             /* scm is only valid once maybe_init_creds() has succeeded. */
2062         if (!init_scm)
2063                 scm_destroy(&scm);
2064         return err;
2065 }
2066
2067 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2068                                   size_t len)
2069 {
2070         int err;
2071         struct sock *sk = sock->sk;
2072
2073         err = sock_error(sk);
2074         if (err)
2075                 return err;
2076
2077         if (sk->sk_state != TCP_ESTABLISHED)
2078                 return -ENOTCONN;
2079
2080         if (msg->msg_namelen)
2081                 msg->msg_namelen = 0;
2082
2083         return unix_dgram_sendmsg(sock, msg, len);
2084 }
2085
2086 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2087                                   size_t size, int flags)
2088 {
2089         struct sock *sk = sock->sk;
2090
2091         if (sk->sk_state != TCP_ESTABLISHED)
2092                 return -ENOTCONN;
2093
2094         return unix_dgram_recvmsg(sock, msg, size, flags);
2095 }
2096
2097 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2098 {
2099         struct unix_sock *u = unix_sk(sk);
2100
2101         if (u->addr) {
2102                 msg->msg_namelen = u->addr->len;
2103                 memcpy(msg->msg_name, u->addr->name, u->addr->len);
2104         }
2105 }
2106
/*
 * Receive one packet from a datagram socket; unix_seqpacket_recvmsg() also
 * ends up here.
 *
 * MSG_OOB is refused with -EOPNOTSUPP.  Otherwise the function waits, within
 * the socket's receive timeout, for a packet, copies at most @size bytes of
 * it into @msg (setting MSG_TRUNC in msg->msg_flags when the packet is
 * longer), fills in the sender's address if msg->msg_name is set, and hands
 * credentials and passed files to scm_recv().
 *
 * Returns the number of bytes copied, or the remaining packet length when
 * MSG_TRUNC was given in @flags.  On failure a negative errno is returned,
 * except that a SEQPACKET socket with RCV_SHUTDOWN set returns 0 instead of
 * -EAGAIN.
 */
2107 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
2108                               size_t size, int flags)
2109 {
2110         struct scm_cookie scm;
2111         struct sock *sk = sock->sk;
2112         struct unix_sock *u = unix_sk(sk);
2113         struct sk_buff *skb, *last;
2114         long timeo;
2115         int err;
2116         int peeked, skip;
2117
2118         err = -EOPNOTSUPP;
2119         if (flags&MSG_OOB)
2120                 goto out;
2121
2122         timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2123
             /* Each attempt is made with iolock held.  On success the loop is
              * left with iolock still held; otherwise it is dropped before
              * waiting for more packets.
              */
2124         do {
2125                 mutex_lock(&u->iolock);
2126
2127                 skip = sk_peek_offset(sk, flags);
2128                 skb = __skb_try_recv_datagram(sk, flags, NULL, &peeked, &skip,
2129                                               &err, &last);
2130                 if (skb)
2131                         break;
2132
2133                 mutex_unlock(&u->iolock);
2134
2135                 if (err != -EAGAIN)
2136                         break;
2137         } while (timeo &&
2138                  !__skb_wait_for_more_packets(sk, &err, &timeo, last));
2139
2140         if (!skb) { /* implies iolock unlocked */
2141                 unix_state_lock(sk);
2142                 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2143                 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2144                     (sk->sk_shutdown & RCV_SHUTDOWN))
2145                         err = 0;
2146                 unix_state_unlock(sk);
2147                 goto out;
2148         }
2149
             /* Notify anyone sleeping on our peer_wait queue with a
              * "writable" poll event.
              */
2150         if (wq_has_sleeper(&u->peer_wait))
2151                 wake_up_interruptible_sync_poll(&u->peer_wait,
2152                                                 POLLOUT | POLLWRNORM |
2153                                                 POLLWRBAND);
2154
2155         if (msg->msg_name)
2156                 unix_copy_addr(msg, skb->sk);
2157
             /* Clamp the copy to what is left of the packet after the peek
              * offset, or flag truncation when the buffer is too small.
              */
2158         if (size > skb->len - skip)
2159                 size = skb->len - skip;
2160         else if (size < skb->len - skip)
2161                 msg->msg_flags |= MSG_TRUNC;
2162
2163         err = skb_copy_datagram_msg(skb, skip, msg, size);
2164         if (err)
2165                 goto out_free;
2166
2167         if (sock_flag(sk, SOCK_RCVTSTAMP))
2168                 __sock_recv_timestamp(msg, sk, skb);
2169
2170         memset(&scm, 0, sizeof(scm));
2171
2172         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2173         unix_set_secdata(&scm, skb);
2174
2175         if (!(flags & MSG_PEEK)) {
                     /* Real read: passed files move from the skb to scm. */
2176                 if (UNIXCB(skb).fp)
2177                         unix_detach_fds(&scm, skb);
2178
2179                 sk_peek_offset_bwd(sk, skb->len);
2180         } else {
2181                 /* It is questionable: on PEEK we could:
2182                    - do not return fds - good, but too simple 8)
2183                    - return fds, and do not return them on read (old strategy,
2184                      apparently wrong)
2185                    - clone fds (I chose it for now, it is the most universal
2186                      solution)
2187
2188                    POSIX 1003.1g does not actually define this clearly
2189                    at all. POSIX 1003.1g doesn't define a lot of things
2190                    clearly however!
2191
2192                 */
2193
2194                 sk_peek_offset_fwd(sk, size);
2195
2196                 if (UNIXCB(skb).fp)
2197                         scm.fp = scm_fp_dup(UNIXCB(skb).fp);
2198         }
             /* With MSG_TRUNC in @flags the remaining packet length is
              * reported, not the number of bytes copied.
              */
2199         err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2200
2201         scm_recv(sock, msg, &scm, flags);
2202
2203 out_free:
2204         skb_free_datagram(sk, skb);
2205         mutex_unlock(&u->iolock);
2206 out:
2207         return err;
2208 }
2209
2210 /*
2211  *      Sleep until more data has arrived. But check for races..
      *
      *      @last and @last_len are the queue tail, and that tail's length,
      *      as the caller last saw them.  Waiting ends as soon as the tail or
      *      its length differs, sk_err is set, RCV_SHUTDOWN is set, a signal
      *      is pending, the timeout has run out, or the socket is found dead
      *      after a wakeup.  @freezable selects freezable_schedule_timeout()
      *      instead of schedule_timeout().
      *
      *      Returns the remaining timeout.
2212  */
2213 static long unix_stream_data_wait(struct sock *sk, long timeo,
2214                                   struct sk_buff *last, unsigned int last_len,
2215                                   bool freezable)
2216 {
2217         struct sk_buff *tail;
2218         DEFINE_WAIT(wait);
2219
2220         unix_state_lock(sk);
2221
2222         for (;;) {
                     /* Register on the wait queue before testing the
                      * conditions, while the state lock is still held.
                      */
2223                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2224
2225                 tail = skb_peek_tail(&sk->sk_receive_queue);
2226                 if (tail != last ||
2227                     (tail && tail->len != last_len) ||
2228                     sk->sk_err ||
2229                     (sk->sk_shutdown & RCV_SHUTDOWN) ||
2230                     signal_pending(current) ||
2231                     !timeo)
2232                         break;
2233
                     /* The state lock is released only for the actual sleep. */
2234                 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2235                 unix_state_unlock(sk);
2236                 if (freezable)
2237                         timeo = freezable_schedule_timeout(timeo);
2238                 else
2239                         timeo = schedule_timeout(timeo);
2240                 unix_state_lock(sk);
2241
                     /* On this exit SOCKWQ_ASYNC_WAITDATA is left set. */
2242                 if (sock_flag(sk, SOCK_DEAD))
2243                         break;
2244
2245                 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2246         }
2247
2248         finish_wait(sk_sleep(sk), &wait);
2249         unix_state_unlock(sk);
2250         return timeo;
2251 }
2252
2253 static unsigned int unix_skb_len(const struct sk_buff *skb)
2254 {
2255         return skb->len - UNIXCB(skb).consumed;
2256 }
2257
/*
 * Arguments shared by the stream recvmsg and splice_read paths.  A pointer
 * to this structure is passed to unix_stream_read_generic() and from there
 * to the recv_actor callback.
 */
2258 struct unix_stream_read_state {
             /* Called as recv_actor(skb, skip, chunk, state).  The generic
              * reader adds a non-negative result to its byte count and
              * treats a negative one as failure.
              */
2259         int (*recv_actor)(struct sk_buff *, int, int,
2260                           struct unix_stream_read_state *);
             /* Socket being read from. */
2261         struct socket *socket;
             /* Destination message; the splice path leaves it unset. */
2262         struct msghdr *msg;
             /* Destination pipe, read by unix_stream_splice_actor(). */
2263         struct pipe_inode_info *pipe;
             /* Upper bound on the number of bytes to transfer. */
2264         size_t size;
             /* MSG_* flags such as MSG_PEEK or MSG_DONTWAIT. */
2265         int flags;
             /* Handed through to skb_splice_bits(). */
2266         unsigned int splice_flags;
2267 };
2268
/*
 * Receive loop shared by unix_stream_recvmsg() and
 * unix_stream_splice_read().
 *
 * With u->iolock held, skbs on the receive queue are handed one at a time
 * to state->recv_actor until state->size bytes have been transferred or a
 * stop condition is met: the queue is empty and the low-water target is
 * reached, a socket error, RCV_SHUTDOWN, an expired timeout, a pending
 * signal, an skb that unix_skb_scm_eq() rejects against the credentials
 * already collected, or an skb that carried passed files.  @freezable is
 * passed through to unix_stream_data_wait().
 *
 * Returns the byte count if it is non-zero, otherwise err (0 or a negative
 * errno).  -EINVAL is returned for a socket that is not established and
 * -EOPNOTSUPP for MSG_OOB.
 */
2269 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2270                                     bool freezable)
2271 {
2272         struct scm_cookie scm;
2273         struct socket *sock = state->socket;
2274         struct sock *sk = sock->sk;
2275         struct unix_sock *u = unix_sk(sk);
2276         int copied = 0;
2277         int flags = state->flags;
2278         int noblock = flags & MSG_DONTWAIT;
2279         bool check_creds = false;
2280         int target;
2281         int err = 0;
2282         long timeo;
2283         int skip;
2284         size_t size = state->size;
2285         unsigned int last_len;
2286
2287         if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2288                 err = -EINVAL;
2289                 goto out;
2290         }
2291
2292         if (unlikely(flags & MSG_OOB)) {
2293                 err = -EOPNOTSUPP;
2294                 goto out;
2295         }
2296
2297         target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2298         timeo = sock_rcvtimeo(sk, noblock);
2299
2300         memset(&scm, 0, sizeof(scm));
2301
2302         /* Lock the socket to prevent queue disordering
2303          * while sleeps in memcpy_tomsg
2304          */
2305         mutex_lock(&u->iolock);
2306
             /* Only a peek starts at the socket's peek offset. */
2307         if (flags & MSG_PEEK)
2308                 skip = sk_peek_offset(sk, flags);
2309         else
2310                 skip = 0;
2311
2312         do {
2313                 int chunk;
2314                 bool drop_skb;
2315                 struct sk_buff *skb, *last;
2316
                     /* redo: start again from the head of the queue.  It is
                      * also the target of the goto that follows a wait.
                      */
2317 redo:
2318                 unix_state_lock(sk);
2319                 if (sock_flag(sk, SOCK_DEAD)) {
2320                         err = -ECONNRESET;
2321                         goto unlock;
2322                 }
2323                 last = skb = skb_peek(&sk->sk_receive_queue);
2324                 last_len = last ? last->len : 0;
                     /* again: entered with the state lock held and skb set
                      * to the next candidate, which may be NULL.
                      */
2325 again:
2326                 if (skb == NULL) {
2327                         unix_sk(sk)->recursion_level = 0;
2328                         if (copied >= target)
2329                                 goto unlock;
2330
2331                         /*
2332                          *      POSIX 1003.1g mandates this order.
2333                          */
2334
2335                         err = sock_error(sk);
2336                         if (err)
2337                                 goto unlock;
2338                         if (sk->sk_shutdown & RCV_SHUTDOWN)
2339                                 goto unlock;
2340
2341                         unix_state_unlock(sk);
2342                         if (!timeo) {
2343                                 err = -EAGAIN;
2344                                 break;
2345                         }
2346
                             /* iolock is not held while sleeping. */
2347                         mutex_unlock(&u->iolock);
2348
2349                         timeo = unix_stream_data_wait(sk, timeo, last,
2350                                                       last_len, freezable);
2351
                             /* iolock is already released here, so this path
                              * bypasses the common exit and frees scm itself.
                              */
2352                         if (signal_pending(current)) {
2353                                 err = sock_intr_errno(timeo);
2354                                 scm_destroy(&scm);
2355                                 goto out;
2356                         }
2357
2358                         mutex_lock(&u->iolock);
2359                         goto redo;
                             /* Common exit for the paths above that still
                              * hold the state lock.
                              */
2360 unlock:
2361                         unix_state_unlock(sk);
2362                         break;
2363                 }
2364
                     /* Step over skbs that lie entirely within the offset
                      * still to be skipped.
                      */
2365                 while (skip >= unix_skb_len(skb)) {
2366                         skip -= unix_skb_len(skb);
2367                         last = skb;
2368                         last_len = skb->len;
2369                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2370                         if (!skb)
2371                                 goto again;
2372                 }
2373
2374                 unix_state_unlock(sk);
2375
2376                 if (check_creds) {
2377                         /* Never glue messages from different writers */
2378                         if (!unix_skb_scm_eq(skb, &scm))
2379                                 break;
2380                 } else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
2381                         /* Copy credentials */
2382                         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2383                         unix_set_secdata(&scm, skb);
2384                         check_creds = true;
2385                 }
2386
2387                 /* Copy address just once */
                     /* NOTE(review): sunaddr appears to be declared inside
                      * this block by DECLARE_SOCKADDR, so clearing it would
                      * not stop the copy on later iterations - confirm.
                      */
2388                 if (state->msg && state->msg->msg_name) {
2389                         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2390                                          state->msg->msg_name);
2391                         unix_copy_addr(state->msg, skb->sk);
2392                         sunaddr = NULL;
2393                 }
2394
                     /* An extra reference is held across the actor call;
                      * drop_skb records whether the skb had no unread bytes
                      * left when the actor returned.
                      */
2395                 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2396                 skb_get(skb);
2397                 chunk = state->recv_actor(skb, skip, chunk, state);
2398                 drop_skb = !unix_skb_len(skb);
2399                 /* skb is only safe to use if !drop_skb */
2400                 consume_skb(skb);
                     /* The actor's own error value is not passed on.  A
                      * failure before anything was copied is reported as
                      * -EFAULT; otherwise the short count is returned.
                      */
2401                 if (chunk < 0) {
2402                         if (copied == 0)
2403                                 copied = -EFAULT;
2404                         break;
2405                 }
2406                 copied += chunk;
2407                 size -= chunk;
2408
2409                 if (drop_skb) {
2410                         /* the skb was touched by a concurrent reader;
2411                          * we should not expect anything from this skb
2412                          * anymore and assume it invalid - we can be
2413                          * sure it was dropped from the socket queue
2414                          *
2415                          * let's report a short read
2416                          */
2417                         err = 0;
2418                         break;
2419                 }
2420
2421                 /* Mark read part of skb as used */
2422                 if (!(flags & MSG_PEEK)) {
2423                         UNIXCB(skb).consumed += chunk;
2424
2425                         sk_peek_offset_bwd(sk, chunk);
2426
2427                         if (UNIXCB(skb).fp)
2428                                 unix_detach_fds(&scm, skb);
2429
                             /* The skb still has unread bytes: leave it on
                              * the queue and stop here.
                              */
2430                         if (unix_skb_len(skb))
2431                                 break;
2432
2433                         skb_unlink(skb, &sk->sk_receive_queue);
2434                         consume_skb(skb);
2435
                             /* Stop once scm holds passed files. */
2436                         if (scm.fp)
2437                                 break;
2438                 } else {
2439                         /* It is questionable, see note in unix_dgram_recvmsg.
2440                          */
2441                         if (UNIXCB(skb).fp)
2442                                 scm.fp = scm_fp_dup(UNIXCB(skb).fp);
2443
2444                         sk_peek_offset_fwd(sk, chunk);
2445
2446                         if (UNIXCB(skb).fp)
2447                                 break;
2448
                             /* Peeking leaves the skb queued, so move on to
                              * the following one with no offset.
                              */
2449                         skip = 0;
2450                         last = skb;
2451                         last_len = skb->len;
2452                         unix_state_lock(sk);
2453                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2454                         if (skb)
2455                                 goto again;
2456                         unix_state_unlock(sk);
2457                         break;
2458                 }
2459         } while (size);
2460
2461         mutex_unlock(&u->iolock);
             /* Without a msghdr the collected scm data is simply released. */
2462         if (state->msg)
2463                 scm_recv(sock, state->msg, &scm, flags);
2464         else
2465                 scm_destroy(&scm);
2466 out:
2467         return copied ? : err;
2468 }
2469
2470 static int unix_stream_read_actor(struct sk_buff *skb,
2471                                   int skip, int chunk,
2472                                   struct unix_stream_read_state *state)
2473 {
2474         int ret;
2475
2476         ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2477                                     state->msg, chunk);
2478         return ret ?: chunk;
2479 }
2480
2481 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2482                                size_t size, int flags)
2483 {
2484         struct unix_stream_read_state state = {
2485                 .recv_actor = unix_stream_read_actor,
2486                 .socket = sock,
2487                 .msg = msg,
2488                 .size = size,
2489                 .flags = flags
2490         };
2491
2492         return unix_stream_read_generic(&state, true);
2493 }
2494
2495 static int unix_stream_splice_actor(struct sk_buff *skb,
2496                                     int skip, int chunk,
2497                                     struct unix_stream_read_state *state)
2498 {
2499         return skb_splice_bits(skb, state->socket->sk,
2500                                UNIXCB(skb).consumed + skip,
2501                                state->pipe, chunk, state->splice_flags);
2502 }
2503
2504 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2505                                        struct pipe_inode_info *pipe,
2506                                        size_t size, unsigned int flags)
2507 {
2508         struct unix_stream_read_state state = {
2509                 .recv_actor = unix_stream_splice_actor,
2510                 .socket = sock,
2511                 .pipe = pipe,
2512                 .size = size,
2513                 .splice_flags = flags,
2514         };
2515
2516         if (unlikely(*ppos))
2517                 return -ESPIPE;
2518
2519         if (sock->file->f_flags & O_NONBLOCK ||
2520             flags & SPLICE_F_NONBLOCK)
2521                 state.flags = MSG_DONTWAIT;
2522
2523         return unix_stream_read_generic(&state, false);
2524 }
2525
2526 static int unix_shutdown(struct socket *sock, int mode)
2527 {
2528         struct sock *sk = sock->sk;
2529         struct sock *other;
2530
2531         if (mode < SHUT_RD || mode > SHUT_RDWR)
2532                 return -EINVAL;
2533         /* This maps:
2534          * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2535          * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2536          * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2537          */
2538         ++mode;
2539
2540         unix_state_lock(sk);
2541         sk->sk_shutdown |= mode;
2542         other = unix_peer(sk);
2543         if (other)
2544                 sock_hold(other);
2545         unix_state_unlock(sk);
2546         sk->sk_state_change(sk);
2547
2548         if (other &&
2549                 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2550
2551                 int peer_mode = 0;
2552
2553                 if (mode&RCV_SHUTDOWN)
2554                         peer_mode |= SEND_SHUTDOWN;
2555                 if (mode&SEND_SHUTDOWN)
2556                         peer_mode |= RCV_SHUTDOWN;
2557                 unix_state_lock(other);
2558                 other->sk_shutdown |= peer_mode;
2559                 unix_state_unlock(other);
2560                 other->sk_state_change(other);
2561                 if (peer_mode == SHUTDOWN_MASK)
2562                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2563                 else if (peer_mode & RCV_SHUTDOWN)
2564                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2565         }
2566         if (other)
2567                 sock_put(other);
2568
2569         return 0;
2570 }
2571
2572 long unix_inq_len(struct sock *sk)
2573 {
2574         struct sk_buff *skb;
2575         long amount = 0;
2576
2577         if (sk->sk_state == TCP_LISTEN)
2578                 return -EINVAL;
2579
2580         spin_lock(&sk->sk_receive_queue.lock);
2581         if (sk->sk_type == SOCK_STREAM ||
2582             sk->sk_type == SOCK_SEQPACKET) {
2583                 skb_queue_walk(&sk->sk_receive_queue, skb)
2584                         amount += unix_skb_len(skb);
2585         } else {
2586                 skb = skb_peek(&sk->sk_receive_queue);
2587                 if (skb)
2588                         amount = skb->len;
2589         }
2590         spin_unlock(&sk->sk_receive_queue.lock);
2591
2592         return amount;
2593 }
2594 EXPORT_SYMBOL_GPL(unix_inq_len);
2595
/*
 * unix_outq_len - report the send-side amount outstanding for @sk.
 *
 * Returns the value of sk_wmem_alloc_get() for the socket, widened to long.
 */
long unix_outq_len(struct sock *sk)
{
	long queued = sk_wmem_alloc_get(sk);

	return queued;
}
EXPORT_SYMBOL_GPL(unix_outq_len);
2601
2602 static int unix_open_file(struct sock *sk)
2603 {
2604         struct path path;
2605         struct file *f;
2606         int fd;
2607
2608         if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2609                 return -EPERM;
2610
2611         unix_state_lock(sk);
2612         path = unix_sk(sk)->path;
2613         if (!path.dentry) {
2614                 unix_state_unlock(sk);
2615                 return -ENOENT;
2616         }
2617
2618         path_get(&path);
2619         unix_state_unlock(sk);
2620
2621         fd = get_unused_fd_flags(O_CLOEXEC);
2622         if (fd < 0)
2623                 goto out;
2624
2625         f = dentry_open(&path, O_PATH, current_cred());
2626         if (IS_ERR(f)) {
2627                 put_unused_fd(fd);
2628                 fd = PTR_ERR(f);
2629                 goto out;
2630         }
2631
2632         fd_install(fd, f);
2633 out:
2634         path_put(&path);
2635
2636         return fd;
2637 }
2638
2639 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2640 {
2641         struct sock *sk = sock->sk;
2642         long amount = 0;
2643         int err;
2644
2645         switch (cmd) {
2646         case SIOCOUTQ:
2647                 amount = unix_outq_len(sk);
2648                 err = put_user(amount, (int __user *)arg);
2649                 break;
2650         case SIOCINQ:
2651                 amount = unix_inq_len(sk);
2652                 if (amount < 0)
2653                         err = amount;
2654                 else
2655                         err = put_user(amount, (int __user *)arg);
2656                 break;
2657         case SIOCUNIXFILE:
2658                 err = unix_open_file(sk);
2659                 break;
2660         default:
2661                 err = -ENOIOCTLCMD;
2662                 break;
2663         }
2664         return err;
2665 }
2666
2667 static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table *wait)
2668 {
2669         struct sock *sk = sock->sk;
2670         unsigned int mask;
2671
2672         sock_poll_wait(file, sk_sleep(sk), wait);
2673         mask = 0;
2674
2675         /* exceptional events? */
2676         if (sk->sk_err)
2677                 mask |= POLLERR;
2678         if (sk->sk_shutdown == SHUTDOWN_MASK)
2679                 mask |= POLLHUP;
2680         if (sk->sk_shutdown & RCV_SHUTDOWN)
2681                 mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2682
2683         /* readable? */
2684         if (!skb_queue_empty(&sk->sk_receive_queue))
2685                 mask |= POLLIN | POLLRDNORM;
2686
2687         /* Connection-based need to check for termination and startup */
2688         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
2689             sk->sk_state == TCP_CLOSE)
2690                 mask |= POLLHUP;
2691
2692         /*
2693          * we set writable also when the other side has shut down the
2694          * connection. This prevents stuck sockets.
2695          */
2696         if (unix_writable(sk))
2697                 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2698
2699         return mask;
2700 }
2701
2702 static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
2703                                     poll_table *wait)
2704 {
2705         struct sock *sk = sock->sk, *other;
2706         unsigned int mask, writable;
2707
2708         sock_poll_wait(file, sk_sleep(sk), wait);
2709         mask = 0;
2710
2711         /* exceptional events? */
2712         if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
2713                 mask |= POLLERR |
2714                         (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
2715
2716         if (sk->sk_shutdown & RCV_SHUTDOWN)
2717                 mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2718         if (sk->sk_shutdown == SHUTDOWN_MASK)
2719                 mask |= POLLHUP;
2720
2721         /* readable? */
2722         if (!skb_queue_empty(&sk->sk_receive_queue))
2723                 mask |= POLLIN | POLLRDNORM;
2724
2725         /* Connection-based need to check for termination and startup */
2726         if (sk->sk_type == SOCK_SEQPACKET) {
2727                 if (sk->sk_state == TCP_CLOSE)
2728                         mask |= POLLHUP;
2729                 /* connection hasn't started yet? */
2730                 if (sk->sk_state == TCP_SYN_SENT)
2731                         return mask;
2732         }
2733
2734         /* No write status requested, avoid expensive OUT tests. */
2735         if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
2736                 return mask;
2737
2738         writable = unix_writable(sk);
2739         if (writable) {
2740                 unix_state_lock(sk);
2741
2742                 other = unix_peer(sk);
2743                 if (other && unix_peer(other) != sk &&
2744                     unix_recvq_full(other) &&
2745                     unix_dgram_peer_wake_me(sk, other))
2746                         writable = 0;
2747
2748                 unix_state_unlock(sk);
2749         }
2750
2751         if (writable)
2752                 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2753         else
2754                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2755
2756         return mask;
2757 }
2758
2759 #ifdef CONFIG_PROC_FS
2760
/*
 * A seq_file position packs two fields into one value: the hash bucket index
 * in the bits above BUCKET_SPACE and an offset within that bucket in the low
 * BUCKET_SPACE bits.
 */
#define BUCKET_SPACE		(BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

#define get_bucket(x)		((x) >> BUCKET_SPACE)
#define get_offset(x)		((x) & ((1L << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o)	(((b) << BUCKET_SPACE) | (o))
2766
2767 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
2768 {
2769         unsigned long offset = get_offset(*pos);
2770         unsigned long bucket = get_bucket(*pos);
2771         struct sock *sk;
2772         unsigned long count = 0;
2773
2774         for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
2775                 if (sock_net(sk) != seq_file_net(seq))
2776                         continue;
2777                 if (++count == offset)
2778                         break;
2779         }
2780
2781         return sk;
2782 }
2783
2784 static struct sock *unix_next_socket(struct seq_file *seq,
2785                                      struct sock *sk,
2786                                      loff_t *pos)
2787 {
2788         unsigned long bucket;
2789
2790         while (sk > (struct sock *)SEQ_START_TOKEN) {
2791                 sk = sk_next(sk);
2792                 if (!sk)
2793                         goto next_bucket;
2794                 if (sock_net(sk) == seq_file_net(seq))
2795                         return sk;
2796         }
2797
2798         do {
2799                 sk = unix_from_bucket(seq, pos);
2800                 if (sk)
2801                         return sk;
2802
2803 next_bucket:
2804                 bucket = get_bucket(*pos) + 1;
2805                 *pos = set_bucket_offset(bucket, 1);
2806         } while (bucket < ARRAY_SIZE(unix_socket_table));
2807
2808         return NULL;
2809 }
2810
2811 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
2812         __acquires(unix_table_lock)
2813 {
2814         spin_lock(&unix_table_lock);
2815
2816         if (!*pos)
2817                 return SEQ_START_TOKEN;
2818
2819         if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
2820                 return NULL;
2821
2822         return unix_next_socket(seq, NULL, pos);
2823 }
2824
2825 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2826 {
2827         ++*pos;
2828         return unix_next_socket(seq, v, pos);
2829 }
2830
2831 static void unix_seq_stop(struct seq_file *seq, void *v)
2832         __releases(unix_table_lock)
2833 {
2834         spin_unlock(&unix_table_lock);
2835 }
2836
2837 static int unix_seq_show(struct seq_file *seq, void *v)
2838 {
2839
2840         if (v == SEQ_START_TOKEN)
2841                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
2842                          "Inode Path\n");
2843         else {
2844                 struct sock *s = v;
2845                 struct unix_sock *u = unix_sk(s);
2846                 unix_state_lock(s);
2847
2848                 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
2849                         s,
2850                         refcount_read(&s->sk_refcnt),
2851                         0,
2852                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
2853                         s->sk_type,
2854                         s->sk_socket ?
2855                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
2856                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
2857                         sock_i_ino(s));
2858
2859                 if (u->addr) {
2860                         int i, len;
2861                         seq_putc(seq, ' ');
2862
2863                         i = 0;
2864                         len = u->addr->len - sizeof(short);
2865                         if (!UNIX_ABSTRACT(s))
2866                                 len--;
2867                         else {
2868                                 seq_putc(seq, '@');
2869                                 i++;
2870                         }
2871                         for ( ; i < len; i++)
2872                                 seq_putc(seq, u->addr->name->sun_path[i] ?:
2873                                          '@');
2874                 }
2875                 unix_state_unlock(s);
2876                 seq_putc(seq, '\n');
2877         }
2878
2879         return 0;
2880 }
2881
2882 static const struct seq_operations unix_seq_ops = {
2883         .start  = unix_seq_start,
2884         .next   = unix_seq_next,
2885         .stop   = unix_seq_stop,
2886         .show   = unix_seq_show,
2887 };
2888
2889 static int unix_seq_open(struct inode *inode, struct file *file)
2890 {
2891         return seq_open_net(inode, file, &unix_seq_ops,
2892                             sizeof(struct seq_net_private));
2893 }
2894
2895 static const struct file_operations unix_seq_fops = {
2896         .owner          = THIS_MODULE,
2897         .open           = unix_seq_open,
2898         .read           = seq_read,
2899         .llseek         = seq_lseek,
2900         .release        = seq_release_net,
2901 };
2902
2903 #endif
2904
2905 static const struct net_proto_family unix_family_ops = {
2906         .family = PF_UNIX,
2907         .create = unix_create,
2908         .owner  = THIS_MODULE,
2909 };
2910
2911
2912 static int __net_init unix_net_init(struct net *net)
2913 {
2914         int error = -ENOMEM;
2915
2916         net->unx.sysctl_max_dgram_qlen = 10;
2917         if (unix_sysctl_register(net))
2918                 goto out;
2919
2920 #ifdef CONFIG_PROC_FS
2921         if (!proc_create("unix", 0, net->proc_net, &unix_seq_fops)) {
2922                 unix_sysctl_unregister(net);
2923                 goto out;
2924         }
2925 #endif
2926         error = 0;
2927 out:
2928         return error;
2929 }
2930
2931 static void __net_exit unix_net_exit(struct net *net)
2932 {
2933         unix_sysctl_unregister(net);
2934         remove_proc_entry("unix", net->proc_net);
2935 }
2936
2937 static struct pernet_operations unix_net_ops = {
2938         .init = unix_net_init,
2939         .exit = unix_net_exit,
2940 };
2941
2942 static int __init af_unix_init(void)
2943 {
2944         int rc = -1;
2945
2946         BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));
2947
2948         rc = proto_register(&unix_proto, 1);
2949         if (rc != 0) {
2950                 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
2951                 goto out;
2952         }
2953
2954         sock_register(&unix_family_ops);
2955         register_pernet_subsys(&unix_net_ops);
2956 out:
2957         return rc;
2958 }
2959
2960 static void __exit af_unix_exit(void)
2961 {
2962         sock_unregister(PF_UNIX);
2963         proto_unregister(&unix_proto);
2964         unregister_pernet_subsys(&unix_net_ops);
2965 }
2966
2967 /* Earlier than device_initcall() so that other drivers invoking
2968    request_module() don't end up in a loop when modprobe tries
2969    to use a UNIX socket. But later than subsys_initcall() because
2970    we depend on stuff initialised there */
2971 fs_initcall(af_unix_init);
2972 module_exit(af_unix_exit);
2973
2974 MODULE_LICENSE("GPL");
2975 MODULE_ALIAS_NETPROTO(PF_UNIX);