net/core/sock.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Generic socket support routines. Memory allocators, socket lock/release
   7  *              handler for protocols to use and generic option handler.
   8  *
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Florian La Roche, <flla@stud.uni-sb.de>
  13  *              Alan Cox, <A.Cox@swansea.ac.uk>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  *
  85  *
  86  *              This program is free software; you can redistribute it and/or
  87  *              modify it under the terms of the GNU General Public License
  88  *              as published by the Free Software Foundation; either version
  89  *              2 of the License, or (at your option) any later version.
  90  */
  91
  92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  93
  94 #include <asm/unaligned.h>
  95 #include <linux/capability.h>
  96 #include <linux/errno.h>
  97 #include <linux/errqueue.h>
  98 #include <linux/types.h>
  99 #include <linux/socket.h>
 100 #include <linux/in.h>
 101 #include <linux/kernel.h>
 102 #include <linux/module.h>
 103 #include <linux/proc_fs.h>
 104 #include <linux/seq_file.h>
 105 #include <linux/sched.h>
 106 #include <linux/sched/mm.h>
 107 #include <linux/timer.h>
 108 #include <linux/string.h>
 109 #include <linux/sockios.h>
 110 #include <linux/net.h>
 111 #include <linux/mm.h>
 112 #include <linux/slab.h>
 113 #include <linux/interrupt.h>
 114 #include <linux/poll.h>
 115 #include <linux/tcp.h>
 116 #include <linux/init.h>
 117 #include <linux/highmem.h>
 118 #include <linux/user_namespace.h>
 119 #include <linux/static_key.h>
 120 #include <linux/memcontrol.h>
 121 #include <linux/prefetch.h>
 122
 123 #include <linux/uaccess.h>
 124
 125 #include <linux/netdevice.h>
 126 #include <net/protocol.h>
 127 #include <linux/skbuff.h>
 128 #include <net/net_namespace.h>
 129 #include <net/request_sock.h>
 130 #include <net/sock.h>
 131 #include <linux/net_tstamp.h>
 132 #include <net/xfrm.h>
 133 #include <linux/ipsec.h>
 134 #include <net/cls_cgroup.h>
 135 #include <net/netprio_cgroup.h>
 136 #include <linux/sock_diag.h>
 137
 138 #include <linux/filter.h>
 139 #include <net/sock_reuseport.h>
 140
 141 #include <trace/events/sock.h>
 142
 143 #include <net/tcp.h>
 144 #include <net/busy_poll.h>
 145
/* File-local mutex and list head.  Judging by the names, the mutex
 * serializes access to proto_list -- TODO confirm against the users,
 * which are not in this part of the file.
 */
static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

/* Forward declaration only; the definition is not in this part of the file. */
static void sock_inuse_add(struct net *net, int val);
 150
 151 /**
 152  * sk_ns_capable - General socket capability test
 153  * @sk: Socket to use a capability on or through
 154  * @user_ns: The user namespace of the capability to use
 155  * @cap: The capability to use
 156  *
 157  * Test to see if the opener of the socket had when the socket was
 158  * created and the current process has the capability @cap in the user
 159  * namespace @user_ns.
 160  */
 161 bool sk_ns_capable(const struct sock *sk,
 162                    struct user_namespace *user_ns, int cap)
 163 {
 164         return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 165                 ns_capable(user_ns, cap);
 166 }
 167 EXPORT_SYMBOL(sk_ns_capable);
 168
 169 /**
 170  * sk_capable - Socket global capability test
 171  * @sk: Socket to use a capability on or through
 172  * @cap: The global capability to use
 173  *
 174  * Test to see if the opener of the socket had when the socket was
 175  * created and the current process has the capability @cap in all user
 176  * namespaces.
 177  */
 178 bool sk_capable(const struct sock *sk, int cap)
 179 {
 180         return sk_ns_capable(sk, &init_user_ns, cap);
 181 }
 182 EXPORT_SYMBOL(sk_capable);
 183
 184 /**
 185  * sk_net_capable - Network namespace socket capability test
 186  * @sk: Socket to use a capability on or through
 187  * @cap: The capability to use
 188  *
 189  * Test to see if the opener of the socket had when the socket was created
 190  * and the current process has the capability @cap over the network namespace
 191  * the socket is a member of.
 192  */
 193 bool sk_net_capable(const struct sock *sk, int cap)
 194 {
 195         return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 196 }
 197 EXPORT_SYMBOL(sk_net_capable);
 198
/*
 * Each address family might have different locking rules, so we keep
 * lock class keys per address family (one array slot per AF_* value,
 * AF_MAX slots in total), with separate arrays for kernel-internal
 * ("kern") and userspace sockets.
 *
 * The *_slock_keys arrays are for the slock; the plain *_keys arrays
 * presumably pair with the "sk_lock-" names in the string tables
 * below -- confirm against the code that registers the classes, which
 * is not in this part of the file.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 208
/*
 * Make lock validator output more readable.  The names are assembled
 * at build time by string-literal concatenation, so that runtime
 * initialization of socket locks is fast.
 *
 * _sock_locks(x) expands to a comma-separated list of string literals,
 * one per table slot, each formed by pasting the prefix literal @x in
 * front of a family name.  The list ends with an "AF_MAX" entry, which
 * is why every table below is sized AF_MAX+1.
 *
 * Slots 27 and 28 carry only their number, and the "IUCV" entry lacks
 * the AF_ prefix its neighbours use.  These strings are emitted at
 * runtime, so do not "tidy" them without checking who parses them.
 */

#define _sock_locks(x)                                            \
  x "AF_UNSPEC",        x "AF_UNIX"     ,       x "AF_INET"     , \
  x "AF_AX25"  ,        x "AF_IPX"      ,       x "AF_APPLETALK", \
  x "AF_NETROM",        x "AF_BRIDGE"   ,       x "AF_ATMPVC"   , \
  x "AF_X25"   ,        x "AF_INET6"    ,       x "AF_ROSE"     , \
  x "AF_DECnet",        x "AF_NETBEUI"  ,       x "AF_SECURITY" , \
  x "AF_KEY"   ,        x "AF_NETLINK"  ,       x "AF_PACKET"   , \
  x "AF_ASH"   ,        x "AF_ECONET"   ,       x "AF_ATMSVC"   , \
  x "AF_RDS"   ,        x "AF_SNA"      ,       x "AF_IRDA"     , \
  x "AF_PPPOX" ,        x "AF_WANPIPE"  ,       x "AF_LLC"      , \
  x "27"       ,        x "28"          ,       x "AF_CAN"      , \
  x "AF_TIPC"  ,        x "AF_BLUETOOTH",       x "IUCV"        , \
  x "AF_RXRPC" ,        x "AF_ISDN"     ,       x "AF_PHONET"   , \
  x "AF_IEEE802154",    x "AF_CAIF"     ,       x "AF_ALG"      , \
  x "AF_NFC"   ,        x "AF_VSOCK"    ,       x "AF_KCM"      , \
  x "AF_QIPCRTR",       x "AF_SMC"      ,       x "AF_XDP"      , \
  x "AF_MAX"

/* Name tables for userspace sockets: "sk_lock-", "slock-", "clock-". */
static const char *const af_family_key_strings[AF_MAX+1] = {
        _sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
        _sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
        _sock_locks("clock-")
};

/* The same three tables for kernel sockets, distinguished by a "k-" prefix. */
static const char *const af_family_kern_key_strings[AF_MAX+1] = {
        _sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
        _sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
        _sock_locks("k-clock-")
};
/* "rlock-", "wlock-" and "elock-" names; these have no "k-" variants. */
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
        _sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
        _sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
        _sock_locks("elock-")
};
 261
/*
 * The locking rules for sk_callback_lock and for the socket queues are
 * per-address-family, so split the lock classes by using a per-AF key.
 *
 * The rlock/wlock/elock arrays are named like the "rlock-", "wlock-"
 * and "elock-" string tables above; which queue each one covers is
 * decided where the classes are registered, outside this part of the
 * file -- TODO confirm there.  Only the callback keys have a separate
 * "kern" array.
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];
 271
/* Run time adjustable parameters. */

/* Upper bounds that sock_setsockopt() applies to SO_SNDBUF and
 * SO_RCVBUF requests before storing them.
 */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
/* Default sizes; note they start out at the same SK_*MEM_MAX constants
 * as the maxima above.
 */
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

/* Enabled (1) by default.  NOTE(review): the consumers of this knob are
 * outside this part of the file; check them before documenting more.
 */
int sysctl_tstamp_allow_data __read_mostly = 1;

/* Static key, initially false.  sk_set_memalloc() increments it and
 * sk_clear_memalloc() decrements it.
 */
DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);
 288
 289 /**
 290  * sk_set_memalloc - sets %SOCK_MEMALLOC
 291  * @sk: socket to set it on
 292  *
 293  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 294  * It's the responsibility of the admin to adjust min_free_kbytes
 295  * to meet the requirements
 296  */
 297 void sk_set_memalloc(struct sock *sk)
 298 {
 299         sock_set_flag(sk, SOCK_MEMALLOC);
 300         sk->sk_allocation |= __GFP_MEMALLOC;
 301         static_branch_inc(&memalloc_socks_key);
 302 }
 303 EXPORT_SYMBOL_GPL(sk_set_memalloc);
 304
 305 void sk_clear_memalloc(struct sock *sk)
 306 {
 307         sock_reset_flag(sk, SOCK_MEMALLOC);
 308         sk->sk_allocation &= ~__GFP_MEMALLOC;
 309         static_branch_dec(&memalloc_socks_key);
 310
 311         /*
 312          * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 313          * progress of swapping. SOCK_MEMALLOC may be cleared while
 314          * it has rmem allocations due to the last swapfile being deactivated
 315          * but there is a risk that the socket is unusable due to exceeding
 316          * the rmem limits. Reclaim the reserves and obey rmem limits again.
 317          */
 318         sk_mem_reclaim(sk);
 319 }
 320 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 321
 322 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 323 {
 324         int ret;
 325         unsigned int noreclaim_flag;
 326
 327         /* these should have been dropped before queueing */
 328         BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 329
 330         noreclaim_flag = memalloc_noreclaim_save();
 331         ret = sk->sk_backlog_rcv(sk, skb);
 332         memalloc_noreclaim_restore(noreclaim_flag);
 333
 334         return ret;
 335 }
 336 EXPORT_SYMBOL(__sk_backlog_rcv);
 337
 338 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 339 {
 340         struct timeval tv;
 341
 342         if (optlen < sizeof(tv))
 343                 return -EINVAL;
 344         if (copy_from_user(&tv, optval, sizeof(tv)))
 345                 return -EFAULT;
 346         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 347                 return -EDOM;
 348
 349         if (tv.tv_sec < 0) {
 350                 static int warned __read_mostly;
 351
 352                 *timeo_p = 0;
 353                 if (warned < 10 && net_ratelimit()) {
 354                         warned++;
 355                         pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 356                                 __func__, current->comm, task_pid_nr(current));
 357                 }
 358                 return 0;
 359         }
 360         *timeo_p = MAX_SCHEDULE_TIMEOUT;
 361         if (tv.tv_sec == 0 && tv.tv_usec == 0)
 362                 return 0;
 363         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 364                 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
 365         return 0;
 366 }
 367
 368 static void sock_warn_obsolete_bsdism(const char *name)
 369 {
 370         static int warned;
 371         static char warncomm[TASK_COMM_LEN];
 372         if (strcmp(warncomm, current->comm) && warned < 5) {
 373                 strcpy(warncomm,  current->comm);
 374                 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 375                         warncomm, name);
 376                 warned++;
 377         }
 378 }
 379
 380 static bool sock_needs_netstamp(const struct sock *sk)
 381 {
 382         switch (sk->sk_family) {
 383         case AF_UNSPEC:
 384         case AF_UNIX:
 385                 return false;
 386         default:
 387                 return true;
 388         }
 389 }
 390
 391 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 392 {
 393         if (sk->sk_flags & flags) {
 394                 sk->sk_flags &= ~flags;
 395                 if (sock_needs_netstamp(sk) &&
 396                     !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 397                         net_disable_timestamp();
 398         }
 399 }
 400
 401
 402 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 403 {
 404         unsigned long flags;
 405         struct sk_buff_head *list = &sk->sk_receive_queue;
 406
 407         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 408                 atomic_inc(&sk->sk_drops);
 409                 trace_sock_rcvqueue_full(sk, skb);
 410                 return -ENOMEM;
 411         }
 412
 413         if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 414                 atomic_inc(&sk->sk_drops);
 415                 return -ENOBUFS;
 416         }
 417
 418         skb->dev = NULL;
 419         skb_set_owner_r(skb, sk);
 420
 421         /* we escape from rcu protected region, make sure we dont leak
 422          * a norefcounted dst
 423          */
 424         skb_dst_force(skb);
 425
 426         spin_lock_irqsave(&list->lock, flags);
 427         sock_skb_set_dropcount(sk, skb);
 428         __skb_queue_tail(list, skb);
 429         spin_unlock_irqrestore(&list->lock, flags);
 430
 431         if (!sock_flag(sk, SOCK_DEAD))
 432                 sk->sk_data_ready(sk);
 433         return 0;
 434 }
 435 EXPORT_SYMBOL(__sock_queue_rcv_skb);
 436
 437 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 438 {
 439         int err;
 440
 441         err = sk_filter(sk, skb);
 442         if (err)
 443                 return err;
 444
 445         return __sock_queue_rcv_skb(sk, skb);
 446 }
 447 EXPORT_SYMBOL(sock_queue_rcv_skb);
 448
 449 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 450                      const int nested, unsigned int trim_cap, bool refcounted)
 451 {
 452         int rc = NET_RX_SUCCESS;
 453
 454         if (sk_filter_trim_cap(sk, skb, trim_cap))
 455                 goto discard_and_relse;
 456
 457         skb->dev = NULL;
 458
 459         if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 460                 atomic_inc(&sk->sk_drops);
 461                 goto discard_and_relse;
 462         }
 463         if (nested)
 464                 bh_lock_sock_nested(sk);
 465         else
 466                 bh_lock_sock(sk);
 467         if (!sock_owned_by_user(sk)) {
 468                 /*
 469                  * trylock + unlock semantics:
 470                  */
 471                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 472
 473                 rc = sk_backlog_rcv(sk, skb);
 474
 475                 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 476         } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
 477                 bh_unlock_sock(sk);
 478                 atomic_inc(&sk->sk_drops);
 479                 goto discard_and_relse;
 480         }
 481
 482         bh_unlock_sock(sk);
 483 out:
 484         if (refcounted)
 485                 sock_put(sk);
 486         return rc;
 487 discard_and_relse:
 488         kfree_skb(skb);
 489         goto out;
 490 }
 491 EXPORT_SYMBOL(__sk_receive_skb);
 492
 493 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 494 {
 495         struct dst_entry *dst = __sk_dst_get(sk);
 496
 497         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 498                 sk_tx_queue_clear(sk);
 499                 sk->sk_dst_pending_confirm = 0;
 500                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 501                 dst_release(dst);
 502                 return NULL;
 503         }
 504
 505         return dst;
 506 }
 507 EXPORT_SYMBOL(__sk_dst_check);
 508
 509 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 510 {
 511         struct dst_entry *dst = sk_dst_get(sk);
 512
 513         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 514                 sk_dst_reset(sk);
 515                 dst_release(dst);
 516                 return NULL;
 517         }
 518
 519         return dst;
 520 }
 521 EXPORT_SYMBOL(sk_dst_check);
 522
 523 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 524                                 int optlen)
 525 {
 526         int ret = -ENOPROTOOPT;
 527 #ifdef CONFIG_NETDEVICES
 528         struct net *net = sock_net(sk);
 529         char devname[IFNAMSIZ];
 530         int index;
 531
 532         /* Sorry... */
 533         ret = -EPERM;
 534         if (!ns_capable(net->user_ns, CAP_NET_RAW))
 535                 goto out;
 536
 537         ret = -EINVAL;
 538         if (optlen < 0)
 539                 goto out;
 540
 541         /* Bind this socket to a particular device like "eth0",
 542          * as specified in the passed interface name. If the
 543          * name is "" or the option length is zero the socket
 544          * is not bound.
 545          */
 546         if (optlen > IFNAMSIZ - 1)
 547                 optlen = IFNAMSIZ - 1;
 548         memset(devname, 0, sizeof(devname));
 549
 550         ret = -EFAULT;
 551         if (copy_from_user(devname, optval, optlen))
 552                 goto out;
 553
 554         index = 0;
 555         if (devname[0] != '\0') {
 556                 struct net_device *dev;
 557
 558                 rcu_read_lock();
 559                 dev = dev_get_by_name_rcu(net, devname);
 560                 if (dev)
 561                         index = dev->ifindex;
 562                 rcu_read_unlock();
 563                 ret = -ENODEV;
 564                 if (!dev)
 565                         goto out;
 566         }
 567
 568         lock_sock(sk);
 569         sk->sk_bound_dev_if = index;
 570         sk_dst_reset(sk);
 571         release_sock(sk);
 572
 573         ret = 0;
 574
 575 out:
 576 #endif
 577
 578         return ret;
 579 }
 580
 581 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 582                                 int __user *optlen, int len)
 583 {
 584         int ret = -ENOPROTOOPT;
 585 #ifdef CONFIG_NETDEVICES
 586         struct net *net = sock_net(sk);
 587         char devname[IFNAMSIZ];
 588
 589         if (sk->sk_bound_dev_if == 0) {
 590                 len = 0;
 591                 goto zero;
 592         }
 593
 594         ret = -EINVAL;
 595         if (len < IFNAMSIZ)
 596                 goto out;
 597
 598         ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 599         if (ret)
 600                 goto out;
 601
 602         len = strlen(devname) + 1;
 603
 604         ret = -EFAULT;
 605         if (copy_to_user(optval, devname, len))
 606                 goto out;
 607
 608 zero:
 609         ret = -EFAULT;
 610         if (put_user(len, optlen))
 611                 goto out;
 612
 613         ret = 0;
 614
 615 out:
 616 #endif
 617
 618         return ret;
 619 }
 620
 621 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 622 {
 623         if (valbool)
 624                 sock_set_flag(sk, bit);
 625         else
 626                 sock_reset_flag(sk, bit);
 627 }
 628
 629 bool sk_mc_loop(struct sock *sk)
 630 {
 631         if (dev_recursion_level())
 632                 return false;
 633         if (!sk)
 634                 return true;
 635         switch (sk->sk_family) {
 636         case AF_INET:
 637                 return inet_sk(sk)->mc_loop;
 638 #if IS_ENABLED(CONFIG_IPV6)
 639         case AF_INET6:
 640                 return inet6_sk(sk)->mc_loop;
 641 #endif
 642         }
 643         WARN_ON(1);
 644         return true;
 645 }
 646 EXPORT_SYMBOL(sk_mc_loop);
 647
 648 /*
 649  *      This is meant for all protocols to use and covers goings on
 650  *      at the socket level. Everything here is generic.
 651  */
 652
 653 int sock_setsockopt(struct socket *sock, int level, int optname,
 654                     char __user *optval, unsigned int optlen)
 655 {
 656         struct sock_txtime sk_txtime;
 657         struct sock *sk = sock->sk;
 658         int val;
 659         int valbool;
 660         struct linger ling;
 661         int ret = 0;
 662
 663         /*
 664          *      Options without arguments
 665          */
 666
 667         if (optname == SO_BINDTODEVICE)
 668                 return sock_setbindtodevice(sk, optval, optlen);
 669
 670         if (optlen < sizeof(int))
 671                 return -EINVAL;
 672
 673         if (get_user(val, (int __user *)optval))
 674                 return -EFAULT;
 675
 676         valbool = val ? 1 : 0;
 677
 678         lock_sock(sk);
 679
 680         switch (optname) {
 681         case SO_DEBUG:
 682                 if (val && !capable(CAP_NET_ADMIN))
 683                         ret = -EACCES;
 684                 else
 685                         sock_valbool_flag(sk, SOCK_DBG, valbool);
 686                 break;
 687         case SO_REUSEADDR:
 688                 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 689                 break;
 690         case SO_REUSEPORT:
 691                 sk->sk_reuseport = valbool;
 692                 break;
 693         case SO_TYPE:
 694         case SO_PROTOCOL:
 695         case SO_DOMAIN:
 696         case SO_ERROR:
 697                 ret = -ENOPROTOOPT;
 698                 break;
 699         case SO_DONTROUTE:
 700                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 701                 break;
 702         case SO_BROADCAST:
 703                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 704                 break;
 705         case SO_SNDBUF:
 706                 /* Don't error on this BSD doesn't and if you think
 707                  * about it this is right. Otherwise apps have to
 708                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 709                  * are treated in BSD as hints
 710                  */
 711                 val = min_t(u32, val, sysctl_wmem_max);
 712 set_sndbuf:
 713                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 714                 sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
 715                 /* Wake up sending tasks if we upped the value. */
 716                 sk->sk_write_space(sk);
 717                 break;
 718
 719         case SO_SNDBUFFORCE:
 720                 if (!capable(CAP_NET_ADMIN)) {
 721                         ret = -EPERM;
 722                         break;
 723                 }
 724                 goto set_sndbuf;
 725
 726         case SO_RCVBUF:
 727                 /* Don't error on this BSD doesn't and if you think
 728                  * about it this is right. Otherwise apps have to
 729                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 730                  * are treated in BSD as hints
 731                  */
 732                 val = min_t(u32, val, sysctl_rmem_max);
 733 set_rcvbuf:
 734                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 735                 /*
 736                  * We double it on the way in to account for
 737                  * "struct sk_buff" etc. overhead.   Applications
 738                  * assume that the SO_RCVBUF setting they make will
 739                  * allow that much actual data to be received on that
 740                  * socket.
 741                  *
 742                  * Applications are unaware that "struct sk_buff" and
 743                  * other overheads allocate from the receive buffer
 744                  * during socket buffer allocation.
 745                  *
 746                  * And after considering the possible alternatives,
 747                  * returning the value we actually used in getsockopt
 748                  * is the most desirable behavior.
 749                  */
 750                 sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
 751                 break;
 752
 753         case SO_RCVBUFFORCE:
 754                 if (!capable(CAP_NET_ADMIN)) {
 755                         ret = -EPERM;
 756                         break;
 757                 }
 758                 goto set_rcvbuf;
 759
 760         case SO_KEEPALIVE:
 761                 if (sk->sk_prot->keepalive)
 762                         sk->sk_prot->keepalive(sk, valbool);
 763                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 764                 break;
 765
 766         case SO_OOBINLINE:
 767                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 768                 break;
 769
 770         case SO_NO_CHECK:
 771                 sk->sk_no_check_tx = valbool;
 772                 break;
 773
 774         case SO_PRIORITY:
 775                 if ((val >= 0 && val <= 6) ||
 776                     ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 777                         sk->sk_priority = val;
 778                 else
 779                         ret = -EPERM;
 780                 break;
 781
 782         case SO_LINGER:
 783                 if (optlen < sizeof(ling)) {
 784                         ret = -EINVAL;  /* 1003.1g */
 785                         break;
 786                 }
 787                 if (copy_from_user(&ling, optval, sizeof(ling))) {
 788                         ret = -EFAULT;
 789                         break;
 790                 }
 791                 if (!ling.l_onoff)
 792                         sock_reset_flag(sk, SOCK_LINGER);
 793                 else {
 794 #if (BITS_PER_LONG == 32)
 795                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 796                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 797                         else
 798 #endif
 799                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 800                         sock_set_flag(sk, SOCK_LINGER);
 801                 }
 802                 break;
 803
 804         case SO_BSDCOMPAT:
 805                 sock_warn_obsolete_bsdism("setsockopt");
 806                 break;
 807
 808         case SO_PASSCRED:
 809                 if (valbool)
 810                         set_bit(SOCK_PASSCRED, &sock->flags);
 811                 else
 812                         clear_bit(SOCK_PASSCRED, &sock->flags);
 813                 break;
 814
 815         case SO_TIMESTAMP:
 816         case SO_TIMESTAMPNS:
 817                 if (valbool)  {
 818                         if (optname == SO_TIMESTAMP)
 819                                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 820                         else
 821                                 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 822                         sock_set_flag(sk, SOCK_RCVTSTAMP);
 823                         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 824                 } else {
 825                         sock_reset_flag(sk, SOCK_RCVTSTAMP);
 826                         sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 827                 }
 828                 break;
 829
 830         case SO_TIMESTAMPING:
 831                 if (val & ~SOF_TIMESTAMPING_MASK) {
 832                         ret = -EINVAL;
 833                         break;
 834                 }
 835
 836                 if (val & SOF_TIMESTAMPING_OPT_ID &&
 837                     !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 838                         if (sk->sk_protocol == IPPROTO_TCP &&
 839                             sk->sk_type == SOCK_STREAM) {
 840                                 if ((1 << sk->sk_state) &
 841                                     (TCPF_CLOSE | TCPF_LISTEN)) {
 842                                         ret = -EINVAL;
 843                                         break;
 844                                 }
 845                                 sk->sk_tskey = tcp_sk(sk)->snd_una;
 846                         } else {
 847                                 sk->sk_tskey = 0;
 848                         }
 849                 }
 850
 851                 if (val & SOF_TIMESTAMPING_OPT_STATS &&
 852                     !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
 853                         ret = -EINVAL;
 854                         break;
 855                 }
 856
 857                 sk->sk_tsflags = val;
 858                 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 859                         sock_enable_timestamp(sk,
 860                                               SOCK_TIMESTAMPING_RX_SOFTWARE);
 861                 else
 862                         sock_disable_timestamp(sk,
 863                                                (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 864                 break;
 865
 866         case SO_RCVLOWAT:
 867                 if (val < 0)
 868                         val = INT_MAX;
 869                 if (sock->ops->set_rcvlowat)
 870                         ret = sock->ops->set_rcvlowat(sk, val);
 871                 else
 872                         sk->sk_rcvlowat = val ? : 1;
 873                 break;
 874
 875         case SO_RCVTIMEO:
 876                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 877                 break;
 878
 879         case SO_SNDTIMEO:
 880                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 881                 break;
 882
 883         case SO_ATTACH_FILTER:
 884                 ret = -EINVAL;
 885                 if (optlen == sizeof(struct sock_fprog)) {
 886                         struct sock_fprog fprog;
 887
 888                         ret = -EFAULT;
 889                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 890                                 break;
 891
 892                         ret = sk_attach_filter(&fprog, sk);
 893                 }
 894                 break;
 895
 896         case SO_ATTACH_BPF:
 897                 ret = -EINVAL;
 898                 if (optlen == sizeof(u32)) {
 899                         u32 ufd;
 900
 901                         ret = -EFAULT;
 902                         if (copy_from_user(&ufd, optval, sizeof(ufd)))
 903                                 break;
 904
 905                         ret = sk_attach_bpf(ufd, sk);
 906                 }
 907                 break;
 908
 909         case SO_ATTACH_REUSEPORT_CBPF:
 910                 ret = -EINVAL;
 911                 if (optlen == sizeof(struct sock_fprog)) {
 912                         struct sock_fprog fprog;
 913
 914                         ret = -EFAULT;
 915                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 916                                 break;
 917
 918                         ret = sk_reuseport_attach_filter(&fprog, sk);
 919                 }
 920                 break;
 921
 922         case SO_ATTACH_REUSEPORT_EBPF:
 923                 ret = -EINVAL;
 924                 if (optlen == sizeof(u32)) {
 925                         u32 ufd;
 926
 927                         ret = -EFAULT;
 928                         if (copy_from_user(&ufd, optval, sizeof(ufd)))
 929                                 break;
 930
 931                         ret = sk_reuseport_attach_bpf(ufd, sk);
 932                 }
 933                 break;
 934
 935         case SO_DETACH_FILTER:
 936                 ret = sk_detach_filter(sk);
 937                 break;
 938
 939         case SO_LOCK_FILTER:
 940                 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
 941                         ret = -EPERM;
 942                 else
 943                         sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
 944                 break;
 945
 946         case SO_PASSSEC:
 947                 if (valbool)
 948                         set_bit(SOCK_PASSSEC, &sock->flags);
 949                 else
 950                         clear_bit(SOCK_PASSSEC, &sock->flags);
 951                 break;
 952         case SO_MARK:
 953                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 954                         ret = -EPERM;
 955                 else
 956                         sk->sk_mark = val;
 957                 break;
 958
 959         case SO_RXQ_OVFL:
 960                 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
 961                 break;
 962
 963         case SO_WIFI_STATUS:
 964                 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
 965                 break;
 966
 967         case SO_PEEK_OFF:
 968                 if (sock->ops->set_peek_off)
 969                         ret = sock->ops->set_peek_off(sk, val);
 970                 else
 971                         ret = -EOPNOTSUPP;
 972                 break;
 973
 974         case SO_NOFCS:
 975                 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
 976                 break;
 977
 978         case SO_SELECT_ERR_QUEUE:
 979                 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
 980                 break;
 981
 982 #ifdef CONFIG_NET_RX_BUSY_POLL
 983         case SO_BUSY_POLL:
 984                 /* allow unprivileged users to decrease the value */
 985                 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
 986                         ret = -EPERM;
 987                 else {
 988                         if (val < 0)
 989                                 ret = -EINVAL;
 990                         else
 991                                 sk->sk_ll_usec = val;
 992                 }
 993                 break;
 994 #endif
 995
 996         case SO_MAX_PACING_RATE:
 997                 if (val != ~0U)
 998                         cmpxchg(&sk->sk_pacing_status,
 999                                 SK_PACING_NONE,
1000                                 SK_PACING_NEEDED);
1001                 sk->sk_max_pacing_rate = val;
1002                 sk->sk_pacing_rate = min(sk->sk_pacing_rate,
1003                                          sk->sk_max_pacing_rate);
1004                 break;
1005
1006         case SO_INCOMING_CPU:
1007                 sk->sk_incoming_cpu = val;
1008                 break;
1009
1010         case SO_CNX_ADVICE:
1011                 if (val == 1)
1012                         dst_negative_advice(sk);
1013                 break;
1014
1015         case SO_ZEROCOPY:
1016                 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1017                         if (sk->sk_protocol != IPPROTO_TCP)
1018                                 ret = -ENOTSUPP;
1019                 } else if (sk->sk_family != PF_RDS) {
1020                         ret = -ENOTSUPP;
1021                 }
1022                 if (!ret) {
1023                         if (val < 0 || val > 1)
1024                                 ret = -EINVAL;
1025                         else
1026                                 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1027                 }
1028                 break;
1029
1030         case SO_TXTIME:
1031                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1032                         ret = -EPERM;
1033                 } else if (optlen != sizeof(struct sock_txtime)) {
1034                         ret = -EINVAL;
1035                 } else if (copy_from_user(&sk_txtime, optval,
1036                            sizeof(struct sock_txtime))) {
1037                         ret = -EFAULT;
1038                 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1039                         ret = -EINVAL;
1040                 } else {
1041                         sock_valbool_flag(sk, SOCK_TXTIME, true);
1042                         sk->sk_clockid = sk_txtime.clockid;
1043                         sk->sk_txtime_deadline_mode =
1044                                 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1045                         sk->sk_txtime_report_errors =
1046                                 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1047                 }
1048                 break;
1049
1050         default:
1051                 ret = -ENOPROTOOPT;
1052                 break;
1053         }
1054         release_sock(sk);
1055         return ret;
1056 }
1057 EXPORT_SYMBOL(sock_setsockopt);
1058
1059
1060 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1061                           struct ucred *ucred)
1062 {
1063         ucred->pid = pid_vnr(pid);
1064         ucred->uid = ucred->gid = -1;
1065         if (cred) {
1066                 struct user_namespace *current_ns = current_user_ns();
1067
1068                 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1069                 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1070         }
1071 }
1072
1073 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1074 {
1075         struct user_namespace *user_ns = current_user_ns();
1076         int i;
1077
1078         for (i = 0; i < src->ngroups; i++)
1079                 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1080                         return -EFAULT;
1081
1082         return 0;
1083 }
1084
1085 int sock_getsockopt(struct socket *sock, int level, int optname,
1086                     char __user *optval, int __user *optlen)
1087 {
1088         struct sock *sk = sock->sk;
1089
1090         union {
1091                 int val;
1092                 u64 val64;
1093                 struct linger ling;
1094                 struct timeval tm;
1095                 struct sock_txtime txtime;
1096         } v;
1097
1098         int lv = sizeof(int);
1099         int len;
1100
1101         if (get_user(len, optlen))
1102                 return -EFAULT;
1103         if (len < 0)
1104                 return -EINVAL;
1105
1106         memset(&v, 0, sizeof(v));
1107
1108         switch (optname) {
1109         case SO_DEBUG:
1110                 v.val = sock_flag(sk, SOCK_DBG);
1111                 break;
1112
1113         case SO_DONTROUTE:
1114                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1115                 break;
1116
1117         case SO_BROADCAST:
1118                 v.val = sock_flag(sk, SOCK_BROADCAST);
1119                 break;
1120
1121         case SO_SNDBUF:
1122                 v.val = sk->sk_sndbuf;
1123                 break;
1124
1125         case SO_RCVBUF:
1126                 v.val = sk->sk_rcvbuf;
1127                 break;
1128
1129         case SO_REUSEADDR:
1130                 v.val = sk->sk_reuse;
1131                 break;
1132
1133         case SO_REUSEPORT:
1134                 v.val = sk->sk_reuseport;
1135                 break;
1136
1137         case SO_KEEPALIVE:
1138                 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1139                 break;
1140
1141         case SO_TYPE:
1142                 v.val = sk->sk_type;
1143                 break;
1144
1145         case SO_PROTOCOL:
1146                 v.val = sk->sk_protocol;
1147                 break;
1148
1149         case SO_DOMAIN:
1150                 v.val = sk->sk_family;
1151                 break;
1152
1153         case SO_ERROR:
1154                 v.val = -sock_error(sk);
1155                 if (v.val == 0)
1156                         v.val = xchg(&sk->sk_err_soft, 0);
1157                 break;
1158
1159         case SO_OOBINLINE:
1160                 v.val = sock_flag(sk, SOCK_URGINLINE);
1161                 break;
1162
1163         case SO_NO_CHECK:
1164                 v.val = sk->sk_no_check_tx;
1165                 break;
1166
1167         case SO_PRIORITY:
1168                 v.val = sk->sk_priority;
1169                 break;
1170
1171         case SO_LINGER:
1172                 lv              = sizeof(v.ling);
1173                 v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1174                 v.ling.l_linger = sk->sk_lingertime / HZ;
1175                 break;
1176
1177         case SO_BSDCOMPAT:
1178                 sock_warn_obsolete_bsdism("getsockopt");
1179                 break;
1180
1181         case SO_TIMESTAMP:
1182                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1183                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1184                 break;
1185
1186         case SO_TIMESTAMPNS:
1187                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1188                 break;
1189
1190         case SO_TIMESTAMPING:
1191                 v.val = sk->sk_tsflags;
1192                 break;
1193
1194         case SO_RCVTIMEO:
1195                 lv = sizeof(struct timeval);
1196                 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1197                         v.tm.tv_sec = 0;
1198                         v.tm.tv_usec = 0;
1199                 } else {
1200                         v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1201                         v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
1202                 }
1203                 break;
1204
1205         case SO_SNDTIMEO:
1206                 lv = sizeof(struct timeval);
1207                 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1208                         v.tm.tv_sec = 0;
1209                         v.tm.tv_usec = 0;
1210                 } else {
1211                         v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1212                         v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
1213                 }
1214                 break;
1215
1216         case SO_RCVLOWAT:
1217                 v.val = sk->sk_rcvlowat;
1218                 break;
1219
1220         case SO_SNDLOWAT:
1221                 v.val = 1;
1222                 break;
1223
1224         case SO_PASSCRED:
1225                 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1226                 break;
1227
1228         case SO_PEERCRED:
1229         {
1230                 struct ucred peercred;
1231                 if (len > sizeof(peercred))
1232                         len = sizeof(peercred);
1233                 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1234                 if (copy_to_user(optval, &peercred, len))
1235                         return -EFAULT;
1236                 goto lenout;
1237         }
1238
1239         case SO_PEERGROUPS:
1240         {
1241                 int ret, n;
1242
1243                 if (!sk->sk_peer_cred)
1244                         return -ENODATA;
1245
1246                 n = sk->sk_peer_cred->group_info->ngroups;
1247                 if (len < n * sizeof(gid_t)) {
1248                         len = n * sizeof(gid_t);
1249                         return put_user(len, optlen) ? -EFAULT : -ERANGE;
1250                 }
1251                 len = n * sizeof(gid_t);
1252
1253                 ret = groups_to_user((gid_t __user *)optval,
1254                                      sk->sk_peer_cred->group_info);
1255                 if (ret)
1256                         return ret;
1257                 goto lenout;
1258         }
1259
1260         case SO_PEERNAME:
1261         {
1262                 char address[128];
1263
1264                 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1265                 if (lv < 0)
1266                         return -ENOTCONN;
1267                 if (lv < len)
1268                         return -EINVAL;
1269                 if (copy_to_user(optval, address, len))
1270                         return -EFAULT;
1271                 goto lenout;
1272         }
1273
1274         /* Dubious BSD thing... Probably nobody even uses it, but
1275          * the UNIX standard wants it for whatever reason... -DaveM
1276          */
1277         case SO_ACCEPTCONN:
1278                 v.val = sk->sk_state == TCP_LISTEN;
1279                 break;
1280
1281         case SO_PASSSEC:
1282                 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1283                 break;
1284
1285         case SO_PEERSEC:
1286                 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1287
1288         case SO_MARK:
1289                 v.val = sk->sk_mark;
1290                 break;
1291
1292         case SO_RXQ_OVFL:
1293                 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1294                 break;
1295
1296         case SO_WIFI_STATUS:
1297                 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1298                 break;
1299
1300         case SO_PEEK_OFF:
1301                 if (!sock->ops->set_peek_off)
1302                         return -EOPNOTSUPP;
1303
1304                 v.val = sk->sk_peek_off;
1305                 break;
1306         case SO_NOFCS:
1307                 v.val = sock_flag(sk, SOCK_NOFCS);
1308                 break;
1309
1310         case SO_BINDTODEVICE:
1311                 return sock_getbindtodevice(sk, optval, optlen, len);
1312
1313         case SO_GET_FILTER:
1314                 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1315                 if (len < 0)
1316                         return len;
1317
1318                 goto lenout;
1319
1320         case SO_LOCK_FILTER:
1321                 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1322                 break;
1323
1324         case SO_BPF_EXTENSIONS:
1325                 v.val = bpf_tell_extensions();
1326                 break;
1327
1328         case SO_SELECT_ERR_QUEUE:
1329                 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1330                 break;
1331
1332 #ifdef CONFIG_NET_RX_BUSY_POLL
1333         case SO_BUSY_POLL:
1334                 v.val = sk->sk_ll_usec;
1335                 break;
1336 #endif
1337
1338         case SO_MAX_PACING_RATE:
1339                 v.val = sk->sk_max_pacing_rate;
1340                 break;
1341
1342         case SO_INCOMING_CPU:
1343                 v.val = sk->sk_incoming_cpu;
1344                 break;
1345
1346         case SO_MEMINFO:
1347         {
1348                 u32 meminfo[SK_MEMINFO_VARS];
1349
1350                 if (get_user(len, optlen))
1351                         return -EFAULT;
1352
1353                 sk_get_meminfo(sk, meminfo);
1354
1355                 len = min_t(unsigned int, len, sizeof(meminfo));
1356                 if (copy_to_user(optval, &meminfo, len))
1357                         return -EFAULT;
1358
1359                 goto lenout;
1360         }
1361
1362 #ifdef CONFIG_NET_RX_BUSY_POLL
1363         case SO_INCOMING_NAPI_ID:
1364                 v.val = READ_ONCE(sk->sk_napi_id);
1365
1366                 /* aggregate non-NAPI IDs down to 0 */
1367                 if (v.val < MIN_NAPI_ID)
1368                         v.val = 0;
1369
1370                 break;
1371 #endif
1372
1373         case SO_COOKIE:
1374                 lv = sizeof(u64);
1375                 if (len < lv)
1376                         return -EINVAL;
1377                 v.val64 = sock_gen_cookie(sk);
1378                 break;
1379
1380         case SO_ZEROCOPY:
1381                 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1382                 break;
1383
1384         case SO_TXTIME:
1385                 lv = sizeof(v.txtime);
1386                 v.txtime.clockid = sk->sk_clockid;
1387                 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1388                                   SOF_TXTIME_DEADLINE_MODE : 0;
1389                 v.txtime.flags |= sk->sk_txtime_report_errors ?
1390                                   SOF_TXTIME_REPORT_ERRORS : 0;
1391                 break;
1392
1393         default:
1394                 /* We implement the SO_SNDLOWAT etc to not be settable
1395                  * (1003.1g 7).
1396                  */
1397                 return -ENOPROTOOPT;
1398         }
1399
1400         if (len > lv)
1401                 len = lv;
1402         if (copy_to_user(optval, &v, len))
1403                 return -EFAULT;
1404 lenout:
1405         if (put_user(len, optlen))
1406                 return -EFAULT;
1407         return 0;
1408 }
1409
1410 /*
1411  * Initialize an sk_lock.
1412  *
1413  * (We also register the sk_lock with the lock validator.)
1414  */
1415 static inline void sock_lock_init(struct sock *sk)
1416 {
1417         if (sk->sk_kern_sock)
1418                 sock_lock_init_class_and_name(
1419                         sk,
1420                         af_family_kern_slock_key_strings[sk->sk_family],
1421                         af_family_kern_slock_keys + sk->sk_family,
1422                         af_family_kern_key_strings[sk->sk_family],
1423                         af_family_kern_keys + sk->sk_family);
1424         else
1425                 sock_lock_init_class_and_name(
1426                         sk,
1427                         af_family_slock_key_strings[sk->sk_family],
1428                         af_family_slock_keys + sk->sk_family,
1429                         af_family_key_strings[sk->sk_family],
1430                         af_family_keys + sk->sk_family);
1431 }
1432
1433 /*
1434  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1435  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1436  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1437  */
1438 static void sock_copy(struct sock *nsk, const struct sock *osk)
1439 {
1440 #ifdef CONFIG_SECURITY_NETWORK
1441         void *sptr = nsk->sk_security;
1442 #endif
1443         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1444
1445         memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1446                osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1447
1448 #ifdef CONFIG_SECURITY_NETWORK
1449         nsk->sk_security = sptr;
1450         security_sk_clone(osk, nsk);
1451 #endif
1452 }
1453
1454 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1455                 int family)
1456 {
1457         struct sock *sk;
1458         struct kmem_cache *slab;
1459
1460         slab = prot->slab;
1461         if (slab != NULL) {
1462                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1463                 if (!sk)
1464                         return sk;
1465                 if (priority & __GFP_ZERO)
1466                         sk_prot_clear_nulls(sk, prot->obj_size);
1467         } else
1468                 sk = kmalloc(prot->obj_size, priority);
1469
1470         if (sk != NULL) {
1471                 if (security_sk_alloc(sk, family, priority))
1472                         goto out_free;
1473
1474                 if (!try_module_get(prot->owner))
1475                         goto out_free_sec;
1476                 sk_tx_queue_clear(sk);
1477         }
1478
1479         return sk;
1480
1481 out_free_sec:
1482         security_sk_free(sk);
1483 out_free:
1484         if (slab != NULL)
1485                 kmem_cache_free(slab, sk);
1486         else
1487                 kfree(sk);
1488         return NULL;
1489 }
1490
1491 static void sk_prot_free(struct proto *prot, struct sock *sk)
1492 {
1493         struct kmem_cache *slab;
1494         struct module *owner;
1495
1496         owner = prot->owner;
1497         slab = prot->slab;
1498
1499         cgroup_sk_free(&sk->sk_cgrp_data);
1500         mem_cgroup_sk_free(sk);
1501         security_sk_free(sk);
1502         if (slab != NULL)
1503                 kmem_cache_free(slab, sk);
1504         else
1505                 kfree(sk);
1506         module_put(owner);
1507 }
1508
1509 /**
1510  *      sk_alloc - All socket objects are allocated here
1511  *      @net: the applicable net namespace
1512  *      @family: protocol family
1513  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1514  *      @prot: struct proto associated with this new sock instance
1515  *      @kern: is this to be a kernel socket?
1516  */
1517 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1518                       struct proto *prot, int kern)
1519 {
1520         struct sock *sk;
1521
1522         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1523         if (sk) {
1524                 sk->sk_family = family;
1525                 /*
1526                  * See comment in struct sock definition to understand
1527                  * why we need sk_prot_creator -acme
1528                  */
1529                 sk->sk_prot = sk->sk_prot_creator = prot;
1530                 sk->sk_kern_sock = kern;
1531                 sock_lock_init(sk);
1532                 sk->sk_net_refcnt = kern ? 0 : 1;
1533                 if (likely(sk->sk_net_refcnt)) {
1534                         get_net(net);
1535                         sock_inuse_add(net, 1);
1536                 }
1537
1538                 sock_net_set(sk, net);
1539                 refcount_set(&sk->sk_wmem_alloc, 1);
1540
1541                 mem_cgroup_sk_alloc(sk);
1542                 cgroup_sk_alloc(&sk->sk_cgrp_data);
1543                 sock_update_classid(&sk->sk_cgrp_data);
1544                 sock_update_netprioidx(&sk->sk_cgrp_data);
1545         }
1546
1547         return sk;
1548 }
1549 EXPORT_SYMBOL(sk_alloc);
1550
1551 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1552  * grace period. This is the case for UDP sockets and TCP listeners.
1553  */
1554 static void __sk_destruct(struct rcu_head *head)
1555 {
1556         struct sock *sk = container_of(head, struct sock, sk_rcu);
1557         struct sk_filter *filter;
1558
1559         if (sk->sk_destruct)
1560                 sk->sk_destruct(sk);
1561
1562         filter = rcu_dereference_check(sk->sk_filter,
1563                                        refcount_read(&sk->sk_wmem_alloc) == 0);
1564         if (filter) {
1565                 sk_filter_uncharge(sk, filter);
1566                 RCU_INIT_POINTER(sk->sk_filter, NULL);
1567         }
1568         if (rcu_access_pointer(sk->sk_reuseport_cb))
1569                 reuseport_detach_sock(sk);
1570
1571         sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1572
1573         if (atomic_read(&sk->sk_omem_alloc))
1574                 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1575                          __func__, atomic_read(&sk->sk_omem_alloc));
1576
1577         if (sk->sk_frag.page) {
1578                 put_page(sk->sk_frag.page);
1579                 sk->sk_frag.page = NULL;
1580         }
1581
1582         if (sk->sk_peer_cred)
1583                 put_cred(sk->sk_peer_cred);
1584         put_pid(sk->sk_peer_pid);
1585         if (likely(sk->sk_net_refcnt))
1586                 put_net(sock_net(sk));
1587         sk_prot_free(sk->sk_prot_creator, sk);
1588 }
1589
1590 void sk_destruct(struct sock *sk)
1591 {
1592         if (sock_flag(sk, SOCK_RCU_FREE))
1593                 call_rcu(&sk->sk_rcu, __sk_destruct);
1594         else
1595                 __sk_destruct(&sk->sk_rcu);
1596 }
1597
1598 static void __sk_free(struct sock *sk)
1599 {
1600         if (likely(sk->sk_net_refcnt))
1601                 sock_inuse_add(sock_net(sk), -1);
1602
1603         if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1604                 sock_diag_broadcast_destroy(sk);
1605         else
1606                 sk_destruct(sk);
1607 }
1608
1609 void sk_free(struct sock *sk)
1610 {
1611         /*
1612          * We subtract one from sk_wmem_alloc and can know if
1613          * some packets are still in some tx queue.
1614          * If not null, sock_wfree() will call __sk_free(sk) later
1615          */
1616         if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1617                 __sk_free(sk);
1618 }
1619 EXPORT_SYMBOL(sk_free);
1620
1621 static void sk_init_common(struct sock *sk)
1622 {
1623         skb_queue_head_init(&sk->sk_receive_queue);
1624         skb_queue_head_init(&sk->sk_write_queue);
1625         skb_queue_head_init(&sk->sk_error_queue);
1626
1627         rwlock_init(&sk->sk_callback_lock);
1628         lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1629                         af_rlock_keys + sk->sk_family,
1630                         af_family_rlock_key_strings[sk->sk_family]);
1631         lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1632                         af_wlock_keys + sk->sk_family,
1633                         af_family_wlock_key_strings[sk->sk_family]);
1634         lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1635                         af_elock_keys + sk->sk_family,
1636                         af_family_elock_key_strings[sk->sk_family]);
1637         lockdep_set_class_and_name(&sk->sk_callback_lock,
1638                         af_callback_keys + sk->sk_family,
1639                         af_family_clock_key_strings[sk->sk_family]);
1640 }
1641
1642 /**
1643  *      sk_clone_lock - clone a socket, and lock its clone
1644  *      @sk: the socket to clone
1645  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1646  *
1647  *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
      *
      *      Returns the clone with bh_lock_sock() held and sk_refcnt set to 2.
      *      Returns NULL if the allocation failed, or if charging the socket
      *      filter or cloning the xfrm policy failed; in the latter two cases
      *      the partially built clone is released through
      *      sk_free_unlock_clone() before returning.
1648  */
1649 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1650 {
1651         struct sock *newsk;
1652         bool is_charged = true;
1653
1654         newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1655         if (newsk != NULL) {
1656                 struct sk_filter *filter;
1657
                     /* Start from a copy of the parent; per-socket state is reset below. */
1658                 sock_copy(newsk, sk);
1659
1660                 newsk->sk_prot_creator = sk->sk_prot;
1661
1662                 /* SANITY */
1663                 if (likely(newsk->sk_net_refcnt))
1664                         get_net(sock_net(newsk));
1665                 sk_node_init(&newsk->sk_node);
1666                 sock_lock_init(newsk);
                     /* The clone stays bh-locked from here until the caller unlocks it. */
1667                 bh_lock_sock(newsk);
                     /* Empty backlog and zeroed memory accounting for the new socket. */
1668                 newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1669                 newsk->sk_backlog.len = 0;
1670
1671                 atomic_set(&newsk->sk_rmem_alloc, 0);
1672                 /*
1673                  * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1674                  */
1675                 refcount_set(&newsk->sk_wmem_alloc, 1);
1676                 atomic_set(&newsk->sk_omem_alloc, 0);
1677                 sk_init_common(newsk);
1678
1679                 newsk->sk_dst_cache     = NULL;
1680                 newsk->sk_dst_pending_confirm = 0;
1681                 newsk->sk_wmem_queued   = 0;
1682                 newsk->sk_forward_alloc = 0;
1683                 atomic_set(&newsk->sk_drops, 0);
1684                 newsk->sk_send_head     = NULL;
                     /* Inherit the parent's user locks except SOCK_BINDPORT_LOCK. */
1685                 newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1686                 atomic_set(&newsk->sk_zckey, 0);
1687
1688                 sock_reset_flag(newsk, SOCK_DONE);
1689                 mem_cgroup_sk_alloc(newsk);
1690                 cgroup_sk_alloc(&newsk->sk_cgrp_data);
1691
                     /* Share the parent's filter; is_charged records whether the clone paid for it. */
1692                 rcu_read_lock();
1693                 filter = rcu_dereference(sk->sk_filter);
1694                 if (filter != NULL)
1695                         /* though it's an empty new sock, the charging may fail
1696                          * if sysctl_optmem_max was changed between creation of
1697                          * original socket and cloning
1698                          */
1699                         is_charged = sk_filter_charge(newsk, filter);
1700                 RCU_INIT_POINTER(newsk->sk_filter, filter);
1701                 rcu_read_unlock();
1702
1703                 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1704                         /* We need to make sure that we don't uncharge the new
1705                          * socket if we couldn't charge it in the first place
1706                          * as otherwise we uncharge the parent's filter.
1707                          */
1708                         if (!is_charged)
1709                                 RCU_INIT_POINTER(newsk->sk_filter, NULL);
1710                         sk_free_unlock_clone(newsk);
1711                         newsk = NULL;
1712                         goto out;
1713                 }
1714                 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1715
1716                 newsk->sk_err      = 0;
1717                 newsk->sk_err_soft = 0;
1718                 newsk->sk_priority = 0;
1719                 newsk->sk_incoming_cpu = raw_smp_processor_id();
1720                 atomic64_set(&newsk->sk_cookie, 0);
1721                 if (likely(newsk->sk_net_refcnt))
1722                         sock_inuse_add(sock_net(newsk), 1);
1723
1724                 /*
1725                  * Before updating sk_refcnt, we must commit prior changes to memory
1726                  * (Documentation/RCU/rculist_nulls.txt for details)
1727                  */
1728                 smp_wmb();
1729                 refcount_set(&newsk->sk_refcnt, 2);
1730
1731                 /*
1732                  * Increment the counter in the same struct proto as the master
1733                  * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1734                  * is the same as sk->sk_prot->socks, as this field was copied
1735                  * with memcpy).
1736                  *
1737                  * This _changes_ the previous behaviour, where
1738                  * tcp_create_openreq_child always was incrementing the
1739                  * equivalent to tcp_prot->socks (inet_sock_nr), so this has
1740                  * to be taken into account in all callers. -acme
1741                  */
1742                 sk_refcnt_debug_inc(newsk);
                     /* The clone is not attached to any struct socket or wait queue yet. */
1743                 sk_set_socket(newsk, NULL);
1744                 newsk->sk_wq = NULL;
1745
1746                 if (newsk->sk_prot->sockets_allocated)
1747                         sk_sockets_allocated_inc(newsk);
1748
                     /* NOTE(review): presumably accounts for timestamp flags inherited
                      * from the parent via the copy above; confirm against the
                      * matching net_disable_timestamp() path.
                      */
1749                 if (sock_needs_netstamp(sk) &&
1750                     newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1751                         net_enable_timestamp();
1752         }
1753 out:
1754         return newsk;
1755 }
1756 EXPORT_SYMBOL_GPL(sk_clone_lock);
1757
1758 void sk_free_unlock_clone(struct sock *sk)
1759 {
             /* Release a bh-locked clone; sk_clone_lock() uses this on its failure path. */
1760         /* It is still raw copy of parent, so invalidate
1761          * destructor and make plain sk_free() */
1762         sk->sk_destruct = NULL;
1763         bh_unlock_sock(sk);
1764         sk_free(sk);
1765 }
1766 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1767
1768 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1769 {
             /* Attach @dst to @sk and derive sk_route_caps, sk_gso_max_size and
              * sk_gso_max_segs from the features of the dst's device.
              */
1770         u32 max_segs = 1;
1771
1772         sk_dst_set(sk, dst);
             /* Device features plus forced caps; NETIF_F_GSO implies the software
              * GSO set; caps listed in sk_route_nocaps are masked out last.
              */
1773         sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
1774         if (sk->sk_route_caps & NETIF_F_GSO)
1775                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1776         sk->sk_route_caps &= ~sk->sk_route_nocaps;
1777         if (sk_can_gso(sk)) {
                     /* A dst with extra header_len and no xfrm offload loses all GSO caps;
                      * otherwise adopt the device's GSO limits (at least one segment).
                      */
1778                 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
1779                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1780                 } else {
1781                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1782                         sk->sk_gso_max_size = dst->dev->gso_max_size;
1783                         max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1784                 }
1785         }
             /* Stays 1 unless the GSO branch above raised it. */
1786         sk->sk_gso_max_segs = max_segs;
1787 }
1788 EXPORT_SYMBOL_GPL(sk_setup_caps);
1789
1790 /*
1791  *      Simple resource managers for sockets.
1792  */
1793
1794
1795 /*
1796  * Write buffer destructor automatically called from kfree_skb.
      *
      * Removes skb->truesize from sk->sk_wmem_alloc and calls __sk_free() when
      * the counter reaches zero.
1797  */
1798 void sock_wfree(struct sk_buff *skb)
1799 {
1800         struct sock *sk = skb->sk;
1801         unsigned int len = skb->truesize;
1802
1803         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1804                 /*
1805                  * Keep a reference on sk_wmem_alloc, this will be released
1806                  * after sk_write_space() call
1807                  */
                     /* Subtract all but one unit now; the WARN_ON fires if that already hits zero. */
1808                 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1809                 sk->sk_write_space(sk);
                     /* Only the single unit kept above remains to be dropped. */
1810                 len = 1;
1811         }
1812         /*
1813          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1814          * could not do because of in-flight packets
1815          */
1816         if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
1817                 __sk_free(sk);
1818 }
1819 EXPORT_SYMBOL(sock_wfree);
1820
1821 /* This variant of sock_wfree() is used by TCP,
1822  * since it sets SOCK_USE_WRITE_QUEUE.
      *
      * Unlike sock_wfree() it never calls sk->sk_write_space().
1823  */
1824 void __sock_wfree(struct sk_buff *skb)
1825 {
1826         struct sock *sk = skb->sk;
1827
             /* Uncharge the whole truesize; call __sk_free() if the counter hit zero. */
1828         if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1829                 __sk_free(sk);
1830 }
1831
1832 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1833 {
             /* Make @sk the owner of @skb on the write side: any previous owner is
              * dropped via skb_orphan(), then skb->truesize is charged to
              * sk->sk_wmem_alloc with sock_wfree() as destructor.
              */
1834         skb_orphan(skb);
1835         skb->sk = sk;
1836 #ifdef CONFIG_INET
             /* Sockets that are not full sockets are pinned with a plain reference
              * (released by sock_edemux) instead of a sk_wmem_alloc charge.
              */
1837         if (unlikely(!sk_fullsock(sk))) {
1838                 skb->destructor = sock_edemux;
1839                 sock_hold(sk);
1840                 return;
1841         }
1842 #endif
1843         skb->destructor = sock_wfree;
1844         skb_set_hash_from_sk(skb, sk);
1845         /*
1846          * We used to take a refcount on sk, but following operation
1847          * is enough to guarantee sk_free() won't free this sock until
1848          * all in-flight packets are completed
1849          */
1850         refcount_add(skb->truesize, &sk->sk_wmem_alloc);
1851 }
1852 EXPORT_SYMBOL(skb_set_owner_w);
1853
1854 /* This helper is used by netem, as it can hold packets in its
1855  * delay queue. We want to allow the owner socket to send more
1856  * packets, as if they were already TX completed by a typical driver.
1857  * But we also want to keep skb->sk set because some packet schedulers
1858  * rely on it (sch_fq for example).
1859  */
1860 void skb_orphan_partial(struct sk_buff *skb)
1861 {
             /* Pure TCP ACKs are left exactly as they are. */
1862         if (skb_is_tcp_pure_ack(skb))
1863                 return;
1864
             /* Only skbs whose destructor is sock_wfree (or tcp_wfree under
              * CONFIG_INET) are converted; anything else is fully orphaned.
              */
1865         if (skb->destructor == sock_wfree
1866 #ifdef CONFIG_INET
1867             || skb->destructor == tcp_wfree
1868 #endif
1869                 ) {
1870                 struct sock *sk = skb->sk;
1871
                     /* Trade the sk_wmem_alloc charge for a socket reference that
                      * sock_efree() will drop. If sk_refcnt is already zero the skb
                      * is left unchanged.
                      */
1872                 if (refcount_inc_not_zero(&sk->sk_refcnt)) {
1873                         WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
1874                         skb->destructor = sock_efree;
1875                 }
1876         } else {
1877                 skb_orphan(skb);
1878         }
1879 }
1880 EXPORT_SYMBOL(skb_orphan_partial);
1881
1882 /*
1883  * Read buffer destructor automatically called from kfree_skb.
      *
      * Subtracts skb->truesize from sk->sk_rmem_alloc and passes the same
      * amount to sk_mem_uncharge().
1884  */
1885 void sock_rfree(struct sk_buff *skb)
1886 {
1887         struct sock *sk = skb->sk;
1888         unsigned int len = skb->truesize;
1889
1890         atomic_sub(len, &sk->sk_rmem_alloc);
1891         sk_mem_uncharge(sk, len);
1892 }
1893 EXPORT_SYMBOL(sock_rfree);
1894
1895 /*
1896  * Buffer destructor for skbs that are not used directly in read or write
1897  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
      *
      * Drops one reference on skb->sk via sock_put(); no memory accounting is
      * touched. skb_orphan_partial() installs this destructor after taking
      * such a reference.
1898  */
1899 void sock_efree(struct sk_buff *skb)
1900 {
1901         sock_put(skb->sk);
1902 }
1903 EXPORT_SYMBOL(sock_efree);
1904
1905 kuid_t sock_i_uid(struct sock *sk)
1906 {
             /* Return the i_uid of the inode behind sk->sk_socket, or
              * GLOBAL_ROOT_UID when no struct socket is attached.
              */
1907         kuid_t uid;
1908
             /* sk_socket is sampled under the read side of sk_callback_lock. */
1909         read_lock_bh(&sk->sk_callback_lock);
1910         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1911         read_unlock_bh(&sk->sk_callback_lock);
1912         return uid;
1913 }
1914 EXPORT_SYMBOL(sock_i_uid);
1915
1916 unsigned long sock_i_ino(struct sock *sk)
1917 {
             /* Return the i_ino of the inode behind sk->sk_socket, or 0 when no
              * struct socket is attached.
              */
1918         unsigned long ino;
1919
             /* sk_socket is sampled under the read side of sk_callback_lock. */
1920         read_lock_bh(&sk->sk_callback_lock);
1921         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1922         read_unlock_bh(&sk->sk_callback_lock);
1923         return ino;
1924 }
1925 EXPORT_SYMBOL(sock_i_ino);
1926
1927 /*
1928  * Allocate a skb from the socket's send buffer.
      *
      * The allocation is attempted only if @force is set or sk_wmem_alloc is
      * still below sk_sndbuf. On success the skb is charged to @sk through
      * skb_set_owner_w(). Returns NULL when the limit check or alloc_skb()
      * fails.
1929  */
1930 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1931                              gfp_t priority)
1932 {
1933         if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1934                 struct sk_buff *skb = alloc_skb(size, priority);
1935                 if (skb) {
1936                         skb_set_owner_w(skb, sk);
1937                         return skb;
1938                 }
1939         }
1940         return NULL;
1941 }
1942 EXPORT_SYMBOL(sock_wmalloc);
1943
1944 static void sock_ofree(struct sk_buff *skb)
1945 {
             /* skb destructor installed by sock_omalloc(): removes the skb's
              * truesize from sk->sk_omem_alloc.
              */
1946         struct sock *sk = skb->sk;
1947
1948         atomic_sub(skb->truesize, &sk->sk_omem_alloc);
1949 }
1950
1951 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
1952                              gfp_t priority)
1953 {
             /* Allocate an skb accounted against sk->sk_omem_alloc, bounded by
              * sysctl_optmem_max. Returns NULL if the estimated charge would exceed
              * the limit or if alloc_skb() fails.
              */
1954         struct sk_buff *skb;
1955
1956         /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
1957         if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
1958             sysctl_optmem_max)
1959                 return NULL;
1960
1961         skb = alloc_skb(size, priority);
1962         if (!skb)
1963                 return NULL;
1964
             /* Charge the real truesize; sock_ofree() reverses it when the skb is freed. */
1965         atomic_add(skb->truesize, &sk->sk_omem_alloc);
1966         skb->sk = sk;
1967         skb->destructor = sock_ofree;
1968         return skb;
1969 }
1970
1971 /*
1972  * Allocate a memory block from the socket's option memory buffer.
      *
      * @size bytes are charged to sk->sk_omem_alloc before kmalloc() is
      * called and uncharged again if kmalloc() fails. Returns NULL when
      * @size is above sysctl_optmem_max, when the current charge plus @size
      * would reach sysctl_optmem_max, or when kmalloc() fails.
1973  */
1974 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1975 {
             /* NOTE(review): the unsigned cast presumably makes a negative @size
              * fail this check; confirm against the type of sysctl_optmem_max.
              */
1976         if ((unsigned int)size <= sysctl_optmem_max &&
1977             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1978                 void *mem;
1979                 /* First do the add, to avoid the race if kmalloc
1980                  * might sleep.
1981                  */
1982                 atomic_add(size, &sk->sk_omem_alloc);
1983                 mem = kmalloc(size, priority);
1984                 if (mem)
1985                         return mem;
1986                 atomic_sub(size, &sk->sk_omem_alloc);
1987         }
1988         return NULL;
1989 }
1990 EXPORT_SYMBOL(sock_kmalloc);
1991
1992 /* Free an option memory block. Note, we actually want the inline
1993  * here as this allows gcc to detect the nullify and fold away the
1994  * condition entirely.
      *
      * A NULL @mem triggers a one-time warning and nothing is freed or
      * uncharged. Otherwise @mem is released with kzfree() when @nullify is
      * true, kfree() when it is false, and @size is subtracted from
      * sk->sk_omem_alloc.
1995  */
1996 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1997                                   const bool nullify)
1998 {
1999         if (WARN_ON_ONCE(!mem))
2000                 return;
2001         if (nullify)
2002                 kzfree(mem);
2003         else
2004                 kfree(mem);
2005         atomic_sub(size, &sk->sk_omem_alloc);
2006 }
2007
2008 void sock_kfree_s(struct sock *sk, void *mem, int size)
2009 {
             /* Free @mem with kfree() and remove @size from sk->sk_omem_alloc,
              * reversing the charge taken by sock_kmalloc().
              */
2010         __sock_kfree_s(sk, mem, size, false);
2011 }
2012 EXPORT_SYMBOL(sock_kfree_s);
2013
2014 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2015 {
             /* Same as sock_kfree_s(), except that @mem is released with kzfree(). */
2016         __sock_kfree_s(sk, mem, size, true);
2017 }
2018 EXPORT_SYMBOL(sock_kzfree_s);
2019
2020 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2021    I think, these locks should be removed for datagram sockets.

        Sleeps interruptibly on the socket's wait queue until one of these holds:
        sk_wmem_alloc drops below sk_sndbuf, the socket is shut down for
        sending, sk_err is set, a signal is pending, or @timeo runs out.
        Returns the remaining timeout.
2022  */
2023 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2024 {
2025         DEFINE_WAIT(wait);
2026
2027         sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2028         for (;;) {
2029                 if (!timeo)
2030                         break;
2031                 if (signal_pending(current))
2032                         break;
2033                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                     /* Queue ourselves before re-testing the wake-up conditions. */
2034                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2035                 if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
2036                         break;
2037                 if (sk->sk_shutdown & SEND_SHUTDOWN)
2038                         break;
2039                 if (sk->sk_err)
2040                         break;
2041                 timeo = schedule_timeout(timeo);
2042         }
2043         finish_wait(sk_sleep(sk), &wait);
2044         return timeo;
2045 }
2046
2047
2048 /*
2049  *      Generic send/receive buffer handlers
2050  */
2051
2052 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2053                                      unsigned long data_len, int noblock,
2054                                      int *errcode, int max_page_order)
2055 {
             /* Allocate a send skb charged to @sk, waiting for send-buffer space
              * for up to the timeout returned by sock_sndtimeo(sk, noblock).
              *
              * Returns the skb on success. Returns NULL with *errcode set when a
              * socket error is pending, the socket is shut down for sending
              * (-EPIPE), the timeout is exhausted (-EAGAIN) or a signal is pending
              * (sock_intr_errno()). If alloc_skb_with_frags() itself fails, NULL
              * is returned and @errcode is whatever that call stored through it.
              */
2056         struct sk_buff *skb;
2057         long timeo;
2058         int err;
2059
2060         timeo = sock_sndtimeo(sk, noblock);
2061         for (;;) {
2062                 err = sock_error(sk);
2063                 if (err != 0)
2064                         goto failure;
2065
2066                 err = -EPIPE;
2067                 if (sk->sk_shutdown & SEND_SHUTDOWN)
2068                         goto failure;
2069
                     /* Room in the send buffer: leave the wait loop and allocate. */
2070                 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
2071                         break;
2072
2073                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2074                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2075                 err = -EAGAIN;
2076                 if (!timeo)
2077                         goto failure;
2078                 if (signal_pending(current))
2079                         goto interrupted;
2080                 timeo = sock_wait_for_wmem(sk, timeo);
2081         }
2082         skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2083                                    errcode, sk->sk_allocation);
2084         if (skb)
2085                 skb_set_owner_w(skb, sk);
2086         return skb;
2087
2088 interrupted:
2089         err = sock_intr_errno(timeo);
2090 failure:
2091         *errcode = err;
2092         return NULL;
2093 }
2094 EXPORT_SYMBOL(sock_alloc_send_pskb);
2095
2096 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2097                                     int noblock, int *errcode)
2098 {
             /* Wrapper around sock_alloc_send_pskb() with @size as header_len,
              * data_len 0 and max_page_order 0.
              */
2099         return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2100 }
2101 EXPORT_SYMBOL(sock_alloc_send_skb);
2102
2103 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2104                      struct sockcm_cookie *sockc)
2105 {
             /* Apply one control message to @sockc, dispatching on cmsg_type.
              * Returns 0 on success, -EPERM when SO_MARK is used without
              * CAP_NET_ADMIN, and -EINVAL for a wrong payload length, a disallowed
              * value or an unknown type. @msg is not used here.
              */
2106         u32 tsflags;
2107
2108         switch (cmsg->cmsg_type) {
2109         case SO_MARK:
                     /* Needs CAP_NET_ADMIN in the user namespace of the socket's netns. */
2110                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2111                         return -EPERM;
2112                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2113                         return -EINVAL;
2114                 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2115                 break;
2116         case SO_TIMESTAMPING:
2117                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2118                         return -EINVAL;
2119
                     /* Only bits within SOF_TIMESTAMPING_TX_RECORD_MASK are accepted. */
2120                 tsflags = *(u32 *)CMSG_DATA(cmsg);
2121                 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2122                         return -EINVAL;
2123
                     /* Replace the TX-record bits in the cookie; other bits are kept. */
2124                 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2125                 sockc->tsflags |= tsflags;
2126                 break;
2127         case SCM_TXTIME:
                     /* Rejected unless the socket has SOCK_TXTIME set. */
2128                 if (!sock_flag(sk, SOCK_TXTIME))
2129                         return -EINVAL;
2130                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2131                         return -EINVAL;
2132                 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2133                 break;
2134         /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2135         case SCM_RIGHTS:
2136         case SCM_CREDENTIALS:
2137                 break;
2138         default:
2139                 return -EINVAL;
2140         }
2141         return 0;
2142 }
2143 EXPORT_SYMBOL(__sock_cmsg_send);
2144
2145 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2146                    struct sockcm_cookie *sockc)
2147 {
             /* Walk every control message in @msg and hand the SOL_SOCKET ones to
              * __sock_cmsg_send(). Returns -EINVAL for a header that fails
              * CMSG_OK(), the first non-zero result from __sock_cmsg_send(), or 0.
              */
2148         struct cmsghdr *cmsg;
2149         int ret;
2150
2151         for_each_cmsghdr(cmsg, msg) {
2152                 if (!CMSG_OK(msg, cmsg))
2153                         return -EINVAL;
                     /* Control messages for other levels are skipped, not rejected. */
2154                 if (cmsg->cmsg_level != SOL_SOCKET)
2155                         continue;
2156                 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2157                 if (ret)
2158                         return ret;
2159         }
2160         return 0;
2161 }
2162 EXPORT_SYMBOL(sock_cmsg_send);
2163
2164 static void sk_enter_memory_pressure(struct sock *sk)
2165 {
             /* Invoke the protocol's enter_memory_pressure hook, if it has one. */
2166         if (!sk->sk_prot->enter_memory_pressure)
2167                 return;
2168
2169         sk->sk_prot->enter_memory_pressure(sk);
2170 }
2171
2172 static void sk_leave_memory_pressure(struct sock *sk)
2173 {
             /* Invoke the protocol's leave_memory_pressure hook if it has one;
              * otherwise clear the protocol's memory_pressure word directly.
              */
2174         if (sk->sk_prot->leave_memory_pressure) {
2175                 sk->sk_prot->leave_memory_pressure(sk);
2176         } else {
2177                 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2178
                     /* Write only when the pointer exists and the word is currently non-zero. */
2179                 if (memory_pressure && *memory_pressure)
2180                         *memory_pressure = 0;
2181         }
2182 }
2183
2184 /* On 32bit arches, an skb frag is limited to 2^15 */
     /* Page order for a 32768-byte (2^15) allocation; skb_page_frag_refill()
      * tests this for zero before trying a high-order allocation.
      */
2185 #define SKB_FRAG_PAGE_ORDER     get_order(32768)
2186
2187 /**
2188  * skb_page_frag_refill - check that a page_frag contains enough room
2189  * @sz: minimum size of the fragment we want to get
2190  * @pfrag: pointer to page_frag
2191  * @gfp: priority for memory allocation
2192  *
2193  * Note: While this allocator tries to use high order pages, there is
2194  * no guarantee that allocations succeed. Therefore, @sz MUST be
2195  * less than or equal to PAGE_SIZE.
      *
      * Return: true if @pfrag now has a page with at least @sz bytes available
      * at pfrag->offset, false if no page could be allocated.
2196  */
2197 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2198 {
2199         if (pfrag->page) {
                     /* We hold the only reference: reuse the page from the start. */
2200                 if (page_ref_count(pfrag->page) == 1) {
2201                         pfrag->offset = 0;
2202                         return true;
2203                 }
                     /* Page is shared but still has @sz bytes left past the offset. */
2204                 if (pfrag->offset + sz <= pfrag->size)
2205                         return true;
                     /* Not enough room: drop our reference and allocate a new page below. */
2206                 put_page(pfrag->page);
2207         }
2208
2209         pfrag->offset = 0;
             /* Try a high-order compound page first when the order is non-zero. */
2210         if (SKB_FRAG_PAGE_ORDER) {
2211                 /* Avoid direct reclaim but allow kswapd to wake */
2212                 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2213                                           __GFP_COMP | __GFP_NOWARN |
2214                                           __GFP_NORETRY,
2215                                           SKB_FRAG_PAGE_ORDER);
2216                 if (likely(pfrag->page)) {
2217                         pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2218                         return true;
2219                 }
2220         }
             /* Fall back to a single page with the caller's unmodified @gfp. */
2221         pfrag->page = alloc_page(gfp);
2222         if (likely(pfrag->page)) {
2223                 pfrag->size = PAGE_SIZE;
2224                 return true;
2225         }
2226         return false;
2227 }
2228 EXPORT_SYMBOL(skb_page_frag_refill);
2229
2230 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2231 {
             /* Refill @pfrag for at least 32 bytes using sk->sk_allocation. On
              * failure, enter memory pressure, moderate the send buffer and
              * return false.
              */
2232         if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2233                 return true;
2234
2235         sk_enter_memory_pressure(sk);
2236         sk_stream_moderate_sndbuf(sk);
2237         return false;
2238 }
2239 EXPORT_SYMBOL(sk_page_frag_refill);
2240
2241 int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
2242                 int sg_start, int *sg_curr_index, unsigned int *sg_curr_size,
2243                 int first_coalesce)
2244 {
             /* Extend the scatterlist @sg with memory from the socket's page frag
              * until it covers @len bytes; *sg_curr_size bytes are already covered
              * on entry. Each chunk is charged to @sk. *sg_curr_index and
              * *sg_curr_size are written back on every exit path.
              *
              * Returns 0 on success, -ENOMEM if the page frag cannot be refilled or
              * the charge is refused, and -ENOSPC if the write index wraps around
              * to @sg_start.
              */
2245         int sg_curr = *sg_curr_index, use = 0, rc = 0;
2246         unsigned int size = *sg_curr_size;
2247         struct page_frag *pfrag;
2248         struct scatterlist *sge;
2249
             /* From here on @len counts only the bytes still missing. */
2250         len -= size;
2251         pfrag = sk_page_frag(sk);
2252
2253         while (len > 0) {
2254                 unsigned int orig_offset;
2255
2256                 if (!sk_page_frag_refill(sk, pfrag)) {
2257                         rc = -ENOMEM;
2258                         goto out;
2259                 }
2260
                     /* Take as much of the current page frag as is still needed. */
2261                 use = min_t(int, len, pfrag->size - pfrag->offset);
2262
2263                 if (!sk_wmem_schedule(sk, use)) {
2264                         rc = -ENOMEM;
2265                         goto out;
2266                 }
2267
2268                 sk_mem_charge(sk, use);
2269                 size += use;
2270                 orig_offset = pfrag->offset;
2271                 pfrag->offset += use;
2272
                     /* Grow the previous entry when it ends exactly where this chunk
                      * starts on the same page and the index is past @first_coalesce;
                      * otherwise fill a new entry and take a page reference for it.
                      */
2273                 sge = sg + sg_curr - 1;
2274                 if (sg_curr > first_coalesce && sg_page(sge) == pfrag->page &&
2275                     sge->offset + sge->length == orig_offset) {
2276                         sge->length += use;
2277                 } else {
2278                         sge = sg + sg_curr;
2279                         sg_unmark_end(sge);
2280                         sg_set_page(sge, pfrag->page, use, orig_offset);
2281                         get_page(pfrag->page);
2282                         sg_curr++;
2283
                             /* The index wraps at MAX_SKB_FRAGS (ring-style use of @sg). */
2284                         if (sg_curr == MAX_SKB_FRAGS)
2285                                 sg_curr = 0;
2286
                             /* Caught up with @sg_start: no free entries remain. */
2287                         if (sg_curr == sg_start) {
2288                                 rc = -ENOSPC;
2289                                 break;
2290                         }
2291                 }
2292
2293                 len -= use;
2294         }
2295 out:
2296         *sg_curr_size = size;
2297         *sg_curr_index = sg_curr;
2298         return rc;
2299 }
2300 EXPORT_SYMBOL(sk_alloc_sg);
2301
2302 static void __lock_sock(struct sock *sk)
2303         __releases(&sk->sk_lock.slock)
2304         __acquires(&sk->sk_lock.slock)
2305 {
             /* Sleep uninterruptibly as an exclusive waiter on sk_lock.wq until
              * sock_owned_by_user() is false. sk_lock.slock is released around each
              * schedule() and held again on return, per the annotations above.
              */
2306         DEFINE_WAIT(wait);
2307
2308         for (;;) {
2309                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2310                                         TASK_UNINTERRUPTIBLE);
2311                 spin_unlock_bh(&sk->sk_lock.slock);
2312                 schedule();
2313                 spin_lock_bh(&sk->sk_lock.slock);
                     /* Ownership is re-checked only with slock held again. */
2314                 if (!sock_owned_by_user(sk))
2315                         break;
2316         }
2317         finish_wait(&sk->sk_lock.wq, &wait);
2318 }
2319
2320 static void __release_sock(struct sock *sk)
2321         __releases(&sk->sk_lock.slock)
2322         __acquires(&sk->sk_lock.slock)
2323 {
             /* Drain the socket backlog. Each pass detaches the whole list, drops
              * sk_lock.slock, feeds every skb to sk_backlog_rcv(), then retakes the
              * lock and checks whether more skbs were queued in the meantime.
              */
2324         struct sk_buff *skb, *next;
2325
2326         while ((skb = sk->sk_backlog.head) != NULL) {
2327                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2328
2329                 spin_unlock_bh(&sk->sk_lock.slock);
2330
2331                 do {
                             /* Unlink the skb from the detached list before handing it over. */
2332                         next = skb->next;
2333                         prefetch(next);
2334                         WARN_ON_ONCE(skb_dst_is_noref(skb));
2335                         skb->next = NULL;
2336                         sk_backlog_rcv(sk, skb);
2337
2338                         cond_resched();
2339
2340                         skb = next;
2341                 } while (skb != NULL);
2342
2343                 spin_lock_bh(&sk->sk_lock.slock);
2344         }
2345
2346         /*
2347          * Doing the zeroing here guarantees we can not loop forever
2348          * while a wild producer attempts to flood us.
2349          */
2350         sk->sk_backlog.len = 0;
2351 }
2352
2353 void __sk_flush_backlog(struct sock *sk)
2354 {
             /* Take sk_lock.slock, drain the backlog via __release_sock(), release it. */
2355         spin_lock_bh(&sk->sk_lock.slock);
2356         __release_sock(sk);
2357         spin_unlock_bh(&sk->sk_lock.slock);
2358 }
2359
2360 /**
2361  * sk_wait_data - wait for data to arrive at sk_receive_queue
2362  * @sk:    sock to wait on
2363  * @timeo: for how long
2364  * @skb:   last skb seen on sk_receive_queue
2365  *
2366  * Now socket state including sk->sk_err is changed only under lock,
2367  * hence we may omit checks after joining wait queue.
2368  * We check receive queue before schedule() only as optimization;
2369  * it is very likely that release_sock() added new data.
      *
      * Return: the value produced by sk_wait_event() for the condition
      * "the tail of sk_receive_queue is no longer @skb".
2370  */
2371 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2372 {
2373         DEFINE_WAIT_FUNC(wait, woken_wake_function);
2374         int rc;
2375
2376         add_wait_queue(sk_sleep(sk), &wait);
             /* SOCKWQ_ASYNC_WAITDATA is set only for the duration of the wait. */
2377         sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2378         rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2379         sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2380         remove_wait_queue(sk_sleep(sk), &wait);
2381         return rc;
2382 }
2383 EXPORT_SYMBOL(sk_wait_data);
2384
2385 /**
2386  *      __sk_mem_raise_allocated - increase memory_allocated
2387  *      @sk: socket
2388  *      @size: memory size to allocate
2389  *      @amt: pages to allocate
2390  *      @kind: allocation type
2391  *
2392  *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
      *
      *      Returns 1 if the allocation is accepted and 0 if it is suppressed.
      *      When 0 is returned, the @amt pages added to memory_allocated are
      *      subtracted again and, if the socket has a memcg, @amt is
      *      uncharged from it.
2393  */
2394 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2395 {
2396         struct proto *prot = sk->sk_prot;
             /* The protocol counter is raised up front and undone on suppression. */
2397         long allocated = sk_memory_allocated_add(sk, amt);
2398         bool charged = true;
2399
             /* A failed memcg charge goes straight to the suppression path. */
2400         if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2401             !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
2402                 goto suppress_allocation;
2403
2404         /* Under limit. */
2405         if (allocated <= sk_prot_mem_limits(sk, 0)) {
2406                 sk_leave_memory_pressure(sk);
2407                 return 1;
2408         }
2409
2410         /* Under pressure. */
2411         if (allocated > sk_prot_mem_limits(sk, 1))
2412                 sk_enter_memory_pressure(sk);
2413
2414         /* Over hard limit. */
2415         if (allocated > sk_prot_mem_limits(sk, 2))
2416                 goto suppress_allocation;
2417
2418         /* guarantee minimum buffer size under pressure */
2419         if (kind == SK_MEM_RECV) {
2420                 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2421                         return 1;
2422
2423         } else { /* SK_MEM_SEND */
2424                 int wmem0 = sk_get_wmem0(sk, prot);
2425
                     /* Stream sockets are measured by sk_wmem_queued, others by sk_wmem_alloc. */
2426                 if (sk->sk_type == SOCK_STREAM) {
2427                         if (sk->sk_wmem_queued < wmem0)
2428                                 return 1;
2429                 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2430                                 return 1;
2431                 }
2432         }
2433
2434         if (sk_has_memory_pressure(sk)) {
2435                 int alloc;
2436
2437                 if (!sk_under_memory_pressure(sk))
2438                         return 1;
                     /* Accept while (sockets allocated) x (pages this socket uses)
                      * stays below the hard limit.
                      */
2439                 alloc = sk_sockets_allocated_read_positive(sk);
2440                 if (sk_prot_mem_limits(sk, 2) > alloc *
2441                     sk_mem_pages(sk->sk_wmem_queued +
2442                                  atomic_read(&sk->sk_rmem_alloc) +
2443                                  sk->sk_forward_alloc))
2444                         return 1;
2445         }
2446
2447 suppress_allocation:
2448
2449         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2450                 sk_stream_moderate_sndbuf(sk);
2451
2452                 /* Fail only if socket is _under_ its sndbuf.
2453                  * In this case we cannot block, so that we have to fail.
2454                  */
2455                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2456                         return 1;
2457         }
2458
             /* Receive-side suppressions are traced only when the memcg charge succeeded. */
2459         if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2460                 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2461
             /* Undo the accounting performed at the top of the function. */
2462         sk_memory_allocated_sub(sk, amt);
2463
2464         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2465                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2466
2467         return 0;
2468 }
2469 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2470
2471 /**
2472  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2473  *      @sk: socket
2474  *      @size: memory size to allocate
2475  *      @kind: allocation type
2476  *
2477  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2478  *      rmem allocation. This function assumes that protocols which have
2479  *      memory_pressure use sk_wmem_queued as write buffer accounting.
      *
      *      Returns the result of __sk_mem_raise_allocated(). When that is 0,
      *      the increase of sk_forward_alloc is rolled back.
2480  */
2481 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2482 {
2483         int ret, amt = sk_mem_pages(size);
2484
             /* Credit the whole pages up front; undone below if the raise is refused. */
2485         sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2486         ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2487         if (!ret)
2488                 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2489         return ret;
2490 }
2491 EXPORT_SYMBOL(__sk_mem_schedule);
2492
2493 /**
2494  *      __sk_mem_reduce_allocated - reclaim memory_allocated
2495  *      @sk: socket
2496  *      @amount: number of quanta
2497  *
2498  *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2499  */
2500 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2501 {
2502         sk_memory_allocated_sub(sk, amount);
2503
2504         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2505                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2506
             /* Leave memory pressure once the total is back below the lowest limit. */
2507         if (sk_under_memory_pressure(sk) &&
2508             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2509                 sk_leave_memory_pressure(sk);
2510 }
2511 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2512
2513 /**
2514  *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2515  *      @sk: socket
2516  *      @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2517  */
2518 void __sk_mem_reclaim(struct sock *sk, int amount)
2519 {
             /* Convert bytes to whole quanta; the remainder is discarded by the shift. */
2520         amount >>= SK_MEM_QUANTUM_SHIFT;
2521         sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2522         __sk_mem_reduce_allocated(sk, amount);
2523 }
2524 EXPORT_SYMBOL(__sk_mem_reclaim);
2525
2526 int sk_set_peek_off(struct sock *sk, int val)
2527 {
             /* Store @val in sk->sk_peek_off without validation; always returns 0. */
2528         sk->sk_peek_off = val;
2529         return 0;
2530 }
2531 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2532
2533 /*
2534  * Set of default routines for initialising struct proto_ops when
2535  * the protocol does not support a particular function. In certain
2536  * cases where it makes no sense for a protocol to have a "do nothing"
2537  * function, some default processing is provided.
2538  */
2539
2540 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2541 {
             /* Default bind stub: ignores its arguments and returns -EOPNOTSUPP. */
2542         return -EOPNOTSUPP;
2543 }
2544 EXPORT_SYMBOL(sock_no_bind);
2545
/* Default ->connect() for protocols without connect support: always -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
                    int len, int flags)
{
        return -EOPNOTSUPP;
}
2551 EXPORT_SYMBOL(sock_no_connect);
2552
/* Default ->socketpair() for protocols that cannot pair sockets: always -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
        return -EOPNOTSUPP;
}
2557 EXPORT_SYMBOL(sock_no_socketpair);
2558
/* Default ->accept() for protocols without accept support: always -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
                   bool kern)
{
        return -EOPNOTSUPP;
}
2564 EXPORT_SYMBOL(sock_no_accept);
2565
/* Default ->getname() for protocols without address lookup: always -EOPNOTSUPP. */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
                    int peer)
{
        return -EOPNOTSUPP;
}
2571 EXPORT_SYMBOL(sock_no_getname);
2572
/* Default ->ioctl() for protocols without ioctl support: always -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        return -EOPNOTSUPP;
}
2577 EXPORT_SYMBOL(sock_no_ioctl);
2578
/* Default ->listen() for protocols that cannot listen: always -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
        return -EOPNOTSUPP;
}
2583 EXPORT_SYMBOL(sock_no_listen);
2584
/* Default ->shutdown() for protocols without shutdown support: always -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
        return -EOPNOTSUPP;
}
2589 EXPORT_SYMBOL(sock_no_shutdown);
2590
2591 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2592                     char __user *optval, unsigned int optlen)
2593 {
2594         return -EOPNOTSUPP;
2595 }
2596 EXPORT_SYMBOL(sock_no_setsockopt);
2597
2598 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2599                     char __user *optval, int __user *optlen)
2600 {
2601         return -EOPNOTSUPP;
2602 }
2603 EXPORT_SYMBOL(sock_no_getsockopt);
2604
/* Default ->sendmsg() for protocols that cannot send: always -EOPNOTSUPP. */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
        return -EOPNOTSUPP;
}
2609 EXPORT_SYMBOL(sock_no_sendmsg);
2610
/* Default ->sendmsg_locked() taking a struct sock: always -EOPNOTSUPP. */
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
        return -EOPNOTSUPP;
}
2615 EXPORT_SYMBOL(sock_no_sendmsg_locked);
2616
/* Default ->recvmsg() for protocols that cannot receive: always -EOPNOTSUPP. */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
                    int flags)
{
        return -EOPNOTSUPP;
}
2622 EXPORT_SYMBOL(sock_no_recvmsg);
2623
/*
 * Default ->mmap() for sockets that cannot be mapped.  Returns -ENODEV,
 * the same error code reported when a file has no mmap method at all.
 */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
        return -ENODEV;
}
2629 EXPORT_SYMBOL(sock_no_mmap);
2630
2631 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2632 {
2633         ssize_t res;
2634         struct msghdr msg = {.msg_flags = flags};
2635         struct kvec iov;
2636         char *kaddr = kmap(page);
2637         iov.iov_base = kaddr + offset;
2638         iov.iov_len = size;
2639         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2640         kunmap(page);
2641         return res;
2642 }
2643 EXPORT_SYMBOL(sock_no_sendpage);
2644
2645 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2646                                 int offset, size_t size, int flags)
2647 {
2648         ssize_t res;
2649         struct msghdr msg = {.msg_flags = flags};
2650         struct kvec iov;
2651         char *kaddr = kmap(page);
2652
2653         iov.iov_base = kaddr + offset;
2654         iov.iov_len = size;
2655         res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2656         kunmap(page);
2657         return res;
2658 }
2659 EXPORT_SYMBOL(sock_no_sendpage_locked);
2660
2661 /*
2662  *      Default Socket Callbacks
2663  */
2664
2665 static void sock_def_wakeup(struct sock *sk)
2666 {
2667         struct socket_wq *wq;
2668
2669         rcu_read_lock();
2670         wq = rcu_dereference(sk->sk_wq);
2671         if (skwq_has_sleeper(wq))
2672                 wake_up_interruptible_all(&wq->wait);
2673         rcu_read_unlock();
2674 }
2675
2676 static void sock_def_error_report(struct sock *sk)
2677 {
2678         struct socket_wq *wq;
2679
2680         rcu_read_lock();
2681         wq = rcu_dereference(sk->sk_wq);
2682         if (skwq_has_sleeper(wq))
2683                 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
2684         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2685         rcu_read_unlock();
2686 }
2687
2688 static void sock_def_readable(struct sock *sk)
2689 {
2690         struct socket_wq *wq;
2691
2692         rcu_read_lock();
2693         wq = rcu_dereference(sk->sk_wq);
2694         if (skwq_has_sleeper(wq))
2695                 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2696                                                 EPOLLRDNORM | EPOLLRDBAND);
2697         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2698         rcu_read_unlock();
2699 }
2700
2701 static void sock_def_write_space(struct sock *sk)
2702 {
2703         struct socket_wq *wq;
2704
2705         rcu_read_lock();
2706
2707         /* Do not wake up a writer until he can make "significant"
2708          * progress.  --DaveM
2709          */
2710         if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2711                 wq = rcu_dereference(sk->sk_wq);
2712                 if (skwq_has_sleeper(wq))
2713                         wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2714                                                 EPOLLWRNORM | EPOLLWRBAND);
2715
2716                 /* Should agree with poll, otherwise some programs break */
2717                 if (sock_writeable(sk))
2718                         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2719         }
2720
2721         rcu_read_unlock();
2722 }
2723
/* Default sk_destruct callback: intentionally does nothing. */
static void sock_def_destruct(struct sock *sk)
{
}
2727
2728 void sk_send_sigurg(struct sock *sk)
2729 {
2730         if (sk->sk_socket && sk->sk_socket->file)
2731                 if (send_sigurg(&sk->sk_socket->file->f_owner))
2732                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2733 }
2734 EXPORT_SYMBOL(sk_send_sigurg);
2735
/*
 * (Re)arm @timer to fire at @expires.  A reference on @sk is taken only
 * when mod_timer() returns 0.
 */
void sk_reset_timer(struct sock *sk, struct timer_list* timer,
                    unsigned long expires)
{
        if (mod_timer(timer, expires))
                return;

        sock_hold(sk);
}
2742 EXPORT_SYMBOL(sk_reset_timer);
2743
/*
 * Cancel @timer.  The socket reference is dropped with __sock_put() only
 * when del_timer() returns non-zero.
 */
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
        if (!del_timer(timer))
                return;

        __sock_put(sk);
}
2749 EXPORT_SYMBOL(sk_stop_timer);
2750
/*
 * Initialise the generic part of @sk and, when @sock is non-NULL, link the
 * two objects together.  Installs the default callbacks defined above and
 * finishes by setting sk_refcnt to 1.
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
        sk_init_common(sk);
        sk->sk_send_head        =       NULL;

        timer_setup(&sk->sk_timer, NULL, 0);

        sk->sk_allocation       =       GFP_KERNEL;
        sk->sk_rcvbuf           =       sysctl_rmem_default;
        sk->sk_sndbuf           =       sysctl_wmem_default;
        sk->sk_state            =       TCP_CLOSE;
        sk_set_socket(sk, sock);

        sock_set_flag(sk, SOCK_ZAPPED);

        if (sock) {
                /* Inherit type, wait queue and owner uid from the socket. */
                sk->sk_type     =       sock->type;
                sk->sk_wq       =       sock->wq;
                sock->sk        =       sk;
                sk->sk_uid      =       SOCK_INODE(sock)->i_uid;
        } else {
                /* No struct socket: no wait queue, uid 0 of the netns user_ns. */
                sk->sk_wq       =       NULL;
                sk->sk_uid      =       make_kuid(sock_net(sk)->user_ns, 0);
        }

        /* Kernel sockets get their own lockdep class for sk_callback_lock. */
        rwlock_init(&sk->sk_callback_lock);
        if (sk->sk_kern_sock)
                lockdep_set_class_and_name(
                        &sk->sk_callback_lock,
                        af_kern_callback_keys + sk->sk_family,
                        af_family_kern_clock_key_strings[sk->sk_family]);
        else
                lockdep_set_class_and_name(
                        &sk->sk_callback_lock,
                        af_callback_keys + sk->sk_family,
                        af_family_clock_key_strings[sk->sk_family]);

        /* Default callbacks. */
        sk->sk_state_change     =       sock_def_wakeup;
        sk->sk_data_ready       =       sock_def_readable;
        sk->sk_write_space      =       sock_def_write_space;
        sk->sk_error_report     =       sock_def_error_report;
        sk->sk_destruct         =       sock_def_destruct;

        sk->sk_frag.page        =       NULL;
        sk->sk_frag.offset      =       0;
        sk->sk_peek_off         =       -1;

        sk->sk_peer_pid         =       NULL;
        sk->sk_peer_cred        =       NULL;
        sk->sk_write_pending    =       0;
        sk->sk_rcvlowat         =       1;
        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;

        sk->sk_stamp = SK_DEFAULT_STAMP;
        atomic_set(&sk->sk_zckey, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
        sk->sk_napi_id          =       0;
        sk->sk_ll_usec          =       sysctl_net_busy_read;
#endif

        sk->sk_max_pacing_rate = ~0U;
        sk->sk_pacing_rate = ~0U;
        sk->sk_pacing_shift = 10;
        sk->sk_incoming_cpu = -1;

        sk_rx_queue_clear(sk);
        /*
         * Before updating sk_refcnt, we must commit prior changes to memory
         * (Documentation/RCU/rculist_nulls.txt for details)
         */
        smp_wmb();
        refcount_set(&sk->sk_refcnt, 1);
        atomic_set(&sk->sk_drops, 0);
}
2827 EXPORT_SYMBOL(sock_init_data);
2828
/*
 * Take ownership of the socket lock, waiting in __lock_sock() while
 * another owner holds it.  @subclass is passed to mutex_acquire().
 * May sleep.
 */
void lock_sock_nested(struct sock *sk, int subclass)
{
        might_sleep();
        spin_lock_bh(&sk->sk_lock.slock);
        if (sk->sk_lock.owned)
                __lock_sock(sk);
        sk->sk_lock.owned = 1;
        /* Plain unlock: BH stays disabled until local_bh_enable() below. */
        spin_unlock(&sk->sk_lock.slock);
        /*
         * The sk_lock has mutex_lock() semantics here:
         */
        mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
        local_bh_enable();
}
2843 EXPORT_SYMBOL(lock_sock_nested);
2844
/*
 * Give up ownership of the socket lock.  Under sk_lock.slock this runs
 * __release_sock() when sk_backlog.tail is set, calls the protocol's
 * release_cb() hook if there is one, releases ownership and wakes any
 * waiter on sk_lock.wq.
 */
void release_sock(struct sock *sk)
{
        spin_lock_bh(&sk->sk_lock.slock);
        if (sk->sk_backlog.tail)
                __release_sock(sk);

        /* Warning : release_cb() might need to release sk ownership,
         * ie call sock_release_ownership(sk) before us.
         */
        if (sk->sk_prot->release_cb)
                sk->sk_prot->release_cb(sk);

        sock_release_ownership(sk);
        if (waitqueue_active(&sk->sk_lock.wq))
                wake_up(&sk->sk_lock.wq);
        spin_unlock_bh(&sk->sk_lock.slock);
}
2862 EXPORT_SYMBOL(release_sock);
2863
/**
 * lock_sock_fast - fast version of lock_sock
 * @sk: socket
 *
 * This version should be used for very small sections, where the process
 * won't block.
 *
 * Returns false if the fast path is taken:
 *
 *   sk_lock.slock locked, owned = 0, BH disabled
 *
 * Returns true if the slow path is taken:
 *
 *   sk_lock.slock unlocked, owned = 1, BH enabled
 */
bool lock_sock_fast(struct sock *sk)
{
        might_sleep();
        spin_lock_bh(&sk->sk_lock.slock);

        if (!sk->sk_lock.owned)
                /*
                 * Note : We must disable BH
                 */
                return false;

        /* Somebody owns the socket: fall back to the lock_sock() behaviour. */
        __lock_sock(sk);
        sk->sk_lock.owned = 1;
        spin_unlock(&sk->sk_lock.slock);
        /*
         * The sk_lock has mutex_lock() semantics here:
         */
        mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
        local_bh_enable();
        return true;
}
2898 EXPORT_SYMBOL(lock_sock_fast);
2899
2900 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2901 {
2902         struct timeval tv;
2903
2904         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2905         tv = ktime_to_timeval(sk->sk_stamp);
2906         if (tv.tv_sec == -1)
2907                 return -ENOENT;
2908         if (tv.tv_sec == 0) {
2909                 sk->sk_stamp = ktime_get_real();
2910                 tv = ktime_to_timeval(sk->sk_stamp);
2911         }
2912         return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2913 }
2914 EXPORT_SYMBOL(sock_get_timestamp);
2915
2916 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2917 {
2918         struct timespec ts;
2919
2920         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2921         ts = ktime_to_timespec(sk->sk_stamp);
2922         if (ts.tv_sec == -1)
2923                 return -ENOENT;
2924         if (ts.tv_sec == 0) {
2925                 sk->sk_stamp = ktime_get_real();
2926                 ts = ktime_to_timespec(sk->sk_stamp);
2927         }
2928         return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2929 }
2930 EXPORT_SYMBOL(sock_get_timestampns);
2931
2932 void sock_enable_timestamp(struct sock *sk, int flag)
2933 {
2934         if (!sock_flag(sk, flag)) {
2935                 unsigned long previous_flags = sk->sk_flags;
2936
2937                 sock_set_flag(sk, flag);
2938                 /*
2939                  * we just set one of the two flags which require net
2940                  * time stamping, but time stamping might have been on
2941                  * already because of the other one
2942                  */
2943                 if (sock_needs_netstamp(sk) &&
2944                     !(previous_flags & SK_FLAGS_TIMESTAMP))
2945                         net_enable_timestamp();
2946         }
2947 }
2948
2949 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2950                        int level, int type)
2951 {
2952         struct sock_exterr_skb *serr;
2953         struct sk_buff *skb;
2954         int copied, err;
2955
2956         err = -EAGAIN;
2957         skb = sock_dequeue_err_skb(sk);
2958         if (skb == NULL)
2959                 goto out;
2960
2961         copied = skb->len;
2962         if (copied > len) {
2963                 msg->msg_flags |= MSG_TRUNC;
2964                 copied = len;
2965         }
2966         err = skb_copy_datagram_msg(skb, 0, msg, copied);
2967         if (err)
2968                 goto out_free_skb;
2969
2970         sock_recv_timestamp(msg, sk, skb);
2971
2972         serr = SKB_EXT_ERR(skb);
2973         put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2974
2975         msg->msg_flags |= MSG_ERRQUEUE;
2976         err = copied;
2977
2978 out_free_skb:
2979         kfree_skb(skb);
2980 out:
2981         return err;
2982 }
2983 EXPORT_SYMBOL(sock_recv_errqueue);
2984
2985 /*
2986  *      Get a socket option on an socket.
2987  *
2988  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
2989  *      asynchronous errors should be reported by getsockopt. We assume
2990  *      this means if you specify SO_ERROR (otherwise whats the point of it).
2991  */
2992 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2993                            char __user *optval, int __user *optlen)
2994 {
2995         struct sock *sk = sock->sk;
2996
2997         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2998 }
2999 EXPORT_SYMBOL(sock_common_getsockopt);
3000
3001 #ifdef CONFIG_COMPAT
3002 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
3003                                   char __user *optval, int __user *optlen)
3004 {
3005         struct sock *sk = sock->sk;
3006
3007         if (sk->sk_prot->compat_getsockopt != NULL)
3008                 return sk->sk_prot->compat_getsockopt(sk, level, optname,
3009                                                       optval, optlen);
3010         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3011 }
3012 EXPORT_SYMBOL(compat_sock_common_getsockopt);
3013 #endif
3014
3015 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3016                         int flags)
3017 {
3018         struct sock *sk = sock->sk;
3019         int addr_len = 0;
3020         int err;
3021
3022         err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3023                                    flags & ~MSG_DONTWAIT, &addr_len);
3024         if (err >= 0)
3025                 msg->msg_namelen = addr_len;
3026         return err;
3027 }
3028 EXPORT_SYMBOL(sock_common_recvmsg);
3029
3030 /*
3031  *      Set socket options on an inet socket.
3032  */
3033 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3034                            char __user *optval, unsigned int optlen)
3035 {
3036         struct sock *sk = sock->sk;
3037
3038         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3039 }
3040 EXPORT_SYMBOL(sock_common_setsockopt);
3041
3042 #ifdef CONFIG_COMPAT
3043 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
3044                                   char __user *optval, unsigned int optlen)
3045 {
3046         struct sock *sk = sock->sk;
3047
3048         if (sk->sk_prot->compat_setsockopt != NULL)
3049                 return sk->sk_prot->compat_setsockopt(sk, level, optname,
3050                                                       optval, optlen);
3051         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3052 }
3053 EXPORT_SYMBOL(compat_sock_common_setsockopt);
3054 #endif
3055
3056 void sk_common_release(struct sock *sk)
3057 {
3058         if (sk->sk_prot->destroy)
3059                 sk->sk_prot->destroy(sk);
3060
3061         /*
3062          * Observation: when sock_common_release is called, processes have
3063          * no access to socket. But net still has.
3064          * Step one, detach it from networking:
3065          *
3066          * A. Remove from hash tables.
3067          */
3068
3069         sk->sk_prot->unhash(sk);
3070
3071         /*
3072          * In this point socket cannot receive new packets, but it is possible
3073          * that some packets are in flight because some CPU runs receiver and
3074          * did hash table lookup before we unhashed socket. They will achieve
3075          * receive queue and will be purged by socket destructor.
3076          *
3077          * Also we still have packets pending on receive queue and probably,
3078          * our own packets waiting in device queues. sock_destroy will drain
3079          * receive queue, but transmitted packets will delay socket destruction
3080          * until the last reference will be released.
3081          */
3082
3083         sock_orphan(sk);
3084
3085         xfrm_sk_free_policy(sk);
3086
3087         sk_refcnt_debug_release(sk);
3088
3089         sock_put(sk);
3090 }
3091 EXPORT_SYMBOL(sk_common_release);
3092
3093 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3094 {
3095         memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3096
3097         mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3098         mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
3099         mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3100         mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
3101         mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3102         mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
3103         mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3104         mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
3105         mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3106 }
3107
3108 #ifdef CONFIG_PROC_FS
#define PROTO_INUSE_NR  64      /* should be enough for the first time */
/* Per-cpu in-use counters, one slot per protocol index (proto->inuse_idx). */
struct prot_inuse {
        int val[PROTO_INUSE_NR];
};

/* Bitmap of protocol indexes handed out by assign_proto_idx(). */
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3115
3116 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3117 {
3118         __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3119 }
3120 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3121
3122 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3123 {
3124         int cpu, idx = prot->inuse_idx;
3125         int res = 0;
3126
3127         for_each_possible_cpu(cpu)
3128                 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3129
3130         return res >= 0 ? res : 0;
3131 }
3132 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3133
3134 static void sock_inuse_add(struct net *net, int val)
3135 {
3136         this_cpu_add(*net->core.sock_inuse, val);
3137 }
3138
3139 int sock_inuse_get(struct net *net)
3140 {
3141         int cpu, res = 0;
3142
3143         for_each_possible_cpu(cpu)
3144                 res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3145
3146         return res;
3147 }
3148
3149 EXPORT_SYMBOL_GPL(sock_inuse_get);
3150
3151 static int __net_init sock_inuse_init_net(struct net *net)
3152 {
3153         net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3154         if (net->core.prot_inuse == NULL)
3155                 return -ENOMEM;
3156
3157         net->core.sock_inuse = alloc_percpu(int);
3158         if (net->core.sock_inuse == NULL)
3159                 goto out;
3160
3161         return 0;
3162
3163 out:
3164         free_percpu(net->core.prot_inuse);
3165         return -ENOMEM;
3166 }
3167
3168 static void __net_exit sock_inuse_exit_net(struct net *net)
3169 {
3170         free_percpu(net->core.prot_inuse);
3171         free_percpu(net->core.sock_inuse);
3172 }
3173
/* Per-namespace hooks that allocate and free the in-use counters. */
static struct pernet_operations net_inuse_ops = {
        .init = sock_inuse_init_net,
        .exit = sock_inuse_exit_net,
};
3178
3179 static __init int net_inuse_init(void)
3180 {
3181         if (register_pernet_subsys(&net_inuse_ops))
3182                 panic("Cannot initialize net inuse counters");
3183
3184         return 0;
3185 }
3186
3187 core_initcall(net_inuse_init);
3188
3189 static void assign_proto_idx(struct proto *prot)
3190 {
3191         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3192
3193         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3194                 pr_err("PROTO_INUSE_NR exhausted\n");
3195                 return;
3196         }
3197
3198         set_bit(prot->inuse_idx, proto_inuse_idx);
3199 }
3200
3201 static void release_proto_idx(struct proto *prot)
3202 {
3203         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3204                 clear_bit(prot->inuse_idx, proto_inuse_idx);
3205 }
3206 #else
/* !CONFIG_PROC_FS: no in-use accounting, so there is no index to assign. */
static inline void assign_proto_idx(struct proto *prot)
{
}

/* !CONFIG_PROC_FS: nothing was assigned, nothing to release. */
static inline void release_proto_idx(struct proto *prot)
{
}

/* !CONFIG_PROC_FS: socket in-use counting is compiled out. */
static void sock_inuse_add(struct net *net, int val)
{
}
3218 #endif
3219
3220 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3221 {
3222         if (!rsk_prot)
3223                 return;
3224         kfree(rsk_prot->slab_name);
3225         rsk_prot->slab_name = NULL;
3226         kmem_cache_destroy(rsk_prot->slab);
3227         rsk_prot->slab = NULL;
3228 }
3229
3230 static int req_prot_init(const struct proto *prot)
3231 {
3232         struct request_sock_ops *rsk_prot = prot->rsk_prot;
3233
3234         if (!rsk_prot)
3235                 return 0;
3236
3237         rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3238                                         prot->name);
3239         if (!rsk_prot->slab_name)
3240                 return -ENOMEM;
3241
3242         rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3243                                            rsk_prot->obj_size, 0,
3244                                            SLAB_ACCOUNT | prot->slab_flags,
3245                                            NULL);
3246
3247         if (!rsk_prot->slab) {
3248                 pr_crit("%s: Can't create request sock SLAB cache!\n",
3249                         prot->name);
3250                 return -ENOMEM;
3251         }
3252         return 0;
3253 }
3254
3255 int proto_register(struct proto *prot, int alloc_slab)
3256 {
3257         if (alloc_slab) {
3258                 prot->slab = kmem_cache_create_usercopy(prot->name,
3259                                         prot->obj_size, 0,
3260                                         SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3261                                         prot->slab_flags,
3262                                         prot->useroffset, prot->usersize,
3263                                         NULL);
3264
3265                 if (prot->slab == NULL) {
3266                         pr_crit("%s: Can't create sock SLAB cache!\n",
3267                                 prot->name);
3268                         goto out;
3269                 }
3270
3271                 if (req_prot_init(prot))
3272                         goto out_free_request_sock_slab;
3273
3274                 if (prot->twsk_prot != NULL) {
3275                         prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3276
3277                         if (prot->twsk_prot->twsk_slab_name == NULL)
3278                                 goto out_free_request_sock_slab;
3279
3280                         prot->twsk_prot->twsk_slab =
3281                                 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3282                                                   prot->twsk_prot->twsk_obj_size,
3283                                                   0,
3284                                                   SLAB_ACCOUNT |
3285                                                   prot->slab_flags,
3286                                                   NULL);
3287                         if (prot->twsk_prot->twsk_slab == NULL)
3288                                 goto out_free_timewait_sock_slab_name;
3289                 }
3290         }
3291
3292         mutex_lock(&proto_list_mutex);
3293         list_add(&prot->node, &proto_list);
3294         assign_proto_idx(prot);
3295         mutex_unlock(&proto_list_mutex);
3296         return 0;
3297
3298 out_free_timewait_sock_slab_name:
3299         kfree(prot->twsk_prot->twsk_slab_name);
3300 out_free_request_sock_slab:
3301         req_prot_cleanup(prot->rsk_prot);
3302
3303         kmem_cache_destroy(prot->slab);
3304         prot->slab = NULL;
3305 out:
3306         return -ENOBUFS;
3307 }
3308 EXPORT_SYMBOL(proto_register);
3309
3310 void proto_unregister(struct proto *prot)
3311 {
3312         mutex_lock(&proto_list_mutex);
3313         release_proto_idx(prot);
3314         list_del(&prot->node);
3315         mutex_unlock(&proto_list_mutex);
3316
3317         kmem_cache_destroy(prot->slab);
3318         prot->slab = NULL;
3319
3320         req_prot_cleanup(prot->rsk_prot);
3321
3322         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
3323                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
3324                 kfree(prot->twsk_prot->twsk_slab_name);
3325                 prot->twsk_prot->twsk_slab = NULL;
3326         }
3327 }
3328 EXPORT_SYMBOL(proto_unregister);
3329
3330 int sock_load_diag_module(int family, int protocol)
3331 {
3332         if (!protocol) {
3333                 if (!sock_is_registered(family))
3334                         return -ENOENT;
3335
3336                 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3337                                       NETLINK_SOCK_DIAG, family);
3338         }
3339
3340 #ifdef CONFIG_INET
3341         if (family == AF_INET &&
3342             !rcu_access_pointer(inet_protos[protocol]))
3343                 return -ENOENT;
3344 #endif
3345
3346         return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3347                               NETLINK_SOCK_DIAG, family, protocol);
3348 }
3349 EXPORT_SYMBOL(sock_load_diag_module);
3350
3351 #ifdef CONFIG_PROC_FS
3352 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3353         __acquires(proto_list_mutex)
3354 {
3355         mutex_lock(&proto_list_mutex);
3356         return seq_list_start_head(&proto_list, *pos);
3357 }
3358
3359 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3360 {
3361         return seq_list_next(v, &proto_list, pos);
3362 }
3363
3364 static void proto_seq_stop(struct seq_file *seq, void *v)
3365         __releases(proto_list_mutex)
3366 {
3367         mutex_unlock(&proto_list_mutex);
3368 }
3369
/* Map a method pointer to 'y' (set) or 'n' (NULL) for /proc output. */
static char proto_method_implemented(const void *method)
{
        return method ? 'y' : 'n';
}
3374 static long sock_prot_memory_allocated(struct proto *proto)
3375 {
3376         return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3377 }
3378
3379 static char *sock_prot_memory_pressure(struct proto *proto)
3380 {
3381         return proto->memory_pressure != NULL ?
3382         proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3383 }
3384
3385 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3386 {
3387
3388         seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3389                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3390                    proto->name,
3391                    proto->obj_size,
3392                    sock_prot_inuse_get(seq_file_net(seq), proto),
3393                    sock_prot_memory_allocated(proto),
3394                    sock_prot_memory_pressure(proto),
3395                    proto->max_header,
3396                    proto->slab == NULL ? "no" : "yes",
3397                    module_name(proto->owner),
3398                    proto_method_implemented(proto->close),
3399                    proto_method_implemented(proto->connect),
3400                    proto_method_implemented(proto->disconnect),
3401                    proto_method_implemented(proto->accept),
3402                    proto_method_implemented(proto->ioctl),
3403                    proto_method_implemented(proto->init),
3404                    proto_method_implemented(proto->destroy),
3405                    proto_method_implemented(proto->shutdown),
3406                    proto_method_implemented(proto->setsockopt),
3407                    proto_method_implemented(proto->getsockopt),
3408                    proto_method_implemented(proto->sendmsg),
3409                    proto_method_implemented(proto->recvmsg),
3410                    proto_method_implemented(proto->sendpage),
3411                    proto_method_implemented(proto->bind),
3412                    proto_method_implemented(proto->backlog_rcv),
3413                    proto_method_implemented(proto->hash),
3414                    proto_method_implemented(proto->unhash),
3415                    proto_method_implemented(proto->get_port),
3416                    proto_method_implemented(proto->enter_memory_pressure));
3417 }
3418
3419 static int proto_seq_show(struct seq_file *seq, void *v)
3420 {
3421         if (v == &proto_list)
3422                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3423                            "protocol",
3424                            "size",
3425                            "sockets",
3426                            "memory",
3427                            "press",
3428                            "maxhdr",
3429                            "slab",
3430                            "module",
3431                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3432         else
3433                 proto_seq_printf(seq, list_entry(v, struct proto, node));
3434         return 0;
3435 }
3436
/* seq_file iterator walking proto_list under proto_list_mutex. */
static const struct seq_operations proto_seq_ops = {
        .start  = proto_seq_start,
        .next   = proto_seq_next,
        .stop   = proto_seq_stop,
        .show   = proto_seq_show,
};
3443
3444 static __net_init int proto_init_net(struct net *net)
3445 {
3446         if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3447                         sizeof(struct seq_net_private)))
3448                 return -ENOMEM;
3449
3450         return 0;
3451 }
3452
3453 static __net_exit void proto_exit_net(struct net *net)
3454 {
3455         remove_proc_entry("protocols", net->proc_net);
3456 }
3457
3458
/* Per-namespace hooks that create and remove the "protocols" proc entry. */
static __net_initdata struct pernet_operations proto_net_ops = {
        .init = proto_init_net,
        .exit = proto_exit_net,
};
3463
3464 static int __init proto_init(void)
3465 {
3466         return register_pernet_subsys(&proto_net_ops);
3467 }
3468
3469 subsys_initcall(proto_init);
3470
3471 #endif /* PROC_FS */
3472
3473 #ifdef CONFIG_NET_RX_BUSY_POLL
3474 bool sk_busy_loop_end(void *p, unsigned long start_time)
3475 {
3476         struct sock *sk = p;
3477
3478         return !skb_queue_empty(&sk->sk_receive_queue) ||
3479                sk_busy_loop_timeout(sk, start_time);
3480 }
3481 EXPORT_SYMBOL(sk_busy_loop_end);
3482 #endif /* CONFIG_NET_RX_BUSY_POLL */