net/core/sock.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Generic socket support routines. Memory allocators, socket lock/release
   7  *              handler for protocols to use and generic option handler.
   8  *
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Florian La Roche, <flla@stud.uni-sb.de>
  13  *              Alan Cox, <A.Cox@swansea.ac.uk>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  *
  85  *
  86  *              This program is free software; you can redistribute it and/or
  87  *              modify it under the terms of the GNU General Public License
  88  *              as published by the Free Software Foundation; either version
  89  *              2 of the License, or (at your option) any later version.
  90  */
  91
  92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  93
  94 #include <linux/capability.h>
  95 #include <linux/errno.h>
  96 #include <linux/types.h>
  97 #include <linux/socket.h>
  98 #include <linux/in.h>
  99 #include <linux/kernel.h>
 100 #include <linux/module.h>
 101 #include <linux/proc_fs.h>
 102 #include <linux/seq_file.h>
 103 #include <linux/sched.h>
 104 #include <linux/timer.h>
 105 #include <linux/string.h>
 106 #include <linux/sockios.h>
 107 #include <linux/net.h>
 108 #include <linux/mm.h>
 109 #include <linux/slab.h>
 110 #include <linux/interrupt.h>
 111 #include <linux/poll.h>
 112 #include <linux/tcp.h>
 113 #include <linux/init.h>
 114 #include <linux/highmem.h>
 115 #include <linux/user_namespace.h>
 116 #include <linux/static_key.h>
 117 #include <linux/memcontrol.h>
 118 #include <linux/prefetch.h>
 119
 120 #include <asm/uaccess.h>
 121
 122 #include <linux/netdevice.h>
 123 #include <net/protocol.h>
 124 #include <linux/skbuff.h>
 125 #include <net/net_namespace.h>
 126 #include <net/request_sock.h>
 127 #include <net/sock.h>
 128 #include <linux/net_tstamp.h>
 129 #include <net/xfrm.h>
 130 #include <linux/ipsec.h>
 131 #include <net/cls_cgroup.h>
 132 #include <net/netprio_cgroup.h>
 133
 134 #include <linux/filter.h>
 135
 136 #include <trace/events/sock.h>
 137
 138 #ifdef CONFIG_INET
 139 #include <net/tcp.h>
 140 #endif
 141
 142 static DEFINE_MUTEX(proto_list_mutex);
 143 static LIST_HEAD(proto_list);
 144
 145 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
 146 int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 147 {
 148         struct proto *proto;
 149         int ret = 0;
 150
 151         mutex_lock(&proto_list_mutex);
 152         list_for_each_entry(proto, &proto_list, node) {
 153                 if (proto->init_cgroup) {
 154                         ret = proto->init_cgroup(memcg, ss);
 155                         if (ret)
 156                                 goto out;
 157                 }
 158         }
 159
 160         mutex_unlock(&proto_list_mutex);
 161         return ret;
 162 out:
 163         list_for_each_entry_continue_reverse(proto, &proto_list, node)
 164                 if (proto->destroy_cgroup)
 165                         proto->destroy_cgroup(memcg);
 166         mutex_unlock(&proto_list_mutex);
 167         return ret;
 168 }
 169
 170 void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
 171 {
 172         struct proto *proto;
 173
 174         mutex_lock(&proto_list_mutex);
 175         list_for_each_entry_reverse(proto, &proto_list, node)
 176                 if (proto->destroy_cgroup)
 177                         proto->destroy_cgroup(memcg);
 178         mutex_unlock(&proto_list_mutex);
 179 }
 180 #endif
 181
 182 /*
 183  * Each address family might have different locking rules, so we have
 184  * one slock key per address family:
 185  */
 186 static struct lock_class_key af_family_keys[AF_MAX];
 187 static struct lock_class_key af_family_slock_keys[AF_MAX];
 188
 189 struct static_key memcg_socket_limit_enabled;
 190 EXPORT_SYMBOL(memcg_socket_limit_enabled);
 191
 192 /*
 193  * Make lock validator output more readable. (we pre-construct these
 194  * strings build-time, so that runtime initialization of socket
 195  * locks is fast):
 196  */
 197 static const char *const af_family_key_strings[AF_MAX+1] = {
 198   "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
 199   "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
 200   "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
 201   "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
 202   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
 203   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
 204   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
 205   "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
 206   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
 207   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
 208   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
 209   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
 210   "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
 211   "sk_lock-AF_NFC"   , "sk_lock-AF_MAX"
 212 };
 213 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 214   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
 215   "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
 216   "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
 217   "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
 218   "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
 219   "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
 220   "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
 221   "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
 222   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
 223   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
 224   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
 225   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
 226   "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
 227   "slock-AF_NFC"   , "slock-AF_MAX"
 228 };
 229 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 230   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
 231   "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
 232   "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
 233   "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
 234   "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
 235   "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
 236   "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
 237   "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
 238   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
 239   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
 240   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
 241   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
 242   "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
 243   "clock-AF_NFC"   , "clock-AF_MAX"
 244 };
 245
 246 /*
 247  * sk_callback_lock locking rules are per-address-family,
 248  * so split the lock classes by using a per-AF key:
 249  */
 250 static struct lock_class_key af_callback_keys[AF_MAX];
 251
 252 /* Take into consideration the size of the struct sk_buff overhead in the
 253  * determination of these values, since that is non-constant across
 254  * platforms.  This makes socket queueing behavior and performance
 255  * not depend upon such differences.
 256  */
 257 #define _SK_MEM_PACKETS         256
 258 #define _SK_MEM_OVERHEAD        SKB_TRUESIZE(256)
 259 #define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 260 #define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 261
 262 /* Run time adjustable parameters. */
 263 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 264 EXPORT_SYMBOL(sysctl_wmem_max);
 265 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 266 EXPORT_SYMBOL(sysctl_rmem_max);
 267 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 268 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 269
 270 /* Maximal space eaten by iovec or ancillary data plus some space */
 271 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 272 EXPORT_SYMBOL(sysctl_optmem_max);
 273
 274 #if defined(CONFIG_CGROUPS)
 275 #if !defined(CONFIG_NET_CLS_CGROUP)
 276 int net_cls_subsys_id = -1;
 277 EXPORT_SYMBOL_GPL(net_cls_subsys_id);
 278 #endif
 279 #if !defined(CONFIG_NETPRIO_CGROUP)
 280 int net_prio_subsys_id = -1;
 281 EXPORT_SYMBOL_GPL(net_prio_subsys_id);
 282 #endif
 283 #endif
 284
 285 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 286 {
 287         struct timeval tv;
 288
 289         if (optlen < sizeof(tv))
 290                 return -EINVAL;
 291         if (copy_from_user(&tv, optval, sizeof(tv)))
 292                 return -EFAULT;
 293         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 294                 return -EDOM;
 295
 296         if (tv.tv_sec < 0) {
 297                 static int warned __read_mostly;
 298
 299                 *timeo_p = 0;
 300                 if (warned < 10 && net_ratelimit()) {
 301                         warned++;
 302                         pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 303                                 __func__, current->comm, task_pid_nr(current));
 304                 }
 305                 return 0;
 306         }
 307         *timeo_p = MAX_SCHEDULE_TIMEOUT;
 308         if (tv.tv_sec == 0 && tv.tv_usec == 0)
 309                 return 0;
 310         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 311                 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
 312         return 0;
 313 }
 314
 315 static void sock_warn_obsolete_bsdism(const char *name)
 316 {
 317         static int warned;
 318         static char warncomm[TASK_COMM_LEN];
 319         if (strcmp(warncomm, current->comm) && warned < 5) {
 320                 strcpy(warncomm,  current->comm);
 321                 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 322                         warncomm, name);
 323                 warned++;
 324         }
 325 }
 326
 327 #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
 328
 329 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 330 {
 331         if (sk->sk_flags & flags) {
 332                 sk->sk_flags &= ~flags;
 333                 if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 334                         net_disable_timestamp();
 335         }
 336 }
 337
 338
 339 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 340 {
 341         int err;
 342         int skb_len;
 343         unsigned long flags;
 344         struct sk_buff_head *list = &sk->sk_receive_queue;
 345
 346         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 347                 atomic_inc(&sk->sk_drops);
 348                 trace_sock_rcvqueue_full(sk, skb);
 349                 return -ENOMEM;
 350         }
 351
 352         err = sk_filter(sk, skb);
 353         if (err)
 354                 return err;
 355
 356         if (!sk_rmem_schedule(sk, skb->truesize)) {
 357                 atomic_inc(&sk->sk_drops);
 358                 return -ENOBUFS;
 359         }
 360
 361         skb->dev = NULL;
 362         skb_set_owner_r(skb, sk);
 363
 364         /* Cache the SKB length before we tack it onto the receive
 365          * queue.  Once it is added it no longer belongs to us and
 366          * may be freed by other threads of control pulling packets
 367          * from the queue.
 368          */
 369         skb_len = skb->len;
 370
 371         /* we escape from rcu protected region, make sure we dont leak
 372          * a norefcounted dst
 373          */
 374         skb_dst_force(skb);
 375
 376         spin_lock_irqsave(&list->lock, flags);
 377         skb->dropcount = atomic_read(&sk->sk_drops);
 378         __skb_queue_tail(list, skb);
 379         spin_unlock_irqrestore(&list->lock, flags);
 380
 381         if (!sock_flag(sk, SOCK_DEAD))
 382                 sk->sk_data_ready(sk, skb_len);
 383         return 0;
 384 }
 385 EXPORT_SYMBOL(sock_queue_rcv_skb);
 386
 387 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
 388 {
 389         int rc = NET_RX_SUCCESS;
 390
 391         if (sk_filter(sk, skb))
 392                 goto discard_and_relse;
 393
 394         skb->dev = NULL;
 395
 396         if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
 397                 atomic_inc(&sk->sk_drops);
 398                 goto discard_and_relse;
 399         }
 400         if (nested)
 401                 bh_lock_sock_nested(sk);
 402         else
 403                 bh_lock_sock(sk);
 404         if (!sock_owned_by_user(sk)) {
 405                 /*
 406                  * trylock + unlock semantics:
 407                  */
 408                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 409
 410                 rc = sk_backlog_rcv(sk, skb);
 411
 412                 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 413         } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
 414                 bh_unlock_sock(sk);
 415                 atomic_inc(&sk->sk_drops);
 416                 goto discard_and_relse;
 417         }
 418
 419         bh_unlock_sock(sk);
 420 out:
 421         sock_put(sk);
 422         return rc;
 423 discard_and_relse:
 424         kfree_skb(skb);
 425         goto out;
 426 }
 427 EXPORT_SYMBOL(sk_receive_skb);
 428
 429 void sk_reset_txq(struct sock *sk)
 430 {
 431         sk_tx_queue_clear(sk);
 432 }
 433 EXPORT_SYMBOL(sk_reset_txq);
 434
 435 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 436 {
 437         struct dst_entry *dst = __sk_dst_get(sk);
 438
 439         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 440                 sk_tx_queue_clear(sk);
 441                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 442                 dst_release(dst);
 443                 return NULL;
 444         }
 445
 446         return dst;
 447 }
 448 EXPORT_SYMBOL(__sk_dst_check);
 449
 450 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 451 {
 452         struct dst_entry *dst = sk_dst_get(sk);
 453
 454         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 455                 sk_dst_reset(sk);
 456                 dst_release(dst);
 457                 return NULL;
 458         }
 459
 460         return dst;
 461 }
 462 EXPORT_SYMBOL(sk_dst_check);
 463
 464 static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
 465 {
 466         int ret = -ENOPROTOOPT;
 467 #ifdef CONFIG_NETDEVICES
 468         struct net *net = sock_net(sk);
 469         char devname[IFNAMSIZ];
 470         int index;
 471
 472         /* Sorry... */
 473         ret = -EPERM;
 474         if (!capable(CAP_NET_RAW))
 475                 goto out;
 476
 477         ret = -EINVAL;
 478         if (optlen < 0)
 479                 goto out;
 480
 481         /* Bind this socket to a particular device like "eth0",
 482          * as specified in the passed interface name. If the
 483          * name is "" or the option length is zero the socket
 484          * is not bound.
 485          */
 486         if (optlen > IFNAMSIZ - 1)
 487                 optlen = IFNAMSIZ - 1;
 488         memset(devname, 0, sizeof(devname));
 489
 490         ret = -EFAULT;
 491         if (copy_from_user(devname, optval, optlen))
 492                 goto out;
 493
 494         index = 0;
 495         if (devname[0] != '\0') {
 496                 struct net_device *dev;
 497
 498                 rcu_read_lock();
 499                 dev = dev_get_by_name_rcu(net, devname);
 500                 if (dev)
 501                         index = dev->ifindex;
 502                 rcu_read_unlock();
 503                 ret = -ENODEV;
 504                 if (!dev)
 505                         goto out;
 506         }
 507
 508         lock_sock(sk);
 509         sk->sk_bound_dev_if = index;
 510         sk_dst_reset(sk);
 511         release_sock(sk);
 512
 513         ret = 0;
 514
 515 out:
 516 #endif
 517
 518         return ret;
 519 }
 520
 521 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 522 {
 523         if (valbool)
 524                 sock_set_flag(sk, bit);
 525         else
 526                 sock_reset_flag(sk, bit);
 527 }
 528
 529 /*
 530  *      This is meant for all protocols to use and covers goings on
 531  *      at the socket level. Everything here is generic.
 532  */
 533
 534 int sock_setsockopt(struct socket *sock, int level, int optname,
 535                     char __user *optval, unsigned int optlen)
 536 {
 537         struct sock *sk = sock->sk;
 538         int val;
 539         int valbool;
 540         struct linger ling;
 541         int ret = 0;
 542
 543         /*
 544          *      Options without arguments
 545          */
 546
 547         if (optname == SO_BINDTODEVICE)
 548                 return sock_bindtodevice(sk, optval, optlen);
 549
 550         if (optlen < sizeof(int))
 551                 return -EINVAL;
 552
 553         if (get_user(val, (int __user *)optval))
 554                 return -EFAULT;
 555
 556         valbool = val ? 1 : 0;
 557
 558         lock_sock(sk);
 559
 560         switch (optname) {
 561         case SO_DEBUG:
 562                 if (val && !capable(CAP_NET_ADMIN))
 563                         ret = -EACCES;
 564                 else
 565                         sock_valbool_flag(sk, SOCK_DBG, valbool);
 566                 break;
 567         case SO_REUSEADDR:
 568                 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 569                 break;
 570         case SO_TYPE:
 571         case SO_PROTOCOL:
 572         case SO_DOMAIN:
 573         case SO_ERROR:
 574                 ret = -ENOPROTOOPT;
 575                 break;
 576         case SO_DONTROUTE:
 577                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 578                 break;
 579         case SO_BROADCAST:
 580                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 581                 break;
 582         case SO_SNDBUF:
 583                 /* Don't error on this BSD doesn't and if you think
 584                  * about it this is right. Otherwise apps have to
 585                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 586                  * are treated in BSD as hints
 587                  */
 588                 val = min_t(u32, val, sysctl_wmem_max);
 589 set_sndbuf:
 590                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 591                 sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
 592                 /* Wake up sending tasks if we upped the value. */
 593                 sk->sk_write_space(sk);
 594                 break;
 595
 596         case SO_SNDBUFFORCE:
 597                 if (!capable(CAP_NET_ADMIN)) {
 598                         ret = -EPERM;
 599                         break;
 600                 }
 601                 goto set_sndbuf;
 602
 603         case SO_RCVBUF:
 604                 /* Don't error on this BSD doesn't and if you think
 605                  * about it this is right. Otherwise apps have to
 606                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 607                  * are treated in BSD as hints
 608                  */
 609                 val = min_t(u32, val, sysctl_rmem_max);
 610 set_rcvbuf:
 611                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 612                 /*
 613                  * We double it on the way in to account for
 614                  * "struct sk_buff" etc. overhead.   Applications
 615                  * assume that the SO_RCVBUF setting they make will
 616                  * allow that much actual data to be received on that
 617                  * socket.
 618                  *
 619                  * Applications are unaware that "struct sk_buff" and
 620                  * other overheads allocate from the receive buffer
 621                  * during socket buffer allocation.
 622                  *
 623                  * And after considering the possible alternatives,
 624                  * returning the value we actually used in getsockopt
 625                  * is the most desirable behavior.
 626                  */
 627                 sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
 628                 break;
 629
 630         case SO_RCVBUFFORCE:
 631                 if (!capable(CAP_NET_ADMIN)) {
 632                         ret = -EPERM;
 633                         break;
 634                 }
 635                 goto set_rcvbuf;
 636
 637         case SO_KEEPALIVE:
 638 #ifdef CONFIG_INET
 639                 if (sk->sk_protocol == IPPROTO_TCP)
 640                         tcp_set_keepalive(sk, valbool);
 641 #endif
 642                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 643                 break;
 644
 645         case SO_OOBINLINE:
 646                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 647                 break;
 648
 649         case SO_NO_CHECK:
 650                 sk->sk_no_check = valbool;
 651                 break;
 652
 653         case SO_PRIORITY:
 654                 if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
 655                         sk->sk_priority = val;
 656                 else
 657                         ret = -EPERM;
 658                 break;
 659
 660         case SO_LINGER:
 661                 if (optlen < sizeof(ling)) {
 662                         ret = -EINVAL;  /* 1003.1g */
 663                         break;
 664                 }
 665                 if (copy_from_user(&ling, optval, sizeof(ling))) {
 666                         ret = -EFAULT;
 667                         break;
 668                 }
 669                 if (!ling.l_onoff)
 670                         sock_reset_flag(sk, SOCK_LINGER);
 671                 else {
 672 #if (BITS_PER_LONG == 32)
 673                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 674                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 675                         else
 676 #endif
 677                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 678                         sock_set_flag(sk, SOCK_LINGER);
 679                 }
 680                 break;
 681
 682         case SO_BSDCOMPAT:
 683                 sock_warn_obsolete_bsdism("setsockopt");
 684                 break;
 685
 686         case SO_PASSCRED:
 687                 if (valbool)
 688                         set_bit(SOCK_PASSCRED, &sock->flags);
 689                 else
 690                         clear_bit(SOCK_PASSCRED, &sock->flags);
 691                 break;
 692
 693         case SO_TIMESTAMP:
 694         case SO_TIMESTAMPNS:
 695                 if (valbool)  {
 696                         if (optname == SO_TIMESTAMP)
 697                                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 698                         else
 699                                 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 700                         sock_set_flag(sk, SOCK_RCVTSTAMP);
 701                         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 702                 } else {
 703                         sock_reset_flag(sk, SOCK_RCVTSTAMP);
 704                         sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 705                 }
 706                 break;
 707
 708         case SO_TIMESTAMPING:
 709                 if (val & ~SOF_TIMESTAMPING_MASK) {
 710                         ret = -EINVAL;
 711                         break;
 712                 }
 713                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
 714                                   val & SOF_TIMESTAMPING_TX_HARDWARE);
 715                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
 716                                   val & SOF_TIMESTAMPING_TX_SOFTWARE);
 717                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
 718                                   val & SOF_TIMESTAMPING_RX_HARDWARE);
 719                 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 720                         sock_enable_timestamp(sk,
 721                                               SOCK_TIMESTAMPING_RX_SOFTWARE);
 722                 else
 723                         sock_disable_timestamp(sk,
 724                                                (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 725                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
 726                                   val & SOF_TIMESTAMPING_SOFTWARE);
 727                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
 728                                   val & SOF_TIMESTAMPING_SYS_HARDWARE);
 729                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
 730                                   val & SOF_TIMESTAMPING_RAW_HARDWARE);
 731                 break;
 732
 733         case SO_RCVLOWAT:
 734                 if (val < 0)
 735                         val = INT_MAX;
 736                 sk->sk_rcvlowat = val ? : 1;
 737                 break;
 738
 739         case SO_RCVTIMEO:
 740                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 741                 break;
 742
 743         case SO_SNDTIMEO:
 744                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 745                 break;
 746
 747         case SO_ATTACH_FILTER:
 748                 ret = -EINVAL;
 749                 if (optlen == sizeof(struct sock_fprog)) {
 750                         struct sock_fprog fprog;
 751
 752                         ret = -EFAULT;
 753                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 754                                 break;
 755
 756                         ret = sk_attach_filter(&fprog, sk);
 757                 }
 758                 break;
 759
 760         case SO_DETACH_FILTER:
 761                 ret = sk_detach_filter(sk);
 762                 break;
 763
 764         case SO_PASSSEC:
 765                 if (valbool)
 766                         set_bit(SOCK_PASSSEC, &sock->flags);
 767                 else
 768                         clear_bit(SOCK_PASSSEC, &sock->flags);
 769                 break;
 770         case SO_MARK:
 771                 if (!capable(CAP_NET_ADMIN))
 772                         ret = -EPERM;
 773                 else
 774                         sk->sk_mark = val;
 775                 break;
 776
 777                 /* We implement the SO_SNDLOWAT etc to
 778                    not be settable (1003.1g 5.3) */
 779         case SO_RXQ_OVFL:
 780                 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
 781                 break;
 782
 783         case SO_WIFI_STATUS:
 784                 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
 785                 break;
 786
 787         case SO_PEEK_OFF:
 788                 if (sock->ops->set_peek_off)
 789                         sock->ops->set_peek_off(sk, val);
 790                 else
 791                         ret = -EOPNOTSUPP;
 792                 break;
 793
 794         case SO_NOFCS:
 795                 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
 796                 break;
 797
 798         default:
 799                 ret = -ENOPROTOOPT;
 800                 break;
 801         }
 802         release_sock(sk);
 803         return ret;
 804 }
 805 EXPORT_SYMBOL(sock_setsockopt);
 806
 807
 808 void cred_to_ucred(struct pid *pid, const struct cred *cred,
 809                    struct ucred *ucred)
 810 {
 811         ucred->pid = pid_vnr(pid);
 812         ucred->uid = ucred->gid = -1;
 813         if (cred) {
 814                 struct user_namespace *current_ns = current_user_ns();
 815
 816                 ucred->uid = from_kuid(current_ns, cred->euid);
 817                 ucred->gid = from_kgid(current_ns, cred->egid);
 818         }
 819 }
 820 EXPORT_SYMBOL_GPL(cred_to_ucred);
 821
/*
 * Generic getsockopt() handler.
 *
 * @sock:    socket being queried
 * @level:   option level (not examined in this function)
 * @optname: SO_* option to read
 * @optval:  user buffer receiving the option value
 * @optlen:  user pointer holding the buffer size on entry; on success it is
 *           rewritten with the number of bytes actually stored
 *
 * Most options are staged in the zero-initialised union 'v' and copied out
 * at the bottom, truncated to min(user length, lv).  SO_PEERCRED and
 * SO_PEERNAME copy their own buffers and jump straight to 'lenout';
 * SO_PEERSEC is handed over entirely to the security hook.
 *
 * Returns 0 on success, -EFAULT when a user access fails, -EINVAL for a
 * negative length (or an oversized SO_PEERNAME request), -ENOTCONN when the
 * peer name is unavailable, -EOPNOTSUPP for SO_PEEK_OFF without protocol
 * support and -ENOPROTOOPT for unknown options.
 */
int sock_getsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;

        /* Staging area for every option that is returned by value. */
        union {
                int val;
                struct linger ling;
                struct timeval tm;
        } v;

        /* lv: size of the value staged in 'v'; len: size offered by the user */
        int lv = sizeof(int);
        int len;

        if (get_user(len, optlen))
                return -EFAULT;
        if (len < 0)
                return -EINVAL;

        /* Options that set nothing below therefore report zero. */
        memset(&v, 0, sizeof(v));

        switch (optname) {
        case SO_DEBUG:
                v.val = sock_flag(sk, SOCK_DBG);
                break;

        case SO_DONTROUTE:
                v.val = sock_flag(sk, SOCK_LOCALROUTE);
                break;

        case SO_BROADCAST:
                v.val = sock_flag(sk, SOCK_BROADCAST);
                break;

        case SO_SNDBUF:
                v.val = sk->sk_sndbuf;
                break;

        case SO_RCVBUF:
                v.val = sk->sk_rcvbuf;
                break;

        case SO_REUSEADDR:
                v.val = sk->sk_reuse;
                break;

        case SO_KEEPALIVE:
                v.val = sock_flag(sk, SOCK_KEEPOPEN);
                break;

        case SO_TYPE:
                v.val = sk->sk_type;
                break;

        case SO_PROTOCOL:
                v.val = sk->sk_protocol;
                break;

        case SO_DOMAIN:
                v.val = sk->sk_family;
                break;

        case SO_ERROR:
                /* Report the pending error; if none, fetch and clear the soft error. */
                v.val = -sock_error(sk);
                if (v.val == 0)
                        v.val = xchg(&sk->sk_err_soft, 0);
                break;

        case SO_OOBINLINE:
                v.val = sock_flag(sk, SOCK_URGINLINE);
                break;

        case SO_NO_CHECK:
                v.val = sk->sk_no_check;
                break;

        case SO_PRIORITY:
                v.val = sk->sk_priority;
                break;

        case SO_LINGER:
                lv              = sizeof(v.ling);
                v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
                /* sk_lingertime is divided by HZ to yield l_linger */
                v.ling.l_linger = sk->sk_lingertime / HZ;
                break;

        case SO_BSDCOMPAT:
                /* Only warns; the reported value stays 0 from the memset above. */
                sock_warn_obsolete_bsdism("getsockopt");
                break;

        case SO_TIMESTAMP:
                /* Set only when timestamps are on and the nanosecond variant is off. */
                v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
                                !sock_flag(sk, SOCK_RCVTSTAMPNS);
                break;

        case SO_TIMESTAMPNS:
                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
                break;

        case SO_TIMESTAMPING:
                /* Rebuild the SOF_* bitmask from the individual socket flags. */
                v.val = 0;
                if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
                        v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
                if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
                        v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
                if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
                        v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
                if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
                        v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
                if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
                        v.val |= SOF_TIMESTAMPING_SOFTWARE;
                if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
                        v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
                if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
                        v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
                break;

        case SO_RCVTIMEO:
                lv = sizeof(struct timeval);
                /* MAX_SCHEDULE_TIMEOUT is reported as an all-zero timeval. */
                if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
                        v.tm.tv_sec = 0;
                        v.tm.tv_usec = 0;
                } else {
                        /* Split the HZ-based timeout into seconds and microseconds. */
                        v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
                        v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
                }
                break;

        case SO_SNDTIMEO:
                lv = sizeof(struct timeval);
                /* Same encoding as SO_RCVTIMEO, for the send timeout. */
                if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
                        v.tm.tv_sec = 0;
                        v.tm.tv_usec = 0;
                } else {
                        v.tm.tv_sec = sk->sk_sndtimeo / HZ;
                        v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
                }
                break;

        case SO_RCVLOWAT:
                v.val = sk->sk_rcvlowat;
                break;

        case SO_SNDLOWAT:
                /* Always reported as 1. */
                v.val = 1;
                break;

        case SO_PASSCRED:
                v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
                break;

        case SO_PEERCRED:
        {
                /* Copied out directly (not through 'v'), truncated to the user length. */
                struct ucred peercred;
                if (len > sizeof(peercred))
                        len = sizeof(peercred);
                cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
                if (copy_to_user(optval, &peercred, len))
                        return -EFAULT;
                goto lenout;
        }

        case SO_PEERNAME:
        {
                char address[128];

                /* getname() stores the address and updates lv with its length. */
                if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
                        return -ENOTCONN;
                /* Refuse requests for more bytes than getname() produced. */
                if (lv < len)
                        return -EINVAL;
                if (copy_to_user(optval, address, len))
                        return -EFAULT;
                goto lenout;
        }

        /* Dubious BSD thing... Probably nobody even uses it, but
         * the UNIX standard wants it for whatever reason... -DaveM
         */
        case SO_ACCEPTCONN:
                v.val = sk->sk_state == TCP_LISTEN;
                break;

        case SO_PASSSEC:
                v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
                break;

        case SO_PEERSEC:
                /* The security hook receives optval/optlen and handles the copy-out itself. */
                return security_socket_getpeersec_stream(sock, optval, optlen, len);

        case SO_MARK:
                v.val = sk->sk_mark;
                break;

        case SO_RXQ_OVFL:
                v.val = sock_flag(sk, SOCK_RXQ_OVFL);
                break;

        case SO_WIFI_STATUS:
                v.val = sock_flag(sk, SOCK_WIFI_STATUS);
                break;

        case SO_PEEK_OFF:
                /* Only readable when the protocol provides set_peek_off. */
                if (!sock->ops->set_peek_off)
                        return -EOPNOTSUPP;

                v.val = sk->sk_peek_off;
                break;
        case SO_NOFCS:
                v.val = sock_flag(sk, SOCK_NOFCS);
                break;
        default:
                return -ENOPROTOOPT;
        }

        /* Never copy more than the staged value holds. */
        if (len > lv)
                len = lv;
        if (copy_to_user(optval, &v, len))
                return -EFAULT;
lenout:
        /* Tell the caller how many bytes were stored. */
        if (put_user(len, optlen))
                return -EFAULT;
        return 0;
}
1045
1046 /*
1047  * Initialize an sk_lock.
1048  *
1049  * (We also register the sk_lock with the lock validator.)
1050  */
1051 static inline void sock_lock_init(struct sock *sk)
1052 {
1053         sock_lock_init_class_and_name(sk,
1054                         af_family_slock_key_strings[sk->sk_family],
1055                         af_family_slock_keys + sk->sk_family,
1056                         af_family_key_strings[sk->sk_family],
1057                         af_family_keys + sk->sk_family);
1058 }
1059
1060 /*
1061  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1062  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1063  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1064  */
1065 static void sock_copy(struct sock *nsk, const struct sock *osk)
1066 {
1067 #ifdef CONFIG_SECURITY_NETWORK
1068         void *sptr = nsk->sk_security;
1069 #endif
1070         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1071
1072         memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1073                osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1074
1075 #ifdef CONFIG_SECURITY_NETWORK
1076         nsk->sk_security = sptr;
1077         security_sk_clone(osk, nsk);
1078 #endif
1079 }
1080
1081 /*
1082  * caches using SLAB_DESTROY_BY_RCU should let .next pointer from nulls nodes
1083  * un-modified. Special care is taken when initializing object to zero.
1084  */
1085 static inline void sk_prot_clear_nulls(struct sock *sk, int size)
1086 {
1087         if (offsetof(struct sock, sk_node.next) != 0)
1088                 memset(sk, 0, offsetof(struct sock, sk_node.next));
1089         memset(&sk->sk_node.pprev, 0,
1090                size - offsetof(struct sock, sk_node.pprev));
1091 }
1092
1093 void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1094 {
1095         unsigned long nulls1, nulls2;
1096
1097         nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1098         nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1099         if (nulls1 > nulls2)
1100                 swap(nulls1, nulls2);
1101
1102         if (nulls1 != 0)
1103                 memset((char *)sk, 0, nulls1);
1104         memset((char *)sk + nulls1 + sizeof(void *), 0,
1105                nulls2 - nulls1 - sizeof(void *));
1106         memset((char *)sk + nulls2 + sizeof(void *), 0,
1107                size - nulls2 - sizeof(void *));
1108 }
1109 EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1110
1111 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1112                 int family)
1113 {
1114         struct sock *sk;
1115         struct kmem_cache *slab;
1116
1117         slab = prot->slab;
1118         if (slab != NULL) {
1119                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1120                 if (!sk)
1121                         return sk;
1122                 if (priority & __GFP_ZERO) {
1123                         if (prot->clear_sk)
1124                                 prot->clear_sk(sk, prot->obj_size);
1125                         else
1126                                 sk_prot_clear_nulls(sk, prot->obj_size);
1127                 }
1128         } else
1129                 sk = kmalloc(prot->obj_size, priority);
1130
1131         if (sk != NULL) {
1132                 kmemcheck_annotate_bitfield(sk, flags);
1133
1134                 if (security_sk_alloc(sk, family, priority))
1135                         goto out_free;
1136
1137                 if (!try_module_get(prot->owner))
1138                         goto out_free_sec;
1139                 sk_tx_queue_clear(sk);
1140         }
1141
1142         return sk;
1143
1144 out_free_sec:
1145         security_sk_free(sk);
1146 out_free:
1147         if (slab != NULL)
1148                 kmem_cache_free(slab, sk);
1149         else
1150                 kfree(sk);
1151         return NULL;
1152 }
1153
1154 static void sk_prot_free(struct proto *prot, struct sock *sk)
1155 {
1156         struct kmem_cache *slab;
1157         struct module *owner;
1158
1159         owner = prot->owner;
1160         slab = prot->slab;
1161
1162         security_sk_free(sk);
1163         if (slab != NULL)
1164                 kmem_cache_free(slab, sk);
1165         else
1166                 kfree(sk);
1167         module_put(owner);
1168 }
1169
1170 #ifdef CONFIG_CGROUPS
1171 void sock_update_classid(struct sock *sk)
1172 {
1173         u32 classid;
1174
1175         rcu_read_lock();  /* doing current task, which cannot vanish. */
1176         classid = task_cls_classid(current);
1177         rcu_read_unlock();
1178         if (classid && classid != sk->sk_classid)
1179                 sk->sk_classid = classid;
1180 }
1181 EXPORT_SYMBOL(sock_update_classid);
1182
1183 void sock_update_netprioidx(struct sock *sk)
1184 {
1185         if (in_interrupt())
1186                 return;
1187
1188         sk->sk_cgrp_prioidx = task_netprioidx(current);
1189 }
1190 EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1191 #endif
1192
1193 /**
1194  *      sk_alloc - All socket objects are allocated here
1195  *      @net: the applicable net namespace
1196  *      @family: protocol family
1197  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1198  *      @prot: struct proto associated with this new sock instance
1199  */
1200 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1201                       struct proto *prot)
1202 {
1203         struct sock *sk;
1204
1205         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1206         if (sk) {
1207                 sk->sk_family = family;
1208                 /*
1209                  * See comment in struct sock definition to understand
1210                  * why we need sk_prot_creator -acme
1211                  */
1212                 sk->sk_prot = sk->sk_prot_creator = prot;
1213                 sock_lock_init(sk);
1214                 sock_net_set(sk, get_net(net));
1215                 atomic_set(&sk->sk_wmem_alloc, 1);
1216
1217                 sock_update_classid(sk);
1218                 sock_update_netprioidx(sk);
1219         }
1220
1221         return sk;
1222 }
1223 EXPORT_SYMBOL(sk_alloc);
1224
1225 static void __sk_free(struct sock *sk)
1226 {
1227         struct sk_filter *filter;
1228
1229         if (sk->sk_destruct)
1230                 sk->sk_destruct(sk);
1231
1232         filter = rcu_dereference_check(sk->sk_filter,
1233                                        atomic_read(&sk->sk_wmem_alloc) == 0);
1234         if (filter) {
1235                 sk_filter_uncharge(sk, filter);
1236                 RCU_INIT_POINTER(sk->sk_filter, NULL);
1237         }
1238
1239         sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1240
1241         if (atomic_read(&sk->sk_omem_alloc))
1242                 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1243                          __func__, atomic_read(&sk->sk_omem_alloc));
1244
1245         if (sk->sk_peer_cred)
1246                 put_cred(sk->sk_peer_cred);
1247         put_pid(sk->sk_peer_pid);
1248         put_net(sock_net(sk));
1249         sk_prot_free(sk->sk_prot_creator, sk);
1250 }
1251
1252 void sk_free(struct sock *sk)
1253 {
1254         /*
1255          * We subtract one from sk_wmem_alloc and can know if
1256          * some packets are still in some tx queue.
1257          * If not null, sock_wfree() will call __sk_free(sk) later
1258          */
1259         if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1260                 __sk_free(sk);
1261 }
1262 EXPORT_SYMBOL(sk_free);
1263
1264 /*
1265  * Last sock_put should drop reference to sk->sk_net. It has already
1266  * been dropped in sk_change_net. Taking reference to stopping namespace
1267  * is not an option.
1268  * Take reference to a socket to remove it from hash _alive_ and after that
1269  * destroy it in the context of init_net.
1270  */
1271 void sk_release_kernel(struct sock *sk)
1272 {
1273         if (sk == NULL || sk->sk_socket == NULL)
1274                 return;
1275
1276         sock_hold(sk);
1277         sock_release(sk->sk_socket);
1278         release_net(sock_net(sk));
1279         sock_net_set(sk, get_net(&init_net));
1280         sock_put(sk);
1281 }
1282 EXPORT_SYMBOL(sk_release_kernel);
1283
1284 static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1285 {
1286         if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1287                 sock_update_memcg(newsk);
1288 }
1289
/**
 *      sk_clone_lock - clone a socket, and lock its clone
 *      @sk: the socket to clone
 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 *
 *      The clone starts as a raw copy of @sk (see sock_copy()); every field
 *      that must not be shared with the parent is then reinitialised.
 *      Returns the new socket, locked with bh_lock_sock(), or NULL when the
 *      allocation or the xfrm policy clone fails.
 */
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
        struct sock *newsk;

        newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
        if (newsk != NULL) {
                struct sk_filter *filter;

                sock_copy(newsk, sk);

                /* SANITY */
                get_net(sock_net(newsk));
                sk_node_init(&newsk->sk_node);
                sock_lock_init(newsk);
                bh_lock_sock(newsk);
                /* The clone starts with an empty backlog. */
                newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
                newsk->sk_backlog.len = 0;

                atomic_set(&newsk->sk_rmem_alloc, 0);
                /*
                 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
                 */
                atomic_set(&newsk->sk_wmem_alloc, 1);
                atomic_set(&newsk->sk_omem_alloc, 0);
                /* Fresh, empty queues: the parent's list heads were copied verbatim. */
                skb_queue_head_init(&newsk->sk_receive_queue);
                skb_queue_head_init(&newsk->sk_write_queue);
#ifdef CONFIG_NET_DMA
                skb_queue_head_init(&newsk->sk_async_wait_queue);
#endif

                /* Locks are reinitialised rather than inherited from the copy. */
                spin_lock_init(&newsk->sk_dst_lock);
                rwlock_init(&newsk->sk_callback_lock);
                lockdep_set_class_and_name(&newsk->sk_callback_lock,
                                af_callback_keys + newsk->sk_family,
                                af_family_clock_key_strings[newsk->sk_family]);

                newsk->sk_dst_cache     = NULL;
                newsk->sk_wmem_queued   = 0;
                newsk->sk_forward_alloc = 0;
                newsk->sk_send_head     = NULL;
                /* Inherit the parent's user locks except SOCK_BINDPORT_LOCK. */
                newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;

                sock_reset_flag(newsk, SOCK_DONE);
                skb_queue_head_init(&newsk->sk_error_queue);

                /* The filter pointer came with the copy; charge it to the clone too. */
                filter = rcu_dereference_protected(newsk->sk_filter, 1);
                if (filter != NULL)
                        sk_filter_charge(newsk, filter);

                if (unlikely(xfrm_sk_clone_policy(newsk))) {
                        /* It is still raw copy of parent, so invalidate
                         * destructor and make plain sk_free() */
                        newsk->sk_destruct = NULL;
                        bh_unlock_sock(newsk);
                        sk_free(newsk);
                        newsk = NULL;
                        goto out;
                }

                newsk->sk_err      = 0;
                newsk->sk_priority = 0;
                /*
                 * Before updating sk_refcnt, we must commit prior changes to memory
                 * (Documentation/RCU/rculist_nulls.txt for details)
                 */
                smp_wmb();
                atomic_set(&newsk->sk_refcnt, 2);

                /*
                 * Increment the counter in the same struct proto as the master
                 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
                 * is the same as sk->sk_prot->socks, as this field was copied
                 * with memcpy).
                 *
                 * This _changes_ the previous behaviour, where
                 * tcp_create_openreq_child always was incrementing the
                 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
                 * to be taken into account in all callers. -acme
                 */
                sk_refcnt_debug_inc(newsk);
                /* The clone is not attached to any struct socket or wait queue yet. */
                sk_set_socket(newsk, NULL);
                newsk->sk_wq = NULL;

                sk_update_clone(sk, newsk);

                if (newsk->sk_prot->sockets_allocated)
                        sk_sockets_allocated_inc(newsk);

                /* Timestamp flags were inherited via the copy; account for them. */
                if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
                        net_enable_timestamp();
        }
out:
        return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone_lock);
1392
1393 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1394 {
1395         __sk_dst_set(sk, dst);
1396         sk->sk_route_caps = dst->dev->features;
1397         if (sk->sk_route_caps & NETIF_F_GSO)
1398                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1399         sk->sk_route_caps &= ~sk->sk_route_nocaps;
1400         if (sk_can_gso(sk)) {
1401                 if (dst->header_len) {
1402                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1403                 } else {
1404                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1405                         sk->sk_gso_max_size = dst->dev->gso_max_size;
1406                 }
1407         }
1408 }
1409 EXPORT_SYMBOL_GPL(sk_setup_caps);
1410
1411 void __init sk_init(void)
1412 {
1413         if (totalram_pages <= 4096) {
1414                 sysctl_wmem_max = 32767;
1415                 sysctl_rmem_max = 32767;
1416                 sysctl_wmem_default = 32767;
1417                 sysctl_rmem_default = 32767;
1418         } else if (totalram_pages >= 131072) {
1419                 sysctl_wmem_max = 131071;
1420                 sysctl_rmem_max = 131071;
1421         }
1422 }
1423
1424 /*
1425  *      Simple resource managers for sockets.
1426  */
1427
1428
1429 /*
1430  * Write buffer destructor automatically called from kfree_skb.
1431  */
1432 void sock_wfree(struct sk_buff *skb)
1433 {
1434         struct sock *sk = skb->sk;
1435         unsigned int len = skb->truesize;
1436
1437         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1438                 /*
1439                  * Keep a reference on sk_wmem_alloc, this will be released
1440                  * after sk_write_space() call
1441                  */
1442                 atomic_sub(len - 1, &sk->sk_wmem_alloc);
1443                 sk->sk_write_space(sk);
1444                 len = 1;
1445         }
1446         /*
1447          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1448          * could not do because of in-flight packets
1449          */
1450         if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1451                 __sk_free(sk);
1452 }
1453 EXPORT_SYMBOL(sock_wfree);
1454
1455 /*
1456  * Read buffer destructor automatically called from kfree_skb.
1457  */
1458 void sock_rfree(struct sk_buff *skb)
1459 {
1460         struct sock *sk = skb->sk;
1461         unsigned int len = skb->truesize;
1462
1463         atomic_sub(len, &sk->sk_rmem_alloc);
1464         sk_mem_uncharge(sk, len);
1465 }
1466 EXPORT_SYMBOL(sock_rfree);
1467
1468
1469 int sock_i_uid(struct sock *sk)
1470 {
1471         int uid;
1472
1473         read_lock_bh(&sk->sk_callback_lock);
1474         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1475         read_unlock_bh(&sk->sk_callback_lock);
1476         return uid;
1477 }
1478 EXPORT_SYMBOL(sock_i_uid);
1479
1480 unsigned long sock_i_ino(struct sock *sk)
1481 {
1482         unsigned long ino;
1483
1484         read_lock_bh(&sk->sk_callback_lock);
1485         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1486         read_unlock_bh(&sk->sk_callback_lock);
1487         return ino;
1488 }
1489 EXPORT_SYMBOL(sock_i_ino);
1490
1491 /*
1492  * Allocate a skb from the socket's send buffer.
1493  */
1494 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1495                              gfp_t priority)
1496 {
1497         if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1498                 struct sk_buff *skb = alloc_skb(size, priority);
1499                 if (skb) {
1500                         skb_set_owner_w(skb, sk);
1501                         return skb;
1502                 }
1503         }
1504         return NULL;
1505 }
1506 EXPORT_SYMBOL(sock_wmalloc);
1507
1508 /*
1509  * Allocate a skb from the socket's receive buffer.
1510  */
1511 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1512                              gfp_t priority)
1513 {
1514         if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1515                 struct sk_buff *skb = alloc_skb(size, priority);
1516                 if (skb) {
1517                         skb_set_owner_r(skb, sk);
1518                         return skb;
1519                 }
1520         }
1521         return NULL;
1522 }
1523
1524 /*
1525  * Allocate a memory block from the socket's option memory buffer.
1526  */
1527 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1528 {
1529         if ((unsigned int)size <= sysctl_optmem_max &&
1530             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1531                 void *mem;
1532                 /* First do the add, to avoid the race if kmalloc
1533                  * might sleep.
1534                  */
1535                 atomic_add(size, &sk->sk_omem_alloc);
1536                 mem = kmalloc(size, priority);
1537                 if (mem)
1538                         return mem;
1539                 atomic_sub(size, &sk->sk_omem_alloc);
1540         }
1541         return NULL;
1542 }
1543 EXPORT_SYMBOL(sock_kmalloc);
1544
1545 /*
1546  * Free an option memory block.
1547  */
1548 void sock_kfree_s(struct sock *sk, void *mem, int size)
1549 {
1550         kfree(mem);
1551         atomic_sub(size, &sk->sk_omem_alloc);
1552 }
1553 EXPORT_SYMBOL(sock_kfree_s);
1554
1555 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1556    I think, these locks should be removed for datagram sockets.
1557  */
1558 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1559 {
1560         DEFINE_WAIT(wait);
1561
1562         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1563         for (;;) {
1564                 if (!timeo)
1565                         break;
1566                 if (signal_pending(current))
1567                         break;
1568                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1569                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1570                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1571                         break;
1572                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1573                         break;
1574                 if (sk->sk_err)
1575                         break;
1576                 timeo = schedule_timeout(timeo);
1577         }
1578         finish_wait(sk_sleep(sk), &wait);
1579         return timeo;
1580 }
1581
1582
1583 /*
1584  *      Generic send/receive buffer handlers
1585  */
1586
/*
 * Allocate a send skb with @header_len bytes of linear space and
 * @data_len bytes spread over freshly allocated page fragments.
 *
 * @sk:         socket the skb is charged to (via skb_set_owner_w())
 * @header_len: size passed to alloc_skb()
 * @data_len:   bytes to provide in page frags; 0 means no frags
 * @noblock:    passed to sock_sndtimeo() to select the wait timeout
 * @errcode:    receives the error code when NULL is returned
 *
 * While sk_wmem_alloc is not below sk_sndbuf the caller waits in
 * sock_wait_for_wmem() and retries.  Errors reported through @errcode:
 * -EMSGSIZE (more than MAX_SKB_FRAGS pages needed), the pending socket
 * error, -EPIPE (send side shut down), -ENOBUFS (skb or page allocation
 * failed), -EAGAIN (no timeout left) or sock_intr_errno() when a signal
 * is pending.
 */
struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
                                     unsigned long data_len, int noblock,
                                     int *errcode)
{
        struct sk_buff *skb;
        gfp_t gfp_mask;
        long timeo;
        int err;
        /* Number of pages needed for data_len, rounded up.
         * NOTE(review): npages is an int derived from an unsigned long;
         * confirm callers bound data_len so this cannot truncate.
         */
        int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;

        err = -EMSGSIZE;
        if (npages > MAX_SKB_FRAGS)
                goto failure;

        gfp_mask = sk->sk_allocation;
        if (gfp_mask & __GFP_WAIT)
                gfp_mask |= __GFP_REPEAT;

        timeo = sock_sndtimeo(sk, noblock);
        while (1) {
                err = sock_error(sk);
                if (err != 0)
                        goto failure;

                err = -EPIPE;
                if (sk->sk_shutdown & SEND_SHUTDOWN)
                        goto failure;

                /* Only allocate while the socket is below its send buffer limit. */
                if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
                        skb = alloc_skb(header_len, gfp_mask);
                        if (skb) {
                                int i;

                                /* No pages, we're done... */
                                if (!data_len)
                                        break;

                                skb->truesize += data_len;
                                skb_shinfo(skb)->nr_frags = npages;
                                for (i = 0; i < npages; i++) {
                                        struct page *page;

                                        page = alloc_pages(sk->sk_allocation, 0);
                                        if (!page) {
                                                err = -ENOBUFS;
                                                /* Only the first i frags hold pages;
                                                 * fix the count before freeing.
                                                 */
                                                skb_shinfo(skb)->nr_frags = i;
                                                kfree_skb(skb);
                                                goto failure;
                                        }

                                        /* Every frag is a full page except possibly the last. */
                                        __skb_fill_page_desc(skb, i,
                                                        page, 0,
                                                        (data_len >= PAGE_SIZE ?
                                                         PAGE_SIZE :
                                                         data_len));
                                        data_len -= PAGE_SIZE;
                                }

                                /* Full success... */
                                break;
                        }
                        err = -ENOBUFS;
                        goto failure;
                }
                /* Send buffer full: flag it, then wait for space or give up. */
                set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                err = -EAGAIN;
                if (!timeo)
                        goto failure;
                if (signal_pending(current))
                        goto interrupted;
                timeo = sock_wait_for_wmem(sk, timeo);
        }

        skb_set_owner_w(skb, sk);
        return skb;

interrupted:
        err = sock_intr_errno(timeo);
failure:
        *errcode = err;
        return NULL;
}
EXPORT_SYMBOL(sock_alloc_send_pskb);
1671 /* Thin wrapper: forwards to sock_alloc_send_pskb() with 0 as the third argument. */
1672 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1673                                     int noblock, int *errcode)
1674 {
1675         return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1676 }
1677 EXPORT_SYMBOL(sock_alloc_send_skb);
1678 /* Block (TASK_UNINTERRUPTIBLE) until sock_owned_by_user(sk) is false. */
1679 static void __lock_sock(struct sock *sk)
1680         __releases(&sk->sk_lock.slock)
1681         __acquires(&sk->sk_lock.slock)
1682 {
1683         DEFINE_WAIT(wait);
1684         /* slock is dropped only around schedule(); the owner test runs with it held */
1685         for (;;) {
1686                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1687                                         TASK_UNINTERRUPTIBLE);
1688                 spin_unlock_bh(&sk->sk_lock.slock);
1689                 schedule();
1690                 spin_lock_bh(&sk->sk_lock.slock);
1691                 if (!sock_owned_by_user(sk))
1692                         break;
1693         }
1694         finish_wait(&sk->sk_lock.wq, &wait);
1695 }
1696 /* Drain sk_backlog: hand each queued skb to sk_backlog_rcv(), then zero backlog.len. */
1697 static void __release_sock(struct sock *sk)
1698         __releases(&sk->sk_lock.slock)
1699         __acquires(&sk->sk_lock.slock)
1700 {
1701         struct sk_buff *skb = sk->sk_backlog.head;
1702         /* each outer pass detaches the whole list, then walks it after bh_unlock_sock() */
1703         do {
1704                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1705                 bh_unlock_sock(sk);
1706
1707                 do {
1708                         struct sk_buff *next = skb->next;  /* saved before skb is handed off */
1709
1710                         prefetch(next);
1711                         WARN_ON_ONCE(skb_dst_is_noref(skb));
1712                         skb->next = NULL;
1713                         sk_backlog_rcv(sk, skb);
1714
1715                         /*
1716                          * We are in process context here with softirqs
1717                          * disabled, use cond_resched_softirq() to preempt.
1718                          * This is safe to do because we've taken the backlog
1719                          * queue private:
1720                          */
1721                         cond_resched_softirq();
1722
1723                         skb = next;
1724                 } while (skb != NULL);
1725                 /* relock; a non-NULL backlog head at this point triggers another pass */
1726                 bh_lock_sock(sk);
1727         } while ((skb = sk->sk_backlog.head) != NULL);
1728
1729         /*
1730          * Zeroing the length only here guarantees we cannot loop forever
1731          * while a wild producer attempts to flood us.
1732          */
1733         sk->sk_backlog.len = 0;
1734 }
1735
1736 /**
1737  * sk_wait_data - wait for data to arrive at sk_receive_queue
1738  * @sk:    sock to wait on
1739  * @timeo: for how long
1740  *
1741  * Socket state, including sk->sk_err, is now changed only under the lock,
1742  * hence we may omit checks after joining the wait queue.
1743  * We check the receive queue before schedule() only as an optimization;
1744  * it is very likely that release_sock() added new data.
1745  */
1746 int sk_wait_data(struct sock *sk, long *timeo)
1747 {
1748         int rc;
1749         DEFINE_WAIT(wait);
1750         /* rc is whatever sk_wait_event() yields for the "receive queue not empty" test */
1751         prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1752         set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);  /* set only while waiting */
1753         rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1754         clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1755         finish_wait(sk_sleep(sk), &wait);
1756         return rc;
1757 }
1758 EXPORT_SYMBOL(sk_wait_data);
1759
1760 /**
1761  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1762  *      @sk: socket
1763  *      @size: memory size to allocate
1764  *      @kind: allocation type
1765  *
1766  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1767  *      rmem allocation. This function assumes that protocols which have
1768  *      memory_pressure use sk_wmem_queued as write buffer accounting.
1769  */
1770 int __sk_mem_schedule(struct sock *sk, int size, int kind)
1771 {
1772         struct proto *prot = sk->sk_prot;
1773         int amt = sk_mem_pages(size);
1774         long allocated;
1775         int parent_status = UNDER_LIMIT;
1776         /* Returns 1 if the charge made below is kept, 0 if it is rolled back at the end. */
1777         sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1778
1779         allocated = sk_memory_allocated_add(sk, amt, &parent_status);
1780
1781         /* Under limit. */
1782         if (parent_status == UNDER_LIMIT &&
1783                         allocated <= sk_prot_mem_limits(sk, 0)) {
1784                 sk_leave_memory_pressure(sk);
1785                 return 1;
1786         }
1787
1788         /* Under pressure. (we or our parents) */
1789         if ((parent_status > SOFT_LIMIT) ||
1790                         allocated > sk_prot_mem_limits(sk, 1))
1791                 sk_enter_memory_pressure(sk);
1792
1793         /* Over hard limit (we or our parents) */
1794         if ((parent_status == OVER_LIMIT) ||
1795                         (allocated > sk_prot_mem_limits(sk, 2)))
1796                 goto suppress_allocation;
1797
1798         /* guarantee minimum buffer size under pressure */
1799         if (kind == SK_MEM_RECV) {
1800                 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1801                         return 1;
1802
1803         } else { /* SK_MEM_SEND */
1804                 if (sk->sk_type == SOCK_STREAM) {
1805                         if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1806                                 return 1;
1807                 } else if (atomic_read(&sk->sk_wmem_alloc) <
1808                            prot->sysctl_wmem[0])
1809                                 return 1;
1810         }
1811         /* further allowance, evaluated only when sk_has_memory_pressure(sk) is true */
1812         if (sk_has_memory_pressure(sk)) {
1813                 int alloc;
1814
1815                 if (!sk_under_memory_pressure(sk))
1816                         return 1;
1817                 alloc = sk_sockets_allocated_read_positive(sk);
1818                 if (sk_prot_mem_limits(sk, 2) > alloc *
1819                     sk_mem_pages(sk->sk_wmem_queued +
1820                                  atomic_read(&sk->sk_rmem_alloc) +
1821                                  sk->sk_forward_alloc))
1822                         return 1;
1823         }
1824
1825 suppress_allocation:
1826
1827         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1828                 sk_stream_moderate_sndbuf(sk);
1829
1830                 /* Fail only if socket is _under_ its sndbuf.
1831                  * In this case we cannot block, so that we have to fail.
1832                  */
1833                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1834                         return 1;
1835         }
1836
1837         trace_sock_exceed_buf_limit(sk, prot, allocated);
1838
1839         /* Alas. Undo changes. */
1840         sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1841
1842         sk_memory_allocated_sub(sk, amt);
1843         /* both additions made at the top have now been reverted */
1844         return 0;
1845 }
1846 EXPORT_SYMBOL(__sk_mem_schedule);
1847
1848 /**
1849  *      __sk_mem_reclaim - reclaim memory_allocated
1850  *      @sk: socket
1851  */
1852 void __sk_mem_reclaim(struct sock *sk)
1853 {
1854         sk_memory_allocated_sub(sk,
1855                                 sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
1856         sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;  /* keep only the sub-quantum remainder */
1857         /* leave the pressure state once the total is below limit index 0 */
1858         if (sk_under_memory_pressure(sk) &&
1859             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
1860                 sk_leave_memory_pressure(sk);
1861 }
1862 EXPORT_SYMBOL(__sk_mem_reclaim);
1863
1864
1865 /*
1866  * Set of default routines for initialising struct proto_ops when
1867  * the protocol does not support a particular function. In certain
1868  * cases where it makes no sense for a protocol to have a "do nothing"
1869  * function, some default processing is provided.
1870  */
1871 /* bind stub: ignores its arguments and always returns -EOPNOTSUPP. */
1872 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1873 {
1874         return -EOPNOTSUPP;
1875 }
1876 EXPORT_SYMBOL(sock_no_bind);
1877 /* connect stub: ignores its arguments and always returns -EOPNOTSUPP. */
1878 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1879                     int len, int flags)
1880 {
1881         return -EOPNOTSUPP;
1882 }
1883 EXPORT_SYMBOL(sock_no_connect);
1884 /* socketpair stub: ignores both sockets and always returns -EOPNOTSUPP. */
1885 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1886 {
1887         return -EOPNOTSUPP;
1888 }
1889 EXPORT_SYMBOL(sock_no_socketpair);
1890 /* accept stub: newsock is left untouched; always returns -EOPNOTSUPP. */
1891 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1892 {
1893         return -EOPNOTSUPP;
1894 }
1895 EXPORT_SYMBOL(sock_no_accept);
1896 /* getname stub: writes nothing to saddr/len; always returns -EOPNOTSUPP. */
1897 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1898                     int *len, int peer)
1899 {
1900         return -EOPNOTSUPP;
1901 }
1902 EXPORT_SYMBOL(sock_no_getname);
1903 /* poll stub: returns an empty event mask (0) and does not touch the poll table. */
1904 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
1905 {
1906         return 0;
1907 }
1908 EXPORT_SYMBOL(sock_no_poll);
1909 /* ioctl stub: every cmd is rejected with -EOPNOTSUPP. */
1910 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1911 {
1912         return -EOPNOTSUPP;
1913 }
1914 EXPORT_SYMBOL(sock_no_ioctl);
1915 /* listen stub: backlog is ignored; always returns -EOPNOTSUPP. */
1916 int sock_no_listen(struct socket *sock, int backlog)
1917 {
1918         return -EOPNOTSUPP;
1919 }
1920 EXPORT_SYMBOL(sock_no_listen);
1921 /* shutdown stub: how is ignored; always returns -EOPNOTSUPP. */
1922 int sock_no_shutdown(struct socket *sock, int how)
1923 {
1924         return -EOPNOTSUPP;
1925 }
1926 EXPORT_SYMBOL(sock_no_shutdown);
1927 /* setsockopt stub: optval is never read; always returns -EOPNOTSUPP. */
1928 int sock_no_setsockopt(struct socket *sock, int level, int optname,
1929                     char __user *optval, unsigned int optlen)
1930 {
1931         return -EOPNOTSUPP;
1932 }
1933 EXPORT_SYMBOL(sock_no_setsockopt);
1934 /* getsockopt stub: optval/optlen are never written; always returns -EOPNOTSUPP. */
1935 int sock_no_getsockopt(struct socket *sock, int level, int optname,
1936                     char __user *optval, int __user *optlen)
1937 {
1938         return -EOPNOTSUPP;
1939 }
1940 EXPORT_SYMBOL(sock_no_getsockopt);
1941 /* sendmsg stub: the message is not examined; always returns -EOPNOTSUPP. */
1942 int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1943                     size_t len)
1944 {
1945         return -EOPNOTSUPP;
1946 }
1947 EXPORT_SYMBOL(sock_no_sendmsg);
1948 /* recvmsg stub: the msghdr is not filled in; always returns -EOPNOTSUPP. */
1949 int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1950                     size_t len, int flags)
1951 {
1952         return -EOPNOTSUPP;
1953 }
1954 EXPORT_SYMBOL(sock_no_recvmsg);
1955 /* mmap stub: unlike the other stubs in this group it returns -ENODEV. */
1956 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1957 {
1958         /* Mirror the error code used when the mmap method is missing */
1959         return -ENODEV;
1960 }
1961 EXPORT_SYMBOL(sock_no_mmap);
1962 /* sendpage fallback: kmap() the page and push the bytes through kernel_sendmsg(). */
1963 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1964 {
1965         ssize_t res;
1966         struct msghdr msg = {.msg_flags = flags};  /* all other msghdr fields are zero */
1967         struct kvec iov;
1968         char *kaddr = kmap(page);
1969         iov.iov_base = kaddr + offset;  /* single kvec covering [offset, offset + size) */
1970         iov.iov_len = size;
1971         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1972         kunmap(page);  /* unmapped regardless of the send result */
1973         return res;  /* kernel_sendmsg() result is passed through unchanged */
1974 }
1975 EXPORT_SYMBOL(sock_no_sendpage);
1976
1977 /*
1978  *      Default Socket Callbacks
1979  */
1980 /* Installed as sk_state_change by sock_init_data(): wake all interruptible sleepers. */
1981 static void sock_def_wakeup(struct sock *sk)
1982 {
1983         struct socket_wq *wq;
1984         /* sk->sk_wq is read under rcu_read_lock() via rcu_dereference() */
1985         rcu_read_lock();
1986         wq = rcu_dereference(sk->sk_wq);
1987         if (wq_has_sleeper(wq))
1988                 wake_up_interruptible_all(&wq->wait);
1989         rcu_read_unlock();
1990 }
1991 /* Installed as sk_error_report by sock_init_data(): POLLERR wakeup plus async notify. */
1992 static void sock_def_error_report(struct sock *sk)
1993 {
1994         struct socket_wq *wq;
1995         /* sk->sk_wq is read under rcu_read_lock() via rcu_dereference() */
1996         rcu_read_lock();
1997         wq = rcu_dereference(sk->sk_wq);
1998         if (wq_has_sleeper(wq))
1999                 wake_up_interruptible_poll(&wq->wait, POLLERR);
2000         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);  /* called whether or not anyone sleeps */
2001         rcu_read_unlock();
2002 }
2003 /* Installed as sk_data_ready by sock_init_data(); the len argument is not used here. */
2004 static void sock_def_readable(struct sock *sk, int len)
2005 {
2006         struct socket_wq *wq;
2007         /* sk->sk_wq is read under rcu_read_lock() via rcu_dereference() */
2008         rcu_read_lock();
2009         wq = rcu_dereference(sk->sk_wq);
2010         if (wq_has_sleeper(wq))
2011                 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2012                                                 POLLRDNORM | POLLRDBAND);
2013         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);  /* called whether or not anyone sleeps */
2014         rcu_read_unlock();
2015 }
2016 /* Installed as sk_write_space by sock_init_data(): wake writers when enough room exists. */
2017 static void sock_def_write_space(struct sock *sk)
2018 {
2019         struct socket_wq *wq;
2020
2021         rcu_read_lock();
2022
2023         /* Do not wake up a writer until he can make "significant"
2024          * progress.  --DaveM
2025          */
2026         if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {  /* i.e. at most half of sndbuf in use */
2027                 wq = rcu_dereference(sk->sk_wq);
2028                 if (wq_has_sleeper(wq))
2029                         wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2030                                                 POLLWRNORM | POLLWRBAND);
2031
2032                 /* Should agree with poll, otherwise some programs break */
2033                 if (sock_writeable(sk))
2034                         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2035         }
2036
2037         rcu_read_unlock();
2038 }
2039 /* Installed as sk_destruct by sock_init_data(): frees sk->sk_protinfo and nothing else. */
2040 static void sock_def_destruct(struct sock *sk)
2041 {
2042         kfree(sk->sk_protinfo);
2043 }
2044 /* Call send_sigurg() on the socket file's owner; no-op without sk_socket or its file. */
2045 void sk_send_sigurg(struct sock *sk)
2046 {
2047         if (sk->sk_socket && sk->sk_socket->file)
2048                 if (send_sigurg(&sk->sk_socket->file->f_owner))
2049                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);  /* only on a nonzero send_sigurg() result */
2050 }
2051 EXPORT_SYMBOL(sk_send_sigurg);
2052 /* (Re)arm timer for expires; take a reference on sk when mod_timer() returns 0. */
2053 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2054                     unsigned long expires)
2055 {
2056         if (!mod_timer(timer, expires))
2057                 sock_hold(sk);  /* counterpart of the __sock_put() in sk_stop_timer() */
2058 }
2059 EXPORT_SYMBOL(sk_reset_timer);
2060 /* Delete timer; drop a reference only if it was pending and del_timer() returned nonzero. */
2061 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2062 {
2063         if (timer_pending(timer) && del_timer(timer))
2064                 __sock_put(sk);  /* counterpart of the sock_hold() in sk_reset_timer() */
2065 }
2066 EXPORT_SYMBOL(sk_stop_timer);
2067 /* Set the generic fields of sk to defaults and, if sock is non-NULL, link the two. */
2068 void sock_init_data(struct socket *sock, struct sock *sk)
2069 {
2070         skb_queue_head_init(&sk->sk_receive_queue);
2071         skb_queue_head_init(&sk->sk_write_queue);
2072         skb_queue_head_init(&sk->sk_error_queue);
2073 #ifdef CONFIG_NET_DMA
2074         skb_queue_head_init(&sk->sk_async_wait_queue);
2075 #endif
2076
2077         sk->sk_send_head        =       NULL;
2078
2079         init_timer(&sk->sk_timer);
2080         /* defaults: GFP_KERNEL allocations, sysctl buffer sizes, state TCP_CLOSE */
2081         sk->sk_allocation       =       GFP_KERNEL;
2082         sk->sk_rcvbuf           =       sysctl_rmem_default;
2083         sk->sk_sndbuf           =       sysctl_wmem_default;
2084         sk->sk_state            =       TCP_CLOSE;
2085         sk_set_socket(sk, sock);
2086
2087         sock_set_flag(sk, SOCK_ZAPPED);
2088         /* with a struct socket: copy its type and wait queue and point it back at sk */
2089         if (sock) {
2090                 sk->sk_type     =       sock->type;
2091                 sk->sk_wq       =       sock->wq;
2092                 sock->sk        =       sk;
2093         } else
2094                 sk->sk_wq       =       NULL;
2095         /* the callback lock gets a lockdep class/name selected by sk->sk_family */
2096         spin_lock_init(&sk->sk_dst_lock);
2097         rwlock_init(&sk->sk_callback_lock);
2098         lockdep_set_class_and_name(&sk->sk_callback_lock,
2099                         af_callback_keys + sk->sk_family,
2100                         af_family_clock_key_strings[sk->sk_family]);
2101         /* install this file's sock_def_* functions as the callbacks */
2102         sk->sk_state_change     =       sock_def_wakeup;
2103         sk->sk_data_ready       =       sock_def_readable;
2104         sk->sk_write_space      =       sock_def_write_space;
2105         sk->sk_error_report     =       sock_def_error_report;
2106         sk->sk_destruct         =       sock_def_destruct;
2107
2108         sk->sk_sndmsg_page      =       NULL;
2109         sk->sk_sndmsg_off       =       0;
2110         sk->sk_peek_off         =       -1;
2111
2112         sk->sk_peer_pid         =       NULL;
2113         sk->sk_peer_cred        =       NULL;
2114         sk->sk_write_pending    =       0;
2115         sk->sk_rcvlowat         =       1;
2116         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2117         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2118         /* seconds == -1 is the value sock_get_timestamp() answers with -ENOENT */
2119         sk->sk_stamp = ktime_set(-1L, 0);
2120
2121         /*
2122          * Before updating sk_refcnt, we must commit prior changes to memory
2123          * (Documentation/RCU/rculist_nulls.txt for details)
2124          */
2125         smp_wmb();
2126         atomic_set(&sk->sk_refcnt, 1);
2127         atomic_set(&sk->sk_drops, 0);
2128 }
2129 EXPORT_SYMBOL(sock_init_data);
2130 /* Become the owner of the socket lock, sleeping in __lock_sock() if it is owned. */
2131 void lock_sock_nested(struct sock *sk, int subclass)
2132 {
2133         might_sleep();
2134         spin_lock_bh(&sk->sk_lock.slock);
2135         if (sk->sk_lock.owned)
2136                 __lock_sock(sk);
2137         sk->sk_lock.owned = 1;
2138         spin_unlock(&sk->sk_lock.slock);  /* plain unlock: BHs are re-enabled separately below */
2139         /*
2140          * The sk_lock has mutex_lock() semantics here:
2141          */
2142         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2143         local_bh_enable();
2144 }
2145 EXPORT_SYMBOL(lock_sock_nested);
2146 /* Give up ownership: run the backlog if one is queued, clear owned, wake lock waiters. */
2147 void release_sock(struct sock *sk)
2148 {
2149         /*
2150          * The sk_lock has mutex_unlock() semantics:
2151          */
2152         mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2153
2154         spin_lock_bh(&sk->sk_lock.slock);
2155         if (sk->sk_backlog.tail)
2156                 __release_sock(sk);  /* runs before owned is cleared */
2157         sk->sk_lock.owned = 0;
2158         if (waitqueue_active(&sk->sk_lock.wq))
2159                 wake_up(&sk->sk_lock.wq);
2160         spin_unlock_bh(&sk->sk_lock.slock);
2161 }
2162 EXPORT_SYMBOL(release_sock);
2163
2164 /**
2165  * lock_sock_fast - fast version of lock_sock
2166  * @sk: socket
2167  *
2168  * This version should be used for very small sections, where the process won't block
2169  * return false if fast path is taken
2170  *   sk_lock.slock locked, owned = 0, BH disabled
2171  * return true if slow path is taken
2172  *   sk_lock.slock unlocked, owned = 1, BH enabled
2173  */
2174 bool lock_sock_fast(struct sock *sk)
2175 {
2176         might_sleep();
2177         spin_lock_bh(&sk->sk_lock.slock);
2178
2179         if (!sk->sk_lock.owned)
2180                 /*
2181                  * Fast path: return with slock still held and BH still disabled
2182                  */
2183                 return false;
2184         /* slow path: same sequence as lock_sock_nested() with subclass 0 */
2185         __lock_sock(sk);
2186         sk->sk_lock.owned = 1;
2187         spin_unlock(&sk->sk_lock.slock);
2188         /*
2189          * The sk_lock has mutex_lock() semantics here:
2190          */
2191         mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2192         local_bh_enable();
2193         return true;
2194 }
2195 EXPORT_SYMBOL(lock_sock_fast);
2196 /* Copy sk->sk_stamp to user space as a struct timeval; 0, -ENOENT or -EFAULT. */
2197 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2198 {
2199         struct timeval tv;
2200         if (!sock_flag(sk, SOCK_TIMESTAMP))
2201                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2202         tv = ktime_to_timeval(sk->sk_stamp);
2203         if (tv.tv_sec == -1)
2204                 return -ENOENT;  /* -1 seconds is what sock_init_data() stores */
2205         if (tv.tv_sec == 0) {  /* zero seconds: store and report the current real time */
2206                 sk->sk_stamp = ktime_get_real();
2207                 tv = ktime_to_timeval(sk->sk_stamp);
2208         }
2209         return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2210 }
2211 EXPORT_SYMBOL(sock_get_timestamp);
2212 /* Same as sock_get_timestamp() but reports a struct timespec. */
2213 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2214 {
2215         struct timespec ts;
2216         if (!sock_flag(sk, SOCK_TIMESTAMP))
2217                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2218         ts = ktime_to_timespec(sk->sk_stamp);
2219         if (ts.tv_sec == -1)
2220                 return -ENOENT;  /* -1 seconds is what sock_init_data() stores */
2221         if (ts.tv_sec == 0) {  /* zero seconds: store and report the current real time */
2222                 sk->sk_stamp = ktime_get_real();
2223                 ts = ktime_to_timespec(sk->sk_stamp);
2224         }
2225         return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2226 }
2227 EXPORT_SYMBOL(sock_get_timestampns);
2228 /* Set flag on sk; call net_enable_timestamp() if no SK_FLAGS_TIMESTAMP bit was set before. */
2229 void sock_enable_timestamp(struct sock *sk, int flag)
2230 {
2231         if (!sock_flag(sk, flag)) {
2232                 unsigned long previous_flags = sk->sk_flags;  /* snapshot taken before flag is set */
2233
2234                 sock_set_flag(sk, flag);
2235                 /*
2236                  * we just set one of the two flags which require net
2237                  * time stamping, but time stamping might have been on
2238                  * already because of the other one
2239                  */
2240                 if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2241                         net_enable_timestamp();
2242         }
2243 }
2244
2245 /*
2246  *      Get a socket option on a socket.
2247  *
2248  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
2249  *      asynchronous errors should be reported by getsockopt. We assume
2250  *      this means if you specify SO_ERROR (otherwise what's the point of it).
2251  */
2252 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2253                            char __user *optval, int __user *optlen)
2254 {
2255         struct sock *sk = sock->sk;
2256         /* pure dispatch to the protocol's getsockopt method; its result is returned as is */
2257         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2258 }
2259 EXPORT_SYMBOL(sock_common_getsockopt);
2260 /* CONFIG_COMPAT variant: use compat_getsockopt when the protocol provides one. */
2261 #ifdef CONFIG_COMPAT
2262 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2263                                   char __user *optval, int __user *optlen)
2264 {
2265         struct sock *sk = sock->sk;
2266         /* without a compat method, fall back to the regular getsockopt */
2267         if (sk->sk_prot->compat_getsockopt != NULL)
2268                 return sk->sk_prot->compat_getsockopt(sk, level, optname,
2269                                                       optval, optlen);
2270         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2271 }
2272 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2273 #endif
2274 /* Dispatch to the protocol's recvmsg; on success store the address length in msg. */
2275 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2276                         struct msghdr *msg, size_t size, int flags)
2277 {
2278         struct sock *sk = sock->sk;
2279         int addr_len = 0;
2280         int err;
2281         /* the MSG_DONTWAIT bit and the remaining flags are passed as separate arguments */
2282         err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2283                                    flags & ~MSG_DONTWAIT, &addr_len);
2284         if (err >= 0)
2285                 msg->msg_namelen = addr_len;  /* left untouched when err is negative */
2286         return err;
2287 }
2288 EXPORT_SYMBOL(sock_common_recvmsg);
2289
2290 /*
2291  *      Set socket options on an inet socket.
2292  */
2293 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2294                            char __user *optval, unsigned int optlen)
2295 {
2296         struct sock *sk = sock->sk;
2297         /* pure dispatch to the protocol's setsockopt method; its result is returned as is */
2298         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2299 }
2300 EXPORT_SYMBOL(sock_common_setsockopt);
2301 /* CONFIG_COMPAT variant: use compat_setsockopt when the protocol provides one. */
2302 #ifdef CONFIG_COMPAT
2303 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2304                                   char __user *optval, unsigned int optlen)
2305 {
2306         struct sock *sk = sock->sk;
2307         /* without a compat method, fall back to the regular setsockopt */
2308         if (sk->sk_prot->compat_setsockopt != NULL)
2309                 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2310                                                       optval, optlen);
2311         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2312 }
2313 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2314 #endif
2315 /* Common release: optional destroy hook, unhash, orphan, then drop one reference. */
2316 void sk_common_release(struct sock *sk)
2317 {
2318         if (sk->sk_prot->destroy)
2319                 sk->sk_prot->destroy(sk);
2320
2321         /*
2322          * Observation: when sk_common_release() is called, processes have
2323          * no access to the socket, but the network stack still has.
2324          * Step one, detach it from networking:
2325          *
2326          * A. Remove from hash tables.
2327          */
2328
2329         sk->sk_prot->unhash(sk);
2330
2331         /*
2332          * At this point the socket cannot receive new packets, but it is possible
2333          * that some packets are in flight because some CPU runs the receiver and
2334          * did the hash table lookup before we unhashed the socket. They will reach
2335          * the receive queue and will be purged by the socket destructor.
2336          *
2337          * Also we still have packets pending on the receive queue and, probably,
2338          * our own packets waiting in device queues. sock_destroy will drain the
2339          * receive queue, but transmitted packets will delay socket destruction
2340          * until the last reference is released.
2341          */
2342
2343         sock_orphan(sk);
2344
2345         xfrm_sk_free_policy(sk);
2346
2347         sk_refcnt_debug_release(sk);
2348         sock_put(sk);
2349 }
2350 EXPORT_SYMBOL(sk_common_release);
2351 /* Per-protocol "in use" counters (CONFIG_PROC_FS only), indexed by proto->inuse_idx. */
2352 #ifdef CONFIG_PROC_FS
2353 #define PROTO_INUSE_NR  64      /* should be enough for the first time */
2354 struct prot_inuse {
2355         int val[PROTO_INUSE_NR];
2356 };
2357 /* bitmap of inuse_idx values handed out by assign_proto_idx() */
2358 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2359 /* CONFIG_NET_NS: add val to this CPU's slot for prot in net->core.inuse. */
2360 #ifdef CONFIG_NET_NS
2361 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2362 {
2363         __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2364 }
2365 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2366 /* Sum prot's counter over all possible CPUs for net; a negative sum is reported as 0. */
2367 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2368 {
2369         int cpu, idx = prot->inuse_idx;
2370         int res = 0;
2371
2372         for_each_possible_cpu(cpu)
2373                 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2374
2375         return res >= 0 ? res : 0;
2376 }
2377 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2378 /* pernet init: allocate the per-cpu counter table; -ENOMEM if that fails. */
2379 static int __net_init sock_inuse_init_net(struct net *net)
2380 {
2381         net->core.inuse = alloc_percpu(struct prot_inuse);
2382         return net->core.inuse ? 0 : -ENOMEM;
2383 }
2384 /* pernet exit: free the per-cpu table allocated by sock_inuse_init_net(). */
2385 static void __net_exit sock_inuse_exit_net(struct net *net)
2386 {
2387         free_percpu(net->core.inuse);
2388 }
2389 /* init/exit hooks registered by net_inuse_init() via register_pernet_subsys() */
2390 static struct pernet_operations net_inuse_ops = {
2391         .init = sock_inuse_init_net,
2392         .exit = sock_inuse_exit_net,
2393 };
2394 /* core_initcall: register net_inuse_ops; a registration failure panics. */
2395 static __init int net_inuse_init(void)
2396 {
2397         if (register_pernet_subsys(&net_inuse_ops))
2398                 panic("Cannot initialize net inuse counters");
2399
2400         return 0;
2401 }
2402
2403 core_initcall(net_inuse_init);
2404 #else /* !CONFIG_NET_NS: one global per-cpu table, the net argument is unused */
2405 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2406 /* add val to this CPU's slot for prot */
2407 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2408 {
2409         __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2410 }
2411 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2412 /* Sum prot's counter over all possible CPUs; a negative sum is reported as 0. */
2413 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2414 {
2415         int cpu, idx = prot->inuse_idx;
2416         int res = 0;
2417
2418         for_each_possible_cpu(cpu)
2419                 res += per_cpu(prot_inuse, cpu).val[idx];
2420
2421         return res >= 0 ? res : 0;
2422 }
2423 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2424 #endif /* CONFIG_NET_NS */
2425 /* Give prot the first clear index in proto_inuse_idx and mark it used. */
2426 static void assign_proto_idx(struct proto *prot)
2427 {
2428         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2429         /* the last index means "exhausted": it is logged and never marked in the bitmap */
2430         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2431                 pr_err("PROTO_INUSE_NR exhausted\n");
2432                 return;
2433         }
2434
2435         set_bit(prot->inuse_idx, proto_inuse_idx);
2436 }
2437 /* Clear prot's bit in proto_inuse_idx, except for the never-marked last index. */
2438 static void release_proto_idx(struct proto *prot)
2439 {
2440         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2441                 clear_bit(prot->inuse_idx, proto_inuse_idx);
2442 }
2443 #else /* !CONFIG_PROC_FS: index bookkeeping compiles to empty inline stubs */
2444 static inline void assign_proto_idx(struct proto *prot)
2445 {
2446 }
2447
2448 static inline void release_proto_idx(struct proto *prot)
2449 {
2450 }
2451 #endif /* CONFIG_PROC_FS */
2452 /* Add prot to proto_list, first creating its slab caches if alloc_slab; 0 or -ENOBUFS. */
2453 int proto_register(struct proto *prot, int alloc_slab)
2454 {
2455         if (alloc_slab) {
2456                 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2457                                         SLAB_HWCACHE_ALIGN | prot->slab_flags,
2458                                         NULL);
2459
2460                 if (prot->slab == NULL) {
2461                         pr_crit("%s: Can't create sock SLAB cache!\n",
2462                                 prot->name);
2463                         goto out;
2464                 }
2465                 /* optional request-sock cache, named "request_sock_<prot->name>" */
2466                 if (prot->rsk_prot != NULL) {
2467                         prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2468                         if (prot->rsk_prot->slab_name == NULL)
2469                                 goto out_free_sock_slab;
2470
2471                         prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2472                                                                  prot->rsk_prot->obj_size, 0,
2473                                                                  SLAB_HWCACHE_ALIGN, NULL);
2474
2475                         if (prot->rsk_prot->slab == NULL) {
2476                                 pr_crit("%s: Can't create request sock SLAB cache!\n",
2477                                         prot->name);
2478                                 goto out_free_request_sock_slab_name;
2479                         }
2480                 }
2481                 /* optional timewait-sock cache, named "tw_sock_<prot->name>" */
2482                 if (prot->twsk_prot != NULL) {
2483                         prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2484
2485                         if (prot->twsk_prot->twsk_slab_name == NULL)
2486                                 goto out_free_request_sock_slab;
2487
2488                         prot->twsk_prot->twsk_slab =
2489                                 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2490                                                   prot->twsk_prot->twsk_obj_size,
2491                                                   0,
2492                                                   SLAB_HWCACHE_ALIGN |
2493                                                         prot->slab_flags,
2494                                                   NULL);
2495                         if (prot->twsk_prot->twsk_slab == NULL)
2496                                 goto out_free_timewait_sock_slab_name;
2497                 }
2498         }
2499         /* list insertion and assign_proto_idx() both happen under proto_list_mutex */
2500         mutex_lock(&proto_list_mutex);
2501         list_add(&prot->node, &proto_list);
2502         assign_proto_idx(prot);
2503         mutex_unlock(&proto_list_mutex);
2504         return 0;
2505         /* error unwinding: each label falls through to the ones below it */
2506 out_free_timewait_sock_slab_name:
2507         kfree(prot->twsk_prot->twsk_slab_name);
2508 out_free_request_sock_slab:
2509         if (prot->rsk_prot && prot->rsk_prot->slab) {
2510                 kmem_cache_destroy(prot->rsk_prot->slab);
2511                 prot->rsk_prot->slab = NULL;
2512         }
2513 out_free_request_sock_slab_name:
2514         if (prot->rsk_prot)
2515                 kfree(prot->rsk_prot->slab_name);
2516 out_free_sock_slab:
2517         kmem_cache_destroy(prot->slab);
2518         prot->slab = NULL;
2519 out:
2520         return -ENOBUFS;
2521 }
2522 EXPORT_SYMBOL(proto_register);
2523
2524 void proto_unregister(struct proto *prot)
2525 {
2526         mutex_lock(&proto_list_mutex);
2527         release_proto_idx(prot);
2528         list_del(&prot->node);
2529         mutex_unlock(&proto_list_mutex);
2530
2531         if (prot->slab != NULL) {
2532                 kmem_cache_destroy(prot->slab);
2533                 prot->slab = NULL;
2534         }
2535
2536         if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2537                 kmem_cache_destroy(prot->rsk_prot->slab);
2538                 kfree(prot->rsk_prot->slab_name);
2539                 prot->rsk_prot->slab = NULL;
2540         }
2541
2542         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2543                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2544                 kfree(prot->twsk_prot->twsk_slab_name);
2545                 prot->twsk_prot->twsk_slab = NULL;
2546         }
2547 }
2548 EXPORT_SYMBOL(proto_unregister);
2549
2550 #ifdef CONFIG_PROC_FS
2551 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2552         __acquires(proto_list_mutex)
2553 {
2554         mutex_lock(&proto_list_mutex);
2555         return seq_list_start_head(&proto_list, *pos);
2556 }
2557
2558 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2559 {
2560         return seq_list_next(v, &proto_list, pos);
2561 }
2562
2563 static void proto_seq_stop(struct seq_file *seq, void *v)
2564         __releases(proto_list_mutex)
2565 {
2566         mutex_unlock(&proto_list_mutex);
2567 }
2568
/* Return 'y' when @method is a non-NULL pointer and 'n' when it is NULL. */
static char proto_method_implemented(const void *method)
{
	if (method != NULL)
		return 'y';

	return 'n';
}
2573 static long sock_prot_memory_allocated(struct proto *proto)
2574 {
2575         return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2576 }
2577
2578 static char *sock_prot_memory_pressure(struct proto *proto)
2579 {
2580         return proto->memory_pressure != NULL ?
2581         proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2582 }
2583
2584 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2585 {
2586
2587         seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2588                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2589                    proto->name,
2590                    proto->obj_size,
2591                    sock_prot_inuse_get(seq_file_net(seq), proto),
2592                    sock_prot_memory_allocated(proto),
2593                    sock_prot_memory_pressure(proto),
2594                    proto->max_header,
2595                    proto->slab == NULL ? "no" : "yes",
2596                    module_name(proto->owner),
2597                    proto_method_implemented(proto->close),
2598                    proto_method_implemented(proto->connect),
2599                    proto_method_implemented(proto->disconnect),
2600                    proto_method_implemented(proto->accept),
2601                    proto_method_implemented(proto->ioctl),
2602                    proto_method_implemented(proto->init),
2603                    proto_method_implemented(proto->destroy),
2604                    proto_method_implemented(proto->shutdown),
2605                    proto_method_implemented(proto->setsockopt),
2606                    proto_method_implemented(proto->getsockopt),
2607                    proto_method_implemented(proto->sendmsg),
2608                    proto_method_implemented(proto->recvmsg),
2609                    proto_method_implemented(proto->sendpage),
2610                    proto_method_implemented(proto->bind),
2611                    proto_method_implemented(proto->backlog_rcv),
2612                    proto_method_implemented(proto->hash),
2613                    proto_method_implemented(proto->unhash),
2614                    proto_method_implemented(proto->get_port),
2615                    proto_method_implemented(proto->enter_memory_pressure));
2616 }
2617
2618 static int proto_seq_show(struct seq_file *seq, void *v)
2619 {
2620         if (v == &proto_list)
2621                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2622                            "protocol",
2623                            "size",
2624                            "sockets",
2625                            "memory",
2626                            "press",
2627                            "maxhdr",
2628                            "slab",
2629                            "module",
2630                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2631         else
2632                 proto_seq_printf(seq, list_entry(v, struct proto, node));
2633         return 0;
2634 }
2635
2636 static const struct seq_operations proto_seq_ops = {
2637         .start  = proto_seq_start,
2638         .next   = proto_seq_next,
2639         .stop   = proto_seq_stop,
2640         .show   = proto_seq_show,
2641 };
2642
2643 static int proto_seq_open(struct inode *inode, struct file *file)
2644 {
2645         return seq_open_net(inode, file, &proto_seq_ops,
2646                             sizeof(struct seq_net_private));
2647 }
2648
2649 static const struct file_operations proto_seq_fops = {
2650         .owner          = THIS_MODULE,
2651         .open           = proto_seq_open,
2652         .read           = seq_read,
2653         .llseek         = seq_lseek,
2654         .release        = seq_release_net,
2655 };
2656
2657 static __net_init int proto_init_net(struct net *net)
2658 {
2659         if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2660                 return -ENOMEM;
2661
2662         return 0;
2663 }
2664
2665 static __net_exit void proto_exit_net(struct net *net)
2666 {
2667         proc_net_remove(net, "protocols");
2668 }
2669
2670
2671 static __net_initdata struct pernet_operations proto_net_ops = {
2672         .init = proto_init_net,
2673         .exit = proto_exit_net,
2674 };
2675
2676 static int __init proto_init(void)
2677 {
2678         return register_pernet_subsys(&proto_net_ops);
2679 }
2680
2681 subsys_initcall(proto_init);
2682
2683 #endif /* PROC_FS */