net/core/sock.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Generic socket support routines. Memory allocators, socket lock/release
   7  *              handler for protocols to use and generic option handler.
   8  *
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Florian La Roche, <flla@stud.uni-sb.de>
  13  *              Alan Cox, <A.Cox@swansea.ac.uk>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  *
  85  *
  86  *              This program is free software; you can redistribute it and/or
  87  *              modify it under the terms of the GNU General Public License
  88  *              as published by the Free Software Foundation; either version
  89  *              2 of the License, or (at your option) any later version.
  90  */
  91
  92 #include <linux/capability.h>
  93 #include <linux/errno.h>
  94 #include <linux/types.h>
  95 #include <linux/socket.h>
  96 #include <linux/in.h>
  97 #include <linux/kernel.h>
  98 #include <linux/module.h>
  99 #include <linux/proc_fs.h>
 100 #include <linux/seq_file.h>
 101 #include <linux/sched.h>
 102 #include <linux/timer.h>
 103 #include <linux/string.h>
 104 #include <linux/sockios.h>
 105 #include <linux/net.h>
 106 #include <linux/mm.h>
 107 #include <linux/slab.h>
 108 #include <linux/interrupt.h>
 109 #include <linux/poll.h>
 110 #include <linux/tcp.h>
 111 #include <linux/init.h>
 112 #include <linux/highmem.h>
 113
 114 #include <asm/uaccess.h>
 115 #include <asm/system.h>
 116
 117 #include <linux/netdevice.h>
 118 #include <net/protocol.h>
 119 #include <linux/skbuff.h>
 120 #include <net/net_namespace.h>
 121 #include <net/request_sock.h>
 122 #include <net/sock.h>
 123 #include <linux/net_tstamp.h>
 124 #include <net/xfrm.h>
 125 #include <linux/ipsec.h>
 126 #include <net/cls_cgroup.h>
 127
 128 #include <linux/filter.h>
 129
 130 #ifdef CONFIG_INET
 131 #include <net/tcp.h>
 132 #endif
 133
 134 /*
 135  * Each address family might have different locking rules, so we have
 136  * one slock key per address family:
 137  */
 138 static struct lock_class_key af_family_keys[AF_MAX];
 139 static struct lock_class_key af_family_slock_keys[AF_MAX];
 140
 141 /*
 142  * Make lock validator output more readable. (we pre-construct these
 143  * strings build-time, so that runtime initialization of socket
 144  * locks is fast):
 145  */
 146 static const char *const af_family_key_strings[AF_MAX+1] = {
 147   "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
 148   "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
 149   "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
 150   "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
 151   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
 152   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
 153   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
 154   "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
 155   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
 156   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
 157   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
 158   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
 159   "sk_lock-AF_IEEE802154",
 160   "sk_lock-AF_MAX"
 161 };
 162 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 163   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
 164   "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
 165   "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
 166   "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
 167   "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
 168   "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
 169   "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
 170   "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
 171   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
 172   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
 173   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
 174   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
 175   "slock-AF_IEEE802154",
 176   "slock-AF_MAX"
 177 };
 178 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 179   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
 180   "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
 181   "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
 182   "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
 183   "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
 184   "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
 185   "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
 186   "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
 187   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
 188   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
 189   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
 190   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
 191   "clock-AF_IEEE802154",
 192   "clock-AF_MAX"
 193 };
 194
 195 /*
 196  * sk_callback_lock locking rules are per-address-family,
 197  * so split the lock classes by using a per-AF key:
 198  */
 199 static struct lock_class_key af_callback_keys[AF_MAX];
 200
 201 /* Take into consideration the size of the struct sk_buff overhead in the
 202  * determination of these values, since that is non-constant across
 203  * platforms.  This makes socket queueing behavior and performance
 204  * not depend upon such differences.
 205  */
 206 #define _SK_MEM_PACKETS         256
 207 #define _SK_MEM_OVERHEAD        (sizeof(struct sk_buff) + 256)
 208 #define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 209 #define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 210
 211 /* Run time adjustable parameters. */
 212 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 213 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 214 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 215 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 216
 217 /* Maximal space eaten by iovec or ancilliary data plus some space */
 218 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 219 EXPORT_SYMBOL(sysctl_optmem_max);
 220
 221 #if defined(CONFIG_CGROUPS) && !defined(CONFIG_NET_CLS_CGROUP)
 222 int net_cls_subsys_id = -1;
 223 EXPORT_SYMBOL_GPL(net_cls_subsys_id);
 224 #endif
 225
 226 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 227 {
 228         struct timeval tv;
 229
 230         if (optlen < sizeof(tv))
 231                 return -EINVAL;
 232         if (copy_from_user(&tv, optval, sizeof(tv)))
 233                 return -EFAULT;
 234         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 235                 return -EDOM;
 236
 237         if (tv.tv_sec < 0) {
 238                 static int warned __read_mostly;
 239
 240                 *timeo_p = 0;
 241                 if (warned < 10 && net_ratelimit()) {
 242                         warned++;
 243                         printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
 244                                "tries to set negative timeout\n",
 245                                 current->comm, task_pid_nr(current));
 246                 }
 247                 return 0;
 248         }
 249         *timeo_p = MAX_SCHEDULE_TIMEOUT;
 250         if (tv.tv_sec == 0 && tv.tv_usec == 0)
 251                 return 0;
 252         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 253                 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
 254         return 0;
 255 }
 256
 257 static void sock_warn_obsolete_bsdism(const char *name)
 258 {
 259         static int warned;
 260         static char warncomm[TASK_COMM_LEN];
 261         if (strcmp(warncomm, current->comm) && warned < 5) {
 262                 strcpy(warncomm,  current->comm);
 263                 printk(KERN_WARNING "process `%s' is using obsolete "
 264                        "%s SO_BSDCOMPAT\n", warncomm, name);
 265                 warned++;
 266         }
 267 }
 268
 269 static void sock_disable_timestamp(struct sock *sk, int flag)
 270 {
 271         if (sock_flag(sk, flag)) {
 272                 sock_reset_flag(sk, flag);
 273                 if (!sock_flag(sk, SOCK_TIMESTAMP) &&
 274                     !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) {
 275                         net_disable_timestamp();
 276                 }
 277         }
 278 }
 279
 280
 281 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 282 {
 283         int err;
 284         int skb_len;
 285         unsigned long flags;
 286         struct sk_buff_head *list = &sk->sk_receive_queue;
 287
 288         /* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
 289            number of warnings when compiling with -W --ANK
 290          */
 291         if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
 292             (unsigned)sk->sk_rcvbuf) {
 293                 atomic_inc(&sk->sk_drops);
 294                 return -ENOMEM;
 295         }
 296
 297         err = sk_filter(sk, skb);
 298         if (err)
 299                 return err;
 300
 301         if (!sk_rmem_schedule(sk, skb->truesize)) {
 302                 atomic_inc(&sk->sk_drops);
 303                 return -ENOBUFS;
 304         }
 305
 306         skb->dev = NULL;
 307         skb_set_owner_r(skb, sk);
 308
 309         /* Cache the SKB length before we tack it onto the receive
 310          * queue.  Once it is added it no longer belongs to us and
 311          * may be freed by other threads of control pulling packets
 312          * from the queue.
 313          */
 314         skb_len = skb->len;
 315
 316         /* we escape from rcu protected region, make sure we dont leak
 317          * a norefcounted dst
 318          */
 319         skb_dst_force(skb);
 320
 321         spin_lock_irqsave(&list->lock, flags);
 322         skb->dropcount = atomic_read(&sk->sk_drops);
 323         __skb_queue_tail(list, skb);
 324         spin_unlock_irqrestore(&list->lock, flags);
 325
 326         if (!sock_flag(sk, SOCK_DEAD))
 327                 sk->sk_data_ready(sk, skb_len);
 328         return 0;
 329 }
 330 EXPORT_SYMBOL(sock_queue_rcv_skb);
 331
 332 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
 333 {
 334         int rc = NET_RX_SUCCESS;
 335
 336         if (sk_filter(sk, skb))
 337                 goto discard_and_relse;
 338
 339         skb->dev = NULL;
 340
 341         if (sk_rcvqueues_full(sk, skb)) {
 342                 atomic_inc(&sk->sk_drops);
 343                 goto discard_and_relse;
 344         }
 345         if (nested)
 346                 bh_lock_sock_nested(sk);
 347         else
 348                 bh_lock_sock(sk);
 349         if (!sock_owned_by_user(sk)) {
 350                 /*
 351                  * trylock + unlock semantics:
 352                  */
 353                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 354
 355                 rc = sk_backlog_rcv(sk, skb);
 356
 357                 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 358         } else if (sk_add_backlog(sk, skb)) {
 359                 bh_unlock_sock(sk);
 360                 atomic_inc(&sk->sk_drops);
 361                 goto discard_and_relse;
 362         }
 363
 364         bh_unlock_sock(sk);
 365 out:
 366         sock_put(sk);
 367         return rc;
 368 discard_and_relse:
 369         kfree_skb(skb);
 370         goto out;
 371 }
 372 EXPORT_SYMBOL(sk_receive_skb);
 373
 374 void sk_reset_txq(struct sock *sk)
 375 {
 376         sk_tx_queue_clear(sk);
 377 }
 378 EXPORT_SYMBOL(sk_reset_txq);
 379
 380 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 381 {
 382         struct dst_entry *dst = __sk_dst_get(sk);
 383
 384         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 385                 sk_tx_queue_clear(sk);
 386                 rcu_assign_pointer(sk->sk_dst_cache, NULL);
 387                 dst_release(dst);
 388                 return NULL;
 389         }
 390
 391         return dst;
 392 }
 393 EXPORT_SYMBOL(__sk_dst_check);
 394
 395 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 396 {
 397         struct dst_entry *dst = sk_dst_get(sk);
 398
 399         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 400                 sk_dst_reset(sk);
 401                 dst_release(dst);
 402                 return NULL;
 403         }
 404
 405         return dst;
 406 }
 407 EXPORT_SYMBOL(sk_dst_check);
 408
 409 static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
 410 {
 411         int ret = -ENOPROTOOPT;
 412 #ifdef CONFIG_NETDEVICES
 413         struct net *net = sock_net(sk);
 414         char devname[IFNAMSIZ];
 415         int index;
 416
 417         /* Sorry... */
 418         ret = -EPERM;
 419         if (!capable(CAP_NET_RAW))
 420                 goto out;
 421
 422         ret = -EINVAL;
 423         if (optlen < 0)
 424                 goto out;
 425
 426         /* Bind this socket to a particular device like "eth0",
 427          * as specified in the passed interface name. If the
 428          * name is "" or the option length is zero the socket
 429          * is not bound.
 430          */
 431         if (optlen > IFNAMSIZ - 1)
 432                 optlen = IFNAMSIZ - 1;
 433         memset(devname, 0, sizeof(devname));
 434
 435         ret = -EFAULT;
 436         if (copy_from_user(devname, optval, optlen))
 437                 goto out;
 438
 439         index = 0;
 440         if (devname[0] != '\0') {
 441                 struct net_device *dev;
 442
 443                 rcu_read_lock();
 444                 dev = dev_get_by_name_rcu(net, devname);
 445                 if (dev)
 446                         index = dev->ifindex;
 447                 rcu_read_unlock();
 448                 ret = -ENODEV;
 449                 if (!dev)
 450                         goto out;
 451         }
 452
 453         lock_sock(sk);
 454         sk->sk_bound_dev_if = index;
 455         sk_dst_reset(sk);
 456         release_sock(sk);
 457
 458         ret = 0;
 459
 460 out:
 461 #endif
 462
 463         return ret;
 464 }
 465
 466 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 467 {
 468         if (valbool)
 469                 sock_set_flag(sk, bit);
 470         else
 471                 sock_reset_flag(sk, bit);
 472 }
 473
 474 /*
 475  *      This is meant for all protocols to use and covers goings on
 476  *      at the socket level. Everything here is generic.
 477  */
 478
 479 int sock_setsockopt(struct socket *sock, int level, int optname,
 480                     char __user *optval, unsigned int optlen)
 481 {
 482         struct sock *sk = sock->sk;
 483         int val;
 484         int valbool;
 485         struct linger ling;
 486         int ret = 0;
 487
 488         /*
 489          *      Options without arguments
 490          */
 491
 492         if (optname == SO_BINDTODEVICE)
 493                 return sock_bindtodevice(sk, optval, optlen);
 494
 495         if (optlen < sizeof(int))
 496                 return -EINVAL;
 497
 498         if (get_user(val, (int __user *)optval))
 499                 return -EFAULT;
 500
 501         valbool = val ? 1 : 0;
 502
 503         lock_sock(sk);
 504
 505         switch (optname) {
 506         case SO_DEBUG:
 507                 if (val && !capable(CAP_NET_ADMIN))
 508                         ret = -EACCES;
 509                 else
 510                         sock_valbool_flag(sk, SOCK_DBG, valbool);
 511                 break;
 512         case SO_REUSEADDR:
 513                 sk->sk_reuse = valbool;
 514                 break;
 515         case SO_TYPE:
 516         case SO_PROTOCOL:
 517         case SO_DOMAIN:
 518         case SO_ERROR:
 519                 ret = -ENOPROTOOPT;
 520                 break;
 521         case SO_DONTROUTE:
 522                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 523                 break;
 524         case SO_BROADCAST:
 525                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 526                 break;
 527         case SO_SNDBUF:
 528                 /* Don't error on this BSD doesn't and if you think
 529                    about it this is right. Otherwise apps have to
 530                    play 'guess the biggest size' games. RCVBUF/SNDBUF
 531                    are treated in BSD as hints */
 532
 533                 if (val > sysctl_wmem_max)
 534                         val = sysctl_wmem_max;
 535 set_sndbuf:
 536                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 537                 if ((val * 2) < SOCK_MIN_SNDBUF)
 538                         sk->sk_sndbuf = SOCK_MIN_SNDBUF;
 539                 else
 540                         sk->sk_sndbuf = val * 2;
 541
 542                 /*
 543                  *      Wake up sending tasks if we
 544                  *      upped the value.
 545                  */
 546                 sk->sk_write_space(sk);
 547                 break;
 548
 549         case SO_SNDBUFFORCE:
 550                 if (!capable(CAP_NET_ADMIN)) {
 551                         ret = -EPERM;
 552                         break;
 553                 }
 554                 goto set_sndbuf;
 555
 556         case SO_RCVBUF:
 557                 /* Don't error on this BSD doesn't and if you think
 558                    about it this is right. Otherwise apps have to
 559                    play 'guess the biggest size' games. RCVBUF/SNDBUF
 560                    are treated in BSD as hints */
 561
 562                 if (val > sysctl_rmem_max)
 563                         val = sysctl_rmem_max;
 564 set_rcvbuf:
 565                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 566                 /*
 567                  * We double it on the way in to account for
 568                  * "struct sk_buff" etc. overhead.   Applications
 569                  * assume that the SO_RCVBUF setting they make will
 570                  * allow that much actual data to be received on that
 571                  * socket.
 572                  *
 573                  * Applications are unaware that "struct sk_buff" and
 574                  * other overheads allocate from the receive buffer
 575                  * during socket buffer allocation.
 576                  *
 577                  * And after considering the possible alternatives,
 578                  * returning the value we actually used in getsockopt
 579                  * is the most desirable behavior.
 580                  */
 581                 if ((val * 2) < SOCK_MIN_RCVBUF)
 582                         sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
 583                 else
 584                         sk->sk_rcvbuf = val * 2;
 585                 break;
 586
 587         case SO_RCVBUFFORCE:
 588                 if (!capable(CAP_NET_ADMIN)) {
 589                         ret = -EPERM;
 590                         break;
 591                 }
 592                 goto set_rcvbuf;
 593
 594         case SO_KEEPALIVE:
 595 #ifdef CONFIG_INET
 596                 if (sk->sk_protocol == IPPROTO_TCP)
 597                         tcp_set_keepalive(sk, valbool);
 598 #endif
 599                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 600                 break;
 601
 602         case SO_OOBINLINE:
 603                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 604                 break;
 605
 606         case SO_NO_CHECK:
 607                 sk->sk_no_check = valbool;
 608                 break;
 609
 610         case SO_PRIORITY:
 611                 if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
 612                         sk->sk_priority = val;
 613                 else
 614                         ret = -EPERM;
 615                 break;
 616
 617         case SO_LINGER:
 618                 if (optlen < sizeof(ling)) {
 619                         ret = -EINVAL;  /* 1003.1g */
 620                         break;
 621                 }
 622                 if (copy_from_user(&ling, optval, sizeof(ling))) {
 623                         ret = -EFAULT;
 624                         break;
 625                 }
 626                 if (!ling.l_onoff)
 627                         sock_reset_flag(sk, SOCK_LINGER);
 628                 else {
 629 #if (BITS_PER_LONG == 32)
 630                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 631                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 632                         else
 633 #endif
 634                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 635                         sock_set_flag(sk, SOCK_LINGER);
 636                 }
 637                 break;
 638
 639         case SO_BSDCOMPAT:
 640                 sock_warn_obsolete_bsdism("setsockopt");
 641                 break;
 642
 643         case SO_PASSCRED:
 644                 if (valbool)
 645                         set_bit(SOCK_PASSCRED, &sock->flags);
 646                 else
 647                         clear_bit(SOCK_PASSCRED, &sock->flags);
 648                 break;
 649
 650         case SO_TIMESTAMP:
 651         case SO_TIMESTAMPNS:
 652                 if (valbool)  {
 653                         if (optname == SO_TIMESTAMP)
 654                                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 655                         else
 656                                 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 657                         sock_set_flag(sk, SOCK_RCVTSTAMP);
 658                         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 659                 } else {
 660                         sock_reset_flag(sk, SOCK_RCVTSTAMP);
 661                         sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 662                 }
 663                 break;
 664
 665         case SO_TIMESTAMPING:
 666                 if (val & ~SOF_TIMESTAMPING_MASK) {
 667                         ret = -EINVAL;
 668                         break;
 669                 }
 670                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
 671                                   val & SOF_TIMESTAMPING_TX_HARDWARE);
 672                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
 673                                   val & SOF_TIMESTAMPING_TX_SOFTWARE);
 674                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
 675                                   val & SOF_TIMESTAMPING_RX_HARDWARE);
 676                 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 677                         sock_enable_timestamp(sk,
 678                                               SOCK_TIMESTAMPING_RX_SOFTWARE);
 679                 else
 680                         sock_disable_timestamp(sk,
 681                                                SOCK_TIMESTAMPING_RX_SOFTWARE);
 682                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
 683                                   val & SOF_TIMESTAMPING_SOFTWARE);
 684                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
 685                                   val & SOF_TIMESTAMPING_SYS_HARDWARE);
 686                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
 687                                   val & SOF_TIMESTAMPING_RAW_HARDWARE);
 688                 break;
 689
 690         case SO_RCVLOWAT:
 691                 if (val < 0)
 692                         val = INT_MAX;
 693                 sk->sk_rcvlowat = val ? : 1;
 694                 break;
 695
 696         case SO_RCVTIMEO:
 697                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 698                 break;
 699
 700         case SO_SNDTIMEO:
 701                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 702                 break;
 703
 704         case SO_ATTACH_FILTER:
 705                 ret = -EINVAL;
 706                 if (optlen == sizeof(struct sock_fprog)) {
 707                         struct sock_fprog fprog;
 708
 709                         ret = -EFAULT;
 710                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 711                                 break;
 712
 713                         ret = sk_attach_filter(&fprog, sk);
 714                 }
 715                 break;
 716
 717         case SO_DETACH_FILTER:
 718                 ret = sk_detach_filter(sk);
 719                 break;
 720
 721         case SO_PASSSEC:
 722                 if (valbool)
 723                         set_bit(SOCK_PASSSEC, &sock->flags);
 724                 else
 725                         clear_bit(SOCK_PASSSEC, &sock->flags);
 726                 break;
 727         case SO_MARK:
 728                 if (!capable(CAP_NET_ADMIN))
 729                         ret = -EPERM;
 730                 else
 731                         sk->sk_mark = val;
 732                 break;
 733
 734                 /* We implement the SO_SNDLOWAT etc to
 735                    not be settable (1003.1g 5.3) */
 736         case SO_RXQ_OVFL:
 737                 if (valbool)
 738                         sock_set_flag(sk, SOCK_RXQ_OVFL);
 739                 else
 740                         sock_reset_flag(sk, SOCK_RXQ_OVFL);
 741                 break;
 742         default:
 743                 ret = -ENOPROTOOPT;
 744                 break;
 745         }
 746         release_sock(sk);
 747         return ret;
 748 }
 749 EXPORT_SYMBOL(sock_setsockopt);
 750
 751
/*
 * sock_getsockopt - copy a generic socket option out to user space
 * @sock:    socket being queried; option state is read from sock->sk
 * @level:   not referenced by this function
 * @optname: SO_* selector of the option to read
 * @optval:  user buffer that receives the option value
 * @optlen:  user pointer to the buffer length; read on entry and rewritten
 *           with the number of bytes stored on success
 *
 * Returns 0 on success, -EFAULT if a user-space access fails, -EINVAL for
 * a negative length (or, for SO_PEERNAME, a length larger than the address
 * returned by getname), -ENOTCONN if SO_PEERNAME cannot obtain the peer
 * address, and -ENOPROTOOPT for an option not handled here.  SO_PEERSEC
 * returns whatever security_socket_getpeersec_stream() returns.
 */
int sock_getsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;

        /* Scratch storage for every value that is copied out via &v below. */
        union {
                int val;
                struct linger ling;
                struct timeval tm;
        } v;

        /* lv: size of the value produced; int unless a case overrides it. */
        int lv = sizeof(int);
        int len;

        if (get_user(len, optlen))
                return -EFAULT;
        if (len < 0)
                return -EINVAL;

        /*
         * Zero the whole union so that cases which store nothing (e.g.
         * SO_BSDCOMPAT) report 0 and no stale stack bytes are copied out.
         */
        memset(&v, 0, sizeof(v));

        switch (optname) {
        case SO_DEBUG:
                v.val = sock_flag(sk, SOCK_DBG);
                break;

        case SO_DONTROUTE:
                v.val = sock_flag(sk, SOCK_LOCALROUTE);
                break;

        case SO_BROADCAST:
                v.val = !!sock_flag(sk, SOCK_BROADCAST);
                break;

        case SO_SNDBUF:
                v.val = sk->sk_sndbuf;
                break;

        case SO_RCVBUF:
                v.val = sk->sk_rcvbuf;
                break;

        case SO_REUSEADDR:
                v.val = sk->sk_reuse;
                break;

        case SO_KEEPALIVE:
                v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
                break;

        case SO_TYPE:
                v.val = sk->sk_type;
                break;

        case SO_PROTOCOL:
                v.val = sk->sk_protocol;
                break;

        case SO_DOMAIN:
                v.val = sk->sk_family;
                break;

        case SO_ERROR:
                /* Report the value from sock_error(); if none, fetch and clear the soft error. */
                v.val = -sock_error(sk);
                if (v.val == 0)
                        v.val = xchg(&sk->sk_err_soft, 0);
                break;

        case SO_OOBINLINE:
                v.val = !!sock_flag(sk, SOCK_URGINLINE);
                break;

        case SO_NO_CHECK:
                v.val = sk->sk_no_check;
                break;

        case SO_PRIORITY:
                v.val = sk->sk_priority;
                break;

        case SO_LINGER:
                lv              = sizeof(v.ling);
                v.ling.l_onoff  = !!sock_flag(sk, SOCK_LINGER);
                /* sk_lingertime is divided by HZ before being reported. */
                v.ling.l_linger = sk->sk_lingertime / HZ;
                break;

        case SO_BSDCOMPAT:
                /* Only warns; the zeroed v.val is what gets returned. */
                sock_warn_obsolete_bsdism("getsockopt");
                break;

        case SO_TIMESTAMP:
                /* Reported as set only when the NS variant is not selected. */
                v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
                                !sock_flag(sk, SOCK_RCVTSTAMPNS);
                break;

        case SO_TIMESTAMPNS:
                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
                break;

        case SO_TIMESTAMPING:
                /* Rebuild the SOF_TIMESTAMPING_* mask from the socket flags. */
                v.val = 0;
                if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
                        v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
                if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
                        v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
                if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
                        v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
                if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
                        v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
                if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
                        v.val |= SOF_TIMESTAMPING_SOFTWARE;
                if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
                        v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
                if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
                        v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
                break;

        case SO_RCVTIMEO:
                lv = sizeof(struct timeval);
                /* MAX_SCHEDULE_TIMEOUT is reported as a zero timeval. */
                if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
                        v.tm.tv_sec = 0;
                        v.tm.tv_usec = 0;
                } else {
                        v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
                        v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
                }
                break;

        case SO_SNDTIMEO:
                lv = sizeof(struct timeval);
                /* MAX_SCHEDULE_TIMEOUT is reported as a zero timeval. */
                if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
                        v.tm.tv_sec = 0;
                        v.tm.tv_usec = 0;
                } else {
                        v.tm.tv_sec = sk->sk_sndtimeo / HZ;
                        v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
                }
                break;

        case SO_RCVLOWAT:
                v.val = sk->sk_rcvlowat;
                break;

        case SO_SNDLOWAT:
                /* Always reported as 1. */
                v.val = 1;
                break;

        case SO_PASSCRED:
                v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
                break;

        case SO_PEERCRED:
                /* Copied straight from the sock, truncated to the user's length. */
                if (len > sizeof(sk->sk_peercred))
                        len = sizeof(sk->sk_peercred);
                if (copy_to_user(optval, &sk->sk_peercred, len))
                        return -EFAULT;
                goto lenout;

        case SO_PEERNAME:
        {
                char address[128];

                /* getname() stores the address length in lv. */
                if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
                        return -ENOTCONN;
                /* A user length larger than the address is rejected. */
                if (lv < len)
                        return -EINVAL;
                if (copy_to_user(optval, address, len))
                        return -EFAULT;
                goto lenout;
        }

        /* Dubious BSD thing... Probably nobody even uses it, but
         * the UNIX standard wants it for whatever reason... -DaveM
         */
        case SO_ACCEPTCONN:
                v.val = sk->sk_state == TCP_LISTEN;
                break;

        case SO_PASSSEC:
                v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
                break;

        case SO_PEERSEC:
                /* Delegated entirely; the callee is handed optval and optlen. */
                return security_socket_getpeersec_stream(sock, optval, optlen, len);

        case SO_MARK:
                v.val = sk->sk_mark;
                break;

        case SO_RXQ_OVFL:
                v.val = !!sock_flag(sk, SOCK_RXQ_OVFL);
                break;

        default:
                return -ENOPROTOOPT;
        }

        /* Never copy more than the option's size; shorter reads truncate. */
        if (len > lv)
                len = lv;
        if (copy_to_user(optval, &v, len))
                return -EFAULT;
lenout:
        /* Tell the caller how many bytes were actually stored. */
        if (put_user(len, optlen))
                return -EFAULT;
        return 0;
}
 958
 959 /*
 960  * Initialize an sk_lock.
 961  *
 962  * (We also register the sk_lock with the lock validator.)
 963  */
 964 static inline void sock_lock_init(struct sock *sk)
 965 {
 966         sock_lock_init_class_and_name(sk,
 967                         af_family_slock_key_strings[sk->sk_family],
 968                         af_family_slock_keys + sk->sk_family,
 969                         af_family_key_strings[sk->sk_family],
 970                         af_family_keys + sk->sk_family);
 971 }
 972
 973 /*
 974  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 975  * even temporarly, because of RCU lookups. sk_node should also be left as is.
 976  */
 977 static void sock_copy(struct sock *nsk, const struct sock *osk)
 978 {
 979 #ifdef CONFIG_SECURITY_NETWORK
 980         void *sptr = nsk->sk_security;
 981 #endif
 982         BUILD_BUG_ON(offsetof(struct sock, sk_copy_start) !=
 983                      sizeof(osk->sk_node) + sizeof(osk->sk_refcnt) +
 984                      sizeof(osk->sk_tx_queue_mapping));
 985         memcpy(&nsk->sk_copy_start, &osk->sk_copy_start,
 986                osk->sk_prot->obj_size - offsetof(struct sock, sk_copy_start));
 987 #ifdef CONFIG_SECURITY_NETWORK
 988         nsk->sk_security = sptr;
 989         security_sk_clone(osk, nsk);
 990 #endif
 991 }
 992
 993 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
 994                 int family)
 995 {
 996         struct sock *sk;
 997         struct kmem_cache *slab;
 998
 999         slab = prot->slab;
1000         if (slab != NULL) {
1001                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1002                 if (!sk)
1003                         return sk;
1004                 if (priority & __GFP_ZERO) {
1005                         /*
1006                          * caches using SLAB_DESTROY_BY_RCU should let
1007                          * sk_node.next un-modified. Special care is taken
1008                          * when initializing object to zero.
1009                          */
1010                         if (offsetof(struct sock, sk_node.next) != 0)
1011                                 memset(sk, 0, offsetof(struct sock, sk_node.next));
1012                         memset(&sk->sk_node.pprev, 0,
1013                                prot->obj_size - offsetof(struct sock,
1014                                                          sk_node.pprev));
1015                 }
1016         }
1017         else
1018                 sk = kmalloc(prot->obj_size, priority);
1019
1020         if (sk != NULL) {
1021                 kmemcheck_annotate_bitfield(sk, flags);
1022
1023                 if (security_sk_alloc(sk, family, priority))
1024                         goto out_free;
1025
1026                 if (!try_module_get(prot->owner))
1027                         goto out_free_sec;
1028                 sk_tx_queue_clear(sk);
1029         }
1030
1031         return sk;
1032
1033 out_free_sec:
1034         security_sk_free(sk);
1035 out_free:
1036         if (slab != NULL)
1037                 kmem_cache_free(slab, sk);
1038         else
1039                 kfree(sk);
1040         return NULL;
1041 }
1042
1043 static void sk_prot_free(struct proto *prot, struct sock *sk)
1044 {
1045         struct kmem_cache *slab;
1046         struct module *owner;
1047
1048         owner = prot->owner;
1049         slab = prot->slab;
1050
1051         security_sk_free(sk);
1052         if (slab != NULL)
1053                 kmem_cache_free(slab, sk);
1054         else
1055                 kfree(sk);
1056         module_put(owner);
1057 }
1058
1059 #ifdef CONFIG_CGROUPS
1060 void sock_update_classid(struct sock *sk)
1061 {
1062         u32 classid = task_cls_classid(current);
1063
1064         if (classid && classid != sk->sk_classid)
1065                 sk->sk_classid = classid;
1066 }
1067 EXPORT_SYMBOL(sock_update_classid);
1068 #endif
1069
1070 /**
1071  *      sk_alloc - All socket objects are allocated here
1072  *      @net: the applicable net namespace
1073  *      @family: protocol family
1074  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1075  *      @prot: struct proto associated with this new sock instance
1076  */
1077 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1078                       struct proto *prot)
1079 {
1080         struct sock *sk;
1081
1082         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1083         if (sk) {
1084                 sk->sk_family = family;
1085                 /*
1086                  * See comment in struct sock definition to understand
1087                  * why we need sk_prot_creator -acme
1088                  */
1089                 sk->sk_prot = sk->sk_prot_creator = prot;
1090                 sock_lock_init(sk);
1091                 sock_net_set(sk, get_net(net));
1092                 atomic_set(&sk->sk_wmem_alloc, 1);
1093
1094                 sock_update_classid(sk);
1095         }
1096
1097         return sk;
1098 }
1099 EXPORT_SYMBOL(sk_alloc);
1100
1101 static void __sk_free(struct sock *sk)
1102 {
1103         struct sk_filter *filter;
1104
1105         if (sk->sk_destruct)
1106                 sk->sk_destruct(sk);
1107
1108         filter = rcu_dereference_check(sk->sk_filter,
1109                                        atomic_read(&sk->sk_wmem_alloc) == 0);
1110         if (filter) {
1111                 sk_filter_uncharge(sk, filter);
1112                 rcu_assign_pointer(sk->sk_filter, NULL);
1113         }
1114
1115         sock_disable_timestamp(sk, SOCK_TIMESTAMP);
1116         sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE);
1117
1118         if (atomic_read(&sk->sk_omem_alloc))
1119                 printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
1120                        __func__, atomic_read(&sk->sk_omem_alloc));
1121
1122         put_net(sock_net(sk));
1123         sk_prot_free(sk->sk_prot_creator, sk);
1124 }
1125
1126 void sk_free(struct sock *sk)
1127 {
1128         /*
1129          * We substract one from sk_wmem_alloc and can know if
1130          * some packets are still in some tx queue.
1131          * If not null, sock_wfree() will call __sk_free(sk) later
1132          */
1133         if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1134                 __sk_free(sk);
1135 }
1136 EXPORT_SYMBOL(sk_free);
1137
1138 /*
1139  * Last sock_put should drop referrence to sk->sk_net. It has already
1140  * been dropped in sk_change_net. Taking referrence to stopping namespace
1141  * is not an option.
1142  * Take referrence to a socket to remove it from hash _alive_ and after that
1143  * destroy it in the context of init_net.
1144  */
1145 void sk_release_kernel(struct sock *sk)
1146 {
1147         if (sk == NULL || sk->sk_socket == NULL)
1148                 return;
1149
1150         sock_hold(sk);
1151         sock_release(sk->sk_socket);
1152         release_net(sock_net(sk));
1153         sock_net_set(sk, get_net(&init_net));
1154         sock_put(sk);
1155 }
1156 EXPORT_SYMBOL(sk_release_kernel);
1157
/*
 * sk_clone - allocate a new sock and initialise it as a copy of @sk
 * @sk:       socket to copy from
 * @priority: allocation flags handed to sk_prot_alloc()
 *
 * Returns the new sock, or NULL if the allocation or
 * xfrm_sk_clone_policy() fails.  On success the clone comes back with the
 * lock taken by bh_lock_sock() below still held, sk_refcnt set to 2, no
 * owning struct socket and sk_wq cleared.
 */
struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
{
        struct sock *newsk;

        newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
        if (newsk != NULL) {
                struct sk_filter *filter;

                sock_copy(newsk, sk);

                /* SANITY */
                /* The copied net pointer needs a reference of its own. */
                get_net(sock_net(newsk));
                sk_node_init(&newsk->sk_node);
                sock_lock_init(newsk);
                /* Not released in this function. */
                bh_lock_sock(newsk);
                newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
                newsk->sk_backlog.len = 0;

                atomic_set(&newsk->sk_rmem_alloc, 0);
                /*
                 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
                 */
                atomic_set(&newsk->sk_wmem_alloc, 1);
                atomic_set(&newsk->sk_omem_alloc, 0);
                /* Queue heads were copied from the parent; start them empty. */
                skb_queue_head_init(&newsk->sk_receive_queue);
                skb_queue_head_init(&newsk->sk_write_queue);
#ifdef CONFIG_NET_DMA
                skb_queue_head_init(&newsk->sk_async_wait_queue);
#endif

                spin_lock_init(&newsk->sk_dst_lock);
                rwlock_init(&newsk->sk_callback_lock);
                lockdep_set_class_and_name(&newsk->sk_callback_lock,
                                af_callback_keys + newsk->sk_family,
                                af_family_clock_key_strings[newsk->sk_family]);

                newsk->sk_dst_cache     = NULL;
                newsk->sk_wmem_queued   = 0;
                newsk->sk_forward_alloc = 0;
                newsk->sk_send_head     = NULL;
                /* Inherit the parent's userlocks except SOCK_BINDPORT_LOCK. */
                newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;

                sock_reset_flag(newsk, SOCK_DONE);
                skb_queue_head_init(&newsk->sk_error_queue);

                /* The filter pointer was copied; charge it to the clone too. */
                filter = newsk->sk_filter;
                if (filter != NULL)
                        sk_filter_charge(newsk, filter);

                if (unlikely(xfrm_sk_clone_policy(newsk))) {
                        /* It is still raw copy of parent, so invalidate
                         * destructor and make plain sk_free() */
                        newsk->sk_destruct = NULL;
                        sk_free(newsk);
                        newsk = NULL;
                        goto out;
                }

                newsk->sk_err      = 0;
                newsk->sk_priority = 0;
                /*
                 * Before updating sk_refcnt, we must commit prior changes to memory
                 * (Documentation/RCU/rculist_nulls.txt for details)
                 */
                smp_wmb();
                atomic_set(&newsk->sk_refcnt, 2);

                /*
                 * Increment the counter in the same struct proto as the master
                 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
                 * is the same as sk->sk_prot->socks, as this field was copied
                 * with memcpy).
                 *
                 * This _changes_ the previous behaviour, where
                 * tcp_create_openreq_child always was incrementing the
                 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
                 * to be taken into account in all callers. -acme
                 */
                sk_refcnt_debug_inc(newsk);
                /* The clone has no struct socket or sk_wq of its own yet. */
                sk_set_socket(newsk, NULL);
                newsk->sk_wq = NULL;

                if (newsk->sk_prot->sockets_allocated)
                        percpu_counter_inc(newsk->sk_prot->sockets_allocated);

                /* Timestamp flags were inherited; account for this extra user. */
                if (sock_flag(newsk, SOCK_TIMESTAMP) ||
                    sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE))
                        net_enable_timestamp();
        }
out:
        return newsk;
}
1250 EXPORT_SYMBOL_GPL(sk_clone);
1251
1252 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1253 {
1254         __sk_dst_set(sk, dst);
1255         sk->sk_route_caps = dst->dev->features;
1256         if (sk->sk_route_caps & NETIF_F_GSO)
1257                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1258         sk->sk_route_caps &= ~sk->sk_route_nocaps;
1259         if (sk_can_gso(sk)) {
1260                 if (dst->header_len) {
1261                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1262                 } else {
1263                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1264                         sk->sk_gso_max_size = dst->dev->gso_max_size;
1265                 }
1266         }
1267 }
1268 EXPORT_SYMBOL_GPL(sk_setup_caps);
1269
1270 void __init sk_init(void)
1271 {
1272         if (totalram_pages <= 4096) {
1273                 sysctl_wmem_max = 32767;
1274                 sysctl_rmem_max = 32767;
1275                 sysctl_wmem_default = 32767;
1276                 sysctl_rmem_default = 32767;
1277         } else if (totalram_pages >= 131072) {
1278                 sysctl_wmem_max = 131071;
1279                 sysctl_rmem_max = 131071;
1280         }
1281 }
1282
1283 /*
1284  *      Simple resource managers for sockets.
1285  */
1286
1287
1288 /*
1289  * Write buffer destructor automatically called from kfree_skb.
1290  */
1291 void sock_wfree(struct sk_buff *skb)
1292 {
1293         struct sock *sk = skb->sk;
1294         unsigned int len = skb->truesize;
1295
1296         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1297                 /*
1298                  * Keep a reference on sk_wmem_alloc, this will be released
1299                  * after sk_write_space() call
1300                  */
1301                 atomic_sub(len - 1, &sk->sk_wmem_alloc);
1302                 sk->sk_write_space(sk);
1303                 len = 1;
1304         }
1305         /*
1306          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1307          * could not do because of in-flight packets
1308          */
1309         if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1310                 __sk_free(sk);
1311 }
1312 EXPORT_SYMBOL(sock_wfree);
1313
1314 /*
1315  * Read buffer destructor automatically called from kfree_skb.
1316  */
1317 void sock_rfree(struct sk_buff *skb)
1318 {
1319         struct sock *sk = skb->sk;
1320
1321         atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
1322         sk_mem_uncharge(skb->sk, skb->truesize);
1323 }
1324 EXPORT_SYMBOL(sock_rfree);
1325
1326
1327 int sock_i_uid(struct sock *sk)
1328 {
1329         int uid;
1330
1331         read_lock(&sk->sk_callback_lock);
1332         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1333         read_unlock(&sk->sk_callback_lock);
1334         return uid;
1335 }
1336 EXPORT_SYMBOL(sock_i_uid);
1337
1338 unsigned long sock_i_ino(struct sock *sk)
1339 {
1340         unsigned long ino;
1341
1342         read_lock(&sk->sk_callback_lock);
1343         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1344         read_unlock(&sk->sk_callback_lock);
1345         return ino;
1346 }
1347 EXPORT_SYMBOL(sock_i_ino);
1348
1349 /*
1350  * Allocate a skb from the socket's send buffer.
1351  */
1352 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1353                              gfp_t priority)
1354 {
1355         if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1356                 struct sk_buff *skb = alloc_skb(size, priority);
1357                 if (skb) {
1358                         skb_set_owner_w(skb, sk);
1359                         return skb;
1360                 }
1361         }
1362         return NULL;
1363 }
1364 EXPORT_SYMBOL(sock_wmalloc);
1365
1366 /*
1367  * Allocate a skb from the socket's receive buffer.
1368  */
1369 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1370                              gfp_t priority)
1371 {
1372         if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1373                 struct sk_buff *skb = alloc_skb(size, priority);
1374                 if (skb) {
1375                         skb_set_owner_r(skb, sk);
1376                         return skb;
1377                 }
1378         }
1379         return NULL;
1380 }
1381
1382 /*
1383  * Allocate a memory block from the socket's option memory buffer.
1384  */
1385 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1386 {
1387         if ((unsigned)size <= sysctl_optmem_max &&
1388             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1389                 void *mem;
1390                 /* First do the add, to avoid the race if kmalloc
1391                  * might sleep.
1392                  */
1393                 atomic_add(size, &sk->sk_omem_alloc);
1394                 mem = kmalloc(size, priority);
1395                 if (mem)
1396                         return mem;
1397                 atomic_sub(size, &sk->sk_omem_alloc);
1398         }
1399         return NULL;
1400 }
1401 EXPORT_SYMBOL(sock_kmalloc);
1402
1403 /*
1404  * Free an option memory block.
1405  */
1406 void sock_kfree_s(struct sock *sk, void *mem, int size)
1407 {
1408         kfree(mem);
1409         atomic_sub(size, &sk->sk_omem_alloc);
1410 }
1411 EXPORT_SYMBOL(sock_kfree_s);
1412
1413 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1414    I think, these locks should be removed for datagram sockets.
1415  */
1416 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1417 {
1418         DEFINE_WAIT(wait);
1419
1420         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1421         for (;;) {
1422                 if (!timeo)
1423                         break;
1424                 if (signal_pending(current))
1425                         break;
1426                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1427                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1428                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1429                         break;
1430                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1431                         break;
1432                 if (sk->sk_err)
1433                         break;
1434                 timeo = schedule_timeout(timeo);
1435         }
1436         finish_wait(sk_sleep(sk), &wait);
1437         return timeo;
1438 }
1439
1440
1441 /*
1442  *      Generic send/receive buffer handlers
1443  */
1444
1445 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1446                                      unsigned long data_len, int noblock,
1447                                      int *errcode)
1448 {
1449         struct sk_buff *skb;
1450         gfp_t gfp_mask;
1451         long timeo;
1452         int err;
1453
1454         gfp_mask = sk->sk_allocation;
1455         if (gfp_mask & __GFP_WAIT)
1456                 gfp_mask |= __GFP_REPEAT;
1457
1458         timeo = sock_sndtimeo(sk, noblock);
1459         while (1) {
1460                 err = sock_error(sk);
1461                 if (err != 0)
1462                         goto failure;
1463
1464                 err = -EPIPE;
1465                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1466                         goto failure;
1467
1468                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1469                         skb = alloc_skb(header_len, gfp_mask);
1470                         if (skb) {
1471                                 int npages;
1472                                 int i;
1473
1474                                 /* No pages, we're done... */
1475                                 if (!data_len)
1476                                         break;
1477
1478                                 npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1479                                 skb->truesize += data_len;
1480                                 skb_shinfo(skb)->nr_frags = npages;
1481                                 for (i = 0; i < npages; i++) {
1482                                         struct page *page;
1483                                         skb_frag_t *frag;
1484
1485                                         page = alloc_pages(sk->sk_allocation, 0);
1486                                         if (!page) {
1487                                                 err = -ENOBUFS;
1488                                                 skb_shinfo(skb)->nr_frags = i;
1489                                                 kfree_skb(skb);
1490                                                 goto failure;
1491                                         }
1492
1493                                         frag = &skb_shinfo(skb)->frags[i];
1494                                         frag->page = page;
1495                                         frag->page_offset = 0;
1496                                         frag->size = (data_len >= PAGE_SIZE ?
1497                                                       PAGE_SIZE :
1498                                                       data_len);
1499                                         data_len -= PAGE_SIZE;
1500                                 }
1501
1502                                 /* Full success... */
1503                                 break;
1504                         }
1505                         err = -ENOBUFS;
1506                         goto failure;
1507                 }
1508                 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1509                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1510                 err = -EAGAIN;
1511                 if (!timeo)
1512                         goto failure;
1513                 if (signal_pending(current))
1514                         goto interrupted;
1515                 timeo = sock_wait_for_wmem(sk, timeo);
1516         }
1517
1518         skb_set_owner_w(skb, sk);
1519         return skb;
1520
1521 interrupted:
1522         err = sock_intr_errno(timeo);
1523 failure:
1524         *errcode = err;
1525         return NULL;
1526 }
1527 EXPORT_SYMBOL(sock_alloc_send_pskb);
1528
/*
 * Allocate a send skb with a linear area of @size bytes and no page
 * fragments; see sock_alloc_send_pskb() for blocking and error semantics.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
                                    int noblock, int *errcode)
{
        const unsigned long no_paged_data = 0;

        return sock_alloc_send_pskb(sk, size, no_paged_data, noblock, errcode);
}
1534 EXPORT_SYMBOL(sock_alloc_send_skb);
1535
1536 static void __lock_sock(struct sock *sk)
1537 {
1538         DEFINE_WAIT(wait);
1539
1540         for (;;) {
1541                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1542                                         TASK_UNINTERRUPTIBLE);
1543                 spin_unlock_bh(&sk->sk_lock.slock);
1544                 schedule();
1545                 spin_lock_bh(&sk->sk_lock.slock);
1546                 if (!sock_owned_by_user(sk))
1547                         break;
1548         }
1549         finish_wait(&sk->sk_lock.wq, &wait);
1550 }
1551
/*
 * Drain sk->sk_backlog: detach the whole list, drop the bh lock, hand each
 * skb to sk_backlog_rcv(), retake the lock and start over for as long as
 * new skbs were queued in the meantime.
 *
 * NOTE(review): the first skb is dereferenced without a NULL check, so this
 * assumes a non-empty backlog on entry, and bh_unlock_sock() is called
 * first, so the bh lock is presumably held by the caller — confirm against
 * callers.
 */
static void __release_sock(struct sock *sk)
{
        struct sk_buff *skb = sk->sk_backlog.head;

        do {
                /* Take the queued list private before dropping the lock. */
                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
                bh_unlock_sock(sk);

                do {
                        /* Save the link: skb->next is cleared before delivery. */
                        struct sk_buff *next = skb->next;

                        WARN_ON_ONCE(skb_dst_is_noref(skb));
                        skb->next = NULL;
                        sk_backlog_rcv(sk, skb);

                        /*
                         * We are in process context here with softirqs
                         * disabled, use cond_resched_softirq() to preempt.
                         * This is safe to do because we've taken the backlog
                         * queue private:
                         */
                        cond_resched_softirq();

                        skb = next;
                } while (skb != NULL);

                bh_lock_sock(sk);
        } while ((skb = sk->sk_backlog.head) != NULL);

        /*
         * Doing the zeroing here guarantee we can not loop forever
         * while a wild producer attempts to flood us.
         */
        sk->sk_backlog.len = 0;
}
1587
1588 /**
1589  * sk_wait_data - wait for data to arrive at sk_receive_queue
1590  * @sk:    sock to wait on
1591  * @timeo: for how long
1592  *
1593  * Now socket state including sk->sk_err is changed only under lock,
1594  * hence we may omit checks after joining wait queue.
1595  * We check receive queue before schedule() only as optimization;
1596  * it is very likely that release_sock() added new data.
1597  */
1598 int sk_wait_data(struct sock *sk, long *timeo)
1599 {
1600         int rc;
1601         DEFINE_WAIT(wait);
1602
1603         prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1604         set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1605         rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1606         clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1607         finish_wait(sk_sleep(sk), &wait);
1608         return rc;
1609 }
1610 EXPORT_SYMBOL(sk_wait_data);
1611
1612 /**
1613  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1614  *      @sk: socket
1615  *      @size: memory size to allocate
1616  *      @kind: allocation type
1617  *
1618  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1619  *      rmem allocation. This function assumes that protocols which have
1620  *      memory_pressure use sk_wmem_queued as write buffer accounting.
1621  */
1622 int __sk_mem_schedule(struct sock *sk, int size, int kind)
1623 {
1624         struct proto *prot = sk->sk_prot;
1625         int amt = sk_mem_pages(size);
1626         int allocated;
1627
1628         sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1629         allocated = atomic_add_return(amt, prot->memory_allocated);
1630
1631         /* Under limit. */
1632         if (allocated <= prot->sysctl_mem[0]) {
1633                 if (prot->memory_pressure && *prot->memory_pressure)
1634                         *prot->memory_pressure = 0;
1635                 return 1;
1636         }
1637
1638         /* Under pressure. */
1639         if (allocated > prot->sysctl_mem[1])
1640                 if (prot->enter_memory_pressure)
1641                         prot->enter_memory_pressure(sk);
1642
1643         /* Over hard limit. */
1644         if (allocated > prot->sysctl_mem[2])
1645                 goto suppress_allocation;
1646
1647         /* guarantee minimum buffer size under pressure */
1648         if (kind == SK_MEM_RECV) {
1649                 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1650                         return 1;
1651         } else { /* SK_MEM_SEND */
1652                 if (sk->sk_type == SOCK_STREAM) {
1653                         if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1654                                 return 1;
1655                 } else if (atomic_read(&sk->sk_wmem_alloc) <
1656                            prot->sysctl_wmem[0])
1657                                 return 1;
1658         }
1659
1660         if (prot->memory_pressure) {
1661                 int alloc;
1662
1663                 if (!*prot->memory_pressure)
1664                         return 1;
1665                 alloc = percpu_counter_read_positive(prot->sockets_allocated);
1666                 if (prot->sysctl_mem[2] > alloc *
1667                     sk_mem_pages(sk->sk_wmem_queued +
1668                                  atomic_read(&sk->sk_rmem_alloc) +
1669                                  sk->sk_forward_alloc))
1670                         return 1;
1671         }
1672
1673 suppress_allocation:
1674
1675         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1676                 sk_stream_moderate_sndbuf(sk);
1677
1678                 /* Fail only if socket is _under_ its sndbuf.
1679                  * In this case we cannot block, so that we have to fail.
1680                  */
1681                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1682                         return 1;
1683         }
1684
1685         /* Alas. Undo changes. */
1686         sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1687         atomic_sub(amt, prot->memory_allocated);
1688         return 0;
1689 }
1690 EXPORT_SYMBOL(__sk_mem_schedule);
1691
1692 /**
1693  *      __sk_reclaim - reclaim memory_allocated
1694  *      @sk: socket
1695  */
1696 void __sk_mem_reclaim(struct sock *sk)
1697 {
1698         struct proto *prot = sk->sk_prot;
1699
1700         atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
1701                    prot->memory_allocated);
1702         sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1703
1704         if (prot->memory_pressure && *prot->memory_pressure &&
1705             (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
1706                 *prot->memory_pressure = 0;
1707 }
1708 EXPORT_SYMBOL(__sk_mem_reclaim);
1709
1710
1711 /*
1712  * Set of default routines for initialising struct proto_ops when
1713  * the protocol does not support a particular function. In certain
1714  * cases where it makes no sense for a protocol to have a "do nothing"
1715  * function, some default processing is provided.
1716  */
1717
/* Default ->bind() for protocols without bind support: always -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
        const int err = -EOPNOTSUPP;

        return err;
}
1722 EXPORT_SYMBOL(sock_no_bind);
1723
/* Default ->connect() for protocols without connect support: always -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
                    int len, int flags)
{
        const int err = -EOPNOTSUPP;

        return err;
}
1729 EXPORT_SYMBOL(sock_no_connect);
1730
/* Default ->socketpair() for protocols without pair support: always -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
        const int err = -EOPNOTSUPP;

        return err;
}
1735 EXPORT_SYMBOL(sock_no_socketpair);
1736
/* Default ->accept() for protocols without accept support: always -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
        const int err = -EOPNOTSUPP;

        return err;
}
1741 EXPORT_SYMBOL(sock_no_accept);
1742
/*
 * Default ->getname() for protocols without address reporting: always
 * -EOPNOTSUPP; neither @saddr nor @len is written.
 */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
                    int *len, int peer)
{
        const int err = -EOPNOTSUPP;

        return err;
}
1748 EXPORT_SYMBOL(sock_no_getname);
1749
1750 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
1751 {
1752         return 0;
1753 }
1754 EXPORT_SYMBOL(sock_no_poll);
1755
/* Default ->ioctl() for protocols without ioctl support: always -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        const int err = -EOPNOTSUPP;

        return err;
}
1760 EXPORT_SYMBOL(sock_no_ioctl);
1761
/* Default ->listen() for protocols without listen support: always -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
        const int err = -EOPNOTSUPP;

        return err;
}
1766 EXPORT_SYMBOL(sock_no_listen);
1767
/* Default ->shutdown() for protocols without shutdown support: always -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
        const int err = -EOPNOTSUPP;

        return err;
}
1772 EXPORT_SYMBOL(sock_no_shutdown);
1773
1774 int sock_no_setsockopt(struct socket *sock, int level, int optname,
1775                     char __user *optval, unsigned int optlen)
1776 {
1777         return -EOPNOTSUPP;
1778 }
1779 EXPORT_SYMBOL(sock_no_setsockopt);
1780
1781 int sock_no_getsockopt(struct socket *sock, int level, int optname,
1782                     char __user *optval, int __user *optlen)
1783 {
1784         return -EOPNOTSUPP;
1785 }
1786 EXPORT_SYMBOL(sock_no_getsockopt);
1787
/* Default ->sendmsg() for protocols that cannot send: always -EOPNOTSUPP. */
int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
                    size_t len)
{
        const int err = -EOPNOTSUPP;

        return err;
}
1793 EXPORT_SYMBOL(sock_no_sendmsg);
1794
/* Default ->recvmsg() for protocols that cannot receive: always -EOPNOTSUPP. */
int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
                    size_t len, int flags)
{
        const int err = -EOPNOTSUPP;

        return err;
}
1800 EXPORT_SYMBOL(sock_no_recvmsg);
1801
/*
 * Default ->mmap() for protocols without mmap support.
 *
 * Returns -ENODEV rather than -EOPNOTSUPP so the error code mirrors the
 * one reported for a missing mmap method.
 */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
        const int err = -ENODEV;

        return err;
}
1807 EXPORT_SYMBOL(sock_no_mmap);
1808
1809 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1810 {
1811         ssize_t res;
1812         struct msghdr msg = {.msg_flags = flags};
1813         struct kvec iov;
1814         char *kaddr = kmap(page);
1815         iov.iov_base = kaddr + offset;
1816         iov.iov_len = size;
1817         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1818         kunmap(page);
1819         return res;
1820 }
1821 EXPORT_SYMBOL(sock_no_sendpage);
1822
1823 /*
1824  *      Default Socket Callbacks
1825  */
1826
1827 static void sock_def_wakeup(struct sock *sk)
1828 {
1829         struct socket_wq *wq;
1830
1831         rcu_read_lock();
1832         wq = rcu_dereference(sk->sk_wq);
1833         if (wq_has_sleeper(wq))
1834                 wake_up_interruptible_all(&wq->wait);
1835         rcu_read_unlock();
1836 }
1837
1838 static void sock_def_error_report(struct sock *sk)
1839 {
1840         struct socket_wq *wq;
1841
1842         rcu_read_lock();
1843         wq = rcu_dereference(sk->sk_wq);
1844         if (wq_has_sleeper(wq))
1845                 wake_up_interruptible_poll(&wq->wait, POLLERR);
1846         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
1847         rcu_read_unlock();
1848 }
1849
1850 static void sock_def_readable(struct sock *sk, int len)
1851 {
1852         struct socket_wq *wq;
1853
1854         rcu_read_lock();
1855         wq = rcu_dereference(sk->sk_wq);
1856         if (wq_has_sleeper(wq))
1857                 wake_up_interruptible_sync_poll(&wq->wait, POLLIN |
1858                                                 POLLRDNORM | POLLRDBAND);
1859         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
1860         rcu_read_unlock();
1861 }
1862
1863 static void sock_def_write_space(struct sock *sk)
1864 {
1865         struct socket_wq *wq;
1866
1867         rcu_read_lock();
1868
1869         /* Do not wake up a writer until he can make "significant"
1870          * progress.  --DaveM
1871          */
1872         if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1873                 wq = rcu_dereference(sk->sk_wq);
1874                 if (wq_has_sleeper(wq))
1875                         wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
1876                                                 POLLWRNORM | POLLWRBAND);
1877
1878                 /* Should agree with poll, otherwise some programs break */
1879                 if (sock_writeable(sk))
1880                         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
1881         }
1882
1883         rcu_read_unlock();
1884 }
1885
1886 static void sock_def_destruct(struct sock *sk)
1887 {
1888         kfree(sk->sk_protinfo);
1889 }
1890
1891 void sk_send_sigurg(struct sock *sk)
1892 {
1893         if (sk->sk_socket && sk->sk_socket->file)
1894                 if (send_sigurg(&sk->sk_socket->file->f_owner))
1895                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
1896 }
1897 EXPORT_SYMBOL(sk_send_sigurg);
1898
/*
 * (Re)arm @timer to fire at @expires on behalf of @sk.
 *
 * A sock reference is taken only when mod_timer() returns zero.
 * NOTE(review): per the timer API that means the timer was not pending
 * before, so each armed timer holds exactly one reference - confirm.
 */
void sk_reset_timer(struct sock *sk, struct timer_list* timer,
                    unsigned long expires)
{
        if (mod_timer(timer, expires))
                return;

        sock_hold(sk);
}
1905 EXPORT_SYMBOL(sk_reset_timer);
1906
/*
 * Cancel @timer if it is pending. When del_timer() returns nonzero, the
 * sock reference is dropped with __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
        if (!timer_pending(timer))
                return;

        if (del_timer(timer))
                __sock_put(sk);
}
1912 EXPORT_SYMBOL(sk_stop_timer);
1913
1914 void sock_init_data(struct socket *sock, struct sock *sk)
1915 {
1916         skb_queue_head_init(&sk->sk_receive_queue);
1917         skb_queue_head_init(&sk->sk_write_queue);
1918         skb_queue_head_init(&sk->sk_error_queue);
1919 #ifdef CONFIG_NET_DMA
1920         skb_queue_head_init(&sk->sk_async_wait_queue);
1921 #endif
1922
1923         sk->sk_send_head        =       NULL;
1924
1925         init_timer(&sk->sk_timer);
1926
1927         sk->sk_allocation       =       GFP_KERNEL;
1928         sk->sk_rcvbuf           =       sysctl_rmem_default;
1929         sk->sk_sndbuf           =       sysctl_wmem_default;
1930         sk->sk_state            =       TCP_CLOSE;
1931         sk_set_socket(sk, sock);
1932
1933         sock_set_flag(sk, SOCK_ZAPPED);
1934
1935         if (sock) {
1936                 sk->sk_type     =       sock->type;
1937                 sk->sk_wq       =       sock->wq;
1938                 sock->sk        =       sk;
1939         } else
1940                 sk->sk_wq       =       NULL;
1941
1942         spin_lock_init(&sk->sk_dst_lock);
1943         rwlock_init(&sk->sk_callback_lock);
1944         lockdep_set_class_and_name(&sk->sk_callback_lock,
1945                         af_callback_keys + sk->sk_family,
1946                         af_family_clock_key_strings[sk->sk_family]);
1947
1948         sk->sk_state_change     =       sock_def_wakeup;
1949         sk->sk_data_ready       =       sock_def_readable;
1950         sk->sk_write_space      =       sock_def_write_space;
1951         sk->sk_error_report     =       sock_def_error_report;
1952         sk->sk_destruct         =       sock_def_destruct;
1953
1954         sk->sk_sndmsg_page      =       NULL;
1955         sk->sk_sndmsg_off       =       0;
1956
1957         sk->sk_peercred.pid     =       0;
1958         sk->sk_peercred.uid     =       -1;
1959         sk->sk_peercred.gid     =       -1;
1960         sk->sk_write_pending    =       0;
1961         sk->sk_rcvlowat         =       1;
1962         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
1963         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
1964
1965         sk->sk_stamp = ktime_set(-1L, 0);
1966
1967         /*
1968          * Before updating sk_refcnt, we must commit prior changes to memory
1969          * (Documentation/RCU/rculist_nulls.txt for details)
1970          */
1971         smp_wmb();
1972         atomic_set(&sk->sk_refcnt, 1);
1973         atomic_set(&sk->sk_drops, 0);
1974 }
1975 EXPORT_SYMBOL(sock_init_data);
1976
1977 void lock_sock_nested(struct sock *sk, int subclass)
1978 {
1979         might_sleep();
1980         spin_lock_bh(&sk->sk_lock.slock);
1981         if (sk->sk_lock.owned)
1982                 __lock_sock(sk);
1983         sk->sk_lock.owned = 1;
1984         spin_unlock(&sk->sk_lock.slock);
1985         /*
1986          * The sk_lock has mutex_lock() semantics here:
1987          */
1988         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
1989         local_bh_enable();
1990 }
1991 EXPORT_SYMBOL(lock_sock_nested);
1992
1993 void release_sock(struct sock *sk)
1994 {
1995         /*
1996          * The sk_lock has mutex_unlock() semantics:
1997          */
1998         mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
1999
2000         spin_lock_bh(&sk->sk_lock.slock);
2001         if (sk->sk_backlog.tail)
2002                 __release_sock(sk);
2003         sk->sk_lock.owned = 0;
2004         if (waitqueue_active(&sk->sk_lock.wq))
2005                 wake_up(&sk->sk_lock.wq);
2006         spin_unlock_bh(&sk->sk_lock.slock);
2007 }
2008 EXPORT_SYMBOL(release_sock);
2009
2010 /**
2011  * lock_sock_fast - fast version of lock_sock
2012  * @sk: socket
2013  *
2014  * This version should be used for very small section, where process wont block
2015  * return false if fast path is taken
2016  *   sk_lock.slock locked, owned = 0, BH disabled
2017  * return true if slow path is taken
2018  *   sk_lock.slock unlocked, owned = 1, BH enabled
2019  */
2020 bool lock_sock_fast(struct sock *sk)
2021 {
2022         might_sleep();
2023         spin_lock_bh(&sk->sk_lock.slock);
2024
2025         if (!sk->sk_lock.owned)
2026                 /*
2027                  * Note : We must disable BH
2028                  */
2029                 return false;
2030
2031         __lock_sock(sk);
2032         sk->sk_lock.owned = 1;
2033         spin_unlock(&sk->sk_lock.slock);
2034         /*
2035          * The sk_lock has mutex_lock() semantics here:
2036          */
2037         mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2038         local_bh_enable();
2039         return true;
2040 }
2041 EXPORT_SYMBOL(lock_sock_fast);
2042
2043 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2044 {
2045         struct timeval tv;
2046         if (!sock_flag(sk, SOCK_TIMESTAMP))
2047                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2048         tv = ktime_to_timeval(sk->sk_stamp);
2049         if (tv.tv_sec == -1)
2050                 return -ENOENT;
2051         if (tv.tv_sec == 0) {
2052                 sk->sk_stamp = ktime_get_real();
2053                 tv = ktime_to_timeval(sk->sk_stamp);
2054         }
2055         return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2056 }
2057 EXPORT_SYMBOL(sock_get_timestamp);
2058
2059 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2060 {
2061         struct timespec ts;
2062         if (!sock_flag(sk, SOCK_TIMESTAMP))
2063                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2064         ts = ktime_to_timespec(sk->sk_stamp);
2065         if (ts.tv_sec == -1)
2066                 return -ENOENT;
2067         if (ts.tv_sec == 0) {
2068                 sk->sk_stamp = ktime_get_real();
2069                 ts = ktime_to_timespec(sk->sk_stamp);
2070         }
2071         return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2072 }
2073 EXPORT_SYMBOL(sock_get_timestampns);
2074
2075 void sock_enable_timestamp(struct sock *sk, int flag)
2076 {
2077         if (!sock_flag(sk, flag)) {
2078                 sock_set_flag(sk, flag);
2079                 /*
2080                  * we just set one of the two flags which require net
2081                  * time stamping, but time stamping might have been on
2082                  * already because of the other one
2083                  */
2084                 if (!sock_flag(sk,
2085                                 flag == SOCK_TIMESTAMP ?
2086                                 SOCK_TIMESTAMPING_RX_SOFTWARE :
2087                                 SOCK_TIMESTAMP))
2088                         net_enable_timestamp();
2089         }
2090 }
2091
2092 /*
2093  *      Get a socket option on an socket.
2094  *
2095  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
2096  *      asynchronous errors should be reported by getsockopt. We assume
2097  *      this means if you specify SO_ERROR (otherwise whats the point of it).
2098  */
2099 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2100                            char __user *optval, int __user *optlen)
2101 {
2102         struct sock *sk = sock->sk;
2103
2104         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2105 }
2106 EXPORT_SYMBOL(sock_common_getsockopt);
2107
2108 #ifdef CONFIG_COMPAT
2109 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2110                                   char __user *optval, int __user *optlen)
2111 {
2112         struct sock *sk = sock->sk;
2113
2114         if (sk->sk_prot->compat_getsockopt != NULL)
2115                 return sk->sk_prot->compat_getsockopt(sk, level, optname,
2116                                                       optval, optlen);
2117         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2118 }
2119 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2120 #endif
2121
2122 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2123                         struct msghdr *msg, size_t size, int flags)
2124 {
2125         struct sock *sk = sock->sk;
2126         int addr_len = 0;
2127         int err;
2128
2129         err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2130                                    flags & ~MSG_DONTWAIT, &addr_len);
2131         if (err >= 0)
2132                 msg->msg_namelen = addr_len;
2133         return err;
2134 }
2135 EXPORT_SYMBOL(sock_common_recvmsg);
2136
2137 /*
2138  *      Set socket options on an inet socket.
2139  */
2140 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2141                            char __user *optval, unsigned int optlen)
2142 {
2143         struct sock *sk = sock->sk;
2144
2145         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2146 }
2147 EXPORT_SYMBOL(sock_common_setsockopt);
2148
2149 #ifdef CONFIG_COMPAT
2150 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2151                                   char __user *optval, unsigned int optlen)
2152 {
2153         struct sock *sk = sock->sk;
2154
2155         if (sk->sk_prot->compat_setsockopt != NULL)
2156                 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2157                                                       optval, optlen);
2158         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2159 }
2160 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2161 #endif
2162
2163 void sk_common_release(struct sock *sk)
2164 {
2165         if (sk->sk_prot->destroy)
2166                 sk->sk_prot->destroy(sk);
2167
2168         /*
2169          * Observation: when sock_common_release is called, processes have
2170          * no access to socket. But net still has.
2171          * Step one, detach it from networking:
2172          *
2173          * A. Remove from hash tables.
2174          */
2175
2176         sk->sk_prot->unhash(sk);
2177
2178         /*
2179          * In this point socket cannot receive new packets, but it is possible
2180          * that some packets are in flight because some CPU runs receiver and
2181          * did hash table lookup before we unhashed socket. They will achieve
2182          * receive queue and will be purged by socket destructor.
2183          *
2184          * Also we still have packets pending on receive queue and probably,
2185          * our own packets waiting in device queues. sock_destroy will drain
2186          * receive queue, but transmitted packets will delay socket destruction
2187          * until the last reference will be released.
2188          */
2189
2190         sock_orphan(sk);
2191
2192         xfrm_sk_free_policy(sk);
2193
2194         sk_refcnt_debug_release(sk);
2195         sock_put(sk);
2196 }
2197 EXPORT_SYMBOL(sk_common_release);
2198
/*
 * List of registered protocols and the rwlock guarding it:
 * proto_register()/proto_unregister() take it for writing, the
 * /proc "protocols" seq_file iterator takes it for reading.
 */
static DEFINE_RWLOCK(proto_list_lock);
static LIST_HEAD(proto_list);
2201
2202 #ifdef CONFIG_PROC_FS
2203 #define PROTO_INUSE_NR  64      /* should be enough for the first time */
2204 struct prot_inuse {
2205         int val[PROTO_INUSE_NR];
2206 };
2207
2208 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2209
2210 #ifdef CONFIG_NET_NS
2211 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2212 {
2213         int cpu = smp_processor_id();
2214         per_cpu_ptr(net->core.inuse, cpu)->val[prot->inuse_idx] += val;
2215 }
2216 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2217
2218 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2219 {
2220         int cpu, idx = prot->inuse_idx;
2221         int res = 0;
2222
2223         for_each_possible_cpu(cpu)
2224                 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2225
2226         return res >= 0 ? res : 0;
2227 }
2228 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2229
2230 static int __net_init sock_inuse_init_net(struct net *net)
2231 {
2232         net->core.inuse = alloc_percpu(struct prot_inuse);
2233         return net->core.inuse ? 0 : -ENOMEM;
2234 }
2235
2236 static void __net_exit sock_inuse_exit_net(struct net *net)
2237 {
2238         free_percpu(net->core.inuse);
2239 }
2240
2241 static struct pernet_operations net_inuse_ops = {
2242         .init = sock_inuse_init_net,
2243         .exit = sock_inuse_exit_net,
2244 };
2245
2246 static __init int net_inuse_init(void)
2247 {
2248         if (register_pernet_subsys(&net_inuse_ops))
2249                 panic("Cannot initialize net inuse counters");
2250
2251         return 0;
2252 }
2253
2254 core_initcall(net_inuse_init);
2255 #else
2256 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2257
2258 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2259 {
2260         __get_cpu_var(prot_inuse).val[prot->inuse_idx] += val;
2261 }
2262 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2263
2264 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2265 {
2266         int cpu, idx = prot->inuse_idx;
2267         int res = 0;
2268
2269         for_each_possible_cpu(cpu)
2270                 res += per_cpu(prot_inuse, cpu).val[idx];
2271
2272         return res >= 0 ? res : 0;
2273 }
2274 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2275 #endif
2276
2277 static void assign_proto_idx(struct proto *prot)
2278 {
2279         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2280
2281         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2282                 printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
2283                 return;
2284         }
2285
2286         set_bit(prot->inuse_idx, proto_inuse_idx);
2287 }
2288
2289 static void release_proto_idx(struct proto *prot)
2290 {
2291         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2292                 clear_bit(prot->inuse_idx, proto_inuse_idx);
2293 }
2294 #else
/* No in-use counters without CONFIG_PROC_FS: nothing to assign. */
static inline void assign_proto_idx(struct proto *prot)
{
        /* intentionally empty */
}
2298
/* No in-use counters without CONFIG_PROC_FS: nothing to release. */
static inline void release_proto_idx(struct proto *prot)
{
        /* intentionally empty */
}
2302 #endif
2303
2304 int proto_register(struct proto *prot, int alloc_slab)
2305 {
2306         if (alloc_slab) {
2307                 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2308                                         SLAB_HWCACHE_ALIGN | prot->slab_flags,
2309                                         NULL);
2310
2311                 if (prot->slab == NULL) {
2312                         printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
2313                                prot->name);
2314                         goto out;
2315                 }
2316
2317                 if (prot->rsk_prot != NULL) {
2318                         prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2319                         if (prot->rsk_prot->slab_name == NULL)
2320                                 goto out_free_sock_slab;
2321
2322                         prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2323                                                                  prot->rsk_prot->obj_size, 0,
2324                                                                  SLAB_HWCACHE_ALIGN, NULL);
2325
2326                         if (prot->rsk_prot->slab == NULL) {
2327                                 printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
2328                                        prot->name);
2329                                 goto out_free_request_sock_slab_name;
2330                         }
2331                 }
2332
2333                 if (prot->twsk_prot != NULL) {
2334                         prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2335
2336                         if (prot->twsk_prot->twsk_slab_name == NULL)
2337                                 goto out_free_request_sock_slab;
2338
2339                         prot->twsk_prot->twsk_slab =
2340                                 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2341                                                   prot->twsk_prot->twsk_obj_size,
2342                                                   0,
2343                                                   SLAB_HWCACHE_ALIGN |
2344                                                         prot->slab_flags,
2345                                                   NULL);
2346                         if (prot->twsk_prot->twsk_slab == NULL)
2347                                 goto out_free_timewait_sock_slab_name;
2348                 }
2349         }
2350
2351         write_lock(&proto_list_lock);
2352         list_add(&prot->node, &proto_list);
2353         assign_proto_idx(prot);
2354         write_unlock(&proto_list_lock);
2355         return 0;
2356
2357 out_free_timewait_sock_slab_name:
2358         kfree(prot->twsk_prot->twsk_slab_name);
2359 out_free_request_sock_slab:
2360         if (prot->rsk_prot && prot->rsk_prot->slab) {
2361                 kmem_cache_destroy(prot->rsk_prot->slab);
2362                 prot->rsk_prot->slab = NULL;
2363         }
2364 out_free_request_sock_slab_name:
2365         if (prot->rsk_prot)
2366                 kfree(prot->rsk_prot->slab_name);
2367 out_free_sock_slab:
2368         kmem_cache_destroy(prot->slab);
2369         prot->slab = NULL;
2370 out:
2371         return -ENOBUFS;
2372 }
2373 EXPORT_SYMBOL(proto_register);
2374
2375 void proto_unregister(struct proto *prot)
2376 {
2377         write_lock(&proto_list_lock);
2378         release_proto_idx(prot);
2379         list_del(&prot->node);
2380         write_unlock(&proto_list_lock);
2381
2382         if (prot->slab != NULL) {
2383                 kmem_cache_destroy(prot->slab);
2384                 prot->slab = NULL;
2385         }
2386
2387         if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2388                 kmem_cache_destroy(prot->rsk_prot->slab);
2389                 kfree(prot->rsk_prot->slab_name);
2390                 prot->rsk_prot->slab = NULL;
2391         }
2392
2393         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2394                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2395                 kfree(prot->twsk_prot->twsk_slab_name);
2396                 prot->twsk_prot->twsk_slab = NULL;
2397         }
2398 }
2399 EXPORT_SYMBOL(proto_unregister);
2400
2401 #ifdef CONFIG_PROC_FS
2402 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2403         __acquires(proto_list_lock)
2404 {
2405         read_lock(&proto_list_lock);
2406         return seq_list_start_head(&proto_list, *pos);
2407 }
2408
2409 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2410 {
2411         return seq_list_next(v, &proto_list, pos);
2412 }
2413
2414 static void proto_seq_stop(struct seq_file *seq, void *v)
2415         __releases(proto_list_lock)
2416 {
2417         read_unlock(&proto_list_lock);
2418 }
2419
/* Map a method pointer to 'y' (set) or 'n' (NULL) for the /proc table. */
static char proto_method_implemented(const void *method)
{
        if (method == NULL)
                return 'n';

        return 'y';
}
2424
2425 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2426 {
2427         seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
2428                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2429                    proto->name,
2430                    proto->obj_size,
2431                    sock_prot_inuse_get(seq_file_net(seq), proto),
2432                    proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
2433                    proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
2434                    proto->max_header,
2435                    proto->slab == NULL ? "no" : "yes",
2436                    module_name(proto->owner),
2437                    proto_method_implemented(proto->close),
2438                    proto_method_implemented(proto->connect),
2439                    proto_method_implemented(proto->disconnect),
2440                    proto_method_implemented(proto->accept),
2441                    proto_method_implemented(proto->ioctl),
2442                    proto_method_implemented(proto->init),
2443                    proto_method_implemented(proto->destroy),
2444                    proto_method_implemented(proto->shutdown),
2445                    proto_method_implemented(proto->setsockopt),
2446                    proto_method_implemented(proto->getsockopt),
2447                    proto_method_implemented(proto->sendmsg),
2448                    proto_method_implemented(proto->recvmsg),
2449                    proto_method_implemented(proto->sendpage),
2450                    proto_method_implemented(proto->bind),
2451                    proto_method_implemented(proto->backlog_rcv),
2452                    proto_method_implemented(proto->hash),
2453                    proto_method_implemented(proto->unhash),
2454                    proto_method_implemented(proto->get_port),
2455                    proto_method_implemented(proto->enter_memory_pressure));
2456 }
2457
2458 static int proto_seq_show(struct seq_file *seq, void *v)
2459 {
2460         if (v == &proto_list)
2461                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2462                            "protocol",
2463                            "size",
2464                            "sockets",
2465                            "memory",
2466                            "press",
2467                            "maxhdr",
2468                            "slab",
2469                            "module",
2470                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2471         else
2472                 proto_seq_printf(seq, list_entry(v, struct proto, node));
2473         return 0;
2474 }
2475
2476 static const struct seq_operations proto_seq_ops = {
2477         .start  = proto_seq_start,
2478         .next   = proto_seq_next,
2479         .stop   = proto_seq_stop,
2480         .show   = proto_seq_show,
2481 };
2482
2483 static int proto_seq_open(struct inode *inode, struct file *file)
2484 {
2485         return seq_open_net(inode, file, &proto_seq_ops,
2486                             sizeof(struct seq_net_private));
2487 }
2488
2489 static const struct file_operations proto_seq_fops = {
2490         .owner          = THIS_MODULE,
2491         .open           = proto_seq_open,
2492         .read           = seq_read,
2493         .llseek         = seq_lseek,
2494         .release        = seq_release_net,
2495 };
2496
2497 static __net_init int proto_init_net(struct net *net)
2498 {
2499         if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2500                 return -ENOMEM;
2501
2502         return 0;
2503 }
2504
2505 static __net_exit void proto_exit_net(struct net *net)
2506 {
2507         proc_net_remove(net, "protocols");
2508 }
2509
2510
2511 static __net_initdata struct pernet_operations proto_net_ops = {
2512         .init = proto_init_net,
2513         .exit = proto_exit_net,
2514 };
2515
2516 static int __init proto_init(void)
2517 {
2518         return register_pernet_subsys(&proto_net_ops);
2519 }
2520
2521 subsys_initcall(proto_init);
2522
2523 #endif /* PROC_FS */