net/core/sock.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Generic socket support routines. Memory allocators, socket lock/release
   7  *              handler for protocols to use and generic option handler.
   8  *
   9  *
  10  * Version:     $Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $
  11  *
  12  * Authors:     Ross Biro
  13  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Alan Cox, <A.Cox@swansea.ac.uk>
  16  *
  17  * Fixes:
  18  *              Alan Cox        :       Numerous verify_area() problems
  19  *              Alan Cox        :       Connecting on a connecting socket
  20  *                                      now returns an error for tcp.
  21  *              Alan Cox        :       sock->protocol is set correctly.
  22  *                                      and is not sometimes left as 0.
  23  *              Alan Cox        :       connect handles icmp errors on a
  24  *                                      connect properly. Unfortunately there
  25  *                                      is a restart syscall nasty there. I
  26  *                                      can't match BSD without hacking the C
  27  *                                      library. Ideas urgently sought!
  28  *              Alan Cox        :       Disallow bind() to addresses that are
  29  *                                      not ours - especially broadcast ones!!
  30  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  31  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  32  *                                      instead they leave that for the DESTROY timer.
  33  *              Alan Cox        :       Clean up error flag in accept
  34  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  35  *                                      was buggy. Put a remove_sock() in the handler
  36  *                                      for memory when we hit 0. Also altered the timer
  37  *                                      code. The ACK stuff can wait and needs major
  38  *                                      TCP layer surgery.
  39  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  40  *                                      and fixed timer/inet_bh race.
  41  *              Alan Cox        :       Added zapped flag for TCP
  42  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  43  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  44  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  45  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  46  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  47  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  48  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  49  *      Pauline Middelink       :       identd support
  50  *              Alan Cox        :       Fixed connect() taking signals I think.
  51  *              Alan Cox        :       SO_LINGER supported
  52  *              Alan Cox        :       Error reporting fixes
  53  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  54  *              Alan Cox        :       inet sockets don't set sk->type!
  55  *              Alan Cox        :       Split socket option code
  56  *              Alan Cox        :       Callbacks
  57  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  58  *              Alex            :       Removed restriction on inet fioctl
  59  *              Alan Cox        :       Splitting INET from NET core
  60  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  61  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  62  *              Alan Cox        :       Split IP from generic code
  63  *              Alan Cox        :       New kfree_skbmem()
  64  *              Alan Cox        :       Make SO_DEBUG superuser only.
  65  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  66  *                                      (compatibility fix)
  67  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  68  *              Alan Cox        :       Allocator for a socket is settable.
  69  *              Alan Cox        :       SO_ERROR includes soft errors.
  70  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  71  *              Alan Cox        :       Generic socket allocation to make hooks
  72  *                                      easier (suggested by Craig Metz).
  73  *              Michael Pall    :       SO_ERROR returns positive errno again
  74  *              Steve Whitehouse:       Added default destructor to free
  75  *                                      protocol private data.
  76  *              Steve Whitehouse:       Added various other default routines
  77  *                                      common to several socket families.
  78  *              Chris Evans     :       Call suser() check last on F_SETOWN
  79  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  80  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  81  *              Andi Kleen      :       Fix write_space callback
  82  *              Chris Evans     :       Security fixes - signedness again
  83  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  84  *
  85  * To Fix:
  86  *
  87  *
  88  *              This program is free software; you can redistribute it and/or
  89  *              modify it under the terms of the GNU General Public License
  90  *              as published by the Free Software Foundation; either version
  91  *              2 of the License, or (at your option) any later version.
  92  */
  93
  94 #include <linux/capability.h>
  95 #include <linux/errno.h>
  96 #include <linux/types.h>
  97 #include <linux/socket.h>
  98 #include <linux/in.h>
  99 #include <linux/kernel.h>
 100 #include <linux/module.h>
 101 #include <linux/proc_fs.h>
 102 #include <linux/seq_file.h>
 103 #include <linux/sched.h>
 104 #include <linux/timer.h>
 105 #include <linux/string.h>
 106 #include <linux/sockios.h>
 107 #include <linux/net.h>
 108 #include <linux/mm.h>
 109 #include <linux/slab.h>
 110 #include <linux/interrupt.h>
 111 #include <linux/poll.h>
 112 #include <linux/tcp.h>
 113 #include <linux/init.h>
 114 #include <linux/highmem.h>
 115
 116 #include <asm/uaccess.h>
 117 #include <asm/system.h>
 118
 119 #include <linux/netdevice.h>
 120 #include <net/protocol.h>
 121 #include <linux/skbuff.h>
 122 #include <net/net_namespace.h>
 123 #include <net/request_sock.h>
 124 #include <net/sock.h>
 125 #include <net/xfrm.h>
 126 #include <linux/ipsec.h>
 127
 128 #include <linux/filter.h>
 129
 130 #ifdef CONFIG_INET
 131 #include <net/tcp.h>
 132 #endif
 133
 134 /*
 135  * Each address family might have different locking rules, so we keep
 136  * one lock-class key per address family in each of the two arrays below
 137  * (AF_MAX entries apiece):
 137  */
 138 static struct lock_class_key af_family_keys[AF_MAX];
 139 static struct lock_class_key af_family_slock_keys[AF_MAX];
 140
 141 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 142 /*
 143  * Make lock validator output more readable. (We pre-construct these
 144  * strings at build time, so that runtime initialization of socket
 145  * locks is fast):
 146  */
 /*
  * Human-readable lock-class names, one table per lock kind.  Each table
  * has one entry per address-family number, in numeric order (families
  * without a symbolic name here are spelled with their number), plus a
  * trailing entry for AF_MAX itself.  The three tables must stay in step:
  * the same index names the same family in all of them.
  */
 static const char *af_family_key_strings[AF_MAX+1] = {
   "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
   "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
   "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
   "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
   "sk_lock-21"       , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
   "sk_lock-AF_RXRPC" , "sk_lock-AF_MAX"
 };
 static const char *af_family_slock_key_strings[AF_MAX+1] = {
   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
   "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
   "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
   "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
   "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
   "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
   "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
   "slock-21"       , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
   "slock-AF_RXRPC" , "slock-AF_MAX"
 };
 static const char *af_family_clock_key_strings[AF_MAX+1] = {
   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
   "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
   "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
   "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
   "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
   "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
   "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
   "clock-21"       , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
   "clock-AF_RXRPC" , "clock-AF_MAX"
 };
 189 #endif
 190
 191 /*
 192  * sk_callback_lock locking rules are per-address-family,
 193  * so split the lock classes by using a per-AF key
 193  * (one lock_class_key for each of the AF_MAX families):
 194  */
 195 static struct lock_class_key af_callback_keys[AF_MAX];
 196
 197 /* Take into consideration the size of the struct sk_buff overhead in the
 198  * determination of these values, since that is non-constant across
 199  * platforms.  This makes socket queueing behavior and performance
 200  * not depend upon such differences.
 200  *
 200  * Each limit is _SK_MEM_PACKETS buffers, with every buffer accounted as
 200  * one struct sk_buff plus 256 bytes (_SK_MEM_OVERHEAD).
 201  */
 202 #define _SK_MEM_PACKETS         256
 203 #define _SK_MEM_OVERHEAD        (sizeof(struct sk_buff) + 256)
 204 #define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 205 #define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 206
 207 /* Run time adjustable parameters.  sysctl_wmem_max and sysctl_rmem_max
 207  * are the ceilings sock_setsockopt() applies to SO_SNDBUF and SO_RCVBUF
 207  * requests; all four start out at the compile-time maxima above.
 207  */
 208 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 209 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 210 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 211 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 212
 213 /* Maximal space eaten by iovec or ancillary data plus some space */
 214 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 215
 216 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 217 {
 218         struct timeval tv;
 219
 220         if (optlen < sizeof(tv))
 221                 return -EINVAL;
 222         if (copy_from_user(&tv, optval, sizeof(tv)))
 223                 return -EFAULT;
 224         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 225                 return -EDOM;
 226
 227         if (tv.tv_sec < 0) {
 228                 static int warned __read_mostly;
 229
 230                 *timeo_p = 0;
 231                 if (warned < 10 && net_ratelimit())
 232                         warned++;
 233                         printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
 234                                "tries to set negative timeout\n",
 235                                 current->comm, task_pid_nr(current));
 236                 return 0;
 237         }
 238         *timeo_p = MAX_SCHEDULE_TIMEOUT;
 239         if (tv.tv_sec == 0 && tv.tv_usec == 0)
 240                 return 0;
 241         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 242                 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
 243         return 0;
 244 }
 245
 246 static void sock_warn_obsolete_bsdism(const char *name)
 247 {
 248         static int warned;
 249         static char warncomm[TASK_COMM_LEN];
 250         if (strcmp(warncomm, current->comm) && warned < 5) {
 251                 strcpy(warncomm,  current->comm);
 252                 printk(KERN_WARNING "process `%s' is using obsolete "
 253                        "%s SO_BSDCOMPAT\n", warncomm, name);
 254                 warned++;
 255         }
 256 }
 257
 258 static void sock_disable_timestamp(struct sock *sk)
 259 {
 260         if (sock_flag(sk, SOCK_TIMESTAMP)) {
 261                 sock_reset_flag(sk, SOCK_TIMESTAMP);
 262                 net_disable_timestamp();
 263         }
 264 }
 265
 266
 267 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 268 {
 269         int err = 0;
 270         int skb_len;
 271
 272         /* Cast skb->rcvbuf to unsigned... It's pointless, but reduces
 273            number of warnings when compiling with -W --ANK
 274          */
 275         if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
 276             (unsigned)sk->sk_rcvbuf) {
 277                 err = -ENOMEM;
 278                 goto out;
 279         }
 280
 281         err = sk_filter(sk, skb);
 282         if (err)
 283                 goto out;
 284
 285         if (!sk_rmem_schedule(sk, skb->truesize)) {
 286                 err = -ENOBUFS;
 287                 goto out;
 288         }
 289
 290         skb->dev = NULL;
 291         skb_set_owner_r(skb, sk);
 292
 293         /* Cache the SKB length before we tack it onto the receive
 294          * queue.  Once it is added it no longer belongs to us and
 295          * may be freed by other threads of control pulling packets
 296          * from the queue.
 297          */
 298         skb_len = skb->len;
 299
 300         skb_queue_tail(&sk->sk_receive_queue, skb);
 301
 302         if (!sock_flag(sk, SOCK_DEAD))
 303                 sk->sk_data_ready(sk, skb_len);
 304 out:
 305         return err;
 306 }
 307 EXPORT_SYMBOL(sock_queue_rcv_skb);
 308
 309 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
 310 {
 311         int rc = NET_RX_SUCCESS;
 312
 313         if (sk_filter(sk, skb))
 314                 goto discard_and_relse;
 315
 316         skb->dev = NULL;
 317
 318         if (nested)
 319                 bh_lock_sock_nested(sk);
 320         else
 321                 bh_lock_sock(sk);
 322         if (!sock_owned_by_user(sk)) {
 323                 /*
 324                  * trylock + unlock semantics:
 325                  */
 326                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 327
 328                 rc = sk->sk_backlog_rcv(sk, skb);
 329
 330                 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 331         } else
 332                 sk_add_backlog(sk, skb);
 333         bh_unlock_sock(sk);
 334 out:
 335         sock_put(sk);
 336         return rc;
 337 discard_and_relse:
 338         kfree_skb(skb);
 339         goto out;
 340 }
 341 EXPORT_SYMBOL(sk_receive_skb);
 342
 343 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 344 {
 345         struct dst_entry *dst = sk->sk_dst_cache;
 346
 347         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 348                 sk->sk_dst_cache = NULL;
 349                 dst_release(dst);
 350                 return NULL;
 351         }
 352
 353         return dst;
 354 }
 355 EXPORT_SYMBOL(__sk_dst_check);
 356
 357 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 358 {
 359         struct dst_entry *dst = sk_dst_get(sk);
 360
 361         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 362                 sk_dst_reset(sk);
 363                 dst_release(dst);
 364                 return NULL;
 365         }
 366
 367         return dst;
 368 }
 369 EXPORT_SYMBOL(sk_dst_check);
 370
 371 static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
 372 {
 373         int ret = -ENOPROTOOPT;
 374 #ifdef CONFIG_NETDEVICES
 375         struct net *net = sk->sk_net;
 376         char devname[IFNAMSIZ];
 377         int index;
 378
 379         /* Sorry... */
 380         ret = -EPERM;
 381         if (!capable(CAP_NET_RAW))
 382                 goto out;
 383
 384         ret = -EINVAL;
 385         if (optlen < 0)
 386                 goto out;
 387
 388         /* Bind this socket to a particular device like "eth0",
 389          * as specified in the passed interface name. If the
 390          * name is "" or the option length is zero the socket
 391          * is not bound.
 392          */
 393         if (optlen > IFNAMSIZ - 1)
 394                 optlen = IFNAMSIZ - 1;
 395         memset(devname, 0, sizeof(devname));
 396
 397         ret = -EFAULT;
 398         if (copy_from_user(devname, optval, optlen))
 399                 goto out;
 400
 401         if (devname[0] == '\0') {
 402                 index = 0;
 403         } else {
 404                 struct net_device *dev = dev_get_by_name(net, devname);
 405
 406                 ret = -ENODEV;
 407                 if (!dev)
 408                         goto out;
 409
 410                 index = dev->ifindex;
 411                 dev_put(dev);
 412         }
 413
 414         lock_sock(sk);
 415         sk->sk_bound_dev_if = index;
 416         sk_dst_reset(sk);
 417         release_sock(sk);
 418
 419         ret = 0;
 420
 421 out:
 422 #endif
 423
 424         return ret;
 425 }
 426
 /* Set socket flag @bit on @sk when @valbool is non-zero, clear it otherwise. */
 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 {
         if (!valbool) {
                 sock_reset_flag(sk, bit);
                 return;
         }

         sock_set_flag(sk, bit);
 }
 434
 435 /*
 436  *      This is meant for all protocols to use and covers goings on
 437  *      at the socket level. Everything here is generic.
 438  */
 439
 440 int sock_setsockopt(struct socket *sock, int level, int optname,
 441                     char __user *optval, int optlen)
 442 {
 443         struct sock *sk=sock->sk;
 444         int val;
 445         int valbool;
 446         struct linger ling;
 447         int ret = 0;
 448
 449         /*
 450          *      Options without arguments
 451          */
 452
 453 #ifdef SO_DONTLINGER            /* Compatibility item... */
 454         if (optname == SO_DONTLINGER) {
 455                 lock_sock(sk);
 456                 sock_reset_flag(sk, SOCK_LINGER);
 457                 release_sock(sk);
 458                 return 0;
 459         }
 460 #endif
 461
 462         if (optname == SO_BINDTODEVICE)
 463                 return sock_bindtodevice(sk, optval, optlen);
 464
 465         if (optlen < sizeof(int))
 466                 return -EINVAL;
 467
 468         if (get_user(val, (int __user *)optval))
 469                 return -EFAULT;
 470
 471         valbool = val?1:0;
 472
 473         lock_sock(sk);
 474
 475         switch(optname) {
 476         case SO_DEBUG:
 477                 if (val && !capable(CAP_NET_ADMIN)) {
 478                         ret = -EACCES;
 479                 } else
 480                         sock_valbool_flag(sk, SOCK_DBG, valbool);
 481                 break;
 482         case SO_REUSEADDR:
 483                 sk->sk_reuse = valbool;
 484                 break;
 485         case SO_TYPE:
 486         case SO_ERROR:
 487                 ret = -ENOPROTOOPT;
 488                 break;
 489         case SO_DONTROUTE:
 490                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 491                 break;
 492         case SO_BROADCAST:
 493                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 494                 break;
 495         case SO_SNDBUF:
 496                 /* Don't error on this BSD doesn't and if you think
 497                    about it this is right. Otherwise apps have to
 498                    play 'guess the biggest size' games. RCVBUF/SNDBUF
 499                    are treated in BSD as hints */
 500
 501                 if (val > sysctl_wmem_max)
 502                         val = sysctl_wmem_max;
 503 set_sndbuf:
 504                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 505                 if ((val * 2) < SOCK_MIN_SNDBUF)
 506                         sk->sk_sndbuf = SOCK_MIN_SNDBUF;
 507                 else
 508                         sk->sk_sndbuf = val * 2;
 509
 510                 /*
 511                  *      Wake up sending tasks if we
 512                  *      upped the value.
 513                  */
 514                 sk->sk_write_space(sk);
 515                 break;
 516
 517         case SO_SNDBUFFORCE:
 518                 if (!capable(CAP_NET_ADMIN)) {
 519                         ret = -EPERM;
 520                         break;
 521                 }
 522                 goto set_sndbuf;
 523
 524         case SO_RCVBUF:
 525                 /* Don't error on this BSD doesn't and if you think
 526                    about it this is right. Otherwise apps have to
 527                    play 'guess the biggest size' games. RCVBUF/SNDBUF
 528                    are treated in BSD as hints */
 529
 530                 if (val > sysctl_rmem_max)
 531                         val = sysctl_rmem_max;
 532 set_rcvbuf:
 533                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 534                 /*
 535                  * We double it on the way in to account for
 536                  * "struct sk_buff" etc. overhead.   Applications
 537                  * assume that the SO_RCVBUF setting they make will
 538                  * allow that much actual data to be received on that
 539                  * socket.
 540                  *
 541                  * Applications are unaware that "struct sk_buff" and
 542                  * other overheads allocate from the receive buffer
 543                  * during socket buffer allocation.
 544                  *
 545                  * And after considering the possible alternatives,
 546                  * returning the value we actually used in getsockopt
 547                  * is the most desirable behavior.
 548                  */
 549                 if ((val * 2) < SOCK_MIN_RCVBUF)
 550                         sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
 551                 else
 552                         sk->sk_rcvbuf = val * 2;
 553                 break;
 554
 555         case SO_RCVBUFFORCE:
 556                 if (!capable(CAP_NET_ADMIN)) {
 557                         ret = -EPERM;
 558                         break;
 559                 }
 560                 goto set_rcvbuf;
 561
 562         case SO_KEEPALIVE:
 563 #ifdef CONFIG_INET
 564                 if (sk->sk_protocol == IPPROTO_TCP)
 565                         tcp_set_keepalive(sk, valbool);
 566 #endif
 567                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 568                 break;
 569
 570         case SO_OOBINLINE:
 571                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 572                 break;
 573
 574         case SO_NO_CHECK:
 575                 sk->sk_no_check = valbool;
 576                 break;
 577
 578         case SO_PRIORITY:
 579                 if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
 580                         sk->sk_priority = val;
 581                 else
 582                         ret = -EPERM;
 583                 break;
 584
 585         case SO_LINGER:
 586                 if (optlen < sizeof(ling)) {
 587                         ret = -EINVAL;  /* 1003.1g */
 588                         break;
 589                 }
 590                 if (copy_from_user(&ling,optval,sizeof(ling))) {
 591                         ret = -EFAULT;
 592                         break;
 593                 }
 594                 if (!ling.l_onoff)
 595                         sock_reset_flag(sk, SOCK_LINGER);
 596                 else {
 597 #if (BITS_PER_LONG == 32)
 598                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 599                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 600                         else
 601 #endif
 602                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 603                         sock_set_flag(sk, SOCK_LINGER);
 604                 }
 605                 break;
 606
 607         case SO_BSDCOMPAT:
 608                 sock_warn_obsolete_bsdism("setsockopt");
 609                 break;
 610
 611         case SO_PASSCRED:
 612                 if (valbool)
 613                         set_bit(SOCK_PASSCRED, &sock->flags);
 614                 else
 615                         clear_bit(SOCK_PASSCRED, &sock->flags);
 616                 break;
 617
 618         case SO_TIMESTAMP:
 619         case SO_TIMESTAMPNS:
 620                 if (valbool)  {
 621                         if (optname == SO_TIMESTAMP)
 622                                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 623                         else
 624                                 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 625                         sock_set_flag(sk, SOCK_RCVTSTAMP);
 626                         sock_enable_timestamp(sk);
 627                 } else {
 628                         sock_reset_flag(sk, SOCK_RCVTSTAMP);
 629                         sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 630                 }
 631                 break;
 632
 633         case SO_RCVLOWAT:
 634                 if (val < 0)
 635                         val = INT_MAX;
 636                 sk->sk_rcvlowat = val ? : 1;
 637                 break;
 638
 639         case SO_RCVTIMEO:
 640                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 641                 break;
 642
 643         case SO_SNDTIMEO:
 644                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 645                 break;
 646
 647         case SO_ATTACH_FILTER:
 648                 ret = -EINVAL;
 649                 if (optlen == sizeof(struct sock_fprog)) {
 650                         struct sock_fprog fprog;
 651
 652                         ret = -EFAULT;
 653                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 654                                 break;
 655
 656                         ret = sk_attach_filter(&fprog, sk);
 657                 }
 658                 break;
 659
 660         case SO_DETACH_FILTER:
 661                 ret = sk_detach_filter(sk);
 662                 break;
 663
 664         case SO_PASSSEC:
 665                 if (valbool)
 666                         set_bit(SOCK_PASSSEC, &sock->flags);
 667                 else
 668                         clear_bit(SOCK_PASSSEC, &sock->flags);
 669                 break;
 670
 671                 /* We implement the SO_SNDLOWAT etc to
 672                    not be settable (1003.1g 5.3) */
 673         default:
 674                 ret = -ENOPROTOOPT;
 675                 break;
 676         }
 677         release_sock(sk);
 678         return ret;
 679 }
 680
 681
 682 int sock_getsockopt(struct socket *sock, int level, int optname,
 683                     char __user *optval, int __user *optlen)
 684 {
 685         struct sock *sk = sock->sk;
 686
 687         union {
 688                 int val;
 689                 struct linger ling;
 690                 struct timeval tm;
 691         } v;
 692
 693         unsigned int lv = sizeof(int);
 694         int len;
 695
 696         if (get_user(len, optlen))
 697                 return -EFAULT;
 698         if (len < 0)
 699                 return -EINVAL;
 700
 701         switch(optname) {
 702         case SO_DEBUG:
 703                 v.val = sock_flag(sk, SOCK_DBG);
 704                 break;
 705
 706         case SO_DONTROUTE:
 707                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
 708                 break;
 709
 710         case SO_BROADCAST:
 711                 v.val = !!sock_flag(sk, SOCK_BROADCAST);
 712                 break;
 713
 714         case SO_SNDBUF:
 715                 v.val = sk->sk_sndbuf;
 716                 break;
 717
 718         case SO_RCVBUF:
 719                 v.val = sk->sk_rcvbuf;
 720                 break;
 721
 722         case SO_REUSEADDR:
 723                 v.val = sk->sk_reuse;
 724                 break;
 725
 726         case SO_KEEPALIVE:
 727                 v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
 728                 break;
 729
 730         case SO_TYPE:
 731                 v.val = sk->sk_type;
 732                 break;
 733
 734         case SO_ERROR:
 735                 v.val = -sock_error(sk);
 736                 if (v.val==0)
 737                         v.val = xchg(&sk->sk_err_soft, 0);
 738                 break;
 739
 740         case SO_OOBINLINE:
 741                 v.val = !!sock_flag(sk, SOCK_URGINLINE);
 742                 break;
 743
 744         case SO_NO_CHECK:
 745                 v.val = sk->sk_no_check;
 746                 break;
 747
 748         case SO_PRIORITY:
 749                 v.val = sk->sk_priority;
 750                 break;
 751
 752         case SO_LINGER:
 753                 lv              = sizeof(v.ling);
 754                 v.ling.l_onoff  = !!sock_flag(sk, SOCK_LINGER);
 755                 v.ling.l_linger = sk->sk_lingertime / HZ;
 756                 break;
 757
 758         case SO_BSDCOMPAT:
 759                 sock_warn_obsolete_bsdism("getsockopt");
 760                 break;
 761
 762         case SO_TIMESTAMP:
 763                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
 764                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
 765                 break;
 766
 767         case SO_TIMESTAMPNS:
 768                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
 769                 break;
 770
 771         case SO_RCVTIMEO:
 772                 lv=sizeof(struct timeval);
 773                 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
 774                         v.tm.tv_sec = 0;
 775                         v.tm.tv_usec = 0;
 776                 } else {
 777                         v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
 778                         v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
 779                 }
 780                 break;
 781
 782         case SO_SNDTIMEO:
 783                 lv=sizeof(struct timeval);
 784                 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
 785                         v.tm.tv_sec = 0;
 786                         v.tm.tv_usec = 0;
 787                 } else {
 788                         v.tm.tv_sec = sk->sk_sndtimeo / HZ;
 789                         v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
 790                 }
 791                 break;
 792
 793         case SO_RCVLOWAT:
 794                 v.val = sk->sk_rcvlowat;
 795                 break;
 796
 797         case SO_SNDLOWAT:
 798                 v.val=1;
 799                 break;
 800
 801         case SO_PASSCRED:
 802                 v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
 803                 break;
 804
 805         case SO_PEERCRED:
 806                 if (len > sizeof(sk->sk_peercred))
 807                         len = sizeof(sk->sk_peercred);
 808                 if (copy_to_user(optval, &sk->sk_peercred, len))
 809                         return -EFAULT;
 810                 goto lenout;
 811
 812         case SO_PEERNAME:
 813         {
 814                 char address[128];
 815
 816                 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
 817                         return -ENOTCONN;
 818                 if (lv < len)
 819                         return -EINVAL;
 820                 if (copy_to_user(optval, address, len))
 821                         return -EFAULT;
 822                 goto lenout;
 823         }
 824
 825         /* Dubious BSD thing... Probably nobody even uses it, but
 826          * the UNIX standard wants it for whatever reason... -DaveM
 827          */
 828         case SO_ACCEPTCONN:
 829                 v.val = sk->sk_state == TCP_LISTEN;
 830                 break;
 831
 832         case SO_PASSSEC:
 833                 v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
 834                 break;
 835
 836         case SO_PEERSEC:
 837                 return security_socket_getpeersec_stream(sock, optval, optlen, len);
 838
 839         default:
 840                 return -ENOPROTOOPT;
 841         }
 842
 843         if (len > lv)
 844                 len = lv;
 845         if (copy_to_user(optval, &v, len))
 846                 return -EFAULT;
 847 lenout:
 848         if (put_user(len, optlen))
 849                 return -EFAULT;
 850         return 0;
 851 }
 852
 853 /*
 854  * Initialize an sk_lock.
 855  *
 856  * (We also register the sk_lock with the lock validator.)
 857  */
 858 static inline void sock_lock_init(struct sock *sk)
 859 {
 860         sock_lock_init_class_and_name(sk,
 861                         af_family_slock_key_strings[sk->sk_family],
 862                         af_family_slock_keys + sk->sk_family,
 863                         af_family_key_strings[sk->sk_family],
 864                         af_family_keys + sk->sk_family);
 865 }
 866
 867 static void sock_copy(struct sock *nsk, const struct sock *osk)
 868 {
 869 #ifdef CONFIG_SECURITY_NETWORK
 870         void *sptr = nsk->sk_security;
 871 #endif
 872
 873         memcpy(nsk, osk, osk->sk_prot->obj_size);
 874 #ifdef CONFIG_SECURITY_NETWORK
 875         nsk->sk_security = sptr;
 876         security_sk_clone(osk, nsk);
 877 #endif
 878 }
 879
 880 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
 881                 int family)
 882 {
 883         struct sock *sk;
 884         struct kmem_cache *slab;
 885
 886         slab = prot->slab;
 887         if (slab != NULL)
 888                 sk = kmem_cache_alloc(slab, priority);
 889         else
 890                 sk = kmalloc(prot->obj_size, priority);
 891
 892         if (sk != NULL) {
 893                 if (security_sk_alloc(sk, family, priority))
 894                         goto out_free;
 895
 896                 if (!try_module_get(prot->owner))
 897                         goto out_free_sec;
 898         }
 899
 900         return sk;
 901
 902 out_free_sec:
 903         security_sk_free(sk);
 904 out_free:
 905         if (slab != NULL)
 906                 kmem_cache_free(slab, sk);
 907         else
 908                 kfree(sk);
 909         return NULL;
 910 }
 911
 912 static void sk_prot_free(struct proto *prot, struct sock *sk)
 913 {
 914         struct kmem_cache *slab;
 915         struct module *owner;
 916
 917         owner = prot->owner;
 918         slab = prot->slab;
 919
 920         security_sk_free(sk);
 921         if (slab != NULL)
 922                 kmem_cache_free(slab, sk);
 923         else
 924                 kfree(sk);
 925         module_put(owner);
 926 }
 927
 928 /**
 929  *      sk_alloc - All socket objects are allocated here
 930  *      @net: the applicable net namespace
 931  *      @family: protocol family
 932  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 933  *      @prot: struct proto associated with this new sock instance
 934  *      @zero_it: if we should zero the newly allocated sock
 935  */
 936 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 937                       struct proto *prot)
 938 {
 939         struct sock *sk;
 940
 941         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
 942         if (sk) {
 943                 sk->sk_family = family;
 944                 /*
 945                  * See comment in struct sock definition to understand
 946                  * why we need sk_prot_creator -acme
 947                  */
 948                 sk->sk_prot = sk->sk_prot_creator = prot;
 949                 sock_lock_init(sk);
 950                 sk->sk_net = get_net(net);
 951         }
 952
 953         return sk;
 954 }
 955
 956 void sk_free(struct sock *sk)
 957 {
 958         struct sk_filter *filter;
 959
 960         if (sk->sk_destruct)
 961                 sk->sk_destruct(sk);
 962
 963         filter = rcu_dereference(sk->sk_filter);
 964         if (filter) {
 965                 sk_filter_uncharge(sk, filter);
 966                 rcu_assign_pointer(sk->sk_filter, NULL);
 967         }
 968
 969         sock_disable_timestamp(sk);
 970
 971         if (atomic_read(&sk->sk_omem_alloc))
 972                 printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
 973                        __FUNCTION__, atomic_read(&sk->sk_omem_alloc));
 974
 975         put_net(sk->sk_net);
 976         sk_prot_free(sk->sk_prot_creator, sk);
 977 }
 978
 979 struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
 980 {
 981         struct sock *newsk;
 982
 983         newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
 984         if (newsk != NULL) {
 985                 struct sk_filter *filter;
 986
 987                 sock_copy(newsk, sk);
 988
 989                 /* SANITY */
 990                 get_net(newsk->sk_net);
 991                 sk_node_init(&newsk->sk_node);
 992                 sock_lock_init(newsk);
 993                 bh_lock_sock(newsk);
 994                 newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
 995
 996                 atomic_set(&newsk->sk_rmem_alloc, 0);
 997                 atomic_set(&newsk->sk_wmem_alloc, 0);
 998                 atomic_set(&newsk->sk_omem_alloc, 0);
 999                 skb_queue_head_init(&newsk->sk_receive_queue);
1000                 skb_queue_head_init(&newsk->sk_write_queue);
1001 #ifdef CONFIG_NET_DMA
1002                 skb_queue_head_init(&newsk->sk_async_wait_queue);
1003 #endif
1004
1005                 rwlock_init(&newsk->sk_dst_lock);
1006                 rwlock_init(&newsk->sk_callback_lock);
1007                 lockdep_set_class_and_name(&newsk->sk_callback_lock,
1008                                 af_callback_keys + newsk->sk_family,
1009                                 af_family_clock_key_strings[newsk->sk_family]);
1010
1011                 newsk->sk_dst_cache     = NULL;
1012                 newsk->sk_wmem_queued   = 0;
1013                 newsk->sk_forward_alloc = 0;
1014                 newsk->sk_send_head     = NULL;
1015                 newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1016
1017                 sock_reset_flag(newsk, SOCK_DONE);
1018                 skb_queue_head_init(&newsk->sk_error_queue);
1019
1020                 filter = newsk->sk_filter;
1021                 if (filter != NULL)
1022                         sk_filter_charge(newsk, filter);
1023
1024                 if (unlikely(xfrm_sk_clone_policy(newsk))) {
1025                         /* It is still raw copy of parent, so invalidate
1026                          * destructor and make plain sk_free() */
1027                         newsk->sk_destruct = NULL;
1028                         sk_free(newsk);
1029                         newsk = NULL;
1030                         goto out;
1031                 }
1032
1033                 newsk->sk_err      = 0;
1034                 newsk->sk_priority = 0;
1035                 atomic_set(&newsk->sk_refcnt, 2);
1036
1037                 /*
1038                  * Increment the counter in the same struct proto as the master
1039                  * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1040                  * is the same as sk->sk_prot->socks, as this field was copied
1041                  * with memcpy).
1042                  *
1043                  * This _changes_ the previous behaviour, where
1044                  * tcp_create_openreq_child always was incrementing the
1045                  * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1046                  * to be taken into account in all callers. -acme
1047                  */
1048                 sk_refcnt_debug_inc(newsk);
1049                 newsk->sk_socket = NULL;
1050                 newsk->sk_sleep  = NULL;
1051
1052                 if (newsk->sk_prot->sockets_allocated)
1053                         atomic_inc(newsk->sk_prot->sockets_allocated);
1054         }
1055 out:
1056         return newsk;
1057 }
1058
1059 EXPORT_SYMBOL_GPL(sk_clone);
1060
1061 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1062 {
1063         __sk_dst_set(sk, dst);
1064         sk->sk_route_caps = dst->dev->features;
1065         if (sk->sk_route_caps & NETIF_F_GSO)
1066                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1067         if (sk_can_gso(sk)) {
1068                 if (dst->header_len)
1069                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1070                 else
1071                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1072         }
1073 }
1074 EXPORT_SYMBOL_GPL(sk_setup_caps);
1075
1076 void __init sk_init(void)
1077 {
1078         if (num_physpages <= 4096) {
1079                 sysctl_wmem_max = 32767;
1080                 sysctl_rmem_max = 32767;
1081                 sysctl_wmem_default = 32767;
1082                 sysctl_rmem_default = 32767;
1083         } else if (num_physpages >= 131072) {
1084                 sysctl_wmem_max = 131071;
1085                 sysctl_rmem_max = 131071;
1086         }
1087 }
1088
1089 /*
1090  *      Simple resource managers for sockets.
1091  */
1092
1093
1094 /*
1095  * Write buffer destructor automatically called from kfree_skb.
1096  */
1097 void sock_wfree(struct sk_buff *skb)
1098 {
1099         struct sock *sk = skb->sk;
1100
1101         /* In case it might be waiting for more memory. */
1102         atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
1103         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
1104                 sk->sk_write_space(sk);
1105         sock_put(sk);
1106 }
1107
1108 /*
1109  * Read buffer destructor automatically called from kfree_skb.
1110  */
1111 void sock_rfree(struct sk_buff *skb)
1112 {
1113         struct sock *sk = skb->sk;
1114
1115         skb_truesize_check(skb);
1116         atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
1117         sk_mem_uncharge(skb->sk, skb->truesize);
1118 }
1119
1120
1121 int sock_i_uid(struct sock *sk)
1122 {
1123         int uid;
1124
1125         read_lock(&sk->sk_callback_lock);
1126         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1127         read_unlock(&sk->sk_callback_lock);
1128         return uid;
1129 }
1130
1131 unsigned long sock_i_ino(struct sock *sk)
1132 {
1133         unsigned long ino;
1134
1135         read_lock(&sk->sk_callback_lock);
1136         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1137         read_unlock(&sk->sk_callback_lock);
1138         return ino;
1139 }
1140
1141 /*
1142  * Allocate a skb from the socket's send buffer.
1143  */
1144 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1145                              gfp_t priority)
1146 {
1147         if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1148                 struct sk_buff * skb = alloc_skb(size, priority);
1149                 if (skb) {
1150                         skb_set_owner_w(skb, sk);
1151                         return skb;
1152                 }
1153         }
1154         return NULL;
1155 }
1156
1157 /*
1158  * Allocate a skb from the socket's receive buffer.
1159  */
1160 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1161                              gfp_t priority)
1162 {
1163         if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1164                 struct sk_buff *skb = alloc_skb(size, priority);
1165                 if (skb) {
1166                         skb_set_owner_r(skb, sk);
1167                         return skb;
1168                 }
1169         }
1170         return NULL;
1171 }
1172
1173 /*
1174  * Allocate a memory block from the socket's option memory buffer.
1175  */
1176 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1177 {
1178         if ((unsigned)size <= sysctl_optmem_max &&
1179             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1180                 void *mem;
1181                 /* First do the add, to avoid the race if kmalloc
1182                  * might sleep.
1183                  */
1184                 atomic_add(size, &sk->sk_omem_alloc);
1185                 mem = kmalloc(size, priority);
1186                 if (mem)
1187                         return mem;
1188                 atomic_sub(size, &sk->sk_omem_alloc);
1189         }
1190         return NULL;
1191 }
1192
1193 /*
1194  * Free an option memory block.
1195  */
1196 void sock_kfree_s(struct sock *sk, void *mem, int size)
1197 {
1198         kfree(mem);
1199         atomic_sub(size, &sk->sk_omem_alloc);
1200 }
1201
1202 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1203    I think, these locks should be removed for datagram sockets.
1204  */
1205 static long sock_wait_for_wmem(struct sock * sk, long timeo)
1206 {
1207         DEFINE_WAIT(wait);
1208
1209         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1210         for (;;) {
1211                 if (!timeo)
1212                         break;
1213                 if (signal_pending(current))
1214                         break;
1215                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1216                 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1217                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1218                         break;
1219                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1220                         break;
1221                 if (sk->sk_err)
1222                         break;
1223                 timeo = schedule_timeout(timeo);
1224         }
1225         finish_wait(sk->sk_sleep, &wait);
1226         return timeo;
1227 }
1228
1229
1230 /*
1231  *      Generic send/receive buffer handlers
1232  */
1233
/*
 * Allocate an skb for transmission on @sk, optionally with page fragments.
 *
 * @header_len bytes are requested from alloc_skb() for the linear area.
 * If @data_len is non-zero, enough order-0 pages to hold it are attached
 * as fragments.  While sk_wmem_alloc is not below sk_sndbuf the caller
 * sleeps in sock_wait_for_wmem(), bounded by the timeout obtained from
 * sock_sndtimeo(sk, noblock).
 *
 * Returns the skb, charged to @sk with skb_set_owner_w(), or NULL with
 * *@errcode set to: the pending socket error, -EPIPE after a send
 * shutdown, -ENOBUFS when the skb or one of its pages cannot be
 * allocated, -EAGAIN when no timeout is left, or the value of
 * sock_intr_errno() when a signal is pending.
 */
static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
					    unsigned long header_len,
					    unsigned long data_len,
					    int noblock, int *errcode)
{
	struct sk_buff *skb;
	gfp_t gfp_mask;
	long timeo;
	int err;

	/* Allocations that are allowed to wait are also asked to repeat. */
	gfp_mask = sk->sk_allocation;
	if (gfp_mask & __GFP_WAIT)
		gfp_mask |= __GFP_REPEAT;

	timeo = sock_sndtimeo(sk, noblock);
	while (1) {
		err = sock_error(sk);
		if (err != 0)
			goto failure;

		err = -EPIPE;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			goto failure;

		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
			skb = alloc_skb(header_len, gfp_mask);
			if (skb) {
				int npages;
				int i;

				/* No pages, we're done... */
				if (!data_len)
					break;

				/* Round the paged part up to whole pages. */
				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
				skb->truesize += data_len;
				skb_shinfo(skb)->nr_frags = npages;
				for (i = 0; i < npages; i++) {
					struct page *page;
					skb_frag_t *frag;

					/* Note: pages use sk_allocation, not gfp_mask. */
					page = alloc_pages(sk->sk_allocation, 0);
					if (!page) {
						err = -ENOBUFS;
						/*
						 * Only i fragments were filled in;
						 * presumably kfree_skb() releases
						 * exactly nr_frags pages.
						 */
						skb_shinfo(skb)->nr_frags = i;
						kfree_skb(skb);
						goto failure;
					}

					frag = &skb_shinfo(skb)->frags[i];
					frag->page = page;
					frag->page_offset = 0;
					frag->size = (data_len >= PAGE_SIZE ?
						      PAGE_SIZE :
						      data_len);
					/*
					 * May wrap on the last fragment; data_len
					 * is not read again once the loop ends.
					 */
					data_len -= PAGE_SIZE;
				}

				/* Full success... */
				break;
			}
			err = -ENOBUFS;
			goto failure;
		}
		/* Send buffer is full: flag it, then wait or give up. */
		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		err = -EAGAIN;
		if (!timeo)
			goto failure;
		if (signal_pending(current))
			goto interrupted;
		timeo = sock_wait_for_wmem(sk, timeo);
	}

	skb_set_owner_w(skb, sk);
	return skb;

interrupted:
	err = sock_intr_errno(timeo);
failure:
	*errcode = err;
	return NULL;
}
1317
/*
 * Allocate a send skb with @size bytes of linear data and no page
 * fragments.  Blocking behaviour, return value and *@errcode are those of
 * sock_alloc_send_pskb().
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	const unsigned long paged_len = 0;

	return sock_alloc_send_pskb(sk, size, paged_len, noblock, errcode);
}
1323
1324 static void __lock_sock(struct sock *sk)
1325 {
1326         DEFINE_WAIT(wait);
1327
1328         for (;;) {
1329                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1330                                         TASK_UNINTERRUPTIBLE);
1331                 spin_unlock_bh(&sk->sk_lock.slock);
1332                 schedule();
1333                 spin_lock_bh(&sk->sk_lock.slock);
1334                 if (!sock_owned_by_user(sk))
1335                         break;
1336         }
1337         finish_wait(&sk->sk_lock.wq, &wait);
1338 }
1339
1340 static void __release_sock(struct sock *sk)
1341 {
1342         struct sk_buff *skb = sk->sk_backlog.head;
1343
1344         do {
1345                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1346                 bh_unlock_sock(sk);
1347
1348                 do {
1349                         struct sk_buff *next = skb->next;
1350
1351                         skb->next = NULL;
1352                         sk->sk_backlog_rcv(sk, skb);
1353
1354                         /*
1355                          * We are in process context here with softirqs
1356                          * disabled, use cond_resched_softirq() to preempt.
1357                          * This is safe to do because we've taken the backlog
1358                          * queue private:
1359                          */
1360                         cond_resched_softirq();
1361
1362                         skb = next;
1363                 } while (skb != NULL);
1364
1365                 bh_lock_sock(sk);
1366         } while ((skb = sk->sk_backlog.head) != NULL);
1367 }
1368
1369 /**
1370  * sk_wait_data - wait for data to arrive at sk_receive_queue
1371  * @sk:    sock to wait on
1372  * @timeo: for how long
1373  *
1374  * Now socket state including sk->sk_err is changed only under lock,
1375  * hence we may omit checks after joining wait queue.
1376  * We check receive queue before schedule() only as optimization;
1377  * it is very likely that release_sock() added new data.
1378  */
1379 int sk_wait_data(struct sock *sk, long *timeo)
1380 {
1381         int rc;
1382         DEFINE_WAIT(wait);
1383
1384         prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1385         set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1386         rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1387         clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1388         finish_wait(sk->sk_sleep, &wait);
1389         return rc;
1390 }
1391
1392 EXPORT_SYMBOL(sk_wait_data);
1393
/**
 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
 *	@sk: socket
 *	@size: memory size to allocate
 *	@kind: allocation type
 *
 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
 *	rmem allocation. This function assumes that protocols which have
 *	memory_pressure use sk_wmem_queued as write buffer accounting.
 *
 *	The charge is applied first and then checked against the protocol's
 *	sysctl_mem[] thresholds.  Returns 1 if the charge stands, or 0 if it
 *	was refused, in which case both counters have been rolled back.
 */
int __sk_mem_schedule(struct sock *sk, int size, int kind)
{
	struct proto *prot = sk->sk_prot;
	int amt = sk_mem_pages(size);
	int allocated;

	/* Charge optimistically; undone at the bottom if refused. */
	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
	allocated = atomic_add_return(amt, prot->memory_allocated);

	/* Under limit. */
	if (allocated <= prot->sysctl_mem[0]) {
		if (prot->memory_pressure && *prot->memory_pressure)
			*prot->memory_pressure = 0;
		return 1;
	}

	/* Under pressure. */
	if (allocated > prot->sysctl_mem[1])
		if (prot->enter_memory_pressure)
			prot->enter_memory_pressure();

	/* Over hard limit. */
	if (allocated > prot->sysctl_mem[2])
		goto suppress_allocation;

	/* guarantee minimum buffer size under pressure */
	if (kind == SK_MEM_RECV) {
		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
			return 1;
	} else { /* SK_MEM_SEND */
		if (sk->sk_type == SOCK_STREAM) {
			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
				return 1;
		} else if (atomic_read(&sk->sk_wmem_alloc) <
			   prot->sysctl_wmem[0])
				return 1;
	}

	/*
	 * Protocols that track memory pressure: allow the charge when not
	 * under pressure, or when this socket's usage in pages (queued
	 * writes + receive memory + forward allocation) multiplied by the
	 * number of allocated sockets stays below the hard limit.
	 */
	if (prot->memory_pressure) {
		if (!*prot->memory_pressure ||
		    prot->sysctl_mem[2] > atomic_read(prot->sockets_allocated) *
		    sk_mem_pages(sk->sk_wmem_queued +
				 atomic_read(&sk->sk_rmem_alloc) +
				 sk->sk_forward_alloc))
			return 1;
	}

suppress_allocation:

	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
		sk_stream_moderate_sndbuf(sk);

		/* Fail only if socket is _under_ its sndbuf.
		 * In this case we cannot block, so that we have to fail.
		 */
		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
			return 1;
	}

	/* Alas. Undo changes. */
	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
	atomic_sub(amt, prot->memory_allocated);
	return 0;
}

EXPORT_SYMBOL(__sk_mem_schedule);
1470
1471 /**
1472  *      __sk_reclaim - reclaim memory_allocated
1473  *      @sk: socket
1474  */
1475 void __sk_mem_reclaim(struct sock *sk)
1476 {
1477         struct proto *prot = sk->sk_prot;
1478
1479         atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
1480                    prot->memory_allocated);
1481         sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1482
1483         if (prot->memory_pressure && *prot->memory_pressure &&
1484             (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
1485                 *prot->memory_pressure = 0;
1486 }
1487
1488 EXPORT_SYMBOL(__sk_mem_reclaim);
1489
1490
1491 /*
1492  * Set of default routines for initialising struct proto_ops when
1493  * the protocol does not support a particular function. In certain
1494  * cases where it makes no sense for a protocol to have a "do nothing"
1495  * function, some default processing is provided.
1496  */
1497
/*
 * Default bind handler for protocols that do not support bind():
 * ignores its arguments and always fails with -EOPNOTSUPP.
 */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
1502
/*
 * Default connect handler for protocols that do not support connect():
 * ignores its arguments and always fails with -EOPNOTSUPP.
 */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
1508
/*
 * Default socketpair handler for protocols that do not support
 * socketpair(): always fails with -EOPNOTSUPP.
 */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
1513
/*
 * Default accept handler for protocols that do not support accept():
 * ignores its arguments and always fails with -EOPNOTSUPP.
 */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	return -EOPNOTSUPP;
}
1518
/*
 * Default getname handler for protocols that cannot report a local or
 * peer address: leaves @saddr and @len untouched and always fails with
 * -EOPNOTSUPP.
 */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	return -EOPNOTSUPP;
}
1524
1525 unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
1526 {
1527         return 0;
1528 }
1529
/*
 * Default ioctl handler for protocols without ioctl support: ignores
 * @cmd and @arg and always fails with -EOPNOTSUPP.
 */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
1534
/*
 * Default listen handler for protocols that do not support listen():
 * ignores @backlog and always fails with -EOPNOTSUPP.
 */
int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
1539
/*
 * Default shutdown handler for protocols that do not support shutdown():
 * ignores @how and always fails with -EOPNOTSUPP.
 */
int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
1544
1545 int sock_no_setsockopt(struct socket *sock, int level, int optname,
1546                     char __user *optval, int optlen)
1547 {
1548         return -EOPNOTSUPP;
1549 }
1550
1551 int sock_no_getsockopt(struct socket *sock, int level, int optname,
1552                     char __user *optval, int __user *optlen)
1553 {
1554         return -EOPNOTSUPP;
1555 }
1556
/*
 * Default sendmsg handler for protocols that cannot send: ignores the
 * message and always fails with -EOPNOTSUPP.
 */
int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len)
{
	return -EOPNOTSUPP;
}
1562
/*
 * Default recvmsg handler for protocols that cannot receive: ignores its
 * arguments and always fails with -EOPNOTSUPP.
 */
int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len, int flags)
{
	return -EOPNOTSUPP;
}
1568
/*
 * Default mmap handler for protocols that cannot be mapped.  Unlike the
 * other stubs this one returns -ENODEV, the same error a missing mmap
 * method produces.
 */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}
1574
1575 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1576 {
1577         ssize_t res;
1578         struct msghdr msg = {.msg_flags = flags};
1579         struct kvec iov;
1580         char *kaddr = kmap(page);
1581         iov.iov_base = kaddr + offset;
1582         iov.iov_len = size;
1583         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1584         kunmap(page);
1585         return res;
1586 }
1587
1588 /*
1589  *      Default Socket Callbacks
1590  */
1591
1592 static void sock_def_wakeup(struct sock *sk)
1593 {
1594         read_lock(&sk->sk_callback_lock);
1595         if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1596                 wake_up_interruptible_all(sk->sk_sleep);
1597         read_unlock(&sk->sk_callback_lock);
1598 }
1599
1600 static void sock_def_error_report(struct sock *sk)
1601 {
1602         read_lock(&sk->sk_callback_lock);
1603         if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1604                 wake_up_interruptible(sk->sk_sleep);
1605         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
1606         read_unlock(&sk->sk_callback_lock);
1607 }
1608
1609 static void sock_def_readable(struct sock *sk, int len)
1610 {
1611         read_lock(&sk->sk_callback_lock);
1612         if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1613                 wake_up_interruptible(sk->sk_sleep);
1614         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
1615         read_unlock(&sk->sk_callback_lock);
1616 }
1617
1618 static void sock_def_write_space(struct sock *sk)
1619 {
1620         read_lock(&sk->sk_callback_lock);
1621
1622         /* Do not wake up a writer until he can make "significant"
1623          * progress.  --DaveM
1624          */
1625         if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1626                 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1627                         wake_up_interruptible(sk->sk_sleep);
1628
1629                 /* Should agree with poll, otherwise some programs break */
1630                 if (sock_writeable(sk))
1631                         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
1632         }
1633
1634         read_unlock(&sk->sk_callback_lock);
1635 }
1636
1637 static void sock_def_destruct(struct sock *sk)
1638 {
1639         kfree(sk->sk_protinfo);
1640 }
1641
1642 void sk_send_sigurg(struct sock *sk)
1643 {
1644         if (sk->sk_socket && sk->sk_socket->file)
1645                 if (send_sigurg(&sk->sk_socket->file->f_owner))
1646                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
1647 }
1648
/*
 * (Re)arm @timer to fire at @expires on behalf of @sk.
 *
 * A reference on @sk is taken only when mod_timer() returns zero, i.e.
 * not when it returns non-zero for an already pending timer.
 */
void sk_reset_timer(struct sock *sk, struct timer_list* timer,
		    unsigned long expires)
{
	if (mod_timer(timer, expires))
		return;

	sock_hold(sk);
}

EXPORT_SYMBOL(sk_reset_timer);
1657
/*
 * Cancel @timer on behalf of @sk.
 *
 * The reference on @sk is dropped (with __sock_put()) only when the timer
 * was pending and del_timer() reports that it deactivated it.
 */
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
	if (!timer_pending(timer))
		return;

	if (del_timer(timer))
		__sock_put(sk);
}

EXPORT_SYMBOL(sk_stop_timer);
1665
1666 void sock_init_data(struct socket *sock, struct sock *sk)
1667 {
1668         skb_queue_head_init(&sk->sk_receive_queue);
1669         skb_queue_head_init(&sk->sk_write_queue);
1670         skb_queue_head_init(&sk->sk_error_queue);
1671 #ifdef CONFIG_NET_DMA
1672         skb_queue_head_init(&sk->sk_async_wait_queue);
1673 #endif
1674
1675         sk->sk_send_head        =       NULL;
1676
1677         init_timer(&sk->sk_timer);
1678
1679         sk->sk_allocation       =       GFP_KERNEL;
1680         sk->sk_rcvbuf           =       sysctl_rmem_default;
1681         sk->sk_sndbuf           =       sysctl_wmem_default;
1682         sk->sk_state            =       TCP_CLOSE;
1683         sk->sk_socket           =       sock;
1684
1685         sock_set_flag(sk, SOCK_ZAPPED);
1686
1687         if (sock) {
1688                 sk->sk_type     =       sock->type;
1689                 sk->sk_sleep    =       &sock->wait;
1690                 sock->sk        =       sk;
1691         } else
1692                 sk->sk_sleep    =       NULL;
1693
1694         rwlock_init(&sk->sk_dst_lock);
1695         rwlock_init(&sk->sk_callback_lock);
1696         lockdep_set_class_and_name(&sk->sk_callback_lock,
1697                         af_callback_keys + sk->sk_family,
1698                         af_family_clock_key_strings[sk->sk_family]);
1699
1700         sk->sk_state_change     =       sock_def_wakeup;
1701         sk->sk_data_ready       =       sock_def_readable;
1702         sk->sk_write_space      =       sock_def_write_space;
1703         sk->sk_error_report     =       sock_def_error_report;
1704         sk->sk_destruct         =       sock_def_destruct;
1705
1706         sk->sk_sndmsg_page      =       NULL;
1707         sk->sk_sndmsg_off       =       0;
1708
1709         sk->sk_peercred.pid     =       0;
1710         sk->sk_peercred.uid     =       -1;
1711         sk->sk_peercred.gid     =       -1;
1712         sk->sk_write_pending    =       0;
1713         sk->sk_rcvlowat         =       1;
1714         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
1715         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
1716
1717         sk->sk_stamp = ktime_set(-1L, -1L);
1718
1719         atomic_set(&sk->sk_refcnt, 1);
1720         atomic_set(&sk->sk_drops, 0);
1721 }
1722
1723 void fastcall lock_sock_nested(struct sock *sk, int subclass)
1724 {
1725         might_sleep();
1726         spin_lock_bh(&sk->sk_lock.slock);
1727         if (sk->sk_lock.owned)
1728                 __lock_sock(sk);
1729         sk->sk_lock.owned = 1;
1730         spin_unlock(&sk->sk_lock.slock);
1731         /*
1732          * The sk_lock has mutex_lock() semantics here:
1733          */
1734         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
1735         local_bh_enable();
1736 }
1737
1738 EXPORT_SYMBOL(lock_sock_nested);
1739
1740 void fastcall release_sock(struct sock *sk)
1741 {
1742         /*
1743          * The sk_lock has mutex_unlock() semantics:
1744          */
1745         mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
1746
1747         spin_lock_bh(&sk->sk_lock.slock);
1748         if (sk->sk_backlog.tail)
1749                 __release_sock(sk);
1750         sk->sk_lock.owned = 0;
1751         if (waitqueue_active(&sk->sk_lock.wq))
1752                 wake_up(&sk->sk_lock.wq);
1753         spin_unlock_bh(&sk->sk_lock.slock);
1754 }
1755 EXPORT_SYMBOL(release_sock);
1756
1757 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
1758 {
1759         struct timeval tv;
1760         if (!sock_flag(sk, SOCK_TIMESTAMP))
1761                 sock_enable_timestamp(sk);
1762         tv = ktime_to_timeval(sk->sk_stamp);
1763         if (tv.tv_sec == -1)
1764                 return -ENOENT;
1765         if (tv.tv_sec == 0) {
1766                 sk->sk_stamp = ktime_get_real();
1767                 tv = ktime_to_timeval(sk->sk_stamp);
1768         }
1769         return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
1770 }
1771 EXPORT_SYMBOL(sock_get_timestamp);
1772
1773 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
1774 {
1775         struct timespec ts;
1776         if (!sock_flag(sk, SOCK_TIMESTAMP))
1777                 sock_enable_timestamp(sk);
1778         ts = ktime_to_timespec(sk->sk_stamp);
1779         if (ts.tv_sec == -1)
1780                 return -ENOENT;
1781         if (ts.tv_sec == 0) {
1782                 sk->sk_stamp = ktime_get_real();
1783                 ts = ktime_to_timespec(sk->sk_stamp);
1784         }
1785         return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
1786 }
1787 EXPORT_SYMBOL(sock_get_timestampns);
1788
1789 void sock_enable_timestamp(struct sock *sk)
1790 {
1791         if (!sock_flag(sk, SOCK_TIMESTAMP)) {
1792                 sock_set_flag(sk, SOCK_TIMESTAMP);
1793                 net_enable_timestamp();
1794         }
1795 }
1796
/*
 *      Get a socket option on a socket.
 *
 *      FIX: POSIX 1003.1g is very ambiguous here. It states that
 *      asynchronous errors should be reported by getsockopt. We assume
 *      this means if you specify SO_ERROR (otherwise what's the point of it).
 */
1804 int sock_common_getsockopt(struct socket *sock, int level, int optname,
1805                            char __user *optval, int __user *optlen)
1806 {
1807         struct sock *sk = sock->sk;
1808
1809         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1810 }
1811
1812 EXPORT_SYMBOL(sock_common_getsockopt);
1813
1814 #ifdef CONFIG_COMPAT
1815 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
1816                                   char __user *optval, int __user *optlen)
1817 {
1818         struct sock *sk = sock->sk;
1819
1820         if (sk->sk_prot->compat_getsockopt != NULL)
1821                 return sk->sk_prot->compat_getsockopt(sk, level, optname,
1822                                                       optval, optlen);
1823         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1824 }
1825 EXPORT_SYMBOL(compat_sock_common_getsockopt);
1826 #endif
1827
1828 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
1829                         struct msghdr *msg, size_t size, int flags)
1830 {
1831         struct sock *sk = sock->sk;
1832         int addr_len = 0;
1833         int err;
1834
1835         err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
1836                                    flags & ~MSG_DONTWAIT, &addr_len);
1837         if (err >= 0)
1838                 msg->msg_namelen = addr_len;
1839         return err;
1840 }
1841
1842 EXPORT_SYMBOL(sock_common_recvmsg);
1843
1844 /*
1845  *      Set socket options on an inet socket.
1846  */
1847 int sock_common_setsockopt(struct socket *sock, int level, int optname,
1848                            char __user *optval, int optlen)
1849 {
1850         struct sock *sk = sock->sk;
1851
1852         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1853 }
1854
1855 EXPORT_SYMBOL(sock_common_setsockopt);
1856
1857 #ifdef CONFIG_COMPAT
1858 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
1859                                   char __user *optval, int optlen)
1860 {
1861         struct sock *sk = sock->sk;
1862
1863         if (sk->sk_prot->compat_setsockopt != NULL)
1864                 return sk->sk_prot->compat_setsockopt(sk, level, optname,
1865                                                       optval, optlen);
1866         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1867 }
1868 EXPORT_SYMBOL(compat_sock_common_setsockopt);
1869 #endif
1870
1871 void sk_common_release(struct sock *sk)
1872 {
1873         if (sk->sk_prot->destroy)
1874                 sk->sk_prot->destroy(sk);
1875
1876         /*
1877          * Observation: when sock_common_release is called, processes have
1878          * no access to socket. But net still has.
1879          * Step one, detach it from networking:
1880          *
1881          * A. Remove from hash tables.
1882          */
1883
1884         sk->sk_prot->unhash(sk);
1885
1886         /*
1887          * In this point socket cannot receive new packets, but it is possible
1888          * that some packets are in flight because some CPU runs receiver and
1889          * did hash table lookup before we unhashed socket. They will achieve
1890          * receive queue and will be purged by socket destructor.
1891          *
1892          * Also we still have packets pending on receive queue and probably,
1893          * our own packets waiting in device queues. sock_destroy will drain
1894          * receive queue, but transmitted packets will delay socket destruction
1895          * until the last reference will be released.
1896          */
1897
1898         sock_orphan(sk);
1899
1900         xfrm_sk_free_policy(sk);
1901
1902         sk_refcnt_debug_release(sk);
1903         sock_put(sk);
1904 }
1905
1906 EXPORT_SYMBOL(sk_common_release);
1907
/*
 * Registered protocols, chained through proto->node.  proto_list_lock is
 * held for writing while the list is modified and for reading while it is
 * walked.
 */
static LIST_HEAD(proto_list);
static DEFINE_RWLOCK(proto_list_lock);
1910
1911 int proto_register(struct proto *prot, int alloc_slab)
1912 {
1913         char *request_sock_slab_name = NULL;
1914         char *timewait_sock_slab_name;
1915
1916         if (pcounter_alloc(&prot->inuse) != 0) {
1917                 printk(KERN_CRIT "%s: Can't alloc inuse counters!\n", prot->name);
1918                 goto out;
1919         }
1920
1921         if (alloc_slab) {
1922                 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
1923                                                SLAB_HWCACHE_ALIGN, NULL);
1924
1925                 if (prot->slab == NULL) {
1926                         printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
1927                                prot->name);
1928                         goto out_free_inuse;
1929                 }
1930
1931                 if (prot->rsk_prot != NULL) {
1932                         static const char mask[] = "request_sock_%s";
1933
1934                         request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1935                         if (request_sock_slab_name == NULL)
1936                                 goto out_free_sock_slab;
1937
1938                         sprintf(request_sock_slab_name, mask, prot->name);
1939                         prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
1940                                                                  prot->rsk_prot->obj_size, 0,
1941                                                                  SLAB_HWCACHE_ALIGN, NULL);
1942
1943                         if (prot->rsk_prot->slab == NULL) {
1944                                 printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
1945                                        prot->name);
1946                                 goto out_free_request_sock_slab_name;
1947                         }
1948                 }
1949
1950                 if (prot->twsk_prot != NULL) {
1951                         static const char mask[] = "tw_sock_%s";
1952
1953                         timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1954
1955                         if (timewait_sock_slab_name == NULL)
1956                                 goto out_free_request_sock_slab;
1957
1958                         sprintf(timewait_sock_slab_name, mask, prot->name);
1959                         prot->twsk_prot->twsk_slab =
1960                                 kmem_cache_create(timewait_sock_slab_name,
1961                                                   prot->twsk_prot->twsk_obj_size,
1962                                                   0, SLAB_HWCACHE_ALIGN,
1963                                                   NULL);
1964                         if (prot->twsk_prot->twsk_slab == NULL)
1965                                 goto out_free_timewait_sock_slab_name;
1966                 }
1967         }
1968
1969         write_lock(&proto_list_lock);
1970         list_add(&prot->node, &proto_list);
1971         write_unlock(&proto_list_lock);
1972         return 0;
1973
1974 out_free_timewait_sock_slab_name:
1975         kfree(timewait_sock_slab_name);
1976 out_free_request_sock_slab:
1977         if (prot->rsk_prot && prot->rsk_prot->slab) {
1978                 kmem_cache_destroy(prot->rsk_prot->slab);
1979                 prot->rsk_prot->slab = NULL;
1980         }
1981 out_free_request_sock_slab_name:
1982         kfree(request_sock_slab_name);
1983 out_free_sock_slab:
1984         kmem_cache_destroy(prot->slab);
1985         prot->slab = NULL;
1986 out_free_inuse:
1987         pcounter_free(&prot->inuse);
1988 out:
1989         return -ENOBUFS;
1990 }
1991
1992 EXPORT_SYMBOL(proto_register);
1993
1994 void proto_unregister(struct proto *prot)
1995 {
1996         write_lock(&proto_list_lock);
1997         list_del(&prot->node);
1998         write_unlock(&proto_list_lock);
1999
2000         pcounter_free(&prot->inuse);
2001
2002         if (prot->slab != NULL) {
2003                 kmem_cache_destroy(prot->slab);
2004                 prot->slab = NULL;
2005         }
2006
2007         if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2008                 const char *name = kmem_cache_name(prot->rsk_prot->slab);
2009
2010                 kmem_cache_destroy(prot->rsk_prot->slab);
2011                 kfree(name);
2012                 prot->rsk_prot->slab = NULL;
2013         }
2014
2015         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2016                 const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab);
2017
2018                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2019                 kfree(name);
2020                 prot->twsk_prot->twsk_slab = NULL;
2021         }
2022 }
2023
2024 EXPORT_SYMBOL(proto_unregister);
2025
2026 #ifdef CONFIG_PROC_FS
2027 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2028         __acquires(proto_list_lock)
2029 {
2030         read_lock(&proto_list_lock);
2031         return seq_list_start_head(&proto_list, *pos);
2032 }
2033
2034 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2035 {
2036         return seq_list_next(v, &proto_list, pos);
2037 }
2038
2039 static void proto_seq_stop(struct seq_file *seq, void *v)
2040         __releases(proto_list_lock)
2041 {
2042         read_unlock(&proto_list_lock);
2043 }
2044
/* Map a method pointer to 'y' (set) or 'n' (NULL) for the proc listing. */
static char proto_method_implemented(const void *method)
{
	if (method != NULL)
		return 'y';

	return 'n';
}
2049
2050 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2051 {
2052         seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
2053                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2054                    proto->name,
2055                    proto->obj_size,
2056                    proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
2057                    proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
2058                    proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
2059                    proto->max_header,
2060                    proto->slab == NULL ? "no" : "yes",
2061                    module_name(proto->owner),
2062                    proto_method_implemented(proto->close),
2063                    proto_method_implemented(proto->connect),
2064                    proto_method_implemented(proto->disconnect),
2065                    proto_method_implemented(proto->accept),
2066                    proto_method_implemented(proto->ioctl),
2067                    proto_method_implemented(proto->init),
2068                    proto_method_implemented(proto->destroy),
2069                    proto_method_implemented(proto->shutdown),
2070                    proto_method_implemented(proto->setsockopt),
2071                    proto_method_implemented(proto->getsockopt),
2072                    proto_method_implemented(proto->sendmsg),
2073                    proto_method_implemented(proto->recvmsg),
2074                    proto_method_implemented(proto->sendpage),
2075                    proto_method_implemented(proto->bind),
2076                    proto_method_implemented(proto->backlog_rcv),
2077                    proto_method_implemented(proto->hash),
2078                    proto_method_implemented(proto->unhash),
2079                    proto_method_implemented(proto->get_port),
2080                    proto_method_implemented(proto->enter_memory_pressure));
2081 }
2082
2083 static int proto_seq_show(struct seq_file *seq, void *v)
2084 {
2085         if (v == &proto_list)
2086                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2087                            "protocol",
2088                            "size",
2089                            "sockets",
2090                            "memory",
2091                            "press",
2092                            "maxhdr",
2093                            "slab",
2094                            "module",
2095                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2096         else
2097                 proto_seq_printf(seq, list_entry(v, struct proto, node));
2098         return 0;
2099 }
2100
2101 static const struct seq_operations proto_seq_ops = {
2102         .start  = proto_seq_start,
2103         .next   = proto_seq_next,
2104         .stop   = proto_seq_stop,
2105         .show   = proto_seq_show,
2106 };
2107
2108 static int proto_seq_open(struct inode *inode, struct file *file)
2109 {
2110         return seq_open(file, &proto_seq_ops);
2111 }
2112
2113 static const struct file_operations proto_seq_fops = {
2114         .owner          = THIS_MODULE,
2115         .open           = proto_seq_open,
2116         .read           = seq_read,
2117         .llseek         = seq_lseek,
2118         .release        = seq_release,
2119 };
2120
2121 static int __init proto_init(void)
2122 {
2123         /* register /proc/net/protocols */
2124         return proc_net_fops_create(&init_net, "protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
2125 }
2126
2127 subsys_initcall(proto_init);
2128
2129 #endif /* PROC_FS */
2130
2131 EXPORT_SYMBOL(sk_alloc);
2132 EXPORT_SYMBOL(sk_free);
2133 EXPORT_SYMBOL(sk_send_sigurg);
2134 EXPORT_SYMBOL(sock_alloc_send_skb);
2135 EXPORT_SYMBOL(sock_init_data);
2136 EXPORT_SYMBOL(sock_kfree_s);
2137 EXPORT_SYMBOL(sock_kmalloc);
2138 EXPORT_SYMBOL(sock_no_accept);
2139 EXPORT_SYMBOL(sock_no_bind);
2140 EXPORT_SYMBOL(sock_no_connect);
2141 EXPORT_SYMBOL(sock_no_getname);
2142 EXPORT_SYMBOL(sock_no_getsockopt);
2143 EXPORT_SYMBOL(sock_no_ioctl);
2144 EXPORT_SYMBOL(sock_no_listen);
2145 EXPORT_SYMBOL(sock_no_mmap);
2146 EXPORT_SYMBOL(sock_no_poll);
2147 EXPORT_SYMBOL(sock_no_recvmsg);
2148 EXPORT_SYMBOL(sock_no_sendmsg);
2149 EXPORT_SYMBOL(sock_no_sendpage);
2150 EXPORT_SYMBOL(sock_no_setsockopt);
2151 EXPORT_SYMBOL(sock_no_shutdown);
2152 EXPORT_SYMBOL(sock_no_socketpair);
2153 EXPORT_SYMBOL(sock_rfree);
2154 EXPORT_SYMBOL(sock_setsockopt);
2155 EXPORT_SYMBOL(sock_wfree);
2156 EXPORT_SYMBOL(sock_wmalloc);
2157 EXPORT_SYMBOL(sock_i_uid);
2158 EXPORT_SYMBOL(sock_i_ino);
2159 EXPORT_SYMBOL(sysctl_optmem_max);