net/core/sock.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Generic socket support routines. Memory allocators, socket lock/release
   7  *              handler for protocols to use and generic option handler.
   8  *
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Florian La Roche, <flla@stud.uni-sb.de>
  13  *              Alan Cox, <A.Cox@swansea.ac.uk>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  *
  85  *
  86  *              This program is free software; you can redistribute it and/or
  87  *              modify it under the terms of the GNU General Public License
  88  *              as published by the Free Software Foundation; either version
  89  *              2 of the License, or (at your option) any later version.
  90  */
  91
  92 #include <linux/capability.h>
  93 #include <linux/errno.h>
  94 #include <linux/types.h>
  95 #include <linux/socket.h>
  96 #include <linux/in.h>
  97 #include <linux/kernel.h>
  98 #include <linux/module.h>
  99 #include <linux/proc_fs.h>
 100 #include <linux/seq_file.h>
 101 #include <linux/sched.h>
 102 #include <linux/timer.h>
 103 #include <linux/string.h>
 104 #include <linux/sockios.h>
 105 #include <linux/net.h>
 106 #include <linux/mm.h>
 107 #include <linux/slab.h>
 108 #include <linux/interrupt.h>
 109 #include <linux/poll.h>
 110 #include <linux/tcp.h>
 111 #include <linux/init.h>
 112 #include <linux/highmem.h>
 113
 114 #include <asm/uaccess.h>
 115 #include <asm/system.h>
 116
 117 #include <linux/netdevice.h>
 118 #include <net/protocol.h>
 119 #include <linux/skbuff.h>
 120 #include <net/net_namespace.h>
 121 #include <net/request_sock.h>
 122 #include <net/sock.h>
 123 #include <net/xfrm.h>
 124 #include <linux/ipsec.h>
 125
 126 #include <linux/filter.h>
 127
 128 #ifdef CONFIG_INET
 129 #include <net/tcp.h>
 130 #endif
 131
 132 /*
 133  * Each address family might have different locking rules, so we have
 134  * one slock key per address family:
 135  */
 136 static struct lock_class_key af_family_keys[AF_MAX];
 137 static struct lock_class_key af_family_slock_keys[AF_MAX];
 138
 139 /*
  140  * Make lock validator output more readable. (We pre-construct these
  141  * strings at build time, so that runtime initialization of socket
  142  * locks is fast):
 143  */
 144 static const char *af_family_key_strings[AF_MAX+1] = {
 145   "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
 146   "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
 147   "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
 148   "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
 149   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
 150   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
 151   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
 152   "sk_lock-21"       , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
 153   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
 154   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
 155   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
 156   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
 157   "sk_lock-AF_MAX"
 158 };
 159 static const char *af_family_slock_key_strings[AF_MAX+1] = {
 160   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
 161   "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
 162   "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
 163   "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
 164   "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
 165   "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
 166   "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
 167   "slock-21"       , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
 168   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
 169   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
 170   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
 171   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
 172   "slock-AF_MAX"
 173 };
 174 static const char *af_family_clock_key_strings[AF_MAX+1] = {
 175   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
 176   "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
 177   "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
 178   "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
 179   "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
 180   "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
 181   "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
 182   "clock-21"       , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
 183   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
 184   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
 185   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
 186   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
 187   "clock-AF_MAX"
 188 };
 189
 190 /*
 191  * sk_callback_lock locking rules are per-address-family,
 192  * so split the lock classes by using a per-AF key:
 193  */
 194 static struct lock_class_key af_callback_keys[AF_MAX];
 195
 196 /* Take into consideration the size of the struct sk_buff overhead in the
 197  * determination of these values, since that is non-constant across
 198  * platforms.  This makes socket queueing behavior and performance
 199  * not depend upon such differences.
 200  */
 201 #define _SK_MEM_PACKETS         256
 202 #define _SK_MEM_OVERHEAD        (sizeof(struct sk_buff) + 256)
 203 #define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 204 #define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 205
 206 /* Run time adjustable parameters. */
 207 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 208 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 209 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 210 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 211
  212 /* Maximal space eaten by iovec or ancillary data plus some space */
 213 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 214
 215 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 216 {
 217         struct timeval tv;
 218
 219         if (optlen < sizeof(tv))
 220                 return -EINVAL;
 221         if (copy_from_user(&tv, optval, sizeof(tv)))
 222                 return -EFAULT;
 223         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 224                 return -EDOM;
 225
 226         if (tv.tv_sec < 0) {
 227                 static int warned __read_mostly;
 228
 229                 *timeo_p = 0;
 230                 if (warned < 10 && net_ratelimit()) {
 231                         warned++;
 232                         printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
 233                                "tries to set negative timeout\n",
 234                                 current->comm, task_pid_nr(current));
 235                 }
 236                 return 0;
 237         }
 238         *timeo_p = MAX_SCHEDULE_TIMEOUT;
 239         if (tv.tv_sec == 0 && tv.tv_usec == 0)
 240                 return 0;
 241         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 242                 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
 243         return 0;
 244 }
 245
 246 static void sock_warn_obsolete_bsdism(const char *name)
 247 {
 248         static int warned;
 249         static char warncomm[TASK_COMM_LEN];
 250         if (strcmp(warncomm, current->comm) && warned < 5) {
 251                 strcpy(warncomm,  current->comm);
 252                 printk(KERN_WARNING "process `%s' is using obsolete "
 253                        "%s SO_BSDCOMPAT\n", warncomm, name);
 254                 warned++;
 255         }
 256 }
 257
 258 static void sock_disable_timestamp(struct sock *sk)
 259 {
 260         if (sock_flag(sk, SOCK_TIMESTAMP)) {
 261                 sock_reset_flag(sk, SOCK_TIMESTAMP);
 262                 net_disable_timestamp();
 263         }
 264 }
 265
 266
 267 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 268 {
 269         int err = 0;
 270         int skb_len;
 271
 272         /* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
 273            number of warnings when compiling with -W --ANK
 274          */
 275         if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
 276             (unsigned)sk->sk_rcvbuf) {
 277                 err = -ENOMEM;
 278                 goto out;
 279         }
 280
 281         err = sk_filter(sk, skb);
 282         if (err)
 283                 goto out;
 284
 285         if (!sk_rmem_schedule(sk, skb->truesize)) {
 286                 err = -ENOBUFS;
 287                 goto out;
 288         }
 289
 290         skb->dev = NULL;
 291         skb_set_owner_r(skb, sk);
 292
 293         /* Cache the SKB length before we tack it onto the receive
 294          * queue.  Once it is added it no longer belongs to us and
 295          * may be freed by other threads of control pulling packets
 296          * from the queue.
 297          */
 298         skb_len = skb->len;
 299
 300         skb_queue_tail(&sk->sk_receive_queue, skb);
 301
 302         if (!sock_flag(sk, SOCK_DEAD))
 303                 sk->sk_data_ready(sk, skb_len);
 304 out:
 305         return err;
 306 }
 307 EXPORT_SYMBOL(sock_queue_rcv_skb);
 308
 309 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
 310 {
 311         int rc = NET_RX_SUCCESS;
 312
 313         if (sk_filter(sk, skb))
 314                 goto discard_and_relse;
 315
 316         skb->dev = NULL;
 317
 318         if (nested)
 319                 bh_lock_sock_nested(sk);
 320         else
 321                 bh_lock_sock(sk);
 322         if (!sock_owned_by_user(sk)) {
 323                 /*
 324                  * trylock + unlock semantics:
 325                  */
 326                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 327
 328                 rc = sk_backlog_rcv(sk, skb);
 329
 330                 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 331         } else
 332                 sk_add_backlog(sk, skb);
 333         bh_unlock_sock(sk);
 334 out:
 335         sock_put(sk);
 336         return rc;
 337 discard_and_relse:
 338         kfree_skb(skb);
 339         goto out;
 340 }
 341 EXPORT_SYMBOL(sk_receive_skb);
 342
 343 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 344 {
 345         struct dst_entry *dst = sk->sk_dst_cache;
 346
 347         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 348                 sk->sk_dst_cache = NULL;
 349                 dst_release(dst);
 350                 return NULL;
 351         }
 352
 353         return dst;
 354 }
 355 EXPORT_SYMBOL(__sk_dst_check);
 356
 357 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 358 {
 359         struct dst_entry *dst = sk_dst_get(sk);
 360
 361         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 362                 sk_dst_reset(sk);
 363                 dst_release(dst);
 364                 return NULL;
 365         }
 366
 367         return dst;
 368 }
 369 EXPORT_SYMBOL(sk_dst_check);
 370
 371 static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
 372 {
 373         int ret = -ENOPROTOOPT;
 374 #ifdef CONFIG_NETDEVICES
 375         struct net *net = sock_net(sk);
 376         char devname[IFNAMSIZ];
 377         int index;
 378
 379         /* Sorry... */
 380         ret = -EPERM;
 381         if (!capable(CAP_NET_RAW))
 382                 goto out;
 383
 384         ret = -EINVAL;
 385         if (optlen < 0)
 386                 goto out;
 387
 388         /* Bind this socket to a particular device like "eth0",
 389          * as specified in the passed interface name. If the
 390          * name is "" or the option length is zero the socket
 391          * is not bound.
 392          */
 393         if (optlen > IFNAMSIZ - 1)
 394                 optlen = IFNAMSIZ - 1;
 395         memset(devname, 0, sizeof(devname));
 396
 397         ret = -EFAULT;
 398         if (copy_from_user(devname, optval, optlen))
 399                 goto out;
 400
 401         if (devname[0] == '\0') {
 402                 index = 0;
 403         } else {
 404                 struct net_device *dev = dev_get_by_name(net, devname);
 405
 406                 ret = -ENODEV;
 407                 if (!dev)
 408                         goto out;
 409
 410                 index = dev->ifindex;
 411                 dev_put(dev);
 412         }
 413
 414         lock_sock(sk);
 415         sk->sk_bound_dev_if = index;
 416         sk_dst_reset(sk);
 417         release_sock(sk);
 418
 419         ret = 0;
 420
 421 out:
 422 #endif
 423
 424         return ret;
 425 }
 426
 427 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 428 {
 429         if (valbool)
 430                 sock_set_flag(sk, bit);
 431         else
 432                 sock_reset_flag(sk, bit);
 433 }
 434
 435 /*
 436  *      This is meant for all protocols to use and covers goings on
 437  *      at the socket level. Everything here is generic.
 438  */
 439
 440 int sock_setsockopt(struct socket *sock, int level, int optname,
 441                     char __user *optval, int optlen)
 442 {
 443         struct sock *sk=sock->sk;
 444         int val;
 445         int valbool;
 446         struct linger ling;
 447         int ret = 0;
 448
 449         /*
 450          *      Options without arguments
 451          */
 452
 453         if (optname == SO_BINDTODEVICE)
 454                 return sock_bindtodevice(sk, optval, optlen);
 455
 456         if (optlen < sizeof(int))
 457                 return -EINVAL;
 458
 459         if (get_user(val, (int __user *)optval))
 460                 return -EFAULT;
 461
 462         valbool = val?1:0;
 463
 464         lock_sock(sk);
 465
 466         switch(optname) {
 467         case SO_DEBUG:
 468                 if (val && !capable(CAP_NET_ADMIN)) {
 469                         ret = -EACCES;
 470                 } else
 471                         sock_valbool_flag(sk, SOCK_DBG, valbool);
 472                 break;
 473         case SO_REUSEADDR:
 474                 sk->sk_reuse = valbool;
 475                 break;
 476         case SO_TYPE:
 477         case SO_ERROR:
 478                 ret = -ENOPROTOOPT;
 479                 break;
 480         case SO_DONTROUTE:
 481                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 482                 break;
 483         case SO_BROADCAST:
 484                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 485                 break;
 486         case SO_SNDBUF:
 487                 /* Don't error on this BSD doesn't and if you think
 488                    about it this is right. Otherwise apps have to
 489                    play 'guess the biggest size' games. RCVBUF/SNDBUF
 490                    are treated in BSD as hints */
 491
 492                 if (val > sysctl_wmem_max)
 493                         val = sysctl_wmem_max;
 494 set_sndbuf:
 495                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 496                 if ((val * 2) < SOCK_MIN_SNDBUF)
 497                         sk->sk_sndbuf = SOCK_MIN_SNDBUF;
 498                 else
 499                         sk->sk_sndbuf = val * 2;
 500
 501                 /*
 502                  *      Wake up sending tasks if we
 503                  *      upped the value.
 504                  */
 505                 sk->sk_write_space(sk);
 506                 break;
 507
 508         case SO_SNDBUFFORCE:
 509                 if (!capable(CAP_NET_ADMIN)) {
 510                         ret = -EPERM;
 511                         break;
 512                 }
 513                 goto set_sndbuf;
 514
 515         case SO_RCVBUF:
 516                 /* Don't error on this BSD doesn't and if you think
 517                    about it this is right. Otherwise apps have to
 518                    play 'guess the biggest size' games. RCVBUF/SNDBUF
 519                    are treated in BSD as hints */
 520
 521                 if (val > sysctl_rmem_max)
 522                         val = sysctl_rmem_max;
 523 set_rcvbuf:
 524                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 525                 /*
 526                  * We double it on the way in to account for
 527                  * "struct sk_buff" etc. overhead.   Applications
 528                  * assume that the SO_RCVBUF setting they make will
 529                  * allow that much actual data to be received on that
 530                  * socket.
 531                  *
 532                  * Applications are unaware that "struct sk_buff" and
 533                  * other overheads allocate from the receive buffer
 534                  * during socket buffer allocation.
 535                  *
 536                  * And after considering the possible alternatives,
 537                  * returning the value we actually used in getsockopt
 538                  * is the most desirable behavior.
 539                  */
 540                 if ((val * 2) < SOCK_MIN_RCVBUF)
 541                         sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
 542                 else
 543                         sk->sk_rcvbuf = val * 2;
 544                 break;
 545
 546         case SO_RCVBUFFORCE:
 547                 if (!capable(CAP_NET_ADMIN)) {
 548                         ret = -EPERM;
 549                         break;
 550                 }
 551                 goto set_rcvbuf;
 552
 553         case SO_KEEPALIVE:
 554 #ifdef CONFIG_INET
 555                 if (sk->sk_protocol == IPPROTO_TCP)
 556                         tcp_set_keepalive(sk, valbool);
 557 #endif
 558                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 559                 break;
 560
 561         case SO_OOBINLINE:
 562                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 563                 break;
 564
 565         case SO_NO_CHECK:
 566                 sk->sk_no_check = valbool;
 567                 break;
 568
 569         case SO_PRIORITY:
 570                 if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
 571                         sk->sk_priority = val;
 572                 else
 573                         ret = -EPERM;
 574                 break;
 575
 576         case SO_LINGER:
 577                 if (optlen < sizeof(ling)) {
 578                         ret = -EINVAL;  /* 1003.1g */
 579                         break;
 580                 }
 581                 if (copy_from_user(&ling,optval,sizeof(ling))) {
 582                         ret = -EFAULT;
 583                         break;
 584                 }
 585                 if (!ling.l_onoff)
 586                         sock_reset_flag(sk, SOCK_LINGER);
 587                 else {
 588 #if (BITS_PER_LONG == 32)
 589                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 590                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 591                         else
 592 #endif
 593                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 594                         sock_set_flag(sk, SOCK_LINGER);
 595                 }
 596                 break;
 597
 598         case SO_BSDCOMPAT:
 599                 sock_warn_obsolete_bsdism("setsockopt");
 600                 break;
 601
 602         case SO_PASSCRED:
 603                 if (valbool)
 604                         set_bit(SOCK_PASSCRED, &sock->flags);
 605                 else
 606                         clear_bit(SOCK_PASSCRED, &sock->flags);
 607                 break;
 608
 609         case SO_TIMESTAMP:
 610         case SO_TIMESTAMPNS:
 611                 if (valbool)  {
 612                         if (optname == SO_TIMESTAMP)
 613                                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 614                         else
 615                                 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 616                         sock_set_flag(sk, SOCK_RCVTSTAMP);
 617                         sock_enable_timestamp(sk);
 618                 } else {
 619                         sock_reset_flag(sk, SOCK_RCVTSTAMP);
 620                         sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 621                 }
 622                 break;
 623
 624         case SO_RCVLOWAT:
 625                 if (val < 0)
 626                         val = INT_MAX;
 627                 sk->sk_rcvlowat = val ? : 1;
 628                 break;
 629
 630         case SO_RCVTIMEO:
 631                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 632                 break;
 633
 634         case SO_SNDTIMEO:
 635                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 636                 break;
 637
 638         case SO_ATTACH_FILTER:
 639                 ret = -EINVAL;
 640                 if (optlen == sizeof(struct sock_fprog)) {
 641                         struct sock_fprog fprog;
 642
 643                         ret = -EFAULT;
 644                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 645                                 break;
 646
 647                         ret = sk_attach_filter(&fprog, sk);
 648                 }
 649                 break;
 650
 651         case SO_DETACH_FILTER:
 652                 ret = sk_detach_filter(sk);
 653                 break;
 654
 655         case SO_PASSSEC:
 656                 if (valbool)
 657                         set_bit(SOCK_PASSSEC, &sock->flags);
 658                 else
 659                         clear_bit(SOCK_PASSSEC, &sock->flags);
 660                 break;
 661         case SO_MARK:
 662                 if (!capable(CAP_NET_ADMIN))
 663                         ret = -EPERM;
 664                 else {
 665                         sk->sk_mark = val;
 666                 }
 667                 break;
 668
 669                 /* We implement the SO_SNDLOWAT etc to
 670                    not be settable (1003.1g 5.3) */
 671         default:
 672                 ret = -ENOPROTOOPT;
 673                 break;
 674         }
 675         release_sock(sk);
 676         return ret;
 677 }
 678
 679
 680 int sock_getsockopt(struct socket *sock, int level, int optname,
 681                     char __user *optval, int __user *optlen)
 682 {
 683         struct sock *sk = sock->sk;
 684
 685         union {
 686                 int val;
 687                 struct linger ling;
 688                 struct timeval tm;
 689         } v;
 690
 691         unsigned int lv = sizeof(int);
 692         int len;
 693
 694         if (get_user(len, optlen))
 695                 return -EFAULT;
 696         if (len < 0)
 697                 return -EINVAL;
 698
 699         switch(optname) {
 700         case SO_DEBUG:
 701                 v.val = sock_flag(sk, SOCK_DBG);
 702                 break;
 703
 704         case SO_DONTROUTE:
 705                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
 706                 break;
 707
 708         case SO_BROADCAST:
 709                 v.val = !!sock_flag(sk, SOCK_BROADCAST);
 710                 break;
 711
 712         case SO_SNDBUF:
 713                 v.val = sk->sk_sndbuf;
 714                 break;
 715
 716         case SO_RCVBUF:
 717                 v.val = sk->sk_rcvbuf;
 718                 break;
 719
 720         case SO_REUSEADDR:
 721                 v.val = sk->sk_reuse;
 722                 break;
 723
 724         case SO_KEEPALIVE:
 725                 v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
 726                 break;
 727
 728         case SO_TYPE:
 729                 v.val = sk->sk_type;
 730                 break;
 731
 732         case SO_ERROR:
 733                 v.val = -sock_error(sk);
 734                 if (v.val==0)
 735                         v.val = xchg(&sk->sk_err_soft, 0);
 736                 break;
 737
 738         case SO_OOBINLINE:
 739                 v.val = !!sock_flag(sk, SOCK_URGINLINE);
 740                 break;
 741
 742         case SO_NO_CHECK:
 743                 v.val = sk->sk_no_check;
 744                 break;
 745
 746         case SO_PRIORITY:
 747                 v.val = sk->sk_priority;
 748                 break;
 749
 750         case SO_LINGER:
 751                 lv              = sizeof(v.ling);
 752                 v.ling.l_onoff  = !!sock_flag(sk, SOCK_LINGER);
 753                 v.ling.l_linger = sk->sk_lingertime / HZ;
 754                 break;
 755
 756         case SO_BSDCOMPAT:
 757                 sock_warn_obsolete_bsdism("getsockopt");
 758                 break;
 759
 760         case SO_TIMESTAMP:
 761                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
 762                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
 763                 break;
 764
 765         case SO_TIMESTAMPNS:
 766                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
 767                 break;
 768
 769         case SO_RCVTIMEO:
 770                 lv=sizeof(struct timeval);
 771                 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
 772                         v.tm.tv_sec = 0;
 773                         v.tm.tv_usec = 0;
 774                 } else {
 775                         v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
 776                         v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
 777                 }
 778                 break;
 779
 780         case SO_SNDTIMEO:
 781                 lv=sizeof(struct timeval);
 782                 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
 783                         v.tm.tv_sec = 0;
 784                         v.tm.tv_usec = 0;
 785                 } else {
 786                         v.tm.tv_sec = sk->sk_sndtimeo / HZ;
 787                         v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
 788                 }
 789                 break;
 790
 791         case SO_RCVLOWAT:
 792                 v.val = sk->sk_rcvlowat;
 793                 break;
 794
 795         case SO_SNDLOWAT:
 796                 v.val=1;
 797                 break;
 798
 799         case SO_PASSCRED:
 800                 v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
 801                 break;
 802
 803         case SO_PEERCRED:
 804                 if (len > sizeof(sk->sk_peercred))
 805                         len = sizeof(sk->sk_peercred);
 806                 if (copy_to_user(optval, &sk->sk_peercred, len))
 807                         return -EFAULT;
 808                 goto lenout;
 809
 810         case SO_PEERNAME:
 811         {
 812                 char address[128];
 813
 814                 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
 815                         return -ENOTCONN;
 816                 if (lv < len)
 817                         return -EINVAL;
 818                 if (copy_to_user(optval, address, len))
 819                         return -EFAULT;
 820                 goto lenout;
 821         }
 822
 823         /* Dubious BSD thing... Probably nobody even uses it, but
 824          * the UNIX standard wants it for whatever reason... -DaveM
 825          */
 826         case SO_ACCEPTCONN:
 827                 v.val = sk->sk_state == TCP_LISTEN;
 828                 break;
 829
 830         case SO_PASSSEC:
 831                 v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
 832                 break;
 833
 834         case SO_PEERSEC:
 835                 return security_socket_getpeersec_stream(sock, optval, optlen, len);
 836
 837         case SO_MARK:
 838                 v.val = sk->sk_mark;
 839                 break;
 840
 841         default:
 842                 return -ENOPROTOOPT;
 843         }
 844
 845         if (len > lv)
 846                 len = lv;
 847         if (copy_to_user(optval, &v, len))
 848                 return -EFAULT;
 849 lenout:
 850         if (put_user(len, optlen))
 851                 return -EFAULT;
 852         return 0;
 853 }
 854
 855 /*
 856  * Initialize an sk_lock.
 857  *
 858  * (We also register the sk_lock with the lock validator.)
 859  */
 860 static inline void sock_lock_init(struct sock *sk)
 861 {
 862         sock_lock_init_class_and_name(sk,
 863                         af_family_slock_key_strings[sk->sk_family],
 864                         af_family_slock_keys + sk->sk_family,
 865                         af_family_key_strings[sk->sk_family],
 866                         af_family_keys + sk->sk_family);
 867 }
 868
 869 static void sock_copy(struct sock *nsk, const struct sock *osk)
 870 {
 871 #ifdef CONFIG_SECURITY_NETWORK
 872         void *sptr = nsk->sk_security;
 873 #endif
 874
 875         memcpy(nsk, osk, osk->sk_prot->obj_size);
 876 #ifdef CONFIG_SECURITY_NETWORK
 877         nsk->sk_security = sptr;
 878         security_sk_clone(osk, nsk);
 879 #endif
 880 }
 881
 882 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
 883                 int family)
 884 {
 885         struct sock *sk;
 886         struct kmem_cache *slab;
 887
 888         slab = prot->slab;
 889         if (slab != NULL)
 890                 sk = kmem_cache_alloc(slab, priority);
 891         else
 892                 sk = kmalloc(prot->obj_size, priority);
 893
 894         if (sk != NULL) {
 895                 if (security_sk_alloc(sk, family, priority))
 896                         goto out_free;
 897
 898                 if (!try_module_get(prot->owner))
 899                         goto out_free_sec;
 900         }
 901
 902         return sk;
 903
 904 out_free_sec:
 905         security_sk_free(sk);
 906 out_free:
 907         if (slab != NULL)
 908                 kmem_cache_free(slab, sk);
 909         else
 910                 kfree(sk);
 911         return NULL;
 912 }
 913
 914 static void sk_prot_free(struct proto *prot, struct sock *sk)
 915 {
 916         struct kmem_cache *slab;
 917         struct module *owner;
 918
 919         owner = prot->owner;
 920         slab = prot->slab;
 921
 922         security_sk_free(sk);
 923         if (slab != NULL)
 924                 kmem_cache_free(slab, sk);
 925         else
 926                 kfree(sk);
 927         module_put(owner);
 928 }
 929
 930 /**
 931  *      sk_alloc - All socket objects are allocated here
 932  *      @net: the applicable net namespace
 933  *      @family: protocol family
 934  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 935  *      @prot: struct proto associated with this new sock instance
 936  */
 937 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 938                       struct proto *prot)
 939 {
 940         struct sock *sk;
 941
 942         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
 943         if (sk) {
 944                 sk->sk_family = family;
 945                 /*
 946                  * See comment in struct sock definition to understand
 947                  * why we need sk_prot_creator -acme
 948                  */
 949                 sk->sk_prot = sk->sk_prot_creator = prot;
 950                 sock_lock_init(sk);
 951                 sock_net_set(sk, get_net(net));
 952         }
 953
 954         return sk;
 955 }
 956
 957 void sk_free(struct sock *sk)
 958 {
 959         struct sk_filter *filter;
 960
 961         if (sk->sk_destruct)
 962                 sk->sk_destruct(sk);
 963
 964         filter = rcu_dereference(sk->sk_filter);
 965         if (filter) {
 966                 sk_filter_uncharge(sk, filter);
 967                 rcu_assign_pointer(sk->sk_filter, NULL);
 968         }
 969
 970         sock_disable_timestamp(sk);
 971
 972         if (atomic_read(&sk->sk_omem_alloc))
 973                 printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
 974                        __func__, atomic_read(&sk->sk_omem_alloc));
 975
 976         put_net(sock_net(sk));
 977         sk_prot_free(sk->sk_prot_creator, sk);
 978 }
 979
 980 /*
 981  * Last sock_put should drop referrence to sk->sk_net. It has already
 982  * been dropped in sk_change_net. Taking referrence to stopping namespace
 983  * is not an option.
 984  * Take referrence to a socket to remove it from hash _alive_ and after that
 985  * destroy it in the context of init_net.
 986  */
 987 void sk_release_kernel(struct sock *sk)
 988 {
 989         if (sk == NULL || sk->sk_socket == NULL)
 990                 return;
 991
 992         sock_hold(sk);
 993         sock_release(sk->sk_socket);
 994         release_net(sock_net(sk));
 995         sock_net_set(sk, get_net(&init_net));
 996         sock_put(sk);
 997 }
 998 EXPORT_SYMBOL(sk_release_kernel);
 999
1000 struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
1001 {
1002         struct sock *newsk;
1003
1004         newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1005         if (newsk != NULL) {
1006                 struct sk_filter *filter;
1007
1008                 sock_copy(newsk, sk);
1009
1010                 /* SANITY */
1011                 get_net(sock_net(newsk));
1012                 sk_node_init(&newsk->sk_node);
1013                 sock_lock_init(newsk);
1014                 bh_lock_sock(newsk);
1015                 newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1016
1017                 atomic_set(&newsk->sk_rmem_alloc, 0);
1018                 atomic_set(&newsk->sk_wmem_alloc, 0);
1019                 atomic_set(&newsk->sk_omem_alloc, 0);
1020                 skb_queue_head_init(&newsk->sk_receive_queue);
1021                 skb_queue_head_init(&newsk->sk_write_queue);
1022 #ifdef CONFIG_NET_DMA
1023                 skb_queue_head_init(&newsk->sk_async_wait_queue);
1024 #endif
1025
1026                 rwlock_init(&newsk->sk_dst_lock);
1027                 rwlock_init(&newsk->sk_callback_lock);
1028                 lockdep_set_class_and_name(&newsk->sk_callback_lock,
1029                                 af_callback_keys + newsk->sk_family,
1030                                 af_family_clock_key_strings[newsk->sk_family]);
1031
1032                 newsk->sk_dst_cache     = NULL;
1033                 newsk->sk_wmem_queued   = 0;
1034                 newsk->sk_forward_alloc = 0;
1035                 newsk->sk_send_head     = NULL;
1036                 newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1037
1038                 sock_reset_flag(newsk, SOCK_DONE);
1039                 skb_queue_head_init(&newsk->sk_error_queue);
1040
1041                 filter = newsk->sk_filter;
1042                 if (filter != NULL)
1043                         sk_filter_charge(newsk, filter);
1044
1045                 if (unlikely(xfrm_sk_clone_policy(newsk))) {
1046                         /* It is still raw copy of parent, so invalidate
1047                          * destructor and make plain sk_free() */
1048                         newsk->sk_destruct = NULL;
1049                         sk_free(newsk);
1050                         newsk = NULL;
1051                         goto out;
1052                 }
1053
1054                 newsk->sk_err      = 0;
1055                 newsk->sk_priority = 0;
1056                 atomic_set(&newsk->sk_refcnt, 2);
1057
1058                 /*
1059                  * Increment the counter in the same struct proto as the master
1060                  * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1061                  * is the same as sk->sk_prot->socks, as this field was copied
1062                  * with memcpy).
1063                  *
1064                  * This _changes_ the previous behaviour, where
1065                  * tcp_create_openreq_child always was incrementing the
1066                  * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1067                  * to be taken into account in all callers. -acme
1068                  */
1069                 sk_refcnt_debug_inc(newsk);
1070                 sk_set_socket(newsk, NULL);
1071                 newsk->sk_sleep  = NULL;
1072
1073                 if (newsk->sk_prot->sockets_allocated)
1074                         percpu_counter_inc(newsk->sk_prot->sockets_allocated);
1075         }
1076 out:
1077         return newsk;
1078 }
1079
1080 EXPORT_SYMBOL_GPL(sk_clone);
1081
1082 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1083 {
1084         __sk_dst_set(sk, dst);
1085         sk->sk_route_caps = dst->dev->features;
1086         if (sk->sk_route_caps & NETIF_F_GSO)
1087                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1088         if (sk_can_gso(sk)) {
1089                 if (dst->header_len) {
1090                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1091                 } else {
1092                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1093                         sk->sk_gso_max_size = dst->dev->gso_max_size;
1094                 }
1095         }
1096 }
1097 EXPORT_SYMBOL_GPL(sk_setup_caps);
1098
1099 void __init sk_init(void)
1100 {
1101         if (num_physpages <= 4096) {
1102                 sysctl_wmem_max = 32767;
1103                 sysctl_rmem_max = 32767;
1104                 sysctl_wmem_default = 32767;
1105                 sysctl_rmem_default = 32767;
1106         } else if (num_physpages >= 131072) {
1107                 sysctl_wmem_max = 131071;
1108                 sysctl_rmem_max = 131071;
1109         }
1110 }
1111
1112 /*
1113  *      Simple resource managers for sockets.
1114  */
1115
1116
1117 /*
1118  * Write buffer destructor automatically called from kfree_skb.
1119  */
1120 void sock_wfree(struct sk_buff *skb)
1121 {
1122         struct sock *sk = skb->sk;
1123
1124         /* In case it might be waiting for more memory. */
1125         atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
1126         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
1127                 sk->sk_write_space(sk);
1128         sock_put(sk);
1129 }
1130
1131 /*
1132  * Read buffer destructor automatically called from kfree_skb.
1133  */
1134 void sock_rfree(struct sk_buff *skb)
1135 {
1136         struct sock *sk = skb->sk;
1137
1138         skb_truesize_check(skb);
1139         atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
1140         sk_mem_uncharge(skb->sk, skb->truesize);
1141 }
1142
1143
1144 int sock_i_uid(struct sock *sk)
1145 {
1146         int uid;
1147
1148         read_lock(&sk->sk_callback_lock);
1149         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1150         read_unlock(&sk->sk_callback_lock);
1151         return uid;
1152 }
1153
1154 unsigned long sock_i_ino(struct sock *sk)
1155 {
1156         unsigned long ino;
1157
1158         read_lock(&sk->sk_callback_lock);
1159         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1160         read_unlock(&sk->sk_callback_lock);
1161         return ino;
1162 }
1163
1164 /*
1165  * Allocate a skb from the socket's send buffer.
1166  */
1167 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1168                              gfp_t priority)
1169 {
1170         if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1171                 struct sk_buff * skb = alloc_skb(size, priority);
1172                 if (skb) {
1173                         skb_set_owner_w(skb, sk);
1174                         return skb;
1175                 }
1176         }
1177         return NULL;
1178 }
1179
1180 /*
1181  * Allocate a skb from the socket's receive buffer.
1182  */
1183 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1184                              gfp_t priority)
1185 {
1186         if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1187                 struct sk_buff *skb = alloc_skb(size, priority);
1188                 if (skb) {
1189                         skb_set_owner_r(skb, sk);
1190                         return skb;
1191                 }
1192         }
1193         return NULL;
1194 }
1195
1196 /*
1197  * Allocate a memory block from the socket's option memory buffer.
1198  */
1199 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1200 {
1201         if ((unsigned)size <= sysctl_optmem_max &&
1202             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1203                 void *mem;
1204                 /* First do the add, to avoid the race if kmalloc
1205                  * might sleep.
1206                  */
1207                 atomic_add(size, &sk->sk_omem_alloc);
1208                 mem = kmalloc(size, priority);
1209                 if (mem)
1210                         return mem;
1211                 atomic_sub(size, &sk->sk_omem_alloc);
1212         }
1213         return NULL;
1214 }
1215
1216 /*
1217  * Free an option memory block.
1218  */
1219 void sock_kfree_s(struct sock *sk, void *mem, int size)
1220 {
1221         kfree(mem);
1222         atomic_sub(size, &sk->sk_omem_alloc);
1223 }
1224
1225 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1226    I think, these locks should be removed for datagram sockets.
1227  */
1228 static long sock_wait_for_wmem(struct sock * sk, long timeo)
1229 {
1230         DEFINE_WAIT(wait);
1231
1232         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1233         for (;;) {
1234                 if (!timeo)
1235                         break;
1236                 if (signal_pending(current))
1237                         break;
1238                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1239                 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1240                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1241                         break;
1242                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1243                         break;
1244                 if (sk->sk_err)
1245                         break;
1246                 timeo = schedule_timeout(timeo);
1247         }
1248         finish_wait(sk->sk_sleep, &wait);
1249         return timeo;
1250 }
1251
1252
1253 /*
1254  *      Generic send/receive buffer handlers
1255  */
1256
1257 static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
1258                                             unsigned long header_len,
1259                                             unsigned long data_len,
1260                                             int noblock, int *errcode)
1261 {
1262         struct sk_buff *skb;
1263         gfp_t gfp_mask;
1264         long timeo;
1265         int err;
1266
1267         gfp_mask = sk->sk_allocation;
1268         if (gfp_mask & __GFP_WAIT)
1269                 gfp_mask |= __GFP_REPEAT;
1270
1271         timeo = sock_sndtimeo(sk, noblock);
1272         while (1) {
1273                 err = sock_error(sk);
1274                 if (err != 0)
1275                         goto failure;
1276
1277                 err = -EPIPE;
1278                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1279                         goto failure;
1280
1281                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1282                         skb = alloc_skb(header_len, gfp_mask);
1283                         if (skb) {
1284                                 int npages;
1285                                 int i;
1286
1287                                 /* No pages, we're done... */
1288                                 if (!data_len)
1289                                         break;
1290
1291                                 npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1292                                 skb->truesize += data_len;
1293                                 skb_shinfo(skb)->nr_frags = npages;
1294                                 for (i = 0; i < npages; i++) {
1295                                         struct page *page;
1296                                         skb_frag_t *frag;
1297
1298                                         page = alloc_pages(sk->sk_allocation, 0);
1299                                         if (!page) {
1300                                                 err = -ENOBUFS;
1301                                                 skb_shinfo(skb)->nr_frags = i;
1302                                                 kfree_skb(skb);
1303                                                 goto failure;
1304                                         }
1305
1306                                         frag = &skb_shinfo(skb)->frags[i];
1307                                         frag->page = page;
1308                                         frag->page_offset = 0;
1309                                         frag->size = (data_len >= PAGE_SIZE ?
1310                                                       PAGE_SIZE :
1311                                                       data_len);
1312                                         data_len -= PAGE_SIZE;
1313                                 }
1314
1315                                 /* Full success... */
1316                                 break;
1317                         }
1318                         err = -ENOBUFS;
1319                         goto failure;
1320                 }
1321                 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1322                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1323                 err = -EAGAIN;
1324                 if (!timeo)
1325                         goto failure;
1326                 if (signal_pending(current))
1327                         goto interrupted;
1328                 timeo = sock_wait_for_wmem(sk, timeo);
1329         }
1330
1331         skb_set_owner_w(skb, sk);
1332         return skb;
1333
1334 interrupted:
1335         err = sock_intr_errno(timeo);
1336 failure:
1337         *errcode = err;
1338         return NULL;
1339 }
1340
/*
 * Allocate a send skb of @size bytes with no paged data; a thin wrapper
 * around sock_alloc_send_pskb() with data_len fixed at 0.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
}
1346
1347 static void __lock_sock(struct sock *sk)
1348 {
1349         DEFINE_WAIT(wait);
1350
1351         for (;;) {
1352                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1353                                         TASK_UNINTERRUPTIBLE);
1354                 spin_unlock_bh(&sk->sk_lock.slock);
1355                 schedule();
1356                 spin_lock_bh(&sk->sk_lock.slock);
1357                 if (!sock_owned_by_user(sk))
1358                         break;
1359         }
1360         finish_wait(&sk->sk_lock.wq, &wait);
1361 }
1362
1363 static void __release_sock(struct sock *sk)
1364 {
1365         struct sk_buff *skb = sk->sk_backlog.head;
1366
1367         do {
1368                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1369                 bh_unlock_sock(sk);
1370
1371                 do {
1372                         struct sk_buff *next = skb->next;
1373
1374                         skb->next = NULL;
1375                         sk_backlog_rcv(sk, skb);
1376
1377                         /*
1378                          * We are in process context here with softirqs
1379                          * disabled, use cond_resched_softirq() to preempt.
1380                          * This is safe to do because we've taken the backlog
1381                          * queue private:
1382                          */
1383                         cond_resched_softirq();
1384
1385                         skb = next;
1386                 } while (skb != NULL);
1387
1388                 bh_lock_sock(sk);
1389         } while ((skb = sk->sk_backlog.head) != NULL);
1390 }
1391
1392 /**
1393  * sk_wait_data - wait for data to arrive at sk_receive_queue
1394  * @sk:    sock to wait on
1395  * @timeo: for how long
1396  *
1397  * Now socket state including sk->sk_err is changed only under lock,
1398  * hence we may omit checks after joining wait queue.
1399  * We check receive queue before schedule() only as optimization;
1400  * it is very likely that release_sock() added new data.
1401  */
1402 int sk_wait_data(struct sock *sk, long *timeo)
1403 {
1404         int rc;
1405         DEFINE_WAIT(wait);
1406
1407         prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1408         set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1409         rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1410         clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1411         finish_wait(sk->sk_sleep, &wait);
1412         return rc;
1413 }
1414
1415 EXPORT_SYMBOL(sk_wait_data);
1416
1417 /**
1418  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1419  *      @sk: socket
1420  *      @size: memory size to allocate
1421  *      @kind: allocation type
1422  *
1423  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1424  *      rmem allocation. This function assumes that protocols which have
1425  *      memory_pressure use sk_wmem_queued as write buffer accounting.
1426  */
1427 int __sk_mem_schedule(struct sock *sk, int size, int kind)
1428 {
1429         struct proto *prot = sk->sk_prot;
1430         int amt = sk_mem_pages(size);
1431         int allocated;
1432
1433         sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1434         allocated = atomic_add_return(amt, prot->memory_allocated);
1435
1436         /* Under limit. */
1437         if (allocated <= prot->sysctl_mem[0]) {
1438                 if (prot->memory_pressure && *prot->memory_pressure)
1439                         *prot->memory_pressure = 0;
1440                 return 1;
1441         }
1442
1443         /* Under pressure. */
1444         if (allocated > prot->sysctl_mem[1])
1445                 if (prot->enter_memory_pressure)
1446                         prot->enter_memory_pressure(sk);
1447
1448         /* Over hard limit. */
1449         if (allocated > prot->sysctl_mem[2])
1450                 goto suppress_allocation;
1451
1452         /* guarantee minimum buffer size under pressure */
1453         if (kind == SK_MEM_RECV) {
1454                 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1455                         return 1;
1456         } else { /* SK_MEM_SEND */
1457                 if (sk->sk_type == SOCK_STREAM) {
1458                         if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1459                                 return 1;
1460                 } else if (atomic_read(&sk->sk_wmem_alloc) <
1461                            prot->sysctl_wmem[0])
1462                                 return 1;
1463         }
1464
1465         if (prot->memory_pressure) {
1466                 int alloc;
1467
1468                 if (!*prot->memory_pressure)
1469                         return 1;
1470                 alloc = percpu_counter_read_positive(prot->sockets_allocated);
1471                 if (prot->sysctl_mem[2] > alloc *
1472                     sk_mem_pages(sk->sk_wmem_queued +
1473                                  atomic_read(&sk->sk_rmem_alloc) +
1474                                  sk->sk_forward_alloc))
1475                         return 1;
1476         }
1477
1478 suppress_allocation:
1479
1480         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1481                 sk_stream_moderate_sndbuf(sk);
1482
1483                 /* Fail only if socket is _under_ its sndbuf.
1484                  * In this case we cannot block, so that we have to fail.
1485                  */
1486                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1487                         return 1;
1488         }
1489
1490         /* Alas. Undo changes. */
1491         sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1492         atomic_sub(amt, prot->memory_allocated);
1493         return 0;
1494 }
1495
1496 EXPORT_SYMBOL(__sk_mem_schedule);
1497
1498 /**
1499  *      __sk_reclaim - reclaim memory_allocated
1500  *      @sk: socket
1501  */
1502 void __sk_mem_reclaim(struct sock *sk)
1503 {
1504         struct proto *prot = sk->sk_prot;
1505
1506         atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
1507                    prot->memory_allocated);
1508         sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1509
1510         if (prot->memory_pressure && *prot->memory_pressure &&
1511             (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
1512                 *prot->memory_pressure = 0;
1513 }
1514
1515 EXPORT_SYMBOL(__sk_mem_reclaim);
1516
1517
1518 /*
1519  * Set of default routines for initialising struct proto_ops when
1520  * the protocol does not support a particular function. In certain
1521  * cases where it makes no sense for a protocol to have a "do nothing"
1522  * function, some default processing is provided.
1523  */
1524
1525 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1526 {
1527         return -EOPNOTSUPP;
1528 }
1529
1530 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1531                     int len, int flags)
1532 {
1533         return -EOPNOTSUPP;
1534 }
1535
1536 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1537 {
1538         return -EOPNOTSUPP;
1539 }
1540
1541 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1542 {
1543         return -EOPNOTSUPP;
1544 }
1545
1546 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1547                     int *len, int peer)
1548 {
1549         return -EOPNOTSUPP;
1550 }
1551
1552 unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
1553 {
1554         return 0;
1555 }
1556
1557 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1558 {
1559         return -EOPNOTSUPP;
1560 }
1561
1562 int sock_no_listen(struct socket *sock, int backlog)
1563 {
1564         return -EOPNOTSUPP;
1565 }
1566
1567 int sock_no_shutdown(struct socket *sock, int how)
1568 {
1569         return -EOPNOTSUPP;
1570 }
1571
1572 int sock_no_setsockopt(struct socket *sock, int level, int optname,
1573                     char __user *optval, int optlen)
1574 {
1575         return -EOPNOTSUPP;
1576 }
1577
1578 int sock_no_getsockopt(struct socket *sock, int level, int optname,
1579                     char __user *optval, int __user *optlen)
1580 {
1581         return -EOPNOTSUPP;
1582 }
1583
1584 int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1585                     size_t len)
1586 {
1587         return -EOPNOTSUPP;
1588 }
1589
1590 int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1591                     size_t len, int flags)
1592 {
1593         return -EOPNOTSUPP;
1594 }
1595
1596 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1597 {
1598         /* Mirror missing mmap method error code */
1599         return -ENODEV;
1600 }
1601
1602 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1603 {
1604         ssize_t res;
1605         struct msghdr msg = {.msg_flags = flags};
1606         struct kvec iov;
1607         char *kaddr = kmap(page);
1608         iov.iov_base = kaddr + offset;
1609         iov.iov_len = size;
1610         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1611         kunmap(page);
1612         return res;
1613 }
1614
1615 /*
1616  *      Default Socket Callbacks
1617  */
1618
1619 static void sock_def_wakeup(struct sock *sk)
1620 {
1621         read_lock(&sk->sk_callback_lock);
1622         if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1623                 wake_up_interruptible_all(sk->sk_sleep);
1624         read_unlock(&sk->sk_callback_lock);
1625 }
1626
1627 static void sock_def_error_report(struct sock *sk)
1628 {
1629         read_lock(&sk->sk_callback_lock);
1630         if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1631                 wake_up_interruptible(sk->sk_sleep);
1632         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
1633         read_unlock(&sk->sk_callback_lock);
1634 }
1635
1636 static void sock_def_readable(struct sock *sk, int len)
1637 {
1638         read_lock(&sk->sk_callback_lock);
1639         if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1640                 wake_up_interruptible_sync(sk->sk_sleep);
1641         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
1642         read_unlock(&sk->sk_callback_lock);
1643 }
1644
1645 static void sock_def_write_space(struct sock *sk)
1646 {
1647         read_lock(&sk->sk_callback_lock);
1648
1649         /* Do not wake up a writer until he can make "significant"
1650          * progress.  --DaveM
1651          */
1652         if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1653                 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1654                         wake_up_interruptible_sync(sk->sk_sleep);
1655
1656                 /* Should agree with poll, otherwise some programs break */
1657                 if (sock_writeable(sk))
1658                         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
1659         }
1660
1661         read_unlock(&sk->sk_callback_lock);
1662 }
1663
1664 static void sock_def_destruct(struct sock *sk)
1665 {
1666         kfree(sk->sk_protinfo);
1667 }
1668
1669 void sk_send_sigurg(struct sock *sk)
1670 {
1671         if (sk->sk_socket && sk->sk_socket->file)
1672                 if (send_sigurg(&sk->sk_socket->file->f_owner))
1673                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
1674 }
1675
/*
 * (Re)arm @timer to fire at @expires.  A reference on @sk is taken only
 * when mod_timer() returns 0.
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (mod_timer(timer, expires))
		return;

	sock_hold(sk);
}

EXPORT_SYMBOL(sk_reset_timer);
1684
/*
 * Cancel @timer for @sk.  The socket reference is dropped (via
 * __sock_put()) only if the timer was pending and del_timer() reports
 * that it removed it.
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (!timer_pending(timer))
		return;

	if (del_timer(timer))
		__sock_put(sk);
}

EXPORT_SYMBOL(sk_stop_timer);
1692
1693 void sock_init_data(struct socket *sock, struct sock *sk)
1694 {
1695         skb_queue_head_init(&sk->sk_receive_queue);
1696         skb_queue_head_init(&sk->sk_write_queue);
1697         skb_queue_head_init(&sk->sk_error_queue);
1698 #ifdef CONFIG_NET_DMA
1699         skb_queue_head_init(&sk->sk_async_wait_queue);
1700 #endif
1701
1702         sk->sk_send_head        =       NULL;
1703
1704         init_timer(&sk->sk_timer);
1705
1706         sk->sk_allocation       =       GFP_KERNEL;
1707         sk->sk_rcvbuf           =       sysctl_rmem_default;
1708         sk->sk_sndbuf           =       sysctl_wmem_default;
1709         sk->sk_state            =       TCP_CLOSE;
1710         sk_set_socket(sk, sock);
1711
1712         sock_set_flag(sk, SOCK_ZAPPED);
1713
1714         if (sock) {
1715                 sk->sk_type     =       sock->type;
1716                 sk->sk_sleep    =       &sock->wait;
1717                 sock->sk        =       sk;
1718         } else
1719                 sk->sk_sleep    =       NULL;
1720
1721         rwlock_init(&sk->sk_dst_lock);
1722         rwlock_init(&sk->sk_callback_lock);
1723         lockdep_set_class_and_name(&sk->sk_callback_lock,
1724                         af_callback_keys + sk->sk_family,
1725                         af_family_clock_key_strings[sk->sk_family]);
1726
1727         sk->sk_state_change     =       sock_def_wakeup;
1728         sk->sk_data_ready       =       sock_def_readable;
1729         sk->sk_write_space      =       sock_def_write_space;
1730         sk->sk_error_report     =       sock_def_error_report;
1731         sk->sk_destruct         =       sock_def_destruct;
1732
1733         sk->sk_sndmsg_page      =       NULL;
1734         sk->sk_sndmsg_off       =       0;
1735
1736         sk->sk_peercred.pid     =       0;
1737         sk->sk_peercred.uid     =       -1;
1738         sk->sk_peercred.gid     =       -1;
1739         sk->sk_write_pending    =       0;
1740         sk->sk_rcvlowat         =       1;
1741         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
1742         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
1743
1744         sk->sk_stamp = ktime_set(-1L, 0);
1745
1746         atomic_set(&sk->sk_refcnt, 1);
1747         atomic_set(&sk->sk_drops, 0);
1748 }
1749
1750 void lock_sock_nested(struct sock *sk, int subclass)
1751 {
1752         might_sleep();
1753         spin_lock_bh(&sk->sk_lock.slock);
1754         if (sk->sk_lock.owned)
1755                 __lock_sock(sk);
1756         sk->sk_lock.owned = 1;
1757         spin_unlock(&sk->sk_lock.slock);
1758         /*
1759          * The sk_lock has mutex_lock() semantics here:
1760          */
1761         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
1762         local_bh_enable();
1763 }
1764
1765 EXPORT_SYMBOL(lock_sock_nested);
1766
1767 void release_sock(struct sock *sk)
1768 {
1769         /*
1770          * The sk_lock has mutex_unlock() semantics:
1771          */
1772         mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
1773
1774         spin_lock_bh(&sk->sk_lock.slock);
1775         if (sk->sk_backlog.tail)
1776                 __release_sock(sk);
1777         sk->sk_lock.owned = 0;
1778         if (waitqueue_active(&sk->sk_lock.wq))
1779                 wake_up(&sk->sk_lock.wq);
1780         spin_unlock_bh(&sk->sk_lock.slock);
1781 }
1782 EXPORT_SYMBOL(release_sock);
1783
1784 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
1785 {
1786         struct timeval tv;
1787         if (!sock_flag(sk, SOCK_TIMESTAMP))
1788                 sock_enable_timestamp(sk);
1789         tv = ktime_to_timeval(sk->sk_stamp);
1790         if (tv.tv_sec == -1)
1791                 return -ENOENT;
1792         if (tv.tv_sec == 0) {
1793                 sk->sk_stamp = ktime_get_real();
1794                 tv = ktime_to_timeval(sk->sk_stamp);
1795         }
1796         return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
1797 }
1798 EXPORT_SYMBOL(sock_get_timestamp);
1799
1800 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
1801 {
1802         struct timespec ts;
1803         if (!sock_flag(sk, SOCK_TIMESTAMP))
1804                 sock_enable_timestamp(sk);
1805         ts = ktime_to_timespec(sk->sk_stamp);
1806         if (ts.tv_sec == -1)
1807                 return -ENOENT;
1808         if (ts.tv_sec == 0) {
1809                 sk->sk_stamp = ktime_get_real();
1810                 ts = ktime_to_timespec(sk->sk_stamp);
1811         }
1812         return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
1813 }
1814 EXPORT_SYMBOL(sock_get_timestampns);
1815
1816 void sock_enable_timestamp(struct sock *sk)
1817 {
1818         if (!sock_flag(sk, SOCK_TIMESTAMP)) {
1819                 sock_set_flag(sk, SOCK_TIMESTAMP);
1820                 net_enable_timestamp();
1821         }
1822 }
1823
1824 /*
1825  *      Get a socket option on an socket.
1826  *
1827  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
1828  *      asynchronous errors should be reported by getsockopt. We assume
1829  *      this means if you specify SO_ERROR (otherwise whats the point of it).
1830  */
1831 int sock_common_getsockopt(struct socket *sock, int level, int optname,
1832                            char __user *optval, int __user *optlen)
1833 {
1834         struct sock *sk = sock->sk;
1835
1836         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1837 }
1838
1839 EXPORT_SYMBOL(sock_common_getsockopt);
1840
#ifdef CONFIG_COMPAT
/*
 * Compat variant of sock_common_getsockopt(): uses the protocol's
 * compat_getsockopt handler when it provides one, and the regular
 * getsockopt handler otherwise.
 */
int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_getsockopt == NULL)
		return sk->sk_prot->getsockopt(sk, level, optname,
					       optval, optlen);

	return sk->sk_prot->compat_getsockopt(sk, level, optname,
					      optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_getsockopt);
#endif
1854
1855 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
1856                         struct msghdr *msg, size_t size, int flags)
1857 {
1858         struct sock *sk = sock->sk;
1859         int addr_len = 0;
1860         int err;
1861
1862         err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
1863                                    flags & ~MSG_DONTWAIT, &addr_len);
1864         if (err >= 0)
1865                 msg->msg_namelen = addr_len;
1866         return err;
1867 }
1868
1869 EXPORT_SYMBOL(sock_common_recvmsg);
1870
1871 /*
1872  *      Set socket options on an inet socket.
1873  */
1874 int sock_common_setsockopt(struct socket *sock, int level, int optname,
1875                            char __user *optval, int optlen)
1876 {
1877         struct sock *sk = sock->sk;
1878
1879         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1880 }
1881
1882 EXPORT_SYMBOL(sock_common_setsockopt);
1883
#ifdef CONFIG_COMPAT
/*
 * Compat variant of sock_common_setsockopt(): uses the protocol's
 * compat_setsockopt handler when it provides one, and the regular
 * setsockopt handler otherwise.
 */
int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, int optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_setsockopt == NULL)
		return sk->sk_prot->setsockopt(sk, level, optname,
					       optval, optlen);

	return sk->sk_prot->compat_setsockopt(sk, level, optname,
					      optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_setsockopt);
#endif
1897
1898 void sk_common_release(struct sock *sk)
1899 {
1900         if (sk->sk_prot->destroy)
1901                 sk->sk_prot->destroy(sk);
1902
1903         /*
1904          * Observation: when sock_common_release is called, processes have
1905          * no access to socket. But net still has.
1906          * Step one, detach it from networking:
1907          *
1908          * A. Remove from hash tables.
1909          */
1910
1911         sk->sk_prot->unhash(sk);
1912
1913         /*
1914          * In this point socket cannot receive new packets, but it is possible
1915          * that some packets are in flight because some CPU runs receiver and
1916          * did hash table lookup before we unhashed socket. They will achieve
1917          * receive queue and will be purged by socket destructor.
1918          *
1919          * Also we still have packets pending on receive queue and probably,
1920          * our own packets waiting in device queues. sock_destroy will drain
1921          * receive queue, but transmitted packets will delay socket destruction
1922          * until the last reference will be released.
1923          */
1924
1925         sock_orphan(sk);
1926
1927         xfrm_sk_free_policy(sk);
1928
1929         sk_refcnt_debug_release(sk);
1930         sock_put(sk);
1931 }
1932
1933 EXPORT_SYMBOL(sk_common_release);
1934
/*
 * List of registered protocols.  proto_list_lock is write-held while the
 * list is modified and read-held while /proc/net/protocols walks it.
 */
static DEFINE_RWLOCK(proto_list_lock);
static LIST_HEAD(proto_list);
1937
1938 #ifdef CONFIG_PROC_FS
1939 #define PROTO_INUSE_NR  64      /* should be enough for the first time */
1940 struct prot_inuse {
1941         int val[PROTO_INUSE_NR];
1942 };
1943
1944 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
1945
1946 #ifdef CONFIG_NET_NS
1947 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
1948 {
1949         int cpu = smp_processor_id();
1950         per_cpu_ptr(net->core.inuse, cpu)->val[prot->inuse_idx] += val;
1951 }
1952 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
1953
1954 int sock_prot_inuse_get(struct net *net, struct proto *prot)
1955 {
1956         int cpu, idx = prot->inuse_idx;
1957         int res = 0;
1958
1959         for_each_possible_cpu(cpu)
1960                 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
1961
1962         return res >= 0 ? res : 0;
1963 }
1964 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
1965
1966 static int sock_inuse_init_net(struct net *net)
1967 {
1968         net->core.inuse = alloc_percpu(struct prot_inuse);
1969         return net->core.inuse ? 0 : -ENOMEM;
1970 }
1971
1972 static void sock_inuse_exit_net(struct net *net)
1973 {
1974         free_percpu(net->core.inuse);
1975 }
1976
1977 static struct pernet_operations net_inuse_ops = {
1978         .init = sock_inuse_init_net,
1979         .exit = sock_inuse_exit_net,
1980 };
1981
1982 static __init int net_inuse_init(void)
1983 {
1984         if (register_pernet_subsys(&net_inuse_ops))
1985                 panic("Cannot initialize net inuse counters");
1986
1987         return 0;
1988 }
1989
1990 core_initcall(net_inuse_init);
1991 #else
1992 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
1993
1994 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
1995 {
1996         __get_cpu_var(prot_inuse).val[prot->inuse_idx] += val;
1997 }
1998 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
1999
2000 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2001 {
2002         int cpu, idx = prot->inuse_idx;
2003         int res = 0;
2004
2005         for_each_possible_cpu(cpu)
2006                 res += per_cpu(prot_inuse, cpu).val[idx];
2007
2008         return res >= 0 ? res : 0;
2009 }
2010 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2011 #endif
2012
2013 static void assign_proto_idx(struct proto *prot)
2014 {
2015         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2016
2017         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2018                 printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
2019                 return;
2020         }
2021
2022         set_bit(prot->inuse_idx, proto_inuse_idx);
2023 }
2024
2025 static void release_proto_idx(struct proto *prot)
2026 {
2027         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2028                 clear_bit(prot->inuse_idx, proto_inuse_idx);
2029 }
2030 #else
/* Without CONFIG_PROC_FS there are no in-use counters: slot handling is a no-op. */
static inline void assign_proto_idx(struct proto *prot)
{
}

/* No-op counterpart of the above. */
static inline void release_proto_idx(struct proto *prot)
{
}
2038 #endif
2039
2040 int proto_register(struct proto *prot, int alloc_slab)
2041 {
2042         if (alloc_slab) {
2043                 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2044                                         SLAB_HWCACHE_ALIGN | prot->slab_flags,
2045                                         NULL);
2046
2047                 if (prot->slab == NULL) {
2048                         printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
2049                                prot->name);
2050                         goto out;
2051                 }
2052
2053                 if (prot->rsk_prot != NULL) {
2054                         static const char mask[] = "request_sock_%s";
2055
2056                         prot->rsk_prot->slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
2057                         if (prot->rsk_prot->slab_name == NULL)
2058                                 goto out_free_sock_slab;
2059
2060                         sprintf(prot->rsk_prot->slab_name, mask, prot->name);
2061                         prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2062                                                                  prot->rsk_prot->obj_size, 0,
2063                                                                  SLAB_HWCACHE_ALIGN, NULL);
2064
2065                         if (prot->rsk_prot->slab == NULL) {
2066                                 printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
2067                                        prot->name);
2068                                 goto out_free_request_sock_slab_name;
2069                         }
2070                 }
2071
2072                 if (prot->twsk_prot != NULL) {
2073                         static const char mask[] = "tw_sock_%s";
2074
2075                         prot->twsk_prot->twsk_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
2076
2077                         if (prot->twsk_prot->twsk_slab_name == NULL)
2078                                 goto out_free_request_sock_slab;
2079
2080                         sprintf(prot->twsk_prot->twsk_slab_name, mask, prot->name);
2081                         prot->twsk_prot->twsk_slab =
2082                                 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2083                                                   prot->twsk_prot->twsk_obj_size,
2084                                                   0,
2085                                                   SLAB_HWCACHE_ALIGN |
2086                                                         prot->slab_flags,
2087                                                   NULL);
2088                         if (prot->twsk_prot->twsk_slab == NULL)
2089                                 goto out_free_timewait_sock_slab_name;
2090                 }
2091         }
2092
2093         write_lock(&proto_list_lock);
2094         list_add(&prot->node, &proto_list);
2095         assign_proto_idx(prot);
2096         write_unlock(&proto_list_lock);
2097         return 0;
2098
2099 out_free_timewait_sock_slab_name:
2100         kfree(prot->twsk_prot->twsk_slab_name);
2101 out_free_request_sock_slab:
2102         if (prot->rsk_prot && prot->rsk_prot->slab) {
2103                 kmem_cache_destroy(prot->rsk_prot->slab);
2104                 prot->rsk_prot->slab = NULL;
2105         }
2106 out_free_request_sock_slab_name:
2107         kfree(prot->rsk_prot->slab_name);
2108 out_free_sock_slab:
2109         kmem_cache_destroy(prot->slab);
2110         prot->slab = NULL;
2111 out:
2112         return -ENOBUFS;
2113 }
2114
2115 EXPORT_SYMBOL(proto_register);
2116
2117 void proto_unregister(struct proto *prot)
2118 {
2119         write_lock(&proto_list_lock);
2120         release_proto_idx(prot);
2121         list_del(&prot->node);
2122         write_unlock(&proto_list_lock);
2123
2124         if (prot->slab != NULL) {
2125                 kmem_cache_destroy(prot->slab);
2126                 prot->slab = NULL;
2127         }
2128
2129         if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2130                 kmem_cache_destroy(prot->rsk_prot->slab);
2131                 kfree(prot->rsk_prot->slab_name);
2132                 prot->rsk_prot->slab = NULL;
2133         }
2134
2135         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2136                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2137                 kfree(prot->twsk_prot->twsk_slab_name);
2138                 prot->twsk_prot->twsk_slab = NULL;
2139         }
2140 }
2141
2142 EXPORT_SYMBOL(proto_unregister);
2143
2144 #ifdef CONFIG_PROC_FS
2145 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2146         __acquires(proto_list_lock)
2147 {
2148         read_lock(&proto_list_lock);
2149         return seq_list_start_head(&proto_list, *pos);
2150 }
2151
2152 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2153 {
2154         return seq_list_next(v, &proto_list, pos);
2155 }
2156
2157 static void proto_seq_stop(struct seq_file *seq, void *v)
2158         __releases(proto_list_lock)
2159 {
2160         read_unlock(&proto_list_lock);
2161 }
2162
/* Map a method pointer to 'y' (set) or 'n' (NULL) for the /proc table. */
static char proto_method_implemented(const void *method)
{
	if (method != NULL)
		return 'y';

	return 'n';
}
2167
2168 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2169 {
2170         seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
2171                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2172                    proto->name,
2173                    proto->obj_size,
2174                    sock_prot_inuse_get(seq_file_net(seq), proto),
2175                    proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
2176                    proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
2177                    proto->max_header,
2178                    proto->slab == NULL ? "no" : "yes",
2179                    module_name(proto->owner),
2180                    proto_method_implemented(proto->close),
2181                    proto_method_implemented(proto->connect),
2182                    proto_method_implemented(proto->disconnect),
2183                    proto_method_implemented(proto->accept),
2184                    proto_method_implemented(proto->ioctl),
2185                    proto_method_implemented(proto->init),
2186                    proto_method_implemented(proto->destroy),
2187                    proto_method_implemented(proto->shutdown),
2188                    proto_method_implemented(proto->setsockopt),
2189                    proto_method_implemented(proto->getsockopt),
2190                    proto_method_implemented(proto->sendmsg),
2191                    proto_method_implemented(proto->recvmsg),
2192                    proto_method_implemented(proto->sendpage),
2193                    proto_method_implemented(proto->bind),
2194                    proto_method_implemented(proto->backlog_rcv),
2195                    proto_method_implemented(proto->hash),
2196                    proto_method_implemented(proto->unhash),
2197                    proto_method_implemented(proto->get_port),
2198                    proto_method_implemented(proto->enter_memory_pressure));
2199 }
2200
2201 static int proto_seq_show(struct seq_file *seq, void *v)
2202 {
2203         if (v == &proto_list)
2204                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2205                            "protocol",
2206                            "size",
2207                            "sockets",
2208                            "memory",
2209                            "press",
2210                            "maxhdr",
2211                            "slab",
2212                            "module",
2213                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2214         else
2215                 proto_seq_printf(seq, list_entry(v, struct proto, node));
2216         return 0;
2217 }
2218
2219 static const struct seq_operations proto_seq_ops = {
2220         .start  = proto_seq_start,
2221         .next   = proto_seq_next,
2222         .stop   = proto_seq_stop,
2223         .show   = proto_seq_show,
2224 };
2225
2226 static int proto_seq_open(struct inode *inode, struct file *file)
2227 {
2228         return seq_open_net(inode, file, &proto_seq_ops,
2229                             sizeof(struct seq_net_private));
2230 }
2231
2232 static const struct file_operations proto_seq_fops = {
2233         .owner          = THIS_MODULE,
2234         .open           = proto_seq_open,
2235         .read           = seq_read,
2236         .llseek         = seq_lseek,
2237         .release        = seq_release_net,
2238 };
2239
2240 static __net_init int proto_init_net(struct net *net)
2241 {
2242         if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2243                 return -ENOMEM;
2244
2245         return 0;
2246 }
2247
2248 static __net_exit void proto_exit_net(struct net *net)
2249 {
2250         proc_net_remove(net, "protocols");
2251 }
2252
2253
2254 static __net_initdata struct pernet_operations proto_net_ops = {
2255         .init = proto_init_net,
2256         .exit = proto_exit_net,
2257 };
2258
2259 static int __init proto_init(void)
2260 {
2261         return register_pernet_subsys(&proto_net_ops);
2262 }
2263
2264 subsys_initcall(proto_init);
2265
2266 #endif /* PROC_FS */
2267
/* Symbols from this file exported to modules and not exported next to their definition. */

/* Sock allocation, signalling and initialisation. */
EXPORT_SYMBOL(sk_alloc);
EXPORT_SYMBOL(sk_free);
EXPORT_SYMBOL(sk_send_sigurg);
EXPORT_SYMBOL(sock_alloc_send_skb);
EXPORT_SYMBOL(sock_init_data);
EXPORT_SYMBOL(sock_kfree_s);
EXPORT_SYMBOL(sock_kmalloc);

/* sock_no_* handlers. */
EXPORT_SYMBOL(sock_no_accept);
EXPORT_SYMBOL(sock_no_bind);
EXPORT_SYMBOL(sock_no_connect);
EXPORT_SYMBOL(sock_no_getname);
EXPORT_SYMBOL(sock_no_getsockopt);
EXPORT_SYMBOL(sock_no_ioctl);
EXPORT_SYMBOL(sock_no_listen);
EXPORT_SYMBOL(sock_no_mmap);
EXPORT_SYMBOL(sock_no_poll);
EXPORT_SYMBOL(sock_no_recvmsg);
EXPORT_SYMBOL(sock_no_sendmsg);
EXPORT_SYMBOL(sock_no_sendpage);
EXPORT_SYMBOL(sock_no_setsockopt);
EXPORT_SYMBOL(sock_no_shutdown);
EXPORT_SYMBOL(sock_no_socketpair);

/* Remaining helpers and tunables. */
EXPORT_SYMBOL(sock_rfree);
EXPORT_SYMBOL(sock_setsockopt);
EXPORT_SYMBOL(sock_wfree);
EXPORT_SYMBOL(sock_wmalloc);
EXPORT_SYMBOL(sock_i_uid);
EXPORT_SYMBOL(sock_i_ino);
EXPORT_SYMBOL(sysctl_optmem_max);
2295 EXPORT_SYMBOL(sock_i_ino);
2296 EXPORT_SYMBOL(sysctl_optmem_max);