net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <asm/system.h>
  77 #include <linux/bitops.h>
  78 #include <linux/capability.h>
  79 #include <linux/cpu.h>
  80 #include <linux/types.h>
  81 #include <linux/kernel.h>
  82 #include <linux/hash.h>
  83 #include <linux/slab.h>
  84 #include <linux/sched.h>
  85 #include <linux/mutex.h>
  86 #include <linux/string.h>
  87 #include <linux/mm.h>
  88 #include <linux/socket.h>
  89 #include <linux/sockios.h>
  90 #include <linux/errno.h>
  91 #include <linux/interrupt.h>
  92 #include <linux/if_ether.h>
  93 #include <linux/netdevice.h>
  94 #include <linux/etherdevice.h>
  95 #include <linux/ethtool.h>
  96 #include <linux/notifier.h>
  97 #include <linux/skbuff.h>
  98 #include <net/net_namespace.h>
  99 #include <net/sock.h>
 100 #include <linux/rtnetlink.h>
 101 #include <linux/proc_fs.h>
 102 #include <linux/seq_file.h>
 103 #include <linux/stat.h>
 104 #include <net/dst.h>
 105 #include <net/pkt_sched.h>
 106 #include <net/checksum.h>
 107 #include <net/xfrm.h>
 108 #include <linux/highmem.h>
 109 #include <linux/init.h>
 110 #include <linux/kmod.h>
 111 #include <linux/module.h>
 112 #include <linux/netpoll.h>
 113 #include <linux/rcupdate.h>
 114 #include <linux/delay.h>
 115 #include <net/wext.h>
 116 #include <net/iw_handler.h>
 117 #include <asm/current.h>
 118 #include <linux/audit.h>
 119 #include <linux/dmaengine.h>
 120 #include <linux/err.h>
 121 #include <linux/ctype.h>
 122 #include <linux/if_arp.h>
 123 #include <linux/if_vlan.h>
 124 #include <linux/ip.h>
 125 #include <net/ip.h>
 126 #include <linux/ipv6.h>
 127 #include <linux/in.h>
 128 #include <linux/jhash.h>
 129 #include <linux/random.h>
 130 #include <trace/events/napi.h>
 131 #include <trace/events/net.h>
 132 #include <trace/events/skb.h>
 133 #include <linux/pci.h>
 134 #include <linux/inetdevice.h>
 135
 136 #include "net-sysfs.h"
 137
/*
 * Instead of increasing this, you should create a hash table.
 * NOTE(review): going by the name, this caps how many skbs GRO keeps
 * pending at once; the users are outside this view -- confirm.
 */
#define MAX_GRO_SKBS 8

/*
 * MAX_HEADER plus 128 bytes.
 * This should be increased if a protocol with a bigger head is added.
 */
#define GRO_MAX_HEAD (MAX_HEADER + 128)
 143
 144 /*
 145  *      The list of packet types we will receive (as opposed to discard)
 146  *      and the routines to invoke.
 147  *
 148  *      Why 16. Because with 16 the only overlap we get on a hash of the
 149  *      low nibble of the protocol value is RARP/SNAP/X.25.
 150  *
 151  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 152  *             sure which should go first, but I bet it won't make much
 153  *             difference if we are running VLANs.  The good news is that
 154  *             this protocol won't be in the list unless compiled in, so
 155  *             the average user (w/out VLANs) will not be adversely affected.
 156  *             --BLG
 157  *
 158  *              0800    IP
 159  *              8100    802.1Q VLAN
 160  *              0001    802.3
 161  *              0002    AX.25
 162  *              0004    802.2
 163  *              8035    RARP
 164  *              0005    SNAP
 165  *              0805    X.25
 166  *              0806    ARP
 167  *              8137    IPX
 168  *              0009    Localtalk
 169  *              86DD    IPv6
 170  */
 171
/* Bucket count of ptype_base; a power of two so that SIZE - 1 is a mask. */
#define PTYPE_HASH_SIZE (16)
/* ptype_head() ANDs the host-order protocol id with this to pick a bucket. */
#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)

/* Held by dev_add_pack() and __dev_remove_pack() while they edit the lists. */
static DEFINE_SPINLOCK(ptype_lock);
/* Per-protocol handler lists, indexed as computed by ptype_head(). */
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
static struct list_head ptype_all __read_mostly;        /* Taps: handlers whose type is ETH_P_ALL */
 178
 179 /*
 180  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 181  * semaphore.
 182  *
 183  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 184  *
 185  * Writers must hold the rtnl semaphore while they loop through the
 186  * dev_base_head list, and hold dev_base_lock for writing when they do the
 187  * actual updates.  This allows pure readers to access the list even
 188  * while a writer is preparing to update it.
 189  *
 190  * To put it another way, dev_base_lock is held for writing only to
 191  * protect against pure readers; the rtnl semaphore provides the
 192  * protection against other writers.
 193  *
 194  * See, for example usages, register_netdevice() and
 195  * unregister_netdevice(), which must be called with the rtnl
 196  * semaphore held.
 197  */
/*
 * Taken for writing (with BHs disabled) by list_netdevice() and
 * unlist_netdevice() in this file; exported so other code can take it.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);
 200
 201 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 202 {
 203         unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 204         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 205 }
 206
 207 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 208 {
 209         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 210 }
 211
 212 static inline void rps_lock(struct softnet_data *sd)
 213 {
 214 #ifdef CONFIG_RPS
 215         spin_lock(&sd->input_pkt_queue.lock);
 216 #endif
 217 }
 218
 219 static inline void rps_unlock(struct softnet_data *sd)
 220 {
 221 #ifdef CONFIG_RPS
 222         spin_unlock(&sd->input_pkt_queue.lock);
 223 #endif
 224 }
 225
 226 /* Device list insertion */
 227 static int list_netdevice(struct net_device *dev)
 228 {
 229         struct net *net = dev_net(dev);
 230
 231         ASSERT_RTNL();
 232
 233         write_lock_bh(&dev_base_lock);
 234         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 235         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 236         hlist_add_head_rcu(&dev->index_hlist,
 237                            dev_index_hash(net, dev->ifindex));
 238         write_unlock_bh(&dev_base_lock);
 239         return 0;
 240 }
 241
 242 /* Device list removal
 243  * caller must respect a RCU grace period before freeing/reusing dev
 244  */
 245 static void unlist_netdevice(struct net_device *dev)
 246 {
 247         ASSERT_RTNL();
 248
 249         /* Unlink dev from the device chain */
 250         write_lock_bh(&dev_base_lock);
 251         list_del_rcu(&dev->dev_list);
 252         hlist_del_rcu(&dev->name_hlist);
 253         hlist_del_rcu(&dev->index_hlist);
 254         write_unlock_bh(&dev_base_lock);
 255 }
 256
/*
 *      Our notifier list: the head of a raw notifier chain, private to
 *      this file.
 */

static RAW_NOTIFIER_HEAD(netdev_chain);
 262
/*
 *      Device drivers call our routines to queue packets here. We empty the
 *      queue in the local softnet handler.
 *
 *      One aligned instance exists per CPU, and the per-CPU symbol is
 *      exported.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);
 270
 271 #ifdef CONFIG_LOCKDEP
 272 /*
 273  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 274  * according to dev->type
 275  */
/*
 * Device types that get a lock class of their own.  netdev_lock_pos()
 * searches this table; a type that is not listed shares the last slot.
 */
static const unsigned short netdev_lock_type[] =
        {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
         ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
         ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
         ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
         ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
         ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
         ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
         ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
         ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
         ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
         ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
         ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
         ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
         ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
         ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
         ARPHRD_VOID, ARPHRD_NONE};

/*
 * Lock class names, parallel to netdev_lock_type[]: entry i names the
 * class for netdev_lock_type[i].  Keep both tables in the same order and
 * of the same length.  The same names are used for the xmit and the
 * address-list lock classes.
 */
static const char *const netdev_lock_name[] =
        {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
         "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
         "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
         "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
         "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
         "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
         "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
         "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
         "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
         "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
         "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
         "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
         "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
         "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
         "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
         "_xmit_VOID", "_xmit_NONE"};

/* One lockdep key per table entry, separately for xmit and address locks. */
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 314
 315 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 316 {
 317         int i;
 318
 319         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 320                 if (netdev_lock_type[i] == dev_type)
 321                         return i;
 322         /* the last key is used by default */
 323         return ARRAY_SIZE(netdev_lock_type) - 1;
 324 }
 325
 326 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 327                                                  unsigned short dev_type)
 328 {
 329         int i;
 330
 331         i = netdev_lock_pos(dev_type);
 332         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 333                                    netdev_lock_name[i]);
 334 }
 335
 336 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 337 {
 338         int i;
 339
 340         i = netdev_lock_pos(dev->type);
 341         lockdep_set_class_and_name(&dev->addr_list_lock,
 342                                    &netdev_addr_lock_key[i],
 343                                    netdev_lock_name[i]);
 344 }
 345 #else
 346 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 347                                                  unsigned short dev_type)
 348 {
 349 }
 350 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 351 {
 352 }
 353 #endif
 354
 355 /*******************************************************************************
 356
 357                 Protocol management and registration routines
 358
 359 *******************************************************************************/
 360
 361 /*
 362  *      Add a protocol ID to the list. Now that the input handler is
 363  *      smarter we can dispense with all the messy stuff that used to be
 364  *      here.
 365  *
 366  *      BEWARE!!! Protocol handlers, mangling input packets,
 367  *      MUST BE last in hash buckets and checking protocol handlers
 368  *      MUST start from promiscuous ptype_all chain in net_bh.
 369  *      It is true now, do not change it.
 370  *      Explanation follows: if protocol handler, mangling packet, will
 371  *      be the first on list, it is not able to sense, that packet
 372  *      is cloned and should be copied-on-write, so that it will
 373  *      change it and subsequent readers will get broken packet.
 374  *                                                      --ANK (980803)
 375  */
 376
 377 static inline struct list_head *ptype_head(const struct packet_type *pt)
 378 {
 379         if (pt->type == htons(ETH_P_ALL))
 380                 return &ptype_all;
 381         else
 382                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 383 }
 384
 385 /**
 386  *      dev_add_pack - add packet handler
 387  *      @pt: packet type declaration
 388  *
 389  *      Add a protocol handler to the networking stack. The passed &packet_type
 390  *      is linked into kernel lists and may not be freed until it has been
 391  *      removed from the kernel lists.
 392  *
 393  *      This call does not sleep therefore it can not
 394  *      guarantee all CPU's that are in middle of receiving packets
 395  *      will see the new packet type (until the next received packet).
 396  */
 397
 398 void dev_add_pack(struct packet_type *pt)
 399 {
 400         struct list_head *head = ptype_head(pt);
 401
 402         spin_lock(&ptype_lock);
 403         list_add_rcu(&pt->list, head);
 404         spin_unlock(&ptype_lock);
 405 }
 406 EXPORT_SYMBOL(dev_add_pack);
 407
 408 /**
 409  *      __dev_remove_pack        - remove packet handler
 410  *      @pt: packet type declaration
 411  *
 412  *      Remove a protocol handler that was previously added to the kernel
 413  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 414  *      from the kernel lists and can be freed or reused once this function
 415  *      returns.
 416  *
 417  *      The packet type might still be in use by receivers
 418  *      and must not be freed until after all the CPU's have gone
 419  *      through a quiescent state.
 420  */
 421 void __dev_remove_pack(struct packet_type *pt)
 422 {
 423         struct list_head *head = ptype_head(pt);
 424         struct packet_type *pt1;
 425
 426         spin_lock(&ptype_lock);
 427
 428         list_for_each_entry(pt1, head, list) {
 429                 if (pt == pt1) {
 430                         list_del_rcu(&pt->list);
 431                         goto out;
 432                 }
 433         }
 434
 435         printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 436 out:
 437         spin_unlock(&ptype_lock);
 438 }
 439 EXPORT_SYMBOL(__dev_remove_pack);
 440
 441 /**
 442  *      dev_remove_pack  - remove packet handler
 443  *      @pt: packet type declaration
 444  *
 445  *      Remove a protocol handler that was previously added to the kernel
 446  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 447  *      from the kernel lists and can be freed or reused once this function
 448  *      returns.
 449  *
 450  *      This call sleeps to guarantee that no CPU is looking at the packet
 451  *      type after return.
 452  */
 453 void dev_remove_pack(struct packet_type *pt)
 454 {
 455         __dev_remove_pack(pt);
 456
 457         synchronize_net();
 458 }
 459 EXPORT_SYMBOL(dev_remove_pack);
 460
 461 /******************************************************************************
 462
 463                       Device Boot-time Settings Routines
 464
 465 *******************************************************************************/
 466
/*
 * Boot time configuration table.  A slot whose name starts with '\0' or
 * ' ' is treated as free by netdev_boot_setup_add().
 */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 469
 470 /**
 471  *      netdev_boot_setup_add   - add new setup entry
 472  *      @name: name of the device
 473  *      @map: configured settings for the device
 474  *
 475  *      Adds new setup entry to the dev_boot_setup list.  The function
 476  *      returns 0 on error and 1 on success.  This is a generic routine to
 477  *      all netdevices.
 478  */
 479 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 480 {
 481         struct netdev_boot_setup *s;
 482         int i;
 483
 484         s = dev_boot_setup;
 485         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 486                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 487                         memset(s[i].name, 0, sizeof(s[i].name));
 488                         strlcpy(s[i].name, name, IFNAMSIZ);
 489                         memcpy(&s[i].map, map, sizeof(s[i].map));
 490                         break;
 491                 }
 492         }
 493
 494         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 495 }
 496
 497 /**
 498  *      netdev_boot_setup_check - check boot time settings
 499  *      @dev: the netdevice
 500  *
 501  *      Check boot time settings for the device.
 502  *      The found settings are set for the device to be used
 503  *      later in the device probing.
 504  *      Returns 0 if no settings found, 1 if they are.
 505  */
 506 int netdev_boot_setup_check(struct net_device *dev)
 507 {
 508         struct netdev_boot_setup *s = dev_boot_setup;
 509         int i;
 510
 511         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 512                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 513                     !strcmp(dev->name, s[i].name)) {
 514                         dev->irq        = s[i].map.irq;
 515                         dev->base_addr  = s[i].map.base_addr;
 516                         dev->mem_start  = s[i].map.mem_start;
 517                         dev->mem_end    = s[i].map.mem_end;
 518                         return 1;
 519                 }
 520         }
 521         return 0;
 522 }
 523 EXPORT_SYMBOL(netdev_boot_setup_check);
 524
 525
 526 /**
 527  *      netdev_boot_base        - get address from boot time settings
 528  *      @prefix: prefix for network device
 529  *      @unit: id for network device
 530  *
 531  *      Check boot time settings for the base address of device.
 532  *      The found settings are set for the device to be used
 533  *      later in the device probing.
 534  *      Returns 0 if no settings found.
 535  */
 536 unsigned long netdev_boot_base(const char *prefix, int unit)
 537 {
 538         const struct netdev_boot_setup *s = dev_boot_setup;
 539         char name[IFNAMSIZ];
 540         int i;
 541
 542         sprintf(name, "%s%d", prefix, unit);
 543
 544         /*
 545          * If device already registered then return base of 1
 546          * to indicate not to probe for this interface
 547          */
 548         if (__dev_get_by_name(&init_net, name))
 549                 return 1;
 550
 551         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 552                 if (!strcmp(name, s[i].name))
 553                         return s[i].map.base_addr;
 554         return 0;
 555 }
 556
 557 /*
 558  * Saves at boot time configured settings for any netdevice.
 559  */
 560 int __init netdev_boot_setup(char *str)
 561 {
 562         int ints[5];
 563         struct ifmap map;
 564
 565         str = get_options(str, ARRAY_SIZE(ints), ints);
 566         if (!str || !*str)
 567                 return 0;
 568
 569         /* Save settings */
 570         memset(&map, 0, sizeof(map));
 571         if (ints[0] > 0)
 572                 map.irq = ints[1];
 573         if (ints[0] > 1)
 574                 map.base_addr = ints[2];
 575         if (ints[0] > 2)
 576                 map.mem_start = ints[3];
 577         if (ints[0] > 3)
 578                 map.mem_end = ints[4];
 579
 580         /* Add new entry to the list */
 581         return netdev_boot_setup_add(str, &map);
 582 }
 583
 584 __setup("netdev=", netdev_boot_setup);
 585
 586 /*******************************************************************************
 587
 588                             Device Interface Subroutines
 589
 590 *******************************************************************************/
 591
 592 /**
 593  *      __dev_get_by_name       - find a device by its name
 594  *      @net: the applicable net namespace
 595  *      @name: name to find
 596  *
 597  *      Find an interface by name. Must be called under RTNL semaphore
 598  *      or @dev_base_lock. If the name is found a pointer to the device
 599  *      is returned. If the name is not found then %NULL is returned. The
 600  *      reference counters are not incremented so the caller must be
 601  *      careful with locks.
 602  */
 603
 604 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 605 {
 606         struct hlist_node *p;
 607         struct net_device *dev;
 608         struct hlist_head *head = dev_name_hash(net, name);
 609
 610         hlist_for_each_entry(dev, p, head, name_hlist)
 611                 if (!strncmp(dev->name, name, IFNAMSIZ))
 612                         return dev;
 613
 614         return NULL;
 615 }
 616 EXPORT_SYMBOL(__dev_get_by_name);
 617
 618 /**
 619  *      dev_get_by_name_rcu     - find a device by its name
 620  *      @net: the applicable net namespace
 621  *      @name: name to find
 622  *
 623  *      Find an interface by name.
 624  *      If the name is found a pointer to the device is returned.
 625  *      If the name is not found then %NULL is returned.
 626  *      The reference counters are not incremented so the caller must be
 627  *      careful with locks. The caller must hold RCU lock.
 628  */
 629
 630 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 631 {
 632         struct hlist_node *p;
 633         struct net_device *dev;
 634         struct hlist_head *head = dev_name_hash(net, name);
 635
 636         hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 637                 if (!strncmp(dev->name, name, IFNAMSIZ))
 638                         return dev;
 639
 640         return NULL;
 641 }
 642 EXPORT_SYMBOL(dev_get_by_name_rcu);
 643
 644 /**
 645  *      dev_get_by_name         - find a device by its name
 646  *      @net: the applicable net namespace
 647  *      @name: name to find
 648  *
 649  *      Find an interface by name. This can be called from any
 650  *      context and does its own locking. The returned handle has
 651  *      the usage count incremented and the caller must use dev_put() to
 652  *      release it when it is no longer needed. %NULL is returned if no
 653  *      matching device is found.
 654  */
 655
 656 struct net_device *dev_get_by_name(struct net *net, const char *name)
 657 {
 658         struct net_device *dev;
 659
 660         rcu_read_lock();
 661         dev = dev_get_by_name_rcu(net, name);
 662         if (dev)
 663                 dev_hold(dev);
 664         rcu_read_unlock();
 665         return dev;
 666 }
 667 EXPORT_SYMBOL(dev_get_by_name);
 668
 669 /**
 670  *      __dev_get_by_index - find a device by its ifindex
 671  *      @net: the applicable net namespace
 672  *      @ifindex: index of device
 673  *
 674  *      Search for an interface by index. Returns %NULL if the device
 675  *      is not found or a pointer to the device. The device has not
 676  *      had its reference counter increased so the caller must be careful
 677  *      about locking. The caller must hold either the RTNL semaphore
 678  *      or @dev_base_lock.
 679  */
 680
 681 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 682 {
 683         struct hlist_node *p;
 684         struct net_device *dev;
 685         struct hlist_head *head = dev_index_hash(net, ifindex);
 686
 687         hlist_for_each_entry(dev, p, head, index_hlist)
 688                 if (dev->ifindex == ifindex)
 689                         return dev;
 690
 691         return NULL;
 692 }
 693 EXPORT_SYMBOL(__dev_get_by_index);
 694
 695 /**
 696  *      dev_get_by_index_rcu - find a device by its ifindex
 697  *      @net: the applicable net namespace
 698  *      @ifindex: index of device
 699  *
 700  *      Search for an interface by index. Returns %NULL if the device
 701  *      is not found or a pointer to the device. The device has not
 702  *      had its reference counter increased so the caller must be careful
 703  *      about locking. The caller must hold RCU lock.
 704  */
 705
 706 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 707 {
 708         struct hlist_node *p;
 709         struct net_device *dev;
 710         struct hlist_head *head = dev_index_hash(net, ifindex);
 711
 712         hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 713                 if (dev->ifindex == ifindex)
 714                         return dev;
 715
 716         return NULL;
 717 }
 718 EXPORT_SYMBOL(dev_get_by_index_rcu);
 719
 720
 721 /**
 722  *      dev_get_by_index - find a device by its ifindex
 723  *      @net: the applicable net namespace
 724  *      @ifindex: index of device
 725  *
 726  *      Search for an interface by index. Returns NULL if the device
 727  *      is not found or a pointer to the device. The device returned has
 728  *      had a reference added and the pointer is safe until the user calls
 729  *      dev_put to indicate they have finished with it.
 730  */
 731
 732 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 733 {
 734         struct net_device *dev;
 735
 736         rcu_read_lock();
 737         dev = dev_get_by_index_rcu(net, ifindex);
 738         if (dev)
 739                 dev_hold(dev);
 740         rcu_read_unlock();
 741         return dev;
 742 }
 743 EXPORT_SYMBOL(dev_get_by_index);
 744
 745 /**
 746  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 747  *      @net: the applicable net namespace
 748  *      @type: media type of device
 749  *      @ha: hardware address
 750  *
 751  *      Search for an interface by MAC address. Returns NULL if the device
 752  *      is not found or a pointer to the device. The caller must hold RCU
 753  *      The returned device has not had its ref count increased
 754  *      and the caller must therefore be careful about locking
 755  *
 756  */
 757
 758 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 759                                        const char *ha)
 760 {
 761         struct net_device *dev;
 762
 763         for_each_netdev_rcu(net, dev)
 764                 if (dev->type == type &&
 765                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 766                         return dev;
 767
 768         return NULL;
 769 }
 770 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 771
 772 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 773 {
 774         struct net_device *dev;
 775
 776         ASSERT_RTNL();
 777         for_each_netdev(net, dev)
 778                 if (dev->type == type)
 779                         return dev;
 780
 781         return NULL;
 782 }
 783 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 784
 785 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 786 {
 787         struct net_device *dev, *ret = NULL;
 788
 789         rcu_read_lock();
 790         for_each_netdev_rcu(net, dev)
 791                 if (dev->type == type) {
 792                         dev_hold(dev);
 793                         ret = dev;
 794                         break;
 795                 }
 796         rcu_read_unlock();
 797         return ret;
 798 }
 799 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 800
 801 /**
 802  *      dev_get_by_flags_rcu - find any device with given flags
 803  *      @net: the applicable net namespace
 804  *      @if_flags: IFF_* values
 805  *      @mask: bitmask of bits in if_flags to check
 806  *
 807  *      Search for any interface with the given flags. Returns NULL if a device
 808  *      is not found or a pointer to the device. Must be called inside
 809  *      rcu_read_lock(), and result refcount is unchanged.
 810  */
 811
 812 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 813                                     unsigned short mask)
 814 {
 815         struct net_device *dev, *ret;
 816
 817         ret = NULL;
 818         for_each_netdev_rcu(net, dev) {
 819                 if (((dev->flags ^ if_flags) & mask) == 0) {
 820                         ret = dev;
 821                         break;
 822                 }
 823         }
 824         return ret;
 825 }
 826 EXPORT_SYMBOL(dev_get_by_flags_rcu);
 827
 828 /**
 829  *      dev_valid_name - check if name is okay for network device
 830  *      @name: name string
 831  *
 832  *      Network device names need to be valid file names to
 833  *      to allow sysfs to work.  We also disallow any kind of
 834  *      whitespace.
 835  */
 836 int dev_valid_name(const char *name)
 837 {
 838         if (*name == '\0')
 839                 return 0;
 840         if (strlen(name) >= IFNAMSIZ)
 841                 return 0;
 842         if (!strcmp(name, ".") || !strcmp(name, ".."))
 843                 return 0;
 844
 845         while (*name) {
 846                 if (*name == '/' || isspace(*name))
 847                         return 0;
 848                 name++;
 849         }
 850         return 1;
 851 }
 852 EXPORT_SYMBOL(dev_valid_name);
 853
 854 /**
 855  *      __dev_alloc_name - allocate a name for a device
 856  *      @net: network namespace to allocate the device name in
 857  *      @name: name format string
 858  *      @buf:  scratch buffer and result name string
 859  *
 860  *      Passed a format string - eg "lt%d" it will try and find a suitable
 861  *      id. It scans list of devices to build up a free map, then chooses
 862  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 863  *      while allocating the name and adding the device in order to avoid
 864  *      duplicates.
 865  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 866  *      Returns the number of the unit assigned or a negative errno code.
 867  */
 868
 869 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 870 {
 871         int i = 0;
 872         const char *p;
 873         const int max_netdevices = 8*PAGE_SIZE;
 874         unsigned long *inuse;
 875         struct net_device *d;
 876
 877         p = strnchr(name, IFNAMSIZ-1, '%');
 878         if (p) {
 879                 /*
 880                  * Verify the string as this thing may have come from
 881                  * the user.  There must be either one "%d" and no other "%"
 882                  * characters.
 883                  */
 884                 if (p[1] != 'd' || strchr(p + 2, '%'))
 885                         return -EINVAL;
 886
 887                 /* Use one page as a bit array of possible slots */
 888                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 889                 if (!inuse)
 890                         return -ENOMEM;
 891
 892                 for_each_netdev(net, d) {
 893                         if (!sscanf(d->name, name, &i))
 894                                 continue;
 895                         if (i < 0 || i >= max_netdevices)
 896                                 continue;
 897
 898                         /*  avoid cases where sscanf is not exact inverse of printf */
 899                         snprintf(buf, IFNAMSIZ, name, i);
 900                         if (!strncmp(buf, d->name, IFNAMSIZ))
 901                                 set_bit(i, inuse);
 902                 }
 903
 904                 i = find_first_zero_bit(inuse, max_netdevices);
 905                 free_page((unsigned long) inuse);
 906         }
 907
 908         if (buf != name)
 909                 snprintf(buf, IFNAMSIZ, name, i);
 910         if (!__dev_get_by_name(net, buf))
 911                 return i;
 912
 913         /* It is possible to run out of possible slots
 914          * when the name is long and there isn't enough space left
 915          * for the digits, or if all bits are used.
 916          */
 917         return -ENFILE;
 918 }
 919
 920 /**
 921  *      dev_alloc_name - allocate a name for a device
 922  *      @dev: device
 923  *      @name: name format string
 924  *
 925  *      Passed a format string - eg "lt%d" it will try and find a suitable
 926  *      id. It scans list of devices to build up a free map, then chooses
 927  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 928  *      while allocating the name and adding the device in order to avoid
 929  *      duplicates.
 930  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 931  *      Returns the number of the unit assigned or a negative errno code.
 932  */
 933
 934 int dev_alloc_name(struct net_device *dev, const char *name)
 935 {
 936         char buf[IFNAMSIZ];
 937         struct net *net;
 938         int ret;
 939
 940         BUG_ON(!dev_net(dev));
 941         net = dev_net(dev);
 942         ret = __dev_alloc_name(net, name, buf);
 943         if (ret >= 0)
 944                 strlcpy(dev->name, buf, IFNAMSIZ);
 945         return ret;
 946 }
 947 EXPORT_SYMBOL(dev_alloc_name);
 948
 949 static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
 950 {
 951         struct net *net;
 952
 953         BUG_ON(!dev_net(dev));
 954         net = dev_net(dev);
 955
 956         if (!dev_valid_name(name))
 957                 return -EINVAL;
 958
 959         if (fmt && strchr(name, '%'))
 960                 return dev_alloc_name(dev, name);
 961         else if (__dev_get_by_name(net, name))
 962                 return -EEXIST;
 963         else if (dev->name != name)
 964                 strlcpy(dev->name, name, IFNAMSIZ);
 965
 966         return 0;
 967 }
 968
 969 /**
 970  *      dev_change_name - change name of a device
 971  *      @dev: device
 972  *      @newname: name (or format string) must be at least IFNAMSIZ
 973  *
 974  *      Change name of a device, can pass format strings "eth%d".
 975  *      for wildcarding.
 976  */
 977 int dev_change_name(struct net_device *dev, const char *newname)
 978 {
 979         char oldname[IFNAMSIZ];
 980         int err = 0;
 981         int ret;
 982         struct net *net;
 983
 984         ASSERT_RTNL();
 985         BUG_ON(!dev_net(dev));
 986
 987         net = dev_net(dev);
 988         if (dev->flags & IFF_UP)
 989                 return -EBUSY;
 990
 991         if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
 992                 return 0;
 993
 994         memcpy(oldname, dev->name, IFNAMSIZ);
 995
 996         err = dev_get_valid_name(dev, newname, 1);
 997         if (err < 0)
 998                 return err;
 999
1000 rollback:
1001         ret = device_rename(&dev->dev, dev->name);
1002         if (ret) {
1003                 memcpy(dev->name, oldname, IFNAMSIZ);
1004                 return ret;
1005         }
1006
1007         write_lock_bh(&dev_base_lock);
1008         hlist_del(&dev->name_hlist);
1009         write_unlock_bh(&dev_base_lock);
1010
1011         synchronize_rcu();
1012
1013         write_lock_bh(&dev_base_lock);
1014         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1015         write_unlock_bh(&dev_base_lock);
1016
1017         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1018         ret = notifier_to_errno(ret);
1019
1020         if (ret) {
1021                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1022                 if (err >= 0) {
1023                         err = ret;
1024                         memcpy(dev->name, oldname, IFNAMSIZ);
1025                         goto rollback;
1026                 } else {
1027                         printk(KERN_ERR
1028                                "%s: name change rollback failed: %d.\n",
1029                                dev->name, ret);
1030                 }
1031         }
1032
1033         return err;
1034 }
1035
1036 /**
1037  *      dev_set_alias - change ifalias of a device
1038  *      @dev: device
1039  *      @alias: name up to IFALIASZ
1040  *      @len: limit of bytes to copy from info
1041  *
1042  *      Set ifalias for a device,
1043  */
1044 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1045 {
1046         ASSERT_RTNL();
1047
1048         if (len >= IFALIASZ)
1049                 return -EINVAL;
1050
1051         if (!len) {
1052                 if (dev->ifalias) {
1053                         kfree(dev->ifalias);
1054                         dev->ifalias = NULL;
1055                 }
1056                 return 0;
1057         }
1058
1059         dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1060         if (!dev->ifalias)
1061                 return -ENOMEM;
1062
1063         strlcpy(dev->ifalias, alias, len+1);
1064         return len;
1065 }
1066
1067
1068 /**
1069  *      netdev_features_change - device changes features
1070  *      @dev: device to cause notification
1071  *
1072  *      Called to indicate a device has changed features.
1073  */
1074 void netdev_features_change(struct net_device *dev)
1075 {
1076         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1077 }
1078 EXPORT_SYMBOL(netdev_features_change);
1079
1080 /**
1081  *      netdev_state_change - device changes state
1082  *      @dev: device to cause notification
1083  *
1084  *      Called to indicate a device has changed state. This function calls
1085  *      the notifier chains for netdev_chain and sends a NEWLINK message
1086  *      to the routing socket.
1087  */
1088 void netdev_state_change(struct net_device *dev)
1089 {
1090         if (dev->flags & IFF_UP) {
1091                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1092                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1093         }
1094 }
1095 EXPORT_SYMBOL(netdev_state_change);
1096
/*
 * Send @event for @dev down the netdevice notifier chain and hand the
 * chain's result back to the caller unmodified.
 */
int netdev_bonding_change(struct net_device *dev, unsigned long event)
{
        int verdict;

        verdict = call_netdevice_notifiers(event, dev);
        return verdict;
}
EXPORT_SYMBOL(netdev_bonding_change);
1102
1103 /**
1104  *      dev_load        - load a network module
1105  *      @net: the applicable net namespace
1106  *      @name: name of interface
1107  *
1108  *      If a network interface is not present and the process has suitable
1109  *      privileges this function loads the module. If module loading is not
1110  *      available in this kernel then it becomes a nop.
1111  */
1112
1113 void dev_load(struct net *net, const char *name)
1114 {
1115         struct net_device *dev;
1116
1117         rcu_read_lock();
1118         dev = dev_get_by_name_rcu(net, name);
1119         rcu_read_unlock();
1120
1121         if (!dev && capable(CAP_NET_ADMIN))
1122                 request_module("%s", name);
1123 }
1124 EXPORT_SYMBOL(dev_load);
1125
1126 static int __dev_open(struct net_device *dev)
1127 {
1128         const struct net_device_ops *ops = dev->netdev_ops;
1129         int ret;
1130
1131         ASSERT_RTNL();
1132
1133         /*
1134          *      Is it even present?
1135          */
1136         if (!netif_device_present(dev))
1137                 return -ENODEV;
1138
1139         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1140         ret = notifier_to_errno(ret);
1141         if (ret)
1142                 return ret;
1143
1144         /*
1145          *      Call device private open method
1146          */
1147         set_bit(__LINK_STATE_START, &dev->state);
1148
1149         if (ops->ndo_validate_addr)
1150                 ret = ops->ndo_validate_addr(dev);
1151
1152         if (!ret && ops->ndo_open)
1153                 ret = ops->ndo_open(dev);
1154
1155         /*
1156          *      If it went open OK then:
1157          */
1158
1159         if (ret)
1160                 clear_bit(__LINK_STATE_START, &dev->state);
1161         else {
1162                 /*
1163                  *      Set the flags.
1164                  */
1165                 dev->flags |= IFF_UP;
1166
1167                 /*
1168                  *      Enable NET_DMA
1169                  */
1170                 net_dmaengine_get();
1171
1172                 /*
1173                  *      Initialize multicasting status
1174                  */
1175                 dev_set_rx_mode(dev);
1176
1177                 /*
1178                  *      Wakeup transmit queue engine
1179                  */
1180                 dev_activate(dev);
1181         }
1182
1183         return ret;
1184 }
1185
1186 /**
1187  *      dev_open        - prepare an interface for use.
1188  *      @dev:   device to open
1189  *
1190  *      Takes a device from down to up state. The device's private open
1191  *      function is invoked and then the multicast lists are loaded. Finally
1192  *      the device is moved into the up state and a %NETDEV_UP message is
1193  *      sent to the netdev notifier chain.
1194  *
1195  *      Calling this function on an active interface is a nop. On a failure
1196  *      a negative errno code is returned.
1197  */
1198 int dev_open(struct net_device *dev)
1199 {
1200         int ret;
1201
1202         /*
1203          *      Is it already up?
1204          */
1205         if (dev->flags & IFF_UP)
1206                 return 0;
1207
1208         /*
1209          *      Open device
1210          */
1211         ret = __dev_open(dev);
1212         if (ret < 0)
1213                 return ret;
1214
1215         /*
1216          *      ... and announce new interface.
1217          */
1218         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1219         call_netdevice_notifiers(NETDEV_UP, dev);
1220
1221         return ret;
1222 }
1223 EXPORT_SYMBOL(dev_open);
1224
1225 static int __dev_close_many(struct list_head *head)
1226 {
1227         struct net_device *dev;
1228
1229         ASSERT_RTNL();
1230         might_sleep();
1231
1232         list_for_each_entry(dev, head, unreg_list) {
1233                 /*
1234                  *      Tell people we are going down, so that they can
1235                  *      prepare to death, when device is still operating.
1236                  */
1237                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1238
1239                 clear_bit(__LINK_STATE_START, &dev->state);
1240
1241                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1242                  * can be even on different cpu. So just clear netif_running().
1243                  *
1244                  * dev->stop() will invoke napi_disable() on all of it's
1245                  * napi_struct instances on this device.
1246                  */
1247                 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1248         }
1249
1250         dev_deactivate_many(head);
1251
1252         list_for_each_entry(dev, head, unreg_list) {
1253                 const struct net_device_ops *ops = dev->netdev_ops;
1254
1255                 /*
1256                  *      Call the device specific close. This cannot fail.
1257                  *      Only if device is UP
1258                  *
1259                  *      We allow it to be called even after a DETACH hot-plug
1260                  *      event.
1261                  */
1262                 if (ops->ndo_stop)
1263                         ops->ndo_stop(dev);
1264
1265                 /*
1266                  *      Device is now down.
1267                  */
1268
1269                 dev->flags &= ~IFF_UP;
1270
1271                 /*
1272                  *      Shutdown NET_DMA
1273                  */
1274                 net_dmaengine_put();
1275         }
1276
1277         return 0;
1278 }
1279
1280 static int __dev_close(struct net_device *dev)
1281 {
1282         LIST_HEAD(single);
1283
1284         list_add(&dev->unreg_list, &single);
1285         return __dev_close_many(&single);
1286 }
1287
1288 int dev_close_many(struct list_head *head)
1289 {
1290         struct net_device *dev, *tmp;
1291         LIST_HEAD(tmp_list);
1292
1293         list_for_each_entry_safe(dev, tmp, head, unreg_list)
1294                 if (!(dev->flags & IFF_UP))
1295                         list_move(&dev->unreg_list, &tmp_list);
1296
1297         __dev_close_many(head);
1298
1299         /*
1300          * Tell people we are down
1301          */
1302         list_for_each_entry(dev, head, unreg_list) {
1303                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1304                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1305         }
1306
1307         /* rollback_registered_many needs the complete original list */
1308         list_splice(&tmp_list, head);
1309         return 0;
1310 }
1311
1312 /**
1313  *      dev_close - shutdown an interface.
1314  *      @dev: device to shutdown
1315  *
1316  *      This function moves an active device into down state. A
1317  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1318  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1319  *      chain.
1320  */
1321 int dev_close(struct net_device *dev)
1322 {
1323         LIST_HEAD(single);
1324
1325         list_add(&dev->unreg_list, &single);
1326         dev_close_many(&single);
1327
1328         return 0;
1329 }
1330 EXPORT_SYMBOL(dev_close);
1331
1332
1333 /**
1334  *      dev_disable_lro - disable Large Receive Offload on a device
1335  *      @dev: device
1336  *
1337  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1338  *      called under RTNL.  This is needed if received packets may be
1339  *      forwarded to another interface.
1340  */
1341 void dev_disable_lro(struct net_device *dev)
1342 {
1343         if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1344             dev->ethtool_ops->set_flags) {
1345                 u32 flags = dev->ethtool_ops->get_flags(dev);
1346                 if (flags & ETH_FLAG_LRO) {
1347                         flags &= ~ETH_FLAG_LRO;
1348                         dev->ethtool_ops->set_flags(dev, flags);
1349                 }
1350         }
1351         WARN_ON(dev->features & NETIF_F_LRO);
1352 }
1353 EXPORT_SYMBOL(dev_disable_lro);
1354
1355
/*
 * While non-zero, register_netdevice_notifier() skips the replay of
 * NETDEV_REGISTER/NETDEV_UP events to a newly registered notifier.
 * NOTE(review): presumably cleared once boot-time initialisation has
 * finished; the place that clears it is not in this part of the file.
 */
static int dev_boot_phase = 1;
1357
1358 /*
1359  *      Device change register/unregister. These are not inline or static
1360  *      as we export them to the world.
1361  */
1362
1363 /**
1364  *      register_netdevice_notifier - register a network notifier block
1365  *      @nb: notifier
1366  *
1367  *      Register a notifier to be called when network device events occur.
1368  *      The notifier passed is linked into the kernel structures and must
1369  *      not be reused until it has been unregistered. A negative errno code
1370  *      is returned on a failure.
1371  *
1372  *      When registered all registration and up events are replayed
1373  *      to the new notifier to allow device to have a race free
1374  *      view of the network device list.
1375  */
1376
1377 int register_netdevice_notifier(struct notifier_block *nb)
1378 {
1379         struct net_device *dev;
1380         struct net_device *last;
1381         struct net *net;
1382         int err;
1383
1384         rtnl_lock();
1385         err = raw_notifier_chain_register(&netdev_chain, nb);
1386         if (err)
1387                 goto unlock;
1388         if (dev_boot_phase)
1389                 goto unlock;
1390         for_each_net(net) {
1391                 for_each_netdev(net, dev) {
1392                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1393                         err = notifier_to_errno(err);
1394                         if (err)
1395                                 goto rollback;
1396
1397                         if (!(dev->flags & IFF_UP))
1398                                 continue;
1399
1400                         nb->notifier_call(nb, NETDEV_UP, dev);
1401                 }
1402         }
1403
1404 unlock:
1405         rtnl_unlock();
1406         return err;
1407
1408 rollback:
1409         last = dev;
1410         for_each_net(net) {
1411                 for_each_netdev(net, dev) {
1412                         if (dev == last)
1413                                 break;
1414
1415                         if (dev->flags & IFF_UP) {
1416                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1417                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1418                         }
1419                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1420                         nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1421                 }
1422         }
1423
1424         raw_notifier_chain_unregister(&netdev_chain, nb);
1425         goto unlock;
1426 }
1427 EXPORT_SYMBOL(register_netdevice_notifier);
1428
1429 /**
1430  *      unregister_netdevice_notifier - unregister a network notifier block
1431  *      @nb: notifier
1432  *
1433  *      Unregister a notifier previously registered by
1434  *      register_netdevice_notifier(). The notifier is unlinked into the
1435  *      kernel structures and may then be reused. A negative errno code
1436  *      is returned on a failure.
1437  */
1438
1439 int unregister_netdevice_notifier(struct notifier_block *nb)
1440 {
1441         int err;
1442
1443         rtnl_lock();
1444         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1445         rtnl_unlock();
1446         return err;
1447 }
1448 EXPORT_SYMBOL(unregister_netdevice_notifier);
1449
1450 /**
1451  *      call_netdevice_notifiers - call all network notifier blocks
1452  *      @val: value passed unmodified to notifier function
1453  *      @dev: net_device pointer passed unmodified to notifier function
1454  *
1455  *      Call all network notifier blocks.  Parameters and return value
1456  *      are as for raw_notifier_call_chain().
1457  */
1458
1459 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1460 {
1461         ASSERT_RTNL();
1462         return raw_notifier_call_chain(&netdev_chain, val, dev);
1463 }
1464
/*
 * When > 0 there are consumers of rx skb time stamps.
 * Incremented by net_enable_timestamp() and decremented by
 * net_disable_timestamp(); while it reads zero, net_timestamp_set()
 * clears skb->tstamp and net_timestamp_check() does not stamp.
 */
static atomic_t netstamp_needed = ATOMIC_INIT(0);
1467
1468 void net_enable_timestamp(void)
1469 {
1470         atomic_inc(&netstamp_needed);
1471 }
1472 EXPORT_SYMBOL(net_enable_timestamp);
1473
1474 void net_disable_timestamp(void)
1475 {
1476         atomic_dec(&netstamp_needed);
1477 }
1478 EXPORT_SYMBOL(net_disable_timestamp);
1479
1480 static inline void net_timestamp_set(struct sk_buff *skb)
1481 {
1482         if (atomic_read(&netstamp_needed))
1483                 __net_timestamp(skb);
1484         else
1485                 skb->tstamp.tv64 = 0;
1486 }
1487
1488 static inline void net_timestamp_check(struct sk_buff *skb)
1489 {
1490         if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1491                 __net_timestamp(skb);
1492 }
1493
1494 /**
1495  * dev_forward_skb - loopback an skb to another netif
1496  *
1497  * @dev: destination network device
1498  * @skb: buffer to forward
1499  *
1500  * return values:
1501  *      NET_RX_SUCCESS  (no congestion)
1502  *      NET_RX_DROP     (packet was dropped, but freed)
1503  *
1504  * dev_forward_skb can be used for injecting an skb from the
1505  * start_xmit function of one device into the receive queue
1506  * of another device.
1507  *
1508  * The receiving device may be in another namespace, so
1509  * we have to clear all information in the skb that could
1510  * impact namespace isolation.
1511  */
1512 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1513 {
1514         skb_orphan(skb);
1515         nf_reset(skb);
1516
1517         if (unlikely(!(dev->flags & IFF_UP) ||
1518                      (skb->len > (dev->mtu + dev->hard_header_len + VLAN_HLEN)))) {
1519                 atomic_long_inc(&dev->rx_dropped);
1520                 kfree_skb(skb);
1521                 return NET_RX_DROP;
1522         }
1523         skb_set_dev(skb, dev);
1524         skb->tstamp.tv64 = 0;
1525         skb->pkt_type = PACKET_HOST;
1526         skb->protocol = eth_type_trans(skb, dev);
1527         return netif_rx(skb);
1528 }
1529 EXPORT_SYMBOL_GPL(dev_forward_skb);
1530
1531 static inline int deliver_skb(struct sk_buff *skb,
1532                               struct packet_type *pt_prev,
1533                               struct net_device *orig_dev)
1534 {
1535         atomic_inc(&skb->users);
1536         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1537 }
1538
1539 /*
1540  *      Support routine. Sends outgoing frames to any network
1541  *      taps currently in use.
1542  */
1543
1544 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1545 {
1546         struct packet_type *ptype;
1547         struct sk_buff *skb2 = NULL;
1548         struct packet_type *pt_prev = NULL;
1549
1550         rcu_read_lock();
1551         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1552                 /* Never send packets back to the socket
1553                  * they originated from - MvS (miquels@drinkel.ow.org)
1554                  */
1555                 if ((ptype->dev == dev || !ptype->dev) &&
1556                     (ptype->af_packet_priv == NULL ||
1557                      (struct sock *)ptype->af_packet_priv != skb->sk)) {
1558                         if (pt_prev) {
1559                                 deliver_skb(skb2, pt_prev, skb->dev);
1560                                 pt_prev = ptype;
1561                                 continue;
1562                         }
1563
1564                         skb2 = skb_clone(skb, GFP_ATOMIC);
1565                         if (!skb2)
1566                                 break;
1567
1568                         net_timestamp_set(skb2);
1569
1570                         /* skb->nh should be correctly
1571                            set by sender, so that the second statement is
1572                            just protection against buggy protocols.
1573                          */
1574                         skb_reset_mac_header(skb2);
1575
1576                         if (skb_network_header(skb2) < skb2->data ||
1577                             skb2->network_header > skb2->tail) {
1578                                 if (net_ratelimit())
1579                                         printk(KERN_CRIT "protocol %04x is "
1580                                                "buggy, dev %s\n",
1581                                                ntohs(skb2->protocol),
1582                                                dev->name);
1583                                 skb_reset_network_header(skb2);
1584                         }
1585
1586                         skb2->transport_header = skb2->network_header;
1587                         skb2->pkt_type = PACKET_OUTGOING;
1588                         pt_prev = ptype;
1589                 }
1590         }
1591         if (pt_prev)
1592                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1593         rcu_read_unlock();
1594 }
1595
1596 /*
1597  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1598  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1599  */
1600 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1601 {
1602         int rc;
1603
1604         if (txq < 1 || txq > dev->num_tx_queues)
1605                 return -EINVAL;
1606
1607         if (dev->reg_state == NETREG_REGISTERED) {
1608                 ASSERT_RTNL();
1609
1610                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1611                                                   txq);
1612                 if (rc)
1613                         return rc;
1614
1615                 if (txq < dev->real_num_tx_queues)
1616                         qdisc_reset_all_tx_gt(dev, txq);
1617         }
1618
1619         dev->real_num_tx_queues = txq;
1620         return 0;
1621 }
1622 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1623
1624 #ifdef CONFIG_RPS
1625 /**
1626  *      netif_set_real_num_rx_queues - set actual number of RX queues used
1627  *      @dev: Network device
1628  *      @rxq: Actual number of RX queues
1629  *
1630  *      This must be called either with the rtnl_lock held or before
1631  *      registration of the net device.  Returns 0 on success, or a
1632  *      negative error code.  If called before registration, it always
1633  *      succeeds.
1634  */
1635 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1636 {
1637         int rc;
1638
1639         if (rxq < 1 || rxq > dev->num_rx_queues)
1640                 return -EINVAL;
1641
1642         if (dev->reg_state == NETREG_REGISTERED) {
1643                 ASSERT_RTNL();
1644
1645                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1646                                                   rxq);
1647                 if (rc)
1648                         return rc;
1649         }
1650
1651         dev->real_num_rx_queues = rxq;
1652         return 0;
1653 }
1654 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1655 #endif
1656
1657 static inline void __netif_reschedule(struct Qdisc *q)
1658 {
1659         struct softnet_data *sd;
1660         unsigned long flags;
1661
1662         local_irq_save(flags);
1663         sd = &__get_cpu_var(softnet_data);
1664         q->next_sched = NULL;
1665         *sd->output_queue_tailp = q;
1666         sd->output_queue_tailp = &q->next_sched;
1667         raise_softirq_irqoff(NET_TX_SOFTIRQ);
1668         local_irq_restore(flags);
1669 }
1670
1671 void __netif_schedule(struct Qdisc *q)
1672 {
1673         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1674                 __netif_reschedule(q);
1675 }
1676 EXPORT_SYMBOL(__netif_schedule);
1677
1678 void dev_kfree_skb_irq(struct sk_buff *skb)
1679 {
1680         if (atomic_dec_and_test(&skb->users)) {
1681                 struct softnet_data *sd;
1682                 unsigned long flags;
1683
1684                 local_irq_save(flags);
1685                 sd = &__get_cpu_var(softnet_data);
1686                 skb->next = sd->completion_queue;
1687                 sd->completion_queue = skb;
1688                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1689                 local_irq_restore(flags);
1690         }
1691 }
1692 EXPORT_SYMBOL(dev_kfree_skb_irq);
1693
/*
 * Free @skb from any context: dev_kfree_skb_irq() is used in hard
 * interrupt context or with interrupts disabled, dev_kfree_skb()
 * otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
        if (!in_irq() && !irqs_disabled()) {
                dev_kfree_skb(skb);
                return;
        }

        dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1702
1703
1704 /**
1705  * netif_device_detach - mark device as removed
1706  * @dev: network device
1707  *
1708  * Mark device as removed from system and therefore no longer available.
1709  */
1710 void netif_device_detach(struct net_device *dev)
1711 {
1712         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1713             netif_running(dev)) {
1714                 netif_tx_stop_all_queues(dev);
1715         }
1716 }
1717 EXPORT_SYMBOL(netif_device_detach);
1718
1719 /**
1720  * netif_device_attach - mark device as attached
1721  * @dev: network device
1722  *
1723  * Mark device as attached from system and restart if needed.
1724  */
1725 void netif_device_attach(struct net_device *dev)
1726 {
1727         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1728             netif_running(dev)) {
1729                 netif_tx_wake_all_queues(dev);
1730                 __netdev_watchdog_up(dev);
1731         }
1732 }
1733 EXPORT_SYMBOL(netif_device_attach);
1734
/**
 * skb_set_dev - assign a new device to a buffer
 * @skb: buffer for the new device
 * @dev: network device
 *
 * The dst reference is always dropped.  If the skb already belongs to a
 * device living in a different network namespace than @dev, all state
 * private to the old namespace (secpath, netfilter state, secmark, mark,
 * priority, nf_trace, ipvs_property and, with CONFIG_NET_SCHED, tc_index)
 * is reset before the new device is assigned.
 */
#ifdef CONFIG_NET_NS
void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
{
        struct net_device *prev;

        skb_dst_drop(skb);

        prev = skb->dev;
        if (prev && !net_eq(dev_net(prev), dev_net(dev))) {
                /* Crossing namespaces: scrub per-namespace state. */
                secpath_reset(skb);
                nf_reset(skb);
                skb_init_secmark(skb);
                skb->mark = 0;
                skb->priority = 0;
                skb->nf_trace = 0;
                skb->ipvs_property = 0;
#ifdef CONFIG_NET_SCHED
                skb->tc_index = 0;
#endif
        }

        skb->dev = dev;
}
EXPORT_SYMBOL(skb_set_dev);
#endif /* CONFIG_NET_NS */
1764
1765 /*
1766  * Invalidate hardware checksum when packet is to be mangled, and
1767  * complete checksum manually on outgoing path.
1768  */
1769 int skb_checksum_help(struct sk_buff *skb)
1770 {
1771         __wsum csum;
1772         int ret = 0, offset;
1773
1774         if (skb->ip_summed == CHECKSUM_COMPLETE)
1775                 goto out_set_summed;
1776
1777         if (unlikely(skb_shinfo(skb)->gso_size)) {
1778                 /* Let GSO fix up the checksum. */
1779                 goto out_set_summed;
1780         }
1781
1782         offset = skb_checksum_start_offset(skb);
1783         BUG_ON(offset >= skb_headlen(skb));
1784         csum = skb_checksum(skb, offset, skb->len - offset, 0);
1785
1786         offset += skb->csum_offset;
1787         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1788
1789         if (skb_cloned(skb) &&
1790             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1791                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1792                 if (ret)
1793                         goto out;
1794         }
1795
1796         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1797 out_set_summed:
1798         skb->ip_summed = CHECKSUM_NONE;
1799 out:
1800         return ret;
1801 }
1802 EXPORT_SYMBOL(skb_checksum_help);
1803
/**
 *      skb_gso_segment - Perform segmentation on skb.
 *      @skb: buffer to segment
 *      @features: features for the output path (see dev->features)
 *
 *      This function segments the given skb and returns a list of segments.
 *
 *      It may return NULL if the skb requires no segmentation.  This is
 *      only possible when GSO is used for verifying header integrity.
 *
 *      Otherwise the result is whatever the matching protocol handler's
 *      gso_segment() returns, or an ERR_PTR() value: -EINVAL when a VLAN
 *      header cannot be pulled, the pskb_expand_head() error when a cloned
 *      header cannot be unshared, the handler's gso_send_check() error,
 *      or -EPROTONOSUPPORT when no handler matches the protocol.
 */
struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
{
        struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
        struct packet_type *ptype;
        __be16 type = skb->protocol;
        int vlan_depth = ETH_HLEN;
        int err;

        /* Walk through stacked 802.1Q headers to find the payload protocol. */
        while (type == htons(ETH_P_8021Q)) {
                struct vlan_hdr *vh;

                if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
                        return ERR_PTR(-EINVAL);

                vh = (struct vlan_hdr *)(skb->data + vlan_depth);
                type = vh->h_vlan_encapsulated_proto;
                vlan_depth += VLAN_HLEN;
        }

        /* Record the MAC header and strip it; it is pushed back before return. */
        skb_reset_mac_header(skb);
        skb->mac_len = skb->network_header - skb->mac_header;
        __skb_pull(skb, skb->mac_len);

        if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
                struct net_device *dev = skb->dev;
                struct ethtool_drvinfo info = {};

                /* Fetch the driver name purely for the diagnostic below. */
                if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
                        dev->ethtool_ops->get_drvinfo(dev, &info);

                WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
                     info.driver, dev ? dev->features : 0L,
                     skb->sk ? skb->sk->sk_route_caps : 0L,
                     skb->len, skb->data_len, skb->ip_summed);

                /* Get a private copy of the header if it is shared. */
                if (skb_header_cloned(skb) &&
                    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
                        return ERR_PTR(err);
        }

        rcu_read_lock();
        /* Only the first device-independent handler with gso_segment is used. */
        list_for_each_entry_rcu(ptype,
                        &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
                if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
                        if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
                                err = ptype->gso_send_check(skb);
                                /* err == 0 makes this NULL: nothing to segment. */
                                segs = ERR_PTR(err);
                                if (err || skb_gso_ok(skb, features))
                                        break;
                                /* Rewind data to the network header. */
                                __skb_push(skb, (skb->data -
                                                 skb_network_header(skb)));
                        }
                        segs = ptype->gso_segment(skb, features);
                        break;
                }
        }
        rcu_read_unlock();

        /* Restore skb->data to the MAC header recorded above. */
        __skb_push(skb, skb->data - skb_mac_header(skb));

        return segs;
}
EXPORT_SYMBOL(skb_gso_segment);
1877
/*
 * Report a hardware receive-checksum failure: log a rate-limited error
 * naming the device (or "<unknown>" when @dev is NULL) and dump the stack.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
        const char *ifname;

        if (!net_ratelimit())
                return;

        ifname = dev ? dev->name : "<unknown>";
        printk(KERN_ERR "%s: hw csum failure.\n", ifname);
        dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
1890
/*
 * Returns 1 when @skb carries a page fragment @dev cannot DMA from, else 0.
 * Only compiled in with CONFIG_HIGHMEM; otherwise always 0.
 *
 * Two checks are made on the fragments:
 *  - the device lacks NETIF_F_HIGHDMA and a fragment lives in high memory;
 *  - the bus uses physical addresses and a fragment's page ends beyond the
 *    parent device's DMA mask (or the parent has no DMA mask at all).
 *
 * This check could be dropped once it is known that an IOMMU is present
 * and can map all memory, or that no high memory exists on the machine.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
        struct skb_shared_info *shinfo = skb_shinfo(skb);
        struct device *parent;
        int f;

        if (!(dev->features & NETIF_F_HIGHDMA)) {
                for (f = 0; f < shinfo->nr_frags; f++) {
                        if (PageHighMem(shinfo->frags[f].page))
                                return 1;
                }
        }

        if (!PCI_DMA_BUS_IS_PHYS)
                return 0;

        parent = dev->dev.parent;
        if (!parent)
                return 0;

        for (f = 0; f < shinfo->nr_frags; f++) {
                dma_addr_t phys = page_to_phys(shinfo->frags[f].page);

                if (!parent->dma_mask ||
                    phys + PAGE_SIZE - 1 > *parent->dma_mask)
                        return 1;
        }
#endif
        return 0;
}
1920
/*
 * Overlay on skb->cb used by the software GSO path in this file:
 * dev_gso_segment() saves the skb's original destructor here before
 * installing dev_gso_skb_destructor(), which calls it back after freeing
 * the remaining segments.
 */
struct dev_gso_cb {
        void (*destructor)(struct sk_buff *skb);
};

/* View the control block of @skb as a struct dev_gso_cb. */
#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1926
1927 static void dev_gso_skb_destructor(struct sk_buff *skb)
1928 {
1929         struct dev_gso_cb *cb;
1930
1931         do {
1932                 struct sk_buff *nskb = skb->next;
1933
1934                 skb->next = nskb->next;
1935                 nskb->next = NULL;
1936                 kfree_skb(nskb);
1937         } while (skb->next);
1938
1939         cb = DEV_GSO_CB(skb);
1940         if (cb->destructor)
1941                 cb->destructor(skb);
1942 }
1943
1944 /**
1945  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
1946  *      @skb: buffer to segment
1947  *      @features: device features as applicable to this skb
1948  *
1949  *      This function segments the given skb and stores the list of segments
1950  *      in skb->next.
1951  */
1952 static int dev_gso_segment(struct sk_buff *skb, int features)
1953 {
1954         struct sk_buff *segs;
1955
1956         segs = skb_gso_segment(skb, features);
1957
1958         /* Verifying header integrity only. */
1959         if (!segs)
1960                 return 0;
1961
1962         if (IS_ERR(segs))
1963                 return PTR_ERR(segs);
1964
1965         skb->next = segs;
1966         DEV_GSO_CB(skb)->destructor = skb->destructor;
1967         skb->destructor = dev_gso_skb_destructor;
1968
1969         return 0;
1970 }
1971
1972 /*
1973  * Try to orphan skb early, right before transmission by the device.
1974  * We cannot orphan skb if tx timestamp is requested or the sk-reference
1975  * is needed on driver level for other reasons, e.g. see net/can/raw.c
1976  */
1977 static inline void skb_orphan_try(struct sk_buff *skb)
1978 {
1979         struct sock *sk = skb->sk;
1980
1981         if (sk && !skb_shinfo(skb)->tx_flags) {
1982                 /* skb_tx_hash() wont be able to get sk.
1983                  * We copy sk_hash into skb->rxhash
1984                  */
1985                 if (!skb->rxhash)
1986                         skb->rxhash = sk->sk_hash;
1987                 skb_orphan(skb);
1988         }
1989 }
1990
1991 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1992 {
1993         return ((features & NETIF_F_GEN_CSUM) ||
1994                 ((features & NETIF_F_V4_CSUM) &&
1995                  protocol == htons(ETH_P_IP)) ||
1996                 ((features & NETIF_F_V6_CSUM) &&
1997                  protocol == htons(ETH_P_IPV6)) ||
1998                 ((features & NETIF_F_FCOE_CRC) &&
1999                  protocol == htons(ETH_P_FCOE)));
2000 }
2001
2002 static int harmonize_features(struct sk_buff *skb, __be16 protocol, int features)
2003 {
2004         if (!can_checksum_protocol(protocol, features)) {
2005                 features &= ~NETIF_F_ALL_CSUM;
2006                 features &= ~NETIF_F_SG;
2007         } else if (illegal_highdma(skb->dev, skb)) {
2008                 features &= ~NETIF_F_SG;
2009         }
2010
2011         return features;
2012 }
2013
2014 int netif_skb_features(struct sk_buff *skb)
2015 {
2016         __be16 protocol = skb->protocol;
2017         int features = skb->dev->features;
2018
2019         if (protocol == htons(ETH_P_8021Q)) {
2020                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2021                 protocol = veh->h_vlan_encapsulated_proto;
2022         } else if (!vlan_tx_tag_present(skb)) {
2023                 return harmonize_features(skb, protocol, features);
2024         }
2025
2026         features &= skb->dev->vlan_features;
2027
2028         if (protocol != htons(ETH_P_8021Q)) {
2029                 return harmonize_features(skb, protocol, features);
2030         } else {
2031                 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2032                                 NETIF_F_GEN_CSUM;
2033                 return harmonize_features(skb, protocol, features);
2034         }
2035 }
2036 EXPORT_SYMBOL(netif_skb_features);
2037
2038 /*
2039  * Returns true if either:
2040  *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2041  *      2. skb is fragmented and the device does not support SG, or if
2042  *         at least one of fragments is in highmem and device does not
2043  *         support DMA from it.
2044  */
2045 static inline int skb_needs_linearize(struct sk_buff *skb,
2046                                       int features)
2047 {
2048         return skb_is_nonlinear(skb) &&
2049                         ((skb_has_frag_list(skb) &&
2050                                 !(features & NETIF_F_FRAGLIST)) ||
2051                         (skb_shinfo(skb)->nr_frags &&
2052                                 !(features & NETIF_F_SG)));
2053 }
2054
/*
 * Hand @skb to the driver of @dev via ndo_start_xmit() on queue @txq.
 *
 * A single skb (skb->next == NULL) is prepared first: optional dst drop,
 * delivery to ptype_all taps, early orphaning, software VLAN tag
 * insertion, then either software GSO or linearization plus software
 * checksumming, depending on the features computed for this skb.
 *
 * An skb that carries a segment list on skb->next (from an earlier,
 * interrupted call or from dev_gso_segment() just above) is sent one
 * segment at a time.
 *
 * Returns the driver's return code.  If preparation fails the skb is
 * freed and NETDEV_TX_OK is returned.  While sending segments, a non-OK
 * code within NETDEV_TX_MASK puts the segment back at the head of the
 * list and returns without freeing @skb; NETDEV_TX_BUSY is returned when
 * the queue gets stopped with segments still pending.
 */
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
                        struct netdev_queue *txq)
{
        const struct net_device_ops *ops = dev->netdev_ops;
        int rc = NETDEV_TX_OK;

        if (likely(!skb->next)) {
                int features;

                /*
                 * If device doesnt need skb->dst, release it right now while
                 * its hot in this cpu cache
                 */
                if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
                        skb_dst_drop(skb);

                /* Give registered "all protocols" taps a look at the packet. */
                if (!list_empty(&ptype_all))
                        dev_queue_xmit_nit(skb, dev);

                skb_orphan_try(skb);

                features = netif_skb_features(skb);

                /* Insert the VLAN tag in software if the device cannot. */
                if (vlan_tx_tag_present(skb) &&
                    !(features & NETIF_F_HW_VLAN_TX)) {
                        skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
                        /*
                         * rc is still NETDEV_TX_OK here.
                         * NOTE(review): presumably __vlan_put_tag() has
                         * already released the skb on failure - confirm.
                         */
                        if (unlikely(!skb))
                                goto out;

                        skb->vlan_tci = 0;
                }

                if (netif_needs_gso(skb, features)) {
                        if (unlikely(dev_gso_segment(skb, features)))
                                goto out_kfree_skb;
                        /* Segments were produced: send them one by one. */
                        if (skb->next)
                                goto gso;
                } else {
                        if (skb_needs_linearize(skb, features) &&
                            __skb_linearize(skb))
                                goto out_kfree_skb;

                        /* If packet is not checksummed and device does not
                         * support checksumming for this protocol, complete
                         * checksumming here.
                         */
                        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                                skb_set_transport_header(skb,
                                        skb_checksum_start_offset(skb));
                                if (!(features & NETIF_F_ALL_CSUM) &&
                                     skb_checksum_help(skb))
                                        goto out_kfree_skb;
                        }
                }

                rc = ops->ndo_start_xmit(skb, dev);
                trace_net_dev_xmit(skb, rc);
                if (rc == NETDEV_TX_OK)
                        txq_trans_update(txq);
                return rc;
        }

gso:
        do {
                /* Detach the head segment from the list hanging off skb. */
                struct sk_buff *nskb = skb->next;

                skb->next = nskb->next;
                nskb->next = NULL;

                /*
                 * If device doesnt need nskb->dst, release it right now while
                 * its hot in this cpu cache
                 */
                if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
                        skb_dst_drop(nskb);

                rc = ops->ndo_start_xmit(nskb, dev);
                trace_net_dev_xmit(nskb, rc);
                if (unlikely(rc != NETDEV_TX_OK)) {
                        /* Codes outside NETDEV_TX_MASK: give up on the rest. */
                        if (rc & ~NETDEV_TX_MASK)
                                goto out_kfree_gso_skb;
                        /* Otherwise put the segment back for a later retry. */
                        nskb->next = skb->next;
                        skb->next = nskb;
                        return rc;
                }
                txq_trans_update(txq);
                if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
                        return NETDEV_TX_BUSY;
        } while (skb->next);

out_kfree_gso_skb:
        /*
         * With no segments left, restore the original destructor:
         * dev_gso_skb_destructor() dereferences skb->next unconditionally.
         */
        if (likely(skb->next == NULL))
                skb->destructor = DEV_GSO_CB(skb)->destructor;
out_kfree_skb:
        kfree_skb(skb);
out:
        return rc;
}
2153
/*
 * Seed passed to jhash_1word()/jhash_3words() by the TX queue hash, the
 * XPS queue pick and the RX flow hash in this file.
 */
static u32 hashrnd __read_mostly;
2155
2156 /*
2157  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2158  * to be used as a distribution range.
2159  */
2160 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2161                   unsigned int num_tx_queues)
2162 {
2163         u32 hash;
2164
2165         if (skb_rx_queue_recorded(skb)) {
2166                 hash = skb_get_rx_queue(skb);
2167                 while (unlikely(hash >= num_tx_queues))
2168                         hash -= num_tx_queues;
2169                 return hash;
2170         }
2171
2172         if (skb->sk && skb->sk->sk_hash)
2173                 hash = skb->sk->sk_hash;
2174         else
2175                 hash = (__force u16) skb->protocol ^ skb->rxhash;
2176         hash = jhash_1word(hash, hashrnd);
2177
2178         return (u16) (((u64) hash * num_tx_queues) >> 32);
2179 }
2180 EXPORT_SYMBOL(__skb_tx_hash);
2181
2182 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2183 {
2184         if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2185                 if (net_ratelimit()) {
2186                         pr_warning("%s selects TX queue %d, but "
2187                                 "real number of TX queues is %d\n",
2188                                 dev->name, queue_index, dev->real_num_tx_queues);
2189                 }
2190                 return 0;
2191         }
2192         return queue_index;
2193 }
2194
/*
 * Pick a Tx queue for @skb from the device's XPS map of the current CPU.
 *
 * Returns the queue index, or -1 when XPS is not compiled in, the device
 * has no XPS maps, this CPU has no map, or the selected queue is not below
 * real_num_tx_queues.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
        struct xps_dev_maps *maps;
        struct xps_map *cpu_map;
        int picked = -1;

        rcu_read_lock();

        maps = rcu_dereference(dev->xps_maps);
        if (!maps)
                goto unlock;

        cpu_map = rcu_dereference(maps->cpu_map[raw_smp_processor_id()]);
        if (!cpu_map)
                goto unlock;

        if (cpu_map->len == 1) {
                picked = cpu_map->queues[0];
        } else {
                u32 key;

                /* Same key derivation as the generic Tx hash. */
                if (skb->sk && skb->sk->sk_hash)
                        key = skb->sk->sk_hash;
                else
                        key = (__force u16) skb->protocol ^
                            skb->rxhash;
                key = jhash_1word(key, hashrnd);
                picked = cpu_map->queues[((u64)key * cpu_map->len) >> 32];
        }

        if (unlikely(picked >= dev->real_num_tx_queues))
                picked = -1;

unlock:
        rcu_read_unlock();

        return picked;
#else
        return -1;
#endif
}
2232
/*
 * Choose the Tx queue of @dev that @skb will be sent on, record the
 * choice in the skb's queue mapping and return the queue.
 *
 * Selection order: queue 0 for single-queue devices; the driver's
 * ndo_select_queue() (capped by dev_cap_txqueue()) when provided;
 * otherwise the queue cached on the socket, recomputed through XPS and
 * then the Tx hash when the cache is unset or out of range, or when the
 * skb is flagged ooo_okay.
 */
static struct netdev_queue *dev_pick_tx(struct net_device *dev,
                                        struct sk_buff *skb)
{
        int queue_index;
        const struct net_device_ops *ops = dev->netdev_ops;

        if (dev->real_num_tx_queues == 1)
                queue_index = 0;
        else if (ops->ndo_select_queue) {
                queue_index = ops->ndo_select_queue(dev, skb);
                queue_index = dev_cap_txqueue(dev, queue_index);
        } else {
                struct sock *sk = skb->sk;
                queue_index = sk_tx_queue_get(sk);

                if (queue_index < 0 || skb->ooo_okay ||
                    queue_index >= dev->real_num_tx_queues) {
                        int old_index = queue_index;

                        queue_index = get_xps_queue(dev, skb);
                        if (queue_index < 0)
                                queue_index = skb_tx_hash(dev, skb);

                        /*
                         * Cache the new queue on the socket, but only when
                         * the skb uses the socket's own cached dst.
                         */
                        if (queue_index != old_index && sk) {
                                struct dst_entry *dst =
                                    rcu_dereference_check(sk->sk_dst_cache, 1);

                                if (dst && skb_dst(skb) == dst)
                                        sk_tx_queue_set(sk, queue_index);
                        }
                }
        }

        skb_set_queue_mapping(skb, queue_index);
        return netdev_get_tx_queue(dev, queue_index);
}
2269
/*
 * Submit @skb to qdisc @q of @dev/@txq under the qdisc root lock.
 *
 * Three outcomes:
 *  - the qdisc is deactivated: the skb is freed, NET_XMIT_DROP is returned;
 *  - the qdisc allows bypass, is empty and not running: the skb is sent
 *    directly with sch_direct_xmit(), NET_XMIT_SUCCESS is returned;
 *  - otherwise the skb is enqueued and the qdisc is run if nobody else is
 *    running it; the enqueue result is returned.
 *
 * q->busylock is taken first when the qdisc was seen running on entry,
 * and dropped either before __qdisc_run() or on exit.
 */
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
                                 struct net_device *dev,
                                 struct netdev_queue *txq)
{
        spinlock_t *root_lock = qdisc_lock(q);
        bool contended = qdisc_is_running(q);
        int rc;

        /*
         * Heuristic to force contended enqueues to serialize on a
         * separate lock before trying to get qdisc main lock.
         * This permits __QDISC_STATE_RUNNING owner to get the lock more often
         * and dequeue packets faster.
         */
        if (unlikely(contended))
                spin_lock(&q->busylock);

        spin_lock(root_lock);
        if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
                kfree_skb(skb);
                rc = NET_XMIT_DROP;
        } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
                   qdisc_run_begin(q)) {
                /*
                 * This is a work-conserving queue; there are no old skbs
                 * waiting to be sent out; and the qdisc is not running -
                 * xmit the skb directly.
                 */
                if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
                        skb_dst_force(skb);
                __qdisc_update_bstats(q, skb->len);
                if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
                        /* Release busylock before the qdisc run loop. */
                        if (unlikely(contended)) {
                                spin_unlock(&q->busylock);
                                contended = false;
                        }
                        __qdisc_run(q);
                } else
                        qdisc_run_end(q);

                rc = NET_XMIT_SUCCESS;
        } else {
                skb_dst_force(skb);
                rc = qdisc_enqueue_root(skb, q);
                if (qdisc_run_begin(q)) {
                        /* Release busylock before the qdisc run loop. */
                        if (unlikely(contended)) {
                                spin_unlock(&q->busylock);
                                contended = false;
                        }
                        __qdisc_run(q);
                }
        }
        spin_unlock(root_lock);
        /* Still held if no branch above released it. */
        if (unlikely(contended))
                spin_unlock(&q->busylock);
        return rc;
}
2327
/*
 * Per-CPU nesting depth of dev_hard_start_xmit() calls made from
 * dev_queue_xmit() on queueless devices.  dev_queue_xmit() refuses to
 * transmit once the depth exceeds RECURSION_LIMIT.
 */
static DEFINE_PER_CPU(int, xmit_recursion);
#define RECURSION_LIMIT 10
2330
/**
 *      dev_queue_xmit - transmit a buffer
 *      @skb: buffer to transmit
 *
 *      Queue a buffer for transmission to a network device. The caller must
 *      have set the device and priority and built the buffer before calling
 *      this function. The function can be called from an interrupt.
 *
 *      A negative errno code is returned on a failure. A success does not
 *      guarantee the frame will be transmitted as it may be dropped due
 *      to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 *      I notice this method can also return errors from the queue disciplines,
 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
 *      be positive.
 *
 *      Regardless of the return value, the skb is consumed, so it is currently
 *      difficult to retry a send to this method.  (You can bump the ref count
 *      before sending to hold a reference for retry if you are careful.)
 *
 *      When calling this method, interrupts MUST be enabled.  This is because
 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 *          --BLG
 */
int dev_queue_xmit(struct sk_buff *skb)
{
        struct net_device *dev = skb->dev;
        struct netdev_queue *txq;
        struct Qdisc *q;
        int rc = -ENOMEM;

        /* Disable soft irqs for various locks below. Also
         * stops preemption for RCU.
         */
        rcu_read_lock_bh();

        txq = dev_pick_tx(dev, skb);
        q = rcu_dereference_bh(txq->qdisc);

#ifdef CONFIG_NET_CLS_ACT
        skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
#endif
        trace_net_dev_queue(skb);
        /* A qdisc with an enqueue hook takes the packet from here. */
        if (q->enqueue) {
                rc = __dev_xmit_skb(skb, q, dev, txq);
                goto out;
        }

        /* The device has no queue. Common case for software devices:
           loopback, all the sorts of tunnels...

           Really, it is unlikely that netif_tx_lock protection is necessary
           here.  (f.e. loopback and IP tunnels are clean ignoring statistics
           counters.)
           However, it is possible, that they rely on protection
           made by us here.

           Check this and shot the lock. It is not prone from deadlocks.
           Either shot noqueue qdisc, it is even simpler 8)
         */
        if (dev->flags & IFF_UP) {
                int cpu = smp_processor_id(); /* ok because BHs are off */

                /* Owning the xmit lock already means we re-entered ourselves. */
                if (txq->xmit_lock_owner != cpu) {

                        if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
                                goto recursion_alert;

                        HARD_TX_LOCK(dev, txq, cpu);

                        if (!netif_tx_queue_stopped(txq)) {
                                /* Track nesting depth around the driver call. */
                                __this_cpu_inc(xmit_recursion);
                                rc = dev_hard_start_xmit(skb, dev, txq);
                                __this_cpu_dec(xmit_recursion);
                                if (dev_xmit_complete(rc)) {
                                        HARD_TX_UNLOCK(dev, txq);
                                        goto out;
                                }
                        }
                        HARD_TX_UNLOCK(dev, txq);
                        if (net_ratelimit())
                                printk(KERN_CRIT "Virtual device %s asks to "
                                       "queue packet!\n", dev->name);
                } else {
                        /* Recursion is detected! It is possible,
                         * unfortunately
                         */
recursion_alert:
                        if (net_ratelimit())
                                printk(KERN_CRIT "Dead loop on virtual device "
                                       "%s, fix it urgently!\n", dev->name);
                }
        }

        /*
         * Reached when the device is down, its queue is stopped, the
         * transmit did not complete, or recursion was detected: drop the
         * packet and report -ENETDOWN.
         */
        rc = -ENETDOWN;
        rcu_read_unlock_bh();

        kfree_skb(skb);
        return rc;
out:
        rcu_read_unlock_bh();
        return rc;
}
EXPORT_SYMBOL(dev_queue_xmit);
2436
2437
2438 /*=======================================================================
2439                         Receiver routines
2440   =======================================================================*/
2441
/*
 * Receive-path tunables with their default values.
 * NOTE(review): the code that reads these lies outside this part of the
 * file; check each consumer before changing a default.
 */
int netdev_max_backlog __read_mostly = 1000;
int netdev_tstamp_prequeue __read_mostly = 1;
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;            /* old backlog weight */
2446
2447 /* Called with irq disabled */
2448 static inline void ____napi_schedule(struct softnet_data *sd,
2449                                      struct napi_struct *napi)
2450 {
2451         list_add_tail(&napi->poll_list, &sd->poll_list);
2452         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2453 }
2454
/*
 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
 * and src/dst port numbers. Returns a non-zero hash number on success
 * and 0 on failure.
 *
 * Only IPv4 and IPv6 packets are hashed; any other protocol, or a packet
 * whose network header cannot be pulled, yields 0.  Addresses and ports
 * are each ordered before hashing so that both directions of a flow
 * produce the same value.
 */
__u32 __skb_get_rxhash(struct sk_buff *skb)
{
        int nhoff, hash = 0, poff;
        struct ipv6hdr *ip6;
        struct iphdr *ip;
        u8 ip_proto;
        u32 addr1, addr2, ihl;
        union {
                u32 v32;
                u16 v16[2];
        } ports;

        nhoff = skb_network_offset(skb);

        switch (skb->protocol) {
        case __constant_htons(ETH_P_IP):
                if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
                        goto done;

                ip = (struct iphdr *) (skb->data + nhoff);
                /* Fragments: ignore the transport protocol (and so ports). */
                if (ip->frag_off & htons(IP_MF | IP_OFFSET))
                        ip_proto = 0;
                else
                        ip_proto = ip->protocol;
                addr1 = (__force u32) ip->saddr;
                addr2 = (__force u32) ip->daddr;
                ihl = ip->ihl;
                break;
        case __constant_htons(ETH_P_IPV6):
                if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
                        goto done;

                ip6 = (struct ipv6hdr *) (skb->data + nhoff);
                ip_proto = ip6->nexthdr;
                /* Only the last 32 bits of each IPv6 address are used. */
                addr1 = (__force u32) ip6->saddr.s6_addr32[3];
                addr2 = (__force u32) ip6->daddr.s6_addr32[3];
                /* Fixed 40-byte header, in 4-byte units like ip->ihl. */
                ihl = (40 >> 2);
                break;
        default:
                goto done;
        }

        /* Ports stay 0 if the protocol has none or they cannot be pulled. */
        ports.v32 = 0;
        poff = proto_ports_offset(ip_proto);
        if (poff >= 0) {
                nhoff += ihl * 4 + poff;
                if (pskb_may_pull(skb, nhoff + 4)) {
                        ports.v32 = * (__force u32 *) (skb->data + nhoff);
                        if (ports.v16[1] < ports.v16[0])
                                swap(ports.v16[0], ports.v16[1]);
                }
        }

        /* get a consistent hash (same value on both flow directions) */
        if (addr2 < addr1)
                swap(addr1, addr2);

        hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
        /* 0 is reserved for "no hash", so never return it on success. */
        if (!hash)
                hash = 1;

done:
        return hash;
}
EXPORT_SYMBOL(__skb_get_rxhash);
2525
2526 #ifdef CONFIG_RPS
2527
2528 /* One global table that all flow-based protocols share. */
2529 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2530 EXPORT_SYMBOL(rps_sock_flow_table);
2531
2532 /*
2533  * get_rps_cpu is called from netif_rx and netif_receive_skb and returns
2534  * the target CPU from the RPS map of the receiving queue for a given skb.
2535  * rcu_read_lock must be held on entry.
      *
      * @dev:    device the packet arrived on; its _rx[] queues carry the RPS
      *          map and the per-queue flow table consulted here
      * @skb:    packet being steered; its network header is reset and its
      *          rxhash is obtained through skb_get_rxhash()
      * @rflowp: written only when the CPU is chosen through the flow tables,
      *          in which case it points at the matching flow-table entry
      *
      * Returns the CPU whose backlog should receive the packet, or -1 when
      * no online target was selected (the callers then fall back to the
      * local CPU or to direct processing).
2536  */
2537 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2538                        struct rps_dev_flow **rflowp)
2539 {
2540         struct netdev_rx_queue *rxqueue;
2541         struct rps_map *map;
2542         struct rps_dev_flow_table *flow_table;
2543         struct rps_sock_flow_table *sock_flow_table;
2544         int cpu = -1;
2545         u16 tcpu;
2546
             /* Use the rx queue recorded in the skb, else default to queue 0. */
2547         if (skb_rx_queue_recorded(skb)) {
2548                 u16 index = skb_get_rx_queue(skb);
2549                 if (unlikely(index >= dev->real_num_rx_queues)) {
                             /* Out-of-range queue: warn (multiqueue only), no steering. */
2550                         WARN_ONCE(dev->real_num_rx_queues > 1,
2551                                   "%s received packet on queue %u, but number "
2552                                   "of RX queues is %u\n",
2553                                   dev->name, index, dev->real_num_rx_queues);
2554                         goto done;
2555                 }
2556                 rxqueue = dev->_rx + index;
2557         } else
2558                 rxqueue = dev->_rx;
2559
2560         map = rcu_dereference(rxqueue->rps_map);
2561         if (map) {
                     /*
                      * Single-entry map: take that CPU (if online) without
                      * hashing; the flow tables are not consulted on this path.
                      */
2562                 if (map->len == 1) {
2563                         tcpu = map->cpus[0];
2564                         if (cpu_online(tcpu))
2565                                 cpu = tcpu;
2566                         goto done;
2567                 }
2568         } else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
                     /* Neither a map nor a flow table: nothing to steer with. */
2569                 goto done;
2570         }
2571
             /* A zero rxhash means no usable hash; leave the packet unsteered. */
2572         skb_reset_network_header(skb);
2573         if (!skb_get_rxhash(skb))
2574                 goto done;
2575
2576         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2577         sock_flow_table = rcu_dereference(rps_sock_flow_table);
2578         if (flow_table && sock_flow_table) {
2579                 u16 next_cpu;
2580                 struct rps_dev_flow *rflow;
2581
                     /* tcpu: CPU currently recorded for this flow on this queue. */
2582                 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2583                 tcpu = rflow->cpu;
2584
                     /* next_cpu: CPU recorded for this hash in the global table. */
2585                 next_cpu = sock_flow_table->ents[skb->rxhash &
2586                     sock_flow_table->mask];
2587
2588                 /*
2589                  * If the desired CPU (where last recvmsg was done) is
2590                  * different from current CPU (one in the rx-queue flow
2591                  * table entry), switch if one of the following holds:
2592                  *   - Current CPU is unset (equal to RPS_NO_CPU).
2593                  *   - Current CPU is offline.
2594                  *   - The current CPU's queue tail has advanced beyond the
2595                  *     last packet that was enqueued using this table entry.
2596                  *     This guarantees that all previous packets for the flow
2597                  *     have been dequeued, thus preserving in order delivery.
2598                  */
2599                 if (unlikely(tcpu != next_cpu) &&
2600                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2601                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2602                       rflow->last_qtail)) >= 0)) {
2603                         tcpu = rflow->cpu = next_cpu;
                             /* Restart the in-order tracking on the new CPU. */
2604                         if (tcpu != RPS_NO_CPU)
2605                                 rflow->last_qtail = per_cpu(softnet_data,
2606                                     tcpu).input_queue_head;
2607                 }
2608                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2609                         *rflowp = rflow;
2610                         cpu = tcpu;
2611                         goto done;
2612                 }
2613         }
2614
             /*
              * Fall back to the plain RPS map: scale the 32-bit hash into
              * [0, map->len) with a multiply and shift.
              */
2615         if (map) {
2616                 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2617
2618                 if (cpu_online(tcpu)) {
2619                         cpu = tcpu;
2620                         goto done;
2621                 }
2622         }
2623
2624 done:
2625         return cpu;
2626 }
2627
2628 /* Called from hardirq (IPI) context.
      * @data is the struct softnet_data whose backlog NAPI is to be scheduled;
      * its received_rps counter is incremented as well.
      */
2629 static void rps_trigger_softirq(void *data)
2630 {
2631         struct softnet_data *sd = data;
2632
2633         ____napi_schedule(sd, &sd->backlog);
2634         sd->received_rps++;
2635 }
2636
2637 #endif /* CONFIG_RPS */
2638
2639 /*
2640  * Check if this softnet_data structure is another cpu one
2641  * If yes, queue it to our IPI list and return 1
2642  * If no, return 0
      *
      * Without CONFIG_RPS this always returns 0.  The only caller visible
      * here, enqueue_to_backlog(), invokes it with local interrupts disabled.
2643  */
2644 static int rps_ipi_queued(struct softnet_data *sd)
2645 {
2646 #ifdef CONFIG_RPS
2647         struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2648
2649         if (sd != mysd) {
                     /* Push sd onto the head of this CPU's pending-IPI list. */
2650                 sd->rps_ipi_next = mysd->rps_ipi_list;
2651                 mysd->rps_ipi_list = sd;
2652
2653                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2654                 return 1;
2655         }
2656 #endif /* CONFIG_RPS */
2657         return 0;
2658 }
2659
2660 /*
2661  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2662  * queue (may be a remote CPU queue).
      *
      * @skb:   packet to queue; freed here if the backlog is over its limit
      * @cpu:   CPU whose softnet_data backlog receives the packet
      * @qtail: handed to input_queue_tail_incr_save() on success
      *         (presumably receives the new queue tail -- confirm in that
      *         helper)
      *
      * Returns NET_RX_SUCCESS when queued, NET_RX_DROP when the queue length
      * already exceeds netdev_max_backlog.
2663  */
2664 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2665                               unsigned int *qtail)
2666 {
2667         struct softnet_data *sd;
2668         unsigned long flags;
2669
2670         sd = &per_cpu(softnet_data, cpu);
2671
2672         local_irq_save(flags);
2673
2674         rps_lock(sd);
2675         if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
                     /*
                      * Non-empty queue: just append.  The "enqueue" label is
                      * also the target of the goto below, used once the
                      * backlog NAPI has been dealt with for an empty queue.
                      */
2676                 if (skb_queue_len(&sd->input_pkt_queue)) {
2677 enqueue:
2678                         __skb_queue_tail(&sd->input_pkt_queue, skb);
2679                         input_queue_tail_incr_save(sd, qtail);
2680                         rps_unlock(sd);
2681                         local_irq_restore(flags);
2682                         return NET_RX_SUCCESS;
2683                 }
2684
2685                 /* Schedule NAPI for backlog device
2686                  * We can use non atomic operation since we own the queue lock
2687                  */
2688                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
                             /* Remote sd: deferred via the IPI list; local: schedule now. */
2689                         if (!rps_ipi_queued(sd))
2690                                 ____napi_schedule(sd, &sd->backlog);
2691                 }
2692                 goto enqueue;
2693         }
2694
             /* Backlog over the limit: account the drop and free the packet. */
2695         sd->dropped++;
2696         rps_unlock(sd);
2697
2698         local_irq_restore(flags);
2699
2700         atomic_long_inc(&skb->dev->rx_dropped);
2701         kfree_skb(skb);
2702         return NET_RX_DROP;
2703 }
2704
2705 /**
2706  *      netif_rx        -       post buffer to the network code
2707  *      @skb: buffer to post
2708  *
2709  *      This function receives a packet from a device driver and queues it for
2710  *      the upper (protocol) levels to process.  It always succeeds. The buffer
2711  *      may be dropped during processing for congestion control or by the
2712  *      protocol layers.
2713  *
      *      The packet is placed on a per-CPU backlog through
      *      enqueue_to_backlog(): the CPU picked by get_rps_cpu() when
      *      CONFIG_RPS is enabled and a target is found, otherwise the
      *      current CPU.
      *
2714  *      return values:
2715  *      NET_RX_SUCCESS  (no congestion)
2716  *      NET_RX_DROP     (packet was dropped)
2717  *
2718  */
2719
2720 int netif_rx(struct sk_buff *skb)
2721 {
2722         int ret;
2723
2724         /* if netpoll wants it, pretend we never saw it */
2725         if (netpoll_rx(skb))
2726                 return NET_RX_DROP;
2727
             /* Timestamp before queueing only when configured to do so. */
2728         if (netdev_tstamp_prequeue)
2729                 net_timestamp_check(skb);
2730
2731         trace_netif_rx(skb);
2732 #ifdef CONFIG_RPS
2733         {
                     /*
                      * voidflow is a scratch entry so that &rflow->last_qtail is
                      * a valid destination even when get_rps_cpu() does not
                      * set rflow.
                      */
2734                 struct rps_dev_flow voidflow, *rflow = &voidflow;
2735                 int cpu;
2736
2737                 preempt_disable();
2738                 rcu_read_lock();
2739
2740                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
                     /* No steering target: queue on the CPU we are running on. */
2741                 if (cpu < 0)
2742                         cpu = smp_processor_id();
2743
2744                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2745
2746                 rcu_read_unlock();
2747                 preempt_enable();
2748         }
2749 #else
2750         {
                     /* qtail is written by enqueue_to_backlog() and then ignored. */
2751                 unsigned int qtail;
2752                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2753                 put_cpu();
2754         }
2755 #endif
2756         return ret;
2757 }
2758 EXPORT_SYMBOL(netif_rx);
2759
2760 int netif_rx_ni(struct sk_buff *skb)
2761 {
             /*
              * Wrapper around netif_rx() that, with preemption disabled, also
              * runs do_softirq() right away if any softirq is pending on this
              * CPU afterwards.  Returns netif_rx()'s result.
              * (Presumably meant for callers outside interrupt context --
              * confirm against the call sites.)
              */
2762         int err;
2763
2764         preempt_disable();
2765         err = netif_rx(skb);
2766         if (local_softirq_pending())
2767                 do_softirq();
2768         preempt_enable();
2769
2770         return err;
2771 }
2772 EXPORT_SYMBOL(netif_rx_ni);
2773
2774 static void net_tx_action(struct softirq_action *h)
2775 {
             /*
              * Transmit-side deferred work for the current CPU, in two steps:
              * free the skbs parked on sd->completion_queue, then run every
              * qdisc chained on sd->output_queue.  @h is not used.
              * Presumably installed as the NET_TX_SOFTIRQ handler; the
              * registration is not visible in this part of the file.
              */
2776         struct softnet_data *sd = &__get_cpu_var(softnet_data);
2777
2778         if (sd->completion_queue) {
2779                 struct sk_buff *clist;
2780
                     /* Detach the whole list with irqs off, then walk it. */
2781                 local_irq_disable();
2782                 clist = sd->completion_queue;
2783                 sd->completion_queue = NULL;
2784                 local_irq_enable();
2785
2786                 while (clist) {
2787                         struct sk_buff *skb = clist;
2788                         clist = clist->next;
2789
                             /* Warn if the skb still has a non-zero user count. */
2790                         WARN_ON(atomic_read(&skb->users));
2791                         trace_kfree_skb(skb, net_tx_action);
2792                         __kfree_skb(skb);
2793                 }
2794         }
2795
2796         if (sd->output_queue) {
2797                 struct Qdisc *head;
2798
                     /* Detach the qdisc list and reset the tail pointer, irqs off. */
2799                 local_irq_disable();
2800                 head = sd->output_queue;
2801                 sd->output_queue = NULL;
2802                 sd->output_queue_tailp = &sd->output_queue;
2803                 local_irq_enable();
2804
2805                 while (head) {
2806                         struct Qdisc *q = head;
2807                         spinlock_t *root_lock;
2808
2809                         head = head->next_sched;
2810
2811                         root_lock = qdisc_lock(q);
2812                         if (spin_trylock(root_lock)) {
                                     /* Got the lock: clear SCHED, then run the qdisc. */
2813                                 smp_mb__before_clear_bit();
2814                                 clear_bit(__QDISC_STATE_SCHED,
2815                                           &q->state);
2816                                 qdisc_run(q);
2817                                 spin_unlock(root_lock);
2818                         } else {
                                     /*
                                      * Lock contended: try again later unless the
                                      * qdisc is deactivated, in which case only
                                      * the SCHED bit is cleared.
                                      */
2819                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
2820                                               &q->state)) {
2821                                         __netif_reschedule(q);
2822                                 } else {
2823                                         smp_mb__before_clear_bit();
2824                                         clear_bit(__QDISC_STATE_SCHED,
2825                                                   &q->state);
2826                                 }
2827                         }
2828                 }
2829         }
2830 }
2831
2832 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2833     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
2834 /* This hook is defined here for ATM LANE.
      * It is only built when both bridging and ATM LANE are enabled (built in
      * or modular).  The pointer is not assigned anywhere in this part of the
      * file; presumably the bridge code installs it and LANE calls it to test
      * @addr on @dev -- confirm against those modules.
      */
2835 int (*br_fdb_test_addr_hook)(struct net_device *dev,
2836                              unsigned char *addr) __read_mostly;
2837 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2838 #endif
2839
2840 #ifdef CONFIG_NET_CLS_ACT
2841 /* TODO: Maybe we should just force sch_ingress to be compiled in
2842  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
2843  * a compare and 2 stores extra right now if we dont have it on
2844  * but have CONFIG_NET_CLS_ACT
2845  * NOTE: This doesnt stop any functionality; if you dont have
2846  * the ingress scheduler, you just cant add policies on ingress.
2847  *
      * ing_filter() runs @skb through the qdisc attached to @rxq.
      * Returns TC_ACT_SHOT when the redirect-loop counter carried in
      * skb->tc_verd was already above MAX_RED_LOOP, the result of
      * qdisc_enqueue_root() when the qdisc is in use, and TC_ACT_OK
      * otherwise (noop or deactivated qdisc).
2848  */
2849 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
2850 {
2851         struct net_device *dev = skb->dev;
2852         u32 ttl = G_TC_RTTL(skb->tc_verd);
2853         int result = TC_ACT_OK;
2854         struct Qdisc *q;
2855
             /* The old ttl is compared; the increment happens either way. */
2856         if (unlikely(MAX_RED_LOOP < ttl++)) {
2857                 if (net_ratelimit())
2858                         pr_warning( "Redir loop detected Dropping packet (%d->%d)\n",
2859                                skb->skb_iif, dev->ifindex);
2860                 return TC_ACT_SHOT;
2861         }
2862
             /* Store the bumped loop counter and mark the packet as at ingress. */
2863         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2864         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2865
2866         q = rxq->qdisc;
2867         if (q != &noop_qdisc) {
                     /* Enqueue under the qdisc lock unless it has been deactivated. */
2868                 spin_lock(qdisc_lock(q));
2869                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2870                         result = qdisc_enqueue_root(skb, q);
2871                 spin_unlock(qdisc_lock(q));
2872         }
2873
2874         return result;
2875 }
2876
2877 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2878                                          struct packet_type **pt_prev,
2879                                          int *ret, struct net_device *orig_dev)
2880 {
             /*
              * Ingress classification step of the receive path.
              *
              * If skb->dev has an ingress queue with a real (non-noop) qdisc,
              * any delivery still pending in *pt_prev is performed first
              * (result stored in *ret, *pt_prev cleared) and the packet is
              * then passed to ing_filter().
              *
              * Returns NULL after freeing the skb when the verdict is
              * TC_ACT_SHOT or TC_ACT_STOLEN; otherwise returns the skb with
              * tc_verd reset to 0.
              */
2881         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
2882
2883         if (!rxq || rxq->qdisc == &noop_qdisc)
2884                 goto out;
2885
2886         if (*pt_prev) {
2887                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2888                 *pt_prev = NULL;
2889         }
2890
2891         switch (ing_filter(skb, rxq)) {
2892         case TC_ACT_SHOT:
2893         case TC_ACT_STOLEN:
2894                 kfree_skb(skb);
2895                 return NULL;
2896         }
2897
2898 out:
2899         skb->tc_verd = 0;
2900         return skb;
2901 }
2902 #endif
2903
2904 /**
2905  *      netdev_rx_handler_register - register receive handler
2906  *      @dev: device to register a handler for
2907  *      @rx_handler: receive handler to register
2908  *      @rx_handler_data: data pointer that is used by rx handler
2909  *
2910  *      Register a receive handler for a device. This handler will then be
2911  *      called from __netif_receive_skb. A negative errno code is returned
2912  *      on a failure.
2913  *
      *      Only one handler can be registered per device: -EBUSY is returned
      *      if @dev already has one, 0 on success.
      *
2914  *      The caller must hold the rtnl_mutex.
2915  */
2916 int netdev_rx_handler_register(struct net_device *dev,
2917                                rx_handler_func_t *rx_handler,
2918                                void *rx_handler_data)
2919 {
2920         ASSERT_RTNL();
2921
2922         if (dev->rx_handler)
2923                 return -EBUSY;
2924
             /*
              * The data pointer is published before the handler itself,
              * presumably so that a reader which sees the handler can also
              * see its data.
              */
2925         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
2926         rcu_assign_pointer(dev->rx_handler, rx_handler);
2927
2928         return 0;
2929 }
2930 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
2931
2932 /**
2933  *      netdev_rx_handler_unregister - unregister receive handler
2934  *      @dev: device to unregister a handler from
2935  *
2936  *      Unregister a receive handler from a device.
2937  *
2938  *      The caller must hold the rtnl_mutex.
2939  */
2940 void netdev_rx_handler_unregister(struct net_device *dev)
2941 {
2942
2943         ASSERT_RTNL();
             /* Clear the handler first, then its data (reverse of register). */
2944         rcu_assign_pointer(dev->rx_handler, NULL);
2945         rcu_assign_pointer(dev->rx_handler_data, NULL);
2946 }
2947 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
2948
2949 static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
2950                                               struct net_device *master)
2951 {
             /*
              * For packets classified as PACKET_HOST, overwrite the Ethernet
              * destination address in the skb with @master's dev_addr
              * (ETH_ALEN bytes).  Other packet types are left untouched.
              */
2952         if (skb->pkt_type == PACKET_HOST) {
2953                 u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
2954
2955                 memcpy(dest, master->dev_addr, ETH_ALEN);
2956         }
2957 }
2958
2959 /* On bonding slaves other than the currently active slave, suppress
2960  * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
2961  * ARP on active-backup slaves with arp_validate enabled.
      *
      * @skb:    received packet; skb->dev is the slave it arrived on
      * @master: the bonding master of that slave
      *
      * Returns 1 when the packet should be suppressed, 0 when it should be
      * kept.  Side effects: refreshes dev->last_rx when the master has
      * IFF_MASTER_ARPMON set, and may rewrite the destination MAC (see
      * below).
2962  */
2963 int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
2964 {
2965         struct net_device *dev = skb->dev;
2966
2967         if (master->priv_flags & IFF_MASTER_ARPMON)
2968                 dev->last_rx = jiffies;
2969
2970         if ((master->priv_flags & IFF_MASTER_ALB) &&
2971             (master->priv_flags & IFF_BRIDGE_PORT)) {
2972                 /* Do address unmangle. The local destination address
2973                  * will be always the one master has. Provides the right
2974                  * functionality in a bridge.
2975                  */
2976                 skb_bond_set_mac_by_master(skb, master);
2977         }
2978
             /* Only inactive slaves ever drop; the checks below are the exceptions. */
2979         if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
                     /* Exception: ARP on a slave flagged IFF_SLAVE_NEEDARP. */
2980                 if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
2981                     skb->protocol == __cpu_to_be16(ETH_P_ARP))
2982                         return 0;
2983
                     /* Exception: ALB master and neither broadcast nor multicast. */
2984                 if (master->priv_flags & IFF_MASTER_ALB) {
2985                         if (skb->pkt_type != PACKET_BROADCAST &&
2986                             skb->pkt_type != PACKET_MULTICAST)
2987                                 return 0;
2988                 }
                     /* Exception: ETH_P_SLOW frames on an 802.3ad master. */
2989                 if (master->priv_flags & IFF_MASTER_8023AD &&
2990                     skb->protocol == __cpu_to_be16(ETH_P_SLOW))
2991                         return 0;
2992
2993                 return 1;
2994         }
2995         return 0;
2996 }
2997 EXPORT_SYMBOL(__skb_bond_should_drop);
2998
2999 static int __netif_receive_skb(struct sk_buff *skb)
3000 {
             /*
              * Core receive path: hand @skb to the registered packet handlers
              * (first the ptype_all list, then the ptype_base bucket for
              * skb->protocol).  In between, the ingress qdisc
              * (CONFIG_NET_CLS_ACT), the device rx_handler and VLAN
              * acceleration each get a chance to consume or redirect it.
              *
              * Delivery runs "one behind": a matching handler is remembered in
              * pt_prev and only invoked through deliver_skb() once the next
              * match or a consuming stage comes up; the last match is called
              * directly through pt_prev->func().
              *
              * Returns the result of the most recent delivery; NET_RX_DROP if
              * netpoll claims the packet or if nothing was delivered at all.
              */
3001         struct packet_type *ptype, *pt_prev;
3002         rx_handler_func_t *rx_handler;
3003         struct net_device *orig_dev;
3004         struct net_device *master;
3005         struct net_device *null_or_orig;
3006         struct net_device *orig_or_bond;
3007         int ret = NET_RX_DROP;
3008         __be16 type;
3009
             /* Timestamp here unless it was already done before queueing. */
3010         if (!netdev_tstamp_prequeue)
3011                 net_timestamp_check(skb);
3012
3013         trace_netif_receive_skb(skb);
3014
3015         /* if we've gotten here through NAPI, check netpoll */
3016         if (netpoll_receive_skb(skb))
3017                 return NET_RX_DROP;
3018
             /* Record the incoming interface once; later passes keep it. */
3019         if (!skb->skb_iif)
3020                 skb->skb_iif = skb->dev->ifindex;
3021
3022         /*
3023          * bonding note: skbs received on inactive slaves should only
3024          * be delivered to pkt handlers that are exact matches.  Also
3025          * the deliver_no_wcard flag will be set.  If packet handlers
3026          * are sensitive to duplicate packets these skbs will need to
3027          * be dropped at the handler.
3028          */
3029         null_or_orig = NULL;
3030         orig_dev = skb->dev;
3031         master = ACCESS_ONCE(orig_dev->master);
3032         if (skb->deliver_no_wcard)
3033                 null_or_orig = orig_dev;
3034         else if (master) {
3035                 if (skb_bond_should_drop(skb, master)) {
3036                         skb->deliver_no_wcard = 1;
3037                         null_or_orig = orig_dev; /* deliver only exact match */
3038                 } else
3039                         skb->dev = master;
3040         }
3041
3042         __this_cpu_inc(softnet_data.processed);
3043         skb_reset_network_header(skb);
3044         skb_reset_transport_header(skb);
3045         skb->mac_len = skb->network_header - skb->mac_header;
3046
3047         pt_prev = NULL;
3048
3049         rcu_read_lock();
3050
3051 #ifdef CONFIG_NET_CLS_ACT
             /* TC_NCLS set: clear it and skip the taps and ingress qdisc. */
3052         if (skb->tc_verd & TC_NCLS) {
3053                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3054                 goto ncls;
3055         }
3056 #endif
3057
             /*
              * Handlers registered for all protocols.  With null_or_orig ==
              * NULL, handlers not bound to a device (ptype->dev == NULL)
              * match as well.
              */
3058         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3059                 if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
3060                     ptype->dev == orig_dev) {
3061                         if (pt_prev)
3062                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3063                         pt_prev = ptype;
3064                 }
3065         }
3066
3067 #ifdef CONFIG_NET_CLS_ACT
             /* A NULL return means the ingress path consumed the skb. */
3068         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3069         if (!skb)
3070                 goto out;
3071 ncls:
3072 #endif
3073
3074         /* Handle special case of bridge or macvlan */
3075         rx_handler = rcu_dereference(skb->dev->rx_handler);
3076         if (rx_handler) {
                     /* Flush the pending delivery before the handler may take the skb. */
3077                 if (pt_prev) {
3078                         ret = deliver_skb(skb, pt_prev, orig_dev);
3079                         pt_prev = NULL;
3080                 }
3081                 skb = rx_handler(skb);
3082                 if (!skb)
3083                         goto out;
3084         }
3085
3086         if (vlan_tx_tag_present(skb)) {
3087                 if (pt_prev) {
3088                         ret = deliver_skb(skb, pt_prev, orig_dev);
3089                         pt_prev = NULL;
3090                 }
                     /*
                      * Non-zero return: run the (possibly replaced) skb through
                      * this function again.  Zero return with skb set to NULL:
                      * nothing left to process.
                      */
3091                 if (vlan_hwaccel_do_receive(&skb)) {
3092                         ret = __netif_receive_skb(skb);
3093                         goto out;
3094                 } else if (unlikely(!skb))
3095                         goto out;
3096         }
3097
3098         /*
3099          * Make sure frames received on VLAN interfaces stacked on
3100          * bonding interfaces still make their way to any base bonding
3101          * device that may have registered for a specific ptype.  The
3102          * handler may have to adjust skb->dev and orig_dev.
3103          */
3104         orig_or_bond = orig_dev;
3105         if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
3106             (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
3107                 orig_or_bond = vlan_dev_real_dev(skb->dev);
3108         }
3109
             /* Protocol-specific handlers from the hash bucket for this type. */
3110         type = skb->protocol;
3111         list_for_each_entry_rcu(ptype,
3112                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3113                 if (ptype->type == type && (ptype->dev == null_or_orig ||
3114                      ptype->dev == skb->dev || ptype->dev == orig_dev ||
3115                      ptype->dev == orig_or_bond)) {
3116                         if (pt_prev)
3117                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3118                         pt_prev = ptype;
3119                 }
3120         }
3121
             /* Last match gets the skb directly; with no match at all it is dropped. */
3122         if (pt_prev) {
3123                 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3124         } else {
3125                 atomic_long_inc(&skb->dev->rx_dropped);
3126                 kfree_skb(skb);
3127                 /* Jamal, now you will not able to escape explaining
3128                  * me how you were going to use this. :-)
3129                  */
3130                 ret = NET_RX_DROP;
3131         }
3132
3133 out:
3134         rcu_read_unlock();
3135         return ret;
3136 }
3137
3138 /**
3139  *      netif_receive_skb - process receive buffer from network
3140  *      @skb: buffer to process
3141  *
3142  *      netif_receive_skb() is the main receive data processing function.
3143  *      It always succeeds. The buffer may be dropped during processing
3144  *      for congestion control or by the protocol layers.
3145  *
3146  *      This function may only be called from softirq context and interrupts
3147  *      should be enabled.
3148  *
      *      With CONFIG_RPS, a packet for which get_rps_cpu() selects a CPU is
      *      queued on that CPU's backlog instead of being processed here;
      *      otherwise it goes straight to __netif_receive_skb().
      *
3149  *      Return values (usually ignored):
3150  *      NET_RX_SUCCESS: no congestion
3151  *      NET_RX_DROP: packet was dropped
3152  */
3153 int netif_receive_skb(struct sk_buff *skb)
3154 {
3155         if (netdev_tstamp_prequeue)
3156                 net_timestamp_check(skb);
3157
             /* Non-zero: handling was deferred by the timestamping code. */
3158         if (skb_defer_rx_timestamp(skb))
3159                 return NET_RX_SUCCESS;
3160
3161 #ifdef CONFIG_RPS
3162         {
                     /* voidflow: scratch target for last_qtail when no flow matched. */
3163                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3164                 int cpu, ret;
3165
3166                 rcu_read_lock();
3167
3168                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3169
3170                 if (cpu >= 0) {
                             /* rflow may point into an RCU-protected table: enqueue before unlocking. */
3171                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3172                         rcu_read_unlock();
3173                 } else {
3174                         rcu_read_unlock();
3175                         ret = __netif_receive_skb(skb);
3176                 }
3177
3178                 return ret;
3179         }
3180 #else
3181         return __netif_receive_skb(skb);
3182 #endif
3183 }
3184 EXPORT_SYMBOL(netif_receive_skb);
3185
3186 /* Network device is going away, flush any packets still pending
3187  * Called with irqs disabled.
      *
      * @arg is the struct net_device being removed.  Every skb belonging to
      * it is unlinked from this CPU's input_pkt_queue and process_queue and
      * freed; the input queue head counter is advanced once per freed skb.
3188  */
3189 static void flush_backlog(void *arg)
3190 {
3191         struct net_device *dev = arg;
3192         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3193         struct sk_buff *skb, *tmp;
3194
             /* input_pkt_queue is walked under rps_lock. */
3195         rps_lock(sd);
3196         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3197                 if (skb->dev == dev) {
3198                         __skb_unlink(skb, &sd->input_pkt_queue);
3199                         kfree_skb(skb);
3200                         input_queue_head_incr(sd);
3201                 }
3202         }
3203         rps_unlock(sd);
3204
             /* process_queue is walked without taking rps_lock. */
3205         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3206                 if (skb->dev == dev) {
3207                         __skb_unlink(skb, &sd->process_queue);
3208                         kfree_skb(skb);
3209                         input_queue_head_incr(sd);
3210                 }
3211         }
3212 }
3213
3214 static int napi_gro_complete(struct sk_buff *skb)
3215 {
             /*
              * Finish GRO processing of a held @skb and pass it up the stack
              * through netif_receive_skb().
              *
              * If more than one packet was merged (count != 1), the
              * gro_complete() callback of the matching device-independent
              * packet type is run first.  If that callback fails, or none is
              * found (err stays -ENOENT), the skb is freed and NET_RX_SUCCESS
              * is returned.
              */
3216         struct packet_type *ptype;
3217         __be16 type = skb->protocol;
3218         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3219         int err = -ENOENT;
3220
             /* Nothing was merged into this skb: clear gso_size and deliver. */
3221         if (NAPI_GRO_CB(skb)->count == 1) {
3222                 skb_shinfo(skb)->gso_size = 0;
3223                 goto out;
3224         }
3225
3226         rcu_read_lock();
3227         list_for_each_entry_rcu(ptype, head, list) {
3228                 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3229                         continue;
3230
3231                 err = ptype->gro_complete(skb);
3232                 break;
3233         }
3234         rcu_read_unlock();
3235
3236         if (err) {
                     /* Warn when the loop ran off the end, i.e. no handler was found. */
3237                 WARN_ON(&ptype->list == head);
3238                 kfree_skb(skb);
3239                 return NET_RX_SUCCESS;
3240         }
3241
3242 out:
3243         return netif_receive_skb(skb);
3244 }
3245
3246 inline void napi_gro_flush(struct napi_struct *napi)
3247 {
             /*
              * Complete every skb currently held on @napi's gro_list via
              * napi_gro_complete(), then leave the list empty with
              * gro_count == 0.
              */
3248         struct sk_buff *skb, *next;
3249
3250         for (skb = napi->gro_list; skb; skb = next) {
                     /* Save the successor and detach skb before handing it on. */
3251                 next = skb->next;
3252                 skb->next = NULL;
3253                 napi_gro_complete(skb);
3254         }
3255
3256         napi->gro_count = 0;
3257         napi->gro_list = NULL;
3258 }
3259 EXPORT_SYMBOL(napi_gro_flush);
3260
3261 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3262 {
             /*
              * Offer @skb to the gro_receive() callback of the packet type
              * matching skb->protocol and decide what happens to it.
              *
              * Returns:
              *   GRO_MERGED / GRO_MERGED_FREE - callback marked skb same_flow;
              *                      MERGED_FREE when the callback set ->free
              *   GRO_HELD   - skb was put at the head of napi->gro_list
              *   GRO_NORMAL - GRO not applicable (device feature off, netpoll
              *                active, GSO or frag_list skb, no handler, flush
              *                requested, or MAX_GRO_SKBS already held)
              *
              * Apart from the same_flow exit, every path goes through the
              * "pull" step, which copies header bytes from the first page
              * fragment into the linear area if they are not there yet.
              */
3263         struct sk_buff **pp = NULL;
3264         struct packet_type *ptype;
3265         __be16 type = skb->protocol;
3266         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3267         int same_flow;
3268         int mac_len;
3269         enum gro_result ret;
3270
3271         if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3272                 goto normal;
3273
3274         if (skb_is_gso(skb) || skb_has_frag_list(skb))
3275                 goto normal;
3276
3277         rcu_read_lock();
3278         list_for_each_entry_rcu(ptype, head, list) {
                     /* Only device-independent handlers that implement gro_receive. */
3279                 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3280                         continue;
3281
3282                 skb_set_network_header(skb, skb_gro_offset(skb));
3283                 mac_len = skb->network_header - skb->mac_header;
3284                 skb->mac_len = mac_len;
3285                 NAPI_GRO_CB(skb)->same_flow = 0;
3286                 NAPI_GRO_CB(skb)->flush = 0;
3287                 NAPI_GRO_CB(skb)->free = 0;
3288
3289                 pp = ptype->gro_receive(&napi->gro_list, skb);
3290                 break;
3291         }
3292         rcu_read_unlock();
3293
             /* Loop ran to the end of the list: no handler for this protocol. */
3294         if (&ptype->list == head)
3295                 goto normal;
3296
3297         same_flow = NAPI_GRO_CB(skb)->same_flow;
3298         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3299
             /* The callback handed back a held skb: unlink it and complete it now. */
3300         if (pp) {
3301                 struct sk_buff *nskb = *pp;
3302
3303                 *pp = nskb->next;
3304                 nskb->next = NULL;
3305                 napi_gro_complete(nskb);
3306                 napi->gro_count--;
3307         }
3308
3309         if (same_flow)
3310                 goto ok;
3311
3312         if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3313                 goto normal;
3314
             /* Start holding this skb as the first packet of a new flow. */
3315         napi->gro_count++;
3316         NAPI_GRO_CB(skb)->count = 1;
3317         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3318         skb->next = napi->gro_list;
3319         napi->gro_list = skb;
3320         ret = GRO_HELD;
3321
3322 pull:
             /*
              * Headers parsed so far extend beyond the linear area: copy the
              * missing "grow" bytes from frag0 to the tail and shrink the
              * first fragment accordingly.
              */
3323         if (skb_headlen(skb) < skb_gro_offset(skb)) {
3324                 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3325
3326                 BUG_ON(skb->end - skb->tail < grow);
3327
3328                 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3329
3330                 skb->tail += grow;
3331                 skb->data_len -= grow;
3332
3333                 skb_shinfo(skb)->frags[0].page_offset += grow;
3334                 skb_shinfo(skb)->frags[0].size -= grow;
3335
                     /* First fragment fully consumed: release its page, shift the rest down. */
3336                 if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3337                         put_page(skb_shinfo(skb)->frags[0].page);
3338                         memmove(skb_shinfo(skb)->frags,
3339                                 skb_shinfo(skb)->frags + 1,
3340                                 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3341                 }
3342         }
3343
3344 ok:
3345         return ret;
3346
3347 normal:
3348         ret = GRO_NORMAL;
3349         goto pull;
3350 }
3351 EXPORT_SYMBOL(dev_gro_receive);
3352
3353 static inline gro_result_t
3354 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3355 {
             /*
              * Pre-classify every skb held on napi->gro_list against @skb,
              * then hand @skb to dev_gro_receive() and return its result.
              * A held skb is marked same_flow only if device, vlan_tci and
              * Ethernet header all match; its flush flag is cleared.
              */
3356         struct sk_buff *p;
3357
3358         for (p = napi->gro_list; p; p = p->next) {
3359                 unsigned long diffs;
3360
                     /* Any non-zero bit in diffs means "different flow". */
3361                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3362                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3363                 diffs |= compare_ether_header(skb_mac_header(p),
3364                                               skb_gro_mac_header(skb));
3365                 NAPI_GRO_CB(p)->same_flow = !diffs;
3366                 NAPI_GRO_CB(p)->flush = 0;
3367         }
3368
3369         return dev_gro_receive(napi, skb);
3370 }
3371
3372 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3373 {
             /*
              * Act on the GRO verdict @ret for @skb:
              *   GRO_NORMAL               - pass to netif_receive_skb(); a
              *                              non-zero result turns into GRO_DROP
              *   GRO_DROP/GRO_MERGED_FREE - free the skb
              *   GRO_HELD/GRO_MERGED      - nothing to do here
              * Returns the (possibly updated) verdict.
              */
3374         switch (ret) {
3375         case GRO_NORMAL:
3376                 if (netif_receive_skb(skb))
3377                         ret = GRO_DROP;
3378                 break;
3379
3380         case GRO_DROP:
3381         case GRO_MERGED_FREE:
3382                 kfree_skb(skb);
3383                 break;
3384
3385         case GRO_HELD:
3386         case GRO_MERGED:
3387                 break;
3388         }
3389
3390         return ret;
3391 }
3392 EXPORT_SYMBOL(napi_skb_finish);
3393
3394 void skb_gro_reset_offset(struct sk_buff *skb)
3395 {
             /*
              * Reset the GRO parsing state kept in @skb's control block:
              * data_offset becomes 0 and the frag0 shortcut is cleared.
              *
              * The shortcut is then re-armed (frag0 = address of the first
              * page fragment's data, frag0_len = its size) when
              * skb->mac_header equals skb->tail and that page is not in high
              * memory.
              *
              * NOTE(review): frags[0] is read without looking at nr_frags --
              * confirm callers guarantee a first fragment whenever
              * mac_header == tail.
              */
3396         NAPI_GRO_CB(skb)->data_offset = 0;
3397         NAPI_GRO_CB(skb)->frag0 = NULL;
3398         NAPI_GRO_CB(skb)->frag0_len = 0;
3399
3400         if (skb->mac_header == skb->tail &&
3401             !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3402                 NAPI_GRO_CB(skb)->frag0 =
3403                         page_address(skb_shinfo(skb)->frags[0].page) +
3404                         skb_shinfo(skb)->frags[0].page_offset;
3405                 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3406         }
3407 }
3408 EXPORT_SYMBOL(skb_gro_reset_offset);
3409
3410 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3411 {
             /*
              * GRO entry point for a complete skb: reset the GRO offsets,
              * classify and try to merge via __napi_gro_receive(), then act on
              * the verdict in napi_skb_finish().  Returns the final verdict.
              */
3412         skb_gro_reset_offset(skb);
3413
3414         return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3415 }
3416 EXPORT_SYMBOL(napi_gro_receive);
3417
3418 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3419 {
             /*
              * Recycle @skb as @napi's cached skb (napi->skb): drop all linear
              * data, bring the headroom back to NET_IP_ALIGN and clear
              * vlan_tci.
              *
              * NOTE(review): no other skb field (e.g. skb->dev) is reset here
              * -- confirm nothing on the paths leading to reuse modifies them.
              */
3420         __skb_pull(skb, skb_headlen(skb));
3421         skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3422         skb->vlan_tci = 0;
3423
3424         napi->skb = skb;
3425 }
3426
3427 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3428 {
3429         struct sk_buff *skb = napi->skb;
3430
3431         if (!skb) {
3432                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3433                 if (skb)
3434                         napi->skb = skb;
3435         }
3436         return skb;
3437 }
3438 EXPORT_SYMBOL(napi_get_frags);
3439
3440 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3441                                gro_result_t ret)
3442 {
3443         switch (ret) {
3444         case GRO_NORMAL:
3445         case GRO_HELD:
3446                 skb->protocol = eth_type_trans(skb, skb->dev);
3447
3448                 if (ret == GRO_HELD)
3449                         skb_gro_pull(skb, -ETH_HLEN);
3450                 else if (netif_receive_skb(skb))
3451                         ret = GRO_DROP;
3452                 break;
3453
3454         case GRO_DROP:
3455         case GRO_MERGED_FREE:
3456                 napi_reuse_skb(napi, skb);
3457                 break;
3458
3459         case GRO_MERGED:
3460                 break;
3461         }
3462
3463         return ret;
3464 }
3465 EXPORT_SYMBOL(napi_frags_finish);
3466
3467 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3468 {
3469         struct sk_buff *skb = napi->skb;
3470         struct ethhdr *eth;
3471         unsigned int hlen;
3472         unsigned int off;
3473
3474         napi->skb = NULL;
3475
3476         skb_reset_mac_header(skb);
3477         skb_gro_reset_offset(skb);
3478
3479         off = skb_gro_offset(skb);
3480         hlen = off + sizeof(*eth);
3481         eth = skb_gro_header_fast(skb, off);
3482         if (skb_gro_header_hard(skb, hlen)) {
3483                 eth = skb_gro_header_slow(skb, hlen, off);
3484                 if (unlikely(!eth)) {
3485                         napi_reuse_skb(napi, skb);
3486                         skb = NULL;
3487                         goto out;
3488                 }
3489         }
3490
3491         skb_gro_pull(skb, sizeof(*eth));
3492
3493         /*
3494          * This works because the only protocols we care about don't require
3495          * special handling.  We'll fix it up properly at the end.
3496          */
3497         skb->protocol = eth->h_proto;
3498
3499 out:
3500         return skb;
3501 }
3502 EXPORT_SYMBOL(napi_frags_skb);
3503
3504 gro_result_t napi_gro_frags(struct napi_struct *napi)
3505 {
3506         struct sk_buff *skb = napi_frags_skb(napi);
3507
3508         if (!skb)
3509                 return GRO_DROP;
3510
3511         return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3512 }
3513 EXPORT_SYMBOL(napi_gro_frags);
3514
/*
 * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
 * Note: called with local irqs disabled, but returns with local irqs
 * enabled on every path.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        struct softnet_data *remsd = sd->rps_ipi_list;
        struct softnet_data *next;

        if (!remsd) {
                local_irq_enable();
                return;
        }

        /* Detach the whole list while irqs are still off. */
        sd->rps_ipi_list = NULL;

        local_irq_enable();

        /* Send pending IPIs to kick RPS processing on remote cpus. */
        for (; remsd; remsd = next) {
                next = remsd->rps_ipi_next;

                if (cpu_online(remsd->cpu))
                        __smp_call_function_single(remsd->cpu,
                                                   &remsd->csd, 0);
        }
#else
        local_irq_enable();
#endif
}
3542
3543 static int process_backlog(struct napi_struct *napi, int quota)
3544 {
3545         int work = 0;
3546         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3547
3548 #ifdef CONFIG_RPS
3549         /* Check if we have pending ipi, its better to send them now,
3550          * not waiting net_rx_action() end.
3551          */
3552         if (sd->rps_ipi_list) {
3553                 local_irq_disable();
3554                 net_rps_action_and_irq_enable(sd);
3555         }
3556 #endif
3557         napi->weight = weight_p;
3558         local_irq_disable();
3559         while (work < quota) {
3560                 struct sk_buff *skb;
3561                 unsigned int qlen;
3562
3563                 while ((skb = __skb_dequeue(&sd->process_queue))) {
3564                         local_irq_enable();
3565                         __netif_receive_skb(skb);
3566                         local_irq_disable();
3567                         input_queue_head_incr(sd);
3568                         if (++work >= quota) {
3569                                 local_irq_enable();
3570                                 return work;
3571                         }
3572                 }
3573
3574                 rps_lock(sd);
3575                 qlen = skb_queue_len(&sd->input_pkt_queue);
3576                 if (qlen)
3577                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
3578                                                    &sd->process_queue);
3579
3580                 if (qlen < quota - work) {
3581                         /*
3582                          * Inline a custom version of __napi_complete().
3583                          * only current cpu owns and manipulates this napi,
3584                          * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3585                          * we can use a plain write instead of clear_bit(),
3586                          * and we dont need an smp_mb() memory barrier.
3587                          */
3588                         list_del(&napi->poll_list);
3589                         napi->state = 0;
3590
3591                         quota = work + qlen;
3592                 }
3593                 rps_unlock(sd);
3594         }
3595         local_irq_enable();
3596
3597         return work;
3598 }
3599
3600 /**
3601  * __napi_schedule - schedule for receive
3602  * @n: entry to schedule
3603  *
3604  * The entry's receive function will be scheduled to run
3605  */
3606 void __napi_schedule(struct napi_struct *n)
3607 {
3608         unsigned long flags;
3609
3610         local_irq_save(flags);
3611         ____napi_schedule(&__get_cpu_var(softnet_data), n);
3612         local_irq_restore(flags);
3613 }
3614 EXPORT_SYMBOL(__napi_schedule);
3615
3616 void __napi_complete(struct napi_struct *n)
3617 {
3618         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3619         BUG_ON(n->gro_list);
3620
3621         list_del(&n->poll_list);
3622         smp_mb__before_clear_bit();
3623         clear_bit(NAPI_STATE_SCHED, &n->state);
3624 }
3625 EXPORT_SYMBOL(__napi_complete);
3626
3627 void napi_complete(struct napi_struct *n)
3628 {
3629         unsigned long flags;
3630
3631         /*
3632          * don't let napi dequeue from the cpu poll list
3633          * just in case its running on a different cpu
3634          */
3635         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3636                 return;
3637
3638         napi_gro_flush(n);
3639         local_irq_save(flags);
3640         __napi_complete(n);
3641         local_irq_restore(flags);
3642 }
3643 EXPORT_SYMBOL(napi_complete);
3644
3645 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3646                     int (*poll)(struct napi_struct *, int), int weight)
3647 {
3648         INIT_LIST_HEAD(&napi->poll_list);
3649         napi->gro_count = 0;
3650         napi->gro_list = NULL;
3651         napi->skb = NULL;
3652         napi->poll = poll;
3653         napi->weight = weight;
3654         list_add(&napi->dev_list, &dev->napi_list);
3655         napi->dev = dev;
3656 #ifdef CONFIG_NETPOLL
3657         spin_lock_init(&napi->poll_lock);
3658         napi->poll_owner = -1;
3659 #endif
3660         set_bit(NAPI_STATE_SCHED, &napi->state);
3661 }
3662 EXPORT_SYMBOL(netif_napi_add);
3663
3664 void netif_napi_del(struct napi_struct *napi)
3665 {
3666         struct sk_buff *skb, *next;
3667
3668         list_del_init(&napi->dev_list);
3669         napi_free_frags(napi);
3670
3671         for (skb = napi->gro_list; skb; skb = next) {
3672                 next = skb->next;
3673                 skb->next = NULL;
3674                 kfree_skb(skb);
3675         }
3676
3677         napi->gro_list = NULL;
3678         napi->gro_count = 0;
3679 }
3680 EXPORT_SYMBOL(netif_napi_del);
3681
3682 static void net_rx_action(struct softirq_action *h)
3683 {
3684         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3685         unsigned long time_limit = jiffies + 2;
3686         int budget = netdev_budget;
3687         void *have;
3688
3689         local_irq_disable();
3690
3691         while (!list_empty(&sd->poll_list)) {
3692                 struct napi_struct *n;
3693                 int work, weight;
3694
3695                 /* If softirq window is exhuasted then punt.
3696                  * Allow this to run for 2 jiffies since which will allow
3697                  * an average latency of 1.5/HZ.
3698                  */
3699                 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3700                         goto softnet_break;
3701
3702                 local_irq_enable();
3703
3704                 /* Even though interrupts have been re-enabled, this
3705                  * access is safe because interrupts can only add new
3706                  * entries to the tail of this list, and only ->poll()
3707                  * calls can remove this head entry from the list.
3708                  */
3709                 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3710
3711                 have = netpoll_poll_lock(n);
3712
3713                 weight = n->weight;
3714
3715                 /* This NAPI_STATE_SCHED test is for avoiding a race
3716                  * with netpoll's poll_napi().  Only the entity which
3717                  * obtains the lock and sees NAPI_STATE_SCHED set will
3718                  * actually make the ->poll() call.  Therefore we avoid
3719                  * accidently calling ->poll() when NAPI is not scheduled.
3720                  */
3721                 work = 0;
3722                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3723                         work = n->poll(n, weight);
3724                         trace_napi_poll(n);
3725                 }
3726
3727                 WARN_ON_ONCE(work > weight);
3728
3729                 budget -= work;
3730
3731                 local_irq_disable();
3732
3733                 /* Drivers must not modify the NAPI state if they
3734                  * consume the entire weight.  In such cases this code
3735                  * still "owns" the NAPI instance and therefore can
3736                  * move the instance around on the list at-will.
3737                  */
3738                 if (unlikely(work == weight)) {
3739                         if (unlikely(napi_disable_pending(n))) {
3740                                 local_irq_enable();
3741                                 napi_complete(n);
3742                                 local_irq_disable();
3743                         } else
3744                                 list_move_tail(&n->poll_list, &sd->poll_list);
3745                 }
3746
3747                 netpoll_poll_unlock(have);
3748         }
3749 out:
3750         net_rps_action_and_irq_enable(sd);
3751
3752 #ifdef CONFIG_NET_DMA
3753         /*
3754          * There may not be any more sk_buffs coming right now, so push
3755          * any pending DMA copies to hardware
3756          */
3757         dma_issue_pending_all();
3758 #endif
3759
3760         return;
3761
3762 softnet_break:
3763         sd->time_squeeze++;
3764         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3765         goto out;
3766 }
3767
3768 static gifconf_func_t *gifconf_list[NPROTO];
3769
3770 /**
3771  *      register_gifconf        -       register a SIOCGIF handler
3772  *      @family: Address family
3773  *      @gifconf: Function handler
3774  *
3775  *      Register protocol dependent address dumping routines. The handler
3776  *      that is passed must not be freed or reused until it has been replaced
3777  *      by another handler.
3778  */
3779 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3780 {
3781         if (family >= NPROTO)
3782                 return -EINVAL;
3783         gifconf_list[family] = gifconf;
3784         return 0;
3785 }
3786 EXPORT_SYMBOL(register_gifconf);
3787
3788
3789 /*
3790  *      Map an interface index to its name (SIOCGIFNAME)
3791  */
3792
3793 /*
3794  *      We need this ioctl for efficient implementation of the
3795  *      if_indextoname() function required by the IPv6 API.  Without
3796  *      it, we would have to search all the interfaces to find a
3797  *      match.  --pb
3798  */
3799
3800 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3801 {
3802         struct net_device *dev;
3803         struct ifreq ifr;
3804
3805         /*
3806          *      Fetch the caller's info block.
3807          */
3808
3809         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3810                 return -EFAULT;
3811
3812         rcu_read_lock();
3813         dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3814         if (!dev) {
3815                 rcu_read_unlock();
3816                 return -ENODEV;
3817         }
3818
3819         strcpy(ifr.ifr_name, dev->name);
3820         rcu_read_unlock();
3821
3822         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3823                 return -EFAULT;
3824         return 0;
3825 }
3826
3827 /*
3828  *      Perform a SIOCGIFCONF call. This structure will change
3829  *      size eventually, and there is nothing I can do about it.
3830  *      Thus we will need a 'compatibility mode'.
3831  */
3832
3833 static int dev_ifconf(struct net *net, char __user *arg)
3834 {
3835         struct ifconf ifc;
3836         struct net_device *dev;
3837         char __user *pos;
3838         int len;
3839         int total;
3840         int i;
3841
3842         /*
3843          *      Fetch the caller's info block.
3844          */
3845
3846         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3847                 return -EFAULT;
3848
3849         pos = ifc.ifc_buf;
3850         len = ifc.ifc_len;
3851
3852         /*
3853          *      Loop over the interfaces, and write an info block for each.
3854          */
3855
3856         total = 0;
3857         for_each_netdev(net, dev) {
3858                 for (i = 0; i < NPROTO; i++) {
3859                         if (gifconf_list[i]) {
3860                                 int done;
3861                                 if (!pos)
3862                                         done = gifconf_list[i](dev, NULL, 0);
3863                                 else
3864                                         done = gifconf_list[i](dev, pos + total,
3865                                                                len - total);
3866                                 if (done < 0)
3867                                         return -EFAULT;
3868                                 total += done;
3869                         }
3870                 }
3871         }
3872
3873         /*
3874          *      All done.  Write the updated control block back to the caller.
3875          */
3876         ifc.ifc_len = total;
3877
3878         /*
3879          *      Both BSD and Solaris return 0 here, so we do too.
3880          */
3881         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3882 }
3883
3884 #ifdef CONFIG_PROC_FS
3885 /*
3886  *      This is invoked by the /proc filesystem handler to display a device
3887  *      in detail.
3888  */
3889 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3890         __acquires(RCU)
3891 {
3892         struct net *net = seq_file_net(seq);
3893         loff_t off;
3894         struct net_device *dev;
3895
3896         rcu_read_lock();
3897         if (!*pos)
3898                 return SEQ_START_TOKEN;
3899
3900         off = 1;
3901         for_each_netdev_rcu(net, dev)
3902                 if (off++ == *pos)
3903                         return dev;
3904
3905         return NULL;
3906 }
3907
3908 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3909 {
3910         struct net_device *dev = (v == SEQ_START_TOKEN) ?
3911                                   first_net_device(seq_file_net(seq)) :
3912                                   next_net_device((struct net_device *)v);
3913
3914         ++*pos;
3915         return rcu_dereference(dev);
3916 }
3917
3918 void dev_seq_stop(struct seq_file *seq, void *v)
3919         __releases(RCU)
3920 {
3921         rcu_read_unlock();
3922 }
3923
3924 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3925 {
3926         struct rtnl_link_stats64 temp;
3927         const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
3928
3929         seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
3930                    "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
3931                    dev->name, stats->rx_bytes, stats->rx_packets,
3932                    stats->rx_errors,
3933                    stats->rx_dropped + stats->rx_missed_errors,
3934                    stats->rx_fifo_errors,
3935                    stats->rx_length_errors + stats->rx_over_errors +
3936                     stats->rx_crc_errors + stats->rx_frame_errors,
3937                    stats->rx_compressed, stats->multicast,
3938                    stats->tx_bytes, stats->tx_packets,
3939                    stats->tx_errors, stats->tx_dropped,
3940                    stats->tx_fifo_errors, stats->collisions,
3941                    stats->tx_carrier_errors +
3942                     stats->tx_aborted_errors +
3943                     stats->tx_window_errors +
3944                     stats->tx_heartbeat_errors,
3945                    stats->tx_compressed);
3946 }
3947
3948 /*
3949  *      Called from the PROCfs module. This now uses the new arbitrary sized
3950  *      /proc/net interface to create /proc/net/dev
3951  */
3952 static int dev_seq_show(struct seq_file *seq, void *v)
3953 {
3954         if (v == SEQ_START_TOKEN)
3955                 seq_puts(seq, "Inter-|   Receive                            "
3956                               "                    |  Transmit\n"
3957                               " face |bytes    packets errs drop fifo frame "
3958                               "compressed multicast|bytes    packets errs "
3959                               "drop fifo colls carrier compressed\n");
3960         else
3961                 dev_seq_printf_stats(seq, v);
3962         return 0;
3963 }
3964
3965 static struct softnet_data *softnet_get_online(loff_t *pos)
3966 {
3967         struct softnet_data *sd = NULL;
3968
3969         while (*pos < nr_cpu_ids)
3970                 if (cpu_online(*pos)) {
3971                         sd = &per_cpu(softnet_data, *pos);
3972                         break;
3973                 } else
3974                         ++*pos;
3975         return sd;
3976 }
3977
3978 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
3979 {
3980         return softnet_get_online(pos);
3981 }
3982
3983 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3984 {
3985         ++*pos;
3986         return softnet_get_online(pos);
3987 }
3988
/* seq_file ->stop callback: the softnet iterator holds nothing to release. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
3992
3993 static int softnet_seq_show(struct seq_file *seq, void *v)
3994 {
3995         struct softnet_data *sd = v;
3996
3997         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
3998                    sd->processed, sd->dropped, sd->time_squeeze, 0,
3999                    0, 0, 0, 0, /* was fastroute */
4000                    sd->cpu_collision, sd->received_rps);
4001         return 0;
4002 }
4003
4004 static const struct seq_operations dev_seq_ops = {
4005         .start = dev_seq_start,
4006         .next  = dev_seq_next,
4007         .stop  = dev_seq_stop,
4008         .show  = dev_seq_show,
4009 };
4010
4011 static int dev_seq_open(struct inode *inode, struct file *file)
4012 {
4013         return seq_open_net(inode, file, &dev_seq_ops,
4014                             sizeof(struct seq_net_private));
4015 }
4016
4017 static const struct file_operations dev_seq_fops = {
4018         .owner   = THIS_MODULE,
4019         .open    = dev_seq_open,
4020         .read    = seq_read,
4021         .llseek  = seq_lseek,
4022         .release = seq_release_net,
4023 };
4024
4025 static const struct seq_operations softnet_seq_ops = {
4026         .start = softnet_seq_start,
4027         .next  = softnet_seq_next,
4028         .stop  = softnet_seq_stop,
4029         .show  = softnet_seq_show,
4030 };
4031
4032 static int softnet_seq_open(struct inode *inode, struct file *file)
4033 {
4034         return seq_open(file, &softnet_seq_ops);
4035 }
4036
4037 static const struct file_operations softnet_seq_fops = {
4038         .owner   = THIS_MODULE,
4039         .open    = softnet_seq_open,
4040         .read    = seq_read,
4041         .llseek  = seq_lseek,
4042         .release = seq_release,
4043 };
4044
4045 static void *ptype_get_idx(loff_t pos)
4046 {
4047         struct packet_type *pt = NULL;
4048         loff_t i = 0;
4049         int t;
4050
4051         list_for_each_entry_rcu(pt, &ptype_all, list) {
4052                 if (i == pos)
4053                         return pt;
4054                 ++i;
4055         }
4056
4057         for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4058                 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4059                         if (i == pos)
4060                                 return pt;
4061                         ++i;
4062                 }
4063         }
4064         return NULL;
4065 }
4066
4067 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4068         __acquires(RCU)
4069 {
4070         rcu_read_lock();
4071         return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4072 }
4073
4074 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4075 {
4076         struct packet_type *pt;
4077         struct list_head *nxt;
4078         int hash;
4079
4080         ++*pos;
4081         if (v == SEQ_START_TOKEN)
4082                 return ptype_get_idx(0);
4083
4084         pt = v;
4085         nxt = pt->list.next;
4086         if (pt->type == htons(ETH_P_ALL)) {
4087                 if (nxt != &ptype_all)
4088                         goto found;
4089                 hash = 0;
4090                 nxt = ptype_base[0].next;
4091         } else
4092                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4093
4094         while (nxt == &ptype_base[hash]) {
4095                 if (++hash >= PTYPE_HASH_SIZE)
4096                         return NULL;
4097                 nxt = ptype_base[hash].next;
4098         }
4099 found:
4100         return list_entry(nxt, struct packet_type, list);
4101 }
4102
4103 static void ptype_seq_stop(struct seq_file *seq, void *v)
4104         __releases(RCU)
4105 {
4106         rcu_read_unlock();
4107 }
4108
4109 static int ptype_seq_show(struct seq_file *seq, void *v)
4110 {
4111         struct packet_type *pt = v;
4112
4113         if (v == SEQ_START_TOKEN)
4114                 seq_puts(seq, "Type Device      Function\n");
4115         else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4116                 if (pt->type == htons(ETH_P_ALL))
4117                         seq_puts(seq, "ALL ");
4118                 else
4119                         seq_printf(seq, "%04x", ntohs(pt->type));
4120
4121                 seq_printf(seq, " %-8s %pF\n",
4122                            pt->dev ? pt->dev->name : "", pt->func);
4123         }
4124
4125         return 0;
4126 }
4127
4128 static const struct seq_operations ptype_seq_ops = {
4129         .start = ptype_seq_start,
4130         .next  = ptype_seq_next,
4131         .stop  = ptype_seq_stop,
4132         .show  = ptype_seq_show,
4133 };
4134
4135 static int ptype_seq_open(struct inode *inode, struct file *file)
4136 {
4137         return seq_open_net(inode, file, &ptype_seq_ops,
4138                         sizeof(struct seq_net_private));
4139 }
4140
4141 static const struct file_operations ptype_seq_fops = {
4142         .owner   = THIS_MODULE,
4143         .open    = ptype_seq_open,
4144         .read    = seq_read,
4145         .llseek  = seq_lseek,
4146         .release = seq_release_net,
4147 };
4148
4149
4150 static int __net_init dev_proc_net_init(struct net *net)
4151 {
4152         int rc = -ENOMEM;
4153
4154         if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4155                 goto out;
4156         if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4157                 goto out_dev;
4158         if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4159                 goto out_softnet;
4160
4161         if (wext_proc_init(net))
4162                 goto out_ptype;
4163         rc = 0;
4164 out:
4165         return rc;
4166 out_ptype:
4167         proc_net_remove(net, "ptype");
4168 out_softnet:
4169         proc_net_remove(net, "softnet_stat");
4170 out_dev:
4171         proc_net_remove(net, "dev");
4172         goto out;
4173 }
4174
4175 static void __net_exit dev_proc_net_exit(struct net *net)
4176 {
4177         wext_proc_exit(net);
4178
4179         proc_net_remove(net, "ptype");
4180         proc_net_remove(net, "softnet_stat");
4181         proc_net_remove(net, "dev");
4182 }
4183
4184 static struct pernet_operations __net_initdata dev_proc_ops = {
4185         .init = dev_proc_net_init,
4186         .exit = dev_proc_net_exit,
4187 };
4188
4189 static int __init dev_proc_init(void)
4190 {
4191         return register_pernet_subsys(&dev_proc_ops);
4192 }
4193 #else
4194 #define dev_proc_init() 0
4195 #endif  /* CONFIG_PROC_FS */
4196
4197
4198 /**
4199  *      netdev_set_master       -       set up master/slave pair
4200  *      @slave: slave device
4201  *      @master: new master device
4202  *
4203  *      Changes the master device of the slave. Pass %NULL to break the
4204  *      bonding. The caller must hold the RTNL semaphore. On a failure
4205  *      a negative errno code is returned. On success the reference counts
4206  *      are adjusted, %RTM_NEWLINK is sent to the routing socket and the
4207  *      function returns zero.
4208  */
4209 int netdev_set_master(struct net_device *slave, struct net_device *master)
4210 {
4211         struct net_device *old = slave->master;
4212
4213         ASSERT_RTNL();
4214
4215         if (master) {
4216                 if (old)
4217                         return -EBUSY;
4218                 dev_hold(master);
4219         }
4220
4221         slave->master = master;
4222
4223         if (old) {
4224                 synchronize_net();
4225                 dev_put(old);
4226         }
4227         if (master)
4228                 slave->flags |= IFF_SLAVE;
4229         else
4230                 slave->flags &= ~IFF_SLAVE;
4231
4232         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4233         return 0;
4234 }
4235 EXPORT_SYMBOL(netdev_set_master);
4236
4237 static void dev_change_rx_flags(struct net_device *dev, int flags)
4238 {
4239         const struct net_device_ops *ops = dev->netdev_ops;
4240
4241         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4242                 ops->ndo_change_rx_flags(dev, flags);
4243 }
4244
4245 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4246 {
4247         unsigned short old_flags = dev->flags;
4248         uid_t uid;
4249         gid_t gid;
4250
4251         ASSERT_RTNL();
4252
4253         dev->flags |= IFF_PROMISC;
4254         dev->promiscuity += inc;
4255         if (dev->promiscuity == 0) {
4256                 /*
4257                  * Avoid overflow.
4258                  * If inc causes overflow, untouch promisc and return error.
4259                  */
4260                 if (inc < 0)
4261                         dev->flags &= ~IFF_PROMISC;
4262                 else {
4263                         dev->promiscuity -= inc;
4264                         printk(KERN_WARNING "%s: promiscuity touches roof, "
4265                                 "set promiscuity failed, promiscuity feature "
4266                                 "of device might be broken.\n", dev->name);
4267                         return -EOVERFLOW;
4268                 }
4269         }
4270         if (dev->flags != old_flags) {
4271                 printk(KERN_INFO "device %s %s promiscuous mode\n",
4272                        dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4273                                                                "left");
4274                 if (audit_enabled) {
4275                         current_uid_gid(&uid, &gid);
4276                         audit_log(current->audit_context, GFP_ATOMIC,
4277                                 AUDIT_ANOM_PROMISCUOUS,
4278                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4279                                 dev->name, (dev->flags & IFF_PROMISC),
4280                                 (old_flags & IFF_PROMISC),
4281                                 audit_get_loginuid(current),
4282                                 uid, gid,
4283                                 audit_get_sessionid(current));
4284                 }
4285
4286                 dev_change_rx_flags(dev, IFF_PROMISC);
4287         }
4288         return 0;
4289 }
4290
4291 /**
4292  *      dev_set_promiscuity     - update promiscuity count on a device
4293  *      @dev: device
4294  *      @inc: modifier
4295  *
4296  *      Add or remove promiscuity from a device. While the count in the device
4297  *      remains above zero the interface remains promiscuous. Once it hits zero
4298  *      the device reverts back to normal filtering operation. A negative inc
4299  *      value is used to drop promiscuity on the device.
4300  *      Return 0 if successful or a negative errno code on error.
4301  */
4302 int dev_set_promiscuity(struct net_device *dev, int inc)
4303 {
4304         unsigned short old_flags = dev->flags;
4305         int err;
4306
4307         err = __dev_set_promiscuity(dev, inc);
4308         if (err < 0)
4309                 return err;
4310         if (dev->flags != old_flags)
4311                 dev_set_rx_mode(dev);
4312         return err;
4313 }
4314 EXPORT_SYMBOL(dev_set_promiscuity);
4315
4316 /**
4317  *      dev_set_allmulti        - update allmulti count on a device
4318  *      @dev: device
4319  *      @inc: modifier
4320  *
4321  *      Add or remove reception of all multicast frames to a device. While the
4322  *      count in the device remains above zero the interface remains listening
4323  *      to all interfaces. Once it hits zero the device reverts back to normal
4324  *      filtering operation. A negative @inc value is used to drop the counter
4325  *      when releasing a resource needing all multicasts.
4326  *      Return 0 if successful or a negative errno code on error.
4327  */
4328
4329 int dev_set_allmulti(struct net_device *dev, int inc)
4330 {
4331         unsigned short old_flags = dev->flags;
4332
4333         ASSERT_RTNL();
4334
4335         dev->flags |= IFF_ALLMULTI;
4336         dev->allmulti += inc;
4337         if (dev->allmulti == 0) {
4338                 /*
4339                  * Avoid overflow.
4340                  * If inc causes overflow, untouch allmulti and return error.
4341                  */
4342                 if (inc < 0)
4343                         dev->flags &= ~IFF_ALLMULTI;
4344                 else {
4345                         dev->allmulti -= inc;
4346                         printk(KERN_WARNING "%s: allmulti touches roof, "
4347                                 "set allmulti failed, allmulti feature of "
4348                                 "device might be broken.\n", dev->name);
4349                         return -EOVERFLOW;
4350                 }
4351         }
4352         if (dev->flags ^ old_flags) {
4353                 dev_change_rx_flags(dev, IFF_ALLMULTI);
4354                 dev_set_rx_mode(dev);
4355         }
4356         return 0;
4357 }
4358 EXPORT_SYMBOL(dev_set_allmulti);
4359
4360 /*
4361  *      Upload unicast and multicast address lists to device and
4362  *      configure RX filtering. When the device doesn't support unicast
4363  *      filtering it is put in promiscuous mode while unicast addresses
4364  *      are present.
4365  */
4366 void __dev_set_rx_mode(struct net_device *dev)
4367 {
4368         const struct net_device_ops *ops = dev->netdev_ops;
4369
4370         /* dev_open will call this function so the list will stay sane. */
4371         if (!(dev->flags&IFF_UP))
4372                 return;
4373
4374         if (!netif_device_present(dev))
4375                 return;
4376
4377         if (ops->ndo_set_rx_mode)
4378                 ops->ndo_set_rx_mode(dev);
4379         else {
4380                 /* Unicast addresses changes may only happen under the rtnl,
4381                  * therefore calling __dev_set_promiscuity here is safe.
4382                  */
4383                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4384                         __dev_set_promiscuity(dev, 1);
4385                         dev->uc_promisc = 1;
4386                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4387                         __dev_set_promiscuity(dev, -1);
4388                         dev->uc_promisc = 0;
4389                 }
4390
4391                 if (ops->ndo_set_multicast_list)
4392                         ops->ndo_set_multicast_list(dev);
4393         }
4394 }
4395
/*
 *      Locked wrapper: run __dev_set_rx_mode() with the device address
 *      list lock held (bottom halves disabled).
 */
void dev_set_rx_mode(struct net_device *dev)
{
        netif_addr_lock_bh(dev);

        __dev_set_rx_mode(dev);

        netif_addr_unlock_bh(dev);
}
4402
4403 /**
4404  *      dev_get_flags - get flags reported to userspace
4405  *      @dev: device
4406  *
4407  *      Get the combination of flag bits exported through APIs to userspace.
4408  */
4409 unsigned dev_get_flags(const struct net_device *dev)
4410 {
4411         unsigned flags;
4412
4413         flags = (dev->flags & ~(IFF_PROMISC |
4414                                 IFF_ALLMULTI |
4415                                 IFF_RUNNING |
4416                                 IFF_LOWER_UP |
4417                                 IFF_DORMANT)) |
4418                 (dev->gflags & (IFF_PROMISC |
4419                                 IFF_ALLMULTI));
4420
4421         if (netif_running(dev)) {
4422                 if (netif_oper_up(dev))
4423                         flags |= IFF_RUNNING;
4424                 if (netif_carrier_ok(dev))
4425                         flags |= IFF_LOWER_UP;
4426                 if (netif_dormant(dev))
4427                         flags |= IFF_DORMANT;
4428         }
4429
4430         return flags;
4431 }
4432 EXPORT_SYMBOL(dev_get_flags);
4433
4434 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4435 {
4436         int old_flags = dev->flags;
4437         int ret;
4438
4439         ASSERT_RTNL();
4440
4441         /*
4442          *      Set the flags on our device.
4443          */
4444
4445         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4446                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4447                                IFF_AUTOMEDIA)) |
4448                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4449                                     IFF_ALLMULTI));
4450
4451         /*
4452          *      Load in the correct multicast list now the flags have changed.
4453          */
4454
4455         if ((old_flags ^ flags) & IFF_MULTICAST)
4456                 dev_change_rx_flags(dev, IFF_MULTICAST);
4457
4458         dev_set_rx_mode(dev);
4459
4460         /*
4461          *      Have we downed the interface. We handle IFF_UP ourselves
4462          *      according to user attempts to set it, rather than blindly
4463          *      setting it.
4464          */
4465
4466         ret = 0;
4467         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4468                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4469
4470                 if (!ret)
4471                         dev_set_rx_mode(dev);
4472         }
4473
4474         if ((flags ^ dev->gflags) & IFF_PROMISC) {
4475                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4476
4477                 dev->gflags ^= IFF_PROMISC;
4478                 dev_set_promiscuity(dev, inc);
4479         }
4480
4481         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4482            is important. Some (broken) drivers set IFF_PROMISC, when
4483            IFF_ALLMULTI is requested not asking us and not reporting.
4484          */
4485         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4486                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4487
4488                 dev->gflags ^= IFF_ALLMULTI;
4489                 dev_set_allmulti(dev, inc);
4490         }
4491
4492         return ret;
4493 }
4494
4495 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4496 {
4497         unsigned int changes = dev->flags ^ old_flags;
4498
4499         if (changes & IFF_UP) {
4500                 if (dev->flags & IFF_UP)
4501                         call_netdevice_notifiers(NETDEV_UP, dev);
4502                 else
4503                         call_netdevice_notifiers(NETDEV_DOWN, dev);
4504         }
4505
4506         if (dev->flags & IFF_UP &&
4507             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4508                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4509 }
4510
4511 /**
4512  *      dev_change_flags - change device settings
4513  *      @dev: device
4514  *      @flags: device state flags
4515  *
4516  *      Change settings on device based state flags. The flags are
4517  *      in the userspace exported format.
4518  */
4519 int dev_change_flags(struct net_device *dev, unsigned flags)
4520 {
4521         int ret, changes;
4522         int old_flags = dev->flags;
4523
4524         ret = __dev_change_flags(dev, flags);
4525         if (ret < 0)
4526                 return ret;
4527
4528         changes = old_flags ^ dev->flags;
4529         if (changes)
4530                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4531
4532         __dev_notify_flags(dev, old_flags);
4533         return ret;
4534 }
4535 EXPORT_SYMBOL(dev_change_flags);
4536
4537 /**
4538  *      dev_set_mtu - Change maximum transfer unit
4539  *      @dev: device
4540  *      @new_mtu: new transfer unit
4541  *
4542  *      Change the maximum transfer size of the network device.
4543  */
4544 int dev_set_mtu(struct net_device *dev, int new_mtu)
4545 {
4546         const struct net_device_ops *ops = dev->netdev_ops;
4547         int err;
4548
4549         if (new_mtu == dev->mtu)
4550                 return 0;
4551
4552         /*      MTU must be positive.    */
4553         if (new_mtu < 0)
4554                 return -EINVAL;
4555
4556         if (!netif_device_present(dev))
4557                 return -ENODEV;
4558
4559         err = 0;
4560         if (ops->ndo_change_mtu)
4561                 err = ops->ndo_change_mtu(dev, new_mtu);
4562         else
4563                 dev->mtu = new_mtu;
4564
4565         if (!err && dev->flags & IFF_UP)
4566                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4567         return err;
4568 }
4569 EXPORT_SYMBOL(dev_set_mtu);
4570
4571 /**
4572  *      dev_set_mac_address - Change Media Access Control Address
4573  *      @dev: device
4574  *      @sa: new address
4575  *
4576  *      Change the hardware (MAC) address of the device
4577  */
4578 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4579 {
4580         const struct net_device_ops *ops = dev->netdev_ops;
4581         int err;
4582
4583         if (!ops->ndo_set_mac_address)
4584                 return -EOPNOTSUPP;
4585         if (sa->sa_family != dev->type)
4586                 return -EINVAL;
4587         if (!netif_device_present(dev))
4588                 return -ENODEV;
4589         err = ops->ndo_set_mac_address(dev, sa);
4590         if (!err)
4591                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4592         return err;
4593 }
4594 EXPORT_SYMBOL(dev_set_mac_address);
4595
4596 /*
4597  *      Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4598  */
4599 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4600 {
4601         int err;
4602         struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4603
4604         if (!dev)
4605                 return -ENODEV;
4606
4607         switch (cmd) {
4608         case SIOCGIFFLAGS:      /* Get interface flags */
4609                 ifr->ifr_flags = (short) dev_get_flags(dev);
4610                 return 0;
4611
4612         case SIOCGIFMETRIC:     /* Get the metric on the interface
4613                                    (currently unused) */
4614                 ifr->ifr_metric = 0;
4615                 return 0;
4616
4617         case SIOCGIFMTU:        /* Get the MTU of a device */
4618                 ifr->ifr_mtu = dev->mtu;
4619                 return 0;
4620
4621         case SIOCGIFHWADDR:
4622                 if (!dev->addr_len)
4623                         memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4624                 else
4625                         memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4626                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4627                 ifr->ifr_hwaddr.sa_family = dev->type;
4628                 return 0;
4629
4630         case SIOCGIFSLAVE:
4631                 err = -EINVAL;
4632                 break;
4633
4634         case SIOCGIFMAP:
4635                 ifr->ifr_map.mem_start = dev->mem_start;
4636                 ifr->ifr_map.mem_end   = dev->mem_end;
4637                 ifr->ifr_map.base_addr = dev->base_addr;
4638                 ifr->ifr_map.irq       = dev->irq;
4639                 ifr->ifr_map.dma       = dev->dma;
4640                 ifr->ifr_map.port      = dev->if_port;
4641                 return 0;
4642
4643         case SIOCGIFINDEX:
4644                 ifr->ifr_ifindex = dev->ifindex;
4645                 return 0;
4646
4647         case SIOCGIFTXQLEN:
4648                 ifr->ifr_qlen = dev->tx_queue_len;
4649                 return 0;
4650
4651         default:
4652                 /* dev_ioctl() should ensure this case
4653                  * is never reached
4654                  */
4655                 WARN_ON(1);
4656                 err = -EINVAL;
4657                 break;
4658
4659         }
4660         return err;
4661 }
4662
4663 /*
4664  *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
4665  */
4666 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4667 {
4668         int err;
4669         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4670         const struct net_device_ops *ops;
4671
4672         if (!dev)
4673                 return -ENODEV;
4674
4675         ops = dev->netdev_ops;
4676
4677         switch (cmd) {
4678         case SIOCSIFFLAGS:      /* Set interface flags */
4679                 return dev_change_flags(dev, ifr->ifr_flags);
4680
4681         case SIOCSIFMETRIC:     /* Set the metric on the interface
4682                                    (currently unused) */
4683                 return -EOPNOTSUPP;
4684
4685         case SIOCSIFMTU:        /* Set the MTU of a device */
4686                 return dev_set_mtu(dev, ifr->ifr_mtu);
4687
4688         case SIOCSIFHWADDR:
4689                 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4690
4691         case SIOCSIFHWBROADCAST:
4692                 if (ifr->ifr_hwaddr.sa_family != dev->type)
4693                         return -EINVAL;
4694                 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4695                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4696                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4697                 return 0;
4698
4699         case SIOCSIFMAP:
4700                 if (ops->ndo_set_config) {
4701                         if (!netif_device_present(dev))
4702                                 return -ENODEV;
4703                         return ops->ndo_set_config(dev, &ifr->ifr_map);
4704                 }
4705                 return -EOPNOTSUPP;
4706
4707         case SIOCADDMULTI:
4708                 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4709                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4710                         return -EINVAL;
4711                 if (!netif_device_present(dev))
4712                         return -ENODEV;
4713                 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4714
4715         case SIOCDELMULTI:
4716                 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4717                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4718                         return -EINVAL;
4719                 if (!netif_device_present(dev))
4720                         return -ENODEV;
4721                 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4722
4723         case SIOCSIFTXQLEN:
4724                 if (ifr->ifr_qlen < 0)
4725                         return -EINVAL;
4726                 dev->tx_queue_len = ifr->ifr_qlen;
4727                 return 0;
4728
4729         case SIOCSIFNAME:
4730                 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4731                 return dev_change_name(dev, ifr->ifr_newname);
4732
4733         /*
4734          *      Unknown or private ioctl
4735          */
4736         default:
4737                 if ((cmd >= SIOCDEVPRIVATE &&
4738                     cmd <= SIOCDEVPRIVATE + 15) ||
4739                     cmd == SIOCBONDENSLAVE ||
4740                     cmd == SIOCBONDRELEASE ||
4741                     cmd == SIOCBONDSETHWADDR ||
4742                     cmd == SIOCBONDSLAVEINFOQUERY ||
4743                     cmd == SIOCBONDINFOQUERY ||
4744                     cmd == SIOCBONDCHANGEACTIVE ||
4745                     cmd == SIOCGMIIPHY ||
4746                     cmd == SIOCGMIIREG ||
4747                     cmd == SIOCSMIIREG ||
4748                     cmd == SIOCBRADDIF ||
4749                     cmd == SIOCBRDELIF ||
4750                     cmd == SIOCSHWTSTAMP ||
4751                     cmd == SIOCWANDEV) {
4752                         err = -EOPNOTSUPP;
4753                         if (ops->ndo_do_ioctl) {
4754                                 if (netif_device_present(dev))
4755                                         err = ops->ndo_do_ioctl(dev, ifr, cmd);
4756                                 else
4757                                         err = -ENODEV;
4758                         }
4759                 } else
4760                         err = -EINVAL;
4761
4762         }
4763         return err;
4764 }
4765
4766 /*
4767  *      This function handles all "interface"-type I/O control requests. The actual
4768  *      'doing' part of this is dev_ifsioc above.
4769  */
4770
4771 /**
4772  *      dev_ioctl       -       network device ioctl
4773  *      @net: the applicable net namespace
4774  *      @cmd: command to issue
4775  *      @arg: pointer to a struct ifreq in user space
4776  *
4777  *      Issue ioctl functions to devices. This is normally called by the
4778  *      user space syscall interfaces but can sometimes be useful for
4779  *      other purposes. The return value is the return from the syscall if
4780  *      positive or a negative errno code on error.
4781  */
4782
4783 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4784 {
4785         struct ifreq ifr;
4786         int ret;
4787         char *colon;
4788
4789         /* One special case: SIOCGIFCONF takes ifconf argument
4790            and requires shared lock, because it sleeps writing
4791            to user space.
4792          */
4793
4794         if (cmd == SIOCGIFCONF) {
4795                 rtnl_lock();
4796                 ret = dev_ifconf(net, (char __user *) arg);
4797                 rtnl_unlock();
4798                 return ret;
4799         }
4800         if (cmd == SIOCGIFNAME)
4801                 return dev_ifname(net, (struct ifreq __user *)arg);
4802
4803         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4804                 return -EFAULT;
4805
4806         ifr.ifr_name[IFNAMSIZ-1] = 0;
4807
4808         colon = strchr(ifr.ifr_name, ':');
4809         if (colon)
4810                 *colon = 0;
4811
4812         /*
4813          *      See which interface the caller is talking about.
4814          */
4815
4816         switch (cmd) {
4817         /*
4818          *      These ioctl calls:
4819          *      - can be done by all.
4820          *      - atomic and do not require locking.
4821          *      - return a value
4822          */
4823         case SIOCGIFFLAGS:
4824         case SIOCGIFMETRIC:
4825         case SIOCGIFMTU:
4826         case SIOCGIFHWADDR:
4827         case SIOCGIFSLAVE:
4828         case SIOCGIFMAP:
4829         case SIOCGIFINDEX:
4830         case SIOCGIFTXQLEN:
4831                 dev_load(net, ifr.ifr_name);
4832                 rcu_read_lock();
4833                 ret = dev_ifsioc_locked(net, &ifr, cmd);
4834                 rcu_read_unlock();
4835                 if (!ret) {
4836                         if (colon)
4837                                 *colon = ':';
4838                         if (copy_to_user(arg, &ifr,
4839                                          sizeof(struct ifreq)))
4840                                 ret = -EFAULT;
4841                 }
4842                 return ret;
4843
4844         case SIOCETHTOOL:
4845                 dev_load(net, ifr.ifr_name);
4846                 rtnl_lock();
4847                 ret = dev_ethtool(net, &ifr);
4848                 rtnl_unlock();
4849                 if (!ret) {
4850                         if (colon)
4851                                 *colon = ':';
4852                         if (copy_to_user(arg, &ifr,
4853                                          sizeof(struct ifreq)))
4854                                 ret = -EFAULT;
4855                 }
4856                 return ret;
4857
4858         /*
4859          *      These ioctl calls:
4860          *      - require superuser power.
4861          *      - require strict serialization.
4862          *      - return a value
4863          */
4864         case SIOCGMIIPHY:
4865         case SIOCGMIIREG:
4866         case SIOCSIFNAME:
4867                 if (!capable(CAP_NET_ADMIN))
4868                         return -EPERM;
4869                 dev_load(net, ifr.ifr_name);
4870                 rtnl_lock();
4871                 ret = dev_ifsioc(net, &ifr, cmd);
4872                 rtnl_unlock();
4873                 if (!ret) {
4874                         if (colon)
4875                                 *colon = ':';
4876                         if (copy_to_user(arg, &ifr,
4877                                          sizeof(struct ifreq)))
4878                                 ret = -EFAULT;
4879                 }
4880                 return ret;
4881
4882         /*
4883          *      These ioctl calls:
4884          *      - require superuser power.
4885          *      - require strict serialization.
4886          *      - do not return a value
4887          */
4888         case SIOCSIFFLAGS:
4889         case SIOCSIFMETRIC:
4890         case SIOCSIFMTU:
4891         case SIOCSIFMAP:
4892         case SIOCSIFHWADDR:
4893         case SIOCSIFSLAVE:
4894         case SIOCADDMULTI:
4895         case SIOCDELMULTI:
4896         case SIOCSIFHWBROADCAST:
4897         case SIOCSIFTXQLEN:
4898         case SIOCSMIIREG:
4899         case SIOCBONDENSLAVE:
4900         case SIOCBONDRELEASE:
4901         case SIOCBONDSETHWADDR:
4902         case SIOCBONDCHANGEACTIVE:
4903         case SIOCBRADDIF:
4904         case SIOCBRDELIF:
4905         case SIOCSHWTSTAMP:
4906                 if (!capable(CAP_NET_ADMIN))
4907                         return -EPERM;
4908                 /* fall through */
4909         case SIOCBONDSLAVEINFOQUERY:
4910         case SIOCBONDINFOQUERY:
4911                 dev_load(net, ifr.ifr_name);
4912                 rtnl_lock();
4913                 ret = dev_ifsioc(net, &ifr, cmd);
4914                 rtnl_unlock();
4915                 return ret;
4916
4917         case SIOCGIFMEM:
4918                 /* Get the per device memory space. We can add this but
4919                  * currently do not support it */
4920         case SIOCSIFMEM:
4921                 /* Set the per device memory buffer space.
4922                  * Not applicable in our case */
4923         case SIOCSIFLINK:
4924                 return -EINVAL;
4925
4926         /*
4927          *      Unknown or private ioctl.
4928          */
4929         default:
4930                 if (cmd == SIOCWANDEV ||
4931                     (cmd >= SIOCDEVPRIVATE &&
4932                      cmd <= SIOCDEVPRIVATE + 15)) {
4933                         dev_load(net, ifr.ifr_name);
4934                         rtnl_lock();
4935                         ret = dev_ifsioc(net, &ifr, cmd);
4936                         rtnl_unlock();
4937                         if (!ret && copy_to_user(arg, &ifr,
4938                                                  sizeof(struct ifreq)))
4939                                 ret = -EFAULT;
4940                         return ret;
4941                 }
4942                 /* Take care of Wireless Extensions */
4943                 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4944                         return wext_handle_ioctl(net, &ifr, cmd, arg);
4945                 return -EINVAL;
4946         }
4947 }
4948
4949
/**
 *      dev_new_index   -       allocate an ifindex
 *      @net: the applicable net namespace
 *
 *      Advances a function-static counter, restarting at 1 whenever it
 *      would become non-positive, until it reaches a value for which
 *      __dev_get_by_index() finds no device in @net, and returns that
 *      value.  The caller must hold the rtnl semaphore or the
 *      dev_base_lock to be sure it remains unique.
 */
static int dev_new_index(struct net *net)
{
        static int ifindex;

        do {
                if (++ifindex <= 0)
                        ifindex = 1;
        } while (__dev_get_by_index(net, ifindex));

        return ifindex;
}
4968
4969 /* Delayed registration/unregisteration */
4970 static LIST_HEAD(net_todo_list);
4971
4972 static void net_set_todo(struct net_device *dev)
4973 {
4974         list_add_tail(&dev->todo_list, &net_todo_list);
4975 }
4976
4977 static void rollback_registered_many(struct list_head *head)
4978 {
4979         struct net_device *dev, *tmp;
4980
4981         BUG_ON(dev_boot_phase);
4982         ASSERT_RTNL();
4983
4984         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
4985                 /* Some devices call without registering
4986                  * for initialization unwind. Remove those
4987                  * devices and proceed with the remaining.
4988                  */
4989                 if (dev->reg_state == NETREG_UNINITIALIZED) {
4990                         pr_debug("unregister_netdevice: device %s/%p never "
4991                                  "was registered\n", dev->name, dev);
4992
4993                         WARN_ON(1);
4994                         list_del(&dev->unreg_list);
4995                         continue;
4996                 }
4997
4998                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
4999         }
5000
5001         /* If device is running, close it first. */
5002         dev_close_many(head);
5003
5004         list_for_each_entry(dev, head, unreg_list) {
5005                 /* And unlink it from device chain. */
5006                 unlist_netdevice(dev);
5007
5008                 dev->reg_state = NETREG_UNREGISTERING;
5009         }
5010
5011         synchronize_net();
5012
5013         list_for_each_entry(dev, head, unreg_list) {
5014                 /* Shutdown queueing discipline. */
5015                 dev_shutdown(dev);
5016
5017
5018                 /* Notify protocols, that we are about to destroy
5019                    this device. They should clean all the things.
5020                 */
5021                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5022
5023                 if (!dev->rtnl_link_ops ||
5024                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5025                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5026
5027                 /*
5028                  *      Flush the unicast and multicast chains
5029                  */
5030                 dev_uc_flush(dev);
5031                 dev_mc_flush(dev);
5032
5033                 if (dev->netdev_ops->ndo_uninit)
5034                         dev->netdev_ops->ndo_uninit(dev);
5035
5036                 /* Notifier chain MUST detach us from master device. */
5037                 WARN_ON(dev->master);
5038
5039                 /* Remove entries from kobject tree */
5040                 netdev_unregister_kobject(dev);
5041         }
5042
5043         /* Process any work delayed until the end of the batch */
5044         dev = list_first_entry(head, struct net_device, unreg_list);
5045         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5046
5047         rcu_barrier();
5048
5049         list_for_each_entry(dev, head, unreg_list)
5050                 dev_put(dev);
5051 }
5052
5053 static void rollback_registered(struct net_device *dev)
5054 {
5055         LIST_HEAD(single);
5056
5057         list_add(&dev->unreg_list, &single);
5058         rollback_registered_many(&single);
5059 }
5060
5061 unsigned long netdev_fix_features(unsigned long features, const char *name)
5062 {
5063         /* Fix illegal SG+CSUM combinations. */
5064         if ((features & NETIF_F_SG) &&
5065             !(features & NETIF_F_ALL_CSUM)) {
5066                 if (name)
5067                         printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
5068                                "checksum feature.\n", name);
5069                 features &= ~NETIF_F_SG;
5070         }
5071
5072         /* TSO requires that SG is present as well. */
5073         if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
5074                 if (name)
5075                         printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
5076                                "SG feature.\n", name);
5077                 features &= ~NETIF_F_TSO;
5078         }
5079
5080         if (features & NETIF_F_UFO) {
5081                 /* maybe split UFO into V4 and V6? */
5082                 if (!((features & NETIF_F_GEN_CSUM) ||
5083                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5084                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5085                         if (name)
5086                                 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
5087                                        "since no checksum offload features.\n",
5088                                        name);
5089                         features &= ~NETIF_F_UFO;
5090                 }
5091
5092                 if (!(features & NETIF_F_SG)) {
5093                         if (name)
5094                                 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
5095                                        "since no NETIF_F_SG feature.\n", name);
5096                         features &= ~NETIF_F_UFO;
5097                 }
5098         }
5099
5100         return features;
5101 }
5102 EXPORT_SYMBOL(netdev_fix_features);
5103
5104 /**
5105  *      netif_stacked_transfer_operstate -      transfer operstate
5106  *      @rootdev: the root or lower level device to transfer state from
5107  *      @dev: the device to transfer operstate to
5108  *
5109  *      Transfer operational state from root to device. This is normally
5110  *      called when a stacking relationship exists between the root
5111  *      device and the device(a leaf device).
5112  */
5113 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5114                                         struct net_device *dev)
5115 {
5116         if (rootdev->operstate == IF_OPER_DORMANT)
5117                 netif_dormant_on(dev);
5118         else
5119                 netif_dormant_off(dev);
5120
5121         if (netif_carrier_ok(rootdev)) {
5122                 if (!netif_carrier_ok(dev))
5123                         netif_carrier_on(dev);
5124         } else {
5125                 if (netif_carrier_ok(dev))
5126                         netif_carrier_off(dev);
5127         }
5128 }
5129 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5130
#ifdef CONFIG_RPS
/*
 *      Allocate the zeroed array of dev->num_rx_queues receive queues,
 *      store it in dev->_rx and point each queue back at @dev.
 *      Returns 0 or -ENOMEM.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
        unsigned int count = dev->num_rx_queues;
        struct netdev_rx_queue *queues;
        unsigned int idx;

        BUG_ON(count < 1);

        queues = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
        if (queues == NULL) {
                pr_err("netdev: Unable to allocate %u rx queues.\n", count);
                return -ENOMEM;
        }
        dev->_rx = queues;

        for (idx = 0; idx < count; idx++)
                queues[idx].dev = dev;

        return 0;
}
#endif
5151
5152 static void netdev_init_one_queue(struct net_device *dev,
5153                                   struct netdev_queue *queue, void *_unused)
5154 {
5155         /* Initialize queue lock */
5156         spin_lock_init(&queue->_xmit_lock);
5157         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5158         queue->xmit_lock_owner = -1;
5159         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5160         queue->dev = dev;
5161 }
5162
5163 static int netif_alloc_netdev_queues(struct net_device *dev)
5164 {
5165         unsigned int count = dev->num_tx_queues;
5166         struct netdev_queue *tx;
5167
5168         BUG_ON(count < 1);
5169
5170         tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5171         if (!tx) {
5172                 pr_err("netdev: Unable to allocate %u tx queues.\n",
5173                        count);
5174                 return -ENOMEM;
5175         }
5176         dev->_tx = tx;
5177
5178         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5179         spin_lock_init(&dev->tx_global_lock);
5180
5181         return 0;
5182 }
5183
5184 /**
5185  *      register_netdevice      - register a network device
5186  *      @dev: device to register
5187  *
5188  *      Take a completed network device structure and add it to the kernel
5189  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5190  *      chain. 0 is returned on success. A negative errno code is returned
5191  *      on a failure to set up the device, or if the name is a duplicate.
5192  *
5193  *      Callers must hold the rtnl semaphore. You may want
5194  *      register_netdev() instead of this.
5195  *
5196  *      BUGS:
5197  *      The locking appears insufficient to guarantee two parallel registers
5198  *      will not get the same name.
5199  */
5200
5201 int register_netdevice(struct net_device *dev)
5202 {
5203         int ret;
5204         struct net *net = dev_net(dev);
5205
5206         BUG_ON(dev_boot_phase);
5207         ASSERT_RTNL();
5208
5209         might_sleep();
5210
5211         /* When net_device's are persistent, this will be fatal. */
5212         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5213         BUG_ON(!net);
5214
5215         spin_lock_init(&dev->addr_list_lock);
5216         netdev_set_addr_lockdep_class(dev);
5217
5218         dev->iflink = -1;
5219
5220         /* Init, if this function is available */
5221         if (dev->netdev_ops->ndo_init) {
5222                 ret = dev->netdev_ops->ndo_init(dev);
5223                 if (ret) {
5224                         if (ret > 0)
5225                                 ret = -EIO;
5226                         goto out;
5227                 }
5228         }
5229
5230         ret = dev_get_valid_name(dev, dev->name, 0);
5231         if (ret)
5232                 goto err_uninit;
5233
5234         dev->ifindex = dev_new_index(net);
5235         if (dev->iflink == -1)
5236                 dev->iflink = dev->ifindex;
5237
5238         /* Fix illegal checksum combinations */
5239         if ((dev->features & NETIF_F_HW_CSUM) &&
5240             (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5241                 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
5242                        dev->name);
5243                 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5244         }
5245
5246         if ((dev->features & NETIF_F_NO_CSUM) &&
5247             (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5248                 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
5249                        dev->name);
5250                 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5251         }
5252
5253         dev->features = netdev_fix_features(dev->features, dev->name);
5254
5255         /* Enable software GSO if SG is supported. */
5256         if (dev->features & NETIF_F_SG)
5257                 dev->features |= NETIF_F_GSO;
5258
5259         /* Enable GRO and NETIF_F_HIGHDMA for vlans by default,
5260          * vlan_dev_init() will do the dev->features check, so these features
5261          * are enabled only if supported by underlying device.
5262          */
5263         dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
5264
5265         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5266         ret = notifier_to_errno(ret);
5267         if (ret)
5268                 goto err_uninit;
5269
5270         ret = netdev_register_kobject(dev);
5271         if (ret)
5272                 goto err_uninit;
5273         dev->reg_state = NETREG_REGISTERED;
5274
5275         /*
5276          *      Default initial state at registry is that the
5277          *      device is present.
5278          */
5279
5280         set_bit(__LINK_STATE_PRESENT, &dev->state);
5281
5282         dev_init_scheduler(dev);
5283         dev_hold(dev);
5284         list_netdevice(dev);
5285
5286         /* Notify protocols, that a new device appeared. */
5287         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5288         ret = notifier_to_errno(ret);
5289         if (ret) {
5290                 rollback_registered(dev);
5291                 dev->reg_state = NETREG_UNREGISTERED;
5292         }
5293         /*
5294          *      Prevent userspace races by waiting until the network
5295          *      device is fully setup before sending notifications.
5296          */
5297         if (!dev->rtnl_link_ops ||
5298             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5299                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5300
5301 out:
5302         return ret;
5303
5304 err_uninit:
5305         if (dev->netdev_ops->ndo_uninit)
5306                 dev->netdev_ops->ndo_uninit(dev);
5307         goto out;
5308 }
5309 EXPORT_SYMBOL(register_netdevice);
5310
5311 /**
5312  *      init_dummy_netdev       - init a dummy network device for NAPI
5313  *      @dev: device to init
5314  *
5315  *      This takes a network device structure and initialize the minimum
5316  *      amount of fields so it can be used to schedule NAPI polls without
5317  *      registering a full blown interface. This is to be used by drivers
5318  *      that need to tie several hardware interfaces to a single NAPI
5319  *      poll scheduler due to HW limitations.
5320  */
5321 int init_dummy_netdev(struct net_device *dev)
5322 {
5323         /* Clear everything. Note we don't initialize spinlocks
5324          * are they aren't supposed to be taken by any of the
5325          * NAPI code and this dummy netdev is supposed to be
5326          * only ever used for NAPI polls
5327          */
5328         memset(dev, 0, sizeof(struct net_device));
5329
5330         /* make sure we BUG if trying to hit standard
5331          * register/unregister code path
5332          */
5333         dev->reg_state = NETREG_DUMMY;
5334
5335         /* NAPI wants this */
5336         INIT_LIST_HEAD(&dev->napi_list);
5337
5338         /* a dummy interface is started by default */
5339         set_bit(__LINK_STATE_PRESENT, &dev->state);
5340         set_bit(__LINK_STATE_START, &dev->state);
5341
5342         /* Note : We dont allocate pcpu_refcnt for dummy devices,
5343          * because users of this 'device' dont need to change
5344          * its refcount.
5345          */
5346
5347         return 0;
5348 }
5349 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5350
5351
5352 /**
5353  *      register_netdev - register a network device
5354  *      @dev: device to register
5355  *
5356  *      Take a completed network device structure and add it to the kernel
5357  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5358  *      chain. 0 is returned on success. A negative errno code is returned
5359  *      on a failure to set up the device, or if the name is a duplicate.
5360  *
5361  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
5362  *      and expands the device name if you passed a format string to
5363  *      alloc_netdev.
5364  */
5365 int register_netdev(struct net_device *dev)
5366 {
5367         int err;
5368
5369         rtnl_lock();
5370
5371         /*
5372          * If the name is a format string the caller wants us to do a
5373          * name allocation.
5374          */
5375         if (strchr(dev->name, '%')) {
5376                 err = dev_alloc_name(dev, dev->name);
5377                 if (err < 0)
5378                         goto out;
5379         }
5380
5381         err = register_netdevice(dev);
5382 out:
5383         rtnl_unlock();
5384         return err;
5385 }
5386 EXPORT_SYMBOL(register_netdev);
5387
5388 int netdev_refcnt_read(const struct net_device *dev)
5389 {
5390         int i, refcnt = 0;
5391
5392         for_each_possible_cpu(i)
5393                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5394         return refcnt;
5395 }
5396 EXPORT_SYMBOL(netdev_refcnt_read);
5397
5398 /*
5399  * netdev_wait_allrefs - wait until all references are gone.
5400  *
5401  * This is called when unregistering network devices.
5402  *
5403  * Any protocol or device that holds a reference should register
5404  * for netdevice notification, and cleanup and put back the
5405  * reference if they receive an UNREGISTER event.
5406  * We can get stuck here if buggy protocols don't correctly
5407  * call dev_put.
5408  */
5409 static void netdev_wait_allrefs(struct net_device *dev)
5410 {
5411         unsigned long rebroadcast_time, warning_time;
5412         int refcnt;
5413
5414         linkwatch_forget_dev(dev);
5415
5416         rebroadcast_time = warning_time = jiffies;
5417         refcnt = netdev_refcnt_read(dev);
5418
5419         while (refcnt != 0) {
5420                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5421                         rtnl_lock();
5422
5423                         /* Rebroadcast unregister notification */
5424                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5425                         /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5426                          * should have already handle it the first time */
5427
5428                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5429                                      &dev->state)) {
5430                                 /* We must not have linkwatch events
5431                                  * pending on unregister. If this
5432                                  * happens, we simply run the queue
5433                                  * unscheduled, resulting in a noop
5434                                  * for this device.
5435                                  */
5436                                 linkwatch_run_queue();
5437                         }
5438
5439                         __rtnl_unlock();
5440
5441                         rebroadcast_time = jiffies;
5442                 }
5443
5444                 msleep(250);
5445
5446                 refcnt = netdev_refcnt_read(dev);
5447
5448                 if (time_after(jiffies, warning_time + 10 * HZ)) {
5449                         printk(KERN_EMERG "unregister_netdevice: "
5450                                "waiting for %s to become free. Usage "
5451                                "count = %d\n",
5452                                dev->name, refcnt);
5453                         warning_time = jiffies;
5454                 }
5455         }
5456 }
5457
5458 /* The sequence is:
5459  *
5460  *      rtnl_lock();
5461  *      ...
5462  *      register_netdevice(x1);
5463  *      register_netdevice(x2);
5464  *      ...
5465  *      unregister_netdevice(y1);
5466  *      unregister_netdevice(y2);
5467  *      ...
5468  *      rtnl_unlock();
5469  *      free_netdev(y1);
5470  *      free_netdev(y2);
5471  *
5472  * We are invoked by rtnl_unlock().
5473  * This allows us to deal with problems:
5474  * 1) We can delete sysfs objects which invoke hotplug
5475  *    without deadlocking with linkwatch via keventd.
5476  * 2) Since we run with the RTNL semaphore not held, we can sleep
5477  *    safely in order to wait for the netdev refcnt to drop to zero.
5478  *
5479  * We must not return until all unregister events added during
5480  * the interval the lock was held have been completed.
5481  */
5482 void netdev_run_todo(void)
5483 {
5484         struct list_head list;
5485
5486         /* Snapshot list, allow later requests */
5487         list_replace_init(&net_todo_list, &list);
5488
5489         __rtnl_unlock();
5490
5491         while (!list_empty(&list)) {
5492                 struct net_device *dev
5493                         = list_first_entry(&list, struct net_device, todo_list);
5494                 list_del(&dev->todo_list);
5495
5496                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5497                         printk(KERN_ERR "network todo '%s' but state %d\n",
5498                                dev->name, dev->reg_state);
5499                         dump_stack();
5500                         continue;
5501                 }
5502
5503                 dev->reg_state = NETREG_UNREGISTERED;
5504
5505                 on_each_cpu(flush_backlog, dev, 1);
5506
5507                 netdev_wait_allrefs(dev);
5508
5509                 /* paranoia */
5510                 BUG_ON(netdev_refcnt_read(dev));
5511                 WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5512                 WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
5513                 WARN_ON(dev->dn_ptr);
5514
5515                 if (dev->destructor)
5516                         dev->destructor(dev);
5517
5518                 /* Free network device */
5519                 kobject_put(&dev->dev.kobj);
5520         }
5521 }
5522
5523 /**
5524  *      dev_txq_stats_fold - fold tx_queues stats
5525  *      @dev: device to get statistics from
5526  *      @stats: struct rtnl_link_stats64 to hold results
5527  */
5528 void dev_txq_stats_fold(const struct net_device *dev,
5529                         struct rtnl_link_stats64 *stats)
5530 {
5531         u64 tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
5532         unsigned int i;
5533         struct netdev_queue *txq;
5534
5535         for (i = 0; i < dev->num_tx_queues; i++) {
5536                 txq = netdev_get_tx_queue(dev, i);
5537                 spin_lock_bh(&txq->_xmit_lock);
5538                 tx_bytes   += txq->tx_bytes;
5539                 tx_packets += txq->tx_packets;
5540                 tx_dropped += txq->tx_dropped;
5541                 spin_unlock_bh(&txq->_xmit_lock);
5542         }
5543         if (tx_bytes || tx_packets || tx_dropped) {
5544                 stats->tx_bytes   = tx_bytes;
5545                 stats->tx_packets = tx_packets;
5546                 stats->tx_dropped = tx_dropped;
5547         }
5548 }
5549 EXPORT_SYMBOL(dev_txq_stats_fold);
5550
5551 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5552  * fields in the same order, with only the type differing.
5553  */
5554 static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5555                                     const struct net_device_stats *netdev_stats)
5556 {
5557 #if BITS_PER_LONG == 64
5558         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5559         memcpy(stats64, netdev_stats, sizeof(*stats64));
5560 #else
5561         size_t i, n = sizeof(*stats64) / sizeof(u64);
5562         const unsigned long *src = (const unsigned long *)netdev_stats;
5563         u64 *dst = (u64 *)stats64;
5564
5565         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5566                      sizeof(*stats64) / sizeof(u64));
5567         for (i = 0; i < n; i++)
5568                 dst[i] = src[i];
5569 #endif
5570 }
5571
5572 /**
5573  *      dev_get_stats   - get network device statistics
5574  *      @dev: device to get statistics from
5575  *      @storage: place to store stats
5576  *
5577  *      Get network statistics from device. Return @storage.
5578  *      The device driver may provide its own method by setting
5579  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5580  *      otherwise the internal statistics structure is used.
5581  */
5582 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5583                                         struct rtnl_link_stats64 *storage)
5584 {
5585         const struct net_device_ops *ops = dev->netdev_ops;
5586
5587         if (ops->ndo_get_stats64) {
5588                 memset(storage, 0, sizeof(*storage));
5589                 ops->ndo_get_stats64(dev, storage);
5590         } else if (ops->ndo_get_stats) {
5591                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5592         } else {
5593                 netdev_stats_to_stats64(storage, &dev->stats);
5594                 dev_txq_stats_fold(dev, storage);
5595         }
5596         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5597         return storage;
5598 }
5599 EXPORT_SYMBOL(dev_get_stats);
5600
/*
 * Return the ingress queue of @dev, creating it first if necessary.
 *
 * With CONFIG_NET_CLS_ACT, a missing queue is allocated, initialised,
 * pointed at noop_qdisc and published in dev->ingress_queue; NULL is
 * returned if that allocation fails.  Without CONFIG_NET_CLS_ACT the
 * result of dev_ingress_queue() is returned as is.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
        struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
        if (!queue) {
                queue = kzalloc(sizeof(*queue), GFP_KERNEL);
                if (queue) {
                        netdev_init_one_queue(dev, queue, NULL);
                        queue->qdisc = &noop_qdisc;
                        queue->qdisc_sleeping = &noop_qdisc;
                        /* Publish only after the queue is fully set up. */
                        rcu_assign_pointer(dev->ingress_queue, queue);
                }
        }
#endif
        return queue;
}
5618
5619 /**
5620  *      alloc_netdev_mq - allocate network device
5621  *      @sizeof_priv:   size of private data to allocate space for
5622  *      @name:          device name format string
5623  *      @setup:         callback to initialize device
5624  *      @queue_count:   the number of subqueues to allocate
5625  *
5626  *      Allocates a struct net_device with private data area for driver use
5627  *      and performs basic initialization.  Also allocates subquue structs
5628  *      for each queue on the device at the end of the netdevice.
5629  */
5630 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5631                 void (*setup)(struct net_device *), unsigned int queue_count)
5632 {
5633         struct net_device *dev;
5634         size_t alloc_size;
5635         struct net_device *p;
5636
5637         BUG_ON(strlen(name) >= sizeof(dev->name));
5638
5639         if (queue_count < 1) {
5640                 pr_err("alloc_netdev: Unable to allocate device "
5641                        "with zero queues.\n");
5642                 return NULL;
5643         }
5644
5645         alloc_size = sizeof(struct net_device);
5646         if (sizeof_priv) {
5647                 /* ensure 32-byte alignment of private area */
5648                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5649                 alloc_size += sizeof_priv;
5650         }
5651         /* ensure 32-byte alignment of whole construct */
5652         alloc_size += NETDEV_ALIGN - 1;
5653
5654         p = kzalloc(alloc_size, GFP_KERNEL);
5655         if (!p) {
5656                 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5657                 return NULL;
5658         }
5659
5660         dev = PTR_ALIGN(p, NETDEV_ALIGN);
5661         dev->padded = (char *)dev - (char *)p;
5662
5663         dev->pcpu_refcnt = alloc_percpu(int);
5664         if (!dev->pcpu_refcnt)
5665                 goto free_p;
5666
5667         if (dev_addr_init(dev))
5668                 goto free_pcpu;
5669
5670         dev_mc_init(dev);
5671         dev_uc_init(dev);
5672
5673         dev_net_set(dev, &init_net);
5674
5675         dev->num_tx_queues = queue_count;
5676         dev->real_num_tx_queues = queue_count;
5677         if (netif_alloc_netdev_queues(dev))
5678                 goto free_pcpu;
5679
5680 #ifdef CONFIG_RPS
5681         dev->num_rx_queues = queue_count;
5682         dev->real_num_rx_queues = queue_count;
5683         if (netif_alloc_rx_queues(dev))
5684                 goto free_pcpu;
5685 #endif
5686
5687         dev->gso_max_size = GSO_MAX_SIZE;
5688
5689         INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5690         dev->ethtool_ntuple_list.count = 0;
5691         INIT_LIST_HEAD(&dev->napi_list);
5692         INIT_LIST_HEAD(&dev->unreg_list);
5693         INIT_LIST_HEAD(&dev->link_watch_list);
5694         dev->priv_flags = IFF_XMIT_DST_RELEASE;
5695         setup(dev);
5696         strcpy(dev->name, name);
5697         return dev;
5698
5699 free_pcpu:
5700         free_percpu(dev->pcpu_refcnt);
5701         kfree(dev->_tx);
5702 #ifdef CONFIG_RPS
5703         kfree(dev->_rx);
5704 #endif
5705
5706 free_p:
5707         kfree(p);
5708         return NULL;
5709 }
5710 EXPORT_SYMBOL(alloc_netdev_mq);
5711
5712 /**
5713  *      free_netdev - free network device
5714  *      @dev: device
5715  *
5716  *      This function does the last stage of destroying an allocated device
5717  *      interface. The reference to the device object is released.
5718  *      If this is the last reference then it will be freed.
5719  */
5720 void free_netdev(struct net_device *dev)
5721 {
5722         struct napi_struct *p, *n;
5723
5724         release_net(dev_net(dev));
5725
5726         kfree(dev->_tx);
5727 #ifdef CONFIG_RPS
5728         kfree(dev->_rx);
5729 #endif
5730
5731         kfree(rcu_dereference_raw(dev->ingress_queue));
5732
5733         /* Flush device addresses */
5734         dev_addr_flush(dev);
5735
5736         /* Clear ethtool n-tuple list */
5737         ethtool_ntuple_flush(dev);
5738
5739         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5740                 netif_napi_del(p);
5741
5742         free_percpu(dev->pcpu_refcnt);
5743         dev->pcpu_refcnt = NULL;
5744
5745         /*  Compatibility with error handling in drivers */
5746         if (dev->reg_state == NETREG_UNINITIALIZED) {
5747                 kfree((char *)dev - dev->padded);
5748                 return;
5749         }
5750
5751         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5752         dev->reg_state = NETREG_RELEASED;
5753
5754         /* will free via device release */
5755         put_device(&dev->dev);
5756 }
5757 EXPORT_SYMBOL(free_netdev);
5758
/**
 *      synchronize_net -  Synchronize with packet receive processing
 *
 *      Waits until packets that are currently being received have been
 *      dealt with; packets arriving later are not held back.
 *      Implemented as synchronize_rcu(), so the caller may sleep.
 */
void synchronize_net(void)
{
        /* flag callers that are not allowed to sleep */
        might_sleep();

        synchronize_rcu();
}
5770 EXPORT_SYMBOL(synchronize_net);
5771
5772 /**
5773  *      unregister_netdevice_queue - remove device from the kernel
5774  *      @dev: device
5775  *      @head: list
5776  *
5777  *      This function shuts down a device interface and removes it
5778  *      from the kernel tables.
5779  *      If head not NULL, device is queued to be unregistered later.
5780  *
5781  *      Callers must hold the rtnl semaphore.  You may want
5782  *      unregister_netdev() instead of this.
5783  */
5784
5785 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5786 {
5787         ASSERT_RTNL();
5788
5789         if (head) {
5790                 list_move_tail(&dev->unreg_list, head);
5791         } else {
5792                 rollback_registered(dev);
5793                 /* Finish processing unregister after unlock */
5794                 net_set_todo(dev);
5795         }
5796 }
5797 EXPORT_SYMBOL(unregister_netdevice_queue);
5798
5799 /**
5800  *      unregister_netdevice_many - unregister many devices
5801  *      @head: list of devices
5802  */
5803 void unregister_netdevice_many(struct list_head *head)
5804 {
5805         struct net_device *dev;
5806
5807         if (!list_empty(head)) {
5808                 rollback_registered_many(head);
5809                 list_for_each_entry(dev, head, unreg_list)
5810                         net_set_todo(dev);
5811         }
5812 }
5813 EXPORT_SYMBOL(unregister_netdevice_many);
5814
/**
 *      unregister_netdev - remove device from the kernel
 *      @dev: device
 *
 *      Shuts down a device interface and removes it from the kernel
 *      tables.
 *
 *      This simply calls unregister_netdevice() with the rtnl semaphore
 *      taken around it.  In general this is the function to use, not
 *      unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
        rtnl_lock();

        unregister_netdevice(dev);

        rtnl_unlock();
}
5832 EXPORT_SYMBOL(unregister_netdev);
5833
5834 /**
5835  *      dev_change_net_namespace - move device to different nethost namespace
5836  *      @dev: device
5837  *      @net: network namespace
5838  *      @pat: If not NULL name pattern to try if the current device name
5839  *            is already taken in the destination network namespace.
5840  *
5841  *      This function shuts down a device interface and moves it
5842  *      to a new network namespace. On success 0 is returned, on
5843  *      a failure a netagive errno code is returned.
5844  *
5845  *      Callers must hold the rtnl semaphore.
5846  */
5847
5848 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5849 {
5850         int err;
5851
5852         ASSERT_RTNL();
5853
5854         /* Don't allow namespace local devices to be moved. */
5855         err = -EINVAL;
5856         if (dev->features & NETIF_F_NETNS_LOCAL)
5857                 goto out;
5858
5859         /* Ensure the device has been registrered */
5860         err = -EINVAL;
5861         if (dev->reg_state != NETREG_REGISTERED)
5862                 goto out;
5863
5864         /* Get out if there is nothing todo */
5865         err = 0;
5866         if (net_eq(dev_net(dev), net))
5867                 goto out;
5868
5869         /* Pick the destination device name, and ensure
5870          * we can use it in the destination network namespace.
5871          */
5872         err = -EEXIST;
5873         if (__dev_get_by_name(net, dev->name)) {
5874                 /* We get here if we can't use the current device name */
5875                 if (!pat)
5876                         goto out;
5877                 if (dev_get_valid_name(dev, pat, 1))
5878                         goto out;
5879         }
5880
5881         /*
5882          * And now a mini version of register_netdevice unregister_netdevice.
5883          */
5884
5885         /* If device is running close it first. */
5886         dev_close(dev);
5887
5888         /* And unlink it from device chain */
5889         err = -ENODEV;
5890         unlist_netdevice(dev);
5891
5892         synchronize_net();
5893
5894         /* Shutdown queueing discipline. */
5895         dev_shutdown(dev);
5896
5897         /* Notify protocols, that we are about to destroy
5898            this device. They should clean all the things.
5899
5900            Note that dev->reg_state stays at NETREG_REGISTERED.
5901            This is wanted because this way 8021q and macvlan know
5902            the device is just moving and can keep their slaves up.
5903         */
5904         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5905         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5906
5907         /*
5908          *      Flush the unicast and multicast chains
5909          */
5910         dev_uc_flush(dev);
5911         dev_mc_flush(dev);
5912
5913         /* Actually switch the network namespace */
5914         dev_net_set(dev, net);
5915
5916         /* If there is an ifindex conflict assign a new one */
5917         if (__dev_get_by_index(net, dev->ifindex)) {
5918                 int iflink = (dev->iflink == dev->ifindex);
5919                 dev->ifindex = dev_new_index(net);
5920                 if (iflink)
5921                         dev->iflink = dev->ifindex;
5922         }
5923
5924         /* Fixup kobjects */
5925         err = device_rename(&dev->dev, dev->name);
5926         WARN_ON(err);
5927
5928         /* Add the device back in the hashes */
5929         list_netdevice(dev);
5930
5931         /* Notify protocols, that a new device appeared. */
5932         call_netdevice_notifiers(NETDEV_REGISTER, dev);
5933
5934         /*
5935          *      Prevent userspace races by waiting until the network
5936          *      device is fully setup before sending notifications.
5937          */
5938         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5939
5940         synchronize_net();
5941         err = 0;
5942 out:
5943         return err;
5944 }
5945 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
5946
5947 static int dev_cpu_callback(struct notifier_block *nfb,
5948                             unsigned long action,
5949                             void *ocpu)
5950 {
5951         struct sk_buff **list_skb;
5952         struct sk_buff *skb;
5953         unsigned int cpu, oldcpu = (unsigned long)ocpu;
5954         struct softnet_data *sd, *oldsd;
5955
5956         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
5957                 return NOTIFY_OK;
5958
5959         local_irq_disable();
5960         cpu = smp_processor_id();
5961         sd = &per_cpu(softnet_data, cpu);
5962         oldsd = &per_cpu(softnet_data, oldcpu);
5963
5964         /* Find end of our completion_queue. */
5965         list_skb = &sd->completion_queue;
5966         while (*list_skb)
5967                 list_skb = &(*list_skb)->next;
5968         /* Append completion queue from offline CPU. */
5969         *list_skb = oldsd->completion_queue;
5970         oldsd->completion_queue = NULL;
5971
5972         /* Append output queue from offline CPU. */
5973         if (oldsd->output_queue) {
5974                 *sd->output_queue_tailp = oldsd->output_queue;
5975                 sd->output_queue_tailp = oldsd->output_queue_tailp;
5976                 oldsd->output_queue = NULL;
5977                 oldsd->output_queue_tailp = &oldsd->output_queue;
5978         }
5979
5980         raise_softirq_irqoff(NET_TX_SOFTIRQ);
5981         local_irq_enable();
5982
5983         /* Process offline CPU's input_pkt_queue */
5984         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
5985                 netif_rx(skb);
5986                 input_queue_head_incr(oldsd);
5987         }
5988         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
5989                 netif_rx(skb);
5990                 input_queue_head_incr(oldsd);
5991         }
5992
5993         return NOTIFY_OK;
5994 }
5995
5996
5997 /**
5998  *      netdev_increment_features - increment feature set by one
5999  *      @all: current feature set
6000  *      @one: new feature set
6001  *      @mask: mask feature set
6002  *
6003  *      Computes a new feature set after adding a device with feature set
6004  *      @one to the master device with current feature set @all.  Will not
6005  *      enable anything that is off in @mask. Returns the new feature set.
6006  */
6007 unsigned long netdev_increment_features(unsigned long all, unsigned long one,
6008                                         unsigned long mask)
6009 {
6010         /* If device needs checksumming, downgrade to it. */
6011         if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
6012                 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
6013         else if (mask & NETIF_F_ALL_CSUM) {
6014                 /* If one device supports v4/v6 checksumming, set for all. */
6015                 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
6016                     !(all & NETIF_F_GEN_CSUM)) {
6017                         all &= ~NETIF_F_ALL_CSUM;
6018                         all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
6019                 }
6020
6021                 /* If one device supports hw checksumming, set for all. */
6022                 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
6023                         all &= ~NETIF_F_ALL_CSUM;
6024                         all |= NETIF_F_HW_CSUM;
6025                 }
6026         }
6027
6028         one |= NETIF_F_ALL_CSUM;
6029
6030         one |= all & NETIF_F_ONE_FOR_ALL;
6031         all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
6032         all |= one & mask & NETIF_F_ONE_FOR_ALL;
6033
6034         return all;
6035 }
6036 EXPORT_SYMBOL(netdev_increment_features);
6037
6038 static struct hlist_head *netdev_create_hash(void)
6039 {
6040         int i;
6041         struct hlist_head *hash;
6042
6043         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6044         if (hash != NULL)
6045                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6046                         INIT_HLIST_HEAD(&hash[i]);
6047
6048         return hash;
6049 }
6050
6051 /* Initialize per network namespace state */
6052 static int __net_init netdev_init(struct net *net)
6053 {
6054         INIT_LIST_HEAD(&net->dev_base_head);
6055
6056         net->dev_name_head = netdev_create_hash();
6057         if (net->dev_name_head == NULL)
6058                 goto err_name;
6059
6060         net->dev_index_head = netdev_create_hash();
6061         if (net->dev_index_head == NULL)
6062                 goto err_idx;
6063
6064         return 0;
6065
6066 err_idx:
6067         kfree(net->dev_name_head);
6068 err_name:
6069         return -ENOMEM;
6070 }
6071
6072 /**
6073  *      netdev_drivername - network driver for the device
6074  *      @dev: network device
6075  *      @buffer: buffer for resulting name
6076  *      @len: size of buffer
6077  *
6078  *      Determine network driver for device.
6079  */
6080 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
6081 {
6082         const struct device_driver *driver;
6083         const struct device *parent;
6084
6085         if (len <= 0 || !buffer)
6086                 return buffer;
6087         buffer[0] = 0;
6088
6089         parent = dev->dev.parent;
6090
6091         if (!parent)
6092                 return buffer;
6093
6094         driver = parent->driver;
6095         if (driver && driver->name)
6096                 strlcpy(buffer, driver->name, len);
6097         return buffer;
6098 }
6099
6100 static int __netdev_printk(const char *level, const struct net_device *dev,
6101                            struct va_format *vaf)
6102 {
6103         int r;
6104
6105         if (dev && dev->dev.parent)
6106                 r = dev_printk(level, dev->dev.parent, "%s: %pV",
6107                                netdev_name(dev), vaf);
6108         else if (dev)
6109                 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6110         else
6111                 r = printk("%s(NULL net_device): %pV", level, vaf);
6112
6113         return r;
6114 }
6115
6116 int netdev_printk(const char *level, const struct net_device *dev,
6117                   const char *format, ...)
6118 {
6119         struct va_format vaf;
6120         va_list args;
6121         int r;
6122
6123         va_start(args, format);
6124
6125         vaf.fmt = format;
6126         vaf.va = &args;
6127
6128         r = __netdev_printk(level, dev, &vaf);
6129         va_end(args);
6130
6131         return r;
6132 }
6133 EXPORT_SYMBOL(netdev_printk);
6134
6135 #define define_netdev_printk_level(func, level)                 \
6136 int func(const struct net_device *dev, const char *fmt, ...)    \
6137 {                                                               \
6138         int r;                                                  \
6139         struct va_format vaf;                                   \
6140         va_list args;                                           \
6141                                                                 \
6142         va_start(args, fmt);                                    \
6143                                                                 \
6144         vaf.fmt = fmt;                                          \
6145         vaf.va = &args;                                         \
6146                                                                 \
6147         r = __netdev_printk(level, dev, &vaf);                  \
6148         va_end(args);                                           \
6149                                                                 \
6150         return r;                                               \
6151 }                                                               \
6152 EXPORT_SYMBOL(func);
6153
6154 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6155 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6156 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6157 define_netdev_printk_level(netdev_err, KERN_ERR);
6158 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6159 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6160 define_netdev_printk_level(netdev_info, KERN_INFO);
6161
6162 static void __net_exit netdev_exit(struct net *net)
6163 {
6164         kfree(net->dev_name_head);
6165         kfree(net->dev_index_head);
6166 }
6167
6168 static struct pernet_operations __net_initdata netdev_net_ops = {
6169         .init = netdev_init,
6170         .exit = netdev_exit,
6171 };
6172
6173 static void __net_exit default_device_exit(struct net *net)
6174 {
6175         struct net_device *dev, *aux;
6176         /*
6177          * Push all migratable network devices back to the
6178          * initial network namespace
6179          */
6180         rtnl_lock();
6181         for_each_netdev_safe(net, dev, aux) {
6182                 int err;
6183                 char fb_name[IFNAMSIZ];
6184
6185                 /* Ignore unmoveable devices (i.e. loopback) */
6186                 if (dev->features & NETIF_F_NETNS_LOCAL)
6187                         continue;
6188
6189                 /* Leave virtual devices for the generic cleanup */
6190                 if (dev->rtnl_link_ops)
6191                         continue;
6192
6193                 /* Push remaing network devices to init_net */
6194                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6195                 err = dev_change_net_namespace(dev, &init_net, fb_name);
6196                 if (err) {
6197                         printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6198                                 __func__, dev->name, err);
6199                         BUG();
6200                 }
6201         }
6202         rtnl_unlock();
6203 }
6204
6205 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6206 {
6207         /* At exit all network devices most be removed from a network
6208          * namespace.  Do this in the reverse order of registeration.
6209          * Do this across as many network namespaces as possible to
6210          * improve batching efficiency.
6211          */
6212         struct net_device *dev;
6213         struct net *net;
6214         LIST_HEAD(dev_kill_list);
6215
6216         rtnl_lock();
6217         list_for_each_entry(net, net_list, exit_list) {
6218                 for_each_netdev_reverse(net, dev) {
6219                         if (dev->rtnl_link_ops)
6220                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6221                         else
6222                                 unregister_netdevice_queue(dev, &dev_kill_list);
6223                 }
6224         }
6225         unregister_netdevice_many(&dev_kill_list);
6226         rtnl_unlock();
6227 }
6228
6229 static struct pernet_operations __net_initdata default_device_ops = {
6230         .exit = default_device_exit,
6231         .exit_batch = default_device_exit_batch,
6232 };
6233
6234 /*
6235  *      Initialize the DEV module. At boot time this walks the device list and
6236  *      unhooks any devices that fail to initialise (normally hardware not
6237  *      present) and leaves us with a valid list of present and active devices.
6238  *
6239  */
6240
6241 /*
6242  *       This is called single threaded during boot, so no need
6243  *       to take the rtnl semaphore.
6244  */
6245 static int __init net_dev_init(void)
6246 {
6247         int i, rc = -ENOMEM;
6248
6249         BUG_ON(!dev_boot_phase);
6250
6251         if (dev_proc_init())
6252                 goto out;
6253
6254         if (netdev_kobject_init())
6255                 goto out;
6256
6257         INIT_LIST_HEAD(&ptype_all);
6258         for (i = 0; i < PTYPE_HASH_SIZE; i++)
6259                 INIT_LIST_HEAD(&ptype_base[i]);
6260
6261         if (register_pernet_subsys(&netdev_net_ops))
6262                 goto out;
6263
6264         /*
6265          *      Initialise the packet receive queues.
6266          */
6267
6268         for_each_possible_cpu(i) {
6269                 struct softnet_data *sd = &per_cpu(softnet_data, i);
6270
6271                 memset(sd, 0, sizeof(*sd));
6272                 skb_queue_head_init(&sd->input_pkt_queue);
6273                 skb_queue_head_init(&sd->process_queue);
6274                 sd->completion_queue = NULL;
6275                 INIT_LIST_HEAD(&sd->poll_list);
6276                 sd->output_queue = NULL;
6277                 sd->output_queue_tailp = &sd->output_queue;
6278 #ifdef CONFIG_RPS
6279                 sd->csd.func = rps_trigger_softirq;
6280                 sd->csd.info = sd;
6281                 sd->csd.flags = 0;
6282                 sd->cpu = i;
6283 #endif
6284
6285                 sd->backlog.poll = process_backlog;
6286                 sd->backlog.weight = weight_p;
6287                 sd->backlog.gro_list = NULL;
6288                 sd->backlog.gro_count = 0;
6289         }
6290
6291         dev_boot_phase = 0;
6292
6293         /* The loopback device is special if any other network devices
6294          * is present in a network namespace the loopback device must
6295          * be present. Since we now dynamically allocate and free the
6296          * loopback device ensure this invariant is maintained by
6297          * keeping the loopback device as the first device on the
6298          * list of network devices.  Ensuring the loopback devices
6299          * is the first device that appears and the last network device
6300          * that disappears.
6301          */
6302         if (register_pernet_device(&loopback_net_ops))
6303                 goto out;
6304
6305         if (register_pernet_device(&default_device_ops))
6306                 goto out;
6307
6308         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6309         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6310
6311         hotcpu_notifier(dev_cpu_callback, 0);
6312         dst_init();
6313         dev_mcast_init();
6314         rc = 0;
6315 out:
6316         return rc;
6317 }
6318
6319 subsys_initcall(net_dev_init);
6320
6321 static int __init initialize_hashrnd(void)
6322 {
6323         get_random_bytes(&hashrnd, sizeof(hashrnd));
6324         return 0;
6325 }
6326
6327 late_initcall_sync(initialize_hashrnd);
6328