net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <asm/system.h>
  77 #include <linux/bitops.h>
  78 #include <linux/capability.h>
  79 #include <linux/cpu.h>
  80 #include <linux/types.h>
  81 #include <linux/kernel.h>
  82 #include <linux/hash.h>
  83 #include <linux/slab.h>
  84 #include <linux/sched.h>
  85 #include <linux/mutex.h>
  86 #include <linux/string.h>
  87 #include <linux/mm.h>
  88 #include <linux/socket.h>
  89 #include <linux/sockios.h>
  90 #include <linux/errno.h>
  91 #include <linux/interrupt.h>
  92 #include <linux/if_ether.h>
  93 #include <linux/netdevice.h>
  94 #include <linux/etherdevice.h>
  95 #include <linux/ethtool.h>
  96 #include <linux/notifier.h>
  97 #include <linux/skbuff.h>
  98 #include <net/net_namespace.h>
  99 #include <net/sock.h>
 100 #include <linux/rtnetlink.h>
 101 #include <linux/proc_fs.h>
 102 #include <linux/seq_file.h>
 103 #include <linux/stat.h>
 104 #include <net/dst.h>
 105 #include <net/pkt_sched.h>
 106 #include <net/checksum.h>
 107 #include <net/xfrm.h>
 108 #include <linux/highmem.h>
 109 #include <linux/init.h>
 110 #include <linux/kmod.h>
 111 #include <linux/module.h>
 112 #include <linux/netpoll.h>
 113 #include <linux/rcupdate.h>
 114 #include <linux/delay.h>
 115 #include <net/wext.h>
 116 #include <net/iw_handler.h>
 117 #include <asm/current.h>
 118 #include <linux/audit.h>
 119 #include <linux/dmaengine.h>
 120 #include <linux/err.h>
 121 #include <linux/ctype.h>
 122 #include <linux/if_arp.h>
 123 #include <linux/if_vlan.h>
 124 #include <linux/ip.h>
 125 #include <net/ip.h>
 126 #include <linux/ipv6.h>
 127 #include <linux/in.h>
 128 #include <linux/jhash.h>
 129 #include <linux/random.h>
 130 #include <trace/events/napi.h>
 131 #include <trace/events/net.h>
 132 #include <trace/events/skb.h>
 133 #include <linux/pci.h>
 134 #include <linux/inetdevice.h>
 135
 136 #include "net-sysfs.h"
 137
 138 /* Instead of increasing this, you should create a hash table. */
 139 #define MAX_GRO_SKBS 8
 140
 141 /* This should be increased if a protocol with a bigger head is added. */
 142 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 143
 144 /*
 145  *      The list of packet types we will receive (as opposed to discard)
 146  *      and the routines to invoke.
 147  *
 148  *      Why 16. Because with 16 the only overlap we get on a hash of the
 149  *      low nibble of the protocol value is RARP/SNAP/X.25.
 150  *
 151  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 152  *             sure which should go first, but I bet it won't make much
 153  *             difference if we are running VLANs.  The good news is that
 154  *             this protocol won't be in the list unless compiled in, so
 155  *             the average user (w/out VLANs) will not be adversely affected.
 156  *             --BLG
 157  *
 158  *              0800    IP
 159  *              8100    802.1Q VLAN
 160  *              0001    802.3
 161  *              0002    AX.25
 162  *              0004    802.2
 163  *              8035    RARP
 164  *              0005    SNAP
 165  *              0805    X.25
 166  *              0806    ARP
 167  *              8137    IPX
 168  *              0009    Localtalk
 169  *              86DD    IPv6
 170  */
 171
 172 #define PTYPE_HASH_SIZE (16)
 173 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
 174
 175 static DEFINE_SPINLOCK(ptype_lock);
 176 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 177 static struct list_head ptype_all __read_mostly;        /* Taps */
 178
 179 /*
 180  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 181  * semaphore.
 182  *
 183  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 184  *
 185  * Writers must hold the rtnl semaphore while they loop through the
 186  * dev_base_head list, and hold dev_base_lock for writing when they do the
 187  * actual updates.  This allows pure readers to access the list even
 188  * while a writer is preparing to update it.
 189  *
 190  * To put it another way, dev_base_lock is held for writing only to
 191  * protect against pure readers; the rtnl semaphore provides the
 192  * protection against other writers.
 193  *
 194  * See, for example usages, register_netdevice() and
 195  * unregister_netdevice(), which must be called with the rtnl
 196  * semaphore held.
 197  */
 198 DEFINE_RWLOCK(dev_base_lock);
 199 EXPORT_SYMBOL(dev_base_lock);
 200
 201 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 202 {
 203         unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 204         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 205 }
 206
 207 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 208 {
 209         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 210 }
 211
 212 static inline void rps_lock(struct softnet_data *sd)
 213 {
 214 #ifdef CONFIG_RPS
 215         spin_lock(&sd->input_pkt_queue.lock);
 216 #endif
 217 }
 218
 219 static inline void rps_unlock(struct softnet_data *sd)
 220 {
 221 #ifdef CONFIG_RPS
 222         spin_unlock(&sd->input_pkt_queue.lock);
 223 #endif
 224 }
 225
 226 /* Device list insertion */
 227 static int list_netdevice(struct net_device *dev)
 228 {
 229         struct net *net = dev_net(dev);
 230
 231         ASSERT_RTNL();
 232
 233         write_lock_bh(&dev_base_lock);
 234         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 235         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 236         hlist_add_head_rcu(&dev->index_hlist,
 237                            dev_index_hash(net, dev->ifindex));
 238         write_unlock_bh(&dev_base_lock);
 239         return 0;
 240 }
 241
 242 /* Device list removal
 243  * caller must respect a RCU grace period before freeing/reusing dev
 244  */
 245 static void unlist_netdevice(struct net_device *dev)
 246 {
 247         ASSERT_RTNL();
 248
 249         /* Unlink dev from the device chain */
 250         write_lock_bh(&dev_base_lock);
 251         list_del_rcu(&dev->dev_list);
 252         hlist_del_rcu(&dev->name_hlist);
 253         hlist_del_rcu(&dev->index_hlist);
 254         write_unlock_bh(&dev_base_lock);
 255 }
 256
 257 /*
 258  *      Our notifier list
 259  */
 260
 261 static RAW_NOTIFIER_HEAD(netdev_chain);
 262
 263 /*
 264  *      Device drivers call our routines to queue packets here. We empty the
 265  *      queue in the local softnet handler.
 266  */
 267
 268 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 269 EXPORT_PER_CPU_SYMBOL(softnet_data);
 270
 271 #ifdef CONFIG_LOCKDEP
 272 /*
 273  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 274  * according to dev->type
 275  */
 276 static const unsigned short netdev_lock_type[] =
 277         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 278          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 279          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 280          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 281          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 282          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 283          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 284          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 285          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 286          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 287          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 288          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 289          ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
 290          ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
 291          ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
 292          ARPHRD_VOID, ARPHRD_NONE};
 293
 294 static const char *const netdev_lock_name[] =
 295         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 296          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 297          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 298          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 299          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 300          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 301          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 302          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 303          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 304          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 305          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 306          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 307          "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
 308          "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
 309          "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
 310          "_xmit_VOID", "_xmit_NONE"};
 311
 312 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 313 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 314
 315 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 316 {
 317         int i;
 318
 319         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 320                 if (netdev_lock_type[i] == dev_type)
 321                         return i;
 322         /* the last key is used by default */
 323         return ARRAY_SIZE(netdev_lock_type) - 1;
 324 }
 325
 326 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 327                                                  unsigned short dev_type)
 328 {
 329         int i;
 330
 331         i = netdev_lock_pos(dev_type);
 332         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 333                                    netdev_lock_name[i]);
 334 }
 335
 336 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 337 {
 338         int i;
 339
 340         i = netdev_lock_pos(dev->type);
 341         lockdep_set_class_and_name(&dev->addr_list_lock,
 342                                    &netdev_addr_lock_key[i],
 343                                    netdev_lock_name[i]);
 344 }
 345 #else
 346 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 347                                                  unsigned short dev_type)
 348 {
 349 }
 350 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 351 {
 352 }
 353 #endif
 354
 355 /*******************************************************************************
 356
 357                 Protocol management and registration routines
 358
 359 *******************************************************************************/
 360
 361 /*
 362  *      Add a protocol ID to the list. Now that the input handler is
 363  *      smarter we can dispense with all the messy stuff that used to be
 364  *      here.
 365  *
 366  *      BEWARE!!! Protocol handlers, mangling input packets,
 367  *      MUST BE last in hash buckets and checking protocol handlers
 368  *      MUST start from promiscuous ptype_all chain in net_bh.
 369  *      It is true now, do not change it.
 370  *      Explanation follows: if protocol handler, mangling packet, will
 371  *      be the first on list, it is not able to sense, that packet
 372  *      is cloned and should be copied-on-write, so that it will
 373  *      change it and subsequent readers will get broken packet.
 374  *                                                      --ANK (980803)
 375  */
 376
 377 static inline struct list_head *ptype_head(const struct packet_type *pt)
 378 {
 379         if (pt->type == htons(ETH_P_ALL))
 380                 return &ptype_all;
 381         else
 382                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 383 }
 384
 385 /**
 386  *      dev_add_pack - add packet handler
 387  *      @pt: packet type declaration
 388  *
 389  *      Add a protocol handler to the networking stack. The passed &packet_type
 390  *      is linked into kernel lists and may not be freed until it has been
 391  *      removed from the kernel lists.
 392  *
 393  *      This call does not sleep therefore it can not
 394  *      guarantee all CPU's that are in middle of receiving packets
 395  *      will see the new packet type (until the next received packet).
 396  */
 397
 398 void dev_add_pack(struct packet_type *pt)
 399 {
 400         struct list_head *head = ptype_head(pt);
 401
 402         spin_lock(&ptype_lock);
 403         list_add_rcu(&pt->list, head);
 404         spin_unlock(&ptype_lock);
 405 }
 406 EXPORT_SYMBOL(dev_add_pack);
 407
 408 /**
 409  *      __dev_remove_pack        - remove packet handler
 410  *      @pt: packet type declaration
 411  *
 412  *      Remove a protocol handler that was previously added to the kernel
 413  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 414  *      from the kernel lists and can be freed or reused once this function
 415  *      returns.
 416  *
 417  *      The packet type might still be in use by receivers
 418  *      and must not be freed until after all the CPU's have gone
 419  *      through a quiescent state.
 420  */
 421 void __dev_remove_pack(struct packet_type *pt)
 422 {
 423         struct list_head *head = ptype_head(pt);
 424         struct packet_type *pt1;
 425
 426         spin_lock(&ptype_lock);
 427
 428         list_for_each_entry(pt1, head, list) {
 429                 if (pt == pt1) {
 430                         list_del_rcu(&pt->list);
 431                         goto out;
 432                 }
 433         }
 434
 435         printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 436 out:
 437         spin_unlock(&ptype_lock);
 438 }
 439 EXPORT_SYMBOL(__dev_remove_pack);
 440
 441 /**
 442  *      dev_remove_pack  - remove packet handler
 443  *      @pt: packet type declaration
 444  *
 445  *      Remove a protocol handler that was previously added to the kernel
 446  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 447  *      from the kernel lists and can be freed or reused once this function
 448  *      returns.
 449  *
 450  *      This call sleeps to guarantee that no CPU is looking at the packet
 451  *      type after return.
 452  */
 453 void dev_remove_pack(struct packet_type *pt)
 454 {
 455         __dev_remove_pack(pt);
 456
 457         synchronize_net();
 458 }
 459 EXPORT_SYMBOL(dev_remove_pack);
 460
 461 /******************************************************************************
 462
 463                       Device Boot-time Settings Routines
 464
 465 *******************************************************************************/
 466
 467 /* Boot time configuration table */
 468 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 469
 470 /**
 471  *      netdev_boot_setup_add   - add new setup entry
 472  *      @name: name of the device
 473  *      @map: configured settings for the device
 474  *
 475  *      Adds new setup entry to the dev_boot_setup list.  The function
 476  *      returns 0 on error and 1 on success.  This is a generic routine to
 477  *      all netdevices.
 478  */
 479 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 480 {
 481         struct netdev_boot_setup *s;
 482         int i;
 483
 484         s = dev_boot_setup;
 485         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 486                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 487                         memset(s[i].name, 0, sizeof(s[i].name));
 488                         strlcpy(s[i].name, name, IFNAMSIZ);
 489                         memcpy(&s[i].map, map, sizeof(s[i].map));
 490                         break;
 491                 }
 492         }
 493
 494         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 495 }
 496
 497 /**
 498  *      netdev_boot_setup_check - check boot time settings
 499  *      @dev: the netdevice
 500  *
 501  *      Check boot time settings for the device.
 502  *      The found settings are set for the device to be used
 503  *      later in the device probing.
 504  *      Returns 0 if no settings found, 1 if they are.
 505  */
 506 int netdev_boot_setup_check(struct net_device *dev)
 507 {
 508         struct netdev_boot_setup *s = dev_boot_setup;
 509         int i;
 510
 511         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 512                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 513                     !strcmp(dev->name, s[i].name)) {
 514                         dev->irq        = s[i].map.irq;
 515                         dev->base_addr  = s[i].map.base_addr;
 516                         dev->mem_start  = s[i].map.mem_start;
 517                         dev->mem_end    = s[i].map.mem_end;
 518                         return 1;
 519                 }
 520         }
 521         return 0;
 522 }
 523 EXPORT_SYMBOL(netdev_boot_setup_check);
 524
 525
 526 /**
 527  *      netdev_boot_base        - get address from boot time settings
 528  *      @prefix: prefix for network device
 529  *      @unit: id for network device
 530  *
 531  *      Check boot time settings for the base address of device.
 532  *      The found settings are set for the device to be used
 533  *      later in the device probing.
 534  *      Returns 0 if no settings found.
 535  */
 536 unsigned long netdev_boot_base(const char *prefix, int unit)
 537 {
 538         const struct netdev_boot_setup *s = dev_boot_setup;
 539         char name[IFNAMSIZ];
 540         int i;
 541
 542         sprintf(name, "%s%d", prefix, unit);
 543
 544         /*
 545          * If device already registered then return base of 1
 546          * to indicate not to probe for this interface
 547          */
 548         if (__dev_get_by_name(&init_net, name))
 549                 return 1;
 550
 551         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 552                 if (!strcmp(name, s[i].name))
 553                         return s[i].map.base_addr;
 554         return 0;
 555 }
 556
 557 /*
 558  * Saves at boot time configured settings for any netdevice.
 559  */
 560 int __init netdev_boot_setup(char *str)
 561 {
 562         int ints[5];
 563         struct ifmap map;
 564
 565         str = get_options(str, ARRAY_SIZE(ints), ints);
 566         if (!str || !*str)
 567                 return 0;
 568
 569         /* Save settings */
 570         memset(&map, 0, sizeof(map));
 571         if (ints[0] > 0)
 572                 map.irq = ints[1];
 573         if (ints[0] > 1)
 574                 map.base_addr = ints[2];
 575         if (ints[0] > 2)
 576                 map.mem_start = ints[3];
 577         if (ints[0] > 3)
 578                 map.mem_end = ints[4];
 579
 580         /* Add new entry to the list */
 581         return netdev_boot_setup_add(str, &map);
 582 }
 583
 584 __setup("netdev=", netdev_boot_setup);
 585
 586 /*******************************************************************************
 587
 588                             Device Interface Subroutines
 589
 590 *******************************************************************************/
 591
 592 /**
 593  *      __dev_get_by_name       - find a device by its name
 594  *      @net: the applicable net namespace
 595  *      @name: name to find
 596  *
 597  *      Find an interface by name. Must be called under RTNL semaphore
 598  *      or @dev_base_lock. If the name is found a pointer to the device
 599  *      is returned. If the name is not found then %NULL is returned. The
 600  *      reference counters are not incremented so the caller must be
 601  *      careful with locks.
 602  */
 603
 604 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 605 {
 606         struct hlist_node *p;
 607         struct net_device *dev;
 608         struct hlist_head *head = dev_name_hash(net, name);
 609
 610         hlist_for_each_entry(dev, p, head, name_hlist)
 611                 if (!strncmp(dev->name, name, IFNAMSIZ))
 612                         return dev;
 613
 614         return NULL;
 615 }
 616 EXPORT_SYMBOL(__dev_get_by_name);
 617
 618 /**
 619  *      dev_get_by_name_rcu     - find a device by its name
 620  *      @net: the applicable net namespace
 621  *      @name: name to find
 622  *
 623  *      Find an interface by name.
 624  *      If the name is found a pointer to the device is returned.
 625  *      If the name is not found then %NULL is returned.
 626  *      The reference counters are not incremented so the caller must be
 627  *      careful with locks. The caller must hold RCU lock.
 628  */
 629
 630 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 631 {
 632         struct hlist_node *p;
 633         struct net_device *dev;
 634         struct hlist_head *head = dev_name_hash(net, name);
 635
 636         hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 637                 if (!strncmp(dev->name, name, IFNAMSIZ))
 638                         return dev;
 639
 640         return NULL;
 641 }
 642 EXPORT_SYMBOL(dev_get_by_name_rcu);
 643
 644 /**
 645  *      dev_get_by_name         - find a device by its name
 646  *      @net: the applicable net namespace
 647  *      @name: name to find
 648  *
 649  *      Find an interface by name. This can be called from any
 650  *      context and does its own locking. The returned handle has
 651  *      the usage count incremented and the caller must use dev_put() to
 652  *      release it when it is no longer needed. %NULL is returned if no
 653  *      matching device is found.
 654  */
 655
 656 struct net_device *dev_get_by_name(struct net *net, const char *name)
 657 {
 658         struct net_device *dev;
 659
 660         rcu_read_lock();
 661         dev = dev_get_by_name_rcu(net, name);
 662         if (dev)
 663                 dev_hold(dev);
 664         rcu_read_unlock();
 665         return dev;
 666 }
 667 EXPORT_SYMBOL(dev_get_by_name);
 668
 669 /**
 670  *      __dev_get_by_index - find a device by its ifindex
 671  *      @net: the applicable net namespace
 672  *      @ifindex: index of device
 673  *
 674  *      Search for an interface by index. Returns %NULL if the device
 675  *      is not found or a pointer to the device. The device has not
 676  *      had its reference counter increased so the caller must be careful
 677  *      about locking. The caller must hold either the RTNL semaphore
 678  *      or @dev_base_lock.
 679  */
 680
 681 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 682 {
 683         struct hlist_node *p;
 684         struct net_device *dev;
 685         struct hlist_head *head = dev_index_hash(net, ifindex);
 686
 687         hlist_for_each_entry(dev, p, head, index_hlist)
 688                 if (dev->ifindex == ifindex)
 689                         return dev;
 690
 691         return NULL;
 692 }
 693 EXPORT_SYMBOL(__dev_get_by_index);
 694
 695 /**
 696  *      dev_get_by_index_rcu - find a device by its ifindex
 697  *      @net: the applicable net namespace
 698  *      @ifindex: index of device
 699  *
 700  *      Search for an interface by index. Returns %NULL if the device
 701  *      is not found or a pointer to the device. The device has not
 702  *      had its reference counter increased so the caller must be careful
 703  *      about locking. The caller must hold RCU lock.
 704  */
 705
 706 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 707 {
 708         struct hlist_node *p;
 709         struct net_device *dev;
 710         struct hlist_head *head = dev_index_hash(net, ifindex);
 711
 712         hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 713                 if (dev->ifindex == ifindex)
 714                         return dev;
 715
 716         return NULL;
 717 }
 718 EXPORT_SYMBOL(dev_get_by_index_rcu);
 719
 720
 721 /**
 722  *      dev_get_by_index - find a device by its ifindex
 723  *      @net: the applicable net namespace
 724  *      @ifindex: index of device
 725  *
 726  *      Search for an interface by index. Returns NULL if the device
 727  *      is not found or a pointer to the device. The device returned has
 728  *      had a reference added and the pointer is safe until the user calls
 729  *      dev_put to indicate they have finished with it.
 730  */
 731
 732 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 733 {
 734         struct net_device *dev;
 735
 736         rcu_read_lock();
 737         dev = dev_get_by_index_rcu(net, ifindex);
 738         if (dev)
 739                 dev_hold(dev);
 740         rcu_read_unlock();
 741         return dev;
 742 }
 743 EXPORT_SYMBOL(dev_get_by_index);
 744
 745 /**
 746  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 747  *      @net: the applicable net namespace
 748  *      @type: media type of device
 749  *      @ha: hardware address
 750  *
 751  *      Search for an interface by MAC address. Returns NULL if the device
 752  *      is not found or a pointer to the device. The caller must hold RCU
 753  *      The returned device has not had its ref count increased
 754  *      and the caller must therefore be careful about locking
 755  *
 756  */
 757
 758 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 759                                        const char *ha)
 760 {
 761         struct net_device *dev;
 762
 763         for_each_netdev_rcu(net, dev)
 764                 if (dev->type == type &&
 765                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 766                         return dev;
 767
 768         return NULL;
 769 }
 770 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 771
 772 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 773 {
 774         struct net_device *dev;
 775
 776         ASSERT_RTNL();
 777         for_each_netdev(net, dev)
 778                 if (dev->type == type)
 779                         return dev;
 780
 781         return NULL;
 782 }
 783 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 784
 785 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 786 {
 787         struct net_device *dev, *ret = NULL;
 788
 789         rcu_read_lock();
 790         for_each_netdev_rcu(net, dev)
 791                 if (dev->type == type) {
 792                         dev_hold(dev);
 793                         ret = dev;
 794                         break;
 795                 }
 796         rcu_read_unlock();
 797         return ret;
 798 }
 799 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 800
 801 /**
 802  *      dev_get_by_flags_rcu - find any device with given flags
 803  *      @net: the applicable net namespace
 804  *      @if_flags: IFF_* values
 805  *      @mask: bitmask of bits in if_flags to check
 806  *
 807  *      Search for any interface with the given flags. Returns NULL if a device
 808  *      is not found or a pointer to the device. Must be called inside
 809  *      rcu_read_lock(), and result refcount is unchanged.
 810  */
 811
 812 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 813                                     unsigned short mask)
 814 {
 815         struct net_device *dev, *ret;
 816
 817         ret = NULL;
 818         for_each_netdev_rcu(net, dev) {
 819                 if (((dev->flags ^ if_flags) & mask) == 0) {
 820                         ret = dev;
 821                         break;
 822                 }
 823         }
 824         return ret;
 825 }
 826 EXPORT_SYMBOL(dev_get_by_flags_rcu);
 827
 828 /**
 829  *      dev_valid_name - check if name is okay for network device
 830  *      @name: name string
 831  *
 832  *      Network device names need to be valid file names to
 833  *      to allow sysfs to work.  We also disallow any kind of
 834  *      whitespace.
 835  */
 836 int dev_valid_name(const char *name)
 837 {
 838         if (*name == '\0')
 839                 return 0;
 840         if (strlen(name) >= IFNAMSIZ)
 841                 return 0;
 842         if (!strcmp(name, ".") || !strcmp(name, ".."))
 843                 return 0;
 844
 845         while (*name) {
 846                 if (*name == '/' || isspace(*name))
 847                         return 0;
 848                 name++;
 849         }
 850         return 1;
 851 }
 852 EXPORT_SYMBOL(dev_valid_name);
 853
 854 /**
 855  *      __dev_alloc_name - allocate a name for a device
 856  *      @net: network namespace to allocate the device name in
 857  *      @name: name format string
 858  *      @buf:  scratch buffer and result name string
 859  *
 860  *      Passed a format string - eg "lt%d" it will try and find a suitable
 861  *      id. It scans list of devices to build up a free map, then chooses
 862  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 863  *      while allocating the name and adding the device in order to avoid
 864  *      duplicates.
 865  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 866  *      Returns the number of the unit assigned or a negative errno code.
 867  */
 868
 869 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 870 {
 871         int i = 0;
 872         const char *p;
 873         const int max_netdevices = 8*PAGE_SIZE;
 874         unsigned long *inuse;
 875         struct net_device *d;
 876
 877         p = strnchr(name, IFNAMSIZ-1, '%');
 878         if (p) {
 879                 /*
 880                  * Verify the string as this thing may have come from
 881                  * the user.  There must be either one "%d" and no other "%"
 882                  * characters.
 883                  */
 884                 if (p[1] != 'd' || strchr(p + 2, '%'))
 885                         return -EINVAL;
 886
 887                 /* Use one page as a bit array of possible slots */
 888                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 889                 if (!inuse)
 890                         return -ENOMEM;
 891
 892                 for_each_netdev(net, d) {
 893                         if (!sscanf(d->name, name, &i))
 894                                 continue;
 895                         if (i < 0 || i >= max_netdevices)
 896                                 continue;
 897
 898                         /*  avoid cases where sscanf is not exact inverse of printf */
 899                         snprintf(buf, IFNAMSIZ, name, i);
 900                         if (!strncmp(buf, d->name, IFNAMSIZ))
 901                                 set_bit(i, inuse);
 902                 }
 903
 904                 i = find_first_zero_bit(inuse, max_netdevices);
 905                 free_page((unsigned long) inuse);
 906         }
 907
 908         if (buf != name)
 909                 snprintf(buf, IFNAMSIZ, name, i);
 910         if (!__dev_get_by_name(net, buf))
 911                 return i;
 912
 913         /* It is possible to run out of possible slots
 914          * when the name is long and there isn't enough space left
 915          * for the digits, or if all bits are used.
 916          */
 917         return -ENFILE;
 918 }
 919
 920 /**
 921  *      dev_alloc_name - allocate a name for a device
 922  *      @dev: device
 923  *      @name: name format string
 924  *
 925  *      Passed a format string - eg "lt%d" it will try and find a suitable
 926  *      id. It scans list of devices to build up a free map, then chooses
 927  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 928  *      while allocating the name and adding the device in order to avoid
 929  *      duplicates.
 930  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 931  *      Returns the number of the unit assigned or a negative errno code.
 932  */
 933
 934 int dev_alloc_name(struct net_device *dev, const char *name)
 935 {
 936         char buf[IFNAMSIZ];
 937         struct net *net;
 938         int ret;
 939
 940         BUG_ON(!dev_net(dev));
 941         net = dev_net(dev);
 942         ret = __dev_alloc_name(net, name, buf);
 943         if (ret >= 0)
 944                 strlcpy(dev->name, buf, IFNAMSIZ);
 945         return ret;
 946 }
 947 EXPORT_SYMBOL(dev_alloc_name);
 948
 949 static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
 950 {
 951         struct net *net;
 952
 953         BUG_ON(!dev_net(dev));
 954         net = dev_net(dev);
 955
 956         if (!dev_valid_name(name))
 957                 return -EINVAL;
 958
 959         if (fmt && strchr(name, '%'))
 960                 return dev_alloc_name(dev, name);
 961         else if (__dev_get_by_name(net, name))
 962                 return -EEXIST;
 963         else if (dev->name != name)
 964                 strlcpy(dev->name, name, IFNAMSIZ);
 965
 966         return 0;
 967 }
 968
 969 /**
 970  *      dev_change_name - change name of a device
 971  *      @dev: device
 972  *      @newname: name (or format string) must be at least IFNAMSIZ
 973  *
 974  *      Change name of a device, can pass format strings "eth%d".
 975  *      for wildcarding.
 976  */
 977 int dev_change_name(struct net_device *dev, const char *newname)
 978 {
 979         char oldname[IFNAMSIZ];
 980         int err = 0;
 981         int ret;
 982         struct net *net;
 983
 984         ASSERT_RTNL();
 985         BUG_ON(!dev_net(dev));
 986
 987         net = dev_net(dev);
 988         if (dev->flags & IFF_UP)
 989                 return -EBUSY;
 990
 991         if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
 992                 return 0;
 993
 994         memcpy(oldname, dev->name, IFNAMSIZ);
 995
 996         err = dev_get_valid_name(dev, newname, 1);
 997         if (err < 0)
 998                 return err;
 999
1000 rollback:
1001         ret = device_rename(&dev->dev, dev->name);
1002         if (ret) {
1003                 memcpy(dev->name, oldname, IFNAMSIZ);
1004                 return ret;
1005         }
1006
1007         write_lock_bh(&dev_base_lock);
1008         hlist_del(&dev->name_hlist);
1009         write_unlock_bh(&dev_base_lock);
1010
1011         synchronize_rcu();
1012
1013         write_lock_bh(&dev_base_lock);
1014         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1015         write_unlock_bh(&dev_base_lock);
1016
1017         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1018         ret = notifier_to_errno(ret);
1019
1020         if (ret) {
1021                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1022                 if (err >= 0) {
1023                         err = ret;
1024                         memcpy(dev->name, oldname, IFNAMSIZ);
1025                         goto rollback;
1026                 } else {
1027                         printk(KERN_ERR
1028                                "%s: name change rollback failed: %d.\n",
1029                                dev->name, ret);
1030                 }
1031         }
1032
1033         return err;
1034 }
1035
1036 /**
1037  *      dev_set_alias - change ifalias of a device
1038  *      @dev: device
1039  *      @alias: name up to IFALIASZ
1040  *      @len: limit of bytes to copy from info
1041  *
1042  *      Set ifalias for a device,
1043  */
1044 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1045 {
1046         ASSERT_RTNL();
1047
1048         if (len >= IFALIASZ)
1049                 return -EINVAL;
1050
1051         if (!len) {
1052                 if (dev->ifalias) {
1053                         kfree(dev->ifalias);
1054                         dev->ifalias = NULL;
1055                 }
1056                 return 0;
1057         }
1058
1059         dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1060         if (!dev->ifalias)
1061                 return -ENOMEM;
1062
1063         strlcpy(dev->ifalias, alias, len+1);
1064         return len;
1065 }
1066
1067
1068 /**
1069  *      netdev_features_change - device changes features
1070  *      @dev: device to cause notification
1071  *
1072  *      Called to indicate a device has changed features.
1073  */
1074 void netdev_features_change(struct net_device *dev)
1075 {
1076         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1077 }
1078 EXPORT_SYMBOL(netdev_features_change);
1079
1080 /**
1081  *      netdev_state_change - device changes state
1082  *      @dev: device to cause notification
1083  *
1084  *      Called to indicate a device has changed state. This function calls
1085  *      the notifier chains for netdev_chain and sends a NEWLINK message
1086  *      to the routing socket.
1087  */
1088 void netdev_state_change(struct net_device *dev)
1089 {
1090         if (dev->flags & IFF_UP) {
1091                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1092                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1093         }
1094 }
1095 EXPORT_SYMBOL(netdev_state_change);
1096
/*
 * Send @event for @dev to the netdevice notifier chain and hand back the
 * chain's return value unchanged.
 */
int netdev_bonding_change(struct net_device *dev, unsigned long event)
{
	int chain_ret = call_netdevice_notifiers(event, dev);

	return chain_ret;
}
1101 EXPORT_SYMBOL(netdev_bonding_change);
1102
1103 /**
1104  *      dev_load        - load a network module
1105  *      @net: the applicable net namespace
1106  *      @name: name of interface
1107  *
1108  *      If a network interface is not present and the process has suitable
1109  *      privileges this function loads the module. If module loading is not
1110  *      available in this kernel then it becomes a nop.
1111  */
1112
1113 void dev_load(struct net *net, const char *name)
1114 {
1115         struct net_device *dev;
1116
1117         rcu_read_lock();
1118         dev = dev_get_by_name_rcu(net, name);
1119         rcu_read_unlock();
1120
1121         if (!dev && capable(CAP_NET_ADMIN))
1122                 request_module("%s", name);
1123 }
1124 EXPORT_SYMBOL(dev_load);
1125
1126 static int __dev_open(struct net_device *dev)
1127 {
1128         const struct net_device_ops *ops = dev->netdev_ops;
1129         int ret;
1130
1131         ASSERT_RTNL();
1132
1133         /*
1134          *      Is it even present?
1135          */
1136         if (!netif_device_present(dev))
1137                 return -ENODEV;
1138
1139         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1140         ret = notifier_to_errno(ret);
1141         if (ret)
1142                 return ret;
1143
1144         /*
1145          *      Call device private open method
1146          */
1147         set_bit(__LINK_STATE_START, &dev->state);
1148
1149         if (ops->ndo_validate_addr)
1150                 ret = ops->ndo_validate_addr(dev);
1151
1152         if (!ret && ops->ndo_open)
1153                 ret = ops->ndo_open(dev);
1154
1155         /*
1156          *      If it went open OK then:
1157          */
1158
1159         if (ret)
1160                 clear_bit(__LINK_STATE_START, &dev->state);
1161         else {
1162                 /*
1163                  *      Set the flags.
1164                  */
1165                 dev->flags |= IFF_UP;
1166
1167                 /*
1168                  *      Enable NET_DMA
1169                  */
1170                 net_dmaengine_get();
1171
1172                 /*
1173                  *      Initialize multicasting status
1174                  */
1175                 dev_set_rx_mode(dev);
1176
1177                 /*
1178                  *      Wakeup transmit queue engine
1179                  */
1180                 dev_activate(dev);
1181         }
1182
1183         return ret;
1184 }
1185
1186 /**
1187  *      dev_open        - prepare an interface for use.
1188  *      @dev:   device to open
1189  *
1190  *      Takes a device from down to up state. The device's private open
1191  *      function is invoked and then the multicast lists are loaded. Finally
1192  *      the device is moved into the up state and a %NETDEV_UP message is
1193  *      sent to the netdev notifier chain.
1194  *
1195  *      Calling this function on an active interface is a nop. On a failure
1196  *      a negative errno code is returned.
1197  */
1198 int dev_open(struct net_device *dev)
1199 {
1200         int ret;
1201
1202         /*
1203          *      Is it already up?
1204          */
1205         if (dev->flags & IFF_UP)
1206                 return 0;
1207
1208         /*
1209          *      Open device
1210          */
1211         ret = __dev_open(dev);
1212         if (ret < 0)
1213                 return ret;
1214
1215         /*
1216          *      ... and announce new interface.
1217          */
1218         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1219         call_netdevice_notifiers(NETDEV_UP, dev);
1220
1221         return ret;
1222 }
1223 EXPORT_SYMBOL(dev_open);
1224
1225 static int __dev_close_many(struct list_head *head)
1226 {
1227         struct net_device *dev;
1228
1229         ASSERT_RTNL();
1230         might_sleep();
1231
1232         list_for_each_entry(dev, head, unreg_list) {
1233                 /*
1234                  *      Tell people we are going down, so that they can
1235                  *      prepare to death, when device is still operating.
1236                  */
1237                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1238
1239                 clear_bit(__LINK_STATE_START, &dev->state);
1240
1241                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1242                  * can be even on different cpu. So just clear netif_running().
1243                  *
1244                  * dev->stop() will invoke napi_disable() on all of it's
1245                  * napi_struct instances on this device.
1246                  */
1247                 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1248         }
1249
1250         dev_deactivate_many(head);
1251
1252         list_for_each_entry(dev, head, unreg_list) {
1253                 const struct net_device_ops *ops = dev->netdev_ops;
1254
1255                 /*
1256                  *      Call the device specific close. This cannot fail.
1257                  *      Only if device is UP
1258                  *
1259                  *      We allow it to be called even after a DETACH hot-plug
1260                  *      event.
1261                  */
1262                 if (ops->ndo_stop)
1263                         ops->ndo_stop(dev);
1264
1265                 /*
1266                  *      Device is now down.
1267                  */
1268
1269                 dev->flags &= ~IFF_UP;
1270
1271                 /*
1272                  *      Shutdown NET_DMA
1273                  */
1274                 net_dmaengine_put();
1275         }
1276
1277         return 0;
1278 }
1279
1280 static int __dev_close(struct net_device *dev)
1281 {
1282         LIST_HEAD(single);
1283
1284         list_add(&dev->unreg_list, &single);
1285         return __dev_close_many(&single);
1286 }
1287
1288 int dev_close_many(struct list_head *head)
1289 {
1290         struct net_device *dev, *tmp;
1291         LIST_HEAD(tmp_list);
1292
1293         list_for_each_entry_safe(dev, tmp, head, unreg_list)
1294                 if (!(dev->flags & IFF_UP))
1295                         list_move(&dev->unreg_list, &tmp_list);
1296
1297         __dev_close_many(head);
1298
1299         /*
1300          * Tell people we are down
1301          */
1302         list_for_each_entry(dev, head, unreg_list) {
1303                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1304                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1305         }
1306
1307         /* rollback_registered_many needs the complete original list */
1308         list_splice(&tmp_list, head);
1309         return 0;
1310 }
1311
1312 /**
1313  *      dev_close - shutdown an interface.
1314  *      @dev: device to shutdown
1315  *
1316  *      This function moves an active device into down state. A
1317  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1318  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1319  *      chain.
1320  */
1321 int dev_close(struct net_device *dev)
1322 {
1323         LIST_HEAD(single);
1324
1325         list_add(&dev->unreg_list, &single);
1326         dev_close_many(&single);
1327
1328         return 0;
1329 }
1330 EXPORT_SYMBOL(dev_close);
1331
1332
1333 /**
1334  *      dev_disable_lro - disable Large Receive Offload on a device
1335  *      @dev: device
1336  *
1337  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1338  *      called under RTNL.  This is needed if received packets may be
1339  *      forwarded to another interface.
1340  */
1341 void dev_disable_lro(struct net_device *dev)
1342 {
1343         if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1344             dev->ethtool_ops->set_flags) {
1345                 u32 flags = dev->ethtool_ops->get_flags(dev);
1346                 if (flags & ETH_FLAG_LRO) {
1347                         flags &= ~ETH_FLAG_LRO;
1348                         dev->ethtool_ops->set_flags(dev, flags);
1349                 }
1350         }
1351         WARN_ON(dev->features & NETIF_F_LRO);
1352 }
1353 EXPORT_SYMBOL(dev_disable_lro);
1354
1355
/*
 * Starts out as 1. While it is non-zero, register_netdevice_notifier()
 * adds the notifier to the chain but does not replay NETDEV_REGISTER /
 * NETDEV_UP for existing devices.
 * NOTE(review): presumably cleared once boot-time init is done - confirm
 * where.
 */
1356 static int dev_boot_phase = 1;
1357
1358 /*
1359  *      Device change register/unregister. These are not inline or static
1360  *      as we export them to the world.
1361  */
1362
1363 /**
1364  *      register_netdevice_notifier - register a network notifier block
1365  *      @nb: notifier
1366  *
1367  *      Register a notifier to be called when network device events occur.
1368  *      The notifier passed is linked into the kernel structures and must
1369  *      not be reused until it has been unregistered. A negative errno code
1370  *      is returned on a failure.
1371  *
1372  *      When registered all registration and up events are replayed
1373  *      to the new notifier to allow device to have a race free
1374  *      view of the network device list.
1375  */
1376
1377 int register_netdevice_notifier(struct notifier_block *nb)
1378 {
1379         struct net_device *dev;
1380         struct net_device *last;
1381         struct net *net;
1382         int err;
1383
1384         rtnl_lock();
1385         err = raw_notifier_chain_register(&netdev_chain, nb);
1386         if (err)
1387                 goto unlock;
1388         if (dev_boot_phase)
1389                 goto unlock;
1390         for_each_net(net) {
1391                 for_each_netdev(net, dev) {
1392                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1393                         err = notifier_to_errno(err);
1394                         if (err)
1395                                 goto rollback;
1396
1397                         if (!(dev->flags & IFF_UP))
1398                                 continue;
1399
1400                         nb->notifier_call(nb, NETDEV_UP, dev);
1401                 }
1402         }
1403
1404 unlock:
1405         rtnl_unlock();
1406         return err;
1407
1408 rollback:
1409         last = dev;
1410         for_each_net(net) {
1411                 for_each_netdev(net, dev) {
1412                         if (dev == last)
1413                                 break;
1414
1415                         if (dev->flags & IFF_UP) {
1416                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1417                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1418                         }
1419                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1420                         nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1421                 }
1422         }
1423
1424         raw_notifier_chain_unregister(&netdev_chain, nb);
1425         goto unlock;
1426 }
1427 EXPORT_SYMBOL(register_netdevice_notifier);
1428
1429 /**
1430  *      unregister_netdevice_notifier - unregister a network notifier block
1431  *      @nb: notifier
1432  *
1433  *      Unregister a notifier previously registered by
1434  *      register_netdevice_notifier(). The notifier is unlinked into the
1435  *      kernel structures and may then be reused. A negative errno code
1436  *      is returned on a failure.
1437  */
1438
1439 int unregister_netdevice_notifier(struct notifier_block *nb)
1440 {
1441         int err;
1442
1443         rtnl_lock();
1444         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1445         rtnl_unlock();
1446         return err;
1447 }
1448 EXPORT_SYMBOL(unregister_netdevice_notifier);
1449
1450 /**
1451  *      call_netdevice_notifiers - call all network notifier blocks
1452  *      @val: value passed unmodified to notifier function
1453  *      @dev: net_device pointer passed unmodified to notifier function
1454  *
1455  *      Call all network notifier blocks.  Parameters and return value
1456  *      are as for raw_notifier_call_chain().
1457  */
1458
1459 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1460 {
1461         ASSERT_RTNL();
1462         return raw_notifier_call_chain(&netdev_chain, val, dev);
1463 }
1464
1465 /* When > 0 there are consumers of rx skb time stamps */
/*
 * Incremented by net_enable_timestamp() and decremented by
 * net_disable_timestamp(). net_timestamp_set() and net_timestamp_check()
 * read it to decide whether to stamp an skb.
 */
1466 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1467
1468 void net_enable_timestamp(void)
1469 {
1470         atomic_inc(&netstamp_needed);
1471 }
1472 EXPORT_SYMBOL(net_enable_timestamp);
1473
1474 void net_disable_timestamp(void)
1475 {
1476         atomic_dec(&netstamp_needed);
1477 }
1478 EXPORT_SYMBOL(net_disable_timestamp);
1479
1480 static inline void net_timestamp_set(struct sk_buff *skb)
1481 {
1482         if (atomic_read(&netstamp_needed))
1483                 __net_timestamp(skb);
1484         else
1485                 skb->tstamp.tv64 = 0;
1486 }
1487
1488 static inline void net_timestamp_check(struct sk_buff *skb)
1489 {
1490         if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1491                 __net_timestamp(skb);
1492 }
1493
1494 /**
1495  * dev_forward_skb - loopback an skb to another netif
1496  *
1497  * @dev: destination network device
1498  * @skb: buffer to forward
1499  *
1500  * return values:
1501  *      NET_RX_SUCCESS  (no congestion)
1502  *      NET_RX_DROP     (packet was dropped, but freed)
1503  *
1504  * dev_forward_skb can be used for injecting an skb from the
1505  * start_xmit function of one device into the receive queue
1506  * of another device.
1507  *
1508  * The receiving device may be in another namespace, so
1509  * we have to clear all information in the skb that could
1510  * impact namespace isolation.
1511  */
1512 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1513 {
1514         skb_orphan(skb);
1515         nf_reset(skb);
1516
1517         if (unlikely(!(dev->flags & IFF_UP) ||
1518                      (skb->len > (dev->mtu + dev->hard_header_len + VLAN_HLEN)))) {
1519                 atomic_long_inc(&dev->rx_dropped);
1520                 kfree_skb(skb);
1521                 return NET_RX_DROP;
1522         }
1523         skb_set_dev(skb, dev);
1524         skb->tstamp.tv64 = 0;
1525         skb->pkt_type = PACKET_HOST;
1526         skb->protocol = eth_type_trans(skb, dev);
1527         return netif_rx(skb);
1528 }
1529 EXPORT_SYMBOL_GPL(dev_forward_skb);
1530
1531 static inline int deliver_skb(struct sk_buff *skb,
1532                               struct packet_type *pt_prev,
1533                               struct net_device *orig_dev)
1534 {
1535         atomic_inc(&skb->users);
1536         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1537 }
1538
1539 /*
1540  *      Support routine. Sends outgoing frames to any network
1541  *      taps currently in use.
1542  */
1543
1544 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1545 {
1546         struct packet_type *ptype;
1547         struct sk_buff *skb2 = NULL;
1548         struct packet_type *pt_prev = NULL;
1549
1550         rcu_read_lock();
1551         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1552                 /* Never send packets back to the socket
1553                  * they originated from - MvS (miquels@drinkel.ow.org)
1554                  */
1555                 if ((ptype->dev == dev || !ptype->dev) &&
1556                     (ptype->af_packet_priv == NULL ||
1557                      (struct sock *)ptype->af_packet_priv != skb->sk)) {
1558                         if (pt_prev) {
1559                                 deliver_skb(skb2, pt_prev, skb->dev);
1560                                 pt_prev = ptype;
1561                                 continue;
1562                         }
1563
1564                         skb2 = skb_clone(skb, GFP_ATOMIC);
1565                         if (!skb2)
1566                                 break;
1567
1568                         net_timestamp_set(skb2);
1569
1570                         /* skb->nh should be correctly
1571                            set by sender, so that the second statement is
1572                            just protection against buggy protocols.
1573                          */
1574                         skb_reset_mac_header(skb2);
1575
1576                         if (skb_network_header(skb2) < skb2->data ||
1577                             skb2->network_header > skb2->tail) {
1578                                 if (net_ratelimit())
1579                                         printk(KERN_CRIT "protocol %04x is "
1580                                                "buggy, dev %s\n",
1581                                                ntohs(skb2->protocol),
1582                                                dev->name);
1583                                 skb_reset_network_header(skb2);
1584                         }
1585
1586                         skb2->transport_header = skb2->network_header;
1587                         skb2->pkt_type = PACKET_OUTGOING;
1588                         pt_prev = ptype;
1589                 }
1590         }
1591         if (pt_prev)
1592                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1593         rcu_read_unlock();
1594 }
1595
1596 /*
1597  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1598  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1599  */
1600 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1601 {
1602         int rc;
1603
1604         if (txq < 1 || txq > dev->num_tx_queues)
1605                 return -EINVAL;
1606
1607         if (dev->reg_state == NETREG_REGISTERED) {
1608                 ASSERT_RTNL();
1609
1610                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1611                                                   txq);
1612                 if (rc)
1613                         return rc;
1614
1615                 if (txq < dev->real_num_tx_queues)
1616                         qdisc_reset_all_tx_gt(dev, txq);
1617         }
1618
1619         dev->real_num_tx_queues = txq;
1620         return 0;
1621 }
1622 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1623
1624 #ifdef CONFIG_RPS
1625 /**
1626  *      netif_set_real_num_rx_queues - set actual number of RX queues used
1627  *      @dev: Network device
1628  *      @rxq: Actual number of RX queues
1629  *
1630  *      This must be called either with the rtnl_lock held or before
1631  *      registration of the net device.  Returns 0 on success, or a
1632  *      negative error code.  If called before registration, it always
1633  *      succeeds.
1634  */
1635 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1636 {
1637         int rc;
1638
1639         if (rxq < 1 || rxq > dev->num_rx_queues)
1640                 return -EINVAL;
1641
1642         if (dev->reg_state == NETREG_REGISTERED) {
1643                 ASSERT_RTNL();
1644
1645                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1646                                                   rxq);
1647                 if (rc)
1648                         return rc;
1649         }
1650
1651         dev->real_num_rx_queues = rxq;
1652         return 0;
1653 }
1654 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1655 #endif
1656
1657 static inline void __netif_reschedule(struct Qdisc *q)
1658 {
1659         struct softnet_data *sd;
1660         unsigned long flags;
1661
1662         local_irq_save(flags);
1663         sd = &__get_cpu_var(softnet_data);
1664         q->next_sched = NULL;
1665         *sd->output_queue_tailp = q;
1666         sd->output_queue_tailp = &q->next_sched;
1667         raise_softirq_irqoff(NET_TX_SOFTIRQ);
1668         local_irq_restore(flags);
1669 }
1670
1671 void __netif_schedule(struct Qdisc *q)
1672 {
1673         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1674                 __netif_reschedule(q);
1675 }
1676 EXPORT_SYMBOL(__netif_schedule);
1677
1678 void dev_kfree_skb_irq(struct sk_buff *skb)
1679 {
1680         if (atomic_dec_and_test(&skb->users)) {
1681                 struct softnet_data *sd;
1682                 unsigned long flags;
1683
1684                 local_irq_save(flags);
1685                 sd = &__get_cpu_var(softnet_data);
1686                 skb->next = sd->completion_queue;
1687                 sd->completion_queue = skb;
1688                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1689                 local_irq_restore(flags);
1690         }
1691 }
1692 EXPORT_SYMBOL(dev_kfree_skb_irq);
1693
/*
 * Free @skb from any context: use dev_kfree_skb_irq() when in hard irq
 * context or with interrupts disabled, and dev_kfree_skb() otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (!in_irq() && !irqs_disabled()) {
		dev_kfree_skb(skb);
		return;
	}

	dev_kfree_skb_irq(skb);
}
1701 EXPORT_SYMBOL(dev_kfree_skb_any);
1702
1703
1704 /**
1705  * netif_device_detach - mark device as removed
1706  * @dev: network device
1707  *
1708  * Mark device as removed from system and therefore no longer available.
1709  */
1710 void netif_device_detach(struct net_device *dev)
1711 {
1712         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1713             netif_running(dev)) {
1714                 netif_tx_stop_all_queues(dev);
1715         }
1716 }
1717 EXPORT_SYMBOL(netif_device_detach);
1718
1719 /**
1720  * netif_device_attach - mark device as attached
1721  * @dev: network device
1722  *
1723  * Mark device as attached from system and restart if needed.
1724  */
1725 void netif_device_attach(struct net_device *dev)
1726 {
1727         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1728             netif_running(dev)) {
1729                 netif_tx_wake_all_queues(dev);
1730                 __netdev_watchdog_up(dev);
1731         }
1732 }
1733 EXPORT_SYMBOL(netif_device_attach);
1734
1735 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1736 {
1737         return ((features & NETIF_F_GEN_CSUM) ||
1738                 ((features & NETIF_F_V4_CSUM) &&
1739                  protocol == htons(ETH_P_IP)) ||
1740                 ((features & NETIF_F_V6_CSUM) &&
1741                  protocol == htons(ETH_P_IPV6)) ||
1742                 ((features & NETIF_F_FCOE_CRC) &&
1743                  protocol == htons(ETH_P_FCOE)));
1744 }
1745
1746 static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1747 {
1748         __be16 protocol = skb->protocol;
1749         int features = dev->features;
1750
1751         if (vlan_tx_tag_present(skb)) {
1752                 features &= dev->vlan_features;
1753         } else if (protocol == htons(ETH_P_8021Q)) {
1754                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1755                 protocol = veh->h_vlan_encapsulated_proto;
1756                 features &= dev->vlan_features;
1757         }
1758
1759         return can_checksum_protocol(features, protocol);
1760 }
1761
/**
 * skb_set_dev - assign a new device to a buffer
 * @skb: buffer for the new device
 * @dev: network device
 *
 * The dst reference of @skb is always dropped.  If the buffer already
 * belongs to a device living in a different network namespace than @dev,
 * the namespace-private state (secpath, netfilter state, secmark, mark,
 * priority, nf_trace, ipvs_property and, with CONFIG_NET_SCHED, tc_index)
 * is reset before @dev is stored in skb->dev.
 */
#ifdef CONFIG_NET_NS
void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
{
        struct net_device *prev;

        skb_dst_drop(skb);

        prev = skb->dev;
        if (prev && !net_eq(dev_net(prev), dev_net(dev))) {
                /* Crossing namespaces: scrub everything private to the old one. */
                secpath_reset(skb);
                nf_reset(skb);
                skb_init_secmark(skb);
                skb->mark = 0;
                skb->priority = 0;
                skb->nf_trace = 0;
                skb->ipvs_property = 0;
#ifdef CONFIG_NET_SCHED
                skb->tc_index = 0;
#endif
        }

        skb->dev = dev;
}
EXPORT_SYMBOL(skb_set_dev);
#endif /* CONFIG_NET_NS */
1791
1792 /*
1793  * Invalidate hardware checksum when packet is to be mangled, and
1794  * complete checksum manually on outgoing path.
1795  */
1796 int skb_checksum_help(struct sk_buff *skb)
1797 {
1798         __wsum csum;
1799         int ret = 0, offset;
1800
1801         if (skb->ip_summed == CHECKSUM_COMPLETE)
1802                 goto out_set_summed;
1803
1804         if (unlikely(skb_shinfo(skb)->gso_size)) {
1805                 /* Let GSO fix up the checksum. */
1806                 goto out_set_summed;
1807         }
1808
1809         offset = skb_checksum_start_offset(skb);
1810         BUG_ON(offset >= skb_headlen(skb));
1811         csum = skb_checksum(skb, offset, skb->len - offset, 0);
1812
1813         offset += skb->csum_offset;
1814         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1815
1816         if (skb_cloned(skb) &&
1817             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1818                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1819                 if (ret)
1820                         goto out;
1821         }
1822
1823         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1824 out_set_summed:
1825         skb->ip_summed = CHECKSUM_NONE;
1826 out:
1827         return ret;
1828 }
1829 EXPORT_SYMBOL(skb_checksum_help);
1830
1831 /**
1832  *      skb_gso_segment - Perform segmentation on skb.
1833  *      @skb: buffer to segment
1834  *      @features: features for the output path (see dev->features)
1835  *
1836  *      This function segments the given skb and returns a list of segments.
1837  *
1838  *      It may return NULL if the skb requires no segmentation.  This is
1839  *      only possible when GSO is used for verifying header integrity.
      *
      *      Failures are reported as ERR_PTR() values: -EINVAL when a VLAN
      *      header cannot be pulled, the pskb_expand_head() error when the
      *      header cannot be unshared, -EPROTONOSUPPORT when no packet type
      *      with a gso_segment handler matches, or whatever the matching
      *      handler's gso_send_check()/gso_segment() produced.
1840  */
1841 struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1842 {
1843         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1844         struct packet_type *ptype;
1845         __be16 type = skb->protocol;
1846         int vlan_depth = ETH_HLEN;
1847         int err;
1848
             /* Walk through stacked 802.1Q headers to the encapsulated protocol. */
1849         while (type == htons(ETH_P_8021Q)) {
1850                 struct vlan_hdr *vh;
1851
1852                 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1853                         return ERR_PTR(-EINVAL);
1854
1855                 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1856                 type = vh->h_vlan_encapsulated_proto;
1857                 vlan_depth += VLAN_HLEN;
1858         }
1859
             /*
              * Record the MAC header at the current data pointer, derive its
              * length from the network header position and pull it off.
              */
1860         skb_reset_mac_header(skb);
1861         skb->mac_len = skb->network_header - skb->mac_header;
1862         __skb_pull(skb, skb->mac_len);
1863
             /*
              * Anything other than CHECKSUM_PARTIAL is unexpected here: warn
              * once per call with driver name and feature masks, and make sure
              * the header is private before the handlers below touch it.
              */
1864         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1865                 struct net_device *dev = skb->dev;
1866                 struct ethtool_drvinfo info = {};
1867
1868                 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1869                         dev->ethtool_ops->get_drvinfo(dev, &info);
1870
1871                 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1872                      info.driver, dev ? dev->features : 0L,
1873                      skb->sk ? skb->sk->sk_route_caps : 0L,
1874                      skb->len, skb->data_len, skb->ip_summed);
1875
1876                 if (skb_header_cloned(skb) &&
1877                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1878                         return ERR_PTR(err);
1879         }
1880
             /*
              * Find the first protocol handler in the hash bucket that matches
              * the type, is not bound to a device and can segment.
              */
1881         rcu_read_lock();
1882         list_for_each_entry_rcu(ptype,
1883                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1884                 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1885                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
                                     /*
                                      * Let the protocol check the headers first.  Stop
                                      * with segs = ERR_PTR(err) if that failed, or if the
                                      * output path can take the skb unsegmented (err is
                                      * then 0 - the "NULL" result documented above).
                                      */
1886                                 err = ptype->gso_send_check(skb);
1887                                 segs = ERR_PTR(err);
1888                                 if (err || skb_gso_ok(skb, features))
1889                                         break;
                                     /* Move data back to the network header. */
1890                                 __skb_push(skb, (skb->data -
1891                                                  skb_network_header(skb)));
1892                         }
1893                         segs = ptype->gso_segment(skb, features);
1894                         break;
1895                 }
1896         }
1897         rcu_read_unlock();
1898
             /* Restore skb->data to the MAC header before returning. */
1899         __skb_push(skb, skb->data - skb_mac_header(skb));
1900
1901         return segs;
1902 }
1903 EXPORT_SYMBOL(skb_gso_segment);
1904
/*
 * Report a hardware receive checksum failure: rate-limited error message
 * naming the device ("<unknown>" when @dev is NULL) plus a stack dump.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
        if (!net_ratelimit())
                return;

        printk(KERN_ERR "%s: hw csum failure.\n",
               dev ? dev->name : "<unknown>");
        dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
1917
/*
 * Return 1 if @skb has a page fragment that @dev cannot DMA from, else 0.
 * Without CONFIG_HIGHMEM this is always 0.
 *
 * This check could be dropped once it is known that
 *  1. an IOMMU is present and able to map all of memory, and
 *  2. the machine has no real high memory.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
        struct device *parent;
        int idx;

        /* Device without NETIF_F_HIGHDMA: any highmem fragment is illegal. */
        if (!(dev->features & NETIF_F_HIGHDMA)) {
                for (idx = 0; idx < skb_shinfo(skb)->nr_frags; idx++) {
                        if (PageHighMem(skb_shinfo(skb)->frags[idx].page))
                                return 1;
                }
        }

        if (!PCI_DMA_BUS_IS_PHYS)
                return 0;

        parent = dev->dev.parent;
        if (!parent)
                return 0;

        /* Every fragment page must lie entirely below the parent's DMA mask. */
        for (idx = 0; idx < skb_shinfo(skb)->nr_frags; idx++) {
                dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[idx].page);

                if (!parent->dma_mask || addr + PAGE_SIZE - 1 > *parent->dma_mask)
                        return 1;
        }
#endif
        return 0;
}
1947
1948 struct dev_gso_cb {
             /*
              * Destructor the skb had before dev_gso_segment() replaced it;
              * dev_gso_skb_destructor() calls it, if set, after freeing the
              * remaining segments.
              */
1949         void (*destructor)(struct sk_buff *skb);
1950 };
1951
     /* View the control buffer (skb->cb) of @skb as a struct dev_gso_cb. */
1952 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1953
1954 static void dev_gso_skb_destructor(struct sk_buff *skb)
1955 {
1956         struct dev_gso_cb *cb;
1957
1958         do {
1959                 struct sk_buff *nskb = skb->next;
1960
1961                 skb->next = nskb->next;
1962                 nskb->next = NULL;
1963                 kfree_skb(nskb);
1964         } while (skb->next);
1965
1966         cb = DEV_GSO_CB(skb);
1967         if (cb->destructor)
1968                 cb->destructor(skb);
1969 }
1970
1971 /**
1972  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
1973  *      @skb: buffer to segment
1974  *      @features: device features as applicable to this skb
1975  *
1976  *      This function segments the given skb and stores the list of segments
1977  *      in skb->next.
1978  */
1979 static int dev_gso_segment(struct sk_buff *skb, int features)
1980 {
1981         struct sk_buff *segs;
1982
1983         segs = skb_gso_segment(skb, features);
1984
1985         /* Verifying header integrity only. */
1986         if (!segs)
1987                 return 0;
1988
1989         if (IS_ERR(segs))
1990                 return PTR_ERR(segs);
1991
1992         skb->next = segs;
1993         DEV_GSO_CB(skb)->destructor = skb->destructor;
1994         skb->destructor = dev_gso_skb_destructor;
1995
1996         return 0;
1997 }
1998
1999 /*
2000  * Try to orphan skb early, right before transmission by the device.
2001  * We cannot orphan skb if tx timestamp is requested or the sk-reference
2002  * is needed on driver level for other reasons, e.g. see net/can/raw.c
2003  */
2004 static inline void skb_orphan_try(struct sk_buff *skb)
2005 {
2006         struct sock *sk = skb->sk;
2007
2008         if (sk && !skb_shinfo(skb)->tx_flags) {
2009                 /* skb_tx_hash() wont be able to get sk.
2010                  * We copy sk_hash into skb->rxhash
2011                  */
2012                 if (!skb->rxhash)
2013                         skb->rxhash = sk->sk_hash;
2014                 skb_orphan(skb);
2015         }
2016 }
2017
2018 static int harmonize_features(struct sk_buff *skb, __be16 protocol, int features)
2019 {
2020         if (!can_checksum_protocol(protocol, features)) {
2021                 features &= ~NETIF_F_ALL_CSUM;
2022                 features &= ~NETIF_F_SG;
2023         } else if (illegal_highdma(skb->dev, skb)) {
2024                 features &= ~NETIF_F_SG;
2025         }
2026
2027         return features;
2028 }
2029
2030 int netif_skb_features(struct sk_buff *skb)
2031 {
2032         __be16 protocol = skb->protocol;
2033         int features = skb->dev->features;
2034
2035         if (protocol == htons(ETH_P_8021Q)) {
2036                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2037                 protocol = veh->h_vlan_encapsulated_proto;
2038         } else if (!vlan_tx_tag_present(skb)) {
2039                 return harmonize_features(skb, protocol, features);
2040         }
2041
2042         features &= skb->dev->vlan_features;
2043
2044         if (protocol != htons(ETH_P_8021Q)) {
2045                 return harmonize_features(skb, protocol, features);
2046         } else {
2047                 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2048                                 NETIF_F_GEN_CSUM;
2049                 return harmonize_features(skb, protocol, features);
2050         }
2051 }
2052 EXPORT_SYMBOL(netif_skb_features);
2053
2054 /*
2055  * Returns true if either:
2056  *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2057  *      2. skb is fragmented and the device does not support SG, or if
2058  *         at least one of fragments is in highmem and device does not
2059  *         support DMA from it.
2060  */
2061 static inline int skb_needs_linearize(struct sk_buff *skb,
2062                                       int features)
2063 {
2064         return skb_is_nonlinear(skb) &&
2065                         ((skb_has_frag_list(skb) &&
2066                                 !(features & NETIF_F_FRAGLIST)) ||
2067                         (skb_shinfo(skb)->nr_frags &&
2068                                 !(features & NETIF_F_SG)));
2069 }
2070
     /*
      * dev_hard_start_xmit - hand an skb, or its list of GSO segments, to the
      * driver's ndo_start_xmit().
      * @skb: buffer to send; if skb->next is already set on entry the function
      *       goes straight to sending the segments chained there
      * @dev: output device
      * @txq: transmit queue, updated via txq_trans_update() after each
      *       successful driver call and checked for being stopped between
      *       segments
      *
      * Returns the driver's return code, NETDEV_TX_BUSY when the queue stopped
      * with segments still pending, or NETDEV_TX_OK on the paths where the skb
      * is dropped here before reaching the driver (rc keeps its initial value).
      */
2071 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2072                         struct netdev_queue *txq)
2073 {
2074         const struct net_device_ops *ops = dev->netdev_ops;
2075         int rc = NETDEV_TX_OK;
2076
             /* Common case: a single skb with no segment list attached. */
2077         if (likely(!skb->next)) {
2078                 int features;
2079
2080                 /*
2081                  * If device doesnt need skb->dst, release it right now while
2082                  * its hot in this cpu cache
2083                  */
2084                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2085                         skb_dst_drop(skb);
2086
                     /* ptype_all has entries: pass the skb to dev_queue_xmit_nit() first. */
2087                 if (!list_empty(&ptype_all))
2088                         dev_queue_xmit_nit(skb, dev);
2089
2090                 skb_orphan_try(skb);
2091
2092                 features = netif_skb_features(skb);
2093
                     /*
                      * Out-of-band VLAN tag but no NETIF_F_HW_VLAN_TX for this skb:
                      * put the tag into the packet with __vlan_put_tag().
                      * NOTE(review): on a NULL return we leave without freeing
                      * anything here - presumably the helper disposed of the skb;
                      * confirm.
                      */
2094                 if (vlan_tx_tag_present(skb) &&
2095                     !(features & NETIF_F_HW_VLAN_TX)) {
2096                         skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2097                         if (unlikely(!skb))
2098                                 goto out;
2099
2100                         skb->vlan_tci = 0;
2101                 }
2102
2103                 if (netif_needs_gso(skb, features)) {
                             /*
                              * Segment in software.  skb->next stays NULL when
                              * no segment list was produced; the skb is then
                              * sent as is further down.
                              */
2104                         if (unlikely(dev_gso_segment(skb, features)))
2105                                 goto out_kfree_skb;
2106                         if (skb->next)
2107                                 goto gso;
2108                 } else {
2109                         if (skb_needs_linearize(skb, features) &&
2110                             __skb_linearize(skb))
2111                                 goto out_kfree_skb;
2112
2113                         /* If packet is not checksummed and device does not
2114                          * support checksumming for this protocol, complete
2115                          * checksumming here.
2116                          */
2117                         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2118                                 skb_set_transport_header(skb,
2119                                         skb_checksum_start_offset(skb));
2120                                 if (!dev_can_checksum(dev, skb) &&
2121                                      skb_checksum_help(skb))
2122                                         goto out_kfree_skb;
2123                         }
2124                 }
2125
2126                 rc = ops->ndo_start_xmit(skb, dev);
2127                 trace_net_dev_xmit(skb, rc);
2128                 if (rc == NETDEV_TX_OK)
2129                         txq_trans_update(txq);
2130                 return rc;
2131         }
2132
     /* Send the segments chained on skb->next one at a time. */
2133 gso:
2134         do {
2135                 struct sk_buff *nskb = skb->next;
2136
                     /* Unlink the head segment from the list. */
2137                 skb->next = nskb->next;
2138                 nskb->next = NULL;
2139
2140                 /*
2141                  * If device doesnt need nskb->dst, release it right now while
2142                  * its hot in this cpu cache
2143                  */
2144                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2145                         skb_dst_drop(nskb);
2146
2147                 rc = ops->ndo_start_xmit(nskb, dev);
2148                 trace_net_dev_xmit(nskb, rc);
2149                 if (unlikely(rc != NETDEV_TX_OK)) {
                             /*
                              * rc with bits outside NETDEV_TX_MASK: give up on the
                              * whole list.  Otherwise put the segment back at the
                              * head of the list and return rc to the caller.
                              */
2150                         if (rc & ~NETDEV_TX_MASK)
2151                                 goto out_kfree_gso_skb;
2152                         nskb->next = skb->next;
2153                         skb->next = nskb;
2154                         return rc;
2155                 }
2156                 txq_trans_update(txq);
                     /* Queue stopped with segments left: stop here, skb keeps them. */
2157                 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2158                         return NETDEV_TX_BUSY;
2159         } while (skb->next);
2160
2161 out_kfree_gso_skb:
             /*
              * With the list empty, put the saved destructor back so that
              * kfree_skb() does not run dev_gso_skb_destructor(), which
              * dereferences skb->next unconditionally.  With segments still
              * chained the GSO destructor stays in place and frees them.
              */
2162         if (likely(skb->next == NULL))
2163                 skb->destructor = DEV_GSO_CB(skb)->destructor;
2164 out_kfree_skb:
2165         kfree_skb(skb);
2166 out:
2167         return rc;
2168 }
2169
2170 static u32 hashrnd __read_mostly;
2171
2172 /*
2173  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2174  * to be used as a distribution range.
2175  */
2176 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2177                   unsigned int num_tx_queues)
2178 {
2179         u32 hash;
2180
2181         if (skb_rx_queue_recorded(skb)) {
2182                 hash = skb_get_rx_queue(skb);
2183                 while (unlikely(hash >= num_tx_queues))
2184                         hash -= num_tx_queues;
2185                 return hash;
2186         }
2187
2188         if (skb->sk && skb->sk->sk_hash)
2189                 hash = skb->sk->sk_hash;
2190         else
2191                 hash = (__force u16) skb->protocol ^ skb->rxhash;
2192         hash = jhash_1word(hash, hashrnd);
2193
2194         return (u16) (((u64) hash * num_tx_queues) >> 32);
2195 }
2196 EXPORT_SYMBOL(__skb_tx_hash);
2197
2198 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2199 {
2200         if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2201                 if (net_ratelimit()) {
2202                         pr_warning("%s selects TX queue %d, but "
2203                                 "real number of TX queues is %d\n",
2204                                 dev->name, queue_index, dev->real_num_tx_queues);
2205                 }
2206                 return 0;
2207         }
2208         return queue_index;
2209 }
2210
/*
 * Look up a Tx queue for @skb in the XPS map of the current CPU.
 *
 * Returns the queue index, or -1 when XPS is not configured, the device or
 * the CPU has no map, or the mapped queue is not below
 * dev->real_num_tx_queues.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
        struct xps_dev_maps *maps;
        struct xps_map *cpu_map;
        int txq = -1;

        rcu_read_lock();

        maps = rcu_dereference(dev->xps_maps);
        if (!maps)
                goto unlock;

        cpu_map = rcu_dereference(maps->cpu_map[raw_smp_processor_id()]);
        if (!cpu_map)
                goto unlock;

        if (cpu_map->len == 1) {
                txq = cpu_map->queues[0];
        } else {
                /* Several candidates: spread flows over them by hash. */
                u32 key;

                if (skb->sk && skb->sk->sk_hash)
                        key = skb->sk->sk_hash;
                else
                        key = (__force u16) skb->protocol ^ skb->rxhash;
                key = jhash_1word(key, hashrnd);
                txq = cpu_map->queues[((u64)key * cpu_map->len) >> 32];
        }

        if (unlikely(txq >= dev->real_num_tx_queues))
                txq = -1;

unlock:
        rcu_read_unlock();

        return txq;
#else
        return -1;
#endif
}
2248
2249 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2250                                         struct sk_buff *skb)
2251 {
2252         int queue_index;
2253         const struct net_device_ops *ops = dev->netdev_ops;
2254
2255         if (dev->real_num_tx_queues == 1)
2256                 queue_index = 0;
2257         else if (ops->ndo_select_queue) {
2258                 queue_index = ops->ndo_select_queue(dev, skb);
2259                 queue_index = dev_cap_txqueue(dev, queue_index);
2260         } else {
2261                 struct sock *sk = skb->sk;
2262                 queue_index = sk_tx_queue_get(sk);
2263
2264                 if (queue_index < 0 || skb->ooo_okay ||
2265                     queue_index >= dev->real_num_tx_queues) {
2266                         int old_index = queue_index;
2267
2268                         queue_index = get_xps_queue(dev, skb);
2269                         if (queue_index < 0)
2270                                 queue_index = skb_tx_hash(dev, skb);
2271
2272                         if (queue_index != old_index && sk) {
2273                                 struct dst_entry *dst =
2274                                     rcu_dereference_check(sk->sk_dst_cache, 1);
2275
2276                                 if (dst && skb_dst(skb) == dst)
2277                                         sk_tx_queue_set(sk, queue_index);
2278                         }
2279                 }
2280         }
2281
2282         skb_set_queue_mapping(skb, queue_index);
2283         return netdev_get_tx_queue(dev, queue_index);
2284 }
2285
     /*
      * __dev_xmit_skb - pass @skb to qdisc @q, transmitting directly when possible.
      * @skb: buffer to send
      * @q: qdisc attached to @txq
      * @dev: output device
      * @txq: transmit queue
      *
      * Runs under the qdisc root lock.  Returns NET_XMIT_DROP (skb freed) when
      * the qdisc is deactivated, NET_XMIT_SUCCESS when the skb went through the
      * direct-transmit path, and otherwise the result of qdisc_enqueue_root().
      */
2286 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2287                                  struct net_device *dev,
2288                                  struct netdev_queue *txq)
2289 {
2290         spinlock_t *root_lock = qdisc_lock(q);
2291         bool contended = qdisc_is_running(q);
2292         int rc;
2293
2294         /*
2295          * Heuristic to force contended enqueues to serialize on a
2296          * separate lock before trying to get qdisc main lock.
2297          * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2298          * and dequeue packets faster.
2299          */
2300         if (unlikely(contended))
2301                 spin_lock(&q->busylock);
2302
2303         spin_lock(root_lock);
2304         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2305                 kfree_skb(skb);
2306                 rc = NET_XMIT_DROP;
2307         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2308                    qdisc_run_begin(q)) {
2309                 /*
2310                  * This is a work-conserving queue; there are no old skbs
2311                  * waiting to be sent out; and the qdisc is not running -
2312                  * xmit the skb directly.
2313                  */
2314                 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2315                         skb_dst_force(skb);
2316                 __qdisc_update_bstats(q, skb->len);
                     /*
                      * Non-zero from sch_direct_xmit(): keep running the qdisc,
                      * letting go of busylock first so waiting enqueuers can
                      * proceed.  Zero: just end the run started above.
                      */
2317                 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2318                         if (unlikely(contended)) {
2319                                 spin_unlock(&q->busylock);
2320                                 contended = false;
2321                         }
2322                         __qdisc_run(q);
2323                 } else
2324                         qdisc_run_end(q);
2325
2326                 rc = NET_XMIT_SUCCESS;
2327         } else {
                     /* Regular path: enqueue, then run the qdisc if nobody else is. */
2328                 skb_dst_force(skb);
2329                 rc = qdisc_enqueue_root(skb, q);
2330                 if (qdisc_run_begin(q)) {
2331                         if (unlikely(contended)) {
2332                                 spin_unlock(&q->busylock);
2333                                 contended = false;
2334                         }
2335                         __qdisc_run(q);
2336                 }
2337         }
2338         spin_unlock(root_lock);
             /* busylock is still held only if it was not dropped above. */
2339         if (unlikely(contended))
2340                 spin_unlock(&q->busylock);
2341         return rc;
2342 }
2343
2344 static DEFINE_PER_CPU(int, xmit_recursion);
     /*
      * Per-CPU nesting depth of dev_hard_start_xmit() calls made from
      * dev_queue_xmit() for queueless devices, and the depth beyond which a
      * further transmit is refused.
      */
2345 #define RECURSION_LIMIT 10
2346
2347 /**
2348  *      dev_queue_xmit - transmit a buffer
2349  *      @skb: buffer to transmit
2350  *
2351  *      Queue a buffer for transmission to a network device. The caller must
2352  *      have set the device and priority and built the buffer before calling
2353  *      this function. The function can be called from an interrupt.
2354  *
2355  *      A negative errno code is returned on a failure. A success does not
2356  *      guarantee the frame will be transmitted as it may be dropped due
2357  *      to congestion or traffic shaping.
2358  *
2359  * -----------------------------------------------------------------------------------
2360  *      I notice this method can also return errors from the queue disciplines,
2361  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2362  *      be positive.
2363  *
2364  *      Regardless of the return value, the skb is consumed, so it is currently
2365  *      difficult to retry a send to this method.  (You can bump the ref count
2366  *      before sending to hold a reference for retry if you are careful.)
2367  *
2368  *      When calling this method, interrupts MUST be enabled.  This is because
2369  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2370  *          --BLG
2371  */
2372 int dev_queue_xmit(struct sk_buff *skb)
2373 {
2374         struct net_device *dev = skb->dev;
2375         struct netdev_queue *txq;
2376         struct Qdisc *q;
             /*
              * The initial value is overwritten on every path below, so
              * -ENOMEM itself is never what this function returns.
              */
2377         int rc = -ENOMEM;
2378
2379         /* Disable soft irqs for various locks below. Also
2380          * stops preemption for RCU.
2381          */
2382         rcu_read_lock_bh();
2383
2384         txq = dev_pick_tx(dev, skb);
2385         q = rcu_dereference_bh(txq->qdisc);
2386
2387 #ifdef CONFIG_NET_CLS_ACT
2388         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2389 #endif
2390         trace_net_dev_queue(skb);
             /* A qdisc with an enqueue operation takes the skb from here. */
2391         if (q->enqueue) {
2392                 rc = __dev_xmit_skb(skb, q, dev, txq);
2393                 goto out;
2394         }
2395
2396         /* The device has no queue. Common case for software devices:
2397            loopback, all the sorts of tunnels...
2398
2399            Really, it is unlikely that netif_tx_lock protection is necessary
2400            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2401            counters.)
2402            However, it is possible, that they rely on protection
2403            made by us here.
2404
2405            Check this and shot the lock. It is not prone from deadlocks.
2406            Either shot noqueue qdisc, it is even simpler 8)
2407          */
2408         if (dev->flags & IFF_UP) {
2409                 int cpu = smp_processor_id(); /* ok because BHs are off */
2410
                     /*
                      * This CPU already owning the queue's xmit lock means we were
                      * re-entered from within a transmit on the same queue; taking
                      * the lock again is avoided and the packet is dropped below.
                      */
2411                 if (txq->xmit_lock_owner != cpu) {
2412
2413                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2414                                 goto recursion_alert;
2415
2416                         HARD_TX_LOCK(dev, txq, cpu);
2417
2418                         if (!netif_tx_queue_stopped(txq)) {
2419                                 __this_cpu_inc(xmit_recursion);
2420                                 rc = dev_hard_start_xmit(skb, dev, txq);
2421                                 __this_cpu_dec(xmit_recursion);
2422                                 if (dev_xmit_complete(rc)) {
2423                                         HARD_TX_UNLOCK(dev, txq);
2424                                         goto out;
2425                                 }
2426                         }
                             /*
                              * Queue stopped, or the transmit did not complete: with
                              * no qdisc there is nowhere to park the packet.
                              */
2427                         HARD_TX_UNLOCK(dev, txq);
2428                         if (net_ratelimit())
2429                                 printk(KERN_CRIT "Virtual device %s asks to "
2430                                        "queue packet!\n", dev->name);
2431                 } else {
2432                         /* Recursion is detected! It is possible,
2433                          * unfortunately
2434                          */
2435 recursion_alert:
2436                         if (net_ratelimit())
2437                                 printk(KERN_CRIT "Dead loop on virtual device "
2438                                        "%s, fix it urgently!\n", dev->name);
2439                 }
2440         }
2441
             /*
              * Reached when the device is down or the queueless transmit above
              * failed: report -ENETDOWN and free the skb.
              */
2442         rc = -ENETDOWN;
2443         rcu_read_unlock_bh();
2444
2445         kfree_skb(skb);
2446         return rc;
2447 out:
2448         rcu_read_unlock_bh();
2449         return rc;
2450 }
2451 EXPORT_SYMBOL(dev_queue_xmit);
2452
2453
2454 /*=======================================================================
2455                         Receiver routines
2456   =======================================================================*/
2457
     /*
      * Global tunables with their built-in defaults.  None of them is read
      * in this part of the file.
      * NOTE(review): presumably these are receive-path limits that can be
      * adjusted at run time elsewhere - confirm against their users.
      */
2458 int netdev_max_backlog __read_mostly = 1000;
2459 int netdev_tstamp_prequeue __read_mostly = 1;
2460 int netdev_budget __read_mostly = 300;
2461 int weight_p __read_mostly = 64;            /* old backlog weight */
2462
2463 /* Called with irq disabled */
2464 static inline void ____napi_schedule(struct softnet_data *sd,
2465                                      struct napi_struct *napi)
2466 {
2467         list_add_tail(&napi->poll_list, &sd->poll_list);
2468         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2469 }
2470
2471 /*
2472  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2473  * and src/dst port numbers. Returns a non-zero hash number on success
2474  * and 0 on failure.
2475  */
2476 __u32 __skb_get_rxhash(struct sk_buff *skb)
2477 {
2478         int nhoff, hash = 0, poff;
2479         struct ipv6hdr *ip6;
2480         struct iphdr *ip;
2481         u8 ip_proto;
2482         u32 addr1, addr2, ihl;
2483         union {
2484                 u32 v32;
2485                 u16 v16[2];
2486         } ports;
2487
2488         nhoff = skb_network_offset(skb);
2489
2490         switch (skb->protocol) {
2491         case __constant_htons(ETH_P_IP):
2492                 if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2493                         goto done;
2494
2495                 ip = (struct iphdr *) (skb->data + nhoff);
2496                 if (ip->frag_off & htons(IP_MF | IP_OFFSET))
2497                         ip_proto = 0;
2498                 else
2499                         ip_proto = ip->protocol;
2500                 addr1 = (__force u32) ip->saddr;
2501                 addr2 = (__force u32) ip->daddr;
2502                 ihl = ip->ihl;
2503                 break;
2504         case __constant_htons(ETH_P_IPV6):
2505                 if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2506                         goto done;
2507
2508                 ip6 = (struct ipv6hdr *) (skb->data + nhoff);
2509                 ip_proto = ip6->nexthdr;
2510                 addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2511                 addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2512                 ihl = (40 >> 2);
2513                 break;
2514         default:
2515                 goto done;
2516         }
2517
2518         ports.v32 = 0;
2519         poff = proto_ports_offset(ip_proto);
2520         if (poff >= 0) {
2521                 nhoff += ihl * 4 + poff;
2522                 if (pskb_may_pull(skb, nhoff + 4)) {
2523                         ports.v32 = * (__force u32 *) (skb->data + nhoff);
2524                         if (ports.v16[1] < ports.v16[0])
2525                                 swap(ports.v16[0], ports.v16[1]);
2526                 }
2527         }
2528
2529         /* get a consistent hash (same value on both flow directions) */
2530         if (addr2 < addr1)
2531                 swap(addr1, addr2);
2532
2533         hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2534         if (!hash)
2535                 hash = 1;
2536
2537 done:
2538         return hash;
2539 }
2540 EXPORT_SYMBOL(__skb_get_rxhash);
2541
2542 #ifdef CONFIG_RPS
2543
2544 /* One global table that all flow-based protocols share. */
     /* The pointer is RCU-annotated (__rcu) and exported to other modules. */
2545 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2546 EXPORT_SYMBOL(rps_sock_flow_table);
2547
2548 /*
2549  * get_rps_cpu is called from netif_receive_skb and returns the target
2550  * CPU from the RPS map of the receiving queue for a given skb.
2551  * rcu_read_lock must be held on entry.
2552  */
2553 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2554                        struct rps_dev_flow **rflowp)
2555 {
2556         struct netdev_rx_queue *rxqueue;
2557         struct rps_map *map;
2558         struct rps_dev_flow_table *flow_table;
2559         struct rps_sock_flow_table *sock_flow_table;
2560         int cpu = -1;
2561         u16 tcpu;
2562
2563         if (skb_rx_queue_recorded(skb)) {
2564                 u16 index = skb_get_rx_queue(skb);
2565                 if (unlikely(index >= dev->real_num_rx_queues)) {
2566                         WARN_ONCE(dev->real_num_rx_queues > 1,
2567                                   "%s received packet on queue %u, but number "
2568                                   "of RX queues is %u\n",
2569                                   dev->name, index, dev->real_num_rx_queues);
2570                         goto done;
2571                 }
2572                 rxqueue = dev->_rx + index;
2573         } else
2574                 rxqueue = dev->_rx;
2575
2576         map = rcu_dereference(rxqueue->rps_map);
2577         if (map) {
2578                 if (map->len == 1) {
2579                         tcpu = map->cpus[0];
2580                         if (cpu_online(tcpu))
2581                                 cpu = tcpu;
2582                         goto done;
2583                 }
2584         } else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
2585                 goto done;
2586         }
2587
2588         skb_reset_network_header(skb);
2589         if (!skb_get_rxhash(skb))
2590                 goto done;
2591
2592         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2593         sock_flow_table = rcu_dereference(rps_sock_flow_table);
2594         if (flow_table && sock_flow_table) {
2595                 u16 next_cpu;
2596                 struct rps_dev_flow *rflow;
2597
2598                 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2599                 tcpu = rflow->cpu;
2600
2601                 next_cpu = sock_flow_table->ents[skb->rxhash &
2602                     sock_flow_table->mask];
2603
2604                 /*
2605                  * If the desired CPU (where last recvmsg was done) is
2606                  * different from current CPU (one in the rx-queue flow
2607                  * table entry), switch if one of the following holds:
2608                  *   - Current CPU is unset (equal to RPS_NO_CPU).
2609                  *   - Current CPU is offline.
2610                  *   - The current CPU's queue tail has advanced beyond the
2611                  *     last packet that was enqueued using this table entry.
2612                  *     This guarantees that all previous packets for the flow
2613                  *     have been dequeued, thus preserving in order delivery.
2614                  */
2615                 if (unlikely(tcpu != next_cpu) &&
2616                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2617                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2618                       rflow->last_qtail)) >= 0)) {
2619                         tcpu = rflow->cpu = next_cpu;
2620                         if (tcpu != RPS_NO_CPU)
2621                                 rflow->last_qtail = per_cpu(softnet_data,
2622                                     tcpu).input_queue_head;
2623                 }
2624                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2625                         *rflowp = rflow;
2626                         cpu = tcpu;
2627                         goto done;
2628                 }
2629         }
2630
2631         if (map) {
2632                 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2633
2634                 if (cpu_online(tcpu)) {
2635                         cpu = tcpu;
2636                         goto done;
2637                 }
2638         }
2639
2640 done:
2641         return cpu;
2642 }
2643
2644 /* Called from hardirq (IPI) context */
2645 static void rps_trigger_softirq(void *data)
2646 {
2647         struct softnet_data *sd = data;
2648
2649         ____napi_schedule(sd, &sd->backlog);
2650         sd->received_rps++;
2651 }
2652
2653 #endif /* CONFIG_RPS */
2654
/*
 * If @sd belongs to another CPU, chain it onto this CPU's IPI list, raise
 * NET_RX_SOFTIRQ and return 1.  Return 0 when @sd is this CPU's own
 * structure (or when RPS is compiled out).
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local = &__get_cpu_var(softnet_data);

	if (sd == local)
		return 0;

	/* Push sd at the head of our pending-IPI list. */
	sd->rps_ipi_next = local->rps_ipi_list;
	local->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
2675
2676 /*
2677  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2678  * queue (may be a remote CPU queue).
2679  */
2680 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2681                               unsigned int *qtail)
2682 {
2683         struct softnet_data *sd;
2684         unsigned long flags;
2685
2686         sd = &per_cpu(softnet_data, cpu);
2687
2688         local_irq_save(flags);
2689
2690         rps_lock(sd);
2691         if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2692                 if (skb_queue_len(&sd->input_pkt_queue)) {
2693 enqueue:
2694                         __skb_queue_tail(&sd->input_pkt_queue, skb);
2695                         input_queue_tail_incr_save(sd, qtail);
2696                         rps_unlock(sd);
2697                         local_irq_restore(flags);
2698                         return NET_RX_SUCCESS;
2699                 }
2700
2701                 /* Schedule NAPI for backlog device
2702                  * We can use non atomic operation since we own the queue lock
2703                  */
2704                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2705                         if (!rps_ipi_queued(sd))
2706                                 ____napi_schedule(sd, &sd->backlog);
2707                 }
2708                 goto enqueue;
2709         }
2710
2711         sd->dropped++;
2712         rps_unlock(sd);
2713
2714         local_irq_restore(flags);
2715
2716         atomic_long_inc(&skb->dev->rx_dropped);
2717         kfree_skb(skb);
2718         return NET_RX_DROP;
2719 }
2720
2721 /**
2722  *      netif_rx        -       post buffer to the network code
2723  *      @skb: buffer to post
2724  *
2725  *      This function receives a packet from a device driver and queues it for
2726  *      the upper (protocol) levels to process.  It always succeeds. The buffer
2727  *      may be dropped during processing for congestion control or by the
2728  *      protocol layers.
2729  *
2730  *      return values:
2731  *      NET_RX_SUCCESS  (no congestion)
2732  *      NET_RX_DROP     (packet was dropped)
2733  *
2734  */
2735
2736 int netif_rx(struct sk_buff *skb)
2737 {
2738         int ret;
2739
2740         /* if netpoll wants it, pretend we never saw it */
2741         if (netpoll_rx(skb))
2742                 return NET_RX_DROP;
2743
2744         if (netdev_tstamp_prequeue)
2745                 net_timestamp_check(skb);
2746
2747         trace_netif_rx(skb);
2748 #ifdef CONFIG_RPS
2749         {
2750                 struct rps_dev_flow voidflow, *rflow = &voidflow;
2751                 int cpu;
2752
2753                 preempt_disable();
2754                 rcu_read_lock();
2755
2756                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
2757                 if (cpu < 0)
2758                         cpu = smp_processor_id();
2759
2760                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2761
2762                 rcu_read_unlock();
2763                 preempt_enable();
2764         }
2765 #else
2766         {
2767                 unsigned int qtail;
2768                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2769                 put_cpu();
2770         }
2771 #endif
2772         return ret;
2773 }
2774 EXPORT_SYMBOL(netif_rx);
2775
/*
 * netif_rx() wrapper that, with preemption disabled, runs any softirq
 * left pending after the packet has been queued.  Returns netif_rx()'s
 * result.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int rc;

	preempt_disable();

	rc = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return rc;
}
EXPORT_SYMBOL(netif_rx_ni);
2789
2790 static void net_tx_action(struct softirq_action *h)
2791 {
2792         struct softnet_data *sd = &__get_cpu_var(softnet_data);
2793
2794         if (sd->completion_queue) {
2795                 struct sk_buff *clist;
2796
2797                 local_irq_disable();
2798                 clist = sd->completion_queue;
2799                 sd->completion_queue = NULL;
2800                 local_irq_enable();
2801
2802                 while (clist) {
2803                         struct sk_buff *skb = clist;
2804                         clist = clist->next;
2805
2806                         WARN_ON(atomic_read(&skb->users));
2807                         trace_kfree_skb(skb, net_tx_action);
2808                         __kfree_skb(skb);
2809                 }
2810         }
2811
2812         if (sd->output_queue) {
2813                 struct Qdisc *head;
2814
2815                 local_irq_disable();
2816                 head = sd->output_queue;
2817                 sd->output_queue = NULL;
2818                 sd->output_queue_tailp = &sd->output_queue;
2819                 local_irq_enable();
2820
2821                 while (head) {
2822                         struct Qdisc *q = head;
2823                         spinlock_t *root_lock;
2824
2825                         head = head->next_sched;
2826
2827                         root_lock = qdisc_lock(q);
2828                         if (spin_trylock(root_lock)) {
2829                                 smp_mb__before_clear_bit();
2830                                 clear_bit(__QDISC_STATE_SCHED,
2831                                           &q->state);
2832                                 qdisc_run(q);
2833                                 spin_unlock(root_lock);
2834                         } else {
2835                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
2836                                               &q->state)) {
2837                                         __netif_reschedule(q);
2838                                 } else {
2839                                         smp_mb__before_clear_bit();
2840                                         clear_bit(__QDISC_STATE_SCHED,
2841                                                   &q->state);
2842                                 }
2843                         }
2844                 }
2845         }
2846 }
2847
2848 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2849     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
2850 /* This hook is defined here for ATM LANE */
2851 int (*br_fdb_test_addr_hook)(struct net_device *dev,
2852                              unsigned char *addr) __read_mostly;
2853 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2854 #endif
2855
2856 #ifdef CONFIG_NET_CLS_ACT
2857 /* TODO: Maybe we should just force sch_ingress to be compiled in
2858  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
2859  * a compare and 2 stores extra right now if we dont have it on
2860  * but have CONFIG_NET_CLS_ACT
2861  * NOTE: This doesnt stop any functionality; if you dont have
2862  * the ingress scheduler, you just cant add policies on ingress.
2863  *
2864  */
2865 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
2866 {
2867         struct net_device *dev = skb->dev;
2868         u32 ttl = G_TC_RTTL(skb->tc_verd);
2869         int result = TC_ACT_OK;
2870         struct Qdisc *q;
2871
2872         if (unlikely(MAX_RED_LOOP < ttl++)) {
2873                 if (net_ratelimit())
2874                         pr_warning( "Redir loop detected Dropping packet (%d->%d)\n",
2875                                skb->skb_iif, dev->ifindex);
2876                 return TC_ACT_SHOT;
2877         }
2878
2879         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2880         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2881
2882         q = rxq->qdisc;
2883         if (q != &noop_qdisc) {
2884                 spin_lock(qdisc_lock(q));
2885                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2886                         result = qdisc_enqueue_root(skb, q);
2887                 spin_unlock(qdisc_lock(q));
2888         }
2889
2890         return result;
2891 }
2892
2893 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2894                                          struct packet_type **pt_prev,
2895                                          int *ret, struct net_device *orig_dev)
2896 {
2897         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
2898
2899         if (!rxq || rxq->qdisc == &noop_qdisc)
2900                 goto out;
2901
2902         if (*pt_prev) {
2903                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2904                 *pt_prev = NULL;
2905         }
2906
2907         switch (ing_filter(skb, rxq)) {
2908         case TC_ACT_SHOT:
2909         case TC_ACT_STOLEN:
2910                 kfree_skb(skb);
2911                 return NULL;
2912         }
2913
2914 out:
2915         skb->tc_verd = 0;
2916         return skb;
2917 }
2918 #endif
2919
2920 /**
2921  *      netdev_rx_handler_register - register receive handler
2922  *      @dev: device to register a handler for
2923  *      @rx_handler: receive handler to register
2924  *      @rx_handler_data: data pointer that is used by rx handler
2925  *
2926  *      Register a receive hander for a device. This handler will then be
2927  *      called from __netif_receive_skb. A negative errno code is returned
2928  *      on a failure.
2929  *
2930  *      The caller must hold the rtnl_mutex.
2931  */
2932 int netdev_rx_handler_register(struct net_device *dev,
2933                                rx_handler_func_t *rx_handler,
2934                                void *rx_handler_data)
2935 {
2936         ASSERT_RTNL();
2937
2938         if (dev->rx_handler)
2939                 return -EBUSY;
2940
2941         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
2942         rcu_assign_pointer(dev->rx_handler, rx_handler);
2943
2944         return 0;
2945 }
2946 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
2947
2948 /**
2949  *      netdev_rx_handler_unregister - unregister receive handler
2950  *      @dev: device to unregister a handler from
2951  *
2952  *      Unregister a receive hander from a device.
2953  *
2954  *      The caller must hold the rtnl_mutex.
2955  */
2956 void netdev_rx_handler_unregister(struct net_device *dev)
2957 {
2958
2959         ASSERT_RTNL();
2960         rcu_assign_pointer(dev->rx_handler, NULL);
2961         rcu_assign_pointer(dev->rx_handler_data, NULL);
2962 }
2963 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
2964
2965 static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
2966                                               struct net_device *master)
2967 {
2968         if (skb->pkt_type == PACKET_HOST) {
2969                 u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
2970
2971                 memcpy(dest, master->dev_addr, ETH_ALEN);
2972         }
2973 }
2974
2975 /* On bonding slaves other than the currently active slave, suppress
2976  * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
2977  * ARP on active-backup slaves with arp_validate enabled.
2978  */
2979 int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
2980 {
2981         struct net_device *dev = skb->dev;
2982
2983         if (master->priv_flags & IFF_MASTER_ARPMON)
2984                 dev->last_rx = jiffies;
2985
2986         if ((master->priv_flags & IFF_MASTER_ALB) &&
2987             (master->priv_flags & IFF_BRIDGE_PORT)) {
2988                 /* Do address unmangle. The local destination address
2989                  * will be always the one master has. Provides the right
2990                  * functionality in a bridge.
2991                  */
2992                 skb_bond_set_mac_by_master(skb, master);
2993         }
2994
2995         if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
2996                 if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
2997                     skb->protocol == __cpu_to_be16(ETH_P_ARP))
2998                         return 0;
2999
3000                 if (master->priv_flags & IFF_MASTER_ALB) {
3001                         if (skb->pkt_type != PACKET_BROADCAST &&
3002                             skb->pkt_type != PACKET_MULTICAST)
3003                                 return 0;
3004                 }
3005                 if (master->priv_flags & IFF_MASTER_8023AD &&
3006                     skb->protocol == __cpu_to_be16(ETH_P_SLOW))
3007                         return 0;
3008
3009                 return 1;
3010         }
3011         return 0;
3012 }
3013 EXPORT_SYMBOL(__skb_bond_should_drop);
3014
3015 static int __netif_receive_skb(struct sk_buff *skb)
3016 {
3017         struct packet_type *ptype, *pt_prev;
3018         rx_handler_func_t *rx_handler;
3019         struct net_device *orig_dev;
3020         struct net_device *master;
3021         struct net_device *null_or_orig;
3022         struct net_device *orig_or_bond;
3023         int ret = NET_RX_DROP;
3024         __be16 type;
3025
3026         if (!netdev_tstamp_prequeue)
3027                 net_timestamp_check(skb);
3028
3029         trace_netif_receive_skb(skb);
3030
3031         /* if we've gotten here through NAPI, check netpoll */
3032         if (netpoll_receive_skb(skb))
3033                 return NET_RX_DROP;
3034
3035         if (!skb->skb_iif)
3036                 skb->skb_iif = skb->dev->ifindex;
3037
3038         /*
3039          * bonding note: skbs received on inactive slaves should only
3040          * be delivered to pkt handlers that are exact matches.  Also
3041          * the deliver_no_wcard flag will be set.  If packet handlers
3042          * are sensitive to duplicate packets these skbs will need to
3043          * be dropped at the handler.
3044          */
3045         null_or_orig = NULL;
3046         orig_dev = skb->dev;
3047         master = ACCESS_ONCE(orig_dev->master);
3048         if (skb->deliver_no_wcard)
3049                 null_or_orig = orig_dev;
3050         else if (master) {
3051                 if (skb_bond_should_drop(skb, master)) {
3052                         skb->deliver_no_wcard = 1;
3053                         null_or_orig = orig_dev; /* deliver only exact match */
3054                 } else
3055                         skb->dev = master;
3056         }
3057
3058         __this_cpu_inc(softnet_data.processed);
3059         skb_reset_network_header(skb);
3060         skb_reset_transport_header(skb);
3061         skb->mac_len = skb->network_header - skb->mac_header;
3062
3063         pt_prev = NULL;
3064
3065         rcu_read_lock();
3066
3067 #ifdef CONFIG_NET_CLS_ACT
3068         if (skb->tc_verd & TC_NCLS) {
3069                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3070                 goto ncls;
3071         }
3072 #endif
3073
3074         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3075                 if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
3076                     ptype->dev == orig_dev) {
3077                         if (pt_prev)
3078                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3079                         pt_prev = ptype;
3080                 }
3081         }
3082
3083 #ifdef CONFIG_NET_CLS_ACT
3084         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3085         if (!skb)
3086                 goto out;
3087 ncls:
3088 #endif
3089
3090         /* Handle special case of bridge or macvlan */
3091         rx_handler = rcu_dereference(skb->dev->rx_handler);
3092         if (rx_handler) {
3093                 if (pt_prev) {
3094                         ret = deliver_skb(skb, pt_prev, orig_dev);
3095                         pt_prev = NULL;
3096                 }
3097                 skb = rx_handler(skb);
3098                 if (!skb)
3099                         goto out;
3100         }
3101
3102         if (vlan_tx_tag_present(skb)) {
3103                 if (pt_prev) {
3104                         ret = deliver_skb(skb, pt_prev, orig_dev);
3105                         pt_prev = NULL;
3106                 }
3107                 if (vlan_hwaccel_do_receive(&skb)) {
3108                         ret = __netif_receive_skb(skb);
3109                         goto out;
3110                 } else if (unlikely(!skb))
3111                         goto out;
3112         }
3113
3114         /*
3115          * Make sure frames received on VLAN interfaces stacked on
3116          * bonding interfaces still make their way to any base bonding
3117          * device that may have registered for a specific ptype.  The
3118          * handler may have to adjust skb->dev and orig_dev.
3119          */
3120         orig_or_bond = orig_dev;
3121         if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
3122             (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
3123                 orig_or_bond = vlan_dev_real_dev(skb->dev);
3124         }
3125
3126         type = skb->protocol;
3127         list_for_each_entry_rcu(ptype,
3128                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3129                 if (ptype->type == type && (ptype->dev == null_or_orig ||
3130                      ptype->dev == skb->dev || ptype->dev == orig_dev ||
3131                      ptype->dev == orig_or_bond)) {
3132                         if (pt_prev)
3133                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3134                         pt_prev = ptype;
3135                 }
3136         }
3137
3138         if (pt_prev) {
3139                 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3140         } else {
3141                 atomic_long_inc(&skb->dev->rx_dropped);
3142                 kfree_skb(skb);
3143                 /* Jamal, now you will not able to escape explaining
3144                  * me how you were going to use this. :-)
3145                  */
3146                 ret = NET_RX_DROP;
3147         }
3148
3149 out:
3150         rcu_read_unlock();
3151         return ret;
3152 }
3153
3154 /**
3155  *      netif_receive_skb - process receive buffer from network
3156  *      @skb: buffer to process
3157  *
3158  *      netif_receive_skb() is the main receive data processing function.
3159  *      It always succeeds. The buffer may be dropped during processing
3160  *      for congestion control or by the protocol layers.
3161  *
3162  *      This function may only be called from softirq context and interrupts
3163  *      should be enabled.
3164  *
3165  *      Return values (usually ignored):
3166  *      NET_RX_SUCCESS: no congestion
3167  *      NET_RX_DROP: packet was dropped
3168  */
3169 int netif_receive_skb(struct sk_buff *skb)
3170 {
3171         if (netdev_tstamp_prequeue)
3172                 net_timestamp_check(skb);
3173
3174         if (skb_defer_rx_timestamp(skb))
3175                 return NET_RX_SUCCESS;
3176
3177 #ifdef CONFIG_RPS
3178         {
3179                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3180                 int cpu, ret;
3181
3182                 rcu_read_lock();
3183
3184                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3185
3186                 if (cpu >= 0) {
3187                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3188                         rcu_read_unlock();
3189                 } else {
3190                         rcu_read_unlock();
3191                         ret = __netif_receive_skb(skb);
3192                 }
3193
3194                 return ret;
3195         }
3196 #else
3197         return __netif_receive_skb(skb);
3198 #endif
3199 }
3200 EXPORT_SYMBOL(netif_receive_skb);
3201
3202 /* Network device is going away, flush any packets still pending
3203  * Called with irqs disabled.
3204  */
3205 static void flush_backlog(void *arg)
3206 {
3207         struct net_device *dev = arg;
3208         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3209         struct sk_buff *skb, *tmp;
3210
3211         rps_lock(sd);
3212         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3213                 if (skb->dev == dev) {
3214                         __skb_unlink(skb, &sd->input_pkt_queue);
3215                         kfree_skb(skb);
3216                         input_queue_head_incr(sd);
3217                 }
3218         }
3219         rps_unlock(sd);
3220
3221         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3222                 if (skb->dev == dev) {
3223                         __skb_unlink(skb, &sd->process_queue);
3224                         kfree_skb(skb);
3225                         input_queue_head_incr(sd);
3226                 }
3227         }
3228 }
3229
3230 static int napi_gro_complete(struct sk_buff *skb)
3231 {
3232         struct packet_type *ptype;
3233         __be16 type = skb->protocol;
3234         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3235         int err = -ENOENT;
3236
3237         if (NAPI_GRO_CB(skb)->count == 1) {
3238                 skb_shinfo(skb)->gso_size = 0;
3239                 goto out;
3240         }
3241
3242         rcu_read_lock();
3243         list_for_each_entry_rcu(ptype, head, list) {
3244                 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3245                         continue;
3246
3247                 err = ptype->gro_complete(skb);
3248                 break;
3249         }
3250         rcu_read_unlock();
3251
3252         if (err) {
3253                 WARN_ON(&ptype->list == head);
3254                 kfree_skb(skb);
3255                 return NET_RX_SUCCESS;
3256         }
3257
3258 out:
3259         return netif_receive_skb(skb);
3260 }
3261
3262 inline void napi_gro_flush(struct napi_struct *napi)
3263 {
3264         struct sk_buff *skb, *next;
3265
3266         for (skb = napi->gro_list; skb; skb = next) {
3267                 next = skb->next;
3268                 skb->next = NULL;
3269                 napi_gro_complete(skb);
3270         }
3271
3272         napi->gro_count = 0;
3273         napi->gro_list = NULL;
3274 }
3275 EXPORT_SYMBOL(napi_gro_flush);
3276
3277 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3278 {
3279         struct sk_buff **pp = NULL;
3280         struct packet_type *ptype;
3281         __be16 type = skb->protocol;
3282         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3283         int same_flow;
3284         int mac_len;
3285         enum gro_result ret;
3286
3287         if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3288                 goto normal;
3289
3290         if (skb_is_gso(skb) || skb_has_frag_list(skb))
3291                 goto normal;
3292
3293         rcu_read_lock();
3294         list_for_each_entry_rcu(ptype, head, list) {
3295                 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3296                         continue;
3297
3298                 skb_set_network_header(skb, skb_gro_offset(skb));
3299                 mac_len = skb->network_header - skb->mac_header;
3300                 skb->mac_len = mac_len;
3301                 NAPI_GRO_CB(skb)->same_flow = 0;
3302                 NAPI_GRO_CB(skb)->flush = 0;
3303                 NAPI_GRO_CB(skb)->free = 0;
3304
3305                 pp = ptype->gro_receive(&napi->gro_list, skb);
3306                 break;
3307         }
3308         rcu_read_unlock();
3309
3310         if (&ptype->list == head)
3311                 goto normal;
3312
3313         same_flow = NAPI_GRO_CB(skb)->same_flow;
3314         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3315
3316         if (pp) {
3317                 struct sk_buff *nskb = *pp;
3318
3319                 *pp = nskb->next;
3320                 nskb->next = NULL;
3321                 napi_gro_complete(nskb);
3322                 napi->gro_count--;
3323         }
3324
3325         if (same_flow)
3326                 goto ok;
3327
3328         if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3329                 goto normal;
3330
3331         napi->gro_count++;
3332         NAPI_GRO_CB(skb)->count = 1;
3333         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3334         skb->next = napi->gro_list;
3335         napi->gro_list = skb;
3336         ret = GRO_HELD;
3337
3338 pull:
3339         if (skb_headlen(skb) < skb_gro_offset(skb)) {
3340                 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3341
3342                 BUG_ON(skb->end - skb->tail < grow);
3343
3344                 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3345
3346                 skb->tail += grow;
3347                 skb->data_len -= grow;
3348
3349                 skb_shinfo(skb)->frags[0].page_offset += grow;
3350                 skb_shinfo(skb)->frags[0].size -= grow;
3351
3352                 if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3353                         put_page(skb_shinfo(skb)->frags[0].page);
3354                         memmove(skb_shinfo(skb)->frags,
3355                                 skb_shinfo(skb)->frags + 1,
3356                                 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3357                 }
3358         }
3359
3360 ok:
3361         return ret;
3362
3363 normal:
3364         ret = GRO_NORMAL;
3365         goto pull;
3366 }
3367 EXPORT_SYMBOL(dev_gro_receive);
3368
3369 static inline gro_result_t
3370 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3371 {
3372         struct sk_buff *p;
3373
3374         for (p = napi->gro_list; p; p = p->next) {
3375                 unsigned long diffs;
3376
3377                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3378                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3379                 diffs |= compare_ether_header(skb_mac_header(p),
3380                                               skb_gro_mac_header(skb));
3381                 NAPI_GRO_CB(p)->same_flow = !diffs;
3382                 NAPI_GRO_CB(p)->flush = 0;
3383         }
3384
3385         return dev_gro_receive(napi, skb);
3386 }
3387
3388 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3389 {
3390         switch (ret) {
3391         case GRO_NORMAL:
3392                 if (netif_receive_skb(skb))
3393                         ret = GRO_DROP;
3394                 break;
3395
3396         case GRO_DROP:
3397         case GRO_MERGED_FREE:
3398                 kfree_skb(skb);
3399                 break;
3400
3401         case GRO_HELD:
3402         case GRO_MERGED:
3403                 break;
3404         }
3405
3406         return ret;
3407 }
3408 EXPORT_SYMBOL(napi_skb_finish);
3409
3410 void skb_gro_reset_offset(struct sk_buff *skb)
3411 {
3412         NAPI_GRO_CB(skb)->data_offset = 0;
3413         NAPI_GRO_CB(skb)->frag0 = NULL;
3414         NAPI_GRO_CB(skb)->frag0_len = 0;
3415
3416         if (skb->mac_header == skb->tail &&
3417             !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3418                 NAPI_GRO_CB(skb)->frag0 =
3419                         page_address(skb_shinfo(skb)->frags[0].page) +
3420                         skb_shinfo(skb)->frags[0].page_offset;
3421                 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3422         }
3423 }
3424 EXPORT_SYMBOL(skb_gro_reset_offset);
3425
3426 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3427 {
3428         skb_gro_reset_offset(skb);
3429
3430         return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3431 }
3432 EXPORT_SYMBOL(napi_gro_receive);
3433
3434 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3435 {
3436         __skb_pull(skb, skb_headlen(skb));
3437         skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3438         skb->vlan_tci = 0;
3439
3440         napi->skb = skb;
3441 }
3442
3443 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3444 {
3445         struct sk_buff *skb = napi->skb;
3446
3447         if (!skb) {
3448                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3449                 if (skb)
3450                         napi->skb = skb;
3451         }
3452         return skb;
3453 }
3454 EXPORT_SYMBOL(napi_get_frags);
3455
3456 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3457                                gro_result_t ret)
3458 {
3459         switch (ret) {
3460         case GRO_NORMAL:
3461         case GRO_HELD:
3462                 skb->protocol = eth_type_trans(skb, skb->dev);
3463
3464                 if (ret == GRO_HELD)
3465                         skb_gro_pull(skb, -ETH_HLEN);
3466                 else if (netif_receive_skb(skb))
3467                         ret = GRO_DROP;
3468                 break;
3469
3470         case GRO_DROP:
3471         case GRO_MERGED_FREE:
3472                 napi_reuse_skb(napi, skb);
3473                 break;
3474
3475         case GRO_MERGED:
3476                 break;
3477         }
3478
3479         return ret;
3480 }
3481 EXPORT_SYMBOL(napi_frags_finish);
3482
3483 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3484 {
3485         struct sk_buff *skb = napi->skb;
3486         struct ethhdr *eth;
3487         unsigned int hlen;
3488         unsigned int off;
3489
3490         napi->skb = NULL;
3491
3492         skb_reset_mac_header(skb);
3493         skb_gro_reset_offset(skb);
3494
3495         off = skb_gro_offset(skb);
3496         hlen = off + sizeof(*eth);
3497         eth = skb_gro_header_fast(skb, off);
3498         if (skb_gro_header_hard(skb, hlen)) {
3499                 eth = skb_gro_header_slow(skb, hlen, off);
3500                 if (unlikely(!eth)) {
3501                         napi_reuse_skb(napi, skb);
3502                         skb = NULL;
3503                         goto out;
3504                 }
3505         }
3506
3507         skb_gro_pull(skb, sizeof(*eth));
3508
3509         /*
3510          * This works because the only protocols we care about don't require
3511          * special handling.  We'll fix it up properly at the end.
3512          */
3513         skb->protocol = eth->h_proto;
3514
3515 out:
3516         return skb;
3517 }
3518 EXPORT_SYMBOL(napi_frags_skb);
3519
3520 gro_result_t napi_gro_frags(struct napi_struct *napi)
3521 {
3522         struct sk_buff *skb = napi_frags_skb(napi);
3523
3524         if (!skb)
3525                 return GRO_DROP;
3526
3527         return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3528 }
3529 EXPORT_SYMBOL(napi_gro_frags);
3530
/*
 * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
 * Note: called with local irq disabled, but exits with local irq enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *pending = sd->rps_ipi_list;

	/* Take over the whole list while interrupts are still off. */
	if (pending)
		sd->rps_ipi_list = NULL;

	local_irq_enable();

	/* Send pending IPIs to kick RPS processing on remote cpus. */
	while (pending) {
		/* Read the link before the IPI is sent. */
		struct softnet_data *following = pending->rps_ipi_next;

		if (cpu_online(pending->cpu))
			__smp_call_function_single(pending->cpu,
						   &pending->csd, 0);
		pending = following;
	}
#else
	local_irq_enable();
#endif
}
3558
3559 static int process_backlog(struct napi_struct *napi, int quota)
3560 {
3561         int work = 0;
3562         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3563
3564 #ifdef CONFIG_RPS
3565         /* Check if we have pending ipi, its better to send them now,
3566          * not waiting net_rx_action() end.
3567          */
3568         if (sd->rps_ipi_list) {
3569                 local_irq_disable();
3570                 net_rps_action_and_irq_enable(sd);
3571         }
3572 #endif
3573         napi->weight = weight_p;
3574         local_irq_disable();
3575         while (work < quota) {
3576                 struct sk_buff *skb;
3577                 unsigned int qlen;
3578
3579                 while ((skb = __skb_dequeue(&sd->process_queue))) {
3580                         local_irq_enable();
3581                         __netif_receive_skb(skb);
3582                         local_irq_disable();
3583                         input_queue_head_incr(sd);
3584                         if (++work >= quota) {
3585                                 local_irq_enable();
3586                                 return work;
3587                         }
3588                 }
3589
3590                 rps_lock(sd);
3591                 qlen = skb_queue_len(&sd->input_pkt_queue);
3592                 if (qlen)
3593                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
3594                                                    &sd->process_queue);
3595
3596                 if (qlen < quota - work) {
3597                         /*
3598                          * Inline a custom version of __napi_complete().
3599                          * only current cpu owns and manipulates this napi,
3600                          * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3601                          * we can use a plain write instead of clear_bit(),
3602                          * and we dont need an smp_mb() memory barrier.
3603                          */
3604                         list_del(&napi->poll_list);
3605                         napi->state = 0;
3606
3607                         quota = work + qlen;
3608                 }
3609                 rps_unlock(sd);
3610         }
3611         local_irq_enable();
3612
3613         return work;
3614 }
3615
3616 /**
3617  * __napi_schedule - schedule for receive
3618  * @n: entry to schedule
3619  *
3620  * The entry's receive function will be scheduled to run
3621  */
3622 void __napi_schedule(struct napi_struct *n)
3623 {
3624         unsigned long flags;
3625
3626         local_irq_save(flags);
3627         ____napi_schedule(&__get_cpu_var(softnet_data), n);
3628         local_irq_restore(flags);
3629 }
3630 EXPORT_SYMBOL(__napi_schedule);
3631
3632 void __napi_complete(struct napi_struct *n)
3633 {
3634         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3635         BUG_ON(n->gro_list);
3636
3637         list_del(&n->poll_list);
3638         smp_mb__before_clear_bit();
3639         clear_bit(NAPI_STATE_SCHED, &n->state);
3640 }
3641 EXPORT_SYMBOL(__napi_complete);
3642
3643 void napi_complete(struct napi_struct *n)
3644 {
3645         unsigned long flags;
3646
3647         /*
3648          * don't let napi dequeue from the cpu poll list
3649          * just in case its running on a different cpu
3650          */
3651         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3652                 return;
3653
3654         napi_gro_flush(n);
3655         local_irq_save(flags);
3656         __napi_complete(n);
3657         local_irq_restore(flags);
3658 }
3659 EXPORT_SYMBOL(napi_complete);
3660
3661 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3662                     int (*poll)(struct napi_struct *, int), int weight)
3663 {
3664         INIT_LIST_HEAD(&napi->poll_list);
3665         napi->gro_count = 0;
3666         napi->gro_list = NULL;
3667         napi->skb = NULL;
3668         napi->poll = poll;
3669         napi->weight = weight;
3670         list_add(&napi->dev_list, &dev->napi_list);
3671         napi->dev = dev;
3672 #ifdef CONFIG_NETPOLL
3673         spin_lock_init(&napi->poll_lock);
3674         napi->poll_owner = -1;
3675 #endif
3676         set_bit(NAPI_STATE_SCHED, &napi->state);
3677 }
3678 EXPORT_SYMBOL(netif_napi_add);
3679
3680 void netif_napi_del(struct napi_struct *napi)
3681 {
3682         struct sk_buff *skb, *next;
3683
3684         list_del_init(&napi->dev_list);
3685         napi_free_frags(napi);
3686
3687         for (skb = napi->gro_list; skb; skb = next) {
3688                 next = skb->next;
3689                 skb->next = NULL;
3690                 kfree_skb(skb);
3691         }
3692
3693         napi->gro_list = NULL;
3694         napi->gro_count = 0;
3695 }
3696 EXPORT_SYMBOL(netif_napi_del);
3697
3698 static void net_rx_action(struct softirq_action *h)
3699 {
3700         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3701         unsigned long time_limit = jiffies + 2;
3702         int budget = netdev_budget;
3703         void *have;
3704
3705         local_irq_disable();
3706
3707         while (!list_empty(&sd->poll_list)) {
3708                 struct napi_struct *n;
3709                 int work, weight;
3710
3711                 /* If softirq window is exhuasted then punt.
3712                  * Allow this to run for 2 jiffies since which will allow
3713                  * an average latency of 1.5/HZ.
3714                  */
3715                 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3716                         goto softnet_break;
3717
3718                 local_irq_enable();
3719
3720                 /* Even though interrupts have been re-enabled, this
3721                  * access is safe because interrupts can only add new
3722                  * entries to the tail of this list, and only ->poll()
3723                  * calls can remove this head entry from the list.
3724                  */
3725                 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3726
3727                 have = netpoll_poll_lock(n);
3728
3729                 weight = n->weight;
3730
3731                 /* This NAPI_STATE_SCHED test is for avoiding a race
3732                  * with netpoll's poll_napi().  Only the entity which
3733                  * obtains the lock and sees NAPI_STATE_SCHED set will
3734                  * actually make the ->poll() call.  Therefore we avoid
3735                  * accidently calling ->poll() when NAPI is not scheduled.
3736                  */
3737                 work = 0;
3738                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3739                         work = n->poll(n, weight);
3740                         trace_napi_poll(n);
3741                 }
3742
3743                 WARN_ON_ONCE(work > weight);
3744
3745                 budget -= work;
3746
3747                 local_irq_disable();
3748
3749                 /* Drivers must not modify the NAPI state if they
3750                  * consume the entire weight.  In such cases this code
3751                  * still "owns" the NAPI instance and therefore can
3752                  * move the instance around on the list at-will.
3753                  */
3754                 if (unlikely(work == weight)) {
3755                         if (unlikely(napi_disable_pending(n))) {
3756                                 local_irq_enable();
3757                                 napi_complete(n);
3758                                 local_irq_disable();
3759                         } else
3760                                 list_move_tail(&n->poll_list, &sd->poll_list);
3761                 }
3762
3763                 netpoll_poll_unlock(have);
3764         }
3765 out:
3766         net_rps_action_and_irq_enable(sd);
3767
3768 #ifdef CONFIG_NET_DMA
3769         /*
3770          * There may not be any more sk_buffs coming right now, so push
3771          * any pending DMA copies to hardware
3772          */
3773         dma_issue_pending_all();
3774 #endif
3775
3776         return;
3777
3778 softnet_break:
3779         sd->time_squeeze++;
3780         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3781         goto out;
3782 }
3783
3784 static gifconf_func_t *gifconf_list[NPROTO];
3785
3786 /**
3787  *      register_gifconf        -       register a SIOCGIF handler
3788  *      @family: Address family
3789  *      @gifconf: Function handler
3790  *
3791  *      Register protocol dependent address dumping routines. The handler
3792  *      that is passed must not be freed or reused until it has been replaced
3793  *      by another handler.
3794  */
3795 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3796 {
3797         if (family >= NPROTO)
3798                 return -EINVAL;
3799         gifconf_list[family] = gifconf;
3800         return 0;
3801 }
3802 EXPORT_SYMBOL(register_gifconf);
3803
3804
3805 /*
3806  *      Map an interface index to its name (SIOCGIFNAME)
3807  */
3808
3809 /*
3810  *      We need this ioctl for efficient implementation of the
3811  *      if_indextoname() function required by the IPv6 API.  Without
3812  *      it, we would have to search all the interfaces to find a
3813  *      match.  --pb
3814  */
3815
3816 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3817 {
3818         struct net_device *dev;
3819         struct ifreq ifr;
3820
3821         /*
3822          *      Fetch the caller's info block.
3823          */
3824
3825         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3826                 return -EFAULT;
3827
3828         rcu_read_lock();
3829         dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3830         if (!dev) {
3831                 rcu_read_unlock();
3832                 return -ENODEV;
3833         }
3834
3835         strcpy(ifr.ifr_name, dev->name);
3836         rcu_read_unlock();
3837
3838         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3839                 return -EFAULT;
3840         return 0;
3841 }
3842
3843 /*
3844  *      Perform a SIOCGIFCONF call. This structure will change
3845  *      size eventually, and there is nothing I can do about it.
3846  *      Thus we will need a 'compatibility mode'.
3847  */
3848
3849 static int dev_ifconf(struct net *net, char __user *arg)
3850 {
3851         struct ifconf ifc;
3852         struct net_device *dev;
3853         char __user *pos;
3854         int len;
3855         int total;
3856         int i;
3857
3858         /*
3859          *      Fetch the caller's info block.
3860          */
3861
3862         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3863                 return -EFAULT;
3864
3865         pos = ifc.ifc_buf;
3866         len = ifc.ifc_len;
3867
3868         /*
3869          *      Loop over the interfaces, and write an info block for each.
3870          */
3871
3872         total = 0;
3873         for_each_netdev(net, dev) {
3874                 for (i = 0; i < NPROTO; i++) {
3875                         if (gifconf_list[i]) {
3876                                 int done;
3877                                 if (!pos)
3878                                         done = gifconf_list[i](dev, NULL, 0);
3879                                 else
3880                                         done = gifconf_list[i](dev, pos + total,
3881                                                                len - total);
3882                                 if (done < 0)
3883                                         return -EFAULT;
3884                                 total += done;
3885                         }
3886                 }
3887         }
3888
3889         /*
3890          *      All done.  Write the updated control block back to the caller.
3891          */
3892         ifc.ifc_len = total;
3893
3894         /*
3895          *      Both BSD and Solaris return 0 here, so we do too.
3896          */
3897         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3898 }
3899
3900 #ifdef CONFIG_PROC_FS
3901 /*
3902  *      This is invoked by the /proc filesystem handler to display a device
3903  *      in detail.
3904  */
3905 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3906         __acquires(RCU)
3907 {
3908         struct net *net = seq_file_net(seq);
3909         loff_t off;
3910         struct net_device *dev;
3911
3912         rcu_read_lock();
3913         if (!*pos)
3914                 return SEQ_START_TOKEN;
3915
3916         off = 1;
3917         for_each_netdev_rcu(net, dev)
3918                 if (off++ == *pos)
3919                         return dev;
3920
3921         return NULL;
3922 }
3923
3924 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3925 {
3926         struct net_device *dev = (v == SEQ_START_TOKEN) ?
3927                                   first_net_device(seq_file_net(seq)) :
3928                                   next_net_device((struct net_device *)v);
3929
3930         ++*pos;
3931         return rcu_dereference(dev);
3932 }
3933
3934 void dev_seq_stop(struct seq_file *seq, void *v)
3935         __releases(RCU)
3936 {
3937         rcu_read_unlock();
3938 }
3939
3940 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3941 {
3942         struct rtnl_link_stats64 temp;
3943         const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
3944
3945         seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
3946                    "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
3947                    dev->name, stats->rx_bytes, stats->rx_packets,
3948                    stats->rx_errors,
3949                    stats->rx_dropped + stats->rx_missed_errors,
3950                    stats->rx_fifo_errors,
3951                    stats->rx_length_errors + stats->rx_over_errors +
3952                     stats->rx_crc_errors + stats->rx_frame_errors,
3953                    stats->rx_compressed, stats->multicast,
3954                    stats->tx_bytes, stats->tx_packets,
3955                    stats->tx_errors, stats->tx_dropped,
3956                    stats->tx_fifo_errors, stats->collisions,
3957                    stats->tx_carrier_errors +
3958                     stats->tx_aborted_errors +
3959                     stats->tx_window_errors +
3960                     stats->tx_heartbeat_errors,
3961                    stats->tx_compressed);
3962 }
3963
3964 /*
3965  *      Called from the PROCfs module. This now uses the new arbitrary sized
3966  *      /proc/net interface to create /proc/net/dev
3967  */
3968 static int dev_seq_show(struct seq_file *seq, void *v)
3969 {
3970         if (v == SEQ_START_TOKEN)
3971                 seq_puts(seq, "Inter-|   Receive                            "
3972                               "                    |  Transmit\n"
3973                               " face |bytes    packets errs drop fifo frame "
3974                               "compressed multicast|bytes    packets errs "
3975                               "drop fifo colls carrier compressed\n");
3976         else
3977                 dev_seq_printf_stats(seq, v);
3978         return 0;
3979 }
3980
3981 static struct softnet_data *softnet_get_online(loff_t *pos)
3982 {
3983         struct softnet_data *sd = NULL;
3984
3985         while (*pos < nr_cpu_ids)
3986                 if (cpu_online(*pos)) {
3987                         sd = &per_cpu(softnet_data, *pos);
3988                         break;
3989                 } else
3990                         ++*pos;
3991         return sd;
3992 }
3993
3994 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
3995 {
3996         return softnet_get_online(pos);
3997 }
3998
3999 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4000 {
4001         ++*pos;
4002         return softnet_get_online(pos);
4003 }
4004
/* Nothing to release: the softnet_stat iterator takes no lock. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
4008
4009 static int softnet_seq_show(struct seq_file *seq, void *v)
4010 {
4011         struct softnet_data *sd = v;
4012
4013         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4014                    sd->processed, sd->dropped, sd->time_squeeze, 0,
4015                    0, 0, 0, 0, /* was fastroute */
4016                    sd->cpu_collision, sd->received_rps);
4017         return 0;
4018 }
4019
4020 static const struct seq_operations dev_seq_ops = {
4021         .start = dev_seq_start,
4022         .next  = dev_seq_next,
4023         .stop  = dev_seq_stop,
4024         .show  = dev_seq_show,
4025 };
4026
4027 static int dev_seq_open(struct inode *inode, struct file *file)
4028 {
4029         return seq_open_net(inode, file, &dev_seq_ops,
4030                             sizeof(struct seq_net_private));
4031 }
4032
4033 static const struct file_operations dev_seq_fops = {
4034         .owner   = THIS_MODULE,
4035         .open    = dev_seq_open,
4036         .read    = seq_read,
4037         .llseek  = seq_lseek,
4038         .release = seq_release_net,
4039 };
4040
4041 static const struct seq_operations softnet_seq_ops = {
4042         .start = softnet_seq_start,
4043         .next  = softnet_seq_next,
4044         .stop  = softnet_seq_stop,
4045         .show  = softnet_seq_show,
4046 };
4047
4048 static int softnet_seq_open(struct inode *inode, struct file *file)
4049 {
4050         return seq_open(file, &softnet_seq_ops);
4051 }
4052
4053 static const struct file_operations softnet_seq_fops = {
4054         .owner   = THIS_MODULE,
4055         .open    = softnet_seq_open,
4056         .read    = seq_read,
4057         .llseek  = seq_lseek,
4058         .release = seq_release,
4059 };
4060
4061 static void *ptype_get_idx(loff_t pos)
4062 {
4063         struct packet_type *pt = NULL;
4064         loff_t i = 0;
4065         int t;
4066
4067         list_for_each_entry_rcu(pt, &ptype_all, list) {
4068                 if (i == pos)
4069                         return pt;
4070                 ++i;
4071         }
4072
4073         for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4074                 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4075                         if (i == pos)
4076                                 return pt;
4077                         ++i;
4078                 }
4079         }
4080         return NULL;
4081 }
4082
4083 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4084         __acquires(RCU)
4085 {
4086         rcu_read_lock();
4087         return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4088 }
4089
4090 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4091 {
4092         struct packet_type *pt;
4093         struct list_head *nxt;
4094         int hash;
4095
4096         ++*pos;
4097         if (v == SEQ_START_TOKEN)
4098                 return ptype_get_idx(0);
4099
4100         pt = v;
4101         nxt = pt->list.next;
4102         if (pt->type == htons(ETH_P_ALL)) {
4103                 if (nxt != &ptype_all)
4104                         goto found;
4105                 hash = 0;
4106                 nxt = ptype_base[0].next;
4107         } else
4108                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4109
4110         while (nxt == &ptype_base[hash]) {
4111                 if (++hash >= PTYPE_HASH_SIZE)
4112                         return NULL;
4113                 nxt = ptype_base[hash].next;
4114         }
4115 found:
4116         return list_entry(nxt, struct packet_type, list);
4117 }
4118
4119 static void ptype_seq_stop(struct seq_file *seq, void *v)
4120         __releases(RCU)
4121 {
4122         rcu_read_unlock();
4123 }
4124
4125 static int ptype_seq_show(struct seq_file *seq, void *v)
4126 {
4127         struct packet_type *pt = v;
4128
4129         if (v == SEQ_START_TOKEN)
4130                 seq_puts(seq, "Type Device      Function\n");
4131         else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4132                 if (pt->type == htons(ETH_P_ALL))
4133                         seq_puts(seq, "ALL ");
4134                 else
4135                         seq_printf(seq, "%04x", ntohs(pt->type));
4136
4137                 seq_printf(seq, " %-8s %pF\n",
4138                            pt->dev ? pt->dev->name : "", pt->func);
4139         }
4140
4141         return 0;
4142 }
4143
4144 static const struct seq_operations ptype_seq_ops = {
4145         .start = ptype_seq_start,
4146         .next  = ptype_seq_next,
4147         .stop  = ptype_seq_stop,
4148         .show  = ptype_seq_show,
4149 };
4150
4151 static int ptype_seq_open(struct inode *inode, struct file *file)
4152 {
4153         return seq_open_net(inode, file, &ptype_seq_ops,
4154                         sizeof(struct seq_net_private));
4155 }
4156
4157 static const struct file_operations ptype_seq_fops = {
4158         .owner   = THIS_MODULE,
4159         .open    = ptype_seq_open,
4160         .read    = seq_read,
4161         .llseek  = seq_lseek,
4162         .release = seq_release_net,
4163 };
4164
4165
4166 static int __net_init dev_proc_net_init(struct net *net)
4167 {
4168         int rc = -ENOMEM;
4169
4170         if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4171                 goto out;
4172         if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4173                 goto out_dev;
4174         if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4175                 goto out_softnet;
4176
4177         if (wext_proc_init(net))
4178                 goto out_ptype;
4179         rc = 0;
4180 out:
4181         return rc;
4182 out_ptype:
4183         proc_net_remove(net, "ptype");
4184 out_softnet:
4185         proc_net_remove(net, "softnet_stat");
4186 out_dev:
4187         proc_net_remove(net, "dev");
4188         goto out;
4189 }
4190
4191 static void __net_exit dev_proc_net_exit(struct net *net)
4192 {
4193         wext_proc_exit(net);
4194
4195         proc_net_remove(net, "ptype");
4196         proc_net_remove(net, "softnet_stat");
4197         proc_net_remove(net, "dev");
4198 }
4199
4200 static struct pernet_operations __net_initdata dev_proc_ops = {
4201         .init = dev_proc_net_init,
4202         .exit = dev_proc_net_exit,
4203 };
4204
4205 static int __init dev_proc_init(void)
4206 {
4207         return register_pernet_subsys(&dev_proc_ops);
4208 }
4209 #else
4210 #define dev_proc_init() 0
4211 #endif  /* CONFIG_PROC_FS */
4212
4213
4214 /**
4215  *      netdev_set_master       -       set up master/slave pair
4216  *      @slave: slave device
4217  *      @master: new master device
4218  *
4219  *      Changes the master device of the slave. Pass %NULL to break the
4220  *      bonding. The caller must hold the RTNL semaphore. On a failure
4221  *      a negative errno code is returned. On success the reference counts
4222  *      are adjusted, %RTM_NEWLINK is sent to the routing socket and the
4223  *      function returns zero.
4224  */
4225 int netdev_set_master(struct net_device *slave, struct net_device *master)
4226 {
4227         struct net_device *old = slave->master;
4228
4229         ASSERT_RTNL();
4230
4231         if (master) {
4232                 if (old)
4233                         return -EBUSY;
4234                 dev_hold(master);
4235         }
4236
4237         slave->master = master;
4238
4239         if (old) {
4240                 synchronize_net();
4241                 dev_put(old);
4242         }
4243         if (master)
4244                 slave->flags |= IFF_SLAVE;
4245         else
4246                 slave->flags &= ~IFF_SLAVE;
4247
4248         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4249         return 0;
4250 }
4251 EXPORT_SYMBOL(netdev_set_master);
4252
4253 static void dev_change_rx_flags(struct net_device *dev, int flags)
4254 {
4255         const struct net_device_ops *ops = dev->netdev_ops;
4256
4257         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4258                 ops->ndo_change_rx_flags(dev, flags);
4259 }
4260
4261 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4262 {
4263         unsigned short old_flags = dev->flags;
4264         uid_t uid;
4265         gid_t gid;
4266
4267         ASSERT_RTNL();
4268
4269         dev->flags |= IFF_PROMISC;
4270         dev->promiscuity += inc;
4271         if (dev->promiscuity == 0) {
4272                 /*
4273                  * Avoid overflow.
4274                  * If inc causes overflow, untouch promisc and return error.
4275                  */
4276                 if (inc < 0)
4277                         dev->flags &= ~IFF_PROMISC;
4278                 else {
4279                         dev->promiscuity -= inc;
4280                         printk(KERN_WARNING "%s: promiscuity touches roof, "
4281                                 "set promiscuity failed, promiscuity feature "
4282                                 "of device might be broken.\n", dev->name);
4283                         return -EOVERFLOW;
4284                 }
4285         }
4286         if (dev->flags != old_flags) {
4287                 printk(KERN_INFO "device %s %s promiscuous mode\n",
4288                        dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4289                                                                "left");
4290                 if (audit_enabled) {
4291                         current_uid_gid(&uid, &gid);
4292                         audit_log(current->audit_context, GFP_ATOMIC,
4293                                 AUDIT_ANOM_PROMISCUOUS,
4294                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4295                                 dev->name, (dev->flags & IFF_PROMISC),
4296                                 (old_flags & IFF_PROMISC),
4297                                 audit_get_loginuid(current),
4298                                 uid, gid,
4299                                 audit_get_sessionid(current));
4300                 }
4301
4302                 dev_change_rx_flags(dev, IFF_PROMISC);
4303         }
4304         return 0;
4305 }
4306
4307 /**
4308  *      dev_set_promiscuity     - update promiscuity count on a device
4309  *      @dev: device
4310  *      @inc: modifier
4311  *
4312  *      Add or remove promiscuity from a device. While the count in the device
4313  *      remains above zero the interface remains promiscuous. Once it hits zero
4314  *      the device reverts back to normal filtering operation. A negative inc
4315  *      value is used to drop promiscuity on the device.
4316  *      Return 0 if successful or a negative errno code on error.
4317  */
4318 int dev_set_promiscuity(struct net_device *dev, int inc)
4319 {
4320         unsigned short old_flags = dev->flags;
4321         int err;
4322
4323         err = __dev_set_promiscuity(dev, inc);
4324         if (err < 0)
4325                 return err;
4326         if (dev->flags != old_flags)
4327                 dev_set_rx_mode(dev);
4328         return err;
4329 }
4330 EXPORT_SYMBOL(dev_set_promiscuity);
4331
4332 /**
4333  *      dev_set_allmulti        - update allmulti count on a device
4334  *      @dev: device
4335  *      @inc: modifier
4336  *
4337  *      Add or remove reception of all multicast frames to a device. While the
4338  *      count in the device remains above zero the interface remains listening
4339  *      to all interfaces. Once it hits zero the device reverts back to normal
4340  *      filtering operation. A negative @inc value is used to drop the counter
4341  *      when releasing a resource needing all multicasts.
4342  *      Return 0 if successful or a negative errno code on error.
4343  */
4344
4345 int dev_set_allmulti(struct net_device *dev, int inc)
4346 {
4347         unsigned short old_flags = dev->flags;
4348
4349         ASSERT_RTNL();
4350
4351         dev->flags |= IFF_ALLMULTI;
4352         dev->allmulti += inc;
4353         if (dev->allmulti == 0) {
4354                 /*
4355                  * Avoid overflow.
4356                  * If inc causes overflow, untouch allmulti and return error.
4357                  */
4358                 if (inc < 0)
4359                         dev->flags &= ~IFF_ALLMULTI;
4360                 else {
4361                         dev->allmulti -= inc;
4362                         printk(KERN_WARNING "%s: allmulti touches roof, "
4363                                 "set allmulti failed, allmulti feature of "
4364                                 "device might be broken.\n", dev->name);
4365                         return -EOVERFLOW;
4366                 }
4367         }
4368         if (dev->flags ^ old_flags) {
4369                 dev_change_rx_flags(dev, IFF_ALLMULTI);
4370                 dev_set_rx_mode(dev);
4371         }
4372         return 0;
4373 }
4374 EXPORT_SYMBOL(dev_set_allmulti);
4375
4376 /*
4377  *      Upload unicast and multicast address lists to device and
4378  *      configure RX filtering. When the device doesn't support unicast
4379  *      filtering it is put in promiscuous mode while unicast addresses
4380  *      are present.
4381  */
4382 void __dev_set_rx_mode(struct net_device *dev)
4383 {
4384         const struct net_device_ops *ops = dev->netdev_ops;
4385
4386         /* dev_open will call this function so the list will stay sane. */
4387         if (!(dev->flags&IFF_UP))
4388                 return;
4389
4390         if (!netif_device_present(dev))
4391                 return;
4392
4393         if (ops->ndo_set_rx_mode)
4394                 ops->ndo_set_rx_mode(dev);
4395         else {
4396                 /* Unicast addresses changes may only happen under the rtnl,
4397                  * therefore calling __dev_set_promiscuity here is safe.
4398                  */
4399                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4400                         __dev_set_promiscuity(dev, 1);
4401                         dev->uc_promisc = 1;
4402                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4403                         __dev_set_promiscuity(dev, -1);
4404                         dev->uc_promisc = 0;
4405                 }
4406
4407                 if (ops->ndo_set_multicast_list)
4408                         ops->ndo_set_multicast_list(dev);
4409         }
4410 }
4411
/*
 *	Locked wrapper around __dev_set_rx_mode(): takes the device address
 *	list lock (BH-disabling variant) around the call.
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}

4418
4419 /**
4420  *      dev_get_flags - get flags reported to userspace
4421  *      @dev: device
4422  *
4423  *      Get the combination of flag bits exported through APIs to userspace.
4424  */
4425 unsigned dev_get_flags(const struct net_device *dev)
4426 {
4427         unsigned flags;
4428
4429         flags = (dev->flags & ~(IFF_PROMISC |
4430                                 IFF_ALLMULTI |
4431                                 IFF_RUNNING |
4432                                 IFF_LOWER_UP |
4433                                 IFF_DORMANT)) |
4434                 (dev->gflags & (IFF_PROMISC |
4435                                 IFF_ALLMULTI));
4436
4437         if (netif_running(dev)) {
4438                 if (netif_oper_up(dev))
4439                         flags |= IFF_RUNNING;
4440                 if (netif_carrier_ok(dev))
4441                         flags |= IFF_LOWER_UP;
4442                 if (netif_dormant(dev))
4443                         flags |= IFF_DORMANT;
4444         }
4445
4446         return flags;
4447 }
4448 EXPORT_SYMBOL(dev_get_flags);
4449
4450 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4451 {
4452         int old_flags = dev->flags;
4453         int ret;
4454
4455         ASSERT_RTNL();
4456
4457         /*
4458          *      Set the flags on our device.
4459          */
4460
4461         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4462                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4463                                IFF_AUTOMEDIA)) |
4464                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4465                                     IFF_ALLMULTI));
4466
4467         /*
4468          *      Load in the correct multicast list now the flags have changed.
4469          */
4470
4471         if ((old_flags ^ flags) & IFF_MULTICAST)
4472                 dev_change_rx_flags(dev, IFF_MULTICAST);
4473
4474         dev_set_rx_mode(dev);
4475
4476         /*
4477          *      Have we downed the interface. We handle IFF_UP ourselves
4478          *      according to user attempts to set it, rather than blindly
4479          *      setting it.
4480          */
4481
4482         ret = 0;
4483         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4484                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4485
4486                 if (!ret)
4487                         dev_set_rx_mode(dev);
4488         }
4489
4490         if ((flags ^ dev->gflags) & IFF_PROMISC) {
4491                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4492
4493                 dev->gflags ^= IFF_PROMISC;
4494                 dev_set_promiscuity(dev, inc);
4495         }
4496
4497         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4498            is important. Some (broken) drivers set IFF_PROMISC, when
4499            IFF_ALLMULTI is requested not asking us and not reporting.
4500          */
4501         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4502                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4503
4504                 dev->gflags ^= IFF_ALLMULTI;
4505                 dev_set_allmulti(dev, inc);
4506         }
4507
4508         return ret;
4509 }
4510
4511 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4512 {
4513         unsigned int changes = dev->flags ^ old_flags;
4514
4515         if (changes & IFF_UP) {
4516                 if (dev->flags & IFF_UP)
4517                         call_netdevice_notifiers(NETDEV_UP, dev);
4518                 else
4519                         call_netdevice_notifiers(NETDEV_DOWN, dev);
4520         }
4521
4522         if (dev->flags & IFF_UP &&
4523             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4524                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4525 }
4526
4527 /**
4528  *      dev_change_flags - change device settings
4529  *      @dev: device
4530  *      @flags: device state flags
4531  *
4532  *      Change settings on device based state flags. The flags are
4533  *      in the userspace exported format.
4534  */
4535 int dev_change_flags(struct net_device *dev, unsigned flags)
4536 {
4537         int ret, changes;
4538         int old_flags = dev->flags;
4539
4540         ret = __dev_change_flags(dev, flags);
4541         if (ret < 0)
4542                 return ret;
4543
4544         changes = old_flags ^ dev->flags;
4545         if (changes)
4546                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4547
4548         __dev_notify_flags(dev, old_flags);
4549         return ret;
4550 }
4551 EXPORT_SYMBOL(dev_change_flags);
4552
4553 /**
4554  *      dev_set_mtu - Change maximum transfer unit
4555  *      @dev: device
4556  *      @new_mtu: new transfer unit
4557  *
4558  *      Change the maximum transfer size of the network device.
4559  */
4560 int dev_set_mtu(struct net_device *dev, int new_mtu)
4561 {
4562         const struct net_device_ops *ops = dev->netdev_ops;
4563         int err;
4564
4565         if (new_mtu == dev->mtu)
4566                 return 0;
4567
4568         /*      MTU must be positive.    */
4569         if (new_mtu < 0)
4570                 return -EINVAL;
4571
4572         if (!netif_device_present(dev))
4573                 return -ENODEV;
4574
4575         err = 0;
4576         if (ops->ndo_change_mtu)
4577                 err = ops->ndo_change_mtu(dev, new_mtu);
4578         else
4579                 dev->mtu = new_mtu;
4580
4581         if (!err && dev->flags & IFF_UP)
4582                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4583         return err;
4584 }
4585 EXPORT_SYMBOL(dev_set_mtu);
4586
4587 /**
4588  *      dev_set_mac_address - Change Media Access Control Address
4589  *      @dev: device
4590  *      @sa: new address
4591  *
4592  *      Change the hardware (MAC) address of the device
4593  */
4594 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4595 {
4596         const struct net_device_ops *ops = dev->netdev_ops;
4597         int err;
4598
4599         if (!ops->ndo_set_mac_address)
4600                 return -EOPNOTSUPP;
4601         if (sa->sa_family != dev->type)
4602                 return -EINVAL;
4603         if (!netif_device_present(dev))
4604                 return -ENODEV;
4605         err = ops->ndo_set_mac_address(dev, sa);
4606         if (!err)
4607                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4608         return err;
4609 }
4610 EXPORT_SYMBOL(dev_set_mac_address);
4611
4612 /*
4613  *      Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4614  */
4615 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4616 {
4617         int err;
4618         struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4619
4620         if (!dev)
4621                 return -ENODEV;
4622
4623         switch (cmd) {
4624         case SIOCGIFFLAGS:      /* Get interface flags */
4625                 ifr->ifr_flags = (short) dev_get_flags(dev);
4626                 return 0;
4627
4628         case SIOCGIFMETRIC:     /* Get the metric on the interface
4629                                    (currently unused) */
4630                 ifr->ifr_metric = 0;
4631                 return 0;
4632
4633         case SIOCGIFMTU:        /* Get the MTU of a device */
4634                 ifr->ifr_mtu = dev->mtu;
4635                 return 0;
4636
4637         case SIOCGIFHWADDR:
4638                 if (!dev->addr_len)
4639                         memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4640                 else
4641                         memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4642                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4643                 ifr->ifr_hwaddr.sa_family = dev->type;
4644                 return 0;
4645
4646         case SIOCGIFSLAVE:
4647                 err = -EINVAL;
4648                 break;
4649
4650         case SIOCGIFMAP:
4651                 ifr->ifr_map.mem_start = dev->mem_start;
4652                 ifr->ifr_map.mem_end   = dev->mem_end;
4653                 ifr->ifr_map.base_addr = dev->base_addr;
4654                 ifr->ifr_map.irq       = dev->irq;
4655                 ifr->ifr_map.dma       = dev->dma;
4656                 ifr->ifr_map.port      = dev->if_port;
4657                 return 0;
4658
4659         case SIOCGIFINDEX:
4660                 ifr->ifr_ifindex = dev->ifindex;
4661                 return 0;
4662
4663         case SIOCGIFTXQLEN:
4664                 ifr->ifr_qlen = dev->tx_queue_len;
4665                 return 0;
4666
4667         default:
4668                 /* dev_ioctl() should ensure this case
4669                  * is never reached
4670                  */
4671                 WARN_ON(1);
4672                 err = -EINVAL;
4673                 break;
4674
4675         }
4676         return err;
4677 }
4678
4679 /*
4680  *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
4681  */
4682 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4683 {
4684         int err;
4685         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4686         const struct net_device_ops *ops;
4687
4688         if (!dev)
4689                 return -ENODEV;
4690
4691         ops = dev->netdev_ops;
4692
4693         switch (cmd) {
4694         case SIOCSIFFLAGS:      /* Set interface flags */
4695                 return dev_change_flags(dev, ifr->ifr_flags);
4696
4697         case SIOCSIFMETRIC:     /* Set the metric on the interface
4698                                    (currently unused) */
4699                 return -EOPNOTSUPP;
4700
4701         case SIOCSIFMTU:        /* Set the MTU of a device */
4702                 return dev_set_mtu(dev, ifr->ifr_mtu);
4703
4704         case SIOCSIFHWADDR:
4705                 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4706
4707         case SIOCSIFHWBROADCAST:
4708                 if (ifr->ifr_hwaddr.sa_family != dev->type)
4709                         return -EINVAL;
4710                 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4711                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4712                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4713                 return 0;
4714
4715         case SIOCSIFMAP:
4716                 if (ops->ndo_set_config) {
4717                         if (!netif_device_present(dev))
4718                                 return -ENODEV;
4719                         return ops->ndo_set_config(dev, &ifr->ifr_map);
4720                 }
4721                 return -EOPNOTSUPP;
4722
4723         case SIOCADDMULTI:
4724                 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4725                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4726                         return -EINVAL;
4727                 if (!netif_device_present(dev))
4728                         return -ENODEV;
4729                 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4730
4731         case SIOCDELMULTI:
4732                 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4733                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4734                         return -EINVAL;
4735                 if (!netif_device_present(dev))
4736                         return -ENODEV;
4737                 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4738
4739         case SIOCSIFTXQLEN:
4740                 if (ifr->ifr_qlen < 0)
4741                         return -EINVAL;
4742                 dev->tx_queue_len = ifr->ifr_qlen;
4743                 return 0;
4744
4745         case SIOCSIFNAME:
4746                 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4747                 return dev_change_name(dev, ifr->ifr_newname);
4748
4749         /*
4750          *      Unknown or private ioctl
4751          */
4752         default:
4753                 if ((cmd >= SIOCDEVPRIVATE &&
4754                     cmd <= SIOCDEVPRIVATE + 15) ||
4755                     cmd == SIOCBONDENSLAVE ||
4756                     cmd == SIOCBONDRELEASE ||
4757                     cmd == SIOCBONDSETHWADDR ||
4758                     cmd == SIOCBONDSLAVEINFOQUERY ||
4759                     cmd == SIOCBONDINFOQUERY ||
4760                     cmd == SIOCBONDCHANGEACTIVE ||
4761                     cmd == SIOCGMIIPHY ||
4762                     cmd == SIOCGMIIREG ||
4763                     cmd == SIOCSMIIREG ||
4764                     cmd == SIOCBRADDIF ||
4765                     cmd == SIOCBRDELIF ||
4766                     cmd == SIOCSHWTSTAMP ||
4767                     cmd == SIOCWANDEV) {
4768                         err = -EOPNOTSUPP;
4769                         if (ops->ndo_do_ioctl) {
4770                                 if (netif_device_present(dev))
4771                                         err = ops->ndo_do_ioctl(dev, ifr, cmd);
4772                                 else
4773                                         err = -ENODEV;
4774                         }
4775                 } else
4776                         err = -EINVAL;
4777
4778         }
4779         return err;
4780 }
4781
4782 /*
4783  *      This function handles all "interface"-type I/O control requests. The actual
4784  *      'doing' part of this is dev_ifsioc above.
4785  */
4786
4787 /**
4788  *      dev_ioctl       -       network device ioctl
4789  *      @net: the applicable net namespace
4790  *      @cmd: command to issue
4791  *      @arg: pointer to a struct ifreq in user space
4792  *
4793  *      Issue ioctl functions to devices. This is normally called by the
4794  *      user space syscall interfaces but can sometimes be useful for
4795  *      other purposes. The return value is the return from the syscall if
4796  *      positive or a negative errno code on error.
4797  */
4798
4799 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4800 {
4801         struct ifreq ifr;
4802         int ret;
4803         char *colon;
4804
4805         /* One special case: SIOCGIFCONF takes ifconf argument
4806            and requires shared lock, because it sleeps writing
4807            to user space.
4808          */
4809
4810         if (cmd == SIOCGIFCONF) {
4811                 rtnl_lock();
4812                 ret = dev_ifconf(net, (char __user *) arg);
4813                 rtnl_unlock();
4814                 return ret;
4815         }
4816         if (cmd == SIOCGIFNAME)
4817                 return dev_ifname(net, (struct ifreq __user *)arg);
4818
4819         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4820                 return -EFAULT;
4821
4822         ifr.ifr_name[IFNAMSIZ-1] = 0;
4823
4824         colon = strchr(ifr.ifr_name, ':');
4825         if (colon)
4826                 *colon = 0;
4827
4828         /*
4829          *      See which interface the caller is talking about.
4830          */
4831
4832         switch (cmd) {
4833         /*
4834          *      These ioctl calls:
4835          *      - can be done by all.
4836          *      - atomic and do not require locking.
4837          *      - return a value
4838          */
4839         case SIOCGIFFLAGS:
4840         case SIOCGIFMETRIC:
4841         case SIOCGIFMTU:
4842         case SIOCGIFHWADDR:
4843         case SIOCGIFSLAVE:
4844         case SIOCGIFMAP:
4845         case SIOCGIFINDEX:
4846         case SIOCGIFTXQLEN:
4847                 dev_load(net, ifr.ifr_name);
4848                 rcu_read_lock();
4849                 ret = dev_ifsioc_locked(net, &ifr, cmd);
4850                 rcu_read_unlock();
4851                 if (!ret) {
4852                         if (colon)
4853                                 *colon = ':';
4854                         if (copy_to_user(arg, &ifr,
4855                                          sizeof(struct ifreq)))
4856                                 ret = -EFAULT;
4857                 }
4858                 return ret;
4859
4860         case SIOCETHTOOL:
4861                 dev_load(net, ifr.ifr_name);
4862                 rtnl_lock();
4863                 ret = dev_ethtool(net, &ifr);
4864                 rtnl_unlock();
4865                 if (!ret) {
4866                         if (colon)
4867                                 *colon = ':';
4868                         if (copy_to_user(arg, &ifr,
4869                                          sizeof(struct ifreq)))
4870                                 ret = -EFAULT;
4871                 }
4872                 return ret;
4873
4874         /*
4875          *      These ioctl calls:
4876          *      - require superuser power.
4877          *      - require strict serialization.
4878          *      - return a value
4879          */
4880         case SIOCGMIIPHY:
4881         case SIOCGMIIREG:
4882         case SIOCSIFNAME:
4883                 if (!capable(CAP_NET_ADMIN))
4884                         return -EPERM;
4885                 dev_load(net, ifr.ifr_name);
4886                 rtnl_lock();
4887                 ret = dev_ifsioc(net, &ifr, cmd);
4888                 rtnl_unlock();
4889                 if (!ret) {
4890                         if (colon)
4891                                 *colon = ':';
4892                         if (copy_to_user(arg, &ifr,
4893                                          sizeof(struct ifreq)))
4894                                 ret = -EFAULT;
4895                 }
4896                 return ret;
4897
4898         /*
4899          *      These ioctl calls:
4900          *      - require superuser power.
4901          *      - require strict serialization.
4902          *      - do not return a value
4903          */
4904         case SIOCSIFFLAGS:
4905         case SIOCSIFMETRIC:
4906         case SIOCSIFMTU:
4907         case SIOCSIFMAP:
4908         case SIOCSIFHWADDR:
4909         case SIOCSIFSLAVE:
4910         case SIOCADDMULTI:
4911         case SIOCDELMULTI:
4912         case SIOCSIFHWBROADCAST:
4913         case SIOCSIFTXQLEN:
4914         case SIOCSMIIREG:
4915         case SIOCBONDENSLAVE:
4916         case SIOCBONDRELEASE:
4917         case SIOCBONDSETHWADDR:
4918         case SIOCBONDCHANGEACTIVE:
4919         case SIOCBRADDIF:
4920         case SIOCBRDELIF:
4921         case SIOCSHWTSTAMP:
4922                 if (!capable(CAP_NET_ADMIN))
4923                         return -EPERM;
4924                 /* fall through */
4925         case SIOCBONDSLAVEINFOQUERY:
4926         case SIOCBONDINFOQUERY:
4927                 dev_load(net, ifr.ifr_name);
4928                 rtnl_lock();
4929                 ret = dev_ifsioc(net, &ifr, cmd);
4930                 rtnl_unlock();
4931                 return ret;
4932
4933         case SIOCGIFMEM:
4934                 /* Get the per device memory space. We can add this but
4935                  * currently do not support it */
4936         case SIOCSIFMEM:
4937                 /* Set the per device memory buffer space.
4938                  * Not applicable in our case */
4939         case SIOCSIFLINK:
4940                 return -EINVAL;
4941
4942         /*
4943          *      Unknown or private ioctl.
4944          */
4945         default:
4946                 if (cmd == SIOCWANDEV ||
4947                     (cmd >= SIOCDEVPRIVATE &&
4948                      cmd <= SIOCDEVPRIVATE + 15)) {
4949                         dev_load(net, ifr.ifr_name);
4950                         rtnl_lock();
4951                         ret = dev_ifsioc(net, &ifr, cmd);
4952                         rtnl_unlock();
4953                         if (!ret && copy_to_user(arg, &ifr,
4954                                                  sizeof(struct ifreq)))
4955                                 ret = -EFAULT;
4956                         return ret;
4957                 }
4958                 /* Take care of Wireless Extensions */
4959                 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4960                         return wext_handle_ioctl(net, &ifr, cmd, arg);
4961                 return -EINVAL;
4962         }
4963 }
4964
4965
/**
 *	dev_new_index	-	allocate an ifindex
 *	@net: the applicable net namespace
 *
 *	Returns a suitable unique value for a new device interface
 *	number.  The caller must hold the rtnl semaphore or the
 *	dev_base_lock to be sure it remains unique.
 *
 *	Candidates are taken from a counter shared by all callers; it
 *	restarts at 1 once it is no longer positive, and values already
 *	used by a device in @net are skipped.
 */
static int dev_new_index(struct net *net)
{
	static int ifindex;

	do {
		ifindex++;
		if (ifindex <= 0)
			ifindex = 1;
	} while (__dev_get_by_index(net, ifindex));

	return ifindex;
}
4984
4985 /* Delayed registration/unregisteration */
4986 static LIST_HEAD(net_todo_list);
4987
4988 static void net_set_todo(struct net_device *dev)
4989 {
4990         list_add_tail(&dev->todo_list, &net_todo_list);
4991 }
4992
4993 static void rollback_registered_many(struct list_head *head)
4994 {
4995         struct net_device *dev, *tmp;
4996
4997         BUG_ON(dev_boot_phase);
4998         ASSERT_RTNL();
4999
5000         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5001                 /* Some devices call without registering
5002                  * for initialization unwind. Remove those
5003                  * devices and proceed with the remaining.
5004                  */
5005                 if (dev->reg_state == NETREG_UNINITIALIZED) {
5006                         pr_debug("unregister_netdevice: device %s/%p never "
5007                                  "was registered\n", dev->name, dev);
5008
5009                         WARN_ON(1);
5010                         list_del(&dev->unreg_list);
5011                         continue;
5012                 }
5013
5014                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5015         }
5016
5017         /* If device is running, close it first. */
5018         dev_close_many(head);
5019
5020         list_for_each_entry(dev, head, unreg_list) {
5021                 /* And unlink it from device chain. */
5022                 unlist_netdevice(dev);
5023
5024                 dev->reg_state = NETREG_UNREGISTERING;
5025         }
5026
5027         synchronize_net();
5028
5029         list_for_each_entry(dev, head, unreg_list) {
5030                 /* Shutdown queueing discipline. */
5031                 dev_shutdown(dev);
5032
5033
5034                 /* Notify protocols, that we are about to destroy
5035                    this device. They should clean all the things.
5036                 */
5037                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5038
5039                 if (!dev->rtnl_link_ops ||
5040                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5041                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5042
5043                 /*
5044                  *      Flush the unicast and multicast chains
5045                  */
5046                 dev_uc_flush(dev);
5047                 dev_mc_flush(dev);
5048
5049                 if (dev->netdev_ops->ndo_uninit)
5050                         dev->netdev_ops->ndo_uninit(dev);
5051
5052                 /* Notifier chain MUST detach us from master device. */
5053                 WARN_ON(dev->master);
5054
5055                 /* Remove entries from kobject tree */
5056                 netdev_unregister_kobject(dev);
5057         }
5058
5059         /* Process any work delayed until the end of the batch */
5060         dev = list_first_entry(head, struct net_device, unreg_list);
5061         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5062
5063         rcu_barrier();
5064
5065         list_for_each_entry(dev, head, unreg_list)
5066                 dev_put(dev);
5067 }
5068
5069 static void rollback_registered(struct net_device *dev)
5070 {
5071         LIST_HEAD(single);
5072
5073         list_add(&dev->unreg_list, &single);
5074         rollback_registered_many(&single);
5075 }
5076
5077 unsigned long netdev_fix_features(unsigned long features, const char *name)
5078 {
5079         /* Fix illegal SG+CSUM combinations. */
5080         if ((features & NETIF_F_SG) &&
5081             !(features & NETIF_F_ALL_CSUM)) {
5082                 if (name)
5083                         printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
5084                                "checksum feature.\n", name);
5085                 features &= ~NETIF_F_SG;
5086         }
5087
5088         /* TSO requires that SG is present as well. */
5089         if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
5090                 if (name)
5091                         printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
5092                                "SG feature.\n", name);
5093                 features &= ~NETIF_F_TSO;
5094         }
5095
5096         if (features & NETIF_F_UFO) {
5097                 /* maybe split UFO into V4 and V6? */
5098                 if (!((features & NETIF_F_GEN_CSUM) ||
5099                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5100                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5101                         if (name)
5102                                 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
5103                                        "since no checksum offload features.\n",
5104                                        name);
5105                         features &= ~NETIF_F_UFO;
5106                 }
5107
5108                 if (!(features & NETIF_F_SG)) {
5109                         if (name)
5110                                 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
5111                                        "since no NETIF_F_SG feature.\n", name);
5112                         features &= ~NETIF_F_UFO;
5113                 }
5114         }
5115
5116         return features;
5117 }
5118 EXPORT_SYMBOL(netdev_fix_features);
5119
5120 /**
5121  *      netif_stacked_transfer_operstate -      transfer operstate
5122  *      @rootdev: the root or lower level device to transfer state from
5123  *      @dev: the device to transfer operstate to
5124  *
5125  *      Transfer operational state from root to device. This is normally
5126  *      called when a stacking relationship exists between the root
5127  *      device and the device(a leaf device).
5128  */
5129 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5130                                         struct net_device *dev)
5131 {
5132         if (rootdev->operstate == IF_OPER_DORMANT)
5133                 netif_dormant_on(dev);
5134         else
5135                 netif_dormant_off(dev);
5136
5137         if (netif_carrier_ok(rootdev)) {
5138                 if (!netif_carrier_ok(dev))
5139                         netif_carrier_on(dev);
5140         } else {
5141                 if (netif_carrier_ok(dev))
5142                         netif_carrier_off(dev);
5143         }
5144 }
5145 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5146
#ifdef CONFIG_RPS
/*
 *	Allocate the zero-filled array of dev->num_rx_queues RX queue
 *	structures, store it in dev->_rx and point every entry back at
 *	@dev.  Returns 0 on success or -ENOMEM.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int nqueues = dev->num_rx_queues;
	struct netdev_rx_queue *queues;
	unsigned int idx;

	BUG_ON(nqueues < 1);

	queues = kcalloc(nqueues, sizeof(*queues), GFP_KERNEL);
	if (queues == NULL) {
		pr_err("netdev: Unable to allocate %u rx queues.\n", nqueues);
		return -ENOMEM;
	}
	dev->_rx = queues;

	for (idx = 0; idx < nqueues; idx++)
		queues[idx].dev = dev;

	return 0;
}
#endif
5167
5168 static void netdev_init_one_queue(struct net_device *dev,
5169                                   struct netdev_queue *queue, void *_unused)
5170 {
5171         /* Initialize queue lock */
5172         spin_lock_init(&queue->_xmit_lock);
5173         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5174         queue->xmit_lock_owner = -1;
5175         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5176         queue->dev = dev;
5177 }
5178
5179 static int netif_alloc_netdev_queues(struct net_device *dev)
5180 {
5181         unsigned int count = dev->num_tx_queues;
5182         struct netdev_queue *tx;
5183
5184         BUG_ON(count < 1);
5185
5186         tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5187         if (!tx) {
5188                 pr_err("netdev: Unable to allocate %u tx queues.\n",
5189                        count);
5190                 return -ENOMEM;
5191         }
5192         dev->_tx = tx;
5193
5194         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5195         spin_lock_init(&dev->tx_global_lock);
5196
5197         return 0;
5198 }
5199
/**
 *      register_netdevice      - register a network device
 *      @dev: device to register
 *
 *      Take a completed network device structure and add it to the kernel
 *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *      chain. 0 is returned on success. A negative errno code is returned
 *      on a failure to set up the device, or if the name is a duplicate.
 *
 *      Callers must hold the rtnl semaphore. You may want
 *      register_netdev() instead of this.
 *
 *      Failures after the optional ndo_init() step, up to and including
 *      netdev_register_kobject(), are unwound by calling the driver's
 *      ndo_uninit() hook when one is provided.  A failure reported by the
 *      %NETDEV_REGISTER notifiers is unwound with rollback_registered().
 *
 *      BUGS:
 *      The locking appears insufficient to guarantee two parallel registers
 *      will not get the same name.
 */

int register_netdevice(struct net_device *dev)
{
        int ret;
        struct net *net = dev_net(dev);

        BUG_ON(dev_boot_phase);
        ASSERT_RTNL();

        might_sleep();

        /* When net_device's are persistent, this will be fatal. */
        BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
        BUG_ON(!net);

        spin_lock_init(&dev->addr_list_lock);
        netdev_set_addr_lockdep_class(dev);

        /* -1 marks iflink as "not set"; it is checked again once ifindex
         * has been assigned below.
         */
        dev->iflink = -1;

        /* Init, if this function is available */
        if (dev->netdev_ops->ndo_init) {
                ret = dev->netdev_ops->ndo_init(dev);
                if (ret) {
                        /* Positive return values are reported as -EIO */
                        if (ret > 0)
                                ret = -EIO;
                        goto out;
                }
        }

        ret = dev_get_valid_name(dev, dev->name, 0);
        if (ret)
                goto err_uninit;

        dev->ifindex = dev_new_index(net);
        /* iflink defaults to the device's own ifindex if nothing set it */
        if (dev->iflink == -1)
                dev->iflink = dev->ifindex;

        /* Fix illegal checksum combinations */
        if ((dev->features & NETIF_F_HW_CSUM) &&
            (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
                printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
                       dev->name);
                dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
        }

        if ((dev->features & NETIF_F_NO_CSUM) &&
            (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
                printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
                       dev->name);
                dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
        }

        dev->features = netdev_fix_features(dev->features, dev->name);

        /* Enable software GSO if SG is supported. */
        if (dev->features & NETIF_F_SG)
                dev->features |= NETIF_F_GSO;

        /* Enable GRO and NETIF_F_HIGHDMA for vlans by default,
         * vlan_dev_init() will do the dev->features check, so these features
         * are enabled only if supported by underlying device.
         */
        dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);

        /* Any notifier may veto the registration at this point */
        ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
        ret = notifier_to_errno(ret);
        if (ret)
                goto err_uninit;

        ret = netdev_register_kobject(dev);
        if (ret)
                goto err_uninit;
        dev->reg_state = NETREG_REGISTERED;

        /*
         *      Default initial state at registry is that the
         *      device is present.
         */

        set_bit(__LINK_STATE_PRESENT, &dev->state);

        dev_init_scheduler(dev);
        dev_hold(dev);
        list_netdevice(dev);

        /* Notify protocols, that a new device appeared. */
        ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
        ret = notifier_to_errno(ret);
        if (ret) {
                /* A notifier refused the device: undo the registration.
                 * Note that execution still falls through to the
                 * RTM_NEWLINK check below with ret holding the error.
                 */
                rollback_registered(dev);
                dev->reg_state = NETREG_UNREGISTERED;
        }
        /*
         *      Prevent userspace races by waiting until the network
         *      device is fully setup before sending notifications.
         */
        if (!dev->rtnl_link_ops ||
            dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
                rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);

out:
        return ret;

err_uninit:
        /* Give the driver a chance to undo its ndo_init() work */
        if (dev->netdev_ops->ndo_uninit)
                dev->netdev_ops->ndo_uninit(dev);
        goto out;
}
5325 EXPORT_SYMBOL(register_netdevice);
5326
5327 /**
5328  *      init_dummy_netdev       - init a dummy network device for NAPI
5329  *      @dev: device to init
5330  *
5331  *      This takes a network device structure and initialize the minimum
5332  *      amount of fields so it can be used to schedule NAPI polls without
5333  *      registering a full blown interface. This is to be used by drivers
5334  *      that need to tie several hardware interfaces to a single NAPI
5335  *      poll scheduler due to HW limitations.
5336  */
5337 int init_dummy_netdev(struct net_device *dev)
5338 {
5339         /* Clear everything. Note we don't initialize spinlocks
5340          * are they aren't supposed to be taken by any of the
5341          * NAPI code and this dummy netdev is supposed to be
5342          * only ever used for NAPI polls
5343          */
5344         memset(dev, 0, sizeof(struct net_device));
5345
5346         /* make sure we BUG if trying to hit standard
5347          * register/unregister code path
5348          */
5349         dev->reg_state = NETREG_DUMMY;
5350
5351         /* NAPI wants this */
5352         INIT_LIST_HEAD(&dev->napi_list);
5353
5354         /* a dummy interface is started by default */
5355         set_bit(__LINK_STATE_PRESENT, &dev->state);
5356         set_bit(__LINK_STATE_START, &dev->state);
5357
5358         /* Note : We dont allocate pcpu_refcnt for dummy devices,
5359          * because users of this 'device' dont need to change
5360          * its refcount.
5361          */
5362
5363         return 0;
5364 }
5365 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5366
5367
5368 /**
5369  *      register_netdev - register a network device
5370  *      @dev: device to register
5371  *
5372  *      Take a completed network device structure and add it to the kernel
5373  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5374  *      chain. 0 is returned on success. A negative errno code is returned
5375  *      on a failure to set up the device, or if the name is a duplicate.
5376  *
5377  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
5378  *      and expands the device name if you passed a format string to
5379  *      alloc_netdev.
5380  */
5381 int register_netdev(struct net_device *dev)
5382 {
5383         int err;
5384
5385         rtnl_lock();
5386
5387         /*
5388          * If the name is a format string the caller wants us to do a
5389          * name allocation.
5390          */
5391         if (strchr(dev->name, '%')) {
5392                 err = dev_alloc_name(dev, dev->name);
5393                 if (err < 0)
5394                         goto out;
5395         }
5396
5397         err = register_netdevice(dev);
5398 out:
5399         rtnl_unlock();
5400         return err;
5401 }
5402 EXPORT_SYMBOL(register_netdev);
5403
5404 int netdev_refcnt_read(const struct net_device *dev)
5405 {
5406         int i, refcnt = 0;
5407
5408         for_each_possible_cpu(i)
5409                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5410         return refcnt;
5411 }
5412 EXPORT_SYMBOL(netdev_refcnt_read);
5413
/*
 * netdev_wait_allrefs - wait until all references are gone.
 *
 * This is called when unregistering network devices.
 *
 * Any protocol or device that holds a reference should register
 * for netdevice notification, and cleanup and put back the
 * reference if they receive an UNREGISTER event.
 * We can get stuck here if buggy protocols don't correctly
 * call dev_put.
 *
 * The loop polls the reference count every 250ms.  While references
 * remain, NETDEV_UNREGISTER is rebroadcast once more than a second has
 * passed since the previous broadcast, and a KERN_EMERG message is
 * printed once more than ten seconds have passed since the last warning.
 */
static void netdev_wait_allrefs(struct net_device *dev)
{
        unsigned long rebroadcast_time, warning_time;
        int refcnt;

        linkwatch_forget_dev(dev);

        rebroadcast_time = warning_time = jiffies;
        refcnt = netdev_refcnt_read(dev);

        while (refcnt != 0) {
                if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
                        /* Notifiers are called with the rtnl lock held */
                        rtnl_lock();

                        /* Rebroadcast unregister notification */
                        call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
                        /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
                         * should have already handle it the first time */

                        if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
                                     &dev->state)) {
                                /* We must not have linkwatch events
                                 * pending on unregister. If this
                                 * happens, we simply run the queue
                                 * unscheduled, resulting in a noop
                                 * for this device.
                                 */
                                linkwatch_run_queue();
                        }

                        /* NOTE(review): __rtnl_unlock() rather than
                         * rtnl_unlock() - presumably so the todo list is
                         * not processed again from here; confirm.
                         */
                        __rtnl_unlock();

                        rebroadcast_time = jiffies;
                }

                msleep(250);

                refcnt = netdev_refcnt_read(dev);

                if (time_after(jiffies, warning_time + 10 * HZ)) {
                        printk(KERN_EMERG "unregister_netdevice: "
                               "waiting for %s to become free. Usage "
                               "count = %d\n",
                               dev->name, refcnt);
                        warning_time = jiffies;
                }
        }
}
5473
/* The sequence is:
 *
 *      rtnl_lock();
 *      ...
 *      register_netdevice(x1);
 *      register_netdevice(x2);
 *      ...
 *      unregister_netdevice(y1);
 *      unregister_netdevice(y2);
 *      ...
 *      rtnl_unlock();
 *      free_netdev(y1);
 *      free_netdev(y2);
 *
 * We are invoked by rtnl_unlock().
 * This allows us to deal with problems:
 * 1) We can delete sysfs objects which invoke hotplug
 *    without deadlocking with linkwatch via keventd.
 * 2) Since we run with the RTNL semaphore not held, we can sleep
 *    safely in order to wait for the netdev refcnt to drop to zero.
 *
 * We must not return until all unregister events added during
 * the interval the lock was held have been completed.
 *
 * For every queued device the steps are: mark it NETREG_UNREGISTERED,
 * run flush_backlog on each CPU, wait for its references to drop, call
 * its destructor (if any) and drop the kobject reference.
 */
void netdev_run_todo(void)
{
        struct list_head list;

        /* Snapshot list, allow later requests */
        list_replace_init(&net_todo_list, &list);

        /* The private snapshot is processed without the rtnl lock */
        __rtnl_unlock();

        while (!list_empty(&list)) {
                struct net_device *dev
                        = list_first_entry(&list, struct net_device, todo_list);
                list_del(&dev->todo_list);

                /* Only devices in the UNREGISTERING state belong here;
                 * anything else is reported and skipped.
                 */
                if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
                        printk(KERN_ERR "network todo '%s' but state %d\n",
                               dev->name, dev->reg_state);
                        dump_stack();
                        continue;
                }

                dev->reg_state = NETREG_UNREGISTERED;

                /* Run flush_backlog for this device on every CPU and wait */
                on_each_cpu(flush_backlog, dev, 1);

                netdev_wait_allrefs(dev);

                /* paranoia */
                BUG_ON(netdev_refcnt_read(dev));
                WARN_ON(rcu_dereference_raw(dev->ip_ptr));
                WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
                WARN_ON(dev->dn_ptr);

                if (dev->destructor)
                        dev->destructor(dev);

                /* Free network device */
                kobject_put(&dev->dev.kobj);
        }
}
5538
5539 /**
5540  *      dev_txq_stats_fold - fold tx_queues stats
5541  *      @dev: device to get statistics from
5542  *      @stats: struct rtnl_link_stats64 to hold results
5543  */
5544 void dev_txq_stats_fold(const struct net_device *dev,
5545                         struct rtnl_link_stats64 *stats)
5546 {
5547         u64 tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
5548         unsigned int i;
5549         struct netdev_queue *txq;
5550
5551         for (i = 0; i < dev->num_tx_queues; i++) {
5552                 txq = netdev_get_tx_queue(dev, i);
5553                 spin_lock_bh(&txq->_xmit_lock);
5554                 tx_bytes   += txq->tx_bytes;
5555                 tx_packets += txq->tx_packets;
5556                 tx_dropped += txq->tx_dropped;
5557                 spin_unlock_bh(&txq->_xmit_lock);
5558         }
5559         if (tx_bytes || tx_packets || tx_dropped) {
5560                 stats->tx_bytes   = tx_bytes;
5561                 stats->tx_packets = tx_packets;
5562                 stats->tx_dropped = tx_dropped;
5563         }
5564 }
5565 EXPORT_SYMBOL(dev_txq_stats_fold);
5566
5567 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5568  * fields in the same order, with only the type differing.
5569  */
5570 static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5571                                     const struct net_device_stats *netdev_stats)
5572 {
5573 #if BITS_PER_LONG == 64
5574         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5575         memcpy(stats64, netdev_stats, sizeof(*stats64));
5576 #else
5577         size_t i, n = sizeof(*stats64) / sizeof(u64);
5578         const unsigned long *src = (const unsigned long *)netdev_stats;
5579         u64 *dst = (u64 *)stats64;
5580
5581         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5582                      sizeof(*stats64) / sizeof(u64));
5583         for (i = 0; i < n; i++)
5584                 dst[i] = src[i];
5585 #endif
5586 }
5587
5588 /**
5589  *      dev_get_stats   - get network device statistics
5590  *      @dev: device to get statistics from
5591  *      @storage: place to store stats
5592  *
5593  *      Get network statistics from device. Return @storage.
5594  *      The device driver may provide its own method by setting
5595  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5596  *      otherwise the internal statistics structure is used.
5597  */
5598 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5599                                         struct rtnl_link_stats64 *storage)
5600 {
5601         const struct net_device_ops *ops = dev->netdev_ops;
5602
5603         if (ops->ndo_get_stats64) {
5604                 memset(storage, 0, sizeof(*storage));
5605                 ops->ndo_get_stats64(dev, storage);
5606         } else if (ops->ndo_get_stats) {
5607                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5608         } else {
5609                 netdev_stats_to_stats64(storage, &dev->stats);
5610                 dev_txq_stats_fold(dev, storage);
5611         }
5612         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5613         return storage;
5614 }
5615 EXPORT_SYMBOL(dev_get_stats);
5616
/*
 * Return the ingress queue of @dev.  With CONFIG_NET_CLS_ACT enabled a
 * missing queue is allocated, initialised with noop_qdisc as both its
 * active and sleeping qdisc, and published in dev->ingress_queue; NULL is
 * returned if that allocation fails.  Without CONFIG_NET_CLS_ACT the
 * current value of dev_ingress_queue() is returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
        struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
        if (!queue) {
                queue = kzalloc(sizeof(*queue), GFP_KERNEL);
                if (queue) {
                        netdev_init_one_queue(dev, queue, NULL);
                        queue->qdisc = &noop_qdisc;
                        queue->qdisc_sleeping = &noop_qdisc;
                        /* Publish only after the queue is fully set up */
                        rcu_assign_pointer(dev->ingress_queue, queue);
                }
        }
#endif
        return queue;
}
5634
5635 /**
5636  *      alloc_netdev_mq - allocate network device
5637  *      @sizeof_priv:   size of private data to allocate space for
5638  *      @name:          device name format string
5639  *      @setup:         callback to initialize device
5640  *      @queue_count:   the number of subqueues to allocate
5641  *
5642  *      Allocates a struct net_device with private data area for driver use
5643  *      and performs basic initialization.  Also allocates subquue structs
5644  *      for each queue on the device at the end of the netdevice.
5645  */
5646 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5647                 void (*setup)(struct net_device *), unsigned int queue_count)
5648 {
5649         struct net_device *dev;
5650         size_t alloc_size;
5651         struct net_device *p;
5652
5653         BUG_ON(strlen(name) >= sizeof(dev->name));
5654
5655         if (queue_count < 1) {
5656                 pr_err("alloc_netdev: Unable to allocate device "
5657                        "with zero queues.\n");
5658                 return NULL;
5659         }
5660
5661         alloc_size = sizeof(struct net_device);
5662         if (sizeof_priv) {
5663                 /* ensure 32-byte alignment of private area */
5664                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5665                 alloc_size += sizeof_priv;
5666         }
5667         /* ensure 32-byte alignment of whole construct */
5668         alloc_size += NETDEV_ALIGN - 1;
5669
5670         p = kzalloc(alloc_size, GFP_KERNEL);
5671         if (!p) {
5672                 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5673                 return NULL;
5674         }
5675
5676         dev = PTR_ALIGN(p, NETDEV_ALIGN);
5677         dev->padded = (char *)dev - (char *)p;
5678
5679         dev->pcpu_refcnt = alloc_percpu(int);
5680         if (!dev->pcpu_refcnt)
5681                 goto free_p;
5682
5683         if (dev_addr_init(dev))
5684                 goto free_pcpu;
5685
5686         dev_mc_init(dev);
5687         dev_uc_init(dev);
5688
5689         dev_net_set(dev, &init_net);
5690
5691         dev->num_tx_queues = queue_count;
5692         dev->real_num_tx_queues = queue_count;
5693         if (netif_alloc_netdev_queues(dev))
5694                 goto free_pcpu;
5695
5696 #ifdef CONFIG_RPS
5697         dev->num_rx_queues = queue_count;
5698         dev->real_num_rx_queues = queue_count;
5699         if (netif_alloc_rx_queues(dev))
5700                 goto free_pcpu;
5701 #endif
5702
5703         dev->gso_max_size = GSO_MAX_SIZE;
5704
5705         INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5706         dev->ethtool_ntuple_list.count = 0;
5707         INIT_LIST_HEAD(&dev->napi_list);
5708         INIT_LIST_HEAD(&dev->unreg_list);
5709         INIT_LIST_HEAD(&dev->link_watch_list);
5710         dev->priv_flags = IFF_XMIT_DST_RELEASE;
5711         setup(dev);
5712         strcpy(dev->name, name);
5713         return dev;
5714
5715 free_pcpu:
5716         free_percpu(dev->pcpu_refcnt);
5717         kfree(dev->_tx);
5718 #ifdef CONFIG_RPS
5719         kfree(dev->_rx);
5720 #endif
5721
5722 free_p:
5723         kfree(p);
5724         return NULL;
5725 }
5726 EXPORT_SYMBOL(alloc_netdev_mq);
5727
/**
 *      free_netdev - free network device
 *      @dev: device
 *
 *      This function does the last stage of destroying an allocated device
 *      interface. The reference to the device object is released.
 *      If this is the last reference then it will be freed.
 *
 *      A device that was never registered (NETREG_UNINITIALIZED) is freed
 *      directly.  Otherwise the device must be in the NETREG_UNREGISTERED
 *      state; it is moved to NETREG_RELEASED and the final free happens
 *      through the device release path once put_device() drops the last
 *      reference.
 */
void free_netdev(struct net_device *dev)
{
        struct napi_struct *p, *n;

        release_net(dev_net(dev));

        /* Queue arrays allocated by alloc_netdev_mq() */
        kfree(dev->_tx);
#ifdef CONFIG_RPS
        kfree(dev->_rx);
#endif

        kfree(rcu_dereference_raw(dev->ingress_queue));

        /* Flush device addresses */
        dev_addr_flush(dev);

        /* Clear ethtool n-tuple list */
        ethtool_ntuple_flush(dev);

        /* Remove every NAPI context still attached to the device */
        list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
                netif_napi_del(p);

        free_percpu(dev->pcpu_refcnt);
        dev->pcpu_refcnt = NULL;

        /*  Compatibility with error handling in drivers */
        if (dev->reg_state == NETREG_UNINITIALIZED) {
                /* dev->padded is the offset back to the start of the
                 * original allocation (see alloc_netdev_mq()).
                 */
                kfree((char *)dev - dev->padded);
                return;
        }

        BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
        dev->reg_state = NETREG_RELEASED;

        /* will free via device release */
        put_device(&dev->dev);
}
5773 EXPORT_SYMBOL(free_netdev);
5774
/**
 *      synchronize_net -  Synchronize with packet receive processing
 *
 *      Wait for packets currently being received to be done.
 *      Does not block later packets from starting.
 *
 *      Implemented as synchronize_rcu(); may sleep, so it must not be
 *      called from atomic context.
 */
void synchronize_net(void)
{
        might_sleep();
        synchronize_rcu();
}
5786 EXPORT_SYMBOL(synchronize_net);
5787
5788 /**
5789  *      unregister_netdevice_queue - remove device from the kernel
5790  *      @dev: device
5791  *      @head: list
5792  *
5793  *      This function shuts down a device interface and removes it
5794  *      from the kernel tables.
5795  *      If head not NULL, device is queued to be unregistered later.
5796  *
5797  *      Callers must hold the rtnl semaphore.  You may want
5798  *      unregister_netdev() instead of this.
5799  */
5800
5801 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5802 {
5803         ASSERT_RTNL();
5804
5805         if (head) {
5806                 list_move_tail(&dev->unreg_list, head);
5807         } else {
5808                 rollback_registered(dev);
5809                 /* Finish processing unregister after unlock */
5810                 net_set_todo(dev);
5811         }
5812 }
5813 EXPORT_SYMBOL(unregister_netdevice_queue);
5814
5815 /**
5816  *      unregister_netdevice_many - unregister many devices
5817  *      @head: list of devices
5818  */
5819 void unregister_netdevice_many(struct list_head *head)
5820 {
5821         struct net_device *dev;
5822
5823         if (!list_empty(head)) {
5824                 rollback_registered_many(head);
5825                 list_for_each_entry(dev, head, unreg_list)
5826                         net_set_todo(dev);
5827         }
5828 }
5829 EXPORT_SYMBOL(unregister_netdevice_many);
5830
/**
 *      unregister_netdev - remove device from the kernel
 *      @dev: device
 *
 *      This function shuts down a device interface and removes it
 *      from the kernel tables.
 *
 *      This is just a wrapper for unregister_netdevice that takes
 *      the rtnl semaphore.  In general you want to use this and not
 *      unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
        rtnl_lock();
        unregister_netdevice(dev);
        /* rtnl_unlock() also runs the deferred unregister work
         * (see netdev_run_todo()).
         */
        rtnl_unlock();
}
5848 EXPORT_SYMBOL(unregister_netdev);
5849
/**
 *      dev_change_net_namespace - move device to different network namespace
 *      @dev: device
 *      @net: network namespace
 *      @pat: If not NULL name pattern to try if the current device name
 *            is already taken in the destination network namespace.
 *
 *      This function shuts down a device interface and moves it
 *      to a new network namespace. On success 0 is returned, on
 *      a failure a negative errno code is returned.
 *
 *      Returns -EINVAL if the device is namespace-local or not registered,
 *      and -EEXIST if its name is taken in @net and no usable name could
 *      be derived from @pat.  Moving a device to the namespace it is
 *      already in succeeds without doing anything.
 *
 *      Callers must hold the rtnl semaphore.
 */

int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
{
        int err;

        ASSERT_RTNL();

        /* Don't allow namespace local devices to be moved. */
        err = -EINVAL;
        if (dev->features & NETIF_F_NETNS_LOCAL)
                goto out;

        /* Ensure the device has been registered */
        err = -EINVAL;
        if (dev->reg_state != NETREG_REGISTERED)
                goto out;

        /* Get out if there is nothing to do */
        err = 0;
        if (net_eq(dev_net(dev), net))
                goto out;

        /* Pick the destination device name, and ensure
         * we can use it in the destination network namespace.
         */
        err = -EEXIST;
        if (__dev_get_by_name(net, dev->name)) {
                /* We get here if we can't use the current device name */
                if (!pat)
                        goto out;
                if (dev_get_valid_name(dev, pat, 1))
                        goto out;
        }

        /*
         * And now a mini version of register_netdevice unregister_netdevice.
         * Nothing below this point bails out with an error.
         */

        /* If device is running close it first. */
        dev_close(dev);

        /* And unlink it from device chain */
        err = -ENODEV;
        unlist_netdevice(dev);

        synchronize_net();

        /* Shutdown queueing discipline. */
        dev_shutdown(dev);

        /* Notify protocols, that we are about to destroy
           this device. They should clean all the things.

           Note that dev->reg_state stays at NETREG_REGISTERED.
           This is wanted because this way 8021q and macvlan know
           the device is just moving and can keep their slaves up.
        */
        call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
        call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);

        /*
         *      Flush the unicast and multicast chains
         */
        dev_uc_flush(dev);
        dev_mc_flush(dev);

        /* Actually switch the network namespace */
        dev_net_set(dev, net);

        /* If there is an ifindex conflict assign a new one */
        if (__dev_get_by_index(net, dev->ifindex)) {
                /* Remember whether iflink referred to the device itself
                 * so that it can follow the new ifindex.
                 */
                int iflink = (dev->iflink == dev->ifindex);
                dev->ifindex = dev_new_index(net);
                if (iflink)
                        dev->iflink = dev->ifindex;
        }

        /* Fixup kobjects */
        err = device_rename(&dev->dev, dev->name);
        /* A rename failure is only warned about; the move continues */
        WARN_ON(err);

        /* Add the device back in the hashes */
        list_netdevice(dev);

        /* Notify protocols, that a new device appeared. */
        call_netdevice_notifiers(NETDEV_REGISTER, dev);

        /*
         *      Prevent userspace races by waiting until the network
         *      device is fully setup before sending notifications.
         */
        rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);

        synchronize_net();
        err = 0;
out:
        return err;
}
5961 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
5962
/*
 * CPU notifier callback.  Only CPU_DEAD and CPU_DEAD_FROZEN are acted on:
 * the softnet_data of the dead CPU (passed in @ocpu) is drained into the
 * CPU running this callback.  Its completion and output queues are
 * spliced onto the local ones with interrupts disabled and the TX softirq
 * is raised; skbs left on its process_queue and input_pkt_queue are then
 * fed back in through netif_rx().  Always returns NOTIFY_OK.
 */
static int dev_cpu_callback(struct notifier_block *nfb,
                            unsigned long action,
                            void *ocpu)
{
        struct sk_buff **list_skb;
        struct sk_buff *skb;
        unsigned int cpu, oldcpu = (unsigned long)ocpu;
        struct softnet_data *sd, *oldsd;

        if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
                return NOTIFY_OK;

        local_irq_disable();
        cpu = smp_processor_id();
        sd = &per_cpu(softnet_data, cpu);
        oldsd = &per_cpu(softnet_data, oldcpu);

        /* Find end of our completion_queue. */
        list_skb = &sd->completion_queue;
        while (*list_skb)
                list_skb = &(*list_skb)->next;
        /* Append completion queue from offline CPU. */
        *list_skb = oldsd->completion_queue;
        oldsd->completion_queue = NULL;

        /* Append output queue from offline CPU. */
        if (oldsd->output_queue) {
                *sd->output_queue_tailp = oldsd->output_queue;
                sd->output_queue_tailp = oldsd->output_queue_tailp;
                /* Leave the offline CPU with an empty output queue */
                oldsd->output_queue = NULL;
                oldsd->output_queue_tailp = &oldsd->output_queue;
        }

        /* Make sure the queues just taken over get processed */
        raise_softirq_irqoff(NET_TX_SOFTIRQ);
        local_irq_enable();

        /* Process offline CPU's process_queue, then its input_pkt_queue */
        while ((skb = __skb_dequeue(&oldsd->process_queue))) {
                netif_rx(skb);
                input_queue_head_incr(oldsd);
        }
        while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
                netif_rx(skb);
                input_queue_head_incr(oldsd);
        }

        return NOTIFY_OK;
}
6011
6012
6013 /**
6014  *      netdev_increment_features - increment feature set by one
6015  *      @all: current feature set
6016  *      @one: new feature set
6017  *      @mask: mask feature set
6018  *
6019  *      Computes a new feature set after adding a device with feature set
6020  *      @one to the master device with current feature set @all.  Will not
6021  *      enable anything that is off in @mask. Returns the new feature set.
6022  */
6023 unsigned long netdev_increment_features(unsigned long all, unsigned long one,
6024                                         unsigned long mask)
6025 {
6026         /* If device needs checksumming, downgrade to it. */
6027         if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
6028                 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
6029         else if (mask & NETIF_F_ALL_CSUM) {
6030                 /* If one device supports v4/v6 checksumming, set for all. */
6031                 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
6032                     !(all & NETIF_F_GEN_CSUM)) {
6033                         all &= ~NETIF_F_ALL_CSUM;
6034                         all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
6035                 }
6036
6037                 /* If one device supports hw checksumming, set for all. */
6038                 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
6039                         all &= ~NETIF_F_ALL_CSUM;
6040                         all |= NETIF_F_HW_CSUM;
6041                 }
6042         }
6043
6044         one |= NETIF_F_ALL_CSUM;
6045
6046         one |= all & NETIF_F_ONE_FOR_ALL;
6047         all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
6048         all |= one & mask & NETIF_F_ONE_FOR_ALL;
6049
6050         return all;
6051 }
6052 EXPORT_SYMBOL(netdev_increment_features);
6053
6054 static struct hlist_head *netdev_create_hash(void)
6055 {
6056         int i;
6057         struct hlist_head *hash;
6058
6059         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6060         if (hash != NULL)
6061                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6062                         INIT_HLIST_HEAD(&hash[i]);
6063
6064         return hash;
6065 }
6066
6067 /* Initialize per network namespace state */
6068 static int __net_init netdev_init(struct net *net)
6069 {
6070         INIT_LIST_HEAD(&net->dev_base_head);
6071
6072         net->dev_name_head = netdev_create_hash();
6073         if (net->dev_name_head == NULL)
6074                 goto err_name;
6075
6076         net->dev_index_head = netdev_create_hash();
6077         if (net->dev_index_head == NULL)
6078                 goto err_idx;
6079
6080         return 0;
6081
6082 err_idx:
6083         kfree(net->dev_name_head);
6084 err_name:
6085         return -ENOMEM;
6086 }
6087
6088 /**
6089  *      netdev_drivername - network driver for the device
6090  *      @dev: network device
6091  *      @buffer: buffer for resulting name
6092  *      @len: size of buffer
6093  *
6094  *      Determine network driver for device.
6095  */
6096 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
6097 {
6098         const struct device_driver *driver;
6099         const struct device *parent;
6100
6101         if (len <= 0 || !buffer)
6102                 return buffer;
6103         buffer[0] = 0;
6104
6105         parent = dev->dev.parent;
6106
6107         if (!parent)
6108                 return buffer;
6109
6110         driver = parent->driver;
6111         if (driver && driver->name)
6112                 strlcpy(buffer, driver->name, len);
6113         return buffer;
6114 }
6115
6116 static int __netdev_printk(const char *level, const struct net_device *dev,
6117                            struct va_format *vaf)
6118 {
6119         int r;
6120
6121         if (dev && dev->dev.parent)
6122                 r = dev_printk(level, dev->dev.parent, "%s: %pV",
6123                                netdev_name(dev), vaf);
6124         else if (dev)
6125                 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6126         else
6127                 r = printk("%s(NULL net_device): %pV", level, vaf);
6128
6129         return r;
6130 }
6131
6132 int netdev_printk(const char *level, const struct net_device *dev,
6133                   const char *format, ...)
6134 {
6135         struct va_format vaf;
6136         va_list args;
6137         int r;
6138
6139         va_start(args, format);
6140
6141         vaf.fmt = format;
6142         vaf.va = &args;
6143
6144         r = __netdev_printk(level, dev, &vaf);
6145         va_end(args);
6146
6147         return r;
6148 }
6149 EXPORT_SYMBOL(netdev_printk);
6150
6151 #define define_netdev_printk_level(func, level)                 \
6152 int func(const struct net_device *dev, const char *fmt, ...)    \
6153 {                                                               \
6154         int r;                                                  \
6155         struct va_format vaf;                                   \
6156         va_list args;                                           \
6157                                                                 \
6158         va_start(args, fmt);                                    \
6159                                                                 \
6160         vaf.fmt = fmt;                                          \
6161         vaf.va = &args;                                         \
6162                                                                 \
6163         r = __netdev_printk(level, dev, &vaf);                  \
6164         va_end(args);                                           \
6165                                                                 \
6166         return r;                                               \
6167 }                                                               \
6168 EXPORT_SYMBOL(func);
6169
6170 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6171 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6172 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6173 define_netdev_printk_level(netdev_err, KERN_ERR);
6174 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6175 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6176 define_netdev_printk_level(netdev_info, KERN_INFO);
6177
6178 static void __net_exit netdev_exit(struct net *net)
6179 {
6180         kfree(net->dev_name_head);
6181         kfree(net->dev_index_head);
6182 }
6183
6184 static struct pernet_operations __net_initdata netdev_net_ops = {
6185         .init = netdev_init,
6186         .exit = netdev_exit,
6187 };
6188
6189 static void __net_exit default_device_exit(struct net *net)
6190 {
6191         struct net_device *dev, *aux;
6192         /*
6193          * Push all migratable network devices back to the
6194          * initial network namespace
6195          */
6196         rtnl_lock();
6197         for_each_netdev_safe(net, dev, aux) {
6198                 int err;
6199                 char fb_name[IFNAMSIZ];
6200
6201                 /* Ignore unmoveable devices (i.e. loopback) */
6202                 if (dev->features & NETIF_F_NETNS_LOCAL)
6203                         continue;
6204
6205                 /* Leave virtual devices for the generic cleanup */
6206                 if (dev->rtnl_link_ops)
6207                         continue;
6208
6209                 /* Push remaing network devices to init_net */
6210                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6211                 err = dev_change_net_namespace(dev, &init_net, fb_name);
6212                 if (err) {
6213                         printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6214                                 __func__, dev->name, err);
6215                         BUG();
6216                 }
6217         }
6218         rtnl_unlock();
6219 }
6220
6221 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6222 {
6223         /* At exit all network devices most be removed from a network
6224          * namespace.  Do this in the reverse order of registeration.
6225          * Do this across as many network namespaces as possible to
6226          * improve batching efficiency.
6227          */
6228         struct net_device *dev;
6229         struct net *net;
6230         LIST_HEAD(dev_kill_list);
6231
6232         rtnl_lock();
6233         list_for_each_entry(net, net_list, exit_list) {
6234                 for_each_netdev_reverse(net, dev) {
6235                         if (dev->rtnl_link_ops)
6236                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6237                         else
6238                                 unregister_netdevice_queue(dev, &dev_kill_list);
6239                 }
6240         }
6241         unregister_netdevice_many(&dev_kill_list);
6242         rtnl_unlock();
6243 }
6244
6245 static struct pernet_operations __net_initdata default_device_ops = {
6246         .exit = default_device_exit,
6247         .exit_batch = default_device_exit_batch,
6248 };
6249
6250 /*
6251  *      Initialize the DEV module. At boot time this walks the device list and
6252  *      unhooks any devices that fail to initialise (normally hardware not
6253  *      present) and leaves us with a valid list of present and active devices.
6254  *
6255  */
6256
6257 /*
6258  *       This is called single threaded during boot, so no need
6259  *       to take the rtnl semaphore.
6260  */
6261 static int __init net_dev_init(void)
6262 {
6263         int i, rc = -ENOMEM;
6264
6265         BUG_ON(!dev_boot_phase);
6266
6267         if (dev_proc_init())
6268                 goto out;
6269
6270         if (netdev_kobject_init())
6271                 goto out;
6272
6273         INIT_LIST_HEAD(&ptype_all);
6274         for (i = 0; i < PTYPE_HASH_SIZE; i++)
6275                 INIT_LIST_HEAD(&ptype_base[i]);
6276
6277         if (register_pernet_subsys(&netdev_net_ops))
6278                 goto out;
6279
6280         /*
6281          *      Initialise the packet receive queues.
6282          */
6283
6284         for_each_possible_cpu(i) {
6285                 struct softnet_data *sd = &per_cpu(softnet_data, i);
6286
6287                 memset(sd, 0, sizeof(*sd));
6288                 skb_queue_head_init(&sd->input_pkt_queue);
6289                 skb_queue_head_init(&sd->process_queue);
6290                 sd->completion_queue = NULL;
6291                 INIT_LIST_HEAD(&sd->poll_list);
6292                 sd->output_queue = NULL;
6293                 sd->output_queue_tailp = &sd->output_queue;
6294 #ifdef CONFIG_RPS
6295                 sd->csd.func = rps_trigger_softirq;
6296                 sd->csd.info = sd;
6297                 sd->csd.flags = 0;
6298                 sd->cpu = i;
6299 #endif
6300
6301                 sd->backlog.poll = process_backlog;
6302                 sd->backlog.weight = weight_p;
6303                 sd->backlog.gro_list = NULL;
6304                 sd->backlog.gro_count = 0;
6305         }
6306
6307         dev_boot_phase = 0;
6308
6309         /* The loopback device is special if any other network devices
6310          * is present in a network namespace the loopback device must
6311          * be present. Since we now dynamically allocate and free the
6312          * loopback device ensure this invariant is maintained by
6313          * keeping the loopback device as the first device on the
6314          * list of network devices.  Ensuring the loopback devices
6315          * is the first device that appears and the last network device
6316          * that disappears.
6317          */
6318         if (register_pernet_device(&loopback_net_ops))
6319                 goto out;
6320
6321         if (register_pernet_device(&default_device_ops))
6322                 goto out;
6323
6324         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6325         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6326
6327         hotcpu_notifier(dev_cpu_callback, 0);
6328         dst_init();
6329         dev_mcast_init();
6330         rc = 0;
6331 out:
6332         return rc;
6333 }
6334
6335 subsys_initcall(net_dev_init);
6336
6337 static int __init initialize_hashrnd(void)
6338 {
6339         get_random_bytes(&hashrnd, sizeof(hashrnd));
6340         return 0;
6341 }
6342
6343 late_initcall_sync(initialize_hashrnd);
6344