1 /*
2 * NET3 Protocol independent device support routines.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Derived from the non IP parts of dev.c 1.0.19
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
14 * Additional Authors:
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
22 * Changes:
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
34 * drivers
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
44 * call a packet.
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
50 * changes.
51 * Rudi Cilibrasi : Pass the right thing to
52 * set_mac_address()
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
58 * 1 device.
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
66 * the backlog queue.
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <net/net_namespace.h>
98 #include <net/sock.h>
99 #include <linux/rtnetlink.h>
100 #include <linux/proc_fs.h>
101 #include <linux/seq_file.h>
102 #include <linux/stat.h>
103 #include <net/dst.h>
104 #include <net/pkt_sched.h>
105 #include <net/checksum.h>
106 #include <net/xfrm.h>
107 #include <linux/highmem.h>
108 #include <linux/init.h>
109 #include <linux/kmod.h>
110 #include <linux/module.h>
111 #include <linux/netpoll.h>
112 #include <linux/rcupdate.h>
113 #include <linux/delay.h>
114 #include <net/wext.h>
115 #include <net/iw_handler.h>
116 #include <asm/current.h>
117 #include <linux/audit.h>
118 #include <linux/dmaengine.h>
119 #include <linux/err.h>
120 #include <linux/ctype.h>
121 #include <linux/if_arp.h>
122 #include <linux/if_vlan.h>
123 #include <linux/ip.h>
124 #include <net/ip.h>
125 #include <linux/ipv6.h>
126 #include <linux/in.h>
127 #include <linux/jhash.h>
128 #include <linux/random.h>
129 #include <trace/events/napi.h>
130 #include <trace/events/net.h>
131 #include <trace/events/skb.h>
132 #include <linux/pci.h>
133 #include <linux/inetdevice.h>
134 #include <linux/cpu_rmap.h>
135 #include <linux/net_tstamp.h>
136 #include <linux/static_key.h>
138 #include "net-sysfs.h"
140 /* Instead of increasing this, you should create a hash table. */
141 #define MAX_GRO_SKBS 8
143 /* This should be increased if a protocol with a bigger head is added. */
144 #define GRO_MAX_HEAD (MAX_HEADER + 128)
147 * The list of packet types we will receive (as opposed to discard)
148 * and the routines to invoke.
150 * Why 16. Because with 16 the only overlap we get on a hash of the
151 * low nibble of the protocol value is RARP/SNAP/X.25.
153 * NOTE: That is no longer true with the addition of VLAN tags. Not
154 * sure which should go first, but I bet it won't make much
155 * difference if we are running VLANs. The good news is that
156 * this protocol won't be in the list unless compiled in, so
157 * the average user (w/out VLANs) will not be adversely affected.
158 * --BLG
160 * 0800 IP
161 * 8100 802.1Q VLAN
162 * 0001 802.3
163 * 0002 AX.25
164 * 0004 802.2
165 * 8035 RARP
166 * 0005 SNAP
167 * 0805 X.25
168 * 0806 ARP
169 * 8137 IPX
170 * 0009 Localtalk
171 * 86DD IPv6
174 #define PTYPE_HASH_SIZE (16)
175 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
177 static DEFINE_SPINLOCK(ptype_lock);
178 static DEFINE_SPINLOCK(offload_lock);
179 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
180 static struct list_head ptype_all __read_mostly; /* Taps */
181 static struct list_head offload_base __read_mostly;
184 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
185 * semaphore.
187 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
189 * Writers must hold the rtnl semaphore while they loop through the
190 * dev_base_head list, and hold dev_base_lock for writing when they do the
191 * actual updates. This allows pure readers to access the list even
192 * while a writer is preparing to update it.
194 * To put it another way, dev_base_lock is held for writing only to
195 * protect against pure readers; the rtnl semaphore provides the
196 * protection against other writers.
198 * See, for example usages, register_netdevice() and
199 * unregister_netdevice(), which must be called with the rtnl
200 * semaphore held.
202 DEFINE_RWLOCK(dev_base_lock);
203 EXPORT_SYMBOL(dev_base_lock);
205 seqcount_t devnet_rename_seq;
207 static inline void dev_base_seq_inc(struct net *net)
209 while (++net->dev_base_seq == 0);
212 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
214 unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
216 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
219 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
221 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
224 static inline void rps_lock(struct softnet_data *sd)
226 #ifdef CONFIG_RPS
227 spin_lock(&sd->input_pkt_queue.lock);
228 #endif
231 static inline void rps_unlock(struct softnet_data *sd)
233 #ifdef CONFIG_RPS
234 spin_unlock(&sd->input_pkt_queue.lock);
235 #endif
238 /* Device list insertion */
239 static int list_netdevice(struct net_device *dev)
241 struct net *net = dev_net(dev);
243 ASSERT_RTNL();
245 write_lock_bh(&dev_base_lock);
246 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
247 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
248 hlist_add_head_rcu(&dev->index_hlist,
249 dev_index_hash(net, dev->ifindex));
250 write_unlock_bh(&dev_base_lock);
252 dev_base_seq_inc(net);
254 return 0;
257 /* Device list removal
258 * caller must respect a RCU grace period before freeing/reusing dev
260 static void unlist_netdevice(struct net_device *dev)
262 ASSERT_RTNL();
264 /* Unlink dev from the device chain */
265 write_lock_bh(&dev_base_lock);
266 list_del_rcu(&dev->dev_list);
267 hlist_del_rcu(&dev->name_hlist);
268 hlist_del_rcu(&dev->index_hlist);
269 write_unlock_bh(&dev_base_lock);
271 dev_base_seq_inc(dev_net(dev));
275 * Our notifier list
278 static RAW_NOTIFIER_HEAD(netdev_chain);
281 * Device drivers call our routines to queue packets here. We empty the
282 * queue in the local softnet handler.
285 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
286 EXPORT_PER_CPU_SYMBOL(softnet_data);
288 #ifdef CONFIG_LOCKDEP
290 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
291 * according to dev->type
293 static const unsigned short netdev_lock_type[] =
294 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
295 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
296 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
297 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
298 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
299 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
300 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
301 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
302 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
303 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
304 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
305 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
306 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
307 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
308 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
310 static const char *const netdev_lock_name[] =
311 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
312 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
313 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
314 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
315 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
316 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
317 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
318 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
319 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
320 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
321 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
322 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
323 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
324 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
325 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
327 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
328 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
330 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
332 int i;
334 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
335 if (netdev_lock_type[i] == dev_type)
336 return i;
337 /* the last key is used by default */
338 return ARRAY_SIZE(netdev_lock_type) - 1;
341 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
342 unsigned short dev_type)
344 int i;
346 i = netdev_lock_pos(dev_type);
347 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
348 netdev_lock_name[i]);
351 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
353 int i;
355 i = netdev_lock_pos(dev->type);
356 lockdep_set_class_and_name(&dev->addr_list_lock,
357 &netdev_addr_lock_key[i],
358 netdev_lock_name[i]);
360 #else
361 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
362 unsigned short dev_type)
365 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
368 #endif
370 /*******************************************************************************
372 Protocol management and registration routines
374 *******************************************************************************/
377 * Add a protocol ID to the list. Now that the input handler is
378 * smarter we can dispense with all the messy stuff that used to be
379 * here.
381 * BEWARE!!! Protocol handlers, mangling input packets,
382 * MUST BE last in hash buckets and checking protocol handlers
383 * MUST start from promiscuous ptype_all chain in net_bh.
384 * It is true now, do not change it.
385 * Explanation follows: if protocol handler, mangling packet, will
386 * be the first on list, it is not able to sense, that packet
387 * is cloned and should be copied-on-write, so that it will
388 * change it and subsequent readers will get broken packet.
389 * --ANK (980803)
392 static inline struct list_head *ptype_head(const struct packet_type *pt)
394 if (pt->type == htons(ETH_P_ALL))
395 return &ptype_all;
396 else
397 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
401 * dev_add_pack - add packet handler
402 * @pt: packet type declaration
404 * Add a protocol handler to the networking stack. The passed &packet_type
405 * is linked into kernel lists and may not be freed until it has been
406 * removed from the kernel lists.
408 * This call does not sleep, therefore it cannot
409 * guarantee that all CPUs that are in the middle of receiving packets
410 * will see the new packet type (until the next received packet).
413 void dev_add_pack(struct packet_type *pt)
415 struct list_head *head = ptype_head(pt);
417 spin_lock(&ptype_lock);
418 list_add_rcu(&pt->list, head);
419 spin_unlock(&ptype_lock);
421 EXPORT_SYMBOL(dev_add_pack);
424 * __dev_remove_pack - remove packet handler
425 * @pt: packet type declaration
427 * Remove a protocol handler that was previously added to the kernel
428 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
429 * from the kernel lists and can be freed or reused once this function
430 * returns.
432 * The packet type might still be in use by receivers
433 * and must not be freed until after all the CPUs have gone
434 * through a quiescent state.
436 void __dev_remove_pack(struct packet_type *pt)
438 struct list_head *head = ptype_head(pt);
439 struct packet_type *pt1;
441 spin_lock(&ptype_lock);
443 list_for_each_entry(pt1, head, list) {
444 if (pt == pt1) {
445 list_del_rcu(&pt->list);
446 goto out;
450 pr_warn("dev_remove_pack: %p not found\n", pt);
451 out:
452 spin_unlock(&ptype_lock);
454 EXPORT_SYMBOL(__dev_remove_pack);
457 * dev_remove_pack - remove packet handler
458 * @pt: packet type declaration
460 * Remove a protocol handler that was previously added to the kernel
461 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
462 * from the kernel lists and can be freed or reused once this function
463 * returns.
465 * This call sleeps to guarantee that no CPU is looking at the packet
466 * type after return.
468 void dev_remove_pack(struct packet_type *pt)
470 __dev_remove_pack(pt);
472 synchronize_net();
474 EXPORT_SYMBOL(dev_remove_pack);
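/*
 * Illustrative sketch (not part of dev.c): how a module might register a
 * protocol tap with dev_add_pack() and tear it down with dev_remove_pack().
 * The example_tap* names are hypothetical.
 */
static int example_tap_rcv(struct sk_buff *skb, struct net_device *dev,
                           struct packet_type *pt, struct net_device *orig_dev)
{
        /* ETH_P_ALL taps receive clones; the handler consumes the skb. */
        pr_debug("tap: %s len %u proto 0x%04x\n",
                 dev->name, skb->len, ntohs(skb->protocol));
        kfree_skb(skb);
        return NET_RX_SUCCESS;
}

static struct packet_type example_tap __read_mostly = {
        .type = cpu_to_be16(ETH_P_ALL),         /* lands on ptype_all above */
        .func = example_tap_rcv,
};

/* Register with dev_add_pack(&example_tap) at module init; unregister with
 * dev_remove_pack(&example_tap) from process context, since it sleeps in
 * synchronize_net().
 */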
478 * dev_add_offload - register offload handlers
479 * @po: protocol offload declaration
481 * Add protocol offload handlers to the networking stack. The passed
482 * &proto_offload is linked into kernel lists and may not be freed until
483 * it has been removed from the kernel lists.
485 * This call does not sleep, therefore it cannot
486 * guarantee that all CPUs that are in the middle of receiving packets
487 * will see the new offload handlers (until the next received packet).
489 void dev_add_offload(struct packet_offload *po)
491 struct list_head *head = &offload_base;
493 spin_lock(&offload_lock);
494 list_add_rcu(&po->list, head);
495 spin_unlock(&offload_lock);
497 EXPORT_SYMBOL(dev_add_offload);
500 * __dev_remove_offload - remove offload handler
501 * @po: packet offload declaration
503 * Remove a protocol offload handler that was previously added to the
504 * kernel offload handlers by dev_add_offload(). The passed &offload_type
505 * is removed from the kernel lists and can be freed or reused once this
506 * function returns.
508 * The packet type might still be in use by receivers
509 * and must not be freed until after all the CPUs have gone
510 * through a quiescent state.
512 void __dev_remove_offload(struct packet_offload *po)
514 struct list_head *head = &offload_base;
515 struct packet_offload *po1;
517 spin_lock(&offload_lock);
519 list_for_each_entry(po1, head, list) {
520 if (po == po1) {
521 list_del_rcu(&po->list);
522 goto out;
526 pr_warn("dev_remove_offload: %p not found\n", po);
527 out:
528 spin_unlock(&offload_lock);
530 EXPORT_SYMBOL(__dev_remove_offload);
533 * dev_remove_offload - remove packet offload handler
534 * @po: packet offload declaration
536 * Remove a packet offload handler that was previously added to the kernel
537 * offload handlers by dev_add_offload(). The passed &offload_type is
538 * removed from the kernel lists and can be freed or reused once this
539 * function returns.
541 * This call sleeps to guarantee that no CPU is looking at the packet
542 * type after return.
544 void dev_remove_offload(struct packet_offload *po)
546 __dev_remove_offload(po);
548 synchronize_net();
550 EXPORT_SYMBOL(dev_remove_offload);
552 /******************************************************************************
554 Device Boot-time Settings Routines
556 *******************************************************************************/
558 /* Boot time configuration table */
559 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
562 * netdev_boot_setup_add - add new setup entry
563 * @name: name of the device
564 * @map: configured settings for the device
566 * Adds new setup entry to the dev_boot_setup list. The function
567 * returns 0 on error and 1 on success. This is a generic routine for
568 * all netdevices.
570 static int netdev_boot_setup_add(char *name, struct ifmap *map)
572 struct netdev_boot_setup *s;
573 int i;
575 s = dev_boot_setup;
576 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
577 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
578 memset(s[i].name, 0, sizeof(s[i].name));
579 strlcpy(s[i].name, name, IFNAMSIZ);
580 memcpy(&s[i].map, map, sizeof(s[i].map));
581 break;
585 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
589 * netdev_boot_setup_check - check boot time settings
590 * @dev: the netdevice
592 * Check boot time settings for the device.
593 * The found settings are set for the device to be used
594 * later in the device probing.
595 * Returns 0 if no settings found, 1 if they are.
597 int netdev_boot_setup_check(struct net_device *dev)
599 struct netdev_boot_setup *s = dev_boot_setup;
600 int i;
602 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
603 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
604 !strcmp(dev->name, s[i].name)) {
605 dev->irq = s[i].map.irq;
606 dev->base_addr = s[i].map.base_addr;
607 dev->mem_start = s[i].map.mem_start;
608 dev->mem_end = s[i].map.mem_end;
609 return 1;
612 return 0;
614 EXPORT_SYMBOL(netdev_boot_setup_check);
618 * netdev_boot_base - get address from boot time settings
619 * @prefix: prefix for network device
620 * @unit: id for network device
622 * Check boot time settings for the base address of device.
623 * The found settings are set for the device to be used
624 * later in the device probing.
625 * Returns 0 if no settings found.
627 unsigned long netdev_boot_base(const char *prefix, int unit)
629 const struct netdev_boot_setup *s = dev_boot_setup;
630 char name[IFNAMSIZ];
631 int i;
633 sprintf(name, "%s%d", prefix, unit);
636 * If device already registered then return base of 1
637 * to indicate not to probe for this interface
639 if (__dev_get_by_name(&init_net, name))
640 return 1;
642 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
643 if (!strcmp(name, s[i].name))
644 return s[i].map.base_addr;
645 return 0;
649 * Saves at boot time configured settings for any netdevice.
651 int __init netdev_boot_setup(char *str)
653 int ints[5];
654 struct ifmap map;
656 str = get_options(str, ARRAY_SIZE(ints), ints);
657 if (!str || !*str)
658 return 0;
660 /* Save settings */
661 memset(&map, 0, sizeof(map));
662 if (ints[0] > 0)
663 map.irq = ints[1];
664 if (ints[0] > 1)
665 map.base_addr = ints[2];
666 if (ints[0] > 2)
667 map.mem_start = ints[3];
668 if (ints[0] > 3)
669 map.mem_end = ints[4];
671 /* Add new entry to the list */
672 return netdev_boot_setup_add(str, &map);
675 __setup("netdev=", netdev_boot_setup);
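/*
 * Illustrative example derived from the parser above (an assumption, not
 * authoritative documentation): a boot line such as
 *
 *      netdev=9,0x300,0,0,eth0
 *
 * records irq=9 and base_addr=0x300 for "eth0" via netdev_boot_setup_add();
 * netdev_boot_setup_check() later copies those values into the matching
 * device during probing.
 */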
677 /*******************************************************************************
679 Device Interface Subroutines
681 *******************************************************************************/
684 * __dev_get_by_name - find a device by its name
685 * @net: the applicable net namespace
686 * @name: name to find
688 * Find an interface by name. Must be called under RTNL semaphore
689 * or @dev_base_lock. If the name is found a pointer to the device
690 * is returned. If the name is not found then %NULL is returned. The
691 * reference counters are not incremented so the caller must be
692 * careful with locks.
695 struct net_device *__dev_get_by_name(struct net *net, const char *name)
697 struct hlist_node *p;
698 struct net_device *dev;
699 struct hlist_head *head = dev_name_hash(net, name);
701 hlist_for_each_entry(dev, p, head, name_hlist)
702 if (!strncmp(dev->name, name, IFNAMSIZ))
703 return dev;
705 return NULL;
707 EXPORT_SYMBOL(__dev_get_by_name);
710 * dev_get_by_name_rcu - find a device by its name
711 * @net: the applicable net namespace
712 * @name: name to find
714 * Find an interface by name.
715 * If the name is found a pointer to the device is returned.
716 * If the name is not found then %NULL is returned.
717 * The reference counters are not incremented so the caller must be
718 * careful with locks. The caller must hold RCU lock.
721 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
723 struct hlist_node *p;
724 struct net_device *dev;
725 struct hlist_head *head = dev_name_hash(net, name);
727 hlist_for_each_entry_rcu(dev, p, head, name_hlist)
728 if (!strncmp(dev->name, name, IFNAMSIZ))
729 return dev;
731 return NULL;
733 EXPORT_SYMBOL(dev_get_by_name_rcu);
736 * dev_get_by_name - find a device by its name
737 * @net: the applicable net namespace
738 * @name: name to find
740 * Find an interface by name. This can be called from any
741 * context and does its own locking. The returned handle has
742 * the usage count incremented and the caller must use dev_put() to
743 * release it when it is no longer needed. %NULL is returned if no
744 * matching device is found.
747 struct net_device *dev_get_by_name(struct net *net, const char *name)
749 struct net_device *dev;
751 rcu_read_lock();
752 dev = dev_get_by_name_rcu(net, name);
753 if (dev)
754 dev_hold(dev);
755 rcu_read_unlock();
756 return dev;
758 EXPORT_SYMBOL(dev_get_by_name);
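/*
 * Illustrative sketch (not part of dev.c) contrasting the two lookup styles
 * above; the "eth0" name is hypothetical.
 */
static void example_lookup_by_name(struct net *net)
{
        struct net_device *dev;

        /* Refcounted lookup: safe from any context, pair with dev_put(). */
        dev = dev_get_by_name(net, "eth0");
        if (dev) {
                pr_debug("%s: ifindex %d\n", dev->name, dev->ifindex);
                dev_put(dev);
        }

        /* Lockless lookup: no reference taken, pointer only valid under RCU. */
        rcu_read_lock();
        dev = dev_get_by_name_rcu(net, "eth0");
        if (dev)
                pr_debug("%s is %s\n", dev->name,
                         netif_running(dev) ? "running" : "down");
        rcu_read_unlock();
}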
761 * __dev_get_by_index - find a device by its ifindex
762 * @net: the applicable net namespace
763 * @ifindex: index of device
765 * Search for an interface by index. Returns %NULL if the device
766 * is not found or a pointer to the device. The device has not
767 * had its reference counter increased so the caller must be careful
768 * about locking. The caller must hold either the RTNL semaphore
769 * or @dev_base_lock.
772 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
774 struct hlist_node *p;
775 struct net_device *dev;
776 struct hlist_head *head = dev_index_hash(net, ifindex);
778 hlist_for_each_entry(dev, p, head, index_hlist)
779 if (dev->ifindex == ifindex)
780 return dev;
782 return NULL;
784 EXPORT_SYMBOL(__dev_get_by_index);
787 * dev_get_by_index_rcu - find a device by its ifindex
788 * @net: the applicable net namespace
789 * @ifindex: index of device
791 * Search for an interface by index. Returns %NULL if the device
792 * is not found or a pointer to the device. The device has not
793 * had its reference counter increased so the caller must be careful
794 * about locking. The caller must hold RCU lock.
797 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
799 struct hlist_node *p;
800 struct net_device *dev;
801 struct hlist_head *head = dev_index_hash(net, ifindex);
803 hlist_for_each_entry_rcu(dev, p, head, index_hlist)
804 if (dev->ifindex == ifindex)
805 return dev;
807 return NULL;
809 EXPORT_SYMBOL(dev_get_by_index_rcu);
813 * dev_get_by_index - find a device by its ifindex
814 * @net: the applicable net namespace
815 * @ifindex: index of device
817 * Search for an interface by index. Returns NULL if the device
818 * is not found or a pointer to the device. The device returned has
819 * had a reference added and the pointer is safe until the user calls
820 * dev_put to indicate they have finished with it.
823 struct net_device *dev_get_by_index(struct net *net, int ifindex)
825 struct net_device *dev;
827 rcu_read_lock();
828 dev = dev_get_by_index_rcu(net, ifindex);
829 if (dev)
830 dev_hold(dev);
831 rcu_read_unlock();
832 return dev;
834 EXPORT_SYMBOL(dev_get_by_index);
837 * dev_getbyhwaddr_rcu - find a device by its hardware address
838 * @net: the applicable net namespace
839 * @type: media type of device
840 * @ha: hardware address
842 * Search for an interface by MAC address. Returns NULL if the device
843 * is not found or a pointer to the device.
844 * The caller must hold RCU or RTNL.
845 * The returned device has not had its ref count increased
846 * and the caller must therefore be careful about locking
850 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
851 const char *ha)
853 struct net_device *dev;
855 for_each_netdev_rcu(net, dev)
856 if (dev->type == type &&
857 !memcmp(dev->dev_addr, ha, dev->addr_len))
858 return dev;
860 return NULL;
862 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
864 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
866 struct net_device *dev;
868 ASSERT_RTNL();
869 for_each_netdev(net, dev)
870 if (dev->type == type)
871 return dev;
873 return NULL;
875 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
877 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
879 struct net_device *dev, *ret = NULL;
881 rcu_read_lock();
882 for_each_netdev_rcu(net, dev)
883 if (dev->type == type) {
884 dev_hold(dev);
885 ret = dev;
886 break;
888 rcu_read_unlock();
889 return ret;
891 EXPORT_SYMBOL(dev_getfirstbyhwtype);
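/*
 * Illustrative sketch (not part of dev.c): hardware-address lookup under RCU.
 * The address below is hypothetical.
 */
static void example_lookup_by_hwaddr(struct net *net)
{
        static const char ha[ETH_ALEN] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 };
        struct net_device *dev;

        rcu_read_lock();
        dev = dev_getbyhwaddr_rcu(net, ARPHRD_ETHER, ha);
        if (dev)
                pr_debug("MAC belongs to %s\n", dev->name);
        rcu_read_unlock();      /* dev must not be used past this point */
}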
894 * dev_get_by_flags_rcu - find any device with given flags
895 * @net: the applicable net namespace
896 * @if_flags: IFF_* values
897 * @mask: bitmask of bits in if_flags to check
899 * Search for any interface with the given flags. Returns NULL if a device
900 * is not found or a pointer to the device. Must be called inside
901 * rcu_read_lock(), and result refcount is unchanged.
904 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
905 unsigned short mask)
907 struct net_device *dev, *ret;
909 ret = NULL;
910 for_each_netdev_rcu(net, dev) {
911 if (((dev->flags ^ if_flags) & mask) == 0) {
912 ret = dev;
913 break;
916 return ret;
918 EXPORT_SYMBOL(dev_get_by_flags_rcu);
921 * dev_valid_name - check if name is okay for network device
922 * @name: name string
924 * Network device names need to be valid file names to
925 * allow sysfs to work. We also disallow any kind of
926 * whitespace.
928 bool dev_valid_name(const char *name)
930 if (*name == '\0')
931 return false;
932 if (strlen(name) >= IFNAMSIZ)
933 return false;
934 if (!strcmp(name, ".") || !strcmp(name, ".."))
935 return false;
937 while (*name) {
938 if (*name == '/' || isspace(*name))
939 return false;
940 name++;
942 return true;
944 EXPORT_SYMBOL(dev_valid_name);
947 * __dev_alloc_name - allocate a name for a device
948 * @net: network namespace to allocate the device name in
949 * @name: name format string
950 * @buf: scratch buffer and result name string
952 * Passed a format string - eg "lt%d" it will try and find a suitable
953 * id. It scans list of devices to build up a free map, then chooses
954 * the first empty slot. The caller must hold the dev_base or rtnl lock
955 * while allocating the name and adding the device in order to avoid
956 * duplicates.
957 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
958 * Returns the number of the unit assigned or a negative errno code.
961 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
963 int i = 0;
964 const char *p;
965 const int max_netdevices = 8*PAGE_SIZE;
966 unsigned long *inuse;
967 struct net_device *d;
969 p = strnchr(name, IFNAMSIZ-1, '%');
970 if (p) {
972 * Verify the string as this thing may have come from
973 * the user. There must be either one "%d" and no other "%"
974 * characters.
976 if (p[1] != 'd' || strchr(p + 2, '%'))
977 return -EINVAL;
979 /* Use one page as a bit array of possible slots */
980 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
981 if (!inuse)
982 return -ENOMEM;
984 for_each_netdev(net, d) {
985 if (!sscanf(d->name, name, &i))
986 continue;
987 if (i < 0 || i >= max_netdevices)
988 continue;
990 /* avoid cases where sscanf is not exact inverse of printf */
991 snprintf(buf, IFNAMSIZ, name, i);
992 if (!strncmp(buf, d->name, IFNAMSIZ))
993 set_bit(i, inuse);
996 i = find_first_zero_bit(inuse, max_netdevices);
997 free_page((unsigned long) inuse);
1000 if (buf != name)
1001 snprintf(buf, IFNAMSIZ, name, i);
1002 if (!__dev_get_by_name(net, buf))
1003 return i;
1005 /* It is possible to run out of possible slots
1006 * when the name is long and there isn't enough space left
1007 * for the digits, or if all bits are used.
1009 return -ENFILE;
1013 * dev_alloc_name - allocate a name for a device
1014 * @dev: device
1015 * @name: name format string
1017 * Passed a format string - eg "lt%d" it will try and find a suitable
1018 * id. It scans list of devices to build up a free map, then chooses
1019 * the first empty slot. The caller must hold the dev_base or rtnl lock
1020 * while allocating the name and adding the device in order to avoid
1021 * duplicates.
1022 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1023 * Returns the number of the unit assigned or a negative errno code.
1026 int dev_alloc_name(struct net_device *dev, const char *name)
1028 char buf[IFNAMSIZ];
1029 struct net *net;
1030 int ret;
1032 BUG_ON(!dev_net(dev));
1033 net = dev_net(dev);
1034 ret = __dev_alloc_name(net, name, buf);
1035 if (ret >= 0)
1036 strlcpy(dev->name, buf, IFNAMSIZ);
1037 return ret;
1039 EXPORT_SYMBOL(dev_alloc_name);
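/*
 * Illustrative sketch (not part of dev.c): naming a freshly allocated device
 * before registration, with the RTNL held. The "dummy%d" format is hypothetical.
 */
static int example_name_device(struct net_device *dev)
{
        int unit;

        unit = dev_alloc_name(dev, "dummy%d");  /* picks the lowest free unit */
        if (unit < 0)
                return unit;                    /* -EINVAL, -ENFILE, ... */
        pr_debug("assigned %s (unit %d)\n", dev->name, unit);
        return 0;
}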
1041 static int dev_alloc_name_ns(struct net *net,
1042 struct net_device *dev,
1043 const char *name)
1045 char buf[IFNAMSIZ];
1046 int ret;
1048 ret = __dev_alloc_name(net, name, buf);
1049 if (ret >= 0)
1050 strlcpy(dev->name, buf, IFNAMSIZ);
1051 return ret;
1054 static int dev_get_valid_name(struct net *net,
1055 struct net_device *dev,
1056 const char *name)
1058 BUG_ON(!net);
1060 if (!dev_valid_name(name))
1061 return -EINVAL;
1063 if (strchr(name, '%'))
1064 return dev_alloc_name_ns(net, dev, name);
1065 else if (__dev_get_by_name(net, name))
1066 return -EEXIST;
1067 else if (dev->name != name)
1068 strlcpy(dev->name, name, IFNAMSIZ);
1070 return 0;
1074 * dev_change_name - change name of a device
1075 * @dev: device
1076 * @newname: name (or format string) must be at least IFNAMSIZ
1078 * Change name of a device, can pass format strings "eth%d".
1079 * for wildcarding.
1081 int dev_change_name(struct net_device *dev, const char *newname)
1083 char oldname[IFNAMSIZ];
1084 int err = 0;
1085 int ret;
1086 struct net *net;
1088 ASSERT_RTNL();
1089 BUG_ON(!dev_net(dev));
1091 net = dev_net(dev);
1092 if (dev->flags & IFF_UP)
1093 return -EBUSY;
1095 write_seqcount_begin(&devnet_rename_seq);
1097 if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1098 write_seqcount_end(&devnet_rename_seq);
1099 return 0;
1102 memcpy(oldname, dev->name, IFNAMSIZ);
1104 err = dev_get_valid_name(net, dev, newname);
1105 if (err < 0) {
1106 write_seqcount_end(&devnet_rename_seq);
1107 return err;
1110 rollback:
1111 ret = device_rename(&dev->dev, dev->name);
1112 if (ret) {
1113 memcpy(dev->name, oldname, IFNAMSIZ);
1114 write_seqcount_end(&devnet_rename_seq);
1115 return ret;
1118 write_seqcount_end(&devnet_rename_seq);
1120 write_lock_bh(&dev_base_lock);
1121 hlist_del_rcu(&dev->name_hlist);
1122 write_unlock_bh(&dev_base_lock);
1124 synchronize_rcu();
1126 write_lock_bh(&dev_base_lock);
1127 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1128 write_unlock_bh(&dev_base_lock);
1130 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1131 ret = notifier_to_errno(ret);
1133 if (ret) {
1134 /* err >= 0 after dev_alloc_name() or stores the first errno */
1135 if (err >= 0) {
1136 err = ret;
1137 write_seqcount_begin(&devnet_rename_seq);
1138 memcpy(dev->name, oldname, IFNAMSIZ);
1139 goto rollback;
1140 } else {
1141 pr_err("%s: name change rollback failed: %d\n",
1142 dev->name, ret);
1146 return err;
1150 * dev_set_alias - change ifalias of a device
1151 * @dev: device
1152 * @alias: name up to IFALIASZ
1153 * @len: limit of bytes to copy from info
1155 * Set ifalias for a device,
1157 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1159 char *new_ifalias;
1161 ASSERT_RTNL();
1163 if (len >= IFALIASZ)
1164 return -EINVAL;
1166 if (!len) {
1167 kfree(dev->ifalias);
1168 dev->ifalias = NULL;
1169 return 0;
1172 new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1173 if (!new_ifalias)
1174 return -ENOMEM;
1175 dev->ifalias = new_ifalias;
1177 strlcpy(dev->ifalias, alias, len+1);
1178 return len;
1183 * netdev_features_change - device changes features
1184 * @dev: device to cause notification
1186 * Called to indicate a device has changed features.
1188 void netdev_features_change(struct net_device *dev)
1190 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1192 EXPORT_SYMBOL(netdev_features_change);
1195 * netdev_state_change - device changes state
1196 * @dev: device to cause notification
1198 * Called to indicate a device has changed state. This function calls
1199 * the notifier chains for netdev_chain and sends a NEWLINK message
1200 * to the routing socket.
1202 void netdev_state_change(struct net_device *dev)
1204 if (dev->flags & IFF_UP) {
1205 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1206 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1209 EXPORT_SYMBOL(netdev_state_change);
1212 * netdev_notify_peers - notify network peers about existence of @dev
1213 * @dev: network device
1215 * Generate traffic such that interested network peers are aware of
1216 * @dev, such as by generating a gratuitous ARP. This may be used when
1217 * a device wants to inform the rest of the network about some sort of
1218 * reconfiguration such as a failover event or virtual machine
1219 * migration.
1221 void netdev_notify_peers(struct net_device *dev)
1223 rtnl_lock();
1224 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1225 rtnl_unlock();
1227 EXPORT_SYMBOL(netdev_notify_peers);
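/*
 * Illustrative sketch (not part of dev.c): a failover- or migration-aware
 * driver announcing itself to the segment. netdev_notify_peers() takes the
 * RTNL itself, so it must be called from process context.
 */
static void example_after_failover(struct net_device *dev)
{
        if (netif_running(dev))
                netdev_notify_peers(dev);       /* e.g. triggers gratuitous ARP */
}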
1230 * dev_load - load a network module
1231 * @net: the applicable net namespace
1232 * @name: name of interface
1234 * If a network interface is not present and the process has suitable
1235 * privileges this function loads the module. If module loading is not
1236 * available in this kernel then it becomes a nop.
1239 void dev_load(struct net *net, const char *name)
1241 struct net_device *dev;
1242 int no_module;
1244 rcu_read_lock();
1245 dev = dev_get_by_name_rcu(net, name);
1246 rcu_read_unlock();
1248 no_module = !dev;
1249 if (no_module && capable(CAP_NET_ADMIN))
1250 no_module = request_module("netdev-%s", name);
1251 if (no_module && capable(CAP_SYS_MODULE)) {
1252 if (!request_module("%s", name))
1253 pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated). Use CAP_NET_ADMIN and alias netdev-%s instead.\n",
1254 name);
1257 EXPORT_SYMBOL(dev_load);
1259 static int __dev_open(struct net_device *dev)
1261 const struct net_device_ops *ops = dev->netdev_ops;
1262 int ret;
1264 ASSERT_RTNL();
1266 if (!netif_device_present(dev))
1267 return -ENODEV;
1269 /* Block netpoll from trying to do any rx path servicing.
1270 * If we don't do this there is a chance ndo_poll_controller
1271 * or ndo_poll may be running while we open the device
1273 ret = netpoll_rx_disable(dev);
1274 if (ret)
1275 return ret;
1277 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1278 ret = notifier_to_errno(ret);
1279 if (ret)
1280 return ret;
1282 set_bit(__LINK_STATE_START, &dev->state);
1284 if (ops->ndo_validate_addr)
1285 ret = ops->ndo_validate_addr(dev);
1287 if (!ret && ops->ndo_open)
1288 ret = ops->ndo_open(dev);
1290 netpoll_rx_enable(dev);
1292 if (ret)
1293 clear_bit(__LINK_STATE_START, &dev->state);
1294 else {
1295 dev->flags |= IFF_UP;
1296 net_dmaengine_get();
1297 dev_set_rx_mode(dev);
1298 dev_activate(dev);
1299 add_device_randomness(dev->dev_addr, dev->addr_len);
1302 return ret;
1306 * dev_open - prepare an interface for use.
1307 * @dev: device to open
1309 * Takes a device from down to up state. The device's private open
1310 * function is invoked and then the multicast lists are loaded. Finally
1311 * the device is moved into the up state and a %NETDEV_UP message is
1312 * sent to the netdev notifier chain.
1314 * Calling this function on an active interface is a nop. On a failure
1315 * a negative errno code is returned.
1317 int dev_open(struct net_device *dev)
1319 int ret;
1321 if (dev->flags & IFF_UP)
1322 return 0;
1324 ret = __dev_open(dev);
1325 if (ret < 0)
1326 return ret;
1328 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1329 call_netdevice_notifiers(NETDEV_UP, dev);
1331 return ret;
1333 EXPORT_SYMBOL(dev_open);
1335 static int __dev_close_many(struct list_head *head)
1337 struct net_device *dev;
1339 ASSERT_RTNL();
1340 might_sleep();
1342 list_for_each_entry(dev, head, unreg_list) {
1343 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1345 clear_bit(__LINK_STATE_START, &dev->state);
1347 /* Synchronize to scheduled poll. We cannot touch poll list, it
1348 * can be even on different cpu. So just clear netif_running().
1350 * dev->stop() will invoke napi_disable() on all of its
1351 * napi_struct instances on this device.
1353 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1356 dev_deactivate_many(head);
1358 list_for_each_entry(dev, head, unreg_list) {
1359 const struct net_device_ops *ops = dev->netdev_ops;
1362 * Call the device specific close. This cannot fail.
1363 * Only if device is UP
1365 * We allow it to be called even after a DETACH hot-plug
1366 * event.
1368 if (ops->ndo_stop)
1369 ops->ndo_stop(dev);
1371 dev->flags &= ~IFF_UP;
1372 net_dmaengine_put();
1375 return 0;
1378 static int __dev_close(struct net_device *dev)
1380 int retval;
1381 LIST_HEAD(single);
1383 /* Temporarily disable netpoll until the interface is down */
1384 retval = netpoll_rx_disable(dev);
1385 if (retval)
1386 return retval;
1388 list_add(&dev->unreg_list, &single);
1389 retval = __dev_close_many(&single);
1390 list_del(&single);
1392 netpoll_rx_enable(dev);
1393 return retval;
1396 static int dev_close_many(struct list_head *head)
1398 struct net_device *dev, *tmp;
1399 LIST_HEAD(tmp_list);
1401 list_for_each_entry_safe(dev, tmp, head, unreg_list)
1402 if (!(dev->flags & IFF_UP))
1403 list_move(&dev->unreg_list, &tmp_list);
1405 __dev_close_many(head);
1407 list_for_each_entry(dev, head, unreg_list) {
1408 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1409 call_netdevice_notifiers(NETDEV_DOWN, dev);
1412 /* rollback_registered_many needs the complete original list */
1413 list_splice(&tmp_list, head);
1414 return 0;
1418 * dev_close - shutdown an interface.
1419 * @dev: device to shutdown
1421 * This function moves an active device into down state. A
1422 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1423 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1424 * chain.
1426 int dev_close(struct net_device *dev)
1428 int ret = 0;
1429 if (dev->flags & IFF_UP) {
1430 LIST_HEAD(single);
1432 /* Block netpoll rx while the interface is going down */
1433 ret = netpoll_rx_disable(dev);
1434 if (ret)
1435 return ret;
1437 list_add(&dev->unreg_list, &single);
1438 dev_close_many(&single);
1439 list_del(&single);
1441 netpoll_rx_enable(dev);
1443 return ret;
1445 EXPORT_SYMBOL(dev_close);
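/*
 * Illustrative sketch (not part of dev.c): cycling an interface from process
 * context. Both dev_open() and dev_close() require the RTNL lock.
 */
static int example_cycle_interface(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_open(dev);            /* nop if the device is already IFF_UP */
        if (!err)
                err = dev_close(dev);   /* emits NETDEV_GOING_DOWN, then NETDEV_DOWN */
        rtnl_unlock();
        return err;
}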
1449 * dev_disable_lro - disable Large Receive Offload on a device
1450 * @dev: device
1452 * Disable Large Receive Offload (LRO) on a net device. Must be
1453 * called under RTNL. This is needed if received packets may be
1454 * forwarded to another interface.
1456 void dev_disable_lro(struct net_device *dev)
1459 * If we're trying to disable lro on a vlan device
1460 * use the underlying physical device instead
1462 if (is_vlan_dev(dev))
1463 dev = vlan_dev_real_dev(dev);
1465 dev->wanted_features &= ~NETIF_F_LRO;
1466 netdev_update_features(dev);
1468 if (unlikely(dev->features & NETIF_F_LRO))
1469 netdev_WARN(dev, "failed to disable LRO!\n");
1471 EXPORT_SYMBOL(dev_disable_lro);
1474 static int dev_boot_phase = 1;
1477 * register_netdevice_notifier - register a network notifier block
1478 * @nb: notifier
1480 * Register a notifier to be called when network device events occur.
1481 * The notifier passed is linked into the kernel structures and must
1482 * not be reused until it has been unregistered. A negative errno code
1483 * is returned on a failure.
1485 * When registered, all registration and up events are replayed
1486 * to the new notifier to allow the caller to have a race-free
1487 * view of the network device list.
1490 int register_netdevice_notifier(struct notifier_block *nb)
1492 struct net_device *dev;
1493 struct net_device *last;
1494 struct net *net;
1495 int err;
1497 rtnl_lock();
1498 err = raw_notifier_chain_register(&netdev_chain, nb);
1499 if (err)
1500 goto unlock;
1501 if (dev_boot_phase)
1502 goto unlock;
1503 for_each_net(net) {
1504 for_each_netdev(net, dev) {
1505 err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1506 err = notifier_to_errno(err);
1507 if (err)
1508 goto rollback;
1510 if (!(dev->flags & IFF_UP))
1511 continue;
1513 nb->notifier_call(nb, NETDEV_UP, dev);
1517 unlock:
1518 rtnl_unlock();
1519 return err;
1521 rollback:
1522 last = dev;
1523 for_each_net(net) {
1524 for_each_netdev(net, dev) {
1525 if (dev == last)
1526 goto outroll;
1528 if (dev->flags & IFF_UP) {
1529 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1530 nb->notifier_call(nb, NETDEV_DOWN, dev);
1532 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1536 outroll:
1537 raw_notifier_chain_unregister(&netdev_chain, nb);
1538 goto unlock;
1540 EXPORT_SYMBOL(register_netdevice_notifier);
1543 * unregister_netdevice_notifier - unregister a network notifier block
1544 * @nb: notifier
1546 * Unregister a notifier previously registered by
1547 * register_netdevice_notifier(). The notifier is unlinked from the
1548 * kernel structures and may then be reused. A negative errno code
1549 * is returned on a failure.
1551 * After unregistering, unregister and down device events are synthesized
1552 * for all devices on the device list and delivered to the removed notifier,
1553 * removing the need for special-case cleanup code.
1556 int unregister_netdevice_notifier(struct notifier_block *nb)
1558 struct net_device *dev;
1559 struct net *net;
1560 int err;
1562 rtnl_lock();
1563 err = raw_notifier_chain_unregister(&netdev_chain, nb);
1564 if (err)
1565 goto unlock;
1567 for_each_net(net) {
1568 for_each_netdev(net, dev) {
1569 if (dev->flags & IFF_UP) {
1570 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1571 nb->notifier_call(nb, NETDEV_DOWN, dev);
1573 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1576 unlock:
1577 rtnl_unlock();
1578 return err;
1580 EXPORT_SYMBOL(unregister_netdevice_notifier);
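/*
 * Illustrative sketch (not part of dev.c): watching device state through the
 * notifier chain above. In this kernel the notifier data pointer is the
 * struct net_device itself; the example_* names are hypothetical.
 */
static int example_netdev_event(struct notifier_block *nb,
                                unsigned long event, void *ptr)
{
        struct net_device *dev = ptr;

        switch (event) {
        case NETDEV_UP:
                pr_debug("%s is up\n", dev->name);
                break;
        case NETDEV_GOING_DOWN:
                pr_debug("%s is going down\n", dev->name);
                break;
        }
        return NOTIFY_DONE;
}

static struct notifier_block example_netdev_nb = {
        .notifier_call = example_netdev_event,
};

/* register_netdevice_notifier(&example_netdev_nb) replays NETDEV_REGISTER and
 * NETDEV_UP for already-present devices; unregister_netdevice_notifier()
 * synthesizes the matching down/unregister events on removal.
 */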
1583 * call_netdevice_notifiers - call all network notifier blocks
1584 * @val: value passed unmodified to notifier function
1585 * @dev: net_device pointer passed unmodified to notifier function
1587 * Call all network notifier blocks. Parameters and return value
1588 * are as for raw_notifier_call_chain().
1591 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1593 ASSERT_RTNL();
1594 return raw_notifier_call_chain(&netdev_chain, val, dev);
1596 EXPORT_SYMBOL(call_netdevice_notifiers);
1598 static struct static_key netstamp_needed __read_mostly;
1599 #ifdef HAVE_JUMP_LABEL
1600 /* We are not allowed to call static_key_slow_dec() from irq context
1601 * If net_disable_timestamp() is called from irq context, defer the
1602 * static_key_slow_dec() calls.
1604 static atomic_t netstamp_needed_deferred;
1605 #endif
1607 void net_enable_timestamp(void)
1609 #ifdef HAVE_JUMP_LABEL
1610 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1612 if (deferred) {
1613 while (--deferred)
1614 static_key_slow_dec(&netstamp_needed);
1615 return;
1617 #endif
1618 WARN_ON(in_interrupt());
1619 static_key_slow_inc(&netstamp_needed);
1621 EXPORT_SYMBOL(net_enable_timestamp);
1623 void net_disable_timestamp(void)
1625 #ifdef HAVE_JUMP_LABEL
1626 if (in_interrupt()) {
1627 atomic_inc(&netstamp_needed_deferred);
1628 return;
1630 #endif
1631 static_key_slow_dec(&netstamp_needed);
1633 EXPORT_SYMBOL(net_disable_timestamp);
1635 static inline void net_timestamp_set(struct sk_buff *skb)
1637 skb->tstamp.tv64 = 0;
1638 if (static_key_false(&netstamp_needed))
1639 __net_timestamp(skb);
1642 #define net_timestamp_check(COND, SKB) \
1643 if (static_key_false(&netstamp_needed)) { \
1644 if ((COND) && !(SKB)->tstamp.tv64) \
1645 __net_timestamp(SKB); \
1648 static int net_hwtstamp_validate(struct ifreq *ifr)
1650 struct hwtstamp_config cfg;
1651 enum hwtstamp_tx_types tx_type;
1652 enum hwtstamp_rx_filters rx_filter;
1653 int tx_type_valid = 0;
1654 int rx_filter_valid = 0;
1656 if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
1657 return -EFAULT;
1659 if (cfg.flags) /* reserved for future extensions */
1660 return -EINVAL;
1662 tx_type = cfg.tx_type;
1663 rx_filter = cfg.rx_filter;
1665 switch (tx_type) {
1666 case HWTSTAMP_TX_OFF:
1667 case HWTSTAMP_TX_ON:
1668 case HWTSTAMP_TX_ONESTEP_SYNC:
1669 tx_type_valid = 1;
1670 break;
1673 switch (rx_filter) {
1674 case HWTSTAMP_FILTER_NONE:
1675 case HWTSTAMP_FILTER_ALL:
1676 case HWTSTAMP_FILTER_SOME:
1677 case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
1678 case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
1679 case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
1680 case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
1681 case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
1682 case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
1683 case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
1684 case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
1685 case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
1686 case HWTSTAMP_FILTER_PTP_V2_EVENT:
1687 case HWTSTAMP_FILTER_PTP_V2_SYNC:
1688 case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
1689 rx_filter_valid = 1;
1690 break;
1693 if (!tx_type_valid || !rx_filter_valid)
1694 return -ERANGE;
1696 return 0;
1699 static inline bool is_skb_forwardable(struct net_device *dev,
1700 struct sk_buff *skb)
1702 unsigned int len;
1704 if (!(dev->flags & IFF_UP))
1705 return false;
1707 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1708 if (skb->len <= len)
1709 return true;
1711 /* if TSO is enabled, we don't care about the length as the packet
1712 * could be forwarded without being segmented before
1714 if (skb_is_gso(skb))
1715 return true;
1717 return false;
1721 * dev_forward_skb - loopback an skb to another netif
1723 * @dev: destination network device
1724 * @skb: buffer to forward
1726 * return values:
1727 * NET_RX_SUCCESS (no congestion)
1728 * NET_RX_DROP (packet was dropped, but freed)
1730 * dev_forward_skb can be used for injecting an skb from the
1731 * start_xmit function of one device into the receive queue
1732 * of another device.
1734 * The receiving device may be in another namespace, so
1735 * we have to clear all information in the skb that could
1736 * impact namespace isolation.
1738 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1740 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1741 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1742 atomic_long_inc(&dev->rx_dropped);
1743 kfree_skb(skb);
1744 return NET_RX_DROP;
1748 skb_orphan(skb);
1749 nf_reset(skb);
1751 if (unlikely(!is_skb_forwardable(dev, skb))) {
1752 atomic_long_inc(&dev->rx_dropped);
1753 kfree_skb(skb);
1754 return NET_RX_DROP;
1756 skb->skb_iif = 0;
1757 skb->dev = dev;
1758 skb_dst_drop(skb);
1759 skb->tstamp.tv64 = 0;
1760 skb->pkt_type = PACKET_HOST;
1761 skb->protocol = eth_type_trans(skb, dev);
1762 skb->mark = 0;
1763 secpath_reset(skb);
1764 nf_reset(skb);
1765 return netif_rx(skb);
1767 EXPORT_SYMBOL_GPL(dev_forward_skb);
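/*
 * Illustrative sketch (not part of dev.c): a veth-like pair device handing a
 * transmitted skb to its peer's receive path. The private structure is
 * hypothetical.
 */
struct example_pair_priv {
        struct net_device *peer;        /* hypothetical driver state */
};

static netdev_tx_t example_pair_xmit(struct sk_buff *skb, struct net_device *dev)
{
        struct example_pair_priv *priv = netdev_priv(dev);

        /* dev_forward_skb() scrubs namespace-sensitive state (dst, mark,
         * timestamp, skb_iif) and feeds the skb to the peer via netif_rx();
         * it frees the skb on drop, so the caller never does.
         */
        dev_forward_skb(priv->peer, skb);
        return NETDEV_TX_OK;
}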
1769 static inline int deliver_skb(struct sk_buff *skb,
1770 struct packet_type *pt_prev,
1771 struct net_device *orig_dev)
1773 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1774 return -ENOMEM;
1775 atomic_inc(&skb->users);
1776 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1779 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1781 if (!ptype->af_packet_priv || !skb->sk)
1782 return false;
1784 if (ptype->id_match)
1785 return ptype->id_match(ptype, skb->sk);
1786 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1787 return true;
1789 return false;
1793 * Support routine. Sends outgoing frames to any network
1794 * taps currently in use.
1797 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1799 struct packet_type *ptype;
1800 struct sk_buff *skb2 = NULL;
1801 struct packet_type *pt_prev = NULL;
1803 rcu_read_lock();
1804 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1805 /* Never send packets back to the socket
1806 * they originated from - MvS (miquels@drinkel.ow.org)
1808 if ((ptype->dev == dev || !ptype->dev) &&
1809 (!skb_loop_sk(ptype, skb))) {
1810 if (pt_prev) {
1811 deliver_skb(skb2, pt_prev, skb->dev);
1812 pt_prev = ptype;
1813 continue;
1816 skb2 = skb_clone(skb, GFP_ATOMIC);
1817 if (!skb2)
1818 break;
1820 net_timestamp_set(skb2);
1822 /* skb->nh should be correctly
1823 set by sender, so that the second statement is
1824 just protection against buggy protocols.
1826 skb_reset_mac_header(skb2);
1828 if (skb_network_header(skb2) < skb2->data ||
1829 skb2->network_header > skb2->tail) {
1830 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1831 ntohs(skb2->protocol),
1832 dev->name);
1833 skb_reset_network_header(skb2);
1836 skb2->transport_header = skb2->network_header;
1837 skb2->pkt_type = PACKET_OUTGOING;
1838 pt_prev = ptype;
1841 if (pt_prev)
1842 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1843 rcu_read_unlock();
1847 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1848 * @dev: Network device
1849 * @txq: number of queues available
1851 * If real_num_tx_queues is changed the tc mappings may no longer be
1852 * valid. To resolve this, verify that the tc mapping remains valid and,
1853 * if not, zero the mapping. With no priorities mapping to this
1854 * offset/count pair it will no longer be used. In the worst case, if TC0
1855 * is invalid nothing can be done, so priority mappings are disabled. It is
1856 * expected that drivers will fix this mapping if they can before
1857 * calling netif_set_real_num_tx_queues.
1859 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1861 int i;
1862 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1864 /* If TC0 is invalidated disable TC mapping */
1865 if (tc->offset + tc->count > txq) {
1866 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1867 dev->num_tc = 0;
1868 return;
1871 /* Invalidated prio to tc mappings set to TC0 */
1872 for (i = 1; i < TC_BITMASK + 1; i++) {
1873 int q = netdev_get_prio_tc_map(dev, i);
1875 tc = &dev->tc_to_txq[q];
1876 if (tc->offset + tc->count > txq) {
1877 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1878 i, q);
1879 netdev_set_prio_tc_map(dev, i, 0);
1884 #ifdef CONFIG_XPS
1885 static DEFINE_MUTEX(xps_map_mutex);
1886 #define xmap_dereference(P) \
1887 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1889 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1890 int cpu, u16 index)
1892 struct xps_map *map = NULL;
1893 int pos;
1895 if (dev_maps)
1896 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1898 for (pos = 0; map && pos < map->len; pos++) {
1899 if (map->queues[pos] == index) {
1900 if (map->len > 1) {
1901 map->queues[pos] = map->queues[--map->len];
1902 } else {
1903 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1904 kfree_rcu(map, rcu);
1905 map = NULL;
1907 break;
1911 return map;
1914 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1916 struct xps_dev_maps *dev_maps;
1917 int cpu, i;
1918 bool active = false;
1920 mutex_lock(&xps_map_mutex);
1921 dev_maps = xmap_dereference(dev->xps_maps);
1923 if (!dev_maps)
1924 goto out_no_maps;
1926 for_each_possible_cpu(cpu) {
1927 for (i = index; i < dev->num_tx_queues; i++) {
1928 if (!remove_xps_queue(dev_maps, cpu, i))
1929 break;
1931 if (i == dev->num_tx_queues)
1932 active = true;
1935 if (!active) {
1936 RCU_INIT_POINTER(dev->xps_maps, NULL);
1937 kfree_rcu(dev_maps, rcu);
1940 for (i = index; i < dev->num_tx_queues; i++)
1941 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1942 NUMA_NO_NODE);
1944 out_no_maps:
1945 mutex_unlock(&xps_map_mutex);
1948 static struct xps_map *expand_xps_map(struct xps_map *map,
1949 int cpu, u16 index)
1951 struct xps_map *new_map;
1952 int alloc_len = XPS_MIN_MAP_ALLOC;
1953 int i, pos;
1955 for (pos = 0; map && pos < map->len; pos++) {
1956 if (map->queues[pos] != index)
1957 continue;
1958 return map;
1961 /* Need to add queue to this CPU's existing map */
1962 if (map) {
1963 if (pos < map->alloc_len)
1964 return map;
1966 alloc_len = map->alloc_len * 2;
1969 /* Need to allocate new map to store queue on this CPU's map */
1970 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1971 cpu_to_node(cpu));
1972 if (!new_map)
1973 return NULL;
1975 for (i = 0; i < pos; i++)
1976 new_map->queues[i] = map->queues[i];
1977 new_map->alloc_len = alloc_len;
1978 new_map->len = pos;
1980 return new_map;
1983 int netif_set_xps_queue(struct net_device *dev, struct cpumask *mask, u16 index)
1985 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1986 struct xps_map *map, *new_map;
1987 int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1988 int cpu, numa_node_id = -2;
1989 bool active = false;
1991 mutex_lock(&xps_map_mutex);
1993 dev_maps = xmap_dereference(dev->xps_maps);
1995 /* allocate memory for queue storage */
1996 for_each_online_cpu(cpu) {
1997 if (!cpumask_test_cpu(cpu, mask))
1998 continue;
2000 if (!new_dev_maps)
2001 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2002 if (!new_dev_maps)
2003 return -ENOMEM;
2005 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2006 NULL;
2008 map = expand_xps_map(map, cpu, index);
2009 if (!map)
2010 goto error;
2012 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2015 if (!new_dev_maps)
2016 goto out_no_new_maps;
2018 for_each_possible_cpu(cpu) {
2019 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2020 /* add queue to CPU maps */
2021 int pos = 0;
2023 map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2024 while ((pos < map->len) && (map->queues[pos] != index))
2025 pos++;
2027 if (pos == map->len)
2028 map->queues[map->len++] = index;
2029 #ifdef CONFIG_NUMA
2030 if (numa_node_id == -2)
2031 numa_node_id = cpu_to_node(cpu);
2032 else if (numa_node_id != cpu_to_node(cpu))
2033 numa_node_id = -1;
2034 #endif
2035 } else if (dev_maps) {
2036 /* fill in the new device map from the old device map */
2037 map = xmap_dereference(dev_maps->cpu_map[cpu]);
2038 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2043 rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2045 /* Cleanup old maps */
2046 if (dev_maps) {
2047 for_each_possible_cpu(cpu) {
2048 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2049 map = xmap_dereference(dev_maps->cpu_map[cpu]);
2050 if (map && map != new_map)
2051 kfree_rcu(map, rcu);
2054 kfree_rcu(dev_maps, rcu);
2057 dev_maps = new_dev_maps;
2058 active = true;
2060 out_no_new_maps:
2061 /* update Tx queue numa node */
2062 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2063 (numa_node_id >= 0) ? numa_node_id :
2064 NUMA_NO_NODE);
2066 if (!dev_maps)
2067 goto out_no_maps;
2069 /* removes queue from unused CPUs */
2070 for_each_possible_cpu(cpu) {
2071 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2072 continue;
2074 if (remove_xps_queue(dev_maps, cpu, index))
2075 active = true;
2078 /* free map if not active */
2079 if (!active) {
2080 RCU_INIT_POINTER(dev->xps_maps, NULL);
2081 kfree_rcu(dev_maps, rcu);
2084 out_no_maps:
2085 mutex_unlock(&xps_map_mutex);
2087 return 0;
2088 error:
2089 /* remove any maps that we added */
2090 for_each_possible_cpu(cpu) {
2091 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2092 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2093 NULL;
2094 if (new_map && new_map != map)
2095 kfree(new_map);
2098 mutex_unlock(&xps_map_mutex);
2100 kfree(new_dev_maps);
2101 return -ENOMEM;
2103 EXPORT_SYMBOL(netif_set_xps_queue);
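/*
 * Illustrative sketch (not part of this file): a multiqueue driver could use
 * netif_set_xps_queue() from its probe or channel-setup path to pin each TX
 * queue to one CPU.  The loop and the round-robin policy are hypothetical.
 *
 *	u16 qi;
 *	struct cpumask mask;
 *
 *	for (qi = 0; qi < dev->real_num_tx_queues; qi++) {
 *		cpumask_clear(&mask);
 *		cpumask_set_cpu(qi % num_online_cpus(), &mask);
 *		if (netif_set_xps_queue(dev, &mask, qi))
 *			netdev_warn(dev, "XPS setup failed for queue %u\n", qi);
 *	}
 */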
2105 #endif
2107 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2108 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2110 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2112 int rc;
2114 if (txq < 1 || txq > dev->num_tx_queues)
2115 return -EINVAL;
2117 if (dev->reg_state == NETREG_REGISTERED ||
2118 dev->reg_state == NETREG_UNREGISTERING) {
2119 ASSERT_RTNL();
2121 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2122 txq);
2123 if (rc)
2124 return rc;
2126 if (dev->num_tc)
2127 netif_setup_tc(dev, txq);
2129 if (txq < dev->real_num_tx_queues) {
2130 qdisc_reset_all_tx_gt(dev, txq);
2131 #ifdef CONFIG_XPS
2132 netif_reset_xps_queues_gt(dev, txq);
2133 #endif
2137 dev->real_num_tx_queues = txq;
2138 return 0;
2140 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2142 #ifdef CONFIG_RPS
2144 * netif_set_real_num_rx_queues - set actual number of RX queues used
2145 * @dev: Network device
2146 * @rxq: Actual number of RX queues
2148 * This must be called either with the rtnl_lock held or before
2149 * registration of the net device. Returns 0 on success, or a
2150 * negative error code. If called before registration, it always
2151 * succeeds.
2153 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2155 int rc;
2157 if (rxq < 1 || rxq > dev->num_rx_queues)
2158 return -EINVAL;
2160 if (dev->reg_state == NETREG_REGISTERED) {
2161 ASSERT_RTNL();
2163 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2164 rxq);
2165 if (rc)
2166 return rc;
2169 dev->real_num_rx_queues = rxq;
2170 return 0;
2172 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
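/*
 * Illustrative sketch (not part of this file): a driver that reconfigures its
 * channel count (e.g. from an ethtool set_channels handler) would shrink or
 * grow the active queue sets with the two helpers above, under RTNL.  The
 * "new_txq"/"new_rxq" variables are hypothetical.
 *
 *	int err;
 *
 *	err = netif_set_real_num_tx_queues(dev, new_txq);
 *	if (err)
 *		return err;
 *
 *	return netif_set_real_num_rx_queues(dev, new_rxq);
 */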
2173 #endif
2176 * netif_get_num_default_rss_queues - default number of RSS queues
2178 * This routine should set an upper limit on the number of RSS queues
2179 * used by default by multiqueue devices.
2181 int netif_get_num_default_rss_queues(void)
2183 return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2185 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
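/*
 * Illustrative sketch (not part of this file): a driver would typically bound
 * its requested queue count by this helper and by what the hardware supports;
 * "hw_max_queues" and "struct my_priv" are hypothetical.
 *
 *	unsigned int nq = min_t(unsigned int, hw_max_queues,
 *				netif_get_num_default_rss_queues());
 *	struct net_device *dev = alloc_etherdev_mq(sizeof(struct my_priv), nq);
 */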
2187 static inline void __netif_reschedule(struct Qdisc *q)
2189 struct softnet_data *sd;
2190 unsigned long flags;
2192 local_irq_save(flags);
2193 sd = &__get_cpu_var(softnet_data);
2194 q->next_sched = NULL;
2195 *sd->output_queue_tailp = q;
2196 sd->output_queue_tailp = &q->next_sched;
2197 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2198 local_irq_restore(flags);
2201 void __netif_schedule(struct Qdisc *q)
2203 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2204 __netif_reschedule(q);
2206 EXPORT_SYMBOL(__netif_schedule);
2208 void dev_kfree_skb_irq(struct sk_buff *skb)
2210 if (atomic_dec_and_test(&skb->users)) {
2211 struct softnet_data *sd;
2212 unsigned long flags;
2214 local_irq_save(flags);
2215 sd = &__get_cpu_var(softnet_data);
2216 skb->next = sd->completion_queue;
2217 sd->completion_queue = skb;
2218 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2219 local_irq_restore(flags);
2222 EXPORT_SYMBOL(dev_kfree_skb_irq);
2224 void dev_kfree_skb_any(struct sk_buff *skb)
2226 if (in_irq() || irqs_disabled())
2227 dev_kfree_skb_irq(skb);
2228 else
2229 dev_kfree_skb(skb);
2231 EXPORT_SYMBOL(dev_kfree_skb_any);
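/*
 * Illustrative sketch (not part of this file): a TX-completion handler that
 * may run in hardirq context frees skbs with dev_kfree_skb_any() (or
 * dev_kfree_skb_irq()) rather than dev_kfree_skb(), which must not be called
 * with interrupts disabled.  The "my_tx_clean" helper is hypothetical.
 *
 *	static void my_tx_clean(struct sk_buff *completed_skb)
 *	{
 *		dev_kfree_skb_any(completed_skb);
 *	}
 */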
2235 * netif_device_detach - mark device as removed
2236 * @dev: network device
2238 * Mark device as removed from the system and therefore no longer available.
2240 void netif_device_detach(struct net_device *dev)
2242 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2243 netif_running(dev)) {
2244 netif_tx_stop_all_queues(dev);
2247 EXPORT_SYMBOL(netif_device_detach);
2250 * netif_device_attach - mark device as attached
2251 * @dev: network device
2253 * Mark device as attached to the system and restart it if needed.
2255 void netif_device_attach(struct net_device *dev)
2257 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2258 netif_running(dev)) {
2259 netif_tx_wake_all_queues(dev);
2260 __netdev_watchdog_up(dev);
2263 EXPORT_SYMBOL(netif_device_attach);
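/*
 * Illustrative sketch (not part of this file): a typical driver pairs the two
 * helpers above in its suspend/resume callbacks; "my_suspend"/"my_resume" and
 * the omitted hardware teardown/bring-up are hypothetical.
 *
 *	static int my_suspend(struct device *d)
 *	{
 *		struct net_device *dev = dev_get_drvdata(d);
 *
 *		netif_device_detach(dev);
 *		return 0;
 *	}
 *
 *	static int my_resume(struct device *d)
 *	{
 *		struct net_device *dev = dev_get_drvdata(d);
 *
 *		netif_device_attach(dev);
 *		return 0;
 *	}
 */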
2265 static void skb_warn_bad_offload(const struct sk_buff *skb)
2267 static const netdev_features_t null_features = 0;
2268 struct net_device *dev = skb->dev;
2269 const char *driver = "";
2271 if (dev && dev->dev.parent)
2272 driver = dev_driver_string(dev->dev.parent);
2274 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2275 "gso_type=%d ip_summed=%d\n",
2276 driver, dev ? &dev->features : &null_features,
2277 skb->sk ? &skb->sk->sk_route_caps : &null_features,
2278 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2279 skb_shinfo(skb)->gso_type, skb->ip_summed);
2283 * Invalidate hardware checksum when packet is to be mangled, and
2284 * complete checksum manually on outgoing path.
2286 int skb_checksum_help(struct sk_buff *skb)
2288 __wsum csum;
2289 int ret = 0, offset;
2291 if (skb->ip_summed == CHECKSUM_COMPLETE)
2292 goto out_set_summed;
2294 if (unlikely(skb_shinfo(skb)->gso_size)) {
2295 skb_warn_bad_offload(skb);
2296 return -EINVAL;
2299 /* Before computing a checksum, we should make sure no frag could
2300 * be modified by an external entity : checksum could be wrong.
2302 if (skb_has_shared_frag(skb)) {
2303 ret = __skb_linearize(skb);
2304 if (ret)
2305 goto out;
2308 offset = skb_checksum_start_offset(skb);
2309 BUG_ON(offset >= skb_headlen(skb));
2310 csum = skb_checksum(skb, offset, skb->len - offset, 0);
2312 offset += skb->csum_offset;
2313 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2315 if (skb_cloned(skb) &&
2316 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2317 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2318 if (ret)
2319 goto out;
2322 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2323 out_set_summed:
2324 skb->ip_summed = CHECKSUM_NONE;
2325 out:
2326 return ret;
2328 EXPORT_SYMBOL(skb_checksum_help);
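/*
 * Illustrative sketch (not part of this file): a driver whose hardware cannot
 * checksum a particular packet can fall back to software in its
 * ndo_start_xmit(); "my_hw_can_csum()" and the "drop" label are hypothetical.
 *
 *	if (skb->ip_summed == CHECKSUM_PARTIAL && !my_hw_can_csum(skb)) {
 *		if (skb_checksum_help(skb))
 *			goto drop;
 *	}
 */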
2330 /* openvswitch calls this on rx path, so we need a different check.
2332 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2334 if (tx_path)
2335 return skb->ip_summed != CHECKSUM_PARTIAL;
2336 else
2337 return skb->ip_summed == CHECKSUM_NONE;
2341 * __skb_gso_segment - Perform segmentation on skb.
2342 * @skb: buffer to segment
2343 * @features: features for the output path (see dev->features)
2344 * @tx_path: whether it is called in TX path
2346 * This function segments the given skb and returns a list of segments.
2348 * It may return NULL if the skb requires no segmentation. This is
2349 * only possible when GSO is used for verifying header integrity.
2351 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2352 netdev_features_t features, bool tx_path)
2354 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2355 struct packet_offload *ptype;
2356 __be16 type = skb->protocol;
2357 int vlan_depth = ETH_HLEN;
2358 int err;
2360 while (type == htons(ETH_P_8021Q)) {
2361 struct vlan_hdr *vh;
2363 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
2364 return ERR_PTR(-EINVAL);
2366 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2367 type = vh->h_vlan_encapsulated_proto;
2368 vlan_depth += VLAN_HLEN;
2371 skb_reset_mac_header(skb);
2372 skb->mac_len = skb->network_header - skb->mac_header;
2373 __skb_pull(skb, skb->mac_len);
2375 if (unlikely(skb_needs_check(skb, tx_path))) {
2376 skb_warn_bad_offload(skb);
2378 if (skb_header_cloned(skb) &&
2379 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2380 return ERR_PTR(err);
2383 rcu_read_lock();
2384 list_for_each_entry_rcu(ptype, &offload_base, list) {
2385 if (ptype->type == type && ptype->callbacks.gso_segment) {
2386 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2387 err = ptype->callbacks.gso_send_check(skb);
2388 segs = ERR_PTR(err);
2389 if (err || skb_gso_ok(skb, features))
2390 break;
2391 __skb_push(skb, (skb->data -
2392 skb_network_header(skb)));
2394 segs = ptype->callbacks.gso_segment(skb, features);
2395 break;
2398 rcu_read_unlock();
2400 __skb_push(skb, skb->data - skb_mac_header(skb));
2402 return segs;
2404 EXPORT_SYMBOL(__skb_gso_segment);
2406 /* Take action when hardware reception checksum errors are detected. */
2407 #ifdef CONFIG_BUG
2408 void netdev_rx_csum_fault(struct net_device *dev)
2410 if (net_ratelimit()) {
2411 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2412 dump_stack();
2415 EXPORT_SYMBOL(netdev_rx_csum_fault);
2416 #endif
2418 /* Actually, we should eliminate this check as soon as we know that:
2419  * 1. IOMMU is present and allows mapping all the memory.
2420 * 2. No high memory really exists on this machine.
2423 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2425 #ifdef CONFIG_HIGHMEM
2426 int i;
2427 if (!(dev->features & NETIF_F_HIGHDMA)) {
2428 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2429 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2430 if (PageHighMem(skb_frag_page(frag)))
2431 return 1;
2435 if (PCI_DMA_BUS_IS_PHYS) {
2436 struct device *pdev = dev->dev.parent;
2438 if (!pdev)
2439 return 0;
2440 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2441 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2442 dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2443 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2444 return 1;
2447 #endif
2448 return 0;
2451 struct dev_gso_cb {
2452 void (*destructor)(struct sk_buff *skb);
2455 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2457 static void dev_gso_skb_destructor(struct sk_buff *skb)
2459 struct dev_gso_cb *cb;
2461 do {
2462 struct sk_buff *nskb = skb->next;
2464 skb->next = nskb->next;
2465 nskb->next = NULL;
2466 kfree_skb(nskb);
2467 } while (skb->next);
2469 cb = DEV_GSO_CB(skb);
2470 if (cb->destructor)
2471 cb->destructor(skb);
2475 * dev_gso_segment - Perform emulated hardware segmentation on skb.
2476 * @skb: buffer to segment
2477 * @features: device features as applicable to this skb
2479 * This function segments the given skb and stores the list of segments
2480 * in skb->next.
2482 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2484 struct sk_buff *segs;
2486 segs = skb_gso_segment(skb, features);
2488 /* Verifying header integrity only. */
2489 if (!segs)
2490 return 0;
2492 if (IS_ERR(segs))
2493 return PTR_ERR(segs);
2495 skb->next = segs;
2496 DEV_GSO_CB(skb)->destructor = skb->destructor;
2497 skb->destructor = dev_gso_skb_destructor;
2499 return 0;
2502 static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
2504 return ((features & NETIF_F_GEN_CSUM) ||
2505 ((features & NETIF_F_V4_CSUM) &&
2506 protocol == htons(ETH_P_IP)) ||
2507 ((features & NETIF_F_V6_CSUM) &&
2508 protocol == htons(ETH_P_IPV6)) ||
2509 ((features & NETIF_F_FCOE_CRC) &&
2510 protocol == htons(ETH_P_FCOE)));
2513 static netdev_features_t harmonize_features(struct sk_buff *skb,
2514 __be16 protocol, netdev_features_t features)
2516 if (skb->ip_summed != CHECKSUM_NONE &&
2517 !can_checksum_protocol(features, protocol)) {
2518 features &= ~NETIF_F_ALL_CSUM;
2519 features &= ~NETIF_F_SG;
2520 } else if (illegal_highdma(skb->dev, skb)) {
2521 features &= ~NETIF_F_SG;
2524 return features;
2527 netdev_features_t netif_skb_features(struct sk_buff *skb)
2529 __be16 protocol = skb->protocol;
2530 netdev_features_t features = skb->dev->features;
2532 if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2533 features &= ~NETIF_F_GSO_MASK;
2535 if (protocol == htons(ETH_P_8021Q)) {
2536 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2537 protocol = veh->h_vlan_encapsulated_proto;
2538 } else if (!vlan_tx_tag_present(skb)) {
2539 return harmonize_features(skb, protocol, features);
2542 features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2544 if (protocol != htons(ETH_P_8021Q)) {
2545 return harmonize_features(skb, protocol, features);
2546 } else {
2547 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2548 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2549 return harmonize_features(skb, protocol, features);
2552 EXPORT_SYMBOL(netif_skb_features);
2555 * Returns true if either:
2556 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
2557 * 2. skb is fragmented and the device does not support SG.
2559 static inline int skb_needs_linearize(struct sk_buff *skb,
2560 int features)
2562 return skb_is_nonlinear(skb) &&
2563 ((skb_has_frag_list(skb) &&
2564 !(features & NETIF_F_FRAGLIST)) ||
2565 (skb_shinfo(skb)->nr_frags &&
2566 !(features & NETIF_F_SG)));
2569 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2570 struct netdev_queue *txq)
2572 const struct net_device_ops *ops = dev->netdev_ops;
2573 int rc = NETDEV_TX_OK;
2574 unsigned int skb_len;
2576 if (likely(!skb->next)) {
2577 netdev_features_t features;
2580 * If device doesn't need skb->dst, release it right now while
2581 * it's hot in this cpu cache
2583 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2584 skb_dst_drop(skb);
2586 features = netif_skb_features(skb);
2588 if (vlan_tx_tag_present(skb) &&
2589 !(features & NETIF_F_HW_VLAN_TX)) {
2590 skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2591 if (unlikely(!skb))
2592 goto out;
2594 skb->vlan_tci = 0;
2597 /* If this is an encapsulation offload request, verify we are testing
2598 * hardware encapsulation features instead of the standard
2599 * features for the netdev
2601 if (skb->encapsulation)
2602 features &= dev->hw_enc_features;
2604 if (netif_needs_gso(skb, features)) {
2605 if (unlikely(dev_gso_segment(skb, features)))
2606 goto out_kfree_skb;
2607 if (skb->next)
2608 goto gso;
2609 } else {
2610 if (skb_needs_linearize(skb, features) &&
2611 __skb_linearize(skb))
2612 goto out_kfree_skb;
2614 /* If packet is not checksummed and device does not
2615 * support checksumming for this protocol, complete
2616 * checksumming here.
2618 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2619 if (skb->encapsulation)
2620 skb_set_inner_transport_header(skb,
2621 skb_checksum_start_offset(skb));
2622 else
2623 skb_set_transport_header(skb,
2624 skb_checksum_start_offset(skb));
2625 if (!(features & NETIF_F_ALL_CSUM) &&
2626 skb_checksum_help(skb))
2627 goto out_kfree_skb;
2631 if (!list_empty(&ptype_all))
2632 dev_queue_xmit_nit(skb, dev);
2634 skb_len = skb->len;
2635 rc = ops->ndo_start_xmit(skb, dev);
2636 trace_net_dev_xmit(skb, rc, dev, skb_len);
2637 if (rc == NETDEV_TX_OK)
2638 txq_trans_update(txq);
2639 return rc;
2642 gso:
2643 do {
2644 struct sk_buff *nskb = skb->next;
2646 skb->next = nskb->next;
2647 nskb->next = NULL;
2650 * If device doesn't need nskb->dst, release it right now while
2651 * it's hot in this cpu cache
2653 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2654 skb_dst_drop(nskb);
2656 if (!list_empty(&ptype_all))
2657 dev_queue_xmit_nit(nskb, dev);
2659 skb_len = nskb->len;
2660 rc = ops->ndo_start_xmit(nskb, dev);
2661 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2662 if (unlikely(rc != NETDEV_TX_OK)) {
2663 if (rc & ~NETDEV_TX_MASK)
2664 goto out_kfree_gso_skb;
2665 nskb->next = skb->next;
2666 skb->next = nskb;
2667 return rc;
2669 txq_trans_update(txq);
2670 if (unlikely(netif_xmit_stopped(txq) && skb->next))
2671 return NETDEV_TX_BUSY;
2672 } while (skb->next);
2674 out_kfree_gso_skb:
2675 if (likely(skb->next == NULL))
2676 skb->destructor = DEV_GSO_CB(skb)->destructor;
2677 out_kfree_skb:
2678 kfree_skb(skb);
2679 out:
2680 return rc;
2683 static void qdisc_pkt_len_init(struct sk_buff *skb)
2685 const struct skb_shared_info *shinfo = skb_shinfo(skb);
2687 qdisc_skb_cb(skb)->pkt_len = skb->len;
2689 /* To get a more precise estimate of the bytes sent on the wire,
2690 * we add to pkt_len the header size of all segments
2692 if (shinfo->gso_size) {
2693 unsigned int hdr_len;
2695 /* mac layer + network layer */
2696 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2698 /* + transport layer */
2699 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2700 hdr_len += tcp_hdrlen(skb);
2701 else
2702 hdr_len += sizeof(struct udphdr);
2703 qdisc_skb_cb(skb)->pkt_len += (shinfo->gso_segs - 1) * hdr_len;
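/*
 * Worked example for the estimate above (numbers are illustrative): a TSO skb
 * carrying 45 segments of 1448 bytes behind 66 bytes of Ethernet + IPv4 + TCP
 * headers has skb->len = 66 + 45 * 1448 = 65226, so pkt_len becomes
 * 65226 + (45 - 1) * 66 = 68130, i.e. roughly what will actually hit the wire
 * once the hardware replicates the headers for every segment.
 */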
2707 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2708 struct net_device *dev,
2709 struct netdev_queue *txq)
2711 spinlock_t *root_lock = qdisc_lock(q);
2712 bool contended;
2713 int rc;
2715 qdisc_pkt_len_init(skb);
2716 qdisc_calculate_pkt_len(skb, q);
2718 * Heuristic to force contended enqueues to serialize on a
2719 * separate lock before trying to get qdisc main lock.
2720 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2721 * and dequeue packets faster.
2723 contended = qdisc_is_running(q);
2724 if (unlikely(contended))
2725 spin_lock(&q->busylock);
2727 spin_lock(root_lock);
2728 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2729 kfree_skb(skb);
2730 rc = NET_XMIT_DROP;
2731 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2732 qdisc_run_begin(q)) {
2734 * This is a work-conserving queue; there are no old skbs
2735 * waiting to be sent out; and the qdisc is not running -
2736 * xmit the skb directly.
2738 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2739 skb_dst_force(skb);
2741 qdisc_bstats_update(q, skb);
2743 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2744 if (unlikely(contended)) {
2745 spin_unlock(&q->busylock);
2746 contended = false;
2748 __qdisc_run(q);
2749 } else
2750 qdisc_run_end(q);
2752 rc = NET_XMIT_SUCCESS;
2753 } else {
2754 skb_dst_force(skb);
2755 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2756 if (qdisc_run_begin(q)) {
2757 if (unlikely(contended)) {
2758 spin_unlock(&q->busylock);
2759 contended = false;
2761 __qdisc_run(q);
2764 spin_unlock(root_lock);
2765 if (unlikely(contended))
2766 spin_unlock(&q->busylock);
2767 return rc;
2770 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2771 static void skb_update_prio(struct sk_buff *skb)
2773 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2775 if (!skb->priority && skb->sk && map) {
2776 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2778 if (prioidx < map->priomap_len)
2779 skb->priority = map->priomap[prioidx];
2782 #else
2783 #define skb_update_prio(skb)
2784 #endif
2786 static DEFINE_PER_CPU(int, xmit_recursion);
2787 #define RECURSION_LIMIT 10
2790 * dev_loopback_xmit - loop back @skb
2791 * @skb: buffer to transmit
2793 int dev_loopback_xmit(struct sk_buff *skb)
2795 skb_reset_mac_header(skb);
2796 __skb_pull(skb, skb_network_offset(skb));
2797 skb->pkt_type = PACKET_LOOPBACK;
2798 skb->ip_summed = CHECKSUM_UNNECESSARY;
2799 WARN_ON(!skb_dst(skb));
2800 skb_dst_force(skb);
2801 netif_rx_ni(skb);
2802 return 0;
2804 EXPORT_SYMBOL(dev_loopback_xmit);
2807 * dev_queue_xmit - transmit a buffer
2808 * @skb: buffer to transmit
2810 * Queue a buffer for transmission to a network device. The caller must
2811 * have set the device and priority and built the buffer before calling
2812 * this function. The function can be called from an interrupt.
2814 * A negative errno code is returned on a failure. A success does not
2815 * guarantee the frame will be transmitted as it may be dropped due
2816 * to congestion or traffic shaping.
2818 * -----------------------------------------------------------------------------------
2819 * I notice this method can also return errors from the queue disciplines,
2820 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2821 * be positive.
2823 * Regardless of the return value, the skb is consumed, so it is currently
2824 * difficult to retry a send to this method. (You can bump the ref count
2825 * before sending to hold a reference for retry if you are careful.)
2827 * When calling this method, interrupts MUST be enabled. This is because
2828 * the BH enable code must have IRQs enabled so that it will not deadlock.
2829 * --BLG
2831 int dev_queue_xmit(struct sk_buff *skb)
2833 struct net_device *dev = skb->dev;
2834 struct netdev_queue *txq;
2835 struct Qdisc *q;
2836 int rc = -ENOMEM;
2838 skb_reset_mac_header(skb);
2840 /* Disable soft irqs for various locks below. Also
2841 * stops preemption for RCU.
2843 rcu_read_lock_bh();
2845 skb_update_prio(skb);
2847 txq = netdev_pick_tx(dev, skb);
2848 q = rcu_dereference_bh(txq->qdisc);
2850 #ifdef CONFIG_NET_CLS_ACT
2851 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2852 #endif
2853 trace_net_dev_queue(skb);
2854 if (q->enqueue) {
2855 rc = __dev_xmit_skb(skb, q, dev, txq);
2856 goto out;
2859 /* The device has no queue. Common case for software devices:
2860 loopback, all the sorts of tunnels...
2862 Really, it is unlikely that netif_tx_lock protection is necessary
2863 here. (f.e. loopback and IP tunnels are clean ignoring statistics
2864 counters.)
2865 However, it is possible that they rely on the protection
2866 made by us here.
2868 Check this and take the lock. It is not prone to deadlocks.
2869 Or take the noqueue qdisc path, it is even simpler 8)
2871 if (dev->flags & IFF_UP) {
2872 int cpu = smp_processor_id(); /* ok because BHs are off */
2874 if (txq->xmit_lock_owner != cpu) {
2876 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2877 goto recursion_alert;
2879 HARD_TX_LOCK(dev, txq, cpu);
2881 if (!netif_xmit_stopped(txq)) {
2882 __this_cpu_inc(xmit_recursion);
2883 rc = dev_hard_start_xmit(skb, dev, txq);
2884 __this_cpu_dec(xmit_recursion);
2885 if (dev_xmit_complete(rc)) {
2886 HARD_TX_UNLOCK(dev, txq);
2887 goto out;
2890 HARD_TX_UNLOCK(dev, txq);
2891 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2892 dev->name);
2893 } else {
2894 /* Recursion is detected! It is possible,
2895 * unfortunately
2897 recursion_alert:
2898 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2899 dev->name);
2903 rc = -ENETDOWN;
2904 rcu_read_unlock_bh();
2906 kfree_skb(skb);
2907 return rc;
2908 out:
2909 rcu_read_unlock_bh();
2910 return rc;
2912 EXPORT_SYMBOL(dev_queue_xmit);
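/*
 * Illustrative sketch (not part of this file): a caller that builds its own
 * frame hands it to dev_queue_xmit() with dev, priority and protocol already
 * set, and must not touch the skb afterwards since it is consumed either way.
 * The payload construction is assumed to have happened earlier, and as noted
 * above the error codes may be positive (NET_XMIT_*).
 *
 *	skb->dev = dev;
 *	skb->priority = TC_PRIO_CONTROL;
 *	skb->protocol = htons(ETH_P_802_2);
 *	if (dev_queue_xmit(skb) != NET_XMIT_SUCCESS)
 *		pr_debug("frame dropped or queued under congestion\n");
 */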
2915 /*=======================================================================
2916 Receiver routines
2917 =======================================================================*/
2919 int netdev_max_backlog __read_mostly = 1000;
2920 EXPORT_SYMBOL(netdev_max_backlog);
2922 int netdev_tstamp_prequeue __read_mostly = 1;
2923 int netdev_budget __read_mostly = 300;
2924 int weight_p __read_mostly = 64; /* old backlog weight */
2926 /* Called with irq disabled */
2927 static inline void ____napi_schedule(struct softnet_data *sd,
2928 struct napi_struct *napi)
2930 list_add_tail(&napi->poll_list, &sd->poll_list);
2931 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2934 #ifdef CONFIG_RPS
2936 /* One global table that all flow-based protocols share. */
2937 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2938 EXPORT_SYMBOL(rps_sock_flow_table);
2940 struct static_key rps_needed __read_mostly;
2942 static struct rps_dev_flow *
2943 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2944 struct rps_dev_flow *rflow, u16 next_cpu)
2946 if (next_cpu != RPS_NO_CPU) {
2947 #ifdef CONFIG_RFS_ACCEL
2948 struct netdev_rx_queue *rxqueue;
2949 struct rps_dev_flow_table *flow_table;
2950 struct rps_dev_flow *old_rflow;
2951 u32 flow_id;
2952 u16 rxq_index;
2953 int rc;
2955 /* Should we steer this flow to a different hardware queue? */
2956 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2957 !(dev->features & NETIF_F_NTUPLE))
2958 goto out;
2959 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2960 if (rxq_index == skb_get_rx_queue(skb))
2961 goto out;
2963 rxqueue = dev->_rx + rxq_index;
2964 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2965 if (!flow_table)
2966 goto out;
2967 flow_id = skb->rxhash & flow_table->mask;
2968 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2969 rxq_index, flow_id);
2970 if (rc < 0)
2971 goto out;
2972 old_rflow = rflow;
2973 rflow = &flow_table->flows[flow_id];
2974 rflow->filter = rc;
2975 if (old_rflow->filter == rflow->filter)
2976 old_rflow->filter = RPS_NO_FILTER;
2977 out:
2978 #endif
2979 rflow->last_qtail =
2980 per_cpu(softnet_data, next_cpu).input_queue_head;
2983 rflow->cpu = next_cpu;
2984 return rflow;
2988 * get_rps_cpu is called from netif_receive_skb and returns the target
2989 * CPU from the RPS map of the receiving queue for a given skb.
2990 * rcu_read_lock must be held on entry.
2992 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2993 struct rps_dev_flow **rflowp)
2995 struct netdev_rx_queue *rxqueue;
2996 struct rps_map *map;
2997 struct rps_dev_flow_table *flow_table;
2998 struct rps_sock_flow_table *sock_flow_table;
2999 int cpu = -1;
3000 u16 tcpu;
3002 if (skb_rx_queue_recorded(skb)) {
3003 u16 index = skb_get_rx_queue(skb);
3004 if (unlikely(index >= dev->real_num_rx_queues)) {
3005 WARN_ONCE(dev->real_num_rx_queues > 1,
3006 "%s received packet on queue %u, but number "
3007 "of RX queues is %u\n",
3008 dev->name, index, dev->real_num_rx_queues);
3009 goto done;
3011 rxqueue = dev->_rx + index;
3012 } else
3013 rxqueue = dev->_rx;
3015 map = rcu_dereference(rxqueue->rps_map);
3016 if (map) {
3017 if (map->len == 1 &&
3018 !rcu_access_pointer(rxqueue->rps_flow_table)) {
3019 tcpu = map->cpus[0];
3020 if (cpu_online(tcpu))
3021 cpu = tcpu;
3022 goto done;
3024 } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
3025 goto done;
3028 skb_reset_network_header(skb);
3029 if (!skb_get_rxhash(skb))
3030 goto done;
3032 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3033 sock_flow_table = rcu_dereference(rps_sock_flow_table);
3034 if (flow_table && sock_flow_table) {
3035 u16 next_cpu;
3036 struct rps_dev_flow *rflow;
3038 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
3039 tcpu = rflow->cpu;
3041 next_cpu = sock_flow_table->ents[skb->rxhash &
3042 sock_flow_table->mask];
3045 * If the desired CPU (where last recvmsg was done) is
3046 * different from current CPU (one in the rx-queue flow
3047 * table entry), switch if one of the following holds:
3048 * - Current CPU is unset (equal to RPS_NO_CPU).
3049 * - Current CPU is offline.
3050 * - The current CPU's queue tail has advanced beyond the
3051 * last packet that was enqueued using this table entry.
3052 * This guarantees that all previous packets for the flow
3053 * have been dequeued, thus preserving in order delivery.
3055 if (unlikely(tcpu != next_cpu) &&
3056 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3057 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3058 rflow->last_qtail)) >= 0)) {
3059 tcpu = next_cpu;
3060 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3063 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3064 *rflowp = rflow;
3065 cpu = tcpu;
3066 goto done;
3070 if (map) {
3071 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
3073 if (cpu_online(tcpu)) {
3074 cpu = tcpu;
3075 goto done;
3079 done:
3080 return cpu;
3083 #ifdef CONFIG_RFS_ACCEL
3086 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3087 * @dev: Device on which the filter was set
3088 * @rxq_index: RX queue index
3089 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3090 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3092 * Drivers that implement ndo_rx_flow_steer() should periodically call
3093 * this function for each installed filter and remove the filters for
3094 * which it returns %true.
3096 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3097 u32 flow_id, u16 filter_id)
3099 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3100 struct rps_dev_flow_table *flow_table;
3101 struct rps_dev_flow *rflow;
3102 bool expire = true;
3103 int cpu;
3105 rcu_read_lock();
3106 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3107 if (flow_table && flow_id <= flow_table->mask) {
3108 rflow = &flow_table->flows[flow_id];
3109 cpu = ACCESS_ONCE(rflow->cpu);
3110 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3111 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3112 rflow->last_qtail) <
3113 (int)(10 * flow_table->mask)))
3114 expire = false;
3116 rcu_read_unlock();
3117 return expire;
3119 EXPORT_SYMBOL(rps_may_expire_flow);
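/*
 * Illustrative sketch (not part of this file): a driver implementing
 * ndo_rx_flow_steer() would periodically walk its installed filters and drop
 * the stale ones; "priv", "f", "tmp" and "my_remove_filter()" are
 * hypothetical driver-side bookkeeping.
 *
 *	list_for_each_entry_safe(f, tmp, &priv->rfs_filters, list) {
 *		if (rps_may_expire_flow(priv->netdev, f->rxq_index,
 *					f->flow_id, f->filter_id))
 *			my_remove_filter(priv, f);
 *	}
 */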
3121 #endif /* CONFIG_RFS_ACCEL */
3123 /* Called from hardirq (IPI) context */
3124 static void rps_trigger_softirq(void *data)
3126 struct softnet_data *sd = data;
3128 ____napi_schedule(sd, &sd->backlog);
3129 sd->received_rps++;
3132 #endif /* CONFIG_RPS */
3135 * Check if this softnet_data structure belongs to another cpu.
3136 * If yes, queue it to our IPI list and return 1;
3137 * if no, return 0.
3139 static int rps_ipi_queued(struct softnet_data *sd)
3141 #ifdef CONFIG_RPS
3142 struct softnet_data *mysd = &__get_cpu_var(softnet_data);
3144 if (sd != mysd) {
3145 sd->rps_ipi_next = mysd->rps_ipi_list;
3146 mysd->rps_ipi_list = sd;
3148 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3149 return 1;
3151 #endif /* CONFIG_RPS */
3152 return 0;
3156 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3157 * queue (may be a remote CPU queue).
3159 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3160 unsigned int *qtail)
3162 struct softnet_data *sd;
3163 unsigned long flags;
3165 sd = &per_cpu(softnet_data, cpu);
3167 local_irq_save(flags);
3169 rps_lock(sd);
3170 if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
3171 if (skb_queue_len(&sd->input_pkt_queue)) {
3172 enqueue:
3173 __skb_queue_tail(&sd->input_pkt_queue, skb);
3174 input_queue_tail_incr_save(sd, qtail);
3175 rps_unlock(sd);
3176 local_irq_restore(flags);
3177 return NET_RX_SUCCESS;
3180 /* Schedule NAPI for the backlog device.
3181 * We can use a non-atomic operation since we own the queue lock.
3183 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3184 if (!rps_ipi_queued(sd))
3185 ____napi_schedule(sd, &sd->backlog);
3187 goto enqueue;
3190 sd->dropped++;
3191 rps_unlock(sd);
3193 local_irq_restore(flags);
3195 atomic_long_inc(&skb->dev->rx_dropped);
3196 kfree_skb(skb);
3197 return NET_RX_DROP;
3201 * netif_rx - post buffer to the network code
3202 * @skb: buffer to post
3204 * This function receives a packet from a device driver and queues it for
3205 * the upper (protocol) levels to process. It always succeeds. The buffer
3206 * may be dropped during processing for congestion control or by the
3207 * protocol layers.
3209 * return values:
3210 * NET_RX_SUCCESS (no congestion)
3211 * NET_RX_DROP (packet was dropped)
3215 int netif_rx(struct sk_buff *skb)
3217 int ret;
3219 /* if netpoll wants it, pretend we never saw it */
3220 if (netpoll_rx(skb))
3221 return NET_RX_DROP;
3223 net_timestamp_check(netdev_tstamp_prequeue, skb);
3225 trace_netif_rx(skb);
3226 #ifdef CONFIG_RPS
3227 if (static_key_false(&rps_needed)) {
3228 struct rps_dev_flow voidflow, *rflow = &voidflow;
3229 int cpu;
3231 preempt_disable();
3232 rcu_read_lock();
3234 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3235 if (cpu < 0)
3236 cpu = smp_processor_id();
3238 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3240 rcu_read_unlock();
3241 preempt_enable();
3242 } else
3243 #endif
3245 unsigned int qtail;
3246 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3247 put_cpu();
3249 return ret;
3251 EXPORT_SYMBOL(netif_rx);
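/*
 * Illustrative sketch (not part of this file): a non-NAPI driver's receive
 * interrupt builds an skb and posts it with netif_rx(); "my_copy_frame()"
 * and the way "len" is obtained are hypothetical.
 *
 *	skb = netdev_alloc_skb_ip_align(dev, len);
 *	if (!skb) {
 *		dev->stats.rx_dropped++;
 *		return;
 *	}
 *	my_copy_frame(skb_put(skb, len), len);
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);
 */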
3253 int netif_rx_ni(struct sk_buff *skb)
3255 int err;
3257 preempt_disable();
3258 err = netif_rx(skb);
3259 if (local_softirq_pending())
3260 do_softirq();
3261 preempt_enable();
3263 return err;
3265 EXPORT_SYMBOL(netif_rx_ni);
3267 static void net_tx_action(struct softirq_action *h)
3269 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3271 if (sd->completion_queue) {
3272 struct sk_buff *clist;
3274 local_irq_disable();
3275 clist = sd->completion_queue;
3276 sd->completion_queue = NULL;
3277 local_irq_enable();
3279 while (clist) {
3280 struct sk_buff *skb = clist;
3281 clist = clist->next;
3283 WARN_ON(atomic_read(&skb->users));
3284 trace_kfree_skb(skb, net_tx_action);
3285 __kfree_skb(skb);
3289 if (sd->output_queue) {
3290 struct Qdisc *head;
3292 local_irq_disable();
3293 head = sd->output_queue;
3294 sd->output_queue = NULL;
3295 sd->output_queue_tailp = &sd->output_queue;
3296 local_irq_enable();
3298 while (head) {
3299 struct Qdisc *q = head;
3300 spinlock_t *root_lock;
3302 head = head->next_sched;
3304 root_lock = qdisc_lock(q);
3305 if (spin_trylock(root_lock)) {
3306 smp_mb__before_clear_bit();
3307 clear_bit(__QDISC_STATE_SCHED,
3308 &q->state);
3309 qdisc_run(q);
3310 spin_unlock(root_lock);
3311 } else {
3312 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3313 &q->state)) {
3314 __netif_reschedule(q);
3315 } else {
3316 smp_mb__before_clear_bit();
3317 clear_bit(__QDISC_STATE_SCHED,
3318 &q->state);
3325 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3326 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3327 /* This hook is defined here for ATM LANE */
3328 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3329 unsigned char *addr) __read_mostly;
3330 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3331 #endif
3333 #ifdef CONFIG_NET_CLS_ACT
3334 /* TODO: Maybe we should just force sch_ingress to be compiled in
3335 * when CONFIG_NET_CLS_ACT is? Otherwise we pay for some useless
3336 * instructions (a compare and two extra stores) when we don't have
3337 * it enabled but do have CONFIG_NET_CLS_ACT.
3338 * NOTE: This doesn't remove any functionality; if you don't have
3339 * the ingress scheduler, you just can't add policies on ingress.
3342 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3344 struct net_device *dev = skb->dev;
3345 u32 ttl = G_TC_RTTL(skb->tc_verd);
3346 int result = TC_ACT_OK;
3347 struct Qdisc *q;
3349 if (unlikely(MAX_RED_LOOP < ttl++)) {
3350 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3351 skb->skb_iif, dev->ifindex);
3352 return TC_ACT_SHOT;
3355 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3356 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3358 q = rxq->qdisc;
3359 if (q != &noop_qdisc) {
3360 spin_lock(qdisc_lock(q));
3361 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3362 result = qdisc_enqueue_root(skb, q);
3363 spin_unlock(qdisc_lock(q));
3366 return result;
3369 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3370 struct packet_type **pt_prev,
3371 int *ret, struct net_device *orig_dev)
3373 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3375 if (!rxq || rxq->qdisc == &noop_qdisc)
3376 goto out;
3378 if (*pt_prev) {
3379 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3380 *pt_prev = NULL;
3383 switch (ing_filter(skb, rxq)) {
3384 case TC_ACT_SHOT:
3385 case TC_ACT_STOLEN:
3386 kfree_skb(skb);
3387 return NULL;
3390 out:
3391 skb->tc_verd = 0;
3392 return skb;
3394 #endif
3397 * netdev_rx_handler_register - register receive handler
3398 * @dev: device to register a handler for
3399 * @rx_handler: receive handler to register
3400 * @rx_handler_data: data pointer that is used by rx handler
3402 * Register a receive handler for a device. This handler will then be
3403 * called from __netif_receive_skb. A negative errno code is returned
3404 * on a failure.
3406 * The caller must hold the rtnl_mutex.
3408 * For a general description of rx_handler, see enum rx_handler_result.
3410 int netdev_rx_handler_register(struct net_device *dev,
3411 rx_handler_func_t *rx_handler,
3412 void *rx_handler_data)
3414 ASSERT_RTNL();
3416 if (dev->rx_handler)
3417 return -EBUSY;
3419 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3420 rcu_assign_pointer(dev->rx_handler, rx_handler);
3422 return 0;
3424 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
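/*
 * Illustrative sketch (not part of this file): an upper device such as a
 * bridge or bonding master claims a slave's receive path like this, under
 * RTNL; "my_handle_frame" and "my_port" are hypothetical.
 *
 *	int err;
 *
 *	err = netdev_rx_handler_register(slave_dev, my_handle_frame, my_port);
 *	if (err)
 *		return err;
 *
 * and releases it again with netdev_rx_handler_unregister(slave_dev) before
 * freeing my_port.
 */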
3427 * netdev_rx_handler_unregister - unregister receive handler
3428 * @dev: device to unregister a handler from
3430 * Unregister a receive handler from a device.
3432 * The caller must hold the rtnl_mutex.
3434 void netdev_rx_handler_unregister(struct net_device *dev)
3437 ASSERT_RTNL();
3438 RCU_INIT_POINTER(dev->rx_handler, NULL);
3439 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3441 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3444 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3445 * the special handling of PFMEMALLOC skbs.
3447 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3449 switch (skb->protocol) {
3450 case __constant_htons(ETH_P_ARP):
3451 case __constant_htons(ETH_P_IP):
3452 case __constant_htons(ETH_P_IPV6):
3453 case __constant_htons(ETH_P_8021Q):
3454 return true;
3455 default:
3456 return false;
3460 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3462 struct packet_type *ptype, *pt_prev;
3463 rx_handler_func_t *rx_handler;
3464 struct net_device *orig_dev;
3465 struct net_device *null_or_dev;
3466 bool deliver_exact = false;
3467 int ret = NET_RX_DROP;
3468 __be16 type;
3470 net_timestamp_check(!netdev_tstamp_prequeue, skb);
3472 trace_netif_receive_skb(skb);
3474 /* if we've gotten here through NAPI, check netpoll */
3475 if (netpoll_receive_skb(skb))
3476 goto out;
3478 orig_dev = skb->dev;
3480 skb_reset_network_header(skb);
3481 if (!skb_transport_header_was_set(skb))
3482 skb_reset_transport_header(skb);
3483 skb_reset_mac_len(skb);
3485 pt_prev = NULL;
3487 rcu_read_lock();
3489 another_round:
3490 skb->skb_iif = skb->dev->ifindex;
3492 __this_cpu_inc(softnet_data.processed);
3494 if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3495 skb = vlan_untag(skb);
3496 if (unlikely(!skb))
3497 goto unlock;
3500 #ifdef CONFIG_NET_CLS_ACT
3501 if (skb->tc_verd & TC_NCLS) {
3502 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3503 goto ncls;
3505 #endif
3507 if (pfmemalloc)
3508 goto skip_taps;
3510 list_for_each_entry_rcu(ptype, &ptype_all, list) {
3511 if (!ptype->dev || ptype->dev == skb->dev) {
3512 if (pt_prev)
3513 ret = deliver_skb(skb, pt_prev, orig_dev);
3514 pt_prev = ptype;
3518 skip_taps:
3519 #ifdef CONFIG_NET_CLS_ACT
3520 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3521 if (!skb)
3522 goto unlock;
3523 ncls:
3524 #endif
3526 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3527 goto drop;
3529 if (vlan_tx_tag_present(skb)) {
3530 if (pt_prev) {
3531 ret = deliver_skb(skb, pt_prev, orig_dev);
3532 pt_prev = NULL;
3534 if (vlan_do_receive(&skb))
3535 goto another_round;
3536 else if (unlikely(!skb))
3537 goto unlock;
3540 rx_handler = rcu_dereference(skb->dev->rx_handler);
3541 if (rx_handler) {
3542 if (pt_prev) {
3543 ret = deliver_skb(skb, pt_prev, orig_dev);
3544 pt_prev = NULL;
3546 switch (rx_handler(&skb)) {
3547 case RX_HANDLER_CONSUMED:
3548 goto unlock;
3549 case RX_HANDLER_ANOTHER:
3550 goto another_round;
3551 case RX_HANDLER_EXACT:
3552 deliver_exact = true;
3553 case RX_HANDLER_PASS:
3554 break;
3555 default:
3556 BUG();
3560 if (vlan_tx_nonzero_tag_present(skb))
3561 skb->pkt_type = PACKET_OTHERHOST;
3563 /* deliver only exact match when indicated */
3564 null_or_dev = deliver_exact ? skb->dev : NULL;
3566 type = skb->protocol;
3567 list_for_each_entry_rcu(ptype,
3568 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3569 if (ptype->type == type &&
3570 (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3571 ptype->dev == orig_dev)) {
3572 if (pt_prev)
3573 ret = deliver_skb(skb, pt_prev, orig_dev);
3574 pt_prev = ptype;
3578 if (pt_prev) {
3579 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3580 goto drop;
3581 else
3582 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3583 } else {
3584 drop:
3585 atomic_long_inc(&skb->dev->rx_dropped);
3586 kfree_skb(skb);
3587 /* Jamal, now you will not be able to escape explaining
3588 * to me how you were going to use this. :-)
3590 ret = NET_RX_DROP;
3593 unlock:
3594 rcu_read_unlock();
3595 out:
3596 return ret;
3599 static int __netif_receive_skb(struct sk_buff *skb)
3601 int ret;
3603 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3604 unsigned long pflags = current->flags;
3607 * PFMEMALLOC skbs are special, they should
3608 * - be delivered to SOCK_MEMALLOC sockets only
3609 * - stay away from userspace
3610 * - have bounded memory usage
3612 * Use PF_MEMALLOC as this saves us from propagating the allocation
3613 * context down to all allocation sites.
3615 current->flags |= PF_MEMALLOC;
3616 ret = __netif_receive_skb_core(skb, true);
3617 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3618 } else
3619 ret = __netif_receive_skb_core(skb, false);
3621 return ret;
3625 * netif_receive_skb - process receive buffer from network
3626 * @skb: buffer to process
3628 * netif_receive_skb() is the main receive data processing function.
3629 * It always succeeds. The buffer may be dropped during processing
3630 * for congestion control or by the protocol layers.
3632 * This function may only be called from softirq context and interrupts
3633 * should be enabled.
3635 * Return values (usually ignored):
3636 * NET_RX_SUCCESS: no congestion
3637 * NET_RX_DROP: packet was dropped
3639 int netif_receive_skb(struct sk_buff *skb)
3641 net_timestamp_check(netdev_tstamp_prequeue, skb);
3643 if (skb_defer_rx_timestamp(skb))
3644 return NET_RX_SUCCESS;
3646 #ifdef CONFIG_RPS
3647 if (static_key_false(&rps_needed)) {
3648 struct rps_dev_flow voidflow, *rflow = &voidflow;
3649 int cpu, ret;
3651 rcu_read_lock();
3653 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3655 if (cpu >= 0) {
3656 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3657 rcu_read_unlock();
3658 return ret;
3660 rcu_read_unlock();
3662 #endif
3663 return __netif_receive_skb(skb);
3665 EXPORT_SYMBOL(netif_receive_skb);
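/*
 * Illustrative sketch (not part of this file): a NAPI driver delivers fully
 * built skbs from its poll routine with netif_receive_skb() (or, for better
 * aggregation, napi_gro_receive() below); "budget_left", "ring" and
 * "my_get_next_rx_skb()" are hypothetical.
 *
 *	while (budget_left-- && (skb = my_get_next_rx_skb(ring))) {
 *		skb->protocol = eth_type_trans(skb, dev);
 *		netif_receive_skb(skb);
 *	}
 */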
3667 /* Network device is going away, flush any packets still pending
3668 * Called with irqs disabled.
3670 static void flush_backlog(void *arg)
3672 struct net_device *dev = arg;
3673 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3674 struct sk_buff *skb, *tmp;
3676 rps_lock(sd);
3677 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3678 if (skb->dev == dev) {
3679 __skb_unlink(skb, &sd->input_pkt_queue);
3680 kfree_skb(skb);
3681 input_queue_head_incr(sd);
3684 rps_unlock(sd);
3686 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3687 if (skb->dev == dev) {
3688 __skb_unlink(skb, &sd->process_queue);
3689 kfree_skb(skb);
3690 input_queue_head_incr(sd);
3695 static int napi_gro_complete(struct sk_buff *skb)
3697 struct packet_offload *ptype;
3698 __be16 type = skb->protocol;
3699 struct list_head *head = &offload_base;
3700 int err = -ENOENT;
3702 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3704 if (NAPI_GRO_CB(skb)->count == 1) {
3705 skb_shinfo(skb)->gso_size = 0;
3706 goto out;
3709 rcu_read_lock();
3710 list_for_each_entry_rcu(ptype, head, list) {
3711 if (ptype->type != type || !ptype->callbacks.gro_complete)
3712 continue;
3714 err = ptype->callbacks.gro_complete(skb);
3715 break;
3717 rcu_read_unlock();
3719 if (err) {
3720 WARN_ON(&ptype->list == head);
3721 kfree_skb(skb);
3722 return NET_RX_SUCCESS;
3725 out:
3726 return netif_receive_skb(skb);
3729 /* napi->gro_list contains packets ordered by age;
3730 * the youngest packets are at its head.
3731 * Complete skbs in reverse order to reduce latencies.
3733 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3735 struct sk_buff *skb, *prev = NULL;
3737 /* scan list and build reverse chain */
3738 for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3739 skb->prev = prev;
3740 prev = skb;
3743 for (skb = prev; skb; skb = prev) {
3744 skb->next = NULL;
3746 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3747 return;
3749 prev = skb->prev;
3750 napi_gro_complete(skb);
3751 napi->gro_count--;
3754 napi->gro_list = NULL;
3756 EXPORT_SYMBOL(napi_gro_flush);
3758 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3760 struct sk_buff *p;
3761 unsigned int maclen = skb->dev->hard_header_len;
3763 for (p = napi->gro_list; p; p = p->next) {
3764 unsigned long diffs;
3766 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3767 diffs |= p->vlan_tci ^ skb->vlan_tci;
3768 if (maclen == ETH_HLEN)
3769 diffs |= compare_ether_header(skb_mac_header(p),
3770 skb_gro_mac_header(skb));
3771 else if (!diffs)
3772 diffs = memcmp(skb_mac_header(p),
3773 skb_gro_mac_header(skb),
3774 maclen);
3775 NAPI_GRO_CB(p)->same_flow = !diffs;
3776 NAPI_GRO_CB(p)->flush = 0;
3780 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3782 struct sk_buff **pp = NULL;
3783 struct packet_offload *ptype;
3784 __be16 type = skb->protocol;
3785 struct list_head *head = &offload_base;
3786 int same_flow;
3787 int mac_len;
3788 enum gro_result ret;
3790 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3791 goto normal;
3793 if (skb_is_gso(skb) || skb_has_frag_list(skb))
3794 goto normal;
3796 gro_list_prepare(napi, skb);
3798 rcu_read_lock();
3799 list_for_each_entry_rcu(ptype, head, list) {
3800 if (ptype->type != type || !ptype->callbacks.gro_receive)
3801 continue;
3803 skb_set_network_header(skb, skb_gro_offset(skb));
3804 mac_len = skb->network_header - skb->mac_header;
3805 skb->mac_len = mac_len;
3806 NAPI_GRO_CB(skb)->same_flow = 0;
3807 NAPI_GRO_CB(skb)->flush = 0;
3808 NAPI_GRO_CB(skb)->free = 0;
3810 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
3811 break;
3813 rcu_read_unlock();
3815 if (&ptype->list == head)
3816 goto normal;
3818 same_flow = NAPI_GRO_CB(skb)->same_flow;
3819 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3821 if (pp) {
3822 struct sk_buff *nskb = *pp;
3824 *pp = nskb->next;
3825 nskb->next = NULL;
3826 napi_gro_complete(nskb);
3827 napi->gro_count--;
3830 if (same_flow)
3831 goto ok;
3833 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3834 goto normal;
3836 napi->gro_count++;
3837 NAPI_GRO_CB(skb)->count = 1;
3838 NAPI_GRO_CB(skb)->age = jiffies;
3839 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3840 skb->next = napi->gro_list;
3841 napi->gro_list = skb;
3842 ret = GRO_HELD;
3844 pull:
3845 if (skb_headlen(skb) < skb_gro_offset(skb)) {
3846 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3848 BUG_ON(skb->end - skb->tail < grow);
3850 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3852 skb->tail += grow;
3853 skb->data_len -= grow;
3855 skb_shinfo(skb)->frags[0].page_offset += grow;
3856 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3858 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3859 skb_frag_unref(skb, 0);
3860 memmove(skb_shinfo(skb)->frags,
3861 skb_shinfo(skb)->frags + 1,
3862 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3867 return ret;
3869 normal:
3870 ret = GRO_NORMAL;
3871 goto pull;
3875 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3877 switch (ret) {
3878 case GRO_NORMAL:
3879 if (netif_receive_skb(skb))
3880 ret = GRO_DROP;
3881 break;
3883 case GRO_DROP:
3884 kfree_skb(skb);
3885 break;
3887 case GRO_MERGED_FREE:
3888 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3889 kmem_cache_free(skbuff_head_cache, skb);
3890 else
3891 __kfree_skb(skb);
3892 break;
3894 case GRO_HELD:
3895 case GRO_MERGED:
3896 break;
3899 return ret;
3902 static void skb_gro_reset_offset(struct sk_buff *skb)
3904 const struct skb_shared_info *pinfo = skb_shinfo(skb);
3905 const skb_frag_t *frag0 = &pinfo->frags[0];
3907 NAPI_GRO_CB(skb)->data_offset = 0;
3908 NAPI_GRO_CB(skb)->frag0 = NULL;
3909 NAPI_GRO_CB(skb)->frag0_len = 0;
3911 if (skb->mac_header == skb->tail &&
3912 pinfo->nr_frags &&
3913 !PageHighMem(skb_frag_page(frag0))) {
3914 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3915 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3919 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3921 skb_gro_reset_offset(skb);
3923 return napi_skb_finish(dev_gro_receive(napi, skb), skb);
3925 EXPORT_SYMBOL(napi_gro_receive);
3927 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3929 __skb_pull(skb, skb_headlen(skb));
3930 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3931 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3932 skb->vlan_tci = 0;
3933 skb->dev = napi->dev;
3934 skb->skb_iif = 0;
3936 napi->skb = skb;
3939 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3941 struct sk_buff *skb = napi->skb;
3943 if (!skb) {
3944 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3945 if (skb)
3946 napi->skb = skb;
3948 return skb;
3950 EXPORT_SYMBOL(napi_get_frags);
3952 static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3953 gro_result_t ret)
3955 switch (ret) {
3956 case GRO_NORMAL:
3957 case GRO_HELD:
3958 skb->protocol = eth_type_trans(skb, skb->dev);
3960 if (ret == GRO_HELD)
3961 skb_gro_pull(skb, -ETH_HLEN);
3962 else if (netif_receive_skb(skb))
3963 ret = GRO_DROP;
3964 break;
3966 case GRO_DROP:
3967 case GRO_MERGED_FREE:
3968 napi_reuse_skb(napi, skb);
3969 break;
3971 case GRO_MERGED:
3972 break;
3975 return ret;
3978 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3980 struct sk_buff *skb = napi->skb;
3981 struct ethhdr *eth;
3982 unsigned int hlen;
3983 unsigned int off;
3985 napi->skb = NULL;
3987 skb_reset_mac_header(skb);
3988 skb_gro_reset_offset(skb);
3990 off = skb_gro_offset(skb);
3991 hlen = off + sizeof(*eth);
3992 eth = skb_gro_header_fast(skb, off);
3993 if (skb_gro_header_hard(skb, hlen)) {
3994 eth = skb_gro_header_slow(skb, hlen, off);
3995 if (unlikely(!eth)) {
3996 napi_reuse_skb(napi, skb);
3997 skb = NULL;
3998 goto out;
4002 skb_gro_pull(skb, sizeof(*eth));
4005 * This works because the only protocols we care about don't require
4006 * special handling. We'll fix it up properly at the end.
4008 skb->protocol = eth->h_proto;
4010 out:
4011 return skb;
4014 gro_result_t napi_gro_frags(struct napi_struct *napi)
4016 struct sk_buff *skb = napi_frags_skb(napi);
4018 if (!skb)
4019 return GRO_DROP;
4021 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4023 EXPORT_SYMBOL(napi_gro_frags);
4026 * net_rps_action sends any pending IPIs for RPS.
4027 * Note: called with local irq disabled, but exits with local irq enabled.
4029 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4031 #ifdef CONFIG_RPS
4032 struct softnet_data *remsd = sd->rps_ipi_list;
4034 if (remsd) {
4035 sd->rps_ipi_list = NULL;
4037 local_irq_enable();
4039 /* Send pending IPI's to kick RPS processing on remote cpus. */
4040 while (remsd) {
4041 struct softnet_data *next = remsd->rps_ipi_next;
4043 if (cpu_online(remsd->cpu))
4044 __smp_call_function_single(remsd->cpu,
4045 &remsd->csd, 0);
4046 remsd = next;
4048 } else
4049 #endif
4050 local_irq_enable();
4053 static int process_backlog(struct napi_struct *napi, int quota)
4055 int work = 0;
4056 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4058 #ifdef CONFIG_RPS
4059 /* Check if we have pending IPIs; it's better to send them now
4060 * rather than waiting for net_rx_action() to end.
4062 if (sd->rps_ipi_list) {
4063 local_irq_disable();
4064 net_rps_action_and_irq_enable(sd);
4066 #endif
4067 napi->weight = weight_p;
4068 local_irq_disable();
4069 while (work < quota) {
4070 struct sk_buff *skb;
4071 unsigned int qlen;
4073 while ((skb = __skb_dequeue(&sd->process_queue))) {
4074 local_irq_enable();
4075 __netif_receive_skb(skb);
4076 local_irq_disable();
4077 input_queue_head_incr(sd);
4078 if (++work >= quota) {
4079 local_irq_enable();
4080 return work;
4084 rps_lock(sd);
4085 qlen = skb_queue_len(&sd->input_pkt_queue);
4086 if (qlen)
4087 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4088 &sd->process_queue);
4090 if (qlen < quota - work) {
4092 * Inline a custom version of __napi_complete().
4093 * Only the current cpu owns and manipulates this napi,
4094 * and NAPI_STATE_SCHED is the only possible flag set on backlog,
4095 * so we can use a plain write instead of clear_bit()
4096 * and we don't need an smp_mb() memory barrier.
4098 list_del(&napi->poll_list);
4099 napi->state = 0;
4101 quota = work + qlen;
4103 rps_unlock(sd);
4105 local_irq_enable();
4107 return work;
4111 * __napi_schedule - schedule for receive
4112 * @n: entry to schedule
4114 * The entry's receive function will be scheduled to run
4116 void __napi_schedule(struct napi_struct *n)
4118 unsigned long flags;
4120 local_irq_save(flags);
4121 ____napi_schedule(&__get_cpu_var(softnet_data), n);
4122 local_irq_restore(flags);
4124 EXPORT_SYMBOL(__napi_schedule);
4126 void __napi_complete(struct napi_struct *n)
4128 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4129 BUG_ON(n->gro_list);
4131 list_del(&n->poll_list);
4132 smp_mb__before_clear_bit();
4133 clear_bit(NAPI_STATE_SCHED, &n->state);
4135 EXPORT_SYMBOL(__napi_complete);
4137 void napi_complete(struct napi_struct *n)
4139 unsigned long flags;
4142 * don't let napi dequeue from the cpu poll list
4143 * just in case it's running on a different cpu
4145 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4146 return;
4148 napi_gro_flush(n, false);
4149 local_irq_save(flags);
4150 __napi_complete(n);
4151 local_irq_restore(flags);
4153 EXPORT_SYMBOL(napi_complete);
4155 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4156 int (*poll)(struct napi_struct *, int), int weight)
4158 INIT_LIST_HEAD(&napi->poll_list);
4159 napi->gro_count = 0;
4160 napi->gro_list = NULL;
4161 napi->skb = NULL;
4162 napi->poll = poll;
4163 napi->weight = weight;
4164 list_add(&napi->dev_list, &dev->napi_list);
4165 napi->dev = dev;
4166 #ifdef CONFIG_NETPOLL
4167 spin_lock_init(&napi->poll_lock);
4168 napi->poll_owner = -1;
4169 #endif
4170 set_bit(NAPI_STATE_SCHED, &napi->state);
4172 EXPORT_SYMBOL(netif_napi_add);
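/*
 * Illustrative sketch (not part of this file): a driver registers its poll
 * routine at probe time and completes NAPI when the ring drains; the "my_*"
 * helpers, "priv" and the 64-packet weight are hypothetical.
 *
 *	static int my_poll(struct napi_struct *napi, int budget)
 *	{
 *		int done = my_clean_rx_ring(napi, budget);
 *
 *		if (done < budget) {
 *			napi_complete(napi);
 *			my_enable_rx_irq(napi);
 *		}
 *		return done;
 *	}
 *
 *	netif_napi_add(dev, &priv->napi, my_poll, 64);
 */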
4174 void netif_napi_del(struct napi_struct *napi)
4176 struct sk_buff *skb, *next;
4178 list_del_init(&napi->dev_list);
4179 napi_free_frags(napi);
4181 for (skb = napi->gro_list; skb; skb = next) {
4182 next = skb->next;
4183 skb->next = NULL;
4184 kfree_skb(skb);
4187 napi->gro_list = NULL;
4188 napi->gro_count = 0;
4190 EXPORT_SYMBOL(netif_napi_del);
4192 static void net_rx_action(struct softirq_action *h)
4194 struct softnet_data *sd = &__get_cpu_var(softnet_data);
4195 unsigned long time_limit = jiffies + 2;
4196 int budget = netdev_budget;
4197 void *have;
4199 local_irq_disable();
4201 while (!list_empty(&sd->poll_list)) {
4202 struct napi_struct *n;
4203 int work, weight;
4205 /* If the softirq window is exhausted then punt.
4206 * Allow this to run for 2 jiffies, which will allow
4207 * an average latency of 1.5/HZ.
4209 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
4210 goto softnet_break;
4212 local_irq_enable();
4214 /* Even though interrupts have been re-enabled, this
4215 * access is safe because interrupts can only add new
4216 * entries to the tail of this list, and only ->poll()
4217 * calls can remove this head entry from the list.
4219 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
4221 have = netpoll_poll_lock(n);
4223 weight = n->weight;
4225 /* This NAPI_STATE_SCHED test is for avoiding a race
4226 * with netpoll's poll_napi(). Only the entity which
4227 * obtains the lock and sees NAPI_STATE_SCHED set will
4228 * actually make the ->poll() call. Therefore we avoid
4229 * accidentally calling ->poll() when NAPI is not scheduled.
4231 work = 0;
4232 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4233 work = n->poll(n, weight);
4234 trace_napi_poll(n);
4237 WARN_ON_ONCE(work > weight);
4239 budget -= work;
4241 local_irq_disable();
4243 /* Drivers must not modify the NAPI state if they
4244 * consume the entire weight. In such cases this code
4245 * still "owns" the NAPI instance and therefore can
4246 * move the instance around on the list at-will.
4248 if (unlikely(work == weight)) {
4249 if (unlikely(napi_disable_pending(n))) {
4250 local_irq_enable();
4251 napi_complete(n);
4252 local_irq_disable();
4253 } else {
4254 if (n->gro_list) {
4255 /* flush too old packets
4256 * If HZ < 1000, flush all packets.
4258 local_irq_enable();
4259 napi_gro_flush(n, HZ >= 1000);
4260 local_irq_disable();
4262 list_move_tail(&n->poll_list, &sd->poll_list);
4266 netpoll_poll_unlock(have);
4268 out:
4269 net_rps_action_and_irq_enable(sd);
4271 #ifdef CONFIG_NET_DMA
4273 * There may not be any more sk_buffs coming right now, so push
4274 * any pending DMA copies to hardware
4276 dma_issue_pending_all();
4277 #endif
4279 return;
4281 softnet_break:
4282 sd->time_squeeze++;
4283 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4284 goto out;
4287 static gifconf_func_t *gifconf_list[NPROTO];
4290 * register_gifconf - register a SIOCGIF handler
4291 * @family: Address family
4292 * @gifconf: Function handler
4294 * Register protocol dependent address dumping routines. The handler
4295 * that is passed must not be freed or reused until it has been replaced
4296 * by another handler.
4298 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
4300 if (family >= NPROTO)
4301 return -EINVAL;
4302 gifconf_list[family] = gifconf;
4303 return 0;
4305 EXPORT_SYMBOL(register_gifconf);
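/*
 * Example (illustrative sketch, not part of this file): how an address family
 * hooks into SIOCGIFCONF.  In the real tree the IPv4 code registers
 * inet_gifconf() for PF_INET; the my_example_* handler below is hypothetical
 * and would displace it if actually registered.
 */
static int my_example_gifconf(struct net_device *dev, char __user *buf, int len)
{
        /*
         * Write one struct ifreq per address this protocol owns on @dev,
         * or, when @buf is NULL, return how much space would be needed.
         */
        return 0;
}

static int __init my_example_gifconf_init(void)
{
        return register_gifconf(PF_INET, my_example_gifconf);
}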
4309 * Map an interface index to its name (SIOCGIFNAME)
4313 * We need this ioctl for efficient implementation of the
4314 * if_indextoname() function required by the IPv6 API. Without
4315 * it, we would have to search all the interfaces to find a
4316 * match. --pb
4319 static int dev_ifname(struct net *net, struct ifreq __user *arg)
4321 struct net_device *dev;
4322 struct ifreq ifr;
4323 unsigned seq;
4326 * Fetch the caller's info block.
4329 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4330 return -EFAULT;
4332 retry:
4333 seq = read_seqcount_begin(&devnet_rename_seq);
4334 rcu_read_lock();
4335 dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
4336 if (!dev) {
4337 rcu_read_unlock();
4338 return -ENODEV;
4341 strcpy(ifr.ifr_name, dev->name);
4342 rcu_read_unlock();
4343 if (read_seqcount_retry(&devnet_rename_seq, seq))
4344 goto retry;
4346 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
4347 return -EFAULT;
4348 return 0;
4352 * Perform a SIOCGIFCONF call. This structure will change
4353 * size eventually, and there is nothing I can do about it.
4354 * Thus we will need a 'compatibility mode'.
4357 static int dev_ifconf(struct net *net, char __user *arg)
4359 struct ifconf ifc;
4360 struct net_device *dev;
4361 char __user *pos;
4362 int len;
4363 int total;
4364 int i;
4367 * Fetch the caller's info block.
4370 if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
4371 return -EFAULT;
4373 pos = ifc.ifc_buf;
4374 len = ifc.ifc_len;
4377 * Loop over the interfaces, and write an info block for each.
4380 total = 0;
4381 for_each_netdev(net, dev) {
4382 for (i = 0; i < NPROTO; i++) {
4383 if (gifconf_list[i]) {
4384 int done;
4385 if (!pos)
4386 done = gifconf_list[i](dev, NULL, 0);
4387 else
4388 done = gifconf_list[i](dev, pos + total,
4389 len - total);
4390 if (done < 0)
4391 return -EFAULT;
4392 total += done;
4398 * All done. Write the updated control block back to the caller.
4400 ifc.ifc_len = total;
4403 * Both BSD and Solaris return 0 here, so we do too.
4405 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4408 #ifdef CONFIG_PROC_FS
4410 #define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)
4412 #define get_bucket(x) ((x) >> BUCKET_SPACE)
4413 #define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
4414 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
4416 static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
4418 struct net *net = seq_file_net(seq);
4419 struct net_device *dev;
4420 struct hlist_node *p;
4421 struct hlist_head *h;
4422 unsigned int count = 0, offset = get_offset(*pos);
4424 h = &net->dev_name_head[get_bucket(*pos)];
4425 hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
4426 if (++count == offset)
4427 return dev;
4430 return NULL;
4433 static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
4435 struct net_device *dev;
4436 unsigned int bucket;
4438 do {
4439 dev = dev_from_same_bucket(seq, pos);
4440 if (dev)
4441 return dev;
4443 bucket = get_bucket(*pos) + 1;
4444 *pos = set_bucket_offset(bucket, 1);
4445 } while (bucket < NETDEV_HASHENTRIES);
4447 return NULL;
4451 * This is invoked by the /proc filesystem handler to display a device
4452 * in detail.
4454 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4455 __acquires(RCU)
4457 rcu_read_lock();
4458 if (!*pos)
4459 return SEQ_START_TOKEN;
4461 if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
4462 return NULL;
4464 return dev_from_bucket(seq, pos);
4467 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4469 ++*pos;
4470 return dev_from_bucket(seq, pos);
4473 void dev_seq_stop(struct seq_file *seq, void *v)
4474 __releases(RCU)
4476 rcu_read_unlock();
4479 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4481 struct rtnl_link_stats64 temp;
4482 const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4484 seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4485 "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4486 dev->name, stats->rx_bytes, stats->rx_packets,
4487 stats->rx_errors,
4488 stats->rx_dropped + stats->rx_missed_errors,
4489 stats->rx_fifo_errors,
4490 stats->rx_length_errors + stats->rx_over_errors +
4491 stats->rx_crc_errors + stats->rx_frame_errors,
4492 stats->rx_compressed, stats->multicast,
4493 stats->tx_bytes, stats->tx_packets,
4494 stats->tx_errors, stats->tx_dropped,
4495 stats->tx_fifo_errors, stats->collisions,
4496 stats->tx_carrier_errors +
4497 stats->tx_aborted_errors +
4498 stats->tx_window_errors +
4499 stats->tx_heartbeat_errors,
4500 stats->tx_compressed);
4504 * Called from the PROCfs module. This now uses the new arbitrary sized
4505 * /proc/net interface to create /proc/net/dev
4507 static int dev_seq_show(struct seq_file *seq, void *v)
4509 if (v == SEQ_START_TOKEN)
4510 seq_puts(seq, "Inter-| Receive "
4511 " | Transmit\n"
4512 " face |bytes packets errs drop fifo frame "
4513 "compressed multicast|bytes packets errs "
4514 "drop fifo colls carrier compressed\n");
4515 else
4516 dev_seq_printf_stats(seq, v);
4517 return 0;
4520 static struct softnet_data *softnet_get_online(loff_t *pos)
4522 struct softnet_data *sd = NULL;
4524 while (*pos < nr_cpu_ids)
4525 if (cpu_online(*pos)) {
4526 sd = &per_cpu(softnet_data, *pos);
4527 break;
4528 } else
4529 ++*pos;
4530 return sd;
4533 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4535 return softnet_get_online(pos);
4538 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4540 ++*pos;
4541 return softnet_get_online(pos);
4544 static void softnet_seq_stop(struct seq_file *seq, void *v)
4548 static int softnet_seq_show(struct seq_file *seq, void *v)
4550 struct softnet_data *sd = v;
4552 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4553 sd->processed, sd->dropped, sd->time_squeeze, 0,
4554 0, 0, 0, 0, /* was fastroute */
4555 sd->cpu_collision, sd->received_rps);
4556 return 0;
4559 static const struct seq_operations dev_seq_ops = {
4560 .start = dev_seq_start,
4561 .next = dev_seq_next,
4562 .stop = dev_seq_stop,
4563 .show = dev_seq_show,
4566 static int dev_seq_open(struct inode *inode, struct file *file)
4568 return seq_open_net(inode, file, &dev_seq_ops,
4569 sizeof(struct seq_net_private));
4572 static const struct file_operations dev_seq_fops = {
4573 .owner = THIS_MODULE,
4574 .open = dev_seq_open,
4575 .read = seq_read,
4576 .llseek = seq_lseek,
4577 .release = seq_release_net,
4580 static const struct seq_operations softnet_seq_ops = {
4581 .start = softnet_seq_start,
4582 .next = softnet_seq_next,
4583 .stop = softnet_seq_stop,
4584 .show = softnet_seq_show,
4587 static int softnet_seq_open(struct inode *inode, struct file *file)
4589 return seq_open(file, &softnet_seq_ops);
4592 static const struct file_operations softnet_seq_fops = {
4593 .owner = THIS_MODULE,
4594 .open = softnet_seq_open,
4595 .read = seq_read,
4596 .llseek = seq_lseek,
4597 .release = seq_release,
4600 static void *ptype_get_idx(loff_t pos)
4602 struct packet_type *pt = NULL;
4603 loff_t i = 0;
4604 int t;
4606 list_for_each_entry_rcu(pt, &ptype_all, list) {
4607 if (i == pos)
4608 return pt;
4609 ++i;
4612 for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4613 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4614 if (i == pos)
4615 return pt;
4616 ++i;
4619 return NULL;
4622 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4623 __acquires(RCU)
4625 rcu_read_lock();
4626 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4629 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4631 struct packet_type *pt;
4632 struct list_head *nxt;
4633 int hash;
4635 ++*pos;
4636 if (v == SEQ_START_TOKEN)
4637 return ptype_get_idx(0);
4639 pt = v;
4640 nxt = pt->list.next;
4641 if (pt->type == htons(ETH_P_ALL)) {
4642 if (nxt != &ptype_all)
4643 goto found;
4644 hash = 0;
4645 nxt = ptype_base[0].next;
4646 } else
4647 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4649 while (nxt == &ptype_base[hash]) {
4650 if (++hash >= PTYPE_HASH_SIZE)
4651 return NULL;
4652 nxt = ptype_base[hash].next;
4654 found:
4655 return list_entry(nxt, struct packet_type, list);
4658 static void ptype_seq_stop(struct seq_file *seq, void *v)
4659 __releases(RCU)
4661 rcu_read_unlock();
4664 static int ptype_seq_show(struct seq_file *seq, void *v)
4666 struct packet_type *pt = v;
4668 if (v == SEQ_START_TOKEN)
4669 seq_puts(seq, "Type Device Function\n");
4670 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4671 if (pt->type == htons(ETH_P_ALL))
4672 seq_puts(seq, "ALL ");
4673 else
4674 seq_printf(seq, "%04x", ntohs(pt->type));
4676 seq_printf(seq, " %-8s %pF\n",
4677 pt->dev ? pt->dev->name : "", pt->func);
4680 return 0;
4683 static const struct seq_operations ptype_seq_ops = {
4684 .start = ptype_seq_start,
4685 .next = ptype_seq_next,
4686 .stop = ptype_seq_stop,
4687 .show = ptype_seq_show,
4690 static int ptype_seq_open(struct inode *inode, struct file *file)
4692 return seq_open_net(inode, file, &ptype_seq_ops,
4693 sizeof(struct seq_net_private));
4696 static const struct file_operations ptype_seq_fops = {
4697 .owner = THIS_MODULE,
4698 .open = ptype_seq_open,
4699 .read = seq_read,
4700 .llseek = seq_lseek,
4701 .release = seq_release_net,
4705 static int __net_init dev_proc_net_init(struct net *net)
4707 int rc = -ENOMEM;
4709 if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4710 goto out;
4711 if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4712 goto out_dev;
4713 if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4714 goto out_softnet;
4716 if (wext_proc_init(net))
4717 goto out_ptype;
4718 rc = 0;
4719 out:
4720 return rc;
4721 out_ptype:
4722 proc_net_remove(net, "ptype");
4723 out_softnet:
4724 proc_net_remove(net, "softnet_stat");
4725 out_dev:
4726 proc_net_remove(net, "dev");
4727 goto out;
4730 static void __net_exit dev_proc_net_exit(struct net *net)
4732 wext_proc_exit(net);
4734 proc_net_remove(net, "ptype");
4735 proc_net_remove(net, "softnet_stat");
4736 proc_net_remove(net, "dev");
4739 static struct pernet_operations __net_initdata dev_proc_ops = {
4740 .init = dev_proc_net_init,
4741 .exit = dev_proc_net_exit,
4744 static int __init dev_proc_init(void)
4746 return register_pernet_subsys(&dev_proc_ops);
4748 #else
4749 #define dev_proc_init() 0
4750 #endif /* CONFIG_PROC_FS */
4753 struct netdev_upper {
4754 struct net_device *dev;
4755 bool master;
4756 struct list_head list;
4757 struct rcu_head rcu;
4758 struct list_head search_list;
4761 static void __append_search_uppers(struct list_head *search_list,
4762 struct net_device *dev)
4764 struct netdev_upper *upper;
4766 list_for_each_entry(upper, &dev->upper_dev_list, list) {
4767 /* check if this upper is not already in search list */
4768 if (list_empty(&upper->search_list))
4769 list_add_tail(&upper->search_list, search_list);
4773 static bool __netdev_search_upper_dev(struct net_device *dev,
4774 struct net_device *upper_dev)
4776 LIST_HEAD(search_list);
4777 struct netdev_upper *upper;
4778 struct netdev_upper *tmp;
4779 bool ret = false;
4781 __append_search_uppers(&search_list, dev);
4782 list_for_each_entry(upper, &search_list, search_list) {
4783 if (upper->dev == upper_dev) {
4784 ret = true;
4785 break;
4787 __append_search_uppers(&search_list, upper->dev);
4789 list_for_each_entry_safe(upper, tmp, &search_list, search_list)
4790 INIT_LIST_HEAD(&upper->search_list);
4791 return ret;
4794 static struct netdev_upper *__netdev_find_upper(struct net_device *dev,
4795 struct net_device *upper_dev)
4797 struct netdev_upper *upper;
4799 list_for_each_entry(upper, &dev->upper_dev_list, list) {
4800 if (upper->dev == upper_dev)
4801 return upper;
4803 return NULL;
4807 * netdev_has_upper_dev - Check if device is linked to an upper device
4808 * @dev: device
4809 * @upper_dev: upper device to check
4811 * Find out if a device is linked to specified upper device and return true
4812 * in case it is. Note that this checks only the immediate upper device,
4813 * not through a complete stack of devices. The caller must hold the RTNL lock.
4815 bool netdev_has_upper_dev(struct net_device *dev,
4816 struct net_device *upper_dev)
4818 ASSERT_RTNL();
4820 return __netdev_find_upper(dev, upper_dev);
4822 EXPORT_SYMBOL(netdev_has_upper_dev);
4825 * netdev_has_any_upper_dev - Check if device is linked to some device
4826 * @dev: device
4828 * Find out if a device is linked to an upper device and return true in case
4829 * it is. The caller must hold the RTNL lock.
4831 bool netdev_has_any_upper_dev(struct net_device *dev)
4833 ASSERT_RTNL();
4835 return !list_empty(&dev->upper_dev_list);
4837 EXPORT_SYMBOL(netdev_has_any_upper_dev);
4840 * netdev_master_upper_dev_get - Get master upper device
4841 * @dev: device
4843 * Find a master upper device and return pointer to it or NULL in case
4844 * it's not there. The caller must hold the RTNL lock.
4846 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4848 struct netdev_upper *upper;
4850 ASSERT_RTNL();
4852 if (list_empty(&dev->upper_dev_list))
4853 return NULL;
4855 upper = list_first_entry(&dev->upper_dev_list,
4856 struct netdev_upper, list);
4857 if (likely(upper->master))
4858 return upper->dev;
4859 return NULL;
4861 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4864 * netdev_master_upper_dev_get_rcu - Get master upper device
4865 * @dev: device
4867 * Find a master upper device and return pointer to it or NULL in case
4868 * it's not there. The caller must hold the RCU read lock.
4870 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4872 struct netdev_upper *upper;
4874 upper = list_first_or_null_rcu(&dev->upper_dev_list,
4875 struct netdev_upper, list);
4876 if (upper && likely(upper->master))
4877 return upper->dev;
4878 return NULL;
4880 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4882 static int __netdev_upper_dev_link(struct net_device *dev,
4883 struct net_device *upper_dev, bool master)
4885 struct netdev_upper *upper;
4887 ASSERT_RTNL();
4889 if (dev == upper_dev)
4890 return -EBUSY;
4892 /* To prevent loops, check if dev is not upper device to upper_dev. */
4893 if (__netdev_search_upper_dev(upper_dev, dev))
4894 return -EBUSY;
4896 if (__netdev_find_upper(dev, upper_dev))
4897 return -EEXIST;
4899 if (master && netdev_master_upper_dev_get(dev))
4900 return -EBUSY;
4902 upper = kmalloc(sizeof(*upper), GFP_KERNEL);
4903 if (!upper)
4904 return -ENOMEM;
4906 upper->dev = upper_dev;
4907 upper->master = master;
4908 INIT_LIST_HEAD(&upper->search_list);
4910 /* Ensure that master upper link is always the first item in list. */
4911 if (master)
4912 list_add_rcu(&upper->list, &dev->upper_dev_list);
4913 else
4914 list_add_tail_rcu(&upper->list, &dev->upper_dev_list);
4915 dev_hold(upper_dev);
4917 return 0;
4921 * netdev_upper_dev_link - Add a link to the upper device
4922 * @dev: device
4923 * @upper_dev: new upper device
4925 * Adds a link to a device which is upper to this one. The caller must hold
4926 * the RTNL lock. On a failure a negative errno code is returned.
4927 * On success the reference counts are adjusted and the function
4928 * returns zero.
4930 int netdev_upper_dev_link(struct net_device *dev,
4931 struct net_device *upper_dev)
4933 return __netdev_upper_dev_link(dev, upper_dev, false);
4935 EXPORT_SYMBOL(netdev_upper_dev_link);
4938 * netdev_master_upper_dev_link - Add a master link to the upper device
4939 * @dev: device
4940 * @upper_dev: new upper device
4942 * Adds a link to a device which is upper to this one. In this case, only
4943 * one master upper device can be linked, although other non-master devices
4944 * might be linked as well. The caller must hold the RTNL lock.
4945 * On a failure a negative errno code is returned. On success the reference
4946 * counts are adjusted and the function returns zero.
4948 int netdev_master_upper_dev_link(struct net_device *dev,
4949 struct net_device *upper_dev)
4951 return __netdev_upper_dev_link(dev, upper_dev, true);
4953 EXPORT_SYMBOL(netdev_master_upper_dev_link);
4956 * netdev_upper_dev_unlink - Removes a link to upper device
4957 * @dev: device
4958 * @upper_dev: upper device to remove the link to
4960 * Removes a link to a device which is upper to this one. The caller must hold
4961 * the RTNL lock.
4963 void netdev_upper_dev_unlink(struct net_device *dev,
4964 struct net_device *upper_dev)
4966 struct netdev_upper *upper;
4968 ASSERT_RTNL();
4970 upper = __netdev_find_upper(dev, upper_dev);
4971 if (!upper)
4972 return;
4973 list_del_rcu(&upper->list);
4974 dev_put(upper_dev);
4975 kfree_rcu(upper, rcu);
4977 EXPORT_SYMBOL(netdev_upper_dev_unlink);
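/*
 * Example (illustrative sketch, not part of this file): how a bonding/team
 * style master might attach and detach a lower device.  The my_example_*
 * names are placeholders and error handling is trimmed to the essentials.
 */
static int my_example_enslave(struct net_device *master,
                              struct net_device *slave)
{
        int err;

        ASSERT_RTNL();

        err = netdev_master_upper_dev_link(slave, master);
        if (err)
                return err;

        /* ... program the hardware, clone MTU/MAC, start monitoring ... */
        return 0;
}

static void my_example_release(struct net_device *master,
                               struct net_device *slave)
{
        ASSERT_RTNL();

        netdev_upper_dev_unlink(slave, master);
        /* ... undo the hardware setup done at enslave time ... */
}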
4979 static void dev_change_rx_flags(struct net_device *dev, int flags)
4981 const struct net_device_ops *ops = dev->netdev_ops;
4983 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4984 ops->ndo_change_rx_flags(dev, flags);
4987 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4989 unsigned int old_flags = dev->flags;
4990 kuid_t uid;
4991 kgid_t gid;
4993 ASSERT_RTNL();
4995 dev->flags |= IFF_PROMISC;
4996 dev->promiscuity += inc;
4997 if (dev->promiscuity == 0) {
4999 * Avoid overflow.
5000 * If inc would cause an overflow, leave promiscuity untouched and return an error.
5002 if (inc < 0)
5003 dev->flags &= ~IFF_PROMISC;
5004 else {
5005 dev->promiscuity -= inc;
5006 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5007 dev->name);
5008 return -EOVERFLOW;
5011 if (dev->flags != old_flags) {
5012 pr_info("device %s %s promiscuous mode\n",
5013 dev->name,
5014 dev->flags & IFF_PROMISC ? "entered" : "left");
5015 if (audit_enabled) {
5016 current_uid_gid(&uid, &gid);
5017 audit_log(current->audit_context, GFP_ATOMIC,
5018 AUDIT_ANOM_PROMISCUOUS,
5019 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5020 dev->name, (dev->flags & IFF_PROMISC),
5021 (old_flags & IFF_PROMISC),
5022 from_kuid(&init_user_ns, audit_get_loginuid(current)),
5023 from_kuid(&init_user_ns, uid),
5024 from_kgid(&init_user_ns, gid),
5025 audit_get_sessionid(current));
5028 dev_change_rx_flags(dev, IFF_PROMISC);
5030 return 0;
5034 * dev_set_promiscuity - update promiscuity count on a device
5035 * @dev: device
5036 * @inc: modifier
5038 * Add or remove promiscuity from a device. While the count in the device
5039 * remains above zero the interface remains promiscuous. Once it hits zero
5040 * the device reverts to normal filtering operation. A negative inc
5041 * value is used to drop promiscuity on the device.
5042 * Return 0 if successful or a negative errno code on error.
5044 int dev_set_promiscuity(struct net_device *dev, int inc)
5046 unsigned int old_flags = dev->flags;
5047 int err;
5049 err = __dev_set_promiscuity(dev, inc);
5050 if (err < 0)
5051 return err;
5052 if (dev->flags != old_flags)
5053 dev_set_rx_mode(dev);
5054 return err;
5056 EXPORT_SYMBOL(dev_set_promiscuity);
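/*
 * Example (illustrative sketch, not part of this file): a packet-capture
 * style user of the promiscuity counter.  The my_example_* wrappers are
 * hypothetical; the point is the +1/-1 pairing done under the RTNL lock.
 */
static int my_example_capture_start(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_set_promiscuity(dev, 1);      /* take one promiscuity reference */
        rtnl_unlock();
        return err;
}

static void my_example_capture_stop(struct net_device *dev)
{
        rtnl_lock();
        dev_set_promiscuity(dev, -1);           /* drop the reference taken above */
        rtnl_unlock();
}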
5059 * dev_set_allmulti - update allmulti count on a device
5060 * @dev: device
5061 * @inc: modifier
5063 * Add or remove reception of all multicast frames on a device. While the
5064 * count in the device remains above zero the interface keeps listening
5065 * to all multicast frames. Once it hits zero the device reverts to normal
5066 * filtering operation. A negative @inc value is used to drop the counter
5067 * when releasing a resource that needed all multicasts.
5068 * Return 0 if successful or a negative errno code on error.
5071 int dev_set_allmulti(struct net_device *dev, int inc)
5073 unsigned int old_flags = dev->flags;
5075 ASSERT_RTNL();
5077 dev->flags |= IFF_ALLMULTI;
5078 dev->allmulti += inc;
5079 if (dev->allmulti == 0) {
5081 * Avoid overflow.
5082 * If inc would cause an overflow, leave allmulti untouched and return an error.
5084 if (inc < 0)
5085 dev->flags &= ~IFF_ALLMULTI;
5086 else {
5087 dev->allmulti -= inc;
5088 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5089 dev->name);
5090 return -EOVERFLOW;
5093 if (dev->flags ^ old_flags) {
5094 dev_change_rx_flags(dev, IFF_ALLMULTI);
5095 dev_set_rx_mode(dev);
5097 return 0;
5099 EXPORT_SYMBOL(dev_set_allmulti);
5102 * Upload unicast and multicast address lists to device and
5103 * configure RX filtering. When the device doesn't support unicast
5104 * filtering it is put in promiscuous mode while unicast addresses
5105 * are present.
5107 void __dev_set_rx_mode(struct net_device *dev)
5109 const struct net_device_ops *ops = dev->netdev_ops;
5111 /* dev_open will call this function so the list will stay sane. */
5112 if (!(dev->flags&IFF_UP))
5113 return;
5115 if (!netif_device_present(dev))
5116 return;
5118 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5119 /* Unicast addresses changes may only happen under the rtnl,
5120 * therefore calling __dev_set_promiscuity here is safe.
5122 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5123 __dev_set_promiscuity(dev, 1);
5124 dev->uc_promisc = true;
5125 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5126 __dev_set_promiscuity(dev, -1);
5127 dev->uc_promisc = false;
5131 if (ops->ndo_set_rx_mode)
5132 ops->ndo_set_rx_mode(dev);
5135 void dev_set_rx_mode(struct net_device *dev)
5137 netif_addr_lock_bh(dev);
5138 __dev_set_rx_mode(dev);
5139 netif_addr_unlock_bh(dev);
5143 * dev_get_flags - get flags reported to userspace
5144 * @dev: device
5146 * Get the combination of flag bits exported through APIs to userspace.
5148 unsigned int dev_get_flags(const struct net_device *dev)
5150 unsigned int flags;
5152 flags = (dev->flags & ~(IFF_PROMISC |
5153 IFF_ALLMULTI |
5154 IFF_RUNNING |
5155 IFF_LOWER_UP |
5156 IFF_DORMANT)) |
5157 (dev->gflags & (IFF_PROMISC |
5158 IFF_ALLMULTI));
5160 if (netif_running(dev)) {
5161 if (netif_oper_up(dev))
5162 flags |= IFF_RUNNING;
5163 if (netif_carrier_ok(dev))
5164 flags |= IFF_LOWER_UP;
5165 if (netif_dormant(dev))
5166 flags |= IFF_DORMANT;
5169 return flags;
5171 EXPORT_SYMBOL(dev_get_flags);
5173 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5175 unsigned int old_flags = dev->flags;
5176 int ret;
5178 ASSERT_RTNL();
5181 * Set the flags on our device.
5184 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5185 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5186 IFF_AUTOMEDIA)) |
5187 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5188 IFF_ALLMULTI));
5191 * Load in the correct multicast list now the flags have changed.
5194 if ((old_flags ^ flags) & IFF_MULTICAST)
5195 dev_change_rx_flags(dev, IFF_MULTICAST);
5197 dev_set_rx_mode(dev);
5200 * Have we downed the interface? We handle IFF_UP ourselves
5201 * according to user attempts to set it, rather than blindly
5202 * setting it.
5205 ret = 0;
5206 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
5207 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5209 if (!ret)
5210 dev_set_rx_mode(dev);
5213 if ((flags ^ dev->gflags) & IFF_PROMISC) {
5214 int inc = (flags & IFF_PROMISC) ? 1 : -1;
5216 dev->gflags ^= IFF_PROMISC;
5217 dev_set_promiscuity(dev, inc);
5220 /* NOTE: the order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5221 is important. Some (broken) drivers set IFF_PROMISC when
5222 IFF_ALLMULTI is requested, without asking us and without reporting it.
5224 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5225 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5227 dev->gflags ^= IFF_ALLMULTI;
5228 dev_set_allmulti(dev, inc);
5231 return ret;
5234 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
5236 unsigned int changes = dev->flags ^ old_flags;
5238 if (changes & IFF_UP) {
5239 if (dev->flags & IFF_UP)
5240 call_netdevice_notifiers(NETDEV_UP, dev);
5241 else
5242 call_netdevice_notifiers(NETDEV_DOWN, dev);
5245 if (dev->flags & IFF_UP &&
5246 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
5247 call_netdevice_notifiers(NETDEV_CHANGE, dev);
5251 * dev_change_flags - change device settings
5252 * @dev: device
5253 * @flags: device state flags
5255 * Change settings on a device based on the state flags. The flags are
5256 * in the userspace exported format.
5258 int dev_change_flags(struct net_device *dev, unsigned int flags)
5260 int ret;
5261 unsigned int changes, old_flags = dev->flags;
5263 ret = __dev_change_flags(dev, flags);
5264 if (ret < 0)
5265 return ret;
5267 changes = old_flags ^ dev->flags;
5268 if (changes)
5269 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
5271 __dev_notify_flags(dev, old_flags);
5272 return ret;
5274 EXPORT_SYMBOL(dev_change_flags);
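/*
 * Example (illustrative sketch, not part of this file): bringing an interface
 * administratively up from kernel code, the same way SIOCSIFFLAGS does it.
 * The my_example_* wrapper is hypothetical.
 */
static int my_example_bring_up(struct net_device *dev)
{
        unsigned int flags;
        int err;

        rtnl_lock();
        flags = dev_get_flags(dev);             /* userspace-visible flag set */
        err = dev_change_flags(dev, flags | IFF_UP);
        rtnl_unlock();
        return err;
}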
5277 * dev_set_mtu - Change maximum transfer unit
5278 * @dev: device
5279 * @new_mtu: new transfer unit
5281 * Change the maximum transfer size of the network device.
5283 int dev_set_mtu(struct net_device *dev, int new_mtu)
5285 const struct net_device_ops *ops = dev->netdev_ops;
5286 int err;
5288 if (new_mtu == dev->mtu)
5289 return 0;
5291 /* MTU must be positive. */
5292 if (new_mtu < 0)
5293 return -EINVAL;
5295 if (!netif_device_present(dev))
5296 return -ENODEV;
5298 err = 0;
5299 if (ops->ndo_change_mtu)
5300 err = ops->ndo_change_mtu(dev, new_mtu);
5301 else
5302 dev->mtu = new_mtu;
5304 if (!err)
5305 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5306 return err;
5308 EXPORT_SYMBOL(dev_set_mtu);
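/*
 * Example (illustrative sketch, not part of this file): switching a device to
 * jumbo frames.  The 9000-byte value is an assumption; whether it is accepted
 * is up to the driver's ndo_change_mtu().  Callers hold the RTNL lock.
 */
static int my_example_enable_jumbo(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_set_mtu(dev, 9000);
        rtnl_unlock();
        return err;
}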
5311 * dev_set_group - Change group this device belongs to
5312 * @dev: device
5313 * @new_group: group this device should belong to
5315 void dev_set_group(struct net_device *dev, int new_group)
5317 dev->group = new_group;
5319 EXPORT_SYMBOL(dev_set_group);
5322 * dev_set_mac_address - Change Media Access Control Address
5323 * @dev: device
5324 * @sa: new address
5326 * Change the hardware (MAC) address of the device
5328 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5330 const struct net_device_ops *ops = dev->netdev_ops;
5331 int err;
5333 if (!ops->ndo_set_mac_address)
5334 return -EOPNOTSUPP;
5335 if (sa->sa_family != dev->type)
5336 return -EINVAL;
5337 if (!netif_device_present(dev))
5338 return -ENODEV;
5339 err = ops->ndo_set_mac_address(dev, sa);
5340 if (err)
5341 return err;
5342 dev->addr_assign_type = NET_ADDR_SET;
5343 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5344 add_device_randomness(dev->dev_addr, dev->addr_len);
5345 return 0;
5347 EXPORT_SYMBOL(dev_set_mac_address);
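/*
 * Example (illustrative sketch, not part of this file): programming a new
 * hardware address from kernel code.  The my_example_* wrapper is
 * hypothetical; note that sa_family must match dev->type (e.g. ARPHRD_ETHER).
 */
static int my_example_set_mac(struct net_device *dev, const u8 *addr)
{
        struct sockaddr sa;
        int err;

        sa.sa_family = dev->type;
        memcpy(sa.sa_data, addr, dev->addr_len);

        rtnl_lock();
        err = dev_set_mac_address(dev, &sa);
        rtnl_unlock();
        return err;
}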
5350 * dev_change_carrier - Change device carrier
5351 * @dev: device
5352 * @new_carrier: new value
5354 * Change device carrier
5356 int dev_change_carrier(struct net_device *dev, bool new_carrier)
5358 const struct net_device_ops *ops = dev->netdev_ops;
5360 if (!ops->ndo_change_carrier)
5361 return -EOPNOTSUPP;
5362 if (!netif_device_present(dev))
5363 return -ENODEV;
5364 return ops->ndo_change_carrier(dev, new_carrier);
5366 EXPORT_SYMBOL(dev_change_carrier);
5369 * Perform the SIOCxIFxxx calls, inside rcu_read_lock()
5371 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
5373 int err;
5374 struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
5376 if (!dev)
5377 return -ENODEV;
5379 switch (cmd) {
5380 case SIOCGIFFLAGS: /* Get interface flags */
5381 ifr->ifr_flags = (short) dev_get_flags(dev);
5382 return 0;
5384 case SIOCGIFMETRIC: /* Get the metric on the interface
5385 (currently unused) */
5386 ifr->ifr_metric = 0;
5387 return 0;
5389 case SIOCGIFMTU: /* Get the MTU of a device */
5390 ifr->ifr_mtu = dev->mtu;
5391 return 0;
5393 case SIOCGIFHWADDR:
5394 if (!dev->addr_len)
5395 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
5396 else
5397 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
5398 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
5399 ifr->ifr_hwaddr.sa_family = dev->type;
5400 return 0;
5402 case SIOCGIFSLAVE:
5403 err = -EINVAL;
5404 break;
5406 case SIOCGIFMAP:
5407 ifr->ifr_map.mem_start = dev->mem_start;
5408 ifr->ifr_map.mem_end = dev->mem_end;
5409 ifr->ifr_map.base_addr = dev->base_addr;
5410 ifr->ifr_map.irq = dev->irq;
5411 ifr->ifr_map.dma = dev->dma;
5412 ifr->ifr_map.port = dev->if_port;
5413 return 0;
5415 case SIOCGIFINDEX:
5416 ifr->ifr_ifindex = dev->ifindex;
5417 return 0;
5419 case SIOCGIFTXQLEN:
5420 ifr->ifr_qlen = dev->tx_queue_len;
5421 return 0;
5423 default:
5424 /* dev_ioctl() should ensure this case
5425 * is never reached
5427 WARN_ON(1);
5428 err = -ENOTTY;
5429 break;
5432 return err;
5436 * Perform the SIOCxIFxxx calls, inside rtnl_lock()
5438 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
5440 int err;
5441 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
5442 const struct net_device_ops *ops;
5444 if (!dev)
5445 return -ENODEV;
5447 ops = dev->netdev_ops;
5449 switch (cmd) {
5450 case SIOCSIFFLAGS: /* Set interface flags */
5451 return dev_change_flags(dev, ifr->ifr_flags);
5453 case SIOCSIFMETRIC: /* Set the metric on the interface
5454 (currently unused) */
5455 return -EOPNOTSUPP;
5457 case SIOCSIFMTU: /* Set the MTU of a device */
5458 return dev_set_mtu(dev, ifr->ifr_mtu);
5460 case SIOCSIFHWADDR:
5461 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
5463 case SIOCSIFHWBROADCAST:
5464 if (ifr->ifr_hwaddr.sa_family != dev->type)
5465 return -EINVAL;
5466 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
5467 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
5468 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5469 return 0;
5471 case SIOCSIFMAP:
5472 if (ops->ndo_set_config) {
5473 if (!netif_device_present(dev))
5474 return -ENODEV;
5475 return ops->ndo_set_config(dev, &ifr->ifr_map);
5477 return -EOPNOTSUPP;
5479 case SIOCADDMULTI:
5480 if (!ops->ndo_set_rx_mode ||
5481 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5482 return -EINVAL;
5483 if (!netif_device_present(dev))
5484 return -ENODEV;
5485 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
5487 case SIOCDELMULTI:
5488 if (!ops->ndo_set_rx_mode ||
5489 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5490 return -EINVAL;
5491 if (!netif_device_present(dev))
5492 return -ENODEV;
5493 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
5495 case SIOCSIFTXQLEN:
5496 if (ifr->ifr_qlen < 0)
5497 return -EINVAL;
5498 dev->tx_queue_len = ifr->ifr_qlen;
5499 return 0;
5501 case SIOCSIFNAME:
5502 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
5503 return dev_change_name(dev, ifr->ifr_newname);
5505 case SIOCSHWTSTAMP:
5506 err = net_hwtstamp_validate(ifr);
5507 if (err)
5508 return err;
5509 /* fall through */
5512 * Unknown or private ioctl
5514 default:
5515 if ((cmd >= SIOCDEVPRIVATE &&
5516 cmd <= SIOCDEVPRIVATE + 15) ||
5517 cmd == SIOCBONDENSLAVE ||
5518 cmd == SIOCBONDRELEASE ||
5519 cmd == SIOCBONDSETHWADDR ||
5520 cmd == SIOCBONDSLAVEINFOQUERY ||
5521 cmd == SIOCBONDINFOQUERY ||
5522 cmd == SIOCBONDCHANGEACTIVE ||
5523 cmd == SIOCGMIIPHY ||
5524 cmd == SIOCGMIIREG ||
5525 cmd == SIOCSMIIREG ||
5526 cmd == SIOCBRADDIF ||
5527 cmd == SIOCBRDELIF ||
5528 cmd == SIOCSHWTSTAMP ||
5529 cmd == SIOCWANDEV) {
5530 err = -EOPNOTSUPP;
5531 if (ops->ndo_do_ioctl) {
5532 if (netif_device_present(dev))
5533 err = ops->ndo_do_ioctl(dev, ifr, cmd);
5534 else
5535 err = -ENODEV;
5537 } else
5538 err = -EINVAL;
5541 return err;
5545 * This function handles all "interface"-type I/O control requests. The actual
5546 * 'doing' part of this is dev_ifsioc above.
5550 * dev_ioctl - network device ioctl
5551 * @net: the applicable net namespace
5552 * @cmd: command to issue
5553 * @arg: pointer to a struct ifreq in user space
5555 * Issue ioctl functions to devices. This is normally called by the
5556 * user space syscall interfaces but can sometimes be useful for
5557 * other purposes. The return value is the return from the syscall if
5558 * positive or a negative errno code on error.
5561 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
5563 struct ifreq ifr;
5564 int ret;
5565 char *colon;
5567 /* One special case: SIOCGIFCONF takes an ifconf argument
5568 and requires a shared lock, because it sleeps writing
5569 to user space.
5572 if (cmd == SIOCGIFCONF) {
5573 rtnl_lock();
5574 ret = dev_ifconf(net, (char __user *) arg);
5575 rtnl_unlock();
5576 return ret;
5578 if (cmd == SIOCGIFNAME)
5579 return dev_ifname(net, (struct ifreq __user *)arg);
5581 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
5582 return -EFAULT;
5584 ifr.ifr_name[IFNAMSIZ-1] = 0;
5586 colon = strchr(ifr.ifr_name, ':');
5587 if (colon)
5588 *colon = 0;
5591 * See which interface the caller is talking about.
5594 switch (cmd) {
5596 * These ioctl calls:
5597 * - can be done by all.
5598 * - atomic and do not require locking.
5599 * - return a value
5601 case SIOCGIFFLAGS:
5602 case SIOCGIFMETRIC:
5603 case SIOCGIFMTU:
5604 case SIOCGIFHWADDR:
5605 case SIOCGIFSLAVE:
5606 case SIOCGIFMAP:
5607 case SIOCGIFINDEX:
5608 case SIOCGIFTXQLEN:
5609 dev_load(net, ifr.ifr_name);
5610 rcu_read_lock();
5611 ret = dev_ifsioc_locked(net, &ifr, cmd);
5612 rcu_read_unlock();
5613 if (!ret) {
5614 if (colon)
5615 *colon = ':';
5616 if (copy_to_user(arg, &ifr,
5617 sizeof(struct ifreq)))
5618 ret = -EFAULT;
5620 return ret;
5622 case SIOCETHTOOL:
5623 dev_load(net, ifr.ifr_name);
5624 rtnl_lock();
5625 ret = dev_ethtool(net, &ifr);
5626 rtnl_unlock();
5627 if (!ret) {
5628 if (colon)
5629 *colon = ':';
5630 if (copy_to_user(arg, &ifr,
5631 sizeof(struct ifreq)))
5632 ret = -EFAULT;
5634 return ret;
5637 * These ioctl calls:
5638 * - require superuser power.
5639 * - require strict serialization.
5640 * - return a value
5642 case SIOCGMIIPHY:
5643 case SIOCGMIIREG:
5644 case SIOCSIFNAME:
5645 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
5646 return -EPERM;
5647 dev_load(net, ifr.ifr_name);
5648 rtnl_lock();
5649 ret = dev_ifsioc(net, &ifr, cmd);
5650 rtnl_unlock();
5651 if (!ret) {
5652 if (colon)
5653 *colon = ':';
5654 if (copy_to_user(arg, &ifr,
5655 sizeof(struct ifreq)))
5656 ret = -EFAULT;
5658 return ret;
5661 * These ioctl calls:
5662 * - require superuser power.
5663 * - require strict serialization.
5664 * - do not return a value
5666 case SIOCSIFMAP:
5667 case SIOCSIFTXQLEN:
5668 if (!capable(CAP_NET_ADMIN))
5669 return -EPERM;
5670 /* fall through */
5672 * These ioctl calls:
5673 * - require local superuser power.
5674 * - require strict serialization.
5675 * - do not return a value
5677 case SIOCSIFFLAGS:
5678 case SIOCSIFMETRIC:
5679 case SIOCSIFMTU:
5680 case SIOCSIFHWADDR:
5681 case SIOCSIFSLAVE:
5682 case SIOCADDMULTI:
5683 case SIOCDELMULTI:
5684 case SIOCSIFHWBROADCAST:
5685 case SIOCSMIIREG:
5686 case SIOCBONDENSLAVE:
5687 case SIOCBONDRELEASE:
5688 case SIOCBONDSETHWADDR:
5689 case SIOCBONDCHANGEACTIVE:
5690 case SIOCBRADDIF:
5691 case SIOCBRDELIF:
5692 case SIOCSHWTSTAMP:
5693 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
5694 return -EPERM;
5695 /* fall through */
5696 case SIOCBONDSLAVEINFOQUERY:
5697 case SIOCBONDINFOQUERY:
5698 dev_load(net, ifr.ifr_name);
5699 rtnl_lock();
5700 ret = dev_ifsioc(net, &ifr, cmd);
5701 rtnl_unlock();
5702 return ret;
5704 case SIOCGIFMEM:
5705 /* Get the per device memory space. We can add this but
5706 * currently do not support it */
5707 case SIOCSIFMEM:
5708 /* Set the per device memory buffer space.
5709 * Not applicable in our case */
5710 case SIOCSIFLINK:
5711 return -ENOTTY;
5714 * Unknown or private ioctl.
5716 default:
5717 if (cmd == SIOCWANDEV ||
5718 (cmd >= SIOCDEVPRIVATE &&
5719 cmd <= SIOCDEVPRIVATE + 15)) {
5720 dev_load(net, ifr.ifr_name);
5721 rtnl_lock();
5722 ret = dev_ifsioc(net, &ifr, cmd);
5723 rtnl_unlock();
5724 if (!ret && copy_to_user(arg, &ifr,
5725 sizeof(struct ifreq)))
5726 ret = -EFAULT;
5727 return ret;
5729 /* Take care of Wireless Extensions */
5730 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5731 return wext_handle_ioctl(net, &ifr, cmd, arg);
5732 return -ENOTTY;
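/*
 * Example (illustrative sketch, not part of this file): what the code above
 * looks like from the other side of the syscall.  This is userspace C, kept
 * out of the build with #if 0; the caller is assumed to pass any AF_INET
 * datagram socket.
 */
#if 0   /* userspace */
#include <string.h>
#include <sys/ioctl.h>
#include <net/if.h>

static int example_get_mtu(int sock, const char *name)
{
        struct ifreq ifr;

        memset(&ifr, 0, sizeof(ifr));
        strncpy(ifr.ifr_name, name, IFNAMSIZ - 1);
        if (ioctl(sock, SIOCGIFMTU, &ifr) < 0)  /* lands in dev_ifsioc_locked() */
                return -1;
        return ifr.ifr_mtu;
}
#endif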
5738 * dev_new_index - allocate an ifindex
5739 * @net: the applicable net namespace
5741 * Returns a suitable unique value for a new device interface
5742 * number. The caller must hold the rtnl semaphore or the
5743 * dev_base_lock to be sure it remains unique.
5745 static int dev_new_index(struct net *net)
5747 int ifindex = net->ifindex;
5748 for (;;) {
5749 if (++ifindex <= 0)
5750 ifindex = 1;
5751 if (!__dev_get_by_index(net, ifindex))
5752 return net->ifindex = ifindex;
5756 /* Delayed registration/unregistration */
5757 static LIST_HEAD(net_todo_list);
5759 static void net_set_todo(struct net_device *dev)
5761 list_add_tail(&dev->todo_list, &net_todo_list);
5764 static void rollback_registered_many(struct list_head *head)
5766 struct net_device *dev, *tmp;
5768 BUG_ON(dev_boot_phase);
5769 ASSERT_RTNL();
5771 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5772 /* Some devices call this without ever having registered,
5773 * as part of unwinding a failed initialization. Remove
5774 * those devices and proceed with the remaining.
5776 if (dev->reg_state == NETREG_UNINITIALIZED) {
5777 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5778 dev->name, dev);
5780 WARN_ON(1);
5781 list_del(&dev->unreg_list);
5782 continue;
5784 dev->dismantle = true;
5785 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5788 /* If device is running, close it first. */
5789 dev_close_many(head);
5791 list_for_each_entry(dev, head, unreg_list) {
5792 /* And unlink it from device chain. */
5793 unlist_netdevice(dev);
5795 dev->reg_state = NETREG_UNREGISTERING;
5798 synchronize_net();
5800 list_for_each_entry(dev, head, unreg_list) {
5801 /* Shutdown queueing discipline. */
5802 dev_shutdown(dev);
5805 /* Notify protocols that we are about to destroy
5806 this device. They should clean up all of their state.
5808 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5810 if (!dev->rtnl_link_ops ||
5811 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5812 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5815 * Flush the unicast and multicast chains
5817 dev_uc_flush(dev);
5818 dev_mc_flush(dev);
5820 if (dev->netdev_ops->ndo_uninit)
5821 dev->netdev_ops->ndo_uninit(dev);
5823 /* Notifier chain MUST detach us all upper devices. */
5824 WARN_ON(netdev_has_any_upper_dev(dev));
5826 /* Remove entries from kobject tree */
5827 netdev_unregister_kobject(dev);
5828 #ifdef CONFIG_XPS
5829 /* Remove XPS queueing entries */
5830 netif_reset_xps_queues_gt(dev, 0);
5831 #endif
5834 synchronize_net();
5836 list_for_each_entry(dev, head, unreg_list)
5837 dev_put(dev);
5840 static void rollback_registered(struct net_device *dev)
5842 LIST_HEAD(single);
5844 list_add(&dev->unreg_list, &single);
5845 rollback_registered_many(&single);
5846 list_del(&single);
5849 static netdev_features_t netdev_fix_features(struct net_device *dev,
5850 netdev_features_t features)
5852 /* Fix illegal checksum combinations */
5853 if ((features & NETIF_F_HW_CSUM) &&
5854 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5855 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5856 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5859 /* Fix illegal SG+CSUM combinations. */
5860 if ((features & NETIF_F_SG) &&
5861 !(features & NETIF_F_ALL_CSUM)) {
5862 netdev_dbg(dev,
5863 "Dropping NETIF_F_SG since no checksum feature.\n");
5864 features &= ~NETIF_F_SG;
5867 /* TSO requires that SG is present as well. */
5868 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5869 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5870 features &= ~NETIF_F_ALL_TSO;
5873 /* TSO ECN requires that TSO is present as well. */
5874 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5875 features &= ~NETIF_F_TSO_ECN;
5877 /* Software GSO depends on SG. */
5878 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5879 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5880 features &= ~NETIF_F_GSO;
5883 /* UFO needs SG and checksumming */
5884 if (features & NETIF_F_UFO) {
5885 /* maybe split UFO into V4 and V6? */
5886 if (!((features & NETIF_F_GEN_CSUM) ||
5887 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5888 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5889 netdev_dbg(dev,
5890 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5891 features &= ~NETIF_F_UFO;
5894 if (!(features & NETIF_F_SG)) {
5895 netdev_dbg(dev,
5896 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5897 features &= ~NETIF_F_UFO;
5901 return features;
5904 int __netdev_update_features(struct net_device *dev)
5906 netdev_features_t features;
5907 int err = 0;
5909 ASSERT_RTNL();
5911 features = netdev_get_wanted_features(dev);
5913 if (dev->netdev_ops->ndo_fix_features)
5914 features = dev->netdev_ops->ndo_fix_features(dev, features);
5916 /* driver might be less strict about feature dependencies */
5917 features = netdev_fix_features(dev, features);
5919 if (dev->features == features)
5920 return 0;
5922 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5923 &dev->features, &features);
5925 if (dev->netdev_ops->ndo_set_features)
5926 err = dev->netdev_ops->ndo_set_features(dev, features);
5928 if (unlikely(err < 0)) {
5929 netdev_err(dev,
5930 "set_features() failed (%d); wanted %pNF, left %pNF\n",
5931 err, &features, &dev->features);
5932 return -1;
5935 if (!err)
5936 dev->features = features;
5938 return 1;
5942 * netdev_update_features - recalculate device features
5943 * @dev: the device to check
5945 * Recalculate dev->features set and send notifications if it
5946 * has changed. Should be called after driver or hardware dependent
5947 * conditions might have changed that influence the features.
5949 void netdev_update_features(struct net_device *dev)
5951 if (__netdev_update_features(dev))
5952 netdev_features_change(dev);
5954 EXPORT_SYMBOL(netdev_update_features);
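/*
 * Example (illustrative sketch, not part of this file): a driver whose RX
 * checksum offload stops working above a certain MTU could mask it out and
 * ask the core to recompute dev->features.  The names and the 1500-byte
 * limit are assumptions; the RTNL lock must be held.
 */
static void my_example_refresh_features(struct net_device *dev, int new_mtu)
{
        ASSERT_RTNL();

        if (new_mtu > 1500)
                dev->hw_features &= ~NETIF_F_RXCSUM;
        else
                dev->hw_features |= NETIF_F_RXCSUM;

        netdev_update_features(dev);    /* recompute and notify only if changed */
}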
5957 * netdev_change_features - recalculate device features
5958 * @dev: the device to check
5960 * Recalculate dev->features set and send notifications even
5961 * if they have not changed. Should be called instead of
5962 * netdev_update_features() if also dev->vlan_features might
5963 * have changed to allow the changes to be propagated to stacked
5964 * VLAN devices.
5966 void netdev_change_features(struct net_device *dev)
5968 __netdev_update_features(dev);
5969 netdev_features_change(dev);
5971 EXPORT_SYMBOL(netdev_change_features);
5974 * netif_stacked_transfer_operstate - transfer operstate
5975 * @rootdev: the root or lower level device to transfer state from
5976 * @dev: the device to transfer operstate to
5978 * Transfer operational state from root to device. This is normally
5979 * called when a stacking relationship exists between the root
5980 * device and the device (a leaf device).
5982 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5983 struct net_device *dev)
5985 if (rootdev->operstate == IF_OPER_DORMANT)
5986 netif_dormant_on(dev);
5987 else
5988 netif_dormant_off(dev);
5990 if (netif_carrier_ok(rootdev)) {
5991 if (!netif_carrier_ok(dev))
5992 netif_carrier_on(dev);
5993 } else {
5994 if (netif_carrier_ok(dev))
5995 netif_carrier_off(dev);
5998 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6000 #ifdef CONFIG_RPS
6001 static int netif_alloc_rx_queues(struct net_device *dev)
6003 unsigned int i, count = dev->num_rx_queues;
6004 struct netdev_rx_queue *rx;
6006 BUG_ON(count < 1);
6008 rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
6009 if (!rx)
6010 return -ENOMEM;
6012 dev->_rx = rx;
6014 for (i = 0; i < count; i++)
6015 rx[i].dev = dev;
6016 return 0;
6018 #endif
6020 static void netdev_init_one_queue(struct net_device *dev,
6021 struct netdev_queue *queue, void *_unused)
6023 /* Initialize queue lock */
6024 spin_lock_init(&queue->_xmit_lock);
6025 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6026 queue->xmit_lock_owner = -1;
6027 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6028 queue->dev = dev;
6029 #ifdef CONFIG_BQL
6030 dql_init(&queue->dql, HZ);
6031 #endif
6034 static int netif_alloc_netdev_queues(struct net_device *dev)
6036 unsigned int count = dev->num_tx_queues;
6037 struct netdev_queue *tx;
6039 BUG_ON(count < 1);
6041 tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
6042 if (!tx)
6043 return -ENOMEM;
6045 dev->_tx = tx;
6047 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6048 spin_lock_init(&dev->tx_global_lock);
6050 return 0;
6054 * register_netdevice - register a network device
6055 * @dev: device to register
6057 * Take a completed network device structure and add it to the kernel
6058 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6059 * chain. 0 is returned on success. A negative errno code is returned
6060 * on a failure to set up the device, or if the name is a duplicate.
6062 * Callers must hold the rtnl semaphore. You may want
6063 * register_netdev() instead of this.
6065 * BUGS:
6066 * The locking appears insufficient to guarantee two parallel registers
6067 * will not get the same name.
6070 int register_netdevice(struct net_device *dev)
6072 int ret;
6073 struct net *net = dev_net(dev);
6075 BUG_ON(dev_boot_phase);
6076 ASSERT_RTNL();
6078 might_sleep();
6080 /* When net_device's are persistent, this will be fatal. */
6081 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6082 BUG_ON(!net);
6084 spin_lock_init(&dev->addr_list_lock);
6085 netdev_set_addr_lockdep_class(dev);
6087 dev->iflink = -1;
6089 ret = dev_get_valid_name(net, dev, dev->name);
6090 if (ret < 0)
6091 goto out;
6093 /* Init, if this function is available */
6094 if (dev->netdev_ops->ndo_init) {
6095 ret = dev->netdev_ops->ndo_init(dev);
6096 if (ret) {
6097 if (ret > 0)
6098 ret = -EIO;
6099 goto out;
6103 if (((dev->hw_features | dev->features) & NETIF_F_HW_VLAN_FILTER) &&
6104 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6105 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6106 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6107 ret = -EINVAL;
6108 goto err_uninit;
6111 ret = -EBUSY;
6112 if (!dev->ifindex)
6113 dev->ifindex = dev_new_index(net);
6114 else if (__dev_get_by_index(net, dev->ifindex))
6115 goto err_uninit;
6117 if (dev->iflink == -1)
6118 dev->iflink = dev->ifindex;
6120 /* Transfer changeable features to wanted_features and enable
6121 * software offloads (GSO and GRO).
6123 dev->hw_features |= NETIF_F_SOFT_FEATURES;
6124 dev->features |= NETIF_F_SOFT_FEATURES;
6125 dev->wanted_features = dev->features & dev->hw_features;
6127 /* Turn on no cache copy if HW is doing checksum */
6128 if (!(dev->flags & IFF_LOOPBACK)) {
6129 dev->hw_features |= NETIF_F_NOCACHE_COPY;
6130 if (dev->features & NETIF_F_ALL_CSUM) {
6131 dev->wanted_features |= NETIF_F_NOCACHE_COPY;
6132 dev->features |= NETIF_F_NOCACHE_COPY;
6136 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6138 dev->vlan_features |= NETIF_F_HIGHDMA;
6140 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6141 ret = notifier_to_errno(ret);
6142 if (ret)
6143 goto err_uninit;
6145 ret = netdev_register_kobject(dev);
6146 if (ret)
6147 goto err_uninit;
6148 dev->reg_state = NETREG_REGISTERED;
6150 __netdev_update_features(dev);
6153 * Default initial state at registration is that the
6154 * device is present.
6157 set_bit(__LINK_STATE_PRESENT, &dev->state);
6159 linkwatch_init_dev(dev);
6161 dev_init_scheduler(dev);
6162 dev_hold(dev);
6163 list_netdevice(dev);
6164 add_device_randomness(dev->dev_addr, dev->addr_len);
6166 /* If the device has a permanent hardware address, the driver should
6167 * set dev_addr and addr_assign_type should be set to
6168 * NET_ADDR_PERM (the default value).
6170 if (dev->addr_assign_type == NET_ADDR_PERM)
6171 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6173 /* Notify protocols, that a new device appeared. */
6174 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6175 ret = notifier_to_errno(ret);
6176 if (ret) {
6177 rollback_registered(dev);
6178 dev->reg_state = NETREG_UNREGISTERED;
6181 * Prevent userspace races by waiting until the network
6182 * device is fully setup before sending notifications.
6184 if (!dev->rtnl_link_ops ||
6185 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6186 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6188 out:
6189 return ret;
6191 err_uninit:
6192 if (dev->netdev_ops->ndo_uninit)
6193 dev->netdev_ops->ndo_uninit(dev);
6194 goto out;
6196 EXPORT_SYMBOL(register_netdevice);
6199 * init_dummy_netdev - init a dummy network device for NAPI
6200 * @dev: device to init
6202 * This takes a network device structure and initializes the minimum
6203 * set of fields so it can be used to schedule NAPI polls without
6204 * registering a full blown interface. This is to be used by drivers
6205 * that need to tie several hardware interfaces to a single NAPI
6206 * poll scheduler due to HW limitations.
6208 int init_dummy_netdev(struct net_device *dev)
6210 /* Clear everything. Note we don't initialize spinlocks
6211 * as they aren't supposed to be taken by any of the
6212 * NAPI code and this dummy netdev is supposed to be
6213 * only ever used for NAPI polls
6215 memset(dev, 0, sizeof(struct net_device));
6217 /* make sure we BUG if trying to hit standard
6218 * register/unregister code path
6220 dev->reg_state = NETREG_DUMMY;
6222 /* NAPI wants this */
6223 INIT_LIST_HEAD(&dev->napi_list);
6225 /* a dummy interface is started by default */
6226 set_bit(__LINK_STATE_PRESENT, &dev->state);
6227 set_bit(__LINK_STATE_START, &dev->state);
6229 /* Note : We don't allocate pcpu_refcnt for dummy devices,
6230 * because users of this 'device' don't need to change
6231 * its refcount.
6234 return 0;
6236 EXPORT_SYMBOL_GPL(init_dummy_netdev);
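/*
 * Example (illustrative sketch, not part of this file): hardware that funnels
 * several ports through one interrupt can hang its shared NAPI context off a
 * dummy netdev.  The my_example_hw layout and the poll weight are assumptions.
 */
struct my_example_hw {
        struct net_device napi_dev;     /* dummy device, never registered */
        struct napi_struct napi;
};

static void my_example_hw_napi_init(struct my_example_hw *hw,
                                    int (*poll)(struct napi_struct *, int))
{
        init_dummy_netdev(&hw->napi_dev);
        netif_napi_add(&hw->napi_dev, &hw->napi, poll, 64);
        napi_enable(&hw->napi);
}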
6240 * register_netdev - register a network device
6241 * @dev: device to register
6243 * Take a completed network device structure and add it to the kernel
6244 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6245 * chain. 0 is returned on success. A negative errno code is returned
6246 * on a failure to set up the device, or if the name is a duplicate.
6248 * This is a wrapper around register_netdevice that takes the rtnl semaphore
6249 * and expands the device name if you passed a format string to
6250 * alloc_netdev.
6252 int register_netdev(struct net_device *dev)
6254 int err;
6256 rtnl_lock();
6257 err = register_netdevice(dev);
6258 rtnl_unlock();
6259 return err;
6261 EXPORT_SYMBOL(register_netdev);
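/*
 * Example (illustrative sketch, not part of this file): the usual probe-time
 * sequence for an Ethernet driver.  struct my_example_priv and
 * my_example_netdev_ops are placeholders for the driver's own definitions.
 */
struct my_example_priv {
        int dummy;      /* driver-private state lives here */
};

static const struct net_device_ops my_example_netdev_ops = {
        /* .ndo_open, .ndo_stop, .ndo_start_xmit, ... */
};

static int my_example_probe(struct device *parent)
{
        struct net_device *dev;
        int err;

        dev = alloc_etherdev(sizeof(struct my_example_priv));
        if (!dev)
                return -ENOMEM;

        SET_NETDEV_DEV(dev, parent);
        dev->netdev_ops = &my_example_netdev_ops;
        /* ... fill in dev->dev_addr, features, MTU limits, ... */

        err = register_netdev(dev);     /* takes rtnl_lock() internally */
        if (err)
                free_netdev(dev);
        return err;
}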
6263 int netdev_refcnt_read(const struct net_device *dev)
6265 int i, refcnt = 0;
6267 for_each_possible_cpu(i)
6268 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6269 return refcnt;
6271 EXPORT_SYMBOL(netdev_refcnt_read);
6274 * netdev_wait_allrefs - wait until all references are gone.
6275 * @dev: target net_device
6277 * This is called when unregistering network devices.
6279 * Any protocol or device that holds a reference should register
6280 * for netdevice notification, and clean up and put back the
6281 * reference if they receive an UNREGISTER event.
6282 * We can get stuck here if buggy protocols don't correctly
6283 * call dev_put.
6285 static void netdev_wait_allrefs(struct net_device *dev)
6287 unsigned long rebroadcast_time, warning_time;
6288 int refcnt;
6290 linkwatch_forget_dev(dev);
6292 rebroadcast_time = warning_time = jiffies;
6293 refcnt = netdev_refcnt_read(dev);
6295 while (refcnt != 0) {
6296 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6297 rtnl_lock();
6299 /* Rebroadcast unregister notification */
6300 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6302 __rtnl_unlock();
6303 rcu_barrier();
6304 rtnl_lock();
6306 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6307 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6308 &dev->state)) {
6309 /* We must not have linkwatch events
6310 * pending on unregister. If this
6311 * happens, we simply run the queue
6312 * unscheduled, resulting in a noop
6313 * for this device.
6315 linkwatch_run_queue();
6318 __rtnl_unlock();
6320 rebroadcast_time = jiffies;
6323 msleep(250);
6325 refcnt = netdev_refcnt_read(dev);
6327 if (time_after(jiffies, warning_time + 10 * HZ)) {
6328 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6329 dev->name, refcnt);
6330 warning_time = jiffies;
6335 /* The sequence is:
6337 * rtnl_lock();
6338 * ...
6339 * register_netdevice(x1);
6340 * register_netdevice(x2);
6341 * ...
6342 * unregister_netdevice(y1);
6343 * unregister_netdevice(y2);
6344 * ...
6345 * rtnl_unlock();
6346 * free_netdev(y1);
6347 * free_netdev(y2);
6349 * We are invoked by rtnl_unlock().
6350 * This allows us to deal with problems:
6351 * 1) We can delete sysfs objects which invoke hotplug
6352 * without deadlocking with linkwatch via keventd.
6353 * 2) Since we run with the RTNL semaphore not held, we can sleep
6354 * safely in order to wait for the netdev refcnt to drop to zero.
6356 * We must not return until all unregister events added during
6357 * the interval the lock was held have been completed.
6359 void netdev_run_todo(void)
6361 struct list_head list;
6363 /* Snapshot list, allow later requests */
6364 list_replace_init(&net_todo_list, &list);
6366 __rtnl_unlock();
6369 /* Wait for rcu callbacks to finish before next phase */
6370 if (!list_empty(&list))
6371 rcu_barrier();
6373 while (!list_empty(&list)) {
6374 struct net_device *dev
6375 = list_first_entry(&list, struct net_device, todo_list);
6376 list_del(&dev->todo_list);
6378 rtnl_lock();
6379 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6380 __rtnl_unlock();
6382 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6383 pr_err("network todo '%s' but state %d\n",
6384 dev->name, dev->reg_state);
6385 dump_stack();
6386 continue;
6389 dev->reg_state = NETREG_UNREGISTERED;
6391 on_each_cpu(flush_backlog, dev, 1);
6393 netdev_wait_allrefs(dev);
6395 /* paranoia */
6396 BUG_ON(netdev_refcnt_read(dev));
6397 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6398 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6399 WARN_ON(dev->dn_ptr);
6401 if (dev->destructor)
6402 dev->destructor(dev);
6404 /* Free network device */
6405 kobject_put(&dev->dev.kobj);
6409 /* Convert net_device_stats to rtnl_link_stats64. They have the same
6410 * fields in the same order, with only the type differing.
6412 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6413 const struct net_device_stats *netdev_stats)
6415 #if BITS_PER_LONG == 64
6416 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6417 memcpy(stats64, netdev_stats, sizeof(*stats64));
6418 #else
6419 size_t i, n = sizeof(*stats64) / sizeof(u64);
6420 const unsigned long *src = (const unsigned long *)netdev_stats;
6421 u64 *dst = (u64 *)stats64;
6423 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6424 sizeof(*stats64) / sizeof(u64));
6425 for (i = 0; i < n; i++)
6426 dst[i] = src[i];
6427 #endif
6429 EXPORT_SYMBOL(netdev_stats_to_stats64);
6432 * dev_get_stats - get network device statistics
6433 * @dev: device to get statistics from
6434 * @storage: place to store stats
6436 * Get network statistics from device. Return @storage.
6437 * The device driver may provide its own method by setting
6438 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6439 * otherwise the internal statistics structure is used.
6441 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6442 struct rtnl_link_stats64 *storage)
6444 const struct net_device_ops *ops = dev->netdev_ops;
6446 if (ops->ndo_get_stats64) {
6447 memset(storage, 0, sizeof(*storage));
6448 ops->ndo_get_stats64(dev, storage);
6449 } else if (ops->ndo_get_stats) {
6450 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6451 } else {
6452 netdev_stats_to_stats64(storage, &dev->stats);
6454 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6455 return storage;
6457 EXPORT_SYMBOL(dev_get_stats);
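/*
 * Example (illustrative sketch, not part of this file): the 64-bit hook that
 * dev_get_stats() prefers.  struct my_example_stats and its counters are
 * assumptions standing in for a driver's real accounting.
 */
struct my_example_stats {
        u64 rx_packets, rx_bytes;
        u64 tx_packets, tx_bytes;
};

static struct rtnl_link_stats64 *
my_example_get_stats64(struct net_device *dev,
                       struct rtnl_link_stats64 *storage)
{
        struct my_example_stats *s = netdev_priv(dev);

        storage->rx_packets = s->rx_packets;
        storage->rx_bytes   = s->rx_bytes;
        storage->tx_packets = s->tx_packets;
        storage->tx_bytes   = s->tx_bytes;
        /* dev_get_stats() adds the core-level rx_dropped on top */
        return storage;
}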
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (queue)
		return queue;
	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
	if (!queue)
		return NULL;
	netdev_init_one_queue(dev, queue, NULL);
	queue->qdisc = &noop_qdisc;
	queue->qdisc_sleeping = &noop_qdisc;
	rcu_assign_pointer(dev->ingress_queue, queue);
#endif
	return queue;
}

static const struct ethtool_ops default_ethtool_ops;

void netdev_set_default_ethtool_ops(struct net_device *dev,
				    const struct ethtool_ops *ops)
{
	if (dev->ethtool_ops == &default_ethtool_ops)
		dev->ethtool_ops = ops;
}
EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
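/*
 * Illustrative sketch, not part of the original file: a bus or class layer
 * can install fallback ethtool ops for devices whose drivers did not set
 * dev->ethtool_ops themselves.  The identifiers example_bus_ethtool_ops
 * and example_bus_setup() are hypothetical.
 */
#if 0	/* example only, never compiled */
static const struct ethtool_ops example_bus_ethtool_ops = {
	.get_link	= ethtool_op_get_link,
};

static void example_bus_setup(struct net_device *dev)
{
	/* Only takes effect if the driver left the default ops in place. */
	netdev_set_default_ethtool_ops(dev, &example_bus_ethtool_ops);
}
#endif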
/**
 *	alloc_netdev_mqs - allocate network device
 *	@sizeof_priv:	size of private data to allocate space for
 *	@name:		device name format string
 *	@setup:		callback to initialize device
 *	@txqs:		the number of TX subqueues to allocate
 *	@rxqs:		the number of RX subqueues to allocate
 *
 *	Allocates a struct net_device with private data area for driver use
 *	and performs basic initialization.  Also allocates subqueue structs
 *	for each queue on the device.
 */
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
		void (*setup)(struct net_device *),
		unsigned int txqs, unsigned int rxqs)
{
	struct net_device *dev;
	size_t alloc_size;
	struct net_device *p;

	BUG_ON(strlen(name) >= sizeof(dev->name));

	if (txqs < 1) {
		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
		return NULL;
	}

#ifdef CONFIG_RPS
	if (rxqs < 1) {
		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
		return NULL;
	}
#endif

	alloc_size = sizeof(struct net_device);
	if (sizeof_priv) {
		/* ensure 32-byte alignment of private area */
		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
		alloc_size += sizeof_priv;
	}
	/* ensure 32-byte alignment of whole construct */
	alloc_size += NETDEV_ALIGN - 1;

	p = kzalloc(alloc_size, GFP_KERNEL);
	if (!p)
		return NULL;

	dev = PTR_ALIGN(p, NETDEV_ALIGN);
	dev->padded = (char *)dev - (char *)p;

	dev->pcpu_refcnt = alloc_percpu(int);
	if (!dev->pcpu_refcnt)
		goto free_p;

	if (dev_addr_init(dev))
		goto free_pcpu;

	dev_mc_init(dev);
	dev_uc_init(dev);

	dev_net_set(dev, &init_net);

	dev->gso_max_size = GSO_MAX_SIZE;
	dev->gso_max_segs = GSO_MAX_SEGS;

	INIT_LIST_HEAD(&dev->napi_list);
	INIT_LIST_HEAD(&dev->unreg_list);
	INIT_LIST_HEAD(&dev->link_watch_list);
	INIT_LIST_HEAD(&dev->upper_dev_list);
	dev->priv_flags = IFF_XMIT_DST_RELEASE;
	setup(dev);

	dev->num_tx_queues = txqs;
	dev->real_num_tx_queues = txqs;
	if (netif_alloc_netdev_queues(dev))
		goto free_all;

#ifdef CONFIG_RPS
	dev->num_rx_queues = rxqs;
	dev->real_num_rx_queues = rxqs;
	if (netif_alloc_rx_queues(dev))
		goto free_all;
#endif

	strcpy(dev->name, name);
	dev->group = INIT_NETDEV_GROUP;
	if (!dev->ethtool_ops)
		dev->ethtool_ops = &default_ethtool_ops;
	return dev;

free_all:
	free_netdev(dev);
	return NULL;

free_pcpu:
	free_percpu(dev->pcpu_refcnt);
	kfree(dev->_tx);
#ifdef CONFIG_RPS
	kfree(dev->_rx);
#endif

free_p:
	kfree(p);
	return NULL;
}
EXPORT_SYMBOL(alloc_netdev_mqs);
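/*
 * Illustrative sketch, not part of the original file: a typical driver-side
 * use of alloc_netdev_mqs().  The private struct, setup callback, probe
 * function and queue counts (4 TX / 4 RX) are hypothetical.
 */
#if 0	/* example only, never compiled */
struct example_priv {
	int example_flag;
};

static void example_setup(struct net_device *dev)
{
	ether_setup(dev);	/* sane Ethernet defaults */
}

static struct net_device *example_probe(void)
{
	struct net_device *dev;

	dev = alloc_netdev_mqs(sizeof(struct example_priv), "eth%d",
			       example_setup, 4, 4);
	if (!dev)
		return NULL;

	if (register_netdev(dev)) {
		/* Never registered: free_netdev() handles this state. */
		free_netdev(dev);
		return NULL;
	}
	return dev;
}
#endif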
/**
 *	free_netdev - free network device
 *	@dev: device
 *
 *	This function does the last stage of destroying an allocated device
 *	interface. The reference to the device object is released.
 *	If this is the last reference then it will be freed.
 */
void free_netdev(struct net_device *dev)
{
	struct napi_struct *p, *n;

	release_net(dev_net(dev));

	kfree(dev->_tx);
#ifdef CONFIG_RPS
	kfree(dev->_rx);
#endif

	kfree(rcu_dereference_protected(dev->ingress_queue, 1));

	/* Flush device addresses */
	dev_addr_flush(dev);

	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
		netif_napi_del(p);

	free_percpu(dev->pcpu_refcnt);
	dev->pcpu_refcnt = NULL;

	/* Compatibility with error handling in drivers */
	if (dev->reg_state == NETREG_UNINITIALIZED) {
		kfree((char *)dev - dev->padded);
		return;
	}

	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
	dev->reg_state = NETREG_RELEASED;

	/* will free via device release */
	put_device(&dev->dev);
}
EXPORT_SYMBOL(free_netdev);
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 */
void synchronize_net(void)
{
	might_sleep();
	if (rtnl_is_locked())
		synchronize_rcu_expedited();
	else
		synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);
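/*
 * Illustrative sketch, not part of the original file: synchronize_net() is
 * typically called after unhooking a receive handler so that any packets
 * still flowing through the old hook have drained before its resources are
 * torn down.  The packet_type instance and example_unhook() are hypothetical.
 */
#if 0	/* example only, never compiled */
static struct packet_type example_pt;	/* previously dev_add_pack()ed */

static void example_unhook(void)
{
	dev_remove_pack(&example_pt);
	/* Wait for in-flight receive processing before freeing state
	 * that example_pt.func may still be touching.
	 */
	synchronize_net();
}
#endif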
/**
 *	unregister_netdevice_queue - remove device from the kernel
 *	@dev: device
 *	@head: list
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *	If @head is not NULL, the device is queued to be unregistered later.
 *
 *	Callers must hold the rtnl semaphore.  You may want
 *	unregister_netdev() instead of this.
 */

void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
{
	ASSERT_RTNL();

	if (head) {
		list_move_tail(&dev->unreg_list, head);
	} else {
		rollback_registered(dev);
		/* Finish processing unregister after unlock */
		net_set_todo(dev);
	}
}
EXPORT_SYMBOL(unregister_netdevice_queue);
/**
 *	unregister_netdevice_many - unregister many devices
 *	@head: list of devices
 */
void unregister_netdevice_many(struct list_head *head)
{
	struct net_device *dev;

	if (!list_empty(head)) {
		rollback_registered_many(head);
		list_for_each_entry(dev, head, unreg_list)
			net_set_todo(dev);
	}
}
EXPORT_SYMBOL(unregister_netdevice_many);
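/*
 * Illustrative sketch, not part of the original file: batching several
 * unregistrations under a single rtnl_lock()/rtnl_unlock() cycle by
 * queueing them on a list and flushing with unregister_netdevice_many().
 * The example_devs array and function name are hypothetical.
 */
#if 0	/* example only, never compiled */
static void example_unregister_batch(struct net_device **example_devs, int n)
{
	LIST_HEAD(kill_list);
	int i;

	rtnl_lock();
	for (i = 0; i < n; i++)
		unregister_netdevice_queue(example_devs[i], &kill_list);
	unregister_netdevice_many(&kill_list);
	rtnl_unlock();
}
#endif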
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	This is just a wrapper for unregister_netdevice that takes
 *	the rtnl semaphore.  In general you want to use this and not
 *	unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
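/*
 * Illustrative sketch, not part of the original file: the usual driver
 * remove path pairs unregister_netdev() (which takes the rtnl semaphore
 * itself) with free_netdev().  example_remove() is a hypothetical name.
 */
#if 0	/* example only, never compiled */
static void example_remove(struct net_device *dev)
{
	unregister_netdev(dev);	/* waits for all references to drop */
	free_netdev(dev);	/* releases the device object */
}
#endif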
/**
 *	dev_change_net_namespace - move device to different network namespace
 *	@dev: device
 *	@net: network namespace
 *	@pat: If not NULL name pattern to try if the current device name
 *	      is already taken in the destination network namespace.
 *
 *	This function shuts down a device interface and moves it
 *	to a new network namespace. On success 0 is returned, on
 *	a failure a negative errno code is returned.
 *
 *	Callers must hold the rtnl semaphore.
 */

int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
{
	int err;

	ASSERT_RTNL();

	/* Don't allow namespace local devices to be moved. */
	err = -EINVAL;
	if (dev->features & NETIF_F_NETNS_LOCAL)
		goto out;

	/* Ensure the device has been registered */
	if (dev->reg_state != NETREG_REGISTERED)
		goto out;

	/* Get out if there is nothing to do */
	err = 0;
	if (net_eq(dev_net(dev), net))
		goto out;

	/* Pick the destination device name, and ensure
	 * we can use it in the destination network namespace.
	 */
	err = -EEXIST;
	if (__dev_get_by_name(net, dev->name)) {
		/* We get here if we can't use the current device name */
		if (!pat)
			goto out;
		if (dev_get_valid_name(net, dev, pat) < 0)
			goto out;
	}

	/*
	 * And now a mini version of register_netdevice unregister_netdevice.
	 */

	/* If device is running close it first. */
	dev_close(dev);

	/* And unlink it from device chain */
	err = -ENODEV;
	unlist_netdevice(dev);

	synchronize_net();

	/* Shutdown queueing discipline. */
	dev_shutdown(dev);

	/* Notify protocols, that we are about to destroy
	   this device. They should clean all the things.

	   Note that dev->reg_state stays at NETREG_REGISTERED.
	   This is wanted because this way 8021q and macvlan know
	   the device is just moving and can keep their slaves up.
	*/
	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
	rcu_barrier();
	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);

	/*
	 *	Flush the unicast and multicast chains
	 */
	dev_uc_flush(dev);
	dev_mc_flush(dev);

	/* Send a netdev-removed uevent to the old namespace */
	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);

	/* Actually switch the network namespace */
	dev_net_set(dev, net);

	/* If there is an ifindex conflict assign a new one */
	if (__dev_get_by_index(net, dev->ifindex)) {
		int iflink = (dev->iflink == dev->ifindex);
		dev->ifindex = dev_new_index(net);
		if (iflink)
			dev->iflink = dev->ifindex;
	}

	/* Send a netdev-add uevent to the new namespace */
	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);

	/* Fixup kobjects */
	err = device_rename(&dev->dev, dev->name);
	WARN_ON(err);

	/* Add the device back in the hashes */
	list_netdevice(dev);

	/* Notify protocols, that a new device appeared. */
	call_netdevice_notifiers(NETDEV_REGISTER, dev);

	/*
	 *	Prevent userspace races by waiting until the network
	 *	device is fully set up before sending notifications.
	 */
	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);

	synchronize_net();
	err = 0;
out:
	return err;
}
EXPORT_SYMBOL_GPL(dev_change_net_namespace);
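/*
 * Illustrative sketch, not part of the original file: moving a device into
 * another network namespace under the rtnl semaphore, falling back to a
 * "dev%d" name pattern if the current name is already taken there.  The
 * function name example_move_to_ns() is hypothetical.
 */
#if 0	/* example only, never compiled */
static int example_move_to_ns(struct net_device *dev, struct net *net)
{
	int err;

	rtnl_lock();
	err = dev_change_net_namespace(dev, net, "dev%d");
	rtnl_unlock();
	return err;
}
#endif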
static int dev_cpu_callback(struct notifier_block *nfb,
			    unsigned long action,
			    void *ocpu)
{
	struct sk_buff **list_skb;
	struct sk_buff *skb;
	unsigned int cpu, oldcpu = (unsigned long)ocpu;
	struct softnet_data *sd, *oldsd;

	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return NOTIFY_OK;

	local_irq_disable();
	cpu = smp_processor_id();
	sd = &per_cpu(softnet_data, cpu);
	oldsd = &per_cpu(softnet_data, oldcpu);

	/* Find end of our completion_queue. */
	list_skb = &sd->completion_queue;
	while (*list_skb)
		list_skb = &(*list_skb)->next;
	/* Append completion queue from offline CPU. */
	*list_skb = oldsd->completion_queue;
	oldsd->completion_queue = NULL;

	/* Append output queue from offline CPU. */
	if (oldsd->output_queue) {
		*sd->output_queue_tailp = oldsd->output_queue;
		sd->output_queue_tailp = oldsd->output_queue_tailp;
		oldsd->output_queue = NULL;
		oldsd->output_queue_tailp = &oldsd->output_queue;
	}
	/* Append NAPI poll list from offline CPU. */
	if (!list_empty(&oldsd->poll_list)) {
		list_splice_init(&oldsd->poll_list, &sd->poll_list);
		raise_softirq_irqoff(NET_RX_SOFTIRQ);
	}

	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_enable();

	/* Process offline CPU's input_pkt_queue */
	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
		netif_rx(skb);
		input_queue_head_incr(oldsd);
	}
	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
		netif_rx(skb);
		input_queue_head_incr(oldsd);
	}

	return NOTIFY_OK;
}
/**
 *	netdev_increment_features - increment feature set by one
 *	@all: current feature set
 *	@one: new feature set
 *	@mask: mask feature set
 *
 *	Computes a new feature set after adding a device with feature set
 *	@one to the master device with current feature set @all.  Will not
 *	enable anything that is off in @mask. Returns the new feature set.
 */
netdev_features_t netdev_increment_features(netdev_features_t all,
	netdev_features_t one, netdev_features_t mask)
{
	if (mask & NETIF_F_GEN_CSUM)
		mask |= NETIF_F_ALL_CSUM;
	mask |= NETIF_F_VLAN_CHALLENGED;

	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
	all &= one | ~NETIF_F_ALL_FOR_ALL;

	/* If one device supports hw checksumming, set for all. */
	if (all & NETIF_F_GEN_CSUM)
		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);

	return all;
}
EXPORT_SYMBOL(netdev_increment_features);
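/*
 * Illustrative sketch, not part of the original file: a bonding/bridge
 * style master recomputing its feature set from its slaves, in the spirit
 * of what such drivers do with netdev_increment_features().  The slave
 * array iteration is schematic and all identifiers are hypothetical.
 */
#if 0	/* example only, never compiled */
static netdev_features_t example_fix_features(struct net_device **slaves,
					      int n_slaves,
					      netdev_features_t mask)
{
	netdev_features_t features = mask & NETIF_F_ONE_FOR_ALL;
	int i;

	/* Fold each slave's features in; @mask limits what may be enabled. */
	for (i = 0; i < n_slaves; i++)
		features = netdev_increment_features(features,
						     slaves[i]->features,
						     mask);
	return features;
}
#endif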
static struct hlist_head *netdev_create_hash(void)
{
	int i;
	struct hlist_head *hash;

	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
	if (hash != NULL)
		for (i = 0; i < NETDEV_HASHENTRIES; i++)
			INIT_HLIST_HEAD(&hash[i]);

	return hash;
}

/* Initialize per network namespace state */
static int __net_init netdev_init(struct net *net)
{
	if (net != &init_net)
		INIT_LIST_HEAD(&net->dev_base_head);

	net->dev_name_head = netdev_create_hash();
	if (net->dev_name_head == NULL)
		goto err_name;

	net->dev_index_head = netdev_create_hash();
	if (net->dev_index_head == NULL)
		goto err_idx;

	return 0;

err_idx:
	kfree(net->dev_name_head);
err_name:
	return -ENOMEM;
}
/**
 *	netdev_drivername - network driver for the device
 *	@dev: network device
 *
 *	Determine network driver for device.
 */
const char *netdev_drivername(const struct net_device *dev)
{
	const struct device_driver *driver;
	const struct device *parent;
	const char *empty = "";

	parent = dev->dev.parent;
	if (!parent)
		return empty;

	driver = parent->driver;
	if (driver && driver->name)
		return driver->name;
	return empty;
}
static int __netdev_printk(const char *level, const struct net_device *dev,
			   struct va_format *vaf)
{
	int r;

	if (dev && dev->dev.parent) {
		r = dev_printk_emit(level[1] - '0',
				    dev->dev.parent,
				    "%s %s %s: %pV",
				    dev_driver_string(dev->dev.parent),
				    dev_name(dev->dev.parent),
				    netdev_name(dev), vaf);
	} else if (dev) {
		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
	} else {
		r = printk("%s(NULL net_device): %pV", level, vaf);
	}

	return r;
}

int netdev_printk(const char *level, const struct net_device *dev,
		  const char *format, ...)
{
	struct va_format vaf;
	va_list args;
	int r;

	va_start(args, format);

	vaf.fmt = format;
	vaf.va = &args;

	r = __netdev_printk(level, dev, &vaf);

	va_end(args);

	return r;
}
EXPORT_SYMBOL(netdev_printk);
#define define_netdev_printk_level(func, level)			\
int func(const struct net_device *dev, const char *fmt, ...)		\
{									\
	int r;								\
	struct va_format vaf;						\
	va_list args;							\
									\
	va_start(args, fmt);						\
									\
	vaf.fmt = fmt;							\
	vaf.va = &args;							\
									\
	r = __netdev_printk(level, dev, &vaf);				\
									\
	va_end(args);							\
									\
	return r;							\
}									\
EXPORT_SYMBOL(func);

define_netdev_printk_level(netdev_emerg, KERN_EMERG);
define_netdev_printk_level(netdev_alert, KERN_ALERT);
define_netdev_printk_level(netdev_crit, KERN_CRIT);
define_netdev_printk_level(netdev_err, KERN_ERR);
define_netdev_printk_level(netdev_warn, KERN_WARNING);
define_netdev_printk_level(netdev_notice, KERN_NOTICE);
define_netdev_printk_level(netdev_info, KERN_INFO);
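/*
 * Illustrative sketch, not part of the original file: drivers use the
 * level helpers generated above (netdev_err(), netdev_info(), ...) so that
 * messages are prefixed with the driver, bus and interface names.  The
 * example_open() function and its -ENODEV condition are hypothetical.
 */
#if 0	/* example only, never compiled */
static int example_open(struct net_device *dev)
{
	if (!netif_device_present(dev)) {
		netdev_err(dev, "device not present, cannot open\n");
		return -ENODEV;
	}
	netdev_info(dev, "link is being brought up\n");
	return 0;
}
#endif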
static void __net_exit netdev_exit(struct net *net)
{
	kfree(net->dev_name_head);
	kfree(net->dev_index_head);
}

static struct pernet_operations __net_initdata netdev_net_ops = {
	.init = netdev_init,
	.exit = netdev_exit,
};

static void __net_exit default_device_exit(struct net *net)
{
	struct net_device *dev, *aux;
	/*
	 * Push all migratable network devices back to the
	 * initial network namespace
	 */
	rtnl_lock();
	for_each_netdev_safe(net, dev, aux) {
		int err;
		char fb_name[IFNAMSIZ];

		/* Ignore unmoveable devices (i.e. loopback) */
		if (dev->features & NETIF_F_NETNS_LOCAL)
			continue;

		/* Leave virtual devices for the generic cleanup */
		if (dev->rtnl_link_ops)
			continue;

		/* Push remaining network devices to init_net */
		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
		err = dev_change_net_namespace(dev, &init_net, fb_name);
		if (err) {
			pr_emerg("%s: failed to move %s to init_net: %d\n",
				 __func__, dev->name, err);
			BUG();
		}
	}
	rtnl_unlock();
}
static void __net_exit default_device_exit_batch(struct list_head *net_list)
{
	/* At exit all network devices must be removed from a network
	 * namespace.  Do this in the reverse order of registration.
	 * Do this across as many network namespaces as possible to
	 * improve batching efficiency.
	 */
	struct net_device *dev;
	struct net *net;
	LIST_HEAD(dev_kill_list);

	rtnl_lock();
	list_for_each_entry(net, net_list, exit_list) {
		for_each_netdev_reverse(net, dev) {
			if (dev->rtnl_link_ops)
				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
			else
				unregister_netdevice_queue(dev, &dev_kill_list);
		}
	}
	unregister_netdevice_many(&dev_kill_list);
	list_del(&dev_kill_list);
	rtnl_unlock();
}

static struct pernet_operations __net_initdata default_device_ops = {
	.exit = default_device_exit,
	.exit_batch = default_device_exit_batch,
};
/*
 *	Initialize the DEV module. At boot time this walks the device list and
 *	unhooks any devices that fail to initialise (normally hardware not
 *	present) and leaves us with a valid list of present and active devices.
 *
 */

/*
 *	This is called single threaded during boot, so no need
 *	to take the rtnl semaphore.
 */
static int __init net_dev_init(void)
{
	int i, rc = -ENOMEM;

	BUG_ON(!dev_boot_phase);

	if (dev_proc_init())
		goto out;

	if (netdev_kobject_init())
		goto out;

	INIT_LIST_HEAD(&ptype_all);
	for (i = 0; i < PTYPE_HASH_SIZE; i++)
		INIT_LIST_HEAD(&ptype_base[i]);

	INIT_LIST_HEAD(&offload_base);

	if (register_pernet_subsys(&netdev_net_ops))
		goto out;

	/*
	 *	Initialise the packet receive queues.
	 */

	for_each_possible_cpu(i) {
		struct softnet_data *sd = &per_cpu(softnet_data, i);

		memset(sd, 0, sizeof(*sd));
		skb_queue_head_init(&sd->input_pkt_queue);
		skb_queue_head_init(&sd->process_queue);
		sd->completion_queue = NULL;
		INIT_LIST_HEAD(&sd->poll_list);
		sd->output_queue = NULL;
		sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
		sd->csd.func = rps_trigger_softirq;
		sd->csd.info = sd;
		sd->csd.flags = 0;
		sd->cpu = i;
#endif

		sd->backlog.poll = process_backlog;
		sd->backlog.weight = weight_p;
		sd->backlog.gro_list = NULL;
		sd->backlog.gro_count = 0;
	}

	dev_boot_phase = 0;

	/* The loopback device is special: if any other network device
	 * is present in a network namespace, the loopback device must
	 * be present too. Since we now dynamically allocate and free the
	 * loopback device, ensure this invariant is maintained by
	 * keeping the loopback device as the first device on the
	 * list of network devices, so that it is the first device that
	 * appears and the last network device that disappears.
	 */
	if (register_pernet_device(&loopback_net_ops))
		goto out;

	if (register_pernet_device(&default_device_ops))
		goto out;

	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action);

	hotcpu_notifier(dev_cpu_callback, 0);
	dst_init();
	dev_mcast_init();
	rc = 0;
out:
	return rc;
}

subsys_initcall(net_dev_init);