net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <linux/bitops.h>
  77 #include <linux/capability.h>
  78 #include <linux/cpu.h>
  79 #include <linux/types.h>
  80 #include <linux/kernel.h>
  81 #include <linux/hash.h>
  82 #include <linux/slab.h>
  83 #include <linux/sched.h>
  84 #include <linux/mutex.h>
  85 #include <linux/string.h>
  86 #include <linux/mm.h>
  87 #include <linux/socket.h>
  88 #include <linux/sockios.h>
  89 #include <linux/errno.h>
  90 #include <linux/interrupt.h>
  91 #include <linux/if_ether.h>
  92 #include <linux/netdevice.h>
  93 #include <linux/etherdevice.h>
  94 #include <linux/ethtool.h>
  95 #include <linux/notifier.h>
  96 #include <linux/skbuff.h>
  97 #include <linux/bpf.h>
  98 #include <net/net_namespace.h>
  99 #include <net/sock.h>
 100 #include <net/busy_poll.h>
 101 #include <linux/rtnetlink.h>
 102 #include <linux/stat.h>
 103 #include <net/dst.h>
 104 #include <net/dst_metadata.h>
 105 #include <net/pkt_sched.h>
 106 #include <net/checksum.h>
 107 #include <net/xfrm.h>
 108 #include <linux/highmem.h>
 109 #include <linux/init.h>
 110 #include <linux/module.h>
 111 #include <linux/netpoll.h>
 112 #include <linux/rcupdate.h>
 113 #include <linux/delay.h>
 114 #include <net/iw_handler.h>
 115 #include <asm/current.h>
 116 #include <linux/audit.h>
 117 #include <linux/dmaengine.h>
 118 #include <linux/err.h>
 119 #include <linux/ctype.h>
 120 #include <linux/if_arp.h>
 121 #include <linux/if_vlan.h>
 122 #include <linux/ip.h>
 123 #include <net/ip.h>
 124 #include <net/mpls.h>
 125 #include <linux/ipv6.h>
 126 #include <linux/in.h>
 127 #include <linux/jhash.h>
 128 #include <linux/random.h>
 129 #include <trace/events/napi.h>
 130 #include <trace/events/net.h>
 131 #include <trace/events/skb.h>
 132 #include <linux/pci.h>
 133 #include <linux/inetdevice.h>
 134 #include <linux/cpu_rmap.h>
 135 #include <linux/static_key.h>
 136 #include <linux/hashtable.h>
 137 #include <linux/vmalloc.h>
 138 #include <linux/if_macvlan.h>
 139 #include <linux/errqueue.h>
 140 #include <linux/hrtimer.h>
 141 #include <linux/netfilter_ingress.h>
 142 #include <linux/crash_dump.h>
 143
 144 #include "net-sysfs.h"
 145
  146 /* Instead of increasing this, you should create a hash table. */
      /* NOTE(review): presumably the cap on skbs GRO holds at once; the users are outside this view - confirm. */
  147 #define MAX_GRO_SKBS 8
  148
  149 /* This should be increased if a protocol with a bigger head is added. */
      /* Defined as MAX_HEADER plus a fixed 128 of headroom. */
  150 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 151
      /* ptype_lock is taken by dev_add_pack()/__dev_remove_pack() around packet_type list changes. */
  152 static DEFINE_SPINLOCK(ptype_lock);
      /* offload_lock is taken by dev_add_offload()/__dev_remove_offload() around offload_base changes. */
  153 static DEFINE_SPINLOCK(offload_lock);
      /* Handlers not bound to a device, bucketed by ntohs(type) & PTYPE_HASH_MASK (see ptype_head()). */
  154 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
  155 struct list_head ptype_all __read_mostly;       /* Taps: ETH_P_ALL handlers not bound to a device */
      /* List of registered packet_offload entries; dev_add_offload() inserts by ->priority. */
  156 static struct list_head offload_base __read_mostly;
 157
      /* Forward declarations of file-local (static) helpers; their definitions are not in this part of the file. */
  158 static int netif_rx_internal(struct sk_buff *skb);
  159 static int call_netdevice_notifiers_info(unsigned long val,
  160                                          struct net_device *dev,
  161                                          struct netdev_notifier_info *info);
 162
 163 /*
 164  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 165  * semaphore.
 166  *
 167  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 168  *
 169  * Writers must hold the rtnl semaphore while they loop through the
 170  * dev_base_head list, and hold dev_base_lock for writing when they do the
 171  * actual updates.  This allows pure readers to access the list even
 172  * while a writer is preparing to update it.
 173  *
 174  * To put it another way, dev_base_lock is held for writing only to
 175  * protect against pure readers; the rtnl semaphore provides the
 176  * protection against other writers.
 177  *
 178  * See, for example usages, register_netdevice() and
 179  * unregister_netdevice(), which must be called with the rtnl
 180  * semaphore held.
 181  */
      /* Write-held (with _bh) by list_netdevice()/unlist_netdevice() while they relink a device. */
  182 DEFINE_RWLOCK(dev_base_lock);
  183 EXPORT_SYMBOL(dev_base_lock);
  184
  185 /* protects napi_hash addition/deletion and napi_gen_id */
  186 static DEFINE_SPINLOCK(napi_hash_lock);
  187
      /* Starts at NR_CPUS. NOTE(review): presumably so smaller ids stay reserved - confirm at the users. */
  188 static unsigned int napi_gen_id = NR_CPUS;
      /* Declared with a size argument of 8. NOTE(review): presumably 2^8 buckets - confirm against linux/hashtable.h. */
  189 static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
  190
      /* NOTE(review): presumably a sequence counter around device renames; its users are outside this view. */
  191 static seqcount_t devnet_rename_seq;
 192
 193 static inline void dev_base_seq_inc(struct net *net)
 194 {
 195         while (++net->dev_base_seq == 0);
 196 }
 197
 198 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 199 {
 200         unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
 201
 202         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 203 }
 204
 205 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 206 {
 207         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 208 }
 209
 210 static inline void rps_lock(struct softnet_data *sd)
 211 {
 212 #ifdef CONFIG_RPS
 213         spin_lock(&sd->input_pkt_queue.lock);
 214 #endif
 215 }
 216
 217 static inline void rps_unlock(struct softnet_data *sd)
 218 {
 219 #ifdef CONFIG_RPS
 220         spin_unlock(&sd->input_pkt_queue.lock);
 221 #endif
 222 }
 223
 224 /* Device list insertion */
 225 static void list_netdevice(struct net_device *dev)
 226 {
 227         struct net *net = dev_net(dev);
 228
 229         ASSERT_RTNL();
 230
 231         write_lock_bh(&dev_base_lock);
 232         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 233         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 234         hlist_add_head_rcu(&dev->index_hlist,
 235                            dev_index_hash(net, dev->ifindex));
 236         write_unlock_bh(&dev_base_lock);
 237
 238         dev_base_seq_inc(net);
 239 }
 240
 241 /* Device list removal
 242  * caller must respect a RCU grace period before freeing/reusing dev
 243  */
 244 static void unlist_netdevice(struct net_device *dev)
 245 {
 246         ASSERT_RTNL();
 247
 248         /* Unlink dev from the device chain */
 249         write_lock_bh(&dev_base_lock);
 250         list_del_rcu(&dev->dev_list);
 251         hlist_del_rcu(&dev->name_hlist);
 252         hlist_del_rcu(&dev->index_hlist);
 253         write_unlock_bh(&dev_base_lock);
 254
 255         dev_base_seq_inc(dev_net(dev));
 256 }
 257
  258 /*
  259  *      Our notifier list (a raw notifier head, private to this file).
  260  */
  261
  262 static RAW_NOTIFIER_HEAD(netdev_chain);
  263
  264 /*
  265  *      Device drivers call our routines to queue packets here. We empty the
  266  *      queue in the local softnet handler.
  267  *
  267  *      Defined as a per-CPU variable and exported as a per-CPU symbol.
  267  */
  268
  269 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
  270 EXPORT_PER_CPU_SYMBOL(softnet_data);
 271
 272 #ifdef CONFIG_LOCKDEP
  273 /*
  274  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
  275  * according to dev->type
  275  *
  275  * netdev_lock_type[] and netdev_lock_name[] are parallel tables: the
  275  * index that netdev_lock_pos() finds in the first is used to pick the
  275  * name from the second, so both must keep the same order and length.
  275  * The last entry (ARPHRD_NONE) doubles as the default for device types
  275  * that are not listed.
  276  */
  277 static const unsigned short netdev_lock_type[] =
  278         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
  279          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
  280          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
  281          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
  282          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
  283          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
  284          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
  285          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
  286          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
  287          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
  288          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
  289          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
  290          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
  291          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
  292          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
  293
      /* Lock-class names, index-aligned with netdev_lock_type[] above. */
  294 static const char *const netdev_lock_name[] =
  295         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
  296          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
  297          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
  298          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
  299          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
  300          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
  301          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
  302          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
  303          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
  304          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
  305          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
  306          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
  307          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
  308          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
  309          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
  310
      /* One lockdep class key per table entry, kept separately for xmit locks and address-list locks. */
  311 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
  312 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 313
 314 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 315 {
 316         int i;
 317
 318         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 319                 if (netdev_lock_type[i] == dev_type)
 320                         return i;
 321         /* the last key is used by default */
 322         return ARRAY_SIZE(netdev_lock_type) - 1;
 323 }
 324
 325 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 326                                                  unsigned short dev_type)
 327 {
 328         int i;
 329
 330         i = netdev_lock_pos(dev_type);
 331         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 332                                    netdev_lock_name[i]);
 333 }
 334
 335 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 336 {
 337         int i;
 338
 339         i = netdev_lock_pos(dev->type);
 340         lockdep_set_class_and_name(&dev->addr_list_lock,
 341                                    &netdev_addr_lock_key[i],
 342                                    netdev_lock_name[i]);
 343 }
 344 #else
 345 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 346                                                  unsigned short dev_type)
 347 {
 348 }
 349 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 350 {
 351 }
 352 #endif
 353
 354 /*******************************************************************************
 355
 356                 Protocol management and registration routines
 357
 358 *******************************************************************************/
 359
 360 /*
 361  *      Add a protocol ID to the list. Now that the input handler is
 362  *      smarter we can dispense with all the messy stuff that used to be
 363  *      here.
 364  *
 365  *      BEWARE!!! Protocol handlers, mangling input packets,
 366  *      MUST BE last in hash buckets and checking protocol handlers
 367  *      MUST start from promiscuous ptype_all chain in net_bh.
 368  *      It is true now, do not change it.
 369  *      Explanation follows: if protocol handler, mangling packet, will
 370  *      be the first on list, it is not able to sense, that packet
 371  *      is cloned and should be copied-on-write, so that it will
 372  *      change it and subsequent readers will get broken packet.
 373  *                                                      --ANK (980803)
 374  */
 375
 376 static inline struct list_head *ptype_head(const struct packet_type *pt)
 377 {
 378         if (pt->type == htons(ETH_P_ALL))
 379                 return pt->dev ? &pt->dev->ptype_all : &ptype_all;
 380         else
 381                 return pt->dev ? &pt->dev->ptype_specific :
 382                                  &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 383 }
 384
 385 /**
 386  *      dev_add_pack - add packet handler
 387  *      @pt: packet type declaration
 388  *
 389  *      Add a protocol handler to the networking stack. The passed &packet_type
 390  *      is linked into kernel lists and may not be freed until it has been
 391  *      removed from the kernel lists.
 392  *
 393  *      This call does not sleep therefore it can not
 394  *      guarantee all CPU's that are in middle of receiving packets
 395  *      will see the new packet type (until the next received packet).
 396  */
 397
 398 void dev_add_pack(struct packet_type *pt)
 399 {
 400         struct list_head *head = ptype_head(pt);
 401
 402         spin_lock(&ptype_lock);
 403         list_add_rcu(&pt->list, head);
 404         spin_unlock(&ptype_lock);
 405 }
 406 EXPORT_SYMBOL(dev_add_pack);
 407
 408 /**
 409  *      __dev_remove_pack        - remove packet handler
 410  *      @pt: packet type declaration
 411  *
 412  *      Remove a protocol handler that was previously added to the kernel
 413  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 414  *      from the kernel lists and can be freed or reused once this function
 415  *      returns.
 416  *
 417  *      The packet type might still be in use by receivers
 418  *      and must not be freed until after all the CPU's have gone
 419  *      through a quiescent state.
 420  */
 421 void __dev_remove_pack(struct packet_type *pt)
 422 {
 423         struct list_head *head = ptype_head(pt);
 424         struct packet_type *pt1;
 425
 426         spin_lock(&ptype_lock);
 427
 428         list_for_each_entry(pt1, head, list) {
 429                 if (pt == pt1) {
 430                         list_del_rcu(&pt->list);
 431                         goto out;
 432                 }
 433         }
 434
 435         pr_warn("dev_remove_pack: %p not found\n", pt);
 436 out:
 437         spin_unlock(&ptype_lock);
 438 }
 439 EXPORT_SYMBOL(__dev_remove_pack);
 440
 441 /**
 442  *      dev_remove_pack  - remove packet handler
 443  *      @pt: packet type declaration
 444  *
 445  *      Remove a protocol handler that was previously added to the kernel
 446  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 447  *      from the kernel lists and can be freed or reused once this function
 448  *      returns.
 449  *
 450  *      This call sleeps to guarantee that no CPU is looking at the packet
 451  *      type after return.
 452  */
 453 void dev_remove_pack(struct packet_type *pt)
 454 {
 455         __dev_remove_pack(pt);
 456
 457         synchronize_net();
 458 }
 459 EXPORT_SYMBOL(dev_remove_pack);
 460
 461
 462 /**
 463  *      dev_add_offload - register offload handlers
 464  *      @po: protocol offload declaration
 465  *
 466  *      Add protocol offload handlers to the networking stack. The passed
 467  *      &proto_offload is linked into kernel lists and may not be freed until
 468  *      it has been removed from the kernel lists.
 469  *
 470  *      This call does not sleep therefore it can not
 471  *      guarantee all CPU's that are in middle of receiving packets
 472  *      will see the new offload handlers (until the next received packet).
 473  */
 474 void dev_add_offload(struct packet_offload *po)
 475 {
 476         struct packet_offload *elem;
 477
 478         spin_lock(&offload_lock);
 479         list_for_each_entry(elem, &offload_base, list) {
 480                 if (po->priority < elem->priority)
 481                         break;
 482         }
 483         list_add_rcu(&po->list, elem->list.prev);
 484         spin_unlock(&offload_lock);
 485 }
 486 EXPORT_SYMBOL(dev_add_offload);
 487
 488 /**
 489  *      __dev_remove_offload     - remove offload handler
 490  *      @po: packet offload declaration
 491  *
 492  *      Remove a protocol offload handler that was previously added to the
 493  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 494  *      is removed from the kernel lists and can be freed or reused once this
 495  *      function returns.
 496  *
 497  *      The packet type might still be in use by receivers
 498  *      and must not be freed until after all the CPU's have gone
 499  *      through a quiescent state.
 500  */
 501 static void __dev_remove_offload(struct packet_offload *po)
 502 {
 503         struct list_head *head = &offload_base;
 504         struct packet_offload *po1;
 505
 506         spin_lock(&offload_lock);
 507
 508         list_for_each_entry(po1, head, list) {
 509                 if (po == po1) {
 510                         list_del_rcu(&po->list);
 511                         goto out;
 512                 }
 513         }
 514
 515         pr_warn("dev_remove_offload: %p not found\n", po);
 516 out:
 517         spin_unlock(&offload_lock);
 518 }
 519
 520 /**
 521  *      dev_remove_offload       - remove packet offload handler
 522  *      @po: packet offload declaration
 523  *
 524  *      Remove a packet offload handler that was previously added to the kernel
 525  *      offload handlers by dev_add_offload(). The passed &offload_type is
 526  *      removed from the kernel lists and can be freed or reused once this
 527  *      function returns.
 528  *
 529  *      This call sleeps to guarantee that no CPU is looking at the packet
 530  *      type after return.
 531  */
 532 void dev_remove_offload(struct packet_offload *po)
 533 {
 534         __dev_remove_offload(po);
 535
 536         synchronize_net();
 537 }
 538 EXPORT_SYMBOL(dev_remove_offload);
 539
 540 /******************************************************************************
 541
 542                       Device Boot-time Settings Routines
 543
 544 *******************************************************************************/
 545
  546 /* Boot time configuration table */
      /* A slot counts as free while its name starts with '\0' or ' ' (see netdev_boot_setup_add()). */
  547 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 548
 549 /**
 550  *      netdev_boot_setup_add   - add new setup entry
 551  *      @name: name of the device
 552  *      @map: configured settings for the device
 553  *
 554  *      Adds new setup entry to the dev_boot_setup list.  The function
 555  *      returns 0 on error and 1 on success.  This is a generic routine to
 556  *      all netdevices.
 557  */
 558 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 559 {
 560         struct netdev_boot_setup *s;
 561         int i;
 562
 563         s = dev_boot_setup;
 564         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 565                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 566                         memset(s[i].name, 0, sizeof(s[i].name));
 567                         strlcpy(s[i].name, name, IFNAMSIZ);
 568                         memcpy(&s[i].map, map, sizeof(s[i].map));
 569                         break;
 570                 }
 571         }
 572
 573         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 574 }
 575
 576 /**
 577  *      netdev_boot_setup_check - check boot time settings
 578  *      @dev: the netdevice
 579  *
 580  *      Check boot time settings for the device.
 581  *      The found settings are set for the device to be used
 582  *      later in the device probing.
 583  *      Returns 0 if no settings found, 1 if they are.
 584  */
 585 int netdev_boot_setup_check(struct net_device *dev)
 586 {
 587         struct netdev_boot_setup *s = dev_boot_setup;
 588         int i;
 589
 590         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 591                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 592                     !strcmp(dev->name, s[i].name)) {
 593                         dev->irq        = s[i].map.irq;
 594                         dev->base_addr  = s[i].map.base_addr;
 595                         dev->mem_start  = s[i].map.mem_start;
 596                         dev->mem_end    = s[i].map.mem_end;
 597                         return 1;
 598                 }
 599         }
 600         return 0;
 601 }
 602 EXPORT_SYMBOL(netdev_boot_setup_check);
 603
 604
 605 /**
 606  *      netdev_boot_base        - get address from boot time settings
 607  *      @prefix: prefix for network device
 608  *      @unit: id for network device
 609  *
 610  *      Check boot time settings for the base address of device.
 611  *      The found settings are set for the device to be used
 612  *      later in the device probing.
 613  *      Returns 0 if no settings found.
 614  */
 615 unsigned long netdev_boot_base(const char *prefix, int unit)
 616 {
 617         const struct netdev_boot_setup *s = dev_boot_setup;
 618         char name[IFNAMSIZ];
 619         int i;
 620
 621         sprintf(name, "%s%d", prefix, unit);
 622
 623         /*
 624          * If device already registered then return base of 1
 625          * to indicate not to probe for this interface
 626          */
 627         if (__dev_get_by_name(&init_net, name))
 628                 return 1;
 629
 630         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 631                 if (!strcmp(name, s[i].name))
 632                         return s[i].map.base_addr;
 633         return 0;
 634 }
 635
 636 /*
 637  * Saves at boot time configured settings for any netdevice.
 638  */
 639 int __init netdev_boot_setup(char *str)
 640 {
 641         int ints[5];
 642         struct ifmap map;
 643
 644         str = get_options(str, ARRAY_SIZE(ints), ints);
 645         if (!str || !*str)
 646                 return 0;
 647
 648         /* Save settings */
 649         memset(&map, 0, sizeof(map));
 650         if (ints[0] > 0)
 651                 map.irq = ints[1];
 652         if (ints[0] > 1)
 653                 map.base_addr = ints[2];
 654         if (ints[0] > 2)
 655                 map.mem_start = ints[3];
 656         if (ints[0] > 3)
 657                 map.mem_end = ints[4];
 658
 659         /* Add new entry to the list */
 660         return netdev_boot_setup_add(str, &map);
 661 }
 662
 663 __setup("netdev=", netdev_boot_setup);
 664
 665 /*******************************************************************************
 666
 667                             Device Interface Subroutines
 668
 669 *******************************************************************************/
 670
 671 /**
 672  *      dev_get_iflink  - get 'iflink' value of a interface
 673  *      @dev: targeted interface
 674  *
 675  *      Indicates the ifindex the interface is linked to.
 676  *      Physical interfaces have the same 'ifindex' and 'iflink' values.
 677  */
 678
 679 int dev_get_iflink(const struct net_device *dev)
 680 {
 681         if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
 682                 return dev->netdev_ops->ndo_get_iflink(dev);
 683
 684         return dev->ifindex;
 685 }
 686 EXPORT_SYMBOL(dev_get_iflink);
 687
 688 /**
 689  *      dev_fill_metadata_dst - Retrieve tunnel egress information.
 690  *      @dev: targeted interface
 691  *      @skb: The packet.
 692  *
 693  *      For better visibility of tunnel traffic OVS needs to retrieve
 694  *      egress tunnel information for a packet. Following API allows
 695  *      user to get this info.
 696  */
 697 int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 698 {
 699         struct ip_tunnel_info *info;
 700
 701         if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
 702                 return -EINVAL;
 703
 704         info = skb_tunnel_info_unclone(skb);
 705         if (!info)
 706                 return -ENOMEM;
 707         if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
 708                 return -EINVAL;
 709
 710         return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
 711 }
 712 EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
 713
 714 /**
 715  *      __dev_get_by_name       - find a device by its name
 716  *      @net: the applicable net namespace
 717  *      @name: name to find
 718  *
 719  *      Find an interface by name. Must be called under RTNL semaphore
 720  *      or @dev_base_lock. If the name is found a pointer to the device
 721  *      is returned. If the name is not found then %NULL is returned. The
 722  *      reference counters are not incremented so the caller must be
 723  *      careful with locks.
 724  */
 725
 726 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 727 {
 728         struct net_device *dev;
 729         struct hlist_head *head = dev_name_hash(net, name);
 730
 731         hlist_for_each_entry(dev, head, name_hlist)
 732                 if (!strncmp(dev->name, name, IFNAMSIZ))
 733                         return dev;
 734
 735         return NULL;
 736 }
 737 EXPORT_SYMBOL(__dev_get_by_name);
 738
 739 /**
 740  *      dev_get_by_name_rcu     - find a device by its name
 741  *      @net: the applicable net namespace
 742  *      @name: name to find
 743  *
 744  *      Find an interface by name.
 745  *      If the name is found a pointer to the device is returned.
 746  *      If the name is not found then %NULL is returned.
 747  *      The reference counters are not incremented so the caller must be
 748  *      careful with locks. The caller must hold RCU lock.
 749  */
 750
 751 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 752 {
 753         struct net_device *dev;
 754         struct hlist_head *head = dev_name_hash(net, name);
 755
 756         hlist_for_each_entry_rcu(dev, head, name_hlist)
 757                 if (!strncmp(dev->name, name, IFNAMSIZ))
 758                         return dev;
 759
 760         return NULL;
 761 }
 762 EXPORT_SYMBOL(dev_get_by_name_rcu);
 763
 764 /**
 765  *      dev_get_by_name         - find a device by its name
 766  *      @net: the applicable net namespace
 767  *      @name: name to find
 768  *
 769  *      Find an interface by name. This can be called from any
 770  *      context and does its own locking. The returned handle has
 771  *      the usage count incremented and the caller must use dev_put() to
 772  *      release it when it is no longer needed. %NULL is returned if no
 773  *      matching device is found.
 774  */
 775
 776 struct net_device *dev_get_by_name(struct net *net, const char *name)
 777 {
 778         struct net_device *dev;
 779
 780         rcu_read_lock();
 781         dev = dev_get_by_name_rcu(net, name);
 782         if (dev)
 783                 dev_hold(dev);
 784         rcu_read_unlock();
 785         return dev;
 786 }
 787 EXPORT_SYMBOL(dev_get_by_name);
 788
 789 /**
 790  *      __dev_get_by_index - find a device by its ifindex
 791  *      @net: the applicable net namespace
 792  *      @ifindex: index of device
 793  *
 794  *      Search for an interface by index. Returns %NULL if the device
 795  *      is not found or a pointer to the device. The device has not
 796  *      had its reference counter increased so the caller must be careful
 797  *      about locking. The caller must hold either the RTNL semaphore
 798  *      or @dev_base_lock.
 799  */
 800
 801 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 802 {
 803         struct net_device *dev;
 804         struct hlist_head *head = dev_index_hash(net, ifindex);
 805
 806         hlist_for_each_entry(dev, head, index_hlist)
 807                 if (dev->ifindex == ifindex)
 808                         return dev;
 809
 810         return NULL;
 811 }
 812 EXPORT_SYMBOL(__dev_get_by_index);
 813
 814 /**
 815  *      dev_get_by_index_rcu - find a device by its ifindex
 816  *      @net: the applicable net namespace
 817  *      @ifindex: index of device
 818  *
 819  *      Search for an interface by index. Returns %NULL if the device
 820  *      is not found or a pointer to the device. The device has not
 821  *      had its reference counter increased so the caller must be careful
 822  *      about locking. The caller must hold RCU lock.
 823  */
 824
 825 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 826 {
 827         struct net_device *dev;
 828         struct hlist_head *head = dev_index_hash(net, ifindex);
 829
 830         hlist_for_each_entry_rcu(dev, head, index_hlist)
 831                 if (dev->ifindex == ifindex)
 832                         return dev;
 833
 834         return NULL;
 835 }
 836 EXPORT_SYMBOL(dev_get_by_index_rcu);
 837
 838
 839 /**
 840  *      dev_get_by_index - find a device by its ifindex
 841  *      @net: the applicable net namespace
 842  *      @ifindex: index of device
 843  *
 844  *      Search for an interface by index. Returns NULL if the device
 845  *      is not found or a pointer to the device. The device returned has
 846  *      had a reference added and the pointer is safe until the user calls
 847  *      dev_put to indicate they have finished with it.
 848  */
 849
 850 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 851 {
 852         struct net_device *dev;
 853
 854         rcu_read_lock();
 855         dev = dev_get_by_index_rcu(net, ifindex);
 856         if (dev)
 857                 dev_hold(dev);
 858         rcu_read_unlock();
 859         return dev;
 860 }
 861 EXPORT_SYMBOL(dev_get_by_index);
 862
 863 /**
 864  *      netdev_get_name - get a netdevice name, knowing its ifindex.
 865  *      @net: network namespace
 866  *      @name: a pointer to the buffer where the name will be stored.
 867  *      @ifindex: the ifindex of the interface to get the name from.
 868  *
 869  *      The use of raw_seqcount_begin() and cond_resched() before
 870  *      retrying is required as we want to give the writers a chance
 871  *      to complete when CONFIG_PREEMPT is not set.
 872  */
 873 int netdev_get_name(struct net *net, char *name, int ifindex)
 874 {
 875         struct net_device *dev;
 876         unsigned int seq;
 877
 878 retry:
 879         seq = raw_seqcount_begin(&devnet_rename_seq);
 880         rcu_read_lock();
 881         dev = dev_get_by_index_rcu(net, ifindex);
 882         if (!dev) {
 883                 rcu_read_unlock();
 884                 return -ENODEV;
 885         }
 886
 887         strcpy(name, dev->name);
 888         rcu_read_unlock();
 889         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 890                 cond_resched();
 891                 goto retry;
 892         }
 893
 894         return 0;
 895 }
 896
 897 /**
 898  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 899  *      @net: the applicable net namespace
 900  *      @type: media type of device
 901  *      @ha: hardware address
 902  *
 903  *      Search for an interface by MAC address. Returns NULL if the device
 904  *      is not found or a pointer to the device.
 905  *      The caller must hold RCU or RTNL.
 906  *      The returned device has not had its ref count increased
 907  *      and the caller must therefore be careful about locking
 908  *
 909  */
 910
 911 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 912                                        const char *ha)
 913 {
 914         struct net_device *dev;
 915
 916         for_each_netdev_rcu(net, dev)
 917                 if (dev->type == type &&
 918                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 919                         return dev;
 920
 921         return NULL;
 922 }
 923 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 924
 925 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 926 {
 927         struct net_device *dev;
 928
 929         ASSERT_RTNL();
 930         for_each_netdev(net, dev)
 931                 if (dev->type == type)
 932                         return dev;
 933
 934         return NULL;
 935 }
 936 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 937
 938 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 939 {
 940         struct net_device *dev, *ret = NULL;
 941
 942         rcu_read_lock();
 943         for_each_netdev_rcu(net, dev)
 944                 if (dev->type == type) {
 945                         dev_hold(dev);
 946                         ret = dev;
 947                         break;
 948                 }
 949         rcu_read_unlock();
 950         return ret;
 951 }
 952 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 953
 954 /**
 955  *      __dev_get_by_flags - find any device with given flags
 956  *      @net: the applicable net namespace
 957  *      @if_flags: IFF_* values
 958  *      @mask: bitmask of bits in if_flags to check
 959  *
 960  *      Search for any interface with the given flags. Returns NULL if a device
 961  *      is not found or a pointer to the device. Must be called inside
 962  *      rtnl_lock(), and result refcount is unchanged.
 963  */
 964
 965 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 966                                       unsigned short mask)
 967 {
 968         struct net_device *dev, *ret;
 969
 970         ASSERT_RTNL();
 971
 972         ret = NULL;
 973         for_each_netdev(net, dev) {
 974                 if (((dev->flags ^ if_flags) & mask) == 0) {
 975                         ret = dev;
 976                         break;
 977                 }
 978         }
 979         return ret;
 980 }
 981 EXPORT_SYMBOL(__dev_get_by_flags);
 982
 983 /**
 984  *      dev_valid_name - check if name is okay for network device
 985  *      @name: name string
 986  *
 987  *      Network device names need to be valid file names to
 988  *      to allow sysfs to work.  We also disallow any kind of
 989  *      whitespace.
 990  */
 991 bool dev_valid_name(const char *name)
 992 {
 993         if (*name == '\0')
 994                 return false;
 995         if (strlen(name) >= IFNAMSIZ)
 996                 return false;
 997         if (!strcmp(name, ".") || !strcmp(name, ".."))
 998                 return false;
 999
1000         while (*name) {
1001                 if (*name == '/' || *name == ':' || isspace(*name))
1002                         return false;
1003                 name++;
1004         }
1005         return true;
1006 }
1007 EXPORT_SYMBOL(dev_valid_name);
1008
/**
 *      __dev_alloc_name - allocate a name for a device
 *      @net: network namespace to allocate the device name in
 *      @name: name format string
 *      @buf:  scratch buffer and result name string
 *
 *      Passed a format string - eg "lt%d" it will try and find a suitable
 *      id. It scans list of devices to build up a free map, then chooses
 *      the first empty slot. The caller must hold the dev_base or rtnl lock
 *      while allocating the name and adding the device in order to avoid
 *      duplicates.
 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *      Returns the number of the unit assigned or a negative errno code:
 *      -EINVAL for a malformed format, -ENOMEM if the bitmap page cannot
 *      be allocated, -ENFILE if the resulting name is already in use.
 *      When @name contains no '%' the unit number returned is 0.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
        int i = 0;
        const char *p;
        const int max_netdevices = 8*PAGE_SIZE;
        unsigned long *inuse;
        struct net_device *d;

        /* Only the first IFNAMSIZ-1 characters are searched for a '%'. */
        p = strnchr(name, IFNAMSIZ-1, '%');
        if (p) {
                /*
                 * Verify the string as this thing may have come from
                 * the user.  There must be exactly one "%d" and no other "%"
                 * characters.
                 */
                if (p[1] != 'd' || strchr(p + 2, '%'))
                        return -EINVAL;

                /* Use one page as a bit array of possible slots */
                inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
                if (!inuse)
                        return -ENOMEM;

                for_each_netdev(net, d) {
                        /* Skip devices whose name does not parse with this format. */
                        if (!sscanf(d->name, name, &i))
                                continue;
                        /* Units outside the bitmap cannot be recorded. */
                        if (i < 0 || i >= max_netdevices)
                                continue;

                        /*  avoid cases where sscanf is not exact inverse of printf */
                        snprintf(buf, IFNAMSIZ, name, i);
                        if (!strncmp(buf, d->name, IFNAMSIZ))
                                set_bit(i, inuse);
                }

                /* Lowest unit number not claimed by an existing device. */
                i = find_first_zero_bit(inuse, max_netdevices);
                free_page((unsigned long) inuse);
        }

        /* Render the candidate name unless the caller passed @buf as @name. */
        if (buf != name)
                snprintf(buf, IFNAMSIZ, name, i);
        if (!__dev_get_by_name(net, buf))
                return i;

        /* It is possible to run out of possible slots
         * when the name is long and there isn't enough space left
         * for the digits, or if all bits are used.
         */
        return -ENFILE;
}
1074
1075 /**
1076  *      dev_alloc_name - allocate a name for a device
1077  *      @dev: device
1078  *      @name: name format string
1079  *
1080  *      Passed a format string - eg "lt%d" it will try and find a suitable
1081  *      id. It scans list of devices to build up a free map, then chooses
1082  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1083  *      while allocating the name and adding the device in order to avoid
1084  *      duplicates.
1085  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1086  *      Returns the number of the unit assigned or a negative errno code.
1087  */
1088
1089 int dev_alloc_name(struct net_device *dev, const char *name)
1090 {
1091         char buf[IFNAMSIZ];
1092         struct net *net;
1093         int ret;
1094
1095         BUG_ON(!dev_net(dev));
1096         net = dev_net(dev);
1097         ret = __dev_alloc_name(net, name, buf);
1098         if (ret >= 0)
1099                 strlcpy(dev->name, buf, IFNAMSIZ);
1100         return ret;
1101 }
1102 EXPORT_SYMBOL(dev_alloc_name);
1103
1104 static int dev_alloc_name_ns(struct net *net,
1105                              struct net_device *dev,
1106                              const char *name)
1107 {
1108         char buf[IFNAMSIZ];
1109         int ret;
1110
1111         ret = __dev_alloc_name(net, name, buf);
1112         if (ret >= 0)
1113                 strlcpy(dev->name, buf, IFNAMSIZ);
1114         return ret;
1115 }
1116
1117 static int dev_get_valid_name(struct net *net,
1118                               struct net_device *dev,
1119                               const char *name)
1120 {
1121         BUG_ON(!net);
1122
1123         if (!dev_valid_name(name))
1124                 return -EINVAL;
1125
1126         if (strchr(name, '%'))
1127                 return dev_alloc_name_ns(net, dev, name);
1128         else if (__dev_get_by_name(net, name))
1129                 return -EEXIST;
1130         else if (dev->name != name)
1131                 strlcpy(dev->name, name, IFNAMSIZ);
1132
1133         return 0;
1134 }
1135
/**
 *      dev_change_name - change name of a device
 *      @dev: device
 *      @newname: name (or format string); the buffer must be at least
 *                IFNAMSIZ bytes, because the rollback path copies IFNAMSIZ
 *                bytes out of it
 *
 *      Change name of a device. Format strings such as "eth%d" can be
 *      passed for wildcarding.
 *
 *      Returns -EBUSY if the device is up, 0 if @newname equals the current
 *      name, and otherwise a negative errno on failure or the non-negative
 *      result of dev_get_valid_name() on success. If a NETDEV_CHANGENAME
 *      notifier rejects the change, the old name is restored and the
 *      notifier's errno is returned.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
        unsigned char old_assign_type;
        char oldname[IFNAMSIZ];
        int err = 0;
        int ret;
        struct net *net;

        ASSERT_RTNL();
        BUG_ON(!dev_net(dev));

        net = dev_net(dev);
        if (dev->flags & IFF_UP)
                return -EBUSY;

        /* Readers such as netdev_get_name() retry while this section is open. */
        write_seqcount_begin(&devnet_rename_seq);

        if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
                write_seqcount_end(&devnet_rename_seq);
                return 0;
        }

        /* Keep the current name so it can be restored on failure. */
        memcpy(oldname, dev->name, IFNAMSIZ);

        /* On success this has already written the new name into dev->name. */
        err = dev_get_valid_name(net, dev, newname);
        if (err < 0) {
                write_seqcount_end(&devnet_rename_seq);
                return err;
        }

        if (oldname[0] && !strchr(oldname, '%'))
                netdev_info(dev, "renamed from %s\n", oldname);

        old_assign_type = dev->name_assign_type;
        dev->name_assign_type = NET_NAME_RENAMED;

        /*
         * Entered once for the forward rename and, if a notifier vetoes it,
         * a second time with dev->name and oldname swapped to undo it.
         */
rollback:
        ret = device_rename(&dev->dev, dev->name);
        if (ret) {
                memcpy(dev->name, oldname, IFNAMSIZ);
                dev->name_assign_type = old_assign_type;
                write_seqcount_end(&devnet_rename_seq);
                return ret;
        }

        write_seqcount_end(&devnet_rename_seq);

        netdev_adjacent_rename_links(dev, oldname);

        /* Unhash under the old name ... */
        write_lock_bh(&dev_base_lock);
        hlist_del_rcu(&dev->name_hlist);
        write_unlock_bh(&dev_base_lock);

        /* ... wait for RCU readers before the node is linked again ... */
        synchronize_rcu();

        /* ... and rehash under the name now stored in dev->name. */
        write_lock_bh(&dev_base_lock);
        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
        write_unlock_bh(&dev_base_lock);

        ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
        ret = notifier_to_errno(ret);

        if (ret) {
                /* err >= 0 after dev_alloc_name() or stores the first errno */
                if (err >= 0) {
                        /*
                         * First veto: remember the errno, put the old name
                         * back and run the rename sequence again.
                         */
                        err = ret;
                        write_seqcount_begin(&devnet_rename_seq);
                        memcpy(dev->name, oldname, IFNAMSIZ);
                        memcpy(oldname, newname, IFNAMSIZ);
                        dev->name_assign_type = old_assign_type;
                        old_assign_type = NET_NAME_RENAMED;
                        goto rollback;
                } else {
                        /* The notifier failed again during the rollback pass. */
                        pr_err("%s: name change rollback failed: %d\n",
                               dev->name, ret);
                }
        }

        return err;
}
1224
1225 /**
1226  *      dev_set_alias - change ifalias of a device
1227  *      @dev: device
1228  *      @alias: name up to IFALIASZ
1229  *      @len: limit of bytes to copy from info
1230  *
1231  *      Set ifalias for a device,
1232  */
1233 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1234 {
1235         char *new_ifalias;
1236
1237         ASSERT_RTNL();
1238
1239         if (len >= IFALIASZ)
1240                 return -EINVAL;
1241
1242         if (!len) {
1243                 kfree(dev->ifalias);
1244                 dev->ifalias = NULL;
1245                 return 0;
1246         }
1247
1248         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1249         if (!new_ifalias)
1250                 return -ENOMEM;
1251         dev->ifalias = new_ifalias;
1252
1253         strlcpy(dev->ifalias, alias, len+1);
1254         return len;
1255 }
1256
1257
1258 /**
1259  *      netdev_features_change - device changes features
1260  *      @dev: device to cause notification
1261  *
1262  *      Called to indicate a device has changed features.
1263  */
1264 void netdev_features_change(struct net_device *dev)
1265 {
1266         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1267 }
1268 EXPORT_SYMBOL(netdev_features_change);
1269
1270 /**
1271  *      netdev_state_change - device changes state
1272  *      @dev: device to cause notification
1273  *
1274  *      Called to indicate a device has changed state. This function calls
1275  *      the notifier chains for netdev_chain and sends a NEWLINK message
1276  *      to the routing socket.
1277  */
1278 void netdev_state_change(struct net_device *dev)
1279 {
1280         if (dev->flags & IFF_UP) {
1281                 struct netdev_notifier_change_info change_info;
1282
1283                 change_info.flags_changed = 0;
1284                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1285                                               &change_info.info);
1286                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1287         }
1288 }
1289 EXPORT_SYMBOL(netdev_state_change);
1290
1291 /**
1292  *      netdev_notify_peers - notify network peers about existence of @dev
1293  *      @dev: network device
1294  *
1295  * Generate traffic such that interested network peers are aware of
1296  * @dev, such as by generating a gratuitous ARP. This may be used when
1297  * a device wants to inform the rest of the network about some sort of
1298  * reconfiguration such as a failover event or virtual machine
1299  * migration.
1300  */
1301 void netdev_notify_peers(struct net_device *dev)
1302 {
1303         rtnl_lock();
1304         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1305         rtnl_unlock();
1306 }
1307 EXPORT_SYMBOL(netdev_notify_peers);
1308
1309 static int __dev_open(struct net_device *dev)
1310 {
1311         const struct net_device_ops *ops = dev->netdev_ops;
1312         int ret;
1313
1314         ASSERT_RTNL();
1315
1316         if (!netif_device_present(dev))
1317                 return -ENODEV;
1318
1319         /* Block netpoll from trying to do any rx path servicing.
1320          * If we don't do this there is a chance ndo_poll_controller
1321          * or ndo_poll may be running while we open the device
1322          */
1323         netpoll_poll_disable(dev);
1324
1325         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1326         ret = notifier_to_errno(ret);
1327         if (ret)
1328                 return ret;
1329
1330         set_bit(__LINK_STATE_START, &dev->state);
1331
1332         if (ops->ndo_validate_addr)
1333                 ret = ops->ndo_validate_addr(dev);
1334
1335         if (!ret && ops->ndo_open)
1336                 ret = ops->ndo_open(dev);
1337
1338         netpoll_poll_enable(dev);
1339
1340         if (ret)
1341                 clear_bit(__LINK_STATE_START, &dev->state);
1342         else {
1343                 dev->flags |= IFF_UP;
1344                 dev_set_rx_mode(dev);
1345                 dev_activate(dev);
1346                 add_device_randomness(dev->dev_addr, dev->addr_len);
1347         }
1348
1349         return ret;
1350 }
1351
1352 /**
1353  *      dev_open        - prepare an interface for use.
1354  *      @dev:   device to open
1355  *
1356  *      Takes a device from down to up state. The device's private open
1357  *      function is invoked and then the multicast lists are loaded. Finally
1358  *      the device is moved into the up state and a %NETDEV_UP message is
1359  *      sent to the netdev notifier chain.
1360  *
1361  *      Calling this function on an active interface is a nop. On a failure
1362  *      a negative errno code is returned.
1363  */
1364 int dev_open(struct net_device *dev)
1365 {
1366         int ret;
1367
1368         if (dev->flags & IFF_UP)
1369                 return 0;
1370
1371         ret = __dev_open(dev);
1372         if (ret < 0)
1373                 return ret;
1374
1375         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1376         call_netdevice_notifiers(NETDEV_UP, dev);
1377
1378         return ret;
1379 }
1380 EXPORT_SYMBOL(dev_open);
1381
/*
 * Take every device on @head (linked through close_list) down, without
 * sending the NETDEV_DOWN / RTM_NEWLINK notifications.
 *
 * Works in three passes: notify NETDEV_GOING_DOWN and clear
 * __LINK_STATE_START on each device, deactivate all of them in one
 * dev_deactivate_many() call, then call each driver's ndo_stop() and clear
 * IFF_UP. Netpoll is disabled for a device in the first pass and enabled
 * again in the last. Must be called with RTNL held; may sleep.
 * Always returns 0.
 */
static int __dev_close_many(struct list_head *head)
{
        struct net_device *dev;

        ASSERT_RTNL();
        might_sleep();

        list_for_each_entry(dev, head, close_list) {
                /* Temporarily disable netpoll until the interface is down */
                netpoll_poll_disable(dev);

                call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

                clear_bit(__LINK_STATE_START, &dev->state);

                /* Synchronize to scheduled poll. We cannot touch poll list, it
                 * can be even on different cpu. So just clear netif_running().
                 *
                 * dev->stop() will invoke napi_disable() on all of its
                 * napi_struct instances on this device.
                 */
                smp_mb__after_atomic(); /* Commit netif_running(). */
        }

        dev_deactivate_many(head);

        list_for_each_entry(dev, head, close_list) {
                const struct net_device_ops *ops = dev->netdev_ops;

                /*
                 *      Call the device specific close. This cannot fail.
                 *      Only if device is UP
                 *
                 *      We allow it to be called even after a DETACH hot-plug
                 *      event.
                 */
                if (ops->ndo_stop)
                        ops->ndo_stop(dev);

                dev->flags &= ~IFF_UP;
                netpoll_poll_enable(dev);
        }

        return 0;
}
1427
1428 static int __dev_close(struct net_device *dev)
1429 {
1430         int retval;
1431         LIST_HEAD(single);
1432
1433         list_add(&dev->close_list, &single);
1434         retval = __dev_close_many(&single);
1435         list_del(&single);
1436
1437         return retval;
1438 }
1439
1440 int dev_close_many(struct list_head *head, bool unlink)
1441 {
1442         struct net_device *dev, *tmp;
1443
1444         /* Remove the devices that don't need to be closed */
1445         list_for_each_entry_safe(dev, tmp, head, close_list)
1446                 if (!(dev->flags & IFF_UP))
1447                         list_del_init(&dev->close_list);
1448
1449         __dev_close_many(head);
1450
1451         list_for_each_entry_safe(dev, tmp, head, close_list) {
1452                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1453                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1454                 if (unlink)
1455                         list_del_init(&dev->close_list);
1456         }
1457
1458         return 0;
1459 }
1460 EXPORT_SYMBOL(dev_close_many);
1461
1462 /**
1463  *      dev_close - shutdown an interface.
1464  *      @dev: device to shutdown
1465  *
1466  *      This function moves an active device into down state. A
1467  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1468  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1469  *      chain.
1470  */
1471 int dev_close(struct net_device *dev)
1472 {
1473         if (dev->flags & IFF_UP) {
1474                 LIST_HEAD(single);
1475
1476                 list_add(&dev->close_list, &single);
1477                 dev_close_many(&single, true);
1478                 list_del(&single);
1479         }
1480         return 0;
1481 }
1482 EXPORT_SYMBOL(dev_close);
1483
1484
1485 /**
1486  *      dev_disable_lro - disable Large Receive Offload on a device
1487  *      @dev: device
1488  *
1489  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1490  *      called under RTNL.  This is needed if received packets may be
1491  *      forwarded to another interface.
1492  */
1493 void dev_disable_lro(struct net_device *dev)
1494 {
1495         struct net_device *lower_dev;
1496         struct list_head *iter;
1497
1498         dev->wanted_features &= ~NETIF_F_LRO;
1499         netdev_update_features(dev);
1500
1501         if (unlikely(dev->features & NETIF_F_LRO))
1502                 netdev_WARN(dev, "failed to disable LRO!\n");
1503
1504         netdev_for_each_lower_dev(dev, lower_dev, iter)
1505                 dev_disable_lro(lower_dev);
1506 }
1507 EXPORT_SYMBOL(dev_disable_lro);
1508
1509 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1510                                    struct net_device *dev)
1511 {
1512         struct netdev_notifier_info info;
1513
1514         netdev_notifier_info_init(&info, dev);
1515         return nb->notifier_call(nb, val, &info);
1516 }
1517
/*
 * Starts out as 1. While it is non-zero, register_netdevice_notifier()
 * registers the notifier but skips replaying REGISTER/UP events for
 * existing devices. NOTE(review): where it is cleared is outside this
 * part of the file - presumably at the end of network device init.
 */
static int dev_boot_phase = 1;
1519
/**
 *      register_netdevice_notifier - register a network notifier block
 *      @nb: notifier
 *
 *      Register a notifier to be called when network device events occur.
 *      The notifier passed is linked into the kernel structures and must
 *      not be reused until it has been unregistered. A negative errno code
 *      is returned on a failure.
 *
 *      When registered all registration and up events are replayed
 *      to the new notifier to allow device to have a race free
 *      view of the network device list.
 *
 *      If the notifier rejects a replayed %NETDEV_REGISTER, the events
 *      already replayed are undone with synthesized down/unregister
 *      events, the notifier is removed from the chain again, and the
 *      notifier's errno is returned. Takes the RTNL lock internally.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
        struct net_device *dev;
        struct net_device *last;
        struct net *net;
        int err;

        rtnl_lock();
        err = raw_notifier_chain_register(&netdev_chain, nb);
        if (err)
                goto unlock;
        /* No replay while dev_boot_phase is still set. */
        if (dev_boot_phase)
                goto unlock;
        for_each_net(net) {
                for_each_netdev(net, dev) {
                        err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
                        err = notifier_to_errno(err);
                        if (err)
                                goto rollback;

                        if (!(dev->flags & IFF_UP))
                                continue;

                        /* The result of the replayed UP event is ignored. */
                        call_netdevice_notifier(nb, NETDEV_UP, dev);
                }
        }

unlock:
        rtnl_unlock();
        return err;

rollback:
        /* @last is the device whose REGISTER replay was rejected. */
        last = dev;
        for_each_net(net) {
                for_each_netdev(net, dev) {
                        /* Stop at the rejecting device; it gets no undo events. */
                        if (dev == last)
                                goto outroll;

                        if (dev->flags & IFF_UP) {
                                call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
                                                        dev);
                                call_netdevice_notifier(nb, NETDEV_DOWN, dev);
                        }
                        call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
                }
        }

outroll:
        raw_notifier_chain_unregister(&netdev_chain, nb);
        goto unlock;
}
1585 EXPORT_SYMBOL(register_netdevice_notifier);
1586
1587 /**
1588  *      unregister_netdevice_notifier - unregister a network notifier block
1589  *      @nb: notifier
1590  *
1591  *      Unregister a notifier previously registered by
1592  *      register_netdevice_notifier(). The notifier is unlinked into the
1593  *      kernel structures and may then be reused. A negative errno code
1594  *      is returned on a failure.
1595  *
1596  *      After unregistering unregister and down device events are synthesized
1597  *      for all devices on the device list to the removed notifier to remove
1598  *      the need for special case cleanup code.
1599  */
1600
1601 int unregister_netdevice_notifier(struct notifier_block *nb)
1602 {
1603         struct net_device *dev;
1604         struct net *net;
1605         int err;
1606
1607         rtnl_lock();
1608         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1609         if (err)
1610                 goto unlock;
1611
1612         for_each_net(net) {
1613                 for_each_netdev(net, dev) {
1614                         if (dev->flags & IFF_UP) {
1615                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1616                                                         dev);
1617                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1618                         }
1619                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1620                 }
1621         }
1622 unlock:
1623         rtnl_unlock();
1624         return err;
1625 }
1626 EXPORT_SYMBOL(unregister_netdevice_notifier);
1627
1628 /**
1629  *      call_netdevice_notifiers_info - call all network notifier blocks
1630  *      @val: value passed unmodified to notifier function
1631  *      @dev: net_device pointer passed unmodified to notifier function
1632  *      @info: notifier information data
1633  *
1634  *      Call all network notifier blocks.  Parameters and return value
1635  *      are as for raw_notifier_call_chain().
1636  */
1637
1638 static int call_netdevice_notifiers_info(unsigned long val,
1639                                          struct net_device *dev,
1640                                          struct netdev_notifier_info *info)
1641 {
1642         ASSERT_RTNL();
1643         netdev_notifier_info_init(info, dev);
1644         return raw_notifier_call_chain(&netdev_chain, val, info);
1645 }
1646
1647 /**
1648  *      call_netdevice_notifiers - call all network notifier blocks
1649  *      @val: value passed unmodified to notifier function
1650  *      @dev: net_device pointer passed unmodified to notifier function
1651  *
1652  *      Call all network notifier blocks.  Parameters and return value
1653  *      are as for raw_notifier_call_chain().
1654  */
1655
1656 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1657 {
1658         struct netdev_notifier_info info;
1659
1660         return call_netdevice_notifiers_info(val, dev, &info);
1661 }
1662 EXPORT_SYMBOL(call_netdevice_notifiers);
1663
#ifdef CONFIG_NET_INGRESS
/* Static key counted up and down by the two helpers below. */
static struct static_key ingress_needed __read_mostly;

/* Take one count on the ingress_needed static key. */
void net_inc_ingress_queue(void)
{
        static_key_slow_inc(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

/* Drop one count from the ingress_needed static key. */
void net_dec_ingress_queue(void)
{
        static_key_slow_dec(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif
1679
#ifdef CONFIG_NET_EGRESS
/* Static key counted up and down by the two helpers below. */
static struct static_key egress_needed __read_mostly;

/* Take one count on the egress_needed static key. */
void net_inc_egress_queue(void)
{
        static_key_slow_inc(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_egress_queue);

/* Drop one count from the egress_needed static key. */
void net_dec_egress_queue(void)
{
        static_key_slow_dec(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif
1695
/*
 * Static key that gates packet timestamping: net_enable_timestamp()
 * increments it, net_disable_timestamp() decrements it, and
 * net_timestamp_set() / net_timestamp_check() test it before stamping.
 */
static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
/* We are not allowed to call static_key_slow_dec() from irq context
 * If net_disable_timestamp() is called from irq context, defer the
 * static_key_slow_dec() calls.
 *
 * This counter holds the number of decrements that are still owed.
 */
static atomic_t netstamp_needed_deferred;
#endif
1704
/*
 * Request packet timestamping by taking one count on netstamp_needed.
 *
 * With HAVE_JUMP_LABEL, decrements deferred by net_disable_timestamp()
 * are settled here first: the pending count is fetched and reset to zero
 * atomically, and all but one of those decrements are applied. The last
 * one is cancelled against this enable request, which is why that path
 * returns without incrementing the key.
 */
void net_enable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
        int deferred = atomic_xchg(&netstamp_needed_deferred, 0);

        if (deferred) {
                /* Runs deferred - 1 times. */
                while (--deferred)
                        static_key_slow_dec(&netstamp_needed);
                return;
        }
#endif
        static_key_slow_inc(&netstamp_needed);
}
1718 EXPORT_SYMBOL(net_enable_timestamp);
1719
1720 void net_disable_timestamp(void)
1721 {
1722 #ifdef HAVE_JUMP_LABEL
1723         if (in_interrupt()) {
1724                 atomic_inc(&netstamp_needed_deferred);
1725                 return;
1726         }
1727 #endif
1728         static_key_slow_dec(&netstamp_needed);
1729 }
1730 EXPORT_SYMBOL(net_disable_timestamp);
1731
1732 static inline void net_timestamp_set(struct sk_buff *skb)
1733 {
1734         skb->tstamp.tv64 = 0;
1735         if (static_key_false(&netstamp_needed))
1736                 __net_timestamp(skb);
1737 }
1738
/* Timestamp SKB when timestamping is enabled, COND holds and SKB carries
 * no timestamp yet.
 *
 * The body is wrapped in do { } while (0) so that the expansion is exactly
 * one statement: it can be used as the sole body of an if/else and cannot
 * capture an "else" that follows the call site.
 */
#define net_timestamp_check(COND, SKB)				\
	do {							\
		if (static_key_false(&netstamp_needed)) {	\
			if ((COND) && !(SKB)->tstamp.tv64)	\
				__net_timestamp(SKB);		\
		}						\
	} while (0)
1744
1745 bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
1746 {
1747         unsigned int len;
1748
1749         if (!(dev->flags & IFF_UP))
1750                 return false;
1751
1752         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1753         if (skb->len <= len)
1754                 return true;
1755
1756         /* if TSO is enabled, we don't care about the length as the packet
1757          * could be forwarded without being segmented before
1758          */
1759         if (skb_is_gso(skb))
1760                 return true;
1761
1762         return false;
1763 }
1764 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1765
1766 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1767 {
1768         if (skb_orphan_frags(skb, GFP_ATOMIC) ||
1769             unlikely(!is_skb_forwardable(dev, skb))) {
1770                 atomic_long_inc(&dev->rx_dropped);
1771                 kfree_skb(skb);
1772                 return NET_RX_DROP;
1773         }
1774
1775         skb_scrub_packet(skb, true);
1776         skb->priority = 0;
1777         skb->protocol = eth_type_trans(skb, dev);
1778         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1779
1780         return 0;
1781 }
1782 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1783
/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped, but freed)
 *
 * Intended for injecting an skb from the start_xmit function of one
 * device into the receive queue of another device.
 *
 * Since the receiving device may live in another namespace, all
 * information in the skb that could impact namespace isolation is
 * cleared first.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	int err = __dev_forward_skb(dev, skb);

	if (err)
		return err;

	return netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
1807
1808 static inline int deliver_skb(struct sk_buff *skb,
1809                               struct packet_type *pt_prev,
1810                               struct net_device *orig_dev)
1811 {
1812         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1813                 return -ENOMEM;
1814         atomic_inc(&skb->users);
1815         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1816 }
1817
1818 static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1819                                           struct packet_type **pt,
1820                                           struct net_device *orig_dev,
1821                                           __be16 type,
1822                                           struct list_head *ptype_list)
1823 {
1824         struct packet_type *ptype, *pt_prev = *pt;
1825
1826         list_for_each_entry_rcu(ptype, ptype_list, list) {
1827                 if (ptype->type != type)
1828                         continue;
1829                 if (pt_prev)
1830                         deliver_skb(skb, pt_prev, orig_dev);
1831                 pt_prev = ptype;
1832         }
1833         *pt = pt_prev;
1834 }
1835
1836 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1837 {
1838         if (!ptype->af_packet_priv || !skb->sk)
1839                 return false;
1840
1841         if (ptype->id_match)
1842                 return ptype->id_match(ptype, skb->sk);
1843         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1844                 return true;
1845
1846         return false;
1847 }
1848
1849 /*
1850  *      Support routine. Sends outgoing frames to any network
1851  *      taps currently in use.
1852  */
1853
1854 void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1855 {
1856         struct packet_type *ptype;
1857         struct sk_buff *skb2 = NULL;
1858         struct packet_type *pt_prev = NULL;
1859         struct list_head *ptype_list = &ptype_all;
1860
1861         rcu_read_lock();
1862 again:
1863         list_for_each_entry_rcu(ptype, ptype_list, list) {
1864                 /* Never send packets back to the socket
1865                  * they originated from - MvS (miquels@drinkel.ow.org)
1866                  */
1867                 if (skb_loop_sk(ptype, skb))
1868                         continue;
1869
1870                 if (pt_prev) {
1871                         deliver_skb(skb2, pt_prev, skb->dev);
1872                         pt_prev = ptype;
1873                         continue;
1874                 }
1875
1876                 /* need to clone skb, done only once */
1877                 skb2 = skb_clone(skb, GFP_ATOMIC);
1878                 if (!skb2)
1879                         goto out_unlock;
1880
1881                 net_timestamp_set(skb2);
1882
1883                 /* skb->nh should be correctly
1884                  * set by sender, so that the second statement is
1885                  * just protection against buggy protocols.
1886                  */
1887                 skb_reset_mac_header(skb2);
1888
1889                 if (skb_network_header(skb2) < skb2->data ||
1890                     skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1891                         net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1892                                              ntohs(skb2->protocol),
1893                                              dev->name);
1894                         skb_reset_network_header(skb2);
1895                 }
1896
1897                 skb2->transport_header = skb2->network_header;
1898                 skb2->pkt_type = PACKET_OUTGOING;
1899                 pt_prev = ptype;
1900         }
1901
1902         if (ptype_list == &ptype_all) {
1903                 ptype_list = &dev->ptype_all;
1904                 goto again;
1905         }
1906 out_unlock:
1907         if (pt_prev)
1908                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1909         rcu_read_unlock();
1910 }
1911 EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
1912
1913 /**
1914  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1915  * @dev: Network device
1916  * @txq: number of queues available
1917  *
1918  * If real_num_tx_queues is changed the tc mappings may no longer be
1919  * valid. To resolve this verify the tc mapping remains valid and if
1920  * not NULL the mapping. With no priorities mapping to this
1921  * offset/count pair it will no longer be used. In the worst case TC0
1922  * is invalid nothing can be done so disable priority mappings. If is
1923  * expected that drivers will fix this mapping if they can before
1924  * calling netif_set_real_num_tx_queues.
1925  */
1926 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1927 {
1928         int i;
1929         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1930
1931         /* If TC0 is invalidated disable TC mapping */
1932         if (tc->offset + tc->count > txq) {
1933                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1934                 dev->num_tc = 0;
1935                 return;
1936         }
1937
1938         /* Invalidated prio to tc mappings set to TC0 */
1939         for (i = 1; i < TC_BITMASK + 1; i++) {
1940                 int q = netdev_get_prio_tc_map(dev, i);
1941
1942                 tc = &dev->tc_to_txq[q];
1943                 if (tc->offset + tc->count > txq) {
1944                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1945                                 i, q);
1946                         netdev_set_prio_tc_map(dev, i, 0);
1947                 }
1948         }
1949 }
1950
1951 #ifdef CONFIG_XPS
/* Mutex taken by the XPS map update paths in this file. */
static DEFINE_MUTEX(xps_map_mutex);

/* Dereference an RCU-protected XPS pointer while holding xps_map_mutex. */
#define xmap_dereference(P)		\
	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1955
1956 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1957                                         int cpu, u16 index)
1958 {
1959         struct xps_map *map = NULL;
1960         int pos;
1961
1962         if (dev_maps)
1963                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1964
1965         for (pos = 0; map && pos < map->len; pos++) {
1966                 if (map->queues[pos] == index) {
1967                         if (map->len > 1) {
1968                                 map->queues[pos] = map->queues[--map->len];
1969                         } else {
1970                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1971                                 kfree_rcu(map, rcu);
1972                                 map = NULL;
1973                         }
1974                         break;
1975                 }
1976         }
1977
1978         return map;
1979 }
1980
1981 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1982 {
1983         struct xps_dev_maps *dev_maps;
1984         int cpu, i;
1985         bool active = false;
1986
1987         mutex_lock(&xps_map_mutex);
1988         dev_maps = xmap_dereference(dev->xps_maps);
1989
1990         if (!dev_maps)
1991                 goto out_no_maps;
1992
1993         for_each_possible_cpu(cpu) {
1994                 for (i = index; i < dev->num_tx_queues; i++) {
1995                         if (!remove_xps_queue(dev_maps, cpu, i))
1996                                 break;
1997                 }
1998                 if (i == dev->num_tx_queues)
1999                         active = true;
2000         }
2001
2002         if (!active) {
2003                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2004                 kfree_rcu(dev_maps, rcu);
2005         }
2006
2007         for (i = index; i < dev->num_tx_queues; i++)
2008                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
2009                                              NUMA_NO_NODE);
2010
2011 out_no_maps:
2012         mutex_unlock(&xps_map_mutex);
2013 }
2014
2015 static struct xps_map *expand_xps_map(struct xps_map *map,
2016                                       int cpu, u16 index)
2017 {
2018         struct xps_map *new_map;
2019         int alloc_len = XPS_MIN_MAP_ALLOC;
2020         int i, pos;
2021
2022         for (pos = 0; map && pos < map->len; pos++) {
2023                 if (map->queues[pos] != index)
2024                         continue;
2025                 return map;
2026         }
2027
2028         /* Need to add queue to this CPU's existing map */
2029         if (map) {
2030                 if (pos < map->alloc_len)
2031                         return map;
2032
2033                 alloc_len = map->alloc_len * 2;
2034         }
2035
2036         /* Need to allocate new map to store queue on this CPU's map */
2037         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2038                                cpu_to_node(cpu));
2039         if (!new_map)
2040                 return NULL;
2041
2042         for (i = 0; i < pos; i++)
2043                 new_map->queues[i] = map->queues[i];
2044         new_map->alloc_len = alloc_len;
2045         new_map->len = pos;
2046
2047         return new_map;
2048 }
2049
2050 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2051                         u16 index)
2052 {
2053         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2054         struct xps_map *map, *new_map;
2055         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
2056         int cpu, numa_node_id = -2;
2057         bool active = false;
2058
2059         mutex_lock(&xps_map_mutex);
2060
2061         dev_maps = xmap_dereference(dev->xps_maps);
2062
2063         /* allocate memory for queue storage */
2064         for_each_online_cpu(cpu) {
2065                 if (!cpumask_test_cpu(cpu, mask))
2066                         continue;
2067
2068                 if (!new_dev_maps)
2069                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2070                 if (!new_dev_maps) {
2071                         mutex_unlock(&xps_map_mutex);
2072                         return -ENOMEM;
2073                 }
2074
2075                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2076                                  NULL;
2077
2078                 map = expand_xps_map(map, cpu, index);
2079                 if (!map)
2080                         goto error;
2081
2082                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2083         }
2084
2085         if (!new_dev_maps)
2086                 goto out_no_new_maps;
2087
2088         for_each_possible_cpu(cpu) {
2089                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2090                         /* add queue to CPU maps */
2091                         int pos = 0;
2092
2093                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2094                         while ((pos < map->len) && (map->queues[pos] != index))
2095                                 pos++;
2096
2097                         if (pos == map->len)
2098                                 map->queues[map->len++] = index;
2099 #ifdef CONFIG_NUMA
2100                         if (numa_node_id == -2)
2101                                 numa_node_id = cpu_to_node(cpu);
2102                         else if (numa_node_id != cpu_to_node(cpu))
2103                                 numa_node_id = -1;
2104 #endif
2105                 } else if (dev_maps) {
2106                         /* fill in the new device map from the old device map */
2107                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2108                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2109                 }
2110
2111         }
2112
2113         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2114
2115         /* Cleanup old maps */
2116         if (dev_maps) {
2117                 for_each_possible_cpu(cpu) {
2118                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2119                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2120                         if (map && map != new_map)
2121                                 kfree_rcu(map, rcu);
2122                 }
2123
2124                 kfree_rcu(dev_maps, rcu);
2125         }
2126
2127         dev_maps = new_dev_maps;
2128         active = true;
2129
2130 out_no_new_maps:
2131         /* update Tx queue numa node */
2132         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2133                                      (numa_node_id >= 0) ? numa_node_id :
2134                                      NUMA_NO_NODE);
2135
2136         if (!dev_maps)
2137                 goto out_no_maps;
2138
2139         /* removes queue from unused CPUs */
2140         for_each_possible_cpu(cpu) {
2141                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2142                         continue;
2143
2144                 if (remove_xps_queue(dev_maps, cpu, index))
2145                         active = true;
2146         }
2147
2148         /* free map if not active */
2149         if (!active) {
2150                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2151                 kfree_rcu(dev_maps, rcu);
2152         }
2153
2154 out_no_maps:
2155         mutex_unlock(&xps_map_mutex);
2156
2157         return 0;
2158 error:
2159         /* remove any maps that we added */
2160         for_each_possible_cpu(cpu) {
2161                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2162                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2163                                  NULL;
2164                 if (new_map && new_map != map)
2165                         kfree(new_map);
2166         }
2167
2168         mutex_unlock(&xps_map_mutex);
2169
2170         kfree(new_dev_maps);
2171         return -ENOMEM;
2172 }
2173 EXPORT_SYMBOL(netif_set_xps_queue);
2174
2175 #endif
2176 void netdev_reset_tc(struct net_device *dev)
2177 {
2178         dev->num_tc = 0;
2179         memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2180         memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
2181 }
2182 EXPORT_SYMBOL(netdev_reset_tc);
2183
2184 int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2185 {
2186         if (tc >= dev->num_tc)
2187                 return -EINVAL;
2188
2189         dev->tc_to_txq[tc].count = count;
2190         dev->tc_to_txq[tc].offset = offset;
2191         return 0;
2192 }
2193 EXPORT_SYMBOL(netdev_set_tc_queue);
2194
2195 int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2196 {
2197         if (num_tc > TC_MAX_QUEUE)
2198                 return -EINVAL;
2199
2200         dev->num_tc = num_tc;
2201         return 0;
2202 }
2203 EXPORT_SYMBOL(netdev_set_num_tc);
2204
2205 /*
2206  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2207  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2208  */
2209 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2210 {
2211         int rc;
2212
2213         if (txq < 1 || txq > dev->num_tx_queues)
2214                 return -EINVAL;
2215
2216         if (dev->reg_state == NETREG_REGISTERED ||
2217             dev->reg_state == NETREG_UNREGISTERING) {
2218                 ASSERT_RTNL();
2219
2220                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2221                                                   txq);
2222                 if (rc)
2223                         return rc;
2224
2225                 if (dev->num_tc)
2226                         netif_setup_tc(dev, txq);
2227
2228                 if (txq < dev->real_num_tx_queues) {
2229                         qdisc_reset_all_tx_gt(dev, txq);
2230 #ifdef CONFIG_XPS
2231                         netif_reset_xps_queues_gt(dev, txq);
2232 #endif
2233                 }
2234         }
2235
2236         dev->real_num_tx_queues = txq;
2237         return 0;
2238 }
2239 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2240
#ifdef CONFIG_SYSFS
/**
 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 *	@dev: Network device
 *	@rxq: Actual number of RX queues
 *
 *	Must be called either with the rtnl_lock held or before the net
 *	device is registered.  Returns 0 on success, or a negative error
 *	code.  If called before registration, it always succeeds.
 */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
	if (rxq < 1 || rxq > dev->num_rx_queues)
		return -EINVAL;

	if (dev->reg_state == NETREG_REGISTERED) {
		int err;

		ASSERT_RTNL();

		err = net_rx_queue_update_kobjects(dev,
						   dev->real_num_rx_queues,
						   rxq);
		if (err)
			return err;
	}

	dev->real_num_rx_queues = rxq;
	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif
2273
2274 /**
2275  * netif_get_num_default_rss_queues - default number of RSS queues
2276  *
2277  * This routine should set an upper limit on the number of RSS queues
2278  * used by default by multiqueue devices.
2279  */
2280 int netif_get_num_default_rss_queues(void)
2281 {
2282         return is_kdump_kernel() ?
2283                 1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2284 }
2285 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2286
2287 static void __netif_reschedule(struct Qdisc *q)
2288 {
2289         struct softnet_data *sd;
2290         unsigned long flags;
2291
2292         local_irq_save(flags);
2293         sd = this_cpu_ptr(&softnet_data);
2294         q->next_sched = NULL;
2295         *sd->output_queue_tailp = q;
2296         sd->output_queue_tailp = &q->next_sched;
2297         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2298         local_irq_restore(flags);
2299 }
2300
2301 void __netif_schedule(struct Qdisc *q)
2302 {
2303         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2304                 __netif_reschedule(q);
2305 }
2306 EXPORT_SYMBOL(__netif_schedule);
2307
2308 struct dev_kfree_skb_cb {
2309         enum skb_free_reason reason;
2310 };
2311
2312 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2313 {
2314         return (struct dev_kfree_skb_cb *)skb->cb;
2315 }
2316
2317 void netif_schedule_queue(struct netdev_queue *txq)
2318 {
2319         rcu_read_lock();
2320         if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2321                 struct Qdisc *q = rcu_dereference(txq->qdisc);
2322
2323                 __netif_schedule(q);
2324         }
2325         rcu_read_unlock();
2326 }
2327 EXPORT_SYMBOL(netif_schedule_queue);
2328
2329 /**
2330  *      netif_wake_subqueue - allow sending packets on subqueue
2331  *      @dev: network device
2332  *      @queue_index: sub queue index
2333  *
2334  * Resume individual transmit queue of a device with multiple transmit queues.
2335  */
2336 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2337 {
2338         struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2339
2340         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2341                 struct Qdisc *q;
2342
2343                 rcu_read_lock();
2344                 q = rcu_dereference(txq->qdisc);
2345                 __netif_schedule(q);
2346                 rcu_read_unlock();
2347         }
2348 }
2349 EXPORT_SYMBOL(netif_wake_subqueue);
2350
2351 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2352 {
2353         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2354                 struct Qdisc *q;
2355
2356                 rcu_read_lock();
2357                 q = rcu_dereference(dev_queue->qdisc);
2358                 __netif_schedule(q);
2359                 rcu_read_unlock();
2360         }
2361 }
2362 EXPORT_SYMBOL(netif_tx_wake_queue);
2363
2364 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2365 {
2366         unsigned long flags;
2367
2368         if (likely(atomic_read(&skb->users) == 1)) {
2369                 smp_rmb();
2370                 atomic_set(&skb->users, 0);
2371         } else if (likely(!atomic_dec_and_test(&skb->users))) {
2372                 return;
2373         }
2374         get_kfree_skb_cb(skb)->reason = reason;
2375         local_irq_save(flags);
2376         skb->next = __this_cpu_read(softnet_data.completion_queue);
2377         __this_cpu_write(softnet_data.completion_queue, skb);
2378         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2379         local_irq_restore(flags);
2380 }
2381 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2382
2383 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2384 {
2385         if (in_irq() || irqs_disabled())
2386                 __dev_kfree_skb_irq(skb, reason);
2387         else
2388                 dev_kfree_skb(skb);
2389 }
2390 EXPORT_SYMBOL(__dev_kfree_skb_any);
2391
2392
2393 /**
2394  * netif_device_detach - mark device as removed
2395  * @dev: network device
2396  *
2397  * Mark device as removed from system and therefore no longer available.
2398  */
2399 void netif_device_detach(struct net_device *dev)
2400 {
2401         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2402             netif_running(dev)) {
2403                 netif_tx_stop_all_queues(dev);
2404         }
2405 }
2406 EXPORT_SYMBOL(netif_device_detach);
2407
2408 /**
2409  * netif_device_attach - mark device as attached
2410  * @dev: network device
2411  *
2412  * Mark device as attached from system and restart if needed.
2413  */
2414 void netif_device_attach(struct net_device *dev)
2415 {
2416         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2417             netif_running(dev)) {
2418                 netif_tx_wake_all_queues(dev);
2419                 __netdev_watchdog_up(dev);
2420         }
2421 }
2422 EXPORT_SYMBOL(netif_device_attach);
2423
2424 /*
2425  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2426  * to be used as a distribution range.
2427  */
2428 u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2429                   unsigned int num_tx_queues)
2430 {
2431         u32 hash;
2432         u16 qoffset = 0;
2433         u16 qcount = num_tx_queues;
2434
2435         if (skb_rx_queue_recorded(skb)) {
2436                 hash = skb_get_rx_queue(skb);
2437                 while (unlikely(hash >= num_tx_queues))
2438                         hash -= num_tx_queues;
2439                 return hash;
2440         }
2441
2442         if (dev->num_tc) {
2443                 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2444                 qoffset = dev->tc_to_txq[tc].offset;
2445                 qcount = dev->tc_to_txq[tc].count;
2446         }
2447
2448         return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2449 }
2450 EXPORT_SYMBOL(__skb_tx_hash);
2451
2452 static void skb_warn_bad_offload(const struct sk_buff *skb)
2453 {
2454         static const netdev_features_t null_features;
2455         struct net_device *dev = skb->dev;
2456         const char *name = "";
2457
2458         if (!net_ratelimit())
2459                 return;
2460
2461         if (dev) {
2462                 if (dev->dev.parent)
2463                         name = dev_driver_string(dev->dev.parent);
2464                 else
2465                         name = netdev_name(dev);
2466         }
2467         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2468              "gso_type=%d ip_summed=%d\n",
2469              name, dev ? &dev->features : &null_features,
2470              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2471              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2472              skb_shinfo(skb)->gso_type, skb->ip_summed);
2473 }
2474
2475 /*
2476  * Invalidate hardware checksum when packet is to be mangled, and
2477  * complete checksum manually on outgoing path.
2478  */
2479 int skb_checksum_help(struct sk_buff *skb)
2480 {
2481         __wsum csum;
2482         int ret = 0, offset;
2483
2484         if (skb->ip_summed == CHECKSUM_COMPLETE)
2485                 goto out_set_summed;
2486
2487         if (unlikely(skb_shinfo(skb)->gso_size)) {
2488                 skb_warn_bad_offload(skb);
2489                 return -EINVAL;
2490         }
2491
2492         /* Before computing a checksum, we should make sure no frag could
2493          * be modified by an external entity : checksum could be wrong.
2494          */
2495         if (skb_has_shared_frag(skb)) {
2496                 ret = __skb_linearize(skb);
2497                 if (ret)
2498                         goto out;
2499         }
2500
2501         offset = skb_checksum_start_offset(skb);
2502         BUG_ON(offset >= skb_headlen(skb));
2503         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2504
2505         offset += skb->csum_offset;
2506         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2507
2508         if (skb_cloned(skb) &&
2509             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2510                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2511                 if (ret)
2512                         goto out;
2513         }
2514
2515         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2516 out_set_summed:
2517         skb->ip_summed = CHECKSUM_NONE;
2518 out:
2519         return ret;
2520 }
2521 EXPORT_SYMBOL(skb_checksum_help);
2522
2523 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2524 {
2525         __be16 type = skb->protocol;
2526
2527         /* Tunnel gso handlers can set protocol to ethernet. */
2528         if (type == htons(ETH_P_TEB)) {
2529                 struct ethhdr *eth;
2530
2531                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2532                         return 0;
2533
2534                 eth = (struct ethhdr *)skb_mac_header(skb);
2535                 type = eth->h_proto;
2536         }
2537
2538         return __vlan_get_protocol(skb, type, depth);
2539 }
2540
2541 /**
2542  *      skb_mac_gso_segment - mac layer segmentation handler.
2543  *      @skb: buffer to segment
2544  *      @features: features for the output path (see dev->features)
2545  */
2546 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2547                                     netdev_features_t features)
2548 {
2549         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2550         struct packet_offload *ptype;
2551         int vlan_depth = skb->mac_len;
2552         __be16 type = skb_network_protocol(skb, &vlan_depth);
2553
2554         if (unlikely(!type))
2555                 return ERR_PTR(-EINVAL);
2556
2557         __skb_pull(skb, vlan_depth);
2558
2559         rcu_read_lock();
2560         list_for_each_entry_rcu(ptype, &offload_base, list) {
2561                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2562                         segs = ptype->callbacks.gso_segment(skb, features);
2563                         break;
2564                 }
2565         }
2566         rcu_read_unlock();
2567
2568         __skb_push(skb, skb->data - skb_mac_header(skb));
2569
2570         return segs;
2571 }
2572 EXPORT_SYMBOL(skb_mac_gso_segment);
2573
2574
2575 /* openvswitch calls this on rx path, so we need a different check.
2576  */
2577 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2578 {
2579         if (tx_path)
2580                 return skb->ip_summed != CHECKSUM_PARTIAL;
2581         else
2582                 return skb->ip_summed == CHECKSUM_NONE;
2583 }
2584
2585 /**
2586  *      __skb_gso_segment - Perform segmentation on skb.
2587  *      @skb: buffer to segment
2588  *      @features: features for the output path (see dev->features)
2589  *      @tx_path: whether it is called in TX path
2590  *
2591  *      This function segments the given skb and returns a list of segments.
2592  *
2593  *      It may return NULL if the skb requires no segmentation.  This is
2594  *      only possible when GSO is used for verifying header integrity.
2595  *
2596  *      Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2597  */
2598 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2599                                   netdev_features_t features, bool tx_path)
2600 {
2601         if (unlikely(skb_needs_check(skb, tx_path))) {
2602                 int err;
2603
2604                 skb_warn_bad_offload(skb);
2605
2606                 err = skb_cow_head(skb, 0);
2607                 if (err < 0)
2608                         return ERR_PTR(err);
2609         }
2610
2611         /* Only report GSO partial support if it will enable us to
2612          * support segmentation on this frame without needing additional
2613          * work.
2614          */
2615         if (features & NETIF_F_GSO_PARTIAL) {
2616                 netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
2617                 struct net_device *dev = skb->dev;
2618
2619                 partial_features |= dev->features & dev->gso_partial_features;
2620                 if (!skb_gso_ok(skb, features | partial_features))
2621                         features &= ~NETIF_F_GSO_PARTIAL;
2622         }
2623
2624         BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2625                      sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2626
2627         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2628         SKB_GSO_CB(skb)->encap_level = 0;
2629
2630         skb_reset_mac_header(skb);
2631         skb_reset_mac_len(skb);
2632
2633         return skb_mac_gso_segment(skb, features);
2634 }
2635 EXPORT_SYMBOL(__skb_gso_segment);
2636
/* Hardware reported a receive checksum error: log it (rate limited) together
 * with a stack dump.  @dev may be NULL, in which case "<unknown>" is printed.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	const char *name = dev ? dev->name : "<unknown>";

	if (!net_ratelimit())
		return;

	pr_err("%s: hw csum failure\n", name);
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
2648
/* Returns 1 when some page fragment of @skb cannot be handed to @dev:
 * either it sits in high memory while the device lacks NETIF_F_HIGHDMA,
 * or (when bus addresses are physical) it lies beyond the DMA mask of the
 * parent device.  Returns 0 otherwise, and always 0 without CONFIG_HIGHMEM.
 *
 * This check could go away once we know that:
 * 1. an IOMMU is present and able to map all of memory, and
 * 2. no high memory really exists on this machine.
 */
static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	struct device *parent;
	int idx;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (idx = 0; idx < shinfo->nr_frags; idx++) {
			if (PageHighMem(skb_frag_page(&shinfo->frags[idx])))
				return 1;
		}
	}

	if (!PCI_DMA_BUS_IS_PHYS)
		return 0;

	parent = dev->dev.parent;
	if (!parent)
		return 0;

	for (idx = 0; idx < shinfo->nr_frags; idx++) {
		dma_addr_t start =
			page_to_phys(skb_frag_page(&shinfo->frags[idx]));

		/* No mask at all, or the page ends above it. */
		if (!parent->dma_mask ||
		    start + PAGE_SIZE - 1 > *parent->dma_mask)
			return 1;
	}
#endif
	return 0;
}
2681
2682 /* If MPLS offload request, verify we are testing hardware MPLS features
2683  * instead of standard features for the netdev.
2684  */
2685 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2686 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2687                                            netdev_features_t features,
2688                                            __be16 type)
2689 {
2690         if (eth_p_mpls(type))
2691                 features &= skb->dev->mpls_features;
2692
2693         return features;
2694 }
2695 #else
2696 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2697                                            netdev_features_t features,
2698                                            __be16 type)
2699 {
2700         return features;
2701 }
2702 #endif
2703
2704 static netdev_features_t harmonize_features(struct sk_buff *skb,
2705         netdev_features_t features)
2706 {
2707         int tmp;
2708         __be16 type;
2709
2710         type = skb_network_protocol(skb, &tmp);
2711         features = net_mpls_features(skb, features, type);
2712
2713         if (skb->ip_summed != CHECKSUM_NONE &&
2714             !can_checksum_protocol(features, type)) {
2715                 features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
2716         } else if (illegal_highdma(skb->dev, skb)) {
2717                 features &= ~NETIF_F_SG;
2718         }
2719
2720         return features;
2721 }
2722
2723 netdev_features_t passthru_features_check(struct sk_buff *skb,
2724                                           struct net_device *dev,
2725                                           netdev_features_t features)
2726 {
2727         return features;
2728 }
2729 EXPORT_SYMBOL(passthru_features_check);
2730
2731 static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2732                                              struct net_device *dev,
2733                                              netdev_features_t features)
2734 {
2735         return vlan_features_check(skb, features);
2736 }
2737
2738 static netdev_features_t gso_features_check(const struct sk_buff *skb,
2739                                             struct net_device *dev,
2740                                             netdev_features_t features)
2741 {
2742         u16 gso_segs = skb_shinfo(skb)->gso_segs;
2743
2744         if (gso_segs > dev->gso_max_segs)
2745                 return features & ~NETIF_F_GSO_MASK;
2746
2747         /* Support for GSO partial features requires software
2748          * intervention before we can actually process the packets
2749          * so we need to strip support for any partial features now
2750          * and we can pull them back in after we have partially
2751          * segmented the frame.
2752          */
2753         if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
2754                 features &= ~dev->gso_partial_features;
2755
2756         /* Make sure to clear the IPv4 ID mangling feature if the
2757          * IPv4 header has the potential to be fragmented.
2758          */
2759         if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
2760                 struct iphdr *iph = skb->encapsulation ?
2761                                     inner_ip_hdr(skb) : ip_hdr(skb);
2762
2763                 if (!(iph->frag_off & htons(IP_DF)))
2764                         features &= ~NETIF_F_TSO_MANGLEID;
2765         }
2766
2767         return features;
2768 }
2769
2770 netdev_features_t netif_skb_features(struct sk_buff *skb)
2771 {
2772         struct net_device *dev = skb->dev;
2773         netdev_features_t features = dev->features;
2774
2775         if (skb_is_gso(skb))
2776                 features = gso_features_check(skb, dev, features);
2777
2778         /* If encapsulation offload request, verify we are testing
2779          * hardware encapsulation features instead of standard
2780          * features for the netdev
2781          */
2782         if (skb->encapsulation)
2783                 features &= dev->hw_enc_features;
2784
2785         if (skb_vlan_tagged(skb))
2786                 features = netdev_intersect_features(features,
2787                                                      dev->vlan_features |
2788                                                      NETIF_F_HW_VLAN_CTAG_TX |
2789                                                      NETIF_F_HW_VLAN_STAG_TX);
2790
2791         if (dev->netdev_ops->ndo_features_check)
2792                 features &= dev->netdev_ops->ndo_features_check(skb, dev,
2793                                                                 features);
2794         else
2795                 features &= dflt_features_check(skb, dev, features);
2796
2797         return harmonize_features(skb, features);
2798 }
2799 EXPORT_SYMBOL(netif_skb_features);
2800
2801 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2802                     struct netdev_queue *txq, bool more)
2803 {
2804         unsigned int len;
2805         int rc;
2806
2807         if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2808                 dev_queue_xmit_nit(skb, dev);
2809
2810         len = skb->len;
2811         trace_net_dev_start_xmit(skb, dev);
2812         rc = netdev_start_xmit(skb, dev, txq, more);
2813         trace_net_dev_xmit(skb, rc, dev, len);
2814
2815         return rc;
2816 }
2817
2818 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2819                                     struct netdev_queue *txq, int *ret)
2820 {
2821         struct sk_buff *skb = first;
2822         int rc = NETDEV_TX_OK;
2823
2824         while (skb) {
2825                 struct sk_buff *next = skb->next;
2826
2827                 skb->next = NULL;
2828                 rc = xmit_one(skb, dev, txq, next != NULL);
2829                 if (unlikely(!dev_xmit_complete(rc))) {
2830                         skb->next = next;
2831                         goto out;
2832                 }
2833
2834                 skb = next;
2835                 if (netif_xmit_stopped(txq) && skb) {
2836                         rc = NETDEV_TX_BUSY;
2837                         break;
2838                 }
2839         }
2840
2841 out:
2842         *ret = rc;
2843         return skb;
2844 }
2845
2846 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2847                                           netdev_features_t features)
2848 {
2849         if (skb_vlan_tag_present(skb) &&
2850             !vlan_hw_offload_capable(features, skb->vlan_proto))
2851                 skb = __vlan_hwaccel_push_inside(skb);
2852         return skb;
2853 }
2854
2855 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2856 {
2857         netdev_features_t features;
2858
2859         features = netif_skb_features(skb);
2860         skb = validate_xmit_vlan(skb, features);
2861         if (unlikely(!skb))
2862                 goto out_null;
2863
2864         if (netif_needs_gso(skb, features)) {
2865                 struct sk_buff *segs;
2866
2867                 segs = skb_gso_segment(skb, features);
2868                 if (IS_ERR(segs)) {
2869                         goto out_kfree_skb;
2870                 } else if (segs) {
2871                         consume_skb(skb);
2872                         skb = segs;
2873                 }
2874         } else {
2875                 if (skb_needs_linearize(skb, features) &&
2876                     __skb_linearize(skb))
2877                         goto out_kfree_skb;
2878
2879                 /* If packet is not checksummed and device does not
2880                  * support checksumming for this protocol, complete
2881                  * checksumming here.
2882                  */
2883                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2884                         if (skb->encapsulation)
2885                                 skb_set_inner_transport_header(skb,
2886                                                                skb_checksum_start_offset(skb));
2887                         else
2888                                 skb_set_transport_header(skb,
2889                                                          skb_checksum_start_offset(skb));
2890                         if (!(features & NETIF_F_CSUM_MASK) &&
2891                             skb_checksum_help(skb))
2892                                 goto out_kfree_skb;
2893                 }
2894         }
2895
2896         return skb;
2897
2898 out_kfree_skb:
2899         kfree_skb(skb);
2900 out_null:
2901         atomic_long_inc(&dev->tx_dropped);
2902         return NULL;
2903 }
2904
2905 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2906 {
2907         struct sk_buff *next, *head = NULL, *tail;
2908
2909         for (; skb != NULL; skb = next) {
2910                 next = skb->next;
2911                 skb->next = NULL;
2912
2913                 /* in case skb wont be segmented, point to itself */
2914                 skb->prev = skb;
2915
2916                 skb = validate_xmit_skb(skb, dev);
2917                 if (!skb)
2918                         continue;
2919
2920                 if (!head)
2921                         head = skb;
2922                 else
2923                         tail->next = skb;
2924                 /* If skb was segmented, skb->prev points to
2925                  * the last segment. If not, it still contains skb.
2926                  */
2927                 tail = skb->prev;
2928         }
2929         return head;
2930 }
2931 EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
2932
2933 static void qdisc_pkt_len_init(struct sk_buff *skb)
2934 {
2935         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2936
2937         qdisc_skb_cb(skb)->pkt_len = skb->len;
2938
2939         /* To get more precise estimation of bytes sent on wire,
2940          * we add to pkt_len the headers size of all segments
2941          */
2942         if (shinfo->gso_size)  {
2943                 unsigned int hdr_len;
2944                 u16 gso_segs = shinfo->gso_segs;
2945
2946                 /* mac layer + network layer */
2947                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2948
2949                 /* + transport layer */
2950                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2951                         hdr_len += tcp_hdrlen(skb);
2952                 else
2953                         hdr_len += sizeof(struct udphdr);
2954
2955                 if (shinfo->gso_type & SKB_GSO_DODGY)
2956                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2957                                                 shinfo->gso_size);
2958
2959                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2960         }
2961 }
2962
2963 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2964                                  struct net_device *dev,
2965                                  struct netdev_queue *txq)
2966 {
2967         spinlock_t *root_lock = qdisc_lock(q);
2968         struct sk_buff *to_free = NULL;
2969         bool contended;
2970         int rc;
2971
2972         qdisc_calculate_pkt_len(skb, q);
2973         /*
2974          * Heuristic to force contended enqueues to serialize on a
2975          * separate lock before trying to get qdisc main lock.
2976          * This permits qdisc->running owner to get the lock more
2977          * often and dequeue packets faster.
2978          */
2979         contended = qdisc_is_running(q);
2980         if (unlikely(contended))
2981                 spin_lock(&q->busylock);
2982
2983         spin_lock(root_lock);
2984         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2985                 __qdisc_drop(skb, &to_free);
2986                 rc = NET_XMIT_DROP;
2987         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2988                    qdisc_run_begin(q)) {
2989                 /*
2990                  * This is a work-conserving queue; there are no old skbs
2991                  * waiting to be sent out; and the qdisc is not running -
2992                  * xmit the skb directly.
2993                  */
2994
2995                 qdisc_bstats_update(q, skb);
2996
2997                 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
2998                         if (unlikely(contended)) {
2999                                 spin_unlock(&q->busylock);
3000                                 contended = false;
3001                         }
3002                         __qdisc_run(q);
3003                 } else
3004                         qdisc_run_end(q);
3005
3006                 rc = NET_XMIT_SUCCESS;
3007         } else {
3008                 rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3009                 if (qdisc_run_begin(q)) {
3010                         if (unlikely(contended)) {
3011                                 spin_unlock(&q->busylock);
3012                                 contended = false;
3013                         }
3014                         __qdisc_run(q);
3015                 }
3016         }
3017         spin_unlock(root_lock);
3018         if (unlikely(to_free))
3019                 kfree_skb_list(to_free);
3020         if (unlikely(contended))
3021                 spin_unlock(&q->busylock);
3022         return rc;
3023 }
3024
3025 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3026 static void skb_update_prio(struct sk_buff *skb)
3027 {
3028         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
3029
3030         if (!skb->priority && skb->sk && map) {
3031                 unsigned int prioidx =
3032                         sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);
3033
3034                 if (prioidx < map->priomap_len)
3035                         skb->priority = map->priomap[prioidx];
3036         }
3037 }
3038 #else
3039 #define skb_update_prio(skb)
3040 #endif
3041
3042 DEFINE_PER_CPU(int, xmit_recursion);
3043 EXPORT_SYMBOL(xmit_recursion);
3044
3045 /**
3046  *      dev_loopback_xmit - loop back @skb
3047  *      @net: network namespace this loopback is happening in
3048  *      @sk:  sk needed to be a netfilter okfn
3049  *      @skb: buffer to transmit
3050  */
3051 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3052 {
3053         skb_reset_mac_header(skb);
3054         __skb_pull(skb, skb_network_offset(skb));
3055         skb->pkt_type = PACKET_LOOPBACK;
3056         skb->ip_summed = CHECKSUM_UNNECESSARY;
3057         WARN_ON(!skb_dst(skb));
3058         skb_dst_force(skb);
3059         netif_rx_ni(skb);
3060         return 0;
3061 }
3062 EXPORT_SYMBOL(dev_loopback_xmit);
3063
#ifdef CONFIG_NET_EGRESS
/* Run @skb through the egress classifier chain of @dev, if there is one.
 *
 * Returns the skb when transmission should continue.  Returns NULL when
 * the classifier verdict disposed of the skb (shot, stolen, queued or
 * redirected); in that case *@ret holds the NET_XMIT_* code to report.
 */
static struct sk_buff *
sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
{
	struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list);
	struct tcf_result cl_res;
	int verdict;

	if (!cl)
		return skb;

	/* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len have already been
	 * set up by the caller.
	 */
	qdisc_bstats_cpu_update(cl->q, skb);

	verdict = tc_classify(skb, cl, &cl_res, false);
	switch (verdict) {
	case TC_ACT_OK:
	case TC_ACT_RECLASSIFY:
		skb->tc_index = TC_H_MIN(cl_res.classid);
		break;
	case TC_ACT_SHOT:
		qdisc_qstats_cpu_drop(cl->q);
		*ret = NET_XMIT_DROP;
		kfree_skb(skb);
		return NULL;
	case TC_ACT_STOLEN:
	case TC_ACT_QUEUED:
		*ret = NET_XMIT_SUCCESS;
		consume_skb(skb);
		return NULL;
	case TC_ACT_REDIRECT:
		/* On egress the mac header needs no push/pop here. */
		skb_do_redirect(skb);
		*ret = NET_XMIT_SUCCESS;
		return NULL;
	default:
		break;
	}

	return skb;
}
#endif /* CONFIG_NET_EGRESS */
3106
/* Look up a transmit queue for @skb in the XPS map of @dev, keyed by
 * skb->sender_cpu - 1.  With several queues in the map one is chosen by
 * scaling the skb hash.  Returns the queue index, or -1 when there is no
 * map for that CPU, the index is not below dev->real_num_tx_queues, or
 * CONFIG_XPS is off.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *dev_maps;
	struct xps_map *map;
	int queue_index = -1;

	rcu_read_lock();

	dev_maps = rcu_dereference(dev->xps_maps);
	if (!dev_maps)
		goto unlock;

	map = rcu_dereference(dev_maps->cpu_map[skb->sender_cpu - 1]);
	if (!map)
		goto unlock;

	if (map->len == 1)
		queue_index = map->queues[0];
	else
		queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
							   map->len)];

	if (unlikely(queue_index >= dev->real_num_tx_queues))
		queue_index = -1;

unlock:
	rcu_read_unlock();

	return queue_index;
#else
	return -1;
#endif
}
3136
3137 static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3138 {
3139         struct sock *sk = skb->sk;
3140         int queue_index = sk_tx_queue_get(sk);
3141
3142         if (queue_index < 0 || skb->ooo_okay ||
3143             queue_index >= dev->real_num_tx_queues) {
3144                 int new_index = get_xps_queue(dev, skb);
3145                 if (new_index < 0)
3146                         new_index = skb_tx_hash(dev, skb);
3147
3148                 if (queue_index != new_index && sk &&
3149                     sk_fullsock(sk) &&
3150                     rcu_access_pointer(sk->sk_dst_cache))
3151                         sk_tx_queue_set(sk, new_index);
3152
3153                 queue_index = new_index;
3154         }
3155
3156         return queue_index;
3157 }
3158
3159 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3160                                     struct sk_buff *skb,
3161                                     void *accel_priv)
3162 {
3163         int queue_index = 0;
3164
3165 #ifdef CONFIG_XPS
3166         u32 sender_cpu = skb->sender_cpu - 1;
3167
3168         if (sender_cpu >= (u32)NR_CPUS)
3169                 skb->sender_cpu = raw_smp_processor_id() + 1;
3170 #endif
3171
3172         if (dev->real_num_tx_queues != 1) {
3173                 const struct net_device_ops *ops = dev->netdev_ops;
3174                 if (ops->ndo_select_queue)
3175                         queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3176                                                             __netdev_pick_tx);
3177                 else
3178                         queue_index = __netdev_pick_tx(dev, skb);
3179
3180                 if (!accel_priv)
3181                         queue_index = netdev_cap_txqueue(dev, queue_index);
3182         }
3183
3184         skb_set_queue_mapping(skb, queue_index);
3185         return netdev_get_tx_queue(dev, queue_index);
3186 }
3187
3188 /**
3189  *      __dev_queue_xmit - transmit a buffer
3190  *      @skb: buffer to transmit
3191  *      @accel_priv: private data used for L2 forwarding offload
3192  *
3193  *      Queue a buffer for transmission to a network device. The caller must
3194  *      have set the device and priority and built the buffer before calling
3195  *      this function. The function can be called from an interrupt.
3196  *
3197  *      A negative errno code is returned on a failure. A success does not
3198  *      guarantee the frame will be transmitted as it may be dropped due
3199  *      to congestion or traffic shaping.
3200  *
3201  * -----------------------------------------------------------------------------------
3202  *      I notice this method can also return errors from the queue disciplines,
3203  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
3204  *      be positive.
3205  *
3206  *      Regardless of the return value, the skb is consumed, so it is currently
3207  *      difficult to retry a send to this method.  (You can bump the ref count
3208  *      before sending to hold a reference for retry if you are careful.)
3209  *
3210  *      When calling this method, interrupts MUST be enabled.  This is because
3211  *      the BH enable code must have IRQs enabled so that it will not deadlock.
3212  *          --BLG
3213  */
3214 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3215 {
3216         struct net_device *dev = skb->dev;
3217         struct netdev_queue *txq;
3218         struct Qdisc *q;
3219         int rc = -ENOMEM;
3220
3221         skb_reset_mac_header(skb);
3222
3223         if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3224                 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3225
3226         /* Disable soft irqs for various locks below. Also
3227          * stops preemption for RCU.
3228          */
3229         rcu_read_lock_bh();
3230
3231         skb_update_prio(skb);
3232
3233         qdisc_pkt_len_init(skb);
3234 #ifdef CONFIG_NET_CLS_ACT
3235         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
3236 # ifdef CONFIG_NET_EGRESS
3237         if (static_key_false(&egress_needed)) {
3238                 skb = sch_handle_egress(skb, &rc, dev);
3239                 if (!skb)
3240                         goto out;
3241         }
3242 # endif
3243 #endif
3244         /* If device/qdisc don't need skb->dst, release it right now while
3245          * its hot in this cpu cache.
3246          */
3247         if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3248                 skb_dst_drop(skb);
3249         else
3250                 skb_dst_force(skb);
3251
3252         txq = netdev_pick_tx(dev, skb, accel_priv);
3253         q = rcu_dereference_bh(txq->qdisc);
3254
3255         trace_net_dev_queue(skb);
3256         if (q->enqueue) {
3257                 rc = __dev_xmit_skb(skb, q, dev, txq);
3258                 goto out;
3259         }
3260
3261         /* The device has no queue. Common case for software devices:
3262            loopback, all the sorts of tunnels...
3263
3264            Really, it is unlikely that netif_tx_lock protection is necessary
3265            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
3266            counters.)
3267            However, it is possible, that they rely on protection
3268            made by us here.
3269
3270            Check this and shot the lock. It is not prone from deadlocks.
3271            Either shot noqueue qdisc, it is even simpler 8)
3272          */
3273         if (dev->flags & IFF_UP) {
3274                 int cpu = smp_processor_id(); /* ok because BHs are off */
3275
3276                 if (txq->xmit_lock_owner != cpu) {
3277                         if (unlikely(__this_cpu_read(xmit_recursion) >
3278                                      XMIT_RECURSION_LIMIT))
3279                                 goto recursion_alert;
3280
3281                         skb = validate_xmit_skb(skb, dev);
3282                         if (!skb)
3283                                 goto out;
3284
3285                         HARD_TX_LOCK(dev, txq, cpu);
3286
3287                         if (!netif_xmit_stopped(txq)) {
3288                                 __this_cpu_inc(xmit_recursion);
3289                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3290                                 __this_cpu_dec(xmit_recursion);
3291                                 if (dev_xmit_complete(rc)) {
3292                                         HARD_TX_UNLOCK(dev, txq);
3293                                         goto out;
3294                                 }
3295                         }
3296                         HARD_TX_UNLOCK(dev, txq);
3297                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3298                                              dev->name);
3299                 } else {
3300                         /* Recursion is detected! It is possible,
3301                          * unfortunately
3302                          */
3303 recursion_alert:
3304                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3305                                              dev->name);
3306                 }
3307         }
3308
3309         rc = -ENETDOWN;
3310         rcu_read_unlock_bh();
3311
3312         atomic_long_inc(&dev->tx_dropped);
3313         kfree_skb_list(skb);
3314         return rc;
3315 out:
3316         rcu_read_unlock_bh();
3317         return rc;
3318 }
3319
3320 int dev_queue_xmit(struct sk_buff *skb)
3321 {
3322         return __dev_queue_xmit(skb, NULL);
3323 }
3324 EXPORT_SYMBOL(dev_queue_xmit);
3325
/* Queue @skb for transmission, passing @accel_priv (private data used for
 * L2 forwarding offload) through to __dev_queue_xmit().
 */
int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
{
	return __dev_queue_xmit(skb, accel_priv);
}
EXPORT_SYMBOL(dev_queue_xmit_accel);
3331
3332
3333 /*=======================================================================
3334                         Receiver routines
3335   =======================================================================*/
3336
3337 int netdev_max_backlog __read_mostly = 1000;
3338 EXPORT_SYMBOL(netdev_max_backlog);
3339
3340 int netdev_tstamp_prequeue __read_mostly = 1;
3341 int netdev_budget __read_mostly = 300;
3342 int weight_p __read_mostly = 64;            /* old backlog weight */
3343
3344 /* Called with irq disabled */
3345 static inline void ____napi_schedule(struct softnet_data *sd,
3346                                      struct napi_struct *napi)
3347 {
3348         list_add_tail(&napi->poll_list, &sd->poll_list);
3349         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3350 }
3351
3352 #ifdef CONFIG_RPS
3353
3354 /* One global table that all flow-based protocols share. */
3355 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3356 EXPORT_SYMBOL(rps_sock_flow_table);
3357 u32 rps_cpu_mask __read_mostly;
3358 EXPORT_SYMBOL(rps_cpu_mask);
3359
3360 struct static_key rps_needed __read_mostly;
3361 EXPORT_SYMBOL(rps_needed);
3362
3363 static struct rps_dev_flow *
3364 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3365             struct rps_dev_flow *rflow, u16 next_cpu)
3366 {
3367         if (next_cpu < nr_cpu_ids) {
3368 #ifdef CONFIG_RFS_ACCEL
3369                 struct netdev_rx_queue *rxqueue;
3370                 struct rps_dev_flow_table *flow_table;
3371                 struct rps_dev_flow *old_rflow;
3372                 u32 flow_id;
3373                 u16 rxq_index;
3374                 int rc;
3375
3376                 /* Should we steer this flow to a different hardware queue? */
3377                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3378                     !(dev->features & NETIF_F_NTUPLE))
3379                         goto out;
3380                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3381                 if (rxq_index == skb_get_rx_queue(skb))
3382                         goto out;
3383
3384                 rxqueue = dev->_rx + rxq_index;
3385                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3386                 if (!flow_table)
3387                         goto out;
3388                 flow_id = skb_get_hash(skb) & flow_table->mask;
3389                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3390                                                         rxq_index, flow_id);
3391                 if (rc < 0)
3392                         goto out;
3393                 old_rflow = rflow;
3394                 rflow = &flow_table->flows[flow_id];
3395                 rflow->filter = rc;
3396                 if (old_rflow->filter == rflow->filter)
3397                         old_rflow->filter = RPS_NO_FILTER;
3398         out:
3399 #endif
3400                 rflow->last_qtail =
3401                         per_cpu(softnet_data, next_cpu).input_queue_head;
3402         }
3403
3404         rflow->cpu = next_cpu;
3405         return rflow;
3406 }
3407
3408 /*
3409  * get_rps_cpu is called from netif_receive_skb and returns the target
3410  * CPU from the RPS map of the receiving queue for a given skb.
3411  * rcu_read_lock must be held on entry.
3412  */
3413 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3414                        struct rps_dev_flow **rflowp)
3415 {
3416         const struct rps_sock_flow_table *sock_flow_table;
3417         struct netdev_rx_queue *rxqueue = dev->_rx;
3418         struct rps_dev_flow_table *flow_table;
3419         struct rps_map *map;
3420         int cpu = -1;
3421         u32 tcpu;
3422         u32 hash;
3423
3424         if (skb_rx_queue_recorded(skb)) {
3425                 u16 index = skb_get_rx_queue(skb);
3426
3427                 if (unlikely(index >= dev->real_num_rx_queues)) {
3428                         WARN_ONCE(dev->real_num_rx_queues > 1,
3429                                   "%s received packet on queue %u, but number "
3430                                   "of RX queues is %u\n",
3431                                   dev->name, index, dev->real_num_rx_queues);
3432                         goto done;
3433                 }
3434                 rxqueue += index;
3435         }
3436
3437         /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3438
3439         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3440         map = rcu_dereference(rxqueue->rps_map);
3441         if (!flow_table && !map)
3442                 goto done;
3443
3444         skb_reset_network_header(skb);
3445         hash = skb_get_hash(skb);
3446         if (!hash)
3447                 goto done;
3448
3449         sock_flow_table = rcu_dereference(rps_sock_flow_table);
3450         if (flow_table && sock_flow_table) {
3451                 struct rps_dev_flow *rflow;
3452                 u32 next_cpu;
3453                 u32 ident;
3454
3455                 /* First check into global flow table if there is a match */
3456                 ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3457                 if ((ident ^ hash) & ~rps_cpu_mask)
3458                         goto try_rps;
3459
3460                 next_cpu = ident & rps_cpu_mask;
3461
3462                 /* OK, now we know there is a match,
3463                  * we can look at the local (per receive queue) flow table
3464                  */
3465                 rflow = &flow_table->flows[hash & flow_table->mask];
3466                 tcpu = rflow->cpu;
3467
3468                 /*
3469                  * If the desired CPU (where last recvmsg was done) is
3470                  * different from current CPU (one in the rx-queue flow
3471                  * table entry), switch if one of the following holds:
3472                  *   - Current CPU is unset (>= nr_cpu_ids).
3473                  *   - Current CPU is offline.
3474                  *   - The current CPU's queue tail has advanced beyond the
3475                  *     last packet that was enqueued using this table entry.
3476                  *     This guarantees that all previous packets for the flow
3477                  *     have been dequeued, thus preserving in order delivery.
3478                  */
3479                 if (unlikely(tcpu != next_cpu) &&
3480                     (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3481                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3482                       rflow->last_qtail)) >= 0)) {
3483                         tcpu = next_cpu;
3484                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3485                 }
3486
3487                 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3488                         *rflowp = rflow;
3489                         cpu = tcpu;
3490                         goto done;
3491                 }
3492         }
3493
3494 try_rps:
3495
3496         if (map) {
3497                 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3498                 if (cpu_online(tcpu)) {
3499                         cpu = tcpu;
3500                         goto done;
3501                 }
3502         }
3503
3504 done:
3505         return cpu;
3506 }
3507
3508 #ifdef CONFIG_RFS_ACCEL
3509
3510 /**
3511  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3512  * @dev: Device on which the filter was set
3513  * @rxq_index: RX queue index
3514  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3515  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3516  *
3517  * Drivers that implement ndo_rx_flow_steer() should periodically call
3518  * this function for each installed filter and remove the filters for
3519  * which it returns %true.
3520  */
3521 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3522                          u32 flow_id, u16 filter_id)
3523 {
3524         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3525         struct rps_dev_flow_table *flow_table;
3526         struct rps_dev_flow *rflow;
3527         bool expire = true;
3528         unsigned int cpu;
3529
3530         rcu_read_lock();
3531         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3532         if (flow_table && flow_id <= flow_table->mask) {
3533                 rflow = &flow_table->flows[flow_id];
3534                 cpu = ACCESS_ONCE(rflow->cpu);
3535                 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3536                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3537                            rflow->last_qtail) <
3538                      (int)(10 * flow_table->mask)))
3539                         expire = false;
3540         }
3541         rcu_read_unlock();
3542         return expire;
3543 }
3544 EXPORT_SYMBOL(rps_may_expire_flow);
3545
3546 #endif /* CONFIG_RFS_ACCEL */
3547
3548 /* Called from hardirq (IPI) context */
3549 static void rps_trigger_softirq(void *data)
3550 {
3551         struct softnet_data *sd = data;
3552
3553         ____napi_schedule(sd, &sd->backlog);
3554         sd->received_rps++;
3555 }
3556
3557 #endif /* CONFIG_RPS */
3558
/*
 * Check whether @sd belongs to another CPU.
 * If it does, chain it onto this CPU's IPI list, raise NET_RX_SOFTIRQ and
 * return 1.  Otherwise (or when RPS is compiled out) return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        struct softnet_data *local = this_cpu_ptr(&softnet_data);

        if (sd == local)
                return 0;

        /* Push @sd on the head of the local pending-IPI list. */
        sd->rps_ipi_next = local->rps_ipi_list;
        local->rps_ipi_list = sd;

        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
        return 1;
#else
        return 0;
#endif /* CONFIG_RPS */
}
3579
#ifdef CONFIG_NET_FLOW_LIMIT
/* Flow limit table length; defaults to 1 << 12 (4096).
 * NOTE(review): presumably the bucket count used when the per-CPU
 * sd->flow_limit tables are allocated elsewhere -- confirm against the
 * users of this variable.
 */
int netdev_flow_limit_table_len __read_mostly = (1 << 12);
#endif
3583
3584 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3585 {
3586 #ifdef CONFIG_NET_FLOW_LIMIT
3587         struct sd_flow_limit *fl;
3588         struct softnet_data *sd;
3589         unsigned int old_flow, new_flow;
3590
3591         if (qlen < (netdev_max_backlog >> 1))
3592                 return false;
3593
3594         sd = this_cpu_ptr(&softnet_data);
3595
3596         rcu_read_lock();
3597         fl = rcu_dereference(sd->flow_limit);
3598         if (fl) {
3599                 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3600                 old_flow = fl->history[fl->history_head];
3601                 fl->history[fl->history_head] = new_flow;
3602
3603                 fl->history_head++;
3604                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3605
3606                 if (likely(fl->buckets[old_flow]))
3607                         fl->buckets[old_flow]--;
3608
3609                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3610                         fl->count++;
3611                         rcu_read_unlock();
3612                         return true;
3613                 }
3614         }
3615         rcu_read_unlock();
3616 #endif
3617         return false;
3618 }
3619
3620 /*
3621  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3622  * queue (may be a remote CPU queue).
3623  */
3624 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3625                               unsigned int *qtail)
3626 {
3627         struct softnet_data *sd;
3628         unsigned long flags;
3629         unsigned int qlen;
3630
3631         sd = &per_cpu(softnet_data, cpu);
3632
3633         local_irq_save(flags);
3634
3635         rps_lock(sd);
3636         if (!netif_running(skb->dev))
3637                 goto drop;
3638         qlen = skb_queue_len(&sd->input_pkt_queue);
3639         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3640                 if (qlen) {
3641 enqueue:
3642                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3643                         input_queue_tail_incr_save(sd, qtail);
3644                         rps_unlock(sd);
3645                         local_irq_restore(flags);
3646                         return NET_RX_SUCCESS;
3647                 }
3648
3649                 /* Schedule NAPI for backlog device
3650                  * We can use non atomic operation since we own the queue lock
3651                  */
3652                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3653                         if (!rps_ipi_queued(sd))
3654                                 ____napi_schedule(sd, &sd->backlog);
3655                 }
3656                 goto enqueue;
3657         }
3658
3659 drop:
3660         sd->dropped++;
3661         rps_unlock(sd);
3662
3663         local_irq_restore(flags);
3664
3665         atomic_long_inc(&skb->dev->rx_dropped);
3666         kfree_skb(skb);
3667         return NET_RX_DROP;
3668 }
3669
3670 static int netif_rx_internal(struct sk_buff *skb)
3671 {
3672         int ret;
3673
3674         net_timestamp_check(netdev_tstamp_prequeue, skb);
3675
3676         trace_netif_rx(skb);
3677 #ifdef CONFIG_RPS
3678         if (static_key_false(&rps_needed)) {
3679                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3680                 int cpu;
3681
3682                 preempt_disable();
3683                 rcu_read_lock();
3684
3685                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3686                 if (cpu < 0)
3687                         cpu = smp_processor_id();
3688
3689                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3690
3691                 rcu_read_unlock();
3692                 preempt_enable();
3693         } else
3694 #endif
3695         {
3696                 unsigned int qtail;
3697                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3698                 put_cpu();
3699         }
3700         return ret;
3701 }
3702
/**
 *      netif_rx        -       post buffer to the network code
 *      @skb: buffer to post
 *
 *      Takes a packet handed over by a device driver and queues it so the
 *      upper (protocol) levels can process it.  The call itself always
 *      succeeds; the buffer may still be dropped later on, either for
 *      congestion control or by the protocol layers.
 *
 *      return values:
 *      NET_RX_SUCCESS  (no congestion)
 *      NET_RX_DROP     (packet was dropped)
 *
 */

int netif_rx(struct sk_buff *skb)
{
        int rc;

        trace_netif_rx_entry(skb);

        rc = netif_rx_internal(skb);
        return rc;
}
EXPORT_SYMBOL(netif_rx);
3725
/**
 *      netif_rx_ni     -       post buffer and run pending softirqs
 *      @skb: buffer to post
 *
 *      Queues the buffer through netif_rx_internal() with preemption
 *      disabled, then runs any softirq that is pending on this CPU before
 *      preemption is enabled again.  Returns the netif_rx_internal() result.
 *
 *      NOTE(review): presumably meant for callers outside interrupt
 *      context -- confirm against the callers.
 */
int netif_rx_ni(struct sk_buff *skb)
{
        int rc;

        trace_netif_rx_ni_entry(skb);

        preempt_disable();
        rc = netif_rx_internal(skb);
        if (local_softirq_pending())
                do_softirq();
        preempt_enable();

        return rc;
}
EXPORT_SYMBOL(netif_rx_ni);
3741
3742 static __latent_entropy void net_tx_action(struct softirq_action *h)
3743 {
3744         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3745
3746         if (sd->completion_queue) {
3747                 struct sk_buff *clist;
3748
3749                 local_irq_disable();
3750                 clist = sd->completion_queue;
3751                 sd->completion_queue = NULL;
3752                 local_irq_enable();
3753
3754                 while (clist) {
3755                         struct sk_buff *skb = clist;
3756                         clist = clist->next;
3757
3758                         WARN_ON(atomic_read(&skb->users));
3759                         if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3760                                 trace_consume_skb(skb);
3761                         else
3762                                 trace_kfree_skb(skb, net_tx_action);
3763
3764                         if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
3765                                 __kfree_skb(skb);
3766                         else
3767                                 __kfree_skb_defer(skb);
3768                 }
3769
3770                 __kfree_skb_flush();
3771         }
3772
3773         if (sd->output_queue) {
3774                 struct Qdisc *head;
3775
3776                 local_irq_disable();
3777                 head = sd->output_queue;
3778                 sd->output_queue = NULL;
3779                 sd->output_queue_tailp = &sd->output_queue;
3780                 local_irq_enable();
3781
3782                 while (head) {
3783                         struct Qdisc *q = head;
3784                         spinlock_t *root_lock;
3785
3786                         head = head->next_sched;
3787
3788                         root_lock = qdisc_lock(q);
3789                         spin_lock(root_lock);
3790                         /* We need to make sure head->next_sched is read
3791                          * before clearing __QDISC_STATE_SCHED
3792                          */
3793                         smp_mb__before_atomic();
3794                         clear_bit(__QDISC_STATE_SCHED, &q->state);
3795                         qdisc_run(q);
3796                         spin_unlock(root_lock);
3797                 }
3798         }
3799 }
3800
#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
/* This hook is defined here for ATM LANE.
 * It only exists when both the bridge and ATM LANE are enabled, and it is
 * exported (GPL-only) so modules can set and call it.
 * NOTE(review): presumably installed by the bridge code and called by LANE
 * to test whether @addr is known on @dev -- confirm against the users.
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
                             unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
3807
/*
 * sch_handle_ingress - run the ingress classifier chain on a received skb
 * @skb: packet being received
 * @pt_prev: pending packet handler; flushed via deliver_skb() and cleared
 *           before classification
 * @ret: receives the deliver_skb() result when @pt_prev was flushed
 * @orig_dev: device the packet originally arrived on
 *
 * Returns @skb when processing should continue, or NULL when the classifier
 * verdict dropped, consumed or redirected the packet.  Without
 * CONFIG_NET_CLS_ACT, or when the device has no ingress classifier list,
 * @skb is returned untouched.
 */
static inline struct sk_buff *
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
                   struct net_device *orig_dev)
{
#ifdef CONFIG_NET_CLS_ACT
        struct tcf_proto *filters = rcu_dereference_bh(skb->dev->ingress_cl_list);
        struct tcf_result result;
        int verdict;

        /* This path is reached via a static key that is enabled as soon as
         * one ingress exists anywhere, so devices that have no ingress
         * qdisc configured bail out here.
         */
        if (!filters)
                return skb;

        if (*pt_prev) {
                *ret = deliver_skb(skb, *pt_prev, orig_dev);
                *pt_prev = NULL;
        }

        qdisc_skb_cb(skb)->pkt_len = skb->len;
        skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
        qdisc_bstats_cpu_update(filters->q, skb);

        verdict = tc_classify(skb, filters, &result, false);
        switch (verdict) {
        case TC_ACT_SHOT:
                qdisc_qstats_cpu_drop(filters->q);
                kfree_skb(skb);
                return NULL;
        case TC_ACT_STOLEN:
        case TC_ACT_QUEUED:
                consume_skb(skb);
                return NULL;
        case TC_ACT_REDIRECT:
                /* cls/act_bpf already checked skb_mac_header, so the L2
                 * header can safely be pushed back before the packet is
                 * redirected to another netdev.
                 */
                __skb_push(skb, skb->mac_len);
                skb_do_redirect(skb);
                return NULL;
        case TC_ACT_OK:
        case TC_ACT_RECLASSIFY:
                skb->tc_index = TC_H_MIN(result.classid);
                break;
        default:
                break;
        }
#endif /* CONFIG_NET_CLS_ACT */
        return skb;
}
3859
3860 /**
3861  *      netdev_is_rx_handler_busy - check if receive handler is registered
3862  *      @dev: device to check
3863  *
3864  *      Check if a receive handler is already registered for a given device.
3865  *      Return true if there one.
3866  *
3867  *      The caller must hold the rtnl_mutex.
3868  */
3869 bool netdev_is_rx_handler_busy(struct net_device *dev)
3870 {
3871         ASSERT_RTNL();
3872         return dev && rtnl_dereference(dev->rx_handler);
3873 }
3874 EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
3875
3876 /**
3877  *      netdev_rx_handler_register - register receive handler
3878  *      @dev: device to register a handler for
3879  *      @rx_handler: receive handler to register
3880  *      @rx_handler_data: data pointer that is used by rx handler
3881  *
3882  *      Register a receive handler for a device. This handler will then be
3883  *      called from __netif_receive_skb. A negative errno code is returned
3884  *      on a failure.
3885  *
3886  *      The caller must hold the rtnl_mutex.
3887  *
3888  *      For a general description of rx_handler, see enum rx_handler_result.
3889  */
3890 int netdev_rx_handler_register(struct net_device *dev,
3891                                rx_handler_func_t *rx_handler,
3892                                void *rx_handler_data)
3893 {
3894         ASSERT_RTNL();
3895
3896         if (dev->rx_handler)
3897                 return -EBUSY;
3898
3899         /* Note: rx_handler_data must be set before rx_handler */
3900         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3901         rcu_assign_pointer(dev->rx_handler, rx_handler);
3902
3903         return 0;
3904 }
3905 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3906
/**
 *      netdev_rx_handler_unregister - unregister receive handler
 *      @dev: device to unregister a handler from
 *
 *      Unregister a receive handler from a device.  The handler pointer is
 *      cleared first; the handler data is cleared only after
 *      synchronize_net() has returned.
 *
 *      The caller must hold the rtnl_mutex.
 */
void netdev_rx_handler_unregister(struct net_device *dev)
{

        ASSERT_RTNL();
        /* Clear the handler first so new readers no longer pick it up. */
        RCU_INIT_POINTER(dev->rx_handler, NULL);
        /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
         * section has a guarantee to see a non NULL rx_handler_data
         * as well.
         */
        synchronize_net();
        RCU_INIT_POINTER(dev->rx_handler_data, NULL);
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3928
3929 /*
3930  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3931  * the special handling of PFMEMALLOC skbs.
3932  */
3933 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3934 {
3935         switch (skb->protocol) {
3936         case htons(ETH_P_ARP):
3937         case htons(ETH_P_IP):
3938         case htons(ETH_P_IPV6):
3939         case htons(ETH_P_8021Q):
3940         case htons(ETH_P_8021AD):
3941                 return true;
3942         default:
3943                 return false;
3944         }
3945 }
3946
/*
 * nf_ingress - run the netfilter ingress hook on a received skb
 * @skb: packet being received
 * @pt_prev: pending packet handler; flushed via deliver_skb() and cleared
 *           before the hook runs
 * @ret: receives the deliver_skb() result when @pt_prev was flushed
 * @orig_dev: device the packet originally arrived on
 *
 * Returns the nf_hook_ingress() verdict, or 0 when the hook is not active
 * for @skb or CONFIG_NETFILTER_INGRESS is disabled.
 */
static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
                             int *ret, struct net_device *orig_dev)
{
#ifdef CONFIG_NETFILTER_INGRESS
        int verdict;

        if (!nf_hook_ingress_active(skb))
                return 0;

        if (*pt_prev) {
                *ret = deliver_skb(skb, *pt_prev, orig_dev);
                *pt_prev = NULL;
        }

        rcu_read_lock();
        verdict = nf_hook_ingress(skb);
        rcu_read_unlock();

        return verdict;
#else
        return 0;
#endif /* CONFIG_NETFILTER_INGRESS */
}
3967
/*
 * __netif_receive_skb_core - run one skb through the receive pipeline
 * @skb: packet to process
 * @pfmemalloc: true when the caller is handling a PFMEMALLOC skb; the
 *              ptype_all taps are skipped and only protocols accepted by
 *              skb_pfmemalloc_protocol() get past the ingress stage
 *
 * In order: untag 802.1Q/802.1AD frames, feed the global and per-device
 * ptype_all taps, run the ingress hooks (sch_handle_ingress, nf_ingress),
 * vlan_do_receive(), the device rx_handler, and finally the protocol
 * handlers from ptype_base and the ptype_specific lists.
 *
 * Delivery is deferred through pt_prev: a matching handler is only invoked
 * (via deliver_skb()) once the next stage needs to run, and whichever
 * handler is still pending at the end is called directly through ->func.
 *
 * ret starts as NET_RX_DROP and is overwritten by the result of each handler
 * that runs, or by NET_RX_SUCCESS when the rx_handler consumed the skb.
 */
static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
{
        struct packet_type *ptype, *pt_prev;
        rx_handler_func_t *rx_handler;
        struct net_device *orig_dev;
        bool deliver_exact = false;
        int ret = NET_RX_DROP;
        __be16 type;

        net_timestamp_check(!netdev_tstamp_prequeue, skb);

        trace_netif_receive_skb(skb);

        /* skb->dev may be changed below (VLAN, rx_handler); remember the
         * device the packet actually arrived on.
         */
        orig_dev = skb->dev;

        skb_reset_network_header(skb);
        if (!skb_transport_header_was_set(skb))
                skb_reset_transport_header(skb);
        skb_reset_mac_len(skb);

        pt_prev = NULL;

        /* Re-entered whenever vlan_do_receive() or the rx_handler asks for
         * the packet to be processed again.
         */
another_round:
        skb->skb_iif = skb->dev->ifindex;

        __this_cpu_inc(softnet_data.processed);

        if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
            skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
                skb = skb_vlan_untag(skb);
                if (unlikely(!skb))
                        goto out;
        }

#ifdef CONFIG_NET_CLS_ACT
        /* TC_NCLS set: clear it and skip the taps and the ingress hooks. */
        if (skb->tc_verd & TC_NCLS) {
                skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
                goto ncls;
        }
#endif

        if (pfmemalloc)
                goto skip_taps;

        /* Global taps first, then the taps bound to this device. */
        list_for_each_entry_rcu(ptype, &ptype_all, list) {
                if (pt_prev)
                        ret = deliver_skb(skb, pt_prev, orig_dev);
                pt_prev = ptype;
        }

        list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
                if (pt_prev)
                        ret = deliver_skb(skb, pt_prev, orig_dev);
                pt_prev = ptype;
        }

skip_taps:
#ifdef CONFIG_NET_INGRESS
        if (static_key_false(&ingress_needed)) {
                skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
                if (!skb)
                        goto out;

                if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
                        goto out;
        }
#endif
#ifdef CONFIG_NET_CLS_ACT
        skb->tc_verd = 0;
ncls:
#endif
        if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
                goto drop;

        if (skb_vlan_tag_present(skb)) {
                if (pt_prev) {
                        ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = NULL;
                }
                if (vlan_do_receive(&skb))
                        goto another_round;
                else if (unlikely(!skb))
                        goto out;
        }

        rx_handler = rcu_dereference(skb->dev->rx_handler);
        if (rx_handler) {
                if (pt_prev) {
                        ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = NULL;
                }
                switch (rx_handler(&skb)) {
                case RX_HANDLER_CONSUMED:
                        ret = NET_RX_SUCCESS;
                        goto out;
                case RX_HANDLER_ANOTHER:
                        goto another_round;
                case RX_HANDLER_EXACT:
                        deliver_exact = true;
                        /* fall through */
                case RX_HANDLER_PASS:
                        break;
                default:
                        BUG();
                }
        }

        if (unlikely(skb_vlan_tag_present(skb))) {
                if (skb_vlan_tag_get_id(skb))
                        skb->pkt_type = PACKET_OTHERHOST;
                /* Note: we might in the future use prio bits
                 * and set skb->priority like in vlan_do_receive()
                 * For the time being, just ignore Priority Code Point
                 */
                skb->vlan_tci = 0;
        }

        type = skb->protocol;

        /* deliver only exact match when indicated */
        if (likely(!deliver_exact)) {
                deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
                                       &ptype_base[ntohs(type) &
                                                   PTYPE_HASH_MASK]);
        }

        deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
                               &orig_dev->ptype_specific);

        /* skb->dev was changed along the way: also try the handlers bound
         * to the new device.
         */
        if (unlikely(skb->dev != orig_dev)) {
                deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
                                       &skb->dev->ptype_specific);
        }

        if (pt_prev) {
                /* Last pending handler: called directly via ->func rather
                 * than through deliver_skb().
                 */
                if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
                        goto drop;
                else
                        ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
        } else {
drop:
                /* An rx_handler that asked for exact delivery and found no
                 * taker is counted as rx_nohandler, not rx_dropped.
                 */
                if (!deliver_exact)
                        atomic_long_inc(&skb->dev->rx_dropped);
                else
                        atomic_long_inc(&skb->dev->rx_nohandler);
                kfree_skb(skb);
                /* Jamal, now you will not able to escape explaining
                 * me how you were going to use this. :-)
                 */
                ret = NET_RX_DROP;
        }

out:
        return ret;
}
4122
4123 static int __netif_receive_skb(struct sk_buff *skb)
4124 {
4125         int ret;
4126
4127         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4128                 unsigned long pflags = current->flags;
4129
4130                 /*
4131                  * PFMEMALLOC skbs are special, they should
4132                  * - be delivered to SOCK_MEMALLOC sockets only
4133                  * - stay away from userspace
4134                  * - have bounded memory usage
4135                  *
4136                  * Use PF_MEMALLOC as this saves us from propagating the allocation
4137                  * context down to all allocation sites.
4138                  */
4139                 current->flags |= PF_MEMALLOC;
4140                 ret = __netif_receive_skb_core(skb, true);
4141                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
4142         } else
4143                 ret = __netif_receive_skb_core(skb, false);
4144
4145         return ret;
4146 }
4147
4148 static int netif_receive_skb_internal(struct sk_buff *skb)
4149 {
4150         int ret;
4151
4152         net_timestamp_check(netdev_tstamp_prequeue, skb);
4153
4154         if (skb_defer_rx_timestamp(skb))
4155                 return NET_RX_SUCCESS;
4156
4157         rcu_read_lock();
4158
4159 #ifdef CONFIG_RPS
4160         if (static_key_false(&rps_needed)) {
4161                 struct rps_dev_flow voidflow, *rflow = &voidflow;
4162                 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4163
4164                 if (cpu >= 0) {
4165                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4166                         rcu_read_unlock();
4167                         return ret;
4168                 }
4169         }
4170 #endif
4171         ret = __netif_receive_skb(skb);
4172         rcu_read_unlock();
4173         return ret;
4174 }
4175
/**
 *      netif_receive_skb - process receive buffer from network
 *      @skb: buffer to process
 *
 *      netif_receive_skb() is the main entry point for processing received
 *      data.  The call itself always succeeds; the buffer may still be
 *      dropped during processing, either for congestion control or by the
 *      protocol layers.
 *
 *      This function may only be called from softirq context and interrupts
 *      should be enabled.
 *
 *      Return values (usually ignored):
 *      NET_RX_SUCCESS: no congestion
 *      NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
        int rc;

        trace_netif_receive_skb_entry(skb);

        rc = netif_receive_skb_internal(skb);
        return rc;
}
EXPORT_SYMBOL(netif_receive_skb);
4198
4199 DEFINE_PER_CPU(struct work_struct, flush_works);
4200
4201 /* Network device is going away, flush any packets still pending */
4202 static void flush_backlog(struct work_struct *work)
4203 {
4204         struct sk_buff *skb, *tmp;
4205         struct softnet_data *sd;
4206
4207         local_bh_disable();
4208         sd = this_cpu_ptr(&softnet_data);
4209
4210         local_irq_disable();
4211         rps_lock(sd);
4212         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4213                 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4214                         __skb_unlink(skb, &sd->input_pkt_queue);
4215                         kfree_skb(skb);
4216                         input_queue_head_incr(sd);
4217                 }
4218         }
4219         rps_unlock(sd);
4220         local_irq_enable();
4221
4222         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4223                 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4224                         __skb_unlink(skb, &sd->process_queue);
4225                         kfree_skb(skb);
4226                         input_queue_head_incr(sd);
4227                 }
4228         }
4229         local_bh_enable();
4230 }
4231
4232 static void flush_all_backlogs(void)
4233 {
4234         unsigned int cpu;
4235
4236         get_online_cpus();
4237
4238         for_each_online_cpu(cpu)
4239                 queue_work_on(cpu, system_highpri_wq,
4240                               per_cpu_ptr(&flush_works, cpu));
4241
4242         for_each_online_cpu(cpu)
4243                 flush_work(per_cpu_ptr(&flush_works, cpu));
4244
4245         put_online_cpus();
4246 }
4247
4248 static int napi_gro_complete(struct sk_buff *skb)
4249 {
4250         struct packet_offload *ptype;
4251         __be16 type = skb->protocol;
4252         struct list_head *head = &offload_base;
4253         int err = -ENOENT;
4254
4255         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4256
4257         if (NAPI_GRO_CB(skb)->count == 1) {
4258                 skb_shinfo(skb)->gso_size = 0;
4259                 goto out;
4260         }
4261
4262         rcu_read_lock();
4263         list_for_each_entry_rcu(ptype, head, list) {
4264                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4265                         continue;
4266
4267                 err = ptype->callbacks.gro_complete(skb, 0);
4268                 break;
4269         }
4270         rcu_read_unlock();
4271
4272         if (err) {
4273                 WARN_ON(&ptype->list == head);
4274                 kfree_skb(skb);
4275                 return NET_RX_SUCCESS;
4276         }
4277
4278 out:
4279         return netif_receive_skb_internal(skb);
4280 }
4281
4282 /* napi->gro_list contains packets ordered by age.
4283  * youngest packets at the head of it.
4284  * Complete skbs in reverse order to reduce latencies.
4285  */
4286 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4287 {
4288         struct sk_buff *skb, *prev = NULL;
4289
4290         /* scan list and build reverse chain */
4291         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4292                 skb->prev = prev;
4293                 prev = skb;
4294         }
4295
4296         for (skb = prev; skb; skb = prev) {
4297                 skb->next = NULL;
4298
4299                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4300                         return;
4301
4302                 prev = skb->prev;
4303                 napi_gro_complete(skb);
4304                 napi->gro_count--;
4305         }
4306
4307         napi->gro_list = NULL;
4308 }
4309 EXPORT_SYMBOL(napi_gro_flush);
4310
4311 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4312 {
4313         struct sk_buff *p;
4314         unsigned int maclen = skb->dev->hard_header_len;
4315         u32 hash = skb_get_hash_raw(skb);
4316
4317         for (p = napi->gro_list; p; p = p->next) {
4318                 unsigned long diffs;
4319
4320                 NAPI_GRO_CB(p)->flush = 0;
4321
4322                 if (hash != skb_get_hash_raw(p)) {
4323                         NAPI_GRO_CB(p)->same_flow = 0;
4324                         continue;
4325                 }
4326
4327                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4328                 diffs |= p->vlan_tci ^ skb->vlan_tci;
4329                 diffs |= skb_metadata_dst_cmp(p, skb);
4330                 if (maclen == ETH_HLEN)
4331                         diffs |= compare_ether_header(skb_mac_header(p),
4332                                                       skb_mac_header(skb));
4333                 else if (!diffs)
4334                         diffs = memcmp(skb_mac_header(p),
4335                                        skb_mac_header(skb),
4336                                        maclen);
4337                 NAPI_GRO_CB(p)->same_flow = !diffs;
4338         }
4339 }
4340
4341 static void skb_gro_reset_offset(struct sk_buff *skb)
4342 {
4343         const struct skb_shared_info *pinfo = skb_shinfo(skb);
4344         const skb_frag_t *frag0 = &pinfo->frags[0];
4345
4346         NAPI_GRO_CB(skb)->data_offset = 0;
4347         NAPI_GRO_CB(skb)->frag0 = NULL;
4348         NAPI_GRO_CB(skb)->frag0_len = 0;
4349
4350         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4351             pinfo->nr_frags &&
4352             !PageHighMem(skb_frag_page(frag0))) {
4353                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4354                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
4355         }
4356 }
4357
4358 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4359 {
4360         struct skb_shared_info *pinfo = skb_shinfo(skb);
4361
4362         BUG_ON(skb->end - skb->tail < grow);
4363
4364         memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4365
4366         skb->data_len -= grow;
4367         skb->tail += grow;
4368
4369         pinfo->frags[0].page_offset += grow;
4370         skb_frag_size_sub(&pinfo->frags[0], grow);
4371
4372         if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4373                 skb_frag_unref(skb, 0);
4374                 memmove(pinfo->frags, pinfo->frags + 1,
4375                         --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4376         }
4377 }
4378
     /*
      * dev_gro_receive - run one received skb through GRO.
      *
      * Looks up the packet_offload handler matching skb->protocol and lets its
      * gro_receive() callback try to merge @skb into a packet held on
      * napi->gro_list.
      *
      * Returns GRO_NORMAL when GRO is not attempted (device without
      * NETIF_F_GRO, GSO skb, frag list, bad checksum, no handler) or the
      * callback asked for a flush; GRO_HELD when @skb was queued on
      * napi->gro_list as a new held packet; GRO_MERGED or GRO_MERGED_FREE
      * (selected by NAPI_GRO_CB(skb)->free) when the callback reported that
      * @skb belongs to an already held flow.
      */
4379 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4380 {
4381         struct sk_buff **pp = NULL;
4382         struct packet_offload *ptype;
4383         __be16 type = skb->protocol;
4384         struct list_head *head = &offload_base;
4385         int same_flow;
4386         enum gro_result ret;
4387         int grow;
4388
4389         if (!(skb->dev->features & NETIF_F_GRO))
4390                 goto normal;
4391
4392         if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4393                 goto normal;
4394
                  /* Mark which held packets are candidates for the same flow. */
4395         gro_list_prepare(napi, skb);
4396
4397         rcu_read_lock();
4398         list_for_each_entry_rcu(ptype, head, list) {
4399                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4400                         continue;
4401
                          /* Start from a clean per-packet GRO control block. */
4402                 skb_set_network_header(skb, skb_gro_offset(skb));
4403                 skb_reset_mac_len(skb);
4404                 NAPI_GRO_CB(skb)->same_flow = 0;
4405                 NAPI_GRO_CB(skb)->flush = 0;
4406                 NAPI_GRO_CB(skb)->free = 0;
4407                 NAPI_GRO_CB(skb)->encap_mark = 0;
4408                 NAPI_GRO_CB(skb)->recursion_counter = 0;
4409                 NAPI_GRO_CB(skb)->is_fou = 0;
4410                 NAPI_GRO_CB(skb)->is_atomic = 1;
4411                 NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4412
4413                 /* Setup for GRO checksum validation */
4414                 switch (skb->ip_summed) {
4415                 case CHECKSUM_COMPLETE:
4416                         NAPI_GRO_CB(skb)->csum = skb->csum;
4417                         NAPI_GRO_CB(skb)->csum_valid = 1;
4418                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4419                         break;
4420                 case CHECKSUM_UNNECESSARY:
4421                         NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4422                         NAPI_GRO_CB(skb)->csum_valid = 0;
4423                         break;
4424                 default:
4425                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4426                         NAPI_GRO_CB(skb)->csum_valid = 0;
4427                 }
4428
4429                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4430                 break;
4431         }
4432         rcu_read_unlock();
4433
                  /* The loop ran to its end: no handler matched this protocol. */
4434         if (&ptype->list == head)
4435                 goto normal;
4436
4437         same_flow = NAPI_GRO_CB(skb)->same_flow;
4438         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4439
                  /* The callback returned a slot of the held list: unlink that
                   * packet and complete it now.
                   */
4440         if (pp) {
4441                 struct sk_buff *nskb = *pp;
4442
4443                 *pp = nskb->next;
4444                 nskb->next = NULL;
4445                 napi_gro_complete(nskb);
4446                 napi->gro_count--;
4447         }
4448
                  /* Same flow: return ret as computed above, skipping the pull. */
4449         if (same_flow)
4450                 goto ok;
4451
4452         if (NAPI_GRO_CB(skb)->flush)
4453                 goto normal;
4454
4455         if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4456                 struct sk_buff *nskb = napi->gro_list;
4457
4458                 /* locate the end of the list to select the 'oldest' flow */
4459                 while (nskb->next) {
4460                         pp = &nskb->next;
4461                         nskb = *pp;
4462                 }
4463                 *pp = NULL;
4464                 nskb->next = NULL;
4465                 napi_gro_complete(nskb);
                          /* one packet out, one in below: gro_count is unchanged */
4466         } else {
4467                 napi->gro_count++;
4468         }
                  /* Queue skb at the head of gro_list as a new held packet. */
4469         NAPI_GRO_CB(skb)->count = 1;
4470         NAPI_GRO_CB(skb)->age = jiffies;
4471         NAPI_GRO_CB(skb)->last = skb;
4472         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4473         skb->next = napi->gro_list;
4474         napi->gro_list = skb;
4475         ret = GRO_HELD;
4476
4477 pull:
                  /* Bytes parsed beyond the linear area are copied out of frag0. */
4478         grow = skb_gro_offset(skb) - skb_headlen(skb);
4479         if (grow > 0)
4480                 gro_pull_from_frag0(skb, grow);
4481 ok:
4482         return ret;
4483
4484 normal:
4485         ret = GRO_NORMAL;
4486         goto pull;
4487 }
4488
4489 struct packet_offload *gro_find_receive_by_type(__be16 type)
4490 {
4491         struct list_head *offload_head = &offload_base;
4492         struct packet_offload *ptype;
4493
4494         list_for_each_entry_rcu(ptype, offload_head, list) {
4495                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4496                         continue;
4497                 return ptype;
4498         }
4499         return NULL;
4500 }
4501 EXPORT_SYMBOL(gro_find_receive_by_type);
4502
4503 struct packet_offload *gro_find_complete_by_type(__be16 type)
4504 {
4505         struct list_head *offload_head = &offload_base;
4506         struct packet_offload *ptype;
4507
4508         list_for_each_entry_rcu(ptype, offload_head, list) {
4509                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4510                         continue;
4511                 return ptype;
4512         }
4513         return NULL;
4514 }
4515 EXPORT_SYMBOL(gro_find_complete_by_type);
4516
4517 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4518 {
4519         switch (ret) {
4520         case GRO_NORMAL:
4521                 if (netif_receive_skb_internal(skb))
4522                         ret = GRO_DROP;
4523                 break;
4524
4525         case GRO_DROP:
4526                 kfree_skb(skb);
4527                 break;
4528
4529         case GRO_MERGED_FREE:
4530                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
4531                         skb_dst_drop(skb);
4532                         kmem_cache_free(skbuff_head_cache, skb);
4533                 } else {
4534                         __kfree_skb(skb);
4535                 }
4536                 break;
4537
4538         case GRO_HELD:
4539         case GRO_MERGED:
4540                 break;
4541         }
4542
4543         return ret;
4544 }
4545
4546 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4547 {
4548         skb_mark_napi_id(skb, napi);
4549         trace_napi_gro_receive_entry(skb);
4550
4551         skb_gro_reset_offset(skb);
4552
4553         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4554 }
4555 EXPORT_SYMBOL(napi_gro_receive);
4556
4557 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4558 {
4559         if (unlikely(skb->pfmemalloc)) {
4560                 consume_skb(skb);
4561                 return;
4562         }
4563         __skb_pull(skb, skb_headlen(skb));
4564         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4565         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4566         skb->vlan_tci = 0;
4567         skb->dev = napi->dev;
4568         skb->skb_iif = 0;
4569         skb->encapsulation = 0;
4570         skb_shinfo(skb)->gso_type = 0;
4571         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4572
4573         napi->skb = skb;
4574 }
4575
4576 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4577 {
4578         struct sk_buff *skb = napi->skb;
4579
4580         if (!skb) {
4581                 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4582                 if (skb) {
4583                         napi->skb = skb;
4584                         skb_mark_napi_id(skb, napi);
4585                 }
4586         }
4587         return skb;
4588 }
4589 EXPORT_SYMBOL(napi_get_frags);
4590
4591 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4592                                       struct sk_buff *skb,
4593                                       gro_result_t ret)
4594 {
4595         switch (ret) {
4596         case GRO_NORMAL:
4597         case GRO_HELD:
4598                 __skb_push(skb, ETH_HLEN);
4599                 skb->protocol = eth_type_trans(skb, skb->dev);
4600                 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4601                         ret = GRO_DROP;
4602                 break;
4603
4604         case GRO_DROP:
4605         case GRO_MERGED_FREE:
4606                 napi_reuse_skb(napi, skb);
4607                 break;
4608
4609         case GRO_MERGED:
4610                 break;
4611         }
4612
4613         return ret;
4614 }
4615
4616 /* Upper GRO stack assumes network header starts at gro_offset=0
4617  * Drivers could call both napi_gro_frags() and napi_gro_receive()
4618  * We copy ethernet header into skb->data to have a common layout.
4619  */
4620 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4621 {
4622         struct sk_buff *skb = napi->skb;
4623         const struct ethhdr *eth;
4624         unsigned int hlen = sizeof(*eth);
4625
4626         napi->skb = NULL;
4627
4628         skb_reset_mac_header(skb);
4629         skb_gro_reset_offset(skb);
4630
4631         eth = skb_gro_header_fast(skb, 0);
4632         if (unlikely(skb_gro_header_hard(skb, hlen))) {
4633                 eth = skb_gro_header_slow(skb, hlen, 0);
4634                 if (unlikely(!eth)) {
4635                         net_warn_ratelimited("%s: dropping impossible skb from %s\n",
4636                                              __func__, napi->dev->name);
4637                         napi_reuse_skb(napi, skb);
4638                         return NULL;
4639                 }
4640         } else {
4641                 gro_pull_from_frag0(skb, hlen);
4642                 NAPI_GRO_CB(skb)->frag0 += hlen;
4643                 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4644         }
4645         __skb_pull(skb, hlen);
4646
4647         /*
4648          * This works because the only protocols we care about don't require
4649          * special handling.
4650          * We'll fix it up properly in napi_frags_finish()
4651          */
4652         skb->protocol = eth->h_proto;
4653
4654         return skb;
4655 }
4656
4657 gro_result_t napi_gro_frags(struct napi_struct *napi)
4658 {
4659         struct sk_buff *skb = napi_frags_skb(napi);
4660
4661         if (!skb)
4662                 return GRO_DROP;
4663
4664         trace_napi_gro_frags_entry(skb);
4665
4666         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4667 }
4668 EXPORT_SYMBOL(napi_gro_frags);
4669
4670 /* Compute the checksum from gro_offset and return the folded value
4671  * after adding in any pseudo checksum.
4672  */
4673 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4674 {
4675         __wsum wsum;
4676         __sum16 sum;
4677
4678         wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4679
4680         /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4681         sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4682         if (likely(!sum)) {
4683                 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4684                     !skb->csum_complete_sw)
4685                         netdev_rx_csum_fault(skb->dev);
4686         }
4687
4688         NAPI_GRO_CB(skb)->csum = wsum;
4689         NAPI_GRO_CB(skb)->csum_valid = 1;
4690
4691         return sum;
4692 }
4693 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4694
/*
 * net_rps_action_and_irq_enable - send the IPIs queued on sd->rps_ipi_list.
 * Note: must be entered with local irqs disabled; irqs are enabled on
 * return.  Without CONFIG_RPS this only enables local irqs.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        struct softnet_data *pending = sd->rps_ipi_list;

        if (!pending) {
                local_irq_enable();
                return;
        }

        /* Detach the whole list while irqs are still off. */
        sd->rps_ipi_list = NULL;

        local_irq_enable();

        /* Send pending IPI's to kick RPS processing on remote cpus. */
        while (pending) {
                struct softnet_data *following = pending->rps_ipi_next;

                if (cpu_online(pending->cpu))
                        smp_call_function_single_async(pending->cpu,
                                                       &pending->csd);
                pending = following;
        }
#else
        local_irq_enable();
#endif
}
4722
4723 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4724 {
4725 #ifdef CONFIG_RPS
4726         return sd->rps_ipi_list != NULL;
4727 #else
4728         return false;
4729 #endif
4730 }
4731
     /*
      * process_backlog - NAPI poll-style handler for the backlog napi that is
      * embedded in struct softnet_data.
      *
      * Delivers up to @quota skbs from sd->process_queue through
      * __netif_receive_skb().  Whenever that queue runs dry it is refilled from
      * sd->input_pkt_queue with local irqs disabled and rps_lock held.  Once
      * input_pkt_queue is found empty the napi state is cleared and the loop
      * ends.
      *
      * Returns the number of skbs processed.
      */
4732 static int process_backlog(struct napi_struct *napi, int quota)
4733 {
4734         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4735         bool again = true;
4736         int work = 0;
4737
4738         /* Check if we have pending IPIs; it's better to send them now
4739          * than to wait for the end of net_rx_action().
4740          */
4741         if (sd_has_rps_ipi_waiting(sd)) {
4742                 local_irq_disable();
4743                 net_rps_action_and_irq_enable(sd);
4744         }
4745
4746         napi->weight = weight_p;
4747         while (again) {
4748                 struct sk_buff *skb;
4749
4750                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4751                         rcu_read_lock();
4752                         __netif_receive_skb(skb);
4753                         rcu_read_unlock();
4754                         input_queue_head_incr(sd);
                                  /* quota used up: return with the napi state untouched */
4755                         if (++work >= quota)
4756                                 return work;
4757
4758                 }
4759
4760                 local_irq_disable();
4761                 rps_lock(sd);
4762                 if (skb_queue_empty(&sd->input_pkt_queue)) {
4763                         /*
4764                          * Inline a custom version of __napi_complete().
4765                          * Only the current cpu owns and manipulates this napi,
4766                          * and NAPI_STATE_SCHED is the only possible flag set
4767                          * on backlog.
4768                          * We can use a plain write instead of clear_bit(),
4769                          * and we don't need an smp_mb() memory barrier.
4770                          */
4771                         napi->state = 0;
4772                         again = false;
4773                 } else {
                                  /* move everything queued so far over for processing */
4774                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
4775                                                    &sd->process_queue);
4776                 }
4777                 rps_unlock(sd);
4778                 local_irq_enable();
4779         }
4780
4781         return work;
4782 }
4783
4784 /**
4785  * __napi_schedule - schedule for receive
4786  * @n: entry to schedule
4787  *
4788  * The entry's receive function will be scheduled to run.
4789  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4790  */
4791 void __napi_schedule(struct napi_struct *n)
4792 {
4793         unsigned long flags;
4794
4795         local_irq_save(flags);
4796         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4797         local_irq_restore(flags);
4798 }
4799 EXPORT_SYMBOL(__napi_schedule);
4800
4801 /**
4802  * __napi_schedule_irqoff - schedule for receive
4803  * @n: entry to schedule
4804  *
4805  * Variant of __napi_schedule() assuming hard irqs are masked
4806  */
4807 void __napi_schedule_irqoff(struct napi_struct *n)
4808 {
4809         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4810 }
4811 EXPORT_SYMBOL(__napi_schedule_irqoff);
4812
     /*
      * __napi_complete - take @n off its poll list and clear NAPI_STATE_SCHED.
      *
      * BUGs if NAPI_STATE_SCHED is not set on entry.  napi_complete_done() in
      * this file calls it with local irqs saved and disabled.
      */
4813 void __napi_complete(struct napi_struct *n)
4814 {
4815         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4816
4817         list_del_init(&n->poll_list);
                  /* barrier between the list update above and the clear_bit() below */
4818         smp_mb__before_atomic();
4819         clear_bit(NAPI_STATE_SCHED, &n->state);
4820 }
4821 EXPORT_SYMBOL(__napi_complete);
4822
4823 void napi_complete_done(struct napi_struct *n, int work_done)
4824 {
4825         unsigned long flags;
4826
4827         /*
4828          * don't let napi dequeue from the cpu poll list
4829          * just in case its running on a different cpu
4830          */
4831         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4832                 return;
4833
4834         if (n->gro_list) {
4835                 unsigned long timeout = 0;
4836
4837                 if (work_done)
4838                         timeout = n->dev->gro_flush_timeout;
4839
4840                 if (timeout)
4841                         hrtimer_start(&n->timer, ns_to_ktime(timeout),
4842                                       HRTIMER_MODE_REL_PINNED);
4843                 else
4844                         napi_gro_flush(n, false);
4845         }
4846         if (likely(list_empty(&n->poll_list))) {
4847                 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4848         } else {
4849                 /* If n->poll_list is not empty, we need to mask irqs */
4850                 local_irq_save(flags);
4851                 __napi_complete(n);
4852                 local_irq_restore(flags);
4853         }
4854 }
4855 EXPORT_SYMBOL(napi_complete_done);
4856
4857 /* must be called under rcu_read_lock(), as we dont take a reference */
4858 static struct napi_struct *napi_by_id(unsigned int napi_id)
4859 {
4860         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4861         struct napi_struct *napi;
4862
4863         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4864                 if (napi->napi_id == napi_id)
4865                         return napi;
4866
4867         return NULL;
4868 }
4869
4870 #if defined(CONFIG_NET_RX_BUSY_POLL)
4871 #define BUSY_POLL_BUDGET 8
     /*
      * sk_busy_loop - busy poll the napi instance recorded in sk->sk_napi_id.
      * @sk: socket to poll for
      * @nonblock: when nonzero, do a single poll pass only
      *
      * Repeatedly invokes the device's ndo_busy_poll() method or, when the
      * driver has none, napi->poll() with a budget of BUSY_POLL_BUDGET.  The
      * loop ends when the socket receive queue is non-empty, @nonblock is set,
      * a reschedule is needed, the busy-loop time limit has passed, or the
      * poll reports LL_FLUSH_FAILED.
      *
      * Returns true when sk->sk_receive_queue holds data on exit, false
      * otherwise (including when no napi matches sk->sk_napi_id).
      */
4872 bool sk_busy_loop(struct sock *sk, int nonblock)
4873 {
4874         unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
4875         int (*busy_poll)(struct napi_struct *dev);
4876         struct napi_struct *napi;
4877         int rc = false;
4878
                  /* napi_by_id() takes no reference: stay inside an RCU section */
4879         rcu_read_lock();
4880
4881         napi = napi_by_id(sk->sk_napi_id);
4882         if (!napi)
4883                 goto out;
4884
4885         /* Note: ndo_busy_poll method is optional in linux-4.5 */
4886         busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
4887
4888         do {
4889                 rc = 0;
4890                 local_bh_disable();
4891                 if (busy_poll) {
4892                         rc = busy_poll(napi);
4893                 } else if (napi_schedule_prep(napi)) {
4894                         void *have = netpoll_poll_lock(napi);
4895
4896                         if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
4897                                 rc = napi->poll(napi, BUSY_POLL_BUDGET);
4898                                 trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
                                          /* whole budget used: complete, then
                                           * schedule the napi again
                                           */
4899                                 if (rc == BUSY_POLL_BUDGET) {
4900                                         napi_complete_done(napi, rc);
4901                                         napi_schedule(napi);
4902                                 }
4903                         }
4904                         netpoll_poll_unlock(have);
4905                 }
4906                 if (rc > 0)
4907                         __NET_ADD_STATS(sock_net(sk),
4908                                         LINUX_MIB_BUSYPOLLRXPACKETS, rc);
4909                 local_bh_enable();
4910
4911                 if (rc == LL_FLUSH_FAILED)
4912                         break; /* permanent failure */
4913
4914                 cpu_relax();
4915         } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) &&
4916                  !need_resched() && !busy_loop_timeout(end_time));
4917
                  /* report whether data is waiting, not the last poll result */
4918         rc = !skb_queue_empty(&sk->sk_receive_queue);
4919 out:
4920         rcu_read_unlock();
4921         return rc;
4922 }
4923 EXPORT_SYMBOL(sk_busy_loop);
4924
4925 #endif /* CONFIG_NET_RX_BUSY_POLL */
4926
4927 void napi_hash_add(struct napi_struct *napi)
4928 {
4929         if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
4930             test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
4931                 return;
4932
4933         spin_lock(&napi_hash_lock);
4934
4935         /* 0..NR_CPUS+1 range is reserved for sender_cpu use */
4936         do {
4937                 if (unlikely(++napi_gen_id < NR_CPUS + 1))
4938                         napi_gen_id = NR_CPUS + 1;
4939         } while (napi_by_id(napi_gen_id));
4940         napi->napi_id = napi_gen_id;
4941
4942         hlist_add_head_rcu(&napi->napi_hash_node,
4943                            &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4944
4945         spin_unlock(&napi_hash_lock);
4946 }
4947 EXPORT_SYMBOL_GPL(napi_hash_add);
4948
4949 /* Warning : caller is responsible to make sure rcu grace period
4950  * is respected before freeing memory containing @napi
4951  */
4952 bool napi_hash_del(struct napi_struct *napi)
4953 {
4954         bool rcu_sync_needed = false;
4955
4956         spin_lock(&napi_hash_lock);
4957
4958         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
4959                 rcu_sync_needed = true;
4960                 hlist_del_rcu(&napi->napi_hash_node);
4961         }
4962         spin_unlock(&napi_hash_lock);
4963         return rcu_sync_needed;
4964 }
4965 EXPORT_SYMBOL_GPL(napi_hash_del);
4966
4967 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
4968 {
4969         struct napi_struct *napi;
4970
4971         napi = container_of(timer, struct napi_struct, timer);
4972         if (napi->gro_list)
4973                 napi_schedule(napi);
4974
4975         return HRTIMER_NORESTART;
4976 }
4977
4978 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4979                     int (*poll)(struct napi_struct *, int), int weight)
4980 {
4981         INIT_LIST_HEAD(&napi->poll_list);
4982         hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
4983         napi->timer.function = napi_watchdog;
4984         napi->gro_count = 0;
4985         napi->gro_list = NULL;
4986         napi->skb = NULL;
4987         napi->poll = poll;
4988         if (weight > NAPI_POLL_WEIGHT)
4989                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4990                             weight, dev->name);
4991         napi->weight = weight;
4992         list_add(&napi->dev_list, &dev->napi_list);
4993         napi->dev = dev;
4994 #ifdef CONFIG_NETPOLL
4995         spin_lock_init(&napi->poll_lock);
4996         napi->poll_owner = -1;
4997 #endif
4998         set_bit(NAPI_STATE_SCHED, &napi->state);
4999         napi_hash_add(napi);
5000 }
5001 EXPORT_SYMBOL(netif_napi_add);
5002
     /*
      * napi_disable - wait until @n can be claimed and keep it claimed.
      *
      * Sets NAPI_STATE_DISABLE, then sleeps in 1 ms steps until it manages to
      * set NAPI_STATE_SCHED and NAPI_STATE_NPSVC itself, cancels the napi
      * hrtimer and finally clears NAPI_STATE_DISABLE again.  SCHED and NPSVC
      * are left set on return.  May sleep.
      */
5003 void napi_disable(struct napi_struct *n)
5004 {
5005         might_sleep();
5006         set_bit(NAPI_STATE_DISABLE, &n->state);
5007
                  /* each loop spins until the bit was clear before we set it */
5008         while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
5009                 msleep(1);
5010         while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
5011                 msleep(1);
5012
5013         hrtimer_cancel(&n->timer);
5014
5015         clear_bit(NAPI_STATE_DISABLE, &n->state);
5016 }
5017 EXPORT_SYMBOL(napi_disable);
5018
5019 /* Must be called in process context */
5020 void netif_napi_del(struct napi_struct *napi)
5021 {
5022         might_sleep();
5023         if (napi_hash_del(napi))
5024                 synchronize_net();
5025         list_del_init(&napi->dev_list);
5026         napi_free_frags(napi);
5027
5028         kfree_skb_list(napi->gro_list);
5029         napi->gro_list = NULL;
5030         napi->gro_count = 0;
5031 }
5032 EXPORT_SYMBOL(netif_napi_del);
5033
     /*
      * napi_poll - run one ->poll() pass for @n on behalf of net_rx_action().
      * @n: napi instance to poll
      * @repoll: list that collects instances needing another pass
      *
      * Removes @n from the list it is on, calls n->poll() with n->weight if
      * NAPI_STATE_SCHED is set and, when the whole weight was consumed, queues
      * @n on @repoll.  Returns the amount of work reported by ->poll(), or 0
      * if it was not called.
      */
5034 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
5035 {
5036         void *have;
5037         int work, weight;
5038
5039         list_del_init(&n->poll_list);
5040
5041         have = netpoll_poll_lock(n);
5042
5043         weight = n->weight;
5044
5045         /* This NAPI_STATE_SCHED test is for avoiding a race
5046          * with netpoll's poll_napi().  Only the entity which
5047          * obtains the lock and sees NAPI_STATE_SCHED set will
5048          * actually make the ->poll() call.  Therefore we avoid
5049          * accidentally calling ->poll() when NAPI is not scheduled.
5050          */
5051         work = 0;
5052         if (test_bit(NAPI_STATE_SCHED, &n->state)) {
5053                 work = n->poll(n, weight);
5054                 trace_napi_poll(n, work, weight);
5055         }
5056
5057         WARN_ON_ONCE(work > weight);
5058
                  /* weight not used up: nothing is queued on @repoll */
5059         if (likely(work < weight))
5060                 goto out_unlock;
5061
5062         /* Drivers must not modify the NAPI state if they
5063          * consume the entire weight.  In such cases this code
5064          * still "owns" the NAPI instance and therefore can
5065          * move the instance around on the list at-will.
5066          */
5067         if (unlikely(napi_disable_pending(n))) {
5068                 napi_complete(n);
5069                 goto out_unlock;
5070         }
5071
5072         if (n->gro_list) {
5073                 /* flush too old packets
5074                  * If HZ < 1000, flush all packets.
5075                  */
5076                 napi_gro_flush(n, HZ >= 1000);
5077         }
5078
5079         /* Some drivers may have called napi_schedule
5080          * prior to exhausting their budget.
5081          */
5082         if (unlikely(!list_empty(&n->poll_list))) {
5083                 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
5084                              n->dev ? n->dev->name : "backlog");
5085                 goto out_unlock;
5086         }
5087
5088         list_add_tail(&n->poll_list, repoll);
5089
5090 out_unlock:
5091         netpoll_poll_unlock(have);
5092
5093         return work;
5094 }
5095
     /*
      * net_rx_action - poll the napi instances queued on this cpu.
      *
      * Takes over sd->poll_list and calls napi_poll() on each entry until the
      * list is empty, the netdev_budget is used up or two jiffies have passed.
      * Whatever is left, together with the instances that asked for another
      * pass, is put back on sd->poll_list and NET_RX_SOFTIRQ is raised again if
      * that list is not empty.  Pending RPS IPIs are sent on the way out.
      */
5096 static __latent_entropy void net_rx_action(struct softirq_action *h)
5097 {
5098         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5099         unsigned long time_limit = jiffies + 2;
5100         int budget = netdev_budget;
5101         LIST_HEAD(list);
5102         LIST_HEAD(repoll);
5103
                  /* grab the whole pending list with irqs off */
5104         local_irq_disable();
5105         list_splice_init(&sd->poll_list, &list);
5106         local_irq_enable();
5107
5108         for (;;) {
5109                 struct napi_struct *n;
5110
5111                 if (list_empty(&list)) {
                                  /* nothing to requeue and no IPI to send: done */
5112                         if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
5113                                 return;
5114                         break;
5115                 }
5116
5117                 n = list_first_entry(&list, struct napi_struct, poll_list);
5118                 budget -= napi_poll(n, &repoll);
5119
5120                 /* If softirq window is exhausted then punt.
5121                  * Allow this to run for 2 jiffies, which will allow
5122                  * an average latency of 1.5/HZ.
5123                  */
5124                 if (unlikely(budget <= 0 ||
5125                              time_after_eq(jiffies, time_limit))) {
5126                         sd->time_squeeze++;
5127                         break;
5128                 }
5129         }
5130
5131         __kfree_skb_flush();
5132         local_irq_disable();
5133
                  /* Resulting sd->poll_list order: unprocessed entries, then entries
                   * queued meanwhile, then the repoll entries.
                   */
5134         list_splice_tail_init(&sd->poll_list, &list);
5135         list_splice_tail(&repoll, &list);
5136         list_splice(&list, &sd->poll_list);
5137         if (!list_empty(&sd->poll_list))
5138                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
5139
                  /* also re-enables local irqs */
5140         net_rps_action_and_irq_enable(sd);
5141 }
5142
     /*
      * One entry of a device adjacency list, as searched by
      * __netdev_find_adj() and read from dev->adj_list.upper by
      * netdev_master_upper_dev_get().
      */
5143 struct netdev_adjacent {
              /* the adjacent device this entry refers to */
5144         struct net_device *dev;
5145
5146         /* upper master flag, there can only be one master device per list */
5147         bool master;
5148
5149         /* counter for the number of times this device was added to us */
5150         u16 ref_nr;
5151
5152         /* private field for the users */
5153         void *private;
5154
              /* linkage into the adjacency list */
5155         struct list_head list;
              /* NOTE(review): presumably used for RCU-deferred freeing - confirm */
5156         struct rcu_head rcu;
5157 };
5158
5159 static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5160                                                  struct list_head *adj_list)
5161 {
5162         struct netdev_adjacent *adj;
5163
5164         list_for_each_entry(adj, adj_list, list) {
5165                 if (adj->dev == adj_dev)
5166                         return adj;
5167         }
5168         return NULL;
5169 }
5170
/*
 * Callback for netdev_walk_all_upper_dev_rcu(): returns nonzero when
 * @upper_dev is the device passed in through @data.
 */
static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
{
        const struct net_device *wanted = data;

        return upper_dev == wanted;
}
5177
5178 /**
5179  * netdev_has_upper_dev - Check if device is linked to an upper device
5180  * @dev: device
5181  * @upper_dev: upper device to check
5182  *
5183  * Find out if a device is linked to the specified upper device and return
5184  * true in case it is. The check is delegated to
5185  * netdev_walk_all_upper_dev_rcu() with __netdev_has_upper_dev() as callback.
      * NOTE(review): earlier wording said that only the immediate upper device
      * is checked; the walker's name suggests the whole upper chain is
      * visited - confirm. The caller must hold the RTNL lock.
5186  */
5187 bool netdev_has_upper_dev(struct net_device *dev,
5188                           struct net_device *upper_dev)
5189 {
5190         ASSERT_RTNL();
5191
5192         return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5193                                              upper_dev);
5194 }
5195 EXPORT_SYMBOL(netdev_has_upper_dev);
5196
5197 /**
5198  * netdev_has_upper_dev_all - Check if device is linked to an upper device
5199  * @dev: device
5200  * @upper_dev: upper device to check
5201  *
5202  * Find out if a device is linked to specified upper device and return true
5203  * in case it is. Note that this checks the entire upper device chain.
5204  * The caller must hold rcu lock.
5205  */
5206
5207 bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
5208                                   struct net_device *upper_dev)
5209 {
5210         return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5211                                                upper_dev);
5212 }
5213 EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
5214
5215 /**
5216  * netdev_has_any_upper_dev - Check if device is linked to some device
5217  * @dev: device
5218  *
5219  * Find out if a device is linked to an upper device and return true in case
5220  * it is. The caller must hold the RTNL lock.
5221  */
5222 static bool netdev_has_any_upper_dev(struct net_device *dev)
5223 {
5224         ASSERT_RTNL();
5225
5226         return !list_empty(&dev->adj_list.upper);
5227 }
5228
5229 /**
5230  * netdev_master_upper_dev_get - Get master upper device
5231  * @dev: device
5232  *
5233  * Find a master upper device and return pointer to it or NULL in case
5234  * it's not there. The caller must hold the RTNL lock.
5235  */
5236 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5237 {
5238         struct netdev_adjacent *upper;
5239
5240         ASSERT_RTNL();
5241
5242         if (list_empty(&dev->adj_list.upper))
5243                 return NULL;
5244
5245         upper = list_first_entry(&dev->adj_list.upper,
5246                                  struct netdev_adjacent, list);
5247         if (likely(upper->master))
5248                 return upper->dev;
5249         return NULL;
5250 }
5251 EXPORT_SYMBOL(netdev_master_upper_dev_get);
5252
5253 /**
5254  * netdev_has_any_lower_dev - Check if device is linked to some device
5255  * @dev: device
5256  *
5257  * Find out if a device is linked to a lower device and return true in case
5258  * it is. The caller must hold the RTNL lock.
5259  */
5260 static bool netdev_has_any_lower_dev(struct net_device *dev)
5261 {
5262         ASSERT_RTNL();
5263
5264         return !list_empty(&dev->adj_list.lower);
5265 }
5266
5267 void *netdev_adjacent_get_private(struct list_head *adj_list)
5268 {
5269         struct netdev_adjacent *adj;
5270
5271         adj = list_entry(adj_list, struct netdev_adjacent, list);
5272
5273         return adj->private;
5274 }
5275 EXPORT_SYMBOL(netdev_adjacent_get_private);
5276
5277 /**
5278  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5279  * @dev: device
5280  * @iter: list_head ** of the current position
5281  *
5282  * Gets the next device from the dev's upper list, starting from iter
5283  * position. The caller must hold RCU read lock.
5284  */
5285 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5286                                                  struct list_head **iter)
5287 {
5288         struct netdev_adjacent *upper;
5289
5290         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5291
5292         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5293
5294         if (&upper->list == &dev->adj_list.upper)
5295                 return NULL;
5296
5297         *iter = &upper->list;
5298
5299         return upper->dev;
5300 }
5301 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
5302
5303 static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
5304                                                     struct list_head **iter)
5305 {
5306         struct netdev_adjacent *upper;
5307
5308         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5309
5310         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5311
5312         if (&upper->list == &dev->adj_list.upper)
5313                 return NULL;
5314
5315         *iter = &upper->list;
5316
5317         return upper->dev;
5318 }
5319
5320 int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
5321                                   int (*fn)(struct net_device *dev,
5322                                             void *data),
5323                                   void *data)
5324 {
5325         struct net_device *udev;
5326         struct list_head *iter;
5327         int ret;
5328
5329         for (iter = &dev->adj_list.upper,
5330              udev = netdev_next_upper_dev_rcu(dev, &iter);
5331              udev;
5332              udev = netdev_next_upper_dev_rcu(dev, &iter)) {
5333                 /* first is the upper device itself */
5334                 ret = fn(udev, data);
5335                 if (ret)
5336                         return ret;
5337
5338                 /* then look at all of its upper devices */
5339                 ret = netdev_walk_all_upper_dev_rcu(udev, fn, data);
5340                 if (ret)
5341                         return ret;
5342         }
5343
5344         return 0;
5345 }
5346 EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
5347
5348 /**
5349  * netdev_lower_get_next_private - Get the next ->private from the
5350  *                                 lower neighbour list
5351  * @dev: device
5352  * @iter: list_head ** of the current position
5353  *
5354  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5355  * list, starting from iter position. The caller must hold either hold the
5356  * RTNL lock or its own locking that guarantees that the neighbour lower
5357  * list will remain unchanged.
5358  */
5359 void *netdev_lower_get_next_private(struct net_device *dev,
5360                                     struct list_head **iter)
5361 {
5362         struct netdev_adjacent *lower;
5363
5364         lower = list_entry(*iter, struct netdev_adjacent, list);
5365
5366         if (&lower->list == &dev->adj_list.lower)
5367                 return NULL;
5368
5369         *iter = lower->list.next;
5370
5371         return lower->private;
5372 }
5373 EXPORT_SYMBOL(netdev_lower_get_next_private);
5374
5375 /**
5376  * netdev_lower_get_next_private_rcu - Get the next ->private from the
5377  *                                     lower neighbour list, RCU
5378  *                                     variant
5379  * @dev: device
5380  * @iter: list_head ** of the current position
5381  *
5382  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5383  * list, starting from iter position. The caller must hold RCU read lock.
5384  */
5385 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5386                                         struct list_head **iter)
5387 {
5388         struct netdev_adjacent *lower;
5389
5390         WARN_ON_ONCE(!rcu_read_lock_held());
5391
5392         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5393
5394         if (&lower->list == &dev->adj_list.lower)
5395                 return NULL;
5396
5397         *iter = &lower->list;
5398
5399         return lower->private;
5400 }
5401 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5402
5403 /**
5404  * netdev_lower_get_next - Get the next device from the lower neighbour
5405  *                         list
5406  * @dev: device
5407  * @iter: list_head ** of the current position
5408  *
5409  * Gets the next netdev_adjacent from the dev's lower neighbour
5410  * list, starting from iter position. The caller must hold RTNL lock or
5411  * its own locking that guarantees that the neighbour lower
5412  * list will remain unchanged.
5413  */
5414 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5415 {
5416         struct netdev_adjacent *lower;
5417
5418         lower = list_entry(*iter, struct netdev_adjacent, list);
5419
5420         if (&lower->list == &dev->adj_list.lower)
5421                 return NULL;
5422
5423         *iter = lower->list.next;
5424
5425         return lower->dev;
5426 }
5427 EXPORT_SYMBOL(netdev_lower_get_next);
5428
5429 static struct net_device *netdev_next_lower_dev(struct net_device *dev,
5430                                                 struct list_head **iter)
5431 {
5432         struct netdev_adjacent *lower;
5433
5434         lower = list_entry((*iter)->next, struct netdev_adjacent, list);
5435
5436         if (&lower->list == &dev->adj_list.lower)
5437                 return NULL;
5438
5439         *iter = &lower->list;
5440
5441         return lower->dev;
5442 }
5443
5444 int netdev_walk_all_lower_dev(struct net_device *dev,
5445                               int (*fn)(struct net_device *dev,
5446                                         void *data),
5447                               void *data)
5448 {
5449         struct net_device *ldev;
5450         struct list_head *iter;
5451         int ret;
5452
5453         for (iter = &dev->adj_list.lower,
5454              ldev = netdev_next_lower_dev(dev, &iter);
5455              ldev;
5456              ldev = netdev_next_lower_dev(dev, &iter)) {
5457                 /* first is the lower device itself */
5458                 ret = fn(ldev, data);
5459                 if (ret)
5460                         return ret;
5461
5462                 /* then look at all of its lower devices */
5463                 ret = netdev_walk_all_lower_dev(ldev, fn, data);
5464                 if (ret)
5465                         return ret;
5466         }
5467
5468         return 0;
5469 }
5470 EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
5471
5472 static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
5473                                                     struct list_head **iter)
5474 {
5475         struct netdev_adjacent *lower;
5476
5477         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5478         if (&lower->list == &dev->adj_list.lower)
5479                 return NULL;
5480
5481         *iter = &lower->list;
5482
5483         return lower->dev;
5484 }
5485
5486 int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
5487                                   int (*fn)(struct net_device *dev,
5488                                             void *data),
5489                                   void *data)
5490 {
5491         struct net_device *ldev;
5492         struct list_head *iter;
5493         int ret;
5494
5495         for (iter = &dev->adj_list.lower,
5496              ldev = netdev_next_lower_dev_rcu(dev, &iter);
5497              ldev;
5498              ldev = netdev_next_lower_dev_rcu(dev, &iter)) {
5499                 /* first is the lower device itself */
5500                 ret = fn(ldev, data);
5501                 if (ret)
5502                         return ret;
5503
5504                 /* then look at all of its lower devices */
5505                 ret = netdev_walk_all_lower_dev_rcu(ldev, fn, data);
5506                 if (ret)
5507                         return ret;
5508         }
5509
5510         return 0;
5511 }
5512 EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
5513
5514 /**
5515  * netdev_lower_get_first_private_rcu - Get the first ->private from the
5516  *                                     lower neighbour list, RCU
5517  *                                     variant
5518  * @dev: device
5519  *
5520  * Gets the first netdev_adjacent->private from the dev's lower neighbour
5521  * list. The caller must hold RCU read lock.
5522  */
5523 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5524 {
5525         struct netdev_adjacent *lower;
5526
5527         lower = list_first_or_null_rcu(&dev->adj_list.lower,
5528                         struct netdev_adjacent, list);
5529         if (lower)
5530                 return lower->private;
5531         return NULL;
5532 }
5533 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5534
5535 /**
5536  * netdev_master_upper_dev_get_rcu - Get master upper device
5537  * @dev: device
5538  *
5539  * Find a master upper device and return pointer to it or NULL in case
5540  * it's not there. The caller must hold the RCU read lock.
5541  */
5542 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5543 {
5544         struct netdev_adjacent *upper;
5545
5546         upper = list_first_or_null_rcu(&dev->adj_list.upper,
5547                                        struct netdev_adjacent, list);
5548         if (upper && likely(upper->master))
5549                 return upper->dev;
5550         return NULL;
5551 }
5552 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5553
5554 static int netdev_adjacent_sysfs_add(struct net_device *dev,
5555                               struct net_device *adj_dev,
5556                               struct list_head *dev_list)
5557 {
5558         char linkname[IFNAMSIZ+7];
5559         sprintf(linkname, dev_list == &dev->adj_list.upper ?
5560                 "upper_%s" : "lower_%s", adj_dev->name);
5561         return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5562                                  linkname);
5563 }
5564 static void netdev_adjacent_sysfs_del(struct net_device *dev,
5565                                char *name,
5566                                struct list_head *dev_list)
5567 {
5568         char linkname[IFNAMSIZ+7];
5569         sprintf(linkname, dev_list == &dev->adj_list.upper ?
5570                 "upper_%s" : "lower_%s", name);
5571         sysfs_remove_link(&(dev->dev.kobj), linkname);
5572 }
5573
5574 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5575                                                  struct net_device *adj_dev,
5576                                                  struct list_head *dev_list)
5577 {
5578         return (dev_list == &dev->adj_list.upper ||
5579                 dev_list == &dev->adj_list.lower) &&
5580                 net_eq(dev_net(dev), dev_net(adj_dev));
5581 }
5582
5583 static int __netdev_adjacent_dev_insert(struct net_device *dev,
5584                                         struct net_device *adj_dev,
5585                                         struct list_head *dev_list,
5586                                         void *private, bool master)
5587 {
5588         struct netdev_adjacent *adj;
5589         int ret;
5590
5591         adj = __netdev_find_adj(adj_dev, dev_list);
5592
5593         if (adj) {
5594                 adj->ref_nr += 1;
5595                 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
5596                          dev->name, adj_dev->name, adj->ref_nr);
5597
5598                 return 0;
5599         }
5600
5601         adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5602         if (!adj)
5603                 return -ENOMEM;
5604
5605         adj->dev = adj_dev;
5606         adj->master = master;
5607         adj->ref_nr = 1;
5608         adj->private = private;
5609         dev_hold(adj_dev);
5610
5611         pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
5612                  dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
5613
5614         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5615                 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5616                 if (ret)
5617                         goto free_adj;
5618         }
5619
5620         /* Ensure that master link is always the first item in list. */
5621         if (master) {
5622                 ret = sysfs_create_link(&(dev->dev.kobj),
5623                                         &(adj_dev->dev.kobj), "master");
5624                 if (ret)
5625                         goto remove_symlinks;
5626
5627                 list_add_rcu(&adj->list, dev_list);
5628         } else {
5629                 list_add_tail_rcu(&adj->list, dev_list);
5630         }
5631
5632         return 0;
5633
5634 remove_symlinks:
5635         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5636                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5637 free_adj:
5638         kfree(adj);
5639         dev_put(adj_dev);
5640
5641         return ret;
5642 }
5643
5644 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5645                                          struct net_device *adj_dev,
5646                                          u16 ref_nr,
5647                                          struct list_head *dev_list)
5648 {
5649         struct netdev_adjacent *adj;
5650
5651         pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
5652                  dev->name, adj_dev->name, ref_nr);
5653
5654         adj = __netdev_find_adj(adj_dev, dev_list);
5655
5656         if (!adj) {
5657                 pr_err("Adjacency does not exist for device %s from %s\n",
5658                        dev->name, adj_dev->name);
5659                 WARN_ON(1);
5660                 return;
5661         }
5662
5663         if (adj->ref_nr > ref_nr) {
5664                 pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
5665                          dev->name, adj_dev->name, ref_nr,
5666                          adj->ref_nr - ref_nr);
5667                 adj->ref_nr -= ref_nr;
5668                 return;
5669         }
5670
5671         if (adj->master)
5672                 sysfs_remove_link(&(dev->dev.kobj), "master");
5673
5674         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5675                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5676
5677         list_del_rcu(&adj->list);
5678         pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
5679                  adj_dev->name, dev->name, adj_dev->name);
5680         dev_put(adj_dev);
5681         kfree_rcu(adj, rcu);
5682 }
5683
5684 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5685                                             struct net_device *upper_dev,
5686                                             struct list_head *up_list,
5687                                             struct list_head *down_list,
5688                                             void *private, bool master)
5689 {
5690         int ret;
5691
5692         ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
5693                                            private, master);
5694         if (ret)
5695                 return ret;
5696
5697         ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
5698                                            private, false);
5699         if (ret) {
5700                 __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
5701                 return ret;
5702         }
5703
5704         return 0;
5705 }
5706
5707 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5708                                                struct net_device *upper_dev,
5709                                                u16 ref_nr,
5710                                                struct list_head *up_list,
5711                                                struct list_head *down_list)
5712 {
5713         __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5714         __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
5715 }
5716
5717 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5718                                                 struct net_device *upper_dev,
5719                                                 void *private, bool master)
5720 {
5721         return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5722                                                 &dev->adj_list.upper,
5723                                                 &upper_dev->adj_list.lower,
5724                                                 private, master);
5725 }
5726
5727 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5728                                                    struct net_device *upper_dev)
5729 {
5730         __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
5731                                            &dev->adj_list.upper,
5732                                            &upper_dev->adj_list.lower);
5733 }
5734
5735 static int __netdev_upper_dev_link(struct net_device *dev,
5736                                    struct net_device *upper_dev, bool master,
5737                                    void *upper_priv, void *upper_info)
5738 {
5739         struct netdev_notifier_changeupper_info changeupper_info;
5740         int ret = 0;
5741
5742         ASSERT_RTNL();
5743
5744         if (dev == upper_dev)
5745                 return -EBUSY;
5746
5747         /* To prevent loops, check if dev is not upper device to upper_dev. */
5748         if (netdev_has_upper_dev(upper_dev, dev))
5749                 return -EBUSY;
5750
5751         if (netdev_has_upper_dev(dev, upper_dev))
5752                 return -EEXIST;
5753
5754         if (master && netdev_master_upper_dev_get(dev))
5755                 return -EBUSY;
5756
5757         changeupper_info.upper_dev = upper_dev;
5758         changeupper_info.master = master;
5759         changeupper_info.linking = true;
5760         changeupper_info.upper_info = upper_info;
5761
5762         ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5763                                             &changeupper_info.info);
5764         ret = notifier_to_errno(ret);
5765         if (ret)
5766                 return ret;
5767
5768         ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
5769                                                    master);
5770         if (ret)
5771                 return ret;
5772
5773         ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5774                                             &changeupper_info.info);
5775         ret = notifier_to_errno(ret);
5776         if (ret)
5777                 goto rollback;
5778
5779         return 0;
5780
5781 rollback:
5782         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5783
5784         return ret;
5785 }
5786
5787 /**
5788  * netdev_upper_dev_link - Add a link to the upper device
5789  * @dev: device
5790  * @upper_dev: new upper device
5791  *
5792  * Adds a link to device which is upper to this one. The caller must hold
5793  * the RTNL lock. On a failure a negative errno code is returned.
5794  * On success the reference counts are adjusted and the function
5795  * returns zero.
5796  */
5797 int netdev_upper_dev_link(struct net_device *dev,
5798                           struct net_device *upper_dev)
5799 {
5800         return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL);
5801 }
5802 EXPORT_SYMBOL(netdev_upper_dev_link);
5803
5804 /**
5805  * netdev_master_upper_dev_link - Add a master link to the upper device
5806  * @dev: device
5807  * @upper_dev: new upper device
5808  * @upper_priv: upper device private
5809  * @upper_info: upper info to be passed down via notifier
5810  *
5811  * Adds a link to device which is upper to this one. In this case, only
5812  * one master upper device can be linked, although other non-master devices
5813  * might be linked as well. The caller must hold the RTNL lock.
5814  * On a failure a negative errno code is returned. On success the reference
5815  * counts are adjusted and the function returns zero.
5816  */
5817 int netdev_master_upper_dev_link(struct net_device *dev,
5818                                  struct net_device *upper_dev,
5819                                  void *upper_priv, void *upper_info)
5820 {
5821         return __netdev_upper_dev_link(dev, upper_dev, true,
5822                                        upper_priv, upper_info);
5823 }
5824 EXPORT_SYMBOL(netdev_master_upper_dev_link);
5825
5826 /**
5827  * netdev_upper_dev_unlink - Removes a link to upper device
5828  * @dev: device
5829  * @upper_dev: new upper device
5830  *
5831  * Removes a link to device which is upper to this one. The caller must hold
5832  * the RTNL lock.
5833  */
5834 void netdev_upper_dev_unlink(struct net_device *dev,
5835                              struct net_device *upper_dev)
5836 {
5837         struct netdev_notifier_changeupper_info changeupper_info;
5838         ASSERT_RTNL();
5839
5840         changeupper_info.upper_dev = upper_dev;
5841         changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
5842         changeupper_info.linking = false;
5843
5844         call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5845                                       &changeupper_info.info);
5846
5847         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5848
5849         call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5850                                       &changeupper_info.info);
5851 }
5852 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5853
5854 /**
5855  * netdev_bonding_info_change - Dispatch event about slave change
5856  * @dev: device
5857  * @bonding_info: info to dispatch
5858  *
5859  * Send NETDEV_BONDING_INFO to netdev notifiers with info.
5860  * The caller must hold the RTNL lock.
5861  */
5862 void netdev_bonding_info_change(struct net_device *dev,
5863                                 struct netdev_bonding_info *bonding_info)
5864 {
5865         struct netdev_notifier_bonding_info     info;
5866
5867         memcpy(&info.bonding_info, bonding_info,
5868                sizeof(struct netdev_bonding_info));
5869         call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
5870                                       &info.info);
5871 }
5872 EXPORT_SYMBOL(netdev_bonding_info_change);
5873
5874 static void netdev_adjacent_add_links(struct net_device *dev)
5875 {
5876         struct netdev_adjacent *iter;
5877
5878         struct net *net = dev_net(dev);
5879
5880         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5881                 if (!net_eq(net, dev_net(iter->dev)))
5882                         continue;
5883                 netdev_adjacent_sysfs_add(iter->dev, dev,
5884                                           &iter->dev->adj_list.lower);
5885                 netdev_adjacent_sysfs_add(dev, iter->dev,
5886                                           &dev->adj_list.upper);
5887         }
5888
5889         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5890                 if (!net_eq(net, dev_net(iter->dev)))
5891                         continue;
5892                 netdev_adjacent_sysfs_add(iter->dev, dev,
5893                                           &iter->dev->adj_list.upper);
5894                 netdev_adjacent_sysfs_add(dev, iter->dev,
5895                                           &dev->adj_list.lower);
5896         }
5897 }
5898
5899 static void netdev_adjacent_del_links(struct net_device *dev)
5900 {
5901         struct netdev_adjacent *iter;
5902
5903         struct net *net = dev_net(dev);
5904
5905         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5906                 if (!net_eq(net, dev_net(iter->dev)))
5907                         continue;
5908                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5909                                           &iter->dev->adj_list.lower);
5910                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5911                                           &dev->adj_list.upper);
5912         }
5913
5914         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5915                 if (!net_eq(net, dev_net(iter->dev)))
5916                         continue;
5917                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5918                                           &iter->dev->adj_list.upper);
5919                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5920                                           &dev->adj_list.lower);
5921         }
5922 }
5923
5924 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5925 {
5926         struct netdev_adjacent *iter;
5927
5928         struct net *net = dev_net(dev);
5929
5930         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5931                 if (!net_eq(net, dev_net(iter->dev)))
5932                         continue;
5933                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5934                                           &iter->dev->adj_list.lower);
5935                 netdev_adjacent_sysfs_add(iter->dev, dev,
5936                                           &iter->dev->adj_list.lower);
5937         }
5938
5939         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5940                 if (!net_eq(net, dev_net(iter->dev)))
5941                         continue;
5942                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5943                                           &iter->dev->adj_list.upper);
5944                 netdev_adjacent_sysfs_add(iter->dev, dev,
5945                                           &iter->dev->adj_list.upper);
5946         }
5947 }
5948
5949 void *netdev_lower_dev_get_private(struct net_device *dev,
5950                                    struct net_device *lower_dev)
5951 {
5952         struct netdev_adjacent *lower;
5953
5954         if (!lower_dev)
5955                 return NULL;
5956         lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
5957         if (!lower)
5958                 return NULL;
5959
5960         return lower->private;
5961 }
5962 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5963
5964
5965 int dev_get_nest_level(struct net_device *dev)
5966 {
5967         struct net_device *lower = NULL;
5968         struct list_head *iter;
5969         int max_nest = -1;
5970         int nest;
5971
5972         ASSERT_RTNL();
5973
5974         netdev_for_each_lower_dev(dev, lower, iter) {
5975                 nest = dev_get_nest_level(lower);
5976                 if (max_nest < nest)
5977                         max_nest = nest;
5978         }
5979
5980         return max_nest + 1;
5981 }
5982 EXPORT_SYMBOL(dev_get_nest_level);
5983
5984 /**
5985  * netdev_lower_change - Dispatch event about lower device state change
5986  * @lower_dev: device
5987  * @lower_state_info: state to dispatch
5988  *
5989  * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
5990  * The caller must hold the RTNL lock.
5991  */
5992 void netdev_lower_state_changed(struct net_device *lower_dev,
5993                                 void *lower_state_info)
5994 {
5995         struct netdev_notifier_changelowerstate_info changelowerstate_info;
5996
5997         ASSERT_RTNL();
5998         changelowerstate_info.lower_state_info = lower_state_info;
5999         call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev,
6000                                       &changelowerstate_info.info);
6001 }
6002 EXPORT_SYMBOL(netdev_lower_state_changed);
6003
6004 int netdev_default_l2upper_neigh_construct(struct net_device *dev,
6005                                            struct neighbour *n)
6006 {
6007         struct net_device *lower_dev, *stop_dev;
6008         struct list_head *iter;
6009         int err;
6010
6011         netdev_for_each_lower_dev(dev, lower_dev, iter) {
6012                 if (!lower_dev->netdev_ops->ndo_neigh_construct)
6013                         continue;
6014                 err = lower_dev->netdev_ops->ndo_neigh_construct(lower_dev, n);
6015                 if (err) {
6016                         stop_dev = lower_dev;
6017                         goto rollback;
6018                 }
6019         }
6020         return 0;
6021
6022 rollback:
6023         netdev_for_each_lower_dev(dev, lower_dev, iter) {
6024                 if (lower_dev == stop_dev)
6025                         break;
6026                 if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6027                         continue;
6028                 lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6029         }
6030         return err;
6031 }
6032 EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_construct);
6033
6034 void netdev_default_l2upper_neigh_destroy(struct net_device *dev,
6035                                           struct neighbour *n)
6036 {
6037         struct net_device *lower_dev;
6038         struct list_head *iter;
6039
6040         netdev_for_each_lower_dev(dev, lower_dev, iter) {
6041                 if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6042                         continue;
6043                 lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6044         }
6045 }
6046 EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_destroy);
6047
6048 static void dev_change_rx_flags(struct net_device *dev, int flags)
6049 {
6050         const struct net_device_ops *ops = dev->netdev_ops;
6051
6052         if (ops->ndo_change_rx_flags)
6053                 ops->ndo_change_rx_flags(dev, flags);
6054 }
6055
6056 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
6057 {
6058         unsigned int old_flags = dev->flags;
6059         kuid_t uid;
6060         kgid_t gid;
6061
6062         ASSERT_RTNL();
6063
6064         dev->flags |= IFF_PROMISC;
6065         dev->promiscuity += inc;
6066         if (dev->promiscuity == 0) {
6067                 /*
6068                  * Avoid overflow.
6069                  * If inc causes overflow, untouch promisc and return error.
6070                  */
6071                 if (inc < 0)
6072                         dev->flags &= ~IFF_PROMISC;
6073                 else {
6074                         dev->promiscuity -= inc;
6075                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
6076                                 dev->name);
6077                         return -EOVERFLOW;
6078                 }
6079         }
6080         if (dev->flags != old_flags) {
6081                 pr_info("device %s %s promiscuous mode\n",
6082                         dev->name,
6083                         dev->flags & IFF_PROMISC ? "entered" : "left");
6084                 if (audit_enabled) {
6085                         current_uid_gid(&uid, &gid);
6086                         audit_log(current->audit_context, GFP_ATOMIC,
6087                                 AUDIT_ANOM_PROMISCUOUS,
6088                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
6089                                 dev->name, (dev->flags & IFF_PROMISC),
6090                                 (old_flags & IFF_PROMISC),
6091                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
6092                                 from_kuid(&init_user_ns, uid),
6093                                 from_kgid(&init_user_ns, gid),
6094                                 audit_get_sessionid(current));
6095                 }
6096
6097                 dev_change_rx_flags(dev, IFF_PROMISC);
6098         }
6099         if (notify)
6100                 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
6101         return 0;
6102 }
6103
6104 /**
6105  *      dev_set_promiscuity     - update promiscuity count on a device
6106  *      @dev: device
6107  *      @inc: modifier
6108  *
6109  *      Add or remove promiscuity from a device. While the count in the device
6110  *      remains above zero the interface remains promiscuous. Once it hits zero
6111  *      the device reverts back to normal filtering operation. A negative inc
6112  *      value is used to drop promiscuity on the device.
6113  *      Return 0 if successful or a negative errno code on error.
6114  */
6115 int dev_set_promiscuity(struct net_device *dev, int inc)
6116 {
6117         unsigned int old_flags = dev->flags;
6118         int err;
6119
6120         err = __dev_set_promiscuity(dev, inc, true);
6121         if (err < 0)
6122                 return err;
6123         if (dev->flags != old_flags)
6124                 dev_set_rx_mode(dev);
6125         return err;
6126 }
6127 EXPORT_SYMBOL(dev_set_promiscuity);
6128
6129 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
6130 {
6131         unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
6132
6133         ASSERT_RTNL();
6134
6135         dev->flags |= IFF_ALLMULTI;
6136         dev->allmulti += inc;
6137         if (dev->allmulti == 0) {
6138                 /*
6139                  * Avoid overflow.
6140                  * If inc causes overflow, untouch allmulti and return error.
6141                  */
6142                 if (inc < 0)
6143                         dev->flags &= ~IFF_ALLMULTI;
6144                 else {
6145                         dev->allmulti -= inc;
6146                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
6147                                 dev->name);
6148                         return -EOVERFLOW;
6149                 }
6150         }
6151         if (dev->flags ^ old_flags) {
6152                 dev_change_rx_flags(dev, IFF_ALLMULTI);
6153                 dev_set_rx_mode(dev);
6154                 if (notify)
6155                         __dev_notify_flags(dev, old_flags,
6156                                            dev->gflags ^ old_gflags);
6157         }
6158         return 0;
6159 }
6160
6161 /**
6162  *      dev_set_allmulti        - update allmulti count on a device
6163  *      @dev: device
6164  *      @inc: modifier
6165  *
6166  *      Add or remove reception of all multicast frames to a device. While the
6167  *      count in the device remains above zero the interface remains listening
6168  *      to all interfaces. Once it hits zero the device reverts back to normal
6169  *      filtering operation. A negative @inc value is used to drop the counter
6170  *      when releasing a resource needing all multicasts.
6171  *      Return 0 if successful or a negative errno code on error.
6172  */
6173
6174 int dev_set_allmulti(struct net_device *dev, int inc)
6175 {
6176         return __dev_set_allmulti(dev, inc, true);
6177 }
6178 EXPORT_SYMBOL(dev_set_allmulti);
6179
6180 /*
6181  *      Upload unicast and multicast address lists to device and
6182  *      configure RX filtering. When the device doesn't support unicast
6183  *      filtering it is put in promiscuous mode while unicast addresses
6184  *      are present.
6185  */
6186 void __dev_set_rx_mode(struct net_device *dev)
6187 {
6188         const struct net_device_ops *ops = dev->netdev_ops;
6189
6190         /* dev_open will call this function so the list will stay sane. */
6191         if (!(dev->flags&IFF_UP))
6192                 return;
6193
6194         if (!netif_device_present(dev))
6195                 return;
6196
6197         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
6198                 /* Unicast addresses changes may only happen under the rtnl,
6199                  * therefore calling __dev_set_promiscuity here is safe.
6200                  */
6201                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
6202                         __dev_set_promiscuity(dev, 1, false);
6203                         dev->uc_promisc = true;
6204                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
6205                         __dev_set_promiscuity(dev, -1, false);
6206                         dev->uc_promisc = false;
6207                 }
6208         }
6209
6210         if (ops->ndo_set_rx_mode)
6211                 ops->ndo_set_rx_mode(dev);
6212 }
6213
/*
 * Locked variant of __dev_set_rx_mode(): the call is made between
 * netif_addr_lock_bh() and netif_addr_unlock_bh() on @dev.
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}
6220
6221 /**
6222  *      dev_get_flags - get flags reported to userspace
6223  *      @dev: device
6224  *
6225  *      Get the combination of flag bits exported through APIs to userspace.
6226  */
6227 unsigned int dev_get_flags(const struct net_device *dev)
6228 {
6229         unsigned int flags;
6230
6231         flags = (dev->flags & ~(IFF_PROMISC |
6232                                 IFF_ALLMULTI |
6233                                 IFF_RUNNING |
6234                                 IFF_LOWER_UP |
6235                                 IFF_DORMANT)) |
6236                 (dev->gflags & (IFF_PROMISC |
6237                                 IFF_ALLMULTI));
6238
6239         if (netif_running(dev)) {
6240                 if (netif_oper_up(dev))
6241                         flags |= IFF_RUNNING;
6242                 if (netif_carrier_ok(dev))
6243                         flags |= IFF_LOWER_UP;
6244                 if (netif_dormant(dev))
6245                         flags |= IFF_DORMANT;
6246         }
6247
6248         return flags;
6249 }
6250 EXPORT_SYMBOL(dev_get_flags);
6251
6252 int __dev_change_flags(struct net_device *dev, unsigned int flags)
6253 {
6254         unsigned int old_flags = dev->flags;
6255         int ret;
6256
6257         ASSERT_RTNL();
6258
6259         /*
6260          *      Set the flags on our device.
6261          */
6262
6263         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
6264                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
6265                                IFF_AUTOMEDIA)) |
6266                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
6267                                     IFF_ALLMULTI));
6268
6269         /*
6270          *      Load in the correct multicast list now the flags have changed.
6271          */
6272
6273         if ((old_flags ^ flags) & IFF_MULTICAST)
6274                 dev_change_rx_flags(dev, IFF_MULTICAST);
6275
6276         dev_set_rx_mode(dev);
6277
6278         /*
6279          *      Have we downed the interface. We handle IFF_UP ourselves
6280          *      according to user attempts to set it, rather than blindly
6281          *      setting it.
6282          */
6283
6284         ret = 0;
6285         if ((old_flags ^ flags) & IFF_UP)
6286                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
6287
6288         if ((flags ^ dev->gflags) & IFF_PROMISC) {
6289                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
6290                 unsigned int old_flags = dev->flags;
6291
6292                 dev->gflags ^= IFF_PROMISC;
6293
6294                 if (__dev_set_promiscuity(dev, inc, false) >= 0)
6295                         if (dev->flags != old_flags)
6296                                 dev_set_rx_mode(dev);
6297         }
6298
6299         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6300            is important. Some (broken) drivers set IFF_PROMISC, when
6301            IFF_ALLMULTI is requested not asking us and not reporting.
6302          */
6303         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
6304                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
6305
6306                 dev->gflags ^= IFF_ALLMULTI;
6307                 __dev_set_allmulti(dev, inc, false);
6308         }
6309
6310         return ret;
6311 }
6312
6313 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
6314                         unsigned int gchanges)
6315 {
6316         unsigned int changes = dev->flags ^ old_flags;
6317
6318         if (gchanges)
6319                 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
6320
6321         if (changes & IFF_UP) {
6322                 if (dev->flags & IFF_UP)
6323                         call_netdevice_notifiers(NETDEV_UP, dev);
6324                 else
6325                         call_netdevice_notifiers(NETDEV_DOWN, dev);
6326         }
6327
6328         if (dev->flags & IFF_UP &&
6329             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
6330                 struct netdev_notifier_change_info change_info;
6331
6332                 change_info.flags_changed = changes;
6333                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
6334                                               &change_info.info);
6335         }
6336 }
6337
6338 /**
6339  *      dev_change_flags - change device settings
6340  *      @dev: device
6341  *      @flags: device state flags
6342  *
6343  *      Change settings on device based state flags. The flags are
6344  *      in the userspace exported format.
6345  */
6346 int dev_change_flags(struct net_device *dev, unsigned int flags)
6347 {
6348         int ret;
6349         unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
6350
6351         ret = __dev_change_flags(dev, flags);
6352         if (ret < 0)
6353                 return ret;
6354
6355         changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
6356         __dev_notify_flags(dev, old_flags, changes);
6357         return ret;
6358 }
6359 EXPORT_SYMBOL(dev_change_flags);
6360
6361 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
6362 {
6363         const struct net_device_ops *ops = dev->netdev_ops;
6364
6365         if (ops->ndo_change_mtu)
6366                 return ops->ndo_change_mtu(dev, new_mtu);
6367
6368         dev->mtu = new_mtu;
6369         return 0;
6370 }
6371
6372 /**
6373  *      dev_set_mtu - Change maximum transfer unit
6374  *      @dev: device
6375  *      @new_mtu: new transfer unit
6376  *
6377  *      Change the maximum transfer size of the network device.
6378  */
6379 int dev_set_mtu(struct net_device *dev, int new_mtu)
6380 {
6381         int err, orig_mtu;
6382
6383         if (new_mtu == dev->mtu)
6384                 return 0;
6385
6386         /* MTU must be positive, and in range */
6387         if (new_mtu < 0 || new_mtu < dev->min_mtu) {
6388                 net_err_ratelimited("%s: Invalid MTU %d requested, hw min %d\n",
6389                                     dev->name, new_mtu, dev->min_mtu);
6390                 return -EINVAL;
6391         }
6392
6393         if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
6394                 net_err_ratelimited("%s: Invalid MTU %d requested, hw max %d\n",
6395                                     dev->name, new_mtu, dev->max_mtu);
6396                 return -EINVAL;
6397         }
6398
6399         if (!netif_device_present(dev))
6400                 return -ENODEV;
6401
6402         err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
6403         err = notifier_to_errno(err);
6404         if (err)
6405                 return err;
6406
6407         orig_mtu = dev->mtu;
6408         err = __dev_set_mtu(dev, new_mtu);
6409
6410         if (!err) {
6411                 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6412                 err = notifier_to_errno(err);
6413                 if (err) {
6414                         /* setting mtu back and notifying everyone again,
6415                          * so that they have a chance to revert changes.
6416                          */
6417                         __dev_set_mtu(dev, orig_mtu);
6418                         call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6419                 }
6420         }
6421         return err;
6422 }
6423 EXPORT_SYMBOL(dev_set_mtu);
6424
6425 /**
6426  *      dev_set_group - Change group this device belongs to
6427  *      @dev: device
6428  *      @new_group: group this device should belong to
6429  */
6430 void dev_set_group(struct net_device *dev, int new_group)
6431 {
6432         dev->group = new_group;
6433 }
6434 EXPORT_SYMBOL(dev_set_group);
6435
6436 /**
6437  *      dev_set_mac_address - Change Media Access Control Address
6438  *      @dev: device
6439  *      @sa: new address
6440  *
6441  *      Change the hardware (MAC) address of the device
6442  */
6443 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6444 {
6445         const struct net_device_ops *ops = dev->netdev_ops;
6446         int err;
6447
6448         if (!ops->ndo_set_mac_address)
6449                 return -EOPNOTSUPP;
6450         if (sa->sa_family != dev->type)
6451                 return -EINVAL;
6452         if (!netif_device_present(dev))
6453                 return -ENODEV;
6454         err = ops->ndo_set_mac_address(dev, sa);
6455         if (err)
6456                 return err;
6457         dev->addr_assign_type = NET_ADDR_SET;
6458         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6459         add_device_randomness(dev->dev_addr, dev->addr_len);
6460         return 0;
6461 }
6462 EXPORT_SYMBOL(dev_set_mac_address);
6463
6464 /**
6465  *      dev_change_carrier - Change device carrier
6466  *      @dev: device
6467  *      @new_carrier: new value
6468  *
6469  *      Change device carrier
6470  */
6471 int dev_change_carrier(struct net_device *dev, bool new_carrier)
6472 {
6473         const struct net_device_ops *ops = dev->netdev_ops;
6474
6475         if (!ops->ndo_change_carrier)
6476                 return -EOPNOTSUPP;
6477         if (!netif_device_present(dev))
6478                 return -ENODEV;
6479         return ops->ndo_change_carrier(dev, new_carrier);
6480 }
6481 EXPORT_SYMBOL(dev_change_carrier);
6482
6483 /**
6484  *      dev_get_phys_port_id - Get device physical port ID
6485  *      @dev: device
6486  *      @ppid: port ID
6487  *
6488  *      Get device physical port ID
6489  */
6490 int dev_get_phys_port_id(struct net_device *dev,
6491                          struct netdev_phys_item_id *ppid)
6492 {
6493         const struct net_device_ops *ops = dev->netdev_ops;
6494
6495         if (!ops->ndo_get_phys_port_id)
6496                 return -EOPNOTSUPP;
6497         return ops->ndo_get_phys_port_id(dev, ppid);
6498 }
6499 EXPORT_SYMBOL(dev_get_phys_port_id);
6500
6501 /**
6502  *      dev_get_phys_port_name - Get device physical port name
6503  *      @dev: device
6504  *      @name: port name
6505  *      @len: limit of bytes to copy to name
6506  *
6507  *      Get device physical port name
6508  */
6509 int dev_get_phys_port_name(struct net_device *dev,
6510                            char *name, size_t len)
6511 {
6512         const struct net_device_ops *ops = dev->netdev_ops;
6513
6514         if (!ops->ndo_get_phys_port_name)
6515                 return -EOPNOTSUPP;
6516         return ops->ndo_get_phys_port_name(dev, name, len);
6517 }
6518 EXPORT_SYMBOL(dev_get_phys_port_name);
6519
6520 /**
6521  *      dev_change_proto_down - update protocol port state information
6522  *      @dev: device
6523  *      @proto_down: new value
6524  *
6525  *      This info can be used by switch drivers to set the phys state of the
6526  *      port.
6527  */
6528 int dev_change_proto_down(struct net_device *dev, bool proto_down)
6529 {
6530         const struct net_device_ops *ops = dev->netdev_ops;
6531
6532         if (!ops->ndo_change_proto_down)
6533                 return -EOPNOTSUPP;
6534         if (!netif_device_present(dev))
6535                 return -ENODEV;
6536         return ops->ndo_change_proto_down(dev, proto_down);
6537 }
6538 EXPORT_SYMBOL(dev_change_proto_down);
6539
6540 /**
6541  *      dev_change_xdp_fd - set or clear a bpf program for a device rx path
6542  *      @dev: device
6543  *      @fd: new program fd or negative value to clear
6544  *
6545  *      Set or clear a bpf program for a device
6546  */
6547 int dev_change_xdp_fd(struct net_device *dev, int fd)
6548 {
6549         const struct net_device_ops *ops = dev->netdev_ops;
6550         struct bpf_prog *prog = NULL;
6551         struct netdev_xdp xdp = {};
6552         int err;
6553
6554         if (!ops->ndo_xdp)
6555                 return -EOPNOTSUPP;
6556         if (fd >= 0) {
6557                 prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
6558                 if (IS_ERR(prog))
6559                         return PTR_ERR(prog);
6560         }
6561
6562         xdp.command = XDP_SETUP_PROG;
6563         xdp.prog = prog;
6564         err = ops->ndo_xdp(dev, &xdp);
6565         if (err < 0 && prog)
6566                 bpf_prog_put(prog);
6567
6568         return err;
6569 }
6570 EXPORT_SYMBOL(dev_change_xdp_fd);
6571
6572 /**
6573  *      dev_new_index   -       allocate an ifindex
6574  *      @net: the applicable net namespace
6575  *
6576  *      Returns a suitable unique value for a new device interface
6577  *      number.  The caller must hold the rtnl semaphore or the
6578  *      dev_base_lock to be sure it remains unique.
6579  */
6580 static int dev_new_index(struct net *net)
6581 {
6582         int ifindex = net->ifindex;
6583         for (;;) {
6584                 if (++ifindex <= 0)
6585                         ifindex = 1;
6586                 if (!__dev_get_by_index(net, ifindex))
6587                         return net->ifindex = ifindex;
6588         }
6589 }
6590
6591 /* Delayed registration/unregisteration */
6592 static LIST_HEAD(net_todo_list);
6593 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
6594
6595 static void net_set_todo(struct net_device *dev)
6596 {
6597         list_add_tail(&dev->todo_list, &net_todo_list);
6598         dev_net(dev)->dev_unreg_count++;
6599 }
6600
6601 static void rollback_registered_many(struct list_head *head)
6602 {
6603         struct net_device *dev, *tmp;
6604         LIST_HEAD(close_head);
6605
6606         BUG_ON(dev_boot_phase);
6607         ASSERT_RTNL();
6608
6609         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
6610                 /* Some devices call without registering
6611                  * for initialization unwind. Remove those
6612                  * devices and proceed with the remaining.
6613                  */
6614                 if (dev->reg_state == NETREG_UNINITIALIZED) {
6615                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6616                                  dev->name, dev);
6617
6618                         WARN_ON(1);
6619                         list_del(&dev->unreg_list);
6620                         continue;
6621                 }
6622                 dev->dismantle = true;
6623                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
6624         }
6625
6626         /* If device is running, close it first. */
6627         list_for_each_entry(dev, head, unreg_list)
6628                 list_add_tail(&dev->close_list, &close_head);
6629         dev_close_many(&close_head, true);
6630
6631         list_for_each_entry(dev, head, unreg_list) {
6632                 /* And unlink it from device chain. */
6633                 unlist_netdevice(dev);
6634
6635                 dev->reg_state = NETREG_UNREGISTERING;
6636         }
6637         flush_all_backlogs();
6638
6639         synchronize_net();
6640
6641         list_for_each_entry(dev, head, unreg_list) {
6642                 struct sk_buff *skb = NULL;
6643
6644                 /* Shutdown queueing discipline. */
6645                 dev_shutdown(dev);
6646
6647
6648                 /* Notify protocols, that we are about to destroy
6649                    this device. They should clean all the things.
6650                 */
6651                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6652
6653                 if (!dev->rtnl_link_ops ||
6654                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6655                         skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6656                                                      GFP_KERNEL);
6657
6658                 /*
6659                  *      Flush the unicast and multicast chains
6660                  */
6661                 dev_uc_flush(dev);
6662                 dev_mc_flush(dev);
6663
6664                 if (dev->netdev_ops->ndo_uninit)
6665                         dev->netdev_ops->ndo_uninit(dev);
6666
6667                 if (skb)
6668                         rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6669
6670                 /* Notifier chain MUST detach us all upper devices. */
6671                 WARN_ON(netdev_has_any_upper_dev(dev));
6672                 WARN_ON(netdev_has_any_lower_dev(dev));
6673
6674                 /* Remove entries from kobject tree */
6675                 netdev_unregister_kobject(dev);
6676 #ifdef CONFIG_XPS
6677                 /* Remove XPS queueing entries */
6678                 netif_reset_xps_queues_gt(dev, 0);
6679 #endif
6680         }
6681
6682         synchronize_net();
6683
6684         list_for_each_entry(dev, head, unreg_list)
6685                 dev_put(dev);
6686 }
6687
6688 static void rollback_registered(struct net_device *dev)
6689 {
6690         LIST_HEAD(single);
6691
6692         list_add(&dev->unreg_list, &single);
6693         rollback_registered_many(&single);
6694         list_del(&single);
6695 }
6696
6697 static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
6698         struct net_device *upper, netdev_features_t features)
6699 {
6700         netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6701         netdev_features_t feature;
6702         int feature_bit;
6703
6704         for_each_netdev_feature(&upper_disables, feature_bit) {
6705                 feature = __NETIF_F_BIT(feature_bit);
6706                 if (!(upper->wanted_features & feature)
6707                     && (features & feature)) {
6708                         netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
6709                                    &feature, upper->name);
6710                         features &= ~feature;
6711                 }
6712         }
6713
6714         return features;
6715 }
6716
6717 static void netdev_sync_lower_features(struct net_device *upper,
6718         struct net_device *lower, netdev_features_t features)
6719 {
6720         netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6721         netdev_features_t feature;
6722         int feature_bit;
6723
6724         for_each_netdev_feature(&upper_disables, feature_bit) {
6725                 feature = __NETIF_F_BIT(feature_bit);
6726                 if (!(features & feature) && (lower->features & feature)) {
6727                         netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
6728                                    &feature, lower->name);
6729                         lower->wanted_features &= ~feature;
6730                         netdev_update_features(lower);
6731
6732                         if (unlikely(lower->features & feature))
6733                                 netdev_WARN(upper, "failed to disable %pNF on %s!\n",
6734                                             &feature, lower->name);
6735                 }
6736         }
6737 }
6738
6739 static netdev_features_t netdev_fix_features(struct net_device *dev,
6740         netdev_features_t features)
6741 {
6742         /* Fix illegal checksum combinations */
6743         if ((features & NETIF_F_HW_CSUM) &&
6744             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6745                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6746                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6747         }
6748
6749         /* TSO requires that SG is present as well. */
6750         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6751                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6752                 features &= ~NETIF_F_ALL_TSO;
6753         }
6754
6755         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6756                                         !(features & NETIF_F_IP_CSUM)) {
6757                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6758                 features &= ~NETIF_F_TSO;
6759                 features &= ~NETIF_F_TSO_ECN;
6760         }
6761
6762         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6763                                          !(features & NETIF_F_IPV6_CSUM)) {
6764                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6765                 features &= ~NETIF_F_TSO6;
6766         }
6767
6768         /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
6769         if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
6770                 features &= ~NETIF_F_TSO_MANGLEID;
6771
6772         /* TSO ECN requires that TSO is present as well. */
6773         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6774                 features &= ~NETIF_F_TSO_ECN;
6775
6776         /* Software GSO depends on SG. */
6777         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6778                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6779                 features &= ~NETIF_F_GSO;
6780         }
6781
6782         /* UFO needs SG and checksumming */
6783         if (features & NETIF_F_UFO) {
6784                 /* maybe split UFO into V4 and V6? */
6785                 if (!(features & NETIF_F_HW_CSUM) &&
6786                     ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) !=
6787                      (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
6788                         netdev_dbg(dev,
6789                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
6790                         features &= ~NETIF_F_UFO;
6791                 }
6792
6793                 if (!(features & NETIF_F_SG)) {
6794                         netdev_dbg(dev,
6795                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6796                         features &= ~NETIF_F_UFO;
6797                 }
6798         }
6799
6800         /* GSO partial features require GSO partial be set */
6801         if ((features & dev->gso_partial_features) &&
6802             !(features & NETIF_F_GSO_PARTIAL)) {
6803                 netdev_dbg(dev,
6804                            "Dropping partially supported GSO features since no GSO partial.\n");
6805                 features &= ~dev->gso_partial_features;
6806         }
6807
6808 #ifdef CONFIG_NET_RX_BUSY_POLL
6809         if (dev->netdev_ops->ndo_busy_poll)
6810                 features |= NETIF_F_BUSY_POLL;
6811         else
6812 #endif
6813                 features &= ~NETIF_F_BUSY_POLL;
6814
6815         return features;
6816 }
6817
6818 int __netdev_update_features(struct net_device *dev)
6819 {
6820         struct net_device *upper, *lower;
6821         netdev_features_t features;
6822         struct list_head *iter;
6823         int err = -1;
6824
6825         ASSERT_RTNL();
6826
6827         features = netdev_get_wanted_features(dev);
6828
6829         if (dev->netdev_ops->ndo_fix_features)
6830                 features = dev->netdev_ops->ndo_fix_features(dev, features);
6831
6832         /* driver might be less strict about feature dependencies */
6833         features = netdev_fix_features(dev, features);
6834
6835         /* some features can't be enabled if they're off an an upper device */
6836         netdev_for_each_upper_dev_rcu(dev, upper, iter)
6837                 features = netdev_sync_upper_features(dev, upper, features);
6838
6839         if (dev->features == features)
6840                 goto sync_lower;
6841
6842         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6843                 &dev->features, &features);
6844
6845         if (dev->netdev_ops->ndo_set_features)
6846                 err = dev->netdev_ops->ndo_set_features(dev, features);
6847         else
6848                 err = 0;
6849
6850         if (unlikely(err < 0)) {
6851                 netdev_err(dev,
6852                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
6853                         err, &features, &dev->features);
6854                 /* return non-0 since some features might have changed and
6855                  * it's better to fire a spurious notification than miss it
6856                  */
6857                 return -1;
6858         }
6859
6860 sync_lower:
6861         /* some features must be disabled on lower devices when disabled
6862          * on an upper device (think: bonding master or bridge)
6863          */
6864         netdev_for_each_lower_dev(dev, lower, iter)
6865                 netdev_sync_lower_features(dev, lower, features);
6866
6867         if (!err)
6868                 dev->features = features;
6869
6870         return err < 0 ? 0 : 1;
6871 }
6872
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Re-evaluate dev->features and notify listeners only when
 *	__netdev_update_features() reports a (possible) change. Call this
 *	after driver or hardware dependent conditions that influence the
 *	feature set might have changed.
 */
void netdev_update_features(struct net_device *dev)
{
	int changed = __netdev_update_features(dev);

	if (changed)
		netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
6887
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Re-evaluate dev->features and always send the change
 *	notification, whether or not the set differs. Use this rather
 *	than netdev_update_features() when dev->vlan_features might have
 *	changed as well, so that the change can be propagated to stacked
 *	VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* result deliberately unused: the notification is unconditional */
	(void)__netdev_update_features(dev);
	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
6904
6905 /**
6906  *      netif_stacked_transfer_operstate -      transfer operstate
6907  *      @rootdev: the root or lower level device to transfer state from
6908  *      @dev: the device to transfer operstate to
6909  *
6910  *      Transfer operational state from root to device. This is normally
6911  *      called when a stacking relationship exists between the root
6912  *      device and the device(a leaf device).
6913  */
6914 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6915                                         struct net_device *dev)
6916 {
6917         if (rootdev->operstate == IF_OPER_DORMANT)
6918                 netif_dormant_on(dev);
6919         else
6920                 netif_dormant_off(dev);
6921
6922         if (netif_carrier_ok(rootdev)) {
6923                 if (!netif_carrier_ok(dev))
6924                         netif_carrier_on(dev);
6925         } else {
6926                 if (netif_carrier_ok(dev))
6927                         netif_carrier_off(dev);
6928         }
6929 }
6930 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6931
6932 #ifdef CONFIG_SYSFS
6933 static int netif_alloc_rx_queues(struct net_device *dev)
6934 {
6935         unsigned int i, count = dev->num_rx_queues;
6936         struct netdev_rx_queue *rx;
6937         size_t sz = count * sizeof(*rx);
6938
6939         BUG_ON(count < 1);
6940
6941         rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6942         if (!rx) {
6943                 rx = vzalloc(sz);
6944                 if (!rx)
6945                         return -ENOMEM;
6946         }
6947         dev->_rx = rx;
6948
6949         for (i = 0; i < count; i++)
6950                 rx[i].dev = dev;
6951         return 0;
6952 }
6953 #endif
6954
6955 static void netdev_init_one_queue(struct net_device *dev,
6956                                   struct netdev_queue *queue, void *_unused)
6957 {
6958         /* Initialize queue lock */
6959         spin_lock_init(&queue->_xmit_lock);
6960         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6961         queue->xmit_lock_owner = -1;
6962         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6963         queue->dev = dev;
6964 #ifdef CONFIG_BQL
6965         dql_init(&queue->dql, HZ);
6966 #endif
6967 }
6968
6969 static void netif_free_tx_queues(struct net_device *dev)
6970 {
6971         kvfree(dev->_tx);
6972 }
6973
6974 static int netif_alloc_netdev_queues(struct net_device *dev)
6975 {
6976         unsigned int count = dev->num_tx_queues;
6977         struct netdev_queue *tx;
6978         size_t sz = count * sizeof(*tx);
6979
6980         if (count < 1 || count > 0xffff)
6981                 return -EINVAL;
6982
6983         tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6984         if (!tx) {
6985                 tx = vzalloc(sz);
6986                 if (!tx)
6987                         return -ENOMEM;
6988         }
6989         dev->_tx = tx;
6990
6991         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6992         spin_lock_init(&dev->tx_global_lock);
6993
6994         return 0;
6995 }
6996
6997 void netif_tx_stop_all_queues(struct net_device *dev)
6998 {
6999         unsigned int i;
7000
7001         for (i = 0; i < dev->num_tx_queues; i++) {
7002                 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
7003                 netif_tx_stop_queue(txq);
7004         }
7005 }
7006 EXPORT_SYMBOL(netif_tx_stop_all_queues);
7007
7008 /**
7009  *      register_netdevice      - register a network device
7010  *      @dev: device to register
7011  *
7012  *      Take a completed network device structure and add it to the kernel
7013  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7014  *      chain. 0 is returned on success. A negative errno code is returned
7015  *      on a failure to set up the device, or if the name is a duplicate.
7016  *
7017  *      Callers must hold the rtnl semaphore. You may want
7018  *      register_netdev() instead of this.
7019  *
7020  *      BUGS:
7021  *      The locking appears insufficient to guarantee two parallel registers
7022  *      will not get the same name.
7023  */
7024
7025 int register_netdevice(struct net_device *dev)
7026 {
7027         int ret;
7028         struct net *net = dev_net(dev);
7029
7030         BUG_ON(dev_boot_phase);
7031         ASSERT_RTNL();
7032
7033         might_sleep();
7034
7035         /* When net_device's are persistent, this will be fatal. */
7036         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
7037         BUG_ON(!net);
7038
7039         spin_lock_init(&dev->addr_list_lock);
7040         netdev_set_addr_lockdep_class(dev);
7041
7042         ret = dev_get_valid_name(net, dev, dev->name);
7043         if (ret < 0)
7044                 goto out;
7045
7046         /* Init, if this function is available */
7047         if (dev->netdev_ops->ndo_init) {
7048                 ret = dev->netdev_ops->ndo_init(dev);
7049                 if (ret) {
7050                         if (ret > 0)
7051                                 ret = -EIO;
7052                         goto out;
7053                 }
7054         }
7055
7056         if (((dev->hw_features | dev->features) &
7057              NETIF_F_HW_VLAN_CTAG_FILTER) &&
7058             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
7059              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
7060                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
7061                 ret = -EINVAL;
7062                 goto err_uninit;
7063         }
7064
7065         ret = -EBUSY;
7066         if (!dev->ifindex)
7067                 dev->ifindex = dev_new_index(net);
7068         else if (__dev_get_by_index(net, dev->ifindex))
7069                 goto err_uninit;
7070
7071         /* Transfer changeable features to wanted_features and enable
7072          * software offloads (GSO and GRO).
7073          */
7074         dev->hw_features |= NETIF_F_SOFT_FEATURES;
7075         dev->features |= NETIF_F_SOFT_FEATURES;
7076         dev->wanted_features = dev->features & dev->hw_features;
7077
7078         if (!(dev->flags & IFF_LOOPBACK))
7079                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
7080
7081         /* If IPv4 TCP segmentation offload is supported we should also
7082          * allow the device to enable segmenting the frame with the option
7083          * of ignoring a static IP ID value.  This doesn't enable the
7084          * feature itself but allows the user to enable it later.
7085          */
7086         if (dev->hw_features & NETIF_F_TSO)
7087                 dev->hw_features |= NETIF_F_TSO_MANGLEID;
7088         if (dev->vlan_features & NETIF_F_TSO)
7089                 dev->vlan_features |= NETIF_F_TSO_MANGLEID;
7090         if (dev->mpls_features & NETIF_F_TSO)
7091                 dev->mpls_features |= NETIF_F_TSO_MANGLEID;
7092         if (dev->hw_enc_features & NETIF_F_TSO)
7093                 dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
7094
7095         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
7096          */
7097         dev->vlan_features |= NETIF_F_HIGHDMA;
7098
7099         /* Make NETIF_F_SG inheritable to tunnel devices.
7100          */
7101         dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
7102
7103         /* Make NETIF_F_SG inheritable to MPLS.
7104          */
7105         dev->mpls_features |= NETIF_F_SG;
7106
7107         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
7108         ret = notifier_to_errno(ret);
7109         if (ret)
7110                 goto err_uninit;
7111
7112         ret = netdev_register_kobject(dev);
7113         if (ret)
7114                 goto err_uninit;
7115         dev->reg_state = NETREG_REGISTERED;
7116
7117         __netdev_update_features(dev);
7118
7119         /*
7120          *      Default initial state at registry is that the
7121          *      device is present.
7122          */
7123
7124         set_bit(__LINK_STATE_PRESENT, &dev->state);
7125
7126         linkwatch_init_dev(dev);
7127
7128         dev_init_scheduler(dev);
7129         dev_hold(dev);
7130         list_netdevice(dev);
7131         add_device_randomness(dev->dev_addr, dev->addr_len);
7132
7133         /* If the device has permanent device address, driver should
7134          * set dev_addr and also addr_assign_type should be set to
7135          * NET_ADDR_PERM (default value).
7136          */
7137         if (dev->addr_assign_type == NET_ADDR_PERM)
7138                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
7139
7140         /* Notify protocols, that a new device appeared. */
7141         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
7142         ret = notifier_to_errno(ret);
7143         if (ret) {
7144                 rollback_registered(dev);
7145                 dev->reg_state = NETREG_UNREGISTERED;
7146         }
7147         /*
7148          *      Prevent userspace races by waiting until the network
7149          *      device is fully setup before sending notifications.
7150          */
7151         if (!dev->rtnl_link_ops ||
7152             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
7153                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7154
7155 out:
7156         return ret;
7157
7158 err_uninit:
7159         if (dev->netdev_ops->ndo_uninit)
7160                 dev->netdev_ops->ndo_uninit(dev);
7161         goto out;
7162 }
7163 EXPORT_SYMBOL(register_netdevice);
7164
7165 /**
7166  *      init_dummy_netdev       - init a dummy network device for NAPI
7167  *      @dev: device to init
7168  *
7169  *      This takes a network device structure and initialize the minimum
7170  *      amount of fields so it can be used to schedule NAPI polls without
7171  *      registering a full blown interface. This is to be used by drivers
7172  *      that need to tie several hardware interfaces to a single NAPI
7173  *      poll scheduler due to HW limitations.
7174  */
7175 int init_dummy_netdev(struct net_device *dev)
7176 {
7177         /* Clear everything. Note we don't initialize spinlocks
7178          * are they aren't supposed to be taken by any of the
7179          * NAPI code and this dummy netdev is supposed to be
7180          * only ever used for NAPI polls
7181          */
7182         memset(dev, 0, sizeof(struct net_device));
7183
7184         /* make sure we BUG if trying to hit standard
7185          * register/unregister code path
7186          */
7187         dev->reg_state = NETREG_DUMMY;
7188
7189         /* NAPI wants this */
7190         INIT_LIST_HEAD(&dev->napi_list);
7191
7192         /* a dummy interface is started by default */
7193         set_bit(__LINK_STATE_PRESENT, &dev->state);
7194         set_bit(__LINK_STATE_START, &dev->state);
7195
7196         /* Note : We dont allocate pcpu_refcnt for dummy devices,
7197          * because users of this 'device' dont need to change
7198          * its refcount.
7199          */
7200
7201         return 0;
7202 }
7203 EXPORT_SYMBOL_GPL(init_dummy_netdev);
7204
7205
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Add a completed network device structure to the kernel
 *	interfaces and send a %NETDEV_REGISTER message to the netdev
 *	notifier chain. Returns 0 on success, or a negative errno code
 *	when the device cannot be set up or its name is a duplicate.
 *
 *	This wraps register_netdevice(), taking the rtnl semaphore
 *	around the call; a format string passed to alloc_netdev as the
 *	device name is expanded as part of registration.
 */
int register_netdev(struct net_device *dev)
{
	int ret;

	rtnl_lock();
	ret = register_netdevice(dev);
	rtnl_unlock();

	return ret;
}
EXPORT_SYMBOL(register_netdev);
7229
7230 int netdev_refcnt_read(const struct net_device *dev)
7231 {
7232         int i, refcnt = 0;
7233
7234         for_each_possible_cpu(i)
7235                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
7236         return refcnt;
7237 }
7238 EXPORT_SYMBOL(netdev_refcnt_read);
7239
7240 /**
7241  * netdev_wait_allrefs - wait until all references are gone.
7242  * @dev: target net_device
7243  *
7244  * This is called when unregistering network devices.
7245  *
7246  * Any protocol or device that holds a reference should register
7247  * for netdevice notification, and cleanup and put back the
7248  * reference if they receive an UNREGISTER event.
7249  * We can get stuck here if buggy protocols don't correctly
7250  * call dev_put.
7251  */
7252 static void netdev_wait_allrefs(struct net_device *dev)
7253 {
7254         unsigned long rebroadcast_time, warning_time;
7255         int refcnt;
7256
7257         linkwatch_forget_dev(dev);
7258
7259         rebroadcast_time = warning_time = jiffies;
7260         refcnt = netdev_refcnt_read(dev);
7261
7262         while (refcnt != 0) {
7263                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
7264                         rtnl_lock();
7265
7266                         /* Rebroadcast unregister notification */
7267                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7268
7269                         __rtnl_unlock();
7270                         rcu_barrier();
7271                         rtnl_lock();
7272
7273                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7274                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
7275                                      &dev->state)) {
7276                                 /* We must not have linkwatch events
7277                                  * pending on unregister. If this
7278                                  * happens, we simply run the queue
7279                                  * unscheduled, resulting in a noop
7280                                  * for this device.
7281                                  */
7282                                 linkwatch_run_queue();
7283                         }
7284
7285                         __rtnl_unlock();
7286
7287                         rebroadcast_time = jiffies;
7288                 }
7289
7290                 msleep(250);
7291
7292                 refcnt = netdev_refcnt_read(dev);
7293
7294                 if (time_after(jiffies, warning_time + 10 * HZ)) {
7295                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
7296                                  dev->name, refcnt);
7297                         warning_time = jiffies;
7298                 }
7299         }
7300 }
7301
7302 /* The sequence is:
7303  *
7304  *      rtnl_lock();
7305  *      ...
7306  *      register_netdevice(x1);
7307  *      register_netdevice(x2);
7308  *      ...
7309  *      unregister_netdevice(y1);
7310  *      unregister_netdevice(y2);
7311  *      ...
7312  *      rtnl_unlock();
7313  *      free_netdev(y1);
7314  *      free_netdev(y2);
7315  *
7316  * We are invoked by rtnl_unlock().
7317  * This allows us to deal with problems:
7318  * 1) We can delete sysfs objects which invoke hotplug
7319  *    without deadlocking with linkwatch via keventd.
7320  * 2) Since we run with the RTNL semaphore not held, we can sleep
7321  *    safely in order to wait for the netdev refcnt to drop to zero.
7322  *
7323  * We must not return until all unregister events added during
7324  * the interval the lock was held have been completed.
7325  */
7326 void netdev_run_todo(void)
7327 {
7328         struct list_head list;
7329
7330         /* Snapshot list, allow later requests */
7331         list_replace_init(&net_todo_list, &list);
7332
7333         __rtnl_unlock();
7334
7335
7336         /* Wait for rcu callbacks to finish before next phase */
7337         if (!list_empty(&list))
7338                 rcu_barrier();
7339
7340         while (!list_empty(&list)) {
7341                 struct net_device *dev
7342                         = list_first_entry(&list, struct net_device, todo_list);
7343                 list_del(&dev->todo_list);
7344
7345                 rtnl_lock();
7346                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7347                 __rtnl_unlock();
7348
7349                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7350                         pr_err("network todo '%s' but state %d\n",
7351                                dev->name, dev->reg_state);
7352                         dump_stack();
7353                         continue;
7354                 }
7355
7356                 dev->reg_state = NETREG_UNREGISTERED;
7357
7358                 netdev_wait_allrefs(dev);
7359
7360                 /* paranoia */
7361                 BUG_ON(netdev_refcnt_read(dev));
7362                 BUG_ON(!list_empty(&dev->ptype_all));
7363                 BUG_ON(!list_empty(&dev->ptype_specific));
7364                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
7365                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
7366                 WARN_ON(dev->dn_ptr);
7367
7368                 if (dev->destructor)
7369                         dev->destructor(dev);
7370
7371                 /* Report a network device has been unregistered */
7372                 rtnl_lock();
7373                 dev_net(dev)->dev_unreg_count--;
7374                 __rtnl_unlock();
7375                 wake_up(&netdev_unregistering_wq);
7376
7377                 /* Free network device */
7378                 kobject_put(&dev->dev.kobj);
7379         }
7380 }
7381
7382 /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
7383  * all the same fields in the same order as net_device_stats, with only
7384  * the type differing, but rtnl_link_stats64 may have additional fields
7385  * at the end for newer counters.
7386  */
7387 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
7388                              const struct net_device_stats *netdev_stats)
7389 {
7390 #if BITS_PER_LONG == 64
7391         BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
7392         memcpy(stats64, netdev_stats, sizeof(*stats64));
7393         /* zero out counters that only exist in rtnl_link_stats64 */
7394         memset((char *)stats64 + sizeof(*netdev_stats), 0,
7395                sizeof(*stats64) - sizeof(*netdev_stats));
7396 #else
7397         size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
7398         const unsigned long *src = (const unsigned long *)netdev_stats;
7399         u64 *dst = (u64 *)stats64;
7400
7401         BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
7402         for (i = 0; i < n; i++)
7403                 dst[i] = src[i];
7404         /* zero out counters that only exist in rtnl_link_stats64 */
7405         memset((char *)stats64 + n * sizeof(u64), 0,
7406                sizeof(*stats64) - n * sizeof(u64));
7407 #endif
7408 }
7409 EXPORT_SYMBOL(netdev_stats_to_stats64);
7410
7411 /**
7412  *      dev_get_stats   - get network device statistics
7413  *      @dev: device to get statistics from
7414  *      @storage: place to store stats
7415  *
7416  *      Get network statistics from device. Return @storage.
7417  *      The device driver may provide its own method by setting
7418  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
7419  *      otherwise the internal statistics structure is used.
7420  */
7421 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
7422                                         struct rtnl_link_stats64 *storage)
7423 {
7424         const struct net_device_ops *ops = dev->netdev_ops;
7425
7426         if (ops->ndo_get_stats64) {
7427                 memset(storage, 0, sizeof(*storage));
7428                 ops->ndo_get_stats64(dev, storage);
7429         } else if (ops->ndo_get_stats) {
7430                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
7431         } else {
7432                 netdev_stats_to_stats64(storage, &dev->stats);
7433         }
7434         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
7435         storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
7436         storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler);
7437         return storage;
7438 }
7439 EXPORT_SYMBOL(dev_get_stats);
7440
/*
 * Return the ingress queue of @dev.
 *
 * With CONFIG_NET_CLS_ACT, a missing queue is allocated, initialised with
 * noop_qdisc as both its qdisc and its sleeping qdisc, and published in
 * dev->ingress_queue; NULL is returned if that allocation fails. Without
 * CONFIG_NET_CLS_ACT the result of dev_ingress_queue() is returned as is.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!queue) {
		queue = kzalloc(sizeof(*queue), GFP_KERNEL);
		if (queue) {
			netdev_init_one_queue(dev, queue, NULL);
			RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
			queue->qdisc_sleeping = &noop_qdisc;
			/* publish only once fully initialised */
			rcu_assign_pointer(dev->ingress_queue, queue);
		}
	}
#endif
	return queue;
}
7458
7459 static const struct ethtool_ops default_ethtool_ops;
7460
7461 void netdev_set_default_ethtool_ops(struct net_device *dev,
7462                                     const struct ethtool_ops *ops)
7463 {
7464         if (dev->ethtool_ops == &default_ethtool_ops)
7465                 dev->ethtool_ops = ops;
7466 }
7467 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
7468
7469 void netdev_freemem(struct net_device *dev)
7470 {
7471         char *addr = (char *)dev - dev->padded;
7472
7473         kvfree(addr);
7474 }
7475
7476 /**
7477  *      alloc_netdev_mqs - allocate network device
7478  *      @sizeof_priv:           size of private data to allocate space for
7479  *      @name:                  device name format string
7480  *      @name_assign_type:      origin of device name
7481  *      @setup:                 callback to initialize device
7482  *      @txqs:                  the number of TX subqueues to allocate
7483  *      @rxqs:                  the number of RX subqueues to allocate
7484  *
7485  *      Allocates a struct net_device with private data area for driver use
7486  *      and performs basic initialization.  Also allocates subqueue structs
7487  *      for each queue on the device.
7488  */
7489 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
7490                 unsigned char name_assign_type,
7491                 void (*setup)(struct net_device *),
7492                 unsigned int txqs, unsigned int rxqs)
7493 {
7494         struct net_device *dev;
7495         size_t alloc_size;
7496         struct net_device *p;
7497
7498         BUG_ON(strlen(name) >= sizeof(dev->name));
7499
7500         if (txqs < 1) {
7501                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
7502                 return NULL;
7503         }
7504
7505 #ifdef CONFIG_SYSFS
7506         if (rxqs < 1) {
7507                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
7508                 return NULL;
7509         }
7510 #endif
7511
7512         alloc_size = sizeof(struct net_device);
7513         if (sizeof_priv) {
7514                 /* ensure 32-byte alignment of private area */
7515                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
7516                 alloc_size += sizeof_priv;
7517         }
7518         /* ensure 32-byte alignment of whole construct */
7519         alloc_size += NETDEV_ALIGN - 1;
7520
7521         p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7522         if (!p)
7523                 p = vzalloc(alloc_size);
7524         if (!p)
7525                 return NULL;
7526
7527         dev = PTR_ALIGN(p, NETDEV_ALIGN);
7528         dev->padded = (char *)dev - (char *)p;
7529
7530         dev->pcpu_refcnt = alloc_percpu(int);
7531         if (!dev->pcpu_refcnt)
7532                 goto free_dev;
7533
7534         if (dev_addr_init(dev))
7535                 goto free_pcpu;
7536
7537         dev_mc_init(dev);
7538         dev_uc_init(dev);
7539
7540         dev_net_set(dev, &init_net);
7541
7542         dev->gso_max_size = GSO_MAX_SIZE;
7543         dev->gso_max_segs = GSO_MAX_SEGS;
7544
7545         INIT_LIST_HEAD(&dev->napi_list);
7546         INIT_LIST_HEAD(&dev->unreg_list);
7547         INIT_LIST_HEAD(&dev->close_list);
7548         INIT_LIST_HEAD(&dev->link_watch_list);
7549         INIT_LIST_HEAD(&dev->adj_list.upper);
7550         INIT_LIST_HEAD(&dev->adj_list.lower);
7551         INIT_LIST_HEAD(&dev->ptype_all);
7552         INIT_LIST_HEAD(&dev->ptype_specific);
7553 #ifdef CONFIG_NET_SCHED
7554         hash_init(dev->qdisc_hash);
7555 #endif
7556         dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
7557         setup(dev);
7558
7559         if (!dev->tx_queue_len) {
7560                 dev->priv_flags |= IFF_NO_QUEUE;
7561                 dev->tx_queue_len = 1;
7562         }
7563
7564         dev->num_tx_queues = txqs;
7565         dev->real_num_tx_queues = txqs;
7566         if (netif_alloc_netdev_queues(dev))
7567                 goto free_all;
7568
7569 #ifdef CONFIG_SYSFS
7570         dev->num_rx_queues = rxqs;
7571         dev->real_num_rx_queues = rxqs;
7572         if (netif_alloc_rx_queues(dev))
7573                 goto free_all;
7574 #endif
7575
7576         strcpy(dev->name, name);
7577         dev->name_assign_type = name_assign_type;
7578         dev->group = INIT_NETDEV_GROUP;
7579         if (!dev->ethtool_ops)
7580                 dev->ethtool_ops = &default_ethtool_ops;
7581
7582         nf_hook_ingress_init(dev);
7583
7584         return dev;
7585
7586 free_all:
7587         free_netdev(dev);
7588         return NULL;
7589
7590 free_pcpu:
7591         free_percpu(dev->pcpu_refcnt);
7592 free_dev:
7593         netdev_freemem(dev);
7594         return NULL;
7595 }
7596 EXPORT_SYMBOL(alloc_netdev_mqs);
7597
7598 /**
7599  *      free_netdev - free network device
7600  *      @dev: device
7601  *
7602  *      This function does the last stage of destroying an allocated device
7603  *      interface. The reference to the device object is released.
7604  *      If this is the last reference then it will be freed.
7605  *      Must be called in process context.
7606  */
7607 void free_netdev(struct net_device *dev)
7608 {
7609         struct napi_struct *p, *n;
7610
7611         might_sleep();
7612         netif_free_tx_queues(dev);
7613 #ifdef CONFIG_SYSFS
7614         kvfree(dev->_rx);
7615 #endif
7616
7617         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
7618
7619         /* Flush device addresses */
7620         dev_addr_flush(dev);
7621
7622         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7623                 netif_napi_del(p);
7624
7625         free_percpu(dev->pcpu_refcnt);
7626         dev->pcpu_refcnt = NULL;
7627
7628         /*  Compatibility with error handling in drivers */
7629         if (dev->reg_state == NETREG_UNINITIALIZED) {
7630                 netdev_freemem(dev);
7631                 return;
7632         }
7633
7634         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7635         dev->reg_state = NETREG_RELEASED;
7636
7637         /* will free via device release */
7638         put_device(&dev->dev);
7639 }
7640 EXPORT_SYMBOL(free_netdev);
7641
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait until packets currently being received are done. Packets
 *	that start later are not held back. The expedited RCU variant
 *	is used while the RTNL lock is held.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
7657
7658 /**
7659  *      unregister_netdevice_queue - remove device from the kernel
7660  *      @dev: device
7661  *      @head: list
7662  *
7663  *      This function shuts down a device interface and removes it
7664  *      from the kernel tables.
7665  *      If head not NULL, device is queued to be unregistered later.
7666  *
7667  *      Callers must hold the rtnl semaphore.  You may want
7668  *      unregister_netdev() instead of this.
7669  */
7670
7671 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
7672 {
7673         ASSERT_RTNL();
7674
7675         if (head) {
7676                 list_move_tail(&dev->unreg_list, head);
7677         } else {
7678                 rollback_registered(dev);
7679                 /* Finish processing unregister after unlock */
7680                 net_set_todo(dev);
7681         }
7682 }
7683 EXPORT_SYMBOL(unregister_netdevice_queue);
7684
7685 /**
7686  *      unregister_netdevice_many - unregister many devices
7687  *      @head: list of devices
7688  *
7689  *  Note: As most callers use a stack allocated list_head,
7690  *  we force a list_del() to make sure stack wont be corrupted later.
7691  */
7692 void unregister_netdevice_many(struct list_head *head)
7693 {
7694         struct net_device *dev;
7695
7696         if (!list_empty(head)) {
7697                 rollback_registered_many(head);
7698                 list_for_each_entry(dev, head, unreg_list)
7699                         net_set_todo(dev);
7700                 list_del(head);
7701         }
7702 }
7703 EXPORT_SYMBOL(unregister_netdevice_many);
7704
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shut down a device interface and remove it from the kernel
 *	tables.
 *
 *	This simply wraps unregister_netdevice() in the rtnl semaphore.
 *	In general this is the function to use, rather than
 *	unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	/* unregister_netdevice() must run with RTNL held */
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
7723
7724 /**
7725  *      dev_change_net_namespace - move device to different nethost namespace
7726  *      @dev: device
7727  *      @net: network namespace
7728  *      @pat: If not NULL name pattern to try if the current device name
7729  *            is already taken in the destination network namespace.
7730  *
7731  *      This function shuts down a device interface and moves it
7732  *      to a new network namespace. On success 0 is returned, on
7733  *      a failure a netagive errno code is returned.
7734  *
7735  *      Callers must hold the rtnl semaphore.
7736  */
7737
7738 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7739 {
7740         int err;
7741
7742         ASSERT_RTNL();
7743
7744         /* Don't allow namespace local devices to be moved. */
7745         err = -EINVAL;
7746         if (dev->features & NETIF_F_NETNS_LOCAL)
7747                 goto out;
7748
7749         /* Ensure the device has been registrered */
7750         if (dev->reg_state != NETREG_REGISTERED)
7751                 goto out;
7752
7753         /* Get out if there is nothing todo */
7754         err = 0;
7755         if (net_eq(dev_net(dev), net))
7756                 goto out;
7757
7758         /* Pick the destination device name, and ensure
7759          * we can use it in the destination network namespace.
7760          */
7761         err = -EEXIST;
7762         if (__dev_get_by_name(net, dev->name)) {
7763                 /* We get here if we can't use the current device name */
7764                 if (!pat)
7765                         goto out;
7766                 if (dev_get_valid_name(net, dev, pat) < 0)
7767                         goto out;
7768         }
7769
7770         /*
7771          * And now a mini version of register_netdevice unregister_netdevice.
7772          */
7773
7774         /* If device is running close it first. */
7775         dev_close(dev);
7776
7777         /* And unlink it from device chain */
7778         err = -ENODEV;
7779         unlist_netdevice(dev);
7780
7781         synchronize_net();
7782
7783         /* Shutdown queueing discipline. */
7784         dev_shutdown(dev);
7785
7786         /* Notify protocols, that we are about to destroy
7787            this device. They should clean all the things.
7788
7789            Note that dev->reg_state stays at NETREG_REGISTERED.
7790            This is wanted because this way 8021q and macvlan know
7791            the device is just moving and can keep their slaves up.
7792         */
7793         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7794         rcu_barrier();
7795         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7796         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
7797
7798         /*
7799          *      Flush the unicast and multicast chains
7800          */
7801         dev_uc_flush(dev);
7802         dev_mc_flush(dev);
7803
7804         /* Send a netdev-removed uevent to the old namespace */
7805         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7806         netdev_adjacent_del_links(dev);
7807
7808         /* Actually switch the network namespace */
7809         dev_net_set(dev, net);
7810
7811         /* If there is an ifindex conflict assign a new one */
7812         if (__dev_get_by_index(net, dev->ifindex))
7813                 dev->ifindex = dev_new_index(net);
7814
7815         /* Send a netdev-add uevent to the new namespace */
7816         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7817         netdev_adjacent_add_links(dev);
7818
7819         /* Fixup kobjects */
7820         err = device_rename(&dev->dev, dev->name);
7821         WARN_ON(err);
7822
7823         /* Add the device back in the hashes */
7824         list_netdevice(dev);
7825
7826         /* Notify protocols, that a new device appeared. */
7827         call_netdevice_notifiers(NETDEV_REGISTER, dev);
7828
7829         /*
7830          *      Prevent userspace races by waiting until the network
7831          *      device is fully setup before sending notifications.
7832          */
7833         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7834
7835         synchronize_net();
7836         err = 0;
7837 out:
7838         return err;
7839 }
7840 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
7841
7842 static int dev_cpu_callback(struct notifier_block *nfb,
7843                             unsigned long action,
7844                             void *ocpu)
7845 {
7846         struct sk_buff **list_skb;
7847         struct sk_buff *skb;
7848         unsigned int cpu, oldcpu = (unsigned long)ocpu;
7849         struct softnet_data *sd, *oldsd;
7850
7851         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
7852                 return NOTIFY_OK;
7853
7854         local_irq_disable();
7855         cpu = smp_processor_id();
7856         sd = &per_cpu(softnet_data, cpu);
7857         oldsd = &per_cpu(softnet_data, oldcpu);
7858
7859         /* Find end of our completion_queue. */
7860         list_skb = &sd->completion_queue;
7861         while (*list_skb)
7862                 list_skb = &(*list_skb)->next;
7863         /* Append completion queue from offline CPU. */
7864         *list_skb = oldsd->completion_queue;
7865         oldsd->completion_queue = NULL;
7866
7867         /* Append output queue from offline CPU. */
7868         if (oldsd->output_queue) {
7869                 *sd->output_queue_tailp = oldsd->output_queue;
7870                 sd->output_queue_tailp = oldsd->output_queue_tailp;
7871                 oldsd->output_queue = NULL;
7872                 oldsd->output_queue_tailp = &oldsd->output_queue;
7873         }
7874         /* Append NAPI poll list from offline CPU, with one exception :
7875          * process_backlog() must be called by cpu owning percpu backlog.
7876          * We properly handle process_queue & input_pkt_queue later.
7877          */
7878         while (!list_empty(&oldsd->poll_list)) {
7879                 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7880                                                             struct napi_struct,
7881                                                             poll_list);
7882
7883                 list_del_init(&napi->poll_list);
7884                 if (napi->poll == process_backlog)
7885                         napi->state = 0;
7886                 else
7887                         ____napi_schedule(sd, napi);
7888         }
7889
7890         raise_softirq_irqoff(NET_TX_SOFTIRQ);
7891         local_irq_enable();
7892
7893         /* Process offline CPU's input_pkt_queue */
7894         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7895                 netif_rx_ni(skb);
7896                 input_queue_head_incr(oldsd);
7897         }
7898         while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
7899                 netif_rx_ni(skb);
7900                 input_queue_head_incr(oldsd);
7901         }
7902
7903         return NOTIFY_OK;
7904 }
7905
7906
7907 /**
7908  *      netdev_increment_features - increment feature set by one
7909  *      @all: current feature set
7910  *      @one: new feature set
7911  *      @mask: mask feature set
7912  *
7913  *      Computes a new feature set after adding a device with feature set
7914  *      @one to the master device with current feature set @all.  Will not
7915  *      enable anything that is off in @mask. Returns the new feature set.
7916  */
7917 netdev_features_t netdev_increment_features(netdev_features_t all,
7918         netdev_features_t one, netdev_features_t mask)
7919 {
7920         if (mask & NETIF_F_HW_CSUM)
7921                 mask |= NETIF_F_CSUM_MASK;
7922         mask |= NETIF_F_VLAN_CHALLENGED;
7923
7924         all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
7925         all &= one | ~NETIF_F_ALL_FOR_ALL;
7926
7927         /* If one device supports hw checksumming, set for all. */
7928         if (all & NETIF_F_HW_CSUM)
7929                 all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
7930
7931         return all;
7932 }
7933 EXPORT_SYMBOL(netdev_increment_features);
7934
7935 static struct hlist_head * __net_init netdev_create_hash(void)
7936 {
7937         int i;
7938         struct hlist_head *hash;
7939
7940         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7941         if (hash != NULL)
7942                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
7943                         INIT_HLIST_HEAD(&hash[i]);
7944
7945         return hash;
7946 }
7947
7948 /* Initialize per network namespace state */
7949 static int __net_init netdev_init(struct net *net)
7950 {
7951         if (net != &init_net)
7952                 INIT_LIST_HEAD(&net->dev_base_head);
7953
7954         net->dev_name_head = netdev_create_hash();
7955         if (net->dev_name_head == NULL)
7956                 goto err_name;
7957
7958         net->dev_index_head = netdev_create_hash();
7959         if (net->dev_index_head == NULL)
7960                 goto err_idx;
7961
7962         return 0;
7963
7964 err_idx:
7965         kfree(net->dev_name_head);
7966 err_name:
7967         return -ENOMEM;
7968 }
7969
7970 /**
7971  *      netdev_drivername - network driver for the device
7972  *      @dev: network device
7973  *
7974  *      Determine network driver for device.
7975  */
7976 const char *netdev_drivername(const struct net_device *dev)
7977 {
7978         const struct device_driver *driver;
7979         const struct device *parent;
7980         const char *empty = "";
7981
7982         parent = dev->dev.parent;
7983         if (!parent)
7984                 return empty;
7985
7986         driver = parent->driver;
7987         if (driver && driver->name)
7988                 return driver->name;
7989         return empty;
7990 }
7991
7992 static void __netdev_printk(const char *level, const struct net_device *dev,
7993                             struct va_format *vaf)
7994 {
7995         if (dev && dev->dev.parent) {
7996                 dev_printk_emit(level[1] - '0',
7997                                 dev->dev.parent,
7998                                 "%s %s %s%s: %pV",
7999                                 dev_driver_string(dev->dev.parent),
8000                                 dev_name(dev->dev.parent),
8001                                 netdev_name(dev), netdev_reg_state(dev),
8002                                 vaf);
8003         } else if (dev) {
8004                 printk("%s%s%s: %pV",
8005                        level, netdev_name(dev), netdev_reg_state(dev), vaf);
8006         } else {
8007                 printk("%s(NULL net_device): %pV", level, vaf);
8008         }
8009 }
8010
8011 void netdev_printk(const char *level, const struct net_device *dev,
8012                    const char *format, ...)
8013 {
8014         struct va_format vaf;
8015         va_list args;
8016
8017         va_start(args, format);
8018
8019         vaf.fmt = format;
8020         vaf.va = &args;
8021
8022         __netdev_printk(level, dev, &vaf);
8023
8024         va_end(args);
8025 }
8026 EXPORT_SYMBOL(netdev_printk);
8027
8028 #define define_netdev_printk_level(func, level)                 \
8029 void func(const struct net_device *dev, const char *fmt, ...)   \
8030 {                                                               \
8031         struct va_format vaf;                                   \
8032         va_list args;                                           \
8033                                                                 \
8034         va_start(args, fmt);                                    \
8035                                                                 \
8036         vaf.fmt = fmt;                                          \
8037         vaf.va = &args;                                         \
8038                                                                 \
8039         __netdev_printk(level, dev, &vaf);                      \
8040                                                                 \
8041         va_end(args);                                           \
8042 }                                                               \
8043 EXPORT_SYMBOL(func);
8044
8045 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
8046 define_netdev_printk_level(netdev_alert, KERN_ALERT);
8047 define_netdev_printk_level(netdev_crit, KERN_CRIT);
8048 define_netdev_printk_level(netdev_err, KERN_ERR);
8049 define_netdev_printk_level(netdev_warn, KERN_WARNING);
8050 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
8051 define_netdev_printk_level(netdev_info, KERN_INFO);
8052
8053 static void __net_exit netdev_exit(struct net *net)
8054 {
8055         kfree(net->dev_name_head);
8056         kfree(net->dev_index_head);
8057 }
8058
8059 static struct pernet_operations __net_initdata netdev_net_ops = {
8060         .init = netdev_init,
8061         .exit = netdev_exit,
8062 };
8063
8064 static void __net_exit default_device_exit(struct net *net)
8065 {
8066         struct net_device *dev, *aux;
8067         /*
8068          * Push all migratable network devices back to the
8069          * initial network namespace
8070          */
8071         rtnl_lock();
8072         for_each_netdev_safe(net, dev, aux) {
8073                 int err;
8074                 char fb_name[IFNAMSIZ];
8075
8076                 /* Ignore unmoveable devices (i.e. loopback) */
8077                 if (dev->features & NETIF_F_NETNS_LOCAL)
8078                         continue;
8079
8080                 /* Leave virtual devices for the generic cleanup */
8081                 if (dev->rtnl_link_ops)
8082                         continue;
8083
8084                 /* Push remaining network devices to init_net */
8085                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
8086                 err = dev_change_net_namespace(dev, &init_net, fb_name);
8087                 if (err) {
8088                         pr_emerg("%s: failed to move %s to init_net: %d\n",
8089                                  __func__, dev->name, err);
8090                         BUG();
8091                 }
8092         }
8093         rtnl_unlock();
8094 }
8095
8096 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
8097 {
8098         /* Return with the rtnl_lock held when there are no network
8099          * devices unregistering in any network namespace in net_list.
8100          */
8101         struct net *net;
8102         bool unregistering;
8103         DEFINE_WAIT_FUNC(wait, woken_wake_function);
8104
8105         add_wait_queue(&netdev_unregistering_wq, &wait);
8106         for (;;) {
8107                 unregistering = false;
8108                 rtnl_lock();
8109                 list_for_each_entry(net, net_list, exit_list) {
8110                         if (net->dev_unreg_count > 0) {
8111                                 unregistering = true;
8112                                 break;
8113                         }
8114                 }
8115                 if (!unregistering)
8116                         break;
8117                 __rtnl_unlock();
8118
8119                 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
8120         }
8121         remove_wait_queue(&netdev_unregistering_wq, &wait);
8122 }
8123
8124 static void __net_exit default_device_exit_batch(struct list_head *net_list)
8125 {
8126         /* At exit all network devices most be removed from a network
8127          * namespace.  Do this in the reverse order of registration.
8128          * Do this across as many network namespaces as possible to
8129          * improve batching efficiency.
8130          */
8131         struct net_device *dev;
8132         struct net *net;
8133         LIST_HEAD(dev_kill_list);
8134
8135         /* To prevent network device cleanup code from dereferencing
8136          * loopback devices or network devices that have been freed
8137          * wait here for all pending unregistrations to complete,
8138          * before unregistring the loopback device and allowing the
8139          * network namespace be freed.
8140          *
8141          * The netdev todo list containing all network devices
8142          * unregistrations that happen in default_device_exit_batch
8143          * will run in the rtnl_unlock() at the end of
8144          * default_device_exit_batch.
8145          */
8146         rtnl_lock_unregistering(net_list);
8147         list_for_each_entry(net, net_list, exit_list) {
8148                 for_each_netdev_reverse(net, dev) {
8149                         if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
8150                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
8151                         else
8152                                 unregister_netdevice_queue(dev, &dev_kill_list);
8153                 }
8154         }
8155         unregister_netdevice_many(&dev_kill_list);
8156         rtnl_unlock();
8157 }
8158
8159 static struct pernet_operations __net_initdata default_device_ops = {
8160         .exit = default_device_exit,
8161         .exit_batch = default_device_exit_batch,
8162 };
8163
8164 /*
8165  *      Initialize the DEV module. At boot time this walks the device list and
8166  *      unhooks any devices that fail to initialise (normally hardware not
8167  *      present) and leaves us with a valid list of present and active devices.
8168  *
8169  */
8170
8171 /*
8172  *       This is called single threaded during boot, so no need
8173  *       to take the rtnl semaphore.
8174  */
8175 static int __init net_dev_init(void)
8176 {
8177         int i, rc = -ENOMEM;
8178
8179         BUG_ON(!dev_boot_phase);
8180
8181         if (dev_proc_init())
8182                 goto out;
8183
8184         if (netdev_kobject_init())
8185                 goto out;
8186
8187         INIT_LIST_HEAD(&ptype_all);
8188         for (i = 0; i < PTYPE_HASH_SIZE; i++)
8189                 INIT_LIST_HEAD(&ptype_base[i]);
8190
8191         INIT_LIST_HEAD(&offload_base);
8192
8193         if (register_pernet_subsys(&netdev_net_ops))
8194                 goto out;
8195
8196         /*
8197          *      Initialise the packet receive queues.
8198          */
8199
8200         for_each_possible_cpu(i) {
8201                 struct work_struct *flush = per_cpu_ptr(&flush_works, i);
8202                 struct softnet_data *sd = &per_cpu(softnet_data, i);
8203
8204                 INIT_WORK(flush, flush_backlog);
8205
8206                 skb_queue_head_init(&sd->input_pkt_queue);
8207                 skb_queue_head_init(&sd->process_queue);
8208                 INIT_LIST_HEAD(&sd->poll_list);
8209                 sd->output_queue_tailp = &sd->output_queue;
8210 #ifdef CONFIG_RPS
8211                 sd->csd.func = rps_trigger_softirq;
8212                 sd->csd.info = sd;
8213                 sd->cpu = i;
8214 #endif
8215
8216                 sd->backlog.poll = process_backlog;
8217                 sd->backlog.weight = weight_p;
8218         }
8219
8220         dev_boot_phase = 0;
8221
8222         /* The loopback device is special if any other network devices
8223          * is present in a network namespace the loopback device must
8224          * be present. Since we now dynamically allocate and free the
8225          * loopback device ensure this invariant is maintained by
8226          * keeping the loopback device as the first device on the
8227          * list of network devices.  Ensuring the loopback devices
8228          * is the first device that appears and the last network device
8229          * that disappears.
8230          */
8231         if (register_pernet_device(&loopback_net_ops))
8232                 goto out;
8233
8234         if (register_pernet_device(&default_device_ops))
8235                 goto out;
8236
8237         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
8238         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
8239
8240         hotcpu_notifier(dev_cpu_callback, 0);
8241         dst_subsys_init();
8242         rc = 0;
8243 out:
8244         return rc;
8245 }
8246
8247 subsys_initcall(net_dev_init);