net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <asm/system.h>
  77 #include <linux/bitops.h>
  78 #include <linux/capability.h>
  79 #include <linux/cpu.h>
  80 #include <linux/types.h>
  81 #include <linux/kernel.h>
  82 #include <linux/sched.h>
  83 #include <linux/mutex.h>
  84 #include <linux/string.h>
  85 #include <linux/mm.h>
  86 #include <linux/socket.h>
  87 #include <linux/sockios.h>
  88 #include <linux/errno.h>
  89 #include <linux/interrupt.h>
  90 #include <linux/if_ether.h>
  91 #include <linux/netdevice.h>
  92 #include <linux/etherdevice.h>
  93 #include <linux/ethtool.h>
  94 #include <linux/notifier.h>
  95 #include <linux/skbuff.h>
  96 #include <net/net_namespace.h>
  97 #include <net/sock.h>
  98 #include <linux/rtnetlink.h>
  99 #include <linux/proc_fs.h>
 100 #include <linux/seq_file.h>
 101 #include <linux/stat.h>
 102 #include <linux/if_bridge.h>
 103 #include <linux/if_macvlan.h>
 104 #include <net/dst.h>
 105 #include <net/pkt_sched.h>
 106 #include <net/checksum.h>
 107 #include <linux/highmem.h>
 108 #include <linux/init.h>
 109 #include <linux/kmod.h>
 110 #include <linux/module.h>
 111 #include <linux/netpoll.h>
 112 #include <linux/rcupdate.h>
 113 #include <linux/delay.h>
 114 #include <net/wext.h>
 115 #include <net/iw_handler.h>
 116 #include <asm/current.h>
 117 #include <linux/audit.h>
 118 #include <linux/dmaengine.h>
 119 #include <linux/err.h>
 120 #include <linux/ctype.h>
 121 #include <linux/if_arp.h>
 122 #include <linux/if_vlan.h>
 123 #include <linux/ip.h>
 124 #include <net/ip.h>
 125 #include <linux/ipv6.h>
 126 #include <linux/in.h>
 127 #include <linux/jhash.h>
 128 #include <linux/random.h>
 129 #include <trace/events/napi.h>
 130
 131 #include "net-sysfs.h"
 132
 133 /* Instead of increasing this, you should create a hash table. */
 134 #define MAX_GRO_SKBS 8
 135
 136 /* This should be increased if a protocol with a bigger head is added. */
 137 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 138
 139 /*
 140  *      The list of packet types we will receive (as opposed to discard)
 141  *      and the routines to invoke.
 142  *
 143  *      Why 16. Because with 16 the only overlap we get on a hash of the
 144  *      low nibble of the protocol value is RARP/SNAP/X.25.
 145  *
 146  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 147  *             sure which should go first, but I bet it won't make much
 148  *             difference if we are running VLANs.  The good news is that
 149  *             this protocol won't be in the list unless compiled in, so
 150  *             the average user (w/out VLANs) will not be adversely affected.
 151  *             --BLG
 152  *
 153  *              0800    IP
 154  *              8100    802.1Q VLAN
 155  *              0001    802.3
 156  *              0002    AX.25
 157  *              0004    802.2
 158  *              8035    RARP
 159  *              0005    SNAP
 160  *              0805    X.25
 161  *              0806    ARP
 162  *              8137    IPX
 163  *              0009    Localtalk
 164  *              86DD    IPv6
 165  */
 166
 167 #define PTYPE_HASH_SIZE (16)
 168 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
 169
 170 static DEFINE_SPINLOCK(ptype_lock);
 171 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 172 static struct list_head ptype_all __read_mostly;        /* Taps */
 173
 174 /*
 175  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 176  * semaphore.
 177  *
 178  * Pure readers hold dev_base_lock for reading.
 179  *
 180  * Writers must hold the rtnl semaphore while they loop through the
 181  * dev_base_head list, and hold dev_base_lock for writing when they do the
 182  * actual updates.  This allows pure readers to access the list even
 183  * while a writer is preparing to update it.
 184  *
 185  * To put it another way, dev_base_lock is held for writing only to
 186  * protect against pure readers; the rtnl semaphore provides the
 187  * protection against other writers.
 188  *
 189  * See, for example usages, register_netdevice() and
 190  * unregister_netdevice(), which must be called with the rtnl
 191  * semaphore held.
 192  */
 193 DEFINE_RWLOCK(dev_base_lock);
 194 EXPORT_SYMBOL(dev_base_lock);
 195
 196 #define NETDEV_HASHBITS 8
 197 #define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)
 198
 199 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 200 {
 201         unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 202         return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)];
 203 }
 204
 205 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 206 {
 207         return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
 208 }
 209
 210 /* Device list insertion */
 211 static int list_netdevice(struct net_device *dev)
 212 {
 213         struct net *net = dev_net(dev);
 214
 215         ASSERT_RTNL();
 216
 217         write_lock_bh(&dev_base_lock);
 218         list_add_tail(&dev->dev_list, &net->dev_base_head);
 219         hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
 220         hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex));
 221         write_unlock_bh(&dev_base_lock);
 222         return 0;
 223 }
 224
 225 /* Device list removal */
 226 static void unlist_netdevice(struct net_device *dev)
 227 {
 228         ASSERT_RTNL();
 229
 230         /* Unlink dev from the device chain */
 231         write_lock_bh(&dev_base_lock);
 232         list_del(&dev->dev_list);
 233         hlist_del(&dev->name_hlist);
 234         hlist_del(&dev->index_hlist);
 235         write_unlock_bh(&dev_base_lock);
 236 }
 237
 238 /*
 239  *      Our notifier list
 240  */
 241
 242 static RAW_NOTIFIER_HEAD(netdev_chain);
 243
 244 /*
 245  *      Device drivers call our routines to queue packets here. We empty the
 246  *      queue in the local softnet handler.
 247  */
 248
 249 DEFINE_PER_CPU(struct softnet_data, softnet_data);
 250 EXPORT_PER_CPU_SYMBOL(softnet_data);
 251
 252 #ifdef CONFIG_LOCKDEP
 253 /*
 254  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 255  * according to dev->type
 256  */
 257 static const unsigned short netdev_lock_type[] =
 258         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 259          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 260          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 261          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 262          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 263          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 264          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 265          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 266          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 267          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 268          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 269          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 270          ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
 271          ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
 272          ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
 273          ARPHRD_VOID, ARPHRD_NONE};
 274
 275 static const char *const netdev_lock_name[] =
 276         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 277          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 278          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 279          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 280          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 281          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 282          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 283          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 284          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 285          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 286          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 287          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 288          "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
 289          "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
 290          "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
 291          "_xmit_VOID", "_xmit_NONE"};
 292
 293 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 294 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 295
 296 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 297 {
 298         int i;
 299
 300         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 301                 if (netdev_lock_type[i] == dev_type)
 302                         return i;
 303         /* the last key is used by default */
 304         return ARRAY_SIZE(netdev_lock_type) - 1;
 305 }
 306
 307 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 308                                                  unsigned short dev_type)
 309 {
 310         int i;
 311
 312         i = netdev_lock_pos(dev_type);
 313         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 314                                    netdev_lock_name[i]);
 315 }
 316
 317 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 318 {
 319         int i;
 320
 321         i = netdev_lock_pos(dev->type);
 322         lockdep_set_class_and_name(&dev->addr_list_lock,
 323                                    &netdev_addr_lock_key[i],
 324                                    netdev_lock_name[i]);
 325 }
 326 #else
 327 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 328                                                  unsigned short dev_type)
 329 {
 330 }
 331 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 332 {
 333 }
 334 #endif
 335
 336 /*******************************************************************************
 337
 338                 Protocol management and registration routines
 339
 340 *******************************************************************************/
 341
 342 /*
 343  *      Add a protocol ID to the list. Now that the input handler is
 344  *      smarter we can dispense with all the messy stuff that used to be
 345  *      here.
 346  *
 347  *      BEWARE!!! Protocol handlers, mangling input packets,
 348  *      MUST BE last in hash buckets and checking protocol handlers
 349  *      MUST start from promiscuous ptype_all chain in net_bh.
 350  *      It is true now, do not change it.
 351  *      Explanation follows: if protocol handler, mangling packet, will
 352  *      be the first on list, it is not able to sense, that packet
 353  *      is cloned and should be copied-on-write, so that it will
 354  *      change it and subsequent readers will get broken packet.
 355  *                                                      --ANK (980803)
 356  */
 357
 358 /**
 359  *      dev_add_pack - add packet handler
 360  *      @pt: packet type declaration
 361  *
 362  *      Add a protocol handler to the networking stack. The passed &packet_type
 363  *      is linked into kernel lists and may not be freed until it has been
 364  *      removed from the kernel lists.
 365  *
 366  *      This call does not sleep therefore it can not
 367  *      guarantee all CPU's that are in middle of receiving packets
 368  *      will see the new packet type (until the next received packet).
 369  */
 370
 371 void dev_add_pack(struct packet_type *pt)
 372 {
 373         int hash;
 374
 375         spin_lock_bh(&ptype_lock);
 376         if (pt->type == htons(ETH_P_ALL))
 377                 list_add_rcu(&pt->list, &ptype_all);
 378         else {
 379                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
 380                 list_add_rcu(&pt->list, &ptype_base[hash]);
 381         }
 382         spin_unlock_bh(&ptype_lock);
 383 }
 384 EXPORT_SYMBOL(dev_add_pack);
 385
 386 /**
 387  *      __dev_remove_pack        - remove packet handler
 388  *      @pt: packet type declaration
 389  *
 390  *      Remove a protocol handler that was previously added to the kernel
 391  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 392  *      from the kernel lists and can be freed or reused once this function
 393  *      returns.
 394  *
 395  *      The packet type might still be in use by receivers
 396  *      and must not be freed until after all the CPU's have gone
 397  *      through a quiescent state.
 398  */
 399 void __dev_remove_pack(struct packet_type *pt)
 400 {
 401         struct list_head *head;
 402         struct packet_type *pt1;
 403
 404         spin_lock_bh(&ptype_lock);
 405
 406         if (pt->type == htons(ETH_P_ALL))
 407                 head = &ptype_all;
 408         else
 409                 head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 410
 411         list_for_each_entry(pt1, head, list) {
 412                 if (pt == pt1) {
 413                         list_del_rcu(&pt->list);
 414                         goto out;
 415                 }
 416         }
 417
 418         printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 419 out:
 420         spin_unlock_bh(&ptype_lock);
 421 }
 422 EXPORT_SYMBOL(__dev_remove_pack);
 423
 424 /**
 425  *      dev_remove_pack  - remove packet handler
 426  *      @pt: packet type declaration
 427  *
 428  *      Remove a protocol handler that was previously added to the kernel
 429  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 430  *      from the kernel lists and can be freed or reused once this function
 431  *      returns.
 432  *
 433  *      This call sleeps to guarantee that no CPU is looking at the packet
 434  *      type after return.
 435  */
 436 void dev_remove_pack(struct packet_type *pt)
 437 {
 438         __dev_remove_pack(pt);
 439
 440         synchronize_net();
 441 }
 442 EXPORT_SYMBOL(dev_remove_pack);
 443
 444 /******************************************************************************
 445
 446                       Device Boot-time Settings Routines
 447
 448 *******************************************************************************/
 449
 450 /* Boot time configuration table */
 451 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 452
 453 /**
 454  *      netdev_boot_setup_add   - add new setup entry
 455  *      @name: name of the device
 456  *      @map: configured settings for the device
 457  *
 458  *      Adds new setup entry to the dev_boot_setup list.  The function
 459  *      returns 0 on error and 1 on success.  This is a generic routine to
 460  *      all netdevices.
 461  */
 462 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 463 {
 464         struct netdev_boot_setup *s;
 465         int i;
 466
 467         s = dev_boot_setup;
 468         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 469                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 470                         memset(s[i].name, 0, sizeof(s[i].name));
 471                         strlcpy(s[i].name, name, IFNAMSIZ);
 472                         memcpy(&s[i].map, map, sizeof(s[i].map));
 473                         break;
 474                 }
 475         }
 476
 477         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 478 }
 479
 480 /**
 481  *      netdev_boot_setup_check - check boot time settings
 482  *      @dev: the netdevice
 483  *
 484  *      Check boot time settings for the device.
 485  *      The found settings are set for the device to be used
 486  *      later in the device probing.
 487  *      Returns 0 if no settings found, 1 if they are.
 488  */
 489 int netdev_boot_setup_check(struct net_device *dev)
 490 {
 491         struct netdev_boot_setup *s = dev_boot_setup;
 492         int i;
 493
 494         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 495                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 496                     !strcmp(dev->name, s[i].name)) {
 497                         dev->irq        = s[i].map.irq;
 498                         dev->base_addr  = s[i].map.base_addr;
 499                         dev->mem_start  = s[i].map.mem_start;
 500                         dev->mem_end    = s[i].map.mem_end;
 501                         return 1;
 502                 }
 503         }
 504         return 0;
 505 }
 506 EXPORT_SYMBOL(netdev_boot_setup_check);
 507
 508
 509 /**
 510  *      netdev_boot_base        - get address from boot time settings
 511  *      @prefix: prefix for network device
 512  *      @unit: id for network device
 513  *
 514  *      Check boot time settings for the base address of device.
 515  *      The found settings are set for the device to be used
 516  *      later in the device probing.
 517  *      Returns 0 if no settings found.
 518  */
 519 unsigned long netdev_boot_base(const char *prefix, int unit)
 520 {
 521         const struct netdev_boot_setup *s = dev_boot_setup;
 522         char name[IFNAMSIZ];
 523         int i;
 524
 525         sprintf(name, "%s%d", prefix, unit);
 526
 527         /*
 528          * If device already registered then return base of 1
 529          * to indicate not to probe for this interface
 530          */
 531         if (__dev_get_by_name(&init_net, name))
 532                 return 1;
 533
 534         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 535                 if (!strcmp(name, s[i].name))
 536                         return s[i].map.base_addr;
 537         return 0;
 538 }
 539
 540 /*
 541  * Saves at boot time configured settings for any netdevice.
 542  */
 543 int __init netdev_boot_setup(char *str)
 544 {
 545         int ints[5];
 546         struct ifmap map;
 547
 548         str = get_options(str, ARRAY_SIZE(ints), ints);
 549         if (!str || !*str)
 550                 return 0;
 551
 552         /* Save settings */
 553         memset(&map, 0, sizeof(map));
 554         if (ints[0] > 0)
 555                 map.irq = ints[1];
 556         if (ints[0] > 1)
 557                 map.base_addr = ints[2];
 558         if (ints[0] > 2)
 559                 map.mem_start = ints[3];
 560         if (ints[0] > 3)
 561                 map.mem_end = ints[4];
 562
 563         /* Add new entry to the list */
 564         return netdev_boot_setup_add(str, &map);
 565 }
 566
 567 __setup("netdev=", netdev_boot_setup);
 568
 569 /*******************************************************************************
 570
 571                             Device Interface Subroutines
 572
 573 *******************************************************************************/
 574
 575 /**
 576  *      __dev_get_by_name       - find a device by its name
 577  *      @net: the applicable net namespace
 578  *      @name: name to find
 579  *
 580  *      Find an interface by name. Must be called under RTNL semaphore
 581  *      or @dev_base_lock. If the name is found a pointer to the device
 582  *      is returned. If the name is not found then %NULL is returned. The
 583  *      reference counters are not incremented so the caller must be
 584  *      careful with locks.
 585  */
 586
 587 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 588 {
 589         struct hlist_node *p;
 590
 591         hlist_for_each(p, dev_name_hash(net, name)) {
 592                 struct net_device *dev
 593                         = hlist_entry(p, struct net_device, name_hlist);
 594                 if (!strncmp(dev->name, name, IFNAMSIZ))
 595                         return dev;
 596         }
 597         return NULL;
 598 }
 599 EXPORT_SYMBOL(__dev_get_by_name);
 600
 601 /**
 602  *      dev_get_by_name         - find a device by its name
 603  *      @net: the applicable net namespace
 604  *      @name: name to find
 605  *
 606  *      Find an interface by name. This can be called from any
 607  *      context and does its own locking. The returned handle has
 608  *      the usage count incremented and the caller must use dev_put() to
 609  *      release it when it is no longer needed. %NULL is returned if no
 610  *      matching device is found.
 611  */
 612
 613 struct net_device *dev_get_by_name(struct net *net, const char *name)
 614 {
 615         struct net_device *dev;
 616
 617         read_lock(&dev_base_lock);
 618         dev = __dev_get_by_name(net, name);
 619         if (dev)
 620                 dev_hold(dev);
 621         read_unlock(&dev_base_lock);
 622         return dev;
 623 }
 624 EXPORT_SYMBOL(dev_get_by_name);
 625
 626 /**
 627  *      __dev_get_by_index - find a device by its ifindex
 628  *      @net: the applicable net namespace
 629  *      @ifindex: index of device
 630  *
 631  *      Search for an interface by index. Returns %NULL if the device
 632  *      is not found or a pointer to the device. The device has not
 633  *      had its reference counter increased so the caller must be careful
 634  *      about locking. The caller must hold either the RTNL semaphore
 635  *      or @dev_base_lock.
 636  */
 637
 638 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 639 {
 640         struct hlist_node *p;
 641
 642         hlist_for_each(p, dev_index_hash(net, ifindex)) {
 643                 struct net_device *dev
 644                         = hlist_entry(p, struct net_device, index_hlist);
 645                 if (dev->ifindex == ifindex)
 646                         return dev;
 647         }
 648         return NULL;
 649 }
 650 EXPORT_SYMBOL(__dev_get_by_index);
 651
 652
 653 /**
 654  *      dev_get_by_index - find a device by its ifindex
 655  *      @net: the applicable net namespace
 656  *      @ifindex: index of device
 657  *
 658  *      Search for an interface by index. Returns NULL if the device
 659  *      is not found or a pointer to the device. The device returned has
 660  *      had a reference added and the pointer is safe until the user calls
 661  *      dev_put to indicate they have finished with it.
 662  */
 663
 664 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 665 {
 666         struct net_device *dev;
 667
 668         read_lock(&dev_base_lock);
 669         dev = __dev_get_by_index(net, ifindex);
 670         if (dev)
 671                 dev_hold(dev);
 672         read_unlock(&dev_base_lock);
 673         return dev;
 674 }
 675 EXPORT_SYMBOL(dev_get_by_index);
 676
 677 /**
 678  *      dev_getbyhwaddr - find a device by its hardware address
 679  *      @net: the applicable net namespace
 680  *      @type: media type of device
 681  *      @ha: hardware address
 682  *
 683  *      Search for an interface by MAC address. Returns NULL if the device
 684  *      is not found or a pointer to the device. The caller must hold the
 685  *      rtnl semaphore. The returned device has not had its ref count increased
 686  *      and the caller must therefore be careful about locking
 687  *
 688  *      BUGS:
 689  *      If the API was consistent this would be __dev_get_by_hwaddr
 690  */
 691
 692 struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
 693 {
 694         struct net_device *dev;
 695
 696         ASSERT_RTNL();
 697
 698         for_each_netdev(net, dev)
 699                 if (dev->type == type &&
 700                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 701                         return dev;
 702
 703         return NULL;
 704 }
 705 EXPORT_SYMBOL(dev_getbyhwaddr);
 706
 707 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 708 {
 709         struct net_device *dev;
 710
 711         ASSERT_RTNL();
 712         for_each_netdev(net, dev)
 713                 if (dev->type == type)
 714                         return dev;
 715
 716         return NULL;
 717 }
 718 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 719
 720 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 721 {
 722         struct net_device *dev;
 723
 724         rtnl_lock();
 725         dev = __dev_getfirstbyhwtype(net, type);
 726         if (dev)
 727                 dev_hold(dev);
 728         rtnl_unlock();
 729         return dev;
 730 }
 731 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 732
 733 /**
 734  *      dev_get_by_flags - find any device with given flags
 735  *      @net: the applicable net namespace
 736  *      @if_flags: IFF_* values
 737  *      @mask: bitmask of bits in if_flags to check
 738  *
 739  *      Search for any interface with the given flags. Returns NULL if a device
 740  *      is not found or a pointer to the device. The device returned has
 741  *      had a reference added and the pointer is safe until the user calls
 742  *      dev_put to indicate they have finished with it.
 743  */
 744
 745 struct net_device *dev_get_by_flags(struct net *net, unsigned short if_flags,
 746                                     unsigned short mask)
 747 {
 748         struct net_device *dev, *ret;
 749
 750         ret = NULL;
 751         read_lock(&dev_base_lock);
 752         for_each_netdev(net, dev) {
 753                 if (((dev->flags ^ if_flags) & mask) == 0) {
 754                         dev_hold(dev);
 755                         ret = dev;
 756                         break;
 757                 }
 758         }
 759         read_unlock(&dev_base_lock);
 760         return ret;
 761 }
 762 EXPORT_SYMBOL(dev_get_by_flags);
 763
 764 /**
 765  *      dev_valid_name - check if name is okay for network device
 766  *      @name: name string
 767  *
 768  *      Network device names need to be valid file names to
 769  *      to allow sysfs to work.  We also disallow any kind of
 770  *      whitespace.
 771  */
 772 int dev_valid_name(const char *name)
 773 {
 774         if (*name == '\0')
 775                 return 0;
 776         if (strlen(name) >= IFNAMSIZ)
 777                 return 0;
 778         if (!strcmp(name, ".") || !strcmp(name, ".."))
 779                 return 0;
 780
 781         while (*name) {
 782                 if (*name == '/' || isspace(*name))
 783                         return 0;
 784                 name++;
 785         }
 786         return 1;
 787 }
 788 EXPORT_SYMBOL(dev_valid_name);
 789
 790 /**
 791  *      __dev_alloc_name - allocate a name for a device
 792  *      @net: network namespace to allocate the device name in
 793  *      @name: name format string
 794  *      @buf:  scratch buffer and result name string
 795  *
 796  *      Passed a format string - eg "lt%d" it will try and find a suitable
 797  *      id. It scans list of devices to build up a free map, then chooses
 798  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 799  *      while allocating the name and adding the device in order to avoid
 800  *      duplicates.
 801  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 802  *      Returns the number of the unit assigned or a negative errno code.
 803  */
 804
 805 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 806 {
 807         int i = 0;
 808         const char *p;
 809         const int max_netdevices = 8*PAGE_SIZE;
 810         unsigned long *inuse;
 811         struct net_device *d;
 812
 813         p = strnchr(name, IFNAMSIZ-1, '%');
 814         if (p) {
 815                 /*
 816                  * Verify the string as this thing may have come from
 817                  * the user.  There must be either one "%d" and no other "%"
 818                  * characters.
 819                  */
 820                 if (p[1] != 'd' || strchr(p + 2, '%'))
 821                         return -EINVAL;
 822
 823                 /* Use one page as a bit array of possible slots */
 824                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 825                 if (!inuse)
 826                         return -ENOMEM;
 827
 828                 for_each_netdev(net, d) {
 829                         if (!sscanf(d->name, name, &i))
 830                                 continue;
 831                         if (i < 0 || i >= max_netdevices)
 832                                 continue;
 833
 834                         /*  avoid cases where sscanf is not exact inverse of printf */
 835                         snprintf(buf, IFNAMSIZ, name, i);
 836                         if (!strncmp(buf, d->name, IFNAMSIZ))
 837                                 set_bit(i, inuse);
 838                 }
 839
 840                 i = find_first_zero_bit(inuse, max_netdevices);
 841                 free_page((unsigned long) inuse);
 842         }
 843
 844         snprintf(buf, IFNAMSIZ, name, i);
 845         if (!__dev_get_by_name(net, buf))
 846                 return i;
 847
 848         /* It is possible to run out of possible slots
 849          * when the name is long and there isn't enough space left
 850          * for the digits, or if all bits are used.
 851          */
 852         return -ENFILE;
 853 }
 854
 855 /**
 856  *      dev_alloc_name - allocate a name for a device
 857  *      @dev: device
 858  *      @name: name format string
 859  *
 860  *      Passed a format string - eg "lt%d" it will try and find a suitable
 861  *      id. It scans list of devices to build up a free map, then chooses
 862  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 863  *      while allocating the name and adding the device in order to avoid
 864  *      duplicates.
 865  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 866  *      Returns the number of the unit assigned or a negative errno code.
 867  */
 868
 869 int dev_alloc_name(struct net_device *dev, const char *name)
 870 {
 871         char buf[IFNAMSIZ];
 872         struct net *net;
 873         int ret;
 874
 875         BUG_ON(!dev_net(dev));
 876         net = dev_net(dev);
 877         ret = __dev_alloc_name(net, name, buf);
 878         if (ret >= 0)
 879                 strlcpy(dev->name, buf, IFNAMSIZ);
 880         return ret;
 881 }
 882 EXPORT_SYMBOL(dev_alloc_name);
 883
 884
 885 /**
 886  *      dev_change_name - change name of a device
 887  *      @dev: device
 888  *      @newname: name (or format string) must be at least IFNAMSIZ
 889  *
 890  *      Change name of a device, can pass format strings "eth%d".
 891  *      for wildcarding.
 892  */
 893 int dev_change_name(struct net_device *dev, const char *newname)
 894 {
 895         char oldname[IFNAMSIZ];
 896         int err = 0;
 897         int ret;
 898         struct net *net;
 899
 900         ASSERT_RTNL();
 901         BUG_ON(!dev_net(dev));
 902
 903         net = dev_net(dev);
 904         if (dev->flags & IFF_UP)
 905                 return -EBUSY;
 906
 907         if (!dev_valid_name(newname))
 908                 return -EINVAL;
 909
 910         if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
 911                 return 0;
 912
 913         memcpy(oldname, dev->name, IFNAMSIZ);
 914
 915         if (strchr(newname, '%')) {
 916                 err = dev_alloc_name(dev, newname);
 917                 if (err < 0)
 918                         return err;
 919         } else if (__dev_get_by_name(net, newname))
 920                 return -EEXIST;
 921         else
 922                 strlcpy(dev->name, newname, IFNAMSIZ);
 923
 924 rollback:
 925         /* For now only devices in the initial network namespace
 926          * are in sysfs.
 927          */
 928         if (net == &init_net) {
 929                 ret = device_rename(&dev->dev, dev->name);
 930                 if (ret) {
 931                         memcpy(dev->name, oldname, IFNAMSIZ);
 932                         return ret;
 933                 }
 934         }
 935
 936         write_lock_bh(&dev_base_lock);
 937         hlist_del(&dev->name_hlist);
 938         hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
 939         write_unlock_bh(&dev_base_lock);
 940
 941         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
 942         ret = notifier_to_errno(ret);
 943
 944         if (ret) {
 945                 /* err >= 0 after dev_alloc_name() or stores the first errno */
 946                 if (err >= 0) {
 947                         err = ret;
 948                         memcpy(dev->name, oldname, IFNAMSIZ);
 949                         goto rollback;
 950                 } else {
 951                         printk(KERN_ERR
 952                                "%s: name change rollback failed: %d.\n",
 953                                dev->name, ret);
 954                 }
 955         }
 956
 957         return err;
 958 }
 959
 960 /**
 961  *      dev_set_alias - change ifalias of a device
 962  *      @dev: device
 963  *      @alias: name up to IFALIASZ
 964  *      @len: limit of bytes to copy from info
 965  *
 966  *      Set ifalias for a device,
 967  */
 968 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
 969 {
 970         ASSERT_RTNL();
 971
 972         if (len >= IFALIASZ)
 973                 return -EINVAL;
 974
 975         if (!len) {
 976                 if (dev->ifalias) {
 977                         kfree(dev->ifalias);
 978                         dev->ifalias = NULL;
 979                 }
 980                 return 0;
 981         }
 982
 983         dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
 984         if (!dev->ifalias)
 985                 return -ENOMEM;
 986
 987         strlcpy(dev->ifalias, alias, len+1);
 988         return len;
 989 }
 990
 991
 992 /**
 993  *      netdev_features_change - device changes features
 994  *      @dev: device to cause notification
 995  *
 996  *      Called to indicate a device has changed features.
 997  */
 998 void netdev_features_change(struct net_device *dev)
 999 {
1000         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1001 }
1002 EXPORT_SYMBOL(netdev_features_change);
1003
1004 /**
1005  *      netdev_state_change - device changes state
1006  *      @dev: device to cause notification
1007  *
1008  *      Called to indicate a device has changed state. This function calls
1009  *      the notifier chains for netdev_chain and sends a NEWLINK message
1010  *      to the routing socket.
1011  */
1012 void netdev_state_change(struct net_device *dev)
1013 {
1014         if (dev->flags & IFF_UP) {
1015                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1016                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1017         }
1018 }
1019 EXPORT_SYMBOL(netdev_state_change);
1020
1021 void netdev_bonding_change(struct net_device *dev, unsigned long event)
1022 {
1023         call_netdevice_notifiers(event, dev);
1024 }
1025 EXPORT_SYMBOL(netdev_bonding_change);
1026
1027 /**
1028  *      dev_load        - load a network module
1029  *      @net: the applicable net namespace
1030  *      @name: name of interface
1031  *
1032  *      If a network interface is not present and the process has suitable
1033  *      privileges this function loads the module. If module loading is not
1034  *      available in this kernel then it becomes a nop.
1035  */
1036
1037 void dev_load(struct net *net, const char *name)
1038 {
1039         struct net_device *dev;
1040         int no_module;
1041
1042         read_lock(&dev_base_lock);
1043         dev = __dev_get_by_name(net, name);
1044         read_unlock(&dev_base_lock);
1045
1046         no_module = !dev;
1047         if (no_module && capable(CAP_NET_ADMIN))
1048                 no_module = request_module("netdev-%s", name);
1049         if (no_module && capable(CAP_SYS_MODULE)) {
1050                 if (!request_module("%s", name))
1051                         pr_err("Loading kernel module for a network device "
1052 "with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s "
1053 "instead\n", name);
1054         }
1055 }
1056 EXPORT_SYMBOL(dev_load);
1057
1058 /**
1059  *      dev_open        - prepare an interface for use.
1060  *      @dev:   device to open
1061  *
1062  *      Takes a device from down to up state. The device's private open
1063  *      function is invoked and then the multicast lists are loaded. Finally
1064  *      the device is moved into the up state and a %NETDEV_UP message is
1065  *      sent to the netdev notifier chain.
1066  *
1067  *      Calling this function on an active interface is a nop. On a failure
1068  *      a negative errno code is returned.
1069  */
1070 int dev_open(struct net_device *dev)
1071 {
1072         const struct net_device_ops *ops = dev->netdev_ops;
1073         int ret;
1074
1075         ASSERT_RTNL();
1076
1077         /*
1078          *      Is it already up?
1079          */
1080
1081         if (dev->flags & IFF_UP)
1082                 return 0;
1083
1084         /*
1085          *      Is it even present?
1086          */
1087         if (!netif_device_present(dev))
1088                 return -ENODEV;
1089
1090         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1091         ret = notifier_to_errno(ret);
1092         if (ret)
1093                 return ret;
1094
1095         /*
1096          *      Call device private open method
1097          */
1098         set_bit(__LINK_STATE_START, &dev->state);
1099
1100         if (ops->ndo_validate_addr)
1101                 ret = ops->ndo_validate_addr(dev);
1102
1103         if (!ret && ops->ndo_open)
1104                 ret = ops->ndo_open(dev);
1105
1106         /*
1107          *      If it went open OK then:
1108          */
1109
1110         if (ret)
1111                 clear_bit(__LINK_STATE_START, &dev->state);
1112         else {
1113                 /*
1114                  *      Set the flags.
1115                  */
1116                 dev->flags |= IFF_UP;
1117
1118                 /*
1119                  *      Enable NET_DMA
1120                  */
1121                 net_dmaengine_get();
1122
1123                 /*
1124                  *      Initialize multicasting status
1125                  */
1126                 dev_set_rx_mode(dev);
1127
1128                 /*
1129                  *      Wakeup transmit queue engine
1130                  */
1131                 dev_activate(dev);
1132
1133                 /*
1134                  *      ... and announce new interface.
1135                  */
1136                 call_netdevice_notifiers(NETDEV_UP, dev);
1137         }
1138
1139         return ret;
1140 }
1141 EXPORT_SYMBOL(dev_open);
1142
1143 /**
1144  *      dev_close - shutdown an interface.
1145  *      @dev: device to shutdown
1146  *
1147  *      This function moves an active device into down state. A
1148  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1149  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1150  *      chain.
1151  */
1152 int dev_close(struct net_device *dev)
1153 {
1154         const struct net_device_ops *ops = dev->netdev_ops;
1155         ASSERT_RTNL();
1156
1157         might_sleep();
1158
1159         if (!(dev->flags & IFF_UP))
1160                 return 0;
1161
1162         /*
1163          *      Tell people we are going down, so that they can
1164          *      prepare to death, when device is still operating.
1165          */
1166         call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1167
1168         clear_bit(__LINK_STATE_START, &dev->state);
1169
1170         /* Synchronize to scheduled poll. We cannot touch poll list,
1171          * it can be even on different cpu. So just clear netif_running().
1172          *
1173          * dev->stop() will invoke napi_disable() on all of it's
1174          * napi_struct instances on this device.
1175          */
1176         smp_mb__after_clear_bit(); /* Commit netif_running(). */
1177
1178         dev_deactivate(dev);
1179
1180         /*
1181          *      Call the device specific close. This cannot fail.
1182          *      Only if device is UP
1183          *
1184          *      We allow it to be called even after a DETACH hot-plug
1185          *      event.
1186          */
1187         if (ops->ndo_stop)
1188                 ops->ndo_stop(dev);
1189
1190         /*
1191          *      Device is now down.
1192          */
1193
1194         dev->flags &= ~IFF_UP;
1195
1196         /*
1197          * Tell people we are down
1198          */
1199         call_netdevice_notifiers(NETDEV_DOWN, dev);
1200
1201         /*
1202          *      Shutdown NET_DMA
1203          */
1204         net_dmaengine_put();
1205
1206         return 0;
1207 }
1208 EXPORT_SYMBOL(dev_close);
1209
1210
1211 /**
1212  *      dev_disable_lro - disable Large Receive Offload on a device
1213  *      @dev: device
1214  *
1215  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1216  *      called under RTNL.  This is needed if received packets may be
1217  *      forwarded to another interface.
1218  */
1219 void dev_disable_lro(struct net_device *dev)
1220 {
1221         if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1222             dev->ethtool_ops->set_flags) {
1223                 u32 flags = dev->ethtool_ops->get_flags(dev);
1224                 if (flags & ETH_FLAG_LRO) {
1225                         flags &= ~ETH_FLAG_LRO;
1226                         dev->ethtool_ops->set_flags(dev, flags);
1227                 }
1228         }
1229         WARN_ON(dev->features & NETIF_F_LRO);
1230 }
1231 EXPORT_SYMBOL(dev_disable_lro);
1232
1233
1234 static int dev_boot_phase = 1;
1235
1236 /*
1237  *      Device change register/unregister. These are not inline or static
1238  *      as we export them to the world.
1239  */
1240
1241 /**
1242  *      register_netdevice_notifier - register a network notifier block
1243  *      @nb: notifier
1244  *
1245  *      Register a notifier to be called when network device events occur.
1246  *      The notifier passed is linked into the kernel structures and must
1247  *      not be reused until it has been unregistered. A negative errno code
1248  *      is returned on a failure.
1249  *
1250  *      When registered all registration and up events are replayed
1251  *      to the new notifier to allow device to have a race free
1252  *      view of the network device list.
1253  */
1254
1255 int register_netdevice_notifier(struct notifier_block *nb)
1256 {
1257         struct net_device *dev;
1258         struct net_device *last;
1259         struct net *net;
1260         int err;
1261
1262         rtnl_lock();
1263         err = raw_notifier_chain_register(&netdev_chain, nb);
1264         if (err)
1265                 goto unlock;
1266         if (dev_boot_phase)
1267                 goto unlock;
1268         for_each_net(net) {
1269                 for_each_netdev(net, dev) {
1270                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1271                         err = notifier_to_errno(err);
1272                         if (err)
1273                                 goto rollback;
1274
1275                         if (!(dev->flags & IFF_UP))
1276                                 continue;
1277
1278                         nb->notifier_call(nb, NETDEV_UP, dev);
1279                 }
1280         }
1281
1282 unlock:
1283         rtnl_unlock();
1284         return err;
1285
1286 rollback:
1287         last = dev;
1288         for_each_net(net) {
1289                 for_each_netdev(net, dev) {
1290                         if (dev == last)
1291                                 break;
1292
1293                         if (dev->flags & IFF_UP) {
1294                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1295                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1296                         }
1297                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1298                 }
1299         }
1300
1301         raw_notifier_chain_unregister(&netdev_chain, nb);
1302         goto unlock;
1303 }
1304 EXPORT_SYMBOL(register_netdevice_notifier);
1305
1306 /**
1307  *      unregister_netdevice_notifier - unregister a network notifier block
1308  *      @nb: notifier
1309  *
1310  *      Unregister a notifier previously registered by
1311  *      register_netdevice_notifier(). The notifier is unlinked into the
1312  *      kernel structures and may then be reused. A negative errno code
1313  *      is returned on a failure.
1314  */
1315
1316 int unregister_netdevice_notifier(struct notifier_block *nb)
1317 {
1318         int err;
1319
1320         rtnl_lock();
1321         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1322         rtnl_unlock();
1323         return err;
1324 }
1325 EXPORT_SYMBOL(unregister_netdevice_notifier);
1326
1327 /**
1328  *      call_netdevice_notifiers - call all network notifier blocks
1329  *      @val: value passed unmodified to notifier function
1330  *      @dev: net_device pointer passed unmodified to notifier function
1331  *
1332  *      Call all network notifier blocks.  Parameters and return value
1333  *      are as for raw_notifier_call_chain().
1334  */
1335
1336 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1337 {
1338         return raw_notifier_call_chain(&netdev_chain, val, dev);
1339 }
1340
1341 /* When > 0 there are consumers of rx skb time stamps */
1342 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1343
1344 void net_enable_timestamp(void)
1345 {
1346         atomic_inc(&netstamp_needed);
1347 }
1348 EXPORT_SYMBOL(net_enable_timestamp);
1349
1350 void net_disable_timestamp(void)
1351 {
1352         atomic_dec(&netstamp_needed);
1353 }
1354 EXPORT_SYMBOL(net_disable_timestamp);
1355
1356 static inline void net_timestamp(struct sk_buff *skb)
1357 {
1358         if (atomic_read(&netstamp_needed))
1359                 __net_timestamp(skb);
1360         else
1361                 skb->tstamp.tv64 = 0;
1362 }
1363
1364 /*
1365  *      Support routine. Sends outgoing frames to any network
1366  *      taps currently in use.
1367  */
1368
1369 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1370 {
1371         struct packet_type *ptype;
1372
1373 #ifdef CONFIG_NET_CLS_ACT
1374         if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
1375                 net_timestamp(skb);
1376 #else
1377         net_timestamp(skb);
1378 #endif
1379
1380         rcu_read_lock();
1381         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1382                 /* Never send packets back to the socket
1383                  * they originated from - MvS (miquels@drinkel.ow.org)
1384                  */
1385                 if ((ptype->dev == dev || !ptype->dev) &&
1386                     (ptype->af_packet_priv == NULL ||
1387                      (struct sock *)ptype->af_packet_priv != skb->sk)) {
1388                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1389                         if (!skb2)
1390                                 break;
1391
1392                         /* skb->nh should be correctly
1393                            set by sender, so that the second statement is
1394                            just protection against buggy protocols.
1395                          */
1396                         skb_reset_mac_header(skb2);
1397
1398                         if (skb_network_header(skb2) < skb2->data ||
1399                             skb2->network_header > skb2->tail) {
1400                                 if (net_ratelimit())
1401                                         printk(KERN_CRIT "protocol %04x is "
1402                                                "buggy, dev %s\n",
1403                                                skb2->protocol, dev->name);
1404                                 skb_reset_network_header(skb2);
1405                         }
1406
1407                         skb2->transport_header = skb2->network_header;
1408                         skb2->pkt_type = PACKET_OUTGOING;
1409                         ptype->func(skb2, skb->dev, ptype, skb->dev);
1410                 }
1411         }
1412         rcu_read_unlock();
1413 }
1414
1415
1416 static inline void __netif_reschedule(struct Qdisc *q)
1417 {
1418         struct softnet_data *sd;
1419         unsigned long flags;
1420
1421         local_irq_save(flags);
1422         sd = &__get_cpu_var(softnet_data);
1423         q->next_sched = sd->output_queue;
1424         sd->output_queue = q;
1425         raise_softirq_irqoff(NET_TX_SOFTIRQ);
1426         local_irq_restore(flags);
1427 }
1428
1429 void __netif_schedule(struct Qdisc *q)
1430 {
1431         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1432                 __netif_reschedule(q);
1433 }
1434 EXPORT_SYMBOL(__netif_schedule);
1435
1436 void dev_kfree_skb_irq(struct sk_buff *skb)
1437 {
1438         if (atomic_dec_and_test(&skb->users)) {
1439                 struct softnet_data *sd;
1440                 unsigned long flags;
1441
1442                 local_irq_save(flags);
1443                 sd = &__get_cpu_var(softnet_data);
1444                 skb->next = sd->completion_queue;
1445                 sd->completion_queue = skb;
1446                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1447                 local_irq_restore(flags);
1448         }
1449 }
1450 EXPORT_SYMBOL(dev_kfree_skb_irq);
1451
1452 void dev_kfree_skb_any(struct sk_buff *skb)
1453 {
1454         if (in_irq() || irqs_disabled())
1455                 dev_kfree_skb_irq(skb);
1456         else
1457                 dev_kfree_skb(skb);
1458 }
1459 EXPORT_SYMBOL(dev_kfree_skb_any);
1460
1461
1462 /**
1463  * netif_device_detach - mark device as removed
1464  * @dev: network device
1465  *
1466  * Mark device as removed from system and therefore no longer available.
1467  */
1468 void netif_device_detach(struct net_device *dev)
1469 {
1470         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1471             netif_running(dev)) {
1472                 netif_tx_stop_all_queues(dev);
1473         }
1474 }
1475 EXPORT_SYMBOL(netif_device_detach);
1476
1477 /**
1478  * netif_device_attach - mark device as attached
1479  * @dev: network device
1480  *
1481  * Mark device as attached from system and restart if needed.
1482  */
1483 void netif_device_attach(struct net_device *dev)
1484 {
1485         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1486             netif_running(dev)) {
1487                 netif_tx_wake_all_queues(dev);
1488                 __netdev_watchdog_up(dev);
1489         }
1490 }
1491 EXPORT_SYMBOL(netif_device_attach);
1492
1493 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1494 {
1495         return ((features & NETIF_F_NO_CSUM) ||
1496                 ((features & NETIF_F_V4_CSUM) &&
1497                  protocol == htons(ETH_P_IP)) ||
1498                 ((features & NETIF_F_V6_CSUM) &&
1499                  protocol == htons(ETH_P_IPV6)) ||
1500                 ((features & NETIF_F_FCOE_CRC) &&
1501                  protocol == htons(ETH_P_FCOE)));
1502 }
1503
1504 static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1505 {
1506         if (can_checksum_protocol(dev->features, skb->protocol))
1507                 return true;
1508
1509         if (skb->protocol == htons(ETH_P_8021Q)) {
1510                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1511                 if (can_checksum_protocol(dev->features & dev->vlan_features,
1512                                           veh->h_vlan_encapsulated_proto))
1513                         return true;
1514         }
1515
1516         return false;
1517 }
1518
1519 /*
1520  * Invalidate hardware checksum when packet is to be mangled, and
1521  * complete checksum manually on outgoing path.
1522  */
1523 int skb_checksum_help(struct sk_buff *skb)
1524 {
1525         __wsum csum;
1526         int ret = 0, offset;
1527
1528         if (skb->ip_summed == CHECKSUM_COMPLETE)
1529                 goto out_set_summed;
1530
1531         if (unlikely(skb_shinfo(skb)->gso_size)) {
1532                 /* Let GSO fix up the checksum. */
1533                 goto out_set_summed;
1534         }
1535
1536         offset = skb->csum_start - skb_headroom(skb);
1537         BUG_ON(offset >= skb_headlen(skb));
1538         csum = skb_checksum(skb, offset, skb->len - offset, 0);
1539
1540         offset += skb->csum_offset;
1541         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1542
1543         if (skb_cloned(skb) &&
1544             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1545                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1546                 if (ret)
1547                         goto out;
1548         }
1549
1550         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1551 out_set_summed:
1552         skb->ip_summed = CHECKSUM_NONE;
1553 out:
1554         return ret;
1555 }
1556 EXPORT_SYMBOL(skb_checksum_help);
1557
1558 /**
1559  *      skb_gso_segment - Perform segmentation on skb.
1560  *      @skb: buffer to segment
1561  *      @features: features for the output path (see dev->features)
1562  *
1563  *      This function segments the given skb and returns a list of segments.
1564  *
1565  *      It may return NULL if the skb requires no segmentation.  This is
1566  *      only possible when GSO is used for verifying header integrity.
1567  */
1568 struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1569 {
1570         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1571         struct packet_type *ptype;
1572         __be16 type = skb->protocol;
1573         int err;
1574
1575         skb_reset_mac_header(skb);
1576         skb->mac_len = skb->network_header - skb->mac_header;
1577         __skb_pull(skb, skb->mac_len);
1578
1579         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1580                 struct net_device *dev = skb->dev;
1581                 struct ethtool_drvinfo info = {};
1582
1583                 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1584                         dev->ethtool_ops->get_drvinfo(dev, &info);
1585
1586                 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
1587                         "ip_summed=%d",
1588                      info.driver, dev ? dev->features : 0L,
1589                      skb->sk ? skb->sk->sk_route_caps : 0L,
1590                      skb->len, skb->data_len, skb->ip_summed);
1591
1592                 if (skb_header_cloned(skb) &&
1593                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1594                         return ERR_PTR(err);
1595         }
1596
1597         rcu_read_lock();
1598         list_for_each_entry_rcu(ptype,
1599                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1600                 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1601                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1602                                 err = ptype->gso_send_check(skb);
1603                                 segs = ERR_PTR(err);
1604                                 if (err || skb_gso_ok(skb, features))
1605                                         break;
1606                                 __skb_push(skb, (skb->data -
1607                                                  skb_network_header(skb)));
1608                         }
1609                         segs = ptype->gso_segment(skb, features);
1610                         break;
1611                 }
1612         }
1613         rcu_read_unlock();
1614
1615         __skb_push(skb, skb->data - skb_mac_header(skb));
1616
1617         return segs;
1618 }
1619 EXPORT_SYMBOL(skb_gso_segment);
1620
1621 /* Take action when hardware reception checksum errors are detected. */
1622 #ifdef CONFIG_BUG
1623 void netdev_rx_csum_fault(struct net_device *dev)
1624 {
1625         if (net_ratelimit()) {
1626                 printk(KERN_ERR "%s: hw csum failure.\n",
1627                         dev ? dev->name : "<unknown>");
1628                 dump_stack();
1629         }
1630 }
1631 EXPORT_SYMBOL(netdev_rx_csum_fault);
1632 #endif
1633
1634 /* Actually, we should eliminate this check as soon as we know, that:
1635  * 1. IOMMU is present and allows to map all the memory.
1636  * 2. No high memory really exists on this machine.
1637  */
1638
1639 static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1640 {
1641 #ifdef CONFIG_HIGHMEM
1642         int i;
1643
1644         if (dev->features & NETIF_F_HIGHDMA)
1645                 return 0;
1646
1647         for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1648                 if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1649                         return 1;
1650
1651 #endif
1652         return 0;
1653 }
1654
1655 struct dev_gso_cb {
1656         void (*destructor)(struct sk_buff *skb);
1657 };
1658
1659 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1660
1661 static void dev_gso_skb_destructor(struct sk_buff *skb)
1662 {
1663         struct dev_gso_cb *cb;
1664
1665         do {
1666                 struct sk_buff *nskb = skb->next;
1667
1668                 skb->next = nskb->next;
1669                 nskb->next = NULL;
1670                 kfree_skb(nskb);
1671         } while (skb->next);
1672
1673         cb = DEV_GSO_CB(skb);
1674         if (cb->destructor)
1675                 cb->destructor(skb);
1676 }
1677
1678 /**
1679  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
1680  *      @skb: buffer to segment
1681  *
1682  *      This function segments the given skb and stores the list of segments
1683  *      in skb->next.
1684  */
1685 static int dev_gso_segment(struct sk_buff *skb)
1686 {
1687         struct net_device *dev = skb->dev;
1688         struct sk_buff *segs;
1689         int features = dev->features & ~(illegal_highdma(dev, skb) ?
1690                                          NETIF_F_SG : 0);
1691
1692         segs = skb_gso_segment(skb, features);
1693
1694         /* Verifying header integrity only. */
1695         if (!segs)
1696                 return 0;
1697
1698         if (IS_ERR(segs))
1699                 return PTR_ERR(segs);
1700
1701         skb->next = segs;
1702         DEV_GSO_CB(skb)->destructor = skb->destructor;
1703         skb->destructor = dev_gso_skb_destructor;
1704
1705         return 0;
1706 }
1707
1708 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1709                         struct netdev_queue *txq)
1710 {
1711         const struct net_device_ops *ops = dev->netdev_ops;
1712         int rc;
1713
1714         if (likely(!skb->next)) {
1715                 if (!list_empty(&ptype_all))
1716                         dev_queue_xmit_nit(skb, dev);
1717
1718                 if (netif_needs_gso(dev, skb)) {
1719                         if (unlikely(dev_gso_segment(skb)))
1720                                 goto out_kfree_skb;
1721                         if (skb->next)
1722                                 goto gso;
1723                 }
1724
1725                 /*
1726                  * If device doesnt need skb->dst, release it right now while
1727                  * its hot in this cpu cache
1728                  */
1729                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1730                         skb_dst_drop(skb);
1731
1732                 rc = ops->ndo_start_xmit(skb, dev);
1733                 if (rc == NETDEV_TX_OK)
1734                         txq_trans_update(txq);
1735                 /*
1736                  * TODO: if skb_orphan() was called by
1737                  * dev->hard_start_xmit() (for example, the unmodified
1738                  * igb driver does that; bnx2 doesn't), then
1739                  * skb_tx_software_timestamp() will be unable to send
1740                  * back the time stamp.
1741                  *
1742                  * How can this be prevented? Always create another
1743                  * reference to the socket before calling
1744                  * dev->hard_start_xmit()? Prevent that skb_orphan()
1745                  * does anything in dev->hard_start_xmit() by clearing
1746                  * the skb destructor before the call and restoring it
1747                  * afterwards, then doing the skb_orphan() ourselves?
1748                  */
1749                 return rc;
1750         }
1751
1752 gso:
1753         do {
1754                 struct sk_buff *nskb = skb->next;
1755
1756                 skb->next = nskb->next;
1757                 nskb->next = NULL;
1758
1759                 /*
1760                  * If device doesnt need nskb->dst, release it right now while
1761                  * its hot in this cpu cache
1762                  */
1763                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1764                         skb_dst_drop(nskb);
1765
1766                 rc = ops->ndo_start_xmit(nskb, dev);
1767                 if (unlikely(rc != NETDEV_TX_OK)) {
1768                         nskb->next = skb->next;
1769                         skb->next = nskb;
1770                         return rc;
1771                 }
1772                 txq_trans_update(txq);
1773                 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
1774                         return NETDEV_TX_BUSY;
1775         } while (skb->next);
1776
1777         skb->destructor = DEV_GSO_CB(skb)->destructor;
1778
1779 out_kfree_skb:
1780         kfree_skb(skb);
1781         return NETDEV_TX_OK;
1782 }
1783
1784 static u32 skb_tx_hashrnd;
1785
1786 u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
1787 {
1788         u32 hash;
1789
1790         if (skb_rx_queue_recorded(skb)) {
1791                 hash = skb_get_rx_queue(skb);
1792                 while (unlikely(hash >= dev->real_num_tx_queues))
1793                         hash -= dev->real_num_tx_queues;
1794                 return hash;
1795         }
1796
1797         if (skb->sk && skb->sk->sk_hash)
1798                 hash = skb->sk->sk_hash;
1799         else
1800                 hash = skb->protocol;
1801
1802         hash = jhash_1word(hash, skb_tx_hashrnd);
1803
1804         return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
1805 }
1806 EXPORT_SYMBOL(skb_tx_hash);
1807
1808 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
1809                                         struct sk_buff *skb)
1810 {
1811         const struct net_device_ops *ops = dev->netdev_ops;
1812         u16 queue_index = 0;
1813
1814         if (ops->ndo_select_queue)
1815                 queue_index = ops->ndo_select_queue(dev, skb);
1816         else if (dev->real_num_tx_queues > 1)
1817                 queue_index = skb_tx_hash(dev, skb);
1818
1819         skb_set_queue_mapping(skb, queue_index);
1820         return netdev_get_tx_queue(dev, queue_index);
1821 }
1822
1823 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
1824                                  struct net_device *dev,
1825                                  struct netdev_queue *txq)
1826 {
1827         spinlock_t *root_lock = qdisc_lock(q);
1828         int rc;
1829
1830         spin_lock(root_lock);
1831         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
1832                 kfree_skb(skb);
1833                 rc = NET_XMIT_DROP;
1834         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
1835                    !test_and_set_bit(__QDISC_STATE_RUNNING, &q->state)) {
1836                 /*
1837                  * This is a work-conserving queue; there are no old skbs
1838                  * waiting to be sent out; and the qdisc is not running -
1839                  * xmit the skb directly.
1840                  */
1841                 __qdisc_update_bstats(q, skb->len);
1842                 if (sch_direct_xmit(skb, q, dev, txq, root_lock))
1843                         __qdisc_run(q);
1844                 else
1845                         clear_bit(__QDISC_STATE_RUNNING, &q->state);
1846
1847                 rc = NET_XMIT_SUCCESS;
1848         } else {
1849                 rc = qdisc_enqueue_root(skb, q);
1850                 qdisc_run(q);
1851         }
1852         spin_unlock(root_lock);
1853
1854         return rc;
1855 }
1856
1857 /**
1858  *      dev_queue_xmit - transmit a buffer
1859  *      @skb: buffer to transmit
1860  *
1861  *      Queue a buffer for transmission to a network device. The caller must
1862  *      have set the device and priority and built the buffer before calling
1863  *      this function. The function can be called from an interrupt.
1864  *
1865  *      A negative errno code is returned on a failure. A success does not
1866  *      guarantee the frame will be transmitted as it may be dropped due
1867  *      to congestion or traffic shaping.
1868  *
1869  * -----------------------------------------------------------------------------------
1870  *      I notice this method can also return errors from the queue disciplines,
1871  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
1872  *      be positive.
1873  *
1874  *      Regardless of the return value, the skb is consumed, so it is currently
1875  *      difficult to retry a send to this method.  (You can bump the ref count
1876  *      before sending to hold a reference for retry if you are careful.)
1877  *
1878  *      When calling this method, interrupts MUST be enabled.  This is because
1879  *      the BH enable code must have IRQs enabled so that it will not deadlock.
1880  *          --BLG
1881  */
1882 int dev_queue_xmit(struct sk_buff *skb)
1883 {
1884         struct net_device *dev = skb->dev;
1885         struct netdev_queue *txq;
1886         struct Qdisc *q;
1887         int rc = -ENOMEM;
1888
1889         /* GSO will handle the following emulations directly. */
1890         if (netif_needs_gso(dev, skb))
1891                 goto gso;
1892
1893         if (skb_has_frags(skb) &&
1894             !(dev->features & NETIF_F_FRAGLIST) &&
1895             __skb_linearize(skb))
1896                 goto out_kfree_skb;
1897
1898         /* Fragmented skb is linearized if device does not support SG,
1899          * or if at least one of fragments is in highmem and device
1900          * does not support DMA from it.
1901          */
1902         if (skb_shinfo(skb)->nr_frags &&
1903             (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
1904             __skb_linearize(skb))
1905                 goto out_kfree_skb;
1906
1907         /* If packet is not checksummed and device does not support
1908          * checksumming for this protocol, complete checksumming here.
1909          */
1910         if (skb->ip_summed == CHECKSUM_PARTIAL) {
1911                 skb_set_transport_header(skb, skb->csum_start -
1912                                               skb_headroom(skb));
1913                 if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
1914                         goto out_kfree_skb;
1915         }
1916
1917 gso:
1918         /* Disable soft irqs for various locks below. Also
1919          * stops preemption for RCU.
1920          */
1921         rcu_read_lock_bh();
1922
1923         txq = dev_pick_tx(dev, skb);
1924         q = rcu_dereference(txq->qdisc);
1925
1926 #ifdef CONFIG_NET_CLS_ACT
1927         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
1928 #endif
1929         if (q->enqueue) {
1930                 rc = __dev_xmit_skb(skb, q, dev, txq);
1931                 goto out;
1932         }
1933
1934         /* The device has no queue. Common case for software devices:
1935            loopback, all the sorts of tunnels...
1936
1937            Really, it is unlikely that netif_tx_lock protection is necessary
1938            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
1939            counters.)
1940            However, it is possible, that they rely on protection
1941            made by us here.
1942
1943            Check this and shot the lock. It is not prone from deadlocks.
1944            Either shot noqueue qdisc, it is even simpler 8)
1945          */
1946         if (dev->flags & IFF_UP) {
1947                 int cpu = smp_processor_id(); /* ok because BHs are off */
1948
1949                 if (txq->xmit_lock_owner != cpu) {
1950
1951                         HARD_TX_LOCK(dev, txq, cpu);
1952
1953                         if (!netif_tx_queue_stopped(txq)) {
1954                                 rc = NET_XMIT_SUCCESS;
1955                                 if (!dev_hard_start_xmit(skb, dev, txq)) {
1956                                         HARD_TX_UNLOCK(dev, txq);
1957                                         goto out;
1958                                 }
1959                         }
1960                         HARD_TX_UNLOCK(dev, txq);
1961                         if (net_ratelimit())
1962                                 printk(KERN_CRIT "Virtual device %s asks to "
1963                                        "queue packet!\n", dev->name);
1964                 } else {
1965                         /* Recursion is detected! It is possible,
1966                          * unfortunately */
1967                         if (net_ratelimit())
1968                                 printk(KERN_CRIT "Dead loop on virtual device "
1969                                        "%s, fix it urgently!\n", dev->name);
1970                 }
1971         }
1972
1973         rc = -ENETDOWN;
1974         rcu_read_unlock_bh();
1975
1976 out_kfree_skb:
1977         kfree_skb(skb);
1978         return rc;
1979 out:
1980         rcu_read_unlock_bh();
1981         return rc;
1982 }
1983 EXPORT_SYMBOL(dev_queue_xmit);
1984
1985
1986 /*=======================================================================
1987                         Receiver routines
1988   =======================================================================*/
1989
1990 int netdev_max_backlog __read_mostly = 1000;
1991 int netdev_budget __read_mostly = 300;
1992 int weight_p __read_mostly = 64;            /* old backlog weight */
1993
1994 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1995
1996
1997 /**
1998  *      netif_rx        -       post buffer to the network code
1999  *      @skb: buffer to post
2000  *
2001  *      This function receives a packet from a device driver and queues it for
2002  *      the upper (protocol) levels to process.  It always succeeds. The buffer
2003  *      may be dropped during processing for congestion control or by the
2004  *      protocol layers.
2005  *
2006  *      return values:
2007  *      NET_RX_SUCCESS  (no congestion)
2008  *      NET_RX_DROP     (packet was dropped)
2009  *
2010  */
2011
2012 int netif_rx(struct sk_buff *skb)
2013 {
2014         struct softnet_data *queue;
2015         unsigned long flags;
2016
2017         /* if netpoll wants it, pretend we never saw it */
2018         if (netpoll_rx(skb))
2019                 return NET_RX_DROP;
2020
2021         if (!skb->tstamp.tv64)
2022                 net_timestamp(skb);
2023
2024         /*
2025          * The code is rearranged so that the path is the most
2026          * short when CPU is congested, but is still operating.
2027          */
2028         local_irq_save(flags);
2029         queue = &__get_cpu_var(softnet_data);
2030
2031         __get_cpu_var(netdev_rx_stat).total++;
2032         if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
2033                 if (queue->input_pkt_queue.qlen) {
2034 enqueue:
2035                         __skb_queue_tail(&queue->input_pkt_queue, skb);
2036                         local_irq_restore(flags);
2037                         return NET_RX_SUCCESS;
2038                 }
2039
2040                 napi_schedule(&queue->backlog);
2041                 goto enqueue;
2042         }
2043
2044         __get_cpu_var(netdev_rx_stat).dropped++;
2045         local_irq_restore(flags);
2046
2047         kfree_skb(skb);
2048         return NET_RX_DROP;
2049 }
2050 EXPORT_SYMBOL(netif_rx);
2051
2052 int netif_rx_ni(struct sk_buff *skb)
2053 {
2054         int err;
2055
2056         preempt_disable();
2057         err = netif_rx(skb);
2058         if (local_softirq_pending())
2059                 do_softirq();
2060         preempt_enable();
2061
2062         return err;
2063 }
2064 EXPORT_SYMBOL(netif_rx_ni);
2065
2066 static void net_tx_action(struct softirq_action *h)
2067 {
2068         struct softnet_data *sd = &__get_cpu_var(softnet_data);
2069
2070         if (sd->completion_queue) {
2071                 struct sk_buff *clist;
2072
2073                 local_irq_disable();
2074                 clist = sd->completion_queue;
2075                 sd->completion_queue = NULL;
2076                 local_irq_enable();
2077
2078                 while (clist) {
2079                         struct sk_buff *skb = clist;
2080                         clist = clist->next;
2081
2082                         WARN_ON(atomic_read(&skb->users));
2083                         __kfree_skb(skb);
2084                 }
2085         }
2086
2087         if (sd->output_queue) {
2088                 struct Qdisc *head;
2089
2090                 local_irq_disable();
2091                 head = sd->output_queue;
2092                 sd->output_queue = NULL;
2093                 local_irq_enable();
2094
2095                 while (head) {
2096                         struct Qdisc *q = head;
2097                         spinlock_t *root_lock;
2098
2099                         head = head->next_sched;
2100
2101                         root_lock = qdisc_lock(q);
2102                         if (spin_trylock(root_lock)) {
2103                                 smp_mb__before_clear_bit();
2104                                 clear_bit(__QDISC_STATE_SCHED,
2105                                           &q->state);
2106                                 qdisc_run(q);
2107                                 spin_unlock(root_lock);
2108                         } else {
2109                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
2110                                               &q->state)) {
2111                                         __netif_reschedule(q);
2112                                 } else {
2113                                         smp_mb__before_clear_bit();
2114                                         clear_bit(__QDISC_STATE_SCHED,
2115                                                   &q->state);
2116                                 }
2117                         }
2118                 }
2119         }
2120 }
2121
2122 static inline int deliver_skb(struct sk_buff *skb,
2123                               struct packet_type *pt_prev,
2124                               struct net_device *orig_dev)
2125 {
2126         atomic_inc(&skb->users);
2127         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2128 }
2129
2130 #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
2131
2132 #if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
2133 /* This hook is defined here for ATM LANE */
2134 int (*br_fdb_test_addr_hook)(struct net_device *dev,
2135                              unsigned char *addr) __read_mostly;
2136 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2137 #endif
2138
2139 /*
2140  * If bridge module is loaded call bridging hook.
2141  *  returns NULL if packet was consumed.
2142  */
2143 struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
2144                                         struct sk_buff *skb) __read_mostly;
2145 EXPORT_SYMBOL_GPL(br_handle_frame_hook);
2146
2147 static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
2148                                             struct packet_type **pt_prev, int *ret,
2149                                             struct net_device *orig_dev)
2150 {
2151         struct net_bridge_port *port;
2152
2153         if (skb->pkt_type == PACKET_LOOPBACK ||
2154             (port = rcu_dereference(skb->dev->br_port)) == NULL)
2155                 return skb;
2156
2157         if (*pt_prev) {
2158                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2159                 *pt_prev = NULL;
2160         }
2161
2162         return br_handle_frame_hook(port, skb);
2163 }
2164 #else
2165 #define handle_bridge(skb, pt_prev, ret, orig_dev)      (skb)
2166 #endif
2167
2168 #if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
2169 struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
2170 EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);
2171
2172 static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
2173                                              struct packet_type **pt_prev,
2174                                              int *ret,
2175                                              struct net_device *orig_dev)
2176 {
2177         if (skb->dev->macvlan_port == NULL)
2178                 return skb;
2179
2180         if (*pt_prev) {
2181                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2182                 *pt_prev = NULL;
2183         }
2184         return macvlan_handle_frame_hook(skb);
2185 }
2186 #else
2187 #define handle_macvlan(skb, pt_prev, ret, orig_dev)     (skb)
2188 #endif
2189
2190 #ifdef CONFIG_NET_CLS_ACT
2191 /* TODO: Maybe we should just force sch_ingress to be compiled in
2192  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
2193  * a compare and 2 stores extra right now if we dont have it on
2194  * but have CONFIG_NET_CLS_ACT
2195  * NOTE: This doesnt stop any functionality; if you dont have
2196  * the ingress scheduler, you just cant add policies on ingress.
2197  *
2198  */
2199 static int ing_filter(struct sk_buff *skb)
2200 {
2201         struct net_device *dev = skb->dev;
2202         u32 ttl = G_TC_RTTL(skb->tc_verd);
2203         struct netdev_queue *rxq;
2204         int result = TC_ACT_OK;
2205         struct Qdisc *q;
2206
2207         if (MAX_RED_LOOP < ttl++) {
2208                 printk(KERN_WARNING
2209                        "Redir loop detected Dropping packet (%d->%d)\n",
2210                        skb->iif, dev->ifindex);
2211                 return TC_ACT_SHOT;
2212         }
2213
2214         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2215         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2216
2217         rxq = &dev->rx_queue;
2218
2219         q = rxq->qdisc;
2220         if (q != &noop_qdisc) {
2221                 spin_lock(qdisc_lock(q));
2222                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2223                         result = qdisc_enqueue_root(skb, q);
2224                 spin_unlock(qdisc_lock(q));
2225         }
2226
2227         return result;
2228 }
2229
2230 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2231                                          struct packet_type **pt_prev,
2232                                          int *ret, struct net_device *orig_dev)
2233 {
2234         if (skb->dev->rx_queue.qdisc == &noop_qdisc)
2235                 goto out;
2236
2237         if (*pt_prev) {
2238                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2239                 *pt_prev = NULL;
2240         } else {
2241                 /* Huh? Why does turning on AF_PACKET affect this? */
2242                 skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2243         }
2244
2245         switch (ing_filter(skb)) {
2246         case TC_ACT_SHOT:
2247         case TC_ACT_STOLEN:
2248                 kfree_skb(skb);
2249                 return NULL;
2250         }
2251
2252 out:
2253         skb->tc_verd = 0;
2254         return skb;
2255 }
2256 #endif
2257
2258 /*
2259  *      netif_nit_deliver - deliver received packets to network taps
2260  *      @skb: buffer
2261  *
2262  *      This function is used to deliver incoming packets to network
2263  *      taps. It should be used when the normal netif_receive_skb path
2264  *      is bypassed, for example because of VLAN acceleration.
2265  */
2266 void netif_nit_deliver(struct sk_buff *skb)
2267 {
2268         struct packet_type *ptype;
2269
2270         if (list_empty(&ptype_all))
2271                 return;
2272
2273         skb_reset_network_header(skb);
2274         skb_reset_transport_header(skb);
2275         skb->mac_len = skb->network_header - skb->mac_header;
2276
2277         rcu_read_lock();
2278         list_for_each_entry_rcu(ptype, &ptype_all, list) {
2279                 if (!ptype->dev || ptype->dev == skb->dev)
2280                         deliver_skb(skb, ptype, skb->dev);
2281         }
2282         rcu_read_unlock();
2283 }
2284
2285 /**
2286  *      netif_receive_skb - process receive buffer from network
2287  *      @skb: buffer to process
2288  *
2289  *      netif_receive_skb() is the main receive data processing function.
2290  *      It always succeeds. The buffer may be dropped during processing
2291  *      for congestion control or by the protocol layers.
2292  *
2293  *      This function may only be called from softirq context and interrupts
2294  *      should be enabled.
2295  *
2296  *      Return values (usually ignored):
2297  *      NET_RX_SUCCESS: no congestion
2298  *      NET_RX_DROP: packet was dropped
2299  */
2300 int netif_receive_skb(struct sk_buff *skb)
2301 {
2302         struct packet_type *ptype, *pt_prev;
2303         struct net_device *orig_dev;
2304         struct net_device *null_or_orig;
2305         int ret = NET_RX_DROP;
2306         __be16 type;
2307
2308         if (!skb->tstamp.tv64)
2309                 net_timestamp(skb);
2310
2311         if (skb->vlan_tci && vlan_hwaccel_do_receive(skb))
2312                 return NET_RX_SUCCESS;
2313
2314         /* if we've gotten here through NAPI, check netpoll */
2315         if (netpoll_receive_skb(skb))
2316                 return NET_RX_DROP;
2317
2318         if (!skb->iif)
2319                 skb->iif = skb->dev->ifindex;
2320
2321         null_or_orig = NULL;
2322         orig_dev = skb->dev;
2323         if (orig_dev->master) {
2324                 if (skb_bond_should_drop(skb))
2325                         null_or_orig = orig_dev; /* deliver only exact match */
2326                 else
2327                         skb->dev = orig_dev->master;
2328         }
2329
2330         __get_cpu_var(netdev_rx_stat).total++;
2331
2332         skb_reset_network_header(skb);
2333         skb_reset_transport_header(skb);
2334         skb->mac_len = skb->network_header - skb->mac_header;
2335
2336         pt_prev = NULL;
2337
2338         rcu_read_lock();
2339
2340 #ifdef CONFIG_NET_CLS_ACT
2341         if (skb->tc_verd & TC_NCLS) {
2342                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2343                 goto ncls;
2344         }
2345 #endif
2346
2347         list_for_each_entry_rcu(ptype, &ptype_all, list) {
2348                 if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2349                     ptype->dev == orig_dev) {
2350                         if (pt_prev)
2351                                 ret = deliver_skb(skb, pt_prev, orig_dev);
2352                         pt_prev = ptype;
2353                 }
2354         }
2355
2356 #ifdef CONFIG_NET_CLS_ACT
2357         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2358         if (!skb)
2359                 goto out;
2360 ncls:
2361 #endif
2362
2363         skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
2364         if (!skb)
2365                 goto out;
2366         skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
2367         if (!skb)
2368                 goto out;
2369
2370         type = skb->protocol;
2371         list_for_each_entry_rcu(ptype,
2372                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2373                 if (ptype->type == type &&
2374                     (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2375                      ptype->dev == orig_dev)) {
2376                         if (pt_prev)
2377                                 ret = deliver_skb(skb, pt_prev, orig_dev);
2378                         pt_prev = ptype;
2379                 }
2380         }
2381
2382         if (pt_prev) {
2383                 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2384         } else {
2385                 kfree_skb(skb);
2386                 /* Jamal, now you will not able to escape explaining
2387                  * me how you were going to use this. :-)
2388                  */
2389                 ret = NET_RX_DROP;
2390         }
2391
2392 out:
2393         rcu_read_unlock();
2394         return ret;
2395 }
2396 EXPORT_SYMBOL(netif_receive_skb);
2397
2398 /* Network device is going away, flush any packets still pending  */
2399 static void flush_backlog(void *arg)
2400 {
2401         struct net_device *dev = arg;
2402         struct softnet_data *queue = &__get_cpu_var(softnet_data);
2403         struct sk_buff *skb, *tmp;
2404
2405         skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp)
2406                 if (skb->dev == dev) {
2407                         __skb_unlink(skb, &queue->input_pkt_queue);
2408                         kfree_skb(skb);
2409                 }
2410 }
2411
2412 static int napi_gro_complete(struct sk_buff *skb)
2413 {
2414         struct packet_type *ptype;
2415         __be16 type = skb->protocol;
2416         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2417         int err = -ENOENT;
2418
2419         if (NAPI_GRO_CB(skb)->count == 1) {
2420                 skb_shinfo(skb)->gso_size = 0;
2421                 goto out;
2422         }
2423
2424         rcu_read_lock();
2425         list_for_each_entry_rcu(ptype, head, list) {
2426                 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
2427                         continue;
2428
2429                 err = ptype->gro_complete(skb);
2430                 break;
2431         }
2432         rcu_read_unlock();
2433
2434         if (err) {
2435                 WARN_ON(&ptype->list == head);
2436                 kfree_skb(skb);
2437                 return NET_RX_SUCCESS;
2438         }
2439
2440 out:
2441         return netif_receive_skb(skb);
2442 }
2443
2444 void napi_gro_flush(struct napi_struct *napi)
2445 {
2446         struct sk_buff *skb, *next;
2447
2448         for (skb = napi->gro_list; skb; skb = next) {
2449                 next = skb->next;
2450                 skb->next = NULL;
2451                 napi_gro_complete(skb);
2452         }
2453
2454         napi->gro_count = 0;
2455         napi->gro_list = NULL;
2456 }
2457 EXPORT_SYMBOL(napi_gro_flush);
2458
2459 int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2460 {
2461         struct sk_buff **pp = NULL;
2462         struct packet_type *ptype;
2463         __be16 type = skb->protocol;
2464         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2465         int same_flow;
2466         int mac_len;
2467         int ret;
2468
2469         if (!(skb->dev->features & NETIF_F_GRO))
2470                 goto normal;
2471
2472         if (skb_is_gso(skb) || skb_has_frags(skb))
2473                 goto normal;
2474
2475         rcu_read_lock();
2476         list_for_each_entry_rcu(ptype, head, list) {
2477                 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
2478                         continue;
2479
2480                 skb_set_network_header(skb, skb_gro_offset(skb));
2481                 mac_len = skb->network_header - skb->mac_header;
2482                 skb->mac_len = mac_len;
2483                 NAPI_GRO_CB(skb)->same_flow = 0;
2484                 NAPI_GRO_CB(skb)->flush = 0;
2485                 NAPI_GRO_CB(skb)->free = 0;
2486
2487                 pp = ptype->gro_receive(&napi->gro_list, skb);
2488                 break;
2489         }
2490         rcu_read_unlock();
2491
2492         if (&ptype->list == head)
2493                 goto normal;
2494
2495         same_flow = NAPI_GRO_CB(skb)->same_flow;
2496         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
2497
2498         if (pp) {
2499                 struct sk_buff *nskb = *pp;
2500
2501                 *pp = nskb->next;
2502                 nskb->next = NULL;
2503                 napi_gro_complete(nskb);
2504                 napi->gro_count--;
2505         }
2506
2507         if (same_flow)
2508                 goto ok;
2509
2510         if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
2511                 goto normal;
2512
2513         napi->gro_count++;
2514         NAPI_GRO_CB(skb)->count = 1;
2515         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
2516         skb->next = napi->gro_list;
2517         napi->gro_list = skb;
2518         ret = GRO_HELD;
2519
2520 pull:
2521         if (skb_headlen(skb) < skb_gro_offset(skb)) {
2522                 int grow = skb_gro_offset(skb) - skb_headlen(skb);
2523
2524                 BUG_ON(skb->end - skb->tail < grow);
2525
2526                 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
2527
2528                 skb->tail += grow;
2529                 skb->data_len -= grow;
2530
2531                 skb_shinfo(skb)->frags[0].page_offset += grow;
2532                 skb_shinfo(skb)->frags[0].size -= grow;
2533
2534                 if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
2535                         put_page(skb_shinfo(skb)->frags[0].page);
2536                         memmove(skb_shinfo(skb)->frags,
2537                                 skb_shinfo(skb)->frags + 1,
2538                                 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
2539                 }
2540         }
2541
2542 ok:
2543         return ret;
2544
2545 normal:
2546         ret = GRO_NORMAL;
2547         goto pull;
2548 }
2549 EXPORT_SYMBOL(dev_gro_receive);
2550
2551 static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2552 {
2553         struct sk_buff *p;
2554
2555         if (netpoll_rx_on(skb))
2556                 return GRO_NORMAL;
2557
2558         for (p = napi->gro_list; p; p = p->next) {
2559                 NAPI_GRO_CB(p)->same_flow = (p->dev == skb->dev)
2560                         && !compare_ether_header(skb_mac_header(p),
2561                                                  skb_gro_mac_header(skb));
2562                 NAPI_GRO_CB(p)->flush = 0;
2563         }
2564
2565         return dev_gro_receive(napi, skb);
2566 }
2567
2568 int napi_skb_finish(int ret, struct sk_buff *skb)
2569 {
2570         int err = NET_RX_SUCCESS;
2571
2572         switch (ret) {
2573         case GRO_NORMAL:
2574                 return netif_receive_skb(skb);
2575
2576         case GRO_DROP:
2577                 err = NET_RX_DROP;
2578                 /* fall through */
2579
2580         case GRO_MERGED_FREE:
2581                 kfree_skb(skb);
2582                 break;
2583         }
2584
2585         return err;
2586 }
2587 EXPORT_SYMBOL(napi_skb_finish);
2588
2589 void skb_gro_reset_offset(struct sk_buff *skb)
2590 {
2591         NAPI_GRO_CB(skb)->data_offset = 0;
2592         NAPI_GRO_CB(skb)->frag0 = NULL;
2593         NAPI_GRO_CB(skb)->frag0_len = 0;
2594
2595         if (skb->mac_header == skb->tail &&
2596             !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
2597                 NAPI_GRO_CB(skb)->frag0 =
2598                         page_address(skb_shinfo(skb)->frags[0].page) +
2599                         skb_shinfo(skb)->frags[0].page_offset;
2600                 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
2601         }
2602 }
2603 EXPORT_SYMBOL(skb_gro_reset_offset);
2604
2605 int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2606 {
2607         skb_gro_reset_offset(skb);
2608
2609         return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
2610 }
2611 EXPORT_SYMBOL(napi_gro_receive);
2612
2613 void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
2614 {
2615         __skb_pull(skb, skb_headlen(skb));
2616         skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
2617
2618         napi->skb = skb;
2619 }
2620 EXPORT_SYMBOL(napi_reuse_skb);
2621
2622 struct sk_buff *napi_get_frags(struct napi_struct *napi)
2623 {
2624         struct net_device *dev = napi->dev;
2625         struct sk_buff *skb = napi->skb;
2626
2627         if (!skb) {
2628                 skb = netdev_alloc_skb(dev, GRO_MAX_HEAD + NET_IP_ALIGN);
2629                 if (!skb)
2630                         goto out;
2631
2632                 skb_reserve(skb, NET_IP_ALIGN);
2633
2634                 napi->skb = skb;
2635         }
2636
2637 out:
2638         return skb;
2639 }
2640 EXPORT_SYMBOL(napi_get_frags);
2641
2642 int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret)
2643 {
2644         int err = NET_RX_SUCCESS;
2645
2646         switch (ret) {
2647         case GRO_NORMAL:
2648         case GRO_HELD:
2649                 skb->protocol = eth_type_trans(skb, skb->dev);
2650
2651                 if (ret == GRO_NORMAL)
2652                         return netif_receive_skb(skb);
2653
2654                 skb_gro_pull(skb, -ETH_HLEN);
2655                 break;
2656
2657         case GRO_DROP:
2658                 err = NET_RX_DROP;
2659                 /* fall through */
2660
2661         case GRO_MERGED_FREE:
2662                 napi_reuse_skb(napi, skb);
2663                 break;
2664         }
2665
2666         return err;
2667 }
2668 EXPORT_SYMBOL(napi_frags_finish);
2669
2670 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
2671 {
2672         struct sk_buff *skb = napi->skb;
2673         struct ethhdr *eth;
2674         unsigned int hlen;
2675         unsigned int off;
2676
2677         napi->skb = NULL;
2678
2679         skb_reset_mac_header(skb);
2680         skb_gro_reset_offset(skb);
2681
2682         off = skb_gro_offset(skb);
2683         hlen = off + sizeof(*eth);
2684         eth = skb_gro_header_fast(skb, off);
2685         if (skb_gro_header_hard(skb, hlen)) {
2686                 eth = skb_gro_header_slow(skb, hlen, off);
2687                 if (unlikely(!eth)) {
2688                         napi_reuse_skb(napi, skb);
2689                         skb = NULL;
2690                         goto out;
2691                 }
2692         }
2693
2694         skb_gro_pull(skb, sizeof(*eth));
2695
2696         /*
2697          * This works because the only protocols we care about don't require
2698          * special handling.  We'll fix it up properly at the end.
2699          */
2700         skb->protocol = eth->h_proto;
2701
2702 out:
2703         return skb;
2704 }
2705 EXPORT_SYMBOL(napi_frags_skb);
2706
2707 int napi_gro_frags(struct napi_struct *napi)
2708 {
2709         struct sk_buff *skb = napi_frags_skb(napi);
2710
2711         if (!skb)
2712                 return NET_RX_DROP;
2713
2714         return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
2715 }
2716 EXPORT_SYMBOL(napi_gro_frags);
2717
2718 static int process_backlog(struct napi_struct *napi, int quota)
2719 {
2720         int work = 0;
2721         struct softnet_data *queue = &__get_cpu_var(softnet_data);
2722         unsigned long start_time = jiffies;
2723
2724         napi->weight = weight_p;
2725         do {
2726                 struct sk_buff *skb;
2727
2728                 local_irq_disable();
2729                 skb = __skb_dequeue(&queue->input_pkt_queue);
2730                 if (!skb) {
2731                         __napi_complete(napi);
2732                         local_irq_enable();
2733                         break;
2734                 }
2735                 local_irq_enable();
2736
2737                 netif_receive_skb(skb);
2738         } while (++work < quota && jiffies == start_time);
2739
2740         return work;
2741 }
2742
2743 /**
2744  * __napi_schedule - schedule for receive
2745  * @n: entry to schedule
2746  *
2747  * The entry's receive function will be scheduled to run
2748  */
2749 void __napi_schedule(struct napi_struct *n)
2750 {
2751         unsigned long flags;
2752
2753         local_irq_save(flags);
2754         list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
2755         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2756         local_irq_restore(flags);
2757 }
2758 EXPORT_SYMBOL(__napi_schedule);
2759
2760 void __napi_complete(struct napi_struct *n)
2761 {
2762         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
2763         BUG_ON(n->gro_list);
2764
2765         list_del(&n->poll_list);
2766         smp_mb__before_clear_bit();
2767         clear_bit(NAPI_STATE_SCHED, &n->state);
2768 }
2769 EXPORT_SYMBOL(__napi_complete);
2770
2771 void napi_complete(struct napi_struct *n)
2772 {
2773         unsigned long flags;
2774
2775         /*
2776          * don't let napi dequeue from the cpu poll list
2777          * just in case its running on a different cpu
2778          */
2779         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
2780                 return;
2781
2782         napi_gro_flush(n);
2783         local_irq_save(flags);
2784         __napi_complete(n);
2785         local_irq_restore(flags);
2786 }
2787 EXPORT_SYMBOL(napi_complete);
2788
2789 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
2790                     int (*poll)(struct napi_struct *, int), int weight)
2791 {
2792         INIT_LIST_HEAD(&napi->poll_list);
2793         napi->gro_count = 0;
2794         napi->gro_list = NULL;
2795         napi->skb = NULL;
2796         napi->poll = poll;
2797         napi->weight = weight;
2798         list_add(&napi->dev_list, &dev->napi_list);
2799         napi->dev = dev;
2800 #ifdef CONFIG_NETPOLL
2801         spin_lock_init(&napi->poll_lock);
2802         napi->poll_owner = -1;
2803 #endif
2804         set_bit(NAPI_STATE_SCHED, &napi->state);
2805 }
2806 EXPORT_SYMBOL(netif_napi_add);
2807
2808 void netif_napi_del(struct napi_struct *napi)
2809 {
2810         struct sk_buff *skb, *next;
2811
2812         list_del_init(&napi->dev_list);
2813         napi_free_frags(napi);
2814
2815         for (skb = napi->gro_list; skb; skb = next) {
2816                 next = skb->next;
2817                 skb->next = NULL;
2818                 kfree_skb(skb);
2819         }
2820
2821         napi->gro_list = NULL;
2822         napi->gro_count = 0;
2823 }
2824 EXPORT_SYMBOL(netif_napi_del);
2825
2826
2827 static void net_rx_action(struct softirq_action *h)
2828 {
2829         struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
2830         unsigned long time_limit = jiffies + 2;
2831         int budget = netdev_budget;
2832         void *have;
2833
2834         local_irq_disable();
2835
2836         while (!list_empty(list)) {
2837                 struct napi_struct *n;
2838                 int work, weight;
2839
2840                 /* If softirq window is exhuasted then punt.
2841                  * Allow this to run for 2 jiffies since which will allow
2842                  * an average latency of 1.5/HZ.
2843                  */
2844                 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
2845                         goto softnet_break;
2846
2847                 local_irq_enable();
2848
2849                 /* Even though interrupts have been re-enabled, this
2850                  * access is safe because interrupts can only add new
2851                  * entries to the tail of this list, and only ->poll()
2852                  * calls can remove this head entry from the list.
2853                  */
2854                 n = list_entry(list->next, struct napi_struct, poll_list);
2855
2856                 have = netpoll_poll_lock(n);
2857
2858                 weight = n->weight;
2859
2860                 /* This NAPI_STATE_SCHED test is for avoiding a race
2861                  * with netpoll's poll_napi().  Only the entity which
2862                  * obtains the lock and sees NAPI_STATE_SCHED set will
2863                  * actually make the ->poll() call.  Therefore we avoid
2864                  * accidently calling ->poll() when NAPI is not scheduled.
2865                  */
2866                 work = 0;
2867                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
2868                         work = n->poll(n, weight);
2869                         trace_napi_poll(n);
2870                 }
2871
2872                 WARN_ON_ONCE(work > weight);
2873
2874                 budget -= work;
2875
2876                 local_irq_disable();
2877
2878                 /* Drivers must not modify the NAPI state if they
2879                  * consume the entire weight.  In such cases this code
2880                  * still "owns" the NAPI instance and therefore can
2881                  * move the instance around on the list at-will.
2882                  */
2883                 if (unlikely(work == weight)) {
2884                         if (unlikely(napi_disable_pending(n))) {
2885                                 local_irq_enable();
2886                                 napi_complete(n);
2887                                 local_irq_disable();
2888                         } else
2889                                 list_move_tail(&n->poll_list, list);
2890                 }
2891
2892                 netpoll_poll_unlock(have);
2893         }
2894 out:
2895         local_irq_enable();
2896
2897 #ifdef CONFIG_NET_DMA
2898         /*
2899          * There may not be any more sk_buffs coming right now, so push
2900          * any pending DMA copies to hardware
2901          */
2902         dma_issue_pending_all();
2903 #endif
2904
2905         return;
2906
2907 softnet_break:
2908         __get_cpu_var(netdev_rx_stat).time_squeeze++;
2909         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2910         goto out;
2911 }
2912
2913 static gifconf_func_t *gifconf_list[NPROTO];
2914
2915 /**
2916  *      register_gifconf        -       register a SIOCGIF handler
2917  *      @family: Address family
2918  *      @gifconf: Function handler
2919  *
2920  *      Register protocol dependent address dumping routines. The handler
2921  *      that is passed must not be freed or reused until it has been replaced
2922  *      by another handler.
2923  */
2924 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
2925 {
2926         if (family >= NPROTO)
2927                 return -EINVAL;
2928         gifconf_list[family] = gifconf;
2929         return 0;
2930 }
2931 EXPORT_SYMBOL(register_gifconf);
2932
2933
2934 /*
2935  *      Map an interface index to its name (SIOCGIFNAME)
2936  */
2937
2938 /*
2939  *      We need this ioctl for efficient implementation of the
2940  *      if_indextoname() function required by the IPv6 API.  Without
2941  *      it, we would have to search all the interfaces to find a
2942  *      match.  --pb
2943  */
2944
2945 static int dev_ifname(struct net *net, struct ifreq __user *arg)
2946 {
2947         struct net_device *dev;
2948         struct ifreq ifr;
2949
2950         /*
2951          *      Fetch the caller's info block.
2952          */
2953
2954         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2955                 return -EFAULT;
2956
2957         read_lock(&dev_base_lock);
2958         dev = __dev_get_by_index(net, ifr.ifr_ifindex);
2959         if (!dev) {
2960                 read_unlock(&dev_base_lock);
2961                 return -ENODEV;
2962         }
2963
2964         strcpy(ifr.ifr_name, dev->name);
2965         read_unlock(&dev_base_lock);
2966
2967         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
2968                 return -EFAULT;
2969         return 0;
2970 }
2971
2972 /*
2973  *      Perform a SIOCGIFCONF call. This structure will change
2974  *      size eventually, and there is nothing I can do about it.
2975  *      Thus we will need a 'compatibility mode'.
2976  */
2977
2978 static int dev_ifconf(struct net *net, char __user *arg)
2979 {
2980         struct ifconf ifc;
2981         struct net_device *dev;
2982         char __user *pos;
2983         int len;
2984         int total;
2985         int i;
2986
2987         /*
2988          *      Fetch the caller's info block.
2989          */
2990
2991         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
2992                 return -EFAULT;
2993
2994         pos = ifc.ifc_buf;
2995         len = ifc.ifc_len;
2996
2997         /*
2998          *      Loop over the interfaces, and write an info block for each.
2999          */
3000
3001         total = 0;
3002         for_each_netdev(net, dev) {
3003                 for (i = 0; i < NPROTO; i++) {
3004                         if (gifconf_list[i]) {
3005                                 int done;
3006                                 if (!pos)
3007                                         done = gifconf_list[i](dev, NULL, 0);
3008                                 else
3009                                         done = gifconf_list[i](dev, pos + total,
3010                                                                len - total);
3011                                 if (done < 0)
3012                                         return -EFAULT;
3013                                 total += done;
3014                         }
3015                 }
3016         }
3017
3018         /*
3019          *      All done.  Write the updated control block back to the caller.
3020          */
3021         ifc.ifc_len = total;
3022
3023         /*
3024          *      Both BSD and Solaris return 0 here, so we do too.
3025          */
3026         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3027 }
3028
3029 #ifdef CONFIG_PROC_FS
3030 /*
3031  *      This is invoked by the /proc filesystem handler to display a device
3032  *      in detail.
3033  */
3034 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3035         __acquires(dev_base_lock)
3036 {
3037         struct net *net = seq_file_net(seq);
3038         loff_t off;
3039         struct net_device *dev;
3040
3041         read_lock(&dev_base_lock);
3042         if (!*pos)
3043                 return SEQ_START_TOKEN;
3044
3045         off = 1;
3046         for_each_netdev(net, dev)
3047                 if (off++ == *pos)
3048                         return dev;
3049
3050         return NULL;
3051 }
3052
3053 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3054 {
3055         struct net *net = seq_file_net(seq);
3056         ++*pos;
3057         return v == SEQ_START_TOKEN ?
3058                 first_net_device(net) : next_net_device((struct net_device *)v);
3059 }
3060
3061 void dev_seq_stop(struct seq_file *seq, void *v)
3062         __releases(dev_base_lock)
3063 {
3064         read_unlock(&dev_base_lock);
3065 }
3066
3067 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3068 {
3069         const struct net_device_stats *stats = dev_get_stats(dev);
3070
3071         seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
3072                    "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
3073                    dev->name, stats->rx_bytes, stats->rx_packets,
3074                    stats->rx_errors,
3075                    stats->rx_dropped + stats->rx_missed_errors,
3076                    stats->rx_fifo_errors,
3077                    stats->rx_length_errors + stats->rx_over_errors +
3078                     stats->rx_crc_errors + stats->rx_frame_errors,
3079                    stats->rx_compressed, stats->multicast,
3080                    stats->tx_bytes, stats->tx_packets,
3081                    stats->tx_errors, stats->tx_dropped,
3082                    stats->tx_fifo_errors, stats->collisions,
3083                    stats->tx_carrier_errors +
3084                     stats->tx_aborted_errors +
3085                     stats->tx_window_errors +
3086                     stats->tx_heartbeat_errors,
3087                    stats->tx_compressed);
3088 }
3089
3090 /*
3091  *      Called from the PROCfs module. This now uses the new arbitrary sized
3092  *      /proc/net interface to create /proc/net/dev
3093  */
3094 static int dev_seq_show(struct seq_file *seq, void *v)
3095 {
3096         if (v == SEQ_START_TOKEN)
3097                 seq_puts(seq, "Inter-|   Receive                            "
3098                               "                    |  Transmit\n"
3099                               " face |bytes    packets errs drop fifo frame "
3100                               "compressed multicast|bytes    packets errs "
3101                               "drop fifo colls carrier compressed\n");
3102         else
3103                 dev_seq_printf_stats(seq, v);
3104         return 0;
3105 }
3106
3107 static struct netif_rx_stats *softnet_get_online(loff_t *pos)
3108 {
3109         struct netif_rx_stats *rc = NULL;
3110
3111         while (*pos < nr_cpu_ids)
3112                 if (cpu_online(*pos)) {
3113                         rc = &per_cpu(netdev_rx_stat, *pos);
3114                         break;
3115                 } else
3116                         ++*pos;
3117         return rc;
3118 }
3119
3120 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
3121 {
3122         return softnet_get_online(pos);
3123 }
3124
3125 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3126 {
3127         ++*pos;
3128         return softnet_get_online(pos);
3129 }
3130
3131 static void softnet_seq_stop(struct seq_file *seq, void *v)
3132 {
3133 }
3134
3135 static int softnet_seq_show(struct seq_file *seq, void *v)
3136 {
3137         struct netif_rx_stats *s = v;
3138
3139         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
3140                    s->total, s->dropped, s->time_squeeze, 0,
3141                    0, 0, 0, 0, /* was fastroute */
3142                    s->cpu_collision);
3143         return 0;
3144 }
3145
3146 static const struct seq_operations dev_seq_ops = {
3147         .start = dev_seq_start,
3148         .next  = dev_seq_next,
3149         .stop  = dev_seq_stop,
3150         .show  = dev_seq_show,
3151 };
3152
3153 static int dev_seq_open(struct inode *inode, struct file *file)
3154 {
3155         return seq_open_net(inode, file, &dev_seq_ops,
3156                             sizeof(struct seq_net_private));
3157 }
3158
3159 static const struct file_operations dev_seq_fops = {
3160         .owner   = THIS_MODULE,
3161         .open    = dev_seq_open,
3162         .read    = seq_read,
3163         .llseek  = seq_lseek,
3164         .release = seq_release_net,
3165 };
3166
3167 static const struct seq_operations softnet_seq_ops = {
3168         .start = softnet_seq_start,
3169         .next  = softnet_seq_next,
3170         .stop  = softnet_seq_stop,
3171         .show  = softnet_seq_show,
3172 };
3173
3174 static int softnet_seq_open(struct inode *inode, struct file *file)
3175 {
3176         return seq_open(file, &softnet_seq_ops);
3177 }
3178
3179 static const struct file_operations softnet_seq_fops = {
3180         .owner   = THIS_MODULE,
3181         .open    = softnet_seq_open,
3182         .read    = seq_read,
3183         .llseek  = seq_lseek,
3184         .release = seq_release,
3185 };
3186
3187 static void *ptype_get_idx(loff_t pos)
3188 {
3189         struct packet_type *pt = NULL;
3190         loff_t i = 0;
3191         int t;
3192
3193         list_for_each_entry_rcu(pt, &ptype_all, list) {
3194                 if (i == pos)
3195                         return pt;
3196                 ++i;
3197         }
3198
3199         for (t = 0; t < PTYPE_HASH_SIZE; t++) {
3200                 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3201                         if (i == pos)
3202                                 return pt;
3203                         ++i;
3204                 }
3205         }
3206         return NULL;
3207 }
3208
3209 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
3210         __acquires(RCU)
3211 {
3212         rcu_read_lock();
3213         return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3214 }
3215
3216 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3217 {
3218         struct packet_type *pt;
3219         struct list_head *nxt;
3220         int hash;
3221
3222         ++*pos;
3223         if (v == SEQ_START_TOKEN)
3224                 return ptype_get_idx(0);
3225
3226         pt = v;
3227         nxt = pt->list.next;
3228         if (pt->type == htons(ETH_P_ALL)) {
3229                 if (nxt != &ptype_all)
3230                         goto found;
3231                 hash = 0;
3232                 nxt = ptype_base[0].next;
3233         } else
3234                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
3235
3236         while (nxt == &ptype_base[hash]) {
3237                 if (++hash >= PTYPE_HASH_SIZE)
3238                         return NULL;
3239                 nxt = ptype_base[hash].next;
3240         }
3241 found:
3242         return list_entry(nxt, struct packet_type, list);
3243 }
3244
3245 static void ptype_seq_stop(struct seq_file *seq, void *v)
3246         __releases(RCU)
3247 {
3248         rcu_read_unlock();
3249 }
3250
3251 static int ptype_seq_show(struct seq_file *seq, void *v)
3252 {
3253         struct packet_type *pt = v;
3254
3255         if (v == SEQ_START_TOKEN)
3256                 seq_puts(seq, "Type Device      Function\n");
3257         else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
3258                 if (pt->type == htons(ETH_P_ALL))
3259                         seq_puts(seq, "ALL ");
3260                 else
3261                         seq_printf(seq, "%04x", ntohs(pt->type));
3262
3263                 seq_printf(seq, " %-8s %pF\n",
3264                            pt->dev ? pt->dev->name : "", pt->func);
3265         }
3266
3267         return 0;
3268 }
3269
3270 static const struct seq_operations ptype_seq_ops = {
3271         .start = ptype_seq_start,
3272         .next  = ptype_seq_next,
3273         .stop  = ptype_seq_stop,
3274         .show  = ptype_seq_show,
3275 };
3276
3277 static int ptype_seq_open(struct inode *inode, struct file *file)
3278 {
3279         return seq_open_net(inode, file, &ptype_seq_ops,
3280                         sizeof(struct seq_net_private));
3281 }
3282
3283 static const struct file_operations ptype_seq_fops = {
3284         .owner   = THIS_MODULE,
3285         .open    = ptype_seq_open,
3286         .read    = seq_read,
3287         .llseek  = seq_lseek,
3288         .release = seq_release_net,
3289 };
3290
3291
3292 static int __net_init dev_proc_net_init(struct net *net)
3293 {
3294         int rc = -ENOMEM;
3295
3296         if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
3297                 goto out;
3298         if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
3299                 goto out_dev;
3300         if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
3301                 goto out_softnet;
3302
3303         if (wext_proc_init(net))
3304                 goto out_ptype;
3305         rc = 0;
3306 out:
3307         return rc;
3308 out_ptype:
3309         proc_net_remove(net, "ptype");
3310 out_softnet:
3311         proc_net_remove(net, "softnet_stat");
3312 out_dev:
3313         proc_net_remove(net, "dev");
3314         goto out;
3315 }
3316
3317 static void __net_exit dev_proc_net_exit(struct net *net)
3318 {
3319         wext_proc_exit(net);
3320
3321         proc_net_remove(net, "ptype");
3322         proc_net_remove(net, "softnet_stat");
3323         proc_net_remove(net, "dev");
3324 }
3325
3326 static struct pernet_operations __net_initdata dev_proc_ops = {
3327         .init = dev_proc_net_init,
3328         .exit = dev_proc_net_exit,
3329 };
3330
3331 static int __init dev_proc_init(void)
3332 {
3333         return register_pernet_subsys(&dev_proc_ops);
3334 }
3335 #else
3336 #define dev_proc_init() 0
3337 #endif  /* CONFIG_PROC_FS */
3338
3339
3340 /**
3341  *      netdev_set_master       -       set up master/slave pair
3342  *      @slave: slave device
3343  *      @master: new master device
3344  *
3345  *      Changes the master device of the slave. Pass %NULL to break the
3346  *      bonding. The caller must hold the RTNL semaphore. On a failure
3347  *      a negative errno code is returned. On success the reference counts
3348  *      are adjusted, %RTM_NEWLINK is sent to the routing socket and the
3349  *      function returns zero.
3350  */
3351 int netdev_set_master(struct net_device *slave, struct net_device *master)
3352 {
3353         struct net_device *old = slave->master;
3354
3355         ASSERT_RTNL();
3356
3357         if (master) {
3358                 if (old)
3359                         return -EBUSY;
3360                 dev_hold(master);
3361         }
3362
3363         slave->master = master;
3364
3365         synchronize_net();
3366
3367         if (old)
3368                 dev_put(old);
3369
3370         if (master)
3371                 slave->flags |= IFF_SLAVE;
3372         else
3373                 slave->flags &= ~IFF_SLAVE;
3374
3375         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
3376         return 0;
3377 }
3378 EXPORT_SYMBOL(netdev_set_master);
3379
3380 static void dev_change_rx_flags(struct net_device *dev, int flags)
3381 {
3382         const struct net_device_ops *ops = dev->netdev_ops;
3383
3384         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
3385                 ops->ndo_change_rx_flags(dev, flags);
3386 }
3387
3388 static int __dev_set_promiscuity(struct net_device *dev, int inc)
3389 {
3390         unsigned short old_flags = dev->flags;
3391         uid_t uid;
3392         gid_t gid;
3393
3394         ASSERT_RTNL();
3395
3396         dev->flags |= IFF_PROMISC;
3397         dev->promiscuity += inc;
3398         if (dev->promiscuity == 0) {
3399                 /*
3400                  * Avoid overflow.
3401                  * If inc causes overflow, untouch promisc and return error.
3402                  */
3403                 if (inc < 0)
3404                         dev->flags &= ~IFF_PROMISC;
3405                 else {
3406                         dev->promiscuity -= inc;
3407                         printk(KERN_WARNING "%s: promiscuity touches roof, "
3408                                 "set promiscuity failed, promiscuity feature "
3409                                 "of device might be broken.\n", dev->name);
3410                         return -EOVERFLOW;
3411                 }
3412         }
3413         if (dev->flags != old_flags) {
3414                 printk(KERN_INFO "device %s %s promiscuous mode\n",
3415                        dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
3416                                                                "left");
3417                 if (audit_enabled) {
3418                         current_uid_gid(&uid, &gid);
3419                         audit_log(current->audit_context, GFP_ATOMIC,
3420                                 AUDIT_ANOM_PROMISCUOUS,
3421                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
3422                                 dev->name, (dev->flags & IFF_PROMISC),
3423                                 (old_flags & IFF_PROMISC),
3424                                 audit_get_loginuid(current),
3425                                 uid, gid,
3426                                 audit_get_sessionid(current));
3427                 }
3428
3429                 dev_change_rx_flags(dev, IFF_PROMISC);
3430         }
3431         return 0;
3432 }
3433
3434 /**
3435  *      dev_set_promiscuity     - update promiscuity count on a device
3436  *      @dev: device
3437  *      @inc: modifier
3438  *
3439  *      Add or remove promiscuity from a device. While the count in the device
3440  *      remains above zero the interface remains promiscuous. Once it hits zero
3441  *      the device reverts back to normal filtering operation. A negative inc
3442  *      value is used to drop promiscuity on the device.
3443  *      Return 0 if successful or a negative errno code on error.
3444  */
3445 int dev_set_promiscuity(struct net_device *dev, int inc)
3446 {
3447         unsigned short old_flags = dev->flags;
3448         int err;
3449
3450         err = __dev_set_promiscuity(dev, inc);
3451         if (err < 0)
3452                 return err;
3453         if (dev->flags != old_flags)
3454                 dev_set_rx_mode(dev);
3455         return err;
3456 }
3457 EXPORT_SYMBOL(dev_set_promiscuity);
3458
3459 /**
3460  *      dev_set_allmulti        - update allmulti count on a device
3461  *      @dev: device
3462  *      @inc: modifier
3463  *
3464  *      Add or remove reception of all multicast frames to a device. While the
3465  *      count in the device remains above zero the interface remains listening
3466  *      to all interfaces. Once it hits zero the device reverts back to normal
3467  *      filtering operation. A negative @inc value is used to drop the counter
3468  *      when releasing a resource needing all multicasts.
3469  *      Return 0 if successful or a negative errno code on error.
3470  */
3471
3472 int dev_set_allmulti(struct net_device *dev, int inc)
3473 {
3474         unsigned short old_flags = dev->flags;
3475
3476         ASSERT_RTNL();
3477
3478         dev->flags |= IFF_ALLMULTI;
3479         dev->allmulti += inc;
3480         if (dev->allmulti == 0) {
3481                 /*
3482                  * Avoid overflow.
3483                  * If inc causes overflow, untouch allmulti and return error.
3484                  */
3485                 if (inc < 0)
3486                         dev->flags &= ~IFF_ALLMULTI;
3487                 else {
3488                         dev->allmulti -= inc;
3489                         printk(KERN_WARNING "%s: allmulti touches roof, "
3490                                 "set allmulti failed, allmulti feature of "
3491                                 "device might be broken.\n", dev->name);
3492                         return -EOVERFLOW;
3493                 }
3494         }
3495         if (dev->flags ^ old_flags) {
3496                 dev_change_rx_flags(dev, IFF_ALLMULTI);
3497                 dev_set_rx_mode(dev);
3498         }
3499         return 0;
3500 }
3501 EXPORT_SYMBOL(dev_set_allmulti);
3502
3503 /*
3504  *      Upload unicast and multicast address lists to device and
3505  *      configure RX filtering. When the device doesn't support unicast
3506  *      filtering it is put in promiscuous mode while unicast addresses
3507  *      are present.
3508  */
3509 void __dev_set_rx_mode(struct net_device *dev)
3510 {
3511         const struct net_device_ops *ops = dev->netdev_ops;
3512
3513         /* dev_open will call this function so the list will stay sane. */
3514         if (!(dev->flags&IFF_UP))
3515                 return;
3516
3517         if (!netif_device_present(dev))
3518                 return;
3519
3520         if (ops->ndo_set_rx_mode)
3521                 ops->ndo_set_rx_mode(dev);
3522         else {
3523                 /* Unicast addresses changes may only happen under the rtnl,
3524                  * therefore calling __dev_set_promiscuity here is safe.
3525                  */
3526                 if (dev->uc.count > 0 && !dev->uc_promisc) {
3527                         __dev_set_promiscuity(dev, 1);
3528                         dev->uc_promisc = 1;
3529                 } else if (dev->uc.count == 0 && dev->uc_promisc) {
3530                         __dev_set_promiscuity(dev, -1);
3531                         dev->uc_promisc = 0;
3532                 }
3533
3534                 if (ops->ndo_set_multicast_list)
3535                         ops->ndo_set_multicast_list(dev);
3536         }
3537 }
3538
3539 void dev_set_rx_mode(struct net_device *dev)
3540 {
3541         netif_addr_lock_bh(dev);
3542         __dev_set_rx_mode(dev);
3543         netif_addr_unlock_bh(dev);
3544 }
3545
3546 /* hw addresses list handling functions */
3547
3548 static int __hw_addr_add(struct netdev_hw_addr_list *list, unsigned char *addr,
3549                          int addr_len, unsigned char addr_type)
3550 {
3551         struct netdev_hw_addr *ha;
3552         int alloc_size;
3553
3554         if (addr_len > MAX_ADDR_LEN)
3555                 return -EINVAL;
3556
3557         list_for_each_entry(ha, &list->list, list) {
3558                 if (!memcmp(ha->addr, addr, addr_len) &&
3559                     ha->type == addr_type) {
3560                         ha->refcount++;
3561                         return 0;
3562                 }
3563         }
3564
3565
3566         alloc_size = sizeof(*ha);
3567         if (alloc_size < L1_CACHE_BYTES)
3568                 alloc_size = L1_CACHE_BYTES;
3569         ha = kmalloc(alloc_size, GFP_ATOMIC);
3570         if (!ha)
3571                 return -ENOMEM;
3572         memcpy(ha->addr, addr, addr_len);
3573         ha->type = addr_type;
3574         ha->refcount = 1;
3575         ha->synced = false;
3576         list_add_tail_rcu(&ha->list, &list->list);
3577         list->count++;
3578         return 0;
3579 }
3580
3581 static void ha_rcu_free(struct rcu_head *head)
3582 {
3583         struct netdev_hw_addr *ha;
3584
3585         ha = container_of(head, struct netdev_hw_addr, rcu_head);
3586         kfree(ha);
3587 }
3588
3589 static int __hw_addr_del(struct netdev_hw_addr_list *list, unsigned char *addr,
3590                          int addr_len, unsigned char addr_type)
3591 {
3592         struct netdev_hw_addr *ha;
3593
3594         list_for_each_entry(ha, &list->list, list) {
3595                 if (!memcmp(ha->addr, addr, addr_len) &&
3596                     (ha->type == addr_type || !addr_type)) {
3597                         if (--ha->refcount)
3598                                 return 0;
3599                         list_del_rcu(&ha->list);
3600                         call_rcu(&ha->rcu_head, ha_rcu_free);
3601                         list->count--;
3602                         return 0;
3603                 }
3604         }
3605         return -ENOENT;
3606 }
3607
3608 static int __hw_addr_add_multiple(struct netdev_hw_addr_list *to_list,
3609                                   struct netdev_hw_addr_list *from_list,
3610                                   int addr_len,
3611                                   unsigned char addr_type)
3612 {
3613         int err;
3614         struct netdev_hw_addr *ha, *ha2;
3615         unsigned char type;
3616
3617         list_for_each_entry(ha, &from_list->list, list) {
3618                 type = addr_type ? addr_type : ha->type;
3619                 err = __hw_addr_add(to_list, ha->addr, addr_len, type);
3620                 if (err)
3621                         goto unroll;
3622         }
3623         return 0;
3624
3625 unroll:
3626         list_for_each_entry(ha2, &from_list->list, list) {
3627                 if (ha2 == ha)
3628                         break;
3629                 type = addr_type ? addr_type : ha2->type;
3630                 __hw_addr_del(to_list, ha2->addr, addr_len, type);
3631         }
3632         return err;
3633 }
3634
3635 static void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list,
3636                                    struct netdev_hw_addr_list *from_list,
3637                                    int addr_len,
3638                                    unsigned char addr_type)
3639 {
3640         struct netdev_hw_addr *ha;
3641         unsigned char type;
3642
3643         list_for_each_entry(ha, &from_list->list, list) {
3644                 type = addr_type ? addr_type : ha->type;
3645                 __hw_addr_del(to_list, ha->addr, addr_len, addr_type);
3646         }
3647 }
3648
3649 static int __hw_addr_sync(struct netdev_hw_addr_list *to_list,
3650                           struct netdev_hw_addr_list *from_list,
3651                           int addr_len)
3652 {
3653         int err = 0;
3654         struct netdev_hw_addr *ha, *tmp;
3655
3656         list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
3657                 if (!ha->synced) {
3658                         err = __hw_addr_add(to_list, ha->addr,
3659                                             addr_len, ha->type);
3660                         if (err)
3661                                 break;
3662                         ha->synced = true;
3663                         ha->refcount++;
3664                 } else if (ha->refcount == 1) {
3665                         __hw_addr_del(to_list, ha->addr, addr_len, ha->type);
3666                         __hw_addr_del(from_list, ha->addr, addr_len, ha->type);
3667                 }
3668         }
3669         return err;
3670 }
3671
3672 static void __hw_addr_unsync(struct netdev_hw_addr_list *to_list,
3673                              struct netdev_hw_addr_list *from_list,
3674                              int addr_len)
3675 {
3676         struct netdev_hw_addr *ha, *tmp;
3677
3678         list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
3679                 if (ha->synced) {
3680                         __hw_addr_del(to_list, ha->addr,
3681                                       addr_len, ha->type);
3682                         ha->synced = false;
3683                         __hw_addr_del(from_list, ha->addr,
3684                                       addr_len, ha->type);
3685                 }
3686         }
3687 }
3688
3689 static void __hw_addr_flush(struct netdev_hw_addr_list *list)
3690 {
3691         struct netdev_hw_addr *ha, *tmp;
3692
3693         list_for_each_entry_safe(ha, tmp, &list->list, list) {
3694                 list_del_rcu(&ha->list);
3695                 call_rcu(&ha->rcu_head, ha_rcu_free);
3696         }
3697         list->count = 0;
3698 }
3699
3700 static void __hw_addr_init(struct netdev_hw_addr_list *list)
3701 {
3702         INIT_LIST_HEAD(&list->list);
3703         list->count = 0;
3704 }
3705
3706 /* Device addresses handling functions */
3707
3708 static void dev_addr_flush(struct net_device *dev)
3709 {
3710         /* rtnl_mutex must be held here */
3711
3712         __hw_addr_flush(&dev->dev_addrs);
3713         dev->dev_addr = NULL;
3714 }
3715
3716 static int dev_addr_init(struct net_device *dev)
3717 {
3718         unsigned char addr[MAX_ADDR_LEN];
3719         struct netdev_hw_addr *ha;
3720         int err;
3721
3722         /* rtnl_mutex must be held here */
3723
3724         __hw_addr_init(&dev->dev_addrs);
3725         memset(addr, 0, sizeof(addr));
3726         err = __hw_addr_add(&dev->dev_addrs, addr, sizeof(addr),
3727                             NETDEV_HW_ADDR_T_LAN);
3728         if (!err) {
3729                 /*
3730                  * Get the first (previously created) address from the list
3731                  * and set dev_addr pointer to this location.
3732                  */
3733                 ha = list_first_entry(&dev->dev_addrs.list,
3734                                       struct netdev_hw_addr, list);
3735                 dev->dev_addr = ha->addr;
3736         }
3737         return err;
3738 }
3739
3740 /**
3741  *      dev_addr_add    - Add a device address
3742  *      @dev: device
3743  *      @addr: address to add
3744  *      @addr_type: address type
3745  *
3746  *      Add a device address to the device or increase the reference count if
3747  *      it already exists.
3748  *
3749  *      The caller must hold the rtnl_mutex.
3750  */
3751 int dev_addr_add(struct net_device *dev, unsigned char *addr,
3752                  unsigned char addr_type)
3753 {
3754         int err;
3755
3756         ASSERT_RTNL();
3757
3758         err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type);
3759         if (!err)
3760                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3761         return err;
3762 }
3763 EXPORT_SYMBOL(dev_addr_add);
3764
3765 /**
3766  *      dev_addr_del    - Release a device address.
3767  *      @dev: device
3768  *      @addr: address to delete
3769  *      @addr_type: address type
3770  *
3771  *      Release reference to a device address and remove it from the device
3772  *      if the reference count drops to zero.
3773  *
3774  *      The caller must hold the rtnl_mutex.
3775  */
3776 int dev_addr_del(struct net_device *dev, unsigned char *addr,
3777                  unsigned char addr_type)
3778 {
3779         int err;
3780         struct netdev_hw_addr *ha;
3781
3782         ASSERT_RTNL();
3783
3784         /*
3785          * We can not remove the first address from the list because
3786          * dev->dev_addr points to that.
3787          */
3788         ha = list_first_entry(&dev->dev_addrs.list,
3789                               struct netdev_hw_addr, list);
3790         if (ha->addr == dev->dev_addr && ha->refcount == 1)
3791                 return -ENOENT;
3792
3793         err = __hw_addr_del(&dev->dev_addrs, addr, dev->addr_len,
3794                             addr_type);
3795         if (!err)
3796                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3797         return err;
3798 }
3799 EXPORT_SYMBOL(dev_addr_del);
3800
3801 /**
3802  *      dev_addr_add_multiple   - Add device addresses from another device
3803  *      @to_dev: device to which addresses will be added
3804  *      @from_dev: device from which addresses will be added
3805  *      @addr_type: address type - 0 means type will be used from from_dev
3806  *
3807  *      Add device addresses of the one device to another.
3808  **
3809  *      The caller must hold the rtnl_mutex.
3810  */
3811 int dev_addr_add_multiple(struct net_device *to_dev,
3812                           struct net_device *from_dev,
3813                           unsigned char addr_type)
3814 {
3815         int err;
3816
3817         ASSERT_RTNL();
3818
3819         if (from_dev->addr_len != to_dev->addr_len)
3820                 return -EINVAL;
3821         err = __hw_addr_add_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs,
3822                                      to_dev->addr_len, addr_type);
3823         if (!err)
3824                 call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
3825         return err;
3826 }
3827 EXPORT_SYMBOL(dev_addr_add_multiple);
3828
3829 /**
3830  *      dev_addr_del_multiple   - Delete device addresses by another device
3831  *      @to_dev: device where the addresses will be deleted
3832  *      @from_dev: device by which addresses the addresses will be deleted
3833  *      @addr_type: address type - 0 means type will used from from_dev
3834  *
3835  *      Deletes addresses in to device by the list of addresses in from device.
3836  *
3837  *      The caller must hold the rtnl_mutex.
3838  */
3839 int dev_addr_del_multiple(struct net_device *to_dev,
3840                           struct net_device *from_dev,
3841                           unsigned char addr_type)
3842 {
3843         ASSERT_RTNL();
3844
3845         if (from_dev->addr_len != to_dev->addr_len)
3846                 return -EINVAL;
3847         __hw_addr_del_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs,
3848                                to_dev->addr_len, addr_type);
3849         call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
3850         return 0;
3851 }
3852 EXPORT_SYMBOL(dev_addr_del_multiple);
3853
3854 /* multicast addresses handling functions */
3855
3856 int __dev_addr_delete(struct dev_addr_list **list, int *count,
3857                       void *addr, int alen, int glbl)
3858 {
3859         struct dev_addr_list *da;
3860
3861         for (; (da = *list) != NULL; list = &da->next) {
3862                 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3863                     alen == da->da_addrlen) {
3864                         if (glbl) {
3865                                 int old_glbl = da->da_gusers;
3866                                 da->da_gusers = 0;
3867                                 if (old_glbl == 0)
3868                                         break;
3869                         }
3870                         if (--da->da_users)
3871                                 return 0;
3872
3873                         *list = da->next;
3874                         kfree(da);
3875                         (*count)--;
3876                         return 0;
3877                 }
3878         }
3879         return -ENOENT;
3880 }
3881
3882 int __dev_addr_add(struct dev_addr_list **list, int *count,
3883                    void *addr, int alen, int glbl)
3884 {
3885         struct dev_addr_list *da;
3886
3887         for (da = *list; da != NULL; da = da->next) {
3888                 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3889                     da->da_addrlen == alen) {
3890                         if (glbl) {
3891                                 int old_glbl = da->da_gusers;
3892                                 da->da_gusers = 1;
3893                                 if (old_glbl)
3894                                         return 0;
3895                         }
3896                         da->da_users++;
3897                         return 0;
3898                 }
3899         }
3900
3901         da = kzalloc(sizeof(*da), GFP_ATOMIC);
3902         if (da == NULL)
3903                 return -ENOMEM;
3904         memcpy(da->da_addr, addr, alen);
3905         da->da_addrlen = alen;
3906         da->da_users = 1;
3907         da->da_gusers = glbl ? 1 : 0;
3908         da->next = *list;
3909         *list = da;
3910         (*count)++;
3911         return 0;
3912 }
3913
3914 /**
3915  *      dev_unicast_delete      - Release secondary unicast address.
3916  *      @dev: device
3917  *      @addr: address to delete
3918  *
3919  *      Release reference to a secondary unicast address and remove it
3920  *      from the device if the reference count drops to zero.
3921  *
3922  *      The caller must hold the rtnl_mutex.
3923  */
3924 int dev_unicast_delete(struct net_device *dev, void *addr)
3925 {
3926         int err;
3927
3928         ASSERT_RTNL();
3929
3930         netif_addr_lock_bh(dev);
3931         err = __hw_addr_del(&dev->uc, addr, dev->addr_len,
3932                             NETDEV_HW_ADDR_T_UNICAST);
3933         if (!err)
3934                 __dev_set_rx_mode(dev);
3935         netif_addr_unlock_bh(dev);
3936         return err;
3937 }
3938 EXPORT_SYMBOL(dev_unicast_delete);
3939
3940 /**
3941  *      dev_unicast_add         - add a secondary unicast address
3942  *      @dev: device
3943  *      @addr: address to add
3944  *
3945  *      Add a secondary unicast address to the device or increase
3946  *      the reference count if it already exists.
3947  *
3948  *      The caller must hold the rtnl_mutex.
3949  */
3950 int dev_unicast_add(struct net_device *dev, void *addr)
3951 {
3952         int err;
3953
3954         ASSERT_RTNL();
3955
3956         netif_addr_lock_bh(dev);
3957         err = __hw_addr_add(&dev->uc, addr, dev->addr_len,
3958                             NETDEV_HW_ADDR_T_UNICAST);
3959         if (!err)
3960                 __dev_set_rx_mode(dev);
3961         netif_addr_unlock_bh(dev);
3962         return err;
3963 }
3964 EXPORT_SYMBOL(dev_unicast_add);
3965
3966 int __dev_addr_sync(struct dev_addr_list **to, int *to_count,
3967                     struct dev_addr_list **from, int *from_count)
3968 {
3969         struct dev_addr_list *da, *next;
3970         int err = 0;
3971
3972         da = *from;
3973         while (da != NULL) {
3974                 next = da->next;
3975                 if (!da->da_synced) {
3976                         err = __dev_addr_add(to, to_count,
3977                                              da->da_addr, da->da_addrlen, 0);
3978                         if (err < 0)
3979                                 break;
3980                         da->da_synced = 1;
3981                         da->da_users++;
3982                 } else if (da->da_users == 1) {
3983                         __dev_addr_delete(to, to_count,
3984                                           da->da_addr, da->da_addrlen, 0);
3985                         __dev_addr_delete(from, from_count,
3986                                           da->da_addr, da->da_addrlen, 0);
3987                 }
3988                 da = next;
3989         }
3990         return err;
3991 }
3992 EXPORT_SYMBOL_GPL(__dev_addr_sync);
3993
3994 void __dev_addr_unsync(struct dev_addr_list **to, int *to_count,
3995                        struct dev_addr_list **from, int *from_count)
3996 {
3997         struct dev_addr_list *da, *next;
3998
3999         da = *from;
4000         while (da != NULL) {
4001                 next = da->next;
4002                 if (da->da_synced) {
4003                         __dev_addr_delete(to, to_count,
4004                                           da->da_addr, da->da_addrlen, 0);
4005                         da->da_synced = 0;
4006                         __dev_addr_delete(from, from_count,
4007                                           da->da_addr, da->da_addrlen, 0);
4008                 }
4009                 da = next;
4010         }
4011 }
4012 EXPORT_SYMBOL_GPL(__dev_addr_unsync);
4013
4014 /**
4015  *      dev_unicast_sync - Synchronize device's unicast list to another device
4016  *      @to: destination device
4017  *      @from: source device
4018  *
4019  *      Add newly added addresses to the destination device and release
4020  *      addresses that have no users left. The source device must be
4021  *      locked by netif_tx_lock_bh.
4022  *
4023  *      This function is intended to be called from the dev->set_rx_mode
4024  *      function of layered software devices.
4025  */
4026 int dev_unicast_sync(struct net_device *to, struct net_device *from)
4027 {
4028         int err = 0;
4029
4030         if (to->addr_len != from->addr_len)
4031                 return -EINVAL;
4032
4033         netif_addr_lock_bh(to);
4034         err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len);
4035         if (!err)
4036                 __dev_set_rx_mode(to);
4037         netif_addr_unlock_bh(to);
4038         return err;
4039 }
4040 EXPORT_SYMBOL(dev_unicast_sync);
4041
4042 /**
4043  *      dev_unicast_unsync - Remove synchronized addresses from the destination device
4044  *      @to: destination device
4045  *      @from: source device
4046  *
4047  *      Remove all addresses that were added to the destination device by
4048  *      dev_unicast_sync(). This function is intended to be called from the
4049  *      dev->stop function of layered software devices.
4050  */
4051 void dev_unicast_unsync(struct net_device *to, struct net_device *from)
4052 {
4053         if (to->addr_len != from->addr_len)
4054                 return;
4055
4056         netif_addr_lock_bh(from);
4057         netif_addr_lock(to);
4058         __hw_addr_unsync(&to->uc, &from->uc, to->addr_len);
4059         __dev_set_rx_mode(to);
4060         netif_addr_unlock(to);
4061         netif_addr_unlock_bh(from);
4062 }
4063 EXPORT_SYMBOL(dev_unicast_unsync);
4064
4065 static void dev_unicast_flush(struct net_device *dev)
4066 {
4067         netif_addr_lock_bh(dev);
4068         __hw_addr_flush(&dev->uc);
4069         netif_addr_unlock_bh(dev);
4070 }
4071
4072 static void dev_unicast_init(struct net_device *dev)
4073 {
4074         __hw_addr_init(&dev->uc);
4075 }
4076
4077
4078 static void __dev_addr_discard(struct dev_addr_list **list)
4079 {
4080         struct dev_addr_list *tmp;
4081
4082         while (*list != NULL) {
4083                 tmp = *list;
4084                 *list = tmp->next;
4085                 if (tmp->da_users > tmp->da_gusers)
4086                         printk("__dev_addr_discard: address leakage! "
4087                                "da_users=%d\n", tmp->da_users);
4088                 kfree(tmp);
4089         }
4090 }
4091
4092 static void dev_addr_discard(struct net_device *dev)
4093 {
4094         netif_addr_lock_bh(dev);
4095
4096         __dev_addr_discard(&dev->mc_list);
4097         dev->mc_count = 0;
4098
4099         netif_addr_unlock_bh(dev);
4100 }
4101
4102 /**
4103  *      dev_get_flags - get flags reported to userspace
4104  *      @dev: device
4105  *
4106  *      Get the combination of flag bits exported through APIs to userspace.
4107  */
4108 unsigned dev_get_flags(const struct net_device *dev)
4109 {
4110         unsigned flags;
4111
4112         flags = (dev->flags & ~(IFF_PROMISC |
4113                                 IFF_ALLMULTI |
4114                                 IFF_RUNNING |
4115                                 IFF_LOWER_UP |
4116                                 IFF_DORMANT)) |
4117                 (dev->gflags & (IFF_PROMISC |
4118                                 IFF_ALLMULTI));
4119
4120         if (netif_running(dev)) {
4121                 if (netif_oper_up(dev))
4122                         flags |= IFF_RUNNING;
4123                 if (netif_carrier_ok(dev))
4124                         flags |= IFF_LOWER_UP;
4125                 if (netif_dormant(dev))
4126                         flags |= IFF_DORMANT;
4127         }
4128
4129         return flags;
4130 }
4131 EXPORT_SYMBOL(dev_get_flags);
4132
4133 /**
4134  *      dev_change_flags - change device settings
4135  *      @dev: device
4136  *      @flags: device state flags
4137  *
4138  *      Change settings on device based state flags. The flags are
4139  *      in the userspace exported format.
4140  */
4141 int dev_change_flags(struct net_device *dev, unsigned flags)
4142 {
4143         int ret, changes;
4144         int old_flags = dev->flags;
4145
4146         ASSERT_RTNL();
4147
4148         /*
4149          *      Set the flags on our device.
4150          */
4151
4152         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4153                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4154                                IFF_AUTOMEDIA)) |
4155                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4156                                     IFF_ALLMULTI));
4157
4158         /*
4159          *      Load in the correct multicast list now the flags have changed.
4160          */
4161
4162         if ((old_flags ^ flags) & IFF_MULTICAST)
4163                 dev_change_rx_flags(dev, IFF_MULTICAST);
4164
4165         dev_set_rx_mode(dev);
4166
4167         /*
4168          *      Have we downed the interface. We handle IFF_UP ourselves
4169          *      according to user attempts to set it, rather than blindly
4170          *      setting it.
4171          */
4172
4173         ret = 0;
4174         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4175                 ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
4176
4177                 if (!ret)
4178                         dev_set_rx_mode(dev);
4179         }
4180
4181         if (dev->flags & IFF_UP &&
4182             ((old_flags ^ dev->flags) & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
4183                                           IFF_VOLATILE)))
4184                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4185
4186         if ((flags ^ dev->gflags) & IFF_PROMISC) {
4187                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4188
4189                 dev->gflags ^= IFF_PROMISC;
4190                 dev_set_promiscuity(dev, inc);
4191         }
4192
4193         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4194            is important. Some (broken) drivers set IFF_PROMISC, when
4195            IFF_ALLMULTI is requested not asking us and not reporting.
4196          */
4197         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4198                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4199
4200                 dev->gflags ^= IFF_ALLMULTI;
4201                 dev_set_allmulti(dev, inc);
4202         }
4203
4204         /* Exclude state transition flags, already notified */
4205         changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING);
4206         if (changes)
4207                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4208
4209         return ret;
4210 }
4211 EXPORT_SYMBOL(dev_change_flags);
4212
4213 /**
4214  *      dev_set_mtu - Change maximum transfer unit
4215  *      @dev: device
4216  *      @new_mtu: new transfer unit
4217  *
4218  *      Change the maximum transfer size of the network device.
4219  */
4220 int dev_set_mtu(struct net_device *dev, int new_mtu)
4221 {
4222         const struct net_device_ops *ops = dev->netdev_ops;
4223         int err;
4224
4225         if (new_mtu == dev->mtu)
4226                 return 0;
4227
4228         /*      MTU must be positive.    */
4229         if (new_mtu < 0)
4230                 return -EINVAL;
4231
4232         if (!netif_device_present(dev))
4233                 return -ENODEV;
4234
4235         err = 0;
4236         if (ops->ndo_change_mtu)
4237                 err = ops->ndo_change_mtu(dev, new_mtu);
4238         else
4239                 dev->mtu = new_mtu;
4240
4241         if (!err && dev->flags & IFF_UP)
4242                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4243         return err;
4244 }
4245 EXPORT_SYMBOL(dev_set_mtu);
4246
4247 /**
4248  *      dev_set_mac_address - Change Media Access Control Address
4249  *      @dev: device
4250  *      @sa: new address
4251  *
4252  *      Change the hardware (MAC) address of the device
4253  */
4254 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4255 {
4256         const struct net_device_ops *ops = dev->netdev_ops;
4257         int err;
4258
4259         if (!ops->ndo_set_mac_address)
4260                 return -EOPNOTSUPP;
4261         if (sa->sa_family != dev->type)
4262                 return -EINVAL;
4263         if (!netif_device_present(dev))
4264                 return -ENODEV;
4265         err = ops->ndo_set_mac_address(dev, sa);
4266         if (!err)
4267                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4268         return err;
4269 }
4270 EXPORT_SYMBOL(dev_set_mac_address);
4271
4272 /*
4273  *      Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock)
4274  */
4275 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4276 {
4277         int err;
4278         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4279
4280         if (!dev)
4281                 return -ENODEV;
4282
4283         switch (cmd) {
4284         case SIOCGIFFLAGS:      /* Get interface flags */
4285                 ifr->ifr_flags = (short) dev_get_flags(dev);
4286                 return 0;
4287
4288         case SIOCGIFMETRIC:     /* Get the metric on the interface
4289                                    (currently unused) */
4290                 ifr->ifr_metric = 0;
4291                 return 0;
4292
4293         case SIOCGIFMTU:        /* Get the MTU of a device */
4294                 ifr->ifr_mtu = dev->mtu;
4295                 return 0;
4296
4297         case SIOCGIFHWADDR:
4298                 if (!dev->addr_len)
4299                         memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4300                 else
4301                         memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4302                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4303                 ifr->ifr_hwaddr.sa_family = dev->type;
4304                 return 0;
4305
4306         case SIOCGIFSLAVE:
4307                 err = -EINVAL;
4308                 break;
4309
4310         case SIOCGIFMAP:
4311                 ifr->ifr_map.mem_start = dev->mem_start;
4312                 ifr->ifr_map.mem_end   = dev->mem_end;
4313                 ifr->ifr_map.base_addr = dev->base_addr;
4314                 ifr->ifr_map.irq       = dev->irq;
4315                 ifr->ifr_map.dma       = dev->dma;
4316                 ifr->ifr_map.port      = dev->if_port;
4317                 return 0;
4318
4319         case SIOCGIFINDEX:
4320                 ifr->ifr_ifindex = dev->ifindex;
4321                 return 0;
4322
4323         case SIOCGIFTXQLEN:
4324                 ifr->ifr_qlen = dev->tx_queue_len;
4325                 return 0;
4326
4327         default:
4328                 /* dev_ioctl() should ensure this case
4329                  * is never reached
4330                  */
4331                 WARN_ON(1);
4332                 err = -EINVAL;
4333                 break;
4334
4335         }
4336         return err;
4337 }
4338
4339 /*
4340  *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
4341  */
4342 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4343 {
4344         int err;
4345         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4346         const struct net_device_ops *ops;
4347
4348         if (!dev)
4349                 return -ENODEV;
4350
4351         ops = dev->netdev_ops;
4352
4353         switch (cmd) {
4354         case SIOCSIFFLAGS:      /* Set interface flags */
4355                 return dev_change_flags(dev, ifr->ifr_flags);
4356
4357         case SIOCSIFMETRIC:     /* Set the metric on the interface
4358                                    (currently unused) */
4359                 return -EOPNOTSUPP;
4360
4361         case SIOCSIFMTU:        /* Set the MTU of a device */
4362                 return dev_set_mtu(dev, ifr->ifr_mtu);
4363
4364         case SIOCSIFHWADDR:
4365                 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4366
4367         case SIOCSIFHWBROADCAST:
4368                 if (ifr->ifr_hwaddr.sa_family != dev->type)
4369                         return -EINVAL;
4370                 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4371                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4372                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4373                 return 0;
4374
4375         case SIOCSIFMAP:
4376                 if (ops->ndo_set_config) {
4377                         if (!netif_device_present(dev))
4378                                 return -ENODEV;
4379                         return ops->ndo_set_config(dev, &ifr->ifr_map);
4380                 }
4381                 return -EOPNOTSUPP;
4382
4383         case SIOCADDMULTI:
4384                 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4385                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4386                         return -EINVAL;
4387                 if (!netif_device_present(dev))
4388                         return -ENODEV;
4389                 return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
4390                                   dev->addr_len, 1);
4391
4392         case SIOCDELMULTI:
4393                 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4394                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4395                         return -EINVAL;
4396                 if (!netif_device_present(dev))
4397                         return -ENODEV;
4398                 return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
4399                                      dev->addr_len, 1);
4400
4401         case SIOCSIFTXQLEN:
4402                 if (ifr->ifr_qlen < 0)
4403                         return -EINVAL;
4404                 dev->tx_queue_len = ifr->ifr_qlen;
4405                 return 0;
4406
4407         case SIOCSIFNAME:
4408                 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4409                 return dev_change_name(dev, ifr->ifr_newname);
4410
4411         /*
4412          *      Unknown or private ioctl
4413          */
4414         default:
4415                 if ((cmd >= SIOCDEVPRIVATE &&
4416                     cmd <= SIOCDEVPRIVATE + 15) ||
4417                     cmd == SIOCBONDENSLAVE ||
4418                     cmd == SIOCBONDRELEASE ||
4419                     cmd == SIOCBONDSETHWADDR ||
4420                     cmd == SIOCBONDSLAVEINFOQUERY ||
4421                     cmd == SIOCBONDINFOQUERY ||
4422                     cmd == SIOCBONDCHANGEACTIVE ||
4423                     cmd == SIOCGMIIPHY ||
4424                     cmd == SIOCGMIIREG ||
4425                     cmd == SIOCSMIIREG ||
4426                     cmd == SIOCBRADDIF ||
4427                     cmd == SIOCBRDELIF ||
4428                     cmd == SIOCSHWTSTAMP ||
4429                     cmd == SIOCWANDEV) {
4430                         err = -EOPNOTSUPP;
4431                         if (ops->ndo_do_ioctl) {
4432                                 if (netif_device_present(dev))
4433                                         err = ops->ndo_do_ioctl(dev, ifr, cmd);
4434                                 else
4435                                         err = -ENODEV;
4436                         }
4437                 } else
4438                         err = -EINVAL;
4439
4440         }
4441         return err;
4442 }
4443
4444 /*
4445  *      This function handles all "interface"-type I/O control requests. The actual
4446  *      'doing' part of this is dev_ifsioc above.
4447  */
4448
4449 /**
4450  *      dev_ioctl       -       network device ioctl
4451  *      @net: the applicable net namespace
4452  *      @cmd: command to issue
4453  *      @arg: pointer to a struct ifreq in user space
4454  *
4455  *      Issue ioctl functions to devices. This is normally called by the
4456  *      user space syscall interfaces but can sometimes be useful for
4457  *      other purposes. The return value is the return from the syscall if
4458  *      positive or a negative errno code on error.
4459  */
4460
4461 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4462 {
4463         struct ifreq ifr;
4464         int ret;
4465         char *colon;
4466
4467         /* One special case: SIOCGIFCONF takes ifconf argument
4468            and requires shared lock, because it sleeps writing
4469            to user space.
4470          */
4471
4472         if (cmd == SIOCGIFCONF) {
4473                 rtnl_lock();
4474                 ret = dev_ifconf(net, (char __user *) arg);
4475                 rtnl_unlock();
4476                 return ret;
4477         }
4478         if (cmd == SIOCGIFNAME)
4479                 return dev_ifname(net, (struct ifreq __user *)arg);
4480
4481         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4482                 return -EFAULT;
4483
4484         ifr.ifr_name[IFNAMSIZ-1] = 0;
4485
4486         colon = strchr(ifr.ifr_name, ':');
4487         if (colon)
4488                 *colon = 0;
4489
4490         /*
4491          *      See which interface the caller is talking about.
4492          */
4493
4494         switch (cmd) {
4495         /*
4496          *      These ioctl calls:
4497          *      - can be done by all.
4498          *      - atomic and do not require locking.
4499          *      - return a value
4500          */
4501         case SIOCGIFFLAGS:
4502         case SIOCGIFMETRIC:
4503         case SIOCGIFMTU:
4504         case SIOCGIFHWADDR:
4505         case SIOCGIFSLAVE:
4506         case SIOCGIFMAP:
4507         case SIOCGIFINDEX:
4508         case SIOCGIFTXQLEN:
4509                 dev_load(net, ifr.ifr_name);
4510                 read_lock(&dev_base_lock);
4511                 ret = dev_ifsioc_locked(net, &ifr, cmd);
4512                 read_unlock(&dev_base_lock);
4513                 if (!ret) {
4514                         if (colon)
4515                                 *colon = ':';
4516                         if (copy_to_user(arg, &ifr,
4517                                          sizeof(struct ifreq)))
4518                                 ret = -EFAULT;
4519                 }
4520                 return ret;
4521
4522         case SIOCETHTOOL:
4523                 dev_load(net, ifr.ifr_name);
4524                 rtnl_lock();
4525                 ret = dev_ethtool(net, &ifr);
4526                 rtnl_unlock();
4527                 if (!ret) {
4528                         if (colon)
4529                                 *colon = ':';
4530                         if (copy_to_user(arg, &ifr,
4531                                          sizeof(struct ifreq)))
4532                                 ret = -EFAULT;
4533                 }
4534                 return ret;
4535
4536         /*
4537          *      These ioctl calls:
4538          *      - require superuser power.
4539          *      - require strict serialization.
4540          *      - return a value
4541          */
4542         case SIOCGMIIPHY:
4543         case SIOCGMIIREG:
4544         case SIOCSIFNAME:
4545                 if (!capable(CAP_NET_ADMIN))
4546                         return -EPERM;
4547                 dev_load(net, ifr.ifr_name);
4548                 rtnl_lock();
4549                 ret = dev_ifsioc(net, &ifr, cmd);
4550                 rtnl_unlock();
4551                 if (!ret) {
4552                         if (colon)
4553                                 *colon = ':';
4554                         if (copy_to_user(arg, &ifr,
4555                                          sizeof(struct ifreq)))
4556                                 ret = -EFAULT;
4557                 }
4558                 return ret;
4559
4560         /*
4561          *      These ioctl calls:
4562          *      - require superuser power.
4563          *      - require strict serialization.
4564          *      - do not return a value
4565          */
4566         case SIOCSIFFLAGS:
4567         case SIOCSIFMETRIC:
4568         case SIOCSIFMTU:
4569         case SIOCSIFMAP:
4570         case SIOCSIFHWADDR:
4571         case SIOCSIFSLAVE:
4572         case SIOCADDMULTI:
4573         case SIOCDELMULTI:
4574         case SIOCSIFHWBROADCAST:
4575         case SIOCSIFTXQLEN:
4576         case SIOCSMIIREG:
4577         case SIOCBONDENSLAVE:
4578         case SIOCBONDRELEASE:
4579         case SIOCBONDSETHWADDR:
4580         case SIOCBONDCHANGEACTIVE:
4581         case SIOCBRADDIF:
4582         case SIOCBRDELIF:
4583         case SIOCSHWTSTAMP:
4584                 if (!capable(CAP_NET_ADMIN))
4585                         return -EPERM;
4586                 /* fall through */
4587         case SIOCBONDSLAVEINFOQUERY:
4588         case SIOCBONDINFOQUERY:
4589                 dev_load(net, ifr.ifr_name);
4590                 rtnl_lock();
4591                 ret = dev_ifsioc(net, &ifr, cmd);
4592                 rtnl_unlock();
4593                 return ret;
4594
4595         case SIOCGIFMEM:
4596                 /* Get the per device memory space. We can add this but
4597                  * currently do not support it */
4598         case SIOCSIFMEM:
4599                 /* Set the per device memory buffer space.
4600                  * Not applicable in our case */
4601         case SIOCSIFLINK:
4602                 return -EINVAL;
4603
4604         /*
4605          *      Unknown or private ioctl.
4606          */
4607         default:
4608                 if (cmd == SIOCWANDEV ||
4609                     (cmd >= SIOCDEVPRIVATE &&
4610                      cmd <= SIOCDEVPRIVATE + 15)) {
4611                         dev_load(net, ifr.ifr_name);
4612                         rtnl_lock();
4613                         ret = dev_ifsioc(net, &ifr, cmd);
4614                         rtnl_unlock();
4615                         if (!ret && copy_to_user(arg, &ifr,
4616                                                  sizeof(struct ifreq)))
4617                                 ret = -EFAULT;
4618                         return ret;
4619                 }
4620                 /* Take care of Wireless Extensions */
4621                 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4622                         return wext_handle_ioctl(net, &ifr, cmd, arg);
4623                 return -EINVAL;
4624         }
4625 }
4626
4627
4628 /**
4629  *      dev_new_index   -       allocate an ifindex
4630  *      @net: the applicable net namespace
4631  *
4632  *      Returns a suitable unique value for a new device interface
4633  *      number.  The caller must hold the rtnl semaphore or the
4634  *      dev_base_lock to be sure it remains unique.
4635  */
4636 static int dev_new_index(struct net *net)
4637 {
4638         static int ifindex;
4639         for (;;) {
4640                 if (++ifindex <= 0)
4641                         ifindex = 1;
4642                 if (!__dev_get_by_index(net, ifindex))
4643                         return ifindex;
4644         }
4645 }
4646
4647 /* Delayed registration/unregisteration */
4648 static LIST_HEAD(net_todo_list);
4649
4650 static void net_set_todo(struct net_device *dev)
4651 {
4652         list_add_tail(&dev->todo_list, &net_todo_list);
4653 }
4654
4655 static void rollback_registered(struct net_device *dev)
4656 {
4657         BUG_ON(dev_boot_phase);
4658         ASSERT_RTNL();
4659
4660         /* Some devices call without registering for initialization unwind. */
4661         if (dev->reg_state == NETREG_UNINITIALIZED) {
4662                 printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
4663                                   "was registered\n", dev->name, dev);
4664
4665                 WARN_ON(1);
4666                 return;
4667         }
4668
4669         BUG_ON(dev->reg_state != NETREG_REGISTERED);
4670
4671         /* If device is running, close it first. */
4672         dev_close(dev);
4673
4674         /* And unlink it from device chain. */
4675         unlist_netdevice(dev);
4676
4677         dev->reg_state = NETREG_UNREGISTERING;
4678
4679         synchronize_net();
4680
4681         /* Shutdown queueing discipline. */
4682         dev_shutdown(dev);
4683
4684
4685         /* Notify protocols, that we are about to destroy
4686            this device. They should clean all the things.
4687         */
4688         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4689
4690         /*
4691          *      Flush the unicast and multicast chains
4692          */
4693         dev_unicast_flush(dev);
4694         dev_addr_discard(dev);
4695
4696         if (dev->netdev_ops->ndo_uninit)
4697                 dev->netdev_ops->ndo_uninit(dev);
4698
4699         /* Notifier chain MUST detach us from master device. */
4700         WARN_ON(dev->master);
4701
4702         /* Remove entries from kobject tree */
4703         netdev_unregister_kobject(dev);
4704
4705         synchronize_net();
4706
4707         dev_put(dev);
4708 }
4709
4710 static void __netdev_init_queue_locks_one(struct net_device *dev,
4711                                           struct netdev_queue *dev_queue,
4712                                           void *_unused)
4713 {
4714         spin_lock_init(&dev_queue->_xmit_lock);
4715         netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
4716         dev_queue->xmit_lock_owner = -1;
4717 }
4718
4719 static void netdev_init_queue_locks(struct net_device *dev)
4720 {
4721         netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
4722         __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
4723 }
4724
4725 unsigned long netdev_fix_features(unsigned long features, const char *name)
4726 {
4727         /* Fix illegal SG+CSUM combinations. */
4728         if ((features & NETIF_F_SG) &&
4729             !(features & NETIF_F_ALL_CSUM)) {
4730                 if (name)
4731                         printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
4732                                "checksum feature.\n", name);
4733                 features &= ~NETIF_F_SG;
4734         }
4735
4736         /* TSO requires that SG is present as well. */
4737         if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
4738                 if (name)
4739                         printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
4740                                "SG feature.\n", name);
4741                 features &= ~NETIF_F_TSO;
4742         }
4743
4744         if (features & NETIF_F_UFO) {
4745                 if (!(features & NETIF_F_GEN_CSUM)) {
4746                         if (name)
4747                                 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4748                                        "since no NETIF_F_HW_CSUM feature.\n",
4749                                        name);
4750                         features &= ~NETIF_F_UFO;
4751                 }
4752
4753                 if (!(features & NETIF_F_SG)) {
4754                         if (name)
4755                                 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4756                                        "since no NETIF_F_SG feature.\n", name);
4757                         features &= ~NETIF_F_UFO;
4758                 }
4759         }
4760
4761         return features;
4762 }
4763 EXPORT_SYMBOL(netdev_fix_features);
4764
4765 /**
4766  *      register_netdevice      - register a network device
4767  *      @dev: device to register
4768  *
4769  *      Take a completed network device structure and add it to the kernel
4770  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4771  *      chain. 0 is returned on success. A negative errno code is returned
4772  *      on a failure to set up the device, or if the name is a duplicate.
4773  *
4774  *      Callers must hold the rtnl semaphore. You may want
4775  *      register_netdev() instead of this.
4776  *
4777  *      BUGS:
4778  *      The locking appears insufficient to guarantee two parallel registers
4779  *      will not get the same name.
4780  */
4781
4782 int register_netdevice(struct net_device *dev)
4783 {
4784         struct hlist_head *head;
4785         struct hlist_node *p;
4786         int ret;
4787         struct net *net = dev_net(dev);
4788
4789         BUG_ON(dev_boot_phase);
4790         ASSERT_RTNL();
4791
4792         might_sleep();
4793
4794         /* When net_device's are persistent, this will be fatal. */
4795         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
4796         BUG_ON(!net);
4797
4798         spin_lock_init(&dev->addr_list_lock);
4799         netdev_set_addr_lockdep_class(dev);
4800         netdev_init_queue_locks(dev);
4801
4802         dev->iflink = -1;
4803
4804         /* Init, if this function is available */
4805         if (dev->netdev_ops->ndo_init) {
4806                 ret = dev->netdev_ops->ndo_init(dev);
4807                 if (ret) {
4808                         if (ret > 0)
4809                                 ret = -EIO;
4810                         goto out;
4811                 }
4812         }
4813
4814         if (!dev_valid_name(dev->name)) {
4815                 ret = -EINVAL;
4816                 goto err_uninit;
4817         }
4818
4819         dev->ifindex = dev_new_index(net);
4820         if (dev->iflink == -1)
4821                 dev->iflink = dev->ifindex;
4822
4823         /* Check for existence of name */
4824         head = dev_name_hash(net, dev->name);
4825         hlist_for_each(p, head) {
4826                 struct net_device *d
4827                         = hlist_entry(p, struct net_device, name_hlist);
4828                 if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
4829                         ret = -EEXIST;
4830                         goto err_uninit;
4831                 }
4832         }
4833
4834         /* Fix illegal checksum combinations */
4835         if ((dev->features & NETIF_F_HW_CSUM) &&
4836             (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4837                 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
4838                        dev->name);
4839                 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
4840         }
4841
4842         if ((dev->features & NETIF_F_NO_CSUM) &&
4843             (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4844                 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
4845                        dev->name);
4846                 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
4847         }
4848
4849         dev->features = netdev_fix_features(dev->features, dev->name);
4850
4851         /* Enable software GSO if SG is supported. */
4852         if (dev->features & NETIF_F_SG)
4853                 dev->features |= NETIF_F_GSO;
4854
4855         netdev_initialize_kobject(dev);
4856         ret = netdev_register_kobject(dev);
4857         if (ret)
4858                 goto err_uninit;
4859         dev->reg_state = NETREG_REGISTERED;
4860
4861         /*
4862          *      Default initial state at registry is that the
4863          *      device is present.
4864          */
4865
4866         set_bit(__LINK_STATE_PRESENT, &dev->state);
4867
4868         dev_init_scheduler(dev);
4869         dev_hold(dev);
4870         list_netdevice(dev);
4871
4872         /* Notify protocols, that a new device appeared. */
4873         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
4874         ret = notifier_to_errno(ret);
4875         if (ret) {
4876                 rollback_registered(dev);
4877                 dev->reg_state = NETREG_UNREGISTERED;
4878         }
4879         /*
4880          *      Prevent userspace races by waiting until the network
4881          *      device is fully setup before sending notifications.
4882          */
4883         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
4884
4885 out:
4886         return ret;
4887
4888 err_uninit:
4889         if (dev->netdev_ops->ndo_uninit)
4890                 dev->netdev_ops->ndo_uninit(dev);
4891         goto out;
4892 }
4893 EXPORT_SYMBOL(register_netdevice);
4894
4895 /**
4896  *      init_dummy_netdev       - init a dummy network device for NAPI
4897  *      @dev: device to init
4898  *
4899  *      This takes a network device structure and initialize the minimum
4900  *      amount of fields so it can be used to schedule NAPI polls without
4901  *      registering a full blown interface. This is to be used by drivers
4902  *      that need to tie several hardware interfaces to a single NAPI
4903  *      poll scheduler due to HW limitations.
4904  */
4905 int init_dummy_netdev(struct net_device *dev)
4906 {
4907         /* Clear everything. Note we don't initialize spinlocks
4908          * are they aren't supposed to be taken by any of the
4909          * NAPI code and this dummy netdev is supposed to be
4910          * only ever used for NAPI polls
4911          */
4912         memset(dev, 0, sizeof(struct net_device));
4913
4914         /* make sure we BUG if trying to hit standard
4915          * register/unregister code path
4916          */
4917         dev->reg_state = NETREG_DUMMY;
4918
4919         /* initialize the ref count */
4920         atomic_set(&dev->refcnt, 1);
4921
4922         /* NAPI wants this */
4923         INIT_LIST_HEAD(&dev->napi_list);
4924
4925         /* a dummy interface is started by default */
4926         set_bit(__LINK_STATE_PRESENT, &dev->state);
4927         set_bit(__LINK_STATE_START, &dev->state);
4928
4929         return 0;
4930 }
4931 EXPORT_SYMBOL_GPL(init_dummy_netdev);
4932
4933
4934 /**
4935  *      register_netdev - register a network device
4936  *      @dev: device to register
4937  *
4938  *      Take a completed network device structure and add it to the kernel
4939  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4940  *      chain. 0 is returned on success. A negative errno code is returned
4941  *      on a failure to set up the device, or if the name is a duplicate.
4942  *
4943  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
4944  *      and expands the device name if you passed a format string to
4945  *      alloc_netdev.
4946  */
4947 int register_netdev(struct net_device *dev)
4948 {
4949         int err;
4950
4951         rtnl_lock();
4952
4953         /*
4954          * If the name is a format string the caller wants us to do a
4955          * name allocation.
4956          */
4957         if (strchr(dev->name, '%')) {
4958                 err = dev_alloc_name(dev, dev->name);
4959                 if (err < 0)
4960                         goto out;
4961         }
4962
4963         err = register_netdevice(dev);
4964 out:
4965         rtnl_unlock();
4966         return err;
4967 }
4968 EXPORT_SYMBOL(register_netdev);
4969
4970 /*
4971  * netdev_wait_allrefs - wait until all references are gone.
4972  *
4973  * This is called when unregistering network devices.
4974  *
4975  * Any protocol or device that holds a reference should register
4976  * for netdevice notification, and cleanup and put back the
4977  * reference if they receive an UNREGISTER event.
4978  * We can get stuck here if buggy protocols don't correctly
4979  * call dev_put.
4980  */
4981 static void netdev_wait_allrefs(struct net_device *dev)
4982 {
4983         unsigned long rebroadcast_time, warning_time;
4984
4985         rebroadcast_time = warning_time = jiffies;
4986         while (atomic_read(&dev->refcnt) != 0) {
4987                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
4988                         rtnl_lock();
4989
4990                         /* Rebroadcast unregister notification */
4991                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4992
4993                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
4994                                      &dev->state)) {
4995                                 /* We must not have linkwatch events
4996                                  * pending on unregister. If this
4997                                  * happens, we simply run the queue
4998                                  * unscheduled, resulting in a noop
4999                                  * for this device.
5000                                  */
5001                                 linkwatch_run_queue();
5002                         }
5003
5004                         __rtnl_unlock();
5005
5006                         rebroadcast_time = jiffies;
5007                 }
5008
5009                 msleep(250);
5010
5011                 if (time_after(jiffies, warning_time + 10 * HZ)) {
5012                         printk(KERN_EMERG "unregister_netdevice: "
5013                                "waiting for %s to become free. Usage "
5014                                "count = %d\n",
5015                                dev->name, atomic_read(&dev->refcnt));
5016                         warning_time = jiffies;
5017                 }
5018         }
5019 }
5020
5021 /* The sequence is:
5022  *
5023  *      rtnl_lock();
5024  *      ...
5025  *      register_netdevice(x1);
5026  *      register_netdevice(x2);
5027  *      ...
5028  *      unregister_netdevice(y1);
5029  *      unregister_netdevice(y2);
5030  *      ...
5031  *      rtnl_unlock();
5032  *      free_netdev(y1);
5033  *      free_netdev(y2);
5034  *
5035  * We are invoked by rtnl_unlock().
5036  * This allows us to deal with problems:
5037  * 1) We can delete sysfs objects which invoke hotplug
5038  *    without deadlocking with linkwatch via keventd.
5039  * 2) Since we run with the RTNL semaphore not held, we can sleep
5040  *    safely in order to wait for the netdev refcnt to drop to zero.
5041  *
5042  * We must not return until all unregister events added during
5043  * the interval the lock was held have been completed.
5044  */
5045 void netdev_run_todo(void)
5046 {
5047         struct list_head list;
5048
5049         /* Snapshot list, allow later requests */
5050         list_replace_init(&net_todo_list, &list);
5051
5052         __rtnl_unlock();
5053
5054         while (!list_empty(&list)) {
5055                 struct net_device *dev
5056                         = list_entry(list.next, struct net_device, todo_list);
5057                 list_del(&dev->todo_list);
5058
5059                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5060                         printk(KERN_ERR "network todo '%s' but state %d\n",
5061                                dev->name, dev->reg_state);
5062                         dump_stack();
5063                         continue;
5064                 }
5065
5066                 dev->reg_state = NETREG_UNREGISTERED;
5067
5068                 on_each_cpu(flush_backlog, dev, 1);
5069
5070                 netdev_wait_allrefs(dev);
5071
5072                 /* paranoia */
5073                 BUG_ON(atomic_read(&dev->refcnt));
5074                 WARN_ON(dev->ip_ptr);
5075                 WARN_ON(dev->ip6_ptr);
5076                 WARN_ON(dev->dn_ptr);
5077
5078                 if (dev->destructor)
5079                         dev->destructor(dev);
5080
5081                 /* Free network device */
5082                 kobject_put(&dev->dev.kobj);
5083         }
5084 }
5085
5086 /**
5087  *      dev_get_stats   - get network device statistics
5088  *      @dev: device to get statistics from
5089  *
5090  *      Get network statistics from device. The device driver may provide
5091  *      its own method by setting dev->netdev_ops->get_stats; otherwise
5092  *      the internal statistics structure is used.
5093  */
5094 const struct net_device_stats *dev_get_stats(struct net_device *dev)
5095 {
5096         const struct net_device_ops *ops = dev->netdev_ops;
5097
5098         if (ops->ndo_get_stats)
5099                 return ops->ndo_get_stats(dev);
5100         else {
5101                 unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
5102                 struct net_device_stats *stats = &dev->stats;
5103                 unsigned int i;
5104                 struct netdev_queue *txq;
5105
5106                 for (i = 0; i < dev->num_tx_queues; i++) {
5107                         txq = netdev_get_tx_queue(dev, i);
5108                         tx_bytes   += txq->tx_bytes;
5109                         tx_packets += txq->tx_packets;
5110                         tx_dropped += txq->tx_dropped;
5111                 }
5112                 if (tx_bytes || tx_packets || tx_dropped) {
5113                         stats->tx_bytes   = tx_bytes;
5114                         stats->tx_packets = tx_packets;
5115                         stats->tx_dropped = tx_dropped;
5116                 }
5117                 return stats;
5118         }
5119 }
5120 EXPORT_SYMBOL(dev_get_stats);
5121
5122 static void netdev_init_one_queue(struct net_device *dev,
5123                                   struct netdev_queue *queue,
5124                                   void *_unused)
5125 {
5126         queue->dev = dev;
5127 }
5128
5129 static void netdev_init_queues(struct net_device *dev)
5130 {
5131         netdev_init_one_queue(dev, &dev->rx_queue, NULL);
5132         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5133         spin_lock_init(&dev->tx_global_lock);
5134 }
5135
5136 /**
5137  *      alloc_netdev_mq - allocate network device
5138  *      @sizeof_priv:   size of private data to allocate space for
5139  *      @name:          device name format string
5140  *      @setup:         callback to initialize device
5141  *      @queue_count:   the number of subqueues to allocate
5142  *
5143  *      Allocates a struct net_device with private data area for driver use
5144  *      and performs basic initialization.  Also allocates subquue structs
5145  *      for each queue on the device at the end of the netdevice.
5146  */
5147 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5148                 void (*setup)(struct net_device *), unsigned int queue_count)
5149 {
5150         struct netdev_queue *tx;
5151         struct net_device *dev;
5152         size_t alloc_size;
5153         struct net_device *p;
5154
5155         BUG_ON(strlen(name) >= sizeof(dev->name));
5156
5157         alloc_size = sizeof(struct net_device);
5158         if (sizeof_priv) {
5159                 /* ensure 32-byte alignment of private area */
5160                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5161                 alloc_size += sizeof_priv;
5162         }
5163         /* ensure 32-byte alignment of whole construct */
5164         alloc_size += NETDEV_ALIGN - 1;
5165
5166         p = kzalloc(alloc_size, GFP_KERNEL);
5167         if (!p) {
5168                 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5169                 return NULL;
5170         }
5171
5172         tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
5173         if (!tx) {
5174                 printk(KERN_ERR "alloc_netdev: Unable to allocate "
5175                        "tx qdiscs.\n");
5176                 goto free_p;
5177         }
5178
5179         dev = PTR_ALIGN(p, NETDEV_ALIGN);
5180         dev->padded = (char *)dev - (char *)p;
5181
5182         if (dev_addr_init(dev))
5183                 goto free_tx;
5184
5185         dev_unicast_init(dev);
5186
5187         dev_net_set(dev, &init_net);
5188
5189         dev->_tx = tx;
5190         dev->num_tx_queues = queue_count;
5191         dev->real_num_tx_queues = queue_count;
5192
5193         dev->gso_max_size = GSO_MAX_SIZE;
5194
5195         netdev_init_queues(dev);
5196
5197         INIT_LIST_HEAD(&dev->napi_list);
5198         dev->priv_flags = IFF_XMIT_DST_RELEASE;
5199         setup(dev);
5200         strcpy(dev->name, name);
5201         return dev;
5202
5203 free_tx:
5204         kfree(tx);
5205
5206 free_p:
5207         kfree(p);
5208         return NULL;
5209 }
5210 EXPORT_SYMBOL(alloc_netdev_mq);
5211
5212 /**
5213  *      free_netdev - free network device
5214  *      @dev: device
5215  *
5216  *      This function does the last stage of destroying an allocated device
5217  *      interface. The reference to the device object is released.
5218  *      If this is the last reference then it will be freed.
5219  */
5220 void free_netdev(struct net_device *dev)
5221 {
5222         struct napi_struct *p, *n;
5223
5224         release_net(dev_net(dev));
5225
5226         kfree(dev->_tx);
5227
5228         /* Flush device addresses */
5229         dev_addr_flush(dev);
5230
5231         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5232                 netif_napi_del(p);
5233
5234         /*  Compatibility with error handling in drivers */
5235         if (dev->reg_state == NETREG_UNINITIALIZED) {
5236                 kfree((char *)dev - dev->padded);
5237                 return;
5238         }
5239
5240         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5241         dev->reg_state = NETREG_RELEASED;
5242
5243         /* will free via device release */
5244         put_device(&dev->dev);
5245 }
5246 EXPORT_SYMBOL(free_netdev);
5247
5248 /**
5249  *      synchronize_net -  Synchronize with packet receive processing
5250  *
5251  *      Wait for packets currently being received to be done.
5252  *      Does not block later packets from starting.
5253  */
5254 void synchronize_net(void)
5255 {
5256         might_sleep();
5257         synchronize_rcu();
5258 }
5259 EXPORT_SYMBOL(synchronize_net);
5260
5261 /**
5262  *      unregister_netdevice - remove device from the kernel
5263  *      @dev: device
5264  *
5265  *      This function shuts down a device interface and removes it
5266  *      from the kernel tables.
5267  *
5268  *      Callers must hold the rtnl semaphore.  You may want
5269  *      unregister_netdev() instead of this.
5270  */
5271
5272 void unregister_netdevice(struct net_device *dev)
5273 {
5274         ASSERT_RTNL();
5275
5276         rollback_registered(dev);
5277         /* Finish processing unregister after unlock */
5278         net_set_todo(dev);
5279 }
5280 EXPORT_SYMBOL(unregister_netdevice);
5281
5282 /**
5283  *      unregister_netdev - remove device from the kernel
5284  *      @dev: device
5285  *
5286  *      This function shuts down a device interface and removes it
5287  *      from the kernel tables.
5288  *
5289  *      This is just a wrapper for unregister_netdevice that takes
5290  *      the rtnl semaphore.  In general you want to use this and not
5291  *      unregister_netdevice.
5292  */
5293 void unregister_netdev(struct net_device *dev)
5294 {
5295         rtnl_lock();
5296         unregister_netdevice(dev);
5297         rtnl_unlock();
5298 }
5299 EXPORT_SYMBOL(unregister_netdev);
5300
5301 /**
5302  *      dev_change_net_namespace - move device to different nethost namespace
5303  *      @dev: device
5304  *      @net: network namespace
5305  *      @pat: If not NULL name pattern to try if the current device name
5306  *            is already taken in the destination network namespace.
5307  *
5308  *      This function shuts down a device interface and moves it
5309  *      to a new network namespace. On success 0 is returned, on
5310  *      a failure a netagive errno code is returned.
5311  *
5312  *      Callers must hold the rtnl semaphore.
5313  */
5314
5315 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5316 {
5317         char buf[IFNAMSIZ];
5318         const char *destname;
5319         int err;
5320
5321         ASSERT_RTNL();
5322
5323         /* Don't allow namespace local devices to be moved. */
5324         err = -EINVAL;
5325         if (dev->features & NETIF_F_NETNS_LOCAL)
5326                 goto out;
5327
5328 #ifdef CONFIG_SYSFS
5329         /* Don't allow real devices to be moved when sysfs
5330          * is enabled.
5331          */
5332         err = -EINVAL;
5333         if (dev->dev.parent)
5334                 goto out;
5335 #endif
5336
5337         /* Ensure the device has been registrered */
5338         err = -EINVAL;
5339         if (dev->reg_state != NETREG_REGISTERED)
5340                 goto out;
5341
5342         /* Get out if there is nothing todo */
5343         err = 0;
5344         if (net_eq(dev_net(dev), net))
5345                 goto out;
5346
5347         /* Pick the destination device name, and ensure
5348          * we can use it in the destination network namespace.
5349          */
5350         err = -EEXIST;
5351         destname = dev->name;
5352         if (__dev_get_by_name(net, destname)) {
5353                 /* We get here if we can't use the current device name */
5354                 if (!pat)
5355                         goto out;
5356                 if (!dev_valid_name(pat))
5357                         goto out;
5358                 if (strchr(pat, '%')) {
5359                         if (__dev_alloc_name(net, pat, buf) < 0)
5360                                 goto out;
5361                         destname = buf;
5362                 } else
5363                         destname = pat;
5364                 if (__dev_get_by_name(net, destname))
5365                         goto out;
5366         }
5367
5368         /*
5369          * And now a mini version of register_netdevice unregister_netdevice.
5370          */
5371
5372         /* If device is running close it first. */
5373         dev_close(dev);
5374
5375         /* And unlink it from device chain */
5376         err = -ENODEV;
5377         unlist_netdevice(dev);
5378
5379         synchronize_net();
5380
5381         /* Shutdown queueing discipline. */
5382         dev_shutdown(dev);
5383
5384         /* Notify protocols, that we are about to destroy
5385            this device. They should clean all the things.
5386         */
5387         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5388
5389         /*
5390          *      Flush the unicast and multicast chains
5391          */
5392         dev_unicast_flush(dev);
5393         dev_addr_discard(dev);
5394
5395         netdev_unregister_kobject(dev);
5396
5397         /* Actually switch the network namespace */
5398         dev_net_set(dev, net);
5399
5400         /* Assign the new device name */
5401         if (destname != dev->name)
5402                 strcpy(dev->name, destname);
5403
5404         /* If there is an ifindex conflict assign a new one */
5405         if (__dev_get_by_index(net, dev->ifindex)) {
5406                 int iflink = (dev->iflink == dev->ifindex);
5407                 dev->ifindex = dev_new_index(net);
5408                 if (iflink)
5409                         dev->iflink = dev->ifindex;
5410         }
5411
5412         /* Fixup kobjects */
5413         err = netdev_register_kobject(dev);
5414         WARN_ON(err);
5415
5416         /* Add the device back in the hashes */
5417         list_netdevice(dev);
5418
5419         /* Notify protocols, that a new device appeared. */
5420         call_netdevice_notifiers(NETDEV_REGISTER, dev);
5421
5422         /*
5423          *      Prevent userspace races by waiting until the network
5424          *      device is fully setup before sending notifications.
5425          */
5426         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5427
5428         synchronize_net();
5429         err = 0;
5430 out:
5431         return err;
5432 }
5433 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
5434
5435 static int dev_cpu_callback(struct notifier_block *nfb,
5436                             unsigned long action,
5437                             void *ocpu)
5438 {
5439         struct sk_buff **list_skb;
5440         struct Qdisc **list_net;
5441         struct sk_buff *skb;
5442         unsigned int cpu, oldcpu = (unsigned long)ocpu;
5443         struct softnet_data *sd, *oldsd;
5444
5445         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
5446                 return NOTIFY_OK;
5447
5448         local_irq_disable();
5449         cpu = smp_processor_id();
5450         sd = &per_cpu(softnet_data, cpu);
5451         oldsd = &per_cpu(softnet_data, oldcpu);
5452
5453         /* Find end of our completion_queue. */
5454         list_skb = &sd->completion_queue;
5455         while (*list_skb)
5456                 list_skb = &(*list_skb)->next;
5457         /* Append completion queue from offline CPU. */
5458         *list_skb = oldsd->completion_queue;
5459         oldsd->completion_queue = NULL;
5460
5461         /* Find end of our output_queue. */
5462         list_net = &sd->output_queue;
5463         while (*list_net)
5464                 list_net = &(*list_net)->next_sched;
5465         /* Append output queue from offline CPU. */
5466         *list_net = oldsd->output_queue;
5467         oldsd->output_queue = NULL;
5468
5469         raise_softirq_irqoff(NET_TX_SOFTIRQ);
5470         local_irq_enable();
5471
5472         /* Process offline CPU's input_pkt_queue */
5473         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
5474                 netif_rx(skb);
5475
5476         return NOTIFY_OK;
5477 }
5478
5479
5480 /**
5481  *      netdev_increment_features - increment feature set by one
5482  *      @all: current feature set
5483  *      @one: new feature set
5484  *      @mask: mask feature set
5485  *
5486  *      Computes a new feature set after adding a device with feature set
5487  *      @one to the master device with current feature set @all.  Will not
5488  *      enable anything that is off in @mask. Returns the new feature set.
5489  */
5490 unsigned long netdev_increment_features(unsigned long all, unsigned long one,
5491                                         unsigned long mask)
5492 {
5493         /* If device needs checksumming, downgrade to it. */
5494         if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
5495                 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
5496         else if (mask & NETIF_F_ALL_CSUM) {
5497                 /* If one device supports v4/v6 checksumming, set for all. */
5498                 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
5499                     !(all & NETIF_F_GEN_CSUM)) {
5500                         all &= ~NETIF_F_ALL_CSUM;
5501                         all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
5502                 }
5503
5504                 /* If one device supports hw checksumming, set for all. */
5505                 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
5506                         all &= ~NETIF_F_ALL_CSUM;
5507                         all |= NETIF_F_HW_CSUM;
5508                 }
5509         }
5510
5511         one |= NETIF_F_ALL_CSUM;
5512
5513         one |= all & NETIF_F_ONE_FOR_ALL;
5514         all &= one | NETIF_F_LLTX | NETIF_F_GSO;
5515         all |= one & mask & NETIF_F_ONE_FOR_ALL;
5516
5517         return all;
5518 }
5519 EXPORT_SYMBOL(netdev_increment_features);
5520
5521 static struct hlist_head *netdev_create_hash(void)
5522 {
5523         int i;
5524         struct hlist_head *hash;
5525
5526         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
5527         if (hash != NULL)
5528                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
5529                         INIT_HLIST_HEAD(&hash[i]);
5530
5531         return hash;
5532 }
5533
5534 /* Initialize per network namespace state */
5535 static int __net_init netdev_init(struct net *net)
5536 {
5537         INIT_LIST_HEAD(&net->dev_base_head);
5538
5539         net->dev_name_head = netdev_create_hash();
5540         if (net->dev_name_head == NULL)
5541                 goto err_name;
5542
5543         net->dev_index_head = netdev_create_hash();
5544         if (net->dev_index_head == NULL)
5545                 goto err_idx;
5546
5547         return 0;
5548
5549 err_idx:
5550         kfree(net->dev_name_head);
5551 err_name:
5552         return -ENOMEM;
5553 }
5554
5555 /**
5556  *      netdev_drivername - network driver for the device
5557  *      @dev: network device
5558  *      @buffer: buffer for resulting name
5559  *      @len: size of buffer
5560  *
5561  *      Determine network driver for device.
5562  */
5563 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
5564 {
5565         const struct device_driver *driver;
5566         const struct device *parent;
5567
5568         if (len <= 0 || !buffer)
5569                 return buffer;
5570         buffer[0] = 0;
5571
5572         parent = dev->dev.parent;
5573
5574         if (!parent)
5575                 return buffer;
5576
5577         driver = parent->driver;
5578         if (driver && driver->name)
5579                 strlcpy(buffer, driver->name, len);
5580         return buffer;
5581 }
5582
5583 static void __net_exit netdev_exit(struct net *net)
5584 {
5585         kfree(net->dev_name_head);
5586         kfree(net->dev_index_head);
5587 }
5588
5589 static struct pernet_operations __net_initdata netdev_net_ops = {
5590         .init = netdev_init,
5591         .exit = netdev_exit,
5592 };
5593
5594 static void __net_exit default_device_exit(struct net *net)
5595 {
5596         struct net_device *dev;
5597         /*
5598          * Push all migratable of the network devices back to the
5599          * initial network namespace
5600          */
5601         rtnl_lock();
5602 restart:
5603         for_each_netdev(net, dev) {
5604                 int err;
5605                 char fb_name[IFNAMSIZ];
5606
5607                 /* Ignore unmoveable devices (i.e. loopback) */
5608                 if (dev->features & NETIF_F_NETNS_LOCAL)
5609                         continue;
5610
5611                 /* Delete virtual devices */
5612                 if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) {
5613                         dev->rtnl_link_ops->dellink(dev);
5614                         goto restart;
5615                 }
5616
5617                 /* Push remaing network devices to init_net */
5618                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
5619                 err = dev_change_net_namespace(dev, &init_net, fb_name);
5620                 if (err) {
5621                         printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
5622                                 __func__, dev->name, err);
5623                         BUG();
5624                 }
5625                 goto restart;
5626         }
5627         rtnl_unlock();
5628 }
5629
5630 static struct pernet_operations __net_initdata default_device_ops = {
5631         .exit = default_device_exit,
5632 };
5633
5634 /*
5635  *      Initialize the DEV module. At boot time this walks the device list and
5636  *      unhooks any devices that fail to initialise (normally hardware not
5637  *      present) and leaves us with a valid list of present and active devices.
5638  *
5639  */
5640
5641 /*
5642  *       This is called single threaded during boot, so no need
5643  *       to take the rtnl semaphore.
5644  */
5645 static int __init net_dev_init(void)
5646 {
5647         int i, rc = -ENOMEM;
5648
5649         BUG_ON(!dev_boot_phase);
5650
5651         if (dev_proc_init())
5652                 goto out;
5653
5654         if (netdev_kobject_init())
5655                 goto out;
5656
5657         INIT_LIST_HEAD(&ptype_all);
5658         for (i = 0; i < PTYPE_HASH_SIZE; i++)
5659                 INIT_LIST_HEAD(&ptype_base[i]);
5660
5661         if (register_pernet_subsys(&netdev_net_ops))
5662                 goto out;
5663
5664         /*
5665          *      Initialise the packet receive queues.
5666          */
5667
5668         for_each_possible_cpu(i) {
5669                 struct softnet_data *queue;
5670
5671                 queue = &per_cpu(softnet_data, i);
5672                 skb_queue_head_init(&queue->input_pkt_queue);
5673                 queue->completion_queue = NULL;
5674                 INIT_LIST_HEAD(&queue->poll_list);
5675
5676                 queue->backlog.poll = process_backlog;
5677                 queue->backlog.weight = weight_p;
5678                 queue->backlog.gro_list = NULL;
5679                 queue->backlog.gro_count = 0;
5680         }
5681
5682         dev_boot_phase = 0;
5683
5684         /* The loopback device is special if any other network devices
5685          * is present in a network namespace the loopback device must
5686          * be present. Since we now dynamically allocate and free the
5687          * loopback device ensure this invariant is maintained by
5688          * keeping the loopback device as the first device on the
5689          * list of network devices.  Ensuring the loopback devices
5690          * is the first device that appears and the last network device
5691          * that disappears.
5692          */
5693         if (register_pernet_device(&loopback_net_ops))
5694                 goto out;
5695
5696         if (register_pernet_device(&default_device_ops))
5697                 goto out;
5698
5699         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
5700         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
5701
5702         hotcpu_notifier(dev_cpu_callback, 0);
5703         dst_init();
5704         dev_mcast_init();
5705         rc = 0;
5706 out:
5707         return rc;
5708 }
5709
5710 subsys_initcall(net_dev_init);
5711
5712 static int __init initialize_hashrnd(void)
5713 {
5714         get_random_bytes(&skb_tx_hashrnd, sizeof(skb_tx_hashrnd));
5715         return 0;
5716 }
5717
5718 late_initcall_sync(initialize_hashrnd);
5719