net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <asm/system.h>
  77 #include <linux/bitops.h>
  78 #include <linux/capability.h>
  79 #include <linux/cpu.h>
  80 #include <linux/types.h>
  81 #include <linux/kernel.h>
  82 #include <linux/sched.h>
  83 #include <linux/mutex.h>
  84 #include <linux/string.h>
  85 #include <linux/mm.h>
  86 #include <linux/socket.h>
  87 #include <linux/sockios.h>
  88 #include <linux/errno.h>
  89 #include <linux/interrupt.h>
  90 #include <linux/if_ether.h>
  91 #include <linux/netdevice.h>
  92 #include <linux/etherdevice.h>
  93 #include <linux/ethtool.h>
  94 #include <linux/notifier.h>
  95 #include <linux/skbuff.h>
  96 #include <net/net_namespace.h>
  97 #include <net/sock.h>
  98 #include <linux/rtnetlink.h>
  99 #include <linux/proc_fs.h>
 100 #include <linux/seq_file.h>
 101 #include <linux/stat.h>
 102 #include <linux/if_bridge.h>
 103 #include <linux/if_macvlan.h>
 104 #include <net/dst.h>
 105 #include <net/pkt_sched.h>
 106 #include <net/checksum.h>
 107 #include <linux/highmem.h>
 108 #include <linux/init.h>
 109 #include <linux/kmod.h>
 110 #include <linux/module.h>
 111 #include <linux/netpoll.h>
 112 #include <linux/rcupdate.h>
 113 #include <linux/delay.h>
 114 #include <net/wext.h>
 115 #include <net/iw_handler.h>
 116 #include <asm/current.h>
 117 #include <linux/audit.h>
 118 #include <linux/dmaengine.h>
 119 #include <linux/err.h>
 120 #include <linux/ctype.h>
 121 #include <linux/if_arp.h>
 122 #include <linux/if_vlan.h>
 123 #include <linux/ip.h>
 124 #include <net/ip.h>
 125 #include <linux/ipv6.h>
 126 #include <linux/in.h>
 127 #include <linux/jhash.h>
 128 #include <linux/random.h>
 129
 130 #include "net-sysfs.h"
 131
 132 /* Instead of increasing this, you should create a hash table. */
 133 #define MAX_GRO_SKBS 8
 134
 135 /* This should be increased if a protocol with a bigger head is added. */
 136 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 137
 138 enum {
 139         GRO_MERGED,
 140         GRO_MERGED_FREE,
 141         GRO_HELD,
 142         GRO_NORMAL,
 143         GRO_DROP,
 144 };
 145
 146 /*
 147  *      The list of packet types we will receive (as opposed to discard)
 148  *      and the routines to invoke.
 149  *
 150  *      Why 16. Because with 16 the only overlap we get on a hash of the
 151  *      low nibble of the protocol value is RARP/SNAP/X.25.
 152  *
 153  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 154  *             sure which should go first, but I bet it won't make much
 155  *             difference if we are running VLANs.  The good news is that
 156  *             this protocol won't be in the list unless compiled in, so
 157  *             the average user (w/out VLANs) will not be adversely affected.
 158  *             --BLG
 159  *
 160  *              0800    IP
 161  *              8100    802.1Q VLAN
 162  *              0001    802.3
 163  *              0002    AX.25
 164  *              0004    802.2
 165  *              8035    RARP
 166  *              0005    SNAP
 167  *              0805    X.25
 168  *              0806    ARP
 169  *              8137    IPX
 170  *              0009    Localtalk
 171  *              86DD    IPv6
 172  */
 173
 174 #define PTYPE_HASH_SIZE (16)
 175 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
 176
 177 static DEFINE_SPINLOCK(ptype_lock);
 178 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 179 static struct list_head ptype_all __read_mostly;        /* Taps */
 180
 181 /*
 182  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 183  * semaphore.
 184  *
 185  * Pure readers hold dev_base_lock for reading.
 186  *
 187  * Writers must hold the rtnl semaphore while they loop through the
 188  * dev_base_head list, and hold dev_base_lock for writing when they do the
 189  * actual updates.  This allows pure readers to access the list even
 190  * while a writer is preparing to update it.
 191  *
 192  * To put it another way, dev_base_lock is held for writing only to
 193  * protect against pure readers; the rtnl semaphore provides the
 194  * protection against other writers.
 195  *
 196  * See, for example usages, register_netdevice() and
 197  * unregister_netdevice(), which must be called with the rtnl
 198  * semaphore held.
 199  */
 200 DEFINE_RWLOCK(dev_base_lock);
 201
 202 EXPORT_SYMBOL(dev_base_lock);
 203
 204 #define NETDEV_HASHBITS 8
 205 #define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)
 206
 207 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 208 {
 209         unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 210         return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)];
 211 }
 212
 213 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 214 {
 215         return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
 216 }
 217
 218 /* Device list insertion */
 219 static int list_netdevice(struct net_device *dev)
 220 {
 221         struct net *net = dev_net(dev);
 222
 223         ASSERT_RTNL();
 224
 225         write_lock_bh(&dev_base_lock);
 226         list_add_tail(&dev->dev_list, &net->dev_base_head);
 227         hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
 228         hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex));
 229         write_unlock_bh(&dev_base_lock);
 230         return 0;
 231 }
 232
 233 /* Device list removal */
 234 static void unlist_netdevice(struct net_device *dev)
 235 {
 236         ASSERT_RTNL();
 237
 238         /* Unlink dev from the device chain */
 239         write_lock_bh(&dev_base_lock);
 240         list_del(&dev->dev_list);
 241         hlist_del(&dev->name_hlist);
 242         hlist_del(&dev->index_hlist);
 243         write_unlock_bh(&dev_base_lock);
 244 }
 245
 246 /*
 247  *      Our notifier list
 248  */
 249
 250 static RAW_NOTIFIER_HEAD(netdev_chain);
 251
 252 /*
 253  *      Device drivers call our routines to queue packets here. We empty the
 254  *      queue in the local softnet handler.
 255  */
 256
 257 DEFINE_PER_CPU(struct softnet_data, softnet_data);
 258
 259 #ifdef CONFIG_LOCKDEP
 260 /*
 261  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 262  * according to dev->type
 263  */
 264 static const unsigned short netdev_lock_type[] =
 265         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 266          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 267          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 268          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 269          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 270          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 271          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 272          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 273          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 274          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 275          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 276          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 277          ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
 278          ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
 279          ARPHRD_PHONET_PIPE, ARPHRD_VOID, ARPHRD_NONE};
 280
 281 static const char *netdev_lock_name[] =
 282         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 283          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 284          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 285          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 286          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 287          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 288          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 289          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 290          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 291          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 292          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 293          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 294          "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
 295          "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
 296          "_xmit_PHONET_PIPE", "_xmit_VOID", "_xmit_NONE"};
 297
 298 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 299 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 300
 301 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 302 {
 303         int i;
 304
 305         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 306                 if (netdev_lock_type[i] == dev_type)
 307                         return i;
 308         /* the last key is used by default */
 309         return ARRAY_SIZE(netdev_lock_type) - 1;
 310 }
 311
 312 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 313                                                  unsigned short dev_type)
 314 {
 315         int i;
 316
 317         i = netdev_lock_pos(dev_type);
 318         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 319                                    netdev_lock_name[i]);
 320 }
 321
 322 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 323 {
 324         int i;
 325
 326         i = netdev_lock_pos(dev->type);
 327         lockdep_set_class_and_name(&dev->addr_list_lock,
 328                                    &netdev_addr_lock_key[i],
 329                                    netdev_lock_name[i]);
 330 }
 331 #else
 332 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 333                                                  unsigned short dev_type)
 334 {
 335 }
 336 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 337 {
 338 }
 339 #endif
 340
 341 /*******************************************************************************
 342
 343                 Protocol management and registration routines
 344
 345 *******************************************************************************/
 346
 347 /*
 348  *      Add a protocol ID to the list. Now that the input handler is
 349  *      smarter we can dispense with all the messy stuff that used to be
 350  *      here.
 351  *
 352  *      BEWARE!!! Protocol handlers, mangling input packets,
 353  *      MUST BE last in hash buckets and checking protocol handlers
 354  *      MUST start from promiscuous ptype_all chain in net_bh.
 355  *      It is true now, do not change it.
 356  *      Explanation follows: if protocol handler, mangling packet, will
 357  *      be the first on list, it is not able to sense, that packet
 358  *      is cloned and should be copied-on-write, so that it will
 359  *      change it and subsequent readers will get broken packet.
 360  *                                                      --ANK (980803)
 361  */
 362
 363 /**
 364  *      dev_add_pack - add packet handler
 365  *      @pt: packet type declaration
 366  *
 367  *      Add a protocol handler to the networking stack. The passed &packet_type
 368  *      is linked into kernel lists and may not be freed until it has been
 369  *      removed from the kernel lists.
 370  *
 371  *      This call does not sleep therefore it can not
 372  *      guarantee all CPU's that are in middle of receiving packets
 373  *      will see the new packet type (until the next received packet).
 374  */
 375
 376 void dev_add_pack(struct packet_type *pt)
 377 {
 378         int hash;
 379
 380         spin_lock_bh(&ptype_lock);
 381         if (pt->type == htons(ETH_P_ALL))
 382                 list_add_rcu(&pt->list, &ptype_all);
 383         else {
 384                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
 385                 list_add_rcu(&pt->list, &ptype_base[hash]);
 386         }
 387         spin_unlock_bh(&ptype_lock);
 388 }
 389
 390 /**
 391  *      __dev_remove_pack        - remove packet handler
 392  *      @pt: packet type declaration
 393  *
 394  *      Remove a protocol handler that was previously added to the kernel
 395  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 396  *      from the kernel lists and can be freed or reused once this function
 397  *      returns.
 398  *
 399  *      The packet type might still be in use by receivers
 400  *      and must not be freed until after all the CPU's have gone
 401  *      through a quiescent state.
 402  */
 403 void __dev_remove_pack(struct packet_type *pt)
 404 {
 405         struct list_head *head;
 406         struct packet_type *pt1;
 407
 408         spin_lock_bh(&ptype_lock);
 409
 410         if (pt->type == htons(ETH_P_ALL))
 411                 head = &ptype_all;
 412         else
 413                 head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 414
 415         list_for_each_entry(pt1, head, list) {
 416                 if (pt == pt1) {
 417                         list_del_rcu(&pt->list);
 418                         goto out;
 419                 }
 420         }
 421
 422         printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 423 out:
 424         spin_unlock_bh(&ptype_lock);
 425 }
 426 /**
 427  *      dev_remove_pack  - remove packet handler
 428  *      @pt: packet type declaration
 429  *
 430  *      Remove a protocol handler that was previously added to the kernel
 431  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 432  *      from the kernel lists and can be freed or reused once this function
 433  *      returns.
 434  *
 435  *      This call sleeps to guarantee that no CPU is looking at the packet
 436  *      type after return.
 437  */
 438 void dev_remove_pack(struct packet_type *pt)
 439 {
 440         __dev_remove_pack(pt);
 441
 442         synchronize_net();
 443 }
 444
 445 /******************************************************************************
 446
 447                       Device Boot-time Settings Routines
 448
 449 *******************************************************************************/
 450
 451 /* Boot time configuration table */
 452 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 453
 454 /**
 455  *      netdev_boot_setup_add   - add new setup entry
 456  *      @name: name of the device
 457  *      @map: configured settings for the device
 458  *
 459  *      Adds new setup entry to the dev_boot_setup list.  The function
 460  *      returns 0 on error and 1 on success.  This is a generic routine to
 461  *      all netdevices.
 462  */
 463 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 464 {
 465         struct netdev_boot_setup *s;
 466         int i;
 467
 468         s = dev_boot_setup;
 469         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 470                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 471                         memset(s[i].name, 0, sizeof(s[i].name));
 472                         strlcpy(s[i].name, name, IFNAMSIZ);
 473                         memcpy(&s[i].map, map, sizeof(s[i].map));
 474                         break;
 475                 }
 476         }
 477
 478         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 479 }
 480
 481 /**
 482  *      netdev_boot_setup_check - check boot time settings
 483  *      @dev: the netdevice
 484  *
 485  *      Check boot time settings for the device.
 486  *      The found settings are set for the device to be used
 487  *      later in the device probing.
 488  *      Returns 0 if no settings found, 1 if they are.
 489  */
 490 int netdev_boot_setup_check(struct net_device *dev)
 491 {
 492         struct netdev_boot_setup *s = dev_boot_setup;
 493         int i;
 494
 495         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 496                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 497                     !strcmp(dev->name, s[i].name)) {
 498                         dev->irq        = s[i].map.irq;
 499                         dev->base_addr  = s[i].map.base_addr;
 500                         dev->mem_start  = s[i].map.mem_start;
 501                         dev->mem_end    = s[i].map.mem_end;
 502                         return 1;
 503                 }
 504         }
 505         return 0;
 506 }
 507
 508
 509 /**
 510  *      netdev_boot_base        - get address from boot time settings
 511  *      @prefix: prefix for network device
 512  *      @unit: id for network device
 513  *
 514  *      Check boot time settings for the base address of device.
 515  *      The found settings are set for the device to be used
 516  *      later in the device probing.
 517  *      Returns 0 if no settings found.
 518  */
 519 unsigned long netdev_boot_base(const char *prefix, int unit)
 520 {
 521         const struct netdev_boot_setup *s = dev_boot_setup;
 522         char name[IFNAMSIZ];
 523         int i;
 524
 525         sprintf(name, "%s%d", prefix, unit);
 526
 527         /*
 528          * If device already registered then return base of 1
 529          * to indicate not to probe for this interface
 530          */
 531         if (__dev_get_by_name(&init_net, name))
 532                 return 1;
 533
 534         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 535                 if (!strcmp(name, s[i].name))
 536                         return s[i].map.base_addr;
 537         return 0;
 538 }
 539
 540 /*
 541  * Saves at boot time configured settings for any netdevice.
 542  */
 543 int __init netdev_boot_setup(char *str)
 544 {
 545         int ints[5];
 546         struct ifmap map;
 547
 548         str = get_options(str, ARRAY_SIZE(ints), ints);
 549         if (!str || !*str)
 550                 return 0;
 551
 552         /* Save settings */
 553         memset(&map, 0, sizeof(map));
 554         if (ints[0] > 0)
 555                 map.irq = ints[1];
 556         if (ints[0] > 1)
 557                 map.base_addr = ints[2];
 558         if (ints[0] > 2)
 559                 map.mem_start = ints[3];
 560         if (ints[0] > 3)
 561                 map.mem_end = ints[4];
 562
 563         /* Add new entry to the list */
 564         return netdev_boot_setup_add(str, &map);
 565 }
 566
 567 __setup("netdev=", netdev_boot_setup);
 568
 569 /*******************************************************************************
 570
 571                             Device Interface Subroutines
 572
 573 *******************************************************************************/
 574
 575 /**
 576  *      __dev_get_by_name       - find a device by its name
 577  *      @net: the applicable net namespace
 578  *      @name: name to find
 579  *
 580  *      Find an interface by name. Must be called under RTNL semaphore
 581  *      or @dev_base_lock. If the name is found a pointer to the device
 582  *      is returned. If the name is not found then %NULL is returned. The
 583  *      reference counters are not incremented so the caller must be
 584  *      careful with locks.
 585  */
 586
 587 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 588 {
 589         struct hlist_node *p;
 590
 591         hlist_for_each(p, dev_name_hash(net, name)) {
 592                 struct net_device *dev
 593                         = hlist_entry(p, struct net_device, name_hlist);
 594                 if (!strncmp(dev->name, name, IFNAMSIZ))
 595                         return dev;
 596         }
 597         return NULL;
 598 }
 599
 600 /**
 601  *      dev_get_by_name         - find a device by its name
 602  *      @net: the applicable net namespace
 603  *      @name: name to find
 604  *
 605  *      Find an interface by name. This can be called from any
 606  *      context and does its own locking. The returned handle has
 607  *      the usage count incremented and the caller must use dev_put() to
 608  *      release it when it is no longer needed. %NULL is returned if no
 609  *      matching device is found.
 610  */
 611
 612 struct net_device *dev_get_by_name(struct net *net, const char *name)
 613 {
 614         struct net_device *dev;
 615
 616         read_lock(&dev_base_lock);
 617         dev = __dev_get_by_name(net, name);
 618         if (dev)
 619                 dev_hold(dev);
 620         read_unlock(&dev_base_lock);
 621         return dev;
 622 }
 623
 624 /**
 625  *      __dev_get_by_index - find a device by its ifindex
 626  *      @net: the applicable net namespace
 627  *      @ifindex: index of device
 628  *
 629  *      Search for an interface by index. Returns %NULL if the device
 630  *      is not found or a pointer to the device. The device has not
 631  *      had its reference counter increased so the caller must be careful
 632  *      about locking. The caller must hold either the RTNL semaphore
 633  *      or @dev_base_lock.
 634  */
 635
 636 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 637 {
 638         struct hlist_node *p;
 639
 640         hlist_for_each(p, dev_index_hash(net, ifindex)) {
 641                 struct net_device *dev
 642                         = hlist_entry(p, struct net_device, index_hlist);
 643                 if (dev->ifindex == ifindex)
 644                         return dev;
 645         }
 646         return NULL;
 647 }
 648
 649
 650 /**
 651  *      dev_get_by_index - find a device by its ifindex
 652  *      @net: the applicable net namespace
 653  *      @ifindex: index of device
 654  *
 655  *      Search for an interface by index. Returns NULL if the device
 656  *      is not found or a pointer to the device. The device returned has
 657  *      had a reference added and the pointer is safe until the user calls
 658  *      dev_put to indicate they have finished with it.
 659  */
 660
 661 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 662 {
 663         struct net_device *dev;
 664
 665         read_lock(&dev_base_lock);
 666         dev = __dev_get_by_index(net, ifindex);
 667         if (dev)
 668                 dev_hold(dev);
 669         read_unlock(&dev_base_lock);
 670         return dev;
 671 }
 672
 673 /**
 674  *      dev_getbyhwaddr - find a device by its hardware address
 675  *      @net: the applicable net namespace
 676  *      @type: media type of device
 677  *      @ha: hardware address
 678  *
 679  *      Search for an interface by MAC address. Returns NULL if the device
 680  *      is not found or a pointer to the device. The caller must hold the
 681  *      rtnl semaphore. The returned device has not had its ref count increased
 682  *      and the caller must therefore be careful about locking
 683  *
 684  *      BUGS:
 685  *      If the API was consistent this would be __dev_get_by_hwaddr
 686  */
 687
 688 struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
 689 {
 690         struct net_device *dev;
 691
 692         ASSERT_RTNL();
 693
 694         for_each_netdev(net, dev)
 695                 if (dev->type == type &&
 696                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 697                         return dev;
 698
 699         return NULL;
 700 }
 701
 702 EXPORT_SYMBOL(dev_getbyhwaddr);
 703
 704 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 705 {
 706         struct net_device *dev;
 707
 708         ASSERT_RTNL();
 709         for_each_netdev(net, dev)
 710                 if (dev->type == type)
 711                         return dev;
 712
 713         return NULL;
 714 }
 715
 716 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 717
 718 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 719 {
 720         struct net_device *dev;
 721
 722         rtnl_lock();
 723         dev = __dev_getfirstbyhwtype(net, type);
 724         if (dev)
 725                 dev_hold(dev);
 726         rtnl_unlock();
 727         return dev;
 728 }
 729
 730 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 731
 732 /**
 733  *      dev_get_by_flags - find any device with given flags
 734  *      @net: the applicable net namespace
 735  *      @if_flags: IFF_* values
 736  *      @mask: bitmask of bits in if_flags to check
 737  *
 738  *      Search for any interface with the given flags. Returns NULL if a device
 739  *      is not found or a pointer to the device. The device returned has
 740  *      had a reference added and the pointer is safe until the user calls
 741  *      dev_put to indicate they have finished with it.
 742  */
 743
 744 struct net_device * dev_get_by_flags(struct net *net, unsigned short if_flags, unsigned short mask)
 745 {
 746         struct net_device *dev, *ret;
 747
 748         ret = NULL;
 749         read_lock(&dev_base_lock);
 750         for_each_netdev(net, dev) {
 751                 if (((dev->flags ^ if_flags) & mask) == 0) {
 752                         dev_hold(dev);
 753                         ret = dev;
 754                         break;
 755                 }
 756         }
 757         read_unlock(&dev_base_lock);
 758         return ret;
 759 }
 760
 761 /**
 762  *      dev_valid_name - check if name is okay for network device
 763  *      @name: name string
 764  *
 765  *      Network device names need to be valid file names to
 766  *      to allow sysfs to work.  We also disallow any kind of
 767  *      whitespace.
 768  */
 769 int dev_valid_name(const char *name)
 770 {
 771         if (*name == '\0')
 772                 return 0;
 773         if (strlen(name) >= IFNAMSIZ)
 774                 return 0;
 775         if (!strcmp(name, ".") || !strcmp(name, ".."))
 776                 return 0;
 777
 778         while (*name) {
 779                 if (*name == '/' || isspace(*name))
 780                         return 0;
 781                 name++;
 782         }
 783         return 1;
 784 }
 785
 786 /**
 787  *      __dev_alloc_name - allocate a name for a device
 788  *      @net: network namespace to allocate the device name in
 789  *      @name: name format string
 790  *      @buf:  scratch buffer and result name string
 791  *
 792  *      Passed a format string - eg "lt%d" it will try and find a suitable
 793  *      id. It scans list of devices to build up a free map, then chooses
 794  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 795  *      while allocating the name and adding the device in order to avoid
 796  *      duplicates.
 797  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 798  *      Returns the number of the unit assigned or a negative errno code.
 799  */
 800
 801 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 802 {
 803         int i = 0;
 804         const char *p;
 805         const int max_netdevices = 8*PAGE_SIZE;
 806         unsigned long *inuse;
 807         struct net_device *d;
 808
 809         p = strnchr(name, IFNAMSIZ-1, '%');
 810         if (p) {
 811                 /*
 812                  * Verify the string as this thing may have come from
 813                  * the user.  There must be either one "%d" and no other "%"
 814                  * characters.
 815                  */
 816                 if (p[1] != 'd' || strchr(p + 2, '%'))
 817                         return -EINVAL;
 818
 819                 /* Use one page as a bit array of possible slots */
 820                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 821                 if (!inuse)
 822                         return -ENOMEM;
 823
 824                 for_each_netdev(net, d) {
 825                         if (!sscanf(d->name, name, &i))
 826                                 continue;
 827                         if (i < 0 || i >= max_netdevices)
 828                                 continue;
 829
 830                         /*  avoid cases where sscanf is not exact inverse of printf */
 831                         snprintf(buf, IFNAMSIZ, name, i);
 832                         if (!strncmp(buf, d->name, IFNAMSIZ))
 833                                 set_bit(i, inuse);
 834                 }
 835
 836                 i = find_first_zero_bit(inuse, max_netdevices);
 837                 free_page((unsigned long) inuse);
 838         }
 839
 840         snprintf(buf, IFNAMSIZ, name, i);
 841         if (!__dev_get_by_name(net, buf))
 842                 return i;
 843
 844         /* It is possible to run out of possible slots
 845          * when the name is long and there isn't enough space left
 846          * for the digits, or if all bits are used.
 847          */
 848         return -ENFILE;
 849 }
 850
 851 /**
 852  *      dev_alloc_name - allocate a name for a device
 853  *      @dev: device
 854  *      @name: name format string
 855  *
 856  *      Passed a format string - eg "lt%d" it will try and find a suitable
 857  *      id. It scans list of devices to build up a free map, then chooses
 858  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 859  *      while allocating the name and adding the device in order to avoid
 860  *      duplicates.
 861  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 862  *      Returns the number of the unit assigned or a negative errno code.
 863  */
 864
 865 int dev_alloc_name(struct net_device *dev, const char *name)
 866 {
 867         char buf[IFNAMSIZ];
 868         struct net *net;
 869         int ret;
 870
 871         BUG_ON(!dev_net(dev));
 872         net = dev_net(dev);
 873         ret = __dev_alloc_name(net, name, buf);
 874         if (ret >= 0)
 875                 strlcpy(dev->name, buf, IFNAMSIZ);
 876         return ret;
 877 }
 878
 879
 880 /**
 881  *      dev_change_name - change name of a device
 882  *      @dev: device
 883  *      @newname: name (or format string) must be at least IFNAMSIZ
 884  *
 885  *      Change name of a device, can pass format strings "eth%d".
 886  *      for wildcarding.
 887  */
 888 int dev_change_name(struct net_device *dev, const char *newname)
 889 {
 890         char oldname[IFNAMSIZ];
 891         int err = 0;
 892         int ret;
 893         struct net *net;
 894
 895         ASSERT_RTNL();
 896         BUG_ON(!dev_net(dev));
 897
 898         net = dev_net(dev);
 899         if (dev->flags & IFF_UP)
 900                 return -EBUSY;
 901
 902         if (!dev_valid_name(newname))
 903                 return -EINVAL;
 904
 905         if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
 906                 return 0;
 907
 908         memcpy(oldname, dev->name, IFNAMSIZ);
 909
 910         if (strchr(newname, '%')) {
 911                 err = dev_alloc_name(dev, newname);
 912                 if (err < 0)
 913                         return err;
 914         }
 915         else if (__dev_get_by_name(net, newname))
 916                 return -EEXIST;
 917         else
 918                 strlcpy(dev->name, newname, IFNAMSIZ);
 919
 920 rollback:
 921         /* For now only devices in the initial network namespace
 922          * are in sysfs.
 923          */
 924         if (net == &init_net) {
 925                 ret = device_rename(&dev->dev, dev->name);
 926                 if (ret) {
 927                         memcpy(dev->name, oldname, IFNAMSIZ);
 928                         return ret;
 929                 }
 930         }
 931
 932         write_lock_bh(&dev_base_lock);
 933         hlist_del(&dev->name_hlist);
 934         hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
 935         write_unlock_bh(&dev_base_lock);
 936
 937         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
 938         ret = notifier_to_errno(ret);
 939
 940         if (ret) {
 941                 if (err) {
 942                         printk(KERN_ERR
 943                                "%s: name change rollback failed: %d.\n",
 944                                dev->name, ret);
 945                 } else {
 946                         err = ret;
 947                         memcpy(dev->name, oldname, IFNAMSIZ);
 948                         goto rollback;
 949                 }
 950         }
 951
 952         return err;
 953 }
 954
 955 /**
 956  *      dev_set_alias - change ifalias of a device
 957  *      @dev: device
 958  *      @alias: name up to IFALIASZ
 959  *      @len: limit of bytes to copy from info
 960  *
 961  *      Set ifalias for a device,
 962  */
 963 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
 964 {
 965         ASSERT_RTNL();
 966
 967         if (len >= IFALIASZ)
 968                 return -EINVAL;
 969
 970         if (!len) {
 971                 if (dev->ifalias) {
 972                         kfree(dev->ifalias);
 973                         dev->ifalias = NULL;
 974                 }
 975                 return 0;
 976         }
 977
 978         dev->ifalias = krealloc(dev->ifalias, len+1, GFP_KERNEL);
 979         if (!dev->ifalias)
 980                 return -ENOMEM;
 981
 982         strlcpy(dev->ifalias, alias, len+1);
 983         return len;
 984 }
 985
 986
 987 /**
 988  *      netdev_features_change - device changes features
 989  *      @dev: device to cause notification
 990  *
 991  *      Called to indicate a device has changed features.
 992  */
 993 void netdev_features_change(struct net_device *dev)
 994 {
 995         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
 996 }
 997 EXPORT_SYMBOL(netdev_features_change);
 998
 999 /**
1000  *      netdev_state_change - device changes state
1001  *      @dev: device to cause notification
1002  *
1003  *      Called to indicate a device has changed state. This function calls
1004  *      the notifier chains for netdev_chain and sends a NEWLINK message
1005  *      to the routing socket.
1006  */
1007 void netdev_state_change(struct net_device *dev)
1008 {
1009         if (dev->flags & IFF_UP) {
1010                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1011                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1012         }
1013 }
1014
1015 void netdev_bonding_change(struct net_device *dev)
1016 {
1017         call_netdevice_notifiers(NETDEV_BONDING_FAILOVER, dev);
1018 }
1019 EXPORT_SYMBOL(netdev_bonding_change);
1020
1021 /**
1022  *      dev_load        - load a network module
1023  *      @net: the applicable net namespace
1024  *      @name: name of interface
1025  *
1026  *      If a network interface is not present and the process has suitable
1027  *      privileges this function loads the module. If module loading is not
1028  *      available in this kernel then it becomes a nop.
1029  */
1030
1031 void dev_load(struct net *net, const char *name)
1032 {
1033         struct net_device *dev;
1034
1035         read_lock(&dev_base_lock);
1036         dev = __dev_get_by_name(net, name);
1037         read_unlock(&dev_base_lock);
1038
1039         if (!dev && capable(CAP_SYS_MODULE))
1040                 request_module("%s", name);
1041 }
1042
1043 /**
1044  *      dev_open        - prepare an interface for use.
1045  *      @dev:   device to open
1046  *
1047  *      Takes a device from down to up state. The device's private open
1048  *      function is invoked and then the multicast lists are loaded. Finally
1049  *      the device is moved into the up state and a %NETDEV_UP message is
1050  *      sent to the netdev notifier chain.
1051  *
1052  *      Calling this function on an active interface is a nop. On a failure
1053  *      a negative errno code is returned.
1054  */
1055 int dev_open(struct net_device *dev)
1056 {
1057         const struct net_device_ops *ops = dev->netdev_ops;
1058         int ret = 0;
1059
1060         ASSERT_RTNL();
1061
1062         /*
1063          *      Is it already up?
1064          */
1065
1066         if (dev->flags & IFF_UP)
1067                 return 0;
1068
1069         /*
1070          *      Is it even present?
1071          */
1072         if (!netif_device_present(dev))
1073                 return -ENODEV;
1074
1075         /*
1076          *      Call device private open method
1077          */
1078         set_bit(__LINK_STATE_START, &dev->state);
1079
1080         if (ops->ndo_validate_addr)
1081                 ret = ops->ndo_validate_addr(dev);
1082
1083         if (!ret && ops->ndo_open)
1084                 ret = ops->ndo_open(dev);
1085
1086         /*
1087          *      If it went open OK then:
1088          */
1089
1090         if (ret)
1091                 clear_bit(__LINK_STATE_START, &dev->state);
1092         else {
1093                 /*
1094                  *      Set the flags.
1095                  */
1096                 dev->flags |= IFF_UP;
1097
1098                 /*
1099                  *      Enable NET_DMA
1100                  */
1101                 net_dmaengine_get();
1102
1103                 /*
1104                  *      Initialize multicasting status
1105                  */
1106                 dev_set_rx_mode(dev);
1107
1108                 /*
1109                  *      Wakeup transmit queue engine
1110                  */
1111                 dev_activate(dev);
1112
1113                 /*
1114                  *      ... and announce new interface.
1115                  */
1116                 call_netdevice_notifiers(NETDEV_UP, dev);
1117         }
1118
1119         return ret;
1120 }
1121
1122 /**
1123  *      dev_close - shutdown an interface.
1124  *      @dev: device to shutdown
1125  *
1126  *      This function moves an active device into down state. A
1127  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1128  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1129  *      chain.
1130  */
1131 int dev_close(struct net_device *dev)
1132 {
1133         const struct net_device_ops *ops = dev->netdev_ops;
1134         ASSERT_RTNL();
1135
1136         might_sleep();
1137
1138         if (!(dev->flags & IFF_UP))
1139                 return 0;
1140
1141         /*
1142          *      Tell people we are going down, so that they can
1143          *      prepare to death, when device is still operating.
1144          */
1145         call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1146
1147         clear_bit(__LINK_STATE_START, &dev->state);
1148
1149         /* Synchronize to scheduled poll. We cannot touch poll list,
1150          * it can be even on different cpu. So just clear netif_running().
1151          *
1152          * dev->stop() will invoke napi_disable() on all of it's
1153          * napi_struct instances on this device.
1154          */
1155         smp_mb__after_clear_bit(); /* Commit netif_running(). */
1156
1157         dev_deactivate(dev);
1158
1159         /*
1160          *      Call the device specific close. This cannot fail.
1161          *      Only if device is UP
1162          *
1163          *      We allow it to be called even after a DETACH hot-plug
1164          *      event.
1165          */
1166         if (ops->ndo_stop)
1167                 ops->ndo_stop(dev);
1168
1169         /*
1170          *      Device is now down.
1171          */
1172
1173         dev->flags &= ~IFF_UP;
1174
1175         /*
1176          * Tell people we are down
1177          */
1178         call_netdevice_notifiers(NETDEV_DOWN, dev);
1179
1180         /*
1181          *      Shutdown NET_DMA
1182          */
1183         net_dmaengine_put();
1184
1185         return 0;
1186 }
1187
1188
1189 /**
1190  *      dev_disable_lro - disable Large Receive Offload on a device
1191  *      @dev: device
1192  *
1193  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1194  *      called under RTNL.  This is needed if received packets may be
1195  *      forwarded to another interface.
1196  */
1197 void dev_disable_lro(struct net_device *dev)
1198 {
1199         if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1200             dev->ethtool_ops->set_flags) {
1201                 u32 flags = dev->ethtool_ops->get_flags(dev);
1202                 if (flags & ETH_FLAG_LRO) {
1203                         flags &= ~ETH_FLAG_LRO;
1204                         dev->ethtool_ops->set_flags(dev, flags);
1205                 }
1206         }
1207         WARN_ON(dev->features & NETIF_F_LRO);
1208 }
1209 EXPORT_SYMBOL(dev_disable_lro);
1210
1211
1212 static int dev_boot_phase = 1;
1213
1214 /*
1215  *      Device change register/unregister. These are not inline or static
1216  *      as we export them to the world.
1217  */
1218
1219 /**
1220  *      register_netdevice_notifier - register a network notifier block
1221  *      @nb: notifier
1222  *
1223  *      Register a notifier to be called when network device events occur.
1224  *      The notifier passed is linked into the kernel structures and must
1225  *      not be reused until it has been unregistered. A negative errno code
1226  *      is returned on a failure.
1227  *
1228  *      When registered all registration and up events are replayed
1229  *      to the new notifier to allow device to have a race free
1230  *      view of the network device list.
1231  */
1232
1233 int register_netdevice_notifier(struct notifier_block *nb)
1234 {
1235         struct net_device *dev;
1236         struct net_device *last;
1237         struct net *net;
1238         int err;
1239
1240         rtnl_lock();
1241         err = raw_notifier_chain_register(&netdev_chain, nb);
1242         if (err)
1243                 goto unlock;
1244         if (dev_boot_phase)
1245                 goto unlock;
1246         for_each_net(net) {
1247                 for_each_netdev(net, dev) {
1248                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1249                         err = notifier_to_errno(err);
1250                         if (err)
1251                                 goto rollback;
1252
1253                         if (!(dev->flags & IFF_UP))
1254                                 continue;
1255
1256                         nb->notifier_call(nb, NETDEV_UP, dev);
1257                 }
1258         }
1259
1260 unlock:
1261         rtnl_unlock();
1262         return err;
1263
1264 rollback:
1265         last = dev;
1266         for_each_net(net) {
1267                 for_each_netdev(net, dev) {
1268                         if (dev == last)
1269                                 break;
1270
1271                         if (dev->flags & IFF_UP) {
1272                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1273                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1274                         }
1275                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1276                 }
1277         }
1278
1279         raw_notifier_chain_unregister(&netdev_chain, nb);
1280         goto unlock;
1281 }
1282
1283 /**
1284  *      unregister_netdevice_notifier - unregister a network notifier block
1285  *      @nb: notifier
1286  *
1287  *      Unregister a notifier previously registered by
1288  *      register_netdevice_notifier(). The notifier is unlinked into the
1289  *      kernel structures and may then be reused. A negative errno code
1290  *      is returned on a failure.
1291  */
1292
1293 int unregister_netdevice_notifier(struct notifier_block *nb)
1294 {
1295         int err;
1296
1297         rtnl_lock();
1298         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1299         rtnl_unlock();
1300         return err;
1301 }
1302
1303 /**
1304  *      call_netdevice_notifiers - call all network notifier blocks
1305  *      @val: value passed unmodified to notifier function
1306  *      @dev: net_device pointer passed unmodified to notifier function
1307  *
1308  *      Call all network notifier blocks.  Parameters and return value
1309  *      are as for raw_notifier_call_chain().
1310  */
1311
1312 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1313 {
1314         return raw_notifier_call_chain(&netdev_chain, val, dev);
1315 }
1316
1317 /* When > 0 there are consumers of rx skb time stamps */
1318 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1319
1320 void net_enable_timestamp(void)
1321 {
1322         atomic_inc(&netstamp_needed);
1323 }
1324
1325 void net_disable_timestamp(void)
1326 {
1327         atomic_dec(&netstamp_needed);
1328 }
1329
1330 static inline void net_timestamp(struct sk_buff *skb)
1331 {
1332         if (atomic_read(&netstamp_needed))
1333                 __net_timestamp(skb);
1334         else
1335                 skb->tstamp.tv64 = 0;
1336 }
1337
1338 /*
1339  *      Support routine. Sends outgoing frames to any network
1340  *      taps currently in use.
1341  */
1342
1343 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1344 {
1345         struct packet_type *ptype;
1346
1347         net_timestamp(skb);
1348
1349         rcu_read_lock();
1350         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1351                 /* Never send packets back to the socket
1352                  * they originated from - MvS (miquels@drinkel.ow.org)
1353                  */
1354                 if ((ptype->dev == dev || !ptype->dev) &&
1355                     (ptype->af_packet_priv == NULL ||
1356                      (struct sock *)ptype->af_packet_priv != skb->sk)) {
1357                         struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
1358                         if (!skb2)
1359                                 break;
1360
1361                         /* skb->nh should be correctly
1362                            set by sender, so that the second statement is
1363                            just protection against buggy protocols.
1364                          */
1365                         skb_reset_mac_header(skb2);
1366
1367                         if (skb_network_header(skb2) < skb2->data ||
1368                             skb2->network_header > skb2->tail) {
1369                                 if (net_ratelimit())
1370                                         printk(KERN_CRIT "protocol %04x is "
1371                                                "buggy, dev %s\n",
1372                                                skb2->protocol, dev->name);
1373                                 skb_reset_network_header(skb2);
1374                         }
1375
1376                         skb2->transport_header = skb2->network_header;
1377                         skb2->pkt_type = PACKET_OUTGOING;
1378                         ptype->func(skb2, skb->dev, ptype, skb->dev);
1379                 }
1380         }
1381         rcu_read_unlock();
1382 }
1383
1384
1385 static inline void __netif_reschedule(struct Qdisc *q)
1386 {
1387         struct softnet_data *sd;
1388         unsigned long flags;
1389
1390         local_irq_save(flags);
1391         sd = &__get_cpu_var(softnet_data);
1392         q->next_sched = sd->output_queue;
1393         sd->output_queue = q;
1394         raise_softirq_irqoff(NET_TX_SOFTIRQ);
1395         local_irq_restore(flags);
1396 }
1397
1398 void __netif_schedule(struct Qdisc *q)
1399 {
1400         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1401                 __netif_reschedule(q);
1402 }
1403 EXPORT_SYMBOL(__netif_schedule);
1404
1405 void dev_kfree_skb_irq(struct sk_buff *skb)
1406 {
1407         if (atomic_dec_and_test(&skb->users)) {
1408                 struct softnet_data *sd;
1409                 unsigned long flags;
1410
1411                 local_irq_save(flags);
1412                 sd = &__get_cpu_var(softnet_data);
1413                 skb->next = sd->completion_queue;
1414                 sd->completion_queue = skb;
1415                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1416                 local_irq_restore(flags);
1417         }
1418 }
1419 EXPORT_SYMBOL(dev_kfree_skb_irq);
1420
/*
 *	Free @skb from any context: use dev_kfree_skb_irq() in hard
 *	interrupt context or with interrupts disabled, and
 *	dev_kfree_skb() otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (!in_irq() && !irqs_disabled()) {
		dev_kfree_skb(skb);
		return;
	}

	dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1429
1430
1431 /**
1432  * netif_device_detach - mark device as removed
1433  * @dev: network device
1434  *
1435  * Mark device as removed from system and therefore no longer available.
1436  */
1437 void netif_device_detach(struct net_device *dev)
1438 {
1439         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1440             netif_running(dev)) {
1441                 netif_stop_queue(dev);
1442         }
1443 }
1444 EXPORT_SYMBOL(netif_device_detach);
1445
1446 /**
1447  * netif_device_attach - mark device as attached
1448  * @dev: network device
1449  *
1450  * Mark device as attached from system and restart if needed.
1451  */
1452 void netif_device_attach(struct net_device *dev)
1453 {
1454         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1455             netif_running(dev)) {
1456                 netif_wake_queue(dev);
1457                 __netdev_watchdog_up(dev);
1458         }
1459 }
1460 EXPORT_SYMBOL(netif_device_attach);
1461
1462 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1463 {
1464         return ((features & NETIF_F_GEN_CSUM) ||
1465                 ((features & NETIF_F_IP_CSUM) &&
1466                  protocol == htons(ETH_P_IP)) ||
1467                 ((features & NETIF_F_IPV6_CSUM) &&
1468                  protocol == htons(ETH_P_IPV6)));
1469 }
1470
1471 static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1472 {
1473         if (can_checksum_protocol(dev->features, skb->protocol))
1474                 return true;
1475
1476         if (skb->protocol == htons(ETH_P_8021Q)) {
1477                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1478                 if (can_checksum_protocol(dev->features & dev->vlan_features,
1479                                           veh->h_vlan_encapsulated_proto))
1480                         return true;
1481         }
1482
1483         return false;
1484 }
1485
1486 /*
1487  * Invalidate hardware checksum when packet is to be mangled, and
1488  * complete checksum manually on outgoing path.
1489  */
1490 int skb_checksum_help(struct sk_buff *skb)
1491 {
1492         __wsum csum;
1493         int ret = 0, offset;
1494
1495         if (skb->ip_summed == CHECKSUM_COMPLETE)
1496                 goto out_set_summed;
1497
1498         if (unlikely(skb_shinfo(skb)->gso_size)) {
1499                 /* Let GSO fix up the checksum. */
1500                 goto out_set_summed;
1501         }
1502
1503         offset = skb->csum_start - skb_headroom(skb);
1504         BUG_ON(offset >= skb_headlen(skb));
1505         csum = skb_checksum(skb, offset, skb->len - offset, 0);
1506
1507         offset += skb->csum_offset;
1508         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1509
1510         if (skb_cloned(skb) &&
1511             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1512                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1513                 if (ret)
1514                         goto out;
1515         }
1516
1517         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1518 out_set_summed:
1519         skb->ip_summed = CHECKSUM_NONE;
1520 out:
1521         return ret;
1522 }
1523
1524 /**
1525  *      skb_gso_segment - Perform segmentation on skb.
1526  *      @skb: buffer to segment
1527  *      @features: features for the output path (see dev->features)
1528  *
1529  *      This function segments the given skb and returns a list of segments.
1530  *
1531  *      It may return NULL if the skb requires no segmentation.  This is
1532  *      only possible when GSO is used for verifying header integrity.
1533  */
1534 struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1535 {
1536         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1537         struct packet_type *ptype;
1538         __be16 type = skb->protocol;
1539         int err;
1540
1541         skb_reset_mac_header(skb);
1542         skb->mac_len = skb->network_header - skb->mac_header;
1543         __skb_pull(skb, skb->mac_len);
1544
1545         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1546                 struct net_device *dev = skb->dev;
1547                 struct ethtool_drvinfo info = {};
1548
1549                 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1550                         dev->ethtool_ops->get_drvinfo(dev, &info);
1551
1552                 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
1553                         "ip_summed=%d",
1554                      info.driver, dev ? dev->features : 0L,
1555                      skb->sk ? skb->sk->sk_route_caps : 0L,
1556                      skb->len, skb->data_len, skb->ip_summed);
1557
1558                 if (skb_header_cloned(skb) &&
1559                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1560                         return ERR_PTR(err);
1561         }
1562
1563         rcu_read_lock();
1564         list_for_each_entry_rcu(ptype,
1565                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1566                 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1567                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1568                                 err = ptype->gso_send_check(skb);
1569                                 segs = ERR_PTR(err);
1570                                 if (err || skb_gso_ok(skb, features))
1571                                         break;
1572                                 __skb_push(skb, (skb->data -
1573                                                  skb_network_header(skb)));
1574                         }
1575                         segs = ptype->gso_segment(skb, features);
1576                         break;
1577                 }
1578         }
1579         rcu_read_unlock();
1580
1581         __skb_push(skb, skb->data - skb_mac_header(skb));
1582
1583         return segs;
1584 }
1585
1586 EXPORT_SYMBOL(skb_gso_segment);
1587
/*
 * Take action when hardware reception checksum errors are detected:
 * log the device name (or "<unknown>" without a device) and dump the
 * stack, subject to net_ratelimit().
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (!net_ratelimit())
		return;

	printk(KERN_ERR "%s: hw csum failure.\n",
		dev ? dev->name : "<unknown>");
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
1600
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 when @skb has a fragment in high memory and @dev does not
 * advertise NETIF_F_HIGHDMA, 0 otherwise.  Always 0 when the kernel
 * is built without CONFIG_HIGHMEM.
 */

static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	int frag;

	if (dev->features & NETIF_F_HIGHDMA)
		return 0;

	for (frag = 0; frag < skb_shinfo(skb)->nr_frags; frag++) {
		if (PageHighMem(skb_shinfo(skb)->frags[frag].page))
			return 1;
	}

#endif
	return 0;
}
1621
1622 struct dev_gso_cb {
1623         void (*destructor)(struct sk_buff *skb);
1624 };
1625
1626 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1627
1628 static void dev_gso_skb_destructor(struct sk_buff *skb)
1629 {
1630         struct dev_gso_cb *cb;
1631
1632         do {
1633                 struct sk_buff *nskb = skb->next;
1634
1635                 skb->next = nskb->next;
1636                 nskb->next = NULL;
1637                 kfree_skb(nskb);
1638         } while (skb->next);
1639
1640         cb = DEV_GSO_CB(skb);
1641         if (cb->destructor)
1642                 cb->destructor(skb);
1643 }
1644
1645 /**
1646  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
1647  *      @skb: buffer to segment
1648  *
1649  *      This function segments the given skb and stores the list of segments
1650  *      in skb->next.
1651  */
1652 static int dev_gso_segment(struct sk_buff *skb)
1653 {
1654         struct net_device *dev = skb->dev;
1655         struct sk_buff *segs;
1656         int features = dev->features & ~(illegal_highdma(dev, skb) ?
1657                                          NETIF_F_SG : 0);
1658
1659         segs = skb_gso_segment(skb, features);
1660
1661         /* Verifying header integrity only. */
1662         if (!segs)
1663                 return 0;
1664
1665         if (IS_ERR(segs))
1666                 return PTR_ERR(segs);
1667
1668         skb->next = segs;
1669         DEV_GSO_CB(skb)->destructor = skb->destructor;
1670         skb->destructor = dev_gso_skb_destructor;
1671
1672         return 0;
1673 }
1674
1675 static void tstamp_tx(struct sk_buff *skb)
1676 {
1677         union skb_shared_tx *shtx =
1678                 skb_tx(skb);
1679         if (unlikely(shtx->software &&
1680                         !shtx->in_progress)) {
1681                 skb_tstamp_tx(skb, NULL);
1682         }
1683 }
1684
/*
 * dev_hard_start_xmit - hand a buffer (or a list of GSO segments) to the driver
 * @skb: buffer to send; if skb->next is set, @skb is the head of a segment
 *       list and the segments hanging off skb->next are what gets sent
 * @dev: device whose netdev_ops->ndo_start_xmit is invoked
 * @txq: transmit queue; only consulted in the segment-list path to see
 *       whether the queue has been stopped
 *
 * Single buffer (skb->next == NULL): the buffer is first shown to the taps
 * when ptype_all is not empty, segmented with dev_gso_segment() when
 * netif_needs_gso() says so, and otherwise passed to ndo_start_xmit().
 * The driver's return code is returned unchanged; tstamp_tx() is called
 * only when that code is 0.
 *
 * Segment list: segments are detached from the head one at a time and sent.
 * If the driver returns non-zero the segment is linked back in front of the
 * remaining ones and that code is returned.  If the queue is stopped while
 * segments remain, NETDEV_TX_BUSY is returned.  Once the list is empty the
 * head's original destructor is restored and the head is freed.
 *
 * Returns 0 when everything was handed over, and also when
 * dev_gso_segment() failed (the buffer is then freed and the failure is not
 * reported to the caller).
 */
1685 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1686                         struct netdev_queue *txq)
1687 {
1688         const struct net_device_ops *ops = dev->netdev_ops;
1689         int rc;
1690
1691         prefetch(&dev->netdev_ops->ndo_start_xmit);
1692         if (likely(!skb->next)) {
                     /* Give the taps registered on ptype_all a look at the packet. */
1693                 if (!list_empty(&ptype_all))
1694                         dev_queue_xmit_nit(skb, dev);
1695
1696                 if (netif_needs_gso(dev, skb)) {
                             /* Segmentation failure: drop the buffer, return 0. */
1697                         if (unlikely(dev_gso_segment(skb)))
1698                                 goto out_kfree_skb;
                             /* Segments were produced: send them via the list path. */
1699                         if (skb->next)
1700                                 goto gso;
1701                 }
1702
1703                 rc = ops->ndo_start_xmit(skb, dev);
1704                 /*
1705                  * TODO: if skb_orphan() was called by
1706                  * dev->hard_start_xmit() (for example, the unmodified
1707                  * igb driver does that; bnx2 doesn't), then
1708                  * skb_tx_software_timestamp() will be unable to send
1709                  * back the time stamp.
1710                  *
1711                  * How can this be prevented? Always create another
1712                  * reference to the socket before calling
1713                  * dev->hard_start_xmit()? Prevent that skb_orphan()
1714                  * does anything in dev->hard_start_xmit() by clearing
1715                  * the skb destructor before the call and restoring it
1716                  * afterwards, then doing the skb_orphan() ourselves?
1717                  */
                     /*
                      * NOTE(review): skb is touched here after the driver
                      * accepted it; whether it is still valid at this point
                      * depends on the driver - confirm.
                      */
1718                 if (likely(!rc))
1719                         tstamp_tx(skb);
1720                 return rc;
1721         }
1722
1723 gso:
1724         do {
                     /* Detach the first pending segment from the head. */
1725                 struct sk_buff *nskb = skb->next;
1726
1727                 skb->next = nskb->next;
1728                 nskb->next = NULL;
1729                 rc = ops->ndo_start_xmit(nskb, dev);
1730                 if (unlikely(rc)) {
                             /* Not accepted: put the segment back at the front. */
1731                         nskb->next = skb->next;
1732                         skb->next = nskb;
1733                         return rc;
1734                 }
                     /*
                      * NOTE(review): the time stamp check is done on the list
                      * head (skb), not on the segment that was just sent
                      * (nskb), so it runs once per segment - confirm this is
                      * intended.
                      */
1735                 tstamp_tx(skb);
1736                 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
1737                         return NETDEV_TX_BUSY;
1738         } while (skb->next);
1739
             /* All segments sent: restore the destructor saved in DEV_GSO_CB. */
1740         skb->destructor = DEV_GSO_CB(skb)->destructor;
1741
1742 out_kfree_skb:
1743         kfree_skb(skb);
1744         return 0;
1745 }
1746
/*
 * Random seed mixed into the TX queue hash, and a flag recording whether it
 * has been filled in yet.  The seed is generated on the first call to
 * skb_tx_hash(); no lock is taken around that lazy initialisation here.
 */
1747 static u32 skb_tx_hashrnd;
1748 static int skb_tx_hashrnd_initialized = 0;
1749
/*
 * skb_tx_hash - map a buffer onto one of the device's active TX queues
 * @dev: device; dev->real_num_tx_queues bounds the result
 * @skb: buffer to classify
 *
 * The hash input is, in order of preference: the recorded RX queue, the
 * socket hash (when skb->sk is set and sk_hash is non-zero), or
 * skb->protocol.  That value is mixed with the random seed by jhash_1word()
 * and then scaled into [0, dev->real_num_tx_queues) with a multiply and a
 * 32-bit shift instead of a modulo.
 */
1750 static u16 skb_tx_hash(struct net_device *dev, struct sk_buff *skb)
1751 {
1752         u32 hash;
1753
             /* First use: pick the 4-byte random seed. */
1754         if (unlikely(!skb_tx_hashrnd_initialized)) {
1755                 get_random_bytes(&skb_tx_hashrnd, 4);
1756                 skb_tx_hashrnd_initialized = 1;
1757         }
1758
1759         if (skb_rx_queue_recorded(skb)) {
1760                 hash = skb_get_rx_queue(skb);
1761         } else if (skb->sk && skb->sk->sk_hash) {
1762                 hash = skb->sk->sk_hash;
1763         } else
1764                 hash = skb->protocol;
1765
1766         hash = jhash_1word(hash, skb_tx_hashrnd);
1767
             /* (hash * n) >> 32 is always < n because hash < 2^32. */
1768         return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
1769 }
1770
/*
 * dev_pick_tx - choose the transmit queue for a buffer
 * @dev: device the buffer will be sent on
 * @skb: buffer being sent
 *
 * The queue index comes from the driver's ndo_select_queue() when the driver
 * provides one, otherwise from skb_tx_hash() when the device has more than
 * one active TX queue, and is 0 otherwise.  The index is stored in the
 * buffer with skb_set_queue_mapping() and the matching netdev_queue is
 * returned.
 */
1771 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
1772                                         struct sk_buff *skb)
1773 {
1774         const struct net_device_ops *ops = dev->netdev_ops;
1775         u16 queue_index = 0;
1776
1777         if (ops->ndo_select_queue)
1778                 queue_index = ops->ndo_select_queue(dev, skb);
1779         else if (dev->real_num_tx_queues > 1)
1780                 queue_index = skb_tx_hash(dev, skb);
1781
1782         skb_set_queue_mapping(skb, queue_index);
1783         return netdev_get_tx_queue(dev, queue_index);
1784 }
1785
1786 /**
1787  *      dev_queue_xmit - transmit a buffer
1788  *      @skb: buffer to transmit
1789  *
1790  *      Queue a buffer for transmission to a network device. The caller must
1791  *      have set the device and priority and built the buffer before calling
1792  *      this function. The function can be called from an interrupt.
1793  *
1794  *      A negative errno code is returned on a failure. A success does not
1795  *      guarantee the frame will be transmitted as it may be dropped due
1796  *      to congestion or traffic shaping.
1797  *
1798  * -----------------------------------------------------------------------------------
1799  *      I notice this method can also return errors from the queue disciplines,
1800  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
1801  *      be positive.
1802  *
1803  *      Regardless of the return value, the skb is consumed, so it is currently
1804  *      difficult to retry a send to this method.  (You can bump the ref count
1805  *      before sending to hold a reference for retry if you are careful.)
1806  *
1807  *      When calling this method, interrupts MUST be enabled.  This is because
1808  *      the BH enable code must have IRQs enabled so that it will not deadlock.
1809  *          --BLG
1810  */
1811 int dev_queue_xmit(struct sk_buff *skb)
1812 {
1813         struct net_device *dev = skb->dev;
1814         struct netdev_queue *txq;
1815         struct Qdisc *q;
             /* -ENOMEM is what the linearize/checksum failure paths below return. */
1816         int rc = -ENOMEM;
1817
1818         /* GSO will handle the following emulations directly. */
1819         if (netif_needs_gso(dev, skb))
1820                 goto gso;
1821
             /* Flatten a frag_list the device cannot handle itself. */
1822         if (skb_shinfo(skb)->frag_list &&
1823             !(dev->features & NETIF_F_FRAGLIST) &&
1824             __skb_linearize(skb))
1825                 goto out_kfree_skb;
1826
1827         /* Fragmented skb is linearized if device does not support SG,
1828          * or if at least one of fragments is in highmem and device
1829          * does not support DMA from it.
1830          */
1831         if (skb_shinfo(skb)->nr_frags &&
1832             (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
1833             __skb_linearize(skb))
1834                 goto out_kfree_skb;
1835
1836         /* If packet is not checksummed and device does not support
1837          * checksumming for this protocol, complete checksumming here.
1838          */
1839         if (skb->ip_summed == CHECKSUM_PARTIAL) {
1840                 skb_set_transport_header(skb, skb->csum_start -
1841                                               skb_headroom(skb));
1842                 if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
1843                         goto out_kfree_skb;
1844         }
1845
1846 gso:
1847         /* Disable soft irqs for various locks below. Also
1848          * stops preemption for RCU.
1849          */
1850         rcu_read_lock_bh();
1851
1852         txq = dev_pick_tx(dev, skb);
1853         q = rcu_dereference(txq->qdisc);
1854
1855 #ifdef CONFIG_NET_CLS_ACT
1856         skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
1857 #endif
             /*
              * The qdisc has an enqueue method: queue the buffer under the
              * qdisc's root lock and run the queue.  A deactivated qdisc
              * drops the buffer with NET_XMIT_DROP instead.
              */
1858         if (q->enqueue) {
1859                 spinlock_t *root_lock = qdisc_lock(q);
1860
1861                 spin_lock(root_lock);
1862
1863                 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
1864                         kfree_skb(skb);
1865                         rc = NET_XMIT_DROP;
1866                 } else {
1867                         rc = qdisc_enqueue_root(skb, q);
1868                         qdisc_run(q);
1869                 }
1870                 spin_unlock(root_lock);
1871
1872                 goto out;
1873         }
1874
1875         /* The device has no queue. Common case for software devices:
1876            loopback, all the sorts of tunnels...
1877
1878            Really, it is unlikely that netif_tx_lock protection is necessary
1879            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
1880            counters.)
1881            However, it is possible, that they rely on protection
1882            made by us here.
1883
1884            Check this and shot the lock. It is not prone from deadlocks.
1885            Either shot noqueue qdisc, it is even simpler 8)
1886          */
1887         if (dev->flags & IFF_UP) {
1888                 int cpu = smp_processor_id(); /* ok because BHs are off */
1889
                     /* This CPU owning the xmit lock already means we recursed. */
1890                 if (txq->xmit_lock_owner != cpu) {
1891
1892                         HARD_TX_LOCK(dev, txq, cpu);
1893
1894                         if (!netif_tx_queue_stopped(txq)) {
                                     /* rc == 0 is returned only if the xmit below returns 0. */
1895                                 rc = 0;
1896                                 if (!dev_hard_start_xmit(skb, dev, txq)) {
1897                                         HARD_TX_UNLOCK(dev, txq);
1898                                         goto out;
1899                                 }
1900                         }
                             /* Queue stopped or xmit refused: complain, then drop below. */
1901                         HARD_TX_UNLOCK(dev, txq);
1902                         if (net_ratelimit())
1903                                 printk(KERN_CRIT "Virtual device %s asks to "
1904                                        "queue packet!\n", dev->name);
1905                 } else {
1906                         /* Recursion is detected! It is possible,
1907                          * unfortunately */
1908                         if (net_ratelimit())
1909                                 printk(KERN_CRIT "Dead loop on virtual device "
1910                                        "%s, fix it urgently!\n", dev->name);
1911                 }
1912         }
1913
             /*
              * Reached when the device is down, its queue is stopped, the
              * transmit was refused or recursion was detected: the buffer is
              * freed and -ENETDOWN returned.
              */
1914         rc = -ENETDOWN;
1915         rcu_read_unlock_bh();
1916
             /* Also the target of the early failures above, taken before the RCU lock. */
1917 out_kfree_skb:
1918         kfree_skb(skb);
1919         return rc;
1920 out:
1921         rcu_read_unlock_bh();
1922         return rc;
1923 }
1924
1925
1926 /*=======================================================================
1927                         Receiver routines
1928   =======================================================================*/
1929
/* Limit on the per-CPU input_pkt_queue length checked by netif_rx(). */
1930 int netdev_max_backlog __read_mostly = 1000;
/*
 * Not referenced in this part of the file.
 * NOTE(review): presumably the packet budget of one RX softirq run - confirm.
 */
1931 int netdev_budget __read_mostly = 300;
/* Copied into napi->weight by process_backlog(). */
1932 int weight_p __read_mostly = 64;            /* old backlog weight */
1933
/* Per-CPU receive counters; netif_rx() bumps .total and .dropped. */
1934 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1935
1936
1937 /**
1938  *      netif_rx        -       post buffer to the network code
1939  *      @skb: buffer to post
1940  *
1941  *      This function receives a packet from a device driver and queues it for
1942  *      the upper (protocol) levels to process.  It always succeeds. The buffer
1943  *      may be dropped during processing for congestion control or by the
1944  *      protocol layers.
1945  *
1946  *      return values:
1947  *      NET_RX_SUCCESS  (no congestion)
1948  *      NET_RX_DROP     (packet was dropped)
1949  *
1950  */
1951
1952 int netif_rx(struct sk_buff *skb)
1953 {
1954         struct softnet_data *queue;
1955         unsigned long flags;
1956
1957         /* if netpoll wants it, pretend we never saw it */
1958         if (netpoll_rx(skb))
1959                 return NET_RX_DROP;
1960
             /* Stamp the packet unless it already carries a time stamp. */
1961         if (!skb->tstamp.tv64)
1962                 net_timestamp(skb);
1963
1964         /*
1965          * The code is rearranged so that the path is the most
1966          * short when CPU is congested, but is still operating.
1967          */
1968         local_irq_save(flags);
1969         queue = &__get_cpu_var(softnet_data);
1970
1971         __get_cpu_var(netdev_rx_stat).total++;
             /* Accept the packet while this CPU's backlog is within the limit. */
1972         if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
                     /* Non-empty queue: append without scheduling the backlog again. */
1973                 if (queue->input_pkt_queue.qlen) {
1974 enqueue:
1975                         __skb_queue_tail(&queue->input_pkt_queue, skb);
1976                         local_irq_restore(flags);
1977                         return NET_RX_SUCCESS;
1978                 }
1979
                     /* Empty queue: schedule the backlog NAPI, then append above. */
1980                 napi_schedule(&queue->backlog);
1981                 goto enqueue;
1982         }
1983
             /* Backlog over the limit: count the drop and free the buffer. */
1984         __get_cpu_var(netdev_rx_stat).dropped++;
1985         local_irq_restore(flags);
1986
1987         kfree_skb(skb);
1988         return NET_RX_DROP;
1989 }
1990
/*
 * netif_rx_ni - netif_rx() followed by immediate softirq processing
 * @skb: buffer to post
 *
 * Calls netif_rx() with preemption disabled and, if a softirq is pending
 * afterwards, runs it with do_softirq() before preemption is enabled again.
 * Returns whatever netif_rx() returned.
 *
 * NOTE(review): the "_ni" suffix presumably stands for "non-interrupt
 * context" - confirm with the callers.
 */
1991 int netif_rx_ni(struct sk_buff *skb)
1992 {
1993         int err;
1994
1995         preempt_disable();
1996         err = netif_rx(skb);
1997         if (local_softirq_pending())
1998                 do_softirq();
1999         preempt_enable();
2000
2001         return err;
2002 }
2003
2004 EXPORT_SYMBOL(netif_rx_ni);
2005
/*
 * net_tx_action - transmit softirq work for the current CPU
 * @h: softirq action descriptor (not used in the body)
 *
 * Does two things with this CPU's softnet_data:
 *  1. takes over the completion_queue list and frees every buffer on it;
 *  2. takes over the output_queue list of scheduled qdiscs and runs each
 *     one whose root lock can be taken without waiting.
 * Both lists are detached with local interrupts disabled and then processed
 * with interrupts enabled.
 */
2006 static void net_tx_action(struct softirq_action *h)
2007 {
2008         struct softnet_data *sd = &__get_cpu_var(softnet_data);
2009
2010         if (sd->completion_queue) {
2011                 struct sk_buff *clist;
2012
                     /* Detach the whole list atomically w.r.t. local interrupts. */
2013                 local_irq_disable();
2014                 clist = sd->completion_queue;
2015                 sd->completion_queue = NULL;
2016                 local_irq_enable();
2017
2018                 while (clist) {
2019                         struct sk_buff *skb = clist;
2020                         clist = clist->next;
2021
                             /* Buffers on this list are expected to have no users left. */
2022                         WARN_ON(atomic_read(&skb->users));
2023                         __kfree_skb(skb);
2024                 }
2025         }
2026
2027         if (sd->output_queue) {
2028                 struct Qdisc *head;
2029
2030                 local_irq_disable();
2031                 head = sd->output_queue;
2032                 sd->output_queue = NULL;
2033                 local_irq_enable();
2034
2035                 while (head) {
2036                         struct Qdisc *q = head;
2037                         spinlock_t *root_lock;
2038
2039                         head = head->next_sched;
2040
2041                         root_lock = qdisc_lock(q);
2042                         if (spin_trylock(root_lock)) {
                                     /* Got the lock: clear SCHED, then run the qdisc. */
2043                                 smp_mb__before_clear_bit();
2044                                 clear_bit(__QDISC_STATE_SCHED,
2045                                           &q->state);
2046                                 qdisc_run(q);
2047                                 spin_unlock(root_lock);
2048                         } else {
                                     /*
                                      * Lock is busy: reschedule an active
                                      * qdisc for later; for a deactivated
                                      * one just clear SCHED.
                                      */
2049                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
2050                                               &q->state)) {
2051                                         __netif_reschedule(q);
2052                                 } else {
2053                                         smp_mb__before_clear_bit();
2054                                         clear_bit(__QDISC_STATE_SCHED,
2055                                                   &q->state);
2056                                 }
2057                         }
2058                 }
2059         }
2060 }
2061
/*
 * deliver_skb - pass a buffer to one packet handler while keeping it
 * @skb: buffer to deliver
 * @pt_prev: handler whose ->func is called
 * @orig_dev: device handed to the handler as its last argument
 *
 * Increments skb->users before calling the handler, so the caller still
 * holds its own reference afterwards.  Returns the handler's return value.
 */
2062 static inline int deliver_skb(struct sk_buff *skb,
2063                               struct packet_type *pt_prev,
2064                               struct net_device *orig_dev)
2065 {
2066         atomic_inc(&skb->users);
2067         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2068 }
2069
2070 #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
2071 /* These hooks defined here for ATM */
2072 struct net_bridge;
2073 struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
2074                                                 unsigned char *addr);
2075 void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent) __read_mostly;
2076
2077 /*
2078  * If bridge module is loaded call bridging hook.
2079  *  returns NULL if packet was consumed.
2080  */
2081 struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
2082                                         struct sk_buff *skb) __read_mostly;
/*
 * handle_bridge - give a received buffer to the bridge hook if applicable
 * @skb: received buffer
 * @pt_prev: in/out: handler whose delivery is still pending, if any
 * @ret: out: return value of that pending delivery when it is performed
 * @orig_dev: device passed on to the pending handler
 *
 * Returns @skb untouched for PACKET_LOOPBACK packets and for devices with
 * no br_port.  Otherwise the pending handler (if any) is delivered to first
 * and the result of br_handle_frame_hook() is returned.
 *
 * NOTE(review): the hook pointer is called without a NULL check here;
 * presumably a non-NULL br_port implies that the hook is set - confirm.
 */
2083 static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
2084                                             struct packet_type **pt_prev, int *ret,
2085                                             struct net_device *orig_dev)
2086 {
2087         struct net_bridge_port *port;
2088
2089         if (skb->pkt_type == PACKET_LOOPBACK ||
2090             (port = rcu_dereference(skb->dev->br_port)) == NULL)
2091                 return skb;
2092
2093         if (*pt_prev) {
2094                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2095                 *pt_prev = NULL;
2096         }
2097
2098         return br_handle_frame_hook(port, skb);
2099 }
2100 #else
     /* Bridging not configured: the buffer passes through unchanged. */
2101 #define handle_bridge(skb, pt_prev, ret, orig_dev)      (skb)
2102 #endif
2103
2104 #if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
     /* Hook pointer called by handle_macvlan(); exported for GPL modules. */
2105 struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
2106 EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);
2107
/*
 * handle_macvlan - give a received buffer to the macvlan hook if applicable
 * @skb: received buffer
 * @pt_prev: in/out: handler whose delivery is still pending, if any
 * @ret: out: return value of that pending delivery when it is performed
 * @orig_dev: device passed on to the pending handler
 *
 * Returns @skb untouched when the receiving device has no macvlan_port.
 * Otherwise the pending handler (if any) is delivered to first and the
 * result of macvlan_handle_frame_hook() is returned.
 *
 * NOTE(review): the hook pointer is called without a NULL check here;
 * presumably a non-NULL macvlan_port implies that the hook is set - confirm.
 */
2108 static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
2109                                              struct packet_type **pt_prev,
2110                                              int *ret,
2111                                              struct net_device *orig_dev)
2112 {
2113         if (skb->dev->macvlan_port == NULL)
2114                 return skb;
2115
2116         if (*pt_prev) {
2117                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2118                 *pt_prev = NULL;
2119         }
2120         return macvlan_handle_frame_hook(skb);
2121 }
2122 #else
     /* macvlan not configured: the buffer passes through unchanged. */
2123 #define handle_macvlan(skb, pt_prev, ret, orig_dev)     (skb)
2124 #endif
2125
2126 #ifdef CONFIG_NET_CLS_ACT
2127 /* TODO: Maybe we should just force sch_ingress to be compiled in
2128  * when CONFIG_NET_CLS_ACT is? Otherwise we pay for some useless
2129  * instructions (a compare and 2 stores extra right now) if we don't
2130  * have it on but have CONFIG_NET_CLS_ACT.
2131  * NOTE: This doesn't stop any functionality; if you don't have
2132  * the ingress scheduler, you just can't add policies on ingress.
2133  *
2134  */
/*
 * ing_filter - run a received buffer through the device's ingress qdisc
 * @skb: received buffer
 *
 * Increments the redirect TTL kept in skb->tc_verd and marks the buffer as
 * being at ingress.  If the TTL had already exceeded MAX_RED_LOOP a warning
 * is printed and TC_ACT_SHOT is returned.  Otherwise, unless the qdisc on
 * dev->rx_queue is noop_qdisc, the buffer is enqueued on it under the qdisc
 * lock (skipped when the qdisc is deactivated).
 *
 * Returns the enqueue result, or TC_ACT_OK when nothing was enqueued.
 */
2135 static int ing_filter(struct sk_buff *skb)
2136 {
2137         struct net_device *dev = skb->dev;
2138         u32 ttl = G_TC_RTTL(skb->tc_verd);
2139         struct netdev_queue *rxq;
2140         int result = TC_ACT_OK;
2141         struct Qdisc *q;
2142
             /* The comparison uses the old TTL; the increment is stored below. */
2143         if (MAX_RED_LOOP < ttl++) {
2144                 printk(KERN_WARNING
2145                        "Redir loop detected Dropping packet (%d->%d)\n",
2146                        skb->iif, dev->ifindex);
2147                 return TC_ACT_SHOT;
2148         }
2149
2150         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2151         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2152
2153         rxq = &dev->rx_queue;
2154
2155         q = rxq->qdisc;
2156         if (q != &noop_qdisc) {
2157                 spin_lock(qdisc_lock(q));
2158                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2159                         result = qdisc_enqueue_root(skb, q);
2160                 spin_unlock(qdisc_lock(q));
2161         }
2162
2163         return result;
2164 }
2165
/*
 * handle_ing - ingress traffic-control step of the receive path
 * @skb: received buffer
 * @pt_prev: in/out: handler whose delivery is still pending, if any
 * @ret: out: return value of that pending delivery when it is performed
 * @orig_dev: device passed on to the pending handler
 *
 * Does nothing but clear skb->tc_verd when the device's rx_queue qdisc is
 * noop_qdisc.  Otherwise the pending handler is delivered to (or, if there
 * is none, the OK2MUNGE bit is set) and ing_filter() is run.  A verdict of
 * TC_ACT_SHOT or TC_ACT_STOLEN frees the buffer and returns NULL; any other
 * verdict clears tc_verd and returns @skb.
 */
2166 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2167                                          struct packet_type **pt_prev,
2168                                          int *ret, struct net_device *orig_dev)
2169 {
2170         if (skb->dev->rx_queue.qdisc == &noop_qdisc)
2171                 goto out;
2172
2173         if (*pt_prev) {
2174                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2175                 *pt_prev = NULL;
2176         } else {
2177                 /* Huh? Why does turning on AF_PACKET affect this? */
2178                 skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2179         }
2180
2181         switch (ing_filter(skb)) {
2182         case TC_ACT_SHOT:
2183         case TC_ACT_STOLEN:
2184                 kfree_skb(skb);
2185                 return NULL;
2186         }
2187
2188 out:
2189         skb->tc_verd = 0;
2190         return skb;
2191 }
2192 #endif
2192 #endif
2193
2194 /**
2195  *      netif_nit_deliver - deliver received packets to network taps
2196  *      @skb: buffer
2197  *
2198  *      This function is used to deliver incoming packets to network
2199  *      taps. It should be used when the normal netif_receive_skb path
2200  *      is bypassed, for example because of VLAN acceleration.
      *
      *      Every tap gets its own reference through deliver_skb(); the
      *      buffer itself is not freed here.
2201  */
2202 void netif_nit_deliver(struct sk_buff *skb)
2203 {
2204         struct packet_type *ptype;
2205
             /* No taps registered: nothing to do. */
2206         if (list_empty(&ptype_all))
2207                 return;
2208
2209         skb_reset_network_header(skb);
2210         skb_reset_transport_header(skb);
2211         skb->mac_len = skb->network_header - skb->mac_header;
2212
2213         rcu_read_lock();
2214         list_for_each_entry_rcu(ptype, &ptype_all, list) {
                     /* Taps bound to no device, or to the receiving device, match. */
2215                 if (!ptype->dev || ptype->dev == skb->dev)
2216                         deliver_skb(skb, ptype, skb->dev);
2217         }
2218         rcu_read_unlock();
2219 }
2220
2221 /**
2222  *      netif_receive_skb - process receive buffer from network
2223  *      @skb: buffer to process
2224  *
2225  *      netif_receive_skb() is the main receive data processing function.
2226  *      It always succeeds. The buffer may be dropped during processing
2227  *      for congestion control or by the protocol layers.
2228  *
2229  *      This function may only be called from softirq context and interrupts
2230  *      should be enabled.
2231  *
2232  *      Return values (usually ignored):
2233  *      NET_RX_SUCCESS: no congestion
2234  *      NET_RX_DROP: packet was dropped
2235  */
2236 int netif_receive_skb(struct sk_buff *skb)
2237 {
             /*
              * pt_prev holds the most recent matching handler whose delivery
              * is still pending.  Each earlier match is delivered through
              * deliver_skb() (which takes an extra reference); the final one
              * is called directly and so gets the caller's reference.
              */
2238         struct packet_type *ptype, *pt_prev;
2239         struct net_device *orig_dev;
2240         struct net_device *null_or_orig;
2241         int ret = NET_RX_DROP;
2242         __be16 type;
2243
2244         if (skb->vlan_tci && vlan_hwaccel_do_receive(skb))
2245                 return NET_RX_SUCCESS;
2246
2247         /* if we've gotten here through NAPI, check netpoll */
2248         if (netpoll_receive_skb(skb))
2249                 return NET_RX_DROP;
2250
2251         if (!skb->tstamp.tv64)
2252                 net_timestamp(skb);
2253
2254         if (!skb->iif)
2255                 skb->iif = skb->dev->ifindex;
2256
             /*
              * Device with a master: either keep skb->dev and restrict
              * delivery to handlers bound to the receiving device, or
              * re-home the buffer onto the master.
              */
2257         null_or_orig = NULL;
2258         orig_dev = skb->dev;
2259         if (orig_dev->master) {
2260                 if (skb_bond_should_drop(skb))
2261                         null_or_orig = orig_dev; /* deliver only exact match */
2262                 else
2263                         skb->dev = orig_dev->master;
2264         }
2265
2266         __get_cpu_var(netdev_rx_stat).total++;
2267
2268         skb_reset_network_header(skb);
2269         skb_reset_transport_header(skb);
2270         skb->mac_len = skb->network_header - skb->mac_header;
2271
2272         pt_prev = NULL;
2273
2274         rcu_read_lock();
2275
2276         /* Don't receive packets in an exiting network namespace */
2277         if (!net_alive(dev_net(skb->dev))) {
2278                 kfree_skb(skb);
2279                 goto out;
2280         }
2281
2282 #ifdef CONFIG_NET_CLS_ACT
             /* TC_NCLS set: clear it and skip the taps and ingress classification. */
2283         if (skb->tc_verd & TC_NCLS) {
2284                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2285                 goto ncls;
2286         }
2287 #endif
2288
             /* Pass 1: taps registered on ptype_all. */
2289         list_for_each_entry_rcu(ptype, &ptype_all, list) {
2290                 if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2291                     ptype->dev == orig_dev) {
2292                         if (pt_prev)
2293                                 ret = deliver_skb(skb, pt_prev, orig_dev);
2294                         pt_prev = ptype;
2295                 }
2296         }
2297
2298 #ifdef CONFIG_NET_CLS_ACT
2299         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2300         if (!skb)
2301                 goto out;
2302 ncls:
2303 #endif
2304
             /* Each hook returns NULL when the buffer is no longer ours to process. */
2305         skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
2306         if (!skb)
2307                 goto out;
2308         skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
2309         if (!skb)
2310                 goto out;
2311
2312         skb_orphan(skb);
2313
             /* Pass 2: protocol handlers in the ptype_base bucket for skb->protocol. */
2314         type = skb->protocol;
2315         list_for_each_entry_rcu(ptype,
2316                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2317                 if (ptype->type == type &&
2318                     (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2319                      ptype->dev == orig_dev)) {
2320                         if (pt_prev)
2321                                 ret = deliver_skb(skb, pt_prev, orig_dev);
2322                         pt_prev = ptype;
2323                 }
2324         }
2325
             /* Final handler is called without an extra reference; none => drop. */
2326         if (pt_prev) {
2327                 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2328         } else {
2329                 kfree_skb(skb);
2330                 /* Jamal, now you will not able to escape explaining
2331                  * me how you were going to use this. :-)
2332                  */
2333                 ret = NET_RX_DROP;
2334         }
2335
2336 out:
2337         rcu_read_unlock();
2338         return ret;
2339 }
2340
2341 /* Network device is going away, flush any packets still pending  */
/*
 * @arg: the struct net_device being removed, passed as void *.
 *
 * Walks the current CPU's input_pkt_queue and unlinks and frees every buffer
 * whose skb->dev is that device.  Only the queue of the CPU this runs on is
 * touched.
 * NOTE(review): presumably invoked once on every CPU by the caller, with
 * interrupts disabled since the unlocked __skb_unlink() is used - confirm.
 */
2342 static void flush_backlog(void *arg)
2343 {
2344         struct net_device *dev = arg;
2345         struct softnet_data *queue = &__get_cpu_var(softnet_data);
2346         struct sk_buff *skb, *tmp;
2347
             /* The _safe walk allows unlinking the current entry. */
2348         skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp)
2349                 if (skb->dev == dev) {
2350                         __skb_unlink(skb, &queue->input_pkt_queue);
2351                         kfree_skb(skb);
2352                 }
2353 }
2354
/*
 * napi_gro_complete - finish a held GRO packet and pass it up the stack
 * @skb: packet taken off a napi gro_list
 *
 * A packet whose NAPI_GRO_CB count is 1 skips the protocol callback.
 * Otherwise the gro_complete() callback of the first packet type that
 * matches skb->protocol, is not bound to a device and provides such a
 * callback is invoked.  If no callback ran, or it returned an error, the
 * packet is freed and NET_RX_SUCCESS is returned.  On success gso_size is
 * cleared and the result of netif_receive_skb() is returned.
 */
2355 static int napi_gro_complete(struct sk_buff *skb)
2356 {
2357         struct packet_type *ptype;
2358         __be16 type = skb->protocol;
2359         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
             /* Stays -ENOENT when no matching handler is found. */
2360         int err = -ENOENT;
2361
2362         if (NAPI_GRO_CB(skb)->count == 1)
2363                 goto out;
2364
2365         rcu_read_lock();
2366         list_for_each_entry_rcu(ptype, head, list) {
2367                 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
2368                         continue;
2369
2370                 err = ptype->gro_complete(skb);
2371                 break;
2372         }
2373         rcu_read_unlock();
2374
2375         if (err) {
                     /* Warn when the loop ran off the end, i.e. no handler matched. */
2376                 WARN_ON(&ptype->list == head);
2377                 kfree_skb(skb);
2378                 return NET_RX_SUCCESS;
2379         }
2380
2381 out:
2382         skb_shinfo(skb)->gso_size = 0;
2383         return netif_receive_skb(skb);
2384 }
2385
/**
 *      napi_gro_flush - complete every packet held on a napi's GRO list
 *      @napi: napi context whose gro_list is drained
 *
 *      Each held packet is detached (its ->next cleared) and passed to
 *      napi_gro_complete(); afterwards gro_count is reset to 0 and gro_list
 *      to NULL.
 */
2386 void napi_gro_flush(struct napi_struct *napi)
2387 {
2388         struct sk_buff *skb, *next;
2389
2390         for (skb = napi->gro_list; skb; skb = next) {
                     /* Save the successor before the link is cleared. */
2391                 next = skb->next;
2392                 skb->next = NULL;
2393                 napi_gro_complete(skb);
2394         }
2395
2396         napi->gro_count = 0;
2397         napi->gro_list = NULL;
2398 }
2399 EXPORT_SYMBOL(napi_gro_flush);
2400
/**
 *      skb_gro_header - get a pointer to @hlen header bytes at the GRO offset
 *      @skb: buffer being examined
 *      @hlen: number of bytes needed, counted from the current GRO offset
 *
 *      If the bytes lie inside the linear area, a pointer into skb->data is
 *      returned.  If there are no page fragments, the first fragment is too
 *      small, or its page is in high memory, pskb_may_pull() is used and
 *      the result is a pointer into skb->data, or NULL on failure.
 *      Otherwise a pointer straight into the first fragment's page is
 *      returned.
 *
 *      NOTE(review): the fragment case computes "offset - skb_headlen(skb)",
 *      which assumes the header starts inside frags[0]; no check for a
 *      header that begins in the linear area and continues into the
 *      fragment is visible here - confirm.
 */
2401 void *skb_gro_header(struct sk_buff *skb, unsigned int hlen)
2402 {
2403         unsigned int offset = skb_gro_offset(skb);
2404
             /* From here on hlen is the end offset of the requested bytes. */
2405         hlen += offset;
2406         if (hlen <= skb_headlen(skb))
2407                 return skb->data + offset;
2408
2409         if (unlikely(!skb_shinfo(skb)->nr_frags ||
2410                      skb_shinfo(skb)->frags[0].size <=
2411                      hlen - skb_headlen(skb) ||
2412                      PageHighMem(skb_shinfo(skb)->frags[0].page)))
2413                 return pskb_may_pull(skb, hlen) ? skb->data + offset : NULL;
2414
2415         return page_address(skb_shinfo(skb)->frags[0].page) +
2416                skb_shinfo(skb)->frags[0].page_offset +
2417                offset - skb_headlen(skb);
2418 }
2419 EXPORT_SYMBOL(skb_gro_header);
2420
/**
 *      dev_gro_receive - try to merge a received buffer into a held GRO packet
 *      @napi: napi context holding the gro_list
 *      @skb: newly received buffer
 *
 *      Returns one of:
 *      GRO_MERGED / GRO_MERGED_FREE - the protocol callback reported the
 *              same flow; _FREE is used when it set NAPI_GRO_CB(skb)->free
 *      GRO_HELD   - @skb was put at the head of napi->gro_list
 *      GRO_NORMAL - GRO does not apply; the caller processes @skb normally
 *      GRO_DROP   - pskb_may_pull() up to the GRO offset failed
 */
2421 int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2422 {
2423         struct sk_buff **pp = NULL;
2424         struct packet_type *ptype;
2425         __be16 type = skb->protocol;
2426         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2427         int same_flow;
2428         int mac_len;
2429         int ret;
2430
2431         if (!(skb->dev->features & NETIF_F_GRO))
2432                 goto normal;
2433
2434         if (skb_is_gso(skb) || skb_shinfo(skb)->frag_list)
2435                 goto normal;
2436
             /* Use the first unbound handler for this protocol that has gro_receive. */
2437         rcu_read_lock();
2438         list_for_each_entry_rcu(ptype, head, list) {
2439                 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
2440                         continue;
2441
2442                 skb_set_network_header(skb, skb_gro_offset(skb));
2443                 mac_len = skb->network_header - skb->mac_header;
2444                 skb->mac_len = mac_len;
2445                 NAPI_GRO_CB(skb)->same_flow = 0;
2446                 NAPI_GRO_CB(skb)->flush = 0;
2447                 NAPI_GRO_CB(skb)->free = 0;
2448
2449                 pp = ptype->gro_receive(&napi->gro_list, skb);
2450                 break;
2451         }
2452         rcu_read_unlock();
2453
             /* The loop ran off the end of the list: no handler was found. */
2454         if (&ptype->list == head)
2455                 goto normal;
2456
2457         same_flow = NAPI_GRO_CB(skb)->same_flow;
2458         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
2459
             /* The handler asked for the held packet at *pp to be completed now. */
2460         if (pp) {
2461                 struct sk_buff *nskb = *pp;
2462
2463                 *pp = nskb->next;
2464                 nskb->next = NULL;
2465                 napi_gro_complete(nskb);
2466                 napi->gro_count--;
2467         }
2468
2469         if (same_flow)
2470                 goto ok;
2471
2472         if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
2473                 goto normal;
2474
             /* Hold the buffer as a new flow at the head of the list. */
2475         napi->gro_count++;
2476         NAPI_GRO_CB(skb)->count = 1;
2477         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
2478         skb->next = napi->gro_list;
2479         napi->gro_list = skb;
2480         ret = GRO_HELD;
2481
2482 pull:
             /*
              * Reached for GRO_HELD and GRO_NORMAL.  If the pull fails and skb
              * is at the head of the list, it is unlinked again.
              * NOTE(review): on that path napi->gro_count is not decremented
              * and skb->next is not cleared - confirm whether intended.
              */
2483         if (unlikely(!pskb_may_pull(skb, skb_gro_offset(skb)))) {
2484                 if (napi->gro_list == skb)
2485                         napi->gro_list = skb->next;
2486                 ret = GRO_DROP;
2487         }
2488
2489 ok:
2490         return ret;
2491
2492 normal:
2493         ret = GRO_NORMAL;
2494         goto pull;
2495 }
2496 EXPORT_SYMBOL(dev_gro_receive);
2497
/*
 * __napi_gro_receive - prepare the held packets, then run dev_gro_receive()
 * @napi: napi context holding the gro_list
 * @skb: newly received buffer
 *
 * For every packet on napi->gro_list, same_flow is set to whether its MAC
 * header compares equal to that of @skb (compare_ether_header() result
 * negated) and flush is cleared.  Returns dev_gro_receive()'s result.
 */
2498 static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2499 {
2500         struct sk_buff *p;
2501
2502         for (p = napi->gro_list; p; p = p->next) {
2503                 NAPI_GRO_CB(p)->same_flow = !compare_ether_header(
2504                         skb_mac_header(p), skb_gro_mac_header(skb));
2505                 NAPI_GRO_CB(p)->flush = 0;
2506         }
2507
2508         return dev_gro_receive(napi, skb);
2509 }
2510
/**
 *      napi_skb_finish - act on a GRO verdict for a buffer
 *      @ret: verdict, one of the GRO_* values
 *      @skb: buffer the verdict applies to
 *
 *      GRO_NORMAL passes the buffer to netif_receive_skb() and returns its
 *      result.  GRO_DROP frees the buffer and returns NET_RX_DROP.
 *      GRO_MERGED_FREE frees the buffer.  Any other verdict leaves the
 *      buffer untouched.  Everything except GRO_NORMAL and GRO_DROP
 *      returns NET_RX_SUCCESS.
 */
2511 int napi_skb_finish(int ret, struct sk_buff *skb)
2512 {
2513         int err = NET_RX_SUCCESS;
2514
2515         switch (ret) {
2516         case GRO_NORMAL:
2517                 return netif_receive_skb(skb);
2518
2519         case GRO_DROP:
2520                 err = NET_RX_DROP;
2521                 /* fall through */
2522
2523         case GRO_MERGED_FREE:
2524                 kfree_skb(skb);
2525                 break;
2526         }
2527
2528         return err;
2529 }
2530 EXPORT_SYMBOL(napi_skb_finish);
2531
/**
 *      napi_gro_receive - GRO entry point for a complete buffer
 *      @napi: napi context the buffer was received on
 *      @skb: received buffer
 *
 *      Resets the buffer's GRO offset, runs it through __napi_gro_receive()
 *      and hands the verdict to napi_skb_finish(), whose result is returned.
 */
2532 int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2533 {
2534         skb_gro_reset_offset(skb);
2535
2536         return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
2537 }
2538 EXPORT_SYMBOL(napi_gro_receive);
2539
/**
 *      napi_reuse_skb - stash a buffer in the napi context for later reuse
 *      @napi: napi context that will own the buffer
 *      @skb: buffer to recycle
 *
 *      Pulls all of the linear data, adjusts the headroom so that it equals
 *      NET_IP_ALIGN, and stores the buffer in napi->skb, which is where
 *      napi_fraginfo_skb() looks before allocating a new one.
 */
2540 void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
2541 {
2542         __skb_pull(skb, skb_headlen(skb));
             /* The argument is the difference to the current headroom. */
2543         skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
2544
2545         napi->skb = skb;
2546 }
2547 EXPORT_SYMBOL(napi_reuse_skb);
2548
/**
 *      napi_fraginfo_skb - build a buffer from a page-fragment description
 *      @napi: napi context; napi->skb is consumed if it holds a buffer
 *      @info: fragments, total length and checksum state of the packet
 *
 *      Takes the buffer cached in napi->skb or allocates a new one, attaches
 *      the page fragments from @info, accounts for info->len, and reads the
 *      Ethernet header through skb_gro_header() to set skb->protocol.
 *
 *      Returns the buffer, or NULL if allocation failed or the Ethernet
 *      header could not be obtained (the buffer is then handed back to
 *      napi_reuse_skb()).
 */
2549 struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi,
2550                                   struct napi_gro_fraginfo *info)
2551 {
2552         struct net_device *dev = napi->dev;
2553         struct sk_buff *skb = napi->skb;
2554         struct ethhdr *eth;
2555         skb_frag_t *frag;
2556         int i;
2557
             /* The cached buffer, if any, now belongs to this call. */
2558         napi->skb = NULL;
2559
2560         if (!skb) {
2561                 skb = netdev_alloc_skb(dev, GRO_MAX_HEAD + NET_IP_ALIGN);
2562                 if (!skb)
2563                         goto out;
2564
2565                 skb_reserve(skb, NET_IP_ALIGN);
2566         }
2567
2568         BUG_ON(info->nr_frags > MAX_SKB_FRAGS);
             /*
              * NOTE(review): frag starts at the LAST entry of info->frags,
              * yet the loop below runs from the buffer's current nr_frags up
              * to info->nr_frags and advances frag each time.  For a buffer
              * with nr_frags == 0 and info->nr_frags > 1 this reads entries
              * past the last one - confirm which invariant makes the loop
              * run at most once.
              */
2569         frag = &info->frags[info->nr_frags - 1];
2570
2571         for (i = skb_shinfo(skb)->nr_frags; i < info->nr_frags; i++) {
2572                 skb_fill_page_desc(skb, i, frag->page, frag->page_offset,
2573                                    frag->size);
2574                 frag++;
2575         }
2576         skb_shinfo(skb)->nr_frags = info->nr_frags;
2577
2578         skb->data_len = info->len;
2579         skb->len += info->len;
2580         skb->truesize += info->len;
2581
2582         skb_reset_mac_header(skb);
2583         skb_gro_reset_offset(skb);
2584
2585         eth = skb_gro_header(skb, sizeof(*eth));
2586         if (!eth) {
2587                 napi_reuse_skb(napi, skb);
2588                 skb = NULL;
2589                 goto out;
2590         }
2591
             /* Advance the GRO offset past the Ethernet header. */
2592         skb_gro_pull(skb, sizeof(*eth));
2593
2594         /*
2595          * This works because the only protocols we care about don't require
2596          * special handling.  We'll fix it up properly at the end.
2597          */
2598         skb->protocol = eth->h_proto;
2599
2600         skb->ip_summed = info->ip_summed;
2601         skb->csum = info->csum;
2602
2603 out:
2604         return skb;
2605 }
2606 EXPORT_SYMBOL(napi_fraginfo_skb);
2607
2608 int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret)
2609 {
2610         int err = NET_RX_SUCCESS;
2611
2612         switch (ret) {
2613         case GRO_NORMAL:
2614         case GRO_HELD:
2615                 skb->protocol = eth_type_trans(skb, napi->dev);
2616
2617                 if (ret == GRO_NORMAL)
2618                         return netif_receive_skb(skb);
2619
2620                 skb_gro_pull(skb, -ETH_HLEN);
2621                 break;
2622
2623         case GRO_DROP:
2624                 err = NET_RX_DROP;
2625                 /* fall through */
2626
2627         case GRO_MERGED_FREE:
2628                 napi_reuse_skb(napi, skb);
2629                 break;
2630         }
2631
2632         return err;
2633 }
2634 EXPORT_SYMBOL(napi_frags_finish);
2635
2636 int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info)
2637 {
2638         struct sk_buff *skb = napi_fraginfo_skb(napi, info);
2639
2640         if (!skb)
2641                 return NET_RX_DROP;
2642
2643         return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
2644 }
2645 EXPORT_SYMBOL(napi_gro_frags);
2646
2647 static int process_backlog(struct napi_struct *napi, int quota)
2648 {
2649         int work = 0;
2650         struct softnet_data *queue = &__get_cpu_var(softnet_data);
2651         unsigned long start_time = jiffies;
2652
2653         napi->weight = weight_p;
2654         do {
2655                 struct sk_buff *skb;
2656
2657                 local_irq_disable();
2658                 skb = __skb_dequeue(&queue->input_pkt_queue);
2659                 if (!skb) {
2660                         __napi_complete(napi);
2661                         local_irq_enable();
2662                         break;
2663                 }
2664                 local_irq_enable();
2665
2666                 napi_gro_receive(napi, skb);
2667         } while (++work < quota && jiffies == start_time);
2668
2669         napi_gro_flush(napi);
2670
2671         return work;
2672 }
2673
2674 /**
2675  * __napi_schedule - schedule for receive
2676  * @n: entry to schedule
2677  *
2678  * The entry's receive function will be scheduled to run
2679  */
2680 void __napi_schedule(struct napi_struct *n)
2681 {
2682         unsigned long flags;
2683
2684         local_irq_save(flags);
2685         list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
2686         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2687         local_irq_restore(flags);
2688 }
2689 EXPORT_SYMBOL(__napi_schedule);
2690
/*
 *      Take @n off the poll list and clear its NAPI_STATE_SCHED bit.
 *
 *      It is a bug to call this for an instance that is not scheduled or
 *      that still has packets on its GRO list.  The callers in this file
 *      invoke it with local interrupts disabled.
 */
void __napi_complete(struct napi_struct *n)
{
        BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
        BUG_ON(n->gro_list);

        list_del(&n->poll_list);
        /* Barrier between the list removal and clearing the SCHED bit. */
        smp_mb__before_clear_bit();
        clear_bit(NAPI_STATE_SCHED, &n->state);
}
EXPORT_SYMBOL(__napi_complete);
2701
2702 void napi_complete(struct napi_struct *n)
2703 {
2704         unsigned long flags;
2705
2706         /*
2707          * don't let napi dequeue from the cpu poll list
2708          * just in case its running on a different cpu
2709          */
2710         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
2711                 return;
2712
2713         napi_gro_flush(n);
2714         local_irq_save(flags);
2715         __napi_complete(n);
2716         local_irq_restore(flags);
2717 }
2718 EXPORT_SYMBOL(napi_complete);
2719
2720 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
2721                     int (*poll)(struct napi_struct *, int), int weight)
2722 {
2723         INIT_LIST_HEAD(&napi->poll_list);
2724         napi->gro_count = 0;
2725         napi->gro_list = NULL;
2726         napi->skb = NULL;
2727         napi->poll = poll;
2728         napi->weight = weight;
2729         list_add(&napi->dev_list, &dev->napi_list);
2730         napi->dev = dev;
2731 #ifdef CONFIG_NETPOLL
2732         spin_lock_init(&napi->poll_lock);
2733         napi->poll_owner = -1;
2734 #endif
2735         set_bit(NAPI_STATE_SCHED, &napi->state);
2736 }
2737 EXPORT_SYMBOL(netif_napi_add);
2738
2739 void netif_napi_del(struct napi_struct *napi)
2740 {
2741         struct sk_buff *skb, *next;
2742
2743         list_del_init(&napi->dev_list);
2744         kfree(napi->skb);
2745
2746         for (skb = napi->gro_list; skb; skb = next) {
2747                 next = skb->next;
2748                 skb->next = NULL;
2749                 kfree_skb(skb);
2750         }
2751
2752         napi->gro_list = NULL;
2753         napi->gro_count = 0;
2754 }
2755 EXPORT_SYMBOL(netif_napi_del);
2756
2757
/*
 *      NET_RX softirq handler.
 *
 *      Polls the NAPI instances queued on this CPU's poll list, one at a
 *      time, until the list is empty, the global budget (netdev_budget) is
 *      used up, or more than two jiffies have elapsed.  In the latter two
 *      cases the time_squeeze counter is bumped and the softirq is raised
 *      again so the remaining work is picked up later.
 */
static void net_rx_action(struct softirq_action *h)
{
        struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
        unsigned long time_limit = jiffies + 2;
        int budget = netdev_budget;
        void *have;

        local_irq_disable();

        while (!list_empty(list)) {
                struct napi_struct *n;
                int work, weight;

                /* If the softirq window is exhausted then punt.
                 * Allow this to run for 2 jiffies, which will allow
                 * an average latency of 1.5/HZ.
                 */
                if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
                        goto softnet_break;

                local_irq_enable();

                /* Even though interrupts have been re-enabled, this
                 * access is safe because interrupts can only add new
                 * entries to the tail of this list, and only ->poll()
                 * calls can remove this head entry from the list.
                 */
                n = list_entry(list->next, struct napi_struct, poll_list);

                have = netpoll_poll_lock(n);

                weight = n->weight;

                /* This NAPI_STATE_SCHED test is for avoiding a race
                 * with netpoll's poll_napi().  Only the entity which
                 * obtains the lock and sees NAPI_STATE_SCHED set will
                 * actually make the ->poll() call.  Therefore we avoid
                 * accidentally calling ->poll() when NAPI is not scheduled.
                 */
                work = 0;
                if (test_bit(NAPI_STATE_SCHED, &n->state))
                        work = n->poll(n, weight);

                /* A poll routine must never report more than its weight. */
                WARN_ON_ONCE(work > weight);

                budget -= work;

                local_irq_disable();

                /* Drivers must not modify the NAPI state if they
                 * consume the entire weight.  In such cases this code
                 * still "owns" the NAPI instance and therefore can
                 * move the instance around on the list at-will.
                 */
                if (unlikely(work == weight)) {
                        if (unlikely(napi_disable_pending(n)))
                                __napi_complete(n);
                        else
                                list_move_tail(&n->poll_list, list);
                }

                netpoll_poll_unlock(have);
        }
out:
        local_irq_enable();

#ifdef CONFIG_NET_DMA
        /*
         * There may not be any more sk_buffs coming right now, so push
         * any pending DMA copies to hardware
         */
        dma_issue_pending_all();
#endif

        return;

softnet_break:
        /* Out of budget or time: account for it and reschedule ourselves. */
        __get_cpu_var(netdev_rx_stat).time_squeeze++;
        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
        goto out;
}
2839
2840 static gifconf_func_t * gifconf_list [NPROTO];
2841
2842 /**
2843  *      register_gifconf        -       register a SIOCGIF handler
2844  *      @family: Address family
2845  *      @gifconf: Function handler
2846  *
2847  *      Register protocol dependent address dumping routines. The handler
2848  *      that is passed must not be freed or reused until it has been replaced
2849  *      by another handler.
2850  */
2851 int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
2852 {
2853         if (family >= NPROTO)
2854                 return -EINVAL;
2855         gifconf_list[family] = gifconf;
2856         return 0;
2857 }
2858
2859
2860 /*
2861  *      Map an interface index to its name (SIOCGIFNAME)
2862  */
2863
2864 /*
2865  *      We need this ioctl for efficient implementation of the
2866  *      if_indextoname() function required by the IPv6 API.  Without
2867  *      it, we would have to search all the interfaces to find a
2868  *      match.  --pb
2869  */
2870
2871 static int dev_ifname(struct net *net, struct ifreq __user *arg)
2872 {
2873         struct net_device *dev;
2874         struct ifreq ifr;
2875
2876         /*
2877          *      Fetch the caller's info block.
2878          */
2879
2880         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2881                 return -EFAULT;
2882
2883         read_lock(&dev_base_lock);
2884         dev = __dev_get_by_index(net, ifr.ifr_ifindex);
2885         if (!dev) {
2886                 read_unlock(&dev_base_lock);
2887                 return -ENODEV;
2888         }
2889
2890         strcpy(ifr.ifr_name, dev->name);
2891         read_unlock(&dev_base_lock);
2892
2893         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
2894                 return -EFAULT;
2895         return 0;
2896 }
2897
2898 /*
2899  *      Perform a SIOCGIFCONF call. This structure will change
2900  *      size eventually, and there is nothing I can do about it.
2901  *      Thus we will need a 'compatibility mode'.
2902  */
2903
2904 static int dev_ifconf(struct net *net, char __user *arg)
2905 {
2906         struct ifconf ifc;
2907         struct net_device *dev;
2908         char __user *pos;
2909         int len;
2910         int total;
2911         int i;
2912
2913         /*
2914          *      Fetch the caller's info block.
2915          */
2916
2917         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
2918                 return -EFAULT;
2919
2920         pos = ifc.ifc_buf;
2921         len = ifc.ifc_len;
2922
2923         /*
2924          *      Loop over the interfaces, and write an info block for each.
2925          */
2926
2927         total = 0;
2928         for_each_netdev(net, dev) {
2929                 for (i = 0; i < NPROTO; i++) {
2930                         if (gifconf_list[i]) {
2931                                 int done;
2932                                 if (!pos)
2933                                         done = gifconf_list[i](dev, NULL, 0);
2934                                 else
2935                                         done = gifconf_list[i](dev, pos + total,
2936                                                                len - total);
2937                                 if (done < 0)
2938                                         return -EFAULT;
2939                                 total += done;
2940                         }
2941                 }
2942         }
2943
2944         /*
2945          *      All done.  Write the updated control block back to the caller.
2946          */
2947         ifc.ifc_len = total;
2948
2949         /*
2950          *      Both BSD and Solaris return 0 here, so we do too.
2951          */
2952         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
2953 }
2954
2955 #ifdef CONFIG_PROC_FS
2956 /*
2957  *      This is invoked by the /proc filesystem handler to display a device
2958  *      in detail.
2959  */
2960 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
2961         __acquires(dev_base_lock)
2962 {
2963         struct net *net = seq_file_net(seq);
2964         loff_t off;
2965         struct net_device *dev;
2966
2967         read_lock(&dev_base_lock);
2968         if (!*pos)
2969                 return SEQ_START_TOKEN;
2970
2971         off = 1;
2972         for_each_netdev(net, dev)
2973                 if (off++ == *pos)
2974                         return dev;
2975
2976         return NULL;
2977 }
2978
2979 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2980 {
2981         struct net *net = seq_file_net(seq);
2982         ++*pos;
2983         return v == SEQ_START_TOKEN ?
2984                 first_net_device(net) : next_net_device((struct net_device *)v);
2985 }
2986
/* seq_file stop callback: drop the read lock taken by dev_seq_start(). */
void dev_seq_stop(struct seq_file *seq, void *v)
        __releases(dev_base_lock)
{
        read_unlock(&dev_base_lock);
}
2992
2993 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
2994 {
2995         const struct net_device_stats *stats = dev_get_stats(dev);
2996
2997         seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
2998                    "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
2999                    dev->name, stats->rx_bytes, stats->rx_packets,
3000                    stats->rx_errors,
3001                    stats->rx_dropped + stats->rx_missed_errors,
3002                    stats->rx_fifo_errors,
3003                    stats->rx_length_errors + stats->rx_over_errors +
3004                     stats->rx_crc_errors + stats->rx_frame_errors,
3005                    stats->rx_compressed, stats->multicast,
3006                    stats->tx_bytes, stats->tx_packets,
3007                    stats->tx_errors, stats->tx_dropped,
3008                    stats->tx_fifo_errors, stats->collisions,
3009                    stats->tx_carrier_errors +
3010                     stats->tx_aborted_errors +
3011                     stats->tx_window_errors +
3012                     stats->tx_heartbeat_errors,
3013                    stats->tx_compressed);
3014 }
3015
3016 /*
3017  *      Called from the PROCfs module. This now uses the new arbitrary sized
3018  *      /proc/net interface to create /proc/net/dev
3019  */
3020 static int dev_seq_show(struct seq_file *seq, void *v)
3021 {
3022         if (v == SEQ_START_TOKEN)
3023                 seq_puts(seq, "Inter-|   Receive                            "
3024                               "                    |  Transmit\n"
3025                               " face |bytes    packets errs drop fifo frame "
3026                               "compressed multicast|bytes    packets errs "
3027                               "drop fifo colls carrier compressed\n");
3028         else
3029                 dev_seq_printf_stats(seq, v);
3030         return 0;
3031 }
3032
3033 static struct netif_rx_stats *softnet_get_online(loff_t *pos)
3034 {
3035         struct netif_rx_stats *rc = NULL;
3036
3037         while (*pos < nr_cpu_ids)
3038                 if (cpu_online(*pos)) {
3039                         rc = &per_cpu(netdev_rx_stat, *pos);
3040                         break;
3041                 } else
3042                         ++*pos;
3043         return rc;
3044 }
3045
/* seq_file start callback: stats of the first online CPU at or after *pos. */
static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
{
        return softnet_get_online(pos);
}
3050
/* seq_file next callback: step past the current CPU to the next online one. */
static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return softnet_get_online(pos);
}
3056
/* seq_file stop callback: softnet_seq_start() takes no lock, nothing to undo. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
3060
/*
 *      Print one line of nine hexadecimal columns for the per-CPU receive
 *      statistics @v: total, dropped, time_squeeze, five zero placeholders
 *      and cpu_collision.
 */
static int softnet_seq_show(struct seq_file *seq, void *v)
{
        struct netif_rx_stats *s = v;

        seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
                   s->total, s->dropped, s->time_squeeze, 0,
                   0, 0, 0, 0, /* was fastroute */
                   s->cpu_collision );
        return 0;
}
3071
/* seq_file iterator for the per-device statistics listing. */
static const struct seq_operations dev_seq_ops = {
        .start = dev_seq_start,
        .next  = dev_seq_next,
        .stop  = dev_seq_stop,
        .show  = dev_seq_show,
};
3078
/* Open handler: set up a namespace-aware seq_file using dev_seq_ops. */
static int dev_seq_open(struct inode *inode, struct file *file)
{
        return seq_open_net(inode, file, &dev_seq_ops,
                            sizeof(struct seq_net_private));
}
3084
/* File operations of the "dev" proc entry (registered in dev_proc_net_init). */
static const struct file_operations dev_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = dev_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_net,
};
3092
/* seq_file iterator for the per-CPU receive statistics. */
static const struct seq_operations softnet_seq_ops = {
        .start = softnet_seq_start,
        .next  = softnet_seq_next,
        .stop  = softnet_seq_stop,
        .show  = softnet_seq_show,
};
3099
/* Open handler: plain seq_open(), no per-namespace private data is used. */
static int softnet_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &softnet_seq_ops);
}
3104
/* File operations of the "softnet_stat" proc entry. */
static const struct file_operations softnet_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = softnet_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};
3112
3113 static void *ptype_get_idx(loff_t pos)
3114 {
3115         struct packet_type *pt = NULL;
3116         loff_t i = 0;
3117         int t;
3118
3119         list_for_each_entry_rcu(pt, &ptype_all, list) {
3120                 if (i == pos)
3121                         return pt;
3122                 ++i;
3123         }
3124
3125         for (t = 0; t < PTYPE_HASH_SIZE; t++) {
3126                 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3127                         if (i == pos)
3128                                 return pt;
3129                         ++i;
3130                 }
3131         }
3132         return NULL;
3133 }
3134
3135 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
3136         __acquires(RCU)
3137 {
3138         rcu_read_lock();
3139         return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3140 }
3141
/*
 *      seq_file next callback: advance *pos and return the packet handler
 *      following @v, or NULL after the last hash bucket.
 *
 *      ETH_P_ALL handlers are walked along ptype_all first; once that list
 *      is finished the walk continues with ptype_base[0] and then with each
 *      following hash bucket, skipping empty ones.
 */
static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct packet_type *pt;
        struct list_head *nxt;
        int hash;

        ++*pos;
        if (v == SEQ_START_TOKEN)
                return ptype_get_idx(0);

        pt = v;
        nxt = pt->list.next;
        if (pt->type == htons(ETH_P_ALL)) {
                /* Still inside ptype_all unless we wrapped to its head. */
                if (nxt != &ptype_all)
                        goto found;
                hash = 0;
                nxt = ptype_base[0].next;
        } else
                hash = ntohs(pt->type) & PTYPE_HASH_MASK;

        /* nxt pointing at a bucket head means that bucket is exhausted. */
        while (nxt == &ptype_base[hash]) {
                if (++hash >= PTYPE_HASH_SIZE)
                        return NULL;
                nxt = ptype_base[hash].next;
        }
found:
        return list_entry(nxt, struct packet_type, list);
}
3170
/* seq_file stop callback: leave the RCU section entered by ptype_seq_start(). */
static void ptype_seq_stop(struct seq_file *seq, void *v)
        __releases(RCU)
{
        rcu_read_unlock();
}
3176
3177 static int ptype_seq_show(struct seq_file *seq, void *v)
3178 {
3179         struct packet_type *pt = v;
3180
3181         if (v == SEQ_START_TOKEN)
3182                 seq_puts(seq, "Type Device      Function\n");
3183         else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
3184                 if (pt->type == htons(ETH_P_ALL))
3185                         seq_puts(seq, "ALL ");
3186                 else
3187                         seq_printf(seq, "%04x", ntohs(pt->type));
3188
3189                 seq_printf(seq, " %-8s %pF\n",
3190                            pt->dev ? pt->dev->name : "", pt->func);
3191         }
3192
3193         return 0;
3194 }
3195
/* seq_file iterator for the registered packet handlers. */
static const struct seq_operations ptype_seq_ops = {
        .start = ptype_seq_start,
        .next  = ptype_seq_next,
        .stop  = ptype_seq_stop,
        .show  = ptype_seq_show,
};
3202
/* Open handler: set up a namespace-aware seq_file using ptype_seq_ops. */
static int ptype_seq_open(struct inode *inode, struct file *file)
{
        return seq_open_net(inode, file, &ptype_seq_ops,
                        sizeof(struct seq_net_private));
}
3208
/* File operations of the "ptype" proc entry. */
static const struct file_operations ptype_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = ptype_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_net,
};
3216
3217
3218 static int __net_init dev_proc_net_init(struct net *net)
3219 {
3220         int rc = -ENOMEM;
3221
3222         if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
3223                 goto out;
3224         if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
3225                 goto out_dev;
3226         if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
3227                 goto out_softnet;
3228
3229         if (wext_proc_init(net))
3230                 goto out_ptype;
3231         rc = 0;
3232 out:
3233         return rc;
3234 out_ptype:
3235         proc_net_remove(net, "ptype");
3236 out_softnet:
3237         proc_net_remove(net, "softnet_stat");
3238 out_dev:
3239         proc_net_remove(net, "dev");
3240         goto out;
3241 }
3242
/*
 *      Tear down what dev_proc_net_init() set up for @net, in reverse
 *      order of creation.
 */
static void __net_exit dev_proc_net_exit(struct net *net)
{
        wext_proc_exit(net);

        proc_net_remove(net, "ptype");
        proc_net_remove(net, "softnet_stat");
        proc_net_remove(net, "dev");
}
3251
/* Per network namespace setup/teardown hooks for the proc entries above. */
static struct pernet_operations __net_initdata dev_proc_ops = {
        .init = dev_proc_net_init,
        .exit = dev_proc_net_exit,
};
3256
/* Register dev_proc_ops as a pernet subsystem; returns the registration result. */
static int __init dev_proc_init(void)
{
        return register_pernet_subsys(&dev_proc_ops);
}
3261 #else
3262 #define dev_proc_init() 0
3263 #endif  /* CONFIG_PROC_FS */
3264
3265
3266 /**
3267  *      netdev_set_master       -       set up master/slave pair
3268  *      @slave: slave device
3269  *      @master: new master device
3270  *
3271  *      Changes the master device of the slave. Pass %NULL to break the
3272  *      bonding. The caller must hold the RTNL semaphore. On a failure
3273  *      a negative errno code is returned. On success the reference counts
3274  *      are adjusted, %RTM_NEWLINK is sent to the routing socket and the
3275  *      function returns zero.
3276  */
3277 int netdev_set_master(struct net_device *slave, struct net_device *master)
3278 {
3279         struct net_device *old = slave->master;
3280
3281         ASSERT_RTNL();
3282
3283         if (master) {
3284                 if (old)
3285                         return -EBUSY;
3286                 dev_hold(master);
3287         }
3288
3289         slave->master = master;
3290
3291         synchronize_net();
3292
3293         if (old)
3294                 dev_put(old);
3295
3296         if (master)
3297                 slave->flags |= IFF_SLAVE;
3298         else
3299                 slave->flags &= ~IFF_SLAVE;
3300
3301         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
3302         return 0;
3303 }
3304
3305 static void dev_change_rx_flags(struct net_device *dev, int flags)
3306 {
3307         const struct net_device_ops *ops = dev->netdev_ops;
3308
3309         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
3310                 ops->ndo_change_rx_flags(dev, flags);
3311 }
3312
/*
 *      Adjust the promiscuity count of @dev by @inc and keep IFF_PROMISC in
 *      dev->flags in step with it.  Must be called under the RTNL.
 *
 *      When the flag actually changes, the transition is logged (and
 *      audited if auditing is enabled) and the driver is notified through
 *      dev_change_rx_flags().  Unlike dev_set_promiscuity() this does not
 *      call dev_set_rx_mode().
 *
 *      Returns 0 on success or -EOVERFLOW if a non-negative @inc left the
 *      counter at zero; the counter is restored in that case.
 */
static int __dev_set_promiscuity(struct net_device *dev, int inc)
{
        unsigned short old_flags = dev->flags;
        uid_t uid;
        gid_t gid;

        ASSERT_RTNL();

        /* Set the flag optimistically; it is cleared below if the count hits 0. */
        dev->flags |= IFF_PROMISC;
        dev->promiscuity += inc;
        if (dev->promiscuity == 0) {
                /*
                 * Avoid overflow.
                 * If inc causes overflow, untouch promisc and return error.
                 */
                if (inc < 0)
                        dev->flags &= ~IFF_PROMISC;
                else {
                        dev->promiscuity -= inc;
                        printk(KERN_WARNING "%s: promiscuity touches roof, "
                                "set promiscuity failed, promiscuity feature "
                                "of device might be broken.\n", dev->name);
                        return -EOVERFLOW;
                }
        }
        if (dev->flags != old_flags) {
                printk(KERN_INFO "device %s %s promiscuous mode\n",
                       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
                                                               "left");
                if (audit_enabled) {
                        current_uid_gid(&uid, &gid);
                        audit_log(current->audit_context, GFP_ATOMIC,
                                AUDIT_ANOM_PROMISCUOUS,
                                "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
                                dev->name, (dev->flags & IFF_PROMISC),
                                (old_flags & IFF_PROMISC),
                                audit_get_loginuid(current),
                                uid, gid,
                                audit_get_sessionid(current));
                }

                dev_change_rx_flags(dev, IFF_PROMISC);
        }
        return 0;
}
3358
3359 /**
3360  *      dev_set_promiscuity     - update promiscuity count on a device
3361  *      @dev: device
3362  *      @inc: modifier
3363  *
3364  *      Add or remove promiscuity from a device. While the count in the device
3365  *      remains above zero the interface remains promiscuous. Once it hits zero
3366  *      the device reverts back to normal filtering operation. A negative inc
3367  *      value is used to drop promiscuity on the device.
3368  *      Return 0 if successful or a negative errno code on error.
3369  */
3370 int dev_set_promiscuity(struct net_device *dev, int inc)
3371 {
3372         unsigned short old_flags = dev->flags;
3373         int err;
3374
3375         err = __dev_set_promiscuity(dev, inc);
3376         if (err < 0)
3377                 return err;
3378         if (dev->flags != old_flags)
3379                 dev_set_rx_mode(dev);
3380         return err;
3381 }
3382
3383 /**
3384  *      dev_set_allmulti        - update allmulti count on a device
3385  *      @dev: device
3386  *      @inc: modifier
3387  *
3388  *      Add or remove reception of all multicast frames to a device. While the
3389  *      count in the device remains above zero the interface remains listening
3390  *      to all interfaces. Once it hits zero the device reverts back to normal
3391  *      filtering operation. A negative @inc value is used to drop the counter
3392  *      when releasing a resource needing all multicasts.
3393  *      Return 0 if successful or a negative errno code on error.
3394  */
3395
3396 int dev_set_allmulti(struct net_device *dev, int inc)
3397 {
3398         unsigned short old_flags = dev->flags;
3399
3400         ASSERT_RTNL();
3401
3402         dev->flags |= IFF_ALLMULTI;
3403         dev->allmulti += inc;
3404         if (dev->allmulti == 0) {
3405                 /*
3406                  * Avoid overflow.
3407                  * If inc causes overflow, untouch allmulti and return error.
3408                  */
3409                 if (inc < 0)
3410                         dev->flags &= ~IFF_ALLMULTI;
3411                 else {
3412                         dev->allmulti -= inc;
3413                         printk(KERN_WARNING "%s: allmulti touches roof, "
3414                                 "set allmulti failed, allmulti feature of "
3415                                 "device might be broken.\n", dev->name);
3416                         return -EOVERFLOW;
3417                 }
3418         }
3419         if (dev->flags ^ old_flags) {
3420                 dev_change_rx_flags(dev, IFF_ALLMULTI);
3421                 dev_set_rx_mode(dev);
3422         }
3423         return 0;
3424 }
3425
3426 /*
3427  *      Upload unicast and multicast address lists to device and
3428  *      configure RX filtering. When the device doesn't support unicast
3429  *      filtering it is put in promiscuous mode while unicast addresses
3430  *      are present.
3431  */
3432 void __dev_set_rx_mode(struct net_device *dev)
3433 {
3434         const struct net_device_ops *ops = dev->netdev_ops;
3435
3436         /* dev_open will call this function so the list will stay sane. */
3437         if (!(dev->flags&IFF_UP))
3438                 return;
3439
3440         if (!netif_device_present(dev))
3441                 return;
3442
3443         if (ops->ndo_set_rx_mode)
3444                 ops->ndo_set_rx_mode(dev);
3445         else {
3446                 /* Unicast addresses changes may only happen under the rtnl,
3447                  * therefore calling __dev_set_promiscuity here is safe.
3448                  */
3449                 if (dev->uc_count > 0 && !dev->uc_promisc) {
3450                         __dev_set_promiscuity(dev, 1);
3451                         dev->uc_promisc = 1;
3452                 } else if (dev->uc_count == 0 && dev->uc_promisc) {
3453                         __dev_set_promiscuity(dev, -1);
3454                         dev->uc_promisc = 0;
3455                 }
3456
3457                 if (ops->ndo_set_multicast_list)
3458                         ops->ndo_set_multicast_list(dev);
3459         }
3460 }
3461
/* Locked wrapper: run __dev_set_rx_mode() with the device address lock held. */
void dev_set_rx_mode(struct net_device *dev)
{
        netif_addr_lock_bh(dev);
        __dev_set_rx_mode(dev);
        netif_addr_unlock_bh(dev);
}
3468
3469 int __dev_addr_delete(struct dev_addr_list **list, int *count,
3470                       void *addr, int alen, int glbl)
3471 {
3472         struct dev_addr_list *da;
3473
3474         for (; (da = *list) != NULL; list = &da->next) {
3475                 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3476                     alen == da->da_addrlen) {
3477                         if (glbl) {
3478                                 int old_glbl = da->da_gusers;
3479                                 da->da_gusers = 0;
3480                                 if (old_glbl == 0)
3481                                         break;
3482                         }
3483                         if (--da->da_users)
3484                                 return 0;
3485
3486                         *list = da->next;
3487                         kfree(da);
3488                         (*count)--;
3489                         return 0;
3490                 }
3491         }
3492         return -ENOENT;
3493 }
3494
3495 int __dev_addr_add(struct dev_addr_list **list, int *count,
3496                    void *addr, int alen, int glbl)
3497 {
3498         struct dev_addr_list *da;
3499
3500         for (da = *list; da != NULL; da = da->next) {
3501                 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3502                     da->da_addrlen == alen) {
3503                         if (glbl) {
3504                                 int old_glbl = da->da_gusers;
3505                                 da->da_gusers = 1;
3506                                 if (old_glbl)
3507                                         return 0;
3508                         }
3509                         da->da_users++;
3510                         return 0;
3511                 }
3512         }
3513
3514         da = kzalloc(sizeof(*da), GFP_ATOMIC);
3515         if (da == NULL)
3516                 return -ENOMEM;
3517         memcpy(da->da_addr, addr, alen);
3518         da->da_addrlen = alen;
3519         da->da_users = 1;
3520         da->da_gusers = glbl ? 1 : 0;
3521         da->next = *list;
3522         *list = da;
3523         (*count)++;
3524         return 0;
3525 }
3526
3527 /**
3528  *      dev_unicast_delete      - Release secondary unicast address.
3529  *      @dev: device
3530  *      @addr: address to delete
3531  *      @alen: length of @addr
3532  *
3533  *      Release reference to a secondary unicast address and remove it
3534  *      from the device if the reference count drops to zero.
3535  *
3536  *      The caller must hold the rtnl_mutex.
3537  */
3538 int dev_unicast_delete(struct net_device *dev, void *addr, int alen)
3539 {
3540         int err;
3541
3542         ASSERT_RTNL();
3543
3544         netif_addr_lock_bh(dev);
3545         err = __dev_addr_delete(&dev->uc_list, &dev->uc_count, addr, alen, 0);
3546         if (!err)
3547                 __dev_set_rx_mode(dev);
3548         netif_addr_unlock_bh(dev);
3549         return err;
3550 }
3551 EXPORT_SYMBOL(dev_unicast_delete);
3552
3553 /**
3554  *      dev_unicast_add         - add a secondary unicast address
3555  *      @dev: device
3556  *      @addr: address to add
3557  *      @alen: length of @addr
3558  *
3559  *      Add a secondary unicast address to the device or increase
3560  *      the reference count if it already exists.
3561  *
3562  *      The caller must hold the rtnl_mutex.
3563  */
3564 int dev_unicast_add(struct net_device *dev, void *addr, int alen)
3565 {
3566         int err;
3567
3568         ASSERT_RTNL();
3569
3570         netif_addr_lock_bh(dev);
3571         err = __dev_addr_add(&dev->uc_list, &dev->uc_count, addr, alen, 0);
3572         if (!err)
3573                 __dev_set_rx_mode(dev);
3574         netif_addr_unlock_bh(dev);
3575         return err;
3576 }
3577 EXPORT_SYMBOL(dev_unicast_add);
3578
3579 int __dev_addr_sync(struct dev_addr_list **to, int *to_count,
3580                     struct dev_addr_list **from, int *from_count)
3581 {
3582         struct dev_addr_list *da, *next;
3583         int err = 0;
3584
3585         da = *from;
3586         while (da != NULL) {
3587                 next = da->next;
3588                 if (!da->da_synced) {
3589                         err = __dev_addr_add(to, to_count,
3590                                              da->da_addr, da->da_addrlen, 0);
3591                         if (err < 0)
3592                                 break;
3593                         da->da_synced = 1;
3594                         da->da_users++;
3595                 } else if (da->da_users == 1) {
3596                         __dev_addr_delete(to, to_count,
3597                                           da->da_addr, da->da_addrlen, 0);
3598                         __dev_addr_delete(from, from_count,
3599                                           da->da_addr, da->da_addrlen, 0);
3600                 }
3601                 da = next;
3602         }
3603         return err;
3604 }
3605
3606 void __dev_addr_unsync(struct dev_addr_list **to, int *to_count,
3607                        struct dev_addr_list **from, int *from_count)
3608 {
3609         struct dev_addr_list *da, *next;
3610
3611         da = *from;
3612         while (da != NULL) {
3613                 next = da->next;
3614                 if (da->da_synced) {
3615                         __dev_addr_delete(to, to_count,
3616                                           da->da_addr, da->da_addrlen, 0);
3617                         da->da_synced = 0;
3618                         __dev_addr_delete(from, from_count,
3619                                           da->da_addr, da->da_addrlen, 0);
3620                 }
3621                 da = next;
3622         }
3623 }
3624
3625 /**
3626  *      dev_unicast_sync - Synchronize device's unicast list to another device
3627  *      @to: destination device
3628  *      @from: source device
3629  *
3630  *      Add newly added addresses to the destination device and release
3631  *      addresses that have no users left. The source device must be
3632  *      locked by netif_tx_lock_bh.
3633  *
3634  *      This function is intended to be called from the dev->set_rx_mode
3635  *      function of layered software devices.
3636  */
3637 int dev_unicast_sync(struct net_device *to, struct net_device *from)
3638 {
3639         int err = 0;
3640
3641         netif_addr_lock_bh(to);
3642         err = __dev_addr_sync(&to->uc_list, &to->uc_count,
3643                               &from->uc_list, &from->uc_count);
3644         if (!err)
3645                 __dev_set_rx_mode(to);
3646         netif_addr_unlock_bh(to);
3647         return err;
3648 }
3649 EXPORT_SYMBOL(dev_unicast_sync);
3650
3651 /**
3652  *      dev_unicast_unsync - Remove synchronized addresses from the destination device
3653  *      @to: destination device
3654  *      @from: source device
3655  *
3656  *      Remove all addresses that were added to the destination device by
3657  *      dev_unicast_sync(). This function is intended to be called from the
3658  *      dev->stop function of layered software devices.
3659  */
3660 void dev_unicast_unsync(struct net_device *to, struct net_device *from)
3661 {
3662         netif_addr_lock_bh(from);
3663         netif_addr_lock(to);
3664
3665         __dev_addr_unsync(&to->uc_list, &to->uc_count,
3666                           &from->uc_list, &from->uc_count);
3667         __dev_set_rx_mode(to);
3668
3669         netif_addr_unlock(to);
3670         netif_addr_unlock_bh(from);
3671 }
3672 EXPORT_SYMBOL(dev_unicast_unsync);
3673
3674 static void __dev_addr_discard(struct dev_addr_list **list)
3675 {
3676         struct dev_addr_list *tmp;
3677
3678         while (*list != NULL) {
3679                 tmp = *list;
3680                 *list = tmp->next;
3681                 if (tmp->da_users > tmp->da_gusers)
3682                         printk("__dev_addr_discard: address leakage! "
3683                                "da_users=%d\n", tmp->da_users);
3684                 kfree(tmp);
3685         }
3686 }
3687
3688 static void dev_addr_discard(struct net_device *dev)
3689 {
3690         netif_addr_lock_bh(dev);
3691
3692         __dev_addr_discard(&dev->uc_list);
3693         dev->uc_count = 0;
3694
3695         __dev_addr_discard(&dev->mc_list);
3696         dev->mc_count = 0;
3697
3698         netif_addr_unlock_bh(dev);
3699 }
3700
3701 /**
3702  *      dev_get_flags - get flags reported to userspace
3703  *      @dev: device
3704  *
3705  *      Get the combination of flag bits exported through APIs to userspace.
3706  */
3707 unsigned dev_get_flags(const struct net_device *dev)
3708 {
3709         unsigned flags;
3710
3711         flags = (dev->flags & ~(IFF_PROMISC |
3712                                 IFF_ALLMULTI |
3713                                 IFF_RUNNING |
3714                                 IFF_LOWER_UP |
3715                                 IFF_DORMANT)) |
3716                 (dev->gflags & (IFF_PROMISC |
3717                                 IFF_ALLMULTI));
3718
3719         if (netif_running(dev)) {
3720                 if (netif_oper_up(dev))
3721                         flags |= IFF_RUNNING;
3722                 if (netif_carrier_ok(dev))
3723                         flags |= IFF_LOWER_UP;
3724                 if (netif_dormant(dev))
3725                         flags |= IFF_DORMANT;
3726         }
3727
3728         return flags;
3729 }
3730
3731 /**
3732  *      dev_change_flags - change device settings
3733  *      @dev: device
3734  *      @flags: device state flags
3735  *
3736  *      Change settings on device based state flags. The flags are
3737  *      in the userspace exported format.
3738  */
3739 int dev_change_flags(struct net_device *dev, unsigned flags)
3740 {
3741         int ret, changes;
3742         int old_flags = dev->flags;
3743
3744         ASSERT_RTNL();
3745
3746         /*
3747          *      Set the flags on our device.
3748          */
3749
3750         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
3751                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
3752                                IFF_AUTOMEDIA)) |
3753                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
3754                                     IFF_ALLMULTI));
3755
3756         /*
3757          *      Load in the correct multicast list now the flags have changed.
3758          */
3759
3760         if ((old_flags ^ flags) & IFF_MULTICAST)
3761                 dev_change_rx_flags(dev, IFF_MULTICAST);
3762
3763         dev_set_rx_mode(dev);
3764
3765         /*
3766          *      Have we downed the interface. We handle IFF_UP ourselves
3767          *      according to user attempts to set it, rather than blindly
3768          *      setting it.
3769          */
3770
3771         ret = 0;
3772         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
3773                 ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
3774
3775                 if (!ret)
3776                         dev_set_rx_mode(dev);
3777         }
3778
3779         if (dev->flags & IFF_UP &&
3780             ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
3781                                           IFF_VOLATILE)))
3782                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
3783
3784         if ((flags ^ dev->gflags) & IFF_PROMISC) {
3785                 int inc = (flags & IFF_PROMISC) ? +1 : -1;
3786                 dev->gflags ^= IFF_PROMISC;
3787                 dev_set_promiscuity(dev, inc);
3788         }
3789
3790         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
3791            is important. Some (broken) drivers set IFF_PROMISC, when
3792            IFF_ALLMULTI is requested not asking us and not reporting.
3793          */
3794         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
3795                 int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
3796                 dev->gflags ^= IFF_ALLMULTI;
3797                 dev_set_allmulti(dev, inc);
3798         }
3799
3800         /* Exclude state transition flags, already notified */
3801         changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING);
3802         if (changes)
3803                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
3804
3805         return ret;
3806 }
3807
3808 /**
3809  *      dev_set_mtu - Change maximum transfer unit
3810  *      @dev: device
3811  *      @new_mtu: new transfer unit
3812  *
3813  *      Change the maximum transfer size of the network device.
3814  */
3815 int dev_set_mtu(struct net_device *dev, int new_mtu)
3816 {
3817         const struct net_device_ops *ops = dev->netdev_ops;
3818         int err;
3819
3820         if (new_mtu == dev->mtu)
3821                 return 0;
3822
3823         /*      MTU must be positive.    */
3824         if (new_mtu < 0)
3825                 return -EINVAL;
3826
3827         if (!netif_device_present(dev))
3828                 return -ENODEV;
3829
3830         err = 0;
3831         if (ops->ndo_change_mtu)
3832                 err = ops->ndo_change_mtu(dev, new_mtu);
3833         else
3834                 dev->mtu = new_mtu;
3835
3836         if (!err && dev->flags & IFF_UP)
3837                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
3838         return err;
3839 }
3840
3841 /**
3842  *      dev_set_mac_address - Change Media Access Control Address
3843  *      @dev: device
3844  *      @sa: new address
3845  *
3846  *      Change the hardware (MAC) address of the device
3847  */
3848 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
3849 {
3850         const struct net_device_ops *ops = dev->netdev_ops;
3851         int err;
3852
3853         if (!ops->ndo_set_mac_address)
3854                 return -EOPNOTSUPP;
3855         if (sa->sa_family != dev->type)
3856                 return -EINVAL;
3857         if (!netif_device_present(dev))
3858                 return -ENODEV;
3859         err = ops->ndo_set_mac_address(dev, sa);
3860         if (!err)
3861                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3862         return err;
3863 }
3864
3865 /*
3866  *      Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock)
3867  */
3868 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
3869 {
3870         int err;
3871         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3872
3873         if (!dev)
3874                 return -ENODEV;
3875
3876         switch (cmd) {
3877                 case SIOCGIFFLAGS:      /* Get interface flags */
3878                         ifr->ifr_flags = dev_get_flags(dev);
3879                         return 0;
3880
3881                 case SIOCGIFMETRIC:     /* Get the metric on the interface
3882                                            (currently unused) */
3883                         ifr->ifr_metric = 0;
3884                         return 0;
3885
3886                 case SIOCGIFMTU:        /* Get the MTU of a device */
3887                         ifr->ifr_mtu = dev->mtu;
3888                         return 0;
3889
3890                 case SIOCGIFHWADDR:
3891                         if (!dev->addr_len)
3892                                 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
3893                         else
3894                                 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
3895                                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3896                         ifr->ifr_hwaddr.sa_family = dev->type;
3897                         return 0;
3898
3899                 case SIOCGIFSLAVE:
3900                         err = -EINVAL;
3901                         break;
3902
3903                 case SIOCGIFMAP:
3904                         ifr->ifr_map.mem_start = dev->mem_start;
3905                         ifr->ifr_map.mem_end   = dev->mem_end;
3906                         ifr->ifr_map.base_addr = dev->base_addr;
3907                         ifr->ifr_map.irq       = dev->irq;
3908                         ifr->ifr_map.dma       = dev->dma;
3909                         ifr->ifr_map.port      = dev->if_port;
3910                         return 0;
3911
3912                 case SIOCGIFINDEX:
3913                         ifr->ifr_ifindex = dev->ifindex;
3914                         return 0;
3915
3916                 case SIOCGIFTXQLEN:
3917                         ifr->ifr_qlen = dev->tx_queue_len;
3918                         return 0;
3919
3920                 default:
3921                         /* dev_ioctl() should ensure this case
3922                          * is never reached
3923                          */
3924                         WARN_ON(1);
3925                         err = -EINVAL;
3926                         break;
3927
3928         }
3929         return err;
3930 }
3931
3932 /*
3933  *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
3934  */
3935 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
3936 {
3937         int err;
3938         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3939         const struct net_device_ops *ops;
3940
3941         if (!dev)
3942                 return -ENODEV;
3943
3944         ops = dev->netdev_ops;
3945
3946         switch (cmd) {
3947                 case SIOCSIFFLAGS:      /* Set interface flags */
3948                         return dev_change_flags(dev, ifr->ifr_flags);
3949
3950                 case SIOCSIFMETRIC:     /* Set the metric on the interface
3951                                            (currently unused) */
3952                         return -EOPNOTSUPP;
3953
3954                 case SIOCSIFMTU:        /* Set the MTU of a device */
3955                         return dev_set_mtu(dev, ifr->ifr_mtu);
3956
3957                 case SIOCSIFHWADDR:
3958                         return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
3959
3960                 case SIOCSIFHWBROADCAST:
3961                         if (ifr->ifr_hwaddr.sa_family != dev->type)
3962                                 return -EINVAL;
3963                         memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
3964                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3965                         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3966                         return 0;
3967
3968                 case SIOCSIFMAP:
3969                         if (ops->ndo_set_config) {
3970                                 if (!netif_device_present(dev))
3971                                         return -ENODEV;
3972                                 return ops->ndo_set_config(dev, &ifr->ifr_map);
3973                         }
3974                         return -EOPNOTSUPP;
3975
3976                 case SIOCADDMULTI:
3977                         if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
3978                             ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3979                                 return -EINVAL;
3980                         if (!netif_device_present(dev))
3981                                 return -ENODEV;
3982                         return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
3983                                           dev->addr_len, 1);
3984
3985                 case SIOCDELMULTI:
3986                         if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
3987                             ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3988                                 return -EINVAL;
3989                         if (!netif_device_present(dev))
3990                                 return -ENODEV;
3991                         return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
3992                                              dev->addr_len, 1);
3993
3994                 case SIOCSIFTXQLEN:
3995                         if (ifr->ifr_qlen < 0)
3996                                 return -EINVAL;
3997                         dev->tx_queue_len = ifr->ifr_qlen;
3998                         return 0;
3999
4000                 case SIOCSIFNAME:
4001                         ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4002                         return dev_change_name(dev, ifr->ifr_newname);
4003
4004                 /*
4005                  *      Unknown or private ioctl
4006                  */
4007
4008                 default:
4009                         if ((cmd >= SIOCDEVPRIVATE &&
4010                             cmd <= SIOCDEVPRIVATE + 15) ||
4011                             cmd == SIOCBONDENSLAVE ||
4012                             cmd == SIOCBONDRELEASE ||
4013                             cmd == SIOCBONDSETHWADDR ||
4014                             cmd == SIOCBONDSLAVEINFOQUERY ||
4015                             cmd == SIOCBONDINFOQUERY ||
4016                             cmd == SIOCBONDCHANGEACTIVE ||
4017                             cmd == SIOCGMIIPHY ||
4018                             cmd == SIOCGMIIREG ||
4019                             cmd == SIOCSMIIREG ||
4020                             cmd == SIOCBRADDIF ||
4021                             cmd == SIOCBRDELIF ||
4022                             cmd == SIOCWANDEV) {
4023                                 err = -EOPNOTSUPP;
4024                                 if (ops->ndo_do_ioctl) {
4025                                         if (netif_device_present(dev))
4026                                                 err = ops->ndo_do_ioctl(dev, ifr, cmd);
4027                                         else
4028                                                 err = -ENODEV;
4029                                 }
4030                         } else
4031                                 err = -EINVAL;
4032
4033         }
4034         return err;
4035 }
4036
4037 /*
4038  *      This function handles all "interface"-type I/O control requests. The actual
4039  *      'doing' part of this is dev_ifsioc above.
4040  */
4041
4042 /**
4043  *      dev_ioctl       -       network device ioctl
4044  *      @net: the applicable net namespace
4045  *      @cmd: command to issue
4046  *      @arg: pointer to a struct ifreq in user space
4047  *
4048  *      Issue ioctl functions to devices. This is normally called by the
4049  *      user space syscall interfaces but can sometimes be useful for
4050  *      other purposes. The return value is the return from the syscall if
4051  *      positive or a negative errno code on error.
4052  */
4053
4054 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4055 {
4056         struct ifreq ifr;
4057         int ret;
4058         char *colon;
4059
4060         /* One special case: SIOCGIFCONF takes ifconf argument
4061            and requires shared lock, because it sleeps writing
4062            to user space.
4063          */
4064
4065         if (cmd == SIOCGIFCONF) {
4066                 rtnl_lock();
4067                 ret = dev_ifconf(net, (char __user *) arg);
4068                 rtnl_unlock();
4069                 return ret;
4070         }
4071         if (cmd == SIOCGIFNAME)
4072                 return dev_ifname(net, (struct ifreq __user *)arg);
4073
4074         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4075                 return -EFAULT;
4076
4077         ifr.ifr_name[IFNAMSIZ-1] = 0;
4078
4079         colon = strchr(ifr.ifr_name, ':');
4080         if (colon)
4081                 *colon = 0;
4082
4083         /*
4084          *      See which interface the caller is talking about.
4085          */
4086
4087         switch (cmd) {
4088                 /*
4089                  *      These ioctl calls:
4090                  *      - can be done by all.
4091                  *      - atomic and do not require locking.
4092                  *      - return a value
4093                  */
4094                 case SIOCGIFFLAGS:
4095                 case SIOCGIFMETRIC:
4096                 case SIOCGIFMTU:
4097                 case SIOCGIFHWADDR:
4098                 case SIOCGIFSLAVE:
4099                 case SIOCGIFMAP:
4100                 case SIOCGIFINDEX:
4101                 case SIOCGIFTXQLEN:
4102                         dev_load(net, ifr.ifr_name);
4103                         read_lock(&dev_base_lock);
4104                         ret = dev_ifsioc_locked(net, &ifr, cmd);
4105                         read_unlock(&dev_base_lock);
4106                         if (!ret) {
4107                                 if (colon)
4108                                         *colon = ':';
4109                                 if (copy_to_user(arg, &ifr,
4110                                                  sizeof(struct ifreq)))
4111                                         ret = -EFAULT;
4112                         }
4113                         return ret;
4114
4115                 case SIOCETHTOOL:
4116                         dev_load(net, ifr.ifr_name);
4117                         rtnl_lock();
4118                         ret = dev_ethtool(net, &ifr);
4119                         rtnl_unlock();
4120                         if (!ret) {
4121                                 if (colon)
4122                                         *colon = ':';
4123                                 if (copy_to_user(arg, &ifr,
4124                                                  sizeof(struct ifreq)))
4125                                         ret = -EFAULT;
4126                         }
4127                         return ret;
4128
4129                 /*
4130                  *      These ioctl calls:
4131                  *      - require superuser power.
4132                  *      - require strict serialization.
4133                  *      - return a value
4134                  */
4135                 case SIOCGMIIPHY:
4136                 case SIOCGMIIREG:
4137                 case SIOCSIFNAME:
4138                         if (!capable(CAP_NET_ADMIN))
4139                                 return -EPERM;
4140                         dev_load(net, ifr.ifr_name);
4141                         rtnl_lock();
4142                         ret = dev_ifsioc(net, &ifr, cmd);
4143                         rtnl_unlock();
4144                         if (!ret) {
4145                                 if (colon)
4146                                         *colon = ':';
4147                                 if (copy_to_user(arg, &ifr,
4148                                                  sizeof(struct ifreq)))
4149                                         ret = -EFAULT;
4150                         }
4151                         return ret;
4152
4153                 /*
4154                  *      These ioctl calls:
4155                  *      - require superuser power.
4156                  *      - require strict serialization.
4157                  *      - do not return a value
4158                  */
4159                 case SIOCSIFFLAGS:
4160                 case SIOCSIFMETRIC:
4161                 case SIOCSIFMTU:
4162                 case SIOCSIFMAP:
4163                 case SIOCSIFHWADDR:
4164                 case SIOCSIFSLAVE:
4165                 case SIOCADDMULTI:
4166                 case SIOCDELMULTI:
4167                 case SIOCSIFHWBROADCAST:
4168                 case SIOCSIFTXQLEN:
4169                 case SIOCSMIIREG:
4170                 case SIOCBONDENSLAVE:
4171                 case SIOCBONDRELEASE:
4172                 case SIOCBONDSETHWADDR:
4173                 case SIOCBONDCHANGEACTIVE:
4174                 case SIOCBRADDIF:
4175                 case SIOCBRDELIF:
4176                         if (!capable(CAP_NET_ADMIN))
4177                                 return -EPERM;
4178                         /* fall through */
4179                 case SIOCBONDSLAVEINFOQUERY:
4180                 case SIOCBONDINFOQUERY:
4181                         dev_load(net, ifr.ifr_name);
4182                         rtnl_lock();
4183                         ret = dev_ifsioc(net, &ifr, cmd);
4184                         rtnl_unlock();
4185                         return ret;
4186
4187                 case SIOCGIFMEM:
4188                         /* Get the per device memory space. We can add this but
4189                          * currently do not support it */
4190                 case SIOCSIFMEM:
4191                         /* Set the per device memory buffer space.
4192                          * Not applicable in our case */
4193                 case SIOCSIFLINK:
4194                         return -EINVAL;
4195
4196                 /*
4197                  *      Unknown or private ioctl.
4198                  */
4199                 default:
4200                         if (cmd == SIOCWANDEV ||
4201                             (cmd >= SIOCDEVPRIVATE &&
4202                              cmd <= SIOCDEVPRIVATE + 15)) {
4203                                 dev_load(net, ifr.ifr_name);
4204                                 rtnl_lock();
4205                                 ret = dev_ifsioc(net, &ifr, cmd);
4206                                 rtnl_unlock();
4207                                 if (!ret && copy_to_user(arg, &ifr,
4208                                                          sizeof(struct ifreq)))
4209                                         ret = -EFAULT;
4210                                 return ret;
4211                         }
4212                         /* Take care of Wireless Extensions */
4213                         if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4214                                 return wext_handle_ioctl(net, &ifr, cmd, arg);
4215                         return -EINVAL;
4216         }
4217 }
4218
4219
/**
 *	dev_new_index	-	allocate an ifindex
 *	@net: the applicable net namespace
 *
 *	Advances a static counter, restarting at 1 whenever it is no longer
 *	positive, until it reaches a value for which __dev_get_by_index()
 *	finds no device in @net, and returns that value.  The caller must
 *	hold the rtnl semaphore or the dev_base_lock to be sure it remains
 *	unique.
 */
static int dev_new_index(struct net *net)
{
	static int ifindex;

	do {
		++ifindex;
		/* Interface indexes are kept strictly positive. */
		if (ifindex <= 0)
			ifindex = 1;
	} while (__dev_get_by_index(net, ifindex));

	return ifindex;
}
4238
/* Delayed registration/unregistration */
4240 static LIST_HEAD(net_todo_list);
4241
4242 static void net_set_todo(struct net_device *dev)
4243 {
4244         list_add_tail(&dev->todo_list, &net_todo_list);
4245 }
4246
4247 static void rollback_registered(struct net_device *dev)
4248 {
4249         BUG_ON(dev_boot_phase);
4250         ASSERT_RTNL();
4251
4252         /* Some devices call without registering for initialization unwind. */
4253         if (dev->reg_state == NETREG_UNINITIALIZED) {
4254                 printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
4255                                   "was registered\n", dev->name, dev);
4256
4257                 WARN_ON(1);
4258                 return;
4259         }
4260
4261         BUG_ON(dev->reg_state != NETREG_REGISTERED);
4262
4263         /* If device is running, close it first. */
4264         dev_close(dev);
4265
4266         /* And unlink it from device chain. */
4267         unlist_netdevice(dev);
4268
4269         dev->reg_state = NETREG_UNREGISTERING;
4270
4271         synchronize_net();
4272
4273         /* Shutdown queueing discipline. */
4274         dev_shutdown(dev);
4275
4276
4277         /* Notify protocols, that we are about to destroy
4278            this device. They should clean all the things.
4279         */
4280         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4281
4282         /*
4283          *      Flush the unicast and multicast chains
4284          */
4285         dev_addr_discard(dev);
4286
4287         if (dev->netdev_ops->ndo_uninit)
4288                 dev->netdev_ops->ndo_uninit(dev);
4289
4290         /* Notifier chain MUST detach us from master device. */
4291         WARN_ON(dev->master);
4292
4293         /* Remove entries from kobject tree */
4294         netdev_unregister_kobject(dev);
4295
4296         synchronize_net();
4297
4298         dev_put(dev);
4299 }
4300
4301 static void __netdev_init_queue_locks_one(struct net_device *dev,
4302                                           struct netdev_queue *dev_queue,
4303                                           void *_unused)
4304 {
4305         spin_lock_init(&dev_queue->_xmit_lock);
4306         netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
4307         dev_queue->xmit_lock_owner = -1;
4308 }
4309
4310 static void netdev_init_queue_locks(struct net_device *dev)
4311 {
4312         netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
4313         __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
4314 }
4315
4316 unsigned long netdev_fix_features(unsigned long features, const char *name)
4317 {
4318         /* Fix illegal SG+CSUM combinations. */
4319         if ((features & NETIF_F_SG) &&
4320             !(features & NETIF_F_ALL_CSUM)) {
4321                 if (name)
4322                         printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
4323                                "checksum feature.\n", name);
4324                 features &= ~NETIF_F_SG;
4325         }
4326
4327         /* TSO requires that SG is present as well. */
4328         if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
4329                 if (name)
4330                         printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
4331                                "SG feature.\n", name);
4332                 features &= ~NETIF_F_TSO;
4333         }
4334
4335         if (features & NETIF_F_UFO) {
4336                 if (!(features & NETIF_F_GEN_CSUM)) {
4337                         if (name)
4338                                 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4339                                        "since no NETIF_F_HW_CSUM feature.\n",
4340                                        name);
4341                         features &= ~NETIF_F_UFO;
4342                 }
4343
4344                 if (!(features & NETIF_F_SG)) {
4345                         if (name)
4346                                 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4347                                        "since no NETIF_F_SG feature.\n", name);
4348                         features &= ~NETIF_F_UFO;
4349                 }
4350         }
4351
4352         return features;
4353 }
4354 EXPORT_SYMBOL(netdev_fix_features);
4355
/**
 *      register_netdevice      - register a network device
 *      @dev: device to register
 *
 *      Take a completed network device structure and add it to the kernel
 *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *      chain. 0 is returned on success. A negative errno code is returned
 *      on a failure to set up the device, or if the name is a duplicate.
 *
 *      Error codes produced here: the value returned by ->ndo_init()
 *      (a positive value is turned into -EIO), -EINVAL for an invalid
 *      name, -EEXIST for a name already present in the namespace, the
 *      error from netdev_register_kobject(), or the errno derived from
 *      the %NETDEV_REGISTER notifier chain result.
 *
 *      Callers must hold the rtnl semaphore. You may want
 *      register_netdev() instead of this.
 *
 *      BUGS:
 *      The locking appears insufficient to guarantee two parallel registers
 *      will not get the same name.
 */

int register_netdevice(struct net_device *dev)
{
        struct hlist_head *head;
        struct hlist_node *p;
        int ret;
        struct net *net = dev_net(dev);

        BUG_ON(dev_boot_phase);
        ASSERT_RTNL();

        might_sleep();

        /* When net_device's are persistent, this will be fatal. */
        BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
        BUG_ON(!net);

        spin_lock_init(&dev->addr_list_lock);
        netdev_set_addr_lockdep_class(dev);
        netdev_init_queue_locks(dev);

        /*
         * -1 marks iflink as "not set"; if ->ndo_init() leaves it alone it
         * is defaulted to the device's own ifindex further down.
         */
        dev->iflink = -1;

#ifdef CONFIG_COMPAT_NET_DEV_OPS
        /* Netdevice_ops API compatibility support.
         * This is temporary until all network devices are converted.
         */
        if (dev->netdev_ops) {
                const struct net_device_ops *ops = dev->netdev_ops;

                /* Mirror the ops table into the legacy per-device hooks. */
                dev->init = ops->ndo_init;
                dev->uninit = ops->ndo_uninit;
                dev->open = ops->ndo_open;
                dev->change_rx_flags = ops->ndo_change_rx_flags;
                dev->set_rx_mode = ops->ndo_set_rx_mode;
                dev->set_multicast_list = ops->ndo_set_multicast_list;
                dev->set_mac_address = ops->ndo_set_mac_address;
                dev->validate_addr = ops->ndo_validate_addr;
                dev->do_ioctl = ops->ndo_do_ioctl;
                dev->set_config = ops->ndo_set_config;
                dev->change_mtu = ops->ndo_change_mtu;
                dev->tx_timeout = ops->ndo_tx_timeout;
                dev->get_stats = ops->ndo_get_stats;
                dev->vlan_rx_register = ops->ndo_vlan_rx_register;
                dev->vlan_rx_add_vid = ops->ndo_vlan_rx_add_vid;
                dev->vlan_rx_kill_vid = ops->ndo_vlan_rx_kill_vid;
#ifdef CONFIG_NET_POLL_CONTROLLER
                dev->poll_controller = ops->ndo_poll_controller;
#endif
        } else {
                char drivername[64];
                pr_info("%s (%s): not using net_device_ops yet\n",
                        dev->name, netdev_drivername(dev, drivername, 64));

                /* This works only because net_device_ops and the
                   compatibility structure are the same. */
                dev->netdev_ops = (void *) &(dev->init);
        }
#endif

        /* Init, if this function is available */
        if (dev->netdev_ops->ndo_init) {
                ret = dev->netdev_ops->ndo_init(dev);
                if (ret) {
                        /* Only negative errnos leave this function. */
                        if (ret > 0)
                                ret = -EIO;
                        /* ->ndo_init() failed, so ->ndo_uninit() is skipped. */
                        goto out;
                }
        }

        if (!dev_valid_name(dev->name)) {
                ret = -EINVAL;
                goto err_uninit;
        }

        dev->ifindex = dev_new_index(net);
        if (dev->iflink == -1)
                dev->iflink = dev->ifindex;

        /* Check for existence of name */
        head = dev_name_hash(net, dev->name);
        hlist_for_each(p, head) {
                struct net_device *d
                        = hlist_entry(p, struct net_device, name_hlist);
                if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
                        ret = -EEXIST;
                        goto err_uninit;
                }
        }

        /* Fix illegal checksum combinations */
        if ((dev->features & NETIF_F_HW_CSUM) &&
            (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
                printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
                       dev->name);
                dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
        }

        if ((dev->features & NETIF_F_NO_CSUM) &&
            (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
                printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
                       dev->name);
                dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
        }

        /* Drop SG/TSO/UFO bits whose prerequisites are missing. */
        dev->features = netdev_fix_features(dev->features, dev->name);

        /* Enable software GSO if SG is supported. */
        if (dev->features & NETIF_F_SG)
                dev->features |= NETIF_F_GSO;

        netdev_initialize_kobject(dev);
        ret = netdev_register_kobject(dev);
        if (ret)
                goto err_uninit;
        dev->reg_state = NETREG_REGISTERED;

        /*
         *      Default initial state at registry is that the
         *      device is present.
         */

        set_bit(__LINK_STATE_PRESENT, &dev->state);

        dev_init_scheduler(dev);
        dev_hold(dev);
        list_netdevice(dev);

        /* Notify protocols, that a new device appeared. */
        ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
        ret = notifier_to_errno(ret);
        if (ret) {
                /*
                 * The notifier chain reported an error: undo the
                 * registration and mark the device unregistered.
                 */
                rollback_registered(dev);
                dev->reg_state = NETREG_UNREGISTERED;
        }

out:
        return ret;

err_uninit:
        /* Failure after ->ndo_init() ran: give the driver its ->ndo_uninit(). */
        if (dev->netdev_ops->ndo_uninit)
                dev->netdev_ops->ndo_uninit(dev);
        goto out;
}
4516
4517 /**
4518  *      init_dummy_netdev       - init a dummy network device for NAPI
4519  *      @dev: device to init
4520  *
4521  *      This takes a network device structure and initialize the minimum
4522  *      amount of fields so it can be used to schedule NAPI polls without
4523  *      registering a full blown interface. This is to be used by drivers
4524  *      that need to tie several hardware interfaces to a single NAPI
4525  *      poll scheduler due to HW limitations.
4526  */
4527 int init_dummy_netdev(struct net_device *dev)
4528 {
4529         /* Clear everything. Note we don't initialize spinlocks
4530          * are they aren't supposed to be taken by any of the
4531          * NAPI code and this dummy netdev is supposed to be
4532          * only ever used for NAPI polls
4533          */
4534         memset(dev, 0, sizeof(struct net_device));
4535
4536         /* make sure we BUG if trying to hit standard
4537          * register/unregister code path
4538          */
4539         dev->reg_state = NETREG_DUMMY;
4540
4541         /* initialize the ref count */
4542         atomic_set(&dev->refcnt, 1);
4543
4544         /* NAPI wants this */
4545         INIT_LIST_HEAD(&dev->napi_list);
4546
4547         /* a dummy interface is started by default */
4548         set_bit(__LINK_STATE_PRESENT, &dev->state);
4549         set_bit(__LINK_STATE_START, &dev->state);
4550
4551         return 0;
4552 }
4553 EXPORT_SYMBOL_GPL(init_dummy_netdev);
4554
4555
4556 /**
4557  *      register_netdev - register a network device
4558  *      @dev: device to register
4559  *
4560  *      Take a completed network device structure and add it to the kernel
4561  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4562  *      chain. 0 is returned on success. A negative errno code is returned
4563  *      on a failure to set up the device, or if the name is a duplicate.
4564  *
4565  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
4566  *      and expands the device name if you passed a format string to
4567  *      alloc_netdev.
4568  */
4569 int register_netdev(struct net_device *dev)
4570 {
4571         int err;
4572
4573         rtnl_lock();
4574
4575         /*
4576          * If the name is a format string the caller wants us to do a
4577          * name allocation.
4578          */
4579         if (strchr(dev->name, '%')) {
4580                 err = dev_alloc_name(dev, dev->name);
4581                 if (err < 0)
4582                         goto out;
4583         }
4584
4585         err = register_netdevice(dev);
4586 out:
4587         rtnl_unlock();
4588         return err;
4589 }
4590 EXPORT_SYMBOL(register_netdev);
4591
4592 /*
4593  * netdev_wait_allrefs - wait until all references are gone.
4594  *
4595  * This is called when unregistering network devices.
4596  *
4597  * Any protocol or device that holds a reference should register
4598  * for netdevice notification, and cleanup and put back the
4599  * reference if they receive an UNREGISTER event.
4600  * We can get stuck here if buggy protocols don't correctly
4601  * call dev_put.
4602  */
4603 static void netdev_wait_allrefs(struct net_device *dev)
4604 {
4605         unsigned long rebroadcast_time, warning_time;
4606
4607         rebroadcast_time = warning_time = jiffies;
4608         while (atomic_read(&dev->refcnt) != 0) {
4609                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
4610                         rtnl_lock();
4611
4612                         /* Rebroadcast unregister notification */
4613                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4614
4615                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
4616                                      &dev->state)) {
4617                                 /* We must not have linkwatch events
4618                                  * pending on unregister. If this
4619                                  * happens, we simply run the queue
4620                                  * unscheduled, resulting in a noop
4621                                  * for this device.
4622                                  */
4623                                 linkwatch_run_queue();
4624                         }
4625
4626                         __rtnl_unlock();
4627
4628                         rebroadcast_time = jiffies;
4629                 }
4630
4631                 msleep(250);
4632
4633                 if (time_after(jiffies, warning_time + 10 * HZ)) {
4634                         printk(KERN_EMERG "unregister_netdevice: "
4635                                "waiting for %s to become free. Usage "
4636                                "count = %d\n",
4637                                dev->name, atomic_read(&dev->refcnt));
4638                         warning_time = jiffies;
4639                 }
4640         }
4641 }
4642
/* The sequence is:
 *
 *      rtnl_lock();
 *      ...
 *      register_netdevice(x1);
 *      register_netdevice(x2);
 *      ...
 *      unregister_netdevice(y1);
 *      unregister_netdevice(y2);
 *      ...
 *      rtnl_unlock();
 *      free_netdev(y1);
 *      free_netdev(y2);
 *
 * We are invoked by rtnl_unlock().
 * This allows us to deal with problems:
 * 1) We can delete sysfs objects which invoke hotplug
 *    without deadlocking with linkwatch via keventd.
 * 2) Since we run with the RTNL semaphore not held, we can sleep
 *    safely in order to wait for the netdev refcnt to drop to zero.
 *
 * We must not return until all unregister events added during
 * the interval the lock was held have been completed.
 *
 * For each device on the snapshot: the state moves from
 * NETREG_UNREGISTERING to NETREG_UNREGISTERED, flush_backlog is run on
 * every CPU, we wait for the reference count to reach zero, the optional
 * destructor is called and the kobject reference is dropped.
 */
void netdev_run_todo(void)
{
        struct list_head list;

        /* Snapshot list, allow later requests */
        list_replace_init(&net_todo_list, &list);

        /* Everything below runs without the RTNL semaphore. */
        __rtnl_unlock();

        while (!list_empty(&list)) {
                struct net_device *dev
                        = list_entry(list.next, struct net_device, todo_list);
                list_del(&dev->todo_list);

                /* Only devices in the UNREGISTERING state are processed. */
                if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
                        printk(KERN_ERR "network todo '%s' but state %d\n",
                               dev->name, dev->reg_state);
                        dump_stack();
                        continue;
                }

                dev->reg_state = NETREG_UNREGISTERED;

                /* Run flush_backlog for this device on every CPU. */
                on_each_cpu(flush_backlog, dev, 1);

                netdev_wait_allrefs(dev);

                /* paranoia */
                BUG_ON(atomic_read(&dev->refcnt));
                WARN_ON(dev->ip_ptr);
                WARN_ON(dev->ip6_ptr);
                WARN_ON(dev->dn_ptr);

                if (dev->destructor)
                        dev->destructor(dev);

                /* Free network device */
                kobject_put(&dev->dev.kobj);
        }
}
4707
4708 /**
4709  *      dev_get_stats   - get network device statistics
4710  *      @dev: device to get statistics from
4711  *
4712  *      Get network statistics from device. The device driver may provide
4713  *      its own method by setting dev->netdev_ops->get_stats; otherwise
4714  *      the internal statistics structure is used.
4715  */
4716 const struct net_device_stats *dev_get_stats(struct net_device *dev)
4717  {
4718         const struct net_device_ops *ops = dev->netdev_ops;
4719
4720         if (ops->ndo_get_stats)
4721                 return ops->ndo_get_stats(dev);
4722         else
4723                 return &dev->stats;
4724 }
4725 EXPORT_SYMBOL(dev_get_stats);
4726
4727 static void netdev_init_one_queue(struct net_device *dev,
4728                                   struct netdev_queue *queue,
4729                                   void *_unused)
4730 {
4731         queue->dev = dev;
4732 }
4733
4734 static void netdev_init_queues(struct net_device *dev)
4735 {
4736         netdev_init_one_queue(dev, &dev->rx_queue, NULL);
4737         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
4738         spin_lock_init(&dev->tx_global_lock);
4739 }
4740
4741 /**
4742  *      alloc_netdev_mq - allocate network device
4743  *      @sizeof_priv:   size of private data to allocate space for
4744  *      @name:          device name format string
4745  *      @setup:         callback to initialize device
4746  *      @queue_count:   the number of subqueues to allocate
4747  *
4748  *      Allocates a struct net_device with private data area for driver use
4749  *      and performs basic initialization.  Also allocates subquue structs
4750  *      for each queue on the device at the end of the netdevice.
4751  */
4752 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
4753                 void (*setup)(struct net_device *), unsigned int queue_count)
4754 {
4755         struct netdev_queue *tx;
4756         struct net_device *dev;
4757         size_t alloc_size;
4758         void *p;
4759
4760         BUG_ON(strlen(name) >= sizeof(dev->name));
4761
4762         alloc_size = sizeof(struct net_device);
4763         if (sizeof_priv) {
4764                 /* ensure 32-byte alignment of private area */
4765                 alloc_size = (alloc_size + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
4766                 alloc_size += sizeof_priv;
4767         }
4768         /* ensure 32-byte alignment of whole construct */
4769         alloc_size += NETDEV_ALIGN_CONST;
4770
4771         p = kzalloc(alloc_size, GFP_KERNEL);
4772         if (!p) {
4773                 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
4774                 return NULL;
4775         }
4776
4777         tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
4778         if (!tx) {
4779                 printk(KERN_ERR "alloc_netdev: Unable to allocate "
4780                        "tx qdiscs.\n");
4781                 kfree(p);
4782                 return NULL;
4783         }
4784
4785         dev = (struct net_device *)
4786                 (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
4787         dev->padded = (char *)dev - (char *)p;
4788         dev_net_set(dev, &init_net);
4789
4790         dev->_tx = tx;
4791         dev->num_tx_queues = queue_count;
4792         dev->real_num_tx_queues = queue_count;
4793
4794         dev->gso_max_size = GSO_MAX_SIZE;
4795
4796         netdev_init_queues(dev);
4797
4798         INIT_LIST_HEAD(&dev->napi_list);
4799         setup(dev);
4800         strcpy(dev->name, name);
4801         return dev;
4802 }
4803 EXPORT_SYMBOL(alloc_netdev_mq);
4804
/**
 *      free_netdev - free network device
 *      @dev: device
 *
 *      This function does the last stage of destroying an allocated device
 *      interface. The reference to the device object is released.
 *      If this is the last reference then it will be freed.
 *
 *      A device that was never registered (state NETREG_UNINITIALIZED)
 *      is freed directly with kfree(). Any other device must be in state
 *      NETREG_UNREGISTERED (enforced with BUG_ON) and is released through
 *      put_device().
 */
void free_netdev(struct net_device *dev)
{
        struct napi_struct *p, *n;

        release_net(dev_net(dev));

        /* The TX queue array is a separate allocation (see dev->_tx). */
        kfree(dev->_tx);

        /* Delete every NAPI context still linked on the device. */
        list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
                netif_napi_del(p);

        /*  Compatibility with error handling in drivers */
        if (dev->reg_state == NETREG_UNINITIALIZED) {
                /* dev->padded is the offset back to the start of the block. */
                kfree((char *)dev - dev->padded);
                return;
        }

        BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
        dev->reg_state = NETREG_RELEASED;

        /* will free via device release */
        put_device(&dev->dev);
}
4836
/**
 *      synchronize_net -  Synchronize with packet receive processing
 *
 *      Blocks until packets that are being received right now have been
 *      fully processed, by waiting with synchronize_rcu(). Packets whose
 *      processing starts later are not held back. May sleep.
 */
void synchronize_net(void)
{
        might_sleep();
        synchronize_rcu();
}
4848
/**
 *      unregister_netdevice - remove device from the kernel
 *      @dev: device
 *
 *      Shuts down a device interface and removes it from the kernel
 *      tables. The registration is rolled back immediately; the rest of
 *      the teardown is queued on the todo list and completes once the
 *      rtnl semaphore is released.
 *
 *      Callers must hold the rtnl semaphore.  You may want
 *      unregister_netdev() instead of this.
 */

void unregister_netdevice(struct net_device *dev)
{
        ASSERT_RTNL();

        rollback_registered(dev);

        /* The remainder of the unregister runs after the unlock. */
        net_set_todo(dev);
}
4868
/**
 *      unregister_netdev - remove device from the kernel
 *      @dev: device
 *
 *      Shuts down a device interface and removes it from the kernel
 *      tables.
 *
 *      Convenience wrapper that takes the rtnl semaphore around
 *      unregister_netdevice(). In general this is the function to use,
 *      not unregister_netdevice().
 */
void unregister_netdev(struct net_device *dev)
{
        rtnl_lock();
        unregister_netdevice(dev);
        rtnl_unlock();
}

EXPORT_SYMBOL(unregister_netdev);
4888
/**
 *      dev_change_net_namespace - move device to different network namespace
 *      @dev: device
 *      @net: network namespace
 *      @pat: If not NULL name pattern to try if the current device name
 *            is already taken in the destination network namespace.
 *
 *      This function shuts down a device interface and moves it
 *      to a new network namespace. On success 0 is returned, on
 *      a failure a negative errno code is returned.
 *
 *      -EINVAL is returned for namespace-local devices, for devices
 *      with a parent device when sysfs is enabled, and for devices that
 *      are not registered. -EEXIST is returned when no usable name can
 *      be found in the destination namespace. Moving a device into the
 *      namespace it already lives in succeeds without doing anything.
 *
 *      Callers must hold the rtnl semaphore.
 */

int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
{
        char buf[IFNAMSIZ];
        const char *destname;
        int err;

        ASSERT_RTNL();

        /* Don't allow namespace local devices to be moved. */
        err = -EINVAL;
        if (dev->features & NETIF_F_NETNS_LOCAL)
                goto out;

#ifdef CONFIG_SYSFS
        /* Don't allow real devices to be moved when sysfs
         * is enabled.
         */
        err = -EINVAL;
        if (dev->dev.parent)
                goto out;
#endif

        /* Ensure the device has been registered */
        err = -EINVAL;
        if (dev->reg_state != NETREG_REGISTERED)
                goto out;

        /* Get out if there is nothing to do */
        err = 0;
        if (net_eq(dev_net(dev), net))
                goto out;

        /* Pick the destination device name, and ensure
         * we can use it in the destination network namespace.
         */
        err = -EEXIST;
        destname = dev->name;
        if (__dev_get_by_name(net, destname)) {
                /* We get here if we can't use the current device name */
                if (!pat)
                        goto out;
                if (!dev_valid_name(pat))
                        goto out;
                if (strchr(pat, '%')) {
                        /* Expand the pattern into a free name in @net. */
                        if (__dev_alloc_name(net, pat, buf) < 0)
                                goto out;
                        destname = buf;
                } else
                        destname = pat;
                /* The fallback name must be free as well. */
                if (__dev_get_by_name(net, destname))
                        goto out;
        }

        /*
         * And now a mini version of register_netdevice unregister_netdevice.
         * There is no failure exit from here on.
         */

        /* If device is running close it first. */
        dev_close(dev);

        /* And unlink it from device chain */
        /* NOTE(review): this -ENODEV is overwritten before any return. */
        err = -ENODEV;
        unlist_netdevice(dev);

        synchronize_net();

        /* Shutdown queueing discipline. */
        dev_shutdown(dev);

        /* Notify protocols, that we are about to destroy
           this device. They should clean all the things.
        */
        call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

        /*
         *      Flush the unicast and multicast chains
         */
        dev_addr_discard(dev);

        netdev_unregister_kobject(dev);

        /* Actually switch the network namespace */
        dev_net_set(dev, net);

        /* Assign the new device name */
        if (destname != dev->name)
                strcpy(dev->name, destname);

        /* If there is an ifindex conflict assign a new one */
        if (__dev_get_by_index(net, dev->ifindex)) {
                /* Keep iflink following ifindex if the two were equal. */
                int iflink = (dev->iflink == dev->ifindex);
                dev->ifindex = dev_new_index(net);
                if (iflink)
                        dev->iflink = dev->ifindex;
        }

        /* Fixup kobjects */
        /* A failure here is only warned about; 0 is still returned below. */
        err = netdev_register_kobject(dev);
        WARN_ON(err);

        /* Add the device back in the hashes */
        list_netdevice(dev);

        /* Notify protocols, that a new device appeared. */
        call_netdevice_notifiers(NETDEV_REGISTER, dev);

        synchronize_net();
        err = 0;
out:
        return err;
}
5014
/*
 * CPU notifier callback. Only CPU_DEAD and CPU_DEAD_FROZEN are acted on;
 * any other action returns NOTIFY_OK straight away.
 *
 * @ocpu carries the number of the CPU the event refers to, cast into a
 * pointer. The softnet_data of that CPU is drained into the softnet_data
 * of the CPU running this callback: its completion queue and output queue
 * are appended to the local ones, NET_TX_SOFTIRQ is raised, and any
 * packets left on its input queue are fed back through netif_rx().
 *
 * @nfb is unused. Always returns NOTIFY_OK.
 */
static int dev_cpu_callback(struct notifier_block *nfb,
                            unsigned long action,
                            void *ocpu)
{
        struct sk_buff **list_skb;
        struct Qdisc **list_net;
        struct sk_buff *skb;
        unsigned int cpu, oldcpu = (unsigned long)ocpu;
        struct softnet_data *sd, *oldsd;

        if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
                return NOTIFY_OK;

        /* Both queue splices below run with local interrupts disabled. */
        local_irq_disable();
        cpu = smp_processor_id();
        sd = &per_cpu(softnet_data, cpu);
        oldsd = &per_cpu(softnet_data, oldcpu);

        /* Find end of our completion_queue. */
        list_skb = &sd->completion_queue;
        while (*list_skb)
                list_skb = &(*list_skb)->next;
        /* Append completion queue from offline CPU. */
        *list_skb = oldsd->completion_queue;
        oldsd->completion_queue = NULL;

        /* Find end of our output_queue. */
        list_net = &sd->output_queue;
        while (*list_net)
                list_net = &(*list_net)->next_sched;
        /* Append output queue from offline CPU. */
        *list_net = oldsd->output_queue;
        oldsd->output_queue = NULL;

        /* Have the TX softirq process the queues we just took over. */
        raise_softirq_irqoff(NET_TX_SOFTIRQ);
        local_irq_enable();

        /* Process offline CPU's input_pkt_queue */
        while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
                netif_rx(skb);

        return NOTIFY_OK;
}
5058
5059
5060 /**
5061  *      netdev_increment_features - increment feature set by one
5062  *      @all: current feature set
5063  *      @one: new feature set
5064  *      @mask: mask feature set
5065  *
5066  *      Computes a new feature set after adding a device with feature set
5067  *      @one to the master device with current feature set @all.  Will not
5068  *      enable anything that is off in @mask. Returns the new feature set.
5069  */
5070 unsigned long netdev_increment_features(unsigned long all, unsigned long one,
5071                                         unsigned long mask)
5072 {
5073         /* If device needs checksumming, downgrade to it. */
5074         if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
5075                 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
5076         else if (mask & NETIF_F_ALL_CSUM) {
5077                 /* If one device supports v4/v6 checksumming, set for all. */
5078                 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
5079                     !(all & NETIF_F_GEN_CSUM)) {
5080                         all &= ~NETIF_F_ALL_CSUM;
5081                         all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
5082                 }
5083
5084                 /* If one device supports hw checksumming, set for all. */
5085                 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
5086                         all &= ~NETIF_F_ALL_CSUM;
5087                         all |= NETIF_F_HW_CSUM;
5088                 }
5089         }
5090
5091         one |= NETIF_F_ALL_CSUM;
5092
5093         one |= all & NETIF_F_ONE_FOR_ALL;
5094         all &= one | NETIF_F_LLTX | NETIF_F_GSO;
5095         all |= one & mask & NETIF_F_ONE_FOR_ALL;
5096
5097         return all;
5098 }
5099 EXPORT_SYMBOL(netdev_increment_features);
5100
5101 static struct hlist_head *netdev_create_hash(void)
5102 {
5103         int i;
5104         struct hlist_head *hash;
5105
5106         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
5107         if (hash != NULL)
5108                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
5109                         INIT_HLIST_HEAD(&hash[i]);
5110
5111         return hash;
5112 }
5113
5114 /* Initialize per network namespace state */
5115 static int __net_init netdev_init(struct net *net)
5116 {
5117         INIT_LIST_HEAD(&net->dev_base_head);
5118
5119         net->dev_name_head = netdev_create_hash();
5120         if (net->dev_name_head == NULL)
5121                 goto err_name;
5122
5123         net->dev_index_head = netdev_create_hash();
5124         if (net->dev_index_head == NULL)
5125                 goto err_idx;
5126
5127         return 0;
5128
5129 err_idx:
5130         kfree(net->dev_name_head);
5131 err_name:
5132         return -ENOMEM;
5133 }
5134
5135 /**
5136  *      netdev_drivername - network driver for the device
5137  *      @dev: network device
5138  *      @buffer: buffer for resulting name
5139  *      @len: size of buffer
5140  *
5141  *      Determine network driver for device.
5142  */
5143 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
5144 {
5145         const struct device_driver *driver;
5146         const struct device *parent;
5147
5148         if (len <= 0 || !buffer)
5149                 return buffer;
5150         buffer[0] = 0;
5151
5152         parent = dev->dev.parent;
5153
5154         if (!parent)
5155                 return buffer;
5156
5157         driver = parent->driver;
5158         if (driver && driver->name)
5159                 strlcpy(buffer, driver->name, len);
5160         return buffer;
5161 }
5162
5163 static void __net_exit netdev_exit(struct net *net)
5164 {
5165         kfree(net->dev_name_head);
5166         kfree(net->dev_index_head);
5167 }
5168
/*
 * Per network namespace hooks for the core device state: netdev_init()
 * allocates the name/index hash tables, netdev_exit() frees them.
 */
static struct pernet_operations __net_initdata netdev_net_ops = {
        .init = netdev_init,
        .exit = netdev_exit,
};
5173
/*
 * Namespace exit hook: empty @net of every device that is not marked
 * NETIF_F_NETNS_LOCAL. Devices whose rtnl_link_ops provide ->dellink()
 * are deleted through it; all others are moved to init_net, with
 * "dev<ifindex>" as the fallback name. A failed move is fatal (BUG).
 * Runs under the rtnl semaphore.
 */
static void __net_exit default_device_exit(struct net *net)
{
        struct net_device *dev;
        /*
         * Push all migratable network devices back to the
         * initial network namespace
         */
        rtnl_lock();
restart:
        /*
         * The walk starts over after every deletion or move, presumably
         * because the device list has been modified underneath it.
         */
        for_each_netdev(net, dev) {
                int err;
                char fb_name[IFNAMSIZ];

                /* Ignore unmoveable devices (i.e. loopback) */
                if (dev->features & NETIF_F_NETNS_LOCAL)
                        continue;

                /* Delete virtual devices */
                if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) {
                        dev->rtnl_link_ops->dellink(dev);
                        goto restart;
                }

                /* Push remaining network devices to init_net */
                snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
                err = dev_change_net_namespace(dev, &init_net, fb_name);
                if (err) {
                        printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
                                __func__, dev->name, err);
                        BUG();
                }
                goto restart;
        }
        rtnl_unlock();
}
5209
5210 static struct pernet_operations __net_initdata default_device_ops = {
5211         .exit = default_device_exit,
5212 };
5213
5214 /*
5215  *      Initialize the DEV module. At boot time this walks the device list and
5216  *      unhooks any devices that fail to initialise (normally hardware not
5217  *      present) and leaves us with a valid list of present and active devices.
5218  *
5219  */
5220
5221 /*
5222  *       This is called single threaded during boot, so no need
5223  *       to take the rtnl semaphore.
5224  */
5225 static int __init net_dev_init(void)
5226 {
5227         int i, rc = -ENOMEM;
5228
5229         BUG_ON(!dev_boot_phase);
5230
5231         if (dev_proc_init())
5232                 goto out;
5233
5234         if (netdev_kobject_init())
5235                 goto out;
5236
5237         INIT_LIST_HEAD(&ptype_all);
5238         for (i = 0; i < PTYPE_HASH_SIZE; i++)
5239                 INIT_LIST_HEAD(&ptype_base[i]);
5240
5241         if (register_pernet_subsys(&netdev_net_ops))
5242                 goto out;
5243
5244         /*
5245          *      Initialise the packet receive queues.
5246          */
5247
5248         for_each_possible_cpu(i) {
5249                 struct softnet_data *queue;
5250
5251                 queue = &per_cpu(softnet_data, i);
5252                 skb_queue_head_init(&queue->input_pkt_queue);
5253                 queue->completion_queue = NULL;
5254                 INIT_LIST_HEAD(&queue->poll_list);
5255
5256                 queue->backlog.poll = process_backlog;
5257                 queue->backlog.weight = weight_p;
5258                 queue->backlog.gro_list = NULL;
5259                 queue->backlog.gro_count = 0;
5260         }
5261
5262         dev_boot_phase = 0;
5263
5264         /* The loopback device is special if any other network devices
5265          * is present in a network namespace the loopback device must
5266          * be present. Since we now dynamically allocate and free the
5267          * loopback device ensure this invariant is maintained by
5268          * keeping the loopback device as the first device on the
5269          * list of network devices.  Ensuring the loopback devices
5270          * is the first device that appears and the last network device
5271          * that disappears.
5272          */
5273         if (register_pernet_device(&loopback_net_ops))
5274                 goto out;
5275
5276         if (register_pernet_device(&default_device_ops))
5277                 goto out;
5278
5279         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
5280         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
5281
5282         hotcpu_notifier(dev_cpu_callback, 0);
5283         dst_init();
5284         dev_mcast_init();
5285         rc = 0;
5286 out:
5287         return rc;
5288 }
5289
5290 subsys_initcall(net_dev_init);
5291
/* Symbols exported from this file, kept in alphabetical order. */
EXPORT_SYMBOL(__dev_get_by_index);
EXPORT_SYMBOL(__dev_get_by_name);
EXPORT_SYMBOL(__dev_remove_pack);
EXPORT_SYMBOL(dev_add_pack);
EXPORT_SYMBOL(dev_alloc_name);
EXPORT_SYMBOL(dev_change_flags);
EXPORT_SYMBOL(dev_close);
EXPORT_SYMBOL(dev_get_by_flags);
EXPORT_SYMBOL(dev_get_by_index);
EXPORT_SYMBOL(dev_get_by_name);
EXPORT_SYMBOL(dev_get_flags);
EXPORT_SYMBOL(dev_load);
EXPORT_SYMBOL(dev_open);
EXPORT_SYMBOL(dev_queue_xmit);
EXPORT_SYMBOL(dev_remove_pack);
EXPORT_SYMBOL(dev_set_allmulti);
EXPORT_SYMBOL(dev_set_mac_address);
EXPORT_SYMBOL(dev_set_mtu);
EXPORT_SYMBOL(dev_set_promiscuity);
EXPORT_SYMBOL(dev_valid_name);
EXPORT_SYMBOL(free_netdev);
EXPORT_SYMBOL(net_disable_timestamp);
EXPORT_SYMBOL(net_enable_timestamp);
EXPORT_SYMBOL(netdev_boot_setup_check);
EXPORT_SYMBOL(netdev_set_master);
EXPORT_SYMBOL(netdev_state_change);
EXPORT_SYMBOL(netif_receive_skb);
EXPORT_SYMBOL(netif_rx);
EXPORT_SYMBOL(register_gifconf);
EXPORT_SYMBOL(register_netdevice);
EXPORT_SYMBOL(register_netdevice_notifier);
EXPORT_SYMBOL(skb_checksum_help);
EXPORT_SYMBOL(synchronize_net);
EXPORT_SYMBOL(unregister_netdevice);
EXPORT_SYMBOL(unregister_netdevice_notifier);

/* Bridge hooks are exported only when the bridge is built in or modular. */
#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
EXPORT_SYMBOL(br_fdb_get_hook);
EXPORT_SYMBOL(br_fdb_put_hook);
EXPORT_SYMBOL(br_handle_frame_hook);
#endif

/* Per-cpu data initialised by net_dev_init(). */
EXPORT_PER_CPU_SYMBOL(softnet_data);