net/core/dev.c
1 /*
2 * NET3 Protocol independent device support routines.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Derived from the non IP parts of dev.c 1.0.19
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
14 * Additional Authors:
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
22 * Changes:
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
34 * drivers
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
44 * call a packet.
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
50 * changes.
51 * Rudi Cilibrasi : Pass the right thing to
52 * set_mac_address()
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
58 * 1 device.
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
66 * the backlog queue.
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
73 */
75 #include <asm/uaccess.h>
76 #include <asm/system.h>
77 #include <linux/bitops.h>
78 #include <linux/capability.h>
79 #include <linux/cpu.h>
80 #include <linux/types.h>
81 #include <linux/kernel.h>
82 #include <linux/sched.h>
83 #include <linux/mutex.h>
84 #include <linux/string.h>
85 #include <linux/mm.h>
86 #include <linux/socket.h>
87 #include <linux/sockios.h>
88 #include <linux/errno.h>
89 #include <linux/interrupt.h>
90 #include <linux/if_ether.h>
91 #include <linux/netdevice.h>
92 #include <linux/etherdevice.h>
93 #include <linux/ethtool.h>
94 #include <linux/notifier.h>
95 #include <linux/skbuff.h>
96 #include <net/net_namespace.h>
97 #include <net/sock.h>
98 #include <linux/rtnetlink.h>
99 #include <linux/proc_fs.h>
100 #include <linux/seq_file.h>
101 #include <linux/stat.h>
102 #include <linux/if_bridge.h>
103 #include <linux/if_macvlan.h>
104 #include <net/dst.h>
105 #include <net/pkt_sched.h>
106 #include <net/checksum.h>
107 #include <linux/highmem.h>
108 #include <linux/init.h>
109 #include <linux/kmod.h>
110 #include <linux/module.h>
111 #include <linux/netpoll.h>
112 #include <linux/rcupdate.h>
113 #include <linux/delay.h>
114 #include <net/wext.h>
115 #include <net/iw_handler.h>
116 #include <asm/current.h>
117 #include <linux/audit.h>
118 #include <linux/dmaengine.h>
119 #include <linux/err.h>
120 #include <linux/ctype.h>
121 #include <linux/if_arp.h>
122 #include <linux/if_vlan.h>
123 #include <linux/ip.h>
124 #include <net/ip.h>
125 #include <linux/ipv6.h>
126 #include <linux/in.h>
127 #include <linux/jhash.h>
128 #include <linux/random.h>
130 #include "net-sysfs.h"
133 * The list of packet types we will receive (as opposed to discard)
134 * and the routines to invoke.
136 * Why 16? Because with 16 the only overlap we get on a hash of the
137 * low nibble of the protocol value is RARP/SNAP/X.25.
139 * NOTE: That is no longer true with the addition of VLAN tags. Not
140 * sure which should go first, but I bet it won't make much
141 * difference if we are running VLANs. The good news is that
142 * this protocol won't be in the list unless compiled in, so
143 * the average user (w/out VLANs) will not be adversely affected.
144 * --BLG
146 * 0800 IP
147 * 8100 802.1Q VLAN
148 * 0001 802.3
149 * 0002 AX.25
150 * 0004 802.2
151 * 8035 RARP
152 * 0005 SNAP
153 * 0805 X.25
154 * 0806 ARP
155 * 8137 IPX
156 * 0009 Localtalk
157 * 86DD IPv6
160 #define PTYPE_HASH_SIZE (16)
161 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
163 static DEFINE_SPINLOCK(ptype_lock);
164 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
165 static struct list_head ptype_all __read_mostly; /* Taps */
167 #ifdef CONFIG_NET_DMA
168 struct net_dma {
169 struct dma_client client;
170 spinlock_t lock;
171 cpumask_t channel_mask;
172 struct dma_chan **channels;
175 static enum dma_state_client
176 netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
177 enum dma_state state);
179 static struct net_dma net_dma = {
180 .client = {
181 .event_callback = netdev_dma_event,
184 #endif
187 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
188 * semaphore.
190 * Pure readers hold dev_base_lock for reading.
192 * Writers must hold the rtnl semaphore while they loop through the
193 * dev_base_head list, and hold dev_base_lock for writing when they do the
194 * actual updates. This allows pure readers to access the list even
195 * while a writer is preparing to update it.
197 * To put it another way, dev_base_lock is held for writing only to
198 * protect against pure readers; the rtnl semaphore provides the
199 * protection against other writers.
201 * See, for example usages, register_netdevice() and
202 * unregister_netdevice(), which must be called with the rtnl
203 * semaphore held.
205 DEFINE_RWLOCK(dev_base_lock);
207 EXPORT_SYMBOL(dev_base_lock);
209 #define NETDEV_HASHBITS 8
210 #define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)
212 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
214 unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
215 return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)];
218 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
220 return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
223 /* Device list insertion */
224 static int list_netdevice(struct net_device *dev)
226 struct net *net = dev_net(dev);
228 ASSERT_RTNL();
230 write_lock_bh(&dev_base_lock);
231 list_add_tail(&dev->dev_list, &net->dev_base_head);
232 hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
233 hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex));
234 write_unlock_bh(&dev_base_lock);
235 return 0;
238 /* Device list removal */
239 static void unlist_netdevice(struct net_device *dev)
241 ASSERT_RTNL();
243 /* Unlink dev from the device chain */
244 write_lock_bh(&dev_base_lock);
245 list_del(&dev->dev_list);
246 hlist_del(&dev->name_hlist);
247 hlist_del(&dev->index_hlist);
248 write_unlock_bh(&dev_base_lock);
252 * Our notifier list
255 static RAW_NOTIFIER_HEAD(netdev_chain);
258 * Device drivers call our routines to queue packets here. We empty the
259 * queue in the local softnet handler.
262 DEFINE_PER_CPU(struct softnet_data, softnet_data);
264 #ifdef CONFIG_LOCKDEP
266 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
267 * according to dev->type
269 static const unsigned short netdev_lock_type[] =
270 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
271 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
272 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
273 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
274 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
275 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
276 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
277 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
278 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
279 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
280 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
281 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
282 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
283 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_VOID,
284 ARPHRD_NONE};
286 static const char *netdev_lock_name[] =
287 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
288 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
289 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
290 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
291 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
292 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
293 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
294 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
295 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
296 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
297 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
298 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
299 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
300 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_VOID",
301 "_xmit_NONE"};
303 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
304 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
306 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
308 int i;
310 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
311 if (netdev_lock_type[i] == dev_type)
312 return i;
313 /* the last key is used by default */
314 return ARRAY_SIZE(netdev_lock_type) - 1;
317 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
318 unsigned short dev_type)
320 int i;
322 i = netdev_lock_pos(dev_type);
323 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
324 netdev_lock_name[i]);
327 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
329 int i;
331 i = netdev_lock_pos(dev->type);
332 lockdep_set_class_and_name(&dev->addr_list_lock,
333 &netdev_addr_lock_key[i],
334 netdev_lock_name[i]);
336 #else
337 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
338 unsigned short dev_type)
341 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
344 #endif
346 /*******************************************************************************
348 Protocol management and registration routines
350 *******************************************************************************/
353 * Add a protocol ID to the list. Now that the input handler is
354 * smarter we can dispense with all the messy stuff that used to be
355 * here.
357 * BEWARE!!! Protocol handlers, mangling input packets,
358 * MUST BE last in hash buckets and checking protocol handlers
359 * MUST start from promiscuous ptype_all chain in net_bh.
360 * It is true now, do not change it.
361 * Explanation follows: if protocol handler, mangling packet, will
362 * be the first on list, it is not able to sense, that packet
363 * is cloned and should be copied-on-write, so that it will
364 * change it and subsequent readers will get broken packet.
365 * --ANK (980803)
369 * dev_add_pack - add packet handler
370 * @pt: packet type declaration
372 * Add a protocol handler to the networking stack. The passed &packet_type
373 * is linked into kernel lists and may not be freed until it has been
374 * removed from the kernel lists.
376 * This call does not sleep, therefore it cannot
377 * guarantee that all CPUs that are in the middle of receiving
378 * packets will see the new packet type (until the next received packet).
379 */
381 void dev_add_pack(struct packet_type *pt)
383 int hash;
385 spin_lock_bh(&ptype_lock);
386 if (pt->type == htons(ETH_P_ALL))
387 list_add_rcu(&pt->list, &ptype_all);
388 else {
389 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
390 list_add_rcu(&pt->list, &ptype_base[hash]);
392 spin_unlock_bh(&ptype_lock);
396 * __dev_remove_pack - remove packet handler
397 * @pt: packet type declaration
399 * Remove a protocol handler that was previously added to the kernel
400 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
401 * from the kernel lists and can be freed or reused once this function
402 * returns.
404 * The packet type might still be in use by receivers
405 * and must not be freed until after all the CPUs have gone
406 * through a quiescent state.
408 void __dev_remove_pack(struct packet_type *pt)
410 struct list_head *head;
411 struct packet_type *pt1;
413 spin_lock_bh(&ptype_lock);
415 if (pt->type == htons(ETH_P_ALL))
416 head = &ptype_all;
417 else
418 head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
420 list_for_each_entry(pt1, head, list) {
421 if (pt == pt1) {
422 list_del_rcu(&pt->list);
423 goto out;
427 printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
428 out:
429 spin_unlock_bh(&ptype_lock);
432 * dev_remove_pack - remove packet handler
433 * @pt: packet type declaration
435 * Remove a protocol handler that was previously added to the kernel
436 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
437 * from the kernel lists and can be freed or reused once this function
438 * returns.
440 * This call sleeps to guarantee that no CPU is looking at the packet
441 * type after return.
443 void dev_remove_pack(struct packet_type *pt)
445 __dev_remove_pack(pt);
447 synchronize_net();
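/*
 * Illustrative sketch (editor's addition, not part of the original source):
 * how a protocol module would typically register and later remove a handler
 * with dev_add_pack()/dev_remove_pack(). The names my_rcv and my_packet_type
 * are placeholders, not real kernel symbols.
 *
 *	static int my_rcv(struct sk_buff *skb, struct net_device *dev,
 *			  struct packet_type *pt, struct net_device *orig_dev)
 *	{
 *		... process the packet ...
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_packet_type __read_mostly = {
 *		.type	= cpu_to_be16(ETH_P_IP),
 *		.func	= my_rcv,
 *	};
 *
 *	dev_add_pack(&my_packet_type);		(module init)
 *	dev_remove_pack(&my_packet_type);	(module exit)
 */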
450 /******************************************************************************
452 Device Boot-time Settings Routines
454 *******************************************************************************/
456 /* Boot time configuration table */
457 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
460 * netdev_boot_setup_add - add new setup entry
461 * @name: name of the device
462 * @map: configured settings for the device
464 * Adds new setup entry to the dev_boot_setup list. The function
465 * returns 0 on error and 1 on success. This is a generic routine for
466 * all netdevices.
467 */
468 static int netdev_boot_setup_add(char *name, struct ifmap *map)
470 struct netdev_boot_setup *s;
471 int i;
473 s = dev_boot_setup;
474 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
475 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
476 memset(s[i].name, 0, sizeof(s[i].name));
477 strlcpy(s[i].name, name, IFNAMSIZ);
478 memcpy(&s[i].map, map, sizeof(s[i].map));
479 break;
483 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
487 * netdev_boot_setup_check - check boot time settings
488 * @dev: the netdevice
490 * Check boot time settings for the device.
491 * The found settings are set for the device to be used
492 * later in the device probing.
493 * Returns 0 if no settings found, 1 if they are.
495 int netdev_boot_setup_check(struct net_device *dev)
497 struct netdev_boot_setup *s = dev_boot_setup;
498 int i;
500 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
501 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
502 !strcmp(dev->name, s[i].name)) {
503 dev->irq = s[i].map.irq;
504 dev->base_addr = s[i].map.base_addr;
505 dev->mem_start = s[i].map.mem_start;
506 dev->mem_end = s[i].map.mem_end;
507 return 1;
510 return 0;
515 * netdev_boot_base - get address from boot time settings
516 * @prefix: prefix for network device
517 * @unit: id for network device
519 * Check boot time settings for the base address of device.
520 * The found settings are set for the device to be used
521 * later in the device probing.
522 * Returns 0 if no settings found.
524 unsigned long netdev_boot_base(const char *prefix, int unit)
526 const struct netdev_boot_setup *s = dev_boot_setup;
527 char name[IFNAMSIZ];
528 int i;
530 sprintf(name, "%s%d", prefix, unit);
533 * If device already registered then return base of 1
534 * to indicate not to probe for this interface
536 if (__dev_get_by_name(&init_net, name))
537 return 1;
539 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
540 if (!strcmp(name, s[i].name))
541 return s[i].map.base_addr;
542 return 0;
546 * Saves at boot time configured settings for any netdevice.
548 int __init netdev_boot_setup(char *str)
550 int ints[5];
551 struct ifmap map;
553 str = get_options(str, ARRAY_SIZE(ints), ints);
554 if (!str || !*str)
555 return 0;
557 /* Save settings */
558 memset(&map, 0, sizeof(map));
559 if (ints[0] > 0)
560 map.irq = ints[1];
561 if (ints[0] > 1)
562 map.base_addr = ints[2];
563 if (ints[0] > 2)
564 map.mem_start = ints[3];
565 if (ints[0] > 3)
566 map.mem_end = ints[4];
568 /* Add new entry to the list */
569 return netdev_boot_setup_add(str, &map);
572 __setup("netdev=", netdev_boot_setup);
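/*
 * Example (editor's addition, assumed typical usage): the "netdev=" boot
 * parameter takes up to four integers (irq, base_addr, mem_start, mem_end)
 * followed by the interface name, e.g. on the kernel command line:
 *
 *	netdev=9,0x340,0,0,eth1
 *
 * which records IRQ 9 and I/O base 0x340 for eth1 so that
 * netdev_boot_setup_check() can apply them later during probing.
 */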
574 /*******************************************************************************
576 Device Interface Subroutines
578 *******************************************************************************/
581 * __dev_get_by_name - find a device by its name
582 * @net: the applicable net namespace
583 * @name: name to find
585 * Find an interface by name. Must be called under RTNL semaphore
586 * or @dev_base_lock. If the name is found a pointer to the device
587 * is returned. If the name is not found then %NULL is returned. The
588 * reference counters are not incremented so the caller must be
589 * careful with locks.
592 struct net_device *__dev_get_by_name(struct net *net, const char *name)
594 struct hlist_node *p;
596 hlist_for_each(p, dev_name_hash(net, name)) {
597 struct net_device *dev
598 = hlist_entry(p, struct net_device, name_hlist);
599 if (!strncmp(dev->name, name, IFNAMSIZ))
600 return dev;
602 return NULL;
606 * dev_get_by_name - find a device by its name
607 * @net: the applicable net namespace
608 * @name: name to find
610 * Find an interface by name. This can be called from any
611 * context and does its own locking. The returned handle has
612 * the usage count incremented and the caller must use dev_put() to
613 * release it when it is no longer needed. %NULL is returned if no
614 * matching device is found.
617 struct net_device *dev_get_by_name(struct net *net, const char *name)
619 struct net_device *dev;
621 read_lock(&dev_base_lock);
622 dev = __dev_get_by_name(net, name);
623 if (dev)
624 dev_hold(dev);
625 read_unlock(&dev_base_lock);
626 return dev;
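/*
 * Illustrative sketch (editor's addition, not part of the original source):
 * the reference taken by dev_get_by_name() must be dropped with dev_put()
 * when the caller is done. The name "eth0" is only an example; net is
 * whichever namespace the caller operates in (e.g. &init_net).
 *
 *	struct net_device *dev;
 *
 *	dev = dev_get_by_name(net, "eth0");
 *	if (dev) {
 *		... use dev ...
 *		dev_put(dev);
 *	}
 */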
630 * __dev_get_by_index - find a device by its ifindex
631 * @net: the applicable net namespace
632 * @ifindex: index of device
634 * Search for an interface by index. Returns %NULL if the device
635 * is not found or a pointer to the device. The device has not
636 * had its reference counter increased so the caller must be careful
637 * about locking. The caller must hold either the RTNL semaphore
638 * or @dev_base_lock.
641 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
643 struct hlist_node *p;
645 hlist_for_each(p, dev_index_hash(net, ifindex)) {
646 struct net_device *dev
647 = hlist_entry(p, struct net_device, index_hlist);
648 if (dev->ifindex == ifindex)
649 return dev;
651 return NULL;
656 * dev_get_by_index - find a device by its ifindex
657 * @net: the applicable net namespace
658 * @ifindex: index of device
660 * Search for an interface by index. Returns NULL if the device
661 * is not found or a pointer to the device. The device returned has
662 * had a reference added and the pointer is safe until the user calls
663 * dev_put to indicate they have finished with it.
666 struct net_device *dev_get_by_index(struct net *net, int ifindex)
668 struct net_device *dev;
670 read_lock(&dev_base_lock);
671 dev = __dev_get_by_index(net, ifindex);
672 if (dev)
673 dev_hold(dev);
674 read_unlock(&dev_base_lock);
675 return dev;
679 * dev_getbyhwaddr - find a device by its hardware address
680 * @net: the applicable net namespace
681 * @type: media type of device
682 * @ha: hardware address
684 * Search for an interface by MAC address. Returns NULL if the device
685 * is not found or a pointer to the device. The caller must hold the
686 * rtnl semaphore. The returned device has not had its ref count increased
687 * and the caller must therefore be careful about locking
689 * BUGS:
690 * If the API was consistent this would be __dev_get_by_hwaddr
693 struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
695 struct net_device *dev;
697 ASSERT_RTNL();
699 for_each_netdev(net, dev)
700 if (dev->type == type &&
701 !memcmp(dev->dev_addr, ha, dev->addr_len))
702 return dev;
704 return NULL;
707 EXPORT_SYMBOL(dev_getbyhwaddr);
709 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
711 struct net_device *dev;
713 ASSERT_RTNL();
714 for_each_netdev(net, dev)
715 if (dev->type == type)
716 return dev;
718 return NULL;
721 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
723 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
725 struct net_device *dev;
727 rtnl_lock();
728 dev = __dev_getfirstbyhwtype(net, type);
729 if (dev)
730 dev_hold(dev);
731 rtnl_unlock();
732 return dev;
735 EXPORT_SYMBOL(dev_getfirstbyhwtype);
738 * dev_get_by_flags - find any device with given flags
739 * @net: the applicable net namespace
740 * @if_flags: IFF_* values
741 * @mask: bitmask of bits in if_flags to check
743 * Search for any interface with the given flags. Returns NULL if a device
744 * is not found or a pointer to the device. The device returned has
745 * had a reference added and the pointer is safe until the user calls
746 * dev_put to indicate they have finished with it.
749 struct net_device * dev_get_by_flags(struct net *net, unsigned short if_flags, unsigned short mask)
751 struct net_device *dev, *ret;
753 ret = NULL;
754 read_lock(&dev_base_lock);
755 for_each_netdev(net, dev) {
756 if (((dev->flags ^ if_flags) & mask) == 0) {
757 dev_hold(dev);
758 ret = dev;
759 break;
762 read_unlock(&dev_base_lock);
763 return ret;
767 * dev_valid_name - check if name is okay for network device
768 * @name: name string
770 * Network device names need to be valid file names
771 * to allow sysfs to work. We also disallow any kind of
772 * whitespace.
774 int dev_valid_name(const char *name)
776 if (*name == '\0')
777 return 0;
778 if (strlen(name) >= IFNAMSIZ)
779 return 0;
780 if (!strcmp(name, ".") || !strcmp(name, ".."))
781 return 0;
783 while (*name) {
784 if (*name == '/' || isspace(*name))
785 return 0;
786 name++;
788 return 1;
792 * __dev_alloc_name - allocate a name for a device
793 * @net: network namespace to allocate the device name in
794 * @name: name format string
795 * @buf: scratch buffer and result name string
797 * Passed a format string - eg "lt%d" it will try and find a suitable
798 * id. It scans list of devices to build up a free map, then chooses
799 * the first empty slot. The caller must hold the dev_base or rtnl lock
800 * while allocating the name and adding the device in order to avoid
801 * duplicates.
802 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
803 * Returns the number of the unit assigned or a negative errno code.
806 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
808 int i = 0;
809 const char *p;
810 const int max_netdevices = 8*PAGE_SIZE;
811 unsigned long *inuse;
812 struct net_device *d;
814 p = strnchr(name, IFNAMSIZ-1, '%');
815 if (p) {
817 * Verify the string as this thing may have come from
818 * the user. There must be either one "%d" and no other "%"
819 * characters.
821 if (p[1] != 'd' || strchr(p + 2, '%'))
822 return -EINVAL;
824 /* Use one page as a bit array of possible slots */
825 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
826 if (!inuse)
827 return -ENOMEM;
829 for_each_netdev(net, d) {
830 if (!sscanf(d->name, name, &i))
831 continue;
832 if (i < 0 || i >= max_netdevices)
833 continue;
835 /* avoid cases where sscanf is not exact inverse of printf */
836 snprintf(buf, IFNAMSIZ, name, i);
837 if (!strncmp(buf, d->name, IFNAMSIZ))
838 set_bit(i, inuse);
841 i = find_first_zero_bit(inuse, max_netdevices);
842 free_page((unsigned long) inuse);
845 snprintf(buf, IFNAMSIZ, name, i);
846 if (!__dev_get_by_name(net, buf))
847 return i;
849 /* It is possible to run out of possible slots
850 * when the name is long and there isn't enough space left
851 * for the digits, or if all bits are used.
853 return -ENFILE;
857 * dev_alloc_name - allocate a name for a device
858 * @dev: device
859 * @name: name format string
861 * Passed a format string - eg "lt%d" it will try and find a suitable
862 * id. It scans list of devices to build up a free map, then chooses
863 * the first empty slot. The caller must hold the dev_base or rtnl lock
864 * while allocating the name and adding the device in order to avoid
865 * duplicates.
866 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
867 * Returns the number of the unit assigned or a negative errno code.
870 int dev_alloc_name(struct net_device *dev, const char *name)
872 char buf[IFNAMSIZ];
873 struct net *net;
874 int ret;
876 BUG_ON(!dev_net(dev));
877 net = dev_net(dev);
878 ret = __dev_alloc_name(net, name, buf);
879 if (ret >= 0)
880 strlcpy(dev->name, buf, IFNAMSIZ);
881 return ret;
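/*
 * Illustrative sketch (editor's addition, not part of the original source):
 * a driver that wants an automatically numbered name can call
 * dev_alloc_name() before registration; "dummy%d" is just an example
 * format string.
 *
 *	err = dev_alloc_name(dev, "dummy%d");
 *	if (err < 0)
 *		goto out_free;
 *	err = register_netdevice(dev);
 */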
886 * dev_change_name - change name of a device
887 * @dev: device
888 * @newname: name (or format string) must be at least IFNAMSIZ
890 * Change the name of a device. A format string such as "eth%d" can be
891 * passed for wildcarding.
892 */
893 int dev_change_name(struct net_device *dev, const char *newname)
895 char oldname[IFNAMSIZ];
896 int err = 0;
897 int ret;
898 struct net *net;
900 ASSERT_RTNL();
901 BUG_ON(!dev_net(dev));
903 net = dev_net(dev);
904 if (dev->flags & IFF_UP)
905 return -EBUSY;
907 if (!dev_valid_name(newname))
908 return -EINVAL;
910 if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
911 return 0;
913 memcpy(oldname, dev->name, IFNAMSIZ);
915 if (strchr(newname, '%')) {
916 err = dev_alloc_name(dev, newname);
917 if (err < 0)
918 return err;
920 else if (__dev_get_by_name(net, newname))
921 return -EEXIST;
922 else
923 strlcpy(dev->name, newname, IFNAMSIZ);
925 rollback:
926 /* For now only devices in the initial network namespace
927 * are in sysfs.
929 if (net == &init_net) {
930 ret = device_rename(&dev->dev, dev->name);
931 if (ret) {
932 memcpy(dev->name, oldname, IFNAMSIZ);
933 return ret;
937 write_lock_bh(&dev_base_lock);
938 hlist_del(&dev->name_hlist);
939 hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
940 write_unlock_bh(&dev_base_lock);
942 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
943 ret = notifier_to_errno(ret);
945 if (ret) {
946 if (err) {
947 printk(KERN_ERR
948 "%s: name change rollback failed: %d.\n",
949 dev->name, ret);
950 } else {
951 err = ret;
952 memcpy(dev->name, oldname, IFNAMSIZ);
953 goto rollback;
957 return err;
961 * dev_set_alias - change ifalias of a device
962 * @dev: device
963 * @alias: name up to IFALIASZ
964 * @len: limit of bytes to copy from info
966 * Set ifalias for a device,
968 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
970 ASSERT_RTNL();
972 if (len >= IFALIASZ)
973 return -EINVAL;
975 if (!len) {
976 if (dev->ifalias) {
977 kfree(dev->ifalias);
978 dev->ifalias = NULL;
980 return 0;
983 dev->ifalias = krealloc(dev->ifalias, len+1, GFP_KERNEL);
984 if (!dev->ifalias)
985 return -ENOMEM;
987 strlcpy(dev->ifalias, alias, len+1);
988 return len;
993 * netdev_features_change - device changes features
994 * @dev: device to cause notification
996 * Called to indicate a device has changed features.
998 void netdev_features_change(struct net_device *dev)
1000 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1002 EXPORT_SYMBOL(netdev_features_change);
1005 * netdev_state_change - device changes state
1006 * @dev: device to cause notification
1008 * Called to indicate a device has changed state. This function calls
1009 * the notifier chains for netdev_chain and sends a NEWLINK message
1010 * to the routing socket.
1012 void netdev_state_change(struct net_device *dev)
1014 if (dev->flags & IFF_UP) {
1015 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1016 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1020 void netdev_bonding_change(struct net_device *dev)
1022 call_netdevice_notifiers(NETDEV_BONDING_FAILOVER, dev);
1024 EXPORT_SYMBOL(netdev_bonding_change);
1027 * dev_load - load a network module
1028 * @net: the applicable net namespace
1029 * @name: name of interface
1031 * If a network interface is not present and the process has suitable
1032 * privileges this function loads the module. If module loading is not
1033 * available in this kernel then it becomes a nop.
1036 void dev_load(struct net *net, const char *name)
1038 struct net_device *dev;
1040 read_lock(&dev_base_lock);
1041 dev = __dev_get_by_name(net, name);
1042 read_unlock(&dev_base_lock);
1044 if (!dev && capable(CAP_SYS_MODULE))
1045 request_module("%s", name);
1049 * dev_open - prepare an interface for use.
1050 * @dev: device to open
1052 * Takes a device from down to up state. The device's private open
1053 * function is invoked and then the multicast lists are loaded. Finally
1054 * the device is moved into the up state and a %NETDEV_UP message is
1055 * sent to the netdev notifier chain.
1057 * Calling this function on an active interface is a nop. On a failure
1058 * a negative errno code is returned.
1060 int dev_open(struct net_device *dev)
1062 const struct net_device_ops *ops = dev->netdev_ops;
1063 int ret = 0;
1065 ASSERT_RTNL();
1068 * Is it already up?
1071 if (dev->flags & IFF_UP)
1072 return 0;
1075 * Is it even present?
1077 if (!netif_device_present(dev))
1078 return -ENODEV;
1081 * Call device private open method
1083 set_bit(__LINK_STATE_START, &dev->state);
1085 if (ops->ndo_validate_addr)
1086 ret = ops->ndo_validate_addr(dev);
1088 if (!ret && ops->ndo_open)
1089 ret = ops->ndo_open(dev);
1092 * If it went open OK then:
1095 if (ret)
1096 clear_bit(__LINK_STATE_START, &dev->state);
1097 else {
1099 * Set the flags.
1101 dev->flags |= IFF_UP;
1104 * Initialize multicasting status
1106 dev_set_rx_mode(dev);
1109 * Wakeup transmit queue engine
1111 dev_activate(dev);
1114 * ... and announce new interface.
1116 call_netdevice_notifiers(NETDEV_UP, dev);
1119 return ret;
1123 * dev_close - shutdown an interface.
1124 * @dev: device to shutdown
1126 * This function moves an active device into down state. A
1127 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1128 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1129 * chain.
1131 int dev_close(struct net_device *dev)
1133 const struct net_device_ops *ops = dev->netdev_ops;
1134 ASSERT_RTNL();
1136 might_sleep();
1138 if (!(dev->flags & IFF_UP))
1139 return 0;
1142 * Tell people we are going down, so that they can
1143 * prepare for it while the device is still operating.
1144 */
1145 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1147 clear_bit(__LINK_STATE_START, &dev->state);
1149 /* Synchronize to scheduled poll. We cannot touch poll list,
1150 * it can be even on different cpu. So just clear netif_running().
1152 * dev->stop() will invoke napi_disable() on all of its
1153 * napi_struct instances on this device.
1155 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1157 dev_deactivate(dev);
1160 * Call the device specific close. This cannot fail.
1161 * Only if device is UP
1163 * We allow it to be called even after a DETACH hot-plug
1164 * event.
1166 if (ops->ndo_stop)
1167 ops->ndo_stop(dev);
1170 * Device is now down.
1173 dev->flags &= ~IFF_UP;
1176 * Tell people we are down
1178 call_netdevice_notifiers(NETDEV_DOWN, dev);
1180 return 0;
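/*
 * Illustrative sketch (editor's addition, not part of the original source):
 * both dev_open() and dev_close() assume the caller holds the RTNL
 * semaphore, so in-kernel users outside the usual ioctl/rtnetlink paths
 * bracket the calls like this:
 *
 *	rtnl_lock();
 *	err = dev_open(dev);
 *	...
 *	dev_close(dev);
 *	rtnl_unlock();
 */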
1185 * dev_disable_lro - disable Large Receive Offload on a device
1186 * @dev: device
1188 * Disable Large Receive Offload (LRO) on a net device. Must be
1189 * called under RTNL. This is needed if received packets may be
1190 * forwarded to another interface.
1192 void dev_disable_lro(struct net_device *dev)
1194 if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1195 dev->ethtool_ops->set_flags) {
1196 u32 flags = dev->ethtool_ops->get_flags(dev);
1197 if (flags & ETH_FLAG_LRO) {
1198 flags &= ~ETH_FLAG_LRO;
1199 dev->ethtool_ops->set_flags(dev, flags);
1202 WARN_ON(dev->features & NETIF_F_LRO);
1204 EXPORT_SYMBOL(dev_disable_lro);
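/*
 * Illustrative sketch (editor's addition, not part of the original source):
 * a forwarding setup such as a bridge or router calls this under RTNL when
 * it starts forwarding packets received on the device:
 *
 *	ASSERT_RTNL();
 *	dev_disable_lro(dev);
 */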
1207 static int dev_boot_phase = 1;
1210 * Device change register/unregister. These are not inline or static
1211 * as we export them to the world.
1215 * register_netdevice_notifier - register a network notifier block
1216 * @nb: notifier
1218 * Register a notifier to be called when network device events occur.
1219 * The notifier passed is linked into the kernel structures and must
1220 * not be reused until it has been unregistered. A negative errno code
1221 * is returned on a failure.
1223 * When registered all registration and up events are replayed
1224 * to the new notifier to allow device to have a race free
1225 * view of the network device list.
1228 int register_netdevice_notifier(struct notifier_block *nb)
1230 struct net_device *dev;
1231 struct net_device *last;
1232 struct net *net;
1233 int err;
1235 rtnl_lock();
1236 err = raw_notifier_chain_register(&netdev_chain, nb);
1237 if (err)
1238 goto unlock;
1239 if (dev_boot_phase)
1240 goto unlock;
1241 for_each_net(net) {
1242 for_each_netdev(net, dev) {
1243 err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1244 err = notifier_to_errno(err);
1245 if (err)
1246 goto rollback;
1248 if (!(dev->flags & IFF_UP))
1249 continue;
1251 nb->notifier_call(nb, NETDEV_UP, dev);
1255 unlock:
1256 rtnl_unlock();
1257 return err;
1259 rollback:
1260 last = dev;
1261 for_each_net(net) {
1262 for_each_netdev(net, dev) {
1263 if (dev == last)
1264 break;
1266 if (dev->flags & IFF_UP) {
1267 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1268 nb->notifier_call(nb, NETDEV_DOWN, dev);
1270 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1274 raw_notifier_chain_unregister(&netdev_chain, nb);
1275 goto unlock;
1279 * unregister_netdevice_notifier - unregister a network notifier block
1280 * @nb: notifier
1282 * Unregister a notifier previously registered by
1283 * register_netdevice_notifier(). The notifier is unlinked into the
1284 * kernel structures and may then be reused. A negative errno code
1285 * is returned on a failure.
1288 int unregister_netdevice_notifier(struct notifier_block *nb)
1290 int err;
1292 rtnl_lock();
1293 err = raw_notifier_chain_unregister(&netdev_chain, nb);
1294 rtnl_unlock();
1295 return err;
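/*
 * Illustrative sketch (editor's addition, not part of the original source):
 * a typical netdevice notifier. In this kernel the third argument passed to
 * the callback is the struct net_device itself. The names my_netdev_event
 * and my_nb are placeholders.
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *
 *		switch (event) {
 *		case NETDEV_UP:
 *		case NETDEV_DOWN:
 *			... react to dev changing state ...
 *			break;
 *		}
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_nb);
 *	...
 *	unregister_netdevice_notifier(&my_nb);
 */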
1299 * call_netdevice_notifiers - call all network notifier blocks
1300 * @val: value passed unmodified to notifier function
1301 * @dev: net_device pointer passed unmodified to notifier function
1303 * Call all network notifier blocks. Parameters and return value
1304 * are as for raw_notifier_call_chain().
1307 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1309 return raw_notifier_call_chain(&netdev_chain, val, dev);
1312 /* When > 0 there are consumers of rx skb time stamps */
1313 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1315 void net_enable_timestamp(void)
1317 atomic_inc(&netstamp_needed);
1320 void net_disable_timestamp(void)
1322 atomic_dec(&netstamp_needed);
1325 static inline void net_timestamp(struct sk_buff *skb)
1327 if (atomic_read(&netstamp_needed))
1328 __net_timestamp(skb);
1329 else
1330 skb->tstamp.tv64 = 0;
1334 * Support routine. Sends outgoing frames to any network
1335 * taps currently in use.
1338 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1340 struct packet_type *ptype;
1342 net_timestamp(skb);
1344 rcu_read_lock();
1345 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1346 /* Never send packets back to the socket
1347 * they originated from - MvS (miquels@drinkel.ow.org)
1349 if ((ptype->dev == dev || !ptype->dev) &&
1350 (ptype->af_packet_priv == NULL ||
1351 (struct sock *)ptype->af_packet_priv != skb->sk)) {
1352 struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
1353 if (!skb2)
1354 break;
1356 /* skb->nh should be correctly
1357 set by sender, so that the second statement is
1358 just protection against buggy protocols.
1360 skb_reset_mac_header(skb2);
1362 if (skb_network_header(skb2) < skb2->data ||
1363 skb2->network_header > skb2->tail) {
1364 if (net_ratelimit())
1365 printk(KERN_CRIT "protocol %04x is "
1366 "buggy, dev %s\n",
1367 skb2->protocol, dev->name);
1368 skb_reset_network_header(skb2);
1371 skb2->transport_header = skb2->network_header;
1372 skb2->pkt_type = PACKET_OUTGOING;
1373 ptype->func(skb2, skb->dev, ptype, skb->dev);
1376 rcu_read_unlock();
1380 static inline void __netif_reschedule(struct Qdisc *q)
1382 struct softnet_data *sd;
1383 unsigned long flags;
1385 local_irq_save(flags);
1386 sd = &__get_cpu_var(softnet_data);
1387 q->next_sched = sd->output_queue;
1388 sd->output_queue = q;
1389 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1390 local_irq_restore(flags);
1393 void __netif_schedule(struct Qdisc *q)
1395 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1396 __netif_reschedule(q);
1398 EXPORT_SYMBOL(__netif_schedule);
1400 void dev_kfree_skb_irq(struct sk_buff *skb)
1402 if (atomic_dec_and_test(&skb->users)) {
1403 struct softnet_data *sd;
1404 unsigned long flags;
1406 local_irq_save(flags);
1407 sd = &__get_cpu_var(softnet_data);
1408 skb->next = sd->completion_queue;
1409 sd->completion_queue = skb;
1410 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1411 local_irq_restore(flags);
1414 EXPORT_SYMBOL(dev_kfree_skb_irq);
1416 void dev_kfree_skb_any(struct sk_buff *skb)
1418 if (in_irq() || irqs_disabled())
1419 dev_kfree_skb_irq(skb);
1420 else
1421 dev_kfree_skb(skb);
1423 EXPORT_SYMBOL(dev_kfree_skb_any);
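/*
 * Illustrative sketch (editor's addition, not part of the original source):
 * a driver's TX completion path running in hard-IRQ context frees
 * transmitted skbs with dev_kfree_skb_irq(); code that may run in any
 * context uses dev_kfree_skb_any() instead.
 *
 *	dev_kfree_skb_irq(skb);		(known hard-IRQ context)
 *	dev_kfree_skb_any(skb);		(context not known)
 */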
1427 * netif_device_detach - mark device as removed
1428 * @dev: network device
1430 * Mark device as removed from system and therefore no longer available.
1432 void netif_device_detach(struct net_device *dev)
1434 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1435 netif_running(dev)) {
1436 netif_stop_queue(dev);
1439 EXPORT_SYMBOL(netif_device_detach);
1442 * netif_device_attach - mark device as attached
1443 * @dev: network device
1445 * Mark device as attached from system and restart if needed.
1447 void netif_device_attach(struct net_device *dev)
1449 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1450 netif_running(dev)) {
1451 netif_wake_queue(dev);
1452 __netdev_watchdog_up(dev);
1455 EXPORT_SYMBOL(netif_device_attach);
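/*
 * Illustrative sketch (editor's addition, not part of the original source):
 * the usual detach/attach pairing in a PCI driver's suspend and resume
 * handlers. my_suspend and my_resume are placeholder names.
 *
 *	static int my_suspend(struct pci_dev *pdev, pm_message_t state)
 *	{
 *		struct net_device *netdev = pci_get_drvdata(pdev);
 *
 *		netif_device_detach(netdev);
 *		... stop the hardware, save state ...
 *		return 0;
 *	}
 *
 *	static int my_resume(struct pci_dev *pdev)
 *	{
 *		struct net_device *netdev = pci_get_drvdata(pdev);
 *
 *		... restore the hardware ...
 *		netif_device_attach(netdev);
 *		return 0;
 *	}
 */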
1457 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1459 return ((features & NETIF_F_GEN_CSUM) ||
1460 ((features & NETIF_F_IP_CSUM) &&
1461 protocol == htons(ETH_P_IP)) ||
1462 ((features & NETIF_F_IPV6_CSUM) &&
1463 protocol == htons(ETH_P_IPV6)));
1466 static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1468 if (can_checksum_protocol(dev->features, skb->protocol))
1469 return true;
1471 if (skb->protocol == htons(ETH_P_8021Q)) {
1472 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1473 if (can_checksum_protocol(dev->features & dev->vlan_features,
1474 veh->h_vlan_encapsulated_proto))
1475 return true;
1478 return false;
1482 * Invalidate hardware checksum when packet is to be mangled, and
1483 * complete checksum manually on outgoing path.
1485 int skb_checksum_help(struct sk_buff *skb)
1487 __wsum csum;
1488 int ret = 0, offset;
1490 if (skb->ip_summed == CHECKSUM_COMPLETE)
1491 goto out_set_summed;
1493 if (unlikely(skb_shinfo(skb)->gso_size)) {
1494 /* Let GSO fix up the checksum. */
1495 goto out_set_summed;
1498 offset = skb->csum_start - skb_headroom(skb);
1499 BUG_ON(offset >= skb_headlen(skb));
1500 csum = skb_checksum(skb, offset, skb->len - offset, 0);
1502 offset += skb->csum_offset;
1503 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1505 if (skb_cloned(skb) &&
1506 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1507 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1508 if (ret)
1509 goto out;
1512 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1513 out_set_summed:
1514 skb->ip_summed = CHECKSUM_NONE;
1515 out:
1516 return ret;
1520 * skb_gso_segment - Perform segmentation on skb.
1521 * @skb: buffer to segment
1522 * @features: features for the output path (see dev->features)
1524 * This function segments the given skb and returns a list of segments.
1526 * It may return NULL if the skb requires no segmentation. This is
1527 * only possible when GSO is used for verifying header integrity.
1529 struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1531 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1532 struct packet_type *ptype;
1533 __be16 type = skb->protocol;
1534 int err;
1536 BUG_ON(skb_shinfo(skb)->frag_list);
1538 skb_reset_mac_header(skb);
1539 skb->mac_len = skb->network_header - skb->mac_header;
1540 __skb_pull(skb, skb->mac_len);
1542 if (WARN_ON(skb->ip_summed != CHECKSUM_PARTIAL)) {
1543 if (skb_header_cloned(skb) &&
1544 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1545 return ERR_PTR(err);
1548 rcu_read_lock();
1549 list_for_each_entry_rcu(ptype,
1550 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1551 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1552 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1553 err = ptype->gso_send_check(skb);
1554 segs = ERR_PTR(err);
1555 if (err || skb_gso_ok(skb, features))
1556 break;
1557 __skb_push(skb, (skb->data -
1558 skb_network_header(skb)));
1560 segs = ptype->gso_segment(skb, features);
1561 break;
1564 rcu_read_unlock();
1566 __skb_push(skb, skb->data - skb_mac_header(skb));
1568 return segs;
1571 EXPORT_SYMBOL(skb_gso_segment);
1573 /* Take action when hardware reception checksum errors are detected. */
1574 #ifdef CONFIG_BUG
1575 void netdev_rx_csum_fault(struct net_device *dev)
1577 if (net_ratelimit()) {
1578 printk(KERN_ERR "%s: hw csum failure.\n",
1579 dev ? dev->name : "<unknown>");
1580 dump_stack();
1583 EXPORT_SYMBOL(netdev_rx_csum_fault);
1584 #endif
1586 /* Actually, we should eliminate this check as soon as we know, that:
1587 * 1. IOMMU is present and allows to map all the memory.
1588 * 2. No high memory really exists on this machine.
1591 static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1593 #ifdef CONFIG_HIGHMEM
1594 int i;
1596 if (dev->features & NETIF_F_HIGHDMA)
1597 return 0;
1599 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1600 if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1601 return 1;
1603 #endif
1604 return 0;
1607 struct dev_gso_cb {
1608 void (*destructor)(struct sk_buff *skb);
1611 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1613 static void dev_gso_skb_destructor(struct sk_buff *skb)
1615 struct dev_gso_cb *cb;
1617 do {
1618 struct sk_buff *nskb = skb->next;
1620 skb->next = nskb->next;
1621 nskb->next = NULL;
1622 kfree_skb(nskb);
1623 } while (skb->next);
1625 cb = DEV_GSO_CB(skb);
1626 if (cb->destructor)
1627 cb->destructor(skb);
1631 * dev_gso_segment - Perform emulated hardware segmentation on skb.
1632 * @skb: buffer to segment
1634 * This function segments the given skb and stores the list of segments
1635 * in skb->next.
1637 static int dev_gso_segment(struct sk_buff *skb)
1639 struct net_device *dev = skb->dev;
1640 struct sk_buff *segs;
1641 int features = dev->features & ~(illegal_highdma(dev, skb) ?
1642 NETIF_F_SG : 0);
1644 segs = skb_gso_segment(skb, features);
1646 /* Verifying header integrity only. */
1647 if (!segs)
1648 return 0;
1650 if (IS_ERR(segs))
1651 return PTR_ERR(segs);
1653 skb->next = segs;
1654 DEV_GSO_CB(skb)->destructor = skb->destructor;
1655 skb->destructor = dev_gso_skb_destructor;
1657 return 0;
1660 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1661 struct netdev_queue *txq)
1663 const struct net_device_ops *ops = dev->netdev_ops;
1665 prefetch(&dev->netdev_ops->ndo_start_xmit);
1666 if (likely(!skb->next)) {
1667 if (!list_empty(&ptype_all))
1668 dev_queue_xmit_nit(skb, dev);
1670 if (netif_needs_gso(dev, skb)) {
1671 if (unlikely(dev_gso_segment(skb)))
1672 goto out_kfree_skb;
1673 if (skb->next)
1674 goto gso;
1677 return ops->ndo_start_xmit(skb, dev);
1680 gso:
1681 do {
1682 struct sk_buff *nskb = skb->next;
1683 int rc;
1685 skb->next = nskb->next;
1686 nskb->next = NULL;
1687 rc = ops->ndo_start_xmit(nskb, dev);
1688 if (unlikely(rc)) {
1689 nskb->next = skb->next;
1690 skb->next = nskb;
1691 return rc;
1693 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
1694 return NETDEV_TX_BUSY;
1695 } while (skb->next);
1697 skb->destructor = DEV_GSO_CB(skb)->destructor;
1699 out_kfree_skb:
1700 kfree_skb(skb);
1701 return 0;
1704 static u32 simple_tx_hashrnd;
1705 static int simple_tx_hashrnd_initialized = 0;
1707 static u16 simple_tx_hash(struct net_device *dev, struct sk_buff *skb)
1709 u32 addr1, addr2, ports;
1710 u32 hash, ihl;
1711 u8 ip_proto = 0;
1713 if (unlikely(!simple_tx_hashrnd_initialized)) {
1714 get_random_bytes(&simple_tx_hashrnd, 4);
1715 simple_tx_hashrnd_initialized = 1;
1718 switch (skb->protocol) {
1719 case htons(ETH_P_IP):
1720 if (!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)))
1721 ip_proto = ip_hdr(skb)->protocol;
1722 addr1 = ip_hdr(skb)->saddr;
1723 addr2 = ip_hdr(skb)->daddr;
1724 ihl = ip_hdr(skb)->ihl;
1725 break;
1726 case htons(ETH_P_IPV6):
1727 ip_proto = ipv6_hdr(skb)->nexthdr;
1728 addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3];
1729 addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3];
1730 ihl = (40 >> 2);
1731 break;
1732 default:
1733 return 0;
1737 switch (ip_proto) {
1738 case IPPROTO_TCP:
1739 case IPPROTO_UDP:
1740 case IPPROTO_DCCP:
1741 case IPPROTO_ESP:
1742 case IPPROTO_AH:
1743 case IPPROTO_SCTP:
1744 case IPPROTO_UDPLITE:
1745 ports = *((u32 *) (skb_network_header(skb) + (ihl * 4)));
1746 break;
1748 default:
1749 ports = 0;
1750 break;
1753 hash = jhash_3words(addr1, addr2, ports, simple_tx_hashrnd);
1755 return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
1758 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
1759 struct sk_buff *skb)
1761 const struct net_device_ops *ops = dev->netdev_ops;
1762 u16 queue_index = 0;
1764 if (ops->ndo_select_queue)
1765 queue_index = ops->ndo_select_queue(dev, skb);
1766 else if (dev->real_num_tx_queues > 1)
1767 queue_index = simple_tx_hash(dev, skb);
1769 skb_set_queue_mapping(skb, queue_index);
1770 return netdev_get_tx_queue(dev, queue_index);
1774 * dev_queue_xmit - transmit a buffer
1775 * @skb: buffer to transmit
1777 * Queue a buffer for transmission to a network device. The caller must
1778 * have set the device and priority and built the buffer before calling
1779 * this function. The function can be called from an interrupt.
1781 * A negative errno code is returned on a failure. A success does not
1782 * guarantee the frame will be transmitted as it may be dropped due
1783 * to congestion or traffic shaping.
1785 * -----------------------------------------------------------------------------------
1786 * I notice this method can also return errors from the queue disciplines,
1787 * including NET_XMIT_DROP, which is a positive value. So, errors can also
1788 * be positive.
1790 * Regardless of the return value, the skb is consumed, so it is currently
1791 * difficult to retry a send to this method. (You can bump the ref count
1792 * before sending to hold a reference for retry if you are careful.)
1794 * When calling this method, interrupts MUST be enabled. This is because
1795 * the BH enable code must have IRQs enabled so that it will not deadlock.
1796 * --BLG
1798 int dev_queue_xmit(struct sk_buff *skb)
1800 struct net_device *dev = skb->dev;
1801 struct netdev_queue *txq;
1802 struct Qdisc *q;
1803 int rc = -ENOMEM;
1805 /* GSO will handle the following emulations directly. */
1806 if (netif_needs_gso(dev, skb))
1807 goto gso;
1809 if (skb_shinfo(skb)->frag_list &&
1810 !(dev->features & NETIF_F_FRAGLIST) &&
1811 __skb_linearize(skb))
1812 goto out_kfree_skb;
1814 /* Fragmented skb is linearized if device does not support SG,
1815 * or if at least one of fragments is in highmem and device
1816 * does not support DMA from it.
1818 if (skb_shinfo(skb)->nr_frags &&
1819 (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
1820 __skb_linearize(skb))
1821 goto out_kfree_skb;
1823 /* If packet is not checksummed and device does not support
1824 * checksumming for this protocol, complete checksumming here.
1826 if (skb->ip_summed == CHECKSUM_PARTIAL) {
1827 skb_set_transport_header(skb, skb->csum_start -
1828 skb_headroom(skb));
1829 if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
1830 goto out_kfree_skb;
1833 gso:
1834 /* Disable soft irqs for various locks below. Also
1835 * stops preemption for RCU.
1837 rcu_read_lock_bh();
1839 txq = dev_pick_tx(dev, skb);
1840 q = rcu_dereference(txq->qdisc);
1842 #ifdef CONFIG_NET_CLS_ACT
1843 skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
1844 #endif
1845 if (q->enqueue) {
1846 spinlock_t *root_lock = qdisc_lock(q);
1848 spin_lock(root_lock);
1850 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
1851 kfree_skb(skb);
1852 rc = NET_XMIT_DROP;
1853 } else {
1854 rc = qdisc_enqueue_root(skb, q);
1855 qdisc_run(q);
1857 spin_unlock(root_lock);
1859 goto out;
1862 /* The device has no queue. Common case for software devices:
1863 loopback, all the sorts of tunnels...
1865 Really, it is unlikely that netif_tx_lock protection is necessary
1866 here. (e.g. loopback and IP tunnels are clean, ignoring statistics
1867 counters.)
1868 However, it is possible that they rely on the protection
1869 made by us here.
1871 Check this and take the lock. It is not prone to deadlocks.
1872 The same goes for the noqueue qdisc; it is even simpler 8)
1873 */
1874 if (dev->flags & IFF_UP) {
1875 int cpu = smp_processor_id(); /* ok because BHs are off */
1877 if (txq->xmit_lock_owner != cpu) {
1879 HARD_TX_LOCK(dev, txq, cpu);
1881 if (!netif_tx_queue_stopped(txq)) {
1882 rc = 0;
1883 if (!dev_hard_start_xmit(skb, dev, txq)) {
1884 HARD_TX_UNLOCK(dev, txq);
1885 goto out;
1888 HARD_TX_UNLOCK(dev, txq);
1889 if (net_ratelimit())
1890 printk(KERN_CRIT "Virtual device %s asks to "
1891 "queue packet!\n", dev->name);
1892 } else {
1893 /* Recursion is detected! It is possible,
1894 * unfortunately */
1895 if (net_ratelimit())
1896 printk(KERN_CRIT "Dead loop on virtual device "
1897 "%s, fix it urgently!\n", dev->name);
1901 rc = -ENETDOWN;
1902 rcu_read_unlock_bh();
1904 out_kfree_skb:
1905 kfree_skb(skb);
1906 return rc;
1907 out:
1908 rcu_read_unlock_bh();
1909 return rc;
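/*
 * Illustrative sketch (editor's addition, not part of the original source):
 * a protocol or tunnel that has built a complete frame hands it to the
 * device layer like this; the skb is consumed whatever the outcome, and the
 * protocol value shown is only an example.
 *
 *	skb->dev = dev;
 *	skb->protocol = htons(ETH_P_IP);
 *	... link-layer header already built ...
 *	rc = dev_queue_xmit(skb);
 *	if (rc != NET_XMIT_SUCCESS)
 *		... account for the drop; do not touch skb again ...
 */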
1913 /*=======================================================================
1914 Receiver routines
1915 =======================================================================*/
1917 int netdev_max_backlog __read_mostly = 1000;
1918 int netdev_budget __read_mostly = 300;
1919 int weight_p __read_mostly = 64; /* old backlog weight */
1921 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1925 * netif_rx - post buffer to the network code
1926 * @skb: buffer to post
1928 * This function receives a packet from a device driver and queues it for
1929 * the upper (protocol) levels to process. It always succeeds. The buffer
1930 * may be dropped during processing for congestion control or by the
1931 * protocol layers.
1933 * return values:
1934 * NET_RX_SUCCESS (no congestion)
1935 * NET_RX_DROP (packet was dropped)
1939 int netif_rx(struct sk_buff *skb)
1941 struct softnet_data *queue;
1942 unsigned long flags;
1944 /* if netpoll wants it, pretend we never saw it */
1945 if (netpoll_rx(skb))
1946 return NET_RX_DROP;
1948 if (!skb->tstamp.tv64)
1949 net_timestamp(skb);
1951 /*
1952 * The code is rearranged so that the path is shortest
1953 * when the CPU is congested but still operating.
1954 */
1955 local_irq_save(flags);
1956 queue = &__get_cpu_var(softnet_data);
1958 __get_cpu_var(netdev_rx_stat).total++;
1959 if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
1960 if (queue->input_pkt_queue.qlen) {
1961 enqueue:
1962 __skb_queue_tail(&queue->input_pkt_queue, skb);
1963 local_irq_restore(flags);
1964 return NET_RX_SUCCESS;
1967 napi_schedule(&queue->backlog);
1968 goto enqueue;
1971 __get_cpu_var(netdev_rx_stat).dropped++;
1972 local_irq_restore(flags);
1974 kfree_skb(skb);
1975 return NET_RX_DROP;
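/*
 * Illustrative sketch (editor's addition, not part of the original source):
 * the classic non-NAPI receive path of an Ethernet driver, copying a
 * received frame into a fresh skb and handing it to netif_rx(). Here "len"
 * and "data" stand for the length and payload supplied by the hardware.
 *
 *	skb = dev_alloc_skb(len + NET_IP_ALIGN);
 *	if (!skb) {
 *		dev->stats.rx_dropped++;
 *		return;
 *	}
 *	skb_reserve(skb, NET_IP_ALIGN);
 *	memcpy(skb_put(skb, len), data, len);
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);
 */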
1978 int netif_rx_ni(struct sk_buff *skb)
1980 int err;
1982 preempt_disable();
1983 err = netif_rx(skb);
1984 if (local_softirq_pending())
1985 do_softirq();
1986 preempt_enable();
1988 return err;
1991 EXPORT_SYMBOL(netif_rx_ni);
1993 static void net_tx_action(struct softirq_action *h)
1995 struct softnet_data *sd = &__get_cpu_var(softnet_data);
1997 if (sd->completion_queue) {
1998 struct sk_buff *clist;
2000 local_irq_disable();
2001 clist = sd->completion_queue;
2002 sd->completion_queue = NULL;
2003 local_irq_enable();
2005 while (clist) {
2006 struct sk_buff *skb = clist;
2007 clist = clist->next;
2009 WARN_ON(atomic_read(&skb->users));
2010 __kfree_skb(skb);
2014 if (sd->output_queue) {
2015 struct Qdisc *head;
2017 local_irq_disable();
2018 head = sd->output_queue;
2019 sd->output_queue = NULL;
2020 local_irq_enable();
2022 while (head) {
2023 struct Qdisc *q = head;
2024 spinlock_t *root_lock;
2026 head = head->next_sched;
2028 root_lock = qdisc_lock(q);
2029 if (spin_trylock(root_lock)) {
2030 smp_mb__before_clear_bit();
2031 clear_bit(__QDISC_STATE_SCHED,
2032 &q->state);
2033 qdisc_run(q);
2034 spin_unlock(root_lock);
2035 } else {
2036 if (!test_bit(__QDISC_STATE_DEACTIVATED,
2037 &q->state)) {
2038 __netif_reschedule(q);
2039 } else {
2040 smp_mb__before_clear_bit();
2041 clear_bit(__QDISC_STATE_SCHED,
2042 &q->state);
2049 static inline int deliver_skb(struct sk_buff *skb,
2050 struct packet_type *pt_prev,
2051 struct net_device *orig_dev)
2053 atomic_inc(&skb->users);
2054 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2057 #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
2058 /* These hooks defined here for ATM */
2059 struct net_bridge;
2060 struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
2061 unsigned char *addr);
2062 void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent) __read_mostly;
2065 * If bridge module is loaded call bridging hook.
2066 * returns NULL if packet was consumed.
2068 struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
2069 struct sk_buff *skb) __read_mostly;
2070 static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
2071 struct packet_type **pt_prev, int *ret,
2072 struct net_device *orig_dev)
2074 struct net_bridge_port *port;
2076 if (skb->pkt_type == PACKET_LOOPBACK ||
2077 (port = rcu_dereference(skb->dev->br_port)) == NULL)
2078 return skb;
2080 if (*pt_prev) {
2081 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2082 *pt_prev = NULL;
2085 return br_handle_frame_hook(port, skb);
2087 #else
2088 #define handle_bridge(skb, pt_prev, ret, orig_dev) (skb)
2089 #endif
2091 #if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
2092 struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
2093 EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);
2095 static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
2096 struct packet_type **pt_prev,
2097 int *ret,
2098 struct net_device *orig_dev)
2100 if (skb->dev->macvlan_port == NULL)
2101 return skb;
2103 if (*pt_prev) {
2104 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2105 *pt_prev = NULL;
2107 return macvlan_handle_frame_hook(skb);
2109 #else
2110 #define handle_macvlan(skb, pt_prev, ret, orig_dev) (skb)
2111 #endif
2113 #ifdef CONFIG_NET_CLS_ACT
2114 /* TODO: Maybe we should just force sch_ingress to be compiled in
2115 * when CONFIG_NET_CLS_ACT is? Otherwise we pay for some useless
2116 * instructions (a compare and 2 extra stores) right now if we don't
2117 * have it on but have CONFIG_NET_CLS_ACT.
2118 * NOTE: This doesn't stop any functionality; if you don't have
2119 * the ingress scheduler, you just can't add policies on ingress.
2120 */
2122 static int ing_filter(struct sk_buff *skb)
2124 struct net_device *dev = skb->dev;
2125 u32 ttl = G_TC_RTTL(skb->tc_verd);
2126 struct netdev_queue *rxq;
2127 int result = TC_ACT_OK;
2128 struct Qdisc *q;
2130 if (MAX_RED_LOOP < ttl++) {
2131 printk(KERN_WARNING
2132 "Redir loop detected Dropping packet (%d->%d)\n",
2133 skb->iif, dev->ifindex);
2134 return TC_ACT_SHOT;
2137 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2138 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2140 rxq = &dev->rx_queue;
2142 q = rxq->qdisc;
2143 if (q != &noop_qdisc) {
2144 spin_lock(qdisc_lock(q));
2145 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2146 result = qdisc_enqueue_root(skb, q);
2147 spin_unlock(qdisc_lock(q));
2150 return result;
2153 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2154 struct packet_type **pt_prev,
2155 int *ret, struct net_device *orig_dev)
2157 if (skb->dev->rx_queue.qdisc == &noop_qdisc)
2158 goto out;
2160 if (*pt_prev) {
2161 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2162 *pt_prev = NULL;
2163 } else {
2164 /* Huh? Why does turning on AF_PACKET affect this? */
2165 skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2168 switch (ing_filter(skb)) {
2169 case TC_ACT_SHOT:
2170 case TC_ACT_STOLEN:
2171 kfree_skb(skb);
2172 return NULL;
2175 out:
2176 skb->tc_verd = 0;
2177 return skb;
2179 #endif
2182 * netif_nit_deliver - deliver received packets to network taps
2183 * @skb: buffer
2185 * This function is used to deliver incoming packets to network
2186 * taps. It should be used when the normal netif_receive_skb path
2187 * is bypassed, for example because of VLAN acceleration.
2189 void netif_nit_deliver(struct sk_buff *skb)
2191 struct packet_type *ptype;
2193 if (list_empty(&ptype_all))
2194 return;
2196 skb_reset_network_header(skb);
2197 skb_reset_transport_header(skb);
2198 skb->mac_len = skb->network_header - skb->mac_header;
2200 rcu_read_lock();
2201 list_for_each_entry_rcu(ptype, &ptype_all, list) {
2202 if (!ptype->dev || ptype->dev == skb->dev)
2203 deliver_skb(skb, ptype, skb->dev);
2205 rcu_read_unlock();
2209 * netif_receive_skb - process receive buffer from network
2210 * @skb: buffer to process
2212 * netif_receive_skb() is the main receive data processing function.
2213 * It always succeeds. The buffer may be dropped during processing
2214 * for congestion control or by the protocol layers.
2216 * This function may only be called from softirq context and interrupts
2217 * should be enabled.
2219 * Return values (usually ignored):
2220 * NET_RX_SUCCESS: no congestion
2221 * NET_RX_DROP: packet was dropped
2223 int netif_receive_skb(struct sk_buff *skb)
2225 struct packet_type *ptype, *pt_prev;
2226 struct net_device *orig_dev;
2227 struct net_device *null_or_orig;
2228 int ret = NET_RX_DROP;
2229 __be16 type;
2231 if (skb->vlan_tci && vlan_hwaccel_do_receive(skb))
2232 return NET_RX_SUCCESS;
2234 /* if we've gotten here through NAPI, check netpoll */
2235 if (netpoll_receive_skb(skb))
2236 return NET_RX_DROP;
2238 if (!skb->tstamp.tv64)
2239 net_timestamp(skb);
2241 if (!skb->iif)
2242 skb->iif = skb->dev->ifindex;
2244 null_or_orig = NULL;
2245 orig_dev = skb->dev;
2246 if (orig_dev->master) {
2247 if (skb_bond_should_drop(skb))
2248 null_or_orig = orig_dev; /* deliver only exact match */
2249 else
2250 skb->dev = orig_dev->master;
2253 __get_cpu_var(netdev_rx_stat).total++;
2255 skb_reset_network_header(skb);
2256 skb_reset_transport_header(skb);
2257 skb->mac_len = skb->network_header - skb->mac_header;
2259 pt_prev = NULL;
2261 rcu_read_lock();
2263 /* Don't receive packets in an exiting network namespace */
2264 if (!net_alive(dev_net(skb->dev))) {
2265 kfree_skb(skb);
2266 goto out;
2269 #ifdef CONFIG_NET_CLS_ACT
2270 if (skb->tc_verd & TC_NCLS) {
2271 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2272 goto ncls;
2274 #endif
2276 list_for_each_entry_rcu(ptype, &ptype_all, list) {
2277 if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2278 ptype->dev == orig_dev) {
2279 if (pt_prev)
2280 ret = deliver_skb(skb, pt_prev, orig_dev);
2281 pt_prev = ptype;
2285 #ifdef CONFIG_NET_CLS_ACT
2286 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2287 if (!skb)
2288 goto out;
2289 ncls:
2290 #endif
2292 skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
2293 if (!skb)
2294 goto out;
2295 skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
2296 if (!skb)
2297 goto out;
2299 type = skb->protocol;
2300 list_for_each_entry_rcu(ptype,
2301 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2302 if (ptype->type == type &&
2303 (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2304 ptype->dev == orig_dev)) {
2305 if (pt_prev)
2306 ret = deliver_skb(skb, pt_prev, orig_dev);
2307 pt_prev = ptype;
2311 if (pt_prev) {
2312 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2313 } else {
2314 kfree_skb(skb);
2315 /* Jamal, now you will not be able to escape explaining
2316 * to me how you were going to use this. :-)
2318 ret = NET_RX_DROP;
2321 out:
2322 rcu_read_unlock();
2323 return ret;
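/*
 * Editorial illustration (not part of dev.c): a minimal NAPI ->poll()
 * sketch that feeds received frames to netif_receive_skb() from softirq
 * context, as the kerneldoc above requires.  struct example_priv and
 * example_fetch_rx_frame() are hypothetical driver details.
 */
#include <linux/netdevice.h>
#include <linux/etherdevice.h>

struct example_priv {
	struct napi_struct napi;
	struct net_device *dev;
};

static struct sk_buff *example_fetch_rx_frame(struct example_priv *priv);

static int example_poll(struct napi_struct *napi, int budget)
{
	struct example_priv *priv = container_of(napi, struct example_priv, napi);
	int work = 0;

	while (work < budget) {
		struct sk_buff *skb = example_fetch_rx_frame(priv);

		if (!skb)
			break;
		skb->protocol = eth_type_trans(skb, priv->dev);
		netif_receive_skb(skb);	/* return value usually ignored */
		work++;
	}
	if (work < budget)
		napi_complete(napi);	/* re-enable device interrupts here */
	return work;
}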
2326 /* Network device is going away, flush any packets still pending */
2327 static void flush_backlog(void *arg)
2329 struct net_device *dev = arg;
2330 struct softnet_data *queue = &__get_cpu_var(softnet_data);
2331 struct sk_buff *skb, *tmp;
2333 skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp)
2334 if (skb->dev == dev) {
2335 __skb_unlink(skb, &queue->input_pkt_queue);
2336 kfree_skb(skb);
2340 static int process_backlog(struct napi_struct *napi, int quota)
2342 int work = 0;
2343 struct softnet_data *queue = &__get_cpu_var(softnet_data);
2344 unsigned long start_time = jiffies;
2346 napi->weight = weight_p;
2347 do {
2348 struct sk_buff *skb;
2350 local_irq_disable();
2351 skb = __skb_dequeue(&queue->input_pkt_queue);
2352 if (!skb) {
2353 __napi_complete(napi);
2354 local_irq_enable();
2355 break;
2357 local_irq_enable();
2359 netif_receive_skb(skb);
2360 } while (++work < quota && jiffies == start_time);
2362 return work;
2366 * __napi_schedule - schedule for receive
2367 * @n: entry to schedule
2369 * The entry's receive function will be scheduled to run
2371 void __napi_schedule(struct napi_struct *n)
2373 unsigned long flags;
2375 local_irq_save(flags);
2376 list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
2377 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2378 local_irq_restore(flags);
2380 EXPORT_SYMBOL(__napi_schedule);
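/*
 * Editorial illustration (not part of dev.c): a hypothetical interrupt
 * handler that masks further RX interrupts and schedules its NAPI
 * context with __napi_schedule().  The example_* names and the private
 * structure are assumptions for the sketch.
 */
#include <linux/interrupt.h>
#include <linux/netdevice.h>

struct example_priv {
	struct napi_struct napi;
};

static void example_disable_rx_irq(struct example_priv *priv);	/* hypothetical hw op */

static irqreturn_t example_rx_interrupt(int irq, void *dev_id)
{
	struct example_priv *priv = dev_id;

	example_disable_rx_irq(priv);
	if (napi_schedule_prep(&priv->napi))
		__napi_schedule(&priv->napi);	/* ->poll() will run from net_rx_action() */
	return IRQ_HANDLED;
}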
2383 static void net_rx_action(struct softirq_action *h)
2385 struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
2386 unsigned long time_limit = jiffies + 2;
2387 int budget = netdev_budget;
2388 void *have;
2390 local_irq_disable();
2392 while (!list_empty(list)) {
2393 struct napi_struct *n;
2394 int work, weight;
2396 /* If the softirq window is exhausted then punt.
2397 * Allow this to run for 2 jiffies, which allows
2398 * an average latency of 1.5/HZ.
2400 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
2401 goto softnet_break;
2403 local_irq_enable();
2405 /* Even though interrupts have been re-enabled, this
2406 * access is safe because interrupts can only add new
2407 * entries to the tail of this list, and only ->poll()
2408 * calls can remove this head entry from the list.
2410 n = list_entry(list->next, struct napi_struct, poll_list);
2412 have = netpoll_poll_lock(n);
2414 weight = n->weight;
2416 /* This NAPI_STATE_SCHED test is for avoiding a race
2417 * with netpoll's poll_napi(). Only the entity which
2418 * obtains the lock and sees NAPI_STATE_SCHED set will
2419 * actually make the ->poll() call. Therefore we avoid
2420 * accidentally calling ->poll() when NAPI is not scheduled.
2422 work = 0;
2423 if (test_bit(NAPI_STATE_SCHED, &n->state))
2424 work = n->poll(n, weight);
2426 WARN_ON_ONCE(work > weight);
2428 budget -= work;
2430 local_irq_disable();
2432 /* Drivers must not modify the NAPI state if they
2433 * consume the entire weight. In such cases this code
2434 * still "owns" the NAPI instance and therefore can
2435 * move the instance around on the list at-will.
2437 if (unlikely(work == weight)) {
2438 if (unlikely(napi_disable_pending(n)))
2439 __napi_complete(n);
2440 else
2441 list_move_tail(&n->poll_list, list);
2444 netpoll_poll_unlock(have);
2446 out:
2447 local_irq_enable();
2449 #ifdef CONFIG_NET_DMA
2451 * There may not be any more sk_buffs coming right now, so push
2452 * any pending DMA copies to hardware
2454 if (!cpus_empty(net_dma.channel_mask)) {
2455 int chan_idx;
2456 for_each_cpu_mask_nr(chan_idx, net_dma.channel_mask) {
2457 struct dma_chan *chan = net_dma.channels[chan_idx];
2458 if (chan)
2459 dma_async_memcpy_issue_pending(chan);
2462 #endif
2464 return;
2466 softnet_break:
2467 __get_cpu_var(netdev_rx_stat).time_squeeze++;
2468 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2469 goto out;
2472 static gifconf_func_t * gifconf_list [NPROTO];
2475 * register_gifconf - register a SIOCGIF handler
2476 * @family: Address family
2477 * @gifconf: Function handler
2479 * Register protocol dependent address dumping routines. The handler
2480 * that is passed must not be freed or reused until it has been replaced
2481 * by another handler.
2483 int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
2485 if (family >= NPROTO)
2486 return -EINVAL;
2487 gifconf_list[family] = gifconf;
2488 return 0;
2493 * Map an interface index to its name (SIOCGIFNAME)
2497 * We need this ioctl for efficient implementation of the
2498 * if_indextoname() function required by the IPv6 API. Without
2499 * it, we would have to search all the interfaces to find a
2500 * match. --pb
2503 static int dev_ifname(struct net *net, struct ifreq __user *arg)
2505 struct net_device *dev;
2506 struct ifreq ifr;
2509 * Fetch the caller's info block.
2512 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2513 return -EFAULT;
2515 read_lock(&dev_base_lock);
2516 dev = __dev_get_by_index(net, ifr.ifr_ifindex);
2517 if (!dev) {
2518 read_unlock(&dev_base_lock);
2519 return -ENODEV;
2522 strcpy(ifr.ifr_name, dev->name);
2523 read_unlock(&dev_base_lock);
2525 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
2526 return -EFAULT;
2527 return 0;
2531 * Perform a SIOCGIFCONF call. This structure will change
2532 * size eventually, and there is nothing I can do about it.
2533 * Thus we will need a 'compatibility mode'.
2536 static int dev_ifconf(struct net *net, char __user *arg)
2538 struct ifconf ifc;
2539 struct net_device *dev;
2540 char __user *pos;
2541 int len;
2542 int total;
2543 int i;
2546 * Fetch the caller's info block.
2549 if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
2550 return -EFAULT;
2552 pos = ifc.ifc_buf;
2553 len = ifc.ifc_len;
2556 * Loop over the interfaces, and write an info block for each.
2559 total = 0;
2560 for_each_netdev(net, dev) {
2561 for (i = 0; i < NPROTO; i++) {
2562 if (gifconf_list[i]) {
2563 int done;
2564 if (!pos)
2565 done = gifconf_list[i](dev, NULL, 0);
2566 else
2567 done = gifconf_list[i](dev, pos + total,
2568 len - total);
2569 if (done < 0)
2570 return -EFAULT;
2571 total += done;
2577 * All done. Write the updated control block back to the caller.
2579 ifc.ifc_len = total;
2582 * Both BSD and Solaris return 0 here, so we do too.
2584 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
2587 #ifdef CONFIG_PROC_FS
2589 * This is invoked by the /proc filesystem handler to display a device
2590 * in detail.
2592 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
2593 __acquires(dev_base_lock)
2595 struct net *net = seq_file_net(seq);
2596 loff_t off;
2597 struct net_device *dev;
2599 read_lock(&dev_base_lock);
2600 if (!*pos)
2601 return SEQ_START_TOKEN;
2603 off = 1;
2604 for_each_netdev(net, dev)
2605 if (off++ == *pos)
2606 return dev;
2608 return NULL;
2611 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2613 struct net *net = seq_file_net(seq);
2614 ++*pos;
2615 return v == SEQ_START_TOKEN ?
2616 first_net_device(net) : next_net_device((struct net_device *)v);
2619 void dev_seq_stop(struct seq_file *seq, void *v)
2620 __releases(dev_base_lock)
2622 read_unlock(&dev_base_lock);
2625 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
2627 const struct net_device_stats *stats = dev_get_stats(dev);
2629 seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
2630 "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
2631 dev->name, stats->rx_bytes, stats->rx_packets,
2632 stats->rx_errors,
2633 stats->rx_dropped + stats->rx_missed_errors,
2634 stats->rx_fifo_errors,
2635 stats->rx_length_errors + stats->rx_over_errors +
2636 stats->rx_crc_errors + stats->rx_frame_errors,
2637 stats->rx_compressed, stats->multicast,
2638 stats->tx_bytes, stats->tx_packets,
2639 stats->tx_errors, stats->tx_dropped,
2640 stats->tx_fifo_errors, stats->collisions,
2641 stats->tx_carrier_errors +
2642 stats->tx_aborted_errors +
2643 stats->tx_window_errors +
2644 stats->tx_heartbeat_errors,
2645 stats->tx_compressed);
2649 * Called from the PROCfs module. This now uses the new arbitrary sized
2650 * /proc/net interface to create /proc/net/dev
2652 static int dev_seq_show(struct seq_file *seq, void *v)
2654 if (v == SEQ_START_TOKEN)
2655 seq_puts(seq, "Inter-| Receive "
2656 " | Transmit\n"
2657 " face |bytes packets errs drop fifo frame "
2658 "compressed multicast|bytes packets errs "
2659 "drop fifo colls carrier compressed\n");
2660 else
2661 dev_seq_printf_stats(seq, v);
2662 return 0;
2665 static struct netif_rx_stats *softnet_get_online(loff_t *pos)
2667 struct netif_rx_stats *rc = NULL;
2669 while (*pos < nr_cpu_ids)
2670 if (cpu_online(*pos)) {
2671 rc = &per_cpu(netdev_rx_stat, *pos);
2672 break;
2673 } else
2674 ++*pos;
2675 return rc;
2678 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
2680 return softnet_get_online(pos);
2683 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2685 ++*pos;
2686 return softnet_get_online(pos);
2689 static void softnet_seq_stop(struct seq_file *seq, void *v)
2693 static int softnet_seq_show(struct seq_file *seq, void *v)
2695 struct netif_rx_stats *s = v;
2697 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
2698 s->total, s->dropped, s->time_squeeze, 0,
2699 0, 0, 0, 0, /* was fastroute */
2700 s->cpu_collision );
2701 return 0;
2704 static const struct seq_operations dev_seq_ops = {
2705 .start = dev_seq_start,
2706 .next = dev_seq_next,
2707 .stop = dev_seq_stop,
2708 .show = dev_seq_show,
2711 static int dev_seq_open(struct inode *inode, struct file *file)
2713 return seq_open_net(inode, file, &dev_seq_ops,
2714 sizeof(struct seq_net_private));
2717 static const struct file_operations dev_seq_fops = {
2718 .owner = THIS_MODULE,
2719 .open = dev_seq_open,
2720 .read = seq_read,
2721 .llseek = seq_lseek,
2722 .release = seq_release_net,
2725 static const struct seq_operations softnet_seq_ops = {
2726 .start = softnet_seq_start,
2727 .next = softnet_seq_next,
2728 .stop = softnet_seq_stop,
2729 .show = softnet_seq_show,
2732 static int softnet_seq_open(struct inode *inode, struct file *file)
2734 return seq_open(file, &softnet_seq_ops);
2737 static const struct file_operations softnet_seq_fops = {
2738 .owner = THIS_MODULE,
2739 .open = softnet_seq_open,
2740 .read = seq_read,
2741 .llseek = seq_lseek,
2742 .release = seq_release,
2745 static void *ptype_get_idx(loff_t pos)
2747 struct packet_type *pt = NULL;
2748 loff_t i = 0;
2749 int t;
2751 list_for_each_entry_rcu(pt, &ptype_all, list) {
2752 if (i == pos)
2753 return pt;
2754 ++i;
2757 for (t = 0; t < PTYPE_HASH_SIZE; t++) {
2758 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
2759 if (i == pos)
2760 return pt;
2761 ++i;
2764 return NULL;
2767 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
2768 __acquires(RCU)
2770 rcu_read_lock();
2771 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
2774 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2776 struct packet_type *pt;
2777 struct list_head *nxt;
2778 int hash;
2780 ++*pos;
2781 if (v == SEQ_START_TOKEN)
2782 return ptype_get_idx(0);
2784 pt = v;
2785 nxt = pt->list.next;
2786 if (pt->type == htons(ETH_P_ALL)) {
2787 if (nxt != &ptype_all)
2788 goto found;
2789 hash = 0;
2790 nxt = ptype_base[0].next;
2791 } else
2792 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
2794 while (nxt == &ptype_base[hash]) {
2795 if (++hash >= PTYPE_HASH_SIZE)
2796 return NULL;
2797 nxt = ptype_base[hash].next;
2799 found:
2800 return list_entry(nxt, struct packet_type, list);
2803 static void ptype_seq_stop(struct seq_file *seq, void *v)
2804 __releases(RCU)
2806 rcu_read_unlock();
2809 static int ptype_seq_show(struct seq_file *seq, void *v)
2811 struct packet_type *pt = v;
2813 if (v == SEQ_START_TOKEN)
2814 seq_puts(seq, "Type Device Function\n");
2815 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
2816 if (pt->type == htons(ETH_P_ALL))
2817 seq_puts(seq, "ALL ");
2818 else
2819 seq_printf(seq, "%04x", ntohs(pt->type));
2821 seq_printf(seq, " %-8s %pF\n",
2822 pt->dev ? pt->dev->name : "", pt->func);
2825 return 0;
2828 static const struct seq_operations ptype_seq_ops = {
2829 .start = ptype_seq_start,
2830 .next = ptype_seq_next,
2831 .stop = ptype_seq_stop,
2832 .show = ptype_seq_show,
2835 static int ptype_seq_open(struct inode *inode, struct file *file)
2837 return seq_open_net(inode, file, &ptype_seq_ops,
2838 sizeof(struct seq_net_private));
2841 static const struct file_operations ptype_seq_fops = {
2842 .owner = THIS_MODULE,
2843 .open = ptype_seq_open,
2844 .read = seq_read,
2845 .llseek = seq_lseek,
2846 .release = seq_release_net,
2850 static int __net_init dev_proc_net_init(struct net *net)
2852 int rc = -ENOMEM;
2854 if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
2855 goto out;
2856 if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
2857 goto out_dev;
2858 if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
2859 goto out_softnet;
2861 if (wext_proc_init(net))
2862 goto out_ptype;
2863 rc = 0;
2864 out:
2865 return rc;
2866 out_ptype:
2867 proc_net_remove(net, "ptype");
2868 out_softnet:
2869 proc_net_remove(net, "softnet_stat");
2870 out_dev:
2871 proc_net_remove(net, "dev");
2872 goto out;
2875 static void __net_exit dev_proc_net_exit(struct net *net)
2877 wext_proc_exit(net);
2879 proc_net_remove(net, "ptype");
2880 proc_net_remove(net, "softnet_stat");
2881 proc_net_remove(net, "dev");
2884 static struct pernet_operations __net_initdata dev_proc_ops = {
2885 .init = dev_proc_net_init,
2886 .exit = dev_proc_net_exit,
2889 static int __init dev_proc_init(void)
2891 return register_pernet_subsys(&dev_proc_ops);
2893 #else
2894 #define dev_proc_init() 0
2895 #endif /* CONFIG_PROC_FS */
2899 * netdev_set_master - set up master/slave pair
2900 * @slave: slave device
2901 * @master: new master device
2903 * Changes the master device of the slave. Pass %NULL to break the
2904 * bonding. The caller must hold the RTNL semaphore. On a failure
2905 * a negative errno code is returned. On success the reference counts
2906 * are adjusted, %RTM_NEWLINK is sent to the routing socket and the
2907 * function returns zero.
2909 int netdev_set_master(struct net_device *slave, struct net_device *master)
2911 struct net_device *old = slave->master;
2913 ASSERT_RTNL();
2915 if (master) {
2916 if (old)
2917 return -EBUSY;
2918 dev_hold(master);
2921 slave->master = master;
2923 synchronize_net();
2925 if (old)
2926 dev_put(old);
2928 if (master)
2929 slave->flags |= IFF_SLAVE;
2930 else
2931 slave->flags &= ~IFF_SLAVE;
2933 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
2934 return 0;
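/*
 * Editorial illustration (not part of dev.c): a sketch of how a
 * bonding-style driver might pair two devices with netdev_set_master().
 * The RTNL lock must be held, as the kerneldoc above states.
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static int example_enslave(struct net_device *master, struct net_device *slave)
{
	int err;

	rtnl_lock();
	err = netdev_set_master(slave, master);	/* pass NULL to break the pairing */
	rtnl_unlock();
	return err;
}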
2937 static void dev_change_rx_flags(struct net_device *dev, int flags)
2939 const struct net_device_ops *ops = dev->netdev_ops;
2941 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
2942 ops->ndo_change_rx_flags(dev, flags);
2945 static int __dev_set_promiscuity(struct net_device *dev, int inc)
2947 unsigned short old_flags = dev->flags;
2949 ASSERT_RTNL();
2951 dev->flags |= IFF_PROMISC;
2952 dev->promiscuity += inc;
2953 if (dev->promiscuity == 0) {
2955 * Avoid overflow.
2956 * If inc causes an overflow, leave promiscuity untouched and return an error.
2958 if (inc < 0)
2959 dev->flags &= ~IFF_PROMISC;
2960 else {
2961 dev->promiscuity -= inc;
2962 printk(KERN_WARNING "%s: promiscuity touches roof, "
2963 "set promiscuity failed, promiscuity feature "
2964 "of device might be broken.\n", dev->name);
2965 return -EOVERFLOW;
2968 if (dev->flags != old_flags) {
2969 printk(KERN_INFO "device %s %s promiscuous mode\n",
2970 dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
2971 "left");
2972 if (audit_enabled)
2973 audit_log(current->audit_context, GFP_ATOMIC,
2974 AUDIT_ANOM_PROMISCUOUS,
2975 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
2976 dev->name, (dev->flags & IFF_PROMISC),
2977 (old_flags & IFF_PROMISC),
2978 audit_get_loginuid(current),
2979 current->uid, current->gid,
2980 audit_get_sessionid(current));
2982 dev_change_rx_flags(dev, IFF_PROMISC);
2984 return 0;
2988 * dev_set_promiscuity - update promiscuity count on a device
2989 * @dev: device
2990 * @inc: modifier
2992 * Add or remove promiscuity from a device. While the count in the device
2993 * remains above zero the interface remains promiscuous. Once it hits zero
2994 * the device reverts back to normal filtering operation. A negative inc
2995 * value is used to drop promiscuity on the device.
2996 * Return 0 if successful or a negative errno code on error.
2998 int dev_set_promiscuity(struct net_device *dev, int inc)
3000 unsigned short old_flags = dev->flags;
3001 int err;
3003 err = __dev_set_promiscuity(dev, inc);
3004 if (err < 0)
3005 return err;
3006 if (dev->flags != old_flags)
3007 dev_set_rx_mode(dev);
3008 return err;
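/*
 * Editorial illustration (not part of dev.c): reference-counted use of
 * dev_set_promiscuity() while a hypothetical capture feature is active.
 * dev_set_allmulti() below follows the same +1/-1 pattern.
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static int example_capture_start(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_promiscuity(dev, 1);	/* take one promiscuity reference */
	rtnl_unlock();
	return err;
}

static void example_capture_stop(struct net_device *dev)
{
	rtnl_lock();
	dev_set_promiscuity(dev, -1);		/* drop our reference */
	rtnl_unlock();
}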
3012 * dev_set_allmulti - update allmulti count on a device
3013 * @dev: device
3014 * @inc: modifier
3016 * Add or remove reception of all multicast frames to a device. While the
3017 * count in the device remains above zero the interface remains listening
3018 * to all multicast frames. Once it hits zero the device reverts back to normal
3019 * filtering operation. A negative @inc value is used to drop the counter
3020 * when releasing a resource needing all multicasts.
3021 * Return 0 if successful or a negative errno code on error.
3024 int dev_set_allmulti(struct net_device *dev, int inc)
3026 unsigned short old_flags = dev->flags;
3028 ASSERT_RTNL();
3030 dev->flags |= IFF_ALLMULTI;
3031 dev->allmulti += inc;
3032 if (dev->allmulti == 0) {
3034 * Avoid overflow.
3035 * If inc causes an overflow, leave allmulti untouched and return an error.
3037 if (inc < 0)
3038 dev->flags &= ~IFF_ALLMULTI;
3039 else {
3040 dev->allmulti -= inc;
3041 printk(KERN_WARNING "%s: allmulti touches roof, "
3042 "set allmulti failed, allmulti feature of "
3043 "device might be broken.\n", dev->name);
3044 return -EOVERFLOW;
3047 if (dev->flags ^ old_flags) {
3048 dev_change_rx_flags(dev, IFF_ALLMULTI);
3049 dev_set_rx_mode(dev);
3051 return 0;
3055 * Upload unicast and multicast address lists to device and
3056 * configure RX filtering. When the device doesn't support unicast
3057 * filtering it is put in promiscuous mode while unicast addresses
3058 * are present.
3060 void __dev_set_rx_mode(struct net_device *dev)
3062 const struct net_device_ops *ops = dev->netdev_ops;
3064 /* dev_open will call this function so the list will stay sane. */
3065 if (!(dev->flags&IFF_UP))
3066 return;
3068 if (!netif_device_present(dev))
3069 return;
3071 if (ops->ndo_set_rx_mode)
3072 ops->ndo_set_rx_mode(dev);
3073 else {
3074 /* Unicast address changes may only happen under the rtnl,
3075 * therefore calling __dev_set_promiscuity here is safe.
3077 if (dev->uc_count > 0 && !dev->uc_promisc) {
3078 __dev_set_promiscuity(dev, 1);
3079 dev->uc_promisc = 1;
3080 } else if (dev->uc_count == 0 && dev->uc_promisc) {
3081 __dev_set_promiscuity(dev, -1);
3082 dev->uc_promisc = 0;
3085 if (ops->ndo_set_multicast_list)
3086 ops->ndo_set_multicast_list(dev);
3090 void dev_set_rx_mode(struct net_device *dev)
3092 netif_addr_lock_bh(dev);
3093 __dev_set_rx_mode(dev);
3094 netif_addr_unlock_bh(dev);
3097 int __dev_addr_delete(struct dev_addr_list **list, int *count,
3098 void *addr, int alen, int glbl)
3100 struct dev_addr_list *da;
3102 for (; (da = *list) != NULL; list = &da->next) {
3103 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3104 alen == da->da_addrlen) {
3105 if (glbl) {
3106 int old_glbl = da->da_gusers;
3107 da->da_gusers = 0;
3108 if (old_glbl == 0)
3109 break;
3111 if (--da->da_users)
3112 return 0;
3114 *list = da->next;
3115 kfree(da);
3116 (*count)--;
3117 return 0;
3120 return -ENOENT;
3123 int __dev_addr_add(struct dev_addr_list **list, int *count,
3124 void *addr, int alen, int glbl)
3126 struct dev_addr_list *da;
3128 for (da = *list; da != NULL; da = da->next) {
3129 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3130 da->da_addrlen == alen) {
3131 if (glbl) {
3132 int old_glbl = da->da_gusers;
3133 da->da_gusers = 1;
3134 if (old_glbl)
3135 return 0;
3137 da->da_users++;
3138 return 0;
3142 da = kzalloc(sizeof(*da), GFP_ATOMIC);
3143 if (da == NULL)
3144 return -ENOMEM;
3145 memcpy(da->da_addr, addr, alen);
3146 da->da_addrlen = alen;
3147 da->da_users = 1;
3148 da->da_gusers = glbl ? 1 : 0;
3149 da->next = *list;
3150 *list = da;
3151 (*count)++;
3152 return 0;
3156 * dev_unicast_delete - Release secondary unicast address.
3157 * @dev: device
3158 * @addr: address to delete
3159 * @alen: length of @addr
3161 * Release reference to a secondary unicast address and remove it
3162 * from the device if the reference count drops to zero.
3164 * The caller must hold the rtnl_mutex.
3166 int dev_unicast_delete(struct net_device *dev, void *addr, int alen)
3168 int err;
3170 ASSERT_RTNL();
3172 netif_addr_lock_bh(dev);
3173 err = __dev_addr_delete(&dev->uc_list, &dev->uc_count, addr, alen, 0);
3174 if (!err)
3175 __dev_set_rx_mode(dev);
3176 netif_addr_unlock_bh(dev);
3177 return err;
3179 EXPORT_SYMBOL(dev_unicast_delete);
3182 * dev_unicast_add - add a secondary unicast address
3183 * @dev: device
3184 * @addr: address to add
3185 * @alen: length of @addr
3187 * Add a secondary unicast address to the device or increase
3188 * the reference count if it already exists.
3190 * The caller must hold the rtnl_mutex.
3192 int dev_unicast_add(struct net_device *dev, void *addr, int alen)
3194 int err;
3196 ASSERT_RTNL();
3198 netif_addr_lock_bh(dev);
3199 err = __dev_addr_add(&dev->uc_list, &dev->uc_count, addr, alen, 0);
3200 if (!err)
3201 __dev_set_rx_mode(dev);
3202 netif_addr_unlock_bh(dev);
3203 return err;
3205 EXPORT_SYMBOL(dev_unicast_add);
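/*
 * Editorial illustration (not part of dev.c): adding and removing a
 * secondary unicast MAC address with the helpers above.  Both calls
 * require the RTNL lock, per their kerneldoc.
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/if_ether.h>

static int example_claim_extra_mac(struct net_device *dev, const u8 *mac)
{
	int err;

	rtnl_lock();
	err = dev_unicast_add(dev, (void *)mac, ETH_ALEN);
	rtnl_unlock();
	return err;
}

static void example_release_extra_mac(struct net_device *dev, const u8 *mac)
{
	rtnl_lock();
	dev_unicast_delete(dev, (void *)mac, ETH_ALEN);
	rtnl_unlock();
}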
3207 int __dev_addr_sync(struct dev_addr_list **to, int *to_count,
3208 struct dev_addr_list **from, int *from_count)
3210 struct dev_addr_list *da, *next;
3211 int err = 0;
3213 da = *from;
3214 while (da != NULL) {
3215 next = da->next;
3216 if (!da->da_synced) {
3217 err = __dev_addr_add(to, to_count,
3218 da->da_addr, da->da_addrlen, 0);
3219 if (err < 0)
3220 break;
3221 da->da_synced = 1;
3222 da->da_users++;
3223 } else if (da->da_users == 1) {
3224 __dev_addr_delete(to, to_count,
3225 da->da_addr, da->da_addrlen, 0);
3226 __dev_addr_delete(from, from_count,
3227 da->da_addr, da->da_addrlen, 0);
3229 da = next;
3231 return err;
3234 void __dev_addr_unsync(struct dev_addr_list **to, int *to_count,
3235 struct dev_addr_list **from, int *from_count)
3237 struct dev_addr_list *da, *next;
3239 da = *from;
3240 while (da != NULL) {
3241 next = da->next;
3242 if (da->da_synced) {
3243 __dev_addr_delete(to, to_count,
3244 da->da_addr, da->da_addrlen, 0);
3245 da->da_synced = 0;
3246 __dev_addr_delete(from, from_count,
3247 da->da_addr, da->da_addrlen, 0);
3249 da = next;
3254 * dev_unicast_sync - Synchronize device's unicast list to another device
3255 * @to: destination device
3256 * @from: source device
3258 * Add newly added addresses to the destination device and release
3259 * addresses that have no users left. The source device must be
3260 * locked by netif_addr_lock_bh.
3262 * This function is intended to be called from the dev->set_rx_mode
3263 * function of layered software devices.
3265 int dev_unicast_sync(struct net_device *to, struct net_device *from)
3267 int err = 0;
3269 netif_addr_lock_bh(to);
3270 err = __dev_addr_sync(&to->uc_list, &to->uc_count,
3271 &from->uc_list, &from->uc_count);
3272 if (!err)
3273 __dev_set_rx_mode(to);
3274 netif_addr_unlock_bh(to);
3275 return err;
3277 EXPORT_SYMBOL(dev_unicast_sync);
3280 * dev_unicast_unsync - Remove synchronized addresses from the destination device
3281 * @to: destination device
3282 * @from: source device
3284 * Remove all addresses that were added to the destination device by
3285 * dev_unicast_sync(). This function is intended to be called from the
3286 * dev->stop function of layered software devices.
3288 void dev_unicast_unsync(struct net_device *to, struct net_device *from)
3290 netif_addr_lock_bh(from);
3291 netif_addr_lock(to);
3293 __dev_addr_unsync(&to->uc_list, &to->uc_count,
3294 &from->uc_list, &from->uc_count);
3295 __dev_set_rx_mode(to);
3297 netif_addr_unlock(to);
3298 netif_addr_unlock_bh(from);
3300 EXPORT_SYMBOL(dev_unicast_unsync);
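/*
 * Editorial illustration (not part of dev.c): how a layered device
 * (macvlan-like) might propagate its unicast list to the lower device
 * with dev_unicast_sync()/dev_unicast_unsync().  struct example_upper
 * and its ->lowerdev field are hypothetical.
 */
#include <linux/netdevice.h>

struct example_upper {
	struct net_device *lowerdev;
};

/* called from the upper device's ndo_set_rx_mode */
static void example_upper_set_rx_mode(struct net_device *upper)
{
	struct example_upper *priv = netdev_priv(upper);

	dev_unicast_sync(priv->lowerdev, upper);	/* push new addresses down */
}

/* called from the upper device's ndo_stop */
static int example_upper_stop(struct net_device *upper)
{
	struct example_upper *priv = netdev_priv(upper);

	dev_unicast_unsync(priv->lowerdev, upper);	/* remove what we synced */
	return 0;
}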
3302 static void __dev_addr_discard(struct dev_addr_list **list)
3304 struct dev_addr_list *tmp;
3306 while (*list != NULL) {
3307 tmp = *list;
3308 *list = tmp->next;
3309 if (tmp->da_users > tmp->da_gusers)
3310 printk("__dev_addr_discard: address leakage! "
3311 "da_users=%d\n", tmp->da_users);
3312 kfree(tmp);
3316 static void dev_addr_discard(struct net_device *dev)
3318 netif_addr_lock_bh(dev);
3320 __dev_addr_discard(&dev->uc_list);
3321 dev->uc_count = 0;
3323 __dev_addr_discard(&dev->mc_list);
3324 dev->mc_count = 0;
3326 netif_addr_unlock_bh(dev);
3330 * dev_get_flags - get flags reported to userspace
3331 * @dev: device
3333 * Get the combination of flag bits exported through APIs to userspace.
3335 unsigned dev_get_flags(const struct net_device *dev)
3337 unsigned flags;
3339 flags = (dev->flags & ~(IFF_PROMISC |
3340 IFF_ALLMULTI |
3341 IFF_RUNNING |
3342 IFF_LOWER_UP |
3343 IFF_DORMANT)) |
3344 (dev->gflags & (IFF_PROMISC |
3345 IFF_ALLMULTI));
3347 if (netif_running(dev)) {
3348 if (netif_oper_up(dev))
3349 flags |= IFF_RUNNING;
3350 if (netif_carrier_ok(dev))
3351 flags |= IFF_LOWER_UP;
3352 if (netif_dormant(dev))
3353 flags |= IFF_DORMANT;
3356 return flags;
3360 * dev_change_flags - change device settings
3361 * @dev: device
3362 * @flags: device state flags
3364 * Change settings on device based state flags. The flags are
3365 * in the userspace exported format.
3367 int dev_change_flags(struct net_device *dev, unsigned flags)
3369 int ret, changes;
3370 int old_flags = dev->flags;
3372 ASSERT_RTNL();
3375 * Set the flags on our device.
3378 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
3379 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
3380 IFF_AUTOMEDIA)) |
3381 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
3382 IFF_ALLMULTI));
3385 * Load in the correct multicast list now the flags have changed.
3388 if ((old_flags ^ flags) & IFF_MULTICAST)
3389 dev_change_rx_flags(dev, IFF_MULTICAST);
3391 dev_set_rx_mode(dev);
3394 * Have we downed the interface? We handle IFF_UP ourselves
3395 * according to user attempts to set it, rather than blindly
3396 * setting it.
3399 ret = 0;
3400 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
3401 ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
3403 if (!ret)
3404 dev_set_rx_mode(dev);
3407 if (dev->flags & IFF_UP &&
3408 ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
3409 IFF_VOLATILE)))
3410 call_netdevice_notifiers(NETDEV_CHANGE, dev);
3412 if ((flags ^ dev->gflags) & IFF_PROMISC) {
3413 int inc = (flags & IFF_PROMISC) ? +1 : -1;
3414 dev->gflags ^= IFF_PROMISC;
3415 dev_set_promiscuity(dev, inc);
3418 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
3419 is important. Some (broken) drivers set IFF_PROMISC when
3420 IFF_ALLMULTI is requested, without asking us and without reporting it.
3422 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
3423 int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
3424 dev->gflags ^= IFF_ALLMULTI;
3425 dev_set_allmulti(dev, inc);
3428 /* Exclude state transition flags, already notified */
3429 changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING);
3430 if (changes)
3431 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
3433 return ret;
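/*
 * Editorial illustration (not part of dev.c): administratively bringing
 * an interface up from inside the kernel via dev_change_flags(), which
 * expects the userspace flag format returned by dev_get_flags().
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/if.h>

static int example_bring_up(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_change_flags(dev, dev_get_flags(dev) | IFF_UP);
	rtnl_unlock();
	return err;
}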
3437 * dev_set_mtu - Change maximum transfer unit
3438 * @dev: device
3439 * @new_mtu: new transfer unit
3441 * Change the maximum transfer size of the network device.
3443 int dev_set_mtu(struct net_device *dev, int new_mtu)
3445 const struct net_device_ops *ops = dev->netdev_ops;
3446 int err;
3448 if (new_mtu == dev->mtu)
3449 return 0;
3451 /* MTU must not be negative. */
3452 if (new_mtu < 0)
3453 return -EINVAL;
3455 if (!netif_device_present(dev))
3456 return -ENODEV;
3458 err = 0;
3459 if (ops->ndo_change_mtu)
3460 err = ops->ndo_change_mtu(dev, new_mtu);
3461 else
3462 dev->mtu = new_mtu;
3464 if (!err && dev->flags & IFF_UP)
3465 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
3466 return err;
3470 * dev_set_mac_address - Change Media Access Control Address
3471 * @dev: device
3472 * @sa: new address
3474 * Change the hardware (MAC) address of the device
3476 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
3478 const struct net_device_ops *ops = dev->netdev_ops;
3479 int err;
3481 if (!ops->ndo_set_mac_address)
3482 return -EOPNOTSUPP;
3483 if (sa->sa_family != dev->type)
3484 return -EINVAL;
3485 if (!netif_device_present(dev))
3486 return -ENODEV;
3487 err = ops->ndo_set_mac_address(dev, sa);
3488 if (!err)
3489 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3490 return err;
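/*
 * Editorial illustration (not part of dev.c): changing a device's MTU
 * and MAC address with the helpers above.  Both expect the RTNL lock to
 * be held by the caller, and the sockaddr family must match dev->type.
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/socket.h>
#include <linux/string.h>

static int example_reconfigure(struct net_device *dev, int new_mtu,
			       const unsigned char *new_mac)
{
	struct sockaddr sa;
	int err;

	sa.sa_family = dev->type;		/* e.g. ARPHRD_ETHER */
	memcpy(sa.sa_data, new_mac, dev->addr_len);

	rtnl_lock();
	err = dev_set_mtu(dev, new_mtu);
	if (!err)
		err = dev_set_mac_address(dev, &sa);
	rtnl_unlock();
	return err;
}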
3494 * Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock)
3496 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
3498 int err;
3499 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3501 if (!dev)
3502 return -ENODEV;
3504 switch (cmd) {
3505 case SIOCGIFFLAGS: /* Get interface flags */
3506 ifr->ifr_flags = dev_get_flags(dev);
3507 return 0;
3509 case SIOCGIFMETRIC: /* Get the metric on the interface
3510 (currently unused) */
3511 ifr->ifr_metric = 0;
3512 return 0;
3514 case SIOCGIFMTU: /* Get the MTU of a device */
3515 ifr->ifr_mtu = dev->mtu;
3516 return 0;
3518 case SIOCGIFHWADDR:
3519 if (!dev->addr_len)
3520 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
3521 else
3522 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
3523 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3524 ifr->ifr_hwaddr.sa_family = dev->type;
3525 return 0;
3527 case SIOCGIFSLAVE:
3528 err = -EINVAL;
3529 break;
3531 case SIOCGIFMAP:
3532 ifr->ifr_map.mem_start = dev->mem_start;
3533 ifr->ifr_map.mem_end = dev->mem_end;
3534 ifr->ifr_map.base_addr = dev->base_addr;
3535 ifr->ifr_map.irq = dev->irq;
3536 ifr->ifr_map.dma = dev->dma;
3537 ifr->ifr_map.port = dev->if_port;
3538 return 0;
3540 case SIOCGIFINDEX:
3541 ifr->ifr_ifindex = dev->ifindex;
3542 return 0;
3544 case SIOCGIFTXQLEN:
3545 ifr->ifr_qlen = dev->tx_queue_len;
3546 return 0;
3548 default:
3549 /* dev_ioctl() should ensure this case
3550 * is never reached
3552 WARN_ON(1);
3553 err = -EINVAL;
3554 break;
3557 return err;
3561 * Perform the SIOCxIFxxx calls, inside rtnl_lock()
3563 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
3565 int err;
3566 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3567 const struct net_device_ops *ops = dev->netdev_ops;
3569 if (!dev)
3570 return -ENODEV;
3572 switch (cmd) {
3573 case SIOCSIFFLAGS: /* Set interface flags */
3574 return dev_change_flags(dev, ifr->ifr_flags);
3576 case SIOCSIFMETRIC: /* Set the metric on the interface
3577 (currently unused) */
3578 return -EOPNOTSUPP;
3580 case SIOCSIFMTU: /* Set the MTU of a device */
3581 return dev_set_mtu(dev, ifr->ifr_mtu);
3583 case SIOCSIFHWADDR:
3584 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
3586 case SIOCSIFHWBROADCAST:
3587 if (ifr->ifr_hwaddr.sa_family != dev->type)
3588 return -EINVAL;
3589 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
3590 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3591 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3592 return 0;
3594 case SIOCSIFMAP:
3595 if (ops->ndo_set_config) {
3596 if (!netif_device_present(dev))
3597 return -ENODEV;
3598 return ops->ndo_set_config(dev, &ifr->ifr_map);
3600 return -EOPNOTSUPP;
3602 case SIOCADDMULTI:
3603 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
3604 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3605 return -EINVAL;
3606 if (!netif_device_present(dev))
3607 return -ENODEV;
3608 return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
3609 dev->addr_len, 1);
3611 case SIOCDELMULTI:
3612 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
3613 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3614 return -EINVAL;
3615 if (!netif_device_present(dev))
3616 return -ENODEV;
3617 return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
3618 dev->addr_len, 1);
3620 case SIOCSIFTXQLEN:
3621 if (ifr->ifr_qlen < 0)
3622 return -EINVAL;
3623 dev->tx_queue_len = ifr->ifr_qlen;
3624 return 0;
3626 case SIOCSIFNAME:
3627 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
3628 return dev_change_name(dev, ifr->ifr_newname);
3631 * Unknown or private ioctl
3634 default:
3635 if ((cmd >= SIOCDEVPRIVATE &&
3636 cmd <= SIOCDEVPRIVATE + 15) ||
3637 cmd == SIOCBONDENSLAVE ||
3638 cmd == SIOCBONDRELEASE ||
3639 cmd == SIOCBONDSETHWADDR ||
3640 cmd == SIOCBONDSLAVEINFOQUERY ||
3641 cmd == SIOCBONDINFOQUERY ||
3642 cmd == SIOCBONDCHANGEACTIVE ||
3643 cmd == SIOCGMIIPHY ||
3644 cmd == SIOCGMIIREG ||
3645 cmd == SIOCSMIIREG ||
3646 cmd == SIOCBRADDIF ||
3647 cmd == SIOCBRDELIF ||
3648 cmd == SIOCWANDEV) {
3649 err = -EOPNOTSUPP;
3650 if (ops->ndo_do_ioctl) {
3651 if (netif_device_present(dev))
3652 err = ops->ndo_do_ioctl(dev, ifr, cmd);
3653 else
3654 err = -ENODEV;
3656 } else
3657 err = -EINVAL;
3660 return err;
3664 * This function handles all "interface"-type I/O control requests. The actual
3665 * 'doing' part of this is dev_ifsioc above.
3669 * dev_ioctl - network device ioctl
3670 * @net: the applicable net namespace
3671 * @cmd: command to issue
3672 * @arg: pointer to a struct ifreq in user space
3674 * Issue ioctl functions to devices. This is normally called by the
3675 * user space syscall interfaces but can sometimes be useful for
3676 * other purposes. The return value is the return from the syscall if
3677 * positive or a negative errno code on error.
3680 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3682 struct ifreq ifr;
3683 int ret;
3684 char *colon;
3686 /* One special case: SIOCGIFCONF takes ifconf argument
3687 and requires shared lock, because it sleeps writing
3688 to user space.
3691 if (cmd == SIOCGIFCONF) {
3692 rtnl_lock();
3693 ret = dev_ifconf(net, (char __user *) arg);
3694 rtnl_unlock();
3695 return ret;
3697 if (cmd == SIOCGIFNAME)
3698 return dev_ifname(net, (struct ifreq __user *)arg);
3700 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3701 return -EFAULT;
3703 ifr.ifr_name[IFNAMSIZ-1] = 0;
3705 colon = strchr(ifr.ifr_name, ':');
3706 if (colon)
3707 *colon = 0;
3710 * See which interface the caller is talking about.
3713 switch (cmd) {
3715 * These ioctl calls:
3716 * - can be done by all.
3717 * - atomic and do not require locking.
3718 * - return a value
3720 case SIOCGIFFLAGS:
3721 case SIOCGIFMETRIC:
3722 case SIOCGIFMTU:
3723 case SIOCGIFHWADDR:
3724 case SIOCGIFSLAVE:
3725 case SIOCGIFMAP:
3726 case SIOCGIFINDEX:
3727 case SIOCGIFTXQLEN:
3728 dev_load(net, ifr.ifr_name);
3729 read_lock(&dev_base_lock);
3730 ret = dev_ifsioc_locked(net, &ifr, cmd);
3731 read_unlock(&dev_base_lock);
3732 if (!ret) {
3733 if (colon)
3734 *colon = ':';
3735 if (copy_to_user(arg, &ifr,
3736 sizeof(struct ifreq)))
3737 ret = -EFAULT;
3739 return ret;
3741 case SIOCETHTOOL:
3742 dev_load(net, ifr.ifr_name);
3743 rtnl_lock();
3744 ret = dev_ethtool(net, &ifr);
3745 rtnl_unlock();
3746 if (!ret) {
3747 if (colon)
3748 *colon = ':';
3749 if (copy_to_user(arg, &ifr,
3750 sizeof(struct ifreq)))
3751 ret = -EFAULT;
3753 return ret;
3756 * These ioctl calls:
3757 * - require superuser power.
3758 * - require strict serialization.
3759 * - return a value
3761 case SIOCGMIIPHY:
3762 case SIOCGMIIREG:
3763 case SIOCSIFNAME:
3764 if (!capable(CAP_NET_ADMIN))
3765 return -EPERM;
3766 dev_load(net, ifr.ifr_name);
3767 rtnl_lock();
3768 ret = dev_ifsioc(net, &ifr, cmd);
3769 rtnl_unlock();
3770 if (!ret) {
3771 if (colon)
3772 *colon = ':';
3773 if (copy_to_user(arg, &ifr,
3774 sizeof(struct ifreq)))
3775 ret = -EFAULT;
3777 return ret;
3780 * These ioctl calls:
3781 * - require superuser power.
3782 * - require strict serialization.
3783 * - do not return a value
3785 case SIOCSIFFLAGS:
3786 case SIOCSIFMETRIC:
3787 case SIOCSIFMTU:
3788 case SIOCSIFMAP:
3789 case SIOCSIFHWADDR:
3790 case SIOCSIFSLAVE:
3791 case SIOCADDMULTI:
3792 case SIOCDELMULTI:
3793 case SIOCSIFHWBROADCAST:
3794 case SIOCSIFTXQLEN:
3795 case SIOCSMIIREG:
3796 case SIOCBONDENSLAVE:
3797 case SIOCBONDRELEASE:
3798 case SIOCBONDSETHWADDR:
3799 case SIOCBONDCHANGEACTIVE:
3800 case SIOCBRADDIF:
3801 case SIOCBRDELIF:
3802 if (!capable(CAP_NET_ADMIN))
3803 return -EPERM;
3804 /* fall through */
3805 case SIOCBONDSLAVEINFOQUERY:
3806 case SIOCBONDINFOQUERY:
3807 dev_load(net, ifr.ifr_name);
3808 rtnl_lock();
3809 ret = dev_ifsioc(net, &ifr, cmd);
3810 rtnl_unlock();
3811 return ret;
3813 case SIOCGIFMEM:
3814 /* Get the per device memory space. We can add this but
3815 * currently do not support it */
3816 case SIOCSIFMEM:
3817 /* Set the per device memory buffer space.
3818 * Not applicable in our case */
3819 case SIOCSIFLINK:
3820 return -EINVAL;
3823 * Unknown or private ioctl.
3825 default:
3826 if (cmd == SIOCWANDEV ||
3827 (cmd >= SIOCDEVPRIVATE &&
3828 cmd <= SIOCDEVPRIVATE + 15)) {
3829 dev_load(net, ifr.ifr_name);
3830 rtnl_lock();
3831 ret = dev_ifsioc(net, &ifr, cmd);
3832 rtnl_unlock();
3833 if (!ret && copy_to_user(arg, &ifr,
3834 sizeof(struct ifreq)))
3835 ret = -EFAULT;
3836 return ret;
3838 /* Take care of Wireless Extensions */
3839 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
3840 return wext_handle_ioctl(net, &ifr, cmd, arg);
3841 return -EINVAL;
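/*
 * Editorial illustration (not part of dev.c): the user-space side of the
 * SIOCGIFMTU path handled by dev_ioctl()/dev_ifsioc_locked() above.
 * Plain user-space C, shown only to make the calling convention concrete.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>

int main(void)
{
	struct ifreq ifr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return 1;
	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
	if (ioctl(fd, SIOCGIFMTU, &ifr) == 0)
		printf("%s mtu = %d\n", ifr.ifr_name, ifr.ifr_mtu);
	close(fd);
	return 0;
}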
3847 * dev_new_index - allocate an ifindex
3848 * @net: the applicable net namespace
3850 * Returns a suitable unique value for a new device interface
3851 * number. The caller must hold the rtnl semaphore or the
3852 * dev_base_lock to be sure it remains unique.
3854 static int dev_new_index(struct net *net)
3856 static int ifindex;
3857 for (;;) {
3858 if (++ifindex <= 0)
3859 ifindex = 1;
3860 if (!__dev_get_by_index(net, ifindex))
3861 return ifindex;
3865 /* Delayed registration/unregistration */
3866 static LIST_HEAD(net_todo_list);
3868 static void net_set_todo(struct net_device *dev)
3870 list_add_tail(&dev->todo_list, &net_todo_list);
3873 static void rollback_registered(struct net_device *dev)
3875 BUG_ON(dev_boot_phase);
3876 ASSERT_RTNL();
3878 /* Some devices call this without having registered, as part of initialization unwind. */
3879 if (dev->reg_state == NETREG_UNINITIALIZED) {
3880 printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
3881 "was registered\n", dev->name, dev);
3883 WARN_ON(1);
3884 return;
3887 BUG_ON(dev->reg_state != NETREG_REGISTERED);
3889 /* If device is running, close it first. */
3890 dev_close(dev);
3892 /* And unlink it from device chain. */
3893 unlist_netdevice(dev);
3895 dev->reg_state = NETREG_UNREGISTERING;
3897 synchronize_net();
3899 /* Shutdown queueing discipline. */
3900 dev_shutdown(dev);
3903 /* Notify protocols that we are about to destroy
3904 this device. They should clean up all of their state.
3906 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
3909 * Flush the unicast and multicast chains
3911 dev_addr_discard(dev);
3913 if (dev->netdev_ops->ndo_uninit)
3914 dev->netdev_ops->ndo_uninit(dev);
3916 /* Notifier chain MUST detach us from master device. */
3917 WARN_ON(dev->master);
3919 /* Remove entries from kobject tree */
3920 netdev_unregister_kobject(dev);
3922 synchronize_net();
3924 dev_put(dev);
3927 static void __netdev_init_queue_locks_one(struct net_device *dev,
3928 struct netdev_queue *dev_queue,
3929 void *_unused)
3931 spin_lock_init(&dev_queue->_xmit_lock);
3932 netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
3933 dev_queue->xmit_lock_owner = -1;
3936 static void netdev_init_queue_locks(struct net_device *dev)
3938 netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
3939 __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
3942 unsigned long netdev_fix_features(unsigned long features, const char *name)
3944 /* Fix illegal SG+CSUM combinations. */
3945 if ((features & NETIF_F_SG) &&
3946 !(features & NETIF_F_ALL_CSUM)) {
3947 if (name)
3948 printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
3949 "checksum feature.\n", name);
3950 features &= ~NETIF_F_SG;
3953 /* TSO requires that SG is present as well. */
3954 if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
3955 if (name)
3956 printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
3957 "SG feature.\n", name);
3958 features &= ~NETIF_F_TSO;
3961 if (features & NETIF_F_UFO) {
3962 if (!(features & NETIF_F_GEN_CSUM)) {
3963 if (name)
3964 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
3965 "since no NETIF_F_HW_CSUM feature.\n",
3966 name);
3967 features &= ~NETIF_F_UFO;
3970 if (!(features & NETIF_F_SG)) {
3971 if (name)
3972 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
3973 "since no NETIF_F_SG feature.\n", name);
3974 features &= ~NETIF_F_UFO;
3978 return features;
3980 EXPORT_SYMBOL(netdev_fix_features);
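/*
 * Editorial illustration (not part of dev.c): a driver sanity-checking an
 * advertised feature mask with netdev_fix_features() before registration.
 * The starting mask here is deliberately inconsistent.
 */
#include <linux/netdevice.h>

static void example_fixup_features(struct net_device *dev)
{
	/* TSO without any checksum offload is illegal; the helper drops
	 * NETIF_F_SG (no checksum feature) and then NETIF_F_TSO (no SG),
	 * logging a notice for each. */
	dev->features = NETIF_F_SG | NETIF_F_TSO;
	dev->features = netdev_fix_features(dev->features, dev->name);
}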
3983 * register_netdevice - register a network device
3984 * @dev: device to register
3986 * Take a completed network device structure and add it to the kernel
3987 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
3988 * chain. 0 is returned on success. A negative errno code is returned
3989 * on a failure to set up the device, or if the name is a duplicate.
3991 * Callers must hold the rtnl semaphore. You may want
3992 * register_netdev() instead of this.
3994 * BUGS:
3995 * The locking appears insufficient to guarantee two parallel registers
3996 * will not get the same name.
3999 int register_netdevice(struct net_device *dev)
4001 struct hlist_head *head;
4002 struct hlist_node *p;
4003 int ret;
4004 struct net *net = dev_net(dev);
4006 BUG_ON(dev_boot_phase);
4007 ASSERT_RTNL();
4009 might_sleep();
4011 /* When net_device structures are persistent, this will be fatal. */
4012 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
4013 BUG_ON(!net);
4015 spin_lock_init(&dev->addr_list_lock);
4016 netdev_set_addr_lockdep_class(dev);
4017 netdev_init_queue_locks(dev);
4019 dev->iflink = -1;
4021 #ifdef CONFIG_COMPAT_NET_DEV_OPS
4022 /* Netdevice_ops API compatibility support.
4023 * This is temporary until all network devices are converted.
4025 if (dev->netdev_ops) {
4026 const struct net_device_ops *ops = dev->netdev_ops;
4028 dev->init = ops->ndo_init;
4029 dev->uninit = ops->ndo_uninit;
4030 dev->open = ops->ndo_open;
4031 dev->change_rx_flags = ops->ndo_change_rx_flags;
4032 dev->set_rx_mode = ops->ndo_set_rx_mode;
4033 dev->set_multicast_list = ops->ndo_set_multicast_list;
4034 dev->set_mac_address = ops->ndo_set_mac_address;
4035 dev->validate_addr = ops->ndo_validate_addr;
4036 dev->do_ioctl = ops->ndo_do_ioctl;
4037 dev->set_config = ops->ndo_set_config;
4038 dev->change_mtu = ops->ndo_change_mtu;
4039 dev->tx_timeout = ops->ndo_tx_timeout;
4040 dev->get_stats = ops->ndo_get_stats;
4041 dev->vlan_rx_register = ops->ndo_vlan_rx_register;
4042 dev->vlan_rx_add_vid = ops->ndo_vlan_rx_add_vid;
4043 dev->vlan_rx_kill_vid = ops->ndo_vlan_rx_kill_vid;
4044 #ifdef CONFIG_NET_POLL_CONTROLLER
4045 dev->poll_controller = ops->ndo_poll_controller;
4046 #endif
4047 } else {
4048 char drivername[64];
4049 pr_info("%s (%s): not using net_device_ops yet\n",
4050 dev->name, netdev_drivername(dev, drivername, 64));
4052 /* This works only because net_device_ops and the
4053 compatibility structure share the same layout. */
4054 dev->netdev_ops = (void *) &(dev->init);
4056 #endif
4058 /* Init, if this function is available */
4059 if (dev->netdev_ops->ndo_init) {
4060 ret = dev->netdev_ops->ndo_init(dev);
4061 if (ret) {
4062 if (ret > 0)
4063 ret = -EIO;
4064 goto out;
4068 if (!dev_valid_name(dev->name)) {
4069 ret = -EINVAL;
4070 goto err_uninit;
4073 dev->ifindex = dev_new_index(net);
4074 if (dev->iflink == -1)
4075 dev->iflink = dev->ifindex;
4077 /* Check for existence of name */
4078 head = dev_name_hash(net, dev->name);
4079 hlist_for_each(p, head) {
4080 struct net_device *d
4081 = hlist_entry(p, struct net_device, name_hlist);
4082 if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
4083 ret = -EEXIST;
4084 goto err_uninit;
4088 /* Fix illegal checksum combinations */
4089 if ((dev->features & NETIF_F_HW_CSUM) &&
4090 (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4091 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
4092 dev->name);
4093 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
4096 if ((dev->features & NETIF_F_NO_CSUM) &&
4097 (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4098 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
4099 dev->name);
4100 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
4103 dev->features = netdev_fix_features(dev->features, dev->name);
4105 /* Enable software GSO if SG is supported. */
4106 if (dev->features & NETIF_F_SG)
4107 dev->features |= NETIF_F_GSO;
4109 netdev_initialize_kobject(dev);
4110 ret = netdev_register_kobject(dev);
4111 if (ret)
4112 goto err_uninit;
4113 dev->reg_state = NETREG_REGISTERED;
4116 * Default initial state at registration is that the
4117 * device is present.
4120 set_bit(__LINK_STATE_PRESENT, &dev->state);
4122 dev_init_scheduler(dev);
4123 dev_hold(dev);
4124 list_netdevice(dev);
4126 /* Notify protocols, that a new device appeared. */
4127 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
4128 ret = notifier_to_errno(ret);
4129 if (ret) {
4130 rollback_registered(dev);
4131 dev->reg_state = NETREG_UNREGISTERED;
4134 out:
4135 return ret;
4137 err_uninit:
4138 if (dev->netdev_ops->ndo_uninit)
4139 dev->netdev_ops->ndo_uninit(dev);
4140 goto out;
4144 * register_netdev - register a network device
4145 * @dev: device to register
4147 * Take a completed network device structure and add it to the kernel
4148 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4149 * chain. 0 is returned on success. A negative errno code is returned
4150 * on a failure to set up the device, or if the name is a duplicate.
4152 * This is a wrapper around register_netdevice that takes the rtnl semaphore
4153 * and expands the device name if you passed a format string to
4154 * alloc_netdev.
4156 int register_netdev(struct net_device *dev)
4158 int err;
4160 rtnl_lock();
4163 * If the name is a format string the caller wants us to do a
4164 * name allocation.
4166 if (strchr(dev->name, '%')) {
4167 err = dev_alloc_name(dev, dev->name);
4168 if (err < 0)
4169 goto out;
4172 err = register_netdevice(dev);
4173 out:
4174 rtnl_unlock();
4175 return err;
4177 EXPORT_SYMBOL(register_netdev);
4180 * netdev_wait_allrefs - wait until all references are gone.
4182 * This is called when unregistering network devices.
4184 * Any protocol or device that holds a reference should register
4185 * for netdevice notification, and cleanup and put back the
4186 * reference if they receive an UNREGISTER event.
4187 * We can get stuck here if buggy protocols don't correctly
4188 * call dev_put.
4190 static void netdev_wait_allrefs(struct net_device *dev)
4192 unsigned long rebroadcast_time, warning_time;
4194 rebroadcast_time = warning_time = jiffies;
4195 while (atomic_read(&dev->refcnt) != 0) {
4196 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
4197 rtnl_lock();
4199 /* Rebroadcast unregister notification */
4200 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4202 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
4203 &dev->state)) {
4204 /* We must not have linkwatch events
4205 * pending on unregister. If this
4206 * happens, we simply run the queue
4207 * unscheduled, resulting in a noop
4208 * for this device.
4210 linkwatch_run_queue();
4213 __rtnl_unlock();
4215 rebroadcast_time = jiffies;
4218 msleep(250);
4220 if (time_after(jiffies, warning_time + 10 * HZ)) {
4221 printk(KERN_EMERG "unregister_netdevice: "
4222 "waiting for %s to become free. Usage "
4223 "count = %d\n",
4224 dev->name, atomic_read(&dev->refcnt));
4225 warning_time = jiffies;
4230 /* The sequence is:
4232 * rtnl_lock();
4233 * ...
4234 * register_netdevice(x1);
4235 * register_netdevice(x2);
4236 * ...
4237 * unregister_netdevice(y1);
4238 * unregister_netdevice(y2);
4239 * ...
4240 * rtnl_unlock();
4241 * free_netdev(y1);
4242 * free_netdev(y2);
4244 * We are invoked by rtnl_unlock().
4245 * This allows us to deal with problems:
4246 * 1) We can delete sysfs objects which invoke hotplug
4247 * without deadlocking with linkwatch via keventd.
4248 * 2) Since we run with the RTNL semaphore not held, we can sleep
4249 * safely in order to wait for the netdev refcnt to drop to zero.
4251 * We must not return until all unregister events added during
4252 * the interval the lock was held have been completed.
4254 void netdev_run_todo(void)
4256 struct list_head list;
4258 /* Snapshot list, allow later requests */
4259 list_replace_init(&net_todo_list, &list);
4261 __rtnl_unlock();
4263 while (!list_empty(&list)) {
4264 struct net_device *dev
4265 = list_entry(list.next, struct net_device, todo_list);
4266 list_del(&dev->todo_list);
4268 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
4269 printk(KERN_ERR "network todo '%s' but state %d\n",
4270 dev->name, dev->reg_state);
4271 dump_stack();
4272 continue;
4275 dev->reg_state = NETREG_UNREGISTERED;
4277 on_each_cpu(flush_backlog, dev, 1);
4279 netdev_wait_allrefs(dev);
4281 /* paranoia */
4282 BUG_ON(atomic_read(&dev->refcnt));
4283 WARN_ON(dev->ip_ptr);
4284 WARN_ON(dev->ip6_ptr);
4285 WARN_ON(dev->dn_ptr);
4287 if (dev->destructor)
4288 dev->destructor(dev);
4290 /* Free network device */
4291 kobject_put(&dev->dev.kobj);
4296 * dev_get_stats - get network device statistics
4297 * @dev: device to get statistics from
4299 * Get network statistics from device. The device driver may provide
4300 * its own method by setting dev->netdev_ops->get_stats; otherwise
4301 * the internal statistics structure is used.
4303 const struct net_device_stats *dev_get_stats(struct net_device *dev)
4305 const struct net_device_ops *ops = dev->netdev_ops;
4307 if (ops->ndo_get_stats)
4308 return ops->ndo_get_stats(dev);
4309 else
4310 return &dev->stats;
4312 EXPORT_SYMBOL(dev_get_stats);
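/*
 * Editorial illustration (not part of dev.c): a driver supplying its own
 * statistics callback, which dev_get_stats() above will prefer over the
 * generic dev->stats.  example_read_hw_rx_dropped() is hypothetical.
 */
#include <linux/netdevice.h>

static unsigned long example_read_hw_rx_dropped(struct net_device *dev);

static struct net_device_stats *example_get_stats(struct net_device *dev)
{
	/* fold a hardware drop counter into the generic structure */
	dev->stats.rx_dropped += example_read_hw_rx_dropped(dev);
	return &dev->stats;
}

static const struct net_device_ops example_netdev_ops = {
	.ndo_get_stats = example_get_stats,
};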
4314 static void netdev_init_one_queue(struct net_device *dev,
4315 struct netdev_queue *queue,
4316 void *_unused)
4318 queue->dev = dev;
4321 static void netdev_init_queues(struct net_device *dev)
4323 netdev_init_one_queue(dev, &dev->rx_queue, NULL);
4324 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
4325 spin_lock_init(&dev->tx_global_lock);
4329 * alloc_netdev_mq - allocate network device
4330 * @sizeof_priv: size of private data to allocate space for
4331 * @name: device name format string
4332 * @setup: callback to initialize device
4333 * @queue_count: the number of subqueues to allocate
4335 * Allocates a struct net_device with private data area for driver use
4336 * and performs basic initialization. Also allocates subqueue structs
4337 * for each queue on the device at the end of the netdevice.
4339 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
4340 void (*setup)(struct net_device *), unsigned int queue_count)
4342 struct netdev_queue *tx;
4343 struct net_device *dev;
4344 size_t alloc_size;
4345 void *p;
4347 BUG_ON(strlen(name) >= sizeof(dev->name));
4349 alloc_size = sizeof(struct net_device);
4350 if (sizeof_priv) {
4351 /* ensure 32-byte alignment of private area */
4352 alloc_size = (alloc_size + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
4353 alloc_size += sizeof_priv;
4355 /* ensure 32-byte alignment of whole construct */
4356 alloc_size += NETDEV_ALIGN_CONST;
4358 p = kzalloc(alloc_size, GFP_KERNEL);
4359 if (!p) {
4360 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
4361 return NULL;
4364 tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
4365 if (!tx) {
4366 printk(KERN_ERR "alloc_netdev: Unable to allocate "
4367 "tx qdiscs.\n");
4368 kfree(p);
4369 return NULL;
4372 dev = (struct net_device *)
4373 (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
4374 dev->padded = (char *)dev - (char *)p;
4375 dev_net_set(dev, &init_net);
4377 dev->_tx = tx;
4378 dev->num_tx_queues = queue_count;
4379 dev->real_num_tx_queues = queue_count;
4381 if (sizeof_priv) {
4382 dev->priv = ((char *)dev +
4383 ((sizeof(struct net_device) + NETDEV_ALIGN_CONST)
4384 & ~NETDEV_ALIGN_CONST));
4387 dev->gso_max_size = GSO_MAX_SIZE;
4389 netdev_init_queues(dev);
4391 netpoll_netdev_init(dev);
4392 setup(dev);
4393 strcpy(dev->name, name);
4394 return dev;
4396 EXPORT_SYMBOL(alloc_netdev_mq);
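/*
 * Editorial example (not part of dev.c): a hedged sketch of a driver
 * allocating a four-queue Ethernet-style device with the function above.
 * foo_create() is hypothetical; alloc_netdev_mq(), ether_setup() and
 * netdev_priv() are the kernel APIs assumed, and foo_netdev_ops refers
 * to the hypothetical ops from the sketch further up.
 */
static struct net_device *foo_create(void)
{
	struct net_device *dev;

	/* the "%d" is expanded to a free unit number when the device is registered */
	dev = alloc_netdev_mq(sizeof(struct foo_priv), "foo%d",
			      ether_setup, 4);
	if (!dev)
		return NULL;

	dev->netdev_ops = &foo_netdev_ops;
	return dev;
}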
4399 * free_netdev - free network device
4400 * @dev: device
4402 * This function does the last stage of destroying an allocated device
4403 * interface. The reference to the device object is released.
4404 * If this is the last reference then it will be freed.
4406 void free_netdev(struct net_device *dev)
4408 release_net(dev_net(dev));
4410 kfree(dev->_tx);
4412 /* Compatibility with error handling in drivers */
4413 if (dev->reg_state == NETREG_UNINITIALIZED) {
4414 kfree((char *)dev - dev->padded);
4415 return;
4418 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
4419 dev->reg_state = NETREG_RELEASED;
4421 /* will free via device release */
4422 put_device(&dev->dev);
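/*
 * Editorial example (not part of dev.c): the error path that the
 * NETREG_UNINITIALIZED check above exists for.  A driver that fails
 * before or during register_netdev() just calls free_netdev() and the
 * memory is released immediately.  foo_probe() is hypothetical;
 * alloc_etherdev(), register_netdev() and free_netdev() are assumed
 * kernel APIs.
 */
static int foo_probe(void)
{
	struct net_device *dev;
	int err;

	dev = alloc_etherdev(0);		/* no private data in this sketch */
	if (!dev)
		return -ENOMEM;

	err = register_netdev(dev);
	if (err) {
		/* still NETREG_UNINITIALIZED: freed directly, no todo work */
		free_netdev(dev);
		return err;
	}
	return 0;
}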
4426 * synchronize_net - Synchronize with packet receive processing
4428 * Wait for packets currently being received to be done.
4429 * Does not block later packets from starting.
4431 void synchronize_net(void)
4433 might_sleep();
4434 synchronize_rcu();
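/*
 * Editorial example (not part of dev.c): a hedged sketch of the usual
 * pattern around synchronize_net().  __dev_remove_pack() unlinks a
 * handler but does not wait, so the caller synchronizes before the
 * handler (or its module text) can go away.  foo_pt and foo_unhook()
 * are hypothetical.
 */
static struct packet_type foo_pt;	/* hypothetical, registered elsewhere */

static void foo_unhook(void)
{
	__dev_remove_pack(&foo_pt);	/* new packets no longer see the hook */
	synchronize_net();		/* wait out receives already in flight */
}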
4438 * unregister_netdevice - remove device from the kernel
4439 * @dev: device
4441 * This function shuts down a device interface and removes it
4442 * from the kernel tables.
4444 * Callers must hold the rtnl semaphore. You may want
4445 * unregister_netdev() instead of this.
4448 void unregister_netdevice(struct net_device *dev)
4450 ASSERT_RTNL();
4452 rollback_registered(dev);
4453 /* Finish processing unregister after unlock */
4454 net_set_todo(dev);
4458 * unregister_netdev - remove device from the kernel
4459 * @dev: device
4461 * This function shuts down a device interface and removes it
4462 * from the kernel tables.
4464 * This is just a wrapper for unregister_netdevice that takes
4465 * the rtnl semaphore. In general you want to use this and not
4466 * unregister_netdevice.
4468 void unregister_netdev(struct net_device *dev)
4470 rtnl_lock();
4471 unregister_netdevice(dev);
4472 rtnl_unlock();
4475 EXPORT_SYMBOL(unregister_netdev);
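/*
 * Editorial example (not part of dev.c): which variant to call depends on
 * whether the RTNL is already held.  Module exit code typically uses the
 * locking wrapper, while code already running under the RTNL (for instance
 * an rtnl_link_ops ->dellink handler) uses the bare function.  foo_dev,
 * foo_exit() and foo_dellink() are hypothetical.
 */
static struct net_device *foo_dev;	/* hypothetical, set up at probe time */

static void foo_exit(void)
{
	unregister_netdev(foo_dev);	/* takes and drops the RTNL itself */
	free_netdev(foo_dev);
}

static void foo_dellink(struct net_device *dev)
{
	ASSERT_RTNL();			/* rtnl_link callers already hold it */
	unregister_netdevice(dev);
}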
4478 * dev_change_net_namespace - move device to a different network namespace
4479 * @dev: device
4480 * @net: network namespace
4481 * @pat: If not NULL, name pattern to try if the current device name
4482 * is already taken in the destination network namespace.
4484 * This function shuts down a device interface and moves it
4485 * to a new network namespace. On success 0 is returned, on
4486 * failure a negative errno code is returned.
4488 * Callers must hold the rtnl semaphore.
4491 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
4493 char buf[IFNAMSIZ];
4494 const char *destname;
4495 int err;
4497 ASSERT_RTNL();
4499 /* Don't allow namespace local devices to be moved. */
4500 err = -EINVAL;
4501 if (dev->features & NETIF_F_NETNS_LOCAL)
4502 goto out;
4504 #ifdef CONFIG_SYSFS
4505 /* Don't allow real devices to be moved when sysfs
4506 * is enabled.
4508 err = -EINVAL;
4509 if (dev->dev.parent)
4510 goto out;
4511 #endif
4513 /* Ensure the device has been registered */
4514 err = -EINVAL;
4515 if (dev->reg_state != NETREG_REGISTERED)
4516 goto out;
4518 /* Get out if there is nothing to do */
4519 err = 0;
4520 if (net_eq(dev_net(dev), net))
4521 goto out;
4523 /* Pick the destination device name, and ensure
4524 * we can use it in the destination network namespace.
4526 err = -EEXIST;
4527 destname = dev->name;
4528 if (__dev_get_by_name(net, destname)) {
4529 /* We get here if we can't use the current device name */
4530 if (!pat)
4531 goto out;
4532 if (!dev_valid_name(pat))
4533 goto out;
4534 if (strchr(pat, '%')) {
4535 if (__dev_alloc_name(net, pat, buf) < 0)
4536 goto out;
4537 destname = buf;
4538 } else
4539 destname = pat;
4540 if (__dev_get_by_name(net, destname))
4541 goto out;
4545 * And now a mini version of register_netdevice/unregister_netdevice.
4548 /* If device is running close it first. */
4549 dev_close(dev);
4551 /* And unlink it from device chain */
4552 err = -ENODEV;
4553 unlist_netdevice(dev);
4555 synchronize_net();
4557 /* Shutdown queueing discipline. */
4558 dev_shutdown(dev);
4560 /* Notify protocols that we are about to destroy
4561 this device. They should clean up all of their state.
4563 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4566 * Flush the unicast and multicast chains
4568 dev_addr_discard(dev);
4570 netdev_unregister_kobject(dev);
4572 /* Actually switch the network namespace */
4573 dev_net_set(dev, net);
4575 /* Assign the new device name */
4576 if (destname != dev->name)
4577 strcpy(dev->name, destname);
4579 /* If there is an ifindex conflict assign a new one */
4580 if (__dev_get_by_index(net, dev->ifindex)) {
4581 int iflink = (dev->iflink == dev->ifindex);
4582 dev->ifindex = dev_new_index(net);
4583 if (iflink)
4584 dev->iflink = dev->ifindex;
4587 /* Fixup kobjects */
4588 err = netdev_register_kobject(dev);
4589 WARN_ON(err);
4591 /* Add the device back in the hashes */
4592 list_netdevice(dev);
4594 /* Notify protocols that a new device appeared. */
4595 call_netdevice_notifiers(NETDEV_REGISTER, dev);
4597 synchronize_net();
4598 err = 0;
4599 out:
4600 return err;
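/*
 * Editorial example (not part of dev.c): a hedged sketch of a caller of
 * dev_change_net_namespace().  The RTNL must be held, and passing a "%d"
 * pattern gives the function a fallback if the current name is already
 * taken in the target namespace.  foo_move() and 'target' are hypothetical.
 */
static int foo_move(struct net_device *dev, struct net *target)
{
	int err;

	rtnl_lock();
	err = dev_change_net_namespace(dev, target, "dev%d");
	rtnl_unlock();
	return err;
}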
4603 static int dev_cpu_callback(struct notifier_block *nfb,
4604 unsigned long action,
4605 void *ocpu)
4607 struct sk_buff **list_skb;
4608 struct Qdisc **list_net;
4609 struct sk_buff *skb;
4610 unsigned int cpu, oldcpu = (unsigned long)ocpu;
4611 struct softnet_data *sd, *oldsd;
4613 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
4614 return NOTIFY_OK;
4616 local_irq_disable();
4617 cpu = smp_processor_id();
4618 sd = &per_cpu(softnet_data, cpu);
4619 oldsd = &per_cpu(softnet_data, oldcpu);
4621 /* Find end of our completion_queue. */
4622 list_skb = &sd->completion_queue;
4623 while (*list_skb)
4624 list_skb = &(*list_skb)->next;
4625 /* Append completion queue from offline CPU. */
4626 *list_skb = oldsd->completion_queue;
4627 oldsd->completion_queue = NULL;
4629 /* Find end of our output_queue. */
4630 list_net = &sd->output_queue;
4631 while (*list_net)
4632 list_net = &(*list_net)->next_sched;
4633 /* Append output queue from offline CPU. */
4634 *list_net = oldsd->output_queue;
4635 oldsd->output_queue = NULL;
4637 raise_softirq_irqoff(NET_TX_SOFTIRQ);
4638 local_irq_enable();
4640 /* Process offline CPU's input_pkt_queue */
4641 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
4642 netif_rx(skb);
4644 return NOTIFY_OK;
4647 #ifdef CONFIG_NET_DMA
4649 * net_dma_rebalance - try to maintain one DMA channel per CPU
4650 * @net_dma: DMA client and associated data (lock, channels, channel_mask)
4652 * This is called when the number of channels allocated to the net_dma client
4653 * changes. The net_dma client tries to have one DMA channel per CPU.
4656 static void net_dma_rebalance(struct net_dma *net_dma)
4658 unsigned int cpu, i, n, chan_idx;
4659 struct dma_chan *chan;
4661 if (cpus_empty(net_dma->channel_mask)) {
4662 for_each_online_cpu(cpu)
4663 rcu_assign_pointer(per_cpu(softnet_data, cpu).net_dma, NULL);
4664 return;
4667 i = 0;
4668 cpu = first_cpu(cpu_online_map);
4670 for_each_cpu_mask_nr(chan_idx, net_dma->channel_mask) {
4671 chan = net_dma->channels[chan_idx];
4673 n = ((num_online_cpus() / cpus_weight(net_dma->channel_mask))
4674 + (i < (num_online_cpus() %
4675 cpus_weight(net_dma->channel_mask)) ? 1 : 0));
4677 while (n) {
4678 per_cpu(softnet_data, cpu).net_dma = chan;
4679 cpu = next_cpu(cpu, cpu_online_map);
4680 n--;
4682 i++;
4687 * netdev_dma_event - event callback for the net_dma_client
4688 * @client: should always be net_dma_client
4689 * @chan: DMA channel for the event
4690 * @state: DMA state to be handled
4692 static enum dma_state_client
4693 netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
4694 enum dma_state state)
4696 int i, found = 0, pos = -1;
4697 struct net_dma *net_dma =
4698 container_of(client, struct net_dma, client);
4699 enum dma_state_client ack = DMA_DUP; /* default: take no action */
4701 spin_lock(&net_dma->lock);
4702 switch (state) {
4703 case DMA_RESOURCE_AVAILABLE:
4704 for (i = 0; i < nr_cpu_ids; i++)
4705 if (net_dma->channels[i] == chan) {
4706 found = 1;
4707 break;
4708 } else if (net_dma->channels[i] == NULL && pos < 0)
4709 pos = i;
4711 if (!found && pos >= 0) {
4712 ack = DMA_ACK;
4713 net_dma->channels[pos] = chan;
4714 cpu_set(pos, net_dma->channel_mask);
4715 net_dma_rebalance(net_dma);
4717 break;
4718 case DMA_RESOURCE_REMOVED:
4719 for (i = 0; i < nr_cpu_ids; i++)
4720 if (net_dma->channels[i] == chan) {
4721 found = 1;
4722 pos = i;
4723 break;
4726 if (found) {
4727 ack = DMA_ACK;
4728 cpu_clear(pos, net_dma->channel_mask);
4729 net_dma->channels[i] = NULL;
4730 net_dma_rebalance(net_dma);
4732 break;
4733 default:
4734 break;
4736 spin_unlock(&net_dma->lock);
4738 return ack;
4742 * netdev_dma_register - register the networking subsystem as a DMA client
4744 static int __init netdev_dma_register(void)
4746 net_dma.channels = kzalloc(nr_cpu_ids * sizeof(struct net_dma),
4747 GFP_KERNEL);
4748 if (unlikely(!net_dma.channels)) {
4749 printk(KERN_NOTICE
4750 "netdev_dma: no memory for net_dma.channels\n");
4751 return -ENOMEM;
4753 spin_lock_init(&net_dma.lock);
4754 dma_cap_set(DMA_MEMCPY, net_dma.client.cap_mask);
4755 dma_async_client_register(&net_dma.client);
4756 dma_async_client_chan_request(&net_dma.client);
4757 return 0;
4760 #else
4761 static int __init netdev_dma_register(void) { return -ENODEV; }
4762 #endif /* CONFIG_NET_DMA */
4765 * netdev_increment_features - increment feature set by one
4766 * @all: current feature set
4767 * @one: new feature set
4768 * @mask: mask feature set
4770 * Computes a new feature set after adding a device with feature set
4771 * @one to the master device with current feature set @all. Will not
4772 * enable anything that is off in @mask. Returns the new feature set.
4774 unsigned long netdev_increment_features(unsigned long all, unsigned long one,
4775 unsigned long mask)
4777 /* If device needs checksumming, downgrade to it. */
4778 if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
4779 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
4780 else if (mask & NETIF_F_ALL_CSUM) {
4781 /* If one device supports v4/v6 checksumming, set for all. */
4782 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
4783 !(all & NETIF_F_GEN_CSUM)) {
4784 all &= ~NETIF_F_ALL_CSUM;
4785 all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
4788 /* If one device supports hw checksumming, set for all. */
4789 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
4790 all &= ~NETIF_F_ALL_CSUM;
4791 all |= NETIF_F_HW_CSUM;
4795 one |= NETIF_F_ALL_CSUM;
4797 one |= all & NETIF_F_ONE_FOR_ALL;
4798 all &= one | NETIF_F_LLTX | NETIF_F_GSO;
4799 all |= one & mask & NETIF_F_ONE_FOR_ALL;
4801 return all;
4803 EXPORT_SYMBOL(netdev_increment_features);
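/*
 * Editorial example (not part of dev.c): a hedged sketch of how a master
 * driver (bonding-style) could recompute its feature set with the helper
 * above, starting from the NETIF_F_ONE_FOR_ALL baseline and folding in
 * each slave.  foo_compute_features() and the slave array are hypothetical,
 * and the mask shown is only illustrative.
 */
static unsigned long foo_compute_features(struct net_device *master,
					  struct net_device **slave, int nslaves)
{
	unsigned long features = master->features & ~NETIF_F_ONE_FOR_ALL;
	int i;

	for (i = 0; i < nslaves; i++)
		features = netdev_increment_features(features,
						     slave[i]->features,
						     NETIF_F_ALL_CSUM | NETIF_F_SG);
	return features;
}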
4805 static struct hlist_head *netdev_create_hash(void)
4807 int i;
4808 struct hlist_head *hash;
4810 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
4811 if (hash != NULL)
4812 for (i = 0; i < NETDEV_HASHENTRIES; i++)
4813 INIT_HLIST_HEAD(&hash[i]);
4815 return hash;
4818 /* Initialize per network namespace state */
4819 static int __net_init netdev_init(struct net *net)
4821 INIT_LIST_HEAD(&net->dev_base_head);
4823 net->dev_name_head = netdev_create_hash();
4824 if (net->dev_name_head == NULL)
4825 goto err_name;
4827 net->dev_index_head = netdev_create_hash();
4828 if (net->dev_index_head == NULL)
4829 goto err_idx;
4831 return 0;
4833 err_idx:
4834 kfree(net->dev_name_head);
4835 err_name:
4836 return -ENOMEM;
4840 * netdev_drivername - network driver for the device
4841 * @dev: network device
4842 * @buffer: buffer for resulting name
4843 * @len: size of buffer
4845 * Determine network driver for device.
4847 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
4849 const struct device_driver *driver;
4850 const struct device *parent;
4852 if (len <= 0 || !buffer)
4853 return buffer;
4854 buffer[0] = 0;
4856 parent = dev->dev.parent;
4858 if (!parent)
4859 return buffer;
4861 driver = parent->driver;
4862 if (driver && driver->name)
4863 strlcpy(buffer, driver->name, len);
4864 return buffer;
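/*
 * Editorial example (not part of dev.c): a hedged sketch of the typical
 * caller of netdev_drivername(), which passes a small on-stack buffer and
 * prints whatever name could be determined (possibly the empty string).
 * foo_report() is hypothetical.
 */
static void foo_report(struct net_device *dev)
{
	char drivername[64];

	printk(KERN_INFO "%s: driver %s\n", dev->name,
	       netdev_drivername(dev, drivername, sizeof(drivername)));
}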
4867 static void __net_exit netdev_exit(struct net *net)
4869 kfree(net->dev_name_head);
4870 kfree(net->dev_index_head);
4873 static struct pernet_operations __net_initdata netdev_net_ops = {
4874 .init = netdev_init,
4875 .exit = netdev_exit,
4878 static void __net_exit default_device_exit(struct net *net)
4880 struct net_device *dev, *next;
4882 * Push all migratable network devices back to the
4883 * initial network namespace
4885 rtnl_lock();
4886 for_each_netdev_safe(net, dev, next) {
4887 int err;
4888 char fb_name[IFNAMSIZ];
4890 /* Ignore unmovable devices (e.g. loopback) */
4891 if (dev->features & NETIF_F_NETNS_LOCAL)
4892 continue;
4894 /* Delete virtual devices */
4895 if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) {
4896 dev->rtnl_link_ops->dellink(dev);
4897 continue;
4900 /* Push remaining network devices to init_net */
4901 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
4902 err = dev_change_net_namespace(dev, &init_net, fb_name);
4903 if (err) {
4904 printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
4905 __func__, dev->name, err);
4906 BUG();
4909 rtnl_unlock();
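/*
 * Editorial example (not part of dev.c): how a device opts out of the
 * migration done above.  Setting NETIF_F_NETNS_LOCAL in the setup
 * callback (as loopback does) makes default_device_exit() skip the
 * device.  foo_setup() is hypothetical; ether_setup() is assumed.
 */
static void foo_setup(struct net_device *dev)
{
	ether_setup(dev);
	dev->features |= NETIF_F_NETNS_LOCAL;	/* never leaves its namespace */
}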
4912 static struct pernet_operations __net_initdata default_device_ops = {
4913 .exit = default_device_exit,
4917 * Initialize the DEV module. At boot time this walks the device list and
4918 * unhooks any devices that fail to initialise (normally hardware not
4919 * present) and leaves us with a valid list of present and active devices.
4924 * This is called single threaded during boot, so no need
4925 * to take the rtnl semaphore.
4927 static int __init net_dev_init(void)
4929 int i, rc = -ENOMEM;
4931 BUG_ON(!dev_boot_phase);
4933 if (dev_proc_init())
4934 goto out;
4936 if (netdev_kobject_init())
4937 goto out;
4939 INIT_LIST_HEAD(&ptype_all);
4940 for (i = 0; i < PTYPE_HASH_SIZE; i++)
4941 INIT_LIST_HEAD(&ptype_base[i]);
4943 if (register_pernet_subsys(&netdev_net_ops))
4944 goto out;
4947 * Initialise the packet receive queues.
4950 for_each_possible_cpu(i) {
4951 struct softnet_data *queue;
4953 queue = &per_cpu(softnet_data, i);
4954 skb_queue_head_init(&queue->input_pkt_queue);
4955 queue->completion_queue = NULL;
4956 INIT_LIST_HEAD(&queue->poll_list);
4958 queue->backlog.poll = process_backlog;
4959 queue->backlog.weight = weight_p;
4962 dev_boot_phase = 0;
4964 /* The loopback device is special: if any other network device
4965 * is present in a network namespace, the loopback device must
4966 * be present too. Since we now dynamically allocate and free the
4967 * loopback device, ensure this invariant is maintained by
4968 * keeping the loopback device as the first device on the
4969 * list of network devices, so that it is the first device
4970 * that appears and the last network device
4971 * that disappears.
4973 if (register_pernet_device(&loopback_net_ops))
4974 goto out;
4976 if (register_pernet_device(&default_device_ops))
4977 goto out;
4979 netdev_dma_register();
4981 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
4982 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
4984 hotcpu_notifier(dev_cpu_callback, 0);
4985 dst_init();
4986 dev_mcast_init();
4987 rc = 0;
4988 out:
4989 return rc;
4992 subsys_initcall(net_dev_init);
4994 EXPORT_SYMBOL(__dev_get_by_index);
4995 EXPORT_SYMBOL(__dev_get_by_name);
4996 EXPORT_SYMBOL(__dev_remove_pack);
4997 EXPORT_SYMBOL(dev_valid_name);
4998 EXPORT_SYMBOL(dev_add_pack);
4999 EXPORT_SYMBOL(dev_alloc_name);
5000 EXPORT_SYMBOL(dev_close);
5001 EXPORT_SYMBOL(dev_get_by_flags);
5002 EXPORT_SYMBOL(dev_get_by_index);
5003 EXPORT_SYMBOL(dev_get_by_name);
5004 EXPORT_SYMBOL(dev_open);
5005 EXPORT_SYMBOL(dev_queue_xmit);
5006 EXPORT_SYMBOL(dev_remove_pack);
5007 EXPORT_SYMBOL(dev_set_allmulti);
5008 EXPORT_SYMBOL(dev_set_promiscuity);
5009 EXPORT_SYMBOL(dev_change_flags);
5010 EXPORT_SYMBOL(dev_set_mtu);
5011 EXPORT_SYMBOL(dev_set_mac_address);
5012 EXPORT_SYMBOL(free_netdev);
5013 EXPORT_SYMBOL(netdev_boot_setup_check);
5014 EXPORT_SYMBOL(netdev_set_master);
5015 EXPORT_SYMBOL(netdev_state_change);
5016 EXPORT_SYMBOL(netif_receive_skb);
5017 EXPORT_SYMBOL(netif_rx);
5018 EXPORT_SYMBOL(register_gifconf);
5019 EXPORT_SYMBOL(register_netdevice);
5020 EXPORT_SYMBOL(register_netdevice_notifier);
5021 EXPORT_SYMBOL(skb_checksum_help);
5022 EXPORT_SYMBOL(synchronize_net);
5023 EXPORT_SYMBOL(unregister_netdevice);
5024 EXPORT_SYMBOL(unregister_netdevice_notifier);
5025 EXPORT_SYMBOL(net_enable_timestamp);
5026 EXPORT_SYMBOL(net_disable_timestamp);
5027 EXPORT_SYMBOL(dev_get_flags);
5029 #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
5030 EXPORT_SYMBOL(br_handle_frame_hook);
5031 EXPORT_SYMBOL(br_fdb_get_hook);
5032 EXPORT_SYMBOL(br_fdb_put_hook);
5033 #endif
5035 EXPORT_SYMBOL(dev_load);
5037 EXPORT_PER_CPU_SYMBOL(softnet_data);