net: Fix an RCU warning in dev_pick_tx()
[linux-2.6/kvm.git] / net / core / dev.c
blobf769098774b7e5e3b95f9aeb5b89d87412dda0e9
1 /*
2 * NET3 Protocol independent device support routines.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Derived from the non IP parts of dev.c 1.0.19
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
14 * Additional Authors:
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
22 * Changes:
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
34 * drivers
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
44 * call a packet.
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
50 * changes.
51 * Rudi Cilibrasi : Pass the right thing to
52 * set_mac_address()
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
58 * 1 device.
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
66 * the backlog queue.
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
75 #include <asm/uaccess.h>
76 #include <asm/system.h>
77 #include <linux/bitops.h>
78 #include <linux/capability.h>
79 #include <linux/cpu.h>
80 #include <linux/types.h>
81 #include <linux/kernel.h>
82 #include <linux/hash.h>
83 #include <linux/slab.h>
84 #include <linux/sched.h>
85 #include <linux/mutex.h>
86 #include <linux/string.h>
87 #include <linux/mm.h>
88 #include <linux/socket.h>
89 #include <linux/sockios.h>
90 #include <linux/errno.h>
91 #include <linux/interrupt.h>
92 #include <linux/if_ether.h>
93 #include <linux/netdevice.h>
94 #include <linux/etherdevice.h>
95 #include <linux/ethtool.h>
96 #include <linux/notifier.h>
97 #include <linux/skbuff.h>
98 #include <net/net_namespace.h>
99 #include <net/sock.h>
100 #include <linux/rtnetlink.h>
101 #include <linux/proc_fs.h>
102 #include <linux/seq_file.h>
103 #include <linux/stat.h>
104 #include <linux/if_bridge.h>
105 #include <linux/if_macvlan.h>
106 #include <net/dst.h>
107 #include <net/pkt_sched.h>
108 #include <net/checksum.h>
109 #include <net/xfrm.h>
110 #include <linux/highmem.h>
111 #include <linux/init.h>
112 #include <linux/kmod.h>
113 #include <linux/module.h>
114 #include <linux/netpoll.h>
115 #include <linux/rcupdate.h>
116 #include <linux/delay.h>
117 #include <net/wext.h>
118 #include <net/iw_handler.h>
119 #include <asm/current.h>
120 #include <linux/audit.h>
121 #include <linux/dmaengine.h>
122 #include <linux/err.h>
123 #include <linux/ctype.h>
124 #include <linux/if_arp.h>
125 #include <linux/if_vlan.h>
126 #include <linux/ip.h>
127 #include <net/ip.h>
128 #include <linux/ipv6.h>
129 #include <linux/in.h>
130 #include <linux/jhash.h>
131 #include <linux/random.h>
132 #include <trace/events/napi.h>
134 #include "net-sysfs.h"
136 /* Instead of increasing this, you should create a hash table. */
137 #define MAX_GRO_SKBS 8
139 /* This should be increased if a protocol with a bigger head is added. */
140 #define GRO_MAX_HEAD (MAX_HEADER + 128)
143 * The list of packet types we will receive (as opposed to discard)
144 * and the routines to invoke.
146 * Why 16. Because with 16 the only overlap we get on a hash of the
147 * low nibble of the protocol value is RARP/SNAP/X.25.
149 * NOTE: That is no longer true with the addition of VLAN tags. Not
150 * sure which should go first, but I bet it won't make much
151 * difference if we are running VLANs. The good news is that
152 * this protocol won't be in the list unless compiled in, so
153 * the average user (w/out VLANs) will not be adversely affected.
154 * --BLG
156 * 0800 IP
157 * 8100 802.1Q VLAN
158 * 0001 802.3
159 * 0002 AX.25
160 * 0004 802.2
161 * 8035 RARP
162 * 0005 SNAP
163 * 0805 X.25
164 * 0806 ARP
165 * 8137 IPX
166 * 0009 Localtalk
167 * 86DD IPv6
170 #define PTYPE_HASH_SIZE (16)
171 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
173 static DEFINE_SPINLOCK(ptype_lock);
174 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
175 static struct list_head ptype_all __read_mostly; /* Taps */
178 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
179 * semaphore.
181 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
183 * Writers must hold the rtnl semaphore while they loop through the
184 * dev_base_head list, and hold dev_base_lock for writing when they do the
185 * actual updates. This allows pure readers to access the list even
186 * while a writer is preparing to update it.
188 * To put it another way, dev_base_lock is held for writing only to
189 * protect against pure readers; the rtnl semaphore provides the
190 * protection against other writers.
192 * For example usages, see register_netdevice() and
193 * unregister_netdevice(), both of which must be called with the rtnl
194 * semaphore held.
196 DEFINE_RWLOCK(dev_base_lock);
197 EXPORT_SYMBOL(dev_base_lock);
199 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
201 unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
202 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
205 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
207 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
210 /* Device list insertion */
211 static int list_netdevice(struct net_device *dev)
213 struct net *net = dev_net(dev);
215 ASSERT_RTNL();
217 write_lock_bh(&dev_base_lock);
218 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
219 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
220 hlist_add_head_rcu(&dev->index_hlist,
221 dev_index_hash(net, dev->ifindex));
222 write_unlock_bh(&dev_base_lock);
223 return 0;
226 /* Device list removal
227 * caller must respect a RCU grace period before freeing/reusing dev
229 static void unlist_netdevice(struct net_device *dev)
231 ASSERT_RTNL();
233 /* Unlink dev from the device chain */
234 write_lock_bh(&dev_base_lock);
235 list_del_rcu(&dev->dev_list);
236 hlist_del_rcu(&dev->name_hlist);
237 hlist_del_rcu(&dev->index_hlist);
238 write_unlock_bh(&dev_base_lock);
242 * Our notifier list
245 static RAW_NOTIFIER_HEAD(netdev_chain);
248 * Device drivers call our routines to queue packets here. We empty the
249 * queue in the local softnet handler.
252 DEFINE_PER_CPU(struct softnet_data, softnet_data);
253 EXPORT_PER_CPU_SYMBOL(softnet_data);
255 #ifdef CONFIG_LOCKDEP
257 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
258 * according to dev->type
260 static const unsigned short netdev_lock_type[] =
261 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
262 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
263 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
264 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
265 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
266 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
267 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
268 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
269 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
270 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
271 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
272 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
273 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
274 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
275 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
276 ARPHRD_VOID, ARPHRD_NONE};
278 static const char *const netdev_lock_name[] =
279 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
280 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
281 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
282 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
283 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
284 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
285 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
286 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
287 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
288 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
289 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
290 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
291 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
292 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
293 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
294 "_xmit_VOID", "_xmit_NONE"};
296 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
297 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
299 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
301 int i;
303 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
304 if (netdev_lock_type[i] == dev_type)
305 return i;
306 /* the last key is used by default */
307 return ARRAY_SIZE(netdev_lock_type) - 1;
310 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
311 unsigned short dev_type)
313 int i;
315 i = netdev_lock_pos(dev_type);
316 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
317 netdev_lock_name[i]);
320 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
322 int i;
324 i = netdev_lock_pos(dev->type);
325 lockdep_set_class_and_name(&dev->addr_list_lock,
326 &netdev_addr_lock_key[i],
327 netdev_lock_name[i]);
329 #else
330 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
331 unsigned short dev_type)
/* Without CONFIG_LOCKDEP there is no lock class to set: empty stub. */
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
337 #endif
339 /*******************************************************************************
341 Protocol management and registration routines
343 *******************************************************************************/
346 * Add a protocol ID to the list. Now that the input handler is
347 * smarter we can dispense with all the messy stuff that used to be
348 * here.
350 * BEWARE!!! Protocol handlers, mangling input packets,
351 * MUST BE last in hash buckets and checking protocol handlers
352 * MUST start from promiscuous ptype_all chain in net_bh.
353 * It is true now, do not change it.
354 * Explanation follows: if protocol handler, mangling packet, will
355 * be the first on list, it is not able to sense, that packet
356 * is cloned and should be copied-on-write, so that it will
357 * change it and subsequent readers will get broken packet.
358 * --ANK (980803)
362 * dev_add_pack - add packet handler
363 * @pt: packet type declaration
365 * Add a protocol handler to the networking stack. The passed &packet_type
366 * is linked into kernel lists and may not be freed until it has been
367 * removed from the kernel lists.
369 * This call does not sleep therefore it can not
370 * guarantee all CPU's that are in middle of receiving packets
371 * will see the new packet type (until the next received packet).
374 void dev_add_pack(struct packet_type *pt)
376 int hash;
378 spin_lock_bh(&ptype_lock);
379 if (pt->type == htons(ETH_P_ALL))
380 list_add_rcu(&pt->list, &ptype_all);
381 else {
382 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
383 list_add_rcu(&pt->list, &ptype_base[hash]);
385 spin_unlock_bh(&ptype_lock);
387 EXPORT_SYMBOL(dev_add_pack);
390 * __dev_remove_pack - remove packet handler
391 * @pt: packet type declaration
393 * Remove a protocol handler that was previously added to the kernel
394 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
395 * from the kernel lists and can be freed or reused once this function
396 * returns.
398 * The packet type might still be in use by receivers
399 * and must not be freed until after all the CPU's have gone
400 * through a quiescent state.
402 void __dev_remove_pack(struct packet_type *pt)
404 struct list_head *head;
405 struct packet_type *pt1;
407 spin_lock_bh(&ptype_lock);
409 if (pt->type == htons(ETH_P_ALL))
410 head = &ptype_all;
411 else
412 head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
414 list_for_each_entry(pt1, head, list) {
415 if (pt == pt1) {
416 list_del_rcu(&pt->list);
417 goto out;
421 printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
422 out:
423 spin_unlock_bh(&ptype_lock);
425 EXPORT_SYMBOL(__dev_remove_pack);
/**
 *	dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Unlink a protocol handler that was previously added by
 *	dev_add_pack(), then call synchronize_net() before returning.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return, so @pt can be freed or reused once this
 *	function returns.
 */
void dev_remove_pack(struct packet_type *pt)
{
	/* Unlink first ... */
	__dev_remove_pack(pt);

	/* ... then wait for receivers that may still hold a reference. */
	synchronize_net();
}
445 EXPORT_SYMBOL(dev_remove_pack);
447 /******************************************************************************
449 Device Boot-time Settings Routines
451 *******************************************************************************/
453 /* Boot time configuration table */
454 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
457 * netdev_boot_setup_add - add new setup entry
458 * @name: name of the device
459 * @map: configured settings for the device
461 * Adds new setup entry to the dev_boot_setup list. The function
462 * returns 0 on error and 1 on success. This is a generic routine to
463 * all netdevices.
465 static int netdev_boot_setup_add(char *name, struct ifmap *map)
467 struct netdev_boot_setup *s;
468 int i;
470 s = dev_boot_setup;
471 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
472 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
473 memset(s[i].name, 0, sizeof(s[i].name));
474 strlcpy(s[i].name, name, IFNAMSIZ);
475 memcpy(&s[i].map, map, sizeof(s[i].map));
476 break;
480 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
484 * netdev_boot_setup_check - check boot time settings
485 * @dev: the netdevice
487 * Check boot time settings for the device.
488 * The found settings are set for the device to be used
489 * later in the device probing.
490 * Returns 0 if no settings found, 1 if they are.
492 int netdev_boot_setup_check(struct net_device *dev)
494 struct netdev_boot_setup *s = dev_boot_setup;
495 int i;
497 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
498 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
499 !strcmp(dev->name, s[i].name)) {
500 dev->irq = s[i].map.irq;
501 dev->base_addr = s[i].map.base_addr;
502 dev->mem_start = s[i].map.mem_start;
503 dev->mem_end = s[i].map.mem_end;
504 return 1;
507 return 0;
509 EXPORT_SYMBOL(netdev_boot_setup_check);
513 * netdev_boot_base - get address from boot time settings
514 * @prefix: prefix for network device
515 * @unit: id for network device
517 * Check boot time settings for the base address of device.
518 * The found settings are set for the device to be used
519 * later in the device probing.
520 * Returns 0 if no settings found.
522 unsigned long netdev_boot_base(const char *prefix, int unit)
524 const struct netdev_boot_setup *s = dev_boot_setup;
525 char name[IFNAMSIZ];
526 int i;
528 sprintf(name, "%s%d", prefix, unit);
531 * If device already registered then return base of 1
532 * to indicate not to probe for this interface
534 if (__dev_get_by_name(&init_net, name))
535 return 1;
537 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
538 if (!strcmp(name, s[i].name))
539 return s[i].map.base_addr;
540 return 0;
544 * Saves at boot time configured settings for any netdevice.
546 int __init netdev_boot_setup(char *str)
548 int ints[5];
549 struct ifmap map;
551 str = get_options(str, ARRAY_SIZE(ints), ints);
552 if (!str || !*str)
553 return 0;
555 /* Save settings */
556 memset(&map, 0, sizeof(map));
557 if (ints[0] > 0)
558 map.irq = ints[1];
559 if (ints[0] > 1)
560 map.base_addr = ints[2];
561 if (ints[0] > 2)
562 map.mem_start = ints[3];
563 if (ints[0] > 3)
564 map.mem_end = ints[4];
566 /* Add new entry to the list */
567 return netdev_boot_setup_add(str, &map);
570 __setup("netdev=", netdev_boot_setup);
572 /*******************************************************************************
574 Device Interface Subroutines
576 *******************************************************************************/
579 * __dev_get_by_name - find a device by its name
580 * @net: the applicable net namespace
581 * @name: name to find
583 * Find an interface by name. Must be called under RTNL semaphore
584 * or @dev_base_lock. If the name is found a pointer to the device
585 * is returned. If the name is not found then %NULL is returned. The
586 * reference counters are not incremented so the caller must be
587 * careful with locks.
590 struct net_device *__dev_get_by_name(struct net *net, const char *name)
592 struct hlist_node *p;
593 struct net_device *dev;
594 struct hlist_head *head = dev_name_hash(net, name);
596 hlist_for_each_entry(dev, p, head, name_hlist)
597 if (!strncmp(dev->name, name, IFNAMSIZ))
598 return dev;
600 return NULL;
602 EXPORT_SYMBOL(__dev_get_by_name);
605 * dev_get_by_name_rcu - find a device by its name
606 * @net: the applicable net namespace
607 * @name: name to find
609 * Find an interface by name.
610 * If the name is found a pointer to the device is returned.
611 * If the name is not found then %NULL is returned.
612 * The reference counters are not incremented so the caller must be
613 * careful with locks. The caller must hold RCU lock.
616 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
618 struct hlist_node *p;
619 struct net_device *dev;
620 struct hlist_head *head = dev_name_hash(net, name);
622 hlist_for_each_entry_rcu(dev, p, head, name_hlist)
623 if (!strncmp(dev->name, name, IFNAMSIZ))
624 return dev;
626 return NULL;
628 EXPORT_SYMBOL(dev_get_by_name_rcu);
631 * dev_get_by_name - find a device by its name
632 * @net: the applicable net namespace
633 * @name: name to find
635 * Find an interface by name. This can be called from any
636 * context and does its own locking. The returned handle has
637 * the usage count incremented and the caller must use dev_put() to
638 * release it when it is no longer needed. %NULL is returned if no
639 * matching device is found.
642 struct net_device *dev_get_by_name(struct net *net, const char *name)
644 struct net_device *dev;
646 rcu_read_lock();
647 dev = dev_get_by_name_rcu(net, name);
648 if (dev)
649 dev_hold(dev);
650 rcu_read_unlock();
651 return dev;
653 EXPORT_SYMBOL(dev_get_by_name);
656 * __dev_get_by_index - find a device by its ifindex
657 * @net: the applicable net namespace
658 * @ifindex: index of device
660 * Search for an interface by index. Returns %NULL if the device
661 * is not found or a pointer to the device. The device has not
662 * had its reference counter increased so the caller must be careful
663 * about locking. The caller must hold either the RTNL semaphore
664 * or @dev_base_lock.
667 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
669 struct hlist_node *p;
670 struct net_device *dev;
671 struct hlist_head *head = dev_index_hash(net, ifindex);
673 hlist_for_each_entry(dev, p, head, index_hlist)
674 if (dev->ifindex == ifindex)
675 return dev;
677 return NULL;
679 EXPORT_SYMBOL(__dev_get_by_index);
682 * dev_get_by_index_rcu - find a device by its ifindex
683 * @net: the applicable net namespace
684 * @ifindex: index of device
686 * Search for an interface by index. Returns %NULL if the device
687 * is not found or a pointer to the device. The device has not
688 * had its reference counter increased so the caller must be careful
689 * about locking. The caller must hold RCU lock.
692 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
694 struct hlist_node *p;
695 struct net_device *dev;
696 struct hlist_head *head = dev_index_hash(net, ifindex);
698 hlist_for_each_entry_rcu(dev, p, head, index_hlist)
699 if (dev->ifindex == ifindex)
700 return dev;
702 return NULL;
704 EXPORT_SYMBOL(dev_get_by_index_rcu);
708 * dev_get_by_index - find a device by its ifindex
709 * @net: the applicable net namespace
710 * @ifindex: index of device
712 * Search for an interface by index. Returns NULL if the device
713 * is not found or a pointer to the device. The device returned has
714 * had a reference added and the pointer is safe until the user calls
715 * dev_put to indicate they have finished with it.
718 struct net_device *dev_get_by_index(struct net *net, int ifindex)
720 struct net_device *dev;
722 rcu_read_lock();
723 dev = dev_get_by_index_rcu(net, ifindex);
724 if (dev)
725 dev_hold(dev);
726 rcu_read_unlock();
727 return dev;
729 EXPORT_SYMBOL(dev_get_by_index);
732 * dev_getbyhwaddr - find a device by its hardware address
733 * @net: the applicable net namespace
734 * @type: media type of device
735 * @ha: hardware address
737 * Search for an interface by MAC address. Returns NULL if the device
738 * is not found or a pointer to the device. The caller must hold the
739 * rtnl semaphore. The returned device has not had its ref count increased
740 * and the caller must therefore be careful about locking
742 * BUGS:
743 * If the API was consistent this would be __dev_get_by_hwaddr
746 struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
748 struct net_device *dev;
750 ASSERT_RTNL();
752 for_each_netdev(net, dev)
753 if (dev->type == type &&
754 !memcmp(dev->dev_addr, ha, dev->addr_len))
755 return dev;
757 return NULL;
759 EXPORT_SYMBOL(dev_getbyhwaddr);
761 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
763 struct net_device *dev;
765 ASSERT_RTNL();
766 for_each_netdev(net, dev)
767 if (dev->type == type)
768 return dev;
770 return NULL;
772 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
774 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
776 struct net_device *dev;
778 rtnl_lock();
779 dev = __dev_getfirstbyhwtype(net, type);
780 if (dev)
781 dev_hold(dev);
782 rtnl_unlock();
783 return dev;
785 EXPORT_SYMBOL(dev_getfirstbyhwtype);
788 * dev_get_by_flags - find any device with given flags
789 * @net: the applicable net namespace
790 * @if_flags: IFF_* values
791 * @mask: bitmask of bits in if_flags to check
793 * Search for any interface with the given flags. Returns NULL if a device
794 * is not found or a pointer to the device. The device returned has
795 * had a reference added and the pointer is safe until the user calls
796 * dev_put to indicate they have finished with it.
799 struct net_device *dev_get_by_flags(struct net *net, unsigned short if_flags,
800 unsigned short mask)
802 struct net_device *dev, *ret;
804 ret = NULL;
805 rcu_read_lock();
806 for_each_netdev_rcu(net, dev) {
807 if (((dev->flags ^ if_flags) & mask) == 0) {
808 dev_hold(dev);
809 ret = dev;
810 break;
813 rcu_read_unlock();
814 return ret;
816 EXPORT_SYMBOL(dev_get_by_flags);
/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names to allow sysfs
 *	to work.  A name is rejected when it is empty, is IFNAMSIZ or more
 *	characters long, is "." or "..", or contains a '/' or any kind of
 *	whitespace.  Returns 1 for an acceptable name and 0 otherwise.
 */
int dev_valid_name(const char *name)
{
	const char *cur;

	if (name[0] == '\0' || strlen(name) >= IFNAMSIZ)
		return 0;

	if (strcmp(name, ".") == 0)
		return 0;
	if (strcmp(name, "..") == 0)
		return 0;

	for (cur = name; *cur != '\0'; cur++) {
		if (*cur == '/')
			return 0;
		if (isspace(*cur))
			return 0;
	}

	return 1;
}
842 EXPORT_SYMBOL(dev_valid_name);
845 * __dev_alloc_name - allocate a name for a device
846 * @net: network namespace to allocate the device name in
847 * @name: name format string
848 * @buf: scratch buffer and result name string
850 * Passed a format string - eg "lt%d" it will try and find a suitable
851 * id. It scans list of devices to build up a free map, then chooses
852 * the first empty slot. The caller must hold the dev_base or rtnl lock
853 * while allocating the name and adding the device in order to avoid
854 * duplicates.
855 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
856 * Returns the number of the unit assigned or a negative errno code.
859 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
861 int i = 0;
862 const char *p;
863 const int max_netdevices = 8*PAGE_SIZE;
864 unsigned long *inuse;
865 struct net_device *d;
867 p = strnchr(name, IFNAMSIZ-1, '%');
868 if (p) {
870 * Verify the string as this thing may have come from
871 * the user. There must be either one "%d" and no other "%"
872 * characters.
874 if (p[1] != 'd' || strchr(p + 2, '%'))
875 return -EINVAL;
877 /* Use one page as a bit array of possible slots */
878 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
879 if (!inuse)
880 return -ENOMEM;
882 for_each_netdev(net, d) {
883 if (!sscanf(d->name, name, &i))
884 continue;
885 if (i < 0 || i >= max_netdevices)
886 continue;
888 /* avoid cases where sscanf is not exact inverse of printf */
889 snprintf(buf, IFNAMSIZ, name, i);
890 if (!strncmp(buf, d->name, IFNAMSIZ))
891 set_bit(i, inuse);
894 i = find_first_zero_bit(inuse, max_netdevices);
895 free_page((unsigned long) inuse);
898 if (buf != name)
899 snprintf(buf, IFNAMSIZ, name, i);
900 if (!__dev_get_by_name(net, buf))
901 return i;
903 /* It is possible to run out of possible slots
904 * when the name is long and there isn't enough space left
905 * for the digits, or if all bits are used.
907 return -ENFILE;
911 * dev_alloc_name - allocate a name for a device
912 * @dev: device
913 * @name: name format string
915 * Passed a format string - eg "lt%d" it will try and find a suitable
916 * id. It scans list of devices to build up a free map, then chooses
917 * the first empty slot. The caller must hold the dev_base or rtnl lock
918 * while allocating the name and adding the device in order to avoid
919 * duplicates.
920 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
921 * Returns the number of the unit assigned or a negative errno code.
924 int dev_alloc_name(struct net_device *dev, const char *name)
926 char buf[IFNAMSIZ];
927 struct net *net;
928 int ret;
930 BUG_ON(!dev_net(dev));
931 net = dev_net(dev);
932 ret = __dev_alloc_name(net, name, buf);
933 if (ret >= 0)
934 strlcpy(dev->name, buf, IFNAMSIZ);
935 return ret;
937 EXPORT_SYMBOL(dev_alloc_name);
939 static int dev_get_valid_name(struct net *net, const char *name, char *buf,
940 bool fmt)
942 if (!dev_valid_name(name))
943 return -EINVAL;
945 if (fmt && strchr(name, '%'))
946 return __dev_alloc_name(net, name, buf);
947 else if (__dev_get_by_name(net, name))
948 return -EEXIST;
949 else if (buf != name)
950 strlcpy(buf, name, IFNAMSIZ);
952 return 0;
956 * dev_change_name - change name of a device
957 * @dev: device
958 * @newname: name (or format string) must be at least IFNAMSIZ
960 * Change name of a device, can pass format strings "eth%d".
961 * for wildcarding.
963 int dev_change_name(struct net_device *dev, const char *newname)
965 char oldname[IFNAMSIZ];
966 int err = 0;
967 int ret;
968 struct net *net;
970 ASSERT_RTNL();
971 BUG_ON(!dev_net(dev));
973 net = dev_net(dev);
974 if (dev->flags & IFF_UP)
975 return -EBUSY;
977 if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
978 return 0;
980 memcpy(oldname, dev->name, IFNAMSIZ);
982 err = dev_get_valid_name(net, newname, dev->name, 1);
983 if (err < 0)
984 return err;
986 rollback:
987 /* For now only devices in the initial network namespace
988 * are in sysfs.
990 if (net_eq(net, &init_net)) {
991 ret = device_rename(&dev->dev, dev->name);
992 if (ret) {
993 memcpy(dev->name, oldname, IFNAMSIZ);
994 return ret;
998 write_lock_bh(&dev_base_lock);
999 hlist_del(&dev->name_hlist);
1000 write_unlock_bh(&dev_base_lock);
1002 synchronize_rcu();
1004 write_lock_bh(&dev_base_lock);
1005 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1006 write_unlock_bh(&dev_base_lock);
1008 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1009 ret = notifier_to_errno(ret);
1011 if (ret) {
1012 /* err >= 0 after dev_alloc_name() or stores the first errno */
1013 if (err >= 0) {
1014 err = ret;
1015 memcpy(dev->name, oldname, IFNAMSIZ);
1016 goto rollback;
1017 } else {
1018 printk(KERN_ERR
1019 "%s: name change rollback failed: %d.\n",
1020 dev->name, ret);
1024 return err;
1028 * dev_set_alias - change ifalias of a device
1029 * @dev: device
1030 * @alias: name up to IFALIASZ
1031 * @len: limit of bytes to copy from info
1033 * Set ifalias for a device,
1035 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1037 ASSERT_RTNL();
1039 if (len >= IFALIASZ)
1040 return -EINVAL;
1042 if (!len) {
1043 if (dev->ifalias) {
1044 kfree(dev->ifalias);
1045 dev->ifalias = NULL;
1047 return 0;
1050 dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1051 if (!dev->ifalias)
1052 return -ENOMEM;
1054 strlcpy(dev->ifalias, alias, len+1);
1055 return len;
1060 * netdev_features_change - device changes features
1061 * @dev: device to cause notification
1063 * Called to indicate a device has changed features.
1065 void netdev_features_change(struct net_device *dev)
1067 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1069 EXPORT_SYMBOL(netdev_features_change);
1072 * netdev_state_change - device changes state
1073 * @dev: device to cause notification
1075 * Called to indicate a device has changed state. This function calls
1076 * the notifier chains for netdev_chain and sends a NEWLINK message
1077 * to the routing socket.
1079 void netdev_state_change(struct net_device *dev)
1081 if (dev->flags & IFF_UP) {
1082 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1083 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1086 EXPORT_SYMBOL(netdev_state_change);
/*
 * Forward @event for @dev to the netdevice notifiers.  The notifier
 * return value is not propagated.
 */
void netdev_bonding_change(struct net_device *dev, unsigned long event)
{
	call_netdevice_notifiers(event, dev);
}
1092 EXPORT_SYMBOL(netdev_bonding_change);
1095 * dev_load - load a network module
1096 * @net: the applicable net namespace
1097 * @name: name of interface
1099 * If a network interface is not present and the process has suitable
1100 * privileges this function loads the module. If module loading is not
1101 * available in this kernel then it becomes a nop.
1104 void dev_load(struct net *net, const char *name)
1106 struct net_device *dev;
1108 rcu_read_lock();
1109 dev = dev_get_by_name_rcu(net, name);
1110 rcu_read_unlock();
1112 if (!dev && capable(CAP_NET_ADMIN))
1113 request_module("%s", name);
1115 EXPORT_SYMBOL(dev_load);
1117 static int __dev_open(struct net_device *dev)
1119 const struct net_device_ops *ops = dev->netdev_ops;
1120 int ret;
1122 ASSERT_RTNL();
1125 * Is it even present?
1127 if (!netif_device_present(dev))
1128 return -ENODEV;
1130 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1131 ret = notifier_to_errno(ret);
1132 if (ret)
1133 return ret;
1136 * Call device private open method
1138 set_bit(__LINK_STATE_START, &dev->state);
1140 if (ops->ndo_validate_addr)
1141 ret = ops->ndo_validate_addr(dev);
1143 if (!ret && ops->ndo_open)
1144 ret = ops->ndo_open(dev);
1147 * If it went open OK then:
1150 if (ret)
1151 clear_bit(__LINK_STATE_START, &dev->state);
1152 else {
1154 * Set the flags.
1156 dev->flags |= IFF_UP;
1159 * Enable NET_DMA
1161 net_dmaengine_get();
1164 * Initialize multicasting status
1166 dev_set_rx_mode(dev);
1169 * Wakeup transmit queue engine
1171 dev_activate(dev);
1174 return ret;
1178 * dev_open - prepare an interface for use.
1179 * @dev: device to open
1181 * Takes a device from down to up state. The device's private open
1182 * function is invoked and then the multicast lists are loaded. Finally
1183 * the device is moved into the up state and a %NETDEV_UP message is
1184 * sent to the netdev notifier chain.
1186 * Calling this function on an active interface is a nop. On a failure
1187 * a negative errno code is returned.
1189 int dev_open(struct net_device *dev)
1191 int ret;
1194 * Is it already up?
1196 if (dev->flags & IFF_UP)
1197 return 0;
1200 * Open device
1202 ret = __dev_open(dev);
1203 if (ret < 0)
1204 return ret;
1207 * ... and announce new interface.
1209 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1210 call_netdevice_notifiers(NETDEV_UP, dev);
1212 return ret;
1214 EXPORT_SYMBOL(dev_open);
1216 static int __dev_close(struct net_device *dev)
1218 const struct net_device_ops *ops = dev->netdev_ops;
1220 ASSERT_RTNL();
1221 might_sleep();
1224 * Tell people we are going down, so that they can
1225 * prepare to death, when device is still operating.
1227 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1229 clear_bit(__LINK_STATE_START, &dev->state);
1231 /* Synchronize to scheduled poll. We cannot touch poll list,
1232 * it can be even on different cpu. So just clear netif_running().
1234 * dev->stop() will invoke napi_disable() on all of it's
1235 * napi_struct instances on this device.
1237 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1239 dev_deactivate(dev);
1242 * Call the device specific close. This cannot fail.
1243 * Only if device is UP
1245 * We allow it to be called even after a DETACH hot-plug
1246 * event.
1248 if (ops->ndo_stop)
1249 ops->ndo_stop(dev);
1252 * Device is now down.
1255 dev->flags &= ~IFF_UP;
1258 * Shutdown NET_DMA
1260 net_dmaengine_put();
1262 return 0;
1266 * dev_close - shutdown an interface.
1267 * @dev: device to shutdown
1269 * This function moves an active device into down state. A
1270 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1271 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1272 * chain.
1274 int dev_close(struct net_device *dev)
1276 if (!(dev->flags & IFF_UP))
1277 return 0;
1279 __dev_close(dev);
1282 * Tell people we are down
1284 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1285 call_netdevice_notifiers(NETDEV_DOWN, dev);
1287 return 0;
1289 EXPORT_SYMBOL(dev_close);
1293 * dev_disable_lro - disable Large Receive Offload on a device
1294 * @dev: device
1296 * Disable Large Receive Offload (LRO) on a net device. Must be
1297 * called under RTNL. This is needed if received packets may be
1298 * forwarded to another interface.
1300 void dev_disable_lro(struct net_device *dev)
1302 if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1303 dev->ethtool_ops->set_flags) {
1304 u32 flags = dev->ethtool_ops->get_flags(dev);
1305 if (flags & ETH_FLAG_LRO) {
1306 flags &= ~ETH_FLAG_LRO;
1307 dev->ethtool_ops->set_flags(dev, flags);
1310 WARN_ON(dev->features & NETIF_F_LRO);
1312 EXPORT_SYMBOL(dev_disable_lro);
/*
 * Starts out as 1.  While it is non-zero, register_netdevice_notifier()
 * does not replay REGISTER/UP events to a newly added notifier.
 */
static int dev_boot_phase = 1;

/*
 *	Device change register/unregister. These are not inline or static
 *	as we export them to the world.
 */
1323 * register_netdevice_notifier - register a network notifier block
1324 * @nb: notifier
1326 * Register a notifier to be called when network device events occur.
1327 * The notifier passed is linked into the kernel structures and must
1328 * not be reused until it has been unregistered. A negative errno code
1329 * is returned on a failure.
1331 * When registered all registration and up events are replayed
1332 * to the new notifier to allow device to have a race free
1333 * view of the network device list.
1336 int register_netdevice_notifier(struct notifier_block *nb)
1338 struct net_device *dev;
1339 struct net_device *last;
1340 struct net *net;
1341 int err;
1343 rtnl_lock();
1344 err = raw_notifier_chain_register(&netdev_chain, nb);
1345 if (err)
1346 goto unlock;
1347 if (dev_boot_phase)
1348 goto unlock;
1349 for_each_net(net) {
1350 for_each_netdev(net, dev) {
1351 err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1352 err = notifier_to_errno(err);
1353 if (err)
1354 goto rollback;
1356 if (!(dev->flags & IFF_UP))
1357 continue;
1359 nb->notifier_call(nb, NETDEV_UP, dev);
1363 unlock:
1364 rtnl_unlock();
1365 return err;
1367 rollback:
1368 last = dev;
1369 for_each_net(net) {
1370 for_each_netdev(net, dev) {
1371 if (dev == last)
1372 break;
1374 if (dev->flags & IFF_UP) {
1375 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1376 nb->notifier_call(nb, NETDEV_DOWN, dev);
1378 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1379 nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1383 raw_notifier_chain_unregister(&netdev_chain, nb);
1384 goto unlock;
1386 EXPORT_SYMBOL(register_netdevice_notifier);
1389 * unregister_netdevice_notifier - unregister a network notifier block
1390 * @nb: notifier
1392 * Unregister a notifier previously registered by
1393 * register_netdevice_notifier(). The notifier is unlinked into the
1394 * kernel structures and may then be reused. A negative errno code
1395 * is returned on a failure.
1398 int unregister_netdevice_notifier(struct notifier_block *nb)
1400 int err;
1402 rtnl_lock();
1403 err = raw_notifier_chain_unregister(&netdev_chain, nb);
1404 rtnl_unlock();
1405 return err;
1407 EXPORT_SYMBOL(unregister_netdevice_notifier);
1410 * call_netdevice_notifiers - call all network notifier blocks
1411 * @val: value passed unmodified to notifier function
1412 * @dev: net_device pointer passed unmodified to notifier function
1414 * Call all network notifier blocks. Parameters and return value
1415 * are as for raw_notifier_call_chain().
1418 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1420 return raw_notifier_call_chain(&netdev_chain, val, dev);
1423 /* When > 0 there are consumers of rx skb time stamps */
1424 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1426 void net_enable_timestamp(void)
1428 atomic_inc(&netstamp_needed);
1430 EXPORT_SYMBOL(net_enable_timestamp);
1432 void net_disable_timestamp(void)
1434 atomic_dec(&netstamp_needed);
1436 EXPORT_SYMBOL(net_disable_timestamp);
1438 static inline void net_timestamp(struct sk_buff *skb)
1440 if (atomic_read(&netstamp_needed))
1441 __net_timestamp(skb);
1442 else
1443 skb->tstamp.tv64 = 0;
1447 * dev_forward_skb - loopback an skb to another netif
1449 * @dev: destination network device
1450 * @skb: buffer to forward
1452 * return values:
1453 * NET_RX_SUCCESS (no congestion)
1454 * NET_RX_DROP (packet was dropped)
1456 * dev_forward_skb can be used for injecting an skb from the
1457 * start_xmit function of one device into the receive queue
1458 * of another device.
1460 * The receiving device may be in another namespace, so
1461 * we have to clear all information in the skb that could
1462 * impact namespace isolation.
1464 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1466 skb_orphan(skb);
1468 if (!(dev->flags & IFF_UP))
1469 return NET_RX_DROP;
1471 if (skb->len > (dev->mtu + dev->hard_header_len))
1472 return NET_RX_DROP;
1474 skb_set_dev(skb, dev);
1475 skb->tstamp.tv64 = 0;
1476 skb->pkt_type = PACKET_HOST;
1477 skb->protocol = eth_type_trans(skb, dev);
1478 return netif_rx(skb);
1480 EXPORT_SYMBOL_GPL(dev_forward_skb);
1483 * Support routine. Sends outgoing frames to any network
1484 * taps currently in use.
1487 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1489 struct packet_type *ptype;
1491 #ifdef CONFIG_NET_CLS_ACT
1492 if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
1493 net_timestamp(skb);
1494 #else
1495 net_timestamp(skb);
1496 #endif
1498 rcu_read_lock();
1499 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1500 /* Never send packets back to the socket
1501 * they originated from - MvS (miquels@drinkel.ow.org)
1503 if ((ptype->dev == dev || !ptype->dev) &&
1504 (ptype->af_packet_priv == NULL ||
1505 (struct sock *)ptype->af_packet_priv != skb->sk)) {
1506 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1507 if (!skb2)
1508 break;
1510 /* skb->nh should be correctly
1511 set by sender, so that the second statement is
1512 just protection against buggy protocols.
1514 skb_reset_mac_header(skb2);
1516 if (skb_network_header(skb2) < skb2->data ||
1517 skb2->network_header > skb2->tail) {
1518 if (net_ratelimit())
1519 printk(KERN_CRIT "protocol %04x is "
1520 "buggy, dev %s\n",
1521 skb2->protocol, dev->name);
1522 skb_reset_network_header(skb2);
1525 skb2->transport_header = skb2->network_header;
1526 skb2->pkt_type = PACKET_OUTGOING;
1527 ptype->func(skb2, skb->dev, ptype, skb->dev);
1530 rcu_read_unlock();
1534 static inline void __netif_reschedule(struct Qdisc *q)
1536 struct softnet_data *sd;
1537 unsigned long flags;
1539 local_irq_save(flags);
1540 sd = &__get_cpu_var(softnet_data);
1541 q->next_sched = sd->output_queue;
1542 sd->output_queue = q;
1543 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1544 local_irq_restore(flags);
1547 void __netif_schedule(struct Qdisc *q)
1549 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1550 __netif_reschedule(q);
1552 EXPORT_SYMBOL(__netif_schedule);
1554 void dev_kfree_skb_irq(struct sk_buff *skb)
1556 if (atomic_dec_and_test(&skb->users)) {
1557 struct softnet_data *sd;
1558 unsigned long flags;
1560 local_irq_save(flags);
1561 sd = &__get_cpu_var(softnet_data);
1562 skb->next = sd->completion_queue;
1563 sd->completion_queue = skb;
1564 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1565 local_irq_restore(flags);
1568 EXPORT_SYMBOL(dev_kfree_skb_irq);
/*
 * Free @skb from any context: use dev_kfree_skb_irq() when in hard
 * interrupt context or with interrupts disabled, dev_kfree_skb() otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (in_irq() || irqs_disabled()) {
		dev_kfree_skb_irq(skb);
		return;
	}

	dev_kfree_skb(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1581 * netif_device_detach - mark device as removed
1582 * @dev: network device
1584 * Mark device as removed from system and therefore no longer available.
1586 void netif_device_detach(struct net_device *dev)
1588 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1589 netif_running(dev)) {
1590 netif_tx_stop_all_queues(dev);
1593 EXPORT_SYMBOL(netif_device_detach);
1596 * netif_device_attach - mark device as attached
1597 * @dev: network device
1599 * Mark device as attached from system and restart if needed.
1601 void netif_device_attach(struct net_device *dev)
1603 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1604 netif_running(dev)) {
1605 netif_tx_wake_all_queues(dev);
1606 __netdev_watchdog_up(dev);
1609 EXPORT_SYMBOL(netif_device_attach);
1611 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1613 return ((features & NETIF_F_GEN_CSUM) ||
1614 ((features & NETIF_F_IP_CSUM) &&
1615 protocol == htons(ETH_P_IP)) ||
1616 ((features & NETIF_F_IPV6_CSUM) &&
1617 protocol == htons(ETH_P_IPV6)) ||
1618 ((features & NETIF_F_FCOE_CRC) &&
1619 protocol == htons(ETH_P_FCOE)));
1622 static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1624 if (can_checksum_protocol(dev->features, skb->protocol))
1625 return true;
1627 if (skb->protocol == htons(ETH_P_8021Q)) {
1628 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1629 if (can_checksum_protocol(dev->features & dev->vlan_features,
1630 veh->h_vlan_encapsulated_proto))
1631 return true;
1634 return false;
/**
 * skb_set_dev -- assign a new device to a buffer
 * @skb: buffer for the new device
 * @dev: network device
 *
 * If an skb is owned by a device already, we have to reset
 * all data private to the namespace a device belongs to
 * before assigning it a new device.
 *
 * The dst reference is always dropped; the remaining per-namespace state
 * is only cleared when the old and new device live in different
 * namespaces.
 */
#ifdef CONFIG_NET_NS
void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
{
	bool crosses_ns;

	skb_dst_drop(skb);

	crosses_ns = skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev));
	if (crosses_ns) {
		secpath_reset(skb);
		nf_reset(skb);
		skb_init_secmark(skb);
		skb->mark = 0;
		skb->priority = 0;
		skb->nf_trace = 0;
		skb->ipvs_property = 0;
#ifdef CONFIG_NET_SCHED
		skb->tc_index = 0;
#endif
	}

	skb->dev = dev;
}
EXPORT_SYMBOL(skb_set_dev);
#endif /* CONFIG_NET_NS */
1668 * Invalidate hardware checksum when packet is to be mangled, and
1669 * complete checksum manually on outgoing path.
1671 int skb_checksum_help(struct sk_buff *skb)
1673 __wsum csum;
1674 int ret = 0, offset;
1676 if (skb->ip_summed == CHECKSUM_COMPLETE)
1677 goto out_set_summed;
1679 if (unlikely(skb_shinfo(skb)->gso_size)) {
1680 /* Let GSO fix up the checksum. */
1681 goto out_set_summed;
1684 offset = skb->csum_start - skb_headroom(skb);
1685 BUG_ON(offset >= skb_headlen(skb));
1686 csum = skb_checksum(skb, offset, skb->len - offset, 0);
1688 offset += skb->csum_offset;
1689 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1691 if (skb_cloned(skb) &&
1692 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1693 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1694 if (ret)
1695 goto out;
1698 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1699 out_set_summed:
1700 skb->ip_summed = CHECKSUM_NONE;
1701 out:
1702 return ret;
1704 EXPORT_SYMBOL(skb_checksum_help);
1707 * skb_gso_segment - Perform segmentation on skb.
1708 * @skb: buffer to segment
1709 * @features: features for the output path (see dev->features)
1711 * This function segments the given skb and returns a list of segments.
1713 * It may return NULL if the skb requires no segmentation. This is
1714 * only possible when GSO is used for verifying header integrity.
1716 struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1718 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1719 struct packet_type *ptype;
1720 __be16 type = skb->protocol;
1721 int err;
1723 skb_reset_mac_header(skb);
1724 skb->mac_len = skb->network_header - skb->mac_header;
1725 __skb_pull(skb, skb->mac_len);
1727 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1728 struct net_device *dev = skb->dev;
1729 struct ethtool_drvinfo info = {};
1731 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1732 dev->ethtool_ops->get_drvinfo(dev, &info);
1734 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
1735 "ip_summed=%d",
1736 info.driver, dev ? dev->features : 0L,
1737 skb->sk ? skb->sk->sk_route_caps : 0L,
1738 skb->len, skb->data_len, skb->ip_summed);
1740 if (skb_header_cloned(skb) &&
1741 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1742 return ERR_PTR(err);
1745 rcu_read_lock();
1746 list_for_each_entry_rcu(ptype,
1747 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1748 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1749 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1750 err = ptype->gso_send_check(skb);
1751 segs = ERR_PTR(err);
1752 if (err || skb_gso_ok(skb, features))
1753 break;
1754 __skb_push(skb, (skb->data -
1755 skb_network_header(skb)));
1757 segs = ptype->gso_segment(skb, features);
1758 break;
1761 rcu_read_unlock();
1763 __skb_push(skb, skb->data - skb_mac_header(skb));
1765 return segs;
1767 EXPORT_SYMBOL(skb_gso_segment);
/*
 * Take action when hardware reception checksum errors are detected:
 * log a rate-limited error naming the device (or "<unknown>" when @dev is
 * NULL) and dump the stack.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (!net_ratelimit())
		return;

	printk(KERN_ERR "%s: hw csum failure.\n",
		dev ? dev->name : "<unknown>");
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 when @dev lacks NETIF_F_HIGHDMA and at least one page
 * fragment of @skb is in high memory, 0 otherwise.  Always 0 when
 * CONFIG_HIGHMEM is not set.
 */

static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	int i;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			if (PageHighMem(skb_shinfo(skb)->frags[i].page))
				return 1;
		}
	}

#endif
	return 0;
}
/*
 * Kept in skb->cb of a software-segmented skb: the destructor the skb had
 * before dev_gso_segment() installed dev_gso_skb_destructor().
 */
struct dev_gso_cb {
	void (*destructor)(struct sk_buff *skb);
};

/* View skb->cb as a struct dev_gso_cb */
#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1809 static void dev_gso_skb_destructor(struct sk_buff *skb)
1811 struct dev_gso_cb *cb;
1813 do {
1814 struct sk_buff *nskb = skb->next;
1816 skb->next = nskb->next;
1817 nskb->next = NULL;
1818 kfree_skb(nskb);
1819 } while (skb->next);
1821 cb = DEV_GSO_CB(skb);
1822 if (cb->destructor)
1823 cb->destructor(skb);
1827 * dev_gso_segment - Perform emulated hardware segmentation on skb.
1828 * @skb: buffer to segment
1830 * This function segments the given skb and stores the list of segments
1831 * in skb->next.
1833 static int dev_gso_segment(struct sk_buff *skb)
1835 struct net_device *dev = skb->dev;
1836 struct sk_buff *segs;
1837 int features = dev->features & ~(illegal_highdma(dev, skb) ?
1838 NETIF_F_SG : 0);
1840 segs = skb_gso_segment(skb, features);
1842 /* Verifying header integrity only. */
1843 if (!segs)
1844 return 0;
1846 if (IS_ERR(segs))
1847 return PTR_ERR(segs);
1849 skb->next = segs;
1850 DEV_GSO_CB(skb)->destructor = skb->destructor;
1851 skb->destructor = dev_gso_skb_destructor;
1853 return 0;
1856 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1857 struct netdev_queue *txq)
1859 const struct net_device_ops *ops = dev->netdev_ops;
1860 int rc = NETDEV_TX_OK;
1862 if (likely(!skb->next)) {
1863 if (!list_empty(&ptype_all))
1864 dev_queue_xmit_nit(skb, dev);
1866 if (netif_needs_gso(dev, skb)) {
1867 if (unlikely(dev_gso_segment(skb)))
1868 goto out_kfree_skb;
1869 if (skb->next)
1870 goto gso;
1874 * If device doesnt need skb->dst, release it right now while
1875 * its hot in this cpu cache
1877 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1878 skb_dst_drop(skb);
1880 rc = ops->ndo_start_xmit(skb, dev);
1881 if (rc == NETDEV_TX_OK)
1882 txq_trans_update(txq);
1884 * TODO: if skb_orphan() was called by
1885 * dev->hard_start_xmit() (for example, the unmodified
1886 * igb driver does that; bnx2 doesn't), then
1887 * skb_tx_software_timestamp() will be unable to send
1888 * back the time stamp.
1890 * How can this be prevented? Always create another
1891 * reference to the socket before calling
1892 * dev->hard_start_xmit()? Prevent that skb_orphan()
1893 * does anything in dev->hard_start_xmit() by clearing
1894 * the skb destructor before the call and restoring it
1895 * afterwards, then doing the skb_orphan() ourselves?
1897 return rc;
1900 gso:
1901 do {
1902 struct sk_buff *nskb = skb->next;
1904 skb->next = nskb->next;
1905 nskb->next = NULL;
1908 * If device doesnt need nskb->dst, release it right now while
1909 * its hot in this cpu cache
1911 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1912 skb_dst_drop(nskb);
1914 rc = ops->ndo_start_xmit(nskb, dev);
1915 if (unlikely(rc != NETDEV_TX_OK)) {
1916 if (rc & ~NETDEV_TX_MASK)
1917 goto out_kfree_gso_skb;
1918 nskb->next = skb->next;
1919 skb->next = nskb;
1920 return rc;
1922 txq_trans_update(txq);
1923 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
1924 return NETDEV_TX_BUSY;
1925 } while (skb->next);
1927 out_kfree_gso_skb:
1928 if (likely(skb->next == NULL))
1929 skb->destructor = DEV_GSO_CB(skb)->destructor;
1930 out_kfree_skb:
1931 kfree_skb(skb);
1932 return rc;
1935 static u32 skb_tx_hashrnd;
1937 u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
1939 u32 hash;
1941 if (skb_rx_queue_recorded(skb)) {
1942 hash = skb_get_rx_queue(skb);
1943 while (unlikely(hash >= dev->real_num_tx_queues))
1944 hash -= dev->real_num_tx_queues;
1945 return hash;
1948 if (skb->sk && skb->sk->sk_hash)
1949 hash = skb->sk->sk_hash;
1950 else
1951 hash = skb->protocol;
1953 hash = jhash_1word(hash, skb_tx_hashrnd);
1955 return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
1957 EXPORT_SYMBOL(skb_tx_hash);
1959 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
1961 if (unlikely(queue_index >= dev->real_num_tx_queues)) {
1962 if (net_ratelimit()) {
1963 WARN(1, "%s selects TX queue %d, but "
1964 "real number of TX queues is %d\n",
1965 dev->name, queue_index,
1966 dev->real_num_tx_queues);
1968 return 0;
1970 return queue_index;
1973 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
1974 struct sk_buff *skb)
1976 u16 queue_index;
1977 struct sock *sk = skb->sk;
1979 if (sk_tx_queue_recorded(sk)) {
1980 queue_index = sk_tx_queue_get(sk);
1981 } else {
1982 const struct net_device_ops *ops = dev->netdev_ops;
1984 if (ops->ndo_select_queue) {
1985 queue_index = ops->ndo_select_queue(dev, skb);
1986 queue_index = dev_cap_txqueue(dev, queue_index);
1987 } else {
1988 queue_index = 0;
1989 if (dev->real_num_tx_queues > 1)
1990 queue_index = skb_tx_hash(dev, skb);
1992 if (sk) {
1993 struct dst_entry *dst = rcu_dereference_bh(sk->sk_dst_cache);
1995 if (dst && skb_dst(skb) == dst)
1996 sk_tx_queue_set(sk, queue_index);
2001 skb_set_queue_mapping(skb, queue_index);
2002 return netdev_get_tx_queue(dev, queue_index);
2005 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2006 struct net_device *dev,
2007 struct netdev_queue *txq)
2009 spinlock_t *root_lock = qdisc_lock(q);
2010 int rc;
2012 spin_lock(root_lock);
2013 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2014 kfree_skb(skb);
2015 rc = NET_XMIT_DROP;
2016 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2017 !test_and_set_bit(__QDISC_STATE_RUNNING, &q->state)) {
2019 * This is a work-conserving queue; there are no old skbs
2020 * waiting to be sent out; and the qdisc is not running -
2021 * xmit the skb directly.
2023 __qdisc_update_bstats(q, skb->len);
2024 if (sch_direct_xmit(skb, q, dev, txq, root_lock))
2025 __qdisc_run(q);
2026 else
2027 clear_bit(__QDISC_STATE_RUNNING, &q->state);
2029 rc = NET_XMIT_SUCCESS;
2030 } else {
2031 rc = qdisc_enqueue_root(skb, q);
2032 qdisc_run(q);
2034 spin_unlock(root_lock);
2036 return rc;
2040 * Returns true if either:
2041 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
2042 * 2. skb is fragmented and the device does not support SG, or if
2043 * at least one of fragments is in highmem and device does not
2044 * support DMA from it.
2046 static inline int skb_needs_linearize(struct sk_buff *skb,
2047 struct net_device *dev)
2049 return (skb_has_frags(skb) && !(dev->features & NETIF_F_FRAGLIST)) ||
2050 (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) ||
2051 illegal_highdma(dev, skb)));
2055 * dev_queue_xmit - transmit a buffer
2056 * @skb: buffer to transmit
2058 * Queue a buffer for transmission to a network device. The caller must
2059 * have set the device and priority and built the buffer before calling
2060 * this function. The function can be called from an interrupt.
2062 * A negative errno code is returned on a failure. A success does not
2063 * guarantee the frame will be transmitted as it may be dropped due
2064 * to congestion or traffic shaping.
2066 * -----------------------------------------------------------------------------------
2067 * I notice this method can also return errors from the queue disciplines,
2068 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2069 * be positive.
2071 * Regardless of the return value, the skb is consumed, so it is currently
2072 * difficult to retry a send to this method. (You can bump the ref count
2073 * before sending to hold a reference for retry if you are careful.)
2075 * When calling this method, interrupts MUST be enabled. This is because
2076 * the BH enable code must have IRQs enabled so that it will not deadlock.
2077 * --BLG
2079 int dev_queue_xmit(struct sk_buff *skb)
2081 struct net_device *dev = skb->dev;
2082 struct netdev_queue *txq;
2083 struct Qdisc *q;
2084 int rc = -ENOMEM;
2086 /* GSO will handle the following emulations directly. */
2087 if (netif_needs_gso(dev, skb))
2088 goto gso;
2090 /* Convert a paged skb to linear, if required */
2091 if (skb_needs_linearize(skb, dev) && __skb_linearize(skb))
2092 goto out_kfree_skb;
2094 /* If packet is not checksummed and device does not support
2095 * checksumming for this protocol, complete checksumming here.
2097 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2098 skb_set_transport_header(skb, skb->csum_start -
2099 skb_headroom(skb));
2100 if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
2101 goto out_kfree_skb;
2104 gso:
2105 /* Disable soft irqs for various locks below. Also
2106 * stops preemption for RCU.
2108 rcu_read_lock_bh();
2110 txq = dev_pick_tx(dev, skb);
2111 q = rcu_dereference_bh(txq->qdisc);
2113 #ifdef CONFIG_NET_CLS_ACT
2114 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2115 #endif
2116 if (q->enqueue) {
2117 rc = __dev_xmit_skb(skb, q, dev, txq);
2118 goto out;
2121 /* The device has no queue. Common case for software devices:
2122 loopback, all the sorts of tunnels...
2124 Really, it is unlikely that netif_tx_lock protection is necessary
2125 here. (f.e. loopback and IP tunnels are clean ignoring statistics
2126 counters.)
2127 However, it is possible, that they rely on protection
2128 made by us here.
2130 Check this and shot the lock. It is not prone from deadlocks.
2131 Either shot noqueue qdisc, it is even simpler 8)
2133 if (dev->flags & IFF_UP) {
2134 int cpu = smp_processor_id(); /* ok because BHs are off */
2136 if (txq->xmit_lock_owner != cpu) {
2138 HARD_TX_LOCK(dev, txq, cpu);
2140 if (!netif_tx_queue_stopped(txq)) {
2141 rc = dev_hard_start_xmit(skb, dev, txq);
2142 if (dev_xmit_complete(rc)) {
2143 HARD_TX_UNLOCK(dev, txq);
2144 goto out;
2147 HARD_TX_UNLOCK(dev, txq);
2148 if (net_ratelimit())
2149 printk(KERN_CRIT "Virtual device %s asks to "
2150 "queue packet!\n", dev->name);
2151 } else {
2152 /* Recursion is detected! It is possible,
2153 * unfortunately */
2154 if (net_ratelimit())
2155 printk(KERN_CRIT "Dead loop on virtual device "
2156 "%s, fix it urgently!\n", dev->name);
2160 rc = -ENETDOWN;
2161 rcu_read_unlock_bh();
2163 out_kfree_skb:
2164 kfree_skb(skb);
2165 return rc;
2166 out:
2167 rcu_read_unlock_bh();
2168 return rc;
2170 EXPORT_SYMBOL(dev_queue_xmit);
2173 /*=======================================================================
2174 Receiver routines
2175 =======================================================================*/
2177 int netdev_max_backlog __read_mostly = 1000;
2178 int netdev_budget __read_mostly = 300;
2179 int weight_p __read_mostly = 64; /* old backlog weight */
2181 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
2185 * netif_rx - post buffer to the network code
2186 * @skb: buffer to post
2188 * This function receives a packet from a device driver and queues it for
2189 * the upper (protocol) levels to process. It always succeeds. The buffer
2190 * may be dropped during processing for congestion control or by the
2191 * protocol layers.
2193 * return values:
2194 * NET_RX_SUCCESS (no congestion)
2195 * NET_RX_DROP (packet was dropped)
2199 int netif_rx(struct sk_buff *skb)
2201 struct softnet_data *queue;
2202 unsigned long flags;
2204 /* if netpoll wants it, pretend we never saw it */
2205 if (netpoll_rx(skb))
2206 return NET_RX_DROP;
2208 if (!skb->tstamp.tv64)
2209 net_timestamp(skb);
2212 * The code is rearranged so that the path is the most
2213 * short when CPU is congested, but is still operating.
2215 local_irq_save(flags);
2216 queue = &__get_cpu_var(softnet_data);
2218 __get_cpu_var(netdev_rx_stat).total++;
2219 if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
2220 if (queue->input_pkt_queue.qlen) {
2221 enqueue:
2222 __skb_queue_tail(&queue->input_pkt_queue, skb);
2223 local_irq_restore(flags);
2224 return NET_RX_SUCCESS;
2227 napi_schedule(&queue->backlog);
2228 goto enqueue;
2231 __get_cpu_var(netdev_rx_stat).dropped++;
2232 local_irq_restore(flags);
2234 kfree_skb(skb);
2235 return NET_RX_DROP;
2237 EXPORT_SYMBOL(netif_rx);
/*
 * netif_rx() wrapper that runs with preemption disabled and, before
 * re-enabling it, processes any softirq that became pending.
 * Returns the netif_rx() result.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int rc;

	preempt_disable();

	rc = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return rc;
}
EXPORT_SYMBOL(netif_rx_ni);
2253 static void net_tx_action(struct softirq_action *h)
2255 struct softnet_data *sd = &__get_cpu_var(softnet_data);
2257 if (sd->completion_queue) {
2258 struct sk_buff *clist;
2260 local_irq_disable();
2261 clist = sd->completion_queue;
2262 sd->completion_queue = NULL;
2263 local_irq_enable();
2265 while (clist) {
2266 struct sk_buff *skb = clist;
2267 clist = clist->next;
2269 WARN_ON(atomic_read(&skb->users));
2270 __kfree_skb(skb);
2274 if (sd->output_queue) {
2275 struct Qdisc *head;
2277 local_irq_disable();
2278 head = sd->output_queue;
2279 sd->output_queue = NULL;
2280 local_irq_enable();
2282 while (head) {
2283 struct Qdisc *q = head;
2284 spinlock_t *root_lock;
2286 head = head->next_sched;
2288 root_lock = qdisc_lock(q);
2289 if (spin_trylock(root_lock)) {
2290 smp_mb__before_clear_bit();
2291 clear_bit(__QDISC_STATE_SCHED,
2292 &q->state);
2293 qdisc_run(q);
2294 spin_unlock(root_lock);
2295 } else {
2296 if (!test_bit(__QDISC_STATE_DEACTIVATED,
2297 &q->state)) {
2298 __netif_reschedule(q);
2299 } else {
2300 smp_mb__before_clear_bit();
2301 clear_bit(__QDISC_STATE_SCHED,
2302 &q->state);
2309 static inline int deliver_skb(struct sk_buff *skb,
2310 struct packet_type *pt_prev,
2311 struct net_device *orig_dev)
2313 atomic_inc(&skb->users);
2314 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)

#if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
/* This hook is defined here for ATM LANE */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif

/*
 * If bridge module is loaded call bridging hook.
 *  returns NULL if packet was consumed.
 */
struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
					struct sk_buff *skb) __read_mostly;
EXPORT_SYMBOL_GPL(br_handle_frame_hook);

/*
 * Pass @skb to the bridge hook when its device is a bridge port.
 * Loopback packets and devices without a bridge port get the skb back
 * untouched.  A pending *@pt_prev handler is delivered to first, and
 * then cleared.
 */
static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
					    struct packet_type **pt_prev, int *ret,
					    struct net_device *orig_dev)
{
	struct net_bridge_port *port;

	if (skb->pkt_type == PACKET_LOOPBACK)
		return skb;

	port = rcu_dereference(skb->dev->br_port);
	if (port == NULL)
		return skb;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	return br_handle_frame_hook(port, skb);
}
#else
#define handle_bridge(skb, pt_prev, ret, orig_dev)	(skb)
#endif
#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);

/*
 * Pass @skb to the macvlan hook when its device has a macvlan port;
 * otherwise the skb is returned untouched.  A pending *@pt_prev handler
 * is delivered to first, and then cleared.
 */
static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
					     struct packet_type **pt_prev,
					     int *ret,
					     struct net_device *orig_dev)
{
	if (!skb->dev->macvlan_port)
		return skb;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	return macvlan_handle_frame_hook(skb);
}
#else
#define handle_macvlan(skb, pt_prev, ret, orig_dev)	(skb)
#endif
2377 #ifdef CONFIG_NET_CLS_ACT
2378 /* TODO: Maybe we should just force sch_ingress to be compiled in
2379 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
2380 * a compare and 2 stores extra right now if we dont have it on
2381 * but have CONFIG_NET_CLS_ACT
2382 * NOTE: This doesnt stop any functionality; if you dont have
2383 * the ingress scheduler, you just cant add policies on ingress.
2386 static int ing_filter(struct sk_buff *skb)
2388 struct net_device *dev = skb->dev;
2389 u32 ttl = G_TC_RTTL(skb->tc_verd);
2390 struct netdev_queue *rxq;
2391 int result = TC_ACT_OK;
2392 struct Qdisc *q;
2394 if (MAX_RED_LOOP < ttl++) {
2395 printk(KERN_WARNING
2396 "Redir loop detected Dropping packet (%d->%d)\n",
2397 skb->skb_iif, dev->ifindex);
2398 return TC_ACT_SHOT;
2401 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2402 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2404 rxq = &dev->rx_queue;
2406 q = rxq->qdisc;
2407 if (q != &noop_qdisc) {
2408 spin_lock(qdisc_lock(q));
2409 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2410 result = qdisc_enqueue_root(skb, q);
2411 spin_unlock(qdisc_lock(q));
2414 return result;
2417 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2418 struct packet_type **pt_prev,
2419 int *ret, struct net_device *orig_dev)
2421 if (skb->dev->rx_queue.qdisc == &noop_qdisc)
2422 goto out;
2424 if (*pt_prev) {
2425 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2426 *pt_prev = NULL;
2427 } else {
2428 /* Huh? Why does turning on AF_PACKET affect this? */
2429 skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2432 switch (ing_filter(skb)) {
2433 case TC_ACT_SHOT:
2434 case TC_ACT_STOLEN:
2435 kfree_skb(skb);
2436 return NULL;
2439 out:
2440 skb->tc_verd = 0;
2441 return skb;
2443 #endif
2446 * netif_nit_deliver - deliver received packets to network taps
2447 * @skb: buffer
2449 * This function is used to deliver incoming packets to network
2450 * taps. It should be used when the normal netif_receive_skb path
2451 * is bypassed, for example because of VLAN acceleration.
2453 void netif_nit_deliver(struct sk_buff *skb)
2455 struct packet_type *ptype;
2457 if (list_empty(&ptype_all))
2458 return;
2460 skb_reset_network_header(skb);
2461 skb_reset_transport_header(skb);
2462 skb->mac_len = skb->network_header - skb->mac_header;
2464 rcu_read_lock();
2465 list_for_each_entry_rcu(ptype, &ptype_all, list) {
2466 if (!ptype->dev || ptype->dev == skb->dev)
2467 deliver_skb(skb, ptype, skb->dev);
2469 rcu_read_unlock();
2473 * netif_receive_skb - process receive buffer from network
2474 * @skb: buffer to process
2476 * netif_receive_skb() is the main receive data processing function.
2477 * It always succeeds. The buffer may be dropped during processing
2478 * for congestion control or by the protocol layers.
2480 * This function may only be called from softirq context and interrupts
2481 * should be enabled.
2483 * Return values (usually ignored):
2484 * NET_RX_SUCCESS: no congestion
2485 * NET_RX_DROP: packet was dropped
2487 int netif_receive_skb(struct sk_buff *skb)
2489 struct packet_type *ptype, *pt_prev;
2490 struct net_device *orig_dev;
2491 struct net_device *master;
2492 struct net_device *null_or_orig;
2493 struct net_device *null_or_bond;
2494 int ret = NET_RX_DROP;
2495 __be16 type;
2497 if (!skb->tstamp.tv64)
2498 net_timestamp(skb);
2500 if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
2501 return NET_RX_SUCCESS;
2503 /* if we've gotten here through NAPI, check netpoll */
2504 if (netpoll_receive_skb(skb))
2505 return NET_RX_DROP;
2507 if (!skb->skb_iif)
2508 skb->skb_iif = skb->dev->ifindex;
2510 null_or_orig = NULL;
2511 orig_dev = skb->dev;
2512 master = ACCESS_ONCE(orig_dev->master);
2513 if (master) {
2514 if (skb_bond_should_drop(skb, master))
2515 null_or_orig = orig_dev; /* deliver only exact match */
2516 else
2517 skb->dev = master;
2520 __get_cpu_var(netdev_rx_stat).total++;
2522 skb_reset_network_header(skb);
2523 skb_reset_transport_header(skb);
2524 skb->mac_len = skb->network_header - skb->mac_header;
2526 pt_prev = NULL;
2528 rcu_read_lock();
2530 #ifdef CONFIG_NET_CLS_ACT
2531 if (skb->tc_verd & TC_NCLS) {
2532 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2533 goto ncls;
2535 #endif
2537 list_for_each_entry_rcu(ptype, &ptype_all, list) {
2538 if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2539 ptype->dev == orig_dev) {
2540 if (pt_prev)
2541 ret = deliver_skb(skb, pt_prev, orig_dev);
2542 pt_prev = ptype;
2546 #ifdef CONFIG_NET_CLS_ACT
2547 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2548 if (!skb)
2549 goto out;
2550 ncls:
2551 #endif
2553 skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
2554 if (!skb)
2555 goto out;
2556 skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
2557 if (!skb)
2558 goto out;
2561 * Make sure frames received on VLAN interfaces stacked on
2562 * bonding interfaces still make their way to any base bonding
2563 * device that may have registered for a specific ptype. The
2564 * handler may have to adjust skb->dev and orig_dev.
2566 null_or_bond = NULL;
2567 if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
2568 (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
2569 null_or_bond = vlan_dev_real_dev(skb->dev);
2572 type = skb->protocol;
2573 list_for_each_entry_rcu(ptype,
2574 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2575 if (ptype->type == type && (ptype->dev == null_or_orig ||
2576 ptype->dev == skb->dev || ptype->dev == orig_dev ||
2577 ptype->dev == null_or_bond)) {
2578 if (pt_prev)
2579 ret = deliver_skb(skb, pt_prev, orig_dev);
2580 pt_prev = ptype;
2584 if (pt_prev) {
2585 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2586 } else {
2587 kfree_skb(skb);
2588 /* Jamal, now you will not able to escape explaining
2589 * me how you were going to use this. :-)
2591 ret = NET_RX_DROP;
2594 out:
2595 rcu_read_unlock();
2596 return ret;
2598 EXPORT_SYMBOL(netif_receive_skb);
2600 /* Network device is going away, flush any packets still pending */
2601 static void flush_backlog(void *arg)
2603 struct net_device *dev = arg;
2604 struct softnet_data *queue = &__get_cpu_var(softnet_data);
2605 struct sk_buff *skb, *tmp;
2607 skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp)
2608 if (skb->dev == dev) {
2609 __skb_unlink(skb, &queue->input_pkt_queue);
2610 kfree_skb(skb);
2614 static int napi_gro_complete(struct sk_buff *skb)
2616 struct packet_type *ptype;
2617 __be16 type = skb->protocol;
2618 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2619 int err = -ENOENT;
2621 if (NAPI_GRO_CB(skb)->count == 1) {
2622 skb_shinfo(skb)->gso_size = 0;
2623 goto out;
2626 rcu_read_lock();
2627 list_for_each_entry_rcu(ptype, head, list) {
2628 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
2629 continue;
2631 err = ptype->gro_complete(skb);
2632 break;
2634 rcu_read_unlock();
2636 if (err) {
2637 WARN_ON(&ptype->list == head);
2638 kfree_skb(skb);
2639 return NET_RX_SUCCESS;
2642 out:
2643 return netif_receive_skb(skb);
2646 static void napi_gro_flush(struct napi_struct *napi)
2648 struct sk_buff *skb, *next;
2650 for (skb = napi->gro_list; skb; skb = next) {
2651 next = skb->next;
2652 skb->next = NULL;
2653 napi_gro_complete(skb);
2656 napi->gro_count = 0;
2657 napi->gro_list = NULL;
2660 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2662 struct sk_buff **pp = NULL;
2663 struct packet_type *ptype;
2664 __be16 type = skb->protocol;
2665 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2666 int same_flow;
2667 int mac_len;
2668 enum gro_result ret;
2670 if (!(skb->dev->features & NETIF_F_GRO))
2671 goto normal;
2673 if (skb_is_gso(skb) || skb_has_frags(skb))
2674 goto normal;
2676 rcu_read_lock();
2677 list_for_each_entry_rcu(ptype, head, list) {
2678 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
2679 continue;
2681 skb_set_network_header(skb, skb_gro_offset(skb));
2682 mac_len = skb->network_header - skb->mac_header;
2683 skb->mac_len = mac_len;
2684 NAPI_GRO_CB(skb)->same_flow = 0;
2685 NAPI_GRO_CB(skb)->flush = 0;
2686 NAPI_GRO_CB(skb)->free = 0;
2688 pp = ptype->gro_receive(&napi->gro_list, skb);
2689 break;
2691 rcu_read_unlock();
2693 if (&ptype->list == head)
2694 goto normal;
2696 same_flow = NAPI_GRO_CB(skb)->same_flow;
2697 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
2699 if (pp) {
2700 struct sk_buff *nskb = *pp;
2702 *pp = nskb->next;
2703 nskb->next = NULL;
2704 napi_gro_complete(nskb);
2705 napi->gro_count--;
2708 if (same_flow)
2709 goto ok;
2711 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
2712 goto normal;
2714 napi->gro_count++;
2715 NAPI_GRO_CB(skb)->count = 1;
2716 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
2717 skb->next = napi->gro_list;
2718 napi->gro_list = skb;
2719 ret = GRO_HELD;
2721 pull:
2722 if (skb_headlen(skb) < skb_gro_offset(skb)) {
2723 int grow = skb_gro_offset(skb) - skb_headlen(skb);
2725 BUG_ON(skb->end - skb->tail < grow);
2727 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
2729 skb->tail += grow;
2730 skb->data_len -= grow;
2732 skb_shinfo(skb)->frags[0].page_offset += grow;
2733 skb_shinfo(skb)->frags[0].size -= grow;
2735 if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
2736 put_page(skb_shinfo(skb)->frags[0].page);
2737 memmove(skb_shinfo(skb)->frags,
2738 skb_shinfo(skb)->frags + 1,
2739 --skb_shinfo(skb)->nr_frags);
2744 return ret;
2746 normal:
2747 ret = GRO_NORMAL;
2748 goto pull;
2750 EXPORT_SYMBOL(dev_gro_receive);
2752 static gro_result_t
2753 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2755 struct sk_buff *p;
2757 if (netpoll_rx_on(skb))
2758 return GRO_NORMAL;
2760 for (p = napi->gro_list; p; p = p->next) {
2761 NAPI_GRO_CB(p)->same_flow =
2762 (p->dev == skb->dev) &&
2763 !compare_ether_header(skb_mac_header(p),
2764 skb_gro_mac_header(skb));
2765 NAPI_GRO_CB(p)->flush = 0;
2768 return dev_gro_receive(napi, skb);
2771 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
2773 switch (ret) {
2774 case GRO_NORMAL:
2775 if (netif_receive_skb(skb))
2776 ret = GRO_DROP;
2777 break;
2779 case GRO_DROP:
2780 case GRO_MERGED_FREE:
2781 kfree_skb(skb);
2782 break;
2784 case GRO_HELD:
2785 case GRO_MERGED:
2786 break;
2789 return ret;
2791 EXPORT_SYMBOL(napi_skb_finish);
2793 void skb_gro_reset_offset(struct sk_buff *skb)
2795 NAPI_GRO_CB(skb)->data_offset = 0;
2796 NAPI_GRO_CB(skb)->frag0 = NULL;
2797 NAPI_GRO_CB(skb)->frag0_len = 0;
2799 if (skb->mac_header == skb->tail &&
2800 !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
2801 NAPI_GRO_CB(skb)->frag0 =
2802 page_address(skb_shinfo(skb)->frags[0].page) +
2803 skb_shinfo(skb)->frags[0].page_offset;
2804 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
2807 EXPORT_SYMBOL(skb_gro_reset_offset);
2809 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2811 skb_gro_reset_offset(skb);
2813 return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
2815 EXPORT_SYMBOL(napi_gro_receive);
2817 void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
2819 __skb_pull(skb, skb_headlen(skb));
2820 skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
2822 napi->skb = skb;
2824 EXPORT_SYMBOL(napi_reuse_skb);
2826 struct sk_buff *napi_get_frags(struct napi_struct *napi)
2828 struct sk_buff *skb = napi->skb;
2830 if (!skb) {
2831 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
2832 if (skb)
2833 napi->skb = skb;
2835 return skb;
2837 EXPORT_SYMBOL(napi_get_frags);
2839 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
2840 gro_result_t ret)
2842 switch (ret) {
2843 case GRO_NORMAL:
2844 case GRO_HELD:
2845 skb->protocol = eth_type_trans(skb, skb->dev);
2847 if (ret == GRO_HELD)
2848 skb_gro_pull(skb, -ETH_HLEN);
2849 else if (netif_receive_skb(skb))
2850 ret = GRO_DROP;
2851 break;
2853 case GRO_DROP:
2854 case GRO_MERGED_FREE:
2855 napi_reuse_skb(napi, skb);
2856 break;
2858 case GRO_MERGED:
2859 break;
2862 return ret;
2864 EXPORT_SYMBOL(napi_frags_finish);
2866 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
2868 struct sk_buff *skb = napi->skb;
2869 struct ethhdr *eth;
2870 unsigned int hlen;
2871 unsigned int off;
2873 napi->skb = NULL;
2875 skb_reset_mac_header(skb);
2876 skb_gro_reset_offset(skb);
2878 off = skb_gro_offset(skb);
2879 hlen = off + sizeof(*eth);
2880 eth = skb_gro_header_fast(skb, off);
2881 if (skb_gro_header_hard(skb, hlen)) {
2882 eth = skb_gro_header_slow(skb, hlen, off);
2883 if (unlikely(!eth)) {
2884 napi_reuse_skb(napi, skb);
2885 skb = NULL;
2886 goto out;
2890 skb_gro_pull(skb, sizeof(*eth));
2893 * This works because the only protocols we care about don't require
2894 * special handling. We'll fix it up properly at the end.
2896 skb->protocol = eth->h_proto;
2898 out:
2899 return skb;
2901 EXPORT_SYMBOL(napi_frags_skb);
2903 gro_result_t napi_gro_frags(struct napi_struct *napi)
2905 struct sk_buff *skb = napi_frags_skb(napi);
2907 if (!skb)
2908 return GRO_DROP;
2910 return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
2912 EXPORT_SYMBOL(napi_gro_frags);
2914 static int process_backlog(struct napi_struct *napi, int quota)
2916 int work = 0;
2917 struct softnet_data *queue = &__get_cpu_var(softnet_data);
2918 unsigned long start_time = jiffies;
2920 napi->weight = weight_p;
2921 do {
2922 struct sk_buff *skb;
2924 local_irq_disable();
2925 skb = __skb_dequeue(&queue->input_pkt_queue);
2926 if (!skb) {
2927 __napi_complete(napi);
2928 local_irq_enable();
2929 break;
2931 local_irq_enable();
2933 netif_receive_skb(skb);
2934 } while (++work < quota && jiffies == start_time);
2936 return work;
2940 * __napi_schedule - schedule for receive
2941 * @n: entry to schedule
2943 * The entry's receive function will be scheduled to run
2945 void __napi_schedule(struct napi_struct *n)
2947 unsigned long flags;
2949 local_irq_save(flags);
2950 list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
2951 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2952 local_irq_restore(flags);
2954 EXPORT_SYMBOL(__napi_schedule);
2956 void __napi_complete(struct napi_struct *n)
2958 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
2959 BUG_ON(n->gro_list);
2961 list_del(&n->poll_list);
2962 smp_mb__before_clear_bit();
2963 clear_bit(NAPI_STATE_SCHED, &n->state);
2965 EXPORT_SYMBOL(__napi_complete);
2967 void napi_complete(struct napi_struct *n)
2969 unsigned long flags;
2972 * don't let napi dequeue from the cpu poll list
2973 * just in case its running on a different cpu
2975 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
2976 return;
2978 napi_gro_flush(n);
2979 local_irq_save(flags);
2980 __napi_complete(n);
2981 local_irq_restore(flags);
2983 EXPORT_SYMBOL(napi_complete);
2985 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
2986 int (*poll)(struct napi_struct *, int), int weight)
2988 INIT_LIST_HEAD(&napi->poll_list);
2989 napi->gro_count = 0;
2990 napi->gro_list = NULL;
2991 napi->skb = NULL;
2992 napi->poll = poll;
2993 napi->weight = weight;
2994 list_add(&napi->dev_list, &dev->napi_list);
2995 napi->dev = dev;
2996 #ifdef CONFIG_NETPOLL
2997 spin_lock_init(&napi->poll_lock);
2998 napi->poll_owner = -1;
2999 #endif
3000 set_bit(NAPI_STATE_SCHED, &napi->state);
3002 EXPORT_SYMBOL(netif_napi_add);
3004 void netif_napi_del(struct napi_struct *napi)
3006 struct sk_buff *skb, *next;
3008 list_del_init(&napi->dev_list);
3009 napi_free_frags(napi);
3011 for (skb = napi->gro_list; skb; skb = next) {
3012 next = skb->next;
3013 skb->next = NULL;
3014 kfree_skb(skb);
3017 napi->gro_list = NULL;
3018 napi->gro_count = 0;
3020 EXPORT_SYMBOL(netif_napi_del);
3023 static void net_rx_action(struct softirq_action *h)
3025 struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
3026 unsigned long time_limit = jiffies + 2;
3027 int budget = netdev_budget;
3028 void *have;
3030 local_irq_disable();
3032 while (!list_empty(list)) {
3033 struct napi_struct *n;
3034 int work, weight;
3036 /* If softirq window is exhuasted then punt.
3037 * Allow this to run for 2 jiffies since which will allow
3038 * an average latency of 1.5/HZ.
3040 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3041 goto softnet_break;
3043 local_irq_enable();
3045 /* Even though interrupts have been re-enabled, this
3046 * access is safe because interrupts can only add new
3047 * entries to the tail of this list, and only ->poll()
3048 * calls can remove this head entry from the list.
3050 n = list_first_entry(list, struct napi_struct, poll_list);
3052 have = netpoll_poll_lock(n);
3054 weight = n->weight;
3056 /* This NAPI_STATE_SCHED test is for avoiding a race
3057 * with netpoll's poll_napi(). Only the entity which
3058 * obtains the lock and sees NAPI_STATE_SCHED set will
3059 * actually make the ->poll() call. Therefore we avoid
3060 * accidently calling ->poll() when NAPI is not scheduled.
3062 work = 0;
3063 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3064 work = n->poll(n, weight);
3065 trace_napi_poll(n);
3068 WARN_ON_ONCE(work > weight);
3070 budget -= work;
3072 local_irq_disable();
3074 /* Drivers must not modify the NAPI state if they
3075 * consume the entire weight. In such cases this code
3076 * still "owns" the NAPI instance and therefore can
3077 * move the instance around on the list at-will.
3079 if (unlikely(work == weight)) {
3080 if (unlikely(napi_disable_pending(n))) {
3081 local_irq_enable();
3082 napi_complete(n);
3083 local_irq_disable();
3084 } else
3085 list_move_tail(&n->poll_list, list);
3088 netpoll_poll_unlock(have);
3090 out:
3091 local_irq_enable();
3093 #ifdef CONFIG_NET_DMA
3095 * There may not be any more sk_buffs coming right now, so push
3096 * any pending DMA copies to hardware
3098 dma_issue_pending_all();
3099 #endif
3101 return;
3103 softnet_break:
3104 __get_cpu_var(netdev_rx_stat).time_squeeze++;
3105 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3106 goto out;
3109 static gifconf_func_t *gifconf_list[NPROTO];
3112 * register_gifconf - register a SIOCGIF handler
3113 * @family: Address family
3114 * @gifconf: Function handler
3116 * Register protocol dependent address dumping routines. The handler
3117 * that is passed must not be freed or reused until it has been replaced
3118 * by another handler.
3120 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3122 if (family >= NPROTO)
3123 return -EINVAL;
3124 gifconf_list[family] = gifconf;
3125 return 0;
3127 EXPORT_SYMBOL(register_gifconf);
3131 * Map an interface index to its name (SIOCGIFNAME)
3135 * We need this ioctl for efficient implementation of the
3136 * if_indextoname() function required by the IPv6 API. Without
3137 * it, we would have to search all the interfaces to find a
3138 * match. --pb
3141 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3143 struct net_device *dev;
3144 struct ifreq ifr;
3147 * Fetch the caller's info block.
3150 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3151 return -EFAULT;
3153 rcu_read_lock();
3154 dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3155 if (!dev) {
3156 rcu_read_unlock();
3157 return -ENODEV;
3160 strcpy(ifr.ifr_name, dev->name);
3161 rcu_read_unlock();
3163 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3164 return -EFAULT;
3165 return 0;
3169 * Perform a SIOCGIFCONF call. This structure will change
3170 * size eventually, and there is nothing I can do about it.
3171 * Thus we will need a 'compatibility mode'.
3174 static int dev_ifconf(struct net *net, char __user *arg)
3176 struct ifconf ifc;
3177 struct net_device *dev;
3178 char __user *pos;
3179 int len;
3180 int total;
3181 int i;
3184 * Fetch the caller's info block.
3187 if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3188 return -EFAULT;
3190 pos = ifc.ifc_buf;
3191 len = ifc.ifc_len;
3194 * Loop over the interfaces, and write an info block for each.
3197 total = 0;
3198 for_each_netdev(net, dev) {
3199 for (i = 0; i < NPROTO; i++) {
3200 if (gifconf_list[i]) {
3201 int done;
3202 if (!pos)
3203 done = gifconf_list[i](dev, NULL, 0);
3204 else
3205 done = gifconf_list[i](dev, pos + total,
3206 len - total);
3207 if (done < 0)
3208 return -EFAULT;
3209 total += done;
3215 * All done. Write the updated control block back to the caller.
3217 ifc.ifc_len = total;
3220 * Both BSD and Solaris return 0 here, so we do too.
3222 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3225 #ifdef CONFIG_PROC_FS
3227 * This is invoked by the /proc filesystem handler to display a device
3228 * in detail.
3230 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3231 __acquires(RCU)
3233 struct net *net = seq_file_net(seq);
3234 loff_t off;
3235 struct net_device *dev;
3237 rcu_read_lock();
3238 if (!*pos)
3239 return SEQ_START_TOKEN;
3241 off = 1;
3242 for_each_netdev_rcu(net, dev)
3243 if (off++ == *pos)
3244 return dev;
3246 return NULL;
3249 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3251 struct net_device *dev = (v == SEQ_START_TOKEN) ?
3252 first_net_device(seq_file_net(seq)) :
3253 next_net_device((struct net_device *)v);
3255 ++*pos;
3256 return rcu_dereference(dev);
3259 void dev_seq_stop(struct seq_file *seq, void *v)
3260 __releases(RCU)
3262 rcu_read_unlock();
3265 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3267 const struct net_device_stats *stats = dev_get_stats(dev);
3269 seq_printf(seq, "%6s: %7lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
3270 "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
3271 dev->name, stats->rx_bytes, stats->rx_packets,
3272 stats->rx_errors,
3273 stats->rx_dropped + stats->rx_missed_errors,
3274 stats->rx_fifo_errors,
3275 stats->rx_length_errors + stats->rx_over_errors +
3276 stats->rx_crc_errors + stats->rx_frame_errors,
3277 stats->rx_compressed, stats->multicast,
3278 stats->tx_bytes, stats->tx_packets,
3279 stats->tx_errors, stats->tx_dropped,
3280 stats->tx_fifo_errors, stats->collisions,
3281 stats->tx_carrier_errors +
3282 stats->tx_aborted_errors +
3283 stats->tx_window_errors +
3284 stats->tx_heartbeat_errors,
3285 stats->tx_compressed);
3289 * Called from the PROCfs module. This now uses the new arbitrary sized
3290 * /proc/net interface to create /proc/net/dev
3292 static int dev_seq_show(struct seq_file *seq, void *v)
3294 if (v == SEQ_START_TOKEN)
3295 seq_puts(seq, "Inter-| Receive "
3296 " | Transmit\n"
3297 " face |bytes packets errs drop fifo frame "
3298 "compressed multicast|bytes packets errs "
3299 "drop fifo colls carrier compressed\n");
3300 else
3301 dev_seq_printf_stats(seq, v);
3302 return 0;
3305 static struct netif_rx_stats *softnet_get_online(loff_t *pos)
3307 struct netif_rx_stats *rc = NULL;
3309 while (*pos < nr_cpu_ids)
3310 if (cpu_online(*pos)) {
3311 rc = &per_cpu(netdev_rx_stat, *pos);
3312 break;
3313 } else
3314 ++*pos;
3315 return rc;
3318 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
3320 return softnet_get_online(pos);
3323 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3325 ++*pos;
3326 return softnet_get_online(pos);
/* seq_file ->stop: nothing to release. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
3333 static int softnet_seq_show(struct seq_file *seq, void *v)
3335 struct netif_rx_stats *s = v;
3337 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
3338 s->total, s->dropped, s->time_squeeze, 0,
3339 0, 0, 0, 0, /* was fastroute */
3340 s->cpu_collision);
3341 return 0;
3344 static const struct seq_operations dev_seq_ops = {
3345 .start = dev_seq_start,
3346 .next = dev_seq_next,
3347 .stop = dev_seq_stop,
3348 .show = dev_seq_show,
3351 static int dev_seq_open(struct inode *inode, struct file *file)
3353 return seq_open_net(inode, file, &dev_seq_ops,
3354 sizeof(struct seq_net_private));
3357 static const struct file_operations dev_seq_fops = {
3358 .owner = THIS_MODULE,
3359 .open = dev_seq_open,
3360 .read = seq_read,
3361 .llseek = seq_lseek,
3362 .release = seq_release_net,
3365 static const struct seq_operations softnet_seq_ops = {
3366 .start = softnet_seq_start,
3367 .next = softnet_seq_next,
3368 .stop = softnet_seq_stop,
3369 .show = softnet_seq_show,
3372 static int softnet_seq_open(struct inode *inode, struct file *file)
3374 return seq_open(file, &softnet_seq_ops);
3377 static const struct file_operations softnet_seq_fops = {
3378 .owner = THIS_MODULE,
3379 .open = softnet_seq_open,
3380 .read = seq_read,
3381 .llseek = seq_lseek,
3382 .release = seq_release,
3385 static void *ptype_get_idx(loff_t pos)
3387 struct packet_type *pt = NULL;
3388 loff_t i = 0;
3389 int t;
3391 list_for_each_entry_rcu(pt, &ptype_all, list) {
3392 if (i == pos)
3393 return pt;
3394 ++i;
3397 for (t = 0; t < PTYPE_HASH_SIZE; t++) {
3398 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3399 if (i == pos)
3400 return pt;
3401 ++i;
3404 return NULL;
3407 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
3408 __acquires(RCU)
3410 rcu_read_lock();
3411 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3414 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3416 struct packet_type *pt;
3417 struct list_head *nxt;
3418 int hash;
3420 ++*pos;
3421 if (v == SEQ_START_TOKEN)
3422 return ptype_get_idx(0);
3424 pt = v;
3425 nxt = pt->list.next;
3426 if (pt->type == htons(ETH_P_ALL)) {
3427 if (nxt != &ptype_all)
3428 goto found;
3429 hash = 0;
3430 nxt = ptype_base[0].next;
3431 } else
3432 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
3434 while (nxt == &ptype_base[hash]) {
3435 if (++hash >= PTYPE_HASH_SIZE)
3436 return NULL;
3437 nxt = ptype_base[hash].next;
3439 found:
3440 return list_entry(nxt, struct packet_type, list);
3443 static void ptype_seq_stop(struct seq_file *seq, void *v)
3444 __releases(RCU)
3446 rcu_read_unlock();
3449 static int ptype_seq_show(struct seq_file *seq, void *v)
3451 struct packet_type *pt = v;
3453 if (v == SEQ_START_TOKEN)
3454 seq_puts(seq, "Type Device Function\n");
3455 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
3456 if (pt->type == htons(ETH_P_ALL))
3457 seq_puts(seq, "ALL ");
3458 else
3459 seq_printf(seq, "%04x", ntohs(pt->type));
3461 seq_printf(seq, " %-8s %pF\n",
3462 pt->dev ? pt->dev->name : "", pt->func);
3465 return 0;
3468 static const struct seq_operations ptype_seq_ops = {
3469 .start = ptype_seq_start,
3470 .next = ptype_seq_next,
3471 .stop = ptype_seq_stop,
3472 .show = ptype_seq_show,
3475 static int ptype_seq_open(struct inode *inode, struct file *file)
3477 return seq_open_net(inode, file, &ptype_seq_ops,
3478 sizeof(struct seq_net_private));
3481 static const struct file_operations ptype_seq_fops = {
3482 .owner = THIS_MODULE,
3483 .open = ptype_seq_open,
3484 .read = seq_read,
3485 .llseek = seq_lseek,
3486 .release = seq_release_net,
3490 static int __net_init dev_proc_net_init(struct net *net)
3492 int rc = -ENOMEM;
3494 if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
3495 goto out;
3496 if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
3497 goto out_dev;
3498 if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
3499 goto out_softnet;
3501 if (wext_proc_init(net))
3502 goto out_ptype;
3503 rc = 0;
3504 out:
3505 return rc;
3506 out_ptype:
3507 proc_net_remove(net, "ptype");
3508 out_softnet:
3509 proc_net_remove(net, "softnet_stat");
3510 out_dev:
3511 proc_net_remove(net, "dev");
3512 goto out;
3515 static void __net_exit dev_proc_net_exit(struct net *net)
3517 wext_proc_exit(net);
3519 proc_net_remove(net, "ptype");
3520 proc_net_remove(net, "softnet_stat");
3521 proc_net_remove(net, "dev");
3524 static struct pernet_operations __net_initdata dev_proc_ops = {
3525 .init = dev_proc_net_init,
3526 .exit = dev_proc_net_exit,
3529 static int __init dev_proc_init(void)
3531 return register_pernet_subsys(&dev_proc_ops);
3533 #else
3534 #define dev_proc_init() 0
3535 #endif /* CONFIG_PROC_FS */
3539 * netdev_set_master - set up master/slave pair
3540 * @slave: slave device
3541 * @master: new master device
3543 * Changes the master device of the slave. Pass %NULL to break the
3544 * bonding. The caller must hold the RTNL semaphore. On a failure
3545 * a negative errno code is returned. On success the reference counts
3546 * are adjusted, %RTM_NEWLINK is sent to the routing socket and the
3547 * function returns zero.
3549 int netdev_set_master(struct net_device *slave, struct net_device *master)
3551 struct net_device *old = slave->master;
3553 ASSERT_RTNL();
3555 if (master) {
3556 if (old)
3557 return -EBUSY;
3558 dev_hold(master);
3561 slave->master = master;
3563 synchronize_net();
3565 if (old)
3566 dev_put(old);
3568 if (master)
3569 slave->flags |= IFF_SLAVE;
3570 else
3571 slave->flags &= ~IFF_SLAVE;
3573 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
3574 return 0;
3576 EXPORT_SYMBOL(netdev_set_master);
3578 static void dev_change_rx_flags(struct net_device *dev, int flags)
3580 const struct net_device_ops *ops = dev->netdev_ops;
3582 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
3583 ops->ndo_change_rx_flags(dev, flags);
3586 static int __dev_set_promiscuity(struct net_device *dev, int inc)
3588 unsigned short old_flags = dev->flags;
3589 uid_t uid;
3590 gid_t gid;
3592 ASSERT_RTNL();
3594 dev->flags |= IFF_PROMISC;
3595 dev->promiscuity += inc;
3596 if (dev->promiscuity == 0) {
3598 * Avoid overflow.
3599 * If inc causes overflow, untouch promisc and return error.
3601 if (inc < 0)
3602 dev->flags &= ~IFF_PROMISC;
3603 else {
3604 dev->promiscuity -= inc;
3605 printk(KERN_WARNING "%s: promiscuity touches roof, "
3606 "set promiscuity failed, promiscuity feature "
3607 "of device might be broken.\n", dev->name);
3608 return -EOVERFLOW;
3611 if (dev->flags != old_flags) {
3612 printk(KERN_INFO "device %s %s promiscuous mode\n",
3613 dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
3614 "left");
3615 if (audit_enabled) {
3616 current_uid_gid(&uid, &gid);
3617 audit_log(current->audit_context, GFP_ATOMIC,
3618 AUDIT_ANOM_PROMISCUOUS,
3619 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
3620 dev->name, (dev->flags & IFF_PROMISC),
3621 (old_flags & IFF_PROMISC),
3622 audit_get_loginuid(current),
3623 uid, gid,
3624 audit_get_sessionid(current));
3627 dev_change_rx_flags(dev, IFF_PROMISC);
3629 return 0;
3633 * dev_set_promiscuity - update promiscuity count on a device
3634 * @dev: device
3635 * @inc: modifier
3637 * Add or remove promiscuity from a device. While the count in the device
3638 * remains above zero the interface remains promiscuous. Once it hits zero
3639 * the device reverts back to normal filtering operation. A negative inc
3640 * value is used to drop promiscuity on the device.
3641 * Return 0 if successful or a negative errno code on error.
3643 int dev_set_promiscuity(struct net_device *dev, int inc)
3645 unsigned short old_flags = dev->flags;
3646 int err;
3648 err = __dev_set_promiscuity(dev, inc);
3649 if (err < 0)
3650 return err;
3651 if (dev->flags != old_flags)
3652 dev_set_rx_mode(dev);
3653 return err;
3655 EXPORT_SYMBOL(dev_set_promiscuity);
3658 * dev_set_allmulti - update allmulti count on a device
3659 * @dev: device
3660 * @inc: modifier
3662 * Add or remove reception of all multicast frames to a device. While the
3663 * count in the device remains above zero the interface remains listening
3664 * to all interfaces. Once it hits zero the device reverts back to normal
3665 * filtering operation. A negative @inc value is used to drop the counter
3666 * when releasing a resource needing all multicasts.
3667 * Return 0 if successful or a negative errno code on error.
3670 int dev_set_allmulti(struct net_device *dev, int inc)
3672 unsigned short old_flags = dev->flags;
3674 ASSERT_RTNL();
3676 dev->flags |= IFF_ALLMULTI;
3677 dev->allmulti += inc;
3678 if (dev->allmulti == 0) {
3680 * Avoid overflow.
3681 * If inc causes overflow, untouch allmulti and return error.
3683 if (inc < 0)
3684 dev->flags &= ~IFF_ALLMULTI;
3685 else {
3686 dev->allmulti -= inc;
3687 printk(KERN_WARNING "%s: allmulti touches roof, "
3688 "set allmulti failed, allmulti feature of "
3689 "device might be broken.\n", dev->name);
3690 return -EOVERFLOW;
3693 if (dev->flags ^ old_flags) {
3694 dev_change_rx_flags(dev, IFF_ALLMULTI);
3695 dev_set_rx_mode(dev);
3697 return 0;
3699 EXPORT_SYMBOL(dev_set_allmulti);
3702 * Upload unicast and multicast address lists to device and
3703 * configure RX filtering. When the device doesn't support unicast
3704 * filtering it is put in promiscuous mode while unicast addresses
3705 * are present.
3707 void __dev_set_rx_mode(struct net_device *dev)
3709 const struct net_device_ops *ops = dev->netdev_ops;
3711 /* dev_open will call this function so the list will stay sane. */
3712 if (!(dev->flags&IFF_UP))
3713 return;
3715 if (!netif_device_present(dev))
3716 return;
3718 if (ops->ndo_set_rx_mode)
3719 ops->ndo_set_rx_mode(dev);
3720 else {
3721 /* Unicast addresses changes may only happen under the rtnl,
3722 * therefore calling __dev_set_promiscuity here is safe.
3724 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
3725 __dev_set_promiscuity(dev, 1);
3726 dev->uc_promisc = 1;
3727 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
3728 __dev_set_promiscuity(dev, -1);
3729 dev->uc_promisc = 0;
3732 if (ops->ndo_set_multicast_list)
3733 ops->ndo_set_multicast_list(dev);
/*
 *	Locked wrapper around __dev_set_rx_mode(): reprogram the RX filter
 *	of @dev while holding its address-list lock with bottom halves
 *	disabled.
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}
3744 /* hw addresses list handling functions */
3746 static int __hw_addr_add(struct netdev_hw_addr_list *list, unsigned char *addr,
3747 int addr_len, unsigned char addr_type)
3749 struct netdev_hw_addr *ha;
3750 int alloc_size;
3752 if (addr_len > MAX_ADDR_LEN)
3753 return -EINVAL;
3755 list_for_each_entry(ha, &list->list, list) {
3756 if (!memcmp(ha->addr, addr, addr_len) &&
3757 ha->type == addr_type) {
3758 ha->refcount++;
3759 return 0;
3764 alloc_size = sizeof(*ha);
3765 if (alloc_size < L1_CACHE_BYTES)
3766 alloc_size = L1_CACHE_BYTES;
3767 ha = kmalloc(alloc_size, GFP_ATOMIC);
3768 if (!ha)
3769 return -ENOMEM;
3770 memcpy(ha->addr, addr, addr_len);
3771 ha->type = addr_type;
3772 ha->refcount = 1;
3773 ha->synced = false;
3774 list_add_tail_rcu(&ha->list, &list->list);
3775 list->count++;
3776 return 0;
3779 static void ha_rcu_free(struct rcu_head *head)
3781 struct netdev_hw_addr *ha;
3783 ha = container_of(head, struct netdev_hw_addr, rcu_head);
3784 kfree(ha);
3787 static int __hw_addr_del(struct netdev_hw_addr_list *list, unsigned char *addr,
3788 int addr_len, unsigned char addr_type)
3790 struct netdev_hw_addr *ha;
3792 list_for_each_entry(ha, &list->list, list) {
3793 if (!memcmp(ha->addr, addr, addr_len) &&
3794 (ha->type == addr_type || !addr_type)) {
3795 if (--ha->refcount)
3796 return 0;
3797 list_del_rcu(&ha->list);
3798 call_rcu(&ha->rcu_head, ha_rcu_free);
3799 list->count--;
3800 return 0;
3803 return -ENOENT;
3806 static int __hw_addr_add_multiple(struct netdev_hw_addr_list *to_list,
3807 struct netdev_hw_addr_list *from_list,
3808 int addr_len,
3809 unsigned char addr_type)
3811 int err;
3812 struct netdev_hw_addr *ha, *ha2;
3813 unsigned char type;
3815 list_for_each_entry(ha, &from_list->list, list) {
3816 type = addr_type ? addr_type : ha->type;
3817 err = __hw_addr_add(to_list, ha->addr, addr_len, type);
3818 if (err)
3819 goto unroll;
3821 return 0;
3823 unroll:
3824 list_for_each_entry(ha2, &from_list->list, list) {
3825 if (ha2 == ha)
3826 break;
3827 type = addr_type ? addr_type : ha2->type;
3828 __hw_addr_del(to_list, ha2->addr, addr_len, type);
3830 return err;
3833 static void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list,
3834 struct netdev_hw_addr_list *from_list,
3835 int addr_len,
3836 unsigned char addr_type)
3838 struct netdev_hw_addr *ha;
3839 unsigned char type;
3841 list_for_each_entry(ha, &from_list->list, list) {
3842 type = addr_type ? addr_type : ha->type;
3843 __hw_addr_del(to_list, ha->addr, addr_len, addr_type);
3847 static int __hw_addr_sync(struct netdev_hw_addr_list *to_list,
3848 struct netdev_hw_addr_list *from_list,
3849 int addr_len)
3851 int err = 0;
3852 struct netdev_hw_addr *ha, *tmp;
3854 list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
3855 if (!ha->synced) {
3856 err = __hw_addr_add(to_list, ha->addr,
3857 addr_len, ha->type);
3858 if (err)
3859 break;
3860 ha->synced = true;
3861 ha->refcount++;
3862 } else if (ha->refcount == 1) {
3863 __hw_addr_del(to_list, ha->addr, addr_len, ha->type);
3864 __hw_addr_del(from_list, ha->addr, addr_len, ha->type);
3867 return err;
3870 static void __hw_addr_unsync(struct netdev_hw_addr_list *to_list,
3871 struct netdev_hw_addr_list *from_list,
3872 int addr_len)
3874 struct netdev_hw_addr *ha, *tmp;
3876 list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
3877 if (ha->synced) {
3878 __hw_addr_del(to_list, ha->addr,
3879 addr_len, ha->type);
3880 ha->synced = false;
3881 __hw_addr_del(from_list, ha->addr,
3882 addr_len, ha->type);
3887 static void __hw_addr_flush(struct netdev_hw_addr_list *list)
3889 struct netdev_hw_addr *ha, *tmp;
3891 list_for_each_entry_safe(ha, tmp, &list->list, list) {
3892 list_del_rcu(&ha->list);
3893 call_rcu(&ha->rcu_head, ha_rcu_free);
3895 list->count = 0;
3898 static void __hw_addr_init(struct netdev_hw_addr_list *list)
3900 INIT_LIST_HEAD(&list->list);
3901 list->count = 0;
3904 /* Device addresses handling functions */
3906 static void dev_addr_flush(struct net_device *dev)
3908 /* rtnl_mutex must be held here */
3910 __hw_addr_flush(&dev->dev_addrs);
3911 dev->dev_addr = NULL;
3914 static int dev_addr_init(struct net_device *dev)
3916 unsigned char addr[MAX_ADDR_LEN];
3917 struct netdev_hw_addr *ha;
3918 int err;
3920 /* rtnl_mutex must be held here */
3922 __hw_addr_init(&dev->dev_addrs);
3923 memset(addr, 0, sizeof(addr));
3924 err = __hw_addr_add(&dev->dev_addrs, addr, sizeof(addr),
3925 NETDEV_HW_ADDR_T_LAN);
3926 if (!err) {
3928 * Get the first (previously created) address from the list
3929 * and set dev_addr pointer to this location.
3931 ha = list_first_entry(&dev->dev_addrs.list,
3932 struct netdev_hw_addr, list);
3933 dev->dev_addr = ha->addr;
3935 return err;
3939 * dev_addr_add - Add a device address
3940 * @dev: device
3941 * @addr: address to add
3942 * @addr_type: address type
3944 * Add a device address to the device or increase the reference count if
3945 * it already exists.
3947 * The caller must hold the rtnl_mutex.
3949 int dev_addr_add(struct net_device *dev, unsigned char *addr,
3950 unsigned char addr_type)
3952 int err;
3954 ASSERT_RTNL();
3956 err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type);
3957 if (!err)
3958 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3959 return err;
3961 EXPORT_SYMBOL(dev_addr_add);
3964 * dev_addr_del - Release a device address.
3965 * @dev: device
3966 * @addr: address to delete
3967 * @addr_type: address type
3969 * Release reference to a device address and remove it from the device
3970 * if the reference count drops to zero.
3972 * The caller must hold the rtnl_mutex.
3974 int dev_addr_del(struct net_device *dev, unsigned char *addr,
3975 unsigned char addr_type)
3977 int err;
3978 struct netdev_hw_addr *ha;
3980 ASSERT_RTNL();
3983 * We can not remove the first address from the list because
3984 * dev->dev_addr points to that.
3986 ha = list_first_entry(&dev->dev_addrs.list,
3987 struct netdev_hw_addr, list);
3988 if (ha->addr == dev->dev_addr && ha->refcount == 1)
3989 return -ENOENT;
3991 err = __hw_addr_del(&dev->dev_addrs, addr, dev->addr_len,
3992 addr_type);
3993 if (!err)
3994 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3995 return err;
3997 EXPORT_SYMBOL(dev_addr_del);
4000 * dev_addr_add_multiple - Add device addresses from another device
4001 * @to_dev: device to which addresses will be added
4002 * @from_dev: device from which addresses will be added
4003 * @addr_type: address type - 0 means type will be used from from_dev
4005 * Add device addresses of the one device to another.
4007 * The caller must hold the rtnl_mutex.
4009 int dev_addr_add_multiple(struct net_device *to_dev,
4010 struct net_device *from_dev,
4011 unsigned char addr_type)
4013 int err;
4015 ASSERT_RTNL();
4017 if (from_dev->addr_len != to_dev->addr_len)
4018 return -EINVAL;
4019 err = __hw_addr_add_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs,
4020 to_dev->addr_len, addr_type);
4021 if (!err)
4022 call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
4023 return err;
4025 EXPORT_SYMBOL(dev_addr_add_multiple);
4028 * dev_addr_del_multiple - Delete device addresses by another device
4029 * @to_dev: device where the addresses will be deleted
4030 * @from_dev: device by which addresses the addresses will be deleted
4031 * @addr_type: address type - 0 means type will used from from_dev
4033 * Deletes addresses in to device by the list of addresses in from device.
4035 * The caller must hold the rtnl_mutex.
4037 int dev_addr_del_multiple(struct net_device *to_dev,
4038 struct net_device *from_dev,
4039 unsigned char addr_type)
4041 ASSERT_RTNL();
4043 if (from_dev->addr_len != to_dev->addr_len)
4044 return -EINVAL;
4045 __hw_addr_del_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs,
4046 to_dev->addr_len, addr_type);
4047 call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
4048 return 0;
4050 EXPORT_SYMBOL(dev_addr_del_multiple);
4052 /* multicast addresses handling functions */
4054 int __dev_addr_delete(struct dev_addr_list **list, int *count,
4055 void *addr, int alen, int glbl)
4057 struct dev_addr_list *da;
4059 for (; (da = *list) != NULL; list = &da->next) {
4060 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
4061 alen == da->da_addrlen) {
4062 if (glbl) {
4063 int old_glbl = da->da_gusers;
4064 da->da_gusers = 0;
4065 if (old_glbl == 0)
4066 break;
4068 if (--da->da_users)
4069 return 0;
4071 *list = da->next;
4072 kfree(da);
4073 (*count)--;
4074 return 0;
4077 return -ENOENT;
4080 int __dev_addr_add(struct dev_addr_list **list, int *count,
4081 void *addr, int alen, int glbl)
4083 struct dev_addr_list *da;
4085 for (da = *list; da != NULL; da = da->next) {
4086 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
4087 da->da_addrlen == alen) {
4088 if (glbl) {
4089 int old_glbl = da->da_gusers;
4090 da->da_gusers = 1;
4091 if (old_glbl)
4092 return 0;
4094 da->da_users++;
4095 return 0;
4099 da = kzalloc(sizeof(*da), GFP_ATOMIC);
4100 if (da == NULL)
4101 return -ENOMEM;
4102 memcpy(da->da_addr, addr, alen);
4103 da->da_addrlen = alen;
4104 da->da_users = 1;
4105 da->da_gusers = glbl ? 1 : 0;
4106 da->next = *list;
4107 *list = da;
4108 (*count)++;
4109 return 0;
4113 * dev_unicast_delete - Release secondary unicast address.
4114 * @dev: device
4115 * @addr: address to delete
4117 * Release reference to a secondary unicast address and remove it
4118 * from the device if the reference count drops to zero.
4120 * The caller must hold the rtnl_mutex.
4122 int dev_unicast_delete(struct net_device *dev, void *addr)
4124 int err;
4126 ASSERT_RTNL();
4128 netif_addr_lock_bh(dev);
4129 err = __hw_addr_del(&dev->uc, addr, dev->addr_len,
4130 NETDEV_HW_ADDR_T_UNICAST);
4131 if (!err)
4132 __dev_set_rx_mode(dev);
4133 netif_addr_unlock_bh(dev);
4134 return err;
4136 EXPORT_SYMBOL(dev_unicast_delete);
4139 * dev_unicast_add - add a secondary unicast address
4140 * @dev: device
4141 * @addr: address to add
4143 * Add a secondary unicast address to the device or increase
4144 * the reference count if it already exists.
4146 * The caller must hold the rtnl_mutex.
4148 int dev_unicast_add(struct net_device *dev, void *addr)
4150 int err;
4152 ASSERT_RTNL();
4154 netif_addr_lock_bh(dev);
4155 err = __hw_addr_add(&dev->uc, addr, dev->addr_len,
4156 NETDEV_HW_ADDR_T_UNICAST);
4157 if (!err)
4158 __dev_set_rx_mode(dev);
4159 netif_addr_unlock_bh(dev);
4160 return err;
4162 EXPORT_SYMBOL(dev_unicast_add);
4164 int __dev_addr_sync(struct dev_addr_list **to, int *to_count,
4165 struct dev_addr_list **from, int *from_count)
4167 struct dev_addr_list *da, *next;
4168 int err = 0;
4170 da = *from;
4171 while (da != NULL) {
4172 next = da->next;
4173 if (!da->da_synced) {
4174 err = __dev_addr_add(to, to_count,
4175 da->da_addr, da->da_addrlen, 0);
4176 if (err < 0)
4177 break;
4178 da->da_synced = 1;
4179 da->da_users++;
4180 } else if (da->da_users == 1) {
4181 __dev_addr_delete(to, to_count,
4182 da->da_addr, da->da_addrlen, 0);
4183 __dev_addr_delete(from, from_count,
4184 da->da_addr, da->da_addrlen, 0);
4186 da = next;
4188 return err;
4190 EXPORT_SYMBOL_GPL(__dev_addr_sync);
4192 void __dev_addr_unsync(struct dev_addr_list **to, int *to_count,
4193 struct dev_addr_list **from, int *from_count)
4195 struct dev_addr_list *da, *next;
4197 da = *from;
4198 while (da != NULL) {
4199 next = da->next;
4200 if (da->da_synced) {
4201 __dev_addr_delete(to, to_count,
4202 da->da_addr, da->da_addrlen, 0);
4203 da->da_synced = 0;
4204 __dev_addr_delete(from, from_count,
4205 da->da_addr, da->da_addrlen, 0);
4207 da = next;
4210 EXPORT_SYMBOL_GPL(__dev_addr_unsync);
4213 * dev_unicast_sync - Synchronize device's unicast list to another device
4214 * @to: destination device
4215 * @from: source device
4217 * Add newly added addresses to the destination device and release
4218 * addresses that have no users left. The source device must be
4219 * locked by netif_tx_lock_bh.
4221 * This function is intended to be called from the dev->set_rx_mode
4222 * function of layered software devices.
4224 int dev_unicast_sync(struct net_device *to, struct net_device *from)
4226 int err = 0;
4228 if (to->addr_len != from->addr_len)
4229 return -EINVAL;
4231 netif_addr_lock_bh(to);
4232 err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len);
4233 if (!err)
4234 __dev_set_rx_mode(to);
4235 netif_addr_unlock_bh(to);
4236 return err;
4238 EXPORT_SYMBOL(dev_unicast_sync);
4241 * dev_unicast_unsync - Remove synchronized addresses from the destination device
4242 * @to: destination device
4243 * @from: source device
4245 * Remove all addresses that were added to the destination device by
4246 * dev_unicast_sync(). This function is intended to be called from the
4247 * dev->stop function of layered software devices.
4249 void dev_unicast_unsync(struct net_device *to, struct net_device *from)
4251 if (to->addr_len != from->addr_len)
4252 return;
4254 netif_addr_lock_bh(from);
4255 netif_addr_lock(to);
4256 __hw_addr_unsync(&to->uc, &from->uc, to->addr_len);
4257 __dev_set_rx_mode(to);
4258 netif_addr_unlock(to);
4259 netif_addr_unlock_bh(from);
4261 EXPORT_SYMBOL(dev_unicast_unsync);
4263 static void dev_unicast_flush(struct net_device *dev)
4265 netif_addr_lock_bh(dev);
4266 __hw_addr_flush(&dev->uc);
4267 netif_addr_unlock_bh(dev);
4270 static void dev_unicast_init(struct net_device *dev)
4272 __hw_addr_init(&dev->uc);
4276 static void __dev_addr_discard(struct dev_addr_list **list)
4278 struct dev_addr_list *tmp;
4280 while (*list != NULL) {
4281 tmp = *list;
4282 *list = tmp->next;
4283 if (tmp->da_users > tmp->da_gusers)
4284 printk("__dev_addr_discard: address leakage! "
4285 "da_users=%d\n", tmp->da_users);
4286 kfree(tmp);
4290 static void dev_addr_discard(struct net_device *dev)
4292 netif_addr_lock_bh(dev);
4294 __dev_addr_discard(&dev->mc_list);
4295 netdev_mc_count(dev) = 0;
4297 netif_addr_unlock_bh(dev);
4301 * dev_get_flags - get flags reported to userspace
4302 * @dev: device
4304 * Get the combination of flag bits exported through APIs to userspace.
4306 unsigned dev_get_flags(const struct net_device *dev)
4308 unsigned flags;
4310 flags = (dev->flags & ~(IFF_PROMISC |
4311 IFF_ALLMULTI |
4312 IFF_RUNNING |
4313 IFF_LOWER_UP |
4314 IFF_DORMANT)) |
4315 (dev->gflags & (IFF_PROMISC |
4316 IFF_ALLMULTI));
4318 if (netif_running(dev)) {
4319 if (netif_oper_up(dev))
4320 flags |= IFF_RUNNING;
4321 if (netif_carrier_ok(dev))
4322 flags |= IFF_LOWER_UP;
4323 if (netif_dormant(dev))
4324 flags |= IFF_DORMANT;
4327 return flags;
4329 EXPORT_SYMBOL(dev_get_flags);
4331 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4333 int old_flags = dev->flags;
4334 int ret;
4336 ASSERT_RTNL();
4339 * Set the flags on our device.
4342 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4343 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4344 IFF_AUTOMEDIA)) |
4345 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4346 IFF_ALLMULTI));
4349 * Load in the correct multicast list now the flags have changed.
4352 if ((old_flags ^ flags) & IFF_MULTICAST)
4353 dev_change_rx_flags(dev, IFF_MULTICAST);
4355 dev_set_rx_mode(dev);
4358 * Have we downed the interface. We handle IFF_UP ourselves
4359 * according to user attempts to set it, rather than blindly
4360 * setting it.
4363 ret = 0;
4364 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
4365 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4367 if (!ret)
4368 dev_set_rx_mode(dev);
4371 if ((flags ^ dev->gflags) & IFF_PROMISC) {
4372 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4374 dev->gflags ^= IFF_PROMISC;
4375 dev_set_promiscuity(dev, inc);
4378 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4379 is important. Some (broken) drivers set IFF_PROMISC, when
4380 IFF_ALLMULTI is requested not asking us and not reporting.
4382 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4383 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4385 dev->gflags ^= IFF_ALLMULTI;
4386 dev_set_allmulti(dev, inc);
4389 return ret;
4392 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4394 unsigned int changes = dev->flags ^ old_flags;
4396 if (changes & IFF_UP) {
4397 if (dev->flags & IFF_UP)
4398 call_netdevice_notifiers(NETDEV_UP, dev);
4399 else
4400 call_netdevice_notifiers(NETDEV_DOWN, dev);
4403 if (dev->flags & IFF_UP &&
4404 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4405 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4409 * dev_change_flags - change device settings
4410 * @dev: device
4411 * @flags: device state flags
4413 * Change settings on device based state flags. The flags are
4414 * in the userspace exported format.
4416 int dev_change_flags(struct net_device *dev, unsigned flags)
4418 int ret, changes;
4419 int old_flags = dev->flags;
4421 ret = __dev_change_flags(dev, flags);
4422 if (ret < 0)
4423 return ret;
4425 changes = old_flags ^ dev->flags;
4426 if (changes)
4427 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4429 __dev_notify_flags(dev, old_flags);
4430 return ret;
4432 EXPORT_SYMBOL(dev_change_flags);
4435 * dev_set_mtu - Change maximum transfer unit
4436 * @dev: device
4437 * @new_mtu: new transfer unit
4439 * Change the maximum transfer size of the network device.
4441 int dev_set_mtu(struct net_device *dev, int new_mtu)
4443 const struct net_device_ops *ops = dev->netdev_ops;
4444 int err;
4446 if (new_mtu == dev->mtu)
4447 return 0;
4449 /* MTU must be positive. */
4450 if (new_mtu < 0)
4451 return -EINVAL;
4453 if (!netif_device_present(dev))
4454 return -ENODEV;
4456 err = 0;
4457 if (ops->ndo_change_mtu)
4458 err = ops->ndo_change_mtu(dev, new_mtu);
4459 else
4460 dev->mtu = new_mtu;
4462 if (!err && dev->flags & IFF_UP)
4463 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4464 return err;
4466 EXPORT_SYMBOL(dev_set_mtu);
4469 * dev_set_mac_address - Change Media Access Control Address
4470 * @dev: device
4471 * @sa: new address
4473 * Change the hardware (MAC) address of the device
4475 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4477 const struct net_device_ops *ops = dev->netdev_ops;
4478 int err;
4480 if (!ops->ndo_set_mac_address)
4481 return -EOPNOTSUPP;
4482 if (sa->sa_family != dev->type)
4483 return -EINVAL;
4484 if (!netif_device_present(dev))
4485 return -ENODEV;
4486 err = ops->ndo_set_mac_address(dev, sa);
4487 if (!err)
4488 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4489 return err;
4491 EXPORT_SYMBOL(dev_set_mac_address);
4494 * Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4496 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4498 int err;
4499 struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4501 if (!dev)
4502 return -ENODEV;
4504 switch (cmd) {
4505 case SIOCGIFFLAGS: /* Get interface flags */
4506 ifr->ifr_flags = (short) dev_get_flags(dev);
4507 return 0;
4509 case SIOCGIFMETRIC: /* Get the metric on the interface
4510 (currently unused) */
4511 ifr->ifr_metric = 0;
4512 return 0;
4514 case SIOCGIFMTU: /* Get the MTU of a device */
4515 ifr->ifr_mtu = dev->mtu;
4516 return 0;
4518 case SIOCGIFHWADDR:
4519 if (!dev->addr_len)
4520 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4521 else
4522 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4523 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4524 ifr->ifr_hwaddr.sa_family = dev->type;
4525 return 0;
4527 case SIOCGIFSLAVE:
4528 err = -EINVAL;
4529 break;
4531 case SIOCGIFMAP:
4532 ifr->ifr_map.mem_start = dev->mem_start;
4533 ifr->ifr_map.mem_end = dev->mem_end;
4534 ifr->ifr_map.base_addr = dev->base_addr;
4535 ifr->ifr_map.irq = dev->irq;
4536 ifr->ifr_map.dma = dev->dma;
4537 ifr->ifr_map.port = dev->if_port;
4538 return 0;
4540 case SIOCGIFINDEX:
4541 ifr->ifr_ifindex = dev->ifindex;
4542 return 0;
4544 case SIOCGIFTXQLEN:
4545 ifr->ifr_qlen = dev->tx_queue_len;
4546 return 0;
4548 default:
4549 /* dev_ioctl() should ensure this case
4550 * is never reached
4552 WARN_ON(1);
4553 err = -EINVAL;
4554 break;
4557 return err;
4561 * Perform the SIOCxIFxxx calls, inside rtnl_lock()
4563 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4565 int err;
4566 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4567 const struct net_device_ops *ops;
4569 if (!dev)
4570 return -ENODEV;
4572 ops = dev->netdev_ops;
4574 switch (cmd) {
4575 case SIOCSIFFLAGS: /* Set interface flags */
4576 return dev_change_flags(dev, ifr->ifr_flags);
4578 case SIOCSIFMETRIC: /* Set the metric on the interface
4579 (currently unused) */
4580 return -EOPNOTSUPP;
4582 case SIOCSIFMTU: /* Set the MTU of a device */
4583 return dev_set_mtu(dev, ifr->ifr_mtu);
4585 case SIOCSIFHWADDR:
4586 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4588 case SIOCSIFHWBROADCAST:
4589 if (ifr->ifr_hwaddr.sa_family != dev->type)
4590 return -EINVAL;
4591 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4592 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4593 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4594 return 0;
4596 case SIOCSIFMAP:
4597 if (ops->ndo_set_config) {
4598 if (!netif_device_present(dev))
4599 return -ENODEV;
4600 return ops->ndo_set_config(dev, &ifr->ifr_map);
4602 return -EOPNOTSUPP;
4604 case SIOCADDMULTI:
4605 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4606 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4607 return -EINVAL;
4608 if (!netif_device_present(dev))
4609 return -ENODEV;
4610 return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
4611 dev->addr_len, 1);
4613 case SIOCDELMULTI:
4614 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4615 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4616 return -EINVAL;
4617 if (!netif_device_present(dev))
4618 return -ENODEV;
4619 return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
4620 dev->addr_len, 1);
4622 case SIOCSIFTXQLEN:
4623 if (ifr->ifr_qlen < 0)
4624 return -EINVAL;
4625 dev->tx_queue_len = ifr->ifr_qlen;
4626 return 0;
4628 case SIOCSIFNAME:
4629 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4630 return dev_change_name(dev, ifr->ifr_newname);
4633 * Unknown or private ioctl
4635 default:
4636 if ((cmd >= SIOCDEVPRIVATE &&
4637 cmd <= SIOCDEVPRIVATE + 15) ||
4638 cmd == SIOCBONDENSLAVE ||
4639 cmd == SIOCBONDRELEASE ||
4640 cmd == SIOCBONDSETHWADDR ||
4641 cmd == SIOCBONDSLAVEINFOQUERY ||
4642 cmd == SIOCBONDINFOQUERY ||
4643 cmd == SIOCBONDCHANGEACTIVE ||
4644 cmd == SIOCGMIIPHY ||
4645 cmd == SIOCGMIIREG ||
4646 cmd == SIOCSMIIREG ||
4647 cmd == SIOCBRADDIF ||
4648 cmd == SIOCBRDELIF ||
4649 cmd == SIOCSHWTSTAMP ||
4650 cmd == SIOCWANDEV) {
4651 err = -EOPNOTSUPP;
4652 if (ops->ndo_do_ioctl) {
4653 if (netif_device_present(dev))
4654 err = ops->ndo_do_ioctl(dev, ifr, cmd);
4655 else
4656 err = -ENODEV;
4658 } else
4659 err = -EINVAL;
4662 return err;
4666 * This function handles all "interface"-type I/O control requests. The actual
4667 * 'doing' part of this is dev_ifsioc above.
4671 * dev_ioctl - network device ioctl
4672 * @net: the applicable net namespace
4673 * @cmd: command to issue
4674 * @arg: pointer to a struct ifreq in user space
4676 * Issue ioctl functions to devices. This is normally called by the
4677 * user space syscall interfaces but can sometimes be useful for
4678 * other purposes. The return value is the return from the syscall if
4679 * positive or a negative errno code on error.
4682 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4684 struct ifreq ifr;
4685 int ret;
4686 char *colon;
4688 /* One special case: SIOCGIFCONF takes ifconf argument
4689 and requires shared lock, because it sleeps writing
4690 to user space.
4693 if (cmd == SIOCGIFCONF) {
4694 rtnl_lock();
4695 ret = dev_ifconf(net, (char __user *) arg);
4696 rtnl_unlock();
4697 return ret;
4699 if (cmd == SIOCGIFNAME)
4700 return dev_ifname(net, (struct ifreq __user *)arg);
4702 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4703 return -EFAULT;
4705 ifr.ifr_name[IFNAMSIZ-1] = 0;
4707 colon = strchr(ifr.ifr_name, ':');
4708 if (colon)
4709 *colon = 0;
4712 * See which interface the caller is talking about.
4715 switch (cmd) {
4717 * These ioctl calls:
4718 * - can be done by all.
4719 * - atomic and do not require locking.
4720 * - return a value
4722 case SIOCGIFFLAGS:
4723 case SIOCGIFMETRIC:
4724 case SIOCGIFMTU:
4725 case SIOCGIFHWADDR:
4726 case SIOCGIFSLAVE:
4727 case SIOCGIFMAP:
4728 case SIOCGIFINDEX:
4729 case SIOCGIFTXQLEN:
4730 dev_load(net, ifr.ifr_name);
4731 rcu_read_lock();
4732 ret = dev_ifsioc_locked(net, &ifr, cmd);
4733 rcu_read_unlock();
4734 if (!ret) {
4735 if (colon)
4736 *colon = ':';
4737 if (copy_to_user(arg, &ifr,
4738 sizeof(struct ifreq)))
4739 ret = -EFAULT;
4741 return ret;
4743 case SIOCETHTOOL:
4744 dev_load(net, ifr.ifr_name);
4745 rtnl_lock();
4746 ret = dev_ethtool(net, &ifr);
4747 rtnl_unlock();
4748 if (!ret) {
4749 if (colon)
4750 *colon = ':';
4751 if (copy_to_user(arg, &ifr,
4752 sizeof(struct ifreq)))
4753 ret = -EFAULT;
4755 return ret;
4758 * These ioctl calls:
4759 * - require superuser power.
4760 * - require strict serialization.
4761 * - return a value
4763 case SIOCGMIIPHY:
4764 case SIOCGMIIREG:
4765 case SIOCSIFNAME:
4766 if (!capable(CAP_NET_ADMIN))
4767 return -EPERM;
4768 dev_load(net, ifr.ifr_name);
4769 rtnl_lock();
4770 ret = dev_ifsioc(net, &ifr, cmd);
4771 rtnl_unlock();
4772 if (!ret) {
4773 if (colon)
4774 *colon = ':';
4775 if (copy_to_user(arg, &ifr,
4776 sizeof(struct ifreq)))
4777 ret = -EFAULT;
4779 return ret;
4782 * These ioctl calls:
4783 * - require superuser power.
4784 * - require strict serialization.
4785 * - do not return a value
4787 case SIOCSIFFLAGS:
4788 case SIOCSIFMETRIC:
4789 case SIOCSIFMTU:
4790 case SIOCSIFMAP:
4791 case SIOCSIFHWADDR:
4792 case SIOCSIFSLAVE:
4793 case SIOCADDMULTI:
4794 case SIOCDELMULTI:
4795 case SIOCSIFHWBROADCAST:
4796 case SIOCSIFTXQLEN:
4797 case SIOCSMIIREG:
4798 case SIOCBONDENSLAVE:
4799 case SIOCBONDRELEASE:
4800 case SIOCBONDSETHWADDR:
4801 case SIOCBONDCHANGEACTIVE:
4802 case SIOCBRADDIF:
4803 case SIOCBRDELIF:
4804 case SIOCSHWTSTAMP:
4805 if (!capable(CAP_NET_ADMIN))
4806 return -EPERM;
4807 /* fall through */
4808 case SIOCBONDSLAVEINFOQUERY:
4809 case SIOCBONDINFOQUERY:
4810 dev_load(net, ifr.ifr_name);
4811 rtnl_lock();
4812 ret = dev_ifsioc(net, &ifr, cmd);
4813 rtnl_unlock();
4814 return ret;
4816 case SIOCGIFMEM:
4817 /* Get the per device memory space. We can add this but
4818 * currently do not support it */
4819 case SIOCSIFMEM:
4820 /* Set the per device memory buffer space.
4821 * Not applicable in our case */
4822 case SIOCSIFLINK:
4823 return -EINVAL;
4826 * Unknown or private ioctl.
4828 default:
4829 if (cmd == SIOCWANDEV ||
4830 (cmd >= SIOCDEVPRIVATE &&
4831 cmd <= SIOCDEVPRIVATE + 15)) {
4832 dev_load(net, ifr.ifr_name);
4833 rtnl_lock();
4834 ret = dev_ifsioc(net, &ifr, cmd);
4835 rtnl_unlock();
4836 if (!ret && copy_to_user(arg, &ifr,
4837 sizeof(struct ifreq)))
4838 ret = -EFAULT;
4839 return ret;
4841 /* Take care of Wireless Extensions */
4842 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4843 return wext_handle_ioctl(net, &ifr, cmd, arg);
4844 return -EINVAL;
/**
 *	dev_new_index	-	allocate an ifindex
 *	@net: the applicable net namespace
 *
 *	Returns a suitable unique value for a new device interface
 *	number.  The caller must hold the rtnl semaphore or the
 *	dev_base_lock to be sure it remains unique.
 *
 *	Candidates come from a static counter that restarts at 1 when it
 *	is no longer positive; indexes already in use in @net are skipped.
 */
static int dev_new_index(struct net *net)
{
	static int ifindex;

	do {
		if (++ifindex <= 0)
			ifindex = 1;
	} while (__dev_get_by_index(net, ifindex));

	return ifindex;
}
4868 /* Delayed registration/unregisteration */
4869 static LIST_HEAD(net_todo_list);
4871 static void net_set_todo(struct net_device *dev)
4873 list_add_tail(&dev->todo_list, &net_todo_list);
4876 static void rollback_registered_many(struct list_head *head)
4878 struct net_device *dev, *tmp;
4880 BUG_ON(dev_boot_phase);
4881 ASSERT_RTNL();
4883 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
4884 /* Some devices call without registering
4885 * for initialization unwind. Remove those
4886 * devices and proceed with the remaining.
4888 if (dev->reg_state == NETREG_UNINITIALIZED) {
4889 pr_debug("unregister_netdevice: device %s/%p never "
4890 "was registered\n", dev->name, dev);
4892 WARN_ON(1);
4893 list_del(&dev->unreg_list);
4894 continue;
4897 BUG_ON(dev->reg_state != NETREG_REGISTERED);
4899 /* If device is running, close it first. */
4900 dev_close(dev);
4902 /* And unlink it from device chain. */
4903 unlist_netdevice(dev);
4905 dev->reg_state = NETREG_UNREGISTERING;
4908 synchronize_net();
4910 list_for_each_entry(dev, head, unreg_list) {
4911 /* Shutdown queueing discipline. */
4912 dev_shutdown(dev);
4915 /* Notify protocols, that we are about to destroy
4916 this device. They should clean all the things.
4918 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4920 if (!dev->rtnl_link_ops ||
4921 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
4922 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
4925 * Flush the unicast and multicast chains
4927 dev_unicast_flush(dev);
4928 dev_addr_discard(dev);
4930 if (dev->netdev_ops->ndo_uninit)
4931 dev->netdev_ops->ndo_uninit(dev);
4933 /* Notifier chain MUST detach us from master device. */
4934 WARN_ON(dev->master);
4936 /* Remove entries from kobject tree */
4937 netdev_unregister_kobject(dev);
4940 /* Process any work delayed until the end of the batch */
4941 dev = list_first_entry(head, struct net_device, unreg_list);
4942 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
4944 synchronize_net();
4946 list_for_each_entry(dev, head, unreg_list)
4947 dev_put(dev);
4950 static void rollback_registered(struct net_device *dev)
4952 LIST_HEAD(single);
4954 list_add(&dev->unreg_list, &single);
4955 rollback_registered_many(&single);
4958 static void __netdev_init_queue_locks_one(struct net_device *dev,
4959 struct netdev_queue *dev_queue,
4960 void *_unused)
4962 spin_lock_init(&dev_queue->_xmit_lock);
4963 netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
4964 dev_queue->xmit_lock_owner = -1;
4967 static void netdev_init_queue_locks(struct net_device *dev)
4969 netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
4970 __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
4973 unsigned long netdev_fix_features(unsigned long features, const char *name)
4975 /* Fix illegal SG+CSUM combinations. */
4976 if ((features & NETIF_F_SG) &&
4977 !(features & NETIF_F_ALL_CSUM)) {
4978 if (name)
4979 printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
4980 "checksum feature.\n", name);
4981 features &= ~NETIF_F_SG;
4984 /* TSO requires that SG is present as well. */
4985 if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
4986 if (name)
4987 printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
4988 "SG feature.\n", name);
4989 features &= ~NETIF_F_TSO;
4992 if (features & NETIF_F_UFO) {
4993 if (!(features & NETIF_F_GEN_CSUM)) {
4994 if (name)
4995 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4996 "since no NETIF_F_HW_CSUM feature.\n",
4997 name);
4998 features &= ~NETIF_F_UFO;
5001 if (!(features & NETIF_F_SG)) {
5002 if (name)
5003 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
5004 "since no NETIF_F_SG feature.\n", name);
5005 features &= ~NETIF_F_UFO;
5009 return features;
5011 EXPORT_SYMBOL(netdev_fix_features);
5014 * netif_stacked_transfer_operstate - transfer operstate
5015 * @rootdev: the root or lower level device to transfer state from
5016 * @dev: the device to transfer operstate to
5018 * Transfer operational state from root to device. This is normally
5019 * called when a stacking relationship exists between the root
5020 * device and the device(a leaf device).
5022 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5023 struct net_device *dev)
5025 if (rootdev->operstate == IF_OPER_DORMANT)
5026 netif_dormant_on(dev);
5027 else
5028 netif_dormant_off(dev);
5030 if (netif_carrier_ok(rootdev)) {
5031 if (!netif_carrier_ok(dev))
5032 netif_carrier_on(dev);
5033 } else {
5034 if (netif_carrier_ok(dev))
5035 netif_carrier_off(dev);
5038 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5041 * register_netdevice - register a network device
5042 * @dev: device to register
5044 * Take a completed network device structure and add it to the kernel
5045 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5046 * chain. 0 is returned on success. A negative errno code is returned
5047 * on a failure to set up the device, or if the name is a duplicate.
5049 * Callers must hold the rtnl semaphore. You may want
5050 * register_netdev() instead of this.
5052 * BUGS:
5053 * The locking appears insufficient to guarantee two parallel registers
5054 * will not get the same name.
5057 int register_netdevice(struct net_device *dev)
5059 int ret;
5060 struct net *net = dev_net(dev);
5062 BUG_ON(dev_boot_phase);
5063 ASSERT_RTNL();
5065 might_sleep();
5067 /* When net_device's are persistent, this will be fatal. */
5068 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5069 BUG_ON(!net);
5071 spin_lock_init(&dev->addr_list_lock);
5072 netdev_set_addr_lockdep_class(dev);
5073 netdev_init_queue_locks(dev);
5075 dev->iflink = -1;
5077 /* Init, if this function is available */
5078 if (dev->netdev_ops->ndo_init) {
5079 ret = dev->netdev_ops->ndo_init(dev);
5080 if (ret) {
5081 if (ret > 0)
5082 ret = -EIO;
5083 goto out;
5087 ret = dev_get_valid_name(net, dev->name, dev->name, 0);
5088 if (ret)
5089 goto err_uninit;
5091 dev->ifindex = dev_new_index(net);
5092 if (dev->iflink == -1)
5093 dev->iflink = dev->ifindex;
5095 /* Fix illegal checksum combinations */
5096 if ((dev->features & NETIF_F_HW_CSUM) &&
5097 (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5098 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
5099 dev->name);
5100 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5103 if ((dev->features & NETIF_F_NO_CSUM) &&
5104 (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5105 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
5106 dev->name);
5107 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5110 dev->features = netdev_fix_features(dev->features, dev->name);
5112 /* Enable software GSO if SG is supported. */
5113 if (dev->features & NETIF_F_SG)
5114 dev->features |= NETIF_F_GSO;
5116 netdev_initialize_kobject(dev);
5118 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5119 ret = notifier_to_errno(ret);
5120 if (ret)
5121 goto err_uninit;
5123 ret = netdev_register_kobject(dev);
5124 if (ret)
5125 goto err_uninit;
5126 dev->reg_state = NETREG_REGISTERED;
5129 * Default initial state at registry is that the
5130 * device is present.
5133 set_bit(__LINK_STATE_PRESENT, &dev->state);
5135 dev_init_scheduler(dev);
5136 dev_hold(dev);
5137 list_netdevice(dev);
5139 /* Notify protocols, that a new device appeared. */
5140 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5141 ret = notifier_to_errno(ret);
5142 if (ret) {
5143 rollback_registered(dev);
5144 dev->reg_state = NETREG_UNREGISTERED;
5147 * Prevent userspace races by waiting until the network
5148 * device is fully setup before sending notifications.
5150 if (!dev->rtnl_link_ops ||
5151 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5152 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5154 out:
5155 return ret;
5157 err_uninit:
5158 if (dev->netdev_ops->ndo_uninit)
5159 dev->netdev_ops->ndo_uninit(dev);
5160 goto out;
5162 EXPORT_SYMBOL(register_netdevice);
5165 * init_dummy_netdev - init a dummy network device for NAPI
5166 * @dev: device to init
5168 * This takes a network device structure and initialize the minimum
5169 * amount of fields so it can be used to schedule NAPI polls without
5170 * registering a full blown interface. This is to be used by drivers
5171 * that need to tie several hardware interfaces to a single NAPI
5172 * poll scheduler due to HW limitations.
5174 int init_dummy_netdev(struct net_device *dev)
5176 /* Clear everything. Note we don't initialize spinlocks
5177 * are they aren't supposed to be taken by any of the
5178 * NAPI code and this dummy netdev is supposed to be
5179 * only ever used for NAPI polls
5181 memset(dev, 0, sizeof(struct net_device));
5183 /* make sure we BUG if trying to hit standard
5184 * register/unregister code path
5186 dev->reg_state = NETREG_DUMMY;
5188 /* initialize the ref count */
5189 atomic_set(&dev->refcnt, 1);
5191 /* NAPI wants this */
5192 INIT_LIST_HEAD(&dev->napi_list);
5194 /* a dummy interface is started by default */
5195 set_bit(__LINK_STATE_PRESENT, &dev->state);
5196 set_bit(__LINK_STATE_START, &dev->state);
5198 return 0;
5200 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5204 * register_netdev - register a network device
5205 * @dev: device to register
5207 * Take a completed network device structure and add it to the kernel
5208 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5209 * chain. 0 is returned on success. A negative errno code is returned
5210 * on a failure to set up the device, or if the name is a duplicate.
5212 * This is a wrapper around register_netdevice that takes the rtnl semaphore
5213 * and expands the device name if you passed a format string to
5214 * alloc_netdev.
5216 int register_netdev(struct net_device *dev)
5218 int err;
5220 rtnl_lock();
5223 * If the name is a format string the caller wants us to do a
5224 * name allocation.
5226 if (strchr(dev->name, '%')) {
5227 err = dev_alloc_name(dev, dev->name);
5228 if (err < 0)
5229 goto out;
5232 err = register_netdevice(dev);
5233 out:
5234 rtnl_unlock();
5235 return err;
5237 EXPORT_SYMBOL(register_netdev);
5240 * netdev_wait_allrefs - wait until all references are gone.
5242 * This is called when unregistering network devices.
5244 * Any protocol or device that holds a reference should register
5245 * for netdevice notification, and cleanup and put back the
5246 * reference if they receive an UNREGISTER event.
5247 * We can get stuck here if buggy protocols don't correctly
5248 * call dev_put.
5250 static void netdev_wait_allrefs(struct net_device *dev)
5252 unsigned long rebroadcast_time, warning_time;
5254 linkwatch_forget_dev(dev);
5256 rebroadcast_time = warning_time = jiffies;
5257 while (atomic_read(&dev->refcnt) != 0) {
5258 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5259 rtnl_lock();
5261 /* Rebroadcast unregister notification */
5262 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5263 /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5264 * should have already handle it the first time */
5266 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5267 &dev->state)) {
5268 /* We must not have linkwatch events
5269 * pending on unregister. If this
5270 * happens, we simply run the queue
5271 * unscheduled, resulting in a noop
5272 * for this device.
5274 linkwatch_run_queue();
5277 __rtnl_unlock();
5279 rebroadcast_time = jiffies;
5282 msleep(250);
5284 if (time_after(jiffies, warning_time + 10 * HZ)) {
5285 printk(KERN_EMERG "unregister_netdevice: "
5286 "waiting for %s to become free. Usage "
5287 "count = %d\n",
5288 dev->name, atomic_read(&dev->refcnt));
5289 warning_time = jiffies;
5294 /* The sequence is:
5296 * rtnl_lock();
5297 * ...
5298 * register_netdevice(x1);
5299 * register_netdevice(x2);
5300 * ...
5301 * unregister_netdevice(y1);
5302 * unregister_netdevice(y2);
5303 * ...
5304 * rtnl_unlock();
5305 * free_netdev(y1);
5306 * free_netdev(y2);
5308 * We are invoked by rtnl_unlock().
5309 * This allows us to deal with problems:
5310 * 1) We can delete sysfs objects which invoke hotplug
5311 * without deadlocking with linkwatch via keventd.
5312 * 2) Since we run with the RTNL semaphore not held, we can sleep
5313 * safely in order to wait for the netdev refcnt to drop to zero.
5315 * We must not return until all unregister events added during
5316 * the interval the lock was held have been completed.
5318 void netdev_run_todo(void)
5320 struct list_head list;
5322 /* Snapshot list, allow later requests */
5323 list_replace_init(&net_todo_list, &list);
5325 __rtnl_unlock();
5327 while (!list_empty(&list)) {
5328 struct net_device *dev
5329 = list_first_entry(&list, struct net_device, todo_list);
5330 list_del(&dev->todo_list);
5332 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5333 printk(KERN_ERR "network todo '%s' but state %d\n",
5334 dev->name, dev->reg_state);
5335 dump_stack();
5336 continue;
5339 dev->reg_state = NETREG_UNREGISTERED;
5341 on_each_cpu(flush_backlog, dev, 1);
5343 netdev_wait_allrefs(dev);
5345 /* paranoia */
5346 BUG_ON(atomic_read(&dev->refcnt));
5347 WARN_ON(dev->ip_ptr);
5348 WARN_ON(dev->ip6_ptr);
5349 WARN_ON(dev->dn_ptr);
5351 if (dev->destructor)
5352 dev->destructor(dev);
5354 /* Free network device */
5355 kobject_put(&dev->dev.kobj);
5360 * dev_txq_stats_fold - fold tx_queues stats
5361 * @dev: device to get statistics from
5362 * @stats: struct net_device_stats to hold results
5364 void dev_txq_stats_fold(const struct net_device *dev,
5365 struct net_device_stats *stats)
5367 unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
5368 unsigned int i;
5369 struct netdev_queue *txq;
5371 for (i = 0; i < dev->num_tx_queues; i++) {
5372 txq = netdev_get_tx_queue(dev, i);
5373 tx_bytes += txq->tx_bytes;
5374 tx_packets += txq->tx_packets;
5375 tx_dropped += txq->tx_dropped;
5377 if (tx_bytes || tx_packets || tx_dropped) {
5378 stats->tx_bytes = tx_bytes;
5379 stats->tx_packets = tx_packets;
5380 stats->tx_dropped = tx_dropped;
5383 EXPORT_SYMBOL(dev_txq_stats_fold);
5386 * dev_get_stats - get network device statistics
5387 * @dev: device to get statistics from
5389 * Get network statistics from device. The device driver may provide
5390 * its own method by setting dev->netdev_ops->get_stats; otherwise
5391 * the internal statistics structure is used.
5393 const struct net_device_stats *dev_get_stats(struct net_device *dev)
5395 const struct net_device_ops *ops = dev->netdev_ops;
5397 if (ops->ndo_get_stats)
5398 return ops->ndo_get_stats(dev);
5400 dev_txq_stats_fold(dev, &dev->stats);
5401 return &dev->stats;
5403 EXPORT_SYMBOL(dev_get_stats);
5405 static void netdev_init_one_queue(struct net_device *dev,
5406 struct netdev_queue *queue,
5407 void *_unused)
5409 queue->dev = dev;
5412 static void netdev_init_queues(struct net_device *dev)
5414 netdev_init_one_queue(dev, &dev->rx_queue, NULL);
5415 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5416 spin_lock_init(&dev->tx_global_lock);
5420 * alloc_netdev_mq - allocate network device
5421 * @sizeof_priv: size of private data to allocate space for
5422 * @name: device name format string
5423 * @setup: callback to initialize device
5424 * @queue_count: the number of subqueues to allocate
5426 * Allocates a struct net_device with private data area for driver use
5427 * and performs basic initialization. Also allocates subquue structs
5428 * for each queue on the device at the end of the netdevice.
5430 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5431 void (*setup)(struct net_device *), unsigned int queue_count)
5433 struct netdev_queue *tx;
5434 struct net_device *dev;
5435 size_t alloc_size;
5436 struct net_device *p;
5438 BUG_ON(strlen(name) >= sizeof(dev->name));
5440 alloc_size = sizeof(struct net_device);
5441 if (sizeof_priv) {
5442 /* ensure 32-byte alignment of private area */
5443 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5444 alloc_size += sizeof_priv;
5446 /* ensure 32-byte alignment of whole construct */
5447 alloc_size += NETDEV_ALIGN - 1;
5449 p = kzalloc(alloc_size, GFP_KERNEL);
5450 if (!p) {
5451 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5452 return NULL;
5455 tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
5456 if (!tx) {
5457 printk(KERN_ERR "alloc_netdev: Unable to allocate "
5458 "tx qdiscs.\n");
5459 goto free_p;
5462 dev = PTR_ALIGN(p, NETDEV_ALIGN);
5463 dev->padded = (char *)dev - (char *)p;
5465 if (dev_addr_init(dev))
5466 goto free_tx;
5468 dev_unicast_init(dev);
5470 dev_net_set(dev, &init_net);
5472 dev->_tx = tx;
5473 dev->num_tx_queues = queue_count;
5474 dev->real_num_tx_queues = queue_count;
5476 dev->gso_max_size = GSO_MAX_SIZE;
5478 netdev_init_queues(dev);
5480 INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5481 dev->ethtool_ntuple_list.count = 0;
5482 INIT_LIST_HEAD(&dev->napi_list);
5483 INIT_LIST_HEAD(&dev->unreg_list);
5484 INIT_LIST_HEAD(&dev->link_watch_list);
5485 dev->priv_flags = IFF_XMIT_DST_RELEASE;
5486 setup(dev);
5487 strcpy(dev->name, name);
5488 return dev;
5490 free_tx:
5491 kfree(tx);
5493 free_p:
5494 kfree(p);
5495 return NULL;
5497 EXPORT_SYMBOL(alloc_netdev_mq);
5500 * free_netdev - free network device
5501 * @dev: device
5503 * This function does the last stage of destroying an allocated device
5504 * interface. The reference to the device object is released.
5505 * If this is the last reference then it will be freed.
5507 void free_netdev(struct net_device *dev)
5509 struct napi_struct *p, *n;
5511 release_net(dev_net(dev));
5513 kfree(dev->_tx);
5515 /* Flush device addresses */
5516 dev_addr_flush(dev);
5518 /* Clear ethtool n-tuple list */
5519 ethtool_ntuple_flush(dev);
5521 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5522 netif_napi_del(p);
5524 /* Compatibility with error handling in drivers */
5525 if (dev->reg_state == NETREG_UNINITIALIZED) {
5526 kfree((char *)dev - dev->padded);
5527 return;
5530 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5531 dev->reg_state = NETREG_RELEASED;
5533 /* will free via device release */
5534 put_device(&dev->dev);
5536 EXPORT_SYMBOL(free_netdev);
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 *	Implemented as an RCU grace-period wait, so it may sleep.
 */
void synchronize_net(void)
{
	might_sleep();
	synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);
5552 * unregister_netdevice_queue - remove device from the kernel
5553 * @dev: device
5554 * @head: list
5556 * This function shuts down a device interface and removes it
5557 * from the kernel tables.
5558 * If head not NULL, device is queued to be unregistered later.
5560 * Callers must hold the rtnl semaphore. You may want
5561 * unregister_netdev() instead of this.
5564 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5566 ASSERT_RTNL();
5568 if (head) {
5569 list_move_tail(&dev->unreg_list, head);
5570 } else {
5571 rollback_registered(dev);
5572 /* Finish processing unregister after unlock */
5573 net_set_todo(dev);
5576 EXPORT_SYMBOL(unregister_netdevice_queue);
5579 * unregister_netdevice_many - unregister many devices
5580 * @head: list of devices
5582 void unregister_netdevice_many(struct list_head *head)
5584 struct net_device *dev;
5586 if (!list_empty(head)) {
5587 rollback_registered_many(head);
5588 list_for_each_entry(dev, head, unreg_list)
5589 net_set_todo(dev);
5592 EXPORT_SYMBOL(unregister_netdevice_many);
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	This is just a wrapper for unregister_netdevice that takes
 *	the rtnl semaphore around the call.  In general you want to
 *	use this and not unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
5614 * dev_change_net_namespace - move device to different nethost namespace
5615 * @dev: device
5616 * @net: network namespace
5617 * @pat: If not NULL name pattern to try if the current device name
5618 * is already taken in the destination network namespace.
5620 * This function shuts down a device interface and moves it
5621 * to a new network namespace. On success 0 is returned, on
5622 * a failure a netagive errno code is returned.
5624 * Callers must hold the rtnl semaphore.
5627 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5629 int err;
5631 ASSERT_RTNL();
5633 /* Don't allow namespace local devices to be moved. */
5634 err = -EINVAL;
5635 if (dev->features & NETIF_F_NETNS_LOCAL)
5636 goto out;
5638 #ifdef CONFIG_SYSFS
5639 /* Don't allow real devices to be moved when sysfs
5640 * is enabled.
5642 err = -EINVAL;
5643 if (dev->dev.parent)
5644 goto out;
5645 #endif
5647 /* Ensure the device has been registrered */
5648 err = -EINVAL;
5649 if (dev->reg_state != NETREG_REGISTERED)
5650 goto out;
5652 /* Get out if there is nothing todo */
5653 err = 0;
5654 if (net_eq(dev_net(dev), net))
5655 goto out;
5657 /* Pick the destination device name, and ensure
5658 * we can use it in the destination network namespace.
5660 err = -EEXIST;
5661 if (__dev_get_by_name(net, dev->name)) {
5662 /* We get here if we can't use the current device name */
5663 if (!pat)
5664 goto out;
5665 if (dev_get_valid_name(net, pat, dev->name, 1))
5666 goto out;
5670 * And now a mini version of register_netdevice unregister_netdevice.
5673 /* If device is running close it first. */
5674 dev_close(dev);
5676 /* And unlink it from device chain */
5677 err = -ENODEV;
5678 unlist_netdevice(dev);
5680 synchronize_net();
5682 /* Shutdown queueing discipline. */
5683 dev_shutdown(dev);
5685 /* Notify protocols, that we are about to destroy
5686 this device. They should clean all the things.
5688 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5689 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5692 * Flush the unicast and multicast chains
5694 dev_unicast_flush(dev);
5695 dev_addr_discard(dev);
5697 netdev_unregister_kobject(dev);
5699 /* Actually switch the network namespace */
5700 dev_net_set(dev, net);
5702 /* If there is an ifindex conflict assign a new one */
5703 if (__dev_get_by_index(net, dev->ifindex)) {
5704 int iflink = (dev->iflink == dev->ifindex);
5705 dev->ifindex = dev_new_index(net);
5706 if (iflink)
5707 dev->iflink = dev->ifindex;
5710 /* Fixup kobjects */
5711 err = netdev_register_kobject(dev);
5712 WARN_ON(err);
5714 /* Add the device back in the hashes */
5715 list_netdevice(dev);
5717 /* Notify protocols, that a new device appeared. */
5718 call_netdevice_notifiers(NETDEV_REGISTER, dev);
5721 * Prevent userspace races by waiting until the network
5722 * device is fully setup before sending notifications.
5724 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5726 synchronize_net();
5727 err = 0;
5728 out:
5729 return err;
5731 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
5733 static int dev_cpu_callback(struct notifier_block *nfb,
5734 unsigned long action,
5735 void *ocpu)
5737 struct sk_buff **list_skb;
5738 struct Qdisc **list_net;
5739 struct sk_buff *skb;
5740 unsigned int cpu, oldcpu = (unsigned long)ocpu;
5741 struct softnet_data *sd, *oldsd;
5743 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
5744 return NOTIFY_OK;
5746 local_irq_disable();
5747 cpu = smp_processor_id();
5748 sd = &per_cpu(softnet_data, cpu);
5749 oldsd = &per_cpu(softnet_data, oldcpu);
5751 /* Find end of our completion_queue. */
5752 list_skb = &sd->completion_queue;
5753 while (*list_skb)
5754 list_skb = &(*list_skb)->next;
5755 /* Append completion queue from offline CPU. */
5756 *list_skb = oldsd->completion_queue;
5757 oldsd->completion_queue = NULL;
5759 /* Find end of our output_queue. */
5760 list_net = &sd->output_queue;
5761 while (*list_net)
5762 list_net = &(*list_net)->next_sched;
5763 /* Append output queue from offline CPU. */
5764 *list_net = oldsd->output_queue;
5765 oldsd->output_queue = NULL;
5767 raise_softirq_irqoff(NET_TX_SOFTIRQ);
5768 local_irq_enable();
5770 /* Process offline CPU's input_pkt_queue */
5771 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
5772 netif_rx(skb);
5774 return NOTIFY_OK;
5779 * netdev_increment_features - increment feature set by one
5780 * @all: current feature set
5781 * @one: new feature set
5782 * @mask: mask feature set
5784 * Computes a new feature set after adding a device with feature set
5785 * @one to the master device with current feature set @all. Will not
5786 * enable anything that is off in @mask. Returns the new feature set.
5788 unsigned long netdev_increment_features(unsigned long all, unsigned long one,
5789 unsigned long mask)
5791 /* If device needs checksumming, downgrade to it. */
5792 if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
5793 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
5794 else if (mask & NETIF_F_ALL_CSUM) {
5795 /* If one device supports v4/v6 checksumming, set for all. */
5796 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
5797 !(all & NETIF_F_GEN_CSUM)) {
5798 all &= ~NETIF_F_ALL_CSUM;
5799 all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
5802 /* If one device supports hw checksumming, set for all. */
5803 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
5804 all &= ~NETIF_F_ALL_CSUM;
5805 all |= NETIF_F_HW_CSUM;
5809 one |= NETIF_F_ALL_CSUM;
5811 one |= all & NETIF_F_ONE_FOR_ALL;
5812 all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
5813 all |= one & mask & NETIF_F_ONE_FOR_ALL;
5815 return all;
5817 EXPORT_SYMBOL(netdev_increment_features);
5819 static struct hlist_head *netdev_create_hash(void)
5821 int i;
5822 struct hlist_head *hash;
5824 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
5825 if (hash != NULL)
5826 for (i = 0; i < NETDEV_HASHENTRIES; i++)
5827 INIT_HLIST_HEAD(&hash[i]);
5829 return hash;
5832 /* Initialize per network namespace state */
5833 static int __net_init netdev_init(struct net *net)
5835 INIT_LIST_HEAD(&net->dev_base_head);
5837 net->dev_name_head = netdev_create_hash();
5838 if (net->dev_name_head == NULL)
5839 goto err_name;
5841 net->dev_index_head = netdev_create_hash();
5842 if (net->dev_index_head == NULL)
5843 goto err_idx;
5845 return 0;
5847 err_idx:
5848 kfree(net->dev_name_head);
5849 err_name:
5850 return -ENOMEM;
5854 * netdev_drivername - network driver for the device
5855 * @dev: network device
5856 * @buffer: buffer for resulting name
5857 * @len: size of buffer
5859 * Determine network driver for device.
5861 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
5863 const struct device_driver *driver;
5864 const struct device *parent;
5866 if (len <= 0 || !buffer)
5867 return buffer;
5868 buffer[0] = 0;
5870 parent = dev->dev.parent;
5872 if (!parent)
5873 return buffer;
5875 driver = parent->driver;
5876 if (driver && driver->name)
5877 strlcpy(buffer, driver->name, len);
5878 return buffer;
5881 static void __net_exit netdev_exit(struct net *net)
5883 kfree(net->dev_name_head);
5884 kfree(net->dev_index_head);
5887 static struct pernet_operations __net_initdata netdev_net_ops = {
5888 .init = netdev_init,
5889 .exit = netdev_exit,
5892 static void __net_exit default_device_exit(struct net *net)
5894 struct net_device *dev, *aux;
5896 * Push all migratable network devices back to the
5897 * initial network namespace
5899 rtnl_lock();
5900 for_each_netdev_safe(net, dev, aux) {
5901 int err;
5902 char fb_name[IFNAMSIZ];
5904 /* Ignore unmoveable devices (i.e. loopback) */
5905 if (dev->features & NETIF_F_NETNS_LOCAL)
5906 continue;
5908 /* Leave virtual devices for the generic cleanup */
5909 if (dev->rtnl_link_ops)
5910 continue;
5912 /* Push remaing network devices to init_net */
5913 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
5914 err = dev_change_net_namespace(dev, &init_net, fb_name);
5915 if (err) {
5916 printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
5917 __func__, dev->name, err);
5918 BUG();
5921 rtnl_unlock();
5924 static void __net_exit default_device_exit_batch(struct list_head *net_list)
5926 /* At exit all network devices most be removed from a network
5927 * namespace. Do this in the reverse order of registeration.
5928 * Do this across as many network namespaces as possible to
5929 * improve batching efficiency.
5931 struct net_device *dev;
5932 struct net *net;
5933 LIST_HEAD(dev_kill_list);
5935 rtnl_lock();
5936 list_for_each_entry(net, net_list, exit_list) {
5937 for_each_netdev_reverse(net, dev) {
5938 if (dev->rtnl_link_ops)
5939 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
5940 else
5941 unregister_netdevice_queue(dev, &dev_kill_list);
5944 unregister_netdevice_many(&dev_kill_list);
5945 rtnl_unlock();
5948 static struct pernet_operations __net_initdata default_device_ops = {
5949 .exit = default_device_exit,
5950 .exit_batch = default_device_exit_batch,
5954 * Initialize the DEV module. At boot time this walks the device list and
5955 * unhooks any devices that fail to initialise (normally hardware not
5956 * present) and leaves us with a valid list of present and active devices.
5961 * This is called single threaded during boot, so no need
5962 * to take the rtnl semaphore.
5964 static int __init net_dev_init(void)
5966 int i, rc = -ENOMEM;
5968 BUG_ON(!dev_boot_phase);
5970 if (dev_proc_init())
5971 goto out;
5973 if (netdev_kobject_init())
5974 goto out;
5976 INIT_LIST_HEAD(&ptype_all);
5977 for (i = 0; i < PTYPE_HASH_SIZE; i++)
5978 INIT_LIST_HEAD(&ptype_base[i]);
5980 if (register_pernet_subsys(&netdev_net_ops))
5981 goto out;
5984 * Initialise the packet receive queues.
5987 for_each_possible_cpu(i) {
5988 struct softnet_data *queue;
5990 queue = &per_cpu(softnet_data, i);
5991 skb_queue_head_init(&queue->input_pkt_queue);
5992 queue->completion_queue = NULL;
5993 INIT_LIST_HEAD(&queue->poll_list);
5995 queue->backlog.poll = process_backlog;
5996 queue->backlog.weight = weight_p;
5997 queue->backlog.gro_list = NULL;
5998 queue->backlog.gro_count = 0;
6001 dev_boot_phase = 0;
6003 /* The loopback device is special if any other network devices
6004 * is present in a network namespace the loopback device must
6005 * be present. Since we now dynamically allocate and free the
6006 * loopback device ensure this invariant is maintained by
6007 * keeping the loopback device as the first device on the
6008 * list of network devices. Ensuring the loopback devices
6009 * is the first device that appears and the last network device
6010 * that disappears.
6012 if (register_pernet_device(&loopback_net_ops))
6013 goto out;
6015 if (register_pernet_device(&default_device_ops))
6016 goto out;
6018 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6019 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6021 hotcpu_notifier(dev_cpu_callback, 0);
6022 dst_init();
6023 dev_mcast_init();
6024 rc = 0;
6025 out:
6026 return rc;
6029 subsys_initcall(net_dev_init);
6031 static int __init initialize_hashrnd(void)
6033 get_random_bytes(&skb_tx_hashrnd, sizeof(skb_tx_hashrnd));
6034 return 0;
6037 late_initcall_sync(initialize_hashrnd);