Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/linville/wirel...
[linux-2.6/btrfs-unstable.git] / net / core / dev.c
blob1932d351ed7c5e6ca24612b26bcf576458834f12
1 /*
2 * NET3 Protocol independent device support routines.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Derived from the non IP parts of dev.c 1.0.19
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
14 * Additional Authors:
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
22 * Changes:
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
34 * drivers
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
44 * call a packet.
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
50 * changes.
51 * Rudi Cilibrasi : Pass the right thing to
52 * set_mac_address()
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
58 * 1 device.
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
66 * the backlog queue.
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <net/net_namespace.h>
98 #include <net/sock.h>
99 #include <linux/rtnetlink.h>
100 #include <linux/proc_fs.h>
101 #include <linux/seq_file.h>
102 #include <linux/stat.h>
103 #include <net/dst.h>
104 #include <net/pkt_sched.h>
105 #include <net/checksum.h>
106 #include <net/xfrm.h>
107 #include <linux/highmem.h>
108 #include <linux/init.h>
109 #include <linux/module.h>
110 #include <linux/netpoll.h>
111 #include <linux/rcupdate.h>
112 #include <linux/delay.h>
113 #include <net/wext.h>
114 #include <net/iw_handler.h>
115 #include <asm/current.h>
116 #include <linux/audit.h>
117 #include <linux/dmaengine.h>
118 #include <linux/err.h>
119 #include <linux/ctype.h>
120 #include <linux/if_arp.h>
121 #include <linux/if_vlan.h>
122 #include <linux/ip.h>
123 #include <net/ip.h>
124 #include <linux/ipv6.h>
125 #include <linux/in.h>
126 #include <linux/jhash.h>
127 #include <linux/random.h>
128 #include <trace/events/napi.h>
129 #include <trace/events/net.h>
130 #include <trace/events/skb.h>
131 #include <linux/pci.h>
132 #include <linux/inetdevice.h>
133 #include <linux/cpu_rmap.h>
134 #include <linux/static_key.h>
136 #include "net-sysfs.h"
138 /* Instead of increasing this, you should create a hash table. */
139 #define MAX_GRO_SKBS 8
141 /* This should be increased if a protocol with a bigger head is added. */
142 #define GRO_MAX_HEAD (MAX_HEADER + 128)
145 * The list of packet types we will receive (as opposed to discard)
146 * and the routines to invoke.
148 * Why 16. Because with 16 the only overlap we get on a hash of the
149 * low nibble of the protocol value is RARP/SNAP/X.25.
151 * NOTE: That is no longer true with the addition of VLAN tags. Not
152 * sure which should go first, but I bet it won't make much
153 * difference if we are running VLANs. The good news is that
154 * this protocol won't be in the list unless compiled in, so
155 * the average user (w/out VLANs) will not be adversely affected.
156 * --BLG
158 * 0800 IP
159 * 8100 802.1Q VLAN
160 * 0001 802.3
161 * 0002 AX.25
162 * 0004 802.2
163 * 8035 RARP
164 * 0005 SNAP
165 * 0805 X.25
166 * 0806 ARP
167 * 8137 IPX
168 * 0009 Localtalk
169 * 86DD IPv6
172 #define PTYPE_HASH_SIZE (16)
173 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
175 static DEFINE_SPINLOCK(ptype_lock);
176 static DEFINE_SPINLOCK(offload_lock);
177 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
178 static struct list_head ptype_all __read_mostly; /* Taps */
179 static struct list_head offload_base __read_mostly;
182 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
183 * semaphore.
185 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
187 * Writers must hold the rtnl semaphore while they loop through the
188 * dev_base_head list, and hold dev_base_lock for writing when they do the
189 * actual updates. This allows pure readers to access the list even
190 * while a writer is preparing to update it.
192 * To put it another way, dev_base_lock is held for writing only to
193 * protect against pure readers; the rtnl semaphore provides the
194 * protection against other writers.
196 * See, for example usages, register_netdevice() and
197 * unregister_netdevice(), which must be called with the rtnl
198 * semaphore held.
200 DEFINE_RWLOCK(dev_base_lock);
201 EXPORT_SYMBOL(dev_base_lock);
203 seqcount_t devnet_rename_seq;
205 static inline void dev_base_seq_inc(struct net *net)
207 while (++net->dev_base_seq == 0);
210 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
212 unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
214 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
217 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
219 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
222 static inline void rps_lock(struct softnet_data *sd)
224 #ifdef CONFIG_RPS
225 spin_lock(&sd->input_pkt_queue.lock);
226 #endif
229 static inline void rps_unlock(struct softnet_data *sd)
231 #ifdef CONFIG_RPS
232 spin_unlock(&sd->input_pkt_queue.lock);
233 #endif
236 /* Device list insertion */
237 static int list_netdevice(struct net_device *dev)
239 struct net *net = dev_net(dev);
241 ASSERT_RTNL();
243 write_lock_bh(&dev_base_lock);
244 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
245 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
246 hlist_add_head_rcu(&dev->index_hlist,
247 dev_index_hash(net, dev->ifindex));
248 write_unlock_bh(&dev_base_lock);
250 dev_base_seq_inc(net);
252 return 0;
255 /* Device list removal
256 * caller must respect a RCU grace period before freeing/reusing dev
258 static void unlist_netdevice(struct net_device *dev)
260 ASSERT_RTNL();
262 /* Unlink dev from the device chain */
263 write_lock_bh(&dev_base_lock);
264 list_del_rcu(&dev->dev_list);
265 hlist_del_rcu(&dev->name_hlist);
266 hlist_del_rcu(&dev->index_hlist);
267 write_unlock_bh(&dev_base_lock);
269 dev_base_seq_inc(dev_net(dev));
273 * Our notifier list
276 static RAW_NOTIFIER_HEAD(netdev_chain);
279 * Device drivers call our routines to queue packets here. We empty the
280 * queue in the local softnet handler.
283 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
284 EXPORT_PER_CPU_SYMBOL(softnet_data);
286 #ifdef CONFIG_LOCKDEP
288 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
289 * according to dev->type
291 static const unsigned short netdev_lock_type[] =
292 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
293 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
294 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
295 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
296 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
297 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
298 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
299 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
300 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
301 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
302 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
303 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
304 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
305 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
306 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
308 static const char *const netdev_lock_name[] =
309 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
310 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
311 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
312 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
313 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
314 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
315 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
316 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
317 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
318 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
319 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
320 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
321 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
322 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
323 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
325 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
326 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
328 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
330 int i;
332 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
333 if (netdev_lock_type[i] == dev_type)
334 return i;
335 /* the last key is used by default */
336 return ARRAY_SIZE(netdev_lock_type) - 1;
339 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
340 unsigned short dev_type)
342 int i;
344 i = netdev_lock_pos(dev_type);
345 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
346 netdev_lock_name[i]);
349 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
351 int i;
353 i = netdev_lock_pos(dev->type);
354 lockdep_set_class_and_name(&dev->addr_list_lock,
355 &netdev_addr_lock_key[i],
356 netdev_lock_name[i]);
358 #else
359 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
360 unsigned short dev_type)
363 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
366 #endif
368 /*******************************************************************************
370 Protocol management and registration routines
372 *******************************************************************************/
375 * Add a protocol ID to the list. Now that the input handler is
376 * smarter we can dispense with all the messy stuff that used to be
377 * here.
379 * BEWARE!!! Protocol handlers, mangling input packets,
380 * MUST BE last in hash buckets and checking protocol handlers
381 * MUST start from promiscuous ptype_all chain in net_bh.
382 * It is true now, do not change it.
383 * Explanation follows: if protocol handler, mangling packet, will
384 * be the first on list, it is not able to sense, that packet
385 * is cloned and should be copied-on-write, so that it will
386 * change it and subsequent readers will get broken packet.
387 * --ANK (980803)
390 static inline struct list_head *ptype_head(const struct packet_type *pt)
392 if (pt->type == htons(ETH_P_ALL))
393 return &ptype_all;
394 else
395 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
399 * dev_add_pack - add packet handler
400 * @pt: packet type declaration
402 * Add a protocol handler to the networking stack. The passed &packet_type
403 * is linked into kernel lists and may not be freed until it has been
404 * removed from the kernel lists.
406 * This call does not sleep therefore it can not
407 * guarantee all CPU's that are in middle of receiving packets
408 * will see the new packet type (until the next received packet).
411 void dev_add_pack(struct packet_type *pt)
413 struct list_head *head = ptype_head(pt);
415 spin_lock(&ptype_lock);
416 list_add_rcu(&pt->list, head);
417 spin_unlock(&ptype_lock);
419 EXPORT_SYMBOL(dev_add_pack);
422 * __dev_remove_pack - remove packet handler
423 * @pt: packet type declaration
425 * Remove a protocol handler that was previously added to the kernel
426 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
427 * from the kernel lists and can be freed or reused once this function
428 * returns.
430 * The packet type might still be in use by receivers
431 * and must not be freed until after all the CPU's have gone
432 * through a quiescent state.
434 void __dev_remove_pack(struct packet_type *pt)
436 struct list_head *head = ptype_head(pt);
437 struct packet_type *pt1;
439 spin_lock(&ptype_lock);
441 list_for_each_entry(pt1, head, list) {
442 if (pt == pt1) {
443 list_del_rcu(&pt->list);
444 goto out;
448 pr_warn("dev_remove_pack: %p not found\n", pt);
449 out:
450 spin_unlock(&ptype_lock);
452 EXPORT_SYMBOL(__dev_remove_pack);
455 * dev_remove_pack - remove packet handler
456 * @pt: packet type declaration
458 * Remove a protocol handler that was previously added to the kernel
459 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
460 * from the kernel lists and can be freed or reused once this function
461 * returns.
463 * This call sleeps to guarantee that no CPU is looking at the packet
464 * type after return.
466 void dev_remove_pack(struct packet_type *pt)
468 __dev_remove_pack(pt);
470 synchronize_net();
472 EXPORT_SYMBOL(dev_remove_pack);
476 * dev_add_offload - register offload handlers
477 * @po: protocol offload declaration
479 * Add protocol offload handlers to the networking stack. The passed
480 * &proto_offload is linked into kernel lists and may not be freed until
481 * it has been removed from the kernel lists.
483 * This call does not sleep therefore it can not
484 * guarantee all CPU's that are in middle of receiving packets
485 * will see the new offload handlers (until the next received packet).
487 void dev_add_offload(struct packet_offload *po)
489 struct list_head *head = &offload_base;
491 spin_lock(&offload_lock);
492 list_add_rcu(&po->list, head);
493 spin_unlock(&offload_lock);
495 EXPORT_SYMBOL(dev_add_offload);
498 * __dev_remove_offload - remove offload handler
499 * @po: packet offload declaration
501 * Remove a protocol offload handler that was previously added to the
502 * kernel offload handlers by dev_add_offload(). The passed &offload_type
503 * is removed from the kernel lists and can be freed or reused once this
504 * function returns.
506 * The packet type might still be in use by receivers
507 * and must not be freed until after all the CPU's have gone
508 * through a quiescent state.
510 void __dev_remove_offload(struct packet_offload *po)
512 struct list_head *head = &offload_base;
513 struct packet_offload *po1;
515 spin_lock(&offload_lock);
517 list_for_each_entry(po1, head, list) {
518 if (po == po1) {
519 list_del_rcu(&po->list);
520 goto out;
524 pr_warn("dev_remove_offload: %p not found\n", po);
525 out:
526 spin_unlock(&offload_lock);
528 EXPORT_SYMBOL(__dev_remove_offload);
531 * dev_remove_offload - remove packet offload handler
532 * @po: packet offload declaration
534 * Remove a packet offload handler that was previously added to the kernel
535 * offload handlers by dev_add_offload(). The passed &offload_type is
536 * removed from the kernel lists and can be freed or reused once this
537 * function returns.
539 * This call sleeps to guarantee that no CPU is looking at the packet
540 * type after return.
542 void dev_remove_offload(struct packet_offload *po)
544 __dev_remove_offload(po);
546 synchronize_net();
548 EXPORT_SYMBOL(dev_remove_offload);
550 /******************************************************************************
552 Device Boot-time Settings Routines
554 *******************************************************************************/
556 /* Boot time configuration table */
557 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
560 * netdev_boot_setup_add - add new setup entry
561 * @name: name of the device
562 * @map: configured settings for the device
564 * Adds new setup entry to the dev_boot_setup list. The function
565 * returns 0 on error and 1 on success. This is a generic routine to
566 * all netdevices.
568 static int netdev_boot_setup_add(char *name, struct ifmap *map)
570 struct netdev_boot_setup *s;
571 int i;
573 s = dev_boot_setup;
574 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
575 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
576 memset(s[i].name, 0, sizeof(s[i].name));
577 strlcpy(s[i].name, name, IFNAMSIZ);
578 memcpy(&s[i].map, map, sizeof(s[i].map));
579 break;
583 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
587 * netdev_boot_setup_check - check boot time settings
588 * @dev: the netdevice
590 * Check boot time settings for the device.
591 * The found settings are set for the device to be used
592 * later in the device probing.
593 * Returns 0 if no settings found, 1 if they are.
595 int netdev_boot_setup_check(struct net_device *dev)
597 struct netdev_boot_setup *s = dev_boot_setup;
598 int i;
600 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
601 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
602 !strcmp(dev->name, s[i].name)) {
603 dev->irq = s[i].map.irq;
604 dev->base_addr = s[i].map.base_addr;
605 dev->mem_start = s[i].map.mem_start;
606 dev->mem_end = s[i].map.mem_end;
607 return 1;
610 return 0;
612 EXPORT_SYMBOL(netdev_boot_setup_check);
616 * netdev_boot_base - get address from boot time settings
617 * @prefix: prefix for network device
618 * @unit: id for network device
620 * Check boot time settings for the base address of device.
621 * The found settings are set for the device to be used
622 * later in the device probing.
623 * Returns 0 if no settings found.
625 unsigned long netdev_boot_base(const char *prefix, int unit)
627 const struct netdev_boot_setup *s = dev_boot_setup;
628 char name[IFNAMSIZ];
629 int i;
631 sprintf(name, "%s%d", prefix, unit);
634 * If device already registered then return base of 1
635 * to indicate not to probe for this interface
637 if (__dev_get_by_name(&init_net, name))
638 return 1;
640 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
641 if (!strcmp(name, s[i].name))
642 return s[i].map.base_addr;
643 return 0;
647 * Saves at boot time configured settings for any netdevice.
649 int __init netdev_boot_setup(char *str)
651 int ints[5];
652 struct ifmap map;
654 str = get_options(str, ARRAY_SIZE(ints), ints);
655 if (!str || !*str)
656 return 0;
658 /* Save settings */
659 memset(&map, 0, sizeof(map));
660 if (ints[0] > 0)
661 map.irq = ints[1];
662 if (ints[0] > 1)
663 map.base_addr = ints[2];
664 if (ints[0] > 2)
665 map.mem_start = ints[3];
666 if (ints[0] > 3)
667 map.mem_end = ints[4];
669 /* Add new entry to the list */
670 return netdev_boot_setup_add(str, &map);
673 __setup("netdev=", netdev_boot_setup);
675 /*******************************************************************************
677 Device Interface Subroutines
679 *******************************************************************************/
682 * __dev_get_by_name - find a device by its name
683 * @net: the applicable net namespace
684 * @name: name to find
686 * Find an interface by name. Must be called under RTNL semaphore
687 * or @dev_base_lock. If the name is found a pointer to the device
688 * is returned. If the name is not found then %NULL is returned. The
689 * reference counters are not incremented so the caller must be
690 * careful with locks.
693 struct net_device *__dev_get_by_name(struct net *net, const char *name)
695 struct hlist_node *p;
696 struct net_device *dev;
697 struct hlist_head *head = dev_name_hash(net, name);
699 hlist_for_each_entry(dev, p, head, name_hlist)
700 if (!strncmp(dev->name, name, IFNAMSIZ))
701 return dev;
703 return NULL;
705 EXPORT_SYMBOL(__dev_get_by_name);
708 * dev_get_by_name_rcu - find a device by its name
709 * @net: the applicable net namespace
710 * @name: name to find
712 * Find an interface by name.
713 * If the name is found a pointer to the device is returned.
714 * If the name is not found then %NULL is returned.
715 * The reference counters are not incremented so the caller must be
716 * careful with locks. The caller must hold RCU lock.
719 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
721 struct hlist_node *p;
722 struct net_device *dev;
723 struct hlist_head *head = dev_name_hash(net, name);
725 hlist_for_each_entry_rcu(dev, p, head, name_hlist)
726 if (!strncmp(dev->name, name, IFNAMSIZ))
727 return dev;
729 return NULL;
731 EXPORT_SYMBOL(dev_get_by_name_rcu);
734 * dev_get_by_name - find a device by its name
735 * @net: the applicable net namespace
736 * @name: name to find
738 * Find an interface by name. This can be called from any
739 * context and does its own locking. The returned handle has
740 * the usage count incremented and the caller must use dev_put() to
741 * release it when it is no longer needed. %NULL is returned if no
742 * matching device is found.
745 struct net_device *dev_get_by_name(struct net *net, const char *name)
747 struct net_device *dev;
749 rcu_read_lock();
750 dev = dev_get_by_name_rcu(net, name);
751 if (dev)
752 dev_hold(dev);
753 rcu_read_unlock();
754 return dev;
756 EXPORT_SYMBOL(dev_get_by_name);
759 * __dev_get_by_index - find a device by its ifindex
760 * @net: the applicable net namespace
761 * @ifindex: index of device
763 * Search for an interface by index. Returns %NULL if the device
764 * is not found or a pointer to the device. The device has not
765 * had its reference counter increased so the caller must be careful
766 * about locking. The caller must hold either the RTNL semaphore
767 * or @dev_base_lock.
770 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
772 struct hlist_node *p;
773 struct net_device *dev;
774 struct hlist_head *head = dev_index_hash(net, ifindex);
776 hlist_for_each_entry(dev, p, head, index_hlist)
777 if (dev->ifindex == ifindex)
778 return dev;
780 return NULL;
782 EXPORT_SYMBOL(__dev_get_by_index);
785 * dev_get_by_index_rcu - find a device by its ifindex
786 * @net: the applicable net namespace
787 * @ifindex: index of device
789 * Search for an interface by index. Returns %NULL if the device
790 * is not found or a pointer to the device. The device has not
791 * had its reference counter increased so the caller must be careful
792 * about locking. The caller must hold RCU lock.
795 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
797 struct hlist_node *p;
798 struct net_device *dev;
799 struct hlist_head *head = dev_index_hash(net, ifindex);
801 hlist_for_each_entry_rcu(dev, p, head, index_hlist)
802 if (dev->ifindex == ifindex)
803 return dev;
805 return NULL;
807 EXPORT_SYMBOL(dev_get_by_index_rcu);
811 * dev_get_by_index - find a device by its ifindex
812 * @net: the applicable net namespace
813 * @ifindex: index of device
815 * Search for an interface by index. Returns NULL if the device
816 * is not found or a pointer to the device. The device returned has
817 * had a reference added and the pointer is safe until the user calls
818 * dev_put to indicate they have finished with it.
821 struct net_device *dev_get_by_index(struct net *net, int ifindex)
823 struct net_device *dev;
825 rcu_read_lock();
826 dev = dev_get_by_index_rcu(net, ifindex);
827 if (dev)
828 dev_hold(dev);
829 rcu_read_unlock();
830 return dev;
832 EXPORT_SYMBOL(dev_get_by_index);
835 * dev_getbyhwaddr_rcu - find a device by its hardware address
836 * @net: the applicable net namespace
837 * @type: media type of device
838 * @ha: hardware address
840 * Search for an interface by MAC address. Returns NULL if the device
841 * is not found or a pointer to the device.
842 * The caller must hold RCU or RTNL.
843 * The returned device has not had its ref count increased
844 * and the caller must therefore be careful about locking
848 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
849 const char *ha)
851 struct net_device *dev;
853 for_each_netdev_rcu(net, dev)
854 if (dev->type == type &&
855 !memcmp(dev->dev_addr, ha, dev->addr_len))
856 return dev;
858 return NULL;
860 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
862 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
864 struct net_device *dev;
866 ASSERT_RTNL();
867 for_each_netdev(net, dev)
868 if (dev->type == type)
869 return dev;
871 return NULL;
873 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
875 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
877 struct net_device *dev, *ret = NULL;
879 rcu_read_lock();
880 for_each_netdev_rcu(net, dev)
881 if (dev->type == type) {
882 dev_hold(dev);
883 ret = dev;
884 break;
886 rcu_read_unlock();
887 return ret;
889 EXPORT_SYMBOL(dev_getfirstbyhwtype);
892 * dev_get_by_flags_rcu - find any device with given flags
893 * @net: the applicable net namespace
894 * @if_flags: IFF_* values
895 * @mask: bitmask of bits in if_flags to check
897 * Search for any interface with the given flags. Returns NULL if a device
898 * is not found or a pointer to the device. Must be called inside
899 * rcu_read_lock(), and result refcount is unchanged.
902 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
903 unsigned short mask)
905 struct net_device *dev, *ret;
907 ret = NULL;
908 for_each_netdev_rcu(net, dev) {
909 if (((dev->flags ^ if_flags) & mask) == 0) {
910 ret = dev;
911 break;
914 return ret;
916 EXPORT_SYMBOL(dev_get_by_flags_rcu);
919 * dev_valid_name - check if name is okay for network device
920 * @name: name string
922 * Network device names need to be valid file names to
923 * to allow sysfs to work. We also disallow any kind of
924 * whitespace.
926 bool dev_valid_name(const char *name)
928 if (*name == '\0')
929 return false;
930 if (strlen(name) >= IFNAMSIZ)
931 return false;
932 if (!strcmp(name, ".") || !strcmp(name, ".."))
933 return false;
935 while (*name) {
936 if (*name == '/' || isspace(*name))
937 return false;
938 name++;
940 return true;
942 EXPORT_SYMBOL(dev_valid_name);
945 * __dev_alloc_name - allocate a name for a device
946 * @net: network namespace to allocate the device name in
947 * @name: name format string
948 * @buf: scratch buffer and result name string
950 * Passed a format string - eg "lt%d" it will try and find a suitable
951 * id. It scans list of devices to build up a free map, then chooses
952 * the first empty slot. The caller must hold the dev_base or rtnl lock
953 * while allocating the name and adding the device in order to avoid
954 * duplicates.
955 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
956 * Returns the number of the unit assigned or a negative errno code.
959 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
961 int i = 0;
962 const char *p;
963 const int max_netdevices = 8*PAGE_SIZE;
964 unsigned long *inuse;
965 struct net_device *d;
967 p = strnchr(name, IFNAMSIZ-1, '%');
968 if (p) {
970 * Verify the string as this thing may have come from
971 * the user. There must be either one "%d" and no other "%"
972 * characters.
974 if (p[1] != 'd' || strchr(p + 2, '%'))
975 return -EINVAL;
977 /* Use one page as a bit array of possible slots */
978 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
979 if (!inuse)
980 return -ENOMEM;
982 for_each_netdev(net, d) {
983 if (!sscanf(d->name, name, &i))
984 continue;
985 if (i < 0 || i >= max_netdevices)
986 continue;
988 /* avoid cases where sscanf is not exact inverse of printf */
989 snprintf(buf, IFNAMSIZ, name, i);
990 if (!strncmp(buf, d->name, IFNAMSIZ))
991 set_bit(i, inuse);
994 i = find_first_zero_bit(inuse, max_netdevices);
995 free_page((unsigned long) inuse);
998 if (buf != name)
999 snprintf(buf, IFNAMSIZ, name, i);
1000 if (!__dev_get_by_name(net, buf))
1001 return i;
1003 /* It is possible to run out of possible slots
1004 * when the name is long and there isn't enough space left
1005 * for the digits, or if all bits are used.
1007 return -ENFILE;
1011 * dev_alloc_name - allocate a name for a device
1012 * @dev: device
1013 * @name: name format string
1015 * Passed a format string - eg "lt%d" it will try and find a suitable
1016 * id. It scans list of devices to build up a free map, then chooses
1017 * the first empty slot. The caller must hold the dev_base or rtnl lock
1018 * while allocating the name and adding the device in order to avoid
1019 * duplicates.
1020 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1021 * Returns the number of the unit assigned or a negative errno code.
1024 int dev_alloc_name(struct net_device *dev, const char *name)
1026 char buf[IFNAMSIZ];
1027 struct net *net;
1028 int ret;
1030 BUG_ON(!dev_net(dev));
1031 net = dev_net(dev);
1032 ret = __dev_alloc_name(net, name, buf);
1033 if (ret >= 0)
1034 strlcpy(dev->name, buf, IFNAMSIZ);
1035 return ret;
1037 EXPORT_SYMBOL(dev_alloc_name);
1039 static int dev_alloc_name_ns(struct net *net,
1040 struct net_device *dev,
1041 const char *name)
1043 char buf[IFNAMSIZ];
1044 int ret;
1046 ret = __dev_alloc_name(net, name, buf);
1047 if (ret >= 0)
1048 strlcpy(dev->name, buf, IFNAMSIZ);
1049 return ret;
1052 static int dev_get_valid_name(struct net *net,
1053 struct net_device *dev,
1054 const char *name)
1056 BUG_ON(!net);
1058 if (!dev_valid_name(name))
1059 return -EINVAL;
1061 if (strchr(name, '%'))
1062 return dev_alloc_name_ns(net, dev, name);
1063 else if (__dev_get_by_name(net, name))
1064 return -EEXIST;
1065 else if (dev->name != name)
1066 strlcpy(dev->name, name, IFNAMSIZ);
1068 return 0;
1072 * dev_change_name - change name of a device
1073 * @dev: device
1074 * @newname: name (or format string) must be at least IFNAMSIZ
1076 * Change name of a device, can pass format strings "eth%d".
1077 * for wildcarding.
1079 int dev_change_name(struct net_device *dev, const char *newname)
1081 char oldname[IFNAMSIZ];
1082 int err = 0;
1083 int ret;
1084 struct net *net;
1086 ASSERT_RTNL();
1087 BUG_ON(!dev_net(dev));
1089 net = dev_net(dev);
1090 if (dev->flags & IFF_UP)
1091 return -EBUSY;
1093 write_seqcount_begin(&devnet_rename_seq);
1095 if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1096 write_seqcount_end(&devnet_rename_seq);
1097 return 0;
1100 memcpy(oldname, dev->name, IFNAMSIZ);
1102 err = dev_get_valid_name(net, dev, newname);
1103 if (err < 0) {
1104 write_seqcount_end(&devnet_rename_seq);
1105 return err;
1108 rollback:
1109 ret = device_rename(&dev->dev, dev->name);
1110 if (ret) {
1111 memcpy(dev->name, oldname, IFNAMSIZ);
1112 write_seqcount_end(&devnet_rename_seq);
1113 return ret;
1116 write_seqcount_end(&devnet_rename_seq);
1118 write_lock_bh(&dev_base_lock);
1119 hlist_del_rcu(&dev->name_hlist);
1120 write_unlock_bh(&dev_base_lock);
1122 synchronize_rcu();
1124 write_lock_bh(&dev_base_lock);
1125 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1126 write_unlock_bh(&dev_base_lock);
1128 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1129 ret = notifier_to_errno(ret);
1131 if (ret) {
1132 /* err >= 0 after dev_alloc_name() or stores the first errno */
1133 if (err >= 0) {
1134 err = ret;
1135 write_seqcount_begin(&devnet_rename_seq);
1136 memcpy(dev->name, oldname, IFNAMSIZ);
1137 goto rollback;
1138 } else {
1139 pr_err("%s: name change rollback failed: %d\n",
1140 dev->name, ret);
1144 return err;
1148 * dev_set_alias - change ifalias of a device
1149 * @dev: device
1150 * @alias: name up to IFALIASZ
1151 * @len: limit of bytes to copy from info
1153 * Set ifalias for a device,
1155 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1157 char *new_ifalias;
1159 ASSERT_RTNL();
1161 if (len >= IFALIASZ)
1162 return -EINVAL;
1164 if (!len) {
1165 kfree(dev->ifalias);
1166 dev->ifalias = NULL;
1167 return 0;
1170 new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1171 if (!new_ifalias)
1172 return -ENOMEM;
1173 dev->ifalias = new_ifalias;
1175 strlcpy(dev->ifalias, alias, len+1);
1176 return len;
1181 * netdev_features_change - device changes features
1182 * @dev: device to cause notification
1184 * Called to indicate a device has changed features.
1186 void netdev_features_change(struct net_device *dev)
1188 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1190 EXPORT_SYMBOL(netdev_features_change);
1193 * netdev_state_change - device changes state
1194 * @dev: device to cause notification
1196 * Called to indicate a device has changed state. This function calls
1197 * the notifier chains for netdev_chain and sends a NEWLINK message
1198 * to the routing socket.
1200 void netdev_state_change(struct net_device *dev)
1202 if (dev->flags & IFF_UP) {
1203 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1204 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1207 EXPORT_SYMBOL(netdev_state_change);
1210 * netdev_notify_peers - notify network peers about existence of @dev
1211 * @dev: network device
1213 * Generate traffic such that interested network peers are aware of
1214 * @dev, such as by generating a gratuitous ARP. This may be used when
1215 * a device wants to inform the rest of the network about some sort of
1216 * reconfiguration such as a failover event or virtual machine
1217 * migration.
1219 void netdev_notify_peers(struct net_device *dev)
1221 rtnl_lock();
1222 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1223 rtnl_unlock();
1225 EXPORT_SYMBOL(netdev_notify_peers);
1227 static int __dev_open(struct net_device *dev)
1229 const struct net_device_ops *ops = dev->netdev_ops;
1230 int ret;
1232 ASSERT_RTNL();
1234 if (!netif_device_present(dev))
1235 return -ENODEV;
1237 /* Block netpoll from trying to do any rx path servicing.
1238 * If we don't do this there is a chance ndo_poll_controller
1239 * or ndo_poll may be running while we open the device
1241 ret = netpoll_rx_disable(dev);
1242 if (ret)
1243 return ret;
1245 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1246 ret = notifier_to_errno(ret);
1247 if (ret)
1248 return ret;
1250 set_bit(__LINK_STATE_START, &dev->state);
1252 if (ops->ndo_validate_addr)
1253 ret = ops->ndo_validate_addr(dev);
1255 if (!ret && ops->ndo_open)
1256 ret = ops->ndo_open(dev);
1258 netpoll_rx_enable(dev);
1260 if (ret)
1261 clear_bit(__LINK_STATE_START, &dev->state);
1262 else {
1263 dev->flags |= IFF_UP;
1264 net_dmaengine_get();
1265 dev_set_rx_mode(dev);
1266 dev_activate(dev);
1267 add_device_randomness(dev->dev_addr, dev->addr_len);
1270 return ret;
1274 * dev_open - prepare an interface for use.
1275 * @dev: device to open
1277 * Takes a device from down to up state. The device's private open
1278 * function is invoked and then the multicast lists are loaded. Finally
1279 * the device is moved into the up state and a %NETDEV_UP message is
1280 * sent to the netdev notifier chain.
1282 * Calling this function on an active interface is a nop. On a failure
1283 * a negative errno code is returned.
1285 int dev_open(struct net_device *dev)
1287 int ret;
1289 if (dev->flags & IFF_UP)
1290 return 0;
1292 ret = __dev_open(dev);
1293 if (ret < 0)
1294 return ret;
1296 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1297 call_netdevice_notifiers(NETDEV_UP, dev);
1299 return ret;
1301 EXPORT_SYMBOL(dev_open);
1303 static int __dev_close_many(struct list_head *head)
1305 struct net_device *dev;
1307 ASSERT_RTNL();
1308 might_sleep();
1310 list_for_each_entry(dev, head, unreg_list) {
1311 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1313 clear_bit(__LINK_STATE_START, &dev->state);
1315 /* Synchronize to scheduled poll. We cannot touch poll list, it
1316 * can be even on different cpu. So just clear netif_running().
1318 * dev->stop() will invoke napi_disable() on all of it's
1319 * napi_struct instances on this device.
1321 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1324 dev_deactivate_many(head);
1326 list_for_each_entry(dev, head, unreg_list) {
1327 const struct net_device_ops *ops = dev->netdev_ops;
1330 * Call the device specific close. This cannot fail.
1331 * Only if device is UP
1333 * We allow it to be called even after a DETACH hot-plug
1334 * event.
1336 if (ops->ndo_stop)
1337 ops->ndo_stop(dev);
1339 dev->flags &= ~IFF_UP;
1340 net_dmaengine_put();
1343 return 0;
1346 static int __dev_close(struct net_device *dev)
1348 int retval;
1349 LIST_HEAD(single);
1351 /* Temporarily disable netpoll until the interface is down */
1352 retval = netpoll_rx_disable(dev);
1353 if (retval)
1354 return retval;
1356 list_add(&dev->unreg_list, &single);
1357 retval = __dev_close_many(&single);
1358 list_del(&single);
1360 netpoll_rx_enable(dev);
1361 return retval;
1364 static int dev_close_many(struct list_head *head)
1366 struct net_device *dev, *tmp;
1367 LIST_HEAD(tmp_list);
1369 list_for_each_entry_safe(dev, tmp, head, unreg_list)
1370 if (!(dev->flags & IFF_UP))
1371 list_move(&dev->unreg_list, &tmp_list);
1373 __dev_close_many(head);
1375 list_for_each_entry(dev, head, unreg_list) {
1376 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1377 call_netdevice_notifiers(NETDEV_DOWN, dev);
1380 /* rollback_registered_many needs the complete original list */
1381 list_splice(&tmp_list, head);
1382 return 0;
1386 * dev_close - shutdown an interface.
1387 * @dev: device to shutdown
1389 * This function moves an active device into down state. A
1390 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1391 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1392 * chain.
1394 int dev_close(struct net_device *dev)
1396 int ret = 0;
1397 if (dev->flags & IFF_UP) {
1398 LIST_HEAD(single);
1400 /* Block netpoll rx while the interface is going down */
1401 ret = netpoll_rx_disable(dev);
1402 if (ret)
1403 return ret;
1405 list_add(&dev->unreg_list, &single);
1406 dev_close_many(&single);
1407 list_del(&single);
1409 netpoll_rx_enable(dev);
1411 return ret;
1413 EXPORT_SYMBOL(dev_close);
1417 * dev_disable_lro - disable Large Receive Offload on a device
1418 * @dev: device
1420 * Disable Large Receive Offload (LRO) on a net device. Must be
1421 * called under RTNL. This is needed if received packets may be
1422 * forwarded to another interface.
1424 void dev_disable_lro(struct net_device *dev)
1427 * If we're trying to disable lro on a vlan device
1428 * use the underlying physical device instead
1430 if (is_vlan_dev(dev))
1431 dev = vlan_dev_real_dev(dev);
1433 dev->wanted_features &= ~NETIF_F_LRO;
1434 netdev_update_features(dev);
1436 if (unlikely(dev->features & NETIF_F_LRO))
1437 netdev_WARN(dev, "failed to disable LRO!\n");
1439 EXPORT_SYMBOL(dev_disable_lro);
1442 static int dev_boot_phase = 1;
1445 * register_netdevice_notifier - register a network notifier block
1446 * @nb: notifier
1448 * Register a notifier to be called when network device events occur.
1449 * The notifier passed is linked into the kernel structures and must
1450 * not be reused until it has been unregistered. A negative errno code
1451 * is returned on a failure.
1453 * When registered all registration and up events are replayed
1454 * to the new notifier to allow device to have a race free
1455 * view of the network device list.
1458 int register_netdevice_notifier(struct notifier_block *nb)
1460 struct net_device *dev;
1461 struct net_device *last;
1462 struct net *net;
1463 int err;
1465 rtnl_lock();
1466 err = raw_notifier_chain_register(&netdev_chain, nb);
1467 if (err)
1468 goto unlock;
1469 if (dev_boot_phase)
1470 goto unlock;
1471 for_each_net(net) {
1472 for_each_netdev(net, dev) {
1473 err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1474 err = notifier_to_errno(err);
1475 if (err)
1476 goto rollback;
1478 if (!(dev->flags & IFF_UP))
1479 continue;
1481 nb->notifier_call(nb, NETDEV_UP, dev);
1485 unlock:
1486 rtnl_unlock();
1487 return err;
1489 rollback:
1490 last = dev;
1491 for_each_net(net) {
1492 for_each_netdev(net, dev) {
1493 if (dev == last)
1494 goto outroll;
1496 if (dev->flags & IFF_UP) {
1497 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1498 nb->notifier_call(nb, NETDEV_DOWN, dev);
1500 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1504 outroll:
1505 raw_notifier_chain_unregister(&netdev_chain, nb);
1506 goto unlock;
1508 EXPORT_SYMBOL(register_netdevice_notifier);
1511 * unregister_netdevice_notifier - unregister a network notifier block
1512 * @nb: notifier
1514 * Unregister a notifier previously registered by
1515 * register_netdevice_notifier(). The notifier is unlinked into the
1516 * kernel structures and may then be reused. A negative errno code
1517 * is returned on a failure.
1519 * After unregistering unregister and down device events are synthesized
1520 * for all devices on the device list to the removed notifier to remove
1521 * the need for special case cleanup code.
1524 int unregister_netdevice_notifier(struct notifier_block *nb)
1526 struct net_device *dev;
1527 struct net *net;
1528 int err;
1530 rtnl_lock();
1531 err = raw_notifier_chain_unregister(&netdev_chain, nb);
1532 if (err)
1533 goto unlock;
1535 for_each_net(net) {
1536 for_each_netdev(net, dev) {
1537 if (dev->flags & IFF_UP) {
1538 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1539 nb->notifier_call(nb, NETDEV_DOWN, dev);
1541 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1544 unlock:
1545 rtnl_unlock();
1546 return err;
1548 EXPORT_SYMBOL(unregister_netdevice_notifier);
1551 * call_netdevice_notifiers - call all network notifier blocks
1552 * @val: value passed unmodified to notifier function
1553 * @dev: net_device pointer passed unmodified to notifier function
1555 * Call all network notifier blocks. Parameters and return value
1556 * are as for raw_notifier_call_chain().
1559 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1561 ASSERT_RTNL();
1562 return raw_notifier_call_chain(&netdev_chain, val, dev);
1564 EXPORT_SYMBOL(call_netdevice_notifiers);
1566 static struct static_key netstamp_needed __read_mostly;
1567 #ifdef HAVE_JUMP_LABEL
1568 /* We are not allowed to call static_key_slow_dec() from irq context
1569 * If net_disable_timestamp() is called from irq context, defer the
1570 * static_key_slow_dec() calls.
1572 static atomic_t netstamp_needed_deferred;
1573 #endif
1575 void net_enable_timestamp(void)
1577 #ifdef HAVE_JUMP_LABEL
1578 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1580 if (deferred) {
1581 while (--deferred)
1582 static_key_slow_dec(&netstamp_needed);
1583 return;
1585 #endif
1586 WARN_ON(in_interrupt());
1587 static_key_slow_inc(&netstamp_needed);
1589 EXPORT_SYMBOL(net_enable_timestamp);
1591 void net_disable_timestamp(void)
1593 #ifdef HAVE_JUMP_LABEL
1594 if (in_interrupt()) {
1595 atomic_inc(&netstamp_needed_deferred);
1596 return;
1598 #endif
1599 static_key_slow_dec(&netstamp_needed);
1601 EXPORT_SYMBOL(net_disable_timestamp);
1603 static inline void net_timestamp_set(struct sk_buff *skb)
1605 skb->tstamp.tv64 = 0;
1606 if (static_key_false(&netstamp_needed))
1607 __net_timestamp(skb);
1610 #define net_timestamp_check(COND, SKB) \
1611 if (static_key_false(&netstamp_needed)) { \
1612 if ((COND) && !(SKB)->tstamp.tv64) \
1613 __net_timestamp(SKB); \
1616 static inline bool is_skb_forwardable(struct net_device *dev,
1617 struct sk_buff *skb)
1619 unsigned int len;
1621 if (!(dev->flags & IFF_UP))
1622 return false;
1624 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1625 if (skb->len <= len)
1626 return true;
1628 /* if TSO is enabled, we don't care about the length as the packet
1629 * could be forwarded without being segmented before
1631 if (skb_is_gso(skb))
1632 return true;
1634 return false;
1638 * dev_forward_skb - loopback an skb to another netif
1640 * @dev: destination network device
1641 * @skb: buffer to forward
1643 * return values:
1644 * NET_RX_SUCCESS (no congestion)
1645 * NET_RX_DROP (packet was dropped, but freed)
1647 * dev_forward_skb can be used for injecting an skb from the
1648 * start_xmit function of one device into the receive queue
1649 * of another device.
1651 * The receiving device may be in another namespace, so
1652 * we have to clear all information in the skb that could
1653 * impact namespace isolation.
1655 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1657 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1658 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1659 atomic_long_inc(&dev->rx_dropped);
1660 kfree_skb(skb);
1661 return NET_RX_DROP;
1665 skb_orphan(skb);
1666 nf_reset(skb);
1668 if (unlikely(!is_skb_forwardable(dev, skb))) {
1669 atomic_long_inc(&dev->rx_dropped);
1670 kfree_skb(skb);
1671 return NET_RX_DROP;
1673 skb->skb_iif = 0;
1674 skb->dev = dev;
1675 skb_dst_drop(skb);
1676 skb->tstamp.tv64 = 0;
1677 skb->pkt_type = PACKET_HOST;
1678 skb->protocol = eth_type_trans(skb, dev);
1679 skb->mark = 0;
1680 secpath_reset(skb);
1681 nf_reset(skb);
1682 return netif_rx(skb);
1684 EXPORT_SYMBOL_GPL(dev_forward_skb);
1686 static inline int deliver_skb(struct sk_buff *skb,
1687 struct packet_type *pt_prev,
1688 struct net_device *orig_dev)
1690 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1691 return -ENOMEM;
1692 atomic_inc(&skb->users);
1693 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1696 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1698 if (!ptype->af_packet_priv || !skb->sk)
1699 return false;
1701 if (ptype->id_match)
1702 return ptype->id_match(ptype, skb->sk);
1703 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1704 return true;
1706 return false;
1710 * Support routine. Sends outgoing frames to any network
1711 * taps currently in use.
1714 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1716 struct packet_type *ptype;
1717 struct sk_buff *skb2 = NULL;
1718 struct packet_type *pt_prev = NULL;
1720 rcu_read_lock();
1721 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1722 /* Never send packets back to the socket
1723 * they originated from - MvS (miquels@drinkel.ow.org)
1725 if ((ptype->dev == dev || !ptype->dev) &&
1726 (!skb_loop_sk(ptype, skb))) {
1727 if (pt_prev) {
1728 deliver_skb(skb2, pt_prev, skb->dev);
1729 pt_prev = ptype;
1730 continue;
1733 skb2 = skb_clone(skb, GFP_ATOMIC);
1734 if (!skb2)
1735 break;
1737 net_timestamp_set(skb2);
1739 /* skb->nh should be correctly
1740 set by sender, so that the second statement is
1741 just protection against buggy protocols.
1743 skb_reset_mac_header(skb2);
1745 if (skb_network_header(skb2) < skb2->data ||
1746 skb2->network_header > skb2->tail) {
1747 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1748 ntohs(skb2->protocol),
1749 dev->name);
1750 skb_reset_network_header(skb2);
1753 skb2->transport_header = skb2->network_header;
1754 skb2->pkt_type = PACKET_OUTGOING;
1755 pt_prev = ptype;
1758 if (pt_prev)
1759 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1760 rcu_read_unlock();
1764 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1765 * @dev: Network device
1766 * @txq: number of queues available
1768 * If real_num_tx_queues is changed the tc mappings may no longer be
1769 * valid. To resolve this verify the tc mapping remains valid and if
1770 * not NULL the mapping. With no priorities mapping to this
1771 * offset/count pair it will no longer be used. In the worst case TC0
1772 * is invalid nothing can be done so disable priority mappings. If is
1773 * expected that drivers will fix this mapping if they can before
1774 * calling netif_set_real_num_tx_queues.
1776 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1778 int i;
1779 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1781 /* If TC0 is invalidated disable TC mapping */
1782 if (tc->offset + tc->count > txq) {
1783 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1784 dev->num_tc = 0;
1785 return;
1788 /* Invalidated prio to tc mappings set to TC0 */
1789 for (i = 1; i < TC_BITMASK + 1; i++) {
1790 int q = netdev_get_prio_tc_map(dev, i);
1792 tc = &dev->tc_to_txq[q];
1793 if (tc->offset + tc->count > txq) {
1794 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1795 i, q);
1796 netdev_set_prio_tc_map(dev, i, 0);
1801 #ifdef CONFIG_XPS
1802 static DEFINE_MUTEX(xps_map_mutex);
1803 #define xmap_dereference(P) \
1804 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1806 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1807 int cpu, u16 index)
1809 struct xps_map *map = NULL;
1810 int pos;
1812 if (dev_maps)
1813 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1815 for (pos = 0; map && pos < map->len; pos++) {
1816 if (map->queues[pos] == index) {
1817 if (map->len > 1) {
1818 map->queues[pos] = map->queues[--map->len];
1819 } else {
1820 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1821 kfree_rcu(map, rcu);
1822 map = NULL;
1824 break;
1828 return map;
1831 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1833 struct xps_dev_maps *dev_maps;
1834 int cpu, i;
1835 bool active = false;
1837 mutex_lock(&xps_map_mutex);
1838 dev_maps = xmap_dereference(dev->xps_maps);
1840 if (!dev_maps)
1841 goto out_no_maps;
1843 for_each_possible_cpu(cpu) {
1844 for (i = index; i < dev->num_tx_queues; i++) {
1845 if (!remove_xps_queue(dev_maps, cpu, i))
1846 break;
1848 if (i == dev->num_tx_queues)
1849 active = true;
1852 if (!active) {
1853 RCU_INIT_POINTER(dev->xps_maps, NULL);
1854 kfree_rcu(dev_maps, rcu);
1857 for (i = index; i < dev->num_tx_queues; i++)
1858 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1859 NUMA_NO_NODE);
1861 out_no_maps:
1862 mutex_unlock(&xps_map_mutex);
1865 static struct xps_map *expand_xps_map(struct xps_map *map,
1866 int cpu, u16 index)
1868 struct xps_map *new_map;
1869 int alloc_len = XPS_MIN_MAP_ALLOC;
1870 int i, pos;
1872 for (pos = 0; map && pos < map->len; pos++) {
1873 if (map->queues[pos] != index)
1874 continue;
1875 return map;
1878 /* Need to add queue to this CPU's existing map */
1879 if (map) {
1880 if (pos < map->alloc_len)
1881 return map;
1883 alloc_len = map->alloc_len * 2;
1886 /* Need to allocate new map to store queue on this CPU's map */
1887 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1888 cpu_to_node(cpu));
1889 if (!new_map)
1890 return NULL;
1892 for (i = 0; i < pos; i++)
1893 new_map->queues[i] = map->queues[i];
1894 new_map->alloc_len = alloc_len;
1895 new_map->len = pos;
1897 return new_map;
1900 int netif_set_xps_queue(struct net_device *dev, struct cpumask *mask, u16 index)
1902 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1903 struct xps_map *map, *new_map;
1904 int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1905 int cpu, numa_node_id = -2;
1906 bool active = false;
1908 mutex_lock(&xps_map_mutex);
1910 dev_maps = xmap_dereference(dev->xps_maps);
1912 /* allocate memory for queue storage */
1913 for_each_online_cpu(cpu) {
1914 if (!cpumask_test_cpu(cpu, mask))
1915 continue;
1917 if (!new_dev_maps)
1918 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1919 if (!new_dev_maps)
1920 return -ENOMEM;
1922 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1923 NULL;
1925 map = expand_xps_map(map, cpu, index);
1926 if (!map)
1927 goto error;
1929 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1932 if (!new_dev_maps)
1933 goto out_no_new_maps;
1935 for_each_possible_cpu(cpu) {
1936 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1937 /* add queue to CPU maps */
1938 int pos = 0;
1940 map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1941 while ((pos < map->len) && (map->queues[pos] != index))
1942 pos++;
1944 if (pos == map->len)
1945 map->queues[map->len++] = index;
1946 #ifdef CONFIG_NUMA
1947 if (numa_node_id == -2)
1948 numa_node_id = cpu_to_node(cpu);
1949 else if (numa_node_id != cpu_to_node(cpu))
1950 numa_node_id = -1;
1951 #endif
1952 } else if (dev_maps) {
1953 /* fill in the new device map from the old device map */
1954 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1955 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1960 rcu_assign_pointer(dev->xps_maps, new_dev_maps);
1962 /* Cleanup old maps */
1963 if (dev_maps) {
1964 for_each_possible_cpu(cpu) {
1965 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1966 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1967 if (map && map != new_map)
1968 kfree_rcu(map, rcu);
1971 kfree_rcu(dev_maps, rcu);
1974 dev_maps = new_dev_maps;
1975 active = true;
1977 out_no_new_maps:
1978 /* update Tx queue numa node */
1979 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
1980 (numa_node_id >= 0) ? numa_node_id :
1981 NUMA_NO_NODE);
1983 if (!dev_maps)
1984 goto out_no_maps;
1986 /* removes queue from unused CPUs */
1987 for_each_possible_cpu(cpu) {
1988 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
1989 continue;
1991 if (remove_xps_queue(dev_maps, cpu, index))
1992 active = true;
1995 /* free map if not active */
1996 if (!active) {
1997 RCU_INIT_POINTER(dev->xps_maps, NULL);
1998 kfree_rcu(dev_maps, rcu);
2001 out_no_maps:
2002 mutex_unlock(&xps_map_mutex);
2004 return 0;
2005 error:
2006 /* remove any maps that we added */
2007 for_each_possible_cpu(cpu) {
2008 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2009 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2010 NULL;
2011 if (new_map && new_map != map)
2012 kfree(new_map);
2015 mutex_unlock(&xps_map_mutex);
2017 kfree(new_dev_maps);
2018 return -ENOMEM;
2020 EXPORT_SYMBOL(netif_set_xps_queue);
2022 #endif
2024 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2025 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2027 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2029 int rc;
2031 if (txq < 1 || txq > dev->num_tx_queues)
2032 return -EINVAL;
2034 if (dev->reg_state == NETREG_REGISTERED ||
2035 dev->reg_state == NETREG_UNREGISTERING) {
2036 ASSERT_RTNL();
2038 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2039 txq);
2040 if (rc)
2041 return rc;
2043 if (dev->num_tc)
2044 netif_setup_tc(dev, txq);
2046 if (txq < dev->real_num_tx_queues) {
2047 qdisc_reset_all_tx_gt(dev, txq);
2048 #ifdef CONFIG_XPS
2049 netif_reset_xps_queues_gt(dev, txq);
2050 #endif
2054 dev->real_num_tx_queues = txq;
2055 return 0;
2057 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2059 #ifdef CONFIG_RPS
2061 * netif_set_real_num_rx_queues - set actual number of RX queues used
2062 * @dev: Network device
2063 * @rxq: Actual number of RX queues
2065 * This must be called either with the rtnl_lock held or before
2066 * registration of the net device. Returns 0 on success, or a
2067 * negative error code. If called before registration, it always
2068 * succeeds.
2070 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2072 int rc;
2074 if (rxq < 1 || rxq > dev->num_rx_queues)
2075 return -EINVAL;
2077 if (dev->reg_state == NETREG_REGISTERED) {
2078 ASSERT_RTNL();
2080 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2081 rxq);
2082 if (rc)
2083 return rc;
2086 dev->real_num_rx_queues = rxq;
2087 return 0;
2089 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2090 #endif
2093 * netif_get_num_default_rss_queues - default number of RSS queues
2095 * This routine should set an upper limit on the number of RSS queues
2096 * used by default by multiqueue devices.
2098 int netif_get_num_default_rss_queues(void)
2100 return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2102 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2104 static inline void __netif_reschedule(struct Qdisc *q)
2106 struct softnet_data *sd;
2107 unsigned long flags;
2109 local_irq_save(flags);
2110 sd = &__get_cpu_var(softnet_data);
2111 q->next_sched = NULL;
2112 *sd->output_queue_tailp = q;
2113 sd->output_queue_tailp = &q->next_sched;
2114 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2115 local_irq_restore(flags);
2118 void __netif_schedule(struct Qdisc *q)
2120 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2121 __netif_reschedule(q);
2123 EXPORT_SYMBOL(__netif_schedule);
2125 void dev_kfree_skb_irq(struct sk_buff *skb)
2127 if (atomic_dec_and_test(&skb->users)) {
2128 struct softnet_data *sd;
2129 unsigned long flags;
2131 local_irq_save(flags);
2132 sd = &__get_cpu_var(softnet_data);
2133 skb->next = sd->completion_queue;
2134 sd->completion_queue = skb;
2135 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2136 local_irq_restore(flags);
2139 EXPORT_SYMBOL(dev_kfree_skb_irq);
2141 void dev_kfree_skb_any(struct sk_buff *skb)
2143 if (in_irq() || irqs_disabled())
2144 dev_kfree_skb_irq(skb);
2145 else
2146 dev_kfree_skb(skb);
2148 EXPORT_SYMBOL(dev_kfree_skb_any);
2152 * netif_device_detach - mark device as removed
2153 * @dev: network device
2155 * Mark device as removed from system and therefore no longer available.
2157 void netif_device_detach(struct net_device *dev)
2159 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2160 netif_running(dev)) {
2161 netif_tx_stop_all_queues(dev);
2164 EXPORT_SYMBOL(netif_device_detach);
2167 * netif_device_attach - mark device as attached
2168 * @dev: network device
2170 * Mark device as attached from system and restart if needed.
2172 void netif_device_attach(struct net_device *dev)
2174 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2175 netif_running(dev)) {
2176 netif_tx_wake_all_queues(dev);
2177 __netdev_watchdog_up(dev);
2180 EXPORT_SYMBOL(netif_device_attach);
2182 static void skb_warn_bad_offload(const struct sk_buff *skb)
2184 static const netdev_features_t null_features = 0;
2185 struct net_device *dev = skb->dev;
2186 const char *driver = "";
2188 if (dev && dev->dev.parent)
2189 driver = dev_driver_string(dev->dev.parent);
2191 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2192 "gso_type=%d ip_summed=%d\n",
2193 driver, dev ? &dev->features : &null_features,
2194 skb->sk ? &skb->sk->sk_route_caps : &null_features,
2195 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2196 skb_shinfo(skb)->gso_type, skb->ip_summed);
2200 * Invalidate hardware checksum when packet is to be mangled, and
2201 * complete checksum manually on outgoing path.
2203 int skb_checksum_help(struct sk_buff *skb)
2205 __wsum csum;
2206 int ret = 0, offset;
2208 if (skb->ip_summed == CHECKSUM_COMPLETE)
2209 goto out_set_summed;
2211 if (unlikely(skb_shinfo(skb)->gso_size)) {
2212 skb_warn_bad_offload(skb);
2213 return -EINVAL;
2216 /* Before computing a checksum, we should make sure no frag could
2217 * be modified by an external entity : checksum could be wrong.
2219 if (skb_has_shared_frag(skb)) {
2220 ret = __skb_linearize(skb);
2221 if (ret)
2222 goto out;
2225 offset = skb_checksum_start_offset(skb);
2226 BUG_ON(offset >= skb_headlen(skb));
2227 csum = skb_checksum(skb, offset, skb->len - offset, 0);
2229 offset += skb->csum_offset;
2230 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2232 if (skb_cloned(skb) &&
2233 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2234 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2235 if (ret)
2236 goto out;
2239 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2240 out_set_summed:
2241 skb->ip_summed = CHECKSUM_NONE;
2242 out:
2243 return ret;
2245 EXPORT_SYMBOL(skb_checksum_help);
2248 * skb_mac_gso_segment - mac layer segmentation handler.
2249 * @skb: buffer to segment
2250 * @features: features for the output path (see dev->features)
2252 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2253 netdev_features_t features)
2255 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2256 struct packet_offload *ptype;
2257 __be16 type = skb->protocol;
2259 while (type == htons(ETH_P_8021Q)) {
2260 int vlan_depth = ETH_HLEN;
2261 struct vlan_hdr *vh;
2263 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
2264 return ERR_PTR(-EINVAL);
2266 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2267 type = vh->h_vlan_encapsulated_proto;
2268 vlan_depth += VLAN_HLEN;
2271 __skb_pull(skb, skb->mac_len);
2273 rcu_read_lock();
2274 list_for_each_entry_rcu(ptype, &offload_base, list) {
2275 if (ptype->type == type && ptype->callbacks.gso_segment) {
2276 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2277 int err;
2279 err = ptype->callbacks.gso_send_check(skb);
2280 segs = ERR_PTR(err);
2281 if (err || skb_gso_ok(skb, features))
2282 break;
2283 __skb_push(skb, (skb->data -
2284 skb_network_header(skb)));
2286 segs = ptype->callbacks.gso_segment(skb, features);
2287 break;
2290 rcu_read_unlock();
2292 __skb_push(skb, skb->data - skb_mac_header(skb));
2294 return segs;
2296 EXPORT_SYMBOL(skb_mac_gso_segment);
2299 /* openvswitch calls this on rx path, so we need a different check.
2301 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2303 if (tx_path)
2304 return skb->ip_summed != CHECKSUM_PARTIAL;
2305 else
2306 return skb->ip_summed == CHECKSUM_NONE;
2310 * __skb_gso_segment - Perform segmentation on skb.
2311 * @skb: buffer to segment
2312 * @features: features for the output path (see dev->features)
2313 * @tx_path: whether it is called in TX path
2315 * This function segments the given skb and returns a list of segments.
2317 * It may return NULL if the skb requires no segmentation. This is
2318 * only possible when GSO is used for verifying header integrity.
2320 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2321 netdev_features_t features, bool tx_path)
2323 if (unlikely(skb_needs_check(skb, tx_path))) {
2324 int err;
2326 skb_warn_bad_offload(skb);
2328 if (skb_header_cloned(skb) &&
2329 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2330 return ERR_PTR(err);
2333 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2334 skb_reset_mac_header(skb);
2335 skb_reset_mac_len(skb);
2337 return skb_mac_gso_segment(skb, features);
2339 EXPORT_SYMBOL(__skb_gso_segment);
2341 /* Take action when hardware reception checksum errors are detected. */
2342 #ifdef CONFIG_BUG
2343 void netdev_rx_csum_fault(struct net_device *dev)
2345 if (net_ratelimit()) {
2346 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2347 dump_stack();
2350 EXPORT_SYMBOL(netdev_rx_csum_fault);
2351 #endif
2353 /* Actually, we should eliminate this check as soon as we know, that:
2354 * 1. IOMMU is present and allows to map all the memory.
2355 * 2. No high memory really exists on this machine.
2358 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2360 #ifdef CONFIG_HIGHMEM
2361 int i;
2362 if (!(dev->features & NETIF_F_HIGHDMA)) {
2363 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2364 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2365 if (PageHighMem(skb_frag_page(frag)))
2366 return 1;
2370 if (PCI_DMA_BUS_IS_PHYS) {
2371 struct device *pdev = dev->dev.parent;
2373 if (!pdev)
2374 return 0;
2375 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2376 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2377 dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2378 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2379 return 1;
2382 #endif
2383 return 0;
2386 struct dev_gso_cb {
2387 void (*destructor)(struct sk_buff *skb);
2390 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2392 static void dev_gso_skb_destructor(struct sk_buff *skb)
2394 struct dev_gso_cb *cb;
2396 do {
2397 struct sk_buff *nskb = skb->next;
2399 skb->next = nskb->next;
2400 nskb->next = NULL;
2401 kfree_skb(nskb);
2402 } while (skb->next);
2404 cb = DEV_GSO_CB(skb);
2405 if (cb->destructor)
2406 cb->destructor(skb);
2410 * dev_gso_segment - Perform emulated hardware segmentation on skb.
2411 * @skb: buffer to segment
2412 * @features: device features as applicable to this skb
2414 * This function segments the given skb and stores the list of segments
2415 * in skb->next.
2417 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2419 struct sk_buff *segs;
2421 segs = skb_gso_segment(skb, features);
2423 /* Verifying header integrity only. */
2424 if (!segs)
2425 return 0;
2427 if (IS_ERR(segs))
2428 return PTR_ERR(segs);
2430 skb->next = segs;
2431 DEV_GSO_CB(skb)->destructor = skb->destructor;
2432 skb->destructor = dev_gso_skb_destructor;
2434 return 0;
2437 static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
2439 return ((features & NETIF_F_GEN_CSUM) ||
2440 ((features & NETIF_F_V4_CSUM) &&
2441 protocol == htons(ETH_P_IP)) ||
2442 ((features & NETIF_F_V6_CSUM) &&
2443 protocol == htons(ETH_P_IPV6)) ||
2444 ((features & NETIF_F_FCOE_CRC) &&
2445 protocol == htons(ETH_P_FCOE)));
2448 static netdev_features_t harmonize_features(struct sk_buff *skb,
2449 __be16 protocol, netdev_features_t features)
2451 if (skb->ip_summed != CHECKSUM_NONE &&
2452 !can_checksum_protocol(features, protocol)) {
2453 features &= ~NETIF_F_ALL_CSUM;
2454 features &= ~NETIF_F_SG;
2455 } else if (illegal_highdma(skb->dev, skb)) {
2456 features &= ~NETIF_F_SG;
2459 return features;
2462 netdev_features_t netif_skb_features(struct sk_buff *skb)
2464 __be16 protocol = skb->protocol;
2465 netdev_features_t features = skb->dev->features;
2467 if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2468 features &= ~NETIF_F_GSO_MASK;
2470 if (protocol == htons(ETH_P_8021Q)) {
2471 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2472 protocol = veh->h_vlan_encapsulated_proto;
2473 } else if (!vlan_tx_tag_present(skb)) {
2474 return harmonize_features(skb, protocol, features);
2477 features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2479 if (protocol != htons(ETH_P_8021Q)) {
2480 return harmonize_features(skb, protocol, features);
2481 } else {
2482 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2483 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2484 return harmonize_features(skb, protocol, features);
2487 EXPORT_SYMBOL(netif_skb_features);
2490 * Returns true if either:
2491 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
2492 * 2. skb is fragmented and the device does not support SG.
2494 static inline int skb_needs_linearize(struct sk_buff *skb,
2495 int features)
2497 return skb_is_nonlinear(skb) &&
2498 ((skb_has_frag_list(skb) &&
2499 !(features & NETIF_F_FRAGLIST)) ||
2500 (skb_shinfo(skb)->nr_frags &&
2501 !(features & NETIF_F_SG)));
2504 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2505 struct netdev_queue *txq)
2507 const struct net_device_ops *ops = dev->netdev_ops;
2508 int rc = NETDEV_TX_OK;
2509 unsigned int skb_len;
2511 if (likely(!skb->next)) {
2512 netdev_features_t features;
2515 * If device doesn't need skb->dst, release it right now while
2516 * its hot in this cpu cache
2518 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2519 skb_dst_drop(skb);
2521 features = netif_skb_features(skb);
2523 if (vlan_tx_tag_present(skb) &&
2524 !(features & NETIF_F_HW_VLAN_TX)) {
2525 skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2526 if (unlikely(!skb))
2527 goto out;
2529 skb->vlan_tci = 0;
2532 /* If encapsulation offload request, verify we are testing
2533 * hardware encapsulation features instead of standard
2534 * features for the netdev
2536 if (skb->encapsulation)
2537 features &= dev->hw_enc_features;
2539 if (netif_needs_gso(skb, features)) {
2540 if (unlikely(dev_gso_segment(skb, features)))
2541 goto out_kfree_skb;
2542 if (skb->next)
2543 goto gso;
2544 } else {
2545 if (skb_needs_linearize(skb, features) &&
2546 __skb_linearize(skb))
2547 goto out_kfree_skb;
2549 /* If packet is not checksummed and device does not
2550 * support checksumming for this protocol, complete
2551 * checksumming here.
2553 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2554 if (skb->encapsulation)
2555 skb_set_inner_transport_header(skb,
2556 skb_checksum_start_offset(skb));
2557 else
2558 skb_set_transport_header(skb,
2559 skb_checksum_start_offset(skb));
2560 if (!(features & NETIF_F_ALL_CSUM) &&
2561 skb_checksum_help(skb))
2562 goto out_kfree_skb;
2566 if (!list_empty(&ptype_all))
2567 dev_queue_xmit_nit(skb, dev);
2569 skb_len = skb->len;
2570 rc = ops->ndo_start_xmit(skb, dev);
2571 trace_net_dev_xmit(skb, rc, dev, skb_len);
2572 if (rc == NETDEV_TX_OK)
2573 txq_trans_update(txq);
2574 return rc;
2577 gso:
2578 do {
2579 struct sk_buff *nskb = skb->next;
2581 skb->next = nskb->next;
2582 nskb->next = NULL;
2585 * If device doesn't need nskb->dst, release it right now while
2586 * its hot in this cpu cache
2588 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2589 skb_dst_drop(nskb);
2591 if (!list_empty(&ptype_all))
2592 dev_queue_xmit_nit(nskb, dev);
2594 skb_len = nskb->len;
2595 rc = ops->ndo_start_xmit(nskb, dev);
2596 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2597 if (unlikely(rc != NETDEV_TX_OK)) {
2598 if (rc & ~NETDEV_TX_MASK)
2599 goto out_kfree_gso_skb;
2600 nskb->next = skb->next;
2601 skb->next = nskb;
2602 return rc;
2604 txq_trans_update(txq);
2605 if (unlikely(netif_xmit_stopped(txq) && skb->next))
2606 return NETDEV_TX_BUSY;
2607 } while (skb->next);
2609 out_kfree_gso_skb:
2610 if (likely(skb->next == NULL))
2611 skb->destructor = DEV_GSO_CB(skb)->destructor;
2612 out_kfree_skb:
2613 kfree_skb(skb);
2614 out:
2615 return rc;
2618 static void qdisc_pkt_len_init(struct sk_buff *skb)
2620 const struct skb_shared_info *shinfo = skb_shinfo(skb);
2622 qdisc_skb_cb(skb)->pkt_len = skb->len;
2624 /* To get more precise estimation of bytes sent on wire,
2625 * we add to pkt_len the headers size of all segments
2627 if (shinfo->gso_size) {
2628 unsigned int hdr_len;
2630 /* mac layer + network layer */
2631 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2633 /* + transport layer */
2634 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2635 hdr_len += tcp_hdrlen(skb);
2636 else
2637 hdr_len += sizeof(struct udphdr);
2638 qdisc_skb_cb(skb)->pkt_len += (shinfo->gso_segs - 1) * hdr_len;
2642 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2643 struct net_device *dev,
2644 struct netdev_queue *txq)
2646 spinlock_t *root_lock = qdisc_lock(q);
2647 bool contended;
2648 int rc;
2650 qdisc_pkt_len_init(skb);
2651 qdisc_calculate_pkt_len(skb, q);
2653 * Heuristic to force contended enqueues to serialize on a
2654 * separate lock before trying to get qdisc main lock.
2655 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2656 * and dequeue packets faster.
2658 contended = qdisc_is_running(q);
2659 if (unlikely(contended))
2660 spin_lock(&q->busylock);
2662 spin_lock(root_lock);
2663 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2664 kfree_skb(skb);
2665 rc = NET_XMIT_DROP;
2666 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2667 qdisc_run_begin(q)) {
2669 * This is a work-conserving queue; there are no old skbs
2670 * waiting to be sent out; and the qdisc is not running -
2671 * xmit the skb directly.
2673 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2674 skb_dst_force(skb);
2676 qdisc_bstats_update(q, skb);
2678 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2679 if (unlikely(contended)) {
2680 spin_unlock(&q->busylock);
2681 contended = false;
2683 __qdisc_run(q);
2684 } else
2685 qdisc_run_end(q);
2687 rc = NET_XMIT_SUCCESS;
2688 } else {
2689 skb_dst_force(skb);
2690 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2691 if (qdisc_run_begin(q)) {
2692 if (unlikely(contended)) {
2693 spin_unlock(&q->busylock);
2694 contended = false;
2696 __qdisc_run(q);
2699 spin_unlock(root_lock);
2700 if (unlikely(contended))
2701 spin_unlock(&q->busylock);
2702 return rc;
2705 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2706 static void skb_update_prio(struct sk_buff *skb)
2708 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2710 if (!skb->priority && skb->sk && map) {
2711 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2713 if (prioidx < map->priomap_len)
2714 skb->priority = map->priomap[prioidx];
2717 #else
2718 #define skb_update_prio(skb)
2719 #endif
2721 static DEFINE_PER_CPU(int, xmit_recursion);
2722 #define RECURSION_LIMIT 10
2725 * dev_loopback_xmit - loop back @skb
2726 * @skb: buffer to transmit
2728 int dev_loopback_xmit(struct sk_buff *skb)
2730 skb_reset_mac_header(skb);
2731 __skb_pull(skb, skb_network_offset(skb));
2732 skb->pkt_type = PACKET_LOOPBACK;
2733 skb->ip_summed = CHECKSUM_UNNECESSARY;
2734 WARN_ON(!skb_dst(skb));
2735 skb_dst_force(skb);
2736 netif_rx_ni(skb);
2737 return 0;
2739 EXPORT_SYMBOL(dev_loopback_xmit);
2742 * dev_queue_xmit - transmit a buffer
2743 * @skb: buffer to transmit
2745 * Queue a buffer for transmission to a network device. The caller must
2746 * have set the device and priority and built the buffer before calling
2747 * this function. The function can be called from an interrupt.
2749 * A negative errno code is returned on a failure. A success does not
2750 * guarantee the frame will be transmitted as it may be dropped due
2751 * to congestion or traffic shaping.
2753 * -----------------------------------------------------------------------------------
2754 * I notice this method can also return errors from the queue disciplines,
2755 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2756 * be positive.
2758 * Regardless of the return value, the skb is consumed, so it is currently
2759 * difficult to retry a send to this method. (You can bump the ref count
2760 * before sending to hold a reference for retry if you are careful.)
2762 * When calling this method, interrupts MUST be enabled. This is because
2763 * the BH enable code must have IRQs enabled so that it will not deadlock.
2764 * --BLG
2766 int dev_queue_xmit(struct sk_buff *skb)
2768 struct net_device *dev = skb->dev;
2769 struct netdev_queue *txq;
2770 struct Qdisc *q;
2771 int rc = -ENOMEM;
2773 skb_reset_mac_header(skb);
2775 /* Disable soft irqs for various locks below. Also
2776 * stops preemption for RCU.
2778 rcu_read_lock_bh();
2780 skb_update_prio(skb);
2782 txq = netdev_pick_tx(dev, skb);
2783 q = rcu_dereference_bh(txq->qdisc);
2785 #ifdef CONFIG_NET_CLS_ACT
2786 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2787 #endif
2788 trace_net_dev_queue(skb);
2789 if (q->enqueue) {
2790 rc = __dev_xmit_skb(skb, q, dev, txq);
2791 goto out;
2794 /* The device has no queue. Common case for software devices:
2795 loopback, all the sorts of tunnels...
2797 Really, it is unlikely that netif_tx_lock protection is necessary
2798 here. (f.e. loopback and IP tunnels are clean ignoring statistics
2799 counters.)
2800 However, it is possible, that they rely on protection
2801 made by us here.
2803 Check this and shot the lock. It is not prone from deadlocks.
2804 Either shot noqueue qdisc, it is even simpler 8)
2806 if (dev->flags & IFF_UP) {
2807 int cpu = smp_processor_id(); /* ok because BHs are off */
2809 if (txq->xmit_lock_owner != cpu) {
2811 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2812 goto recursion_alert;
2814 HARD_TX_LOCK(dev, txq, cpu);
2816 if (!netif_xmit_stopped(txq)) {
2817 __this_cpu_inc(xmit_recursion);
2818 rc = dev_hard_start_xmit(skb, dev, txq);
2819 __this_cpu_dec(xmit_recursion);
2820 if (dev_xmit_complete(rc)) {
2821 HARD_TX_UNLOCK(dev, txq);
2822 goto out;
2825 HARD_TX_UNLOCK(dev, txq);
2826 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2827 dev->name);
2828 } else {
2829 /* Recursion is detected! It is possible,
2830 * unfortunately
2832 recursion_alert:
2833 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2834 dev->name);
2838 rc = -ENETDOWN;
2839 rcu_read_unlock_bh();
2841 kfree_skb(skb);
2842 return rc;
2843 out:
2844 rcu_read_unlock_bh();
2845 return rc;
2847 EXPORT_SYMBOL(dev_queue_xmit);
2850 /*=======================================================================
2851 Receiver routines
2852 =======================================================================*/
2854 int netdev_max_backlog __read_mostly = 1000;
2855 EXPORT_SYMBOL(netdev_max_backlog);
2857 int netdev_tstamp_prequeue __read_mostly = 1;
2858 int netdev_budget __read_mostly = 300;
2859 int weight_p __read_mostly = 64; /* old backlog weight */
2861 /* Called with irq disabled */
2862 static inline void ____napi_schedule(struct softnet_data *sd,
2863 struct napi_struct *napi)
2865 list_add_tail(&napi->poll_list, &sd->poll_list);
2866 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2869 #ifdef CONFIG_RPS
2871 /* One global table that all flow-based protocols share. */
2872 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2873 EXPORT_SYMBOL(rps_sock_flow_table);
2875 struct static_key rps_needed __read_mostly;
2877 static struct rps_dev_flow *
2878 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2879 struct rps_dev_flow *rflow, u16 next_cpu)
2881 if (next_cpu != RPS_NO_CPU) {
2882 #ifdef CONFIG_RFS_ACCEL
2883 struct netdev_rx_queue *rxqueue;
2884 struct rps_dev_flow_table *flow_table;
2885 struct rps_dev_flow *old_rflow;
2886 u32 flow_id;
2887 u16 rxq_index;
2888 int rc;
2890 /* Should we steer this flow to a different hardware queue? */
2891 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2892 !(dev->features & NETIF_F_NTUPLE))
2893 goto out;
2894 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2895 if (rxq_index == skb_get_rx_queue(skb))
2896 goto out;
2898 rxqueue = dev->_rx + rxq_index;
2899 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2900 if (!flow_table)
2901 goto out;
2902 flow_id = skb->rxhash & flow_table->mask;
2903 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2904 rxq_index, flow_id);
2905 if (rc < 0)
2906 goto out;
2907 old_rflow = rflow;
2908 rflow = &flow_table->flows[flow_id];
2909 rflow->filter = rc;
2910 if (old_rflow->filter == rflow->filter)
2911 old_rflow->filter = RPS_NO_FILTER;
2912 out:
2913 #endif
2914 rflow->last_qtail =
2915 per_cpu(softnet_data, next_cpu).input_queue_head;
2918 rflow->cpu = next_cpu;
2919 return rflow;
2923 * get_rps_cpu is called from netif_receive_skb and returns the target
2924 * CPU from the RPS map of the receiving queue for a given skb.
2925 * rcu_read_lock must be held on entry.
2927 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2928 struct rps_dev_flow **rflowp)
2930 struct netdev_rx_queue *rxqueue;
2931 struct rps_map *map;
2932 struct rps_dev_flow_table *flow_table;
2933 struct rps_sock_flow_table *sock_flow_table;
2934 int cpu = -1;
2935 u16 tcpu;
2937 if (skb_rx_queue_recorded(skb)) {
2938 u16 index = skb_get_rx_queue(skb);
2939 if (unlikely(index >= dev->real_num_rx_queues)) {
2940 WARN_ONCE(dev->real_num_rx_queues > 1,
2941 "%s received packet on queue %u, but number "
2942 "of RX queues is %u\n",
2943 dev->name, index, dev->real_num_rx_queues);
2944 goto done;
2946 rxqueue = dev->_rx + index;
2947 } else
2948 rxqueue = dev->_rx;
2950 map = rcu_dereference(rxqueue->rps_map);
2951 if (map) {
2952 if (map->len == 1 &&
2953 !rcu_access_pointer(rxqueue->rps_flow_table)) {
2954 tcpu = map->cpus[0];
2955 if (cpu_online(tcpu))
2956 cpu = tcpu;
2957 goto done;
2959 } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2960 goto done;
2963 skb_reset_network_header(skb);
2964 if (!skb_get_rxhash(skb))
2965 goto done;
2967 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2968 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2969 if (flow_table && sock_flow_table) {
2970 u16 next_cpu;
2971 struct rps_dev_flow *rflow;
2973 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2974 tcpu = rflow->cpu;
2976 next_cpu = sock_flow_table->ents[skb->rxhash &
2977 sock_flow_table->mask];
2980 * If the desired CPU (where last recvmsg was done) is
2981 * different from current CPU (one in the rx-queue flow
2982 * table entry), switch if one of the following holds:
2983 * - Current CPU is unset (equal to RPS_NO_CPU).
2984 * - Current CPU is offline.
2985 * - The current CPU's queue tail has advanced beyond the
2986 * last packet that was enqueued using this table entry.
2987 * This guarantees that all previous packets for the flow
2988 * have been dequeued, thus preserving in order delivery.
2990 if (unlikely(tcpu != next_cpu) &&
2991 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2992 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2993 rflow->last_qtail)) >= 0)) {
2994 tcpu = next_cpu;
2995 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2998 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2999 *rflowp = rflow;
3000 cpu = tcpu;
3001 goto done;
3005 if (map) {
3006 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
3008 if (cpu_online(tcpu)) {
3009 cpu = tcpu;
3010 goto done;
3014 done:
3015 return cpu;
3018 #ifdef CONFIG_RFS_ACCEL
3021 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3022 * @dev: Device on which the filter was set
3023 * @rxq_index: RX queue index
3024 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3025 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3027 * Drivers that implement ndo_rx_flow_steer() should periodically call
3028 * this function for each installed filter and remove the filters for
3029 * which it returns %true.
3031 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3032 u32 flow_id, u16 filter_id)
3034 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3035 struct rps_dev_flow_table *flow_table;
3036 struct rps_dev_flow *rflow;
3037 bool expire = true;
3038 int cpu;
3040 rcu_read_lock();
3041 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3042 if (flow_table && flow_id <= flow_table->mask) {
3043 rflow = &flow_table->flows[flow_id];
3044 cpu = ACCESS_ONCE(rflow->cpu);
3045 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3046 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3047 rflow->last_qtail) <
3048 (int)(10 * flow_table->mask)))
3049 expire = false;
3051 rcu_read_unlock();
3052 return expire;
3054 EXPORT_SYMBOL(rps_may_expire_flow);
3056 #endif /* CONFIG_RFS_ACCEL */
3058 /* Called from hardirq (IPI) context */
3059 static void rps_trigger_softirq(void *data)
3061 struct softnet_data *sd = data;
3063 ____napi_schedule(sd, &sd->backlog);
3064 sd->received_rps++;
3067 #endif /* CONFIG_RPS */
3070 * Check if this softnet_data structure is another cpu one
3071 * If yes, queue it to our IPI list and return 1
3072 * If no, return 0
3074 static int rps_ipi_queued(struct softnet_data *sd)
3076 #ifdef CONFIG_RPS
3077 struct softnet_data *mysd = &__get_cpu_var(softnet_data);
3079 if (sd != mysd) {
3080 sd->rps_ipi_next = mysd->rps_ipi_list;
3081 mysd->rps_ipi_list = sd;
3083 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3084 return 1;
3086 #endif /* CONFIG_RPS */
3087 return 0;
3091 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3092 * queue (may be a remote CPU queue).
3094 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3095 unsigned int *qtail)
3097 struct softnet_data *sd;
3098 unsigned long flags;
3100 sd = &per_cpu(softnet_data, cpu);
3102 local_irq_save(flags);
3104 rps_lock(sd);
3105 if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
3106 if (skb_queue_len(&sd->input_pkt_queue)) {
3107 enqueue:
3108 __skb_queue_tail(&sd->input_pkt_queue, skb);
3109 input_queue_tail_incr_save(sd, qtail);
3110 rps_unlock(sd);
3111 local_irq_restore(flags);
3112 return NET_RX_SUCCESS;
3115 /* Schedule NAPI for backlog device
3116 * We can use non atomic operation since we own the queue lock
3118 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3119 if (!rps_ipi_queued(sd))
3120 ____napi_schedule(sd, &sd->backlog);
3122 goto enqueue;
3125 sd->dropped++;
3126 rps_unlock(sd);
3128 local_irq_restore(flags);
3130 atomic_long_inc(&skb->dev->rx_dropped);
3131 kfree_skb(skb);
3132 return NET_RX_DROP;
3136 * netif_rx - post buffer to the network code
3137 * @skb: buffer to post
3139 * This function receives a packet from a device driver and queues it for
3140 * the upper (protocol) levels to process. It always succeeds. The buffer
3141 * may be dropped during processing for congestion control or by the
3142 * protocol layers.
3144 * return values:
3145 * NET_RX_SUCCESS (no congestion)
3146 * NET_RX_DROP (packet was dropped)
3150 int netif_rx(struct sk_buff *skb)
3152 int ret;
3154 /* if netpoll wants it, pretend we never saw it */
3155 if (netpoll_rx(skb))
3156 return NET_RX_DROP;
3158 net_timestamp_check(netdev_tstamp_prequeue, skb);
3160 trace_netif_rx(skb);
3161 #ifdef CONFIG_RPS
3162 if (static_key_false(&rps_needed)) {
3163 struct rps_dev_flow voidflow, *rflow = &voidflow;
3164 int cpu;
3166 preempt_disable();
3167 rcu_read_lock();
3169 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3170 if (cpu < 0)
3171 cpu = smp_processor_id();
3173 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3175 rcu_read_unlock();
3176 preempt_enable();
3177 } else
3178 #endif
3180 unsigned int qtail;
3181 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3182 put_cpu();
3184 return ret;
3186 EXPORT_SYMBOL(netif_rx);
3188 int netif_rx_ni(struct sk_buff *skb)
3190 int err;
3192 preempt_disable();
3193 err = netif_rx(skb);
3194 if (local_softirq_pending())
3195 do_softirq();
3196 preempt_enable();
3198 return err;
3200 EXPORT_SYMBOL(netif_rx_ni);
3202 static void net_tx_action(struct softirq_action *h)
3204 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3206 if (sd->completion_queue) {
3207 struct sk_buff *clist;
3209 local_irq_disable();
3210 clist = sd->completion_queue;
3211 sd->completion_queue = NULL;
3212 local_irq_enable();
3214 while (clist) {
3215 struct sk_buff *skb = clist;
3216 clist = clist->next;
3218 WARN_ON(atomic_read(&skb->users));
3219 trace_kfree_skb(skb, net_tx_action);
3220 __kfree_skb(skb);
3224 if (sd->output_queue) {
3225 struct Qdisc *head;
3227 local_irq_disable();
3228 head = sd->output_queue;
3229 sd->output_queue = NULL;
3230 sd->output_queue_tailp = &sd->output_queue;
3231 local_irq_enable();
3233 while (head) {
3234 struct Qdisc *q = head;
3235 spinlock_t *root_lock;
3237 head = head->next_sched;
3239 root_lock = qdisc_lock(q);
3240 if (spin_trylock(root_lock)) {
3241 smp_mb__before_clear_bit();
3242 clear_bit(__QDISC_STATE_SCHED,
3243 &q->state);
3244 qdisc_run(q);
3245 spin_unlock(root_lock);
3246 } else {
3247 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3248 &q->state)) {
3249 __netif_reschedule(q);
3250 } else {
3251 smp_mb__before_clear_bit();
3252 clear_bit(__QDISC_STATE_SCHED,
3253 &q->state);
3260 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3261 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3262 /* This hook is defined here for ATM LANE */
3263 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3264 unsigned char *addr) __read_mostly;
3265 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3266 #endif
3268 #ifdef CONFIG_NET_CLS_ACT
3269 /* TODO: Maybe we should just force sch_ingress to be compiled in
3270 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3271 * a compare and 2 stores extra right now if we dont have it on
3272 * but have CONFIG_NET_CLS_ACT
3273 * NOTE: This doesn't stop any functionality; if you dont have
3274 * the ingress scheduler, you just can't add policies on ingress.
3277 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3279 struct net_device *dev = skb->dev;
3280 u32 ttl = G_TC_RTTL(skb->tc_verd);
3281 int result = TC_ACT_OK;
3282 struct Qdisc *q;
3284 if (unlikely(MAX_RED_LOOP < ttl++)) {
3285 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3286 skb->skb_iif, dev->ifindex);
3287 return TC_ACT_SHOT;
3290 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3291 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3293 q = rxq->qdisc;
3294 if (q != &noop_qdisc) {
3295 spin_lock(qdisc_lock(q));
3296 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3297 result = qdisc_enqueue_root(skb, q);
3298 spin_unlock(qdisc_lock(q));
3301 return result;
3304 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3305 struct packet_type **pt_prev,
3306 int *ret, struct net_device *orig_dev)
3308 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3310 if (!rxq || rxq->qdisc == &noop_qdisc)
3311 goto out;
3313 if (*pt_prev) {
3314 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3315 *pt_prev = NULL;
3318 switch (ing_filter(skb, rxq)) {
3319 case TC_ACT_SHOT:
3320 case TC_ACT_STOLEN:
3321 kfree_skb(skb);
3322 return NULL;
3325 out:
3326 skb->tc_verd = 0;
3327 return skb;
3329 #endif
3332 * netdev_rx_handler_register - register receive handler
3333 * @dev: device to register a handler for
3334 * @rx_handler: receive handler to register
3335 * @rx_handler_data: data pointer that is used by rx handler
3337 * Register a receive hander for a device. This handler will then be
3338 * called from __netif_receive_skb. A negative errno code is returned
3339 * on a failure.
3341 * The caller must hold the rtnl_mutex.
3343 * For a general description of rx_handler, see enum rx_handler_result.
3345 int netdev_rx_handler_register(struct net_device *dev,
3346 rx_handler_func_t *rx_handler,
3347 void *rx_handler_data)
3349 ASSERT_RTNL();
3351 if (dev->rx_handler)
3352 return -EBUSY;
3354 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3355 rcu_assign_pointer(dev->rx_handler, rx_handler);
3357 return 0;
3359 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3362 * netdev_rx_handler_unregister - unregister receive handler
3363 * @dev: device to unregister a handler from
3365 * Unregister a receive hander from a device.
3367 * The caller must hold the rtnl_mutex.
3369 void netdev_rx_handler_unregister(struct net_device *dev)
3372 ASSERT_RTNL();
3373 RCU_INIT_POINTER(dev->rx_handler, NULL);
3374 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3376 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3379 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3380 * the special handling of PFMEMALLOC skbs.
3382 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3384 switch (skb->protocol) {
3385 case __constant_htons(ETH_P_ARP):
3386 case __constant_htons(ETH_P_IP):
3387 case __constant_htons(ETH_P_IPV6):
3388 case __constant_htons(ETH_P_8021Q):
3389 return true;
3390 default:
3391 return false;
3395 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3397 struct packet_type *ptype, *pt_prev;
3398 rx_handler_func_t *rx_handler;
3399 struct net_device *orig_dev;
3400 struct net_device *null_or_dev;
3401 bool deliver_exact = false;
3402 int ret = NET_RX_DROP;
3403 __be16 type;
3405 net_timestamp_check(!netdev_tstamp_prequeue, skb);
3407 trace_netif_receive_skb(skb);
3409 /* if we've gotten here through NAPI, check netpoll */
3410 if (netpoll_receive_skb(skb))
3411 goto out;
3413 orig_dev = skb->dev;
3415 skb_reset_network_header(skb);
3416 if (!skb_transport_header_was_set(skb))
3417 skb_reset_transport_header(skb);
3418 skb_reset_mac_len(skb);
3420 pt_prev = NULL;
3422 rcu_read_lock();
3424 another_round:
3425 skb->skb_iif = skb->dev->ifindex;
3427 __this_cpu_inc(softnet_data.processed);
3429 if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3430 skb = vlan_untag(skb);
3431 if (unlikely(!skb))
3432 goto unlock;
3435 #ifdef CONFIG_NET_CLS_ACT
3436 if (skb->tc_verd & TC_NCLS) {
3437 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3438 goto ncls;
3440 #endif
3442 if (pfmemalloc)
3443 goto skip_taps;
3445 list_for_each_entry_rcu(ptype, &ptype_all, list) {
3446 if (!ptype->dev || ptype->dev == skb->dev) {
3447 if (pt_prev)
3448 ret = deliver_skb(skb, pt_prev, orig_dev);
3449 pt_prev = ptype;
3453 skip_taps:
3454 #ifdef CONFIG_NET_CLS_ACT
3455 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3456 if (!skb)
3457 goto unlock;
3458 ncls:
3459 #endif
3461 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3462 goto drop;
3464 if (vlan_tx_tag_present(skb)) {
3465 if (pt_prev) {
3466 ret = deliver_skb(skb, pt_prev, orig_dev);
3467 pt_prev = NULL;
3469 if (vlan_do_receive(&skb))
3470 goto another_round;
3471 else if (unlikely(!skb))
3472 goto unlock;
3475 rx_handler = rcu_dereference(skb->dev->rx_handler);
3476 if (rx_handler) {
3477 if (pt_prev) {
3478 ret = deliver_skb(skb, pt_prev, orig_dev);
3479 pt_prev = NULL;
3481 switch (rx_handler(&skb)) {
3482 case RX_HANDLER_CONSUMED:
3483 goto unlock;
3484 case RX_HANDLER_ANOTHER:
3485 goto another_round;
3486 case RX_HANDLER_EXACT:
3487 deliver_exact = true;
3488 case RX_HANDLER_PASS:
3489 break;
3490 default:
3491 BUG();
3495 if (vlan_tx_nonzero_tag_present(skb))
3496 skb->pkt_type = PACKET_OTHERHOST;
3498 /* deliver only exact match when indicated */
3499 null_or_dev = deliver_exact ? skb->dev : NULL;
3501 type = skb->protocol;
3502 list_for_each_entry_rcu(ptype,
3503 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3504 if (ptype->type == type &&
3505 (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3506 ptype->dev == orig_dev)) {
3507 if (pt_prev)
3508 ret = deliver_skb(skb, pt_prev, orig_dev);
3509 pt_prev = ptype;
3513 if (pt_prev) {
3514 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3515 goto drop;
3516 else
3517 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3518 } else {
3519 drop:
3520 atomic_long_inc(&skb->dev->rx_dropped);
3521 kfree_skb(skb);
3522 /* Jamal, now you will not able to escape explaining
3523 * me how you were going to use this. :-)
3525 ret = NET_RX_DROP;
3528 unlock:
3529 rcu_read_unlock();
3530 out:
3531 return ret;
3534 static int __netif_receive_skb(struct sk_buff *skb)
3536 int ret;
3538 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3539 unsigned long pflags = current->flags;
3542 * PFMEMALLOC skbs are special, they should
3543 * - be delivered to SOCK_MEMALLOC sockets only
3544 * - stay away from userspace
3545 * - have bounded memory usage
3547 * Use PF_MEMALLOC as this saves us from propagating the allocation
3548 * context down to all allocation sites.
3550 current->flags |= PF_MEMALLOC;
3551 ret = __netif_receive_skb_core(skb, true);
3552 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3553 } else
3554 ret = __netif_receive_skb_core(skb, false);
3556 return ret;
3560 * netif_receive_skb - process receive buffer from network
3561 * @skb: buffer to process
3563 * netif_receive_skb() is the main receive data processing function.
3564 * It always succeeds. The buffer may be dropped during processing
3565 * for congestion control or by the protocol layers.
3567 * This function may only be called from softirq context and interrupts
3568 * should be enabled.
3570 * Return values (usually ignored):
3571 * NET_RX_SUCCESS: no congestion
3572 * NET_RX_DROP: packet was dropped
3574 int netif_receive_skb(struct sk_buff *skb)
3576 net_timestamp_check(netdev_tstamp_prequeue, skb);
3578 if (skb_defer_rx_timestamp(skb))
3579 return NET_RX_SUCCESS;
3581 #ifdef CONFIG_RPS
3582 if (static_key_false(&rps_needed)) {
3583 struct rps_dev_flow voidflow, *rflow = &voidflow;
3584 int cpu, ret;
3586 rcu_read_lock();
3588 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3590 if (cpu >= 0) {
3591 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3592 rcu_read_unlock();
3593 return ret;
3595 rcu_read_unlock();
3597 #endif
3598 return __netif_receive_skb(skb);
3600 EXPORT_SYMBOL(netif_receive_skb);
3602 /* Network device is going away, flush any packets still pending
3603 * Called with irqs disabled.
3605 static void flush_backlog(void *arg)
3607 struct net_device *dev = arg;
3608 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3609 struct sk_buff *skb, *tmp;
3611 rps_lock(sd);
3612 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3613 if (skb->dev == dev) {
3614 __skb_unlink(skb, &sd->input_pkt_queue);
3615 kfree_skb(skb);
3616 input_queue_head_incr(sd);
3619 rps_unlock(sd);
3621 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3622 if (skb->dev == dev) {
3623 __skb_unlink(skb, &sd->process_queue);
3624 kfree_skb(skb);
3625 input_queue_head_incr(sd);
3630 static int napi_gro_complete(struct sk_buff *skb)
3632 struct packet_offload *ptype;
3633 __be16 type = skb->protocol;
3634 struct list_head *head = &offload_base;
3635 int err = -ENOENT;
3637 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3639 if (NAPI_GRO_CB(skb)->count == 1) {
3640 skb_shinfo(skb)->gso_size = 0;
3641 goto out;
3644 rcu_read_lock();
3645 list_for_each_entry_rcu(ptype, head, list) {
3646 if (ptype->type != type || !ptype->callbacks.gro_complete)
3647 continue;
3649 err = ptype->callbacks.gro_complete(skb);
3650 break;
3652 rcu_read_unlock();
3654 if (err) {
3655 WARN_ON(&ptype->list == head);
3656 kfree_skb(skb);
3657 return NET_RX_SUCCESS;
3660 out:
3661 return netif_receive_skb(skb);
3664 /* napi->gro_list contains packets ordered by age.
3665 * youngest packets at the head of it.
3666 * Complete skbs in reverse order to reduce latencies.
3668 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3670 struct sk_buff *skb, *prev = NULL;
3672 /* scan list and build reverse chain */
3673 for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3674 skb->prev = prev;
3675 prev = skb;
3678 for (skb = prev; skb; skb = prev) {
3679 skb->next = NULL;
3681 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3682 return;
3684 prev = skb->prev;
3685 napi_gro_complete(skb);
3686 napi->gro_count--;
3689 napi->gro_list = NULL;
3691 EXPORT_SYMBOL(napi_gro_flush);
3693 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3695 struct sk_buff *p;
3696 unsigned int maclen = skb->dev->hard_header_len;
3698 for (p = napi->gro_list; p; p = p->next) {
3699 unsigned long diffs;
3701 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3702 diffs |= p->vlan_tci ^ skb->vlan_tci;
3703 if (maclen == ETH_HLEN)
3704 diffs |= compare_ether_header(skb_mac_header(p),
3705 skb_gro_mac_header(skb));
3706 else if (!diffs)
3707 diffs = memcmp(skb_mac_header(p),
3708 skb_gro_mac_header(skb),
3709 maclen);
3710 NAPI_GRO_CB(p)->same_flow = !diffs;
3711 NAPI_GRO_CB(p)->flush = 0;
3715 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3717 struct sk_buff **pp = NULL;
3718 struct packet_offload *ptype;
3719 __be16 type = skb->protocol;
3720 struct list_head *head = &offload_base;
3721 int same_flow;
3722 enum gro_result ret;
3724 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3725 goto normal;
3727 if (skb_is_gso(skb) || skb_has_frag_list(skb))
3728 goto normal;
3730 gro_list_prepare(napi, skb);
3732 rcu_read_lock();
3733 list_for_each_entry_rcu(ptype, head, list) {
3734 if (ptype->type != type || !ptype->callbacks.gro_receive)
3735 continue;
3737 skb_set_network_header(skb, skb_gro_offset(skb));
3738 skb_reset_mac_len(skb);
3739 NAPI_GRO_CB(skb)->same_flow = 0;
3740 NAPI_GRO_CB(skb)->flush = 0;
3741 NAPI_GRO_CB(skb)->free = 0;
3743 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
3744 break;
3746 rcu_read_unlock();
3748 if (&ptype->list == head)
3749 goto normal;
3751 same_flow = NAPI_GRO_CB(skb)->same_flow;
3752 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3754 if (pp) {
3755 struct sk_buff *nskb = *pp;
3757 *pp = nskb->next;
3758 nskb->next = NULL;
3759 napi_gro_complete(nskb);
3760 napi->gro_count--;
3763 if (same_flow)
3764 goto ok;
3766 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3767 goto normal;
3769 napi->gro_count++;
3770 NAPI_GRO_CB(skb)->count = 1;
3771 NAPI_GRO_CB(skb)->age = jiffies;
3772 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3773 skb->next = napi->gro_list;
3774 napi->gro_list = skb;
3775 ret = GRO_HELD;
3777 pull:
3778 if (skb_headlen(skb) < skb_gro_offset(skb)) {
3779 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3781 BUG_ON(skb->end - skb->tail < grow);
3783 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3785 skb->tail += grow;
3786 skb->data_len -= grow;
3788 skb_shinfo(skb)->frags[0].page_offset += grow;
3789 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3791 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3792 skb_frag_unref(skb, 0);
3793 memmove(skb_shinfo(skb)->frags,
3794 skb_shinfo(skb)->frags + 1,
3795 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3800 return ret;
3802 normal:
3803 ret = GRO_NORMAL;
3804 goto pull;
3808 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3810 switch (ret) {
3811 case GRO_NORMAL:
3812 if (netif_receive_skb(skb))
3813 ret = GRO_DROP;
3814 break;
3816 case GRO_DROP:
3817 kfree_skb(skb);
3818 break;
3820 case GRO_MERGED_FREE:
3821 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3822 kmem_cache_free(skbuff_head_cache, skb);
3823 else
3824 __kfree_skb(skb);
3825 break;
3827 case GRO_HELD:
3828 case GRO_MERGED:
3829 break;
3832 return ret;
3835 static void skb_gro_reset_offset(struct sk_buff *skb)
3837 const struct skb_shared_info *pinfo = skb_shinfo(skb);
3838 const skb_frag_t *frag0 = &pinfo->frags[0];
3840 NAPI_GRO_CB(skb)->data_offset = 0;
3841 NAPI_GRO_CB(skb)->frag0 = NULL;
3842 NAPI_GRO_CB(skb)->frag0_len = 0;
3844 if (skb->mac_header == skb->tail &&
3845 pinfo->nr_frags &&
3846 !PageHighMem(skb_frag_page(frag0))) {
3847 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3848 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3852 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3854 skb_gro_reset_offset(skb);
3856 return napi_skb_finish(dev_gro_receive(napi, skb), skb);
3858 EXPORT_SYMBOL(napi_gro_receive);
3860 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3862 __skb_pull(skb, skb_headlen(skb));
3863 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3864 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3865 skb->vlan_tci = 0;
3866 skb->dev = napi->dev;
3867 skb->skb_iif = 0;
3869 napi->skb = skb;
3872 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3874 struct sk_buff *skb = napi->skb;
3876 if (!skb) {
3877 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3878 if (skb)
3879 napi->skb = skb;
3881 return skb;
3883 EXPORT_SYMBOL(napi_get_frags);
3885 static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3886 gro_result_t ret)
3888 switch (ret) {
3889 case GRO_NORMAL:
3890 case GRO_HELD:
3891 skb->protocol = eth_type_trans(skb, skb->dev);
3893 if (ret == GRO_HELD)
3894 skb_gro_pull(skb, -ETH_HLEN);
3895 else if (netif_receive_skb(skb))
3896 ret = GRO_DROP;
3897 break;
3899 case GRO_DROP:
3900 case GRO_MERGED_FREE:
3901 napi_reuse_skb(napi, skb);
3902 break;
3904 case GRO_MERGED:
3905 break;
3908 return ret;
3911 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3913 struct sk_buff *skb = napi->skb;
3914 struct ethhdr *eth;
3915 unsigned int hlen;
3916 unsigned int off;
3918 napi->skb = NULL;
3920 skb_reset_mac_header(skb);
3921 skb_gro_reset_offset(skb);
3923 off = skb_gro_offset(skb);
3924 hlen = off + sizeof(*eth);
3925 eth = skb_gro_header_fast(skb, off);
3926 if (skb_gro_header_hard(skb, hlen)) {
3927 eth = skb_gro_header_slow(skb, hlen, off);
3928 if (unlikely(!eth)) {
3929 napi_reuse_skb(napi, skb);
3930 skb = NULL;
3931 goto out;
3935 skb_gro_pull(skb, sizeof(*eth));
3938 * This works because the only protocols we care about don't require
3939 * special handling. We'll fix it up properly at the end.
3941 skb->protocol = eth->h_proto;
3943 out:
3944 return skb;
3947 gro_result_t napi_gro_frags(struct napi_struct *napi)
3949 struct sk_buff *skb = napi_frags_skb(napi);
3951 if (!skb)
3952 return GRO_DROP;
3954 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
3956 EXPORT_SYMBOL(napi_gro_frags);
3959 * net_rps_action sends any pending IPI's for rps.
3960 * Note: called with local irq disabled, but exits with local irq enabled.
3962 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3964 #ifdef CONFIG_RPS
3965 struct softnet_data *remsd = sd->rps_ipi_list;
3967 if (remsd) {
3968 sd->rps_ipi_list = NULL;
3970 local_irq_enable();
3972 /* Send pending IPI's to kick RPS processing on remote cpus. */
3973 while (remsd) {
3974 struct softnet_data *next = remsd->rps_ipi_next;
3976 if (cpu_online(remsd->cpu))
3977 __smp_call_function_single(remsd->cpu,
3978 &remsd->csd, 0);
3979 remsd = next;
3981 } else
3982 #endif
3983 local_irq_enable();
3986 static int process_backlog(struct napi_struct *napi, int quota)
3988 int work = 0;
3989 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3991 #ifdef CONFIG_RPS
3992 /* Check if we have pending ipi, its better to send them now,
3993 * not waiting net_rx_action() end.
3995 if (sd->rps_ipi_list) {
3996 local_irq_disable();
3997 net_rps_action_and_irq_enable(sd);
3999 #endif
4000 napi->weight = weight_p;
4001 local_irq_disable();
4002 while (work < quota) {
4003 struct sk_buff *skb;
4004 unsigned int qlen;
4006 while ((skb = __skb_dequeue(&sd->process_queue))) {
4007 local_irq_enable();
4008 __netif_receive_skb(skb);
4009 local_irq_disable();
4010 input_queue_head_incr(sd);
4011 if (++work >= quota) {
4012 local_irq_enable();
4013 return work;
4017 rps_lock(sd);
4018 qlen = skb_queue_len(&sd->input_pkt_queue);
4019 if (qlen)
4020 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4021 &sd->process_queue);
4023 if (qlen < quota - work) {
4025 * Inline a custom version of __napi_complete().
4026 * only current cpu owns and manipulates this napi,
4027 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
4028 * we can use a plain write instead of clear_bit(),
4029 * and we dont need an smp_mb() memory barrier.
4031 list_del(&napi->poll_list);
4032 napi->state = 0;
4034 quota = work + qlen;
4036 rps_unlock(sd);
4038 local_irq_enable();
4040 return work;
4044 * __napi_schedule - schedule for receive
4045 * @n: entry to schedule
4047 * The entry's receive function will be scheduled to run
4049 void __napi_schedule(struct napi_struct *n)
4051 unsigned long flags;
4053 local_irq_save(flags);
4054 ____napi_schedule(&__get_cpu_var(softnet_data), n);
4055 local_irq_restore(flags);
4057 EXPORT_SYMBOL(__napi_schedule);
4059 void __napi_complete(struct napi_struct *n)
4061 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4062 BUG_ON(n->gro_list);
4064 list_del(&n->poll_list);
4065 smp_mb__before_clear_bit();
4066 clear_bit(NAPI_STATE_SCHED, &n->state);
4068 EXPORT_SYMBOL(__napi_complete);
4070 void napi_complete(struct napi_struct *n)
4072 unsigned long flags;
4075 * don't let napi dequeue from the cpu poll list
4076 * just in case its running on a different cpu
4078 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4079 return;
4081 napi_gro_flush(n, false);
4082 local_irq_save(flags);
4083 __napi_complete(n);
4084 local_irq_restore(flags);
4086 EXPORT_SYMBOL(napi_complete);
4088 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4089 int (*poll)(struct napi_struct *, int), int weight)
4091 INIT_LIST_HEAD(&napi->poll_list);
4092 napi->gro_count = 0;
4093 napi->gro_list = NULL;
4094 napi->skb = NULL;
4095 napi->poll = poll;
4096 napi->weight = weight;
4097 list_add(&napi->dev_list, &dev->napi_list);
4098 napi->dev = dev;
4099 #ifdef CONFIG_NETPOLL
4100 spin_lock_init(&napi->poll_lock);
4101 napi->poll_owner = -1;
4102 #endif
4103 set_bit(NAPI_STATE_SCHED, &napi->state);
4105 EXPORT_SYMBOL(netif_napi_add);
4107 void netif_napi_del(struct napi_struct *napi)
4109 struct sk_buff *skb, *next;
4111 list_del_init(&napi->dev_list);
4112 napi_free_frags(napi);
4114 for (skb = napi->gro_list; skb; skb = next) {
4115 next = skb->next;
4116 skb->next = NULL;
4117 kfree_skb(skb);
4120 napi->gro_list = NULL;
4121 napi->gro_count = 0;
4123 EXPORT_SYMBOL(netif_napi_del);
4125 static void net_rx_action(struct softirq_action *h)
4127 struct softnet_data *sd = &__get_cpu_var(softnet_data);
4128 unsigned long time_limit = jiffies + 2;
4129 int budget = netdev_budget;
4130 void *have;
4132 local_irq_disable();
4134 while (!list_empty(&sd->poll_list)) {
4135 struct napi_struct *n;
4136 int work, weight;
4138 /* If softirq window is exhuasted then punt.
4139 * Allow this to run for 2 jiffies since which will allow
4140 * an average latency of 1.5/HZ.
4142 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
4143 goto softnet_break;
4145 local_irq_enable();
4147 /* Even though interrupts have been re-enabled, this
4148 * access is safe because interrupts can only add new
4149 * entries to the tail of this list, and only ->poll()
4150 * calls can remove this head entry from the list.
4152 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
4154 have = netpoll_poll_lock(n);
4156 weight = n->weight;
4158 /* This NAPI_STATE_SCHED test is for avoiding a race
4159 * with netpoll's poll_napi(). Only the entity which
4160 * obtains the lock and sees NAPI_STATE_SCHED set will
4161 * actually make the ->poll() call. Therefore we avoid
4162 * accidentally calling ->poll() when NAPI is not scheduled.
4164 work = 0;
4165 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4166 work = n->poll(n, weight);
4167 trace_napi_poll(n);
4170 WARN_ON_ONCE(work > weight);
4172 budget -= work;
4174 local_irq_disable();
4176 /* Drivers must not modify the NAPI state if they
4177 * consume the entire weight. In such cases this code
4178 * still "owns" the NAPI instance and therefore can
4179 * move the instance around on the list at-will.
4181 if (unlikely(work == weight)) {
4182 if (unlikely(napi_disable_pending(n))) {
4183 local_irq_enable();
4184 napi_complete(n);
4185 local_irq_disable();
4186 } else {
4187 if (n->gro_list) {
4188 /* flush too old packets
4189 * If HZ < 1000, flush all packets.
4191 local_irq_enable();
4192 napi_gro_flush(n, HZ >= 1000);
4193 local_irq_disable();
4195 list_move_tail(&n->poll_list, &sd->poll_list);
4199 netpoll_poll_unlock(have);
4201 out:
4202 net_rps_action_and_irq_enable(sd);
4204 #ifdef CONFIG_NET_DMA
4206 * There may not be any more sk_buffs coming right now, so push
4207 * any pending DMA copies to hardware
4209 dma_issue_pending_all();
4210 #endif
4212 return;
4214 softnet_break:
4215 sd->time_squeeze++;
4216 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4217 goto out;
4220 #ifdef CONFIG_PROC_FS
4222 #define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)
4224 #define get_bucket(x) ((x) >> BUCKET_SPACE)
4225 #define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
4226 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
4228 static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
4230 struct net *net = seq_file_net(seq);
4231 struct net_device *dev;
4232 struct hlist_node *p;
4233 struct hlist_head *h;
4234 unsigned int count = 0, offset = get_offset(*pos);
4236 h = &net->dev_name_head[get_bucket(*pos)];
4237 hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
4238 if (++count == offset)
4239 return dev;
4242 return NULL;
4245 static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
4247 struct net_device *dev;
4248 unsigned int bucket;
4250 do {
4251 dev = dev_from_same_bucket(seq, pos);
4252 if (dev)
4253 return dev;
4255 bucket = get_bucket(*pos) + 1;
4256 *pos = set_bucket_offset(bucket, 1);
4257 } while (bucket < NETDEV_HASHENTRIES);
4259 return NULL;
4263 * This is invoked by the /proc filesystem handler to display a device
4264 * in detail.
4266 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4267 __acquires(RCU)
4269 rcu_read_lock();
4270 if (!*pos)
4271 return SEQ_START_TOKEN;
4273 if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
4274 return NULL;
4276 return dev_from_bucket(seq, pos);
4279 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4281 ++*pos;
4282 return dev_from_bucket(seq, pos);
4285 void dev_seq_stop(struct seq_file *seq, void *v)
4286 __releases(RCU)
4288 rcu_read_unlock();
4291 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4293 struct rtnl_link_stats64 temp;
4294 const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4296 seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4297 "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4298 dev->name, stats->rx_bytes, stats->rx_packets,
4299 stats->rx_errors,
4300 stats->rx_dropped + stats->rx_missed_errors,
4301 stats->rx_fifo_errors,
4302 stats->rx_length_errors + stats->rx_over_errors +
4303 stats->rx_crc_errors + stats->rx_frame_errors,
4304 stats->rx_compressed, stats->multicast,
4305 stats->tx_bytes, stats->tx_packets,
4306 stats->tx_errors, stats->tx_dropped,
4307 stats->tx_fifo_errors, stats->collisions,
4308 stats->tx_carrier_errors +
4309 stats->tx_aborted_errors +
4310 stats->tx_window_errors +
4311 stats->tx_heartbeat_errors,
4312 stats->tx_compressed);
4316 * Called from the PROCfs module. This now uses the new arbitrary sized
4317 * /proc/net interface to create /proc/net/dev
4319 static int dev_seq_show(struct seq_file *seq, void *v)
4321 if (v == SEQ_START_TOKEN)
4322 seq_puts(seq, "Inter-| Receive "
4323 " | Transmit\n"
4324 " face |bytes packets errs drop fifo frame "
4325 "compressed multicast|bytes packets errs "
4326 "drop fifo colls carrier compressed\n");
4327 else
4328 dev_seq_printf_stats(seq, v);
4329 return 0;
4332 static struct softnet_data *softnet_get_online(loff_t *pos)
4334 struct softnet_data *sd = NULL;
4336 while (*pos < nr_cpu_ids)
4337 if (cpu_online(*pos)) {
4338 sd = &per_cpu(softnet_data, *pos);
4339 break;
4340 } else
4341 ++*pos;
4342 return sd;
4345 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4347 return softnet_get_online(pos);
4350 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4352 ++*pos;
4353 return softnet_get_online(pos);
4356 static void softnet_seq_stop(struct seq_file *seq, void *v)
4360 static int softnet_seq_show(struct seq_file *seq, void *v)
4362 struct softnet_data *sd = v;
4364 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4365 sd->processed, sd->dropped, sd->time_squeeze, 0,
4366 0, 0, 0, 0, /* was fastroute */
4367 sd->cpu_collision, sd->received_rps);
4368 return 0;
4371 static const struct seq_operations dev_seq_ops = {
4372 .start = dev_seq_start,
4373 .next = dev_seq_next,
4374 .stop = dev_seq_stop,
4375 .show = dev_seq_show,
4378 static int dev_seq_open(struct inode *inode, struct file *file)
4380 return seq_open_net(inode, file, &dev_seq_ops,
4381 sizeof(struct seq_net_private));
4384 static const struct file_operations dev_seq_fops = {
4385 .owner = THIS_MODULE,
4386 .open = dev_seq_open,
4387 .read = seq_read,
4388 .llseek = seq_lseek,
4389 .release = seq_release_net,
4392 static const struct seq_operations softnet_seq_ops = {
4393 .start = softnet_seq_start,
4394 .next = softnet_seq_next,
4395 .stop = softnet_seq_stop,
4396 .show = softnet_seq_show,
4399 static int softnet_seq_open(struct inode *inode, struct file *file)
4401 return seq_open(file, &softnet_seq_ops);
4404 static const struct file_operations softnet_seq_fops = {
4405 .owner = THIS_MODULE,
4406 .open = softnet_seq_open,
4407 .read = seq_read,
4408 .llseek = seq_lseek,
4409 .release = seq_release,
4412 static void *ptype_get_idx(loff_t pos)
4414 struct packet_type *pt = NULL;
4415 loff_t i = 0;
4416 int t;
4418 list_for_each_entry_rcu(pt, &ptype_all, list) {
4419 if (i == pos)
4420 return pt;
4421 ++i;
4424 for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4425 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4426 if (i == pos)
4427 return pt;
4428 ++i;
4431 return NULL;
4434 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4435 __acquires(RCU)
4437 rcu_read_lock();
4438 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4441 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4443 struct packet_type *pt;
4444 struct list_head *nxt;
4445 int hash;
4447 ++*pos;
4448 if (v == SEQ_START_TOKEN)
4449 return ptype_get_idx(0);
4451 pt = v;
4452 nxt = pt->list.next;
4453 if (pt->type == htons(ETH_P_ALL)) {
4454 if (nxt != &ptype_all)
4455 goto found;
4456 hash = 0;
4457 nxt = ptype_base[0].next;
4458 } else
4459 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4461 while (nxt == &ptype_base[hash]) {
4462 if (++hash >= PTYPE_HASH_SIZE)
4463 return NULL;
4464 nxt = ptype_base[hash].next;
4466 found:
4467 return list_entry(nxt, struct packet_type, list);
4470 static void ptype_seq_stop(struct seq_file *seq, void *v)
4471 __releases(RCU)
4473 rcu_read_unlock();
4476 static int ptype_seq_show(struct seq_file *seq, void *v)
4478 struct packet_type *pt = v;
4480 if (v == SEQ_START_TOKEN)
4481 seq_puts(seq, "Type Device Function\n");
4482 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4483 if (pt->type == htons(ETH_P_ALL))
4484 seq_puts(seq, "ALL ");
4485 else
4486 seq_printf(seq, "%04x", ntohs(pt->type));
4488 seq_printf(seq, " %-8s %pF\n",
4489 pt->dev ? pt->dev->name : "", pt->func);
4492 return 0;
4495 static const struct seq_operations ptype_seq_ops = {
4496 .start = ptype_seq_start,
4497 .next = ptype_seq_next,
4498 .stop = ptype_seq_stop,
4499 .show = ptype_seq_show,
4502 static int ptype_seq_open(struct inode *inode, struct file *file)
4504 return seq_open_net(inode, file, &ptype_seq_ops,
4505 sizeof(struct seq_net_private));
4508 static const struct file_operations ptype_seq_fops = {
4509 .owner = THIS_MODULE,
4510 .open = ptype_seq_open,
4511 .read = seq_read,
4512 .llseek = seq_lseek,
4513 .release = seq_release_net,
4517 static int __net_init dev_proc_net_init(struct net *net)
4519 int rc = -ENOMEM;
4521 if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4522 goto out;
4523 if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4524 goto out_dev;
4525 if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4526 goto out_softnet;
4528 if (wext_proc_init(net))
4529 goto out_ptype;
4530 rc = 0;
4531 out:
4532 return rc;
4533 out_ptype:
4534 proc_net_remove(net, "ptype");
4535 out_softnet:
4536 proc_net_remove(net, "softnet_stat");
4537 out_dev:
4538 proc_net_remove(net, "dev");
4539 goto out;
4542 static void __net_exit dev_proc_net_exit(struct net *net)
4544 wext_proc_exit(net);
4546 proc_net_remove(net, "ptype");
4547 proc_net_remove(net, "softnet_stat");
4548 proc_net_remove(net, "dev");
4551 static struct pernet_operations __net_initdata dev_proc_ops = {
4552 .init = dev_proc_net_init,
4553 .exit = dev_proc_net_exit,
4556 static int __init dev_proc_init(void)
4558 return register_pernet_subsys(&dev_proc_ops);
4560 #else
4561 #define dev_proc_init() 0
4562 #endif /* CONFIG_PROC_FS */
4565 struct netdev_upper {
4566 struct net_device *dev;
4567 bool master;
4568 struct list_head list;
4569 struct rcu_head rcu;
4570 struct list_head search_list;
4573 static void __append_search_uppers(struct list_head *search_list,
4574 struct net_device *dev)
4576 struct netdev_upper *upper;
4578 list_for_each_entry(upper, &dev->upper_dev_list, list) {
4579 /* check if this upper is not already in search list */
4580 if (list_empty(&upper->search_list))
4581 list_add_tail(&upper->search_list, search_list);
4585 static bool __netdev_search_upper_dev(struct net_device *dev,
4586 struct net_device *upper_dev)
4588 LIST_HEAD(search_list);
4589 struct netdev_upper *upper;
4590 struct netdev_upper *tmp;
4591 bool ret = false;
4593 __append_search_uppers(&search_list, dev);
4594 list_for_each_entry(upper, &search_list, search_list) {
4595 if (upper->dev == upper_dev) {
4596 ret = true;
4597 break;
4599 __append_search_uppers(&search_list, upper->dev);
4601 list_for_each_entry_safe(upper, tmp, &search_list, search_list)
4602 INIT_LIST_HEAD(&upper->search_list);
4603 return ret;
4606 static struct netdev_upper *__netdev_find_upper(struct net_device *dev,
4607 struct net_device *upper_dev)
4609 struct netdev_upper *upper;
4611 list_for_each_entry(upper, &dev->upper_dev_list, list) {
4612 if (upper->dev == upper_dev)
4613 return upper;
4615 return NULL;
4619 * netdev_has_upper_dev - Check if device is linked to an upper device
4620 * @dev: device
4621 * @upper_dev: upper device to check
4623 * Find out if a device is linked to specified upper device and return true
4624 * in case it is. Note that this checks only immediate upper device,
4625 * not through a complete stack of devices. The caller must hold the RTNL lock.
4627 bool netdev_has_upper_dev(struct net_device *dev,
4628 struct net_device *upper_dev)
4630 ASSERT_RTNL();
4632 return __netdev_find_upper(dev, upper_dev);
4634 EXPORT_SYMBOL(netdev_has_upper_dev);
4637 * netdev_has_any_upper_dev - Check if device is linked to some device
4638 * @dev: device
4640 * Find out if a device is linked to an upper device and return true in case
4641 * it is. The caller must hold the RTNL lock.
4643 bool netdev_has_any_upper_dev(struct net_device *dev)
4645 ASSERT_RTNL();
4647 return !list_empty(&dev->upper_dev_list);
4649 EXPORT_SYMBOL(netdev_has_any_upper_dev);
4652 * netdev_master_upper_dev_get - Get master upper device
4653 * @dev: device
4655 * Find a master upper device and return pointer to it or NULL in case
4656 * it's not there. The caller must hold the RTNL lock.
4658 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4660 struct netdev_upper *upper;
4662 ASSERT_RTNL();
4664 if (list_empty(&dev->upper_dev_list))
4665 return NULL;
4667 upper = list_first_entry(&dev->upper_dev_list,
4668 struct netdev_upper, list);
4669 if (likely(upper->master))
4670 return upper->dev;
4671 return NULL;
4673 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4676 * netdev_master_upper_dev_get_rcu - Get master upper device
4677 * @dev: device
4679 * Find a master upper device and return pointer to it or NULL in case
4680 * it's not there. The caller must hold the RCU read lock.
4682 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4684 struct netdev_upper *upper;
4686 upper = list_first_or_null_rcu(&dev->upper_dev_list,
4687 struct netdev_upper, list);
4688 if (upper && likely(upper->master))
4689 return upper->dev;
4690 return NULL;
4692 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4694 static int __netdev_upper_dev_link(struct net_device *dev,
4695 struct net_device *upper_dev, bool master)
4697 struct netdev_upper *upper;
4699 ASSERT_RTNL();
4701 if (dev == upper_dev)
4702 return -EBUSY;
4704 /* To prevent loops, check if dev is not upper device to upper_dev. */
4705 if (__netdev_search_upper_dev(upper_dev, dev))
4706 return -EBUSY;
4708 if (__netdev_find_upper(dev, upper_dev))
4709 return -EEXIST;
4711 if (master && netdev_master_upper_dev_get(dev))
4712 return -EBUSY;
4714 upper = kmalloc(sizeof(*upper), GFP_KERNEL);
4715 if (!upper)
4716 return -ENOMEM;
4718 upper->dev = upper_dev;
4719 upper->master = master;
4720 INIT_LIST_HEAD(&upper->search_list);
4722 /* Ensure that master upper link is always the first item in list. */
4723 if (master)
4724 list_add_rcu(&upper->list, &dev->upper_dev_list);
4725 else
4726 list_add_tail_rcu(&upper->list, &dev->upper_dev_list);
4727 dev_hold(upper_dev);
4729 return 0;
4733 * netdev_upper_dev_link - Add a link to the upper device
4734 * @dev: device
4735 * @upper_dev: new upper device
4737 * Adds a link to device which is upper to this one. The caller must hold
4738 * the RTNL lock. On a failure a negative errno code is returned.
4739 * On success the reference counts are adjusted and the function
4740 * returns zero.
4742 int netdev_upper_dev_link(struct net_device *dev,
4743 struct net_device *upper_dev)
4745 return __netdev_upper_dev_link(dev, upper_dev, false);
4747 EXPORT_SYMBOL(netdev_upper_dev_link);
4750 * netdev_master_upper_dev_link - Add a master link to the upper device
4751 * @dev: device
4752 * @upper_dev: new upper device
4754 * Adds a link to device which is upper to this one. In this case, only
4755 * one master upper device can be linked, although other non-master devices
4756 * might be linked as well. The caller must hold the RTNL lock.
4757 * On a failure a negative errno code is returned. On success the reference
4758 * counts are adjusted and the function returns zero.
4760 int netdev_master_upper_dev_link(struct net_device *dev,
4761 struct net_device *upper_dev)
4763 return __netdev_upper_dev_link(dev, upper_dev, true);
4765 EXPORT_SYMBOL(netdev_master_upper_dev_link);
4768 * netdev_upper_dev_unlink - Removes a link to upper device
4769 * @dev: device
4770 * @upper_dev: new upper device
4772 * Removes a link to device which is upper to this one. The caller must hold
4773 * the RTNL lock.
4775 void netdev_upper_dev_unlink(struct net_device *dev,
4776 struct net_device *upper_dev)
4778 struct netdev_upper *upper;
4780 ASSERT_RTNL();
4782 upper = __netdev_find_upper(dev, upper_dev);
4783 if (!upper)
4784 return;
4785 list_del_rcu(&upper->list);
4786 dev_put(upper_dev);
4787 kfree_rcu(upper, rcu);
4789 EXPORT_SYMBOL(netdev_upper_dev_unlink);
4791 static void dev_change_rx_flags(struct net_device *dev, int flags)
4793 const struct net_device_ops *ops = dev->netdev_ops;
4795 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4796 ops->ndo_change_rx_flags(dev, flags);
4799 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4801 unsigned int old_flags = dev->flags;
4802 kuid_t uid;
4803 kgid_t gid;
4805 ASSERT_RTNL();
4807 dev->flags |= IFF_PROMISC;
4808 dev->promiscuity += inc;
4809 if (dev->promiscuity == 0) {
4811 * Avoid overflow.
4812 * If inc causes overflow, untouch promisc and return error.
4814 if (inc < 0)
4815 dev->flags &= ~IFF_PROMISC;
4816 else {
4817 dev->promiscuity -= inc;
4818 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4819 dev->name);
4820 return -EOVERFLOW;
4823 if (dev->flags != old_flags) {
4824 pr_info("device %s %s promiscuous mode\n",
4825 dev->name,
4826 dev->flags & IFF_PROMISC ? "entered" : "left");
4827 if (audit_enabled) {
4828 current_uid_gid(&uid, &gid);
4829 audit_log(current->audit_context, GFP_ATOMIC,
4830 AUDIT_ANOM_PROMISCUOUS,
4831 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4832 dev->name, (dev->flags & IFF_PROMISC),
4833 (old_flags & IFF_PROMISC),
4834 from_kuid(&init_user_ns, audit_get_loginuid(current)),
4835 from_kuid(&init_user_ns, uid),
4836 from_kgid(&init_user_ns, gid),
4837 audit_get_sessionid(current));
4840 dev_change_rx_flags(dev, IFF_PROMISC);
4842 return 0;
4846 * dev_set_promiscuity - update promiscuity count on a device
4847 * @dev: device
4848 * @inc: modifier
4850 * Add or remove promiscuity from a device. While the count in the device
4851 * remains above zero the interface remains promiscuous. Once it hits zero
4852 * the device reverts back to normal filtering operation. A negative inc
4853 * value is used to drop promiscuity on the device.
4854 * Return 0 if successful or a negative errno code on error.
4856 int dev_set_promiscuity(struct net_device *dev, int inc)
4858 unsigned int old_flags = dev->flags;
4859 int err;
4861 err = __dev_set_promiscuity(dev, inc);
4862 if (err < 0)
4863 return err;
4864 if (dev->flags != old_flags)
4865 dev_set_rx_mode(dev);
4866 return err;
4868 EXPORT_SYMBOL(dev_set_promiscuity);
4871 * dev_set_allmulti - update allmulti count on a device
4872 * @dev: device
4873 * @inc: modifier
4875 * Add or remove reception of all multicast frames to a device. While the
4876 * count in the device remains above zero the interface remains listening
4877 * to all interfaces. Once it hits zero the device reverts back to normal
4878 * filtering operation. A negative @inc value is used to drop the counter
4879 * when releasing a resource needing all multicasts.
4880 * Return 0 if successful or a negative errno code on error.
4883 int dev_set_allmulti(struct net_device *dev, int inc)
4885 unsigned int old_flags = dev->flags;
4887 ASSERT_RTNL();
4889 dev->flags |= IFF_ALLMULTI;
4890 dev->allmulti += inc;
4891 if (dev->allmulti == 0) {
4893 * Avoid overflow.
4894 * If inc causes overflow, untouch allmulti and return error.
4896 if (inc < 0)
4897 dev->flags &= ~IFF_ALLMULTI;
4898 else {
4899 dev->allmulti -= inc;
4900 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4901 dev->name);
4902 return -EOVERFLOW;
4905 if (dev->flags ^ old_flags) {
4906 dev_change_rx_flags(dev, IFF_ALLMULTI);
4907 dev_set_rx_mode(dev);
4909 return 0;
4911 EXPORT_SYMBOL(dev_set_allmulti);
4914 * Upload unicast and multicast address lists to device and
4915 * configure RX filtering. When the device doesn't support unicast
4916 * filtering it is put in promiscuous mode while unicast addresses
4917 * are present.
4919 void __dev_set_rx_mode(struct net_device *dev)
4921 const struct net_device_ops *ops = dev->netdev_ops;
4923 /* dev_open will call this function so the list will stay sane. */
4924 if (!(dev->flags&IFF_UP))
4925 return;
4927 if (!netif_device_present(dev))
4928 return;
4930 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4931 /* Unicast addresses changes may only happen under the rtnl,
4932 * therefore calling __dev_set_promiscuity here is safe.
4934 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4935 __dev_set_promiscuity(dev, 1);
4936 dev->uc_promisc = true;
4937 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4938 __dev_set_promiscuity(dev, -1);
4939 dev->uc_promisc = false;
4943 if (ops->ndo_set_rx_mode)
4944 ops->ndo_set_rx_mode(dev);
4947 void dev_set_rx_mode(struct net_device *dev)
4949 netif_addr_lock_bh(dev);
4950 __dev_set_rx_mode(dev);
4951 netif_addr_unlock_bh(dev);
4955 * dev_get_flags - get flags reported to userspace
4956 * @dev: device
4958 * Get the combination of flag bits exported through APIs to userspace.
4960 unsigned int dev_get_flags(const struct net_device *dev)
4962 unsigned int flags;
4964 flags = (dev->flags & ~(IFF_PROMISC |
4965 IFF_ALLMULTI |
4966 IFF_RUNNING |
4967 IFF_LOWER_UP |
4968 IFF_DORMANT)) |
4969 (dev->gflags & (IFF_PROMISC |
4970 IFF_ALLMULTI));
4972 if (netif_running(dev)) {
4973 if (netif_oper_up(dev))
4974 flags |= IFF_RUNNING;
4975 if (netif_carrier_ok(dev))
4976 flags |= IFF_LOWER_UP;
4977 if (netif_dormant(dev))
4978 flags |= IFF_DORMANT;
4981 return flags;
4983 EXPORT_SYMBOL(dev_get_flags);
4985 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4987 unsigned int old_flags = dev->flags;
4988 int ret;
4990 ASSERT_RTNL();
4993 * Set the flags on our device.
4996 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4997 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4998 IFF_AUTOMEDIA)) |
4999 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5000 IFF_ALLMULTI));
5003 * Load in the correct multicast list now the flags have changed.
5006 if ((old_flags ^ flags) & IFF_MULTICAST)
5007 dev_change_rx_flags(dev, IFF_MULTICAST);
5009 dev_set_rx_mode(dev);
5012 * Have we downed the interface. We handle IFF_UP ourselves
5013 * according to user attempts to set it, rather than blindly
5014 * setting it.
5017 ret = 0;
5018 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
5019 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5021 if (!ret)
5022 dev_set_rx_mode(dev);
5025 if ((flags ^ dev->gflags) & IFF_PROMISC) {
5026 int inc = (flags & IFF_PROMISC) ? 1 : -1;
5028 dev->gflags ^= IFF_PROMISC;
5029 dev_set_promiscuity(dev, inc);
5032 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5033 is important. Some (broken) drivers set IFF_PROMISC, when
5034 IFF_ALLMULTI is requested not asking us and not reporting.
5036 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5037 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5039 dev->gflags ^= IFF_ALLMULTI;
5040 dev_set_allmulti(dev, inc);
5043 return ret;
5046 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
5048 unsigned int changes = dev->flags ^ old_flags;
5050 if (changes & IFF_UP) {
5051 if (dev->flags & IFF_UP)
5052 call_netdevice_notifiers(NETDEV_UP, dev);
5053 else
5054 call_netdevice_notifiers(NETDEV_DOWN, dev);
5057 if (dev->flags & IFF_UP &&
5058 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
5059 call_netdevice_notifiers(NETDEV_CHANGE, dev);
5063 * dev_change_flags - change device settings
5064 * @dev: device
5065 * @flags: device state flags
5067 * Change settings on device based state flags. The flags are
5068 * in the userspace exported format.
5070 int dev_change_flags(struct net_device *dev, unsigned int flags)
5072 int ret;
5073 unsigned int changes, old_flags = dev->flags;
5075 ret = __dev_change_flags(dev, flags);
5076 if (ret < 0)
5077 return ret;
5079 changes = old_flags ^ dev->flags;
5080 if (changes)
5081 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
5083 __dev_notify_flags(dev, old_flags);
5084 return ret;
5086 EXPORT_SYMBOL(dev_change_flags);
5089 * dev_set_mtu - Change maximum transfer unit
5090 * @dev: device
5091 * @new_mtu: new transfer unit
5093 * Change the maximum transfer size of the network device.
5095 int dev_set_mtu(struct net_device *dev, int new_mtu)
5097 const struct net_device_ops *ops = dev->netdev_ops;
5098 int err;
5100 if (new_mtu == dev->mtu)
5101 return 0;
5103 /* MTU must be positive. */
5104 if (new_mtu < 0)
5105 return -EINVAL;
5107 if (!netif_device_present(dev))
5108 return -ENODEV;
5110 err = 0;
5111 if (ops->ndo_change_mtu)
5112 err = ops->ndo_change_mtu(dev, new_mtu);
5113 else
5114 dev->mtu = new_mtu;
5116 if (!err)
5117 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5118 return err;
5120 EXPORT_SYMBOL(dev_set_mtu);
5123 * dev_set_group - Change group this device belongs to
5124 * @dev: device
5125 * @new_group: group this device should belong to
5127 void dev_set_group(struct net_device *dev, int new_group)
5129 dev->group = new_group;
5131 EXPORT_SYMBOL(dev_set_group);
5134 * dev_set_mac_address - Change Media Access Control Address
5135 * @dev: device
5136 * @sa: new address
5138 * Change the hardware (MAC) address of the device
5140 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5142 const struct net_device_ops *ops = dev->netdev_ops;
5143 int err;
5145 if (!ops->ndo_set_mac_address)
5146 return -EOPNOTSUPP;
5147 if (sa->sa_family != dev->type)
5148 return -EINVAL;
5149 if (!netif_device_present(dev))
5150 return -ENODEV;
5151 err = ops->ndo_set_mac_address(dev, sa);
5152 if (err)
5153 return err;
5154 dev->addr_assign_type = NET_ADDR_SET;
5155 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5156 add_device_randomness(dev->dev_addr, dev->addr_len);
5157 return 0;
5159 EXPORT_SYMBOL(dev_set_mac_address);
5162 * dev_change_carrier - Change device carrier
5163 * @dev: device
5164 * @new_carries: new value
5166 * Change device carrier
5168 int dev_change_carrier(struct net_device *dev, bool new_carrier)
5170 const struct net_device_ops *ops = dev->netdev_ops;
5172 if (!ops->ndo_change_carrier)
5173 return -EOPNOTSUPP;
5174 if (!netif_device_present(dev))
5175 return -ENODEV;
5176 return ops->ndo_change_carrier(dev, new_carrier);
5178 EXPORT_SYMBOL(dev_change_carrier);
5181 * dev_new_index - allocate an ifindex
5182 * @net: the applicable net namespace
5184 * Returns a suitable unique value for a new device interface
5185 * number. The caller must hold the rtnl semaphore or the
5186 * dev_base_lock to be sure it remains unique.
5188 static int dev_new_index(struct net *net)
5190 int ifindex = net->ifindex;
5191 for (;;) {
5192 if (++ifindex <= 0)
5193 ifindex = 1;
5194 if (!__dev_get_by_index(net, ifindex))
5195 return net->ifindex = ifindex;
5199 /* Delayed registration/unregisteration */
5200 static LIST_HEAD(net_todo_list);
5202 static void net_set_todo(struct net_device *dev)
5204 list_add_tail(&dev->todo_list, &net_todo_list);
5207 static void rollback_registered_many(struct list_head *head)
5209 struct net_device *dev, *tmp;
5211 BUG_ON(dev_boot_phase);
5212 ASSERT_RTNL();
5214 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5215 /* Some devices call without registering
5216 * for initialization unwind. Remove those
5217 * devices and proceed with the remaining.
5219 if (dev->reg_state == NETREG_UNINITIALIZED) {
5220 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5221 dev->name, dev);
5223 WARN_ON(1);
5224 list_del(&dev->unreg_list);
5225 continue;
5227 dev->dismantle = true;
5228 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5231 /* If device is running, close it first. */
5232 dev_close_many(head);
5234 list_for_each_entry(dev, head, unreg_list) {
5235 /* And unlink it from device chain. */
5236 unlist_netdevice(dev);
5238 dev->reg_state = NETREG_UNREGISTERING;
5241 synchronize_net();
5243 list_for_each_entry(dev, head, unreg_list) {
5244 /* Shutdown queueing discipline. */
5245 dev_shutdown(dev);
5248 /* Notify protocols, that we are about to destroy
5249 this device. They should clean all the things.
5251 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5253 if (!dev->rtnl_link_ops ||
5254 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5255 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5258 * Flush the unicast and multicast chains
5260 dev_uc_flush(dev);
5261 dev_mc_flush(dev);
5263 if (dev->netdev_ops->ndo_uninit)
5264 dev->netdev_ops->ndo_uninit(dev);
5266 /* Notifier chain MUST detach us all upper devices. */
5267 WARN_ON(netdev_has_any_upper_dev(dev));
5269 /* Remove entries from kobject tree */
5270 netdev_unregister_kobject(dev);
5271 #ifdef CONFIG_XPS
5272 /* Remove XPS queueing entries */
5273 netif_reset_xps_queues_gt(dev, 0);
5274 #endif
5277 synchronize_net();
5279 list_for_each_entry(dev, head, unreg_list)
5280 dev_put(dev);
5283 static void rollback_registered(struct net_device *dev)
5285 LIST_HEAD(single);
5287 list_add(&dev->unreg_list, &single);
5288 rollback_registered_many(&single);
5289 list_del(&single);
5292 static netdev_features_t netdev_fix_features(struct net_device *dev,
5293 netdev_features_t features)
5295 /* Fix illegal checksum combinations */
5296 if ((features & NETIF_F_HW_CSUM) &&
5297 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5298 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5299 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5302 /* Fix illegal SG+CSUM combinations. */
5303 if ((features & NETIF_F_SG) &&
5304 !(features & NETIF_F_ALL_CSUM)) {
5305 netdev_dbg(dev,
5306 "Dropping NETIF_F_SG since no checksum feature.\n");
5307 features &= ~NETIF_F_SG;
5310 /* TSO requires that SG is present as well. */
5311 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5312 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5313 features &= ~NETIF_F_ALL_TSO;
5316 /* TSO ECN requires that TSO is present as well. */
5317 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5318 features &= ~NETIF_F_TSO_ECN;
5320 /* Software GSO depends on SG. */
5321 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5322 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5323 features &= ~NETIF_F_GSO;
5326 /* UFO needs SG and checksumming */
5327 if (features & NETIF_F_UFO) {
5328 /* maybe split UFO into V4 and V6? */
5329 if (!((features & NETIF_F_GEN_CSUM) ||
5330 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5331 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5332 netdev_dbg(dev,
5333 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5334 features &= ~NETIF_F_UFO;
5337 if (!(features & NETIF_F_SG)) {
5338 netdev_dbg(dev,
5339 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5340 features &= ~NETIF_F_UFO;
5344 return features;
5347 int __netdev_update_features(struct net_device *dev)
5349 netdev_features_t features;
5350 int err = 0;
5352 ASSERT_RTNL();
5354 features = netdev_get_wanted_features(dev);
5356 if (dev->netdev_ops->ndo_fix_features)
5357 features = dev->netdev_ops->ndo_fix_features(dev, features);
5359 /* driver might be less strict about feature dependencies */
5360 features = netdev_fix_features(dev, features);
5362 if (dev->features == features)
5363 return 0;
5365 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5366 &dev->features, &features);
5368 if (dev->netdev_ops->ndo_set_features)
5369 err = dev->netdev_ops->ndo_set_features(dev, features);
5371 if (unlikely(err < 0)) {
5372 netdev_err(dev,
5373 "set_features() failed (%d); wanted %pNF, left %pNF\n",
5374 err, &features, &dev->features);
5375 return -1;
5378 if (!err)
5379 dev->features = features;
5381 return 1;
5385 * netdev_update_features - recalculate device features
5386 * @dev: the device to check
5388 * Recalculate dev->features set and send notifications if it
5389 * has changed. Should be called after driver or hardware dependent
5390 * conditions might have changed that influence the features.
5392 void netdev_update_features(struct net_device *dev)
5394 if (__netdev_update_features(dev))
5395 netdev_features_change(dev);
5397 EXPORT_SYMBOL(netdev_update_features);
5400 * netdev_change_features - recalculate device features
5401 * @dev: the device to check
5403 * Recalculate dev->features set and send notifications even
5404 * if they have not changed. Should be called instead of
5405 * netdev_update_features() if also dev->vlan_features might
5406 * have changed to allow the changes to be propagated to stacked
5407 * VLAN devices.
5409 void netdev_change_features(struct net_device *dev)
5411 __netdev_update_features(dev);
5412 netdev_features_change(dev);
5414 EXPORT_SYMBOL(netdev_change_features);
5417 * netif_stacked_transfer_operstate - transfer operstate
5418 * @rootdev: the root or lower level device to transfer state from
5419 * @dev: the device to transfer operstate to
5421 * Transfer operational state from root to device. This is normally
5422 * called when a stacking relationship exists between the root
5423 * device and the device(a leaf device).
5425 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5426 struct net_device *dev)
5428 if (rootdev->operstate == IF_OPER_DORMANT)
5429 netif_dormant_on(dev);
5430 else
5431 netif_dormant_off(dev);
5433 if (netif_carrier_ok(rootdev)) {
5434 if (!netif_carrier_ok(dev))
5435 netif_carrier_on(dev);
5436 } else {
5437 if (netif_carrier_ok(dev))
5438 netif_carrier_off(dev);
5441 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5443 #ifdef CONFIG_RPS
5444 static int netif_alloc_rx_queues(struct net_device *dev)
5446 unsigned int i, count = dev->num_rx_queues;
5447 struct netdev_rx_queue *rx;
5449 BUG_ON(count < 1);
5451 rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5452 if (!rx)
5453 return -ENOMEM;
5455 dev->_rx = rx;
5457 for (i = 0; i < count; i++)
5458 rx[i].dev = dev;
5459 return 0;
5461 #endif
5463 static void netdev_init_one_queue(struct net_device *dev,
5464 struct netdev_queue *queue, void *_unused)
5466 /* Initialize queue lock */
5467 spin_lock_init(&queue->_xmit_lock);
5468 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5469 queue->xmit_lock_owner = -1;
5470 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5471 queue->dev = dev;
5472 #ifdef CONFIG_BQL
5473 dql_init(&queue->dql, HZ);
5474 #endif
5477 static int netif_alloc_netdev_queues(struct net_device *dev)
5479 unsigned int count = dev->num_tx_queues;
5480 struct netdev_queue *tx;
5482 BUG_ON(count < 1);
5484 tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5485 if (!tx)
5486 return -ENOMEM;
5488 dev->_tx = tx;
5490 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5491 spin_lock_init(&dev->tx_global_lock);
5493 return 0;
5497 * register_netdevice - register a network device
5498 * @dev: device to register
5500 * Take a completed network device structure and add it to the kernel
5501 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5502 * chain. 0 is returned on success. A negative errno code is returned
5503 * on a failure to set up the device, or if the name is a duplicate.
5505 * Callers must hold the rtnl semaphore. You may want
5506 * register_netdev() instead of this.
5508 * BUGS:
5509 * The locking appears insufficient to guarantee two parallel registers
5510 * will not get the same name.
5513 int register_netdevice(struct net_device *dev)
5515 int ret;
5516 struct net *net = dev_net(dev);
5518 BUG_ON(dev_boot_phase);
5519 ASSERT_RTNL();
5521 might_sleep();
5523 /* When net_device's are persistent, this will be fatal. */
5524 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5525 BUG_ON(!net);
5527 spin_lock_init(&dev->addr_list_lock);
5528 netdev_set_addr_lockdep_class(dev);
5530 dev->iflink = -1;
5532 ret = dev_get_valid_name(net, dev, dev->name);
5533 if (ret < 0)
5534 goto out;
5536 /* Init, if this function is available */
5537 if (dev->netdev_ops->ndo_init) {
5538 ret = dev->netdev_ops->ndo_init(dev);
5539 if (ret) {
5540 if (ret > 0)
5541 ret = -EIO;
5542 goto out;
5546 if (((dev->hw_features | dev->features) & NETIF_F_HW_VLAN_FILTER) &&
5547 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
5548 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
5549 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
5550 ret = -EINVAL;
5551 goto err_uninit;
5554 ret = -EBUSY;
5555 if (!dev->ifindex)
5556 dev->ifindex = dev_new_index(net);
5557 else if (__dev_get_by_index(net, dev->ifindex))
5558 goto err_uninit;
5560 if (dev->iflink == -1)
5561 dev->iflink = dev->ifindex;
5563 /* Transfer changeable features to wanted_features and enable
5564 * software offloads (GSO and GRO).
5566 dev->hw_features |= NETIF_F_SOFT_FEATURES;
5567 dev->features |= NETIF_F_SOFT_FEATURES;
5568 dev->wanted_features = dev->features & dev->hw_features;
5570 /* Turn on no cache copy if HW is doing checksum */
5571 if (!(dev->flags & IFF_LOOPBACK)) {
5572 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5573 if (dev->features & NETIF_F_ALL_CSUM) {
5574 dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5575 dev->features |= NETIF_F_NOCACHE_COPY;
5579 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5581 dev->vlan_features |= NETIF_F_HIGHDMA;
5583 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5584 ret = notifier_to_errno(ret);
5585 if (ret)
5586 goto err_uninit;
5588 ret = netdev_register_kobject(dev);
5589 if (ret)
5590 goto err_uninit;
5591 dev->reg_state = NETREG_REGISTERED;
5593 __netdev_update_features(dev);
5596 * Default initial state at registry is that the
5597 * device is present.
5600 set_bit(__LINK_STATE_PRESENT, &dev->state);
5602 linkwatch_init_dev(dev);
5604 dev_init_scheduler(dev);
5605 dev_hold(dev);
5606 list_netdevice(dev);
5607 add_device_randomness(dev->dev_addr, dev->addr_len);
5609 /* If the device has permanent device address, driver should
5610 * set dev_addr and also addr_assign_type should be set to
5611 * NET_ADDR_PERM (default value).
5613 if (dev->addr_assign_type == NET_ADDR_PERM)
5614 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
5616 /* Notify protocols, that a new device appeared. */
5617 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5618 ret = notifier_to_errno(ret);
5619 if (ret) {
5620 rollback_registered(dev);
5621 dev->reg_state = NETREG_UNREGISTERED;
5624 * Prevent userspace races by waiting until the network
5625 * device is fully setup before sending notifications.
5627 if (!dev->rtnl_link_ops ||
5628 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5629 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5631 out:
5632 return ret;
5634 err_uninit:
5635 if (dev->netdev_ops->ndo_uninit)
5636 dev->netdev_ops->ndo_uninit(dev);
5637 goto out;
5639 EXPORT_SYMBOL(register_netdevice);
5642 * init_dummy_netdev - init a dummy network device for NAPI
5643 * @dev: device to init
5645 * This takes a network device structure and initialize the minimum
5646 * amount of fields so it can be used to schedule NAPI polls without
5647 * registering a full blown interface. This is to be used by drivers
5648 * that need to tie several hardware interfaces to a single NAPI
5649 * poll scheduler due to HW limitations.
5651 int init_dummy_netdev(struct net_device *dev)
5653 /* Clear everything. Note we don't initialize spinlocks
5654 * are they aren't supposed to be taken by any of the
5655 * NAPI code and this dummy netdev is supposed to be
5656 * only ever used for NAPI polls
5658 memset(dev, 0, sizeof(struct net_device));
5660 /* make sure we BUG if trying to hit standard
5661 * register/unregister code path
5663 dev->reg_state = NETREG_DUMMY;
5665 /* NAPI wants this */
5666 INIT_LIST_HEAD(&dev->napi_list);
5668 /* a dummy interface is started by default */
5669 set_bit(__LINK_STATE_PRESENT, &dev->state);
5670 set_bit(__LINK_STATE_START, &dev->state);
5672 /* Note : We dont allocate pcpu_refcnt for dummy devices,
5673 * because users of this 'device' dont need to change
5674 * its refcount.
5677 return 0;
5679 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5683 * register_netdev - register a network device
5684 * @dev: device to register
5686 * Take a completed network device structure and add it to the kernel
5687 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5688 * chain. 0 is returned on success. A negative errno code is returned
5689 * on a failure to set up the device, or if the name is a duplicate.
5691 * This is a wrapper around register_netdevice that takes the rtnl semaphore
5692 * and expands the device name if you passed a format string to
5693 * alloc_netdev.
5695 int register_netdev(struct net_device *dev)
5697 int err;
5699 rtnl_lock();
5700 err = register_netdevice(dev);
5701 rtnl_unlock();
5702 return err;
5704 EXPORT_SYMBOL(register_netdev);
5706 int netdev_refcnt_read(const struct net_device *dev)
5708 int i, refcnt = 0;
5710 for_each_possible_cpu(i)
5711 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5712 return refcnt;
5714 EXPORT_SYMBOL(netdev_refcnt_read);
5717 * netdev_wait_allrefs - wait until all references are gone.
5718 * @dev: target net_device
5720 * This is called when unregistering network devices.
5722 * Any protocol or device that holds a reference should register
5723 * for netdevice notification, and cleanup and put back the
5724 * reference if they receive an UNREGISTER event.
5725 * We can get stuck here if buggy protocols don't correctly
5726 * call dev_put.
5728 static void netdev_wait_allrefs(struct net_device *dev)
5730 unsigned long rebroadcast_time, warning_time;
5731 int refcnt;
5733 linkwatch_forget_dev(dev);
5735 rebroadcast_time = warning_time = jiffies;
5736 refcnt = netdev_refcnt_read(dev);
5738 while (refcnt != 0) {
5739 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5740 rtnl_lock();
5742 /* Rebroadcast unregister notification */
5743 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5745 __rtnl_unlock();
5746 rcu_barrier();
5747 rtnl_lock();
5749 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5750 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5751 &dev->state)) {
5752 /* We must not have linkwatch events
5753 * pending on unregister. If this
5754 * happens, we simply run the queue
5755 * unscheduled, resulting in a noop
5756 * for this device.
5758 linkwatch_run_queue();
5761 __rtnl_unlock();
5763 rebroadcast_time = jiffies;
5766 msleep(250);
5768 refcnt = netdev_refcnt_read(dev);
5770 if (time_after(jiffies, warning_time + 10 * HZ)) {
5771 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5772 dev->name, refcnt);
5773 warning_time = jiffies;
5778 /* The sequence is:
5780 * rtnl_lock();
5781 * ...
5782 * register_netdevice(x1);
5783 * register_netdevice(x2);
5784 * ...
5785 * unregister_netdevice(y1);
5786 * unregister_netdevice(y2);
5787 * ...
5788 * rtnl_unlock();
5789 * free_netdev(y1);
5790 * free_netdev(y2);
5792 * We are invoked by rtnl_unlock().
5793 * This allows us to deal with problems:
5794 * 1) We can delete sysfs objects which invoke hotplug
5795 * without deadlocking with linkwatch via keventd.
5796 * 2) Since we run with the RTNL semaphore not held, we can sleep
5797 * safely in order to wait for the netdev refcnt to drop to zero.
5799 * We must not return until all unregister events added during
5800 * the interval the lock was held have been completed.
5802 void netdev_run_todo(void)
5804 struct list_head list;
5806 /* Snapshot list, allow later requests */
5807 list_replace_init(&net_todo_list, &list);
5809 __rtnl_unlock();
5812 /* Wait for rcu callbacks to finish before next phase */
5813 if (!list_empty(&list))
5814 rcu_barrier();
5816 while (!list_empty(&list)) {
5817 struct net_device *dev
5818 = list_first_entry(&list, struct net_device, todo_list);
5819 list_del(&dev->todo_list);
5821 rtnl_lock();
5822 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5823 __rtnl_unlock();
5825 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5826 pr_err("network todo '%s' but state %d\n",
5827 dev->name, dev->reg_state);
5828 dump_stack();
5829 continue;
5832 dev->reg_state = NETREG_UNREGISTERED;
5834 on_each_cpu(flush_backlog, dev, 1);
5836 netdev_wait_allrefs(dev);
5838 /* paranoia */
5839 BUG_ON(netdev_refcnt_read(dev));
5840 WARN_ON(rcu_access_pointer(dev->ip_ptr));
5841 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
5842 WARN_ON(dev->dn_ptr);
5844 if (dev->destructor)
5845 dev->destructor(dev);
5847 /* Free network device */
5848 kobject_put(&dev->dev.kobj);
5852 /* Convert net_device_stats to rtnl_link_stats64. They have the same
5853 * fields in the same order, with only the type differing.
5855 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5856 const struct net_device_stats *netdev_stats)
5858 #if BITS_PER_LONG == 64
5859 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5860 memcpy(stats64, netdev_stats, sizeof(*stats64));
5861 #else
5862 size_t i, n = sizeof(*stats64) / sizeof(u64);
5863 const unsigned long *src = (const unsigned long *)netdev_stats;
5864 u64 *dst = (u64 *)stats64;
5866 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5867 sizeof(*stats64) / sizeof(u64));
5868 for (i = 0; i < n; i++)
5869 dst[i] = src[i];
5870 #endif
5872 EXPORT_SYMBOL(netdev_stats_to_stats64);
5875 * dev_get_stats - get network device statistics
5876 * @dev: device to get statistics from
5877 * @storage: place to store stats
5879 * Get network statistics from device. Return @storage.
5880 * The device driver may provide its own method by setting
5881 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5882 * otherwise the internal statistics structure is used.
5884 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5885 struct rtnl_link_stats64 *storage)
5887 const struct net_device_ops *ops = dev->netdev_ops;
5889 if (ops->ndo_get_stats64) {
5890 memset(storage, 0, sizeof(*storage));
5891 ops->ndo_get_stats64(dev, storage);
5892 } else if (ops->ndo_get_stats) {
5893 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5894 } else {
5895 netdev_stats_to_stats64(storage, &dev->stats);
5897 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5898 return storage;
5900 EXPORT_SYMBOL(dev_get_stats);
5902 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5904 struct netdev_queue *queue = dev_ingress_queue(dev);
5906 #ifdef CONFIG_NET_CLS_ACT
5907 if (queue)
5908 return queue;
5909 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5910 if (!queue)
5911 return NULL;
5912 netdev_init_one_queue(dev, queue, NULL);
5913 queue->qdisc = &noop_qdisc;
5914 queue->qdisc_sleeping = &noop_qdisc;
5915 rcu_assign_pointer(dev->ingress_queue, queue);
5916 #endif
5917 return queue;
5920 static const struct ethtool_ops default_ethtool_ops;
5922 void netdev_set_default_ethtool_ops(struct net_device *dev,
5923 const struct ethtool_ops *ops)
5925 if (dev->ethtool_ops == &default_ethtool_ops)
5926 dev->ethtool_ops = ops;
5928 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
5931 * alloc_netdev_mqs - allocate network device
5932 * @sizeof_priv: size of private data to allocate space for
5933 * @name: device name format string
5934 * @setup: callback to initialize device
5935 * @txqs: the number of TX subqueues to allocate
5936 * @rxqs: the number of RX subqueues to allocate
5938 * Allocates a struct net_device with private data area for driver use
5939 * and performs basic initialization. Also allocates subquue structs
5940 * for each queue on the device.
5942 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5943 void (*setup)(struct net_device *),
5944 unsigned int txqs, unsigned int rxqs)
5946 struct net_device *dev;
5947 size_t alloc_size;
5948 struct net_device *p;
5950 BUG_ON(strlen(name) >= sizeof(dev->name));
5952 if (txqs < 1) {
5953 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
5954 return NULL;
5957 #ifdef CONFIG_RPS
5958 if (rxqs < 1) {
5959 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
5960 return NULL;
5962 #endif
5964 alloc_size = sizeof(struct net_device);
5965 if (sizeof_priv) {
5966 /* ensure 32-byte alignment of private area */
5967 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5968 alloc_size += sizeof_priv;
5970 /* ensure 32-byte alignment of whole construct */
5971 alloc_size += NETDEV_ALIGN - 1;
5973 p = kzalloc(alloc_size, GFP_KERNEL);
5974 if (!p)
5975 return NULL;
5977 dev = PTR_ALIGN(p, NETDEV_ALIGN);
5978 dev->padded = (char *)dev - (char *)p;
5980 dev->pcpu_refcnt = alloc_percpu(int);
5981 if (!dev->pcpu_refcnt)
5982 goto free_p;
5984 if (dev_addr_init(dev))
5985 goto free_pcpu;
5987 dev_mc_init(dev);
5988 dev_uc_init(dev);
5990 dev_net_set(dev, &init_net);
5992 dev->gso_max_size = GSO_MAX_SIZE;
5993 dev->gso_max_segs = GSO_MAX_SEGS;
5995 INIT_LIST_HEAD(&dev->napi_list);
5996 INIT_LIST_HEAD(&dev->unreg_list);
5997 INIT_LIST_HEAD(&dev->link_watch_list);
5998 INIT_LIST_HEAD(&dev->upper_dev_list);
5999 dev->priv_flags = IFF_XMIT_DST_RELEASE;
6000 setup(dev);
6002 dev->num_tx_queues = txqs;
6003 dev->real_num_tx_queues = txqs;
6004 if (netif_alloc_netdev_queues(dev))
6005 goto free_all;
6007 #ifdef CONFIG_RPS
6008 dev->num_rx_queues = rxqs;
6009 dev->real_num_rx_queues = rxqs;
6010 if (netif_alloc_rx_queues(dev))
6011 goto free_all;
6012 #endif
6014 strcpy(dev->name, name);
6015 dev->group = INIT_NETDEV_GROUP;
6016 if (!dev->ethtool_ops)
6017 dev->ethtool_ops = &default_ethtool_ops;
6018 return dev;
6020 free_all:
6021 free_netdev(dev);
6022 return NULL;
6024 free_pcpu:
6025 free_percpu(dev->pcpu_refcnt);
6026 kfree(dev->_tx);
6027 #ifdef CONFIG_RPS
6028 kfree(dev->_rx);
6029 #endif
6031 free_p:
6032 kfree(p);
6033 return NULL;
6035 EXPORT_SYMBOL(alloc_netdev_mqs);
6038 * free_netdev - free network device
6039 * @dev: device
6041 * This function does the last stage of destroying an allocated device
6042 * interface. The reference to the device object is released.
6043 * If this is the last reference then it will be freed.
6045 void free_netdev(struct net_device *dev)
6047 struct napi_struct *p, *n;
6049 release_net(dev_net(dev));
6051 kfree(dev->_tx);
6052 #ifdef CONFIG_RPS
6053 kfree(dev->_rx);
6054 #endif
6056 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6058 /* Flush device addresses */
6059 dev_addr_flush(dev);
6061 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6062 netif_napi_del(p);
6064 free_percpu(dev->pcpu_refcnt);
6065 dev->pcpu_refcnt = NULL;
6067 /* Compatibility with error handling in drivers */
6068 if (dev->reg_state == NETREG_UNINITIALIZED) {
6069 kfree((char *)dev - dev->padded);
6070 return;
6073 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6074 dev->reg_state = NETREG_RELEASED;
6076 /* will free via device release */
6077 put_device(&dev->dev);
6079 EXPORT_SYMBOL(free_netdev);
6082 * synchronize_net - Synchronize with packet receive processing
6084 * Wait for packets currently being received to be done.
6085 * Does not block later packets from starting.
6087 void synchronize_net(void)
6089 might_sleep();
6090 if (rtnl_is_locked())
6091 synchronize_rcu_expedited();
6092 else
6093 synchronize_rcu();
6095 EXPORT_SYMBOL(synchronize_net);
6098 * unregister_netdevice_queue - remove device from the kernel
6099 * @dev: device
6100 * @head: list
6102 * This function shuts down a device interface and removes it
6103 * from the kernel tables.
6104 * If head not NULL, device is queued to be unregistered later.
6106 * Callers must hold the rtnl semaphore. You may want
6107 * unregister_netdev() instead of this.
6110 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6112 ASSERT_RTNL();
6114 if (head) {
6115 list_move_tail(&dev->unreg_list, head);
6116 } else {
6117 rollback_registered(dev);
6118 /* Finish processing unregister after unlock */
6119 net_set_todo(dev);
6122 EXPORT_SYMBOL(unregister_netdevice_queue);
6125 * unregister_netdevice_many - unregister many devices
6126 * @head: list of devices
6128 void unregister_netdevice_many(struct list_head *head)
6130 struct net_device *dev;
6132 if (!list_empty(head)) {
6133 rollback_registered_many(head);
6134 list_for_each_entry(dev, head, unreg_list)
6135 net_set_todo(dev);
6138 EXPORT_SYMBOL(unregister_netdevice_many);
6141 * unregister_netdev - remove device from the kernel
6142 * @dev: device
6144 * This function shuts down a device interface and removes it
6145 * from the kernel tables.
6147 * This is just a wrapper for unregister_netdevice that takes
6148 * the rtnl semaphore. In general you want to use this and not
6149 * unregister_netdevice.
6151 void unregister_netdev(struct net_device *dev)
6153 rtnl_lock();
6154 unregister_netdevice(dev);
6155 rtnl_unlock();
6157 EXPORT_SYMBOL(unregister_netdev);
6160 * dev_change_net_namespace - move device to different nethost namespace
6161 * @dev: device
6162 * @net: network namespace
6163 * @pat: If not NULL name pattern to try if the current device name
6164 * is already taken in the destination network namespace.
6166 * This function shuts down a device interface and moves it
6167 * to a new network namespace. On success 0 is returned, on
6168 * a failure a netagive errno code is returned.
6170 * Callers must hold the rtnl semaphore.
6173 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6175 int err;
6177 ASSERT_RTNL();
6179 /* Don't allow namespace local devices to be moved. */
6180 err = -EINVAL;
6181 if (dev->features & NETIF_F_NETNS_LOCAL)
6182 goto out;
6184 /* Ensure the device has been registrered */
6185 if (dev->reg_state != NETREG_REGISTERED)
6186 goto out;
6188 /* Get out if there is nothing todo */
6189 err = 0;
6190 if (net_eq(dev_net(dev), net))
6191 goto out;
6193 /* Pick the destination device name, and ensure
6194 * we can use it in the destination network namespace.
6196 err = -EEXIST;
6197 if (__dev_get_by_name(net, dev->name)) {
6198 /* We get here if we can't use the current device name */
6199 if (!pat)
6200 goto out;
6201 if (dev_get_valid_name(net, dev, pat) < 0)
6202 goto out;
6206 * And now a mini version of register_netdevice unregister_netdevice.
6209 /* If device is running close it first. */
6210 dev_close(dev);
6212 /* And unlink it from device chain */
6213 err = -ENODEV;
6214 unlist_netdevice(dev);
6216 synchronize_net();
6218 /* Shutdown queueing discipline. */
6219 dev_shutdown(dev);
6221 /* Notify protocols, that we are about to destroy
6222 this device. They should clean all the things.
6224 Note that dev->reg_state stays at NETREG_REGISTERED.
6225 This is wanted because this way 8021q and macvlan know
6226 the device is just moving and can keep their slaves up.
6228 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6229 rcu_barrier();
6230 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6231 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
6234 * Flush the unicast and multicast chains
6236 dev_uc_flush(dev);
6237 dev_mc_flush(dev);
6239 /* Send a netdev-removed uevent to the old namespace */
6240 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6242 /* Actually switch the network namespace */
6243 dev_net_set(dev, net);
6245 /* If there is an ifindex conflict assign a new one */
6246 if (__dev_get_by_index(net, dev->ifindex)) {
6247 int iflink = (dev->iflink == dev->ifindex);
6248 dev->ifindex = dev_new_index(net);
6249 if (iflink)
6250 dev->iflink = dev->ifindex;
6253 /* Send a netdev-add uevent to the new namespace */
6254 kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6256 /* Fixup kobjects */
6257 err = device_rename(&dev->dev, dev->name);
6258 WARN_ON(err);
6260 /* Add the device back in the hashes */
6261 list_netdevice(dev);
6263 /* Notify protocols, that a new device appeared. */
6264 call_netdevice_notifiers(NETDEV_REGISTER, dev);
6267 * Prevent userspace races by waiting until the network
6268 * device is fully setup before sending notifications.
6270 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6272 synchronize_net();
6273 err = 0;
6274 out:
6275 return err;
6277 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6279 static int dev_cpu_callback(struct notifier_block *nfb,
6280 unsigned long action,
6281 void *ocpu)
6283 struct sk_buff **list_skb;
6284 struct sk_buff *skb;
6285 unsigned int cpu, oldcpu = (unsigned long)ocpu;
6286 struct softnet_data *sd, *oldsd;
6288 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6289 return NOTIFY_OK;
6291 local_irq_disable();
6292 cpu = smp_processor_id();
6293 sd = &per_cpu(softnet_data, cpu);
6294 oldsd = &per_cpu(softnet_data, oldcpu);
6296 /* Find end of our completion_queue. */
6297 list_skb = &sd->completion_queue;
6298 while (*list_skb)
6299 list_skb = &(*list_skb)->next;
6300 /* Append completion queue from offline CPU. */
6301 *list_skb = oldsd->completion_queue;
6302 oldsd->completion_queue = NULL;
6304 /* Append output queue from offline CPU. */
6305 if (oldsd->output_queue) {
6306 *sd->output_queue_tailp = oldsd->output_queue;
6307 sd->output_queue_tailp = oldsd->output_queue_tailp;
6308 oldsd->output_queue = NULL;
6309 oldsd->output_queue_tailp = &oldsd->output_queue;
6311 /* Append NAPI poll list from offline CPU. */
6312 if (!list_empty(&oldsd->poll_list)) {
6313 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6314 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6317 raise_softirq_irqoff(NET_TX_SOFTIRQ);
6318 local_irq_enable();
6320 /* Process offline CPU's input_pkt_queue */
6321 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6322 netif_rx(skb);
6323 input_queue_head_incr(oldsd);
6325 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6326 netif_rx(skb);
6327 input_queue_head_incr(oldsd);
6330 return NOTIFY_OK;
6335 * netdev_increment_features - increment feature set by one
6336 * @all: current feature set
6337 * @one: new feature set
6338 * @mask: mask feature set
6340 * Computes a new feature set after adding a device with feature set
6341 * @one to the master device with current feature set @all. Will not
6342 * enable anything that is off in @mask. Returns the new feature set.
6344 netdev_features_t netdev_increment_features(netdev_features_t all,
6345 netdev_features_t one, netdev_features_t mask)
6347 if (mask & NETIF_F_GEN_CSUM)
6348 mask |= NETIF_F_ALL_CSUM;
6349 mask |= NETIF_F_VLAN_CHALLENGED;
6351 all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6352 all &= one | ~NETIF_F_ALL_FOR_ALL;
6354 /* If one device supports hw checksumming, set for all. */
6355 if (all & NETIF_F_GEN_CSUM)
6356 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6358 return all;
6360 EXPORT_SYMBOL(netdev_increment_features);
6362 static struct hlist_head *netdev_create_hash(void)
6364 int i;
6365 struct hlist_head *hash;
6367 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6368 if (hash != NULL)
6369 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6370 INIT_HLIST_HEAD(&hash[i]);
6372 return hash;
6375 /* Initialize per network namespace state */
6376 static int __net_init netdev_init(struct net *net)
6378 if (net != &init_net)
6379 INIT_LIST_HEAD(&net->dev_base_head);
6381 net->dev_name_head = netdev_create_hash();
6382 if (net->dev_name_head == NULL)
6383 goto err_name;
6385 net->dev_index_head = netdev_create_hash();
6386 if (net->dev_index_head == NULL)
6387 goto err_idx;
6389 return 0;
6391 err_idx:
6392 kfree(net->dev_name_head);
6393 err_name:
6394 return -ENOMEM;
6398 * netdev_drivername - network driver for the device
6399 * @dev: network device
6401 * Determine network driver for device.
6403 const char *netdev_drivername(const struct net_device *dev)
6405 const struct device_driver *driver;
6406 const struct device *parent;
6407 const char *empty = "";
6409 parent = dev->dev.parent;
6410 if (!parent)
6411 return empty;
6413 driver = parent->driver;
6414 if (driver && driver->name)
6415 return driver->name;
6416 return empty;
6419 static int __netdev_printk(const char *level, const struct net_device *dev,
6420 struct va_format *vaf)
6422 int r;
6424 if (dev && dev->dev.parent) {
6425 r = dev_printk_emit(level[1] - '0',
6426 dev->dev.parent,
6427 "%s %s %s: %pV",
6428 dev_driver_string(dev->dev.parent),
6429 dev_name(dev->dev.parent),
6430 netdev_name(dev), vaf);
6431 } else if (dev) {
6432 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6433 } else {
6434 r = printk("%s(NULL net_device): %pV", level, vaf);
6437 return r;
6440 int netdev_printk(const char *level, const struct net_device *dev,
6441 const char *format, ...)
6443 struct va_format vaf;
6444 va_list args;
6445 int r;
6447 va_start(args, format);
6449 vaf.fmt = format;
6450 vaf.va = &args;
6452 r = __netdev_printk(level, dev, &vaf);
6454 va_end(args);
6456 return r;
6458 EXPORT_SYMBOL(netdev_printk);
6460 #define define_netdev_printk_level(func, level) \
6461 int func(const struct net_device *dev, const char *fmt, ...) \
6463 int r; \
6464 struct va_format vaf; \
6465 va_list args; \
6467 va_start(args, fmt); \
6469 vaf.fmt = fmt; \
6470 vaf.va = &args; \
6472 r = __netdev_printk(level, dev, &vaf); \
6474 va_end(args); \
6476 return r; \
6478 EXPORT_SYMBOL(func);
6480 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6481 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6482 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6483 define_netdev_printk_level(netdev_err, KERN_ERR);
6484 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6485 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6486 define_netdev_printk_level(netdev_info, KERN_INFO);
6488 static void __net_exit netdev_exit(struct net *net)
6490 kfree(net->dev_name_head);
6491 kfree(net->dev_index_head);
6494 static struct pernet_operations __net_initdata netdev_net_ops = {
6495 .init = netdev_init,
6496 .exit = netdev_exit,
6499 static void __net_exit default_device_exit(struct net *net)
6501 struct net_device *dev, *aux;
6503 * Push all migratable network devices back to the
6504 * initial network namespace
6506 rtnl_lock();
6507 for_each_netdev_safe(net, dev, aux) {
6508 int err;
6509 char fb_name[IFNAMSIZ];
6511 /* Ignore unmoveable devices (i.e. loopback) */
6512 if (dev->features & NETIF_F_NETNS_LOCAL)
6513 continue;
6515 /* Leave virtual devices for the generic cleanup */
6516 if (dev->rtnl_link_ops)
6517 continue;
6519 /* Push remaining network devices to init_net */
6520 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6521 err = dev_change_net_namespace(dev, &init_net, fb_name);
6522 if (err) {
6523 pr_emerg("%s: failed to move %s to init_net: %d\n",
6524 __func__, dev->name, err);
6525 BUG();
6528 rtnl_unlock();
6531 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6533 /* At exit all network devices most be removed from a network
6534 * namespace. Do this in the reverse order of registration.
6535 * Do this across as many network namespaces as possible to
6536 * improve batching efficiency.
6538 struct net_device *dev;
6539 struct net *net;
6540 LIST_HEAD(dev_kill_list);
6542 rtnl_lock();
6543 list_for_each_entry(net, net_list, exit_list) {
6544 for_each_netdev_reverse(net, dev) {
6545 if (dev->rtnl_link_ops)
6546 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6547 else
6548 unregister_netdevice_queue(dev, &dev_kill_list);
6551 unregister_netdevice_many(&dev_kill_list);
6552 list_del(&dev_kill_list);
6553 rtnl_unlock();
6556 static struct pernet_operations __net_initdata default_device_ops = {
6557 .exit = default_device_exit,
6558 .exit_batch = default_device_exit_batch,
6562 * Initialize the DEV module. At boot time this walks the device list and
6563 * unhooks any devices that fail to initialise (normally hardware not
6564 * present) and leaves us with a valid list of present and active devices.
6569 * This is called single threaded during boot, so no need
6570 * to take the rtnl semaphore.
6572 static int __init net_dev_init(void)
6574 int i, rc = -ENOMEM;
6576 BUG_ON(!dev_boot_phase);
6578 if (dev_proc_init())
6579 goto out;
6581 if (netdev_kobject_init())
6582 goto out;
6584 INIT_LIST_HEAD(&ptype_all);
6585 for (i = 0; i < PTYPE_HASH_SIZE; i++)
6586 INIT_LIST_HEAD(&ptype_base[i]);
6588 INIT_LIST_HEAD(&offload_base);
6590 if (register_pernet_subsys(&netdev_net_ops))
6591 goto out;
6594 * Initialise the packet receive queues.
6597 for_each_possible_cpu(i) {
6598 struct softnet_data *sd = &per_cpu(softnet_data, i);
6600 memset(sd, 0, sizeof(*sd));
6601 skb_queue_head_init(&sd->input_pkt_queue);
6602 skb_queue_head_init(&sd->process_queue);
6603 sd->completion_queue = NULL;
6604 INIT_LIST_HEAD(&sd->poll_list);
6605 sd->output_queue = NULL;
6606 sd->output_queue_tailp = &sd->output_queue;
6607 #ifdef CONFIG_RPS
6608 sd->csd.func = rps_trigger_softirq;
6609 sd->csd.info = sd;
6610 sd->csd.flags = 0;
6611 sd->cpu = i;
6612 #endif
6614 sd->backlog.poll = process_backlog;
6615 sd->backlog.weight = weight_p;
6616 sd->backlog.gro_list = NULL;
6617 sd->backlog.gro_count = 0;
6620 dev_boot_phase = 0;
6622 /* The loopback device is special if any other network devices
6623 * is present in a network namespace the loopback device must
6624 * be present. Since we now dynamically allocate and free the
6625 * loopback device ensure this invariant is maintained by
6626 * keeping the loopback device as the first device on the
6627 * list of network devices. Ensuring the loopback devices
6628 * is the first device that appears and the last network device
6629 * that disappears.
6631 if (register_pernet_device(&loopback_net_ops))
6632 goto out;
6634 if (register_pernet_device(&default_device_ops))
6635 goto out;
6637 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6638 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6640 hotcpu_notifier(dev_cpu_callback, 0);
6641 dst_init();
6642 dev_mcast_init();
6643 rc = 0;
6644 out:
6645 return rc;
6648 subsys_initcall(net_dev_init);