1 /*
2 * NET3 Protocol independent device support routines.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Derived from the non IP parts of dev.c 1.0.19
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
14 * Additional Authors:
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
22 * Changes:
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
34 * drivers
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
44 * call a packet.
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
50 * changes.
51 * Rudi Cilibrasi : Pass the right thing to
52 * set_mac_address()
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
58 * 1 device.
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
66 * the backlog queue.
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <net/net_namespace.h>
98 #include <net/sock.h>
99 #include <linux/rtnetlink.h>
100 #include <linux/proc_fs.h>
101 #include <linux/seq_file.h>
102 #include <linux/stat.h>
103 #include <net/dst.h>
104 #include <net/pkt_sched.h>
105 #include <net/checksum.h>
106 #include <net/xfrm.h>
107 #include <linux/highmem.h>
108 #include <linux/init.h>
109 #include <linux/kmod.h>
110 #include <linux/module.h>
111 #include <linux/netpoll.h>
112 #include <linux/rcupdate.h>
113 #include <linux/delay.h>
114 #include <net/wext.h>
115 #include <net/iw_handler.h>
116 #include <asm/current.h>
117 #include <linux/audit.h>
118 #include <linux/dmaengine.h>
119 #include <linux/err.h>
120 #include <linux/ctype.h>
121 #include <linux/if_arp.h>
122 #include <linux/if_vlan.h>
123 #include <linux/ip.h>
124 #include <net/ip.h>
125 #include <linux/ipv6.h>
126 #include <linux/in.h>
127 #include <linux/jhash.h>
128 #include <linux/random.h>
129 #include <trace/events/napi.h>
130 #include <trace/events/net.h>
131 #include <trace/events/skb.h>
132 #include <linux/pci.h>
133 #include <linux/inetdevice.h>
134 #include <linux/cpu_rmap.h>
135 #include <linux/net_tstamp.h>
136 #include <linux/static_key.h>
138 #include "net-sysfs.h"
140 /* Instead of increasing this, you should create a hash table. */
141 #define MAX_GRO_SKBS 8
143 /* This should be increased if a protocol with a bigger head is added. */
144 #define GRO_MAX_HEAD (MAX_HEADER + 128)
147 * The list of packet types we will receive (as opposed to discard)
148 * and the routines to invoke.
150 * Why 16. Because with 16 the only overlap we get on a hash of the
151 * low nibble of the protocol value is RARP/SNAP/X.25.
153 * NOTE: That is no longer true with the addition of VLAN tags. Not
154 * sure which should go first, but I bet it won't make much
155 * difference if we are running VLANs. The good news is that
156 * this protocol won't be in the list unless compiled in, so
157 * the average user (w/out VLANs) will not be adversely affected.
158 * --BLG
160 * 0800 IP
161 * 8100 802.1Q VLAN
162 * 0001 802.3
163 * 0002 AX.25
164 * 0004 802.2
165 * 8035 RARP
166 * 0005 SNAP
167 * 0805 X.25
168 * 0806 ARP
169 * 8137 IPX
170 * 0009 Localtalk
171 * 86DD IPv6
174 #define PTYPE_HASH_SIZE (16)
175 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
177 static DEFINE_SPINLOCK(ptype_lock);
178 static DEFINE_SPINLOCK(offload_lock);
179 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
180 static struct list_head ptype_all __read_mostly; /* Taps */
181 static struct list_head offload_base __read_mostly;
184 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
185 * semaphore.
187 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
189 * Writers must hold the rtnl semaphore while they loop through the
190 * dev_base_head list, and hold dev_base_lock for writing when they do the
191 * actual updates. This allows pure readers to access the list even
192 * while a writer is preparing to update it.
194 * To put it another way, dev_base_lock is held for writing only to
195 * protect against pure readers; the rtnl semaphore provides the
196 * protection against other writers.
198 * See, for example usages, register_netdevice() and
199 * unregister_netdevice(), which must be called with the rtnl
200 * semaphore held.
202 DEFINE_RWLOCK(dev_base_lock);
203 EXPORT_SYMBOL(dev_base_lock);
205 seqcount_t devnet_rename_seq;
207 static inline void dev_base_seq_inc(struct net *net)
209 while (++net->dev_base_seq == 0);
212 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
214 unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
216 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
219 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
221 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
224 static inline void rps_lock(struct softnet_data *sd)
226 #ifdef CONFIG_RPS
227 spin_lock(&sd->input_pkt_queue.lock);
228 #endif
231 static inline void rps_unlock(struct softnet_data *sd)
233 #ifdef CONFIG_RPS
234 spin_unlock(&sd->input_pkt_queue.lock);
235 #endif
238 /* Device list insertion */
239 static int list_netdevice(struct net_device *dev)
241 struct net *net = dev_net(dev);
243 ASSERT_RTNL();
245 write_lock_bh(&dev_base_lock);
246 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
247 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
248 hlist_add_head_rcu(&dev->index_hlist,
249 dev_index_hash(net, dev->ifindex));
250 write_unlock_bh(&dev_base_lock);
252 dev_base_seq_inc(net);
254 return 0;
257 /* Device list removal
258 * caller must respect a RCU grace period before freeing/reusing dev
260 static void unlist_netdevice(struct net_device *dev)
262 ASSERT_RTNL();
264 /* Unlink dev from the device chain */
265 write_lock_bh(&dev_base_lock);
266 list_del_rcu(&dev->dev_list);
267 hlist_del_rcu(&dev->name_hlist);
268 hlist_del_rcu(&dev->index_hlist);
269 write_unlock_bh(&dev_base_lock);
271 dev_base_seq_inc(dev_net(dev));
275 * Our notifier list
278 static RAW_NOTIFIER_HEAD(netdev_chain);
281 * Device drivers call our routines to queue packets here. We empty the
282 * queue in the local softnet handler.
285 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
286 EXPORT_PER_CPU_SYMBOL(softnet_data);
288 #ifdef CONFIG_LOCKDEP
290 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
291 * according to dev->type
293 static const unsigned short netdev_lock_type[] =
294 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
295 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
296 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
297 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
298 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
299 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
300 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
301 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
302 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
303 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
304 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
305 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
306 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
307 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
308 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
310 static const char *const netdev_lock_name[] =
311 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
312 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
313 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
314 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
315 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
316 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
317 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
318 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
319 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
320 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
321 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
322 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
323 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
324 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
325 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
327 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
328 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
330 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
332 int i;
334 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
335 if (netdev_lock_type[i] == dev_type)
336 return i;
337 /* the last key is used by default */
338 return ARRAY_SIZE(netdev_lock_type) - 1;
341 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
342 unsigned short dev_type)
344 int i;
346 i = netdev_lock_pos(dev_type);
347 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
348 netdev_lock_name[i]);
351 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
353 int i;
355 i = netdev_lock_pos(dev->type);
356 lockdep_set_class_and_name(&dev->addr_list_lock,
357 &netdev_addr_lock_key[i],
358 netdev_lock_name[i]);
360 #else
361 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
362 unsigned short dev_type)
365 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
368 #endif
370 /*******************************************************************************
372 Protocol management and registration routines
374 *******************************************************************************/
377 * Add a protocol ID to the list. Now that the input handler is
378 * smarter we can dispense with all the messy stuff that used to be
379 * here.
381 * BEWARE!!! Protocol handlers, mangling input packets,
382 * MUST BE last in hash buckets and checking protocol handlers
383 * MUST start from promiscuous ptype_all chain in net_bh.
384 * It is true now, do not change it.
385 * Explanation follows: if protocol handler, mangling packet, will
386 * be the first on list, it is not able to sense, that packet
387 * is cloned and should be copied-on-write, so that it will
388 * change it and subsequent readers will get broken packet.
389 * --ANK (980803)
392 static inline struct list_head *ptype_head(const struct packet_type *pt)
394 if (pt->type == htons(ETH_P_ALL))
395 return &ptype_all;
396 else
397 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
401 * dev_add_pack - add packet handler
402 * @pt: packet type declaration
404 * Add a protocol handler to the networking stack. The passed &packet_type
405 * is linked into kernel lists and may not be freed until it has been
406 * removed from the kernel lists.
408 * This call does not sleep, therefore it cannot
409 * guarantee that all CPUs that are in the middle of receiving packets
410 * will see the new packet type (until the next received packet).
413 void dev_add_pack(struct packet_type *pt)
415 struct list_head *head = ptype_head(pt);
417 spin_lock(&ptype_lock);
418 list_add_rcu(&pt->list, head);
419 spin_unlock(&ptype_lock);
421 EXPORT_SYMBOL(dev_add_pack);
424 * __dev_remove_pack - remove packet handler
425 * @pt: packet type declaration
427 * Remove a protocol handler that was previously added to the kernel
428 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
429 * from the kernel lists and can be freed or reused once this function
430 * returns.
432 * The packet type might still be in use by receivers
433 * and must not be freed until after all the CPUs have gone
434 * through a quiescent state.
436 void __dev_remove_pack(struct packet_type *pt)
438 struct list_head *head = ptype_head(pt);
439 struct packet_type *pt1;
441 spin_lock(&ptype_lock);
443 list_for_each_entry(pt1, head, list) {
444 if (pt == pt1) {
445 list_del_rcu(&pt->list);
446 goto out;
450 pr_warn("dev_remove_pack: %p not found\n", pt);
451 out:
452 spin_unlock(&ptype_lock);
454 EXPORT_SYMBOL(__dev_remove_pack);
457 * dev_remove_pack - remove packet handler
458 * @pt: packet type declaration
460 * Remove a protocol handler that was previously added to the kernel
461 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
462 * from the kernel lists and can be freed or reused once this function
463 * returns.
465 * This call sleeps to guarantee that no CPU is looking at the packet
466 * type after return.
468 void dev_remove_pack(struct packet_type *pt)
470 __dev_remove_pack(pt);
472 synchronize_net();
474 EXPORT_SYMBOL(dev_remove_pack);
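/*
 * Illustrative sketch (not part of dev.c): how a module might register a
 * protocol tap with dev_add_pack() and tear it down with dev_remove_pack().
 * The example_tap* names are hypothetical.
 */
static int example_tap_rcv(struct sk_buff *skb, struct net_device *dev,
                           struct packet_type *pt, struct net_device *orig_dev)
{
        /* ETH_P_ALL taps receive clones; the handler consumes the skb. */
        pr_debug("tap: %s len %u proto 0x%04x\n",
                 dev->name, skb->len, ntohs(skb->protocol));
        kfree_skb(skb);
        return NET_RX_SUCCESS;
}

static struct packet_type example_tap __read_mostly = {
        .type = cpu_to_be16(ETH_P_ALL),         /* lands on ptype_all above */
        .func = example_tap_rcv,
};

/* Register with dev_add_pack(&example_tap) at module init; unregister with
 * dev_remove_pack(&example_tap) from process context, since it sleeps in
 * synchronize_net().
 */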
478 * dev_add_offload - register offload handlers
479 * @po: protocol offload declaration
481 * Add protocol offload handlers to the networking stack. The passed
482 * &proto_offload is linked into kernel lists and may not be freed until
483 * it has been removed from the kernel lists.
485 * This call does not sleep, therefore it cannot
486 * guarantee that all CPUs that are in the middle of receiving packets
487 * will see the new offload handlers (until the next received packet).
489 void dev_add_offload(struct packet_offload *po)
491 struct list_head *head = &offload_base;
493 spin_lock(&offload_lock);
494 list_add_rcu(&po->list, head);
495 spin_unlock(&offload_lock);
497 EXPORT_SYMBOL(dev_add_offload);
500 * __dev_remove_offload - remove offload handler
501 * @po: packet offload declaration
503 * Remove a protocol offload handler that was previously added to the
504 * kernel offload handlers by dev_add_offload(). The passed &offload_type
505 * is removed from the kernel lists and can be freed or reused once this
506 * function returns.
508 * The packet type might still be in use by receivers
509 * and must not be freed until after all the CPUs have gone
510 * through a quiescent state.
512 void __dev_remove_offload(struct packet_offload *po)
514 struct list_head *head = &offload_base;
515 struct packet_offload *po1;
517 spin_lock(&offload_lock);
519 list_for_each_entry(po1, head, list) {
520 if (po == po1) {
521 list_del_rcu(&po->list);
522 goto out;
526 pr_warn("dev_remove_offload: %p not found\n", po);
527 out:
528 spin_unlock(&offload_lock);
530 EXPORT_SYMBOL(__dev_remove_offload);
533 * dev_remove_offload - remove packet offload handler
534 * @po: packet offload declaration
536 * Remove a packet offload handler that was previously added to the kernel
537 * offload handlers by dev_add_offload(). The passed &offload_type is
538 * removed from the kernel lists and can be freed or reused once this
539 * function returns.
541 * This call sleeps to guarantee that no CPU is looking at the packet
542 * type after return.
544 void dev_remove_offload(struct packet_offload *po)
546 __dev_remove_offload(po);
548 synchronize_net();
550 EXPORT_SYMBOL(dev_remove_offload);
552 /******************************************************************************
554 Device Boot-time Settings Routines
556 *******************************************************************************/
558 /* Boot time configuration table */
559 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
562 * netdev_boot_setup_add - add new setup entry
563 * @name: name of the device
564 * @map: configured settings for the device
566 * Adds new setup entry to the dev_boot_setup list. The function
567 * returns 0 on error and 1 on success. This is a generic routine for
568 * all netdevices.
570 static int netdev_boot_setup_add(char *name, struct ifmap *map)
572 struct netdev_boot_setup *s;
573 int i;
575 s = dev_boot_setup;
576 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
577 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
578 memset(s[i].name, 0, sizeof(s[i].name));
579 strlcpy(s[i].name, name, IFNAMSIZ);
580 memcpy(&s[i].map, map, sizeof(s[i].map));
581 break;
585 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
589 * netdev_boot_setup_check - check boot time settings
590 * @dev: the netdevice
592 * Check boot time settings for the device.
593 * The found settings are set for the device to be used
594 * later in the device probing.
595 * Returns 0 if no settings found, 1 if they are.
597 int netdev_boot_setup_check(struct net_device *dev)
599 struct netdev_boot_setup *s = dev_boot_setup;
600 int i;
602 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
603 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
604 !strcmp(dev->name, s[i].name)) {
605 dev->irq = s[i].map.irq;
606 dev->base_addr = s[i].map.base_addr;
607 dev->mem_start = s[i].map.mem_start;
608 dev->mem_end = s[i].map.mem_end;
609 return 1;
612 return 0;
614 EXPORT_SYMBOL(netdev_boot_setup_check);
618 * netdev_boot_base - get address from boot time settings
619 * @prefix: prefix for network device
620 * @unit: id for network device
622 * Check boot time settings for the base address of device.
623 * The found settings are set for the device to be used
624 * later in the device probing.
625 * Returns 0 if no settings found.
627 unsigned long netdev_boot_base(const char *prefix, int unit)
629 const struct netdev_boot_setup *s = dev_boot_setup;
630 char name[IFNAMSIZ];
631 int i;
633 sprintf(name, "%s%d", prefix, unit);
636 * If device already registered then return base of 1
637 * to indicate not to probe for this interface
639 if (__dev_get_by_name(&init_net, name))
640 return 1;
642 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
643 if (!strcmp(name, s[i].name))
644 return s[i].map.base_addr;
645 return 0;
649 * Saves at boot time configured settings for any netdevice.
651 int __init netdev_boot_setup(char *str)
653 int ints[5];
654 struct ifmap map;
656 str = get_options(str, ARRAY_SIZE(ints), ints);
657 if (!str || !*str)
658 return 0;
660 /* Save settings */
661 memset(&map, 0, sizeof(map));
662 if (ints[0] > 0)
663 map.irq = ints[1];
664 if (ints[0] > 1)
665 map.base_addr = ints[2];
666 if (ints[0] > 2)
667 map.mem_start = ints[3];
668 if (ints[0] > 3)
669 map.mem_end = ints[4];
671 /* Add new entry to the list */
672 return netdev_boot_setup_add(str, &map);
675 __setup("netdev=", netdev_boot_setup);
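/*
 * Illustrative example derived from the parser above (an assumption, not
 * authoritative documentation): a boot line such as
 *
 *      netdev=9,0x300,0,0,eth0
 *
 * records irq=9 and base_addr=0x300 for "eth0" via netdev_boot_setup_add();
 * netdev_boot_setup_check() later copies those values into the matching
 * device during probing.
 */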
677 /*******************************************************************************
679 Device Interface Subroutines
681 *******************************************************************************/
684 * __dev_get_by_name - find a device by its name
685 * @net: the applicable net namespace
686 * @name: name to find
688 * Find an interface by name. Must be called under RTNL semaphore
689 * or @dev_base_lock. If the name is found a pointer to the device
690 * is returned. If the name is not found then %NULL is returned. The
691 * reference counters are not incremented so the caller must be
692 * careful with locks.
695 struct net_device *__dev_get_by_name(struct net *net, const char *name)
697 struct hlist_node *p;
698 struct net_device *dev;
699 struct hlist_head *head = dev_name_hash(net, name);
701 hlist_for_each_entry(dev, p, head, name_hlist)
702 if (!strncmp(dev->name, name, IFNAMSIZ))
703 return dev;
705 return NULL;
707 EXPORT_SYMBOL(__dev_get_by_name);
710 * dev_get_by_name_rcu - find a device by its name
711 * @net: the applicable net namespace
712 * @name: name to find
714 * Find an interface by name.
715 * If the name is found a pointer to the device is returned.
716 * If the name is not found then %NULL is returned.
717 * The reference counters are not incremented so the caller must be
718 * careful with locks. The caller must hold RCU lock.
721 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
723 struct hlist_node *p;
724 struct net_device *dev;
725 struct hlist_head *head = dev_name_hash(net, name);
727 hlist_for_each_entry_rcu(dev, p, head, name_hlist)
728 if (!strncmp(dev->name, name, IFNAMSIZ))
729 return dev;
731 return NULL;
733 EXPORT_SYMBOL(dev_get_by_name_rcu);
736 * dev_get_by_name - find a device by its name
737 * @net: the applicable net namespace
738 * @name: name to find
740 * Find an interface by name. This can be called from any
741 * context and does its own locking. The returned handle has
742 * the usage count incremented and the caller must use dev_put() to
743 * release it when it is no longer needed. %NULL is returned if no
744 * matching device is found.
747 struct net_device *dev_get_by_name(struct net *net, const char *name)
749 struct net_device *dev;
751 rcu_read_lock();
752 dev = dev_get_by_name_rcu(net, name);
753 if (dev)
754 dev_hold(dev);
755 rcu_read_unlock();
756 return dev;
758 EXPORT_SYMBOL(dev_get_by_name);
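/*
 * Illustrative sketch (not part of dev.c) contrasting the two lookup styles
 * above; the "eth0" name is hypothetical.
 */
static void example_lookup_by_name(struct net *net)
{
        struct net_device *dev;

        /* Refcounted lookup: safe from any context, pair with dev_put(). */
        dev = dev_get_by_name(net, "eth0");
        if (dev) {
                pr_debug("%s: ifindex %d\n", dev->name, dev->ifindex);
                dev_put(dev);
        }

        /* Lockless lookup: no reference taken, pointer only valid under RCU. */
        rcu_read_lock();
        dev = dev_get_by_name_rcu(net, "eth0");
        if (dev)
                pr_debug("%s is %s\n", dev->name,
                         netif_running(dev) ? "running" : "down");
        rcu_read_unlock();
}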
761 * __dev_get_by_index - find a device by its ifindex
762 * @net: the applicable net namespace
763 * @ifindex: index of device
765 * Search for an interface by index. Returns %NULL if the device
766 * is not found or a pointer to the device. The device has not
767 * had its reference counter increased so the caller must be careful
768 * about locking. The caller must hold either the RTNL semaphore
769 * or @dev_base_lock.
772 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
774 struct hlist_node *p;
775 struct net_device *dev;
776 struct hlist_head *head = dev_index_hash(net, ifindex);
778 hlist_for_each_entry(dev, p, head, index_hlist)
779 if (dev->ifindex == ifindex)
780 return dev;
782 return NULL;
784 EXPORT_SYMBOL(__dev_get_by_index);
787 * dev_get_by_index_rcu - find a device by its ifindex
788 * @net: the applicable net namespace
789 * @ifindex: index of device
791 * Search for an interface by index. Returns %NULL if the device
792 * is not found or a pointer to the device. The device has not
793 * had its reference counter increased so the caller must be careful
794 * about locking. The caller must hold RCU lock.
797 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
799 struct hlist_node *p;
800 struct net_device *dev;
801 struct hlist_head *head = dev_index_hash(net, ifindex);
803 hlist_for_each_entry_rcu(dev, p, head, index_hlist)
804 if (dev->ifindex == ifindex)
805 return dev;
807 return NULL;
809 EXPORT_SYMBOL(dev_get_by_index_rcu);
813 * dev_get_by_index - find a device by its ifindex
814 * @net: the applicable net namespace
815 * @ifindex: index of device
817 * Search for an interface by index. Returns NULL if the device
818 * is not found or a pointer to the device. The device returned has
819 * had a reference added and the pointer is safe until the user calls
820 * dev_put to indicate they have finished with it.
823 struct net_device *dev_get_by_index(struct net *net, int ifindex)
825 struct net_device *dev;
827 rcu_read_lock();
828 dev = dev_get_by_index_rcu(net, ifindex);
829 if (dev)
830 dev_hold(dev);
831 rcu_read_unlock();
832 return dev;
834 EXPORT_SYMBOL(dev_get_by_index);
837 * dev_getbyhwaddr_rcu - find a device by its hardware address
838 * @net: the applicable net namespace
839 * @type: media type of device
840 * @ha: hardware address
842 * Search for an interface by MAC address. Returns NULL if the device
843 * is not found or a pointer to the device.
844 * The caller must hold RCU or RTNL.
845 * The returned device has not had its ref count increased
846 * and the caller must therefore be careful about locking
850 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
851 const char *ha)
853 struct net_device *dev;
855 for_each_netdev_rcu(net, dev)
856 if (dev->type == type &&
857 !memcmp(dev->dev_addr, ha, dev->addr_len))
858 return dev;
860 return NULL;
862 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
864 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
866 struct net_device *dev;
868 ASSERT_RTNL();
869 for_each_netdev(net, dev)
870 if (dev->type == type)
871 return dev;
873 return NULL;
875 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
877 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
879 struct net_device *dev, *ret = NULL;
881 rcu_read_lock();
882 for_each_netdev_rcu(net, dev)
883 if (dev->type == type) {
884 dev_hold(dev);
885 ret = dev;
886 break;
888 rcu_read_unlock();
889 return ret;
891 EXPORT_SYMBOL(dev_getfirstbyhwtype);
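/*
 * Illustrative sketch (not part of dev.c): hardware-address lookup under RCU.
 * The address below is hypothetical.
 */
static void example_lookup_by_hwaddr(struct net *net)
{
        static const char ha[ETH_ALEN] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 };
        struct net_device *dev;

        rcu_read_lock();
        dev = dev_getbyhwaddr_rcu(net, ARPHRD_ETHER, ha);
        if (dev)
                pr_debug("MAC belongs to %s\n", dev->name);
        rcu_read_unlock();      /* dev must not be used past this point */
}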
894 * dev_get_by_flags_rcu - find any device with given flags
895 * @net: the applicable net namespace
896 * @if_flags: IFF_* values
897 * @mask: bitmask of bits in if_flags to check
899 * Search for any interface with the given flags. Returns NULL if a device
900 * is not found or a pointer to the device. Must be called inside
901 * rcu_read_lock(), and result refcount is unchanged.
904 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
905 unsigned short mask)
907 struct net_device *dev, *ret;
909 ret = NULL;
910 for_each_netdev_rcu(net, dev) {
911 if (((dev->flags ^ if_flags) & mask) == 0) {
912 ret = dev;
913 break;
916 return ret;
918 EXPORT_SYMBOL(dev_get_by_flags_rcu);
921 * dev_valid_name - check if name is okay for network device
922 * @name: name string
924 * Network device names need to be valid file names to
925 * allow sysfs to work. We also disallow any kind of
926 * whitespace.
928 bool dev_valid_name(const char *name)
930 if (*name == '\0')
931 return false;
932 if (strlen(name) >= IFNAMSIZ)
933 return false;
934 if (!strcmp(name, ".") || !strcmp(name, ".."))
935 return false;
937 while (*name) {
938 if (*name == '/' || isspace(*name))
939 return false;
940 name++;
942 return true;
944 EXPORT_SYMBOL(dev_valid_name);
947 * __dev_alloc_name - allocate a name for a device
948 * @net: network namespace to allocate the device name in
949 * @name: name format string
950 * @buf: scratch buffer and result name string
952 * Passed a format string - eg "lt%d" it will try and find a suitable
953 * id. It scans list of devices to build up a free map, then chooses
954 * the first empty slot. The caller must hold the dev_base or rtnl lock
955 * while allocating the name and adding the device in order to avoid
956 * duplicates.
957 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
958 * Returns the number of the unit assigned or a negative errno code.
961 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
963 int i = 0;
964 const char *p;
965 const int max_netdevices = 8*PAGE_SIZE;
966 unsigned long *inuse;
967 struct net_device *d;
969 p = strnchr(name, IFNAMSIZ-1, '%');
970 if (p) {
972 * Verify the string as this thing may have come from
973 * the user. There must be either one "%d" and no other "%"
974 * characters.
976 if (p[1] != 'd' || strchr(p + 2, '%'))
977 return -EINVAL;
979 /* Use one page as a bit array of possible slots */
980 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
981 if (!inuse)
982 return -ENOMEM;
984 for_each_netdev(net, d) {
985 if (!sscanf(d->name, name, &i))
986 continue;
987 if (i < 0 || i >= max_netdevices)
988 continue;
990 /* avoid cases where sscanf is not exact inverse of printf */
991 snprintf(buf, IFNAMSIZ, name, i);
992 if (!strncmp(buf, d->name, IFNAMSIZ))
993 set_bit(i, inuse);
996 i = find_first_zero_bit(inuse, max_netdevices);
997 free_page((unsigned long) inuse);
1000 if (buf != name)
1001 snprintf(buf, IFNAMSIZ, name, i);
1002 if (!__dev_get_by_name(net, buf))
1003 return i;
1005 /* It is possible to run out of possible slots
1006 * when the name is long and there isn't enough space left
1007 * for the digits, or if all bits are used.
1009 return -ENFILE;
1013 * dev_alloc_name - allocate a name for a device
1014 * @dev: device
1015 * @name: name format string
1017 * Passed a format string - eg "lt%d" it will try and find a suitable
1018 * id. It scans list of devices to build up a free map, then chooses
1019 * the first empty slot. The caller must hold the dev_base or rtnl lock
1020 * while allocating the name and adding the device in order to avoid
1021 * duplicates.
1022 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1023 * Returns the number of the unit assigned or a negative errno code.
1026 int dev_alloc_name(struct net_device *dev, const char *name)
1028 char buf[IFNAMSIZ];
1029 struct net *net;
1030 int ret;
1032 BUG_ON(!dev_net(dev));
1033 net = dev_net(dev);
1034 ret = __dev_alloc_name(net, name, buf);
1035 if (ret >= 0)
1036 strlcpy(dev->name, buf, IFNAMSIZ);
1037 return ret;
1039 EXPORT_SYMBOL(dev_alloc_name);
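/*
 * Illustrative sketch (not part of dev.c): naming a freshly allocated device
 * before registration, with the RTNL held. The "dummy%d" format is hypothetical.
 */
static int example_name_device(struct net_device *dev)
{
        int unit;

        unit = dev_alloc_name(dev, "dummy%d");  /* picks the lowest free unit */
        if (unit < 0)
                return unit;                    /* -EINVAL, -ENFILE, ... */
        pr_debug("assigned %s (unit %d)\n", dev->name, unit);
        return 0;
}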
1041 static int dev_alloc_name_ns(struct net *net,
1042 struct net_device *dev,
1043 const char *name)
1045 char buf[IFNAMSIZ];
1046 int ret;
1048 ret = __dev_alloc_name(net, name, buf);
1049 if (ret >= 0)
1050 strlcpy(dev->name, buf, IFNAMSIZ);
1051 return ret;
1054 static int dev_get_valid_name(struct net *net,
1055 struct net_device *dev,
1056 const char *name)
1058 BUG_ON(!net);
1060 if (!dev_valid_name(name))
1061 return -EINVAL;
1063 if (strchr(name, '%'))
1064 return dev_alloc_name_ns(net, dev, name);
1065 else if (__dev_get_by_name(net, name))
1066 return -EEXIST;
1067 else if (dev->name != name)
1068 strlcpy(dev->name, name, IFNAMSIZ);
1070 return 0;
1074 * dev_change_name - change name of a device
1075 * @dev: device
1076 * @newname: name (or format string) must be at least IFNAMSIZ
1078 * Change name of a device, can pass format strings "eth%d".
1079 * for wildcarding.
1081 int dev_change_name(struct net_device *dev, const char *newname)
1083 char oldname[IFNAMSIZ];
1084 int err = 0;
1085 int ret;
1086 struct net *net;
1088 ASSERT_RTNL();
1089 BUG_ON(!dev_net(dev));
1091 net = dev_net(dev);
1092 if (dev->flags & IFF_UP)
1093 return -EBUSY;
1095 write_seqcount_begin(&devnet_rename_seq);
1097 if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1098 write_seqcount_end(&devnet_rename_seq);
1099 return 0;
1102 memcpy(oldname, dev->name, IFNAMSIZ);
1104 err = dev_get_valid_name(net, dev, newname);
1105 if (err < 0) {
1106 write_seqcount_end(&devnet_rename_seq);
1107 return err;
1110 rollback:
1111 ret = device_rename(&dev->dev, dev->name);
1112 if (ret) {
1113 memcpy(dev->name, oldname, IFNAMSIZ);
1114 write_seqcount_end(&devnet_rename_seq);
1115 return ret;
1118 write_seqcount_end(&devnet_rename_seq);
1120 write_lock_bh(&dev_base_lock);
1121 hlist_del_rcu(&dev->name_hlist);
1122 write_unlock_bh(&dev_base_lock);
1124 synchronize_rcu();
1126 write_lock_bh(&dev_base_lock);
1127 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1128 write_unlock_bh(&dev_base_lock);
1130 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1131 ret = notifier_to_errno(ret);
1133 if (ret) {
1134 /* err >= 0 after dev_alloc_name() or stores the first errno */
1135 if (err >= 0) {
1136 err = ret;
1137 write_seqcount_begin(&devnet_rename_seq);
1138 memcpy(dev->name, oldname, IFNAMSIZ);
1139 goto rollback;
1140 } else {
1141 pr_err("%s: name change rollback failed: %d\n",
1142 dev->name, ret);
1146 return err;
1150 * dev_set_alias - change ifalias of a device
1151 * @dev: device
1152 * @alias: name up to IFALIASZ
1153 * @len: limit of bytes to copy from info
1155 * Set ifalias for a device,
1157 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1159 char *new_ifalias;
1161 ASSERT_RTNL();
1163 if (len >= IFALIASZ)
1164 return -EINVAL;
1166 if (!len) {
1167 kfree(dev->ifalias);
1168 dev->ifalias = NULL;
1169 return 0;
1172 new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1173 if (!new_ifalias)
1174 return -ENOMEM;
1175 dev->ifalias = new_ifalias;
1177 strlcpy(dev->ifalias, alias, len+1);
1178 return len;
1183 * netdev_features_change - device changes features
1184 * @dev: device to cause notification
1186 * Called to indicate a device has changed features.
1188 void netdev_features_change(struct net_device *dev)
1190 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1192 EXPORT_SYMBOL(netdev_features_change);
1195 * netdev_state_change - device changes state
1196 * @dev: device to cause notification
1198 * Called to indicate a device has changed state. This function calls
1199 * the notifier chains for netdev_chain and sends a NEWLINK message
1200 * to the routing socket.
1202 void netdev_state_change(struct net_device *dev)
1204 if (dev->flags & IFF_UP) {
1205 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1206 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1209 EXPORT_SYMBOL(netdev_state_change);
1212 * netdev_notify_peers - notify network peers about existence of @dev
1213 * @dev: network device
1215 * Generate traffic such that interested network peers are aware of
1216 * @dev, such as by generating a gratuitous ARP. This may be used when
1217 * a device wants to inform the rest of the network about some sort of
1218 * reconfiguration such as a failover event or virtual machine
1219 * migration.
1221 void netdev_notify_peers(struct net_device *dev)
1223 rtnl_lock();
1224 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1225 rtnl_unlock();
1227 EXPORT_SYMBOL(netdev_notify_peers);
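/*
 * Illustrative sketch (not part of dev.c): a failover- or migration-aware
 * driver announcing itself to the segment. netdev_notify_peers() takes the
 * RTNL itself, so it must be called from process context.
 */
static void example_after_failover(struct net_device *dev)
{
        if (netif_running(dev))
                netdev_notify_peers(dev);       /* e.g. triggers gratuitous ARP */
}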
1230 * dev_load - load a network module
1231 * @net: the applicable net namespace
1232 * @name: name of interface
1234 * If a network interface is not present and the process has suitable
1235 * privileges this function loads the module. If module loading is not
1236 * available in this kernel then it becomes a nop.
1239 void dev_load(struct net *net, const char *name)
1241 struct net_device *dev;
1242 int no_module;
1244 rcu_read_lock();
1245 dev = dev_get_by_name_rcu(net, name);
1246 rcu_read_unlock();
1248 no_module = !dev;
1249 if (no_module && capable(CAP_NET_ADMIN))
1250 no_module = request_module("netdev-%s", name);
1251 if (no_module && capable(CAP_SYS_MODULE)) {
1252 if (!request_module("%s", name))
1253 pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated). Use CAP_NET_ADMIN and alias netdev-%s instead.\n",
1254 name);
1257 EXPORT_SYMBOL(dev_load);
1259 static int __dev_open(struct net_device *dev)
1261 const struct net_device_ops *ops = dev->netdev_ops;
1262 int ret;
1264 ASSERT_RTNL();
1266 if (!netif_device_present(dev))
1267 return -ENODEV;
1269 /* Block netpoll from trying to do any rx path servicing.
1270 * If we don't do this there is a chance ndo_poll_controller
1271 * or ndo_poll may be running while we open the device
1273 ret = netpoll_rx_disable(dev);
1274 if (ret)
1275 return ret;
1277 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1278 ret = notifier_to_errno(ret);
1279 if (ret)
1280 return ret;
1282 set_bit(__LINK_STATE_START, &dev->state);
1284 if (ops->ndo_validate_addr)
1285 ret = ops->ndo_validate_addr(dev);
1287 if (!ret && ops->ndo_open)
1288 ret = ops->ndo_open(dev);
1290 netpoll_rx_enable(dev);
1292 if (ret)
1293 clear_bit(__LINK_STATE_START, &dev->state);
1294 else {
1295 dev->flags |= IFF_UP;
1296 net_dmaengine_get();
1297 dev_set_rx_mode(dev);
1298 dev_activate(dev);
1299 add_device_randomness(dev->dev_addr, dev->addr_len);
1302 return ret;
1306 * dev_open - prepare an interface for use.
1307 * @dev: device to open
1309 * Takes a device from down to up state. The device's private open
1310 * function is invoked and then the multicast lists are loaded. Finally
1311 * the device is moved into the up state and a %NETDEV_UP message is
1312 * sent to the netdev notifier chain.
1314 * Calling this function on an active interface is a nop. On a failure
1315 * a negative errno code is returned.
1317 int dev_open(struct net_device *dev)
1319 int ret;
1321 if (dev->flags & IFF_UP)
1322 return 0;
1324 ret = __dev_open(dev);
1325 if (ret < 0)
1326 return ret;
1328 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1329 call_netdevice_notifiers(NETDEV_UP, dev);
1331 return ret;
1333 EXPORT_SYMBOL(dev_open);
1335 static int __dev_close_many(struct list_head *head)
1337 struct net_device *dev;
1339 ASSERT_RTNL();
1340 might_sleep();
1342 list_for_each_entry(dev, head, unreg_list) {
1343 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1345 clear_bit(__LINK_STATE_START, &dev->state);
1347 /* Synchronize to scheduled poll. We cannot touch poll list, it
1348 * can be even on different cpu. So just clear netif_running().
1350 * dev->stop() will invoke napi_disable() on all of its
1351 * napi_struct instances on this device.
1353 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1356 dev_deactivate_many(head);
1358 list_for_each_entry(dev, head, unreg_list) {
1359 const struct net_device_ops *ops = dev->netdev_ops;
1362 * Call the device specific close. This cannot fail.
1363 * Only if device is UP
1365 * We allow it to be called even after a DETACH hot-plug
1366 * event.
1368 if (ops->ndo_stop)
1369 ops->ndo_stop(dev);
1371 dev->flags &= ~IFF_UP;
1372 net_dmaengine_put();
1375 return 0;
1378 static int __dev_close(struct net_device *dev)
1380 int retval;
1381 LIST_HEAD(single);
1383 /* Temporarily disable netpoll until the interface is down */
1384 retval = netpoll_rx_disable(dev);
1385 if (retval)
1386 return retval;
1388 list_add(&dev->unreg_list, &single);
1389 retval = __dev_close_many(&single);
1390 list_del(&single);
1392 netpoll_rx_enable(dev);
1393 return retval;
1396 static int dev_close_many(struct list_head *head)
1398 struct net_device *dev, *tmp;
1399 LIST_HEAD(tmp_list);
1401 list_for_each_entry_safe(dev, tmp, head, unreg_list)
1402 if (!(dev->flags & IFF_UP))
1403 list_move(&dev->unreg_list, &tmp_list);
1405 __dev_close_many(head);
1407 list_for_each_entry(dev, head, unreg_list) {
1408 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1409 call_netdevice_notifiers(NETDEV_DOWN, dev);
1412 /* rollback_registered_many needs the complete original list */
1413 list_splice(&tmp_list, head);
1414 return 0;
1418 * dev_close - shutdown an interface.
1419 * @dev: device to shutdown
1421 * This function moves an active device into down state. A
1422 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1423 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1424 * chain.
1426 int dev_close(struct net_device *dev)
1428 int ret = 0;
1429 if (dev->flags & IFF_UP) {
1430 LIST_HEAD(single);
1432 /* Block netpoll rx while the interface is going down */
1433 ret = netpoll_rx_disable(dev);
1434 if (ret)
1435 return ret;
1437 list_add(&dev->unreg_list, &single);
1438 dev_close_many(&single);
1439 list_del(&single);
1441 netpoll_rx_enable(dev);
1443 return ret;
1445 EXPORT_SYMBOL(dev_close);
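/*
 * Illustrative sketch (not part of dev.c): cycling an interface from process
 * context. Both dev_open() and dev_close() require the RTNL lock.
 */
static int example_cycle_interface(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_open(dev);            /* nop if the device is already IFF_UP */
        if (!err)
                err = dev_close(dev);   /* emits NETDEV_GOING_DOWN, then NETDEV_DOWN */
        rtnl_unlock();
        return err;
}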
1449 * dev_disable_lro - disable Large Receive Offload on a device
1450 * @dev: device
1452 * Disable Large Receive Offload (LRO) on a net device. Must be
1453 * called under RTNL. This is needed if received packets may be
1454 * forwarded to another interface.
1456 void dev_disable_lro(struct net_device *dev)
1459 * If we're trying to disable lro on a vlan device
1460 * use the underlying physical device instead
1462 if (is_vlan_dev(dev))
1463 dev = vlan_dev_real_dev(dev);
1465 dev->wanted_features &= ~NETIF_F_LRO;
1466 netdev_update_features(dev);
1468 if (unlikely(dev->features & NETIF_F_LRO))
1469 netdev_WARN(dev, "failed to disable LRO!\n");
1471 EXPORT_SYMBOL(dev_disable_lro);
1474 static int dev_boot_phase = 1;
1477 * register_netdevice_notifier - register a network notifier block
1478 * @nb: notifier
1480 * Register a notifier to be called when network device events occur.
1481 * The notifier passed is linked into the kernel structures and must
1482 * not be reused until it has been unregistered. A negative errno code
1483 * is returned on a failure.
1485 * When registered, all registration and up events are replayed
1486 * to the new notifier to allow the caller to have a race-free
1487 * view of the network device list.
1490 int register_netdevice_notifier(struct notifier_block *nb)
1492 struct net_device *dev;
1493 struct net_device *last;
1494 struct net *net;
1495 int err;
1497 rtnl_lock();
1498 err = raw_notifier_chain_register(&netdev_chain, nb);
1499 if (err)
1500 goto unlock;
1501 if (dev_boot_phase)
1502 goto unlock;
1503 for_each_net(net) {
1504 for_each_netdev(net, dev) {
1505 err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1506 err = notifier_to_errno(err);
1507 if (err)
1508 goto rollback;
1510 if (!(dev->flags & IFF_UP))
1511 continue;
1513 nb->notifier_call(nb, NETDEV_UP, dev);
1517 unlock:
1518 rtnl_unlock();
1519 return err;
1521 rollback:
1522 last = dev;
1523 for_each_net(net) {
1524 for_each_netdev(net, dev) {
1525 if (dev == last)
1526 goto outroll;
1528 if (dev->flags & IFF_UP) {
1529 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1530 nb->notifier_call(nb, NETDEV_DOWN, dev);
1532 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1536 outroll:
1537 raw_notifier_chain_unregister(&netdev_chain, nb);
1538 goto unlock;
1540 EXPORT_SYMBOL(register_netdevice_notifier);
1543 * unregister_netdevice_notifier - unregister a network notifier block
1544 * @nb: notifier
1546 * Unregister a notifier previously registered by
1547 * register_netdevice_notifier(). The notifier is unlinked from the
1548 * kernel structures and may then be reused. A negative errno code
1549 * is returned on a failure.
1551 * After unregistering, unregister and down device events are synthesized
1552 * for all devices on the device list and delivered to the removed notifier,
1553 * removing the need for special-case cleanup code.
1556 int unregister_netdevice_notifier(struct notifier_block *nb)
1558 struct net_device *dev;
1559 struct net *net;
1560 int err;
1562 rtnl_lock();
1563 err = raw_notifier_chain_unregister(&netdev_chain, nb);
1564 if (err)
1565 goto unlock;
1567 for_each_net(net) {
1568 for_each_netdev(net, dev) {
1569 if (dev->flags & IFF_UP) {
1570 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1571 nb->notifier_call(nb, NETDEV_DOWN, dev);
1573 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1576 unlock:
1577 rtnl_unlock();
1578 return err;
1580 EXPORT_SYMBOL(unregister_netdevice_notifier);
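/*
 * Illustrative sketch (not part of dev.c): watching device state through the
 * notifier chain above. In this kernel the notifier data pointer is the
 * struct net_device itself; the example_* names are hypothetical.
 */
static int example_netdev_event(struct notifier_block *nb,
                                unsigned long event, void *ptr)
{
        struct net_device *dev = ptr;

        switch (event) {
        case NETDEV_UP:
                pr_debug("%s is up\n", dev->name);
                break;
        case NETDEV_GOING_DOWN:
                pr_debug("%s is going down\n", dev->name);
                break;
        }
        return NOTIFY_DONE;
}

static struct notifier_block example_netdev_nb = {
        .notifier_call = example_netdev_event,
};

/* register_netdevice_notifier(&example_netdev_nb) replays NETDEV_REGISTER and
 * NETDEV_UP for already-present devices; unregister_netdevice_notifier()
 * synthesizes the matching down/unregister events on removal.
 */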
1583 * call_netdevice_notifiers - call all network notifier blocks
1584 * @val: value passed unmodified to notifier function
1585 * @dev: net_device pointer passed unmodified to notifier function
1587 * Call all network notifier blocks. Parameters and return value
1588 * are as for raw_notifier_call_chain().
1591 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1593 ASSERT_RTNL();
1594 return raw_notifier_call_chain(&netdev_chain, val, dev);
1596 EXPORT_SYMBOL(call_netdevice_notifiers);
1598 static struct static_key netstamp_needed __read_mostly;
1599 #ifdef HAVE_JUMP_LABEL
1600 /* We are not allowed to call static_key_slow_dec() from irq context
1601 * If net_disable_timestamp() is called from irq context, defer the
1602 * static_key_slow_dec() calls.
1604 static atomic_t netstamp_needed_deferred;
1605 #endif
1607 void net_enable_timestamp(void)
1609 #ifdef HAVE_JUMP_LABEL
1610 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1612 if (deferred) {
1613 while (--deferred)
1614 static_key_slow_dec(&netstamp_needed);
1615 return;
1617 #endif
1618 WARN_ON(in_interrupt());
1619 static_key_slow_inc(&netstamp_needed);
1621 EXPORT_SYMBOL(net_enable_timestamp);
1623 void net_disable_timestamp(void)
1625 #ifdef HAVE_JUMP_LABEL
1626 if (in_interrupt()) {
1627 atomic_inc(&netstamp_needed_deferred);
1628 return;
1630 #endif
1631 static_key_slow_dec(&netstamp_needed);
1633 EXPORT_SYMBOL(net_disable_timestamp);
1635 static inline void net_timestamp_set(struct sk_buff *skb)
1637 skb->tstamp.tv64 = 0;
1638 if (static_key_false(&netstamp_needed))
1639 __net_timestamp(skb);
1642 #define net_timestamp_check(COND, SKB) \
1643 if (static_key_false(&netstamp_needed)) { \
1644 if ((COND) && !(SKB)->tstamp.tv64) \
1645 __net_timestamp(SKB); \
1648 static int net_hwtstamp_validate(struct ifreq *ifr)
1650 struct hwtstamp_config cfg;
1651 enum hwtstamp_tx_types tx_type;
1652 enum hwtstamp_rx_filters rx_filter;
1653 int tx_type_valid = 0;
1654 int rx_filter_valid = 0;
1656 if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
1657 return -EFAULT;
1659 if (cfg.flags) /* reserved for future extensions */
1660 return -EINVAL;
1662 tx_type = cfg.tx_type;
1663 rx_filter = cfg.rx_filter;
1665 switch (tx_type) {
1666 case HWTSTAMP_TX_OFF:
1667 case HWTSTAMP_TX_ON:
1668 case HWTSTAMP_TX_ONESTEP_SYNC:
1669 tx_type_valid = 1;
1670 break;
1673 switch (rx_filter) {
1674 case HWTSTAMP_FILTER_NONE:
1675 case HWTSTAMP_FILTER_ALL:
1676 case HWTSTAMP_FILTER_SOME:
1677 case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
1678 case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
1679 case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
1680 case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
1681 case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
1682 case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
1683 case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
1684 case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
1685 case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
1686 case HWTSTAMP_FILTER_PTP_V2_EVENT:
1687 case HWTSTAMP_FILTER_PTP_V2_SYNC:
1688 case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
1689 rx_filter_valid = 1;
1690 break;
1693 if (!tx_type_valid || !rx_filter_valid)
1694 return -ERANGE;
1696 return 0;
1699 static inline bool is_skb_forwardable(struct net_device *dev,
1700 struct sk_buff *skb)
1702 unsigned int len;
1704 if (!(dev->flags & IFF_UP))
1705 return false;
1707 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1708 if (skb->len <= len)
1709 return true;
1711 /* if TSO is enabled, we don't care about the length as the packet
1712 * could be forwarded without being segmented before
1714 if (skb_is_gso(skb))
1715 return true;
1717 return false;
1721 * dev_forward_skb - loopback an skb to another netif
1723 * @dev: destination network device
1724 * @skb: buffer to forward
1726 * return values:
1727 * NET_RX_SUCCESS (no congestion)
1728 * NET_RX_DROP (packet was dropped, but freed)
1730 * dev_forward_skb can be used for injecting an skb from the
1731 * start_xmit function of one device into the receive queue
1732 * of another device.
1734 * The receiving device may be in another namespace, so
1735 * we have to clear all information in the skb that could
1736 * impact namespace isolation.
1738 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1740 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1741 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1742 atomic_long_inc(&dev->rx_dropped);
1743 kfree_skb(skb);
1744 return NET_RX_DROP;
1748 skb_orphan(skb);
1749 nf_reset(skb);
1751 if (unlikely(!is_skb_forwardable(dev, skb))) {
1752 atomic_long_inc(&dev->rx_dropped);
1753 kfree_skb(skb);
1754 return NET_RX_DROP;
1756 skb->skb_iif = 0;
1757 skb->dev = dev;
1758 skb_dst_drop(skb);
1759 skb->tstamp.tv64 = 0;
1760 skb->pkt_type = PACKET_HOST;
1761 skb->protocol = eth_type_trans(skb, dev);
1762 skb->mark = 0;
1763 secpath_reset(skb);
1764 nf_reset(skb);
1765 return netif_rx(skb);
1767 EXPORT_SYMBOL_GPL(dev_forward_skb);
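/*
 * Illustrative sketch (not part of dev.c): a veth-like pair device handing a
 * transmitted skb to its peer's receive path. The private structure is
 * hypothetical.
 */
struct example_pair_priv {
        struct net_device *peer;        /* hypothetical driver state */
};

static netdev_tx_t example_pair_xmit(struct sk_buff *skb, struct net_device *dev)
{
        struct example_pair_priv *priv = netdev_priv(dev);

        /* dev_forward_skb() scrubs namespace-sensitive state (dst, mark,
         * timestamp, skb_iif) and feeds the skb to the peer via netif_rx();
         * it frees the skb on drop, so the caller never does.
         */
        dev_forward_skb(priv->peer, skb);
        return NETDEV_TX_OK;
}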
1769 static inline int deliver_skb(struct sk_buff *skb,
1770 struct packet_type *pt_prev,
1771 struct net_device *orig_dev)
1773 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1774 return -ENOMEM;
1775 atomic_inc(&skb->users);
1776 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1779 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1781 if (!ptype->af_packet_priv || !skb->sk)
1782 return false;
1784 if (ptype->id_match)
1785 return ptype->id_match(ptype, skb->sk);
1786 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1787 return true;
1789 return false;
1793 * Support routine. Sends outgoing frames to any network
1794 * taps currently in use.
1797 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1799 struct packet_type *ptype;
1800 struct sk_buff *skb2 = NULL;
1801 struct packet_type *pt_prev = NULL;
1803 rcu_read_lock();
1804 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1805 /* Never send packets back to the socket
1806 * they originated from - MvS (miquels@drinkel.ow.org)
1808 if ((ptype->dev == dev || !ptype->dev) &&
1809 (!skb_loop_sk(ptype, skb))) {
1810 if (pt_prev) {
1811 deliver_skb(skb2, pt_prev, skb->dev);
1812 pt_prev = ptype;
1813 continue;
1816 skb2 = skb_clone(skb, GFP_ATOMIC);
1817 if (!skb2)
1818 break;
1820 net_timestamp_set(skb2);
1822 /* skb->nh should be correctly
1823 set by sender, so that the second statement is
1824 just protection against buggy protocols.
1826 skb_reset_mac_header(skb2);
1828 if (skb_network_header(skb2) < skb2->data ||
1829 skb2->network_header > skb2->tail) {
1830 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1831 ntohs(skb2->protocol),
1832 dev->name);
1833 skb_reset_network_header(skb2);
1836 skb2->transport_header = skb2->network_header;
1837 skb2->pkt_type = PACKET_OUTGOING;
1838 pt_prev = ptype;
1841 if (pt_prev)
1842 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1843 rcu_read_unlock();
1847 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1848 * @dev: Network device
1849 * @txq: number of queues available
1851 * If real_num_tx_queues is changed the tc mappings may no longer be
1852 * valid. To resolve this, verify that the tc mapping remains valid and,
1853 * if not, zero the mapping. With no priorities mapping to this
1854 * offset/count pair it will no longer be used. In the worst case, if TC0
1855 * is invalid nothing can be done, so priority mappings are disabled. It is
1856 * expected that drivers will fix this mapping if they can before
1857 * calling netif_set_real_num_tx_queues.
1859 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1861 int i;
1862 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1864 /* If TC0 is invalidated disable TC mapping */
1865 if (tc->offset + tc->count > txq) {
1866 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1867 dev->num_tc = 0;
1868 return;
1871 /* Invalidated prio to tc mappings set to TC0 */
1872 for (i = 1; i < TC_BITMASK + 1; i++) {
1873 int q = netdev_get_prio_tc_map(dev, i);
1875 tc = &dev->tc_to_txq[q];
1876 if (tc->offset + tc->count > txq) {
1877 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1878 i, q);
1879 netdev_set_prio_tc_map(dev, i, 0);
1884 #ifdef CONFIG_XPS
1885 static DEFINE_MUTEX(xps_map_mutex);
1886 #define xmap_dereference(P) \
1887 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1889 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1890 int cpu, u16 index)
1892 struct xps_map *map = NULL;
1893 int pos;
1895 if (dev_maps)
1896 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1898 for (pos = 0; map && pos < map->len; pos++) {
1899 if (map->queues[pos] == index) {
1900 if (map->len > 1) {
1901 map->queues[pos] = map->queues[--map->len];
1902 } else {
1903 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1904 kfree_rcu(map, rcu);
1905 map = NULL;
1907 break;
1911 return map;
1914 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1916 struct xps_dev_maps *dev_maps;
1917 int cpu, i;
1918 bool active = false;
1920 mutex_lock(&xps_map_mutex);
1921 dev_maps = xmap_dereference(dev->xps_maps);
1923 if (!dev_maps)
1924 goto out_no_maps;
1926 for_each_possible_cpu(cpu) {
1927 for (i = index; i < dev->num_tx_queues; i++) {
1928 if (!remove_xps_queue(dev_maps, cpu, i))
1929 break;
1931 if (i == dev->num_tx_queues)
1932 active = true;
1935 if (!active) {
1936 RCU_INIT_POINTER(dev->xps_maps, NULL);
1937 kfree_rcu(dev_maps, rcu);
1940 for (i = index; i < dev->num_tx_queues; i++)
1941 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1942 NUMA_NO_NODE);
1944 out_no_maps:
1945 mutex_unlock(&xps_map_mutex);
1948 static struct xps_map *expand_xps_map(struct xps_map *map,
1949 int cpu, u16 index)
1951 struct xps_map *new_map;
1952 int alloc_len = XPS_MIN_MAP_ALLOC;
1953 int i, pos;
1955 for (pos = 0; map && pos < map->len; pos++) {
1956 if (map->queues[pos] != index)
1957 continue;
1958 return map;
1961 /* Need to add queue to this CPU's existing map */
1962 if (map) {
1963 if (pos < map->alloc_len)
1964 return map;
1966 alloc_len = map->alloc_len * 2;
1969 /* Need to allocate new map to store queue on this CPU's map */
1970 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1971 cpu_to_node(cpu));
1972 if (!new_map)
1973 return NULL;
1975 for (i = 0; i < pos; i++)
1976 new_map->queues[i] = map->queues[i];
1977 new_map->alloc_len = alloc_len;
1978 new_map->len = pos;
1980 return new_map;
1983 int netif_set_xps_queue(struct net_device *dev, struct cpumask *mask, u16 index)
1985 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1986 struct xps_map *map, *new_map;
1987 int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1988 int cpu, numa_node_id = -2;
1989 bool active = false;
1991 mutex_lock(&xps_map_mutex);
1993 dev_maps = xmap_dereference(dev->xps_maps);
1995 /* allocate memory for queue storage */
1996 for_each_online_cpu(cpu) {
1997 if (!cpumask_test_cpu(cpu, mask))
1998 continue;
2000 if (!new_dev_maps)
2001 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2002 if (!new_dev_maps)
2003 return -ENOMEM;
2005 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2006 NULL;
2008 map = expand_xps_map(map, cpu, index);
2009 if (!map)
2010 goto error;
2012 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2015 if (!new_dev_maps)
2016 goto out_no_new_maps;
2018 for_each_possible_cpu(cpu) {
2019 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2020 /* add queue to CPU maps */
2021 int pos = 0;
2023 map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2024 while ((pos < map->len) && (map->queues[pos] != index))
2025 pos++;
2027 if (pos == map->len)
2028 map->queues[map->len++] = index;
2029 #ifdef CONFIG_NUMA
2030 if (numa_node_id == -2)
2031 numa_node_id = cpu_to_node(cpu);
2032 else if (numa_node_id != cpu_to_node(cpu))
2033 numa_node_id = -1;
2034 #endif
2035 } else if (dev_maps) {
2036 /* fill in the new device map from the old device map */
2037 map = xmap_dereference(dev_maps->cpu_map[cpu]);
2038 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2043 rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2045 /* Cleanup old maps */
2046 if (dev_maps) {
2047 for_each_possible_cpu(cpu) {
2048 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2049 map = xmap_dereference(dev_maps->cpu_map[cpu]);
2050 if (map && map != new_map)
2051 kfree_rcu(map, rcu);
2054 kfree_rcu(dev_maps, rcu);
2057 dev_maps = new_dev_maps;
2058 active = true;
2060 out_no_new_maps:
2061 /* update Tx queue numa node */
2062 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2063 (numa_node_id >= 0) ? numa_node_id :
2064 NUMA_NO_NODE);
2066 if (!dev_maps)
2067 goto out_no_maps;
2069 /* removes queue from unused CPUs */
2070 for_each_possible_cpu(cpu) {
2071 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2072 continue;
2074 if (remove_xps_queue(dev_maps, cpu, index))
2075 active = true;
2078 /* free map if not active */
2079 if (!active) {
2080 RCU_INIT_POINTER(dev->xps_maps, NULL);
2081 kfree_rcu(dev_maps, rcu);
2084 out_no_maps:
2085 mutex_unlock(&xps_map_mutex);
2087 return 0;
2088 error:
2089 /* remove any maps that we added */
2090 for_each_possible_cpu(cpu) {
2091 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2092 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2093 NULL;
2094 if (new_map && new_map != map)
2095 kfree(new_map);
2098 mutex_unlock(&xps_map_mutex);
2100 kfree(new_dev_maps);
2101 return -ENOMEM;
2103 EXPORT_SYMBOL(netif_set_xps_queue);
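/*
 * Illustrative sketch (not part of this file): a multiqueue driver could use
 * netif_set_xps_queue() from its probe or channel-setup path to pin each TX
 * queue to one CPU.  The loop and the round-robin policy are hypothetical.
 *
 *	u16 qi;
 *	struct cpumask mask;
 *
 *	for (qi = 0; qi < dev->real_num_tx_queues; qi++) {
 *		cpumask_clear(&mask);
 *		cpumask_set_cpu(qi % num_online_cpus(), &mask);
 *		if (netif_set_xps_queue(dev, &mask, qi))
 *			netdev_warn(dev, "XPS setup failed for queue %u\n", qi);
 *	}
 */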
2105 #endif
2107 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2108 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2110 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2112 int rc;
2114 if (txq < 1 || txq > dev->num_tx_queues)
2115 return -EINVAL;
2117 if (dev->reg_state == NETREG_REGISTERED ||
2118 dev->reg_state == NETREG_UNREGISTERING) {
2119 ASSERT_RTNL();
2121 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2122 txq);
2123 if (rc)
2124 return rc;
2126 if (dev->num_tc)
2127 netif_setup_tc(dev, txq);
2129 if (txq < dev->real_num_tx_queues) {
2130 qdisc_reset_all_tx_gt(dev, txq);
2131 #ifdef CONFIG_XPS
2132 netif_reset_xps_queues_gt(dev, txq);
2133 #endif
2137 dev->real_num_tx_queues = txq;
2138 return 0;
2140 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2142 #ifdef CONFIG_RPS
2144 * netif_set_real_num_rx_queues - set actual number of RX queues used
2145 * @dev: Network device
2146 * @rxq: Actual number of RX queues
2148 * This must be called either with the rtnl_lock held or before
2149 * registration of the net device. Returns 0 on success, or a
2150 * negative error code. If called before registration, it always
2151 * succeeds.
2153 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2155 int rc;
2157 if (rxq < 1 || rxq > dev->num_rx_queues)
2158 return -EINVAL;
2160 if (dev->reg_state == NETREG_REGISTERED) {
2161 ASSERT_RTNL();
2163 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2164 rxq);
2165 if (rc)
2166 return rc;
2169 dev->real_num_rx_queues = rxq;
2170 return 0;
2172 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
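/*
 * Illustrative sketch (not part of this file): a driver that reconfigures its
 * channel count (e.g. from an ethtool set_channels handler) would shrink or
 * grow the active queue sets with the two helpers above, under RTNL.  The
 * "new_txq"/"new_rxq" variables are hypothetical.
 *
 *	int err;
 *
 *	err = netif_set_real_num_tx_queues(dev, new_txq);
 *	if (err)
 *		return err;
 *
 *	return netif_set_real_num_rx_queues(dev, new_rxq);
 */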
2173 #endif
2176 * netif_get_num_default_rss_queues - default number of RSS queues
2178 * This routine should set an upper limit on the number of RSS queues
2179 * used by default by multiqueue devices.
2181 int netif_get_num_default_rss_queues(void)
2183 return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2185 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
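/*
 * Illustrative sketch (not part of this file): a driver would typically bound
 * its requested queue count by this helper and by what the hardware supports;
 * "hw_max_queues" and "struct my_priv" are hypothetical.
 *
 *	unsigned int nq = min_t(unsigned int, hw_max_queues,
 *				netif_get_num_default_rss_queues());
 *	struct net_device *dev = alloc_etherdev_mq(sizeof(struct my_priv), nq);
 */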
2187 static inline void __netif_reschedule(struct Qdisc *q)
2189 struct softnet_data *sd;
2190 unsigned long flags;
2192 local_irq_save(flags);
2193 sd = &__get_cpu_var(softnet_data);
2194 q->next_sched = NULL;
2195 *sd->output_queue_tailp = q;
2196 sd->output_queue_tailp = &q->next_sched;
2197 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2198 local_irq_restore(flags);
2201 void __netif_schedule(struct Qdisc *q)
2203 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2204 __netif_reschedule(q);
2206 EXPORT_SYMBOL(__netif_schedule);
2208 void dev_kfree_skb_irq(struct sk_buff *skb)
2210 if (atomic_dec_and_test(&skb->users)) {
2211 struct softnet_data *sd;
2212 unsigned long flags;
2214 local_irq_save(flags);
2215 sd = &__get_cpu_var(softnet_data);
2216 skb->next = sd->completion_queue;
2217 sd->completion_queue = skb;
2218 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2219 local_irq_restore(flags);
2222 EXPORT_SYMBOL(dev_kfree_skb_irq);
2224 void dev_kfree_skb_any(struct sk_buff *skb)
2226 if (in_irq() || irqs_disabled())
2227 dev_kfree_skb_irq(skb);
2228 else
2229 dev_kfree_skb(skb);
2231 EXPORT_SYMBOL(dev_kfree_skb_any);
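/*
 * Illustrative sketch (not part of this file): a TX-completion handler that
 * may run in hardirq context frees skbs with dev_kfree_skb_any() (or
 * dev_kfree_skb_irq()) rather than dev_kfree_skb(), which must not be called
 * with interrupts disabled.  The "my_tx_clean" helper is hypothetical.
 *
 *	static void my_tx_clean(struct sk_buff *completed_skb)
 *	{
 *		dev_kfree_skb_any(completed_skb);
 *	}
 */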
2235 * netif_device_detach - mark device as removed
2236 * @dev: network device
2238 * Mark device as removed from the system and therefore no longer available.
2240 void netif_device_detach(struct net_device *dev)
2242 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2243 netif_running(dev)) {
2244 netif_tx_stop_all_queues(dev);
2247 EXPORT_SYMBOL(netif_device_detach);
2250 * netif_device_attach - mark device as attached
2251 * @dev: network device
2253 * Mark device as attached to the system and restart it if needed.
2255 void netif_device_attach(struct net_device *dev)
2257 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2258 netif_running(dev)) {
2259 netif_tx_wake_all_queues(dev);
2260 __netdev_watchdog_up(dev);
2263 EXPORT_SYMBOL(netif_device_attach);
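/*
 * Illustrative sketch (not part of this file): a typical driver pairs the two
 * helpers above in its suspend/resume callbacks; "my_suspend"/"my_resume" and
 * the omitted hardware teardown/bring-up are hypothetical.
 *
 *	static int my_suspend(struct device *d)
 *	{
 *		struct net_device *dev = dev_get_drvdata(d);
 *
 *		netif_device_detach(dev);
 *		return 0;
 *	}
 *
 *	static int my_resume(struct device *d)
 *	{
 *		struct net_device *dev = dev_get_drvdata(d);
 *
 *		netif_device_attach(dev);
 *		return 0;
 *	}
 */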
2265 static void skb_warn_bad_offload(const struct sk_buff *skb)
2267 static const netdev_features_t null_features = 0;
2268 struct net_device *dev = skb->dev;
2269 const char *driver = "";
2271 if (dev && dev->dev.parent)
2272 driver = dev_driver_string(dev->dev.parent);
2274 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2275 "gso_type=%d ip_summed=%d\n",
2276 driver, dev ? &dev->features : &null_features,
2277 skb->sk ? &skb->sk->sk_route_caps : &null_features,
2278 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2279 skb_shinfo(skb)->gso_type, skb->ip_summed);
2283 * Invalidate hardware checksum when packet is to be mangled, and
2284 * complete checksum manually on outgoing path.
2286 int skb_checksum_help(struct sk_buff *skb)
2288 __wsum csum;
2289 int ret = 0, offset;
2291 if (skb->ip_summed == CHECKSUM_COMPLETE)
2292 goto out_set_summed;
2294 if (unlikely(skb_shinfo(skb)->gso_size)) {
2295 skb_warn_bad_offload(skb);
2296 return -EINVAL;
2299 /* Before computing a checksum, we should make sure no frag could
2300 * be modified by an external entity : checksum could be wrong.
2302 if (skb_has_shared_frag(skb)) {
2303 ret = __skb_linearize(skb);
2304 if (ret)
2305 goto out;
2308 offset = skb_checksum_start_offset(skb);
2309 BUG_ON(offset >= skb_headlen(skb));
2310 csum = skb_checksum(skb, offset, skb->len - offset, 0);
2312 offset += skb->csum_offset;
2313 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2315 if (skb_cloned(skb) &&
2316 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2317 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2318 if (ret)
2319 goto out;
2322 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2323 out_set_summed:
2324 skb->ip_summed = CHECKSUM_NONE;
2325 out:
2326 return ret;
2328 EXPORT_SYMBOL(skb_checksum_help);
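/*
 * Illustrative sketch (not part of this file): a driver whose hardware cannot
 * checksum a particular packet can fall back to software in its
 * ndo_start_xmit(); "my_hw_can_csum()" and the "drop" label are hypothetical.
 *
 *	if (skb->ip_summed == CHECKSUM_PARTIAL && !my_hw_can_csum(skb)) {
 *		if (skb_checksum_help(skb))
 *			goto drop;
 *	}
 */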
2330 /* openvswitch calls this on rx path, so we need a different check.
2332 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2334 if (tx_path)
2335 return skb->ip_summed != CHECKSUM_PARTIAL;
2336 else
2337 return skb->ip_summed == CHECKSUM_NONE;
2341 * __skb_gso_segment - Perform segmentation on skb.
2342 * @skb: buffer to segment
2343 * @features: features for the output path (see dev->features)
2344 * @tx_path: whether it is called in TX path
2346 * This function segments the given skb and returns a list of segments.
2348 * It may return NULL if the skb requires no segmentation. This is
2349 * only possible when GSO is used for verifying header integrity.
2351 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2352 netdev_features_t features, bool tx_path)
2354 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2355 struct packet_offload *ptype;
2356 __be16 type = skb->protocol;
2357 int vlan_depth = ETH_HLEN;
2358 int err;
2360 while (type == htons(ETH_P_8021Q)) {
2361 struct vlan_hdr *vh;
2363 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
2364 return ERR_PTR(-EINVAL);
2366 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2367 type = vh->h_vlan_encapsulated_proto;
2368 vlan_depth += VLAN_HLEN;
2371 skb_reset_mac_header(skb);
2372 skb->mac_len = skb->network_header - skb->mac_header;
2373 __skb_pull(skb, skb->mac_len);
2375 if (unlikely(skb_needs_check(skb, tx_path))) {
2376 skb_warn_bad_offload(skb);
2378 if (skb_header_cloned(skb) &&
2379 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2380 return ERR_PTR(err);
2383 rcu_read_lock();
2384 list_for_each_entry_rcu(ptype, &offload_base, list) {
2385 if (ptype->type == type && ptype->callbacks.gso_segment) {
2386 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2387 err = ptype->callbacks.gso_send_check(skb);
2388 segs = ERR_PTR(err);
2389 if (err || skb_gso_ok(skb, features))
2390 break;
2391 __skb_push(skb, (skb->data -
2392 skb_network_header(skb)));
2394 segs = ptype->callbacks.gso_segment(skb, features);
2395 break;
2398 rcu_read_unlock();
2400 __skb_push(skb, skb->data - skb_mac_header(skb));
2402 return segs;
2404 EXPORT_SYMBOL(__skb_gso_segment);
2406 /* Take action when hardware reception checksum errors are detected. */
2407 #ifdef CONFIG_BUG
2408 void netdev_rx_csum_fault(struct net_device *dev)
2410 if (net_ratelimit()) {
2411 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2412 dump_stack();
2415 EXPORT_SYMBOL(netdev_rx_csum_fault);
2416 #endif
2418 /* Actually, we should eliminate this check as soon as we know that:
2419  * 1. IOMMU is present and allows mapping all the memory.
2420 * 2. No high memory really exists on this machine.
2423 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2425 #ifdef CONFIG_HIGHMEM
2426 int i;
2427 if (!(dev->features & NETIF_F_HIGHDMA)) {
2428 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2429 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2430 if (PageHighMem(skb_frag_page(frag)))
2431 return 1;
2435 if (PCI_DMA_BUS_IS_PHYS) {
2436 struct device *pdev = dev->dev.parent;
2438 if (!pdev)
2439 return 0;
2440 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2441 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2442 dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2443 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2444 return 1;
2447 #endif
2448 return 0;
2451 struct dev_gso_cb {
2452 void (*destructor)(struct sk_buff *skb);
2455 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2457 static void dev_gso_skb_destructor(struct sk_buff *skb)
2459 struct dev_gso_cb *cb;
2461 do {
2462 struct sk_buff *nskb = skb->next;
2464 skb->next = nskb->next;
2465 nskb->next = NULL;
2466 kfree_skb(nskb);
2467 } while (skb->next);
2469 cb = DEV_GSO_CB(skb);
2470 if (cb->destructor)
2471 cb->destructor(skb);
2475 * dev_gso_segment - Perform emulated hardware segmentation on skb.
2476 * @skb: buffer to segment
2477 * @features: device features as applicable to this skb
2479 * This function segments the given skb and stores the list of segments
2480 * in skb->next.
2482 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2484 struct sk_buff *segs;
2486 segs = skb_gso_segment(skb, features);
2488 /* Verifying header integrity only. */
2489 if (!segs)
2490 return 0;
2492 if (IS_ERR(segs))
2493 return PTR_ERR(segs);
2495 skb->next = segs;
2496 DEV_GSO_CB(skb)->destructor = skb->destructor;
2497 skb->destructor = dev_gso_skb_destructor;
2499 return 0;
2502 static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
2504 return ((features & NETIF_F_GEN_CSUM) ||
2505 ((features & NETIF_F_V4_CSUM) &&
2506 protocol == htons(ETH_P_IP)) ||
2507 ((features & NETIF_F_V6_CSUM) &&
2508 protocol == htons(ETH_P_IPV6)) ||
2509 ((features & NETIF_F_FCOE_CRC) &&
2510 protocol == htons(ETH_P_FCOE)));
2513 static netdev_features_t harmonize_features(struct sk_buff *skb,
2514 __be16 protocol, netdev_features_t features)
2516 if (skb->ip_summed != CHECKSUM_NONE &&
2517 !can_checksum_protocol(features, protocol)) {
2518 features &= ~NETIF_F_ALL_CSUM;
2519 features &= ~NETIF_F_SG;
2520 } else if (illegal_highdma(skb->dev, skb)) {
2521 features &= ~NETIF_F_SG;
2524 return features;
2527 netdev_features_t netif_skb_features(struct sk_buff *skb)
2529 __be16 protocol = skb->protocol;
2530 netdev_features_t features = skb->dev->features;
2532 if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2533 features &= ~NETIF_F_GSO_MASK;
2535 if (protocol == htons(ETH_P_8021Q)) {
2536 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2537 protocol = veh->h_vlan_encapsulated_proto;
2538 } else if (!vlan_tx_tag_present(skb)) {
2539 return harmonize_features(skb, protocol, features);
2542 features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2544 if (protocol != htons(ETH_P_8021Q)) {
2545 return harmonize_features(skb, protocol, features);
2546 } else {
2547 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2548 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2549 return harmonize_features(skb, protocol, features);
2552 EXPORT_SYMBOL(netif_skb_features);
2555 * Returns true if either:
2556 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
2557 * 2. skb is fragmented and the device does not support SG.
2559 static inline int skb_needs_linearize(struct sk_buff *skb,
2560 int features)
2562 return skb_is_nonlinear(skb) &&
2563 ((skb_has_frag_list(skb) &&
2564 !(features & NETIF_F_FRAGLIST)) ||
2565 (skb_shinfo(skb)->nr_frags &&
2566 !(features & NETIF_F_SG)));
2569 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2570 struct netdev_queue *txq)
2572 const struct net_device_ops *ops = dev->netdev_ops;
2573 int rc = NETDEV_TX_OK;
2574 unsigned int skb_len;
2576 if (likely(!skb->next)) {
2577 netdev_features_t features;
2580 * If device doesn't need skb->dst, release it right now while
2581 * it's hot in this cpu cache
2583 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2584 skb_dst_drop(skb);
2586 features = netif_skb_features(skb);
2588 if (vlan_tx_tag_present(skb) &&
2589 !(features & NETIF_F_HW_VLAN_TX)) {
2590 skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2591 if (unlikely(!skb))
2592 goto out;
2594 skb->vlan_tci = 0;
2597 /* If this is an encapsulation offload request, verify we are testing
2598 * hardware encapsulation features instead of the standard
2599 * features for the netdev
2601 if (skb->encapsulation)
2602 features &= dev->hw_enc_features;
2604 if (netif_needs_gso(skb, features)) {
2605 if (unlikely(dev_gso_segment(skb, features)))
2606 goto out_kfree_skb;
2607 if (skb->next)
2608 goto gso;
2609 } else {
2610 if (skb_needs_linearize(skb, features) &&
2611 __skb_linearize(skb))
2612 goto out_kfree_skb;
2614 /* If packet is not checksummed and device does not
2615 * support checksumming for this protocol, complete
2616 * checksumming here.
2618 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2619 if (skb->encapsulation)
2620 skb_set_inner_transport_header(skb,
2621 skb_checksum_start_offset(skb));
2622 else
2623 skb_set_transport_header(skb,
2624 skb_checksum_start_offset(skb));
2625 if (!(features & NETIF_F_ALL_CSUM) &&
2626 skb_checksum_help(skb))
2627 goto out_kfree_skb;
2631 if (!list_empty(&ptype_all))
2632 dev_queue_xmit_nit(skb, dev);
2634 skb_len = skb->len;
2635 rc = ops->ndo_start_xmit(skb, dev);
2636 trace_net_dev_xmit(skb, rc, dev, skb_len);
2637 if (rc == NETDEV_TX_OK)
2638 txq_trans_update(txq);
2639 return rc;
2642 gso:
2643 do {
2644 struct sk_buff *nskb = skb->next;
2646 skb->next = nskb->next;
2647 nskb->next = NULL;
2650 * If device doesn't need nskb->dst, release it right now while
2651 * it's hot in this cpu cache
2653 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2654 skb_dst_drop(nskb);
2656 if (!list_empty(&ptype_all))
2657 dev_queue_xmit_nit(nskb, dev);
2659 skb_len = nskb->len;
2660 rc = ops->ndo_start_xmit(nskb, dev);
2661 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2662 if (unlikely(rc != NETDEV_TX_OK)) {
2663 if (rc & ~NETDEV_TX_MASK)
2664 goto out_kfree_gso_skb;
2665 nskb->next = skb->next;
2666 skb->next = nskb;
2667 return rc;
2669 txq_trans_update(txq);
2670 if (unlikely(netif_xmit_stopped(txq) && skb->next))
2671 return NETDEV_TX_BUSY;
2672 } while (skb->next);
2674 out_kfree_gso_skb:
2675 if (likely(skb->next == NULL))
2676 skb->destructor = DEV_GSO_CB(skb)->destructor;
2677 out_kfree_skb:
2678 kfree_skb(skb);
2679 out:
2680 return rc;
2683 static void qdisc_pkt_len_init(struct sk_buff *skb)
2685 const struct skb_shared_info *shinfo = skb_shinfo(skb);
2687 qdisc_skb_cb(skb)->pkt_len = skb->len;
2689 /* To get a more precise estimate of the bytes sent on the wire,
2690 * we add to pkt_len the header size of all segments
2692 if (shinfo->gso_size) {
2693 unsigned int hdr_len;
2695 /* mac layer + network layer */
2696 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2698 /* + transport layer */
2699 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2700 hdr_len += tcp_hdrlen(skb);
2701 else
2702 hdr_len += sizeof(struct udphdr);
2703 qdisc_skb_cb(skb)->pkt_len += (shinfo->gso_segs - 1) * hdr_len;
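/*
 * Worked example for the estimate above (numbers are illustrative): a TSO skb
 * carrying 45 segments of 1448 bytes behind 66 bytes of Ethernet + IPv4 + TCP
 * headers has skb->len = 66 + 45 * 1448 = 65226, so pkt_len becomes
 * 65226 + (45 - 1) * 66 = 68130, i.e. roughly what will actually hit the wire
 * once the hardware replicates the headers for every segment.
 */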
2707 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2708 struct net_device *dev,
2709 struct netdev_queue *txq)
2711 spinlock_t *root_lock = qdisc_lock(q);
2712 bool contended;
2713 int rc;
2715 qdisc_pkt_len_init(skb);
2716 qdisc_calculate_pkt_len(skb, q);
2718 * Heuristic to force contended enqueues to serialize on a
2719 * separate lock before trying to get qdisc main lock.
2720 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2721 * and dequeue packets faster.
2723 contended = qdisc_is_running(q);
2724 if (unlikely(contended))
2725 spin_lock(&q->busylock);
2727 spin_lock(root_lock);
2728 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2729 kfree_skb(skb);
2730 rc = NET_XMIT_DROP;
2731 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2732 qdisc_run_begin(q)) {
2734 * This is a work-conserving queue; there are no old skbs
2735 * waiting to be sent out; and the qdisc is not running -
2736 * xmit the skb directly.
2738 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2739 skb_dst_force(skb);
2741 qdisc_bstats_update(q, skb);
2743 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2744 if (unlikely(contended)) {
2745 spin_unlock(&q->busylock);
2746 contended = false;
2748 __qdisc_run(q);
2749 } else
2750 qdisc_run_end(q);
2752 rc = NET_XMIT_SUCCESS;
2753 } else {
2754 skb_dst_force(skb);
2755 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2756 if (qdisc_run_begin(q)) {
2757 if (unlikely(contended)) {
2758 spin_unlock(&q->busylock);
2759 contended = false;
2761 __qdisc_run(q);
2764 spin_unlock(root_lock);
2765 if (unlikely(contended))
2766 spin_unlock(&q->busylock);
2767 return rc;
2770 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2771 static void skb_update_prio(struct sk_buff *skb)
2773 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2775 if (!skb->priority && skb->sk && map) {
2776 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2778 if (prioidx < map->priomap_len)
2779 skb->priority = map->priomap[prioidx];
2782 #else
2783 #define skb_update_prio(skb)
2784 #endif
2786 static DEFINE_PER_CPU(int, xmit_recursion);
2787 #define RECURSION_LIMIT 10
2790 * dev_loopback_xmit - loop back @skb
2791 * @skb: buffer to transmit
2793 int dev_loopback_xmit(struct sk_buff *skb)
2795 skb_reset_mac_header(skb);
2796 __skb_pull(skb, skb_network_offset(skb));
2797 skb->pkt_type = PACKET_LOOPBACK;
2798 skb->ip_summed = CHECKSUM_UNNECESSARY;
2799 WARN_ON(!skb_dst(skb));
2800 skb_dst_force(skb);
2801 netif_rx_ni(skb);
2802 return 0;
2804 EXPORT_SYMBOL(dev_loopback_xmit);
2807 * dev_queue_xmit - transmit a buffer
2808 * @skb: buffer to transmit
2810 * Queue a buffer for transmission to a network device. The caller must
2811 * have set the device and priority and built the buffer before calling
2812 * this function. The function can be called from an interrupt.
2814 * A negative errno code is returned on a failure. A success does not
2815 * guarantee the frame will be transmitted as it may be dropped due
2816 * to congestion or traffic shaping.
2818 * -----------------------------------------------------------------------------------
2819 * I notice this method can also return errors from the queue disciplines,
2820 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2821 * be positive.
2823 * Regardless of the return value, the skb is consumed, so it is currently
2824 * difficult to retry a send to this method. (You can bump the ref count
2825 * before sending to hold a reference for retry if you are careful.)
2827 * When calling this method, interrupts MUST be enabled. This is because
2828 * the BH enable code must have IRQs enabled so that it will not deadlock.
2829 * --BLG
2831 int dev_queue_xmit(struct sk_buff *skb)
2833 struct net_device *dev = skb->dev;
2834 struct netdev_queue *txq;
2835 struct Qdisc *q;
2836 int rc = -ENOMEM;
2838 skb_reset_mac_header(skb);
2840 /* Disable soft irqs for various locks below. Also
2841 * stops preemption for RCU.
2843 rcu_read_lock_bh();
2845 skb_update_prio(skb);
2847 txq = netdev_pick_tx(dev, skb);
2848 q = rcu_dereference_bh(txq->qdisc);
2850 #ifdef CONFIG_NET_CLS_ACT
2851 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2852 #endif
2853 trace_net_dev_queue(skb);
2854 if (q->enqueue) {
2855 rc = __dev_xmit_skb(skb, q, dev, txq);
2856 goto out;
2859 /* The device has no queue. Common case for software devices:
2860 loopback, all the sorts of tunnels...
2862 Really, it is unlikely that netif_tx_lock protection is necessary
2863 here. (f.e. loopback and IP tunnels are clean ignoring statistics
2864 counters.)
2865 However, it is possible that they rely on the protection
2866 made by us here.
2868 Check this and take the lock. It is not prone to deadlocks.
2869 Or take the noqueue qdisc path, it is even simpler 8)
2871 if (dev->flags & IFF_UP) {
2872 int cpu = smp_processor_id(); /* ok because BHs are off */
2874 if (txq->xmit_lock_owner != cpu) {
2876 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2877 goto recursion_alert;
2879 HARD_TX_LOCK(dev, txq, cpu);
2881 if (!netif_xmit_stopped(txq)) {
2882 __this_cpu_inc(xmit_recursion);
2883 rc = dev_hard_start_xmit(skb, dev, txq);
2884 __this_cpu_dec(xmit_recursion);
2885 if (dev_xmit_complete(rc)) {
2886 HARD_TX_UNLOCK(dev, txq);
2887 goto out;
2890 HARD_TX_UNLOCK(dev, txq);
2891 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2892 dev->name);
2893 } else {
2894 /* Recursion is detected! It is possible,
2895 * unfortunately
2897 recursion_alert:
2898 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2899 dev->name);
2903 rc = -ENETDOWN;
2904 rcu_read_unlock_bh();
2906 kfree_skb(skb);
2907 return rc;
2908 out:
2909 rcu_read_unlock_bh();
2910 return rc;
2912 EXPORT_SYMBOL(dev_queue_xmit);
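/*
 * Illustrative sketch (not part of this file): a caller that builds its own
 * frame hands it to dev_queue_xmit() with dev, priority and protocol already
 * set, and must not touch the skb afterwards since it is consumed either way.
 * The payload construction is assumed to have happened earlier, and as noted
 * above the error codes may be positive (NET_XMIT_*).
 *
 *	skb->dev = dev;
 *	skb->priority = TC_PRIO_CONTROL;
 *	skb->protocol = htons(ETH_P_802_2);
 *	if (dev_queue_xmit(skb) != NET_XMIT_SUCCESS)
 *		pr_debug("frame dropped or queued under congestion\n");
 */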
2915 /*=======================================================================
2916 Receiver routines
2917 =======================================================================*/
2919 int netdev_max_backlog __read_mostly = 1000;
2920 EXPORT_SYMBOL(netdev_max_backlog);
2922 int netdev_tstamp_prequeue __read_mostly = 1;
2923 int netdev_budget __read_mostly = 300;
2924 int weight_p __read_mostly = 64; /* old backlog weight */
2926 /* Called with irq disabled */
2927 static inline void ____napi_schedule(struct softnet_data *sd,
2928 struct napi_struct *napi)
2930 list_add_tail(&napi->poll_list, &sd->poll_list);
2931 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2934 #ifdef CONFIG_RPS
2936 /* One global table that all flow-based protocols share. */
2937 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2938 EXPORT_SYMBOL(rps_sock_flow_table);
2940 struct static_key rps_needed __read_mostly;
2942 static struct rps_dev_flow *
2943 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2944 struct rps_dev_flow *rflow, u16 next_cpu)
2946 if (next_cpu != RPS_NO_CPU) {
2947 #ifdef CONFIG_RFS_ACCEL
2948 struct netdev_rx_queue *rxqueue;
2949 struct rps_dev_flow_table *flow_table;
2950 struct rps_dev_flow *old_rflow;
2951 u32 flow_id;
2952 u16 rxq_index;
2953 int rc;
2955 /* Should we steer this flow to a different hardware queue? */
2956 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2957 !(dev->features & NETIF_F_NTUPLE))
2958 goto out;
2959 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2960 if (rxq_index == skb_get_rx_queue(skb))
2961 goto out;
2963 rxqueue = dev->_rx + rxq_index;
2964 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2965 if (!flow_table)
2966 goto out;
2967 flow_id = skb->rxhash & flow_table->mask;
2968 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2969 rxq_index, flow_id);
2970 if (rc < 0)
2971 goto out;
2972 old_rflow = rflow;
2973 rflow = &flow_table->flows[flow_id];
2974 rflow->filter = rc;
2975 if (old_rflow->filter == rflow->filter)
2976 old_rflow->filter = RPS_NO_FILTER;
2977 out:
2978 #endif
2979 rflow->last_qtail =
2980 per_cpu(softnet_data, next_cpu).input_queue_head;
2983 rflow->cpu = next_cpu;
2984 return rflow;
2988 * get_rps_cpu is called from netif_receive_skb and returns the target
2989 * CPU from the RPS map of the receiving queue for a given skb.
2990 * rcu_read_lock must be held on entry.
2992 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2993 struct rps_dev_flow **rflowp)
2995 struct netdev_rx_queue *rxqueue;
2996 struct rps_map *map;
2997 struct rps_dev_flow_table *flow_table;
2998 struct rps_sock_flow_table *sock_flow_table;
2999 int cpu = -1;
3000 u16 tcpu;
3002 if (skb_rx_queue_recorded(skb)) {
3003 u16 index = skb_get_rx_queue(skb);
3004 if (unlikely(index >= dev->real_num_rx_queues)) {
3005 WARN_ONCE(dev->real_num_rx_queues > 1,
3006 "%s received packet on queue %u, but number "
3007 "of RX queues is %u\n",
3008 dev->name, index, dev->real_num_rx_queues);
3009 goto done;
3011 rxqueue = dev->_rx + index;
3012 } else
3013 rxqueue = dev->_rx;
3015 map = rcu_dereference(rxqueue->rps_map);
3016 if (map) {
3017 if (map->len == 1 &&
3018 !rcu_access_pointer(rxqueue->rps_flow_table)) {
3019 tcpu = map->cpus[0];
3020 if (cpu_online(tcpu))
3021 cpu = tcpu;
3022 goto done;
3024 } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
3025 goto done;
3028 skb_reset_network_header(skb);
3029 if (!skb_get_rxhash(skb))
3030 goto done;
3032 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3033 sock_flow_table = rcu_dereference(rps_sock_flow_table);
3034 if (flow_table && sock_flow_table) {
3035 u16 next_cpu;
3036 struct rps_dev_flow *rflow;
3038 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
3039 tcpu = rflow->cpu;
3041 next_cpu = sock_flow_table->ents[skb->rxhash &
3042 sock_flow_table->mask];
3045 * If the desired CPU (where last recvmsg was done) is
3046 * different from current CPU (one in the rx-queue flow
3047 * table entry), switch if one of the following holds:
3048 * - Current CPU is unset (equal to RPS_NO_CPU).
3049 * - Current CPU is offline.
3050 * - The current CPU's queue tail has advanced beyond the
3051 * last packet that was enqueued using this table entry.
3052 * This guarantees that all previous packets for the flow
3053 * have been dequeued, thus preserving in order delivery.
3055 if (unlikely(tcpu != next_cpu) &&
3056 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3057 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3058 rflow->last_qtail)) >= 0)) {
3059 tcpu = next_cpu;
3060 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3063 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3064 *rflowp = rflow;
3065 cpu = tcpu;
3066 goto done;
3070 if (map) {
3071 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
3073 if (cpu_online(tcpu)) {
3074 cpu = tcpu;
3075 goto done;
3079 done:
3080 return cpu;
3083 #ifdef CONFIG_RFS_ACCEL
3086 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3087 * @dev: Device on which the filter was set
3088 * @rxq_index: RX queue index
3089 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3090 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3092 * Drivers that implement ndo_rx_flow_steer() should periodically call
3093 * this function for each installed filter and remove the filters for
3094 * which it returns %true.
3096 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3097 u32 flow_id, u16 filter_id)
3099 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3100 struct rps_dev_flow_table *flow_table;
3101 struct rps_dev_flow *rflow;
3102 bool expire = true;
3103 int cpu;
3105 rcu_read_lock();
3106 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3107 if (flow_table && flow_id <= flow_table->mask) {
3108 rflow = &flow_table->flows[flow_id];
3109 cpu = ACCESS_ONCE(rflow->cpu);
3110 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3111 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3112 rflow->last_qtail) <
3113 (int)(10 * flow_table->mask)))
3114 expire = false;
3116 rcu_read_unlock();
3117 return expire;
3119 EXPORT_SYMBOL(rps_may_expire_flow);
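/*
 * Illustrative sketch (not part of this file): a driver implementing
 * ndo_rx_flow_steer() would periodically walk its installed filters and drop
 * the stale ones; "priv", "f", "tmp" and "my_remove_filter()" are
 * hypothetical driver-side bookkeeping.
 *
 *	list_for_each_entry_safe(f, tmp, &priv->rfs_filters, list) {
 *		if (rps_may_expire_flow(priv->netdev, f->rxq_index,
 *					f->flow_id, f->filter_id))
 *			my_remove_filter(priv, f);
 *	}
 */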
3121 #endif /* CONFIG_RFS_ACCEL */
3123 /* Called from hardirq (IPI) context */
3124 static void rps_trigger_softirq(void *data)
3126 struct softnet_data *sd = data;
3128 ____napi_schedule(sd, &sd->backlog);
3129 sd->received_rps++;
3132 #endif /* CONFIG_RPS */
3135 * Check if this softnet_data structure belongs to another cpu.
3136 * If yes, queue it to our IPI list and return 1;
3137 * if no, return 0.
3139 static int rps_ipi_queued(struct softnet_data *sd)
3141 #ifdef CONFIG_RPS
3142 struct softnet_data *mysd = &__get_cpu_var(softnet_data);
3144 if (sd != mysd) {
3145 sd->rps_ipi_next = mysd->rps_ipi_list;
3146 mysd->rps_ipi_list = sd;
3148 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3149 return 1;
3151 #endif /* CONFIG_RPS */
3152 return 0;
3156 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3157 * queue (may be a remote CPU queue).
3159 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3160 unsigned int *qtail)
3162 struct softnet_data *sd;
3163 unsigned long flags;
3165 sd = &per_cpu(softnet_data, cpu);
3167 local_irq_save(flags);
3169 rps_lock(sd);
3170 if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
3171 if (skb_queue_len(&sd->input_pkt_queue)) {
3172 enqueue:
3173 __skb_queue_tail(&sd->input_pkt_queue, skb);
3174 input_queue_tail_incr_save(sd, qtail);
3175 rps_unlock(sd);
3176 local_irq_restore(flags);
3177 return NET_RX_SUCCESS;
3180 /* Schedule NAPI for the backlog device.
3181 * We can use a non-atomic operation since we own the queue lock.
3183 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3184 if (!rps_ipi_queued(sd))
3185 ____napi_schedule(sd, &sd->backlog);
3187 goto enqueue;
3190 sd->dropped++;
3191 rps_unlock(sd);
3193 local_irq_restore(flags);
3195 atomic_long_inc(&skb->dev->rx_dropped);
3196 kfree_skb(skb);
3197 return NET_RX_DROP;
3201 * netif_rx - post buffer to the network code
3202 * @skb: buffer to post
3204 * This function receives a packet from a device driver and queues it for
3205 * the upper (protocol) levels to process. It always succeeds. The buffer
3206 * may be dropped during processing for congestion control or by the
3207 * protocol layers.
3209 * return values:
3210 * NET_RX_SUCCESS (no congestion)
3211 * NET_RX_DROP (packet was dropped)
3215 int netif_rx(struct sk_buff *skb)
3217 int ret;
3219 /* if netpoll wants it, pretend we never saw it */
3220 if (netpoll_rx(skb))
3221 return NET_RX_DROP;
3223 net_timestamp_check(netdev_tstamp_prequeue, skb);
3225 trace_netif_rx(skb);
3226 #ifdef CONFIG_RPS
3227 if (static_key_false(&rps_needed)) {
3228 struct rps_dev_flow voidflow, *rflow = &voidflow;
3229 int cpu;
3231 preempt_disable();
3232 rcu_read_lock();
3234 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3235 if (cpu < 0)
3236 cpu = smp_processor_id();
3238 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3240 rcu_read_unlock();
3241 preempt_enable();
3242 } else
3243 #endif
3245 unsigned int qtail;
3246 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3247 put_cpu();
3249 return ret;
3251 EXPORT_SYMBOL(netif_rx);
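/*
 * Illustrative sketch (not part of this file): a non-NAPI driver's receive
 * interrupt builds an skb and posts it with netif_rx(); "my_copy_frame()"
 * and the way "len" is obtained are hypothetical.
 *
 *	skb = netdev_alloc_skb_ip_align(dev, len);
 *	if (!skb) {
 *		dev->stats.rx_dropped++;
 *		return;
 *	}
 *	my_copy_frame(skb_put(skb, len), len);
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);
 */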
3253 int netif_rx_ni(struct sk_buff *skb)
3255 int err;
3257 preempt_disable();
3258 err = netif_rx(skb);
3259 if (local_softirq_pending())
3260 do_softirq();
3261 preempt_enable();
3263 return err;
3265 EXPORT_SYMBOL(netif_rx_ni);
3267 static void net_tx_action(struct softirq_action *h)
3269 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3271 if (sd->completion_queue) {
3272 struct sk_buff *clist;
3274 local_irq_disable();
3275 clist = sd->completion_queue;
3276 sd->completion_queue = NULL;
3277 local_irq_enable();
3279 while (clist) {
3280 struct sk_buff *skb = clist;
3281 clist = clist->next;
3283 WARN_ON(atomic_read(&skb->users));
3284 trace_kfree_skb(skb, net_tx_action);
3285 __kfree_skb(skb);
3289 if (sd->output_queue) {
3290 struct Qdisc *head;
3292 local_irq_disable();
3293 head = sd->output_queue;
3294 sd->output_queue = NULL;
3295 sd->output_queue_tailp = &sd->output_queue;
3296 local_irq_enable();
3298 while (head) {
3299 struct Qdisc *q = head;
3300 spinlock_t *root_lock;
3302 head = head->next_sched;
3304 root_lock = qdisc_lock(q);
3305 if (spin_trylock(root_lock)) {
3306 smp_mb__before_clear_bit();
3307 clear_bit(__QDISC_STATE_SCHED,
3308 &q->state);
3309 qdisc_run(q);
3310 spin_unlock(root_lock);
3311 } else {
3312 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3313 &q->state)) {
3314 __netif_reschedule(q);
3315 } else {
3316 smp_mb__before_clear_bit();
3317 clear_bit(__QDISC_STATE_SCHED,
3318 &q->state);
3325 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3326 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3327 /* This hook is defined here for ATM LANE */
3328 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3329 unsigned char *addr) __read_mostly;
3330 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3331 #endif
3333 #ifdef CONFIG_NET_CLS_ACT
3334 /* TODO: Maybe we should just force sch_ingress to be compiled in
3335 * when CONFIG_NET_CLS_ACT is? Otherwise we pay for some useless
3336 * instructions (a compare and two extra stores) when we don't have
3337 * it enabled but do have CONFIG_NET_CLS_ACT.
3338 * NOTE: This doesn't remove any functionality; if you don't have
3339 * the ingress scheduler, you just can't add policies on ingress.
3342 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3344 struct net_device *dev = skb->dev;
3345 u32 ttl = G_TC_RTTL(skb->tc_verd);
3346 int result = TC_ACT_OK;
3347 struct Qdisc *q;
3349 if (unlikely(MAX_RED_LOOP < ttl++)) {
3350 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3351 skb->skb_iif, dev->ifindex);
3352 return TC_ACT_SHOT;
3355 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3356 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3358 q = rxq->qdisc;
3359 if (q != &noop_qdisc) {
3360 spin_lock(qdisc_lock(q));
3361 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3362 result = qdisc_enqueue_root(skb, q);
3363 spin_unlock(qdisc_lock(q));
3366 return result;
3369 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3370 struct packet_type **pt_prev,
3371 int *ret, struct net_device *orig_dev)
3373 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3375 if (!rxq || rxq->qdisc == &noop_qdisc)
3376 goto out;
3378 if (*pt_prev) {
3379 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3380 *pt_prev = NULL;
3383 switch (ing_filter(skb, rxq)) {
3384 case TC_ACT_SHOT:
3385 case TC_ACT_STOLEN:
3386 kfree_skb(skb);
3387 return NULL;
3390 out:
3391 skb->tc_verd = 0;
3392 return skb;
3394 #endif
3397 * netdev_rx_handler_register - register receive handler
3398 * @dev: device to register a handler for
3399 * @rx_handler: receive handler to register
3400 * @rx_handler_data: data pointer that is used by rx handler
3402 * Register a receive handler for a device. This handler will then be
3403 * called from __netif_receive_skb. A negative errno code is returned
3404 * on a failure.
3406 * The caller must hold the rtnl_mutex.
3408 * For a general description of rx_handler, see enum rx_handler_result.
3410 int netdev_rx_handler_register(struct net_device *dev,
3411 rx_handler_func_t *rx_handler,
3412 void *rx_handler_data)
3414 ASSERT_RTNL();
3416 if (dev->rx_handler)
3417 return -EBUSY;
3419 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3420 rcu_assign_pointer(dev->rx_handler, rx_handler);
3422 return 0;
3424 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
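/*
 * Illustrative sketch (not part of this file): an upper device such as a
 * bridge or bonding master claims a slave's receive path like this, under
 * RTNL; "my_handle_frame" and "my_port" are hypothetical.
 *
 *	int err;
 *
 *	err = netdev_rx_handler_register(slave_dev, my_handle_frame, my_port);
 *	if (err)
 *		return err;
 *
 * and releases it again with netdev_rx_handler_unregister(slave_dev) before
 * freeing my_port.
 */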
3427 * netdev_rx_handler_unregister - unregister receive handler
3428 * @dev: device to unregister a handler from
3430 * Unregister a receive handler from a device.
3432 * The caller must hold the rtnl_mutex.
3434 void netdev_rx_handler_unregister(struct net_device *dev)
3437 ASSERT_RTNL();
3438 RCU_INIT_POINTER(dev->rx_handler, NULL);
3439 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3441 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3444 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3445 * the special handling of PFMEMALLOC skbs.
3447 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3449 switch (skb->protocol) {
3450 case __constant_htons(ETH_P_ARP):
3451 case __constant_htons(ETH_P_IP):
3452 case __constant_htons(ETH_P_IPV6):
3453 case __constant_htons(ETH_P_8021Q):
3454 return true;
3455 default:
3456 return false;
3460 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3462 struct packet_type *ptype, *pt_prev;
3463 rx_handler_func_t *rx_handler;
3464 struct net_device *orig_dev;
3465 struct net_device *null_or_dev;
3466 bool deliver_exact = false;
3467 int ret = NET_RX_DROP;
3468 __be16 type;
3470 net_timestamp_check(!netdev_tstamp_prequeue, skb);
3472 trace_netif_receive_skb(skb);
3474 /* if we've gotten here through NAPI, check netpoll */
3475 if (netpoll_receive_skb(skb))
3476 goto out;
3478 orig_dev = skb->dev;
3480 skb_reset_network_header(skb);
3481 if (!skb_transport_header_was_set(skb))
3482 skb_reset_transport_header(skb);
3483 skb_reset_mac_len(skb);
3485 pt_prev = NULL;
3487 rcu_read_lock();
3489 another_round:
3490 skb->skb_iif = skb->dev->ifindex;
3492 __this_cpu_inc(softnet_data.processed);
3494 if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3495 skb = vlan_untag(skb);
3496 if (unlikely(!skb))
3497 goto unlock;
3500 #ifdef CONFIG_NET_CLS_ACT
3501 if (skb->tc_verd & TC_NCLS) {
3502 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3503 goto ncls;
3505 #endif
3507 if (pfmemalloc)
3508 goto skip_taps;
3510 list_for_each_entry_rcu(ptype, &ptype_all, list) {
3511 if (!ptype->dev || ptype->dev == skb->dev) {
3512 if (pt_prev)
3513 ret = deliver_skb(skb, pt_prev, orig_dev);
3514 pt_prev = ptype;
3518 skip_taps:
3519 #ifdef CONFIG_NET_CLS_ACT
3520 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3521 if (!skb)
3522 goto unlock;
3523 ncls:
3524 #endif
3526 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3527 goto drop;
3529 if (vlan_tx_tag_present(skb)) {
3530 if (pt_prev) {
3531 ret = deliver_skb(skb, pt_prev, orig_dev);
3532 pt_prev = NULL;
3534 if (vlan_do_receive(&skb))
3535 goto another_round;
3536 else if (unlikely(!skb))
3537 goto unlock;
3540 rx_handler = rcu_dereference(skb->dev->rx_handler);
3541 if (rx_handler) {
3542 if (pt_prev) {
3543 ret = deliver_skb(skb, pt_prev, orig_dev);
3544 pt_prev = NULL;
3546 switch (rx_handler(&skb)) {
3547 case RX_HANDLER_CONSUMED:
3548 goto unlock;
3549 case RX_HANDLER_ANOTHER:
3550 goto another_round;
3551 case RX_HANDLER_EXACT:
3552 deliver_exact = true;
3553 case RX_HANDLER_PASS:
3554 break;
3555 default:
3556 BUG();
3560 if (vlan_tx_nonzero_tag_present(skb))
3561 skb->pkt_type = PACKET_OTHERHOST;
3563 /* deliver only exact match when indicated */
3564 null_or_dev = deliver_exact ? skb->dev : NULL;
3566 type = skb->protocol;
3567 list_for_each_entry_rcu(ptype,
3568 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3569 if (ptype->type == type &&
3570 (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3571 ptype->dev == orig_dev)) {
3572 if (pt_prev)
3573 ret = deliver_skb(skb, pt_prev, orig_dev);
3574 pt_prev = ptype;
3578 if (pt_prev) {
3579 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3580 goto drop;
3581 else
3582 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3583 } else {
3584 drop:
3585 atomic_long_inc(&skb->dev->rx_dropped);
3586 kfree_skb(skb);
3587 /* Jamal, now you will not be able to escape explaining
3588 * to me how you were going to use this. :-)
3590 ret = NET_RX_DROP;
3593 unlock:
3594 rcu_read_unlock();
3595 out:
3596 return ret;
3599 static int __netif_receive_skb(struct sk_buff *skb)
3601 int ret;
3603 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3604 unsigned long pflags = current->flags;
3607 * PFMEMALLOC skbs are special, they should
3608 * - be delivered to SOCK_MEMALLOC sockets only
3609 * - stay away from userspace
3610 * - have bounded memory usage
3612 * Use PF_MEMALLOC as this saves us from propagating the allocation
3613 * context down to all allocation sites.
3615 current->flags |= PF_MEMALLOC;
3616 ret = __netif_receive_skb_core(skb, true);
3617 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3618 } else
3619 ret = __netif_receive_skb_core(skb, false);
3621 return ret;
3625 * netif_receive_skb - process receive buffer from network
3626 * @skb: buffer to process
3628 * netif_receive_skb() is the main receive data processing function.
3629 * It always succeeds. The buffer may be dropped during processing
3630 * for congestion control or by the protocol layers.
3632 * This function may only be called from softirq context and interrupts
3633 * should be enabled.
3635 * Return values (usually ignored):
3636 * NET_RX_SUCCESS: no congestion
3637 * NET_RX_DROP: packet was dropped
3639 int netif_receive_skb(struct sk_buff *skb)
3641 net_timestamp_check(netdev_tstamp_prequeue, skb);
3643 if (skb_defer_rx_timestamp(skb))
3644 return NET_RX_SUCCESS;
3646 #ifdef CONFIG_RPS
3647 if (static_key_false(&rps_needed)) {
3648 struct rps_dev_flow voidflow, *rflow = &voidflow;
3649 int cpu, ret;
3651 rcu_read_lock();
3653 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3655 if (cpu >= 0) {
3656 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3657 rcu_read_unlock();
3658 return ret;
3660 rcu_read_unlock();
3662 #endif
3663 return __netif_receive_skb(skb);
3665 EXPORT_SYMBOL(netif_receive_skb);
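/*
 * Illustrative sketch (not part of this file): a NAPI driver delivers fully
 * built skbs from its poll routine with netif_receive_skb() (or, for better
 * aggregation, napi_gro_receive() below); "budget_left", "ring" and
 * "my_get_next_rx_skb()" are hypothetical.
 *
 *	while (budget_left-- && (skb = my_get_next_rx_skb(ring))) {
 *		skb->protocol = eth_type_trans(skb, dev);
 *		netif_receive_skb(skb);
 *	}
 */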
3667 /* Network device is going away, flush any packets still pending
3668 * Called with irqs disabled.
3670 static void flush_backlog(void *arg)
3672 struct net_device *dev = arg;
3673 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3674 struct sk_buff *skb, *tmp;
3676 rps_lock(sd);
3677 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3678 if (skb->dev == dev) {
3679 __skb_unlink(skb, &sd->input_pkt_queue);
3680 kfree_skb(skb);
3681 input_queue_head_incr(sd);
3684 rps_unlock(sd);
3686 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3687 if (skb->dev == dev) {
3688 __skb_unlink(skb, &sd->process_queue);
3689 kfree_skb(skb);
3690 input_queue_head_incr(sd);
3695 static int napi_gro_complete(struct sk_buff *skb)
3697 struct packet_offload *ptype;
3698 __be16 type = skb->protocol;
3699 struct list_head *head = &offload_base;
3700 int err = -ENOENT;
3702 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3704 if (NAPI_GRO_CB(skb)->count == 1) {
3705 skb_shinfo(skb)->gso_size = 0;
3706 goto out;
3709 rcu_read_lock();
3710 list_for_each_entry_rcu(ptype, head, list) {
3711 if (ptype->type != type || !ptype->callbacks.gro_complete)
3712 continue;
3714 err = ptype->callbacks.gro_complete(skb);
3715 break;
3717 rcu_read_unlock();
3719 if (err) {
3720 WARN_ON(&ptype->list == head);
3721 kfree_skb(skb);
3722 return NET_RX_SUCCESS;
3725 out:
3726 return netif_receive_skb(skb);
3729 /* napi->gro_list contains packets ordered by age;
3730 * the youngest packets are at its head.
3731 * Complete skbs in reverse order to reduce latencies.
3733 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3735 struct sk_buff *skb, *prev = NULL;
3737 /* scan list and build reverse chain */
3738 for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3739 skb->prev = prev;
3740 prev = skb;
3743 for (skb = prev; skb; skb = prev) {
3744 skb->next = NULL;
3746 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3747 return;
3749 prev = skb->prev;
3750 napi_gro_complete(skb);
3751 napi->gro_count--;
3754 napi->gro_list = NULL;
3756 EXPORT_SYMBOL(napi_gro_flush);
3758 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3760 struct sk_buff *p;
3761 unsigned int maclen = skb->dev->hard_header_len;
3763 for (p = napi->gro_list; p; p = p->next) {
3764 unsigned long diffs;
3766 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3767 diffs |= p->vlan_tci ^ skb->vlan_tci;
3768 if (maclen == ETH_HLEN)
3769 diffs |= compare_ether_header(skb_mac_header(p),
3770 skb_gro_mac_header(skb));
3771 else if (!diffs)
3772 diffs = memcmp(skb_mac_header(p),
3773 skb_gro_mac_header(skb),
3774 maclen);
3775 NAPI_GRO_CB(p)->same_flow = !diffs;
3776 NAPI_GRO_CB(p)->flush = 0;
3780 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3782 struct sk_buff **pp = NULL;
3783 struct packet_offload *ptype;
3784 __be16 type = skb->protocol;
3785 struct list_head *head = &offload_base;
3786 int same_flow;
3787 int mac_len;
3788 enum gro_result ret;
3790 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3791 goto normal;
3793 if (skb_is_gso(skb) || skb_has_frag_list(skb))
3794 goto normal;
3796 gro_list_prepare(napi, skb);
3798 rcu_read_lock();
3799 list_for_each_entry_rcu(ptype, head, list) {
3800 if (ptype->type != type || !ptype->callbacks.gro_receive)
3801 continue;
3803 skb_set_network_header(skb, skb_gro_offset(skb));
3804 mac_len = skb->network_header - skb->mac_header;
3805 skb->mac_len = mac_len;
3806 NAPI_GRO_CB(skb)->same_flow = 0;
3807 NAPI_GRO_CB(skb)->flush = 0;
3808 NAPI_GRO_CB(skb)->free = 0;
3810 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
3811 break;
3813 rcu_read_unlock();
3815 if (&ptype->list == head)
3816 goto normal;
3818 same_flow = NAPI_GRO_CB(skb)->same_flow;
3819 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3821 if (pp) {
3822 struct sk_buff *nskb = *pp;
3824 *pp = nskb->next;
3825 nskb->next = NULL;
3826 napi_gro_complete(nskb);
3827 napi->gro_count--;
3830 if (same_flow)
3831 goto ok;
3833 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3834 goto normal;
3836 napi->gro_count++;
3837 NAPI_GRO_CB(skb)->count = 1;
3838 NAPI_GRO_CB(skb)->age = jiffies;
3839 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3840 skb->next = napi->gro_list;
3841 napi->gro_list = skb;
3842 ret = GRO_HELD;
3844 pull:
3845 if (skb_headlen(skb) < skb_gro_offset(skb)) {
3846 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3848 BUG_ON(skb->end - skb->tail < grow);
3850 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3852 skb->tail += grow;
3853 skb->data_len -= grow;
3855 skb_shinfo(skb)->frags[0].page_offset += grow;
3856 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3858 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3859 skb_frag_unref(skb, 0);
3860 memmove(skb_shinfo(skb)->frags,
3861 skb_shinfo(skb)->frags + 1,
3862 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3867 return ret;
3869 normal:
3870 ret = GRO_NORMAL;
3871 goto pull;
3875 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3877 switch (ret) {
3878 case GRO_NORMAL:
3879 if (netif_receive_skb(skb))
3880 ret = GRO_DROP;
3881 break;
3883 case GRO_DROP:
3884 kfree_skb(skb);
3885 break;
3887 case GRO_MERGED_FREE:
3888 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3889 kmem_cache_free(skbuff_head_cache, skb);
3890 else
3891 __kfree_skb(skb);
3892 break;
3894 case GRO_HELD:
3895 case GRO_MERGED:
3896 break;
3899 return ret;
3902 static void skb_gro_reset_offset(struct sk_buff *skb)
3904 const struct skb_shared_info *pinfo = skb_shinfo(skb);
3905 const skb_frag_t *frag0 = &pinfo->frags[0];
3907 NAPI_GRO_CB(skb)->data_offset = 0;
3908 NAPI_GRO_CB(skb)->frag0 = NULL;
3909 NAPI_GRO_CB(skb)->frag0_len = 0;
3911 if (skb->mac_header == skb->tail &&
3912 pinfo->nr_frags &&
3913 !PageHighMem(skb_frag_page(frag0))) {
3914 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3915 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3919 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3921 skb_gro_reset_offset(skb);
3923 return napi_skb_finish(dev_gro_receive(napi, skb), skb);
3925 EXPORT_SYMBOL(napi_gro_receive);
3927 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3929 __skb_pull(skb, skb_headlen(skb));
3930 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3931 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3932 skb->vlan_tci = 0;
3933 skb->dev = napi->dev;
3934 skb->skb_iif = 0;
3936 napi->skb = skb;
3939 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3941 struct sk_buff *skb = napi->skb;
3943 if (!skb) {
3944 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3945 if (skb)
3946 napi->skb = skb;
3948 return skb;
3950 EXPORT_SYMBOL(napi_get_frags);
3952 static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3953 gro_result_t ret)
3955 switch (ret) {
3956 case GRO_NORMAL:
3957 case GRO_HELD:
3958 skb->protocol = eth_type_trans(skb, skb->dev);
3960 if (ret == GRO_HELD)
3961 skb_gro_pull(skb, -ETH_HLEN);
3962 else if (netif_receive_skb(skb))
3963 ret = GRO_DROP;
3964 break;
3966 case GRO_DROP:
3967 case GRO_MERGED_FREE:
3968 napi_reuse_skb(napi, skb);
3969 break;
3971 case GRO_MERGED:
3972 break;
3975 return ret;
3978 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3980 struct sk_buff *skb = napi->skb;
3981 struct ethhdr *eth;
3982 unsigned int hlen;
3983 unsigned int off;
3985 napi->skb = NULL;
3987 skb_reset_mac_header(skb);
3988 skb_gro_reset_offset(skb);
3990 off = skb_gro_offset(skb);
3991 hlen = off + sizeof(*eth);
3992 eth = skb_gro_header_fast(skb, off);
3993 if (skb_gro_header_hard(skb, hlen)) {
3994 eth = skb_gro_header_slow(skb, hlen, off);
3995 if (unlikely(!eth)) {
3996 napi_reuse_skb(napi, skb);
3997 skb = NULL;
3998 goto out;
4002 skb_gro_pull(skb, sizeof(*eth));
4005 * This works because the only protocols we care about don't require
4006 * special handling. We'll fix it up properly at the end.
4008 skb->protocol = eth->h_proto;
4010 out:
4011 return skb;
4014 gro_result_t napi_gro_frags(struct napi_struct *napi)
4016 struct sk_buff *skb = napi_frags_skb(napi);
4018 if (!skb)
4019 return GRO_DROP;
4021 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4023 EXPORT_SYMBOL(napi_gro_frags);
4026 * net_rps_action sends any pending IPIs for RPS.
4027 * Note: called with local irq disabled, but exits with local irq enabled.
4029 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4031 #ifdef CONFIG_RPS
4032 struct softnet_data *remsd = sd->rps_ipi_list;
4034 if (remsd) {
4035 sd->rps_ipi_list = NULL;
4037 local_irq_enable();
4039 /* Send pending IPI's to kick RPS processing on remote cpus. */
4040 while (remsd) {
4041 struct softnet_data *next = remsd->rps_ipi_next;
4043 if (cpu_online(remsd->cpu))
4044 __smp_call_function_single(remsd->cpu,
4045 &remsd->csd, 0);
4046 remsd = next;
4048 } else
4049 #endif
4050 local_irq_enable();
4053 static int process_backlog(struct napi_struct *napi, int quota)
4055 int work = 0;
4056 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4058 #ifdef CONFIG_RPS
4059 /* Check if we have pending IPIs; it's better to send them now
4060 * rather than waiting for net_rx_action() to end.
4062 if (sd->rps_ipi_list) {
4063 local_irq_disable();
4064 net_rps_action_and_irq_enable(sd);
4066 #endif
4067 napi->weight = weight_p;
4068 local_irq_disable();
4069 while (work < quota) {
4070 struct sk_buff *skb;
4071 unsigned int qlen;
4073 while ((skb = __skb_dequeue(&sd->process_queue))) {
4074 local_irq_enable();
4075 __netif_receive_skb(skb);
4076 local_irq_disable();
4077 input_queue_head_incr(sd);
4078 if (++work >= quota) {
4079 local_irq_enable();
4080 return work;
4084 rps_lock(sd);
4085 qlen = skb_queue_len(&sd->input_pkt_queue);
4086 if (qlen)
4087 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4088 &sd->process_queue);
4090 if (qlen < quota - work) {
4092 * Inline a custom version of __napi_complete().
4093 * Only the current cpu owns and manipulates this napi,
4094 * and NAPI_STATE_SCHED is the only possible flag set on backlog,
4095 * so we can use a plain write instead of clear_bit()
4096 * and we don't need an smp_mb() memory barrier.
4098 list_del(&napi->poll_list);
4099 napi->state = 0;
4101 quota = work + qlen;
4103 rps_unlock(sd);
4105 local_irq_enable();
4107 return work;
4111 * __napi_schedule - schedule for receive
4112 * @n: entry to schedule
4114 * The entry's receive function will be scheduled to run
4116 void __napi_schedule(struct napi_struct *n)
4118 unsigned long flags;
4120 local_irq_save(flags);
4121 ____napi_schedule(&__get_cpu_var(softnet_data), n);
4122 local_irq_restore(flags);
4124 EXPORT_SYMBOL(__napi_schedule);
4126 void __napi_complete(struct napi_struct *n)
4128 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4129 BUG_ON(n->gro_list);
4131 list_del(&n->poll_list);
4132 smp_mb__before_clear_bit();
4133 clear_bit(NAPI_STATE_SCHED, &n->state);
4135 EXPORT_SYMBOL(__napi_complete);
4137 void napi_complete(struct napi_struct *n)
4139 unsigned long flags;
4142 * don't let napi dequeue from the cpu poll list
4143 * just in case it's running on a different cpu
4145 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4146 return;
4148 napi_gro_flush(n, false);
4149 local_irq_save(flags);
4150 __napi_complete(n);
4151 local_irq_restore(flags);
4153 EXPORT_SYMBOL(napi_complete);
4155 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4156 int (*poll)(struct napi_struct *, int), int weight)
4158 INIT_LIST_HEAD(&napi->poll_list);
4159 napi->gro_count = 0;
4160 napi->gro_list = NULL;
4161 napi->skb = NULL;
4162 napi->poll = poll;
4163 napi->weight = weight;
4164 list_add(&napi->dev_list, &dev->napi_list);
4165 napi->dev = dev;
4166 #ifdef CONFIG_NETPOLL
4167 spin_lock_init(&napi->poll_lock);
4168 napi->poll_owner = -1;
4169 #endif
4170 set_bit(NAPI_STATE_SCHED, &napi->state);
4172 EXPORT_SYMBOL(netif_napi_add);
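/*
 * Illustrative sketch (not part of this file): a driver registers its poll
 * routine at probe time and completes NAPI when the ring drains; the "my_*"
 * helpers, "priv" and the 64-packet weight are hypothetical.
 *
 *	static int my_poll(struct napi_struct *napi, int budget)
 *	{
 *		int done = my_clean_rx_ring(napi, budget);
 *
 *		if (done < budget) {
 *			napi_complete(napi);
 *			my_enable_rx_irq(napi);
 *		}
 *		return done;
 *	}
 *
 *	netif_napi_add(dev, &priv->napi, my_poll, 64);
 */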
4174 void netif_napi_del(struct napi_struct *napi)
4176 struct sk_buff *skb, *next;
4178 list_del_init(&napi->dev_list);
4179 napi_free_frags(napi);
4181 for (skb = napi->gro_list; skb; skb = next) {
4182 next = skb->next;
4183 skb->next = NULL;
4184 kfree_skb(skb);
4187 napi->gro_list = NULL;
4188 napi->gro_count = 0;
4190 EXPORT_SYMBOL(netif_napi_del);
4192 static void net_rx_action(struct softirq_action *h)
4194 struct softnet_data *sd = &__get_cpu_var(softnet_data);
4195 unsigned long time_limit = jiffies + 2;
4196 int budget = netdev_budget;
4197 void *have;
4199 local_irq_disable();
4201 while (!list_empty(&sd->poll_list)) {
4202 struct napi_struct *n;
4203 int work, weight;
4205 /* If the softirq window is exhausted then punt.
4206 * Allow this to run for 2 jiffies, which will allow
4207 * an average latency of 1.5/HZ.
4209 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
4210 goto softnet_break;
4212 local_irq_enable();
4214 /* Even though interrupts have been re-enabled, this
4215 * access is safe because interrupts can only add new
4216 * entries to the tail of this list, and only ->poll()
4217 * calls can remove this head entry from the list.
4219 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
4221 have = netpoll_poll_lock(n);
4223 weight = n->weight;
4225 /* This NAPI_STATE_SCHED test is for avoiding a race
4226 * with netpoll's poll_napi(). Only the entity which
4227 * obtains the lock and sees NAPI_STATE_SCHED set will
4228 * actually make the ->poll() call. Therefore we avoid
4229 * accidentally calling ->poll() when NAPI is not scheduled.
4231 work = 0;
4232 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4233 work = n->poll(n, weight);
4234 trace_napi_poll(n);
4237 WARN_ON_ONCE(work > weight);
4239 budget -= work;
4241 local_irq_disable();
4243 /* Drivers must not modify the NAPI state if they
4244 * consume the entire weight. In such cases this code
4245 * still "owns" the NAPI instance and therefore can
4246 * move the instance around on the list at-will.
4248 if (unlikely(work == weight)) {
4249 if (unlikely(napi_disable_pending(n))) {
4250 local_irq_enable();
4251 napi_complete(n);
4252 local_irq_disable();
4253 } else {
4254 if (n->gro_list) {
4255 /* flush too old packets
4256 * If HZ < 1000, flush all packets.
4258 local_irq_enable();
4259 napi_gro_flush(n, HZ >= 1000);
4260 local_irq_disable();
4262 list_move_tail(&n->poll_list, &sd->poll_list);
4266 netpoll_poll_unlock(have);
4268 out:
4269 net_rps_action_and_irq_enable(sd);
4271 #ifdef CONFIG_NET_DMA
4273 * There may not be any more sk_buffs coming right now, so push
4274 * any pending DMA copies to hardware
4276 dma_issue_pending_all();
4277 #endif
4279 return;
4281 softnet_break:
4282 sd->time_squeeze++;
4283 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4284 goto out;
4287 static gifconf_func_t *gifconf_list[NPROTO];
4290 * register_gifconf - register a SIOCGIF handler
4291 * @family: Address family
4292 * @gifconf: Function handler
4294 * Register protocol dependent address dumping routines. The handler
4295 * that is passed must not be freed or reused until it has been replaced
4296 * by another handler.
4298 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
4300 if (family >= NPROTO)
4301 return -EINVAL;
4302 gifconf_list[family] = gifconf;
4303 return 0;
4305 EXPORT_SYMBOL(register_gifconf);
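/*
 * Example (illustrative sketch, not part of this file): how an address family
 * hooks into SIOCGIFCONF.  In the real tree the IPv4 code registers
 * inet_gifconf() for PF_INET; the my_example_* handler below is hypothetical
 * and would displace it if actually registered.
 */
static int my_example_gifconf(struct net_device *dev, char __user *buf, int len)
{
        /*
         * Write one struct ifreq per address this protocol owns on @dev,
         * or, when @buf is NULL, return how much space would be needed.
         */
        return 0;
}

static int __init my_example_gifconf_init(void)
{
        return register_gifconf(PF_INET, my_example_gifconf);
}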
4309 * Map an interface index to its name (SIOCGIFNAME)
4313 * We need this ioctl for efficient implementation of the
4314 * if_indextoname() function required by the IPv6 API. Without
4315 * it, we would have to search all the interfaces to find a
4316 * match. --pb
4319 static int dev_ifname(struct net *net, struct ifreq __user *arg)
4321 struct net_device *dev;
4322 struct ifreq ifr;
4323 unsigned seq;
4326 * Fetch the caller's info block.
4329 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4330 return -EFAULT;
4332 retry:
4333 seq = read_seqcount_begin(&devnet_rename_seq);
4334 rcu_read_lock();
4335 dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
4336 if (!dev) {
4337 rcu_read_unlock();
4338 return -ENODEV;
4341 strcpy(ifr.ifr_name, dev->name);
4342 rcu_read_unlock();
4343 if (read_seqcount_retry(&devnet_rename_seq, seq))
4344 goto retry;
4346 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
4347 return -EFAULT;
4348 return 0;
4352 * Perform a SIOCGIFCONF call. This structure will change
4353 * size eventually, and there is nothing I can do about it.
4354 * Thus we will need a 'compatibility mode'.
4357 static int dev_ifconf(struct net *net, char __user *arg)
4359 struct ifconf ifc;
4360 struct net_device *dev;
4361 char __user *pos;
4362 int len;
4363 int total;
4364 int i;
4367 * Fetch the caller's info block.
4370 if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
4371 return -EFAULT;
4373 pos = ifc.ifc_buf;
4374 len = ifc.ifc_len;
4377 * Loop over the interfaces, and write an info block for each.
4380 total = 0;
4381 for_each_netdev(net, dev) {
4382 for (i = 0; i < NPROTO; i++) {
4383 if (gifconf_list[i]) {
4384 int done;
4385 if (!pos)
4386 done = gifconf_list[i](dev, NULL, 0);
4387 else
4388 done = gifconf_list[i](dev, pos + total,
4389 len - total);
4390 if (done < 0)
4391 return -EFAULT;
4392 total += done;
4398 * All done. Write the updated control block back to the caller.
4400 ifc.ifc_len = total;
4403 * Both BSD and Solaris return 0 here, so we do too.
4405 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4408 #ifdef CONFIG_PROC_FS
4410 #define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)
4412 #define get_bucket(x) ((x) >> BUCKET_SPACE)
4413 #define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
4414 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
4416 static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
4418 struct net *net = seq_file_net(seq);
4419 struct net_device *dev;
4420 struct hlist_node *p;
4421 struct hlist_head *h;
4422 unsigned int count = 0, offset = get_offset(*pos);
4424 h = &net->dev_name_head[get_bucket(*pos)];
4425 hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
4426 if (++count == offset)
4427 return dev;
4430 return NULL;
4433 static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
4435 struct net_device *dev;
4436 unsigned int bucket;
4438 do {
4439 dev = dev_from_same_bucket(seq, pos);
4440 if (dev)
4441 return dev;
4443 bucket = get_bucket(*pos) + 1;
4444 *pos = set_bucket_offset(bucket, 1);
4445 } while (bucket < NETDEV_HASHENTRIES);
4447 return NULL;
4451 * This is invoked by the /proc filesystem handler to display a device
4452 * in detail.
4454 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4455 __acquires(RCU)
4457 rcu_read_lock();
4458 if (!*pos)
4459 return SEQ_START_TOKEN;
4461 if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
4462 return NULL;
4464 return dev_from_bucket(seq, pos);
4467 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4469 ++*pos;
4470 return dev_from_bucket(seq, pos);
4473 void dev_seq_stop(struct seq_file *seq, void *v)
4474 __releases(RCU)
4476 rcu_read_unlock();
4479 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4481 struct rtnl_link_stats64 temp;
4482 const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4484 seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4485 "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4486 dev->name, stats->rx_bytes, stats->rx_packets,
4487 stats->rx_errors,
4488 stats->rx_dropped + stats->rx_missed_errors,
4489 stats->rx_fifo_errors,
4490 stats->rx_length_errors + stats->rx_over_errors +
4491 stats->rx_crc_errors + stats->rx_frame_errors,
4492 stats->rx_compressed, stats->multicast,
4493 stats->tx_bytes, stats->tx_packets,
4494 stats->tx_errors, stats->tx_dropped,
4495 stats->tx_fifo_errors, stats->collisions,
4496 stats->tx_carrier_errors +
4497 stats->tx_aborted_errors +
4498 stats->tx_window_errors +
4499 stats->tx_heartbeat_errors,
4500 stats->tx_compressed);
4504 * Called from the PROCfs module. This now uses the new arbitrary sized
4505 * /proc/net interface to create /proc/net/dev
4507 static int dev_seq_show(struct seq_file *seq, void *v)
4509 if (v == SEQ_START_TOKEN)
4510 seq_puts(seq, "Inter-| Receive "
4511 " | Transmit\n"
4512 " face |bytes packets errs drop fifo frame "
4513 "compressed multicast|bytes packets errs "
4514 "drop fifo colls carrier compressed\n");
4515 else
4516 dev_seq_printf_stats(seq, v);
4517 return 0;
4520 static struct softnet_data *softnet_get_online(loff_t *pos)
4522 struct softnet_data *sd = NULL;
4524 while (*pos < nr_cpu_ids)
4525 if (cpu_online(*pos)) {
4526 sd = &per_cpu(softnet_data, *pos);
4527 break;
4528 } else
4529 ++*pos;
4530 return sd;
4533 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4535 return softnet_get_online(pos);
4538 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4540 ++*pos;
4541 return softnet_get_online(pos);
4544 static void softnet_seq_stop(struct seq_file *seq, void *v)
4548 static int softnet_seq_show(struct seq_file *seq, void *v)
4550 struct softnet_data *sd = v;
4552 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4553 sd->processed, sd->dropped, sd->time_squeeze, 0,
4554 0, 0, 0, 0, /* was fastroute */
4555 sd->cpu_collision, sd->received_rps);
4556 return 0;
4559 static const struct seq_operations dev_seq_ops = {
4560 .start = dev_seq_start,
4561 .next = dev_seq_next,
4562 .stop = dev_seq_stop,
4563 .show = dev_seq_show,
4566 static int dev_seq_open(struct inode *inode, struct file *file)
4568 return seq_open_net(inode, file, &dev_seq_ops,
4569 sizeof(struct seq_net_private));
4572 static const struct file_operations dev_seq_fops = {
4573 .owner = THIS_MODULE,
4574 .open = dev_seq_open,
4575 .read = seq_read,
4576 .llseek = seq_lseek,
4577 .release = seq_release_net,
4580 static const struct seq_operations softnet_seq_ops = {
4581 .start = softnet_seq_start,
4582 .next = softnet_seq_next,
4583 .stop = softnet_seq_stop,
4584 .show = softnet_seq_show,
4587 static int softnet_seq_open(struct inode *inode, struct file *file)
4589 return seq_open(file, &softnet_seq_ops);
4592 static const struct file_operations softnet_seq_fops = {
4593 .owner = THIS_MODULE,
4594 .open = softnet_seq_open,
4595 .read = seq_read,
4596 .llseek = seq_lseek,
4597 .release = seq_release,
4600 static void *ptype_get_idx(loff_t pos)
4602 struct packet_type *pt = NULL;
4603 loff_t i = 0;
4604 int t;
4606 list_for_each_entry_rcu(pt, &ptype_all, list) {
4607 if (i == pos)
4608 return pt;
4609 ++i;
4612 for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4613 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4614 if (i == pos)
4615 return pt;
4616 ++i;
4619 return NULL;
4622 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4623 __acquires(RCU)
4625 rcu_read_lock();
4626 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4629 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4631 struct packet_type *pt;
4632 struct list_head *nxt;
4633 int hash;
4635 ++*pos;
4636 if (v == SEQ_START_TOKEN)
4637 return ptype_get_idx(0);
4639 pt = v;
4640 nxt = pt->list.next;
4641 if (pt->type == htons(ETH_P_ALL)) {
4642 if (nxt != &ptype_all)
4643 goto found;
4644 hash = 0;
4645 nxt = ptype_base[0].next;
4646 } else
4647 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4649 while (nxt == &ptype_base[hash]) {
4650 if (++hash >= PTYPE_HASH_SIZE)
4651 return NULL;
4652 nxt = ptype_base[hash].next;
4654 found:
4655 return list_entry(nxt, struct packet_type, list);
4658 static void ptype_seq_stop(struct seq_file *seq, void *v)
4659 __releases(RCU)
4661 rcu_read_unlock();
4664 static int ptype_seq_show(struct seq_file *seq, void *v)
4666 struct packet_type *pt = v;
4668 if (v == SEQ_START_TOKEN)
4669 seq_puts(seq, "Type Device Function\n");
4670 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4671 if (pt->type == htons(ETH_P_ALL))
4672 seq_puts(seq, "ALL ");
4673 else
4674 seq_printf(seq, "%04x", ntohs(pt->type));
4676 seq_printf(seq, " %-8s %pF\n",
4677 pt->dev ? pt->dev->name : "", pt->func);
4680 return 0;
4683 static const struct seq_operations ptype_seq_ops = {
4684 .start = ptype_seq_start,
4685 .next = ptype_seq_next,
4686 .stop = ptype_seq_stop,
4687 .show = ptype_seq_show,
4690 static int ptype_seq_open(struct inode *inode, struct file *file)
4692 return seq_open_net(inode, file, &ptype_seq_ops,
4693 sizeof(struct seq_net_private));
4696 static const struct file_operations ptype_seq_fops = {
4697 .owner = THIS_MODULE,
4698 .open = ptype_seq_open,
4699 .read = seq_read,
4700 .llseek = seq_lseek,
4701 .release = seq_release_net,
4705 static int __net_init dev_proc_net_init(struct net *net)
4707 int rc = -ENOMEM;
4709 if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4710 goto out;
4711 if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4712 goto out_dev;
4713 if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4714 goto out_softnet;
4716 if (wext_proc_init(net))
4717 goto out_ptype;
4718 rc = 0;
4719 out:
4720 return rc;
4721 out_ptype:
4722 proc_net_remove(net, "ptype");
4723 out_softnet:
4724 proc_net_remove(net, "softnet_stat");
4725 out_dev:
4726 proc_net_remove(net, "dev");
4727 goto out;
4730 static void __net_exit dev_proc_net_exit(struct net *net)
4732 wext_proc_exit(net);
4734 proc_net_remove(net, "ptype");
4735 proc_net_remove(net, "softnet_stat");
4736 proc_net_remove(net, "dev");
4739 static struct pernet_operations __net_initdata dev_proc_ops = {
4740 .init = dev_proc_net_init,
4741 .exit = dev_proc_net_exit,
4744 static int __init dev_proc_init(void)
4746 return register_pernet_subsys(&dev_proc_ops);
4748 #else
4749 #define dev_proc_init() 0
4750 #endif /* CONFIG_PROC_FS */
4753 struct netdev_upper {
4754 struct net_device *dev;
4755 bool master;
4756 struct list_head list;
4757 struct rcu_head rcu;
4758 struct list_head search_list;
4761 static void __append_search_uppers(struct list_head *search_list,
4762 struct net_device *dev)
4764 struct netdev_upper *upper;
4766 list_for_each_entry(upper, &dev->upper_dev_list, list) {
4767 /* check if this upper is not already in search list */
4768 if (list_empty(&upper->search_list))
4769 list_add_tail(&upper->search_list, search_list);
4773 static bool __netdev_search_upper_dev(struct net_device *dev,
4774 struct net_device *upper_dev)
4776 LIST_HEAD(search_list);
4777 struct netdev_upper *upper;
4778 struct netdev_upper *tmp;
4779 bool ret = false;
4781 __append_search_uppers(&search_list, dev);
4782 list_for_each_entry(upper, &search_list, search_list) {
4783 if (upper->dev == upper_dev) {
4784 ret = true;
4785 break;
4787 __append_search_uppers(&search_list, upper->dev);
4789 list_for_each_entry_safe(upper, tmp, &search_list, search_list)
4790 INIT_LIST_HEAD(&upper->search_list);
4791 return ret;
4794 static struct netdev_upper *__netdev_find_upper(struct net_device *dev,
4795 struct net_device *upper_dev)
4797 struct netdev_upper *upper;
4799 list_for_each_entry(upper, &dev->upper_dev_list, list) {
4800 if (upper->dev == upper_dev)
4801 return upper;
4803 return NULL;
4807 * netdev_has_upper_dev - Check if device is linked to an upper device
4808 * @dev: device
4809 * @upper_dev: upper device to check
4811 * Find out if a device is linked to specified upper device and return true
4812 * in case it is. Note that this checks only the immediate upper device,
4813 * not through a complete stack of devices. The caller must hold the RTNL lock.
4815 bool netdev_has_upper_dev(struct net_device *dev,
4816 struct net_device *upper_dev)
4818 ASSERT_RTNL();
4820 return __netdev_find_upper(dev, upper_dev);
4822 EXPORT_SYMBOL(netdev_has_upper_dev);
4825 * netdev_has_any_upper_dev - Check if device is linked to some device
4826 * @dev: device
4828 * Find out if a device is linked to an upper device and return true in case
4829 * it is. The caller must hold the RTNL lock.
4831 bool netdev_has_any_upper_dev(struct net_device *dev)
4833 ASSERT_RTNL();
4835 return !list_empty(&dev->upper_dev_list);
4837 EXPORT_SYMBOL(netdev_has_any_upper_dev);
4840 * netdev_master_upper_dev_get - Get master upper device
4841 * @dev: device
4843 * Find a master upper device and return pointer to it or NULL in case
4844 * it's not there. The caller must hold the RTNL lock.
4846 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4848 struct netdev_upper *upper;
4850 ASSERT_RTNL();
4852 if (list_empty(&dev->upper_dev_list))
4853 return NULL;
4855 upper = list_first_entry(&dev->upper_dev_list,
4856 struct netdev_upper, list);
4857 if (likely(upper->master))
4858 return upper->dev;
4859 return NULL;
4861 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4864 * netdev_master_upper_dev_get_rcu - Get master upper device
4865 * @dev: device
4867 * Find a master upper device and return pointer to it or NULL in case
4868 * it's not there. The caller must hold the RCU read lock.
4870 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4872 struct netdev_upper *upper;
4874 upper = list_first_or_null_rcu(&dev->upper_dev_list,
4875 struct netdev_upper, list);
4876 if (upper && likely(upper->master))
4877 return upper->dev;
4878 return NULL;
4880 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4882 static int __netdev_upper_dev_link(struct net_device *dev,
4883 struct net_device *upper_dev, bool master)
4885 struct netdev_upper *upper;
4887 ASSERT_RTNL();
4889 if (dev == upper_dev)
4890 return -EBUSY;
4892 /* To prevent loops, check if dev is not upper device to upper_dev. */
4893 if (__netdev_search_upper_dev(upper_dev, dev))
4894 return -EBUSY;
4896 if (__netdev_find_upper(dev, upper_dev))
4897 return -EEXIST;
4899 if (master && netdev_master_upper_dev_get(dev))
4900 return -EBUSY;
4902 upper = kmalloc(sizeof(*upper), GFP_KERNEL);
4903 if (!upper)
4904 return -ENOMEM;
4906 upper->dev = upper_dev;
4907 upper->master = master;
4908 INIT_LIST_HEAD(&upper->search_list);
4910 /* Ensure that master upper link is always the first item in list. */
4911 if (master)
4912 list_add_rcu(&upper->list, &dev->upper_dev_list);
4913 else
4914 list_add_tail_rcu(&upper->list, &dev->upper_dev_list);
4915 dev_hold(upper_dev);
4917 return 0;
4921 * netdev_upper_dev_link - Add a link to the upper device
4922 * @dev: device
4923 * @upper_dev: new upper device
4925 * Adds a link to a device which is upper to this one. The caller must hold
4926 * the RTNL lock. On a failure a negative errno code is returned.
4927 * On success the reference counts are adjusted and the function
4928 * returns zero.
4930 int netdev_upper_dev_link(struct net_device *dev,
4931 struct net_device *upper_dev)
4933 return __netdev_upper_dev_link(dev, upper_dev, false);
4935 EXPORT_SYMBOL(netdev_upper_dev_link);
4938 * netdev_master_upper_dev_link - Add a master link to the upper device
4939 * @dev: device
4940 * @upper_dev: new upper device
4942 * Adds a link to a device which is upper to this one. In this case, only
4943 * one master upper device can be linked, although other non-master devices
4944 * might be linked as well. The caller must hold the RTNL lock.
4945 * On a failure a negative errno code is returned. On success the reference
4946 * counts are adjusted and the function returns zero.
4948 int netdev_master_upper_dev_link(struct net_device *dev,
4949 struct net_device *upper_dev)
4951 return __netdev_upper_dev_link(dev, upper_dev, true);
4953 EXPORT_SYMBOL(netdev_master_upper_dev_link);
4956 * netdev_upper_dev_unlink - Removes a link to upper device
4957 * @dev: device
4958 * @upper_dev: upper device to remove the link to
4960 * Removes a link to a device which is upper to this one. The caller must hold
4961 * the RTNL lock.
4963 void netdev_upper_dev_unlink(struct net_device *dev,
4964 struct net_device *upper_dev)
4966 struct netdev_upper *upper;
4968 ASSERT_RTNL();
4970 upper = __netdev_find_upper(dev, upper_dev);
4971 if (!upper)
4972 return;
4973 list_del_rcu(&upper->list);
4974 dev_put(upper_dev);
4975 kfree_rcu(upper, rcu);
4977 EXPORT_SYMBOL(netdev_upper_dev_unlink);
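/*
 * Example (illustrative sketch, not part of this file): how a bonding/team
 * style master might attach and detach a lower device.  The my_example_*
 * names are placeholders and error handling is trimmed to the essentials.
 */
static int my_example_enslave(struct net_device *master,
                              struct net_device *slave)
{
        int err;

        ASSERT_RTNL();

        err = netdev_master_upper_dev_link(slave, master);
        if (err)
                return err;

        /* ... program the hardware, clone MTU/MAC, start monitoring ... */
        return 0;
}

static void my_example_release(struct net_device *master,
                               struct net_device *slave)
{
        ASSERT_RTNL();

        netdev_upper_dev_unlink(slave, master);
        /* ... undo the hardware setup done at enslave time ... */
}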
4979 static void dev_change_rx_flags(struct net_device *dev, int flags)
4981 const struct net_device_ops *ops = dev->netdev_ops;
4983 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4984 ops->ndo_change_rx_flags(dev, flags);
4987 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4989 unsigned int old_flags = dev->flags;
4990 kuid_t uid;
4991 kgid_t gid;
4993 ASSERT_RTNL();
4995 dev->flags |= IFF_PROMISC;
4996 dev->promiscuity += inc;
4997 if (dev->promiscuity == 0) {
4999 * Avoid overflow.
5000 * If inc would cause an overflow, leave promiscuity untouched and return an error.
5002 if (inc < 0)
5003 dev->flags &= ~IFF_PROMISC;
5004 else {
5005 dev->promiscuity -= inc;
5006 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5007 dev->name);
5008 return -EOVERFLOW;
5011 if (dev->flags != old_flags) {
5012 pr_info("device %s %s promiscuous mode\n",
5013 dev->name,
5014 dev->flags & IFF_PROMISC ? "entered" : "left");
5015 if (audit_enabled) {
5016 current_uid_gid(&uid, &gid);
5017 audit_log(current->audit_context, GFP_ATOMIC,
5018 AUDIT_ANOM_PROMISCUOUS,
5019 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5020 dev->name, (dev->flags & IFF_PROMISC),
5021 (old_flags & IFF_PROMISC),
5022 from_kuid(&init_user_ns, audit_get_loginuid(current)),
5023 from_kuid(&init_user_ns, uid),
5024 from_kgid(&init_user_ns, gid),
5025 audit_get_sessionid(current));
5028 dev_change_rx_flags(dev, IFF_PROMISC);
5030 return 0;
5034 * dev_set_promiscuity - update promiscuity count on a device
5035 * @dev: device
5036 * @inc: modifier
5038 * Add or remove promiscuity from a device. While the count in the device
5039 * remains above zero the interface remains promiscuous. Once it hits zero
5040 * the device reverts to normal filtering operation. A negative inc
5041 * value is used to drop promiscuity on the device.
5042 * Return 0 if successful or a negative errno code on error.
5044 int dev_set_promiscuity(struct net_device *dev, int inc)
5046 unsigned int old_flags = dev->flags;
5047 int err;
5049 err = __dev_set_promiscuity(dev, inc);
5050 if (err < 0)
5051 return err;
5052 if (dev->flags != old_flags)
5053 dev_set_rx_mode(dev);
5054 return err;
5056 EXPORT_SYMBOL(dev_set_promiscuity);
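/*
 * Example (illustrative sketch, not part of this file): a packet-capture
 * style user of the promiscuity counter.  The my_example_* wrappers are
 * hypothetical; the point is the +1/-1 pairing done under the RTNL lock.
 */
static int my_example_capture_start(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_set_promiscuity(dev, 1);      /* take one promiscuity reference */
        rtnl_unlock();
        return err;
}

static void my_example_capture_stop(struct net_device *dev)
{
        rtnl_lock();
        dev_set_promiscuity(dev, -1);           /* drop the reference taken above */
        rtnl_unlock();
}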
5059 * dev_set_allmulti - update allmulti count on a device
5060 * @dev: device
5061 * @inc: modifier
5063 * Add or remove reception of all multicast frames on a device. While the
5064 * count in the device remains above zero the interface keeps listening
5065 * to all multicast frames. Once it hits zero the device reverts to normal
5066 * filtering operation. A negative @inc value is used to drop the counter
5067 * when releasing a resource that needed all multicasts.
5068 * Return 0 if successful or a negative errno code on error.
5071 int dev_set_allmulti(struct net_device *dev, int inc)
5073 unsigned int old_flags = dev->flags;
5075 ASSERT_RTNL();
5077 dev->flags |= IFF_ALLMULTI;
5078 dev->allmulti += inc;
5079 if (dev->allmulti == 0) {
5081 * Avoid overflow.
5082 * If inc would cause an overflow, leave allmulti untouched and return an error.
5084 if (inc < 0)
5085 dev->flags &= ~IFF_ALLMULTI;
5086 else {
5087 dev->allmulti -= inc;
5088 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5089 dev->name);
5090 return -EOVERFLOW;
5093 if (dev->flags ^ old_flags) {
5094 dev_change_rx_flags(dev, IFF_ALLMULTI);
5095 dev_set_rx_mode(dev);
5097 return 0;
5099 EXPORT_SYMBOL(dev_set_allmulti);
5102 * Upload unicast and multicast address lists to device and
5103 * configure RX filtering. When the device doesn't support unicast
5104 * filtering it is put in promiscuous mode while unicast addresses
5105 * are present.
5107 void __dev_set_rx_mode(struct net_device *dev)
5109 const struct net_device_ops *ops = dev->netdev_ops;
5111 /* dev_open will call this function so the list will stay sane. */
5112 if (!(dev->flags&IFF_UP))
5113 return;
5115 if (!netif_device_present(dev))
5116 return;
5118 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5119 /* Unicast addresses changes may only happen under the rtnl,
5120 * therefore calling __dev_set_promiscuity here is safe.
5122 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5123 __dev_set_promiscuity(dev, 1);
5124 dev->uc_promisc = true;
5125 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5126 __dev_set_promiscuity(dev, -1);
5127 dev->uc_promisc = false;
5131 if (ops->ndo_set_rx_mode)
5132 ops->ndo_set_rx_mode(dev);
5135 void dev_set_rx_mode(struct net_device *dev)
5137 netif_addr_lock_bh(dev);
5138 __dev_set_rx_mode(dev);
5139 netif_addr_unlock_bh(dev);
5143 * dev_get_flags - get flags reported to userspace
5144 * @dev: device
5146 * Get the combination of flag bits exported through APIs to userspace.
5148 unsigned int dev_get_flags(const struct net_device *dev)
5150 unsigned int flags;
5152 flags = (dev->flags & ~(IFF_PROMISC |
5153 IFF_ALLMULTI |
5154 IFF_RUNNING |
5155 IFF_LOWER_UP |
5156 IFF_DORMANT)) |
5157 (dev->gflags & (IFF_PROMISC |
5158 IFF_ALLMULTI));
5160 if (netif_running(dev)) {
5161 if (netif_oper_up(dev))
5162 flags |= IFF_RUNNING;
5163 if (netif_carrier_ok(dev))
5164 flags |= IFF_LOWER_UP;
5165 if (netif_dormant(dev))
5166 flags |= IFF_DORMANT;
5169 return flags;
5171 EXPORT_SYMBOL(dev_get_flags);
5173 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5175 unsigned int old_flags = dev->flags;
5176 int ret;
5178 ASSERT_RTNL();
5181 * Set the flags on our device.
5184 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5185 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5186 IFF_AUTOMEDIA)) |
5187 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5188 IFF_ALLMULTI));
5191 * Load in the correct multicast list now the flags have changed.
5194 if ((old_flags ^ flags) & IFF_MULTICAST)
5195 dev_change_rx_flags(dev, IFF_MULTICAST);
5197 dev_set_rx_mode(dev);
5200 * Have we downed the interface? We handle IFF_UP ourselves
5201 * according to user attempts to set it, rather than blindly
5202 * setting it.
5205 ret = 0;
5206 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
5207 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5209 if (!ret)
5210 dev_set_rx_mode(dev);
5213 if ((flags ^ dev->gflags) & IFF_PROMISC) {
5214 int inc = (flags & IFF_PROMISC) ? 1 : -1;
5216 dev->gflags ^= IFF_PROMISC;
5217 dev_set_promiscuity(dev, inc);
5220 /* NOTE: the order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5221 is important. Some (broken) drivers set IFF_PROMISC when
5222 IFF_ALLMULTI is requested, without asking us and without reporting it.
5224 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5225 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5227 dev->gflags ^= IFF_ALLMULTI;
5228 dev_set_allmulti(dev, inc);
5231 return ret;
5234 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
5236 unsigned int changes = dev->flags ^ old_flags;
5238 if (changes & IFF_UP) {
5239 if (dev->flags & IFF_UP)
5240 call_netdevice_notifiers(NETDEV_UP, dev);
5241 else
5242 call_netdevice_notifiers(NETDEV_DOWN, dev);
5245 if (dev->flags & IFF_UP &&
5246 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
5247 call_netdevice_notifiers(NETDEV_CHANGE, dev);
5251 * dev_change_flags - change device settings
5252 * @dev: device
5253 * @flags: device state flags
5255 * Change settings on a device based on the state flags. The flags are
5256 * in the userspace exported format.
5258 int dev_change_flags(struct net_device *dev, unsigned int flags)
5260 int ret;
5261 unsigned int changes, old_flags = dev->flags;
5263 ret = __dev_change_flags(dev, flags);
5264 if (ret < 0)
5265 return ret;
5267 changes = old_flags ^ dev->flags;
5268 if (changes)
5269 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
5271 __dev_notify_flags(dev, old_flags);
5272 return ret;
5274 EXPORT_SYMBOL(dev_change_flags);
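/*
 * Example (illustrative sketch, not part of this file): bringing an interface
 * administratively up from kernel code, the same way SIOCSIFFLAGS does it.
 * The my_example_* wrapper is hypothetical.
 */
static int my_example_bring_up(struct net_device *dev)
{
        unsigned int flags;
        int err;

        rtnl_lock();
        flags = dev_get_flags(dev);             /* userspace-visible flag set */
        err = dev_change_flags(dev, flags | IFF_UP);
        rtnl_unlock();
        return err;
}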
5277 * dev_set_mtu - Change maximum transfer unit
5278 * @dev: device
5279 * @new_mtu: new transfer unit
5281 * Change the maximum transfer size of the network device.
5283 int dev_set_mtu(struct net_device *dev, int new_mtu)
5285 const struct net_device_ops *ops = dev->netdev_ops;
5286 int err;
5288 if (new_mtu == dev->mtu)
5289 return 0;
5291 /* MTU must be positive. */
5292 if (new_mtu < 0)
5293 return -EINVAL;
5295 if (!netif_device_present(dev))
5296 return -ENODEV;
5298 err = 0;
5299 if (ops->ndo_change_mtu)
5300 err = ops->ndo_change_mtu(dev, new_mtu);
5301 else
5302 dev->mtu = new_mtu;
5304 if (!err)
5305 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5306 return err;
5308 EXPORT_SYMBOL(dev_set_mtu);
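/*
 * Example (illustrative sketch, not part of this file): switching a device to
 * jumbo frames.  The 9000-byte value is an assumption; whether it is accepted
 * is up to the driver's ndo_change_mtu().  Callers hold the RTNL lock.
 */
static int my_example_enable_jumbo(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_set_mtu(dev, 9000);
        rtnl_unlock();
        return err;
}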
5311 * dev_set_group - Change group this device belongs to
5312 * @dev: device
5313 * @new_group: group this device should belong to
5315 void dev_set_group(struct net_device *dev, int new_group)
5317 dev->group = new_group;
5319 EXPORT_SYMBOL(dev_set_group);
5322 * dev_set_mac_address - Change Media Access Control Address
5323 * @dev: device
5324 * @sa: new address
5326 * Change the hardware (MAC) address of the device
5328 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5330 const struct net_device_ops *ops = dev->netdev_ops;
5331 int err;
5333 if (!ops->ndo_set_mac_address)
5334 return -EOPNOTSUPP;
5335 if (sa->sa_family != dev->type)
5336 return -EINVAL;
5337 if (!netif_device_present(dev))
5338 return -ENODEV;
5339 err = ops->ndo_set_mac_address(dev, sa);
5340 if (err)
5341 return err;
5342 dev->addr_assign_type = NET_ADDR_SET;
5343 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5344 add_device_randomness(dev->dev_addr, dev->addr_len);
5345 return 0;
5347 EXPORT_SYMBOL(dev_set_mac_address);
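/*
 * Example (illustrative sketch, not part of this file): programming a new
 * hardware address from kernel code.  The my_example_* wrapper is
 * hypothetical; note that sa_family must match dev->type (e.g. ARPHRD_ETHER).
 */
static int my_example_set_mac(struct net_device *dev, const u8 *addr)
{
        struct sockaddr sa;
        int err;

        sa.sa_family = dev->type;
        memcpy(sa.sa_data, addr, dev->addr_len);

        rtnl_lock();
        err = dev_set_mac_address(dev, &sa);
        rtnl_unlock();
        return err;
}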
5350 * dev_change_carrier - Change device carrier
5351 * @dev: device
5352 * @new_carrier: new value
5354 * Change device carrier
5356 int dev_change_carrier(struct net_device *dev, bool new_carrier)
5358 const struct net_device_ops *ops = dev->netdev_ops;
5360 if (!ops->ndo_change_carrier)
5361 return -EOPNOTSUPP;
5362 if (!netif_device_present(dev))
5363 return -ENODEV;
5364 return ops->ndo_change_carrier(dev, new_carrier);
5366 EXPORT_SYMBOL(dev_change_carrier);
5369 * Perform the SIOCxIFxxx calls, inside rcu_read_lock()
5371 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
5373 int err;
5374 struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
5376 if (!dev)
5377 return -ENODEV;
5379 switch (cmd) {
5380 case SIOCGIFFLAGS: /* Get interface flags */
5381 ifr->ifr_flags = (short) dev_get_flags(dev);
5382 return 0;
5384 case SIOCGIFMETRIC: /* Get the metric on the interface
5385 (currently unused) */
5386 ifr->ifr_metric = 0;
5387 return 0;
5389 case SIOCGIFMTU: /* Get the MTU of a device */
5390 ifr->ifr_mtu = dev->mtu;
5391 return 0;
5393 case SIOCGIFHWADDR:
5394 if (!dev->addr_len)
5395 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
5396 else
5397 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
5398 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
5399 ifr->ifr_hwaddr.sa_family = dev->type;
5400 return 0;
5402 case SIOCGIFSLAVE:
5403 err = -EINVAL;
5404 break;
5406 case SIOCGIFMAP:
5407 ifr->ifr_map.mem_start = dev->mem_start;
5408 ifr->ifr_map.mem_end = dev->mem_end;
5409 ifr->ifr_map.base_addr = dev->base_addr;
5410 ifr->ifr_map.irq = dev->irq;
5411 ifr->ifr_map.dma = dev->dma;
5412 ifr->ifr_map.port = dev->if_port;
5413 return 0;
5415 case SIOCGIFINDEX:
5416 ifr->ifr_ifindex = dev->ifindex;
5417 return 0;
5419 case SIOCGIFTXQLEN:
5420 ifr->ifr_qlen = dev->tx_queue_len;
5421 return 0;
5423 default:
5424 /* dev_ioctl() should ensure this case
5425 * is never reached
5427 WARN_ON(1);
5428 err = -ENOTTY;
5429 break;
5432 return err;
5436 * Perform the SIOCxIFxxx calls, inside rtnl_lock()
5438 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
5440 int err;
5441 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
5442 const struct net_device_ops *ops;
5444 if (!dev)
5445 return -ENODEV;
5447 ops = dev->netdev_ops;
5449 switch (cmd) {
5450 case SIOCSIFFLAGS: /* Set interface flags */
5451 return dev_change_flags(dev, ifr->ifr_flags);
5453 case SIOCSIFMETRIC: /* Set the metric on the interface
5454 (currently unused) */
5455 return -EOPNOTSUPP;
5457 case SIOCSIFMTU: /* Set the MTU of a device */
5458 return dev_set_mtu(dev, ifr->ifr_mtu);
5460 case SIOCSIFHWADDR:
5461 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
5463 case SIOCSIFHWBROADCAST:
5464 if (ifr->ifr_hwaddr.sa_family != dev->type)
5465 return -EINVAL;
5466 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
5467 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
5468 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5469 return 0;
5471 case SIOCSIFMAP:
5472 if (ops->ndo_set_config) {
5473 if (!netif_device_present(dev))
5474 return -ENODEV;
5475 return ops->ndo_set_config(dev, &ifr->ifr_map);
5477 return -EOPNOTSUPP;
5479 case SIOCADDMULTI:
5480 if (!ops->ndo_set_rx_mode ||
5481 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5482 return -EINVAL;
5483 if (!netif_device_present(dev))
5484 return -ENODEV;
5485 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
5487 case SIOCDELMULTI:
5488 if (!ops->ndo_set_rx_mode ||
5489 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5490 return -EINVAL;
5491 if (!netif_device_present(dev))
5492 return -ENODEV;
5493 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
5495 case SIOCSIFTXQLEN:
5496 if (ifr->ifr_qlen < 0)
5497 return -EINVAL;
5498 dev->tx_queue_len = ifr->ifr_qlen;
5499 return 0;
5501 case SIOCSIFNAME:
5502 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
5503 return dev_change_name(dev, ifr->ifr_newname);
5505 case SIOCSHWTSTAMP:
5506 err = net_hwtstamp_validate(ifr);
5507 if (err)
5508 return err;
5509 /* fall through */
5512 * Unknown or private ioctl
5514 default:
5515 if ((cmd >= SIOCDEVPRIVATE &&
5516 cmd <= SIOCDEVPRIVATE + 15) ||
5517 cmd == SIOCBONDENSLAVE ||
5518 cmd == SIOCBONDRELEASE ||
5519 cmd == SIOCBONDSETHWADDR ||
5520 cmd == SIOCBONDSLAVEINFOQUERY ||
5521 cmd == SIOCBONDINFOQUERY ||
5522 cmd == SIOCBONDCHANGEACTIVE ||
5523 cmd == SIOCGMIIPHY ||
5524 cmd == SIOCGMIIREG ||
5525 cmd == SIOCSMIIREG ||
5526 cmd == SIOCBRADDIF ||
5527 cmd == SIOCBRDELIF ||
5528 cmd == SIOCSHWTSTAMP ||
5529 cmd == SIOCWANDEV) {
5530 err = -EOPNOTSUPP;
5531 if (ops->ndo_do_ioctl) {
5532 if (netif_device_present(dev))
5533 err = ops->ndo_do_ioctl(dev, ifr, cmd);
5534 else
5535 err = -ENODEV;
5537 } else
5538 err = -EINVAL;
5541 return err;
5545 * This function handles all "interface"-type I/O control requests. The actual
5546 * 'doing' part of this is dev_ifsioc above.
5550 * dev_ioctl - network device ioctl
5551 * @net: the applicable net namespace
5552 * @cmd: command to issue
5553 * @arg: pointer to a struct ifreq in user space
5555 * Issue ioctl functions to devices. This is normally called by the
5556 * user space syscall interfaces but can sometimes be useful for
5557 * other purposes. The return value is the return from the syscall if
5558 * positive or a negative errno code on error.
5561 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
5563 struct ifreq ifr;
5564 int ret;
5565 char *colon;
5567 /* One special case: SIOCGIFCONF takes an ifconf argument
5568 and requires a shared lock, because it sleeps writing
5569 to user space.
5572 if (cmd == SIOCGIFCONF) {
5573 rtnl_lock();
5574 ret = dev_ifconf(net, (char __user *) arg);
5575 rtnl_unlock();
5576 return ret;
5578 if (cmd == SIOCGIFNAME)
5579 return dev_ifname(net, (struct ifreq __user *)arg);
5581 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
5582 return -EFAULT;
5584 ifr.ifr_name[IFNAMSIZ-1] = 0;
5586 colon = strchr(ifr.ifr_name, ':');
5587 if (colon)
5588 *colon = 0;
5591 * See which interface the caller is talking about.
5594 switch (cmd) {
5596 * These ioctl calls:
5597 * - can be done by all.
5598 * - atomic and do not require locking.
5599 * - return a value
5601 case SIOCGIFFLAGS:
5602 case SIOCGIFMETRIC:
5603 case SIOCGIFMTU:
5604 case SIOCGIFHWADDR:
5605 case SIOCGIFSLAVE:
5606 case SIOCGIFMAP:
5607 case SIOCGIFINDEX:
5608 case SIOCGIFTXQLEN:
5609 dev_load(net, ifr.ifr_name);
5610 rcu_read_lock();
5611 ret = dev_ifsioc_locked(net, &ifr, cmd);
5612 rcu_read_unlock();
5613 if (!ret) {
5614 if (colon)
5615 *colon = ':';
5616 if (copy_to_user(arg, &ifr,
5617 sizeof(struct ifreq)))
5618 ret = -EFAULT;
5620 return ret;
5622 case SIOCETHTOOL:
5623 dev_load(net, ifr.ifr_name);
5624 rtnl_lock();
5625 ret = dev_ethtool(net, &ifr);
5626 rtnl_unlock();
5627 if (!ret) {
5628 if (colon)
5629 *colon = ':';
5630 if (copy_to_user(arg, &ifr,
5631 sizeof(struct ifreq)))
5632 ret = -EFAULT;
5634 return ret;
5637 * These ioctl calls:
5638 * - require superuser power.
5639 * - require strict serialization.
5640 * - return a value
5642 case SIOCGMIIPHY:
5643 case SIOCGMIIREG:
5644 case SIOCSIFNAME:
5645 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
5646 return -EPERM;
5647 dev_load(net, ifr.ifr_name);
5648 rtnl_lock();
5649 ret = dev_ifsioc(net, &ifr, cmd);
5650 rtnl_unlock();
5651 if (!ret) {
5652 if (colon)
5653 *colon = ':';
5654 if (copy_to_user(arg, &ifr,
5655 sizeof(struct ifreq)))
5656 ret = -EFAULT;
5658 return ret;
5661 * These ioctl calls:
5662 * - require superuser power.
5663 * - require strict serialization.
5664 * - do not return a value
5666 case SIOCSIFMAP:
5667 case SIOCSIFTXQLEN:
5668 if (!capable(CAP_NET_ADMIN))
5669 return -EPERM;
5670 /* fall through */
5672 * These ioctl calls:
5673 * - require local superuser power.
5674 * - require strict serialization.
5675 * - do not return a value
5677 case SIOCSIFFLAGS:
5678 case SIOCSIFMETRIC:
5679 case SIOCSIFMTU:
5680 case SIOCSIFHWADDR:
5681 case SIOCSIFSLAVE:
5682 case SIOCADDMULTI:
5683 case SIOCDELMULTI:
5684 case SIOCSIFHWBROADCAST:
5685 case SIOCSMIIREG:
5686 case SIOCBONDENSLAVE:
5687 case SIOCBONDRELEASE:
5688 case SIOCBONDSETHWADDR:
5689 case SIOCBONDCHANGEACTIVE:
5690 case SIOCBRADDIF:
5691 case SIOCBRDELIF:
5692 case SIOCSHWTSTAMP:
5693 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
5694 return -EPERM;
5695 /* fall through */
5696 case SIOCBONDSLAVEINFOQUERY:
5697 case SIOCBONDINFOQUERY:
5698 dev_load(net, ifr.ifr_name);
5699 rtnl_lock();
5700 ret = dev_ifsioc(net, &ifr, cmd);
5701 rtnl_unlock();
5702 return ret;
5704 case SIOCGIFMEM:
5705 /* Get the per device memory space. We can add this but
5706 * currently do not support it */
5707 case SIOCSIFMEM:
5708 /* Set the per device memory buffer space.
5709 * Not applicable in our case */
5710 case SIOCSIFLINK:
5711 return -ENOTTY;
5714 * Unknown or private ioctl.
5716 default:
5717 if (cmd == SIOCWANDEV ||
5718 (cmd >= SIOCDEVPRIVATE &&
5719 cmd <= SIOCDEVPRIVATE + 15)) {
5720 dev_load(net, ifr.ifr_name);
5721 rtnl_lock();
5722 ret = dev_ifsioc(net, &ifr, cmd);
5723 rtnl_unlock();
5724 if (!ret && copy_to_user(arg, &ifr,
5725 sizeof(struct ifreq)))
5726 ret = -EFAULT;
5727 return ret;
5729 /* Take care of Wireless Extensions */
5730 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5731 return wext_handle_ioctl(net, &ifr, cmd, arg);
5732 return -ENOTTY;
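/*
 * Example (illustrative sketch, not part of this file): what the code above
 * looks like from the other side of the syscall.  This is userspace C, kept
 * out of the build with #if 0; the caller is assumed to pass any AF_INET
 * datagram socket.
 */
#if 0   /* userspace */
#include <string.h>
#include <sys/ioctl.h>
#include <net/if.h>

static int example_get_mtu(int sock, const char *name)
{
        struct ifreq ifr;

        memset(&ifr, 0, sizeof(ifr));
        strncpy(ifr.ifr_name, name, IFNAMSIZ - 1);
        if (ioctl(sock, SIOCGIFMTU, &ifr) < 0)  /* lands in dev_ifsioc_locked() */
                return -1;
        return ifr.ifr_mtu;
}
#endif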
5738 * dev_new_index - allocate an ifindex
5739 * @net: the applicable net namespace
5741 * Returns a suitable unique value for a new device interface
5742 * number. The caller must hold the rtnl semaphore or the
5743 * dev_base_lock to be sure it remains unique.
5745 static int dev_new_index(struct net *net)
5747 int ifindex = net->ifindex;
5748 for (;;) {
5749 if (++ifindex <= 0)
5750 ifindex = 1;
5751 if (!__dev_get_by_index(net, ifindex))
5752 return net->ifindex = ifindex;
5756 /* Delayed registration/unregistration */
5757 static LIST_HEAD(net_todo_list);
5759 static void net_set_todo(struct net_device *dev)
5761 list_add_tail(&dev->todo_list, &net_todo_list);
5764 static void rollback_registered_many(struct list_head *head)
5766 struct net_device *dev, *tmp;
5768 BUG_ON(dev_boot_phase);
5769 ASSERT_RTNL();
5771 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5772 /* Some devices call this without ever having registered,
5773 * as part of unwinding a failed initialization. Remove
5774 * those devices and proceed with the remaining.
5776 if (dev->reg_state == NETREG_UNINITIALIZED) {
5777 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5778 dev->name, dev);
5780 WARN_ON(1);
5781 list_del(&dev->unreg_list);
5782 continue;
5784 dev->dismantle = true;
5785 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5788 /* If device is running, close it first. */
5789 dev_close_many(head);
5791 list_for_each_entry(dev, head, unreg_list) {
5792 /* And unlink it from device chain. */
5793 unlist_netdevice(dev);
5795 dev->reg_state = NETREG_UNREGISTERING;
5798 synchronize_net();
5800 list_for_each_entry(dev, head, unreg_list) {
5801 /* Shutdown queueing discipline. */
5802 dev_shutdown(dev);
5805 /* Notify protocols that we are about to destroy
5806 this device. They should clean up all of their state.
5808 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5810 if (!dev->rtnl_link_ops ||
5811 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5812 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5815 * Flush the unicast and multicast chains
5817 dev_uc_flush(dev);
5818 dev_mc_flush(dev);
5820 if (dev->netdev_ops->ndo_uninit)
5821 dev->netdev_ops->ndo_uninit(dev);
5823 /* Notifier chain MUST detach us all upper devices. */
5824 WARN_ON(netdev_has_any_upper_dev(dev));
5826 /* Remove entries from kobject tree */
5827 netdev_unregister_kobject(dev);
5828 #ifdef CONFIG_XPS
5829 /* Remove XPS queueing entries */
5830 netif_reset_xps_queues_gt(dev, 0);
5831 #endif
5834 synchronize_net();
5836 list_for_each_entry(dev, head, unreg_list)
5837 dev_put(dev);
5840 static void rollback_registered(struct net_device *dev)
5842 LIST_HEAD(single);
5844 list_add(&dev->unreg_list, &single);
5845 rollback_registered_many(&single);
5846 list_del(&single);
5849 static netdev_features_t netdev_fix_features(struct net_device *dev,
5850 netdev_features_t features)
5852 /* Fix illegal checksum combinations */
5853 if ((features & NETIF_F_HW_CSUM) &&
5854 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5855 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5856 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5859 /* Fix illegal SG+CSUM combinations. */
5860 if ((features & NETIF_F_SG) &&
5861 !(features & NETIF_F_ALL_CSUM)) {
5862 netdev_dbg(dev,
5863 "Dropping NETIF_F_SG since no checksum feature.\n");
5864 features &= ~NETIF_F_SG;
5867 /* TSO requires that SG is present as well. */
5868 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5869 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5870 features &= ~NETIF_F_ALL_TSO;
5873 /* TSO ECN requires that TSO is present as well. */
5874 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5875 features &= ~NETIF_F_TSO_ECN;
5877 /* Software GSO depends on SG. */
5878 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5879 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5880 features &= ~NETIF_F_GSO;
5883 /* UFO needs SG and checksumming */
5884 if (features & NETIF_F_UFO) {
5885 /* maybe split UFO into V4 and V6? */
5886 if (!((features & NETIF_F_GEN_CSUM) ||
5887 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5888 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5889 netdev_dbg(dev,
5890 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5891 features &= ~NETIF_F_UFO;
5894 if (!(features & NETIF_F_SG)) {
5895 netdev_dbg(dev,
5896 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5897 features &= ~NETIF_F_UFO;
5901 return features;
5904 int __netdev_update_features(struct net_device *dev)
5906 netdev_features_t features;
5907 int err = 0;
5909 ASSERT_RTNL();
5911 features = netdev_get_wanted_features(dev);
5913 if (dev->netdev_ops->ndo_fix_features)
5914 features = dev->netdev_ops->ndo_fix_features(dev, features);
5916 /* driver might be less strict about feature dependencies */
5917 features = netdev_fix_features(dev, features);
5919 if (dev->features == features)
5920 return 0;
5922 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5923 &dev->features, &features);
5925 if (dev->netdev_ops->ndo_set_features)
5926 err = dev->netdev_ops->ndo_set_features(dev, features);
5928 if (unlikely(err < 0)) {
5929 netdev_err(dev,
5930 "set_features() failed (%d); wanted %pNF, left %pNF\n",
5931 err, &features, &dev->features);
5932 return -1;
5935 if (!err)
5936 dev->features = features;
5938 return 1;
5942 * netdev_update_features - recalculate device features
5943 * @dev: the device to check
5945 * Recalculate dev->features set and send notifications if it
5946 * has changed. Should be called after driver or hardware dependent
5947 * conditions might have changed that influence the features.
5949 void netdev_update_features(struct net_device *dev)
5951 if (__netdev_update_features(dev))
5952 netdev_features_change(dev);
5954 EXPORT_SYMBOL(netdev_update_features);
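/*
 * Example (illustrative sketch, not part of this file): a driver whose RX
 * checksum offload stops working above a certain MTU could mask it out and
 * ask the core to recompute dev->features.  The names and the 1500-byte
 * limit are assumptions; the RTNL lock must be held.
 */
static void my_example_refresh_features(struct net_device *dev, int new_mtu)
{
        ASSERT_RTNL();

        if (new_mtu > 1500)
                dev->hw_features &= ~NETIF_F_RXCSUM;
        else
                dev->hw_features |= NETIF_F_RXCSUM;

        netdev_update_features(dev);    /* recompute and notify only if changed */
}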
5957 * netdev_change_features - recalculate device features
5958 * @dev: the device to check
5960 * Recalculate dev->features set and send notifications even
5961 * if they have not changed. Should be called instead of
5962 * netdev_update_features() if also dev->vlan_features might
5963 * have changed to allow the changes to be propagated to stacked
5964 * VLAN devices.
5966 void netdev_change_features(struct net_device *dev)
5968 __netdev_update_features(dev);
5969 netdev_features_change(dev);
5971 EXPORT_SYMBOL(netdev_change_features);
5974 * netif_stacked_transfer_operstate - transfer operstate
5975 * @rootdev: the root or lower level device to transfer state from
5976 * @dev: the device to transfer operstate to
5978 * Transfer operational state from root to device. This is normally
5979 * called when a stacking relationship exists between the root
5980 * device and the device (a leaf device).
5982 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5983 struct net_device *dev)
5985 if (rootdev->operstate == IF_OPER_DORMANT)
5986 netif_dormant_on(dev);
5987 else
5988 netif_dormant_off(dev);
5990 if (netif_carrier_ok(rootdev)) {
5991 if (!netif_carrier_ok(dev))
5992 netif_carrier_on(dev);
5993 } else {
5994 if (netif_carrier_ok(dev))
5995 netif_carrier_off(dev);
5998 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6000 #ifdef CONFIG_RPS
6001 static int netif_alloc_rx_queues(struct net_device *dev)
6003 unsigned int i, count = dev->num_rx_queues;
6004 struct netdev_rx_queue *rx;
6006 BUG_ON(count < 1);
6008 rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
6009 if (!rx)
6010 return -ENOMEM;
6012 dev->_rx = rx;
6014 for (i = 0; i < count; i++)
6015 rx[i].dev = dev;
6016 return 0;
6018 #endif
6020 static void netdev_init_one_queue(struct net_device *dev,
6021 struct netdev_queue *queue, void *_unused)
6023 /* Initialize queue lock */
6024 spin_lock_init(&queue->_xmit_lock);
6025 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6026 queue->xmit_lock_owner = -1;
6027 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6028 queue->dev = dev;
6029 #ifdef CONFIG_BQL
6030 dql_init(&queue->dql, HZ);
6031 #endif
6034 static int netif_alloc_netdev_queues(struct net_device *dev)
6036 unsigned int count = dev->num_tx_queues;
6037 struct netdev_queue *tx;
6039 BUG_ON(count < 1);
6041 tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
6042 if (!tx)
6043 return -ENOMEM;
6045 dev->_tx = tx;
6047 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6048 spin_lock_init(&dev->tx_global_lock);
6050 return 0;
6054 * register_netdevice - register a network device
6055 * @dev: device to register
6057 * Take a completed network device structure and add it to the kernel
6058 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6059 * chain. 0 is returned on success. A negative errno code is returned
6060 * on a failure to set up the device, or if the name is a duplicate.
6062 * Callers must hold the rtnl semaphore. You may want
6063 * register_netdev() instead of this.
6065 * BUGS:
6066 * The locking appears insufficient to guarantee two parallel registers
6067 * will not get the same name.
6070 int register_netdevice(struct net_device *dev)
6072 int ret;
6073 struct net *net = dev_net(dev);
6075 BUG_ON(dev_boot_phase);
6076 ASSERT_RTNL();
6078 might_sleep();
6080 /* When net_device's are persistent, this will be fatal. */
6081 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6082 BUG_ON(!net);
6084 spin_lock_init(&dev->addr_list_lock);
6085 netdev_set_addr_lockdep_class(dev);
6087 dev->iflink = -1;
6089 ret = dev_get_valid_name(net, dev, dev->name);
6090 if (ret < 0)
6091 goto out;
6093 /* Init, if this function is available */
6094 if (dev->netdev_ops->ndo_init) {
6095 ret = dev->netdev_ops->ndo_init(dev);
6096 if (ret) {
6097 if (ret > 0)
6098 ret = -EIO;
6099 goto out;
6103 if (((dev->hw_features | dev->features) & NETIF_F_HW_VLAN_FILTER) &&
6104 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6105 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6106 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6107 ret = -EINVAL;
6108 goto err_uninit;
6111 ret = -EBUSY;
6112 if (!dev->ifindex)
6113 dev->ifindex = dev_new_index(net);
6114 else if (__dev_get_by_index(net, dev->ifindex))
6115 goto err_uninit;
6117 if (dev->iflink == -1)
6118 dev->iflink = dev->ifindex;
6120 /* Transfer changeable features to wanted_features and enable
6121 * software offloads (GSO and GRO).
6123 dev->hw_features |= NETIF_F_SOFT_FEATURES;
6124 dev->features |= NETIF_F_SOFT_FEATURES;
6125 dev->wanted_features = dev->features & dev->hw_features;
6127 /* Turn on no cache copy if HW is doing checksum */
6128 if (!(dev->flags & IFF_LOOPBACK)) {
6129 dev->hw_features |= NETIF_F_NOCACHE_COPY;
6130 if (dev->features & NETIF_F_ALL_CSUM) {
6131 dev->wanted_features |= NETIF_F_NOCACHE_COPY;
6132 dev->features |= NETIF_F_NOCACHE_COPY;
6136 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6138 dev->vlan_features |= NETIF_F_HIGHDMA;
6140 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6141 ret = notifier_to_errno(ret);
6142 if (ret)
6143 goto err_uninit;
6145 ret = netdev_register_kobject(dev);
6146 if (ret)
6147 goto err_uninit;
6148 dev->reg_state = NETREG_REGISTERED;
6150 __netdev_update_features(dev);
6153 * Default initial state at registration is that the
6154 * device is present.
6157 set_bit(__LINK_STATE_PRESENT, &dev->state);
6159 linkwatch_init_dev(dev);
6161 dev_init_scheduler(dev);
6162 dev_hold(dev);
6163 list_netdevice(dev);
6164 add_device_randomness(dev->dev_addr, dev->addr_len);
6166 /* If the device has a permanent hardware address, the driver should
6167 * set dev_addr and addr_assign_type should be set to
6168 * NET_ADDR_PERM (the default value).
6170 if (dev->addr_assign_type == NET_ADDR_PERM)
6171 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6173 /* Notify protocols, that a new device appeared. */
6174 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6175 ret = notifier_to_errno(ret);
6176 if (ret) {
6177 rollback_registered(dev);
6178 dev->reg_state = NETREG_UNREGISTERED;
6181 * Prevent userspace races by waiting until the network
6182 * device is fully setup before sending notifications.
6184 if (!dev->rtnl_link_ops ||
6185 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6186 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6188 out:
6189 return ret;
6191 err_uninit:
6192 if (dev->netdev_ops->ndo_uninit)
6193 dev->netdev_ops->ndo_uninit(dev);
6194 goto out;
6196 EXPORT_SYMBOL(register_netdevice);
6199 * init_dummy_netdev - init a dummy network device for NAPI
6200 * @dev: device to init
6202 * This takes a network device structure and initializes the minimum
6203 * set of fields so it can be used to schedule NAPI polls without
6204 * registering a full blown interface. This is to be used by drivers
6205 * that need to tie several hardware interfaces to a single NAPI
6206 * poll scheduler due to HW limitations.
6208 int init_dummy_netdev(struct net_device *dev)
6210 /* Clear everything. Note we don't initialize spinlocks
6211 * as they aren't supposed to be taken by any of the
6212 * NAPI code and this dummy netdev is supposed to be
6213 * only ever used for NAPI polls
6215 memset(dev, 0, sizeof(struct net_device));
6217 /* make sure we BUG if trying to hit standard
6218 * register/unregister code path
6220 dev->reg_state = NETREG_DUMMY;
6222 /* NAPI wants this */
6223 INIT_LIST_HEAD(&dev->napi_list);
6225 /* a dummy interface is started by default */
6226 set_bit(__LINK_STATE_PRESENT, &dev->state);
6227 set_bit(__LINK_STATE_START, &dev->state);
6229 /* Note : We don't allocate pcpu_refcnt for dummy devices,
6230 * because users of this 'device' don't need to change
6231 * its refcount.
6234 return 0;
6236 EXPORT_SYMBOL_GPL(init_dummy_netdev);
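/*
 * Example (illustrative sketch, not part of this file): hardware that funnels
 * several ports through one interrupt can hang its shared NAPI context off a
 * dummy netdev.  The my_example_hw layout and the poll weight are assumptions.
 */
struct my_example_hw {
        struct net_device napi_dev;     /* dummy device, never registered */
        struct napi_struct napi;
};

static void my_example_hw_napi_init(struct my_example_hw *hw,
                                    int (*poll)(struct napi_struct *, int))
{
        init_dummy_netdev(&hw->napi_dev);
        netif_napi_add(&hw->napi_dev, &hw->napi, poll, 64);
        napi_enable(&hw->napi);
}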
6240 * register_netdev - register a network device
6241 * @dev: device to register
6243 * Take a completed network device structure and add it to the kernel
6244 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6245 * chain. 0 is returned on success. A negative errno code is returned
6246 * on a failure to set up the device, or if the name is a duplicate.
6248 * This is a wrapper around register_netdevice that takes the rtnl semaphore
6249 * and expands the device name if you passed a format string to
6250 * alloc_netdev.
6252 int register_netdev(struct net_device *dev)
6254 int err;
6256 rtnl_lock();
6257 err = register_netdevice(dev);
6258 rtnl_unlock();
6259 return err;
6261 EXPORT_SYMBOL(register_netdev);
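/*
 * Example (illustrative sketch, not part of this file): the usual probe-time
 * sequence for an Ethernet driver.  struct my_example_priv and
 * my_example_netdev_ops are placeholders for the driver's own definitions.
 */
struct my_example_priv {
        int dummy;      /* driver-private state lives here */
};

static const struct net_device_ops my_example_netdev_ops = {
        /* .ndo_open, .ndo_stop, .ndo_start_xmit, ... */
};

static int my_example_probe(struct device *parent)
{
        struct net_device *dev;
        int err;

        dev = alloc_etherdev(sizeof(struct my_example_priv));
        if (!dev)
                return -ENOMEM;

        SET_NETDEV_DEV(dev, parent);
        dev->netdev_ops = &my_example_netdev_ops;
        /* ... fill in dev->dev_addr, features, MTU limits, ... */

        err = register_netdev(dev);     /* takes rtnl_lock() internally */
        if (err)
                free_netdev(dev);
        return err;
}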
6263 int netdev_refcnt_read(const struct net_device *dev)
6265 int i, refcnt = 0;
6267 for_each_possible_cpu(i)
6268 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6269 return refcnt;
6271 EXPORT_SYMBOL(netdev_refcnt_read);
6274 * netdev_wait_allrefs - wait until all references are gone.
6275 * @dev: target net_device
6277 * This is called when unregistering network devices.
6279 * Any protocol or device that holds a reference should register
6280 * for netdevice notification, and clean up and put back the
6281 * reference if they receive an UNREGISTER event.
6282 * We can get stuck here if buggy protocols don't correctly
6283 * call dev_put.
6285 static void netdev_wait_allrefs(struct net_device *dev)
6287 unsigned long rebroadcast_time, warning_time;
6288 int refcnt;
6290 linkwatch_forget_dev(dev);
6292 rebroadcast_time = warning_time = jiffies;
6293 refcnt = netdev_refcnt_read(dev);
6295 while (refcnt != 0) {
6296 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6297 rtnl_lock();
6299 /* Rebroadcast unregister notification */
6300 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6302 __rtnl_unlock();
6303 rcu_barrier();
6304 rtnl_lock();
6306 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6307 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6308 &dev->state)) {
6309 /* We must not have linkwatch events
6310 * pending on unregister. If this
6311 * happens, we simply run the queue
6312 * unscheduled, resulting in a noop
6313 * for this device.
6315 linkwatch_run_queue();
6318 __rtnl_unlock();
6320 rebroadcast_time = jiffies;
6323 msleep(250);
6325 refcnt = netdev_refcnt_read(dev);
6327 if (time_after(jiffies, warning_time + 10 * HZ)) {
6328 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6329 dev->name, refcnt);
6330 warning_time = jiffies;
6335 /* The sequence is:
6337 * rtnl_lock();
6338 * ...
6339 * register_netdevice(x1);
6340 * register_netdevice(x2);
6341 * ...
6342 * unregister_netdevice(y1);
6343 * unregister_netdevice(y2);
6344 * ...
6345 * rtnl_unlock();
6346 * free_netdev(y1);
6347 * free_netdev(y2);
6349 * We are invoked by rtnl_unlock().
6350 * This allows us to deal with problems:
6351 * 1) We can delete sysfs objects which invoke hotplug
6352 * without deadlocking with linkwatch via keventd.
6353 * 2) Since we run with the RTNL semaphore not held, we can sleep
6354 * safely in order to wait for the netdev refcnt to drop to zero.
6356 * We must not return until all unregister events added during
6357 * the interval the lock was held have been completed.
6359 void netdev_run_todo(void)
6361 struct list_head list;
6363 /* Snapshot list, allow later requests */
6364 list_replace_init(&net_todo_list, &list);
6366 __rtnl_unlock();
6369 /* Wait for rcu callbacks to finish before next phase */
6370 if (!list_empty(&list))
6371 rcu_barrier();
6373 while (!list_empty(&list)) {
6374 struct net_device *dev
6375 = list_first_entry(&list, struct net_device, todo_list);
6376 list_del(&dev->todo_list);
6378 rtnl_lock();
6379 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6380 __rtnl_unlock();
6382 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6383 pr_err("network todo '%s' but state %d\n",
6384 dev->name, dev->reg_state);
6385 dump_stack();
6386 continue;
6389 dev->reg_state = NETREG_UNREGISTERED;
6391 on_each_cpu(flush_backlog, dev, 1);
6393 netdev_wait_allrefs(dev);
6395 /* paranoia */
6396 BUG_ON(netdev_refcnt_read(dev));
6397 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6398 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6399 WARN_ON(dev->dn_ptr);
6401 if (dev->destructor)
6402 dev->destructor(dev);
6404 /* Free network device */
6405 kobject_put(&dev->dev.kobj);
6409 /* Convert net_device_stats to rtnl_link_stats64. They have the same
6410 * fields in the same order, with only the type differing.
6412 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6413 const struct net_device_stats *netdev_stats)
6415 #if BITS_PER_LONG == 64
6416 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6417 memcpy(stats64, netdev_stats, sizeof(*stats64));
6418 #else
6419 size_t i, n = sizeof(*stats64) / sizeof(u64);
6420 const unsigned long *src = (const unsigned long *)netdev_stats;
6421 u64 *dst = (u64 *)stats64;
6423 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6424 sizeof(*stats64) / sizeof(u64));
6425 for (i = 0; i < n; i++)
6426 dst[i] = src[i];
6427 #endif
6429 EXPORT_SYMBOL(netdev_stats_to_stats64);
6432 * dev_get_stats - get network device statistics
6433 * @dev: device to get statistics from
6434 * @storage: place to store stats
6436 * Get network statistics from device. Return @storage.
6437 * The device driver may provide its own method by setting
6438 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6439 * otherwise the internal statistics structure is used.
6441 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6442 struct rtnl_link_stats64 *storage)
6444 const struct net_device_ops *ops = dev->netdev_ops;
6446 if (ops->ndo_get_stats64) {
6447 memset(storage, 0, sizeof(*storage));
6448 ops->ndo_get_stats64(dev, storage);
6449 } else if (ops->ndo_get_stats) {
6450 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6451 } else {
6452 netdev_stats_to_stats64(storage, &dev->stats);
6454 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6455 return storage;
6457 EXPORT_SYMBOL(dev_get_stats);
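/*
 * Example (illustrative sketch, not part of this file): the 64-bit hook that
 * dev_get_stats() prefers.  struct my_example_stats and its counters are
 * assumptions standing in for a driver's real accounting.
 */
struct my_example_stats {
        u64 rx_packets, rx_bytes;
        u64 tx_packets, tx_bytes;
};

static struct rtnl_link_stats64 *
my_example_get_stats64(struct net_device *dev,
                       struct rtnl_link_stats64 *storage)
{
        struct my_example_stats *s = netdev_priv(dev);

        storage->rx_packets = s->rx_packets;
        storage->rx_bytes   = s->rx_bytes;
        storage->tx_packets = s->tx_packets;
        storage->tx_bytes   = s->tx_bytes;
        /* dev_get_stats() adds the core-level rx_dropped on top */
        return storage;
}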
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (queue)
		return queue;
	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
	if (!queue)
		return NULL;
	netdev_init_one_queue(dev, queue, NULL);
	queue->qdisc = &noop_qdisc;
	queue->qdisc_sleeping = &noop_qdisc;
	rcu_assign_pointer(dev->ingress_queue, queue);
#endif
	return queue;
}

static const struct ethtool_ops default_ethtool_ops;

void netdev_set_default_ethtool_ops(struct net_device *dev,
				    const struct ethtool_ops *ops)
{
	if (dev->ethtool_ops == &default_ethtool_ops)
		dev->ethtool_ops = ops;
}
EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
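/*
 * Illustrative sketch, not part of the original file: a bus or class layer
 * can install fallback ethtool ops for devices whose drivers did not set
 * dev->ethtool_ops themselves.  The identifiers example_bus_ethtool_ops
 * and example_bus_setup() are hypothetical.
 */
#if 0	/* example only, never compiled */
static const struct ethtool_ops example_bus_ethtool_ops = {
	.get_link	= ethtool_op_get_link,
};

static void example_bus_setup(struct net_device *dev)
{
	/* Only takes effect if the driver left the default ops in place. */
	netdev_set_default_ethtool_ops(dev, &example_bus_ethtool_ops);
}
#endif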
/**
 *	alloc_netdev_mqs - allocate network device
 *	@sizeof_priv:	size of private data to allocate space for
 *	@name:		device name format string
 *	@setup:		callback to initialize device
 *	@txqs:		the number of TX subqueues to allocate
 *	@rxqs:		the number of RX subqueues to allocate
 *
 *	Allocates a struct net_device with private data area for driver use
 *	and performs basic initialization.  Also allocates subqueue structs
 *	for each queue on the device.
 */
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
		void (*setup)(struct net_device *),
		unsigned int txqs, unsigned int rxqs)
{
	struct net_device *dev;
	size_t alloc_size;
	struct net_device *p;

	BUG_ON(strlen(name) >= sizeof(dev->name));

	if (txqs < 1) {
		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
		return NULL;
	}

#ifdef CONFIG_RPS
	if (rxqs < 1) {
		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
		return NULL;
	}
#endif

	alloc_size = sizeof(struct net_device);
	if (sizeof_priv) {
		/* ensure 32-byte alignment of private area */
		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
		alloc_size += sizeof_priv;
	}
	/* ensure 32-byte alignment of whole construct */
	alloc_size += NETDEV_ALIGN - 1;

	p = kzalloc(alloc_size, GFP_KERNEL);
	if (!p)
		return NULL;

	dev = PTR_ALIGN(p, NETDEV_ALIGN);
	dev->padded = (char *)dev - (char *)p;

	dev->pcpu_refcnt = alloc_percpu(int);
	if (!dev->pcpu_refcnt)
		goto free_p;

	if (dev_addr_init(dev))
		goto free_pcpu;

	dev_mc_init(dev);
	dev_uc_init(dev);

	dev_net_set(dev, &init_net);

	dev->gso_max_size = GSO_MAX_SIZE;
	dev->gso_max_segs = GSO_MAX_SEGS;

	INIT_LIST_HEAD(&dev->napi_list);
	INIT_LIST_HEAD(&dev->unreg_list);
	INIT_LIST_HEAD(&dev->link_watch_list);
	INIT_LIST_HEAD(&dev->upper_dev_list);
	dev->priv_flags = IFF_XMIT_DST_RELEASE;
	setup(dev);

	dev->num_tx_queues = txqs;
	dev->real_num_tx_queues = txqs;
	if (netif_alloc_netdev_queues(dev))
		goto free_all;

#ifdef CONFIG_RPS
	dev->num_rx_queues = rxqs;
	dev->real_num_rx_queues = rxqs;
	if (netif_alloc_rx_queues(dev))
		goto free_all;
#endif

	strcpy(dev->name, name);
	dev->group = INIT_NETDEV_GROUP;
	if (!dev->ethtool_ops)
		dev->ethtool_ops = &default_ethtool_ops;
	return dev;

free_all:
	free_netdev(dev);
	return NULL;

free_pcpu:
	free_percpu(dev->pcpu_refcnt);
	kfree(dev->_tx);
#ifdef CONFIG_RPS
	kfree(dev->_rx);
#endif

free_p:
	kfree(p);
	return NULL;
}
EXPORT_SYMBOL(alloc_netdev_mqs);
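/*
 * Illustrative sketch, not part of the original file: a typical driver-side
 * use of alloc_netdev_mqs().  The private struct, setup callback, probe
 * function and queue counts (4 TX / 4 RX) are hypothetical.
 */
#if 0	/* example only, never compiled */
struct example_priv {
	int example_flag;
};

static void example_setup(struct net_device *dev)
{
	ether_setup(dev);	/* sane Ethernet defaults */
}

static struct net_device *example_probe(void)
{
	struct net_device *dev;

	dev = alloc_netdev_mqs(sizeof(struct example_priv), "eth%d",
			       example_setup, 4, 4);
	if (!dev)
		return NULL;

	if (register_netdev(dev)) {
		/* Never registered: free_netdev() handles this state. */
		free_netdev(dev);
		return NULL;
	}
	return dev;
}
#endif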
/**
 *	free_netdev - free network device
 *	@dev: device
 *
 *	This function does the last stage of destroying an allocated device
 *	interface. The reference to the device object is released.
 *	If this is the last reference then it will be freed.
 */
void free_netdev(struct net_device *dev)
{
	struct napi_struct *p, *n;

	release_net(dev_net(dev));

	kfree(dev->_tx);
#ifdef CONFIG_RPS
	kfree(dev->_rx);
#endif

	kfree(rcu_dereference_protected(dev->ingress_queue, 1));

	/* Flush device addresses */
	dev_addr_flush(dev);

	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
		netif_napi_del(p);

	free_percpu(dev->pcpu_refcnt);
	dev->pcpu_refcnt = NULL;

	/* Compatibility with error handling in drivers */
	if (dev->reg_state == NETREG_UNINITIALIZED) {
		kfree((char *)dev - dev->padded);
		return;
	}

	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
	dev->reg_state = NETREG_RELEASED;

	/* will free via device release */
	put_device(&dev->dev);
}
EXPORT_SYMBOL(free_netdev);
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 */
void synchronize_net(void)
{
	might_sleep();
	if (rtnl_is_locked())
		synchronize_rcu_expedited();
	else
		synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);
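/*
 * Illustrative sketch, not part of the original file: synchronize_net() is
 * typically called after unhooking a receive handler so that any packets
 * still flowing through the old hook have drained before its resources are
 * torn down.  The packet_type instance and example_unhook() are hypothetical.
 */
#if 0	/* example only, never compiled */
static struct packet_type example_pt;	/* previously dev_add_pack()ed */

static void example_unhook(void)
{
	dev_remove_pack(&example_pt);
	/* Wait for in-flight receive processing before freeing state
	 * that example_pt.func may still be touching.
	 */
	synchronize_net();
}
#endif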
/**
 *	unregister_netdevice_queue - remove device from the kernel
 *	@dev: device
 *	@head: list
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *	If @head is not NULL, the device is queued to be unregistered later.
 *
 *	Callers must hold the rtnl semaphore.  You may want
 *	unregister_netdev() instead of this.
 */

void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
{
	ASSERT_RTNL();

	if (head) {
		list_move_tail(&dev->unreg_list, head);
	} else {
		rollback_registered(dev);
		/* Finish processing unregister after unlock */
		net_set_todo(dev);
	}
}
EXPORT_SYMBOL(unregister_netdevice_queue);
/**
 *	unregister_netdevice_many - unregister many devices
 *	@head: list of devices
 */
void unregister_netdevice_many(struct list_head *head)
{
	struct net_device *dev;

	if (!list_empty(head)) {
		rollback_registered_many(head);
		list_for_each_entry(dev, head, unreg_list)
			net_set_todo(dev);
	}
}
EXPORT_SYMBOL(unregister_netdevice_many);
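/*
 * Illustrative sketch, not part of the original file: batching several
 * unregistrations under a single rtnl_lock()/rtnl_unlock() cycle by
 * queueing them on a list and flushing with unregister_netdevice_many().
 * The example_devs array and function name are hypothetical.
 */
#if 0	/* example only, never compiled */
static void example_unregister_batch(struct net_device **example_devs, int n)
{
	LIST_HEAD(kill_list);
	int i;

	rtnl_lock();
	for (i = 0; i < n; i++)
		unregister_netdevice_queue(example_devs[i], &kill_list);
	unregister_netdevice_many(&kill_list);
	rtnl_unlock();
}
#endif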
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	This is just a wrapper for unregister_netdevice that takes
 *	the rtnl semaphore.  In general you want to use this and not
 *	unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
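/*
 * Illustrative sketch, not part of the original file: the usual driver
 * remove path pairs unregister_netdev() (which takes the rtnl semaphore
 * itself) with free_netdev().  example_remove() is a hypothetical name.
 */
#if 0	/* example only, never compiled */
static void example_remove(struct net_device *dev)
{
	unregister_netdev(dev);	/* waits for all references to drop */
	free_netdev(dev);	/* releases the device object */
}
#endif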
/**
 *	dev_change_net_namespace - move device to different network namespace
 *	@dev: device
 *	@net: network namespace
 *	@pat: If not NULL name pattern to try if the current device name
 *	      is already taken in the destination network namespace.
 *
 *	This function shuts down a device interface and moves it
 *	to a new network namespace. On success 0 is returned, on
 *	a failure a negative errno code is returned.
 *
 *	Callers must hold the rtnl semaphore.
 */

int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
{
	int err;

	ASSERT_RTNL();

	/* Don't allow namespace local devices to be moved. */
	err = -EINVAL;
	if (dev->features & NETIF_F_NETNS_LOCAL)
		goto out;

	/* Ensure the device has been registered */
	if (dev->reg_state != NETREG_REGISTERED)
		goto out;

	/* Get out if there is nothing to do */
	err = 0;
	if (net_eq(dev_net(dev), net))
		goto out;

	/* Pick the destination device name, and ensure
	 * we can use it in the destination network namespace.
	 */
	err = -EEXIST;
	if (__dev_get_by_name(net, dev->name)) {
		/* We get here if we can't use the current device name */
		if (!pat)
			goto out;
		if (dev_get_valid_name(net, dev, pat) < 0)
			goto out;
	}

	/*
	 * And now a mini version of register_netdevice unregister_netdevice.
	 */

	/* If device is running close it first. */
	dev_close(dev);

	/* And unlink it from device chain */
	err = -ENODEV;
	unlist_netdevice(dev);

	synchronize_net();

	/* Shutdown queueing discipline. */
	dev_shutdown(dev);

	/* Notify protocols, that we are about to destroy
	   this device. They should clean all the things.

	   Note that dev->reg_state stays at NETREG_REGISTERED.
	   This is wanted because this way 8021q and macvlan know
	   the device is just moving and can keep their slaves up.
	*/
	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
	rcu_barrier();
	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);

	/*
	 *	Flush the unicast and multicast chains
	 */
	dev_uc_flush(dev);
	dev_mc_flush(dev);

	/* Send a netdev-removed uevent to the old namespace */
	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);

	/* Actually switch the network namespace */
	dev_net_set(dev, net);

	/* If there is an ifindex conflict assign a new one */
	if (__dev_get_by_index(net, dev->ifindex)) {
		int iflink = (dev->iflink == dev->ifindex);
		dev->ifindex = dev_new_index(net);
		if (iflink)
			dev->iflink = dev->ifindex;
	}

	/* Send a netdev-add uevent to the new namespace */
	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);

	/* Fixup kobjects */
	err = device_rename(&dev->dev, dev->name);
	WARN_ON(err);

	/* Add the device back in the hashes */
	list_netdevice(dev);

	/* Notify protocols, that a new device appeared. */
	call_netdevice_notifiers(NETDEV_REGISTER, dev);

	/*
	 *	Prevent userspace races by waiting until the network
	 *	device is fully set up before sending notifications.
	 */
	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);

	synchronize_net();
	err = 0;
out:
	return err;
}
EXPORT_SYMBOL_GPL(dev_change_net_namespace);
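/*
 * Illustrative sketch, not part of the original file: moving a device into
 * another network namespace under the rtnl semaphore, falling back to a
 * "dev%d" name pattern if the current name is already taken there.  The
 * function name example_move_to_ns() is hypothetical.
 */
#if 0	/* example only, never compiled */
static int example_move_to_ns(struct net_device *dev, struct net *net)
{
	int err;

	rtnl_lock();
	err = dev_change_net_namespace(dev, net, "dev%d");
	rtnl_unlock();
	return err;
}
#endif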
static int dev_cpu_callback(struct notifier_block *nfb,
			    unsigned long action,
			    void *ocpu)
{
	struct sk_buff **list_skb;
	struct sk_buff *skb;
	unsigned int cpu, oldcpu = (unsigned long)ocpu;
	struct softnet_data *sd, *oldsd;

	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return NOTIFY_OK;

	local_irq_disable();
	cpu = smp_processor_id();
	sd = &per_cpu(softnet_data, cpu);
	oldsd = &per_cpu(softnet_data, oldcpu);

	/* Find end of our completion_queue. */
	list_skb = &sd->completion_queue;
	while (*list_skb)
		list_skb = &(*list_skb)->next;
	/* Append completion queue from offline CPU. */
	*list_skb = oldsd->completion_queue;
	oldsd->completion_queue = NULL;

	/* Append output queue from offline CPU. */
	if (oldsd->output_queue) {
		*sd->output_queue_tailp = oldsd->output_queue;
		sd->output_queue_tailp = oldsd->output_queue_tailp;
		oldsd->output_queue = NULL;
		oldsd->output_queue_tailp = &oldsd->output_queue;
	}
	/* Append NAPI poll list from offline CPU. */
	if (!list_empty(&oldsd->poll_list)) {
		list_splice_init(&oldsd->poll_list, &sd->poll_list);
		raise_softirq_irqoff(NET_RX_SOFTIRQ);
	}

	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_enable();

	/* Process offline CPU's input_pkt_queue */
	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
		netif_rx(skb);
		input_queue_head_incr(oldsd);
	}
	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
		netif_rx(skb);
		input_queue_head_incr(oldsd);
	}

	return NOTIFY_OK;
}
/**
 *	netdev_increment_features - increment feature set by one
 *	@all: current feature set
 *	@one: new feature set
 *	@mask: mask feature set
 *
 *	Computes a new feature set after adding a device with feature set
 *	@one to the master device with current feature set @all.  Will not
 *	enable anything that is off in @mask. Returns the new feature set.
 */
netdev_features_t netdev_increment_features(netdev_features_t all,
	netdev_features_t one, netdev_features_t mask)
{
	if (mask & NETIF_F_GEN_CSUM)
		mask |= NETIF_F_ALL_CSUM;
	mask |= NETIF_F_VLAN_CHALLENGED;

	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
	all &= one | ~NETIF_F_ALL_FOR_ALL;

	/* If one device supports hw checksumming, set for all. */
	if (all & NETIF_F_GEN_CSUM)
		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);

	return all;
}
EXPORT_SYMBOL(netdev_increment_features);
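/*
 * Illustrative sketch, not part of the original file: a bonding/bridge
 * style master recomputing its feature set from its slaves, in the spirit
 * of what such drivers do with netdev_increment_features().  The slave
 * array iteration is schematic and all identifiers are hypothetical.
 */
#if 0	/* example only, never compiled */
static netdev_features_t example_fix_features(struct net_device **slaves,
					      int n_slaves,
					      netdev_features_t mask)
{
	netdev_features_t features = mask & NETIF_F_ONE_FOR_ALL;
	int i;

	/* Fold each slave's features in; @mask limits what may be enabled. */
	for (i = 0; i < n_slaves; i++)
		features = netdev_increment_features(features,
						     slaves[i]->features,
						     mask);
	return features;
}
#endif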
static struct hlist_head *netdev_create_hash(void)
{
	int i;
	struct hlist_head *hash;

	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
	if (hash != NULL)
		for (i = 0; i < NETDEV_HASHENTRIES; i++)
			INIT_HLIST_HEAD(&hash[i]);

	return hash;
}

/* Initialize per network namespace state */
static int __net_init netdev_init(struct net *net)
{
	if (net != &init_net)
		INIT_LIST_HEAD(&net->dev_base_head);

	net->dev_name_head = netdev_create_hash();
	if (net->dev_name_head == NULL)
		goto err_name;

	net->dev_index_head = netdev_create_hash();
	if (net->dev_index_head == NULL)
		goto err_idx;

	return 0;

err_idx:
	kfree(net->dev_name_head);
err_name:
	return -ENOMEM;
}
/**
 *	netdev_drivername - network driver for the device
 *	@dev: network device
 *
 *	Determine network driver for device.
 */
const char *netdev_drivername(const struct net_device *dev)
{
	const struct device_driver *driver;
	const struct device *parent;
	const char *empty = "";

	parent = dev->dev.parent;
	if (!parent)
		return empty;

	driver = parent->driver;
	if (driver && driver->name)
		return driver->name;
	return empty;
}
static int __netdev_printk(const char *level, const struct net_device *dev,
			   struct va_format *vaf)
{
	int r;

	if (dev && dev->dev.parent) {
		r = dev_printk_emit(level[1] - '0',
				    dev->dev.parent,
				    "%s %s %s: %pV",
				    dev_driver_string(dev->dev.parent),
				    dev_name(dev->dev.parent),
				    netdev_name(dev), vaf);
	} else if (dev) {
		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
	} else {
		r = printk("%s(NULL net_device): %pV", level, vaf);
	}

	return r;
}

int netdev_printk(const char *level, const struct net_device *dev,
		  const char *format, ...)
{
	struct va_format vaf;
	va_list args;
	int r;

	va_start(args, format);

	vaf.fmt = format;
	vaf.va = &args;

	r = __netdev_printk(level, dev, &vaf);

	va_end(args);

	return r;
}
EXPORT_SYMBOL(netdev_printk);
#define define_netdev_printk_level(func, level)			\
int func(const struct net_device *dev, const char *fmt, ...)		\
{									\
	int r;								\
	struct va_format vaf;						\
	va_list args;							\
									\
	va_start(args, fmt);						\
									\
	vaf.fmt = fmt;							\
	vaf.va = &args;							\
									\
	r = __netdev_printk(level, dev, &vaf);				\
									\
	va_end(args);							\
									\
	return r;							\
}									\
EXPORT_SYMBOL(func);

define_netdev_printk_level(netdev_emerg, KERN_EMERG);
define_netdev_printk_level(netdev_alert, KERN_ALERT);
define_netdev_printk_level(netdev_crit, KERN_CRIT);
define_netdev_printk_level(netdev_err, KERN_ERR);
define_netdev_printk_level(netdev_warn, KERN_WARNING);
define_netdev_printk_level(netdev_notice, KERN_NOTICE);
define_netdev_printk_level(netdev_info, KERN_INFO);
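/*
 * Illustrative sketch, not part of the original file: drivers use the
 * level helpers generated above (netdev_err(), netdev_info(), ...) so that
 * messages are prefixed with the driver, bus and interface names.  The
 * example_open() function and its -ENODEV condition are hypothetical.
 */
#if 0	/* example only, never compiled */
static int example_open(struct net_device *dev)
{
	if (!netif_device_present(dev)) {
		netdev_err(dev, "device not present, cannot open\n");
		return -ENODEV;
	}
	netdev_info(dev, "link is being brought up\n");
	return 0;
}
#endif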
static void __net_exit netdev_exit(struct net *net)
{
	kfree(net->dev_name_head);
	kfree(net->dev_index_head);
}

static struct pernet_operations __net_initdata netdev_net_ops = {
	.init = netdev_init,
	.exit = netdev_exit,
};

static void __net_exit default_device_exit(struct net *net)
{
	struct net_device *dev, *aux;
	/*
	 * Push all migratable network devices back to the
	 * initial network namespace
	 */
	rtnl_lock();
	for_each_netdev_safe(net, dev, aux) {
		int err;
		char fb_name[IFNAMSIZ];

		/* Ignore unmoveable devices (i.e. loopback) */
		if (dev->features & NETIF_F_NETNS_LOCAL)
			continue;

		/* Leave virtual devices for the generic cleanup */
		if (dev->rtnl_link_ops)
			continue;

		/* Push remaining network devices to init_net */
		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
		err = dev_change_net_namespace(dev, &init_net, fb_name);
		if (err) {
			pr_emerg("%s: failed to move %s to init_net: %d\n",
				 __func__, dev->name, err);
			BUG();
		}
	}
	rtnl_unlock();
}
static void __net_exit default_device_exit_batch(struct list_head *net_list)
{
	/* At exit all network devices must be removed from a network
	 * namespace.  Do this in the reverse order of registration.
	 * Do this across as many network namespaces as possible to
	 * improve batching efficiency.
	 */
	struct net_device *dev;
	struct net *net;
	LIST_HEAD(dev_kill_list);

	rtnl_lock();
	list_for_each_entry(net, net_list, exit_list) {
		for_each_netdev_reverse(net, dev) {
			if (dev->rtnl_link_ops)
				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
			else
				unregister_netdevice_queue(dev, &dev_kill_list);
		}
	}
	unregister_netdevice_many(&dev_kill_list);
	list_del(&dev_kill_list);
	rtnl_unlock();
}

static struct pernet_operations __net_initdata default_device_ops = {
	.exit = default_device_exit,
	.exit_batch = default_device_exit_batch,
};
/*
 *	Initialize the DEV module. At boot time this walks the device list and
 *	unhooks any devices that fail to initialise (normally hardware not
 *	present) and leaves us with a valid list of present and active devices.
 *
 */

/*
 *	This is called single threaded during boot, so no need
 *	to take the rtnl semaphore.
 */
static int __init net_dev_init(void)
{
	int i, rc = -ENOMEM;

	BUG_ON(!dev_boot_phase);

	if (dev_proc_init())
		goto out;

	if (netdev_kobject_init())
		goto out;

	INIT_LIST_HEAD(&ptype_all);
	for (i = 0; i < PTYPE_HASH_SIZE; i++)
		INIT_LIST_HEAD(&ptype_base[i]);

	INIT_LIST_HEAD(&offload_base);

	if (register_pernet_subsys(&netdev_net_ops))
		goto out;

	/*
	 *	Initialise the packet receive queues.
	 */

	for_each_possible_cpu(i) {
		struct softnet_data *sd = &per_cpu(softnet_data, i);

		memset(sd, 0, sizeof(*sd));
		skb_queue_head_init(&sd->input_pkt_queue);
		skb_queue_head_init(&sd->process_queue);
		sd->completion_queue = NULL;
		INIT_LIST_HEAD(&sd->poll_list);
		sd->output_queue = NULL;
		sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
		sd->csd.func = rps_trigger_softirq;
		sd->csd.info = sd;
		sd->csd.flags = 0;
		sd->cpu = i;
#endif

		sd->backlog.poll = process_backlog;
		sd->backlog.weight = weight_p;
		sd->backlog.gro_list = NULL;
		sd->backlog.gro_count = 0;
	}

	dev_boot_phase = 0;

	/* The loopback device is special: if any other network device
	 * is present in a network namespace, the loopback device must
	 * be present too. Since we now dynamically allocate and free the
	 * loopback device, ensure this invariant is maintained by
	 * keeping the loopback device as the first device on the
	 * list of network devices, so that it is the first device that
	 * appears and the last network device that disappears.
	 */
	if (register_pernet_device(&loopback_net_ops))
		goto out;

	if (register_pernet_device(&default_device_ops))
		goto out;

	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action);

	hotcpu_notifier(dev_cpu_callback, 0);
	dst_init();
	dev_mcast_init();
	rc = 0;
out:
	return rc;
}

subsys_initcall(net_dev_init);