2 * NET3 Protocol independent device support routines.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Derived from the non IP parts of dev.c 1.0.19
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
51 * Rudi Cilibrasi : Pass the right thing to
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <net/net_namespace.h>
99 #include <linux/rtnetlink.h>
100 #include <linux/proc_fs.h>
101 #include <linux/seq_file.h>
102 #include <linux/stat.h>
104 #include <net/pkt_sched.h>
105 #include <net/checksum.h>
106 #include <net/xfrm.h>
107 #include <linux/highmem.h>
108 #include <linux/init.h>
109 #include <linux/kmod.h>
110 #include <linux/module.h>
111 #include <linux/netpoll.h>
112 #include <linux/rcupdate.h>
113 #include <linux/delay.h>
114 #include <net/wext.h>
115 #include <net/iw_handler.h>
116 #include <asm/current.h>
117 #include <linux/audit.h>
118 #include <linux/dmaengine.h>
119 #include <linux/err.h>
120 #include <linux/ctype.h>
121 #include <linux/if_arp.h>
122 #include <linux/if_vlan.h>
123 #include <linux/ip.h>
125 #include <linux/ipv6.h>
126 #include <linux/in.h>
127 #include <linux/jhash.h>
128 #include <linux/random.h>
129 #include <trace/events/napi.h>
130 #include <trace/events/net.h>
131 #include <trace/events/skb.h>
132 #include <linux/pci.h>
133 #include <linux/inetdevice.h>
134 #include <linux/cpu_rmap.h>
135 #include <linux/net_tstamp.h>
136 #include <linux/static_key.h>
138 #include "net-sysfs.h"
140 /* Instead of increasing this, you should create a hash table. */
141 #define MAX_GRO_SKBS 8
143 /* This should be increased if a protocol with a bigger head is added. */
144 #define GRO_MAX_HEAD (MAX_HEADER + 128)
147 * The list of packet types we will receive (as opposed to discard)
148 * and the routines to invoke.
150 * Why 16. Because with 16 the only overlap we get on a hash of the
151 * low nibble of the protocol value is RARP/SNAP/X.25.
153 * NOTE: That is no longer true with the addition of VLAN tags. Not
154 * sure which should go first, but I bet it won't make much
155 * difference if we are running VLANs. The good news is that
156 * this protocol won't be in the list unless compiled in, so
157 * the average user (w/out VLANs) will not be adversely affected.
174 #define PTYPE_HASH_SIZE (16)
175 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
177 static DEFINE_SPINLOCK(ptype_lock
);
178 static DEFINE_SPINLOCK(offload_lock
);
179 static struct list_head ptype_base
[PTYPE_HASH_SIZE
] __read_mostly
;
180 static struct list_head ptype_all __read_mostly
; /* Taps */
181 static struct list_head offload_base __read_mostly
;
184 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
187 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
189 * Writers must hold the rtnl semaphore while they loop through the
190 * dev_base_head list, and hold dev_base_lock for writing when they do the
191 * actual updates. This allows pure readers to access the list even
192 * while a writer is preparing to update it.
194 * To put it another way, dev_base_lock is held for writing only to
195 * protect against pure readers; the rtnl semaphore provides the
196 * protection against other writers.
198 * See, for example usages, register_netdevice() and
199 * unregister_netdevice(), which must be called with the rtnl
202 DEFINE_RWLOCK(dev_base_lock
);
203 EXPORT_SYMBOL(dev_base_lock
);
205 seqcount_t devnet_rename_seq
;
207 static inline void dev_base_seq_inc(struct net
*net
)
209 while (++net
->dev_base_seq
== 0);
212 static inline struct hlist_head
*dev_name_hash(struct net
*net
, const char *name
)
214 unsigned int hash
= full_name_hash(name
, strnlen(name
, IFNAMSIZ
));
216 return &net
->dev_name_head
[hash_32(hash
, NETDEV_HASHBITS
)];
219 static inline struct hlist_head
*dev_index_hash(struct net
*net
, int ifindex
)
221 return &net
->dev_index_head
[ifindex
& (NETDEV_HASHENTRIES
- 1)];
/* Serialize access to the per-CPU input packet queue.
 * NOTE(review): upstream wraps the body in #ifdef CONFIG_RPS (only the
 * spin_lock line survived extraction here) -- confirm against upstream.
 */
static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_lock(&sd->input_pkt_queue.lock);
#endif
}
/* Release the per-CPU input packet queue lock.
 * NOTE(review): upstream wraps the body in #ifdef CONFIG_RPS -- confirm.
 */
static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_unlock(&sd->input_pkt_queue.lock);
#endif
}
238 /* Device list insertion */
239 static int list_netdevice(struct net_device
*dev
)
241 struct net
*net
= dev_net(dev
);
245 write_lock_bh(&dev_base_lock
);
246 list_add_tail_rcu(&dev
->dev_list
, &net
->dev_base_head
);
247 hlist_add_head_rcu(&dev
->name_hlist
, dev_name_hash(net
, dev
->name
));
248 hlist_add_head_rcu(&dev
->index_hlist
,
249 dev_index_hash(net
, dev
->ifindex
));
250 write_unlock_bh(&dev_base_lock
);
252 dev_base_seq_inc(net
);
257 /* Device list removal
258 * caller must respect a RCU grace period before freeing/reusing dev
260 static void unlist_netdevice(struct net_device
*dev
)
264 /* Unlink dev from the device chain */
265 write_lock_bh(&dev_base_lock
);
266 list_del_rcu(&dev
->dev_list
);
267 hlist_del_rcu(&dev
->name_hlist
);
268 hlist_del_rcu(&dev
->index_hlist
);
269 write_unlock_bh(&dev_base_lock
);
271 dev_base_seq_inc(dev_net(dev
));
278 static RAW_NOTIFIER_HEAD(netdev_chain
);
281 * Device drivers call our routines to queue packets here. We empty the
282 * queue in the local softnet handler.
285 DEFINE_PER_CPU_ALIGNED(struct softnet_data
, softnet_data
);
286 EXPORT_PER_CPU_SYMBOL(softnet_data
);
288 #ifdef CONFIG_LOCKDEP
290 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
291 * according to dev->type
293 static const unsigned short netdev_lock_type
[] =
294 {ARPHRD_NETROM
, ARPHRD_ETHER
, ARPHRD_EETHER
, ARPHRD_AX25
,
295 ARPHRD_PRONET
, ARPHRD_CHAOS
, ARPHRD_IEEE802
, ARPHRD_ARCNET
,
296 ARPHRD_APPLETLK
, ARPHRD_DLCI
, ARPHRD_ATM
, ARPHRD_METRICOM
,
297 ARPHRD_IEEE1394
, ARPHRD_EUI64
, ARPHRD_INFINIBAND
, ARPHRD_SLIP
,
298 ARPHRD_CSLIP
, ARPHRD_SLIP6
, ARPHRD_CSLIP6
, ARPHRD_RSRVD
,
299 ARPHRD_ADAPT
, ARPHRD_ROSE
, ARPHRD_X25
, ARPHRD_HWX25
,
300 ARPHRD_PPP
, ARPHRD_CISCO
, ARPHRD_LAPB
, ARPHRD_DDCMP
,
301 ARPHRD_RAWHDLC
, ARPHRD_TUNNEL
, ARPHRD_TUNNEL6
, ARPHRD_FRAD
,
302 ARPHRD_SKIP
, ARPHRD_LOOPBACK
, ARPHRD_LOCALTLK
, ARPHRD_FDDI
,
303 ARPHRD_BIF
, ARPHRD_SIT
, ARPHRD_IPDDP
, ARPHRD_IPGRE
,
304 ARPHRD_PIMREG
, ARPHRD_HIPPI
, ARPHRD_ASH
, ARPHRD_ECONET
,
305 ARPHRD_IRDA
, ARPHRD_FCPP
, ARPHRD_FCAL
, ARPHRD_FCPL
,
306 ARPHRD_FCFABRIC
, ARPHRD_IEEE80211
, ARPHRD_IEEE80211_PRISM
,
307 ARPHRD_IEEE80211_RADIOTAP
, ARPHRD_PHONET
, ARPHRD_PHONET_PIPE
,
308 ARPHRD_IEEE802154
, ARPHRD_VOID
, ARPHRD_NONE
};
/* Human-readable lockdep class names; order must match netdev_lock_type[]. */
static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
327 static struct lock_class_key netdev_xmit_lock_key
[ARRAY_SIZE(netdev_lock_type
)];
328 static struct lock_class_key netdev_addr_lock_key
[ARRAY_SIZE(netdev_lock_type
)];
330 static inline unsigned short netdev_lock_pos(unsigned short dev_type
)
334 for (i
= 0; i
< ARRAY_SIZE(netdev_lock_type
); i
++)
335 if (netdev_lock_type
[i
] == dev_type
)
337 /* the last key is used by default */
338 return ARRAY_SIZE(netdev_lock_type
) - 1;
341 static inline void netdev_set_xmit_lockdep_class(spinlock_t
*lock
,
342 unsigned short dev_type
)
346 i
= netdev_lock_pos(dev_type
);
347 lockdep_set_class_and_name(lock
, &netdev_xmit_lock_key
[i
],
348 netdev_lock_name
[i
]);
351 static inline void netdev_set_addr_lockdep_class(struct net_device
*dev
)
355 i
= netdev_lock_pos(dev
->type
);
356 lockdep_set_class_and_name(&dev
->addr_list_lock
,
357 &netdev_addr_lock_key
[i
],
358 netdev_lock_name
[i
]);
361 static inline void netdev_set_xmit_lockdep_class(spinlock_t
*lock
,
362 unsigned short dev_type
)
365 static inline void netdev_set_addr_lockdep_class(struct net_device
*dev
)
370 /*******************************************************************************
372 Protocol management and registration routines
374 *******************************************************************************/
377 * Add a protocol ID to the list. Now that the input handler is
378 * smarter we can dispense with all the messy stuff that used to be
381 * BEWARE!!! Protocol handlers, mangling input packets,
382 * MUST BE last in hash buckets and checking protocol handlers
383 * MUST start from promiscuous ptype_all chain in net_bh.
384 * It is true now, do not change it.
385 * Explanation follows: if protocol handler, mangling packet, will
386 * be the first on list, it is not able to sense, that packet
387 * is cloned and should be copied-on-write, so that it will
388 * change it and subsequent readers will get broken packet.
392 static inline struct list_head
*ptype_head(const struct packet_type
*pt
)
394 if (pt
->type
== htons(ETH_P_ALL
))
397 return &ptype_base
[ntohs(pt
->type
) & PTYPE_HASH_MASK
];
401 * dev_add_pack - add packet handler
402 * @pt: packet type declaration
404 * Add a protocol handler to the networking stack. The passed &packet_type
405 * is linked into kernel lists and may not be freed until it has been
406 * removed from the kernel lists.
408 * This call does not sleep therefore it can not
409 * guarantee all CPU's that are in middle of receiving packets
410 * will see the new packet type (until the next received packet).
413 void dev_add_pack(struct packet_type
*pt
)
415 struct list_head
*head
= ptype_head(pt
);
417 spin_lock(&ptype_lock
);
418 list_add_rcu(&pt
->list
, head
);
419 spin_unlock(&ptype_lock
);
421 EXPORT_SYMBOL(dev_add_pack
);
424 * __dev_remove_pack - remove packet handler
425 * @pt: packet type declaration
427 * Remove a protocol handler that was previously added to the kernel
428 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
429 * from the kernel lists and can be freed or reused once this function
432 * The packet type might still be in use by receivers
433 * and must not be freed until after all the CPU's have gone
434 * through a quiescent state.
436 void __dev_remove_pack(struct packet_type
*pt
)
438 struct list_head
*head
= ptype_head(pt
);
439 struct packet_type
*pt1
;
441 spin_lock(&ptype_lock
);
443 list_for_each_entry(pt1
, head
, list
) {
445 list_del_rcu(&pt
->list
);
450 pr_warn("dev_remove_pack: %p not found\n", pt
);
452 spin_unlock(&ptype_lock
);
454 EXPORT_SYMBOL(__dev_remove_pack
);
457 * dev_remove_pack - remove packet handler
458 * @pt: packet type declaration
460 * Remove a protocol handler that was previously added to the kernel
461 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
462 * from the kernel lists and can be freed or reused once this function
465 * This call sleeps to guarantee that no CPU is looking at the packet
/* Remove @pt and wait for in-flight receivers to quiesce before returning. */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);
478 * dev_add_offload - register offload handlers
479 * @po: protocol offload declaration
481 * Add protocol offload handlers to the networking stack. The passed
482 * &proto_offload is linked into kernel lists and may not be freed until
483 * it has been removed from the kernel lists.
485 * This call does not sleep therefore it can not
486 * guarantee all CPU's that are in middle of receiving packets
487 * will see the new offload handlers (until the next received packet).
489 void dev_add_offload(struct packet_offload
*po
)
491 struct list_head
*head
= &offload_base
;
493 spin_lock(&offload_lock
);
494 list_add_rcu(&po
->list
, head
);
495 spin_unlock(&offload_lock
);
497 EXPORT_SYMBOL(dev_add_offload
);
500 * __dev_remove_offload - remove offload handler
501 * @po: packet offload declaration
503 * Remove a protocol offload handler that was previously added to the
504 * kernel offload handlers by dev_add_offload(). The passed &offload_type
505 * is removed from the kernel lists and can be freed or reused once this
508 * The packet type might still be in use by receivers
509 * and must not be freed until after all the CPU's have gone
510 * through a quiescent state.
512 void __dev_remove_offload(struct packet_offload
*po
)
514 struct list_head
*head
= &offload_base
;
515 struct packet_offload
*po1
;
517 spin_lock(&offload_lock
);
519 list_for_each_entry(po1
, head
, list
) {
521 list_del_rcu(&po
->list
);
526 pr_warn("dev_remove_offload: %p not found\n", po
);
528 spin_unlock(&offload_lock
);
530 EXPORT_SYMBOL(__dev_remove_offload
);
533 * dev_remove_offload - remove packet offload handler
534 * @po: packet offload declaration
536 * Remove a packet offload handler that was previously added to the kernel
537 * offload handlers by dev_add_offload(). The passed &offload_type is
538 * removed from the kernel lists and can be freed or reused once this
541 * This call sleeps to guarantee that no CPU is looking at the packet
/* Remove @po and wait for in-flight receivers to quiesce before returning. */
void dev_remove_offload(struct packet_offload *po)
{
	__dev_remove_offload(po);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);
552 /******************************************************************************
554 Device Boot-time Settings Routines
556 *******************************************************************************/
558 /* Boot time configuration table */
559 static struct netdev_boot_setup dev_boot_setup
[NETDEV_BOOT_SETUP_MAX
];
562 * netdev_boot_setup_add - add new setup entry
563 * @name: name of the device
564 * @map: configured settings for the device
566 * Adds new setup entry to the dev_boot_setup list. The function
567 * returns 0 on error and 1 on success. This is a generic routine to
570 static int netdev_boot_setup_add(char *name
, struct ifmap
*map
)
572 struct netdev_boot_setup
*s
;
576 for (i
= 0; i
< NETDEV_BOOT_SETUP_MAX
; i
++) {
577 if (s
[i
].name
[0] == '\0' || s
[i
].name
[0] == ' ') {
578 memset(s
[i
].name
, 0, sizeof(s
[i
].name
));
579 strlcpy(s
[i
].name
, name
, IFNAMSIZ
);
580 memcpy(&s
[i
].map
, map
, sizeof(s
[i
].map
));
585 return i
>= NETDEV_BOOT_SETUP_MAX
? 0 : 1;
589 * netdev_boot_setup_check - check boot time settings
590 * @dev: the netdevice
592 * Check boot time settings for the device.
593 * The found settings are set for the device to be used
594 * later in the device probing.
595 * Returns 0 if no settings found, 1 if they are.
597 int netdev_boot_setup_check(struct net_device
*dev
)
599 struct netdev_boot_setup
*s
= dev_boot_setup
;
602 for (i
= 0; i
< NETDEV_BOOT_SETUP_MAX
; i
++) {
603 if (s
[i
].name
[0] != '\0' && s
[i
].name
[0] != ' ' &&
604 !strcmp(dev
->name
, s
[i
].name
)) {
605 dev
->irq
= s
[i
].map
.irq
;
606 dev
->base_addr
= s
[i
].map
.base_addr
;
607 dev
->mem_start
= s
[i
].map
.mem_start
;
608 dev
->mem_end
= s
[i
].map
.mem_end
;
614 EXPORT_SYMBOL(netdev_boot_setup_check
);
618 * netdev_boot_base - get address from boot time settings
619 * @prefix: prefix for network device
620 * @unit: id for network device
622 * Check boot time settings for the base address of device.
623 * The found settings are set for the device to be used
624 * later in the device probing.
625 * Returns 0 if no settings found.
627 unsigned long netdev_boot_base(const char *prefix
, int unit
)
629 const struct netdev_boot_setup
*s
= dev_boot_setup
;
633 sprintf(name
, "%s%d", prefix
, unit
);
636 * If device already registered then return base of 1
637 * to indicate not to probe for this interface
639 if (__dev_get_by_name(&init_net
, name
))
642 for (i
= 0; i
< NETDEV_BOOT_SETUP_MAX
; i
++)
643 if (!strcmp(name
, s
[i
].name
))
644 return s
[i
].map
.base_addr
;
649 * Saves at boot time configured settings for any netdevice.
651 int __init
netdev_boot_setup(char *str
)
656 str
= get_options(str
, ARRAY_SIZE(ints
), ints
);
661 memset(&map
, 0, sizeof(map
));
665 map
.base_addr
= ints
[2];
667 map
.mem_start
= ints
[3];
669 map
.mem_end
= ints
[4];
671 /* Add new entry to the list */
672 return netdev_boot_setup_add(str
, &map
);
675 __setup("netdev=", netdev_boot_setup
);
677 /*******************************************************************************
679 Device Interface Subroutines
681 *******************************************************************************/
684 * __dev_get_by_name - find a device by its name
685 * @net: the applicable net namespace
686 * @name: name to find
688 * Find an interface by name. Must be called under RTNL semaphore
689 * or @dev_base_lock. If the name is found a pointer to the device
690 * is returned. If the name is not found then %NULL is returned. The
691 * reference counters are not incremented so the caller must be
692 * careful with locks.
695 struct net_device
*__dev_get_by_name(struct net
*net
, const char *name
)
697 struct hlist_node
*p
;
698 struct net_device
*dev
;
699 struct hlist_head
*head
= dev_name_hash(net
, name
);
701 hlist_for_each_entry(dev
, p
, head
, name_hlist
)
702 if (!strncmp(dev
->name
, name
, IFNAMSIZ
))
707 EXPORT_SYMBOL(__dev_get_by_name
);
710 * dev_get_by_name_rcu - find a device by its name
711 * @net: the applicable net namespace
712 * @name: name to find
714 * Find an interface by name.
715 * If the name is found a pointer to the device is returned.
716 * If the name is not found then %NULL is returned.
717 * The reference counters are not incremented so the caller must be
718 * careful with locks. The caller must hold RCU lock.
721 struct net_device
*dev_get_by_name_rcu(struct net
*net
, const char *name
)
723 struct hlist_node
*p
;
724 struct net_device
*dev
;
725 struct hlist_head
*head
= dev_name_hash(net
, name
);
727 hlist_for_each_entry_rcu(dev
, p
, head
, name_hlist
)
728 if (!strncmp(dev
->name
, name
, IFNAMSIZ
))
733 EXPORT_SYMBOL(dev_get_by_name_rcu
);
736 * dev_get_by_name - find a device by its name
737 * @net: the applicable net namespace
738 * @name: name to find
740 * Find an interface by name. This can be called from any
741 * context and does its own locking. The returned handle has
742 * the usage count incremented and the caller must use dev_put() to
743 * release it when it is no longer needed. %NULL is returned if no
744 * matching device is found.
/* Name lookup with its own locking; returned device (if any) has a
 * reference held -- the caller must dev_put() it.
 */
struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_name);
761 * __dev_get_by_index - find a device by its ifindex
762 * @net: the applicable net namespace
763 * @ifindex: index of device
765 * Search for an interface by index. Returns %NULL if the device
766 * is not found or a pointer to the device. The device has not
767 * had its reference counter increased so the caller must be careful
768 * about locking. The caller must hold either the RTNL semaphore
772 struct net_device
*__dev_get_by_index(struct net
*net
, int ifindex
)
774 struct hlist_node
*p
;
775 struct net_device
*dev
;
776 struct hlist_head
*head
= dev_index_hash(net
, ifindex
);
778 hlist_for_each_entry(dev
, p
, head
, index_hlist
)
779 if (dev
->ifindex
== ifindex
)
784 EXPORT_SYMBOL(__dev_get_by_index
);
787 * dev_get_by_index_rcu - find a device by its ifindex
788 * @net: the applicable net namespace
789 * @ifindex: index of device
791 * Search for an interface by index. Returns %NULL if the device
792 * is not found or a pointer to the device. The device has not
793 * had its reference counter increased so the caller must be careful
794 * about locking. The caller must hold RCU lock.
797 struct net_device
*dev_get_by_index_rcu(struct net
*net
, int ifindex
)
799 struct hlist_node
*p
;
800 struct net_device
*dev
;
801 struct hlist_head
*head
= dev_index_hash(net
, ifindex
);
803 hlist_for_each_entry_rcu(dev
, p
, head
, index_hlist
)
804 if (dev
->ifindex
== ifindex
)
809 EXPORT_SYMBOL(dev_get_by_index_rcu
);
813 * dev_get_by_index - find a device by its ifindex
814 * @net: the applicable net namespace
815 * @ifindex: index of device
817 * Search for an interface by index. Returns NULL if the device
818 * is not found or a pointer to the device. The device returned has
819 * had a reference added and the pointer is safe until the user calls
820 * dev_put to indicate they have finished with it.
/* ifindex lookup with its own locking; returned device (if any) has a
 * reference held -- the caller must dev_put() it.
 */
struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_index);
837 * dev_getbyhwaddr_rcu - find a device by its hardware address
838 * @net: the applicable net namespace
839 * @type: media type of device
840 * @ha: hardware address
842 * Search for an interface by MAC address. Returns NULL if the device
843 * is not found or a pointer to the device.
844 * The caller must hold RCU or RTNL.
845 * The returned device has not had its ref count increased
846 * and the caller must therefore be careful about locking
850 struct net_device
*dev_getbyhwaddr_rcu(struct net
*net
, unsigned short type
,
853 struct net_device
*dev
;
855 for_each_netdev_rcu(net
, dev
)
856 if (dev
->type
== type
&&
857 !memcmp(dev
->dev_addr
, ha
, dev
->addr_len
))
862 EXPORT_SYMBOL(dev_getbyhwaddr_rcu
);
864 struct net_device
*__dev_getfirstbyhwtype(struct net
*net
, unsigned short type
)
866 struct net_device
*dev
;
869 for_each_netdev(net
, dev
)
870 if (dev
->type
== type
)
875 EXPORT_SYMBOL(__dev_getfirstbyhwtype
);
877 struct net_device
*dev_getfirstbyhwtype(struct net
*net
, unsigned short type
)
879 struct net_device
*dev
, *ret
= NULL
;
882 for_each_netdev_rcu(net
, dev
)
883 if (dev
->type
== type
) {
891 EXPORT_SYMBOL(dev_getfirstbyhwtype
);
894 * dev_get_by_flags_rcu - find any device with given flags
895 * @net: the applicable net namespace
896 * @if_flags: IFF_* values
897 * @mask: bitmask of bits in if_flags to check
899 * Search for any interface with the given flags. Returns NULL if a device
900 * is not found or a pointer to the device. Must be called inside
901 * rcu_read_lock(), and result refcount is unchanged.
904 struct net_device
*dev_get_by_flags_rcu(struct net
*net
, unsigned short if_flags
,
907 struct net_device
*dev
, *ret
;
910 for_each_netdev_rcu(net
, dev
) {
911 if (((dev
->flags
^ if_flags
) & mask
) == 0) {
918 EXPORT_SYMBOL(dev_get_by_flags_rcu
);
921 * dev_valid_name - check if name is okay for network device
924 * Network device names need to be valid file names to
925 * to allow sysfs to work. We also disallow any kind of
928 bool dev_valid_name(const char *name
)
932 if (strlen(name
) >= IFNAMSIZ
)
934 if (!strcmp(name
, ".") || !strcmp(name
, ".."))
938 if (*name
== '/' || isspace(*name
))
944 EXPORT_SYMBOL(dev_valid_name
);
947 * __dev_alloc_name - allocate a name for a device
948 * @net: network namespace to allocate the device name in
949 * @name: name format string
950 * @buf: scratch buffer and result name string
952 * Passed a format string - eg "lt%d" it will try and find a suitable
953 * id. It scans list of devices to build up a free map, then chooses
954 * the first empty slot. The caller must hold the dev_base or rtnl lock
955 * while allocating the name and adding the device in order to avoid
957 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
958 * Returns the number of the unit assigned or a negative errno code.
961 static int __dev_alloc_name(struct net
*net
, const char *name
, char *buf
)
965 const int max_netdevices
= 8*PAGE_SIZE
;
966 unsigned long *inuse
;
967 struct net_device
*d
;
969 p
= strnchr(name
, IFNAMSIZ
-1, '%');
972 * Verify the string as this thing may have come from
973 * the user. There must be either one "%d" and no other "%"
976 if (p
[1] != 'd' || strchr(p
+ 2, '%'))
979 /* Use one page as a bit array of possible slots */
980 inuse
= (unsigned long *) get_zeroed_page(GFP_ATOMIC
);
984 for_each_netdev(net
, d
) {
985 if (!sscanf(d
->name
, name
, &i
))
987 if (i
< 0 || i
>= max_netdevices
)
990 /* avoid cases where sscanf is not exact inverse of printf */
991 snprintf(buf
, IFNAMSIZ
, name
, i
);
992 if (!strncmp(buf
, d
->name
, IFNAMSIZ
))
996 i
= find_first_zero_bit(inuse
, max_netdevices
);
997 free_page((unsigned long) inuse
);
1001 snprintf(buf
, IFNAMSIZ
, name
, i
);
1002 if (!__dev_get_by_name(net
, buf
))
1005 /* It is possible to run out of possible slots
1006 * when the name is long and there isn't enough space left
1007 * for the digits, or if all bits are used.
1013 * dev_alloc_name - allocate a name for a device
1015 * @name: name format string
1017 * Passed a format string - eg "lt%d" it will try and find a suitable
1018 * id. It scans list of devices to build up a free map, then chooses
1019 * the first empty slot. The caller must hold the dev_base or rtnl lock
1020 * while allocating the name and adding the device in order to avoid
1022 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1023 * Returns the number of the unit assigned or a negative errno code.
1026 int dev_alloc_name(struct net_device
*dev
, const char *name
)
1032 BUG_ON(!dev_net(dev
));
1034 ret
= __dev_alloc_name(net
, name
, buf
);
1036 strlcpy(dev
->name
, buf
, IFNAMSIZ
);
1039 EXPORT_SYMBOL(dev_alloc_name
);
1041 static int dev_alloc_name_ns(struct net
*net
,
1042 struct net_device
*dev
,
1048 ret
= __dev_alloc_name(net
, name
, buf
);
1050 strlcpy(dev
->name
, buf
, IFNAMSIZ
);
1054 static int dev_get_valid_name(struct net
*net
,
1055 struct net_device
*dev
,
1060 if (!dev_valid_name(name
))
1063 if (strchr(name
, '%'))
1064 return dev_alloc_name_ns(net
, dev
, name
);
1065 else if (__dev_get_by_name(net
, name
))
1067 else if (dev
->name
!= name
)
1068 strlcpy(dev
->name
, name
, IFNAMSIZ
);
1074 * dev_change_name - change name of a device
1076 * @newname: name (or format string) must be at least IFNAMSIZ
1078 * Change name of a device, can pass format strings "eth%d".
1081 int dev_change_name(struct net_device
*dev
, const char *newname
)
1083 char oldname
[IFNAMSIZ
];
1089 BUG_ON(!dev_net(dev
));
1092 if (dev
->flags
& IFF_UP
)
1095 write_seqcount_begin(&devnet_rename_seq
);
1097 if (strncmp(newname
, dev
->name
, IFNAMSIZ
) == 0) {
1098 write_seqcount_end(&devnet_rename_seq
);
1102 memcpy(oldname
, dev
->name
, IFNAMSIZ
);
1104 err
= dev_get_valid_name(net
, dev
, newname
);
1106 write_seqcount_end(&devnet_rename_seq
);
1111 ret
= device_rename(&dev
->dev
, dev
->name
);
1113 memcpy(dev
->name
, oldname
, IFNAMSIZ
);
1114 write_seqcount_end(&devnet_rename_seq
);
1118 write_seqcount_end(&devnet_rename_seq
);
1120 write_lock_bh(&dev_base_lock
);
1121 hlist_del_rcu(&dev
->name_hlist
);
1122 write_unlock_bh(&dev_base_lock
);
1126 write_lock_bh(&dev_base_lock
);
1127 hlist_add_head_rcu(&dev
->name_hlist
, dev_name_hash(net
, dev
->name
));
1128 write_unlock_bh(&dev_base_lock
);
1130 ret
= call_netdevice_notifiers(NETDEV_CHANGENAME
, dev
);
1131 ret
= notifier_to_errno(ret
);
1134 /* err >= 0 after dev_alloc_name() or stores the first errno */
1137 write_seqcount_begin(&devnet_rename_seq
);
1138 memcpy(dev
->name
, oldname
, IFNAMSIZ
);
1141 pr_err("%s: name change rollback failed: %d\n",
1150 * dev_set_alias - change ifalias of a device
1152 * @alias: name up to IFALIASZ
1153 * @len: limit of bytes to copy from info
1155 * Set ifalias for a device,
1157 int dev_set_alias(struct net_device
*dev
, const char *alias
, size_t len
)
1163 if (len
>= IFALIASZ
)
1167 kfree(dev
->ifalias
);
1168 dev
->ifalias
= NULL
;
1172 new_ifalias
= krealloc(dev
->ifalias
, len
+ 1, GFP_KERNEL
);
1175 dev
->ifalias
= new_ifalias
;
1177 strlcpy(dev
->ifalias
, alias
, len
+1);
1183 * netdev_features_change - device changes features
1184 * @dev: device to cause notification
1186 * Called to indicate a device has changed features.
1188 void netdev_features_change(struct net_device
*dev
)
1190 call_netdevice_notifiers(NETDEV_FEAT_CHANGE
, dev
);
1192 EXPORT_SYMBOL(netdev_features_change
);
1195 * netdev_state_change - device changes state
1196 * @dev: device to cause notification
1198 * Called to indicate a device has changed state. This function calls
1199 * the notifier chains for netdev_chain and sends a NEWLINK message
1200 * to the routing socket.
1202 void netdev_state_change(struct net_device
*dev
)
1204 if (dev
->flags
& IFF_UP
) {
1205 call_netdevice_notifiers(NETDEV_CHANGE
, dev
);
1206 rtmsg_ifinfo(RTM_NEWLINK
, dev
, 0);
1209 EXPORT_SYMBOL(netdev_state_change
);
1212 * netdev_notify_peers - notify network peers about existence of @dev
1213 * @dev: network device
1215 * Generate traffic such that interested network peers are aware of
1216 * @dev, such as by generating a gratuitous ARP. This may be used when
1217 * a device wants to inform the rest of the network about some sort of
1218 * reconfiguration such as a failover event or virtual machine
1221 void netdev_notify_peers(struct net_device
*dev
)
1224 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS
, dev
);
1227 EXPORT_SYMBOL(netdev_notify_peers
);
1230 * dev_load - load a network module
1231 * @net: the applicable net namespace
1232 * @name: name of interface
1234 * If a network interface is not present and the process has suitable
1235 * privileges this function loads the module. If module loading is not
1236 * available in this kernel then it becomes a nop.
1239 void dev_load(struct net
*net
, const char *name
)
1241 struct net_device
*dev
;
1245 dev
= dev_get_by_name_rcu(net
, name
);
1249 if (no_module
&& capable(CAP_NET_ADMIN
))
1250 no_module
= request_module("netdev-%s", name
);
1251 if (no_module
&& capable(CAP_SYS_MODULE
)) {
1252 if (!request_module("%s", name
))
1253 pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated). Use CAP_NET_ADMIN and alias netdev-%s instead.\n",
1257 EXPORT_SYMBOL(dev_load
);
1259 static int __dev_open(struct net_device
*dev
)
1261 const struct net_device_ops
*ops
= dev
->netdev_ops
;
1266 if (!netif_device_present(dev
))
1269 /* Block netpoll from trying to do any rx path servicing.
1270 * If we don't do this there is a chance ndo_poll_controller
1271 * or ndo_poll may be running while we open the device
1273 ret
= netpoll_rx_disable(dev
);
1277 ret
= call_netdevice_notifiers(NETDEV_PRE_UP
, dev
);
1278 ret
= notifier_to_errno(ret
);
1282 set_bit(__LINK_STATE_START
, &dev
->state
);
1284 if (ops
->ndo_validate_addr
)
1285 ret
= ops
->ndo_validate_addr(dev
);
1287 if (!ret
&& ops
->ndo_open
)
1288 ret
= ops
->ndo_open(dev
);
1290 netpoll_rx_enable(dev
);
1293 clear_bit(__LINK_STATE_START
, &dev
->state
);
1295 dev
->flags
|= IFF_UP
;
1296 net_dmaengine_get();
1297 dev_set_rx_mode(dev
);
1299 add_device_randomness(dev
->dev_addr
, dev
->addr_len
);
1306 * dev_open - prepare an interface for use.
1307 * @dev: device to open
1309 * Takes a device from down to up state. The device's private open
1310 * function is invoked and then the multicast lists are loaded. Finally
1311 * the device is moved into the up state and a %NETDEV_UP message is
1312 * sent to the netdev notifier chain.
1314 * Calling this function on an active interface is a nop. On a failure
1315 * a negative errno code is returned.
1317 int dev_open(struct net_device
*dev
)
1321 if (dev
->flags
& IFF_UP
)
1324 ret
= __dev_open(dev
);
1328 rtmsg_ifinfo(RTM_NEWLINK
, dev
, IFF_UP
|IFF_RUNNING
);
1329 call_netdevice_notifiers(NETDEV_UP
, dev
);
1333 EXPORT_SYMBOL(dev_open
);
1335 static int __dev_close_many(struct list_head
*head
)
1337 struct net_device
*dev
;
1342 list_for_each_entry(dev
, head
, unreg_list
) {
1343 call_netdevice_notifiers(NETDEV_GOING_DOWN
, dev
);
1345 clear_bit(__LINK_STATE_START
, &dev
->state
);
1347 /* Synchronize to scheduled poll. We cannot touch poll list, it
1348 * can be even on different cpu. So just clear netif_running().
1350 * dev->stop() will invoke napi_disable() on all of it's
1351 * napi_struct instances on this device.
1353 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1356 dev_deactivate_many(head
);
1358 list_for_each_entry(dev
, head
, unreg_list
) {
1359 const struct net_device_ops
*ops
= dev
->netdev_ops
;
1362 * Call the device specific close. This cannot fail.
1363 * Only if device is UP
1365 * We allow it to be called even after a DETACH hot-plug
1371 dev
->flags
&= ~IFF_UP
;
1372 net_dmaengine_put();
1378 static int __dev_close(struct net_device
*dev
)
1383 /* Temporarily disable netpoll until the interface is down */
1384 retval
= netpoll_rx_disable(dev
);
1388 list_add(&dev
->unreg_list
, &single
);
1389 retval
= __dev_close_many(&single
);
1392 netpoll_rx_enable(dev
);
1396 static int dev_close_many(struct list_head
*head
)
1398 struct net_device
*dev
, *tmp
;
1399 LIST_HEAD(tmp_list
);
1401 list_for_each_entry_safe(dev
, tmp
, head
, unreg_list
)
1402 if (!(dev
->flags
& IFF_UP
))
1403 list_move(&dev
->unreg_list
, &tmp_list
);
1405 __dev_close_many(head
);
1407 list_for_each_entry(dev
, head
, unreg_list
) {
1408 rtmsg_ifinfo(RTM_NEWLINK
, dev
, IFF_UP
|IFF_RUNNING
);
1409 call_netdevice_notifiers(NETDEV_DOWN
, dev
);
1412 /* rollback_registered_many needs the complete original list */
1413 list_splice(&tmp_list
, head
);
1418 * dev_close - shutdown an interface.
1419 * @dev: device to shutdown
1421 * This function moves an active device into down state. A
1422 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1423 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1426 int dev_close(struct net_device
*dev
)
1429 if (dev
->flags
& IFF_UP
) {
1432 /* Block netpoll rx while the interface is going down */
1433 ret
= netpoll_rx_disable(dev
);
1437 list_add(&dev
->unreg_list
, &single
);
1438 dev_close_many(&single
);
1441 netpoll_rx_enable(dev
);
1445 EXPORT_SYMBOL(dev_close
);
1449 * dev_disable_lro - disable Large Receive Offload on a device
1452 * Disable Large Receive Offload (LRO) on a net device. Must be
1453 * called under RTNL. This is needed if received packets may be
1454 * forwarded to another interface.
1456 void dev_disable_lro(struct net_device
*dev
)
1459 * If we're trying to disable lro on a vlan device
1460 * use the underlying physical device instead
1462 if (is_vlan_dev(dev
))
1463 dev
= vlan_dev_real_dev(dev
);
1465 dev
->wanted_features
&= ~NETIF_F_LRO
;
1466 netdev_update_features(dev
);
1468 if (unlikely(dev
->features
& NETIF_F_LRO
))
1469 netdev_WARN(dev
, "failed to disable LRO!\n");
1471 EXPORT_SYMBOL(dev_disable_lro
);
1474 static int dev_boot_phase
= 1;
1477 * register_netdevice_notifier - register a network notifier block
1480 * Register a notifier to be called when network device events occur.
1481 * The notifier passed is linked into the kernel structures and must
1482 * not be reused until it has been unregistered. A negative errno code
1483 * is returned on a failure.
1485 * When registered all registration and up events are replayed
1486 * to the new notifier to allow device to have a race free
1487 * view of the network device list.
1490 int register_netdevice_notifier(struct notifier_block
*nb
)
1492 struct net_device
*dev
;
1493 struct net_device
*last
;
1498 err
= raw_notifier_chain_register(&netdev_chain
, nb
);
1504 for_each_netdev(net
, dev
) {
1505 err
= nb
->notifier_call(nb
, NETDEV_REGISTER
, dev
);
1506 err
= notifier_to_errno(err
);
1510 if (!(dev
->flags
& IFF_UP
))
1513 nb
->notifier_call(nb
, NETDEV_UP
, dev
);
1524 for_each_netdev(net
, dev
) {
1528 if (dev
->flags
& IFF_UP
) {
1529 nb
->notifier_call(nb
, NETDEV_GOING_DOWN
, dev
);
1530 nb
->notifier_call(nb
, NETDEV_DOWN
, dev
);
1532 nb
->notifier_call(nb
, NETDEV_UNREGISTER
, dev
);
1537 raw_notifier_chain_unregister(&netdev_chain
, nb
);
1540 EXPORT_SYMBOL(register_netdevice_notifier
);
1543 * unregister_netdevice_notifier - unregister a network notifier block
1546 * Unregister a notifier previously registered by
1547 * register_netdevice_notifier(). The notifier is unlinked into the
1548 * kernel structures and may then be reused. A negative errno code
1549 * is returned on a failure.
1551 * After unregistering unregister and down device events are synthesized
1552 * for all devices on the device list to the removed notifier to remove
1553 * the need for special case cleanup code.
1556 int unregister_netdevice_notifier(struct notifier_block
*nb
)
1558 struct net_device
*dev
;
1563 err
= raw_notifier_chain_unregister(&netdev_chain
, nb
);
1568 for_each_netdev(net
, dev
) {
1569 if (dev
->flags
& IFF_UP
) {
1570 nb
->notifier_call(nb
, NETDEV_GOING_DOWN
, dev
);
1571 nb
->notifier_call(nb
, NETDEV_DOWN
, dev
);
1573 nb
->notifier_call(nb
, NETDEV_UNREGISTER
, dev
);
1580 EXPORT_SYMBOL(unregister_netdevice_notifier
);
1583 * call_netdevice_notifiers - call all network notifier blocks
1584 * @val: value passed unmodified to notifier function
1585 * @dev: net_device pointer passed unmodified to notifier function
1587 * Call all network notifier blocks. Parameters and return value
1588 * are as for raw_notifier_call_chain().
1591 int call_netdevice_notifiers(unsigned long val
, struct net_device
*dev
)
1594 return raw_notifier_call_chain(&netdev_chain
, val
, dev
);
1596 EXPORT_SYMBOL(call_netdevice_notifiers
);
1598 static struct static_key netstamp_needed __read_mostly
;
1599 #ifdef HAVE_JUMP_LABEL
1600 /* We are not allowed to call static_key_slow_dec() from irq context
1601 * If net_disable_timestamp() is called from irq context, defer the
1602 * static_key_slow_dec() calls.
1604 static atomic_t netstamp_needed_deferred
;
1607 void net_enable_timestamp(void)
1609 #ifdef HAVE_JUMP_LABEL
1610 int deferred
= atomic_xchg(&netstamp_needed_deferred
, 0);
1614 static_key_slow_dec(&netstamp_needed
);
1618 WARN_ON(in_interrupt());
1619 static_key_slow_inc(&netstamp_needed
);
1621 EXPORT_SYMBOL(net_enable_timestamp
);
1623 void net_disable_timestamp(void)
1625 #ifdef HAVE_JUMP_LABEL
1626 if (in_interrupt()) {
1627 atomic_inc(&netstamp_needed_deferred
);
1631 static_key_slow_dec(&netstamp_needed
);
1633 EXPORT_SYMBOL(net_disable_timestamp
);
1635 static inline void net_timestamp_set(struct sk_buff
*skb
)
1637 skb
->tstamp
.tv64
= 0;
1638 if (static_key_false(&netstamp_needed
))
1639 __net_timestamp(skb
);
1642 #define net_timestamp_check(COND, SKB) \
1643 if (static_key_false(&netstamp_needed)) { \
1644 if ((COND) && !(SKB)->tstamp.tv64) \
1645 __net_timestamp(SKB); \
1648 static int net_hwtstamp_validate(struct ifreq *ifr)
1650 struct hwtstamp_config cfg
;
1651 enum hwtstamp_tx_types tx_type
;
1652 enum hwtstamp_rx_filters rx_filter
;
1653 int tx_type_valid
= 0;
1654 int rx_filter_valid
= 0;
1656 if (copy_from_user(&cfg
, ifr
->ifr_data
, sizeof(cfg
)))
1659 if (cfg
.flags
) /* reserved for future extensions */
1662 tx_type
= cfg
.tx_type
;
1663 rx_filter
= cfg
.rx_filter
;
1666 case HWTSTAMP_TX_OFF
:
1667 case HWTSTAMP_TX_ON
:
1668 case HWTSTAMP_TX_ONESTEP_SYNC
:
1673 switch (rx_filter
) {
1674 case HWTSTAMP_FILTER_NONE
:
1675 case HWTSTAMP_FILTER_ALL
:
1676 case HWTSTAMP_FILTER_SOME
:
1677 case HWTSTAMP_FILTER_PTP_V1_L4_EVENT
:
1678 case HWTSTAMP_FILTER_PTP_V1_L4_SYNC
:
1679 case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ
:
1680 case HWTSTAMP_FILTER_PTP_V2_L4_EVENT
:
1681 case HWTSTAMP_FILTER_PTP_V2_L4_SYNC
:
1682 case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ
:
1683 case HWTSTAMP_FILTER_PTP_V2_L2_EVENT
:
1684 case HWTSTAMP_FILTER_PTP_V2_L2_SYNC
:
1685 case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ
:
1686 case HWTSTAMP_FILTER_PTP_V2_EVENT
:
1687 case HWTSTAMP_FILTER_PTP_V2_SYNC
:
1688 case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ
:
1689 rx_filter_valid
= 1;
1693 if (!tx_type_valid
|| !rx_filter_valid
)
1699 static inline bool is_skb_forwardable(struct net_device
*dev
,
1700 struct sk_buff
*skb
)
1704 if (!(dev
->flags
& IFF_UP
))
1707 len
= dev
->mtu
+ dev
->hard_header_len
+ VLAN_HLEN
;
1708 if (skb
->len
<= len
)
1711 /* if TSO is enabled, we don't care about the length as the packet
1712 * could be forwarded without being segmented before
1714 if (skb_is_gso(skb
))
1721 * dev_forward_skb - loopback an skb to another netif
1723 * @dev: destination network device
1724 * @skb: buffer to forward
1727 * NET_RX_SUCCESS (no congestion)
1728 * NET_RX_DROP (packet was dropped, but freed)
1730 * dev_forward_skb can be used for injecting an skb from the
1731 * start_xmit function of one device into the receive queue
1732 * of another device.
1734 * The receiving device may be in another namespace, so
1735 * we have to clear all information in the skb that could
1736 * impact namespace isolation.
1738 int dev_forward_skb(struct net_device
*dev
, struct sk_buff
*skb
)
1740 if (skb_shinfo(skb
)->tx_flags
& SKBTX_DEV_ZEROCOPY
) {
1741 if (skb_copy_ubufs(skb
, GFP_ATOMIC
)) {
1742 atomic_long_inc(&dev
->rx_dropped
);
1751 if (unlikely(!is_skb_forwardable(dev
, skb
))) {
1752 atomic_long_inc(&dev
->rx_dropped
);
1759 skb
->tstamp
.tv64
= 0;
1760 skb
->pkt_type
= PACKET_HOST
;
1761 skb
->protocol
= eth_type_trans(skb
, dev
);
1765 return netif_rx(skb
);
1767 EXPORT_SYMBOL_GPL(dev_forward_skb
);
1769 static inline int deliver_skb(struct sk_buff
*skb
,
1770 struct packet_type
*pt_prev
,
1771 struct net_device
*orig_dev
)
1773 if (unlikely(skb_orphan_frags(skb
, GFP_ATOMIC
)))
1775 atomic_inc(&skb
->users
);
1776 return pt_prev
->func(skb
, skb
->dev
, pt_prev
, orig_dev
);
1779 static inline bool skb_loop_sk(struct packet_type
*ptype
, struct sk_buff
*skb
)
1781 if (!ptype
->af_packet_priv
|| !skb
->sk
)
1784 if (ptype
->id_match
)
1785 return ptype
->id_match(ptype
, skb
->sk
);
1786 else if ((struct sock
*)ptype
->af_packet_priv
== skb
->sk
)
1793 * Support routine. Sends outgoing frames to any network
1794 * taps currently in use.
1797 static void dev_queue_xmit_nit(struct sk_buff
*skb
, struct net_device
*dev
)
1799 struct packet_type
*ptype
;
1800 struct sk_buff
*skb2
= NULL
;
1801 struct packet_type
*pt_prev
= NULL
;
1804 list_for_each_entry_rcu(ptype
, &ptype_all
, list
) {
1805 /* Never send packets back to the socket
1806 * they originated from - MvS (miquels@drinkel.ow.org)
1808 if ((ptype
->dev
== dev
|| !ptype
->dev
) &&
1809 (!skb_loop_sk(ptype
, skb
))) {
1811 deliver_skb(skb2
, pt_prev
, skb
->dev
);
1816 skb2
= skb_clone(skb
, GFP_ATOMIC
);
1820 net_timestamp_set(skb2
);
1822 /* skb->nh should be correctly
1823 set by sender, so that the second statement is
1824 just protection against buggy protocols.
1826 skb_reset_mac_header(skb2
);
1828 if (skb_network_header(skb2
) < skb2
->data
||
1829 skb2
->network_header
> skb2
->tail
) {
1830 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1831 ntohs(skb2
->protocol
),
1833 skb_reset_network_header(skb2
);
1836 skb2
->transport_header
= skb2
->network_header
;
1837 skb2
->pkt_type
= PACKET_OUTGOING
;
1842 pt_prev
->func(skb2
, skb
->dev
, pt_prev
, skb
->dev
);
1847 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1848 * @dev: Network device
1849 * @txq: number of queues available
1851 * If real_num_tx_queues is changed the tc mappings may no longer be
1852 * valid. To resolve this verify the tc mapping remains valid and if
1853 * not NULL the mapping. With no priorities mapping to this
1854 * offset/count pair it will no longer be used. In the worst case TC0
1855 * is invalid nothing can be done so disable priority mappings. If is
1856 * expected that drivers will fix this mapping if they can before
1857 * calling netif_set_real_num_tx_queues.
1859 static void netif_setup_tc(struct net_device
*dev
, unsigned int txq
)
1862 struct netdev_tc_txq
*tc
= &dev
->tc_to_txq
[0];
1864 /* If TC0 is invalidated disable TC mapping */
1865 if (tc
->offset
+ tc
->count
> txq
) {
1866 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1871 /* Invalidated prio to tc mappings set to TC0 */
1872 for (i
= 1; i
< TC_BITMASK
+ 1; i
++) {
1873 int q
= netdev_get_prio_tc_map(dev
, i
);
1875 tc
= &dev
->tc_to_txq
[q
];
1876 if (tc
->offset
+ tc
->count
> txq
) {
1877 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1879 netdev_set_prio_tc_map(dev
, i
, 0);
1885 static DEFINE_MUTEX(xps_map_mutex
);
1886 #define xmap_dereference(P) \
1887 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1889 static struct xps_map
*remove_xps_queue(struct xps_dev_maps
*dev_maps
,
1892 struct xps_map
*map
= NULL
;
1896 map
= xmap_dereference(dev_maps
->cpu_map
[cpu
]);
1898 for (pos
= 0; map
&& pos
< map
->len
; pos
++) {
1899 if (map
->queues
[pos
] == index
) {
1901 map
->queues
[pos
] = map
->queues
[--map
->len
];
1903 RCU_INIT_POINTER(dev_maps
->cpu_map
[cpu
], NULL
);
1904 kfree_rcu(map
, rcu
);
1914 static void netif_reset_xps_queues_gt(struct net_device
*dev
, u16 index
)
1916 struct xps_dev_maps
*dev_maps
;
1918 bool active
= false;
1920 mutex_lock(&xps_map_mutex
);
1921 dev_maps
= xmap_dereference(dev
->xps_maps
);
1926 for_each_possible_cpu(cpu
) {
1927 for (i
= index
; i
< dev
->num_tx_queues
; i
++) {
1928 if (!remove_xps_queue(dev_maps
, cpu
, i
))
1931 if (i
== dev
->num_tx_queues
)
1936 RCU_INIT_POINTER(dev
->xps_maps
, NULL
);
1937 kfree_rcu(dev_maps
, rcu
);
1940 for (i
= index
; i
< dev
->num_tx_queues
; i
++)
1941 netdev_queue_numa_node_write(netdev_get_tx_queue(dev
, i
),
1945 mutex_unlock(&xps_map_mutex
);
1948 static struct xps_map
*expand_xps_map(struct xps_map
*map
,
1951 struct xps_map
*new_map
;
1952 int alloc_len
= XPS_MIN_MAP_ALLOC
;
1955 for (pos
= 0; map
&& pos
< map
->len
; pos
++) {
1956 if (map
->queues
[pos
] != index
)
1961 /* Need to add queue to this CPU's existing map */
1963 if (pos
< map
->alloc_len
)
1966 alloc_len
= map
->alloc_len
* 2;
1969 /* Need to allocate new map to store queue on this CPU's map */
1970 new_map
= kzalloc_node(XPS_MAP_SIZE(alloc_len
), GFP_KERNEL
,
1975 for (i
= 0; i
< pos
; i
++)
1976 new_map
->queues
[i
] = map
->queues
[i
];
1977 new_map
->alloc_len
= alloc_len
;
1983 int netif_set_xps_queue(struct net_device
*dev
, struct cpumask
*mask
, u16 index
)
1985 struct xps_dev_maps
*dev_maps
, *new_dev_maps
= NULL
;
1986 struct xps_map
*map
, *new_map
;
1987 int maps_sz
= max_t(unsigned int, XPS_DEV_MAPS_SIZE
, L1_CACHE_BYTES
);
1988 int cpu
, numa_node_id
= -2;
1989 bool active
= false;
1991 mutex_lock(&xps_map_mutex
);
1993 dev_maps
= xmap_dereference(dev
->xps_maps
);
1995 /* allocate memory for queue storage */
1996 for_each_online_cpu(cpu
) {
1997 if (!cpumask_test_cpu(cpu
, mask
))
2001 new_dev_maps
= kzalloc(maps_sz
, GFP_KERNEL
);
2005 map
= dev_maps
? xmap_dereference(dev_maps
->cpu_map
[cpu
]) :
2008 map
= expand_xps_map(map
, cpu
, index
);
2012 RCU_INIT_POINTER(new_dev_maps
->cpu_map
[cpu
], map
);
2016 goto out_no_new_maps
;
2018 for_each_possible_cpu(cpu
) {
2019 if (cpumask_test_cpu(cpu
, mask
) && cpu_online(cpu
)) {
2020 /* add queue to CPU maps */
2023 map
= xmap_dereference(new_dev_maps
->cpu_map
[cpu
]);
2024 while ((pos
< map
->len
) && (map
->queues
[pos
] != index
))
2027 if (pos
== map
->len
)
2028 map
->queues
[map
->len
++] = index
;
2030 if (numa_node_id
== -2)
2031 numa_node_id
= cpu_to_node(cpu
);
2032 else if (numa_node_id
!= cpu_to_node(cpu
))
2035 } else if (dev_maps
) {
2036 /* fill in the new device map from the old device map */
2037 map
= xmap_dereference(dev_maps
->cpu_map
[cpu
]);
2038 RCU_INIT_POINTER(new_dev_maps
->cpu_map
[cpu
], map
);
2043 rcu_assign_pointer(dev
->xps_maps
, new_dev_maps
);
2045 /* Cleanup old maps */
2047 for_each_possible_cpu(cpu
) {
2048 new_map
= xmap_dereference(new_dev_maps
->cpu_map
[cpu
]);
2049 map
= xmap_dereference(dev_maps
->cpu_map
[cpu
]);
2050 if (map
&& map
!= new_map
)
2051 kfree_rcu(map
, rcu
);
2054 kfree_rcu(dev_maps
, rcu
);
2057 dev_maps
= new_dev_maps
;
2061 /* update Tx queue numa node */
2062 netdev_queue_numa_node_write(netdev_get_tx_queue(dev
, index
),
2063 (numa_node_id
>= 0) ? numa_node_id
:
2069 /* removes queue from unused CPUs */
2070 for_each_possible_cpu(cpu
) {
2071 if (cpumask_test_cpu(cpu
, mask
) && cpu_online(cpu
))
2074 if (remove_xps_queue(dev_maps
, cpu
, index
))
2078 /* free map if not active */
2080 RCU_INIT_POINTER(dev
->xps_maps
, NULL
);
2081 kfree_rcu(dev_maps
, rcu
);
2085 mutex_unlock(&xps_map_mutex
);
2089 /* remove any maps that we added */
2090 for_each_possible_cpu(cpu
) {
2091 new_map
= xmap_dereference(new_dev_maps
->cpu_map
[cpu
]);
2092 map
= dev_maps
? xmap_dereference(dev_maps
->cpu_map
[cpu
]) :
2094 if (new_map
&& new_map
!= map
)
2098 mutex_unlock(&xps_map_mutex
);
2100 kfree(new_dev_maps
);
2103 EXPORT_SYMBOL(netif_set_xps_queue
);
2107 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2108 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2110 int netif_set_real_num_tx_queues(struct net_device
*dev
, unsigned int txq
)
2114 if (txq
< 1 || txq
> dev
->num_tx_queues
)
2117 if (dev
->reg_state
== NETREG_REGISTERED
||
2118 dev
->reg_state
== NETREG_UNREGISTERING
) {
2121 rc
= netdev_queue_update_kobjects(dev
, dev
->real_num_tx_queues
,
2127 netif_setup_tc(dev
, txq
);
2129 if (txq
< dev
->real_num_tx_queues
) {
2130 qdisc_reset_all_tx_gt(dev
, txq
);
2132 netif_reset_xps_queues_gt(dev
, txq
);
2137 dev
->real_num_tx_queues
= txq
;
2140 EXPORT_SYMBOL(netif_set_real_num_tx_queues
);
2144 * netif_set_real_num_rx_queues - set actual number of RX queues used
2145 * @dev: Network device
2146 * @rxq: Actual number of RX queues
2148 * This must be called either with the rtnl_lock held or before
2149 * registration of the net device. Returns 0 on success, or a
2150 * negative error code. If called before registration, it always
2153 int netif_set_real_num_rx_queues(struct net_device
*dev
, unsigned int rxq
)
2157 if (rxq
< 1 || rxq
> dev
->num_rx_queues
)
2160 if (dev
->reg_state
== NETREG_REGISTERED
) {
2163 rc
= net_rx_queue_update_kobjects(dev
, dev
->real_num_rx_queues
,
2169 dev
->real_num_rx_queues
= rxq
;
2172 EXPORT_SYMBOL(netif_set_real_num_rx_queues
);
2176 * netif_get_num_default_rss_queues - default number of RSS queues
2178 * This routine should set an upper limit on the number of RSS queues
2179 * used by default by multiqueue devices.
2181 int netif_get_num_default_rss_queues(void)
2183 return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES
, num_online_cpus());
2185 EXPORT_SYMBOL(netif_get_num_default_rss_queues
);
2187 static inline void __netif_reschedule(struct Qdisc
*q
)
2189 struct softnet_data
*sd
;
2190 unsigned long flags
;
2192 local_irq_save(flags
);
2193 sd
= &__get_cpu_var(softnet_data
);
2194 q
->next_sched
= NULL
;
2195 *sd
->output_queue_tailp
= q
;
2196 sd
->output_queue_tailp
= &q
->next_sched
;
2197 raise_softirq_irqoff(NET_TX_SOFTIRQ
);
2198 local_irq_restore(flags
);
2201 void __netif_schedule(struct Qdisc
*q
)
2203 if (!test_and_set_bit(__QDISC_STATE_SCHED
, &q
->state
))
2204 __netif_reschedule(q
);
2206 EXPORT_SYMBOL(__netif_schedule
);
2208 void dev_kfree_skb_irq(struct sk_buff
*skb
)
2210 if (atomic_dec_and_test(&skb
->users
)) {
2211 struct softnet_data
*sd
;
2212 unsigned long flags
;
2214 local_irq_save(flags
);
2215 sd
= &__get_cpu_var(softnet_data
);
2216 skb
->next
= sd
->completion_queue
;
2217 sd
->completion_queue
= skb
;
2218 raise_softirq_irqoff(NET_TX_SOFTIRQ
);
2219 local_irq_restore(flags
);
2222 EXPORT_SYMBOL(dev_kfree_skb_irq
);
2224 void dev_kfree_skb_any(struct sk_buff
*skb
)
2226 if (in_irq() || irqs_disabled())
2227 dev_kfree_skb_irq(skb
);
2231 EXPORT_SYMBOL(dev_kfree_skb_any
);
2235 * netif_device_detach - mark device as removed
2236 * @dev: network device
2238 * Mark device as removed from system and therefore no longer available.
2240 void netif_device_detach(struct net_device
*dev
)
2242 if (test_and_clear_bit(__LINK_STATE_PRESENT
, &dev
->state
) &&
2243 netif_running(dev
)) {
2244 netif_tx_stop_all_queues(dev
);
2247 EXPORT_SYMBOL(netif_device_detach
);
2250 * netif_device_attach - mark device as attached
2251 * @dev: network device
2253 * Mark device as attached from system and restart if needed.
2255 void netif_device_attach(struct net_device
*dev
)
2257 if (!test_and_set_bit(__LINK_STATE_PRESENT
, &dev
->state
) &&
2258 netif_running(dev
)) {
2259 netif_tx_wake_all_queues(dev
);
2260 __netdev_watchdog_up(dev
);
2263 EXPORT_SYMBOL(netif_device_attach
);
2265 static void skb_warn_bad_offload(const struct sk_buff
*skb
)
2267 static const netdev_features_t null_features
= 0;
2268 struct net_device
*dev
= skb
->dev
;
2269 const char *driver
= "";
2271 if (dev
&& dev
->dev
.parent
)
2272 driver
= dev_driver_string(dev
->dev
.parent
);
2274 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2275 "gso_type=%d ip_summed=%d\n",
2276 driver
, dev
? &dev
->features
: &null_features
,
2277 skb
->sk
? &skb
->sk
->sk_route_caps
: &null_features
,
2278 skb
->len
, skb
->data_len
, skb_shinfo(skb
)->gso_size
,
2279 skb_shinfo(skb
)->gso_type
, skb
->ip_summed
);
2283 * Invalidate hardware checksum when packet is to be mangled, and
2284 * complete checksum manually on outgoing path.
2286 int skb_checksum_help(struct sk_buff
*skb
)
2289 int ret
= 0, offset
;
2291 if (skb
->ip_summed
== CHECKSUM_COMPLETE
)
2292 goto out_set_summed
;
2294 if (unlikely(skb_shinfo(skb
)->gso_size
)) {
2295 skb_warn_bad_offload(skb
);
2299 /* Before computing a checksum, we should make sure no frag could
2300 * be modified by an external entity : checksum could be wrong.
2302 if (skb_has_shared_frag(skb
)) {
2303 ret
= __skb_linearize(skb
);
2308 offset
= skb_checksum_start_offset(skb
);
2309 BUG_ON(offset
>= skb_headlen(skb
));
2310 csum
= skb_checksum(skb
, offset
, skb
->len
- offset
, 0);
2312 offset
+= skb
->csum_offset
;
2313 BUG_ON(offset
+ sizeof(__sum16
) > skb_headlen(skb
));
2315 if (skb_cloned(skb
) &&
2316 !skb_clone_writable(skb
, offset
+ sizeof(__sum16
))) {
2317 ret
= pskb_expand_head(skb
, 0, 0, GFP_ATOMIC
);
2322 *(__sum16
*)(skb
->data
+ offset
) = csum_fold(csum
);
2324 skb
->ip_summed
= CHECKSUM_NONE
;
2328 EXPORT_SYMBOL(skb_checksum_help
);
2330 /* openvswitch calls this on rx path, so we need a different check.
2332 static inline bool skb_needs_check(struct sk_buff
*skb
, bool tx_path
)
2335 return skb
->ip_summed
!= CHECKSUM_PARTIAL
;
2337 return skb
->ip_summed
== CHECKSUM_NONE
;
2341 * __skb_gso_segment - Perform segmentation on skb.
2342 * @skb: buffer to segment
2343 * @features: features for the output path (see dev->features)
2344 * @tx_path: whether it is called in TX path
2346 * This function segments the given skb and returns a list of segments.
2348 * It may return NULL if the skb requires no segmentation. This is
2349 * only possible when GSO is used for verifying header integrity.
2351 struct sk_buff
*__skb_gso_segment(struct sk_buff
*skb
,
2352 netdev_features_t features
, bool tx_path
)
2354 struct sk_buff
*segs
= ERR_PTR(-EPROTONOSUPPORT
);
2355 struct packet_offload
*ptype
;
2356 __be16 type
= skb
->protocol
;
2357 int vlan_depth
= ETH_HLEN
;
2360 while (type
== htons(ETH_P_8021Q
)) {
2361 struct vlan_hdr
*vh
;
2363 if (unlikely(!pskb_may_pull(skb
, vlan_depth
+ VLAN_HLEN
)))
2364 return ERR_PTR(-EINVAL
);
2366 vh
= (struct vlan_hdr
*)(skb
->data
+ vlan_depth
);
2367 type
= vh
->h_vlan_encapsulated_proto
;
2368 vlan_depth
+= VLAN_HLEN
;
2371 skb_reset_mac_header(skb
);
2372 skb
->mac_len
= skb
->network_header
- skb
->mac_header
;
2373 __skb_pull(skb
, skb
->mac_len
);
2375 if (unlikely(skb_needs_check(skb
, tx_path
))) {
2376 skb_warn_bad_offload(skb
);
2378 if (skb_header_cloned(skb
) &&
2379 (err
= pskb_expand_head(skb
, 0, 0, GFP_ATOMIC
)))
2380 return ERR_PTR(err
);
2384 list_for_each_entry_rcu(ptype
, &offload_base
, list
) {
2385 if (ptype
->type
== type
&& ptype
->callbacks
.gso_segment
) {
2386 if (unlikely(skb
->ip_summed
!= CHECKSUM_PARTIAL
)) {
2387 err
= ptype
->callbacks
.gso_send_check(skb
);
2388 segs
= ERR_PTR(err
);
2389 if (err
|| skb_gso_ok(skb
, features
))
2391 __skb_push(skb
, (skb
->data
-
2392 skb_network_header(skb
)));
2394 segs
= ptype
->callbacks
.gso_segment(skb
, features
);
2400 __skb_push(skb
, skb
->data
- skb_mac_header(skb
));
2404 EXPORT_SYMBOL(__skb_gso_segment
);
2406 /* Take action when hardware reception checksum errors are detected. */
2408 void netdev_rx_csum_fault(struct net_device
*dev
)
2410 if (net_ratelimit()) {
2411 pr_err("%s: hw csum failure\n", dev
? dev
->name
: "<unknown>");
2415 EXPORT_SYMBOL(netdev_rx_csum_fault
);
2418 /* Actually, we should eliminate this check as soon as we know, that:
2419 * 1. IOMMU is present and allows to map all the memory.
2420 * 2. No high memory really exists on this machine.
2423 static int illegal_highdma(struct net_device
*dev
, struct sk_buff
*skb
)
2425 #ifdef CONFIG_HIGHMEM
2427 if (!(dev
->features
& NETIF_F_HIGHDMA
)) {
2428 for (i
= 0; i
< skb_shinfo(skb
)->nr_frags
; i
++) {
2429 skb_frag_t
*frag
= &skb_shinfo(skb
)->frags
[i
];
2430 if (PageHighMem(skb_frag_page(frag
)))
2435 if (PCI_DMA_BUS_IS_PHYS
) {
2436 struct device
*pdev
= dev
->dev
.parent
;
2440 for (i
= 0; i
< skb_shinfo(skb
)->nr_frags
; i
++) {
2441 skb_frag_t
*frag
= &skb_shinfo(skb
)->frags
[i
];
2442 dma_addr_t addr
= page_to_phys(skb_frag_page(frag
));
2443 if (!pdev
->dma_mask
|| addr
+ PAGE_SIZE
- 1 > *pdev
->dma_mask
)
2452 void (*destructor
)(struct sk_buff
*skb
);
2455 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2457 static void dev_gso_skb_destructor(struct sk_buff
*skb
)
2459 struct dev_gso_cb
*cb
;
2462 struct sk_buff
*nskb
= skb
->next
;
2464 skb
->next
= nskb
->next
;
2467 } while (skb
->next
);
2469 cb
= DEV_GSO_CB(skb
);
2471 cb
->destructor(skb
);
2475 * dev_gso_segment - Perform emulated hardware segmentation on skb.
2476 * @skb: buffer to segment
2477 * @features: device features as applicable to this skb
2479 * This function segments the given skb and stores the list of segments
2482 static int dev_gso_segment(struct sk_buff
*skb
, netdev_features_t features
)
2484 struct sk_buff
*segs
;
2486 segs
= skb_gso_segment(skb
, features
);
2488 /* Verifying header integrity only. */
2493 return PTR_ERR(segs
);
2496 DEV_GSO_CB(skb
)->destructor
= skb
->destructor
;
2497 skb
->destructor
= dev_gso_skb_destructor
;
2502 static bool can_checksum_protocol(netdev_features_t features
, __be16 protocol
)
2504 return ((features
& NETIF_F_GEN_CSUM
) ||
2505 ((features
& NETIF_F_V4_CSUM
) &&
2506 protocol
== htons(ETH_P_IP
)) ||
2507 ((features
& NETIF_F_V6_CSUM
) &&
2508 protocol
== htons(ETH_P_IPV6
)) ||
2509 ((features
& NETIF_F_FCOE_CRC
) &&
2510 protocol
== htons(ETH_P_FCOE
)));
2513 static netdev_features_t
harmonize_features(struct sk_buff
*skb
,
2514 __be16 protocol
, netdev_features_t features
)
2516 if (skb
->ip_summed
!= CHECKSUM_NONE
&&
2517 !can_checksum_protocol(features
, protocol
)) {
2518 features
&= ~NETIF_F_ALL_CSUM
;
2519 features
&= ~NETIF_F_SG
;
2520 } else if (illegal_highdma(skb
->dev
, skb
)) {
2521 features
&= ~NETIF_F_SG
;
2527 netdev_features_t
netif_skb_features(struct sk_buff
*skb
)
2529 __be16 protocol
= skb
->protocol
;
2530 netdev_features_t features
= skb
->dev
->features
;
2532 if (skb_shinfo(skb
)->gso_segs
> skb
->dev
->gso_max_segs
)
2533 features
&= ~NETIF_F_GSO_MASK
;
2535 if (protocol
== htons(ETH_P_8021Q
)) {
2536 struct vlan_ethhdr
*veh
= (struct vlan_ethhdr
*)skb
->data
;
2537 protocol
= veh
->h_vlan_encapsulated_proto
;
2538 } else if (!vlan_tx_tag_present(skb
)) {
2539 return harmonize_features(skb
, protocol
, features
);
2542 features
&= (skb
->dev
->vlan_features
| NETIF_F_HW_VLAN_TX
);
2544 if (protocol
!= htons(ETH_P_8021Q
)) {
2545 return harmonize_features(skb
, protocol
, features
);
2547 features
&= NETIF_F_SG
| NETIF_F_HIGHDMA
| NETIF_F_FRAGLIST
|
2548 NETIF_F_GEN_CSUM
| NETIF_F_HW_VLAN_TX
;
2549 return harmonize_features(skb
, protocol
, features
);
2552 EXPORT_SYMBOL(netif_skb_features
);
2555 * Returns true if either:
2556 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
2557 * 2. skb is fragmented and the device does not support SG.
2559 static inline int skb_needs_linearize(struct sk_buff
*skb
,
2562 return skb_is_nonlinear(skb
) &&
2563 ((skb_has_frag_list(skb
) &&
2564 !(features
& NETIF_F_FRAGLIST
)) ||
2565 (skb_shinfo(skb
)->nr_frags
&&
2566 !(features
& NETIF_F_SG
)));
/*
 * dev_hard_start_xmit - hand an skb (or a dev_gso segment chain) to the driver
 * @skb: buffer (or head of a GSO segment chain) to transmit
 * @dev: device to transmit on
 * @txq: transmit queue the caller has locked
 *
 * Performs last-mile fixups (VLAN tag insertion, software GSO,
 * linearization, software checksum) before invoking the driver's
 * ndo_start_xmit.  Returns the driver's NETDEV_TX_* code.
 *
 * NOTE(review): several interior lines of this function were lost in
 * extraction and have been restored from the upstream source of this
 * kernel version — verify against VCS history.
 */
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
			struct netdev_queue *txq)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int rc = NETDEV_TX_OK;
	unsigned int skb_len;

	if (likely(!skb->next)) {
		netdev_features_t features;

		/*
		 * If device doesn't need skb->dst, release it right now while
		 * its hot in this cpu cache
		 */
		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
			skb_dst_drop(skb);

		features = netif_skb_features(skb);

		/* Device cannot insert the VLAN tag in hardware: push the
		 * tag into the payload in software.
		 */
		if (vlan_tx_tag_present(skb) &&
		    !(features & NETIF_F_HW_VLAN_TX)) {
			skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
			if (unlikely(!skb))
				goto out;

			skb->vlan_tci = 0;
		}

		/* If encapsulation offload request, verify we are testing
		 * hardware encapsulation features instead of standard
		 * features for the netdev
		 */
		if (skb->encapsulation)
			features &= dev->hw_enc_features;

		if (netif_needs_gso(skb, features)) {
			/* Software-segment; on success skb->next holds the
			 * segment chain which is sent via the gso loop below.
			 */
			if (unlikely(dev_gso_segment(skb, features)))
				goto out_kfree_skb;
			if (skb->next)
				goto gso;
		} else {
			if (skb_needs_linearize(skb, features) &&
			    __skb_linearize(skb))
				goto out_kfree_skb;

			/* If packet is not checksummed and device does not
			 * support checksumming for this protocol, complete
			 * checksumming here.
			 */
			if (skb->ip_summed == CHECKSUM_PARTIAL) {
				if (skb->encapsulation)
					skb_set_inner_transport_header(skb,
						skb_checksum_start_offset(skb));
				else
					skb_set_transport_header(skb,
						skb_checksum_start_offset(skb));
				if (!(features & NETIF_F_ALL_CSUM) &&
				     skb_checksum_help(skb))
					goto out_kfree_skb;
			}
		}

		/* Deliver a copy to any ETH_P_ALL taps (e.g. tcpdump). */
		if (!list_empty(&ptype_all))
			dev_queue_xmit_nit(skb, dev);

		/* Snapshot len: the driver may free skb before the trace. */
		skb_len = skb->len;
		rc = ops->ndo_start_xmit(skb, dev);
		trace_net_dev_xmit(skb, rc, dev, skb_len);
		if (rc == NETDEV_TX_OK)
			txq_trans_update(txq);
		return rc;
	}

gso:
	/* Transmit each software-generated GSO segment in turn. */
	do {
		struct sk_buff *nskb = skb->next;

		skb->next = nskb->next;
		nskb->next = NULL;

		/*
		 * If device doesn't need nskb->dst, release it right now while
		 * its hot in this cpu cache
		 */
		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
			skb_dst_drop(nskb);

		if (!list_empty(&ptype_all))
			dev_queue_xmit_nit(nskb, dev);

		skb_len = nskb->len;
		rc = ops->ndo_start_xmit(nskb, dev);
		trace_net_dev_xmit(nskb, rc, dev, skb_len);
		if (unlikely(rc != NETDEV_TX_OK)) {
			if (rc & ~NETDEV_TX_MASK)
				goto out_kfree_gso_skb;
			/* Driver is busy: relink the segment for retry. */
			nskb->next = skb->next;
			skb->next = nskb;
			return rc;
		}
		txq_trans_update(txq);
		if (unlikely(netif_xmit_stopped(txq) && skb->next))
			return NETDEV_TX_BUSY;
	} while (skb->next);

out_kfree_gso_skb:
	/* All segments sent: restore the original destructor so the
	 * head skb is accounted correctly when freed.
	 */
	if (likely(skb->next == NULL))
		skb->destructor = DEV_GSO_CB(skb)->destructor;
out_kfree_skb:
	kfree_skb(skb);
out:
	return rc;
}
/*
 * qdisc_pkt_len_init - record the on-wire packet length for the qdisc layer
 * @skb: packet about to be enqueued
 *
 * Stores skb->len in the qdisc cb area; for GSO packets the per-segment
 * header overhead is added so byte accounting reflects what will really
 * hit the wire after segmentation.
 */
static void qdisc_pkt_len_init(struct sk_buff *skb)
{
	const struct skb_shared_info *shinfo = skb_shinfo(skb);

	qdisc_skb_cb(skb)->pkt_len = skb->len;

	/* To get more precise estimation of bytes sent on wire,
	 * we add to pkt_len the headers size of all segments
	 */
	if (shinfo->gso_size) {
		unsigned int hdr_len;

		/* mac layer + network layer */
		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);

		/* + transport layer */
		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
			hdr_len += tcp_hdrlen(skb);
		else
			hdr_len += sizeof(struct udphdr);
		/* The first segment's headers are already in skb->len;
		 * add them once per additional segment.
		 */
		qdisc_skb_cb(skb)->pkt_len += (shinfo->gso_segs - 1) * hdr_len;
	}
}
/*
 * __dev_xmit_skb - enqueue (or directly transmit) an skb through a qdisc
 * @skb: packet to send
 * @q:   qdisc attached to the chosen tx queue
 * @dev: originating device
 * @txq: tx queue selected for this skb
 *
 * Returns a NET_XMIT_* code.  Work-conserving qdiscs with an empty queue
 * may bypass enqueueing entirely and transmit directly.
 *
 * NOTE(review): a few interior lines were lost in extraction and have
 * been restored from the upstream source — verify against VCS history.
 */
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
				 struct net_device *dev,
				 struct netdev_queue *txq)
{
	spinlock_t *root_lock = qdisc_lock(q);
	bool contended;
	int rc;

	qdisc_pkt_len_init(skb);
	qdisc_calculate_pkt_len(skb, q);
	/*
	 * Heuristic to force contended enqueues to serialize on a
	 * separate lock before trying to get qdisc main lock.
	 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
	 * and dequeue packets faster.
	 */
	contended = qdisc_is_running(q);
	if (unlikely(contended))
		spin_lock(&q->busylock);

	spin_lock(root_lock);
	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
		/* Qdisc is being torn down: drop. */
		kfree_skb(skb);
		rc = NET_XMIT_DROP;
	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
		   qdisc_run_begin(q)) {
		/*
		 * This is a work-conserving queue; there are no old skbs
		 * waiting to be sent out; and the qdisc is not running -
		 * xmit the skb directly.
		 */
		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
			skb_dst_force(skb);

		qdisc_bstats_update(q, skb);

		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		} else
			qdisc_run_end(q);

		rc = NET_XMIT_SUCCESS;
	} else {
		skb_dst_force(skb);
		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
		if (qdisc_run_begin(q)) {
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		}
	}
	spin_unlock(root_lock);
	if (unlikely(contended))
		spin_unlock(&q->busylock);
	return rc;
}
#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
/*
 * skb_update_prio - apply the net_prio cgroup priority to an outgoing skb
 * @skb: packet being transmitted
 *
 * Looks up the device's per-cgroup priority map (RCU-BH protected) and,
 * if the skb has no explicit priority yet but does have an owning socket,
 * stamps the priority configured for that socket's cgroup index.
 */
static void skb_update_prio(struct sk_buff *skb)
{
	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);

	/* Only override an unset priority; the cgroup index lives on the
	 * owning socket, so skbs without one are left alone.
	 */
	if (!skb->priority && skb->sk && map) {
		unsigned int prioidx = skb->sk->sk_cgrp_prioidx;

		/* Bounds-check: the map may be shorter than the index. */
		if (prioidx < map->priomap_len)
			skb->priority = map->priomap[prioidx];
	}
}
#else
#define skb_update_prio(skb)
#endif
/* Per-cpu recursion depth for dev_queue_xmit(); guards against
 * mis-configured virtual device stacks looping packets back into
 * themselves.  Transmission is aborted past RECURSION_LIMIT.
 */
static DEFINE_PER_CPU(int, xmit_recursion);
#define RECURSION_LIMIT 10
/**
 *	dev_loopback_xmit - loop back @skb
 *	@skb: buffer to transmit
 *
 *	Prepares the skb as if it had just been received (loopback packet
 *	type, checksum already verified) and reinjects it into the local
 *	receive path.  Always returns 0.
 *
 *	NOTE(review): the tail of this function was lost in extraction and
 *	has been restored from the upstream source — verify against VCS.
 */
int dev_loopback_xmit(struct sk_buff *skb)
{
	skb_reset_mac_header(skb);
	__skb_pull(skb, skb_network_offset(skb));
	skb->pkt_type = PACKET_LOOPBACK;
	skb->ip_summed = CHECKSUM_UNNECESSARY;
	/* A looped-back packet must carry a dst; pin it before reinjecting. */
	WARN_ON(!skb_dst(skb));
	skb_dst_force(skb);
	netif_rx_ni(skb);
	return 0;
}
EXPORT_SYMBOL(dev_loopback_xmit);
/**
 *	dev_queue_xmit - transmit a buffer
 *	@skb: buffer to transmit
 *
 *	Queue a buffer for transmission to a network device. The caller must
 *	have set the device and priority and built the buffer before calling
 *	this function. The function can be called from an interrupt.
 *
 *	A negative errno code is returned on a failure. A success does not
 *	guarantee the frame will be transmitted as it may be dropped due
 *	to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 *      I notice this method can also return errors from the queue disciplines,
 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
 *      be positive.
 *
 *      Regardless of the return value, the skb is consumed, so it is currently
 *      difficult to retry a send to this method.  (You can bump the ref count
 *      before sending to hold a reference for retry if you are careful.)
 *
 *      When calling this method, interrupts MUST be enabled.  This is because
 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 *
 * NOTE(review): interior lines of this function were lost in extraction and
 * have been restored from the upstream source — verify against VCS history.
 */
int dev_queue_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq;
	struct Qdisc *q;
	int rc = -ENOMEM;

	skb_reset_mac_header(skb);

	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	skb_update_prio(skb);

	txq = netdev_pick_tx(dev, skb);
	q = rcu_dereference_bh(txq->qdisc);

#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
#endif
	trace_net_dev_queue(skb);
	if (q->enqueue) {
		/* Normal case: device has a qdisc attached. */
		rc = __dev_xmit_skb(skb, q, dev, txq);
		goto out;
	}

	/* The device has no queue. Common case for software devices:
	   loopback, all the sorts of tunnels...

	   Really, it is unlikely that netif_tx_lock protection is necessary
	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
	   counters.)
	   However, it is possible, that they rely on protection
	   made by us here.

	   Check this and shot the lock. It is not prone from deadlocks.
	   Either shot noqueue qdisc, it is even simpler 8)
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		if (txq->xmit_lock_owner != cpu) {

			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
				goto recursion_alert;

			HARD_TX_LOCK(dev, txq, cpu);

			if (!netif_xmit_stopped(txq)) {
				__this_cpu_inc(xmit_recursion);
				rc = dev_hard_start_xmit(skb, dev, txq);
				__this_cpu_dec(xmit_recursion);
				if (dev_xmit_complete(rc)) {
					HARD_TX_UNLOCK(dev, txq);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev, txq);
			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
					     dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately
			 */
recursion_alert:
			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
					     dev->name);
		}
	}

	rc = -ENETDOWN;
	rcu_read_unlock_bh();

	kfree_skb(skb);
	return rc;
out:
	rcu_read_unlock_bh();
	return rc;
}
EXPORT_SYMBOL(dev_queue_xmit);
/*=======================================================================
			Receiver routines
  =======================================================================*/

/* Max number of pending skbs in a per-cpu backlog queue before drops. */
int netdev_max_backlog __read_mostly = 1000;
EXPORT_SYMBOL(netdev_max_backlog);

/* Timestamp packets before (1) or after (0) the RPS enqueue step. */
int netdev_tstamp_prequeue __read_mostly = 1;
/* Total packet budget for one net_rx_action() softirq run. */
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;            /* old backlog weight */
/* Called with irq disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
				     struct napi_struct *napi)
{
	/* Queue the napi context on this cpu's poll list and arm the RX
	 * softirq.  The caller guarantees irqs are off, which is why the
	 * raw __raise_softirq_irqoff() variant is safe here.
	 */
	list_add_tail(&napi->poll_list, &sd->poll_list);
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
2936 /* One global table that all flow-based protocols share. */
2937 struct rps_sock_flow_table __rcu
*rps_sock_flow_table __read_mostly
;
2938 EXPORT_SYMBOL(rps_sock_flow_table
);
2940 struct static_key rps_needed __read_mostly
;
2942 static struct rps_dev_flow
*
2943 set_rps_cpu(struct net_device
*dev
, struct sk_buff
*skb
,
2944 struct rps_dev_flow
*rflow
, u16 next_cpu
)
2946 if (next_cpu
!= RPS_NO_CPU
) {
2947 #ifdef CONFIG_RFS_ACCEL
2948 struct netdev_rx_queue
*rxqueue
;
2949 struct rps_dev_flow_table
*flow_table
;
2950 struct rps_dev_flow
*old_rflow
;
2955 /* Should we steer this flow to a different hardware queue? */
2956 if (!skb_rx_queue_recorded(skb
) || !dev
->rx_cpu_rmap
||
2957 !(dev
->features
& NETIF_F_NTUPLE
))
2959 rxq_index
= cpu_rmap_lookup_index(dev
->rx_cpu_rmap
, next_cpu
);
2960 if (rxq_index
== skb_get_rx_queue(skb
))
2963 rxqueue
= dev
->_rx
+ rxq_index
;
2964 flow_table
= rcu_dereference(rxqueue
->rps_flow_table
);
2967 flow_id
= skb
->rxhash
& flow_table
->mask
;
2968 rc
= dev
->netdev_ops
->ndo_rx_flow_steer(dev
, skb
,
2969 rxq_index
, flow_id
);
2973 rflow
= &flow_table
->flows
[flow_id
];
2975 if (old_rflow
->filter
== rflow
->filter
)
2976 old_rflow
->filter
= RPS_NO_FILTER
;
2980 per_cpu(softnet_data
, next_cpu
).input_queue_head
;
2983 rflow
->cpu
= next_cpu
;
2988 * get_rps_cpu is called from netif_receive_skb and returns the target
2989 * CPU from the RPS map of the receiving queue for a given skb.
2990 * rcu_read_lock must be held on entry.
2992 static int get_rps_cpu(struct net_device
*dev
, struct sk_buff
*skb
,
2993 struct rps_dev_flow
**rflowp
)
2995 struct netdev_rx_queue
*rxqueue
;
2996 struct rps_map
*map
;
2997 struct rps_dev_flow_table
*flow_table
;
2998 struct rps_sock_flow_table
*sock_flow_table
;
3002 if (skb_rx_queue_recorded(skb
)) {
3003 u16 index
= skb_get_rx_queue(skb
);
3004 if (unlikely(index
>= dev
->real_num_rx_queues
)) {
3005 WARN_ONCE(dev
->real_num_rx_queues
> 1,
3006 "%s received packet on queue %u, but number "
3007 "of RX queues is %u\n",
3008 dev
->name
, index
, dev
->real_num_rx_queues
);
3011 rxqueue
= dev
->_rx
+ index
;
3015 map
= rcu_dereference(rxqueue
->rps_map
);
3017 if (map
->len
== 1 &&
3018 !rcu_access_pointer(rxqueue
->rps_flow_table
)) {
3019 tcpu
= map
->cpus
[0];
3020 if (cpu_online(tcpu
))
3024 } else if (!rcu_access_pointer(rxqueue
->rps_flow_table
)) {
3028 skb_reset_network_header(skb
);
3029 if (!skb_get_rxhash(skb
))
3032 flow_table
= rcu_dereference(rxqueue
->rps_flow_table
);
3033 sock_flow_table
= rcu_dereference(rps_sock_flow_table
);
3034 if (flow_table
&& sock_flow_table
) {
3036 struct rps_dev_flow
*rflow
;
3038 rflow
= &flow_table
->flows
[skb
->rxhash
& flow_table
->mask
];
3041 next_cpu
= sock_flow_table
->ents
[skb
->rxhash
&
3042 sock_flow_table
->mask
];
3045 * If the desired CPU (where last recvmsg was done) is
3046 * different from current CPU (one in the rx-queue flow
3047 * table entry), switch if one of the following holds:
3048 * - Current CPU is unset (equal to RPS_NO_CPU).
3049 * - Current CPU is offline.
3050 * - The current CPU's queue tail has advanced beyond the
3051 * last packet that was enqueued using this table entry.
3052 * This guarantees that all previous packets for the flow
3053 * have been dequeued, thus preserving in order delivery.
3055 if (unlikely(tcpu
!= next_cpu
) &&
3056 (tcpu
== RPS_NO_CPU
|| !cpu_online(tcpu
) ||
3057 ((int)(per_cpu(softnet_data
, tcpu
).input_queue_head
-
3058 rflow
->last_qtail
)) >= 0)) {
3060 rflow
= set_rps_cpu(dev
, skb
, rflow
, next_cpu
);
3063 if (tcpu
!= RPS_NO_CPU
&& cpu_online(tcpu
)) {
3071 tcpu
= map
->cpus
[((u64
) skb
->rxhash
* map
->len
) >> 32];
3073 if (cpu_online(tcpu
)) {
3083 #ifdef CONFIG_RFS_ACCEL
3086 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3087 * @dev: Device on which the filter was set
3088 * @rxq_index: RX queue index
3089 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3090 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3092 * Drivers that implement ndo_rx_flow_steer() should periodically call
3093 * this function for each installed filter and remove the filters for
3094 * which it returns %true.
3096 bool rps_may_expire_flow(struct net_device
*dev
, u16 rxq_index
,
3097 u32 flow_id
, u16 filter_id
)
3099 struct netdev_rx_queue
*rxqueue
= dev
->_rx
+ rxq_index
;
3100 struct rps_dev_flow_table
*flow_table
;
3101 struct rps_dev_flow
*rflow
;
3106 flow_table
= rcu_dereference(rxqueue
->rps_flow_table
);
3107 if (flow_table
&& flow_id
<= flow_table
->mask
) {
3108 rflow
= &flow_table
->flows
[flow_id
];
3109 cpu
= ACCESS_ONCE(rflow
->cpu
);
3110 if (rflow
->filter
== filter_id
&& cpu
!= RPS_NO_CPU
&&
3111 ((int)(per_cpu(softnet_data
, cpu
).input_queue_head
-
3112 rflow
->last_qtail
) <
3113 (int)(10 * flow_table
->mask
)))
3119 EXPORT_SYMBOL(rps_may_expire_flow
);
3121 #endif /* CONFIG_RFS_ACCEL */
3123 /* Called from hardirq (IPI) context */
3124 static void rps_trigger_softirq(void *data
)
3126 struct softnet_data
*sd
= data
;
3128 ____napi_schedule(sd
, &sd
->backlog
);
3132 #endif /* CONFIG_RPS */
3135 * Check if this softnet_data structure is another cpu one
3136 * If yes, queue it to our IPI list and return 1
3139 static int rps_ipi_queued(struct softnet_data
*sd
)
3142 struct softnet_data
*mysd
= &__get_cpu_var(softnet_data
);
3145 sd
->rps_ipi_next
= mysd
->rps_ipi_list
;
3146 mysd
->rps_ipi_list
= sd
;
3148 __raise_softirq_irqoff(NET_RX_SOFTIRQ
);
3151 #endif /* CONFIG_RPS */
/*
 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
 * queue (may be a remote CPU queue).
 *
 * Returns NET_RX_SUCCESS or NET_RX_DROP (queue full).
 *
 * NOTE(review): several interior lines (locking, drop path, labels) were
 * lost in extraction and have been restored from the upstream source —
 * verify against VCS history.
 */
static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
			      unsigned int *qtail)
{
	struct softnet_data *sd;
	unsigned long flags;

	sd = &per_cpu(softnet_data, cpu);

	local_irq_save(flags);

	rps_lock(sd);
	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
		if (skb_queue_len(&sd->input_pkt_queue)) {
enqueue:
			__skb_queue_tail(&sd->input_pkt_queue, skb);
			input_queue_tail_incr_save(sd, qtail);
			rps_unlock(sd);
			local_irq_restore(flags);
			return NET_RX_SUCCESS;
		}

		/* Schedule NAPI for backlog device
		 * We can use non atomic operation since we own the queue lock
		 */
		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
			if (!rps_ipi_queued(sd))
				____napi_schedule(sd, &sd->backlog);
		}
		goto enqueue;
	}

	/* Backlog full: account the drop and free the skb. */
	sd->dropped++;
	rps_unlock(sd);

	local_irq_restore(flags);

	atomic_long_inc(&skb->dev->rx_dropped);
	kfree_skb(skb);
	return NET_RX_DROP;
}
3201 * netif_rx - post buffer to the network code
3202 * @skb: buffer to post
3204 * This function receives a packet from a device driver and queues it for
3205 * the upper (protocol) levels to process. It always succeeds. The buffer
3206 * may be dropped during processing for congestion control or by the
3210 * NET_RX_SUCCESS (no congestion)
3211 * NET_RX_DROP (packet was dropped)
3215 int netif_rx(struct sk_buff
*skb
)
3219 /* if netpoll wants it, pretend we never saw it */
3220 if (netpoll_rx(skb
))
3223 net_timestamp_check(netdev_tstamp_prequeue
, skb
);
3225 trace_netif_rx(skb
);
3227 if (static_key_false(&rps_needed
)) {
3228 struct rps_dev_flow voidflow
, *rflow
= &voidflow
;
3234 cpu
= get_rps_cpu(skb
->dev
, skb
, &rflow
);
3236 cpu
= smp_processor_id();
3238 ret
= enqueue_to_backlog(skb
, cpu
, &rflow
->last_qtail
);
3246 ret
= enqueue_to_backlog(skb
, get_cpu(), &qtail
);
3251 EXPORT_SYMBOL(netif_rx
);
3253 int netif_rx_ni(struct sk_buff
*skb
)
3258 err
= netif_rx(skb
);
3259 if (local_softirq_pending())
3265 EXPORT_SYMBOL(netif_rx_ni
);
3267 static void net_tx_action(struct softirq_action
*h
)
3269 struct softnet_data
*sd
= &__get_cpu_var(softnet_data
);
3271 if (sd
->completion_queue
) {
3272 struct sk_buff
*clist
;
3274 local_irq_disable();
3275 clist
= sd
->completion_queue
;
3276 sd
->completion_queue
= NULL
;
3280 struct sk_buff
*skb
= clist
;
3281 clist
= clist
->next
;
3283 WARN_ON(atomic_read(&skb
->users
));
3284 trace_kfree_skb(skb
, net_tx_action
);
3289 if (sd
->output_queue
) {
3292 local_irq_disable();
3293 head
= sd
->output_queue
;
3294 sd
->output_queue
= NULL
;
3295 sd
->output_queue_tailp
= &sd
->output_queue
;
3299 struct Qdisc
*q
= head
;
3300 spinlock_t
*root_lock
;
3302 head
= head
->next_sched
;
3304 root_lock
= qdisc_lock(q
);
3305 if (spin_trylock(root_lock
)) {
3306 smp_mb__before_clear_bit();
3307 clear_bit(__QDISC_STATE_SCHED
,
3310 spin_unlock(root_lock
);
3312 if (!test_bit(__QDISC_STATE_DEACTIVATED
,
3314 __netif_reschedule(q
);
3316 smp_mb__before_clear_bit();
3317 clear_bit(__QDISC_STATE_SCHED
,
3325 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3326 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3327 /* This hook is defined here for ATM LANE */
3328 int (*br_fdb_test_addr_hook
)(struct net_device
*dev
,
3329 unsigned char *addr
) __read_mostly
;
3330 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook
);
3333 #ifdef CONFIG_NET_CLS_ACT
3334 /* TODO: Maybe we should just force sch_ingress to be compiled in
3335 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3336 * a compare and 2 stores extra right now if we dont have it on
3337 * but have CONFIG_NET_CLS_ACT
3338 * NOTE: This doesn't stop any functionality; if you dont have
3339 * the ingress scheduler, you just can't add policies on ingress.
3342 static int ing_filter(struct sk_buff
*skb
, struct netdev_queue
*rxq
)
3344 struct net_device
*dev
= skb
->dev
;
3345 u32 ttl
= G_TC_RTTL(skb
->tc_verd
);
3346 int result
= TC_ACT_OK
;
3349 if (unlikely(MAX_RED_LOOP
< ttl
++)) {
3350 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3351 skb
->skb_iif
, dev
->ifindex
);
3355 skb
->tc_verd
= SET_TC_RTTL(skb
->tc_verd
, ttl
);
3356 skb
->tc_verd
= SET_TC_AT(skb
->tc_verd
, AT_INGRESS
);
3359 if (q
!= &noop_qdisc
) {
3360 spin_lock(qdisc_lock(q
));
3361 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED
, &q
->state
)))
3362 result
= qdisc_enqueue_root(skb
, q
);
3363 spin_unlock(qdisc_lock(q
));
3369 static inline struct sk_buff
*handle_ing(struct sk_buff
*skb
,
3370 struct packet_type
**pt_prev
,
3371 int *ret
, struct net_device
*orig_dev
)
3373 struct netdev_queue
*rxq
= rcu_dereference(skb
->dev
->ingress_queue
);
3375 if (!rxq
|| rxq
->qdisc
== &noop_qdisc
)
3379 *ret
= deliver_skb(skb
, *pt_prev
, orig_dev
);
3383 switch (ing_filter(skb
, rxq
)) {
3397 * netdev_rx_handler_register - register receive handler
3398 * @dev: device to register a handler for
3399 * @rx_handler: receive handler to register
3400 * @rx_handler_data: data pointer that is used by rx handler
3402 * Register a receive hander for a device. This handler will then be
3403 * called from __netif_receive_skb. A negative errno code is returned
3406 * The caller must hold the rtnl_mutex.
3408 * For a general description of rx_handler, see enum rx_handler_result.
3410 int netdev_rx_handler_register(struct net_device
*dev
,
3411 rx_handler_func_t
*rx_handler
,
3412 void *rx_handler_data
)
3416 if (dev
->rx_handler
)
3419 rcu_assign_pointer(dev
->rx_handler_data
, rx_handler_data
);
3420 rcu_assign_pointer(dev
->rx_handler
, rx_handler
);
3424 EXPORT_SYMBOL_GPL(netdev_rx_handler_register
);
3427 * netdev_rx_handler_unregister - unregister receive handler
3428 * @dev: device to unregister a handler from
3430 * Unregister a receive hander from a device.
3432 * The caller must hold the rtnl_mutex.
3434 void netdev_rx_handler_unregister(struct net_device
*dev
)
3438 RCU_INIT_POINTER(dev
->rx_handler
, NULL
);
3439 RCU_INIT_POINTER(dev
->rx_handler_data
, NULL
);
3441 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister
);
3444 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3445 * the special handling of PFMEMALLOC skbs.
3447 static bool skb_pfmemalloc_protocol(struct sk_buff
*skb
)
3449 switch (skb
->protocol
) {
3450 case __constant_htons(ETH_P_ARP
):
3451 case __constant_htons(ETH_P_IP
):
3452 case __constant_htons(ETH_P_IPV6
):
3453 case __constant_htons(ETH_P_8021Q
):
3460 static int __netif_receive_skb_core(struct sk_buff
*skb
, bool pfmemalloc
)
3462 struct packet_type
*ptype
, *pt_prev
;
3463 rx_handler_func_t
*rx_handler
;
3464 struct net_device
*orig_dev
;
3465 struct net_device
*null_or_dev
;
3466 bool deliver_exact
= false;
3467 int ret
= NET_RX_DROP
;
3470 net_timestamp_check(!netdev_tstamp_prequeue
, skb
);
3472 trace_netif_receive_skb(skb
);
3474 /* if we've gotten here through NAPI, check netpoll */
3475 if (netpoll_receive_skb(skb
))
3478 orig_dev
= skb
->dev
;
3480 skb_reset_network_header(skb
);
3481 if (!skb_transport_header_was_set(skb
))
3482 skb_reset_transport_header(skb
);
3483 skb_reset_mac_len(skb
);
3490 skb
->skb_iif
= skb
->dev
->ifindex
;
3492 __this_cpu_inc(softnet_data
.processed
);
3494 if (skb
->protocol
== cpu_to_be16(ETH_P_8021Q
)) {
3495 skb
= vlan_untag(skb
);
3500 #ifdef CONFIG_NET_CLS_ACT
3501 if (skb
->tc_verd
& TC_NCLS
) {
3502 skb
->tc_verd
= CLR_TC_NCLS(skb
->tc_verd
);
3510 list_for_each_entry_rcu(ptype
, &ptype_all
, list
) {
3511 if (!ptype
->dev
|| ptype
->dev
== skb
->dev
) {
3513 ret
= deliver_skb(skb
, pt_prev
, orig_dev
);
3519 #ifdef CONFIG_NET_CLS_ACT
3520 skb
= handle_ing(skb
, &pt_prev
, &ret
, orig_dev
);
3526 if (pfmemalloc
&& !skb_pfmemalloc_protocol(skb
))
3529 if (vlan_tx_tag_present(skb
)) {
3531 ret
= deliver_skb(skb
, pt_prev
, orig_dev
);
3534 if (vlan_do_receive(&skb
))
3536 else if (unlikely(!skb
))
3540 rx_handler
= rcu_dereference(skb
->dev
->rx_handler
);
3543 ret
= deliver_skb(skb
, pt_prev
, orig_dev
);
3546 switch (rx_handler(&skb
)) {
3547 case RX_HANDLER_CONSUMED
:
3549 case RX_HANDLER_ANOTHER
:
3551 case RX_HANDLER_EXACT
:
3552 deliver_exact
= true;
3553 case RX_HANDLER_PASS
:
3560 if (vlan_tx_nonzero_tag_present(skb
))
3561 skb
->pkt_type
= PACKET_OTHERHOST
;
3563 /* deliver only exact match when indicated */
3564 null_or_dev
= deliver_exact
? skb
->dev
: NULL
;
3566 type
= skb
->protocol
;
3567 list_for_each_entry_rcu(ptype
,
3568 &ptype_base
[ntohs(type
) & PTYPE_HASH_MASK
], list
) {
3569 if (ptype
->type
== type
&&
3570 (ptype
->dev
== null_or_dev
|| ptype
->dev
== skb
->dev
||
3571 ptype
->dev
== orig_dev
)) {
3573 ret
= deliver_skb(skb
, pt_prev
, orig_dev
);
3579 if (unlikely(skb_orphan_frags(skb
, GFP_ATOMIC
)))
3582 ret
= pt_prev
->func(skb
, skb
->dev
, pt_prev
, orig_dev
);
3585 atomic_long_inc(&skb
->dev
->rx_dropped
);
3587 /* Jamal, now you will not able to escape explaining
3588 * me how you were going to use this. :-)
/*
 * __netif_receive_skb - dispatch one received skb to the protocol layers
 * @skb: buffer to process
 *
 * Thin wrapper around __netif_receive_skb_core() that handles PFMEMALLOC
 * skbs (emergency-reserve memory) specially.
 */
static int __netif_receive_skb(struct sk_buff *skb)
{
	int ret;

	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
		unsigned long pflags = current->flags;

		/*
		 * PFMEMALLOC skbs are special, they should
		 * - be delivered to SOCK_MEMALLOC sockets only
		 * - stay away from userspace
		 * - have bounded memory usage
		 *
		 * Use PF_MEMALLOC as this saves us from propagating the allocation
		 * context down to all allocation sites.
		 */
		current->flags |= PF_MEMALLOC;
		ret = __netif_receive_skb_core(skb, true);
		tsk_restore_flags(current, pflags, PF_MEMALLOC);
	} else
		ret = __netif_receive_skb_core(skb, false);

	return ret;
}
3625 * netif_receive_skb - process receive buffer from network
3626 * @skb: buffer to process
3628 * netif_receive_skb() is the main receive data processing function.
3629 * It always succeeds. The buffer may be dropped during processing
3630 * for congestion control or by the protocol layers.
3632 * This function may only be called from softirq context and interrupts
3633 * should be enabled.
3635 * Return values (usually ignored):
3636 * NET_RX_SUCCESS: no congestion
3637 * NET_RX_DROP: packet was dropped
3639 int netif_receive_skb(struct sk_buff
*skb
)
3641 net_timestamp_check(netdev_tstamp_prequeue
, skb
);
3643 if (skb_defer_rx_timestamp(skb
))
3644 return NET_RX_SUCCESS
;
3647 if (static_key_false(&rps_needed
)) {
3648 struct rps_dev_flow voidflow
, *rflow
= &voidflow
;
3653 cpu
= get_rps_cpu(skb
->dev
, skb
, &rflow
);
3656 ret
= enqueue_to_backlog(skb
, cpu
, &rflow
->last_qtail
);
3663 return __netif_receive_skb(skb
);
3665 EXPORT_SYMBOL(netif_receive_skb
);
3667 /* Network device is going away, flush any packets still pending
3668 * Called with irqs disabled.
3670 static void flush_backlog(void *arg
)
3672 struct net_device
*dev
= arg
;
3673 struct softnet_data
*sd
= &__get_cpu_var(softnet_data
);
3674 struct sk_buff
*skb
, *tmp
;
3677 skb_queue_walk_safe(&sd
->input_pkt_queue
, skb
, tmp
) {
3678 if (skb
->dev
== dev
) {
3679 __skb_unlink(skb
, &sd
->input_pkt_queue
);
3681 input_queue_head_incr(sd
);
3686 skb_queue_walk_safe(&sd
->process_queue
, skb
, tmp
) {
3687 if (skb
->dev
== dev
) {
3688 __skb_unlink(skb
, &sd
->process_queue
);
3690 input_queue_head_incr(sd
);
3695 static int napi_gro_complete(struct sk_buff
*skb
)
3697 struct packet_offload
*ptype
;
3698 __be16 type
= skb
->protocol
;
3699 struct list_head
*head
= &offload_base
;
3702 BUILD_BUG_ON(sizeof(struct napi_gro_cb
) > sizeof(skb
->cb
));
3704 if (NAPI_GRO_CB(skb
)->count
== 1) {
3705 skb_shinfo(skb
)->gso_size
= 0;
3710 list_for_each_entry_rcu(ptype
, head
, list
) {
3711 if (ptype
->type
!= type
|| !ptype
->callbacks
.gro_complete
)
3714 err
= ptype
->callbacks
.gro_complete(skb
);
3720 WARN_ON(&ptype
->list
== head
);
3722 return NET_RX_SUCCESS
;
3726 return netif_receive_skb(skb
);
3729 /* napi->gro_list contains packets ordered by age.
3730 * youngest packets at the head of it.
3731 * Complete skbs in reverse order to reduce latencies.
3733 void napi_gro_flush(struct napi_struct
*napi
, bool flush_old
)
3735 struct sk_buff
*skb
, *prev
= NULL
;
3737 /* scan list and build reverse chain */
3738 for (skb
= napi
->gro_list
; skb
!= NULL
; skb
= skb
->next
) {
3743 for (skb
= prev
; skb
; skb
= prev
) {
3746 if (flush_old
&& NAPI_GRO_CB(skb
)->age
== jiffies
)
3750 napi_gro_complete(skb
);
3754 napi
->gro_list
= NULL
;
3756 EXPORT_SYMBOL(napi_gro_flush
);
/*
 * gro_list_prepare - mark which held GRO packets could merge with @skb
 * @napi: napi context owning the gro_list
 * @skb:  newly arrived packet
 *
 * For every packet already held on napi->gro_list, compares device,
 * VLAN tci and MAC header against @skb and records the result in
 * NAPI_GRO_CB(p)->same_flow; also clears each packet's flush flag.
 */
static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
{
	struct sk_buff *p;
	unsigned int maclen = skb->dev->hard_header_len;

	for (p = napi->gro_list; p; p = p->next) {
		unsigned long diffs;

		/* Same device and same VLAN tag are required for a merge. */
		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
		diffs |= p->vlan_tci ^ skb->vlan_tci;
		if (maclen == ETH_HLEN)
			/* Common Ethernet case: fast fixed-size compare. */
			diffs |= compare_ether_header(skb_mac_header(p),
						      skb_gro_mac_header(skb));
		else if (!diffs)
			/* Non-standard header length: full memcmp, but only
			 * if nothing differed so far.
			 */
			diffs = memcmp(skb_mac_header(p),
				       skb_gro_mac_header(skb),
				       maclen);
		NAPI_GRO_CB(p)->same_flow = !diffs;
		NAPI_GRO_CB(p)->flush = 0;
	}
}
3780 static enum gro_result
dev_gro_receive(struct napi_struct
*napi
, struct sk_buff
*skb
)
3782 struct sk_buff
**pp
= NULL
;
3783 struct packet_offload
*ptype
;
3784 __be16 type
= skb
->protocol
;
3785 struct list_head
*head
= &offload_base
;
3788 enum gro_result ret
;
3790 if (!(skb
->dev
->features
& NETIF_F_GRO
) || netpoll_rx_on(skb
))
3793 if (skb_is_gso(skb
) || skb_has_frag_list(skb
))
3796 gro_list_prepare(napi
, skb
);
3799 list_for_each_entry_rcu(ptype
, head
, list
) {
3800 if (ptype
->type
!= type
|| !ptype
->callbacks
.gro_receive
)
3803 skb_set_network_header(skb
, skb_gro_offset(skb
));
3804 mac_len
= skb
->network_header
- skb
->mac_header
;
3805 skb
->mac_len
= mac_len
;
3806 NAPI_GRO_CB(skb
)->same_flow
= 0;
3807 NAPI_GRO_CB(skb
)->flush
= 0;
3808 NAPI_GRO_CB(skb
)->free
= 0;
3810 pp
= ptype
->callbacks
.gro_receive(&napi
->gro_list
, skb
);
3815 if (&ptype
->list
== head
)
3818 same_flow
= NAPI_GRO_CB(skb
)->same_flow
;
3819 ret
= NAPI_GRO_CB(skb
)->free
? GRO_MERGED_FREE
: GRO_MERGED
;
3822 struct sk_buff
*nskb
= *pp
;
3826 napi_gro_complete(nskb
);
3833 if (NAPI_GRO_CB(skb
)->flush
|| napi
->gro_count
>= MAX_GRO_SKBS
)
3837 NAPI_GRO_CB(skb
)->count
= 1;
3838 NAPI_GRO_CB(skb
)->age
= jiffies
;
3839 skb_shinfo(skb
)->gso_size
= skb_gro_len(skb
);
3840 skb
->next
= napi
->gro_list
;
3841 napi
->gro_list
= skb
;
3845 if (skb_headlen(skb
) < skb_gro_offset(skb
)) {
3846 int grow
= skb_gro_offset(skb
) - skb_headlen(skb
);
3848 BUG_ON(skb
->end
- skb
->tail
< grow
);
3850 memcpy(skb_tail_pointer(skb
), NAPI_GRO_CB(skb
)->frag0
, grow
);
3853 skb
->data_len
-= grow
;
3855 skb_shinfo(skb
)->frags
[0].page_offset
+= grow
;
3856 skb_frag_size_sub(&skb_shinfo(skb
)->frags
[0], grow
);
3858 if (unlikely(!skb_frag_size(&skb_shinfo(skb
)->frags
[0]))) {
3859 skb_frag_unref(skb
, 0);
3860 memmove(skb_shinfo(skb
)->frags
,
3861 skb_shinfo(skb
)->frags
+ 1,
3862 --skb_shinfo(skb
)->nr_frags
* sizeof(skb_frag_t
));
3875 static gro_result_t
napi_skb_finish(gro_result_t ret
, struct sk_buff
*skb
)
3879 if (netif_receive_skb(skb
))
3887 case GRO_MERGED_FREE
:
3888 if (NAPI_GRO_CB(skb
)->free
== NAPI_GRO_FREE_STOLEN_HEAD
)
3889 kmem_cache_free(skbuff_head_cache
, skb
);
/*
 * skb_gro_reset_offset - initialise per-skb GRO state before receive
 * @skb: packet entering the GRO engine
 *
 * Resets the GRO data offset, and when the skb has an empty linear part
 * with its first page fragment in lowmem, records that fragment as
 * frag0 so header accesses can go straight to the page data.
 *
 * NOTE(review): one line of the condition was lost in extraction and has
 * been restored from the upstream source — verify against VCS history.
 */
static void skb_gro_reset_offset(struct sk_buff *skb)
{
	const struct skb_shared_info *pinfo = skb_shinfo(skb);
	const skb_frag_t *frag0 = &pinfo->frags[0];

	NAPI_GRO_CB(skb)->data_offset = 0;
	NAPI_GRO_CB(skb)->frag0 = NULL;
	NAPI_GRO_CB(skb)->frag0_len = 0;

	/* mac_header == tail means the linear area holds no payload. */
	if (skb->mac_header == skb->tail &&
	    pinfo->nr_frags &&
	    !PageHighMem(skb_frag_page(frag0))) {
		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
	}
}
/*
 * napi_gro_receive - feed a received skb through the GRO engine
 * @napi: napi context the packet arrived on
 * @skb:  packet to process
 *
 * Resets per-skb GRO state, lets dev_gro_receive() attempt a merge, and
 * finishes up (delivering, holding or freeing the skb) via
 * napi_skb_finish().  Returns the resulting gro_result_t.
 */
gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
	skb_gro_reset_offset(skb);

	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
}
EXPORT_SYMBOL(napi_gro_receive);
3927 static void napi_reuse_skb(struct napi_struct
*napi
, struct sk_buff
*skb
)
3929 __skb_pull(skb
, skb_headlen(skb
));
3930 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3931 skb_reserve(skb
, NET_SKB_PAD
+ NET_IP_ALIGN
- skb_headroom(skb
));
3933 skb
->dev
= napi
->dev
;
3939 struct sk_buff
*napi_get_frags(struct napi_struct
*napi
)
3941 struct sk_buff
*skb
= napi
->skb
;
3944 skb
= netdev_alloc_skb_ip_align(napi
->dev
, GRO_MAX_HEAD
);
3950 EXPORT_SYMBOL(napi_get_frags
);
3952 static gro_result_t
napi_frags_finish(struct napi_struct
*napi
, struct sk_buff
*skb
,
3958 skb
->protocol
= eth_type_trans(skb
, skb
->dev
);
3960 if (ret
== GRO_HELD
)
3961 skb_gro_pull(skb
, -ETH_HLEN
);
3962 else if (netif_receive_skb(skb
))
3967 case GRO_MERGED_FREE
:
3968 napi_reuse_skb(napi
, skb
);
3978 static struct sk_buff
*napi_frags_skb(struct napi_struct
*napi
)
3980 struct sk_buff
*skb
= napi
->skb
;
3987 skb_reset_mac_header(skb
);
3988 skb_gro_reset_offset(skb
);
3990 off
= skb_gro_offset(skb
);
3991 hlen
= off
+ sizeof(*eth
);
3992 eth
= skb_gro_header_fast(skb
, off
);
3993 if (skb_gro_header_hard(skb
, hlen
)) {
3994 eth
= skb_gro_header_slow(skb
, hlen
, off
);
3995 if (unlikely(!eth
)) {
3996 napi_reuse_skb(napi
, skb
);
4002 skb_gro_pull(skb
, sizeof(*eth
));
4005 * This works because the only protocols we care about don't require
4006 * special handling. We'll fix it up properly at the end.
4008 skb
->protocol
= eth
->h_proto
;
4014 gro_result_t
napi_gro_frags(struct napi_struct
*napi
)
4016 struct sk_buff
*skb
= napi_frags_skb(napi
);
4021 return napi_frags_finish(napi
, skb
, dev_gro_receive(napi
, skb
));
4023 EXPORT_SYMBOL(napi_gro_frags
);
/*
 * net_rps_action sends any pending IPI's for rps.
 * Note: called with local irq disabled, but exits with local irq enabled.
 * NOTE(review): fragment - the irq-enable and loop-advance lines are missing;
 * code left byte-identical.
 */
4029 static void net_rps_action_and_irq_enable(struct softnet_data
*sd
)
/* detach the IPI list while irqs are still off so remote cpus can't race */
4032 struct softnet_data
*remsd
= sd
->rps_ipi_list
;
4035 sd
->rps_ipi_list
= NULL
;
4039 /* Send pending IPI's to kick RPS processing on remote cpus. */
4041 struct softnet_data
*next
= remsd
->rps_ipi_next
;
4043 if (cpu_online(remsd
->cpu
))
4044 __smp_call_function_single(remsd
->cpu
,
/*
 * process_backlog - NAPI poll handler for the per-cpu backlog queue: drain
 * sd->process_queue up to @quota packets, refilling it from
 * sd->input_pkt_queue under irq-off protection.
 * NOTE(review): fragment - lock calls, returns and closing braces are
 * missing from this extract; code left byte-identical.
 */
4053 static int process_backlog(struct napi_struct
*napi
, int quota
)
4056 struct softnet_data
*sd
= container_of(napi
, struct softnet_data
, backlog
);
4059 /* Check if we have pending ipi, its better to send them now,
4060 * not waiting net_rx_action() end.
*/
4062 if (sd
->rps_ipi_list
) {
4063 local_irq_disable();
4064 net_rps_action_and_irq_enable(sd
);
4067 napi
->weight
= weight_p
;
4068 local_irq_disable();
4069 while (work
< quota
) {
4070 struct sk_buff
*skb
;
/* inner drain: deliver each queued skb with irqs enabled */
4073 while ((skb
= __skb_dequeue(&sd
->process_queue
))) {
4075 __netif_receive_skb(skb
);
4076 local_irq_disable();
4077 input_queue_head_incr(sd
);
4078 if (++work
>= quota
) {
4085 qlen
= skb_queue_len(&sd
->input_pkt_queue
);
/* refill: move everything newly queued by interrupts onto process_queue */
4087 skb_queue_splice_tail_init(&sd
->input_pkt_queue
,
4088 &sd
->process_queue
);
4090 if (qlen
< quota
- work
) {
/*
 * Inline a custom version of __napi_complete().
 * only current cpu owns and manipulates this napi,
 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
 * we can use a plain write instead of clear_bit(),
 * and we dont need an smp_mb() memory barrier.
 */
4098 list_del(&napi
->poll_list
);
4101 quota
= work
+ qlen
;
/*
 * __napi_schedule - schedule for receive
 * @n: entry to schedule
 *
 * The entry's receive function will be scheduled to run.
 * Queues @n on this cpu's softnet poll list with irqs saved/restored.
 * NOTE(review): fragment - braces missing; code left byte-identical.
 */
4116 void __napi_schedule(struct napi_struct
*n
)
4118 unsigned long flags
;
4120 local_irq_save(flags
);
4121 ____napi_schedule(&__get_cpu_var(softnet_data
), n
);
4122 local_irq_restore(flags
);
4124 EXPORT_SYMBOL(__napi_schedule
);
/*
 * __napi_complete - mark @n done polling: remove it from the poll list and
 * clear NAPI_STATE_SCHED with a barrier so re-scheduling is ordered.
 * Caller must guarantee irq-safe context.
 * NOTE(review): fragment - braces missing; code left byte-identical.
 */
4126 void __napi_complete(struct napi_struct
*n
)
/* sanity: must still be scheduled and must have no pending GRO packets */
4128 BUG_ON(!test_bit(NAPI_STATE_SCHED
, &n
->state
));
4129 BUG_ON(n
->gro_list
);
4131 list_del(&n
->poll_list
);
4132 smp_mb__before_clear_bit();
4133 clear_bit(NAPI_STATE_SCHED
, &n
->state
);
4135 EXPORT_SYMBOL(__napi_complete
);
/*
 * napi_complete - flush GRO state for @n, then complete it under
 * irq protection. Skipped entirely while netpoll is servicing the device.
 * NOTE(review): fragment - the __napi_complete() call line is missing from
 * this extract; code left byte-identical.
 */
4137 void napi_complete(struct napi_struct
*n
)
4139 unsigned long flags
;
/*
 * don't let napi dequeue from the cpu poll list
 * just in case its running on a different cpu
 */
4145 if (unlikely(test_bit(NAPI_STATE_NPSVC
, &n
->state
)))
4148 napi_gro_flush(n
, false);
4149 local_irq_save(flags
);
4151 local_irq_restore(flags
);
4153 EXPORT_SYMBOL(napi_complete
);
/*
 * netif_napi_add - initialise a NAPI context and attach it to @dev.
 * @poll is the driver's poll callback, @weight its per-poll packet budget.
 * The context starts in the SCHED state (disabled until enabled by driver).
 * NOTE(review): fragment - weight-warning and timer-init lines are missing;
 * code left byte-identical.
 */
4155 void netif_napi_add(struct net_device
*dev
, struct napi_struct
*napi
,
4156 int (*poll
)(struct napi_struct
*, int), int weight
)
4158 INIT_LIST_HEAD(&napi
->poll_list
);
4159 napi
->gro_count
= 0;
4160 napi
->gro_list
= NULL
;
4163 napi
->weight
= weight
;
4164 list_add(&napi
->dev_list
, &dev
->napi_list
);
4166 #ifdef CONFIG_NETPOLL
4167 spin_lock_init(&napi
->poll_lock
);
4168 napi
->poll_owner
= -1;
/* start SCHED so the instance cannot be polled before napi_enable() */
4170 set_bit(NAPI_STATE_SCHED
, &napi
->state
);
4172 EXPORT_SYMBOL(netif_napi_add
);
/*
 * netif_napi_del - detach @napi from its device and free all cached state:
 * the frags skb and every skb still sitting on the GRO list.
 * NOTE(review): fragment - the kfree_skb() inside the loop is missing from
 * this extract; code left byte-identical.
 */
4174 void netif_napi_del(struct napi_struct
*napi
)
4176 struct sk_buff
*skb
, *next
;
4178 list_del_init(&napi
->dev_list
);
4179 napi_free_frags(napi
);
/* walk gro_list via 'next' since each skb is freed as we go */
4181 for (skb
= napi
->gro_list
; skb
; skb
= next
) {
4187 napi
->gro_list
= NULL
;
4188 napi
->gro_count
= 0;
4190 EXPORT_SYMBOL(netif_napi_del
);
/*
 * net_rx_action - NET_RX softirq handler: round-robin the per-cpu poll list,
 * invoking each NAPI ->poll() with its weight until either the total budget
 * or the 2-jiffy time limit is exhausted, then re-raise the softirq if work
 * remains.
 * NOTE(review): fragment - budget accounting, loop braces and the softnet
 * break label are missing from this extract; code left byte-identical.
 */
4192 static void net_rx_action(struct softirq_action
*h
)
4194 struct softnet_data
*sd
= &__get_cpu_var(softnet_data
);
4195 unsigned long time_limit
= jiffies
+ 2;
4196 int budget
= netdev_budget
;
4199 local_irq_disable();
4201 while (!list_empty(&sd
->poll_list
)) {
4202 struct napi_struct
*n
;
4205 /* If softirq window is exhausted then punt.
4206 * Allow this to run for 2 jiffies since which will allow
4207 * an average latency of 1.5/HZ.
*/
4209 if (unlikely(budget
<= 0 || time_after(jiffies
, time_limit
)))
4214 /* Even though interrupts have been re-enabled, this
4215 * access is safe because interrupts can only add new
4216 * entries to the tail of this list, and only ->poll()
4217 * calls can remove this head entry from the list.
*/
4219 n
= list_first_entry(&sd
->poll_list
, struct napi_struct
, poll_list
);
4221 have
= netpoll_poll_lock(n
);
4225 /* This NAPI_STATE_SCHED test is for avoiding a race
4226 * with netpoll's poll_napi(). Only the entity which
4227 * obtains the lock and sees NAPI_STATE_SCHED set will
4228 * actually make the ->poll() call. Therefore we avoid
4229 * accidentally calling ->poll() when NAPI is not scheduled.
*/
4232 if (test_bit(NAPI_STATE_SCHED
, &n
->state
)) {
4233 work
= n
->poll(n
, weight
);
/* a driver returning more than its weight is a driver bug */
4237 WARN_ON_ONCE(work
> weight
);
4241 local_irq_disable();
4243 /* Drivers must not modify the NAPI state if they
4244 * consume the entire weight. In such cases this code
4245 * still "owns" the NAPI instance and therefore can
4246 * move the instance around on the list at-will.
*/
4248 if (unlikely(work
== weight
)) {
4249 if (unlikely(napi_disable_pending(n
))) {
4252 local_irq_disable();
4255 /* flush too old packets
4256 * If HZ < 1000, flush all packets.
*/
4259 napi_gro_flush(n
, HZ
>= 1000);
4260 local_irq_disable();
/* round-robin: fully-consumed pollers go to the tail of the list */
4262 list_move_tail(&n
->poll_list
, &sd
->poll_list
);
4266 netpoll_poll_unlock(have
);
4269 net_rps_action_and_irq_enable(sd
);
4271 #ifdef CONFIG_NET_DMA
/*
 * There may not be any more sk_buffs coming right now, so push
 * any pending DMA copies to hardware
 */
4276 dma_issue_pending_all();
/* out of budget/time with work left: re-arm the softirq */
4283 __raise_softirq_irqoff(NET_RX_SOFTIRQ
);
/* per-address-family SIOCGIFCONF dump handlers, indexed by family */
4287 static gifconf_func_t
*gifconf_list
[NPROTO
];
/*
 * register_gifconf - register a SIOCGIF handler
 * @family: Address family
 * @gifconf: Function handler
 *
 * Register protocol dependent address dumping routines. The handler
 * that is passed must not be freed or reused until it has been replaced
 * by another handler.
 * NOTE(review): fragment - the return statements are missing from this
 * extract; code left byte-identical.
 */
4298 int register_gifconf(unsigned int family
, gifconf_func_t
*gifconf
)
4300 if (family
>= NPROTO
)
4302 gifconf_list
[family
] = gifconf
;
4305 EXPORT_SYMBOL(register_gifconf
);
/*
 * Map an interface index to its name (SIOCGIFNAME)
 *
 * We need this ioctl for efficient implementation of the
 * if_indextoname() function required by the IPv6 API. Without
 * it, we would have to search all the interfaces to find a
 * match.
 * NOTE(review): fragment - RCU lock/unlock, error returns and the retry
 * loop structure are missing from this extract; code left byte-identical.
 */
4319 static int dev_ifname(struct net
*net
, struct ifreq __user
*arg
)
4321 struct net_device
*dev
;
/*
 * Fetch the caller's info block.
 */
4329 if (copy_from_user(&ifr
, arg
, sizeof(struct ifreq
)))
/* seqcount guards against a concurrent dev_change_name() */
4333 seq
= read_seqcount_begin(&devnet_rename_seq
);
4335 dev
= dev_get_by_index_rcu(net
, ifr
.ifr_ifindex
);
4341 strcpy(ifr
.ifr_name
, dev
->name
);
4343 if (read_seqcount_retry(&devnet_rename_seq
, seq
))
4346 if (copy_to_user(arg
, &ifr
, sizeof(struct ifreq
)))
/*
 * Perform a SIOCGIFCONF call. This structure will change
 * size eventually, and there is nothing I can do about it.
 * Thus we will need a 'compatibility mode'.
 * NOTE(review): fragment - local declarations, length checks and error
 * returns are missing from this extract; code left byte-identical.
 */
4357 static int dev_ifconf(struct net
*net
, char __user
*arg
)
4360 struct net_device
*dev
;
/*
 * Fetch the caller's info block.
 */
4370 if (copy_from_user(&ifc
, arg
, sizeof(struct ifconf
)))
/*
 * Loop over the interfaces, and write an info block for each.
 */
4381 for_each_netdev(net
, dev
) {
4382 for (i
= 0; i
< NPROTO
; i
++) {
4383 if (gifconf_list
[i
]) {
/* NULL buffer: handler only reports the space it would need */
4386 done
= gifconf_list
[i
](dev
, NULL
, 0);
4388 done
= gifconf_list
[i
](dev
, pos
+ total
,
/*
 * All done. Write the updated control block back to the caller.
 */
4400 ifc
.ifc_len
= total
;
/*
 * Both BSD and Solaris return 0 here, so we do too.
 */
4405 return copy_to_user(arg
, &ifc
, sizeof(struct ifconf
)) ? -EFAULT
: 0;
4408 #ifdef CONFIG_PROC_FS
/* seq_file *pos encoding: high bits = name-hash bucket, low bits = offset */
4410 #define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)
4412 #define get_bucket(x) ((x) >> BUCKET_SPACE)
4413 #define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
4414 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
/*
 * dev_from_same_bucket - walk one name-hash bucket under RCU and return the
 * device at the offset encoded in *pos, or NULL when the bucket is shorter.
 * NOTE(review): fragment - return statements missing; code byte-identical.
 */
4416 static inline struct net_device
*dev_from_same_bucket(struct seq_file
*seq
, loff_t
*pos
)
4418 struct net
*net
= seq_file_net(seq
);
4419 struct net_device
*dev
;
4420 struct hlist_node
*p
;
4421 struct hlist_head
*h
;
4422 unsigned int count
= 0, offset
= get_offset(*pos
);
4424 h
= &net
->dev_name_head
[get_bucket(*pos
)];
4425 hlist_for_each_entry_rcu(dev
, p
, h
, name_hlist
) {
4426 if (++count
== offset
)
/*
 * dev_from_bucket - like dev_from_same_bucket but advances *pos to the next
 * bucket (offset 1) until a device is found or all buckets are exhausted.
 */
4433 static inline struct net_device
*dev_from_bucket(struct seq_file
*seq
, loff_t
*pos
)
4435 struct net_device
*dev
;
4436 unsigned int bucket
;
4439 dev
= dev_from_same_bucket(seq
, pos
);
4443 bucket
= get_bucket(*pos
) + 1;
4444 *pos
= set_bucket_offset(bucket
, 1);
4445 } while (bucket
< NETDEV_HASHENTRIES
);
/*
 * This is invoked by the /proc filesystem handler to display a device
 * in detail. /proc/net/dev seq_file iterator + formatter.
 * NOTE(review): fragmentary section - rcu_read_lock/unlock, (*pos)++ and
 * several returns are missing from this extract; code left byte-identical.
 */
4454 void *dev_seq_start(struct seq_file
*seq
, loff_t
*pos
)
4459 return SEQ_START_TOKEN
;
4461 if (get_bucket(*pos
) >= NETDEV_HASHENTRIES
)
4464 return dev_from_bucket(seq
, pos
);
4467 void *dev_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
4470 return dev_from_bucket(seq
, pos
);
4473 void dev_seq_stop(struct seq_file
*seq
, void *v
)
/*
 * dev_seq_printf_stats - print one /proc/net/dev line: 8 RX then 8 TX
 * counters, with several error classes folded together per legacy format.
 */
4479 static void dev_seq_printf_stats(struct seq_file
*seq
, struct net_device
*dev
)
4481 struct rtnl_link_stats64 temp
;
4482 const struct rtnl_link_stats64
*stats
= dev_get_stats(dev
, &temp
);
4484 seq_printf(seq
, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4485 "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4486 dev
->name
, stats
->rx_bytes
, stats
->rx_packets
,
4488 stats
->rx_dropped
+ stats
->rx_missed_errors
,
4489 stats
->rx_fifo_errors
,
4490 stats
->rx_length_errors
+ stats
->rx_over_errors
+
4491 stats
->rx_crc_errors
+ stats
->rx_frame_errors
,
4492 stats
->rx_compressed
, stats
->multicast
,
4493 stats
->tx_bytes
, stats
->tx_packets
,
4494 stats
->tx_errors
, stats
->tx_dropped
,
4495 stats
->tx_fifo_errors
, stats
->collisions
,
4496 stats
->tx_carrier_errors
+
4497 stats
->tx_aborted_errors
+
4498 stats
->tx_window_errors
+
4499 stats
->tx_heartbeat_errors
,
4500 stats
->tx_compressed
);
/*
 * Called from the PROCfs module. This now uses the new arbitrary sized
 * /proc/net interface to create /proc/net/dev
 */
4507 static int dev_seq_show(struct seq_file
*seq
, void *v
)
4509 if (v
== SEQ_START_TOKEN
)
4510 seq_puts(seq
, "Inter-| Receive "
4512 " face |bytes    packets errs drop fifo frame "
4513 "compressed multicast|bytes    packets errs "
4514 "drop fifo colls carrier compressed\n");
4516 dev_seq_printf_stats(seq
, v
);
/*
 * /proc/net/softnet_stat iterator: one line per online cpu.
 * NOTE(review): fragmentary section - (*pos)++ advances, braces and returns
 * are missing from this extract; code left byte-identical.
 */
4520 static struct softnet_data
*softnet_get_online(loff_t
*pos
)
4522 struct softnet_data
*sd
= NULL
;
/* skip offline cpu slots until the next online one is found */
4524 while (*pos
< nr_cpu_ids
)
4525 if (cpu_online(*pos
)) {
4526 sd
= &per_cpu(softnet_data
, *pos
);
4533 static void *softnet_seq_start(struct seq_file
*seq
, loff_t
*pos
)
4535 return softnet_get_online(pos
);
4538 static void *softnet_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
4541 return softnet_get_online(pos
);
4544 static void softnet_seq_stop(struct seq_file
*seq
, void *v
)
/* one row of hex counters; zero columns kept for legacy format stability */
4548 static int softnet_seq_show(struct seq_file
*seq
, void *v
)
4550 struct softnet_data
*sd
= v
;
4552 seq_printf(seq
, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4553 sd
->processed
, sd
->dropped
, sd
->time_squeeze
, 0,
4554 0, 0, 0, 0, /* was fastroute */
4555 sd
->cpu_collision
, sd
->received_rps
);
/*
 * seq_file operation tables and open() helpers for /proc/net/dev and
 * /proc/net/softnet_stat.
 * NOTE(review): fragment - closing '};' lines and the .read members are
 * missing from this extract; code left byte-identical.
 */
4559 static const struct seq_operations dev_seq_ops
= {
4560 .start
= dev_seq_start
,
4561 .next
= dev_seq_next
,
4562 .stop
= dev_seq_stop
,
4563 .show
= dev_seq_show
,
4566 static int dev_seq_open(struct inode
*inode
, struct file
*file
)
/* seq_open_net: per-netns private data so the iterator sees its own netns */
4568 return seq_open_net(inode
, file
, &dev_seq_ops
,
4569 sizeof(struct seq_net_private
));
4572 static const struct file_operations dev_seq_fops
= {
4573 .owner
= THIS_MODULE
,
4574 .open
= dev_seq_open
,
4576 .llseek
= seq_lseek
,
4577 .release
= seq_release_net
,
4580 static const struct seq_operations softnet_seq_ops
= {
4581 .start
= softnet_seq_start
,
4582 .next
= softnet_seq_next
,
4583 .stop
= softnet_seq_stop
,
4584 .show
= softnet_seq_show
,
/* softnet stats are global (per-cpu, not per-netns): plain seq_open */
4587 static int softnet_seq_open(struct inode
*inode
, struct file
*file
)
4589 return seq_open(file
, &softnet_seq_ops
);
4592 static const struct file_operations softnet_seq_fops
= {
4593 .owner
= THIS_MODULE
,
4594 .open
= softnet_seq_open
,
4596 .llseek
= seq_lseek
,
4597 .release
= seq_release
,
/*
 * /proc/net/ptype iterator over registered packet_type handlers:
 * ptype_all first, then each ptype_base hash chain.
 * NOTE(review): fragmentary section - index bookkeeping, rcu annotations and
 * returns are missing from this extract; code left byte-identical.
 */
4600 static void *ptype_get_idx(loff_t pos
)
4602 struct packet_type
*pt
= NULL
;
4606 list_for_each_entry_rcu(pt
, &ptype_all
, list
) {
4612 for (t
= 0; t
< PTYPE_HASH_SIZE
; t
++) {
4613 list_for_each_entry_rcu(pt
, &ptype_base
[t
], list
) {
4622 static void *ptype_seq_start(struct seq_file
*seq
, loff_t
*pos
)
4626 return *pos
? ptype_get_idx(*pos
- 1) : SEQ_START_TOKEN
;
/*
 * ptype_seq_next - step from entry @v to its successor, falling through
 * from the ptype_all list into the first non-empty ptype_base chain.
 */
4629 static void *ptype_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
4631 struct packet_type
*pt
;
4632 struct list_head
*nxt
;
4636 if (v
== SEQ_START_TOKEN
)
4637 return ptype_get_idx(0);
4640 nxt
= pt
->list
.next
;
4641 if (pt
->type
== htons(ETH_P_ALL
)) {
4642 if (nxt
!= &ptype_all
)
4645 nxt
= ptype_base
[0].next
;
4647 hash
= ntohs(pt
->type
) & PTYPE_HASH_MASK
;
/* skip empty hash chains until a populated one (or the end) is reached */
4649 while (nxt
== &ptype_base
[hash
]) {
4650 if (++hash
>= PTYPE_HASH_SIZE
)
4652 nxt
= ptype_base
[hash
].next
;
4655 return list_entry(nxt
, struct packet_type
, list
);
4658 static void ptype_seq_stop(struct seq_file
*seq
, void *v
)
/* one line per handler visible in this netns: type, device, function */
4664 static int ptype_seq_show(struct seq_file
*seq
, void *v
)
4666 struct packet_type
*pt
= v
;
4668 if (v
== SEQ_START_TOKEN
)
4669 seq_puts(seq
, "Type Device      Function\n");
4670 else if (pt
->dev
== NULL
|| dev_net(pt
->dev
) == seq_file_net(seq
)) {
4671 if (pt
->type
== htons(ETH_P_ALL
))
4672 seq_puts(seq
, "ALL ");
4674 seq_printf(seq
, "%04x", ntohs(pt
->type
));
4676 seq_printf(seq
, " %-8s %pF\n",
4677 pt
->dev
? pt
->dev
->name
: "", pt
->func
);
/*
 * seq_file tables and open() for /proc/net/ptype (per-netns private data).
 * NOTE(review): fragment - closing '};' and .read lines are missing from
 * this extract; code left byte-identical.
 */
4683 static const struct seq_operations ptype_seq_ops
= {
4684 .start
= ptype_seq_start
,
4685 .next
= ptype_seq_next
,
4686 .stop
= ptype_seq_stop
,
4687 .show
= ptype_seq_show
,
4690 static int ptype_seq_open(struct inode
*inode
, struct file
*file
)
4692 return seq_open_net(inode
, file
, &ptype_seq_ops
,
4693 sizeof(struct seq_net_private
));
4696 static const struct file_operations ptype_seq_fops
= {
4697 .owner
= THIS_MODULE
,
4698 .open
= ptype_seq_open
,
4700 .llseek
= seq_lseek
,
4701 .release
= seq_release_net
,
/*
 * Per-netns registration/teardown of the /proc/net entries created above
 * (dev, softnet_stat, ptype, plus wireless-extensions proc files).
 * Error paths unwind in reverse creation order.
 * NOTE(review): fragment - goto labels and return statements are missing
 * from this extract; code left byte-identical.
 */
4705 static int __net_init
dev_proc_net_init(struct net
*net
)
4709 if (!proc_net_fops_create(net
, "dev", S_IRUGO
, &dev_seq_fops
))
4711 if (!proc_net_fops_create(net
, "softnet_stat", S_IRUGO
, &softnet_seq_fops
))
4713 if (!proc_net_fops_create(net
, "ptype", S_IRUGO
, &ptype_seq_fops
))
4716 if (wext_proc_init(net
))
/* unwind path: remove in reverse order of creation */
4722 proc_net_remove(net
, "ptype");
4724 proc_net_remove(net
, "softnet_stat");
4726 proc_net_remove(net
, "dev");
4730 static void __net_exit
dev_proc_net_exit(struct net
*net
)
4732 wext_proc_exit(net
);
4734 proc_net_remove(net
, "ptype");
4735 proc_net_remove(net
, "softnet_stat");
4736 proc_net_remove(net
, "dev");
4739 static struct pernet_operations __net_initdata dev_proc_ops
= {
4740 .init
= dev_proc_net_init
,
4741 .exit
= dev_proc_net_exit
,
4744 static int __init
dev_proc_init(void)
4746 return register_pernet_subsys(&dev_proc_ops
);
/* !CONFIG_PROC_FS stub */
4749 #define dev_proc_init() 0
4750 #endif /* CONFIG_PROC_FS */
/*
 * struct netdev_upper - one link from a device to a device stacked above it
 * (bond/bridge/vlan master etc.), chained on dev->upper_dev_list.
 * search_list is scratch linkage used by the loop-detection walk.
 * NOTE(review): fragment - the 'bool master' member line is missing from
 * this extract; code left byte-identical.
 */
4753 struct netdev_upper
{
4754 struct net_device
*dev
;
4756 struct list_head list
;
4757 struct rcu_head rcu
;
4758 struct list_head search_list
;
/*
 * __append_search_uppers - enqueue every not-yet-visited upper of @dev onto
 * @search_list (BFS frontier for __netdev_search_upper_dev). An empty
 * search_list node means "not yet queued". Caller holds RTNL.
 */
4761 static void __append_search_uppers(struct list_head
*search_list
,
4762 struct net_device
*dev
)
4764 struct netdev_upper
*upper
;
4766 list_for_each_entry(upper
, &dev
->upper_dev_list
, list
) {
4767 /* check if this upper is not already in search list */
4768 if (list_empty(&upper
->search_list
))
4769 list_add_tail(&upper
->search_list
, search_list
);
/*
 * __netdev_search_upper_dev - breadth-first walk of @dev's transitive upper
 * devices looking for @upper_dev; used to reject linkings that would create
 * a cycle. The scratch search_list nodes are reinitialised before returning.
 * NOTE(review): fragment - the ret variable and return lines are missing
 * from this extract; code left byte-identical.
 */
4773 static bool __netdev_search_upper_dev(struct net_device
*dev
,
4774 struct net_device
*upper_dev
)
4776 LIST_HEAD(search_list
);
4777 struct netdev_upper
*upper
;
4778 struct netdev_upper
*tmp
;
4781 __append_search_uppers(&search_list
, dev
);
4782 list_for_each_entry(upper
, &search_list
, search_list
) {
4783 if (upper
->dev
== upper_dev
) {
/* frontier expansion happens while iterating the same list (BFS) */
4787 __append_search_uppers(&search_list
, upper
->dev
);
/* reset scratch linkage so a later search starts clean */
4789 list_for_each_entry_safe(upper
, tmp
, &search_list
, search_list
)
4790 INIT_LIST_HEAD(&upper
->search_list
);
/*
 * __netdev_find_upper - return the netdev_upper entry linking @dev to
 * @upper_dev, or NULL when no direct link exists. Caller holds RTNL.
 * NOTE(review): fragment - return lines missing; code left byte-identical.
 */
4794 static struct netdev_upper
*__netdev_find_upper(struct net_device
*dev
,
4795 struct net_device
*upper_dev
)
4797 struct netdev_upper
*upper
;
4799 list_for_each_entry(upper
, &dev
->upper_dev_list
, list
) {
4800 if (upper
->dev
== upper_dev
)
/*
 * netdev_has_upper_dev - Check if device is linked to an upper device
 * @dev: device to check
 * @upper_dev: upper device to check
 *
 * Find out if a device is linked to specified upper device and return true
 * in case it is. Note that this checks only immediate upper device,
 * not through a complete stack of devices. The caller must hold the RTNL lock.
 */
4815 bool netdev_has_upper_dev(struct net_device
*dev
,
4816 struct net_device
*upper_dev
)
/* non-NULL pointer result converts to the boolean answer */
4820 return __netdev_find_upper(dev
, upper_dev
);
4822 EXPORT_SYMBOL(netdev_has_upper_dev
);
/*
 * netdev_has_any_upper_dev - Check if device is linked to some device
 * @dev: device to check
 *
 * Find out if a device is linked to an upper device and return true in case
 * it is. The caller must hold the RTNL lock.
 */
4831 bool netdev_has_any_upper_dev(struct net_device
*dev
)
4835 return !list_empty(&dev
->upper_dev_list
);
4837 EXPORT_SYMBOL(netdev_has_any_upper_dev
);
/*
 * netdev_master_upper_dev_get - Get master upper device
 * @dev: device
 *
 * Find a master upper device and return pointer to it or NULL in case
 * it's not there. The caller must hold the RTNL lock.
 * Relies on the invariant that a master link is always first in the list.
 * NOTE(review): fragment - return statements missing; code byte-identical.
 */
4846 struct net_device
*netdev_master_upper_dev_get(struct net_device
*dev
)
4848 struct netdev_upper
*upper
;
4852 if (list_empty(&dev
->upper_dev_list
))
4855 upper
= list_first_entry(&dev
->upper_dev_list
,
4856 struct netdev_upper
, list
);
4857 if (likely(upper
->master
))
4861 EXPORT_SYMBOL(netdev_master_upper_dev_get
);
/*
 * netdev_master_upper_dev_get_rcu - Get master upper device
 * @dev: device
 *
 * Find a master upper device and return pointer to it or NULL in case
 * it's not there. The caller must hold the RCU read lock.
 * NOTE(review): fragment - return statements missing; code byte-identical.
 */
4870 struct net_device
*netdev_master_upper_dev_get_rcu(struct net_device
*dev
)
4872 struct netdev_upper
*upper
;
4874 upper
= list_first_or_null_rcu(&dev
->upper_dev_list
,
4875 struct netdev_upper
, list
);
4876 if (upper
&& likely(upper
->master
))
4880 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu
);
/*
 * __netdev_upper_dev_link - common worker for linking @dev below @upper_dev.
 * Rejects self-links, links that would create a cycle, duplicate links, and
 * a second master link; on success takes a reference on @upper_dev.
 * NOTE(review): fragment - error-code returns, the master/else branch
 * structure and the notifier call are missing; code left byte-identical.
 */
4882 static int __netdev_upper_dev_link(struct net_device
*dev
,
4883 struct net_device
*upper_dev
, bool master
)
4885 struct netdev_upper
*upper
;
4889 if (dev
== upper_dev
)
4892 /* To prevent loops, check if dev is not upper device to upper_dev. */
4893 if (__netdev_search_upper_dev(upper_dev
, dev
))
4896 if (__netdev_find_upper(dev
, upper_dev
))
/* only one master upper link is permitted per device */
4899 if (master
&& netdev_master_upper_dev_get(dev
))
4902 upper
= kmalloc(sizeof(*upper
), GFP_KERNEL
);
4906 upper
->dev
= upper_dev
;
4907 upper
->master
= master
;
4908 INIT_LIST_HEAD(&upper
->search_list
);
4910 /* Ensure that master upper link is always the first item in list. */
4912 list_add_rcu(&upper
->list
, &dev
->upper_dev_list
);
4914 list_add_tail_rcu(&upper
->list
, &dev
->upper_dev_list
);
4915 dev_hold(upper_dev
);
/*
 * netdev_upper_dev_link - Add a link to the upper device
 * @dev: device
 * @upper_dev: new upper device
 *
 * Adds a link to device which is upper to this one. The caller must hold
 * the RTNL lock. On a failure a negative errno code is returned.
 * On success the reference counts are adjusted and the function
 * returns zero.
 */
4930 int netdev_upper_dev_link(struct net_device
*dev
,
4931 struct net_device
*upper_dev
)
4933 return __netdev_upper_dev_link(dev
, upper_dev
, false);
4935 EXPORT_SYMBOL(netdev_upper_dev_link
);
/*
 * netdev_master_upper_dev_link - Add a master link to the upper device
 * @dev: device
 * @upper_dev: new upper device
 *
 * Adds a link to device which is upper to this one. In this case, only
 * one master upper device can be linked, although other non-master devices
 * might be linked as well. The caller must hold the RTNL lock.
 * On a failure a negative errno code is returned. On success the reference
 * counts are adjusted and the function returns zero.
 */
4948 int netdev_master_upper_dev_link(struct net_device
*dev
,
4949 struct net_device
*upper_dev
)
4951 return __netdev_upper_dev_link(dev
, upper_dev
, true);
4953 EXPORT_SYMBOL(netdev_master_upper_dev_link
);
/*
 * netdev_upper_dev_unlink - Removes a link to upper device
 * @dev: device
 * @upper_dev: upper device to unlink
 *
 * Removes a link to device which is upper to this one. The caller must hold
 * the RTNL lock.
 * NOTE(review): fragment - the NULL-check after the lookup and the
 * dev_put() on @upper_dev are missing from this extract; code unchanged.
 */
4963 void netdev_upper_dev_unlink(struct net_device
*dev
,
4964 struct net_device
*upper_dev
)
4966 struct netdev_upper
*upper
;
4970 upper
= __netdev_find_upper(dev
, upper_dev
);
4973 list_del_rcu(&upper
->list
);
/* free after a grace period; RCU readers may still hold the entry */
4975 kfree_rcu(upper
, rcu
);
4977 EXPORT_SYMBOL(netdev_upper_dev_unlink
);
/*
 * dev_change_rx_flags - notify the driver that an RX-filter-related flag
 * (@flags names the changed bit, e.g. IFF_PROMISC) was toggled, but only
 * when the interface is up and the driver implements the callback.
 */
4979 static void dev_change_rx_flags(struct net_device
*dev
, int flags
)
4981 const struct net_device_ops
*ops
= dev
->netdev_ops
;
4983 if ((dev
->flags
& IFF_UP
) && ops
->ndo_change_rx_flags
)
4984 ops
->ndo_change_rx_flags(dev
, flags
);
/*
 * __dev_set_promiscuity - adjust dev->promiscuity by @inc with overflow
 * protection; set/clear IFF_PROMISC accordingly, log and audit transitions,
 * and notify the driver. Caller holds the needed locks (rtnl/addr lock).
 * NOTE(review): fragment - the inc<0 underflow branch, return statements
 * and uid/gid declarations are missing from this extract; code unchanged.
 */
4987 static int __dev_set_promiscuity(struct net_device
*dev
, int inc
)
4989 unsigned int old_flags
= dev
->flags
;
4995 dev
->flags
|= IFF_PROMISC
;
4996 dev
->promiscuity
+= inc
;
/* counter returned to zero: either promisc left, or inc overflowed it */
4997 if (dev
->promiscuity
== 0) {
/*
 * If inc causes overflow, untouch promisc and return error.
 */
5003 dev
->flags
&= ~IFF_PROMISC
;
5005 dev
->promiscuity
-= inc
;
5006 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5011 if (dev
->flags
!= old_flags
) {
5012 pr_info("device %s %s promiscuous mode\n",
5014 dev
->flags
& IFF_PROMISC
? "entered" : "left");
/* promiscuous-mode transitions are security relevant: emit audit record */
5015 if (audit_enabled
) {
5016 current_uid_gid(&uid
, &gid
);
5017 audit_log(current
->audit_context
, GFP_ATOMIC
,
5018 AUDIT_ANOM_PROMISCUOUS
,
5019 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5020 dev
->name
, (dev
->flags
& IFF_PROMISC
),
5021 (old_flags
& IFF_PROMISC
),
5022 from_kuid(&init_user_ns
, audit_get_loginuid(current
)),
5023 from_kuid(&init_user_ns
, uid
),
5024 from_kgid(&init_user_ns
, gid
),
5025 audit_get_sessionid(current
));
5028 dev_change_rx_flags(dev
, IFF_PROMISC
);
/*
 * dev_set_promiscuity - update promiscuity count on a device
 * @dev: device
 * @inc: modifier
 *
 * Add or remove promiscuity from a device. While the count in the device
 * remains above zero the interface remains promiscuous. Once it hits zero
 * the device reverts back to normal filtering operation. A negative inc
 * value is used to drop promiscuity on the device.
 * Return 0 if successful or a negative errno code on error.
 */
5044 int dev_set_promiscuity(struct net_device
*dev
, int inc
)
5046 unsigned int old_flags
= dev
->flags
;
5049 err
= __dev_set_promiscuity(dev
, inc
);
/* re-sync the hardware RX filter only when IFF_PROMISC actually toggled */
5052 if (dev
->flags
!= old_flags
)
5053 dev_set_rx_mode(dev
);
5056 EXPORT_SYMBOL(dev_set_promiscuity
);
/*
 * dev_set_allmulti - update allmulti count on a device
 * @dev: device
 * @inc: modifier
 *
 * Add or remove reception of all multicast frames to a device. While the
 * count in the device remains above zero the interface remains listening
 * to all interfaces. Once it hits zero the device reverts back to normal
 * filtering operation. A negative @inc value is used to drop the counter
 * when releasing a resource needing all multicasts.
 * Return 0 if successful or a negative errno code on error.
 * NOTE(review): fragment - the inc<0 branch and returns are missing from
 * this extract; code left byte-identical.
 */
5071 int dev_set_allmulti(struct net_device
*dev
, int inc
)
5073 unsigned int old_flags
= dev
->flags
;
5077 dev
->flags
|= IFF_ALLMULTI
;
5078 dev
->allmulti
+= inc
;
/* counter hit zero: either allmulti dropped, or inc overflowed it */
5079 if (dev
->allmulti
== 0) {
/*
 * If inc causes overflow, untouch allmulti and return error.
 */
5085 dev
->flags
&= ~IFF_ALLMULTI
;
5087 dev
->allmulti
-= inc
;
5088 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5093 if (dev
->flags
^ old_flags
) {
5094 dev_change_rx_flags(dev
, IFF_ALLMULTI
);
5095 dev_set_rx_mode(dev
);
5099 EXPORT_SYMBOL(dev_set_allmulti
);
/*
 * Upload unicast and multicast address lists to device and
 * configure RX filtering. When the device doesn't support unicast
 * filtering it is put in promiscuous mode while unicast addresses
 * are present.
 * Caller must hold the netif addr lock.
 * NOTE(review): fragment - the early 'return' statements after the guards
 * are missing from this extract; code left byte-identical.
 */
5107 void __dev_set_rx_mode(struct net_device
*dev
)
5109 const struct net_device_ops
*ops
= dev
->netdev_ops
;
5111 /* dev_open will call this function so the list will stay sane. */
5112 if (!(dev
->flags
&IFF_UP
))
5115 if (!netif_device_present(dev
))
/* no unicast hardware filter: emulate it with promiscuous mode */
5118 if (!(dev
->priv_flags
& IFF_UNICAST_FLT
)) {
/* Unicast addresses changes may only happen under the rtnl,
 * therefore calling __dev_set_promiscuity here is safe.
 */
5122 if (!netdev_uc_empty(dev
) && !dev
->uc_promisc
) {
5123 __dev_set_promiscuity(dev
, 1);
5124 dev
->uc_promisc
= true;
5125 } else if (netdev_uc_empty(dev
) && dev
->uc_promisc
) {
5126 __dev_set_promiscuity(dev
, -1);
5127 dev
->uc_promisc
= false;
5131 if (ops
->ndo_set_rx_mode
)
5132 ops
->ndo_set_rx_mode(dev
);
/*
 * dev_set_rx_mode - locked wrapper: take the address lock (BH-safe) around
 * __dev_set_rx_mode().
 */
5135 void dev_set_rx_mode(struct net_device
*dev
)
5137 netif_addr_lock_bh(dev
);
5138 __dev_set_rx_mode(dev
);
5139 netif_addr_unlock_bh(dev
);
/*
 * dev_get_flags - get flags reported to userspace
 * @dev: device
 *
 * Get the combination of flag bits exported through APIs to userspace.
 * Volatile bits (promisc/allmulti) come from gflags; RUNNING, LOWER_UP and
 * DORMANT are synthesised from operational state.
 * NOTE(review): fragment - flag-mask continuation lines and the return are
 * missing from this extract; code left byte-identical.
 */
5148 unsigned int dev_get_flags(const struct net_device
*dev
)
5152 flags
= (dev
->flags
& ~(IFF_PROMISC
|
5157 (dev
->gflags
& (IFF_PROMISC
|
5160 if (netif_running(dev
)) {
5161 if (netif_oper_up(dev
))
5162 flags
|= IFF_RUNNING
;
5163 if (netif_carrier_ok(dev
))
5164 flags
|= IFF_LOWER_UP
;
5165 if (netif_dormant(dev
))
5166 flags
|= IFF_DORMANT
;
5171 EXPORT_SYMBOL(dev_get_flags
);
/*
 * __dev_change_flags - apply a userspace flag word to @dev: merge the
 * settable bits, refresh the RX filter, open/close the device on IFF_UP
 * changes, and route IFF_PROMISC/IFF_ALLMULTI through their counters.
 * Returns the open/close result (0 otherwise). Caller holds RTNL.
 * NOTE(review): fragment - several mask continuation lines and the return
 * are missing from this extract; code left byte-identical.
 */
5173 int __dev_change_flags(struct net_device
*dev
, unsigned int flags
)
5175 unsigned int old_flags
= dev
->flags
;
/*
 * Set the flags on our device.
 */
5184 dev
->flags
= (flags
& (IFF_DEBUG
| IFF_NOTRAILERS
| IFF_NOARP
|
5185 IFF_DYNAMIC
| IFF_MULTICAST
| IFF_PORTSEL
|
5187 (dev
->flags
& (IFF_UP
| IFF_VOLATILE
| IFF_PROMISC
|
/*
 * Load in the correct multicast list now the flags have changed.
 */
5194 if ((old_flags
^ flags
) & IFF_MULTICAST
)
5195 dev_change_rx_flags(dev
, IFF_MULTICAST
);
5197 dev_set_rx_mode(dev
);
/*
 * Have we downed the interface. We handle IFF_UP ourselves
 * according to user attempts to set it, rather than blindly
 * setting it.
 */
5206 if ((old_flags
^ flags
) & IFF_UP
) { /* Bit is different ? */
5207 ret
= ((old_flags
& IFF_UP
) ? __dev_close
: __dev_open
)(dev
);
5210 dev_set_rx_mode(dev
);
/* gflags tracks what userspace asked for; counters track reality */
5213 if ((flags
^ dev
->gflags
) & IFF_PROMISC
) {
5214 int inc
= (flags
& IFF_PROMISC
) ? 1 : -1;
5216 dev
->gflags
^= IFF_PROMISC
;
5217 dev_set_promiscuity(dev
, inc
);
/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
 * is important. Some (broken) drivers set IFF_PROMISC, when
 * IFF_ALLMULTI is requested not asking us and not reporting.
 */
5224 if ((flags
^ dev
->gflags
) & IFF_ALLMULTI
) {
5225 int inc
= (flags
& IFF_ALLMULTI
) ? 1 : -1;
5227 dev
->gflags
^= IFF_ALLMULTI
;
5228 dev_set_allmulti(dev
, inc
);
*dev
, unsigned int old_flags
)
5236 unsigned int changes
= dev
->flags
^ old_flags
;
5238 if (changes
& IFF_UP
) {
5239 if (dev
->flags
& IFF_UP
)
5240 call_netdevice_notifiers(NETDEV_UP
, dev
);
5242 call_netdevice_notifiers(NETDEV_DOWN
, dev
);
5245 if (dev
->flags
& IFF_UP
&&
5246 (changes
& ~(IFF_UP
| IFF_PROMISC
| IFF_ALLMULTI
| IFF_VOLATILE
)))
5247 call_netdevice_notifiers(NETDEV_CHANGE
, dev
);
/*
 * dev_change_flags - change device settings
 * @dev: device
 * @flags: device state flags
 *
 * Change settings on device based state flags. The flags are
 * in the userspace exported format.
 * Applies the change, then broadcasts it via rtnetlink and notifiers.
 * NOTE(review): fragment - the early-error return path is missing from
 * this extract; code left byte-identical.
 */
5258 int dev_change_flags(struct net_device
*dev
, unsigned int flags
)
5261 unsigned int changes
, old_flags
= dev
->flags
;
5263 ret
= __dev_change_flags(dev
, flags
);
5267 changes
= old_flags
^ dev
->flags
;
/* tell userspace listeners (ip monitor etc.) which bits changed */
5269 rtmsg_ifinfo(RTM_NEWLINK
, dev
, changes
);
5271 __dev_notify_flags(dev
, old_flags
);
5274 EXPORT_SYMBOL(dev_change_flags
);
/*
 * dev_set_mtu - Change maximum transfer unit
 * @dev: device
 * @new_mtu: new transfer unit
 *
 * Change the maximum transfer size of the network device.
 * Validates the value, lets the driver veto/apply via ndo_change_mtu, and
 * broadcasts NETDEV_CHANGEMTU on success.
 * NOTE(review): fragment - error returns and the default assignment when
 * the driver lacks ndo_change_mtu are missing; code left byte-identical.
 */
5283 int dev_set_mtu(struct net_device
*dev
, int new_mtu
)
5285 const struct net_device_ops
*ops
= dev
->netdev_ops
;
5288 if (new_mtu
== dev
->mtu
)
5291 /* MTU must be positive. */
5295 if (!netif_device_present(dev
))
5299 if (ops
->ndo_change_mtu
)
5300 err
= ops
->ndo_change_mtu(dev
, new_mtu
);
5305 call_netdevice_notifiers(NETDEV_CHANGEMTU
, dev
);
5308 EXPORT_SYMBOL(dev_set_mtu
);
/*
 * dev_set_group - Change group this device belongs to
 * @dev: device
 * @new_group: group this device should belong to
 */
5315 void dev_set_group(struct net_device
*dev
, int new_group
)
5317 dev
->group
= new_group
;
5319 EXPORT_SYMBOL(dev_set_group
);
/*
 * dev_set_mac_address - Change Media Access Control Address
 * @dev: device
 * @sa: new address
 *
 * Change the hardware (MAC) address of the device.
 * The sockaddr family must match the device type; the driver performs the
 * actual change and a NETDEV_CHANGEADDR notification follows on success.
 * NOTE(review): fragment - errno returns and the err check before the
 * notifier are missing from this extract; code left byte-identical.
 */
5328 int dev_set_mac_address(struct net_device
*dev
, struct sockaddr
*sa
)
5330 const struct net_device_ops
*ops
= dev
->netdev_ops
;
5333 if (!ops
->ndo_set_mac_address
)
5335 if (sa
->sa_family
!= dev
->type
)
5337 if (!netif_device_present(dev
))
5339 err
= ops
->ndo_set_mac_address(dev
, sa
);
/* address was set explicitly by userspace (vs. inherited/random) */
5342 dev
->addr_assign_type
= NET_ADDR_SET
;
5343 call_netdevice_notifiers(NETDEV_CHANGEADDR
, dev
);
/* feed the new MAC into the entropy pool used for early randomness */
5344 add_device_randomness(dev
->dev_addr
, dev
->addr_len
);
5347 EXPORT_SYMBOL(dev_set_mac_address
);
/*
 * dev_change_carrier - Change device carrier
 * @dev: device
 * @new_carrier: new value
 *
 * Change device carrier via the driver's ndo_change_carrier callback.
 * NOTE(review): fragment - errno returns for the two guards are missing
 * from this extract; code left byte-identical.
 */
5356 int dev_change_carrier(struct net_device
*dev
, bool new_carrier
)
5358 const struct net_device_ops
*ops
= dev
->netdev_ops
;
5360 if (!ops
->ndo_change_carrier
)
5362 if (!netif_device_present(dev
))
5364 return ops
->ndo_change_carrier(dev
, new_carrier
);
5366 EXPORT_SYMBOL(dev_change_carrier
);
/*
 * Perform the SIOCxIFxxx calls, inside rcu_read_lock()
 *
 * Read-only interface queries (GET-side ioctls) that are safe under RCU:
 * each case copies a device attribute into the caller's ifreq.
 * NOTE(review): fragment - the switch header, case labels for the hwaddr/
 * map/index/qlen blocks, break statements and returns are missing from
 * this extract; code left byte-identical.
 */
5371 static int dev_ifsioc_locked(struct net
*net
, struct ifreq
*ifr
, unsigned int cmd
)
5374 struct net_device
*dev
= dev_get_by_name_rcu(net
, ifr
->ifr_name
);
5380 case SIOCGIFFLAGS
: /* Get interface flags */
5381 ifr
->ifr_flags
= (short) dev_get_flags(dev
);
5384 case SIOCGIFMETRIC
: /* Get the metric on the interface
5385 (currently unused) */
5386 ifr
->ifr_metric
= 0;
5389 case SIOCGIFMTU
: /* Get the MTU of a device */
5390 ifr
->ifr_mtu
= dev
->mtu
;
/* SIOCGIFHWADDR: zero-pad then copy at most addr_len bytes of the MAC */
5395 memset(ifr
->ifr_hwaddr
.sa_data
, 0, sizeof ifr
->ifr_hwaddr
.sa_data
);
5397 memcpy(ifr
->ifr_hwaddr
.sa_data
, dev
->dev_addr
,
5398 min(sizeof ifr
->ifr_hwaddr
.sa_data
, (size_t) dev
->addr_len
));
5399 ifr
->ifr_hwaddr
.sa_family
= dev
->type
;
/* SIOCGIFMAP: legacy hardware resource description */
5407 ifr
->ifr_map
.mem_start
= dev
->mem_start
;
5408 ifr
->ifr_map
.mem_end
= dev
->mem_end
;
5409 ifr
->ifr_map
.base_addr
= dev
->base_addr
;
5410 ifr
->ifr_map
.irq
= dev
->irq
;
5411 ifr
->ifr_map
.dma
= dev
->dma
;
5412 ifr
->ifr_map
.port
= dev
->if_port
;
5416 ifr
->ifr_ifindex
= dev
->ifindex
;
5420 ifr
->ifr_qlen
= dev
->tx_queue_len
;
5424 /* dev_ioctl() should ensure this case
/*
 * Perform the SIOCxIFxxx calls, inside rtnl_lock()
 *
 * SET-side and driver-private interface ioctls; each case either delegates
 * to a dev_* helper or forwards to the driver's ndo_do_ioctl for the
 * whitelisted private/bonding/MII/bridge/timestamping commands.
 * NOTE(review): fragment - the switch header, many case labels, break/
 * return statements and the -ENODEV/-EINVAL error paths are missing from
 * this extract; code left byte-identical.
 */
5438 static int dev_ifsioc(struct net
*net
, struct ifreq
*ifr
, unsigned int cmd
)
5441 struct net_device
*dev
= __dev_get_by_name(net
, ifr
->ifr_name
);
5442 const struct net_device_ops
*ops
;
5447 ops
= dev
->netdev_ops
;
5450 case SIOCSIFFLAGS
: /* Set interface flags */
5451 return dev_change_flags(dev
, ifr
->ifr_flags
);
5453 case SIOCSIFMETRIC
: /* Set the metric on the interface
5454 (currently unused) */
5457 case SIOCSIFMTU
: /* Set the MTU of a device */
5458 return dev_set_mtu(dev
, ifr
->ifr_mtu
);
5461 return dev_set_mac_address(dev
, &ifr
->ifr_hwaddr
);
5463 case SIOCSIFHWBROADCAST
:
5464 if (ifr
->ifr_hwaddr
.sa_family
!= dev
->type
)
5466 memcpy(dev
->broadcast
, ifr
->ifr_hwaddr
.sa_data
,
5467 min(sizeof ifr
->ifr_hwaddr
.sa_data
, (size_t) dev
->addr_len
));
5468 call_netdevice_notifiers(NETDEV_CHANGEADDR
, dev
);
/* SIOCSIFMAP: only meaningful when the driver supports reconfiguration */
5472 if (ops
->ndo_set_config
) {
5473 if (!netif_device_present(dev
))
5475 return ops
->ndo_set_config(dev
, &ifr
->ifr_map
);
/* SIOCADDMULTI: join a multicast group on behalf of the caller */
5480 if (!ops
->ndo_set_rx_mode
||
5481 ifr
->ifr_hwaddr
.sa_family
!= AF_UNSPEC
)
5483 if (!netif_device_present(dev
))
5485 return dev_mc_add_global(dev
, ifr
->ifr_hwaddr
.sa_data
);
/* SIOCDELMULTI: leave a multicast group */
5488 if (!ops
->ndo_set_rx_mode
||
5489 ifr
->ifr_hwaddr
.sa_family
!= AF_UNSPEC
)
5491 if (!netif_device_present(dev
))
5493 return dev_mc_del_global(dev
, ifr
->ifr_hwaddr
.sa_data
);
5496 if (ifr
->ifr_qlen
< 0)
5498 dev
->tx_queue_len
= ifr
->ifr_qlen
;
/* SIOCSIFNAME: force NUL termination before the rename */
5502 ifr
->ifr_newname
[IFNAMSIZ
-1] = '\0';
5503 return dev_change_name(dev
, ifr
->ifr_newname
);
5506 err
= net_hwtstamp_validate(ifr
);
/*
 * Unknown or private ioctl
 */
5515 if ((cmd
>= SIOCDEVPRIVATE
&&
5516 cmd
<= SIOCDEVPRIVATE
+ 15) ||
5517 cmd
== SIOCBONDENSLAVE
||
5518 cmd
== SIOCBONDRELEASE
||
5519 cmd
== SIOCBONDSETHWADDR
||
5520 cmd
== SIOCBONDSLAVEINFOQUERY
||
5521 cmd
== SIOCBONDINFOQUERY
||
5522 cmd
== SIOCBONDCHANGEACTIVE
||
5523 cmd
== SIOCGMIIPHY
||
5524 cmd
== SIOCGMIIREG
||
5525 cmd
== SIOCSMIIREG
||
5526 cmd
== SIOCBRADDIF
||
5527 cmd
== SIOCBRDELIF
||
5528 cmd
== SIOCSHWTSTAMP
||
5529 cmd
== SIOCWANDEV
) {
5531 if (ops
->ndo_do_ioctl
) {
5532 if (netif_device_present(dev
))
5533 err
= ops
->ndo_do_ioctl(dev
, ifr
, cmd
);
5545 * This function handles all "interface"-type I/O control requests. The actual
5546 * 'doing' part of this is dev_ifsioc above.
5550 * dev_ioctl - network device ioctl
5551 * @net: the applicable net namespace
5552 * @cmd: command to issue
5553 * @arg: pointer to a struct ifreq in user space
5555 * Issue ioctl functions to devices. This is normally called by the
5556 * user space syscall interfaces but can sometimes be useful for
5557 * other purposes. The return value is the return from the syscall if
5558 * positive or a negative errno code on error.
5561 int dev_ioctl(struct net
*net
, unsigned int cmd
, void __user
*arg
)
5567 /* One special case: SIOCGIFCONF takes ifconf argument
5568 and requires shared lock, because it sleeps writing
5572 if (cmd
== SIOCGIFCONF
) {
5574 ret
= dev_ifconf(net
, (char __user
*) arg
);
5578 if (cmd
== SIOCGIFNAME
)
5579 return dev_ifname(net
, (struct ifreq __user
*)arg
);
5581 if (copy_from_user(&ifr
, arg
, sizeof(struct ifreq
)))
5584 ifr
.ifr_name
[IFNAMSIZ
-1] = 0;
5586 colon
= strchr(ifr
.ifr_name
, ':');
5591 * See which interface the caller is talking about.
5596 * These ioctl calls:
5597 * - can be done by all.
5598 * - atomic and do not require locking.
5609 dev_load(net
, ifr
.ifr_name
);
5611 ret
= dev_ifsioc_locked(net
, &ifr
, cmd
);
5616 if (copy_to_user(arg
, &ifr
,
5617 sizeof(struct ifreq
)))
5623 dev_load(net
, ifr
.ifr_name
);
5625 ret
= dev_ethtool(net
, &ifr
);
5630 if (copy_to_user(arg
, &ifr
,
5631 sizeof(struct ifreq
)))
5637 * These ioctl calls:
5638 * - require superuser power.
5639 * - require strict serialization.
5645 if (!ns_capable(net
->user_ns
, CAP_NET_ADMIN
))
5647 dev_load(net
, ifr
.ifr_name
);
5649 ret
= dev_ifsioc(net
, &ifr
, cmd
);
5654 if (copy_to_user(arg
, &ifr
,
5655 sizeof(struct ifreq
)))
5661 * These ioctl calls:
5662 * - require superuser power.
5663 * - require strict serialization.
5664 * - do not return a value
5668 if (!capable(CAP_NET_ADMIN
))
5672 * These ioctl calls:
5673 * - require local superuser power.
5674 * - require strict serialization.
5675 * - do not return a value
5684 case SIOCSIFHWBROADCAST
:
5686 case SIOCBONDENSLAVE
:
5687 case SIOCBONDRELEASE
:
5688 case SIOCBONDSETHWADDR
:
5689 case SIOCBONDCHANGEACTIVE
:
5693 if (!ns_capable(net
->user_ns
, CAP_NET_ADMIN
))
5696 case SIOCBONDSLAVEINFOQUERY
:
5697 case SIOCBONDINFOQUERY
:
5698 dev_load(net
, ifr
.ifr_name
);
5700 ret
= dev_ifsioc(net
, &ifr
, cmd
);
5705 /* Get the per device memory space. We can add this but
5706 * currently do not support it */
5708 /* Set the per device memory buffer space.
5709 * Not applicable in our case */
5714 * Unknown or private ioctl.
5717 if (cmd
== SIOCWANDEV
||
5718 (cmd
>= SIOCDEVPRIVATE
&&
5719 cmd
<= SIOCDEVPRIVATE
+ 15)) {
5720 dev_load(net
, ifr
.ifr_name
);
5722 ret
= dev_ifsioc(net
, &ifr
, cmd
);
5724 if (!ret
&& copy_to_user(arg
, &ifr
,
5725 sizeof(struct ifreq
)))
5729 /* Take care of Wireless Extensions */
5730 if (cmd
>= SIOCIWFIRST
&& cmd
<= SIOCIWLAST
)
5731 return wext_handle_ioctl(net
, &ifr
, cmd
, arg
);
5738 * dev_new_index - allocate an ifindex
5739 * @net: the applicable net namespace
5741 * Returns a suitable unique value for a new device interface
5742 * number. The caller must hold the rtnl semaphore or the
5743 * dev_base_lock to be sure it remains unique.
5745 static int dev_new_index(struct net
*net
)
5747 int ifindex
= net
->ifindex
;
5751 if (!__dev_get_by_index(net
, ifindex
))
5752 return net
->ifindex
= ifindex
;
5756 /* Delayed registration/unregisteration */
5757 static LIST_HEAD(net_todo_list
);
5759 static void net_set_todo(struct net_device
*dev
)
5761 list_add_tail(&dev
->todo_list
, &net_todo_list
);
5764 static void rollback_registered_many(struct list_head
*head
)
5766 struct net_device
*dev
, *tmp
;
5768 BUG_ON(dev_boot_phase
);
5771 list_for_each_entry_safe(dev
, tmp
, head
, unreg_list
) {
5772 /* Some devices call without registering
5773 * for initialization unwind. Remove those
5774 * devices and proceed with the remaining.
5776 if (dev
->reg_state
== NETREG_UNINITIALIZED
) {
5777 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5781 list_del(&dev
->unreg_list
);
5784 dev
->dismantle
= true;
5785 BUG_ON(dev
->reg_state
!= NETREG_REGISTERED
);
5788 /* If device is running, close it first. */
5789 dev_close_many(head
);
5791 list_for_each_entry(dev
, head
, unreg_list
) {
5792 /* And unlink it from device chain. */
5793 unlist_netdevice(dev
);
5795 dev
->reg_state
= NETREG_UNREGISTERING
;
5800 list_for_each_entry(dev
, head
, unreg_list
) {
5801 /* Shutdown queueing discipline. */
5805 /* Notify protocols, that we are about to destroy
5806 this device. They should clean all the things.
5808 call_netdevice_notifiers(NETDEV_UNREGISTER
, dev
);
5810 if (!dev
->rtnl_link_ops
||
5811 dev
->rtnl_link_state
== RTNL_LINK_INITIALIZED
)
5812 rtmsg_ifinfo(RTM_DELLINK
, dev
, ~0U);
5815 * Flush the unicast and multicast chains
5820 if (dev
->netdev_ops
->ndo_uninit
)
5821 dev
->netdev_ops
->ndo_uninit(dev
);
5823 /* Notifier chain MUST detach us all upper devices. */
5824 WARN_ON(netdev_has_any_upper_dev(dev
));
5826 /* Remove entries from kobject tree */
5827 netdev_unregister_kobject(dev
);
5829 /* Remove XPS queueing entries */
5830 netif_reset_xps_queues_gt(dev
, 0);
5836 list_for_each_entry(dev
, head
, unreg_list
)
5840 static void rollback_registered(struct net_device
*dev
)
5844 list_add(&dev
->unreg_list
, &single
);
5845 rollback_registered_many(&single
);
5849 static netdev_features_t
netdev_fix_features(struct net_device
*dev
,
5850 netdev_features_t features
)
5852 /* Fix illegal checksum combinations */
5853 if ((features
& NETIF_F_HW_CSUM
) &&
5854 (features
& (NETIF_F_IP_CSUM
|NETIF_F_IPV6_CSUM
))) {
5855 netdev_warn(dev
, "mixed HW and IP checksum settings.\n");
5856 features
&= ~(NETIF_F_IP_CSUM
|NETIF_F_IPV6_CSUM
);
5859 /* Fix illegal SG+CSUM combinations. */
5860 if ((features
& NETIF_F_SG
) &&
5861 !(features
& NETIF_F_ALL_CSUM
)) {
5863 "Dropping NETIF_F_SG since no checksum feature.\n");
5864 features
&= ~NETIF_F_SG
;
5867 /* TSO requires that SG is present as well. */
5868 if ((features
& NETIF_F_ALL_TSO
) && !(features
& NETIF_F_SG
)) {
5869 netdev_dbg(dev
, "Dropping TSO features since no SG feature.\n");
5870 features
&= ~NETIF_F_ALL_TSO
;
5873 /* TSO ECN requires that TSO is present as well. */
5874 if ((features
& NETIF_F_ALL_TSO
) == NETIF_F_TSO_ECN
)
5875 features
&= ~NETIF_F_TSO_ECN
;
5877 /* Software GSO depends on SG. */
5878 if ((features
& NETIF_F_GSO
) && !(features
& NETIF_F_SG
)) {
5879 netdev_dbg(dev
, "Dropping NETIF_F_GSO since no SG feature.\n");
5880 features
&= ~NETIF_F_GSO
;
5883 /* UFO needs SG and checksumming */
5884 if (features
& NETIF_F_UFO
) {
5885 /* maybe split UFO into V4 and V6? */
5886 if (!((features
& NETIF_F_GEN_CSUM
) ||
5887 (features
& (NETIF_F_IP_CSUM
|NETIF_F_IPV6_CSUM
))
5888 == (NETIF_F_IP_CSUM
|NETIF_F_IPV6_CSUM
))) {
5890 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5891 features
&= ~NETIF_F_UFO
;
5894 if (!(features
& NETIF_F_SG
)) {
5896 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5897 features
&= ~NETIF_F_UFO
;
5904 int __netdev_update_features(struct net_device
*dev
)
5906 netdev_features_t features
;
5911 features
= netdev_get_wanted_features(dev
);
5913 if (dev
->netdev_ops
->ndo_fix_features
)
5914 features
= dev
->netdev_ops
->ndo_fix_features(dev
, features
);
5916 /* driver might be less strict about feature dependencies */
5917 features
= netdev_fix_features(dev
, features
);
5919 if (dev
->features
== features
)
5922 netdev_dbg(dev
, "Features changed: %pNF -> %pNF\n",
5923 &dev
->features
, &features
);
5925 if (dev
->netdev_ops
->ndo_set_features
)
5926 err
= dev
->netdev_ops
->ndo_set_features(dev
, features
);
5928 if (unlikely(err
< 0)) {
5930 "set_features() failed (%d); wanted %pNF, left %pNF\n",
5931 err
, &features
, &dev
->features
);
5936 dev
->features
= features
;
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications if it
 *	has changed. Should be called after driver or hardware dependent
 *	conditions might have changed that influence the features.
 */
void netdev_update_features(struct net_device *dev)
{
	/* Only notify userspace when the feature set actually changed. */
	if (__netdev_update_features(dev))
		netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications even
 *	if they have not changed. Should be called instead of
 *	netdev_update_features() if also dev->vlan_features might
 *	have changed to allow the changes to be propagated to stacked
 *	(e.g. vlan) devices.
 */
void netdev_change_features(struct net_device *dev)
{
	__netdev_update_features(dev);
	/* Unconditional notification, unlike netdev_update_features(). */
	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
5974 * netif_stacked_transfer_operstate - transfer operstate
5975 * @rootdev: the root or lower level device to transfer state from
5976 * @dev: the device to transfer operstate to
5978 * Transfer operational state from root to device. This is normally
5979 * called when a stacking relationship exists between the root
5980 * device and the device(a leaf device).
5982 void netif_stacked_transfer_operstate(const struct net_device
*rootdev
,
5983 struct net_device
*dev
)
5985 if (rootdev
->operstate
== IF_OPER_DORMANT
)
5986 netif_dormant_on(dev
);
5988 netif_dormant_off(dev
);
5990 if (netif_carrier_ok(rootdev
)) {
5991 if (!netif_carrier_ok(dev
))
5992 netif_carrier_on(dev
);
5994 if (netif_carrier_ok(dev
))
5995 netif_carrier_off(dev
);
5998 EXPORT_SYMBOL(netif_stacked_transfer_operstate
);
6001 static int netif_alloc_rx_queues(struct net_device
*dev
)
6003 unsigned int i
, count
= dev
->num_rx_queues
;
6004 struct netdev_rx_queue
*rx
;
6008 rx
= kcalloc(count
, sizeof(struct netdev_rx_queue
), GFP_KERNEL
);
6014 for (i
= 0; i
< count
; i
++)
6020 static void netdev_init_one_queue(struct net_device
*dev
,
6021 struct netdev_queue
*queue
, void *_unused
)
6023 /* Initialize queue lock */
6024 spin_lock_init(&queue
->_xmit_lock
);
6025 netdev_set_xmit_lockdep_class(&queue
->_xmit_lock
, dev
->type
);
6026 queue
->xmit_lock_owner
= -1;
6027 netdev_queue_numa_node_write(queue
, NUMA_NO_NODE
);
6030 dql_init(&queue
->dql
, HZ
);
6034 static int netif_alloc_netdev_queues(struct net_device
*dev
)
6036 unsigned int count
= dev
->num_tx_queues
;
6037 struct netdev_queue
*tx
;
6041 tx
= kcalloc(count
, sizeof(struct netdev_queue
), GFP_KERNEL
);
6047 netdev_for_each_tx_queue(dev
, netdev_init_one_queue
, NULL
);
6048 spin_lock_init(&dev
->tx_global_lock
);
6054 * register_netdevice - register a network device
6055 * @dev: device to register
6057 * Take a completed network device structure and add it to the kernel
6058 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6059 * chain. 0 is returned on success. A negative errno code is returned
6060 * on a failure to set up the device, or if the name is a duplicate.
6062 * Callers must hold the rtnl semaphore. You may want
6063 * register_netdev() instead of this.
6066 * The locking appears insufficient to guarantee two parallel registers
6067 * will not get the same name.
6070 int register_netdevice(struct net_device
*dev
)
6073 struct net
*net
= dev_net(dev
);
6075 BUG_ON(dev_boot_phase
);
6080 /* When net_device's are persistent, this will be fatal. */
6081 BUG_ON(dev
->reg_state
!= NETREG_UNINITIALIZED
);
6084 spin_lock_init(&dev
->addr_list_lock
);
6085 netdev_set_addr_lockdep_class(dev
);
6089 ret
= dev_get_valid_name(net
, dev
, dev
->name
);
6093 /* Init, if this function is available */
6094 if (dev
->netdev_ops
->ndo_init
) {
6095 ret
= dev
->netdev_ops
->ndo_init(dev
);
6103 if (((dev
->hw_features
| dev
->features
) & NETIF_F_HW_VLAN_FILTER
) &&
6104 (!dev
->netdev_ops
->ndo_vlan_rx_add_vid
||
6105 !dev
->netdev_ops
->ndo_vlan_rx_kill_vid
)) {
6106 netdev_WARN(dev
, "Buggy VLAN acceleration in driver!\n");
6113 dev
->ifindex
= dev_new_index(net
);
6114 else if (__dev_get_by_index(net
, dev
->ifindex
))
6117 if (dev
->iflink
== -1)
6118 dev
->iflink
= dev
->ifindex
;
6120 /* Transfer changeable features to wanted_features and enable
6121 * software offloads (GSO and GRO).
6123 dev
->hw_features
|= NETIF_F_SOFT_FEATURES
;
6124 dev
->features
|= NETIF_F_SOFT_FEATURES
;
6125 dev
->wanted_features
= dev
->features
& dev
->hw_features
;
6127 /* Turn on no cache copy if HW is doing checksum */
6128 if (!(dev
->flags
& IFF_LOOPBACK
)) {
6129 dev
->hw_features
|= NETIF_F_NOCACHE_COPY
;
6130 if (dev
->features
& NETIF_F_ALL_CSUM
) {
6131 dev
->wanted_features
|= NETIF_F_NOCACHE_COPY
;
6132 dev
->features
|= NETIF_F_NOCACHE_COPY
;
6136 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6138 dev
->vlan_features
|= NETIF_F_HIGHDMA
;
6140 ret
= call_netdevice_notifiers(NETDEV_POST_INIT
, dev
);
6141 ret
= notifier_to_errno(ret
);
6145 ret
= netdev_register_kobject(dev
);
6148 dev
->reg_state
= NETREG_REGISTERED
;
6150 __netdev_update_features(dev
);
6153 * Default initial state at registry is that the
6154 * device is present.
6157 set_bit(__LINK_STATE_PRESENT
, &dev
->state
);
6159 linkwatch_init_dev(dev
);
6161 dev_init_scheduler(dev
);
6163 list_netdevice(dev
);
6164 add_device_randomness(dev
->dev_addr
, dev
->addr_len
);
6166 /* If the device has permanent device address, driver should
6167 * set dev_addr and also addr_assign_type should be set to
6168 * NET_ADDR_PERM (default value).
6170 if (dev
->addr_assign_type
== NET_ADDR_PERM
)
6171 memcpy(dev
->perm_addr
, dev
->dev_addr
, dev
->addr_len
);
6173 /* Notify protocols, that a new device appeared. */
6174 ret
= call_netdevice_notifiers(NETDEV_REGISTER
, dev
);
6175 ret
= notifier_to_errno(ret
);
6177 rollback_registered(dev
);
6178 dev
->reg_state
= NETREG_UNREGISTERED
;
6181 * Prevent userspace races by waiting until the network
6182 * device is fully setup before sending notifications.
6184 if (!dev
->rtnl_link_ops
||
6185 dev
->rtnl_link_state
== RTNL_LINK_INITIALIZED
)
6186 rtmsg_ifinfo(RTM_NEWLINK
, dev
, ~0U);
6192 if (dev
->netdev_ops
->ndo_uninit
)
6193 dev
->netdev_ops
->ndo_uninit(dev
);
6196 EXPORT_SYMBOL(register_netdevice
);
6199 * init_dummy_netdev - init a dummy network device for NAPI
6200 * @dev: device to init
6202 * This takes a network device structure and initialize the minimum
6203 * amount of fields so it can be used to schedule NAPI polls without
6204 * registering a full blown interface. This is to be used by drivers
6205 * that need to tie several hardware interfaces to a single NAPI
6206 * poll scheduler due to HW limitations.
6208 int init_dummy_netdev(struct net_device
*dev
)
6210 /* Clear everything. Note we don't initialize spinlocks
6211 * are they aren't supposed to be taken by any of the
6212 * NAPI code and this dummy netdev is supposed to be
6213 * only ever used for NAPI polls
6215 memset(dev
, 0, sizeof(struct net_device
));
6217 /* make sure we BUG if trying to hit standard
6218 * register/unregister code path
6220 dev
->reg_state
= NETREG_DUMMY
;
6222 /* NAPI wants this */
6223 INIT_LIST_HEAD(&dev
->napi_list
);
6225 /* a dummy interface is started by default */
6226 set_bit(__LINK_STATE_PRESENT
, &dev
->state
);
6227 set_bit(__LINK_STATE_START
, &dev
->state
);
6229 /* Note : We dont allocate pcpu_refcnt for dummy devices,
6230 * because users of this 'device' dont need to change
6236 EXPORT_SYMBOL_GPL(init_dummy_netdev
);
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
 *	and expands the device name if you passed a format string to
 *	alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = register_netdevice(dev);
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(register_netdev);
6263 int netdev_refcnt_read(const struct net_device
*dev
)
6267 for_each_possible_cpu(i
)
6268 refcnt
+= *per_cpu_ptr(dev
->pcpu_refcnt
, i
);
6271 EXPORT_SYMBOL(netdev_refcnt_read
);
6274 * netdev_wait_allrefs - wait until all references are gone.
6275 * @dev: target net_device
6277 * This is called when unregistering network devices.
6279 * Any protocol or device that holds a reference should register
6280 * for netdevice notification, and cleanup and put back the
6281 * reference if they receive an UNREGISTER event.
6282 * We can get stuck here if buggy protocols don't correctly
6285 static void netdev_wait_allrefs(struct net_device
*dev
)
6287 unsigned long rebroadcast_time
, warning_time
;
6290 linkwatch_forget_dev(dev
);
6292 rebroadcast_time
= warning_time
= jiffies
;
6293 refcnt
= netdev_refcnt_read(dev
);
6295 while (refcnt
!= 0) {
6296 if (time_after(jiffies
, rebroadcast_time
+ 1 * HZ
)) {
6299 /* Rebroadcast unregister notification */
6300 call_netdevice_notifiers(NETDEV_UNREGISTER
, dev
);
6306 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL
, dev
);
6307 if (test_bit(__LINK_STATE_LINKWATCH_PENDING
,
6309 /* We must not have linkwatch events
6310 * pending on unregister. If this
6311 * happens, we simply run the queue
6312 * unscheduled, resulting in a noop
6315 linkwatch_run_queue();
6320 rebroadcast_time
= jiffies
;
6325 refcnt
= netdev_refcnt_read(dev
);
6327 if (time_after(jiffies
, warning_time
+ 10 * HZ
)) {
6328 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6330 warning_time
= jiffies
;
6339 * register_netdevice(x1);
6340 * register_netdevice(x2);
6342 * unregister_netdevice(y1);
6343 * unregister_netdevice(y2);
6349 * We are invoked by rtnl_unlock().
6350 * This allows us to deal with problems:
6351 * 1) We can delete sysfs objects which invoke hotplug
6352 * without deadlocking with linkwatch via keventd.
6353 * 2) Since we run with the RTNL semaphore not held, we can sleep
6354 * safely in order to wait for the netdev refcnt to drop to zero.
6356 * We must not return until all unregister events added during
6357 * the interval the lock was held have been completed.
6359 void netdev_run_todo(void)
6361 struct list_head list
;
6363 /* Snapshot list, allow later requests */
6364 list_replace_init(&net_todo_list
, &list
);
6369 /* Wait for rcu callbacks to finish before next phase */
6370 if (!list_empty(&list
))
6373 while (!list_empty(&list
)) {
6374 struct net_device
*dev
6375 = list_first_entry(&list
, struct net_device
, todo_list
);
6376 list_del(&dev
->todo_list
);
6379 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL
, dev
);
6382 if (unlikely(dev
->reg_state
!= NETREG_UNREGISTERING
)) {
6383 pr_err("network todo '%s' but state %d\n",
6384 dev
->name
, dev
->reg_state
);
6389 dev
->reg_state
= NETREG_UNREGISTERED
;
6391 on_each_cpu(flush_backlog
, dev
, 1);
6393 netdev_wait_allrefs(dev
);
6396 BUG_ON(netdev_refcnt_read(dev
));
6397 WARN_ON(rcu_access_pointer(dev
->ip_ptr
));
6398 WARN_ON(rcu_access_pointer(dev
->ip6_ptr
));
6399 WARN_ON(dev
->dn_ptr
);
6401 if (dev
->destructor
)
6402 dev
->destructor(dev
);
6404 /* Free network device */
6405 kobject_put(&dev
->dev
.kobj
);
6409 /* Convert net_device_stats to rtnl_link_stats64. They have the same
6410 * fields in the same order, with only the type differing.
6412 void netdev_stats_to_stats64(struct rtnl_link_stats64
*stats64
,
6413 const struct net_device_stats
*netdev_stats
)
6415 #if BITS_PER_LONG == 64
6416 BUILD_BUG_ON(sizeof(*stats64
) != sizeof(*netdev_stats
));
6417 memcpy(stats64
, netdev_stats
, sizeof(*stats64
));
6419 size_t i
, n
= sizeof(*stats64
) / sizeof(u64
);
6420 const unsigned long *src
= (const unsigned long *)netdev_stats
;
6421 u64
*dst
= (u64
*)stats64
;
6423 BUILD_BUG_ON(sizeof(*netdev_stats
) / sizeof(unsigned long) !=
6424 sizeof(*stats64
) / sizeof(u64
));
6425 for (i
= 0; i
< n
; i
++)
6429 EXPORT_SYMBOL(netdev_stats_to_stats64
);
6432 * dev_get_stats - get network device statistics
6433 * @dev: device to get statistics from
6434 * @storage: place to store stats
6436 * Get network statistics from device. Return @storage.
6437 * The device driver may provide its own method by setting
6438 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6439 * otherwise the internal statistics structure is used.
6441 struct rtnl_link_stats64
*dev_get_stats(struct net_device
*dev
,
6442 struct rtnl_link_stats64
*storage
)
6444 const struct net_device_ops
*ops
= dev
->netdev_ops
;
6446 if (ops
->ndo_get_stats64
) {
6447 memset(storage
, 0, sizeof(*storage
));
6448 ops
->ndo_get_stats64(dev
, storage
);
6449 } else if (ops
->ndo_get_stats
) {
6450 netdev_stats_to_stats64(storage
, ops
->ndo_get_stats(dev
));
6452 netdev_stats_to_stats64(storage
, &dev
->stats
);
6454 storage
->rx_dropped
+= atomic_long_read(&dev
->rx_dropped
);
6457 EXPORT_SYMBOL(dev_get_stats
);
/* Return the device's ingress queue, creating it on first use when
 * CONFIG_NET_CLS_ACT is enabled; NULL on allocation failure.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (queue)
		return queue;
	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
	if (!queue)
		return NULL;
	netdev_init_one_queue(dev, queue, NULL);
	queue->qdisc = &noop_qdisc;
	queue->qdisc_sleeping = &noop_qdisc;
	rcu_assign_pointer(dev->ingress_queue, queue);
#endif
	return queue;
}
6477 static const struct ethtool_ops default_ethtool_ops
;
6479 void netdev_set_default_ethtool_ops(struct net_device
*dev
,
6480 const struct ethtool_ops
*ops
)
6482 if (dev
->ethtool_ops
== &default_ethtool_ops
)
6483 dev
->ethtool_ops
= ops
;
6485 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops
);
6488 * alloc_netdev_mqs - allocate network device
6489 * @sizeof_priv: size of private data to allocate space for
6490 * @name: device name format string
6491 * @setup: callback to initialize device
6492 * @txqs: the number of TX subqueues to allocate
6493 * @rxqs: the number of RX subqueues to allocate
6495 * Allocates a struct net_device with private data area for driver use
6496 * and performs basic initialization. Also allocates subquue structs
6497 * for each queue on the device.
6499 struct net_device
*alloc_netdev_mqs(int sizeof_priv
, const char *name
,
6500 void (*setup
)(struct net_device
*),
6501 unsigned int txqs
, unsigned int rxqs
)
6503 struct net_device
*dev
;
6505 struct net_device
*p
;
6507 BUG_ON(strlen(name
) >= sizeof(dev
->name
));
6510 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6516 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6521 alloc_size
= sizeof(struct net_device
);
6523 /* ensure 32-byte alignment of private area */
6524 alloc_size
= ALIGN(alloc_size
, NETDEV_ALIGN
);
6525 alloc_size
+= sizeof_priv
;
6527 /* ensure 32-byte alignment of whole construct */
6528 alloc_size
+= NETDEV_ALIGN
- 1;
6530 p
= kzalloc(alloc_size
, GFP_KERNEL
);
6534 dev
= PTR_ALIGN(p
, NETDEV_ALIGN
);
6535 dev
->padded
= (char *)dev
- (char *)p
;
6537 dev
->pcpu_refcnt
= alloc_percpu(int);
6538 if (!dev
->pcpu_refcnt
)
6541 if (dev_addr_init(dev
))
6547 dev_net_set(dev
, &init_net
);
6549 dev
->gso_max_size
= GSO_MAX_SIZE
;
6550 dev
->gso_max_segs
= GSO_MAX_SEGS
;
6552 INIT_LIST_HEAD(&dev
->napi_list
);
6553 INIT_LIST_HEAD(&dev
->unreg_list
);
6554 INIT_LIST_HEAD(&dev
->link_watch_list
);
6555 INIT_LIST_HEAD(&dev
->upper_dev_list
);
6556 dev
->priv_flags
= IFF_XMIT_DST_RELEASE
;
6559 dev
->num_tx_queues
= txqs
;
6560 dev
->real_num_tx_queues
= txqs
;
6561 if (netif_alloc_netdev_queues(dev
))
6565 dev
->num_rx_queues
= rxqs
;
6566 dev
->real_num_rx_queues
= rxqs
;
6567 if (netif_alloc_rx_queues(dev
))
6571 strcpy(dev
->name
, name
);
6572 dev
->group
= INIT_NETDEV_GROUP
;
6573 if (!dev
->ethtool_ops
)
6574 dev
->ethtool_ops
= &default_ethtool_ops
;
6582 free_percpu(dev
->pcpu_refcnt
);
6592 EXPORT_SYMBOL(alloc_netdev_mqs
);
6595 * free_netdev - free network device
6598 * This function does the last stage of destroying an allocated device
6599 * interface. The reference to the device object is released.
6600 * If this is the last reference then it will be freed.
6602 void free_netdev(struct net_device
*dev
)
6604 struct napi_struct
*p
, *n
;
6606 release_net(dev_net(dev
));
6613 kfree(rcu_dereference_protected(dev
->ingress_queue
, 1));
6615 /* Flush device addresses */
6616 dev_addr_flush(dev
);
6618 list_for_each_entry_safe(p
, n
, &dev
->napi_list
, dev_list
)
6621 free_percpu(dev
->pcpu_refcnt
);
6622 dev
->pcpu_refcnt
= NULL
;
6624 /* Compatibility with error handling in drivers */
6625 if (dev
->reg_state
== NETREG_UNINITIALIZED
) {
6626 kfree((char *)dev
- dev
->padded
);
6630 BUG_ON(dev
->reg_state
!= NETREG_UNREGISTERED
);
6631 dev
->reg_state
= NETREG_RELEASED
;
6633 /* will free via device release */
6634 put_device(&dev
->dev
);
6636 EXPORT_SYMBOL(free_netdev
);
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 */
void synchronize_net(void)
{
	might_sleep();
	/* Under RTNL use the expedited variant to keep lock hold time low. */
	if (rtnl_is_locked())
		synchronize_rcu_expedited();
	else
		synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);
6655 * unregister_netdevice_queue - remove device from the kernel
6659 * This function shuts down a device interface and removes it
6660 * from the kernel tables.
6661 * If head not NULL, device is queued to be unregistered later.
6663 * Callers must hold the rtnl semaphore. You may want
6664 * unregister_netdev() instead of this.
6667 void unregister_netdevice_queue(struct net_device
*dev
, struct list_head
*head
)
6672 list_move_tail(&dev
->unreg_list
, head
);
6674 rollback_registered(dev
);
6675 /* Finish processing unregister after unlock */
6679 EXPORT_SYMBOL(unregister_netdevice_queue
);
6682 * unregister_netdevice_many - unregister many devices
6683 * @head: list of devices
6685 void unregister_netdevice_many(struct list_head
*head
)
6687 struct net_device
*dev
;
6689 if (!list_empty(head
)) {
6690 rollback_registered_many(head
);
6691 list_for_each_entry(dev
, head
, unreg_list
)
6695 EXPORT_SYMBOL(unregister_netdevice_many
);
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	This is just a wrapper for unregister_netdevice that takes
 *	the rtnl semaphore.  In general you want to use this and not
 *	unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
6717 * dev_change_net_namespace - move device to different nethost namespace
6719 * @net: network namespace
6720 * @pat: If not NULL name pattern to try if the current device name
6721 * is already taken in the destination network namespace.
6723 * This function shuts down a device interface and moves it
6724 * to a new network namespace. On success 0 is returned, on
6725 * a failure a netagive errno code is returned.
6727 * Callers must hold the rtnl semaphore.
6730 int dev_change_net_namespace(struct net_device
*dev
, struct net
*net
, const char *pat
)
6736 /* Don't allow namespace local devices to be moved. */
6738 if (dev
->features
& NETIF_F_NETNS_LOCAL
)
6741 /* Ensure the device has been registrered */
6742 if (dev
->reg_state
!= NETREG_REGISTERED
)
6745 /* Get out if there is nothing todo */
6747 if (net_eq(dev_net(dev
), net
))
6750 /* Pick the destination device name, and ensure
6751 * we can use it in the destination network namespace.
6754 if (__dev_get_by_name(net
, dev
->name
)) {
6755 /* We get here if we can't use the current device name */
6758 if (dev_get_valid_name(net
, dev
, pat
) < 0)
6763 * And now a mini version of register_netdevice unregister_netdevice.
6766 /* If device is running close it first. */
6769 /* And unlink it from device chain */
6771 unlist_netdevice(dev
);
6775 /* Shutdown queueing discipline. */
6778 /* Notify protocols, that we are about to destroy
6779 this device. They should clean all the things.
6781 Note that dev->reg_state stays at NETREG_REGISTERED.
6782 This is wanted because this way 8021q and macvlan know
6783 the device is just moving and can keep their slaves up.
6785 call_netdevice_notifiers(NETDEV_UNREGISTER
, dev
);
6787 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL
, dev
);
6788 rtmsg_ifinfo(RTM_DELLINK
, dev
, ~0U);
6791 * Flush the unicast and multicast chains
6796 /* Send a netdev-removed uevent to the old namespace */
6797 kobject_uevent(&dev
->dev
.kobj
, KOBJ_REMOVE
);
6799 /* Actually switch the network namespace */
6800 dev_net_set(dev
, net
);
6802 /* If there is an ifindex conflict assign a new one */
6803 if (__dev_get_by_index(net
, dev
->ifindex
)) {
6804 int iflink
= (dev
->iflink
== dev
->ifindex
);
6805 dev
->ifindex
= dev_new_index(net
);
6807 dev
->iflink
= dev
->ifindex
;
6810 /* Send a netdev-add uevent to the new namespace */
6811 kobject_uevent(&dev
->dev
.kobj
, KOBJ_ADD
);
6813 /* Fixup kobjects */
6814 err
= device_rename(&dev
->dev
, dev
->name
);
6817 /* Add the device back in the hashes */
6818 list_netdevice(dev
);
6820 /* Notify protocols, that a new device appeared. */
6821 call_netdevice_notifiers(NETDEV_REGISTER
, dev
);
6824 * Prevent userspace races by waiting until the network
6825 * device is fully setup before sending notifications.
6827 rtmsg_ifinfo(RTM_NEWLINK
, dev
, ~0U);
6834 EXPORT_SYMBOL_GPL(dev_change_net_namespace
);
/*
 * CPU hotplug notifier: when a CPU goes offline, splice its per-cpu
 * softnet state (completion queue, qdisc output queue, NAPI poll list
 * and pending input packets) onto the CPU running this callback so no
 * work is stranded on the dead CPU.
 */
static int dev_cpu_callback(struct notifier_block *nfb,
			    unsigned long action,
			    void *ocpu)
{
	struct sk_buff **list_skb;
	struct sk_buff *skb;
	unsigned int cpu, oldcpu = (unsigned long)ocpu;
	struct softnet_data *sd, *oldsd;

	/* Only act once the CPU is fully dead (normal or frozen path). */
	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return NOTIFY_OK;

	/* IRQs off: these queues are otherwise manipulated from irq context. */
	local_irq_disable();
	cpu = smp_processor_id();
	sd = &per_cpu(softnet_data, cpu);
	oldsd = &per_cpu(softnet_data, oldcpu);

	/* Find end of our completion_queue. */
	list_skb = &sd->completion_queue;
	while (*list_skb)
		list_skb = &(*list_skb)->next;
	/* Append completion queue from offline CPU. */
	*list_skb = oldsd->completion_queue;
	oldsd->completion_queue = NULL;

	/* Append output queue from offline CPU. */
	if (oldsd->output_queue) {
		*sd->output_queue_tailp = oldsd->output_queue;
		sd->output_queue_tailp = oldsd->output_queue_tailp;
		oldsd->output_queue = NULL;
		/* Reset the dead CPU's tail pointer to its own empty list. */
		oldsd->output_queue_tailp = &oldsd->output_queue;
	}
	/* Append NAPI poll list from offline CPU. */
	if (!list_empty(&oldsd->poll_list)) {
		list_splice_init(&oldsd->poll_list, &sd->poll_list);
		raise_softirq_irqoff(NET_RX_SOFTIRQ);
	}

	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_enable();

	/* Process offline CPU's input_pkt_queue */
	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
		netif_rx(skb);
		input_queue_head_incr(oldsd);
	}
	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
		netif_rx(skb);
		input_queue_head_incr(oldsd);
	}

	return NOTIFY_OK;
}
6892 * netdev_increment_features - increment feature set by one
6893 * @all: current feature set
6894 * @one: new feature set
6895 * @mask: mask feature set
6897 * Computes a new feature set after adding a device with feature set
6898 * @one to the master device with current feature set @all. Will not
6899 * enable anything that is off in @mask. Returns the new feature set.
6901 netdev_features_t
netdev_increment_features(netdev_features_t all
,
6902 netdev_features_t one
, netdev_features_t mask
)
6904 if (mask
& NETIF_F_GEN_CSUM
)
6905 mask
|= NETIF_F_ALL_CSUM
;
6906 mask
|= NETIF_F_VLAN_CHALLENGED
;
6908 all
|= one
& (NETIF_F_ONE_FOR_ALL
|NETIF_F_ALL_CSUM
) & mask
;
6909 all
&= one
| ~NETIF_F_ALL_FOR_ALL
;
6911 /* If one device supports hw checksumming, set for all. */
6912 if (all
& NETIF_F_GEN_CSUM
)
6913 all
&= ~(NETIF_F_ALL_CSUM
& ~NETIF_F_GEN_CSUM
);
6917 EXPORT_SYMBOL(netdev_increment_features
);
6919 static struct hlist_head
*netdev_create_hash(void)
6922 struct hlist_head
*hash
;
6924 hash
= kmalloc(sizeof(*hash
) * NETDEV_HASHENTRIES
, GFP_KERNEL
);
6926 for (i
= 0; i
< NETDEV_HASHENTRIES
; i
++)
6927 INIT_HLIST_HEAD(&hash
[i
]);
/* Initialize per network namespace state */
static int __net_init netdev_init(struct net *net)
{
	/* init_net's dev_base_head is initialized statically at boot. */
	if (net != &init_net)
		INIT_LIST_HEAD(&net->dev_base_head);

	/* Hash of devices keyed by name (see dev_get_by_name()). */
	net->dev_name_head = netdev_create_hash();
	if (net->dev_name_head == NULL)
		goto err_name;

	/* Hash of devices keyed by ifindex (see dev_get_by_index()). */
	net->dev_index_head = netdev_create_hash();
	if (net->dev_index_head == NULL)
		goto err_idx;

	return 0;

err_idx:
	/* Second allocation failed: release the name hash from above. */
	kfree(net->dev_name_head);
err_name:
	return -ENOMEM;
}
6955 * netdev_drivername - network driver for the device
6956 * @dev: network device
6958 * Determine network driver for device.
6960 const char *netdev_drivername(const struct net_device
*dev
)
6962 const struct device_driver
*driver
;
6963 const struct device
*parent
;
6964 const char *empty
= "";
6966 parent
= dev
->dev
.parent
;
6970 driver
= parent
->driver
;
6971 if (driver
&& driver
->name
)
6972 return driver
->name
;
/*
 * Common helper behind netdev_printk() and the netdev_<level>() wrappers:
 * emit @vaf prefixed with as much device identity as is available.
 */
static int __netdev_printk(const char *level, const struct net_device *dev,
			   struct va_format *vaf)
{
	int r;

	if (dev && dev->dev.parent) {
		/*
		 * Full form via the driver-core path: driver and device name
		 * of the parent plus the interface name.  level[1] - '0'
		 * turns the KERN_<n> string into the numeric level that
		 * dev_printk_emit() expects.
		 */
		r = dev_printk_emit(level[1] - '0',
				    dev->dev.parent,
				    "%s %s %s: %pV",
				    dev_driver_string(dev->dev.parent),
				    dev_name(dev->dev.parent),
				    netdev_name(dev), vaf);
	} else if (dev) {
		/* No parent device: just the interface name. */
		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
	} else {
		r = printk("%s(NULL net_device): %pV", level, vaf);
	}

	return r;
}
/*
 * printk() with network device identification prepended.  Packs the
 * varargs into a struct va_format so they can be forwarded to
 * __netdev_printk() as a single %pV argument.
 */
int netdev_printk(const char *level, const struct net_device *dev,
		  const char *format, ...)
{
	struct va_format vaf;
	va_list args;
	int r;

	va_start(args, format);

	vaf.fmt = format;
	vaf.va = &args;

	r = __netdev_printk(level, dev, &vaf);

	va_end(args);

	return r;
}
EXPORT_SYMBOL(netdev_printk);
/*
 * Generate the netdev_emerg()/netdev_alert()/.../netdev_info() wrappers:
 * each expands to a printk-style function forwarding to __netdev_printk()
 * at a fixed log level.  A macro keeps the seven variants in sync.
 */
#define define_netdev_printk_level(func, level)			\
int func(const struct net_device *dev, const char *fmt, ...)	\
{								\
	int r;							\
	struct va_format vaf;					\
	va_list args;						\
								\
	va_start(args, fmt);					\
								\
	vaf.fmt = fmt;						\
	vaf.va = &args;						\
								\
	r = __netdev_printk(level, dev, &vaf);			\
								\
	va_end(args);						\
								\
	return r;						\
}								\
EXPORT_SYMBOL(func);

define_netdev_printk_level(netdev_emerg, KERN_EMERG);
define_netdev_printk_level(netdev_alert, KERN_ALERT);
define_netdev_printk_level(netdev_crit, KERN_CRIT);
define_netdev_printk_level(netdev_err, KERN_ERR);
define_netdev_printk_level(netdev_warn, KERN_WARNING);
define_netdev_printk_level(netdev_notice, KERN_NOTICE);
define_netdev_printk_level(netdev_info, KERN_INFO);
/*
 * Per-namespace teardown: free the name/index hash tables allocated in
 * netdev_init().  All devices in the namespace are gone by the time the
 * pernet exit hook runs.
 */
static void __net_exit netdev_exit(struct net *net)
{
	kfree(net->dev_name_head);
	kfree(net->dev_index_head);
}
/* Hook per-namespace setup/teardown into net namespace lifetime. */
static struct pernet_operations __net_initdata netdev_net_ops = {
	.init = netdev_init,
	.exit = netdev_exit,
};
/*
 * Namespace exit: move every physical device that is neither
 * namespace-local nor rtnl_link-managed back into init_net under a
 * fallback "dev%d" name, so hardware is never destroyed with its
 * namespace.
 */
static void __net_exit default_device_exit(struct net *net)
{
	struct net_device *dev, *aux;
	/*
	 * Push all migratable network devices back to the
	 * initial network namespace
	 */
	rtnl_lock();
	for_each_netdev_safe(net, dev, aux) {
		int err;
		char fb_name[IFNAMSIZ];

		/* Ignore unmoveable devices (i.e. loopback) */
		if (dev->features & NETIF_F_NETNS_LOCAL)
			continue;

		/* Leave virtual devices for the generic cleanup */
		if (dev->rtnl_link_ops)
			continue;

		/* Push remaining network devices to init_net */
		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
		err = dev_change_net_namespace(dev, &init_net, fb_name);
		if (err) {
			/* Losing a physical device here is unrecoverable. */
			pr_emerg("%s: failed to move %s to init_net: %d\n",
				 __func__, dev->name, err);
			BUG();
		}
	}
	rtnl_unlock();
}
static void __net_exit default_device_exit_batch(struct list_head *net_list)
{
	/* At exit all network devices must be removed from a network
	 * namespace.  Do this in the reverse order of registration.
	 * Do this across as many network namespaces as possible to
	 * improve batching efficiency.
	 */
	struct net_device *dev;
	struct net *net;
	LIST_HEAD(dev_kill_list);

	rtnl_lock();
	list_for_each_entry(net, net_list, exit_list) {
		for_each_netdev_reverse(net, dev) {
			/* Let the link type's own dellink run when it has one;
			 * otherwise queue for plain unregistration.  Either way
			 * the device lands on dev_kill_list for one batch pass.
			 */
			if (dev->rtnl_link_ops)
				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
			else
				unregister_netdevice_queue(dev, &dev_kill_list);
		}
	}
	/* Single batched unregister: one rcu_barrier etc. for all devices. */
	unregister_netdevice_many(&dev_kill_list);
	list_del(&dev_kill_list);
	rtnl_unlock();
}
/* Registered last so its exit hooks run first on namespace teardown. */
static struct pernet_operations __net_initdata default_device_ops = {
	.exit = default_device_exit,
	.exit_batch = default_device_exit_batch,
};
/*
 *	Initialize the DEV module. At boot time this walks the device list and
 *	unhooks any devices that fail to initialise (normally hardware not
 *	present) and leaves us with a valid list of present and active devices.
 *
 */

/*
 *       This is called single threaded during boot, so no need
 *       to take the rtnl semaphore.
 */
static int __init net_dev_init(void)
{
	int i, rc = -ENOMEM;

	/* Must run before any netdevice registration has happened. */
	BUG_ON(!dev_boot_phase);

	if (dev_proc_init())
		goto out;

	if (netdev_kobject_init())
		goto out;

	INIT_LIST_HEAD(&ptype_all);
	for (i = 0; i < PTYPE_HASH_SIZE; i++)
		INIT_LIST_HEAD(&ptype_base[i]);

	INIT_LIST_HEAD(&offload_base);

	if (register_pernet_subsys(&netdev_net_ops))
		goto out;

	/*
	 *	Initialise the packet receive queues.
	 */

	for_each_possible_cpu(i) {
		struct softnet_data *sd = &per_cpu(softnet_data, i);

		memset(sd, 0, sizeof(*sd));
		skb_queue_head_init(&sd->input_pkt_queue);
		skb_queue_head_init(&sd->process_queue);
		sd->completion_queue = NULL;
		INIT_LIST_HEAD(&sd->poll_list);
		sd->output_queue = NULL;
		/* Empty output queue: tail pointer points at the head. */
		sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
		/* IPI used by RPS to kick remote CPUs' softirq processing. */
		sd->csd.func = rps_trigger_softirq;
		sd->csd.info = sd;
		sd->cpu = i;
#endif

		/* Per-cpu backlog NAPI for non-NAPI (netif_rx) drivers. */
		sd->backlog.poll = process_backlog;
		sd->backlog.weight = weight_p;
		sd->backlog.gro_list = NULL;
		sd->backlog.gro_count = 0;
	}

	dev_boot_phase = 0;

	/* The loopback device is special if any other network devices
	 * is present in a network namespace the loopback device must
	 * be present. Since we now dynamically allocate and free the
	 * loopback device ensure this invariant is maintained by
	 * keeping the loopback device as the first device on the
	 * list of network devices.  Ensuring the loopback devices
	 * is the first device that appears and the last network device
	 * that disappears.
	 */
	if (register_pernet_device(&loopback_net_ops))
		goto out;

	if (register_pernet_device(&default_device_ops))
		goto out;

	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action);

	hotcpu_notifier(dev_cpu_callback, 0);
	dst_init();
	rc = 0;
out:
	return rc;
}

subsys_initcall(net_dev_init);