#include <linux/etherdevice.h>
#include <linux/if_macvlan.h>
#include <linux/interrupt.h>
#include <linux/nsproxy.h>
#include <linux/compat.h>
#include <linux/if_tun.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/cache.h>
#include <linux/sched.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/wait.h>
#include <linux/cdev.h>
#include <linux/fs.h>

#include <net/net_namespace.h>
#include <net/rtnetlink.h>
#include <net/sock.h>
22 * A macvtap queue is the central object of this driver, it connects
23 * an open character device to a macvlan interface. There can be
24 * multiple queues on one interface, which map back to queues
25 * implemented in hardware on the underlying device.
27 * macvtap_proto is used to allocate queues through the sock allocation
30 * TODO: multiqueue support is currently not implemented, even though
31 * macvtap is basically prepared for that. We will need to add this
32 * here as well as in virtio-net and qemu to get line rate on 10gbit
33 * adapters from a guest.
35 struct macvtap_queue
{
38 struct macvlan_dev
*vlan
;
42 static struct proto macvtap_proto
= {
45 .obj_size
= sizeof (struct macvtap_queue
),
49 * Minor number matches netdev->ifindex, so need a potentially
50 * large value. This also makes it possible to split the
51 * tap functionality out again in the future by offering it
52 * from other drivers besides macvtap. As long as every device
53 * only has one tap, the interface numbers assure that the
54 * device nodes are unique.
56 static unsigned int macvtap_major
;
57 #define MACVTAP_NUM_DEVS 65536
58 static struct class *macvtap_class
;
59 static struct cdev macvtap_cdev
;
/*
 * RCU usage:
 * The macvtap_queue is referenced both from the chardev struct file
 * and from the struct macvlan_dev using rcu_read_lock.
 *
 * We never actually update the contents of a macvtap_queue atomically
 * with RCU but it is used for race-free destruction of a queue when
 * either the file or the macvlan_dev goes away. Pointers back to
 * the dev and the file are implicitly valid as long as the queue
 * exists.
 *
 * The callbacks from macvlan are always done with rcu_read_lock held
 * already. For calls from file_operations, we use the rcu_read_lock_bh
 * to get a reference count on the socket and the device.
 *
 * When destroying a queue, we remove the pointers from the file and
 * from the dev and then synchronize_rcu to make sure no thread is
 * still using the queue. There may still be references to the struct
 * sock inside of the queue from outbound SKBs, but these never
 * reference back to the file or the dev. The data structure is freed
 * through __sk_free when both our references and any pending SKBs
 * are gone.
 *
 * macvtap_lock is only used to prevent multiple concurrent open()
 * calls to assign a new vlan->tap pointer. It could be moved into
 * the macvlan_dev itself but is extremely rarely used.
 */
static DEFINE_SPINLOCK(macvtap_lock);
91 * Choose the next free queue, for now there is only one
93 static int macvtap_set_queue(struct net_device
*dev
, struct file
*file
,
94 struct macvtap_queue
*q
)
96 struct macvlan_dev
*vlan
= netdev_priv(dev
);
99 spin_lock(&macvtap_lock
);
100 if (rcu_dereference(vlan
->tap
))
105 rcu_assign_pointer(vlan
->tap
, q
);
108 rcu_assign_pointer(file
->private_data
, q
);
111 spin_unlock(&macvtap_lock
);
116 * We must destroy each queue exactly once, when either
117 * the netdev or the file go away.
119 * Using the spinlock makes sure that we don't get
120 * to the queue again after destroying it.
122 * synchronize_rcu serializes with the packet flow
123 * that uses rcu_read_lock.
125 static void macvtap_del_queue(struct macvtap_queue
**qp
)
127 struct macvtap_queue
*q
;
129 spin_lock(&macvtap_lock
);
130 q
= rcu_dereference(*qp
);
132 spin_unlock(&macvtap_lock
);
136 rcu_assign_pointer(q
->vlan
->tap
, NULL
);
137 rcu_assign_pointer(q
->file
->private_data
, NULL
);
138 spin_unlock(&macvtap_lock
);
145 * Since we only support one queue, just dereference the pointer.
147 static struct macvtap_queue
*macvtap_get_queue(struct net_device
*dev
,
150 struct macvlan_dev
*vlan
= netdev_priv(dev
);
152 return rcu_dereference(vlan
->tap
);
155 static void macvtap_del_queues(struct net_device
*dev
)
157 struct macvlan_dev
*vlan
= netdev_priv(dev
);
158 macvtap_del_queue(&vlan
->tap
);
161 static inline struct macvtap_queue
*macvtap_file_get_queue(struct file
*file
)
163 struct macvtap_queue
*q
;
165 q
= rcu_dereference(file
->private_data
);
168 dev_hold(q
->vlan
->dev
);
170 rcu_read_unlock_bh();
174 static inline void macvtap_file_put_queue(struct macvtap_queue
*q
)
177 dev_put(q
->vlan
->dev
);
181 * Forward happens for data that gets sent from one macvlan
182 * endpoint to another one in bridge mode. We just take
183 * the skb and put it into the receive queue.
185 static int macvtap_forward(struct net_device
*dev
, struct sk_buff
*skb
)
187 struct macvtap_queue
*q
= macvtap_get_queue(dev
, skb
);
191 skb_queue_tail(&q
->sk
.sk_receive_queue
, skb
);
192 wake_up(q
->sk
.sk_sleep
);
197 * Receive is for data from the external interface (lowerdev),
198 * in case of macvtap, we can treat that the same way as
199 * forward, which macvlan cannot.
201 static int macvtap_receive(struct sk_buff
*skb
)
203 skb_push(skb
, ETH_HLEN
);
204 return macvtap_forward(skb
->dev
, skb
);
207 static int macvtap_newlink(struct net
*src_net
,
208 struct net_device
*dev
,
210 struct nlattr
*data
[])
212 struct device
*classdev
;
216 err
= macvlan_common_newlink(src_net
, dev
, tb
, data
,
217 macvtap_receive
, macvtap_forward
);
221 devt
= MKDEV(MAJOR(macvtap_major
), dev
->ifindex
);
223 classdev
= device_create(macvtap_class
, &dev
->dev
, devt
,
224 dev
, "tap%d", dev
->ifindex
);
225 if (IS_ERR(classdev
)) {
226 err
= PTR_ERR(classdev
);
227 macvtap_del_queues(dev
);
234 static void macvtap_dellink(struct net_device
*dev
,
235 struct list_head
*head
)
237 device_destroy(macvtap_class
,
238 MKDEV(MAJOR(macvtap_major
), dev
->ifindex
));
240 macvtap_del_queues(dev
);
241 macvlan_dellink(dev
, head
);
244 static struct rtnl_link_ops macvtap_link_ops __read_mostly
= {
246 .newlink
= macvtap_newlink
,
247 .dellink
= macvtap_dellink
,
251 static void macvtap_sock_write_space(struct sock
*sk
)
253 if (!sock_writeable(sk
) ||
254 !test_and_clear_bit(SOCK_ASYNC_NOSPACE
, &sk
->sk_socket
->flags
))
257 if (sk
->sk_sleep
&& waitqueue_active(sk
->sk_sleep
))
258 wake_up_interruptible_sync(sk
->sk_sleep
);
261 static int macvtap_open(struct inode
*inode
, struct file
*file
)
263 struct net
*net
= current
->nsproxy
->net_ns
;
264 struct net_device
*dev
= dev_get_by_index(net
, iminor(inode
));
265 struct macvtap_queue
*q
;
272 /* check if this is a macvtap device */
274 if (dev
->rtnl_link_ops
!= &macvtap_link_ops
)
278 q
= (struct macvtap_queue
*)sk_alloc(net
, AF_UNSPEC
, GFP_KERNEL
,
283 init_waitqueue_head(&q
->sock
.wait
);
284 q
->sock
.type
= SOCK_RAW
;
285 q
->sock
.state
= SS_CONNECTED
;
286 sock_init_data(&q
->sock
, &q
->sk
);
287 q
->sk
.sk_allocation
= GFP_ATOMIC
; /* for now */
288 q
->sk
.sk_write_space
= macvtap_sock_write_space
;
290 err
= macvtap_set_queue(dev
, file
, q
);
301 static int macvtap_release(struct inode
*inode
, struct file
*file
)
303 macvtap_del_queue((struct macvtap_queue
**)&file
->private_data
);
307 static unsigned int macvtap_poll(struct file
*file
, poll_table
* wait
)
309 struct macvtap_queue
*q
= macvtap_file_get_queue(file
);
310 unsigned int mask
= POLLERR
;
316 poll_wait(file
, &q
->sock
.wait
, wait
);
318 if (!skb_queue_empty(&q
->sk
.sk_receive_queue
))
319 mask
|= POLLIN
| POLLRDNORM
;
321 if (sock_writeable(&q
->sk
) ||
322 (!test_and_set_bit(SOCK_ASYNC_NOSPACE
, &q
->sock
.flags
) &&
323 sock_writeable(&q
->sk
)))
324 mask
|= POLLOUT
| POLLWRNORM
;
326 macvtap_file_put_queue(q
);
331 /* Get packet from user space buffer */
332 static ssize_t
macvtap_get_user(struct macvtap_queue
*q
,
333 const struct iovec
*iv
, size_t count
,
340 if (unlikely(len
< ETH_HLEN
))
343 skb
= sock_alloc_send_skb(&q
->sk
, NET_IP_ALIGN
+ len
, noblock
, &err
);
346 macvlan_count_rx(q
->vlan
, 0, false, false);
350 skb_reserve(skb
, NET_IP_ALIGN
);
353 if (skb_copy_datagram_from_iovec(skb
, 0, iv
, 0, len
)) {
354 macvlan_count_rx(q
->vlan
, 0, false, false);
359 skb_set_network_header(skb
, ETH_HLEN
);
361 macvlan_start_xmit(skb
, q
->vlan
->dev
);
366 static ssize_t
macvtap_aio_write(struct kiocb
*iocb
, const struct iovec
*iv
,
367 unsigned long count
, loff_t pos
)
369 struct file
*file
= iocb
->ki_filp
;
370 ssize_t result
= -ENOLINK
;
371 struct macvtap_queue
*q
= macvtap_file_get_queue(file
);
376 result
= macvtap_get_user(q
, iv
, iov_length(iv
, count
),
377 file
->f_flags
& O_NONBLOCK
);
378 macvtap_file_put_queue(q
);
383 /* Put packet to the user space buffer */
384 static ssize_t
macvtap_put_user(struct macvtap_queue
*q
,
385 const struct sk_buff
*skb
,
386 const struct iovec
*iv
, int len
)
388 struct macvlan_dev
*vlan
= q
->vlan
;
391 len
= min_t(int, skb
->len
, len
);
393 ret
= skb_copy_datagram_const_iovec(skb
, 0, iv
, 0, len
);
395 macvlan_count_rx(vlan
, len
, ret
== 0, 0);
397 return ret
? ret
: len
;
400 static ssize_t
macvtap_aio_read(struct kiocb
*iocb
, const struct iovec
*iv
,
401 unsigned long count
, loff_t pos
)
403 struct file
*file
= iocb
->ki_filp
;
404 struct macvtap_queue
*q
= macvtap_file_get_queue(file
);
406 DECLARE_WAITQUEUE(wait
, current
);
408 ssize_t len
, ret
= 0;
413 len
= iov_length(iv
, count
);
419 add_wait_queue(q
->sk
.sk_sleep
, &wait
);
421 current
->state
= TASK_INTERRUPTIBLE
;
423 /* Read frames from the queue */
424 skb
= skb_dequeue(&q
->sk
.sk_receive_queue
);
426 if (file
->f_flags
& O_NONBLOCK
) {
430 if (signal_pending(current
)) {
434 /* Nothing to read, let's sleep */
438 ret
= macvtap_put_user(q
, skb
, iv
, len
);
443 current
->state
= TASK_RUNNING
;
444 remove_wait_queue(q
->sk
.sk_sleep
, &wait
);
447 macvtap_file_put_queue(q
);
452 * provide compatibility with generic tun/tap interface
454 static long macvtap_ioctl(struct file
*file
, unsigned int cmd
,
457 struct macvtap_queue
*q
;
458 void __user
*argp
= (void __user
*)arg
;
459 struct ifreq __user
*ifr
= argp
;
460 unsigned int __user
*up
= argp
;
462 char devname
[IFNAMSIZ
];
466 /* ignore the name, just look at flags */
467 if (get_user(u
, &ifr
->ifr_flags
))
469 if (u
!= (IFF_TAP
| IFF_NO_PI
))
474 q
= macvtap_file_get_queue(file
);
477 memcpy(devname
, q
->vlan
->dev
->name
, sizeof(devname
));
478 macvtap_file_put_queue(q
);
480 if (copy_to_user(&ifr
->ifr_name
, q
->vlan
->dev
->name
, IFNAMSIZ
) ||
481 put_user((TUN_TAP_DEV
| TUN_NO_PI
), &ifr
->ifr_flags
))
486 if (put_user((IFF_TAP
| IFF_NO_PI
), up
))
494 q
= macvtap_file_get_queue(file
);
498 macvtap_file_put_queue(q
);
502 /* let the user check for future flags */
503 if (arg
& ~(TUN_F_CSUM
| TUN_F_TSO4
| TUN_F_TSO6
|
504 TUN_F_TSO_ECN
| TUN_F_UFO
))
507 /* TODO: add support for these, so far we don't
508 support any offload */
509 if (arg
& (TUN_F_CSUM
| TUN_F_TSO4
| TUN_F_TSO6
|
510 TUN_F_TSO_ECN
| TUN_F_UFO
))
521 static long macvtap_compat_ioctl(struct file
*file
, unsigned int cmd
,
524 return macvtap_ioctl(file
, cmd
, (unsigned long)compat_ptr(arg
));
528 static const struct file_operations macvtap_fops
= {
529 .owner
= THIS_MODULE
,
530 .open
= macvtap_open
,
531 .release
= macvtap_release
,
532 .aio_read
= macvtap_aio_read
,
533 .aio_write
= macvtap_aio_write
,
534 .poll
= macvtap_poll
,
536 .unlocked_ioctl
= macvtap_ioctl
,
538 .compat_ioctl
= macvtap_compat_ioctl
,
542 static int macvtap_init(void)
546 err
= alloc_chrdev_region(&macvtap_major
, 0,
547 MACVTAP_NUM_DEVS
, "macvtap");
551 cdev_init(&macvtap_cdev
, &macvtap_fops
);
552 err
= cdev_add(&macvtap_cdev
, macvtap_major
, MACVTAP_NUM_DEVS
);
556 macvtap_class
= class_create(THIS_MODULE
, "macvtap");
557 if (IS_ERR(macvtap_class
)) {
558 err
= PTR_ERR(macvtap_class
);
562 err
= macvlan_link_register(&macvtap_link_ops
);
569 class_unregister(macvtap_class
);
571 cdev_del(&macvtap_cdev
);
573 unregister_chrdev_region(macvtap_major
, MACVTAP_NUM_DEVS
);
577 module_init(macvtap_init
);
579 static void macvtap_exit(void)
581 rtnl_link_unregister(&macvtap_link_ops
);
582 class_unregister(macvtap_class
);
583 cdev_del(&macvtap_cdev
);
584 unregister_chrdev_region(macvtap_major
, MACVTAP_NUM_DEVS
);
586 module_exit(macvtap_exit
);
588 MODULE_ALIAS_RTNL_LINK("macvtap");
589 MODULE_AUTHOR("Arnd Bergmann <arnd@arndb.de>");
590 MODULE_LICENSE("GPL");