15325 bhyve upstream sync 2023 January
[illumos-gate.git] / usr / src / cmd / bhyve / net_backends.c
blob1ef17a8e89c77956cab447e90ae924c788277218
1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
4 * Copyright (c) 2019 Vincenzo Maffione <vmaffione@FreeBSD.org>
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
19 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
20 * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
21 * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
24 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
25 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 * $FreeBSD$
31 * This file implements multiple network backends (tap, netmap, ...),
32 * to be used by network frontends such as virtio-net and e1000.
33 * The API to access the backend (e.g. send/receive packets, negotiate
34 * features) is exported by net_backends.h.
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
40 #include <sys/types.h> /* u_short etc */
41 #ifndef WITHOUT_CAPSICUM
42 #include <sys/capsicum.h>
43 #endif
44 #include <sys/ioctl.h>
45 #include <sys/mman.h>
46 #include <sys/uio.h>
48 #include <net/if.h>
49 #ifdef __FreeBSD__
50 #if defined(INET6) || defined(INET)
51 #include <net/if_tap.h>
52 #endif
53 #include <net/netmap.h>
54 #include <net/netmap_virt.h>
55 #define NETMAP_WITH_LIBS
56 #include <net/netmap_user.h>
57 #endif /* __FreeBSD__ */
59 #ifndef WITHOUT_CAPSICUM
60 #include <capsicum_helpers.h>
61 #endif
62 #include <err.h>
63 #include <errno.h>
64 #include <fcntl.h>
65 #include <stdio.h>
66 #include <stdlib.h>
67 #include <stdint.h>
68 #include <string.h>
69 #include <unistd.h>
70 #include <sysexits.h>
71 #include <assert.h>
72 #include <pthread.h>
73 #include <pthread_np.h>
74 #include <poll.h>
75 #include <assert.h>
77 #ifdef NETGRAPH
78 #include <sys/param.h>
79 #include <sys/sysctl.h>
80 #include <netgraph.h>
81 #endif
83 #ifndef __FreeBSD__
84 #include <libdlpi.h>
85 #include <net/ethernet.h>
86 #endif
88 #include "config.h"
89 #include "debug.h"
90 #include "iov.h"
91 #include "mevent.h"
92 #include "net_backends.h"
93 #include "pci_emul.h"
95 #include <sys/linker_set.h>
98 * Each network backend registers a set of function pointers that are
99 * used to implement the net backends API.
100 * This might need to be exposed if we implement backends in separate files.
102 struct net_backend {
103 const char *prefix; /* prefix matching this backend */
106 * Routines used to initialize and cleanup the resources needed
107 * by a backend. The cleanup function is used internally,
108 * and should not be called by the frontend.
110 int (*init)(struct net_backend *be, const char *devname,
111 nvlist_t *nvl, net_be_rxeof_t cb, void *param);
112 void (*cleanup)(struct net_backend *be);
115 * Called to serve a guest transmit request. The scatter-gather
116 * vector provided by the caller has 'iovcnt' elements and contains
117 * the packet to send.
119 ssize_t (*send)(struct net_backend *be, const struct iovec *iov,
120 int iovcnt);
123 * Get the length of the next packet that can be received from
124 * the backend. If no packets are currently available, this
125 * function returns 0.
127 ssize_t (*peek_recvlen)(struct net_backend *be);
130 * Called to receive a packet from the backend. When the function
131 * returns a positive value 'len', the scatter-gather vector
132 * provided by the caller contains a packet with such length.
133 * The function returns 0 if the backend doesn't have a new packet to
134 * receive.
136 ssize_t (*recv)(struct net_backend *be, const struct iovec *iov,
137 int iovcnt);
140 * Ask the backend to enable or disable receive operation in the
141 * backend. On return from a disable operation, it is guaranteed
142 * that the receive callback won't be called until receive is
143 * enabled again. Note however that it is up to the caller to make
144 * sure that netbe_recv() is not currently being executed by another
145 * thread.
147 void (*recv_enable)(struct net_backend *be);
148 void (*recv_disable)(struct net_backend *be);
151 * Ask the backend for the virtio-net features it is able to
152 * support. Possible features are TSO, UFO and checksum offloading
153 * in both rx and tx direction and for both IPv4 and IPv6.
155 uint64_t (*get_cap)(struct net_backend *be);
158 * Tell the backend to enable/disable the specified virtio-net
159 * features (capabilities).
161 int (*set_cap)(struct net_backend *be, uint64_t features,
162 unsigned int vnet_hdr_len);
164 #ifndef __FreeBSD__
165 int (*get_mac)(struct net_backend *be, void *, size_t *);
166 #endif
168 struct pci_vtnet_softc *sc;
169 int fd;
172 * Length of the virtio-net header used by the backend and the
173 * frontend, respectively. A zero value means that the header
174 * is not used.
176 unsigned int be_vnet_hdr_len;
177 unsigned int fe_vnet_hdr_len;
179 /* Size of backend-specific private data. */
180 size_t priv_size;
182 /* Backend-specific private data follows. */
185 #define NET_BE_PRIV(be) ((void *)((be) + 1))
186 #define NET_BE_SIZE(be) (sizeof(*be) + (be)->priv_size)
188 SET_DECLARE(net_backend_set, struct net_backend);
190 #define VNET_HDR_LEN sizeof(struct virtio_net_rxhdr)
192 #define WPRINTF(params) PRINTLN params
194 #ifdef __FreeBSD__
197 * The tap backend
#if defined(INET6) || defined(INET)
/*
 * Protocol families tried, in order, when creating the control socket
 * used to bring the tap interface UP.
 */
static const int pf_list[] = {
#if defined(INET6)
	PF_INET6,
#endif
#if defined(INET)
	PF_INET,
#endif
};
#endif
struct tap_priv {
	struct mevent *mevp;	/* read event registration */
	/*
	 * A bounce buffer that allows us to implement the peek_recvlen
	 * callback. In the future we may get the same information from
	 * the kevent data.
	 */
	char bbuf[1 << 16];
	ssize_t bbuflen;	/* length of the buffered packet, 0 if empty */
};
222 static void
223 tap_cleanup(struct net_backend *be)
225 struct tap_priv *priv = NET_BE_PRIV(be);
227 if (priv->mevp) {
228 mevent_delete(priv->mevp);
230 if (be->fd != -1) {
231 close(be->fd);
232 be->fd = -1;
236 static int
237 tap_init(struct net_backend *be, const char *devname,
238 nvlist_t *nvl __unused, net_be_rxeof_t cb, void *param)
240 struct tap_priv *priv = NET_BE_PRIV(be);
241 char tbuf[80];
242 int opt = 1;
243 #if defined(INET6) || defined(INET)
244 struct ifreq ifrq;
245 int s;
246 #endif
247 #ifndef WITHOUT_CAPSICUM
248 cap_rights_t rights;
249 #endif
251 if (cb == NULL) {
252 WPRINTF(("TAP backend requires non-NULL callback"));
253 return (-1);
256 strcpy(tbuf, "/dev/");
257 strlcat(tbuf, devname, sizeof(tbuf));
259 be->fd = open(tbuf, O_RDWR);
260 if (be->fd == -1) {
261 WPRINTF(("open of tap device %s failed", tbuf));
262 goto error;
266 * Set non-blocking and register for read
267 * notifications with the event loop
269 if (ioctl(be->fd, FIONBIO, &opt) < 0) {
270 WPRINTF(("tap device O_NONBLOCK failed"));
271 goto error;
274 #if defined(INET6) || defined(INET)
276 * Try to UP the interface rather than relying on
277 * net.link.tap.up_on_open.
279 bzero(&ifrq, sizeof(ifrq));
280 if (ioctl(be->fd, TAPGIFNAME, &ifrq) < 0) {
281 WPRINTF(("Could not get interface name"));
282 goto error;
285 s = -1;
286 for (size_t i = 0; s == -1 && i < nitems(pf_list); i++)
287 s = socket(pf_list[i], SOCK_DGRAM, 0);
288 if (s == -1) {
289 WPRINTF(("Could open socket"));
290 goto error;
293 if (ioctl(s, SIOCGIFFLAGS, &ifrq) < 0) {
294 (void)close(s);
295 WPRINTF(("Could not get interface flags"));
296 goto error;
298 ifrq.ifr_flags |= IFF_UP;
299 if (ioctl(s, SIOCSIFFLAGS, &ifrq) < 0) {
300 (void)close(s);
301 WPRINTF(("Could not set interface flags"));
302 goto error;
304 (void)close(s);
305 #endif
307 #ifndef WITHOUT_CAPSICUM
308 cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
309 if (caph_rights_limit(be->fd, &rights) == -1)
310 errx(EX_OSERR, "Unable to apply rights for sandbox");
311 #endif
313 memset(priv->bbuf, 0, sizeof(priv->bbuf));
314 priv->bbuflen = 0;
316 priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
317 if (priv->mevp == NULL) {
318 WPRINTF(("Could not register event"));
319 goto error;
322 return (0);
324 error:
325 tap_cleanup(be);
326 return (-1);
330 * Called to send a buffer chain out to the tap device
332 static ssize_t
333 tap_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
335 return (writev(be->fd, iov, iovcnt));
338 static ssize_t
339 tap_peek_recvlen(struct net_backend *be)
341 struct tap_priv *priv = NET_BE_PRIV(be);
342 ssize_t ret;
344 if (priv->bbuflen > 0) {
346 * We already have a packet in the bounce buffer.
347 * Just return its length.
349 return priv->bbuflen;
353 * Read the next packet (if any) into the bounce buffer, so
354 * that we get to know its length and we can return that
355 * to the caller.
357 ret = read(be->fd, priv->bbuf, sizeof(priv->bbuf));
358 if (ret < 0 && errno == EWOULDBLOCK) {
359 return (0);
362 if (ret > 0)
363 priv->bbuflen = ret;
365 return (ret);
368 static ssize_t
369 tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
371 struct tap_priv *priv = NET_BE_PRIV(be);
372 ssize_t ret;
374 if (priv->bbuflen > 0) {
376 * A packet is available in the bounce buffer, so
377 * we read it from there.
379 ret = buf_to_iov(priv->bbuf, priv->bbuflen,
380 iov, iovcnt, 0);
382 /* Mark the bounce buffer as empty. */
383 priv->bbuflen = 0;
385 return (ret);
388 ret = readv(be->fd, iov, iovcnt);
389 if (ret < 0 && errno == EWOULDBLOCK) {
390 return (0);
393 return (ret);
396 static void
397 tap_recv_enable(struct net_backend *be)
399 struct tap_priv *priv = NET_BE_PRIV(be);
401 mevent_enable(priv->mevp);
404 static void
405 tap_recv_disable(struct net_backend *be)
407 struct tap_priv *priv = NET_BE_PRIV(be);
409 mevent_disable(priv->mevp);
412 static uint64_t
413 tap_get_cap(struct net_backend *be __unused)
416 return (0); /* no capabilities for now */
419 static int
420 tap_set_cap(struct net_backend *be __unused, uint64_t features,
421 unsigned vnet_hdr_len)
424 return ((features || vnet_hdr_len) ? -1 : 0);
427 static struct net_backend tap_backend = {
428 .prefix = "tap",
429 .priv_size = sizeof(struct tap_priv),
430 .init = tap_init,
431 .cleanup = tap_cleanup,
432 .send = tap_send,
433 .peek_recvlen = tap_peek_recvlen,
434 .recv = tap_recv,
435 .recv_enable = tap_recv_enable,
436 .recv_disable = tap_recv_disable,
437 .get_cap = tap_get_cap,
438 .set_cap = tap_set_cap,
441 /* A clone of the tap backend, with a different prefix. */
442 static struct net_backend vmnet_backend = {
443 .prefix = "vmnet",
444 .priv_size = sizeof(struct tap_priv),
445 .init = tap_init,
446 .cleanup = tap_cleanup,
447 .send = tap_send,
448 .peek_recvlen = tap_peek_recvlen,
449 .recv = tap_recv,
450 .recv_enable = tap_recv_enable,
451 .recv_disable = tap_recv_disable,
452 .get_cap = tap_get_cap,
453 .set_cap = tap_set_cap,
456 DATA_SET(net_backend_set, tap_backend);
457 DATA_SET(net_backend_set, vmnet_backend);
459 #ifdef NETGRAPH
462 * Netgraph backend
465 #define NG_SBUF_MAX_SIZE (4 * 1024 * 1024)
467 static int
468 ng_init(struct net_backend *be, const char *devname __unused,
469 nvlist_t *nvl, net_be_rxeof_t cb, void *param)
471 struct tap_priv *p = NET_BE_PRIV(be);
472 struct ngm_connect ngc;
473 const char *value, *nodename;
474 int sbsz;
475 int ctrl_sock;
476 int flags;
477 unsigned long maxsbsz;
478 size_t msbsz;
479 #ifndef WITHOUT_CAPSICUM
480 cap_rights_t rights;
481 #endif
483 if (cb == NULL) {
484 WPRINTF(("Netgraph backend requires non-NULL callback"));
485 return (-1);
488 be->fd = -1;
490 memset(&ngc, 0, sizeof(ngc));
492 value = get_config_value_node(nvl, "path");
493 if (value == NULL) {
494 WPRINTF(("path must be provided"));
495 return (-1);
497 strncpy(ngc.path, value, NG_PATHSIZ - 1);
499 value = get_config_value_node(nvl, "hook");
500 if (value == NULL)
501 value = "vmlink";
502 strncpy(ngc.ourhook, value, NG_HOOKSIZ - 1);
504 value = get_config_value_node(nvl, "peerhook");
505 if (value == NULL) {
506 WPRINTF(("peer hook must be provided"));
507 return (-1);
509 strncpy(ngc.peerhook, value, NG_HOOKSIZ - 1);
511 nodename = get_config_value_node(nvl, "socket");
512 if (NgMkSockNode(nodename,
513 &ctrl_sock, &be->fd) < 0) {
514 WPRINTF(("can't get Netgraph sockets"));
515 return (-1);
518 if (NgSendMsg(ctrl_sock, ".",
519 NGM_GENERIC_COOKIE,
520 NGM_CONNECT, &ngc, sizeof(ngc)) < 0) {
521 WPRINTF(("can't connect to node"));
522 close(ctrl_sock);
523 goto error;
526 close(ctrl_sock);
528 flags = fcntl(be->fd, F_GETFL);
530 if (flags < 0) {
531 WPRINTF(("can't get socket flags"));
532 goto error;
535 if (fcntl(be->fd, F_SETFL, flags | O_NONBLOCK) < 0) {
536 WPRINTF(("can't set O_NONBLOCK flag"));
537 goto error;
541 * The default ng_socket(4) buffer's size is too low.
542 * Calculate the minimum value between NG_SBUF_MAX_SIZE
543 * and kern.ipc.maxsockbuf.
545 msbsz = sizeof(maxsbsz);
546 if (sysctlbyname("kern.ipc.maxsockbuf", &maxsbsz, &msbsz,
547 NULL, 0) < 0) {
548 WPRINTF(("can't get 'kern.ipc.maxsockbuf' value"));
549 goto error;
553 * We can't set the socket buffer size to kern.ipc.maxsockbuf value,
554 * as it takes into account the mbuf(9) overhead.
556 maxsbsz = maxsbsz * MCLBYTES / (MSIZE + MCLBYTES);
558 sbsz = MIN(NG_SBUF_MAX_SIZE, maxsbsz);
560 if (setsockopt(be->fd, SOL_SOCKET, SO_SNDBUF, &sbsz,
561 sizeof(sbsz)) < 0) {
562 WPRINTF(("can't set TX buffer size"));
563 goto error;
566 if (setsockopt(be->fd, SOL_SOCKET, SO_RCVBUF, &sbsz,
567 sizeof(sbsz)) < 0) {
568 WPRINTF(("can't set RX buffer size"));
569 goto error;
572 #ifndef WITHOUT_CAPSICUM
573 cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
574 if (caph_rights_limit(be->fd, &rights) == -1)
575 errx(EX_OSERR, "Unable to apply rights for sandbox");
576 #endif
578 memset(p->bbuf, 0, sizeof(p->bbuf));
579 p->bbuflen = 0;
581 p->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
582 if (p->mevp == NULL) {
583 WPRINTF(("Could not register event"));
584 goto error;
587 return (0);
589 error:
590 tap_cleanup(be);
591 return (-1);
594 static struct net_backend ng_backend = {
595 .prefix = "netgraph",
596 .priv_size = sizeof(struct tap_priv),
597 .init = ng_init,
598 .cleanup = tap_cleanup,
599 .send = tap_send,
600 .peek_recvlen = tap_peek_recvlen,
601 .recv = tap_recv,
602 .recv_enable = tap_recv_enable,
603 .recv_disable = tap_recv_disable,
604 .get_cap = tap_get_cap,
605 .set_cap = tap_set_cap,
608 DATA_SET(net_backend_set, ng_backend);
610 #endif /* NETGRAPH */
613 * The netmap backend
616 /* The virtio-net features supported by netmap. */
617 #define NETMAP_FEATURES (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 | \
618 VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_UFO | \
619 VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 | \
620 VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO)
622 struct netmap_priv {
623 char ifname[IFNAMSIZ];
624 struct nm_desc *nmd;
625 uint16_t memid;
626 struct netmap_ring *rx;
627 struct netmap_ring *tx;
628 struct mevent *mevp;
629 net_be_rxeof_t cb;
630 void *cb_param;
633 static void
634 nmreq_init(struct nmreq *req, char *ifname)
637 memset(req, 0, sizeof(*req));
638 strlcpy(req->nr_name, ifname, sizeof(req->nr_name));
639 req->nr_version = NETMAP_API;
642 static int
643 netmap_set_vnet_hdr_len(struct net_backend *be, int vnet_hdr_len)
645 int err;
646 struct nmreq req;
647 struct netmap_priv *priv = NET_BE_PRIV(be);
649 nmreq_init(&req, priv->ifname);
650 req.nr_cmd = NETMAP_BDG_VNET_HDR;
651 req.nr_arg1 = vnet_hdr_len;
652 err = ioctl(be->fd, NIOCREGIF, &req);
653 if (err) {
654 WPRINTF(("Unable to set vnet header length %d",
655 vnet_hdr_len));
656 return (err);
659 be->be_vnet_hdr_len = vnet_hdr_len;
661 return (0);
664 static int
665 netmap_has_vnet_hdr_len(struct net_backend *be, unsigned vnet_hdr_len)
667 unsigned prev_hdr_len = be->be_vnet_hdr_len;
668 int ret;
670 if (vnet_hdr_len == prev_hdr_len) {
671 return (1);
674 ret = netmap_set_vnet_hdr_len(be, vnet_hdr_len);
675 if (ret) {
676 return (0);
679 netmap_set_vnet_hdr_len(be, prev_hdr_len);
681 return (1);
684 static uint64_t
685 netmap_get_cap(struct net_backend *be)
688 return (netmap_has_vnet_hdr_len(be, VNET_HDR_LEN) ?
689 NETMAP_FEATURES : 0);
692 static int
693 netmap_set_cap(struct net_backend *be, uint64_t features __unused,
694 unsigned vnet_hdr_len)
697 return (netmap_set_vnet_hdr_len(be, vnet_hdr_len));
700 static int
701 netmap_init(struct net_backend *be, const char *devname,
702 nvlist_t *nvl __unused, net_be_rxeof_t cb, void *param)
704 struct netmap_priv *priv = NET_BE_PRIV(be);
706 strlcpy(priv->ifname, devname, sizeof(priv->ifname));
707 priv->ifname[sizeof(priv->ifname) - 1] = '\0';
709 priv->nmd = nm_open(priv->ifname, NULL, NETMAP_NO_TX_POLL, NULL);
710 if (priv->nmd == NULL) {
711 WPRINTF(("Unable to nm_open(): interface '%s', errno (%s)",
712 devname, strerror(errno)));
713 return (-1);
716 priv->memid = priv->nmd->req.nr_arg2;
717 priv->tx = NETMAP_TXRING(priv->nmd->nifp, 0);
718 priv->rx = NETMAP_RXRING(priv->nmd->nifp, 0);
719 priv->cb = cb;
720 priv->cb_param = param;
721 be->fd = priv->nmd->fd;
723 priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
724 if (priv->mevp == NULL) {
725 WPRINTF(("Could not register event"));
726 return (-1);
729 return (0);
732 static void
733 netmap_cleanup(struct net_backend *be)
735 struct netmap_priv *priv = NET_BE_PRIV(be);
737 if (priv->mevp) {
738 mevent_delete(priv->mevp);
740 if (priv->nmd) {
741 nm_close(priv->nmd);
743 be->fd = -1;
746 static ssize_t
747 netmap_send(struct net_backend *be, const struct iovec *iov,
748 int iovcnt)
750 struct netmap_priv *priv = NET_BE_PRIV(be);
751 struct netmap_ring *ring;
752 ssize_t totlen = 0;
753 int nm_buf_size;
754 int nm_buf_len;
755 uint32_t head;
756 uint8_t *nm_buf;
757 int j;
759 ring = priv->tx;
760 head = ring->head;
761 if (head == ring->tail) {
762 WPRINTF(("No space, drop %zu bytes", count_iov(iov, iovcnt)));
763 goto txsync;
765 nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
766 nm_buf_size = ring->nr_buf_size;
767 nm_buf_len = 0;
769 for (j = 0; j < iovcnt; j++) {
770 uint8_t *iov_frag_buf = iov[j].iov_base;
771 int iov_frag_size = iov[j].iov_len;
773 totlen += iov_frag_size;
776 * Split each iovec fragment over more netmap slots, if
777 * necessary.
779 for (;;) {
780 int copylen;
782 copylen = iov_frag_size < nm_buf_size ? iov_frag_size : nm_buf_size;
783 memcpy(nm_buf, iov_frag_buf, copylen);
785 iov_frag_buf += copylen;
786 iov_frag_size -= copylen;
787 nm_buf += copylen;
788 nm_buf_size -= copylen;
789 nm_buf_len += copylen;
791 if (iov_frag_size == 0) {
792 break;
795 ring->slot[head].len = nm_buf_len;
796 ring->slot[head].flags = NS_MOREFRAG;
797 head = nm_ring_next(ring, head);
798 if (head == ring->tail) {
800 * We ran out of netmap slots while
801 * splitting the iovec fragments.
803 WPRINTF(("No space, drop %zu bytes",
804 count_iov(iov, iovcnt)));
805 goto txsync;
807 nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
808 nm_buf_size = ring->nr_buf_size;
809 nm_buf_len = 0;
813 /* Complete the last slot, which must not have NS_MOREFRAG set. */
814 ring->slot[head].len = nm_buf_len;
815 ring->slot[head].flags = 0;
816 head = nm_ring_next(ring, head);
818 /* Now update ring->head and ring->cur. */
819 ring->head = ring->cur = head;
820 txsync:
821 ioctl(be->fd, NIOCTXSYNC, NULL);
823 return (totlen);
826 static ssize_t
827 netmap_peek_recvlen(struct net_backend *be)
829 struct netmap_priv *priv = NET_BE_PRIV(be);
830 struct netmap_ring *ring = priv->rx;
831 uint32_t head = ring->head;
832 ssize_t totlen = 0;
834 while (head != ring->tail) {
835 struct netmap_slot *slot = ring->slot + head;
837 totlen += slot->len;
838 if ((slot->flags & NS_MOREFRAG) == 0)
839 break;
840 head = nm_ring_next(ring, head);
843 return (totlen);
846 static ssize_t
847 netmap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
849 struct netmap_priv *priv = NET_BE_PRIV(be);
850 struct netmap_slot *slot = NULL;
851 struct netmap_ring *ring;
852 uint8_t *iov_frag_buf;
853 int iov_frag_size;
854 ssize_t totlen = 0;
855 uint32_t head;
857 assert(iovcnt);
859 ring = priv->rx;
860 head = ring->head;
861 iov_frag_buf = iov->iov_base;
862 iov_frag_size = iov->iov_len;
864 do {
865 uint8_t *nm_buf;
866 int nm_buf_len;
868 if (head == ring->tail) {
869 return (0);
872 slot = ring->slot + head;
873 nm_buf = NETMAP_BUF(ring, slot->buf_idx);
874 nm_buf_len = slot->len;
876 for (;;) {
877 int copylen = nm_buf_len < iov_frag_size ?
878 nm_buf_len : iov_frag_size;
880 memcpy(iov_frag_buf, nm_buf, copylen);
881 nm_buf += copylen;
882 nm_buf_len -= copylen;
883 iov_frag_buf += copylen;
884 iov_frag_size -= copylen;
885 totlen += copylen;
887 if (nm_buf_len == 0) {
888 break;
891 iov++;
892 iovcnt--;
893 if (iovcnt == 0) {
894 /* No space to receive. */
895 WPRINTF(("Short iov, drop %zd bytes",
896 totlen));
897 return (-ENOSPC);
899 iov_frag_buf = iov->iov_base;
900 iov_frag_size = iov->iov_len;
903 head = nm_ring_next(ring, head);
905 } while (slot->flags & NS_MOREFRAG);
907 /* Release slots to netmap. */
908 ring->head = ring->cur = head;
910 return (totlen);
913 static void
914 netmap_recv_enable(struct net_backend *be)
916 struct netmap_priv *priv = NET_BE_PRIV(be);
918 mevent_enable(priv->mevp);
921 static void
922 netmap_recv_disable(struct net_backend *be)
924 struct netmap_priv *priv = NET_BE_PRIV(be);
926 mevent_disable(priv->mevp);
929 static struct net_backend netmap_backend = {
930 .prefix = "netmap",
931 .priv_size = sizeof(struct netmap_priv),
932 .init = netmap_init,
933 .cleanup = netmap_cleanup,
934 .send = netmap_send,
935 .peek_recvlen = netmap_peek_recvlen,
936 .recv = netmap_recv,
937 .recv_enable = netmap_recv_enable,
938 .recv_disable = netmap_recv_disable,
939 .get_cap = netmap_get_cap,
940 .set_cap = netmap_set_cap,
943 /* A clone of the netmap backend, with a different prefix. */
944 static struct net_backend vale_backend = {
945 .prefix = "vale",
946 .priv_size = sizeof(struct netmap_priv),
947 .init = netmap_init,
948 .cleanup = netmap_cleanup,
949 .send = netmap_send,
950 .peek_recvlen = netmap_peek_recvlen,
951 .recv = netmap_recv,
952 .recv_enable = netmap_recv_enable,
953 .recv_disable = netmap_recv_disable,
954 .get_cap = netmap_get_cap,
955 .set_cap = netmap_set_cap,
958 DATA_SET(net_backend_set, netmap_backend);
959 DATA_SET(net_backend_set, vale_backend);
961 #else /* __FreeBSD__ */
964 * The illumos dlpi backend
968 * The size of the bounce buffer used to implement the peek callback.
969 * This value should be big enough to accommodate the largest of all possible
970 * frontend packet lengths. The value here matches the definition of
971 * VTNET_MAX_PKT_LEN in pci_virtio_net.c
973 #define DLPI_BBUF_SIZE (65536 + 64)
975 typedef struct be_dlpi_priv {
976 dlpi_handle_t bdp_dhp;
977 struct mevent *bdp_mevp;
979 * A bounce buffer that allows us to implement the peek_recvlen
980 * callback. Each structure is only used by a single thread so
981 * one is enough.
983 uint8_t bdp_bbuf[DLPI_BBUF_SIZE];
984 ssize_t bdp_bbuflen;
985 } be_dlpi_priv_t;
987 static void
988 be_dlpi_cleanup(net_backend_t *be)
990 be_dlpi_priv_t *priv = NET_BE_PRIV(be);
992 if (priv->bdp_dhp != NULL)
993 dlpi_close(priv->bdp_dhp);
994 priv->bdp_dhp = NULL;
996 if (priv->bdp_mevp != NULL)
997 mevent_delete(priv->bdp_mevp);
998 priv->bdp_mevp = NULL;
1000 priv->bdp_bbuflen = 0;
1001 be->fd = -1;
/*
 * Emit a warning for a failed libdlpi call: 'dev' names the link,
 * 'msg' describes the operation and 'ret' is decoded via
 * dlpi_strerror().  'msg' is const-qualified since all callers pass
 * string literals.
 */
static void
be_dlpi_err(int ret, const char *dev, const char *msg)
{
	WPRINTF(("%s: %s (%s)", dev, msg, dlpi_strerror(ret)));
}
1010 static int
1011 be_dlpi_init(net_backend_t *be, const char *devname __unused,
1012 nvlist_t *nvl, net_be_rxeof_t cb, void *param)
1014 be_dlpi_priv_t *priv = NET_BE_PRIV(be);
1015 const char *vnic;
1016 int ret;
1018 if (cb == NULL) {
1019 WPRINTF(("dlpi backend requires non-NULL callback"));
1020 return (-1);
1023 vnic = get_config_value_node(nvl, "vnic");
1024 if (vnic == NULL) {
1025 WPRINTF(("dlpi backend requires a VNIC"));
1026 return (-1);
1029 priv->bdp_bbuflen = 0;
1031 ret = dlpi_open(vnic, &priv->bdp_dhp, DLPI_RAW);
1033 if (ret != DLPI_SUCCESS) {
1034 be_dlpi_err(ret, vnic, "open failed");
1035 goto error;
1038 if ((ret = dlpi_bind(priv->bdp_dhp, DLPI_ANY_SAP, NULL)) !=
1039 DLPI_SUCCESS) {
1040 be_dlpi_err(ret, vnic, "bind failed");
1041 goto error;
1044 if (get_config_bool_node_default(nvl, "promiscrxonly", true)) {
1045 if ((ret = dlpi_promiscon(priv->bdp_dhp, DL_PROMISC_RX_ONLY)) !=
1046 DLPI_SUCCESS) {
1047 be_dlpi_err(ret, vnic,
1048 "enable promiscuous mode(rxonly) failed");
1049 goto error;
1052 if (get_config_bool_node_default(nvl, "promiscphys", false)) {
1053 if ((ret = dlpi_promiscon(priv->bdp_dhp, DL_PROMISC_PHYS)) !=
1054 DLPI_SUCCESS) {
1055 be_dlpi_err(ret, vnic,
1056 "enable promiscuous mode(physical) failed");
1057 goto error;
1060 if (get_config_bool_node_default(nvl, "promiscsap", true)) {
1061 if ((ret = dlpi_promiscon(priv->bdp_dhp, DL_PROMISC_SAP)) !=
1062 DLPI_SUCCESS) {
1063 be_dlpi_err(ret, vnic,
1064 "enable promiscuous mode(SAP) failed");
1065 goto error;
1068 if (get_config_bool_node_default(nvl, "promiscmulti", true)) {
1069 if ((ret = dlpi_promiscon(priv->bdp_dhp, DL_PROMISC_MULTI)) !=
1070 DLPI_SUCCESS) {
1071 be_dlpi_err(ret, vnic,
1072 "enable promiscuous mode(muticast) failed");
1073 goto error;
1077 be->fd = dlpi_fd(priv->bdp_dhp);
1079 if (fcntl(be->fd, F_SETFL, O_NONBLOCK) < 0) {
1080 WPRINTF(("%s: enable O_NONBLOCK failed", vnic));
1081 goto error;
1084 priv->bdp_mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
1085 if (priv->bdp_mevp == NULL) {
1086 WPRINTF(("Could not register event"));
1087 goto error;
1090 return (0);
1092 error:
1093 be_dlpi_cleanup(be);
1094 return (-1);
1098 * Called to send a buffer chain out to the dlpi device
1100 static ssize_t
1101 be_dlpi_send(net_backend_t *be, const struct iovec *iov, int iovcnt)
1103 be_dlpi_priv_t *priv = NET_BE_PRIV(be);
1104 ssize_t len = 0;
1105 int ret;
1107 if (iovcnt == 1) {
1108 len = iov[0].iov_len;
1109 ret = dlpi_send(priv->bdp_dhp, NULL, 0, iov[0].iov_base, len,
1110 NULL);
1111 } else {
1112 void *buf = NULL;
1114 len = iov_to_buf(iov, iovcnt, &buf);
1116 if (len <= 0 || buf == NULL)
1117 return (-1);
1119 ret = dlpi_send(priv->bdp_dhp, NULL, 0, buf, len, NULL);
1120 free(buf);
1123 if (ret != DLPI_SUCCESS)
1124 return (-1);
1126 return (len);
1129 static ssize_t
1130 be_dlpi_peek_recvlen(net_backend_t *be)
1132 be_dlpi_priv_t *priv = NET_BE_PRIV(be);
1133 dlpi_recvinfo_t recv;
1134 size_t len;
1135 int ret;
1138 * We already have a packet in the bounce buffer.
1139 * Just return its length.
1141 if (priv->bdp_bbuflen > 0)
1142 return (priv->bdp_bbuflen);
1145 * Read the next packet (if any) into the bounce buffer, so
1146 * that we get to know its length and we can return that
1147 * to the caller.
1149 len = sizeof (priv->bdp_bbuf);
1150 ret = dlpi_recv(priv->bdp_dhp, NULL, NULL, priv->bdp_bbuf, &len,
1151 0, &recv);
1152 if (ret == DL_SYSERR) {
1153 if (errno == EWOULDBLOCK)
1154 return (0);
1155 return (-1);
1156 } else if (ret == DLPI_ETIMEDOUT) {
1157 return (0);
1158 } else if (ret != DLPI_SUCCESS) {
1159 return (-1);
1162 if (recv.dri_totmsglen > sizeof (priv->bdp_bbuf)) {
1163 EPRINTLN("DLPI bounce buffer was too small! - needed %x bytes",
1164 recv.dri_totmsglen);
1167 priv->bdp_bbuflen = len;
1169 return (len);
1172 static ssize_t
1173 be_dlpi_recv(net_backend_t *be, const struct iovec *iov, int iovcnt)
1175 be_dlpi_priv_t *priv = NET_BE_PRIV(be);
1176 size_t len;
1177 int ret;
1179 if (priv->bdp_bbuflen > 0) {
1181 * A packet is available in the bounce buffer, so
1182 * we read it from there.
1184 len = buf_to_iov(priv->bdp_bbuf, priv->bdp_bbuflen,
1185 iov, iovcnt, 0);
1187 /* Mark the bounce buffer as empty. */
1188 priv->bdp_bbuflen = 0;
1190 return (len);
1193 len = iov[0].iov_len;
1194 ret = dlpi_recv(priv->bdp_dhp, NULL, NULL,
1195 (uint8_t *)iov[0].iov_base, &len, 0, NULL);
1196 if (ret == DL_SYSERR) {
1197 if (errno == EWOULDBLOCK)
1198 return (0);
1199 return (-1);
1200 } else if (ret == DLPI_ETIMEDOUT) {
1201 return (0);
1202 } else if (ret != DLPI_SUCCESS) {
1203 return (-1);
1206 return (len);
1209 static void
1210 be_dlpi_recv_enable(net_backend_t *be)
1212 be_dlpi_priv_t *priv = NET_BE_PRIV(be);
1214 mevent_enable(priv->bdp_mevp);
1217 static void
1218 be_dlpi_recv_disable(net_backend_t *be)
1220 be_dlpi_priv_t *priv = NET_BE_PRIV(be);
1222 mevent_disable(priv->bdp_mevp);
1225 static uint64_t
1226 be_dlpi_get_cap(net_backend_t *be)
1228 return (0); /* no capabilities for now */
1231 static int
1232 be_dlpi_set_cap(net_backend_t *be, uint64_t features,
1233 unsigned vnet_hdr_len)
1235 return ((features || vnet_hdr_len) ? -1 : 0);
1238 static int
1239 be_dlpi_get_mac(net_backend_t *be, void *buf, size_t *buflen)
1241 be_dlpi_priv_t *priv = NET_BE_PRIV(be);
1242 uchar_t physaddr[DLPI_PHYSADDR_MAX];
1243 size_t physaddrlen = DLPI_PHYSADDR_MAX;
1244 int ret;
1246 if ((ret = dlpi_get_physaddr(priv->bdp_dhp, DL_CURR_PHYS_ADDR,
1247 physaddr, &physaddrlen)) != DLPI_SUCCESS) {
1248 be_dlpi_err(ret, dlpi_linkname(priv->bdp_dhp),
1249 "read MAC address failed");
1250 return (EINVAL);
1253 if (physaddrlen != ETHERADDRL) {
1254 WPRINTF(("%s: bad MAC address len %d",
1255 dlpi_linkname(priv->bdp_dhp), physaddrlen));
1256 return (EINVAL);
1259 if (physaddrlen > *buflen) {
1260 WPRINTF(("%s: MAC address too long (%d bytes required)",
1261 dlpi_linkname(priv->bdp_dhp), physaddrlen));
1262 return (ENOMEM);
1265 *buflen = physaddrlen;
1266 memcpy(buf, physaddr, *buflen);
1268 return (0);
1271 static struct net_backend dlpi_backend = {
1272 .prefix = "dlpi",
1273 .priv_size = sizeof(struct be_dlpi_priv),
1274 .init = be_dlpi_init,
1275 .cleanup = be_dlpi_cleanup,
1276 .send = be_dlpi_send,
1277 .peek_recvlen = be_dlpi_peek_recvlen,
1278 .recv = be_dlpi_recv,
1279 .recv_enable = be_dlpi_recv_enable,
1280 .recv_disable = be_dlpi_recv_disable,
1281 .get_cap = be_dlpi_get_cap,
1282 .set_cap = be_dlpi_set_cap,
1283 .get_mac = be_dlpi_get_mac,
1286 DATA_SET(net_backend_set, dlpi_backend);
1288 #endif /* __FreeBSD__ */
1290 #ifdef __FreeBSD__
1292 netbe_legacy_config(nvlist_t *nvl, const char *opts)
1294 char *backend, *cp;
1296 if (opts == NULL)
1297 return (0);
1299 cp = strchr(opts, ',');
1300 if (cp == NULL) {
1301 set_config_value_node(nvl, "backend", opts);
1302 return (0);
1304 backend = strndup(opts, cp - opts);
1305 set_config_value_node(nvl, "backend", backend);
1306 free(backend);
1307 return (pci_parse_legacy_config(nvl, cp + 1));
1309 #else
int
netbe_legacy_config(nvlist_t *nvl, const char *opts)
{
	char *config, *name, *tofree, *value;

	/*
	 * illumos variant: legacy options are a comma-separated list of
	 * either "key=value" pairs or a bare vnic name.
	 */
	if (opts == NULL)
		return (0);

	/* Default to the 'dlpi' backend - can still be overridden by opts */
	set_config_value_node(nvl, "backend", "dlpi");
	set_config_value_node(nvl, "type", "dlpi");

	config = tofree = strdup(opts);
	if (config == NULL)
		err(4, "netbe_legacy_config strdup()");
	while ((name = strsep(&config, ",")) != NULL) {
		value = strchr(name, '=');
		if (value != NULL) {
			/* "key=value" token: split and store as-is. */
			*value++ = '\0';
			set_config_value_node(nvl, name, value);
		} else {
			/* Bare token: treat it as the vnic name. */
			set_config_value_node(nvl, "vnic", name);
		}
	}
	free(tofree);

	return (0);
}
1338 #endif
/*
 * Initialize a backend and attach to the frontend.
 * This is called during frontend initialization.
 * @ret is a pointer to the backend to be initialized
 * @nvl is the device configuration node; its "backend" value is the
 *    backend-name as supplied on the command line,
 *    e.g. -s 2:0,frontend-name,backend-name[,other-args]
 * @cb is the receive callback supplied by the frontend,
 *    and it is invoked in the event loop when a receive
 *    event is generated in the hypervisor,
 * @param is a pointer to the frontend, and normally used as
 *    the argument for the callback.
 */
1353 netbe_init(struct net_backend **ret, nvlist_t *nvl, net_be_rxeof_t cb,
1354 void *param)
1356 struct net_backend **pbe, *nbe, *tbe = NULL;
1357 const char *value, *type;
1358 char *devname;
1359 int err;
1361 value = get_config_value_node(nvl, "backend");
1362 if (value == NULL) {
1363 return (-1);
1365 devname = strdup(value);
1368 * Use the type given by configuration if exists; otherwise
1369 * use the prefix of the backend as the type.
1371 type = get_config_value_node(nvl, "type");
1372 if (type == NULL)
1373 type = devname;
1376 * Find the network backend that matches the user-provided
1377 * device name. net_backend_set is built using a linker set.
1379 SET_FOREACH(pbe, net_backend_set) {
1380 if (strncmp(type, (*pbe)->prefix,
1381 strlen((*pbe)->prefix)) == 0) {
1382 tbe = *pbe;
1383 assert(tbe->init != NULL);
1384 assert(tbe->cleanup != NULL);
1385 assert(tbe->send != NULL);
1386 assert(tbe->recv != NULL);
1387 assert(tbe->get_cap != NULL);
1388 assert(tbe->set_cap != NULL);
1389 break;
1393 *ret = NULL;
1394 if (tbe == NULL) {
1395 free(devname);
1396 return (EINVAL);
1399 nbe = calloc(1, NET_BE_SIZE(tbe));
1400 *nbe = *tbe; /* copy the template */
1401 nbe->fd = -1;
1402 nbe->sc = param;
1403 nbe->be_vnet_hdr_len = 0;
1404 nbe->fe_vnet_hdr_len = 0;
1406 /* Initialize the backend. */
1407 err = nbe->init(nbe, devname, nvl, cb, param);
1408 if (err) {
1409 free(devname);
1410 free(nbe);
1411 return (err);
1414 *ret = nbe;
1415 free(devname);
1417 return (0);
1420 void
1421 netbe_cleanup(struct net_backend *be)
1424 if (be != NULL) {
1425 be->cleanup(be);
1426 free(be);
/*
 * Ask the backend which virtio-net capabilities it can support;
 * forwarded straight to the backend's get_cap op.
 */
uint64_t
netbe_get_cap(struct net_backend *be)
{

	assert(be != NULL);
	return (be->get_cap(be));
}
/*
 * Negotiate features with the backend and record the virtio-net header
 * length the frontend will use.  Returns the backend set_cap() result,
 * or -1 if vnet_hdr_len is not one of the supported values.
 */
int
netbe_set_cap(struct net_backend *be, uint64_t features,
	unsigned vnet_hdr_len)
{
	int ret;

	assert(be != NULL);

	/* There are only three valid lengths, i.e., 0, 10 and 12. */
	if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN
		&& vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t)))
		return (-1);

	be->fe_vnet_hdr_len = vnet_hdr_len;

	ret = be->set_cap(be, features, vnet_hdr_len);
	/* The backend either uses no vnet header or matches the frontend's. */
	assert(be->be_vnet_hdr_len == 0 ||
	    be->be_vnet_hdr_len == be->fe_vnet_hdr_len);

	return (ret);
}
/*
 * Transmit one packet described by the iovec through the backend.
 * Thin forwarder to the backend's send op.
 */
ssize_t
netbe_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
{

	return (be->send(be, iov, iovcnt));
}
/*
 * Return the length of the next packet available for receive, without
 * consuming it.  Thin forwarder to the backend's peek_recvlen op.
 */
ssize_t
netbe_peek_recvlen(struct net_backend *be)
{

	return (be->peek_recvlen(be));
}
1475 * Try to read a packet from the backend, without blocking.
1476 * If no packets are available, return 0. In case of success, return
1477 * the length of the packet just read. Return -1 in case of errors.
1479 ssize_t
1480 netbe_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
1483 return (be->recv(be, iov, iovcnt));
1487 * Read a packet from the backend and discard it.
1488 * Returns the size of the discarded packet or zero if no packet was available.
1489 * A negative error code is returned in case of read error.
1491 ssize_t
1492 netbe_rx_discard(struct net_backend *be)
1495 * MP note: the dummybuf is only used to discard frames,
1496 * so there is no need for it to be per-vtnet or locked.
1497 * We only make it large enough for TSO-sized segment.
1499 static uint8_t dummybuf[65536 + 64];
1500 struct iovec iov;
1502 #ifdef __FreeBSD__
1503 iov.iov_base = dummybuf;
1504 #else
1505 iov.iov_base = (caddr_t)dummybuf;
1506 #endif
1507 iov.iov_len = sizeof(dummybuf);
1509 return netbe_recv(be, &iov, 1);
1512 void
1513 netbe_rx_disable(struct net_backend *be)
1516 return be->recv_disable(be);
1519 void
1520 netbe_rx_enable(struct net_backend *be)
1523 return be->recv_enable(be);
/*
 * Return the virtio-net header length the backend is actually using
 * (be_vnet_hdr_len, as recorded by the backend's set_cap op).
 */
size_t
netbe_get_vnet_hdr_len(struct net_backend *be)
{

	return (be->be_vnet_hdr_len);
}
1533 #ifndef __FreeBSD__
/*
 * Fetch the backend's MAC address into buf (illumos only), updating
 * *buflen with the address length.  Returns ENOTSUP for backends that
 * do not provide a get_mac op, otherwise the op's own result.
 */
int
netbe_get_mac(net_backend_t *be, void *buf, size_t *buflen)
{
	if (be->get_mac == NULL)
		return (ENOTSUP);
	return (be->get_mac(be, buf, buflen));
}
1541 #endif