 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2019 Vincenzo Maffione <vmaffione@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
 * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
 * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * This file implements multiple network backends (tap, netmap, ...),
 * to be used by network frontends such as virtio-net and e1000.
 * The API to access the backend (e.g. send/receive packets, negotiate
 * features) is exported by net_backends.h.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/types.h>		/* u_short etc */
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
#include <sys/ioctl.h>

#ifdef __FreeBSD__
#if defined(INET6) || defined(INET)
#include <net/if_tap.h>
#endif
#include <net/netmap.h>
#include <net/netmap_virt.h>
#define NETMAP_WITH_LIBS
#include <net/netmap_user.h>
#endif /* __FreeBSD__ */

#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
#include <pthread_np.h>

#include <sys/param.h>
#include <sys/sysctl.h>

#include <net/ethernet.h>

#include "net_backends.h"

#include <sys/linker_set.h>
/*
 * Each network backend registers a set of function pointers that are
 * used to implement the net backends API.
 * This might need to be exposed if we implement backends in separate files.
 */
struct net_backend {
	const char *prefix;	/* prefix matching this backend */

	/*
	 * Routines used to initialize and cleanup the resources needed
	 * by a backend. The cleanup function is used internally,
	 * and should not be called by the frontend.
	 */
	int (*init)(struct net_backend *be, const char *devname,
	    nvlist_t *nvl, net_be_rxeof_t cb, void *param);
	void (*cleanup)(struct net_backend *be);
	/*
	 * Called to serve a guest transmit request. The scatter-gather
	 * vector provided by the caller has 'iovcnt' elements and contains
	 * the packet to send.
	 */
	ssize_t (*send)(struct net_backend *be, const struct iovec *iov,
	    int iovcnt);
	/*
	 * Get the length of the next packet that can be received from
	 * the backend. If no packets are currently available, this
	 * function returns 0.
	 */
	ssize_t (*peek_recvlen)(struct net_backend *be);
	/*
	 * Called to receive a packet from the backend. When the function
	 * returns a positive value 'len', the scatter-gather vector
	 * provided by the caller contains a packet of that length.
	 * The function returns 0 if the backend doesn't have a new packet to
	 * receive.
	 */
	ssize_t (*recv)(struct net_backend *be, const struct iovec *iov,
	    int iovcnt);
	/*
	 * Ask the backend to enable or disable receive operation in the
	 * backend. On return from a disable operation, it is guaranteed
	 * that the receive callback won't be called until receive is
	 * enabled again. Note however that it is up to the caller to make
	 * sure that netbe_recv() is not currently being executed by another
	 * thread.
	 */
	void (*recv_enable)(struct net_backend *be);
	void (*recv_disable)(struct net_backend *be);
	/*
	 * Ask the backend for the virtio-net features it is able to
	 * support. Possible features are TSO, UFO and checksum offloading
	 * in both rx and tx directions, and for both IPv4 and IPv6.
	 */
	uint64_t (*get_cap)(struct net_backend *be);
	/*
	 * Tell the backend to enable/disable the specified virtio-net
	 * features (capabilities).
	 */
	int (*set_cap)(struct net_backend *be, uint64_t features,
	    unsigned int vnet_hdr_len);

	int (*get_mac)(struct net_backend *be, void *, size_t *);
	struct pci_vtnet_softc *sc;

	/*
	 * Length of the virtio-net header used by the backend and the
	 * frontend, respectively. A zero value means that the header
	 * is not used.
	 */
	unsigned int be_vnet_hdr_len;
	unsigned int fe_vnet_hdr_len;

	/* Size of backend-specific private data. */
	size_t priv_size;

	/* Backend-specific private data follows. */
};
#define NET_BE_PRIV(be)		((void *)((be) + 1))
#define NET_BE_SIZE(be)		(sizeof(*be) + (be)->priv_size)

SET_DECLARE(net_backend_set, struct net_backend);
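
/*
 * Illustrative layout note (a sketch mirroring netbe_init() and the
 * backends below, not additional functionality): every backend instance
 * is allocated as the generic struct net_backend immediately followed by
 * 'priv_size' bytes of backend-specific state, e.g.
 *
 *	nbe = calloc(1, NET_BE_SIZE(tbe));	allocate header + private area
 *	*nbe = *tbe;				copy the registered template
 *	priv = NET_BE_PRIV(nbe);		first byte past the header
 */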
#define VNET_HDR_LEN	sizeof(struct virtio_net_rxhdr)

#define WPRINTF(params) PRINTLN params
#if defined(INET6) || defined(INET)
static const int pf_list[] = {

/*
 * A bounce buffer that allows us to implement the peek_recvlen
 * callback. In the future we may get the same information from
 * the kevent data.
 */

tap_cleanup(struct net_backend *be)

	struct tap_priv *priv = NET_BE_PRIV(be);

	mevent_delete(priv->mevp);
tap_init(struct net_backend *be, const char *devname,
    nvlist_t *nvl __unused, net_be_rxeof_t cb, void *param)

	struct tap_priv *priv = NET_BE_PRIV(be);
#if defined(INET6) || defined(INET)
#ifndef WITHOUT_CAPSICUM

		WPRINTF(("TAP backend requires non-NULL callback"));

	strcpy(tbuf, "/dev/");
	strlcat(tbuf, devname, sizeof(tbuf));

	be->fd = open(tbuf, O_RDWR);
		WPRINTF(("open of tap device %s failed", tbuf));
	/*
	 * Set non-blocking and register for read
	 * notifications with the event loop
	 */
	if (ioctl(be->fd, FIONBIO, &opt) < 0) {
		WPRINTF(("tap device O_NONBLOCK failed"));

#if defined(INET6) || defined(INET)
	/*
	 * Try to UP the interface rather than relying on
	 * net.link.tap.up_on_open.
	 */
	bzero(&ifrq, sizeof(ifrq));
	if (ioctl(be->fd, TAPGIFNAME, &ifrq) < 0) {
		WPRINTF(("Could not get interface name"));

	for (size_t i = 0; s == -1 && i < nitems(pf_list); i++)
		s = socket(pf_list[i], SOCK_DGRAM, 0);
		WPRINTF(("Could not open socket"));
	if (ioctl(s, SIOCGIFFLAGS, &ifrq) < 0) {
		WPRINTF(("Could not get interface flags"));

	ifrq.ifr_flags |= IFF_UP;
	if (ioctl(s, SIOCSIFFLAGS, &ifrq) < 0) {
		WPRINTF(("Could not set interface flags"));

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
	if (caph_rights_limit(be->fd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");

	memset(priv->bbuf, 0, sizeof(priv->bbuf));

	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
	if (priv->mevp == NULL) {
		WPRINTF(("Could not register event"));

/*
 * Called to send a buffer chain out to the tap device
 */
tap_send(struct net_backend *be, const struct iovec *iov, int iovcnt)

	return (writev(be->fd, iov, iovcnt));

tap_peek_recvlen(struct net_backend *be)

	struct tap_priv *priv = NET_BE_PRIV(be);

	if (priv->bbuflen > 0) {
		/*
		 * We already have a packet in the bounce buffer.
		 * Just return its length.
		 */
		return priv->bbuflen;

	/*
	 * Read the next packet (if any) into the bounce buffer, so
	 * that we get to know its length and we can return that
	 * information to the caller.
	 */
	ret = read(be->fd, priv->bbuf, sizeof(priv->bbuf));
	if (ret < 0 && errno == EWOULDBLOCK) {
tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)

	struct tap_priv *priv = NET_BE_PRIV(be);

	if (priv->bbuflen > 0) {
		/*
		 * A packet is available in the bounce buffer, so
		 * we read it from there.
		 */
		ret = buf_to_iov(priv->bbuf, priv->bbuflen,

		/* Mark the bounce buffer as empty. */

	ret = readv(be->fd, iov, iovcnt);
	if (ret < 0 && errno == EWOULDBLOCK) {

tap_recv_enable(struct net_backend *be)

	struct tap_priv *priv = NET_BE_PRIV(be);

	mevent_enable(priv->mevp);

tap_recv_disable(struct net_backend *be)

	struct tap_priv *priv = NET_BE_PRIV(be);

	mevent_disable(priv->mevp);

tap_get_cap(struct net_backend *be __unused)

	return (0); /* no capabilities for now */

tap_set_cap(struct net_backend *be __unused, uint64_t features,
    unsigned vnet_hdr_len)

	return ((features || vnet_hdr_len) ? -1 : 0);

static struct net_backend tap_backend = {
	.priv_size = sizeof(struct tap_priv),
	.cleanup = tap_cleanup,
	.peek_recvlen = tap_peek_recvlen,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,

/* A clone of the tap backend, with a different prefix. */
static struct net_backend vmnet_backend = {
	.priv_size = sizeof(struct tap_priv),
	.cleanup = tap_cleanup,
	.peek_recvlen = tap_peek_recvlen,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,

DATA_SET(net_backend_set, tap_backend);
DATA_SET(net_backend_set, vmnet_backend);
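
/*
 * Illustrative sketch of the registration pattern used throughout this
 * file (the "foo" names are hypothetical): a backend fills in a template
 * with its prefix and methods and adds it to the linker set, which is how
 * netbe_init() finds it by prefix at runtime.
 *
 *	static struct net_backend foo_backend = {
 *		.prefix = "foo",
 *		.priv_size = sizeof(struct foo_priv),
 *		.init = foo_init,
 *		.cleanup = foo_cleanup,
 *		.send = foo_send,
 *		.recv = foo_recv,
 *	};
 *	DATA_SET(net_backend_set, foo_backend);
 */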
#define NG_SBUF_MAX_SIZE (4 * 1024 * 1024)

ng_init(struct net_backend *be, const char *devname __unused,
    nvlist_t *nvl, net_be_rxeof_t cb, void *param)

	struct tap_priv *p = NET_BE_PRIV(be);
	struct ngm_connect ngc;
	const char *value, *nodename;
	unsigned long maxsbsz;
#ifndef WITHOUT_CAPSICUM

		WPRINTF(("Netgraph backend requires non-NULL callback"));

	memset(&ngc, 0, sizeof(ngc));

	value = get_config_value_node(nvl, "path");
		WPRINTF(("path must be provided"));
	strncpy(ngc.path, value, NG_PATHSIZ - 1);

	value = get_config_value_node(nvl, "hook");
	strncpy(ngc.ourhook, value, NG_HOOKSIZ - 1);

	value = get_config_value_node(nvl, "peerhook");
		WPRINTF(("peer hook must be provided"));
	strncpy(ngc.peerhook, value, NG_HOOKSIZ - 1);

	nodename = get_config_value_node(nvl, "socket");
	if (NgMkSockNode(nodename,
	    &ctrl_sock, &be->fd) < 0) {
		WPRINTF(("can't get Netgraph sockets"));

	if (NgSendMsg(ctrl_sock, ".",
	    NGM_CONNECT, &ngc, sizeof(ngc)) < 0) {
		WPRINTF(("can't connect to node"));

	flags = fcntl(be->fd, F_GETFL);
		WPRINTF(("can't get socket flags"));

	if (fcntl(be->fd, F_SETFL, flags | O_NONBLOCK) < 0) {
		WPRINTF(("can't set O_NONBLOCK flag"));

	/*
	 * The default ng_socket(4) buffer's size is too low.
	 * Calculate the minimum value between NG_SBUF_MAX_SIZE
	 * and kern.ipc.maxsockbuf.
	 */
	msbsz = sizeof(maxsbsz);
	if (sysctlbyname("kern.ipc.maxsockbuf", &maxsbsz, &msbsz,
		WPRINTF(("can't get 'kern.ipc.maxsockbuf' value"));

	/*
	 * We can't set the socket buffer size to kern.ipc.maxsockbuf value,
	 * as it takes into account the mbuf(9) overhead.
	 */
	maxsbsz = maxsbsz * MCLBYTES / (MSIZE + MCLBYTES);

	sbsz = MIN(NG_SBUF_MAX_SIZE, maxsbsz);
	if (setsockopt(be->fd, SOL_SOCKET, SO_SNDBUF, &sbsz,
		WPRINTF(("can't set TX buffer size"));

	if (setsockopt(be->fd, SOL_SOCKET, SO_RCVBUF, &sbsz,
		WPRINTF(("can't set RX buffer size"));

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
	if (caph_rights_limit(be->fd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");

	memset(p->bbuf, 0, sizeof(p->bbuf));

	p->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
	if (p->mevp == NULL) {
		WPRINTF(("Could not register event"));

static struct net_backend ng_backend = {
	.prefix = "netgraph",
	.priv_size = sizeof(struct tap_priv),
	.cleanup = tap_cleanup,
	.peek_recvlen = tap_peek_recvlen,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,

DATA_SET(net_backend_set, ng_backend);

#endif /* NETGRAPH */
/* The virtio-net features supported by netmap. */
#define NETMAP_FEATURES (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 | \
	VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_UFO | \
	VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 | \
	VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO)

	char ifname[IFNAMSIZ];
	struct netmap_ring *rx;
	struct netmap_ring *tx;

nmreq_init(struct nmreq *req, char *ifname)

	memset(req, 0, sizeof(*req));
	strlcpy(req->nr_name, ifname, sizeof(req->nr_name));
	req->nr_version = NETMAP_API;

netmap_set_vnet_hdr_len(struct net_backend *be, int vnet_hdr_len)

	struct netmap_priv *priv = NET_BE_PRIV(be);

	nmreq_init(&req, priv->ifname);
	req.nr_cmd = NETMAP_BDG_VNET_HDR;
	req.nr_arg1 = vnet_hdr_len;
	err = ioctl(be->fd, NIOCREGIF, &req);
		WPRINTF(("Unable to set vnet header length %d",

	be->be_vnet_hdr_len = vnet_hdr_len;

netmap_has_vnet_hdr_len(struct net_backend *be, unsigned vnet_hdr_len)

	unsigned prev_hdr_len = be->be_vnet_hdr_len;

	if (vnet_hdr_len == prev_hdr_len) {

	ret = netmap_set_vnet_hdr_len(be, vnet_hdr_len);

	netmap_set_vnet_hdr_len(be, prev_hdr_len);

netmap_get_cap(struct net_backend *be)

	return (netmap_has_vnet_hdr_len(be, VNET_HDR_LEN) ?
	    NETMAP_FEATURES : 0);

netmap_set_cap(struct net_backend *be, uint64_t features __unused,
    unsigned vnet_hdr_len)

	return (netmap_set_vnet_hdr_len(be, vnet_hdr_len));
netmap_init(struct net_backend *be, const char *devname,
    nvlist_t *nvl __unused, net_be_rxeof_t cb, void *param)

	struct netmap_priv *priv = NET_BE_PRIV(be);

	strlcpy(priv->ifname, devname, sizeof(priv->ifname));
	priv->ifname[sizeof(priv->ifname) - 1] = '\0';

	priv->nmd = nm_open(priv->ifname, NULL, NETMAP_NO_TX_POLL, NULL);
	if (priv->nmd == NULL) {
		WPRINTF(("Unable to nm_open(): interface '%s', errno (%s)",
		    devname, strerror(errno)));

	priv->memid = priv->nmd->req.nr_arg2;
	priv->tx = NETMAP_TXRING(priv->nmd->nifp, 0);
	priv->rx = NETMAP_RXRING(priv->nmd->nifp, 0);

	priv->cb_param = param;
	be->fd = priv->nmd->fd;

	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
	if (priv->mevp == NULL) {
		WPRINTF(("Could not register event"));

netmap_cleanup(struct net_backend *be)

	struct netmap_priv *priv = NET_BE_PRIV(be);

	mevent_delete(priv->mevp);
netmap_send(struct net_backend *be, const struct iovec *iov,

	struct netmap_priv *priv = NET_BE_PRIV(be);
	struct netmap_ring *ring;

	if (head == ring->tail) {
		WPRINTF(("No space, drop %zu bytes", count_iov(iov, iovcnt)));

	nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
	nm_buf_size = ring->nr_buf_size;

	for (j = 0; j < iovcnt; j++) {
		uint8_t *iov_frag_buf = iov[j].iov_base;
		int iov_frag_size = iov[j].iov_len;

		totlen += iov_frag_size;

		/*
		 * Split each iovec fragment over more netmap slots, if
		 * necessary.
		 */
			copylen = iov_frag_size < nm_buf_size ? iov_frag_size : nm_buf_size;
			memcpy(nm_buf, iov_frag_buf, copylen);

			iov_frag_buf += copylen;
			iov_frag_size -= copylen;
			nm_buf_size -= copylen;
			nm_buf_len += copylen;

			if (iov_frag_size == 0) {

			ring->slot[head].len = nm_buf_len;
			ring->slot[head].flags = NS_MOREFRAG;
			head = nm_ring_next(ring, head);
			if (head == ring->tail) {
				/*
				 * We ran out of netmap slots while
				 * splitting the iovec fragments.
				 */
				WPRINTF(("No space, drop %zu bytes",
				    count_iov(iov, iovcnt)));

			nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
			nm_buf_size = ring->nr_buf_size;

	/* Complete the last slot, which must not have NS_MOREFRAG set. */
	ring->slot[head].len = nm_buf_len;
	ring->slot[head].flags = 0;
	head = nm_ring_next(ring, head);

	/* Now update ring->head and ring->cur. */
	ring->head = ring->cur = head;

	ioctl(be->fd, NIOCTXSYNC, NULL);
netmap_peek_recvlen(struct net_backend *be)

	struct netmap_priv *priv = NET_BE_PRIV(be);
	struct netmap_ring *ring = priv->rx;
	uint32_t head = ring->head;

	while (head != ring->tail) {
		struct netmap_slot *slot = ring->slot + head;

		if ((slot->flags & NS_MOREFRAG) == 0)

		head = nm_ring_next(ring, head);

netmap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)

	struct netmap_priv *priv = NET_BE_PRIV(be);
	struct netmap_slot *slot = NULL;
	struct netmap_ring *ring;
	uint8_t *iov_frag_buf;

	iov_frag_buf = iov->iov_base;
	iov_frag_size = iov->iov_len;

	if (head == ring->tail) {

		slot = ring->slot + head;
		nm_buf = NETMAP_BUF(ring, slot->buf_idx);
		nm_buf_len = slot->len;

			int copylen = nm_buf_len < iov_frag_size ?
			    nm_buf_len : iov_frag_size;

			memcpy(iov_frag_buf, nm_buf, copylen);

			nm_buf_len -= copylen;
			iov_frag_buf += copylen;
			iov_frag_size -= copylen;

		if (nm_buf_len == 0) {

				/* No space to receive. */
				WPRINTF(("Short iov, drop %zd bytes",

			iov_frag_buf = iov->iov_base;
			iov_frag_size = iov->iov_len;

		head = nm_ring_next(ring, head);

	} while (slot->flags & NS_MOREFRAG);

	/* Release slots to netmap. */
	ring->head = ring->cur = head;
netmap_recv_enable(struct net_backend *be)

	struct netmap_priv *priv = NET_BE_PRIV(be);

	mevent_enable(priv->mevp);

netmap_recv_disable(struct net_backend *be)

	struct netmap_priv *priv = NET_BE_PRIV(be);

	mevent_disable(priv->mevp);

static struct net_backend netmap_backend = {
	.priv_size = sizeof(struct netmap_priv),
	.cleanup = netmap_cleanup,
	.peek_recvlen = netmap_peek_recvlen,
	.recv_enable = netmap_recv_enable,
	.recv_disable = netmap_recv_disable,
	.get_cap = netmap_get_cap,
	.set_cap = netmap_set_cap,

/* A clone of the netmap backend, with a different prefix. */
static struct net_backend vale_backend = {
	.priv_size = sizeof(struct netmap_priv),
	.cleanup = netmap_cleanup,
	.peek_recvlen = netmap_peek_recvlen,
	.recv_enable = netmap_recv_enable,
	.recv_disable = netmap_recv_disable,
	.get_cap = netmap_get_cap,
	.set_cap = netmap_set_cap,

DATA_SET(net_backend_set, netmap_backend);
DATA_SET(net_backend_set, vale_backend);

#else /* __FreeBSD__ */
/*
 * The illumos dlpi backend
 */

/*
 * The size of the bounce buffer used to implement the peek callback.
 * This value should be big enough to accommodate the largest of all possible
 * frontend packet lengths. The value here matches the definition of
 * VTNET_MAX_PKT_LEN in pci_virtio_net.c
 */
#define DLPI_BBUF_SIZE (65536 + 64)

typedef struct be_dlpi_priv {
	dlpi_handle_t bdp_dhp;
	struct mevent *bdp_mevp;
	/*
	 * A bounce buffer that allows us to implement the peek_recvlen
	 * callback. Each structure is only used by a single thread so
	 */
	uint8_t bdp_bbuf[DLPI_BBUF_SIZE];

be_dlpi_cleanup(net_backend_t *be)

	be_dlpi_priv_t *priv = NET_BE_PRIV(be);

	if (priv->bdp_dhp != NULL)
		dlpi_close(priv->bdp_dhp);
	priv->bdp_dhp = NULL;

	if (priv->bdp_mevp != NULL)
		mevent_delete(priv->bdp_mevp);
	priv->bdp_mevp = NULL;

	priv->bdp_bbuflen = 0;

be_dlpi_err(int ret, const char *dev, char *msg)

	WPRINTF(("%s: %s (%s)", dev, msg, dlpi_strerror(ret)));
be_dlpi_init(net_backend_t *be, const char *devname __unused,
    nvlist_t *nvl, net_be_rxeof_t cb, void *param)

	be_dlpi_priv_t *priv = NET_BE_PRIV(be);

		WPRINTF(("dlpi backend requires non-NULL callback"));

	vnic = get_config_value_node(nvl, "vnic");
		WPRINTF(("dlpi backend requires a VNIC"));

	priv->bdp_bbuflen = 0;

	ret = dlpi_open(vnic, &priv->bdp_dhp, DLPI_RAW);

	if (ret != DLPI_SUCCESS) {
		be_dlpi_err(ret, vnic, "open failed");

	if ((ret = dlpi_bind(priv->bdp_dhp, DLPI_ANY_SAP, NULL)) !=
		be_dlpi_err(ret, vnic, "bind failed");
	if (get_config_bool_node_default(nvl, "promiscrxonly", true)) {
		if ((ret = dlpi_promiscon(priv->bdp_dhp, DL_PROMISC_RX_ONLY)) !=
			be_dlpi_err(ret, vnic,
			    "enable promiscuous mode(rxonly) failed");

	if (get_config_bool_node_default(nvl, "promiscphys", false)) {
		if ((ret = dlpi_promiscon(priv->bdp_dhp, DL_PROMISC_PHYS)) !=
			be_dlpi_err(ret, vnic,
			    "enable promiscuous mode(physical) failed");

	if (get_config_bool_node_default(nvl, "promiscsap", true)) {
		if ((ret = dlpi_promiscon(priv->bdp_dhp, DL_PROMISC_SAP)) !=
			be_dlpi_err(ret, vnic,
			    "enable promiscuous mode(SAP) failed");

	if (get_config_bool_node_default(nvl, "promiscmulti", true)) {
		if ((ret = dlpi_promiscon(priv->bdp_dhp, DL_PROMISC_MULTI)) !=
			be_dlpi_err(ret, vnic,
			    "enable promiscuous mode(multicast) failed");
	be->fd = dlpi_fd(priv->bdp_dhp);

	if (fcntl(be->fd, F_SETFL, O_NONBLOCK) < 0) {
		WPRINTF(("%s: enable O_NONBLOCK failed", vnic));

	priv->bdp_mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
	if (priv->bdp_mevp == NULL) {
		WPRINTF(("Could not register event"));

	be_dlpi_cleanup(be);

/*
 * Called to send a buffer chain out to the dlpi device
 */
be_dlpi_send(net_backend_t *be, const struct iovec *iov, int iovcnt)

	be_dlpi_priv_t *priv = NET_BE_PRIV(be);

		len = iov[0].iov_len;
		ret = dlpi_send(priv->bdp_dhp, NULL, 0, iov[0].iov_base, len,

		len = iov_to_buf(iov, iovcnt, &buf);

		if (len <= 0 || buf == NULL)

		ret = dlpi_send(priv->bdp_dhp, NULL, 0, buf, len, NULL);

	if (ret != DLPI_SUCCESS)
be_dlpi_peek_recvlen(net_backend_t *be)

	be_dlpi_priv_t *priv = NET_BE_PRIV(be);
	dlpi_recvinfo_t recv;

	/*
	 * We already have a packet in the bounce buffer.
	 * Just return its length.
	 */
	if (priv->bdp_bbuflen > 0)
		return (priv->bdp_bbuflen);

	/*
	 * Read the next packet (if any) into the bounce buffer, so
	 * that we get to know its length and we can return that
	 * information to the caller.
	 */
	len = sizeof (priv->bdp_bbuf);
	ret = dlpi_recv(priv->bdp_dhp, NULL, NULL, priv->bdp_bbuf, &len,

	if (ret == DL_SYSERR) {
		if (errno == EWOULDBLOCK)
	} else if (ret == DLPI_ETIMEDOUT) {
	} else if (ret != DLPI_SUCCESS) {

	if (recv.dri_totmsglen > sizeof (priv->bdp_bbuf)) {
		EPRINTLN("DLPI bounce buffer was too small! - needed %x bytes",
		    recv.dri_totmsglen);

	priv->bdp_bbuflen = len;
be_dlpi_recv(net_backend_t *be, const struct iovec *iov, int iovcnt)

	be_dlpi_priv_t *priv = NET_BE_PRIV(be);

	if (priv->bdp_bbuflen > 0) {
		/*
		 * A packet is available in the bounce buffer, so
		 * we read it from there.
		 */
		len = buf_to_iov(priv->bdp_bbuf, priv->bdp_bbuflen,

		/* Mark the bounce buffer as empty. */
		priv->bdp_bbuflen = 0;

		len = iov[0].iov_len;
		ret = dlpi_recv(priv->bdp_dhp, NULL, NULL,
		    (uint8_t *)iov[0].iov_base, &len, 0, NULL);
		if (ret == DL_SYSERR) {
			if (errno == EWOULDBLOCK)
		} else if (ret == DLPI_ETIMEDOUT) {
		} else if (ret != DLPI_SUCCESS) {
be_dlpi_recv_enable(net_backend_t *be)

	be_dlpi_priv_t *priv = NET_BE_PRIV(be);

	mevent_enable(priv->bdp_mevp);

be_dlpi_recv_disable(net_backend_t *be)

	be_dlpi_priv_t *priv = NET_BE_PRIV(be);

	mevent_disable(priv->bdp_mevp);

be_dlpi_get_cap(net_backend_t *be)

	return (0); /* no capabilities for now */

be_dlpi_set_cap(net_backend_t *be, uint64_t features,
    unsigned vnet_hdr_len)

	return ((features || vnet_hdr_len) ? -1 : 0);

be_dlpi_get_mac(net_backend_t *be, void *buf, size_t *buflen)

	be_dlpi_priv_t *priv = NET_BE_PRIV(be);
	uchar_t physaddr[DLPI_PHYSADDR_MAX];
	size_t physaddrlen = DLPI_PHYSADDR_MAX;

	if ((ret = dlpi_get_physaddr(priv->bdp_dhp, DL_CURR_PHYS_ADDR,
	    physaddr, &physaddrlen)) != DLPI_SUCCESS) {
		be_dlpi_err(ret, dlpi_linkname(priv->bdp_dhp),
		    "read MAC address failed");

	if (physaddrlen != ETHERADDRL) {
		WPRINTF(("%s: bad MAC address len %d",
		    dlpi_linkname(priv->bdp_dhp), physaddrlen));

	if (physaddrlen > *buflen) {
		WPRINTF(("%s: MAC address too long (%d bytes required)",
		    dlpi_linkname(priv->bdp_dhp), physaddrlen));

	*buflen = physaddrlen;
	memcpy(buf, physaddr, *buflen);
static struct net_backend dlpi_backend = {
	.priv_size = sizeof(struct be_dlpi_priv),
	.init = be_dlpi_init,
	.cleanup = be_dlpi_cleanup,
	.send = be_dlpi_send,
	.peek_recvlen = be_dlpi_peek_recvlen,
	.recv = be_dlpi_recv,
	.recv_enable = be_dlpi_recv_enable,
	.recv_disable = be_dlpi_recv_disable,
	.get_cap = be_dlpi_get_cap,
	.set_cap = be_dlpi_set_cap,
	.get_mac = be_dlpi_get_mac,

DATA_SET(net_backend_set, dlpi_backend);

#endif /* __FreeBSD__ */
netbe_legacy_config(nvlist_t *nvl, const char *opts)

	cp = strchr(opts, ',');

		set_config_value_node(nvl, "backend", opts);

	backend = strndup(opts, cp - opts);
	set_config_value_node(nvl, "backend", backend);

	return (pci_parse_legacy_config(nvl, cp + 1));
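
/*
 * For illustration (a hypothetical option string, not taken from the
 * sources): given legacy opts such as "tap0,mac=00:a0:98:00:00:01", the
 * FreeBSD variant above stores "tap0" under the "backend" key and hands
 * the remaining "mac=..." portion to pci_parse_legacy_config(), while a
 * bare "tap0" with no comma is stored as the backend name unchanged.
 */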
netbe_legacy_config(nvlist_t *nvl, const char *opts)

	char *config, *name, *tofree, *value;

	/* Default to the 'dlpi' backend - can still be overridden by opts */
	set_config_value_node(nvl, "backend", "dlpi");
	set_config_value_node(nvl, "type", "dlpi");

	config = tofree = strdup(opts);
		err(4, "netbe_legacy_config strdup()");
	while ((name = strsep(&config, ",")) != NULL) {
		value = strchr(name, '=');
		if (value != NULL) {
			set_config_value_node(nvl, name, value);

			set_config_value_node(nvl, "vnic", name);
/*
 * Initialize a backend and attach to the frontend.
 * This is called during frontend initialization.
 *  @ret is a pointer to the backend to be initialized
 *  @devname is the backend-name as supplied on the command line,
 *      e.g. -s 2:0,frontend-name,backend-name[,other-args]
 *  @cb is the receive callback supplied by the frontend,
 *      and it is invoked in the event loop when a receive
 *      event is generated in the hypervisor,
 *  @param is a pointer to the frontend, and normally used as
 *      the argument for the callback.
 */
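
/*
 * A minimal usage sketch (hypothetical frontend code, assuming the
 * frontend has already built its nvlist 'nvl' and has a softc 'sc' and
 * an rx callback 'rx_callback'):
 *
 *	struct net_backend *be;
 *
 *	if (netbe_init(&be, nvl, rx_callback, sc) < 0)
 *		return (-1);
 *	netbe_set_cap(be, negotiated_features, vnet_hdr_len);
 *	netbe_rx_enable(be);
 */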
netbe_init(struct net_backend **ret, nvlist_t *nvl, net_be_rxeof_t cb,
    void *param)
	struct net_backend **pbe, *nbe, *tbe = NULL;
	const char *value, *type;

	value = get_config_value_node(nvl, "backend");
	if (value == NULL) {

	devname = strdup(value);
	/*
	 * Use the type given by the configuration if it exists; otherwise
	 * use the prefix of the backend as the type.
	 */
	type = get_config_value_node(nvl, "type");

	/*
	 * Find the network backend that matches the user-provided
	 * device name. net_backend_set is built using a linker set.
	 */
	SET_FOREACH(pbe, net_backend_set) {
		if (strncmp(type, (*pbe)->prefix,
		    strlen((*pbe)->prefix)) == 0) {
			assert(tbe->init != NULL);
			assert(tbe->cleanup != NULL);
			assert(tbe->send != NULL);
			assert(tbe->recv != NULL);
			assert(tbe->get_cap != NULL);
			assert(tbe->set_cap != NULL);

	nbe = calloc(1, NET_BE_SIZE(tbe));
	*nbe = *tbe;	/* copy the template */

	nbe->be_vnet_hdr_len = 0;
	nbe->fe_vnet_hdr_len = 0;

	/* Initialize the backend. */
	err = nbe->init(nbe, devname, nvl, cb, param);
netbe_cleanup(struct net_backend *be)

netbe_get_cap(struct net_backend *be)

	return (be->get_cap(be));

netbe_set_cap(struct net_backend *be, uint64_t features,
    unsigned vnet_hdr_len)

	/* There are only three valid lengths, i.e., 0, 10 and 12. */
	if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN
	    && vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t)))

	be->fe_vnet_hdr_len = vnet_hdr_len;

	ret = be->set_cap(be, features, vnet_hdr_len);
	assert(be->be_vnet_hdr_len == 0 ||
	    be->be_vnet_hdr_len == be->fe_vnet_hdr_len);

netbe_send(struct net_backend *be, const struct iovec *iov, int iovcnt)

	return (be->send(be, iov, iovcnt));

netbe_peek_recvlen(struct net_backend *be)

	return (be->peek_recvlen(be));

/*
 * Try to read a packet from the backend, without blocking.
 * If no packets are available, return 0. In case of success, return
 * the length of the packet just read. Return -1 in case of errors.
 */
netbe_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)

	return (be->recv(be, iov, iovcnt));

/*
 * Read a packet from the backend and discard it.
 * Returns the size of the discarded packet or zero if no packet was available.
 * A negative error code is returned in case of read error.
 */
netbe_rx_discard(struct net_backend *be)
	/*
	 * MP note: the dummybuf is only used to discard frames,
	 * so there is no need for it to be per-vtnet or locked.
	 * We only make it large enough for a TSO-sized segment.
	 */
	static uint8_t dummybuf[65536 + 64];

#ifdef __FreeBSD__
	iov.iov_base = dummybuf;
#else
	iov.iov_base = (caddr_t)dummybuf;
#endif
	iov.iov_len = sizeof(dummybuf);

	return netbe_recv(be, &iov, 1);

netbe_rx_disable(struct net_backend *be)

	return be->recv_disable(be);

netbe_rx_enable(struct net_backend *be)

	return be->recv_enable(be);

netbe_get_vnet_hdr_len(struct net_backend *be)

	return (be->be_vnet_hdr_len);

netbe_get_mac(net_backend_t *be, void *buf, size_t *buflen)

	if (be->get_mac == NULL)

	return (be->get_mac(be, buf, buflen));