9pfs: rename virtio-9p-posix-acl.c to 9p-posix-acl.c
[qemu/ar7.git] / tests / vhost-user-bridge.c
blob9fb09f1df463bdfcd3a4486e647973ace2e06358
1 /*
2 * Vhost User Bridge
4 * Copyright (c) 2015 Red Hat, Inc.
6 * Authors:
7 * Victor Kaplansky <victork@redhat.com>
9 * This work is licensed under the terms of the GNU GPL, version 2 or
10 * later. See the COPYING file in the top-level directory.
14 * TODO:
15 * - main should get parameters from the command line.
16 * - implement all request handlers. Still not implemented:
17 * vubr_get_queue_num_exec()
18 * vubr_send_rarp_exec()
19 * - test for broken requests and virtqueue.
20 * - implement features defined by Virtio 1.0 spec.
21 * - support mergeable buffers and indirect descriptors.
22 * - implement clean shutdown.
23 * - implement non-blocking writes to UDP backend.
24 * - implement polling strategy.
25 * - implement clean starting/stopping of vq processing
26 * - implement clean starting/stopping of used and buffers
27 * dirty page logging.
30 #define _FILE_OFFSET_BITS 64
32 #include <stddef.h>
33 #include <assert.h>
34 #include <stdio.h>
35 #include <stdlib.h>
36 #include <stdint.h>
37 #include <inttypes.h>
38 #include <string.h>
39 #include <unistd.h>
40 #include <errno.h>
41 #include <sys/types.h>
42 #include <sys/socket.h>
43 #include <sys/un.h>
44 #include <sys/unistd.h>
45 #include <sys/mman.h>
46 #include <sys/eventfd.h>
47 #include <arpa/inet.h>
48 #include <ctype.h>
49 #include <netdb.h>
51 #include <linux/vhost.h>
53 #include "qemu/atomic.h"
54 #include "standard-headers/linux/virtio_net.h"
55 #include "standard-headers/linux/virtio_ring.h"
/* Set to 0 to silence all DPRINT() tracing. */
#define VHOST_USER_BRIDGE_DEBUG 1

/* printf()-style debug trace; emits output only while
 * VHOST_USER_BRIDGE_DEBUG is non-zero. Wrapped in do/while (0) so it
 * behaves as a single statement. */
#define DPRINT(...)                       \
    do {                                  \
        if (VHOST_USER_BRIDGE_DEBUG) {    \
            printf(__VA_ARGS__);          \
        }                                 \
    } while (0)
/* Handler invoked by the dispatcher when `sock` is reported readable;
 * `ctx` is the pointer that was registered together with the handler. */
typedef void (*CallbackFunc)(int sock, void *ctx);

/* One registered handler; the dispatcher keeps one slot per descriptor. */
typedef struct Event {
    void *ctx;              /* passed back verbatim to callback */
    CallbackFunc callback;  /* called when the descriptor is readable */
} Event;

/* select()-based event loop state. */
typedef struct Dispatcher {
    int max_sock;               /* highest descriptor ever added, -1 if none */
    fd_set fdset;               /* descriptors currently being watched */
    Event events[FD_SETSIZE];   /* handler table indexed by descriptor */
} Dispatcher;
/* Fatal-error helper: print `s` followed by the description of the current
 * errno value (via perror) and terminate the process with status 1. */
static void
vubr_die(const char *s)
{
    perror(s);
    exit(1);
}
86 static int
87 dispatcher_init(Dispatcher *dispr)
89 FD_ZERO(&dispr->fdset);
90 dispr->max_sock = -1;
91 return 0;
94 static int
95 dispatcher_add(Dispatcher *dispr, int sock, void *ctx, CallbackFunc cb)
97 if (sock >= FD_SETSIZE) {
98 fprintf(stderr,
99 "Error: Failed to add new event. sock %d should be less than %d\n",
100 sock, FD_SETSIZE);
101 return -1;
104 dispr->events[sock].ctx = ctx;
105 dispr->events[sock].callback = cb;
107 FD_SET(sock, &dispr->fdset);
108 if (sock > dispr->max_sock) {
109 dispr->max_sock = sock;
111 DPRINT("Added sock %d for watching. max_sock: %d\n",
112 sock, dispr->max_sock);
113 return 0;
116 /* dispatcher_remove() is not currently in use but may be useful
117 * in the future. */
118 static int
119 dispatcher_remove(Dispatcher *dispr, int sock)
121 if (sock >= FD_SETSIZE) {
122 fprintf(stderr,
123 "Error: Failed to remove event. sock %d should be less than %d\n",
124 sock, FD_SETSIZE);
125 return -1;
128 FD_CLR(sock, &dispr->fdset);
129 DPRINT("Sock %d removed from dispatcher watch.\n", sock);
130 return 0;
133 /* timeout in us */
134 static int
135 dispatcher_wait(Dispatcher *dispr, uint32_t timeout)
137 struct timeval tv;
138 tv.tv_sec = timeout / 1000000;
139 tv.tv_usec = timeout % 1000000;
141 fd_set fdset = dispr->fdset;
143 /* wait until some of sockets become readable. */
144 int rc = select(dispr->max_sock + 1, &fdset, 0, 0, &tv);
146 if (rc == -1) {
147 vubr_die("select");
150 /* Timeout */
151 if (rc == 0) {
152 return 0;
155 /* Now call callback for every ready socket. */
157 int sock;
158 for (sock = 0; sock < dispr->max_sock + 1; sock++) {
159 /* The callback on a socket can remove other sockets from the
160 * dispatcher, thus we have to check that the socket is
161 * still not removed from dispatcher's list
163 if (FD_ISSET(sock, &fdset) && FD_ISSET(sock, &dispr->fdset)) {
164 Event *e = &dispr->events[sock];
165 e->callback(sock, e->ctx);
169 return 0;
/* Per-virtqueue state kept by the bridge. */
typedef struct VubrVirtq {
    int call_fd;                /* eventfd written to notify the guest, -1 if unset */
    int kick_fd;                /* eventfd the guest kicks, -1 if unset */
    uint32_t size;              /* ring size set by VHOST_USER_SET_VRING_NUM */
    uint16_t last_avail_index;  /* next avail-ring entry to consume */
    uint16_t last_used_index;   /* next used-ring entry to fill */
    struct vring_desc *desc;    /* descriptor table, in our address space */
    struct vring_avail *avail;  /* avail ring, in our address space */
    struct vring_used *used;    /* used ring, in our address space */
    uint64_t log_guest_addr;    /* address of the used ring for dirty logging */
    int enable;                 /* value from VHOST_USER_SET_VRING_ENABLE */
} VubrVirtq;
185 /* Based on qemu/hw/virtio/vhost-user.c */
187 #define VHOST_MEMORY_MAX_NREGIONS 8
188 #define VHOST_USER_F_PROTOCOL_FEATURES 30
190 #define VHOST_LOG_PAGE 4096
/* Bit positions of the vhost-user protocol features. */
enum VhostUserProtocolFeature {
    VHOST_USER_PROTOCOL_F_MQ = 0,
    VHOST_USER_PROTOCOL_F_LOG_SHMFD = 1,
    VHOST_USER_PROTOCOL_F_RARP = 2,

    /* Number of defined feature bits; keep last. */
    VHOST_USER_PROTOCOL_F_MAX
};

/* Mask with one bit set for every defined protocol feature. */
#define VHOST_USER_PROTOCOL_FEATURE_MASK ((1 << VHOST_USER_PROTOCOL_F_MAX) - 1)
/* Request codes carried in VhostUserMsg.request. The numeric values are
 * fixed explicitly; VHOST_USER_MAX is one past the last valid request. */
typedef enum VhostUserRequest {
    VHOST_USER_NONE                  = 0,
    VHOST_USER_GET_FEATURES          = 1,
    VHOST_USER_SET_FEATURES          = 2,
    VHOST_USER_SET_OWNER             = 3,
    VHOST_USER_RESET_OWNER           = 4,
    VHOST_USER_SET_MEM_TABLE         = 5,
    VHOST_USER_SET_LOG_BASE          = 6,
    VHOST_USER_SET_LOG_FD            = 7,
    VHOST_USER_SET_VRING_NUM         = 8,
    VHOST_USER_SET_VRING_ADDR        = 9,
    VHOST_USER_SET_VRING_BASE        = 10,
    VHOST_USER_GET_VRING_BASE        = 11,
    VHOST_USER_SET_VRING_KICK        = 12,
    VHOST_USER_SET_VRING_CALL        = 13,
    VHOST_USER_SET_VRING_ERR         = 14,
    VHOST_USER_GET_PROTOCOL_FEATURES = 15,
    VHOST_USER_SET_PROTOCOL_FEATURES = 16,
    VHOST_USER_GET_QUEUE_NUM         = 17,
    VHOST_USER_SET_VRING_ENABLE      = 18,
    VHOST_USER_SEND_RARP             = 19,
    VHOST_USER_MAX
} VhostUserRequest;
/* One memory region as described in a VHOST_USER_SET_MEM_TABLE payload. */
typedef struct VhostUserMemoryRegion {
    uint64_t guest_phys_addr;   /* guest physical start address */
    uint64_t memory_size;       /* region length in bytes */
    uint64_t userspace_addr;    /* start address in the sender's address space */
    uint64_t mmap_offset;       /* offset of the region inside the passed fd */
} VhostUserMemoryRegion;
233 typedef struct VhostUserMemory {
234 uint32_t nregions;
235 uint32_t padding;
236 VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS];
237 } VhostUserMemory;
/* Payload of VHOST_USER_SET_LOG_BASE: where to map the dirty log
 * inside the file descriptor that accompanies the request. */
typedef struct VhostUserLog {
    uint64_t mmap_size;     /* length of the mapping in bytes */
    uint64_t mmap_offset;   /* offset passed to mmap() */
} VhostUserLog;
244 typedef struct VhostUserMsg {
245 VhostUserRequest request;
247 #define VHOST_USER_VERSION_MASK (0x3)
248 #define VHOST_USER_REPLY_MASK (0x1<<2)
249 uint32_t flags;
250 uint32_t size; /* the following payload size */
251 union {
252 #define VHOST_USER_VRING_IDX_MASK (0xff)
253 #define VHOST_USER_VRING_NOFD_MASK (0x1<<8)
254 uint64_t u64;
255 struct vhost_vring_state state;
256 struct vhost_vring_addr addr;
257 VhostUserMemory memory;
258 VhostUserLog log;
259 } payload;
260 int fds[VHOST_MEMORY_MAX_NREGIONS];
261 int fd_num;
262 } QEMU_PACKED VhostUserMsg;
264 #define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
266 /* The version of the protocol we support */
267 #define VHOST_USER_VERSION (0x1)
269 #define MAX_NR_VIRTQUEUE (8)
/* A guest memory region as mapped into the bridge's address space. */
typedef struct VubrDevRegion {
    uint64_t gpa;           /* guest physical address of the region start */
    uint64_t size;          /* region size in bytes */
    uint64_t qva;           /* QEMU (userspace) virtual address of the start */
    uint64_t mmap_offset;   /* offset of the region inside our mapping */
    uint64_t mmap_addr;     /* start address of our mapping */
} VubrDevRegion;
284 typedef struct VubrDev {
285 int sock;
286 Dispatcher dispatcher;
287 uint32_t nregions;
288 VubrDevRegion regions[VHOST_MEMORY_MAX_NREGIONS];
289 VubrVirtq vq[MAX_NR_VIRTQUEUE];
290 int log_call_fd;
291 uint64_t log_size;
292 uint8_t *log_table;
293 int backend_udp_sock;
294 struct sockaddr_in backend_udp_dest;
295 int ready;
296 uint64_t features;
297 } VubrDev;
299 static const char *vubr_request_str[] = {
300 [VHOST_USER_NONE] = "VHOST_USER_NONE",
301 [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES",
302 [VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES",
303 [VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER",
304 [VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER",
305 [VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE",
306 [VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE",
307 [VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD",
308 [VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM",
309 [VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR",
310 [VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE",
311 [VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE",
312 [VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK",
313 [VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL",
314 [VHOST_USER_SET_VRING_ERR] = "VHOST_USER_SET_VRING_ERR",
315 [VHOST_USER_GET_PROTOCOL_FEATURES] = "VHOST_USER_GET_PROTOCOL_FEATURES",
316 [VHOST_USER_SET_PROTOCOL_FEATURES] = "VHOST_USER_SET_PROTOCOL_FEATURES",
317 [VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM",
318 [VHOST_USER_SET_VRING_ENABLE] = "VHOST_USER_SET_VRING_ENABLE",
319 [VHOST_USER_SEND_RARP] = "VHOST_USER_SEND_RARP",
320 [VHOST_USER_MAX] = "VHOST_USER_MAX",
/* Hex-dump `len` bytes of `buf` to stdout: a new output line every 16
 * bytes and an extra separator every 4 bytes, followed by a dotted rule.
 * The buffer is only read. */
static void
print_buffer(uint8_t *buf, size_t len)
{
    /* size_t index: matches the type of `len`, avoiding the signed/unsigned
     * comparison the former `int` counter caused. */
    size_t i;

    printf("Raw buffer:\n");
    for (i = 0; i < len; i++) {
        if (i % 16 == 0) {
            printf("\n");
        }
        if (i % 4 == 0) {
            printf(" ");
        }
        printf("%02x ", buf[i]);
    }
    printf("\n............................................................\n");
}
340 /* Translate guest physical address to our virtual address. */
341 static uint64_t
342 gpa_to_va(VubrDev *dev, uint64_t guest_addr)
344 int i;
346 /* Find matching memory region. */
347 for (i = 0; i < dev->nregions; i++) {
348 VubrDevRegion *r = &dev->regions[i];
350 if ((guest_addr >= r->gpa) && (guest_addr < (r->gpa + r->size))) {
351 return guest_addr - r->gpa + r->mmap_addr + r->mmap_offset;
355 assert(!"address not found in regions");
356 return 0;
359 /* Translate qemu virtual address to our virtual address. */
360 static uint64_t
361 qva_to_va(VubrDev *dev, uint64_t qemu_addr)
363 int i;
365 /* Find matching memory region. */
366 for (i = 0; i < dev->nregions; i++) {
367 VubrDevRegion *r = &dev->regions[i];
369 if ((qemu_addr >= r->qva) && (qemu_addr < (r->qva + r->size))) {
370 return qemu_addr - r->qva + r->mmap_addr + r->mmap_offset;
374 assert(!"address not found in regions");
375 return 0;
378 static void
379 vubr_message_read(int conn_fd, VhostUserMsg *vmsg)
381 char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))] = { };
382 struct iovec iov = {
383 .iov_base = (char *)vmsg,
384 .iov_len = VHOST_USER_HDR_SIZE,
386 struct msghdr msg = {
387 .msg_iov = &iov,
388 .msg_iovlen = 1,
389 .msg_control = control,
390 .msg_controllen = sizeof(control),
392 size_t fd_size;
393 struct cmsghdr *cmsg;
394 int rc;
396 rc = recvmsg(conn_fd, &msg, 0);
398 if (rc == 0) {
399 vubr_die("recvmsg");
400 fprintf(stderr, "Peer disconnected.\n");
401 exit(1);
403 if (rc < 0) {
404 vubr_die("recvmsg");
407 vmsg->fd_num = 0;
408 for (cmsg = CMSG_FIRSTHDR(&msg);
409 cmsg != NULL;
410 cmsg = CMSG_NXTHDR(&msg, cmsg))
412 if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
413 fd_size = cmsg->cmsg_len - CMSG_LEN(0);
414 vmsg->fd_num = fd_size / sizeof(int);
415 memcpy(vmsg->fds, CMSG_DATA(cmsg), fd_size);
416 break;
420 if (vmsg->size > sizeof(vmsg->payload)) {
421 fprintf(stderr,
422 "Error: too big message request: %d, size: vmsg->size: %u, "
423 "while sizeof(vmsg->payload) = %lu\n",
424 vmsg->request, vmsg->size, sizeof(vmsg->payload));
425 exit(1);
428 if (vmsg->size) {
429 rc = read(conn_fd, &vmsg->payload, vmsg->size);
430 if (rc == 0) {
431 vubr_die("recvmsg");
432 fprintf(stderr, "Peer disconnected.\n");
433 exit(1);
435 if (rc < 0) {
436 vubr_die("recvmsg");
439 assert(rc == vmsg->size);
443 static void
444 vubr_message_write(int conn_fd, VhostUserMsg *vmsg)
446 int rc;
448 do {
449 rc = write(conn_fd, vmsg, VHOST_USER_HDR_SIZE + vmsg->size);
450 } while (rc < 0 && errno == EINTR);
452 if (rc < 0) {
453 vubr_die("write");
457 static void
458 vubr_backend_udp_sendbuf(VubrDev *dev, uint8_t *buf, size_t len)
460 int slen = sizeof(struct sockaddr_in);
462 if (sendto(dev->backend_udp_sock, buf, len, 0,
463 (struct sockaddr *) &dev->backend_udp_dest, slen) == -1) {
464 vubr_die("sendto()");
468 static int
469 vubr_backend_udp_recvbuf(VubrDev *dev, uint8_t *buf, size_t buflen)
471 int slen = sizeof(struct sockaddr_in);
472 int rc;
474 rc = recvfrom(dev->backend_udp_sock, buf, buflen, 0,
475 (struct sockaddr *) &dev->backend_udp_dest,
476 (socklen_t *)&slen);
477 if (rc == -1) {
478 vubr_die("recvfrom()");
481 return rc;
484 static void
485 vubr_consume_raw_packet(VubrDev *dev, uint8_t *buf, uint32_t len)
487 int hdrlen = sizeof(struct virtio_net_hdr_v1);
489 if (VHOST_USER_BRIDGE_DEBUG) {
490 print_buffer(buf, len);
492 vubr_backend_udp_sendbuf(dev, buf + hdrlen, len - hdrlen);
495 /* Kick the log_call_fd if required. */
496 static void
497 vubr_log_kick(VubrDev *dev)
499 if (dev->log_call_fd != -1) {
500 DPRINT("Kicking the QEMU's log...\n");
501 eventfd_write(dev->log_call_fd, 1);
505 /* Kick the guest if necessary. */
506 static void
507 vubr_virtqueue_kick(VubrVirtq *vq)
509 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) {
510 DPRINT("Kicking the guest...\n");
511 eventfd_write(vq->call_fd, 1);
/* Mark guest page number `page` dirty in the bitmap `log_table`:
 * one bit per page, eight pages per byte, set atomically. */
static void
vubr_log_page(uint8_t *log_table, uint64_t page)
{
    uint8_t *slot = &log_table[page / 8];

    DPRINT("Logged dirty guest page: %"PRId64"\n", page);
    atomic_or(slot, 1 << (page % 8));
}
522 static void
523 vubr_log_write(VubrDev *dev, uint64_t address, uint64_t length)
525 uint64_t page;
527 if (!(dev->features & (1ULL << VHOST_F_LOG_ALL)) ||
528 !dev->log_table || !length) {
529 return;
532 assert(dev->log_size > ((address + length - 1) / VHOST_LOG_PAGE / 8));
534 page = address / VHOST_LOG_PAGE;
535 while (page * VHOST_LOG_PAGE < address + length) {
536 vubr_log_page(dev->log_table, page);
537 page += VHOST_LOG_PAGE;
539 vubr_log_kick(dev);
542 static void
543 vubr_post_buffer(VubrDev *dev, VubrVirtq *vq, uint8_t *buf, int32_t len)
545 struct vring_desc *desc = vq->desc;
546 struct vring_avail *avail = vq->avail;
547 struct vring_used *used = vq->used;
548 uint64_t log_guest_addr = vq->log_guest_addr;
550 unsigned int size = vq->size;
552 uint16_t avail_index = atomic_mb_read(&avail->idx);
554 /* We check the available descriptors before posting the
555 * buffer, so here we assume that enough available
556 * descriptors. */
557 assert(vq->last_avail_index != avail_index);
558 uint16_t a_index = vq->last_avail_index % size;
559 uint16_t u_index = vq->last_used_index % size;
560 uint16_t d_index = avail->ring[a_index];
562 int i = d_index;
564 DPRINT("Post packet to guest on vq:\n");
565 DPRINT(" size = %d\n", vq->size);
566 DPRINT(" last_avail_index = %d\n", vq->last_avail_index);
567 DPRINT(" last_used_index = %d\n", vq->last_used_index);
568 DPRINT(" a_index = %d\n", a_index);
569 DPRINT(" u_index = %d\n", u_index);
570 DPRINT(" d_index = %d\n", d_index);
571 DPRINT(" desc[%d].addr = 0x%016"PRIx64"\n", i, desc[i].addr);
572 DPRINT(" desc[%d].len = %d\n", i, desc[i].len);
573 DPRINT(" desc[%d].flags = %d\n", i, desc[i].flags);
574 DPRINT(" avail->idx = %d\n", avail_index);
575 DPRINT(" used->idx = %d\n", used->idx);
577 if (!(desc[i].flags & VRING_DESC_F_WRITE)) {
578 /* FIXME: we should find writable descriptor. */
579 fprintf(stderr, "Error: descriptor is not writable. Exiting.\n");
580 exit(1);
583 void *chunk_start = (void *)gpa_to_va(dev, desc[i].addr);
584 uint32_t chunk_len = desc[i].len;
586 if (len <= chunk_len) {
587 memcpy(chunk_start, buf, len);
588 vubr_log_write(dev, desc[i].addr, len);
589 } else {
590 fprintf(stderr,
591 "Received too long packet from the backend. Dropping...\n");
592 return;
595 /* Add descriptor to the used ring. */
596 used->ring[u_index].id = d_index;
597 used->ring[u_index].len = len;
598 vubr_log_write(dev,
599 log_guest_addr + offsetof(struct vring_used, ring[u_index]),
600 sizeof(used->ring[u_index]));
602 vq->last_avail_index++;
603 vq->last_used_index++;
605 atomic_mb_set(&used->idx, vq->last_used_index);
606 vubr_log_write(dev,
607 log_guest_addr + offsetof(struct vring_used, idx),
608 sizeof(used->idx));
610 /* Kick the guest if necessary. */
611 vubr_virtqueue_kick(vq);
614 static int
615 vubr_process_desc(VubrDev *dev, VubrVirtq *vq)
617 struct vring_desc *desc = vq->desc;
618 struct vring_avail *avail = vq->avail;
619 struct vring_used *used = vq->used;
620 uint64_t log_guest_addr = vq->log_guest_addr;
622 unsigned int size = vq->size;
624 uint16_t a_index = vq->last_avail_index % size;
625 uint16_t u_index = vq->last_used_index % size;
626 uint16_t d_index = avail->ring[a_index];
628 uint32_t i, len = 0;
629 size_t buf_size = 4096;
630 uint8_t buf[4096];
632 DPRINT("Chunks: ");
633 i = d_index;
634 do {
635 void *chunk_start = (void *)gpa_to_va(dev, desc[i].addr);
636 uint32_t chunk_len = desc[i].len;
638 assert(!(desc[i].flags & VRING_DESC_F_WRITE));
640 if (len + chunk_len < buf_size) {
641 memcpy(buf + len, chunk_start, chunk_len);
642 DPRINT("%d ", chunk_len);
643 } else {
644 fprintf(stderr, "Error: too long packet. Dropping...\n");
645 break;
648 len += chunk_len;
650 if (!(desc[i].flags & VRING_DESC_F_NEXT)) {
651 break;
654 i = desc[i].next;
655 } while (1);
656 DPRINT("\n");
658 if (!len) {
659 return -1;
662 /* Add descriptor to the used ring. */
663 used->ring[u_index].id = d_index;
664 used->ring[u_index].len = len;
665 vubr_log_write(dev,
666 log_guest_addr + offsetof(struct vring_used, ring[u_index]),
667 sizeof(used->ring[u_index]));
669 vubr_consume_raw_packet(dev, buf, len);
671 return 0;
674 static void
675 vubr_process_avail(VubrDev *dev, VubrVirtq *vq)
677 struct vring_avail *avail = vq->avail;
678 struct vring_used *used = vq->used;
679 uint64_t log_guest_addr = vq->log_guest_addr;
681 while (vq->last_avail_index != atomic_mb_read(&avail->idx)) {
682 vubr_process_desc(dev, vq);
683 vq->last_avail_index++;
684 vq->last_used_index++;
687 atomic_mb_set(&used->idx, vq->last_used_index);
688 vubr_log_write(dev,
689 log_guest_addr + offsetof(struct vring_used, idx),
690 sizeof(used->idx));
693 static void
694 vubr_backend_recv_cb(int sock, void *ctx)
696 VubrDev *dev = (VubrDev *) ctx;
697 VubrVirtq *rx_vq = &dev->vq[0];
698 uint8_t buf[4096];
699 struct virtio_net_hdr_v1 *hdr = (struct virtio_net_hdr_v1 *)buf;
700 int hdrlen = sizeof(struct virtio_net_hdr_v1);
701 int buflen = sizeof(buf);
702 int len;
704 if (!dev->ready) {
705 return;
708 DPRINT("\n\n *** IN UDP RECEIVE CALLBACK ***\n\n");
710 uint16_t avail_index = atomic_mb_read(&rx_vq->avail->idx);
712 /* If there is no available descriptors, just do nothing.
713 * The buffer will be handled by next arrived UDP packet,
714 * or next kick on receive virtq. */
715 if (rx_vq->last_avail_index == avail_index) {
716 DPRINT("Got UDP packet, but no available descriptors on RX virtq.\n");
717 return;
720 len = vubr_backend_udp_recvbuf(dev, buf + hdrlen, buflen - hdrlen);
722 *hdr = (struct virtio_net_hdr_v1) { };
723 hdr->num_buffers = 1;
724 vubr_post_buffer(dev, rx_vq, buf, len + hdrlen);
727 static void
728 vubr_kick_cb(int sock, void *ctx)
730 VubrDev *dev = (VubrDev *) ctx;
731 eventfd_t kick_data;
732 ssize_t rc;
734 rc = eventfd_read(sock, &kick_data);
735 if (rc == -1) {
736 vubr_die("eventfd_read()");
737 } else {
738 DPRINT("Got kick_data: %016"PRIx64"\n", kick_data);
739 vubr_process_avail(dev, &dev->vq[1]);
743 static int
744 vubr_none_exec(VubrDev *dev, VhostUserMsg *vmsg)
746 DPRINT("Function %s() not implemented yet.\n", __func__);
747 return 0;
750 static int
751 vubr_get_features_exec(VubrDev *dev, VhostUserMsg *vmsg)
753 vmsg->payload.u64 =
754 ((1ULL << VIRTIO_NET_F_MRG_RXBUF) |
755 (1ULL << VHOST_F_LOG_ALL) |
756 (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) |
757 (1ULL << VHOST_USER_F_PROTOCOL_FEATURES));
759 vmsg->size = sizeof(vmsg->payload.u64);
761 DPRINT("Sending back to guest u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
763 /* Reply */
764 return 1;
767 static int
768 vubr_set_features_exec(VubrDev *dev, VhostUserMsg *vmsg)
770 DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
771 dev->features = vmsg->payload.u64;
772 return 0;
775 static int
776 vubr_set_owner_exec(VubrDev *dev, VhostUserMsg *vmsg)
778 return 0;
781 static void
782 vubr_close_log(VubrDev *dev)
784 if (dev->log_table) {
785 if (munmap(dev->log_table, dev->log_size) != 0) {
786 vubr_die("munmap()");
789 dev->log_table = 0;
791 if (dev->log_call_fd != -1) {
792 close(dev->log_call_fd);
793 dev->log_call_fd = -1;
797 static int
798 vubr_reset_device_exec(VubrDev *dev, VhostUserMsg *vmsg)
800 vubr_close_log(dev);
801 dev->ready = 0;
802 dev->features = 0;
803 return 0;
806 static int
807 vubr_set_mem_table_exec(VubrDev *dev, VhostUserMsg *vmsg)
809 int i;
810 VhostUserMemory *memory = &vmsg->payload.memory;
811 dev->nregions = memory->nregions;
813 DPRINT("Nregions: %d\n", memory->nregions);
814 for (i = 0; i < dev->nregions; i++) {
815 void *mmap_addr;
816 VhostUserMemoryRegion *msg_region = &memory->regions[i];
817 VubrDevRegion *dev_region = &dev->regions[i];
819 DPRINT("Region %d\n", i);
820 DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n",
821 msg_region->guest_phys_addr);
822 DPRINT(" memory_size: 0x%016"PRIx64"\n",
823 msg_region->memory_size);
824 DPRINT(" userspace_addr 0x%016"PRIx64"\n",
825 msg_region->userspace_addr);
826 DPRINT(" mmap_offset 0x%016"PRIx64"\n",
827 msg_region->mmap_offset);
829 dev_region->gpa = msg_region->guest_phys_addr;
830 dev_region->size = msg_region->memory_size;
831 dev_region->qva = msg_region->userspace_addr;
832 dev_region->mmap_offset = msg_region->mmap_offset;
834 /* We don't use offset argument of mmap() since the
835 * mapped address has to be page aligned, and we use huge
836 * pages. */
837 mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
838 PROT_READ | PROT_WRITE, MAP_SHARED,
839 vmsg->fds[i], 0);
841 if (mmap_addr == MAP_FAILED) {
842 vubr_die("mmap");
844 dev_region->mmap_addr = (uint64_t) mmap_addr;
845 DPRINT(" mmap_addr: 0x%016"PRIx64"\n", dev_region->mmap_addr);
847 close(vmsg->fds[i]);
850 return 0;
853 static int
854 vubr_set_log_base_exec(VubrDev *dev, VhostUserMsg *vmsg)
856 int fd;
857 uint64_t log_mmap_size, log_mmap_offset;
858 void *rc;
860 assert(vmsg->fd_num == 1);
861 fd = vmsg->fds[0];
863 assert(vmsg->size == sizeof(vmsg->payload.log));
864 log_mmap_offset = vmsg->payload.log.mmap_offset;
865 log_mmap_size = vmsg->payload.log.mmap_size;
866 DPRINT("Log mmap_offset: %"PRId64"\n", log_mmap_offset);
867 DPRINT("Log mmap_size: %"PRId64"\n", log_mmap_size);
869 rc = mmap(0, log_mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd,
870 log_mmap_offset);
871 if (rc == MAP_FAILED) {
872 vubr_die("mmap");
874 dev->log_table = rc;
875 dev->log_size = log_mmap_size;
877 vmsg->size = sizeof(vmsg->payload.u64);
878 /* Reply */
879 return 1;
882 static int
883 vubr_set_log_fd_exec(VubrDev *dev, VhostUserMsg *vmsg)
885 assert(vmsg->fd_num == 1);
886 dev->log_call_fd = vmsg->fds[0];
887 DPRINT("Got log_call_fd: %d\n", vmsg->fds[0]);
888 return 0;
891 static int
892 vubr_set_vring_num_exec(VubrDev *dev, VhostUserMsg *vmsg)
894 unsigned int index = vmsg->payload.state.index;
895 unsigned int num = vmsg->payload.state.num;
897 DPRINT("State.index: %d\n", index);
898 DPRINT("State.num: %d\n", num);
899 dev->vq[index].size = num;
900 return 0;
903 static int
904 vubr_set_vring_addr_exec(VubrDev *dev, VhostUserMsg *vmsg)
906 struct vhost_vring_addr *vra = &vmsg->payload.addr;
907 unsigned int index = vra->index;
908 VubrVirtq *vq = &dev->vq[index];
910 DPRINT("vhost_vring_addr:\n");
911 DPRINT(" index: %d\n", vra->index);
912 DPRINT(" flags: %d\n", vra->flags);
913 DPRINT(" desc_user_addr: 0x%016llx\n", vra->desc_user_addr);
914 DPRINT(" used_user_addr: 0x%016llx\n", vra->used_user_addr);
915 DPRINT(" avail_user_addr: 0x%016llx\n", vra->avail_user_addr);
916 DPRINT(" log_guest_addr: 0x%016llx\n", vra->log_guest_addr);
918 vq->desc = (struct vring_desc *)qva_to_va(dev, vra->desc_user_addr);
919 vq->used = (struct vring_used *)qva_to_va(dev, vra->used_user_addr);
920 vq->avail = (struct vring_avail *)qva_to_va(dev, vra->avail_user_addr);
921 vq->log_guest_addr = vra->log_guest_addr;
923 DPRINT("Setting virtq addresses:\n");
924 DPRINT(" vring_desc at %p\n", vq->desc);
925 DPRINT(" vring_used at %p\n", vq->used);
926 DPRINT(" vring_avail at %p\n", vq->avail);
928 vq->last_used_index = vq->used->idx;
929 return 0;
932 static int
933 vubr_set_vring_base_exec(VubrDev *dev, VhostUserMsg *vmsg)
935 unsigned int index = vmsg->payload.state.index;
936 unsigned int num = vmsg->payload.state.num;
938 DPRINT("State.index: %d\n", index);
939 DPRINT("State.num: %d\n", num);
940 dev->vq[index].last_avail_index = num;
942 return 0;
945 static int
946 vubr_get_vring_base_exec(VubrDev *dev, VhostUserMsg *vmsg)
948 unsigned int index = vmsg->payload.state.index;
950 DPRINT("State.index: %d\n", index);
951 vmsg->payload.state.num = dev->vq[index].last_avail_index;
952 vmsg->size = sizeof(vmsg->payload.state);
953 /* FIXME: this is a work-around for a bug in QEMU enabling
954 * too early vrings. When protocol features are enabled,
955 * we have to respect * VHOST_USER_SET_VRING_ENABLE request. */
956 dev->ready = 0;
958 if (dev->vq[index].call_fd != -1) {
959 close(dev->vq[index].call_fd);
960 dispatcher_remove(&dev->dispatcher, dev->vq[index].call_fd);
961 dev->vq[index].call_fd = -1;
963 if (dev->vq[index].kick_fd != -1) {
964 close(dev->vq[index].kick_fd);
965 dispatcher_remove(&dev->dispatcher, dev->vq[index].kick_fd);
966 dev->vq[index].kick_fd = -1;
969 /* Reply */
970 return 1;
973 static int
974 vubr_set_vring_kick_exec(VubrDev *dev, VhostUserMsg *vmsg)
976 uint64_t u64_arg = vmsg->payload.u64;
977 int index = u64_arg & VHOST_USER_VRING_IDX_MASK;
979 DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
981 assert((u64_arg & VHOST_USER_VRING_NOFD_MASK) == 0);
982 assert(vmsg->fd_num == 1);
984 if (dev->vq[index].kick_fd != -1) {
985 close(dev->vq[index].kick_fd);
986 dispatcher_remove(&dev->dispatcher, dev->vq[index].kick_fd);
988 dev->vq[index].kick_fd = vmsg->fds[0];
989 DPRINT("Got kick_fd: %d for vq: %d\n", vmsg->fds[0], index);
991 if (index % 2 == 1) {
992 /* TX queue. */
993 dispatcher_add(&dev->dispatcher, dev->vq[index].kick_fd,
994 dev, vubr_kick_cb);
996 DPRINT("Waiting for kicks on fd: %d for vq: %d\n",
997 dev->vq[index].kick_fd, index);
999 /* We temporarily use this hack to determine that both TX and RX
1000 * queues are set up and ready for processing.
1001 * FIXME: we need to rely in VHOST_USER_SET_VRING_ENABLE and
1002 * actual kicks. */
1003 if (dev->vq[0].kick_fd != -1 &&
1004 dev->vq[1].kick_fd != -1) {
1005 dev->ready = 1;
1006 DPRINT("vhost-user-bridge is ready for processing queues.\n");
1008 return 0;
1012 static int
1013 vubr_set_vring_call_exec(VubrDev *dev, VhostUserMsg *vmsg)
1015 uint64_t u64_arg = vmsg->payload.u64;
1016 int index = u64_arg & VHOST_USER_VRING_IDX_MASK;
1018 DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
1019 assert((u64_arg & VHOST_USER_VRING_NOFD_MASK) == 0);
1020 assert(vmsg->fd_num == 1);
1022 if (dev->vq[index].call_fd != -1) {
1023 close(dev->vq[index].call_fd);
1024 dispatcher_remove(&dev->dispatcher, dev->vq[index].call_fd);
1026 dev->vq[index].call_fd = vmsg->fds[0];
1027 DPRINT("Got call_fd: %d for vq: %d\n", vmsg->fds[0], index);
1029 return 0;
1032 static int
1033 vubr_set_vring_err_exec(VubrDev *dev, VhostUserMsg *vmsg)
1035 DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
1036 return 0;
1039 static int
1040 vubr_get_protocol_features_exec(VubrDev *dev, VhostUserMsg *vmsg)
1042 vmsg->payload.u64 = 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD;
1043 DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
1044 vmsg->size = sizeof(vmsg->payload.u64);
1046 /* Reply */
1047 return 1;
1050 static int
1051 vubr_set_protocol_features_exec(VubrDev *dev, VhostUserMsg *vmsg)
1053 /* FIXME: unimplented */
1054 DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
1055 return 0;
1058 static int
1059 vubr_get_queue_num_exec(VubrDev *dev, VhostUserMsg *vmsg)
1061 DPRINT("Function %s() not implemented yet.\n", __func__);
1062 return 0;
1065 static int
1066 vubr_set_vring_enable_exec(VubrDev *dev, VhostUserMsg *vmsg)
1068 unsigned int index = vmsg->payload.state.index;
1069 unsigned int enable = vmsg->payload.state.num;
1071 DPRINT("State.index: %d\n", index);
1072 DPRINT("State.enable: %d\n", enable);
1073 dev->vq[index].enable = enable;
1074 return 0;
1077 static int
1078 vubr_send_rarp_exec(VubrDev *dev, VhostUserMsg *vmsg)
1080 DPRINT("Function %s() not implemented yet.\n", __func__);
1081 return 0;
1084 static int
1085 vubr_execute_request(VubrDev *dev, VhostUserMsg *vmsg)
1087 /* Print out generic part of the request. */
1088 DPRINT(
1089 "================== Vhost user message from QEMU ==================\n");
1090 DPRINT("Request: %s (%d)\n", vubr_request_str[vmsg->request],
1091 vmsg->request);
1092 DPRINT("Flags: 0x%x\n", vmsg->flags);
1093 DPRINT("Size: %d\n", vmsg->size);
1095 if (vmsg->fd_num) {
1096 int i;
1097 DPRINT("Fds:");
1098 for (i = 0; i < vmsg->fd_num; i++) {
1099 DPRINT(" %d", vmsg->fds[i]);
1101 DPRINT("\n");
1104 switch (vmsg->request) {
1105 case VHOST_USER_NONE:
1106 return vubr_none_exec(dev, vmsg);
1107 case VHOST_USER_GET_FEATURES:
1108 return vubr_get_features_exec(dev, vmsg);
1109 case VHOST_USER_SET_FEATURES:
1110 return vubr_set_features_exec(dev, vmsg);
1111 case VHOST_USER_SET_OWNER:
1112 return vubr_set_owner_exec(dev, vmsg);
1113 case VHOST_USER_RESET_OWNER:
1114 return vubr_reset_device_exec(dev, vmsg);
1115 case VHOST_USER_SET_MEM_TABLE:
1116 return vubr_set_mem_table_exec(dev, vmsg);
1117 case VHOST_USER_SET_LOG_BASE:
1118 return vubr_set_log_base_exec(dev, vmsg);
1119 case VHOST_USER_SET_LOG_FD:
1120 return vubr_set_log_fd_exec(dev, vmsg);
1121 case VHOST_USER_SET_VRING_NUM:
1122 return vubr_set_vring_num_exec(dev, vmsg);
1123 case VHOST_USER_SET_VRING_ADDR:
1124 return vubr_set_vring_addr_exec(dev, vmsg);
1125 case VHOST_USER_SET_VRING_BASE:
1126 return vubr_set_vring_base_exec(dev, vmsg);
1127 case VHOST_USER_GET_VRING_BASE:
1128 return vubr_get_vring_base_exec(dev, vmsg);
1129 case VHOST_USER_SET_VRING_KICK:
1130 return vubr_set_vring_kick_exec(dev, vmsg);
1131 case VHOST_USER_SET_VRING_CALL:
1132 return vubr_set_vring_call_exec(dev, vmsg);
1133 case VHOST_USER_SET_VRING_ERR:
1134 return vubr_set_vring_err_exec(dev, vmsg);
1135 case VHOST_USER_GET_PROTOCOL_FEATURES:
1136 return vubr_get_protocol_features_exec(dev, vmsg);
1137 case VHOST_USER_SET_PROTOCOL_FEATURES:
1138 return vubr_set_protocol_features_exec(dev, vmsg);
1139 case VHOST_USER_GET_QUEUE_NUM:
1140 return vubr_get_queue_num_exec(dev, vmsg);
1141 case VHOST_USER_SET_VRING_ENABLE:
1142 return vubr_set_vring_enable_exec(dev, vmsg);
1143 case VHOST_USER_SEND_RARP:
1144 return vubr_send_rarp_exec(dev, vmsg);
1146 case VHOST_USER_MAX:
1147 assert(vmsg->request != VHOST_USER_MAX);
1149 return 0;
1152 static void
1153 vubr_receive_cb(int sock, void *ctx)
1155 VubrDev *dev = (VubrDev *) ctx;
1156 VhostUserMsg vmsg;
1157 int reply_requested;
1159 vubr_message_read(sock, &vmsg);
1160 reply_requested = vubr_execute_request(dev, &vmsg);
1161 if (reply_requested) {
1162 /* Set the version in the flags when sending the reply */
1163 vmsg.flags &= ~VHOST_USER_VERSION_MASK;
1164 vmsg.flags |= VHOST_USER_VERSION;
1165 vmsg.flags |= VHOST_USER_REPLY_MASK;
1166 vubr_message_write(sock, &vmsg);
1170 static void
1171 vubr_accept_cb(int sock, void *ctx)
1173 VubrDev *dev = (VubrDev *)ctx;
1174 int conn_fd;
1175 struct sockaddr_un un;
1176 socklen_t len = sizeof(un);
1178 conn_fd = accept(sock, (struct sockaddr *) &un, &len);
1179 if (conn_fd == -1) {
1180 vubr_die("accept()");
1182 DPRINT("Got connection from remote peer on sock %d\n", conn_fd);
1183 dispatcher_add(&dev->dispatcher, conn_fd, ctx, vubr_receive_cb);
1186 static VubrDev *
1187 vubr_new(const char *path)
1189 VubrDev *dev = (VubrDev *) calloc(1, sizeof(VubrDev));
1190 dev->nregions = 0;
1191 int i;
1192 struct sockaddr_un un;
1193 size_t len;
1195 for (i = 0; i < MAX_NR_VIRTQUEUE; i++) {
1196 dev->vq[i] = (VubrVirtq) {
1197 .call_fd = -1, .kick_fd = -1,
1198 .size = 0,
1199 .last_avail_index = 0, .last_used_index = 0,
1200 .desc = 0, .avail = 0, .used = 0,
1201 .enable = 0,
1205 /* Init log */
1206 dev->log_call_fd = -1;
1207 dev->log_size = 0;
1208 dev->log_table = 0;
1209 dev->ready = 0;
1210 dev->features = 0;
1212 /* Get a UNIX socket. */
1213 dev->sock = socket(AF_UNIX, SOCK_STREAM, 0);
1214 if (dev->sock == -1) {
1215 vubr_die("socket");
1218 un.sun_family = AF_UNIX;
1219 strcpy(un.sun_path, path);
1220 len = sizeof(un.sun_family) + strlen(path);
1221 unlink(path);
1223 if (bind(dev->sock, (struct sockaddr *) &un, len) == -1) {
1224 vubr_die("bind");
1227 if (listen(dev->sock, 1) == -1) {
1228 vubr_die("listen");
1231 dispatcher_init(&dev->dispatcher);
1232 dispatcher_add(&dev->dispatcher, dev->sock, (void *)dev,
1233 vubr_accept_cb);
1235 DPRINT("Waiting for connections on UNIX socket %s ...\n", path);
1236 return dev;
/*
 * Fill saddr->sin_addr from @host.
 *
 * If @host starts with a decimal digit it is parsed with inet_aton();
 * otherwise it is resolved with gethostbyname() and the first returned
 * address is used.  Any failure prints a message to stderr and exits
 * with status 1.
 *
 * @saddr: address structure whose sin_addr member is overwritten.
 * @host:  numeric IPv4 address or host name (NUL-terminated).
 */
static void
vubr_set_host(struct sockaddr_in *saddr, const char *host)
{
    /* <ctype.h> classifiers are only defined for unsigned char values
     * (or EOF); plain char may be negative. */
    if (isdigit((unsigned char)host[0])) {
        if (!inet_aton(host, &saddr->sin_addr)) {
            fprintf(stderr, "inet_aton() failed.\n");
            exit(1);
        }
    } else {
        struct hostent *he = gethostbyname(host);

        if (!he) {
            fprintf(stderr, "gethostbyname() failed.\n");
            exit(1);
        }
        /* Only an IPv4 result of the expected size can be stored in
         * sin_addr; refuse anything else rather than copy garbage. */
        if (he->h_addrtype != AF_INET ||
            he->h_length != (int)sizeof(saddr->sin_addr) ||
            !he->h_addr_list[0]) {
            fprintf(stderr, "gethostbyname() returned no IPv4 address.\n");
            exit(1);
        }
        /* memcpy avoids assuming the char buffer is suitably aligned. */
        memcpy(&saddr->sin_addr, he->h_addr_list[0],
               sizeof(saddr->sin_addr));
    }
}
1258 static void
1259 vubr_backend_udp_setup(VubrDev *dev,
1260 const char *local_host,
1261 const char *local_port,
1262 const char *remote_host,
1263 const char *remote_port)
1265 int sock;
1266 const char *r;
1268 int lport, rport;
1270 lport = strtol(local_port, (char **)&r, 0);
1271 if (r == local_port) {
1272 fprintf(stderr, "lport parsing failed.\n");
1273 exit(1);
1276 rport = strtol(remote_port, (char **)&r, 0);
1277 if (r == remote_port) {
1278 fprintf(stderr, "rport parsing failed.\n");
1279 exit(1);
1282 struct sockaddr_in si_local = {
1283 .sin_family = AF_INET,
1284 .sin_port = htons(lport),
1287 vubr_set_host(&si_local, local_host);
1289 /* setup destination for sends */
1290 dev->backend_udp_dest = (struct sockaddr_in) {
1291 .sin_family = AF_INET,
1292 .sin_port = htons(rport),
1294 vubr_set_host(&dev->backend_udp_dest, remote_host);
1296 sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
1297 if (sock == -1) {
1298 vubr_die("socket");
1301 if (bind(sock, (struct sockaddr *)&si_local, sizeof(si_local)) == -1) {
1302 vubr_die("bind");
1305 dev->backend_udp_sock = sock;
1306 dispatcher_add(&dev->dispatcher, sock, dev, vubr_backend_recv_cb);
1307 DPRINT("Waiting for data from udp backend on %s:%d...\n",
1308 local_host, lport);
1311 static void
1312 vubr_run(VubrDev *dev)
1314 while (1) {
1315 /* timeout 200ms */
1316 dispatcher_wait(&dev->dispatcher, 200000);
1317 /* Here one can try polling strategy. */
/*
 * Split a "host:port" string at its first ':'.
 *
 * On success *host and *port are set to newly allocated NUL-terminated
 * copies of the two parts (ownership passes to the caller) and 0 is
 * returned.  @buf itself is never modified.
 *
 * Returns -1, leaving *host and *port untouched, if @buf contains no ':'
 * or if memory cannot be allocated.
 *
 * @host: receives the text before the first ':'.
 * @port: receives the text after the first ':'.
 * @buf:  input string of the form "host:port".
 */
static int
vubr_parse_host_port(const char **host, const char **port, const char *buf)
{
    const char *colon = strchr(buf, ':');
    size_t host_len, port_len;
    char *host_copy, *port_copy;

    if (!colon) {
        return -1;
    }

    host_len = (size_t)(colon - buf);
    port_len = strlen(colon + 1);

    host_copy = malloc(host_len + 1);
    port_copy = malloc(port_len + 1);
    if (!host_copy || !port_copy) {
        free(host_copy);
        free(port_copy);
        return -1;
    }

    /* Copy each half out instead of writing a NUL into the const input. */
    memcpy(host_copy, buf, host_len);
    host_copy[host_len] = '\0';
    memcpy(port_copy, colon + 1, port_len + 1);

    *host = host_copy;
    *port = port_copy;
    return 0;
}
/* Path of the UNIX domain socket; overridden by the -u option in main(). */
#define DEFAULT_UD_SOCKET "/tmp/vubr.sock"
static const char *ud_socket_path = DEFAULT_UD_SOCKET;

/* Local host and port; overridden by the -l host:port option in main(). */
#define DEFAULT_LHOST "127.0.0.1"
#define DEFAULT_LPORT "4444"
static const char *lhost = DEFAULT_LHOST;
static const char *lport = DEFAULT_LPORT;

/* Remote host and port; overridden by the -r host:port option in main(). */
#define DEFAULT_RHOST "127.0.0.1"
#define DEFAULT_RPORT "5555"
static const char *rhost = DEFAULT_RHOST;
static const char *rport = DEFAULT_RPORT;
1348 main(int argc, char *argv[])
1350 VubrDev *dev;
1351 int opt;
1353 while ((opt = getopt(argc, argv, "l:r:u:")) != -1) {
1355 switch (opt) {
1356 case 'l':
1357 if (vubr_parse_host_port(&lhost, &lport, optarg) < 0) {
1358 goto out;
1360 break;
1361 case 'r':
1362 if (vubr_parse_host_port(&rhost, &rport, optarg) < 0) {
1363 goto out;
1365 break;
1366 case 'u':
1367 ud_socket_path = strdup(optarg);
1368 break;
1369 default:
1370 goto out;
1374 DPRINT("ud socket: %s\n", ud_socket_path);
1375 DPRINT("local: %s:%s\n", lhost, lport);
1376 DPRINT("remote: %s:%s\n", rhost, rport);
1378 dev = vubr_new(ud_socket_path);
1379 if (!dev) {
1380 return 1;
1383 vubr_backend_udp_setup(dev, lhost, lport, rhost, rport);
1384 vubr_run(dev);
1385 return 0;
1387 out:
1388 fprintf(stderr, "Usage: %s ", argv[0]);
1389 fprintf(stderr, "[-u ud_socket_path] [-l lhost:lport] [-r rhost:rport]\n");
1390 fprintf(stderr, "\t-u path to unix doman socket. default: %s\n",
1391 DEFAULT_UD_SOCKET);
1392 fprintf(stderr, "\t-l local host and port. default: %s:%s\n",
1393 DEFAULT_LHOST, DEFAULT_LPORT);
1394 fprintf(stderr, "\t-r remote host and port. default: %s:%s\n",
1395 DEFAULT_RHOST, DEFAULT_RPORT);
1397 return 1;