fsdev-proxy-helper: avoid TOC/TOU race
[qemu.git] / tests / vhost-user-bridge.c
blob85c4c8a8350be41a6a9820bca0d96715a514f8f6
1 /*
2 * Vhost User Bridge
4 * Copyright (c) 2015 Red Hat, Inc.
6 * Authors:
7 * Victor Kaplansky <victork@redhat.com>
9 * This work is licensed under the terms of the GNU GPL, version 2 or
10 * later. See the COPYING file in the top-level directory.
14 * TODO:
15 * - main should get parameters from the command line.
16 * - implement all request handlers. Still not implemented:
17 * vubr_get_queue_num_exec()
18 * vubr_send_rarp_exec()
19 * - test for broken requests and virtqueue.
20 * - implement features defined by Virtio 1.0 spec.
21 * - support mergeable buffers and indirect descriptors.
22 * - implement clean shutdown.
23 * - implement non-blocking writes to UDP backend.
24 * - implement polling strategy.
25 * - implement clean starting/stopping of vq processing
26 * - implement clean starting/stopping of dirty page logging
27 * for the used ring and the buffers.
30 #define _FILE_OFFSET_BITS 64
32 #include <stddef.h>
33 #include <assert.h>
34 #include <stdio.h>
35 #include <stdlib.h>
36 #include <stdint.h>
37 #include <inttypes.h>
38 #include <string.h>
39 #include <unistd.h>
40 #include <errno.h>
41 #include <sys/types.h>
42 #include <sys/socket.h>
43 #include <sys/un.h>
44 #include <sys/unistd.h>
45 #include <sys/mman.h>
46 #include <sys/eventfd.h>
47 #include <arpa/inet.h>
48 #include <ctype.h>
49 #include <netdb.h>
51 #include <linux/vhost.h>
53 #include "qemu/atomic.h"
54 #include "standard-headers/linux/virtio_net.h"
55 #include "standard-headers/linux/virtio_ring.h"
/* Compile-time switch: a non-zero value enables the DPRINT() trace output. */
57 #define VHOST_USER_BRIDGE_DEBUG 1
/* printf()-style trace helper. It forwards its arguments to printf() only
 * when VHOST_USER_BRIDGE_DEBUG is non-zero. The do/while (0) wrapper makes
 * the expansion behave as a single statement. */
59 #define DPRINT(...) \
60 do { \
61 if (VHOST_USER_BRIDGE_DEBUG) { \
62 printf(__VA_ARGS__); \
63 } \
64 } while (0)
/* Handler invoked by dispatcher_wait() for a readable descriptor; it receives
 * the descriptor and the context pointer stored at registration time. */
66 typedef void (*CallbackFunc)(int sock, void *ctx);
/* One registered handler: the callback and the opaque context passed to it. */
68 typedef struct Event {
69 void *ctx;
70 CallbackFunc callback;
71 } Event;
/* select()-based event dispatcher.
 * max_sock - highest descriptor registered so far (-1 after init).
 * fdset    - descriptors watched for readability.
 * events   - handler table indexed directly by descriptor number. */
73 typedef struct Dispatcher {
74 int max_sock;
75 fd_set fdset;
76 Event events[FD_SETSIZE];
77 } Dispatcher;
/* Print `s` followed by the textual form of the current errno on stderr,
 * then terminate the process with exit status 1. Never returns. */
static void
vubr_die(const char *s)
{
    perror(s);
    exit(1);
}
86 static int
87 dispatcher_init(Dispatcher *dispr)
89 FD_ZERO(&dispr->fdset);
90 dispr->max_sock = -1;
91 return 0;
94 static int
95 dispatcher_add(Dispatcher *dispr, int sock, void *ctx, CallbackFunc cb)
97 if (sock >= FD_SETSIZE) {
98 fprintf(stderr,
99 "Error: Failed to add new event. sock %d should be less than %d\n",
100 sock, FD_SETSIZE);
101 return -1;
104 dispr->events[sock].ctx = ctx;
105 dispr->events[sock].callback = cb;
107 FD_SET(sock, &dispr->fdset);
108 if (sock > dispr->max_sock) {
109 dispr->max_sock = sock;
111 DPRINT("Added sock %d for watching. max_sock: %d\n",
112 sock, dispr->max_sock);
113 return 0;
#if 0
/* dispatcher_remove() is not currently in use but may be useful
 * in the future.
 *
 * Stops watching `sock`. Returns 0 on success, -1 when `sock` does not
 * fit into an fd_set. The handler slot and max_sock are left untouched. */
static int
dispatcher_remove(Dispatcher *dispr, int sock)
{
    if (sock < FD_SETSIZE) {
        FD_CLR(sock, &dispr->fdset);
        return 0;
    }

    fprintf(stderr,
            "Error: Failed to remove event. sock %d should be less than %d\n",
            sock, FD_SETSIZE);
    return -1;
}
#endif
134 /* timeout in us */
135 static int
136 dispatcher_wait(Dispatcher *dispr, uint32_t timeout)
138 struct timeval tv;
139 tv.tv_sec = timeout / 1000000;
140 tv.tv_usec = timeout % 1000000;
142 fd_set fdset = dispr->fdset;
144 /* wait until some of sockets become readable. */
145 int rc = select(dispr->max_sock + 1, &fdset, 0, 0, &tv);
147 if (rc == -1) {
148 vubr_die("select");
151 /* Timeout */
152 if (rc == 0) {
153 return 0;
156 /* Now call callback for every ready socket. */
158 int sock;
159 for (sock = 0; sock < dispr->max_sock + 1; sock++)
160 if (FD_ISSET(sock, &fdset)) {
161 Event *e = &dispr->events[sock];
162 e->callback(sock, e->ctx);
165 return 0;
/* Per-virtqueue state kept by the bridge. */
168 typedef struct VubrVirtq {
/* eventfd written to notify the guest (see vubr_virtqueue_kick()). */
169 int call_fd;
/* eventfd on which guest kicks arrive; -1 until SET_VRING_KICK. */
170 int kick_fd;
/* Ring size, as set by SET_VRING_NUM. */
171 uint32_t size;
/* Free-running indices of the next avail/used entries to handle. */
172 uint16_t last_avail_index;
173 uint16_t last_used_index;
/* Ring parts, already translated to addresses in this process. */
174 struct vring_desc *desc;
175 struct vring_avail *avail;
176 struct vring_used *used;
/* Guest address of the used ring, used only for dirty logging. */
177 uint64_t log_guest_addr;
/* Last value received via SET_VRING_ENABLE. */
178 int enable;
179 } VubrVirtq;
181 /* Based on qemu/hw/virtio/vhost-user.c */
183 #define VHOST_MEMORY_MAX_NREGIONS 8
184 #define VHOST_USER_F_PROTOCOL_FEATURES 30
186 #define VHOST_LOG_PAGE 4096
/* Bit positions of the vhost-user protocol features. Of these, only
 * LOG_SHMFD is advertised by vubr_get_protocol_features_exec().
 * VHOST_USER_PROTOCOL_F_MAX is a sentinel one past the last feature bit. */
188 enum VhostUserProtocolFeature {
189 VHOST_USER_PROTOCOL_F_MQ = 0,
190 VHOST_USER_PROTOCOL_F_LOG_SHMFD = 1,
191 VHOST_USER_PROTOCOL_F_RARP = 2,
193 VHOST_USER_PROTOCOL_F_MAX
/* Mask with every bit below VHOST_USER_PROTOCOL_F_MAX set. */
196 #define VHOST_USER_PROTOCOL_FEATURE_MASK ((1 << VHOST_USER_PROTOCOL_F_MAX) - 1)
/* Request identifiers carried in VhostUserMsg.request. The numeric values
 * are written out explicitly; they also index vubr_request_str[] and select
 * the handler in vubr_execute_request(). VHOST_USER_MAX is a sentinel, not
 * a real request. */
198 typedef enum VhostUserRequest {
199 VHOST_USER_NONE = 0,
200 VHOST_USER_GET_FEATURES = 1,
201 VHOST_USER_SET_FEATURES = 2,
202 VHOST_USER_SET_OWNER = 3,
203 VHOST_USER_RESET_OWNER = 4,
204 VHOST_USER_SET_MEM_TABLE = 5,
205 VHOST_USER_SET_LOG_BASE = 6,
206 VHOST_USER_SET_LOG_FD = 7,
207 VHOST_USER_SET_VRING_NUM = 8,
208 VHOST_USER_SET_VRING_ADDR = 9,
209 VHOST_USER_SET_VRING_BASE = 10,
210 VHOST_USER_GET_VRING_BASE = 11,
211 VHOST_USER_SET_VRING_KICK = 12,
212 VHOST_USER_SET_VRING_CALL = 13,
213 VHOST_USER_SET_VRING_ERR = 14,
214 VHOST_USER_GET_PROTOCOL_FEATURES = 15,
215 VHOST_USER_SET_PROTOCOL_FEATURES = 16,
216 VHOST_USER_GET_QUEUE_NUM = 17,
217 VHOST_USER_SET_VRING_ENABLE = 18,
218 VHOST_USER_SEND_RARP = 19,
219 VHOST_USER_MAX
220 } VhostUserRequest;
/* One memory region as carried in a SET_MEM_TABLE payload.
 * vubr_set_mem_table_exec() copies the four fields into a VubrDevRegion and
 * maps memory_size + mmap_offset bytes of the fd passed for the region. */
222 typedef struct VhostUserMemoryRegion {
223 uint64_t guest_phys_addr;
224 uint64_t memory_size;
225 uint64_t userspace_addr;
226 uint64_t mmap_offset;
227 } VhostUserMemoryRegion;
/* SET_MEM_TABLE payload: `nregions` entries of `regions` are meaningful.
 * `padding` is not read anywhere in this file. */
229 typedef struct VhostUserMemory {
230 uint32_t nregions;
231 uint32_t padding;
232 VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS];
233 } VhostUserMemory;
/* SET_LOG_BASE payload: size and file offset handed to mmap() when mapping
 * the dirty-log fd (see vubr_set_log_base_exec()). */
235 typedef struct VhostUserLog {
236 uint64_t mmap_size;
237 uint64_t mmap_offset;
238 } VhostUserLog;
/* A vhost-user message as handled by this program.
 * request/flags/size form the header (VHOST_USER_HDR_SIZE bytes, i.e. all
 * bytes before `payload`); `size` bytes of `payload` follow on the socket.
 * `fds`/`fd_num` are not transmitted in the byte stream: vubr_message_read()
 * fills them from SCM_RIGHTS ancillary data. */
240 typedef struct VhostUserMsg {
241 VhostUserRequest request;
/* Bit layout of `flags`: low two bits = version, bit 2 = reply marker. */
243 #define VHOST_USER_VERSION_MASK (0x3)
244 #define VHOST_USER_REPLY_MASK (0x1<<2)
245 uint32_t flags;
246 uint32_t size; /* the following payload size */
247 union {
/* For u64 payloads that name a vring: low byte = vring index, bit 8 is
 * asserted to be clear by the kick/call handlers. */
248 #define VHOST_USER_VRING_IDX_MASK (0xff)
249 #define VHOST_USER_VRING_NOFD_MASK (0x1<<8)
250 uint64_t u64;
251 struct vhost_vring_state state;
252 struct vhost_vring_addr addr;
253 VhostUserMemory memory;
254 VhostUserLog log;
255 } payload;
256 int fds[VHOST_MEMORY_MAX_NREGIONS];
257 int fd_num;
258 } QEMU_PACKED VhostUserMsg;
260 #define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
262 /* The version of the protocol we support */
263 #define VHOST_USER_VERSION (0x1)
265 #define MAX_NR_VIRTQUEUE (8)
/* Bridge-side description of one guest memory region. A guest or QEMU
 * address inside the region translates to
 * (addr - base) + mmap_addr + mmap_offset; see gpa_to_va()/qva_to_va(). */
267 typedef struct VubrDevRegion {
268 /* Guest Physical address. */
269 uint64_t gpa;
270 /* Memory region size. */
271 uint64_t size;
272 /* QEMU virtual address (userspace). */
273 uint64_t qva;
274 /* Starting offset in our mmaped space. */
275 uint64_t mmap_offset;
276 /* Start address of mmaped space. */
277 uint64_t mmap_addr;
278 } VubrDevRegion;
/* Whole-bridge state, allocated once by vubr_new(). */
280 typedef struct VubrDev {
/* Listening UNIX socket for the vhost-user connection. */
281 int sock;
282 Dispatcher dispatcher;
/* Number of valid entries in `regions`. */
283 uint32_t nregions;
284 VubrDevRegion regions[VHOST_MEMORY_MAX_NREGIONS];
/* vq[0] is used as the RX queue, vq[1] as the TX queue. */
285 VubrVirtq vq[MAX_NR_VIRTQUEUE];
/* Dirty-log state: eventfd to signal (-1 if none), mapping size and base. */
286 int log_call_fd;
287 uint64_t log_size;
288 uint8_t *log_table;
/* UDP backend socket and the address packets are sent to. */
289 int backend_udp_sock;
290 struct sockaddr_in backend_udp_dest;
/* Non-zero once both vq[0] and vq[1] have a kick fd. */
291 int ready;
/* Feature bits last received via SET_FEATURES. */
292 uint64_t features;
293 } VubrDev;
/* Printable name of each VhostUserRequest value, indexed by the value
 * itself. Used for the trace output in vubr_execute_request(). The table
 * has VHOST_USER_MAX + 1 entries. */
295 static const char *vubr_request_str[] = {
296 [VHOST_USER_NONE] = "VHOST_USER_NONE",
297 [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES",
298 [VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES",
299 [VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER",
300 [VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER",
301 [VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE",
302 [VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE",
303 [VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD",
304 [VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM",
305 [VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR",
306 [VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE",
307 [VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE",
308 [VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK",
309 [VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL",
310 [VHOST_USER_SET_VRING_ERR] = "VHOST_USER_SET_VRING_ERR",
311 [VHOST_USER_GET_PROTOCOL_FEATURES] = "VHOST_USER_GET_PROTOCOL_FEATURES",
312 [VHOST_USER_SET_PROTOCOL_FEATURES] = "VHOST_USER_SET_PROTOCOL_FEATURES",
313 [VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM",
314 [VHOST_USER_SET_VRING_ENABLE] = "VHOST_USER_SET_VRING_ENABLE",
315 [VHOST_USER_SEND_RARP] = "VHOST_USER_SEND_RARP",
316 [VHOST_USER_MAX] = "VHOST_USER_MAX",
/* Hex-dump `len` bytes of `buf` to stdout: 16 bytes per row, with an extra
 * space before every group of four bytes, followed by a dotted separator.
 *
 * The index is a size_t so that it is compared against `len` without a
 * signed/unsigned conversion and cannot overflow for large buffers. */
static void
print_buffer(uint8_t *buf, size_t len)
{
    size_t i;

    printf("Raw buffer:\n");
    for (i = 0; i < len; i++) {
        if (i % 16 == 0) {
            printf("\n");
        }
        if (i % 4 == 0) {
            printf("   ");
        }
        printf("%02x ", buf[i]);
    }
    printf("\n............................................................\n");
}
336 /* Translate guest physical address to our virtual address. */
337 static uint64_t
338 gpa_to_va(VubrDev *dev, uint64_t guest_addr)
340 int i;
342 /* Find matching memory region. */
343 for (i = 0; i < dev->nregions; i++) {
344 VubrDevRegion *r = &dev->regions[i];
346 if ((guest_addr >= r->gpa) && (guest_addr < (r->gpa + r->size))) {
347 return guest_addr - r->gpa + r->mmap_addr + r->mmap_offset;
351 assert(!"address not found in regions");
352 return 0;
355 /* Translate qemu virtual address to our virtual address. */
356 static uint64_t
357 qva_to_va(VubrDev *dev, uint64_t qemu_addr)
359 int i;
361 /* Find matching memory region. */
362 for (i = 0; i < dev->nregions; i++) {
363 VubrDevRegion *r = &dev->regions[i];
365 if ((qemu_addr >= r->qva) && (qemu_addr < (r->qva + r->size))) {
366 return qemu_addr - r->qva + r->mmap_addr + r->mmap_offset;
370 assert(!"address not found in regions");
371 return 0;
374 static void
375 vubr_message_read(int conn_fd, VhostUserMsg *vmsg)
377 char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))] = { };
378 struct iovec iov = {
379 .iov_base = (char *)vmsg,
380 .iov_len = VHOST_USER_HDR_SIZE,
382 struct msghdr msg = {
383 .msg_iov = &iov,
384 .msg_iovlen = 1,
385 .msg_control = control,
386 .msg_controllen = sizeof(control),
388 size_t fd_size;
389 struct cmsghdr *cmsg;
390 int rc;
392 rc = recvmsg(conn_fd, &msg, 0);
394 if (rc == 0) {
395 vubr_die("recvmsg");
396 fprintf(stderr, "Peer disconnected.\n");
397 exit(1);
399 if (rc < 0) {
400 vubr_die("recvmsg");
403 vmsg->fd_num = 0;
404 for (cmsg = CMSG_FIRSTHDR(&msg);
405 cmsg != NULL;
406 cmsg = CMSG_NXTHDR(&msg, cmsg))
408 if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
409 fd_size = cmsg->cmsg_len - CMSG_LEN(0);
410 vmsg->fd_num = fd_size / sizeof(int);
411 memcpy(vmsg->fds, CMSG_DATA(cmsg), fd_size);
412 break;
416 if (vmsg->size > sizeof(vmsg->payload)) {
417 fprintf(stderr,
418 "Error: too big message request: %d, size: vmsg->size: %u, "
419 "while sizeof(vmsg->payload) = %lu\n",
420 vmsg->request, vmsg->size, sizeof(vmsg->payload));
421 exit(1);
424 if (vmsg->size) {
425 rc = read(conn_fd, &vmsg->payload, vmsg->size);
426 if (rc == 0) {
427 vubr_die("recvmsg");
428 fprintf(stderr, "Peer disconnected.\n");
429 exit(1);
431 if (rc < 0) {
432 vubr_die("recvmsg");
435 assert(rc == vmsg->size);
439 static void
440 vubr_message_write(int conn_fd, VhostUserMsg *vmsg)
442 int rc;
444 do {
445 rc = write(conn_fd, vmsg, VHOST_USER_HDR_SIZE + vmsg->size);
446 } while (rc < 0 && errno == EINTR);
448 if (rc < 0) {
449 vubr_die("write");
453 static void
454 vubr_backend_udp_sendbuf(VubrDev *dev, uint8_t *buf, size_t len)
456 int slen = sizeof(struct sockaddr_in);
458 if (sendto(dev->backend_udp_sock, buf, len, 0,
459 (struct sockaddr *) &dev->backend_udp_dest, slen) == -1) {
460 vubr_die("sendto()");
464 static int
465 vubr_backend_udp_recvbuf(VubrDev *dev, uint8_t *buf, size_t buflen)
467 int slen = sizeof(struct sockaddr_in);
468 int rc;
470 rc = recvfrom(dev->backend_udp_sock, buf, buflen, 0,
471 (struct sockaddr *) &dev->backend_udp_dest,
472 (socklen_t *)&slen);
473 if (rc == -1) {
474 vubr_die("recvfrom()");
477 return rc;
480 static void
481 vubr_consume_raw_packet(VubrDev *dev, uint8_t *buf, uint32_t len)
483 int hdrlen = sizeof(struct virtio_net_hdr_v1);
485 if (VHOST_USER_BRIDGE_DEBUG) {
486 print_buffer(buf, len);
488 vubr_backend_udp_sendbuf(dev, buf + hdrlen, len - hdrlen);
491 /* Kick the log_call_fd if required. */
492 static void
493 vubr_log_kick(VubrDev *dev)
495 if (dev->log_call_fd != -1) {
496 DPRINT("Kicking the QEMU's log...\n");
497 eventfd_write(dev->log_call_fd, 1);
501 /* Kick the guest if necessary. */
502 static void
503 vubr_virtqueue_kick(VubrVirtq *vq)
505 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) {
506 DPRINT("Kicking the guest...\n");
507 eventfd_write(vq->call_fd, 1);
/* Mark guest page number `page` dirty: atomically set bit (page % 8) of
 * byte (page / 8) in the log bitmap. */
static void
vubr_log_page(uint8_t *log_table, uint64_t page)
{
    uint8_t *slot = &log_table[page / 8];

    DPRINT("Logged dirty guest page: %"PRId64"\n", page);
    atomic_or(slot, 1 << (page % 8));
}
518 static void
519 vubr_log_write(VubrDev *dev, uint64_t address, uint64_t length)
521 uint64_t page;
523 if (!(dev->features & (1ULL << VHOST_F_LOG_ALL)) ||
524 !dev->log_table || !length) {
525 return;
528 assert(dev->log_size > ((address + length - 1) / VHOST_LOG_PAGE / 8));
530 page = address / VHOST_LOG_PAGE;
531 while (page * VHOST_LOG_PAGE < address + length) {
532 vubr_log_page(dev->log_table, page);
533 page += VHOST_LOG_PAGE;
535 vubr_log_kick(dev);
538 static void
539 vubr_post_buffer(VubrDev *dev, VubrVirtq *vq, uint8_t *buf, int32_t len)
541 struct vring_desc *desc = vq->desc;
542 struct vring_avail *avail = vq->avail;
543 struct vring_used *used = vq->used;
544 uint64_t log_guest_addr = vq->log_guest_addr;
546 unsigned int size = vq->size;
548 uint16_t avail_index = atomic_mb_read(&avail->idx);
550 /* We check the available descriptors before posting the
551 * buffer, so here we assume that enough available
552 * descriptors. */
553 assert(vq->last_avail_index != avail_index);
554 uint16_t a_index = vq->last_avail_index % size;
555 uint16_t u_index = vq->last_used_index % size;
556 uint16_t d_index = avail->ring[a_index];
558 int i = d_index;
560 DPRINT("Post packet to guest on vq:\n");
561 DPRINT(" size = %d\n", vq->size);
562 DPRINT(" last_avail_index = %d\n", vq->last_avail_index);
563 DPRINT(" last_used_index = %d\n", vq->last_used_index);
564 DPRINT(" a_index = %d\n", a_index);
565 DPRINT(" u_index = %d\n", u_index);
566 DPRINT(" d_index = %d\n", d_index);
567 DPRINT(" desc[%d].addr = 0x%016"PRIx64"\n", i, desc[i].addr);
568 DPRINT(" desc[%d].len = %d\n", i, desc[i].len);
569 DPRINT(" desc[%d].flags = %d\n", i, desc[i].flags);
570 DPRINT(" avail->idx = %d\n", avail_index);
571 DPRINT(" used->idx = %d\n", used->idx);
573 if (!(desc[i].flags & VRING_DESC_F_WRITE)) {
574 /* FIXME: we should find writable descriptor. */
575 fprintf(stderr, "Error: descriptor is not writable. Exiting.\n");
576 exit(1);
579 void *chunk_start = (void *)gpa_to_va(dev, desc[i].addr);
580 uint32_t chunk_len = desc[i].len;
582 if (len <= chunk_len) {
583 memcpy(chunk_start, buf, len);
584 vubr_log_write(dev, desc[i].addr, len);
585 } else {
586 fprintf(stderr,
587 "Received too long packet from the backend. Dropping...\n");
588 return;
591 /* Add descriptor to the used ring. */
592 used->ring[u_index].id = d_index;
593 used->ring[u_index].len = len;
594 vubr_log_write(dev,
595 log_guest_addr + offsetof(struct vring_used, ring[u_index]),
596 sizeof(used->ring[u_index]));
598 vq->last_avail_index++;
599 vq->last_used_index++;
601 atomic_mb_set(&used->idx, vq->last_used_index);
602 vubr_log_write(dev,
603 log_guest_addr + offsetof(struct vring_used, idx),
604 sizeof(used->idx));
606 /* Kick the guest if necessary. */
607 vubr_virtqueue_kick(vq);
610 static int
611 vubr_process_desc(VubrDev *dev, VubrVirtq *vq)
613 struct vring_desc *desc = vq->desc;
614 struct vring_avail *avail = vq->avail;
615 struct vring_used *used = vq->used;
616 uint64_t log_guest_addr = vq->log_guest_addr;
618 unsigned int size = vq->size;
620 uint16_t a_index = vq->last_avail_index % size;
621 uint16_t u_index = vq->last_used_index % size;
622 uint16_t d_index = avail->ring[a_index];
624 uint32_t i, len = 0;
625 size_t buf_size = 4096;
626 uint8_t buf[4096];
628 DPRINT("Chunks: ");
629 i = d_index;
630 do {
631 void *chunk_start = (void *)gpa_to_va(dev, desc[i].addr);
632 uint32_t chunk_len = desc[i].len;
634 assert(!(desc[i].flags & VRING_DESC_F_WRITE));
636 if (len + chunk_len < buf_size) {
637 memcpy(buf + len, chunk_start, chunk_len);
638 DPRINT("%d ", chunk_len);
639 } else {
640 fprintf(stderr, "Error: too long packet. Dropping...\n");
641 break;
644 len += chunk_len;
646 if (!(desc[i].flags & VRING_DESC_F_NEXT)) {
647 break;
650 i = desc[i].next;
651 } while (1);
652 DPRINT("\n");
654 if (!len) {
655 return -1;
658 /* Add descriptor to the used ring. */
659 used->ring[u_index].id = d_index;
660 used->ring[u_index].len = len;
661 vubr_log_write(dev,
662 log_guest_addr + offsetof(struct vring_used, ring[u_index]),
663 sizeof(used->ring[u_index]));
665 vubr_consume_raw_packet(dev, buf, len);
667 return 0;
670 static void
671 vubr_process_avail(VubrDev *dev, VubrVirtq *vq)
673 struct vring_avail *avail = vq->avail;
674 struct vring_used *used = vq->used;
675 uint64_t log_guest_addr = vq->log_guest_addr;
677 while (vq->last_avail_index != atomic_mb_read(&avail->idx)) {
678 vubr_process_desc(dev, vq);
679 vq->last_avail_index++;
680 vq->last_used_index++;
683 atomic_mb_set(&used->idx, vq->last_used_index);
684 vubr_log_write(dev,
685 log_guest_addr + offsetof(struct vring_used, idx),
686 sizeof(used->idx));
689 static void
690 vubr_backend_recv_cb(int sock, void *ctx)
692 VubrDev *dev = (VubrDev *) ctx;
693 VubrVirtq *rx_vq = &dev->vq[0];
694 uint8_t buf[4096];
695 struct virtio_net_hdr_v1 *hdr = (struct virtio_net_hdr_v1 *)buf;
696 int hdrlen = sizeof(struct virtio_net_hdr_v1);
697 int buflen = sizeof(buf);
698 int len;
700 if (!dev->ready) {
701 return;
704 DPRINT("\n\n *** IN UDP RECEIVE CALLBACK ***\n\n");
706 uint16_t avail_index = atomic_mb_read(&rx_vq->avail->idx);
708 /* If there is no available descriptors, just do nothing.
709 * The buffer will be handled by next arrived UDP packet,
710 * or next kick on receive virtq. */
711 if (rx_vq->last_avail_index == avail_index) {
712 DPRINT("Got UDP packet, but no available descriptors on RX virtq.\n");
713 return;
716 len = vubr_backend_udp_recvbuf(dev, buf + hdrlen, buflen - hdrlen);
718 *hdr = (struct virtio_net_hdr_v1) { };
719 hdr->num_buffers = 1;
720 vubr_post_buffer(dev, rx_vq, buf, len + hdrlen);
723 static void
724 vubr_kick_cb(int sock, void *ctx)
726 VubrDev *dev = (VubrDev *) ctx;
727 eventfd_t kick_data;
728 ssize_t rc;
730 rc = eventfd_read(sock, &kick_data);
731 if (rc == -1) {
732 vubr_die("eventfd_read()");
733 } else {
734 DPRINT("Got kick_data: %016"PRIx64"\n", kick_data);
735 vubr_process_avail(dev, &dev->vq[1]);
739 static int
740 vubr_none_exec(VubrDev *dev, VhostUserMsg *vmsg)
742 DPRINT("Function %s() not implemented yet.\n", __func__);
743 return 0;
746 static int
747 vubr_get_features_exec(VubrDev *dev, VhostUserMsg *vmsg)
749 vmsg->payload.u64 =
750 ((1ULL << VIRTIO_NET_F_MRG_RXBUF) |
751 (1ULL << VHOST_F_LOG_ALL) |
752 (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) |
753 (1ULL << VHOST_USER_F_PROTOCOL_FEATURES));
755 vmsg->size = sizeof(vmsg->payload.u64);
757 DPRINT("Sending back to guest u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
759 /* Reply */
760 return 1;
763 static int
764 vubr_set_features_exec(VubrDev *dev, VhostUserMsg *vmsg)
766 DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
767 dev->features = vmsg->payload.u64;
768 return 0;
771 static int
772 vubr_set_owner_exec(VubrDev *dev, VhostUserMsg *vmsg)
774 return 0;
777 static void
778 vubr_close_log(VubrDev *dev)
780 if (dev->log_table) {
781 if (munmap(dev->log_table, dev->log_size) != 0) {
782 vubr_die("munmap()");
785 dev->log_table = 0;
787 if (dev->log_call_fd != -1) {
788 close(dev->log_call_fd);
789 dev->log_call_fd = -1;
793 static int
794 vubr_reset_device_exec(VubrDev *dev, VhostUserMsg *vmsg)
796 vubr_close_log(dev);
797 dev->ready = 0;
798 dev->features = 0;
799 return 0;
802 static int
803 vubr_set_mem_table_exec(VubrDev *dev, VhostUserMsg *vmsg)
805 int i;
806 VhostUserMemory *memory = &vmsg->payload.memory;
807 dev->nregions = memory->nregions;
809 DPRINT("Nregions: %d\n", memory->nregions);
810 for (i = 0; i < dev->nregions; i++) {
811 void *mmap_addr;
812 VhostUserMemoryRegion *msg_region = &memory->regions[i];
813 VubrDevRegion *dev_region = &dev->regions[i];
815 DPRINT("Region %d\n", i);
816 DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n",
817 msg_region->guest_phys_addr);
818 DPRINT(" memory_size: 0x%016"PRIx64"\n",
819 msg_region->memory_size);
820 DPRINT(" userspace_addr 0x%016"PRIx64"\n",
821 msg_region->userspace_addr);
822 DPRINT(" mmap_offset 0x%016"PRIx64"\n",
823 msg_region->mmap_offset);
825 dev_region->gpa = msg_region->guest_phys_addr;
826 dev_region->size = msg_region->memory_size;
827 dev_region->qva = msg_region->userspace_addr;
828 dev_region->mmap_offset = msg_region->mmap_offset;
830 /* We don't use offset argument of mmap() since the
831 * mapped address has to be page aligned, and we use huge
832 * pages. */
833 mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
834 PROT_READ | PROT_WRITE, MAP_SHARED,
835 vmsg->fds[i], 0);
837 if (mmap_addr == MAP_FAILED) {
838 vubr_die("mmap");
841 dev_region->mmap_addr = (uint64_t) mmap_addr;
842 DPRINT(" mmap_addr: 0x%016"PRIx64"\n", dev_region->mmap_addr);
845 return 0;
848 static int
849 vubr_set_log_base_exec(VubrDev *dev, VhostUserMsg *vmsg)
851 int fd;
852 uint64_t log_mmap_size, log_mmap_offset;
853 void *rc;
855 assert(vmsg->fd_num == 1);
856 fd = vmsg->fds[0];
858 assert(vmsg->size == sizeof(vmsg->payload.log));
859 log_mmap_offset = vmsg->payload.log.mmap_offset;
860 log_mmap_size = vmsg->payload.log.mmap_size;
861 DPRINT("Log mmap_offset: %"PRId64"\n", log_mmap_offset);
862 DPRINT("Log mmap_size: %"PRId64"\n", log_mmap_size);
864 rc = mmap(0, log_mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd,
865 log_mmap_offset);
866 if (rc == MAP_FAILED) {
867 vubr_die("mmap");
869 dev->log_table = rc;
870 dev->log_size = log_mmap_size;
872 vmsg->size = sizeof(vmsg->payload.u64);
873 /* Reply */
874 return 1;
877 static int
878 vubr_set_log_fd_exec(VubrDev *dev, VhostUserMsg *vmsg)
880 assert(vmsg->fd_num == 1);
881 dev->log_call_fd = vmsg->fds[0];
882 DPRINT("Got log_call_fd: %d\n", vmsg->fds[0]);
883 return 0;
886 static int
887 vubr_set_vring_num_exec(VubrDev *dev, VhostUserMsg *vmsg)
889 unsigned int index = vmsg->payload.state.index;
890 unsigned int num = vmsg->payload.state.num;
892 DPRINT("State.index: %d\n", index);
893 DPRINT("State.num: %d\n", num);
894 dev->vq[index].size = num;
895 return 0;
898 static int
899 vubr_set_vring_addr_exec(VubrDev *dev, VhostUserMsg *vmsg)
901 struct vhost_vring_addr *vra = &vmsg->payload.addr;
902 unsigned int index = vra->index;
903 VubrVirtq *vq = &dev->vq[index];
905 DPRINT("vhost_vring_addr:\n");
906 DPRINT(" index: %d\n", vra->index);
907 DPRINT(" flags: %d\n", vra->flags);
908 DPRINT(" desc_user_addr: 0x%016llx\n", vra->desc_user_addr);
909 DPRINT(" used_user_addr: 0x%016llx\n", vra->used_user_addr);
910 DPRINT(" avail_user_addr: 0x%016llx\n", vra->avail_user_addr);
911 DPRINT(" log_guest_addr: 0x%016llx\n", vra->log_guest_addr);
913 vq->desc = (struct vring_desc *)qva_to_va(dev, vra->desc_user_addr);
914 vq->used = (struct vring_used *)qva_to_va(dev, vra->used_user_addr);
915 vq->avail = (struct vring_avail *)qva_to_va(dev, vra->avail_user_addr);
916 vq->log_guest_addr = vra->log_guest_addr;
918 DPRINT("Setting virtq addresses:\n");
919 DPRINT(" vring_desc at %p\n", vq->desc);
920 DPRINT(" vring_used at %p\n", vq->used);
921 DPRINT(" vring_avail at %p\n", vq->avail);
923 vq->last_used_index = vq->used->idx;
924 return 0;
927 static int
928 vubr_set_vring_base_exec(VubrDev *dev, VhostUserMsg *vmsg)
930 unsigned int index = vmsg->payload.state.index;
931 unsigned int num = vmsg->payload.state.num;
933 DPRINT("State.index: %d\n", index);
934 DPRINT("State.num: %d\n", num);
935 dev->vq[index].last_avail_index = num;
937 return 0;
940 static int
941 vubr_get_vring_base_exec(VubrDev *dev, VhostUserMsg *vmsg)
943 unsigned int index = vmsg->payload.state.index;
945 DPRINT("State.index: %d\n", index);
946 vmsg->payload.state.num = dev->vq[index].last_avail_index;
947 vmsg->size = sizeof(vmsg->payload.state);
948 /* FIXME: this is a work-around for a bug in QEMU enabling
949 * too early vrings. When protocol features are enabled,
950 * we have to respect * VHOST_USER_SET_VRING_ENABLE request. */
951 dev->ready = 0;
953 /* Reply */
954 return 1;
957 static int
958 vubr_set_vring_kick_exec(VubrDev *dev, VhostUserMsg *vmsg)
960 uint64_t u64_arg = vmsg->payload.u64;
961 int index = u64_arg & VHOST_USER_VRING_IDX_MASK;
963 DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
965 assert((u64_arg & VHOST_USER_VRING_NOFD_MASK) == 0);
966 assert(vmsg->fd_num == 1);
968 dev->vq[index].kick_fd = vmsg->fds[0];
969 DPRINT("Got kick_fd: %d for vq: %d\n", vmsg->fds[0], index);
971 if (index % 2 == 1) {
972 /* TX queue. */
973 dispatcher_add(&dev->dispatcher, dev->vq[index].kick_fd,
974 dev, vubr_kick_cb);
976 DPRINT("Waiting for kicks on fd: %d for vq: %d\n",
977 dev->vq[index].kick_fd, index);
979 /* We temporarily use this hack to determine that both TX and RX
980 * queues are set up and ready for processing.
981 * FIXME: we need to rely in VHOST_USER_SET_VRING_ENABLE and
982 * actual kicks. */
983 if (dev->vq[0].kick_fd != -1 &&
984 dev->vq[1].kick_fd != -1) {
985 dev->ready = 1;
986 DPRINT("vhost-user-bridge is ready for processing queues.\n");
988 return 0;
992 static int
993 vubr_set_vring_call_exec(VubrDev *dev, VhostUserMsg *vmsg)
995 uint64_t u64_arg = vmsg->payload.u64;
996 int index = u64_arg & VHOST_USER_VRING_IDX_MASK;
998 DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
999 assert((u64_arg & VHOST_USER_VRING_NOFD_MASK) == 0);
1000 assert(vmsg->fd_num == 1);
1002 dev->vq[index].call_fd = vmsg->fds[0];
1003 DPRINT("Got call_fd: %d for vq: %d\n", vmsg->fds[0], index);
1005 return 0;
1008 static int
1009 vubr_set_vring_err_exec(VubrDev *dev, VhostUserMsg *vmsg)
1011 DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
1012 return 0;
1015 static int
1016 vubr_get_protocol_features_exec(VubrDev *dev, VhostUserMsg *vmsg)
1018 vmsg->payload.u64 = 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD;
1019 DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
1020 vmsg->size = sizeof(vmsg->payload.u64);
1022 /* Reply */
1023 return 1;
1026 static int
1027 vubr_set_protocol_features_exec(VubrDev *dev, VhostUserMsg *vmsg)
1029 /* FIXME: unimplented */
1030 DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
1031 return 0;
1034 static int
1035 vubr_get_queue_num_exec(VubrDev *dev, VhostUserMsg *vmsg)
1037 DPRINT("Function %s() not implemented yet.\n", __func__);
1038 return 0;
1041 static int
1042 vubr_set_vring_enable_exec(VubrDev *dev, VhostUserMsg *vmsg)
1044 unsigned int index = vmsg->payload.state.index;
1045 unsigned int enable = vmsg->payload.state.num;
1047 DPRINT("State.index: %d\n", index);
1048 DPRINT("State.enable: %d\n", enable);
1049 dev->vq[index].enable = enable;
1050 return 0;
1053 static int
1054 vubr_send_rarp_exec(VubrDev *dev, VhostUserMsg *vmsg)
1056 DPRINT("Function %s() not implemented yet.\n", __func__);
1057 return 0;
1060 static int
1061 vubr_execute_request(VubrDev *dev, VhostUserMsg *vmsg)
1063 /* Print out generic part of the request. */
1064 DPRINT(
1065 "================== Vhost user message from QEMU ==================\n");
1066 DPRINT("Request: %s (%d)\n", vubr_request_str[vmsg->request],
1067 vmsg->request);
1068 DPRINT("Flags: 0x%x\n", vmsg->flags);
1069 DPRINT("Size: %d\n", vmsg->size);
1071 if (vmsg->fd_num) {
1072 int i;
1073 DPRINT("Fds:");
1074 for (i = 0; i < vmsg->fd_num; i++) {
1075 DPRINT(" %d", vmsg->fds[i]);
1077 DPRINT("\n");
1080 switch (vmsg->request) {
1081 case VHOST_USER_NONE:
1082 return vubr_none_exec(dev, vmsg);
1083 case VHOST_USER_GET_FEATURES:
1084 return vubr_get_features_exec(dev, vmsg);
1085 case VHOST_USER_SET_FEATURES:
1086 return vubr_set_features_exec(dev, vmsg);
1087 case VHOST_USER_SET_OWNER:
1088 return vubr_set_owner_exec(dev, vmsg);
1089 case VHOST_USER_RESET_OWNER:
1090 return vubr_reset_device_exec(dev, vmsg);
1091 case VHOST_USER_SET_MEM_TABLE:
1092 return vubr_set_mem_table_exec(dev, vmsg);
1093 case VHOST_USER_SET_LOG_BASE:
1094 return vubr_set_log_base_exec(dev, vmsg);
1095 case VHOST_USER_SET_LOG_FD:
1096 return vubr_set_log_fd_exec(dev, vmsg);
1097 case VHOST_USER_SET_VRING_NUM:
1098 return vubr_set_vring_num_exec(dev, vmsg);
1099 case VHOST_USER_SET_VRING_ADDR:
1100 return vubr_set_vring_addr_exec(dev, vmsg);
1101 case VHOST_USER_SET_VRING_BASE:
1102 return vubr_set_vring_base_exec(dev, vmsg);
1103 case VHOST_USER_GET_VRING_BASE:
1104 return vubr_get_vring_base_exec(dev, vmsg);
1105 case VHOST_USER_SET_VRING_KICK:
1106 return vubr_set_vring_kick_exec(dev, vmsg);
1107 case VHOST_USER_SET_VRING_CALL:
1108 return vubr_set_vring_call_exec(dev, vmsg);
1109 case VHOST_USER_SET_VRING_ERR:
1110 return vubr_set_vring_err_exec(dev, vmsg);
1111 case VHOST_USER_GET_PROTOCOL_FEATURES:
1112 return vubr_get_protocol_features_exec(dev, vmsg);
1113 case VHOST_USER_SET_PROTOCOL_FEATURES:
1114 return vubr_set_protocol_features_exec(dev, vmsg);
1115 case VHOST_USER_GET_QUEUE_NUM:
1116 return vubr_get_queue_num_exec(dev, vmsg);
1117 case VHOST_USER_SET_VRING_ENABLE:
1118 return vubr_set_vring_enable_exec(dev, vmsg);
1119 case VHOST_USER_SEND_RARP:
1120 return vubr_send_rarp_exec(dev, vmsg);
1122 case VHOST_USER_MAX:
1123 assert(vmsg->request != VHOST_USER_MAX);
1125 return 0;
1128 static void
1129 vubr_receive_cb(int sock, void *ctx)
1131 VubrDev *dev = (VubrDev *) ctx;
1132 VhostUserMsg vmsg;
1133 int reply_requested;
1135 vubr_message_read(sock, &vmsg);
1136 reply_requested = vubr_execute_request(dev, &vmsg);
1137 if (reply_requested) {
1138 /* Set the version in the flags when sending the reply */
1139 vmsg.flags &= ~VHOST_USER_VERSION_MASK;
1140 vmsg.flags |= VHOST_USER_VERSION;
1141 vmsg.flags |= VHOST_USER_REPLY_MASK;
1142 vubr_message_write(sock, &vmsg);
1146 static void
1147 vubr_accept_cb(int sock, void *ctx)
1149 VubrDev *dev = (VubrDev *)ctx;
1150 int conn_fd;
1151 struct sockaddr_un un;
1152 socklen_t len = sizeof(un);
1154 conn_fd = accept(sock, (struct sockaddr *) &un, &len);
1155 if (conn_fd == -1) {
1156 vubr_die("accept()");
1158 DPRINT("Got connection from remote peer on sock %d\n", conn_fd);
1159 dispatcher_add(&dev->dispatcher, conn_fd, ctx, vubr_receive_cb);
1162 static VubrDev *
1163 vubr_new(const char *path)
1165 VubrDev *dev = (VubrDev *) calloc(1, sizeof(VubrDev));
1166 dev->nregions = 0;
1167 int i;
1168 struct sockaddr_un un;
1169 size_t len;
1171 for (i = 0; i < MAX_NR_VIRTQUEUE; i++) {
1172 dev->vq[i] = (VubrVirtq) {
1173 .call_fd = -1, .kick_fd = -1,
1174 .size = 0,
1175 .last_avail_index = 0, .last_used_index = 0,
1176 .desc = 0, .avail = 0, .used = 0,
1177 .enable = 0,
1181 /* Init log */
1182 dev->log_call_fd = -1;
1183 dev->log_size = 0;
1184 dev->log_table = 0;
1185 dev->ready = 0;
1186 dev->features = 0;
1188 /* Get a UNIX socket. */
1189 dev->sock = socket(AF_UNIX, SOCK_STREAM, 0);
1190 if (dev->sock == -1) {
1191 vubr_die("socket");
1194 un.sun_family = AF_UNIX;
1195 strcpy(un.sun_path, path);
1196 len = sizeof(un.sun_family) + strlen(path);
1197 unlink(path);
1199 if (bind(dev->sock, (struct sockaddr *) &un, len) == -1) {
1200 vubr_die("bind");
1203 if (listen(dev->sock, 1) == -1) {
1204 vubr_die("listen");
1207 dispatcher_init(&dev->dispatcher);
1208 dispatcher_add(&dev->dispatcher, dev->sock, (void *)dev,
1209 vubr_accept_cb);
1211 DPRINT("Waiting for connections on UNIX socket %s ...\n", path);
1212 return dev;
/* Fill saddr->sin_addr from `host`.
 *
 * A string starting with a digit is parsed as a numeric IPv4 address with
 * inet_aton(); anything else is resolved with gethostbyname() and the
 * first returned address is used. Any failure terminates the process. */
static void
vubr_set_host(struct sockaddr_in *saddr, const char *host)
{
    /* <ctype.h> functions require a value representable as unsigned char;
     * plain char may be negative. */
    if (isdigit((unsigned char)host[0])) {
        if (!inet_aton(host, &saddr->sin_addr)) {
            fprintf(stderr, "inet_aton() failed.\n");
            exit(1);
        }
    } else {
        struct hostent *he = gethostbyname(host);

        if (!he) {
            fprintf(stderr, "gethostbyname() failed.\n");
            exit(1);
        }
        /* Copy the bytes instead of dereferencing a cast char pointer,
         * which need not be suitably aligned for struct in_addr. */
        memcpy(&saddr->sin_addr, he->h_addr_list[0], sizeof(saddr->sin_addr));
    }
}
1234 static void
1235 vubr_backend_udp_setup(VubrDev *dev,
1236 const char *local_host,
1237 const char *local_port,
1238 const char *remote_host,
1239 const char *remote_port)
1241 int sock;
1242 const char *r;
1244 int lport, rport;
1246 lport = strtol(local_port, (char **)&r, 0);
1247 if (r == local_port) {
1248 fprintf(stderr, "lport parsing failed.\n");
1249 exit(1);
1252 rport = strtol(remote_port, (char **)&r, 0);
1253 if (r == remote_port) {
1254 fprintf(stderr, "rport parsing failed.\n");
1255 exit(1);
1258 struct sockaddr_in si_local = {
1259 .sin_family = AF_INET,
1260 .sin_port = htons(lport),
1263 vubr_set_host(&si_local, local_host);
1265 /* setup destination for sends */
1266 dev->backend_udp_dest = (struct sockaddr_in) {
1267 .sin_family = AF_INET,
1268 .sin_port = htons(rport),
1270 vubr_set_host(&dev->backend_udp_dest, remote_host);
1272 sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
1273 if (sock == -1) {
1274 vubr_die("socket");
1277 if (bind(sock, (struct sockaddr *)&si_local, sizeof(si_local)) == -1) {
1278 vubr_die("bind");
1281 dev->backend_udp_sock = sock;
1282 dispatcher_add(&dev->dispatcher, sock, dev, vubr_backend_recv_cb);
1283 DPRINT("Waiting for data from udp backend on %s:%d...\n",
1284 local_host, lport);
1287 static void
1288 vubr_run(VubrDev *dev)
1290 while (1) {
1291 /* timeout 200ms */
1292 dispatcher_wait(&dev->dispatcher, 200000);
1293 /* Here one can try polling strategy. */
/* Split `buf`, which has the form "host:port", at its first ':'.
 *
 * On success *host and *port are set to newly allocated copies of the two
 * parts and 0 is returned. The caller owns the copies. -1 is returned when
 * `buf` contains no ':' or memory cannot be allocated; *host and *port are
 * left untouched in that case.
 *
 * `buf` itself is never modified: it is declared const and may point into
 * argv. */
static int
vubr_parse_host_port(const char **host, const char **port, const char *buf)
{
    const char *sep = strchr(buf, ':');
    size_t host_len;
    size_t port_len;
    char *host_copy;
    char *port_copy;

    if (!sep) {
        return -1;
    }

    host_len = (size_t)(sep - buf);
    port_len = strlen(sep + 1);

    host_copy = malloc(host_len + 1);
    port_copy = malloc(port_len + 1);
    if (!host_copy || !port_copy) {
        free(host_copy);
        free(port_copy);
        return -1;
    }

    memcpy(host_copy, buf, host_len);
    host_copy[host_len] = '\0';
    /* port_len + 1 also copies the terminating NUL. */
    memcpy(port_copy, sep + 1, port_len + 1);

    *host = host_copy;
    *port = port_copy;
    return 0;
}
/* Built-in defaults, used when the matching command line option is absent
 * and also printed in the usage text. */
1311 #define DEFAULT_UD_SOCKET "/tmp/vubr.sock"
1312 #define DEFAULT_LHOST "127.0.0.1"
1313 #define DEFAULT_LPORT "4444"
1314 #define DEFAULT_RHOST "127.0.0.1"
1315 #define DEFAULT_RPORT "5555"
/* Effective settings; main() overrides them from -u, -l and -r. */
1317 static const char *ud_socket_path = DEFAULT_UD_SOCKET;
1318 static const char *lhost = DEFAULT_LHOST;
1319 static const char *lport = DEFAULT_LPORT;
1320 static const char *rhost = DEFAULT_RHOST;
1321 static const char *rport = DEFAULT_RPORT;
1324 main(int argc, char *argv[])
1326 VubrDev *dev;
1327 int opt;
1329 while ((opt = getopt(argc, argv, "l:r:u:")) != -1) {
1331 switch (opt) {
1332 case 'l':
1333 if (vubr_parse_host_port(&lhost, &lport, optarg) < 0) {
1334 goto out;
1336 break;
1337 case 'r':
1338 if (vubr_parse_host_port(&rhost, &rport, optarg) < 0) {
1339 goto out;
1341 break;
1342 case 'u':
1343 ud_socket_path = strdup(optarg);
1344 break;
1345 default:
1346 goto out;
1350 DPRINT("ud socket: %s\n", ud_socket_path);
1351 DPRINT("local: %s:%s\n", lhost, lport);
1352 DPRINT("remote: %s:%s\n", rhost, rport);
1354 dev = vubr_new(ud_socket_path);
1355 if (!dev) {
1356 return 1;
1359 vubr_backend_udp_setup(dev, lhost, lport, rhost, rport);
1360 vubr_run(dev);
1361 return 0;
1363 out:
1364 fprintf(stderr, "Usage: %s ", argv[0]);
1365 fprintf(stderr, "[-u ud_socket_path] [-l lhost:lport] [-r rhost:rport]\n");
1366 fprintf(stderr, "\t-u path to unix doman socket. default: %s\n",
1367 DEFAULT_UD_SOCKET);
1368 fprintf(stderr, "\t-l local host and port. default: %s:%s\n",
1369 DEFAULT_LHOST, DEFAULT_LPORT);
1370 fprintf(stderr, "\t-r remote host and port. default: %s:%s\n",
1371 DEFAULT_RHOST, DEFAULT_RPORT);
1373 return 1;