pci: introduce helper function to handle msi-x and msi.
[qemu/cris-port.git] / block / sheepdog.c
blob81aa564f263a511cf5e856a4143fe036d47cd8ef
1 /*
2 * Copyright (C) 2009-2010 Nippon Telegraph and Telephone Corporation.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License version
6 * 2 as published by the Free Software Foundation.
8 * You should have received a copy of the GNU General Public License
9 * along with this program. If not, see <http://www.gnu.org/licenses/>.
11 #ifdef _WIN32
12 #include <windows.h>
13 #include <winsock2.h>
14 #include <ws2tcpip.h>
15 #else
16 #include <netdb.h>
17 #include <netinet/tcp.h>
19 #define closesocket(s) close(s)
20 #endif
22 #include "qemu-common.h"
23 #include "qemu-error.h"
24 #include "qemu_socket.h"
25 #include "block_int.h"
/* Protocol version placed in the proto_ver field of a request header. */
#define SD_PROTO_VER 0x01

/* Address and port used by connect_to_sdog() when no address is given. */
#define SD_DEFAULT_ADDR "localhost"
#define SD_DEFAULT_PORT "7000"

/* Opcodes of the object requests. */
#define SD_OP_CREATE_AND_WRITE_OBJ  0x01
#define SD_OP_READ_OBJ              0x02
#define SD_OP_WRITE_OBJ             0x03

/* Opcodes of the vdi requests. */
#define SD_OP_NEW_VDI               0x11
#define SD_OP_LOCK_VDI              0x12
#define SD_OP_RELEASE_VDI           0x13
#define SD_OP_GET_VDI_INFO          0x14
#define SD_OP_READ_VDIS             0x15

/* Bits of the flags field of a request header. */
#define SD_FLAG_CMD_WRITE           0x01
#define SD_FLAG_CMD_COW             0x02

/* Values of the result field of a response header (see sd_strerror()). */
#define SD_RES_SUCCESS              0x00 /* Success */
#define SD_RES_UNKNOWN              0x01 /* Unknown error */
#define SD_RES_NO_OBJ               0x02 /* No object found */
#define SD_RES_EIO                  0x03 /* I/O error */
#define SD_RES_VDI_EXIST            0x04 /* Vdi exists already */
#define SD_RES_INVALID_PARMS        0x05 /* Invalid parameters */
#define SD_RES_SYSTEM_ERROR         0x06 /* System error */
#define SD_RES_VDI_LOCKED           0x07 /* Vdi is locked */
#define SD_RES_NO_VDI               0x08 /* No vdi found */
#define SD_RES_NO_BASE_VDI          0x09 /* No base vdi found */
#define SD_RES_VDI_READ             0x0A /* Cannot read requested vdi */
#define SD_RES_VDI_WRITE            0x0B /* Cannot write requested vdi */
#define SD_RES_BASE_VDI_READ        0x0C /* Cannot read base vdi */
#define SD_RES_BASE_VDI_WRITE       0x0D /* Cannot write base vdi */
#define SD_RES_NO_TAG               0x0E /* Requested tag is not found */
#define SD_RES_STARTUP              0x0F /* Sheepdog is on starting up */
#define SD_RES_VDI_NOT_LOCKED       0x10 /* Vdi is not locked */
#define SD_RES_SHUTDOWN             0x11 /* Sheepdog is shutting down */
#define SD_RES_NO_MEM               0x12 /* Cannot allocate memory */
#define SD_RES_FULL_VDI             0x13 /* we already have the maximum vdis */
#define SD_RES_VER_MISMATCH         0x14 /* Protocol version mismatch */
#define SD_RES_NO_SPACE             0x15 /* Server has no room for new objects */
#define SD_RES_WAIT_FOR_FORMAT      0x16 /* Waiting for a format operation */
#define SD_RES_WAIT_FOR_JOIN        0x17 /* Waiting for other nodes joining */
#define SD_RES_JOIN_FAILED          0x18 /* Target node had failed to join sheepdog */
/*
 * Object ID rules
 *
 *  0 - 19 (20 bits): data object space
 * 20 - 31 (12 bits): reserved data object space
 * 32 - 55 (24 bits): vdi object space
 * 56 - 59 ( 4 bits): reserved vdi object space
 * 60 - 63 ( 4 bits): object type identifier space
 */

/* Position of the vdi id inside an object id, and the object type bits. */
#define VDI_SPACE_SHIFT     32
#define VDI_BIT             (UINT64_C(1) << 63)
#define VMSTATE_BIT         (UINT64_C(1) << 62)

/* Limits of a vdi and of the whole vdi space. */
#define MAX_DATA_OBJS       (UINT64_C(1) << 20)
#define MAX_CHILDREN        1024
#define SD_MAX_VDI_LEN      256
#define SD_MAX_VDI_TAG_LEN  256
#define SD_NR_VDIS          (1U << 24)
#define SD_DATA_OBJ_SIZE    (UINT64_C(1) << 22)
#define SD_MAX_VDI_SIZE     (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS)
#define SECTOR_SIZE         512

#define SD_INODE_SIZE       (sizeof(SheepdogInode))
/* snapshot id passed to look up the current (non-snapshot) vdi */
#define CURRENT_VDI_ID      0
/*
 * Generic request header.  The first four words are shared by all
 * request and response layouts in this file; the opcode-specific
 * layouts below are cast to this type when passed to do_req().
 */
typedef struct SheepdogReq {
    uint8_t     proto_ver;          /* SD_PROTO_VER */
    uint8_t     opcode;             /* one of SD_OP_* */
    uint16_t    flags;              /* SD_FLAG_CMD_* bits */
    uint32_t    epoch;
    uint32_t    id;                 /* request id */
    uint32_t    data_length;        /* length of the data after the header */
    uint32_t    opcode_specific[8];
} SheepdogReq;
/* Generic response header; same size as SheepdogReq. */
typedef struct SheepdogRsp {
    uint8_t     proto_ver;
    uint8_t     opcode;
    uint16_t    flags;
    uint32_t    epoch;
    uint32_t    id;
    uint32_t    data_length;        /* length of the data after the header */
    uint32_t    result;             /* one of SD_RES_* */
    uint32_t    opcode_specific[7];
} SheepdogRsp;
/* Request header of the object operations (read / write / create). */
typedef struct SheepdogObjReq {
    uint8_t     proto_ver;
    uint8_t     opcode;
    uint16_t    flags;
    uint32_t    epoch;
    uint32_t    id;
    uint32_t    data_length;
    uint64_t    oid;        /* target object */
    uint64_t    cow_oid;    /* filled from AIOReq.base_oid by add_aio_request() */
    uint32_t    copies;
    uint32_t    rsvd;
    uint64_t    offset;     /* byte offset inside the object */
} SheepdogObjReq;
/* Response header of the object operations. */
typedef struct SheepdogObjRsp {
    uint8_t     proto_ver;
    uint8_t     opcode;
    uint16_t    flags;
    uint32_t    epoch;
    uint32_t    id;             /* matched against AIOReq.id on receipt */
    uint32_t    data_length;
    uint32_t    result;         /* one of SD_RES_* */
    uint32_t    copies;
    uint32_t    pad[6];
} SheepdogObjRsp;
/* Request header of the vdi operations (new / lock / release / info). */
typedef struct SheepdogVdiReq {
    uint8_t     proto_ver;
    uint8_t     opcode;
    uint16_t    flags;
    uint32_t    epoch;
    uint32_t    id;
    uint32_t    data_length;
    uint64_t    vdi_size;
    uint32_t    base_vdi_id;
    uint32_t    copies;
    uint32_t    snapid;
    uint32_t    pad[3];
} SheepdogVdiReq;
/* Response header of the vdi operations. */
typedef struct SheepdogVdiRsp {
    uint8_t     proto_ver;
    uint8_t     opcode;
    uint16_t    flags;
    uint32_t    epoch;
    uint32_t    id;
    uint32_t    data_length;
    uint32_t    result;         /* one of SD_RES_* */
    uint32_t    rsvd;
    uint32_t    vdi_id;         /* id of the vdi the request resolved to */
    uint32_t    pad[5];
} SheepdogVdiRsp;
170 typedef struct SheepdogInode {
171 char name[SD_MAX_VDI_LEN];
172 char tag[SD_MAX_VDI_TAG_LEN];
173 uint64_t ctime;
174 uint64_t snap_ctime;
175 uint64_t vm_clock_nsec;
176 uint64_t vdi_size;
177 uint64_t vm_state_size;
178 uint16_t copy_policy;
179 uint8_t nr_copies;
180 uint8_t block_size_shift;
181 uint32_t snap_id;
182 uint32_t vdi_id;
183 uint32_t parent_vdi_id;
184 uint32_t child_vdi_id[MAX_CHILDREN];
185 uint32_t data_vdi_id[MAX_DATA_OBJS];
186 } SheepdogInode;
/*
 * 64 bit FNV-1a non-zero initial basis
 */
#define FNV1A_64_INIT ((uint64_t)0xcbf29ce484222325ULL)

/*
 * 64 bit Fowler/Noll/Vo FNV-1a hash code
 *
 * Hashes the `len' bytes at `buf', starting from the running hash value
 * `hval' (FNV1A_64_INIT for a fresh hash), and returns the new value.
 */
static inline uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval)
{
    const unsigned char *bytes = buf;
    size_t i;

    for (i = 0; i < len; i++) {
        /* xor the low 8 bits with the next input byte ... */
        hval ^= (uint64_t)bytes[i];
        /* ... then multiply by the 64 bit FNV prime (2^40 + 0x1b3) */
        hval *= UINT64_C(0x100000001b3);
    }

    return hval;
}
208 static inline int is_data_obj_writeable(SheepdogInode *inode, unsigned int idx)
210 return inode->vdi_id == inode->data_vdi_id[idx];
213 static inline int is_data_obj(uint64_t oid)
215 return !(VDI_BIT & oid);
218 static inline uint64_t data_oid_to_idx(uint64_t oid)
220 return oid & (MAX_DATA_OBJS - 1);
223 static inline uint64_t vid_to_vdi_oid(uint32_t vid)
225 return VDI_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT);
228 static inline uint64_t vid_to_vmstate_oid(uint32_t vid, uint32_t idx)
230 return VMSTATE_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT) | idx;
233 static inline uint64_t vid_to_data_oid(uint32_t vid, uint32_t idx)
235 return ((uint64_t)vid << VDI_SPACE_SHIFT) | idx;
238 static inline int is_snapshot(struct SheepdogInode *inode)
240 return !!inode->snap_ctime;
/*
 * Debug trace macro.  With DEBUG_SDOG defined it prints the calling
 * function and line number followed by the formatted message to stdout;
 * otherwise it expands to nothing and its arguments are not evaluated.
 */
#undef dprintf
#ifdef DEBUG_SDOG
#define dprintf(fmt, ...)                                               \
    do {                                                                \
        fprintf(stdout, "%s %d: " fmt, __func__, __LINE__, ##__VA_ARGS__); \
    } while (0)
#else
#define dprintf(fmt, ...)
#endif
253 typedef struct SheepdogAIOCB SheepdogAIOCB;
255 typedef struct AIOReq {
256 SheepdogAIOCB *aiocb;
257 unsigned int iov_offset;
259 uint64_t oid;
260 uint64_t base_oid;
261 uint64_t offset;
262 unsigned int data_len;
263 uint8_t flags;
264 uint32_t id;
266 QLIST_ENTRY(AIOReq) outstanding_aio_siblings;
267 QLIST_ENTRY(AIOReq) aioreq_siblings;
268 } AIOReq;
/* Kind of operation a SheepdogAIOCB performs. */
enum AIOCBState {
    AIOCB_WRITE_UDATA,  /* write of user data */
    AIOCB_READ_UDATA,   /* read of user data */
};
275 struct SheepdogAIOCB {
276 BlockDriverAIOCB common;
278 QEMUIOVector *qiov;
280 int64_t sector_num;
281 int nb_sectors;
283 int ret;
284 enum AIOCBState aiocb_type;
286 QEMUBH *bh;
287 void (*aio_done_func)(SheepdogAIOCB *);
289 int canceled;
291 QLIST_HEAD(aioreq_head, AIOReq) aioreq_head;
294 typedef struct BDRVSheepdogState {
295 SheepdogInode inode;
297 uint32_t min_dirty_data_idx;
298 uint32_t max_dirty_data_idx;
300 char name[SD_MAX_VDI_LEN];
301 int is_snapshot;
303 char *addr;
304 char *port;
305 int fd;
307 uint32_t aioreq_seq_num;
308 QLIST_HEAD(outstanding_aio_head, AIOReq) outstanding_aio_head;
309 } BDRVSheepdogState;
311 static const char * sd_strerror(int err)
313 int i;
315 static const struct {
316 int err;
317 const char *desc;
318 } errors[] = {
319 {SD_RES_SUCCESS, "Success"},
320 {SD_RES_UNKNOWN, "Unknown error"},
321 {SD_RES_NO_OBJ, "No object found"},
322 {SD_RES_EIO, "I/O error"},
323 {SD_RES_VDI_EXIST, "VDI exists already"},
324 {SD_RES_INVALID_PARMS, "Invalid parameters"},
325 {SD_RES_SYSTEM_ERROR, "System error"},
326 {SD_RES_VDI_LOCKED, "VDI is already locked"},
327 {SD_RES_NO_VDI, "No vdi found"},
328 {SD_RES_NO_BASE_VDI, "No base VDI found"},
329 {SD_RES_VDI_READ, "Failed read the requested VDI"},
330 {SD_RES_VDI_WRITE, "Failed to write the requested VDI"},
331 {SD_RES_BASE_VDI_READ, "Failed to read the base VDI"},
332 {SD_RES_BASE_VDI_WRITE, "Failed to write the base VDI"},
333 {SD_RES_NO_TAG, "Failed to find the requested tag"},
334 {SD_RES_STARTUP, "The system is still booting"},
335 {SD_RES_VDI_NOT_LOCKED, "VDI isn't locked"},
336 {SD_RES_SHUTDOWN, "The system is shutting down"},
337 {SD_RES_NO_MEM, "Out of memory on the server"},
338 {SD_RES_FULL_VDI, "We already have the maximum vdis"},
339 {SD_RES_VER_MISMATCH, "Protocol version mismatch"},
340 {SD_RES_NO_SPACE, "Server has no space for new objects"},
341 {SD_RES_WAIT_FOR_FORMAT, "Sheepdog is waiting for a format operation"},
342 {SD_RES_WAIT_FOR_JOIN, "Sheepdog is waiting for other nodes joining"},
343 {SD_RES_JOIN_FAILED, "Target node had failed to join sheepdog"},
346 for (i = 0; i < ARRAY_SIZE(errors); ++i) {
347 if (errors[i].err == err) {
348 return errors[i].desc;
352 return "Invalid error code";
356 * Sheepdog I/O handling:
358 * 1. In the sd_aio_readv/writev, read/write requests are added to the
359 * QEMU Bottom Halves.
361 * 2. In sd_readv_writev_bh_cb, the callbacks of BHs, we send the I/O
362 * requests to the server and link the requests to the
363 * outstanding_list in the BDRVSheepdogState. we exits the
364 * function without waiting for receiving the response.
366 * 3. We receive the response in aio_read_response, the fd handler to
367 * the sheepdog connection. If metadata update is needed, we send
368 * the write request to the vdi object in sd_write_done, the write
369 * completion function. The AIOCB callback is not called until all
370 * the requests belonging to the AIOCB are finished.
373 static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb,
374 uint64_t oid, unsigned int data_len,
375 uint64_t offset, uint8_t flags,
376 uint64_t base_oid, unsigned int iov_offset)
378 AIOReq *aio_req;
380 aio_req = qemu_malloc(sizeof(*aio_req));
381 aio_req->aiocb = acb;
382 aio_req->iov_offset = iov_offset;
383 aio_req->oid = oid;
384 aio_req->base_oid = base_oid;
385 aio_req->offset = offset;
386 aio_req->data_len = data_len;
387 aio_req->flags = flags;
388 aio_req->id = s->aioreq_seq_num++;
390 QLIST_INSERT_HEAD(&s->outstanding_aio_head, aio_req,
391 outstanding_aio_siblings);
392 QLIST_INSERT_HEAD(&acb->aioreq_head, aio_req, aioreq_siblings);
394 return aio_req;
397 static inline int free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req)
399 SheepdogAIOCB *acb = aio_req->aiocb;
400 QLIST_REMOVE(aio_req, outstanding_aio_siblings);
401 QLIST_REMOVE(aio_req, aioreq_siblings);
402 qemu_free(aio_req);
404 return !QLIST_EMPTY(&acb->aioreq_head);
407 static void sd_finish_aiocb(SheepdogAIOCB *acb)
409 if (!acb->canceled) {
410 acb->common.cb(acb->common.opaque, acb->ret);
412 qemu_aio_release(acb);
415 static void sd_aio_cancel(BlockDriverAIOCB *blockacb)
417 SheepdogAIOCB *acb = (SheepdogAIOCB *)blockacb;
420 * Sheepdog cannot cancel the requests which are already sent to
421 * the servers, so we just complete the request with -EIO here.
423 acb->common.cb(acb->common.opaque, -EIO);
424 acb->canceled = 1;
427 static AIOPool sd_aio_pool = {
428 .aiocb_size = sizeof(SheepdogAIOCB),
429 .cancel = sd_aio_cancel,
432 static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
433 int64_t sector_num, int nb_sectors,
434 BlockDriverCompletionFunc *cb, void *opaque)
436 SheepdogAIOCB *acb;
438 acb = qemu_aio_get(&sd_aio_pool, bs, cb, opaque);
440 acb->qiov = qiov;
442 acb->sector_num = sector_num;
443 acb->nb_sectors = nb_sectors;
445 acb->aio_done_func = NULL;
446 acb->canceled = 0;
447 acb->bh = NULL;
448 acb->ret = 0;
449 QLIST_INIT(&acb->aioreq_head);
450 return acb;
453 static int sd_schedule_bh(QEMUBHFunc *cb, SheepdogAIOCB *acb)
455 if (acb->bh) {
456 error_report("bug: %d %d\n", acb->aiocb_type, acb->aiocb_type);
457 return -EIO;
460 acb->bh = qemu_bh_new(cb, acb);
461 if (!acb->bh) {
462 error_report("oom: %d %d\n", acb->aiocb_type, acb->aiocb_type);
463 return -EIO;
466 qemu_bh_schedule(acb->bh);
468 return 0;
471 #ifdef _WIN32
/* Minimal message header for the sendmsg()/recvmsg() emulation below. */
struct msghdr {
    struct iovec *msg_iov;      /* array of buffers */
    size_t msg_iovlen;          /* number of elements in msg_iov */
};
478 static ssize_t sendmsg(int s, const struct msghdr *msg, int flags)
480 size_t size = 0;
481 char *buf, *p;
482 int i, ret;
484 /* count the msg size */
485 for (i = 0; i < msg->msg_iovlen; i++) {
486 size += msg->msg_iov[i].iov_len;
488 buf = qemu_malloc(size);
490 p = buf;
491 for (i = 0; i < msg->msg_iovlen; i++) {
492 memcpy(p, msg->msg_iov[i].iov_base, msg->msg_iov[i].iov_len);
493 p += msg->msg_iov[i].iov_len;
496 ret = send(s, buf, size, flags);
498 qemu_free(buf);
499 return ret;
502 static ssize_t recvmsg(int s, struct msghdr *msg, int flags)
504 size_t size = 0;
505 char *buf, *p;
506 int i, ret;
508 /* count the msg size */
509 for (i = 0; i < msg->msg_iovlen; i++) {
510 size += msg->msg_iov[i].iov_len;
512 buf = qemu_malloc(size);
514 ret = recv(s, buf, size, flags);
515 if (ret < 0) {
516 goto out;
519 p = buf;
520 for (i = 0; i < msg->msg_iovlen; i++) {
521 memcpy(msg->msg_iov[i].iov_base, p, msg->msg_iov[i].iov_len);
522 p += msg->msg_iov[i].iov_len;
524 out:
525 qemu_free(buf);
526 return ret;
529 #endif
/*
 * Send/recv data with iovec buffers
 *
 * This function sends/receives data from/to the iovec buffers directly.
 * The first `offset' bytes in the iovec buffers are skipped and the next
 * `len' bytes are used.
 *
 * For example,
 *
 *   do_send_recv(sockfd, iov, len, offset, 1);
 *
 * is equal to
 *
 *   char *buf = malloc(size);
 *   iov_to_buf(iov, iovcnt, buf, offset, size);
 *   send(sockfd, buf, size, 0);
 *   free(buf);
 *
 * The iovec elements at both ends of the window are adjusted for the
 * call and restored before returning.  Returns the result of
 * sendmsg()/recvmsg().
 */
static int do_send_recv(int sockfd, struct iovec *iov, int len, int offset,
                        int write)
{
    struct msghdr msg;
    struct iovec *first = iov;      /* element holding the first byte */
    struct iovec *last = iov;       /* element holding the last byte */
    int nr_iov = 1;
    int trimmed, ret;

    /* find the last element of the window and cut it at the window end */
    for (len += offset; last->iov_len < len; last++) {
        len -= last->iov_len;
        nr_iov++;
    }
    trimmed = last->iov_len - len;
    last->iov_len -= trimmed;

    /* drop the elements lying entirely inside the skipped prefix */
    while (first->iov_len <= offset) {
        offset -= first->iov_len;
        first++;
        nr_iov--;
    }
    first->iov_base = (char *) first->iov_base + offset;
    first->iov_len -= offset;

    memset(&msg, 0, sizeof(msg));
    msg.msg_iov = first;
    msg.msg_iovlen = nr_iov;

    ret = write ? sendmsg(sockfd, &msg, 0) : recvmsg(sockfd, &msg, 0);

    /* undo the adjustments made above */
    first->iov_base = (char *) first->iov_base - offset;
    first->iov_len += offset;
    last->iov_len += trimmed;

    return ret;
}
/*
 * Open a TCP connection to the sheepdog server at `addr':`port'.
 *
 * When `addr' is NULL, SD_DEFAULT_ADDR and SD_DEFAULT_PORT are used.
 * Every address returned by getaddrinfo() is tried in turn until one
 * connects.
 *
 * Returns the connected socket, or -1 on failure.
 */
static int connect_to_sdog(const char *addr, const char *port)
{
    char hbuf[NI_MAXHOST], sbuf[NI_MAXSERV];
    int fd, ret, saved_errno;
    struct addrinfo hints, *res, *res0;

    if (!addr) {
        addr = SD_DEFAULT_ADDR;
        port = SD_DEFAULT_PORT;
    }

    memset(&hints, 0, sizeof(hints));
    hints.ai_socktype = SOCK_STREAM;

    ret = getaddrinfo(addr, port, &hints, &res0);
    if (ret) {
        /* getaddrinfo() reports errors through its return value, not errno */
        error_report("unable to get address info %s, %s\n",
                     addr, gai_strerror(ret));
        return -1;
    }

    for (res = res0; res; res = res->ai_next) {
        ret = getnameinfo(res->ai_addr, res->ai_addrlen, hbuf, sizeof(hbuf),
                          sbuf, sizeof(sbuf), NI_NUMERICHOST | NI_NUMERICSERV);
        if (ret) {
            continue;
        }

        fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol);
        if (fd < 0) {
            continue;
        }

        do {
            ret = connect(fd, res->ai_addr, res->ai_addrlen);
        } while (ret < 0 && errno == EINTR);

        if (ret < 0) {
            /*
             * Do not leak the socket; keep errno of the failed connect()
             * for the caller and move on to the next address.
             */
            saved_errno = errno;
            closesocket(fd);
            errno = saved_errno;
            continue;
        }

        dprintf("connected to %s:%s\n", addr, port);
        goto success;
    }
    fd = -1;
    error_report("failed connect to %s:%s\n", addr, port);
success:
    freeaddrinfo(res0);
    return fd;
}
/*
 * Transfer exactly `len' bytes between `sockfd' and the iovec buffers,
 * starting `iov_offset' bytes into the buffers.  `write' selects sending
 * (non-zero) or receiving (zero).
 *
 * Short transfers are continued, and EINTR/EAGAIN are retried.  A
 * transfer of zero bytes while data is still expected means the peer
 * closed the connection; it is reported as an error (errno is set to
 * EIO) instead of being retried forever.  A zero `len' does nothing.
 *
 * Returns 0 on success, 1 on failure.
 */
static int do_readv_writev(int sockfd, struct iovec *iov, int len,
                           int iov_offset, int write)
{
    int ret;

    while (len) {
        ret = do_send_recv(sockfd, iov, len, iov_offset, write);
        if (ret < 0) {
            if (errno == EINTR || errno == EAGAIN) {
                continue;
            }
            error_report("failed to recv a rsp, %s\n", strerror(errno));
            return 1;
        }
        if (ret == 0) {
            /* callers print strerror(errno), so give them something sane */
            errno = EIO;
            error_report("connection closed by peer\n");
            return 1;
        }

        iov_offset += ret;
        len -= ret;
    }

    return 0;
}
/* Receive `len' bytes into `iov', starting `iov_offset' bytes in. */
static int do_readv(int sockfd, struct iovec *iov, int len, int iov_offset)
{
    const int is_write = 0;

    return do_readv_writev(sockfd, iov, len, iov_offset, is_write);
}
/* Send `len' bytes from `iov', starting `iov_offset' bytes in. */
static int do_writev(int sockfd, struct iovec *iov, int len, int iov_offset)
{
    const int is_write = 1;

    return do_readv_writev(sockfd, iov, len, iov_offset, is_write);
}
/*
 * Transfer the `len' bytes at `buf' over `sockfd' by wrapping the buffer
 * in a single-element iovec.
 */
static int do_read_write(int sockfd, void *buf, int len, int write)
{
    struct iovec single = {
        .iov_base = buf,
        .iov_len = len,
    };

    return do_readv_writev(sockfd, &single, len, 0, write);
}
/* Receive `len' bytes from `sockfd' into `buf'. */
static int do_read(int sockfd, void *buf, int len)
{
    const int is_write = 0;

    return do_read_write(sockfd, buf, len, is_write);
}
/* Send the `len' bytes at `buf' over `sockfd'. */
static int do_write(int sockfd, void *buf, int len)
{
    const int is_write = 1;

    return do_read_write(sockfd, buf, len, is_write);
}
699 static int send_req(int sockfd, SheepdogReq *hdr, void *data,
700 unsigned int *wlen)
702 int ret;
703 struct iovec iov[2];
705 iov[0].iov_base = hdr;
706 iov[0].iov_len = sizeof(*hdr);
708 if (*wlen) {
709 iov[1].iov_base = data;
710 iov[1].iov_len = *wlen;
713 ret = do_writev(sockfd, iov, sizeof(*hdr) + *wlen, 0);
714 if (ret) {
715 error_report("failed to send a req, %s\n", strerror(errno));
716 ret = -1;
719 return ret;
722 static int do_req(int sockfd, SheepdogReq *hdr, void *data,
723 unsigned int *wlen, unsigned int *rlen)
725 int ret;
727 ret = send_req(sockfd, hdr, data, wlen);
728 if (ret) {
729 ret = -1;
730 goto out;
733 ret = do_read(sockfd, hdr, sizeof(*hdr));
734 if (ret) {
735 error_report("failed to get a rsp, %s\n", strerror(errno));
736 ret = -1;
737 goto out;
740 if (*rlen > hdr->data_length) {
741 *rlen = hdr->data_length;
744 if (*rlen) {
745 ret = do_read(sockfd, data, *rlen);
746 if (ret) {
747 error_report("failed to get the data, %s\n", strerror(errno));
748 ret = -1;
749 goto out;
752 ret = 0;
753 out:
754 return ret;
757 static int add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
758 struct iovec *iov, int niov, int create,
759 enum AIOCBState aiocb_type);
762 * This function searchs pending requests to the object `oid', and
763 * sends them.
765 static void send_pending_req(BDRVSheepdogState *s, uint64_t oid, uint32_t id)
767 AIOReq *aio_req, *next;
768 SheepdogAIOCB *acb;
769 int ret;
771 QLIST_FOREACH_SAFE(aio_req, &s->outstanding_aio_head,
772 outstanding_aio_siblings, next) {
773 if (id == aio_req->id) {
774 continue;
776 if (aio_req->oid != oid) {
777 continue;
780 acb = aio_req->aiocb;
781 ret = add_aio_request(s, aio_req, acb->qiov->iov,
782 acb->qiov->niov, 0, acb->aiocb_type);
783 if (ret < 0) {
784 error_report("add_aio_request is failed\n");
785 free_aio_req(s, aio_req);
786 if (QLIST_EMPTY(&acb->aioreq_head)) {
787 sd_finish_aiocb(acb);
794 * Receive responses of the I/O requests.
796 * This function is registered as a fd handler, and called from the
797 * main loop when s->fd is ready for reading responses.
799 static void aio_read_response(void *opaque)
801 SheepdogObjRsp rsp;
802 BDRVSheepdogState *s = opaque;
803 int fd = s->fd;
804 int ret;
805 AIOReq *aio_req = NULL;
806 SheepdogAIOCB *acb;
807 int rest;
808 unsigned long idx;
810 if (QLIST_EMPTY(&s->outstanding_aio_head)) {
811 return;
814 /* read a header */
815 ret = do_read(fd, &rsp, sizeof(rsp));
816 if (ret) {
817 error_report("failed to get the header, %s\n", strerror(errno));
818 return;
821 /* find the right aio_req from the outstanding_aio list */
822 QLIST_FOREACH(aio_req, &s->outstanding_aio_head, outstanding_aio_siblings) {
823 if (aio_req->id == rsp.id) {
824 break;
827 if (!aio_req) {
828 error_report("cannot find aio_req %x\n", rsp.id);
829 return;
832 acb = aio_req->aiocb;
834 switch (acb->aiocb_type) {
835 case AIOCB_WRITE_UDATA:
836 if (!is_data_obj(aio_req->oid)) {
837 break;
839 idx = data_oid_to_idx(aio_req->oid);
841 if (s->inode.data_vdi_id[idx] != s->inode.vdi_id) {
843 * If the object is newly created one, we need to update
844 * the vdi object (metadata object). min_dirty_data_idx
845 * and max_dirty_data_idx are changed to include updated
846 * index between them.
848 s->inode.data_vdi_id[idx] = s->inode.vdi_id;
849 s->max_dirty_data_idx = MAX(idx, s->max_dirty_data_idx);
850 s->min_dirty_data_idx = MIN(idx, s->min_dirty_data_idx);
853 * Some requests may be blocked because simultaneous
854 * create requests are not allowed, so we search the
855 * pending requests here.
857 send_pending_req(s, vid_to_data_oid(s->inode.vdi_id, idx), rsp.id);
859 break;
860 case AIOCB_READ_UDATA:
861 ret = do_readv(fd, acb->qiov->iov, rsp.data_length,
862 aio_req->iov_offset);
863 if (ret) {
864 error_report("failed to get the data, %s\n", strerror(errno));
865 return;
867 break;
870 if (rsp.result != SD_RES_SUCCESS) {
871 acb->ret = -EIO;
872 error_report("%s\n", sd_strerror(rsp.result));
875 rest = free_aio_req(s, aio_req);
876 if (!rest) {
878 * We've finished all requests which belong to the AIOCB, so
879 * we can call the callback now.
881 acb->aio_done_func(acb);
885 static int aio_flush_request(void *opaque)
887 BDRVSheepdogState *s = opaque;
889 return !QLIST_EMPTY(&s->outstanding_aio_head);
/*
 * Set the TCP_CORK option of `fd' to `v'.  Where SOL_TCP or TCP_CORK is
 * not available this does nothing and reports success.
 */
#if defined(SOL_TCP) && defined(TCP_CORK)

static int set_cork(int fd, int v)
{
    return setsockopt(fd, SOL_TCP, TCP_CORK, &v, sizeof(v));
}

#else

static int set_cork(int fd, int v)
{
    return 0;
}

#endif
/* Turn on TCP_NODELAY for `fd'; returns the result of setsockopt(). */
static int set_nodelay(int fd)
{
    int enable = 1;

    return setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char *)&enable,
                      sizeof(enable));
}
918 * Return a socket discriptor to read/write objects.
920 * We cannot use this discriptor for other operations because
921 * the block driver may be on waiting response from the server.
923 static int get_sheep_fd(BDRVSheepdogState *s)
925 int ret, fd;
927 fd = connect_to_sdog(s->addr, s->port);
928 if (fd < 0) {
929 error_report("%s\n", strerror(errno));
930 return -1;
933 socket_set_nonblock(fd);
935 ret = set_nodelay(fd);
936 if (ret) {
937 error_report("%s\n", strerror(errno));
938 closesocket(fd);
939 return -1;
942 qemu_aio_set_fd_handler(fd, aio_read_response, NULL, aio_flush_request,
943 NULL, s);
944 return fd;
948 * Parse a filename
950 * filename must be one of the following formats:
951 * 1. [vdiname]
952 * 2. [vdiname]:[snapid]
953 * 3. [vdiname]:[tag]
954 * 4. [hostname]:[port]:[vdiname]
955 * 5. [hostname]:[port]:[vdiname]:[snapid]
956 * 6. [hostname]:[port]:[vdiname]:[tag]
958 * You can boot from the snapshot images by specifying `snapid` or
959 * `tag'.
961 * You can run VMs outside the Sheepdog cluster by specifying
962 * `hostname' and `port' (experimental).
964 static int parse_vdiname(BDRVSheepdogState *s, const char *filename,
965 char *vdi, uint32_t *snapid, char *tag)
967 char *p, *q;
968 int nr_sep;
970 p = q = qemu_strdup(filename);
972 /* count the number of separators */
973 nr_sep = 0;
974 while (*p) {
975 if (*p == ':') {
976 nr_sep++;
978 p++;
980 p = q;
982 /* use the first two tokens as hostname and port number. */
983 if (nr_sep >= 2) {
984 s->addr = p;
985 p = strchr(p, ':');
986 *p++ = '\0';
988 s->port = p;
989 p = strchr(p, ':');
990 *p++ = '\0';
991 } else {
992 s->addr = NULL;
993 s->port = 0;
996 strncpy(vdi, p, SD_MAX_VDI_LEN);
998 p = strchr(vdi, ':');
999 if (p) {
1000 *p++ = '\0';
1001 *snapid = strtoul(p, NULL, 10);
1002 if (*snapid == 0) {
1003 strncpy(tag, p, SD_MAX_VDI_TAG_LEN);
1005 } else {
1006 *snapid = CURRENT_VDI_ID; /* search current vdi */
1009 if (s->addr == NULL) {
1010 qemu_free(q);
1013 return 0;
1016 static int find_vdi_name(BDRVSheepdogState *s, char *filename, uint32_t snapid,
1017 char *tag, uint32_t *vid, int for_snapshot)
1019 int ret, fd;
1020 SheepdogVdiReq hdr;
1021 SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
1022 unsigned int wlen, rlen = 0;
1023 char buf[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN];
1025 fd = connect_to_sdog(s->addr, s->port);
1026 if (fd < 0) {
1027 return -1;
1030 memset(buf, 0, sizeof(buf));
1031 strncpy(buf, filename, SD_MAX_VDI_LEN);
1032 strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN);
1034 memset(&hdr, 0, sizeof(hdr));
1035 if (for_snapshot) {
1036 hdr.opcode = SD_OP_GET_VDI_INFO;
1037 } else {
1038 hdr.opcode = SD_OP_LOCK_VDI;
1040 wlen = SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN;
1041 hdr.proto_ver = SD_PROTO_VER;
1042 hdr.data_length = wlen;
1043 hdr.snapid = snapid;
1044 hdr.flags = SD_FLAG_CMD_WRITE;
1046 ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
1047 if (ret) {
1048 ret = -1;
1049 goto out;
1052 if (rsp->result != SD_RES_SUCCESS) {
1053 error_report("cannot get vdi info, %s, %s %d %s\n",
1054 sd_strerror(rsp->result), filename, snapid, tag);
1055 ret = -1;
1056 goto out;
1058 *vid = rsp->vdi_id;
1060 ret = 0;
1061 out:
1062 closesocket(fd);
1063 return ret;
1066 static int add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
1067 struct iovec *iov, int niov, int create,
1068 enum AIOCBState aiocb_type)
1070 int nr_copies = s->inode.nr_copies;
1071 SheepdogObjReq hdr;
1072 unsigned int wlen;
1073 int ret;
1074 uint64_t oid = aio_req->oid;
1075 unsigned int datalen = aio_req->data_len;
1076 uint64_t offset = aio_req->offset;
1077 uint8_t flags = aio_req->flags;
1078 uint64_t old_oid = aio_req->base_oid;
1080 if (!nr_copies) {
1081 error_report("bug\n");
1084 memset(&hdr, 0, sizeof(hdr));
1086 if (aiocb_type == AIOCB_READ_UDATA) {
1087 wlen = 0;
1088 hdr.opcode = SD_OP_READ_OBJ;
1089 hdr.flags = flags;
1090 } else if (create) {
1091 wlen = datalen;
1092 hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ;
1093 hdr.flags = SD_FLAG_CMD_WRITE | flags;
1094 } else {
1095 wlen = datalen;
1096 hdr.opcode = SD_OP_WRITE_OBJ;
1097 hdr.flags = SD_FLAG_CMD_WRITE | flags;
1100 hdr.oid = oid;
1101 hdr.cow_oid = old_oid;
1102 hdr.copies = s->inode.nr_copies;
1104 hdr.data_length = datalen;
1105 hdr.offset = offset;
1107 hdr.id = aio_req->id;
1109 set_cork(s->fd, 1);
1111 /* send a header */
1112 ret = do_write(s->fd, &hdr, sizeof(hdr));
1113 if (ret) {
1114 error_report("failed to send a req, %s\n", strerror(errno));
1115 return -EIO;
1118 if (wlen) {
1119 ret = do_writev(s->fd, iov, wlen, aio_req->iov_offset);
1120 if (ret) {
1121 error_report("failed to send a data, %s\n", strerror(errno));
1122 return -EIO;
1126 set_cork(s->fd, 0);
1128 return 0;
1131 static int read_write_object(int fd, char *buf, uint64_t oid, int copies,
1132 unsigned int datalen, uint64_t offset,
1133 int write, int create)
1135 SheepdogObjReq hdr;
1136 SheepdogObjRsp *rsp = (SheepdogObjRsp *)&hdr;
1137 unsigned int wlen, rlen;
1138 int ret;
1140 memset(&hdr, 0, sizeof(hdr));
1142 if (write) {
1143 wlen = datalen;
1144 rlen = 0;
1145 hdr.flags = SD_FLAG_CMD_WRITE;
1146 if (create) {
1147 hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ;
1148 } else {
1149 hdr.opcode = SD_OP_WRITE_OBJ;
1151 } else {
1152 wlen = 0;
1153 rlen = datalen;
1154 hdr.opcode = SD_OP_READ_OBJ;
1156 hdr.oid = oid;
1157 hdr.data_length = datalen;
1158 hdr.offset = offset;
1159 hdr.copies = copies;
1161 ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
1162 if (ret) {
1163 error_report("failed to send a request to the sheep\n");
1164 return -1;
1167 switch (rsp->result) {
1168 case SD_RES_SUCCESS:
1169 return 0;
1170 default:
1171 error_report("%s\n", sd_strerror(rsp->result));
1172 return -1;
/* Read `datalen' bytes at `offset' of object `oid' into `buf'. */
static int read_object(int fd, char *buf, uint64_t oid, int copies,
                       unsigned int datalen, uint64_t offset)
{
    const int is_write = 0, do_create = 0;

    return read_write_object(fd, buf, oid, copies, datalen, offset,
                             is_write, do_create);
}
/*
 * Write `datalen' bytes from `buf' at `offset' of object `oid'; with
 * `create' set the object is created by the same request.
 */
static int write_object(int fd, char *buf, uint64_t oid, int copies,
                        unsigned int datalen, uint64_t offset, int create)
{
    const int is_write = 1;

    return read_write_object(fd, buf, oid, copies, datalen, offset,
                             is_write, create);
}
1188 static int sd_open(BlockDriverState *bs, const char *filename, int flags)
1190 int ret, fd;
1191 uint32_t vid = 0;
1192 BDRVSheepdogState *s = bs->opaque;
1193 char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
1194 uint32_t snapid;
1195 char *buf = NULL;
1197 strstart(filename, "sheepdog:", (const char **)&filename);
1199 QLIST_INIT(&s->outstanding_aio_head);
1200 s->fd = -1;
1202 memset(vdi, 0, sizeof(vdi));
1203 memset(tag, 0, sizeof(tag));
1204 if (parse_vdiname(s, filename, vdi, &snapid, tag) < 0) {
1205 goto out;
1207 s->fd = get_sheep_fd(s);
1208 if (s->fd < 0) {
1209 goto out;
1212 ret = find_vdi_name(s, vdi, snapid, tag, &vid, 0);
1213 if (ret) {
1214 goto out;
1217 if (snapid) {
1218 dprintf("%" PRIx32 " snapshot inode was open.\n", vid);
1219 s->is_snapshot = 1;
1222 fd = connect_to_sdog(s->addr, s->port);
1223 if (fd < 0) {
1224 error_report("failed to connect\n");
1225 goto out;
1228 buf = qemu_malloc(SD_INODE_SIZE);
1229 ret = read_object(fd, buf, vid_to_vdi_oid(vid), 0, SD_INODE_SIZE, 0);
1231 closesocket(fd);
1233 if (ret) {
1234 goto out;
1237 memcpy(&s->inode, buf, sizeof(s->inode));
1238 s->min_dirty_data_idx = UINT32_MAX;
1239 s->max_dirty_data_idx = 0;
1241 bs->total_sectors = s->inode.vdi_size / SECTOR_SIZE;
1242 strncpy(s->name, vdi, sizeof(s->name));
1243 qemu_free(buf);
1244 return 0;
1245 out:
1246 qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL, NULL, NULL);
1247 if (s->fd >= 0) {
1248 closesocket(s->fd);
1250 qemu_free(buf);
1251 return -1;
1254 static int do_sd_create(char *filename, int64_t vdi_size,
1255 uint32_t base_vid, uint32_t *vdi_id, int snapshot,
1256 const char *addr, const char *port)
1258 SheepdogVdiReq hdr;
1259 SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
1260 int fd, ret;
1261 unsigned int wlen, rlen = 0;
1262 char buf[SD_MAX_VDI_LEN];
1264 fd = connect_to_sdog(addr, port);
1265 if (fd < 0) {
1266 return -EIO;
1269 memset(buf, 0, sizeof(buf));
1270 strncpy(buf, filename, SD_MAX_VDI_LEN);
1272 memset(&hdr, 0, sizeof(hdr));
1273 hdr.opcode = SD_OP_NEW_VDI;
1274 hdr.base_vdi_id = base_vid;
1276 wlen = SD_MAX_VDI_LEN;
1278 hdr.flags = SD_FLAG_CMD_WRITE;
1279 hdr.snapid = snapshot;
1281 hdr.data_length = wlen;
1282 hdr.vdi_size = vdi_size;
1284 ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
1286 closesocket(fd);
1288 if (ret) {
1289 return -EIO;
1292 if (rsp->result != SD_RES_SUCCESS) {
1293 error_report("%s, %s\n", sd_strerror(rsp->result), filename);
1294 return -EIO;
1297 if (vdi_id) {
1298 *vdi_id = rsp->vdi_id;
1301 return 0;
1304 static int sd_create(const char *filename, QEMUOptionParameter *options)
1306 int ret;
1307 uint32_t vid = 0;
1308 int64_t vdi_size = 0;
1309 char *backing_file = NULL;
1311 strstart(filename, "sheepdog:", (const char **)&filename);
1313 while (options && options->name) {
1314 if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
1315 vdi_size = options->value.n;
1316 } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
1317 backing_file = options->value.s;
1319 options++;
1322 if (vdi_size > SD_MAX_VDI_SIZE) {
1323 error_report("too big image size\n");
1324 return -EINVAL;
1327 if (backing_file) {
1328 BlockDriverState *bs;
1329 BDRVSheepdogState *s;
1330 BlockDriver *drv;
1332 /* Currently, only Sheepdog backing image is supported. */
1333 drv = bdrv_find_protocol(backing_file);
1334 if (!drv || strcmp(drv->protocol_name, "sheepdog") != 0) {
1335 error_report("backing_file must be a sheepdog image\n");
1336 return -EINVAL;
1339 ret = bdrv_file_open(&bs, backing_file, 0);
1340 if (ret < 0)
1341 return -EIO;
1343 s = bs->opaque;
1345 if (!is_snapshot(&s->inode)) {
1346 error_report("cannot clone from a non snapshot vdi\n");
1347 bdrv_delete(bs);
1348 return -EINVAL;
1351 vid = s->inode.vdi_id;
1352 bdrv_delete(bs);
1355 return do_sd_create((char *)filename, vdi_size, vid, NULL, 0, NULL, NULL);
1358 static void sd_close(BlockDriverState *bs)
1360 BDRVSheepdogState *s = bs->opaque;
1361 SheepdogVdiReq hdr;
1362 SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
1363 unsigned int wlen, rlen = 0;
1364 int fd, ret;
1366 dprintf("%s\n", s->name);
1368 fd = connect_to_sdog(s->addr, s->port);
1369 if (fd < 0) {
1370 return;
1373 memset(&hdr, 0, sizeof(hdr));
1375 hdr.opcode = SD_OP_RELEASE_VDI;
1376 wlen = strlen(s->name) + 1;
1377 hdr.data_length = wlen;
1378 hdr.flags = SD_FLAG_CMD_WRITE;
1380 ret = do_req(fd, (SheepdogReq *)&hdr, s->name, &wlen, &rlen);
1382 closesocket(fd);
1384 if (!ret && rsp->result != SD_RES_SUCCESS &&
1385 rsp->result != SD_RES_VDI_NOT_LOCKED) {
1386 error_report("%s, %s\n", sd_strerror(rsp->result), s->name);
1389 qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL, NULL, NULL);
1390 closesocket(s->fd);
1391 qemu_free(s->addr);
1394 static int64_t sd_getlength(BlockDriverState *bs)
1396 BDRVSheepdogState *s = bs->opaque;
1398 return s->inode.vdi_size;
1401 static int sd_truncate(BlockDriverState *bs, int64_t offset)
1403 BDRVSheepdogState *s = bs->opaque;
1404 int ret, fd;
1405 unsigned int datalen;
1407 if (offset < s->inode.vdi_size) {
1408 error_report("shrinking is not supported\n");
1409 return -EINVAL;
1410 } else if (offset > SD_MAX_VDI_SIZE) {
1411 error_report("too big image size\n");
1412 return -EINVAL;
1415 fd = connect_to_sdog(s->addr, s->port);
1416 if (fd < 0) {
1417 return -EIO;
1420 /* we don't need to update entire object */
1421 datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
1422 s->inode.vdi_size = offset;
1423 ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id),
1424 s->inode.nr_copies, datalen, 0, 0);
1425 close(fd);
1427 if (ret < 0) {
1428 error_report("failed to update an inode.\n");
1429 return -EIO;
1432 return 0;
1436 * This function is called after writing data objects. If we need to
1437 * update metadata, this sends a write request to the vdi object.
1438 * Otherwise, this calls the AIOCB callback.
1440 static void sd_write_done(SheepdogAIOCB *acb)
1442 int ret;
1443 BDRVSheepdogState *s = acb->common.bs->opaque;
1444 struct iovec iov;
1445 AIOReq *aio_req;
1446 uint32_t offset, data_len, mn, mx;
1448 mn = s->min_dirty_data_idx;
1449 mx = s->max_dirty_data_idx;
1450 if (mn <= mx) {
1451 /* we need to update the vdi object. */
1452 offset = sizeof(s->inode) - sizeof(s->inode.data_vdi_id) +
1453 mn * sizeof(s->inode.data_vdi_id[0]);
1454 data_len = (mx - mn + 1) * sizeof(s->inode.data_vdi_id[0]);
1456 s->min_dirty_data_idx = UINT32_MAX;
1457 s->max_dirty_data_idx = 0;
1459 iov.iov_base = &s->inode;
1460 iov.iov_len = sizeof(s->inode);
1461 aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id),
1462 data_len, offset, 0, 0, offset);
1463 ret = add_aio_request(s, aio_req, &iov, 1, 0, AIOCB_WRITE_UDATA);
1464 if (ret) {
1465 free_aio_req(s, aio_req);
1466 acb->ret = -EIO;
1467 goto out;
1470 acb->aio_done_func = sd_finish_aiocb;
1471 acb->aiocb_type = AIOCB_WRITE_UDATA;
1472 return;
1474 out:
1475 sd_finish_aiocb(acb);
1479 * Create a writable VDI from a snapshot
1481 static int sd_create_branch(BDRVSheepdogState *s)
1483 int ret, fd;
1484 uint32_t vid;
1485 char *buf;
1487 dprintf("%" PRIx32 " is snapshot.\n", s->inode.vdi_id);
1489 buf = qemu_malloc(SD_INODE_SIZE);
1491 ret = do_sd_create(s->name, s->inode.vdi_size, s->inode.vdi_id, &vid, 1,
1492 s->addr, s->port);
1493 if (ret) {
1494 goto out;
1497 dprintf("%" PRIx32 " is created.\n", vid);
1499 fd = connect_to_sdog(s->addr, s->port);
1500 if (fd < 0) {
1501 error_report("failed to connect\n");
1502 goto out;
1505 ret = read_object(fd, buf, vid_to_vdi_oid(vid), s->inode.nr_copies,
1506 SD_INODE_SIZE, 0);
1508 closesocket(fd);
1510 if (ret < 0) {
1511 goto out;
1514 memcpy(&s->inode, buf, sizeof(s->inode));
1516 s->is_snapshot = 0;
1517 ret = 0;
1518 dprintf("%" PRIx32 " was newly created.\n", s->inode.vdi_id);
1520 out:
1521 qemu_free(buf);
1523 return ret;
1527 * Send I/O requests to the server.
1529 * This function sends requests to the server, links the requests to
1530 * the outstanding_list in BDRVSheepdogState, and exits without
1531 * waiting the response. The responses are received in the
1532 * `aio_read_response' function which is called from the main loop as
1533 * a fd handler.
1535 static void sd_readv_writev_bh_cb(void *p)
1537 SheepdogAIOCB *acb = p;
1538 int ret = 0;
1539 unsigned long len, done = 0, total = acb->nb_sectors * SECTOR_SIZE;
1540 unsigned long idx = acb->sector_num * SECTOR_SIZE / SD_DATA_OBJ_SIZE;
1541 uint64_t oid;
1542 uint64_t offset = (acb->sector_num * SECTOR_SIZE) % SD_DATA_OBJ_SIZE;
1543 BDRVSheepdogState *s = acb->common.bs->opaque;
1544 SheepdogInode *inode = &s->inode;
1545 AIOReq *aio_req;
1547 qemu_bh_delete(acb->bh);
1548 acb->bh = NULL;
1550 if (acb->aiocb_type == AIOCB_WRITE_UDATA && s->is_snapshot) {
1552 * In the case we open the snapshot VDI, Sheepdog creates the
1553 * writable VDI when we do a write operation first.
1555 ret = sd_create_branch(s);
1556 if (ret) {
1557 acb->ret = -EIO;
1558 goto out;
1562 while (done != total) {
1563 uint8_t flags = 0;
1564 uint64_t old_oid = 0;
1565 int create = 0;
1567 oid = vid_to_data_oid(inode->data_vdi_id[idx], idx);
1569 len = MIN(total - done, SD_DATA_OBJ_SIZE - offset);
1571 if (!inode->data_vdi_id[idx]) {
1572 if (acb->aiocb_type == AIOCB_READ_UDATA) {
1573 goto done;
1576 create = 1;
1577 } else if (acb->aiocb_type == AIOCB_WRITE_UDATA
1578 && !is_data_obj_writeable(inode, idx)) {
1579 /* Copy-On-Write */
1580 create = 1;
1581 old_oid = oid;
1582 flags = SD_FLAG_CMD_COW;
1585 if (create) {
1586 dprintf("update ino (%" PRIu32") %" PRIu64 " %" PRIu64
1587 " %" PRIu64 "\n", inode->vdi_id, oid,
1588 vid_to_data_oid(inode->data_vdi_id[idx], idx), idx);
1589 oid = vid_to_data_oid(inode->vdi_id, idx);
1590 dprintf("new oid %lx\n", oid);
1593 aio_req = alloc_aio_req(s, acb, oid, len, offset, flags, old_oid, done);
1595 if (create) {
1596 AIOReq *areq;
1597 QLIST_FOREACH(areq, &s->outstanding_aio_head,
1598 outstanding_aio_siblings) {
1599 if (areq == aio_req) {
1600 continue;
1602 if (areq->oid == oid) {
1604 * Sheepdog cannot handle simultaneous create
1605 * requests to the same object. So we cannot send
1606 * the request until the previous request
1607 * finishes.
1609 aio_req->flags = 0;
1610 aio_req->base_oid = 0;
1611 goto done;
1616 ret = add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
1617 create, acb->aiocb_type);
1618 if (ret < 0) {
1619 error_report("add_aio_request is failed\n");
1620 free_aio_req(s, aio_req);
1621 acb->ret = -EIO;
1622 goto out;
1624 done:
1625 offset = 0;
1626 idx++;
1627 done += len;
1629 out:
1630 if (QLIST_EMPTY(&acb->aioreq_head)) {
1631 sd_finish_aiocb(acb);
1635 static BlockDriverAIOCB *sd_aio_writev(BlockDriverState *bs, int64_t sector_num,
1636 QEMUIOVector *qiov, int nb_sectors,
1637 BlockDriverCompletionFunc *cb,
1638 void *opaque)
1640 SheepdogAIOCB *acb;
1642 if (bs->growable && sector_num + nb_sectors > bs->total_sectors) {
1643 /* TODO: shouldn't block here */
1644 if (sd_truncate(bs, (sector_num + nb_sectors) * SECTOR_SIZE) < 0) {
1645 return NULL;
1647 bs->total_sectors = sector_num + nb_sectors;
1650 acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors, cb, opaque);
1651 acb->aio_done_func = sd_write_done;
1652 acb->aiocb_type = AIOCB_WRITE_UDATA;
1654 sd_schedule_bh(sd_readv_writev_bh_cb, acb);
1655 return &acb->common;
1658 static BlockDriverAIOCB *sd_aio_readv(BlockDriverState *bs, int64_t sector_num,
1659 QEMUIOVector *qiov, int nb_sectors,
1660 BlockDriverCompletionFunc *cb,
1661 void *opaque)
1663 SheepdogAIOCB *acb;
1664 int i;
1666 acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors, cb, opaque);
1667 acb->aiocb_type = AIOCB_READ_UDATA;
1668 acb->aio_done_func = sd_finish_aiocb;
1671 * TODO: we can do better; we don't need to initialize
1672 * blindly.
1674 for (i = 0; i < qiov->niov; i++) {
1675 memset(qiov->iov[i].iov_base, 0, qiov->iov[i].iov_len);
1678 sd_schedule_bh(sd_readv_writev_bh_cb, acb);
1679 return &acb->common;
1682 static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
1684 BDRVSheepdogState *s = bs->opaque;
1685 int ret, fd;
1686 uint32_t new_vid;
1687 SheepdogInode *inode;
1688 unsigned int datalen;
1690 dprintf("sn_info: name %s id_str %s s: name %s vm_state_size %d "
1691 "is_snapshot %d\n", sn_info->name, sn_info->id_str,
1692 s->name, sn_info->vm_state_size, s->is_snapshot);
1694 if (s->is_snapshot) {
1695 error_report("You can't create a snapshot of a snapshot VDI, "
1696 "%s (%" PRIu32 ").\n", s->name, s->inode.vdi_id);
1698 return -EINVAL;
1701 dprintf("%s %s\n", sn_info->name, sn_info->id_str);
1703 s->inode.vm_state_size = sn_info->vm_state_size;
1704 s->inode.vm_clock_nsec = sn_info->vm_clock_nsec;
1705 strncpy(s->inode.tag, sn_info->name, sizeof(s->inode.tag));
1706 /* we don't need to update entire object */
1707 datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
1709 /* refresh inode. */
1710 fd = connect_to_sdog(s->addr, s->port);
1711 if (fd < 0) {
1712 ret = -EIO;
1713 goto cleanup;
1716 ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id),
1717 s->inode.nr_copies, datalen, 0, 0);
1718 if (ret < 0) {
1719 error_report("failed to write snapshot's inode.\n");
1720 ret = -EIO;
1721 goto cleanup;
1724 ret = do_sd_create(s->name, s->inode.vdi_size, s->inode.vdi_id, &new_vid, 1,
1725 s->addr, s->port);
1726 if (ret < 0) {
1727 error_report("failed to create inode for snapshot. %s\n",
1728 strerror(errno));
1729 ret = -EIO;
1730 goto cleanup;
1733 inode = (SheepdogInode *)qemu_malloc(datalen);
1735 ret = read_object(fd, (char *)inode, vid_to_vdi_oid(new_vid),
1736 s->inode.nr_copies, datalen, 0);
1738 if (ret < 0) {
1739 error_report("failed to read new inode info. %s\n", strerror(errno));
1740 ret = -EIO;
1741 goto cleanup;
1744 memcpy(&s->inode, inode, datalen);
1745 dprintf("s->inode: name %s snap_id %x oid %x\n",
1746 s->inode.name, s->inode.snap_id, s->inode.vdi_id);
1748 cleanup:
1749 closesocket(fd);
1750 return ret;
1753 static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
1755 BDRVSheepdogState *s = bs->opaque;
1756 BDRVSheepdogState *old_s;
1757 char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
1758 char *buf = NULL;
1759 uint32_t vid;
1760 uint32_t snapid = 0;
1761 int ret = -ENOENT, fd;
1763 old_s = qemu_malloc(sizeof(BDRVSheepdogState));
1765 memcpy(old_s, s, sizeof(BDRVSheepdogState));
1767 memset(vdi, 0, sizeof(vdi));
1768 strncpy(vdi, s->name, sizeof(vdi));
1770 memset(tag, 0, sizeof(tag));
1771 snapid = strtoul(snapshot_id, NULL, 10);
1772 if (!snapid) {
1773 strncpy(tag, s->name, sizeof(tag));
1776 ret = find_vdi_name(s, vdi, snapid, tag, &vid, 1);
1777 if (ret) {
1778 error_report("Failed to find_vdi_name\n");
1779 ret = -ENOENT;
1780 goto out;
1783 fd = connect_to_sdog(s->addr, s->port);
1784 if (fd < 0) {
1785 error_report("failed to connect\n");
1786 goto out;
1789 buf = qemu_malloc(SD_INODE_SIZE);
1790 ret = read_object(fd, buf, vid_to_vdi_oid(vid), s->inode.nr_copies,
1791 SD_INODE_SIZE, 0);
1793 closesocket(fd);
1795 if (ret) {
1796 ret = -ENOENT;
1797 goto out;
1800 memcpy(&s->inode, buf, sizeof(s->inode));
1802 if (!s->inode.vm_state_size) {
1803 error_report("Invalid snapshot\n");
1804 ret = -ENOENT;
1805 goto out;
1808 s->is_snapshot = 1;
1810 qemu_free(buf);
1811 qemu_free(old_s);
1813 return 0;
1814 out:
1815 /* recover bdrv_sd_state */
1816 memcpy(s, old_s, sizeof(BDRVSheepdogState));
1817 qemu_free(buf);
1818 qemu_free(old_s);
1820 error_report("failed to open. recover old bdrv_sd_state.\n");
1822 return ret;
1825 static int sd_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
1827 /* FIXME: Delete specified snapshot id. */
1828 return 0;
/* Bitmap helpers: a bitmap is an array of unsigned long words. */
#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
#define BITS_PER_BYTE 8
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long))
#define DECLARE_BITMAP(name,bits) \
    unsigned long name[BITS_TO_LONGS(bits)]

#define BITS_PER_LONG (BITS_PER_BYTE * sizeof(long))

/* Return 1 when bit `nr' of the bitmap at `addr' is set, 0 otherwise. */
static inline int test_bit(unsigned int nr, const unsigned long *addr)
{
    const unsigned long word = addr[nr / BITS_PER_LONG];
    const unsigned long mask = 1UL << (nr % BITS_PER_LONG);

    return (word & mask) != 0;
}
1845 static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
1847 BDRVSheepdogState *s = bs->opaque;
1848 SheepdogReq req;
1849 int fd, nr = 1024, ret, max = BITS_TO_LONGS(SD_NR_VDIS) * sizeof(long);
1850 QEMUSnapshotInfo *sn_tab = NULL;
1851 unsigned wlen, rlen;
1852 int found = 0;
1853 static SheepdogInode inode;
1854 unsigned long *vdi_inuse;
1855 unsigned int start_nr;
1856 uint64_t hval;
1857 uint32_t vid;
1859 vdi_inuse = qemu_malloc(max);
1861 fd = connect_to_sdog(s->addr, s->port);
1862 if (fd < 0) {
1863 goto out;
1866 rlen = max;
1867 wlen = 0;
1869 memset(&req, 0, sizeof(req));
1871 req.opcode = SD_OP_READ_VDIS;
1872 req.data_length = max;
1874 ret = do_req(fd, (SheepdogReq *)&req, vdi_inuse, &wlen, &rlen);
1876 closesocket(fd);
1877 if (ret) {
1878 goto out;
1881 sn_tab = qemu_mallocz(nr * sizeof(*sn_tab));
1883 /* calculate a vdi id with hash function */
1884 hval = fnv_64a_buf(s->name, strlen(s->name), FNV1A_64_INIT);
1885 start_nr = hval & (SD_NR_VDIS - 1);
1887 fd = connect_to_sdog(s->addr, s->port);
1888 if (fd < 0) {
1889 error_report("failed to connect\n");
1890 goto out;
1893 for (vid = start_nr; found < nr; vid = (vid + 1) % SD_NR_VDIS) {
1894 if (!test_bit(vid, vdi_inuse)) {
1895 break;
1898 /* we don't need to read entire object */
1899 ret = read_object(fd, (char *)&inode, vid_to_vdi_oid(vid),
1900 0, SD_INODE_SIZE - sizeof(inode.data_vdi_id), 0);
1902 if (ret) {
1903 continue;
1906 if (!strcmp(inode.name, s->name) && is_snapshot(&inode)) {
1907 sn_tab[found].date_sec = inode.snap_ctime >> 32;
1908 sn_tab[found].date_nsec = inode.snap_ctime & 0xffffffff;
1909 sn_tab[found].vm_state_size = inode.vm_state_size;
1910 sn_tab[found].vm_clock_nsec = inode.vm_clock_nsec;
1912 snprintf(sn_tab[found].id_str, sizeof(sn_tab[found].id_str), "%u",
1913 inode.snap_id);
1914 strncpy(sn_tab[found].name, inode.tag,
1915 MIN(sizeof(sn_tab[found].name), sizeof(inode.tag)));
1916 found++;
1920 closesocket(fd);
1921 out:
1922 *psn_tab = sn_tab;
1924 qemu_free(vdi_inuse);
1926 return found;
1929 static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
1930 int64_t pos, int size, int load)
1932 int fd, create;
1933 int ret = 0;
1934 unsigned int data_len;
1935 uint64_t vmstate_oid;
1936 uint32_t vdi_index;
1937 uint64_t offset;
1939 fd = connect_to_sdog(s->addr, s->port);
1940 if (fd < 0) {
1941 ret = -EIO;
1942 goto cleanup;
1945 while (size) {
1946 vdi_index = pos / SD_DATA_OBJ_SIZE;
1947 offset = pos % SD_DATA_OBJ_SIZE;
1949 data_len = MIN(size, SD_DATA_OBJ_SIZE);
1951 vmstate_oid = vid_to_vmstate_oid(s->inode.vdi_id, vdi_index);
1953 create = (offset == 0);
1954 if (load) {
1955 ret = read_object(fd, (char *)data, vmstate_oid,
1956 s->inode.nr_copies, data_len, offset);
1957 } else {
1958 ret = write_object(fd, (char *)data, vmstate_oid,
1959 s->inode.nr_copies, data_len, offset, create);
1962 if (ret < 0) {
1963 error_report("failed to save vmstate %s\n", strerror(errno));
1964 ret = -EIO;
1965 goto cleanup;
1968 pos += data_len;
1969 size -= data_len;
1970 ret += data_len;
1972 cleanup:
1973 closesocket(fd);
1974 return ret;
1977 static int sd_save_vmstate(BlockDriverState *bs, const uint8_t *data,
1978 int64_t pos, int size)
1980 BDRVSheepdogState *s = bs->opaque;
1982 return do_load_save_vmstate(s, (uint8_t *)data, pos, size, 0);
1985 static int sd_load_vmstate(BlockDriverState *bs, uint8_t *data,
1986 int64_t pos, int size)
1988 BDRVSheepdogState *s = bs->opaque;
1990 return do_load_save_vmstate(s, data, pos, size, 1);
1994 static QEMUOptionParameter sd_create_options[] = {
1996 .name = BLOCK_OPT_SIZE,
1997 .type = OPT_SIZE,
1998 .help = "Virtual disk size"
2001 .name = BLOCK_OPT_BACKING_FILE,
2002 .type = OPT_STRING,
2003 .help = "File name of a base image"
2005 { NULL }
2008 BlockDriver bdrv_sheepdog = {
2009 .format_name = "sheepdog",
2010 .protocol_name = "sheepdog",
2011 .instance_size = sizeof(BDRVSheepdogState),
2012 .bdrv_file_open = sd_open,
2013 .bdrv_close = sd_close,
2014 .bdrv_create = sd_create,
2015 .bdrv_getlength = sd_getlength,
2016 .bdrv_truncate = sd_truncate,
2018 .bdrv_aio_readv = sd_aio_readv,
2019 .bdrv_aio_writev = sd_aio_writev,
2021 .bdrv_snapshot_create = sd_snapshot_create,
2022 .bdrv_snapshot_goto = sd_snapshot_goto,
2023 .bdrv_snapshot_delete = sd_snapshot_delete,
2024 .bdrv_snapshot_list = sd_snapshot_list,
2026 .bdrv_save_vmstate = sd_save_vmstate,
2027 .bdrv_load_vmstate = sd_load_vmstate,
2029 .create_options = sd_create_options,
2032 static void bdrv_sheepdog_init(void)
2034 bdrv_register(&bdrv_sheepdog);
2036 block_init(bdrv_sheepdog_init);