/* nbdkit
 * Copyright (C) 2013-2019 Red Hat Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 * * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * * Neither the name of Red Hat nor the names of its contributors may be
 * used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include "byte-swapping.h"
#include "nbd-protocol.h"
#include "protostrings.h"
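
/* For reference, the request packet ('struct nbd_request' in
 * nbd-protocol.h) is laid out on the wire as follows, all integer
 * fields big-endian (this summary follows the NBD protocol spec):
 *
 *   offset 0:  32-bit magic, NBD_REQUEST_MAGIC (0x25609513)
 *   offset 4:  16-bit command flags (NBD_CMD_FLAG_*)
 *   offset 6:  16-bit command type (NBD_CMD_*)
 *   offset 8:  64-bit opaque handle, echoed back in the reply
 *   offset 16: 64-bit offset in bytes
 *   offset 24: 32-bit count in bytes
 *
 * For example, a 4096 byte NBD_CMD_READ at offset 0 arrives as the
 * 28 bytes 25 60 95 13 | 00 00 | 00 00 | <8 byte handle> |
 * 00 00 00 00 00 00 00 00 | 00 00 10 00.
 */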

static bool
validate_request (struct connection *conn,
                  uint16_t cmd, uint16_t flags, uint64_t offset, uint32_t count,
                  uint32_t *error)
{
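  /* conn->eflags holds the export flags sent to the client during
   * negotiation.  For reference, the bits tested in this function are
   * (values per the NBD protocol spec): NBD_FLAG_READ_ONLY (bit 1),
   * NBD_FLAG_SEND_FLUSH (bit 2), NBD_FLAG_SEND_FUA (bit 3),
   * NBD_FLAG_SEND_TRIM (bit 5), NBD_FLAG_SEND_WRITE_ZEROES (bit 6)
   * and NBD_FLAG_SEND_CACHE (bit 10).
   */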

  /* Readonly connection? */
  if (conn->eflags & NBD_FLAG_READ_ONLY &&
      (cmd == NBD_CMD_WRITE || cmd == NBD_CMD_TRIM ||
       cmd == NBD_CMD_WRITE_ZEROES)) {
    nbdkit_error ("invalid request: %s: write request on readonly connection",
                  name_of_nbd_cmd (cmd));
    *error = EROFS;
    return false;
  }

  /* Validate cmd, offset, count. */
  switch (cmd) {
  case NBD_CMD_READ:
  case NBD_CMD_CACHE:
  case NBD_CMD_WRITE:
  case NBD_CMD_TRIM:
  case NBD_CMD_WRITE_ZEROES:
  case NBD_CMD_BLOCK_STATUS:
    if (!backend_valid_range (backend, conn, offset, count)) {
      /* XXX Allow writes to extend the disk? */
      nbdkit_error ("invalid request: %s: offset and count are out of range: "
                    "offset=%" PRIu64 " count=%" PRIu32,
                    name_of_nbd_cmd (cmd), offset, count);
      *error = (cmd == NBD_CMD_WRITE ||
                cmd == NBD_CMD_WRITE_ZEROES) ? ENOSPC : EINVAL;
      return false;
    }
    break;

  case NBD_CMD_FLUSH:
    if (offset != 0 || count != 0) {
      nbdkit_error ("invalid request: %s: expecting offset and count = 0",
                    name_of_nbd_cmd (cmd));
      *error = EINVAL;
      return false;
    }
    break;

  default:
    nbdkit_error ("invalid request: unknown command (%" PRIu32 ") ignored",
                  cmd);
    *error = EINVAL;
    return false;
  }
  if (flags & ~(NBD_CMD_FLAG_FUA | NBD_CMD_FLAG_NO_HOLE |
                NBD_CMD_FLAG_DF | NBD_CMD_FLAG_REQ_ONE |
                NBD_CMD_FLAG_FAST_ZERO)) {
    nbdkit_error ("invalid request: unknown flag (0x%x)", flags);
    *error = EINVAL;
    return false;
  }

  if ((flags & NBD_CMD_FLAG_NO_HOLE) &&
      cmd != NBD_CMD_WRITE_ZEROES) {
    nbdkit_error ("invalid request: NO_HOLE flag needs WRITE_ZEROES request");
    *error = EINVAL;
    return false;
  }

  if ((flags & NBD_CMD_FLAG_FAST_ZERO) &&
      cmd != NBD_CMD_WRITE_ZEROES) {
    nbdkit_error ("invalid request: "
                  "FAST_ZERO flag needs WRITE_ZEROES request");
    *error = EINVAL;
    return false;
  }

  if (flags & NBD_CMD_FLAG_DF) {
    if (cmd != NBD_CMD_READ) {
      nbdkit_error ("invalid request: DF flag needs READ request");
      *error = EINVAL;
      return false;
    }
    if (!conn->structured_replies) {
      nbdkit_error ("invalid request: "
                    "%s: structured replies were not negotiated",
                    name_of_nbd_cmd (cmd));
      *error = EINVAL;
      return false;
    }
  }

  if ((flags & NBD_CMD_FLAG_REQ_ONE) &&
      cmd != NBD_CMD_BLOCK_STATUS) {
    nbdkit_error ("invalid request: REQ_ONE flag needs BLOCK_STATUS request");
    *error = EINVAL;
    return false;
  }

  if (flags & NBD_CMD_FLAG_FUA && !(conn->eflags & NBD_FLAG_SEND_FUA)) {
    nbdkit_error ("invalid request: FUA flag not supported");
    *error = EINVAL;
    return false;
  }

  /* Refuse over-large read and write requests. */
  if ((cmd == NBD_CMD_WRITE || cmd == NBD_CMD_READ) &&
      count > MAX_REQUEST_SIZE) {
    nbdkit_error ("invalid request: %s: data request is too large (%" PRIu32
                  " > %d)",
                  name_of_nbd_cmd (cmd), count, MAX_REQUEST_SIZE);
    *error = ENOMEM;
    return false;
  }

  /* Flush allowed? */
  if (cmd == NBD_CMD_FLUSH && !(conn->eflags & NBD_FLAG_SEND_FLUSH)) {
    nbdkit_error ("invalid request: %s: flush operation not supported",
                  name_of_nbd_cmd (cmd));
    *error = EINVAL;
    return false;
  }

  /* Trim allowed? */
  if (cmd == NBD_CMD_TRIM && !(conn->eflags & NBD_FLAG_SEND_TRIM)) {
    nbdkit_error ("invalid request: %s: trim operation not supported",
                  name_of_nbd_cmd (cmd));
    *error = EINVAL;
    return false;
  }

  /* Write zeroes allowed? */
  if (cmd == NBD_CMD_WRITE_ZEROES &&
      !(conn->eflags & NBD_FLAG_SEND_WRITE_ZEROES)) {
    nbdkit_error ("invalid request: %s: write zeroes operation not supported",
                  name_of_nbd_cmd (cmd));
    *error = EINVAL;
    return false;
  }

  /* Cache allowed? */
  if (cmd == NBD_CMD_CACHE && !(conn->eflags & NBD_FLAG_SEND_CACHE)) {
    nbdkit_error ("invalid request: %s: cache operation not supported",
                  name_of_nbd_cmd (cmd));
    *error = EINVAL;
    return false;
  }

  /* Block status allowed? */
  if (cmd == NBD_CMD_BLOCK_STATUS) {
    if (!conn->structured_replies) {
      nbdkit_error ("invalid request: "
                    "%s: structured replies were not negotiated",
                    name_of_nbd_cmd (cmd));
      *error = EINVAL;
      return false;
    }
    if (!conn->meta_context_base_allocation) {
      nbdkit_error ("invalid request: "
                    "%s: base:allocation was not negotiated",
                    name_of_nbd_cmd (cmd));
      *error = EINVAL;
      return false;
    }
  }

  return true;                    /* Command validates. */
}

/* This is called with the request lock held to actually execute the
 * request (by calling the plugin).  Note that the request fields have
 * been validated already in 'validate_request' so we don't have to
 * check them again.
 *
 * 'buf' is either the data to be written or the data to be returned,
 * and points to a buffer of size 'count' bytes.
 *
 * 'extents' is an empty extents list used for block status requests
 * only.
 *
 * In all cases, the return value is the system errno value that will
 * later be converted to the nbd error to send back to the client (0
 * means no error).
 */
static uint32_t
handle_request (struct connection *conn,
                uint16_t cmd, uint16_t flags, uint64_t offset, uint32_t count,
                void *buf, struct nbdkit_extents *extents)
{
  uint32_t f = 0;
  int err = 0;

  /* Clear the error, so that we know if the plugin calls
   * nbdkit_set_error() or relies on errno. */
  threadlocal_set_error (0);

  switch (cmd) {
  case NBD_CMD_READ:
    if (backend_pread (backend, conn, buf, count, offset, 0, &err) == -1)
      return err;
    break;

  case NBD_CMD_WRITE:
    if (flags & NBD_CMD_FLAG_FUA)
      f |= NBDKIT_FLAG_FUA;
    if (backend_pwrite (backend, conn, buf, count, offset, f, &err) == -1)
      return err;
    break;

  case NBD_CMD_FLUSH:
    if (backend_flush (backend, conn, 0, &err) == -1)
      return err;
    break;

  case NBD_CMD_TRIM:
    if (flags & NBD_CMD_FLAG_FUA)
      f |= NBDKIT_FLAG_FUA;
    if (backend_trim (backend, conn, count, offset, f, &err) == -1)
      return err;
    break;

  case NBD_CMD_CACHE:
    if (backend_cache (backend, conn, count, offset, 0, &err) == -1)
      return err;
    break;

  case NBD_CMD_WRITE_ZEROES:
    if (!(flags & NBD_CMD_FLAG_NO_HOLE))
      f |= NBDKIT_FLAG_MAY_TRIM;
    if (flags & NBD_CMD_FLAG_FUA)
      f |= NBDKIT_FLAG_FUA;
    if (flags & NBD_CMD_FLAG_FAST_ZERO)
      f |= NBDKIT_FLAG_FAST_ZERO;
    if (backend_zero (backend, conn, count, offset, f, &err) == -1)
      return err;
    break;

  case NBD_CMD_BLOCK_STATUS:
    if (flags & NBD_CMD_FLAG_REQ_ONE)
      f |= NBDKIT_FLAG_REQ_ONE;
    if (backend_extents (backend, conn, count, offset, f,
                         extents, &err) == -1)
      return err;
    break;

  default:
    abort ();
  }

  return 0;
}
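
/* If a write request is rejected (or the per-thread data buffer cannot
 * be allocated) the client has nevertheless already committed to
 * sending 'count' bytes of data after the request header.  To keep the
 * stream in sync so that we can send an error reply and carry on, we
 * must read and discard those bytes.
 */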
static int
skip_over_write_buffer (int sock, size_t count)
{
  char buf[BUFSIZ];
  ssize_t r;

  if (count > MAX_REQUEST_SIZE * 2) {
    nbdkit_error ("write request too large to skip");
    return -1;
  }

  while (count > 0) {
    r = read (sock, buf, count > BUFSIZ ? BUFSIZ : count);
    if (r == -1) {
      nbdkit_error ("skipping write buffer: %m");
      return -1;
    }
    if (r == 0) {
      nbdkit_error ("unexpected early EOF");
      errno = EBADMSG;
      return -1;
    }
    count -= r;
  }
  return 0;
}

/* Convert a system errno to an NBD_E* error code. */
static int
nbd_errno (int error, uint16_t flags)
{
  switch (error) {
  case 0:
    return NBD_SUCCESS;
  case EROFS:
  case EPERM:
    return NBD_EPERM;
  case EIO:
    return NBD_EIO;
  case ENOMEM:
    return NBD_ENOMEM;
#ifdef EDQUOT
  case EDQUOT:
#endif
  case EFBIG:
  case ENOSPC:
    return NBD_ENOSPC;
#ifdef ESHUTDOWN
  case ESHUTDOWN:
    return NBD_ESHUTDOWN;
#endif
  case ENOTSUP:
#if ENOTSUP != EOPNOTSUPP
  case EOPNOTSUPP:
#endif
    if (flags & NBD_CMD_FLAG_FAST_ZERO)
      return NBD_ENOTSUP;
    return NBD_EINVAL;
  case EOVERFLOW:
    if (flags & NBD_CMD_FLAG_DF)
      return NBD_EOVERFLOW;
    return NBD_EINVAL;
  case EINVAL:
  default:
    return NBD_EINVAL;
  }
}
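
/* For reference, the NBD_E* values defined by the protocol (and in
 * nbd-protocol.h) are a fixed subset of errno values: NBD_SUCCESS (0),
 * NBD_EPERM (1), NBD_EIO (5), NBD_ENOMEM (12), NBD_EINVAL (22),
 * NBD_ENOSPC (28), NBD_EOVERFLOW (75), NBD_ENOTSUP (95) and
 * NBD_ESHUTDOWN (108); anything the plugin reports outside this set
 * collapses to NBD_EINVAL above.
 */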

static int
send_simple_reply (struct connection *conn,
                   uint64_t handle, uint16_t cmd, uint16_t flags,
                   const char *buf, uint32_t count,
                   uint32_t error)
{
  ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&conn->write_lock);
  struct nbd_simple_reply reply;
  int r;
  int f = (cmd == NBD_CMD_READ && !error) ? SEND_MORE : 0;
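
  /* The simple reply header is 16 bytes on the wire, all fields
   * big-endian except the opaque handle which is echoed back verbatim
   * (layout per the NBD protocol spec): 32-bit NBD_SIMPLE_REPLY_MAGIC
   * (0x67446698), 32-bit error, 64-bit handle.  For NBD_CMD_READ the
   * data payload follows immediately after the header.
   */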
  reply.magic = htobe32 (NBD_SIMPLE_REPLY_MAGIC);
  reply.handle = handle;
  reply.error = htobe32 (nbd_errno (error, flags));

  r = conn->send (conn, &reply, sizeof reply, f);
  if (r == -1) {
    nbdkit_error ("write reply: %s: %m", name_of_nbd_cmd (cmd));
    return connection_set_status (conn, -1);
  }

  /* Send the read data buffer. */
  if (cmd == NBD_CMD_READ && !error) {
    r = conn->send (conn, buf, count, 0);
    if (r == -1) {
      nbdkit_error ("write data: %s: %m", name_of_nbd_cmd (cmd));
      return connection_set_status (conn, -1);
    }
  }

  return 1;                       /* command processed ok */
}
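
/* Structured replies (negotiated via NBD_OPT_STRUCTURED_REPLY) use a
 * 20 byte chunk header instead (layout per the NBD protocol spec):
 * 32-bit NBD_STRUCTURED_REPLY_MAGIC (0x668e33ef), 16-bit flags
 * (NBD_REPLY_FLAG_DONE marks the final chunk), 16-bit chunk type,
 * 64-bit handle, 32-bit payload length.  For NBD_REPLY_TYPE_OFFSET_DATA
 * the payload is a 64-bit offset followed by the data bytes, which is
 * why 'reply.length' below is count + sizeof offset_data.
 */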
static int
send_structured_reply_read (struct connection *conn,
                            uint64_t handle, uint16_t cmd,
                            const char *buf, uint32_t count, uint64_t offset)
{
  /* Once we are really using structured replies and sending data back
   * in chunks, we'll be able to grab the write lock for each chunk,
   * allowing other threads to interleave replies.  As we're not doing
   * that yet we acquire the lock for the whole function.
   */
  ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&conn->write_lock);
  struct nbd_structured_reply reply;
  struct nbd_structured_reply_offset_data offset_data;
  int r;

  assert (cmd == NBD_CMD_READ);

  reply.magic = htobe32 (NBD_STRUCTURED_REPLY_MAGIC);
  reply.handle = handle;
  reply.flags = htobe16 (NBD_REPLY_FLAG_DONE);
  reply.type = htobe16 (NBD_REPLY_TYPE_OFFSET_DATA);
  reply.length = htobe32 (count + sizeof offset_data);

  r = conn->send (conn, &reply, sizeof reply, SEND_MORE);
  if (r == -1) {
    nbdkit_error ("write reply: %s: %m", name_of_nbd_cmd (cmd));
    return connection_set_status (conn, -1);
  }

  /* Send the offset + read data buffer. */
  offset_data.offset = htobe64 (offset);
  r = conn->send (conn, &offset_data, sizeof offset_data, SEND_MORE);
  if (r == -1) {
    nbdkit_error ("write data: %s: %m", name_of_nbd_cmd (cmd));
    return connection_set_status (conn, -1);
  }

  r = conn->send (conn, buf, count, 0);
  if (r == -1) {
    nbdkit_error ("write data: %s: %m", name_of_nbd_cmd (cmd));
    return connection_set_status (conn, -1);
  }

  return 1;                       /* command processed ok */
}
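
/* For NBD_REPLY_TYPE_BLOCK_STATUS the chunk payload is a 32-bit
 * metadata context ID followed by an array of 8 byte descriptors, each
 * a 32-bit length and 32-bit status flags (layout per the NBD protocol
 * spec).  In the "base:allocation" context only two status bits are
 * defined, NBD_STATE_HOLE (1) and NBD_STATE_ZERO (2), hence the
 * 'e.type & 3' masking below.
 */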

/* Convert a list of extents into NBD_REPLY_TYPE_BLOCK_STATUS blocks.
 * The rules here are very complicated.  Read the spec carefully!
 */
static struct nbd_block_descriptor *
extents_to_block_descriptors (struct nbdkit_extents *extents,
                              uint16_t flags,
                              uint32_t count, uint64_t offset,
                              size_t *nr_blocks)
{
  const bool req_one = flags & NBD_CMD_FLAG_REQ_ONE;
  const size_t nr_extents = nbdkit_extents_count (extents);
  size_t i;
  struct nbd_block_descriptor *blocks;

  /* This is checked in server/plugins.c. */
  assert (nr_extents >= 1);

  /* We may send fewer than nr_extents blocks, but never more. */
  blocks = calloc (req_one ? 1 : nr_extents,
                   sizeof (struct nbd_block_descriptor));
  if (blocks == NULL) {
    nbdkit_error ("calloc: %m");
    return NULL;
  }

  if (req_one) {
    const struct nbdkit_extent e = nbdkit_get_extent (extents, 0);

    /* Checked as a side effect of how the extent list is created. */
    assert (e.length > 0);

    *nr_blocks = 1;

    /* Must not exceed count of the original request. */
    blocks[0].length = MIN (e.length, (uint64_t) count);
    blocks[0].status_flags = e.type & 3;
  }
  else {
    uint64_t pos = offset;

    *nr_blocks = 0;
    for (i = 0; i < nr_extents; ++i) {
      const struct nbdkit_extent e = nbdkit_get_extent (extents, i);
      uint64_t length;

      if (i == 0)
        assert (e.offset == offset);

      /* Must not exceed UINT32_MAX. */
      blocks[i].length = length = MIN (e.length, UINT32_MAX);
      blocks[i].status_flags = e.type & 3;
      (*nr_blocks)++;

      pos += length;
      if (pos > offset + count) /* this must be the last block */
        break;

      /* If we reach here then we must have consumed this whole
       * extent.  This is currently true because the server only sends
       * 32 bit requests, but if we move to 64 bit requests we will
       * need to revisit this code so it can split extents into
       * multiple blocks.  XXX
       */
      assert (e.length <= length);
    }
  }

  for (i = 0; i < *nr_blocks; ++i)
    debug ("block status: sending block %" PRIu32 " type %" PRIu32,
           blocks[i].length, blocks[i].status_flags);

  /* Convert to big endian for the protocol. */
  for (i = 0; i < *nr_blocks; ++i) {
    blocks[i].length = htobe32 (blocks[i].length);
    blocks[i].status_flags = htobe32 (blocks[i].status_flags);
  }

  return blocks;
}

static int
send_structured_reply_block_status (struct connection *conn,
                                    uint64_t handle,
                                    uint16_t cmd, uint16_t flags,
                                    uint32_t count, uint64_t offset,
                                    struct nbdkit_extents *extents)
{
  ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&conn->write_lock);
  struct nbd_structured_reply reply;
  CLEANUP_FREE struct nbd_block_descriptor *blocks = NULL;
  size_t nr_blocks;
  uint32_t context_id;
  size_t i;
  int r;

  assert (conn->meta_context_base_allocation);
  assert (cmd == NBD_CMD_BLOCK_STATUS);

  blocks = extents_to_block_descriptors (extents, flags, count, offset,
                                         &nr_blocks);
  if (blocks == NULL)
    return connection_set_status (conn, -1);

  reply.magic = htobe32 (NBD_STRUCTURED_REPLY_MAGIC);
  reply.handle = handle;
  reply.flags = htobe16 (NBD_REPLY_FLAG_DONE);
  reply.type = htobe16 (NBD_REPLY_TYPE_BLOCK_STATUS);
  reply.length = htobe32 (sizeof context_id +
                          nr_blocks * sizeof (struct nbd_block_descriptor));

  r = conn->send (conn, &reply, sizeof reply, SEND_MORE);
  if (r == -1) {
    nbdkit_error ("write reply: %s: %m", name_of_nbd_cmd (cmd));
    return connection_set_status (conn, -1);
  }

  /* Send the base:allocation context ID. */
  context_id = htobe32 (base_allocation_id);
  r = conn->send (conn, &context_id, sizeof context_id, SEND_MORE);
  if (r == -1) {
    nbdkit_error ("write reply: %s: %m", name_of_nbd_cmd (cmd));
    return connection_set_status (conn, -1);
  }

  /* Send each block descriptor. */
  for (i = 0; i < nr_blocks; ++i) {
    r = conn->send (conn, &blocks[i], sizeof blocks[i],
                    i == nr_blocks - 1 ? 0 : SEND_MORE);
    if (r == -1) {
      nbdkit_error ("write reply: %s: %m", name_of_nbd_cmd (cmd));
      return connection_set_status (conn, -1);
    }
  }

  return 1;                       /* command processed ok */
}
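
/* Error chunks use NBD_REPLY_TYPE_ERROR: the payload is a 32-bit
 * NBD_E* error code and a 16-bit length of an optional human readable
 * message (layout per the NBD protocol spec).  We currently always
 * send a zero length message.
 */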
static int
send_structured_reply_error (struct connection *conn,
                             uint64_t handle, uint16_t cmd, uint16_t flags,
                             uint32_t error)
{
  ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&conn->write_lock);
  struct nbd_structured_reply reply;
  struct nbd_structured_reply_error error_data;
  int r;

  reply.magic = htobe32 (NBD_STRUCTURED_REPLY_MAGIC);
  reply.handle = handle;
  reply.flags = htobe16 (NBD_REPLY_FLAG_DONE);
  reply.type = htobe16 (NBD_REPLY_TYPE_ERROR);
  reply.length = htobe32 (0 /* no human readable error */ + sizeof error_data);

  r = conn->send (conn, &reply, sizeof reply, SEND_MORE);
  if (r == -1) {
    nbdkit_error ("write error reply: %m");
    return connection_set_status (conn, -1);
  }

  /* Send the error. */
  error_data.error = htobe32 (nbd_errno (error, flags));
  error_data.len = htobe16 (0);
  r = conn->send (conn, &error_data, sizeof error_data, 0);
  if (r == -1) {
    nbdkit_error ("write data: %s: %m", name_of_nbd_cmd (cmd));
    return connection_set_status (conn, -1);
  }
  /* No human readable error message at the moment. */

  return 1;                       /* command processed ok */
}
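
/* Read one request from the client, validate it, execute it under the
 * request lock, and send back a reply:
 *
 *   recv header -> validate_request -> handle_request -> send_*_reply
 *
 * The return value follows the connection_get_status conventions:
 * positive to keep handling requests, zero on clean disconnect,
 * negative on error.
 */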
static int
protocol_recv_request_send_reply (struct connection *conn)
{
  int r;
  struct nbd_request request;
  uint16_t cmd, flags;
  uint32_t magic, count, error = 0;
  uint64_t offset;
  char *buf = NULL;
  CLEANUP_EXTENTS_FREE struct nbdkit_extents *extents = NULL;

  /* Read the request packet. */
  {
    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&conn->read_lock);
    r = connection_get_status (conn);
    if (r <= 0)
      return r;
    r = conn->recv (conn, &request, sizeof request);
    if (r == -1) {
      nbdkit_error ("read request: %m");
      return connection_set_status (conn, -1);
    }
    if (r == 0) {
      debug ("client closed input socket, closing connection");
      return connection_set_status (conn, 0); /* disconnect */
    }

    magic = be32toh (request.magic);
    if (magic != NBD_REQUEST_MAGIC) {
      nbdkit_error ("invalid request: 'magic' field is incorrect (0x%x)",
                    magic);
      return connection_set_status (conn, -1);
    }

    flags = be16toh (request.flags);
    cmd = be16toh (request.type);

    offset = be64toh (request.offset);
    count = be32toh (request.count);

    if (cmd == NBD_CMD_DISC) {
      debug ("client sent %s, closing connection", name_of_nbd_cmd (cmd));
      return connection_set_status (conn, 0); /* disconnect */
    }

    /* Validate the request. */
    if (!validate_request (conn, cmd, flags, offset, count, &error)) {
      if (cmd == NBD_CMD_WRITE &&
          skip_over_write_buffer (conn->sockin, count) < 0)
        return connection_set_status (conn, -1);
      goto send_reply;
    }

    /* Get the data buffer used for either read or write requests.
     * This is a common per-thread data buffer, it must not be freed.
     */
    if (cmd == NBD_CMD_READ || cmd == NBD_CMD_WRITE) {
      buf = threadlocal_buffer ((size_t) count);
      if (buf == NULL) {
        error = ENOMEM;
        if (cmd == NBD_CMD_WRITE &&
            skip_over_write_buffer (conn->sockin, count) < 0)
          return connection_set_status (conn, -1);
        goto send_reply;
      }
    }

    /* Allocate the extents list for block status only. */
    if (cmd == NBD_CMD_BLOCK_STATUS) {
      extents = nbdkit_extents_new (offset, backend_get_size (backend, conn));
      if (extents == NULL) {
        error = errno;
        goto send_reply;
      }
    }

    /* Receive the write data buffer. */
    if (cmd == NBD_CMD_WRITE) {
      r = conn->recv (conn, buf, count);
      if (r == 0) {
        errno = EBADMSG;
        r = -1;
      }
      if (r == -1) {
        nbdkit_error ("read data: %s: %m", name_of_nbd_cmd (cmd));
        return connection_set_status (conn, -1);
      }
    }
  }

  /* Perform the request.  Only this part happens inside the request lock. */
  if (quit || !connection_get_status (conn)) {
    error = ESHUTDOWN;
  }
  else {
    lock_request (conn);
    error = handle_request (conn, cmd, flags, offset, count, buf, extents);
    assert ((int) error >= 0);
    unlock_request (conn);
  }

  /* Send the reply packet. */
 send_reply:
  if (connection_get_status (conn) < 0)
    return -1;

  if (error != 0) {
    /* Since we're about to send only the limited NBD_E* errno to the
     * client, don't lose the information about what really happened
     * on the server side.  Make sure there is a way for the operator
     * to retrieve the real error.
     */
    debug ("sending error reply: %s", strerror (error));
  }

  /* Currently we prefer to send simple replies for everything except
   * where we have to (i.e. NBD_CMD_READ and NBD_CMD_BLOCK_STATUS when
   * structured_replies have been negotiated).  However this prevents
   * us from sending human-readable error messages to the client, so
   * we should reconsider this in the future.
   */
  if (conn->structured_replies &&
      (cmd == NBD_CMD_READ || cmd == NBD_CMD_BLOCK_STATUS)) {
    if (!error) {
      if (cmd == NBD_CMD_READ)
        return send_structured_reply_read (conn, request.handle, cmd,
                                           buf, count, offset);
      else /* NBD_CMD_BLOCK_STATUS */
        return send_structured_reply_block_status (conn, request.handle,
                                                   cmd, flags,
                                                   count, offset,
                                                   extents);
    }
    else
      return send_structured_reply_error (conn, request.handle, cmd, flags,
                                          error);
  }
  else
    return send_simple_reply (conn, request.handle, cmd, flags, buf, count,
                              error);
}