4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
11 * * Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * * Neither the name of Red Hat nor the names of its contributors may be
16 * used to endorse or promote products derived from this software without
17 * specific prior written permission.
19 * THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
22 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR
23 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
26 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
27 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
29 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
46 #include "byte-swapping.h"
48 #include "nbd-protocol.h"
49 #include "protostrings.h"
/* Validate an incoming NBD request before it is executed.
 * Returns true when the command validates; on failure an error is
 * reported via nbdkit_error() and *error is set to the errno to send
 * back to the client.
 * NOTE(review): this extract is fragmentary — several original lines
 * (the switch header, case labels and 'return false' paths) are
 * missing from this view, so comments below are hedged where the
 * surrounding structure cannot be seen.
 */
52 validate_request (uint16_t cmd
, uint16_t flags
, uint64_t offset
, uint32_t count
,
/* Reject any write-class command (WRITE, TRIM, WRITE_ZEROES) on a
 * connection that was negotiated read-only (NBD_FLAG_READ_ONLY in
 * conn->eflags).
 */
57 /* Readonly connection? */
58 if (conn
->eflags
& NBD_FLAG_READ_ONLY
&&
59 (cmd
== NBD_CMD_WRITE
|| cmd
== NBD_CMD_TRIM
||
60 cmd
== NBD_CMD_WRITE_ZEROES
)) {
61 nbdkit_error ("invalid request: %s: write request on readonly connection",
62 name_of_nbd_cmd (cmd
));
/* For commands that carry an offset/count (WRITE_ZEROES, BLOCK_STATUS
 * and — presumably, in elided case labels — READ/WRITE/TRIM/CACHE),
 * check the range against the export size via backend_valid_range().
 */
67 /* Validate cmd, offset, count. */
73 case NBD_CMD_WRITE_ZEROES
:
74 case NBD_CMD_BLOCK_STATUS
:
75 if (!backend_valid_range (conn
->top_context
, offset
, count
)) {
76 /* XXX Allow writes to extend the disk? */
77 nbdkit_error ("invalid request: %s: offset and count are out of range: "
78 "offset=%" PRIu64
" count=%" PRIu32
,
79 name_of_nbd_cmd (cmd
), offset
, count
);
/* Out-of-range writes report ENOSPC, everything else EINVAL. */
80 *error
= (cmd
== NBD_CMD_WRITE
||
81 cmd
== NBD_CMD_WRITE_ZEROES
) ? ENOSPC
: EINVAL
;
/* Commands with no payload (presumably FLUSH; case label elided)
 * must carry zero offset and count.
 */
87 if (offset
!= 0 || count
!= 0) {
88 nbdkit_error ("invalid request: %s: expecting offset and count = 0",
89 name_of_nbd_cmd (cmd
));
/* Unknown command numbers are rejected (default case elided). */
96 nbdkit_error ("invalid request: unknown command (%" PRIu32
") ignored",
/* Reject any flag bits outside the set this server understands. */
103 if (flags
& ~(NBD_CMD_FLAG_FUA
| NBD_CMD_FLAG_NO_HOLE
|
104 NBD_CMD_FLAG_DF
| NBD_CMD_FLAG_REQ_ONE
|
105 NBD_CMD_FLAG_FAST_ZERO
)) {
106 nbdkit_error ("invalid request: unknown flag (0x%x)", flags
);
/* Flag/command pairing rules: NO_HOLE and FAST_ZERO are only valid
 * with WRITE_ZEROES.
 */
110 if ((flags
& NBD_CMD_FLAG_NO_HOLE
) &&
111 cmd
!= NBD_CMD_WRITE_ZEROES
) {
112 nbdkit_error ("invalid request: NO_HOLE flag needs WRITE_ZEROES request");
116 if ((flags
& NBD_CMD_FLAG_FAST_ZERO
) &&
117 cmd
!= NBD_CMD_WRITE_ZEROES
) {
118 nbdkit_error ("invalid request: "
119 "FAST_ZERO flag needs WRITE_ZEROES request");
/* DF is only valid with READ, and only if structured replies were
 * negotiated during the handshake.
 */
123 if (flags
& NBD_CMD_FLAG_DF
) {
124 if (cmd
!= NBD_CMD_READ
) {
125 nbdkit_error ("invalid request: DF flag needs READ request");
129 if (!conn
->structured_replies
) {
130 nbdkit_error ("invalid request: "
131 "%s: structured replies was not negotiated",
132 name_of_nbd_cmd (cmd
));
/* REQ_ONE is only valid with BLOCK_STATUS. */
137 if ((flags
& NBD_CMD_FLAG_REQ_ONE
) &&
138 cmd
!= NBD_CMD_BLOCK_STATUS
) {
139 nbdkit_error ("invalid request: REQ_ONE flag needs BLOCK_STATUS request");
/* FUA may only be used when the export advertised NBD_FLAG_SEND_FUA. */
143 if (flags
& NBD_CMD_FLAG_FUA
&& !(conn
->eflags
& NBD_FLAG_SEND_FUA
)) {
144 nbdkit_error ("invalid request: FUA flag not supported");
/* Cap READ/WRITE payloads at MAX_REQUEST_SIZE to bound buffering. */
149 /* Refuse over-large read and write requests. */
150 if ((cmd
== NBD_CMD_WRITE
|| cmd
== NBD_CMD_READ
) &&
151 count
> MAX_REQUEST_SIZE
) {
152 nbdkit_error ("invalid request: %s: data request is too large (%" PRIu32
154 name_of_nbd_cmd (cmd
), count
, MAX_REQUEST_SIZE
);
/* Each optional command (FLUSH/TRIM/WRITE_ZEROES/CACHE) is only
 * permitted when the matching NBD_FLAG_SEND_* bit was advertised in
 * the export flags.
 */
160 if (cmd
== NBD_CMD_FLUSH
&& !(conn
->eflags
& NBD_FLAG_SEND_FLUSH
)) {
161 nbdkit_error ("invalid request: %s: flush operation not supported",
162 name_of_nbd_cmd (cmd
));
168 if (cmd
== NBD_CMD_TRIM
&& !(conn
->eflags
& NBD_FLAG_SEND_TRIM
)) {
169 nbdkit_error ("invalid request: %s: trim operation not supported",
170 name_of_nbd_cmd (cmd
));
176 if (cmd
== NBD_CMD_WRITE_ZEROES
&&
177 !(conn
->eflags
& NBD_FLAG_SEND_WRITE_ZEROES
)) {
178 nbdkit_error ("invalid request: %s: write zeroes operation not supported",
179 name_of_nbd_cmd (cmd
));
185 if (cmd
== NBD_CMD_CACHE
&& !(conn
->eflags
& NBD_FLAG_SEND_CACHE
)) {
186 nbdkit_error ("invalid request: %s: cache operation not supported",
187 name_of_nbd_cmd (cmd
));
/* BLOCK_STATUS additionally requires both structured replies and the
 * base:allocation meta context to have been negotiated.
 */
192 /* Block status allowed? */
193 if (cmd
== NBD_CMD_BLOCK_STATUS
) {
194 if (!conn
->structured_replies
) {
195 nbdkit_error ("invalid request: "
196 "%s: structured replies was not negotiated",
197 name_of_nbd_cmd (cmd
));
201 if (!conn
->meta_context_base_allocation
) {
202 nbdkit_error ("invalid request: "
203 "%s: base:allocation was not negotiated",
204 name_of_nbd_cmd (cmd
));
/* All checks passed. */
210 return true; /* Command validates. */
/* Execute an already-validated NBD request by dispatching to the
 * backend (plugin). Returns a system errno (0 on success) which the
 * caller translates to an NBD_E* code.
 * NOTE(review): the dispatching switch header, several case labels
 * (READ/WRITE/FLUSH/TRIM/CACHE) and the return statements are elided
 * from this extract; the per-command bodies below are hedged
 * accordingly.
 */
213 /* This is called with the request lock held to actually execute the
214 * request (by calling the plugin). Note that the request fields have
215 * been validated already in 'validate_request' so we don't have to
218 * 'buf' is either the data to be written or the data to be returned,
219 * and points to a buffer of size 'count' bytes.
221 * 'extents' is an empty extents list used for block status requests
224 * In all cases, the return value is the system errno value that will
225 * later be converted to the nbd error to send back to the client (0
229 handle_request (uint16_t cmd
, uint16_t flags
, uint64_t offset
, uint32_t count
,
230 void *buf
, struct nbdkit_extents
*extents
)
/* All backend calls go through the connection's top context. */
233 struct context
*c
= conn
->top_context
;
237 /* Clear the error, so that we know if the plugin calls
238 * nbdkit_set_error() or relied on errno. */
239 threadlocal_set_error (0);
/* READ: fill buf from the backend (case label elided above). */
243 if (backend_pread (c
, buf
, count
, offset
, 0, &err
) == -1)
/* WRITE: translate the FUA request flag to the plugin flag, then
 * write buf to the backend.
 */
248 if (flags
& NBD_CMD_FLAG_FUA
)
249 f
|= NBDKIT_FLAG_FUA
;
250 if (backend_pwrite (c
, buf
, count
, offset
, f
, &err
) == -1)
/* FLUSH. */
255 if (backend_flush (c
, 0, &err
) == -1)
/* TRIM: FUA flag is honoured here too. */
260 if (flags
& NBD_CMD_FLAG_FUA
)
261 f
|= NBDKIT_FLAG_FUA
;
262 if (backend_trim (c
, count
, offset
, f
, &err
) == -1)
/* CACHE. */
267 if (backend_cache (c
, count
, offset
, 0, &err
) == -1)
/* WRITE_ZEROES: absence of NO_HOLE permits the backend to trim;
 * FUA and FAST_ZERO map directly to plugin flags.
 */
271 case NBD_CMD_WRITE_ZEROES
:
272 if (!(flags
& NBD_CMD_FLAG_NO_HOLE
))
273 f
|= NBDKIT_FLAG_MAY_TRIM
;
274 if (flags
& NBD_CMD_FLAG_FUA
)
275 f
|= NBDKIT_FLAG_FUA
;
276 if (flags
& NBD_CMD_FLAG_FAST_ZERO
)
277 f
|= NBDKIT_FLAG_FAST_ZERO
;
278 if (backend_zero (c
, count
, offset
, f
, &err
) == -1)
/* BLOCK_STATUS: populate the caller-supplied extents list. */
282 case NBD_CMD_BLOCK_STATUS
:
283 if (flags
& NBD_CMD_FLAG_REQ_ONE
)
284 f
|= NBDKIT_FLAG_REQ_ONE
;
285 if (backend_extents (c
, count
, offset
, f
,
286 extents
, &err
) == -1)
/* Drain and discard 'count' bytes of an incoming write payload from
 * the socket, used when a WRITE request failed validation but its
 * data is already in flight. Reads in BUFSIZ-sized chunks.
 * NOTE(review): the loop structure, local declarations and return
 * statements are elided from this extract; return convention appears
 * to be <0 on failure based on the callers visible below.
 */
298 skip_over_write_buffer (int sock
, size_t count
)
/* Refuse to drain absurdly large payloads (cap: 2x the server's own
 * request-size limit) rather than loop for a long time.
 */
303 if (count
> MAX_REQUEST_SIZE
* 2) {
304 nbdkit_error ("write request too large to skip");
/* Read up to BUFSIZ bytes per iteration into a scratch buffer
 * (declaration of 'buf'/'r' elided).
 */
309 r
= read (sock
, buf
, count
> BUFSIZ
? BUFSIZ
: count
);
/* read(2) error — %m formats errno. */
311 nbdkit_error ("skipping write buffer: %m");
/* Client closed the socket before sending all 'count' bytes. */
315 nbdkit_error ("unexpected early EOF");
/* Map a system errno to the limited NBD_E* wire error space.
 * 'flags' disambiguates cases where one errno maps to different NBD
 * errors depending on the request flags.
 * NOTE(review): most of the errno switch is elided from this extract;
 * only the ESHUTDOWN, ENOTSUP and EOVERFLOW-related fragments are
 * visible.
 */
324 /* Convert a system errno to an NBD_E* error code. */
326 nbd_errno (int error
, uint16_t flags
)
346 return NBD_ESHUTDOWN
;
/* On platforms where ENOTSUP and EOPNOTSUPP differ, both need a case
 * (the other case label is elided).
 */
349 #if ENOTSUP != EOPNOTSUPP
/* ENOTSUP from a FAST_ZERO request presumably maps to NBD_ENOTSUP —
 * the return is elided; verify against the full source.
 */
352 if (flags
& NBD_CMD_FLAG_FAST_ZERO
)
/* EOVERFLOW is only a valid wire error for DF-flagged reads. */
356 if (flags
& NBD_CMD_FLAG_DF
)
357 return NBD_EOVERFLOW
;
/* Send an NBD simple reply (header, then the data payload for a
 * successful READ). Serialized against other repliers by write_lock.
 * NOTE(review): the error-check lines around conn->send and the final
 * return are elided from this extract.
 */
366 send_simple_reply (uint64_t handle
, uint16_t cmd
, uint16_t flags
,
367 const char *buf
, uint32_t count
,
/* Hold the write lock for the whole reply so header and payload are
 * not interleaved with another thread's reply.
 */
371 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&conn
->write_lock
);
372 struct nbd_simple_reply reply
;
/* SEND_MORE hints that a data payload follows the header (only for a
 * successful READ).
 */
374 int f
= (cmd
== NBD_CMD_READ
&& !error
) ? SEND_MORE
: 0;
/* Build the wire header; multi-byte fields are big-endian. */
376 reply
.magic
= htobe32 (NBD_SIMPLE_REPLY_MAGIC
);
377 reply
.handle
= handle
;
378 reply
.error
= htobe32 (nbd_errno (error
, flags
));
380 r
= conn
->send (&reply
, sizeof reply
, f
);
/* Send failure kills the connection (error-check 'if' elided). */
382 nbdkit_error ("write reply: %s: %m", name_of_nbd_cmd (cmd
));
383 return connection_set_status (STATUS_DEAD
);
386 /* Send the read data buffer. */
387 if (cmd
== NBD_CMD_READ
&& !error
) {
388 r
= conn
->send (buf
, count
, 0);
390 nbdkit_error ("write data: %s: %m", name_of_nbd_cmd (cmd
));
391 return connection_set_status (STATUS_DEAD
);
/* Send a READ result as a single structured reply: one
 * NBD_REPLY_TYPE_OFFSET_DATA chunk marked DONE, i.e. header, then
 * the 64-bit offset, then the data buffer.
 * NOTE(review): the 'if (r == -1)' guards around each send and the
 * final return are elided from this extract.
 */
398 send_structured_reply_read (uint64_t handle
, uint16_t cmd
,
399 const char *buf
, uint32_t count
, uint64_t offset
)
402 /* Once we are really using structured replies and sending data back
403 * in chunks, we'll be able to grab the write lock for each chunk,
404 * allowing other threads to interleave replies. As we're not doing
405 * that yet we acquire the lock for the whole function.
407 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&conn
->write_lock
);
408 struct nbd_structured_reply reply
;
409 struct nbd_structured_reply_offset_data offset_data
;
/* Only READ replies use this path. */
412 assert (cmd
== NBD_CMD_READ
);
/* Wire header: DONE flag means this is the only chunk; length covers
 * the offset field plus the data payload.
 */
414 reply
.magic
= htobe32 (NBD_STRUCTURED_REPLY_MAGIC
);
415 reply
.handle
= handle
;
416 reply
.flags
= htobe16 (NBD_REPLY_FLAG_DONE
);
417 reply
.type
= htobe16 (NBD_REPLY_TYPE_OFFSET_DATA
);
418 reply
.length
= htobe32 (count
+ sizeof offset_data
);
420 r
= conn
->send (&reply
, sizeof reply
, SEND_MORE
);
422 nbdkit_error ("write reply: %s: %m", name_of_nbd_cmd (cmd
));
423 return connection_set_status (STATUS_DEAD
);
426 /* Send the offset + read data buffer. */
427 offset_data
.offset
= htobe64 (offset
);
428 r
= conn
->send (&offset_data
, sizeof offset_data
, SEND_MORE
);
430 nbdkit_error ("write data: %s: %m", name_of_nbd_cmd (cmd
));
431 return connection_set_status (STATUS_DEAD
);
/* Finally the data itself; no SEND_MORE, this completes the reply. */
434 r
= conn
->send (buf
, count
, 0);
436 nbdkit_error ("write data: %s: %m", name_of_nbd_cmd (cmd
));
437 return connection_set_status (STATUS_DEAD
);
/* Build an array of nbd_block_descriptor from the extents list for a
 * BLOCK_STATUS reply. Caller frees the returned array; returns NULL
 * on allocation failure.
 * NOTE(review): the req_one/else branch structure, the declarations
 * of 'i', 'length' and the '*nr_blocks' out-parameter, and the return
 * statements are elided from this extract.
 */
442 /* Convert a list of extents into NBD_REPLY_TYPE_BLOCK_STATUS blocks.
443 * The rules here are very complicated. Read the spec carefully!
445 static struct nbd_block_descriptor
*
446 extents_to_block_descriptors (struct nbdkit_extents
*extents
,
448 uint32_t count
, uint64_t offset
,
451 const bool req_one
= flags
& NBD_CMD_FLAG_REQ_ONE
;
452 const size_t nr_extents
= nbdkit_extents_count (extents
);
454 struct nbd_block_descriptor
*blocks
;
456 /* This is checked in server/plugins.c. */
457 assert (nr_extents
>= 1);
/* REQ_ONE means the client wants at most one descriptor, so only one
 * slot is needed in that case.
 */
459 /* We may send fewer than nr_extents blocks, but never more. */
460 blocks
= calloc (req_one
? 1 : nr_extents
,
461 sizeof (struct nbd_block_descriptor
));
462 if (blocks
== NULL
) {
463 nbdkit_error ("calloc: %m");
/* REQ_ONE path (branch structure elided): describe only the first
 * extent, clipped to the request count.
 */
468 const struct nbdkit_extent e
= nbdkit_get_extent (extents
, 0);
470 /* Checked as a side effect of how the extent list is created. */
471 assert (e
.length
> 0);
475 /* Must not exceed count of the original request. */
476 blocks
[0].length
= MIN (e
.length
, (uint64_t) count
);
/* Only the low two status bits (hole/zero) go on the wire. */
477 blocks
[0].status_flags
= e
.type
& 3;
/* Non-REQ_ONE path: walk every extent, tracking the running wire
 * position in 'pos'.
 */
480 uint64_t pos
= offset
;
483 for (i
= 0; i
< nr_extents
; ++i
) {
484 const struct nbdkit_extent e
= nbdkit_get_extent (extents
, i
);
/* Extents are expected to be contiguous from the request offset
 * (how 'offset' advances per iteration is elided).
 */
488 assert (e
.offset
== offset
);
490 /* Must not exceed UINT32_MAX. */
491 blocks
[i
].length
= length
= MIN (e
.length
, UINT32_MAX
);
492 blocks
[i
].status_flags
= e
.type
& 3;
/* The final descriptor may extend past the requested range; stop
 * once pos passes offset+count.
 */
496 if (pos
> offset
+ count
) /* this must be the last block */
499 /* If we reach here then we must have consumed this whole
500 * extent. This is currently true because the server only sends
501 * 32 bit requests, but if we move to 64 bit requests we will
502 * need to revisit this code so it can split extents into
503 * multiple blocks. XXX
505 assert (e
.length
<= length
);
/* Debug-log each descriptor about to be sent. */
510 for (i
= 0; i
< *nr_blocks
; ++i
)
511 debug ("block status: sending block %" PRIu32
" type %" PRIu32
,
512 blocks
[i
].length
, blocks
[i
].status_flags
);
/* Byte-swap in place after logging so the log shows host-order
 * values.
 */
515 /* Convert to big endian for the protocol. */
516 for (i
= 0; i
< *nr_blocks
; ++i
) {
517 blocks
[i
].length
= htobe32 (blocks
[i
].length
);
518 blocks
[i
].status_flags
= htobe32 (blocks
[i
].status_flags
);
/* Send a BLOCK_STATUS result as a structured reply: header, the
 * base:allocation context ID, then one nbd_block_descriptor per
 * block.
 * NOTE(review): declarations of 'r', 'i', 'nr_blocks' and
 * 'context_id', the send error-check guards, and the final return
 * are elided from this extract.
 */
525 send_structured_reply_block_status (uint64_t handle
,
526 uint16_t cmd
, uint16_t flags
,
527 uint32_t count
, uint64_t offset
,
528 struct nbdkit_extents
*extents
)
/* One reply at a time per connection. */
531 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&conn
->write_lock
);
532 struct nbd_structured_reply reply
;
/* CLEANUP_FREE frees 'blocks' automatically on scope exit. */
533 CLEANUP_FREE
struct nbd_block_descriptor
*blocks
= NULL
;
/* Both were enforced earlier by validate_request. */
539 assert (conn
->meta_context_base_allocation
);
540 assert (cmd
== NBD_CMD_BLOCK_STATUS
);
542 blocks
= extents_to_block_descriptors (extents
, flags
, count
, offset
,
/* Descriptor conversion failed (NULL-check elided): drop the
 * connection.
 */
545 return connection_set_status (STATUS_DEAD
);
/* Wire header: payload is the 32-bit context ID plus the descriptor
 * array.
 */
547 reply
.magic
= htobe32 (NBD_STRUCTURED_REPLY_MAGIC
);
548 reply
.handle
= handle
;
549 reply
.flags
= htobe16 (NBD_REPLY_FLAG_DONE
);
550 reply
.type
= htobe16 (NBD_REPLY_TYPE_BLOCK_STATUS
);
551 reply
.length
= htobe32 (sizeof context_id
+
552 nr_blocks
* sizeof (struct nbd_block_descriptor
));
554 r
= conn
->send (&reply
, sizeof reply
, SEND_MORE
);
556 nbdkit_error ("write reply: %s: %m", name_of_nbd_cmd (cmd
));
557 return connection_set_status (STATUS_DEAD
);
560 /* Send the base:allocation context ID. */
561 context_id
= htobe32 (base_allocation_id
);
562 r
= conn
->send (&context_id
, sizeof context_id
, SEND_MORE
);
564 nbdkit_error ("write reply: %s: %m", name_of_nbd_cmd (cmd
));
565 return connection_set_status (STATUS_DEAD
);
/* SEND_MORE on every descriptor except the last, which completes the
 * reply.
 */
568 /* Send each block descriptor. */
569 for (i
= 0; i
< nr_blocks
; ++i
) {
570 r
= conn
->send (&blocks
[i
], sizeof blocks
[i
],
571 i
== nr_blocks
- 1 ? 0 : SEND_MORE
);
573 nbdkit_error ("write reply: %s: %m", name_of_nbd_cmd (cmd
));
574 return connection_set_status (STATUS_DEAD
);
/* Send an error as a structured reply: an NBD_REPLY_TYPE_ERROR chunk
 * marked DONE, carrying the NBD_E* code and an empty message.
 * NOTE(review): the 'error' parameter, send error-check guards and
 * the final return are elided from this extract.
 */
581 send_structured_reply_error (uint64_t handle
, uint16_t cmd
, uint16_t flags
,
585 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&conn
->write_lock
);
586 struct nbd_structured_reply reply
;
587 struct nbd_structured_reply_error error_data
;
/* Wire header: payload length is just the error_data struct — no
 * human-readable message is attached (see comment at the end).
 */
590 reply
.magic
= htobe32 (NBD_STRUCTURED_REPLY_MAGIC
);
591 reply
.handle
= handle
;
592 reply
.flags
= htobe16 (NBD_REPLY_FLAG_DONE
);
593 reply
.type
= htobe16 (NBD_REPLY_TYPE_ERROR
);
594 reply
.length
= htobe32 (0 /* no human readable error */ + sizeof error_data
);
596 r
= conn
->send (&reply
, sizeof reply
, SEND_MORE
);
598 nbdkit_error ("write error reply: %m");
599 return connection_set_status (STATUS_DEAD
);
/* errno is translated to the NBD wire code, flags disambiguate
 * flag-dependent mappings; message length is zero.
 */
602 /* Send the error. */
603 error_data
.error
= htobe32 (nbd_errno (error
, flags
));
604 error_data
.len
= htobe16 (0);
605 r
= conn
->send (&error_data
, sizeof error_data
, 0);
607 nbdkit_error ("write data: %s: %m", name_of_nbd_cmd (cmd
));
608 return connection_set_status (STATUS_DEAD
);
610 /* No human readable error message at the moment. */
/* Top-level per-request loop body: receive one NBD request, validate
 * it, execute it under the request lock, and send the reply.
 * NOTE(review): many declarations (cmd, flags, offset, buf, r, cs),
 * several guard 'if's, lock scopes and some returns are elided from
 * this extract; comments are hedged where structure is missing.
 */
614 /* Do a recv/send sequence. Return true if the caller should shutdown. */
616 protocol_recv_request_send_reply (void)
621 struct nbd_request request
;
623 uint32_t magic
, count
, error
= 0;
/* Extents list is freed automatically on scope exit. */
626 CLEANUP_EXTENTS_FREE
struct nbdkit_extents
*extents
= NULL
;
628 /* Read the request packet. */
/* read_lock serializes request reception across threads. */
630 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&conn
->read_lock
);
631 r
= conn
->recv (&request
, sizeof request
);
/* Snapshot connection status; a shutdown may already be underway. */
632 cs
= connection_get_status ();
633 if (cs
<= STATUS_CLIENT_DONE
)
/* recv error (guard elided) — kill the connection. */
636 nbdkit_error ("read request: %m");
637 return connection_set_status (STATUS_DEAD
);
/* recv returned 0: clean EOF from the client. */
640 debug ("client closed input socket, closing connection");
641 return connection_set_status (STATUS_CLIENT_DONE
); /* disconnect */
/* Decode the big-endian request header. */
644 magic
= be32toh (request
.magic
);
645 if (magic
!= NBD_REQUEST_MAGIC
) {
646 nbdkit_error ("invalid request: 'magic' field is incorrect (0x%x)",
648 return connection_set_status (STATUS_DEAD
);
651 flags
= be16toh (request
.flags
);
652 cmd
= be16toh (request
.type
);
654 offset
= be64toh (request
.offset
);
655 count
= be32toh (request
.count
);
/* DISC is handled immediately — no reply is sent. */
657 if (cmd
== NBD_CMD_DISC
) {
658 debug ("client sent %s, closing connection", name_of_nbd_cmd (cmd
));
659 return connection_set_status (STATUS_CLIENT_DONE
); /* disconnect */
/* On validation failure of a WRITE the payload is already in flight
 * and must be drained to keep the stream in sync.
 */
662 /* Validate the request. */
663 if (!validate_request (cmd
, flags
, offset
, count
, &error
)) {
664 if (cmd
== NBD_CMD_WRITE
&&
665 skip_over_write_buffer (conn
->sockin
, count
) < 0) {
666 return connection_set_status (STATUS_DEAD
);
671 /* Get the data buffer used for either read or write requests.
672 * This is a common per-thread data buffer, it must not be freed.
674 if (cmd
== NBD_CMD_READ
|| cmd
== NBD_CMD_WRITE
) {
675 buf
= threadlocal_buffer ((size_t) count
);
/* Buffer allocation failed (guard elided): a failed WRITE still has
 * its payload on the wire and must be drained.
 */
678 if (cmd
== NBD_CMD_WRITE
&&
679 skip_over_write_buffer (conn
->sockin
, count
) < 0) {
680 return connection_set_status (STATUS_DEAD
);
686 /* Allocate the extents list for block status only. */
687 if (cmd
== NBD_CMD_BLOCK_STATUS
) {
688 extents
= nbdkit_extents_new (offset
,
689 backend_get_size (conn
->top_context
));
690 if (extents
== NULL
) {
696 /* Receive the write data buffer. */
697 if (cmd
== NBD_CMD_WRITE
) {
698 r
= conn
->recv (buf
, count
);
/* Short read / recv error (guard elided). */
704 nbdkit_error ("read data: %s: %m", name_of_nbd_cmd (cmd
));
705 return connection_set_status (STATUS_DEAD
);
/* Skip execution if the server is quitting or the connection is
 * already shutting down.
 */
710 /* Perform the request. Only this part happens inside the request lock. */
711 if (quit
|| cs
< STATUS_ACTIVE
) {
716 error
= handle_request (cmd
, flags
, offset
, count
, buf
, extents
);
/* handle_request returns a non-negative errno value. */
717 assert ((int) error
>= 0);
721 /* Send the reply packet. */
723 if (connection_get_status () < STATUS_CLIENT_DONE
)
727 /* Since we're about to send only the limited NBD_E* errno to the
728 * client, don't lose the information about what really happened
729 * on the server side. Make sure there is a way for the operator
730 * to retrieve the real error.
732 debug ("sending error reply: %s", strerror (error
));
735 /* Currently we prefer to send simple replies for everything except
736 * where we have to (ie. NBD_CMD_READ and NBD_CMD_BLOCK_STATUS when
737 * structured_replies have been negotiated). However this prevents
738 * us from sending human-readable error messages to the client, so
739 * we should reconsider this in future.
741 if (!conn
->structured_replies
||
742 (cmd
!= NBD_CMD_READ
&& cmd
!= NBD_CMD_BLOCK_STATUS
))
743 return send_simple_reply (request
.handle
, cmd
, flags
, buf
, count
, error
);
/* Structured-reply path: errors get an error chunk (guard on 'error'
 * elided), otherwise dispatch on command type.
 */
746 return send_structured_reply_error (request
.handle
, cmd
, flags
, error
);
748 if (cmd
== NBD_CMD_READ
)
749 return send_structured_reply_read (request
.handle
, cmd
, buf
, count
,
752 /* NBD_CMD_BLOCK_STATUS */
753 return send_structured_reply_block_status (request
.handle
, cmd
, flags
,
754 count
, offset
, extents
);