2 * Copyright (C) 2013-2019 Red Hat Inc.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
11 * * Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * * Neither the name of Red Hat nor the names of its contributors may be
16 * used to endorse or promote products derived from this software without
17 * specific prior written permission.
19 * THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
22 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR
23 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
26 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
27 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
29 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
46 #include "byte-swapping.h"
51 valid_range (struct connection
*conn
, uint64_t offset
, uint32_t count
)
53 uint64_t exportsize
= backend_get_size (backend
, conn
);
55 assert (exportsize
<= INT64_MAX
); /* Guaranteed by negotiation phase */
56 return count
> 0 && offset
<= exportsize
&& offset
+ count
<= exportsize
;
/* Decide whether a client request may be executed.
 *
 * Checks visible in the body, in order:
 *  - WRITE, TRIM and WRITE_ZEROES are refused with the message
 *    "write request on readonly connection";
 *  - commands that carry a range (the case labels visible here are
 *    WRITE_ZEROES and BLOCK_STATUS) must pass valid_range(); on
 *    failure the error is ENOSPC for WRITE and WRITE_ZEROES and
 *    EINVAL otherwise;
 *  - a further case requires offset and count to be zero;
 *  - unknown commands and unknown flag bits are rejected;
 *  - NO_HOLE is only accepted on WRITE_ZEROES, DF only on READ and
 *    only with structured replies, REQ_ONE only on BLOCK_STATUS;
 *  - FUA needs conn->can_fua;
 *  - READ and WRITE may not exceed MAX_REQUEST_SIZE;
 *  - FLUSH, TRIM, WRITE_ZEROES and CACHE need conn->can_flush,
 *    conn->can_trim, conn->can_zero and conn->can_cache;
 *  - BLOCK_STATUS needs structured replies and the base:allocation
 *    meta context.
 *
 * Every rejection is logged with nbdkit_error().  The range check
 * stores its errno value through the error pointer.  Returns true
 * when the command validates.
 */
60 validate_request (struct connection
*conn
,
61 uint16_t cmd
, uint16_t flags
, uint64_t offset
, uint32_t count
,
64 /* Readonly connection? */
66 (cmd
== NBD_CMD_WRITE
|| cmd
== NBD_CMD_TRIM
||
67 cmd
== NBD_CMD_WRITE_ZEROES
)) {
68 nbdkit_error ("invalid request: %s: write request on readonly connection",
69 name_of_nbd_cmd (cmd
));
74 /* Validate cmd, offset, count. */
80 case NBD_CMD_WRITE_ZEROES
:
81 case NBD_CMD_BLOCK_STATUS
:
82 if (!valid_range (conn
, offset
, count
)) {
83 /* XXX Allow writes to extend the disk? */
84 nbdkit_error ("invalid request: %s: offset and count are out of range: "
85 "offset=%" PRIu64
" count=%" PRIu32
,
86 name_of_nbd_cmd (cmd
), offset
, count
);
87 *error
= (cmd
== NBD_CMD_WRITE
||
88 cmd
== NBD_CMD_WRITE_ZEROES
) ? ENOSPC
: EINVAL
;
94 if (offset
!= 0 || count
!= 0) {
95 nbdkit_error ("invalid request: %s: expecting offset and count = 0",
96 name_of_nbd_cmd (cmd
));
103 nbdkit_error ("invalid request: unknown command (%" PRIu32
") ignored",
110 if (flags
& ~(NBD_CMD_FLAG_FUA
| NBD_CMD_FLAG_NO_HOLE
|
111 NBD_CMD_FLAG_DF
| NBD_CMD_FLAG_REQ_ONE
)) {
112 nbdkit_error ("invalid request: unknown flag (0x%x)", flags
);
116 if ((flags
& NBD_CMD_FLAG_NO_HOLE
) &&
117 cmd
!= NBD_CMD_WRITE_ZEROES
) {
118 nbdkit_error ("invalid request: NO_HOLE flag needs WRITE_ZEROES request");
122 if (flags
& NBD_CMD_FLAG_DF
) {
123 if (cmd
!= NBD_CMD_READ
) {
124 nbdkit_error ("invalid request: DF flag needs READ request");
128 if (!conn
->structured_replies
) {
129 nbdkit_error ("invalid request: "
130 "%s: structured replies was not negotiated",
131 name_of_nbd_cmd (cmd
));
136 if ((flags
& NBD_CMD_FLAG_REQ_ONE
) &&
137 cmd
!= NBD_CMD_BLOCK_STATUS
) {
138 nbdkit_error ("invalid request: REQ_ONE flag needs BLOCK_STATUS request");
142 if (!conn
->can_fua
&& (flags
& NBD_CMD_FLAG_FUA
)) {
143 nbdkit_error ("invalid request: FUA flag not supported");
148 /* Refuse over-large read and write requests. */
149 if ((cmd
== NBD_CMD_WRITE
|| cmd
== NBD_CMD_READ
) &&
150 count
> MAX_REQUEST_SIZE
) {
151 nbdkit_error ("invalid request: %s: data request is too large (%" PRIu32
153 name_of_nbd_cmd (cmd
), count
, MAX_REQUEST_SIZE
);
159 if (!conn
->can_flush
&& cmd
== NBD_CMD_FLUSH
) {
160 nbdkit_error ("invalid request: %s: flush operation not supported",
161 name_of_nbd_cmd (cmd
));
167 if (!conn
->can_trim
&& cmd
== NBD_CMD_TRIM
) {
168 nbdkit_error ("invalid request: %s: trim operation not supported",
169 name_of_nbd_cmd (cmd
));
175 if (!conn
->can_zero
&& cmd
== NBD_CMD_WRITE_ZEROES
) {
176 nbdkit_error ("invalid request: %s: write zeroes operation not supported",
177 name_of_nbd_cmd (cmd
));
183 if (!conn
->can_cache
&& cmd
== NBD_CMD_CACHE
) {
184 nbdkit_error ("invalid request: %s: cache operation not supported",
185 name_of_nbd_cmd (cmd
));
190 /* Block status allowed? */
191 if (cmd
== NBD_CMD_BLOCK_STATUS
) {
192 if (!conn
->structured_replies
) {
193 nbdkit_error ("invalid request: "
194 "%s: structured replies was not negotiated",
195 name_of_nbd_cmd (cmd
));
199 if (!conn
->meta_context_base_allocation
) {
200 nbdkit_error ("invalid request: "
201 "%s: base:allocation was not negotiated",
202 name_of_nbd_cmd (cmd
));
208 return true; /* Command validates. */
/* Execute one validated request against the backend.
 *
 * Backend calls visible in the body: backend_pread, backend_pwrite,
 * backend_flush, backend_trim, backend_cache, backend_zero and
 * backend_extents.  The thread-local error is cleared first with
 * threadlocal_set_error (0).
 *
 * NBDKIT_FLAG_FUA can be OR-ed into the flags handed to pwrite, trim
 * and zero; zero also gets NBDKIT_FLAG_MAY_TRIM unless the client set
 * NBD_CMD_FLAG_NO_HOLE.  When conn->emulate_cache is set, caching is
 * emulated by reading into a static sink buffer, at most
 * MAX_REQUEST_SIZE bytes per backend_pread call.  For block status,
 * backend_extents is called only when conn->can_extents is set, with
 * NBDKIT_FLAG_REQ_ONE forwarded from the request flags; the other
 * path adds one extent of type 0 spanning offset and count.
 */
211 /* This is called with the request lock held to actually execute the
212 * request (by calling the plugin). Note that the request fields have
213 * been validated already in 'validate_request' so we don't have to
216 * 'buf' is either the data to be written or the data to be returned,
217 * and points to a buffer of size 'count' bytes.
219 * 'extents' is an empty extents list used for block status requests
222 * In all cases, the return value is the system errno value that will
223 * later be converted to the nbd error to send back to the client (0
227 handle_request (struct connection
*conn
,
228 uint16_t cmd
, uint16_t flags
, uint64_t offset
, uint32_t count
,
229 void *buf
, struct nbdkit_extents
*extents
)
232 bool fua
= conn
->can_fua
&& (flags
& NBD_CMD_FLAG_FUA
);
235 /* Clear the error, so that we know if the plugin calls
236 * nbdkit_set_error() or relied on errno. */
237 threadlocal_set_error (0);
241 if (backend_pread (backend
, conn
, buf
, count
, offset
, 0, &err
) == -1)
247 f
|= NBDKIT_FLAG_FUA
;
248 if (backend_pwrite (backend
, conn
, buf
, count
, offset
, f
, &err
) == -1)
253 if (backend_flush (backend
, conn
, 0, &err
) == -1)
259 f
|= NBDKIT_FLAG_FUA
;
260 if (backend_trim (backend
, conn
, count
, offset
, f
, &err
) == -1)
265 if (conn
->emulate_cache
) {
266 static char buf
[MAX_REQUEST_SIZE
]; /* data sink, never read */
270 limit
= MIN (count
, sizeof buf
);
271 if (backend_pread (backend
, conn
, buf
, limit
, offset
, flags
,
277 else if (backend_cache (backend
, conn
, count
, offset
, 0, &err
) == -1)
281 case NBD_CMD_WRITE_ZEROES
:
282 if (!(flags
& NBD_CMD_FLAG_NO_HOLE
))
283 f
|= NBDKIT_FLAG_MAY_TRIM
;
285 f
|= NBDKIT_FLAG_FUA
;
286 if (backend_zero (backend
, conn
, count
, offset
, f
, &err
) == -1)
290 case NBD_CMD_BLOCK_STATUS
:
291 /* The other backend methods don't check can_*. That is because
292 * those methods are implicitly suppressed by returning eflags to
293 * the client. However there is no eflag for extents so we must
296 if (conn
->can_extents
) {
297 if (flags
& NBD_CMD_FLAG_REQ_ONE
)
298 f
|= NBDKIT_FLAG_REQ_ONE
;
299 if (backend_extents (backend
, conn
, count
, offset
, f
,
300 extents
, &err
) == -1)
306 /* By default it is safe assume that everything in the range is
310 r
= nbdkit_add_extent (extents
, offset
, count
, 0 /* allocated data */);
312 return errno
? errno
: EINVAL
;
/* Read and discard the payload of a write request from sock.
 *
 * A count larger than twice MAX_REQUEST_SIZE is refused with the
 * message "write request too large to skip".  Data is pulled with
 * read(), asking for at most BUFSIZ bytes per call.  A read failure
 * and an early end of file are each logged with nbdkit_error().
 * NOTE(review): callers treat a negative result as failure - confirm
 * the exact return values against the full function.
 */
325 skip_over_write_buffer (int sock
, size_t count
)
330 if (count
> MAX_REQUEST_SIZE
* 2) {
331 nbdkit_error ("write request too large to skip");
336 r
= read (sock
, buf
, count
> BUFSIZ
? BUFSIZ
: count
);
338 nbdkit_error ("skipping write buffer: %m");
342 nbdkit_error ("unexpected early EOF");
/* Map a system errno value (error) to the NBD_E* code sent on the
 * wire.  NBD_ESHUTDOWN and NBD_EOVERFLOW are among the possible
 * results.  The simple reply path passes false for flag_df; the
 * structured error path passes the DF bit of the request flags.
 * NOTE(review): presumably flag_df decides whether NBD_EOVERFLOW may
 * be returned - confirm against the full mapping.
 */
351 /* Convert a system errno to an NBD_E* error code. */
353 nbd_errno (int error
, bool flag_df
)
373 return NBD_ESHUTDOWN
;
377 return NBD_EOVERFLOW
;
/* Send a simple (non-structured) reply for one request.
 *
 * conn->write_lock is taken for the current scope.  The header
 * carries the simple reply magic, the handle exactly as passed in,
 * and the error converted by nbd_errno() with the DF flag given as
 * false.  For NBD_CMD_READ without error the header is sent with
 * SEND_MORE and is followed by count bytes from buf; in every other
 * case the header is sent with flags 0 and nothing follows.
 *
 * The error paths log with nbdkit_error() and return the result of
 * connection_set_status (conn, -1).  Returns 1 once the command has
 * been processed.
 */
386 send_simple_reply (struct connection
*conn
,
387 uint64_t handle
, uint16_t cmd
,
388 const char *buf
, uint32_t count
,
391 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&conn
->write_lock
);
392 struct simple_reply reply
;
394 int f
= (cmd
== NBD_CMD_READ
&& !error
) ? SEND_MORE
: 0;
396 reply
.magic
= htobe32 (NBD_SIMPLE_REPLY_MAGIC
);
397 reply
.handle
= handle
;
398 reply
.error
= htobe32 (nbd_errno (error
, false));
400 r
= conn
->send (conn
, &reply
, sizeof reply
, f
);
402 nbdkit_error ("write reply: %s: %m", name_of_nbd_cmd (cmd
));
403 return connection_set_status (conn
, -1);
406 /* Send the read data buffer. */
407 if (cmd
== NBD_CMD_READ
&& !error
) {
408 r
= conn
->send (conn
, buf
, count
, 0);
410 nbdkit_error ("write data: %s: %m", name_of_nbd_cmd (cmd
));
411 return connection_set_status (conn
, -1);
415 return 1; /* command processed ok */
/* Send the structured reply carrying the data of an NBD_CMD_READ
 * (the command is asserted).
 *
 * conn->write_lock is held for the whole function.  A single chunk of
 * type NBD_REPLY_TYPE_OFFSET_DATA, flagged NBD_REPLY_FLAG_DONE, goes
 * out in three conn->send calls:
 *   1. the reply header, whose length field is count plus the size of
 *      the offset header;
 *   2. the offset, converted to big endian;
 *   3. count bytes from buf.
 * The first two sends pass SEND_MORE and the last passes 0.
 *
 * The error paths log with nbdkit_error() and return the result of
 * connection_set_status (conn, -1).  Returns 1 once the command has
 * been processed.
 */
419 send_structured_reply_read (struct connection
*conn
,
420 uint64_t handle
, uint16_t cmd
,
421 const char *buf
, uint32_t count
, uint64_t offset
)
423 /* Once we are really using structured replies and sending data back
424 * in chunks, we'll be able to grab the write lock for each chunk,
425 * allowing other threads to interleave replies. As we're not doing
426 * that yet we acquire the lock for the whole function.
428 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&conn
->write_lock
);
429 struct structured_reply reply
;
430 struct structured_reply_offset_data offset_data
;
433 assert (cmd
== NBD_CMD_READ
);
435 reply
.magic
= htobe32 (NBD_STRUCTURED_REPLY_MAGIC
);
436 reply
.handle
= handle
;
437 reply
.flags
= htobe16 (NBD_REPLY_FLAG_DONE
);
438 reply
.type
= htobe16 (NBD_REPLY_TYPE_OFFSET_DATA
);
439 reply
.length
= htobe32 (count
+ sizeof offset_data
);
441 r
= conn
->send (conn
, &reply
, sizeof reply
, SEND_MORE
);
443 nbdkit_error ("write reply: %s: %m", name_of_nbd_cmd (cmd
));
444 return connection_set_status (conn
, -1);
447 /* Send the offset + read data buffer. */
448 offset_data
.offset
= htobe64 (offset
);
449 r
= conn
->send (conn
, &offset_data
, sizeof offset_data
, SEND_MORE
);
451 nbdkit_error ("write data: %s: %m", name_of_nbd_cmd (cmd
));
452 return connection_set_status (conn
, -1);
455 r
= conn
->send (conn
, buf
, count
, 0);
457 nbdkit_error ("write data: %s: %m", name_of_nbd_cmd (cmd
));
458 return connection_set_status (conn
, -1);
461 return 1; /* command processed ok */
/* Build the array of block descriptors for a block status reply.
 *
 * The array is allocated with calloc: one entry when
 * NBD_CMD_FLAG_REQ_ONE is set in flags, otherwise one entry per
 * extent in the list (at least one extent is asserted).
 *
 * Single block form: the first extent is used and its length is
 * clamped to count.
 * Loop form: each extent length is clamped to UINT32_MAX, and the
 * block that runs past offset + count is marked as the last one.
 *
 * In both forms status_flags keeps only the low two bits of the
 * extent type.  Each block is logged with nbdkit_debug(), then length
 * and status_flags are converted to big endian for the wire.  The
 * number of blocks filled in is reported through nr_blocks.
 * NOTE(review): the caller declares the receiving pointer with
 * CLEANUP_FREE, so the array looks caller-owned - confirm.
 */
464 /* Convert a list of extents into NBD_REPLY_TYPE_BLOCK_STATUS blocks.
465 * The rules here are very complicated. Read the spec carefully!
467 static struct block_descriptor
*
468 extents_to_block_descriptors (struct nbdkit_extents
*extents
,
470 uint32_t count
, uint64_t offset
,
473 const bool req_one
= flags
& NBD_CMD_FLAG_REQ_ONE
;
474 const size_t nr_extents
= nbdkit_extents_count (extents
);
476 struct block_descriptor
*blocks
;
478 /* This is checked in server/plugins.c. */
479 assert (nr_extents
>= 1);
481 /* We may send fewer than nr_extents blocks, but never more. */
482 blocks
= calloc (req_one
? 1 : nr_extents
, sizeof (struct block_descriptor
));
483 if (blocks
== NULL
) {
484 nbdkit_error ("calloc: %m");
489 const struct nbdkit_extent e
= nbdkit_get_extent (extents
, 0);
491 /* Checked as a side effect of how the extent list is created. */
492 assert (e
.length
> 0);
496 /* Must not exceed count of the original request. */
497 blocks
[0].length
= MIN (e
.length
, (uint64_t) count
);
498 blocks
[0].status_flags
= e
.type
& 3;
501 uint64_t pos
= offset
;
504 for (i
= 0; i
< nr_extents
; ++i
) {
505 const struct nbdkit_extent e
= nbdkit_get_extent (extents
, i
);
509 assert (e
.offset
== offset
);
511 /* Must not exceed UINT32_MAX. */
512 blocks
[i
].length
= length
= MIN (e
.length
, UINT32_MAX
);
513 blocks
[i
].status_flags
= e
.type
& 3;
517 if (pos
> offset
+ count
) /* this must be the last block */
520 /* If we reach here then we must have consumed this whole
521 * extent. This is currently true because the server only sends
522 * 32 bit requests, but if we move to 64 bit requests we will
523 * need to revisit this code so it can split extents into
524 * multiple blocks. XXX
526 assert (e
.length
<= length
);
531 for (i
= 0; i
< *nr_blocks
; ++i
)
532 nbdkit_debug ("block status: sending block %" PRIu32
" type %" PRIu32
,
533 blocks
[i
].length
, blocks
[i
].status_flags
);
536 /* Convert to big endian for the protocol. */
537 for (i
= 0; i
< *nr_blocks
; ++i
) {
538 blocks
[i
].length
= htobe32 (blocks
[i
].length
);
539 blocks
[i
].status_flags
= htobe32 (blocks
[i
].status_flags
);
/* Send the structured reply for an NBD_CMD_BLOCK_STATUS request.
 *
 * Asserts that the base:allocation meta context was negotiated and
 * that cmd is NBD_CMD_BLOCK_STATUS.  conn->write_lock is taken for
 * the current scope.  The extents are first turned into block
 * descriptors by extents_to_block_descriptors().
 *
 * One chunk of type NBD_REPLY_TYPE_BLOCK_STATUS, flagged
 * NBD_REPLY_FLAG_DONE, is then sent: the reply header (length = size
 * of the context id plus nr_blocks descriptors), the big endian
 * base:allocation context id, then the descriptors one by one.  Every
 * send passes SEND_MORE except the one for the final descriptor.
 *
 * The error paths return the result of connection_set_status
 * (conn, -1).  Returns 1 once the command has been processed.
 */
546 send_structured_reply_block_status (struct connection
*conn
,
548 uint16_t cmd
, uint16_t flags
,
549 uint32_t count
, uint64_t offset
,
550 struct nbdkit_extents
*extents
)
552 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&conn
->write_lock
);
553 struct structured_reply reply
;
554 CLEANUP_FREE
struct block_descriptor
*blocks
= NULL
;
560 assert (conn
->meta_context_base_allocation
);
561 assert (cmd
== NBD_CMD_BLOCK_STATUS
);
563 blocks
= extents_to_block_descriptors (extents
, flags
, count
, offset
,
566 return connection_set_status (conn
, -1);
568 reply
.magic
= htobe32 (NBD_STRUCTURED_REPLY_MAGIC
);
569 reply
.handle
= handle
;
570 reply
.flags
= htobe16 (NBD_REPLY_FLAG_DONE
);
571 reply
.type
= htobe16 (NBD_REPLY_TYPE_BLOCK_STATUS
);
572 reply
.length
= htobe32 (sizeof context_id
+
573 nr_blocks
* sizeof (struct block_descriptor
));
575 r
= conn
->send (conn
, &reply
, sizeof reply
, SEND_MORE
);
577 nbdkit_error ("write reply: %s: %m", name_of_nbd_cmd (cmd
));
578 return connection_set_status (conn
, -1);
581 /* Send the base:allocation context ID. */
582 context_id
= htobe32 (base_allocation_id
);
583 r
= conn
->send (conn
, &context_id
, sizeof context_id
, SEND_MORE
);
585 nbdkit_error ("write reply: %s: %m", name_of_nbd_cmd (cmd
));
586 return connection_set_status (conn
, -1);
589 /* Send each block descriptor. */
590 for (i
= 0; i
< nr_blocks
; ++i
) {
591 r
= conn
->send (conn
, &blocks
[i
], sizeof blocks
[i
],
592 i
== nr_blocks
- 1 ? 0 : SEND_MORE
);
594 nbdkit_error ("write reply: %s: %m", name_of_nbd_cmd (cmd
));
595 return connection_set_status (conn
, -1);
599 return 1; /* command processed ok */
/* Send a structured error reply.
 *
 * conn->write_lock is taken for the current scope.  One chunk of type
 * NBD_REPLY_TYPE_ERROR, flagged NBD_REPLY_FLAG_DONE, is sent in two
 * conn->send calls: the reply header (length = size of the error
 * payload, with no message text), then the payload.  The payload
 * holds the error converted by nbd_errno(), which is told whether the
 * request carried NBD_CMD_FLAG_DF, and a message length of 0.
 *
 * The error paths log with nbdkit_error() and return the result of
 * connection_set_status (conn, -1).  Returns 1 once the command has
 * been processed.
 */
603 send_structured_reply_error (struct connection
*conn
,
604 uint64_t handle
, uint16_t cmd
, uint16_t flags
,
607 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&conn
->write_lock
);
608 struct structured_reply reply
;
609 struct structured_reply_error error_data
;
612 reply
.magic
= htobe32 (NBD_STRUCTURED_REPLY_MAGIC
);
613 reply
.handle
= handle
;
614 reply
.flags
= htobe16 (NBD_REPLY_FLAG_DONE
);
615 reply
.type
= htobe16 (NBD_REPLY_TYPE_ERROR
);
616 reply
.length
= htobe32 (0 /* no human readable error */ + sizeof error_data
);
618 r
= conn
->send (conn
, &reply
, sizeof reply
, SEND_MORE
);
620 nbdkit_error ("write error reply: %m");
621 return connection_set_status (conn
, -1);
624 /* Send the error. */
625 error_data
.error
= htobe32 (nbd_errno (error
, flags
& NBD_CMD_FLAG_DF
));
626 error_data
.len
= htobe16 (0);
627 r
= conn
->send (conn
, &error_data
, sizeof error_data
, 0);
629 nbdkit_error ("write data: %s: %m", name_of_nbd_cmd (cmd
));
630 return connection_set_status (conn
, -1);
632 /* No human readable error message at the moment. */
634 return 1; /* command processed ok */
/* Receive one request from the client, execute it and send the reply.
 *
 * Steps visible in the body:
 *  - with conn->read_lock held: check the connection status, receive
 *    the request header through conn->recv, verify the request magic
 *    and decode flags, type, offset and count from big endian;
 *  - NBD_CMD_DISC closes the connection (status 0);
 *  - validate_request() is applied; when it fails for a WRITE the
 *    payload is discarded with skip_over_write_buffer();
 *  - READ and WRITE obtain the per-thread buffer from
 *    threadlocal_buffer(), which must not be freed here;
 *  - BLOCK_STATUS allocates a new extents list, released through
 *    CLEANUP_EXTENTS_FREE;
 *  - the payload of a WRITE is received into the buffer;
 *  - handle_request() runs the command and unlock_request() follows;
 *  - the reply is sent by send_structured_reply_read(),
 *    send_structured_reply_block_status(),
 *    send_structured_reply_error() or send_simple_reply().
 *
 * The structured senders are used only for READ and BLOCK_STATUS, and
 * only when conn->structured_replies is set.  Before an error reply
 * is sent the real errno text is written to the debug log.
 */
638 protocol_recv_request_send_reply (struct connection
*conn
)
641 struct request request
;
643 uint32_t magic
, count
, error
= 0;
646 CLEANUP_EXTENTS_FREE
struct nbdkit_extents
*extents
= NULL
;
648 /* Read the request packet. */
650 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&conn
->read_lock
);
651 r
= connection_get_status (conn
);
654 r
= conn
->recv (conn
, &request
, sizeof request
);
656 nbdkit_error ("read request: %m");
657 return connection_set_status (conn
, -1);
660 debug ("client closed input socket, closing connection");
661 return connection_set_status (conn
, 0); /* disconnect */
664 magic
= be32toh (request
.magic
);
665 if (magic
!= NBD_REQUEST_MAGIC
) {
666 nbdkit_error ("invalid request: 'magic' field is incorrect (0x%x)",
668 return connection_set_status (conn
, -1);
671 flags
= be16toh (request
.flags
);
672 cmd
= be16toh (request
.type
);
674 offset
= be64toh (request
.offset
);
675 count
= be32toh (request
.count
);
677 if (cmd
== NBD_CMD_DISC
) {
678 debug ("client sent %s, closing connection", name_of_nbd_cmd (cmd
));
679 return connection_set_status (conn
, 0); /* disconnect */
682 /* Validate the request. */
683 if (!validate_request (conn
, cmd
, flags
, offset
, count
, &error
)) {
684 if (cmd
== NBD_CMD_WRITE
&&
685 skip_over_write_buffer (conn
->sockin
, count
) < 0)
686 return connection_set_status (conn
, -1);
690 /* Get the data buffer used for either read or write requests.
691 * This is a common per-thread data buffer, it must not be freed.
693 if (cmd
== NBD_CMD_READ
|| cmd
== NBD_CMD_WRITE
) {
694 buf
= threadlocal_buffer ((size_t) count
);
697 if (cmd
== NBD_CMD_WRITE
&&
698 skip_over_write_buffer (conn
->sockin
, count
) < 0)
699 return connection_set_status (conn
, -1);
704 /* Allocate the extents list for block status only. */
705 if (cmd
== NBD_CMD_BLOCK_STATUS
) {
706 extents
= nbdkit_extents_new (offset
, backend_get_size (backend
, conn
));
707 if (extents
== NULL
) {
713 /* Receive the write data buffer. */
714 if (cmd
== NBD_CMD_WRITE
) {
715 r
= conn
->recv (conn
, buf
, count
);
721 nbdkit_error ("read data: %s: %m", name_of_nbd_cmd (cmd
));
722 return connection_set_status (conn
, -1);
727 /* Perform the request. Only this part happens inside the request lock. */
728 if (quit
|| !connection_get_status (conn
)) {
733 error
= handle_request (conn
, cmd
, flags
, offset
, count
, buf
, extents
);
734 assert ((int) error
>= 0);
735 unlock_request (conn
);
738 /* Send the reply packet. */
740 if (connection_get_status (conn
) < 0)
744 /* Since we're about to send only the limited NBD_E* errno to the
745 * client, don't lose the information about what really happened
746 * on the server side. Make sure there is a way for the operator
747 * to retrieve the real error.
749 debug ("sending error reply: %s", strerror (error
));
752 /* Currently we prefer to send simple replies for everything except
753 * where we have to (ie. NBD_CMD_READ and NBD_CMD_BLOCK_STATUS when
754 * structured_replies have been negotiated). However this prevents
755 * us from sending human-readable error messages to the client, so
756 * we should reconsider this in future.
758 if (conn
->structured_replies
&&
759 (cmd
== NBD_CMD_READ
|| cmd
== NBD_CMD_BLOCK_STATUS
)) {
761 if (cmd
== NBD_CMD_READ
)
762 return send_structured_reply_read (conn
, request
.handle
, cmd
,
764 else /* NBD_CMD_BLOCK_STATUS */
765 return send_structured_reply_block_status (conn
, request
.handle
,
771 return send_structured_reply_error (conn
, request
.handle
, cmd
, flags
,
775 return send_simple_reply (conn
, request
.handle
, cmd
, buf
, count
, error
);