4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
11 * * Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * * Neither the name of Red Hat nor the names of its contributors may be
16 * used to endorse or promote products derived from this software without
17 * specific prior written permission.
19 * THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
22 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR
23 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
26 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
27 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
29 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
46 #include "byte-swapping.h"
48 #include "nbd-protocol.h"
49 #include "protostrings.h"
/* Validate an incoming NBD request before it is executed.
 * Returns true when the command validates; on failure an error is
 * reported via nbdkit_error() and *error is set to the errno to send
 * back to the client.
 * NOTE(review): this extract is fragmentary — several original lines
 * (the switch header, case labels and 'return false' paths) are
 * missing from this view, so comments below are hedged where the
 * surrounding structure cannot be seen.
 */
52 validate_request (uint16_t cmd
, uint16_t flags
, uint64_t offset
, uint32_t count
,
/* Reject any write-class command (WRITE, TRIM, WRITE_ZEROES) on a
 * connection that was negotiated read-only (NBD_FLAG_READ_ONLY in
 * conn->eflags).
 */
57 /* Readonly connection? */
58 if (conn
->eflags
& NBD_FLAG_READ_ONLY
&&
59 (cmd
== NBD_CMD_WRITE
|| cmd
== NBD_CMD_TRIM
||
60 cmd
== NBD_CMD_WRITE_ZEROES
)) {
61 nbdkit_error ("invalid request: %s: write request on readonly connection",
62 name_of_nbd_cmd (cmd
));
/* For commands that carry an offset/count (WRITE_ZEROES, BLOCK_STATUS
 * and — presumably, in elided case labels — READ/WRITE/TRIM/CACHE),
 * check the range against the export size via backend_valid_range().
 */
67 /* Validate cmd, offset, count. */
73 case NBD_CMD_WRITE_ZEROES
:
74 case NBD_CMD_BLOCK_STATUS
:
75 if (!backend_valid_range (conn
->top_context
, offset
, count
)) {
76 /* XXX Allow writes to extend the disk? */
77 nbdkit_error ("invalid request: %s: offset and count are out of range: "
78 "offset=%" PRIu64
" count=%" PRIu32
,
79 name_of_nbd_cmd (cmd
), offset
, count
);
/* Out-of-range writes report ENOSPC, everything else EINVAL. */
80 *error
= (cmd
== NBD_CMD_WRITE
||
81 cmd
== NBD_CMD_WRITE_ZEROES
) ? ENOSPC
: EINVAL
;
/* Commands with no payload (presumably FLUSH; case label elided)
 * must carry zero offset and count.
 */
87 if (offset
!= 0 || count
!= 0) {
88 nbdkit_error ("invalid request: %s: expecting offset and count = 0",
89 name_of_nbd_cmd (cmd
));
/* Unknown command numbers are rejected (default case elided). */
96 nbdkit_error ("invalid request: unknown command (%" PRIu32
") ignored",
/* Reject any flag bits outside the set this server understands. */
103 if (flags
& ~(NBD_CMD_FLAG_FUA
| NBD_CMD_FLAG_NO_HOLE
|
104 NBD_CMD_FLAG_DF
| NBD_CMD_FLAG_REQ_ONE
|
105 NBD_CMD_FLAG_FAST_ZERO
)) {
106 nbdkit_error ("invalid request: unknown flag (0x%x)", flags
);
/* Flag/command pairing rules: NO_HOLE and FAST_ZERO are only valid
 * with WRITE_ZEROES.
 */
110 if ((flags
& NBD_CMD_FLAG_NO_HOLE
) &&
111 cmd
!= NBD_CMD_WRITE_ZEROES
) {
112 nbdkit_error ("invalid request: NO_HOLE flag needs WRITE_ZEROES request");
116 if ((flags
& NBD_CMD_FLAG_FAST_ZERO
) &&
117 cmd
!= NBD_CMD_WRITE_ZEROES
) {
118 nbdkit_error ("invalid request: "
119 "FAST_ZERO flag needs WRITE_ZEROES request");
/* DF is only valid with READ, and only if structured replies were
 * negotiated during the handshake.
 */
123 if (flags
& NBD_CMD_FLAG_DF
) {
124 if (cmd
!= NBD_CMD_READ
) {
125 nbdkit_error ("invalid request: DF flag needs READ request");
129 if (!conn
->structured_replies
) {
130 nbdkit_error ("invalid request: "
131 "%s: structured replies was not negotiated",
132 name_of_nbd_cmd (cmd
));
/* REQ_ONE is only valid with BLOCK_STATUS. */
137 if ((flags
& NBD_CMD_FLAG_REQ_ONE
) &&
138 cmd
!= NBD_CMD_BLOCK_STATUS
) {
139 nbdkit_error ("invalid request: REQ_ONE flag needs BLOCK_STATUS request");
/* FUA may only be used when the export advertised NBD_FLAG_SEND_FUA. */
143 if (flags
& NBD_CMD_FLAG_FUA
&& !(conn
->eflags
& NBD_FLAG_SEND_FUA
)) {
144 nbdkit_error ("invalid request: FUA flag not supported");
/* Cap READ/WRITE payloads at MAX_REQUEST_SIZE to bound buffering. */
149 /* Refuse over-large read and write requests. */
150 if ((cmd
== NBD_CMD_WRITE
|| cmd
== NBD_CMD_READ
) &&
151 count
> MAX_REQUEST_SIZE
) {
152 nbdkit_error ("invalid request: %s: data request is too large (%" PRIu32
154 name_of_nbd_cmd (cmd
), count
, MAX_REQUEST_SIZE
);
/* Each optional command (FLUSH/TRIM/WRITE_ZEROES/CACHE) is only
 * permitted when the matching NBD_FLAG_SEND_* bit was advertised in
 * the export flags.
 */
160 if (cmd
== NBD_CMD_FLUSH
&& !(conn
->eflags
& NBD_FLAG_SEND_FLUSH
)) {
161 nbdkit_error ("invalid request: %s: flush operation not supported",
162 name_of_nbd_cmd (cmd
));
168 if (cmd
== NBD_CMD_TRIM
&& !(conn
->eflags
& NBD_FLAG_SEND_TRIM
)) {
169 nbdkit_error ("invalid request: %s: trim operation not supported",
170 name_of_nbd_cmd (cmd
));
176 if (cmd
== NBD_CMD_WRITE_ZEROES
&&
177 !(conn
->eflags
& NBD_FLAG_SEND_WRITE_ZEROES
)) {
178 nbdkit_error ("invalid request: %s: write zeroes operation not supported",
179 name_of_nbd_cmd (cmd
));
185 if (cmd
== NBD_CMD_CACHE
&& !(conn
->eflags
& NBD_FLAG_SEND_CACHE
)) {
186 nbdkit_error ("invalid request: %s: cache operation not supported",
187 name_of_nbd_cmd (cmd
));
/* BLOCK_STATUS additionally requires both structured replies and the
 * base:allocation meta context to have been negotiated.
 */
192 /* Block status allowed? */
193 if (cmd
== NBD_CMD_BLOCK_STATUS
) {
194 if (!conn
->structured_replies
) {
195 nbdkit_error ("invalid request: "
196 "%s: structured replies was not negotiated",
197 name_of_nbd_cmd (cmd
));
201 if (!conn
->meta_context_base_allocation
) {
202 nbdkit_error ("invalid request: "
203 "%s: base:allocation was not negotiated",
204 name_of_nbd_cmd (cmd
));
/* All checks passed. */
210 return true; /* Command validates. */
/* Execute an already-validated NBD request by dispatching to the
 * backend (plugin). Returns a system errno (0 on success) which the
 * caller translates to an NBD_E* code.
 * NOTE(review): the dispatching switch header, several case labels
 * (READ/WRITE/FLUSH/TRIM/CACHE) and the return statements are elided
 * from this extract; the per-command bodies below are hedged
 * accordingly.
 */
213 /* This is called with the request lock held to actually execute the
214 * request (by calling the plugin). Note that the request fields have
215 * been validated already in 'validate_request' so we don't have to
218 * 'buf' is either the data to be written or the data to be returned,
219 * and points to a buffer of size 'count' bytes.
221 * 'extents' is an empty extents list used for block status requests
224 * In all cases, the return value is the system errno value that will
225 * later be converted to the nbd error to send back to the client (0
229 handle_request (uint16_t cmd
, uint16_t flags
, uint64_t offset
, uint32_t count
,
230 void *buf
, struct nbdkit_extents
*extents
)
/* All backend calls go through the connection's top context. */
233 struct context
*c
= conn
->top_context
;
237 /* Clear the error, so that we know if the plugin calls
238 * nbdkit_set_error() or relied on errno. */
239 threadlocal_set_error (0);
/* READ: fill buf from the backend (case label elided above). */
243 if (backend_pread (c
, buf
, count
, offset
, 0, &err
) == -1)
/* WRITE: translate the FUA request flag to the plugin flag, then
 * write buf to the backend.
 */
248 if (flags
& NBD_CMD_FLAG_FUA
)
249 f
|= NBDKIT_FLAG_FUA
;
250 if (backend_pwrite (c
, buf
, count
, offset
, f
, &err
) == -1)
/* FLUSH. */
255 if (backend_flush (c
, 0, &err
) == -1)
/* TRIM: FUA flag is honoured here too. */
260 if (flags
& NBD_CMD_FLAG_FUA
)
261 f
|= NBDKIT_FLAG_FUA
;
262 if (backend_trim (c
, count
, offset
, f
, &err
) == -1)
/* CACHE. */
267 if (backend_cache (c
, count
, offset
, 0, &err
) == -1)
/* WRITE_ZEROES: absence of NO_HOLE permits the backend to trim;
 * FUA and FAST_ZERO map directly to plugin flags.
 */
271 case NBD_CMD_WRITE_ZEROES
:
272 if (!(flags
& NBD_CMD_FLAG_NO_HOLE
))
273 f
|= NBDKIT_FLAG_MAY_TRIM
;
274 if (flags
& NBD_CMD_FLAG_FUA
)
275 f
|= NBDKIT_FLAG_FUA
;
276 if (flags
& NBD_CMD_FLAG_FAST_ZERO
)
277 f
|= NBDKIT_FLAG_FAST_ZERO
;
278 if (backend_zero (c
, count
, offset
, f
, &err
) == -1)
/* BLOCK_STATUS: populate the caller-supplied extents list. */
282 case NBD_CMD_BLOCK_STATUS
:
283 if (flags
& NBD_CMD_FLAG_REQ_ONE
)
284 f
|= NBDKIT_FLAG_REQ_ONE
;
285 if (backend_extents (c
, count
, offset
, f
,
286 extents
, &err
) == -1)
/* Drain and discard 'count' bytes of an incoming write payload from
 * the socket, used when a WRITE request failed validation but its
 * data is already in flight. Reads in BUFSIZ-sized chunks.
 * NOTE(review): the loop structure, local declarations and return
 * statements are elided from this extract; return convention appears
 * to be <0 on failure based on the callers visible below.
 */
298 skip_over_write_buffer (int sock
, size_t count
)
/* Refuse to drain absurdly large payloads (cap: 2x the server's own
 * request-size limit) rather than loop for a long time.
 */
303 if (count
> MAX_REQUEST_SIZE
* 2) {
304 nbdkit_error ("write request too large to skip");
/* Read up to BUFSIZ bytes per iteration into a scratch buffer
 * (declaration of 'buf'/'r' elided).
 */
309 r
= read (sock
, buf
, count
> BUFSIZ
? BUFSIZ
: count
);
/* read(2) error — %m formats errno. */
311 nbdkit_error ("skipping write buffer: %m");
/* Client closed the socket before sending all 'count' bytes. */
315 nbdkit_error ("unexpected early EOF");
/* Map a system errno to the limited NBD_E* wire error space.
 * 'flags' disambiguates cases where one errno maps to different NBD
 * errors depending on the request flags.
 * NOTE(review): most of the errno switch is elided from this extract;
 * only the ESHUTDOWN, ENOTSUP and EOVERFLOW-related fragments are
 * visible.
 */
324 /* Convert a system errno to an NBD_E* error code. */
326 nbd_errno (int error
, uint16_t flags
)
346 return NBD_ESHUTDOWN
;
/* On platforms where ENOTSUP and EOPNOTSUPP differ, both need a case
 * (the other case label is elided).
 */
349 #if ENOTSUP != EOPNOTSUPP
/* ENOTSUP from a FAST_ZERO request presumably maps to NBD_ENOTSUP —
 * the return is elided; verify against the full source.
 */
352 if (flags
& NBD_CMD_FLAG_FAST_ZERO
)
/* EOVERFLOW is only a valid wire error for DF-flagged reads. */
356 if (flags
& NBD_CMD_FLAG_DF
)
357 return NBD_EOVERFLOW
;
/* Send an NBD simple reply (header, then the data payload for a
 * successful READ). Serialized against other repliers by write_lock.
 * NOTE(review): the error-check lines around conn->send and the final
 * return are elided from this extract.
 */
366 send_simple_reply (uint64_t handle
, uint16_t cmd
, uint16_t flags
,
367 const char *buf
, uint32_t count
,
/* Hold the write lock for the whole reply so header and payload are
 * not interleaved with another thread's reply.
 */
371 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&conn
->write_lock
);
372 struct nbd_simple_reply reply
;
/* SEND_MORE hints that a data payload follows the header (only for a
 * successful READ).
 */
374 int f
= (cmd
== NBD_CMD_READ
&& !error
) ? SEND_MORE
: 0;
/* Build the wire header; multi-byte fields are big-endian. */
376 reply
.magic
= htobe32 (NBD_SIMPLE_REPLY_MAGIC
);
377 reply
.handle
= handle
;
378 reply
.error
= htobe32 (nbd_errno (error
, flags
));
380 r
= conn
->send (&reply
, sizeof reply
, f
);
/* Send failure kills the connection (error-check 'if' elided). */
382 nbdkit_error ("write reply: %s: %m", name_of_nbd_cmd (cmd
));
383 return connection_set_status (STATUS_DEAD
);
386 /* Send the read data buffer. */
387 if (cmd
== NBD_CMD_READ
&& !error
) {
388 r
= conn
->send (buf
, count
, 0);
390 nbdkit_error ("write data: %s: %m", name_of_nbd_cmd (cmd
));
391 return connection_set_status (STATUS_DEAD
);
/* Send a READ result as a single structured reply: one
 * NBD_REPLY_TYPE_OFFSET_DATA chunk marked DONE, i.e. header, then
 * the 64-bit offset, then the data buffer.
 * NOTE(review): the 'if (r == -1)' guards around each send and the
 * final return are elided from this extract.
 */
398 send_structured_reply_read (uint64_t handle
, uint16_t cmd
,
399 const char *buf
, uint32_t count
, uint64_t offset
)
402 /* Once we are really using structured replies and sending data back
403 * in chunks, we'll be able to grab the write lock for each chunk,
404 * allowing other threads to interleave replies. As we're not doing
405 * that yet we acquire the lock for the whole function.
407 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&conn
->write_lock
);
408 struct nbd_structured_reply reply
;
409 struct nbd_structured_reply_offset_data offset_data
;
/* Only READ replies use this path. */
412 assert (cmd
== NBD_CMD_READ
);
/* Wire header: DONE flag means this is the only chunk; length covers
 * the offset field plus the data payload.
 */
414 reply
.magic
= htobe32 (NBD_STRUCTURED_REPLY_MAGIC
);
415 reply
.handle
= handle
;
416 reply
.flags
= htobe16 (NBD_REPLY_FLAG_DONE
);
417 reply
.type
= htobe16 (NBD_REPLY_TYPE_OFFSET_DATA
);
418 reply
.length
= htobe32 (count
+ sizeof offset_data
);
420 r
= conn
->send (&reply
, sizeof reply
, SEND_MORE
);
422 nbdkit_error ("write reply: %s: %m", name_of_nbd_cmd (cmd
));
423 return connection_set_status (STATUS_DEAD
);
426 /* Send the offset + read data buffer. */
427 offset_data
.offset
= htobe64 (offset
);
428 r
= conn
->send (&offset_data
, sizeof offset_data
, SEND_MORE
);
430 nbdkit_error ("write data: %s: %m", name_of_nbd_cmd (cmd
));
431 return connection_set_status (STATUS_DEAD
);
/* Finally the data itself; no SEND_MORE, this completes the reply. */
434 r
= conn
->send (buf
, count
, 0);
436 nbdkit_error ("write data: %s: %m", name_of_nbd_cmd (cmd
));
437 return connection_set_status (STATUS_DEAD
);
/* Build an array of nbd_block_descriptor from the extents list for a
 * BLOCK_STATUS reply. Caller frees the returned array; returns NULL
 * on allocation failure.
 * NOTE(review): the req_one/else branch structure, the declarations
 * of 'i', 'length' and the '*nr_blocks' out-parameter, and the return
 * statements are elided from this extract.
 */
442 /* Convert a list of extents into NBD_REPLY_TYPE_BLOCK_STATUS blocks.
443 * The rules here are very complicated. Read the spec carefully!
445 static struct nbd_block_descriptor
*
446 extents_to_block_descriptors (struct nbdkit_extents
*extents
,
448 uint32_t count
, uint64_t offset
,
451 const bool req_one
= flags
& NBD_CMD_FLAG_REQ_ONE
;
452 const size_t nr_extents
= nbdkit_extents_count (extents
);
454 struct nbd_block_descriptor
*blocks
;
456 /* This is checked in server/plugins.c. */
457 assert (nr_extents
>= 1);
/* REQ_ONE means the client wants at most one descriptor, so only one
 * slot is needed in that case.
 */
459 /* We may send fewer than nr_extents blocks, but never more. */
460 blocks
= calloc (req_one
? 1 : nr_extents
,
461 sizeof (struct nbd_block_descriptor
));
462 if (blocks
== NULL
) {
463 nbdkit_error ("calloc: %m");
/* REQ_ONE path (branch structure elided): describe only the first
 * extent, clipped to the request count.
 */
468 const struct nbdkit_extent e
= nbdkit_get_extent (extents
, 0);
470 /* Checked as a side effect of how the extent list is created. */
471 assert (e
.length
> 0);
475 /* Must not exceed count of the original request. */
476 blocks
[0].length
= MIN (e
.length
, (uint64_t) count
);
/* Only the low two status bits (hole/zero) go on the wire. */
477 blocks
[0].status_flags
= e
.type
& 3;
/* Non-REQ_ONE path: walk every extent, tracking the running wire
 * position in 'pos'.
 */
480 uint64_t pos
= offset
;
483 for (i
= 0; i
< nr_extents
; ++i
) {
484 const struct nbdkit_extent e
= nbdkit_get_extent (extents
, i
);
/* Extents are expected to be contiguous from the request offset
 * (how 'offset' advances per iteration is elided).
 */
488 assert (e
.offset
== offset
);
490 /* Must not exceed UINT32_MAX. */
491 blocks
[i
].length
= length
= MIN (e
.length
, UINT32_MAX
);
492 blocks
[i
].status_flags
= e
.type
& 3;
/* The final descriptor may extend past the requested range; stop
 * once pos passes offset+count.
 */
496 if (pos
> offset
+ count
) /* this must be the last block */
499 /* If we reach here then we must have consumed this whole
500 * extent. This is currently true because the server only sends
501 * 32 bit requests, but if we move to 64 bit requests we will
502 * need to revisit this code so it can split extents into
503 * multiple blocks. XXX
505 assert (e
.length
<= length
);
/* Debug-log each descriptor about to be sent. */
510 for (i
= 0; i
< *nr_blocks
; ++i
)
511 debug ("block status: sending block %" PRIu32
" type %" PRIu32
,
512 blocks
[i
].length
, blocks
[i
].status_flags
);
/* Byte-swap in place after logging so the log shows host-order
 * values.
 */
515 /* Convert to big endian for the protocol. */
516 for (i
= 0; i
< *nr_blocks
; ++i
) {
517 blocks
[i
].length
= htobe32 (blocks
[i
].length
);
518 blocks
[i
].status_flags
= htobe32 (blocks
[i
].status_flags
);
/* Send a BLOCK_STATUS result as a structured reply: header, the
 * base:allocation context ID, then one nbd_block_descriptor per
 * block.
 * NOTE(review): declarations of 'r', 'i', 'nr_blocks' and
 * 'context_id', the send error-check guards, and the final return
 * are elided from this extract.
 */
525 send_structured_reply_block_status (uint64_t handle
,
526 uint16_t cmd
, uint16_t flags
,
527 uint32_t count
, uint64_t offset
,
528 struct nbdkit_extents
*extents
)
/* One reply at a time per connection. */
531 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&conn
->write_lock
);
532 struct nbd_structured_reply reply
;
/* CLEANUP_FREE frees 'blocks' automatically on scope exit. */
533 CLEANUP_FREE
struct nbd_block_descriptor
*blocks
= NULL
;
/* Both were enforced earlier by validate_request. */
539 assert (conn
->meta_context_base_allocation
);
540 assert (cmd
== NBD_CMD_BLOCK_STATUS
);
542 blocks
= extents_to_block_descriptors (extents
, flags
, count
, offset
,
/* Descriptor conversion failed (NULL-check elided): drop the
 * connection.
 */
545 return connection_set_status (STATUS_DEAD
);
/* Wire header: payload is the 32-bit context ID plus the descriptor
 * array.
 */
547 reply
.magic
= htobe32 (NBD_STRUCTURED_REPLY_MAGIC
);
548 reply
.handle
= handle
;
549 reply
.flags
= htobe16 (NBD_REPLY_FLAG_DONE
);
550 reply
.type
= htobe16 (NBD_REPLY_TYPE_BLOCK_STATUS
);
551 reply
.length
= htobe32 (sizeof context_id
+
552 nr_blocks
* sizeof (struct nbd_block_descriptor
));
554 r
= conn
->send (&reply
, sizeof reply
, SEND_MORE
);
556 nbdkit_error ("write reply: %s: %m", name_of_nbd_cmd (cmd
));
557 return connection_set_status (STATUS_DEAD
);
560 /* Send the base:allocation context ID. */
561 context_id
= htobe32 (base_allocation_id
);
562 r
= conn
->send (&context_id
, sizeof context_id
, SEND_MORE
);
564 nbdkit_error ("write reply: %s: %m", name_of_nbd_cmd (cmd
));
565 return connection_set_status (STATUS_DEAD
);
/* SEND_MORE on every descriptor except the last, which completes the
 * reply.
 */
568 /* Send each block descriptor. */
569 for (i
= 0; i
< nr_blocks
; ++i
) {
570 r
= conn
->send (&blocks
[i
], sizeof blocks
[i
],
571 i
== nr_blocks
- 1 ? 0 : SEND_MORE
);
573 nbdkit_error ("write reply: %s: %m", name_of_nbd_cmd (cmd
));
574 return connection_set_status (STATUS_DEAD
);
/* Send an error as a structured reply: an NBD_REPLY_TYPE_ERROR chunk
 * marked DONE, carrying the NBD_E* code and an empty message.
 * NOTE(review): the 'error' parameter, send error-check guards and
 * the final return are elided from this extract.
 */
581 send_structured_reply_error (uint64_t handle
, uint16_t cmd
, uint16_t flags
,
585 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&conn
->write_lock
);
586 struct nbd_structured_reply reply
;
587 struct nbd_structured_reply_error error_data
;
/* Wire header: payload length is just the error_data struct — no
 * human-readable message is attached (see comment at the end).
 */
590 reply
.magic
= htobe32 (NBD_STRUCTURED_REPLY_MAGIC
);
591 reply
.handle
= handle
;
592 reply
.flags
= htobe16 (NBD_REPLY_FLAG_DONE
);
593 reply
.type
= htobe16 (NBD_REPLY_TYPE_ERROR
);
594 reply
.length
= htobe32 (0 /* no human readable error */ + sizeof error_data
);
596 r
= conn
->send (&reply
, sizeof reply
, SEND_MORE
);
598 nbdkit_error ("write error reply: %m");
599 return connection_set_status (STATUS_DEAD
);
/* errno is translated to the NBD wire code, flags disambiguate
 * flag-dependent mappings; message length is zero.
 */
602 /* Send the error. */
603 error_data
.error
= htobe32 (nbd_errno (error
, flags
));
604 error_data
.len
= htobe16 (0);
605 r
= conn
->send (&error_data
, sizeof error_data
, 0);
607 nbdkit_error ("write data: %s: %m", name_of_nbd_cmd (cmd
));
608 return connection_set_status (STATUS_DEAD
);
610 /* No human readable error message at the moment. */
/* Top-level per-request loop body: receive one NBD request, validate
 * it, execute it under the request lock, and send the reply.
 * NOTE(review): many declarations (cmd, flags, offset, buf, r, cs),
 * several guard 'if's, lock scopes and some returns are elided from
 * this extract; comments are hedged where structure is missing.
 */
614 /* Do a recv/send sequence. Return true if the caller should shutdown. */
616 protocol_recv_request_send_reply (void)
621 struct nbd_request request
;
623 uint32_t magic
, count
, error
= 0;
/* Extents list is freed automatically on scope exit. */
626 CLEANUP_EXTENTS_FREE
struct nbdkit_extents
*extents
= NULL
;
628 /* Read the request packet. */
/* read_lock serializes request reception across threads. */
630 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&conn
->read_lock
);
631 r
= conn
->recv (&request
, sizeof request
);
/* Snapshot connection status; a shutdown may already be underway. */
632 cs
= connection_get_status ();
633 if (cs
<= STATUS_CLIENT_DONE
)
/* recv error (guard elided) — kill the connection. */
636 nbdkit_error ("read request: %m");
637 return connection_set_status (STATUS_DEAD
);
/* recv returned 0: clean EOF from the client. */
640 debug ("client closed input socket, closing connection");
641 return connection_set_status (STATUS_CLIENT_DONE
); /* disconnect */
/* Decode the big-endian request header. */
644 magic
= be32toh (request
.magic
);
645 if (magic
!= NBD_REQUEST_MAGIC
) {
646 nbdkit_error ("invalid request: 'magic' field is incorrect (0x%x)",
648 return connection_set_status (STATUS_DEAD
);
651 flags
= be16toh (request
.flags
);
652 cmd
= be16toh (request
.type
);
654 offset
= be64toh (request
.offset
);
655 count
= be32toh (request
.count
);
/* DISC is handled immediately — no reply is sent. */
657 if (cmd
== NBD_CMD_DISC
) {
658 debug ("client sent %s, closing connection", name_of_nbd_cmd (cmd
));
659 return connection_set_status (STATUS_CLIENT_DONE
); /* disconnect */
/* On validation failure of a WRITE the payload is already in flight
 * and must be drained to keep the stream in sync.
 */
662 /* Validate the request. */
663 if (!validate_request (cmd
, flags
, offset
, count
, &error
)) {
664 if (cmd
== NBD_CMD_WRITE
&&
665 skip_over_write_buffer (conn
->sockin
, count
) < 0) {
666 return connection_set_status (STATUS_DEAD
);
671 /* Get the data buffer used for either read or write requests.
672 * This is a common per-thread data buffer, it must not be freed.
674 if (cmd
== NBD_CMD_READ
|| cmd
== NBD_CMD_WRITE
) {
675 buf
= threadlocal_buffer ((size_t) count
);
/* Buffer allocation failed (guard elided): a failed WRITE still has
 * its payload on the wire and must be drained.
 */
678 if (cmd
== NBD_CMD_WRITE
&&
679 skip_over_write_buffer (conn
->sockin
, count
) < 0) {
680 return connection_set_status (STATUS_DEAD
);
686 /* Allocate the extents list for block status only. */
687 if (cmd
== NBD_CMD_BLOCK_STATUS
) {
688 extents
= nbdkit_extents_new (offset
,
689 backend_get_size (conn
->top_context
));
690 if (extents
== NULL
) {
696 /* Receive the write data buffer. */
697 if (cmd
== NBD_CMD_WRITE
) {
698 r
= conn
->recv (buf
, count
);
/* Short read / recv error (guard elided). */
704 nbdkit_error ("read data: %s: %m", name_of_nbd_cmd (cmd
));
705 return connection_set_status (STATUS_DEAD
);
/* Skip execution if the server is quitting or the connection is
 * already shutting down.
 */
710 /* Perform the request. Only this part happens inside the request lock. */
711 if (quit
|| cs
< STATUS_ACTIVE
) {
716 error
= handle_request (cmd
, flags
, offset
, count
, buf
, extents
);
/* handle_request returns a non-negative errno value. */
717 assert ((int) error
>= 0);
721 /* Send the reply packet. */
723 if (connection_get_status () < STATUS_CLIENT_DONE
)
727 /* Since we're about to send only the limited NBD_E* errno to the
728 * client, don't lose the information about what really happened
729 * on the server side. Make sure there is a way for the operator
730 * to retrieve the real error.
732 debug ("sending error reply: %s", strerror (error
));
735 /* Currently we prefer to send simple replies for everything except
736 * where we have to (ie. NBD_CMD_READ and NBD_CMD_BLOCK_STATUS when
737 * structured_replies have been negotiated). However this prevents
738 * us from sending human-readable error messages to the client, so
739 * we should reconsider this in future.
741 if (!conn
->structured_replies
||
742 (cmd
!= NBD_CMD_READ
&& cmd
!= NBD_CMD_BLOCK_STATUS
))
743 return send_simple_reply (request
.handle
, cmd
, flags
, buf
, count
, error
);
/* Structured-reply path: errors get an error chunk (guard on 'error'
 * elided), otherwise dispatch on command type.
 */
746 return send_structured_reply_error (request
.handle
, cmd
, flags
, error
);
748 if (cmd
== NBD_CMD_READ
)
749 return send_structured_reply_read (request
.handle
, cmd
, buf
, count
,
752 /* NBD_CMD_BLOCK_STATUS */
753 return send_structured_reply_block_status (request
.handle
, cmd
, flags
,
754 count
, offset
, extents
);