Update copyright statement
[nbd.git] / nbd-server.c
blob0ff7992bfed04ee04900616ac130586f8ae5b79c
1 /*
2 * Network Block Device - server
4 * Copyright 1996-1998 Pavel Machek, distribute under GPL
5 * <pavel@atrey.karlin.mff.cuni.cz>
6 * Copyright 2001-2004 Wouter Verhelst <wouter@debian.org>
7 * Copyright 2002 Anton Altaparmakov <aia21@cam.ac.uk>
9 * Version 1.0 - hopefully 64-bit-clean
10 * Version 1.1 - merging enhancements from Josh Parsons, <josh@coombs.anu.edu.au>
11 * Version 1.2 - autodetect size of block devices, thanks to Peter T. Breuer <ptb@it.uc3m.es>
12 * Version 1.5 - can compile on Unix systems that don't have 64 bit integer
13 * type, or don't have 64 bit file offsets by defining FS_32BIT
14 * in compile options for nbd-server *only*. This can be done
15 * with make FSCHOICE=-DFS_32BIT nbd-server. (I don't have the
16 * original autoconf input file, or I would make it a configure
17 * option.) Ken Yap <ken@nlc.net.au>.
18 * Version 1.6 - fix autodetection of block device size and really make 64 bit
19 * clean on 32 bit machines. Anton Altaparmakov <aia21@cam.ac.uk>
20 * Version 2.0 - Version synchronised with client
21 * Version 2.1 - Reap zombie client processes when they exit. Removed
22 * (uncommented) the _IO magic, it's no longer necessary. Wouter
23 * Verhelst <wouter@debian.org>
24 * Version 2.2 - Auto switch to read-only mode (useful for floppies).
25 * Version 2.3 - Fixed code so that Large File Support works. This
26 * removes the FS_32BIT compile-time directive; define
27 * _FILE_OFFSET_BITS=64 and _LARGEFILE_SOURCE if you used to be
28 * using FS_32BIT. This will allow you to use files >2GB instead of
29 * having to use the -m option. Wouter Verhelst <wouter@debian.org>
30 * Version 2.4 - Added code to keep track of children, so that we can
31 * properly kill them from initscripts. Add a call to daemon(),
32 * so that processes don't think they have to wait for us, which is
33 * interesting for initscripts as well. Wouter Verhelst
34 * <wouter@debian.org>
35 * Version 2.5 - Bugfix release: forgot to reset child_arraysize to
36 * zero after fork()ing, resulting in nbd-server going berserk
37 * when it receives a signal with at least one child open. Wouter
38 * Verhelst <wouter@debian.org>
39 * 10/10/2003 - Added socket option SO_KEEPALIVE (sf.net bug 819235);
40 * rectified type of mainloop::size_host (sf.net bugs 814435 and
41 * 817385); close the PID file after writing to it, so that the
42 * daemon can actually be found. Wouter Verhelst
43 * <wouter@debian.org>
44 * 10/10/2003 - Size of the data "size_host" was wrong and so was not
45 * correctly put in network endianness. Many types were corrected
46 * (size_t and off_t instead of int). <vspaceg@sourceforge.net>
47 * Version 2.6 - Some code cleanup.
48 * Version 2.7 - Better build system.
49 * 11/02/2004 - Doxygenified the source, modularized it a bit. Needs a
50 * lot more work, but this is a start. Wouter Verhelst
51 * <wouter@debian.org>
52 * 16/03/2010 - Add IPv6 support.
53 * Kitt Tientanopajai <kitt@kitty.in.th>
54 * Neutron Soutmun <neo.neutron@gmail.com>
55 * Suriya Soutmun <darksolar@gmail.com>
58 /* Includes LFS defines, which defines behaviours of some of the following
59 * headers, so must come before those */
60 #include "lfs.h"
61 #define _DEFAULT_SOURCE
62 #define _XOPEN_SOURCE 500 /* to get pread/pwrite */
63 #if NEED_BSD_SOURCE
64 #define _BSD_SOURCE /* to get DT_* macros on some platforms */
65 #endif
66 #define _DARWIN_C_SOURCE /* to get DT_* macros on OS X */
68 #include <assert.h>
69 #include <sys/types.h>
70 #include <sys/socket.h>
71 #include <sys/stat.h>
72 #include <sys/select.h>
73 #include <sys/wait.h>
74 #include <sys/un.h>
75 #ifdef HAVE_SYS_IOCTL_H
76 #include <sys/ioctl.h>
77 #endif
78 #ifdef HAVE_SYS_UIO_H
79 #include <sys/uio.h>
80 #endif
81 #include <sys/param.h>
82 #include <signal.h>
83 #include <errno.h>
84 #include <libgen.h>
85 #include <netinet/tcp.h>
86 #include <netinet/in.h>
87 #include <netdb.h>
88 #include <syslog.h>
89 #include <unistd.h>
90 #include <stdbool.h>
91 #include <stdio.h>
92 #include <stdlib.h>
93 #include <string.h>
94 #include <fcntl.h>
95 #if HAVE_FALLOC_PH
96 #include <linux/falloc.h>
97 #endif
98 #if HAVE_BLKDISCARD
99 #include <linux/fs.h>
100 #endif
101 #include <arpa/inet.h>
102 #include <strings.h>
103 #include <dirent.h>
104 #ifdef HAVE_SYS_DIR_H
105 #include <sys/dir.h>
106 #endif
107 #ifdef HAVE_SYS_DIRENT_H
108 #include <sys/dirent.h>
109 #endif
110 #include <getopt.h>
111 #include <pwd.h>
112 #include <grp.h>
113 #include <dirent.h>
114 #include <ctype.h>
115 #include <inttypes.h>
117 #include <glib.h>
119 /* used in cliserv.h, so must come first */
120 #define MY_NAME "nbd_server"
121 #include "cliserv.h"
122 #include "nbd-debug.h"
123 #include "netdb-compat.h"
124 #include "backend.h"
125 #include "treefiles.h"
126 #include "nbd-helper.h"
128 #ifdef WITH_SDP
129 #include <sdp_inet.h>
130 #endif
132 #if HAVE_FSCTL_SET_ZERO_DATA
133 #include <io.h>
134 /* don't include <windows.h> to avoid redefining eg the ERROR macro */
135 #define NOMINMAX 1
136 #include <windef.h>
137 #include <winbase.h>
138 #include <winioctl.h>
139 #endif
141 /** Default position of the config file */
142 #ifndef SYSCONFDIR
143 #define SYSCONFDIR "/etc"
144 #endif
145 #define CFILE SYSCONFDIR "/nbd-server/config"
147 #if HAVE_GNUTLS
148 #include <gnutls/gnutls.h>
149 #include <gnutls/x509.h>
150 #endif
152 #ifndef HAVE_G_MEMDUP2
153 /* Our uses of g_memdup2 below are safe from g_memdup's 32-bit overflow */
154 #define g_memdup2 g_memdup
155 #endif
158 * Shorten error handling and regular function return sequences
159 * automatically freeing dynamically allocated resources
161 #define _cleanup_(x) __attribute__((__cleanup__(x)))
162 static inline void g_freep(void *p) {
163 g_free(*(void**) p);
165 #define _cleanup_g_free_ _cleanup_(g_freep)
166 #define DEFINE_TRIVIAL_CLEANUP_FUNC(type, func) \
167 static inline void func##p(type *p) { \
168 if (*p) \
169 func(*p); \
171 DEFINE_TRIVIAL_CLEANUP_FUNC(GKeyFile*, g_key_file_free)
172 DEFINE_TRIVIAL_CLEANUP_FUNC(gchar **, g_strfreev)
174 /** Where our config file actually is */
175 gchar* config_file_pos;
177 /** global flags */
178 int glob_flags=0;
180 /* Whether we should avoid daemonizing the main process */
181 int nodaemon = 0;
183 /* Whether we should avoid forking into child processes */
184 int dontfork = 0;
/**
 * The highest value a variable of type off_t can reach. This is a signed
 * integer, so set all bits except for the leftmost one.
 *
 * The value is computed in an unsigned type: shifting a signed 1 into the
 * sign bit is undefined behaviour. The expansion is fully parenthesised so
 * it can be used inside any expression.
 */
#define OFFT_MAX ((off_t)((1ULL << (sizeof(off_t) * 8 - 1)) - 1))
#define BUFSIZE ((1024*1024)+sizeof(struct nbd_reply)) /**< Size of buffer that can hold requests */
#define DIFFPAGESIZE 4096 /**< diff file uses those chunks */
194 /** Global flags: */
195 #define F_OLDSTYLE 1 /**< Allow oldstyle (port-based) exports */
196 #define F_LIST 2 /**< Allow clients to list the exports on a server */
197 #define F_NO_ZEROES 4 /**< Do not send zeros to client */
198 #define F_DUAL_LISTEN 8 /**< Listen on both TCP and unix socket */
199 // also accepts F_FORCEDTLS (which is 16384)
200 GHashTable *children;
201 char pidfname[256]; /**< name of our PID file */
202 char default_authname[] = SYSCONFDIR "/nbd-server/allow"; /**< default name of allow file */
#define NEG_INIT	(1 << 0)
#define NEG_OLD		(1 << 1)
#define NEG_MODERN	(1 << 2)

/*
 * If we want what the system really has set we'd have to read
 * /proc/sys/fs/pipe-max-size, but for now 1mb should be enough.
 */
#define MAX_PIPE_SIZE (1 * 1024 * 1024)
#define SPLICE_IN	0
#define SPLICE_OUT	1
216 #include <nbdsrv.h>
218 /* Our thread pool */
219 GThreadPool *tpool = NULL;
221 /* A work package for the thread pool functions */
222 struct work_package {
223 CLIENT* client;
224 struct nbd_request* req;
225 int pipefd[2];
226 void* data; /**< for write requests */
229 static volatile sig_atomic_t is_sigchld_caught; /**< Flag set by
230 SIGCHLD handler
231 to mark a child
232 exit */
234 static volatile sig_atomic_t is_sigterm_caught; /**< Flag set by
235 SIGTERM handler
236 to mark a exit
237 request */
239 static volatile sig_atomic_t is_sighup_caught; /**< Flag set by SIGHUP
240 handler to mark a
241 reconfiguration
242 request */
244 GArray* modernsocks; /**< Sockets for the modern handler. Not used
245 if a client was only specified on the
246 command line; only port used if
247 oldstyle is set to false (and then the
248 command-line client isn't used, gna gna).
249 This may be more than one socket on
250 systems that don't support serving IPv4
251 and IPv6 from the same socket (like,
252 e.g., FreeBSD) */
253 GArray* childsocks; /**< parent-side sockets for communication with children */
254 int commsocket; /**< child-side socket for communication with parent */
255 static sem_t file_wait_sem;
257 bool logged_oversized=false; /**< whether we logged oversized requests already */
/**
 * Type of configuration file values
 */
typedef enum {
	PARAM_INT,		/**< This parameter is an integer */
	PARAM_INT64,		/**< This parameter is a 64-bit integer */
	PARAM_STRING,		/**< This parameter is a string */
	PARAM_BOOL,		/**< This parameter is a boolean */
} PARAM_TYPE;
270 * Configuration file values
272 typedef struct {
273 gchar *paramname; /**< Name of the parameter, as it appears in
274 the config file */
275 gboolean required; /**< Whether this is a required (as opposed to
276 optional) parameter */
277 PARAM_TYPE ptype; /**< Type of the parameter. */
278 gpointer target; /**< Pointer to where the data of this
279 parameter should be written. If ptype is
280 PARAM_BOOL, the data is or'ed rather than
281 overwritten. */
282 gint flagval; /**< Flag mask for this parameter in case ptype
283 is PARAM_BOOL. */
284 } PARAM;
287 * Configuration file values of the "generic" section
289 struct generic_conf {
290 gchar *user; /**< user we run the server as */
291 gchar *group; /**< group we run running as */
292 gchar *modernaddr; /**< address of the modern socket */
293 gchar *modernport; /**< port of the modern socket */
294 gchar *unixsock; /**< file name of the unix domain socket */
295 gchar *certfile; /**< certificate file */
296 gchar *keyfile; /**< key file */
297 gchar *cacertfile; /**< CA certificate file */
298 gchar *tlsprio; /**< TLS priority string */
299 gint flags; /**< global flags */
300 gint threads; /**< maximum number of parallel threads we want to run */
303 #if HAVE_GNUTLS
304 static int writeit_tls(gnutls_session_t s, void *buf, size_t len) {
305 _cleanup_g_free_ char *m = NULL;
306 ssize_t res;
307 while(len > 0) {
308 DEBUG("+");
309 if ((res = gnutls_record_send(s, buf, len)) < 0 && !gnutls_error_is_fatal(res)) {
310 m = g_strdup_printf("issue while sending data: %s", gnutls_strerror(res));
311 err_nonfatal(m);
312 } else if(res < 0) {
313 m = g_strdup_printf("could not send data: %s", gnutls_strerror(res));
314 err_nonfatal(m);
315 return -1;
316 } else {
317 len -= res;
318 buf += res;
321 return 0;
324 static int readit_tls(gnutls_session_t s, void *buf, size_t len) {
325 _cleanup_g_free_ char *m = NULL;
326 ssize_t res;
327 while(len > 0) {
328 DEBUG("*");
329 if((res = gnutls_record_recv(s, buf, len)) < 0 && !gnutls_error_is_fatal(res)) {
330 m = g_strdup_printf("issue while receiving data: %s", gnutls_strerror(res));
331 err_nonfatal(m);
332 } else if(res < 0) {
333 m = g_strdup_printf("could not receive data: %s", gnutls_strerror(res));
334 err_nonfatal(m);
335 return -1;
336 } else {
337 len -= res;
338 buf += res;
341 return 0;
344 static int socket_read_tls(CLIENT* client, void *buf, size_t len) {
345 return readit_tls(*((gnutls_session_t*)client->tls_session), buf, len);
348 static int socket_write_tls(CLIENT* client, void *buf, size_t len) {
349 return writeit_tls(*((gnutls_session_t*)client->tls_session), buf, len);
351 #endif // HAVE_GNUTLS
353 static int socket_read_notls(CLIENT* client, void *buf, size_t len) {
354 return readit(client->net, buf, len);
357 static int socket_write_notls(CLIENT* client, void *buf, size_t len) {
358 return writeit(client->net, buf, len);
361 static void socket_read(CLIENT* client, void *buf, size_t len) {
362 g_assert(client->socket_read != NULL);
363 if(client->socket_read(client, buf, len)<0) {
364 g_assert(client->socket_closed != NULL);
365 client->socket_closed(client);
370 * Consume data from a socket that we don't want
372 * @param c the client to read from
373 * @param len the number of bytes to consume
374 * @param buf a buffer
375 * @param bufsiz the size of the buffer
377 static inline void consume(CLIENT* c, size_t len, void * buf, size_t bufsiz) {
378 size_t curlen;
379 while (len>0) {
380 curlen = (len>bufsiz)?bufsiz:len;
381 socket_read(c, buf, curlen);
382 len -= curlen;
387 * Consume a length field and corresponding payload that we don't want
389 * @param c the client to read from
391 static inline void consume_len(CLIENT* c) {
392 uint32_t len;
393 char buf[1024];
395 socket_read(c, &len, sizeof(len));
396 len = ntohl(len);
397 consume(c, len, buf, sizeof(buf));
400 static void socket_write(CLIENT* client, void *buf, size_t len) {
401 g_assert(client->socket_write != NULL);
402 if(client->socket_write(client, buf, len)<0) {
403 g_assert(client->socket_closed != NULL);
404 client->socket_closed(client);
408 static inline void socket_closed_negotiate(CLIENT* client) {
409 err("Negotiation failed: %m");
412 static void cleanup_transactionlog(CLIENT *client) {
414 if (client->transactionlogfd != -1) {
415 close(client->transactionlogfd);
416 client->transactionlogfd = -1;
418 if (client->logsem != SEM_FAILED) {
419 sem_close(client->logsem);
420 client->logsem = SEM_FAILED;
421 sem_unlink(client->semname);
425 static void lock_logsem(CLIENT *client) {
426 sem_wait(client->logsem);
428 static void unlock_logsem(CLIENT *client) {
429 sem_post(client->logsem);
433 * Run a command. This is used for the ``prerun'' and ``postrun'' config file
434 * options
436 * @param command the command to be ran. Read from the config file
437 * @param file the file name we're about to export
439 int do_run(gchar* command, gchar* file) {
440 _cleanup_g_free_ gchar* cmd = NULL;
441 int retval=0;
443 if(command && *command) {
444 cmd = g_strdup_printf(command, file);
445 retval=system(cmd);
447 return retval;
450 static inline void finalize_client(CLIENT* client) {
451 g_thread_pool_free(tpool, FALSE, TRUE);
452 do_run(client->server->postrun, client->exportname);
453 if(client->transactionlogfd != -1)
454 cleanup_transactionlog(client);
456 if(client->server->flags & F_COPYONWRITE) {
457 unlink(client->difffilename);
459 serve_dec_ref(client->server);
462 static inline void socket_closed_transmission(CLIENT* client) {
463 int saved_errno = errno;
464 finalize_client(client);
465 errno = saved_errno;
466 err("Connection dropped: %m");
#ifdef HAVE_SPLICE
/**
 * Splice data between a pipe and a file descriptor
 *
 * Calls splice() repeatedly until len bytes have been moved. A return
 * value of zero or less from splice() is reported through err().
 *
 * @param fd_in The fd to splice from.
 * @param off_in The fd_in offset to splice from.
 * @param fd_out The fd to splice to.
 * @param off_out The fd_out offset to splice to.
 * @param len The length to splice.
 */
static inline void spliceit(int fd_in, loff_t *off_in, int fd_out,
			    loff_t *off_out, size_t len)
{
	ssize_t ret;
	while (len > 0) {
		if ((ret = splice(fd_in, off_in, fd_out, off_out, len,
				  SPLICE_F_MOVE)) <= 0)
			err("Splice failed: %m");
		len -= ret;
	}
}
#endif
493 * Print out a message about how to use nbd-server. Split out to a separate
494 * function so that we can call it from multiple places
496 void usage() {
497 printf("This is nbd-server version " VERSION "\n");
498 printf("Usage: [ip:|ip6@]port file_to_export [size][kKmM] [-l authorize_file] [-r] [-m] [-c] [-C configuration file] [-p PID file name] [-o section name] [-M max connections] [-V] [-n] [-d]\n"
499 "\t-r|--read-only\t\tread only\n"
500 "\t-m|--multi-file\t\tmultiple file\n"
501 "\t-c|--copy-on-write\tcopy on write\n"
502 "\t-C|--config-file\tspecify an alternate configuration file\n"
503 "\t-l|--authorize-file\tfile with list of hosts that are allowed to\n\t\t\t\tconnect.\n"
504 "\t-p|--pid-file\t\tspecify a filename to write our PID to\n"
505 "\t-o|--output-config\toutput a config file section for what you\n\t\t\t\tspecified on the command line, with the\n\t\t\t\tspecified section name\n"
506 "\t-M|--max-connection\tspecify the maximum number of opened connections\n"
507 "\t-V|--version\t\toutput the version and exit\n"
508 "\t-n|--nodaemon\t\tdo not daemonize main process\n"
509 "\t-d|--dont-fork\t\tdo not fork (implies --nodaemon)\n\n"
510 "\tif port is set to 0, stdin is used (for running from inetd).\n"
511 "\tif file_to_export contains '%%s', it is substituted with the IP\n"
512 "\t\taddress of the machine trying to connect\n"
513 "\tif ip is set, it contains the local IP address on which we're listening.\n\tif not, the server will listen on all local IP addresses\n");
514 printf("Using configuration file %s\n", CFILE);
515 printf("For help, or when encountering bugs, please contact %s\n", PACKAGE_BUGREPORT);
518 /* Dumps a config file section of the given SERVER*, and exits. */
519 void dump_section(SERVER* serve, gchar* section_header) {
520 printf("[%s]\n", section_header);
521 printf("\texportname = %s\n", serve->exportname);
522 printf("\tlistenaddr = %s\n", serve->listenaddr);
523 if(serve->flags & F_READONLY) {
524 printf("\treadonly = true\n");
526 if(serve->flags & F_MULTIFILE) {
527 printf("\tmultifile = true\n");
529 if(serve->flags & F_TREEFILES) {
530 printf("\ttreefiles = true\n");
532 if(serve->flags & F_COPYONWRITE) {
533 printf("\tcopyonwrite = true\n");
535 if(serve->expected_size) {
536 printf("\tfilesize = %lld\n", (long long int)serve->expected_size);
538 if(serve->authname) {
539 printf("\tauthfile = %s\n", serve->authname);
541 exit(EXIT_SUCCESS);
545 * Parse the command line.
547 * @param argc the argc argument to main()
548 * @param argv the argv argument to main()
550 SERVER* cmdline(int argc, char *argv[], struct generic_conf *genconf) {
551 int i=0;
552 int nonspecial=0;
553 int c;
554 struct option long_options[] = {
555 {"read-only", no_argument, NULL, 'r'},
556 {"multi-file", no_argument, NULL, 'm'},
557 {"copy-on-write", no_argument, NULL, 'c'},
558 {"nodaemon", no_argument, NULL, 'n'},
559 {"dont-fork", no_argument, NULL, 'd'},
560 {"authorize-file", required_argument, NULL, 'l'},
561 {"config-file", required_argument, NULL, 'C'},
562 {"pid-file", required_argument, NULL, 'p'},
563 {"output-config", required_argument, NULL, 'o'},
564 {"max-connection", required_argument, NULL, 'M'},
565 {"version", no_argument, NULL, 'V'},
566 {0,0,0,0}
568 SERVER *serve;
569 off_t es;
570 size_t last;
571 char suffix;
572 bool do_output=false;
573 gchar* section_header="";
574 gchar** addr_port;
576 if(argc==1) {
577 return NULL;
579 serve=serve_inc_ref((SERVER*)g_new0(SERVER, 1));
580 serve->authname = g_strdup(default_authname);
581 serve->virtstyle=VIRT_IPLIT;
582 while((c=getopt_long(argc, argv, "-C:cwndl:mo:rp:M:V", long_options, &i))>=0) {
583 switch (c) {
584 case 1:
585 /* non-option argument */
586 switch(nonspecial++) {
587 case 0:
588 if(strchr(optarg, ':') == strrchr(optarg, ':')) {
589 addr_port=g_strsplit(optarg, ":", 2);
591 /* Check for "@" - maybe user using this separator
592 for IPv4 address */
593 if(!addr_port[1]) {
594 g_strfreev(addr_port);
595 addr_port=g_strsplit(optarg, "@", 2);
597 } else {
598 addr_port=g_strsplit(optarg, "@", 2);
601 if(addr_port[1]) {
602 genconf->modernport=g_strdup(addr_port[1]);
603 genconf->modernaddr=g_strdup(addr_port[0]);
604 } else {
605 g_free(genconf->modernaddr);
606 genconf->modernaddr=NULL;
607 genconf->modernport=g_strdup(addr_port[0]);
609 g_strfreev(addr_port);
610 break;
611 case 1:
612 serve->exportname = g_strdup(optarg);
613 if(serve->exportname[0] != '/') {
614 fprintf(stderr, "E: The to be exported file needs to be an absolute filename!\n");
615 exit(EXIT_FAILURE);
617 break;
618 case 2:
619 last=strlen(optarg)-1;
620 suffix=optarg[last];
621 if (suffix == 'k' || suffix == 'K' ||
622 suffix == 'm' || suffix == 'M')
623 optarg[last] = '\0';
624 es = (off_t)atoll(optarg);
625 switch (suffix) {
626 case 'm':
627 case 'M': es <<= 10;
628 case 'k':
629 case 'K': es <<= 10;
630 default : break;
632 serve->expected_size = es;
633 break;
635 break;
636 case 'r':
637 serve->flags |= F_READONLY;
638 break;
639 case 'm':
640 serve->flags |= F_MULTIFILE;
641 break;
642 case 'o':
643 do_output = true;
644 section_header = g_strdup(optarg);
645 break;
646 case 'p':
647 strncpy(pidfname, optarg, 256);
648 pidfname[255]='\0';
649 break;
650 case 'c':
651 serve->flags |=F_COPYONWRITE;
652 break;
653 case 'n':
654 nodaemon = 1;
655 break;
656 case 'd':
657 dontfork = 1;
658 nodaemon = 1;
659 break;
660 case 'C':
661 g_free(config_file_pos);
662 config_file_pos=g_strdup(optarg);
663 break;
664 case 'l':
665 g_free(serve->authname);
666 serve->authname=g_strdup(optarg);
667 break;
668 case 'M':
669 serve->max_connections = strtol(optarg, NULL, 0);
670 break;
671 case 'V':
672 printf("This is nbd-server version " VERSION "\n");
673 exit(EXIT_SUCCESS);
674 break;
675 default:
676 usage();
677 exit(EXIT_FAILURE);
678 break;
681 /* What's left: the port to export, the name of the to be exported
682 * file, and, optionally, the size of the file, in that order. */
683 if(nonspecial<2) {
684 serve=serve_dec_ref(serve);
685 } else {
686 serve->servename = "";
688 if(do_output) {
689 if(!serve) {
690 g_critical("Need a complete configuration on the command line to output a config file section!");
691 exit(EXIT_FAILURE);
693 dump_section(serve, section_header);
695 return serve;
698 /* forward definition of parse_cfile */
699 GArray* parse_cfile(gchar* f, struct generic_conf *genconf, bool expect_generic, GError** e);
/* NBD_D_TYPE is the file type of the current directory entry "de". When
 * struct dirent has no d_type member it is always "unknown" (0). The
 * DT_* fallbacks are only defined if the platform headers did not
 * already provide them, to avoid redefining them with other values. */
#ifdef HAVE_STRUCT_DIRENT_D_TYPE
#define NBD_D_TYPE de->d_type
#else
#define NBD_D_TYPE 0
#ifndef DT_UNKNOWN
#define DT_UNKNOWN 0
#endif
#ifndef DT_REG
#define DT_REG 1
#endif
#endif
710 * Parse config file snippets in a directory. Uses readdir() and friends
711 * to find files and open them, then passes them on to parse_cfile
712 * with have_global set false
714 GArray* do_cfile_dir(gchar* dir, struct generic_conf *const genconf, GError** e) {
715 DIR* dirh = opendir(dir);
716 struct dirent* de;
717 gchar* fname;
718 GArray* retval = NULL;
719 GArray* tmp;
720 struct stat stbuf;
722 if(!dirh) {
723 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_DIR_UNKNOWN, "Invalid directory specified: %s", strerror(errno));
724 return NULL;
726 errno=0;
727 while((de = readdir(dirh))) {
728 int saved_errno=errno;
729 fname = g_build_filename(dir, de->d_name, NULL);
730 switch(NBD_D_TYPE) {
731 case DT_UNKNOWN:
732 /* Filesystem doesn't return type of
733 * file through readdir, or struct dirent
734 * doesn't have d_type. Run stat() on the file
735 * instead */
736 if(stat(fname, &stbuf)) {
737 perror("stat");
738 goto err_out;
740 if (!S_ISREG(stbuf.st_mode)) {
741 goto next;
743 case DT_REG:
744 /* Skip unless the name ends with '.conf' */
745 if(strcmp((de->d_name + strlen(de->d_name) - 5), ".conf")) {
746 goto next;
748 tmp = parse_cfile(fname, genconf, false, e);
749 errno=saved_errno;
750 if(*e) {
751 goto err_out;
753 if(!retval)
754 retval = g_array_new(FALSE, TRUE, sizeof(SERVER*));
755 retval = g_array_append_vals(retval, tmp->data, tmp->len);
756 g_array_free(tmp, TRUE);
757 default:
758 break;
760 next:
761 g_free(fname);
763 if(errno) {
764 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_READDIR_ERR, "Error trying to read directory: %s", strerror(errno));
765 err_out:
766 if(retval)
767 g_array_free(retval, TRUE);
768 retval = NULL;
770 if(dirh)
771 closedir(dirh);
772 return retval;
776 * To be called by GArray clearing function.
777 * @param server pointer to server element
779 static void serve_clear_element(SERVER **server) {
780 serve_dec_ref(*server);
784 * Parse the config file.
786 * @param f the name of the config file
788 * @param genconf a pointer to generic configuration which will get
789 * updated with parsed values. If NULL, then parsed generic
790 * configuration values are safely and silently discarded.
792 * @param e a GError. Error code can be any of the following:
793 * NBDS_ERR_CFILE_NOTFOUND, NBDS_ERR_CFILE_MISSING_GENERIC,
794 * NBDS_ERR_CFILE_VALUE_INVALID, NBDS_ERR_CFILE_VALUE_UNSUPPORTED
795 * or NBDS_ERR_CFILE_NO_EXPORTS. @see NBDS_ERRS.
797 * @param expect_generic if true, we expect a configuration file that
798 * contains a [generic] section. If false, we don't.
800 * @return a GArray of SERVER* pointers. If the config file is empty or does not
801 * exist, returns an empty GArray; if the config file contains an
802 * error, returns NULL, and e is set appropriately
804 GArray* parse_cfile(gchar* f, struct generic_conf *const genconf, bool expect_generic, GError** e) {
805 const char* DEFAULT_ERROR = "Could not parse %s in group %s: %s";
806 const char* MISSING_REQUIRED_ERROR = "Could not find required value %s in group %s: %s";
807 gchar* cfdir = NULL;
808 SERVER s;
809 gchar *virtstyle=NULL;
810 PARAM lp[] = {
811 { "exportname", TRUE, PARAM_STRING, &(s.exportname), 0 },
812 { "authfile", FALSE, PARAM_STRING, &(s.authname), 0 },
813 { "filesize", FALSE, PARAM_OFFT, &(s.expected_size), 0 },
814 { "virtstyle", FALSE, PARAM_STRING, &(virtstyle), 0 },
815 { "prerun", FALSE, PARAM_STRING, &(s.prerun), 0 },
816 { "postrun", FALSE, PARAM_STRING, &(s.postrun), 0 },
817 { "transactionlog", FALSE, PARAM_STRING, &(s.transactionlog), 0 },
818 { "cowdir", FALSE, PARAM_STRING, &(s.cowdir), 0 },
819 { "readonly", FALSE, PARAM_BOOL, &(s.flags), F_READONLY },
820 { "multifile", FALSE, PARAM_BOOL, &(s.flags), F_MULTIFILE },
821 { "treefiles", FALSE, PARAM_BOOL, &(s.flags), F_TREEFILES },
822 { "copyonwrite", FALSE, PARAM_BOOL, &(s.flags), F_COPYONWRITE },
823 { "waitfile", FALSE, PARAM_BOOL, &(s.flags), F_WAIT },
824 { "sparse_cow", FALSE, PARAM_BOOL, &(s.flags), F_SPARSE },
825 { "sdp", FALSE, PARAM_BOOL, &(s.flags), F_SDP },
826 { "sync", FALSE, PARAM_BOOL, &(s.flags), F_SYNC },
827 { "flush", FALSE, PARAM_BOOL, &(s.flags), F_FLUSH },
828 { "fua", FALSE, PARAM_BOOL, &(s.flags), F_FUA },
829 { "rotational", FALSE, PARAM_BOOL, &(s.flags), F_ROTATIONAL },
830 { "temporary", FALSE, PARAM_BOOL, &(s.flags), F_TEMPORARY },
831 { "trim", FALSE, PARAM_BOOL, &(s.flags), F_TRIM },
832 { "datalog", FALSE, PARAM_BOOL, &(s.flags), F_DATALOG },
833 { "listenaddr", FALSE, PARAM_STRING, &(s.listenaddr), 0 },
834 { "maxconnections", FALSE, PARAM_INT, &(s.max_connections), 0 },
835 { "force_tls", FALSE, PARAM_BOOL, &(s.flags), F_FORCEDTLS },
836 { "splice", FALSE, PARAM_BOOL, &(s.flags), F_SPLICE},
838 const int lp_size=sizeof(lp)/sizeof(PARAM);
839 struct generic_conf genconftmp;
840 PARAM gp[] = {
841 { "user", FALSE, PARAM_STRING, &(genconftmp.user), 0 },
842 { "group", FALSE, PARAM_STRING, &(genconftmp.group), 0 },
843 { "oldstyle", FALSE, PARAM_BOOL, &(genconftmp.flags), F_OLDSTYLE }, // only left here so we can issue an appropriate error message when the option is used
844 { "listenaddr", FALSE, PARAM_STRING, &(genconftmp.modernaddr), 0 },
845 { "port", FALSE, PARAM_STRING, &(genconftmp.modernport), 0 },
846 { "includedir", FALSE, PARAM_STRING, &cfdir, 0 },
847 { "allowlist", FALSE, PARAM_BOOL, &(genconftmp.flags), F_LIST },
848 { "unixsock", FALSE, PARAM_STRING, &(genconftmp.unixsock), 0 },
849 { "duallisten", FALSE, PARAM_BOOL, &(genconftmp.flags), F_DUAL_LISTEN }, // Used to listen on both TCP and unix socket
850 { "max_threads", FALSE, PARAM_INT, &(genconftmp.threads), 0 },
851 { "force_tls", FALSE, PARAM_BOOL, &(genconftmp.flags), F_FORCEDTLS },
852 { "certfile", FALSE, PARAM_STRING, &(genconftmp.certfile), 0 },
853 { "keyfile", FALSE, PARAM_STRING, &(genconftmp.keyfile), 0 },
854 { "cacertfile", FALSE, PARAM_STRING, &(genconftmp.cacertfile), 0 },
855 { "tlsprio", FALSE, PARAM_STRING, &(genconftmp.tlsprio), 0 },
857 PARAM* p=gp;
858 int p_size=sizeof(gp)/sizeof(PARAM);
859 _cleanup_(g_key_file_freep) GKeyFile *cfile = NULL;
860 g_autoptr(GError) err = NULL;
861 const char *err_msg=NULL;
862 GArray *retval=NULL;
863 gchar **groups;
864 gboolean bval;
865 gint ival;
866 gint64 i64val;
867 gchar* sval;
868 _cleanup_g_free_ gchar* startgroup = NULL;
869 gint i;
870 gint j;
872 memset(&genconftmp, 0, sizeof(struct generic_conf));
874 genconftmp.tlsprio = "NORMAL:-VERS-TLS-ALL:+VERS-TLS1.2:%SERVER_PRECEDENCE";
876 if (genconf) {
877 /* Use the passed configuration values as defaults. The
878 * parsing algorithm below updates all parameter targets
879 * found from configuration files. */
880 memcpy(&genconftmp, genconf, sizeof(struct generic_conf));
883 cfile = g_key_file_new();
884 retval = g_array_new(FALSE, TRUE, sizeof(SERVER*));
885 if(expect_generic) {
886 g_array_set_clear_func(retval, (GDestroyNotify)serve_clear_element);
888 if(!g_key_file_load_from_file(cfile, f, G_KEY_FILE_KEEP_COMMENTS |
889 G_KEY_FILE_KEEP_TRANSLATIONS, &err)) {
890 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_NOTFOUND, "Could not open config file %s: %s",
891 f, err->message);
892 g_key_file_free(cfile);
893 return retval;
895 startgroup = g_key_file_get_start_group(cfile);
896 if((!startgroup || strcmp(startgroup, "generic")) && expect_generic) {
897 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_MISSING_GENERIC, "Config file does not contain the [generic] group!");
898 return NULL;
900 groups = g_key_file_get_groups(cfile, NULL);
901 for(i=0;groups[i];i++) {
902 memset(&s, '\0', sizeof(SERVER));
904 /* After the [generic] group or when we're parsing an include
905 * directory, start parsing exports */
906 if(i==1 || !expect_generic) {
907 p=lp;
908 p_size=lp_size;
910 for(j=0;j<p_size;j++) {
911 assert(p[j].target != NULL);
912 assert(p[j].ptype==PARAM_INT||p[j].ptype==PARAM_STRING||p[j].ptype==PARAM_BOOL||p[j].ptype==PARAM_INT64);
913 switch(p[j].ptype) {
914 case PARAM_INT:
915 ival = g_key_file_get_integer(cfile,
916 groups[i],
917 p[j].paramname,
918 &err);
919 if(!err) {
920 *((gint*)p[j].target) = ival;
922 break;
923 case PARAM_INT64:
924 i64val = g_key_file_get_int64(cfile,
925 groups[i],
926 p[j].paramname,
927 &err);
928 if(!err) {
929 *((gint64*)p[j].target) = i64val;
931 break;
932 case PARAM_STRING:
933 sval = g_key_file_get_string(cfile,
934 groups[i],
935 p[j].paramname,
936 &err);
937 if(!err) {
938 *((gchar**)p[j].target) = sval;
940 break;
941 case PARAM_BOOL:
942 bval = g_key_file_get_boolean(cfile,
943 groups[i],
944 p[j].paramname, &err);
945 if(!err) {
946 if(bval) {
947 *((gint*)p[j].target) |= p[j].flagval;
948 } else {
949 *((gint*)p[j].target) &= ~(p[j].flagval);
952 break;
954 if(err) {
955 if(err->code == G_KEY_FILE_ERROR_KEY_NOT_FOUND) {
956 if(!p[j].required) {
957 /* Ignore not-found error for optional values */
958 g_clear_error(&err);
959 continue;
960 } else {
961 err_msg = MISSING_REQUIRED_ERROR;
963 } else {
964 err_msg = DEFAULT_ERROR;
966 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_VALUE_INVALID, err_msg, p[j].paramname, groups[i], err->message);
967 g_array_free(retval, TRUE);
968 return NULL;
971 if(virtstyle) {
972 if(!strncmp(virtstyle, "none", 4)) {
973 s.virtstyle=VIRT_NONE;
974 } else if(!strncmp(virtstyle, "ipliteral", 9)) {
975 s.virtstyle=VIRT_IPLIT;
976 } else if(!strncmp(virtstyle, "iphash", 6)) {
977 s.virtstyle=VIRT_IPHASH;
978 } else if(!strncmp(virtstyle, "cidrhash", 8)) {
979 s.virtstyle=VIRT_CIDR;
980 if(strlen(virtstyle)<10) {
981 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_VALUE_INVALID, "Invalid value %s for parameter virtstyle in group %s: missing length", virtstyle, groups[i]);
982 g_array_free(retval, TRUE);
983 return NULL;
985 s.cidrlen=strtol(virtstyle+8, NULL, 0);
986 } else {
987 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_VALUE_INVALID, "Invalid value %s for parameter virtstyle in group %s", virtstyle, groups[i]);
988 g_array_free(retval, TRUE);
989 return NULL;
991 } else {
992 s.virtstyle=VIRT_IPLIT;
994 if(genconftmp.flags & F_OLDSTYLE) {
995 g_message("Since 3.10, the oldstyle protocol is no longer supported. Please migrate to the newstyle protocol.");
996 g_message("Exiting.");
997 return NULL;
999 #ifndef HAVE_SPLICE
1000 if (s.flags & F_SPLICE) {
1001 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_VALUE_UNSUPPORTED, "This nbd-server was built without splice support, yet group %s uses it", groups[i]);
1002 g_array_free(retval, TRUE);
1003 return NULL;
1005 #endif
1006 /* We can't mix copyonwrite and splice. */
1007 if ((s.flags & F_COPYONWRITE) && (s.flags & F_SPLICE)) {
1008 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_INVALID_SPLICE,
1009 "Cannot mix copyonwrite with splice for an export in group %s",
1010 groups[i]);
1011 g_array_free(retval, TRUE);
1012 return NULL;
1014 if ((s.flags & F_COPYONWRITE) && (s.flags & F_WAIT)) {
1015 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_INVALID_WAIT,
1016 "Cannot mix copyonwrite with waitfile for an export in group %s",
1017 groups[i]);
1018 g_array_free(retval, TRUE);
1019 return NULL;
1021 /* We can't mix datalog and splice. */
1022 if ((s.flags & F_DATALOG) && (s.flags & F_SPLICE)) {
1023 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_INVALID_SPLICE,
1024 "Cannot mix datalog with splice for an export in group %s",
1025 groups[i]);
1026 g_array_free(retval, TRUE);
1027 return NULL;
1029 /* Don't need to free this, it's not our string */
1030 virtstyle=NULL;
1031 /* Don't append values for the [generic] group */
1032 if(i>0 || !expect_generic) {
1033 s.servename = groups[i];
1035 SERVER *srv = serve_inc_ref(g_memdup2(&s, sizeof(SERVER)));
1036 g_array_append_val(retval, srv);
1038 #ifndef WITH_SDP
1039 if(s.flags & F_SDP) {
1040 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_VALUE_UNSUPPORTED, "This nbd-server was built without support for SDP, yet group %s uses it", groups[i]);
1041 g_array_free(retval, TRUE);
1042 return NULL;
1044 #endif
1046 if(cfdir) {
1047 GArray* extra = do_cfile_dir(cfdir, &genconftmp, e);
1048 if(extra) {
1049 retval = g_array_append_vals(retval, extra->data, extra->len);
1050 i+=extra->len;
1051 g_array_free(extra, TRUE);
1052 } else {
1053 if(*e) {
1054 g_array_free(retval, TRUE);
1055 return NULL;
1059 if(i==1 && expect_generic) {
1060 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_NO_EXPORTS, "The config file does not specify any exports");
1063 if (genconf) {
1064 /* Return the updated generic configuration through the
1065 * pointer parameter. */
1066 memcpy(genconf, &genconftmp, sizeof(struct generic_conf));
1069 return retval;
1073 * Handle SIGCHLD by setting atomically a flag which will be evaluated in the
1074 * main loop of the root server process. This allows us to separate the signal
1075 * catching from th actual task triggered by SIGCHLD and hence processing in the
1076 * interrupt context is kept as minimial as possible.
1078 * @param s the signal we're handling (must be SIGCHLD, or something
1079 * is severely wrong)
1081 static void sigchld_handler(const int s G_GNUC_UNUSED) {
1082 is_sigchld_caught = 1;
1086 * Kill a child. Called from sigterm_handler::g_hash_table_foreach.
1088 * @param key the key
1089 * @param value the value corresponding to the above key
1090 * @param user_data a pointer which we always set to 1, so that we know what
1091 * will happen next.
1093 void killchild(gpointer key, gpointer value, gpointer user_data) {
1094 pid_t *pid=value;
1096 kill(*pid, SIGTERM);
1100 * Handle SIGTERM by setting atomically a flag which will be evaluated in the
1101 * main loop of the root server process. This allows us to separate the signal
1102 * catching from th actual task triggered by SIGTERM and hence processing in the
1103 * interrupt context is kept as minimial as possible.
1105 * @param s the signal we're handling (must be SIGTERM, or something
1106 * is severely wrong).
1108 static void sigterm_handler(const int s G_GNUC_UNUSED) {
1109 is_sigterm_caught = 1;
1113 * Handle SIGHUP by setting atomically a flag which will be evaluated in
1114 * the main loop of the root server process. This allows us to separate
1115 * the signal catching from th actual task triggered by SIGHUP and hence
1116 * processing in the interrupt context is kept as minimial as possible.
1118 * @param s the signal we're handling (must be SIGHUP, or something
1119 * is severely wrong).
1121 static void sighup_handler(const int s G_GNUC_UNUSED) {
1122 is_sighup_caught = 1;
1125 static void sigusr1_handler(const int s G_GNUC_UNUSED) {
1126 msg(LOG_INFO, "Got SIGUSR1");
1127 sem_post(&file_wait_sem);
1131 * Get the file handle and offset, given an export offset.
1133 * @param client The client we're serving for
1134 * @param a The offset to get corresponding file/offset for
1135 * @param fhandle [out] File descriptor
1136 * @param foffset [out] Offset into fhandle
1137 * @param maxbytes [out] Tells how many bytes can be read/written
1138 * from fhandle starting at foffset (0 if there is no limit)
1139 * @return 0 on success, -1 on failure
1141 int get_filepos(CLIENT *client, off_t a, int* fhandle, off_t* foffset, size_t* maxbytes ) {
1142 GArray * const export = client->export;
1144 /* Negative offset not allowed */
1145 if(a < 0)
1146 return -1;
1148 /* Open separate file for treefiles */
1149 if (client->server->flags & F_TREEFILES) {
1150 *foffset = a % TREEPAGESIZE;
1151 *maxbytes = (( 1 + (a/TREEPAGESIZE) ) * TREEPAGESIZE) - a; // start position of next block
1152 *fhandle = open_treefile(client->exportname, ((client->server->flags & F_READONLY) ? O_RDONLY : O_RDWR), client->exportsize,a, &client->lock);
1153 return 0;
1156 /* Binary search for last file with starting offset <= a */
1157 FILE_INFO fi;
1158 int start = 0;
1159 int end = export->len - 1;
1160 while( start <= end ) {
1161 int mid = (start + end) / 2;
1162 fi = g_array_index(export, FILE_INFO, mid);
1163 if( fi.startoff < a ) {
1164 start = mid + 1;
1165 } else if( fi.startoff > a ) {
1166 end = mid - 1;
1167 } else {
1168 start = end = mid;
1169 break;
1173 /* end should never go negative, since first startoff is 0 and a >= 0 */
1174 assert(end >= 0);
1176 fi = g_array_index(export, FILE_INFO, end);
1177 *fhandle = fi.fhandle;
1178 *foffset = a - fi.startoff;
1179 *maxbytes = 0;
1180 if( end+1 < export->len ) {
1181 FILE_INFO fi_next = g_array_index(export, FILE_INFO, end+1);
1182 *maxbytes = fi_next.startoff - a;
1185 return 0;
1189 * Write an amount of bytes at a given offset to the right file. This
1190 * abstracts the write-side of the multiple file option.
1192 * @param a The offset where the write should start
1193 * @param buf The buffer to write from
1194 * @param len The length of buf
1195 * @param client The client we're serving for
1196 * @param fua Flag to indicate 'Force Unit Access'
1197 * @return The number of bytes actually written, or -1 in case of an error
1199 ssize_t rawexpwrite(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
1200 int fhandle;
1201 off_t foffset;
1202 size_t maxbytes;
1203 ssize_t retval;
1205 if(get_filepos(client, a, &fhandle, &foffset, &maxbytes))
1206 return -1;
1207 if(maxbytes && len > maxbytes)
1208 len = maxbytes;
1210 DEBUG("(WRITE to fd %d offset %llu len %u fua %d), ", fhandle, (long long unsigned)foffset, (unsigned int)len, fua);
1212 retval = pwrite(fhandle, buf, len, foffset);
1213 if(client->server->flags & F_SYNC) {
1214 fsync(fhandle);
1215 } else if (fua) {
1217 /* This is where we would do the following
1218 * #ifdef USE_SYNC_FILE_RANGE
1219 * However, we don't, for the reasons set out below
1220 * by Christoph Hellwig <hch@infradead.org>
1222 * [BEGINS]
1223 * fdatasync is equivalent to fsync except that it does not flush
1224 * non-essential metadata (basically just timestamps in practice), but it
1225 * does flush metadata requried to find the data again, e.g. allocation
1226 * information and extent maps. sync_file_range does nothing but flush
1227 * out pagecache content - it means you basically won't get your data
1228 * back in case of a crash if you either:
1230 * a) have a volatile write cache in your disk (e.g. any normal SATA disk)
1231 * b) are using a sparse file on a filesystem
1232 * c) are using a fallocate-preallocated file on a filesystem
1233 * d) use any file on a COW filesystem like btrfs
1235 * e.g. it only does anything useful for you if you do not have a volatile
1236 * write cache, and either use a raw block device node, or just overwrite
1237 * an already fully allocated (and not preallocated) file on a non-COW
1238 * filesystem.
1239 * [ENDS]
1241 * What we should do is open a second FD with O_DSYNC set, then write to
1242 * that when appropriate. However, with a Linux client, every REQ_FUA
1243 * immediately follows a REQ_FLUSH, so fdatasync does not cause performance
1244 * problems.
1247 #if 0
1248 sync_file_range(fhandle, foffset, len,
1249 SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE |
1250 SYNC_FILE_RANGE_WAIT_AFTER);
1251 #else
1252 fdatasync(fhandle);
1253 #endif
1255 /* close file pointer in case of treefiles */
1256 if (client->server->flags & F_TREEFILES) {
1257 close(fhandle);
1259 return retval;
1263 * Call rawexpwrite repeatedly until all data has been written.
1265 * @param a The offset where the write should start
1266 * @param buf The buffer to write from
1267 * @param len The length of buf
1268 * @param client The client we're serving for
1269 * @param fua Flag to indicate 'Force Unit Access'
1270 * @return 0 on success, nonzero on failure
1272 int rawexpwrite_fully(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
1273 ssize_t ret=0;
1275 while(len > 0 && (ret=rawexpwrite(a, buf, len, client, fua)) > 0 ) {
1276 a += ret;
1277 buf += ret;
1278 len -= ret;
1280 return (ret < 0 || len != 0);
1283 static void setup_reply(struct nbd_reply* rep, struct nbd_request* req) {
1284 rep->magic = htonl(NBD_REPLY_MAGIC);
1285 rep->error = 0;
1286 rep->cookie = req->cookie;
1289 static void log_reply(CLIENT *client, struct nbd_reply *prply) {
1290 if (client->transactionlogfd != -1) {
1291 lock_logsem(client);
1292 writeit(client->transactionlogfd, prply, sizeof(*prply));
1293 unlock_logsem(client);
1297 static void log_structured_reply(CLIENT *client, struct nbd_structured_reply *prply) {
1298 if (client->transactionlogfd != -1) {
1299 lock_logsem(client);
1300 writeit(client->transactionlogfd, prply, sizeof(*prply));
1301 unlock_logsem(client);
1305 void send_structured_chunk(CLIENT *client, struct nbd_request *req, uint16_t flags, uint16_t type, uint32_t length, int bufcount, void *buf[], size_t buflen[]) {
1306 struct nbd_structured_reply rep;
1307 rep.magic = htonl(NBD_STRUCTURED_REPLY_MAGIC);
1308 rep.flags = htons(flags);
1309 rep.type = htons(type);
1310 rep.cookie = req->cookie;
1311 rep.paylen = htonl(length);
1312 pthread_mutex_lock(&(client->lock));
1313 socket_write(client, &rep, sizeof rep);
1314 for(int i=0; i<bufcount; i++) {
1315 socket_write(client, buf[i], buflen[i]);
1317 pthread_mutex_unlock(&(client->lock));
1318 log_structured_reply(client, &rep);
1321 void send_structured_chunk_v(CLIENT *client, struct nbd_request *req, uint16_t flags, uint16_t type, uint32_t length, int bufcount, ...) {
1322 struct nbd_structured_reply rep;
1323 va_list ap;
1324 rep.magic = htonl(NBD_STRUCTURED_REPLY_MAGIC);
1325 rep.flags = htons(flags);
1326 rep.type = htons(type);
1327 rep.cookie = req->cookie;
1328 rep.paylen = htonl(length);
1329 va_start(ap, bufcount);
1330 pthread_mutex_lock(&(client->lock));
1331 socket_write(client, &rep, sizeof rep);
1332 for(int i=0; i<bufcount; i++) {
1333 void *buf = va_arg(ap, void*);
1334 size_t size = va_arg(ap, size_t);
1335 socket_write(client, buf, size);
1337 pthread_mutex_unlock(&(client->lock));
1338 log_structured_reply(client, &rep);
1339 va_end(ap);
1343 * Find the location to write the data for the next chunk to.
1344 * Assumes checks on memory sizes etc have already been done.
1346 * @param ctx the context we're working with
1347 * @param offset the offset into the request
1348 * @param len the length of this chunk.
1350 char * find_read_buf(READ_CTX *ctx) {
1351 if(!(ctx->is_structured) || ctx->df) {
1352 return ctx->buf + ctx->current_offset;
1354 ctx->buf = malloc(ctx->current_len);
1355 if(!(ctx->buf)) {
1356 err("Could not allocate memory for request");
1358 return ctx->buf;
1361 void confirm_read(CLIENT *client, READ_CTX *ctx, size_t len_read) {
1362 if(ctx->is_structured && !(ctx->df)) {
1363 uint64_t offset = htonll(ctx->req->from + (uint64_t)(ctx->current_offset));
1364 send_structured_chunk_v(client, ctx->req, 0, NBD_REPLY_TYPE_OFFSET_DATA, len_read + 8, 2, &offset, sizeof offset, ctx->buf, (size_t)len_read);
1365 free(ctx->buf);
1369 void complete_read(CLIENT *client, READ_CTX *ctx, uint32_t error, char *errmsg, uint16_t msglen, bool with_offset, uint64_t err_offset) {
1370 uint16_t type;
1371 uint64_t offset = 0;
1372 if(ctx->is_structured) {
1373 if(ctx->df) {
1374 uint32_t len = ctx->req->len;
1375 if(error != 0 && with_offset) {
1376 len = err_offset;
1378 if(error == 0 || with_offset) {
1379 offset = htonll(ctx->req->from);
1380 send_structured_chunk_v(client, ctx->req, 0, NBD_REPLY_TYPE_OFFSET_DATA, len + 8, 2, &offset, sizeof offset, ctx->buf, err_offset);
1382 free(ctx->buf);
1384 if(error != 0) {
1385 struct nbd_structured_error_payload pl;
1386 void *buf[3];
1387 size_t bufsize[3];
1388 int payloads = 1;
1389 size_t total_size;
1390 pl.error = error;
1391 pl.msglen = msglen;
1392 if(with_offset) {
1393 offset += err_offset;
1394 type = NBD_REPLY_TYPE_ERROR_OFFSET;
1395 } else {
1396 type = NBD_REPLY_TYPE_ERROR;
1398 buf[0] = &pl;
1399 bufsize[0] = sizeof pl;
1400 total_size = bufsize[0];
1401 if(msglen > 0) {
1402 buf[payloads] = errmsg;
1403 bufsize[payloads++] = msglen;
1404 total_size += msglen;
1406 if(with_offset) {
1407 buf[payloads] = &offset;
1408 bufsize[payloads++] = sizeof offset;
1409 total_size += sizeof offset;
1411 send_structured_chunk(client, ctx->req, NBD_REPLY_FLAG_DONE, type, total_size, payloads, buf, bufsize);
1412 return;
1414 send_structured_chunk_v(client, ctx->req, NBD_REPLY_FLAG_DONE, NBD_REPLY_TYPE_NONE, 0, 0);
1415 } else {
1416 struct nbd_reply rep;
1417 setup_reply(&rep, ctx->req);
1418 if(error) {
1419 rep.error = error;
1421 log_reply(client, &rep);
1422 pthread_mutex_lock(&(client->lock));
1423 socket_write(client, &rep, sizeof rep);
1424 if(!error) {
1425 socket_write(client, ctx->buf, ctx->buflen);
1427 pthread_mutex_unlock(&(client->lock));
1428 free(ctx->buf);
1433 * Read an amount of bytes at a given offset from the right file. This
1434 * abstracts the read-side of the multiple files option.
1436 * @param a The offset where the read should start
1437 * @param buf A buffer to read into
1438 * @param len The size of buf
1439 * @param client The client we're serving for
1440 * @return The number of bytes actually read, or -1 in case of an
1441 * error.
1443 ssize_t rawexpread(off_t a, char *buf, size_t len, CLIENT *client) {
1444 int fhandle;
1445 off_t foffset;
1446 size_t maxbytes;
1447 ssize_t retval;
1449 if(get_filepos(client, a, &fhandle, &foffset, &maxbytes))
1450 return -1;
1451 if(maxbytes && len > maxbytes)
1452 len = maxbytes;
1454 DEBUG("(READ from fd %d offset %llu len %u), ", fhandle, (long long unsigned int)foffset, (unsigned int)len);
1456 retval = pread(fhandle, buf, len, foffset);
1457 if (client->server->flags & F_TREEFILES) {
1458 close(fhandle);
1460 return retval;
1464 * Call rawexpread repeatedly until all data has been read.
1465 * @return 0 on success, nonzero on failure
1467 int rawexpread_fully(READ_CTX *ctx, CLIENT *client) {
1468 ssize_t ret=0;
1470 char *buf;
1472 while(ctx->current_len > 0) {
1473 buf = find_read_buf(ctx);
1474 if((ret = rawexpread((off_t)ctx->req->from + (off_t)ctx->current_offset, buf, ctx->current_len, client)) <= 0) {
1475 break;
1477 confirm_read(client, ctx, ret);
1478 ctx->current_offset += ret;
1479 ctx->current_len -= ret;
1481 return (ret < 0 || ctx->current_len != 0);
1484 #ifdef HAVE_SPLICE
1485 int rawexpsplice(int pipe, off_t a, size_t len, CLIENT *client, int dir,
1486 int fua)
1488 int fhandle;
1489 off_t foffset;
1490 size_t maxbytes;
1491 ssize_t retval;
1493 if (get_filepos(client, a, &fhandle, &foffset, &maxbytes))
1494 return -1;
1495 if (maxbytes && len > maxbytes)
1496 len = maxbytes;
1498 DEBUG("(SPLICE %s fd %d offset %llu len %u), ",
1499 (dir == SPLICE_IN) ? "from" : "to", fhandle,
1500 (unsigned long long)a, (unsigned)len);
1503 * SPLICE_F_MOVE doesn't actually work at the moment, but in the future
1504 * it might, so go ahead and use it.
1506 if (dir == SPLICE_IN) {
1507 retval = splice(fhandle, &foffset, pipe, NULL, len,
1508 SPLICE_F_MOVE);
1509 } else {
1510 retval = splice(pipe, NULL, fhandle, &foffset, len,
1511 SPLICE_F_MOVE);
1512 if (client->server->flags & F_SYNC)
1513 fsync(fhandle);
1514 else if (fua)
1515 fdatasync(fhandle);
1517 if (client->server->flags & F_TREEFILES)
1518 close(fhandle);
1519 return retval;
1523 * Splice an amount of bytes from the given offset from/into the right file
1524 * from/into the given pipe.
1525 * @param pipe The pipe we are using for this splice.
1526 * @param a The offset of the file we are operating on.
1527 * @param len The length of the splice.
1528 * @param client The client we're splicing for.
1529 * @param dir The direction we are doing the splice in.
1530 * @param fua Set if this is a write and we need to fua.
1531 * @return 0 on success, nonzero on failure.
1533 int expsplice(int pipe, off_t a, size_t len, CLIENT *client, int dir, int fua)
1535 ssize_t ret = 0;
1537 while (len > 0 &&
1538 (ret = rawexpsplice(pipe, a, len, client, dir, fua)) > 0) {
1539 a += ret;
1540 len -= ret;
1542 return (ret < 0 || len != 0);
1544 #endif /* HAVE_SPLICE */
1547 * Read an amount of bytes at a given offset from the right file. This
1548 * abstracts the read-side of the copyonwrite stuff, and calls
1549 * rawexpread() with the right parameters to do the actual work.
1550 * @param a The offset where the read should start
1551 * @param buf A buffer to read into
1552 * @param len The size of buf
1553 * @param client The client we're going to read for
1554 * @return 0 on success, nonzero on failure
1556 int expread(READ_CTX *ctx, CLIENT *client) {
1557 off_t rdlen, offset;
1558 off_t mapcnt, mapl, maph, pagestart;
1559 off_t a = (off_t)ctx->current_offset + (off_t)ctx->req->from;
1560 size_t len = (size_t) ctx->req->len;
1561 int rv = 0;
1563 DEBUG("Asked to read %u bytes at %llu.\n", (unsigned int)len, (unsigned long long)a);
1565 if (!(client->server->flags & F_COPYONWRITE) && !((client->server->flags & F_WAIT) && (client->export == NULL)))
1566 return(rawexpread_fully(ctx, client));
1568 mapl=a/DIFFPAGESIZE; maph=(a+len-1)/DIFFPAGESIZE;
1570 for (mapcnt=mapl;mapcnt<=maph;mapcnt++) {
1571 pagestart=mapcnt*DIFFPAGESIZE;
1572 offset=a-pagestart;
1573 rdlen=(0<DIFFPAGESIZE-offset && len<(size_t)(DIFFPAGESIZE-offset)) ?
1574 len : (size_t)DIFFPAGESIZE-offset;
1575 if (!(client->server->flags & F_COPYONWRITE))
1576 pthread_rwlock_rdlock(&client->export_lock);
1577 if (client->difmap[mapcnt]!=(u32)(-1)) { /* the block is already there */
1578 DEBUG("Page %llu is at %lu\n", (unsigned long long)mapcnt,
1579 (unsigned long)(client->difmap[mapcnt]));
1580 char *buf = find_read_buf(ctx);
1581 if (pread(client->difffile, buf, rdlen, client->difmap[mapcnt]*DIFFPAGESIZE+offset) != rdlen) {
1582 goto fail;
1584 confirm_read(client, ctx, rdlen);
1585 } else { /* the block is not there */
1586 if ((client->server->flags & F_WAIT) && (client->export == NULL)){
1587 DEBUG("Page %llu is not here, and waiting for file\n",
1588 (unsigned long long)mapcnt);
1589 goto fail;
1590 } else {
1591 DEBUG("Page %llu is not here, we read the original one\n",
1592 (unsigned long long)mapcnt);
1593 ctx->current_len = rdlen;
1594 if(rawexpread_fully(ctx, client)) goto fail;
1597 if (!(client->server->flags & F_COPYONWRITE))
1598 pthread_rwlock_unlock(&client->export_lock);
1599 len-=rdlen; a+=rdlen;
1601 rv = 0;
1602 goto end;
1603 fail:
1604 if (!(client->server->flags & F_COPYONWRITE))
1605 pthread_rwlock_unlock(&client->export_lock);
1606 rv = -1;
1607 end:
1608 return rv;
1612 * Write an amount of bytes at a given offset to the right file. This
1613 * abstracts the write-side of the copyonwrite option, and calls
1614 * rawexpwrite() with the right parameters to do the actual work.
1616 * @param a The offset where the write should start
1617 * @param buf The buffer to write from
1618 * @param len The length of buf
1619 * @param client The client we're going to write for.
1620 * @param fua Flag to indicate 'Force Unit Access'
1621 * @return 0 on success, nonzero on failure
1623 int expwrite(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
1624 char pagebuf[DIFFPAGESIZE];
1625 off_t mapcnt,mapl,maph;
1626 off_t wrlen,rdlen;
1627 off_t pagestart;
1628 off_t offset;
1630 DEBUG("Asked to write %u bytes at %llu.\n", (unsigned int)len, (unsigned long long)a);
1633 if (!(client->server->flags & F_COPYONWRITE) && !((client->server->flags & F_WAIT) && (client->export == NULL)))
1634 return(rawexpwrite_fully(a, buf, len, client, fua));
1636 mapl=a/DIFFPAGESIZE ; maph=(a+len-1)/DIFFPAGESIZE ;
1638 for (mapcnt=mapl;mapcnt<=maph;mapcnt++) {
1639 pagestart=mapcnt*DIFFPAGESIZE ;
1640 offset=a-pagestart ;
1641 wrlen=(0<DIFFPAGESIZE-offset && len<(size_t)(DIFFPAGESIZE-offset)) ?
1642 len : (size_t)DIFFPAGESIZE-offset;
1644 if (!(client->server->flags & F_COPYONWRITE))
1645 pthread_rwlock_rdlock(&client->export_lock);
1646 if (client->difmap[mapcnt]!=(u32)(-1)) { /* the block is already there */
1647 DEBUG("Page %llu is at %lu\n", (unsigned long long)mapcnt,
1648 (unsigned long)(client->difmap[mapcnt])) ;
1649 if (pwrite(client->difffile, buf, wrlen, client->difmap[mapcnt]*DIFFPAGESIZE+offset) != wrlen) goto fail;
1650 } else { /* the block is not there */
1651 client->difmap[mapcnt]=(client->server->flags&F_SPARSE)?mapcnt:client->difffilelen++;
1652 DEBUG("Page %llu is not here, we put it at %lu\n",
1653 (unsigned long long)mapcnt,
1654 (unsigned long)(client->difmap[mapcnt]));
1655 if ((offset != 0) || (wrlen != DIFFPAGESIZE)){
1656 if ((client->server->flags & F_WAIT) && (client->export == NULL)){
1657 DEBUG("error: we can write only whole page while waiting for file\n");
1658 goto fail;
1660 rdlen=DIFFPAGESIZE;
1661 int ret;
1662 char *ptr = pagebuf;
1663 while(rdlen > 0 && (ret = rawexpread(pagestart, ptr, rdlen, client)) > 0) {
1664 pagestart += ret;
1665 ptr += ret;
1666 rdlen -= ret;
1668 if(ret < 0 ) goto fail;
1670 memcpy(pagebuf+offset,buf,wrlen) ;
1671 if (write(client->difffile, pagebuf, DIFFPAGESIZE) != DIFFPAGESIZE)
1672 goto fail;
1674 if (!(client->server->flags & F_COPYONWRITE))
1675 pthread_rwlock_unlock(&client->export_lock);
1676 len-=wrlen ; a+=wrlen ; buf+=wrlen ;
1678 if (client->server->flags & F_SYNC) {
1679 fsync(client->difffile);
1680 } else if (fua) {
1681 /* open question: would it be cheaper to do multiple sync_file_ranges?
1682 as we iterate through the above?
1684 fdatasync(client->difffile);
1686 return 0;
1687 fail:
1688 if (!(client->server->flags & F_COPYONWRITE))
1689 pthread_rwlock_unlock(&client->export_lock);
1690 return -1;
1695 * Write an amount of zeroes at a given offset to the right file.
1696 * This routine could be optimised by not calling expwrite. However,
1697 * this is by far the simplest way to do it.
1699 * @param req the request
1700 * @param client The client we're going to write for.
1701 * @return 0 on success, nonzero on failure
1703 int expwrite_zeroes(struct nbd_request* req, CLIENT* client, int fua) {
1704 off_t a = req->from;
1705 size_t len = req->len;
1706 size_t maxsize = 64LL*1024LL*1024LL;
1707 /* use calloc() as sadly MAP_ANON is apparently not POSIX standard */
1708 char *buf = calloc (1, maxsize);
1709 int ret;
1710 while (len > 0) {
1711 size_t l = len;
1712 if (l > maxsize)
1713 l = maxsize;
1714 ret = expwrite(a, buf, l, client, fua);
1715 if (ret) {
1716 free(buf);
1717 return ret;
1719 len -= l;
1721 free(buf);
1722 return 0;
1726 * Flush data to a client
1728 * @param client The client we're going to write for.
1729 * @return 0 on success, nonzero on failure
1731 int expflush(CLIENT *client) {
1732 gint i;
1734 if (client->server->flags & F_COPYONWRITE) {
1735 return fsync(client->difffile);
1738 if (client->server->flags & F_WAIT) {
1739 return fsync(client->difffile);
1742 if (client->server->flags & F_TREEFILES ) {
1743 // all we can do is force sync the entire filesystem containing the tree
1744 if (client->server->flags & F_READONLY)
1745 return 0;
1746 sync();
1747 return 0;
1750 for (i = 0; i < client->export->len; i++) {
1751 FILE_INFO fi = g_array_index(client->export, FILE_INFO, i);
1752 if (fsync(fi.fhandle) < 0)
1753 return -1;
1756 return 0;
/**
 * Try to deallocate ("punch a hole" into) a byte range of a file or block
 * device, using whichever mechanism was enabled at build time. Each
 * mechanism is retried while it fails with EINTR; the first one that
 * succeeds ends the function. Failure is only reported through DEBUG.
 *
 * @param fd the file descriptor to operate on
 * @param off start of the range
 * @param len length of the range
 **/
void punch_hole(int fd, off_t off, off_t len) {
	DEBUG("Request to punch a hole in fd=%d, starting from %llu, length %llu\n", fd, (unsigned long long)off, (unsigned long long)len);
	errno = 0;
// fallocate -- files, Linux
#if HAVE_FALLOC_PH
	for(;;) {
		if(fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, off, len) == 0)
			return;
		if(errno != EINTR)
			break;
	}
#endif
// ioctl(BLKDISCARD) -- block devices, Linux
#if HAVE_BLKDISCARD
	uint64_t range[2] = {off, len};
	for(;;) {
		if(ioctl(fd, BLKDISCARD, range) == 0)
			return;
		if(errno != EINTR)
			break;
	}
#endif
// Windows
#if HAVE_FSCTL_SET_ZERO_DATA
	FILE_ZERO_DATA_INFORMATION zerodata;
	zerodata.FileOffset.QuadPart = off;
	zerodata.BeyondFinalZero.QuadPart = off + len;
	HANDLE w32handle = (HANDLE)_get_osfhandle(fd);
	DWORD bytesret;
	DeviceIoControl(w32handle, FSCTL_SET_ZERO_DATA, &zerodata, sizeof(zerodata), NULL, 0, &bytesret, NULL);
	return;
#endif
	/* errno is still 0 here if no mechanism was compiled in */
	if(!errno) {
		DEBUG("punching holes not supported on this platform\n");
	} else {
		DEBUG("punching holes failed: %s", strerror(errno));
	}
}
1794 static void send_reply(CLIENT* client, uint32_t opt, uint32_t reply_type, ssize_t datasize, void* data) {
1795 struct {
1796 uint64_t magic;
1797 uint32_t opt;
1798 uint32_t reply_type;
1799 uint32_t datasize;
1800 } __attribute__ ((packed)) header = {
1801 htonll(0x3e889045565a9LL),
1802 htonl(opt),
1803 htonl(reply_type),
1804 htonl(datasize),
1806 if(datasize < 0) {
1807 datasize = strlen((char*)data);
1808 header.datasize = htonl(datasize);
1810 socket_write(client, &header, sizeof(header));
1811 if(data != NULL) {
1812 socket_write(client, data, datasize);
1817 * Find the name of the file we have to serve. This will use g_strdup_printf
1818 * to put the IP address of the client inside a filename containing
1819 * "%s" (in the form as specified by the "virtstyle" option). That name
1820 * is then written to client->exportname.
1822 * @param net A socket connected to an nbd client
1823 * @param client information about the client. The IP address in human-readable
1824 * format will be written to a new char* buffer, the address of which will be
1825 * stored in client->clientname.
1826 * @return: 0 - OK, -1 - failed.
1828 int set_peername(int net, CLIENT *client) {
1829 struct sockaddr_storage netaddr;
1830 struct sockaddr* addr = (struct sockaddr*)&netaddr;
1831 socklen_t addrinlen = sizeof( struct sockaddr_storage );
1832 struct addrinfo hints;
1833 struct addrinfo *ai = NULL;
1834 char peername[NI_MAXHOST];
1835 char netname[NI_MAXHOST];
1836 char *tmp = NULL;
1837 int i;
1838 int e;
1840 if (getsockname(net, addr, &addrinlen) < 0) {
1841 msg(LOG_INFO, "getsockname failed: %m");
1842 return -1;
1845 if(netaddr.ss_family == AF_UNIX) {
1846 client->clientaddr.ss_family = AF_UNIX;
1847 strcpy(peername, "unix");
1848 } else {
1849 if (getpeername(net, (struct sockaddr *) &(client->clientaddr), &addrinlen) < 0) {
1850 msg(LOG_INFO, "getpeername failed: %m");
1851 return -1;
1853 if((e = getnameinfo((struct sockaddr *)&(client->clientaddr), addrinlen,
1854 peername, sizeof (peername), NULL, 0, NI_NUMERICHOST))) {
1855 msg(LOG_INFO, "getnameinfo failed: %s", gai_strerror(e));
1856 return -1;
1859 memset(&hints, '\0', sizeof (hints));
1860 hints.ai_flags = AI_ADDRCONFIG;
1861 e = getaddrinfo(peername, NULL, &hints, &ai);
1863 if(e != 0) {
1864 msg(LOG_INFO, "getaddrinfo failed: %s", gai_strerror(e));
1865 freeaddrinfo(ai);
1866 return -1;
1870 if(strncmp(peername, "::ffff:", 7) == 0) {
1871 memmove(peername, peername+7, strlen(peername));
1874 switch(client->server->virtstyle) {
1875 case VIRT_NONE:
1876 msg(LOG_DEBUG, "virtualization is off");
1877 client->exportname=g_strdup(client->server->exportname);
1878 break;
1879 case VIRT_IPHASH:
1880 msg(LOG_DEBUG, "virtstyle iphash");
1881 for(i=0;i<strlen(peername);i++) {
1882 if(peername[i]=='.') {
1883 peername[i]='/';
1886 break;
1887 case VIRT_IPLIT:
1888 msg(LOG_DEBUG, "virtstyle ipliteral");
1889 client->exportname=g_strdup_printf(client->server->exportname, peername);
1890 break;
1891 case VIRT_CIDR:
1892 msg(LOG_DEBUG, "virtstyle cidr %d", client->server->cidrlen);
1893 memcpy(&netaddr, &(client->clientaddr), addrinlen);
1894 int addrbits;
1895 if(client->clientaddr.ss_family == AF_UNIX) {
1896 tmp = g_strdup(peername);
1897 } else {
1898 assert((ai->ai_family == AF_INET) || (ai->ai_family == AF_INET6));
1899 if(ai->ai_family == AF_INET) {
1900 addrbits = 32;
1901 } else if(ai->ai_family == AF_INET6) {
1902 addrbits = 128;
1903 } else {
1904 g_assert_not_reached();
1906 uint8_t* addrptr = (uint8_t*)(((struct sockaddr*)&netaddr)->sa_data);
1907 for(int i = 0; i < addrbits; i+=8) {
1908 int masklen = client->server->cidrlen - i;
1909 masklen = masklen > 0 ? masklen : 0;
1910 uint8_t mask = getmaskbyte(masklen);
1911 *addrptr &= mask;
1912 addrptr++;
1914 getnameinfo((struct sockaddr *) &netaddr, addrinlen,
1915 netname, sizeof (netname), NULL, 0, NI_NUMERICHOST);
1916 tmp=g_strdup_printf("%s/%s", netname, peername);
1919 if(tmp != NULL) {
1920 client->exportname=g_strdup_printf(client->server->exportname, tmp);
1921 g_free(tmp);
1924 break;
1927 if(ai) {
1928 freeaddrinfo(ai);
1930 msg(LOG_INFO, "connect from %s, assigned file is %s",
1931 peername, client->exportname);
1932 client->clientname=g_strdup(peername);
1933 return 0;
1936 int commit_diff(CLIENT* client, bool lock, int fhandle){
1937 int dirtycount = 0;
1938 int pagecount = client->exportsize/DIFFPAGESIZE;
1939 off_t offset;
1940 char* buf = malloc(sizeof(char)*DIFFPAGESIZE);
1942 for (int i=0; i<pagecount; i++){
1943 offset = DIFFPAGESIZE*i;
1944 if (lock)
1945 pthread_rwlock_wrlock(&client->export_lock);
1946 if (client->difmap[i] != (u32)-1){
1947 dirtycount += 1;
1948 DEBUG("flushing dirty page %d, offset %ld\n", i, offset);
1949 if (pread(client->difffile, buf, DIFFPAGESIZE, client->difmap[i]*DIFFPAGESIZE) != DIFFPAGESIZE) {
1950 msg(LOG_WARNING, "could not read while committing diff: %m");
1951 if(lock) {
1952 pthread_rwlock_unlock(&client->export_lock);
1954 break;
1956 if (pwrite(fhandle, buf, DIFFPAGESIZE, offset) != DIFFPAGESIZE) {
1957 msg(LOG_WARNING, "could not write while committing diff: %m");
1958 if (lock) {
1959 pthread_rwlock_unlock(&client->export_lock);
1961 break;
1963 client->difmap[i] = (u32)-1;
1965 if (lock)
1966 pthread_rwlock_unlock(&client->export_lock);
1969 free(buf);
1970 return dirtycount;
1973 void* wait_file(void *void_ptr) {
1974 CLIENT* client = (CLIENT *)void_ptr;
1975 FILE_INFO fi;
1976 GArray* export;
1977 mode_t mode = O_RDWR;
1978 int dirtycount;
1980 fi.fhandle = -1;
1981 fi.startoff = 0;
1983 while (fi.fhandle < 1){
1984 sem_wait(&file_wait_sem);
1985 msg(LOG_INFO, "checking for file %s", client->server->exportname);
1986 fi.fhandle = open(client->server->exportname, mode);
1989 msg(LOG_INFO, "File %s appeared, fd %d", client->server->exportname, fi.fhandle);
1991 // first time there may be lot of data so we lock only per page
1992 do {
1993 dirtycount = commit_diff(client, true, fi.fhandle);
1994 } while (dirtycount > 0);
1996 //last time we lock export for the whole time until we switch write destination
1997 pthread_rwlock_wrlock(&client->export_lock);
1998 do {
1999 dirtycount = commit_diff(client, false, fi.fhandle);
2000 } while (dirtycount > 0);
2002 export = g_array_new(TRUE, TRUE, sizeof(FILE_INFO));
2003 g_array_append_val(export, fi);
2005 client->export = export;
2006 pthread_rwlock_unlock(&client->export_lock);
2007 msg(LOG_INFO, "Waiting for file ended, switching to exported file %s", client->server->exportname);
2009 return NULL;
2013 * Set up client export array, which is an array of FILE_INFO.
2014 * Also, split a single exportfile into multiple ones, if that was asked.
2015 * @param client information on the client which we want to setup export for
2017 bool setupexport(CLIENT* client) {
2018 int i = 0;
2019 off_t laststartoff = 0, lastsize = 0;
2020 int multifile = (client->server->flags & F_MULTIFILE);
2021 int treefile = (client->server->flags & F_TREEFILES);
2022 int temporary = (client->server->flags & F_TEMPORARY) && !multifile;
2023 int cancreate = (client->server->expected_size) && !multifile;
2025 if (treefile || (client->server->flags & F_WAIT)) {
2026 client->export = NULL; // this could be thousands of files so we open handles on demand although its slower
2027 client->exportsize = client->server->expected_size; // available space is not checked, as it could change during runtime anyway
2029 if(client->server->flags & F_WAIT){
2030 pthread_t wait_file_thread;
2031 if (pthread_create(&wait_file_thread, NULL, wait_file, client)){
2032 DEBUG("failed to create wait_file thread");
2033 return false;
2037 } else {
2038 client->export = g_array_new(TRUE, TRUE, sizeof(FILE_INFO));
2040 /* If multi-file, open as many files as we can.
2041 * If not, open exactly one file.
2042 * Calculate file sizes as we go to get total size. */
2043 for(i=0; ; i++) {
2044 FILE_INFO fi;
2045 _cleanup_g_free_ gchar *tmpname = NULL;
2046 _cleanup_g_free_ gchar* error_string = NULL;
2048 if (i)
2049 cancreate = 0;
2050 /* if expected_size is specified, and this is the first file, we can create the file */
2051 mode_t mode = (client->server->flags & F_READONLY) ?
2052 O_RDONLY : (O_RDWR | (cancreate?O_CREAT:0));
2054 if (temporary) {
2055 tmpname=g_strdup_printf("%s.%d-XXXXXX", client->exportname, i);
2056 DEBUG( "Opening %s\n", tmpname );
2057 fi.fhandle = mkstemp(tmpname);
2058 } else {
2059 if(multifile) {
2060 tmpname=g_strdup_printf("%s.%d", client->exportname, i);
2061 } else {
2062 tmpname=g_strdup(client->exportname);
2064 DEBUG( "Opening %s\n", tmpname );
2065 fi.fhandle = open(tmpname, mode, 0600);
2066 if(fi.fhandle == -1 && mode == O_RDWR) {
2067 /* Try again because maybe media was read-only */
2068 fi.fhandle = open(tmpname, O_RDONLY);
2069 if(fi.fhandle != -1) {
2070 /* Opening the base file in copyonwrite mode is
2071 * okay */
2072 if(!(client->server->flags & F_COPYONWRITE)) {
2073 client->server->flags |= F_AUTOREADONLY;
2074 client->server->flags |= F_READONLY;
2079 if(fi.fhandle == -1) {
2080 if(multifile && i>0)
2081 break;
2082 error_string=g_strdup_printf(
2083 "Could not open exported file %s: %%m",
2084 tmpname);
2085 err_nonfatal(error_string);
2086 return false;
2089 if (temporary) {
2090 unlink(tmpname); /* File will stick around whilst FD open */
2093 fi.startoff = laststartoff + lastsize;
2094 g_array_append_val(client->export, fi);
2096 /* Starting offset and size of this file will be used to
2097 * calculate starting offset of next file */
2098 laststartoff = fi.startoff;
2099 lastsize = size_autodetect(fi.fhandle);
2101 /* If we created the file, it will be length zero */
2102 if (!lastsize && cancreate) {
2103 assert(!multifile);
2104 if(ftruncate (fi.fhandle, client->server->expected_size)<0) {
2105 err_nonfatal("Could not expand file: %m");
2106 return false;
2108 lastsize = client->server->expected_size;
2109 break; /* don't look for any more files */
2112 if(!multifile || temporary)
2113 break;
2116 /* Set export size to total calculated size */
2117 client->exportsize = laststartoff + lastsize;
2119 /* Export size may be overridden */
2120 if(client->server->expected_size) {
2121 /* desired size must be <= total calculated size */
2122 if(client->server->expected_size > client->exportsize) {
2123 err_nonfatal("Size of exported file is too big\n");
2124 return false;
2127 client->exportsize = client->server->expected_size;
2131 msg(LOG_INFO, "Size of exported file/device is %llu", (unsigned long long)client->exportsize);
2132 if(multifile) {
2133 msg(LOG_INFO, "Total number of files: %d", i);
2135 if(treefile) {
2136 msg(LOG_INFO, "Total number of (potential) files: %" PRId64, (client->exportsize+TREEPAGESIZE-1)/TREEPAGESIZE);
2138 return true;
2141 bool copyonwrite_prepare(CLIENT* client) {
2142 off_t i;
2143 _cleanup_g_free_ gchar* dir = NULL;
2144 _cleanup_g_free_ gchar* export_base = NULL;
2145 if (client->server->cowdir != NULL) {
2146 dir = g_strdup(client->server->cowdir);
2147 } else {
2148 dir = g_strdup(dirname(client->exportname));
2150 export_base = g_strdup(basename(client->exportname));
2151 client->difffilename = g_strdup_printf("%s/%s-%s-%d.diff",dir,export_base,client->clientname,
2152 (int)getpid());
2153 msg(LOG_INFO, "About to create map and diff file %s", client->difffilename) ;
2154 client->difffile=open(client->difffilename,O_RDWR | O_CREAT | O_TRUNC,0600) ;
2155 if (client->difffile<0) {
2156 err("Could not create diff file (%m)");
2157 return false;
2159 if ((client->difmap=calloc(client->exportsize/DIFFPAGESIZE,sizeof(u32)))==NULL) {
2160 err("Could not allocate memory");
2161 return false;
2163 for (i=0;i<client->exportsize/DIFFPAGESIZE;i++) client->difmap[i]=(u32)-1;
2165 return true;
2168 void send_export_info(CLIENT* client, SERVER* server, bool maybe_zeroes) {
2169 uint64_t size_host = htonll((u64)(client->exportsize));
2170 uint16_t flags = NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_WRITE_ZEROES;
2172 socket_write(client, &size_host, 8);
2173 if (server->flags & F_READONLY)
2174 flags |= NBD_FLAG_READ_ONLY;
2175 if (server->flags & F_FLUSH)
2176 flags |= NBD_FLAG_SEND_FLUSH;
2177 if (server->flags & F_FUA)
2178 flags |= NBD_FLAG_SEND_FUA;
2179 if (server->flags & F_ROTATIONAL)
2180 flags |= NBD_FLAG_ROTATIONAL;
2181 if (server->flags & F_TRIM)
2182 flags |= NBD_FLAG_SEND_TRIM;
2183 if (!(server->flags & F_COPYONWRITE))
2184 flags |= NBD_FLAG_CAN_MULTI_CONN;
2185 if (client->clientflags & F_STRUCTURED)
2186 flags |= NBD_FLAG_SEND_DF;
2187 flags = htons(flags);
2188 socket_write(client, &flags, sizeof(flags));
2189 if (!(glob_flags & F_NO_ZEROES) && maybe_zeroes) {
2190 char zeros[128];
2191 memset(zeros, '\0', sizeof(zeros));
2192 socket_write(client, zeros, 124);
2197 * Setup the transaction log
2199 * The function does all things required for the transaction log:
2200 * - Create a new log file.
2201 * - allocate the posix semaphore for synchronization.
2202 * - Report if a log file already exists.
2203 * - If needed add a header to the log.
2205 * If something goes wrong, logging is disabled.
2207 * @param client the CLIENT structure with .server and .net members set
2208 * up correctly
2210 static void setup_transactionlog(CLIENT *client) {
2211 struct stat fdinfo;
2212 int ret;
2214 /* 1) create the file */
2215 if((client->transactionlogfd =
2216 open(client->server->transactionlog,
2217 O_WRONLY | O_CREAT,
2218 S_IRUSR | S_IWUSR)) ==
2219 -1) {
2220 msg(LOG_INFO, "Could not open transactionlog %s, moving on without it",
2221 client->server->transactionlog);
2222 return;
2225 /* 2) If needed, write flags */
2226 if (client->server->flags & F_DATALOG) {
2227 struct nbd_request req;
2228 int ret;
2230 req.magic = htonl(NBD_TRACELOG_MAGIC);
2231 req.type = htonl(NBD_TRACELOG_SET_DATALOG);
2232 req.cookie = 0;
2233 req.from = htonll(NBD_TRACELOG_FROM_MAGIC);
2234 req.len = htonl(TRUE);
2236 ret = writeit(client->transactionlogfd, &req, sizeof(struct nbd_request));
2237 if (ret < 0) {
2238 msg(LOG_INFO, "Could not write to transactionlog %s, moving on without it",
2239 client->server->transactionlog);
2240 close(client->transactionlogfd);
2241 client->transactionlogfd = -1;
2242 return;
2246 /* 3) Allocate the semaphore used for locking */
2247 ret = fstat(client->transactionlogfd, &fdinfo);
2248 if (ret == -1) {
2249 msg(LOG_INFO, "Could not stat transactionlog %s, moving on without it",
2250 client->server->transactionlog);
2251 close(client->transactionlogfd);
2252 client->transactionlogfd = -1;
2253 return;
2255 snprintf(client->semname, sizeof(client->semname), "/nbd-server-%llx-%llx",
2256 (unsigned long long)fdinfo.st_dev,
2257 (unsigned long long)fdinfo.st_ino);
2258 client->logsem = sem_open(client->semname, O_CREAT, 0600, 1);
2259 if (client->logsem == SEM_FAILED) {
2260 msg(LOG_INFO, "Could not allocate semaphore for transactionlog %s, moving on without it",
2261 client->server->transactionlog);
2262 close(client->transactionlogfd);
2263 client->transactionlogfd = -1;
2268 * Commit to exporting the chosen export
2270 * When a client sends NBD_OPT_EXPORT_NAME or NBD_OPT_GO, we need to do
2271 * a number of things (verify whether the client is allowed access, try
2272 * to open files, etc etc) before we're ready to actually serve the
2273 * export.
2275 * This function does all those things.
2277 * @param client the CLIENT structure with .server and .net members set
2278 * up correctly
2279 * @return true if the client is allowed access to the export, false
2280 * otherwise
2282 static bool commit_client(CLIENT* client, SERVER* server) {
2283 char acl;
2284 uint32_t len;
2286 client->server = serve_inc_ref(server);
2287 client->exportsize = OFFT_MAX;
2288 client->transactionlogfd = -1;
2289 if(pthread_mutex_init(&(client->lock), NULL)) {
2290 msg(LOG_ERR, "Unable to initialize mutex");
2291 return false;
2293 if (pthread_rwlock_init(&client->export_lock, NULL)){
2294 msg(LOG_ERR, "Unable to initialize write lock");
2295 return false;
2297 /* Check whether we exceeded the maximum number of allowed
2298 * clients already */
2299 if(dontfork) {
2300 acl = 'Y';
2301 } else {
2302 len = strlen(client->server->servename);
2303 writeit(commsocket, &len, sizeof len);
2304 writeit(commsocket, client->server->servename, len);
2305 readit(commsocket, &acl, 1);
2306 close(commsocket);
2308 switch(acl) {
2309 case 'N':
2310 msg(LOG_ERR, "Connection not allowed (too many clients)");
2311 return false;
2312 case 'X':
2313 msg(LOG_ERR, "Connection not allowed (unknown by parent?!?)");
2314 return false;
2317 /* Check whether the client is listed in the authfile */
2318 if (set_peername(client->net, client)) {
2319 msg(LOG_ERR, "Failed to set peername");
2320 return false;
2323 if (!authorized_client(client)) {
2324 msg(LOG_INFO, "Client '%s' is not authorized to access",
2325 client->clientname);
2326 return false;
2329 /* Set up the transactionlog, if we need one */
2330 if (client->server->transactionlog && (client->transactionlogfd == -1))
2331 setup_transactionlog(client);
2333 /* Run any pre scripts that we may need */
2334 if (do_run(client->server->prerun, client->exportname)) {
2335 msg(LOG_INFO, "Client '%s' not allowed access by prerun script",
2336 client->clientname);
2337 return false;
2339 client->socket_closed = socket_closed_transmission;
2340 if(!setupexport(client)) {
2341 return false;
2344 if (client->server->flags & F_COPYONWRITE) {
2345 if(!copyonwrite_prepare(client)) {
2346 return false;
2350 if (client->server->flags & F_WAIT) {
2351 if(!copyonwrite_prepare(client)) {
2352 return false;
2356 setmysockopt(client->net);
2358 return true;
2361 static CLIENT* handle_export_name(CLIENT* client, uint32_t opt, GArray* servers, uint32_t cflags) {
2362 uint32_t namelen;
2363 char* name;
2364 int i;
2366 socket_read(client, &namelen, sizeof(namelen));
2367 namelen = ntohl(namelen);
2368 if(namelen > 4096) {
2369 return NULL;
2371 if(namelen > 0) {
2372 name = malloc(namelen+1);
2373 name[namelen]=0;
2374 socket_read(client, name, namelen);
2375 } else {
2376 name = strdup("");
2378 for(i=0; i<servers->len; i++) {
2379 SERVER* serve = (g_array_index(servers, SERVER*, i));
2380 // hide exports that are TLS-only if we haven't negotiated TLS
2381 // yet
2382 if ((serve->flags & F_FORCEDTLS) && !client->tls_session) {
2383 continue;
2385 if(!strcmp(serve->servename, name)) {
2386 client->clientfeats = cflags;
2387 free(name);
2388 if(!commit_client(client, serve)) {
2389 return NULL;
2391 send_export_info(client, serve, true);
2392 return client;
2395 free(name);
2396 err("Negotiation failed/8a: Requested export not found, or is TLS-only and client did not negotiate TLS");
2399 static void handle_list(CLIENT* client, uint32_t opt, GArray* servers, uint32_t cflags) {
2400 uint32_t len;
2401 int i;
2402 char buf[1024];
2403 char *ptr = buf + sizeof(len);
2405 socket_read(client, &len, sizeof(len));
2406 len = ntohl(len);
2407 if(len) {
2408 send_reply(client, opt, NBD_REP_ERR_INVALID, -1, "NBD_OPT_LIST with nonzero data length is not a valid request");
2410 if(!(glob_flags & F_LIST)) {
2411 send_reply(client, opt, NBD_REP_ERR_POLICY, -1, "Listing of exports denied by server configuration");
2412 err_nonfatal("Client tried disallowed list option");
2413 return;
2415 for(i=0; i<servers->len; i++) {
2416 SERVER* serve = (g_array_index(servers, SERVER*, i));
2417 // Hide TLS-only exports if we haven't negotiated TLS yet
2418 if(!client->tls_session && (serve->flags & F_FORCEDTLS)) {
2419 continue;
2421 len = htonl(strlen(serve->servename));
2422 memcpy(buf, &len, sizeof(len));
2423 strncpy(ptr, serve->servename, sizeof(buf) - sizeof(len));
2424 send_reply(client, opt, NBD_REP_SERVER, strlen(serve->servename)+sizeof(len), buf);
2426 send_reply(client, opt, NBD_REP_ACK, 0, NULL);
2429 #if HAVE_GNUTLS
2430 static int verify_cert(gnutls_session_t session) {
2431 int ret;
2432 unsigned int status, cert_list_size;
2433 const gnutls_datum_t *cert_list;
2434 gnutls_x509_crt_t cert;
2435 time_t now = time(NULL);
2437 ret = gnutls_certificate_verify_peers2(session, &status);
2438 if(ret < 0 || status != 0 || gnutls_certificate_type_get(session) !=
2439 GNUTLS_CRT_X509) {
2440 goto err;
2443 if(gnutls_x509_crt_init(&cert) < 0) {
2444 goto err;
2447 cert_list = gnutls_certificate_get_peers(session, &cert_list_size);
2448 if(cert_list == NULL) {
2449 goto err;
2451 if(gnutls_x509_crt_import(cert, &cert_list[0], GNUTLS_X509_FMT_DER) < 0) {
2452 goto err;
2454 if(gnutls_x509_crt_get_activation_time(cert) > now) {
2455 goto err;
2457 if(gnutls_x509_crt_get_expiration_time(cert) < now) {
2458 goto err;
2460 // TODO: check CRLs and/or OCSP etc. Patches welcome.
2461 msg(LOG_INFO, "client certificate verification successful");
2462 return 0;
2463 err:
2464 msg(LOG_ERR, "E: client certificate verification failed");
2465 return GNUTLS_E_CERTIFICATE_ERROR;
2468 CLIENT* handle_starttls(CLIENT* client, int opt, GArray* servers, uint32_t cflags, struct generic_conf *genconf) {
2469 #define check_rv(c) if((c)<0) { retval = NULL; goto exit; }
2470 gnutls_certificate_credentials_t x509_cred;
2471 CLIENT* retval = client;
2472 gnutls_priority_t priority_cache;
2473 gnutls_session_t *session = g_new0(gnutls_session_t, 1);
2474 int ret;
2475 int len;
2477 socket_read(client, &len, sizeof(len));
2478 if(G_UNLIKELY(len != 0)) {
2479 char buf[1024*1024];
2480 consume(client, len, buf, sizeof(buf));
2481 send_reply(client, opt, NBD_REP_ERR_INVALID, -1, "Sending a STARTTLS command with data is invalid");
2482 return NULL;
2485 send_reply(client, opt, NBD_REP_ACK, 0, NULL);
2487 check_rv(gnutls_certificate_allocate_credentials(&x509_cred));
2488 gnutls_certificate_set_verify_function(x509_cred, verify_cert);
2489 check_rv(gnutls_certificate_set_x509_trust_file(x509_cred, genconf->cacertfile, GNUTLS_X509_FMT_PEM));
2490 check_rv(gnutls_certificate_set_x509_key_file(x509_cred, genconf->certfile, genconf->keyfile, GNUTLS_X509_FMT_PEM));
2491 check_rv(gnutls_priority_init(&priority_cache, genconf->tlsprio, NULL));
2492 check_rv(gnutls_init(session, GNUTLS_SERVER));
2493 check_rv(gnutls_priority_set(*session, priority_cache));
2494 check_rv(gnutls_credentials_set(*session, GNUTLS_CRD_CERTIFICATE, x509_cred));
2496 gnutls_certificate_server_set_request(*session, GNUTLS_CERT_REQUEST);
2497 #if GNUTLS_VERSION_NUMBER >= 0x030109
2498 gnutls_transport_set_int(*session, client->net);
2499 #else
2500 gnutls_transport_set_ptr(*session, (gnutls_transport_ptr_t) (intptr_t) client->net);
2501 #endif
2502 do {
2503 ret = gnutls_handshake(*session);
2504 } while(ret < 0 && gnutls_error_is_fatal(ret) == 0);
2506 if (ret < 0) {
2507 err_nonfatal(gnutls_strerror(ret));
2508 gnutls_bye(*session, GNUTLS_SHUT_RDWR);
2509 gnutls_deinit(*session);
2510 g_free(session);
2511 return NULL;
2513 client->tls_session = session;
2514 client->socket_read = socket_read_tls;
2515 client->socket_write = socket_write_tls;
2516 #undef check_rv
2517 exit:
2518 if(retval == NULL && session != NULL) {
2519 g_free(session);
2521 /* export names cannot be chosen before NBD_OPT_STARTTLS and be retained */
2522 if(retval != NULL && retval->server != NULL) {
2523 retval->server = NULL;
2525 return retval;
2527 #endif
2530 * Handle an NBD_OPT_STRUCTURED_REPLY message
2532 static void handle_structured_reply(CLIENT *client, uint32_t opt, GArray *servers, uint32_t cflags) {
2533 uint32_t len;
2534 int i;
2536 socket_read(client, &len, sizeof(len));
2537 len = ntohl(len);
2538 if(len) {
2539 send_reply(client, opt, NBD_REP_ERR_INVALID, -1, "NBD_OPT_STRUCTURED_REPLY with nonzero data length is not a valid request");
2540 char buf[1024];
2541 consume(client, len, buf, sizeof buf);
2542 return;
2544 if(client->clientflags & F_STRUCTURED) {
2545 send_reply(client, opt, NBD_REP_ERR_INVALID, -1, "NBD_OPT_STRUCTURED_REPLY has already been called");
2546 return;
2548 client->clientflags |= F_STRUCTURED;
2549 send_reply(client, opt, NBD_REP_ACK, 0, NULL);
2553 * Handle an NBD_OPT_INFO or NBD_OPT_GO request.
2555 static bool handle_info(CLIENT* client, uint32_t opt, GArray* servers, uint32_t cflags) {
2556 uint32_t namelen, len;
2557 char *name;
2558 int i;
2559 SERVER *server = NULL;
2560 uint16_t n_requests;
2561 uint16_t request;
2562 char buf[1024];
2563 bool sent_export = false;
2564 uint32_t reptype = NBD_REP_ERR_UNKNOWN;
2565 char *msg = "Export unknown";
2567 socket_read(client, &len, sizeof(len));
2568 len = htonl(len);
2569 socket_read(client, &namelen, sizeof(namelen));
2570 namelen = htonl(namelen);
2571 if(namelen > (len - 6)) {
2572 send_reply(client, opt, NBD_REP_ERR_INVALID, -1, "An OPT_INFO request cannot be smaller than the length of the name + 6");
2573 consume(client, len - sizeof(namelen), buf, sizeof(buf));
2575 if(namelen > 4096) {
2576 send_reply(client, opt, NBD_REP_ERR_INVALID, -1, "The name for this OPT_INFO request is too long");
2577 consume(client, namelen, buf, sizeof(buf));
2579 if(namelen > 0) {
2580 name = malloc(namelen + 1);
2581 if (!name) {
2582 send_reply(client, opt, reptype, -1, "nbd server out of memory");
2583 return false;
2585 name[namelen] = 0;
2586 socket_read(client, name, namelen);
2587 } else {
2588 name = strdup("");
2590 for(i=0; i<servers->len; i++) {
2591 SERVER *serve = (g_array_index(servers, SERVER*, i));
2592 if (!strcmp(serve->servename, name)) {
2593 if ((serve->flags & F_FORCEDTLS) && !client->tls_session) {
2594 reptype = NBD_REP_ERR_TLS_REQD;
2595 msg = "TLS is required for that export";
2596 continue;
2598 server = serve;
2601 free(name);
2602 socket_read(client, &n_requests, sizeof(n_requests));
2603 n_requests = ntohs(n_requests);
2604 if(!server) {
2605 consume(client, n_requests * sizeof(request), buf,
2606 sizeof(buf));
2607 send_reply(client, opt, reptype, -1, msg);
2608 return false;
2610 if (opt == NBD_OPT_GO) {
2611 client->clientfeats = cflags;
2612 if(!commit_client(client, server)) {
2613 consume(client, n_requests * sizeof(request), buf,
2614 sizeof(buf));
2615 send_reply(client, opt, NBD_REP_ERR_POLICY, -1, "Access denied by server configuration");
2616 return false;
2619 for(i=0; i<n_requests; i++) {
2620 socket_read(client, &request, sizeof(request));
2621 switch(ntohs(request)) {
2622 case NBD_INFO_EXPORT:
2623 send_reply(client, opt, NBD_REP_INFO, 12, NULL);
2624 socket_write(client, &request, 2);
2625 send_export_info(client, server, false);
2626 sent_export = true;
2627 break;
2628 default:
2629 // ignore all other options for now.
2630 break;
2633 if(!sent_export) {
2634 request = htons(NBD_INFO_EXPORT);
2635 send_reply(client, opt, NBD_REP_INFO, 12, NULL);
2636 socket_write(client, &request, 2);
2637 send_export_info(client, server, false);
2639 send_reply(client, opt, NBD_REP_ACK, 0, NULL);
2641 return true;
2645 * Do the initial negotiation.
2647 * @param net The socket we're doing the negotiation over.
2648 * @param servers The array of known servers.
2649 * @param genconf the global options (needed for accessing TLS config data)
2651 CLIENT* negotiate(int net, GArray* servers, struct generic_conf *genconf) {
2652 uint16_t smallflags = NBD_FLAG_FIXED_NEWSTYLE | NBD_FLAG_NO_ZEROES;
2653 uint64_t magic;
2654 uint32_t cflags = 0;
2655 uint32_t opt;
2656 CLIENT* client = g_new0(CLIENT, 1);
2657 client->net = net;
2658 client->socket_read = socket_read_notls;
2659 client->socket_write = socket_write_notls;
2660 client->socket_closed = socket_closed_negotiate;
2661 client->transactionlogfd = -1;
2662 client->logsem = SEM_FAILED;
2664 assert(servers != NULL);
2665 socket_write(client, INIT_PASSWD, 8);
2666 magic = htonll(opts_magic);
2667 socket_write(client, &magic, sizeof(magic));
2669 smallflags = htons(smallflags);
2670 socket_write(client, &smallflags, sizeof(uint16_t));
2671 socket_read(client, &cflags, sizeof(cflags));
2672 cflags = htonl(cflags);
2673 if (cflags & NBD_FLAG_C_NO_ZEROES) {
2674 glob_flags |= F_NO_ZEROES;
2676 do {
2677 socket_read(client, &magic, sizeof(magic));
2678 magic = ntohll(magic);
2679 if(magic != opts_magic) {
2680 err_nonfatal("Negotiation failed/5a: magic mismatch");
2681 goto handler_err;
2683 socket_read(client, &opt, sizeof(opt));
2684 opt = ntohl(opt);
2685 if(client->tls_session == NULL
2686 && glob_flags & F_FORCEDTLS
2687 && opt != NBD_OPT_STARTTLS) {
2688 if(opt == NBD_OPT_EXPORT_NAME) {
2689 // can't send an error message for EXPORT_NAME,
2690 // so must do hard close
2691 goto handler_err;
2693 if(opt == NBD_OPT_ABORT) {
2694 // handled below
2695 break;
2697 consume_len(client);
2698 send_reply(client, opt, NBD_REP_ERR_TLS_REQD, -1, "TLS is required on this server");
2699 continue;
2701 switch(opt) {
2702 case NBD_OPT_EXPORT_NAME:
2703 // NBD_OPT_EXPORT_NAME must be the last
2704 // selected option, so return from here
2705 // if that is chosen.
2706 if(handle_export_name(client, opt, servers, cflags) != NULL) {
2707 return client;
2708 } else {
2709 goto handler_err;
2711 break;
2712 case NBD_OPT_LIST:
2713 handle_list(client, opt, servers, cflags);
2714 break;
2715 case NBD_OPT_ABORT:
2716 // handled below
2717 break;
2718 case NBD_OPT_STARTTLS:
2719 #if !HAVE_GNUTLS
2720 consume_len(client);
2721 send_reply(client, opt, NBD_REP_ERR_PLATFORM, -1, "This nbd-server was compiled without TLS support");
2722 #else
2723 if(client->tls_session != NULL) {
2724 consume_len(client);
2725 send_reply(client, opt, NBD_REP_ERR_INVALID, -1, "Invalid STARTTLS request: TLS has already been negotiated!");
2726 continue;
2728 if(genconf->keyfile == NULL) {
2729 consume_len(client);
2730 send_reply(client, opt, NBD_REP_ERR_POLICY, -1, "TLS not allowed on this server");
2731 continue;
2733 if(handle_starttls(client, opt, servers, cflags, genconf) == NULL) {
2734 // can't recover from failed TLS negotiation.
2735 goto handler_err;
2737 // once TLS has been negotiated, any state must be cleared
2738 client->clientflags = 0;
2739 #endif
2740 break;
2741 case NBD_OPT_GO:
2742 case NBD_OPT_INFO:
2743 if(handle_info(client, opt, servers, cflags) && opt == NBD_OPT_GO) {
2744 return client;
2746 break;
2747 case NBD_OPT_STRUCTURED_REPLY:
2748 handle_structured_reply(client, opt, servers, cflags);
2749 break;
2750 default:
2751 consume_len(client);
2752 send_reply(client, opt, NBD_REP_ERR_UNSUP, -1, "The given option is unknown to this server implementation");
2753 break;
2755 } while((opt != NBD_OPT_EXPORT_NAME) && (opt != NBD_OPT_ABORT));
2756 if(opt == NBD_OPT_ABORT) {
2757 err_nonfatal("Session terminated by client");
2758 goto handler_err;
2760 err_nonfatal("Weird things happened: reached end of negotiation without success");
2761 handler_err:
2762 g_free(client);
2763 return NULL;
/**
 * Map a local errno value to the error number sent in NBD replies.
 *
 * @param errcode a value as found in errno
 * @return the matching NBD error number, already converted to network
 *         byte order; anything without a dedicated mapping is reported as
 *         22 (EINVAL)
 */
static int nbd_errno(int errcode) {
	int wire_code;

	if (errcode == EPERM) {
		wire_code = 1;
	} else if (errcode == EIO) {
		wire_code = 5;
	} else if (errcode == ENOMEM) {
		wire_code = 12;
	} else if (errcode == EFBIG || errcode == ENOSPC
#ifdef EDQUOT
		   || errcode == EDQUOT
#endif
		  ) {
		wire_code = 28; // ENOSPC
	} else {
		/* covers EINVAL itself as well as every unmapped value */
		wire_code = 22; // EINVAL
	}
	return htonl(wire_code);
}
2787 static void package_dispose(struct work_package* package) {
2788 if (package->pipefd[0] > 0)
2789 close(package->pipefd[0]);
2790 if (package->pipefd[1] > 0)
2791 close(package->pipefd[1]);
2792 g_free(package->data);
2793 g_free(package->req);
2794 g_free(package);
2797 static int mkpipe(int pipefd[2], size_t len)
2799 if (len > MAX_PIPE_SIZE)
2800 return -1;
2801 if (pipe(pipefd))
2802 return -1;
2804 #ifdef HAVE_SPLICE
2805 if (fcntl(pipefd[1], F_SETPIPE_SZ, MAX_PIPE_SIZE) < MAX_PIPE_SIZE) {
2806 close(pipefd[0]);
2807 close(pipefd[1]);
2808 pipefd[0] = -1;
2809 pipefd[1] = -1;
2810 return -1;
2812 #endif
2814 return 0;
2817 struct work_package* package_create(CLIENT* client, struct nbd_request* req) {
2818 struct work_package* rv = calloc(sizeof (struct work_package), 1);
2820 rv->req = req;
2821 rv->client = client;
2822 rv->data = NULL;
2823 rv->pipefd[0] = -1;
2824 rv->pipefd[1] = -1;
2826 if((req->type & NBD_CMD_MASK_COMMAND) == NBD_CMD_WRITE) {
2827 if (client->server->flags & F_SPLICE) {
2828 if (mkpipe(rv->pipefd, req->len))
2829 rv->data = malloc(req->len);
2830 } else {
2831 rv->data = malloc(req->len);
2835 return rv;
2838 #ifdef HAVE_SPLICE
2839 static int handle_splice_read(CLIENT *client, struct nbd_request *req)
2841 struct nbd_reply rep;
2842 int pipefd[2];
2844 // splice doesn't work with TLS
2845 if (client->tls_session != NULL)
2846 return -1;
2848 if (mkpipe(pipefd, req->len))
2849 return -1;
2851 if (expsplice(pipefd[1], req->from, req->len, client, SPLICE_IN, 0)) {
2852 close(pipefd[1]);
2853 close(pipefd[0]);
2854 return -1;
2857 DEBUG("handling read request (splice)\n");
2858 setup_reply(&rep, req);
2859 log_reply(client, &rep);
2860 pthread_mutex_lock(&(client->lock));
2861 writeit(client->net, &rep, sizeof(rep));
2862 spliceit(pipefd[0], NULL, client->net, NULL, req->len);
2863 pthread_mutex_unlock(&(client->lock));
2864 close(pipefd[0]);
2865 close(pipefd[1]);
2866 return 0;
2868 #endif
2870 static void handle_normal_read(CLIENT *client, struct nbd_request *req)
2872 DEBUG("handling read request\n");
2873 char read_failed[] = "Read failed";
2874 _cleanup_g_free_ READ_CTX *ctx = g_new0(READ_CTX, 1);
2875 ctx->req = req;
2876 ctx->current_len = req->len;
2877 uint32_t error = 0;
2878 char *errmsg = NULL;
2879 uint16_t msglen = 0;
2880 if(client->clientflags & F_STRUCTURED) {
2881 ctx->is_structured = 1;
2882 } else {
2883 ctx->is_structured = 0;
2885 if(req->type & NBD_CMD_FLAG_DF != 0) {
2886 ctx->df = 1;
2888 if(ctx->is_structured && ctx->df && req->len > (1 << 20)) {
2889 /* standard requires a minimum of 64KiB; we are more generous
2890 * by allowing up to 1MiB as our largest unfragmented answer */
2891 const char too_long[] = "Request too long for unfragmented reply";
2892 struct nbd_structured_error_payload pl;
2893 pl.error = NBD_EOVERFLOW;
2894 pl.msglen = sizeof too_long;
2895 send_structured_chunk_v(client, req, NBD_REPLY_FLAG_DONE, NBD_REPLY_TYPE_ERROR, 6 + pl.msglen, 2, &pl, sizeof pl, too_long, sizeof too_long);
2896 return;
2898 if(ctx->df || !(ctx->is_structured)) {
2899 ctx->buf = malloc(req->len);
2900 if(!(ctx->buf)) {
2901 err("Could not allocate memory for request");
2903 ctx->buflen = req->len;
2905 if(expread(ctx, client)) {
2906 DEBUG("Read failed: %m");
2907 error = nbd_errno(errno);
2908 errmsg = read_failed;
2909 msglen = sizeof read_failed;
2911 complete_read(client, ctx, error, errmsg, msglen, false, 0);
2914 static void handle_read(CLIENT* client, struct nbd_request* req)
2916 #ifdef HAVE_SPLICE
2918 * If we have splice set we want to try that first, and if that fails
2919 * for whatever reason we fall through to ye olde read.
2921 if (client->server->flags & F_SPLICE)
2922 if (!handle_splice_read(client, req))
2923 return;
2924 #endif
2925 handle_normal_read(client, req);
2928 static void handle_write(struct work_package *pkg)
2930 CLIENT *client = pkg->client;
2931 struct nbd_request *req = pkg->req;
2932 struct nbd_reply rep;
2933 int fua = !!(req->type & NBD_CMD_FLAG_FUA);
2935 DEBUG("handling write request\n");
2936 setup_reply(&rep, req);
2938 #ifdef HAVE_SPLICE
2939 if (!pkg->data) {
2940 if (expsplice(pkg->pipefd[0], req->from, req->len, client,
2941 SPLICE_OUT, fua)) {
2942 DEBUG("Splice failed: %m");
2943 rep.error = nbd_errno(errno);
2945 } else
2946 #endif
2948 if(expwrite(req->from, pkg->data, req->len, client, fua)) {
2949 DEBUG("Write failed: %m");
2950 rep.error = nbd_errno(errno);
2953 log_reply(client, &rep);
2954 pthread_mutex_lock(&(client->lock));
2955 socket_write(client, &rep, sizeof rep);
2956 pthread_mutex_unlock(&(client->lock));
2959 static void handle_flush(CLIENT* client, struct nbd_request* req) {
2960 struct nbd_reply rep;
2961 DEBUG("handling flush request\n");
2962 setup_reply(&rep, req);
2963 if(expflush(client)) {
2964 DEBUG("Flush failed: %m");
2965 rep.error = nbd_errno(errno);
2967 log_reply(client, &rep);
2968 pthread_mutex_lock(&(client->lock));
2969 socket_write(client, &rep, sizeof rep);
2970 pthread_mutex_unlock(&(client->lock));
2973 static void handle_trim(CLIENT* client, struct nbd_request* req) {
2974 struct nbd_reply rep;
2975 DEBUG("handling trim request\n");
2976 setup_reply(&rep, req);
2977 if(exptrim(req, client)) {
2978 DEBUG("Trim failed: %m");
2979 rep.error = nbd_errno(errno);
2981 log_reply(client, &rep);
2982 pthread_mutex_lock(&(client->lock));
2983 socket_write(client, &rep, sizeof rep);
2984 pthread_mutex_unlock(&(client->lock));
2987 static void handle_write_zeroes(CLIENT* client, struct nbd_request* req) {
2988 struct nbd_reply rep;
2989 DEBUG("handling write_zeroes request\n");
2990 int fua = !!(req->type & NBD_CMD_FLAG_FUA);
2991 setup_reply(&rep, req);
2992 if(expwrite_zeroes(req, client, fua)) {
2993 DEBUG("Write_zeroes failed: %m");
2994 rep.error = nbd_errno(errno);
2996 // For now, don't trim
2997 // TODO: handle this far more efficiently with reference to the
2998 // actual backing driver
2999 log_reply(client, &rep);
3000 pthread_mutex_lock(&(client->lock));
3001 socket_write(client, &rep, sizeof rep);
3002 pthread_mutex_unlock(&(client->lock));
3006 static bool bad_write(CLIENT* client, struct nbd_request* req) {
3007 if ((client->server->flags & F_READONLY) ||
3008 (client->server->flags & F_AUTOREADONLY)) {
3009 DEBUG("[WRITE to READONLY!]");
3010 return true;
3012 return false;
3015 static bool bad_range(CLIENT* client, struct nbd_request* req) {
3016 if(req->from > client->exportsize ||
3017 req->from + req->len > client->exportsize) {
3018 DEBUG("[out of bounds!]");
3019 return true;
3021 return false;
3024 static void handle_request(gpointer data, gpointer user_data) {
3025 struct work_package* package = (struct work_package*) data;
3026 uint32_t type = package->req->type & NBD_CMD_MASK_COMMAND;
3027 uint32_t flags = package->req->type & ~NBD_CMD_MASK_COMMAND;
3028 struct nbd_reply rep;
3029 int err = EINVAL;
3031 if(flags & ~(NBD_CMD_FLAG_FUA | NBD_CMD_FLAG_NO_HOLE)) {
3032 msg(LOG_ERR, "E: received invalid flag %d on command %d, ignoring", flags, type);
3033 goto error;
3036 switch(type) {
3037 case NBD_CMD_READ:
3038 if (bad_range(package->client, package->req)) {
3039 goto error;
3041 handle_read(package->client, package->req);
3042 break;
3043 case NBD_CMD_WRITE:
3044 if (bad_write(package->client, package->req)) {
3045 err = EPERM;
3046 goto error;
3048 if (bad_range(package->client, package->req)) {
3049 err = ENOSPC;
3050 goto error;
3052 handle_write(package);
3053 break;
3054 case NBD_CMD_FLUSH:
3055 handle_flush(package->client, package->req);
3056 break;
3057 case NBD_CMD_TRIM:
3058 if (bad_write(package->client, package->req)) {
3059 err = EPERM;
3060 goto error;
3062 if (bad_range(package->client, package->req)) {
3063 goto error;
3065 handle_trim(package->client, package->req);
3066 break;
3067 case NBD_CMD_WRITE_ZEROES:
3068 if (bad_write(package->client, package->req)) {
3069 err = EPERM;
3070 goto error;
3072 if (bad_range(package->client, package->req)) {
3073 err = ENOSPC;
3074 goto error;
3076 handle_write_zeroes(package->client, package->req);
3077 break;
3078 default:
3079 msg(LOG_ERR, "E: received unknown command %d of type, ignoring", package->req->type);
3080 goto error;
3082 goto end;
3083 error:
3084 setup_reply(&rep, package->req);
3085 rep.error = nbd_errno(err);
3086 log_reply(package->client, &rep);
3087 pthread_mutex_lock(&(package->client->lock));
3088 socket_write(package->client, &rep, sizeof rep);
3089 pthread_mutex_unlock(&(package->client->lock));
3090 end:
3091 package_dispose(package);
3094 static int mainloop_threaded(CLIENT* client) {
3095 struct nbd_request* req;
3096 struct work_package* pkg;
3097 int write_data = false;
3099 DEBUG("Entering request loop\n");
3100 while(1) {
3101 req = calloc(sizeof (struct nbd_request), 1);
3103 socket_read(client, req, sizeof(struct nbd_request));
3105 if(client->transactionlogfd != -1) {
3106 lock_logsem(client);
3107 writeit(client->transactionlogfd, req, sizeof(struct nbd_request));
3108 if(((ntohl(req->type) & NBD_CMD_MASK_COMMAND) == NBD_CMD_WRITE) &&
3109 (client->server->flags & F_DATALOG) &&
3110 !(client->server->flags & F_SPLICE)) {
3111 write_data = true;
3112 } else {
3113 write_data = false;
3114 unlock_logsem(client);
3118 req->from = ntohll(req->from);
3119 req->type = ntohl(req->type);
3120 req->len = ntohl(req->len);
3122 if(req->magic != htonl(NBD_REQUEST_MAGIC))
3123 err("Protocol error: not enough magic.");
3125 pkg = package_create(client, req);
3127 if((req->type & NBD_CMD_MASK_COMMAND) == NBD_CMD_WRITE) {
3128 #ifdef HAVE_SPLICE
3129 if ((client->server->flags & F_SPLICE) &&
3130 (req->len <= MAX_PIPE_SIZE && pkg->pipefd[1] > 0) &&
3131 (client->tls_session == NULL))
3132 spliceit(client->net, NULL, pkg->pipefd[1],
3133 NULL, req->len);
3134 else
3135 #endif
3136 socket_read(client, pkg->data, req->len);
3138 if (write_data) {
3139 writeit(client->transactionlogfd, pkg->data, req->len);
3140 unlock_logsem(client);
3141 write_data = false;
3144 if(req->type == NBD_CMD_DISC) {
3145 finalize_client(client);
3146 package_dispose(pkg);
3147 return 0;
3149 g_thread_pool_push(tpool, pkg, NULL);
3154 * Destroy a pid_t*
3155 * @param data a pointer to pid_t which should be freed
3157 void destroy_pid_t(gpointer data) {
3158 g_free(data);
3161 static pid_t spawn_child(int* socket) {
3162 pid_t pid;
3163 sigset_t newset;
3164 sigset_t oldset;
3165 int sockets[2];
3167 sigemptyset(&newset);
3168 sigaddset(&newset, SIGCHLD);
3169 sigaddset(&newset, SIGTERM);
3170 sigprocmask(SIG_BLOCK, &newset, &oldset);
3171 socketpair(AF_UNIX, SOCK_STREAM, 0, sockets);
3172 pid = fork();
3173 if (pid < 0) {
3174 msg(LOG_ERR, "Could not fork (%s)", strerror(errno));
3175 close(sockets[0]);
3176 close(sockets[1]);
3177 goto out;
3179 if (pid > 0) { /* Parent */
3180 pid_t *pidp;
3182 pidp = g_malloc(sizeof(pid_t));
3183 *pidp = pid;
3184 *socket = sockets[1];
3185 close(sockets[0]);
3186 g_hash_table_insert(children, pidp, pidp);
3187 goto out;
3189 /* Child */
3190 *socket = sockets[0];
3191 close(sockets[1]);
3192 /* Child's signal disposition is reset to default. */
3193 signal(SIGCHLD, SIG_DFL);
3194 signal(SIGTERM, SIG_DFL);
3195 signal(SIGHUP, SIG_DFL);
3196 sigemptyset(&oldset);
3197 out:
3198 sigprocmask(SIG_SETMASK, &oldset, NULL);
3199 return pid;
/**
 * Accept one pending connection on a listening socket.
 *
 * @param sock the listening socket
 * @return the descriptor of the accepted connection, or a negative value
 * if accept() failed (the failure is reported through err_nonfatal())
 **/
static int
socket_accept(const int sock)
{
	struct sockaddr_storage peer;
	socklen_t peer_len = sizeof(peer);
	const int conn = accept(sock, (struct sockaddr *) &peer, &peer_len);

	if (conn >= 0) {
		return conn;
	}

	err_nonfatal("Failed to accept socket connection: %m");
	return conn;
}
3217 static void
3218 handle_modern_connection(GArray *const servers, const int sock, struct generic_conf *genconf)
3220 int net;
3221 pid_t pid;
3222 CLIENT *client = NULL;
3223 int sock_flags_old;
3224 int sock_flags_new;
3226 net = socket_accept(sock);
3227 if (net < 0)
3228 return;
3230 if (!dontfork) {
3231 pid = spawn_child(&commsocket);
3232 if (pid) {
3233 if (pid > 0) {
3234 msg(LOG_INFO, "Spawned a child process");
3235 g_array_append_val(childsocks, commsocket);
3237 if (pid < 0)
3238 msg(LOG_ERR, "Failed to spawn a child process");
3239 close(net);
3240 return;
3242 /* Child just continues. */
3244 tpool = g_thread_pool_new(handle_request, NULL, genconf->threads, FALSE, NULL);
3246 sock_flags_old = fcntl(net, F_GETFL, 0);
3247 if (sock_flags_old == -1) {
3248 msg(LOG_ERR, "Failed to get socket flags");
3249 goto handler_err;
3252 sock_flags_new = sock_flags_old & ~O_NONBLOCK;
3253 if (sock_flags_new != sock_flags_old &&
3254 fcntl(net, F_SETFL, sock_flags_new) == -1) {
3255 msg(LOG_ERR, "Failed to set socket to blocking mode");
3256 goto handler_err;
3259 client = negotiate(net, servers, genconf);
3260 if (!client) {
3261 msg(LOG_ERR, "Modern initial negotiation failed");
3262 goto handler_err;
3265 if (!dontfork) {
3266 int i;
3268 /* Free all root server resources here, because we are
3269 * currently in the child process serving one specific
3270 * connection. These are not simply needed anymore. */
3271 g_hash_table_destroy(children);
3272 children = NULL;
3273 for (i = 0; i < modernsocks->len; i++) {
3274 close(g_array_index(modernsocks, int, i));
3276 g_array_free(modernsocks, TRUE);
3278 /* Now that we are in the child process after a
3279 * succesful negotiation, we do not need the list of
3280 * servers anymore, get rid of it.*/
3281 g_array_free(servers, FALSE);
3284 msg(LOG_INFO, "Starting to serve");
3285 mainloop_threaded(client);
3286 exit(EXIT_SUCCESS);
3288 handler_err:
3289 close(net);
3290 g_free(client);
3292 if (!dontfork) {
3293 exit(EXIT_FAILURE);
3297 static int handle_childname(GArray* servers, int socket)
3299 uint32_t len;
3300 _cleanup_g_free_ char *buf = NULL;
3301 int i, r, rt = 0;
3303 while(rt < sizeof(len)) {
3304 switch((r = read(socket, &len, sizeof len))) {
3305 case 0:
3306 return -1;
3307 case -1:
3308 err_nonfatal("Error reading from acl socket: %m");
3309 return -1;
3310 default:
3311 rt += r;
3312 break;
3315 if (len >= ULONG_MAX - 1) {
3316 err_nonfatal("Value out of range");
3317 return -1;
3319 buf = g_malloc0(len + 1);
3320 readit(socket, buf, len);
3321 buf[len] = 0;
3322 for(i=0; i<servers->len; i++) {
3323 SERVER* srv = g_array_index(servers, SERVER*, i);
3324 if(strcmp(srv->servename, buf) == 0) {
3325 if(srv->max_connections == 0 || srv->max_connections > srv->numclients) {
3326 writeit(socket, "Y", 1);
3327 srv->numclients++;
3328 } else {
3329 writeit(socket, "N", 1);
3331 goto exit;
3334 writeit(socket, "X", 1);
3335 exit:
3336 return 0;
3340 * Return the index of the server whose servename matches the given
3341 * name.
3343 * @param servename a string to match
3344 * @param servers an array of servers
3345 * @return the first index of the server whose servename matches the
3346 * given name or -1 if one cannot be found
3348 static int get_index_by_servename(const gchar *const servename,
3349 const GArray *const servers) {
3350 int i;
3352 for (i = 0; i < servers->len; ++i) {
3353 const SERVER* server = g_array_index(servers, SERVER*, i);
3355 if (strcmp(servename, server->servename) == 0)
3356 return i;
3359 return -1;
3363 * Parse configuration files and add servers to the array if they don't
3364 * already exist there. The existence is tested by comparing
3365 * servenames. A server is appended to the array only if its servename
3366 * is unique among all other servers.
3368 * @param servers an array of servers
3369 * @param genconf a pointer to generic configuration
3370 * @return the number of new servers appended to the array, or -1 in
3371 * case of an error
3373 static int append_new_servers(GArray *const servers, struct generic_conf *genconf, GError **const gerror) {
3374 int i;
3375 GArray *new_servers;
3376 const int old_len = servers->len;
3377 int retval = -1;
3379 new_servers = parse_cfile(config_file_pos, genconf, true, gerror);
3380 if(tpool) g_thread_pool_set_max_threads(tpool, genconf->threads, NULL);
3381 if(!new_servers)
3382 goto out;
3384 for(i = 0; i < new_servers->len; ++i) {
3385 SERVER *new_server = g_array_index(new_servers, SERVER*, i);
3387 if (new_server->servename
3388 && -1 == get_index_by_servename(new_server->servename,
3389 servers)) {
3390 serve_inc_ref(new_server);
3391 g_array_append_val(servers, new_server);
3395 retval = servers->len - old_len;
3396 out:
3397 g_array_free(new_servers, TRUE);
3399 return retval;
3402 void serveloop(GArray* servers, struct generic_conf *genconf) G_GNUC_NORETURN;
3404 * Loop through the available servers, and serve them. Never returns.
3406 void serveloop(GArray* servers, struct generic_conf *genconf) {
3407 int i;
3408 int mmax, max;
3409 fd_set mset;
3410 fd_set rset;
3411 sigset_t blocking_mask;
3412 sigset_t original_mask;
3415 * Set up the master fd_set. The set of descriptors we need
3416 * to select() for never changes anyway and it buys us a *lot*
3417 * of time to only build this once. However, if we ever choose
3418 * to not fork() for clients anymore, we may have to revisit
3419 * this.
3421 mmax=0;
3422 FD_ZERO(&mset);
3423 for(i=0;i<modernsocks->len;i++) {
3424 int sock = g_array_index(modernsocks, int, i);
3425 FD_SET(sock, &mset);
3426 mmax=sock>mmax?sock:mmax;
3429 /* Construct a signal mask which is used to make signal testing and
3430 * receiving an atomic operation to ensure no signal is received between
3431 * tests and blocking pselect(). */
3432 if (sigemptyset(&blocking_mask) == -1)
3433 err("failed to initialize blocking_mask: %m");
3435 if (sigaddset(&blocking_mask, SIGCHLD) == -1)
3436 err("failed to add SIGCHLD to blocking_mask: %m");
3438 if (sigaddset(&blocking_mask, SIGHUP) == -1)
3439 err("failed to add SIGHUP to blocking_mask: %m");
3441 if (sigaddset(&blocking_mask, SIGTERM) == -1)
3442 err("failed to add SIGTERM to blocking_mask: %m");
3444 if (sigprocmask(SIG_BLOCK, &blocking_mask, &original_mask) == -1)
3445 err("failed to block signals: %m");
3447 for(;;) {
3448 if (is_sigterm_caught) {
3449 is_sigterm_caught = 0;
3451 g_hash_table_foreach(children, killchild, NULL);
3452 unlink(pidfname);
3454 exit(EXIT_SUCCESS);
3457 if (is_sigchld_caught) {
3458 int status;
3459 int* i;
3460 pid_t pid;
3462 is_sigchld_caught = 0;
3464 while ((pid=waitpid(-1, &status, WNOHANG)) > 0) {
3465 if (WIFEXITED(status)) {
3466 msg(LOG_INFO, "Child exited with %d", WEXITSTATUS(status));
3468 i = g_hash_table_lookup(children, &pid);
3469 if (!i) {
3470 msg(LOG_INFO, "SIGCHLD received for an unknown child with PID %ld", (long)pid);
3471 } else {
3472 DEBUG("Removing %d from the list of children", pid);
3473 g_hash_table_remove(children, &pid);
3478 /* SIGHUP causes the root server process to reconfigure
3479 * itself and add new export servers for each newly
3480 * found export configuration group, i.e. spawn new
3481 * server processes for each previously non-existent
3482 * export. This does not alter old runtime configuration
3483 * but just appends new exports. */
3484 if (is_sighup_caught) {
3485 int n;
3486 GError *gerror = NULL;
3488 msg(LOG_INFO, "reconfiguration request received");
3489 is_sighup_caught = 0; /* Reset to allow catching
3490 * it again. */
3492 n = append_new_servers(servers, genconf, &gerror);
3493 if (n == -1)
3494 msg(LOG_ERR, "failed to append new servers: %s",
3495 gerror->message);
3497 for (i = servers->len - n; i < servers->len; ++i) {
3498 const SERVER *server = g_array_index(servers,
3499 SERVER*, i);
3501 msg(LOG_INFO, "reconfigured new server: %s",
3502 server->servename);
3506 memcpy(&rset, &mset, sizeof(fd_set));
3507 max=mmax;
3508 for(i=0;i<childsocks->len;i++) {
3509 int sock = g_array_index(childsocks, int, i);
3510 FD_SET(sock, &rset);
3511 max=sock>max?sock:max;
3514 if (pselect(max + 1, &rset, NULL, NULL, NULL, &original_mask) > 0) {
3515 DEBUG("accept, ");
3516 for(i=0; i < modernsocks->len; i++) {
3517 int sock = g_array_index(modernsocks, int, i);
3518 if(!FD_ISSET(sock, &rset)) {
3519 continue;
3522 handle_modern_connection(servers, sock, genconf);
3524 for(i=0; i < childsocks->len; i++) {
3525 int sock = g_array_index(childsocks, int, i);
3527 if(FD_ISSET(sock, &rset)) {
3528 if(handle_childname(servers, sock) < 0) {
3529 close(sock);
3530 g_array_remove_index(childsocks, i);
3539 * Set server socket options.
3541 * @param socket a socket descriptor of the server
3543 * @param gerror a pointer to an error object pointer used for reporting
3544 * errors. On error, if gerror is not NULL, *gerror is set and -1
3545 * is returned.
3547 * @return 0 on success, -1 on error
3549 int dosockopts(const int socket, GError **const gerror) {
3550 #ifndef sun
3551 int yes=1;
3552 #else
3553 char yes='1';
3554 #endif /* sun */
3555 struct linger l;
3557 /* lose the pesky "Address already in use" error message */
3558 if (setsockopt(socket,SOL_SOCKET,SO_REUSEADDR,&yes,sizeof(int)) == -1) {
3559 g_set_error(gerror, NBDS_ERR, NBDS_ERR_SO_REUSEADDR,
3560 "failed to set socket option SO_REUSEADDR: %s",
3561 strerror(errno));
3562 return -1;
3564 l.l_onoff = 1;
3565 l.l_linger = 10;
3566 if (setsockopt(socket,SOL_SOCKET,SO_LINGER,&l,sizeof(l)) == -1) {
3567 g_set_error(gerror, NBDS_ERR, NBDS_ERR_SO_LINGER,
3568 "failed to set socket option SO_LINGER: %s",
3569 strerror(errno));
3570 return -1;
3572 if (setsockopt(socket,SOL_SOCKET,SO_KEEPALIVE,&yes,sizeof(int)) == -1) {
3573 g_set_error(gerror, NBDS_ERR, NBDS_ERR_SO_KEEPALIVE,
3574 "failed to set socket option SO_KEEPALIVE: %s",
3575 strerror(errno));
3576 return -1;
3579 return 0;
3582 int open_unix(const gchar *const sockname, GError **const gerror) {
3583 struct sockaddr_un sa;
3584 int sock=-1;
3585 int retval=-1;
3587 memset(&sa, 0, sizeof(struct sockaddr_un));
3588 sa.sun_family = AF_UNIX;
3589 strncpy(sa.sun_path, sockname, sizeof sa.sun_path);
3590 sa.sun_path[sizeof(sa.sun_path)-1] = '\0';
3591 sock = socket(AF_UNIX, SOCK_STREAM, 0);
3592 if(sock < 0) {
3593 g_set_error(gerror, NBDS_ERR, NBDS_ERR_SOCKET,
3594 "failed to open a unix socket: "
3595 "failed to create socket: %s",
3596 strerror(errno));
3597 goto out;
3599 if(bind(sock, (struct sockaddr*)&sa, sizeof(struct sockaddr_un))<0) {
3600 g_set_error(gerror, NBDS_ERR, NBDS_ERR_BIND,
3601 "failed to open a unix socket: "
3602 "failed to bind to address %s: %s",
3603 sockname, strerror(errno));
3604 goto out;
3606 if(listen(sock, 10)<0) {
3607 g_set_error(gerror, NBDS_ERR, NBDS_ERR_BIND,
3608 "failed to open a unix socket: "
3609 "failed to start listening: %s",
3610 strerror(errno));
3611 goto out;
3613 retval=0;
3614 g_array_append_val(modernsocks, sock);
3615 out:
3616 if(retval<0 && sock >= 0) {
3617 close(sock);
3620 return retval;
3623 int open_modern(const gchar *const addr, const gchar *const port,
3624 GError **const gerror) {
3625 struct addrinfo hints;
3626 struct addrinfo* ai = NULL;
3627 struct addrinfo* ai_bak = NULL;
3628 struct sock_flags;
3629 int e;
3630 int retval = -1;
3631 int sock = -1;
3632 _cleanup_(g_strfreevp) gchar** addrs;
3633 gchar const* l_addr = addr;
3635 if(!addr || strlen(addr) == 0) {
3636 l_addr = "::, 0.0.0.0";
3639 addrs = g_strsplit_set(l_addr, ", \t", -1);
3641 for(int i=0; addrs[i]!=NULL; i++) {
3642 if(addrs[i][0] == '\0') {
3643 continue;
3645 memset(&hints, '\0', sizeof(hints));
3646 hints.ai_flags = AI_PASSIVE | AI_ADDRCONFIG;
3647 hints.ai_socktype = SOCK_STREAM;
3648 hints.ai_family = AF_UNSPEC;
3649 hints.ai_protocol = IPPROTO_TCP;
3650 e = getaddrinfo(addrs[i], port ? port : NBD_DEFAULT_PORT, &hints, &ai);
3651 ai_bak = ai;
3652 if(e != 0 && addrs[i+1] == NULL && modernsocks->len == 0) {
3653 g_set_error(gerror, NBDS_ERR, NBDS_ERR_GAI,
3654 "failed to open a modern socket: "
3655 "failed to get address info: %s",
3656 gai_strerror(e));
3657 goto out;
3660 while(ai != NULL) {
3661 sock = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
3662 if(sock<0) {
3663 g_set_error(gerror, NBDS_ERR, NBDS_ERR_SOCKET,
3664 "failed to open a modern socket: "
3665 "failed to create a socket: %s",
3666 strerror(errno));
3667 goto out;
3670 if (dosockopts(sock, gerror) == -1) {
3671 g_prefix_error(gerror, "failed to open a modern socket: ");
3672 goto out;
3675 if(bind(sock, ai->ai_addr, ai->ai_addrlen)) {
3677 * Some systems will return multiple entries for the
3678 * same address when we ask it for something
3679 * AF_UNSPEC, even though the first entry will
3680 * listen to both protocols. Other systems will
3681 * return multiple entries too, but we actually
3682 * do need to open both.
3684 * Handle this by ignoring EADDRINUSE if we've
3685 * already got at least one socket open
3687 if(errno == EADDRINUSE && modernsocks->len > 0) {
3688 goto next;
3690 g_set_error(gerror, NBDS_ERR, NBDS_ERR_BIND,
3691 "failed to open a modern socket: "
3692 "failed to bind an address to a socket: %s",
3693 strerror(errno));
3694 goto out;
3697 if(listen(sock, 10) <0) {
3698 g_set_error(gerror, NBDS_ERR, NBDS_ERR_BIND,
3699 "failed to open a modern socket: "
3700 "failed to start listening on a socket: %s",
3701 strerror(errno));
3702 goto out;
3704 g_array_append_val(modernsocks, sock);
3705 next:
3706 ai = ai->ai_next;
3708 if(ai_bak) {
3709 freeaddrinfo(ai_bak);
3710 ai_bak=NULL;
3714 retval = 0;
3715 out:
3717 if (retval == -1 && sock >= 0) {
3718 close(sock);
3720 if(ai_bak)
3721 freeaddrinfo(ai_bak);
3723 return retval;
3727 * Connect our servers.
3729 void setup_servers(GArray *const servers, const gchar *const modernaddr,
3730 const gchar *const modernport, const gchar* unixsock,
3731 const gint flags ) {
3732 struct sigaction sa;
3734 if(unixsock != NULL) {
3735 GError* gerror = NULL;
3736 if(open_unix(unixsock, &gerror) == -1) {
3737 msg(LOG_ERR, "failed to setup servers: %s",
3738 gerror->message);
3739 g_clear_error(&gerror);
3740 exit(EXIT_FAILURE);
3743 if (((flags & F_DUAL_LISTEN) != 0) || (unixsock == NULL)) {
3744 GError *gerror = NULL;
3745 if (open_modern(modernaddr, modernport, &gerror) == -1) {
3746 msg(LOG_ERR, "failed to setup servers: %s",
3747 gerror->message);
3748 g_clear_error(&gerror);
3749 exit(EXIT_FAILURE);
3752 children=g_hash_table_new_full(g_int_hash, g_int_equal, NULL, destroy_pid_t);
3754 sa.sa_handler = sigchld_handler;
3755 sigemptyset(&sa.sa_mask);
3756 sigaddset(&sa.sa_mask, SIGTERM);
3757 sa.sa_flags = SA_RESTART;
3758 if(sigaction(SIGCHLD, &sa, NULL) == -1)
3759 err("sigaction: %m");
3761 sa.sa_handler = sigterm_handler;
3762 sigemptyset(&sa.sa_mask);
3763 sigaddset(&sa.sa_mask, SIGCHLD);
3764 sa.sa_flags = SA_RESTART;
3765 if(sigaction(SIGTERM, &sa, NULL) == -1)
3766 err("sigaction: %m");
3768 sa.sa_handler = sighup_handler;
3769 sigemptyset(&sa.sa_mask);
3770 sa.sa_flags = SA_RESTART;
3771 if(sigaction(SIGHUP, &sa, NULL) == -1)
3772 err("sigaction: %m");
3774 sa.sa_handler = sigusr1_handler;
3775 sigemptyset(&sa.sa_mask);
3776 sa.sa_flags = SA_RESTART;
3777 if(sigaction(SIGUSR1, &sa, NULL) == -1)
3778 err("sigaction: %m");
3782 * Go daemon (unless we specified at compile time that we didn't want this)
3783 * @param serve the first server of our configuration. If its port is zero,
3784 * then do not daemonize, because we're doing inetd then. This parameter
3785 * is only used to create a PID file of the form
3786 * /var/run/nbd-server.&lt;port&gt;.pid; it's not modified in any way.
3788 #if !defined(NODAEMON)
3789 void daemonize() {
3790 FILE*pidf;
3792 if(daemon(0,0)<0) {
3793 err("daemon");
3795 if(!*pidfname) {
3796 strncpy(pidfname, "/var/run/nbd-server.pid", 255);
3798 pidf=fopen(pidfname, "w");
3799 if(pidf) {
3800 fprintf(pidf,"%d\n", (int)getpid());
3801 fclose(pidf);
3802 } else {
3803 perror("fopen");
3804 fprintf(stderr, "Not fatal; continuing");
3807 #else
3808 #define daemonize(serve)
3809 #endif /* !defined(NODAEMON) */
3812 * Everything beyond this point (in the file) is run in non-daemon mode.
3813 * The stuff above daemonize() isn't.
3817 * Set up user-ID and/or group-ID
3819 void dousers(const gchar *const username, const gchar *const groupname) {
3820 struct passwd *pw;
3821 struct group *gr;
3822 gchar* str;
3823 if (groupname) {
3824 gr = getgrnam(groupname);
3825 if(!gr) {
3826 str = g_strdup_printf("Invalid group name: %s", groupname);
3827 err(str);
3829 if(setgid(gr->gr_gid)<0) {
3830 err("Could not set GID: %m");
3833 if (username) {
3834 pw = getpwnam(username);
3835 if(!pw) {
3836 str = g_strdup_printf("Invalid user name: %s", username);
3837 err(str);
3839 if (setgroups(0, NULL)<0) {
3840 err("Could not set groups: %m");
3842 if(setuid(pw->pw_uid)<0) {
3843 err("Could not set UID: %m");
3848 #ifndef ISSERVER
3849 void glib_message_syslog_redirect(const gchar *log_domain,
3850 GLogLevelFlags log_level,
3851 const gchar *message,
3852 gpointer user_data)
3854 int level=LOG_DEBUG;
3856 switch( log_level )
3858 case G_LOG_FLAG_FATAL:
3859 case G_LOG_LEVEL_CRITICAL:
3860 case G_LOG_LEVEL_ERROR:
3861 level=LOG_ERR;
3862 break;
3863 case G_LOG_LEVEL_WARNING:
3864 level=LOG_WARNING;
3865 break;
3866 case G_LOG_LEVEL_MESSAGE:
3867 case G_LOG_LEVEL_INFO:
3868 level=LOG_INFO;
3869 break;
3870 case G_LOG_LEVEL_DEBUG:
3871 level=LOG_DEBUG;
3872 break;
3873 default:
3874 level=LOG_ERR;
3876 syslog(level, "%s", message);
3878 #endif
3881 * Main entry point...
3883 int main(int argc, char *argv[]) {
3884 SERVER *serve;
3885 GArray *servers;
3886 GError *gerr=NULL;
3887 struct generic_conf genconf;
3889 memset(&genconf, 0, sizeof(struct generic_conf));
3891 if (sizeof( struct nbd_request )!=28) {
3892 fprintf(stderr,"Bad size of structure. Alignment problems?\n");
3893 exit(EXIT_FAILURE) ;
3896 modernsocks = g_array_new(FALSE, FALSE, sizeof(int));
3897 childsocks = g_array_new(FALSE, FALSE, sizeof(int));
3899 logging(MY_NAME);
3900 config_file_pos = g_strdup(CFILE);
3901 serve=cmdline(argc, argv, &genconf);
3903 genconf.threads = 4;
3904 servers = parse_cfile(config_file_pos, &genconf, true, &gerr);
3906 /* Update global variables with parsed values. This will be
3907 * removed once we get rid of global configuration variables. */
3908 glob_flags |= genconf.flags;
3910 if(serve) {
3911 g_array_append_val(servers, serve);
3914 if(!servers || !servers->len) {
3915 if(gerr && !(gerr->domain == NBDS_ERR
3916 && gerr->code == NBDS_ERR_CFILE_NOTFOUND)) {
3917 g_warning("Could not parse config file: %s", gerr->message);
3920 if(serve) {
3921 g_warning("Specifying an export on the command line no longer uses the oldstyle protocol.");
3924 if((!serve) && (!servers||!servers->len)) {
3925 if(gerr)
3926 g_message("No configured exports; quitting.");
3927 exit(EXIT_FAILURE);
3929 if (!nodaemon)
3930 daemonize();
3932 setup_servers(servers, genconf.modernaddr, genconf.modernport,
3933 genconf.unixsock, genconf.flags);
3934 dousers(genconf.user, genconf.group);
3936 #if HAVE_GNUTLS
3937 gnutls_global_init();
3938 static gnutls_dh_params_t dh_params;
3939 gnutls_dh_params_init(&dh_params);
3940 gnutls_dh_params_generate2(dh_params,
3941 gnutls_sec_param_to_pk_bits(GNUTLS_PK_DH,
3942 // Renamed in GnuTLS 3.3
3943 #if GNUTLS_VERSION_NUMBER >= 0x030300
3944 GNUTLS_SEC_PARAM_MEDIUM
3945 #else
3946 GNUTLS_SEC_PARAM_NORMAL
3947 #endif
3949 #endif
3951 if((genconf.modernport != NULL) && strcmp(genconf.modernport, "0")==0) {
3952 #ifndef ISSERVER
3953 err("inetd mode requires syslog");
3954 #endif
3955 CLIENT* client = negotiate(0, servers, &genconf);
3956 if(!client) {
3957 exit(EXIT_FAILURE);
3959 tpool = g_thread_pool_new(handle_request, NULL, genconf.threads, FALSE, NULL);
3960 mainloop_threaded(client);
3961 return 0;
3964 serveloop(servers, &genconf);