7712 mandoc -Tlint does always exit with error code 0
[unleashed.git] / usr / src / cmd / lvm / rpc.mdcommd / mdmn_commd_server.c
blob644c2ad2c7ab66af8129b0cdb20ecaeaadbd0781
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
26 #include <unistd.h>
27 #include <sys/types.h>
28 #include <sys/stat.h>
29 #include <sys/statvfs.h>
30 #include <sys/uadmin.h>
31 #include <sys/resource.h>
32 #include <fcntl.h>
33 #include <stdio.h>
34 #include <thread.h>
35 #include <meta.h>
36 #include <sdssc.h>
37 #include <mdmn_changelog.h>
38 #include "mdmn_subr.h"
41 * This is the communication daemon for SVM Multi Node Disksets.
42 * It runs on every node and provides the following rpc services:
43 * - mdmn_send_svc_2
44 * - mdmn_work_svc_2
45 * - mdmn_wakeup_initiator_svc_2
46 * - mdmn_wakeup_master_svc_2
47 * - mdmn_comm_lock_svc_2
48 * - mdmn_comm_unlock_svc_2
49 * - mdmn_comm_suspend_svc_2
50 * - mdmn_comm_resume_svc_2
51 * - mdmn_comm_reinit_set_svc_2
52 * where send, lock, unlock and reinit are meant for external use,
53 * work and the two wakeups are for internal use only.
55 * NOTE:
56 * On every node only one of those xxx_2 functions can be active at the
57 * same time because the daemon is single threaded.
59 * (not quite true, as mdmn_send_svc_2 and mdmn_work_svc_2 do thr_create()s
60 * as part of their handlers, so those aspects are multi-threaded)
62 * In case an event occurs that has to be propagated to all the nodes...
64 * One node (the initiator)
65 * calls the libmeta function mdmn_send_message()
66 * This function calls the local daemon thru mdmn_send_svc_2.
68 * On the initiator:
69 * mdmn_send_svc_2()
70 * - starts a thread -> mdmn_send_to_work() and returns.
71 * mdmn_send_to_work()
72 * - sends this message over to the master of the diskset.
73 * This is done by calling mdmn_work_svc_2 on the master.
74 * - registers to the initiator_table
75 * - exits without doing a svc_sendreply() for the call to
76 * mdmn_send_svc_2. This means that call is blocked until somebody
77 * (see end of this comment) does a svc_sendreply().
78 * This means mdmn_send_message() does not yet return.
79 * - A timeout surveillance is started at this point.
80 * This means in case the master doesn't reply at all in an
81 * appropriate time, an error condition is returned
82 * to the caller.
84 * On the master:
85 * mdmn_work_svc_2()
86 * - starts a thread -> mdmn_master_process_msg() and returns
87 * mdmn_master_process_msg()
88 * - logs the message to the change log
89 * - executes the message locally
90 * - flags the message in the change log
91 * - sends the message to mdmn_work_svc_2() on all the
92 * other nodes (slaves)
93 * after each call to mdmn_work_svc_2 the thread goes to sleep and
94 * will be woken up by mdmn_wakeup_master_svc_2() as soon as the
95 * slave node is done with this message.
96 * - In case the slave doesn't respond in an appropriate time, an error
97 * is assumed to ensure the master doesn't wait forever.
99 * On a slave:
100 * mdmn_work_svc_2()
101 * - starts a thread -> mdmn_slave_process_msg() and returns
102 * mdmn_slave_process_msg()
103 * - processes this message locally by calling the appropriate message
104 * handler, that creates some result.
105 * - sends that result thru a call to mdmn_wakeup_master_svc_2() to
106 * the master.
108 * Back on the master:
109 * mdmn_wakeup_master_svc_2()
110 * - stores the result into the master_table.
111 * - signals the mdmn_master_process_msg-thread.
112 * - returns
113 * mdmn_master_process_msg()
114 * - after getting the results from all nodes
115 * - sends them back to the initiating node thru a call to
116 * mdmn_wakeup_initiator_svc_2.
118 * Back on the initiator:
119 * mdmn_wakeup_initiator_svc_2()
120 * - calls svc_sendreply() which makes the call to mdmn_send_svc_2()
121 * return.
122 * which allows the initial mdmn_send_message() call to return.
125 FILE *commdout; /* debug output for the commd */
126 char *commdoutfile; /* file name for the above output */
127 /* want at least 10 MB free space when logging into a file */
128 #define MIN_FS_SPACE (10LL * 1024 * 1024)
131 * Number of outstanding messages that were initiated by this node.
132 * If zero, check_timeouts goes to sleep
134 uint_t messages_on_their_way;
135 mutex_t check_timeout_mutex; /* need mutex to protect above */
136 cond_t check_timeout_cv; /* trigger for check_timeouts */
138 /* for printing out time stamps */
139 hrtime_t __savetime;
141 /* RPC clients for every set and every node and their protecting locks */
142 CLIENT *client[MD_MAXSETS][NNODES];
143 rwlock_t client_rwlock[MD_MAXSETS];
145 /* the descriptors of all possible sets and their protectors */
146 struct md_set_desc *set_descriptor[MD_MAXSETS];
147 rwlock_t set_desc_rwlock[MD_MAXSETS];
149 /* the daemon to daemon communication has to timeout quickly */
150 static struct timeval FOUR_SECS = { 4, 0 };
152 /* These indicate if a set has already been setup */
153 int md_mn_set_inited[MD_MAXSETS];
155 /* For every set we have a message completion table and protecting mutexes */
156 md_mn_mct_t *mct[MD_MAXSETS];
157 mutex_t mct_mutex[MD_MAXSETS][MD_MN_NCLASSES];
159 /* Stuff to describe the global status of the commd on one node */
160 #define MD_CGS_INITED 0x0001
161 #define MD_CGS_ABORTED 0x0002 /* return everything with MDMNE_ABORT */
162 uint_t md_commd_global_state = 0; /* No state when starting up */
165 * Global verbosity level for the daemon
167 uint_t md_commd_global_verb;
170 * libmeta doesn't like multiple threads in metaget_setdesc().
171 * So we must protect access to it with a global lock
173 mutex_t get_setdesc_mutex;
176 * Need a way to block single message types,
177 * hence an array with a status for every message type
179 uint_t msgtype_lock_state[MD_MN_NMESSAGES];
181 /* for reading in the config file */
182 #define MAX_LINE_SIZE 1024
184 extern char *commd_get_outfile(void);
185 extern uint_t commd_get_verbosity(void);
188 * mdmn_clnt_create is a helper function for meta_client_create_retry. It
189 * merely needs to call clnt_create_timed, and meta_client_create_retry
190 * will take care of the rest.
192 /* ARGSUSED */
193 static CLIENT *
194 mdmn_clnt_create(char *ignore, void *data, struct timeval *time_out)
196 md_mnnode_desc *node = (md_mnnode_desc *)data;
198 return (clnt_create_timed(node->nd_priv_ic, MDMN_COMMD, TWO, "tcp",
199 time_out));
/*
 * Push all buffered debug output out to the file: flush the stdio buffer of
 * commdout and fsync its file descriptor. Does nothing if commdout is NULL.
 *
 * The body is wrapped in do { } while (0) so the macro expands to exactly one
 * statement and can safely be used as the body of an if that has an else.
 */
#define	FLUSH_DEBUGFILE()					\
	do {							\
		if (commdout != (FILE *)NULL) {			\
			(void) fflush(commdout);		\
			(void) fsync(fileno(commdout));		\
		}						\
	} while (0)
208 static void
209 panic_system(int nid, md_mn_msgtype_t type, int master_err, int master_exitval,
210 md_mn_result_t *slave_result)
212 md_mn_commd_err_t commd_err;
213 md_error_t mne = mdnullerror;
214 char *msg_buf;
216 msg_buf = (char *)calloc(MAXPATHLEN + 1, sizeof (char));
218 FLUSH_DEBUGFILE();
220 if (master_err != MDMNE_ACK) {
221 (void) snprintf(msg_buf, MAXPATHLEN, "rpc.mdcommd: RPC "
222 "fail on master when processing message type %d\n", type);
223 } else if (slave_result == NULL) {
224 (void) snprintf(msg_buf, MAXPATHLEN, "rpc.mdcommd: RPC fail "
225 "on node %d when processing message type %d\n", nid, type);
226 } else {
227 (void) snprintf(msg_buf, MAXPATHLEN, "rpc.mdcommd: "
228 "Inconsistent return value from node %d when processing "
229 "message type %d. Master exitval = %d, "
230 "Slave exitval = %d\n", nid, type, master_exitval,
231 slave_result->mmr_exitval);
233 commd_err.size = strlen(msg_buf);
234 commd_err.md_message = (uint64_t)(uintptr_t)&msg_buf[0];
236 (void) metaioctl(MD_MN_COMMD_ERR, &commd_err, &mne, "rpc.mdcommd");
237 (void) uadmin(A_DUMP, AD_BOOT, NULL);
240 static void
241 flush_fcout()
243 struct statvfs64 vfsbuf;
244 long long avail_bytes;
245 int warned = 0;
247 for (; ; ) {
248 (void) sleep(10);
249 /* No output file, nothing to do */
250 if (commdout == (FILE *)NULL)
251 continue;
254 * stat the appropriate filesystem to check for available space.
256 if (statvfs64(commdoutfile, &vfsbuf)) {
257 continue;
260 avail_bytes = vfsbuf.f_frsize * vfsbuf.f_bavail;
262 * If we don't have enough space, we print out a warning.
263 * And we drop the verbosity level to NULL
264 * In case the condtion doesn't go away, we don't repeat
265 * the warning.
267 if (avail_bytes < MIN_FS_SPACE) {
268 if (warned) {
269 continue;
271 commd_debug(MD_MMV_SYSLOG,
272 "NOT enough space available for logging\n");
273 commd_debug(MD_MMV_SYSLOG,
274 "Have %lld bytes, need %lld bytes\n",
275 avail_bytes, MIN_FS_SPACE);
276 warned = 1;
277 md_commd_global_verb = MD_MMV_NULL;
278 } else {
279 warned = 0;
282 (void) fflush(commdout);
/*
 * Safer version of clnt_destroy: if clnt is NULL nothing is done.
 *
 * Expands to a single statement (do { } while (0)), so it can be used as the
 * body of an if that is followed by an else. The argument is evaluated more
 * than once, so it must not have side effects.
 */
#define	mdmn_clnt_destroy(clnt)				\
	do {						\
		if ((clnt) != NULL)			\
			clnt_destroy(clnt);		\
	} while (0)
293 * Own version of svc_sendreply that checks the integrity of the transport
294 * handle and so prevents us from core dumps in the real svc_sendreply()
296 void
297 mdmn_svc_sendreply(SVCXPRT *transp, xdrproc_t xdr, caddr_t data)
299 if (SVC_STAT(transp) == XPRT_DIED) {
300 commd_debug(MD_MMV_MISC,
301 "mdmn_svc_sendreply: XPRT_DIED\n");
302 return;
304 (void) svc_sendreply(transp, xdr, data);
308 * timeout_initiator(set, class)
310 * Alas, I sent a message and didn't get a response back in aproppriate time.
312 * timeout_initiator() takes care for doing the needed svc_sendreply() to the
313 * calling mdmn_send_message, so that guy doesn't wait forever
314 * What is done here is pretty much the same as what is done in
315 * wakeup initiator. The difference is that we cannot provide for any results,
316 * of course and we set the comm_state to MDMNE_TIMEOUT.
318 * By doing so, mdmn_send_message can decide if a retry would make sense or not.
319 * It's not our's to decide that here.
321 void
322 timeout_initiator(set_t setno, md_mn_msgclass_t class)
324 SVCXPRT *transp;
325 md_mn_msgid_t mid;
326 md_mn_result_t *resultp;
328 resultp = Zalloc(sizeof (md_mn_result_t));
329 resultp->mmr_comm_state = MDMNE_TIMEOUT;
331 commd_debug(MD_MMV_MISC,
332 "timeout_initiator set = %d, class = %d\n", setno, class);
334 transp = mdmn_get_initiator_table_transp(setno, class);
335 mdmn_get_initiator_table_id(setno, class, &mid);
337 commd_debug(MD_MMV_MISC, "timeout_ini: (%d, 0x%llx-%d)\n",
338 MSGID_ELEMS(mid));
340 * Give the result the corresponding msgid from the failed message.
342 MSGID_COPY(&mid, &(resultp->mmr_msgid));
344 /* return to mdmn_send_message() and let it deal with the situation */
345 mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp);
347 free(resultp);
348 commd_debug(MD_MMV_MISC, "timeout_ini: sendreplied\n");
349 svc_done(transp);
350 mdmn_unregister_initiator_table(setno, class);
355 * check_timeouts - thread
357 * This implements a timeout surveillance for messages sent from the
358 * initiator to the master.
360 * If a message is started, this thread is triggered thru
361 * cond_signal(&check_timeout_cv) and we keep track of the numbers of
362 * messages that are outstanding (messages_on_their_way).
364 * As long as there are messages on their way, this thread never goes to sleep.
365 * It'll keep checking all class/set combinations for outstanding messages.
366 * If one is found, it's checked if this message is overdue. In that case,
367 * timeout_initiator() is called to wakeup the calling mdmn_send_message and
368 * to clean up the mess.
370 * If the result from the master arrives later, this message is considered
371 * to be unsolicited. And will be ignored.
374 void
375 check_timeouts()
377 set_t setno;
378 time_t now, then;
379 mutex_t *mx;
380 md_mn_msgclass_t class;
382 for (; ; ) {
383 now = time((time_t *)NULL);
384 for (setno = 1; setno < MD_MAXSETS; setno++) {
385 if (md_mn_set_inited[setno] != MDMN_SET_READY) {
386 continue;
388 for (class = MD_MSG_CLASS1; class < MD_MN_NCLASSES;
389 class++) {
390 mx = mdmn_get_initiator_table_mx(setno, class);
391 (void) mutex_lock(mx);
393 /* then is the registered time */
394 then =
395 mdmn_get_initiator_table_time(setno, class);
396 if ((then != 0) && (now > then)) {
397 timeout_initiator(setno, class);
399 (void) mutex_unlock(mx);
402 /* it's ok to check only once per second */
403 (void) sleep(1);
405 /* is there work to do? */
406 (void) mutex_lock(&check_timeout_mutex);
407 if (messages_on_their_way == 0) {
408 (void) cond_wait(&check_timeout_cv,
409 &check_timeout_mutex);
411 (void) mutex_unlock(&check_timeout_mutex);
415 void
416 setup_debug(void)
418 char *tmp_dir;
420 /* Read in the debug-controlling tokens from runtime.cf */
421 md_commd_global_verb = commd_get_verbosity();
423 * If the user didn't specify a verbosity level in runtime.cf
424 * we can safely return here. As we don't intend to printout
425 * debug messages, we don't need to check for the output file.
427 if (md_commd_global_verb == 0) {
428 return;
431 /* if commdout is non-NULL it is an open FILE, we'd better close it */
432 if (commdout != (FILE *)NULL) {
433 (void) fclose(commdout);
436 commdoutfile = commd_get_outfile();
438 /* setup the debug output */
439 if (commdoutfile == (char *)NULL) {
440 /* if no valid file was specified, use the default */
441 commdoutfile = "/var/run/commd.out";
442 commdout = fopen(commdoutfile, "a");
443 } else {
444 /* check if the directory exists and is writable */
445 tmp_dir = strdup(commdoutfile);
446 if ((access(dirname(tmp_dir), X_OK|W_OK)) ||
447 ((commdout = fopen(commdoutfile, "a")) == (FILE *)NULL)) {
448 syslog(LOG_ERR,
449 "Can't write to specified output file %s,\n"
450 "using /var/run/commd.out instead\n", commdoutfile);
451 free(commdoutfile);
452 commdoutfile = "/var/run/commd.out";
453 commdout = fopen(commdoutfile, "a");
455 free(tmp_dir);
458 if (commdout == (FILE *)NULL) {
459 syslog(LOG_ERR, "Can't write to debug output file %s\n",
460 commdoutfile);
465 * mdmn_is_node_dead checks to see if a node is dead using
466 * the SunCluster infrastructure which is a stable interface.
467 * If unable to contact SunCuster the node is assumed to be alive.
468 * Return values:
469 * 1 - node is dead
470 * 0 - node is alive
473 mdmn_is_node_dead(md_mnnode_desc *node)
475 char *fmt = "/usr/cluster/bin/scha_cluster_get -O NODESTATE_NODE ";
476 char *cmd;
477 size_t size;
478 char buf[10];
479 FILE *ptr;
480 int retval = 0;
482 /* I know that I'm alive */
483 if (strcmp(node->nd_nodename, mynode()) == 0)
484 return (retval);
486 size = strlen(fmt) + strlen(node->nd_nodename) + 1;
487 cmd = Zalloc(size);
488 (void) strlcat(cmd, fmt, size);
489 (void) strlcat(cmd, node->nd_nodename, size);
491 if ((ptr = popen(cmd, "r")) != NULL) {
492 if (fgets(buf, sizeof (buf), ptr) != NULL) {
493 /* If scha_cluster_get returned DOWN - return dead */
494 if (strncmp(buf, "DOWN", 4) == 0)
495 retval = 1;
497 (void) pclose(ptr);
499 Free(cmd);
500 return (retval);
504 * global_init()
506 * Perform some global initializations.
508 * the following routines have to call this before operation can start:
509 * - mdmn_send_svc_2
510 * - mdmn_work_svc_2
511 * - mdmn_comm_lock_svc_2
512 * - mdmn_comm_unlock_svc_2
513 * - mdmn_comm_suspend_svc_2
514 * - mdmn_comm_resume_svc_2
515 * - mdmn_comm_reinit_set_svc_2
517 * This is a single threaded daemon, so it can only be in one of the above
518 * routines at the same time.
519 * This means, global_init() cannot be called more than once at the same time.
520 * Hence, no lock is needed.
522 void
523 global_init(void)
525 set_t set;
526 md_mn_msgclass_t class;
527 struct sigaction sighandler;
528 time_t clock_val;
529 struct rlimit commd_limit;
533 /* Do these global initializations only once */
534 if (md_commd_global_state & MD_CGS_INITED) {
535 return;
537 (void) sdssc_bind_library();
539 /* setup the debug options from the config file */
540 setup_debug();
542 /* make sure that we don't run out of file descriptors */
543 commd_limit.rlim_cur = commd_limit.rlim_max = RLIM_INFINITY;
544 if (setrlimit(RLIMIT_NOFILE, &commd_limit) != 0) {
545 syslog(LOG_WARNING, gettext("setrlimit failed."
546 "Could not increase the max file descriptors"));
549 /* Make setup_debug() be the action in case of SIGHUP */
550 sighandler.sa_flags = 0;
551 (void) sigfillset(&sighandler.sa_mask);
552 sighandler.sa_handler = (void (*)(int)) setup_debug;
553 (void) sigaction(SIGHUP, &sighandler, NULL);
555 __savetime = gethrtime();
556 (void) time(&clock_val);
557 commd_debug(MD_MMV_MISC, "global init called %s\n", ctime(&clock_val));
559 /* start a thread that flushes out the debug on a regular basis */
560 (void) thr_create(NULL, 0, (void *(*)(void *))flush_fcout,
561 (void *) NULL, THR_DETACHED, NULL);
563 /* global rwlock's / mutex's / cond_t's go here */
564 (void) mutex_init(&check_timeout_mutex, USYNC_THREAD, NULL);
565 (void) cond_init(&check_timeout_cv, USYNC_THREAD, NULL);
566 (void) mutex_init(&get_setdesc_mutex, USYNC_THREAD, NULL);
568 /* Make sure the initiator table is initialized correctly */
569 for (set = 0; set < MD_MAXSETS; set++) {
570 for (class = 0; class < MD_MN_NCLASSES; class++) {
571 mdmn_unregister_initiator_table(set, class);
576 /* setup the check for timeouts */
577 (void) thr_create(NULL, 0, (void *(*)(void *))check_timeouts,
578 (void *) NULL, THR_DETACHED, NULL);
580 md_commd_global_state |= MD_CGS_INITED;
585 * mdmn_init_client(setno, nodeid)
586 * called if client[setno][nodeid] is NULL
588 * NOTE: Must be called with set_desc_rwlock held as a reader
589 * NOTE: Must be called with client_rwlock held as a writer
591 * If the rpc client for this node has not been setup for any set, we do it now.
593 * Returns 0 on success (node found in set, rpc client setup)
594 * -1 if metaget_setdesc failed,
595 * -2 if node not part of set
596 * -3 if clnt_create fails
598 static int
599 mdmn_init_client(set_t setno, md_mn_nodeid_t nid)
601 md_error_t ep = mdnullerror;
602 md_mnnode_desc *node;
603 md_set_desc *sd; /* just an abbr for set_descriptor[setno] */
605 sd = set_descriptor[setno];
608 * Is the appropriate set_descriptor already initialized ?
609 * Can't think of a scenario where this is not the case, but we'd better
610 * check for it anyway.
612 if (sd == NULL) {
613 mdsetname_t *sp;
615 /* readlock -> writelock */
616 (void) rw_unlock(&set_desc_rwlock[setno]);
617 (void) rw_wrlock(&set_desc_rwlock[setno]);
618 sp = metasetnosetname(setno, &ep);
619 /* Only one thread is supposed to be in metaget_setdesc() */
620 (void) mutex_lock(&get_setdesc_mutex);
621 sd = metaget_setdesc(sp, &ep);
622 (void) mutex_unlock(&get_setdesc_mutex);
623 if (sd == NULL) {
624 /* back to ... */
625 (void) rw_unlock(&set_desc_rwlock[setno]);
626 /* ... readlock */
627 (void) rw_rdlock(&set_desc_rwlock[setno]);
628 return (-1);
630 set_descriptor[setno] = sd;
631 /* back to readlock */
632 (void) rw_unlock(&set_desc_rwlock[setno]);
633 (void) rw_rdlock(&set_desc_rwlock[setno]);
636 /* first we have to find the node name for this node id */
637 for (node = sd->sd_nodelist; node; node = node->nd_next) {
638 if (node->nd_nodeid == nid)
639 break; /* we found our node in this set */
643 if (node == (md_mnnode_desc *)NULL) {
644 commd_debug(MD_MMV_SYSLOG,
645 "FATAL: node %d not found in set %d\n", nid, setno);
646 (void) rw_unlock(&set_desc_rwlock[setno]);
647 return (-2);
650 commd_debug(MD_MMV_INIT, "init: %s has the flags: 0x%x\n",
651 node->nd_nodename ? node->nd_nodename : "NULL", node->nd_flags);
653 /* Did this node join the diskset? */
654 if ((node->nd_flags & MD_MN_NODE_OWN) == 0) {
655 commd_debug(MD_MMV_INIT, "init: %s didn't join set %d\n",
656 node->nd_nodename ? node->nd_nodename : "NULL", setno);
657 (void) rw_unlock(&set_desc_rwlock[setno]);
658 return (-2);
661 /* if clnt_create has not been done for that node, do it now */
662 if (client[setno][nid] == (CLIENT *) NULL) {
663 time_t tout = 0;
666 * While trying to create a connection to a node,
667 * periodically check to see if the node has been marked
668 * dead by the SunCluster infrastructure.
669 * This periodic check is needed since a non-responsive
670 * rpc.mdcommd (while it is attempting to create a connection
671 * to a dead node) can lead to large delays and/or failures
672 * in the reconfig steps.
674 while ((client[setno][nid] == (CLIENT *) NULL) &&
675 (tout < MD_CLNT_CREATE_TOUT)) {
676 client[setno][nid] = meta_client_create_retry(
677 node->nd_nodename, mdmn_clnt_create,
678 (void *) node, MD_CLNT_CREATE_SUBTIMEOUT, &ep);
679 /* Is the node dead? */
680 if (mdmn_is_node_dead(node) == 1) {
681 commd_debug(MD_MMV_SYSLOG,
682 "rpc.mdcommd: no client for dead node %s\n",
683 node->nd_nodename);
684 break;
685 } else
686 tout += MD_CLNT_CREATE_SUBTIMEOUT;
689 if (client[setno][nid] == (CLIENT *) NULL) {
690 clnt_pcreateerror(node->nd_nodename);
691 (void) rw_unlock(&set_desc_rwlock[setno]);
692 return (-3);
694 /* this node has the license to send */
695 commd_debug(MD_MMV_MISC, "init_client: calling add_lic\n");
696 add_license(node);
698 /* set the timeout value */
699 clnt_control(client[setno][nid], CLSET_TIMEOUT,
700 (char *)&FOUR_SECS);
703 (void) rw_unlock(&set_desc_rwlock[setno]);
704 return (0);
708 * check_client(setno, nodeid)
710 * must be called with reader lock held for set_desc_rwlock[setno]
711 * and must be called with reader lock held for client_rwlock[setno]
712 * Checks if the client for this set/node combination is already setup
713 * if not it upgrades the lock to a writer lock
714 * and tries to initialize the client.
715 * Finally it's checked if the client nulled out again due to some race
717 * returns 0 if there is a usable client
718 * returns MDMNE_RPC_FAIL otherwise
720 static int
721 check_client(set_t setno, md_mn_nodeid_t nodeid)
723 int ret = 0;
725 while ((client[setno][nodeid] == (CLIENT *)NULL) && (ret == 0)) {
726 /* upgrade reader ... */
727 (void) rw_unlock(&client_rwlock[setno]);
728 /* ... to writer lock. */
729 (void) rw_wrlock(&client_rwlock[setno]);
730 if (mdmn_init_client(setno, nodeid) != 0) {
731 ret = MDMNE_RPC_FAIL;
733 /* downgrade writer ... */
734 (void) rw_unlock(&client_rwlock[setno]);
735 /* ... back to reader lock. */
736 (void) rw_rdlock(&client_rwlock[setno]);
738 return (ret);
742 * mdmn_init_set(setno, todo)
743 * setno is the number of the set to be initialized.
744 * todo is one of the MDMN_SET_* thingies or MDMN_SET_READY
745 * If called with MDMN_SET_READY everything is initialized.
747 * If the set mutexes are already initialized, the caller has to hold
748 * both set_desc_rwlock[setno] and client_rwlock[setno] as a writer, before
749 * calling mdmn_init_set()
752 mdmn_init_set(set_t setno, int todo)
754 int class;
755 md_mnnode_desc *node;
756 md_set_desc *sd; /* just an abbr for set_descriptor[setno] */
757 mdsetname_t *sp;
758 md_error_t ep = mdnullerror;
759 md_mn_nodeid_t nid;
762 * Check if we are told to setup the mutexes and
763 * if these are not yet setup
765 if ((todo & MDMN_SET_MUTEXES) &&
766 ((md_mn_set_inited[setno] & MDMN_SET_MUTEXES) == 0)) {
767 (void) mutex_init(&mdmn_busy_mutex[setno], USYNC_THREAD, NULL);
768 (void) cond_init(&mdmn_busy_cv[setno], USYNC_THREAD, NULL);
769 (void) rwlock_init(&client_rwlock[setno], USYNC_THREAD, NULL);
770 (void) rwlock_init(&set_desc_rwlock[setno], USYNC_THREAD, NULL);
772 for (class = MD_MSG_CLASS1; class < MD_MN_NCLASSES; class++) {
773 (void) mutex_init(mdmn_get_master_table_mx(setno,
774 class), USYNC_THREAD, NULL);
775 (void) cond_init(mdmn_get_master_table_cv(setno, class),
776 USYNC_THREAD, NULL);
777 (void) mutex_init(mdmn_get_initiator_table_mx(setno,
778 class), USYNC_THREAD, NULL);
780 md_mn_set_inited[setno] |= MDMN_SET_MUTEXES;
782 if ((todo & MDMN_SET_MCT) &&
783 ((md_mn_set_inited[setno] & MDMN_SET_MCT) == 0)) {
784 int fd;
785 size_t filesize;
786 caddr_t addr;
787 char table_name[32];
788 struct flock fl;
790 filesize = (sizeof (md_mn_mct_t));
791 (void) snprintf(table_name, sizeof (table_name), "%s%d",
792 MD_MN_MSG_COMP_TABLE, setno);
794 * If the mct file exists we map it into memory.
795 * Otherwise we create an empty file of appropriate
796 * size and map that into memory.
797 * The mapped areas are stored in mct[setno].
799 fd = open(table_name, O_RDWR|O_CREAT|O_DSYNC, 0600);
800 if (fd < 0) {
801 commd_debug(MD_MMV_MISC,
802 "init_set: Can't open MCT\n");
803 return (-1);
806 * Ensure that we are the only process that has this file
807 * mapped. If another instance of rpc.mdcommd has beaten us
808 * then we display the failing process and attempt to terminate
809 * it. The next call of this routine should establish us as
810 * the only rpc.mdcommd on the system.
812 (void) memset(&fl, 0, sizeof (fl));
813 fl.l_type = F_WRLCK;
814 fl.l_whence = SEEK_SET;
815 fl.l_start = 0;
816 fl.l_len = filesize + 1;
818 if (fcntl(fd, F_SETLK, &fl) == -1) {
819 commd_debug(MD_MMV_SYSLOG,
820 "init_set: Cannot lock MCT '%s'\n", table_name);
821 if (fcntl(fd, F_GETLK, &fl) != -1) {
822 commd_debug(MD_MMV_SYSLOG, "rpc.mdcommd:"
823 "Process %d holds lock\n", fl.l_pid);
824 (void) close(fd);
825 } else {
826 commd_debug(MD_MMV_SYSLOG, "rpc.mdcommd:"
827 "F_GETLK failed\n");
828 (void) close(fd);
829 return (-1);
833 * Try to terminate other mdcommd process so that we
834 * can establish ourselves.
836 if (sigsend(P_PID, fl.l_pid, 0) == 0) {
837 if (sigsend(P_PID, fl.l_pid, SIGKILL) < 0) {
838 commd_debug(MD_MMV_SYSLOG,
839 "rpc.mdcommd:"
840 "SIGKILL of %d failed\n", fl.l_pid);
841 } else {
842 commd_debug(MD_MMV_SYSLOG,
843 "rpc.mdcommd:"
844 "Process %d killed\n", fl.l_pid);
846 } else {
847 commd_debug(MD_MMV_SYSLOG, "rpc.mdcommd:"
848 "Process %d not killable\n", fl.l_pid);
850 return (-1);
853 * To ensure that the file has the appropriate size,
854 * we write a byte at the end of the file.
856 (void) lseek(fd, filesize + 1, SEEK_SET);
857 (void) write(fd, "\0", 1);
859 /* at this point we have a file in place that we can mmap */
860 addr = mmap(0, filesize, PROT_READ | PROT_WRITE,
861 MAP_SHARED, fd, (off_t)0);
862 if (addr == MAP_FAILED) {
863 commd_debug(MD_MMV_INIT,
864 "init_set: mmap mct error %d\n",
865 errno);
866 return (-1);
868 /* LINTED pointer alignment */
869 mct[setno] = (md_mn_mct_t *)addr;
871 /* finally we initialize the mutexes that protect the mct */
872 for (class = MD_MSG_CLASS1; class < MD_MN_NCLASSES; class++) {
873 (void) mutex_init(&(mct_mutex[setno][class]),
874 USYNC_THREAD, NULL);
877 md_mn_set_inited[setno] |= MDMN_SET_MCT;
880 * Check if we are told to setup the nodes and
881 * if these are not yet setup
882 * (Attention: negative logic here compared to above!)
884 if (((todo & MDMN_SET_NODES) == 0) ||
885 (md_mn_set_inited[setno] & MDMN_SET_NODES)) {
886 return (0); /* success */
889 if ((sp = metasetnosetname(setno, &ep)) == NULL) {
890 commd_debug(MD_MMV_SYSLOG,
891 "metasetnosetname(%d) returned NULL\n", setno);
892 return (MDMNE_NOT_JOINED);
895 /* flush local copy of rpc.metad data */
896 metaflushsetname(sp);
898 (void) mutex_lock(&get_setdesc_mutex);
899 sd = metaget_setdesc(sp, &ep);
900 (void) mutex_unlock(&get_setdesc_mutex);
902 if (sd == NULL) {
903 commd_debug(MD_MMV_SYSLOG,
904 "metaget_setdesc(%d) returned NULL\n", setno);
905 return (MDMNE_NOT_JOINED);
909 * if this set is not a multinode set or
910 * this node didn't join yet the diskset, better don't do anything
912 if ((MD_MNSET_DESC(sd) == 0) ||
913 (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN) == 0) {
914 commd_debug(MD_MMV_INIT, "didn't yet join set %d\n", setno);
915 return (MDMNE_NOT_JOINED);
918 for (node = sd->sd_nodelist; node != NULL; node = node->nd_next) {
919 time_t tout = 0;
920 nid = node->nd_nodeid;
922 commd_debug(MD_MMV_INIT,
923 "setting up: node=%s, priv_ic=%s, flags=0x%x\n",
924 node->nd_nodename ? node->nd_nodename : "NULL",
925 node->nd_priv_ic ? node->nd_priv_ic : "NULL",
926 node->nd_flags);
928 if ((node->nd_flags & MD_MN_NODE_OWN) == 0) {
929 commd_debug(MD_MMV_INIT,
930 "init: %s didn't join set %d\n",
931 node->nd_nodename ? node->nd_nodename : "NULL",
932 setno);
933 continue;
936 if (client[setno][nid] != (CLIENT *) NULL) {
937 /* already inited */
938 commd_debug(MD_MMV_INIT, "init: already: node=%s\n",
939 node->nd_nodename ? node->nd_nodename : "NULL");
940 continue;
944 * While trying to create a connection to a node,
945 * periodically check to see if the node has been marked
946 * dead by the SunCluster infrastructure.
947 * This periodic check is needed since a non-responsive
948 * rpc.mdcommd (while it is attempting to create a connection
949 * to a dead node) can lead to large delays and/or failures
950 * in the reconfig steps.
952 while ((client[setno][nid] == (CLIENT *) NULL) &&
953 (tout < MD_CLNT_CREATE_TOUT)) {
954 client[setno][nid] = meta_client_create_retry(
955 node->nd_nodename, mdmn_clnt_create,
956 (void *) node, MD_CLNT_CREATE_SUBTIMEOUT, &ep);
957 /* Is the node dead? */
958 if (mdmn_is_node_dead(node) == 1) {
959 commd_debug(MD_MMV_SYSLOG,
960 "rpc.mdcommd: no client for dead node %s\n",
961 node->nd_nodename);
962 break;
963 } else
964 tout += MD_CLNT_CREATE_SUBTIMEOUT;
967 if (client[setno][nid] == (CLIENT *) NULL) {
968 clnt_pcreateerror(node->nd_nodename);
970 * If we cannot connect to a single node
971 * (maybe because it is down) we mark this node as not
972 * owned and continue with the next node in the list.
973 * This is better than failing the entire starting up
974 * of the commd system.
976 node->nd_flags &= ~MD_MN_NODE_OWN;
977 commd_debug(MD_MMV_SYSLOG,
978 "WARNING couldn't create client for %s\n"
979 "Reconfig cycle required\n",
980 node->nd_nodename);
981 commd_debug(MD_MMV_INIT,
982 "WARNING couldn't create client for %s\n"
983 "Reconfig cycle required\n",
984 node->nd_nodename);
985 continue;
987 /* this node has the license to send */
988 commd_debug(MD_MMV_MISC, "init_set: calling add_lic\n");
989 add_license(node);
991 /* set the timeout value */
992 clnt_control(client[setno][nid], CLSET_TIMEOUT,
993 (char *)&FOUR_SECS);
995 commd_debug(MD_MMV_INIT, "init: done: node=%s\n",
996 node->nd_nodename ? node->nd_nodename : "NULL");
999 set_descriptor[setno] = sd;
1000 md_mn_set_inited[setno] |= MDMN_SET_NODES;
1001 return (0); /* success */
1004 void *
1005 mdmn_send_to_work(void *arg)
1007 int *rpc_err = NULL;
1008 int success;
1009 int try_master;
1010 set_t setno;
1011 mutex_t *mx; /* protection for initiator_table */
1012 SVCXPRT *transp;
1013 md_mn_msg_t *msg;
1014 md_mn_nodeid_t set_master;
1015 md_mn_msgclass_t class;
1016 md_mn_msg_and_transp_t *matp = (md_mn_msg_and_transp_t *)arg;
1018 msg = matp->mat_msg;
1019 transp = matp->mat_transp;
1021 class = mdmn_get_message_class(msg->msg_type);
1022 setno = msg->msg_setno;
1024 /* set the sender, so the master knows who to send the results */
1025 (void) rw_rdlock(&set_desc_rwlock[setno]);
1026 msg->msg_sender = set_descriptor[setno]->sd_mn_mynode->nd_nodeid;
1027 set_master = set_descriptor[setno]->sd_mn_master_nodeid;
1029 mx = mdmn_get_initiator_table_mx(setno, class);
1030 (void) mutex_lock(mx);
1033 * Here we check, if the initiator table slot for this set/class
1034 * combination is free to use.
1035 * If this is not the case, we return CLASS_BUSY forcing the
1036 * initiating send_message call to retry
1038 success = mdmn_check_initiator_table(setno, class);
1039 if (success == MDMNE_CLASS_BUSY) {
1040 md_mn_msgid_t active_mid;
1042 mdmn_get_initiator_table_id(setno, class, &active_mid);
1044 commd_debug(MD_MMV_SEND,
1045 "send_to_work: received but locally busy "
1046 "(%d, 0x%llx-%d), set=%d, class=%d, type=%d, "
1047 "active msg=(%d, 0x%llx-%d)\n",
1048 MSGID_ELEMS(msg->msg_msgid), setno, class,
1049 msg->msg_type, MSGID_ELEMS(active_mid));
1050 } else {
1051 commd_debug(MD_MMV_SEND,
1052 "send_to_work: received (%d, 0x%llx-%d), "
1053 "set=%d, class=%d, type=%d\n",
1054 MSGID_ELEMS(msg->msg_msgid), setno, class, msg->msg_type);
1057 try_master = 2; /* return failure after two retries */
1058 while ((success == MDMNE_ACK) && (try_master--)) {
1059 (void) rw_rdlock(&client_rwlock[setno]);
1060 /* is the rpc client to the master still around ? */
1061 if (check_client(setno, set_master)) {
1062 success = MDMNE_RPC_FAIL;
1063 FLUSH_DEBUGFILE();
1064 (void) rw_unlock(&client_rwlock[setno]);
1065 break; /* out of try_master-loop */
1069 * Send the request to the work function on the master
1070 * this call will return immediately
1072 rpc_err = mdmn_work_2(msg, client[setno][set_master],
1073 set_master);
1075 /* Everything's Ok? */
1076 if (rpc_err == NULL) {
1077 success = MDMNE_RPC_FAIL;
1079 * Probably something happened to the daemon on the
1080 * master. Kill the client, and try again...
1082 (void) rw_unlock(&client_rwlock[setno]);
1083 (void) rw_wrlock(&client_rwlock[setno]);
1084 mdmn_clnt_destroy(client[setno][set_master]);
1085 if (client[setno][set_master] != (CLIENT *)NULL) {
1086 client[setno][set_master] = (CLIENT *)NULL;
1088 (void) rw_unlock(&client_rwlock[setno]);
1089 continue;
1091 } else if (*rpc_err != MDMNE_ACK) {
1092 /* something went wrong, break out */
1093 success = *rpc_err;
1094 free(rpc_err);
1095 (void) rw_unlock(&client_rwlock[setno]);
1096 break; /* out of try_master-loop */
1099 (void) rw_unlock(&client_rwlock[setno]);
1100 free(rpc_err);
1103 * If we are here, we sucessfully delivered the message.
1104 * We register the initiator_table, so that
1105 * wakeup_initiator_2 can do the sendreply with the
1106 * results for us.
1108 success = MDMNE_ACK;
1109 mdmn_register_initiator_table(setno, class, msg, transp);
1111 /* tell check_timeouts, there's work to do */
1112 (void) mutex_lock(&check_timeout_mutex);
1113 messages_on_their_way++;
1114 (void) cond_signal(&check_timeout_cv);
1115 (void) mutex_unlock(&check_timeout_mutex);
1116 break; /* out of try_master-loop */
1119 (void) rw_unlock(&set_desc_rwlock[setno]);
1121 if (success == MDMNE_ACK) {
1122 commd_debug(MD_MMV_SEND,
1123 "send_to_work: registered (%d, 0x%llx-%d)\n",
1124 MSGID_ELEMS(msg->msg_msgid));
1125 } else {
1126 /* In case of failure do the sendreply now */
1127 md_mn_result_t *resultp;
1128 resultp = Zalloc(sizeof (md_mn_result_t));
1129 resultp->mmr_comm_state = success;
1131 * copy the MSGID so that we know _which_ message
1132 * failed (if the transp has got mangled)
1134 MSGID_COPY(&(msg->msg_msgid), &(resultp->mmr_msgid));
1135 mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp);
1136 commd_debug(MD_MMV_SEND,
1137 "send_to_work: not registered (%d, 0x%llx-%d) cs=%d\n",
1138 MSGID_ELEMS(msg->msg_msgid), success);
1139 free_result(resultp);
1141 * We don't have a timeout registered to wake us up, so we're
1142 * now done with this handle. Release it back to the pool.
1144 svc_done(transp);
1148 free_msg(msg);
1149 /* the alloc was done in mdmn_send_svc_2 */
1150 Free(matp);
1151 (void) mutex_unlock(mx);
1152 return (NULL);
1157 * do_message_locally(msg, result)
1158 * Process a message locally on the master
1159 * Lookup the MCT if the message has already been processed.
1160 * If not, call the handler and store the result
1161 * If yes, retrieve the result from the MCT.
1162 * Return:
1163 * MDMNE_ACK in case of success
1164 * MDMNE_LOG_FAIL if the MCT could not be checked
1166 static int
1167 do_message_locally(md_mn_msg_t *msg, md_mn_result_t *result)
1169 int completed;
1170 set_t setno;
1171 md_mn_msgtype_t msgtype = msg->msg_type;
1172 md_mn_msgclass_t class;
1174 void (*handler)(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *res);
1176 handler = mdmn_get_handler(msgtype);
1177 if (handler == NULL) {
1178 result->mmr_exitval = 0;
1179 /* let the sender decide if this is an error or not */
1180 result->mmr_comm_state = MDMNE_NO_HANDLER;
1181 return (MDMNE_NO_HANDLER);
1184 class = mdmn_get_message_class(msg->msg_type);
1185 setno = msg->msg_setno;
1187 result->mmr_msgtype = msgtype;
1188 result->mmr_flags = msg->msg_flags;
1189 MSGID_COPY(&(msg->msg_msgid), &(result->mmr_msgid));
1191 (void) mutex_lock(&mct_mutex[setno][class]);
1192 completed = mdmn_check_completion(msg, result);
1193 if (completed == MDMN_MCT_NOT_DONE) {
1194 /* message not yet processed locally */
1195 commd_debug(MD_MMV_PROC_M, "proc_mas: "
1196 "calling handler for (%d,0x%llx-%d) type %d\n",
1197 MSGID_ELEMS(msg->msg_msgid), msgtype);
1200 * Mark the message as being currently processed,
1201 * so we won't start a second handler for it
1203 (void) mdmn_mark_completion(msg, NULL, MDMN_MCT_IN_PROGRESS);
1204 (void) mutex_unlock(&mct_mutex[setno][class]);
1206 /* here we actually process the message on the master */
1207 (*handler)(msg, MD_MSGF_ON_MASTER, result);
1209 commd_debug(MD_MMV_PROC_M, "proc_mas: "
1210 "finished handler for (%d,0x%llx-%d) type %d\n",
1211 MSGID_ELEMS(msg->msg_msgid), msgtype);
1213 /* Mark the message as fully processed, store the result */
1214 (void) mutex_lock(&mct_mutex[setno][class]);
1215 (void) mdmn_mark_completion(msg, result, MDMN_MCT_DONE);
1216 } else if (completed == MDMN_MCT_DONE) {
1217 commd_debug(MD_MMV_PROC_M, "proc_mas: "
1218 "result for (%d, 0x%llx-%d) from MCT\n",
1219 MSGID_ELEMS(msg->msg_msgid), msgtype);
1220 } else if (completed == MDMN_MCT_IN_PROGRESS) {
1221 commd_debug(MD_MMV_PROC_M, "proc_mas: "
1222 "(%d, 0x%llx-%d) is currently being processed\n",
1223 MSGID_ELEMS(msg->msg_msgid), msgtype);
1224 } else {
1225 /* MCT error occurred (should never happen) */
1226 (void) mutex_unlock(&mct_mutex[setno][class]);
1227 result->mmr_comm_state = MDMNE_LOG_FAIL;
1228 commd_debug(MD_MMV_SYSLOG, "WARNING "
1229 "mdmn_check_completion returned %d "
1230 "for (%d,0x%llx-%d)\n", completed,
1231 MSGID_ELEMS(msg->msg_msgid));
1232 return (MDMNE_LOG_FAIL);
1234 (void) mutex_unlock(&mct_mutex[setno][class]);
1235 return (MDMNE_ACK);
1240 * do_send_message(msg, node)
1242 * Send a message to a given node and wait for a acknowledgment, that the
1243 * message has arrived on the remote node.
1244 * Make sure that the client for the set is setup correctly.
1245 * If no ACK arrives, destroy and recreate the RPC client and retry the
1246 * message one time
1247 * After actually sending wait no longer than the appropriate number of
1248 * before timing out the message.
1250 * Note must be called with set_desc_wrlock held in reader mode
1252 static int
1253 do_send_message(md_mn_msg_t *msg, md_mnnode_desc *node)
1255 int err;
1256 int rpc_retries;
1257 int timeout_retries = 0;
1258 int *ret = NULL;
1259 set_t setno;
1260 cond_t *cv; /* see mdmn_wakeup_master_svc_2 */
1261 mutex_t *mx; /* protection for class_busy */
1262 timestruc_t timeout; /* surveillance for remote daemon */
1263 md_mn_nodeid_t nid;
1264 md_mn_msgtype_t msgtype;
1265 md_mn_msgclass_t class;
1267 nid = node->nd_nodeid;
1268 msgtype = msg->msg_type;
1269 setno = msg->msg_setno;
1270 class = mdmn_get_message_class(msgtype);
1271 mx = mdmn_get_master_table_mx(setno, class);
1272 cv = mdmn_get_master_table_cv(setno, class);
1274 retry_rpc:
1276 /* We try two times to send the message */
1277 rpc_retries = 2;
1280 * if sending the message doesn't succeed the first time due to a
1281 * RPC problem, we retry one time
1283 while ((rpc_retries != 0) && (ret == NULL)) {
1284 /* in abort state, we error out immediately */
1285 if (md_commd_global_state & MD_CGS_ABORTED) {
1286 return (MDMNE_ABORT);
1289 (void) rw_rdlock(&client_rwlock[setno]);
1290 /* unable to create client? Ignore it */
1291 if (check_client(setno, nid)) {
1293 * In case we cannot establish an RPC client, we
1294 * take this node out of our considerations.
1295 * This will be reset by a reconfig
1296 * cycle that should come pretty soon.
1297 * MNISSUE: Should a reconfig cycle
1298 * be forced on SunCluster?
1300 node->nd_flags &= ~MD_MN_NODE_OWN;
1301 commd_debug(MD_MMV_SYSLOG,
1302 "WARNING couldn't create client for %s\n"
1303 "Reconfig cycle required\n",
1304 node->nd_nodename);
1305 commd_debug(MD_MMV_PROC_M, "proc_mas: (%d,0x%llx-%d) "
1306 "WARNING couldn't create client for %s\n",
1307 MSGID_ELEMS(msg->msg_msgid), node->nd_nodename);
1308 (void) rw_unlock(&client_rwlock[setno]);
1309 return (MDMNE_IGNORE_NODE);
1311 /* let's be paranoid and check again before sending */
1312 if (client[setno][nid] == NULL) {
1314 * if this is true, strange enough, we catch our breath,
1315 * and then continue, so that the client is set up
1316 * once again.
1318 commd_debug(MD_MMV_PROC_M, "client is NULL\n");
1319 (void) rw_unlock(&client_rwlock[setno]);
1320 (void) sleep(1);
1321 continue;
1324 /* send it over, it will return immediately */
1325 ret = mdmn_work_2(msg, client[setno][nid], nid);
1327 (void) rw_unlock(&client_rwlock[setno]);
1329 if (ret != NULL) {
1330 commd_debug(MD_MMV_PROC_M,
1331 "proc_mas: sending (%d,0x%llx-%d) to %d returned "
1332 " 0x%x\n",
1333 MSGID_ELEMS(msg->msg_msgid), nid, *ret);
1334 } else {
1335 commd_debug(MD_MMV_PROC_M,
1336 "proc_mas: sending (%d,0x%llx-%d) to %d returned "
1337 " NULL \n",
1338 MSGID_ELEMS(msg->msg_msgid), nid);
1341 if ((ret == NULL) || (*ret == MDMNE_CANNOT_CONNECT) ||
1342 (*ret == MDMNE_THR_CREATE_FAIL)) {
1344 * Something happened to the daemon on the other side.
1345 * Kill the client, and try again.
1346 * check_client() will create a new client
1348 (void) rw_wrlock(&client_rwlock[setno]);
1349 mdmn_clnt_destroy(client[setno][nid]);
1350 if (client[setno][nid] != (CLIENT *)NULL) {
1351 client[setno][nid] = (CLIENT *)NULL;
1353 (void) rw_unlock(&client_rwlock[setno]);
1355 /* ... but don't try infinitely */
1356 --rpc_retries;
1357 continue;
1360 * If the class is locked on the other node, keep trying.
1361 * This situation will go away automatically,
1362 * if we wait long enough
1364 if (*ret == MDMNE_CLASS_LOCKED) {
1365 (void) sleep(1);
1366 free(ret);
1367 ret = NULL;
1368 continue;
1371 if (ret == NULL) {
1372 return (MDMNE_RPC_FAIL);
1376 /* if the slave is in abort state, we just ignore it. */
1377 if (*ret == MDMNE_ABORT) {
1378 commd_debug(MD_MMV_PROC_M,
1379 "proc_mas: work(%d,0x%llx-%d) returned "
1380 "MDMNE_ABORT\n",
1381 MSGID_ELEMS(msg->msg_msgid));
1382 free(ret);
1383 return (MDMNE_IGNORE_NODE);
1386 /* Did the remote processing succeed? */
1387 if (*ret != MDMNE_ACK) {
1389 * Some commd failure in the middle of sending the msg
1390 * to the nodes. We don't continue here.
1392 commd_debug(MD_MMV_PROC_M,
1393 "proc_mas: work(%d,0x%llx-%d) returns %d\n",
1394 MSGID_ELEMS(msg->msg_msgid), *ret);
1395 free(ret);
1396 return (MDMNE_RPC_FAIL);
1398 free(ret);
1399 ret = NULL;
1402 * When we are here, we have sent the message to the other node and
1403 * we know that node has accepted it.
1404 * We go to sleep and have trust to be woken up by wakeup.
1405 * If we wakeup due to a timeout, or a signal, no result has been
1406 * placed in the appropriate slot.
1407 * If we timeout, it is likely that this is because the node has
1408 * gone away, so we will destroy the client and try it again in the
1409 * expectation that the rpc will fail and we will return
1410 * MDMNE_IGNORE_NODE. If that is not the case, the message must still
1411 * be being processed on the slave. In this case just timeout for 4
1412 * more seconds and then return RPC_FAIL if the message is not complete.
1414 timeout.tv_nsec = 0;
1415 timeout.tv_sec = (timeout_retries == 0) ? mdmn_get_timeout(msgtype) :
1416 FOUR_SECS.tv_sec;
1417 err = cond_reltimedwait(cv, mx, &timeout);
1419 if (err == 0) {
1420 /* everything's fine, return success */
1421 return (MDMNE_ACK);
1424 if (err == ETIME) {
1425 commd_debug(MD_MMV_PROC_M, "proc_mas: "
1426 "timeout occured, set=%d, class=%d, "
1427 "msgid=(%d, 0x%llx-%d), timeout_retries=%d\n",
1428 setno, class, MSGID_ELEMS(msg->msg_msgid), timeout_retries);
1429 if (timeout_retries == 0) {
1430 timeout_retries++;
1432 * Destroy the client and try the rpc call again
1434 (void) rw_wrlock(&client_rwlock[setno]);
1435 mdmn_clnt_destroy(client[setno][nid]);
1436 client[setno][nid] = (CLIENT *)NULL;
1437 (void) rw_unlock(&client_rwlock[setno]);
1438 goto retry_rpc;
1440 } else if (err == EINTR) {
1441 commd_debug(MD_MMV_PROC_M, "proc_mas: "
1442 "commd signalled, set=%d, class=%d, "
1443 "msgid=(%d, 0x%llx-%d)\n",
1444 setno, class, MSGID_ELEMS(msg->msg_msgid));
1445 } else {
1446 commd_debug(MD_MMV_PROC_M, "proc_mas: "
1447 "cond_reltimedwait err=%d, set=%d, "
1448 "class=%d, msgid=(%d, 0x%llx-%d)\n",
1449 err, setno, class,
1450 MSGID_ELEMS(msg->msg_msgid));
1453 /* some failure happened */
1454 return (MDMNE_RPC_FAIL);
1458 * before we return we have to
1459 * free_msg(msg); because we are working on a copied message
1461 void
1462 mdmn_master_process_msg(md_mn_msg_t *msg)
1464 int *ret;
1465 int err;
1466 int nmsgs; /* total number of msgs */
1467 int curmsg; /* index of current msg */
1468 set_t setno;
1469 uint_t inherit_flags = 0;
1470 uint_t secdiff, usecdiff; /* runtime of this message */
1471 md_error_t mde = mdnullerror;
1472 md_mn_msg_t *msglist[MAX_SUBMESSAGES]; /* all msgs to process */
1473 md_mn_msg_t *cmsg; /* current msg */
1474 md_mn_msgid_t dummyid;
1475 md_mn_result_t *result;
1476 md_mn_result_t *slave_result;
1477 md_mn_nodeid_t sender;
1478 md_mn_nodeid_t set_master;
1479 md_mnnode_desc *node;
1480 md_mn_msgtype_t orig_type; /* type of the original message */
1481 md_mn_msgtype_t msgtype; /* type of the current message */
1482 md_mn_msgclass_t orig_class; /* class of the original message */
1483 md_mn_msgclass_t class; /* class of the current message */
1485 int (*smgen)(md_mn_msg_t *msg, md_mn_msg_t **msglist);
1487 orig_type = msgtype = msg->msg_type;
1488 sender = msg->msg_sender;
1489 setno = msg->msg_setno;
1491 result = Zalloc(sizeof (md_mn_result_t));
1492 result->mmr_setno = setno;
1493 result->mmr_msgtype = msgtype;
1494 MSGID_COPY(&(msg->msg_msgid), &(result->mmr_msgid));
1496 orig_class = mdmn_get_message_class(msgtype);
1498 commd_debug(MD_MMV_PROC_M,
1499 "proc_mas: received (%d, 0x%llx-%d) set=%d, class=%d, type=%d\n",
1500 MSGID_ELEMS(msg->msg_msgid), setno, orig_class, msgtype);
1502 (void) rw_rdlock(&set_desc_rwlock[setno]);
1503 set_master = set_descriptor[setno]->sd_mn_master_nodeid;
1504 result->mmr_sender = set_master;
1506 * Put message into the change log unless told otherwise
1507 * Note that we only log original messages.
1508 * If they are generated by some smgen, we don't log them!
1509 * Replay messages aren't logged either.
1510 * Note, that replay messages are unlogged on completion.
1512 if ((msg->msg_flags & (MD_MSGF_NO_LOG | MD_MSGF_REPLAY_MSG)) == 0) {
1513 commd_debug(MD_MMV_PROC_M,
1514 "proc_mas: calling log_msg for (%d,0x%llx-%d) type %d\n",
1515 MSGID_ELEMS(msg->msg_msgid), msgtype);
1516 err = mdmn_log_msg(msg);
1517 if (err == MDMNE_NULL) {
1518 /* msg logged successfully */
1519 commd_debug(MD_MMV_PROC_M, "proc_mas: "
1520 "done log_msg for (%d,0x%llx-%d) type %d\n",
1521 MSGID_ELEMS(msg->msg_msgid), msgtype);
1522 goto proceed;
1524 if (err == MDMNE_ACK) {
1525 /* Same msg in the slot, proceed */
1526 commd_debug(MD_MMV_PROC_M, "proc_mas: "
1527 "already logged (%d,0x%llx-%d) type %d\n",
1528 MSGID_ELEMS(msg->msg_msgid), msgtype);
1529 goto proceed;
1531 if (err == MDMNE_LOG_FAIL) {
1532 /* Oh, bad, the log is non functional. */
1533 result->mmr_comm_state = MDMNE_LOG_FAIL;
1535 * Note that the mark_busy was already done by
1536 * mdmn_work_svc_2()
1538 (void) mutex_lock(&mdmn_busy_mutex[setno]);
1539 mdmn_mark_class_unbusy(setno, orig_class);
1540 (void) mutex_unlock(&mdmn_busy_mutex[setno]);
1543 if (err == MDMNE_CLASS_BUSY) {
1545 * The log is occupied with a different message
1546 * that needs to be played first.
1547 * We reject the current message with MDMNE_CLASS_BUSY
1548 * to the initiator and do not unbusy the set/class,
1549 * because we will proceed with the logged message,
1550 * which has the same set/class combination
1552 result->mmr_comm_state = MDMNE_CLASS_BUSY;
1554 ret = (int *)NULL;
1555 (void) rw_rdlock(&client_rwlock[setno]);
1557 if (check_client(setno, sender)) {
1558 commd_debug(MD_MMV_SYSLOG,
1559 "proc_mas: No client for initiator \n");
1560 } else {
1561 ret = mdmn_wakeup_initiator_2(result,
1562 client[setno][sender], sender);
1564 (void) rw_unlock(&client_rwlock[setno]);
1566 if (ret == (int *)NULL) {
1567 commd_debug(MD_MMV_SYSLOG,
1568 "proc_mas: couldn't wakeup_initiator \n");
1569 } else {
1570 if (*ret != MDMNE_ACK) {
1571 commd_debug(MD_MMV_SYSLOG, "proc_mas: "
1572 "wakeup_initiator returned %d\n", *ret);
1574 free(ret);
1576 free_msg(msg);
1578 if (err == MDMNE_LOG_FAIL) {
1579 /* we can't proceed here */
1580 free_result(result);
1581 (void) rw_unlock(&set_desc_rwlock[setno]);
1582 return;
1583 } else if (err == MDMNE_CLASS_BUSY) {
1584 mdmn_changelog_record_t *lr;
1585 lr = mdmn_get_changelogrec(setno, orig_class);
1586 assert(lr != NULL);
1588 /* proceed with the logged message */
1589 msg = copy_msg(&(lr->lr_msg), NULL);
1592 * The logged message has to have the same class but
1593 * type and sender can be different
1595 orig_type = msgtype = msg->msg_type;
1596 sender = msg->msg_sender;
1598 commd_debug(MD_MMV_PROC_M,
1599 "proc_mas: Got new message from change log: "
1600 "(%d,0x%llx-%d) type %d\n",
1601 MSGID_ELEMS(msg->msg_msgid), msgtype);
1603 /* continue normal operation with this message */
1607 proceed:
1608 smgen = mdmn_get_submessage_generator(msgtype);
1609 if (smgen == NULL) {
1610 /* no submessages to create, just use the original message */
1611 msglist[0] = msg;
1612 nmsgs = 1;
1613 } else {
1614 /* some bits are passed on to submessages */
1615 inherit_flags = msg->msg_flags & MD_MSGF_INHERIT_BITS;
1617 nmsgs = smgen(msg, msglist);
1619 /* some settings for the submessages */
1620 for (curmsg = 0; curmsg < nmsgs; curmsg++) {
1621 cmsg = msglist[curmsg];
1623 /* Apply the inherited flags */
1624 cmsg->msg_flags |= inherit_flags;
1627 * Make sure the submessage ID is set correctly
1628 * Note: first submessage has mid_smid of 1 (not 0)
1630 cmsg->msg_msgid.mid_smid = curmsg + 1;
1632 /* need the original class set in msgID (for MCT) */
1633 cmsg->msg_msgid.mid_oclass = orig_class;
1636 commd_debug(MD_MMV_PROC_M,
1637 "smgen generated %d submsgs, origclass = %d\n",
1638 nmsgs, orig_class);
1641 * This big loop does the following.
1642 * For all messages:
1643 * process message on the master first (a message completion
1644 * table MCT ensures a message is not processed twice)
1645 * in case of an error break out of message loop
1646 * for all nodes -- unless MD_MSGF_NO_BCAST is set --
1647 * send message to node until that succeeds
1648 * merge result -- not yet implemented
1649 * respect MD_MSGF_STOP_ON_ERROR
1651 for (curmsg = 0; curmsg < nmsgs; curmsg++) {
1652 int break_msg_loop = 0;
1653 mutex_t *mx; /* protection for class_busy */
1654 int master_err;
1655 int master_exitval = -1;
1657 cmsg = msglist[curmsg];
1658 msgtype = cmsg->msg_type;
1659 class = mdmn_get_message_class(msgtype);
1660 node = NULL;
1661 mx = mdmn_get_master_table_mx(setno, class);
1663 /* If we are in the abort state, we error out immediately */
1664 if (md_commd_global_state & MD_CGS_ABORTED) {
1665 break; /* out of the message loop */
1668 commd_debug(MD_MMV_PROC_M, "class=%d, orig_class=%d\n",
1669 class, orig_class);
1671 * If the current class is different from the original class,
1672 * we have to lock it down.
1673 * The original class is already marked busy.
1674 * At this point we cannot refuse the message because the
1675 * class is busy right now, so we wait until the class becomes
1676 * available again. As soon as something changes for this set
1677 * we will be cond_signal'ed (in mdmn_mark_class_unbusy)
1679 * Granularity could be finer (setno/class)
1681 if (class != orig_class) {
1682 (void) mutex_lock(&mdmn_busy_mutex[setno]);
1683 while (mdmn_mark_class_busy(setno, class) == FALSE) {
1684 (void) cond_wait(&mdmn_busy_cv[setno],
1685 &mdmn_busy_mutex[setno]);
1687 (void) mutex_unlock(&mdmn_busy_mutex[setno]);
1690 master_err = do_message_locally(cmsg, result);
1692 if ((master_err != MDMNE_ACK) ||
1693 ((master_err == MDMNE_ACK) && (result->mmr_exitval != 0))) {
1694 result->mmr_failing_node = set_master;
1695 if (cmsg->msg_flags & MD_MSGF_STOP_ON_ERROR) {
1697 * if appropriate, unbusy the class and
1698 * break out of the message loop
1700 if (class != orig_class) {
1701 (void) mutex_lock(
1702 &mdmn_busy_mutex[setno]);
1703 mdmn_mark_class_unbusy(setno, class);
1704 (void) mutex_unlock(
1705 &mdmn_busy_mutex[setno]);
1707 break;
1711 if (master_err == MDMNE_ACK)
1712 master_exitval = result->mmr_exitval;
1714 /* No broadcast? => next message */
1715 if (cmsg->msg_flags & MD_MSGF_NO_BCAST) {
1716 /* if appropriate, unbusy the class */
1717 if (class != orig_class) {
1718 (void) mutex_lock(&mdmn_busy_mutex[setno]);
1719 mdmn_mark_class_unbusy(setno, class);
1720 (void) mutex_unlock(&mdmn_busy_mutex[setno]);
1722 continue;
1726 /* fake sender, so we get notified when the results are avail */
1727 cmsg->msg_sender = set_master;
1729 * register to the master_table. It's needed by wakeup_master to
1730 * wakeup the sleeping thread.
1731 * Access is protected by the class lock: mdmn_mark_class_busy()
1733 mdmn_set_master_table_id(setno, class, &(cmsg->msg_msgid));
1737 (void) rw_rdlock(&set_desc_rwlock[setno]);
1738 /* Send the message to all other nodes */
1739 for (node = set_descriptor[setno]->sd_nodelist; node;
1740 node = node->nd_next) {
1741 md_mn_nodeid_t nid = node->nd_nodeid;
1743 /* We are master and have already processed the msg */
1744 if (node == set_descriptor[setno]->sd_mn_masternode) {
1745 continue;
1748 /* If this node didn't join the disk set, ignore it */
1749 if ((node->nd_flags & MD_MN_NODE_OWN) == 0) {
1750 continue;
1753 /* If a DIRECTED message, skip non-recipient nodes */
1754 if ((cmsg->msg_flags & MD_MSGF_DIRECTED) &&
1755 nid != cmsg->msg_recipient) {
1756 continue;
1759 (void) mutex_lock(mx);
1761 * Register the node that is addressed,
1762 * so we can detect unsolicited messages
1764 mdmn_set_master_table_addr(setno, class, nid);
1765 slave_result = (md_mn_result_t *)NULL;
1768 * Now send it. do_send_message() will return if
1769 * a failure occurs or
1770 * the results are available
1772 err = do_send_message(cmsg, node);
1774 /* in abort state, we error out immediately */
1775 if (md_commd_global_state & MD_CGS_ABORTED) {
1776 break;
1779 if (err == MDMNE_ACK) {
1780 slave_result =
1781 mdmn_get_master_table_res(setno, class);
1782 commd_debug(MD_MMV_PROC_M,
1783 "proc_mas: got result for (%d,0x%llx-%d)\n",
1784 MSGID_ELEMS(cmsg->msg_msgid));
1785 } else if (err == MDMNE_IGNORE_NODE) {
1786 (void) mutex_unlock(mx);
1787 continue; /* send to next node */
1789 (void) mutex_unlock(mx);
1793 * If the result is NULL, or err doesn't show success,
1794 * something went wrong with this RPC call.
1796 if ((slave_result == NULL) || (err != MDMNE_ACK)) {
1798 * If PANIC_WHEN_INCONSISTENT set,
1799 * panic if the master succeeded while
1800 * this node failed
1802 if ((cmsg->msg_flags &
1803 MD_MSGF_PANIC_WHEN_INCONSISTENT) &&
1804 (master_err == MDMNE_ACK))
1805 panic_system(nid, cmsg->msg_type,
1806 master_err, master_exitval,
1807 slave_result);
1809 result->mmr_failing_node = nid;
1810 /* are we supposed to stop in case of error? */
1811 if (cmsg->msg_flags & MD_MSGF_STOP_ON_ERROR) {
1812 result->mmr_exitval = MDMNE_RPC_FAIL;
1813 commd_debug(MD_MMV_SYSLOG, "proc_mas: "
1814 "result (%d,0x%llx-%d) is NULL\n",
1815 MSGID_ELEMS(cmsg->msg_msgid));
1816 FLUSH_DEBUGFILE();
1817 break_msg_loop = 1;
1818 break; /* out of node loop first */
1819 } else {
1820 /* send msg to the next node */
1821 continue;
1827 * Message processed on remote node.
1828 * If PANIC_WHEN_INCONSISTENT set, panic if the
1829 * result is different on this node from the result
1830 * on the master
1832 if ((cmsg->msg_flags &
1833 MD_MSGF_PANIC_WHEN_INCONSISTENT) &&
1834 ((master_err != MDMNE_ACK) ||
1835 (slave_result->mmr_exitval != master_exitval)))
1836 panic_system(nid, cmsg->msg_type, master_err,
1837 master_exitval, slave_result);
1840 * At this point we know we have a message that was
1841 * processed on the remote node.
1842 * We now check if the exitval is non zero.
1843 * In that case we discard the previous result and
1844 * rather use the current.
1845 * This means: If a message fails on no node,
1846 * the result from the master will be returned.
1847 * There's currently no such thing as merge of results
1848 * If additionally STOP_ON_ERROR is set, we bail out
1850 if (slave_result->mmr_exitval != 0) {
1851 /* throw away the previously allocated result */
1852 free_result(result);
1854 /* copy_result() allocates new memory */
1855 result = copy_result(slave_result);
1856 free_result(slave_result);
1858 dump_result(MD_MMV_PROC_M, "proc_mas", result);
1860 result->mmr_failing_node = nid;
1861 if (cmsg->msg_flags & MD_MSGF_STOP_ON_ERROR) {
1862 break_msg_loop = 1;
1863 break; /* out of node loop */
1865 continue; /* try next node */
1867 } else {
1869 * MNIssue: may want to merge the results
1870 * from all slaves. Currently only report
1871 * the results from the master.
1873 free_result(slave_result);
1876 } /* End of loop over the nodes */
1877 (void) rw_unlock(&set_desc_rwlock[setno]);
1880 /* release the current class again */
1881 if (class != orig_class) {
1882 (void) mutex_lock(&mdmn_busy_mutex[setno]);
1883 mdmn_mark_class_unbusy(setno, class);
1884 (void) mutex_unlock(&mdmn_busy_mutex[setno]);
1887 /* are we supposed to quit entirely ? */
1888 if (break_msg_loop ||
1889 (md_commd_global_state & MD_CGS_ABORTED)) {
1890 break; /* out of msg loop */
1893 } /* End of loop over the messages */
1895 * If we are here, there's two possibilities:
1896 * - we processed all messages on all nodes without an error.
1897 * In this case we return the result from the master.
1898 * (to be implemented: return the merged result)
1899 * - we encountered an error in which case result has been
1900 * set accordingly already.
1903 if (md_commd_global_state & MD_CGS_ABORTED) {
1904 result->mmr_comm_state = MDMNE_ABORT;
1908 * This message has been processed completely.
1909 * Remove it from the changelog.
1910 * Do this for replay messages too.
1911 * Note that the message is unlogged before waking up the
1912 * initiator. This is done for two reasons.
1913 * 1. Remove a race condition that occurs when back to back
1914 * messages are sent for the same class, the registeration is
1915 * is lost.
1916 * 2. If the initiator died but the action was completed on all the
1917 * the nodes, we want that to be marked "done" quickly.
1920 if ((msg->msg_flags & MD_MSGF_NO_LOG) == 0) {
1921 commd_debug(MD_MMV_PROC_M,
1922 "proc_mas: calling unlog_msg for (%d,0x%llx-%d) type %d\n",
1923 MSGID_ELEMS(msg->msg_msgid), msgtype);
1924 (void) mdmn_unlog_msg(msg);
1925 commd_debug(MD_MMV_PROC_M,
1926 "proc_mas: done unlog_msg for (%d,0x%llx-%d) type %d\n",
1927 MSGID_ELEMS(msg->msg_msgid), msgtype);
1931 * In case of submessages, we increased the submessage ID in the
1932 * result structure. We restore the message ID to the value that
1933 * the initiator is waiting for.
1935 result->mmr_msgid.mid_smid = 0;
1936 result->mmr_msgtype = orig_type;
1937 result->mmr_sender = set_master;
1939 /* if we have an inited client, send result */
1940 ret = (int *)NULL;
1942 (void) rw_rdlock(&client_rwlock[setno]);
1943 if (check_client(setno, sender)) {
1944 commd_debug(MD_MMV_SYSLOG,
1945 "proc_mas: unable to create client for initiator\n");
1946 } else {
1947 ret = mdmn_wakeup_initiator_2(result, client[setno][sender],
1948 sender);
1950 (void) rw_unlock(&client_rwlock[setno]);
1952 if (ret == (int *)NULL) {
1953 commd_debug(MD_MMV_PROC_M,
1954 "proc_mas: couldn't wakeup initiator\n");
1955 } else {
1956 if (*ret != MDMNE_ACK) {
1957 commd_debug(MD_MMV_PROC_M,
1958 "proc_mas: wakeup_initiator returned %d\n",
1959 *ret);
1961 free(ret);
1964 (void) rw_unlock(&set_desc_rwlock[setno]);
1965 /* Free all submessages, if there were any */
1966 if (nmsgs > 1) {
1967 for (curmsg = 0; curmsg < nmsgs; curmsg++) {
1968 free_msg(msglist[curmsg]);
1971 /* Free the result */
1972 free_result(result);
1974 (void) mutex_lock(&mdmn_busy_mutex[setno]);
1975 mdmn_mark_class_unbusy(setno, orig_class);
1976 (void) mutex_unlock(&mdmn_busy_mutex[setno]);
1980 * We use this ioctl just to get the time in the same format as used in
1981 * the messageID. If it fails, all we get is a bad runtime output.
1983 (void) metaioctl(MD_IOCGUNIQMSGID, &dummyid, &mde, NULL);
1984 secdiff = (dummyid.mid_time - msg->msg_msgid.mid_time) >> 32;
1985 usecdiff = (dummyid.mid_time - msg->msg_msgid.mid_time) & 0xfffff;
1987 /* catching possible overflow */
1988 if (usecdiff >= 1000000) {
1989 usecdiff -= 1000000;
1990 secdiff++;
1994 commd_debug(MD_MMV_PROC_M, "proc_mas: done (%d, 0x%llx-%d) type=%02d "
1995 "%5d.%06d secs runtime\n",
1996 MSGID_ELEMS(msg->msg_msgid), orig_type, secdiff, usecdiff);
1998 /* Free the original message */
1999 free_msg(msg);
2002 void
2003 mdmn_slave_process_msg(md_mn_msg_t *msg)
2005 int *ret = NULL;
2006 int completed;
2007 int retries;
2008 int successfully_returned;
2009 set_t setno;
2010 md_mn_result_t *result;
2011 md_mn_nodeid_t sender;
2012 md_mn_nodeid_t whoami;
2013 md_mn_msgtype_t msgtype;
2014 md_mn_msgclass_t class;
2016 void (*handler)(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *res);
2018 setno = msg->msg_setno;
2019 sender = msg->msg_sender; /* this is always the master of the set */
2020 msgtype = msg->msg_type;
2022 (void) rw_rdlock(&set_desc_rwlock[setno]);
2023 whoami = set_descriptor[setno]->sd_mn_mynode->nd_nodeid;
2024 (void) rw_unlock(&set_desc_rwlock[setno]);
2026 result = Zalloc(sizeof (md_mn_result_t));
2027 result->mmr_flags = msg->msg_flags;
2028 result->mmr_setno = setno;
2029 result->mmr_msgtype = msgtype;
2030 result->mmr_sender = whoami;
2031 result->mmr_comm_state = MDMNE_ACK; /* Ok state */
2032 MSGID_COPY(&(msg->msg_msgid), &(result->mmr_msgid));
2033 class = mdmn_get_message_class(msgtype);
2035 commd_debug(MD_MMV_PROC_S,
2036 "proc_sla: received (%d, 0x%llx-%d) set=%d, class=%d, type=%d\n",
2037 MSGID_ELEMS(msg->msg_msgid), setno, class, msgtype);
2039 handler = mdmn_get_handler(msgtype);
2041 if (handler == NULL) {
2042 result->mmr_exitval = 0;
2043 /* let the sender decide if this is an error or not */
2044 result->mmr_comm_state = MDMNE_NO_HANDLER;
2045 commd_debug(MD_MMV_PROC_S,
2046 "proc_sla: No handler for (%d, 0x%llx-%d)\n",
2047 MSGID_ELEMS(msg->msg_msgid));
2048 } else {
2050 /* Did we already process this message ? */
2051 (void) mutex_lock(&mct_mutex[setno][class]);
2052 completed = mdmn_check_completion(msg, result);
2054 if (completed == MDMN_MCT_NOT_DONE) {
2055 /* message not yet processed locally */
2056 commd_debug(MD_MMV_PROC_S,
2057 "proc_sla: calling handler for (%d, 0x%llx-%d)\n",
2058 MSGID_ELEMS(msg->msg_msgid));
2061 * Mark the message as being currently processed,
2062 * so we won't start a second handler for it
2064 (void) mdmn_mark_completion(msg, NULL,
2065 MDMN_MCT_IN_PROGRESS);
2067 (void) mutex_unlock(&mct_mutex[setno][class]);
2068 (*handler)(msg, MD_MSGF_ON_SLAVE, result);
2070 commd_debug(MD_MMV_PROC_S,
2071 "proc_sla: finished handler for (%d, 0x%llx-%d)\n",
2072 MSGID_ELEMS(msg->msg_msgid));
2074 (void) mutex_lock(&mct_mutex[setno][class]);
2075 /* Mark the message as fully done, store the result */
2076 (void) mdmn_mark_completion(msg, result, MDMN_MCT_DONE);
2078 } else if (completed == MDMN_MCT_DONE) {
2079 /* message processed previously, got result from MCT */
2080 commd_debug(MD_MMV_PROC_S,
2081 "proc_sla: result for (%d, 0x%llx-%d) from MCT\n",
2082 MSGID_ELEMS(msg->msg_msgid));
2083 } else if (completed == MDMN_MCT_IN_PROGRESS) {
2085 * If the message is currently being processed,
2086 * we can return here, without sending a result back.
2087 * This will be done by the initial message handling
2088 * thread
2090 (void) mutex_unlock(&mct_mutex[setno][class]);
2091 commd_debug(MD_MMV_PROC_M, "proc_sla: "
2092 "(%d, 0x%llx-%d) is currently being processed\n",
2093 MSGID_ELEMS(msg->msg_msgid), msgtype);
2095 free_msg(msg);
2096 free_result(result);
2097 return;
2098 } else {
2099 /* MCT error occurred (should never happen) */
2100 result->mmr_comm_state = MDMNE_LOG_FAIL;
2101 commd_debug(MD_MMV_PROC_S,
2102 "proc_sla: MCT error for (%d, 0x%llx-%d)\n",
2103 MSGID_ELEMS(msg->msg_msgid));
2105 (void) mutex_unlock(&mct_mutex[setno][class]);
2109 * At this point we have a result (even in an error case)
2110 * that we return to the master.
2112 (void) rw_rdlock(&set_desc_rwlock[setno]);
2113 retries = 2; /* we will try two times to send the results */
2114 successfully_returned = 0;
2116 while (!successfully_returned && (retries != 0)) {
2117 ret = (int *)NULL;
2118 (void) rw_rdlock(&client_rwlock[setno]);
2119 if (check_client(setno, sender)) {
2121 * If we cannot setup the rpc connection to the master,
2122 * we can't do anything besides logging this fact.
2124 commd_debug(MD_MMV_SYSLOG,
2125 "proc_mas: unable to create client for master\n");
2126 (void) rw_unlock(&client_rwlock[setno]);
2127 break;
2128 } else {
2129 ret = mdmn_wakeup_master_2(result,
2130 client[setno][sender], sender);
2132 * if mdmn_wakeup_master_2 returns NULL, it can be that
2133 * the master (or the commd on the master) had died.
2134 * In that case, we destroy the client to the master
2135 * and retry.
2136 * If mdmn_wakeup_master_2 doesn't return MDMNE_ACK,
2137 * the commd on the master is alive but
2138 * something else is wrong,
2139 * in that case a retry doesn't make sense => break out
2141 if (ret == (int *)NULL) {
2142 commd_debug(MD_MMV_PROC_S,
2143 "proc_sla: wakeup_master returned NULL\n");
2144 /* release reader lock, grab writer lock */
2145 (void) rw_unlock(&client_rwlock[setno]);
2146 (void) rw_wrlock(&client_rwlock[setno]);
2147 mdmn_clnt_destroy(client[setno][sender]);
2148 if (client[setno][sender] != (CLIENT *)NULL) {
2149 client[setno][sender] = (CLIENT *)NULL;
2151 (void) rw_unlock(&client_rwlock[setno]);
2152 retries--;
2153 commd_debug(MD_MMV_PROC_S,
2154 "retries = %d\n", retries);
2155 continue;
2157 if (*ret != MDMNE_ACK) {
2158 commd_debug(MD_MMV_PROC_S, "proc_sla: "
2159 "wakeup_master returned %d\n", *ret);
2160 (void) rw_unlock(&client_rwlock[setno]);
2161 break;
2162 } else { /* Good case */
2163 successfully_returned = 1;
2164 (void) rw_unlock(&client_rwlock[setno]);
2169 (void) rw_unlock(&set_desc_rwlock[setno]);
2170 commd_debug(MD_MMV_PROC_S, "proc_sla: done (%d, 0x%llx-%d)\n",
2171 MSGID_ELEMS(msg->msg_msgid));
2173 if (ret != (int *)NULL)
2174 free(ret);
2175 free_msg(msg);
2176 free_result(result);
2181 * mdmn_send_svc_2:
2182 * ---------------
2183 * Check that the issuing node is a legitimate one (i.e. is licensed to send
2184 * messages to us), that the RPC request can be staged.
2186 * Returns:
2187 * 0 => no RPC request is in-flight, no deferred svc_sendreply()
2188 * 1 => queued RPC request in-flight. Completion will be made (later)
2189 * by a wakeup_initiator_2() [hopefully]
2192 mdmn_send_svc_2(md_mn_msg_t *omsg, struct svc_req *rqstp)
2194 int err;
2195 set_t setno;
2196 SVCXPRT *transp = rqstp->rq_xprt;
2197 md_mn_msg_t *msg;
2198 md_mn_result_t *resultp;
2199 md_mn_msgclass_t class;
2200 md_mn_msg_and_transp_t *matp;
2202 msg = copy_msg(omsg, NULL);
2203 xdr_free(xdr_md_mn_msg_t, (caddr_t)omsg);
2205 setno = msg->msg_setno;
2206 class = mdmn_get_message_class(msg->msg_type);
2208 /* If we are in the abort state, we error out immediately */
2209 if (md_commd_global_state & MD_CGS_ABORTED) {
2210 resultp = Zalloc(sizeof (md_mn_result_t));
2211 resultp->mmr_comm_state = MDMNE_ABORT;
2212 mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp);
2213 free_result(resultp);
2214 svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg);
2215 return (0);
2218 /* check if the global initialization is done */
2219 if ((md_commd_global_state & MD_CGS_INITED) == 0) {
2220 global_init();
2223 commd_debug(MD_MMV_SEND,
2224 "send: received (%d, 0x%llx-%d), set=%d, class=%d, type=%d\n",
2225 MSGID_ELEMS(msg->msg_msgid), setno, class, msg->msg_type);
2227 /* Check for verbosity related message */
2228 if (msg->msg_type == MD_MN_MSG_VERBOSITY) {
2229 md_mn_verbose_t *d;
2231 d = (md_mn_verbose_t *)((void *)(msg->msg_event_data));
2232 md_commd_global_verb = d->mmv_what;
2233 /* everytime the bitmask is set, we reset the timer */
2234 __savetime = gethrtime();
2236 * If local-only-flag is set, we are done here,
2237 * otherwise we pass that message on to the master.
2239 if (msg->msg_flags & MD_MSGF_LOCAL_ONLY) {
2240 resultp = Zalloc(sizeof (md_mn_result_t));
2241 resultp->mmr_comm_state = MDMNE_ACK;
2242 mdmn_svc_sendreply(transp, xdr_md_mn_result_t,
2243 (char *)resultp);
2244 free_result(resultp);
2245 svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg);
2246 return (0);
2251 * Are we entering the abort state?
2252 * Here we don't even need to check for MD_MSGF_LOCAL_ONLY, because
2253 * this message cannot be distributed anyway.
2254 * So, it's safe to return immediately.
2256 if (msg->msg_type == MD_MN_MSG_ABORT) {
2257 md_commd_global_state |= MD_CGS_ABORTED;
2258 resultp = Zalloc(sizeof (md_mn_result_t));
2259 resultp->mmr_comm_state = MDMNE_ACK;
2260 mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp);
2261 free_result(resultp);
2262 svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg);
2263 return (0);
2268 * Is this message type blocked?
2269 * If so we return MDMNE_CLASS_LOCKED, immediately
2271 if (msgtype_lock_state[msg->msg_type] == MMTL_LOCK) {
2272 resultp = Zalloc(sizeof (md_mn_result_t));
2273 resultp->mmr_comm_state = MDMNE_CLASS_LOCKED;
2274 mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp);
2275 free_result(resultp);
2276 svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg);
2277 commd_debug(MD_MMV_SEND,
2278 "send: type locked (%d, 0x%llx-%d), set=%d, class=%d, "
2279 "type=%d\n", MSGID_ELEMS(msg->msg_msgid), setno, class,
2280 msg->msg_type);
2281 return (0);
2285 if (md_mn_set_inited[setno] != MDMN_SET_READY) {
2286 /* Can only use the appropriate mutexes if they are inited */
2287 if (md_mn_set_inited[setno] & MDMN_SET_MUTEXES) {
2288 (void) rw_wrlock(&set_desc_rwlock[setno]);
2289 (void) rw_wrlock(&client_rwlock[setno]);
2290 err = mdmn_init_set(setno, MDMN_SET_READY);
2291 (void) rw_unlock(&client_rwlock[setno]);
2292 (void) rw_unlock(&set_desc_rwlock[setno]);
2293 } else {
2294 err = mdmn_init_set(setno, MDMN_SET_READY);
2297 if (err) {
2298 /* couldn't initialize connections, cannot proceed */
2299 resultp = Zalloc(sizeof (md_mn_result_t));
2300 resultp->mmr_comm_state = err;
2301 mdmn_svc_sendreply(transp, xdr_md_mn_result_t,
2302 (char *)resultp);
2303 svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg);
2304 free_result(resultp);
2305 commd_debug(MD_MMV_SEND,
2306 "send: init err = %d\n", err);
2307 return (0);
2311 (void) mutex_lock(&mdmn_busy_mutex[setno]);
2312 if ((mdmn_is_class_suspended(setno, class) == TRUE) &&
2313 ((msg->msg_flags & MD_MSGF_OVERRIDE_SUSPEND) == 0)) {
2314 (void) mutex_unlock(&mdmn_busy_mutex[setno]);
2315 resultp = Zalloc(sizeof (md_mn_result_t));
2316 resultp->mmr_comm_state = MDMNE_SUSPENDED;
2317 mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp);
2318 svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg);
2319 free_result(resultp);
2320 commd_debug(MD_MMV_SEND,
2321 "send: class suspended (%d, 0x%llx-%d), set=%d, "
2322 "class=%d, type=%d\n", MSGID_ELEMS(msg->msg_msgid),
2323 setno, class, msg->msg_type);
2324 return (0);
2326 (void) mutex_unlock(&mdmn_busy_mutex[setno]);
2328 /* is this rpc request coming from the local node? */
2329 if (check_license(rqstp, 0) == FALSE) {
2330 svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg);
2331 commd_debug(MD_MMV_SEND,
2332 "send: check licence fail(%d, 0x%llx-%d), set=%d, "
2333 "class=%d, type=%d\n", MSGID_ELEMS(msg->msg_msgid),
2334 setno, class, msg->msg_type);
2335 return (0);
2340 * We allocate a structure that can take two pointers in order to pass
2341 * both the message and the transp into thread_create.
2342 * The free for this alloc is done in mdmn_send_to_work()
2344 matp = Malloc(sizeof (md_mn_msg_and_transp_t));
2345 matp->mat_msg = msg;
2346 matp->mat_transp = transp;
2349 * create a thread here that calls work on the master.
2350 * If we are already on the master, this would block if running
2351 * in the same context. (our service is single threaded)(
2352 * Make it a detached thread because it will not communicate with
2353 * anybody thru thr_* mechanisms
2355 (void) thr_create(NULL, 0, mdmn_send_to_work, (void *) matp,
2356 THR_DETACHED, NULL);
2358 commd_debug(MD_MMV_SEND, "send: done (%d, 0x%llx-%d)\n",
2359 MSGID_ELEMS(msg->msg_msgid));
2361 * We return here without sending results. This will be done by
2362 * mdmn_wakeup_initiator_svc_2() as soon as the results are available.
2363 * Until then the calling send_message will be blocked, while we
2364 * are able to take calls.
2367 return (1);
2370 /* ARGSUSED */
2371 int *
2372 mdmn_work_svc_2(md_mn_msg_t *omsg, struct svc_req *rqstp)
2374 int err;
2375 set_t setno;
2376 thread_t tid;
2377 int *retval;
2378 md_mn_msg_t *msg;
2379 md_mn_msgclass_t class;
2381 retval = Malloc(sizeof (int));
2383 /* If we are in the abort state, we error out immediately */
2384 if (md_commd_global_state & MD_CGS_ABORTED) {
2385 xdr_free(xdr_md_mn_msg_t, (caddr_t)omsg);
2386 *retval = MDMNE_ABORT;
2387 return (retval);
2390 msg = copy_msg(omsg, NULL);
2391 xdr_free(xdr_md_mn_msg_t, (caddr_t)omsg);
2394 * Is this message type blocked?
2395 * If so we return MDMNE_CLASS_LOCKED, immediately.
2396 * This check is performed on master and slave.
2398 if (msgtype_lock_state[msg->msg_type] == MMTL_LOCK) {
2399 *retval = MDMNE_CLASS_LOCKED;
2400 return (retval);
2403 /* check if the global initialization is done */
2404 if ((md_commd_global_state & MD_CGS_INITED) == 0) {
2405 global_init();
2408 class = mdmn_get_message_class(msg->msg_type);
2409 setno = msg->msg_setno;
2411 if (md_mn_set_inited[setno] != MDMN_SET_READY) {
2412 /* Can only use the appropriate mutexes if they are inited */
2413 if (md_mn_set_inited[setno] & MDMN_SET_MUTEXES) {
2414 (void) rw_wrlock(&set_desc_rwlock[setno]);
2415 (void) rw_wrlock(&client_rwlock[setno]);
2416 err = mdmn_init_set(setno, MDMN_SET_READY);
2417 (void) rw_unlock(&client_rwlock[setno]);
2418 (void) rw_unlock(&set_desc_rwlock[setno]);
2419 } else {
2420 err = mdmn_init_set(setno, MDMN_SET_READY);
2423 if (err) {
2424 *retval = MDMNE_CANNOT_CONNECT;
2425 free_msg(msg);
2426 return (retval);
2430 /* is this rpc request coming from a licensed node? */
2431 if (check_license(rqstp, msg->msg_sender) == FALSE) {
2432 free_msg(msg);
2433 *retval = MDMNE_RPC_FAIL;
2434 return (retval);
2437 commd_debug(MD_MMV_WORK,
2438 "work: received (%d, 0x%llx-%d), set=%d, class=%d, type=%d, "
2439 "flags=0x%x\n",
2440 MSGID_ELEMS(msg->msg_msgid), setno, class, msg->msg_type,
2441 msg->msg_flags);
2443 /* Check for various CLASS0 message types */
2444 if (msg->msg_type == MD_MN_MSG_VERBOSITY) {
2445 md_mn_verbose_t *d;
2447 d = (md_mn_verbose_t *)((void *)(msg->msg_event_data));
2448 /* for now we ignore set / class in md_mn_verbose_t */
2449 md_commd_global_verb = d->mmv_what;
2450 /* everytime the bitmask is set, we reset the timer */
2451 __savetime = gethrtime();
2454 (void) mutex_lock(&mdmn_busy_mutex[setno]);
2456 /* check if class is locked via a call to mdmn_comm_lock_svc_2 */
2457 if (mdmn_is_class_locked(setno, class) == TRUE) {
2458 (void) mutex_unlock(&mdmn_busy_mutex[setno]);
2459 *retval = MDMNE_CLASS_LOCKED;
2460 free_msg(msg);
2461 return (retval);
2463 (void) mutex_unlock(&mdmn_busy_mutex[setno]);
2465 /* Check if the class is busy right now. Do it only on the master */
2466 (void) rw_rdlock(&set_desc_rwlock[setno]);
2467 if (set_descriptor[setno]->sd_mn_am_i_master) {
2468 (void) rw_unlock(&set_desc_rwlock[setno]);
2470 * If the class is currently suspended, don't accept new
2471 * messages, unless they are flagged with an override bit.
2473 (void) mutex_lock(&mdmn_busy_mutex[setno]);
2474 if ((mdmn_is_class_suspended(setno, class) == TRUE) &&
2475 ((msg->msg_flags & MD_MSGF_OVERRIDE_SUSPEND) == 0)) {
2476 (void) mutex_unlock(&mdmn_busy_mutex[setno]);
2477 *retval = MDMNE_SUSPENDED;
2478 commd_debug(MD_MMV_SEND,
2479 "send: set %d is suspended\n", setno);
2480 free_msg(msg);
2481 return (retval);
2483 if (mdmn_mark_class_busy(setno, class) == FALSE) {
2484 (void) mutex_unlock(&mdmn_busy_mutex[setno]);
2485 *retval = MDMNE_CLASS_BUSY;
2486 free_msg(msg);
2487 return (retval);
2489 (void) mutex_unlock(&mdmn_busy_mutex[setno]);
2491 * Because the real processing of the message takes time we
2492 * create a thread for it. So the master thread can continue
2493 * to run and accept further messages.
2495 *retval = thr_create(NULL, 0,
2496 (void *(*)(void *))mdmn_master_process_msg, (void *)msg,
2497 THR_DETACHED|THR_SUSPENDED, &tid);
2498 } else {
2499 (void) rw_unlock(&set_desc_rwlock[setno]);
2500 *retval = thr_create(NULL, 0,
2501 (void *(*)(void *)) mdmn_slave_process_msg, (void *)msg,
2502 THR_DETACHED|THR_SUSPENDED, &tid);
2505 if (*retval != 0) {
2506 *retval = MDMNE_THR_CREATE_FAIL;
2507 free_msg(msg);
2508 return (retval);
2511 /* Now run the new thread */
2512 (void) thr_continue(tid);
2514 commd_debug(MD_MMV_WORK,
2515 "work: done (%d, 0x%llx-%d), set=%d, class=%d, type=%d\n",
2516 MSGID_ELEMS(msg->msg_msgid), setno, class, msg->msg_type);
2518 *retval = MDMNE_ACK; /* this means success */
2519 return (retval);
2522 /* ARGSUSED */
2523 int *
2524 mdmn_wakeup_initiator_svc_2(md_mn_result_t *res, struct svc_req *rqstp)
2527 int *retval;
2528 int err;
2529 set_t setno;
2530 mutex_t *mx; /* protection of initiator_table */
2531 SVCXPRT *transp = NULL;
2532 md_mn_msgid_t initiator_table_id;
2533 md_mn_msgclass_t class;
2535 retval = Malloc(sizeof (int));
2537 /* check if the global initialization is done */
2538 if ((md_commd_global_state & MD_CGS_INITED) == 0) {
2539 global_init();
2542 setno = res->mmr_setno;
2544 if (md_mn_set_inited[setno] != MDMN_SET_READY) {
2545 /* set not ready means we just crashed are restarted now */
2546 /* Can only use the appropriate mutexes if they are inited */
2547 if (md_mn_set_inited[setno] & MDMN_SET_MUTEXES) {
2548 (void) rw_wrlock(&set_desc_rwlock[setno]);
2549 (void) rw_wrlock(&client_rwlock[setno]);
2550 err = mdmn_init_set(setno, MDMN_SET_READY);
2551 (void) rw_unlock(&client_rwlock[setno]);
2552 (void) rw_unlock(&set_desc_rwlock[setno]);
2553 } else {
2554 err = mdmn_init_set(setno, MDMN_SET_READY);
2557 if (err) {
2558 *retval = MDMNE_CANNOT_CONNECT;
2559 xdr_free(xdr_md_mn_result_t, (caddr_t)res);
2560 return (retval);
2564 /* is this rpc request coming from a licensed node? */
2565 if (check_license(rqstp, res->mmr_sender) == FALSE) {
2566 xdr_free(xdr_md_mn_result_t, (caddr_t)res);
2567 *retval = MDMNE_RPC_FAIL;
2568 return (retval);
2572 class = mdmn_get_message_class(res->mmr_msgtype);
2573 mx = mdmn_get_initiator_table_mx(setno, class);
2575 commd_debug(MD_MMV_WAKE_I,
2576 "wake_ini: received (%d, 0x%llx-%d) set=%d, class=%d, type=%d\n",
2577 MSGID_ELEMS(res->mmr_msgid), setno, class, res->mmr_msgtype);
2579 (void) mutex_lock(mx);
2582 * Search the initiator wakeup table.
2583 * If we find an entry here (which should always be true)
2584 * we are on the initiating node and we wakeup the original
2585 * local rpc call.
2587 mdmn_get_initiator_table_id(setno, class, &initiator_table_id);
2589 if (MSGID_CMP(&(initiator_table_id), &(res->mmr_msgid))) {
2590 transp = mdmn_get_initiator_table_transp(setno, class);
2591 mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)res);
2592 svc_done(transp);
2593 mdmn_unregister_initiator_table(setno, class);
2594 *retval = MDMNE_ACK;
2596 commd_debug(MD_MMV_WAKE_I,
2597 "wake_ini: replied (%d, 0x%llx-%d)\n",
2598 MSGID_ELEMS(res->mmr_msgid));
2599 } else {
2600 commd_debug(MD_MMV_WAKE_I,
2601 "wakeup initiator: unsolicited message (%d, 0x%llx-%d)\n",
2602 MSGID_ELEMS(res->mmr_msgid));
2603 *retval = MDMNE_NO_WAKEUP_ENTRY;
2605 (void) mutex_unlock(mx);
2606 /* less work for check_timeouts */
2607 (void) mutex_lock(&check_timeout_mutex);
2608 if (messages_on_their_way == 0) {
2609 commd_debug(MD_MMV_WAKE_I,
2610 "Oops, messages_on_their_way < 0 (%d, 0x%llx-%d)\n",
2611 MSGID_ELEMS(res->mmr_msgid));
2612 } else {
2613 messages_on_their_way--;
2615 (void) mutex_unlock(&check_timeout_mutex);
2616 xdr_free(xdr_md_mn_result_t, (caddr_t)res);
2618 return (retval);
2623 * res must be free'd by the thread we wake up
2625 /* ARGSUSED */
2626 int *
2627 mdmn_wakeup_master_svc_2(md_mn_result_t *ores, struct svc_req *rqstp)
2630 int *retval;
2631 int err;
2632 set_t setno;
2633 cond_t *cv;
2634 mutex_t *mx;
2635 md_mn_msgid_t master_table_id;
2636 md_mn_nodeid_t sender;
2637 md_mn_result_t *res;
2638 md_mn_msgclass_t class;
2640 retval = Malloc(sizeof (int));
2642 /* check if the global initialization is done */
2643 if ((md_commd_global_state & MD_CGS_INITED) == 0) {
2644 global_init();
2647 /* Need to copy the results here, as they are static for RPC */
2648 res = copy_result(ores);
2649 xdr_free(xdr_md_mn_result_t, (caddr_t)ores);
2651 class = mdmn_get_message_class(res->mmr_msgtype);
2652 setno = res->mmr_setno;
2654 if (md_mn_set_inited[setno] != MDMN_SET_READY) {
2655 /* set not ready means we just crashed are restarted now */
2656 /* Can only use the appropriate mutexes if they are inited */
2657 if (md_mn_set_inited[setno] & MDMN_SET_MUTEXES) {
2658 (void) rw_wrlock(&set_desc_rwlock[setno]);
2659 (void) rw_wrlock(&client_rwlock[setno]);
2660 err = mdmn_init_set(setno, MDMN_SET_READY);
2661 (void) rw_unlock(&client_rwlock[setno]);
2662 (void) rw_unlock(&set_desc_rwlock[setno]);
2663 } else {
2664 err = mdmn_init_set(setno, MDMN_SET_READY);
2667 if (err) {
2668 *retval = MDMNE_CANNOT_CONNECT;
2669 xdr_free(xdr_md_mn_result_t, (caddr_t)res);
2670 return (retval);
2674 /* is this rpc request coming from a licensed node? */
2675 if (check_license(rqstp, res->mmr_sender) == FALSE) {
2676 *retval = MDMNE_RPC_FAIL;
2677 xdr_free(xdr_md_mn_result_t, (caddr_t)res);
2678 return (retval);
2682 commd_debug(MD_MMV_WAKE_M,
2683 "wake_mas: received (%d, 0x%llx-%d) set=%d, class=%d, type=%d "
2684 "from %d\n",
2685 MSGID_ELEMS(res->mmr_msgid), setno, class, res->mmr_msgtype,
2686 res->mmr_sender);
2688 * The mutex and cv are needed for waking up the thread
2689 * sleeping in mdmn_master_process_msg()
2691 mx = mdmn_get_master_table_mx(setno, class);
2692 cv = mdmn_get_master_table_cv(setno, class);
2695 * lookup the master wakeup table
2696 * If we find our message, we are on the master and
2697 * called by a slave that finished processing a message.
2698 * We store the results in the appropriate slot and
2699 * wakeup the thread (mdmn_master_process_msg()) waiting for them.
2701 (void) mutex_lock(mx);
2702 mdmn_get_master_table_id(setno, class, &master_table_id);
2703 sender = mdmn_get_master_table_addr(setno, class);
2705 if (MSGID_CMP(&(master_table_id), &(res->mmr_msgid))) {
2706 if (sender == res->mmr_sender) {
2707 mdmn_set_master_table_res(setno, class, res);
2708 (void) cond_signal(cv);
2709 *retval = MDMNE_ACK;
2710 } else {
2711 /* id is correct but wrong sender (I smell a timeout) */
2712 commd_debug(MD_MMV_WAKE_M,
2713 "wakeup master got unsolicited message: "
2714 "(%d, 0x%llx-%d) from %d\n",
2715 MSGID_ELEMS(res->mmr_msgid), res->mmr_sender);
2716 free_result(res);
2717 *retval = MDMNE_TIMEOUT;
2719 } else {
2720 /* id is wrong, smells like a very late timeout */
2721 commd_debug(MD_MMV_WAKE_M,
2722 "wakeup master got unsolicited message: "
2723 "(%d, 0x%llx-%d) from %d, expected (%d, 0x%llx-%d)\n",
2724 MSGID_ELEMS(res->mmr_msgid), res->mmr_sender,
2725 MSGID_ELEMS(master_table_id));
2726 free_result(res);
2727 *retval = MDMNE_NO_WAKEUP_ENTRY;
2730 (void) mutex_unlock(mx);
2732 return (retval);
2736 * Lock a set/class combination.
2737 * This is mainly done for debug purpose.
2738 * This set/class combination immediately is blocked,
2739 * even in the middle of sending messages to multiple slaves.
2740 * This remains until the user issues a mdmn_comm_unlock_svc_2 for the same
2741 * set/class combination.
2743 * Special messages of class MD_MSG_CLASS0 can never be locked.
2744 * e.g. MD_MN_MSG_VERBOSITY, MD_MN_MSG_ABORT
2746 * That means, if MD_MSG_CLASS0 is specified, we lock all classes from
2747 * >= MD_MSG_CLASS1 to < MD_MN_NCLASSES
2749 * set must be between 1 and MD_MAXSETS
2750 * class can be:
2751 * MD_MSG_CLASS0 which means all other classes in this case
2752 * or one specific class (< MD_MN_NCLASSES)
2754 * Returns:
2755 * MDMNE_ACK on sucess (locking a locked class is Ok)
2756 * MDMNE_EINVAL if a parameter is out of range
2759 /* ARGSUSED */
2760 int *
2761 mdmn_comm_lock_svc_2(md_mn_set_and_class_t *msc, struct svc_req *rqstp)
2763 int *retval;
2764 set_t setno = msc->msc_set;
2765 md_mn_msgclass_t class = msc->msc_class;
2767 retval = Malloc(sizeof (int));
2769 /* check if the global initialization is done */
2770 if ((md_commd_global_state & MD_CGS_INITED) == 0) {
2771 global_init();
2774 /* is this rpc request coming from the local node ? */
2775 if (check_license(rqstp, 0) == FALSE) {
2776 xdr_free(xdr_md_mn_set_and_class_t, (caddr_t)msc);
2777 *retval = MDMNE_RPC_FAIL;
2778 return (retval);
2781 /* Perform some range checking */
2782 if ((setno == 0) || (setno >= MD_MAXSETS) ||
2783 (class < MD_MSG_CLASS0) || (class >= MD_MN_NCLASSES)) {
2784 *retval = MDMNE_EINVAL;
2785 return (retval);
2788 commd_debug(MD_MMV_MISC, "lock: set=%d, class=%d\n", setno, class);
2789 (void) mutex_lock(&mdmn_busy_mutex[setno]);
2790 if (class != MD_MSG_CLASS0) {
2791 mdmn_mark_class_locked(setno, class);
2792 } else {
2793 /* MD_MSG_CLASS0 is used as a wild card for all classes */
2794 for (class = MD_MSG_CLASS1; class < MD_MN_NCLASSES; class++) {
2795 mdmn_mark_class_locked(setno, class);
2798 (void) mutex_unlock(&mdmn_busy_mutex[setno]);
2800 *retval = MDMNE_ACK;
2801 return (retval);
2805 * Unlock a set/class combination.
2806 * set must be between 1 and MD_MAXSETS
2807 * class can be:
2808 * MD_MSG_CLASS0 which means all other classes in this case (like above)
2809 * or one specific class (< MD_MN_NCLASSES)
2811 * Returns:
2812 * MDMNE_ACK on sucess (unlocking an unlocked class is Ok)
2813 * MDMNE_EINVAL if a parameter is out of range
2815 /* ARGSUSED */
2816 int *
2817 mdmn_comm_unlock_svc_2(md_mn_set_and_class_t *msc, struct svc_req *rqstp)
2819 int *retval;
2820 set_t setno = msc->msc_set;
2821 md_mn_msgclass_t class = msc->msc_class;
2823 retval = Malloc(sizeof (int));
2825 /* check if the global initialization is done */
2826 if ((md_commd_global_state & MD_CGS_INITED) == 0) {
2827 global_init();
2830 /* is this rpc request coming from the local node ? */
2831 if (check_license(rqstp, 0) == FALSE) {
2832 xdr_free(xdr_md_mn_set_and_class_t, (caddr_t)msc);
2833 *retval = MDMNE_RPC_FAIL;
2834 return (retval);
2837 /* Perform some range checking */
2838 if ((setno == 0) || (setno >= MD_MAXSETS) ||
2839 (class < MD_MSG_CLASS0) || (class >= MD_MN_NCLASSES)) {
2840 *retval = MDMNE_EINVAL;
2841 return (retval);
2843 commd_debug(MD_MMV_MISC, "unlock: set=%d, class=%d\n", setno, class);
2845 (void) mutex_lock(&mdmn_busy_mutex[setno]);
2846 if (class != MD_MSG_CLASS0) {
2847 mdmn_mark_class_unlocked(setno, class);
2848 } else {
2849 /* MD_MSG_CLASS0 is used as a wild card for all classes */
2850 for (class = MD_MSG_CLASS1; class < MD_MN_NCLASSES; class++) {
2851 mdmn_mark_class_unlocked(setno, class);
2854 (void) mutex_unlock(&mdmn_busy_mutex[setno]);
2856 *retval = MDMNE_ACK;
2857 return (retval);
2861 * mdmn_comm_suspend_svc_2(setno, class)
2863 * Drain all outstanding messages for a given set/class combination
2864 * and don't allow new messages to be processed.
2866 * Special messages of class MD_MSG_CLASS0 can never be locked.
2867 * e.g. MD_MN_MSG_VERBOSITY
2869 * 1 <= setno < MD_MAXSETS or setno == MD_COMM_ALL_SETS
2870 * 1 <= class < MD_MN_NCLASSES or class == MD_COMM_ALL_CLASSES
2872 * If class _is_not_ MD_COMM_ALL_CLASSES, then we simply mark this
2873 * one class as being suspended.
2874 * If messages for this class are currently on their way,
2875 * MDMNE_SET_NOT_DRAINED is returned. Otherwise MDMNE_ACK is returned.
2877 * If class _is_ MD_COMM_ALL_CLASSES we drain all classes of this set.
2878 * Messages must be generated in ascending order.
2879 * This means, a message cannot create submessages with the same or lower class.
2880 * Draining messages must go from 1 to NCLASSES in order to ensure we don't
2881 * generate a hanging situation here.
2882 * We mark class 1 as being suspended.
2883 * if the class is not busy, we proceed with class 2
2884 * and so on
2885 * if a class *is* busy, we cannot continue here, but return
2886 * MDMNE_SET_NOT_DRAINED.
2887 * We expect the caller to hold on for some seconds and try again.
2888 * When that message, that held the class busy is done in
2889 * mdmn_master_process_msg(), mdmn_mark_class_unbusy() called.
2890 * There it is checked if the class is about to drain.
2891 * In that case it tries to drain all higher classes there.
2893 * If setno is MD_COMM_ALL_SETS then we perform this on all possible sets.
2894 * In that case we return MDMNE_SET_NOT_DRAINED if not all sets are
2895 * completely drained.
2897 * Returns:
2898 * MDMNE_ACK on sucess (set is drained, no outstanding messages)
2899 * MDMNE_SET_NOT_DRAINED if drain process is started, but there are
2900 * still outstanding messages for this set(s)
2901 * MDMNE_EINVAL if setno is out of range
2902 * MDMNE_NOT_JOINED if the set is not yet initialized on this node
2905 /* ARGSUSED */
2906 int *
2907 mdmn_comm_suspend_svc_2(md_mn_set_and_class_t *msc, struct svc_req *rqstp)
2909 int *retval;
2910 int failure = 0;
2911 set_t startset, endset;
2912 set_t setno = msc->msc_set;
2913 md_mn_msgclass_t oclass = msc->msc_class;
2914 #ifdef NOT_YET_NEEDED
2915 uint_t flags = msc->msc_flags;
2916 #endif /* NOT_YET_NEEDED */
2917 md_mn_msgclass_t class;
2919 retval = Malloc(sizeof (int));
2921 /* check if the global initialization is done */
2922 if ((md_commd_global_state & MD_CGS_INITED) == 0) {
2923 global_init();
2926 /* is this rpc request coming from the local node ? */
2927 if (check_license(rqstp, 0) == FALSE) {
2928 xdr_free(xdr_md_mn_set_and_class_t, (caddr_t)msc);
2929 *retval = MDMNE_RPC_FAIL;
2930 return (retval);
2933 commd_debug(MD_MMV_MISC, "suspend: called for set=%d class=%d\n",
2934 setno, oclass);
2936 /* Perform some range checking */
2937 if (setno >= MD_MAXSETS) {
2938 *retval = MDMNE_EINVAL;
2939 commd_debug(MD_MMV_MISC, "suspend: returning MDMNE_EINVAL\n");
2940 return (retval);
2943 /* setno == MD_COMM_ALL_SETS means: we walk thru all possible sets. */
2944 if (setno == MD_COMM_ALL_SETS) {
2945 startset = 1;
2946 endset = MD_MAXSETS - 1;
2947 } else {
2948 startset = setno;
2949 endset = setno;
2952 for (setno = startset; setno <= endset; setno++) {
2953 /* Here we need the mutexes for the set to be setup */
2954 if (md_mn_set_inited[setno] != MDMN_SET_MUTEXES) {
2955 (void) mdmn_init_set(setno, MDMN_SET_MUTEXES);
2958 (void) mutex_lock(&mdmn_busy_mutex[setno]);
2959 /* shall we drain all classes of this set? */
2960 if (oclass == MD_COMM_ALL_CLASSES) {
2961 for (class = 1; class < MD_MN_NCLASSES; class ++) {
2962 commd_debug(MD_MMV_MISC,
2963 "suspend: suspending set %d, class %d\n",
2964 setno, class);
2965 *retval = mdmn_mark_class_suspended(setno,
2966 class, MDMN_SUSPEND_ALL);
2967 if (*retval == MDMNE_SET_NOT_DRAINED) {
2968 failure++;
2971 } else {
2972 /* only drain one specific class */
2973 commd_debug(MD_MMV_MISC,
2974 "suspend: suspending set=%d class=%d\n",
2975 setno, oclass);
2976 *retval = mdmn_mark_class_suspended(setno, oclass,
2977 MDMN_SUSPEND_1);
2978 if (*retval == MDMNE_SET_NOT_DRAINED) {
2979 failure++;
2982 (void) mutex_unlock(&mdmn_busy_mutex[setno]);
2984 /* If one or more sets are not entirely drained, failure is non-zero */
2985 if (failure != 0) {
2986 *retval = MDMNE_SET_NOT_DRAINED;
2987 commd_debug(MD_MMV_MISC,
2988 "suspend: returning MDMNE_SET_NOT_DRAINED\n");
2989 } else {
2990 *retval = MDMNE_ACK;
2993 return (retval);
2997 * mdmn_comm_resume_svc_2(setno, class)
2999 * Resume processing messages for a given set.
3000 * This incorporates the repeal of a previous suspend operation.
3002 * 1 <= setno < MD_MAXSETS or setno == MD_COMM_ALL_SETS
3003 * 1 <= class < MD_MN_NCLASSES or class == MD_COMM_ALL_CLASSES
3005 * If class _is_not_ MD_COMM_ALL_CLASSES, then we simply mark this
3006 * one class as being resumed.
3008 * If class _is_ MD_COMM_ALL_CLASSES we resume all classes of this set.
3010 * If setno is MD_COMM_ALL_SETS then we perform this on all possible sets.
3012 * If both setno is MD_COMM_ALL_SETS and class is MD_COMM_ALL_CLASSES we also
3013 * reset any ABORT flag from the global state.
3015 * Returns:
3016 * MDMNE_ACK on sucess (resuming an unlocked set is Ok)
3017 * MDMNE_EINVAL if setno is out of range
3018 * MDMNE_NOT_JOINED if the set is not yet initialized on this node
3020 /* ARGSUSED */
3021 int *
3022 mdmn_comm_resume_svc_2(md_mn_set_and_class_t *msc, struct svc_req *rqstp)
3024 int *retval;
3025 set_t startset, endset;
3026 set_t setno = msc->msc_set;
3027 md_mn_msgclass_t oclass = msc->msc_class;
3028 uint_t flags = msc->msc_flags;
3029 md_mn_msgclass_t class;
3031 retval = Malloc(sizeof (int));
3033 /* check if the global initialization is done */
3034 if ((md_commd_global_state & MD_CGS_INITED) == 0) {
3035 global_init();
3038 /* is this rpc request coming from the local node ? */
3039 if (check_license(rqstp, 0) == FALSE) {
3040 xdr_free(xdr_md_mn_set_and_class_t, (caddr_t)msc);
3041 *retval = MDMNE_RPC_FAIL;
3042 return (retval);
3045 commd_debug(MD_MMV_MISC, "resume: called for set=%d class=%d\n",
3046 setno, oclass);
3048 /* Perform some range checking */
3049 if (setno > MD_MAXSETS) {
3050 *retval = MDMNE_EINVAL;
3051 return (retval);
3054 if (setno == MD_COMM_ALL_SETS) {
3055 startset = 1;
3056 endset = MD_MAXSETS - 1;
3057 if (oclass == MD_COMM_ALL_CLASSES) {
3058 /* This is the point where we "unabort" the commd */
3059 commd_debug(MD_MMV_MISC, "resume: resetting ABORT\n");
3060 md_commd_global_state &= ~MD_CGS_ABORTED;
3062 } else {
3063 startset = setno;
3064 endset = setno;
3067 for (setno = startset; setno <= endset; setno++) {
3069 /* Here we need the mutexes for the set to be setup */
3070 if ((md_mn_set_inited[setno] & MDMN_SET_MUTEXES) == 0) {
3071 (void) mdmn_init_set(setno, MDMN_SET_MUTEXES);
3074 (void) mutex_lock(&mdmn_busy_mutex[setno]);
3076 if (oclass == MD_COMM_ALL_CLASSES) {
3077 int end_class = 1;
3079 * When SUSPENDing all classes, we go
3080 * from 1 to MD_MN_NCLASSES-1
3081 * The correct reverse action is RESUMing
3082 * from MD_MN_NCLASSES-1 to 1 (or 2)
3085 if (flags & MD_MSCF_DONT_RESUME_CLASS1) {
3086 end_class = 2;
3090 * Then mark all classes of this set as no longer
3091 * suspended. This supersedes any previous suspend(1)
3092 * calls and resumes the set entirely.
3094 for (class = MD_MN_NCLASSES - 1; class >= end_class;
3095 class --) {
3096 commd_debug(MD_MMV_MISC,
3097 "resume: resuming set=%d class=%d\n",
3098 setno, class);
3099 mdmn_mark_class_resumed(setno, class,
3100 (MDMN_SUSPEND_ALL | MDMN_SUSPEND_1));
3102 } else {
3104 * In this case only one class is marked as not
3105 * suspended. If a suspend(all) is currently active for
3106 * this set, this class will still be suspended.
3107 * That state will be cleared by a resume(all)
3108 * (see above)
3110 commd_debug(MD_MMV_MISC,
3111 "resume: resuming set=%d class=%d\n",
3112 setno, oclass);
3113 mdmn_mark_class_resumed(setno, oclass, MDMN_SUSPEND_1);
3116 (void) mutex_unlock(&mdmn_busy_mutex[setno]);
3119 *retval = MDMNE_ACK;
3120 return (retval);
3122 /* ARGSUSED */
3123 int *
3124 mdmn_comm_reinit_set_svc_2(set_t *setnop, struct svc_req *rqstp)
3126 int *retval;
3127 md_mnnode_desc *node;
3128 set_t setno = *setnop;
3130 retval = Malloc(sizeof (int));
3132 /* check if the global initialization is done */
3133 if ((md_commd_global_state & MD_CGS_INITED) == 0) {
3134 global_init();
3137 /* is this rpc request coming from the local node ? */
3138 if (check_license(rqstp, 0) == FALSE) {
3139 xdr_free(xdr_set_t, (caddr_t)setnop);
3140 *retval = MDMNE_RPC_FAIL;
3141 return (retval);
3144 commd_debug(MD_MMV_MISC, "reinit: set=%d\n", setno);
3146 (void) rw_rdlock(&set_desc_rwlock[setno]);
3148 * We assume, that all messages have been suspended previously.
3150 * As we are modifying lots of clients here we grab the client_rwlock
3151 * in writer mode. This ensures, no new messages come in.
3153 (void) rw_wrlock(&client_rwlock[setno]);
3154 /* This set is no longer initialized */
3156 if ((set_descriptor[setno] != NULL) &&
3157 (md_mn_set_inited[setno] & MDMN_SET_NODES)) {
3158 /* destroy all rpc clients from this set */
3159 for (node = set_descriptor[setno]->sd_nodelist; node;
3160 node = node->nd_next) {
3162 * Since the CLIENT for ourself will be recreated
3163 * shortly, and this node is guaranteed to be
3164 * there after a reconfig, there's no reason to go
3165 * through destroying it. It also avoids an issue
3166 * with calling clnt_create() later from within the
3167 * server thread, which can effectively deadlock
3168 * itself due to RPC design limitations.
3170 if (node == set_descriptor[setno]->sd_mn_mynode)
3171 continue;
3172 mdmn_clnt_destroy(client[setno][node->nd_nodeid]);
3173 if (client[setno][node->nd_nodeid] != (CLIENT *)NULL) {
3174 client[setno][node->nd_nodeid] = (CLIENT *)NULL;
3177 md_mn_set_inited[setno] &= ~MDMN_SET_NODES;
3180 commd_debug(MD_MMV_MISC, "reinit: done init_set(%d)\n", setno);
3182 (void) rw_unlock(&client_rwlock[setno]);
3183 (void) rw_unlock(&set_desc_rwlock[setno]);
3184 *retval = MDMNE_ACK;
3185 return (retval);
3189 * This is just an interface for testing purpose.
3190 * Here we can disable single message types.
3191 * If we block a message type, this is valid for all MN sets.
3192 * If a message arrives later, and its message type is blocked, it will
3193 * be returned immediately with MDMNE_CLASS_LOCKED, which causes the sender to
3194 * resend this message over and over again.
3197 /* ARGSUSED */
3198 int *
3199 mdmn_comm_msglock_svc_2(md_mn_type_and_lock_t *mmtl, struct svc_req *rqstp)
3201 int *retval;
3202 md_mn_msgtype_t type = mmtl->mmtl_type;
3203 uint_t lock = mmtl->mmtl_lock;
3205 retval = Malloc(sizeof (int));
3207 /* check if the global initialization is done */
3208 if ((md_commd_global_state & MD_CGS_INITED) == 0) {
3209 global_init();
3212 /* is this rpc request coming from the local node ? */
3213 if (check_license(rqstp, 0) == FALSE) {
3214 xdr_free(xdr_md_mn_type_and_lock_t, (caddr_t)mmtl);
3215 *retval = MDMNE_RPC_FAIL;
3216 return (retval);
3219 /* Perform some range checking */
3220 if ((type == 0) || (type >= MD_MN_NMESSAGES)) {
3221 *retval = MDMNE_EINVAL;
3222 return (retval);
3225 commd_debug(MD_MMV_MISC, "msglock: type=%d, lock=%d\n", type, lock);
3226 msgtype_lock_state[type] = lock;
3228 *retval = MDMNE_ACK;
3229 return (retval);