4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
27 #include <sys/types.h>
29 #include <sys/statvfs.h>
30 #include <sys/uadmin.h>
31 #include <sys/resource.h>
37 #include <mdmn_changelog.h>
38 #include "mdmn_subr.h"
41 * This is the communication daemon for SVM Multi Node Disksets.
42 * It runs on every node and provides the following rpc services:
45 * - mdmn_wakeup_initiator_svc_2
46 * - mdmn_wakeup_master_svc_2
47 * - mdmn_comm_lock_svc_2
48 * - mdmn_comm_unlock_svc_2
49 * - mdmn_comm_suspend_svc_2
50 * - mdmn_comm_resume_svc_2
51 * - mdmn_comm_reinit_set_svc_2
52 * where send, lock, unlock and reinit are meant for external use,
53 * work and the two wakeups are for internal use only.
56 * On every node only one of those xxx_2 functions can be active at the
57 * same time because the daemon is single threaded.
59 * (not quite true, as mdmn_send_svc_2 and mdmn_work_svc_2 do thr_create()s
60 * as part of their handlers, so those aspects are multi-threaded)
62 * In case an event occurs that has to be propagated to all the nodes...
64 * One node (the initiator)
65 * calls the libmeta function mdmn_send_message()
66 * This function calls the local daemon thru mdmn_send_svc_2.
70 * - starts a thread -> mdmn_send_to_work() and returns.
72 * - sends this message over to the master of the diskset.
73 * This is done by calling mdmn_work_svc_2 on the master.
74 * - registers to the initiator_table
75 * - exits without doing a svc_sendreply() for the call to
76 * mdmn_send_svc_2. This means that call is blocked until somebody
77 * (see end of this comment) does a svc_sendreply().
78 * This means mdmn_send_message() does not yet return.
79 * - A timeout surveillance is started at this point.
80 * This means in case the master doesn't reply at all in an
81 * appropriate time, an error condition is returned
86 * - starts a thread -> mdmn_master_process_msg() and returns
87 * mdmn_master_process_msg()
88 * - logs the message to the change log
89 * - executes the message locally
90 * - flags the message in the change log
91 * - sends the message to mdmn_work_svc_2() on all the
92 * other nodes (slaves)
93 * after each call to mdmn_work_svc_2 the thread goes to sleep and
94 * will be woken up by mdmn_wakeup_master_svc_2() as soon as the
95 * slave node is done with this message.
96 * - In case the slave doesn't respond in an appropriate time, an error
97 * is assumed to ensure the master doesn't wait forever.
101 * - starts a thread -> mdmn_slave_process_msg() and returns
102 * mdmn_slave_process_msg()
103 * - processes this message locally by calling the appropriate message
104 * handler, that creates some result.
105 * - sends that result thru a call to mdmn_wakeup_master_svc_2() to
108 * Back on the master:
109 * mdmn_wakeup_master_svc_2()
110 * - stores the result into the master_table.
111 * - signals the mdmn_master_process_msg-thread.
113 * mdmn_master_process_msg()
114 * - after getting the results from all nodes
115 * - sends them back to the initiating node thru a call to
116 * mdmn_wakeup_initiator_svc_2.
118 * Back on the initiator:
119 * mdmn_wakeup_initiator_svc_2()
120 * - calls svc_sendreply() which makes the call to mdmn_send_svc_2()
122 * which allows the initial mdmn_send_message() call to return.
125 FILE *commdout
; /* debug output for the commd */
126 char *commdoutfile
; /* file name for the above output */
127 /* want at least 10 MB free space when logging into a file */
128 #define MIN_FS_SPACE (10LL * 1024 * 1024)
131 * Number of outstanding messages that were initiated by this node.
132 * If zero, check_timeouts goes to sleep
134 uint_t messages_on_their_way
;
135 mutex_t check_timeout_mutex
; /* need mutex to protect above */
136 cond_t check_timeout_cv
; /* trigger for check_timeouts */
138 /* for printing out time stamps */
141 /* RPC clients for every set and every node and their protecting locks */
142 CLIENT
*client
[MD_MAXSETS
][NNODES
];
143 rwlock_t client_rwlock
[MD_MAXSETS
];
145 /* the descriptors of all possible sets and their protectors */
146 struct md_set_desc
*set_descriptor
[MD_MAXSETS
];
147 rwlock_t set_desc_rwlock
[MD_MAXSETS
];
149 /* the daemon to daemon communication has to timeout quickly */
150 static struct timeval FOUR_SECS
= { 4, 0 };
152 /* These indicate if a set has already been setup */
153 int md_mn_set_inited
[MD_MAXSETS
];
155 /* For every set we have a message completion table and protecting mutexes */
156 md_mn_mct_t
*mct
[MD_MAXSETS
];
157 mutex_t mct_mutex
[MD_MAXSETS
][MD_MN_NCLASSES
];
159 /* Stuff to describe the global status of the commd on one node */
160 #define MD_CGS_INITED 0x0001
161 #define MD_CGS_ABORTED 0x0002 /* return everything with MDMNE_ABORT */
162 uint_t md_commd_global_state
= 0; /* No state when starting up */
165 * Global verbosity level for the daemon
167 uint_t md_commd_global_verb
;
170 * libmeta doesn't like multiple threads in metaget_setdesc().
171 * So we must protect access to it with a global lock
173 mutex_t get_setdesc_mutex
;
176 * Need a way to block single message types,
177 * hence an array with a status for every message type
179 uint_t msgtype_lock_state
[MD_MN_NMESSAGES
];
181 /* for reading in the config file */
182 #define MAX_LINE_SIZE 1024
184 extern char *commd_get_outfile(void);
185 extern uint_t
commd_get_verbosity(void);
188 * mdmn_clnt_create is a helper function for meta_client_create_retry. It
189 * merely needs to call clnt_create_timed, and meta_client_create_retry
190 * will take care of the rest.
194 mdmn_clnt_create(char *ignore
, void *data
, struct timeval
*time_out
)
196 md_mnnode_desc
*node
= (md_mnnode_desc
*)data
;
198 return (clnt_create_timed(node
->nd_priv_ic
, MDMN_COMMD
, TWO
, "tcp",
202 #define FLUSH_DEBUGFILE() \
203 if (commdout != (FILE *)NULL) { \
204 (void) fflush(commdout); \
205 (void) fsync(fileno(commdout)); \
209 panic_system(int nid
, md_mn_msgtype_t type
, int master_err
, int master_exitval
,
210 md_mn_result_t
*slave_result
)
212 md_mn_commd_err_t commd_err
;
213 md_error_t mne
= mdnullerror
;
216 msg_buf
= (char *)calloc(MAXPATHLEN
+ 1, sizeof (char));
220 if (master_err
!= MDMNE_ACK
) {
221 (void) snprintf(msg_buf
, MAXPATHLEN
, "rpc.mdcommd: RPC "
222 "fail on master when processing message type %d\n", type
);
223 } else if (slave_result
== NULL
) {
224 (void) snprintf(msg_buf
, MAXPATHLEN
, "rpc.mdcommd: RPC fail "
225 "on node %d when processing message type %d\n", nid
, type
);
227 (void) snprintf(msg_buf
, MAXPATHLEN
, "rpc.mdcommd: "
228 "Inconsistent return value from node %d when processing "
229 "message type %d. Master exitval = %d, "
230 "Slave exitval = %d\n", nid
, type
, master_exitval
,
231 slave_result
->mmr_exitval
);
233 commd_err
.size
= strlen(msg_buf
);
234 commd_err
.md_message
= (uint64_t)(uintptr_t)&msg_buf
[0];
236 (void) metaioctl(MD_MN_COMMD_ERR
, &commd_err
, &mne
, "rpc.mdcommd");
237 (void) uadmin(A_DUMP
, AD_BOOT
, NULL
);
243 struct statvfs64 vfsbuf
;
244 long long avail_bytes
;
249 /* No output file, nothing to do */
250 if (commdout
== (FILE *)NULL
)
254 * stat the appropriate filesystem to check for available space.
256 if (statvfs64(commdoutfile
, &vfsbuf
)) {
260 avail_bytes
= vfsbuf
.f_frsize
* vfsbuf
.f_bavail
;
262 * If we don't have enough space, we print out a warning.
263 * And we drop the verbosity level to NULL
264 * In case the condition doesn't go away, we don't repeat
267 if (avail_bytes
< MIN_FS_SPACE
) {
271 commd_debug(MD_MMV_SYSLOG
,
272 "NOT enough space available for logging\n");
273 commd_debug(MD_MMV_SYSLOG
,
274 "Have %lld bytes, need %lld bytes\n",
275 avail_bytes
, MIN_FS_SPACE
);
277 md_commd_global_verb
= MD_MMV_NULL
;
282 (void) fflush(commdout
);
286 /* safer version of clnt_destroy. If clnt is NULL don't do anything */
287 #define mdmn_clnt_destroy(clnt) { \
289 clnt_destroy(clnt); \
293 * Own version of svc_sendreply that checks the integrity of the transport
294 * handle and so prevents us from core dumps in the real svc_sendreply()
297 mdmn_svc_sendreply(SVCXPRT
*transp
, xdrproc_t xdr
, caddr_t data
)
299 if (SVC_STAT(transp
) == XPRT_DIED
) {
300 commd_debug(MD_MMV_MISC
,
301 "mdmn_svc_sendreply: XPRT_DIED\n");
304 (void) svc_sendreply(transp
, xdr
, data
);
308 * timeout_initiator(set, class)
310 * Alas, I sent a message and didn't get a response back in an appropriate time.
312 * timeout_initiator() takes care of doing the needed svc_sendreply() to the
313 * calling mdmn_send_message, so that guy doesn't wait forever
314 * What is done here is pretty much the same as what is done in
315 * wakeup initiator. The difference is that we cannot provide for any results,
316 * of course and we set the comm_state to MDMNE_TIMEOUT.
318 * By doing so, mdmn_send_message can decide if a retry would make sense or not.
319 * It's not ours to decide that here.
322 timeout_initiator(set_t setno
, md_mn_msgclass_t
class)
326 md_mn_result_t
*resultp
;
328 resultp
= Zalloc(sizeof (md_mn_result_t
));
329 resultp
->mmr_comm_state
= MDMNE_TIMEOUT
;
331 commd_debug(MD_MMV_MISC
,
332 "timeout_initiator set = %d, class = %d\n", setno
, class);
334 transp
= mdmn_get_initiator_table_transp(setno
, class);
335 mdmn_get_initiator_table_id(setno
, class, &mid
);
337 commd_debug(MD_MMV_MISC
, "timeout_ini: (%d, 0x%llx-%d)\n",
340 * Give the result the corresponding msgid from the failed message.
342 MSGID_COPY(&mid
, &(resultp
->mmr_msgid
));
344 /* return to mdmn_send_message() and let it deal with the situation */
345 mdmn_svc_sendreply(transp
, xdr_md_mn_result_t
, (char *)resultp
);
348 commd_debug(MD_MMV_MISC
, "timeout_ini: sendreplied\n");
350 mdmn_unregister_initiator_table(setno
, class);
355 * check_timeouts - thread
357 * This implements a timeout surveillance for messages sent from the
358 * initiator to the master.
360 * If a message is started, this thread is triggered thru
361 * cond_signal(&check_timeout_cv) and we keep track of the numbers of
362 * messages that are outstanding (messages_on_their_way).
364 * As long as there are messages on their way, this thread never goes to sleep.
365 * It'll keep checking all class/set combinations for outstanding messages.
366 * If one is found, it's checked if this message is overdue. In that case,
367 * timeout_initiator() is called to wakeup the calling mdmn_send_message and
368 * to clean up the mess.
370 * If the result from the master arrives later, this message is considered
371 * to be unsolicited and will be ignored.
380 md_mn_msgclass_t
class;
383 now
= time((time_t *)NULL
);
384 for (setno
= 1; setno
< MD_MAXSETS
; setno
++) {
385 if (md_mn_set_inited
[setno
] != MDMN_SET_READY
) {
388 for (class = MD_MSG_CLASS1
; class < MD_MN_NCLASSES
;
390 mx
= mdmn_get_initiator_table_mx(setno
, class);
391 (void) mutex_lock(mx
);
393 /* then is the registered time */
395 mdmn_get_initiator_table_time(setno
, class);
396 if ((then
!= 0) && (now
> then
)) {
397 timeout_initiator(setno
, class);
399 (void) mutex_unlock(mx
);
402 /* it's ok to check only once per second */
405 /* is there work to do? */
406 (void) mutex_lock(&check_timeout_mutex
);
407 if (messages_on_their_way
== 0) {
408 (void) cond_wait(&check_timeout_cv
,
409 &check_timeout_mutex
);
411 (void) mutex_unlock(&check_timeout_mutex
);
420 /* Read in the debug-controlling tokens from runtime.cf */
421 md_commd_global_verb
= commd_get_verbosity();
423 * If the user didn't specify a verbosity level in runtime.cf
424 * we can safely return here. As we don't intend to printout
425 * debug messages, we don't need to check for the output file.
427 if (md_commd_global_verb
== 0) {
431 /* if commdout is non-NULL it is an open FILE, we'd better close it */
432 if (commdout
!= (FILE *)NULL
) {
433 (void) fclose(commdout
);
436 commdoutfile
= commd_get_outfile();
438 /* setup the debug output */
439 if (commdoutfile
== (char *)NULL
) {
440 /* if no valid file was specified, use the default */
441 commdoutfile
= "/var/run/commd.out";
442 commdout
= fopen(commdoutfile
, "a");
444 /* check if the directory exists and is writable */
445 tmp_dir
= strdup(commdoutfile
);
446 if ((access(dirname(tmp_dir
), X_OK
|W_OK
)) ||
447 ((commdout
= fopen(commdoutfile
, "a")) == (FILE *)NULL
)) {
449 "Can't write to specified output file %s,\n"
450 "using /var/run/commd.out instead\n", commdoutfile
);
452 commdoutfile
= "/var/run/commd.out";
453 commdout
= fopen(commdoutfile
, "a");
458 if (commdout
== (FILE *)NULL
) {
459 syslog(LOG_ERR
, "Can't write to debug output file %s\n",
465 * mdmn_is_node_dead checks to see if a node is dead using
466 * the SunCluster infrastructure which is a stable interface.
467 * If unable to contact SunCluster the node is assumed to be alive.
473 mdmn_is_node_dead(md_mnnode_desc
*node
)
475 char *fmt
= "/usr/cluster/bin/scha_cluster_get -O NODESTATE_NODE ";
482 /* I know that I'm alive */
483 if (strcmp(node
->nd_nodename
, mynode()) == 0)
486 size
= strlen(fmt
) + strlen(node
->nd_nodename
) + 1;
488 (void) strlcat(cmd
, fmt
, size
);
489 (void) strlcat(cmd
, node
->nd_nodename
, size
);
491 if ((ptr
= popen(cmd
, "r")) != NULL
) {
492 if (fgets(buf
, sizeof (buf
), ptr
) != NULL
) {
493 /* If scha_cluster_get returned DOWN - return dead */
494 if (strncmp(buf
, "DOWN", 4) == 0)
506 * Perform some global initializations.
508 * the following routines have to call this before operation can start:
511 * - mdmn_comm_lock_svc_2
512 * - mdmn_comm_unlock_svc_2
513 * - mdmn_comm_suspend_svc_2
514 * - mdmn_comm_resume_svc_2
515 * - mdmn_comm_reinit_set_svc_2
517 * This is a single threaded daemon, so it can only be in one of the above
518 * routines at the same time.
519 * This means, global_init() cannot be called more than once at the same time.
520 * Hence, no lock is needed.
526 md_mn_msgclass_t
class;
527 struct sigaction sighandler
;
529 struct rlimit commd_limit
;
533 /* Do these global initializations only once */
534 if (md_commd_global_state
& MD_CGS_INITED
) {
537 (void) sdssc_bind_library();
539 /* setup the debug options from the config file */
542 /* make sure that we don't run out of file descriptors */
543 commd_limit
.rlim_cur
= commd_limit
.rlim_max
= RLIM_INFINITY
;
544 if (setrlimit(RLIMIT_NOFILE
, &commd_limit
) != 0) {
545 syslog(LOG_WARNING
, gettext("setrlimit failed."
546 "Could not increase the max file descriptors"));
549 /* Make setup_debug() be the action in case of SIGHUP */
550 sighandler
.sa_flags
= 0;
551 (void) sigfillset(&sighandler
.sa_mask
);
552 sighandler
.sa_handler
= (void (*)(int)) setup_debug
;
553 (void) sigaction(SIGHUP
, &sighandler
, NULL
);
555 __savetime
= gethrtime();
556 (void) time(&clock_val
);
557 commd_debug(MD_MMV_MISC
, "global init called %s\n", ctime(&clock_val
));
559 /* start a thread that flushes out the debug on a regular basis */
560 (void) thr_create(NULL
, 0, (void *(*)(void *))flush_fcout
,
561 (void *) NULL
, THR_DETACHED
, NULL
);
563 /* global rwlock's / mutex's / cond_t's go here */
564 (void) mutex_init(&check_timeout_mutex
, USYNC_THREAD
, NULL
);
565 (void) cond_init(&check_timeout_cv
, USYNC_THREAD
, NULL
);
566 (void) mutex_init(&get_setdesc_mutex
, USYNC_THREAD
, NULL
);
568 /* Make sure the initiator table is initialized correctly */
569 for (set
= 0; set
< MD_MAXSETS
; set
++) {
570 for (class = 0; class < MD_MN_NCLASSES
; class++) {
571 mdmn_unregister_initiator_table(set
, class);
576 /* setup the check for timeouts */
577 (void) thr_create(NULL
, 0, (void *(*)(void *))check_timeouts
,
578 (void *) NULL
, THR_DETACHED
, NULL
);
580 md_commd_global_state
|= MD_CGS_INITED
;
585 * mdmn_init_client(setno, nodeid)
586 * called if client[setno][nodeid] is NULL
588 * NOTE: Must be called with set_desc_rwlock held as a reader
589 * NOTE: Must be called with client_rwlock held as a writer
591 * If the rpc client for this node has not been setup for any set, we do it now.
593 * Returns 0 on success (node found in set, rpc client setup)
594 * -1 if metaget_setdesc failed,
595 * -2 if node not part of set
596 * -3 if clnt_create fails
599 mdmn_init_client(set_t setno
, md_mn_nodeid_t nid
)
601 md_error_t ep
= mdnullerror
;
602 md_mnnode_desc
*node
;
603 md_set_desc
*sd
; /* just an abbr for set_descriptor[setno] */
605 sd
= set_descriptor
[setno
];
608 * Is the appropriate set_descriptor already initialized ?
609 * Can't think of a scenario where this is not the case, but we'd better
610 * check for it anyway.
615 /* readlock -> writelock */
616 (void) rw_unlock(&set_desc_rwlock
[setno
]);
617 (void) rw_wrlock(&set_desc_rwlock
[setno
]);
618 sp
= metasetnosetname(setno
, &ep
);
619 /* Only one thread is supposed to be in metaget_setdesc() */
620 (void) mutex_lock(&get_setdesc_mutex
);
621 sd
= metaget_setdesc(sp
, &ep
);
622 (void) mutex_unlock(&get_setdesc_mutex
);
625 (void) rw_unlock(&set_desc_rwlock
[setno
]);
627 (void) rw_rdlock(&set_desc_rwlock
[setno
]);
630 set_descriptor
[setno
] = sd
;
631 /* back to readlock */
632 (void) rw_unlock(&set_desc_rwlock
[setno
]);
633 (void) rw_rdlock(&set_desc_rwlock
[setno
]);
636 /* first we have to find the node name for this node id */
637 for (node
= sd
->sd_nodelist
; node
; node
= node
->nd_next
) {
638 if (node
->nd_nodeid
== nid
)
639 break; /* we found our node in this set */
643 if (node
== (md_mnnode_desc
*)NULL
) {
644 commd_debug(MD_MMV_SYSLOG
,
645 "FATAL: node %d not found in set %d\n", nid
, setno
);
646 (void) rw_unlock(&set_desc_rwlock
[setno
]);
650 commd_debug(MD_MMV_INIT
, "init: %s has the flags: 0x%x\n",
651 node
->nd_nodename
? node
->nd_nodename
: "NULL", node
->nd_flags
);
653 /* Did this node join the diskset? */
654 if ((node
->nd_flags
& MD_MN_NODE_OWN
) == 0) {
655 commd_debug(MD_MMV_INIT
, "init: %s didn't join set %d\n",
656 node
->nd_nodename
? node
->nd_nodename
: "NULL", setno
);
657 (void) rw_unlock(&set_desc_rwlock
[setno
]);
661 /* if clnt_create has not been done for that node, do it now */
662 if (client
[setno
][nid
] == (CLIENT
*) NULL
) {
666 * While trying to create a connection to a node,
667 * periodically check to see if the node has been marked
668 * dead by the SunCluster infrastructure.
669 * This periodic check is needed since a non-responsive
670 * rpc.mdcommd (while it is attempting to create a connection
671 * to a dead node) can lead to large delays and/or failures
672 * in the reconfig steps.
674 while ((client
[setno
][nid
] == (CLIENT
*) NULL
) &&
675 (tout
< MD_CLNT_CREATE_TOUT
)) {
676 client
[setno
][nid
] = meta_client_create_retry(
677 node
->nd_nodename
, mdmn_clnt_create
,
678 (void *) node
, MD_CLNT_CREATE_SUBTIMEOUT
, &ep
);
679 /* Is the node dead? */
680 if (mdmn_is_node_dead(node
) == 1) {
681 commd_debug(MD_MMV_SYSLOG
,
682 "rpc.mdcommd: no client for dead node %s\n",
686 tout
+= MD_CLNT_CREATE_SUBTIMEOUT
;
689 if (client
[setno
][nid
] == (CLIENT
*) NULL
) {
690 clnt_pcreateerror(node
->nd_nodename
);
691 (void) rw_unlock(&set_desc_rwlock
[setno
]);
694 /* this node has the license to send */
695 commd_debug(MD_MMV_MISC
, "init_client: calling add_lic\n");
698 /* set the timeout value */
699 clnt_control(client
[setno
][nid
], CLSET_TIMEOUT
,
703 (void) rw_unlock(&set_desc_rwlock
[setno
]);
708 * check_client(setno, nodeid)
710 * must be called with reader lock held for set_desc_rwlock[setno]
711 * and must be called with reader lock held for client_rwlock[setno]
712 * Checks if the client for this set/node combination is already setup
713 * if not it upgrades the lock to a writer lock
714 * and tries to initialize the client.
715 * Finally it's checked if the client was nulled out again due to some race
717 * returns 0 if there is a usable client
718 * returns MDMNE_RPC_FAIL otherwise
721 check_client(set_t setno
, md_mn_nodeid_t nodeid
)
725 while ((client
[setno
][nodeid
] == (CLIENT
*)NULL
) && (ret
== 0)) {
726 /* upgrade reader ... */
727 (void) rw_unlock(&client_rwlock
[setno
]);
728 /* ... to writer lock. */
729 (void) rw_wrlock(&client_rwlock
[setno
]);
730 if (mdmn_init_client(setno
, nodeid
) != 0) {
731 ret
= MDMNE_RPC_FAIL
;
733 /* downgrade writer ... */
734 (void) rw_unlock(&client_rwlock
[setno
]);
735 /* ... back to reader lock. */
736 (void) rw_rdlock(&client_rwlock
[setno
]);
742 * mdmn_init_set(setno, todo)
743 * setno is the number of the set to be initialized.
744 * todo is one of the MDMN_SET_* thingies or MDMN_SET_READY
745 * If called with MDMN_SET_READY everything is initialized.
747 * If the set mutexes are already initialized, the caller has to hold
748 * both set_desc_rwlock[setno] and client_rwlock[setno] as a writer, before
749 * calling mdmn_init_set()
752 mdmn_init_set(set_t setno
, int todo
)
755 md_mnnode_desc
*node
;
756 md_set_desc
*sd
; /* just an abbr for set_descriptor[setno] */
758 md_error_t ep
= mdnullerror
;
762 * Check if we are told to setup the mutexes and
763 * if these are not yet setup
765 if ((todo
& MDMN_SET_MUTEXES
) &&
766 ((md_mn_set_inited
[setno
] & MDMN_SET_MUTEXES
) == 0)) {
767 (void) mutex_init(&mdmn_busy_mutex
[setno
], USYNC_THREAD
, NULL
);
768 (void) cond_init(&mdmn_busy_cv
[setno
], USYNC_THREAD
, NULL
);
769 (void) rwlock_init(&client_rwlock
[setno
], USYNC_THREAD
, NULL
);
770 (void) rwlock_init(&set_desc_rwlock
[setno
], USYNC_THREAD
, NULL
);
772 for (class = MD_MSG_CLASS1
; class < MD_MN_NCLASSES
; class++) {
773 (void) mutex_init(mdmn_get_master_table_mx(setno
,
774 class), USYNC_THREAD
, NULL
);
775 (void) cond_init(mdmn_get_master_table_cv(setno
, class),
777 (void) mutex_init(mdmn_get_initiator_table_mx(setno
,
778 class), USYNC_THREAD
, NULL
);
780 md_mn_set_inited
[setno
] |= MDMN_SET_MUTEXES
;
782 if ((todo
& MDMN_SET_MCT
) &&
783 ((md_mn_set_inited
[setno
] & MDMN_SET_MCT
) == 0)) {
790 filesize
= (sizeof (md_mn_mct_t
));
791 (void) snprintf(table_name
, sizeof (table_name
), "%s%d",
792 MD_MN_MSG_COMP_TABLE
, setno
);
794 * If the mct file exists we map it into memory.
795 * Otherwise we create an empty file of appropriate
796 * size and map that into memory.
797 * The mapped areas are stored in mct[setno].
799 fd
= open(table_name
, O_RDWR
|O_CREAT
|O_DSYNC
, 0600);
801 commd_debug(MD_MMV_MISC
,
802 "init_set: Can't open MCT\n");
806 * Ensure that we are the only process that has this file
807 * mapped. If another instance of rpc.mdcommd has beaten us
808 * then we display the failing process and attempt to terminate
809 * it. The next call of this routine should establish us as
810 * the only rpc.mdcommd on the system.
812 (void) memset(&fl
, 0, sizeof (fl
));
814 fl
.l_whence
= SEEK_SET
;
816 fl
.l_len
= filesize
+ 1;
818 if (fcntl(fd
, F_SETLK
, &fl
) == -1) {
819 commd_debug(MD_MMV_SYSLOG
,
820 "init_set: Cannot lock MCT '%s'\n", table_name
);
821 if (fcntl(fd
, F_GETLK
, &fl
) != -1) {
822 commd_debug(MD_MMV_SYSLOG
, "rpc.mdcommd:"
823 "Process %d holds lock\n", fl
.l_pid
);
826 commd_debug(MD_MMV_SYSLOG
, "rpc.mdcommd:"
833 * Try to terminate other mdcommd process so that we
834 * can establish ourselves.
836 if (sigsend(P_PID
, fl
.l_pid
, 0) == 0) {
837 if (sigsend(P_PID
, fl
.l_pid
, SIGKILL
) < 0) {
838 commd_debug(MD_MMV_SYSLOG
,
840 "SIGKILL of %d failed\n", fl
.l_pid
);
842 commd_debug(MD_MMV_SYSLOG
,
844 "Process %d killed\n", fl
.l_pid
);
847 commd_debug(MD_MMV_SYSLOG
, "rpc.mdcommd:"
848 "Process %d not killable\n", fl
.l_pid
);
853 * To ensure that the file has the appropriate size,
854 * we write a byte at the end of the file.
856 (void) lseek(fd
, filesize
+ 1, SEEK_SET
);
857 (void) write(fd
, "\0", 1);
859 /* at this point we have a file in place that we can mmap */
860 addr
= mmap(0, filesize
, PROT_READ
| PROT_WRITE
,
861 MAP_SHARED
, fd
, (off_t
)0);
862 if (addr
== MAP_FAILED
) {
863 commd_debug(MD_MMV_INIT
,
864 "init_set: mmap mct error %d\n",
868 /* LINTED pointer alignment */
869 mct
[setno
] = (md_mn_mct_t
*)addr
;
871 /* finally we initialize the mutexes that protect the mct */
872 for (class = MD_MSG_CLASS1
; class < MD_MN_NCLASSES
; class++) {
873 (void) mutex_init(&(mct_mutex
[setno
][class]),
877 md_mn_set_inited
[setno
] |= MDMN_SET_MCT
;
880 * Check if we are told to setup the nodes and
881 * if these are not yet setup
882 * (Attention: negative logic here compared to above!)
884 if (((todo
& MDMN_SET_NODES
) == 0) ||
885 (md_mn_set_inited
[setno
] & MDMN_SET_NODES
)) {
886 return (0); /* success */
889 if ((sp
= metasetnosetname(setno
, &ep
)) == NULL
) {
890 commd_debug(MD_MMV_SYSLOG
,
891 "metasetnosetname(%d) returned NULL\n", setno
);
892 return (MDMNE_NOT_JOINED
);
895 /* flush local copy of rpc.metad data */
896 metaflushsetname(sp
);
898 (void) mutex_lock(&get_setdesc_mutex
);
899 sd
= metaget_setdesc(sp
, &ep
);
900 (void) mutex_unlock(&get_setdesc_mutex
);
903 commd_debug(MD_MMV_SYSLOG
,
904 "metaget_setdesc(%d) returned NULL\n", setno
);
905 return (MDMNE_NOT_JOINED
);
909 * if this set is not a multinode set or
910 * this node hasn't joined the diskset yet, better not do anything
912 if ((MD_MNSET_DESC(sd
) == 0) ||
913 (sd
->sd_mn_mynode
->nd_flags
& MD_MN_NODE_OWN
) == 0) {
914 commd_debug(MD_MMV_INIT
, "didn't yet join set %d\n", setno
);
915 return (MDMNE_NOT_JOINED
);
918 for (node
= sd
->sd_nodelist
; node
!= NULL
; node
= node
->nd_next
) {
920 nid
= node
->nd_nodeid
;
922 commd_debug(MD_MMV_INIT
,
923 "setting up: node=%s, priv_ic=%s, flags=0x%x\n",
924 node
->nd_nodename
? node
->nd_nodename
: "NULL",
925 node
->nd_priv_ic
? node
->nd_priv_ic
: "NULL",
928 if ((node
->nd_flags
& MD_MN_NODE_OWN
) == 0) {
929 commd_debug(MD_MMV_INIT
,
930 "init: %s didn't join set %d\n",
931 node
->nd_nodename
? node
->nd_nodename
: "NULL",
936 if (client
[setno
][nid
] != (CLIENT
*) NULL
) {
938 commd_debug(MD_MMV_INIT
, "init: already: node=%s\n",
939 node
->nd_nodename
? node
->nd_nodename
: "NULL");
944 * While trying to create a connection to a node,
945 * periodically check to see if the node has been marked
946 * dead by the SunCluster infrastructure.
947 * This periodic check is needed since a non-responsive
948 * rpc.mdcommd (while it is attempting to create a connection
949 * to a dead node) can lead to large delays and/or failures
950 * in the reconfig steps.
952 while ((client
[setno
][nid
] == (CLIENT
*) NULL
) &&
953 (tout
< MD_CLNT_CREATE_TOUT
)) {
954 client
[setno
][nid
] = meta_client_create_retry(
955 node
->nd_nodename
, mdmn_clnt_create
,
956 (void *) node
, MD_CLNT_CREATE_SUBTIMEOUT
, &ep
);
957 /* Is the node dead? */
958 if (mdmn_is_node_dead(node
) == 1) {
959 commd_debug(MD_MMV_SYSLOG
,
960 "rpc.mdcommd: no client for dead node %s\n",
964 tout
+= MD_CLNT_CREATE_SUBTIMEOUT
;
967 if (client
[setno
][nid
] == (CLIENT
*) NULL
) {
968 clnt_pcreateerror(node
->nd_nodename
);
970 * If we cannot connect to a single node
971 * (maybe because it is down) we mark this node as not
972 * owned and continue with the next node in the list.
973 * This is better than failing the entire starting up
974 * of the commd system.
976 node
->nd_flags
&= ~MD_MN_NODE_OWN
;
977 commd_debug(MD_MMV_SYSLOG
,
978 "WARNING couldn't create client for %s\n"
979 "Reconfig cycle required\n",
981 commd_debug(MD_MMV_INIT
,
982 "WARNING couldn't create client for %s\n"
983 "Reconfig cycle required\n",
987 /* this node has the license to send */
988 commd_debug(MD_MMV_MISC
, "init_set: calling add_lic\n");
991 /* set the timeout value */
992 clnt_control(client
[setno
][nid
], CLSET_TIMEOUT
,
995 commd_debug(MD_MMV_INIT
, "init: done: node=%s\n",
996 node
->nd_nodename
? node
->nd_nodename
: "NULL");
999 set_descriptor
[setno
] = sd
;
1000 md_mn_set_inited
[setno
] |= MDMN_SET_NODES
;
1001 return (0); /* success */
1005 mdmn_send_to_work(void *arg
)
1007 int *rpc_err
= NULL
;
1011 mutex_t
*mx
; /* protection for initiator_table */
1014 md_mn_nodeid_t set_master
;
1015 md_mn_msgclass_t
class;
1016 md_mn_msg_and_transp_t
*matp
= (md_mn_msg_and_transp_t
*)arg
;
1018 msg
= matp
->mat_msg
;
1019 transp
= matp
->mat_transp
;
1021 class = mdmn_get_message_class(msg
->msg_type
);
1022 setno
= msg
->msg_setno
;
1024 /* set the sender, so the master knows who to send the results */
1025 (void) rw_rdlock(&set_desc_rwlock
[setno
]);
1026 msg
->msg_sender
= set_descriptor
[setno
]->sd_mn_mynode
->nd_nodeid
;
1027 set_master
= set_descriptor
[setno
]->sd_mn_master_nodeid
;
1029 mx
= mdmn_get_initiator_table_mx(setno
, class);
1030 (void) mutex_lock(mx
);
1033 * Here we check, if the initiator table slot for this set/class
1034 * combination is free to use.
1035 * If this is not the case, we return CLASS_BUSY forcing the
1036 * initiating send_message call to retry
1038 success
= mdmn_check_initiator_table(setno
, class);
1039 if (success
== MDMNE_CLASS_BUSY
) {
1040 md_mn_msgid_t active_mid
;
1042 mdmn_get_initiator_table_id(setno
, class, &active_mid
);
1044 commd_debug(MD_MMV_SEND
,
1045 "send_to_work: received but locally busy "
1046 "(%d, 0x%llx-%d), set=%d, class=%d, type=%d, "
1047 "active msg=(%d, 0x%llx-%d)\n",
1048 MSGID_ELEMS(msg
->msg_msgid
), setno
, class,
1049 msg
->msg_type
, MSGID_ELEMS(active_mid
));
1051 commd_debug(MD_MMV_SEND
,
1052 "send_to_work: received (%d, 0x%llx-%d), "
1053 "set=%d, class=%d, type=%d\n",
1054 MSGID_ELEMS(msg
->msg_msgid
), setno
, class, msg
->msg_type
);
1057 try_master
= 2; /* return failure after two retries */
1058 while ((success
== MDMNE_ACK
) && (try_master
--)) {
1059 (void) rw_rdlock(&client_rwlock
[setno
]);
1060 /* is the rpc client to the master still around ? */
1061 if (check_client(setno
, set_master
)) {
1062 success
= MDMNE_RPC_FAIL
;
1064 (void) rw_unlock(&client_rwlock
[setno
]);
1065 break; /* out of try_master-loop */
1069 * Send the request to the work function on the master
1070 * this call will return immediately
1072 rpc_err
= mdmn_work_2(msg
, client
[setno
][set_master
],
1075 /* Everything's Ok? */
1076 if (rpc_err
== NULL
) {
1077 success
= MDMNE_RPC_FAIL
;
1079 * Probably something happened to the daemon on the
1080 * master. Kill the client, and try again...
1082 (void) rw_unlock(&client_rwlock
[setno
]);
1083 (void) rw_wrlock(&client_rwlock
[setno
]);
1084 mdmn_clnt_destroy(client
[setno
][set_master
]);
1085 if (client
[setno
][set_master
] != (CLIENT
*)NULL
) {
1086 client
[setno
][set_master
] = (CLIENT
*)NULL
;
1088 (void) rw_unlock(&client_rwlock
[setno
]);
1091 } else if (*rpc_err
!= MDMNE_ACK
) {
1092 /* something went wrong, break out */
1095 (void) rw_unlock(&client_rwlock
[setno
]);
1096 break; /* out of try_master-loop */
1099 (void) rw_unlock(&client_rwlock
[setno
]);
1103 * If we are here, we successfully delivered the message.
1104 * We register the initiator_table, so that
1105 * wakeup_initiator_2 can do the sendreply with the
1108 success
= MDMNE_ACK
;
1109 mdmn_register_initiator_table(setno
, class, msg
, transp
);
1111 /* tell check_timeouts, there's work to do */
1112 (void) mutex_lock(&check_timeout_mutex
);
1113 messages_on_their_way
++;
1114 (void) cond_signal(&check_timeout_cv
);
1115 (void) mutex_unlock(&check_timeout_mutex
);
1116 break; /* out of try_master-loop */
1119 (void) rw_unlock(&set_desc_rwlock
[setno
]);
1121 if (success
== MDMNE_ACK
) {
1122 commd_debug(MD_MMV_SEND
,
1123 "send_to_work: registered (%d, 0x%llx-%d)\n",
1124 MSGID_ELEMS(msg
->msg_msgid
));
1126 /* In case of failure do the sendreply now */
1127 md_mn_result_t
*resultp
;
1128 resultp
= Zalloc(sizeof (md_mn_result_t
));
1129 resultp
->mmr_comm_state
= success
;
1131 * copy the MSGID so that we know _which_ message
1132 * failed (if the transp has got mangled)
1134 MSGID_COPY(&(msg
->msg_msgid
), &(resultp
->mmr_msgid
));
1135 mdmn_svc_sendreply(transp
, xdr_md_mn_result_t
, (char *)resultp
);
1136 commd_debug(MD_MMV_SEND
,
1137 "send_to_work: not registered (%d, 0x%llx-%d) cs=%d\n",
1138 MSGID_ELEMS(msg
->msg_msgid
), success
);
1139 free_result(resultp
);
1141 * We don't have a timeout registered to wake us up, so we're
1142 * now done with this handle. Release it back to the pool.
1149 /* the alloc was done in mdmn_send_svc_2 */
1151 (void) mutex_unlock(mx
);
1157 * do_message_locally(msg, result)
1158 * Process a message locally on the master
1159 * Look up in the MCT whether the message has already been processed.
1160 * If not, call the handler and store the result
1161 * If yes, retrieve the result from the MCT.
1163 * MDMNE_ACK in case of success
1164 * MDMNE_LOG_FAIL if the MCT could not be checked
/*
 * do_message_locally: run the handler for msg on this node, using the
 * message completion table (MCT) to avoid starting the handler twice.
 *
 * msg    - message to process; msg_type selects handler and class,
 *          msg_setno selects the set.
 * result - receives the message type, flags and message id here; the
 *          handler or the MCT lookup fills in the rest.
 *
 * Return values visible in this listing:
 *   MDMNE_NO_HANDLER - mdmn_get_handler() returned NULL
 *   MDMNE_LOG_FAIL   - mdmn_check_completion() returned an unexpected state
 * NOTE(review): the declarations of completed/setno, several braces and the
 * final return are not present in this listing - confirm against the
 * pristine source before relying on it.
 */
1167 do_message_locally(md_mn_msg_t
*msg
, md_mn_result_t
*result
)
1171 md_mn_msgtype_t msgtype
= msg
->msg_type
;
1172 md_mn_msgclass_t
class;
1174 void (*handler
)(md_mn_msg_t
*msg
, uint_t flags
, md_mn_result_t
*res
);
1176 handler
= mdmn_get_handler(msgtype
);
1177 if (handler
== NULL
) {
1178 result
->mmr_exitval
= 0;
1179 /* let the sender decide if this is an error or not */
1180 result
->mmr_comm_state
= MDMNE_NO_HANDLER
;
1181 return (MDMNE_NO_HANDLER
);
1184 class = mdmn_get_message_class(msg
->msg_type
);
1185 setno
= msg
->msg_setno
;
1187 result
->mmr_msgtype
= msgtype
;
1188 result
->mmr_flags
= msg
->msg_flags
;
1189 MSGID_COPY(&(msg
->msg_msgid
), &(result
->mmr_msgid
));
/* mct_mutex[setno][class] covers the MCT lookup and the state updates below */
1191 (void) mutex_lock(&mct_mutex
[setno
][class]);
1192 completed
= mdmn_check_completion(msg
, result
);
1193 if (completed
== MDMN_MCT_NOT_DONE
) {
1194 /* message not yet processed locally */
1195 commd_debug(MD_MMV_PROC_M
, "proc_mas: "
1196 "calling handler for (%d,0x%llx-%d) type %d\n",
1197 MSGID_ELEMS(msg
->msg_msgid
), msgtype
);
1200 * Mark the message as being currently processed,
1201 * so we won't start a second handler for it
1203 (void) mdmn_mark_completion(msg
, NULL
, MDMN_MCT_IN_PROGRESS
);
/* the MCT mutex is dropped while the handler runs and retaken afterwards */
1204 (void) mutex_unlock(&mct_mutex
[setno
][class]);
1206 /* here we actually process the message on the master */
1207 (*handler
)(msg
, MD_MSGF_ON_MASTER
, result
);
1209 commd_debug(MD_MMV_PROC_M
, "proc_mas: "
1210 "finished handler for (%d,0x%llx-%d) type %d\n",
1211 MSGID_ELEMS(msg
->msg_msgid
), msgtype
);
1213 /* Mark the message as fully processed, store the result */
1214 (void) mutex_lock(&mct_mutex
[setno
][class]);
1215 (void) mdmn_mark_completion(msg
, result
, MDMN_MCT_DONE
);
1216 } else if (completed
== MDMN_MCT_DONE
) {
1217 commd_debug(MD_MMV_PROC_M
, "proc_mas: "
1218 "result for (%d, 0x%llx-%d) from MCT\n",
1219 MSGID_ELEMS(msg
->msg_msgid
), msgtype
);
1220 } else if (completed
== MDMN_MCT_IN_PROGRESS
) {
1221 commd_debug(MD_MMV_PROC_M
, "proc_mas: "
1222 "(%d, 0x%llx-%d) is currently being processed\n",
1223 MSGID_ELEMS(msg
->msg_msgid
), msgtype
);
1225 /* MCT error occurred (should never happen) */
/* this path unlocks on its own because it returns right away */
1226 (void) mutex_unlock(&mct_mutex
[setno
][class]);
1227 result
->mmr_comm_state
= MDMNE_LOG_FAIL
;
1228 commd_debug(MD_MMV_SYSLOG
, "WARNING "
1229 "mdmn_check_completion returned %d "
1230 "for (%d,0x%llx-%d)\n", completed
,
1231 MSGID_ELEMS(msg
->msg_msgid
));
1232 return (MDMNE_LOG_FAIL
);
/* unlock for the paths that did not return above */
1234 (void) mutex_unlock(&mct_mutex
[setno
][class]);
1240 * do_send_message(msg, node)
1242 * Send a message to a given node and wait for an acknowledgment that the
1243 * message has arrived on the remote node.
1244 * Make sure that the client for the set is setup correctly.
1245 * If no ACK arrives, destroy and recreate the RPC client and retry the
1247 * After actually sending wait no longer than the appropriate number of
1248 * seconds before timing out the message.
1250 * Note: must be called with set_desc_rwlock held in reader mode
/*
 * do_send_message: pass msg to the daemon on node through mdmn_work_2()
 * and then wait on the master-table condition variable of this set/class.
 *
 * msg  - message to send; msg_type and msg_setno select class, mutex and cv.
 * node - descriptor of the target node; its MD_MN_NODE_OWN flag is cleared
 *        here when no RPC client can be set up for it.
 *
 * Return values visible in this listing: MDMNE_ABORT, MDMNE_IGNORE_NODE and
 * MDMNE_RPC_FAIL.
 * NOTE(review): the initialisation of rpc_retries/ret, the retry bookkeeping,
 * some format strings and the success return are not present in this
 * listing - confirm against the pristine source.
 */
1253 do_send_message(md_mn_msg_t
*msg
, md_mnnode_desc
*node
)
1257 int timeout_retries
= 0;
1260 cond_t
*cv
; /* see mdmn_wakeup_master_svc_2 */
1261 mutex_t
*mx
; /* protection for class_busy */
1262 timestruc_t timeout
; /* surveillance for remote daemon */
1264 md_mn_msgtype_t msgtype
;
1265 md_mn_msgclass_t
class;
1267 nid
= node
->nd_nodeid
;
1268 msgtype
= msg
->msg_type
;
1269 setno
= msg
->msg_setno
;
1270 class = mdmn_get_message_class(msgtype
);
1271 mx
= mdmn_get_master_table_mx(setno
, class);
1272 cv
= mdmn_get_master_table_cv(setno
, class);
1276 /* We try two times to send the message */
1280 * if sending the message doesn't succeed the first time due to a
1281 * RPC problem, we retry one time
1283 while ((rpc_retries
!= 0) && (ret
== NULL
)) {
1284 /* in abort state, we error out immediately */
1285 if (md_commd_global_state
& MD_CGS_ABORTED
) {
1286 return (MDMNE_ABORT
);
1289 (void) rw_rdlock(&client_rwlock
[setno
]);
1290 /* unable to create client? Ignore it */
1291 if (check_client(setno
, nid
)) {
1293 * In case we cannot establish an RPC client, we
1294 * take this node out of our considerations.
1295 * This will be reset by a reconfig
1296 * cycle that should come pretty soon.
1297 * MNISSUE: Should a reconfig cycle
1298 * be forced on SunCluster?
/* the node loop in mdmn_master_process_msg tests this same flag and skips the node */
1300 node
->nd_flags
&= ~MD_MN_NODE_OWN
;
1301 commd_debug(MD_MMV_SYSLOG
,
1302 "WARNING couldn't create client for %s\n"
1303 "Reconfig cycle required\n",
1305 commd_debug(MD_MMV_PROC_M
, "proc_mas: (%d,0x%llx-%d) "
1306 "WARNING couldn't create client for %s\n",
1307 MSGID_ELEMS(msg
->msg_msgid
), node
->nd_nodename
);
1308 (void) rw_unlock(&client_rwlock
[setno
]);
1309 return (MDMNE_IGNORE_NODE
);
1311 /* let's be paranoid and check again before sending */
1312 if (client
[setno
][nid
] == NULL
) {
1314 * if this is true, strange enough, we catch our breath,
1315 * and then continue, so that the client is set up
1318 commd_debug(MD_MMV_PROC_M
, "client is NULL\n");
1319 (void) rw_unlock(&client_rwlock
[setno
]);
1324 /* send it over, it will return immediately */
1325 ret
= mdmn_work_2(msg
, client
[setno
][nid
], nid
);
1327 (void) rw_unlock(&client_rwlock
[setno
]);
1330 commd_debug(MD_MMV_PROC_M
,
1331 "proc_mas: sending (%d,0x%llx-%d) to %d returned "
1333 MSGID_ELEMS(msg
->msg_msgid
), nid
, *ret
);
1335 commd_debug(MD_MMV_PROC_M
,
1336 "proc_mas: sending (%d,0x%llx-%d) to %d returned "
1338 MSGID_ELEMS(msg
->msg_msgid
), nid
);
1341 if ((ret
== NULL
) || (*ret
== MDMNE_CANNOT_CONNECT
) ||
1342 (*ret
== MDMNE_THR_CREATE_FAIL
)) {
1344 * Something happened to the daemon on the other side.
1345 * Kill the client, and try again.
1346 * check_client() will create a new client
/* the client handle is destroyed and cleared under the writer lock */
1348 (void) rw_wrlock(&client_rwlock
[setno
]);
1349 mdmn_clnt_destroy(client
[setno
][nid
]);
1350 if (client
[setno
][nid
] != (CLIENT
*)NULL
) {
1351 client
[setno
][nid
] = (CLIENT
*)NULL
;
1353 (void) rw_unlock(&client_rwlock
[setno
]);
1355 /* ... but don't try infinitely */
1360 * If the class is locked on the other node, keep trying.
1361 * This situation will go away automatically,
1362 * if we wait long enough
1364 if (*ret
== MDMNE_CLASS_LOCKED
) {
1372 return (MDMNE_RPC_FAIL
);
1376 /* if the slave is in abort state, we just ignore it. */
1377 if (*ret
== MDMNE_ABORT
) {
1378 commd_debug(MD_MMV_PROC_M
,
1379 "proc_mas: work(%d,0x%llx-%d) returned "
1381 MSGID_ELEMS(msg
->msg_msgid
));
1383 return (MDMNE_IGNORE_NODE
);
1386 /* Did the remote processing succeed? */
1387 if (*ret
!= MDMNE_ACK
) {
1389 * Some commd failure in the middle of sending the msg
1390 * to the nodes. We don't continue here.
1392 commd_debug(MD_MMV_PROC_M
,
1393 "proc_mas: work(%d,0x%llx-%d) returns %d\n",
1394 MSGID_ELEMS(msg
->msg_msgid
), *ret
);
1396 return (MDMNE_RPC_FAIL
);
1402 * When we are here, we have sent the message to the other node and
1403 * we know that node has accepted it.
1404 * We go to sleep and have trust to be woken up by wakeup.
1405 * If we wakeup due to a timeout, or a signal, no result has been
1406 * placed in the appropriate slot.
1407 * If we timeout, it is likely that this is because the node has
1408 * gone away, so we will destroy the client and try it again in the
1409 * expectation that the rpc will fail and we will return
1410 * MDMNE_IGNORE_NODE. If that is not the case, the message must still
1411 * be being processed on the slave. In this case just timeout for 4
1412 * more seconds and then return RPC_FAIL if the message is not complete.
/* relative timeout in whole seconds; the first wait uses mdmn_get_timeout(msgtype) */
1414 timeout
.tv_nsec
= 0;
1415 timeout
.tv_sec
= (timeout_retries
== 0) ? mdmn_get_timeout(msgtype
) :
/* mdmn_master_process_msg locks mx before calling this function */
1417 err
= cond_reltimedwait(cv
, mx
, &timeout
);
1420 /* everything's fine, return success */
1425 commd_debug(MD_MMV_PROC_M
, "proc_mas: "
1426 "timeout occured, set=%d, class=%d, "
1427 "msgid=(%d, 0x%llx-%d), timeout_retries=%d\n",
1428 setno
, class, MSGID_ELEMS(msg
->msg_msgid
), timeout_retries
);
1429 if (timeout_retries
== 0) {
1432 * Destroy the client and try the rpc call again
1434 (void) rw_wrlock(&client_rwlock
[setno
]);
1435 mdmn_clnt_destroy(client
[setno
][nid
]);
1436 client
[setno
][nid
] = (CLIENT
*)NULL
;
1437 (void) rw_unlock(&client_rwlock
[setno
]);
1440 } else if (err
== EINTR
) {
1441 commd_debug(MD_MMV_PROC_M
, "proc_mas: "
1442 "commd signalled, set=%d, class=%d, "
1443 "msgid=(%d, 0x%llx-%d)\n",
1444 setno
, class, MSGID_ELEMS(msg
->msg_msgid
));
1446 commd_debug(MD_MMV_PROC_M
, "proc_mas: "
1447 "cond_reltimedwait err=%d, set=%d, "
1448 "class=%d, msgid=(%d, 0x%llx-%d)\n",
1450 MSGID_ELEMS(msg
->msg_msgid
));
1453 /* some failure happened */
1454 return (MDMNE_RPC_FAIL
);
1458 * before we return we have to
1459 * free_msg(msg); because we are working on a copied message
/*
 * mdmn_master_process_msg: master-side processing of one message.
 *
 * Steps visible in this listing:
 *  - allocate a result and stamp it with set, type and message id
 *  - unless MD_MSGF_NO_LOG or MD_MSGF_REPLAY_MSG is set, log the message via
 *    mdmn_log_msg(); on MDMNE_LOG_FAIL or MDMNE_CLASS_BUSY the initiator is
 *    notified through mdmn_wakeup_initiator_2(), and on MDMNE_CLASS_BUSY
 *    processing continues with a copy of the message from the change log
 *  - expand the message into submessages when a generator exists
 *  - for every (sub)message: call do_message_locally(), then
 *    do_send_message() for each node in sd_nodelist that is not skipped
 *  - unlog the message, restore the ids in the result, wake up the
 *    initiator, free submessages and result, mark orig_class unbusy and
 *    emit a runtime debug line
 *
 * msg - the message to process (see the note preceding this function about
 *       free_msg()).
 * NOTE(review): many lines (declarations, braces, break/continue/return
 * statements, some call arguments) are not present in this listing -
 * confirm against the pristine source.
 */
1462 mdmn_master_process_msg(md_mn_msg_t
*msg
)
1466 int nmsgs
; /* total number of msgs */
1467 int curmsg
; /* index of current msg */
1469 uint_t inherit_flags
= 0;
1470 uint_t secdiff
, usecdiff
; /* runtime of this message */
1471 md_error_t mde
= mdnullerror
;
1472 md_mn_msg_t
*msglist
[MAX_SUBMESSAGES
]; /* all msgs to process */
1473 md_mn_msg_t
*cmsg
; /* current msg */
1474 md_mn_msgid_t dummyid
;
1475 md_mn_result_t
*result
;
1476 md_mn_result_t
*slave_result
;
1477 md_mn_nodeid_t sender
;
1478 md_mn_nodeid_t set_master
;
1479 md_mnnode_desc
*node
;
1480 md_mn_msgtype_t orig_type
; /* type of the original message */
1481 md_mn_msgtype_t msgtype
; /* type of the current message */
1482 md_mn_msgclass_t orig_class
; /* class of the original message */
1483 md_mn_msgclass_t
class; /* class of the current message */
1485 int (*smgen
)(md_mn_msg_t
*msg
, md_mn_msg_t
**msglist
);
1487 orig_type
= msgtype
= msg
->msg_type
;
1488 sender
= msg
->msg_sender
;
1489 setno
= msg
->msg_setno
;
1491 result
= Zalloc(sizeof (md_mn_result_t
));
1492 result
->mmr_setno
= setno
;
1493 result
->mmr_msgtype
= msgtype
;
1494 MSGID_COPY(&(msg
->msg_msgid
), &(result
->mmr_msgid
));
1496 orig_class
= mdmn_get_message_class(msgtype
);
1498 commd_debug(MD_MMV_PROC_M
,
1499 "proc_mas: received (%d, 0x%llx-%d) set=%d, class=%d, type=%d\n",
1500 MSGID_ELEMS(msg
->msg_msgid
), setno
, orig_class
, msgtype
);
/* set_descriptor[setno] is read under the set_desc reader lock */
1502 (void) rw_rdlock(&set_desc_rwlock
[setno
]);
1503 set_master
= set_descriptor
[setno
]->sd_mn_master_nodeid
;
1504 result
->mmr_sender
= set_master
;
1506 * Put message into the change log unless told otherwise
1507 * Note that we only log original messages.
1508 * If they are generated by some smgen, we don't log them!
1509 * Replay messages aren't logged either.
1510 * Note, that replay messages are unlogged on completion.
1512 if ((msg
->msg_flags
& (MD_MSGF_NO_LOG
| MD_MSGF_REPLAY_MSG
)) == 0) {
1513 commd_debug(MD_MMV_PROC_M
,
1514 "proc_mas: calling log_msg for (%d,0x%llx-%d) type %d\n",
1515 MSGID_ELEMS(msg
->msg_msgid
), msgtype
);
1516 err
= mdmn_log_msg(msg
);
1517 if (err
== MDMNE_NULL
) {
1518 /* msg logged successfully */
1519 commd_debug(MD_MMV_PROC_M
, "proc_mas: "
1520 "done log_msg for (%d,0x%llx-%d) type %d\n",
1521 MSGID_ELEMS(msg
->msg_msgid
), msgtype
);
1524 if (err
== MDMNE_ACK
) {
1525 /* Same msg in the slot, proceed */
1526 commd_debug(MD_MMV_PROC_M
, "proc_mas: "
1527 "already logged (%d,0x%llx-%d) type %d\n",
1528 MSGID_ELEMS(msg
->msg_msgid
), msgtype
);
1531 if (err
== MDMNE_LOG_FAIL
) {
1532 /* Oh, bad, the log is non functional. */
1533 result
->mmr_comm_state
= MDMNE_LOG_FAIL
;
1535 * Note that the mark_busy was already done by
1538 (void) mutex_lock(&mdmn_busy_mutex
[setno
]);
1539 mdmn_mark_class_unbusy(setno
, orig_class
);
1540 (void) mutex_unlock(&mdmn_busy_mutex
[setno
]);
1543 if (err
== MDMNE_CLASS_BUSY
) {
1545 * The log is occupied with a different message
1546 * that needs to be played first.
1547 * We reject the current message with MDMNE_CLASS_BUSY
1548 * to the initiator and do not unbusy the set/class,
1549 * because we will proceed with the logged message,
1550 * which has the same set/class combination
1552 result
->mmr_comm_state
= MDMNE_CLASS_BUSY
;
1555 (void) rw_rdlock(&client_rwlock
[setno
]);
1557 if (check_client(setno
, sender
)) {
1558 commd_debug(MD_MMV_SYSLOG
,
1559 "proc_mas: No client for initiator \n");
1561 ret
= mdmn_wakeup_initiator_2(result
,
1562 client
[setno
][sender
], sender
);
1564 (void) rw_unlock(&client_rwlock
[setno
]);
1566 if (ret
== (int *)NULL
) {
1567 commd_debug(MD_MMV_SYSLOG
,
1568 "proc_mas: couldn't wakeup_initiator \n");
1570 if (*ret
!= MDMNE_ACK
) {
1571 commd_debug(MD_MMV_SYSLOG
, "proc_mas: "
1572 "wakeup_initiator returned %d\n", *ret
);
1578 if (err
== MDMNE_LOG_FAIL
) {
1579 /* we can't proceed here */
1580 free_result(result
);
1581 (void) rw_unlock(&set_desc_rwlock
[setno
]);
1583 } else if (err
== MDMNE_CLASS_BUSY
) {
1584 mdmn_changelog_record_t
*lr
;
1585 lr
= mdmn_get_changelogrec(setno
, orig_class
);
1588 /* proceed with the logged message */
1589 msg
= copy_msg(&(lr
->lr_msg
), NULL
);
1592 * The logged message has to have the same class but
1593 * type and sender can be different
1595 orig_type
= msgtype
= msg
->msg_type
;
1596 sender
= msg
->msg_sender
;
1598 commd_debug(MD_MMV_PROC_M
,
1599 "proc_mas: Got new message from change log: "
1600 "(%d,0x%llx-%d) type %d\n",
1601 MSGID_ELEMS(msg
->msg_msgid
), msgtype
);
1603 /* continue normal operation with this message */
1608 smgen
= mdmn_get_submessage_generator(msgtype
);
1609 if (smgen
== NULL
) {
1610 /* no submessages to create, just use the original message */
1614 /* some bits are passed on to submessages */
1615 inherit_flags
= msg
->msg_flags
& MD_MSGF_INHERIT_BITS
;
1617 nmsgs
= smgen(msg
, msglist
);
1619 /* some settings for the submessages */
1620 for (curmsg
= 0; curmsg
< nmsgs
; curmsg
++) {
1621 cmsg
= msglist
[curmsg
];
1623 /* Apply the inherited flags */
1624 cmsg
->msg_flags
|= inherit_flags
;
1627 * Make sure the submessage ID is set correctly
1628 * Note: first submessage has mid_smid of 1 (not 0)
1630 cmsg
->msg_msgid
.mid_smid
= curmsg
+ 1;
1632 /* need the original class set in msgID (for MCT) */
1633 cmsg
->msg_msgid
.mid_oclass
= orig_class
;
1636 commd_debug(MD_MMV_PROC_M
,
1637 "smgen generated %d submsgs, origclass = %d\n",
1641 * This big loop does the following.
1643 * process message on the master first (a message completion
1644 * table MCT ensures a message is not processed twice)
1645 * in case of an error break out of message loop
1646 * for all nodes -- unless MD_MSGF_NO_BCAST is set --
1647 * send message to node until that succeeds
1648 * merge result -- not yet implemented
1649 * respect MD_MSGF_STOP_ON_ERROR
1651 for (curmsg
= 0; curmsg
< nmsgs
; curmsg
++) {
1652 int break_msg_loop
= 0;
1653 mutex_t
*mx
; /* protection for class_busy */
1655 int master_exitval
= -1;
1657 cmsg
= msglist
[curmsg
];
1658 msgtype
= cmsg
->msg_type
;
1659 class = mdmn_get_message_class(msgtype
);
1661 mx
= mdmn_get_master_table_mx(setno
, class);
1663 /* If we are in the abort state, we error out immediately */
1664 if (md_commd_global_state
& MD_CGS_ABORTED
) {
1665 break; /* out of the message loop */
1668 commd_debug(MD_MMV_PROC_M
, "class=%d, orig_class=%d\n",
1671 * If the current class is different from the original class,
1672 * we have to lock it down.
1673 * The original class is already marked busy.
1674 * At this point we cannot refuse the message because the
1675 * class is busy right now, so we wait until the class becomes
1676 * available again. As soon as something changes for this set
1677 * we will be cond_signal'ed (in mdmn_mark_class_unbusy)
1679 * Granularity could be finer (setno/class)
1681 if (class != orig_class
) {
1682 (void) mutex_lock(&mdmn_busy_mutex
[setno
]);
1683 while (mdmn_mark_class_busy(setno
, class) == FALSE
) {
1684 (void) cond_wait(&mdmn_busy_cv
[setno
],
1685 &mdmn_busy_mutex
[setno
]);
1687 (void) mutex_unlock(&mdmn_busy_mutex
[setno
]);
1690 master_err
= do_message_locally(cmsg
, result
);
1692 if ((master_err
!= MDMNE_ACK
) ||
1693 ((master_err
== MDMNE_ACK
) && (result
->mmr_exitval
!= 0))) {
1694 result
->mmr_failing_node
= set_master
;
1695 if (cmsg
->msg_flags
& MD_MSGF_STOP_ON_ERROR
) {
1697 * if appropriate, unbusy the class and
1698 * break out of the message loop
1700 if (class != orig_class
) {
1702 &mdmn_busy_mutex
[setno
]);
1703 mdmn_mark_class_unbusy(setno
, class);
1704 (void) mutex_unlock(
1705 &mdmn_busy_mutex
[setno
]);
1711 if (master_err
== MDMNE_ACK
)
1712 master_exitval
= result
->mmr_exitval
;
1714 /* No broadcast? => next message */
1715 if (cmsg
->msg_flags
& MD_MSGF_NO_BCAST
) {
1716 /* if appropriate, unbusy the class */
1717 if (class != orig_class
) {
1718 (void) mutex_lock(&mdmn_busy_mutex
[setno
]);
1719 mdmn_mark_class_unbusy(setno
, class);
1720 (void) mutex_unlock(&mdmn_busy_mutex
[setno
]);
1726 /* fake sender, so we get notified when the results are avail */
1727 cmsg
->msg_sender
= set_master
;
1729 * register to the master_table. It's needed by wakeup_master to
1730 * wakeup the sleeping thread.
1731 * Access is protected by the class lock: mdmn_mark_class_busy()
1733 mdmn_set_master_table_id(setno
, class, &(cmsg
->msg_msgid
));
1737 (void) rw_rdlock(&set_desc_rwlock
[setno
]);
1738 /* Send the message to all other nodes */
1739 for (node
= set_descriptor
[setno
]->sd_nodelist
; node
;
1740 node
= node
->nd_next
) {
1741 md_mn_nodeid_t nid
= node
->nd_nodeid
;
1743 /* We are master and have already processed the msg */
1744 if (node
== set_descriptor
[setno
]->sd_mn_masternode
) {
1748 /* If this node didn't join the disk set, ignore it */
1749 if ((node
->nd_flags
& MD_MN_NODE_OWN
) == 0) {
1753 /* If a DIRECTED message, skip non-recipient nodes */
1754 if ((cmsg
->msg_flags
& MD_MSGF_DIRECTED
) &&
1755 nid
!= cmsg
->msg_recipient
) {
/* mx stays locked across do_send_message(), which waits on the cv paired with it */
1759 (void) mutex_lock(mx
);
1761 * Register the node that is addressed,
1762 * so we can detect unsolicited messages
1764 mdmn_set_master_table_addr(setno
, class, nid
);
1765 slave_result
= (md_mn_result_t
*)NULL
;
1768 * Now send it. do_send_message() will return if
1769 * a failure occurs or
1770 * the results are available
1772 err
= do_send_message(cmsg
, node
);
1774 /* in abort state, we error out immediately */
1775 if (md_commd_global_state
& MD_CGS_ABORTED
) {
1779 if (err
== MDMNE_ACK
) {
1781 mdmn_get_master_table_res(setno
, class);
1782 commd_debug(MD_MMV_PROC_M
,
1783 "proc_mas: got result for (%d,0x%llx-%d)\n",
1784 MSGID_ELEMS(cmsg
->msg_msgid
));
1785 } else if (err
== MDMNE_IGNORE_NODE
) {
1786 (void) mutex_unlock(mx
);
1787 continue; /* send to next node */
1789 (void) mutex_unlock(mx
);
1793 * If the result is NULL, or err doesn't show success,
1794 * something went wrong with this RPC call.
1796 if ((slave_result
== NULL
) || (err
!= MDMNE_ACK
)) {
1798 * If PANIC_WHEN_INCONSISTENT set,
1799 * panic if the master succeeded while
1802 if ((cmsg
->msg_flags
&
1803 MD_MSGF_PANIC_WHEN_INCONSISTENT
) &&
1804 (master_err
== MDMNE_ACK
))
1805 panic_system(nid
, cmsg
->msg_type
,
1806 master_err
, master_exitval
,
1809 result
->mmr_failing_node
= nid
;
1810 /* are we supposed to stop in case of error? */
1811 if (cmsg
->msg_flags
& MD_MSGF_STOP_ON_ERROR
) {
1812 result
->mmr_exitval
= MDMNE_RPC_FAIL
;
1813 commd_debug(MD_MMV_SYSLOG
, "proc_mas: "
1814 "result (%d,0x%llx-%d) is NULL\n",
1815 MSGID_ELEMS(cmsg
->msg_msgid
));
1818 break; /* out of node loop first */
1820 /* send msg to the next node */
1827 * Message processed on remote node.
1828 * If PANIC_WHEN_INCONSISTENT set, panic if the
1829 * result is different on this node from the result
1832 if ((cmsg
->msg_flags
&
1833 MD_MSGF_PANIC_WHEN_INCONSISTENT
) &&
1834 ((master_err
!= MDMNE_ACK
) ||
1835 (slave_result
->mmr_exitval
!= master_exitval
)))
1836 panic_system(nid
, cmsg
->msg_type
, master_err
,
1837 master_exitval
, slave_result
);
1840 * At this point we know we have a message that was
1841 * processed on the remote node.
1842 * We now check if the exitval is non zero.
1843 * In that case we discard the previous result and
1844 * rather use the current.
1845 * This means: If a message fails on no node,
1846 * the result from the master will be returned.
1847 * There's currently no such thing as merge of results
1848 * If additionally STOP_ON_ERROR is set, we bail out
1850 if (slave_result
->mmr_exitval
!= 0) {
1851 /* throw away the previously allocated result */
1852 free_result(result
);
1854 /* copy_result() allocates new memory */
1855 result
= copy_result(slave_result
);
1856 free_result(slave_result
);
1858 dump_result(MD_MMV_PROC_M
, "proc_mas", result
);
1860 result
->mmr_failing_node
= nid
;
1861 if (cmsg
->msg_flags
& MD_MSGF_STOP_ON_ERROR
) {
1863 break; /* out of node loop */
1865 continue; /* try next node */
1869 * MNIssue: may want to merge the results
1870 * from all slaves. Currently only report
1871 * the results from the master.
1873 free_result(slave_result
);
1876 } /* End of loop over the nodes */
1877 (void) rw_unlock(&set_desc_rwlock
[setno
]);
1880 /* release the current class again */
1881 if (class != orig_class
) {
1882 (void) mutex_lock(&mdmn_busy_mutex
[setno
]);
1883 mdmn_mark_class_unbusy(setno
, class);
1884 (void) mutex_unlock(&mdmn_busy_mutex
[setno
]);
1887 /* are we supposed to quit entirely ? */
1888 if (break_msg_loop
||
1889 (md_commd_global_state
& MD_CGS_ABORTED
)) {
1890 break; /* out of msg loop */
1893 } /* End of loop over the messages */
1895 * If we are here, there's two possibilities:
1896 * - we processed all messages on all nodes without an error.
1897 * In this case we return the result from the master.
1898 * (to be implemented: return the merged result)
1899 * - we encountered an error in which case result has been
1900 * set accordingly already.
1903 if (md_commd_global_state
& MD_CGS_ABORTED
) {
1904 result
->mmr_comm_state
= MDMNE_ABORT
;
1908 * This message has been processed completely.
1909 * Remove it from the changelog.
1910 * Do this for replay messages too.
1911 * Note that the message is unlogged before waking up the
1912 * initiator. This is done for two reasons.
1913 * 1. Remove a race condition that occurs when back to back
1914 * messages are sent for the same class, the registeration is
1916 * 2. If the initiator died but the action was completed on all the
1917 * the nodes, we want that to be marked "done" quickly.
1920 if ((msg
->msg_flags
& MD_MSGF_NO_LOG
) == 0) {
1921 commd_debug(MD_MMV_PROC_M
,
1922 "proc_mas: calling unlog_msg for (%d,0x%llx-%d) type %d\n",
1923 MSGID_ELEMS(msg
->msg_msgid
), msgtype
);
1924 (void) mdmn_unlog_msg(msg
);
1925 commd_debug(MD_MMV_PROC_M
,
1926 "proc_mas: done unlog_msg for (%d,0x%llx-%d) type %d\n",
1927 MSGID_ELEMS(msg
->msg_msgid
), msgtype
);
1931 * In case of submessages, we increased the submessage ID in the
1932 * result structure. We restore the message ID to the value that
1933 * the initiator is waiting for.
1935 result
->mmr_msgid
.mid_smid
= 0;
1936 result
->mmr_msgtype
= orig_type
;
1937 result
->mmr_sender
= set_master
;
1939 /* if we have an inited client, send result */
1942 (void) rw_rdlock(&client_rwlock
[setno
]);
1943 if (check_client(setno
, sender
)) {
1944 commd_debug(MD_MMV_SYSLOG
,
1945 "proc_mas: unable to create client for initiator\n");
1947 ret
= mdmn_wakeup_initiator_2(result
, client
[setno
][sender
],
1950 (void) rw_unlock(&client_rwlock
[setno
]);
1952 if (ret
== (int *)NULL
) {
1953 commd_debug(MD_MMV_PROC_M
,
1954 "proc_mas: couldn't wakeup initiator\n");
1956 if (*ret
!= MDMNE_ACK
) {
1957 commd_debug(MD_MMV_PROC_M
,
1958 "proc_mas: wakeup_initiator returned %d\n",
1964 (void) rw_unlock(&set_desc_rwlock
[setno
]);
1965 /* Free all submessages, if there were any */
1967 for (curmsg
= 0; curmsg
< nmsgs
; curmsg
++) {
1968 free_msg(msglist
[curmsg
]);
1971 /* Free the result */
1972 free_result(result
);
1974 (void) mutex_lock(&mdmn_busy_mutex
[setno
]);
1975 mdmn_mark_class_unbusy(setno
, orig_class
);
1976 (void) mutex_unlock(&mdmn_busy_mutex
[setno
]);
1980 * We use this ioctl just to get the time in the same format as used in
1981 * the messageID. If it fails, all we get is a bad runtime output.
1983 (void) metaioctl(MD_IOCGUNIQMSGID
, &dummyid
, &mde
, NULL
);
/*
 * NOTE(review): seconds come from the upper 32 bits of the mid_time
 * difference, the sub-second part from its low 20 bits (0xfffff) -
 * confirm this matches how mid_time is encoded.
 */
1984 secdiff
= (dummyid
.mid_time
- msg
->msg_msgid
.mid_time
) >> 32;
1985 usecdiff
= (dummyid
.mid_time
- msg
->msg_msgid
.mid_time
) & 0xfffff;
1987 /* catching possible overflow */
1988 if (usecdiff
>= 1000000) {
1989 usecdiff
-= 1000000;
1994 commd_debug(MD_MMV_PROC_M
, "proc_mas: done (%d, 0x%llx-%d) type=%02d "
1995 "%5d.%06d secs runtime\n",
1996 MSGID_ELEMS(msg
->msg_msgid
), orig_type
, secdiff
, usecdiff
);
1998 /* Free the original message */
/*
 * mdmn_slave_process_msg: slave-side processing of one message received
 * from the master.
 *
 * Steps visible in this listing:
 *  - build a result stamped with flags, set, type, this node's id
 *    (sd_mn_mynode->nd_nodeid) and an initial comm state of MDMNE_ACK
 *  - if no handler exists, report MDMNE_NO_HANDLER in the result
 *  - otherwise consult the MCT: run the handler with MD_MSGF_ON_SLAVE when
 *    the message is MDMN_MCT_NOT_DONE, reuse the stored result when it is
 *    MDMN_MCT_DONE, and free the result without replying when it is
 *    MDMN_MCT_IN_PROGRESS
 *  - return the result to the master with mdmn_wakeup_master_2(); the retry
 *    counter starts at 2 and the client is destroyed when the call
 *    returns NULL
 *
 * msg - the message to process; msg_sender is used as the node to reply to.
 * NOTE(review): declarations, braces and break/continue/return lines are
 * not present in this listing - confirm against the pristine source.
 */
2003 mdmn_slave_process_msg(md_mn_msg_t
*msg
)
2008 int successfully_returned
;
2010 md_mn_result_t
*result
;
2011 md_mn_nodeid_t sender
;
2012 md_mn_nodeid_t whoami
;
2013 md_mn_msgtype_t msgtype
;
2014 md_mn_msgclass_t
class;
2016 void (*handler
)(md_mn_msg_t
*msg
, uint_t flags
, md_mn_result_t
*res
);
2018 setno
= msg
->msg_setno
;
2019 sender
= msg
->msg_sender
; /* this is always the master of the set */
2020 msgtype
= msg
->msg_type
;
2022 (void) rw_rdlock(&set_desc_rwlock
[setno
]);
2023 whoami
= set_descriptor
[setno
]->sd_mn_mynode
->nd_nodeid
;
2024 (void) rw_unlock(&set_desc_rwlock
[setno
]);
2026 result
= Zalloc(sizeof (md_mn_result_t
));
2027 result
->mmr_flags
= msg
->msg_flags
;
2028 result
->mmr_setno
= setno
;
2029 result
->mmr_msgtype
= msgtype
;
2030 result
->mmr_sender
= whoami
;
2031 result
->mmr_comm_state
= MDMNE_ACK
; /* Ok state */
2032 MSGID_COPY(&(msg
->msg_msgid
), &(result
->mmr_msgid
));
2033 class = mdmn_get_message_class(msgtype
);
2035 commd_debug(MD_MMV_PROC_S
,
2036 "proc_sla: received (%d, 0x%llx-%d) set=%d, class=%d, type=%d\n",
2037 MSGID_ELEMS(msg
->msg_msgid
), setno
, class, msgtype
);
2039 handler
= mdmn_get_handler(msgtype
);
2041 if (handler
== NULL
) {
2042 result
->mmr_exitval
= 0;
2043 /* let the sender decide if this is an error or not */
2044 result
->mmr_comm_state
= MDMNE_NO_HANDLER
;
2045 commd_debug(MD_MMV_PROC_S
,
2046 "proc_sla: No handler for (%d, 0x%llx-%d)\n",
2047 MSGID_ELEMS(msg
->msg_msgid
));
2050 /* Did we already process this message ? */
2051 (void) mutex_lock(&mct_mutex
[setno
][class]);
2052 completed
= mdmn_check_completion(msg
, result
);
2054 if (completed
== MDMN_MCT_NOT_DONE
) {
2055 /* message not yet processed locally */
2056 commd_debug(MD_MMV_PROC_S
,
2057 "proc_sla: calling handler for (%d, 0x%llx-%d)\n",
2058 MSGID_ELEMS(msg
->msg_msgid
));
2061 * Mark the message as being currently processed,
2062 * so we won't start a second handler for it
2064 (void) mdmn_mark_completion(msg
, NULL
,
2065 MDMN_MCT_IN_PROGRESS
);
/* the MCT mutex is dropped while the handler runs and retaken afterwards */
2067 (void) mutex_unlock(&mct_mutex
[setno
][class]);
2068 (*handler
)(msg
, MD_MSGF_ON_SLAVE
, result
);
2070 commd_debug(MD_MMV_PROC_S
,
2071 "proc_sla: finished handler for (%d, 0x%llx-%d)\n",
2072 MSGID_ELEMS(msg
->msg_msgid
));
2074 (void) mutex_lock(&mct_mutex
[setno
][class]);
2075 /* Mark the message as fully done, store the result */
2076 (void) mdmn_mark_completion(msg
, result
, MDMN_MCT_DONE
);
2078 } else if (completed
== MDMN_MCT_DONE
) {
2079 /* message processed previously, got result from MCT */
2080 commd_debug(MD_MMV_PROC_S
,
2081 "proc_sla: result for (%d, 0x%llx-%d) from MCT\n",
2082 MSGID_ELEMS(msg
->msg_msgid
));
2083 } else if (completed
== MDMN_MCT_IN_PROGRESS
) {
2085 * If the message is curruntly being processed,
2086 * we can return here, without sending a result back.
2087 * This will be done by the initial message handling
2090 (void) mutex_unlock(&mct_mutex
[setno
][class]);
/*
 * NOTE(review): this call uses MD_MMV_PROC_M while the other calls in this
 * function use MD_MMV_PROC_S - confirm whether that is intended.
 */
2091 commd_debug(MD_MMV_PROC_M
, "proc_sla: "
2092 "(%d, 0x%llx-%d) is currently being processed\n",
2093 MSGID_ELEMS(msg
->msg_msgid
), msgtype
);
2096 free_result(result
);
2099 /* MCT error occurred (should never happen) */
2100 result
->mmr_comm_state
= MDMNE_LOG_FAIL
;
2101 commd_debug(MD_MMV_PROC_S
,
2102 "proc_sla: MCT error for (%d, 0x%llx-%d)\n",
2103 MSGID_ELEMS(msg
->msg_msgid
));
2105 (void) mutex_unlock(&mct_mutex
[setno
][class]);
2109 * At this point we have a result (even in an error case)
2110 * that we return to the master.
2112 (void) rw_rdlock(&set_desc_rwlock
[setno
]);
2113 retries
= 2; /* we will try two times to send the results */
2114 successfully_returned
= 0;
2116 while (!successfully_returned
&& (retries
!= 0)) {
2118 (void) rw_rdlock(&client_rwlock
[setno
]);
2119 if (check_client(setno
, sender
)) {
2121 * If we cannot setup the rpc connection to the master,
2122 * we can't do anything besides logging this fact.
/*
 * NOTE(review): the message below is prefixed "proc_mas" although this is
 * the slave path; the string is left unchanged here.
 */
2124 commd_debug(MD_MMV_SYSLOG
,
2125 "proc_mas: unable to create client for master\n");
2126 (void) rw_unlock(&client_rwlock
[setno
]);
2129 ret
= mdmn_wakeup_master_2(result
,
2130 client
[setno
][sender
], sender
);
2132 * if mdmn_wakeup_master_2 returns NULL, it can be that
2133 * the master (or the commd on the master) had died.
2134 * In that case, we destroy the client to the master
2136 * If mdmn_wakeup_master_2 doesn't return MDMNE_ACK,
2137 * the commd on the master is alive but
2138 * something else is wrong,
2139 * in that case a retry doesn't make sense => break out
2141 if (ret
== (int *)NULL
) {
2142 commd_debug(MD_MMV_PROC_S
,
2143 "proc_sla: wakeup_master returned NULL\n");
2144 /* release reader lock, grab writer lock */
2145 (void) rw_unlock(&client_rwlock
[setno
]);
2146 (void) rw_wrlock(&client_rwlock
[setno
]);
2147 mdmn_clnt_destroy(client
[setno
][sender
]);
2148 if (client
[setno
][sender
] != (CLIENT
*)NULL
) {
2149 client
[setno
][sender
] = (CLIENT
*)NULL
;
2151 (void) rw_unlock(&client_rwlock
[setno
]);
2153 commd_debug(MD_MMV_PROC_S
,
2154 "retries = %d\n", retries
);
2157 if (*ret
!= MDMNE_ACK
) {
2158 commd_debug(MD_MMV_PROC_S
, "proc_sla: "
2159 "wakeup_master returned %d\n", *ret
);
2160 (void) rw_unlock(&client_rwlock
[setno
]);
2162 } else { /* Good case */
2163 successfully_returned
= 1;
2164 (void) rw_unlock(&client_rwlock
[setno
]);
2169 (void) rw_unlock(&set_desc_rwlock
[setno
]);
2170 commd_debug(MD_MMV_PROC_S
, "proc_sla: done (%d, 0x%llx-%d)\n",
2171 MSGID_ELEMS(msg
->msg_msgid
));
2173 if (ret
!= (int *)NULL
)
2176 free_result(result
);
2183 * Check that the issuing node is a legitimate one (i.e. is licensed to send
2184 * messages to us), that the RPC request can be staged.
2187 * 0 => no RPC request is in-flight, no deferred svc_sendreply()
2188 * 1 => queued RPC request in-flight. Completion will be made (later)
2189 * by a wakeup_initiator_2() [hopefully]
2192 mdmn_send_svc_2(md_mn_msg_t
*omsg
, struct svc_req
*rqstp
)
2196 SVCXPRT
*transp
= rqstp
->rq_xprt
;
2198 md_mn_result_t
*resultp
;
2199 md_mn_msgclass_t
class;
2200 md_mn_msg_and_transp_t
*matp
;
2202 msg
= copy_msg(omsg
, NULL
);
2203 xdr_free(xdr_md_mn_msg_t
, (caddr_t
)omsg
);
2205 setno
= msg
->msg_setno
;
2206 class = mdmn_get_message_class(msg
->msg_type
);
2208 /* If we are in the abort state, we error out immediately */
2209 if (md_commd_global_state
& MD_CGS_ABORTED
) {
2210 resultp
= Zalloc(sizeof (md_mn_result_t
));
2211 resultp
->mmr_comm_state
= MDMNE_ABORT
;
2212 mdmn_svc_sendreply(transp
, xdr_md_mn_result_t
, (char *)resultp
);
2213 free_result(resultp
);
2214 svc_freeargs(transp
, xdr_md_mn_msg_t
, (caddr_t
)msg
);
2218 /* check if the global initialization is done */
2219 if ((md_commd_global_state
& MD_CGS_INITED
) == 0) {
2223 commd_debug(MD_MMV_SEND
,
2224 "send: received (%d, 0x%llx-%d), set=%d, class=%d, type=%d\n",
2225 MSGID_ELEMS(msg
->msg_msgid
), setno
, class, msg
->msg_type
);
2227 /* Check for verbosity related message */
2228 if (msg
->msg_type
== MD_MN_MSG_VERBOSITY
) {
2231 d
= (md_mn_verbose_t
*)((void *)(msg
->msg_event_data
));
2232 md_commd_global_verb
= d
->mmv_what
;
2233 /* everytime the bitmask is set, we reset the timer */
2234 __savetime
= gethrtime();
2236 * If local-only-flag is set, we are done here,
2237 * otherwise we pass that message on to the master.
2239 if (msg
->msg_flags
& MD_MSGF_LOCAL_ONLY
) {
2240 resultp
= Zalloc(sizeof (md_mn_result_t
));
2241 resultp
->mmr_comm_state
= MDMNE_ACK
;
2242 mdmn_svc_sendreply(transp
, xdr_md_mn_result_t
,
2244 free_result(resultp
);
2245 svc_freeargs(transp
, xdr_md_mn_msg_t
, (caddr_t
)msg
);
2251 * Are we entering the abort state?
2252 * Here we don't even need to check for MD_MSGF_LOCAL_ONLY, because
2253 * this message cannot be distributed anyway.
2254 * So, it's safe to return immediately.
2256 if (msg
->msg_type
== MD_MN_MSG_ABORT
) {
2257 md_commd_global_state
|= MD_CGS_ABORTED
;
2258 resultp
= Zalloc(sizeof (md_mn_result_t
));
2259 resultp
->mmr_comm_state
= MDMNE_ACK
;
2260 mdmn_svc_sendreply(transp
, xdr_md_mn_result_t
, (char *)resultp
);
2261 free_result(resultp
);
2262 svc_freeargs(transp
, xdr_md_mn_msg_t
, (caddr_t
)msg
);
2268 * Is this message type blocked?
2269 * If so we return MDMNE_CLASS_LOCKED, immediately
2271 if (msgtype_lock_state
[msg
->msg_type
] == MMTL_LOCK
) {
2272 resultp
= Zalloc(sizeof (md_mn_result_t
));
2273 resultp
->mmr_comm_state
= MDMNE_CLASS_LOCKED
;
2274 mdmn_svc_sendreply(transp
, xdr_md_mn_result_t
, (char *)resultp
);
2275 free_result(resultp
);
2276 svc_freeargs(transp
, xdr_md_mn_msg_t
, (caddr_t
)msg
);
2277 commd_debug(MD_MMV_SEND
,
2278 "send: type locked (%d, 0x%llx-%d), set=%d, class=%d, "
2279 "type=%d\n", MSGID_ELEMS(msg
->msg_msgid
), setno
, class,
2285 if (md_mn_set_inited
[setno
] != MDMN_SET_READY
) {
2286 /* Can only use the appropriate mutexes if they are inited */
2287 if (md_mn_set_inited
[setno
] & MDMN_SET_MUTEXES
) {
2288 (void) rw_wrlock(&set_desc_rwlock
[setno
]);
2289 (void) rw_wrlock(&client_rwlock
[setno
]);
2290 err
= mdmn_init_set(setno
, MDMN_SET_READY
);
2291 (void) rw_unlock(&client_rwlock
[setno
]);
2292 (void) rw_unlock(&set_desc_rwlock
[setno
]);
2294 err
= mdmn_init_set(setno
, MDMN_SET_READY
);
2298 /* couldn't initialize connections, cannot proceed */
2299 resultp
= Zalloc(sizeof (md_mn_result_t
));
2300 resultp
->mmr_comm_state
= err
;
2301 mdmn_svc_sendreply(transp
, xdr_md_mn_result_t
,
2303 svc_freeargs(transp
, xdr_md_mn_msg_t
, (caddr_t
)msg
);
2304 free_result(resultp
);
2305 commd_debug(MD_MMV_SEND
,
2306 "send: init err = %d\n", err
);
2311 (void) mutex_lock(&mdmn_busy_mutex
[setno
]);
2312 if ((mdmn_is_class_suspended(setno
, class) == TRUE
) &&
2313 ((msg
->msg_flags
& MD_MSGF_OVERRIDE_SUSPEND
) == 0)) {
2314 (void) mutex_unlock(&mdmn_busy_mutex
[setno
]);
2315 resultp
= Zalloc(sizeof (md_mn_result_t
));
2316 resultp
->mmr_comm_state
= MDMNE_SUSPENDED
;
2317 mdmn_svc_sendreply(transp
, xdr_md_mn_result_t
, (char *)resultp
);
2318 svc_freeargs(transp
, xdr_md_mn_msg_t
, (caddr_t
)msg
);
2319 free_result(resultp
);
2320 commd_debug(MD_MMV_SEND
,
2321 "send: class suspended (%d, 0x%llx-%d), set=%d, "
2322 "class=%d, type=%d\n", MSGID_ELEMS(msg
->msg_msgid
),
2323 setno
, class, msg
->msg_type
);
2326 (void) mutex_unlock(&mdmn_busy_mutex
[setno
]);
2328 /* is this rpc request coming from the local node? */
2329 if (check_license(rqstp
, 0) == FALSE
) {
2330 svc_freeargs(transp
, xdr_md_mn_msg_t
, (caddr_t
)msg
);
2331 commd_debug(MD_MMV_SEND
,
2332 "send: check licence fail(%d, 0x%llx-%d), set=%d, "
2333 "class=%d, type=%d\n", MSGID_ELEMS(msg
->msg_msgid
),
2334 setno
, class, msg
->msg_type
);
2340 * We allocate a structure that can take two pointers in order to pass
2341 * both the message and the transp into thread_create.
2342 * The free for this alloc is done in mdmn_send_to_work()
2344 matp
= Malloc(sizeof (md_mn_msg_and_transp_t
));
2345 matp
->mat_msg
= msg
;
2346 matp
->mat_transp
= transp
;
2349 * create a thread here that calls work on the master.
2350 * If we are already on the master, this would block if running
2351 * in the same context. (our service is single threaded)(
2352 * Make it a detached thread because it will not communicate with
2353 * anybody thru thr_* mechanisms
2355 (void) thr_create(NULL
, 0, mdmn_send_to_work
, (void *) matp
,
2356 THR_DETACHED
, NULL
);
2358 commd_debug(MD_MMV_SEND
, "send: done (%d, 0x%llx-%d)\n",
2359 MSGID_ELEMS(msg
->msg_msgid
));
2361 * We return here without sending results. This will be done by
2362 * mdmn_wakeup_initiator_svc_2() as soon as the results are available.
2363 * Until then the calling send_message will be blocked, while we
2364 * are able to take calls.
2372 mdmn_work_svc_2(md_mn_msg_t
*omsg
, struct svc_req
*rqstp
)
2379 md_mn_msgclass_t
class;
2381 retval
= Malloc(sizeof (int));
2383 /* If we are in the abort state, we error out immediately */
2384 if (md_commd_global_state
& MD_CGS_ABORTED
) {
2385 xdr_free(xdr_md_mn_msg_t
, (caddr_t
)omsg
);
2386 *retval
= MDMNE_ABORT
;
2390 msg
= copy_msg(omsg
, NULL
);
2391 xdr_free(xdr_md_mn_msg_t
, (caddr_t
)omsg
);
2394 * Is this message type blocked?
2395 * If so we return MDMNE_CLASS_LOCKED, immediately.
2396 * This check is performed on master and slave.
2398 if (msgtype_lock_state
[msg
->msg_type
] == MMTL_LOCK
) {
2399 *retval
= MDMNE_CLASS_LOCKED
;
2403 /* check if the global initialization is done */
2404 if ((md_commd_global_state
& MD_CGS_INITED
) == 0) {
2408 class = mdmn_get_message_class(msg
->msg_type
);
2409 setno
= msg
->msg_setno
;
2411 if (md_mn_set_inited
[setno
] != MDMN_SET_READY
) {
2412 /* Can only use the appropriate mutexes if they are inited */
2413 if (md_mn_set_inited
[setno
] & MDMN_SET_MUTEXES
) {
2414 (void) rw_wrlock(&set_desc_rwlock
[setno
]);
2415 (void) rw_wrlock(&client_rwlock
[setno
]);
2416 err
= mdmn_init_set(setno
, MDMN_SET_READY
);
2417 (void) rw_unlock(&client_rwlock
[setno
]);
2418 (void) rw_unlock(&set_desc_rwlock
[setno
]);
2420 err
= mdmn_init_set(setno
, MDMN_SET_READY
);
2424 *retval
= MDMNE_CANNOT_CONNECT
;
2430 /* is this rpc request coming from a licensed node? */
2431 if (check_license(rqstp
, msg
->msg_sender
) == FALSE
) {
2433 *retval
= MDMNE_RPC_FAIL
;
2437 commd_debug(MD_MMV_WORK
,
2438 "work: received (%d, 0x%llx-%d), set=%d, class=%d, type=%d, "
2440 MSGID_ELEMS(msg
->msg_msgid
), setno
, class, msg
->msg_type
,
2443 /* Check for various CLASS0 message types */
2444 if (msg
->msg_type
== MD_MN_MSG_VERBOSITY
) {
2447 d
= (md_mn_verbose_t
*)((void *)(msg
->msg_event_data
));
2448 /* for now we ignore set / class in md_mn_verbose_t */
2449 md_commd_global_verb
= d
->mmv_what
;
2450 /* everytime the bitmask is set, we reset the timer */
2451 __savetime
= gethrtime();
2454 (void) mutex_lock(&mdmn_busy_mutex
[setno
]);
2456 /* check if class is locked via a call to mdmn_comm_lock_svc_2 */
2457 if (mdmn_is_class_locked(setno
, class) == TRUE
) {
2458 (void) mutex_unlock(&mdmn_busy_mutex
[setno
]);
2459 *retval
= MDMNE_CLASS_LOCKED
;
2463 (void) mutex_unlock(&mdmn_busy_mutex
[setno
]);
2465 /* Check if the class is busy right now. Do it only on the master */
2466 (void) rw_rdlock(&set_desc_rwlock
[setno
]);
2467 if (set_descriptor
[setno
]->sd_mn_am_i_master
) {
2468 (void) rw_unlock(&set_desc_rwlock
[setno
]);
2470 * If the class is currently suspended, don't accept new
2471 * messages, unless they are flagged with an override bit.
2473 (void) mutex_lock(&mdmn_busy_mutex
[setno
]);
2474 if ((mdmn_is_class_suspended(setno
, class) == TRUE
) &&
2475 ((msg
->msg_flags
& MD_MSGF_OVERRIDE_SUSPEND
) == 0)) {
2476 (void) mutex_unlock(&mdmn_busy_mutex
[setno
]);
2477 *retval
= MDMNE_SUSPENDED
;
2478 commd_debug(MD_MMV_SEND
,
2479 "send: set %d is suspended\n", setno
);
2483 if (mdmn_mark_class_busy(setno
, class) == FALSE
) {
2484 (void) mutex_unlock(&mdmn_busy_mutex
[setno
]);
2485 *retval
= MDMNE_CLASS_BUSY
;
2489 (void) mutex_unlock(&mdmn_busy_mutex
[setno
]);
2491 * Because the real processing of the message takes time we
2492 * create a thread for it. So the master thread can continue
2493 * to run and accept further messages.
2495 *retval
= thr_create(NULL
, 0,
2496 (void *(*)(void *))mdmn_master_process_msg
, (void *)msg
,
2497 THR_DETACHED
|THR_SUSPENDED
, &tid
);
2499 (void) rw_unlock(&set_desc_rwlock
[setno
]);
2500 *retval
= thr_create(NULL
, 0,
2501 (void *(*)(void *)) mdmn_slave_process_msg
, (void *)msg
,
2502 THR_DETACHED
|THR_SUSPENDED
, &tid
);
2506 *retval
= MDMNE_THR_CREATE_FAIL
;
2511 /* Now run the new thread */
2512 (void) thr_continue(tid
);
2514 commd_debug(MD_MMV_WORK
,
2515 "work: done (%d, 0x%llx-%d), set=%d, class=%d, type=%d\n",
2516 MSGID_ELEMS(msg
->msg_msgid
), setno
, class, msg
->msg_type
);
2518 *retval
= MDMNE_ACK
; /* this means success */
2524 mdmn_wakeup_initiator_svc_2(md_mn_result_t
*res
, struct svc_req
*rqstp
)
2530 mutex_t
*mx
; /* protection of initiator_table */
2531 SVCXPRT
*transp
= NULL
;
2532 md_mn_msgid_t initiator_table_id
;
2533 md_mn_msgclass_t
class;
2535 retval
= Malloc(sizeof (int));
2537 /* check if the global initialization is done */
2538 if ((md_commd_global_state
& MD_CGS_INITED
) == 0) {
2542 setno
= res
->mmr_setno
;
2544 if (md_mn_set_inited
[setno
] != MDMN_SET_READY
) {
2545 /* set not ready means we just crashed are restarted now */
2546 /* Can only use the appropriate mutexes if they are inited */
2547 if (md_mn_set_inited
[setno
] & MDMN_SET_MUTEXES
) {
2548 (void) rw_wrlock(&set_desc_rwlock
[setno
]);
2549 (void) rw_wrlock(&client_rwlock
[setno
]);
2550 err
= mdmn_init_set(setno
, MDMN_SET_READY
);
2551 (void) rw_unlock(&client_rwlock
[setno
]);
2552 (void) rw_unlock(&set_desc_rwlock
[setno
]);
2554 err
= mdmn_init_set(setno
, MDMN_SET_READY
);
2558 *retval
= MDMNE_CANNOT_CONNECT
;
2559 xdr_free(xdr_md_mn_result_t
, (caddr_t
)res
);
2564 /* is this rpc request coming from a licensed node? */
2565 if (check_license(rqstp
, res
->mmr_sender
) == FALSE
) {
2566 xdr_free(xdr_md_mn_result_t
, (caddr_t
)res
);
2567 *retval
= MDMNE_RPC_FAIL
;
2572 class = mdmn_get_message_class(res
->mmr_msgtype
);
2573 mx
= mdmn_get_initiator_table_mx(setno
, class);
2575 commd_debug(MD_MMV_WAKE_I
,
2576 "wake_ini: received (%d, 0x%llx-%d) set=%d, class=%d, type=%d\n",
2577 MSGID_ELEMS(res
->mmr_msgid
), setno
, class, res
->mmr_msgtype
);
2579 (void) mutex_lock(mx
);
2582 * Search the initiator wakeup table.
2583 * If we find an entry here (which should always be true)
2584 * we are on the initiating node and we wakeup the original
2587 mdmn_get_initiator_table_id(setno
, class, &initiator_table_id
);
2589 if (MSGID_CMP(&(initiator_table_id
), &(res
->mmr_msgid
))) {
2590 transp
= mdmn_get_initiator_table_transp(setno
, class);
2591 mdmn_svc_sendreply(transp
, xdr_md_mn_result_t
, (char *)res
);
2593 mdmn_unregister_initiator_table(setno
, class);
2594 *retval
= MDMNE_ACK
;
2596 commd_debug(MD_MMV_WAKE_I
,
2597 "wake_ini: replied (%d, 0x%llx-%d)\n",
2598 MSGID_ELEMS(res
->mmr_msgid
));
2600 commd_debug(MD_MMV_WAKE_I
,
2601 "wakeup initiator: unsolicited message (%d, 0x%llx-%d)\n",
2602 MSGID_ELEMS(res
->mmr_msgid
));
2603 *retval
= MDMNE_NO_WAKEUP_ENTRY
;
2605 (void) mutex_unlock(mx
);
2606 /* less work for check_timeouts */
2607 (void) mutex_lock(&check_timeout_mutex
);
2608 if (messages_on_their_way
== 0) {
2609 commd_debug(MD_MMV_WAKE_I
,
2610 "Oops, messages_on_their_way < 0 (%d, 0x%llx-%d)\n",
2611 MSGID_ELEMS(res
->mmr_msgid
));
2613 messages_on_their_way
--;
2615 (void) mutex_unlock(&check_timeout_mutex
);
2616 xdr_free(xdr_md_mn_result_t
, (caddr_t
)res
);
2623 * res must be free'd by the thread we wake up
2627 mdmn_wakeup_master_svc_2(md_mn_result_t
*ores
, struct svc_req
*rqstp
)
2635 md_mn_msgid_t master_table_id
;
2636 md_mn_nodeid_t sender
;
2637 md_mn_result_t
*res
;
2638 md_mn_msgclass_t
class;
2640 retval
= Malloc(sizeof (int));
2642 /* check if the global initialization is done */
2643 if ((md_commd_global_state
& MD_CGS_INITED
) == 0) {
2647 /* Need to copy the results here, as they are static for RPC */
2648 res
= copy_result(ores
);
2649 xdr_free(xdr_md_mn_result_t
, (caddr_t
)ores
);
2651 class = mdmn_get_message_class(res
->mmr_msgtype
);
2652 setno
= res
->mmr_setno
;
2654 if (md_mn_set_inited
[setno
] != MDMN_SET_READY
) {
2655 /* set not ready means we just crashed are restarted now */
2656 /* Can only use the appropriate mutexes if they are inited */
2657 if (md_mn_set_inited
[setno
] & MDMN_SET_MUTEXES
) {
2658 (void) rw_wrlock(&set_desc_rwlock
[setno
]);
2659 (void) rw_wrlock(&client_rwlock
[setno
]);
2660 err
= mdmn_init_set(setno
, MDMN_SET_READY
);
2661 (void) rw_unlock(&client_rwlock
[setno
]);
2662 (void) rw_unlock(&set_desc_rwlock
[setno
]);
2664 err
= mdmn_init_set(setno
, MDMN_SET_READY
);
2668 *retval
= MDMNE_CANNOT_CONNECT
;
2669 xdr_free(xdr_md_mn_result_t
, (caddr_t
)res
);
2674 /* is this rpc request coming from a licensed node? */
2675 if (check_license(rqstp
, res
->mmr_sender
) == FALSE
) {
2676 *retval
= MDMNE_RPC_FAIL
;
2677 xdr_free(xdr_md_mn_result_t
, (caddr_t
)res
);
2682 commd_debug(MD_MMV_WAKE_M
,
2683 "wake_mas: received (%d, 0x%llx-%d) set=%d, class=%d, type=%d "
2685 MSGID_ELEMS(res
->mmr_msgid
), setno
, class, res
->mmr_msgtype
,
2688 * The mutex and cv are needed for waking up the thread
2689 * sleeping in mdmn_master_process_msg()
2691 mx
= mdmn_get_master_table_mx(setno
, class);
2692 cv
= mdmn_get_master_table_cv(setno
, class);
2695 * lookup the master wakeup table
2696 * If we find our message, we are on the master and
2697 * called by a slave that finished processing a message.
2698 * We store the results in the appropriate slot and
2699 * wakeup the thread (mdmn_master_process_msg()) waiting for them.
2701 (void) mutex_lock(mx
);
2702 mdmn_get_master_table_id(setno
, class, &master_table_id
);
2703 sender
= mdmn_get_master_table_addr(setno
, class);
2705 if (MSGID_CMP(&(master_table_id
), &(res
->mmr_msgid
))) {
2706 if (sender
== res
->mmr_sender
) {
2707 mdmn_set_master_table_res(setno
, class, res
);
2708 (void) cond_signal(cv
);
2709 *retval
= MDMNE_ACK
;
2711 /* id is correct but wrong sender (I smell a timeout) */
2712 commd_debug(MD_MMV_WAKE_M
,
2713 "wakeup master got unsolicited message: "
2714 "(%d, 0x%llx-%d) from %d\n",
2715 MSGID_ELEMS(res
->mmr_msgid
), res
->mmr_sender
);
2717 *retval
= MDMNE_TIMEOUT
;
2720 /* id is wrong, smells like a very late timeout */
2721 commd_debug(MD_MMV_WAKE_M
,
2722 "wakeup master got unsolicited message: "
2723 "(%d, 0x%llx-%d) from %d, expected (%d, 0x%llx-%d)\n",
2724 MSGID_ELEMS(res
->mmr_msgid
), res
->mmr_sender
,
2725 MSGID_ELEMS(master_table_id
));
2727 *retval
= MDMNE_NO_WAKEUP_ENTRY
;
2730 (void) mutex_unlock(mx
);
2736 * Lock a set/class combination.
2737 * This is mainly done for debug purpose.
2738 * This set/class combination immediately is blocked,
2739 * even in the middle of sending messages to multiple slaves.
2740 * This remains until the user issues a mdmn_comm_unlock_svc_2 for the same
2741 * set/class combination.
2743 * Special messages of class MD_MSG_CLASS0 can never be locked.
2744 * e.g. MD_MN_MSG_VERBOSITY, MD_MN_MSG_ABORT
2746 * That means, if MD_MSG_CLASS0 is specified, we lock all classes from
2747 * >= MD_MSG_CLASS1 to < MD_MN_NCLASSES
2749 * set must be between 1 and MD_MAXSETS
2751 * MD_MSG_CLASS0 which means all other classes in this case
2752 * or one specific class (< MD_MN_NCLASSES)
2755 * MDMNE_ACK on sucess (locking a locked class is Ok)
2756 * MDMNE_EINVAL if a parameter is out of range
2761 mdmn_comm_lock_svc_2(md_mn_set_and_class_t
*msc
, struct svc_req
*rqstp
)
2764 set_t setno
= msc
->msc_set
;
2765 md_mn_msgclass_t
class = msc
->msc_class
;
2767 retval
= Malloc(sizeof (int));
2769 /* check if the global initialization is done */
2770 if ((md_commd_global_state
& MD_CGS_INITED
) == 0) {
2774 /* is this rpc request coming from the local node ? */
2775 if (check_license(rqstp
, 0) == FALSE
) {
2776 xdr_free(xdr_md_mn_set_and_class_t
, (caddr_t
)msc
);
2777 *retval
= MDMNE_RPC_FAIL
;
2781 /* Perform some range checking */
2782 if ((setno
== 0) || (setno
>= MD_MAXSETS
) ||
2783 (class < MD_MSG_CLASS0
) || (class >= MD_MN_NCLASSES
)) {
2784 *retval
= MDMNE_EINVAL
;
2788 commd_debug(MD_MMV_MISC
, "lock: set=%d, class=%d\n", setno
, class);
2789 (void) mutex_lock(&mdmn_busy_mutex
[setno
]);
2790 if (class != MD_MSG_CLASS0
) {
2791 mdmn_mark_class_locked(setno
, class);
2793 /* MD_MSG_CLASS0 is used as a wild card for all classes */
2794 for (class = MD_MSG_CLASS1
; class < MD_MN_NCLASSES
; class++) {
2795 mdmn_mark_class_locked(setno
, class);
2798 (void) mutex_unlock(&mdmn_busy_mutex
[setno
]);
2800 *retval
= MDMNE_ACK
;
2805 * Unlock a set/class combination.
2806 * set must be between 1 and MD_MAXSETS
2808 * MD_MSG_CLASS0 which means all other classes in this case (like above)
2809 * or one specific class (< MD_MN_NCLASSES)
2812 * MDMNE_ACK on sucess (unlocking an unlocked class is Ok)
2813 * MDMNE_EINVAL if a parameter is out of range
2817 mdmn_comm_unlock_svc_2(md_mn_set_and_class_t
*msc
, struct svc_req
*rqstp
)
2820 set_t setno
= msc
->msc_set
;
2821 md_mn_msgclass_t
class = msc
->msc_class
;
2823 retval
= Malloc(sizeof (int));
2825 /* check if the global initialization is done */
2826 if ((md_commd_global_state
& MD_CGS_INITED
) == 0) {
2830 /* is this rpc request coming from the local node ? */
2831 if (check_license(rqstp
, 0) == FALSE
) {
2832 xdr_free(xdr_md_mn_set_and_class_t
, (caddr_t
)msc
);
2833 *retval
= MDMNE_RPC_FAIL
;
2837 /* Perform some range checking */
2838 if ((setno
== 0) || (setno
>= MD_MAXSETS
) ||
2839 (class < MD_MSG_CLASS0
) || (class >= MD_MN_NCLASSES
)) {
2840 *retval
= MDMNE_EINVAL
;
2843 commd_debug(MD_MMV_MISC
, "unlock: set=%d, class=%d\n", setno
, class);
2845 (void) mutex_lock(&mdmn_busy_mutex
[setno
]);
2846 if (class != MD_MSG_CLASS0
) {
2847 mdmn_mark_class_unlocked(setno
, class);
2849 /* MD_MSG_CLASS0 is used as a wild card for all classes */
2850 for (class = MD_MSG_CLASS1
; class < MD_MN_NCLASSES
; class++) {
2851 mdmn_mark_class_unlocked(setno
, class);
2854 (void) mutex_unlock(&mdmn_busy_mutex
[setno
]);
2856 *retval
= MDMNE_ACK
;
2861 * mdmn_comm_suspend_svc_2(setno, class)
2863 * Drain all outstanding messages for a given set/class combination
2864 * and don't allow new messages to be processed.
2866 * Special messages of class MD_MSG_CLASS0 can never be locked.
2867 * e.g. MD_MN_MSG_VERBOSITY
2869 * 1 <= setno < MD_MAXSETS or setno == MD_COMM_ALL_SETS
2870 * 1 <= class < MD_MN_NCLASSES or class == MD_COMM_ALL_CLASSES
2872 * If class _is_not_ MD_COMM_ALL_CLASSES, then we simply mark this
2873 * one class as being suspended.
2874 * If messages for this class are currently on their way,
2875 * MDMNE_SET_NOT_DRAINED is returned. Otherwise MDMNE_ACK is returned.
2877 * If class _is_ MD_COMM_ALL_CLASSES we drain all classes of this set.
2878 * Messages must be generated in ascending order.
2879 * This means, a message cannot create submessages with the same or lower class.
2880 * Draining messages must go from 1 to NCLASSES in order to ensure we don't
2881 * generate a hanging situation here.
2882 * We mark class 1 as being suspended.
2883 * if the class is not busy, we proceed with class 2
2885 * if a class *is* busy, we cannot continue here, but return
2886 * MDMNE_SET_NOT_DRAINED.
2887 * We expect the caller to hold on for some seconds and try again.
2888 * When that message, that held the class busy is done in
2889 * mdmn_master_process_msg(), mdmn_mark_class_unbusy() called.
2890 * There it is checked if the class is about to drain.
2891 * In that case it tries to drain all higher classes there.
2893 * If setno is MD_COMM_ALL_SETS then we perform this on all possible sets.
2894 * In that case we return MDMNE_SET_NOT_DRAINED if not all sets are
2895 * completely drained.
2898 * MDMNE_ACK on sucess (set is drained, no outstanding messages)
2899 * MDMNE_SET_NOT_DRAINED if drain process is started, but there are
2900 * still outstanding messages for this set(s)
2901 * MDMNE_EINVAL if setno is out of range
2902 * MDMNE_NOT_JOINED if the set is not yet initialized on this node
2907 mdmn_comm_suspend_svc_2(md_mn_set_and_class_t
*msc
, struct svc_req
*rqstp
)
2911 set_t startset
, endset
;
2912 set_t setno
= msc
->msc_set
;
2913 md_mn_msgclass_t oclass
= msc
->msc_class
;
2914 #ifdef NOT_YET_NEEDED
2915 uint_t flags
= msc
->msc_flags
;
2916 #endif /* NOT_YET_NEEDED */
2917 md_mn_msgclass_t
class;
2919 retval
= Malloc(sizeof (int));
2921 /* check if the global initialization is done */
2922 if ((md_commd_global_state
& MD_CGS_INITED
) == 0) {
2926 /* is this rpc request coming from the local node ? */
2927 if (check_license(rqstp
, 0) == FALSE
) {
2928 xdr_free(xdr_md_mn_set_and_class_t
, (caddr_t
)msc
);
2929 *retval
= MDMNE_RPC_FAIL
;
2933 commd_debug(MD_MMV_MISC
, "suspend: called for set=%d class=%d\n",
2936 /* Perform some range checking */
2937 if (setno
>= MD_MAXSETS
) {
2938 *retval
= MDMNE_EINVAL
;
2939 commd_debug(MD_MMV_MISC
, "suspend: returning MDMNE_EINVAL\n");
2943 /* setno == MD_COMM_ALL_SETS means: we walk thru all possible sets. */
2944 if (setno
== MD_COMM_ALL_SETS
) {
2946 endset
= MD_MAXSETS
- 1;
2952 for (setno
= startset
; setno
<= endset
; setno
++) {
2953 /* Here we need the mutexes for the set to be setup */
2954 if (md_mn_set_inited
[setno
] != MDMN_SET_MUTEXES
) {
2955 (void) mdmn_init_set(setno
, MDMN_SET_MUTEXES
);
2958 (void) mutex_lock(&mdmn_busy_mutex
[setno
]);
2959 /* shall we drain all classes of this set? */
2960 if (oclass
== MD_COMM_ALL_CLASSES
) {
2961 for (class = 1; class < MD_MN_NCLASSES
; class ++) {
2962 commd_debug(MD_MMV_MISC
,
2963 "suspend: suspending set %d, class %d\n",
2965 *retval
= mdmn_mark_class_suspended(setno
,
2966 class, MDMN_SUSPEND_ALL
);
2967 if (*retval
== MDMNE_SET_NOT_DRAINED
) {
2972 /* only drain one specific class */
2973 commd_debug(MD_MMV_MISC
,
2974 "suspend: suspending set=%d class=%d\n",
2976 *retval
= mdmn_mark_class_suspended(setno
, oclass
,
2978 if (*retval
== MDMNE_SET_NOT_DRAINED
) {
2982 (void) mutex_unlock(&mdmn_busy_mutex
[setno
]);
2984 /* If one or more sets are not entirely drained, failure is non-zero */
2986 *retval
= MDMNE_SET_NOT_DRAINED
;
2987 commd_debug(MD_MMV_MISC
,
2988 "suspend: returning MDMNE_SET_NOT_DRAINED\n");
2990 *retval
= MDMNE_ACK
;
2997 * mdmn_comm_resume_svc_2(setno, class)
2999 * Resume processing messages for a given set.
3000 * This incorporates the repeal of a previous suspend operation.
3002 * 1 <= setno < MD_MAXSETS or setno == MD_COMM_ALL_SETS
3003 * 1 <= class < MD_MN_NCLASSES or class == MD_COMM_ALL_CLASSES
3005 * If class _is_not_ MD_COMM_ALL_CLASSES, then we simply mark this
3006 * one class as being resumed.
3008 * If class _is_ MD_COMM_ALL_CLASSES we resume all classes of this set.
3010 * If setno is MD_COMM_ALL_SETS then we perform this on all possible sets.
3012 * If both setno is MD_COMM_ALL_SETS and class is MD_COMM_ALL_CLASSES we also
3013 * reset any ABORT flag from the global state.
3016 * MDMNE_ACK on sucess (resuming an unlocked set is Ok)
3017 * MDMNE_EINVAL if setno is out of range
3018 * MDMNE_NOT_JOINED if the set is not yet initialized on this node
3022 mdmn_comm_resume_svc_2(md_mn_set_and_class_t
*msc
, struct svc_req
*rqstp
)
3025 set_t startset
, endset
;
3026 set_t setno
= msc
->msc_set
;
3027 md_mn_msgclass_t oclass
= msc
->msc_class
;
3028 uint_t flags
= msc
->msc_flags
;
3029 md_mn_msgclass_t
class;
3031 retval
= Malloc(sizeof (int));
3033 /* check if the global initialization is done */
3034 if ((md_commd_global_state
& MD_CGS_INITED
) == 0) {
3038 /* is this rpc request coming from the local node ? */
3039 if (check_license(rqstp
, 0) == FALSE
) {
3040 xdr_free(xdr_md_mn_set_and_class_t
, (caddr_t
)msc
);
3041 *retval
= MDMNE_RPC_FAIL
;
3045 commd_debug(MD_MMV_MISC
, "resume: called for set=%d class=%d\n",
3048 /* Perform some range checking */
3049 if (setno
> MD_MAXSETS
) {
3050 *retval
= MDMNE_EINVAL
;
3054 if (setno
== MD_COMM_ALL_SETS
) {
3056 endset
= MD_MAXSETS
- 1;
3057 if (oclass
== MD_COMM_ALL_CLASSES
) {
3058 /* This is the point where we "unabort" the commd */
3059 commd_debug(MD_MMV_MISC
, "resume: resetting ABORT\n");
3060 md_commd_global_state
&= ~MD_CGS_ABORTED
;
3067 for (setno
= startset
; setno
<= endset
; setno
++) {
3069 /* Here we need the mutexes for the set to be setup */
3070 if ((md_mn_set_inited
[setno
] & MDMN_SET_MUTEXES
) == 0) {
3071 (void) mdmn_init_set(setno
, MDMN_SET_MUTEXES
);
3074 (void) mutex_lock(&mdmn_busy_mutex
[setno
]);
3076 if (oclass
== MD_COMM_ALL_CLASSES
) {
3079 * When SUSPENDing all classes, we go
3080 * from 1 to MD_MN_NCLASSES-1
3081 * The correct reverse action is RESUMing
3082 * from MD_MN_NCLASSES-1 to 1 (or 2)
3085 if (flags
& MD_MSCF_DONT_RESUME_CLASS1
) {
3090 * Then mark all classes of this set as no longer
3091 * suspended. This supersedes any previous suspend(1)
3092 * calls and resumes the set entirely.
3094 for (class = MD_MN_NCLASSES
- 1; class >= end_class
;
3096 commd_debug(MD_MMV_MISC
,
3097 "resume: resuming set=%d class=%d\n",
3099 mdmn_mark_class_resumed(setno
, class,
3100 (MDMN_SUSPEND_ALL
| MDMN_SUSPEND_1
));
3104 * In this case only one class is marked as not
3105 * suspended. If a suspend(all) is currently active for
3106 * this set, this class will still be suspended.
3107 * That state will be cleared by a suspend(all)
3110 commd_debug(MD_MMV_MISC
,
3111 "resume: resuming set=%d class=%d\n",
3113 mdmn_mark_class_resumed(setno
, oclass
, MDMN_SUSPEND_1
);
3116 (void) mutex_unlock(&mdmn_busy_mutex
[setno
]);
3119 *retval
= MDMNE_ACK
;
3124 mdmn_comm_reinit_set_svc_2(set_t
*setnop
, struct svc_req
*rqstp
)
3127 md_mnnode_desc
*node
;
3128 set_t setno
= *setnop
;
3130 retval
= Malloc(sizeof (int));
3132 /* check if the global initialization is done */
3133 if ((md_commd_global_state
& MD_CGS_INITED
) == 0) {
3137 /* is this rpc request coming from the local node ? */
3138 if (check_license(rqstp
, 0) == FALSE
) {
3139 xdr_free(xdr_set_t
, (caddr_t
)setnop
);
3140 *retval
= MDMNE_RPC_FAIL
;
3144 commd_debug(MD_MMV_MISC
, "reinit: set=%d\n", setno
);
3146 (void) rw_rdlock(&set_desc_rwlock
[setno
]);
3148 * We assume, that all messages have been suspended previously.
3150 * As we are modifying lots of clients here we grab the client_rwlock
3151 * in writer mode. This ensures, no new messages come in.
3153 (void) rw_wrlock(&client_rwlock
[setno
]);
3154 /* This set is no longer initialized */
3156 if ((set_descriptor
[setno
] != NULL
) &&
3157 (md_mn_set_inited
[setno
] & MDMN_SET_NODES
)) {
3158 /* destroy all rpc clients from this set */
3159 for (node
= set_descriptor
[setno
]->sd_nodelist
; node
;
3160 node
= node
->nd_next
) {
3162 * Since the CLIENT for ourself will be recreated
3163 * shortly, and this node is guaranteed to be
3164 * there after a reconfig, there's no reason to go
3165 * through destroying it. It also avoids an issue
3166 * with calling clnt_create() later from within the
3167 * server thread, which can effectively deadlock
3168 * itself due to RPC design limitations.
3170 if (node
== set_descriptor
[setno
]->sd_mn_mynode
)
3172 mdmn_clnt_destroy(client
[setno
][node
->nd_nodeid
]);
3173 if (client
[setno
][node
->nd_nodeid
] != (CLIENT
*)NULL
) {
3174 client
[setno
][node
->nd_nodeid
] = (CLIENT
*)NULL
;
3177 md_mn_set_inited
[setno
] &= ~MDMN_SET_NODES
;
3180 commd_debug(MD_MMV_MISC
, "reinit: done init_set(%d)\n", setno
);
3182 (void) rw_unlock(&client_rwlock
[setno
]);
3183 (void) rw_unlock(&set_desc_rwlock
[setno
]);
3184 *retval
= MDMNE_ACK
;
3189 * This is just an interface for testing purpose.
3190 * Here we can disable single message types.
3191 * If we block a message type, this is valid for all MN sets.
3192 * If a message arrives later, and it's message type is blocked, it will
3193 * be returned immediately with MDMNE_CLASS_LOCKED, which causes the sender to
3194 * resend this message over and over again.
3199 mdmn_comm_msglock_svc_2(md_mn_type_and_lock_t
*mmtl
, struct svc_req
*rqstp
)
3202 md_mn_msgtype_t type
= mmtl
->mmtl_type
;
3203 uint_t lock
= mmtl
->mmtl_lock
;
3205 retval
= Malloc(sizeof (int));
3207 /* check if the global initialization is done */
3208 if ((md_commd_global_state
& MD_CGS_INITED
) == 0) {
3212 /* is this rpc request coming from the local node ? */
3213 if (check_license(rqstp
, 0) == FALSE
) {
3214 xdr_free(xdr_md_mn_type_and_lock_t
, (caddr_t
)mmtl
);
3215 *retval
= MDMNE_RPC_FAIL
;
3219 /* Perform some range checking */
3220 if ((type
== 0) || (type
>= MD_MN_NMESSAGES
)) {
3221 *retval
= MDMNE_EINVAL
;
3225 commd_debug(MD_MMV_MISC
, "msglock: type=%d, lock=%d\n", type
, lock
);
3226 msgtype_lock_state
[type
] = lock
;
3228 *retval
= MDMNE_ACK
;