6580729 node 16th joined, metaclust timed out in step4, local_daemon has rpc tli...
[unleashed.git] / usr / src / lib / lvm / libmeta / common / meta_mn_changelog.c
blob388a5d9de757474d908d6b7a44c66b81fdcca677
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
27 #include <stdlib.h>
28 #include <unistd.h>
29 #include <wait.h>
30 #include <sys/time.h>
31 #include <meta.h>
32 #include <metad.h>
33 #include <mdmn_changelog.h>
34 #include <syslog.h>
35 #include <umem.h>
38 * Number of log entries per set.
40 * We want at least 4 spares available at all times
41 * in case new classes are added during a live upgrade.
43 * Allocate the entries in chunks of 16
45 #define MDMN_LOGRECS_QUANTA 16
46 #define MDMN_LOGRECS_MINSPARES 4
47 #define MDMN_LOGHDR_SIZE sizeof (mdmn_changelog_record_t)
48 #define MDMN_LOGRECSIZE (MDMN_LOGHDR_SIZE + MD_MN_MSG_MAXDATALEN)
49 #define MDMN_LOGRECSIZE_OD sizeof (mdmn_changelog_record_od_t)
50 #define MDMN_LOGRECS_TRIMUP ((MD_MN_NCLASSES % MDMN_LOGRECS_QUANTA) > \
51 (MDMN_LOGRECS_QUANTA - MDMN_LOGRECS_MINSPARES))
53 static int mdmn_commitlog(md_set_desc *, md_error_t *);
54 static int mdmn_log_it(set_t, md_error_t *, mdmn_changelog_record_t *lr);
57 /* Global variables */
59 mdmn_changelog_record_t *mdmn_changelog[MD_MAXSETS];
60 int mdmn_changelog_snarfed[MD_MAXSETS];
62 /* Total number of log records */
63 int mdmn_logrecs = (MDMN_LOGRECS_QUANTA +
64 ((MD_MN_NCLASSES/MDMN_LOGRECS_QUANTA) * MDMN_LOGRECS_QUANTA));
66 #ifdef DEBUG
67 void
68 dump_rec(char *fn_name, mdmn_changelog_record_t *lr)
70 syslog(LOG_DEBUG, "%s incore: selfid 0x%x class %d flags %d "
71 "msglen %d\n", fn_name, lr->lr_selfid, lr->lr_class,
72 lr->lr_flags, lr->lr_msglen);
74 void
75 dump_rec_od(char *fn_name, mdmn_changelog_record_od_t *lr)
77 syslog(LOG_DEBUG, "%s ondisk: selfid 0x%x class %d flags %d "
78 "msglen %d\n", fn_name, lr->lr_selfid, lr->lr_class,
79 lr->lr_flags, lr->lr_msglen);
82 void
83 dump_array(char *fn_name, set_t setno)
85 int i;
86 char tchar[80];
88 mdmn_changelog_record_t *tlr;
90 for (i = 0; i < mdmn_logrecs; i++) {
91 tlr = &mdmn_changelog[setno][i];
92 (void) snprintf(tchar, sizeof (tchar), "%s class %d ",
93 fn_name, i);
94 dump_rec(tchar, tlr);
97 #endif
100 * copy_changelog: copies changelog ondisk<->incore records.
101 * The argument "direction" controls the direction to copy the
102 * the records. Incore and ondisk changlog structures must be
103 * allocated when calling this routine.
105 * The purpose of changelog is to store a message that is in progress.
106 * Therefore the changlog structure embeds the message structure.
107 * Incore and ondisk changelog structures are created to handle the
108 * incore and ondisk message formats. The incore message has a pointer
109 * to the payload. The ondisk message format has payload embedded as
110 * part of the message.
112 * Caveat Emptor: Incore and ondisk structures have the payload buffers
113 * correctly allocated.
116 static void
117 copy_changelog(mdmn_changelog_record_t *incp,
118 mdmn_changelog_record_od_t *odp, int direction)
120 assert(incp != NULL && odp != NULL);
121 assert((direction == MD_MN_COPY_TO_ONDISK) ||
122 (direction == MD_MN_COPY_TO_INCORE));
124 if (direction == MD_MN_COPY_TO_ONDISK) {
125 odp->lr_revision = incp->lr_revision;
126 odp->lr_flags = incp->lr_flags;
127 odp->lr_selfid = incp->lr_selfid;
128 odp->lr_class = incp->lr_class;
129 odp->lr_msglen = incp->lr_msglen;
130 if (incp->lr_msglen)
131 copy_msg_2(&incp->lr_msg, &odp->lr_od_msg, direction);
132 } else {
133 incp->lr_revision = odp->lr_revision;
134 incp->lr_flags = odp->lr_flags;
135 incp->lr_selfid = odp->lr_selfid;
136 incp->lr_class = odp->lr_class;
137 incp->lr_msglen = odp->lr_msglen;
138 if (odp->lr_msglen)
139 copy_msg_2(&incp->lr_msg, &odp->lr_od_msg, direction);
144 * mdmn_allocate_changelog
146 * Changelog records are allocated on a per multi-node basis.
147 * This routine is called during MN set creation.
148 * It pre-allocates the changelog, as user records
149 * one per message class plus some spares.
150 * Once the records are allocated they are never freed until
151 * the mddb is deleted. The preallocation ensures that all nodes
152 * will have a consistent view of the mddb.
154 * Each record is large enough to hold a maximum sized message
155 * Return Values:
156 * 0 - success
157 * -1 - fail
160 mdmn_allocate_changelog(mdsetname_t *sp, md_error_t *ep)
162 mddb_userreq_t req;
163 md_set_desc *sd;
164 mdmn_changelog_record_t *tlr;
165 int i;
166 set_t setno;
168 /* Get a pointer to the incore md_set_desc for this MN set */
169 if ((sd = metaget_setdesc(sp, ep)) == NULL)
170 return (-1);
171 setno = sd->sd_setno;
173 * Round up the number of changelog records
174 * to the next value of MDMN_LOGRECS_QUANTA
176 * In all cases, make sure we have at least
177 * four more entries than the number of classes
178 * in order to provide space for live upgrades that
179 * might add classes.
182 mdmn_logrecs += (MDMN_LOGRECS_TRIMUP) ? MDMN_LOGRECS_QUANTA : 0;
184 mdmn_changelog[setno] = Zalloc(MDMN_LOGHDR_SIZE * mdmn_logrecs);
186 for (i = 0; i < mdmn_logrecs; i++) {
187 (void) memset(&req, 0, sizeof (req));
188 METAD_SETUP_LR(MD_DB_CREATE, setno, 0);
189 /* grab a record big enough for max message size */
190 req.ur_size = MDMN_LOGRECSIZE_OD;
192 if (metaioctl(MD_MN_DB_USERREQ, &req, &req.ur_mde, NULL) != 0) {
193 (void) mdstealerror(ep, &req.ur_mde);
194 #ifdef DEBUG
195 syslog(LOG_DEBUG, "allocate_log: %s\n",
196 mde_sperror(ep, ""));
197 #endif
198 Free(mdmn_changelog[setno]);
199 return (-1);
202 tlr = &mdmn_changelog[setno][i];
203 tlr->lr_selfid = req.ur_recid;
204 tlr->lr_revision = MD_MN_CHANGELOG_RECORD_REVISION;
205 tlr->lr_class = i;
208 /* commit class, and selfid */
209 (void) mdmn_commitlog(sd, ep);
210 Free(mdmn_changelog[setno]);
211 return (0);
215 * mdmn_reset_changelog
217 * Called during reconfig step 2.
218 * The only time the changelog is reset is when all nodes in a cluster
219 * are starting up. In this case changelog must be ignored, therefore
220 * it is reset.
222 * The function frees the incore data structures and zeros out the
223 * records. The ondisk records are never freed.
225 * Return Values:
226 * 0 - success
227 * -1 - fail
230 mdmn_reset_changelog(mdsetname_t *sp, md_error_t *ep, int flag)
232 md_set_desc *sd;
233 mdmn_changelog_record_t *lr;
234 set_t setno;
235 int lrc;
237 /* Get a pointer to the incore md_set_desc this MN set */
238 if ((sd = metaget_setdesc(sp, ep)) == NULL)
239 return (-1);
241 setno = sd->sd_setno;
243 if (mdmn_snarf_changelog(setno, ep) == 0) {
244 return (0);
247 if (flag & MDMN_CLF_RESETLOG) {
248 for (lrc = 0; lrc < mdmn_logrecs; lrc++) {
249 lr = &mdmn_changelog[setno][lrc];
250 Free(lr->lr_msg.msg_event_data);
251 (void) memset(&lr->lr_msg, 0, sizeof (md_mn_msg_t));
252 lr->lr_msglen = 0;
253 lr->lr_flags = 0;
255 (void) mdmn_commitlog(sd, ep);
256 #ifdef DEBUG
257 syslog(LOG_DEBUG, "reset_changelog: Log reset\n");
258 #endif
260 /* now zap the array */
261 if (flag & MDMN_CLF_RESETCACHE) {
262 #ifdef DEBUG
263 syslog(LOG_DEBUG, "reset_changelog: cache reset\n");
264 #endif
265 Free(&mdmn_changelog[setno]);
266 mdmn_changelog[setno] = NULL;
267 mdmn_changelog_snarfed[setno] = 0;
269 return (0);
273 * Log a given message in the changelog.
274 * This function is only executed by the master node
275 * Return Values:
276 * MDMNE_NULL:
277 * success, the log slot is free
279 * MDMNE_ACK:
280 * success,
281 * the log slot is occupied with the same msg from a previous try.
283 * MDMNE_CLASS_BUSY:
284 * This means the appropriate slot is occupied with a different
285 * message. In that case the stored message needs being replayed,
286 * while the current message will be rejected with MDMNE_CLASS_BUSY
287 * to the initiator.
289 * MDMNE_LOG_FAIL:
290 * Bad things happend, cannot continue.
293 mdmn_log_msg(md_mn_msg_t *msg)
295 set_t setno;
296 md_mn_msgclass_t class;
297 mdmn_changelog_record_t *lr;
298 md_error_t err = mdnullerror;
299 md_error_t *ep = &err;
300 int retval = 0;
302 setno = msg->msg_setno;
303 class = mdmn_get_message_class(msg->msg_type);
305 /* if not snarfed, snarf it */
306 if (mdmn_snarf_changelog(setno, ep) <= 0) {
307 syslog(LOG_DAEMON | LOG_ERR, dgettext(TEXT_DOMAIN,
308 "log_msg: No records snarfed\n"));
309 return (-1);
313 /* log entry for the class */
314 lr = &mdmn_changelog[setno][class];
316 /* Check if the class is occupied */
317 if (lr->lr_flags & MD_MN_LR_INUSE) {
318 if (!MSGID_CMP(&(msg->msg_msgid), &(lr->lr_msg.msg_msgid))) {
319 syslog(LOG_DAEMON | LOG_DEBUG, dgettext(TEXT_DOMAIN,
320 "log_msg: id mismatch:\n"
321 " stored : ID = (%d, 0x%llx-%d)"
322 " setno %d class %d type %d\n"
323 " msg to log: ID = (%d, 0x%llx-%d)"
324 " setno %d class %d type %d.\n"),
325 MSGID_ELEMS(lr->lr_msg.msg_msgid), lr->lr_setno,
326 lr->lr_class, lr->lr_msgtype,
327 MSGID_ELEMS(msg->msg_msgid), msg->msg_setno, class,
328 msg->msg_type);
329 return (MDMNE_CLASS_BUSY);
330 } else {
331 syslog(LOG_DAEMON | LOG_DEBUG, dgettext(TEXT_DOMAIN,
332 "log_msg: msgid already logged:\n ID = "
333 " (%d, 0x%llx-%d) setno %d class %d type %d\n"),
334 MSGID_ELEMS(lr->lr_msg.msg_msgid), lr->lr_setno,
335 lr->lr_class, lr->lr_msgtype);
336 return (MDMNE_ACK);
340 lr->lr_flags |= MD_MN_LR_INUSE;
341 lr->lr_msglen = MD_MN_MSG_LEN(msg);
342 assert(lr->lr_msg.msg_event_data == NULL);
343 if (msg->msg_event_size)
344 lr->lr_msg.msg_event_data = Zalloc(msg->msg_event_size);
345 (void) copy_msg(msg, &(lr->lr_msg));
346 retval = mdmn_log_it(setno, ep, lr);
347 if (retval != 0) {
348 syslog(LOG_DAEMON | LOG_ERR, dgettext(TEXT_DOMAIN,
349 "mdmn_log_msg - failure committing logged msg to disk\n"));
350 return (MDMNE_LOG_FAIL);
353 return (MDMNE_NULL); /* this is good */
357 * mdmn_unlog_msg(md_mn_msg_t *)
359 * Clear the log entry holding the indicated message.
360 * Only the set master can do this.
362 * Return Values:
363 * 0 - success
364 * -1 - fail
367 mdmn_unlog_msg(md_mn_msg_t *msg)
369 set_t setno;
370 md_mn_msgclass_t class;
371 md_error_t err = mdnullerror;
372 md_error_t *ep = &err;
373 int retval = 0;
374 mdmn_changelog_record_t *lr = NULL;
376 setno = msg->msg_setno;
377 class = mdmn_get_message_class(msg->msg_type);
379 /* Find the log entry holding the indicated message */
380 if (mdmn_snarf_changelog(setno, ep) == 0)
381 return (-1);
383 lr = &mdmn_changelog[setno][class];
385 /* assert the message is still logged */
386 assert(lr != NULL);
387 if (!MSGID_CMP(&(msg->msg_msgid), &(lr->lr_msg.msg_msgid))) {
388 syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
389 "unlog_msg: msgid mismatch\n"
390 "\t\tstored: ID = (%d, 0x%llx-%d) setno %d "
391 "class %d type %d\n"
392 "\t\tattempting to unlog:\n"
393 "\t\tID = (%d, 0x%llx-%d) setno %d class %d type %d.\n"),
394 MSGID_ELEMS(lr->lr_msg.msg_msgid), lr->lr_setno,
395 lr->lr_class, lr->lr_msgtype, MSGID_ELEMS(msg->msg_msgid),
396 msg->msg_setno, class, msg->msg_type);
397 return (-1);
399 lr->lr_msglen = 0;
400 lr->lr_flags &= ~(MD_MN_LR_INUSE);
401 if (lr->lr_msg.msg_event_data) {
402 Free(lr->lr_msg.msg_event_data);
403 lr->lr_msg.msg_event_data = NULL;
405 /* commit the updated log record to disk */
406 retval = mdmn_log_it(setno, ep, lr);
407 #ifdef DEBUG
408 dump_rec("mdmn_unlog_msg: ", lr);
409 #endif
410 return (retval);
415 * mdmn_get_changelogrec(set_t , md_mn_msgclass_t)
416 * Returns a pointer to incore changelog record.
418 * Return Values:
419 * non-NULL - success
420 * NULL - fail
422 mdmn_changelog_record_t *
423 mdmn_get_changelogrec(set_t setno, md_mn_msgclass_t class)
425 md_error_t err = mdnullerror;
427 if (mdmn_snarf_changelog(setno, &err) == 0)
428 return (NULL);
429 assert(mdmn_changelog[setno] != NULL);
431 return (&mdmn_changelog[setno][class]);
435 * mdmn_commitlog(md_set_desc *, md_error_t *)
437 * Commit the set record and all of the changelog entry records to disk.
438 * Don't bother with other stuff hanging off the set record
439 * (e.g. drive records) since none of that is changing.
440 * Called only at changelog pre-allocation time or when flushing a log.
442 * Return Values:
443 * 0 - success
444 * errno - fail
447 static int
448 mdmn_commitlog(md_set_desc *sd, md_error_t *ep)
450 int lrc;
451 int *recs;
452 uint_t size;
453 mdmn_changelog_record_t *lr;
454 mdmn_changelog_record_od_t clodrec; /* changelog ondisk record */
455 mddb_userreq_t req;
456 int retval = 0;
457 set_t setno;
459 /* Check for master and bounce non-master requests */
460 if (!(MD_MNSET_DESC(sd)) || !sd->sd_mn_am_i_master) {
461 if (!(MD_MNSET_DESC(sd))) {
462 syslog(LOG_DAEMON | LOG_ERR, dgettext(TEXT_DOMAIN,
463 "mdmn_commitlog - Not MN Set\n"));
464 } else {
465 syslog(LOG_DAEMON | LOG_ERR, dgettext(TEXT_DOMAIN,
466 "mdmn_commit_log - Not Master\n"));
468 return (-1);
470 (void) memset(&req, 0, sizeof (req));
471 /* create the records to commit the info to the mddb */
473 size = (mdmn_logrecs + 1) * sizeof (int);
474 recs = Zalloc(size);
475 /* Initialize the log entry records for update */
476 setno = sd->sd_setno;
478 for (lrc = 0; lrc < mdmn_logrecs; lrc++) {
479 lr = &mdmn_changelog[setno][lrc];
480 recs[lrc] = lr->lr_selfid;
481 copy_changelog(lr, &clodrec, MD_MN_COPY_TO_ONDISK);
482 METAD_SETUP_LR(MD_DB_SETDATA, setno, lr->lr_selfid);
483 req.ur_size = MDMN_LOGRECSIZE_OD;
484 req.ur_data = (uintptr_t)&clodrec;
485 if ((retval = metaioctl(MD_MN_DB_USERREQ, &req, &req.ur_mde,
486 NULL)) != 0) {
487 (void) mdstealerror(ep, &req.ur_mde);
488 #ifdef DEBUG
489 syslog(LOG_DAEMON|LOG_DEBUG,
490 "mdmn_commitlog - metaioctl SETDATA failure\n%s",
491 mde_sperror(ep, ""));
492 #endif
493 break;
497 if (retval == 0) {
498 /* set last rec to be 0 to indicate completion */
499 recs[lrc] = 0;
500 /* Commit to mddb on disk */
501 METAD_SETUP_LR(MD_DB_COMMIT_MANY, setno,
502 mdmn_changelog[setno][0].lr_selfid);
503 req.ur_size = size;
504 req.ur_data = (uintptr_t)recs;
505 if ((retval = metaioctl(MD_MN_DB_USERREQ, &req,
506 &req.ur_mde, NULL)) != 0) {
507 (void) mdstealerror(ep, &req.ur_mde);
508 #ifdef DEBUG
509 syslog(LOG_DAEMON|LOG_DEBUG,
510 "mdmn_commitlog - metaioctl COMMIT_MANY"
511 "Failure\n%s", mde_sperror(ep, ""));
512 #endif
516 Free(recs);
517 return (retval);
521 * mdmn_log_it(set_t, md_error_t *, mdmn_changelog_record_t *)
523 * Commit the changed log record to disk.
525 * Return Values:
526 * 0 - success
527 * -1 - fail
529 static int
530 mdmn_log_it(set_t set, md_error_t *ep, mdmn_changelog_record_t *lr)
532 int *recs;
533 uint_t size;
534 mddb_userreq_t req;
535 mdmn_changelog_record_od_t clodrec;
537 (void) memset(&req, 0, sizeof (req));
539 /* Initialize the log entry record for update */
541 copy_changelog(lr, &clodrec, MD_MN_COPY_TO_ONDISK);
542 METAD_SETUP_LR(MD_DB_SETDATA, set, lr->lr_selfid);
543 req.ur_size = MDMN_LOGRECSIZE_OD;
544 req.ur_data = (uintptr_t)&clodrec;
545 if (metaioctl(MD_MN_DB_USERREQ, &req, &req.ur_mde, NULL) != 0) {
546 (void) mdstealerror(ep, &req.ur_mde);
547 #ifdef DEBUG
548 syslog(LOG_DEBUG, "mdmn_log_it: DB_SETDATA failed\n"
549 "set %d selfid %d, size %d\n%s", set, lr->lr_selfid,
550 req.ur_size, mde_sperror(ep, ""));
551 #endif
552 return (-1);
554 /* Set up the recid to be updated */
555 size = 2 * sizeof (int); /* the changed record, plus null terminator */
556 recs = Zalloc(size);
557 recs[0] = lr->lr_selfid;
558 recs[1] = 0;
559 /* Commit to mddb on disk */
560 METAD_SETUP_LR(MD_DB_COMMIT_ONE, set, lr->lr_selfid);
561 req.ur_size = size;
562 req.ur_data = (uintptr_t)recs;
563 if (metaioctl(MD_MN_DB_USERREQ, &req, &req.ur_mde, NULL) != 0) {
564 (void) mdstealerror(ep, &req.ur_mde);
565 #ifdef DEBUG
566 syslog(LOG_DEBUG, "mdmn_log_it: DB_COMMIT_ONE failed\n"
567 "set %d selfid %d, size %d\n%s", set, lr->lr_selfid,
568 req.ur_size, mde_sperror(ep, ""));
569 #endif
570 Free(recs);
571 return (-1);
573 Free(recs);
574 return (0);
578 * mdmn_snarf_changelog(set_t, md_error_t *)
580 * snarf in the changelog entries and allocate incore structures
581 * if required.
582 * mdmn_changelog_snarfed array if set to MDMN_CLF_SNARFED, then
583 * then the records are already snarfed.
585 * Called from set_snarf(), mdmn_log_msg(), and mdmn_unlog_msg()
586 * Return Values:
587 * non-zero - success
588 * 0 - fail
591 mdmn_snarf_changelog(set_t set, md_error_t *ep)
593 mdmn_changelog_record_t *tlr;
594 mdmn_changelog_record_od_t *lr;
595 mddb_recid_t id;
596 md_mn_msgclass_t class;
599 if (set == MD_LOCAL_SET)
600 return (0);
602 id = 0;
604 if (mdmn_changelog_snarfed[set] & MDMN_CLF_SNARFED) {
605 assert(mdmn_changelog[set] != NULL);
606 return (mdmn_logrecs);
609 lr = (mdmn_changelog_record_od_t *)get_ur_rec(set, MD_UR_GET_NEXT,
610 MDDB_UR_LR, &id, ep);
611 if (lr == NULL)
612 return (0);
614 /* only allocate if Log records exist */
616 if (mdmn_changelog[set] == NULL) {
617 /* Allocate incore state for the log */
618 mdmn_changelog[set] = Zalloc(MDMN_LOGHDR_SIZE *
619 mdmn_logrecs);
622 do {
623 class = lr->lr_class;
624 tlr = &mdmn_changelog[set][class];
625 copy_changelog(tlr, lr, MD_MN_COPY_TO_INCORE);
626 Free(lr);
627 lr = (mdmn_changelog_record_od_t *)get_ur_rec(set,
628 MD_UR_GET_NEXT, MDDB_UR_LR, &id, ep);
629 } while (lr != NULL);
631 /* Since log records counts are fixed return that value */
632 mdmn_changelog_snarfed[set] |= MDMN_CLF_SNARFED;
633 return (mdmn_logrecs);