4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
33 #include <mdmn_changelog.h>
38 * Number of log entries per set.
40 * We want at least 4 spares available at all times
41 * in case new classes are added during a live upgrade.
43 * Allocate the entries in chunks of 16
45 #define MDMN_LOGRECS_QUANTA 16
46 #define MDMN_LOGRECS_MINSPARES 4
47 #define MDMN_LOGHDR_SIZE sizeof (mdmn_changelog_record_t)
48 #define MDMN_LOGRECSIZE (MDMN_LOGHDR_SIZE + MD_MN_MSG_MAXDATALEN)
49 #define MDMN_LOGRECSIZE_OD sizeof (mdmn_changelog_record_od_t)
50 #define MDMN_LOGRECS_TRIMUP ((MD_MN_NCLASSES % MDMN_LOGRECS_QUANTA) > \
51 (MDMN_LOGRECS_QUANTA - MDMN_LOGRECS_MINSPARES))
53 static int mdmn_commitlog(md_set_desc
*, md_error_t
*);
54 static int mdmn_log_it(set_t
, md_error_t
*, mdmn_changelog_record_t
*lr
);
57 /* Global variables */
59 mdmn_changelog_record_t
*mdmn_changelog
[MD_MAXSETS
];
60 int mdmn_changelog_snarfed
[MD_MAXSETS
];
62 /* Total number of log records */
63 int mdmn_logrecs
= (MDMN_LOGRECS_QUANTA
+
64 ((MD_MN_NCLASSES
/MDMN_LOGRECS_QUANTA
) * MDMN_LOGRECS_QUANTA
));
68 dump_rec(char *fn_name
, mdmn_changelog_record_t
*lr
)
70 syslog(LOG_DEBUG
, "%s incore: selfid 0x%x class %d flags %d "
71 "msglen %d\n", fn_name
, lr
->lr_selfid
, lr
->lr_class
,
72 lr
->lr_flags
, lr
->lr_msglen
);
75 dump_rec_od(char *fn_name
, mdmn_changelog_record_od_t
*lr
)
77 syslog(LOG_DEBUG
, "%s ondisk: selfid 0x%x class %d flags %d "
78 "msglen %d\n", fn_name
, lr
->lr_selfid
, lr
->lr_class
,
79 lr
->lr_flags
, lr
->lr_msglen
);
83 dump_array(char *fn_name
, set_t setno
)
88 mdmn_changelog_record_t
*tlr
;
90 for (i
= 0; i
< mdmn_logrecs
; i
++) {
91 tlr
= &mdmn_changelog
[setno
][i
];
92 (void) snprintf(tchar
, sizeof (tchar
), "%s class %d ",
100 * copy_changelog: copies changelog ondisk<->incore records.
101 * The argument "direction" controls the direction to copy the
102 * the records. Incore and ondisk changlog structures must be
103 * allocated when calling this routine.
105 * The purpose of changelog is to store a message that is in progress.
106 * Therefore the changlog structure embeds the message structure.
107 * Incore and ondisk changelog structures are created to handle the
108 * incore and ondisk message formats. The incore message has a pointer
109 * to the payload. The ondisk message format has payload embedded as
110 * part of the message.
112 * Caveat Emptor: Incore and ondisk structures have the payload buffers
113 * correctly allocated.
117 copy_changelog(mdmn_changelog_record_t
*incp
,
118 mdmn_changelog_record_od_t
*odp
, int direction
)
120 assert(incp
!= NULL
&& odp
!= NULL
);
121 assert((direction
== MD_MN_COPY_TO_ONDISK
) ||
122 (direction
== MD_MN_COPY_TO_INCORE
));
124 if (direction
== MD_MN_COPY_TO_ONDISK
) {
125 odp
->lr_revision
= incp
->lr_revision
;
126 odp
->lr_flags
= incp
->lr_flags
;
127 odp
->lr_selfid
= incp
->lr_selfid
;
128 odp
->lr_class
= incp
->lr_class
;
129 odp
->lr_msglen
= incp
->lr_msglen
;
131 copy_msg_2(&incp
->lr_msg
, &odp
->lr_od_msg
, direction
);
133 incp
->lr_revision
= odp
->lr_revision
;
134 incp
->lr_flags
= odp
->lr_flags
;
135 incp
->lr_selfid
= odp
->lr_selfid
;
136 incp
->lr_class
= odp
->lr_class
;
137 incp
->lr_msglen
= odp
->lr_msglen
;
139 copy_msg_2(&incp
->lr_msg
, &odp
->lr_od_msg
, direction
);
144 * mdmn_allocate_changelog
146 * Changelog records are allocated on a per multi-node basis.
147 * This routine is called during MN set creation.
148 * It pre-allocates the changelog, as user records
149 * one per message class plus some spares.
150 * Once the records are allocated they are never freed until
151 * the mddb is deleted. The preallocation ensures that all nodes
152 * will have a consistent view of the mddb.
154 * Each record is large enough to hold a maximum sized message
160 mdmn_allocate_changelog(mdsetname_t
*sp
, md_error_t
*ep
)
164 mdmn_changelog_record_t
*tlr
;
168 /* Get a pointer to the incore md_set_desc for this MN set */
169 if ((sd
= metaget_setdesc(sp
, ep
)) == NULL
)
171 setno
= sd
->sd_setno
;
173 * Round up the number of changelog records
174 * to the next value of MDMN_LOGRECS_QUANTA
176 * In all cases, make sure we have at least
177 * four more entries than the number of classes
178 * in order to provide space for live upgrades that
182 mdmn_logrecs
+= (MDMN_LOGRECS_TRIMUP
) ? MDMN_LOGRECS_QUANTA
: 0;
184 mdmn_changelog
[setno
] = Zalloc(MDMN_LOGHDR_SIZE
* mdmn_logrecs
);
186 for (i
= 0; i
< mdmn_logrecs
; i
++) {
187 (void) memset(&req
, 0, sizeof (req
));
188 METAD_SETUP_LR(MD_DB_CREATE
, setno
, 0);
189 /* grab a record big enough for max message size */
190 req
.ur_size
= MDMN_LOGRECSIZE_OD
;
192 if (metaioctl(MD_MN_DB_USERREQ
, &req
, &req
.ur_mde
, NULL
) != 0) {
193 (void) mdstealerror(ep
, &req
.ur_mde
);
195 syslog(LOG_DEBUG
, "allocate_log: %s\n",
196 mde_sperror(ep
, ""));
198 Free(mdmn_changelog
[setno
]);
202 tlr
= &mdmn_changelog
[setno
][i
];
203 tlr
->lr_selfid
= req
.ur_recid
;
204 tlr
->lr_revision
= MD_MN_CHANGELOG_RECORD_REVISION
;
208 /* commit class, and selfid */
209 (void) mdmn_commitlog(sd
, ep
);
210 Free(mdmn_changelog
[setno
]);
215 * mdmn_reset_changelog
217 * Called during reconfig step 2.
218 * The only time the changelog is reset is when all nodes in a cluster
219 * are starting up. In this case changelog must be ignored, therefore
222 * The function frees the incore data structures and zeros out the
223 * records. The ondisk records are never freed.
230 mdmn_reset_changelog(mdsetname_t
*sp
, md_error_t
*ep
, int flag
)
233 mdmn_changelog_record_t
*lr
;
237 /* Get a pointer to the incore md_set_desc this MN set */
238 if ((sd
= metaget_setdesc(sp
, ep
)) == NULL
)
241 setno
= sd
->sd_setno
;
243 if (mdmn_snarf_changelog(setno
, ep
) == 0) {
247 if (flag
& MDMN_CLF_RESETLOG
) {
248 for (lrc
= 0; lrc
< mdmn_logrecs
; lrc
++) {
249 lr
= &mdmn_changelog
[setno
][lrc
];
250 Free(lr
->lr_msg
.msg_event_data
);
251 (void) memset(&lr
->lr_msg
, 0, sizeof (md_mn_msg_t
));
255 (void) mdmn_commitlog(sd
, ep
);
257 syslog(LOG_DEBUG
, "reset_changelog: Log reset\n");
260 /* now zap the array */
261 if (flag
& MDMN_CLF_RESETCACHE
) {
263 syslog(LOG_DEBUG
, "reset_changelog: cache reset\n");
265 Free(&mdmn_changelog
[setno
]);
266 mdmn_changelog
[setno
] = NULL
;
267 mdmn_changelog_snarfed
[setno
] = 0;
273 * Log a given message in the changelog.
274 * This function is only executed by the master node
277 * success, the log slot is free
281 * the log slot is occupied with the same msg from a previous try.
284 * This means the appropriate slot is occupied with a different
285 * message. In that case the stored message needs being replayed,
286 * while the current message will be rejected with MDMNE_CLASS_BUSY
290 * Bad things happend, cannot continue.
293 mdmn_log_msg(md_mn_msg_t
*msg
)
296 md_mn_msgclass_t
class;
297 mdmn_changelog_record_t
*lr
;
298 md_error_t err
= mdnullerror
;
299 md_error_t
*ep
= &err
;
302 setno
= msg
->msg_setno
;
303 class = mdmn_get_message_class(msg
->msg_type
);
305 /* if not snarfed, snarf it */
306 if (mdmn_snarf_changelog(setno
, ep
) <= 0) {
307 syslog(LOG_DAEMON
| LOG_ERR
, dgettext(TEXT_DOMAIN
,
308 "log_msg: No records snarfed\n"));
313 /* log entry for the class */
314 lr
= &mdmn_changelog
[setno
][class];
316 /* Check if the class is occupied */
317 if (lr
->lr_flags
& MD_MN_LR_INUSE
) {
318 if (!MSGID_CMP(&(msg
->msg_msgid
), &(lr
->lr_msg
.msg_msgid
))) {
319 syslog(LOG_DAEMON
| LOG_DEBUG
, dgettext(TEXT_DOMAIN
,
320 "log_msg: id mismatch:\n"
321 " stored : ID = (%d, 0x%llx-%d)"
322 " setno %d class %d type %d\n"
323 " msg to log: ID = (%d, 0x%llx-%d)"
324 " setno %d class %d type %d.\n"),
325 MSGID_ELEMS(lr
->lr_msg
.msg_msgid
), lr
->lr_setno
,
326 lr
->lr_class
, lr
->lr_msgtype
,
327 MSGID_ELEMS(msg
->msg_msgid
), msg
->msg_setno
, class,
329 return (MDMNE_CLASS_BUSY
);
331 syslog(LOG_DAEMON
| LOG_DEBUG
, dgettext(TEXT_DOMAIN
,
332 "log_msg: msgid already logged:\n ID = "
333 " (%d, 0x%llx-%d) setno %d class %d type %d\n"),
334 MSGID_ELEMS(lr
->lr_msg
.msg_msgid
), lr
->lr_setno
,
335 lr
->lr_class
, lr
->lr_msgtype
);
340 lr
->lr_flags
|= MD_MN_LR_INUSE
;
341 lr
->lr_msglen
= MD_MN_MSG_LEN(msg
);
342 assert(lr
->lr_msg
.msg_event_data
== NULL
);
343 if (msg
->msg_event_size
)
344 lr
->lr_msg
.msg_event_data
= Zalloc(msg
->msg_event_size
);
345 (void) copy_msg(msg
, &(lr
->lr_msg
));
346 retval
= mdmn_log_it(setno
, ep
, lr
);
348 syslog(LOG_DAEMON
| LOG_ERR
, dgettext(TEXT_DOMAIN
,
349 "mdmn_log_msg - failure committing logged msg to disk\n"));
350 return (MDMNE_LOG_FAIL
);
353 return (MDMNE_NULL
); /* this is good */
357 * mdmn_unlog_msg(md_mn_msg_t *)
359 * Clear the log entry holding the indicated message.
360 * Only the set master can do this.
367 mdmn_unlog_msg(md_mn_msg_t
*msg
)
370 md_mn_msgclass_t
class;
371 md_error_t err
= mdnullerror
;
372 md_error_t
*ep
= &err
;
374 mdmn_changelog_record_t
*lr
= NULL
;
376 setno
= msg
->msg_setno
;
377 class = mdmn_get_message_class(msg
->msg_type
);
379 /* Find the log entry holding the indicated message */
380 if (mdmn_snarf_changelog(setno
, ep
) == 0)
383 lr
= &mdmn_changelog
[setno
][class];
385 /* assert the message is still logged */
387 if (!MSGID_CMP(&(msg
->msg_msgid
), &(lr
->lr_msg
.msg_msgid
))) {
388 syslog(LOG_ERR
, dgettext(TEXT_DOMAIN
,
389 "unlog_msg: msgid mismatch\n"
390 "\t\tstored: ID = (%d, 0x%llx-%d) setno %d "
392 "\t\tattempting to unlog:\n"
393 "\t\tID = (%d, 0x%llx-%d) setno %d class %d type %d.\n"),
394 MSGID_ELEMS(lr
->lr_msg
.msg_msgid
), lr
->lr_setno
,
395 lr
->lr_class
, lr
->lr_msgtype
, MSGID_ELEMS(msg
->msg_msgid
),
396 msg
->msg_setno
, class, msg
->msg_type
);
400 lr
->lr_flags
&= ~(MD_MN_LR_INUSE
);
401 if (lr
->lr_msg
.msg_event_data
) {
402 Free(lr
->lr_msg
.msg_event_data
);
403 lr
->lr_msg
.msg_event_data
= NULL
;
405 /* commit the updated log record to disk */
406 retval
= mdmn_log_it(setno
, ep
, lr
);
408 dump_rec("mdmn_unlog_msg: ", lr
);
415 * mdmn_get_changelogrec(set_t , md_mn_msgclass_t)
416 * Returns a pointer to incore changelog record.
422 mdmn_changelog_record_t
*
423 mdmn_get_changelogrec(set_t setno
, md_mn_msgclass_t
class)
425 md_error_t err
= mdnullerror
;
427 if (mdmn_snarf_changelog(setno
, &err
) == 0)
429 assert(mdmn_changelog
[setno
] != NULL
);
431 return (&mdmn_changelog
[setno
][class]);
435 * mdmn_commitlog(md_set_desc *, md_error_t *)
437 * Commit the set record and all of the changelog entry records to disk.
438 * Don't bother with other stuff hanging off the set record
439 * (e.g. drive records) since none of that is changing.
440 * Called only at changelog pre-allocation time or when flushing a log.
448 mdmn_commitlog(md_set_desc
*sd
, md_error_t
*ep
)
453 mdmn_changelog_record_t
*lr
;
454 mdmn_changelog_record_od_t clodrec
; /* changelog ondisk record */
459 /* Check for master and bounce non-master requests */
460 if (!(MD_MNSET_DESC(sd
)) || !sd
->sd_mn_am_i_master
) {
461 if (!(MD_MNSET_DESC(sd
))) {
462 syslog(LOG_DAEMON
| LOG_ERR
, dgettext(TEXT_DOMAIN
,
463 "mdmn_commitlog - Not MN Set\n"));
465 syslog(LOG_DAEMON
| LOG_ERR
, dgettext(TEXT_DOMAIN
,
466 "mdmn_commit_log - Not Master\n"));
470 (void) memset(&req
, 0, sizeof (req
));
471 /* create the records to commit the info to the mddb */
473 size
= (mdmn_logrecs
+ 1) * sizeof (int);
475 /* Initialize the log entry records for update */
476 setno
= sd
->sd_setno
;
478 for (lrc
= 0; lrc
< mdmn_logrecs
; lrc
++) {
479 lr
= &mdmn_changelog
[setno
][lrc
];
480 recs
[lrc
] = lr
->lr_selfid
;
481 copy_changelog(lr
, &clodrec
, MD_MN_COPY_TO_ONDISK
);
482 METAD_SETUP_LR(MD_DB_SETDATA
, setno
, lr
->lr_selfid
);
483 req
.ur_size
= MDMN_LOGRECSIZE_OD
;
484 req
.ur_data
= (uintptr_t)&clodrec
;
485 if ((retval
= metaioctl(MD_MN_DB_USERREQ
, &req
, &req
.ur_mde
,
487 (void) mdstealerror(ep
, &req
.ur_mde
);
489 syslog(LOG_DAEMON
|LOG_DEBUG
,
490 "mdmn_commitlog - metaioctl SETDATA failure\n%s",
491 mde_sperror(ep
, ""));
498 /* set last rec to be 0 to indicate completion */
500 /* Commit to mddb on disk */
501 METAD_SETUP_LR(MD_DB_COMMIT_MANY
, setno
,
502 mdmn_changelog
[setno
][0].lr_selfid
);
504 req
.ur_data
= (uintptr_t)recs
;
505 if ((retval
= metaioctl(MD_MN_DB_USERREQ
, &req
,
506 &req
.ur_mde
, NULL
)) != 0) {
507 (void) mdstealerror(ep
, &req
.ur_mde
);
509 syslog(LOG_DAEMON
|LOG_DEBUG
,
510 "mdmn_commitlog - metaioctl COMMIT_MANY"
511 "Failure\n%s", mde_sperror(ep
, ""));
521 * mdmn_log_it(set_t, md_error_t *, mdmn_changelog_record_t *)
523 * Commit the changed log record to disk.
530 mdmn_log_it(set_t set
, md_error_t
*ep
, mdmn_changelog_record_t
*lr
)
535 mdmn_changelog_record_od_t clodrec
;
537 (void) memset(&req
, 0, sizeof (req
));
539 /* Initialize the log entry record for update */
541 copy_changelog(lr
, &clodrec
, MD_MN_COPY_TO_ONDISK
);
542 METAD_SETUP_LR(MD_DB_SETDATA
, set
, lr
->lr_selfid
);
543 req
.ur_size
= MDMN_LOGRECSIZE_OD
;
544 req
.ur_data
= (uintptr_t)&clodrec
;
545 if (metaioctl(MD_MN_DB_USERREQ
, &req
, &req
.ur_mde
, NULL
) != 0) {
546 (void) mdstealerror(ep
, &req
.ur_mde
);
548 syslog(LOG_DEBUG
, "mdmn_log_it: DB_SETDATA failed\n"
549 "set %d selfid %d, size %d\n%s", set
, lr
->lr_selfid
,
550 req
.ur_size
, mde_sperror(ep
, ""));
554 /* Set up the recid to be updated */
555 size
= 2 * sizeof (int); /* the changed record, plus null terminator */
557 recs
[0] = lr
->lr_selfid
;
559 /* Commit to mddb on disk */
560 METAD_SETUP_LR(MD_DB_COMMIT_ONE
, set
, lr
->lr_selfid
);
562 req
.ur_data
= (uintptr_t)recs
;
563 if (metaioctl(MD_MN_DB_USERREQ
, &req
, &req
.ur_mde
, NULL
) != 0) {
564 (void) mdstealerror(ep
, &req
.ur_mde
);
566 syslog(LOG_DEBUG
, "mdmn_log_it: DB_COMMIT_ONE failed\n"
567 "set %d selfid %d, size %d\n%s", set
, lr
->lr_selfid
,
568 req
.ur_size
, mde_sperror(ep
, ""));
578 * mdmn_snarf_changelog(set_t, md_error_t *)
580 * snarf in the changelog entries and allocate incore structures
582 * mdmn_changelog_snarfed array if set to MDMN_CLF_SNARFED, then
583 * then the records are already snarfed.
585 * Called from set_snarf(), mdmn_log_msg(), and mdmn_unlog_msg()
591 mdmn_snarf_changelog(set_t set
, md_error_t
*ep
)
593 mdmn_changelog_record_t
*tlr
;
594 mdmn_changelog_record_od_t
*lr
;
596 md_mn_msgclass_t
class;
599 if (set
== MD_LOCAL_SET
)
604 if (mdmn_changelog_snarfed
[set
] & MDMN_CLF_SNARFED
) {
605 assert(mdmn_changelog
[set
] != NULL
);
606 return (mdmn_logrecs
);
609 lr
= (mdmn_changelog_record_od_t
*)get_ur_rec(set
, MD_UR_GET_NEXT
,
610 MDDB_UR_LR
, &id
, ep
);
614 /* only allocate if Log records exist */
616 if (mdmn_changelog
[set
] == NULL
) {
617 /* Allocate incore state for the log */
618 mdmn_changelog
[set
] = Zalloc(MDMN_LOGHDR_SIZE
*
623 class = lr
->lr_class
;
624 tlr
= &mdmn_changelog
[set
][class];
625 copy_changelog(tlr
, lr
, MD_MN_COPY_TO_INCORE
);
627 lr
= (mdmn_changelog_record_od_t
*)get_ur_rec(set
,
628 MD_UR_GET_NEXT
, MDDB_UR_LR
, &id
, ep
);
629 } while (lr
!= NULL
);
631 /* Since log records counts are fixed return that value */
632 mdmn_changelog_snarfed
[set
] |= MDMN_CLF_SNARFED
;
633 return (mdmn_logrecs
);