From 7750fd726d0264283608ff21d8a003fb8d7da03b Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Thu, 26 Mar 2015 20:05:51 -0700 Subject: [PATCH] dmsg - refactor cluster and pfs identifiers Clean up loose ends in the CONN/SPAN messages that prior work has exposed. DMSG now uses LNK_SPAN exclusively in its graph algorithms, so LNK_CONN has a different meaning than it used to. * Change the way cl_id and fs_id work. Rename cl_id and most cluster fields as 'peer' identification fields. Reduce complexity and confusion by removing pfs_type and pfs_fsid from LNK_CONN. Also remove pfs_mask and fs_label. Change cl_label to peer_label. peer_label is now always a human-readable string identifier for the socket connection and no longer performs any connection filtering. This allows it to be passed in LNK_SPANs to make the trees more human-readable when dumped. * A LNK_CONN identifies the peer and not necessarily any particular cluster. Note that the peer_id in a LNK_CONN is a filter request, not an advertisement. * Make peer_label more meaningful by incorporating the hostname (needs more work). * Rename filesystem identifiers as 'pfs' identifiers. A LNK_SPAN identifies a particular PFS. Note that the peer_id in a LNK_SPAN is part of the PFS { cluster_id, pfs_id } advertisement. * Pad some in-memory structures for natural alignment (it's a general rule for all hammer2 structures, even in-memory structures when appropriate). 
--- lib/libdmsg/dmsg.h | 2 +- lib/libdmsg/msg_lnk.c | 140 +++++++++++++++++--------------------- lib/libdmsg/service.c | 4 +- lib/libdmsg/subs.c | 6 +- sbin/hammer2/cmd_pfs.c | 4 +- sbin/hammer2/main.c | 2 +- sbin/hammer2/subs.c | 2 +- sys/dev/disk/xdisk/xdisk.c | 61 +++++++++-------- sys/kern/subr_diskiocom.c | 41 +++++++---- sys/sys/dmsg.h | 61 ++++++++--------- sys/vfs/hammer2/hammer2.h | 33 +++++++++ sys/vfs/hammer2/hammer2_cluster.c | 61 ++++++++++++++++- sys/vfs/hammer2/hammer2_disk.h | 36 +++++++--- sys/vfs/hammer2/hammer2_iocom.c | 62 ++++++++--------- 14 files changed, 308 insertions(+), 207 deletions(-) diff --git a/lib/libdmsg/dmsg.h b/lib/libdmsg/dmsg.h index 38c45280ff..7847edcf6f 100644 --- a/lib/libdmsg/dmsg.h +++ b/lib/libdmsg/dmsg.h @@ -153,7 +153,7 @@ TAILQ_HEAD(dmsg_media_queue, dmsg_media); struct dmsg_media { TAILQ_ENTRY(dmsg_media) entry; - uuid_t mediaid; + uuid_t media_id; int refs; void *usrhandle; }; diff --git a/lib/libdmsg/msg_lnk.c b/lib/libdmsg/msg_lnk.c index 0243c94370..0c3aa7db84 100644 --- a/lib/libdmsg/msg_lnk.c +++ b/lib/libdmsg/msg_lnk.c @@ -73,7 +73,7 @@ * each cluster. * * h2span_node - Organizes the nodes in a cluster. One structure - * for each unique {cluster,node}, aka {fsid, pfs_fsid}. + * for each unique {cluster,node}, aka {peer_id, pfs_id}. * * h2span_link - Organizes all incoming and outgoing LNK_SPAN message * transactions related to a node. @@ -122,15 +122,16 @@ struct h2span_conn { }; /* - * All received LNK_SPANs are organized by cluster (pfs_clid), - * node (pfs_fsid), and link (received LNK_SPAN transaction). + * All received LNK_SPANs are organized by peer id (peer_id), + * node (pfs_id), and link (received LNK_SPAN transaction). 
*/ struct h2span_cluster { RB_ENTRY(h2span_cluster) rbnode; struct h2span_node_tree tree; - uuid_t pfs_clid; /* shared fsid */ + uuid_t peer_id; /* shared fsid */ uint8_t peer_type; - char cl_label[128]; /* cluster label (typ PEER_BLOCK) */ + uint8_t reserved01[7]; + char peer_label[128]; /* string identification */ int refs; /* prevents destruction */ }; @@ -139,8 +140,9 @@ struct h2span_node { struct h2span_link_tree tree; struct h2span_cluster *cls; uint8_t pfs_type; - uuid_t pfs_fsid; /* unique fsid */ - char fs_label[128]; /* fs label (typ PEER_HAMMER2) */ + uint8_t reserved01[7]; + uuid_t pfs_id; /* unique pfs id */ + char pfs_label[128]; /* string identification */ void *opaque; }; @@ -208,16 +210,16 @@ h2span_cluster_cmp(h2span_cluster_t *cls1, h2span_cluster_t *cls2) return(-1); if (cls1->peer_type > cls2->peer_type) return(1); - r = uuid_compare(&cls1->pfs_clid, &cls2->pfs_clid, NULL); + r = uuid_compare(&cls1->peer_id, &cls2->peer_id, NULL); if (r == 0) - r = strcmp(cls1->cl_label, cls2->cl_label); + r = strcmp(cls1->peer_label, cls2->peer_label); return r; } /* - * Match against fs_label/pfs_fsid. Together these two items represent a - * unique node. In most cases the primary differentiator is pfs_fsid but + * Match against pfs_label/pfs_id. Together these two items represent a + * unique node. In most cases the primary differentiator is pfs_id but * we also string-match fs_label. */ static @@ -226,9 +228,9 @@ h2span_node_cmp(h2span_node_t *node1, h2span_node_t *node2) { int r; - r = strcmp(node1->fs_label, node2->fs_label); + r = strcmp(node1->pfs_label, node2->pfs_label); if (r == 0) - r = uuid_compare(&node1->pfs_fsid, &node2->pfs_fsid, NULL); + r = uuid_compare(&node1->pfs_id, &node2->pfs_id, NULL); return (r); } @@ -408,12 +410,10 @@ dmsg_lnk_conn(dmsg_msg_t *msg) * acknowledge the request, leaving the transaction open. * We then relay priority-selected SPANs. 
*/ - dmio_printf(iocom, 3, "LNK_CONN(%08x): %s/%s/%s\n", + dmio_printf(iocom, 3, "LNK_CONN(%08x): %s/%s\n", (uint32_t)msg->any.head.msgid, - dmsg_uuid_to_str(&msg->any.lnk_conn.pfs_clid, - &alloc), - msg->any.lnk_conn.cl_label, - msg->any.lnk_conn.fs_label); + dmsg_uuid_to_str(&msg->any.lnk_conn.peer_id, &alloc), + msg->any.lnk_conn.peer_label); free(alloc); conn = dmsg_alloc(sizeof(*conn)); @@ -433,14 +433,14 @@ dmsg_lnk_conn(dmsg_msg_t *msg) * Set up media */ TAILQ_FOREACH(media, &mediaq, entry) { - if (uuid_compare(&msg->any.lnk_conn.mediaid, - &media->mediaid, NULL) == 0) { + if (uuid_compare(&msg->any.lnk_conn.media_id, + &media->media_id, NULL) == 0) { break; } } if (media == NULL) { media = dmsg_alloc(sizeof(*media)); - media->mediaid = msg->any.lnk_conn.mediaid; + media->media_id = msg->any.lnk_conn.media_id; TAILQ_INSERT_TAIL(&mediaq, media, entry); } state->media = media; @@ -557,25 +557,23 @@ dmsg_lnk_span(dmsg_msg_t *msg) assert(state->func == NULL); state->func = dmsg_lnk_span; - dmsg_termstr(msg->any.lnk_span.cl_label); - dmsg_termstr(msg->any.lnk_span.fs_label); + dmsg_termstr(msg->any.lnk_span.peer_label); + dmsg_termstr(msg->any.lnk_span.pfs_label); /* * Find the cluster */ - dummy_cls.pfs_clid = msg->any.lnk_span.pfs_clid; + dummy_cls.peer_id = msg->any.lnk_span.peer_id; dummy_cls.peer_type = msg->any.lnk_span.peer_type; - bcopy(msg->any.lnk_span.cl_label, - dummy_cls.cl_label, - sizeof(dummy_cls.cl_label)); + bcopy(msg->any.lnk_span.peer_label, dummy_cls.peer_label, + sizeof(dummy_cls.peer_label)); cls = RB_FIND(h2span_cluster_tree, &cluster_tree, &dummy_cls); if (cls == NULL) { cls = dmsg_alloc(sizeof(*cls)); - cls->pfs_clid = msg->any.lnk_span.pfs_clid; + cls->peer_id = msg->any.lnk_span.peer_id; cls->peer_type = msg->any.lnk_span.peer_type; - bcopy(msg->any.lnk_span.cl_label, - cls->cl_label, - sizeof(cls->cl_label)); + bcopy(msg->any.lnk_span.peer_label, + cls->peer_label, sizeof(cls->peer_label)); RB_INIT(&cls->tree); 
RB_INSERT(h2span_cluster_tree, &cluster_tree, cls); } @@ -583,17 +581,16 @@ dmsg_lnk_span(dmsg_msg_t *msg) /* * Find the node */ - dummy_node.pfs_fsid = msg->any.lnk_span.pfs_fsid; - bcopy(msg->any.lnk_span.fs_label, dummy_node.fs_label, - sizeof(dummy_node.fs_label)); + dummy_node.pfs_id = msg->any.lnk_span.pfs_id; + bcopy(msg->any.lnk_span.pfs_label, dummy_node.pfs_label, + sizeof(dummy_node.pfs_label)); node = RB_FIND(h2span_node_tree, &cls->tree, &dummy_node); if (node == NULL) { node = dmsg_alloc(sizeof(*node)); - node->pfs_fsid = msg->any.lnk_span.pfs_fsid; + node->pfs_id = msg->any.lnk_span.pfs_id; node->pfs_type = msg->any.lnk_span.pfs_type; - bcopy(msg->any.lnk_span.fs_label, - node->fs_label, - sizeof(node->fs_label)); + bcopy(msg->any.lnk_span.pfs_label, node->pfs_label, + sizeof(node->pfs_label)); node->cls = cls; RB_INIT(&node->tree); RB_INSERT(h2span_node_tree, &cls->tree, node); @@ -632,10 +629,10 @@ dmsg_lnk_span(dmsg_msg_t *msg) dmio_printf(iocom, 3, "LNK_SPAN(thr %p): %p %s cl=%s fs=%s dist=%d\n", iocom, slink, - dmsg_uuid_to_str(&msg->any.lnk_span.pfs_clid, + dmsg_uuid_to_str(&msg->any.lnk_span.peer_id, &alloc), - msg->any.lnk_span.cl_label, - msg->any.lnk_span.fs_label, + msg->any.lnk_span.peer_label, + msg->any.lnk_span.pfs_label, msg->any.lnk_span.dist); free(alloc); #if 0 @@ -663,9 +660,9 @@ dmsg_lnk_span(dmsg_msg_t *msg) dmio_printf(iocom, 3, "LNK_DELE(thr %p): %p %s cl=%s fs=%s\n", iocom, slink, - dmsg_uuid_to_str(&cls->pfs_clid, &alloc), - cls->cl_label, - node->fs_label); + dmsg_uuid_to_str(&cls->peer_id, &alloc), + cls->peer_label, + node->pfs_label); free(alloc); /* @@ -929,52 +926,39 @@ dmsg_relay_scan_specific(h2span_node_t *node, h2span_conn_t *conn) * * Don't bother transmitting if the remote connection * is not accepting this SPAN's peer_type. - * - * pfs_mask is typically used so pure clients can filter - * out receiving SPANs for other pure clients. 
*/ lspan = &slink->lnk_span; lconn = &conn->lnk_conn; if (((1LLU << lspan->peer_type) & lconn->peer_mask) == 0) break; - if (((1LLU << lspan->pfs_type) & lconn->pfs_mask) == 0) - break; /* * Do not give pure clients visibility to other pure clients */ - if (lconn->pfs_type == DMSG_PFSTYPE_CLIENT && - lspan->pfs_type == DMSG_PFSTYPE_CLIENT) { - break; - } - - /* - * Connection filter, if cluster uuid is not NULL it must - * match the span cluster uuid. Only applies when the - * peer_type matches. - */ - if (lspan->peer_type == lconn->peer_type && - !uuid_is_nil(&lconn->pfs_clid, NULL) && - uuid_compare(&slink->node->cls->pfs_clid, - &lconn->pfs_clid, NULL)) { + if (lconn->peer_type == DMSG_PEER_CLIENT && + lspan->peer_type == DMSG_PEER_CLIENT) { break; } /* - * Connection filter, if cluster label is not empty it must - * match the span cluster label. Only applies when the - * peer_type matches. + * Clients can set peer_id to filter the peer_id of incoming + * spans. Other peer types set peer_id to advertising their + * peer_id. XXX + * + * NOTE: peer_label is not a filter on clients, it identifies + * the client just as it identifies other peer types. */ - if (lspan->peer_type == lconn->peer_type && - lconn->cl_label[0] && - strcmp(lconn->cl_label, slink->node->cls->cl_label)) { + if (lconn->peer_type == DMSG_PEER_CLIENT && + !uuid_is_nil(&lconn->peer_id, NULL) && + uuid_compare(&slink->node->cls->peer_id, + &lconn->peer_id, NULL)) { break; } /* - * NOTE! pfs_fsid differentiates nodes within the same cluster + * NOTE! pfs_id differentiates nodes within the same cluster * so we obviously don't want to match those. Similarly - * for fs_label. + * for pfs_label. */ /* @@ -1179,12 +1163,12 @@ dmsg_relay_delete(h2span_relay_t *relay) * pointer to it. 
*/ h2span_cluster_t * -dmsg_cluster_get(uuid_t *pfs_clid) +dmsg_cluster_get(uuid_t *peer_id) { h2span_cluster_t dummy_cls; h2span_cluster_t *cls; - dummy_cls.pfs_clid = *pfs_clid; + dummy_cls.peer_id = *peer_id; pthread_mutex_lock(&cluster_mtx); cls = RB_FIND(h2span_cluster_tree, &cluster_tree, &dummy_cls); if (cls) @@ -1214,7 +1198,7 @@ dmsg_cluster_put(h2span_cluster_t *cls) * stable nodes. */ h2span_node_t * -dmsg_node_get(h2span_cluster_t *cls, uuid_t *pfs_fsid) +dmsg_node_get(h2span_cluster_t *cls, uuid_t *pfs_id) { } @@ -1238,13 +1222,13 @@ dmsg_shell_tree(dmsg_iocom_t *iocom, char *cmdbuf __unused) RB_FOREACH(cls, h2span_cluster_tree, &cluster_tree) { dmsg_printf(iocom, "Cluster %s %s (%s)\n", dmsg_peer_type_to_str(cls->peer_type), - dmsg_uuid_to_str(&cls->pfs_clid, &uustr), - cls->cl_label); + dmsg_uuid_to_str(&cls->peer_id, &uustr), + cls->peer_label); RB_FOREACH(node, h2span_node_tree, &cls->tree) { dmsg_printf(iocom, " Node %02x %s (%s)\n", node->pfs_type, - dmsg_uuid_to_str(&node->pfs_fsid, &uustr), - node->fs_label); + dmsg_uuid_to_str(&node->pfs_id, &uustr), + node->pfs_label); RB_FOREACH(slink, h2span_link_tree, &node->tree) { dmsg_printf(iocom, "\tSLink msgid %016jx " diff --git a/lib/libdmsg/service.c b/lib/libdmsg/service.c index b9c3b04bee..56ef3560b3 100644 --- a/lib/libdmsg/service.c +++ b/lib/libdmsg/service.c @@ -111,8 +111,8 @@ master_auth_signal(dmsg_iocom_t *iocom) DMSG_LNK_CONN | DMSGF_CREATE, master_auth_conn_rx, NULL); msg->any.lnk_conn.peer_mask = (uint64_t)-1; - msg->any.lnk_conn.peer_type = DMSG_PEER_CLUSTER; - msg->any.lnk_conn.pfs_mask = (uint64_t)-1; + msg->any.lnk_conn.peer_type = DMSG_PEER_ROUTER; + msg->any.lnk_conn.peer_mask = (uint64_t)-1; dmsg_msg_write(msg); diff --git a/lib/libdmsg/subs.c b/lib/libdmsg/subs.c index e9d120faa5..5b8a72795d 100644 --- a/lib/libdmsg/subs.c +++ b/lib/libdmsg/subs.c @@ -73,12 +73,14 @@ dmsg_peer_type_to_str(uint8_t type) switch(type) { case DMSG_PEER_NONE: return("NONE"); - case 
DMSG_PEER_CLUSTER: - return("CLUSTER"); + case DMSG_PEER_ROUTER: + return("ROUTER"); case DMSG_PEER_BLOCK: return("BLOCK"); case DMSG_PEER_HAMMER2: return("HAMMER2"); + case DMSG_PEER_CLIENT: + return("CLIENT"); default: return("?PEERTYPE?"); } diff --git a/sbin/hammer2/cmd_pfs.c b/sbin/hammer2/cmd_pfs.c index 0ba7a311ae..c9c3e98565 100644 --- a/sbin/hammer2/cmd_pfs.c +++ b/sbin/hammer2/cmd_pfs.c @@ -61,7 +61,7 @@ cmd_pfs_list(const char *sel_path) "Label\n"); } switch(pfs.pfs_type) { - case DMSG_PFSTYPE_NONE: + case HAMMER2_PFSTYPE_NONE: printf("NONE "); break; case HAMMER2_PFSTYPE_CACHE: @@ -144,7 +144,7 @@ cmd_pfs_create(const char *sel_path, const char *name, /* * Default to MASTER */ - if (pfs_type == DMSG_PFSTYPE_NONE) { + if (pfs_type == HAMMER2_PFSTYPE_NONE) { pfs_type = HAMMER2_PFSTYPE_MASTER; } diff --git a/sbin/hammer2/main.c b/sbin/hammer2/main.c index ba70dbb33d..b975270e53 100644 --- a/sbin/hammer2/main.c +++ b/sbin/hammer2/main.c @@ -50,7 +50,7 @@ main(int ac, char **av) const char *sel_path = NULL; const char *uuid_str = NULL; const char *arg; - int pfs_type = DMSG_PFSTYPE_NONE; + int pfs_type = HAMMER2_PFSTYPE_NONE; int all_opt = 0; int ecode = 0; int ch; diff --git a/sbin/hammer2/subs.c b/sbin/hammer2/subs.c index 2713f41f57..2ebbf7e2a9 100644 --- a/sbin/hammer2/subs.c +++ b/sbin/hammer2/subs.c @@ -201,7 +201,7 @@ const char * hammer2_pfstype_to_str(uint8_t type) { switch(type) { - case DMSG_PFSTYPE_NONE: + case HAMMER2_PFSTYPE_NONE: return("NONE"); case HAMMER2_PFSTYPE_CACHE: return("CACHE"); diff --git a/sys/dev/disk/xdisk/xdisk.c b/sys/dev/disk/xdisk/xdisk.c index a73a62c441..164a0fbb20 100644 --- a/sys/dev/disk/xdisk/xdisk.c +++ b/sys/dev/disk/xdisk/xdisk.c @@ -108,7 +108,7 @@ struct xa_softc { struct devstat stats; struct disk_info info; struct disk disk; - uuid_t pfs_fsid; + uuid_t peer_id; int unit; int opencnt; int spancnt; @@ -116,8 +116,8 @@ struct xa_softc { int serializing; int last_error; int terminating; - char cl_label[64]; /* from 
LNK_SPAN cl_label (host/dev) */ - char fs_label[64]; /* from LNK_SPAN fs_label (serno str) */ + char peer_label[64]; /* from LNK_SPAN host/dev */ + char pfs_label[64]; /* from LNK_SPAN serno */ xa_tag_t *open_tag; TAILQ_HEAD(, bio) bioq; /* pending BIOs */ TAILQ_HEAD(, xa_tag) tag_freeq; /* available I/O tags */ @@ -239,7 +239,7 @@ DEV_MODULE(xdisk, xdisk_modevent, 0); static int xa_softc_cmp(xa_softc_t *sc1, xa_softc_t *sc2) { - return(strcmp(sc1->fs_label, sc2->fs_label)); + return(strcmp(sc1->pfs_label, sc2->pfs_label)); } /* @@ -319,20 +319,19 @@ xdisk_attach(struct xdisk_attach_ioctl *xaioc) /* * Setup our LNK_CONN advertisement for autoinitiate. * - * Our filter is setup to only accept PEER_BLOCK/SERVER - * advertisements. + * Our filter is setup to only accept PEER_BLOCK advertisements. + * XXX no peer_id filter. * * We need a unique pfs_fsid to avoid confusion. */ - xaio->iocom.auto_lnk_conn.pfs_type = DMSG_PFSTYPE_CLIENT; + xaio->iocom.auto_lnk_conn.peer_type = DMSG_PEER_CLIENT; xaio->iocom.auto_lnk_conn.proto_version = DMSG_SPAN_PROTO_1; - xaio->iocom.auto_lnk_conn.peer_type = DMSG_PEER_BLOCK; xaio->iocom.auto_lnk_conn.peer_mask = 1LLU << DMSG_PEER_BLOCK; - xaio->iocom.auto_lnk_conn.pfs_mask = 1LLU << DMSG_PFSTYPE_SERVER; - ksnprintf(xaio->iocom.auto_lnk_conn.fs_label, - sizeof(xaio->iocom.auto_lnk_conn.fs_label), - "xdisk"); - kern_uuidgen(&xaio->iocom.auto_lnk_conn.pfs_fsid, 1); + ksnprintf(xaio->iocom.auto_lnk_conn.peer_label, + sizeof(xaio->iocom.auto_lnk_conn.peer_label), + "%s/xdisk", + hostname); + /* kern_uuidgen(&xaio->iocom.auto_lnk_conn.pfs_fsid, 1); */ /* * Setup our LNK_SPAN advertisement for autoinitiate @@ -407,16 +406,18 @@ xaio_rcvdmsg(kdmsg_msg_t *msg) * Return a streaming result, leaving the transaction open * in both directions to allow sub-transactions. 
*/ - bcopy(msg->any.lnk_span.cl_label, xaio->dummysc.cl_label, - sizeof(xaio->dummysc.cl_label)); - xaio->dummysc.cl_label[sizeof(xaio->dummysc.cl_label) - 1] = 0; + bcopy(msg->any.lnk_span.peer_label, xaio->dummysc.peer_label, + sizeof(xaio->dummysc.peer_label)); + xaio->dummysc.peer_label[ + sizeof(xaio->dummysc.peer_label) - 1] = 0; - bcopy(msg->any.lnk_span.fs_label, xaio->dummysc.fs_label, - sizeof(xaio->dummysc.fs_label)); - xaio->dummysc.fs_label[sizeof(xaio->dummysc.fs_label) - 1] = 0; + bcopy(msg->any.lnk_span.pfs_label, xaio->dummysc.pfs_label, + sizeof(xaio->dummysc.pfs_label)); + xaio->dummysc.pfs_label[ + sizeof(xaio->dummysc.pfs_label) - 1] = 0; xa_printf(3, "LINK_SPAN state %p create for %s\n", - msg->state, msg->any.lnk_span.fs_label); + msg->state, msg->any.lnk_span.pfs_label); sc = RB_FIND(xa_softc_tree, &xa_device_tree, &xaio->dummysc); if (sc == NULL) { @@ -427,12 +428,12 @@ xaio_rcvdmsg(kdmsg_msg_t *msg) int n; sc = kmalloc(sizeof(*sc), M_XDISK, M_WAITOK | M_ZERO); - bcopy(msg->any.lnk_span.cl_label, sc->cl_label, - sizeof(sc->cl_label)); - sc->cl_label[sizeof(sc->cl_label) - 1] = 0; - bcopy(msg->any.lnk_span.fs_label, sc->fs_label, - sizeof(sc->fs_label)); - sc->fs_label[sizeof(sc->fs_label) - 1] = 0; + bcopy(msg->any.lnk_span.peer_label, sc->peer_label, + sizeof(sc->peer_label)); + sc->peer_label[sizeof(sc->peer_label) - 1] = 0; + bcopy(msg->any.lnk_span.pfs_label, sc->pfs_label, + sizeof(sc->pfs_label)); + sc->pfs_label[sizeof(sc->pfs_label) - 1] = 0; /* XXX FIXME O(N^2) */ unit = -1; @@ -494,8 +495,8 @@ xaio_rcvdmsg(kdmsg_msg_t *msg) sc->info.d_secpercyl = sc->info.d_secpertrack * sc->info.d_nheads; sc->info.d_ncylinders = 0; - if (sc->fs_label[0]) - sc->info.d_serialno = sc->fs_label; + if (sc->pfs_label[0]) + sc->info.d_serialno = sc->pfs_label; /* * WARNING! disk_setdiskinfo() must be asynchronous * because we are in the rxmsg thread. 
If @@ -517,7 +518,7 @@ xaio_rcvdmsg(kdmsg_msg_t *msg) lockmgr(&sc->lk, LK_RELEASE); if (sc->dev && sc->dev->si_disk) { xa_printf(1, "reprobe disk: %s\n", - sc->fs_label); + sc->pfs_label); disk_msg_send(DISK_DISK_REPROBE, sc->dev->si_disk, NULL); @@ -534,7 +535,7 @@ xaio_rcvdmsg(kdmsg_msg_t *msg) */ sc = msg->state->any.xa_sc; xa_printf(3, "LINK_SPAN state %p delete for %s (sc=%p)\n", - msg->state, (sc ? sc->fs_label : "(null)"), sc); + msg->state, (sc ? sc->pfs_label : "(null)"), sc); lockmgr(&sc->lk, LK_EXCLUSIVE); msg->state->any.xa_sc = NULL; TAILQ_REMOVE(&sc->spanq, msg->state, user_entry); diff --git a/sys/kern/subr_diskiocom.c b/sys/kern/subr_diskiocom.c index a78512721d..f4e2bfbb77 100644 --- a/sys/kern/subr_diskiocom.c +++ b/sys/kern/subr_diskiocom.c @@ -142,34 +142,47 @@ disk_iocom_reconnect(struct disk *dp, struct file *fp) kdmsg_iocom_reconnect(&dp->d_iocom, fp, devname); - dp->d_iocom.auto_lnk_conn.pfs_type = DMSG_PFSTYPE_SERVER; dp->d_iocom.auto_lnk_conn.proto_version = DMSG_SPAN_PROTO_1; dp->d_iocom.auto_lnk_conn.peer_type = DMSG_PEER_BLOCK; dp->d_iocom.auto_lnk_conn.peer_mask = 1LLU << DMSG_PEER_BLOCK; - dp->d_iocom.auto_lnk_conn.pfs_mask = (uint64_t)-1; - ksnprintf(dp->d_iocom.auto_lnk_conn.cl_label, - sizeof(dp->d_iocom.auto_lnk_conn.cl_label), - "%s/%s", hostname, devname); + dp->d_iocom.auto_lnk_conn.peer_mask = (uint64_t)-1; +#if 0 if (dp->d_info.d_serialno) { - ksnprintf(dp->d_iocom.auto_lnk_conn.fs_label, - sizeof(dp->d_iocom.auto_lnk_conn.fs_label), - "%s", dp->d_info.d_serialno); + ksnprintf(dp->d_iocom.auto_lnk_conn.peer_label, + sizeof(dp->d_iocom.auto_lnk_conn.peer_label), + "%s/%s", hostname, dp->d_info.d_serialno); + } else { + ksnprintf(dp->d_iocom.auto_lnk_conn.peer_label, + sizeof(dp->d_iocom.auto_lnk_conn.peer_label), + "%s/%s", hostname, devname); } +#endif + ksnprintf(dp->d_iocom.auto_lnk_conn.peer_label, + sizeof(dp->d_iocom.auto_lnk_conn.peer_label), + "%s/%s", hostname, devname); - dp->d_iocom.auto_lnk_span.pfs_type = 
DMSG_PFSTYPE_SERVER; dp->d_iocom.auto_lnk_span.proto_version = DMSG_SPAN_PROTO_1; dp->d_iocom.auto_lnk_span.peer_type = DMSG_PEER_BLOCK; dp->d_iocom.auto_lnk_span.media.block.bytes = dp->d_info.d_media_size; dp->d_iocom.auto_lnk_span.media.block.blksize = dp->d_info.d_media_blksize; - ksnprintf(dp->d_iocom.auto_lnk_span.cl_label, - sizeof(dp->d_iocom.auto_lnk_span.cl_label), - "%s/%s", hostname, devname); + ksnprintf(dp->d_iocom.auto_lnk_span.peer_label, + sizeof(dp->d_iocom.auto_lnk_span.peer_label), + "%s", dp->d_iocom.auto_lnk_conn.peer_label); if (dp->d_info.d_serialno) { - ksnprintf(dp->d_iocom.auto_lnk_span.fs_label, - sizeof(dp->d_iocom.auto_lnk_span.fs_label), + ksnprintf(dp->d_iocom.auto_lnk_span.pfs_label, + sizeof(dp->d_iocom.auto_lnk_span.pfs_label), "%s", dp->d_info.d_serialno); + } else { + /* + * If no serial number is available generate a dummy serial + * number from the host and device name and pray. This will + * allow e.g. /dev/vn* to look meaningful on a remote machine. + */ + ksnprintf(dp->d_iocom.auto_lnk_span.pfs_label, + sizeof(dp->d_iocom.auto_lnk_span.pfs_label), + "%s.%s", hostname, devname); } kdmsg_iocom_autoinitiate(&dp->d_iocom, NULL); diff --git a/sys/sys/dmsg.h b/sys/sys/dmsg.h index e293a342ff..dc69ba226b 100644 --- a/sys/sys/dmsg.h +++ b/sys/sys/dmsg.h @@ -391,58 +391,54 @@ struct dmsg_lnk_auth { * LNK_CONN - Register connection info for SPAN protocol * (transaction, left open, iocom->state0 only). * - * LNK_CONN identifies a streaming connection into the cluster and serves - * to identify, enable, and specify filters for the SPAN protocol. + * LNK_CONN identifies a streaming connection into the cluster. * * peer_mask serves to filter the SPANs we receive by peer_type. A cluster * controller typically sets this to (uint64_t)-1, indicating that it wants * everything. A block devfs interface might set it to 1 << DMSG_PEER_DISK, * and a hammer2 mount might set it to 1 << DMSG_PEER_HAMMER2. * - * mediaid allows multiple (e.g. 
HAMMER2) connections belonging to the same - * media to transmit duplicative LNK_VOLCONF updates without causing - * confusion in the cluster controller. + * media_id allows multiple (e.g. HAMMER2) connections belonging to the same + * media to transmit duplicative LNK_VOLCONF updates without causing confusion + * in the cluster controller. * * pfs_clid, pfs_fsid, pfs_type, and label are peer-specific and must be * left empty (zero-fill) if not supported by a particular peer. - * - * DMSG_PEER_CLUSTER filter: none - * DMSG_PEER_BLOCK filter: label - * DMSG_PEER_HAMMER2 filter: pfs_clid if not empty, and label */ struct dmsg_lnk_conn { dmsg_hdr_t head; - uuid_t mediaid; /* media configuration id */ - uuid_t pfs_clid; /* rendezvous pfs uuid */ - uuid_t pfs_fsid; /* unique pfs uuid */ + uuid_t media_id; /* media configuration id */ + uuid_t peer_id; /* unique peer uuid */ + uuid_t reserved01; uint64_t peer_mask; /* PEER mask for SPAN filtering */ uint8_t peer_type; /* see DMSG_PEER_xxx */ - uint8_t pfs_type; /* pfs type */ + uint8_t reserved02; uint16_t proto_version; /* high level protocol support */ uint32_t status; /* status flags */ uint32_t rnss; /* node's generated rnss */ - uint8_t reserved02[8]; - uint32_t reserved03[12]; - uint64_t pfs_mask; /* PFS mask for SPAN filtering */ - char cl_label[DMSG_LABEL_SIZE]; /* cluster label */ - char fs_label[DMSG_LABEL_SIZE]; /* PFS label */ + uint8_t reserved03[8]; + uint32_t reserved04[14]; + char peer_label[DMSG_LABEL_SIZE]; /* peer identity string */ }; typedef struct dmsg_lnk_conn dmsg_lnk_conn_t; /* - * PFSTYPEs 0-15 used by sys/dmsg.h 16-31 reserved by hammer2. + * PEER types 0-63 are defined here. There is a limit of 64 types due to + * the width of peer_mask. + * + * PFS types depend on the peer type. sys/dmsg.h only defines the default. + * peer-specific headers define PFS types for any given peer. 
*/ -#define DMSG_PFSTYPE_NONE 0 -#define DMSG_PFSTYPE_ADMIN 1 -#define DMSG_PFSTYPE_CLIENT 2 -#define DMSG_PFSTYPE_SERVER 3 -#define DMSG_PFSTYPE_MAX 32 +#define DMSG_PEER_NONE 0 +#define DMSG_PEER_ROUTER 1 /* server: cluster controller */ +#define DMSG_PEER_BLOCK 2 /* server: block devices */ +#define DMSG_PEER_HAMMER2 3 /* server: h2 mounted volume */ +#define DMSG_PEER_CLIENT 63 /* a client connection */ +#define DMSG_PEER_MAX 64 -#define DMSG_PEER_NONE 0 -#define DMSG_PEER_CLUSTER 1 /* a cluster controller */ -#define DMSG_PEER_BLOCK 2 /* block devices */ -#define DMSG_PEER_HAMMER2 3 /* hammer2-mounted volumes */ +#define DMSG_PFSTYPE_DEFAULT 0 +#define DMSG_PFSTYPE_MASK 0x0F /* * Structures embedded in LNK_SPAN @@ -450,6 +446,7 @@ typedef struct dmsg_lnk_conn dmsg_lnk_conn_t; struct dmsg_media_block { uint64_t bytes; /* media size in bytes */ uint32_t blksize; /* media block size */ + uint32_t reserved01; }; typedef struct dmsg_media_block dmsg_media_block_t; @@ -503,8 +500,8 @@ typedef struct dmsg_media_block dmsg_media_block_t; */ struct dmsg_lnk_span { dmsg_hdr_t head; - uuid_t pfs_clid; /* rendezvous pfs uuid */ - uuid_t pfs_fsid; /* unique pfs id (differentiate node) */ + uuid_t peer_id; + uuid_t pfs_id; /* unique pfs id */ uint8_t pfs_type; /* PFS type */ uint8_t peer_type; /* PEER type */ uint16_t proto_version; /* high level protocol support */ @@ -524,8 +521,8 @@ struct dmsg_lnk_span { * for PEER_BLOCK cl_label is typically host/device and * fs_label is typically the serial number string. 
*/ - char cl_label[DMSG_LABEL_SIZE]; /* cluster label */ - char fs_label[DMSG_LABEL_SIZE]; /* PFS label */ + char peer_label[DMSG_LABEL_SIZE]; /* peer label */ + char pfs_label[DMSG_LABEL_SIZE]; /* PFS label */ }; typedef struct dmsg_lnk_span dmsg_lnk_span_t; diff --git a/sys/vfs/hammer2/hammer2.h b/sys/vfs/hammer2/hammer2.h index 8cb1fc3f7e..b75e27a1b5 100644 --- a/sys/vfs/hammer2/hammer2.h +++ b/sys/vfs/hammer2/hammer2.h @@ -685,6 +685,34 @@ typedef struct hammer2_mount hammer2_mount_t; * PFS under a specific device mount (HMP). The distinction is important * because the elements backing a cluster mount can change on the fly. * + * pfs_mode and pfs_nmasters critically describe how a HAMMER2 filesystem + * mount should operate. pfs_nmasters indicates how many master PFSs + * exist for the filesystem (whether available or not). pfs_mode is + * a bitmask: + * + * XXX this should be automatic based on the 'primary' mount.. based on + * which target you are mounting. + * + * HAMMER2_PFSMODE_QUORUM - Validate against quorum of masters, + * else operate unsynchronized. + * + * HAMMER2_PFSMODE_RW - Allow writing to the cluster, + * else do not allow. + * + * When operating in quorum mode modifying operations flow into + * a quorum+ of masters and all other local PFS types are synchronized + * in the background. Other PFS types will be used to improve or avoid + * network I/O only if they agree with a quorum of masters. + * + * When not operating in quorum mode modifying operations may only flow + * into a SOFT_MASTER and will be synchronized with the quorum in the + * background, and will not be cache-coherent with the quorum. Think + * laptop-on-the-road. Other PFS types will be used to improve or avoid + * network I/O only if they agree with the SOFT_MASTER. + * + * When not operating in quorum mode a read-only mount can be used to + * access a particular PFS unsynchronized. 
+ * * Usually the first element under the cluster represents the original * user-requested mount that bootstraps the whole mess. In significant * setups the original is usually just a read-only media image (or @@ -710,6 +738,11 @@ struct hammer2_pfsmount { hammer2_tid_t alloc_tid; hammer2_tid_t flush_tid; hammer2_tid_t inode_tid; + uint8_t pfs_nmasters; /* total masters */ + uint8_t pfs_mode; /* operating mode PFSMODE */ + uint8_t unused01; + uint8_t unused02; + uint32_t unused03; long inmem_inodes; uint32_t inmem_dirty_chains; int count_lwinprog; /* logical write in prog */ diff --git a/sys/vfs/hammer2/hammer2_cluster.c b/sys/vfs/hammer2/hammer2_cluster.c index 71ab5cc80d..0c13e787f0 100644 --- a/sys/vfs/hammer2/hammer2_cluster.c +++ b/sys/vfs/hammer2/hammer2_cluster.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2014 The DragonFly Project. All rights reserved. + * Copyright (c) 2013-2015 The DragonFly Project. All rights reserved. * * This code is derived from software contributed to The DragonFly Project * by Matthew Dillon @@ -49,6 +49,62 @@ * locks and I/O, do quorum and/or master-slave processing, and * it must operate properly even if some nodes are broken (which * can also mean indefinite locks). + * + * CLUSTER OPERATIONS + * + * Cluster operations can be broken down into three pieces: + * + * (1) Chain locking and data retrieval. + * hammer2_cluster_lock() + * hammer2_cluster_parent() + * + * - Most complex functions, quorum management on transaction ids. + * + * - Locking and data accesses must be internally asynchronous. + * + * - Validate and manage cache coherency primitives (cache state + * is stored in chain topologies but must be validated by these + * functions). + * + * (2) Lookups and Scans + * hammer2_cluster_lookup() + * hammer2_cluster_next() + * + * - Depend on locking & data retrieval functions, but still complex. + * + * - Must do quorum management on transaction ids. + * + * - Lookup and Iteration ops Must be internally asynchronous. 
+ * + * (3) Modifying Operations + * hammer2_cluster_create() + * hammer2_cluster_rename() + * hammer2_cluster_delete() + * hammer2_cluster_modify() + * hammer2_cluster_modsync() + * + * - Can usually punt on failures, operation continues unless quorum + * is lost. If quorum is lost, must wait for resynchronization + * (depending on the management mode). + * + * - Must disconnect node on failures (also not flush), remount, and + * resynchronize. + * + * - Network links (via kdmsg) are relatively easy to issue as the + * complex underworkings of hammer2_chain.c don't have to be messed + * with (the protocol is at a higher level than block-level). + * + * - Multiple local disk nodes (i.e. block devices) are another matter. + * Chain operations have to be dispatched to per-node threads (xN) + * because we can't asynchronize potentially very complex chain + * operations in hammer2_chain.c (it would be a huge mess). + * + * (these threads are also used to terminate incoming kdmsg ops from + * other machines). + * + * - Single-node filesystems do not use threads and will simply call + * hammer2_chain.c functions directly. This short-cut is handled + * at the base of each cluster function. */ #include #include @@ -1105,7 +1161,8 @@ hammer2_cluster_snapshot(hammer2_trans_t *trans, hammer2_cluster_t *ocluster, /* * Return locked parent cluster given a locked child. The child remains - * locked on return. The new parent's focus follows the child's focus. + * locked on return. The new parent's focus follows the child's focus + * and the parent is always resolved. */ hammer2_cluster_t * hammer2_cluster_parent(hammer2_cluster_t *cluster) diff --git a/sys/vfs/hammer2/hammer2_disk.h b/sys/vfs/hammer2/hammer2_disk.h index 6b933aec70..715e682b8b 100644 --- a/sys/vfs/hammer2/hammer2_disk.h +++ b/sys/vfs/hammer2/hammer2_disk.h @@ -911,19 +911,35 @@ typedef struct hammer2_inode_data hammer2_inode_data_t; /* * PFS types identify a PFS on media and in LNK_SPAN messages. 
- * * PFS types >= 16 belong to HAMMER, 0-15 are defined in sys/dmsg.h + * + * For example, a mount operating in SOFT_MASTER mode might have nodes + * representing several MASTERs, CACHEs, and one SOFT_MASTER, and will + * operate by modifying the SOFT_MASTER and allowing another thread + * synchronize it to the MASTERs. But if it were operating in MASTER + * mode it would ignore the SOFT_MASTER and use the quorum protocol + * on the MASTERs. */ /* 0-15 reserved by sys/dmsg.h */ -#define HAMMER2_PFSTYPE_CACHE 16 -#define HAMMER2_PFSTYPE_COPY 17 -#define HAMMER2_PFSTYPE_SLAVE 18 -#define HAMMER2_PFSTYPE_SOFT_SLAVE 19 -#define HAMMER2_PFSTYPE_SOFT_MASTER 20 -#define HAMMER2_PFSTYPE_MASTER 21 -#define HAMMER2_PFSTYPE_SNAPSHOT 22 -#define HAMMER2_PFSTYPE_SUPROOT 23 -#define HAMMER2_PFSTYPE_MAX 32 +#define HAMMER2_PFSTYPE_NONE 0 +#define HAMMER2_PFSTYPE_CACHE 1 +#define HAMMER2_PFSTYPE_COPY 2 +#define HAMMER2_PFSTYPE_SLAVE 3 +#define HAMMER2_PFSTYPE_SOFT_SLAVE 4 +#define HAMMER2_PFSTYPE_SOFT_MASTER 5 +#define HAMMER2_PFSTYPE_MASTER 6 +#define HAMMER2_PFSTYPE_SNAPSHOT 7 +#define HAMMER2_PFSTYPE_SUPROOT 8 +#define HAMMER2_PFSTYPE_MAX 16 + +#define HAMMER2_PFSTYPE_MASK 0x0F + +/* + * PFS mode of operation is a bitmask. This is typically not stored + * on-media, but defined here because the field may be used in dmsgs. 
+ */ +#define HAMMER2_PFSMODE_QUORUM 0x01 +#define HAMMER2_PFSMODE_RW 0x02 /* * Allocation Table diff --git a/sys/vfs/hammer2/hammer2_iocom.c b/sys/vfs/hammer2/hammer2_iocom.c index 0c11a80efa..a8c402828a 100644 --- a/sys/vfs/hammer2/hammer2_iocom.c +++ b/sys/vfs/hammer2/hammer2_iocom.c @@ -91,9 +91,6 @@ hammer2_iocom_uninit(hammer2_mount_t *hmp) void hammer2_cluster_reconnect(hammer2_mount_t *hmp, struct file *fp) { - size_t name_len; - const char *name = "disk-volume"; - /* * Closes old comm descriptor, kills threads, cleans up * states, then installs the new descriptor and creates @@ -102,18 +99,18 @@ hammer2_cluster_reconnect(hammer2_mount_t *hmp, struct file *fp) kdmsg_iocom_reconnect(&hmp->iocom, fp, "hammer2"); /* - * Setup LNK_CONN fields for autoinitiated state machine. We - * will use SPANs to advertise multiple PFSs so only pass the - * fsid and HAMMER2_PFSTYPE_SUPROOT for the AUTOCONN. + * Setup LNK_CONN fields for autoinitiated state machine. LNK_CONN + * does not have to be unique. peer_id can be used to filter incoming + * LNK_SPANs automatically if desired (though we still need to check). + * peer_label typically identifies who we are and is not a filter. * - * Since we will be initiating multiple LNK_SPANs we cannot - * use AUTOTXSPAN, but we do use AUTORXSPAN so kdmsg tracks - * received LNK_SPANs, and we simply monitor those messages. + * Since we will be initiating multiple LNK_SPANs we cannot use + * AUTOTXSPAN, but we do use AUTORXSPAN so kdmsg tracks received + * LNK_SPANs, and we simply monitor those messages. 
*/ - bzero(&hmp->iocom.auto_lnk_conn.pfs_clid, - sizeof(hmp->iocom.auto_lnk_conn.pfs_clid)); - hmp->iocom.auto_lnk_conn.pfs_fsid = hmp->voldata.fsid; - hmp->iocom.auto_lnk_conn.pfs_type = HAMMER2_PFSTYPE_SUPROOT; + bzero(&hmp->iocom.auto_lnk_conn.peer_id, + sizeof(hmp->iocom.auto_lnk_conn.peer_id)); + /* hmp->iocom.auto_lnk_conn.peer_id = hmp->voldata.fsid; */ hmp->iocom.auto_lnk_conn.proto_version = DMSG_SPAN_PROTO_1; #if 0 hmp->iocom.auto_lnk_conn.peer_type = hmp->voldata.peer_type; @@ -121,12 +118,10 @@ hammer2_cluster_reconnect(hammer2_mount_t *hmp, struct file *fp) hmp->iocom.auto_lnk_conn.peer_type = DMSG_PEER_HAMMER2; /* - * Filter adjustment. Clients do not need visibility into other - * clients (otherwise millions of clients would present a serious - * problem). The fs_label also serves to restrict the namespace. + * We just want to receive LNK_SPANs related to HAMMER2 matching + * peer_id. */ hmp->iocom.auto_lnk_conn.peer_mask = 1LLU << DMSG_PEER_HAMMER2; - hmp->iocom.auto_lnk_conn.pfs_mask = (uint64_t)-1; #if 0 switch (ipdata->pfs_type) { @@ -139,12 +134,12 @@ hammer2_cluster_reconnect(hammer2_mount_t *hmp, struct file *fp) } #endif - name_len = strlen(name); - if (name_len >= sizeof(hmp->iocom.auto_lnk_conn.fs_label)) - name_len = sizeof(hmp->iocom.auto_lnk_conn.fs_label) - 1; - bcopy(name, hmp->iocom.auto_lnk_conn.fs_label, name_len); - hmp->iocom.auto_lnk_conn.fs_label[name_len] = 0; - + bzero(&hmp->iocom.auto_lnk_conn.peer_label, + sizeof(hmp->iocom.auto_lnk_conn.peer_label)); + ksnprintf(hmp->iocom.auto_lnk_conn.peer_label, + sizeof(hmp->iocom.auto_lnk_conn.peer_label), + "%s/%s", + hostname, "hammer2-mount"); kdmsg_iocom_autoinitiate(&hmp->iocom, hammer2_autodmsg); } @@ -260,11 +255,12 @@ hammer2_autodmsg(kdmsg_msg_t *msg) kdmsg_msg_reply(msg, 0); break; } - DMSG_TERMINATE_STRING(msg->any.lnk_span.fs_label); - kprintf("H2 +RXSPAN cmd=%08x (%-20s) cl=", msg->any.head.cmd, msg->any.lnk_span.fs_label); - printf_uuid(&msg->any.lnk_span.pfs_clid); + 
DMSG_TERMINATE_STRING(msg->any.lnk_span.peer_label); + kprintf("H2 +RXSPAN cmd=%08x (%-20s) cl=", + msg->any.head.cmd, msg->any.lnk_span.peer_label); + printf_uuid(&msg->any.lnk_span.peer_id); kprintf(" fs="); - printf_uuid(&msg->any.lnk_span.pfs_fsid); + printf_uuid(&msg->any.lnk_span.pfs_id); kprintf(" type=%d\n", msg->any.lnk_span.pfs_type); kdmsg_msg_result(msg, 0); break; @@ -315,15 +311,17 @@ hammer2_update_spans(hammer2_mount_t *hmp, kdmsg_state_t *state) rmsg = kdmsg_msg_alloc(&hmp->iocom.state0, DMSG_LNK_SPAN | DMSGF_CREATE, hammer2_lnk_span_reply, NULL); - rmsg->any.lnk_span.pfs_clid = ripdata->pfs_clid; - rmsg->any.lnk_span.pfs_fsid = ripdata->pfs_fsid; + rmsg->any.lnk_span.peer_id = ripdata->pfs_clid; + rmsg->any.lnk_span.pfs_id = ripdata->pfs_fsid; rmsg->any.lnk_span.pfs_type = ripdata->pfs_type; rmsg->any.lnk_span.peer_type = DMSG_PEER_HAMMER2; rmsg->any.lnk_span.proto_version = DMSG_SPAN_PROTO_1; name_len = ripdata->name_len; - if (name_len >= sizeof(rmsg->any.lnk_span.fs_label)) - name_len = sizeof(rmsg->any.lnk_span.fs_label) - 1; - bcopy(ripdata->filename, rmsg->any.lnk_span.fs_label, name_len); + if (name_len >= sizeof(rmsg->any.lnk_span.peer_label)) + name_len = sizeof(rmsg->any.lnk_span.peer_label) - 1; + bcopy(ripdata->filename, + rmsg->any.lnk_span.peer_label, + name_len); kdmsg_msg_write(rmsg); -- 2.11.4.GIT