2 * QEMU Xen emulation: The actual implementation of XenStore
4 * Copyright © 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
6 * Authors: David Woodhouse <dwmw2@infradead.org>, Paul Durrant <paul@xen.org>
8 * This work is licensed under the terms of the GNU GPL, version 2 or later.
9 * See the COPYING file in the top-level directory.
12 #include "qemu/osdep.h"
13 #include "qom/object.h"
15 #include "hw/xen/xen.h"
17 #include "xen_xenstore.h"
18 #include "xenstore_impl.h"
20 #include "hw/xen/interface/io/xs_wire.h"
22 #define XS_MAX_WATCHES 128
23 #define XS_MAX_DOMAIN_NODES 1000
24 #define XS_MAX_NODE_SIZE 2048
25 #define XS_MAX_TRANSACTIONS 10
26 #define XS_MAX_PERMS_PER_NODE 5
28 #define XS_VALID_CHARS "abcdefghijklmnopqrstuvwxyz" \
29 "ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
32 typedef struct XsNode
{
40 unsigned int serialized_tx
;
41 #ifdef XS_NODE_UNIT_TEST
42 gchar
*name
; /* debug only */
46 typedef struct XsWatch
{
55 typedef struct XsTransaction
{
57 unsigned int nr_nodes
;
63 struct XenstoreImplState
{
65 unsigned int nr_nodes
;
67 unsigned int nr_domu_watches
;
68 GHashTable
*transactions
;
69 unsigned int nr_domu_transactions
;
76 static void nobble_tx(gpointer key
, gpointer value
, gpointer user_data
)
78 unsigned int *new_tx_id
= user_data
;
79 XsTransaction
*tx
= value
;
81 if (tx
->base_tx
== *new_tx_id
) {
82 /* Transactions based on XBT_NULL will always fail */
83 tx
->base_tx
= XBT_NULL
;
87 static inline unsigned int next_tx(struct XenstoreImplState
*s
)
91 /* Find the next TX id which isn't either XBT_NULL or in use. */
94 } while (tx_id
== XBT_NULL
|| tx_id
== s
->root_tx
||
95 g_hash_table_lookup(s
->transactions
, GINT_TO_POINTER(tx_id
)));
98 * It is vanishingly unlikely, but ensure that no outstanding transaction
99 * is based on the (previous incarnation of the) newly-allocated TX id.
101 g_hash_table_foreach(s
->transactions
, nobble_tx
, &tx_id
);
106 static inline XsNode
*xs_node_new(void)
108 XsNode
*n
= g_new0(XsNode
, 1);
111 #ifdef XS_NODE_UNIT_TEST
113 xs_node_list
= g_list_prepend(xs_node_list
, n
);
118 static inline XsNode
*xs_node_ref(XsNode
*n
)
120 /* With just 10 transactions, it can never get anywhere near this. */
121 g_assert(n
->ref
< INT_MAX
);
128 static inline void xs_node_unref(XsNode
*n
)
139 g_byte_array_unref(n
->content
);
142 g_list_free_full(n
->perms
, g_free
);
145 g_hash_table_unref(n
->children
);
147 #ifdef XS_NODE_UNIT_TEST
150 xs_node_list
= g_list_remove(xs_node_list
, n
);
155 char *xs_perm_as_string(unsigned int perm
, unsigned int domid
)
160 case XS_PERM_READ
| XS_PERM_WRITE
:
175 return g_strdup_printf("%c%u", letter
, domid
);
178 static gpointer
do_perm_copy(gconstpointer src
, gpointer user_data
)
180 return g_strdup(src
);
183 static XsNode
*xs_node_create(const char *name
, GList
*perms
)
185 XsNode
*n
= xs_node_new();
187 #ifdef XS_NODE_UNIT_TEST
189 n
->name
= g_strdup(name
);
193 n
->perms
= g_list_copy_deep(perms
, do_perm_copy
, NULL
);
198 /* For copying from one hash table to another using g_hash_table_foreach() */
199 static void do_child_insert(gpointer key
, gpointer value
, gpointer user_data
)
201 g_hash_table_insert(user_data
, g_strdup(key
), xs_node_ref(value
));
204 static XsNode
*xs_node_copy(XsNode
*old
)
206 XsNode
*n
= xs_node_new();
208 n
->gencnt
= old
->gencnt
;
210 #ifdef XS_NODE_UNIT_TEST
212 n
->name
= g_strdup(old
->name
);
218 n
->children
= g_hash_table_new_full(g_str_hash
, g_str_equal
, g_free
,
219 (GDestroyNotify
)xs_node_unref
);
220 g_hash_table_foreach(old
->children
, do_child_insert
, n
->children
);
223 n
->perms
= g_list_copy_deep(old
->perms
, do_perm_copy
, NULL
);
226 n
->content
= g_byte_array_ref(old
->content
);
231 /* Returns true if it made a change to the hash table */
232 static bool xs_node_add_child(XsNode
*n
, const char *path_elem
, XsNode
*child
)
234 assert(!strchr(path_elem
, '/'));
238 return g_hash_table_remove(n
->children
, path_elem
);
241 #ifdef XS_NODE_UNIT_TEST
243 child
->name
= g_strdup(path_elem
);
246 n
->children
= g_hash_table_new_full(g_str_hash
, g_str_equal
, g_free
,
247 (GDestroyNotify
)xs_node_unref
);
251 * The documentation for g_hash_table_insert() says that it "returns a
252 * boolean value to indicate whether the newly added value was already
253 * in the hash table or not."
255 * It could perhaps be clearer that returning TRUE means it wasn't,
257 return g_hash_table_insert(n
->children
, g_strdup(path_elem
), child
);
261 struct XenstoreImplState
*s
;
262 char path
[XENSTORE_ABS_PATH_MAX
+ 2]; /* Two NUL terminators */
263 int (*op_fn
)(XsNode
**n
, struct walk_op
*op
);
271 /* The number of nodes which will exist in the tree if this op succeeds. */
272 unsigned int new_nr_nodes
;
275 * This is maintained on the way *down* the walk to indicate
276 * whether nodes can be modified in place or whether COW is
277 * required. It starts off being true, as we're always going to
278 * replace the root node. If we walk into a shared subtree it
279 * becomes false. If we start *creating* new nodes for a write,
280 * it becomes true again.
282 * Do not use it on the way back up.
289 /* Tracking during recursion so we know which is first. */
293 static void fire_watches(struct walk_op
*op
, bool parents
)
298 if (!op
->mutating
|| op
->in_transaction
) {
306 w
= g_hash_table_lookup(op
->s
->watches
, op
->path
);
309 /* Fire the parent nodes from 'op' if asked to */
315 assert(strlen(op
->path
) > w
->rel_prefix
);
316 w
->cb(w
->cb_opaque
, op
->path
+ w
->rel_prefix
, w
->token
);
322 static int xs_node_add_content(XsNode
**n
, struct walk_op
*op
)
324 GByteArray
*data
= op
->op_opaque
;
328 * The real XenStored includes permissions and names of child nodes
329 * in the calculated datasize but life's too short. For a single
330 * tenant internal XenStore, we don't have to be quite as pedantic.
332 if (data
->len
> XS_MAX_NODE_SIZE
) {
336 /* We *are* the node to be written. Either this or a copy. */
339 *n
= xs_node_copy(old
);
344 g_byte_array_unref((*n
)->content
);
346 (*n
)->content
= g_byte_array_ref(data
);
347 if (op
->tx_id
!= XBT_NULL
) {
348 (*n
)->modified_in_tx
= true;
353 static int xs_node_get_content(XsNode
**n
, struct walk_op
*op
)
355 GByteArray
*data
= op
->op_opaque
;
356 GByteArray
*node_data
;
361 node_data
= (*n
)->content
;
363 g_byte_array_append(data
, node_data
->data
, node_data
->len
);
369 static int node_rm_recurse(gpointer key
, gpointer value
, gpointer user_data
)
371 struct walk_op
*op
= user_data
;
372 int path_len
= strlen(op
->path
);
373 int key_len
= strlen(key
);
375 bool this_inplace
= op
->inplace
;
381 assert(key_len
+ path_len
+ 2 <= sizeof(op
->path
));
382 op
->path
[path_len
] = '/';
383 memcpy(op
->path
+ path_len
+ 1, key
, key_len
+ 1);
386 g_hash_table_foreach_remove(n
->children
, node_rm_recurse
, op
);
391 * Fire watches on *this* node but not the parents because they are
392 * going to be deleted too, so the watch will fire for them anyway.
394 fire_watches(op
, false);
395 op
->path
[path_len
] = '\0';
398 * Actually deleting the child here is just an optimisation; if we
399 * don't then the final unref on the topmost victim will just have
400 * to cascade down again repeating all the g_hash_table_foreach()
406 static XsNode
*xs_node_copy_deleted(XsNode
*old
, struct walk_op
*op
);
407 static void copy_deleted_recurse(gpointer key
, gpointer value
,
410 struct walk_op
*op
= user_data
;
411 GHashTable
*siblings
= op
->op_opaque2
;
412 XsNode
*n
= xs_node_copy_deleted(value
, op
);
415 * Reinsert the deleted_in_tx copy of the node into the parent's
416 * 'children' hash table. Having stashed it from op->op_opaque2
417 * before the recursive call to xs_node_copy_deleted() scribbled
420 g_hash_table_insert(siblings
, g_strdup(key
), n
);
423 static XsNode
*xs_node_copy_deleted(XsNode
*old
, struct walk_op
*op
)
425 XsNode
*n
= xs_node_new();
427 n
->gencnt
= old
->gencnt
;
429 #ifdef XS_NODE_UNIT_TEST
431 n
->name
= g_strdup(old
->name
);
436 n
->children
= g_hash_table_new_full(g_str_hash
, g_str_equal
, g_free
,
437 (GDestroyNotify
)xs_node_unref
);
438 op
->op_opaque2
= n
->children
;
439 g_hash_table_foreach(old
->children
, copy_deleted_recurse
, op
);
442 n
->perms
= g_list_copy_deep(old
->perms
, do_perm_copy
, NULL
);
444 n
->deleted_in_tx
= true;
445 /* If it gets resurrected we only fire a watch if it lost its content */
447 n
->modified_in_tx
= true;
453 static int xs_node_rm(XsNode
**n
, struct walk_op
*op
)
455 bool this_inplace
= op
->inplace
;
457 if (op
->tx_id
!= XBT_NULL
) {
458 /* It's not trivial to do inplace handling for this one */
460 *n
= xs_node_copy_deleted(old
, op
);
465 /* Fire watches for, and count, nodes in the subtree which get deleted */
466 if ((*n
)->children
) {
467 g_hash_table_foreach_remove((*n
)->children
, node_rm_recurse
, op
);
478 static int xs_node_get_perms(XsNode
**n
, struct walk_op
*op
)
480 GList
**perms
= op
->op_opaque
;
485 *perms
= g_list_copy_deep((*n
)->perms
, do_perm_copy
, NULL
);
489 static void parse_perm(const char *perm
, char *letter
, unsigned int *dom_id
)
491 unsigned int n
= sscanf(perm
, "%c%u", letter
, dom_id
);
496 static bool can_access(unsigned int dom_id
, GList
*perms
, const char *letters
)
500 unsigned int perm_dom_id
;
507 n
= g_list_length(perms
);
511 * The dom_id of the first perm is the owner, and the owner always has
514 parse_perm(g_list_nth_data(perms
, 0), &perm_letter
, &perm_dom_id
);
515 if (dom_id
== perm_dom_id
) {
520 * The letter of the first perm specified the default access for all other
523 access
= !!strchr(letters
, perm_letter
);
524 for (i
= 1; i
< n
; i
++) {
525 parse_perm(g_list_nth_data(perms
, i
), &perm_letter
, &perm_dom_id
);
526 if (dom_id
!= perm_dom_id
) {
529 access
= !!strchr(letters
, perm_letter
);
535 static int xs_node_set_perms(XsNode
**n
, struct walk_op
*op
)
537 GList
*perms
= op
->op_opaque
;
540 unsigned int perm_dom_id
;
543 /* A guest may not change permissions on nodes it does not own */
544 if (!can_access(op
->dom_id
, (*n
)->perms
, "")) {
548 /* A guest may not change the owner of a node it owns. */
549 parse_perm(perms
->data
, &perm_letter
, &perm_dom_id
);
550 if (perm_dom_id
!= op
->dom_id
) {
554 if (g_list_length(perms
) > XS_MAX_PERMS_PER_NODE
) {
559 /* We *are* the node to be written. Either this or a copy. */
562 *n
= xs_node_copy(old
);
567 g_list_free_full((*n
)->perms
, g_free
);
569 (*n
)->perms
= g_list_copy_deep(perms
, do_perm_copy
, NULL
);
570 if (op
->tx_id
!= XBT_NULL
) {
571 (*n
)->modified_in_tx
= true;
577 * Passed a full reference in *n which it may free if it needs to COW.
579 * When changing the tree, the op->inplace flag indicates whether this
580 * node may be modified in place (i.e. it and all its parents had a
581 * refcount of one). If walking down the tree we find a node whose
582 * refcount is higher, we must clear op->inplace and COW from there
583 * down. Unless we are creating new nodes as scaffolding for a write
584 * (which works like 'mkdir -p' does). In which case those newly
585 * created nodes can (and must) be modified in place again.
587 static int xs_node_walk(XsNode
**n
, struct walk_op
*op
)
589 char *child_name
= NULL
;
591 XsNode
*old
= *n
, *child
= NULL
;
592 bool stole_child
= false;
597 namelen
= strlen(op
->path
);
598 watch
= g_hash_table_lookup(op
->s
->watches
, op
->path
);
600 /* Is there a child, or do we hit the double-NUL termination? */
601 if (op
->path
[namelen
+ 1]) {
603 child_name
= op
->path
+ namelen
+ 1;
604 slash
= strchr(child_name
, '/');
608 op
->path
[namelen
] = '/';
611 /* If we walk into a subtree which is shared, we must COW */
612 if (op
->mutating
&& old
->ref
!= 1) {
617 const char *letters
= op
->mutating
? "wb" : "rb";
619 if (!can_access(op
->dom_id
, old
->perms
, letters
)) {
624 /* This is the actual node on which the operation shall be performed */
625 err
= op
->op_fn(n
, op
);
627 fire_watches(op
, true);
632 /* op->inplace will be further modified during the recursion */
633 this_inplace
= op
->inplace
;
635 if (old
&& old
->children
) {
636 child
= g_hash_table_lookup(old
->children
, child_name
);
637 /* This is a *weak* reference to 'child', owned by the hash table */
641 if (child
->deleted_in_tx
) {
642 assert(child
->ref
== 1);
643 /* Cannot actually set child->deleted_in_tx = false until later */
647 * Now we own it too. But if we can modify inplace, that's going to
648 * foil the check and force it to COW. We want to be the *only* owner
649 * so that it can be modified in place, so remove it from the hash
650 * table in that case. We'll add it (or its replacement) back later.
652 if (op
->mutating
&& this_inplace
) {
653 g_hash_table_remove(old
->children
, child_name
);
656 } else if (op
->create_dirs
) {
657 assert(op
->mutating
);
659 if (!can_access(op
->dom_id
, old
->perms
, "wb")) {
664 if (op
->dom_id
&& op
->new_nr_nodes
>= XS_MAX_DOMAIN_NODES
) {
669 child
= xs_node_create(child_name
, old
->perms
);
673 * If we're creating a new child, we can clearly modify it (and its
674 * children) in place from here on down.
683 * If there's a watch on this node, add it to the list to be fired
684 * (with the correct full pathname for the modified node) at the end.
687 op
->watches
= g_list_append(op
->watches
, watch
);
691 * Except for the temporary child-stealing as noted, our node has not
692 * changed yet. We don't yet know the overall operation will complete.
694 err
= xs_node_walk(&child
, op
);
697 op
->watches
= g_list_remove(op
->watches
, watch
);
700 if (err
|| !op
->mutating
) {
702 /* Put it back as it was. */
703 g_hash_table_replace(old
->children
, g_strdup(child_name
), child
);
705 xs_node_unref(child
);
711 * Now we know the operation has completed successfully and we're on
712 * the way back up. Make the change, substituting 'child' in the
716 *n
= xs_node_copy(old
);
721 * If we resurrected a deleted_in_tx node, we can mark it as no longer
722 * deleted now that we know the overall operation has succeeded.
724 if (op
->create_dirs
&& child
&& child
->deleted_in_tx
) {
726 child
->deleted_in_tx
= false;
730 * The child may be NULL here, for a remove operation. Either way,
731 * xs_node_add_child() will do the right thing and return a value
732 * indicating whether it changed the parent's hash table or not.
734 * We bump the parent gencnt if it adds a child that we *didn't*
735 * steal from it in the first place, or if child==NULL and was
736 * thus removed (whether we stole it earlier and didn't put it
737 * back, or xs_node_add_child() actually removed it now).
739 if ((xs_node_add_child(*n
, child_name
, child
) && !stole_child
) || !child
) {
744 op
->path
[namelen
] = '\0';
746 assert(!op
->watches
);
748 * On completing the recursion back up the path walk and reaching the
749 * top, assign the new node count if the operation was successful. If
750 * the main tree was changed, bump its tx ID so that outstanding
751 * transactions correctly fail. But don't bump it every time; only
752 * if it makes a difference.
754 if (!err
&& op
->mutating
) {
755 if (!op
->in_transaction
) {
756 if (op
->s
->root_tx
!= op
->s
->last_tx
) {
757 op
->s
->root_tx
= next_tx(op
->s
);
759 op
->s
->nr_nodes
= op
->new_nr_nodes
;
761 XsTransaction
*tx
= g_hash_table_lookup(op
->s
->transactions
,
762 GINT_TO_POINTER(op
->tx_id
));
764 tx
->nr_nodes
= op
->new_nr_nodes
;
771 static void append_directory_item(gpointer key
, gpointer value
,
774 GList
**items
= user_data
;
776 *items
= g_list_insert_sorted(*items
, g_strdup(key
), (GCompareFunc
)strcmp
);
779 /* Populates items with char * names which caller must free. */
780 static int xs_node_directory(XsNode
**n
, struct walk_op
*op
)
782 GList
**items
= op
->op_opaque
;
787 if ((*n
)->children
) {
788 g_hash_table_foreach((*n
)->children
, append_directory_item
, items
);
791 if (op
->op_opaque2
) {
792 *(uint64_t *)op
->op_opaque2
= (*n
)->gencnt
;
798 static int validate_path(char *outpath
, const char *userpath
,
801 size_t i
, pathlen
= strlen(userpath
);
803 if (!pathlen
|| userpath
[pathlen
] == '/' || strstr(userpath
, "//")) {
806 for (i
= 0; i
< pathlen
; i
++) {
807 if (!strchr(XS_VALID_CHARS
, userpath
[i
])) {
811 if (userpath
[0] == '/') {
812 if (pathlen
> XENSTORE_ABS_PATH_MAX
) {
815 memcpy(outpath
, userpath
, pathlen
+ 1);
817 if (pathlen
> XENSTORE_REL_PATH_MAX
) {
820 snprintf(outpath
, XENSTORE_ABS_PATH_MAX
, "/local/domain/%u/%s", dom_id
,
827 static int init_walk_op(XenstoreImplState
*s
, struct walk_op
*op
,
828 xs_transaction_t tx_id
, unsigned int dom_id
,
829 const char *path
, XsNode
***rootp
)
831 int ret
= validate_path(op
->path
, path
, dom_id
);
837 * We use *two* NUL terminators at the end of the path, as during the walk
838 * we will temporarily turn each '/' into a NUL to allow us to use that
839 * path element for the lookup.
841 op
->path
[strlen(op
->path
) + 1] = '\0';
845 op
->mutating
= false;
846 op
->create_dirs
= false;
847 op
->in_transaction
= false;
852 if (tx_id
== XBT_NULL
) {
854 op
->new_nr_nodes
= s
->nr_nodes
;
856 XsTransaction
*tx
= g_hash_table_lookup(s
->transactions
,
857 GINT_TO_POINTER(tx_id
));
862 op
->new_nr_nodes
= tx
->nr_nodes
;
863 op
->in_transaction
= true;
869 int xs_impl_read(XenstoreImplState
*s
, unsigned int dom_id
,
870 xs_transaction_t tx_id
, const char *path
, GByteArray
*data
)
873 * The data GByteArray shall exist, and will be freed by caller.
874 * Just g_byte_array_append() to it.
880 ret
= init_walk_op(s
, &op
, tx_id
, dom_id
, path
, &n
);
884 op
.op_fn
= xs_node_get_content
;
886 return xs_node_walk(n
, &op
);
889 int xs_impl_write(XenstoreImplState
*s
, unsigned int dom_id
,
890 xs_transaction_t tx_id
, const char *path
, GByteArray
*data
)
893 * The data GByteArray shall exist, will be freed by caller. You are
894 * free to use g_byte_array_steal() and keep the data. Or just ref it.
900 ret
= init_walk_op(s
, &op
, tx_id
, dom_id
, path
, &n
);
904 op
.op_fn
= xs_node_add_content
;
907 op
.create_dirs
= true;
908 return xs_node_walk(n
, &op
);
911 int xs_impl_directory(XenstoreImplState
*s
, unsigned int dom_id
,
912 xs_transaction_t tx_id
, const char *path
,
913 uint64_t *gencnt
, GList
**items
)
916 * The items are (char *) to be freed by caller. Although it's consumed
917 * immediately so if you want to change it to (const char *) and keep
918 * them, go ahead and change the caller.
924 ret
= init_walk_op(s
, &op
, tx_id
, dom_id
, path
, &n
);
928 op
.op_fn
= xs_node_directory
;
929 op
.op_opaque
= items
;
930 op
.op_opaque2
= gencnt
;
931 return xs_node_walk(n
, &op
);
934 int xs_impl_transaction_start(XenstoreImplState
*s
, unsigned int dom_id
,
935 xs_transaction_t
*tx_id
)
939 if (*tx_id
!= XBT_NULL
) {
943 if (dom_id
&& s
->nr_domu_transactions
>= XS_MAX_TRANSACTIONS
) {
947 tx
= g_new0(XsTransaction
, 1);
949 tx
->nr_nodes
= s
->nr_nodes
;
950 tx
->tx_id
= next_tx(s
);
951 tx
->base_tx
= s
->root_tx
;
952 tx
->root
= xs_node_ref(s
->root
);
955 g_hash_table_insert(s
->transactions
, GINT_TO_POINTER(tx
->tx_id
), tx
);
957 s
->nr_domu_transactions
++;
963 static gboolean
tx_commit_walk(gpointer key
, gpointer value
,
966 struct walk_op
*op
= user_data
;
967 int path_len
= strlen(op
->path
);
968 int key_len
= strlen(key
);
969 bool fire_parents
= true;
977 if (n
->deleted_in_tx
) {
979 * We fire watches on our parents if we are the *first* node
980 * to be deleted (the topmost one). This matches the behaviour
981 * when deleting in the live tree.
983 fire_parents
= !op
->deleted_in_tx
;
985 /* Only used on the way down so no need to clear it later */
986 op
->deleted_in_tx
= true;
989 assert(key_len
+ path_len
+ 2 <= sizeof(op
->path
));
990 op
->path
[path_len
] = '/';
991 memcpy(op
->path
+ path_len
+ 1, key
, key_len
+ 1);
993 watch
= g_hash_table_lookup(op
->s
->watches
, op
->path
);
995 op
->watches
= g_list_append(op
->watches
, watch
);
999 g_hash_table_foreach_remove(n
->children
, tx_commit_walk
, op
);
1003 op
->watches
= g_list_remove(op
->watches
, watch
);
1007 * Don't fire watches if this node was only copied because a
1008 * descendent was changed. The modified_in_tx flag indicates the
1009 * ones which were really changed.
1011 if (n
->modified_in_tx
|| n
->deleted_in_tx
) {
1012 fire_watches(op
, fire_parents
);
1013 n
->modified_in_tx
= false;
1015 op
->path
[path_len
] = '\0';
1017 /* Deleted nodes really do get expunged when we commit */
1018 return n
->deleted_in_tx
;
1021 static int transaction_commit(XenstoreImplState
*s
, XsTransaction
*tx
)
1027 if (s
->root_tx
!= tx
->base_tx
) {
1030 xs_node_unref(s
->root
);
1033 s
->root_tx
= tx
->tx_id
;
1034 s
->nr_nodes
= tx
->nr_nodes
;
1036 ret
= init_walk_op(s
, &op
, XBT_NULL
, tx
->dom_id
, "/", &n
);
1038 * There are two reasons why init_walk_op() may fail: an invalid tx_id,
1039 * or an invalid path. We pass XBT_NULL and "/", and it cannot fail.
1040 * If it does, the world is broken. And returning 'ret' would be weird
1041 * because the transaction *was* committed, and all this tree walk is
1042 * trying to do is fire the resulting watches on newly-committed nodes.
1046 op
.deleted_in_tx
= false;
1050 * Walk the new root and fire watches on any node which has a
1051 * refcount of one (which is therefore unique to this transaction).
1053 if (s
->root
->children
) {
1054 g_hash_table_foreach_remove(s
->root
->children
, tx_commit_walk
, &op
);
1060 int xs_impl_transaction_end(XenstoreImplState
*s
, unsigned int dom_id
,
1061 xs_transaction_t tx_id
, bool commit
)
1064 XsTransaction
*tx
= g_hash_table_lookup(s
->transactions
,
1065 GINT_TO_POINTER(tx_id
));
1067 if (!tx
|| tx
->dom_id
!= dom_id
) {
1072 ret
= transaction_commit(s
, tx
);
1075 g_hash_table_remove(s
->transactions
, GINT_TO_POINTER(tx_id
));
1077 assert(s
->nr_domu_transactions
);
1078 s
->nr_domu_transactions
--;
1083 int xs_impl_rm(XenstoreImplState
*s
, unsigned int dom_id
,
1084 xs_transaction_t tx_id
, const char *path
)
1090 ret
= init_walk_op(s
, &op
, tx_id
, dom_id
, path
, &n
);
1094 op
.op_fn
= xs_node_rm
;
1096 return xs_node_walk(n
, &op
);
1099 int xs_impl_get_perms(XenstoreImplState
*s
, unsigned int dom_id
,
1100 xs_transaction_t tx_id
, const char *path
, GList
**perms
)
1106 ret
= init_walk_op(s
, &op
, tx_id
, dom_id
, path
, &n
);
1110 op
.op_fn
= xs_node_get_perms
;
1111 op
.op_opaque
= perms
;
1112 return xs_node_walk(n
, &op
);
1115 static void is_valid_perm(gpointer data
, gpointer user_data
)
1118 bool *valid
= user_data
;
1120 unsigned int dom_id
;
1126 if (sscanf(perm
, "%c%u", &letter
, &dom_id
) != 2) {
1144 int xs_impl_set_perms(XenstoreImplState
*s
, unsigned int dom_id
,
1145 xs_transaction_t tx_id
, const char *path
, GList
*perms
)
1152 if (!g_list_length(perms
)) {
1156 g_list_foreach(perms
, is_valid_perm
, &valid
);
1161 ret
= init_walk_op(s
, &op
, tx_id
, dom_id
, path
, &n
);
1165 op
.op_fn
= xs_node_set_perms
;
1166 op
.op_opaque
= perms
;
1168 return xs_node_walk(n
, &op
);
1171 static int do_xs_impl_watch(XenstoreImplState
*s
, unsigned int dom_id
,
1172 const char *path
, const char *token
,
1173 xs_impl_watch_fn fn
, void *opaque
)
1176 char abspath
[XENSTORE_ABS_PATH_MAX
+ 1];
1180 ret
= validate_path(abspath
, path
, dom_id
);
1185 /* Check for duplicates */
1186 l
= w
= g_hash_table_lookup(s
->watches
, abspath
);
1188 if (!g_strcmp0(token
, w
->token
) && opaque
== w
->cb_opaque
&&
1189 fn
== w
->cb
&& dom_id
== w
->dom_id
) {
1195 if (dom_id
&& s
->nr_domu_watches
>= XS_MAX_WATCHES
) {
1199 w
= g_new0(XsWatch
, 1);
1200 w
->token
= g_strdup(token
);
1202 w
->cb_opaque
= opaque
;
1204 w
->rel_prefix
= strlen(abspath
) - strlen(path
);
1206 /* l was looked up above when checking for duplicates */
1211 g_hash_table_insert(s
->watches
, g_strdup(abspath
), w
);
1214 s
->nr_domu_watches
++;
1220 int xs_impl_watch(XenstoreImplState
*s
, unsigned int dom_id
, const char *path
,
1221 const char *token
, xs_impl_watch_fn fn
, void *opaque
)
1223 int ret
= do_xs_impl_watch(s
, dom_id
, path
, token
, fn
, opaque
);
1226 /* A new watch should fire immediately */
1227 fn(opaque
, path
, token
);
1233 static XsWatch
*free_watch(XenstoreImplState
*s
, XsWatch
*w
)
1235 XsWatch
*next
= w
->next
;
1238 assert(s
->nr_domu_watches
);
1239 s
->nr_domu_watches
--;
1248 int xs_impl_unwatch(XenstoreImplState
*s
, unsigned int dom_id
,
1249 const char *path
, const char *token
,
1250 xs_impl_watch_fn fn
, void *opaque
)
1252 char abspath
[XENSTORE_ABS_PATH_MAX
+ 1];
1256 ret
= validate_path(abspath
, path
, dom_id
);
1261 w
= g_hash_table_lookup(s
->watches
, abspath
);
1267 * The hash table contains the first element of a list of
1268 * watches. Removing the first element in the list is a
1269 * special case because we have to update the hash table to
1270 * point to the next (or remove it if there's nothing left).
1272 if (!g_strcmp0(token
, w
->token
) && fn
== w
->cb
&& opaque
== w
->cb_opaque
&&
1273 dom_id
== w
->dom_id
) {
1275 /* Insert the previous 'next' into the hash table */
1276 g_hash_table_insert(s
->watches
, g_strdup(abspath
), w
->next
);
1278 /* Nothing left; remove from hash table */
1279 g_hash_table_remove(s
->watches
, abspath
);
1286 * We're all done messing with the hash table because the element
1287 * it points to has survived the cull. Now it's just a simple
1288 * linked list removal operation.
1290 for (l
= &w
->next
; *l
; l
= &w
->next
) {
1293 if (!g_strcmp0(token
, w
->token
) && fn
== w
->cb
&&
1294 opaque
!= w
->cb_opaque
&& dom_id
== w
->dom_id
) {
1295 *l
= free_watch(s
, w
);
1303 int xs_impl_reset_watches(XenstoreImplState
*s
, unsigned int dom_id
)
1306 guint nr_watch_paths
;
1309 watch_paths
= (char **)g_hash_table_get_keys_as_array(s
->watches
,
1312 for (i
= 0; i
< nr_watch_paths
; i
++) {
1313 XsWatch
*w1
= g_hash_table_lookup(s
->watches
, watch_paths
[i
]);
1314 XsWatch
*w2
, *w
, **l
;
1317 * w1 is the original list. The hash table has this pointer.
1318 * w2 is the head of our newly-filtered list.
1319 * w and l are temporary for processing. w is somewhat redundant
1320 * with *l but makes my eyes bleed less.
1326 if (w
->dom_id
== dom_id
) {
1327 /* If we're freeing the head of the list, bump w2 */
1331 *l
= free_watch(s
, w
);
1338 * If the head of the list survived the cull, we don't need to
1339 * touch the hash table and we're done with this path. Else...
1342 g_hash_table_steal(s
->watches
, watch_paths
[i
]);
1345 * It was already freed. (Don't worry, this whole thing is
1346 * single-threaded and nobody saw it in the meantime). And
1347 * having *stolen* it, we now own the watch_paths[i] string
1348 * so if we don't give it back to the hash table, we need
1352 g_hash_table_insert(s
->watches
, watch_paths
[i
], w2
);
1354 g_free(watch_paths
[i
]);
1358 g_free(watch_paths
);
1362 static void xs_tx_free(void *_tx
)
1364 XsTransaction
*tx
= _tx
;
1366 xs_node_unref(tx
->root
);
1371 XenstoreImplState
*xs_impl_create(unsigned int dom_id
)
1373 XenstoreImplState
*s
= g_new0(XenstoreImplState
, 1);
1376 s
->watches
= g_hash_table_new_full(g_str_hash
, g_str_equal
, g_free
, NULL
);
1377 s
->transactions
= g_hash_table_new_full(g_direct_hash
, g_direct_equal
,
1380 perms
= g_list_append(NULL
, xs_perm_as_string(XS_PERM_NONE
, 0));
1381 s
->root
= xs_node_create("/", perms
);
1382 g_list_free_full(perms
, g_free
);
1385 s
->root_tx
= s
->last_tx
= 1;
1390 static void clear_serialized_tx(gpointer key
, gpointer value
, gpointer opaque
)
1394 n
->serialized_tx
= XBT_NULL
;
1396 g_hash_table_foreach(n
->children
, clear_serialized_tx
, NULL
);
1400 static void clear_tx_serialized_tx(gpointer key
, gpointer value
,
1403 XsTransaction
*t
= value
;
1405 clear_serialized_tx(NULL
, t
->root
, NULL
);
1408 static void write_be32(GByteArray
*save
, uint32_t val
)
1410 uint32_t be
= htonl(val
);
1411 g_byte_array_append(save
, (void *)&be
, sizeof(be
));
1420 #define MODIFIED_IN_TX (1U << 0)
1421 #define DELETED_IN_TX (1U << 1)
1422 #define NODE_REF (1U << 2)
1424 static void save_node(gpointer key
, gpointer value
, gpointer opaque
)
1426 struct save_state
*ss
= opaque
;
1431 /* Child nodes (i.e. anything but the root) have a name */
1433 g_byte_array_append(ss
->bytes
, key
, strlen(key
) + 1);
1437 * If we already wrote this node, refer to the previous copy.
1438 * There's no rename/move in XenStore, so all we need to find
1439 * it is the tx_id of the transaction in which it exists. Which
1440 * may be the root tx.
1442 if (n
->serialized_tx
!= XBT_NULL
) {
1444 g_byte_array_append(ss
->bytes
, &flag
, 1);
1445 write_be32(ss
->bytes
, n
->serialized_tx
);
1448 n
->serialized_tx
= ss
->tx_id
;
1450 if (n
->modified_in_tx
) {
1451 flag
|= MODIFIED_IN_TX
;
1453 if (n
->deleted_in_tx
) {
1454 flag
|= DELETED_IN_TX
;
1456 g_byte_array_append(ss
->bytes
, &flag
, 1);
1459 write_be32(ss
->bytes
, n
->content
->len
);
1460 g_byte_array_append(ss
->bytes
, n
->content
->data
, n
->content
->len
);
1462 write_be32(ss
->bytes
, 0);
1465 for (l
= n
->perms
; l
; l
= l
->next
) {
1466 g_byte_array_append(ss
->bytes
, l
->data
, strlen(l
->data
) + 1);
1468 /* NUL termination after perms */
1469 g_byte_array_append(ss
->bytes
, (void *)"", 1);
1472 g_hash_table_foreach(n
->children
, save_node
, ss
);
1474 /* NUL termination after children (child name is NUL) */
1475 g_byte_array_append(ss
->bytes
, (void *)"", 1);
1479 static void save_tree(struct save_state
*ss
, uint32_t tx_id
, XsNode
*root
)
1481 write_be32(ss
->bytes
, tx_id
);
1483 save_node(NULL
, root
, ss
);
1486 static void save_tx(gpointer key
, gpointer value
, gpointer opaque
)
1488 uint32_t tx_id
= GPOINTER_TO_INT(key
);
1489 struct save_state
*ss
= opaque
;
1490 XsTransaction
*n
= value
;
1492 write_be32(ss
->bytes
, n
->base_tx
);
1493 write_be32(ss
->bytes
, n
->dom_id
);
1495 save_tree(ss
, tx_id
, n
->root
);
1498 static void save_watch(gpointer key
, gpointer value
, gpointer opaque
)
1500 struct save_state
*ss
= opaque
;
1503 /* We only save the *guest* watches. */
1505 gpointer relpath
= key
+ w
->rel_prefix
;
1506 g_byte_array_append(ss
->bytes
, relpath
, strlen(relpath
) + 1);
1507 g_byte_array_append(ss
->bytes
, (void *)w
->token
, strlen(w
->token
) + 1);
1511 GByteArray
*xs_impl_serialize(XenstoreImplState
*s
)
1513 struct save_state ss
;
1515 ss
.bytes
= g_byte_array_new();
1518 * node = flags [ real_node / node_ref ]
1519 * flags = uint8_t (MODIFIED_IN_TX | DELETED_IN_TX | NODE_REF)
1520 * node_ref = tx_id (in which the original version of this node exists)
1521 * real_node = content perms child* NUL
1522 * content = len data
1524 * data = uint8_t{len}
1533 * transaction = base_tx_id dom_id tree
1534 * base_tx_id = uint32_t
1537 * tx_list = tree transaction* XBT_NULL
1539 * watch = path token
1543 * watch_list = watch* NUL
1545 * xs_serialize_stream = last_tx tx_list watch_list
1546 * last_tx = uint32_t
1549 /* Clear serialized_tx in every node. */
1550 if (s
->serialized
) {
1551 clear_serialized_tx(NULL
, s
->root
, NULL
);
1552 g_hash_table_foreach(s
->transactions
, clear_tx_serialized_tx
, NULL
);
1555 s
->serialized
= true;
1557 write_be32(ss
.bytes
, s
->last_tx
);
1558 save_tree(&ss
, s
->root_tx
, s
->root
);
1559 g_hash_table_foreach(s
->transactions
, save_tx
, &ss
);
1561 write_be32(ss
.bytes
, XBT_NULL
);
1563 g_hash_table_foreach(s
->watches
, save_watch
, &ss
);
1564 g_byte_array_append(ss
.bytes
, (void *)"", 1);
1569 struct unsave_state
{
1570 char path
[XENSTORE_ABS_PATH_MAX
+ 1];
1571 XenstoreImplState
*s
;
1578 static int consume_be32(struct unsave_state
*us
, unsigned int *val
)
1582 if (us
->l
< sizeof(d
)) {
1585 memcpy(&d
, us
->d
, sizeof(d
));
1592 static int consume_string(struct unsave_state
*us
, char **str
, size_t *len
)
1600 l
= strnlen((void *)us
->d
, us
->l
);
1606 *str
= (void *)us
->d
;
1617 static XsNode
*lookup_node(XsNode
*n
, char *path
)
1619 char *slash
= strchr(path
, '/');
1622 if (path
[0] == '\0') {
1633 child
= g_hash_table_lookup(n
->children
, path
);
1642 return lookup_node(child
, slash
+ 1);
1645 static XsNode
*lookup_tx_node(struct unsave_state
*us
, unsigned int tx_id
)
1648 if (tx_id
== us
->s
->root_tx
) {
1649 return lookup_node(us
->s
->root
, us
->path
+ 1);
1652 t
= g_hash_table_lookup(us
->s
->transactions
, GINT_TO_POINTER(tx_id
));
1657 return lookup_node(t
->root
, us
->path
+ 1);
1660 static void count_child_nodes(gpointer key
, gpointer value
, gpointer user_data
)
1662 unsigned int *nr_nodes
= user_data
;
1668 g_hash_table_foreach(n
->children
, count_child_nodes
, nr_nodes
);
1672 static int consume_node(struct unsave_state
*us
, XsNode
**nodep
,
1673 unsigned int *nr_nodes
)
1686 if (flags
== NODE_REF
) {
1689 ret
= consume_be32(us
, &tx
);
1694 n
= lookup_tx_node(us
, tx
);
1700 g_hash_table_foreach(n
->children
, count_child_nodes
, nr_nodes
);
1705 if (flags
& ~(DELETED_IN_TX
| MODIFIED_IN_TX
)) {
1710 if (flags
& DELETED_IN_TX
) {
1711 n
->deleted_in_tx
= true;
1713 if (flags
& MODIFIED_IN_TX
) {
1714 n
->modified_in_tx
= true;
1716 ret
= consume_be32(us
, &datalen
);
1722 if (datalen
> us
->l
) {
1727 GByteArray
*node_data
= g_byte_array_new();
1728 g_byte_array_append(node_data
, us
->d
, datalen
);
1731 n
->content
= node_data
;
1733 if (us
->root_walk
) {
1734 n
->modified_in_tx
= true;
1741 ret
= consume_string(us
, &perm
, &permlen
);
1751 n
->perms
= g_list_append(n
->perms
, g_strdup(perm
));
1759 XsNode
*child
= NULL
;
1761 ret
= consume_string(us
, &childname
, &childlen
);
1771 pathend
= us
->path
+ strlen(us
->path
);
1772 strncat(us
->path
, "/", sizeof(us
->path
) - 1);
1773 strncat(us
->path
, childname
, sizeof(us
->path
) - 1);
1775 ret
= consume_node(us
, &child
, nr_nodes
);
1782 xs_node_add_child(n
, childname
, child
);
1786 * If the node has no data and no children we still want to fire
1789 if (us
->root_walk
&& !n
->children
) {
1790 n
->modified_in_tx
= true;
1794 if (!n
->deleted_in_tx
) {
1802 static int consume_tree(struct unsave_state
*us
, XsTransaction
*t
)
1806 ret
= consume_be32(us
, &t
->tx_id
);
1811 if (t
->tx_id
> us
->s
->last_tx
) {
1817 return consume_node(us
, &t
->root
, &t
->nr_nodes
);
1820 int xs_impl_deserialize(XenstoreImplState
*s
, GByteArray
*bytes
,
1821 unsigned int dom_id
, xs_impl_watch_fn watch_fn
,
1824 struct unsave_state us
;
1825 XsTransaction base_t
= { 0 };
1833 xs_impl_reset_watches(s
, dom_id
);
1834 g_hash_table_remove_all(s
->transactions
);
1836 xs_node_unref(s
->root
);
1838 s
->root_tx
= s
->last_tx
= XBT_NULL
;
1840 ret
= consume_be32(&us
, &s
->last_tx
);
1846 * Consume the base tree into a transaction so that watches can be
1847 * fired as we commit it. By setting us.root_walk we cause the nodes
1848 * to be marked as 'modified_in_tx' as they are created, so that the
1849 * watches are triggered on them.
1851 base_t
.dom_id
= dom_id
;
1852 base_t
.base_tx
= XBT_NULL
;
1853 us
.root_walk
= true;
1854 ret
= consume_tree(&us
, &base_t
);
1858 us
.root_walk
= false;
1861 * Commit the transaction now while the refcount on all nodes is 1.
1862 * Note that we haven't yet reinstated the *guest* watches but that's
1863 * OK because we don't want the guest to see any changes. Even any
1864 * backend nodes which get recreated should be *precisely* as they
1865 * were before the migration. Back ends may have been instantiated
1866 * already, and will see the frontend magically blink into existence
1867 * now (well, from the aio_bh which fires the watches). It's their
1868 * responsibility to rebuild everything precisely as it was before.
1870 ret
= transaction_commit(s
, &base_t
);
1876 unsigned int base_tx
;
1879 ret
= consume_be32(&us
, &base_tx
);
1883 if (base_tx
== XBT_NULL
) {
1887 t
= g_new0(XsTransaction
, 1);
1888 t
->base_tx
= base_tx
;
1890 ret
= consume_be32(&us
, &t
->dom_id
);
1892 ret
= consume_tree(&us
, t
);
1900 s
->nr_domu_transactions
++;
1902 g_hash_table_insert(s
->transactions
, GINT_TO_POINTER(t
->tx_id
), t
);
1907 size_t pathlen
, toklen
;
1909 ret
= consume_string(&us
, &path
, &pathlen
);
1917 ret
= consume_string(&us
, &token
, &toklen
);
1926 ret
= do_xs_impl_watch(s
, dom_id
, path
, token
, watch_fn
, watch_opaque
);