usched: Allow process to change self cpu affinity
[dragonfly.git] / sys / sys / dmsg.h
blobdc69ba226b432724f6f5ce39031ff0b4ad93f8a0
1 /*
2 * Copyright (c) 2011-2014 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@dragonflybsd.org>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
35 #ifndef _SYS_DMSG_H_
36 #define _SYS_DMSG_H_
38 #ifndef _SYS_MALLOC_H_
39 #include <sys/malloc.h>
40 #endif
41 #ifndef _SYS_TREE_H_
42 #include <sys/tree.h>
43 #endif
44 #ifndef _SYS_THREAD_H_
45 #include <sys/thread.h>
46 #endif
47 #ifndef _SYS_UUID_H_
48 #include <sys/uuid.h>
49 #endif
52 * Mesh network protocol structures.
54 * CONN PROTOCOL
56 * The mesh is constructed via point-to-point streaming links with varying
57 * levels of interconnectedness, forming a graph. Leafs of the graph are
58 * typically kernel devices (xdisk) or VFSs (HAMMER2). Internal nodes are
59 * usually (user level) hammer2 service demons.
61 * Upon connecting and after authentication, a LNK_CONN transaction is opened
62 * to configure the link. The SPAN protocol is then typically run over the
63 * open LNK_CONN transaction.
65 * Terminating the LNK_CONN transaction terminates everything running over it
66 * (typically open LNK_SPAN transactions), which in turn terminates everything
67 * running over the LNK_SPANs.
69 * SPAN PROTOCOL
71 * The SPAN protocol runs over an open LNK_CONN transaction and is used to
72 * advertise any number of services. For example, each PFS under a HAMMER2
73 * mount will be advertised as an open LNK_SPAN transaction.
75 * Any network node on the graph running multiple connections is capable
76 * of relaying LNK_SPANs from any connection to any other connection. This
77 * is typically done by the user-level hammer2 service demon, and typically
78 * not done by kernel devices or VFSs (though these entities must be able
79 * to manage multiple LNK_SPANs since they might advertise or need to talk
80 * to multiple services).
82 * Relaying is not necessarily trivial as it requires internal nodes to
83 * track two open transactions (on the two iocom interfaces) and translate
84 * the msgid and circuit. In addition, the relay may have to track multiple
85 * SPANs from the same iocom or from multiple iocoms which represent the same
86 * end-point and must select the best end-point, must send notifications when
87 * a better path is available, and must allow (when connectivity is still
88 * present) any existing, open, stacked sub-transactions to complete before
89 * terminating the less efficient SPAN.
91 * Relaying is optional. It is perfectly acceptable for the hammer2 service
92 * to plug a received socket descriptor directly into the appropriate kernel
93 * device driver.
95 * STACKED TRANSACTIONS
97 * Message transactions can be stacked. That is, you can initiate a DMSG
98 * transaction relative to another open transaction. Sub-transactions can
99 * be initiated without waiting for the parent transaction to complete its
100 * handshake.
102 * This is done by entering the open transaction's msgid as the circuit field
103 * in the new transaction (typically by populating msg->parent). The
104 * transaction tracking structure will be referenced and will track the
105 * sub-transaction. Note that msgids must still be unique on an
106 * iocom-by-iocom basis.
108 * Messages can race closing circuits. When a circuit is lost,
109 * messages are simulated to delete any sub-transactions.
111 * MESSAGE TRANSACTIONAL STATES
113 * Message transactions are handled by the CREATE, DELETE, REPLY, ABORT, and
114 * CREPLY flags. Message state is typically recorded at the end points and
115 * will be maintained (preventing reuse of the transaction id) until a DELETE
116 * is both sent and received.
118 * One-way messages such as those used for debug commands are not recorded
119 * and do not require any transactional state. These are sent without
120 * the CREATE, DELETE, or ABORT flags set. ABORT is not supported for
121 * one-off messages. The REPLY bit can be used to distinguish between
122 * command and status if desired.
124 * Transactional messages are messages which require a reply to be
125 * returned. These messages can also consist of multiple message elements
126 * for the command or reply or both (or neither). The command message
127 * sequence sets CREATE on the first message and DELETE on the last message.
128 * A single message command sets both (CREATE|DELETE). The reply message
129 * sequence works the same way but of course also sets the REPLY bit.
131 * Transactional messages can be aborted by sending a message element
132 * with the ABORT flag set. This flag can be combined with either or both
133 * the CREATE and DELETE flags. When combined with the CREATE flag the
134 * command is treated as non-blocking but still executes. When combined
135 * with the DELETE flag no additional message elements are required.
137 * Transactions are terminated by sending a message with DELETE set.
138 * Transactions must be CREATEd and DELETEd in both directions. If a
139 * transaction is governing stacked sub-transactions the sub-transactions
140 * are automatically terminated before the governing transaction is terminated.
141 * Terminates are handled by simulating a received DELETE and expecting the
142 * normal function callback and state machine to (ultimately) issue a
143 * terminating (DELETE) response.
145 * Transactions can operate in full-duplex as both sides are fully open
146 * (i.e. CREATE sent, CREATE|REPLY returned, DELETE not sent by anyone).
147 * Additional commands can be initiated from either side of the transaction.
149 * ABORT SPECIAL CASE - Mid-stream aborts. A mid-stream abort can be sent
150 * when supported by the sender by sending an ABORT message with neither
151 * CREATE nor DELETE set. This effectively turns the message into a
152 * non-blocking message (but depending on what is being represented can also
153 * cut short prior data elements in the stream).
155 * ABORT SPECIAL CASE - Abort-after-DELETE. Transactional messages have to be
156 * abortable if the stream/pipe/whatever is lost. In this situation any
157 * forwarding relay needs to unconditionally abort commands and replies that
158 * are still active. This is done by sending an ABORT|DELETE even in
159 * situations where a DELETE has already been sent in that direction. This
160 * is done, for example, when links are in a half-closed state. In this
161 * situation it is possible for the abort request to race a transition to the
162 * fully closed state. ABORT|DELETE messages which race the fully closed
163 * state are expected to be discarded by the other end.
165 * --
167 * All base and extended message headers are 64-byte aligned, and all
168 * transports must support extended message headers up to DMSG_HDR_MAX.
169 * Currently we allow extended message headers up to 2048 bytes. Note
170 * that the extended header size is encoded in the 'cmd' field of the header.
172 * Any in-band data is padded to a 64-byte alignment and placed directly
173 * after the extended header (after the higher-level cmd/rep structure).
174 * The actual unaligned size of the in-band data is encoded in the aux_bytes
175 * field in this case. Maximum data sizes are negotiated during registration.
177 * Auxiliary data can be in-band or out-of-band. In-band data sets aux_descr
178 * equal to 0. Any out-of-band data must be negotiated by the SPAN protocol.
180 * Auxiliary data, whether in-band or out-of-band, must be at least 64-byte
181 * aligned. The aux_bytes field contains the actual byte-granular length
182 * and not the aligned length. The crc is against the aligned length (so
183 * a faster crc algorithm can be used, theoretically).
185 * hdr_crc is calculated over the entire, ALIGNED extended header. For
186 * the purposes of calculating the crc, the hdr_crc field is 0. That is,
187 * if calculating the crc in HW a 32-bit '0' must be inserted in place of
188 * the hdr_crc field when reading the entire header and compared at the
189 * end (but the actual hdr_crc must be left intact in memory). A simple
190 * counter to replace the field going into the CRC generator does the job
191 * in HW. The CRC endian is based on the magic number field and may have
192 * to be byte-swapped, too (which is also easy to do in HW).
194 * aux_crc is calculated over the entire, ALIGNED auxiliary data.
196 * SHARED MEMORY IMPLEMENTATIONS
198 * Shared-memory implementations typically use a pipe to transmit the extended
199 * message header and shared memory to store any auxiliary data. Auxiliary
200 * data in one-way (non-transactional) messages is typically required to be
201 * inline. CRCs are still recommended and required at the beginning, but
202 * may be negotiated away later.
/*
 * Force NUL termination of a fixed-size character array by zeroing its
 * last element.  (ary) must be a real array (not a pointer) because the
 * index is derived from sizeof(ary).
 */
#define DMSG_TERMINATE_STRING(ary)	\
	do { (ary)[sizeof(ary) - 1] = 0; } while (0)
/*
 * Base message header.
 *
 * dmsg_hdr must be 64 bytes.  The hex value at the start of each field
 * comment is that field's byte offset within the header.
 */
struct dmsg_hdr {
	uint16_t	magic;		/* 00 sanity, synchro, endian	*/
	uint16_t	reserved02;	/* 02				*/
	uint32_t	salt;		/* 04 random salt helps w/crypto */

	uint64_t	msgid;		/* 08 message transaction id	*/
	uint64_t	circuit;	/* 10 circuit id or 0		*/
	uint64_t	reserved18;	/* 18				*/

	uint32_t	cmd;		/* 20 flags | cmd | hdr_size / ALIGN */
	uint32_t	aux_crc;	/* 24 auxiliary data crc	*/
	uint32_t	aux_bytes;	/* 28 auxiliary data length (bytes) */
	uint32_t	error;		/* 2C error code or 0		*/
	uint64_t	aux_descr;	/* 30 negotiated OOB data descr	*/
	uint32_t	reserved38;	/* 38				*/
	uint32_t	hdr_crc;	/* 3C (aligned) extended header crc */
};

typedef struct dmsg_hdr dmsg_hdr_t;

/*
 * DMSG_HDR_MAGIC_REV is DMSG_HDR_MAGIC with its two bytes swapped.
 *
 * DMSG_HDR_CRCOFF is the offset of the salt field and DMSG_HDR_CRCBYTES
 * is the number of base-header bytes from that offset to the end of
 * the structure.
 */
#define DMSG_HDR_MAGIC		0x4832
#define DMSG_HDR_MAGIC_REV	0x3248
#define DMSG_HDR_CRCOFF		offsetof(dmsg_hdr_t, salt)
#define DMSG_HDR_CRCBYTES	(sizeof(dmsg_hdr_t) - DMSG_HDR_CRCOFF)
/*
 * Administrative protocol limits.
 *
 * NOTE: A dmsg header must completely fit in the (fifo) buffer, but
 *	 dmsg aux data does not have to completely fit.  The dmsg
 *	 structure allows headers up to 255*64 = 16320 bytes.  There
 *	 is no real limit on the aux_data other than what we deem
 *	 reasonable and defensible (i.e. not run processes or the
 *	 kernel out of memory).  But it should be able to handle at
 *	 least MAXPHYS bytes which is typically 128KB or 256KB.
 *
 * DMSG_BUF_SIZE is four maximum-sized headers; DMSG_BUF_MASK is
 * DMSG_BUF_SIZE - 1 and is only a valid bit mask while DMSG_BUF_SIZE
 * remains a power of 2.
 */
#define DMSG_HDR_MAX		2048		/* <= 8192 */
#define DMSG_AUX_MAX		(1024*1024)	/* <= 1MB */
#define DMSG_BUF_SIZE		(DMSG_HDR_MAX * 4)
#define DMSG_BUF_MASK		(DMSG_BUF_SIZE - 1)
/*
 * The message (cmd) field also encodes various flags and the total size
 * of the message header.  This allows the protocol processors to validate
 * persistency and structural settings for every command simply by
 * switch()ing on the (cmd) field.
 *
 * Layout of the 32-bit cmd field:
 *
 *	0xFF000000	flags
 *	0x00F00000	protocol
 *	0x000FFF00	command
 *	0x000000FF	header size in units of DMSG_ALIGN (64 bytes)
 */
#define DMSGF_CREATE		0x80000000U	/* msg start */
#define DMSGF_DELETE		0x40000000U	/* msg end */
#define DMSGF_REPLY		0x20000000U	/* reply path */
#define DMSGF_ABORT		0x10000000U	/* abort req */
#define DMSGF_REVTRANS		0x08000000U	/* opposite direction msgid */
#define DMSGF_REVCIRC		0x04000000U	/* opposite direction circuit */
#define DMSGF_FLAG1		0x02000000U
#define DMSGF_FLAG0		0x01000000U

#define DMSGF_FLAGS		0xFF000000U	/* all flags */
#define DMSGF_PROTOS		0x00F00000U	/* all protos */
#define DMSGF_CMDS		0x000FFF00U	/* all cmds */
#define DMSGF_SIZE		0x000000FFU	/* N*64 */

/*
 * XXX Future, flag that an in-line (not part of a CREATE/DELETE) command
 *     expects some sort of acknowledgement.  Allows protocol mismatches to
 *     be detected.
 *
 * This bit is the most significant bit of the DMSGF_CMDS field.
 */
#define DMSGF_CMDF_EXPECT_ACK	0x00080000U	/* in-line command no-ack */

/* Everything except the transaction flags, but including REPLY. */
#define DMSGF_CMDSWMASK		(DMSGF_CMDS |	\
				 DMSGF_SIZE |	\
				 DMSGF_PROTOS |	\
				 DMSGF_REPLY)

/* Protocol, command and size only; no flags at all. */
#define DMSGF_BASECMDMASK	(DMSGF_CMDS |	\
				 DMSGF_SIZE |	\
				 DMSGF_PROTOS)

/* Protocol, command and size plus the CREATE/DELETE/REPLY flags. */
#define DMSGF_TRANSMASK		(DMSGF_CMDS |	\
				 DMSGF_SIZE |	\
				 DMSGF_PROTOS |	\
				 DMSGF_REPLY |	\
				 DMSGF_CREATE |	\
				 DMSGF_DELETE)

#define DMSGF_BASEFLAGS		(DMSGF_CREATE | DMSGF_DELETE | DMSGF_REPLY)
/*
 * Protocol identifiers, occupying the DMSGF_PROTOS (0x00F00000) nibble
 * of the cmd field.
 */
#define DMSG_PROTO_LNK		0x00000000U
#define DMSG_PROTO_DBG		0x00100000U
#define DMSG_PROTO_HM2		0x00200000U
#define DMSG_PROTO_XX3		0x00300000U
#define DMSG_PROTO_XX4		0x00400000U
#define DMSG_PROTO_BLK		0x00500000U
#define DMSG_PROTO_VOP		0x00600000U

/*
 * Message command constructors, sans flags
 *
 * DMSG_DOALIGN(bytes) rounds bytes up to a multiple of DMSG_ALIGN.
 *
 * DMSG_HDR_ENCODE(elm) yields sizeof(struct elm) rounded up and expressed
 * in DMSG_ALIGN units; elm is a structure tag, not a typedef name.
 *
 * Each DMSG_xxx(cmd, elm) constructor ORs together the protocol id, the
 * command number shifted into bits 8 and up, and the encoded header size.
 */
#define DMSG_ALIGN		64
#define DMSG_ALIGNMASK		(DMSG_ALIGN - 1)
#define DMSG_DOALIGN(bytes)	(((bytes) + DMSG_ALIGNMASK) &		\
				 ~DMSG_ALIGNMASK)

#define DMSG_HDR_ENCODE(elm)	(((uint32_t)sizeof(struct elm) +	\
				  DMSG_ALIGNMASK) /			\
				 DMSG_ALIGN)

#define DMSG_LNK(cmd, elm)	(DMSG_PROTO_LNK |			\
				 ((cmd) << 8) |				\
				 DMSG_HDR_ENCODE(elm))

#define DMSG_DBG(cmd, elm)	(DMSG_PROTO_DBG |			\
				 ((cmd) << 8) |				\
				 DMSG_HDR_ENCODE(elm))

#define DMSG_HM2(cmd, elm)	(DMSG_PROTO_HM2 |			\
				 ((cmd) << 8) |				\
				 DMSG_HDR_ENCODE(elm))

#define DMSG_BLK(cmd, elm)	(DMSG_PROTO_BLK |			\
				 ((cmd) << 8) |				\
				 DMSG_HDR_ENCODE(elm))

#define DMSG_VOP(cmd, elm)	(DMSG_PROTO_VOP |			\
				 ((cmd) << 8) |				\
				 DMSG_HDR_ENCODE(elm))
/*
 * Link layer ops basically talk to just the other side of a direct
 * connection.
 *
 * LNK_PAD	- One-way message on circuit 0, ignored by target.  Used to
 *		  pad message buffers on shared-memory transports.  Not
 *		  typically used with TCP.
 *
 * LNK_PING	- One-way message on circuit-0, keep-alive, run by both sides
 *		  typically 1/sec on idle link, link is lost after 10 seconds
 *		  of inactivity.
 *
 * LNK_AUTH	- Authenticate the connection, negotiate administrative
 *		  rights & encryption, protocol class, etc.  Only PAD and
 *		  AUTH messages (not even PING) are accepted until
 *		  authentication is complete.  This message also identifies
 *		  the host.
 *
 * LNK_CONN	- Enable the SPAN protocol on circuit-0, possibly also
 *		  installing a PFS filter (by cluster id, unique id, and/or
 *		  wildcarded name).
 *
 * LNK_SPAN	- A SPAN transaction typically on iocom->state0 enables
 *		  messages to be relayed to/from a particular cluster node.
 *		  SPANs are received, sorted, aggregated, filtered, and
 *		  retransmitted back out across all applicable connections.
 *
 *		  The leaf protocol also uses this to make a PFS available
 *		  to the cluster (e.g. on-mount).
 */
#define DMSG_LNK_PAD		DMSG_LNK(0x000, dmsg_hdr)
#define DMSG_LNK_PING		DMSG_LNK(0x001, dmsg_hdr)
#define DMSG_LNK_AUTH		DMSG_LNK(0x010, dmsg_lnk_auth)
#define DMSG_LNK_CONN		DMSG_LNK(0x011, dmsg_lnk_conn)
#define DMSG_LNK_SPAN		DMSG_LNK(0x012, dmsg_lnk_span)
#define DMSG_LNK_ERROR		DMSG_LNK(0xFFF, dmsg_hdr)

/*
 * Reserved command codes for third party subsystems.  Structure size is
 * not known here so do not try to construct the full DMSG_LNK_ define.
 */
#define DMSG_LNK_CMD_HAMMER2_VOLCONF	0x20

#define DMSG_LABEL_SIZE		128	/* fixed at 128, do not change */
383 * LNK_AUTH - Authentication (often omitted)
385 struct dmsg_lnk_auth {
386 dmsg_hdr_t head;
387 char dummy[64];
391 * LNK_CONN - Register connection info for SPAN protocol
392 * (transaction, left open, iocom->state0 only).
394 * LNK_CONN identifies a streaming connection into the cluster.
396 * peer_mask serves to filter the SPANs we receive by peer_type. A cluster
397 * controller typically sets this to (uint64_t)-1, indicating that it wants
398 * everything. A block devfs interface might set it to 1 << DMSG_PEER_DISK,
399 * and a hammer2 mount might set it to 1 << DMSG_PEER_HAMMER2.
401 * media_iud allows multiple (e.g. HAMMER2) connections belonging to the same
402 * media to transmit duplicative LNK_VOLCONF updates without causing confusion
403 * in the cluster controller.
405 * pfs_clid, pfs_fsid, pfs_type, and label are peer-specific and must be
406 * left empty (zero-fill) if not supported by a particular peer.
408 struct dmsg_lnk_conn {
409 dmsg_hdr_t head;
410 uuid_t media_id; /* media configuration id */
411 uuid_t peer_id; /* unique peer uuid */
412 uuid_t reserved01;
413 uint64_t peer_mask; /* PEER mask for SPAN filtering */
414 uint8_t peer_type; /* see DMSG_PEER_xxx */
415 uint8_t reserved02;
416 uint16_t proto_version; /* high level protocol support */
417 uint32_t status; /* status flags */
418 uint32_t rnss; /* node's generated rnss */
419 uint8_t reserved03[8];
420 uint32_t reserved04[14];
421 char peer_label[DMSG_LABEL_SIZE]; /* peer identity string */
424 typedef struct dmsg_lnk_conn dmsg_lnk_conn_t;
/*
 * PEER types 0-63 are defined here.  There is a limit of 64 types due to
 * the width of peer_mask.
 *
 * PFS types depend on the peer type.  sys/dmsg.h only defines the default.
 * peer-specific headers define PFS types for any given peer.
 */
#define DMSG_PEER_NONE		0
#define DMSG_PEER_ROUTER	1	/* server: cluster controller */
#define DMSG_PEER_BLOCK		2	/* server: block devices */
#define DMSG_PEER_HAMMER2	3	/* server: h2 mounted volume */
#define DMSG_PEER_CLIENT	63	/* a client connection */
#define DMSG_PEER_MAX		64

#define DMSG_PFSTYPE_DEFAULT	0
#define DMSG_PFSTYPE_MASK	0x0F
/*
 * Structures embedded in LNK_SPAN
 */
struct dmsg_media_block {
	uint64_t	bytes;		/* media size in bytes */
	uint32_t	blksize;	/* media block size */
	uint32_t	reserved01;
};

typedef struct dmsg_media_block dmsg_media_block_t;
455 * LNK_SPAN - Initiate or relay a SPAN
456 * (transaction, left open, typically only on iocom->state0)
458 * This message registers an end-point with the other end of the connection,
459 * telling the other end who we are and what we can provide or intend to
460 * consume. Multiple registrations can be maintained as open transactions
461 * with each one specifying a unique end-point.
463 * Registrations are sent from {source}=S {1...n} to {target}=0 and maintained
464 * as open transactions. Registrations are also received and maintains as
465 * open transactions, creating a matrix of linkid's.
467 * While these transactions are open additional transactions can be executed
468 * between any two linkid's {source}=S (registrations we sent) to {target}=T
469 * (registrations we received).
471 * Closure of any registration transaction will automatically abort any open
472 * transactions using the related linkids. Closure can be initiated
473 * voluntarily from either side with either end issuing a DELETE, or they
474 * can be ABORTed.
476 * Status updates are performed via the open transaction.
478 * --
480 * A registration identifies a node and its various PFS parameters including
481 * the PFS_TYPE. For example, a diskless HAMMER2 client typically identifies
482 * itself as PFSTYPE_CLIENT.
484 * Any node may serve as a cluster controller, aggregating and passing
485 * on received registrations, but end-points do not have to implement this
486 * ability. Most end-points typically implement a single client-style or
487 * server-style PFS_TYPE and rendezvous at a cluster controller.
489 * The cluster controller does not aggregate/pass-on all received
490 * registrations. It typically filters what gets passed on based on what it
491 * receives, passing on only the best candidates.
493 * If a symmetric spanning tree is desired additional candidates whos
494 * {dist, rnss} fields match the last best candidate must also be propagated.
495 * This feature is not currently enabled.
497 * STATUS UPDATES: Status updates use the same structure but typically
498 * only contain incremental changes to e.g. pfs_type, with
499 * a text description sent as out-of-band data.
501 struct dmsg_lnk_span {
502 dmsg_hdr_t head;
503 uuid_t peer_id;
504 uuid_t pfs_id; /* unique pfs id */
505 uint8_t pfs_type; /* PFS type */
506 uint8_t peer_type; /* PEER type */
507 uint16_t proto_version; /* high level protocol support */
508 uint32_t status; /* status flags */
509 uint8_t reserved02[8];
510 uint32_t dist; /* span distance */
511 uint32_t rnss; /* random number sub-sort */
512 union {
513 uint32_t reserved03[14];
514 dmsg_media_block_t block;
515 } media;
518 * NOTE: for PEER_HAMMER2 cl_label is typically empty and fs_label
519 * is the superroot directory name.
521 * for PEER_BLOCK cl_label is typically host/device and
522 * fs_label is typically the serial number string.
524 char peer_label[DMSG_LABEL_SIZE]; /* peer label */
525 char pfs_label[DMSG_LABEL_SIZE]; /* PFS label */
528 typedef struct dmsg_lnk_span dmsg_lnk_span_t;
530 #define DMSG_SPAN_PROTO_1 1
533 * Debug layer ops operate on any link
535 * SHELL - Persist stream, access the debug shell on the target
536 * registration. Multiple shells can be operational.
538 #define DMSG_DBG_SHELL DMSG_DBG(0x001, dmsg_dbg_shell)
540 struct dmsg_dbg_shell {
541 dmsg_hdr_t head;
543 typedef struct dmsg_dbg_shell dmsg_dbg_shell_t;
/*
 * Hammer2 layer ops (low-level chain manipulation used by cluster code)
 *
 * HM2_OPENPFS	- Attach a PFS
 * HM2_FLUSHPFS - Flush a PFS
 *
 * HM2_LOOKUP	- Lookup chain (parent-relative transaction)
 *		  (can request multiple chains)
 * HM2_NEXT	- Lookup next chain (parent-relative transaction)
 *		  (can request multiple chains)
 * HM2_LOCK	- [Re]lock a chain (chain-relative) (non-recursive)
 * HM2_UNLOCK	- Unlock a chain (chain-relative) (non-recursive)
 * HM2_RESIZE	- Resize a chain (chain-relative)
 * HM2_MODIFY	- Modify a chain (chain-relative)
 * HM2_CREATE	- Create a chain (parent-relative)
 * HM2_DUPLICATE- Duplicate a chain (target-parent-relative)
 * HM2_DELDUP	- Delete-Duplicate a chain (chain-relative)
 * HM2_DELETE	- Delete a chain (chain-relative)
 * HM2_SNAPSHOT	- Create a snapshot (snapshot-root-relative, w/clid override)
 *
 * Only HM2_OPENPFS has a command code defined here.
 *
 * NOTE(review): struct dmsg_hm2_openpfs is not declared in this header,
 *		 so DMSG_HM2_OPENPFS can only be expanded where that
 *		 structure is visible -- confirm where it lives.
 */
#define DMSG_HM2_OPENPFS	DMSG_HM2(0x001, dmsg_hm2_openpfs)
/*
 * DMSG_PROTO_BLK Protocol
 *
 * BLK_OPEN	- Open device.  This transaction must be left open for the
 *		  duration and the returned keyid passed in all associated
 *		  BLK commands.  Multiple OPENs can be issued within the
 *		  transaction.
 *
 * BLK_CLOSE	- Close device.  This can be used to close one of the opens
 *		  within a BLK_OPEN transaction.  It may NOT initiate a
 *		  transaction.  Note that a termination of the transaction
 *		  (e.g. with LNK_ERROR or BLK_ERROR) closes all active OPENs
 *		  for that transaction.  XXX not well defined atm.
 *
 * BLK_READ	- Strategy read.  Not typically streaming.
 *
 * BLK_WRITE	- Strategy write.  Not typically streaming.
 *
 * BLK_FLUSH	- Strategy flush.  Not typically streaming.
 *
 * BLK_FREEBLKS	- Strategy freeblks.  Not typically streaming.
 *
 * BLK_CLOSE is sized by struct dmsg_blk_open; no separate close
 * structure is declared in this header.
 */
#define DMSG_BLK_OPEN		DMSG_BLK(0x001, dmsg_blk_open)
#define DMSG_BLK_CLOSE		DMSG_BLK(0x002, dmsg_blk_open)
#define DMSG_BLK_READ		DMSG_BLK(0x003, dmsg_blk_read)
#define DMSG_BLK_WRITE		DMSG_BLK(0x004, dmsg_blk_write)
#define DMSG_BLK_FLUSH		DMSG_BLK(0x005, dmsg_blk_flush)
#define DMSG_BLK_FREEBLKS	DMSG_BLK(0x006, dmsg_blk_freeblks)
#define DMSG_BLK_ERROR		DMSG_BLK(0xFFF, dmsg_blk_error)
597 struct dmsg_blk_open {
598 dmsg_hdr_t head;
599 uint32_t modes;
600 uint32_t reserved01;
603 #define DMSG_BLKOPEN_RD 0x0001
604 #define DMSG_BLKOPEN_WR 0x0002
607 * DMSG_LNK_ERROR is returned for simple results,
608 * DMSG_BLK_ERROR is returned for extended results.
610 struct dmsg_blk_error {
611 dmsg_hdr_t head;
612 uint64_t keyid;
613 uint32_t resid;
614 uint32_t reserved02;
615 char buf[64];
618 struct dmsg_blk_read {
619 dmsg_hdr_t head;
620 uint64_t keyid;
621 uint64_t offset;
622 uint32_t bytes;
623 uint32_t flags;
624 uint32_t reserved01;
625 uint32_t reserved02;
628 struct dmsg_blk_write {
629 dmsg_hdr_t head;
630 uint64_t keyid;
631 uint64_t offset;
632 uint32_t bytes;
633 uint32_t flags;
634 uint32_t reserved01;
635 uint32_t reserved02;
638 struct dmsg_blk_flush {
639 dmsg_hdr_t head;
640 uint64_t keyid;
641 uint64_t offset;
642 uint32_t bytes;
643 uint32_t flags;
644 uint32_t reserved01;
645 uint32_t reserved02;
648 struct dmsg_blk_freeblks {
649 dmsg_hdr_t head;
650 uint64_t keyid;
651 uint64_t offset;
652 uint32_t bytes;
653 uint32_t flags;
654 uint32_t reserved01;
655 uint32_t reserved02;
/* Typedef aliases for the BLK protocol message structures. */
typedef struct dmsg_blk_open		dmsg_blk_open_t;
typedef struct dmsg_blk_read		dmsg_blk_read_t;
typedef struct dmsg_blk_write		dmsg_blk_write_t;
typedef struct dmsg_blk_flush		dmsg_blk_flush_t;
typedef struct dmsg_blk_freeblks	dmsg_blk_freeblks_t;
typedef struct dmsg_blk_error		dmsg_blk_error_t;
/*
 * NOTE!!!! ALL EXTENDED HEADER STRUCTURES MUST BE 64-BYTE ALIGNED!!!
 *
 * General message errors
 *
 *	0x00 - 0x1F	Local iocomm errors
 *	0x20 - 0x2F	Global errors
 */
#define DMSG_ERR_NOSUPP		0x20
#define DMSG_ERR_LOSTLINK	0x21
#define DMSG_ERR_IO		0x22	/* generic */
#define DMSG_ERR_PARAM		0x23	/* generic */
#define DMSG_ERR_CANTCIRC	0x24	/* (typically means lost span) */
679 union dmsg_any {
680 char buf[DMSG_HDR_MAX];
681 dmsg_hdr_t head;
683 dmsg_lnk_conn_t lnk_conn;
684 dmsg_lnk_span_t lnk_span;
686 dmsg_blk_open_t blk_open;
687 dmsg_blk_error_t blk_error;
688 dmsg_blk_read_t blk_read;
689 dmsg_blk_write_t blk_write;
690 dmsg_blk_flush_t blk_flush;
691 dmsg_blk_freeblks_t blk_freeblks;
694 typedef union dmsg_any dmsg_any_t;
697 * Kernel iocom structures and prototypes for kern/kern_dmsg.c
699 #if defined(_KERNEL) || defined(_KERNEL_STRUCTURES)
/*
 * Forward declarations so the structures below can hold pointers to
 * these types without their full definitions being visible.
 */
struct hammer2_mount;
struct xa_softc;
struct kdmsg_iocom;
struct kdmsg_state;
struct kdmsg_msg;
struct kdmsg_data;
/*
 * msg_ctl flags (atomic)
 */
#define KDMSG_CLUSTERCTL_UNUSED01	0x00000001
#define KDMSG_CLUSTERCTL_KILLRX		0x00000002 /* staged helper exit */
#define KDMSG_CLUSTERCTL_KILLTX		0x00000004 /* staged helper exit */
#define KDMSG_CLUSTERCTL_SLEEPING	0x00000008 /* interlocked w/msglk */
717 * Transactional state structure, representing an open transaction. The
718 * transaction might represent a cache state (and thus have a chain
719 * association), or a VOP op, LNK_SPAN, or other things.
721 * NOTE: A non-empty subq represents one ref.
722 * If we are inserted on a parent's subq, that's one ref (SUBINSERTED).
723 * If we are inserted on a RB tree, that's one ref (RBINSERTED).
724 * msg->state represents a ref.
725 * Other code references may hold refs.
727 * NOTE: The parent association stays intact as long as a state has a
728 * non-empty subq. Otherwise simulated failures might not be able
729 * to reach the children.
731 TAILQ_HEAD(kdmsg_state_list, kdmsg_state);
733 struct kdmsg_state {
734 RB_ENTRY(kdmsg_state) rbnode; /* indexed by msgid */
735 struct kdmsg_state *scan; /* scan check */
736 struct kdmsg_state_list subq; /* active stacked states */
737 TAILQ_ENTRY(kdmsg_state) entry; /* on parent subq */
738 TAILQ_ENTRY(kdmsg_state) user_entry; /* available to devices */
739 struct kdmsg_iocom *iocom;
740 struct kdmsg_state *parent;
741 int refs; /* refs */
742 uint32_t icmd; /* record cmd creating state */
743 uint32_t txcmd; /* mostly for CMDF flags */
744 uint32_t rxcmd; /* mostly for CMDF flags */
745 uint64_t msgid; /* {parent,msgid} uniq */
746 int flags;
747 int error;
748 void *chain; /* (caller's state) */
749 int (*func)(struct kdmsg_state *, struct kdmsg_msg *);
750 union {
751 void *any;
752 struct hammer2_mount *hmp;
753 struct xa_softc *xa_sc;
754 } any;
/*
 * KDMSG_STATE_* flag bits.  Bits 0x0100 and 0x0200 are not assigned here.
 */
#define KDMSG_STATE_SUBINSERTED	0x0001
#define KDMSG_STATE_DYNAMIC	0x0002
#define KDMSG_STATE_UNUSED0004	0x0004
#define KDMSG_STATE_ABORTING	0x0008	/* avoids recursive abort */
#define KDMSG_STATE_OPPOSITE	0x0010	/* opposite direction */
#define KDMSG_STATE_DYING	0x0020	/* atomic recursive circ fail */
#define KDMSG_STATE_INTERLOCK	0x0040
#define KDMSG_STATE_RBINSERTED	0x0080
#define KDMSG_STATE_SIGNAL	0x0400
#define KDMSG_STATE_NEW		0x0800	/* defer abort processing */
768 struct kdmsg_msg {
769 TAILQ_ENTRY(kdmsg_msg) qentry; /* serialized queue */
770 struct kdmsg_state *state;
771 size_t hdr_size;
772 size_t aux_size;
773 char *aux_data;
774 uint32_t flags;
775 uint32_t tcmd; /* outer transaction cmd */
776 dmsg_any_t any; /* variable sized */
/*
 * Auxiliary data buffer (pointer and size) together with the iocom it
 * is associated with.
 */
struct kdmsg_data {
	char		*aux_data;
	size_t		aux_size;
	struct kdmsg_iocom *iocom;
};
#define KDMSG_FLAG_AUXALLOC	0x0001
/*
 * Typedef aliases for the kdmsg structures.
 *
 * NOTE(review): struct kdmsg_link is not otherwise declared in this
 *		 header, so kdmsg_link_t names an incomplete type here.
 */
typedef struct kdmsg_link kdmsg_link_t;
typedef struct kdmsg_state kdmsg_state_t;
typedef struct kdmsg_msg kdmsg_msg_t;
typedef struct kdmsg_data kdmsg_data_t;
792 struct kdmsg_state_tree;
793 int kdmsg_state_cmp(kdmsg_state_t *state1, kdmsg_state_t *state2);
794 RB_HEAD(kdmsg_state_tree, kdmsg_state);
795 RB_PROTOTYPE(kdmsg_state_tree, kdmsg_state, rbnode, kdmsg_state_cmp);
798 * Structure embedded in e.g. mount, master control structure for
799 * DMSG stream handling.
801 struct kdmsg_iocom {
802 struct malloc_type *mmsg;
803 struct file *msg_fp; /* cluster pipe->userland */
804 thread_t msgrd_td; /* cluster thread */
805 thread_t msgwr_td; /* cluster thread */
806 int msg_ctl; /* wakeup flags */
807 int msg_seq; /* cluster msg sequence id */
808 uint32_t flags;
809 struct lock msglk; /* lockmgr lock */
810 TAILQ_HEAD(, kdmsg_msg) msgq; /* transmit queue */
811 void *handle;
812 void (*auto_callback)(kdmsg_msg_t *);
813 int (*rcvmsg)(kdmsg_msg_t *);
814 void (*exit_func)(struct kdmsg_iocom *);
815 struct kdmsg_state state0; /* root state for stacking */
816 struct kdmsg_state *conn_state; /* active LNK_CONN state */
817 struct kdmsg_state *freerd_state; /* allocation cache */
818 struct kdmsg_state *freewr_state; /* allocation cache */
819 struct kdmsg_state_tree staterd_tree; /* active messages */
820 struct kdmsg_state_tree statewr_tree; /* active messages */
821 dmsg_lnk_conn_t auto_lnk_conn;
822 dmsg_lnk_span_t auto_lnk_span;
825 typedef struct kdmsg_iocom kdmsg_iocom_t;
/*
 * KDMSG_IOCOMF_* flag bits.  KDMSG_IOCOMF_AUTOANY is the union of the
 * three AUTO* bits.
 */
#define KDMSG_IOCOMF_AUTOCONN	0x0001	/* handle RX/TX LNK_CONN */
#define KDMSG_IOCOMF_AUTORXSPAN	0x0002	/* handle RX LNK_SPAN */
#define KDMSG_IOCOMF_AUTOTXSPAN	0x0008	/* handle TX LNK_SPAN */
#define KDMSG_IOCOMF_EXITNOACC	0x8000	/* cannot accept writes */

#define KDMSG_IOCOMF_AUTOANY	(KDMSG_IOCOMF_AUTOCONN |	\
				 KDMSG_IOCOMF_AUTORXSPAN |	\
				 KDMSG_IOCOMF_AUTOTXSPAN)
/*
 * CRC helpers: kdmsg_icrc32() takes a buffer and its size;
 * kdmsg_icrc32c() additionally takes a crc argument.
 */
uint32_t kdmsg_icrc32(const void *buf, size_t size);
uint32_t kdmsg_icrc32c(const void *buf, size_t size, uint32_t crc);
840 * kern_dmsg.c
842 void kdmsg_iocom_init(kdmsg_iocom_t *iocom, void *handle, u_int32_t flags,
843 struct malloc_type *mmsg,
844 int (*rcvmsg)(kdmsg_msg_t *msg));
845 void kdmsg_iocom_reconnect(kdmsg_iocom_t *iocom, struct file *fp,
846 const char *subsysname);
847 void kdmsg_iocom_autoinitiate(kdmsg_iocom_t *iocom,
848 void (*conn_callback)(kdmsg_msg_t *msg));
849 void kdmsg_iocom_uninit(kdmsg_iocom_t *iocom);
850 void kdmsg_drain_msgq(kdmsg_iocom_t *iocom);
852 void kdmsg_msg_free(kdmsg_msg_t *msg);
853 kdmsg_msg_t *kdmsg_msg_alloc(kdmsg_state_t *state, uint32_t cmd,
854 int (*func)(kdmsg_state_t *, kdmsg_msg_t *),
855 void *data);
856 void kdmsg_msg_write(kdmsg_msg_t *msg);
857 void kdmsg_msg_reply(kdmsg_msg_t *msg, uint32_t error);
858 void kdmsg_msg_result(kdmsg_msg_t *msg, uint32_t error);
859 void kdmsg_state_reply(kdmsg_state_t *state, uint32_t error);
860 void kdmsg_state_result(kdmsg_state_t *state, uint32_t error);
861 void kdmsg_detach_aux_data(kdmsg_msg_t *msg, kdmsg_data_t *data);
862 void kdmsg_free_aux_data(kdmsg_data_t *data);
864 #endif
866 #endif