- Test m_pkthdr.fw_flags against DUMMYNET_MBUF_TAGGED before trying to locate
[dragonfly/netmp.git] / sys / sys / journal.h
blob150c4a4330aa8127b2c38b5c64facbb44b1aea63
1 /*
2 * Copyright (c) 2004 The DragonFly Project. All rights reserved.
3 *
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
34 * $DragonFly: src/sys/sys/journal.h,v 1.13 2007/05/09 00:53:35 dillon Exp $
37 #ifndef _SYS_JOURNAL_H_
38 #define _SYS_JOURNAL_H_
40 #ifndef _SYS_TYPES_H_
41 #include <sys/types.h>
42 #endif
43 #ifndef _SYS_TIME_H_
44 #include <sys/time.h>
45 #endif
48 * Physical file format (binary)
50 * All raw records are 128-bit aligned, but all record sizes are actual.
51 * This means that any scanning code must 16-byte-align the recsize field
52 * when calculating skips. The top level raw record has a header and a
53 * trailer to allow both forwards and backwards scanning of the journal.
54 * The alignment requirement allows the worker thread FIFO reservation
55 * API to operate efficiently, among other things.
57 * Logical data stream records are usually no larger than the journal's
58 * in-memory FIFO, since the journal's transactional APIs return contiguous
59 * blocks of buffer space and since logical stream records are used to avoid
60 * stalls when concurrent blocking operations are being written to the journal.
61 * Programs can depend on a logical stream record being a 'reasonable' size.
63 * Multiple logical data streams may operate concurrently in the journal,
64 * reflecting the fact that the system may be executing multiple blocking
65 * operations on the filesystem all at the same time. These logical data
66 * streams are short-lived transactional entities which use a 13 bit id
67 * plus a transaction start bit, end bit, and abort bit.
69 * Stream identifiers in the 0x00-0xFF range are special and not used for
70 * normal transactional commands.
72 * Stream id 0x00 indicates that no other streams should be active at that
73 * point in the journal, which helps the journaling code detect corruption.
75 * Stream id 0x01 is used for pad. Pads are used to align data on convenient
76 * boundaries and to deal with dead space.
78 * Stream id 0x02 indicates a discontinuity in the streamed data and typically
79 * contains information relating to the reason for the discontinuity.
80 * JTYPE_ASSOCIATE and JTYPE_DISASSOCIATE are usually emplaced in stream 0x02.
82 * Stream id 0x03 may be used to annotate the journal with text comments
83 * via mountctl commands. This can be extremely useful to note situations
84 * that may help with later recovery or audit operations.
86 * Stream id 0x04-0x7F are reserved by DragonFly; 0x04 (JREC_STREAMID_ACK) and 0x05 (JREC_STREAMID_RESTART) are already assigned, 0x06-0x7F remain for future protocol expansion.
88 * Stream id 0x80-0xFF may be used for third-party protocol expansion.
90 * Stream id's 0x0100-0x1FFF typically represent short-lived transactions
91 * (i.e. an id may be reused once the previous use has completed). The
92 * journaling system runs through these id's sequentially which means that
93 * the journaling code can handle up to 8192-256 = 7936 simultaneous
94 * transactions at any given moment.
96 * The sequence number field is context-sensitive. It is typically used by
97 * a journaling stream to provide an incrementing counter and/or timestamp
98 * so recovery utilities can determine if any data is missing.
100 * The check word in the trailer may be used to provide an integrity check
101 * on the journaled data. A value of 0 always means that no check word
102 * has been calculated.
104 * The journal_rawrecbeg structure MUST be a multiple of 16 bytes.
105 * The journal_rawrecend structure MUST be a multiple of 8 bytes.
107 * NOTE: PAD RECORD SPECIAL CASE. Pad records can be 16 bytes and have the
108 * rawrecend structure overlaid on the sequence number field of the
109 * rawrecbeg structure. This is necessary because stream records are
110 * 16 byte aligned, not 24 byte aligned, and dead space is not allowed.
111 * So the pad record must fit into any dead space. THEREFORE, THE TRANSID
112 * FIELD FOR A PAD RECORD MUST BE IGNORED.
114 * NOTE: ENDIAN HANDLING. Data records can be in little or big endian form.
115 * The receiver detects the state by observing the 'begmagic' field. Each
116 * direction in a full-duplex connection can be operating with different
117 * endianness. Checksum data is always calculated on the raw record (including
118 * dead space) in a byte-stream fashion, and then converted to the transmit
119 * endianness like everything else. If the receiver's endianness is different
120 * it must convert it back to host normal form to compare it against the
121 * calculated checksum.
123 struct journal_rawrecbeg {
124 u_int16_t begmagic; /* recovery scan, endianness detection */
125 u_int16_t streamid; /* BEGIN/END/ABORTED control bits and stream identifier */
126 int32_t recsize; /* stream data block (includes beg & end) */
127 int64_t transid; /* sequence number or transaction id; must be ignored for pad records */
128 /* ADDITIONAL DATA */
131 struct journal_rawrecend {
132 u_int16_t endmagic; /* recovery scan, endianness detection */
133 u_int16_t check; /* check word, or 0 if no check word has been calculated */
134 int32_t recsize; /* same as rawrecbeg->recsize, for reverse scan */
137 struct journal_ackrecord {
138 struct journal_rawrecbeg rbeg; /* raw record header */
139 int32_t filler0; /* NOTE(review): no use visible in this header; presumably padding - confirm */
140 int32_t filler1; /* NOTE(review): same as filler0 - confirm */
141 struct journal_rawrecend rend; /* raw record trailer */
145 * Constants for stream record magic numbers. The incomplete magic
146 * number code is used internally by the memory FIFO reservation API
147 * and worker thread, allowing a block of space in the journaling
148 * stream (aka a stream block) to be reserved and then populated without
149 * stalling other threads doing their own reservation and population.
151 #define JREC_BEGMAGIC 0x1234 /* NOTE(review): presumably the value of rawrecbeg.begmagic - confirm */
152 #define JREC_ENDMAGIC 0xCDEF /* NOTE(review): presumably the value of rawrecend.endmagic - confirm */
153 #define JREC_INCOMPLETEMAGIC 0xFFFF /* used internally by the memory FIFO reservation API and worker thread */
156 * Stream ids are 13 bits (JREC_STREAMID_MASK). The top 3 bits specify when a new logical
157 * stream is being created, an existing logical stream is being terminated, or a stream has been aborted.
158 * A single raw stream record will set both the BEGIN and END bits if the
159 * entire transaction is encapsulated in a single stream record.
161 #define JREC_STREAMCTL_MASK 0xE000 /* covers the BEGIN, END and ABORTED bits */
162 #define JREC_STREAMCTL_BEGIN 0x8000 /* start a new logical stream */
163 #define JREC_STREAMCTL_END 0x4000 /* terminate a logical stream */
164 #define JREC_STREAMCTL_ABORTED 0x2000 /* set on the ending record of an aborted stream */
166 #define JREC_STREAMID_MASK 0x1FFF /* low 13 bits: the stream id itself */
167 #define JREC_STREAMID_SYNCPT (JREC_STREAMCTL_BEGIN|JREC_STREAMCTL_END|0x0000) /* id 0x00: no other streams should be active */
168 #define JREC_STREAMID_PAD (JREC_STREAMCTL_BEGIN|JREC_STREAMCTL_END|0x0001) /* id 0x01: pad / dead space */
169 #define JREC_STREAMID_DISCONT 0x0002 /* discontinuity */
170 #define JREC_STREAMID_ANNOTATE 0x0003 /* annotation */
171 #define JREC_STREAMID_ACK 0x0004 /* acknowledgement */
172 #define JREC_STREAMID_RESTART 0x0005 /* discontinuity - journal restart */
173 /* 0x0006-0x007F reserved by DragonFly */
174 /* 0x0080-0x00FF for third party use */
175 #define JREC_STREAMID_JMIN 0x0100 /* lowest allowed general id */
176 #define JREC_STREAMID_JMAX 0x2000 /* (one past the highest allowed id) */
178 #define JREC_DEFAULTSIZE 64 /* reasonable initial reservation */
179 #define JREC_MINRECSIZE 16 /* (after alignment) */
180 #define JREC_MAXRECSIZE (128*1024*1024) /* = 134217728; NOTE(review): presumably the upper bound on recsize - confirm */
183 * Each logical journaling stream typically represents a transaction...
184 * that is, a VFS operation. The VFS operation is written out using
185 * sub-records and may contain multiple, possibly nested sub-transactions.
186 * Multiple sub-transactions occur when a VFS operation cannot be represented
187 * by a single command. This is typically the case when a journal is
188 * configured to be reversible because UNDO sequences almost always have to
189 * be specified in such cases. For example, if you ftruncate() a file the
190 * journal might have to write out a sequence of WRITE records representing
191 * the lost data, otherwise the journal would not be reversible.
192 * Sub-transactions within a particular stream do not have their own sequence
193 * number field and thus may not be parallelized (the protocol is already
194 * complex enough!).
196 * In order to support streaming operation with a limited buffer the recsize
197 * field is allowed to be 0 for subrecords with the JMASK_NESTED bit set.
198 * If this case occurs a scanner can determine that the recursion has ended
199 * by detecting a nested subrecord with the JMASK_LAST bit set. A scanner
200 * may also set the field to the proper value after the fact to make later
201 * operations more efficient.
203 * Note that this bit must be properly set even if the recsize field is
204 * non-zero. The recsize must always be properly specified for 'leaf'
205 * subrecords, however in order to allow subsystems to potentially allocate
206 * more data space than they use the protocol allows any 'dead' space to be
207 * filled with JLEAF_PAD records.
209 * The recsize field may indicate data well past the size of the current
210 * raw stream record. That is, the scanner may have to glue together
211 * multiple stream records with the same stream id to fully decode the
212 * embedded subrecords. In particular, a subrecord could very well represent
213 * hundreds of megabytes of data (e.g. if a program were to do a
214 * multi-megabyte write()) and be split up across thousands of raw streaming
215 * records, possibly interlaced with other unrelated streams from other
216 * unrelated processes.
218 * If a large sub-transaction is aborted the logical stream may be
219 * terminated without writing out all the expected data. When this occurs
220 * the stream's ending record must also have the JREC_STREAMCTL_ABORTED bit
221 * set. However, scanners should still be robust enough to detect such
222 * overflows even if the aborted bit is not set and consider them data
223 * corruption.
225 * Aborts may also occur in the normal course of operations, especially once
226 * the journaling API is integrated into the cache coherency API. A normal
227 * abort is issued by emplacing a JLEAF_ABORT record within the transaction
228 * being aborted. Such records must be the last record in the sub-transaction,
229 * so JMASK_LAST is also usually set. In a transaction with many
230 * sub-transactions only those sub-transactions with an abort record are
231 * aborted, the rest remain valid. Abort records are considered S.O.P. for
232 * two reasons: First, limited memory buffer space may make it impossible
233 * to delete the portion of the stream being aborted (the data may have
234 * already been sent to the target). Second, the journaling code will
235 * eventually be used to support a cache coherency layer which may have to
236 * abort operations as part of the cache coherency protocol. Note that
237 * subrecord aborts are different from stream record aborts. Stream record
238 * aborts are considered to be extraordinary situations while subrecord aborts
239 * are S.O.P.
242 struct journal_subrecord {
243 u_int16_t rectype; /* 2 control bits (JMASK_NESTED, JMASK_LAST), 14 record type bits */
244 int16_t reserved; /* future use */
245 int32_t recsize; /* record size (mandatory if not NESTED; may be 0 when NESTED) */
246 /* ADDITIONAL DATA */
249 #define JMASK_NESTED 0x8000 /* data is a nested recursion */
250 #define JMASK_LAST 0x4000 /* lets a scanner detect that the recursion has ended */
251 #define JMASK_SUBRECORD 0x0400 /* bit set in every JLEAF_* value from 0x0401 to 0x0420 in this header */
252 #define JTYPE_MASK (~JMASK_LAST) /* every bit except JMASK_LAST */
254 #define JLEAF_PAD 0x0000 /* fills 'dead' space left by a subsystem */
255 #define JLEAF_ABORT 0x0001 /* normal abort of the sub-transaction it is placed in */
256 #define JTYPE_ASSOCIATE 0x0002 /* usually emplaced in stream 0x02 */
257 #define JTYPE_DISASSOCIATE 0x0003 /* usually emplaced in stream 0x02 */
258 #define JTYPE_UNDO (JMASK_NESTED|0x0004)
259 #define JTYPE_AUDIT (JMASK_NESTED|0x0005)
260 #define JTYPE_REDO (JMASK_NESTED|0x0006)
262 #define JTYPE_SETATTR (JMASK_NESTED|0x0010)
263 #define JTYPE_WRITE (JMASK_NESTED|0x0011)
264 #define JTYPE_PUTPAGES (JMASK_NESTED|0x0012)
265 #define JTYPE_SETACL (JMASK_NESTED|0x0013)
266 #define JTYPE_SETEXTATTR (JMASK_NESTED|0x0014)
267 #define JTYPE_CREATE (JMASK_NESTED|0x0015)
268 #define JTYPE_MKNOD (JMASK_NESTED|0x0016)
269 #define JTYPE_LINK (JMASK_NESTED|0x0017)
270 #define JTYPE_SYMLINK (JMASK_NESTED|0x0018)
271 #define JTYPE_WHITEOUT (JMASK_NESTED|0x0019)
272 #define JTYPE_REMOVE (JMASK_NESTED|0x001A)
273 #define JTYPE_MKDIR (JMASK_NESTED|0x001B)
274 #define JTYPE_RMDIR (JMASK_NESTED|0x001C)
275 #define JTYPE_RENAME (JMASK_NESTED|0x001D)
277 #define JTYPE_VATTR (JMASK_NESTED|0x0100)
278 #define JTYPE_CRED (JMASK_NESTED|0x0101)
281 * Low level record types
283 #define JLEAF_FILEDATA 0x0401 /* first of the 0x04xx values; all carry the JMASK_SUBRECORD bit */
284 #define JLEAF_PATH1 0x0402
285 #define JLEAF_PATH2 0x0403
286 #define JLEAF_PATH3 0x0404
287 #define JLEAF_PATH4 0x0405
288 #define JLEAF_UID 0x0406
289 #define JLEAF_GID 0x0407
290 #define JLEAF_MODES 0x0408
291 #define JLEAF_FFLAGS 0x0409
292 #define JLEAF_PID 0x040A
293 #define JLEAF_PPID 0x040B
294 #define JLEAF_COMM 0x040C
295 #define JLEAF_ATTRNAME 0x040D
296 #define JLEAF_PATH_REF 0x040E
297 #define JLEAF_RESERVED_0F 0x040F /* reserved slot; no meaning is assigned in this header */
298 #define JLEAF_SYMLINKDATA 0x0410
299 #define JLEAF_SEEKPOS 0x0411
300 #define JLEAF_INUM 0x0412
301 #define JLEAF_NLINK 0x0413
302 #define JLEAF_FSID 0x0414
303 #define JLEAF_SIZE 0x0415
304 #define JLEAF_ATIME 0x0416
305 #define JLEAF_MTIME 0x0417
306 #define JLEAF_CTIME 0x0418
307 #define JLEAF_GEN 0x0419
308 #define JLEAF_FLAGS 0x041A
309 #define JLEAF_UDEV 0x041B
310 #define JLEAF_FILEREV 0x041C
311 #define JLEAF_VTYPE 0x041D
312 #define JLEAF_ERROR 0x041E
313 #define JLEAF_UMAJOR 0x041F
314 #define JLEAF_UMINOR 0x0420 /* highest leaf type defined in this header */
317 * Low level journal data file structures
319 * NOTE: embedded strings may use the full width of the field and thus
320 * may not be 0-terminated.
322 struct jleaf_path {
323 char path[4]; /* path from base of mount point */
324 /* path is variable length and 0-terminated; the [4] is only the declared size */
327 struct jleaf_vattr {
328 int32_t modes;
329 int32_t fflags;
330 struct timespec atime; /* NOTE(review): struct timespec width is platform-dependent, unlike the fixed-width fields here - confirm this is intended */
331 struct timespec mtime; /* NOTE(review): same concern as atime */
332 struct timespec ctime; /* NOTE(review): same concern as atime */
333 int64_t inum;
336 struct jleaf_cred {
337 int32_t uid;
338 int32_t gid;
339 int32_t pid;
340 int32_t flags; /* suid/sgid and other flags */
341 char line[8]; /* ttyname or other session identification; may not be 0-terminated */
342 char comm[8]; /* simplified command name for reference; may not be 0-terminated */
345 struct jleaf_ioinfo {
346 int64_t offset; /* NOTE(review): presumably a file offset for an I/O record - confirm */
349 #endif