4 * As should be obvious for Linux kernel code, license is GPLv2
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
8 * Public header for logfs.
10 #ifndef FS_LOGFS_LOGFS_ABI_H
11 #define FS_LOGFS_LOGFS_ABI_H
13 /* For out-of-kernel compiles */
15 #define BUILD_BUG_ON(condition) /**/
18 #define SIZE_CHECK(type, size) \
19 static inline void check_##type(void) \
21 BUILD_BUG_ON(sizeof(struct type) != (size)); \
25 * Throughout the logfs code, we're constantly dealing with blocks at
26 * various positions or offsets. To remove confusion, we strictly
27 * distinguish between a "position" - the logical position within a
28 * file and an "offset" - the physical location within the device.
30 * Any usage of the term offset for a logical location or position for
31 * a physical one is a bug and should get fixed.
35 * Blocks are allocated in one of several segments depending on their
36 * level. The following levels are used:
37 * 0 - regular data block
38 * 1 - i1 indirect blocks
39 * 2 - i2 indirect blocks
40 * 3 - i3 indirect blocks
41 * 4 - i4 indirect blocks
42 * 5 - i5 indirect blocks
43 * 6 - ifile data blocks
44 * 7 - ifile i1 indirect blocks
45 * 8 - ifile i2 indirect blocks
46 * 9 - ifile i3 indirect blocks
47 * 10 - ifile i4 indirect blocks
48 * 11 - ifile i5 indirect blocks
49 * Potential levels to be used in the future:
50 * 12 - gc recycled blocks, long-lived data
51 * 13 - replacement blocks, short-lived data
53 * Levels 1-11 are necessary for robust gc operations and help separate
54 * short-lived metadata from longer-lived file data. In the future,
55 * file data should get separated into several segments based on simple
56 * heuristics. Old data recycled during gc operation is expected to be
57 * long-lived. New data is of uncertain life expectancy. New data
58 * used to replace older blocks in existing files is expected to be
63 /* Magic numbers. 64bit for superblock, 32bit for statfs f_type */
64 #define LOGFS_MAGIC 0x7a3a8e5cb9d5bf67ull
65 #define LOGFS_MAGIC_U32 0xc97e8168u
68 * Various blocksize related macros. Blocksize is currently fixed at 4KiB.
69 * Sooner or later that should become configurable and the macros replaced
70 * by something superblock-dependent. Pointers in indirect blocks are and
73 * LOGFS_BLOCKSIZE - self-explaining
74 * LOGFS_BLOCK_FACTOR - number of pointers per indirect block
75 * LOGFS_BLOCK_BITS - log2 of LOGFS_BLOCK_FACTOR, used for shifts
77 #define LOGFS_BLOCKSIZE (4096ull)
78 #define LOGFS_BLOCK_FACTOR (LOGFS_BLOCKSIZE / sizeof(u64))
79 #define LOGFS_BLOCK_BITS (9)
82 * Number of blocks at various levels of indirection. There are 16 direct
83 * block pointers plus a single indirect pointer.
85 #define I0_BLOCKS (16)
86 #define I1_BLOCKS LOGFS_BLOCK_FACTOR
87 #define I2_BLOCKS (LOGFS_BLOCK_FACTOR * I1_BLOCKS)
88 #define I3_BLOCKS (LOGFS_BLOCK_FACTOR * I2_BLOCKS)
89 #define I4_BLOCKS (LOGFS_BLOCK_FACTOR * I3_BLOCKS)
90 #define I5_BLOCKS (LOGFS_BLOCK_FACTOR * I4_BLOCKS)
92 #define INDIRECT_INDEX I0_BLOCKS
93 #define LOGFS_EMBEDDED_FIELDS (I0_BLOCKS + 1)
96 * Sizes at which files require another level of indirection. Files smaller
97 * than LOGFS_EMBEDDED_SIZE can be completely stored in the inode itself,
98 * similar to ext2 fast symlinks.
100 * Data at a position smaller than LOGFS_I0_SIZE is accessed through the
101 * direct pointers, else through the 1x indirect pointer and so forth.
103 #define LOGFS_EMBEDDED_SIZE (LOGFS_EMBEDDED_FIELDS * sizeof(u64))
104 #define LOGFS_I0_SIZE (I0_BLOCKS * LOGFS_BLOCKSIZE)
105 #define LOGFS_I1_SIZE (I1_BLOCKS * LOGFS_BLOCKSIZE)
106 #define LOGFS_I2_SIZE (I2_BLOCKS * LOGFS_BLOCKSIZE)
107 #define LOGFS_I3_SIZE (I3_BLOCKS * LOGFS_BLOCKSIZE)
108 #define LOGFS_I4_SIZE (I4_BLOCKS * LOGFS_BLOCKSIZE)
109 #define LOGFS_I5_SIZE (I5_BLOCKS * LOGFS_BLOCKSIZE)
112 * Each indirect block pointer must have this flag set, if all block pointers
113 * behind it are set, i.e. there is no hole hidden in the shadow of this
114 * indirect block pointer.
/*
 * LOGFS_FULLY_POPULATED is bit 63 of a 64bit value.  pure_ofs() clears
 * that bit and yields the remaining bits of its argument.
 *
 * The macro argument is parenthesized so that an argument built from
 * operators binding weaker than '&' (e.g. pure_ofs(a | b)) is masked
 * as a whole instead of only its last operand.
 */
#define LOGFS_FULLY_POPULATED (1ULL << 63)
#define pure_ofs(ofs) ((ofs) & ~LOGFS_FULLY_POPULATED)
120 * LogFS needs to separate data into levels. Each level is defined as the
121 * maximal possible distance from the master inode (inode of the inode file).
122 * Data blocks reside on level 0, 1x indirect block on level 1, etc.
123 * Inodes reside on level 6, indirect blocks for the inode file on levels 7-11.
124 * This effort is necessary to guarantee garbage collection to always make
127 * LOGFS_MAX_INDIRECT is the maximal indirection through indirect blocks,
128 * LOGFS_MAX_LEVELS is one more for the actual data level of a file. It is
129 * the maximal number of levels for one file.
130 * LOGFS_NO_AREAS is twice that, as the inode file and regular files are
131 * effectively stacked on top of each other.
133 #define LOGFS_MAX_INDIRECT (5)
134 #define LOGFS_MAX_LEVELS (LOGFS_MAX_INDIRECT + 1)
135 #define LOGFS_NO_AREAS (2 * LOGFS_MAX_LEVELS)
137 /* Maximum size of filenames */
138 #define LOGFS_MAX_NAMELEN (255)
140 /* Number of segments in the primary journal. */
141 #define LOGFS_JOURNAL_SEGS (16)
143 /* Maximum number of free/erased/etc. segments in journal entries */
144 #define MAX_CACHED_SEGS (64)
148 * LOGFS_OBJECT_HEADERSIZE is the size of a single header in the object store,
149 * LOGFS_MAX_OBJECTSIZE the size of the largest possible object, including
151 * LOGFS_SEGMENT_RESERVE is the amount of space reserved for each segment for
152 * its segment header and the padded space at the end when no further objects
155 #define LOGFS_OBJECT_HEADERSIZE (0x1c)
156 #define LOGFS_SEGMENT_HEADERSIZE (0x18)
157 #define LOGFS_MAX_OBJECTSIZE (LOGFS_OBJECT_HEADERSIZE + LOGFS_BLOCKSIZE)
158 #define LOGFS_SEGMENT_RESERVE \
159 (LOGFS_SEGMENT_HEADERSIZE + LOGFS_MAX_OBJECTSIZE - 1)
163 * SEG_SUPER - segment holding a superblock
164 * SEG_JOURNAL - segment belonging to the journal
165 * SEG_OSTORE - segment belonging to the object store
174 * struct logfs_segment_header - per-segment header in the ostore
176 * @crc: crc32 of header (there is no data)
177 * @pad: unused, must be 0
178 * @type: segment type, see above
179 * @level: GC level for all objects in this segment
180 * @segno: segment number
181 * @ec: erase count for this segment
182 * @gec: global erase count at time of writing
184 struct logfs_segment_header
{
194 SIZE_CHECK(logfs_segment_header
, LOGFS_SEGMENT_HEADERSIZE
);
196 #define LOGFS_FEATURES_INCOMPAT (0ull)
197 #define LOGFS_FEATURES_RO_COMPAT (0ull)
198 #define LOGFS_FEATURES_COMPAT (0ull)
201 * struct logfs_disk_super - on-medium superblock
203 * @ds_magic: magic number, must equal LOGFS_MAGIC
204 * @ds_crc: crc32 of structure starting with the next field
205 * @ds_ifile_levels: maximum number of levels for ifile
206 * @ds_iblock_levels: maximum number of levels for regular files
207 * @ds_data_levels: number of separate levels for data
208 * @pad0: reserved, must be 0
209 * @ds_feature_incompat: incompatible filesystem features
210 * @ds_feature_ro_compat: read-only compatible filesystem features
211 * @ds_feature_compat: compatible filesystem features
213 * @ds_segment_shift: log2 of segment size
214 * @ds_block_shift: log2 of block size
215 * @ds_write_shift: log2 of write size
216 * @pad1: reserved, must be 0
217 * @ds_journal_seg: segments used by primary journal
218 * @ds_root_reserve: bytes reserved for the superuser
219 * @ds_speed_reserve: bytes reserved to speed up GC
220 * @ds_bad_seg_reserve: number of segments reserved to handle bad blocks
221 * @pad2: reserved, must be 0
222 * @pad3: reserved, must be 0
224 * Contains only read-only fields. Read-write fields like the amount of used
225 * space is tracked in the dynamic superblock, which is stored in the journal.
227 struct logfs_disk_super
{
228 struct logfs_segment_header ds_sh
;
232 __u8 ds_ifile_levels
;
233 __u8 ds_iblock_levels
;
235 __u8 ds_segment_shift
;
240 __be64 ds_filesystem_size
;
241 __be32 ds_segment_size
;
242 __be32 ds_bad_seg_reserve
;
244 __be64 ds_feature_incompat
;
245 __be64 ds_feature_ro_compat
;
247 __be64 ds_feature_compat
;
248 __be64 ds_feature_flags
;
250 __be64 ds_root_reserve
;
251 __be64 ds_speed_reserve
;
253 __be32 ds_journal_seg
[LOGFS_JOURNAL_SEGS
];
255 __be64 ds_super_ofs
[2];
259 SIZE_CHECK(logfs_disk_super
, 256);
263 * OBJ_BLOCK - Data or indirect block
265 * OBJ_DENTRY - Dentry
274 * struct logfs_object_header - per-object header in the ostore
276 * @crc: crc32 of header, excluding data_crc
277 * @len: length of data
278 * @type: object type, see above
279 * @compr: compression type
282 * @data_crc: crc32 of payload
284 struct logfs_object_header
{
292 } __attribute__((packed
));
294 SIZE_CHECK(logfs_object_header
, LOGFS_OBJECT_HEADERSIZE
);
297 * Reserved inode numbers:
298 * LOGFS_INO_MASTER - master inode (for inode file)
299 * LOGFS_INO_ROOT - root directory
300 * LOGFS_INO_SEGFILE - per-segment used bytes and erase count
303 LOGFS_INO_MAPPING
= 0x00,
304 LOGFS_INO_MASTER
= 0x01,
305 LOGFS_INO_ROOT
= 0x02,
306 LOGFS_INO_SEGFILE
= 0x03,
307 LOGFS_RESERVED_INOS
= 0x10,
311 * Inode flags. High bits should never be written to the medium. They are
312 * reserved for in-memory usage.
313 * Low bits should either remain in sync with the corresponding FS_*_FL or
314 * reuse slots that obviously don't make sense for logfs.
316 * LOGFS_IF_DIRTY Inode must be written back
317 * LOGFS_IF_ZOMBIE Inode has been deleted
318 * LOGFS_IF_STILLBORN -ENOSPC happened when creating inode
320 #define LOGFS_IF_COMPRESSED 0x00000004 /* == FS_COMPR_FL */
321 #define LOGFS_IF_DIRTY 0x20000000
322 #define LOGFS_IF_ZOMBIE 0x40000000
323 #define LOGFS_IF_STILLBORN 0x80000000
325 /* Flags available to chattr */
326 #define LOGFS_FL_USER_VISIBLE (LOGFS_IF_COMPRESSED)
327 #define LOGFS_FL_USER_MODIFIABLE (LOGFS_IF_COMPRESSED)
328 /* Flags inherited from parent directory on file/directory creation */
329 #define LOGFS_FL_INHERITED (LOGFS_IF_COMPRESSED)
332 * struct logfs_disk_inode - on-medium inode
334 * @di_mode: file mode
335 * @di_pad: reserved, must be 0
336 * @di_flags: inode flags, see above
339 * @di_ctime: change time
340 * @di_mtime: modify time
341 * @di_refcount: reference count (aka nlink or link count)
342 * @di_generation: inode generation, for nfs
343 * @di_used_bytes: number of bytes used
344 * @di_size: file size
345 * @di_data: data pointers
347 struct logfs_disk_inode
{
360 __be32 di_generation
;
362 __be64 di_used_bytes
;
365 __be64 di_data
[LOGFS_EMBEDDED_FIELDS
];
368 SIZE_CHECK(logfs_disk_inode
, 200);
/*
 * Positions of selected struct logfs_disk_inode fields, expressed in units
 * of sizeof(__be64) rather than bytes: each value is the field's byte offset
 * (offsetof) divided by sizeof(__be64).  INODE_HEIGHT_OFS is the constant 0.
 */
370 #define INODE_POINTER_OFS \
371 (offsetof(struct logfs_disk_inode, di_data) / sizeof(__be64))
372 #define INODE_USED_OFS \
373 (offsetof(struct logfs_disk_inode, di_used_bytes) / sizeof(__be64))
374 #define INODE_SIZE_OFS \
375 (offsetof(struct logfs_disk_inode, di_size) / sizeof(__be64))
376 #define INODE_HEIGHT_OFS (0)
379 * struct logfs_disk_dentry - on-medium dentry structure
382 * @namelen: length of file name
383 * @type: file type, identical to bits 12..15 of mode
386 /* FIXME: add 6 bytes of padding to remove the __packed */
387 struct logfs_disk_dentry
{
391 __u8 name
[LOGFS_MAX_NAMELEN
];
392 } __attribute__((packed
));
394 SIZE_CHECK(logfs_disk_dentry
, 266);
396 #define RESERVED 0xffffffff
397 #define BADSEG 0xffffffff
399 * struct logfs_segment_entry - segment file entry
401 * @ec_level: erase count and level
402 * @valid: number of valid bytes
404 * Segment file contains one entry for every segment. ec_level contains the
405 * erasecount in the upper 28 bits and the level in the lower 4 bits. An
406 * ec_level of BADSEG (-1) identifies bad segments. valid contains the number
407 * of valid bytes or RESERVED (-1 again) if the segment is used for either the
408 * superblock or the journal, or when the segment is bad.
410 struct logfs_segment_entry
{
415 SIZE_CHECK(logfs_segment_entry
, 8);
418 * struct logfs_journal_header - header for journal entries (JEs)
420 * @h_crc: crc32 of journal entry
421 * @h_len: length of compressed journal entry,
422 * not including header
423 * @h_datalen: length of uncompressed data
425 * @h_compr: compression type
428 struct logfs_journal_header
{
437 SIZE_CHECK(logfs_journal_header
, 16);
440 * Life expectancy of data.
441 * VIM_DEFAULT - default vim
442 * VIM_SEGFILE - for segment file only - very short-living
443 * VIM_GC - GC'd data - likely long-living
451 * struct logfs_je_area - wbuf header
453 * @segno: segment number of area
454 * @used_bytes: number of bytes already used
455 * @gc_level: GC level
456 * @vim: life expectancy of data
458 * "Areas" are segments currently being used for writing. There is at least
459 * one area per GC level. Several may be used to separate long-living from
460 * short-living data. If an area with unknown vim is encountered, it can
462 * The write buffer immediately follows this header.
464 struct logfs_je_area
{
469 } __attribute__((packed
));
471 SIZE_CHECK(logfs_je_area
, 10);
/*
 * Combined size in bytes of struct logfs_journal_header and
 * struct logfs_je_area.
 */
473 #define MAX_JOURNAL_HEADER \
474 (sizeof(struct logfs_journal_header) + sizeof(struct logfs_je_area))
477 * struct logfs_je_dynsb - dynamic superblock
479 * @ds_gec: global erase count
480 * @ds_sweeper: current position of GC "sweeper"
481 * @ds_rename_dir: source directory ino (see dir.c documentation)
482 * @ds_rename_pos: position of source dd (see dir.c documentation)
483 * @ds_victim_ino: victims of incomplete dir operation (see dir.c)
484 * @ds_victim_parent: parent inode of victim (see dir.c)
485 * @ds_used_bytes: number of used bytes
487 struct logfs_je_dynsb
{
491 __be64 ds_rename_dir
;
492 __be64 ds_rename_pos
;
494 __be64 ds_victim_ino
;
495 __be64 ds_victim_parent
; /* XXX */
497 __be64 ds_used_bytes
;
498 __be32 ds_generation
;
502 SIZE_CHECK(logfs_je_dynsb
, 64);
505 * struct logfs_je_anchor - anchor of filesystem tree, aka master inode
507 * @da_size: size of inode file
508 * @da_last_ino: last created inode
509 * @da_used_bytes: number of bytes used
510 * @da_data: data pointers
512 struct logfs_je_anchor
{
516 __be64 da_used_bytes
;
520 __be64 da_data
[LOGFS_EMBEDDED_FIELDS
];
523 SIZE_CHECK(logfs_je_anchor
, 168);
526 * struct logfs_je_spillout - spillout entry (from 1st to 2nd journal)
528 * @so_segment: segments used for 2nd journal
530 * Length of the array is given by h_len field in the header.
532 struct logfs_je_spillout
{
533 __be64 so_segment
[0];
536 SIZE_CHECK(logfs_je_spillout
, 0);
539 * struct logfs_je_journal_ec - erase counts for all journal segments
543 * Length of the array is given by h_len field in the header.
545 struct logfs_je_journal_ec
{
549 SIZE_CHECK(logfs_je_journal_ec
, 0);
552 * struct logfs_je_free_segments - list of free segments with erase count
554 struct logfs_je_free_segments
{
559 SIZE_CHECK(logfs_je_free_segments
, 8);
562 * struct logfs_seg_alias - list of segment aliases
564 struct logfs_seg_alias
{
569 SIZE_CHECK(logfs_seg_alias
, 8);
572 * struct logfs_obj_alias - list of object aliases
574 struct logfs_obj_alias
{
583 SIZE_CHECK(logfs_obj_alias
, 32);
588 * COMPR_NONE - uncompressed
589 * COMPR_ZLIB - compressed with zlib
597 * Journal entries come in groups of 16. First group contains unique
598 * entries, next groups contain one entry per level
600 * JE_FIRST - smallest possible journal entry number
602 * JEG_BASE - base group, containing unique entries
603 * JE_COMMIT - commit entry, validates all previous entries
604 * JE_DYNSB - dynamic superblock, anything that ought to be in the
605 * superblock but cannot because it is read-write data
606 * JE_ANCHOR - anchor aka master inode aka inode file's inode
607 * JE_ERASECOUNT - erasecounts for all journal segments
608 * JE_SPILLOUT - unused
609 * JE_SEG_ALIAS - aliases segments
610 * JE_AREA - area description
612 * JE_LAST - largest possible journal entry number
621 JE_ERASECOUNT
= 0x05,