Merge tag 'v2.4.0-rc3'
[qemu/ar7.git] / block / fvd.h
blob64f57a3c6554ae123759e3327449d3e0afbcf8f8
1 /*
2 * Copyright (c) 2010-2011 IBM
4 * Authors:
5 * Chunqiang Tang <ctang@us.ibm.com>
7 * This work is licensed under the terms of the GNU GPL, version 2.
8 * See the COPYING file in the top-level directory.
9 */
11 /*=============================================================================
12 * A short description: this is the header of the FVD block device driver.
13 *============================================================================*/
15 #include <sys/vfs.h>
16 #include <sys/mman.h>
17 #include <pthread.h>
18 #include <execinfo.h>
19 #include <sys/ioctl.h>
20 #include "qemu-common.h"
21 #include "block/block_int.h"
22 #include "qemu/option.h"
23 #include "qemu/osdep.h"
24 #include "qemu/queue.h"
25 #include "qemu/timer.h"
26 #include "block/block.h"
27 #include "block/blksim.h"
28 #include "block/fvd-ext.h"
/* Magic number identifying an FVD image file, and the on-disk format version
 * understood by this driver. */
#define FVD_MAGIC (('Q' << 24) | ('C' << 16) | (0xF5 << 8) | 0xA9)
#define FVD_VERSION 1

/* Defined elsewhere. NOTE(review): presumably TRUE when running inside a
 * qemu tool (e.g. qemu-img) rather than the full VM — confirm at the
 * definition site. */
extern bool in_qemu_tool;
/* Profile-directed prefetch. (to be implemented).
 * One entry describes a disk region to prefetch. Packed: entries are stored
 * on disk (see FvdHeader.prefetch_profile_offset). */
typedef struct __attribute__ ((__packed__)) PrefetchProfileEntry {
    int64_t offset;             /* in bytes */

    /* In the unit of FvdHeader.unit_of_PrefetchProfileEntry_len, i.e.,
     * len_in_bytes = len * FvdHeader.unit_of_PrefetchProfileEntry_len. */
    uint32_t len;
} PrefetchProfileEntry;
/*
 * The FVD format consists of:
 *  + Header fields of FvdHeader.
 *  + Bitmap, starting on a 4KB page boundary at a location specified by
 *    FvdHeader.bitmap_offset.
 *  + Table, starting on a 4KB page boundary at a location specified by
 *    FvdHeader.table_offset.
 *  + Journal, starting on a 4KB page boundary at a location specified by
 *    FvdHeader.journal_offset.
 *  + Prefetch profile entries, starting on a 4KB page boundary at a location
 *    specified by FvdHeader.prefetch_profile_offset. (to be implemented)
 *  + Virtual disk data, starting on a 4KB page boundary. Optionally, disk
 *    data can be stored in a separate data file specified by
 *    FvdHeader.data_file.
 */
typedef struct __attribute__ ((__packed__)) FvdHeader {
    uint32_t magic;             /* FVD_MAGIC */
    uint32_t version;           /* FVD_VERSION */

    /* This field is set to TRUE after whole-image prefetching finishes. */
    int32_t all_data_in_fvd_img;

    int64_t virtual_disk_size;  /* in bytes. Disk size perceived by the VM. */
    int64_t metadata_size;      /* in bytes. */
    char base_img[1024];        /* Backing image file name. */
    char base_img_fmt[16];      /* Backing image format name. */
    int64_t base_img_size;      /* in bytes. */
    int64_t bitmap_offset;      /* in bytes. Aligned on DEF_PAGE_SIZE. */
    int64_t bitmap_size;        /* in bytes. Rounded up to DEF_PAGE_SIZE */
    int32_t block_size;         /* in bytes. */
    int32_t copy_on_read;       /* TRUE or FALSE */
    int64_t max_outstanding_copy_on_read_data;  /* in bytes. */

    /* If (data_file[0]==0), the FVD metadata and data are stored in one file.*/
    char data_file[1024];
    char data_file_fmt[16];

    /******** Begin: for prefetching. *******************************/
    /* in seconds. -1 means disable whole image prefetching. */
    int32_t prefetch_start_delay;

    /* in bytes. Aligned on DEF_PAGE_SIZE. (to be implemented) */
    int64_t prefetch_profile_offset;

    /* Number of PrefetchProfileEntry. (to be implemented) */
    int64_t prefetch_profile_entries;

    int32_t num_prefetch_slots; /* Max number of outstanding prefetch writes. */
    int32_t bytes_per_prefetch; /* For whole image prefetching. */
    int32_t prefetch_read_throughput_measure_time;  /* in milliseconds. */
    int32_t prefetch_write_throughput_measure_time; /* in milliseconds. */

    /* Controls the calculation of the moving average of throughput. Must be a
     * value between [0,100].
     * actual_normalized_alpha = prefetch_perf_calc_alpha / 100.0 */
    int32_t prefetch_perf_calc_alpha;

    int32_t prefetch_min_read_throughput;   /* in KB/second. */
    int32_t prefetch_min_write_throughput;  /* in KB/second. */
    int32_t prefetch_max_read_throughput;   /* in KB/second. */
    int32_t prefetch_max_write_throughput;  /* in KB/second. */

    /* in milliseconds. When prefetch read/write throughput is low, prefetch
     * pauses for a random time uniformly distributed in
     * [0, prefetch_throttle_time]. */
    int32_t prefetch_throttle_time;
    /******** End: for prefetching. *******************************/

    /******** Begin: for compact image. *****************************/
    int32_t compact_image;      /* TRUE or FALSE */
    int64_t table_offset;       /* in bytes. */
    int64_t chunk_size;         /* in bytes. */
    int64_t storage_grow_unit;  /* in bytes. */
    char add_storage_cmd[2048];
    /******** End: for compact image. *******************************/

    /******** Begin: for journal. ***********************************/
    int64_t journal_offset;     /* in bytes. */
    int64_t journal_size;       /* in bytes. */
    int32_t clean_shutdown;     /* TRUE if VM's last shutdown was graceful. */
    /******** End: for journal. *************************************/

    /*
     * This field is TRUE if the image mandates that the storage layer
     * (BDRVFvdState.fvd_data) must return TRUE for bdrv_has_zero_init().
     * This is the case if the optimization described in Section 3.3.3 of the
     * FVD-cow paper is enabled (see function search_holes()). If 'qemu-img
     * create' sets need_zero_init to TRUE, 'qemu-img update' can be used to
     * manually reset it to FALSE, if the user always manually pre-fills the
     * storage (e.g., a raw partition) with zeros. If the image is stored on a
     * file system, it already supports zero_init, and hence there is no need
     * to manually manipulate this field.
     */
    int32_t need_zero_init;

    /* If TRUE, FVD dumps a prefetch profile after the VM shuts down.
     * (to be implemented) */
    int32_t generate_prefetch_profile;

    /* See the comment on PrefetchProfileEntry.len. (to be implemented) */
    int32_t unit_of_PrefetchProfileEntry_len;

    /* in seconds. -1 means disable profile-directed prefetching.
     * (to be implemented) */
    int32_t profile_directed_prefetch_start_delay;

    /* Possible values are "no", "writethrough", "writeback", or
     * "writenocache". (to be implemented) */
    char write_updates_base_img[16];
} FvdHeader;
/* In-memory state of one open FVD image (the driver's per-BlockDriverState
 * opaque struct). Note: some offsets below are kept in sectors although the
 * corresponding FvdHeader fields are in bytes. */
typedef struct BDRVFvdState {
    BlockDriverState *fvd_metadata;
    BlockDriverState *fvd_data;
    int64_t virtual_disk_size;  /* in bytes. */
    int64_t bitmap_offset;      /* in sectors */
    int64_t bitmap_size;        /* in bytes. */
    int64_t data_offset;        /* in sectors. Begin of real data. */
    int64_t nb_sectors_in_base_img;
    int32_t block_size;         /* in sectors. */
    int copy_on_read;           /* TRUE or FALSE */
    int64_t max_outstanding_copy_on_read_data;  /* in bytes. */
    int64_t outstanding_copy_on_read_data;      /* in bytes. */
    int data_region_prepared;   /* TRUE or FALSE */
    QLIST_HEAD(WriteLocks, FvdAIOCB) write_locks;       /* All writes. */
    QLIST_HEAD(CopyLocks, FvdAIOCB) copy_locks; /* copy-on-read and CoW. */

    /* Keep two copies of bitmap to reduce the overhead of updating the
     * on-disk bitmap, i.e., copy-on-read and prefetching do not update the
     * on-disk bitmap. See Section 3.3.4 of the FVD-cow paper. */
    uint8_t *fresh_bitmap;
    uint8_t *stale_bitmap;

    /******** Begin: for prefetching. ***********************************/
    struct FvdAIOCB **prefetch_acb;
    int prefetch_state;         /* PREFETCH_STATE_RUNNING, FINISHED, or DISABLED. */
    int prefetch_error;         /* TRUE or FALSE */
    int num_prefetch_slots;
    int num_filled_prefetch_slots;
    int next_prefetch_read_slot;
    int prefetch_read_active;   /* TRUE or FALSE */
    int pause_prefetch_requested;       /* TRUE or FALSE */
    int prefetch_start_delay;   /* in seconds */
    int64_t unclaimed_prefetch_region_start;
    int64_t prefetch_read_time; /* in milliseconds. */
    int64_t prefetch_write_time;        /* in milliseconds. */
    int64_t prefetch_data_read; /* in bytes. */
    int64_t prefetch_data_written;      /* in bytes. */
    double prefetch_read_throughput;    /* in bytes/millisecond. */
    double prefetch_write_throughput;   /* in bytes/millisecond. */
    double prefetch_min_read_throughput;        /* in bytes/millisecond. */
    double prefetch_min_write_throughput;       /* in bytes/millisecond. */
    int64_t prefetch_read_throughput_measure_time;      /* in millisecond. */
    int64_t prefetch_write_throughput_measure_time;     /* in millisecond.*/
    int prefetch_throttle_time; /* in millisecond. */
    int sectors_per_prefetch;
    QEMUTimer *prefetch_timer;
    /* prefetch_perf_calc_alpha = FvdHeader.prefetch_perf_calc_alpha/100.0 */
    double prefetch_perf_calc_alpha;
    /******** End: for prefetching. ***********************************/

    /******** Begin: for compact image. *************************************/
    uint32_t *table;    /* Mapping table stored in memory in little endian. */
    int64_t data_storage;       /* in sectors. */
    int64_t used_storage;       /* in sectors. */
    int64_t chunk_size;         /* in sectors. */
    int64_t storage_grow_unit;  /* in sectors. */
    int64_t table_offset;       /* in sectors. */
    char *add_storage_cmd;
    /******** End: for compact image. ***************************************/

    /******** Begin: for journal. *******************************************/
    int64_t journal_offset;     /* in sectors. */
    int64_t journal_size;       /* in sectors. */
    int64_t next_journal_sector;        /* in sector. */
    int ongoing_journal_updates;        /* Number of ongoing journal updates. */
    int dirty_image;            /* TRUE or FALSE. */

    /* Requests waiting for metadata flush and journal recycle to finish. */
    QLIST_HEAD(JournalFlush, FvdAIOCB) wait_for_journal;
    /******** End: for journal. ********************************************/

#ifdef FVD_DEBUG
    int64_t total_copy_on_read_data;    /* in bytes. */
    int64_t total_prefetch_data;        /* in bytes. */
#endif
} BDRVFvdState;
/* Begin of data type definitions. */
struct FvdAIOCB;

/* Per-request state for a journal write (embedded in FvdAIOCB.jcb).
 * Used by AIOWriteCB and AIOStoreCompactCB. */
typedef struct JournalCB {
    BlockDriverAIOCB *hd_acb;   /* The in-flight lower-layer AIO request. */
    QEMUIOVector qiov;
    struct iovec iov;
    /* Link on BDRVFvdState.wait_for_journal. */
    QLIST_ENTRY(FvdAIOCB) next_wait_for_journal;
} JournalCB;
/* CopyLock is used by AIOWriteCB and AIOCopyCB. It locks the byte range
 * [begin, end) and queues writes that must wait for the copy to finish. */
typedef struct CopyLock {
    QLIST_ENTRY(FvdAIOCB) next; /* Link on BDRVFvdState.copy_locks. */
    int64_t begin;
    int64_t end;
    /* Writes blocked on this locked region. */
    QLIST_HEAD(DependentWritesHead, FvdAIOCB) dependent_writes;
} CopyLock;
/* One sub-read issued on behalf of an AIOReadCB (either against the backing
 * image or against the FVD data file). */
typedef struct ChildAIOReadCB {
    BlockDriverAIOCB *hd_acb;   /* The in-flight lower-layer AIO request. */
    struct iovec iov;
    QEMUIOVector qiov;
    int64_t sector_num;
    int nb_sectors;
    int done;                   /* TRUE once this sub-read has completed. */
} ChildAIOReadCB;
/* State of a guest read request, possibly split into two sub-reads:
 * one from the backing image and one from the FVD image. */
typedef struct AIOReadCB {
    QEMUIOVector *qiov;         /* Caller's scatter/gather list. */
    int ret;                    /* Aggregated completion status. */
    ChildAIOReadCB read_backing;
    ChildAIOReadCB read_fvd;
} AIOReadCB;
/* For copy-on-read and prefetching: reads a region into buf, then writes it
 * out to the FVD image. */
typedef struct AIOCopyCB {
    BlockDriverAIOCB *hd_acb;   /* The in-flight lower-layer AIO request. */
    struct iovec iov;
    QEMUIOVector qiov;
    uint8_t *buf;               /* Bounce buffer holding the copied data. */
    int64_t buffered_sector_begin;
    int64_t buffered_sector_end;
    int64_t last_prefetch_op_start_time;        /* For prefetch only. */
} AIOCopyCB;
/* State of a guest write request, including the optional copy-on-write
 * data read from the backing image. */
typedef struct AIOWriteCB {
    BlockDriverAIOCB *hd_acb;   /* The in-flight lower-layer AIO request. */
    QEMUIOVector *qiov;         /* Caller's scatter/gather list. */
    uint8_t *cow_buf;           /* Buffer for copy-on-write data, if any. */
    QEMUIOVector *cow_qiov;
    int64_t cow_start_sector;
    int update_table;           /* TRUE or FALSE. */
    int ret;
    QLIST_ENTRY(FvdAIOCB) next_write_lock;      /* See BDRVFvdState.write_locks */

    /* See FvdAIOCB.copy_lock.dependent_writes. */
    QLIST_ENTRY(FvdAIOCB) next_dependent_write;
} AIOWriteCB;
/* For AIOStoreCompactCB and AIOLoadCompactCB: one child request plus a back
 * pointer to the parent FvdAIOCB that spawned it. */
typedef struct CompactChildCB {
    struct FvdAIOCB *acb;       /* Parent request. */
    BlockDriverAIOCB *hd_acb;   /* The in-flight lower-layer AIO request. */
} CompactChildCB;
/* For storing data to a compact image. A request may fan out into multiple
 * children when it spans several (possibly non-contiguous) chunks. */
typedef struct AIOStoreCompactCB {
    CompactChildCB one_child;   /* Inline storage for the common 1-child case. */
    CompactChildCB *children;
    int update_table;
    int num_children;
    int finished_children;
    struct FvdAIOCB *parent_acb;
    int ret;
    int soft_write; /*TRUE if the store is caused by copy-on-read or prefetch.*/
    QEMUIOVector *orig_qiov;    /* Caller's original scatter/gather list. */
} AIOStoreCompactCB;
/* For loading data from a compact image. Mirrors AIOStoreCompactCB: one
 * child per chunk touched by the request. */
typedef struct AIOLoadCompactCB {
    CompactChildCB *children;
    CompactChildCB one_child;   /* Inline storage for the common 1-child case. */
    int num_children;
    int finished_children;
    struct FvdAIOCB *parent_acb;
    int ret;
    QEMUIOVector *orig_qiov;    /* Caller's original scatter/gather list. */
} AIOLoadCompactCB;
/* State of a flush request: flushes both the data file and the metadata
 * file, completing when both sub-flushes have finished. */
typedef struct AIOFlushCB {
    BlockDriverAIOCB *data_acb;
    BlockDriverAIOCB *metadata_acb;
    int num_finished;           /* How many of the two sub-flushes are done. */
    int ret;                    /* Aggregated completion status. */
} AIOFlushCB;
/* A trivial request completed via a bottom half (no real I/O of its own). */
typedef struct AIOWrapperCB {
    QEMUBH *bh;
} AIOWrapperCB;
/* Discriminator stored in FvdAIOCB.type: identifies which member of the
 * request union is active. Starts at 1 so 0 is never a valid type. */
typedef enum { OP_READ = 1, OP_WRITE, OP_COPY, OP_STORE_COMPACT,
    OP_LOAD_COMPACT, OP_WRAPPER, OP_FLUSH } op_type;
#ifdef FVD_DEBUG
/* For debugging memory leaks: records where and how big an allocation was. */
typedef struct alloc_tracer_t {
    int64_t magic;              /* Should hold FVD_ALLOC_MAGIC. */
    int alloc_tracer;
    const char *alloc_file;     /* Source file of the allocation site. */
    int alloc_line;             /* Source line of the allocation site. */
    size_t size;                /* Allocation size in bytes. */
} alloc_tracer_t;
#endif
346 typedef struct FvdAIOCB {
347 BlockDriverAIOCB common;
348 op_type type;
349 int64_t sector_num;
350 int nb_sectors;
351 JournalCB jcb; /* For AIOWriteCB and AIOStoreCompactCB. */
352 CopyLock copy_lock; /* For AIOWriteCB and AIOCopyCB. */
354 /* Use a union so that all requests can efficiently share one big AIOCBInfo.*/
355 union {
356 AIOWrapperCB wrapper;
357 AIOReadCB read;
358 AIOWriteCB write;
359 AIOCopyCB copy;
360 AIOLoadCompactCB load;
361 AIOStoreCompactCB store;
362 AIOFlushCB flush;
365 #ifdef FVD_DEBUG
366 int64_t magic;
367 alloc_tracer_t tracer;
369 /* Uniquely identifies a request across all processing activities. */
370 unsigned long long int uuid;
371 #endif
372 } FvdAIOCB;
/* NOTE(review): 'static' definitions in a header are only safe because this
 * header appears to be included by a single .c file (QEMU's one-file-per-
 * driver convention) — confirm before including it elsewhere. */
static AIOCBInfo fvd_aio_pool;
static BlockDriver bdrv_fvd;
static QemuOptsList fvd_create_opts;

/* Function prototypes. */
static int do_aio_write(struct FvdAIOCB *acb);
static void finish_write_data(void *opaque, int ret);
static void restart_dependent_writes(struct FvdAIOCB *acb);
static void finish_prefetch_read(void *opaque, int ret);
static int read_fvd_header(BDRVFvdState * s, FvdHeader * header);
static int update_fvd_header(BDRVFvdState * s, FvdHeader * header);
#if 0
static void fvd_aio_cancel(BlockDriverAIOCB * blockacb);
#endif
static BlockDriverAIOCB *store_data_in_compact_image(struct FvdAIOCB *acb,
    int soft_write, struct FvdAIOCB *parent_acb, BlockDriverState * bs,
    int64_t sector_num, QEMUIOVector * qiov, int nb_sectors,
    BlockDriverCompletionFunc * cb, void *opaque);
static BlockDriverAIOCB *load_data_from_compact_image(struct FvdAIOCB *acb,
    struct FvdAIOCB *parent_acb, BlockDriverState * bs,
    int64_t sector_num, QEMUIOVector * qiov, int nb_sectors,
    BlockDriverCompletionFunc * cb, void *opaque);
static void free_write_resource(struct FvdAIOCB *acb);
static void write_metadata_to_journal(struct FvdAIOCB *acb);
static void flush_metadata_to_disk(BlockDriverState * bs);
static void free_journal_sectors(BDRVFvdState * s);
static int fvd_create(const char *filename, QemuOpts *options,
                      Error **errp);
static int fvd_probe(const uint8_t * buf, int buf_size, const char *filename);
static int64_t coroutine_fn fvd_get_block_status(BlockDriverState *bs,
                                                 int64_t sector_num,
                                                 int nb_sectors, int *pnum);
static int fvd_flush(BlockDriverState * bs);
static BlockDriverAIOCB *fvd_aio_readv(BlockDriverState * bs,
    int64_t sector_num, QEMUIOVector * qiov, int nb_sectors,
    BlockDriverCompletionFunc * cb, void *opaque);
static BlockDriverAIOCB *fvd_aio_writev(BlockDriverState * bs,
    int64_t sector_num, QEMUIOVector * qiov, int nb_sectors,
    BlockDriverCompletionFunc * cb, void *opaque);
static BlockDriverAIOCB *fvd_aio_flush(BlockDriverState * bs,
    BlockDriverCompletionFunc * cb, void *opaque);
static int fvd_get_info(BlockDriverState * bs, BlockDriverInfo * bdi);
static int fvd_update(BlockDriverState * bs, int argc, char **argv);
static int fvd_has_zero_init(BlockDriverState * bs);
#if 0
static void fvd_read_cancel(FvdAIOCB * acb);
static void fvd_write_cancel(FvdAIOCB * acb);
static void fvd_copy_cancel(FvdAIOCB * acb);
static void fvd_load_compact_cancel(FvdAIOCB * acb);
static void fvd_store_compact_cancel(FvdAIOCB * acb);
static void fvd_wrapper_cancel(FvdAIOCB * acb);
#endif
static void flush_metadata_to_disk_on_exit (BlockDriverState *bs);
static inline BlockDriverAIOCB *load_data(FvdAIOCB * parent_acb,
    BlockDriverState * bs, int64_t sector_num, QEMUIOVector * orig_qiov,
    int nb_sectors, BlockDriverCompletionFunc * cb, void *opaque);
static inline BlockDriverAIOCB *store_data(int soft_write,
    FvdAIOCB * parent_acb, BlockDriverState * bs, int64_t sector_num,
    QEMUIOVector * orig_qiov, int nb_sectors,
    BlockDriverCompletionFunc * cb, void *opaque);
/* Default configurations. These seed FvdHeader fields at image-creation
 * time unless overridden. */
#define DEF_PAGE_SIZE 4096      /* bytes */
#define BYTES_PER_PREFETCH 1048576      /* bytes */
#define PREFETCH_THROTTLING_TIME 30000  /* milliseconds */
#define NUM_PREFETCH_SLOTS 2
#define PREFETCH_MIN_MEASURE_READ_TIME 100      /* milliseconds */
#define PREFETCH_MIN_MEASURE_WRITE_TIME 100     /* milliseconds */
#define PREFETCH_MIN_READ_THROUGHPUT 5120       /* KB/s */
#define PREFETCH_MIN_WRITE_THROUGHPUT 5120      /* KB/s */
#define PREFETCH_MAX_READ_THROUGHPUT 1000000000L        /* KB/s */
#define PREFETCH_MAX_WRITE_THROUGHPUT 1000000000L       /* KB/s */
#define PREFETCH_PERF_CALC_ALPHA 80     /* in [0,100]. */
#define MAX_OUTSTANDING_COPY_ON_READ_DATA 2000000       /* bytes */
#define MODERATE_BITMAP_SIZE 4194304L   /* bytes */
#define CHUNK_SIZE 1048576LL    /* bytes */
#define JOURNAL_SIZE 16777216LL /* bytes */
#define STORAGE_GROW_UNIT 104857600LL   /* bytes */

/* State of BDRVFvdState.prefetch_state. */
#define PREFETCH_STATE_RUNNING 1
#define PREFETCH_STATE_FINISHED 2
#define PREFETCH_STATE_DISABLED 3
/* For convenience. */
#undef ROUND_UP                 /* override definition from osdep.h */
#define ROUND_UP(x, base) ((((x)+(base)-1) / (base)) * (base))
#define ROUND_DOWN(x, base) ((((x) / (base)) * (base)))
#define BOOL(x) ((x) ? "true" : "false")

/* Compact-image mapping-table entry encoding (stored little endian):
 * all-ones marks an empty (unmapped) entry; the top bit is the dirty flag;
 * the remaining bits are the chunk id. */
#define EMPTY_TABLE ((uint32_t)0xFFFFFFFF)
#define DIRTY_TABLE ((uint32_t)0x80000000)
/* Chunk id of an entry; caller must ensure the entry is not empty. */
#define READ_TABLE(entry) (le32_to_cpu(entry) & ~DIRTY_TABLE)
# define FVDAIOCB_MAGIC ((uint64_t)0x3A8FCE89325B976DULL)
# define FVD_ALLOC_MAGIC ((uint64_t)0x4A7dCEF9925B976DULL)
#define IS_EMPTY(entry) ((entry) == EMPTY_TABLE)
#define IS_DIRTY(entry) (le32_to_cpu(entry) & DIRTY_TABLE)
#define WRITE_TABLE(entry,id) ((entry) = cpu_to_le32(id))
/* Like READ_TABLE, but passes EMPTY_TABLE through unchanged. */
#define READ_TABLE2(entry) \
    ((entry)==EMPTY_TABLE ? EMPTY_TABLE : (le32_to_cpu(entry) & ~DIRTY_TABLE))

/* Clear the dirty flag; no-op if the entry is empty. */
#define CLEAN_DIRTY(entry) \
    do { \
        if (!IS_EMPTY(entry)) \
            entry = cpu_to_le32(le32_to_cpu(entry) & ~DIRTY_TABLE); \
    } while (0)

/* Clear the dirty flag; the entry must not be empty (asserted). */
#define CLEAN_DIRTY2(entry) \
    do { \
        ASSERT(!IS_EMPTY(entry)); \
        entry = cpu_to_le32(le32_to_cpu(entry) & ~DIRTY_TABLE); \
    } while (0)