/*
 * Copyright (c) 2010-2011 IBM
 * Chunqiang Tang <ctang@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.
 * See the COPYING file in the top-level directory.
 */
/*=============================================================================
 * A short description: this is the header of the FVD block device driver.
 *============================================================================*/
#include <sys/ioctl.h>
#include "qemu-common.h"
#include "block/block_int.h"
#include "qemu/option.h"
#include "qemu/queue.h"
#include "qemu/timer.h"
#include "block/block.h"
#include "block/blksim.h"
#include "block/fvd-ext.h"
#define FVD_MAGIC (('Q' << 24) | ('C' << 16) | (0xF5 << 8) | 0xA9)
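/* For reference, FVD_MAGIC evaluates to 0x5143F5A9, i.e., the bytes
 * 'Q', 'C', 0xF5, 0xA9 packed from the most significant byte down. */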
extern bool in_qemu_tool;
/* Profile-directed prefetch. (to be implemented) */
typedef struct __attribute__ ((__packed__)) PrefetchProfileEntry {
    int64_t offset;    /* in bytes */

    /* In the unit of FvdHeader.unit_of_PrefetchProfileEntry_len, i.e.,
     * len_in_bytes = len * FvdHeader.unit_of_PrefetchProfileEntry_len
     * (see the worked example after this struct). */
} PrefetchProfileEntry;
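/*
 * Worked example of the formula above (the values are illustrative only):
 * with FvdHeader.unit_of_PrefetchProfileEntry_len == 512 and an entry whose
 * len == 8, the entry asks to prefetch 8 * 512 == 4096 bytes starting at
 * 'offset'.
 */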
/*
 * The FVD format consists of:
 *   + Header fields of FvdHeader.
 *   + Bitmap, starting on a 4KB page boundary at a location specified by
 *     FvdHeader.bitmap_offset.
 *   + Table, starting on a 4KB page boundary at a location specified by
 *     FvdHeader.table_offset.
 *   + Journal, starting on a 4KB page boundary at a location specified by
 *     FvdHeader.journal_offset.
 *   + Prefetch profile entries, starting on a 4KB page boundary at a location
 *     specified by FvdHeader.prefetch_profile_offset. (to be implemented)
 *   + Virtual disk data, starting on a 4KB page boundary. Optionally, disk
 *     data can be stored in a separate data file specified by
 *     FvdHeader.data_file.
 */
typedef struct __attribute__ ((__packed__)) FvdHeader {
    /* This field is set to TRUE after whole-image prefetching finishes. */
    int32_t all_data_in_fvd_img;

    int64_t virtual_disk_size;    /* in bytes. Disk size perceived by the VM. */
    int64_t metadata_size;        /* in bytes. */

    char base_img_fmt[16];
    int64_t base_img_size;        /* in bytes. */
    int64_t bitmap_offset;        /* in bytes. Aligned on DEF_PAGE_SIZE. */
    int64_t bitmap_size;          /* in bytes. Rounded up to DEF_PAGE_SIZE. */
    int32_t block_size;           /* in bytes. */
    int32_t copy_on_read;         /* TRUE or FALSE */
    int64_t max_outstanding_copy_on_read_data;    /* in bytes. */

    /* If (data_file[0] == 0), the FVD metadata and data are stored in one file. */
    char data_file_fmt[16];
    /******** Begin: for prefetching. *******************************/
    /* in seconds. -1 means disable whole image prefetching. */
    int32_t prefetch_start_delay;

    /* in bytes. Aligned on DEF_PAGE_SIZE. (to be implemented) */
    int64_t prefetch_profile_offset;

    /* Number of PrefetchProfileEntry. (to be implemented) */
    int64_t prefetch_profile_entries;

    int32_t num_prefetch_slots;    /* Max number of outstanding prefetch writes. */
    int32_t bytes_per_prefetch;    /* For whole image prefetching. */
    int32_t prefetch_read_throughput_measure_time;     /* in milliseconds. */
    int32_t prefetch_write_throughput_measure_time;    /* in milliseconds. */

    /* Controls the calculation of the moving average of throughput. Must be a
     * value in [0, 100]:
     *     actual_normalized_alpha = prefetch_perf_calc_alpha / 100.0
     * (see the sketch after this struct). */
    int32_t prefetch_perf_calc_alpha;

    int32_t prefetch_min_read_throughput;     /* in KB/second. */
    int32_t prefetch_min_write_throughput;    /* in KB/second. */
    int32_t prefetch_max_read_throughput;     /* in KB/second. */
    int32_t prefetch_max_write_throughput;    /* in KB/second. */

    /* in milliseconds. When prefetch read/write throughput is low, prefetch
     * pauses for a random time uniformly distributed in
     * [0, prefetch_throttle_time] (see the sketch after this struct). */
    int32_t prefetch_throttle_time;
    /******** End: for prefetching. *******************************/
    /******** Begin: for compact image. *****************************/
    int32_t compact_image;        /* TRUE or FALSE */
    int64_t table_offset;         /* in bytes. */
    int64_t chunk_size;           /* in bytes. */
    int64_t storage_grow_unit;    /* in bytes. */
    char add_storage_cmd[2048];
    /******** End: for compact image. *******************************/
    /******** Begin: for journal. ***********************************/
    int64_t journal_offset;       /* in bytes. */
    int64_t journal_size;         /* in bytes. */
    int32_t clean_shutdown;       /* TRUE if the VM's last shutdown was graceful. */
    /******** End: for journal. *************************************/
    /*
     * This field is TRUE if the image mandates that the storage layer
     * (BDRVFvdState.fvd_data) must return TRUE for bdrv_has_zero_init().
     * This is the case if the optimization described in Section 3.3.3 of the
     * FVD-cow paper is enabled (see function search_holes()). If 'qemu-img
     * create' sets need_zero_init to TRUE, 'qemu-img update' can be used to
     * manually reset it to FALSE, if the user always manually pre-fills the
     * storage (e.g., a raw partition) with zeros. If the image is stored on a
     * file system, it already supports zero_init, and hence there is no need
     * to manually manipulate this field.
     */
    int32_t need_zero_init;
    /* If TRUE, FVD dumps a prefetch profile after the VM shuts down.
     * (to be implemented) */
    int32_t generate_prefetch_profile;

    /* See the comment on PrefetchProfileEntry.len. (to be implemented) */
    int32_t unit_of_PrefetchProfileEntry_len;

    /* in seconds. -1 means disable profile-directed prefetching.
     * (to be implemented) */
    int32_t profile_directed_prefetch_start_delay;

    /* Possible values are "no", "writethrough", "writeback", or
     * "writenocache". (to be implemented) */
    char write_updates_base_img[16];
} FvdHeader;
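/*
 * Minimal sketch (hypothetical variable names, not the driver's code) of how
 * two of the prefetch knobs above are intended to be applied:
 *
 *     // Weight for the moving average of measured prefetch throughput.
 *     double alpha = header.prefetch_perf_calc_alpha / 100.0;   // in [0.0, 1.0]
 *
 *     // When throughput falls below prefetch_min_*_throughput, pause for a
 *     // random time uniformly distributed in [0, prefetch_throttle_time] ms.
 *     int pause_ms = rand() % (header.prefetch_throttle_time + 1);
 */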
typedef struct BDRVFvdState {
    BlockDriverState *fvd_metadata;
    BlockDriverState *fvd_data;
    int64_t virtual_disk_size;    /* in bytes. */
    int64_t bitmap_offset;        /* in sectors */
    int64_t bitmap_size;          /* in bytes. */
    int64_t data_offset;          /* in sectors. Begin of real data. */
    int64_t nb_sectors_in_base_img;
    int32_t block_size;           /* in sectors. */
    int copy_on_read;             /* TRUE or FALSE */
    int64_t max_outstanding_copy_on_read_data;    /* in bytes. */
    int64_t outstanding_copy_on_read_data;        /* in bytes. */
    int data_region_prepared;     /* TRUE or FALSE */
    QLIST_HEAD(WriteLocks, FvdAIOCB) write_locks;    /* All writes. */
    QLIST_HEAD(CopyLocks, FvdAIOCB) copy_locks;      /* copy-on-read and CoW. */
    /* Keep two copies of the bitmap to reduce the overhead of updating the
     * on-disk bitmap, i.e., copy-on-read and prefetching do not update the
     * on-disk bitmap. See Section 3.3.4 of the FVD-cow paper (and the sketch
     * after this struct). */
    uint8_t *fresh_bitmap;
    uint8_t *stale_bitmap;
    /******** Begin: for prefetching. ***********************************/
    struct FvdAIOCB **prefetch_acb;
    int prefetch_state;    /* PREFETCH_STATE_RUNNING, FINISHED, or DISABLED. */
    int prefetch_error;    /* TRUE or FALSE */
    int num_prefetch_slots;
    int num_filled_prefetch_slots;
    int next_prefetch_read_slot;
    int prefetch_read_active;        /* TRUE or FALSE */
    int pause_prefetch_requested;    /* TRUE or FALSE */
    int prefetch_start_delay;        /* in seconds */
    int64_t unclaimed_prefetch_region_start;
    int64_t prefetch_read_time;      /* in milliseconds. */
    int64_t prefetch_write_time;     /* in milliseconds. */
    int64_t prefetch_data_read;      /* in bytes. */
    int64_t prefetch_data_written;   /* in bytes. */
    double prefetch_read_throughput;         /* in bytes/millisecond. */
    double prefetch_write_throughput;        /* in bytes/millisecond. */
    double prefetch_min_read_throughput;     /* in bytes/millisecond. */
    double prefetch_min_write_throughput;    /* in bytes/millisecond. */
    int64_t prefetch_read_throughput_measure_time;     /* in milliseconds. */
    int64_t prefetch_write_throughput_measure_time;    /* in milliseconds. */
    int prefetch_throttle_time;      /* in milliseconds. */
    int sectors_per_prefetch;
    QEMUTimer *prefetch_timer;
    /* prefetch_perf_calc_alpha = FvdHeader.prefetch_perf_calc_alpha / 100.0 */
    double prefetch_perf_calc_alpha;
    /******** End: for prefetching. ***********************************/
    /******** Begin: for compact image. *************************************/
    uint32_t *table;    /* Mapping table stored in memory in little endian. */
    int64_t data_storage;         /* in sectors. */
    int64_t used_storage;         /* in sectors. */
    int64_t chunk_size;           /* in sectors. */
    int64_t storage_grow_unit;    /* in sectors. */
    int64_t table_offset;         /* in sectors. */
    char *add_storage_cmd;
    /******** End: for compact image. ***************************************/
    /******** Begin: for journal. *******************************************/
    int64_t journal_offset;        /* in sectors. */
    int64_t journal_size;          /* in sectors. */
    int64_t next_journal_sector;   /* in sectors. */
    int ongoing_journal_updates;   /* Number of ongoing journal updates. */
    int dirty_image;               /* TRUE or FALSE. */

    /* Requests waiting for metadata flush and journal recycle to finish. */
    QLIST_HEAD(JournalFlush, FvdAIOCB) wait_for_journal;
    /******** End: for journal. ********************************************/

    int64_t total_copy_on_read_data;    /* in bytes. */
    int64_t total_prefetch_data;        /* in bytes. */
} BDRVFvdState;
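/*
 * Minimal sketch of the fresh/stale bitmap split described above, assuming one
 * bitmap bit per block of 'block_size' sectors (the helper name set_bit() and
 * the exact bit layout are hypothetical, for illustration only):
 *
 *     // A normal VM write updates both copies; stale_bitmap mirrors the
 *     // on-disk bitmap and is persisted through the journal.
 *     set_bit(s->fresh_bitmap, block);
 *     set_bit(s->stale_bitmap, block);
 *
 *     // Copy-on-read and prefetching update only the in-memory fresh_bitmap,
 *     // avoiding a metadata write for data that can still be re-read from the
 *     // base image after a crash (FVD-cow paper, Section 3.3.4).
 *     set_bit(s->fresh_bitmap, block);
 */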
/* Begin of data type definitions. */
typedef struct JournalCB {
    BlockDriverAIOCB *hd_acb;
    QLIST_ENTRY(FvdAIOCB) next_wait_for_journal;
} JournalCB;
/* CopyLock is used by AIOWriteCB and AIOCopyCB. */
typedef struct CopyLock {
    QLIST_ENTRY(FvdAIOCB) next;
    QLIST_HEAD(DependentWritesHead, FvdAIOCB) dependent_writes;
} CopyLock;
typedef struct ChildAIOReadCB {
    BlockDriverAIOCB *hd_acb;
} ChildAIOReadCB;
typedef struct AIOReadCB {
    ChildAIOReadCB read_backing;
    ChildAIOReadCB read_fvd;
} AIOReadCB;
/* For copy-on-read and prefetching. */
typedef struct AIOCopyCB {
    BlockDriverAIOCB *hd_acb;
    int64_t buffered_sector_begin;
    int64_t buffered_sector_end;
    int64_t last_prefetch_op_start_time;    /* For prefetch only. */
} AIOCopyCB;
typedef struct AIOWriteCB {
    BlockDriverAIOCB *hd_acb;
    QEMUIOVector *cow_qiov;
    int64_t cow_start_sector;
    int update_table;    /* TRUE or FALSE. */
    QLIST_ENTRY(FvdAIOCB) next_write_lock;    /* See BDRVFvdState.write_locks */

    /* See FvdAIOCB.write.dependent_writes. */
    QLIST_ENTRY(FvdAIOCB) next_dependent_write;
} AIOWriteCB;
/* For AIOStoreCompactCB and AIOLoadCompactCB. */
typedef struct CompactChildCB {
    struct FvdAIOCB *acb;
    BlockDriverAIOCB *hd_acb;
} CompactChildCB;
/* For storing data to a compact image. */
typedef struct AIOStoreCompactCB {
    CompactChildCB one_child;
    CompactChildCB *children;
    int finished_children;
    struct FvdAIOCB *parent_acb;
    int soft_write;    /* TRUE if the store is caused by copy-on-read or prefetch. */
    QEMUIOVector *orig_qiov;
} AIOStoreCompactCB;
/* For loading data from a compact image. */
typedef struct AIOLoadCompactCB {
    CompactChildCB *children;
    CompactChildCB one_child;
    int finished_children;
    struct FvdAIOCB *parent_acb;
    QEMUIOVector *orig_qiov;
} AIOLoadCompactCB;
typedef struct AIOFlushCB {
    BlockDriverAIOCB *data_acb;
    BlockDriverAIOCB *metadata_acb;
} AIOFlushCB;
typedef struct AIOWrapperCB {
} AIOWrapperCB;
typedef enum { OP_READ = 1, OP_WRITE, OP_COPY, OP_STORE_COMPACT,
    OP_LOAD_COMPACT, OP_WRAPPER, OP_FLUSH } op_type;
/* For debugging memory leaks. */
typedef struct alloc_tracer_t {
    const char *alloc_file;
} alloc_tracer_t;
typedef struct FvdAIOCB {
    BlockDriverAIOCB common;

    JournalCB jcb;         /* For AIOWriteCB and AIOStoreCompactCB. */
    CopyLock copy_lock;    /* For AIOWriteCB and AIOCopyCB. */

    /* Use a union so that all requests can efficiently share one big AIOCBInfo. */
    union {
        AIOWrapperCB wrapper;
        AIOLoadCompactCB load;
        AIOStoreCompactCB store;
    };

    alloc_tracer_t tracer;

    /* Uniquely identifies a request across all processing activities. */
    unsigned long long int uuid;
} FvdAIOCB;
static AIOCBInfo fvd_aio_pool;
static BlockDriver bdrv_fvd;
static QemuOptsList fvd_create_opts;
/* Function prototypes. */
static int do_aio_write(struct FvdAIOCB *acb);
static void finish_write_data(void *opaque, int ret);
static void restart_dependent_writes(struct FvdAIOCB *acb);
static void finish_prefetch_read(void *opaque, int ret);
static int read_fvd_header(BDRVFvdState *s, FvdHeader *header);
static int update_fvd_header(BDRVFvdState *s, FvdHeader *header);
static void fvd_aio_cancel(BlockDriverAIOCB *blockacb);
static BlockDriverAIOCB *store_data_in_compact_image(struct FvdAIOCB *acb,
        int soft_write, struct FvdAIOCB *parent_acb, BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *load_data_from_compact_image(struct FvdAIOCB *acb,
        struct FvdAIOCB *parent_acb, BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static void free_write_resource(struct FvdAIOCB *acb);
static void write_metadata_to_journal(struct FvdAIOCB *acb);
static void flush_metadata_to_disk(BlockDriverState *bs);
static void free_journal_sectors(BDRVFvdState *s);
static int fvd_create(const char *filename, QemuOpts *options, Error **errp);
static int fvd_probe(const uint8_t *buf, int buf_size, const char *filename);
static int64_t coroutine_fn fvd_get_block_status(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, int *pnum);
static int fvd_flush(BlockDriverState *bs);
static BlockDriverAIOCB *fvd_aio_readv(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *fvd_aio_writev(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *fvd_aio_flush(BlockDriverState *bs,
        BlockDriverCompletionFunc *cb, void *opaque);
static int fvd_get_info(BlockDriverState *bs, BlockDriverInfo *bdi);
static int fvd_update(BlockDriverState *bs, int argc, char **argv);
static int fvd_has_zero_init(BlockDriverState *bs);

static void fvd_read_cancel(FvdAIOCB *acb);
static void fvd_write_cancel(FvdAIOCB *acb);
static void fvd_copy_cancel(FvdAIOCB *acb);
static void fvd_load_compact_cancel(FvdAIOCB *acb);
static void fvd_store_compact_cancel(FvdAIOCB *acb);
static void fvd_wrapper_cancel(FvdAIOCB *acb);
static void flush_metadata_to_disk_on_exit(BlockDriverState *bs);
static inline BlockDriverAIOCB *load_data(FvdAIOCB *parent_acb,
        BlockDriverState *bs, int64_t sector_num, QEMUIOVector *orig_qiov,
        int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque);
static inline BlockDriverAIOCB *store_data(int soft_write,
        FvdAIOCB *parent_acb, BlockDriverState *bs, int64_t sector_num,
        QEMUIOVector *orig_qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
/* Default configurations. */
#define DEF_PAGE_SIZE 4096                          /* bytes */
#define BYTES_PER_PREFETCH 1048576                  /* bytes */
#define PREFETCH_THROTTLING_TIME 30000              /* milliseconds */
#define NUM_PREFETCH_SLOTS 2
#define PREFETCH_MIN_MEASURE_READ_TIME 100          /* milliseconds */
#define PREFETCH_MIN_MEASURE_WRITE_TIME 100         /* milliseconds */
#define PREFETCH_MIN_READ_THROUGHPUT 5120           /* KB/s */
#define PREFETCH_MIN_WRITE_THROUGHPUT 5120          /* KB/s */
#define PREFETCH_MAX_READ_THROUGHPUT 1000000000L    /* KB/s */
#define PREFETCH_MAX_WRITE_THROUGHPUT 1000000000L   /* KB/s */
#define PREFETCH_PERF_CALC_ALPHA 80                 /* in [0, 100]. */
#define MAX_OUTSTANDING_COPY_ON_READ_DATA 2000000   /* bytes */
#define MODERATE_BITMAP_SIZE 4194304L               /* bytes */
#define CHUNK_SIZE 1048576LL                        /* bytes */
#define JOURNAL_SIZE 16777216LL                     /* bytes */
#define STORAGE_GROW_UNIT 104857600LL               /* bytes */
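/* For reference: BYTES_PER_PREFETCH and CHUNK_SIZE are 1 MB, JOURNAL_SIZE is
 * 16 MB, and STORAGE_GROW_UNIT is 100 MB (the values above are in bytes). */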
/* State of BDRVFvdState.prefetch_state. */
#define PREFETCH_STATE_RUNNING 1
#define PREFETCH_STATE_FINISHED 2
#define PREFETCH_STATE_DISABLED 3
#undef ROUND_UP    /* override definition from osdep.h */
#define ROUND_UP(x, base) ((((x) + (base) - 1) / (base)) * (base))
#define ROUND_DOWN(x, base) (((x) / (base)) * (base))
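/*
 * For example, with base == DEF_PAGE_SIZE (4096):
 *     ROUND_UP(5000, 4096)   == 8192
 *     ROUND_UP(8192, 4096)   == 8192
 *     ROUND_DOWN(5000, 4096) == 4096
 * e.g., useful for keeping the metadata regions (bitmap, table, journal)
 * page-aligned, as required by the on-disk layout described above.
 */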
#define BOOL(x) ((x) ? "true" : "false")
#define EMPTY_TABLE ((uint32_t)0xFFFFFFFF)
#define DIRTY_TABLE ((uint32_t)0x80000000)
#define READ_TABLE(entry) (le32_to_cpu(entry) & ~DIRTY_TABLE)
#define FVDAIOCB_MAGIC ((uint64_t)0x3A8FCE89325B976DULL)
#define FVD_ALLOC_MAGIC ((uint64_t)0x4A7dCEF9925B976DULL)
#define IS_EMPTY(entry) ((entry) == EMPTY_TABLE)
#define IS_DIRTY(entry) (le32_to_cpu(entry) & DIRTY_TABLE)
#define WRITE_TABLE(entry, id) ((entry) = cpu_to_le32(id))
#define READ_TABLE2(entry) \
    ((entry) == EMPTY_TABLE ? EMPTY_TABLE : (le32_to_cpu(entry) & ~DIRTY_TABLE))
#define CLEAN_DIRTY(entry) \
    do { \
        if (!IS_EMPTY(entry)) { \
            entry = cpu_to_le32(le32_to_cpu(entry) & ~DIRTY_TABLE); \
        } \
    } while (0)

#define CLEAN_DIRTY2(entry) \
    do { \
        ASSERT(!IS_EMPTY(entry)); \
        entry = cpu_to_le32(le32_to_cpu(entry) & ~DIRTY_TABLE); \
    } while (0)
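/*
 * Illustrative round trip of a mapping-table entry through the macros above
 * (the chunk id 7 and the dirty-marking policy shown are hypothetical):
 *
 *     uint32_t entry = EMPTY_TABLE;            // chunk not yet allocated
 *     WRITE_TABLE(entry, 7 | DIRTY_TABLE);     // map to chunk 7, marked dirty
 *     // IS_DIRTY(entry) is now true and READ_TABLE(entry) == 7.
 *     CLEAN_DIRTY(entry);                      // e.g., after the journal/table
 *                                              // has been flushed to disk
 *     // IS_DIRTY(entry) is now false; READ_TABLE(entry) is still 7.
 */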