/*
 * Copyright (c) 2010-2011 IBM
 *
 * Chunqiang Tang <ctang@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.
 * See the COPYING file in the top-level directory.
 */

/*=============================================================================
 *  A short description: this module implements misc functions of the
 *  BlockDriver interface for the Fast Virtual Disk (FVD) format.
 *===========================================================================*/
17 static void fvd_flush_cancel (FvdAIOCB
* acb
)
19 if (acb
->flush
.data_acb
) {
20 bdrv_aio_cancel (acb
->flush
.data_acb
);
22 if (acb
->flush
.metadata_acb
) {
23 bdrv_aio_cancel (acb
->flush
.metadata_acb
);
25 my_qemu_aio_unref (acb
);
28 static void fvd_aio_cancel (BlockDriverAIOCB
* blockacb
)
30 FvdAIOCB
*acb
= container_of (blockacb
, FvdAIOCB
, common
);
32 QDEBUG ("CANCEL: acb%llu-%p\n", acb
->uuid
, acb
);
36 fvd_read_cancel (acb
);
40 fvd_write_cancel (acb
);
44 fvd_copy_cancel (acb
);
48 fvd_load_compact_cancel (acb
);
51 case OP_STORE_COMPACT
:
52 fvd_store_compact_cancel (acb
);
56 fvd_wrapper_cancel (acb
);
60 fvd_flush_cancel (acb
);
66 static inline void finish_flush (FvdAIOCB
* acb
)
68 QDEBUG ("FLUSH: acb%llu-%p finish_flush ret=%d\n",
69 acb
->uuid
, acb
, acb
->flush
.ret
);
70 acb
->common
.cb (acb
->common
.opaque
, acb
->flush
.ret
);
71 my_qemu_aio_unref (acb
);
74 static void finish_flush_data (void *opaque
, int ret
)
76 FvdAIOCB
*acb
= opaque
;
78 QDEBUG ("FLUSH: acb%llu-%p finish_flush_data ret=%d\n",
81 if (acb
->flush
.ret
== 0) {
85 acb
->flush
.data_acb
= NULL
;
86 acb
->flush
.num_finished
++;
87 if (acb
->flush
.num_finished
== 2) {
92 static void finish_flush_metadata (void *opaque
, int ret
)
94 FvdAIOCB
*acb
= opaque
;
96 QDEBUG ("FLUSH: acb%llu-%p finish_flush_metadata ret=%d\n",
99 if (acb
->flush
.ret
== 0) {
100 acb
->flush
.ret
= ret
;
103 acb
->flush
.metadata_acb
= NULL
;
104 acb
->flush
.num_finished
++;
105 if (acb
->flush
.num_finished
== 2) {
110 static BlockDriverAIOCB
*fvd_aio_flush (BlockDriverState
* bs
,
111 BlockDriverCompletionFunc
* cb
, void *opaque
)
113 BDRVFvdState
*s
= bs
->opaque
;
114 if (s
->fvd_data
== s
->fvd_metadata
) {
115 return bdrv_aio_flush (s
->fvd_metadata
, cb
, opaque
);
118 FvdAIOCB
*acb
= my_qemu_aio_get (&fvd_aio_pool
, bs
, cb
, opaque
);
123 acb
->type
= OP_FLUSH
;
124 acb
->flush
.num_finished
= 0;
126 acb
->flush
.data_acb
= bdrv_aio_flush (s
->fvd_data
, finish_flush_data
, acb
);
127 if (!acb
->flush
.data_acb
) {
128 my_qemu_aio_unref (acb
);
132 acb
->flush
.metadata_acb
= bdrv_aio_flush (s
->fvd_metadata
,
133 finish_flush_metadata
, acb
);
134 if (!acb
->flush
.metadata_acb
) {
135 bdrv_aio_cancel (acb
->flush
.data_acb
);
136 my_qemu_aio_unref (acb
);
140 QDEBUG ("FLUSH: acb%llu-%p start\n", acb
->uuid
, acb
);
144 static int fvd_flush (BlockDriverState
* bs
)
146 BDRVFvdState
*s
= bs
->opaque
;
149 QDEBUG ("fvd_flush() invoked\n");
152 if ((ret
= bdrv_flush (s
->fvd_data
))) {
156 if (s
->fvd_metadata
== s
->fvd_data
) {
160 return bdrv_flush (s
->fvd_metadata
);
163 static void fvd_close (BlockDriverState
* bs
)
165 BDRVFvdState
*s
= bs
->opaque
;
169 if (s
->prefetch_state
== PREFETCH_STATE_RUNNING
) {
170 s
->prefetch_state
= PREFETCH_STATE_DISABLED
;
172 if (s
->prefetch_timer
) {
173 timer_del(s
->prefetch_timer
);
174 timer_free(s
->prefetch_timer
);
175 s
->prefetch_timer
= NULL
;
178 /* Clean up prefetch operations. */
179 if (s
->prefetch_acb
) {
180 for (i
= 0; i
< s
->num_prefetch_slots
; i
++) {
181 if (s
->prefetch_acb
[i
] != NULL
) {
182 acb
= s
->prefetch_acb
[i
];
183 if (acb
->copy
.hd_acb
) {
184 bdrv_aio_cancel (acb
->copy
.hd_acb
);
186 my_qemu_vfree (s
->prefetch_acb
[i
]->copy
.buf
);
187 my_qemu_aio_unref (s
->prefetch_acb
[i
]);
188 s
->prefetch_acb
[i
] = NULL
;
191 my_qemu_free (s
->prefetch_acb
);
192 s
->prefetch_acb
= NULL
;
195 flush_metadata_to_disk_on_exit (bs
);
197 if (s
->stale_bitmap
) {
198 my_qemu_vfree (s
->stale_bitmap
);
199 if (s
->fresh_bitmap
!= s
->stale_bitmap
) {
200 my_qemu_vfree (s
->fresh_bitmap
);
202 s
->stale_bitmap
= NULL
;
203 s
->fresh_bitmap
= NULL
;
207 my_qemu_vfree (s
->table
);
211 if (s
->fvd_metadata
) {
212 if (s
->fvd_metadata
!= s
->fvd_data
) {
213 bdrv_unref(s
->fvd_metadata
);
215 s
->fvd_metadata
= NULL
;
218 bdrv_unref(s
->fvd_data
);
222 if (s
->add_storage_cmd
) {
223 my_qemu_free (s
->add_storage_cmd
);
224 s
->add_storage_cmd
= NULL
;
227 dump_resource_summary (s
);
231 static int fvd_probe (const uint8_t * buf
, int buf_size
, const char *filename
)
233 const FvdHeader
*header
= (const void *) buf
;
235 if (buf_size
>= 2 * sizeof (uint32_t)
236 && le32_to_cpu (header
->magic
) == FVD_MAGIC
237 && le32_to_cpu (header
->version
) == FVD_VERSION
) {
/*
 * fvd_get_block_status: determine the status of the sectors starting at
 * sector_num and store in *pnum how many consecutive sectors share it.
 *
 * Two paths are visible:
 *  - If prefetching has finished, or sector_num lies at or beyond
 *    s->nb_sectors_in_base_img, or the fresh bitmap does not show the sector
 *    in the base image, the answer comes from the FVD data side: either
 *    bdrv_is_allocated() on s->fvd_data at s->data_offset + sector_num, or a
 *    walk over the chunk table s->table.  The walk compares IS_EMPTY() of
 *    each later chunk with that of the first chunk while accumulating a
 *    sector count (partial first chunk, whole middle chunks, partial or whole
 *    last chunk).
 *  - Otherwise the fresh bitmap is scanned forward from sector_num + 1, not
 *    going past the end of the request or of the base image, and *pnum is
 *    set to next - sector_num.
 *
 * NOTE(review): the condition choosing between bdrv_is_allocated() and the
 * table walk, the loop-advance statements and every return statement other
 * than the bdrv_is_allocated() call are not visible in this block -- confirm
 * the returned values before relying on this description.
 */
244 static int64_t fvd_get_block_status(BlockDriverState
* bs
, int64_t sector_num
,
245 int nb_sectors
, int *pnum
)
247 BDRVFvdState
*s
= bs
->opaque
;
249 if (s
->prefetch_state
== PREFETCH_STATE_FINISHED
250 || sector_num
>= s
->nb_sectors_in_base_img
251 || !fresh_bitmap_show_sector_in_base_img (sector_num
, s
)) {
252 /* For the three cases that data may be saved in the FVD data file, we
253 * still need to check the underlying storage because those data could
254 * be holes in a sparse image, due to the optimization of "free write
255 * to zero-filled blocks". See Section 3.3.3 of the FVD-cow paper.
256 * This also covers the case of no base image. */
259 return bdrv_is_allocated (s
->fvd_data
, s
->data_offset
+ sector_num
,
263 /* Use the table to figure it out. */
264 int64_t first_chunk
= sector_num
/ s
->chunk_size
;
265 int64_t last_chunk
= (sector_num
+ nb_sectors
- 1) / s
->chunk_size
;
266 int allocated
= !IS_EMPTY (s
->table
[first_chunk
]);
269 if (first_chunk
== last_chunk
) {
270 /* All data in one chunk. */
275 /* Data in the first chunk. */
276 count
= s
->chunk_size
- (sector_num
% s
->chunk_size
);
280 while (first_chunk
< last_chunk
) {
281 if ((allocated
&& IS_EMPTY (s
->table
[first_chunk
]))
282 || (!allocated
&& !IS_EMPTY (s
->table
[first_chunk
]))) {
287 count
+= s
->chunk_size
;
291 /* Data in the last chunk. */
292 if ((allocated
&& !IS_EMPTY (s
->table
[last_chunk
]))
293 || (!allocated
&& IS_EMPTY (s
->table
[last_chunk
]))) {
294 int nb
= (sector_num
+ nb_sectors
) % s
->chunk_size
;
295 count
+= nb
? nb
: s
->chunk_size
;
302 /* Use the FVD metadata to find out sectors in the base image. */
303 int64_t end
= sector_num
+ nb_sectors
;
304 if (end
> s
->nb_sectors_in_base_img
) {
305 end
= s
->nb_sectors_in_base_img
;
308 int64_t next
= sector_num
+ 1;
309 while (next
< end
&& fresh_bitmap_show_sector_in_base_img (next
, s
)) {
313 *pnum
= next
- sector_num
;
/*
 * Print the usage message of the 'update' command to stdout.
 */
static void update_usage (void)
{
    /* The two literals are concatenated by the compiler, so the separating
     * space after "of" has to be part of the first one. */
    printf ("Usage: update <image_file> [attribute=val]\n See outputs of "
            "the 'info' command for all available attributes.\n");
}
/*
 * fvd_get_info: read the FVD header with read_fvd_header() and print its
 * fields to stdout, one "name<TAB...>value" line per field, framed by the
 * "Begin of FVD specific information" / "End of FVD specific information"
 * banner lines.
 *
 *  - data_file and data_file_fmt are printed only when non-empty.
 *  - The base-image section starts with a test of header.base_img[0].
 *  - The compact-image section starts with a test of header.compact_image.
 *    data_storage and used_storage are taken from the in-memory state and
 *    multiplied by 512 (presumably sectors to bytes -- TODO confirm).  The
 *    table size is recomputed as one uint32_t entry per chunk of
 *    s->virtual_disk_size, rounded up to DEF_PAGE_SIZE.
 *  - add_storage_cmd is printed only when non-empty.
 *  - The journal fields follow a test of header.journal_size > 0.
 *
 * Afterwards bdi->cluster_size and bdi->vm_state_offset are set to 0.
 *
 * NOTE(review): the declaration of 'header', the action taken when
 * read_fvd_header() fails, the closing braces of the conditional sections
 * and the final return are not visible in this block -- confirm them.
 */
323 static int fvd_get_info (BlockDriverState
* bs
, BlockDriverInfo
* bdi
)
325 BDRVFvdState
*s
= bs
->opaque
;
328 if (read_fvd_header (s
, &header
) < 0) {
332 printf ("========= Begin of FVD specific information ==================\n");
333 printf ("magic\t\t\t\t\t\t%0X\n", header
.magic
);
334 printf ("version\t\t\t\t\t\t%d\n", header
.version
);
335 printf ("virtual_disk_size (bytes)\t\t\t%" PRId64
"\n",
336 header
.virtual_disk_size
);
337 printf ("disk_metadata_size (bytes)\t\t\t%" PRId64
"\n",
338 header
.metadata_size
);
339 if (header
.data_file
[0]) {
340 printf ("data_file\t\t\t\t\t%s\n", header
.data_file
);
342 if (header
.data_file_fmt
[0]) {
343 printf ("data_file_fmt\t\t\t\t%s\n", header
.data_file_fmt
);
346 if (header
.base_img
[0] != 0) {
347 printf ("base_img\t\t\t\t\t%s\n", header
.base_img
);
348 printf ("all_data_in_fvd_img\t\t\t\t%s\n",
349 BOOL (header
.all_data_in_fvd_img
));
350 printf ("base_img_size (bytes)\t\t\t\t%" PRId64
"\n",
351 header
.base_img_size
);
352 printf ("bitmap_offset (bytes)\t\t\t\t%" PRId64
"\n",
353 header
.bitmap_offset
);
354 printf ("bitmap_size (bytes)\t\t\t\t%" PRId64
"\n", header
.bitmap_size
);
355 printf ("prefetch_profile_offset (bytes)\t\t\t%" PRId64
"\n",
356 header
.prefetch_profile_offset
);
357 printf ("prefetch_profile_entries\t\t\t%" PRId64
"\n",
358 header
.prefetch_profile_entries
);
359 printf ("prefetch_profile_entry_len_unit\t\t\t%d\n",
360 header
.unit_of_PrefetchProfileEntry_len
);
361 printf ("block_size\t\t\t\t\t%d\n", header
.block_size
);
362 printf ("copy_on_read\t\t\t\t\t%s\n", BOOL (header
.copy_on_read
));
363 printf ("max_outstanding_copy_on_read_data (bytes)\t%" PRId64
"\n",
364 header
.max_outstanding_copy_on_read_data
);
365 printf ("prefetch_start_delay (sec)\t\t\t%d\n",
366 header
.prefetch_start_delay
);
367 printf ("profile_directed_prefetch_start_delay (sec)\t%d\n",
368 header
.profile_directed_prefetch_start_delay
);
369 printf ("max_num_outstanding_prefetch_writes\t\t%d\n",
370 header
.num_prefetch_slots
);
371 printf ("bytes_per_prefetch\t\t\t\t%d\n", header
.bytes_per_prefetch
);
372 printf ("prefetch_over_threshold_throttle_time (ms)\t%d\n",
373 header
.prefetch_throttle_time
);
374 printf ("prefetch_read_throughput_measure_time (ms)\t%d\n",
375 header
.prefetch_read_throughput_measure_time
);
376 printf ("prefetch_write_throughput_measure_time (ms)\t%d\n",
377 header
.prefetch_write_throughput_measure_time
);
378 printf ("prefetch_min_read_throughput_threshold (KB/s)\t%d\n",
379 header
.prefetch_min_read_throughput
);
380 printf ("prefetch_min_write_throughput_threshold (KB/s)\t%d\n",
381 header
.prefetch_min_write_throughput
);
382 printf ("prefetch_max_read_throughput_threshold (KB/s)\t%d\n",
383 header
.prefetch_max_read_throughput
);
384 printf ("prefetch_max_write_throughput_threshold (KB/s)\t%d\n",
385 header
.prefetch_max_write_throughput
);
386 printf ("prefetch_perf_calc_alpha\t\t\t%d\n",
387 header
.prefetch_perf_calc_alpha
);
388 printf ("generate_prefetch_profile\t\t\t%s\n",
389 BOOL (header
.generate_prefetch_profile
));
392 printf ("need_zero_init\t\t\t\t\t%s\n", BOOL (header
.need_zero_init
));
393 printf ("compact_image\t\t\t\t\t%s\n", BOOL (header
.compact_image
));
394 if (header
.compact_image
) {
395 printf ("data_storage (bytes)\t\t\t\t%" PRId64
"\n",
396 s
->data_storage
* 512);
397 printf ("chunk_size (bytes)\t\t\t\t%" PRId64
"\n", header
.chunk_size
);
398 printf ("used_chunks (bytes)\t\t\t\t%" PRId64
"\n",
399 s
->used_storage
* 512);
400 printf ("storage_grow_unit (bytes)\t\t\t%" PRId64
"\n",
401 header
.storage_grow_unit
);
402 printf ("table_offset (bytes)\t\t\t\t%" PRId64
"\n",
403 header
.table_offset
);
404 int64_t vsize
= ROUND_UP (s
->virtual_disk_size
, s
->chunk_size
* 512);
405 int table_entries
= vsize
/ (s
->chunk_size
* 512);
406 int64_t table_size
= sizeof (uint32_t) * table_entries
;
407 table_size
= ROUND_UP (table_size
, DEF_PAGE_SIZE
);
408 printf ("table_size (bytes)\t\t\t\t%" PRId64
"\n", table_size
);
410 if (header
.add_storage_cmd
[0] != 0) {
411 printf ("add_storage_cmd\t\t\t\t\t%s\n", header
.add_storage_cmd
);
414 printf ("clean_shutdown\t\t\t\t\t%s\n", BOOL (header
.clean_shutdown
));
415 if (header
.journal_size
> 0) {
416 printf ("journal_offset\t\t\t\t\t%" PRId64
"\n", header
.journal_offset
);
417 printf ("journal_size\t\t\t\t\t%" PRId64
"\n", header
.journal_size
);
419 printf ("========= End of FVD specific information ====================\n");
421 bdi
->cluster_size
= 0;
422 bdi
->vm_state_offset
= 0;
426 static int fvd_has_zero_init (BlockDriverState
* bs
)
428 BDRVFvdState
*s
= bs
->opaque
;
429 return bdrv_has_zero_init (s
->fvd_data
);
432 static int fvd_update (BlockDriverState
* bs
, int argc
, char **argv
)
434 BDRVFvdState
*s
= bs
->opaque
;
443 if (strcmp (argv
[0], "-h") == 0 || strcmp (argv
[0], "--help") == 0
444 || strcmp (argv
[0], "-o") == 0) {
449 read_fvd_header (s
, &header
);
451 for (i
= 0; i
< argc
; i
++) {
452 char *attr
= argv
[i
];
453 char *val
= strchr (attr
, '=');
455 fprintf (stderr
, "Error: string '%s' is not in the format of "
456 "'attribute=val' without spaces.\n", attr
);
462 if (strcmp (attr
, "size") == 0) {
464 new_size
= atoll (val
);
465 int len
= strlen (val
);
466 if (val
[len
- 1] == 'G') {
467 new_size
*= ((int64_t) 1024) * 1024 * 1024;
468 } else if (val
[len
- 1] == 'M') {
469 new_size
*= ((int64_t) 1024) * 1024;
470 } else if (val
[len
- 1] == 'K') {
471 new_size
*= ((int64_t) 1024);
472 } else if (val
[len
- 1] == 'B') {
473 /* No change to new_size as it is already in bytes. */
475 /* If no unit is specified, the default unit is KB. */
476 new_size
*= ((int64_t) 1024);
480 fprintf (stderr
, "Error: size %s is not positive.\n", val
);
484 new_size
= ROUND_UP (new_size
, 512);
485 if (new_size
< header
.virtual_disk_size
) {
486 printf ("Warning: image's new size %" PRId64
487 " is smaller than the original size %" PRId64
488 ". Some image data will be truncated.\n",
489 new_size
, header
.virtual_disk_size
);
491 header
.virtual_disk_size
= new_size
;
492 printf ("Image resized to %" PRId64
" bytes.\n", new_size
);
493 } else if (strcmp (attr
, "base_img") == 0) {
494 if (strlen (val
) > 1023) {
495 fprintf (stderr
, "Error: the new base image name is longer "
496 "than 1023, which is not allowed.\n");
500 memset (header
.base_img
, 0, 1024);
501 pstrcpy (header
.base_img
, 1024, val
);
502 printf ("Backing file updated to '%s'.\n", val
);
503 } else if (strcmp (attr
, "data_file") == 0) {
504 if (strlen (val
) > 1023) {
505 fprintf (stderr
, "Error: the new data file name is longer "
506 "than 1023, which is not allowed.\n");
510 memset (header
.data_file
, 0, 1024);
511 pstrcpy (header
.data_file
, 1024, val
);
512 printf ("Data file updated to '%s'.\n", val
);
513 } else if (strcmp (attr
, "need_zero_init") == 0) {
514 if (strcasecmp (val
, "true") == 0 || strcasecmp (val
, "on") == 0) {
515 header
.need_zero_init
= TRUE
;
516 printf ("need_zero_init is turned on for this disk.\n");
518 header
.need_zero_init
= FALSE
;
519 printf ("need_zero_init is turned off for this disk.\n");
521 } else if (strcmp (attr
, "copy_on_read") == 0) {
522 if (strcasecmp (val
, "true") == 0 || strcasecmp (val
, "on") == 0) {
523 header
.copy_on_read
= TRUE
;
524 printf ("Copy on read is enabled for this disk.\n");
526 header
.copy_on_read
= FALSE
;
527 printf ("Copy on read is disabled for this disk.\n");
529 } else if (strcmp (attr
, "clean_shutdown") == 0) {
530 if (strcasecmp (val
, "true") == 0 || strcasecmp (val
, "on") == 0) {
531 header
.clean_shutdown
= TRUE
;
532 printf ("clean_shutdown is manually set to true\n");
534 header
.clean_shutdown
= FALSE
;
535 printf ("clean_shutdown is manually set to false\n");
537 } else if (strcmp (attr
, "max_outstanding_copy_on_read_data") == 0) {
538 header
.max_outstanding_copy_on_read_data
= atoll (val
);
539 if (header
.max_outstanding_copy_on_read_data
<= 0) {
540 fprintf (stderr
, "Error: max_outstanding_copy_on_read_data "
541 "must be positive while the provided value is %"
543 header
.max_outstanding_copy_on_read_data
);
546 printf ("max_outstanding_copy_on_read_data updated to %" PRId64
547 ".\n", header
.max_outstanding_copy_on_read_data
);
548 } else if (strcmp (attr
, "prefetch_start_delay") == 0) {
549 header
.prefetch_start_delay
= atoi (val
);
550 if (header
.prefetch_start_delay
>= 0) {
551 printf ("Prefetch starting delay updated to %d seconds.\n",
552 header
.prefetch_start_delay
);
555 printf ("Prefetch starting delay updated to %d seconds. "
556 "Because of the negative value, prefetching is "
557 "disabled for this image.\n",
558 header
.prefetch_start_delay
);
560 } else if (strcmp (attr
, "max_num_outstanding_prefetch_writes") == 0) {
561 header
.num_prefetch_slots
= atoi (val
);
562 if (header
.num_prefetch_slots
< 1) {
563 fprintf (stderr
, "Error: max_num_outstanding_prefetch_writes "
564 "%d is not a positive integer.\n",
565 header
.num_prefetch_slots
);
568 printf ("max_num_outstanding_prefetch_writes updated to %d.\n",
569 header
.num_prefetch_slots
);
570 } else if (strcmp (attr
, "bytes_per_prefetch") == 0) {
571 header
.bytes_per_prefetch
= atoi (val
);
572 if (header
.bytes_per_prefetch
< DEF_PAGE_SIZE
) {
573 fprintf (stderr
, "Error: bytes_per_prefetch cannot be smaller "
574 "than %d.\n", DEF_PAGE_SIZE
);
577 printf ("bytes_per_prefetch updated to %d.\n",
578 header
.bytes_per_prefetch
);
579 } else if (strcmp (attr
, "prefetch_min_read_throughput_threshold")==0) {
580 header
.prefetch_min_read_throughput
= atoi (val
);
581 printf ("prefetch_min_read_throughput_threshold updated to %d "
582 "KB/s\n", header
.prefetch_min_read_throughput
);
583 } else if (strcmp (attr
,"prefetch_min_write_throughput_threshold")==0) {
584 header
.prefetch_min_write_throughput
= atoi (val
);
585 printf ("prefetch_min_write_throughput_threshold updated to %d "
586 "KB/s\n", header
.prefetch_min_write_throughput
);
587 } else if (strcmp (attr
, "prefetch_perf_calc_alpha") == 0) {
588 header
.prefetch_perf_calc_alpha
= atoi (val
);
589 printf ("prefetch_perf_calc_alpha updated to %d\n",
590 header
.prefetch_perf_calc_alpha
);
591 } else if (strcmp (attr
, "prefetch_read_throughput_measure_time")==0) {
592 header
.prefetch_read_throughput_measure_time
= atoi (val
);
593 printf ("prefetch_read_throughput_measure_time updated to %d ms\n",
594 header
.prefetch_read_throughput_measure_time
);
595 } else if (strcmp (attr
, "prefetch_write_throughput_measure_time")==0) {
596 header
.prefetch_write_throughput_measure_time
= atoi (val
);
597 printf ("prefetch_write_throughput_measure_time updated to %d ms\n",
598 header
.prefetch_write_throughput_measure_time
);
599 } else if (strcmp (attr
, "prefetch_over_threshold_throttle_time")==0) {
600 header
.prefetch_throttle_time
= atoi (val
);
601 if (header
.prefetch_throttle_time
> 0) {
602 printf ("prefetch_over_threshold_throttle_time updated to %d "
603 "milliseconds.\n", header
.prefetch_throttle_time
);
605 printf ("prefetch_over_threshold_throttle_time updated to %d "
606 "milliseconds. It is not positive and hence no "
607 "throttling will be applied to prefetch.\n",
608 header
.prefetch_throttle_time
);
611 fprintf (stderr
, "Error: unknown setting '%s=%s'\n", attr
, val
);
616 update_fvd_header (s
, &header
);