2 * Copyright (c) 2010-2011 IBM
5 * Chunqiang Tang <ctang@us.ibm.com>
7 * This work is licensed under the terms of the GNU GPL, version 2.
8 * See the COPYING file in the top-level directory.
11 /*=============================================================================
12 * A short description: this module implements bdrv_file_open() for FVD.
13 *============================================================================*/
15 static void init_prefetch_timer (BlockDriverState
* bs
, BDRVFvdState
* s
);
16 static int init_data_file (BDRVFvdState
* s
, FvdHeader
* header
, int flags
);
17 static int init_bitmap (BlockDriverState
* bs
, BDRVFvdState
* s
,
18 FvdHeader
* header
, const char *const filename
);
19 static int load_table (BDRVFvdState
* s
, FvdHeader
* header
,
20 const char *const filename
);
21 static int init_journal (int read_only
, BlockDriverState
* bs
,
23 static int init_compact_image (BDRVFvdState
* s
, FvdHeader
* header
,
24 const char *const filename
);
26 static QemuOptsList runtime_opts
= {
28 .head
= QTAILQ_HEAD_INITIALIZER(runtime_opts
.head
),
32 .type
= QEMU_OPT_STRING
,
33 .help
= "File name of the image",
39 static int fvd_open(BlockDriverState
* bs
, QDict
*options
, int flags
,
42 BDRVFvdState
*s
= bs
->opaque
;
47 Error
*local_err
= NULL
;
50 QemuOpts
*opts
= qemu_opts_create_nofail(&runtime_opts
);
51 qemu_opts_absorb_qdict(opts
, options
, &local_err
);
52 if (error_is_set(&local_err
)) {
53 qerror_report_err(local_err
);
54 error_free(local_err
);
58 filename
= qemu_opt_get(opts
, "filename");
60 const char * protocol
= strchr (filename
, ':');
62 drv
= bdrv_find_protocol (filename
, true);
63 filename
= protocol
+ 1;
66 /* Use "raw" instead of "file" to allow storing the image on device. */
67 drv
= bdrv_find_format ("raw");
69 fprintf (stderr
, "Failed to find the block device driver\n");
74 s
->fvd_metadata
= bdrv_new ("");
75 ret
= bdrv_open(s
->fvd_metadata
, filename
, NULL
, flags
, drv
, &local_err
);
77 qerror_report_err(local_err
);
78 error_free(local_err
);
82 /* Initialize so that jumping to 'fail' would do cleanup properly. */
83 s
->stale_bitmap
= NULL
;
84 s
->fresh_bitmap
= NULL
;
86 s
->outstanding_copy_on_read_data
= 0;
87 QLIST_INIT (&s
->write_locks
);
88 QLIST_INIT (&s
->copy_locks
);
89 QLIST_INIT (&s
->wait_for_journal
);
90 s
->ongoing_journal_updates
= 0;
91 s
->prefetch_acb
= NULL
;
92 s
->add_storage_cmd
= NULL
;
94 s
->total_copy_on_read_data
= s
->total_prefetch_data
= 0;
97 if (bdrv_pread (s
->fvd_metadata
, 0, &header
, sizeof (header
)) !=
99 fprintf (stderr
, "Failed to read the header of %s\n", filename
);
103 fvd_header_le_to_cpu (&header
);
105 if (header
.magic
!= FVD_MAGIC
|| header
.version
!= FVD_VERSION
) {
106 fprintf (stderr
, "Incorrect magic number in the header of %s: "
107 "magic=%0X version=%d expect_magic=%0X expect_version=%d\n",
108 filename
, header
.magic
, header
.version
, FVD_MAGIC
,
112 if (header
.virtual_disk_size
% 512 != 0) {
113 fprintf (stderr
, "Disk size %"PRId64
" in the header of %s is not "
114 "a multple of 512.\n", header
.virtual_disk_size
, filename
);
118 /* Initialize the fields of BDRVFvdState. */
119 s
->dirty_image
= FALSE
;
120 s
->block_size
= header
.block_size
/ 512;
121 s
->bitmap_size
= header
.bitmap_size
;
122 s
->prefetch_error
= FALSE
;
123 s
->prefetch_timer
= NULL
;
124 s
->sectors_per_prefetch
= (header
.bytes_per_prefetch
+ 511) / 512;
125 s
->prefetch_throttle_time
= header
.prefetch_throttle_time
;
126 s
->prefetch_perf_calc_alpha
= header
.prefetch_perf_calc_alpha
/ 100.0;
127 s
->prefetch_read_throughput_measure_time
=
128 header
.prefetch_read_throughput_measure_time
;
129 s
->prefetch_write_throughput_measure_time
=
130 header
.prefetch_write_throughput_measure_time
;
132 /* Convert KB/s to bytes/millisec. */
133 s
->prefetch_min_read_throughput
=
134 ((double) header
.prefetch_min_read_throughput
) * 1024.0 / 1000.0;
135 s
->prefetch_min_write_throughput
=
136 ((double) header
.prefetch_min_write_throughput
) * 1024.0 / 1000.0;
138 if (header
.base_img
[0] != 0 && s
->sectors_per_prefetch
%s
->block_size
!= 0) {
139 fprintf (stderr
, "sectors_per_prefetch (%d) is not a multiple of "
141 s
->sectors_per_prefetch
* 512, s
->block_size
* 512);
143 s
->max_outstanding_copy_on_read_data
=
144 header
.max_outstanding_copy_on_read_data
;
145 if (s
->max_outstanding_copy_on_read_data
< header
.block_size
* 2) {
146 s
->max_outstanding_copy_on_read_data
= header
.block_size
;
149 if (header
.num_prefetch_slots
< 1) {
150 s
->num_prefetch_slots
= 1;
152 s
->num_prefetch_slots
= header
.num_prefetch_slots
;
155 /* No prefetching in a qemu tool. */
156 s
->prefetch_start_delay
= -1;
158 #ifndef SIMULATED_TEST_WITH_QEMU_IO
159 s
->copy_on_read
= FALSE
; /* No prefetching in a qemu tool. */
161 /* But allow debugging copy_on_read in qemu-io if configured. */
162 s
->copy_on_read
= header
.copy_on_read
;
165 s
->prefetch_start_delay
= header
.prefetch_start_delay
;
166 s
->copy_on_read
= header
.copy_on_read
;
168 s
->virtual_disk_size
= header
.virtual_disk_size
;
169 s
->bitmap_offset
= header
.bitmap_offset
/ 512;
170 s
->nb_sectors_in_base_img
= header
.base_img_size
/ 512;
171 bs
->total_sectors
= s
->virtual_disk_size
/ 512;
173 if (init_data_file (s
, &header
, flags
)) {
177 if (init_bitmap (bs
, s
, &header
, filename
)) {
181 if (load_table (s
, &header
, filename
)) {
185 const int read_only
= !(flags
& BDRV_O_RDWR
);
186 if (init_journal (read_only
, bs
, &header
)) {
190 /* This must be done after init_journal() because it may use metadata
191 * recovered from the journal. */
192 if (init_compact_image (s
, &header
, filename
)) {
197 /* This flag will be cleaned later when the image is shut down
199 update_clean_shutdown_flag (s
, FALSE
);
201 init_prefetch_timer (bs
, s
);
203 QDEBUG ("copy_on_read=%s block_size=%d journal_size=%" PRId64
204 " prefetching_delay=%d prefetch_slots=%d "
205 "prefetch_read_threshold_KB=%.0lf "
206 "prefetch_write_threshold_KB=%.0lf "
207 "prefetch_throttle_time=%d bytes_per_prefetch=%d "
208 "max_outstanding_copy_on_read_data=%"PRId64
"\n",
209 BOOL (s
->copy_on_read
), s
->block_size
* 512,
210 s
->journal_size
* 512, s
->prefetch_start_delay
,
211 s
->num_prefetch_slots
,
212 s
->prefetch_min_read_throughput
* 1000.0 / 1024.0,
213 s
->prefetch_min_write_throughput
* 1000.0 / 1024.0,
214 s
->prefetch_throttle_time
, s
->sectors_per_prefetch
* 512,
215 s
->max_outstanding_copy_on_read_data
);
220 fprintf (stderr
, "Failed to open %s using the FVD format.\n", filename
);
225 static int load_table (BDRVFvdState
* s
, FvdHeader
* header
,
226 const char *const filename
)
228 if (!header
->compact_image
) {
232 /* Initialize the table. */
233 s
->table_offset
= header
->table_offset
/ 512;
234 s
->chunk_size
= header
->chunk_size
/ 512;
235 int64_t vsize
= header
->virtual_disk_size
+ header
->chunk_size
- 1;
236 int table_entries
= vsize
/ header
->chunk_size
;
237 int64_t table_size
= sizeof (uint32_t) * table_entries
;
238 table_size
= ROUND_UP (table_size
, DEF_PAGE_SIZE
);
239 s
->table
= my_qemu_blockalign (s
->fvd_metadata
, (size_t) table_size
);
241 if (bdrv_pread (s
->fvd_metadata
, header
->table_offset
, s
->table
, table_size
)
243 fprintf (stderr
, "Failed to read the table of %s\n", filename
);
250 static int init_compact_image (BDRVFvdState
* s
, FvdHeader
* header
,
251 const char *const filename
)
253 if (!header
->compact_image
) {
254 s
->data_region_prepared
= FALSE
;
258 /* Scan the table to find the max allocated chunk. */
260 uint32_t max_chunk
= 0;
261 int empty_disk
= TRUE
;
263 (int) (ROUND_UP (header
->virtual_disk_size
, header
->chunk_size
) /
265 for (i
= 0; i
< table_entries
; i
++) {
266 if (!IS_EMPTY (s
->table
[i
])) {
268 uint32_t id
= READ_TABLE (s
->table
[i
]);
269 if (id
> max_chunk
) {
277 s
->used_storage
= max_chunk
* s
->chunk_size
;
278 s
->storage_grow_unit
= header
->storage_grow_unit
/ 512;
280 /* Check if the image is directly stored on a raw device, including
281 * logical volume. If so, figure out the size of the device. */
282 struct stat stat_buf
;
283 if (stat (filename
, &stat_buf
) != 0) {
284 fprintf (stderr
, "Failed to stat() %s\n", filename
);
288 /* Check how much storage space is already allocated. */
289 int64_t size
= bdrv_getlength (s
->fvd_data
);
291 fprintf (stderr
, "Failed in bdrv_getlength(%s)\n", filename
);
294 const int64_t min_size
= (s
->data_offset
+ s
->used_storage
) * 512;
295 if (size
< min_size
) {
296 fprintf (stderr
, "The size of device %s is not even big enough to "
297 "store already allocated data.\n",
302 if (S_ISBLK (stat_buf
.st_mode
) || S_ISCHR (stat_buf
.st_mode
)) {
303 /* Initialize the command to grow storage space. */
305 if (header
->add_storage_cmd
[0] == 0) {
306 s
->add_storage_cmd
= NULL
;
308 if (strcmp (header
->add_storage_cmd
, "builtin:lvextend") == 0) {
309 /* Note the following:
310 * 1. lvextend may generate warning messages like "File
311 * descriptor...leaked...", which is fine. See the
312 * following from LVM manual: "On invocation, lvm requires
313 * that only the standard file descriptors stdin,
314 * stdout and stderr are available. If others are
315 * found, they get closed and messages are issued warning
317 * 2. Instead of using the lvextend command line, one
318 * option is to use liblvm directly, which avoids creating
319 * a process to resize a LV.
320 * 3. On Ubuntu, /bin/sh is linked to /bin/dash, which
321 * does not support ">&" for stdout and stderr
323 snprintf (cmd
, sizeof (cmd
) - 1, "/sbin/lvextend -L+%" PRId64
324 "B %s >/dev/null 2>/dev/null",
325 header
->storage_grow_unit
,
326 header
->data_file
[0] ? header
->data_file
: filename
);
328 snprintf (cmd
, sizeof (cmd
) - 1, "%s %" PRId64
329 " %s >/dev/null 2>/dev/null",
330 header
->add_storage_cmd
, header
->storage_grow_unit
,
331 header
->data_file
[0] ? header
->data_file
: filename
);
334 int len
= strlen (cmd
);
335 s
->add_storage_cmd
= my_qemu_malloc (len
+ 1);
336 memcpy (s
->add_storage_cmd
, cmd
, len
+ 1);
340 s
->data_storage
= size
/ 512 - s
->data_offset
;
341 s
->fvd_data
->growable
= TRUE
;
342 s
->data_region_prepared
= TRUE
;
347 static int init_data_file (BDRVFvdState
* s
, FvdHeader
* header
, int flags
)
349 Error
*local_err
= NULL
;
352 if (header
->data_file
[0]) {
353 /* Open a separate data file. */
355 s
->fvd_data
= bdrv_new ("");
357 fprintf (stderr
, "Failed to create a new block device driver.\n");
361 if (header
->data_file_fmt
[0] == 0) {
362 ret
= bdrv_open(s
->fvd_data
, header
->data_file
, NULL
, flags
, NULL
,
365 BlockDriver
*data_drv
= bdrv_find_format (header
->data_file_fmt
);
367 fprintf (stderr
, "Failed to find driver for image format "
368 "'%s' of data file %s\n",
369 header
->data_file_fmt
, header
->data_file
);
372 ret
= bdrv_open(s
->fvd_data
, header
->data_file
,
373 NULL
, flags
, data_drv
, &local_err
);
376 qerror_report_err(local_err
);
377 error_free(local_err
);
381 s
->data_offset
= header
->metadata_size
/ 512; /* In sectors. */
382 s
->fvd_data
= s
->fvd_metadata
;
385 if (header
->need_zero_init
&& !bdrv_has_zero_init (s
->fvd_data
)) {
387 /* Only give a warning to allow 'qemu-img update' to modify
388 * need_zero_init if the user manually zero-init the device. */
389 fprintf (stderr
, "Warning: image needs zero_init but it is not "
390 "supported by the storage media.\n");
392 fprintf (stderr
, "Error: image needs zero_init but it is not "
393 "supported by the storage media.\n");
401 static int init_bitmap (BlockDriverState
* bs
, BDRVFvdState
* s
,
402 FvdHeader
* header
, const char *const filename
)
404 if (header
->all_data_in_fvd_img
) {
405 /* This also covers the case of no base image. */
406 s
->prefetch_state
= PREFETCH_STATE_FINISHED
;
407 s
->copy_on_read
= FALSE
;
408 s
->prefetch_start_delay
= -1;
410 if (bs
->backing_file
[0] != 0) {
411 /* No need to use the base image. It may operate without problem
412 * even if the base image is no longer accessible. */
413 bs
->backing_file
[0] = 0;
416 ASSERT (header
->base_img
[0] != 0);
417 pstrcpy (bs
->backing_file
, 1024, header
->base_img
);
418 const int flags
= O_RDONLY
| O_BINARY
| O_LARGEFILE
;
419 int test_backing_fd
= open (bs
->backing_file
, flags
);
420 if (test_backing_fd
< 0) {
421 fprintf (stderr
, "Failed to open the base image %s for read.\n",
425 close (test_backing_fd
);
427 /* This will be enabled in fvd_init_prefetch() after a timer expires. */
428 s
->prefetch_state
= PREFETCH_STATE_DISABLED
;
430 s
->stale_bitmap
= my_qemu_blockalign (s
->fvd_metadata
,
431 (size_t) s
->bitmap_size
);
432 if (bdrv_pread (s
->fvd_metadata
, header
->bitmap_offset
,
433 s
->stale_bitmap
, s
->bitmap_size
) != s
->bitmap_size
) {
434 fprintf (stderr
, "Failed to the bitmap of %s.\n", filename
);
438 if (s
->copy_on_read
|| (s
->prefetch_state
!= PREFETCH_STATE_FINISHED
&&
439 s
->prefetch_start_delay
> 0)) {
440 /* Use two bitmaps only if copy_on_read or prefetching is enabled.
441 * See Section 3.3.4 of the FVD-cow paper. */
442 s
->fresh_bitmap
= my_qemu_blockalign (s
->fvd_metadata
,
444 memcpy (s
->fresh_bitmap
, s
->stale_bitmap
, s
->bitmap_size
);
446 s
->fresh_bitmap
= s
->stale_bitmap
;
453 static void init_prefetch_timer (BlockDriverState
* bs
, BDRVFvdState
* s
)
455 #ifndef SIMULATED_TEST_WITH_QEMU_IO
461 if (s
->prefetch_state
== PREFETCH_STATE_FINISHED
||
462 s
->prefetch_start_delay
<= 0) {
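466 /* Start prefetching after a delay. The delay is multiplied by 1000 (sec to ms), but NOTE(review): it is added to a nanosecond clock reading below -- the units look inconsistent, confirm. */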
467 int64_t expire
= qemu_clock_get_ns(QEMU_CLOCK_REALTIME
) + s
->prefetch_start_delay
* 1000;
468 s
->prefetch_timer
= timer_new_ns(QEMU_CLOCK_REALTIME
, fvd_init_prefetch
, bs
);
469 timer_mod(s
->prefetch_timer
, expire
);