2 * Copyright (c) 2010-2011 IBM
5 * Chunqiang Tang <ctang@us.ibm.com>
7 * This work is licensed under the terms of the GNU GPL, version 2.
8 * See the COPYING file in the top-level directory.
11 /*=============================================================================
12 * A short description: this module implements bdrv_file_open() for FVD.
13 *============================================================================*/
15 static void init_prefetch_timer (BlockDriverState
* bs
, BDRVFvdState
* s
);
16 static int init_data_file (BDRVFvdState
* s
, FvdHeader
* header
, int flags
);
17 static int init_bitmap (BlockDriverState
* bs
, BDRVFvdState
* s
,
18 FvdHeader
* header
, const char *const filename
);
19 static int load_table (BDRVFvdState
* s
, FvdHeader
* header
,
20 const char *const filename
);
21 static int init_journal (int read_only
, BlockDriverState
* bs
,
23 static int init_compact_image (BDRVFvdState
* s
, FvdHeader
* header
,
24 const char *const filename
);
26 static QemuOptsList runtime_opts
= {
28 .head
= QTAILQ_HEAD_INITIALIZER(runtime_opts
.head
),
32 .type
= QEMU_OPT_STRING
,
33 .help
= "File name of the image",
39 static int fvd_open(BlockDriverState
* bs
, QDict
*options
, int flags
,
42 BDRVFvdState
*s
= bs
->opaque
;
47 Error
*local_err
= NULL
;
50 QemuOpts
*opts
= qemu_opts_create(&runtime_opts
, NULL
, 0, &error_abort
);
51 qemu_opts_absorb_qdict(opts
, options
, &local_err
);
53 qerror_report_err(local_err
);
54 error_free(local_err
);
58 filename
= qemu_opt_get(opts
, "filename");
60 const char * protocol
= strchr (filename
, ':');
62 drv
= bdrv_find_protocol (filename
, true);
63 filename
= protocol
+ 1;
66 /* Use "raw" instead of "file" to allow storing the image on device. */
67 drv
= bdrv_find_format ("raw");
69 fprintf (stderr
, "Failed to find the block device driver\n");
74 s
->fvd_metadata
= bdrv_new("", &error_abort
);
75 ret
= bdrv_open(&s
->fvd_metadata
, filename
, NULL
, NULL
,
76 flags
, drv
, &local_err
);
78 qerror_report_err(local_err
);
79 error_free(local_err
);
83 /* Initialize so that jumping to 'fail' would do cleanup properly. */
84 s
->stale_bitmap
= NULL
;
85 s
->fresh_bitmap
= NULL
;
87 s
->outstanding_copy_on_read_data
= 0;
88 QLIST_INIT (&s
->write_locks
);
89 QLIST_INIT (&s
->copy_locks
);
90 QLIST_INIT (&s
->wait_for_journal
);
91 s
->ongoing_journal_updates
= 0;
92 s
->prefetch_acb
= NULL
;
93 s
->add_storage_cmd
= NULL
;
95 s
->total_copy_on_read_data
= s
->total_prefetch_data
= 0;
98 if (bdrv_pread (s
->fvd_metadata
, 0, &header
, sizeof (header
)) !=
100 fprintf (stderr
, "Failed to read the header of %s\n", filename
);
104 fvd_header_le_to_cpu (&header
);
106 if (header
.magic
!= FVD_MAGIC
|| header
.version
!= FVD_VERSION
) {
107 fprintf (stderr
, "Incorrect magic number in the header of %s: "
108 "magic=%0X version=%d expect_magic=%0X expect_version=%d\n",
109 filename
, header
.magic
, header
.version
, FVD_MAGIC
,
113 if (header
.virtual_disk_size
% 512 != 0) {
114 fprintf (stderr
, "Disk size %"PRId64
" in the header of %s is not "
115 "a multple of 512.\n", header
.virtual_disk_size
, filename
);
119 /* Initialize the fields of BDRVFvdState. */
120 s
->dirty_image
= FALSE
;
121 s
->block_size
= header
.block_size
/ 512;
122 s
->bitmap_size
= header
.bitmap_size
;
123 s
->prefetch_error
= FALSE
;
124 s
->prefetch_timer
= NULL
;
125 s
->sectors_per_prefetch
= (header
.bytes_per_prefetch
+ 511) / 512;
126 s
->prefetch_throttle_time
= header
.prefetch_throttle_time
;
127 s
->prefetch_perf_calc_alpha
= header
.prefetch_perf_calc_alpha
/ 100.0;
128 s
->prefetch_read_throughput_measure_time
=
129 header
.prefetch_read_throughput_measure_time
;
130 s
->prefetch_write_throughput_measure_time
=
131 header
.prefetch_write_throughput_measure_time
;
133 /* Convert KB/s to bytes/millisec. */
134 s
->prefetch_min_read_throughput
=
135 ((double) header
.prefetch_min_read_throughput
) * 1024.0 / 1000.0;
136 s
->prefetch_min_write_throughput
=
137 ((double) header
.prefetch_min_write_throughput
) * 1024.0 / 1000.0;
139 if (header
.base_img
[0] != 0 && s
->sectors_per_prefetch
%s
->block_size
!= 0) {
140 fprintf (stderr
, "sectors_per_prefetch (%d) is not a multiple of "
142 s
->sectors_per_prefetch
* 512, s
->block_size
* 512);
144 s
->max_outstanding_copy_on_read_data
=
145 header
.max_outstanding_copy_on_read_data
;
146 if (s
->max_outstanding_copy_on_read_data
< header
.block_size
* 2) {
147 s
->max_outstanding_copy_on_read_data
= header
.block_size
;
150 if (header
.num_prefetch_slots
< 1) {
151 s
->num_prefetch_slots
= 1;
153 s
->num_prefetch_slots
= header
.num_prefetch_slots
;
156 /* No prefetching in a qemu tool. */
157 s
->prefetch_start_delay
= -1;
159 #ifndef SIMULATED_TEST_WITH_QEMU_IO
160 s
->copy_on_read
= FALSE
; /* No prefetching in a qemu tool. */
162 /* But allow debugging copy_on_read in qemu-io if configured. */
163 s
->copy_on_read
= header
.copy_on_read
;
166 s
->prefetch_start_delay
= header
.prefetch_start_delay
;
167 s
->copy_on_read
= header
.copy_on_read
;
169 s
->virtual_disk_size
= header
.virtual_disk_size
;
170 s
->bitmap_offset
= header
.bitmap_offset
/ 512;
171 s
->nb_sectors_in_base_img
= header
.base_img_size
/ 512;
172 bs
->total_sectors
= s
->virtual_disk_size
/ 512;
174 if (init_data_file (s
, &header
, flags
)) {
178 if (init_bitmap (bs
, s
, &header
, filename
)) {
182 if (load_table (s
, &header
, filename
)) {
186 const int read_only
= !(flags
& BDRV_O_RDWR
);
187 if (init_journal (read_only
, bs
, &header
)) {
191 /* This must be done after init_journal() because it may use metadata
192 * recovered from the journal. */
193 if (init_compact_image (s
, &header
, filename
)) {
198 /* This flag will be cleaned later when the image is shut down
200 update_clean_shutdown_flag (s
, FALSE
);
202 init_prefetch_timer (bs
, s
);
204 QDEBUG ("copy_on_read=%s block_size=%d journal_size=%" PRId64
205 " prefetching_delay=%d prefetch_slots=%d "
206 "prefetch_read_threshold_KB=%.0lf "
207 "prefetch_write_threshold_KB=%.0lf "
208 "prefetch_throttle_time=%d bytes_per_prefetch=%d "
209 "max_outstanding_copy_on_read_data=%"PRId64
"\n",
210 BOOL (s
->copy_on_read
), s
->block_size
* 512,
211 s
->journal_size
* 512, s
->prefetch_start_delay
,
212 s
->num_prefetch_slots
,
213 s
->prefetch_min_read_throughput
* 1000.0 / 1024.0,
214 s
->prefetch_min_write_throughput
* 1000.0 / 1024.0,
215 s
->prefetch_throttle_time
, s
->sectors_per_prefetch
* 512,
216 s
->max_outstanding_copy_on_read_data
);
221 fprintf (stderr
, "Failed to open %s using the FVD format.\n", filename
);
226 static int load_table (BDRVFvdState
* s
, FvdHeader
* header
,
227 const char *const filename
)
229 if (!header
->compact_image
) {
233 /* Initialize the table. */
234 s
->table_offset
= header
->table_offset
/ 512;
235 s
->chunk_size
= header
->chunk_size
/ 512;
236 int64_t vsize
= header
->virtual_disk_size
+ header
->chunk_size
- 1;
237 int table_entries
= vsize
/ header
->chunk_size
;
238 int64_t table_size
= sizeof (uint32_t) * table_entries
;
239 table_size
= ROUND_UP (table_size
, DEF_PAGE_SIZE
);
240 s
->table
= my_qemu_blockalign (s
->fvd_metadata
, (size_t) table_size
);
242 if (bdrv_pread (s
->fvd_metadata
, header
->table_offset
, s
->table
, table_size
)
244 fprintf (stderr
, "Failed to read the table of %s\n", filename
);
251 static int init_compact_image (BDRVFvdState
* s
, FvdHeader
* header
,
252 const char *const filename
)
254 if (!header
->compact_image
) {
255 s
->data_region_prepared
= FALSE
;
259 /* Scan the table to find the max allocated chunk. */
261 uint32_t max_chunk
= 0;
262 int empty_disk
= TRUE
;
264 (int) (ROUND_UP (header
->virtual_disk_size
, header
->chunk_size
) /
266 for (i
= 0; i
< table_entries
; i
++) {
267 if (!IS_EMPTY (s
->table
[i
])) {
269 uint32_t id
= READ_TABLE (s
->table
[i
]);
270 if (id
> max_chunk
) {
278 s
->used_storage
= max_chunk
* s
->chunk_size
;
279 s
->storage_grow_unit
= header
->storage_grow_unit
/ 512;
281 /* Check if the image is directly stored on a raw device, including
282 * logical volume. If so, figure out the size of the device. */
283 struct stat stat_buf
;
284 if (stat (filename
, &stat_buf
) != 0) {
285 fprintf (stderr
, "Failed to stat() %s\n", filename
);
289 /* Check how much storage space is already allocated. */
290 int64_t size
= bdrv_getlength (s
->fvd_data
);
292 fprintf (stderr
, "Failed in bdrv_getlength(%s)\n", filename
);
295 const int64_t min_size
= (s
->data_offset
+ s
->used_storage
) * 512;
296 if (size
< min_size
) {
297 fprintf (stderr
, "The size of device %s is not even big enough to "
298 "store already allocated data.\n",
303 if (S_ISBLK (stat_buf
.st_mode
) || S_ISCHR (stat_buf
.st_mode
)) {
304 /* Initialize the command to grow storage space. */
306 if (header
->add_storage_cmd
[0] == 0) {
307 s
->add_storage_cmd
= NULL
;
309 if (strcmp (header
->add_storage_cmd
, "builtin:lvextend") == 0) {
310 /* Note the following:
311 * 1. lvextend may generate warning messages like "File
312 * descriptor...leaked...", * which is fine. See the
313 * following from LVM manual: "On invocation, lvm requires
314 * that only the standard file descriptors stdin,
315 * stdout * and stderr are available. If others are
316 * found, they get closed and messages are issued warning
318 * 2. Instead of using the lvextend command line, one
319 * option is to use liblvm directly, which avoids creating
320 * a process to resize a LV.
321 * 3. On Ubuntu, /bin/sh is linked to /bin/dash, which
322 * does not support ">&" for stdout and stderr
324 snprintf (cmd
, sizeof (cmd
) - 1, "/sbin/lvextend -L+%" PRId64
325 "B %s >/dev/null 2>/dev/null",
326 header
->storage_grow_unit
,
327 header
->data_file
[0] ? header
->data_file
: filename
);
329 snprintf (cmd
, sizeof (cmd
) - 1, "%s %" PRId64
330 " %s >/dev/null 2>/dev/null",
331 header
->add_storage_cmd
, header
->storage_grow_unit
,
332 header
->data_file
[0] ? header
->data_file
: filename
);
335 int len
= strlen (cmd
);
336 s
->add_storage_cmd
= my_qemu_malloc (len
+ 1);
337 memcpy (s
->add_storage_cmd
, cmd
, len
+ 1);
341 s
->data_storage
= size
/ 512 - s
->data_offset
;
342 s
->fvd_data
->growable
= TRUE
;
343 s
->data_region_prepared
= TRUE
;
348 static int init_data_file (BDRVFvdState
* s
, FvdHeader
* header
, int flags
)
350 Error
*local_err
= NULL
;
353 if (header
->data_file
[0]) {
354 /* Open a separate data file. */
356 s
->fvd_data
= bdrv_new("", &error_abort
);
358 fprintf (stderr
, "Failed to create a new block device driver.\n");
362 if (header
->data_file_fmt
[0] == 0) {
363 ret
= bdrv_open(&s
->fvd_data
, header
->data_file
, NULL
, NULL
,
364 flags
, NULL
, &local_err
);
366 BlockDriver
*data_drv
= bdrv_find_format (header
->data_file_fmt
);
368 fprintf (stderr
, "Failed to find driver for image format "
369 "'%s' of data file %s\n",
370 header
->data_file_fmt
, header
->data_file
);
373 ret
= bdrv_open(&s
->fvd_data
, header
->data_file
,
374 NULL
, NULL
, flags
, data_drv
, &local_err
);
377 qerror_report_err(local_err
);
378 error_free(local_err
);
382 s
->data_offset
= header
->metadata_size
/ 512; /* In sectors. */
383 s
->fvd_data
= s
->fvd_metadata
;
386 if (header
->need_zero_init
&& !bdrv_has_zero_init (s
->fvd_data
)) {
388 /* Only give a warning to allow 'qemu-img update' to modify
389 * need_zero_init if the user manually zero-init the device. */
390 fprintf (stderr
, "Warning: image needs zero_init but it is not "
391 "supported by the storage media.\n");
393 fprintf (stderr
, "Error: image needs zero_init but it is not "
394 "supported by the storage media.\n");
402 static int init_bitmap (BlockDriverState
* bs
, BDRVFvdState
* s
,
403 FvdHeader
* header
, const char *const filename
)
405 if (header
->all_data_in_fvd_img
) {
406 /* This also covers the case of no base image. */
407 s
->prefetch_state
= PREFETCH_STATE_FINISHED
;
408 s
->copy_on_read
= FALSE
;
409 s
->prefetch_start_delay
= -1;
411 if (bs
->backing_file
[0] != 0) {
412 /* No need to use the base image. It may operate without problem
413 * even if the base image is no longer accessible. */
414 bs
->backing_file
[0] = 0;
417 ASSERT (header
->base_img
[0] != 0);
418 pstrcpy (bs
->backing_file
, 1024, header
->base_img
);
419 const int flags
= O_RDONLY
| O_BINARY
| O_LARGEFILE
;
420 int test_backing_fd
= open (bs
->backing_file
, flags
);
421 if (test_backing_fd
< 0) {
422 fprintf (stderr
, "Failed to open the base image %s for read.\n",
426 close (test_backing_fd
);
428 /* This will be enabled in init_prefetch() after a timer expires. */
429 s
->prefetch_state
= PREFETCH_STATE_DISABLED
;
431 s
->stale_bitmap
= my_qemu_blockalign (s
->fvd_metadata
,
432 (size_t) s
->bitmap_size
);
433 if (bdrv_pread (s
->fvd_metadata
, header
->bitmap_offset
,
434 s
->stale_bitmap
, s
->bitmap_size
) != s
->bitmap_size
) {
435 fprintf (stderr
, "Failed to the bitmap of %s.\n", filename
);
439 if (s
->copy_on_read
|| (s
->prefetch_state
!= PREFETCH_STATE_FINISHED
&&
440 s
->prefetch_start_delay
> 0)) {
441 /* Use two bitmaps only if copy_on_read or prefetching is enabled.
442 * See Section 3.3.4 of the FVD-cow paper. */
443 s
->fresh_bitmap
= my_qemu_blockalign (s
->fvd_metadata
,
445 memcpy (s
->fresh_bitmap
, s
->stale_bitmap
, s
->bitmap_size
);
447 s
->fresh_bitmap
= s
->stale_bitmap
;
454 static void init_prefetch_timer (BlockDriverState
* bs
, BDRVFvdState
* s
)
456 #ifndef SIMULATED_TEST_WITH_QEMU_IO
462 if (s
->prefetch_state
== PREFETCH_STATE_FINISHED
||
463 s
->prefetch_start_delay
<= 0) {
467 /* Start prefetching after a delay. Times 1000 to convert sec to ms. */
468 int64_t expire
= qemu_clock_get_ns(QEMU_CLOCK_REALTIME
) + s
->prefetch_start_delay
* 1000;
469 s
->prefetch_timer
= timer_new_ns(QEMU_CLOCK_REALTIME
, fvd_init_prefetch
, bs
);
470 timer_mod(s
->prefetch_timer
, expire
);