2 * Copyright (c) 2010-2011 IBM
5 * Chunqiang Tang <ctang@us.ibm.com>
7 * This work is licensed under the terms of the GNU GPL, version 2.
8 * See the COPYING file in the top-level directory.
11 /*=============================================================================
12 * A short description: this FVD module implements a journal for committing
13 * metadata changes. Each sector in the journal is self-contained so that
14 * updates are atomic. A sector may contain one or multiple journal records.
15 * There are two types of journal records:
16 * bitmap_update and table_update.
17 * Format of a bitmap_update record:
18 * + BITMAP_JRECORD (uint32_t)
19 * + num_dirty_sectors (uint32_t)
20 * + dirty_sector_begin (int64_t)
21 * Format of a table_update record:
22 * + TABLE_JRECORD (uint32_t)
23 * + dirty_table_offset (uint32_t)
24 * + num_dirty_table_entries (uint32_t)
25 * + table_entry_1 (uint32_t)
26 * + table_entry_2 (uint32_t)
28 * If both the bitmap and the table need update, one sector contains a
29 * TABLE_JRECORD and a BITMAP_JRECORD, and these two records cover
30 * the same range of virtual disk data so that the corresponding parts of the
31 * bitmap and the table are always updated in one atomic operation.
32 *============================================================================*/
/* Journal record type magic numbers. A record type of 0 (EMPTY_JRECORD)
 * marks the end of the valid records within a 512-byte journal sector. */
#define BITMAP_JRECORD ((uint32_t)0x3F2AB8ED)
#define TABLE_JRECORD ((uint32_t)0xB4E6F7AC)
#define EMPTY_JRECORD ((uint32_t)0)

/* On-disk size of one BITMAP_JRECORD: type + num_dirty_sectors (one uint32_t
 * each) + dirty_sector_begin (int64_t). */
#define BITMAP_JRECORD_SIZE (2*sizeof(uint32_t) + sizeof(int64_t))

/* Header of a TABLE_JRECORD: type + dirty_table_offset +
 * num_dirty_table_entries, all uint32_t; table entries follow the header. */
#define TABLE_JRECORD_HDR_SIZE (3*sizeof(uint32_t))

/* Max table entries a 512-byte journal sector can carry when it holds only
 * a TABLE_JRECORD. */
#define TABLE_JRECORDS_PER_SECTOR \
        ((512 - TABLE_JRECORD_HDR_SIZE)/sizeof(uint32_t))

/* One BITMAP_JRECORD and this number of TABLE_JRECORD table entries can fit
 * in one journal sector. */
#define MIXED_JRECORDS_PER_SECTOR ((512 - TABLE_JRECORD_HDR_SIZE - \
                                    BITMAP_JRECORD_SIZE) / sizeof(uint32_t))
47 static inline int64_t calc_min_journal_size (int64_t table_entries
)
49 return (table_entries
+ MIXED_JRECORDS_PER_SECTOR
- 1)
50 / MIXED_JRECORDS_PER_SECTOR
* 512;
/* Open-time journal initialization and crash recovery.
 *
 * Reads header->journal_size/journal_offset (bytes) into BDRVFvdState as
 * sector counts. If the journal is disabled, only sanity-checks the
 * clean_shutdown flag. Otherwise, when the image was not shut down
 * gracefully, replays every self-contained 512-byte journal sector:
 * BITMAP_JRECORDs are applied to the bitmaps and TABLE_JRECORDs to the
 * table, after which the recovered metadata is flushed to disk.
 *
 * NOTE(review): this excerpt is incomplete -- several original lines
 * (braces, return paths, declarations) are missing and are marked "[gap]"
 * below. Reconstruct them from the upstream FVD patch before compiling. */
static int init_journal (int read_only, BlockDriverState * bs,
                         /* [gap] remaining parameter(s), e.g. the header
                          * pointer, and the opening brace */
    BDRVFvdState *s = bs->opaque;
    /* Header fields are in bytes; keep them as 512-byte sector counts. */
    s->journal_size = header->journal_size / 512;
    s->journal_offset = header->journal_offset / 512;
    s->next_journal_sector = 0;   /* Allocation cursor starts at sector 0. */
    /* [gap] */
    if (s->journal_size <= 0) {
        if (!s->table && !s->fresh_bitmap) {
            return 0; /* No need to use the journal. */
        /* [gap] closing brace */
        if (!header->clean_shutdown) {
            fprintf (stderr, "ERROR: the image may be corrupted because it was "
                     "not shut down gracefully last\ntime and it does not use "
                     "a journal. You may continue to use the image at your\n"
                     "own risk by manually resetting the clean_shutdown flag "
                     /* [gap] remainder of the error message */
            s->dirty_image = TRUE;
            /* [gap] */
            return 0; /* Allow qemu tools to use the image. */
            /* [gap] */
            /* Do not allow boot the VM until the clean_shutdown flag is
             * manually cleaned. */
            /* [gap] error-return path */
        QDEBUG ("Journal is disabled\n");
        /* [gap] return from the journal-disabled branch */
    if (header->clean_shutdown) {
        QDEBUG ("Journal is skipped as the VM was shut down gracefully "
        /* [gap] rest of the message and the early return */
    QDEBUG ("Recover from the journal as the VM was not shut down gracefully "
    /* [gap] rest of the message */
    /* Read the whole journal into one aligned buffer. */
    uint8_t *journal = my_qemu_blockalign (s->fvd_metadata,
                                           s->journal_size * 512);
    int ret = bdrv_read (s->fvd_metadata, s->journal_offset,
                         journal, s->journal_size);
    /* [gap] presumably "if (ret < 0) {" -- confirm against upstream */
        my_qemu_vfree (journal);
        fprintf (stderr, "Failed to read the journal (%" PRId64 ") bytes\n",
                 s->journal_size * 512);
    /* [gap] error return */
    /* Go through every journal sector. */
    uint8_t *sector = journal;
    uint8_t *journal_end = journal + s->journal_size * 512;
    while (sector < journal_end) {
        uint32_t *type = (uint32_t *) sector; /* Journal record type. */
        /* Walk the records packed inside this single 512-byte sector. */
        while ((uint8_t *) type < (sector + 512)) {
            if (le32_to_cpu (*type) == BITMAP_JRECORD) {
                uint32_t *nb_sectors = type + 1; /* BITMAP_JRECORD field 2. */
                int64_t *sector_num = (int64_t *) (type + 2); /* field 3. */
                if (s->stale_bitmap) {
                    /* Replay the dirty range on both bitmap copies. */
                    update_both_bitmaps (s, le64_to_cpu (*sector_num),
                                         le32_to_cpu (*nb_sectors));
                    QDEBUG ("JOURNAL: recover BITMAP_JRECORD sector_num=%"
                            PRId64 " nb_sectors=%u\n",
                            le64_to_cpu (*sector_num),
                            le32_to_cpu (*nb_sectors));
                /* [gap] closing brace */
                /* First field of the next journal record. */
                type = (uint32_t *) sector_num + 1;
            } else if (le32_to_cpu (*type) == TABLE_JRECORD) {
                uint32_t *offset = type + 1;  /* TABLE_JRECORD field 2. */
                uint32_t *count = type + 2;   /* TABLE_JRECORD field 3. */
                uint32_t *content = type + 3; /* fields 4 and beyond. */
                const uint32_t chunk = le32_to_cpu (*offset);
                const uint32_t n = le32_to_cpu (*count);
                /* [gap] loop-index declaration */
                for (i = 0; i < n; i++) {
                    /* Entries are copied verbatim from the journal; only the
                     * dirty flag is cleared below. */
                    s->table[chunk + i] = content[i];
                    /* [gap] */
                    /* The dirty bit was not cleaned when the table entry was
                     * saved in the journal. */
                    CLEAN_DIRTY2 (s->table[chunk + i]);
                /* [gap] closing brace */
                type = content + n; /* First field of the next record. */
                QDEBUG ("JOURNAL: recover TABLE_JRECORD chunk_start=%u "
                        "nb_chunks=%u\n", chunk, n);
            /* [gap] presumably "} else {" for the EMPTY_JRECORD case */
                /* End of valid records in this journal sector. */
                ASSERT (le32_to_cpu (*type) == EMPTY_JRECORD);
                /* [gap] break out of the record loop; advance 'sector' by
                 * 512; close both loops */
    my_qemu_vfree (journal);
    flush_metadata_to_disk (bs); /* Write the recovered metadata. */
    /* NOTE(review): function epilogue (return value, closing brace) is
     * missing from this excerpt. */
/*
 * This function first flushes in-memory metadata to disk and then recycle the
 * used journal sectors. It is possible to make this operation asynchronous so
 * that the performance is better. However, the overall performance
 * improvement may be limited since recycling the journal happens very
 * infrequently and updating on-disk metadata finishes quickly because of the
 * small size of the metadata.
 *
 * NOTE(review): this excerpt is incomplete -- missing original lines are
 * marked "[gap]" below; reconstruct from upstream before compiling.
 */
static void recycle_journal (BDRVFvdState * s)
/* [gap] opening brace; possibly a debug-only conditional block */
    static int64_t recycle_count = 0; /* Debug counter across invocations. */
    QDEBUG ("JOURNAL: start journal recycle %" PRId64 ".\n", recycle_count);
    /* [gap] */
    int64_t begin_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    /* [gap] */
    /* Write fresh_bitmap to disk. */
    if (s->fresh_bitmap) {
        int nb = (int) (s->bitmap_size / 512);
        QDEBUG ("JOURNAL: flush bitmap (%d sectors) to disk\n", nb);
        /* [gap] */
        /* How to recover if this write fails? */
        bdrv_write (s->fvd_metadata, s->bitmap_offset, s->fresh_bitmap, nb);
        /* [gap] */
        /* Keep the two in-memory bitmap copies in sync after the flush. */
        if (s->fresh_bitmap != s->stale_bitmap) {
            memcpy (s->stale_bitmap, s->fresh_bitmap, s->bitmap_size);
        /* [gap] closing braces */
    /* Clean DIRTY_TABLE bit and write the table to disk. */
    /* [gap] guard on s->table and the table_entries declaration; the
     * expression below is its continuation */
        (int) (ROUND_UP (s->virtual_disk_size, s->chunk_size * 512) /
               (s->chunk_size * 512));
        /* [gap] loop-index declaration */
        for (i = 0; i < table_entries; i++) {
            CLEAN_DIRTY (s->table[i]);
        /* [gap] closing brace */
        /* The on-disk table is padded to a whole number of pages. */
        int64_t table_size = sizeof (uint32_t) * table_entries;
        table_size = ROUND_UP (table_size, DEF_PAGE_SIZE);
        int nb = (int) (table_size / 512);
        QDEBUG ("JOURNAL: flush table (%d sectors) to disk\n", nb);
        /* [gap] */
        /* How to recover if this write fails? */
        bdrv_write (s->fvd_metadata, s->table_offset, (uint8_t *) s->table, nb);
        /* [gap] */
    /* All metadata is now safely on disk; the whole journal can be reused. */
    s->next_journal_sector = 0;
    /* [gap] */
    int64_t end_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    /* NOTE(review): qemu_clock_get_ns() returns nanoseconds, but the message
     * below prints the raw difference as "ms" without dividing -- the value
     * printed is in ns. Confirm and fix upstream. */
    QDEBUG ("JOURNAL: journal recycle took %" PRId64 " ms.\n",
            (end_time - begin_time));
    /* [gap] function epilogue */
/* Release the journal sectors held by one completed metadata update.
 * Decrements the in-flight journal-update counter; once it reaches zero and
 * requests are queued waiting for journal space, the journal is recycled and
 * the waiting requests are restarted.
 *
 * NOTE(review): excerpt is incomplete; "[gap]" marks missing original
 * lines. */
static void free_journal_sectors (BDRVFvdState * s)
/* [gap] opening brace */
    if (s->journal_size <= 0) {
        /* [gap] early return: journal disabled */
    s->ongoing_journal_updates--;
    ASSERT (s->ongoing_journal_updates >= 0);
    /* Nothing more to do while other journal writes are still in flight, or
     * when no request is waiting for free journal sectors. */
    if (s->ongoing_journal_updates > 0 || QLIST_EMPTY (&s->wait_for_journal)) {
        /* [gap] early return */
    /* Some requests are waiting for the journal to be recycled in order to
     * get free journal sectors. */
    /* [gap] presumably "recycle_journal (s);" -- confirm against upstream */
    /* Restart requests in the wait_for_journal list. First make a copy of
     * the head and then empty the head. */
    FvdAIOCB *acb = QLIST_FIRST (&s->wait_for_journal);
    QLIST_INIT (&s->wait_for_journal);
    /* [gap] */
    /* Restart all dependent requests. Cannot use QLIST_FOREACH here, because
     * the next link might not be the same any more after the callback. */
    /* [gap] loop header and declaration of 'next' */
        next = acb->jcb.next_wait_for_journal.le_next;
        acb->jcb.next_wait_for_journal.le_prev = NULL; /* Mark unlinked. */
        QDEBUG ("WRITE: acb%llu-%p restart_write_metadata_to_journal "
                "after recycle_journal\n", acb->uuid, acb);
        write_metadata_to_journal (acb);
        /* [gap] advance acb to 'next'; close loop and function */
/* Reserve 'num_sectors' contiguous journal sectors for one metadata update.
 * Returns the first reserved sector index (relative to the journal start).
 * When no space is free, 'acb' is queued on s->wait_for_journal -- to be
 * restarted later by free_journal_sectors() -- and a negative value is
 * returned.
 *
 * NOTE(review): excerpt is incomplete; "[gap]" marks missing original
 * lines. */
static int64_t allocate_journal_sectors (BDRVFvdState * s, FvdAIOCB * acb,
                                         /* [gap] 'num_sectors' parameter,
                                          * opening brace, and the
                                          * journal_sec declaration */
    ASSERT (num_sectors <= s->journal_size);
    /* [gap] */
    /* Preserve fairness: if other requests already wait for a recycle,
     * queue behind them instead of racing for sectors. */
    if (!QLIST_EMPTY (&s->wait_for_journal)) {
        /* Waiting for journal recycle to finish. */
        ASSERT (s->ongoing_journal_updates > 0);
        QDEBUG ("WRITE: acb%llu-%p wait_for_journal_recycle\n",
        /* [gap] QDEBUG arguments */
        QLIST_INSERT_HEAD (&s->wait_for_journal, acb,
                           jcb.next_wait_for_journal);
        /* [gap] negative return */
    /* Fast path: enough free sectors remain at the allocation cursor. */
    if (s->next_journal_sector + num_sectors <= s->journal_size) {
        /* [gap] */
        journal_sec = s->next_journal_sector;
        s->next_journal_sector += num_sectors;
        s->ongoing_journal_updates++;
        /* [gap] return journal_sec */
    /* No free journal sector is available. Check if the journal can be
     * [gap] recycled now (no update in flight). */
    if (s->ongoing_journal_updates == 0) {
        /* [gap] presumably recycle the journal and retry the allocation --
         * confirm against upstream */
    /* Waiting for journal recycle to finish. It will be waken up later in
     * free_journal_sectors(). */
    QLIST_INSERT_HEAD (&s->wait_for_journal, acb, jcb.next_wait_for_journal);
    QDEBUG ("WRITE: acb%llu-%p wait_for_journal_recycle\n", acb->uuid, acb);
    /* [gap] negative return and closing brace */
/* AIO completion callback for the journal write issued by
 * write_metadata_to_journal(). On success, clears the journal-dirty flags on
 * the affected table entries and updates the stale bitmap; in all cases it
 * completes the originating request and releases its journal sectors.
 *
 * NOTE(review): excerpt is incomplete; "[gap]" marks missing original
 * lines. */
static void finish_write_journal (void *opaque, int ret)
/* [gap] opening brace */
    FvdAIOCB *acb = (FvdAIOCB *) opaque;
    BlockDriverState *bs = acb->common.bs;
    BDRVFvdState *s = bs->opaque;
    /* [gap] presumably "if (ret == 0) {" -- success path follows */
    QDEBUG ("JOURNAL: acb%llu-%p finish_write_journal\n", acb->uuid, acb);
    /* [gap] */
    /* Update the table. */
    /* [gap] guard (likely on s->table) and loop-index declaration */
    const uint32_t first_chunk = acb->sector_num / s->chunk_size;
    const uint32_t last_chunk = (acb->sector_num + acb->nb_sectors - 1)
    /* [gap] "/ s->chunk_size;" continuation */
    for (i = first_chunk; i <= last_chunk; i++) {
        /* These entries are now safely journaled on disk. */
        CLEAN_DIRTY2 (s->table[i]);
    /* [gap] closing braces */
    if (s->stale_bitmap) {
        /* If fresh_bitmap differs from stale_bitmap, fresh_bitmap has
         * already been updated in finish_write_data() when invoking
         * update_fresh_bitmap_and_check_stale_bitmap(). */
        update_stale_bitmap (s, acb->sector_num, acb->nb_sectors);
    /* [gap] presumably close success path, then "} else {" error path */
    QDEBUG ("JOURNAL: acb%llu-%p finish_write_journal error ret=%d\n",
            acb->uuid, acb, ret);
    /* [gap] */
    /* Complete the originating request. */
    if (acb->type == OP_STORE_COMPACT) {
        acb->common.cb (acb->common.opaque, ret);
        if (acb->jcb.iov.iov_base != NULL) {
            my_qemu_vfree (acb->jcb.iov.iov_base);
        /* [gap] closing brace */
        my_qemu_aio_unref (acb);
    /* [gap] presumably "} else {" */
        ASSERT (acb->type == OP_WRITE);
        finish_write (acb, ret);
    /* [gap] closing brace */
    free_journal_sectors (s);
    /* [gap] function epilogue */
/* Build journal record(s) for a finished write and submit them with a single
 * asynchronous write to the journal area. Three cases:
 *   1. bitmap-only update: one sector holding one BITMAP_JRECORD;
 *   2. table-only update: one or more sectors of TABLE_JRECORDs;
 *   3. combined update: every sector carries a TABLE_JRECORD plus a
 *      BITMAP_JRECORD covering the same data range, so each sector commits
 *      both atomically.
 * Completion continues in finish_write_journal(). If no journal space is
 * available, the request is queued and this function returns without
 * submitting anything.
 *
 * NOTE(review): excerpt is incomplete; "[gap]" marks missing original
 * lines. */
static void write_metadata_to_journal (FvdAIOCB * acb)
/* [gap] opening brace */
    BlockDriverState *bs = acb->common.bs;
    BDRVFvdState *s = bs->opaque;
    /* [gap] journal_sec declaration */
    int num_journal_sectors;
    /* [gap] */
    ASSERT ((s->table || s->fresh_bitmap)
            && (acb->type == OP_WRITE || acb->type == OP_STORE_COMPACT));
    /* [gap] */
    /* Is journal is disabled? */
    if (s->journal_size <= 0) {
        finish_write_journal (acb, 0);
        /* [gap] return; then presumably "if (!s->table) {" -- bitmap-only
         * case follows */
        /* Only update the bitmap. */
        num_journal_sectors = 1;
        journal_sec = allocate_journal_sectors (s, acb, num_journal_sectors);
        if (journal_sec < 0) {
            /* No journal sector is available now. It will be waken up later
             * in free_journal_sectors(). */
            /* [gap] return */
        /* One self-contained 512-byte sector holds the single record. */
        acb->jcb.iov.iov_len = 512;
        acb->jcb.iov.iov_base = my_qemu_blockalign (s->fvd_metadata, 512);
        /* [gap] */
        uint32_t *type = (uint32_t *) acb->jcb.iov.iov_base; /* Field 1. */
        uint32_t *nb_sectors = type + 1; /* BITMAP_JRECORD field 2. */
        int64_t *sector_num = (int64_t *) (type + 2); /* field 3. */
        *type = cpu_to_le32 (BITMAP_JRECORD);
        *nb_sectors = cpu_to_le32 ((uint32_t) acb->nb_sectors);
        *sector_num = cpu_to_le64 (acb->sector_num);
        *((uint32_t *) (sector_num + 1)) = EMPTY_JRECORD;/* Mark record end. */
        /* [gap] */
    } else if (!s->fresh_bitmap) {
        /* Only update the table. */
        const int64_t first_chunk = acb->sector_num / s->chunk_size;
        const int64_t last_chunk = (acb->sector_num + acb->nb_sectors - 1)
        /* [gap] "/ s->chunk_size;" continuation */
        int num_chunks = last_chunk - first_chunk + 1;
        num_journal_sectors = (num_chunks + TABLE_JRECORDS_PER_SECTOR - 1)
            / TABLE_JRECORDS_PER_SECTOR;
        journal_sec = allocate_journal_sectors (s, acb, num_journal_sectors);
        if (journal_sec < 0) {
            /* No journal sector is available now. It will be waken up later
             * in free_journal_sectors(). */
            /* [gap] return */
        acb->jcb.iov.iov_len = num_journal_sectors * 512;
        acb->jcb.iov.iov_base = my_qemu_blockalign (s->fvd_metadata,
                                                    acb->jcb.iov.iov_len);
        /* [gap] */
        uint32_t *type = (uint32_t *) acb->jcb.iov.iov_base; /* Field 1. */
        int64_t chunk = first_chunk;
        /* [gap] loop header iterating over journal sectors */
            /* Start a new journal sector. */
            uint32_t *offset = type + 1;  /* TABLE_JRECORD field 2. */
            uint32_t *count = type + 2;   /* TABLE_JRECORD field 3. */
            uint32_t *content = type + 3; /* Fields 4 and beyond. */
            *type = cpu_to_le32 (TABLE_JRECORD);
            *offset = cpu_to_le32 (chunk);
            /* [gap] */
            if (num_chunks <= TABLE_JRECORDS_PER_SECTOR) {
                /* This is the last journal sector. */
                *count = cpu_to_le32 (num_chunks);
                memcpy (content, &s->table[chunk],
                        sizeof (uint32_t) * num_chunks);
                if (num_chunks < TABLE_JRECORDS_PER_SECTOR) {
                    *(content + num_chunks) = EMPTY_JRECORD; /* Mark end. */
                /* [gap] close braces; exit loop; "} else {" full-sector
                 * branch follows */
                *count = cpu_to_le32 (TABLE_JRECORDS_PER_SECTOR);
                memcpy (content, &s->table[chunk],
                        sizeof (uint32_t) * TABLE_JRECORDS_PER_SECTOR);
                chunk += TABLE_JRECORDS_PER_SECTOR;
                num_chunks -= TABLE_JRECORDS_PER_SECTOR;
                /* [gap] */
                /* Next TABLE_JRECORD field 1. */
                type = content + TABLE_JRECORDS_PER_SECTOR;
            /* [gap] close braces and loop; outer "} else {" (combined
             * table+bitmap case) follows */
        /* Update both the table and the bitmap. It may use multiple journal
         * sectors. Each sector is self-contained, including a TABLE_JRECORD
         * and a BITMAP_JRECORD. The two records one the same sector cover the
         * same range of virtual disk data. The purpose is to update the
         * corresponding parts of the bitmap and the table in one atomic
         * [gap] operation. */
        const int64_t first_chunk = acb->sector_num / s->chunk_size;
        const int64_t last_chunk = (acb->sector_num + acb->nb_sectors - 1)
        /* [gap] "/ s->chunk_size;" continuation */
        int num_chunks = last_chunk - first_chunk + 1;
        num_journal_sectors = (num_chunks + MIXED_JRECORDS_PER_SECTOR - 1)
            / MIXED_JRECORDS_PER_SECTOR;
        journal_sec = allocate_journal_sectors (s, acb, num_journal_sectors);
        if (journal_sec < 0) {
            /* No journal sector is available now. It will be waken up later
             * in free_journal_sectors(). */
            /* [gap] return */
        acb->jcb.iov.iov_len = num_journal_sectors * 512;
        acb->jcb.iov.iov_base = my_qemu_blockalign (s->fvd_metadata,
                                                    acb->jcb.iov.iov_len);
        /* [gap] */
        uint32_t *type = (uint32_t *) acb->jcb.iov.iov_base; /* Field 1. */
        int64_t chunk = first_chunk;
        int64_t sector_num = acb->sector_num;
        /* [gap] nb_sectors declaration */
        if (num_journal_sectors == 1) {
            nb_sectors = acb->nb_sectors;
        /* [gap] "} else {" */
            /* Number of sectors that fall into the first chunk. */
            nb_sectors = (first_chunk + MIXED_JRECORDS_PER_SECTOR)
                * s->chunk_size - acb->sector_num;
        /* [gap] close brace; loop header iterating over journal sectors */
            /* Start a new journal sector. */
            uint32_t *offset = type + 1;  /* TABLE_JRECORD field 2. */
            uint32_t *count = type + 2;   /* TABLE_JRECORD field 3. */
            uint32_t *content = type + 3; /* Fields 4 and beyond. */
            *type = cpu_to_le32 (TABLE_JRECORD);
            *offset = cpu_to_le32 (chunk);
            /* [gap] */
            if (num_chunks <= MIXED_JRECORDS_PER_SECTOR) {
                /* This is the last journal sector. */
                *count = cpu_to_le32 (num_chunks);
                memcpy (content, &s->table[chunk],
                        sizeof (uint32_t) * num_chunks);
                /* [gap] */
                /* A BITMAP_JRECORD follows a TABLE_JRECORD so that they are
                 * updated in one atomic operatoin. */
                type = content + num_chunks; /* BITMAP_JRECORD field 1. */
                uint32_t *p_nb_sectors = type + 1; /* BITMAP_JRECORD field 2. */
                int64_t *p_sector_num = (int64_t *) (type + 2); /* Field 3. */
                *type = cpu_to_le32 (BITMAP_JRECORD);
                *p_nb_sectors = cpu_to_le32 (nb_sectors);
                *p_sector_num = cpu_to_le64 (sector_num);
                /* [gap] */
                if (num_chunks < MIXED_JRECORDS_PER_SECTOR) {
                    *((uint32_t *) (p_sector_num + 1)) = EMPTY_JRECORD; /*End*/
                /* [gap] close braces; exit loop; "} else {" full-sector
                 * branch follows */
                *count = cpu_to_le32 (MIXED_JRECORDS_PER_SECTOR);
                memcpy (content, &s->table[chunk],
                        sizeof (uint32_t) * MIXED_JRECORDS_PER_SECTOR);
                /* [gap] */
                /* A BITMAP_JRECORD follows a TABLE_JRECORD so that they are
                 * updated in one atomic operatoin. */
                type = content + MIXED_JRECORDS_PER_SECTOR; /* Field 1. */
                uint32_t *p_nb_sectors = type + 1; /* BITMAP_JRECORD field 2. */
                int64_t *p_sector_num = (int64_t *) (type + 2); /* Field 3. */
                *type = cpu_to_le32 (BITMAP_JRECORD);
                *p_nb_sectors = cpu_to_le32 (nb_sectors);
                *p_sector_num = cpu_to_le64 (sector_num);
                /* [gap] */
                /* Prepare for the next journal sector. */
                type = (uint32_t *) (p_sector_num + 1);
                chunk += MIXED_JRECORDS_PER_SECTOR;
                sector_num = chunk * s->chunk_size;
                num_chunks -= MIXED_JRECORDS_PER_SECTOR;
                if (num_chunks <= MIXED_JRECORDS_PER_SECTOR) {
                    /* Data sectors covered by the last journal sector. */
                    nb_sectors = (acb->sector_num + acb->nb_sectors)
                        - chunk * s->chunk_size;
                /* [gap] "} else {" */
                    nb_sectors = s->chunk_size * MIXED_JRECORDS_PER_SECTOR;
                /* [gap] closing braces for branches and loop */
    QDEBUG ("JOURNAL: acb%llu-%p write_metadata_to_journal journal_sec=%"
            PRId64 " nb_journal_sectors=%d\n", acb->uuid, acb, journal_sec,
            num_journal_sectors);
    /* Submit all prepared journal sectors with one asynchronous write. */
    qemu_iovec_init_external (&acb->jcb.qiov, &acb->jcb.iov, 1);
    acb->jcb.hd_acb = bdrv_aio_writev (s->fvd_metadata,
                                       s->journal_offset + journal_sec,
                                       &acb->jcb.qiov, num_journal_sectors,
                                       finish_write_journal, acb);
    if (!acb->jcb.hd_acb) {
        finish_write_journal (acb, -1);
    /* [gap] closing braces */
/* When TRUE, flush_metadata_to_disk_on_exit() deliberately skips the final
 * metadata flush so the next open must recover from the journal -- used for
 * host-crash testing; fvd_enable_host_crash_test() sets it at runtime.
 * NOTE(review): the two contradictory definitions below were originally
 * separated by preprocessor conditionals (#ifdef/#else/#endif, e.g. a debug
 * build switch) that are missing from this excerpt -- only one is compiled
 * in practice. Confirm against upstream. */
static int emulate_host_crash = TRUE;
static int emulate_host_crash = FALSE;
/* Exit-time hook: flush in-memory metadata to disk and record a clean
 * shutdown -- unless host-crash emulation is enabled, in which case metadata
 * is deliberately left unflushed so the next open must recover from the
 * journal.
 *
 * NOTE(review): excerpt is incomplete; "[gap]" marks missing original
 * lines. */
static void flush_metadata_to_disk_on_exit (BlockDriverState *bs)
/* [gap] opening brace */
    BDRVFvdState *s = bs->opaque;
    /* [gap] */
    /* Nothing to flush for read-only images or when metadata is absent. */
    if (bs->read_only || !s->fvd_metadata) {
        /* [gap] early return */
    /* If (emulate_host_crash==TRUE), do not flush metadata to disk
     * so that it has to rely on journal for recovery. */
    if (s->journal_size <= 0 || !emulate_host_crash) {
        flush_metadata_to_disk (bs);
        if (!s->dirty_image) {
            update_clean_shutdown_flag (s, TRUE);
        /* [gap] closing braces and any remaining cleanup */
552 void fvd_enable_host_crash_test (void)
554 emulate_host_crash
= TRUE
;