block/fvd-journal.c
/*
 * Copyright (c) 2010-2011 IBM
 *
 * Authors:
 *         Chunqiang Tang <ctang@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.
 * See the COPYING file in the top-level directory.
 */

/*=============================================================================
 *  A short description: this FVD module implements a journal for committing
 *  metadata changes. Each sector in the journal is self-contained so that
 *  updates are atomic. A sector may contain one or multiple journal records.
 *  There are two types of journal records: bitmap_update and table_update.
 *
 *  Format of a bitmap_update record:
 *      + BITMAP_JRECORD (uint32_t)
 *      + num_dirty_sectors (uint32_t)
 *      + dirty_sector_begin (int64_t)
 *
 *  Format of a table_update record:
 *      + TABLE_JRECORD (uint32_t)
 *      + dirty_table_offset (uint32_t)
 *      + num_dirty_table_entries (uint32_t)
 *      + table_entry_1 (uint32_t)
 *      + table_entry_2 (uint32_t)
 *      + ...
 *
 *  If both the bitmap and the table need to be updated, one sector contains a
 *  TABLE_JRECORD and a BITMAP_JRECORD, and these two records cover the same
 *  range of virtual disk data so that the corresponding parts of the bitmap
 *  and the table are always updated in one atomic operation.
 *============================================================================*/
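
/*
 * Illustrative example (derived from the record formats above and from
 * write_metadata_to_journal() below; shown only as a reader aid): a journal
 * sector that updates both the table and the bitmap for n chunks is laid out,
 * little-endian on disk, as
 *     byte  0          TABLE_JRECORD magic               (uint32_t)
 *     byte  4          dirty_table_offset                (uint32_t)
 *     byte  8          num_dirty_table_entries = n       (uint32_t)
 *     byte 12          table_entry_1 .. table_entry_n    (n * uint32_t)
 *     byte 12 + 4*n    BITMAP_JRECORD magic              (uint32_t)
 *     byte 16 + 4*n    num_dirty_sectors                 (uint32_t)
 *     byte 20 + 4*n    dirty_sector_begin                (int64_t)
 * followed by an EMPTY_JRECORD marker if the 512-byte sector is not full.
 */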

#define BITMAP_JRECORD             ((uint32_t)0x3F2AB8ED)
#define TABLE_JRECORD              ((uint32_t)0xB4E6F7AC)
#define EMPTY_JRECORD              ((uint32_t)0)
#define BITMAP_JRECORD_SIZE        (2 * sizeof(uint32_t) + sizeof(int64_t))
#define TABLE_JRECORD_HDR_SIZE     (3 * sizeof(uint32_t))
#define TABLE_JRECORDS_PER_SECTOR \
            ((512 - TABLE_JRECORD_HDR_SIZE) / sizeof(uint32_t))

/* One BITMAP_JRECORD plus a TABLE_JRECORD carrying this number of table
 * entries can fit in one journal sector. */
#define MIXED_JRECORDS_PER_SECTOR ((512 - TABLE_JRECORD_HDR_SIZE - \
            BITMAP_JRECORD_SIZE) / sizeof(uint32_t))
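
/*
 * Worked sizes for the 512-byte journal sectors used throughout this file
 * (an illustration only, not used by the code):
 *     BITMAP_JRECORD_SIZE       = 2*4 + 8             = 16 bytes
 *     TABLE_JRECORD_HDR_SIZE    = 3*4                 = 12 bytes
 *     TABLE_JRECORDS_PER_SECTOR = (512 - 12) / 4      = 125 table entries
 *     MIXED_JRECORDS_PER_SECTOR = (512 - 12 - 16) / 4 = 121 table entries
 */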

static inline int64_t calc_min_journal_size (int64_t table_entries)
{
    return (table_entries + MIXED_JRECORDS_PER_SECTOR - 1)
                    / MIXED_JRECORDS_PER_SECTOR * 512;
}
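
/*
 * Example with hypothetical numbers, for illustration only: with a 1 MB chunk
 * size, a 1 TB virtual disk has 1,048,576 table entries, so
 *     calc_min_journal_size(1048576) = ceil(1048576 / 121) * 512
 *                                    = 8666 * 512 = 4,436,992 bytes (~4.2 MB).
 */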

static int init_journal (int read_only, BlockDriverState * bs,
                         FvdHeader * header)
{
    BDRVFvdState *s = bs->opaque;
    s->journal_size = header->journal_size / 512;
    s->journal_offset = header->journal_offset / 512;
    s->next_journal_sector = 0;

    if (read_only) {
        return 0;
    }

    if (s->journal_size <= 0) {
        if (!s->table && !s->fresh_bitmap) {
            return 0;           /* No need to use the journal. */
        }

        if (!header->clean_shutdown) {
            fprintf (stderr, "ERROR: the image may be corrupted because it was "
                     "not shut down gracefully last\ntime and it does not use "
                     "a journal. You may continue to use the image at your\n"
                     "own risk by manually resetting the clean_shutdown flag "
                     "in the image.\n\n");
            s->dirty_image = TRUE;
            if (in_qemu_tool) {
                return 0;       /* Allow qemu tools to use the image. */
            } else {
                /* Do not allow booting the VM until the clean_shutdown flag
                 * is manually cleared. */
                return -1;
            }
        }

        QDEBUG ("Journal is disabled\n");
        return 0;
    }

    if (header->clean_shutdown) {
        QDEBUG ("Journal is skipped as the VM was shut down gracefully "
                "last time.\n");
        return 0;
    }

    QDEBUG ("Recover from the journal as the VM was not shut down gracefully "
            "last time.\n");

    uint8_t *journal = my_qemu_blockalign (s->fvd_metadata,
                                           s->journal_size * 512);
    int ret = bdrv_read (s->fvd_metadata, s->journal_offset,
                         journal, s->journal_size);
    if (ret < 0) {
        my_qemu_vfree (journal);
        fprintf (stderr, "Failed to read the journal (%" PRId64 " bytes)\n",
                 s->journal_size * 512);
        return -1;
    }

    /* Go through every journal sector. */
    uint8_t *sector = journal;
    uint8_t *journal_end = journal + s->journal_size * 512;
    while (sector < journal_end) {
        uint32_t *type = (uint32_t *) sector;   /* Journal record type. */
        while ((uint8_t *) type < (sector + 512)) {
            if (le32_to_cpu (*type) == BITMAP_JRECORD) {
                uint32_t *nb_sectors = type + 1;    /* BITMAP_JRECORD field 2. */
                int64_t *sector_num = (int64_t *) (type + 2);   /* field 3. */
                if (s->stale_bitmap) {
                    update_both_bitmaps (s, le64_to_cpu (*sector_num),
                                         le32_to_cpu (*nb_sectors));
                    QDEBUG ("JOURNAL: recover BITMAP_JRECORD sector_num=%"
                            PRId64 " nb_sectors=%u\n",
                            le64_to_cpu (*sector_num),
                            le32_to_cpu (*nb_sectors));
                }

                /* First field of the next journal record. */
                type = (uint32_t *) (sector_num + 1);
            } else if (le32_to_cpu (*type) == TABLE_JRECORD) {
                uint32_t *offset = type + 1;    /* TABLE_JRECORD field 2. */
                uint32_t *count = type + 2;     /* TABLE_JRECORD field 3. */
                uint32_t *content = type + 3;   /* fields 4 and beyond. */
                const uint32_t chunk = le32_to_cpu (*offset);
                const uint32_t n = le32_to_cpu (*count);
                uint32_t i;
                for (i = 0; i < n; i++) {
                    s->table[chunk + i] = content[i];

                    /* The dirty bit was not cleared when the table entry was
                     * saved in the journal. */
                    CLEAN_DIRTY2 (s->table[chunk + i]);
                }
                type = content + n;     /* First field of the next record. */
                QDEBUG ("JOURNAL: recover TABLE_JRECORD chunk_start=%u "
                        "nb_chunks=%u\n", chunk, n);
            } else {
                /* End of valid records in this journal sector. */
                ASSERT (le32_to_cpu (*type) == EMPTY_JRECORD);
                break;
            }
        }

        sector += 512;
    }

    my_qemu_vfree (journal);
    flush_metadata_to_disk (bs);        /* Write the recovered metadata. */

    return 0;
}

/*
 * This function first flushes the in-memory metadata to disk and then
 * recycles the used journal sectors. It is possible to make this operation
 * asynchronous in order to improve performance. However, the overall
 * performance improvement may be limited since recycling the journal happens
 * very infrequently and updating the on-disk metadata finishes quickly
 * because of the small size of the metadata.
 */
static void recycle_journal (BDRVFvdState * s)
{
#ifdef FVD_DEBUG
    static int64_t recycle_count = 0;
    QDEBUG ("JOURNAL: start journal recycle %" PRId64 ".\n", recycle_count);
    recycle_count++;
    int64_t begin_time = qemu_clock_get_ns (QEMU_CLOCK_REALTIME);
#endif

    /* Write fresh_bitmap to disk. */
    if (s->fresh_bitmap) {
        int nb = (int) (s->bitmap_size / 512);
        QDEBUG ("JOURNAL: flush bitmap (%d sectors) to disk\n", nb);

        /* How to recover if this write fails? */
        bdrv_write (s->fvd_metadata, s->bitmap_offset, s->fresh_bitmap, nb);

        if (s->fresh_bitmap != s->stale_bitmap) {
            memcpy (s->stale_bitmap, s->fresh_bitmap, s->bitmap_size);
        }
    }

    /* Clear the DIRTY_TABLE bit and write the table to disk. */
    if (s->table) {
        int table_entries =
            (int) (ROUND_UP (s->virtual_disk_size, s->chunk_size * 512) /
                   (s->chunk_size * 512));
        int i;
        for (i = 0; i < table_entries; i++) {
            CLEAN_DIRTY (s->table[i]);
        }

        int64_t table_size = sizeof (uint32_t) * table_entries;
        table_size = ROUND_UP (table_size, DEF_PAGE_SIZE);
        int nb = (int) (table_size / 512);
        QDEBUG ("JOURNAL: flush table (%d sectors) to disk\n", nb);

        /* How to recover if this write fails? */
        bdrv_write (s->fvd_metadata, s->table_offset, (uint8_t *) s->table, nb);
    }

    s->next_journal_sector = 0;

#ifdef FVD_DEBUG
    int64_t end_time = qemu_clock_get_ns (QEMU_CLOCK_REALTIME);
    QDEBUG ("JOURNAL: journal recycle took %" PRId64 " ms.\n",
            (end_time - begin_time) / 1000000);
#endif
}

static void free_journal_sectors (BDRVFvdState * s)
{
    if (s->journal_size <= 0) {
        return;
    }

    s->ongoing_journal_updates--;
    ASSERT (s->ongoing_journal_updates >= 0);
    if (s->ongoing_journal_updates > 0 || QLIST_EMPTY (&s->wait_for_journal)) {
        return;
    }

    /* Some requests are waiting for the journal to be recycled in order to
     * get free journal sectors. */
    recycle_journal (s);

    /* Restart requests in the wait_for_journal list. First make a copy of
     * the head and then empty the head. */
    FvdAIOCB *acb = QLIST_FIRST (&s->wait_for_journal);
    QLIST_INIT (&s->wait_for_journal);
    FvdAIOCB *next;

    /* Restart all dependent requests. Cannot use QLIST_FOREACH here, because
     * the next link might not be the same any more after the callback. */
    while (acb) {
        next = acb->jcb.next_wait_for_journal.le_next;
        acb->jcb.next_wait_for_journal.le_prev = NULL;
        QDEBUG ("WRITE: acb%llu-%p restart_write_metadata_to_journal "
                "after recycle_journal\n", acb->uuid, acb);
        write_metadata_to_journal (acb);
        acb = next;
    }
}

static int64_t allocate_journal_sectors (BDRVFvdState * s, FvdAIOCB * acb,
                                         int num_sectors)
{
    ASSERT (num_sectors <= s->journal_size);

    if (!QLIST_EMPTY (&s->wait_for_journal)) {
        /* Waiting for journal recycle to finish. */
        ASSERT (s->ongoing_journal_updates > 0);
        QDEBUG ("WRITE: acb%llu-%p wait_for_journal_recycle\n",
                acb->uuid, acb);
        QLIST_INSERT_HEAD (&s->wait_for_journal, acb,
                           jcb.next_wait_for_journal);
        return -1;
    }

    int64_t journal_sec;
    if (s->next_journal_sector + num_sectors <= s->journal_size) {
      alloc_sector:
        journal_sec = s->next_journal_sector;
        s->next_journal_sector += num_sectors;
        s->ongoing_journal_updates++;
        return journal_sec;
    }

    /* No free journal sector is available. Check if the journal can be
     * recycled now. */
    if (s->ongoing_journal_updates == 0) {
        recycle_journal (s);
        goto alloc_sector;
    }

    /* Wait for journal recycle to finish. The request will be woken up later
     * in free_journal_sectors(). */
    QLIST_INSERT_HEAD (&s->wait_for_journal, acb, jcb.next_wait_for_journal);
    QDEBUG ("WRITE: acb%llu-%p wait_for_journal_recycle\n", acb->uuid, acb);
    return -1;
}

static void finish_write_journal (void *opaque, int ret)
{
    FvdAIOCB *acb = (FvdAIOCB *) opaque;
    BlockDriverState *bs = acb->common.bs;
    BDRVFvdState *s = bs->opaque;

    if (ret == 0) {
        QDEBUG ("JOURNAL: acb%llu-%p finish_write_journal\n", acb->uuid, acb);

        if (s->table) {
            /* Update the table. */
            int i;
            const uint32_t first_chunk = acb->sector_num / s->chunk_size;
            const uint32_t last_chunk = (acb->sector_num + acb->nb_sectors - 1)
                                        / s->chunk_size;
            for (i = first_chunk; i <= last_chunk; i++) {
                CLEAN_DIRTY2 (s->table[i]);
            }
        }

        if (s->stale_bitmap) {
            /* If fresh_bitmap differs from stale_bitmap, fresh_bitmap has
             * already been updated in finish_write_data() when invoking
             * update_fresh_bitmap_and_check_stale_bitmap(). */
            update_stale_bitmap (s, acb->sector_num, acb->nb_sectors);
        }
    } else {
        QDEBUG ("JOURNAL: acb%llu-%p finish_write_journal error ret=%d\n",
                acb->uuid, acb, ret);
    }

    /* Clean up. */
    if (acb->type == OP_STORE_COMPACT) {
        acb->common.cb (acb->common.opaque, ret);
        if (acb->jcb.iov.iov_base != NULL) {
            my_qemu_vfree (acb->jcb.iov.iov_base);
        }
        my_qemu_aio_unref (acb);
    } else {
        ASSERT (acb->type == OP_WRITE);
        finish_write (acb, ret);
    }

    free_journal_sectors (s);
}

static void write_metadata_to_journal (FvdAIOCB * acb)
{
    BlockDriverState *bs = acb->common.bs;
    BDRVFvdState *s = bs->opaque;
    int64_t journal_sec;
    int num_journal_sectors;

    ASSERT ((s->table || s->fresh_bitmap)
            && (acb->type == OP_WRITE || acb->type == OP_STORE_COMPACT));

    /* Is the journal disabled? */
    if (s->journal_size <= 0) {
        finish_write_journal (acb, 0);
        return;
    }

    if (!s->table) {
        /* Only update the bitmap. */
        num_journal_sectors = 1;
        journal_sec = allocate_journal_sectors (s, acb, num_journal_sectors);
        if (journal_sec < 0) {
            /* No journal sector is available now. The request will be woken
             * up later in free_journal_sectors(). */
            return;
        }

        acb->jcb.iov.iov_len = 512;
        acb->jcb.iov.iov_base = my_qemu_blockalign (s->fvd_metadata, 512);

        uint32_t *type = (uint32_t *) acb->jcb.iov.iov_base;    /* Field 1. */
        uint32_t *nb_sectors = type + 1;        /* BITMAP_JRECORD field 2. */
        int64_t *sector_num = (int64_t *) (type + 2);           /* Field 3. */
        *type = cpu_to_le32 (BITMAP_JRECORD);
        *nb_sectors = cpu_to_le32 ((uint32_t) acb->nb_sectors);
        *sector_num = cpu_to_le64 (acb->sector_num);
        *((uint32_t *) (sector_num + 1)) = EMPTY_JRECORD;  /* Mark record end. */

    } else if (!s->fresh_bitmap) {
        /* Only update the table. */
        const int64_t first_chunk = acb->sector_num / s->chunk_size;
        const int64_t last_chunk = (acb->sector_num + acb->nb_sectors - 1)
                                   / s->chunk_size;
        int num_chunks = last_chunk - first_chunk + 1;
        num_journal_sectors = (num_chunks + TABLE_JRECORDS_PER_SECTOR - 1)
                              / TABLE_JRECORDS_PER_SECTOR;
        journal_sec = allocate_journal_sectors (s, acb, num_journal_sectors);
        if (journal_sec < 0) {
            /* No journal sector is available now. The request will be woken
             * up later in free_journal_sectors(). */
            return;
        }

        acb->jcb.iov.iov_len = num_journal_sectors * 512;
        acb->jcb.iov.iov_base = my_qemu_blockalign (s->fvd_metadata,
                                                    acb->jcb.iov.iov_len);

        uint32_t *type = (uint32_t *) acb->jcb.iov.iov_base;    /* Field 1. */
        int64_t chunk = first_chunk;

        while (1) {
            /* Start a new journal sector. */
            uint32_t *offset = type + 1;        /* TABLE_JRECORD field 2. */
            uint32_t *count = type + 2;         /* TABLE_JRECORD field 3. */
            uint32_t *content = type + 3;       /* Fields 4 and beyond. */
            *type = cpu_to_le32 (TABLE_JRECORD);
            *offset = cpu_to_le32 (chunk);

            if (num_chunks <= TABLE_JRECORDS_PER_SECTOR) {
                /* This is the last journal sector. */
                *count = cpu_to_le32 (num_chunks);
                memcpy (content, &s->table[chunk],
                        sizeof (uint32_t) * num_chunks);
                if (num_chunks < TABLE_JRECORDS_PER_SECTOR) {
                    *(content + num_chunks) = EMPTY_JRECORD;    /* Mark end. */
                }
                break;
            }

            *count = cpu_to_le32 (TABLE_JRECORDS_PER_SECTOR);
            memcpy (content, &s->table[chunk],
                    sizeof (uint32_t) * TABLE_JRECORDS_PER_SECTOR);
            chunk += TABLE_JRECORDS_PER_SECTOR;
            num_chunks -= TABLE_JRECORDS_PER_SECTOR;

            /* Next TABLE_JRECORD field 1. */
            type = content + TABLE_JRECORDS_PER_SECTOR;
        }

    } else {
        /* Update both the table and the bitmap. This may use multiple journal
         * sectors. Each sector is self-contained, including a TABLE_JRECORD
         * and a BITMAP_JRECORD. The two records in the same sector cover the
         * same range of virtual disk data. The purpose is to update the
         * corresponding parts of the bitmap and the table in one atomic
         * operation. */
        const int64_t first_chunk = acb->sector_num / s->chunk_size;
        const int64_t last_chunk = (acb->sector_num + acb->nb_sectors - 1)
                                   / s->chunk_size;
        int num_chunks = last_chunk - first_chunk + 1;
        num_journal_sectors = (num_chunks + MIXED_JRECORDS_PER_SECTOR - 1)
                              / MIXED_JRECORDS_PER_SECTOR;
        journal_sec = allocate_journal_sectors (s, acb, num_journal_sectors);
        if (journal_sec < 0) {
            /* No journal sector is available now. The request will be woken
             * up later in free_journal_sectors(). */
            return;
        }

        acb->jcb.iov.iov_len = num_journal_sectors * 512;
        acb->jcb.iov.iov_base = my_qemu_blockalign (s->fvd_metadata,
                                                    acb->jcb.iov.iov_len);

        uint32_t *type = (uint32_t *) acb->jcb.iov.iov_base;    /* Field 1. */
        int64_t chunk = first_chunk;
        int64_t sector_num = acb->sector_num;
        uint32_t nb_sectors;
        if (num_journal_sectors == 1) {
            nb_sectors = acb->nb_sectors;
        } else {
            /* Number of data sectors covered by the first journal sector. */
            nb_sectors = (first_chunk + MIXED_JRECORDS_PER_SECTOR)
                         * s->chunk_size - acb->sector_num;
        }

        while (1) {
            /* Start a new journal sector. */
            uint32_t *offset = type + 1;        /* TABLE_JRECORD field 2. */
            uint32_t *count = type + 2;         /* TABLE_JRECORD field 3. */
            uint32_t *content = type + 3;       /* Fields 4 and beyond. */
            *type = cpu_to_le32 (TABLE_JRECORD);
            *offset = cpu_to_le32 (chunk);

            if (num_chunks <= MIXED_JRECORDS_PER_SECTOR) {
                /* This is the last journal sector. */
                *count = cpu_to_le32 (num_chunks);
                memcpy (content, &s->table[chunk],
                        sizeof (uint32_t) * num_chunks);

                /* A BITMAP_JRECORD follows the TABLE_JRECORD so that they are
                 * updated in one atomic operation. */
                type = content + num_chunks;    /* BITMAP_JRECORD field 1. */
                uint32_t *p_nb_sectors = type + 1;   /* BITMAP_JRECORD field 2. */
                int64_t *p_sector_num = (int64_t *) (type + 2);    /* Field 3. */
                *type = cpu_to_le32 (BITMAP_JRECORD);
                *p_nb_sectors = cpu_to_le32 (nb_sectors);
                *p_sector_num = cpu_to_le64 (sector_num);

                if (num_chunks < MIXED_JRECORDS_PER_SECTOR) {
                    *((uint32_t *) (p_sector_num + 1)) = EMPTY_JRECORD; /* End. */
                }
                break;
            }

            *count = cpu_to_le32 (MIXED_JRECORDS_PER_SECTOR);
            memcpy (content, &s->table[chunk],
                    sizeof (uint32_t) * MIXED_JRECORDS_PER_SECTOR);

            /* A BITMAP_JRECORD follows the TABLE_JRECORD so that they are
             * updated in one atomic operation. */
            type = content + MIXED_JRECORDS_PER_SECTOR;     /* Field 1. */
            uint32_t *p_nb_sectors = type + 1;  /* BITMAP_JRECORD field 2. */
            int64_t *p_sector_num = (int64_t *) (type + 2); /* Field 3. */
            *type = cpu_to_le32 (BITMAP_JRECORD);
            *p_nb_sectors = cpu_to_le32 (nb_sectors);
            *p_sector_num = cpu_to_le64 (sector_num);

            /* Prepare for the next journal sector. */
            type = (uint32_t *) (p_sector_num + 1);
            chunk += MIXED_JRECORDS_PER_SECTOR;
            sector_num = chunk * s->chunk_size;
            num_chunks -= MIXED_JRECORDS_PER_SECTOR;
            if (num_chunks <= MIXED_JRECORDS_PER_SECTOR) {
                /* Data sectors covered by the last journal sector. */
                nb_sectors = (acb->sector_num + acb->nb_sectors)
                             - chunk * s->chunk_size;
            } else {
                nb_sectors = s->chunk_size * MIXED_JRECORDS_PER_SECTOR;
            }
        }
    }

    QDEBUG ("JOURNAL: acb%llu-%p write_metadata_to_journal journal_sec=%"
            PRId64 " nb_journal_sectors=%d\n", acb->uuid, acb, journal_sec,
            num_journal_sectors);
    qemu_iovec_init_external (&acb->jcb.qiov, &acb->jcb.iov, 1);
    acb->jcb.hd_acb = bdrv_aio_writev (s->fvd_metadata,
                                       s->journal_offset + journal_sec,
                                       &acb->jcb.qiov, num_journal_sectors,
                                       finish_write_journal, acb);
    if (!acb->jcb.hd_acb) {
        finish_write_journal (acb, -1);
    }
}

#ifdef FVD_DEBUG
static int emulate_host_crash = TRUE;
#else
static int emulate_host_crash = FALSE;
#endif

static void flush_metadata_to_disk_on_exit (BlockDriverState *bs)
{
    BDRVFvdState *s = bs->opaque;

    if (bs->read_only || !s->fvd_metadata) {
        return;
    }

    /* If (emulate_host_crash==TRUE), do not flush metadata to disk so that
     * recovery has to rely on the journal. */
    if (s->journal_size <= 0 || !emulate_host_crash) {
        flush_metadata_to_disk (bs);
        if (!s->dirty_image) {
            update_clean_shutdown_flag (s, TRUE);
        }
    }
}

void fvd_enable_host_crash_test (void)
{
    emulate_host_crash = TRUE;
}