/* Provenance: qemu/ar7.git, block/fvd-write.c, merge tag 'v9.0.0-rc3',
 * blob 4678606e7407a7e4eaa83914f4148f6ba109b4be */
/*
 * Copyright (c) 2010-2011 IBM
 *
 * Authors:
 *         Chunqiang Tang <ctang@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.
 * See the COPYING file in the top-level directory.
 */
/*=============================================================================
 *  A short description: this module implements bdrv_aio_writev() for FVD.
 *===========================================================================*/
15 static BlockDriverAIOCB *fvd_aio_writev (BlockDriverState * bs,
16 int64_t sector_num,
17 QEMUIOVector * qiov, int nb_sectors,
18 BlockDriverCompletionFunc * cb,
19 void *opaque)
21 BDRVFvdState *s = bs->opaque;
22 FvdAIOCB *acb;
24 TRACE_REQUEST (TRUE, sector_num, nb_sectors);
26 if (!s->data_region_prepared) {
27 init_data_region (s);
30 if (s->prefetch_state == PREFETCH_STATE_FINISHED
31 || sector_num >= s->nb_sectors_in_base_img) {
32 /* This is an efficient case. See Section 3.3.5 of the FVD-cow paper.
33 * This also covers the case of no base image. */
34 return store_data (FALSE, NULL, bs, sector_num, qiov,
35 nb_sectors, cb, opaque);
38 /* Check if all requested sectors are in the FVD data file. */
39 int64_t sec = ROUND_DOWN (sector_num, s->block_size);
40 int64_t sec_in_last_block = ROUND_DOWN (sector_num + nb_sectors - 1,
41 s->block_size);
42 do {
43 if (stale_bitmap_show_sector_in_base_img (sec, s)) {
44 goto slow_path;
46 sec += s->block_size;
47 } while (sec <= sec_in_last_block);
49 /* This is the fast path, as all requested data are in the FVD data file
50 * and no need to update the bitmap. */
51 return store_data (FALSE, NULL, bs, sector_num, qiov,
52 nb_sectors, cb, opaque);
54 slow_path:
55 acb = my_qemu_aio_get (&fvd_aio_pool, bs, cb, opaque);
56 if (!acb) {
57 return NULL;
60 acb->type = OP_WRITE;
61 acb->sector_num = sector_num;
62 acb->nb_sectors = nb_sectors;
63 acb->write.ret = 0;
64 acb->write.update_table = FALSE;
65 acb->write.qiov = qiov;
66 acb->write.hd_acb = NULL;
67 acb->write.cow_buf = NULL;
68 acb->copy_lock.next.le_prev = NULL;
69 acb->write.next_write_lock.le_prev = NULL;
70 acb->write.next_dependent_write.le_prev = NULL;
71 acb->jcb.iov.iov_base = NULL;
72 acb->jcb.hd_acb = NULL;
73 acb->jcb.next_wait_for_journal.le_prev = NULL;
74 QLIST_INIT (&acb->copy_lock.dependent_writes);
76 QDEBUG ("WRITE: acb%llu-%p start sector_num=%" PRId64 " nb_sectors=%d\n",
77 acb->uuid, acb, acb->sector_num, acb->nb_sectors);
79 if (do_aio_write (acb) < 0) {
80 my_qemu_aio_unref (acb);
81 return NULL;
83 #ifdef FVD_DEBUG
84 pending_local_writes++;
85 #endif
86 return &acb->common;
#if 0
/*
 * Cancel an in-flight write (currently compiled out).
 *
 * Cancels the outstanding data I/O and journal I/O, if any, unlinks the acb
 * from the wait-for-journal and dependent-write lists when it is on them,
 * and then releases everything via free_write_resource().
 *
 * NOTE(review): free_write_resource() is defined after this function in this
 * file; re-enabling this block needs a prior declaration -- confirm one
 * exists before removing the #if 0.
 */
static void fvd_write_cancel (FvdAIOCB * acb)
{
    if (acb->write.hd_acb) {
        bdrv_aio_cancel (acb->write.hd_acb);
    }
    if (acb->jcb.hd_acb) {
        bdrv_aio_cancel (acb->jcb.hd_acb);
        free_journal_sectors (acb->common.bs->opaque);
    }
    if (acb->jcb.next_wait_for_journal.le_prev) {
        QLIST_REMOVE (acb, jcb.next_wait_for_journal);
    }
    if (acb->write.next_dependent_write.le_prev) {
        QLIST_REMOVE (acb, write.next_dependent_write);
    }
    free_write_resource (acb);
}
#endif
109 static void free_write_resource (FvdAIOCB * acb)
111 if (acb->write.next_write_lock.le_prev) {
112 QLIST_REMOVE (acb, write.next_write_lock);
114 if (acb->copy_lock.next.le_prev) {
115 QLIST_REMOVE (acb, copy_lock.next);
116 restart_dependent_writes (acb);
118 if (acb->write.cow_buf) {
119 my_qemu_vfree (acb->write.cow_buf);
121 if (acb->jcb.iov.iov_base != NULL) {
122 my_qemu_vfree (acb->jcb.iov.iov_base);
125 my_qemu_aio_unref (acb);
127 #ifdef FVD_DEBUG
128 pending_local_writes--;
129 #endif
132 static inline void finish_write (FvdAIOCB * acb, int ret)
134 QDEBUG ("WRITE: acb%llu-%p completely_finished ret=%d\n", acb->uuid, acb,
135 ret);
136 acb->common.cb (acb->common.opaque, ret);
137 free_write_resource (acb);
140 static void finish_write_data (void *opaque, int ret)
142 FvdAIOCB *acb = opaque;
143 BlockDriverState *bs = acb->common.bs;
144 BDRVFvdState *s = bs->opaque;
146 acb->write.ret = ret;
147 acb->write.hd_acb = NULL;
149 if (ret != 0) {
150 QDEBUG ("WRITE: acb%llu-%p finish_write_data error ret=%d\n",
151 acb->uuid, acb, ret);
152 finish_write (acb, ret);
153 return;
156 QDEBUG ("WRITE: acb%llu-%p finish_write_data\n", acb->uuid, acb);
158 /* Figure out whether to update metadata or not. */
159 if (s->fresh_bitmap == s->stale_bitmap) {
160 /* This is the case if neither copy_on_read nor prefetching is
161 * enabled. Cannot update fresh_bitmap until the on-disk metadata is
162 * updated. */
163 if (acb->write.update_table || stale_bitmap_need_update (acb)) {
164 /* Cannot release lock on data now since fresh_bitmap has not been
165 * updated. Otherwise, a copy-on-write or copy-on-read operation
166 * may use data from the backing image to overwrite the data just
167 * been written. */
168 write_metadata_to_journal (acb);
169 } else {
170 finish_write (acb, ret); /* No need to update metadata. */
172 return;
175 /* stale_bitmap and fresh_bitmap are different. Now we can update
176 * fresh_bitmap. stale_bitmap will be updated after the on-disk metadata
177 * are updated. */
178 int update_stale_bitmap = update_fresh_bitmap_and_check_stale_bitmap (acb);
180 if (acb->write.update_table || update_stale_bitmap) {
181 /* Release lock on data now since fresh_bitmap has been updated. */
182 QLIST_REMOVE (acb, write.next_write_lock);
183 acb->write.next_write_lock.le_prev = NULL;
184 if (acb->copy_lock.next.le_prev) {
185 QLIST_REMOVE (acb, copy_lock.next);
186 restart_dependent_writes (acb);
189 write_metadata_to_journal (acb);
190 } else {
191 finish_write (acb, ret);
195 static void finish_read_backing_for_copy_on_write (void *opaque, int ret)
197 FvdAIOCB *acb = (FvdAIOCB *) opaque;
198 BlockDriverState *bs = acb->common.bs;
200 if (ret != 0) {
201 QDEBUG ("WRITE: acb%llu-%p finish_read_from_backing with error "
202 "ret=%d\n", acb->uuid, acb, ret);
203 finish_write (acb, ret);
204 } else {
205 QDEBUG ("WRITE: acb%llu-%p "
206 "finish_read_from_backing_and_start_write_data\n",
207 acb->uuid, acb);
208 acb->write.hd_acb = store_data (FALSE, acb, bs,
209 acb->write.cow_start_sector,
210 acb->write.cow_qiov,
211 acb->write.cow_qiov->size / 512,
212 finish_write_data, acb);
213 if (!acb->write.hd_acb) {
214 finish_write (acb, -1);
219 static int do_aio_write (FvdAIOCB * acb)
221 BlockDriverState *bs = acb->common.bs;
222 BDRVFvdState *s = bs->opaque;
224 /* Calculate the data region need be locked. */
225 const int64_t sector_end = acb->sector_num + acb->nb_sectors;
226 const int64_t block_begin = ROUND_DOWN (acb->sector_num, s->block_size);
227 int64_t block_end = ROUND_UP (sector_end, s->block_size);
229 /* Check for conflicting copy-on-reads. */
230 FvdAIOCB *old;
231 QLIST_FOREACH (old, &s->copy_locks, copy_lock.next) {
232 if (old->copy_lock.end > acb->sector_num &&
233 sector_end > old->copy_lock.begin) {
234 QLIST_INSERT_HEAD (&old->copy_lock.dependent_writes, acb,
235 write.next_dependent_write);
236 QDEBUG ("WRITE: acb%llu-%p put_on_hold_due_to_data_conflict "
237 "with %s acb%llu-%p\n", acb->uuid, acb,
238 old->type == OP_WRITE ? "write" : "copy_on_read",
239 old->uuid, old);
240 return 0;
244 /* No conflict. Now check if this write updates partial blocks and hence
245 * need to read those blocks from the base image and merge with this
246 * write. */
247 int read_first_block, read_last_block;
248 if (acb->sector_num % s->block_size == 0) {
249 read_first_block = FALSE;
250 } else
251 if (fresh_bitmap_show_sector_in_base_img (acb->sector_num, s)) {
252 read_first_block = TRUE;
253 } else {
254 read_first_block = FALSE;
257 if (sector_end % s->block_size == 0) {
258 read_last_block = FALSE;
259 } else if (fresh_bitmap_show_sector_in_base_img (sector_end - 1, s)) {
260 read_last_block = TRUE;
261 } else {
262 read_last_block = FALSE;
265 if (read_first_block) {
266 if (read_last_block) {
267 /* Case 1: Read all the blocks involved from the base image. */
268 const QEMUIOVector *old_qiov = acb->write.qiov;
269 if (block_end > s->nb_sectors_in_base_img) {
270 block_end = s->nb_sectors_in_base_img;
273 int buf_size = (block_end - block_begin) * 512
274 + 2 * sizeof (QEMUIOVector)
275 + sizeof (struct iovec) * (old_qiov->niov + 3);
276 buf_size = ROUND_UP (buf_size, 512);
277 acb->write.cow_buf = my_qemu_blockalign (bs->backing_hd, buf_size);
279 /* For reading from the base image. */
280 QEMUIOVector *read_qiov = (QEMUIOVector *) (acb->write.cow_buf +
281 (block_end - block_begin) * 512);
282 read_qiov->iov = (struct iovec *) (read_qiov + 1);
283 read_qiov->nalloc = -1;
284 read_qiov->niov = 1;
285 read_qiov->iov[0].iov_base = acb->write.cow_buf;
286 read_qiov->iov[0].iov_len = read_qiov->size =
287 (block_end - block_begin) * 512;
289 /* For writing to the FVD data file. */
290 QEMUIOVector *write_qiov = (QEMUIOVector *) (read_qiov->iov + 1);
291 write_qiov->iov = (struct iovec *) (write_qiov + 1);
292 write_qiov->nalloc = -1;
293 write_qiov->niov = old_qiov->niov + 2;
294 write_qiov->size = read_qiov->size;
296 /* The first entry is for data read from the base image. */
297 write_qiov->iov[0].iov_base = acb->write.cow_buf;
298 write_qiov->iov[0].iov_len = (acb->sector_num - block_begin) * 512;
299 memcpy (&write_qiov->iov[1], old_qiov->iov,
300 sizeof (struct iovec) * old_qiov->niov);
302 /* The last entry is for data read from the base image. */
303 write_qiov->iov[old_qiov->niov + 1].iov_base = acb->write.cow_buf
304 + (sector_end - block_begin) * 512;
305 write_qiov->iov[old_qiov->niov + 1].iov_len =
306 (block_end - sector_end) * 512;
307 acb->write.cow_qiov = write_qiov;
308 acb->write.cow_start_sector = block_begin;
310 acb->write.hd_acb = bdrv_aio_readv (bs->backing_hd, block_begin,
311 read_qiov, block_end - block_begin,
312 finish_read_backing_for_copy_on_write, acb);
313 if (!acb->write.hd_acb) {
314 goto fail;
317 acb->copy_lock.begin = block_begin;
318 acb->copy_lock.end = block_end;
319 QLIST_INSERT_HEAD (&s->copy_locks, acb, copy_lock.next);
320 QDEBUG ("WRITE: acb%llu-%p "
321 "read_first_last_partial_blocks_from_backing sector_num=%"
322 PRId64 " nb_sectors=%d\n", acb->uuid, acb, block_begin,
323 (int) (block_end - block_begin));
324 } else {
325 /* Case 2: Read the first block from the base image. */
326 int nb = acb->sector_num - block_begin;
327 const QEMUIOVector *old_qiov = acb->write.qiov;
329 /* Space for data and metadata. */
330 int buf_size = nb * 512 + 2 * sizeof (QEMUIOVector)
331 + sizeof (struct iovec) * (old_qiov->niov + 2);
332 buf_size = ROUND_UP (buf_size, 512);
333 acb->write.cow_buf = my_qemu_blockalign (bs->backing_hd, buf_size);
335 /* For reading from the base image. */
336 QEMUIOVector *read_qiov =
337 (QEMUIOVector *) (acb->write.cow_buf + nb * 512);
338 read_qiov->iov = (struct iovec *) (read_qiov + 1);
339 read_qiov->nalloc = -1;
340 read_qiov->niov = 1;
341 read_qiov->iov[0].iov_base = acb->write.cow_buf;
342 read_qiov->iov[0].iov_len = read_qiov->size = nb * 512;
344 /* For writing to the FVD data file. */
345 QEMUIOVector *write_qiov = (QEMUIOVector *) (read_qiov->iov + 1);
346 write_qiov->iov = (struct iovec *) (write_qiov + 1);
347 write_qiov->nalloc = -1;
348 write_qiov->niov = old_qiov->niov + 1;
349 write_qiov->size = old_qiov->size + read_qiov->size;
351 /* The first entry is added for data read from the base image. */
352 write_qiov->iov[0].iov_base = acb->write.cow_buf;
353 write_qiov->iov[0].iov_len = read_qiov->size;
354 memcpy (&write_qiov->iov[1], old_qiov->iov,
355 sizeof (struct iovec) * old_qiov->niov);
356 acb->write.cow_qiov = write_qiov;
357 acb->write.cow_start_sector = block_begin;
359 acb->write.hd_acb = bdrv_aio_readv (bs->backing_hd,
360 block_begin, read_qiov, nb,
361 finish_read_backing_for_copy_on_write, acb);
362 if (!acb->write.hd_acb) {
363 goto fail;
366 acb->copy_lock.begin = block_begin;
367 acb->copy_lock.end = block_begin + s->block_size;
368 QLIST_INSERT_HEAD (&s->copy_locks, acb, copy_lock.next);
369 QDEBUG ("WRITE: acb%llu-%p read_first_partial_block_from_backing "
370 "sector_num=%" PRId64 " nb_sectors=%d\n",
371 acb->uuid, acb, block_begin, nb);
373 } else {
374 if (read_last_block) {
375 /* Case 3: Read the last block from the base image. */
376 int nb;
377 if (block_end < s->nb_sectors_in_base_img) {
378 nb = block_end - sector_end;
380 else {
381 nb = s->nb_sectors_in_base_img - sector_end;
383 const QEMUIOVector *old_qiov = acb->write.qiov;
385 /* Space for data and metadata. */
386 int buf_size = nb * 512 + 2 * sizeof (QEMUIOVector)
387 + sizeof (struct iovec) * (old_qiov->niov + 2);
388 buf_size = ROUND_UP (buf_size, 512);
389 acb->write.cow_buf = my_qemu_blockalign (bs->backing_hd, buf_size);
391 /* For reading from the base image. */
392 QEMUIOVector *read_qiov = (QEMUIOVector *) (acb->write.cow_buf
393 + nb * 512);
394 read_qiov->iov = (struct iovec *) (read_qiov + 1);
395 read_qiov->nalloc = -1;
396 read_qiov->niov = 1;
397 read_qiov->iov[0].iov_base = acb->write.cow_buf;
398 read_qiov->iov[0].iov_len = read_qiov->size = nb * 512;
400 /* For writing to the FVD data file. */
401 QEMUIOVector *write_qiov = (QEMUIOVector *) (read_qiov->iov + 1);
402 write_qiov->iov = (struct iovec *) (write_qiov + 1);
403 write_qiov->nalloc = -1;
404 write_qiov->niov = old_qiov->niov + 1;
405 write_qiov->size = old_qiov->size + read_qiov->size;
406 memcpy (write_qiov->iov, old_qiov->iov,
407 sizeof (struct iovec) * old_qiov->niov);
409 /* The last appended entry is for data read from the base image. */
410 write_qiov->iov[old_qiov->niov].iov_base = acb->write.cow_buf;
411 write_qiov->iov[old_qiov->niov].iov_len = read_qiov->size;
412 acb->write.cow_qiov = write_qiov;
413 acb->write.cow_start_sector = acb->sector_num;
415 acb->write.hd_acb = bdrv_aio_readv (bs->backing_hd,
416 sector_end, read_qiov, nb,
417 finish_read_backing_for_copy_on_write, acb);
418 if (!acb->write.hd_acb) {
419 goto fail;
422 acb->copy_lock.end = block_end;
423 acb->copy_lock.begin = block_end - s->block_size;
424 QLIST_INSERT_HEAD (&s->copy_locks, acb, copy_lock.next);
425 QDEBUG ("WRITE: acb%llu-%p read_last_partial_block_from_backing "
426 "sector_num=%" PRId64 " nb_sectors=%d\n",
427 acb->uuid, acb, sector_end, nb);
428 } else {
429 /* Case 4: Can write directly and no need to merge with data from
430 * the base image. */
431 QDEBUG ("WRITE: acb%llu-%p "
432 "write_fvd_without_read_partial_block_from_backing\n",
433 acb->uuid, acb);
434 acb->write.hd_acb = store_data (FALSE, acb, bs, acb->sector_num,
435 acb->write.qiov, acb->nb_sectors,
436 finish_write_data, acb);
437 if (!acb->write.hd_acb) {
438 goto fail;
443 QLIST_INSERT_HEAD (&s->write_locks, acb, write.next_write_lock);
444 return 0;
446 fail:
447 if (acb->write.cow_buf) {
448 my_qemu_vfree (acb->write.cow_buf);
450 return -1;