/*
 * Copyright (c) 2010-2011 IBM
 *
 * Chunqiang Tang <ctang@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.
 * See the COPYING file in the top-level directory.
 */

/*=============================================================================
 *  A short description: this module implements bdrv_aio_writev() for FVD.
 *===========================================================================*/
/* Entry point for FVD's asynchronous write path (bdrv_aio_writev).
 * Fast paths hand the request straight to store_data(); the slow path
 * allocates an FvdAIOCB and defers to do_aio_write().
 * NOTE(review): this extraction is missing several source lines (e.g. the
 * int64_t sector_num and void *opaque parameters, the FvdAIOCB *acb
 * declaration, the do-loop header, the slow-path label and the returns);
 * each gap is flagged below — confirm against the original file. */
static BlockDriverAIOCB *fvd_aio_writev (BlockDriverState * bs,
                                         /* NOTE(review): parameter line(s) missing here (orig line 16). */
                                         QEMUIOVector * qiov, int nb_sectors,
                                         BlockDriverCompletionFunc * cb,
                                         /* NOTE(review): final parameter, ')' and '{' missing (orig lines 19-20). */
    BDRVFvdState *s = bs->opaque;

    TRACE_REQUEST (TRUE, sector_num, nb_sectors);

    /* Lazily prepare the data region on the first request. */
    if (!s->data_region_prepared) {
        /* NOTE(review): body and closing brace of this if missing (orig lines 27-29). */

    if (s->prefetch_state == PREFETCH_STATE_FINISHED
        || sector_num >= s->nb_sectors_in_base_img) {
        /* This is an efficient case. See Section 3.3.5 of the FVD-cow paper.
         * This also covers the case of no base image. */
        return store_data (FALSE, NULL, bs, sector_num, qiov,
                           nb_sectors, cb, opaque);
        /* NOTE(review): closing brace missing (orig lines 36-37). */

    /* Check if all requested sectors are in the FVD data file. */
    int64_t sec = ROUND_DOWN (sector_num, s->block_size);
    int64_t sec_in_last_block = ROUND_DOWN (sector_num + nb_sectors - 1,
        /* NOTE(review): second ROUND_DOWN argument and the 'do {' loop
         * header are missing (orig lines 41-42). */
        if (stale_bitmap_show_sector_in_base_img (sec, s)) {
        /* NOTE(review): loop body (branch to the slow path; advance 'sec')
         * missing (orig lines 44-46). */
    } while (sec <= sec_in_last_block);

    /* This is the fast path, as all requested data are in the FVD data file
     * and no need to update the bitmap. */
    return store_data (FALSE, NULL, bs, sector_num, qiov,
                       nb_sectors, cb, opaque);

    /* Slow path: allocate a request control block to track this write.
     * NOTE(review): the slow-path label is missing (orig lines 53-54). */
    acb = my_qemu_aio_get (&fvd_aio_pool, bs, cb, opaque);
    /* NOTE(review): NULL-check of acb appears to be missing (orig lines 56-60). */

    acb->sector_num = sector_num;
    acb->nb_sectors = nb_sectors;
    /* NOTE(review): one initializer line missing here (orig line 63). */
    acb->write.update_table = FALSE;
    acb->write.qiov = qiov;
    acb->write.hd_acb = NULL;
    acb->write.cow_buf = NULL;
    acb->copy_lock.next.le_prev = NULL;
    acb->write.next_write_lock.le_prev = NULL;
    acb->write.next_dependent_write.le_prev = NULL;
    acb->jcb.iov.iov_base = NULL;
    acb->jcb.hd_acb = NULL;
    acb->jcb.next_wait_for_journal.le_prev = NULL;
    QLIST_INIT (&acb->copy_lock.dependent_writes);

    QDEBUG ("WRITE: acb%llu-%p start sector_num=%" PRId64 " nb_sectors=%d\n",
            acb->uuid, acb, acb->sector_num, acb->nb_sectors);

    if (do_aio_write (acb) < 0) {
        my_qemu_aio_unref (acb);
        /* NOTE(review): failure return and closing brace missing (orig lines 81-83). */

    pending_local_writes++;
    /* NOTE(review): the success return (presumably &acb->common) and the
     * function's closing brace are missing (orig lines 85+). */
90 static void fvd_write_cancel (FvdAIOCB
* acb
)
92 if (acb
->write
.hd_acb
) {
93 bdrv_aio_cancel (acb
->write
.hd_acb
);
95 if (acb
->jcb
.hd_acb
) {
96 bdrv_aio_cancel (acb
->jcb
.hd_acb
);
97 free_journal_sectors (acb
->common
.bs
->opaque
);
99 if (acb
->jcb
.next_wait_for_journal
.le_prev
) {
100 QLIST_REMOVE (acb
, jcb
.next_wait_for_journal
);
102 if (acb
->write
.next_dependent_write
.le_prev
) {
103 QLIST_REMOVE (acb
, write
.next_dependent_write
);
105 free_write_resource (acb
);
109 static void free_write_resource (FvdAIOCB
* acb
)
111 if (acb
->write
.next_write_lock
.le_prev
) {
112 QLIST_REMOVE (acb
, write
.next_write_lock
);
114 if (acb
->copy_lock
.next
.le_prev
) {
115 QLIST_REMOVE (acb
, copy_lock
.next
);
116 restart_dependent_writes (acb
);
118 if (acb
->write
.cow_buf
) {
119 my_qemu_vfree (acb
->write
.cow_buf
);
121 if (acb
->jcb
.iov
.iov_base
!= NULL
) {
122 my_qemu_vfree (acb
->jcb
.iov
.iov_base
);
125 my_qemu_aio_unref (acb
);
128 pending_local_writes
--;
132 static inline void finish_write (FvdAIOCB
* acb
, int ret
)
134 QDEBUG ("WRITE: acb%llu-%p completely_finished ret=%d\n", acb
->uuid
, acb
,
136 acb
->common
.cb (acb
->common
.opaque
, ret
);
137 free_write_resource (acb
);
140 static void finish_write_data (void *opaque
, int ret
)
142 FvdAIOCB
*acb
= opaque
;
143 BlockDriverState
*bs
= acb
->common
.bs
;
144 BDRVFvdState
*s
= bs
->opaque
;
146 acb
->write
.ret
= ret
;
147 acb
->write
.hd_acb
= NULL
;
150 QDEBUG ("WRITE: acb%llu-%p finish_write_data error ret=%d\n",
151 acb
->uuid
, acb
, ret
);
152 finish_write (acb
, ret
);
156 QDEBUG ("WRITE: acb%llu-%p finish_write_data\n", acb
->uuid
, acb
);
158 /* Figure out whether to update metadata or not. */
159 if (s
->fresh_bitmap
== s
->stale_bitmap
) {
160 /* This is the case if neither copy_on_read nor prefetching is
161 * enabled. Cannot update fresh_bitmap until the on-disk metadata is
163 if (acb
->write
.update_table
|| stale_bitmap_need_update (acb
)) {
164 /* Cannot release lock on data now since fresh_bitmap has not been
165 * updated. Otherwise, a copy-on-write or copy-on-read operation
166 * may use data from the backing image to overwrite the data just
168 write_metadata_to_journal (acb
);
170 finish_write (acb
, ret
); /* No need to update metadata. */
175 /* stale_bitmap and fresh_bitmap are different. Now we can update
176 * fresh_bitmap. stale_bitmap will be updated after the on-disk metadata
178 int update_stale_bitmap
= update_fresh_bitmap_and_check_stale_bitmap (acb
);
180 if (acb
->write
.update_table
|| update_stale_bitmap
) {
181 /* Release lock on data now since fresh_bitmap has been updated. */
182 QLIST_REMOVE (acb
, write
.next_write_lock
);
183 acb
->write
.next_write_lock
.le_prev
= NULL
;
184 if (acb
->copy_lock
.next
.le_prev
) {
185 QLIST_REMOVE (acb
, copy_lock
.next
);
186 restart_dependent_writes (acb
);
189 write_metadata_to_journal (acb
);
191 finish_write (acb
, ret
);
195 static void finish_read_backing_for_copy_on_write (void *opaque
, int ret
)
197 FvdAIOCB
*acb
= (FvdAIOCB
*) opaque
;
198 BlockDriverState
*bs
= acb
->common
.bs
;
201 QDEBUG ("WRITE: acb%llu-%p finish_read_from_backing with error "
202 "ret=%d\n", acb
->uuid
, acb
, ret
);
203 finish_write (acb
, ret
);
205 QDEBUG ("WRITE: acb%llu-%p "
206 "finish_read_from_backing_and_start_write_data\n",
208 acb
->write
.hd_acb
= store_data (FALSE
, acb
, bs
,
209 acb
->write
.cow_start_sector
,
211 acb
->write
.cow_qiov
->size
/ 512,
212 finish_write_data
, acb
);
213 if (!acb
->write
.hd_acb
) {
214 finish_write (acb
, -1);
/* Perform the slow-path work of an asynchronous FVD write: detect conflicts
 * with in-progress copy-on-read/copy-on-write regions, decide whether the
 * first and/or last partially-covered block must first be read from the
 * backing image and merged with the caller's data, issue the appropriate
 * backing read or direct data write, and register the write lock.
 * NOTE(review): the function is TRUNCATED in this extraction (its fail-path
 * cleanup tail runs past the last visible line) and many structural lines
 * (braces, 'else' lines, 'goto fail' branches, local declarations) are
 * missing; every gap is flagged below — confirm against the original file. */
static int do_aio_write (FvdAIOCB * acb)
    /* NOTE(review): opening brace line missing (orig line 220). */
    BlockDriverState *bs = acb->common.bs;
    BDRVFvdState *s = bs->opaque;

    /* Calculate the data region need be locked. */
    const int64_t sector_end = acb->sector_num + acb->nb_sectors;
    const int64_t block_begin = ROUND_DOWN (acb->sector_num, s->block_size);
    int64_t block_end = ROUND_UP (sector_end, s->block_size);

    /* Check for conflicting copy-on-reads. */
    /* NOTE(review): declaration of 'old' missing (orig line 230). */
    QLIST_FOREACH (old, &s->copy_locks, copy_lock.next) {
        if (old->copy_lock.end > acb->sector_num &&
            sector_end > old->copy_lock.begin) {
            /* Overlap with a locked region: park this write on the lock
             * holder's dependent-write list instead of issuing it now. */
            QLIST_INSERT_HEAD (&old->copy_lock.dependent_writes, acb,
                               write.next_dependent_write);
            QDEBUG ("WRITE: acb%llu-%p put_on_hold_due_to_data_conflict "
                    "with %s acb%llu-%p\n", acb->uuid, acb,
                    old->type == OP_WRITE ? "write" : "copy_on_read",
            /* NOTE(review): remaining QDEBUG arguments, the early return and
             * the loop's closing braces missing (orig lines 239-243). */

    /* No conflict. Now check if this write updates partial blocks and hence
     * need to read those blocks from the base image and merge with this
     * write's data (continuation of this comment lost, orig line 246). */
    int read_first_block, read_last_block;
    if (acb->sector_num % s->block_size == 0) {
        read_first_block = FALSE;
    /* NOTE(review): 'else' line missing (orig line 250). */
    if (fresh_bitmap_show_sector_in_base_img (acb->sector_num, s)) {
        read_first_block = TRUE;
        /* NOTE(review): else-branch separator missing (orig line 253). */
        read_first_block = FALSE;
        /* NOTE(review): closing braces missing (orig lines 255-256). */

    if (sector_end % s->block_size == 0) {
        read_last_block = FALSE;
    } else if (fresh_bitmap_show_sector_in_base_img (sector_end - 1, s)) {
        read_last_block = TRUE;
        /* NOTE(review): else-branch separator missing (orig line 261). */
        read_last_block = FALSE;
        /* NOTE(review): closing braces missing (orig lines 263-264). */

    if (read_first_block) {
        if (read_last_block) {
            /* Case 1: Read all the blocks involved from the base image. */
            const QEMUIOVector *old_qiov = acb->write.qiov;
            if (block_end > s->nb_sectors_in_base_img) {
                block_end = s->nb_sectors_in_base_img;
                /* NOTE(review): closing brace missing (orig lines 271-272). */

            /* One buffer holds the read data plus the two QEMUIOVector
             * headers and their iovec arrays (laid out after the data). */
            int buf_size = (block_end - block_begin) * 512
                + 2 * sizeof (QEMUIOVector)
                + sizeof (struct iovec) * (old_qiov->niov + 3);
            buf_size = ROUND_UP (buf_size, 512);
            acb->write.cow_buf = my_qemu_blockalign (bs->backing_hd, buf_size);

            /* For reading from the base image. */
            QEMUIOVector *read_qiov = (QEMUIOVector *) (acb->write.cow_buf +
                                          (block_end - block_begin) * 512);
            read_qiov->iov = (struct iovec *) (read_qiov + 1);
            read_qiov->nalloc = -1;
            /* NOTE(review): a field initializer line is missing here
             * (orig line 284; presumably read_qiov->niov). */
            read_qiov->iov[0].iov_base = acb->write.cow_buf;
            read_qiov->iov[0].iov_len = read_qiov->size =
                (block_end - block_begin) * 512;

            /* For writing to the FVD data file. */
            QEMUIOVector *write_qiov = (QEMUIOVector *) (read_qiov->iov + 1);
            write_qiov->iov = (struct iovec *) (write_qiov + 1);
            write_qiov->nalloc = -1;
            write_qiov->niov = old_qiov->niov + 2;
            write_qiov->size = read_qiov->size;

            /* The first entry is for data read from the base image. */
            write_qiov->iov[0].iov_base = acb->write.cow_buf;
            write_qiov->iov[0].iov_len = (acb->sector_num - block_begin) * 512;
            memcpy (&write_qiov->iov[1], old_qiov->iov,
                    sizeof (struct iovec) * old_qiov->niov);

            /* The last entry is for data read from the base image. */
            write_qiov->iov[old_qiov->niov + 1].iov_base = acb->write.cow_buf
                + (sector_end - block_begin) * 512;
            write_qiov->iov[old_qiov->niov + 1].iov_len =
                (block_end - sector_end) * 512;
            acb->write.cow_qiov = write_qiov;
            acb->write.cow_start_sector = block_begin;

            acb->write.hd_acb = bdrv_aio_readv (bs->backing_hd, block_begin,
                                        read_qiov, block_end - block_begin,
                                        finish_read_backing_for_copy_on_write,
                                        acb);
            if (!acb->write.hd_acb) {
                /* NOTE(review): failure branch missing (orig lines 314-316). */

            acb->copy_lock.begin = block_begin;
            acb->copy_lock.end = block_end;
            QLIST_INSERT_HEAD (&s->copy_locks, acb, copy_lock.next);
            QDEBUG ("WRITE: acb%llu-%p "
                    "read_first_last_partial_blocks_from_backing sector_num=%"
                    PRId64 " nb_sectors=%d\n", acb->uuid, acb, block_begin,
                    (int) (block_end - block_begin));
        /* NOTE(review): '} else {' separator missing (orig line 324). */
            /* Case 2: Read the first block from the base image. */
            int nb = acb->sector_num - block_begin;
            const QEMUIOVector *old_qiov = acb->write.qiov;

            /* Space for data and metadata. */
            int buf_size = nb * 512 + 2 * sizeof (QEMUIOVector)
                + sizeof (struct iovec) * (old_qiov->niov + 2);
            buf_size = ROUND_UP (buf_size, 512);
            acb->write.cow_buf = my_qemu_blockalign (bs->backing_hd, buf_size);

            /* For reading from the base image. */
            QEMUIOVector *read_qiov =
                (QEMUIOVector *) (acb->write.cow_buf + nb * 512);
            read_qiov->iov = (struct iovec *) (read_qiov + 1);
            read_qiov->nalloc = -1;
            /* NOTE(review): a field initializer line is missing here
             * (orig line 340; presumably read_qiov->niov). */
            read_qiov->iov[0].iov_base = acb->write.cow_buf;
            read_qiov->iov[0].iov_len = read_qiov->size = nb * 512;

            /* For writing to the FVD data file. */
            QEMUIOVector *write_qiov = (QEMUIOVector *) (read_qiov->iov + 1);
            write_qiov->iov = (struct iovec *) (write_qiov + 1);
            write_qiov->nalloc = -1;
            write_qiov->niov = old_qiov->niov + 1;
            write_qiov->size = old_qiov->size + read_qiov->size;

            /* The first entry is added for data read from the base image. */
            write_qiov->iov[0].iov_base = acb->write.cow_buf;
            write_qiov->iov[0].iov_len = read_qiov->size;
            memcpy (&write_qiov->iov[1], old_qiov->iov,
                    sizeof (struct iovec) * old_qiov->niov);
            acb->write.cow_qiov = write_qiov;
            acb->write.cow_start_sector = block_begin;

            acb->write.hd_acb = bdrv_aio_readv (bs->backing_hd,
                                        block_begin, read_qiov, nb,
                                        finish_read_backing_for_copy_on_write,
                                        acb);
            if (!acb->write.hd_acb) {
                /* NOTE(review): failure branch missing (orig lines 363-365). */

            acb->copy_lock.begin = block_begin;
            acb->copy_lock.end = block_begin + s->block_size;
            QLIST_INSERT_HEAD (&s->copy_locks, acb, copy_lock.next);
            QDEBUG ("WRITE: acb%llu-%p read_first_partial_block_from_backing "
                    "sector_num=%" PRId64 " nb_sectors=%d\n",
                    acb->uuid, acb, block_begin, nb);
    /* NOTE(review): closing braces and 'else' missing (orig lines 372-373). */
    if (read_last_block) {
        /* Case 3: Read the last block from the base image. */
        /* NOTE(review): declaration of 'nb' missing (orig line 376). */
        if (block_end < s->nb_sectors_in_base_img) {
            nb = block_end - sector_end;
        /* NOTE(review): '} else {' separator missing (orig lines 379-380). */
            nb = s->nb_sectors_in_base_img - sector_end;
        /* NOTE(review): closing brace missing (orig line 382). */
        const QEMUIOVector *old_qiov = acb->write.qiov;

        /* Space for data and metadata. */
        int buf_size = nb * 512 + 2 * sizeof (QEMUIOVector)
            + sizeof (struct iovec) * (old_qiov->niov + 2);
        buf_size = ROUND_UP (buf_size, 512);
        acb->write.cow_buf = my_qemu_blockalign (bs->backing_hd, buf_size);

        /* For reading from the base image. */
        QEMUIOVector *read_qiov = (QEMUIOVector *) (acb->write.cow_buf
            /* NOTE(review): pointer-offset continuation missing (orig line 393). */
        read_qiov->iov = (struct iovec *) (read_qiov + 1);
        read_qiov->nalloc = -1;
        read_qiov->iov[0].iov_base = acb->write.cow_buf;
        read_qiov->iov[0].iov_len = read_qiov->size = nb * 512;

        /* For writing to the FVD data file. */
        QEMUIOVector *write_qiov = (QEMUIOVector *) (read_qiov->iov + 1);
        write_qiov->iov = (struct iovec *) (write_qiov + 1);
        write_qiov->nalloc = -1;
        write_qiov->niov = old_qiov->niov + 1;
        write_qiov->size = old_qiov->size + read_qiov->size;
        memcpy (write_qiov->iov, old_qiov->iov,
                sizeof (struct iovec) * old_qiov->niov);

        /* The last appended entry is for data read from the base image. */
        write_qiov->iov[old_qiov->niov].iov_base = acb->write.cow_buf
        write_qiov->iov[old_qiov->niov].iov_len = read_qiov->size;
        acb->write.cow_qiov = write_qiov;
        acb->write.cow_start_sector = acb->sector_num;

        acb->write.hd_acb = bdrv_aio_readv (bs->backing_hd,
                                        sector_end, read_qiov, nb,
                                        finish_read_backing_for_copy_on_write,
                                        acb);
        if (!acb->write.hd_acb) {
            /* NOTE(review): failure branch missing (orig lines 419-421). */

        acb->copy_lock.end = block_end;
        acb->copy_lock.begin = block_end - s->block_size;
        QLIST_INSERT_HEAD (&s->copy_locks, acb, copy_lock.next);
        QDEBUG ("WRITE: acb%llu-%p read_last_partial_block_from_backing "
                "sector_num=%" PRId64 " nb_sectors=%d\n",
                acb->uuid, acb, sector_end, nb);
    /* NOTE(review): '} else {' separator missing (orig line 428). */
        /* Case 4: Can write directly and no need to merge with data from
         * (comment continuation lost, orig line 430). */
        QDEBUG ("WRITE: acb%llu-%p "
                "write_fvd_without_read_partial_block_from_backing\n",
        /* NOTE(review): remaining QDEBUG arguments missing (orig line 433). */
        acb->write.hd_acb = store_data (FALSE, acb, bs, acb->sector_num,
                                        acb->write.qiov, acb->nb_sectors,
                                        finish_write_data, acb);
        if (!acb->write.hd_acb) {
            /* NOTE(review): failure branch and closing braces missing
             * (orig lines 438-442). */

    QLIST_INSERT_HEAD (&s->write_locks, acb, write.next_write_lock);
    /* NOTE(review): success return and the failure label missing
     * (orig lines 444-446). */
    if (acb->write.cow_buf) {
        my_qemu_vfree (acb->write.cow_buf);
    /* NOTE(review): function truncated here — the remainder of the failure
     * cleanup path and the closing brace are beyond this extraction. */