Merge remote-tracking branch 'qemu-project/master'
[qemu/ar7.git] / block / fvd-read.c
blob4dbcc6dc9dfac32baeb2d4c764e5764d234acbeb
/*
 * Copyright (c) 2010-2011 IBM
 *
 * Authors:
 *         Chunqiang Tang <ctang@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.
 * See the COPYING file in the top-level directory.
 */
/*=============================================================================
 *  A short description: this module implements bdrv_aio_readv() for FVD.
 *============================================================================*/
15 static void finish_read_backing_for_copy_on_read (void *opaque, int ret);
16 static void finish_read_fvd (void *opaque, int ret);
17 static inline void calc_read_region (BDRVFvdState * s, int64_t sector_num,
18 int nb_sectors,
19 int64_t * p_first_sec_in_fvd,
20 int64_t * p_last_sec_in_fvd,
21 int64_t * p_first_sec_in_backing,
22 int64_t * p_last_sec_in_backing);
24 static BlockDriverAIOCB *fvd_aio_readv (BlockDriverState * bs,
25 int64_t sector_num, QEMUIOVector * qiov,
26 int nb_sectors,
27 BlockDriverCompletionFunc * cb,
28 void *opaque)
30 BDRVFvdState *s = bs->opaque;
32 TRACE_REQUEST (FALSE, sector_num, nb_sectors);
34 if (!s->data_region_prepared) {
35 init_data_region (s);
38 if (s->prefetch_state == PREFETCH_STATE_FINISHED
39 || sector_num >= s->nb_sectors_in_base_img) {
40 /* This is an efficient case. See Section 3.3.5 of the FVD-cow paper.
41 * This also covers the case of no base image. */
42 return load_data (NULL, bs, sector_num, qiov, nb_sectors, cb, opaque);
45 /* Figure out data regions in the base image and in the FVD data file. */
46 int64_t last_sec_in_backing, first_sec_in_backing;
47 int64_t last_sec_in_fvd, first_sec_in_fvd;
48 calc_read_region (s, sector_num, nb_sectors, &first_sec_in_fvd,
49 &last_sec_in_fvd, &first_sec_in_backing,
50 &last_sec_in_backing);
52 if (first_sec_in_backing < 0) {
53 /* A simple case: all requested data are in the FVD data file. */
54 return load_data (NULL, bs, sector_num, qiov, nb_sectors, cb, opaque);
57 /* Do copy-on-read only if the context id is 0, i.e., it is not emulating
58 * synchronous I/O. Doing copy-on-read in emulated synchronous I/O may
59 * leave the copy-on-read callbacks never being processed due to
60 * mismatching contextid. */
61 const int copy_on_read = s->copy_on_read;
63 if (first_sec_in_fvd < 0 && !copy_on_read) {
64 /* A simple case: all requested data are in the base image and no need
65 * to do copy_on_read. */
66 return bdrv_aio_readv (bs->backing_hd, sector_num, qiov, nb_sectors, cb,
67 opaque);
70 /* The remaining cases are more complicated, which can be: 1. Data are
71 * only in the base image and copy-on-read is needed. 2. Data are in both
72 * the base image and the FVD data file. Copy-on-read may be either TRUE
73 * or FALSE. */
74 FvdAIOCB *acb = my_qemu_aio_get (&fvd_aio_pool, bs, cb, opaque);
75 if (!acb) {
76 return NULL;
79 QDEBUG ("READ: acb%llu-%p start sector_num=%" PRId64 " nb_sectors=%d\n",
80 acb->uuid, acb, sector_num, nb_sectors);
82 acb->type = OP_READ;
83 acb->sector_num = sector_num;
84 acb->nb_sectors = nb_sectors;
85 acb->read.qiov = qiov;
86 acb->read.ret = 0;
87 acb->read.read_backing.hd_acb = NULL;
88 acb->read.read_backing.done = FALSE;
89 acb->read.read_backing.iov.iov_base = NULL;
90 acb->read.read_fvd.hd_acb = NULL;
91 acb->read.read_fvd.iov.iov_base = NULL;
92 acb->read.read_fvd.done = (first_sec_in_fvd < 0);
94 /* Read from the base image. */
95 if (copy_on_read) {
96 /* Round the request to the block boundary. */
97 acb->read.read_backing.sector_num =
98 ROUND_DOWN (first_sec_in_backing, s->block_size);
99 int64_t end = ROUND_UP (last_sec_in_backing + 1, s->block_size);
100 if (end > s->nb_sectors_in_base_img) {
101 end = s->nb_sectors_in_base_img;
103 acb->read.read_backing.nb_sectors =
104 end - acb->read.read_backing.sector_num;
105 } else {
106 acb->read.read_backing.sector_num = first_sec_in_backing;
107 acb->read.read_backing.nb_sectors =
108 last_sec_in_backing - first_sec_in_backing + 1;
111 acb->read.read_backing.iov.iov_len =
112 acb->read.read_backing.nb_sectors * 512;
113 acb->read.read_backing.iov.iov_base =
114 my_qemu_blockalign (bs->backing_hd, acb->read.read_backing.iov.iov_len);
115 qemu_iovec_init_external (&acb->read.read_backing.qiov,
116 &acb->read.read_backing.iov, 1);
117 acb->read.read_backing.hd_acb =
118 bdrv_aio_readv (bs->backing_hd, acb->read.read_backing.sector_num,
119 &acb->read.read_backing.qiov,
120 acb->read.read_backing.nb_sectors,
121 finish_read_backing_for_copy_on_read, acb);
122 QDEBUG ("READ: acb%llu-%p read_backing backing_sector_num=%" PRId64
123 " backing_nb_sectors=%d\n", acb->uuid, acb,
124 acb->read.read_backing.sector_num,
125 acb->read.read_backing.nb_sectors);
127 if (!acb->read.read_backing.hd_acb) {
128 my_qemu_vfree (acb->read.read_backing.iov.iov_base);
129 my_qemu_aio_unref (acb);
130 return NULL;
133 if (first_sec_in_fvd >= 0) {
134 /* Read the FVD data file. */
135 acb->read.read_fvd.sector_num = first_sec_in_fvd;
136 acb->read.read_fvd.nb_sectors = last_sec_in_fvd - first_sec_in_fvd + 1;
137 acb->read.read_fvd.iov.iov_len = acb->read.read_fvd.nb_sectors * 512;
139 /* Make a copy of the current bitmap because it may change when the
140 * read requests finish. */
141 int64_t b = MIN (acb->read.read_backing.sector_num,
142 acb->read.read_fvd.sector_num);
143 b = b / s->block_size / 8; /* First byte of the bitmap we need. */
144 int64_t e1 = acb->read.read_backing.sector_num +
145 acb->read.read_backing.nb_sectors;
146 int64_t e2 = acb->read.read_fvd.sector_num +
147 acb->read.read_fvd.nb_sectors;
148 int64_t e = MAX (e1, e2);
149 if (e > s->nb_sectors_in_base_img) {
150 e = s->nb_sectors_in_base_img;
152 e = (e - 1) / s->block_size / 8;/* Last byte of the bitmap we need. */
153 int bitmap_bytes = e - b + 1;
154 int buf_size = acb->read.read_fvd.iov.iov_len +
155 ROUND_UP (bitmap_bytes, 512);
156 acb->read.read_fvd.iov.iov_base =
157 my_qemu_blockalign (s->fvd_data, buf_size);
158 uint8_t *saved_bitmap = ((uint8_t *) acb->read.read_fvd.iov.iov_base) +
159 acb->read.read_fvd.iov.iov_len;
160 memcpy (saved_bitmap, s->fresh_bitmap + b, bitmap_bytes);
162 qemu_iovec_init_external (&acb->read.read_fvd.qiov,
163 &acb->read.read_fvd.iov, 1);
164 QDEBUG ("READ: acb%llu-%p read_fvd fvd_sector_num=%" PRId64
165 " fvd_nb_sectors=%d\n", acb->uuid, acb,
166 acb->read.read_fvd.sector_num, acb->read.read_fvd.nb_sectors);
167 acb->read.read_fvd.hd_acb = load_data (acb, bs, first_sec_in_fvd,
168 &acb->read.read_fvd.qiov,
169 acb->read.read_fvd.nb_sectors,
170 finish_read_fvd, acb);
171 if (!acb->read.read_fvd.hd_acb) {
172 if (acb->read.read_backing.hd_acb) {
173 bdrv_aio_cancel (acb->read.read_backing.hd_acb);
174 my_qemu_vfree (acb->read.read_backing.iov.iov_base);
176 my_qemu_vfree (acb->read.read_fvd.iov.iov_base);
177 my_qemu_aio_unref (acb);
178 return NULL;
182 return &acb->common;
185 static void finish_copy_on_read (void *opaque, int ret)
187 FvdAIOCB *acb = opaque;
188 BlockDriverState *bs = acb->common.bs;
189 BDRVFvdState *s = bs->opaque;
191 if (ret == 0) {
192 /* Update fresh_bitmap but do not update stale_bitmap or the on-disk
193 * bitmap. See Section 3.3.4 of the FVD-cow paper. */
194 update_fresh_bitmap (acb->sector_num, acb->nb_sectors, s);
197 s->outstanding_copy_on_read_data -= acb->nb_sectors * 512;
199 #ifdef FVD_DEBUG
200 s->total_copy_on_read_data += acb->nb_sectors * 512;
201 #endif
202 QDEBUG ("READ: acb%llu-%p finish_copy_on_read buffer_sector_num=%" PRId64
203 " buffer_nb_sectors=%d write_sector_num=%" PRId64
204 " write_nb_sectors=%d outstanding_copy_on_read=%" PRId64 "\n",
205 acb->uuid, acb, acb->copy.buffered_sector_begin,
206 (int) (acb->copy.buffered_sector_end -
207 acb->copy.buffered_sector_begin), acb->sector_num,
208 acb->nb_sectors, s->outstanding_copy_on_read_data);
210 QLIST_REMOVE (acb, copy_lock.next);
211 restart_dependent_writes (acb);
213 int64_t begin = acb->sector_num + acb->nb_sectors;
214 int64_t end = acb->copy.buffered_sector_end;
216 if (find_region_in_base_img (s, &begin, &end)) {
217 acb->sector_num = begin;
218 acb->nb_sectors = end - begin;
219 acb->copy.iov.iov_base = acb->copy.buf +
220 (begin - acb->copy.buffered_sector_begin) * 512;
221 acb->copy.iov.iov_len = acb->nb_sectors * 512;
222 qemu_iovec_init_external (&acb->copy.qiov, &acb->copy.iov, 1);
223 QDEBUG ("READ: acb%llu-%p copy_on_read buffer_sector_num=%" PRId64
224 " buffer_nb_sectors=%d write_sector_num=%" PRId64
225 " write_nb_sectors=%d outstanding_copy_on_read=%" PRId64 "\n",
226 acb->uuid, acb, acb->copy.buffered_sector_begin,
227 (int) (acb->copy.buffered_sector_end -
228 acb->copy.buffered_sector_begin), acb->sector_num,
229 acb->nb_sectors, s->outstanding_copy_on_read_data);
230 acb->copy.hd_acb = store_data (TRUE, acb, bs, acb->sector_num,
231 &acb->copy.qiov, acb->nb_sectors,
232 finish_copy_on_read, acb);
233 if (acb->copy.hd_acb) {
234 QLIST_INIT (&acb->copy_lock.dependent_writes);
235 acb->copy_lock.begin = begin;
236 acb->copy_lock.end = end;
237 QLIST_INSERT_HEAD (&s->copy_locks, acb, copy_lock.next);
238 s->outstanding_copy_on_read_data += acb->copy.iov.iov_len;
239 return;
243 QDEBUG ("READ: acb%llu-%p no_more_copy_on_read\n", acb->uuid, acb);
244 my_qemu_vfree (acb->copy.buf);
245 my_qemu_aio_unref (acb);
248 static void finish_read (FvdAIOCB * acb)
250 BlockDriverState *bs = acb->common.bs;
251 BDRVFvdState *s = bs->opaque;
253 if (acb->read.ret != 0) {
254 QDEBUG ("READ: acb%llu-%p finish_read error ret=%d sector_num=%" PRId64
255 " nb_sectors=%d\n", acb->uuid, acb, acb->read.ret,
256 acb->sector_num, acb->nb_sectors);
257 acb->common.cb (acb->common.opaque, acb->read.ret);
258 if (acb->read.read_backing.iov.iov_base) {
259 my_qemu_vfree (acb->read.read_backing.iov.iov_base);
261 if (acb->read.read_fvd.iov.iov_base) {
262 my_qemu_vfree (acb->read.read_fvd.iov.iov_base);
264 my_qemu_aio_unref (acb);
266 return;
269 if (!acb->read.read_fvd.iov.iov_base) {
270 /* Only read data from the base image. */
271 uint8_t *data = ((uint8_t *) acb->read.read_backing.iov.iov_base) +
272 (acb->sector_num - acb->read.read_backing.sector_num) * 512;
273 qemu_iovec_from_buf(acb->read.qiov, 0, data, acb->nb_sectors * 512);
274 } else {
275 /* Under the guidance of the saved bitmap, merge data from the FVD
276 * data file and the base image. */
277 uint8_t *saved_bitmap = ((uint8_t *) acb->read.read_fvd.iov.iov_base) +
278 acb->read.read_fvd.iov.iov_len;
279 int64_t bitmap_offset = MIN (acb->read.read_backing.sector_num,
280 acb->read.read_fvd.sector_num);
281 bitmap_offset = bitmap_offset / s->block_size / 8;
282 int iov_index = 0;
283 uint8_t *iov_buf = acb->read.qiov->iov[0].iov_base;
284 int iov_left = acb->read.qiov->iov[0].iov_len;
285 int64_t sec = acb->sector_num;
286 const int64_t end = acb->sector_num + acb->nb_sectors;
287 int64_t first_sec;
288 uint8_t *source;
290 if (bitmap_show_sector_in_base_img
291 (sec, s, bitmap_offset, saved_bitmap)) {
292 goto in_backing;
295 while (1) {
296 /* For a section of data in the FVD data file. */
297 if (sec >= end) {
298 break;
301 first_sec = sec;
302 do {
303 sec++;
304 } while (sec < end && !bitmap_show_sector_in_base_img (sec, s,
305 bitmap_offset, saved_bitmap));
307 source = ((uint8_t *) acb->read.read_fvd.iov.iov_base) +
308 (first_sec - acb->read.read_fvd.sector_num) * 512;
309 copy_to_iov (acb->read.qiov->iov, &iov_index, &iov_buf, &iov_left,
310 source, (sec - first_sec) * 512);
312 in_backing:
313 /* For a section of data in the base image. */
314 if (sec >= end) {
315 break;
318 first_sec = sec;
319 do {
320 sec++;
321 } while (sec < end && bitmap_show_sector_in_base_img (sec, s,
322 bitmap_offset, saved_bitmap));
324 source = ((uint8_t *) acb->read.read_backing.iov.iov_base) +
325 (first_sec - acb->read.read_backing.sector_num) * 512;
326 copy_to_iov (acb->read.qiov->iov, &iov_index, &iov_buf, &iov_left,
327 source, (sec - first_sec) * 512);
330 ASSERT (iov_index == acb->read.qiov->niov - 1 && iov_left == 0);
331 my_qemu_vfree (acb->read.read_fvd.iov.iov_base);
334 QDEBUG ("READ: acb%llu-%p finish_read ret=%d\n", acb->uuid, acb,
335 acb->read.ret);
336 acb->common.cb (acb->common.opaque, acb->read.ret);
338 if (!s->copy_on_read) {
339 /* Do copy-on-read only if the context id is 0, i.e., it is not
340 * emulating synchronous I/O. Doing copy-on-read in emulated
341 * synchronous I/O may leave the copy-on-read callbacks never being
342 * processed due to mismatching context id. */
343 my_qemu_vfree (acb->read.read_backing.iov.iov_base);
344 my_qemu_aio_unref (acb);
345 return;
348 /* Convert AIOReadCB into a AIOCopyCB for copy-on-read. */
349 uint8_t *buf = acb->read.read_backing.iov.iov_base;
350 int64_t begin = acb->read.read_backing.sector_num;
351 int64_t end = begin + acb->read.read_backing.nb_sectors;
353 acb->type = OP_COPY;
354 acb->copy.buf = buf;
355 acb->copy.buffered_sector_begin = begin;
356 acb->copy.buffered_sector_end = end;
358 if (s->outstanding_copy_on_read_data < s->max_outstanding_copy_on_read_data
359 && find_region_in_base_img (s, &begin, &end)) {
360 /* Write to the FVD data file. */
361 acb->sector_num = begin;
362 acb->nb_sectors = end - begin;
363 acb->copy.iov.iov_base =
364 buf + (begin - acb->copy.buffered_sector_begin) * 512;
365 acb->copy.iov.iov_len = acb->nb_sectors * 512;
366 qemu_iovec_init_external (&acb->copy.qiov, &acb->copy.iov, 1);
367 QDEBUG ("READ: acb%llu-%p copy_on_read buffer_sector_num=%" PRId64
368 " buffer_nb_sectors=%d write_sector_num=%" PRId64
369 " write_nb_sectors=%d outstanding_copy_on_read=%" PRId64 "\n",
370 acb->uuid, acb, acb->copy.buffered_sector_begin,
371 (int) (acb->copy.buffered_sector_end -
372 acb->copy.buffered_sector_begin), acb->sector_num,
373 acb->nb_sectors, s->outstanding_copy_on_read_data);
374 acb->copy.hd_acb = store_data (TRUE, acb, bs, acb->sector_num,
375 &acb->copy.qiov, acb->nb_sectors,
376 finish_copy_on_read, acb);
377 if (acb->copy.hd_acb) {
378 QLIST_INIT (&acb->copy_lock.dependent_writes);
379 acb->copy_lock.begin = begin;
380 acb->copy_lock.end = end;
381 QLIST_INSERT_HEAD (&s->copy_locks, acb, copy_lock.next);
382 s->outstanding_copy_on_read_data += acb->copy.iov.iov_len;
383 return;
387 /* No more copy-on-read to do. */
388 my_qemu_vfree (acb->copy.buf);
389 my_qemu_aio_unref (acb);
392 static void finish_read_fvd (void *opaque, int ret)
394 FvdAIOCB *acb = opaque;
396 QDEBUG ("READ: acb%llu-%p finish_read_fvd ret=%d\n", acb->uuid, acb, ret);
397 acb->read.read_fvd.hd_acb = NULL;
398 acb->read.read_fvd.done = TRUE;
399 if (acb->read.ret == 0) {
400 acb->read.ret = ret;
403 if (acb->read.read_backing.done) {
404 finish_read (acb); /* The other request also finished. */
408 static void finish_read_backing_for_copy_on_read (void *opaque, int ret)
410 FvdAIOCB *acb = opaque;
412 QDEBUG ("READ: acb%llu-%p finish_read_backing ret=%d\n", acb->uuid, acb,
413 ret);
414 acb->read.read_backing.hd_acb = NULL;
415 acb->read.read_backing.done = TRUE;
416 if (acb->read.ret == 0) {
417 acb->read.ret = ret;
420 if (acb->read.read_fvd.done) {
421 finish_read (acb); /* The other request also finished. */
425 static inline void calc_read_region (BDRVFvdState * s, int64_t sector_num,
426 int nb_sectors,
427 int64_t * p_first_sec_in_fvd,
428 int64_t * p_last_sec_in_fvd,
429 int64_t * p_first_sec_in_backing,
430 int64_t * p_last_sec_in_backing)
432 int64_t last_sec_in_backing = -1, first_sec_in_backing = -1;
433 int64_t last_sec_in_fvd = -1, first_sec_in_fvd = -1;
434 int prev_block_in_backing;
436 if (fresh_bitmap_show_sector_in_base_img (sector_num, s)) {
437 first_sec_in_backing = last_sec_in_backing = sector_num;
438 prev_block_in_backing = TRUE;
439 } else {
440 first_sec_in_fvd = last_sec_in_fvd = sector_num;
441 prev_block_in_backing = FALSE;
444 /* Begin of next block. */
445 int64_t sec = ROUND_UP (sector_num + 1, s->block_size);
447 const int64_t sec_end = sector_num + nb_sectors;
448 int64_t last_sec = MIN (sec_end, s->nb_sectors_in_base_img) - 1;
450 while (1) {
451 if (sec > last_sec) {
452 sec = last_sec;
455 if (fresh_bitmap_show_sector_in_base_img (sec, s)) {
456 if (first_sec_in_backing < 0) {
457 first_sec_in_backing = sec;
459 if (!prev_block_in_backing) {
460 last_sec_in_fvd = sec - 1;
461 prev_block_in_backing = TRUE;
463 last_sec_in_backing = sec;
464 } else {
465 if (first_sec_in_fvd < 0) {
466 first_sec_in_fvd = sec;
468 if (prev_block_in_backing) {
469 last_sec_in_backing = sec - 1;
470 prev_block_in_backing = FALSE;
472 last_sec_in_fvd = sec;
475 if (sec == last_sec) {
476 break;
478 sec += s->block_size;
481 if (sec_end > s->nb_sectors_in_base_img) {
482 if (first_sec_in_fvd < 0) {
483 first_sec_in_fvd = s->nb_sectors_in_base_img;
485 last_sec_in_fvd = sec_end - 1;
488 *p_first_sec_in_fvd = first_sec_in_fvd;
489 *p_last_sec_in_fvd = last_sec_in_fvd;
490 *p_first_sec_in_backing = first_sec_in_backing;
491 *p_last_sec_in_backing = last_sec_in_backing;
#if 0
/* NOTE: everything up to the matching #endif is compiled out. */

/* Cancels whichever halves of a split read are still in flight, then frees
 * the read buffers and drops the control block. */
static void fvd_read_cancel (FvdAIOCB * acb)
{
    /* Stop the outstanding requests first ... */
    if (acb->read.read_backing.hd_acb) {
        bdrv_aio_cancel (acb->read.read_backing.hd_acb);
    }
    if (acb->read.read_fvd.hd_acb) {
        bdrv_aio_cancel (acb->read.read_fvd.hd_acb);
    }

    /* ... then release the buffers they were using. */
    if (acb->read.read_backing.iov.iov_base) {
        my_qemu_vfree (acb->read.read_backing.iov.iov_base);
    }
    if (acb->read.read_fvd.iov.iov_base) {
        my_qemu_vfree (acb->read.read_fvd.iov.iov_base);
    }

    my_qemu_aio_unref (acb);
}

/* Cancels a copy operation: stops the in-flight write, releases its copy
 * lock (restarting the writes waiting on it), frees the copy buffer and
 * drops the control block. */
static void fvd_copy_cancel (FvdAIOCB * acb)
{
    BlockDriverState *bs = acb->common.bs;
    BDRVFvdState *s = bs->opaque;

    if (acb->copy.hd_acb) {
        bdrv_aio_cancel (acb->copy.hd_acb);
    }

    /* A non-NULL le_prev means the lock is still on the copy_locks list. */
    if (acb->copy_lock.next.le_prev != NULL) {
        QLIST_REMOVE (acb, copy_lock.next);
        restart_dependent_writes (acb);
    }

    my_qemu_vfree (acb->copy.buf);

    if (acb->common.cb != null_prefetch_cb) {
        /* This is a copy-on-read operation. */
        s->outstanding_copy_on_read_data -= acb->nb_sectors * 512;
    }

    my_qemu_aio_unref (acb);
}
#endif
533 static void restart_dependent_writes (FvdAIOCB * acb)
535 acb->copy_lock.next.le_prev = NULL;
536 FvdAIOCB *req = acb->copy_lock.dependent_writes.lh_first;
538 while (req) {
539 /* Keep a copy of 'next' as it may be changed in do_aiO_write(). */
540 FvdAIOCB *next = req->write.next_dependent_write.le_next;
542 /* Indicate that this write is no longer on any depedent list. This
543 * helps fvd_read_cancel() work properly. */
544 req->write.next_dependent_write.le_prev = NULL;
546 if (acb->type == OP_WRITE) {
547 QDEBUG ("WRITE: acb%llu-%p finished_and_restart_conflict_write "
548 "acb%llu-%p\n", acb->uuid, acb, req->uuid, req);
549 } else {
550 QDEBUG ("READ: copy_on_read acb%llu-%p "
551 "finished_and_restart_conflict_write acb%llu-%p\n",
552 acb->uuid, acb, req->uuid, req);
555 if (do_aio_write (req) < 0) {
556 QDEBUG ("WRITE: acb%llu-%p finished with error ret=%d\n",
557 req->uuid, req, -1);
558 req->common.cb (req->common.opaque, -1);
559 my_qemu_aio_unref (req);
562 req = next;