2 * Copyright (c) 2010-2011 IBM
5 * Chunqiang Tang <ctang@us.ibm.com>
7 * This work is licensed under the terms of the GNU GPL, version 2.
8 * See the COPYING file in the top-level directory.
11 /*=============================================================================
12 * A short description: this module implements a simulated block device
13 * driver "blksim". It works with qemu-io and qemu-test to perform testing,
14 * letting tests reorder disk I/O and callback activities in order to trigger
15 * rare race conditions. See qemu-test.c, qemu-io.c, and qemu-io-sim.c.
16 *============================================================================*/
18 #include "qemu/osdep.h"
23 #include <sys/ioctl.h>
24 #include "qemu-common.h"
25 #include "block/block_int.h"
26 #include "qemu/option.h"
27 #include "qemu/queue.h"
28 #include "qemu/timer.h"
29 #include "block/block.h"
30 #include "block/blksim.h"
31 #include "block/fvd-ext.h"
44 static int64_t sim_uuid
= 0;
45 static int64_t current_time
= 0;
46 static int64_t rand_time
= 0;
47 static int interactive_print
= FALSE
;
51 * Note: disk_io_return_code, sim_set_disk_io_return_code(), and insert_task()
52 * work together to ensure that multiple subrequests triggered by the same
53 * outermost request either succeed together or fail together. This behavior
54 * is required by qemu-test. Here is one example of problems caused by
55 * departing from this behavior. Consider a write request that generates
56 * two subrequests, w1 and w2. If w1 succeeds but w2 fails, the data will not
57 * be written into qemu-test's "truth image" but the part of the data handled
58 * by w1 will be written into qemu-test's "test image". As a result, their
59 * contents diverge and automated testing cannot continue.
61 static int disk_io_return_code
= 0;
63 typedef struct BDRVSimState
{
67 typedef struct SimAIOCB
{
68 BlockDriverAIOCB common
;
76 struct SimAIOCB
*next
;
77 struct SimAIOCB
*prev
;
81 static AIOCBInfo sim_aio_pool
= {
82 .aiocb_size
= sizeof (SimAIOCB
),
85 static SimAIOCB head
= {
87 .time
= (int64_t) (9223372036854775807ULL),
93 /* Debug a specific task.*/
95 # define CHECK_TASK(acb) do { } while (0)
97 static inline void CHECK_TASK (int64_t uuid
)
100 printf ("CHECK_TASK pause for task %" PRId64
"\n", uuid
);
105 /* do_io() should never fail. A failure indicates a bug in the upper layer
106 * block device driver, or failure in the real hardware. */
107 static int do_io (BlockDriverState
* bs
, int64_t sector_num
, uint8_t * buf
,
108 int nb_sectors
, int do_read
)
110 BDRVSimState
*s
= bs
->opaque
;
111 size_t size
= nb_sectors
* 512;
114 if (lseek (s
->fd
, sector_num
* 512, SEEK_SET
) < 0) {
115 fprintf (stderr
, "Error: lseek %s sector_num=%" PRId64
". "
116 "Pause process %d for debugging...\n",
117 bs
->filename
, sector_num
, getpid ());
124 ret
= read (s
->fd
, buf
, size
);
127 "Error: read beyond the size of %s sector_num=%" PRId64
128 " nb_sectors=%d. Pause process %d for debugging...\n",
129 bs
->filename
, sector_num
, nb_sectors
, getpid ());
133 ret
= write (s
->fd
, buf
, size
);
139 } else if (errno
!= EINTR
) {
140 fprintf (stderr
, "Error: %s %s sector_num=%" PRId64
141 " nb_sectors=%d. Pause process %d for debugging...\n",
142 do_read
? "READ" : "WRITE", bs
->filename
, sector_num
,
143 nb_sectors
, getpid ());
152 static int sim_read (BlockDriverState
* bs
, int64_t sector_num
, uint8_t * buf
,
155 return do_io (bs
, sector_num
, buf
, nb_sectors
, TRUE
);
158 static int sim_write (BlockDriverState
* bs
, int64_t sector_num
,
159 const uint8_t * buf
, int nb_sectors
)
161 return do_io (bs
, sector_num
, (uint8_t *) buf
, nb_sectors
, FALSE
);
164 static void insert_in_list (SimAIOCB
* acb
)
166 int64_t new_id
= sim_uuid
++;
170 if (rand_time
<= 0) {
171 /* Working with qemu-io.c and not doing delay randomization.
172 * Insert it to the tail. */
174 acb
->prev
= head
.prev
;
176 head
.prev
->next
= acb
;
181 if (acb
->time
>= 0) {
182 /* Introduce a random delay to better trigger rare race conditions. */
183 acb
->time
+= random () % rand_time
;
186 /* Find the position to insert. The list is sorted in ascending time. */
187 SimAIOCB
*p
= head
.next
;
189 if (p
->time
> acb
->time
) {
192 if (p
->time
== acb
->time
&& (random () % 2 == 0)) {
198 /* Insert acb before p. */
205 /* Debug problems related to reusing task objects. Problem already solved.*/
207 # define my_qemu_aio_get qemu_aio_get
208 # define my_qemu_aio_unref qemu_aio_unref
211 static SimAIOCB
*search_task_list (SimAIOCB
* acb
)
214 for (p
= head
.next
; p
!= &head
; p
= p
->next
) {
223 static inline void *my_qemu_aio_get (AIOCBInfo
*pool
, BlockDriverState
*bs
,
224 BlockDriverCompletionFunc
* cb
,
227 SimAIOCB
*acb
= (SimAIOCB
*) qemu_aio_get (&sim_aio_pool
, bs
, cb
, opaque
);
228 QDEBUG ("SIM: qemu_aio_get reuse old task%" PRId64
"\n", acb
->uuid
);
229 ASSERT (!search_task_list (acb
));
233 static inline void my_qemu_aio_unref (SimAIOCB
* acb
)
235 QDEBUG ("SIM: qemu_aio_unref task%" PRId64
"\n", acb
->uuid
);
236 qemu_aio_unref (acb
);
240 static BlockDriverAIOCB
*insert_task (int op
, BlockDriverState
* bs
,
241 int64_t sector_num
, QEMUIOVector
* qiov
,
243 BlockDriverCompletionFunc
* cb
,
246 SimAIOCB
*acb
= my_qemu_aio_get (&sim_aio_pool
, bs
, cb
, opaque
);
252 acb
->sector_num
= sector_num
;
254 acb
->nb_sectors
= nb_sectors
;
255 acb
->ret
= disk_io_return_code
;
256 acb
->time
= current_time
;
257 insert_in_list (acb
);
259 if (interactive_print
) {
260 if (op
== SIM_READ
) {
261 printf ("Added READ uuid=%" PRId64
" filename=%s sector_num=%"
262 PRId64
" nb_sectors=%d\n", acb
->uuid
,
263 acb
->common
.bs
->filename
, acb
->sector_num
, acb
->nb_sectors
);
264 } else if (op
== SIM_WRITE
) {
265 printf ("Added WRITE uuid=%" PRId64
" filename=%s sector_num=%"
266 PRId64
" nb_sectors=%d\n", acb
->uuid
,
267 acb
->common
.bs
->filename
, acb
->sector_num
, acb
->nb_sectors
);
269 fprintf (stderr
, "Unknown op %d\n", op
);
277 static void insert_aio_callback (SimAIOCB
* acb
)
279 acb
->time
= current_time
;
280 insert_in_list (acb
);
282 if (acb
->op
== SIM_FLUSH
) {
283 acb
->op
= SIM_FLUSH_CALLBACK
;
284 if (interactive_print
) {
285 printf ("Added FLUSH_CALLBACK uuid=%" PRId64
" filename=%s\n",
286 acb
->uuid
, acb
->common
.bs
->filename
);
288 } else if (acb
->op
== SIM_READ
) {
289 acb
->op
= SIM_READ_CALLBACK
;
290 if (interactive_print
) {
291 printf ("Added READ_CALLBACK uuid=%" PRId64
292 " filename=%s sector_num=%" PRId64
" nb_sectors=%d\n",
293 acb
->uuid
, acb
->common
.bs
->filename
, acb
->sector_num
,
296 } else if (acb
->op
== SIM_WRITE
) {
297 acb
->op
= SIM_WRITE_CALLBACK
;
298 if (interactive_print
) {
299 printf ("Added WRITE_CALLBACK uuid=%" PRId64
300 " filename=%s sector_num=%" PRId64
" nb_sectors=%d\n",
301 acb
->uuid
, acb
->common
.bs
->filename
, acb
->sector_num
,
305 fprintf (stderr
, "Wrong op %d\n", acb
->op
);
310 void sim_list_tasks (void)
314 for (acb
= head
.next
; acb
!= &head
; acb
= acb
->next
) {
315 if (acb
->op
== SIM_READ
) {
316 printf ("uuid=%" PRId64
" READ file=%s sector_num=%"
317 PRIu64
" nb_sectors=%d\n", acb
->uuid
,
318 acb
->common
.bs
->filename
, acb
->sector_num
, acb
->nb_sectors
);
319 } else if (acb
->op
== SIM_WRITE
) {
320 printf ("uuid=%" PRId64
" WRITE file=%s sector_num=%"
321 PRIu64
" nb_sectors=%d\n", acb
->uuid
,
322 acb
->common
.bs
->filename
, acb
->sector_num
, acb
->nb_sectors
);
323 } else if (acb
->op
== SIM_READ_CALLBACK
) {
324 printf ("uuid=%" PRId64
" CALLBACK READ file=%s sector_num=%"
325 PRIu64
" nb_sectors=%d\n", acb
->uuid
,
326 acb
->common
.bs
->filename
, acb
->sector_num
, acb
->nb_sectors
);
327 } else if (acb
->op
== SIM_WRITE_CALLBACK
) {
328 printf ("uuid=%" PRId64
" CALLBACK WRITE file=%s sector_num=%"
329 PRIu64
" nb_sectors=%d\n", acb
->uuid
,
330 acb
->common
.bs
->filename
, acb
->sector_num
, acb
->nb_sectors
);
332 fprintf (stderr
, "Wrong OP %d\n", acb
->op
);
338 static inline void sim_callback (SimAIOCB
* acb
)
340 ASSERT (disk_io_return_code
== 0);
341 FVD_DEBUG_ACB (acb
->common
.opaque
);
342 acb
->common
.cb (acb
->common
.opaque
, acb
->ret
);
345 int64_t sim_get_time (void)
350 void *sim_new_timer (void *cb
, void *opaque
)
352 SimAIOCB
*acb
= my_qemu_aio_get (&sim_aio_pool
, NULL
, cb
, opaque
);
358 void sim_mod_timer (void *ts
, int64_t expire_time
)
363 /* Remove it first. */
364 acb
->next
->prev
= acb
->prev
;
365 acb
->prev
->next
= acb
->next
;
367 acb
->time
= expire_time
;
368 insert_in_list (acb
);
371 void sim_free_timer (void *ts
)
374 CHECK_TASK (acb
->uuid
);
375 my_qemu_aio_unref (acb
);
378 void sim_del_timer (void *ts
)
382 CHECK_TASK (acb
->uuid
);
384 /* Remove it from the list. */
385 acb
->next
->prev
= acb
->prev
;
386 acb
->prev
->next
= acb
->next
;
388 /* Mark it as not in list. */
393 void sim_set_disk_io_return_code (int ret
)
395 disk_io_return_code
= ret
;
398 static void sim_task_by_acb (SimAIOCB
* acb
)
400 CHECK_TASK (acb
->uuid
);
402 /* Remove it from the list. */
403 acb
->next
->prev
= acb
->prev
;
404 acb
->prev
->next
= acb
->next
;
405 acb
->prev
= NULL
; /* Indicate that it is no longer in the list. */
407 if (acb
->time
> current_time
) {
408 current_time
= acb
->time
;
411 if (acb
->op
== SIM_TIMER
) {
412 QDEBUG ("SIM: execute task%" PRId64
" time=%" PRId64
" TIMER \n",
413 acb
->uuid
, acb
->time
);
415 FVD_DEBUG_ACB (acb
->common
.opaque
);
416 ((QEMUTimerCB
*) acb
->common
.cb
) (acb
->common
.opaque
);
420 BlockDriverState
*bs
= acb
->common
.bs
;
422 if (acb
->op
== SIM_READ
) {
423 QDEBUG ("SIM: execute task%" PRId64
" time=%" PRId64
424 " READ sector_num=%" PRId64
" nb_sectors=%d\n",
425 acb
->uuid
, acb
->time
, acb
->sector_num
, acb
->nb_sectors
);
428 if (acb
->qiov
->niov
== 1) {
430 (bs
, acb
->sector_num
, acb
->qiov
->iov
->iov_base
,
431 acb
->nb_sectors
) != 0) {
432 fprintf (stderr
, "Error in reading %s sector_num=%" PRId64
433 " nb_sectors=%d\n", acb
->common
.bs
->filename
,
434 acb
->sector_num
, acb
->nb_sectors
);
439 qemu_blockalign (acb
->common
.bs
, acb
->qiov
->size
);
440 if (sim_read (bs
, acb
->sector_num
, buf
, acb
->nb_sectors
) != 0) {
441 fprintf (stderr
, "Error in reading %s sector_num=%" PRId64
442 " nb_sectors=%d\n", acb
->common
.bs
->filename
,
443 acb
->sector_num
, acb
->nb_sectors
);
446 qemu_iovec_from_buf(acb
->qiov
, 0, buf
, acb
->qiov
->size
);
451 insert_aio_callback (acb
);
452 } else if (acb
->op
== SIM_WRITE
) {
453 QDEBUG ("SIM: execute task%" PRId64
" time=%" PRId64
454 " WRITE sector_num=%" PRId64
" nb_sectors=%d\n",
455 acb
->uuid
, acb
->time
, acb
->sector_num
, acb
->nb_sectors
);
458 if (acb
->qiov
->niov
== 1) {
460 (bs
, acb
->sector_num
, acb
->qiov
->iov
->iov_base
,
461 acb
->nb_sectors
) != 0) {
462 fprintf (stderr
, "Error in writing %s sector_num=%" PRId64
463 " nb_sectors=%d\n", acb
->common
.bs
->filename
,
464 acb
->sector_num
, acb
->nb_sectors
);
468 uint8_t *buf
= qemu_blockalign (acb
->common
.bs
,
470 qemu_iovec_to_buf(acb
->qiov
, 0, buf
, acb
->qiov
->size
);
471 if (sim_write (bs
, acb
->sector_num
, buf
, acb
->nb_sectors
)!= 0) {
472 fprintf (stderr
, "Error in writing %s sector_num=%" PRId64
473 " nb_sectors=%d\n", acb
->common
.bs
->filename
,
474 acb
->sector_num
, acb
->nb_sectors
);
481 insert_aio_callback (acb
);
482 } else if (acb
->op
== SIM_FLUSH
) {
483 QDEBUG ("SIM: execute task%" PRId64
" time=%" PRId64
" FLUSH\n",
484 acb
->uuid
, acb
->time
);
485 /* Skip real flushing to speed up simulation:
486 * if (ret == 0) { * fdatasync (s->fd); } */
487 insert_aio_callback (acb
);
488 } else if (acb
->op
== SIM_WRITE_CALLBACK
|| acb
->op
== SIM_READ_CALLBACK
489 || acb
->op
== SIM_FLUSH_CALLBACK
) {
490 QDEBUG ("SIM: execute task%" PRId64
" time=%" PRId64
" CALLBACK\n",
491 acb
->uuid
, acb
->time
);
493 CHECK_TASK (acb
->uuid
);
494 my_qemu_aio_unref (acb
);
496 fprintf (stderr
, "Unknown op %d\n", acb
->op
);
501 int sim_task_by_uuid (int64_t uuid
)
505 for (acb
= head
.next
; acb
!= &head
; acb
= acb
->next
) {
506 if (acb
->uuid
== uuid
) {
507 sim_task_by_acb (acb
);
515 int sim_all_tasks (void)
520 SimAIOCB
*acb
= head
.next
;
525 sim_task_by_acb (acb
);
530 static BlockDriverAIOCB
*sim_aio_readv (BlockDriverState
* bs
,
534 BlockDriverCompletionFunc
* cb
,
537 return insert_task (SIM_READ
, bs
, sector_num
, qiov
, nb_sectors
, cb
, opaque
);
540 static BlockDriverAIOCB
*sim_aio_writev (BlockDriverState
* bs
,
544 BlockDriverCompletionFunc
* cb
,
547 return insert_task (SIM_WRITE
, bs
, sector_num
, qiov
, nb_sectors
, cb
,
551 static BlockDriverAIOCB
*sim_aio_flush (BlockDriverState
* bs
,
552 BlockDriverCompletionFunc
* cb
,
555 return insert_task (SIM_FLUSH
, bs
, 0, NULL
, 0, cb
, opaque
);
559 static void sim_aio_cancel (BlockDriverAIOCB
* blockacb
)
561 SimAIOCB
*acb
= container_of (blockacb
, SimAIOCB
, common
);
563 CHECK_TASK (acb
->uuid
);
566 acb
->next
->prev
= acb
->prev
;
567 acb
->prev
->next
= acb
->next
;
569 my_qemu_aio_unref (acb
);
571 ASSERT (FALSE
); /* Cancel a task not in the list. */
576 static int sim_probe (const uint8_t * buf
, int buf_size
, const char *filename
)
578 /* Return a score higher than RAW so that the image will be opened using
579 * the 'sim' format. */
583 static QemuOptsList runtime_opts
= {
585 .head
= QTAILQ_HEAD_INITIALIZER(runtime_opts
.head
),
589 .type
= QEMU_OPT_STRING
,
590 .help
= "File name of the image",
592 { /* end of list */ }
596 static int sim_open(BlockDriverState
*bs
, QDict
*options
, int bdrv_flags
,
599 BDRVSimState
*s
= bs
->opaque
;
600 int open_flags
= O_BINARY
| O_LARGEFILE
;
601 Error
*local_err
= NULL
;
602 const char *filename
;
604 QemuOpts
*opts
= qemu_opts_create(&runtime_opts
, NULL
, 0, &error_abort
);
605 qemu_opts_absorb_qdict(opts
, options
, &local_err
);
607 qerror_report_err(local_err
);
608 error_free(local_err
);
612 filename
= qemu_opt_get(opts
, "filename");
614 if ((bdrv_flags
& BDRV_O_RDWR
)) {
615 open_flags
|= O_RDWR
;
617 open_flags
|= O_RDONLY
;
620 if ((bdrv_flags
& BDRV_O_NOCACHE
)) {
621 open_flags
|= O_DIRECT
;
622 } else if (!(bdrv_flags
& BDRV_O_CACHE_WB
)) {
623 open_flags
|= O_DSYNC
;
626 /* Parse the "blksim:" prefix */
627 if (!strncmp(filename
, "blksim:", strlen("blksim:"))) {
628 filename
+= strlen("blksim:");
631 s
->fd
= open (filename
, open_flags
);
635 int64_t len
= lseek (s
->fd
, 0, SEEK_END
);
637 bs
->total_sectors
= len
/ 512;
639 bs
->total_sectors
= 0;
646 static void sim_close (BlockDriverState
* bs
)
648 BDRVSimState
*s
= bs
->opaque
;
652 static int sim_flush (BlockDriverState
* bs
)
655 * Skip real flushing to speed up simulation.
656 * BDRVSimState *s = bs->opaque;
662 static int sim_has_zero_init (BlockDriverState
* bs
)
666 if (stat (bs
->filename
, &buf
) != 0) {
667 fprintf (stderr
, "Failed to stat() %s\n", bs
->filename
);
671 if (S_ISBLK (buf
.st_mode
) || S_ISCHR (buf
.st_mode
)) {
678 static int sim_truncate (BlockDriverState
* bs
, int64_t offset
)
680 BDRVSimState
*s
= bs
->opaque
;
681 return ftruncate (s
->fd
, offset
);
684 static BlockDriver bdrv_sim
= {
685 .format_name
= "blksim",
686 .protocol_name
= "blksim",
687 .instance_size
= sizeof (BDRVSimState
),
688 .bdrv_probe
= sim_probe
,
689 .bdrv_file_open
= sim_open
,
690 .bdrv_close
= sim_close
,
691 .bdrv_co_flush_to_disk
= sim_flush
,
692 .bdrv_read
= sim_read
,
693 .bdrv_write
= sim_write
,
694 .bdrv_aio_readv
= sim_aio_readv
,
695 .bdrv_aio_writev
= sim_aio_writev
,
696 .bdrv_aio_flush
= sim_aio_flush
,
697 .bdrv_has_zero_init
= sim_has_zero_init
,
698 .bdrv_truncate
= sim_truncate
,
701 void enable_block_sim (int print
, int64_t _rand_time
)
703 BlockDriver
*drv
= bdrv_find_format ("blksim");
705 bdrv_register (&bdrv_sim
);
707 interactive_print
= print
;
708 rand_time
= _rand_time
;