util/async.c

   1 /*
   2  * Data plane event loop
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2009-2017 QEMU contributors
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a copy
   8  * of this software and associated documentation files (the "Software"), to deal
   9  * in the Software without restriction, including without limitation the rights
  10  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11  * copies of the Software, and to permit persons to whom the Software is
  12  * furnished to do so, subject to the following conditions:
  13  *
  14  * The above copyright notice and this permission notice shall be included in
  15  * all copies or substantial portions of the Software.
  16  *
  17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  23  * THE SOFTWARE.
  24  */
  25
  26 #include "qemu/osdep.h"
  27 #include "qapi/error.h"
  28 #include "block/aio.h"
  29 #include "block/thread-pool.h"
  30 #include "qemu/main-loop.h"
  31 #include "qemu/atomic.h"
  32 #include "qemu/rcu_queue.h"
  33 #include "block/raw-aio.h"
  34 #include "qemu/coroutine_int.h"
  35 #include "trace.h"
  36
  37 /***********************************************************/
  38 /* bottom halves (can be seen as timers which expire ASAP) */
  39
  40 /* QEMUBH::flags values */
  41 enum {
  42     /* Already enqueued and waiting for aio_bh_poll() */
  43     BH_PENDING   = (1 << 0),
  44
  45     /* Invoke the callback */
  46     BH_SCHEDULED = (1 << 1),
  47
  48     /* Delete without invoking callback */
  49     BH_DELETED   = (1 << 2),
  50
  51     /* Delete after invoking callback */
  52     BH_ONESHOT   = (1 << 3),
  53
  54     /* Schedule periodically when the event loop is idle */
  55     BH_IDLE      = (1 << 4),
  56 };
  57
  58 struct QEMUBH {
  59     AioContext *ctx;
  60     QEMUBHFunc *cb;
  61     void *opaque;
  62     QSLIST_ENTRY(QEMUBH) next;
  63     unsigned flags;
  64 };
  65
  66 /* Called concurrently from any thread */
  67 static void aio_bh_enqueue(QEMUBH *bh, unsigned new_flags)
  68 {
  69     AioContext *ctx = bh->ctx;
  70     unsigned old_flags;
  71
  72     /*
  73      * The memory barrier implicit in atomic_fetch_or makes sure that:
  74      * 1. idle & any writes needed by the callback are done before the
  75      *    locations are read in the aio_bh_poll.
  76      * 2. ctx is loaded before the callback has a chance to execute and bh
  77      *    could be freed.
  78      */
  79     old_flags = atomic_fetch_or(&bh->flags, BH_PENDING | new_flags);
  80     if (!(old_flags & BH_PENDING)) {
  81         QSLIST_INSERT_HEAD_ATOMIC(&ctx->bh_list, bh, next);
  82     }
  83
  84     aio_notify(ctx);
  85 }
  86
  87 /* Only called from aio_bh_poll() and aio_ctx_finalize() */
  88 static QEMUBH *aio_bh_dequeue(BHList *head, unsigned *flags)
  89 {
  90     QEMUBH *bh = QSLIST_FIRST_RCU(head);
  91
  92     if (!bh) {
  93         return NULL;
  94     }
  95
  96     QSLIST_REMOVE_HEAD(head, next);
  97
  98     /*
  99      * The atomic_and is paired with aio_bh_enqueue().  The implicit memory
 100      * barrier ensures that the callback sees all writes done by the scheduling
 101      * thread.  It also ensures that the scheduling thread sees the cleared
 102      * flag before bh->cb has run, and thus will call aio_notify again if
 103      * necessary.
 104      */
 105     *flags = atomic_fetch_and(&bh->flags,
 106                               ~(BH_PENDING | BH_SCHEDULED | BH_IDLE));
 107     return bh;
 108 }
 109
 110 void aio_bh_schedule_oneshot(AioContext *ctx, QEMUBHFunc *cb, void *opaque)
 111 {
 112     QEMUBH *bh;
 113     bh = g_new(QEMUBH, 1);
 114     *bh = (QEMUBH){
 115         .ctx = ctx,
 116         .cb = cb,
 117         .opaque = opaque,
 118     };
 119     aio_bh_enqueue(bh, BH_SCHEDULED | BH_ONESHOT);
 120 }
 121
 122 QEMUBH *aio_bh_new(AioContext *ctx, QEMUBHFunc *cb, void *opaque)
 123 {
 124     QEMUBH *bh;
 125     bh = g_new(QEMUBH, 1);
 126     *bh = (QEMUBH){
 127         .ctx = ctx,
 128         .cb = cb,
 129         .opaque = opaque,
 130     };
 131     return bh;
 132 }
 133
 134 void aio_bh_call(QEMUBH *bh)
 135 {
 136     bh->cb(bh->opaque);
 137 }
 138
 139 /* Multiple occurrences of aio_bh_poll cannot be called concurrently. */
 140 int aio_bh_poll(AioContext *ctx)
 141 {
 142     BHListSlice slice;
 143     BHListSlice *s;
 144     int ret = 0;
 145
 146     QSLIST_MOVE_ATOMIC(&slice.bh_list, &ctx->bh_list);
 147     QSIMPLEQ_INSERT_TAIL(&ctx->bh_slice_list, &slice, next);
 148
 149     while ((s = QSIMPLEQ_FIRST(&ctx->bh_slice_list))) {
 150         QEMUBH *bh;
 151         unsigned flags;
 152
 153         bh = aio_bh_dequeue(&s->bh_list, &flags);
 154         if (!bh) {
 155             QSIMPLEQ_REMOVE_HEAD(&ctx->bh_slice_list, next);
 156             continue;
 157         }
 158
 159         if ((flags & (BH_SCHEDULED | BH_DELETED)) == BH_SCHEDULED) {
 160             /* Idle BHs don't count as progress */
 161             if (!(flags & BH_IDLE)) {
 162                 ret = 1;
 163             }
 164             aio_bh_call(bh);
 165         }
 166         if (flags & (BH_DELETED | BH_ONESHOT)) {
 167             g_free(bh);
 168         }
 169     }
 170
 171     return ret;
 172 }
 173
 174 void qemu_bh_schedule_idle(QEMUBH *bh)
 175 {
 176     aio_bh_enqueue(bh, BH_SCHEDULED | BH_IDLE);
 177 }
 178
 179 void qemu_bh_schedule(QEMUBH *bh)
 180 {
 181     aio_bh_enqueue(bh, BH_SCHEDULED);
 182 }
 183
 184 /* This func is async.
 185  */
 186 void qemu_bh_cancel(QEMUBH *bh)
 187 {
 188     atomic_and(&bh->flags, ~BH_SCHEDULED);
 189 }
 190
 191 /* This func is async.The bottom half will do the delete action at the finial
 192  * end.
 193  */
 194 void qemu_bh_delete(QEMUBH *bh)
 195 {
 196     aio_bh_enqueue(bh, BH_DELETED);
 197 }
 198
 199 static int64_t aio_compute_bh_timeout(BHList *head, int timeout)
 200 {
 201     QEMUBH *bh;
 202
 203     QSLIST_FOREACH_RCU(bh, head, next) {
 204         if ((bh->flags & (BH_SCHEDULED | BH_DELETED)) == BH_SCHEDULED) {
 205             if (bh->flags & BH_IDLE) {
 206                 /* idle bottom halves will be polled at least
 207                  * every 10ms */
 208                 timeout = 10000000;
 209             } else {
 210                 /* non-idle bottom halves will be executed
 211                  * immediately */
 212                 return 0;
 213             }
 214         }
 215     }
 216
 217     return timeout;
 218 }
 219
 220 int64_t
 221 aio_compute_timeout(AioContext *ctx)
 222 {
 223     BHListSlice *s;
 224     int64_t deadline;
 225     int timeout = -1;
 226
 227     timeout = aio_compute_bh_timeout(&ctx->bh_list, timeout);
 228     if (timeout == 0) {
 229         return 0;
 230     }
 231
 232     QSIMPLEQ_FOREACH(s, &ctx->bh_slice_list, next) {
 233         timeout = aio_compute_bh_timeout(&s->bh_list, timeout);
 234         if (timeout == 0) {
 235             return 0;
 236         }
 237     }
 238
 239     deadline = timerlistgroup_deadline_ns(&ctx->tlg);
 240     if (deadline == 0) {
 241         return 0;
 242     } else {
 243         return qemu_soonest_timeout(timeout, deadline);
 244     }
 245 }
 246
 247 static gboolean
 248 aio_ctx_prepare(GSource *source, gint    *timeout)
 249 {
 250     AioContext *ctx = (AioContext *) source;
 251
 252     atomic_or(&ctx->notify_me, 1);
 253
 254     /* We assume there is no timeout already supplied */
 255     *timeout = qemu_timeout_ns_to_ms(aio_compute_timeout(ctx));
 256
 257     if (aio_prepare(ctx)) {
 258         *timeout = 0;
 259     }
 260
 261     return *timeout == 0;
 262 }
 263
 264 static gboolean
 265 aio_ctx_check(GSource *source)
 266 {
 267     AioContext *ctx = (AioContext *) source;
 268     QEMUBH *bh;
 269     BHListSlice *s;
 270
 271     atomic_and(&ctx->notify_me, ~1);
 272     aio_notify_accept(ctx);
 273
 274     QSLIST_FOREACH_RCU(bh, &ctx->bh_list, next) {
 275         if ((bh->flags & (BH_SCHEDULED | BH_DELETED)) == BH_SCHEDULED) {
 276             return true;
 277         }
 278     }
 279
 280     QSIMPLEQ_FOREACH(s, &ctx->bh_slice_list, next) {
 281         QSLIST_FOREACH_RCU(bh, &s->bh_list, next) {
 282             if ((bh->flags & (BH_SCHEDULED | BH_DELETED)) == BH_SCHEDULED) {
 283                 return true;
 284             }
 285         }
 286     }
 287     return aio_pending(ctx) || (timerlistgroup_deadline_ns(&ctx->tlg) == 0);
 288 }
 289
 290 static gboolean
 291 aio_ctx_dispatch(GSource     *source,
 292                  GSourceFunc  callback,
 293                  gpointer     user_data)
 294 {
 295     AioContext *ctx = (AioContext *) source;
 296
 297     assert(callback == NULL);
 298     aio_dispatch(ctx);
 299     return true;
 300 }
 301
 302 static void
 303 aio_ctx_finalize(GSource     *source)
 304 {
 305     AioContext *ctx = (AioContext *) source;
 306     QEMUBH *bh;
 307     unsigned flags;
 308
 309     thread_pool_free(ctx->thread_pool);
 310
 311 #ifdef CONFIG_LINUX_AIO
 312     if (ctx->linux_aio) {
 313         laio_detach_aio_context(ctx->linux_aio, ctx);
 314         laio_cleanup(ctx->linux_aio);
 315         ctx->linux_aio = NULL;
 316     }
 317 #endif
 318
 319 #ifdef CONFIG_LINUX_IO_URING
 320     if (ctx->linux_io_uring) {
 321         luring_detach_aio_context(ctx->linux_io_uring, ctx);
 322         luring_cleanup(ctx->linux_io_uring);
 323         ctx->linux_io_uring = NULL;
 324     }
 325 #endif
 326
 327     assert(QSLIST_EMPTY(&ctx->scheduled_coroutines));
 328     qemu_bh_delete(ctx->co_schedule_bh);
 329
 330     /* There must be no aio_bh_poll() calls going on */
 331     assert(QSIMPLEQ_EMPTY(&ctx->bh_slice_list));
 332
 333     while ((bh = aio_bh_dequeue(&ctx->bh_list, &flags))) {
 334         /* qemu_bh_delete() must have been called on BHs in this AioContext */
 335         assert(flags & BH_DELETED);
 336
 337         g_free(bh);
 338     }
 339
 340     aio_set_event_notifier(ctx, &ctx->notifier, false, NULL, NULL);
 341     event_notifier_cleanup(&ctx->notifier);
 342     qemu_rec_mutex_destroy(&ctx->lock);
 343     qemu_lockcnt_destroy(&ctx->list_lock);
 344     timerlistgroup_deinit(&ctx->tlg);
 345     aio_context_destroy(ctx);
 346 }
 347
 348 static GSourceFuncs aio_source_funcs = {
 349     aio_ctx_prepare,
 350     aio_ctx_check,
 351     aio_ctx_dispatch,
 352     aio_ctx_finalize
 353 };
 354
 355 GSource *aio_get_g_source(AioContext *ctx)
 356 {
 357     g_source_ref(&ctx->source);
 358     return &ctx->source;
 359 }
 360
 361 ThreadPool *aio_get_thread_pool(AioContext *ctx)
 362 {
 363     if (!ctx->thread_pool) {
 364         ctx->thread_pool = thread_pool_new(ctx);
 365     }
 366     return ctx->thread_pool;
 367 }
 368
 369 #ifdef CONFIG_LINUX_AIO
 370 LinuxAioState *aio_setup_linux_aio(AioContext *ctx, Error **errp)
 371 {
 372     if (!ctx->linux_aio) {
 373         ctx->linux_aio = laio_init(errp);
 374         if (ctx->linux_aio) {
 375             laio_attach_aio_context(ctx->linux_aio, ctx);
 376         }
 377     }
 378     return ctx->linux_aio;
 379 }
 380
 381 LinuxAioState *aio_get_linux_aio(AioContext *ctx)
 382 {
 383     assert(ctx->linux_aio);
 384     return ctx->linux_aio;
 385 }
 386 #endif
 387
 388 #ifdef CONFIG_LINUX_IO_URING
 389 LuringState *aio_setup_linux_io_uring(AioContext *ctx, Error **errp)
 390 {
 391     if (ctx->linux_io_uring) {
 392         return ctx->linux_io_uring;
 393     }
 394
 395     ctx->linux_io_uring = luring_init(errp);
 396     if (!ctx->linux_io_uring) {
 397         return NULL;
 398     }
 399
 400     luring_attach_aio_context(ctx->linux_io_uring, ctx);
 401     return ctx->linux_io_uring;
 402 }
 403
 404 LuringState *aio_get_linux_io_uring(AioContext *ctx)
 405 {
 406     assert(ctx->linux_io_uring);
 407     return ctx->linux_io_uring;
 408 }
 409 #endif
 410
 411 void aio_notify(AioContext *ctx)
 412 {
 413     /* Write e.g. bh->scheduled before reading ctx->notify_me.  Pairs
 414      * with atomic_or in aio_ctx_prepare or atomic_add in aio_poll.
 415      */
 416     smp_mb();
 417     if (ctx->notify_me) {
 418         event_notifier_set(&ctx->notifier);
 419         atomic_mb_set(&ctx->notified, true);
 420     }
 421 }
 422
 423 void aio_notify_accept(AioContext *ctx)
 424 {
 425     if (atomic_xchg(&ctx->notified, false)
 426 #ifdef WIN32
 427         || true
 428 #endif
 429     ) {
 430         event_notifier_test_and_clear(&ctx->notifier);
 431     }
 432 }
 433
 434 static void aio_timerlist_notify(void *opaque, QEMUClockType type)
 435 {
 436     aio_notify(opaque);
 437 }
 438
 439 static void event_notifier_dummy_cb(EventNotifier *e)
 440 {
 441 }
 442
 443 /* Returns true if aio_notify() was called (e.g. a BH was scheduled) */
 444 static bool event_notifier_poll(void *opaque)
 445 {
 446     EventNotifier *e = opaque;
 447     AioContext *ctx = container_of(e, AioContext, notifier);
 448
 449     return atomic_read(&ctx->notified);
 450 }
 451
 452 static void co_schedule_bh_cb(void *opaque)
 453 {
 454     AioContext *ctx = opaque;
 455     QSLIST_HEAD(, Coroutine) straight, reversed;
 456
 457     QSLIST_MOVE_ATOMIC(&reversed, &ctx->scheduled_coroutines);
 458     QSLIST_INIT(&straight);
 459
 460     while (!QSLIST_EMPTY(&reversed)) {
 461         Coroutine *co = QSLIST_FIRST(&reversed);
 462         QSLIST_REMOVE_HEAD(&reversed, co_scheduled_next);
 463         QSLIST_INSERT_HEAD(&straight, co, co_scheduled_next);
 464     }
 465
 466     while (!QSLIST_EMPTY(&straight)) {
 467         Coroutine *co = QSLIST_FIRST(&straight);
 468         QSLIST_REMOVE_HEAD(&straight, co_scheduled_next);
 469         trace_aio_co_schedule_bh_cb(ctx, co);
 470         aio_context_acquire(ctx);
 471
 472         /* Protected by write barrier in qemu_aio_coroutine_enter */
 473         atomic_set(&co->scheduled, NULL);
 474         qemu_aio_coroutine_enter(ctx, co);
 475         aio_context_release(ctx);
 476     }
 477 }
 478
 479 AioContext *aio_context_new(Error **errp)
 480 {
 481     int ret;
 482     AioContext *ctx;
 483
 484     ctx = (AioContext *) g_source_new(&aio_source_funcs, sizeof(AioContext));
 485     QSLIST_INIT(&ctx->bh_list);
 486     QSIMPLEQ_INIT(&ctx->bh_slice_list);
 487     aio_context_setup(ctx);
 488
 489     ret = event_notifier_init(&ctx->notifier, false);
 490     if (ret < 0) {
 491         error_setg_errno(errp, -ret, "Failed to initialize event notifier");
 492         goto fail;
 493     }
 494     g_source_set_can_recurse(&ctx->source, true);
 495     qemu_lockcnt_init(&ctx->list_lock);
 496
 497     ctx->co_schedule_bh = aio_bh_new(ctx, co_schedule_bh_cb, ctx);
 498     QSLIST_INIT(&ctx->scheduled_coroutines);
 499
 500     aio_set_event_notifier(ctx, &ctx->notifier,
 501                            false,
 502                            event_notifier_dummy_cb,
 503                            event_notifier_poll);
 504 #ifdef CONFIG_LINUX_AIO
 505     ctx->linux_aio = NULL;
 506 #endif
 507
 508 #ifdef CONFIG_LINUX_IO_URING
 509     ctx->linux_io_uring = NULL;
 510 #endif
 511
 512     ctx->thread_pool = NULL;
 513     qemu_rec_mutex_init(&ctx->lock);
 514     timerlistgroup_init(&ctx->tlg, aio_timerlist_notify, ctx);
 515
 516     ctx->poll_ns = 0;
 517     ctx->poll_max_ns = 0;
 518     ctx->poll_grow = 0;
 519     ctx->poll_shrink = 0;
 520
 521     return ctx;
 522 fail:
 523     g_source_destroy(&ctx->source);
 524     return NULL;
 525 }
 526
 527 void aio_co_schedule(AioContext *ctx, Coroutine *co)
 528 {
 529     trace_aio_co_schedule(ctx, co);
 530     const char *scheduled = atomic_cmpxchg(&co->scheduled, NULL,
 531                                            __func__);
 532
 533     if (scheduled) {
 534         fprintf(stderr,
 535                 "%s: Co-routine was already scheduled in '%s'\n",
 536                 __func__, scheduled);
 537         abort();
 538     }
 539
 540     /* The coroutine might run and release the last ctx reference before we
 541      * invoke qemu_bh_schedule().  Take a reference to keep ctx alive until
 542      * we're done.
 543      */
 544     aio_context_ref(ctx);
 545
 546     QSLIST_INSERT_HEAD_ATOMIC(&ctx->scheduled_coroutines,
 547                               co, co_scheduled_next);
 548     qemu_bh_schedule(ctx->co_schedule_bh);
 549
 550     aio_context_unref(ctx);
 551 }
 552
 553 void aio_co_wake(struct Coroutine *co)
 554 {
 555     AioContext *ctx;
 556
 557     /* Read coroutine before co->ctx.  Matches smp_wmb in
 558      * qemu_coroutine_enter.
 559      */
 560     smp_read_barrier_depends();
 561     ctx = atomic_read(&co->ctx);
 562
 563     aio_co_enter(ctx, co);
 564 }
 565
 566 void aio_co_enter(AioContext *ctx, struct Coroutine *co)
 567 {
 568     if (ctx != qemu_get_current_aio_context()) {
 569         aio_co_schedule(ctx, co);
 570         return;
 571     }
 572
 573     if (qemu_in_coroutine()) {
 574         Coroutine *self = qemu_coroutine_self();
 575         assert(self != co);
 576         QSIMPLEQ_INSERT_TAIL(&self->co_queue_wakeup, co, co_queue_next);
 577     } else {
 578         aio_context_acquire(ctx);
 579         qemu_aio_coroutine_enter(ctx, co);
 580         aio_context_release(ctx);
 581     }
 582 }
 583
 584 void aio_context_ref(AioContext *ctx)
 585 {
 586     g_source_ref(&ctx->source);
 587 }
 588
 589 void aio_context_unref(AioContext *ctx)
 590 {
 591     g_source_unref(&ctx->source);
 592 }
 593
 594 void aio_context_acquire(AioContext *ctx)
 595 {
 596     qemu_rec_mutex_lock(&ctx->lock);
 597 }
 598
 599 void aio_context_release(AioContext *ctx)
 600 {
 601     qemu_rec_mutex_unlock(&ctx->lock);
 602 }