/*
 * Copyright (c) 2004,2014 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Copyright (c) 1982, 1986, 1991, 1993
 *      The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * The original callout mechanism was based on the work of Adam M. Costello
 * and George Varghese, published in a technical report entitled "Redesigning
 * the BSD Callout and Timer Facilities" and modified slightly for inclusion
 * in FreeBSD by Justin T. Gibbs.  The original work on the data structures
 * used in this implementation was published by G. Varghese and T. Lauck in
 * the paper "Hashed and Hierarchical Timing Wheels: Data Structures for
 * the Efficient Implementation of a Timer Facility" in the Proceedings of
 * the 11th ACM Annual Symposium on Operating Systems Principles,
 * Austin, Texas Nov 1987.
 *
 * The per-cpu augmentation was done by Matthew Dillon.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/kernel.h>
#include <sys/interrupt.h>
#include <sys/thread.h>

#include <sys/thread2.h>
#include <sys/mplock2.h>

struct softclock_pcpu {
        struct callout_tailq *callwheel;
        struct callout * volatile next;
        intptr_t running;       /* NOTE! Bit 0 used to flag wakeup */
        int softticks;          /* softticks index */
        int curticks;           /* per-cpu ticks counter */
        int isrunning;
        struct thread *thread;
};

typedef struct softclock_pcpu *softclock_pcpu_t;

static MALLOC_DEFINE(M_CALLOUT, "callout", "callout structures");
static int cwheelsize;
static int cwheelmask;
static struct softclock_pcpu softclock_pcpu_ary[MAXCPU];

static void softclock_handler(void *arg);
static void slotimer_callback(void *arg);
static void callout_reset_ipi(void *arg);
static void callout_stop_ipi(void *arg, int issync, struct intrframe *frame);

static void
swi_softclock_setup(void *arg)
{
        int cpu;
        int i;
        int target;

        /*
         * Figure out how large a callwheel we need.  It must be a power of 2.
         *
         * ncallout is primarily based on available memory, don't explode
         * the allocations if the system has a lot of cpus.
         */
        target = ncallout / ncpus + 16;

        cwheelsize = 1;
        while (cwheelsize < target)
                cwheelsize <<= 1;
        cwheelmask = cwheelsize - 1;

        /*
         * Initialize per-cpu data structures.
         */
        for (cpu = 0; cpu < ncpus; ++cpu) {
                softclock_pcpu_t sc;

                sc = &softclock_pcpu_ary[cpu];

                sc->callwheel = kmalloc(sizeof(*sc->callwheel) * cwheelsize,
                                        M_CALLOUT, M_WAITOK|M_ZERO);
                for (i = 0; i < cwheelsize; ++i)
                        TAILQ_INIT(&sc->callwheel[i]);

                /*
                 * Mark the softclock handler as being an interrupt thread
                 * even though it really isn't, but do not allow it to
                 * preempt other threads (do not assign td_preemptable).
                 *
                 * Kernel code now assumes that callouts do not preempt
                 * the cpu they were scheduled on.
                 */
                lwkt_create(softclock_handler, sc, &sc->thread, NULL,
                            TDF_NOSTART | TDF_INTTHREAD,
                            cpu, "softclock %d", cpu);
        }
}

/*
 * Must occur after ncpus has been initialized.
 */
SYSINIT(softclock_setup, SI_BOOT2_SOFTCLOCK, SI_ORDER_SECOND,
        swi_softclock_setup, NULL);

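/*
 * Illustrative sizing sketch (not part of the build): assuming hypothetical
 * values ncallout = 16384 and ncpus = 16, target = 16384/16 + 16 = 1040, so
 * cwheelsize rounds up to the next power of 2 (2048) and cwheelmask becomes
 * 0x7ff.  A callout with c_time = 12345 then hashes into bucket
 * 12345 & 0x7ff = 57 of that cpu's callwheel.
 */
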
/*
 * Clear PENDING and, if possible, also clear ARMED and WAITING.  Returns
 * the flags prior to the clear, atomically (used to check for WAITING).
 *
 * Clearing the cpu association (ARMED) can significantly improve the
 * performance of the next callout_reset*() call.
 */
static __inline
int
callout_unpend_disarm(struct callout *c)
{
        int flags;
        int nflags;

        for (;;) {
                flags = c->c_flags;
                cpu_ccfence();
                nflags = flags & ~(CALLOUT_PENDING | CALLOUT_WAITING);
                if ((flags & CALLOUT_IPI_MASK) == 0)
                        nflags &= ~CALLOUT_ARMED;
                if (atomic_cmpset_int(&c->c_flags, flags, nflags)) {
                        break;
                }
                cpu_pause();
                /* retry */
        }
        return flags;
}

/*
 * Clear ARMED after finishing adjustments to the callout, potentially
 * allowing other cpus to take over.  We can only do this if the IPI mask
 * is 0.
 */
static __inline
int
callout_maybe_clear_armed(struct callout *c)
{
        int flags;
        int nflags;

        for (;;) {
                flags = c->c_flags;
                cpu_ccfence();
                if (flags & (CALLOUT_PENDING | CALLOUT_IPI_MASK))
                        break;
                nflags = flags & ~CALLOUT_ARMED;
                if (atomic_cmpset_int(&c->c_flags, flags, nflags))
                        break;
                cpu_pause();
                /* retry */
        }
        return flags;
}

/*
 * This routine is called from the hardclock() (basically a FASTint/IPI) on
 * each cpu in the system.  sc->curticks is this cpu's notion of the timebase.
 * It IS NOT NECESSARILY SYNCHRONIZED WITH 'ticks'!  sc->softticks is where
 * the callwheel is currently indexed.
 *
 * WARNING!  The MP lock is not necessarily held on call, nor can it be
 * safely obtained.
 *
 * sc->softticks is adjusted by either this routine or our helper thread
 * depending on whether the helper thread is running or not.
 */
void
hardclock_softtick(globaldata_t gd)
{
        softclock_pcpu_t sc;

        sc = &softclock_pcpu_ary[gd->gd_cpuid];
        ++sc->curticks;
        if (sc->isrunning)
                return;
        if (sc->softticks == sc->curticks) {
                /*
                 * In sync, only wakeup the thread if there is something to
                 * do.
                 */
                if (TAILQ_FIRST(&sc->callwheel[sc->softticks & cwheelmask])) {
                        sc->isrunning = 1;
                        lwkt_schedule(sc->thread);
                } else {
                        ++sc->softticks;
                }
        } else {
                /*
                 * Out of sync, wakeup the thread unconditionally so it can
                 * catch up.
                 */
                sc->isrunning = 1;
                lwkt_schedule(sc->thread);
        }
}

/*
 * This procedure is the main loop of our per-cpu helper thread.  The
 * sc->isrunning flag prevents us from racing hardclock_softtick() and
 * a critical section is sufficient to interlock sc->curticks and protect
 * us from remote IPI's / list removal.
 *
 * The thread starts with the MP lock released and not in a critical
 * section.  The loop itself is MP safe while individual callbacks
 * may or may not be, so we obtain or release the MP lock as appropriate.
 */
static void
softclock_handler(void *arg)
{
        softclock_pcpu_t sc;
        struct callout *c;
        struct callout_tailq *bucket;
        struct callout slotimer;
        int mpsafe = 1;
        int flags;

        /*
         * Setup pcpu slow clocks which we want to run from the callout
         * thread.
         */
        callout_init_mp(&slotimer);
        callout_reset(&slotimer, hz * 10, slotimer_callback, &slotimer);

        /*
         * Run the callout thread at the same priority as other kernel
         * threads so it can be round-robined.
         */
        /*lwkt_setpri_self(TDPRI_SOFT_NORM);*/

        /*
         * Loop critical section against ipi operations to this cpu.
         */
        sc = arg;
        crit_enter();
loop:
        while (sc->softticks != (int)(sc->curticks + 1)) {
                bucket = &sc->callwheel[sc->softticks & cwheelmask];

                for (c = TAILQ_FIRST(bucket); c; c = sc->next) {
                        if (c->c_time != sc->softticks) {
                                sc->next = TAILQ_NEXT(c, c_links.tqe);
                                continue;
                        }

                        flags = c->c_flags;
                        if (flags & CALLOUT_MPSAFE) {
                                if (mpsafe == 0) {
                                        mpsafe = 1;
                                        rel_mplock();
                                }
                        } else {
                                /*
                                 * The request might be removed while we
                                 * are waiting to get the MP lock.  If it
                                 * was removed sc->next will point to the
                                 * next valid request or NULL, loop up.
                                 */
                                if (mpsafe) {
                                        mpsafe = 0;
                                        sc->next = c;
                                        get_mplock();
                                        if (c != sc->next)
                                                continue;
                                }
                        }
                        /*
                         * Queue protection only exists while we hold the
                         * critical section uninterrupted.
                         *
                         * Adjust sc->next when removing (c) from the queue,
                         * note that an IPI on this cpu may make further
                         * adjustments to sc->next.
                         */
                        sc->next = TAILQ_NEXT(c, c_links.tqe);
                        TAILQ_REMOVE(bucket, c, c_links.tqe);

                        KASSERT((c->c_flags & CALLOUT_ARMED) &&
                                (c->c_flags & CALLOUT_PENDING) &&
                                CALLOUT_FLAGS_TO_CPU(c->c_flags) ==
                                mycpu->gd_cpuid,
                                ("callout %p: bad flags %08x", c, c->c_flags));

                        /*
                         * Once CALLOUT_PENDING is cleared, sc->running
                         * protects the callout structure's existence but
                         * only until we call c_func().  A callout_stop()
                         * or callout_reset() issued from within c_func()
                         * will not block.  The callout can also be kfree()d
                         * by c_func().
                         *
                         * We set EXECUTED before calling c_func() so a
                         * callout_stop() issued from within c_func() returns
                         * the correct status.
                         */
                        if ((flags & (CALLOUT_AUTOLOCK | CALLOUT_ACTIVE)) ==
                            (CALLOUT_AUTOLOCK | CALLOUT_ACTIVE)) {
                                void (*c_func)(void *);
                                void *c_arg;
                                struct lock *c_lk;
                                int error;

                                /*
                                 * NOTE: sc->running must be set prior to
                                 *       CALLOUT_PENDING being cleared to
                                 *       avoid missed CANCELs and *_stop()
                                 *       races.
                                 */
                                sc->running = (intptr_t)c;
                                c_func = c->c_func;
                                c_arg = c->c_arg;
                                c_lk = c->c_lk;
                                c->c_func = NULL;
                                KKASSERT(c->c_flags & CALLOUT_DID_INIT);
                                flags = callout_unpend_disarm(c);
                                error = lockmgr(c_lk, LK_EXCLUSIVE |
                                                      LK_CANCELABLE);
                                if (error == 0) {
                                        atomic_set_int(&c->c_flags,
                                                       CALLOUT_EXECUTED);
                                        crit_exit();
                                        c_func(c_arg);
                                        crit_enter();
                                        lockmgr(c_lk, LK_RELEASE);
                                }
                        } else if (flags & CALLOUT_ACTIVE) {
                                void (*c_func)(void *);
                                void *c_arg;

                                sc->running = (intptr_t)c;
                                c_func = c->c_func;
                                c_arg = c->c_arg;
                                c->c_func = NULL;
                                KKASSERT(c->c_flags & CALLOUT_DID_INIT);
                                flags = callout_unpend_disarm(c);
                                atomic_set_int(&c->c_flags, CALLOUT_EXECUTED);
                                crit_exit();
                                c_func(c_arg);
                                crit_enter();
                        } else {
                                flags = callout_unpend_disarm(c);
                        }
                        /*
                         * Read and clear sc->running.  If bit 0 was set,
                         * a callout_stop() is likely blocked waiting for
                         * the callback to complete.
                         *
                         * The callout_unpend_disarm() above also cleared
                         * CALLOUT_WAITING and returned the contents of
                         * flags prior to clearing any bits.
                         *
                         * Interlock wakeup any _stop's waiting on us.  Note
                         * that once c_func() was called, the callout
                         * structure (c) pointer may no longer be valid.  It
                         * can only be used for the wakeup.
                         */
                        if ((atomic_readandclear_ptr(&sc->running) & 1) ||
                            (flags & CALLOUT_WAITING)) {
                                wakeup(c);
                        }
                        /* NOTE: list may have changed */
                }
                ++sc->softticks;
        }

        /*
         * Don't leave us holding the MP lock when we deschedule ourselves.
         */
        if (mpsafe == 0) {
                mpsafe = 1;
                rel_mplock();
        }
        sc->isrunning = 0;
        lwkt_deschedule_self(sc->thread);       /* == curthread */
        lwkt_switch();
        goto loop;
        /* NOT REACHED */
}

/*
 * A very slow system cleanup timer (10 second interval),
 * per-cpu.
 */
void
slotimer_callback(void *arg)
{
        struct callout *c = arg;

        slab_cleanup();
        callout_reset(c, hz * 10, slotimer_callback, c);
}

/*
 * Start or restart a timeout.  Installs the callout structure on the
 * callwheel.  Callers may legally pass any value, even if 0 or negative,
 * but since the sc->curticks index may have already been processed a
 * minimum timeout of 1 tick will be enforced.
 *
 * This function will block if the callout is currently queued to a different
 * cpu or the callback is currently running in another thread.
 */
void
callout_reset(struct callout *c, int to_ticks, void (*ftn)(void *), void *arg)
{
        softclock_pcpu_t sc;
        globaldata_t gd;

#ifdef INVARIANTS
        if ((c->c_flags & CALLOUT_DID_INIT) == 0) {
                callout_init(c);
                kprintf(
                    "callout_reset(%p) from %p: callout was not initialized\n",
                    c, ((int **)&c)[-1]);
                print_backtrace(-1);
        }
#endif
        gd = mycpu;
        sc = &softclock_pcpu_ary[gd->gd_cpuid];
        crit_enter_gd(gd);

        /*
         * Our cpu must gain ownership of the callout and cancel anything
         * still running, which is complex.  The easiest way to do it is to
         * issue a callout_stop().
         *
         * Clearing bits on flags is a way to guarantee they are not set,
         * as the cmpset atomic op will fail otherwise.  PENDING and ARMED
         * must not be set, if we find them set we loop up and call
         * stop_sync() again.
         */
        for (;;) {
                int flags;
                int nflags;

                callout_stop_sync(c);
                flags = c->c_flags & ~(CALLOUT_PENDING | CALLOUT_ARMED);
                nflags = (flags & ~(CALLOUT_CPU_MASK |
                                    CALLOUT_EXECUTED)) |
                         CALLOUT_CPU_TO_FLAGS(gd->gd_cpuid) |
                         CALLOUT_ARMED |
                         CALLOUT_PENDING |
                         CALLOUT_ACTIVE;
                if (atomic_cmpset_int(&c->c_flags, flags, nflags))
                        break;
                cpu_pause();
        }

        if (to_ticks <= 0)
                to_ticks = 1;

        c->c_arg = arg;
        c->c_func = ftn;
        c->c_time = sc->curticks + to_ticks;

        TAILQ_INSERT_TAIL(&sc->callwheel[c->c_time & cwheelmask],
                          c, c_links.tqe);
        crit_exit_gd(gd);
}

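/*
 * Usage sketch (illustrative only, not part of this file): a typical
 * consumer arms a periodic MPSAFE timer and re-arms it from its own
 * callback.  The names mydev_softc, mydev_tick and the one-second period
 * are hypothetical.
 *
 *      static void
 *      mydev_tick(void *arg)
 *      {
 *              struct mydev_softc *sc = arg;
 *
 *              ... do the periodic work ...
 *              callout_reset(&sc->timer, hz, mydev_tick, sc);
 *      }
 *
 *      callout_init_mp(&sc->timer);
 *      callout_reset(&sc->timer, hz, mydev_tick, sc);
 */
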
/*
 * Setup a callout to run on the specified cpu.  Should generally be used
 * to run a callout on a specific cpu which does not nominally change.
 */
void
callout_reset_bycpu(struct callout *c, int to_ticks, void (*ftn)(void *),
                    void *arg, int cpuid)
{
        globaldata_t gd;
        globaldata_t tgd;

#ifdef INVARIANTS
        if ((c->c_flags & CALLOUT_DID_INIT) == 0) {
                callout_init(c);
                kprintf(
                    "callout_reset(%p) from %p: callout was not initialized\n",
                    c, ((int **)&c)[-1]);
                print_backtrace(-1);
        }
#endif
        gd = mycpu;
        crit_enter_gd(gd);

        tgd = globaldata_find(cpuid);

        /*
         * Our cpu must temporarily gain ownership of the callout and cancel
         * anything still running, which is complex.  The easiest way to do
         * it is to issue a callout_stop().
         *
         * Clearing bits on flags (vs nflags) is a way to guarantee they were
         * not previously set, by forcing the atomic op to fail.  The callout
         * must not be pending or armed after the stop_sync, if it is we have
         * to loop up and stop_sync() again.
         */
        for (;;) {
                int flags;
                int nflags;

                callout_stop_sync(c);
                flags = c->c_flags & ~(CALLOUT_PENDING | CALLOUT_ARMED);
                nflags = (flags & ~(CALLOUT_CPU_MASK |
                                    CALLOUT_EXECUTED)) |
                         CALLOUT_CPU_TO_FLAGS(tgd->gd_cpuid) |
                         CALLOUT_ARMED |
                         CALLOUT_ACTIVE;
                nflags = nflags + 1;            /* bump IPI count */
                if (atomic_cmpset_int(&c->c_flags, flags, nflags))
                        break;
                cpu_pause();
        }

        /*
         * Even though we are not the cpu that now owns the callout, our
         * bumping of the IPI count (and in a situation where the callout is
         * not queued to the callwheel) will prevent anyone else from
         * depending on or acting on the contents of the callout structure.
         */
        if (to_ticks <= 0)
                to_ticks = 1;

        c->c_arg = arg;
        c->c_func = ftn;
        c->c_load = to_ticks;   /* IPI will add curticks */

        lwkt_send_ipiq(tgd, callout_reset_ipi, c);
        crit_exit_gd(gd);
}

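/*
 * Usage sketch (illustrative only): pinning a hypothetical statistics
 * timer to cpu 2 rather than the current cpu.  stat_timer and
 * stat_collect are hypothetical names; the cpu index must be a valid
 * cpuid on the running system.
 *
 *      callout_init_mp(&stat_timer);
 *      callout_reset_bycpu(&stat_timer, hz * 5, stat_collect, NULL, 2);
 */
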
/*
 * Remote IPI for callout_reset_bycpu().  The operation is performed only
 * on the 1->0 transition of the counter, otherwise there are callout_stop()s
 * pending after us.
 *
 * The IPI counter and PENDING flags must be set atomically with the
 * 1->0 transition.  The ACTIVE flag was set prior to the ipi being
 * sent and we do not want to race a caller on the original cpu trying
 * to deactivate() the flag concurrent with our installation of the
 * callout.
 */
static void
callout_reset_ipi(void *arg)
{
        struct callout *c = arg;
        globaldata_t gd = mycpu;
        globaldata_t tgd;
        int flags;
        int nflags;

        for (;;) {
                flags = c->c_flags;
                cpu_ccfence();
                KKASSERT((flags & CALLOUT_IPI_MASK) > 0);

                /*
                 * We should already be armed for our cpu.  If armed to
                 * another cpu, chain the IPI.  If for some reason we are
                 * not armed, we can arm ourselves.
                 */
                if (flags & CALLOUT_ARMED) {
                        if (CALLOUT_FLAGS_TO_CPU(flags) != gd->gd_cpuid) {
                                tgd = globaldata_find(
                                                CALLOUT_FLAGS_TO_CPU(flags));
                                lwkt_send_ipiq(tgd, callout_reset_ipi, c);
                                return;
                        }
                        nflags = (flags & ~CALLOUT_EXECUTED);
                } else {
                        nflags = (flags & ~(CALLOUT_CPU_MASK |
                                            CALLOUT_EXECUTED)) |
                                 CALLOUT_ARMED |
                                 CALLOUT_CPU_TO_FLAGS(gd->gd_cpuid);
                }

                /*
                 * Decrement the IPI count, retain and clear the WAITING
                 * status, clear EXECUTED.
                 *
                 * NOTE: It is possible for the callout to already have been
                 *       marked pending due to SMP races.
                 */
                nflags = nflags - 1;
                if ((flags & CALLOUT_IPI_MASK) == 1) {
                        nflags &= ~(CALLOUT_WAITING | CALLOUT_EXECUTED);
                        nflags |= CALLOUT_PENDING;
                }

                if (atomic_cmpset_int(&c->c_flags, flags, nflags)) {
                        /*
                         * Only install the callout on the 1->0 transition
                         * of the IPI count, and only if PENDING was not
                         * already set.  The latter situation should never
                         * occur but we check anyway.
                         */
                        if ((flags & (CALLOUT_PENDING|CALLOUT_IPI_MASK)) == 1) {
                                softclock_pcpu_t sc;

                                sc = &softclock_pcpu_ary[gd->gd_cpuid];
                                c->c_time = sc->curticks + c->c_load;
                                TAILQ_INSERT_TAIL(
                                        &sc->callwheel[c->c_time & cwheelmask],
                                        c, c_links.tqe);
                        }
                        break;
                }
                /* retry */
                cpu_pause();
        }

        /*
         * Issue wakeup if requested.
         */
        if (flags & CALLOUT_WAITING)
                wakeup(c);
}

/*
 * Stop a running timer and ensure that any running callout completes before
 * returning.  If the timer is running on another cpu this function may block
 * to interlock against the callout.  If the callout is currently executing
 * or blocked in another thread this function may also block to interlock
 * against the callout.
 *
 * The caller must be careful to avoid deadlocks, either by using
 * callout_init_lk() (which uses the lockmgr lock cancelation feature),
 * by using tokens and dealing with breaks in the serialization, or by
 * using the lockmgr lock cancelation feature yourself in the callout
 * callback function.
 *
 * callout_stop() returns non-zero if the callout was pending.
 */
static int
_callout_stop(struct callout *c, int issync)
{
        globaldata_t gd = mycpu;
        globaldata_t tgd;
        softclock_pcpu_t sc;
        int flags;
        int nflags;
        int rc;
        int cpuid;

#ifdef INVARIANTS
        if ((c->c_flags & CALLOUT_DID_INIT) == 0) {
                callout_init(c);
                kprintf(
                    "callout_stop(%p) from %p: callout was not initialized\n",
                    c, ((int **)&c)[-1]);
                print_backtrace(-1);
        }
#endif
        crit_enter_gd(gd);

        /*
         * Fast path operations:
         *
         * If ARMED and owned by our cpu, or not ARMED, and other simple
         * conditions are met, we can just clear ACTIVE and EXECUTED
         * and we are done.
         */
        for (;;) {
                flags = c->c_flags;
                cpu_ccfence();

                cpuid = CALLOUT_FLAGS_TO_CPU(flags);

                /*
                 * Can't handle an armed callout in the fast path if it is
                 * not on the current cpu.  We must atomically increment the
                 * IPI count for the IPI we intend to send and break out of
                 * the fast path to enter the slow path.
                 */
                if (flags & CALLOUT_ARMED) {
                        if (gd->gd_cpuid != cpuid) {
                                nflags = flags + 1;
                                if (atomic_cmpset_int(&c->c_flags,
                                                      flags, nflags)) {
                                        /* break to slow path */
                                        break;
                                }
                                continue;       /* retry */
                        }
                } else {
                        cpuid = gd->gd_cpuid;
                        KKASSERT((flags & CALLOUT_IPI_MASK) == 0);
                        KKASSERT((flags & CALLOUT_PENDING) == 0);
                }

                /*
                 * Process pending IPIs and retry (only if not called from
                 * an IPI).
                 */
                if (flags & CALLOUT_IPI_MASK) {
                        lwkt_process_ipiq();
                        continue;       /* retry */
                }

                /*
                 * Transition to the stopped state, recover the EXECUTED
                 * status.  If pending we cannot clear ARMED until after
                 * we have removed (c) from the callwheel.
                 *
                 * NOTE: The callout might already not be armed but in this
                 *       case it should also not be pending.
                 */
                nflags = flags & ~(CALLOUT_ACTIVE |
                                   CALLOUT_EXECUTED |
                                   CALLOUT_WAITING |
                                   CALLOUT_PENDING);

                /* NOTE: IPI_MASK already tested */
                if ((flags & CALLOUT_PENDING) == 0)
                        nflags &= ~CALLOUT_ARMED;
                if (atomic_cmpset_int(&c->c_flags, flags, nflags)) {
                        /*
                         * Can only remove from callwheel if currently
                         * pending.
                         */
                        if (flags & CALLOUT_PENDING) {
                                sc = &softclock_pcpu_ary[gd->gd_cpuid];
                                if (sc->next == c)
                                        sc->next = TAILQ_NEXT(c, c_links.tqe);
                                TAILQ_REMOVE(
                                        &sc->callwheel[c->c_time & cwheelmask],
                                        c,
                                        c_links.tqe);
                                c->c_func = NULL;

                                /*
                                 * NOTE: Can't clear ARMED until we have
                                 *       physically removed (c) from the
                                 *       callwheel.
                                 *
                                 * NOTE: WAITING bit race exists when doing
                                 *       unconditional bit clears.
                                 */
                                callout_maybe_clear_armed(c);
                                if (c->c_flags & CALLOUT_WAITING)
                                        flags |= CALLOUT_WAITING;
                        }

                        /*
                         * ARMED has been cleared at this point and (c)
                         * might now be stale.  Only good for wakeup()s.
                         */
                        if (flags & CALLOUT_WAITING)
                                wakeup(c);

                        goto skip_slow;
                }
                /* retry */
        }
        /*
         * Slow path (and not called via an IPI).
         *
         * When ARMED to a different cpu the stop must be processed on that
         * cpu.  Issue the IPI and wait for completion.  We have already
         * incremented the IPI count.
         */
        tgd = globaldata_find(cpuid);
        lwkt_send_ipiq3(tgd, callout_stop_ipi, c, issync);

        for (;;) {
                int flags;
                int nflags;

                flags = c->c_flags;
                cpu_ccfence();
                if ((flags & CALLOUT_IPI_MASK) == 0)    /* fast path */
                        break;
                nflags = flags | CALLOUT_WAITING;
                tsleep_interlock(c, 0);
                if (atomic_cmpset_int(&c->c_flags, flags, nflags)) {
                        tsleep(c, PINTERLOCKED, "cstp1", 0);
                }
        }

skip_slow:

        /*
         * If (issync) we must also wait for any in-progress callbacks to
         * complete, unless the stop is being executed from the callback
         * itself.  The EXECUTED flag is set prior to the callback
         * being made so our existing flags status already has it.
         *
         * If auto-lock mode is being used, this is where we cancel any
         * blocked lock that is potentially preventing the target cpu
         * from completing the callback.
         */
        while (issync) {
                intptr_t *runp;
                intptr_t runco;

                sc = &softclock_pcpu_ary[cpuid];
                if (gd->gd_curthread == sc->thread)     /* stop from cb */
                        break;
                runp = &sc->running;
                runco = *runp;
                cpu_ccfence();
                if ((runco & ~(intptr_t)1) != (intptr_t)c)
                        break;
                if (c->c_flags & CALLOUT_AUTOLOCK)
                        lockmgr(c->c_lk, LK_CANCEL_BEG);
                tsleep_interlock(c, 0);
                if (atomic_cmpset_long(runp, runco, runco | 1))
                        tsleep(c, PINTERLOCKED, "cstp3", 0);
                if (c->c_flags & CALLOUT_AUTOLOCK)
                        lockmgr(c->c_lk, LK_CANCEL_END);
        }

        crit_exit_gd(gd);
        rc = (flags & CALLOUT_EXECUTED) != 0;

        return rc;
}

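/*
 * Usage sketch (illustrative only): a detach/teardown path typically wants
 * the synchronous form so the callback cannot still be running once the
 * enclosing structure is freed.  sc and M_DEVBUF stand in for a hypothetical
 * consumer's softc and malloc type.
 *
 *      callout_stop_sync(&sc->timer);
 *      kfree(sc, M_DEVBUF);
 *
 * callout_stop() may be used instead when the caller only needs to
 * deschedule a pending timeout and can tolerate a callback that is
 * already in progress.
 */
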
static
void
callout_stop_ipi(void *arg, int issync, struct intrframe *frame)
{
        globaldata_t gd = mycpu;
        struct callout *c = arg;
        softclock_pcpu_t sc;

        /*
         * Only the fast path can run in an IPI.  Chain the stop request
         * if we are racing cpu changes.
         */
        for (;;) {
                globaldata_t tgd;
                int flags;
                int nflags;
                int cpuid;

                flags = c->c_flags;
                cpu_ccfence();

                /*
                 * Can't handle an armed callout in the fast path if it is
                 * not on the current cpu.  We must atomically increment the
                 * IPI count and break out of the fast path.
                 *
                 * If called from an IPI we chain the IPI instead.
                 */
                if (flags & CALLOUT_ARMED) {
                        cpuid = CALLOUT_FLAGS_TO_CPU(flags);
                        if (gd->gd_cpuid != cpuid) {
                                tgd = globaldata_find(cpuid);
                                lwkt_send_ipiq3(tgd, callout_stop_ipi,
                                                c, issync);
                                break;
                        }
                }

                /*
                 * NOTE: As an IPI ourselves we cannot wait for other IPIs
                 *       to complete, and we are being executed in-order.
                 */

                /*
                 * Transition to the stopped state, recover the EXECUTED
                 * status, decrement the IPI count.  If pending we cannot
                 * clear ARMED until after we have removed (c) from the
                 * callwheel, and only if there are no more IPIs pending.
                 */
                nflags = flags & ~(CALLOUT_ACTIVE | CALLOUT_PENDING);
                nflags = nflags - 1;                    /* dec ipi count */
                if ((flags & (CALLOUT_IPI_MASK | CALLOUT_PENDING)) == 1)
                        nflags &= ~CALLOUT_ARMED;
                if ((flags & CALLOUT_IPI_MASK) == 1)
                        nflags &= ~(CALLOUT_WAITING | CALLOUT_EXECUTED);

                if (atomic_cmpset_int(&c->c_flags, flags, nflags)) {
                        /*
                         * Can only remove from callwheel if currently
                         * pending.
                         */
                        if (flags & CALLOUT_PENDING) {
                                sc = &softclock_pcpu_ary[gd->gd_cpuid];
                                if (sc->next == c)
                                        sc->next = TAILQ_NEXT(c, c_links.tqe);
                                TAILQ_REMOVE(
                                        &sc->callwheel[c->c_time & cwheelmask],
                                        c,
                                        c_links.tqe);
                                c->c_func = NULL;

                                /*
                                 * NOTE: Can't clear ARMED until we have
                                 *       physically removed (c) from the
                                 *       callwheel.
                                 *
                                 * NOTE: WAITING bit race exists when doing
                                 *       unconditional bit clears.
                                 */
                                callout_maybe_clear_armed(c);
                                if (c->c_flags & CALLOUT_WAITING)
                                        flags |= CALLOUT_WAITING;
                        }

                        /*
                         * ARMED has been cleared at this point and (c)
                         * might now be stale.  Only good for wakeup()s.
                         */
                        if (flags & CALLOUT_WAITING)
                                wakeup(c);
                        break;
                }
                /* retry */
        }
}

int
callout_stop(struct callout *c)
{
        return _callout_stop(c, 0);
}

int
callout_stop_sync(struct callout *c)
{
        return _callout_stop(c, 1);
}

void
callout_stop_async(struct callout *c)
{
        _callout_stop(c, 0);
}

void
callout_terminate(struct callout *c)
{
        _callout_stop(c, 1);
        atomic_clear_int(&c->c_flags, CALLOUT_DID_INIT);
}

/*
 * Prepare a callout structure for use by callout_reset() and/or
 * callout_stop().
 *
 * The MP version of this routine requires that the callback
 * function installed by callout_reset() be MP safe.
 *
 * The LK version of this routine is also MPsafe and will automatically
 * acquire the specified lock for the duration of the function call,
 * and release it after the function returns.  In addition, when autolocking
 * is used, callout_stop() becomes synchronous if the caller owns the lock.
 * callout_reset(), callout_stop(), and callout_stop_sync() will block
 * normally instead of spinning when a cpu race occurs.  Lock cancelation
 * is used to avoid deadlocks against the callout ring dispatch.
 *
 * The init functions can be called from any cpu and do not have to be
 * called from the cpu that the timer will eventually run on.
 */
static __inline
void
_callout_init(struct callout *c, int flags)
{
        bzero(c, sizeof *c);
        c->c_flags = flags;
}

void
callout_init(struct callout *c)
{
        _callout_init(c, CALLOUT_DID_INIT);
}

void
callout_init_mp(struct callout *c)
{
        _callout_init(c, CALLOUT_DID_INIT | CALLOUT_MPSAFE);
}

void
callout_init_lk(struct callout *c, struct lock *lk)
{
        _callout_init(c, CALLOUT_DID_INIT | CALLOUT_MPSAFE | CALLOUT_AUTOLOCK);
        c->c_lk = lk;
}
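
/*
 * Usage sketch (illustrative only): a consumer that wants its callback
 * serialized against the rest of its code via a lockmgr lock.  The names
 * sc->lk, sc->timer and mydev_tick are hypothetical.
 *
 *      lockinit(&sc->lk, "mydev", 0, 0);
 *      callout_init_lk(&sc->timer, &sc->lk);
 *      callout_reset(&sc->timer, hz, mydev_tick, sc);
 *
 * The callwheel thread then acquires sc->lk around mydev_tick(), and a
 * callout_stop()/callout_stop_sync() issued while sc->lk is held relies on
 * the lock cancelation (LK_CANCELABLE) path above to avoid deadlock.
 */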