// SPDX-License-Identifier: GPL-2.0-only
/*
 * Generic helpers for SMP IPI calls
 *
 * (C) Jens Axboe <jens.axboe@oracle.com> 2008
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/irq_work.h>
#include <linux/rcupdate.h>
#include <linux/rculist.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/gfp.h>
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/sched.h>
#include <linux/sched/idle.h>
#include <linux/hypervisor.h>
#include <linux/sched/clock.h>
#include <linux/nmi.h>
#include <linux/sched/debug.h>
#include <linux/jump_label.h>
#include <linux/string_choices.h>

#include <trace/events/ipi.h>
#define CREATE_TRACE_POINTS
#include <trace/events/csd.h>
#undef CREATE_TRACE_POINTS

#include "smpboot.h"
#include "sched/smp.h"

#define CSD_TYPE(_csd)	((_csd)->node.u_flags & CSD_FLAG_TYPE_MASK)
struct call_function_data {
	call_single_data_t	__percpu *csd;
	cpumask_var_t		cpumask;
	cpumask_var_t		cpumask_ipi;
};

static DEFINE_PER_CPU_ALIGNED(struct call_function_data, cfd_data);

static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);

static DEFINE_PER_CPU(atomic_t, trigger_backtrace) = ATOMIC_INIT(1);

static void __flush_smp_call_function_queue(bool warn_cpu_offline);
int smpcfd_prepare_cpu(unsigned int cpu)
{
	struct call_function_data *cfd = &per_cpu(cfd_data, cpu);

	if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
				     cpu_to_node(cpu)))
		return -ENOMEM;
	if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL,
				     cpu_to_node(cpu))) {
		free_cpumask_var(cfd->cpumask);
		return -ENOMEM;
	}
	cfd->csd = alloc_percpu(call_single_data_t);
	if (!cfd->csd) {
		free_cpumask_var(cfd->cpumask);
		free_cpumask_var(cfd->cpumask_ipi);
		return -ENOMEM;
	}

	return 0;
}
int smpcfd_dead_cpu(unsigned int cpu)
{
	struct call_function_data *cfd = &per_cpu(cfd_data, cpu);

	free_cpumask_var(cfd->cpumask);
	free_cpumask_var(cfd->cpumask_ipi);
	free_percpu(cfd->csd);
	return 0;
}
int smpcfd_dying_cpu(unsigned int cpu)
{
	/*
	 * The IPIs for the smp-call-function callbacks queued by other
	 * CPUs might arrive late, either due to hardware latencies or
	 * because this CPU disabled interrupts (inside stop-machine)
	 * before the IPIs were sent. So flush out any pending callbacks
	 * explicitly (without waiting for the IPIs to arrive), to
	 * ensure that the outgoing CPU doesn't go offline with work
	 * still pending.
	 */
	__flush_smp_call_function_queue(false);
	irq_work_run();
	return 0;
}
void __init call_function_init(void)
{
	int i;

	for_each_possible_cpu(i)
		init_llist_head(&per_cpu(call_single_queue, i));

	smpcfd_prepare_cpu(smp_processor_id());
}
static __always_inline void
send_call_function_single_ipi(int cpu)
{
	if (call_function_single_prep_ipi(cpu)) {
		trace_ipi_send_cpu(cpu, _RET_IP_,
				   generic_smp_call_function_single_interrupt);
		arch_send_call_function_single_ipi(cpu);
	}
}

static __always_inline void
send_call_function_ipi_mask(struct cpumask *mask)
{
	trace_ipi_send_cpumask(mask, _RET_IP_,
			       generic_smp_call_function_single_interrupt);
	arch_send_call_function_ipi_mask(mask);
}

static __always_inline void
csd_do_func(smp_call_func_t func, void *info, call_single_data_t *csd)
{
	trace_csd_function_entry(func, csd);
	func(info);
	trace_csd_function_exit(func, csd);
}
#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG

static DEFINE_STATIC_KEY_MAYBE(CONFIG_CSD_LOCK_WAIT_DEBUG_DEFAULT, csdlock_debug_enabled);

/*
 * Parse the csdlock_debug= kernel boot parameter.
 *
 * If you need to restore the old "ext" value that once provided
 * additional debugging information, reapply the following commits:
 *
 * de7b09ef658d ("locking/csd_lock: Prepare more CSD lock debugging")
 * a5aabace5fb8 ("locking/csd_lock: Add more data to CSD lock debugging")
 */
static int __init csdlock_debug(char *str)
{
	int ret;
	unsigned int val = 0;

	ret = get_option(&str, &val);
	if (ret) {
		if (val)
			static_branch_enable(&csdlock_debug_enabled);
		else
			static_branch_disable(&csdlock_debug_enabled);
	}

	return 1;
}
__setup("csdlock_debug=", csdlock_debug);
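
/*
 * Example (illustrative only): booting with "csdlock_debug=1" on the
 * kernel command line enables the static branch above, and hence the
 * CSD lock wait diagnostics below; "csdlock_debug=0" keeps them off
 * even when CONFIG_CSD_LOCK_WAIT_DEBUG_DEFAULT makes them the default.
 */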
static DEFINE_PER_CPU(call_single_data_t *, cur_csd);
static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func);
static DEFINE_PER_CPU(void *, cur_csd_info);

static ulong csd_lock_timeout = 5000;  /* CSD lock timeout in milliseconds. */
module_param(csd_lock_timeout, ulong, 0444);
static int panic_on_ipistall;  /* CSD panic timeout in milliseconds, 300000 for five minutes. */
module_param(panic_on_ipistall, int, 0444);

static atomic_t csd_bug_count = ATOMIC_INIT(0);
/* Record current CSD work for current CPU, NULL to erase. */
static void __csd_lock_record(call_single_data_t *csd)
{
	if (!csd) {
		smp_mb(); /* NULL cur_csd after unlock. */
		__this_cpu_write(cur_csd, NULL);
		return;
	}
	__this_cpu_write(cur_csd_func, csd->func);
	__this_cpu_write(cur_csd_info, csd->info);
	smp_wmb(); /* func and info before csd. */
	__this_cpu_write(cur_csd, csd);
	smp_mb(); /* Update cur_csd before function call. */
		  /* Or before unlock, as the case may be. */
}

static __always_inline void csd_lock_record(call_single_data_t *csd)
{
	if (static_branch_unlikely(&csdlock_debug_enabled))
		__csd_lock_record(csd);
}
static int csd_lock_wait_getcpu(call_single_data_t *csd)
{
	unsigned int csd_type;

	csd_type = CSD_TYPE(csd);
	if (csd_type == CSD_TYPE_ASYNC || csd_type == CSD_TYPE_SYNC)
		return csd->node.dst; /* Other CSD_TYPE_ values might not have ->dst. */
	return -1;
}
/*
 * Complain if too much time spent waiting. Note that only
 * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU,
 * so waiting on other types gets much less information.
 */
static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id)
{
	int cpu = -1;
	int cpux;
	bool firsttime;
	u64 ts2, ts_delta;
	call_single_data_t *cpu_cur_csd;
	unsigned int flags = READ_ONCE(csd->node.u_flags);
	unsigned long long csd_lock_timeout_ns = csd_lock_timeout * NSEC_PER_MSEC;

	if (!(flags & CSD_FLAG_LOCK)) {
		if (!unlikely(*bug_id))
			return true;
		cpu = csd_lock_wait_getcpu(csd);
		pr_alert("csd: CSD lock (#%d) got unstuck on CPU#%02d, CPU#%02d released the lock.\n",
			 *bug_id, raw_smp_processor_id(), cpu);
		return true;
	}

	ts2 = sched_clock();
	/* How long since we last checked for a stuck CSD lock. */
	ts_delta = ts2 - *ts1;
	if (likely(ts_delta <= csd_lock_timeout_ns || csd_lock_timeout_ns == 0))
		return false;

	firsttime = !*bug_id;
	if (firsttime)
		*bug_id = atomic_inc_return(&csd_bug_count);
	cpu = csd_lock_wait_getcpu(csd);
	if (WARN_ONCE(cpu < 0 || cpu >= nr_cpu_ids, "%s: cpu = %d\n", __func__, cpu))
		cpux = 0;
	else
		cpux = cpu;
	cpu_cur_csd = smp_load_acquire(&per_cpu(cur_csd, cpux)); /* Before func and info. */
	/* How long since this CSD lock was stuck. */
	ts_delta = ts2 - ts0;
	pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %llu ns for CPU#%02d %pS(%ps).\n",
		 firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), ts_delta,
		 cpu, csd->func, csd->info);
	/*
	 * If the CSD lock is still stuck after 5 minutes, it is unlikely
	 * to become unstuck. Use a signed comparison to avoid triggering
	 * on underflows when the TSC is out of sync between sockets.
	 */
	BUG_ON(panic_on_ipistall > 0 && (s64)ts_delta > ((s64)panic_on_ipistall * NSEC_PER_MSEC));
	if (cpu_cur_csd && csd != cpu_cur_csd) {
		pr_alert("\tcsd: CSD lock (#%d) handling prior %pS(%ps) request.\n",
			 *bug_id, READ_ONCE(per_cpu(cur_csd_func, cpux)),
			 READ_ONCE(per_cpu(cur_csd_info, cpux)));
	} else {
		pr_alert("\tcsd: CSD lock (#%d) %s.\n",
			 *bug_id, !cpu_cur_csd ? "unresponsive" : "handling this request");
	}
	if (cpu >= 0) {
		if (atomic_cmpxchg_acquire(&per_cpu(trigger_backtrace, cpu), 1, 0))
			dump_cpu_task(cpu);
		if (!cpu_cur_csd) {
			pr_alert("csd: Re-sending CSD lock (#%d) IPI from CPU#%02d to CPU#%02d\n", *bug_id, raw_smp_processor_id(), cpu);
			arch_send_call_function_single_ipi(cpu);
		}
	}
	if (firsttime)
		dump_stack();
	*ts1 = ts2;

	return false;
}
/*
 * csd_lock/csd_unlock used to serialize access to per-cpu csd resources
 *
 * For non-synchronous ipi calls the csd can still be in use by the
 * previous function call. For multi-cpu calls it's even more interesting
 * as we'll have to ensure no other cpu is observing our csd.
 */
static void __csd_lock_wait(call_single_data_t *csd)
{
	int bug_id = 0;
	u64 ts0, ts1;

	ts1 = ts0 = sched_clock();
	for (;;) {
		if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id))
			break;
		cpu_relax();
	}
	smp_acquire__after_ctrl_dep();
}

static __always_inline void csd_lock_wait(call_single_data_t *csd)
{
	if (static_branch_unlikely(&csdlock_debug_enabled)) {
		__csd_lock_wait(csd);
		return;
	}

	smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK));
}
#else
static void csd_lock_record(call_single_data_t *csd)
{
}

static __always_inline void csd_lock_wait(call_single_data_t *csd)
{
	smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK));
}
#endif
static __always_inline void csd_lock(call_single_data_t *csd)
{
	csd_lock_wait(csd);
	csd->node.u_flags |= CSD_FLAG_LOCK;

	/*
	 * prevent CPU from reordering the above assignment
	 * to ->flags with any subsequent assignments to other
	 * fields of the specified call_single_data_t structure:
	 */
	smp_wmb();
}

static __always_inline void csd_unlock(call_single_data_t *csd)
{
	WARN_ON(!(csd->node.u_flags & CSD_FLAG_LOCK));

	/*
	 * ensure we're all done before releasing data:
	 */
	smp_store_release(&csd->node.u_flags, 0);
}

static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data);
void __smp_call_single_queue(int cpu, struct llist_node *node)
{
	/*
	 * We have to check the type of the CSD before queueing it, because
	 * once queued it can have its flags cleared by
	 *   flush_smp_call_function_queue()
	 * even if we haven't sent the smp_call IPI yet (e.g. the stopper
	 * executes migration_cpu_stop() on the remote CPU).
	 */
	if (trace_csd_queue_cpu_enabled()) {
		call_single_data_t *csd;
		smp_call_func_t func;

		csd = container_of(node, call_single_data_t, node.llist);
		func = CSD_TYPE(csd) == CSD_TYPE_TTWU ?
			sched_ttwu_pending : csd->func;

		trace_csd_queue_cpu(cpu, _RET_IP_, func, csd);
	}

	/*
	 * The list addition should be visible to the target CPU when it pops
	 * the head of the list to pull the entry off it in the IPI handler
	 * because of normal cache coherency rules implied by the underlying
	 * llist ops.
	 *
	 * If IPIs can go out of order to the cache coherency protocol
	 * in an architecture, sufficient synchronisation should be added
	 * to arch code to make it appear to obey cache coherency WRT
	 * locking and barrier primitives. Generic code isn't really
	 * equipped to do the right thing...
	 */
	if (llist_add(node, &per_cpu(call_single_queue, cpu)))
		send_call_function_single_ipi(cpu);
}
/*
 * Insert a previously allocated call_single_data_t element
 * for execution on the given CPU. data must already have
 * ->func, ->info, and ->flags set.
 */
static int generic_exec_single(int cpu, call_single_data_t *csd)
{
	if (cpu == smp_processor_id()) {
		smp_call_func_t func = csd->func;
		void *info = csd->info;
		unsigned long flags;

		/*
		 * We can unlock early even for the synchronous on-stack case,
		 * since we're doing this from the same CPU..
		 */
		csd_lock_record(csd);
		csd_unlock(csd);
		local_irq_save(flags);
		csd_do_func(func, info, NULL);
		csd_lock_record(NULL);
		local_irq_restore(flags);
		return 0;
	}

	if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) {
		csd_unlock(csd);
		return -ENXIO;
	}

	__smp_call_single_queue(cpu, &csd->node.llist);

	return 0;
}
/**
 * generic_smp_call_function_single_interrupt - Execute SMP IPI callbacks
 *
 * Invoked by arch to handle an IPI for call function single.
 * Must be called with interrupts disabled.
 */
void generic_smp_call_function_single_interrupt(void)
{
	__flush_smp_call_function_queue(true);
}
/**
 * __flush_smp_call_function_queue - Flush pending smp-call-function callbacks
 *
 * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an
 *		      offline CPU. Skip this check if set to 'false'.
 *
 * Flush any pending smp-call-function callbacks queued on this CPU. This is
 * invoked by the generic IPI handler, as well as by a CPU about to go offline,
 * to ensure that all pending IPI callbacks are run before it goes completely
 * offline.
 *
 * Loop through the call_single_queue and run all the queued callbacks.
 * Must be called with interrupts disabled.
 */
static void __flush_smp_call_function_queue(bool warn_cpu_offline)
{
	call_single_data_t *csd, *csd_next;
	struct llist_node *entry, *prev;
	struct llist_head *head;
	static bool warned;
	atomic_t *tbt;

	lockdep_assert_irqs_disabled();

	/* Allow waiters to send backtrace NMI from here onwards */
	tbt = this_cpu_ptr(&trigger_backtrace);
	atomic_set_release(tbt, 1);

	head = this_cpu_ptr(&call_single_queue);
	entry = llist_del_all(head);
	entry = llist_reverse_order(entry);

	/* There shouldn't be any pending callbacks on an offline CPU. */
	if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) &&
		     !warned && entry != NULL)) {
		warned = true;
		WARN(1, "IPI on offline CPU %d\n", smp_processor_id());

		/*
		 * We don't have to use the _safe() variant here
		 * because we are not invoking the IPI handlers yet.
		 */
		llist_for_each_entry(csd, entry, node.llist) {
			switch (CSD_TYPE(csd)) {
			case CSD_TYPE_ASYNC:
			case CSD_TYPE_SYNC:
			case CSD_TYPE_IRQ_WORK:
				pr_warn("IPI callback %pS sent to offline CPU\n",
					csd->func);
				break;

			case CSD_TYPE_TTWU:
				pr_warn("IPI task-wakeup sent to offline CPU\n");
				break;

			default:
				pr_warn("IPI callback, unknown type %d, sent to offline CPU\n",
					CSD_TYPE(csd));
				break;
			}
		}
	}

	/*
	 * First; run all SYNC callbacks, people are waiting for us.
	 */
	prev = NULL;
	llist_for_each_entry_safe(csd, csd_next, entry, node.llist) {
		/* Do we wait until *after* callback? */
		if (CSD_TYPE(csd) == CSD_TYPE_SYNC) {
			smp_call_func_t func = csd->func;
			void *info = csd->info;

			if (prev) {
				prev->next = &csd_next->node.llist;
			} else {
				entry = &csd_next->node.llist;
			}

			csd_lock_record(csd);
			csd_do_func(func, info, csd);
			csd_unlock(csd);
			csd_lock_record(NULL);
		} else {
			prev = &csd->node.llist;
		}
	}

	if (!entry)
		return;

	/*
	 * Second; run all !SYNC callbacks.
	 */
	prev = NULL;
	llist_for_each_entry_safe(csd, csd_next, entry, node.llist) {
		int type = CSD_TYPE(csd);

		if (type != CSD_TYPE_TTWU) {
			if (prev) {
				prev->next = &csd_next->node.llist;
			} else {
				entry = &csd_next->node.llist;
			}

			if (type == CSD_TYPE_ASYNC) {
				smp_call_func_t func = csd->func;
				void *info = csd->info;

				csd_lock_record(csd);
				csd_unlock(csd);
				csd_do_func(func, info, csd);
				csd_lock_record(NULL);
			} else if (type == CSD_TYPE_IRQ_WORK) {
				irq_work_single(csd);
			}

		} else {
			prev = &csd->node.llist;
		}
	}

	/*
	 * Third; only CSD_TYPE_TTWU is left, issue those.
	 */
	if (entry) {
		csd = llist_entry(entry, typeof(*csd), node.llist);
		csd_do_func(sched_ttwu_pending, entry, csd);
	}
}
/**
 * flush_smp_call_function_queue - Flush pending smp-call-function callbacks
 *				   from task context (idle, migration thread)
 *
 * When TIF_POLLING_NRFLAG is supported and a CPU is in idle and has it
 * set, then remote CPUs can avoid sending IPIs and wake the idle CPU by
 * setting TIF_NEED_RESCHED. The idle task on the woken up CPU has to
 * handle queued SMP function calls before scheduling.
 *
 * The migration thread has to ensure that an eventually pending wakeup has
 * been handled before it migrates a task.
 */
void flush_smp_call_function_queue(void)
{
	unsigned int was_pending;
	unsigned long flags;

	if (llist_empty(this_cpu_ptr(&call_single_queue)))
		return;

	local_irq_save(flags);
	/* Get the already pending soft interrupts for RT enabled kernels */
	was_pending = local_softirq_pending();
	__flush_smp_call_function_queue(true);
	if (local_softirq_pending())
		do_softirq_post_smp_call_flush(was_pending);

	local_irq_restore(flags);
}
/*
 * smp_call_function_single - Run a function on a specific CPU
 * @cpu: The CPU to run on.
 * @func: The function to run. This must be fast and non-blocking.
 * @info: An arbitrary pointer to pass to the function.
 * @wait: If true, wait until function has completed on other CPUs.
 *
 * Returns 0 on success, else a negative status code.
 */
int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
			     int wait)
{
	call_single_data_t *csd;
	call_single_data_t csd_stack = {
		.node = { .u_flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC, },
	};
	int this_cpu;
	int err;

	/*
	 * prevent preemption and reschedule on another processor,
	 * as well as CPU removal
	 */
	this_cpu = get_cpu();

	/*
	 * Can deadlock when called with interrupts disabled.
	 * We allow CPUs that are not yet online though, as no one else can
	 * send an smp call function interrupt to this CPU and as such
	 * deadlocks can't happen.
	 */
	WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
		     && !oops_in_progress);

	/*
	 * When @wait we can deadlock when we interrupt between llist_add() and
	 * arch_send_call_function_ipi*(); when !@wait we can deadlock on
	 * csd_lock() because the interrupt context uses the same csd
	 * storage.
	 */
	WARN_ON_ONCE(!in_task());

	csd = &csd_stack;
	if (!wait) {
		csd = this_cpu_ptr(&csd_data);
		csd_lock(csd);
	}

	csd->func = func;
	csd->info = info;
#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
	csd->node.src = smp_processor_id();
	csd->node.dst = cpu;
#endif

	err = generic_exec_single(cpu, csd);

	if (wait)
		csd_lock_wait(csd);

	put_cpu();

	return err;
}
EXPORT_SYMBOL(smp_call_function_single);
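
/*
 * Illustrative usage sketch (example-only code, not part of this file;
 * the example_* names are hypothetical): synchronously run a fast,
 * non-blocking callback on one specific CPU from task context.
 */
#if 0
static void example_store_cpu(void *info)
{
	/* Runs on the target CPU, in IPI context with interrupts off. */
	*(int *)info = smp_processor_id();
}

static int example_identify(int cpu)
{
	int seen = -1;
	int err;

	/* wait=1: do not return until example_store_cpu() has run. */
	err = smp_call_function_single(cpu, example_store_cpu, &seen, 1);

	return err ? err : seen;
}
#endif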
/**
 * smp_call_function_single_async() - Run an asynchronous function on a
 *				      specific CPU.
 * @cpu: The CPU to run on.
 * @csd: Pre-allocated and setup data structure
 *
 * Like smp_call_function_single(), but the call is asynchronous and
 * can thus be done from contexts with disabled interrupts.
 *
 * The caller passes their own pre-allocated data structure
 * (ie: embedded in an object) and is responsible for synchronizing it
 * such that the IPIs performed on the @csd are strictly serialized.
 *
 * If the function is called with one csd which has not yet been
 * processed by a previous call to smp_call_function_single_async(), the
 * function will return immediately with -EBUSY showing that the csd
 * object is still in progress.
 *
 * NOTE: Be careful, there is unfortunately no current debugging facility to
 * validate the correctness of this serialization.
 *
 * Return: %0 on success or negative errno value on error
 */
int smp_call_function_single_async(int cpu, call_single_data_t *csd)
{
	int err = 0;

	preempt_disable();

	if (csd->node.u_flags & CSD_FLAG_LOCK) {
		err = -EBUSY;
		goto out;
	}

	csd->node.u_flags = CSD_FLAG_LOCK;
	smp_wmb();

	err = generic_exec_single(cpu, csd);

out:
	preempt_enable();

	return err;
}
EXPORT_SYMBOL_GPL(smp_call_function_single_async);
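
/*
 * Illustrative sketch (example-only, hypothetical names): embed a
 * call_single_data_t in an object and kick it asynchronously. The
 * caller must serialize reuse of the csd; -EBUSY means the previous
 * request has not been processed yet.
 */
#if 0
struct example_dev {
	call_single_data_t	csd;
	int			pending_events;
};

static void example_dev_ipi(void *info)
{
	struct example_dev *dev = info;

	dev->pending_events = 0;	/* runs on the remote CPU */
}

static int example_dev_kick(struct example_dev *dev, int cpu)
{
	dev->csd.func = example_dev_ipi;
	dev->csd.info = dev;

	/* Returns -EBUSY if dev->csd is still locked by a prior call. */
	return smp_call_function_single_async(cpu, &dev->csd);
}
#endif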
/*
 * smp_call_function_any - Run a function on any of the given cpus
 * @mask: The mask of cpus it can run on.
 * @func: The function to run. This must be fast and non-blocking.
 * @info: An arbitrary pointer to pass to the function.
 * @wait: If true, wait until function has completed.
 *
 * Returns 0 on success, else a negative status code (if no cpus were online).
 *
 * Selection preference:
 *	1) current cpu if in @mask
 *	2) any cpu of current node if in @mask
 *	3) any other online cpu in @mask
 */
int smp_call_function_any(const struct cpumask *mask,
			  smp_call_func_t func, void *info, int wait)
{
	unsigned int cpu;
	const struct cpumask *nodemask;
	int ret;

	/* Try for same CPU (cheapest) */
	cpu = get_cpu();
	if (cpumask_test_cpu(cpu, mask))
		goto call;

	/* Try for same node. */
	nodemask = cpumask_of_node(cpu_to_node(cpu));
	for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids;
	     cpu = cpumask_next_and(cpu, nodemask, mask)) {
		if (cpu_online(cpu))
			goto call;
	}

	/* Any online will do: smp_call_function_single handles nr_cpu_ids. */
	cpu = cpumask_any_and(mask, cpu_online_mask);
call:
	ret = smp_call_function_single(cpu, func, info, wait);
	put_cpu();
	return ret;
}
EXPORT_SYMBOL_GPL(smp_call_function_any);
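
/*
 * Illustrative sketch (example-only, hypothetical names): run a probe
 * on whichever CPU of a set is cheapest to reach, per the selection
 * preference documented above.
 */
#if 0
static void example_probe(void *info)
{
	(*(int *)info)++;
}

static int example_probe_package(const struct cpumask *pkg_mask)
{
	int count = 0;
	int ret;

	/* Prefers the current CPU, then its node, then any online CPU. */
	ret = smp_call_function_any(pkg_mask, example_probe, &count, 1);

	return ret ? ret : count;
}
#endif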
/*
 * Flags to be used as scf_flags argument of smp_call_function_many_cond().
 *
 * %SCF_WAIT:		Wait until function execution is completed
 * %SCF_RUN_LOCAL:	Run also locally if local cpu is set in cpumask
 */
#define SCF_WAIT	(1U << 0)
#define SCF_RUN_LOCAL	(1U << 1)

static void smp_call_function_many_cond(const struct cpumask *mask,
					smp_call_func_t func, void *info,
					unsigned int scf_flags,
					smp_cond_func_t cond_func)
{
	int cpu, last_cpu, this_cpu = smp_processor_id();
	struct call_function_data *cfd;
	bool wait = scf_flags & SCF_WAIT;
	int nr_cpus = 0;
	bool run_remote = false;
	bool run_local = false;

	lockdep_assert_preemption_disabled();

	/*
	 * Can deadlock when called with interrupts disabled.
	 * We allow CPUs that are not yet online though, as no one else can
	 * send an smp call function interrupt to this CPU and as such
	 * deadlocks can't happen.
	 */
	if (cpu_online(this_cpu) && !oops_in_progress &&
	    !early_boot_irqs_disabled)
		lockdep_assert_irqs_enabled();

	/*
	 * When @wait we can deadlock when we interrupt between llist_add() and
	 * arch_send_call_function_ipi*(); when !@wait we can deadlock on
	 * csd_lock() because the interrupt context uses the same csd
	 * storage.
	 */
	WARN_ON_ONCE(!in_task());

	/* Check if we need local execution. */
	if ((scf_flags & SCF_RUN_LOCAL) && cpumask_test_cpu(this_cpu, mask))
		run_local = true;

	/* Check if we need remote execution, i.e., any CPU excluding this one. */
	cpu = cpumask_first_and(mask, cpu_online_mask);
	if (cpu == this_cpu)
		cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
	if (cpu < nr_cpu_ids)
		run_remote = true;

	if (run_remote) {
		cfd = this_cpu_ptr(&cfd_data);
		cpumask_and(cfd->cpumask, mask, cpu_online_mask);
		__cpumask_clear_cpu(this_cpu, cfd->cpumask);

		cpumask_clear(cfd->cpumask_ipi);
		for_each_cpu(cpu, cfd->cpumask) {
			call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu);

			if (cond_func && !cond_func(cpu, info)) {
				__cpumask_clear_cpu(cpu, cfd->cpumask);
				continue;
			}

			csd_lock(csd);
			if (wait)
				csd->node.u_flags |= CSD_TYPE_SYNC;
			csd->func = func;
			csd->info = info;
#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
			csd->node.src = smp_processor_id();
			csd->node.dst = cpu;
#endif
			trace_csd_queue_cpu(cpu, _RET_IP_, func, csd);

			if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) {
				__cpumask_set_cpu(cpu, cfd->cpumask_ipi);
				nr_cpus++;
				last_cpu = cpu;
			}
		}

		/*
		 * Choose the most efficient way to send an IPI. Note that the
		 * number of CPUs might be zero due to concurrent changes to the
		 * provided mask.
		 */
		if (nr_cpus == 1)
			send_call_function_single_ipi(last_cpu);
		else if (likely(nr_cpus > 1))
			send_call_function_ipi_mask(cfd->cpumask_ipi);
	}

	if (run_local && (!cond_func || cond_func(this_cpu, info))) {
		unsigned long flags;

		local_irq_save(flags);
		csd_do_func(func, info, NULL);
		local_irq_restore(flags);
	}

	if (run_remote && wait) {
		for_each_cpu(cpu, cfd->cpumask) {
			call_single_data_t *csd;

			csd = per_cpu_ptr(cfd->csd, cpu);
			csd_lock_wait(csd);
		}
	}
}
/**
 * smp_call_function_many(): Run a function on a set of CPUs.
 * @mask: The set of cpus to run on (only runs on online subset).
 * @func: The function to run. This must be fast and non-blocking.
 * @info: An arbitrary pointer to pass to the function.
 * @wait: If true, wait (atomically) until function has completed on
 *        other CPUs. (The %SCF_WAIT/%SCF_RUN_LOCAL bitmask applies to
 *        smp_call_function_many_cond(); this wrapper only waits and
 *        never runs @func locally.)
 *
 * If @wait is true, then returns once @func has returned.
 *
 * You must not call this function with disabled interrupts or from a
 * hardware interrupt handler or from a bottom half handler. Preemption
 * must be disabled when calling this function.
 */
void smp_call_function_many(const struct cpumask *mask,
			    smp_call_func_t func, void *info, bool wait)
{
	smp_call_function_many_cond(mask, func, info, wait * SCF_WAIT, NULL);
}
EXPORT_SYMBOL(smp_call_function_many);
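
/*
 * Illustrative sketch (example-only, hypothetical names): invalidate a
 * per-CPU cache on every other online CPU and wait for completion.
 * Preemption must be disabled across the call, as documented above.
 */
#if 0
static void example_invalidate(void *info)
{
	/* Fast and non-blocking: runs in IPI context on remote CPUs. */
}

static void example_invalidate_others(void)
{
	preempt_disable();
	smp_call_function_many(cpu_online_mask, example_invalidate,
			       NULL, true);
	preempt_enable();
}
#endif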
/**
 * smp_call_function(): Run a function on all other CPUs.
 * @func: The function to run. This must be fast and non-blocking.
 * @info: An arbitrary pointer to pass to the function.
 * @wait: If true, wait (atomically) until function has completed
 *        on other CPUs.
 *
 * If @wait is true, then returns once @func has returned; otherwise
 * it returns just before the target cpu calls @func.
 *
 * You must not call this function with disabled interrupts or from a
 * hardware interrupt handler or from a bottom half handler.
 */
void smp_call_function(smp_call_func_t func, void *info, int wait)
{
	preempt_disable();
	smp_call_function_many(cpu_online_mask, func, info, wait);
	preempt_enable();
}
EXPORT_SYMBOL(smp_call_function);
/* Setup configured maximum number of CPUs to activate */
unsigned int setup_max_cpus = NR_CPUS;
EXPORT_SYMBOL(setup_max_cpus);

/*
 * Setup routine for controlling SMP activation
 *
 * Command-line option of "nosmp" or "maxcpus=0" will disable SMP
 * activation entirely (the MPS table probe still happens, though).
 *
 * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer
 * greater than 0, limits the maximum number of CPUs activated in
 * SMP mode to <NUM>.
 */
void __weak __init arch_disable_smp_support(void) { }

static int __init nosmp(char *str)
{
	setup_max_cpus = 0;
	arch_disable_smp_support();

	return 0;
}

early_param("nosmp", nosmp);

/* this is hard limit */
static int __init nrcpus(char *str)
{
	int nr_cpus;

	if (get_option(&str, &nr_cpus) && nr_cpus > 0 && nr_cpus < nr_cpu_ids)
		set_nr_cpu_ids(nr_cpus);

	return 0;
}

early_param("nr_cpus", nrcpus);

static int __init maxcpus(char *str)
{
	get_option(&str, &setup_max_cpus);
	if (setup_max_cpus == 0)
		arch_disable_smp_support();

	return 0;
}

early_param("maxcpus", maxcpus);
#if (NR_CPUS > 1) && !defined(CONFIG_FORCE_NR_CPUS)
/* Setup number of possible processor ids */
unsigned int nr_cpu_ids __read_mostly = NR_CPUS;
EXPORT_SYMBOL(nr_cpu_ids);
#endif

/* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */
void __init setup_nr_cpu_ids(void)
{
	set_nr_cpu_ids(find_last_bit(cpumask_bits(cpu_possible_mask), NR_CPUS) + 1);
}
/* Called by boot processor to activate the rest. */
void __init smp_init(void)
{
	int num_nodes, num_cpus;

	idle_threads_init();
	cpuhp_threads_init();

	pr_info("Bringing up secondary CPUs ...\n");

	bringup_nonboot_cpus(setup_max_cpus);

	num_nodes = num_online_nodes();
	num_cpus  = num_online_cpus();
	pr_info("Brought up %d node%s, %d CPU%s\n",
		num_nodes, str_plural(num_nodes), num_cpus, str_plural(num_cpus));

	/* Any cleanup work */
	smp_cpus_done(setup_max_cpus);
}
/**
 * on_each_cpu_cond_mask(): Call a function on each processor for which
 * the supplied function cond_func returns true, optionally waiting
 * for all the required CPUs to finish. This may include the local
 * processor.
 * @cond_func:	A callback function that is passed a cpu id and
 *		the info parameter. The function is called
 *		with preemption disabled. The function should
 *		return a boolean value indicating whether to IPI
 *		the specified CPU.
 * @func:	The function to run on all applicable CPUs.
 *		This must be fast and non-blocking.
 * @info:	An arbitrary pointer to pass to both functions.
 * @wait:	If true, wait (atomically) until function has
 *		completed on other CPUs.
 * @mask:	The set of CPUs to consider.
 *
 * Preemption is disabled to protect against CPUs going offline but not online.
 * CPUs going online during the call will not be seen or sent an IPI.
 *
 * You must not call this function with disabled interrupts or
 * from a hardware interrupt handler or from a bottom half handler.
 */
void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
			   void *info, bool wait, const struct cpumask *mask)
{
	unsigned int scf_flags = SCF_RUN_LOCAL;

	if (wait)
		scf_flags |= SCF_WAIT;

	preempt_disable();
	smp_call_function_many_cond(mask, func, info, scf_flags, cond_func);
	preempt_enable();
}
EXPORT_SYMBOL(on_each_cpu_cond_mask);
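
/*
 * Illustrative sketch (example-only, hypothetical names): IPI only the
 * CPUs whose per-CPU backlog is non-empty, also running the callback
 * locally if this CPU qualifies.
 */
#if 0
static DEFINE_PER_CPU(int, example_backlog);

static bool example_has_backlog(int cpu, void *info)
{
	/* Called with preemption disabled; decides whether to IPI @cpu. */
	return per_cpu(example_backlog, cpu) > 0;
}

static void example_drain(void *info)
{
	this_cpu_write(example_backlog, 0);
}

static void example_drain_busy_cpus(void)
{
	on_each_cpu_cond_mask(example_has_backlog, example_drain,
			      NULL, true, cpu_online_mask);
}
#endif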
static void do_nothing(void *unused)
{
}

/**
 * kick_all_cpus_sync - Force all cpus out of idle
 *
 * Used to synchronize the update of pm_idle function pointer. It's
 * called after the pointer is updated and returns after the dummy
 * callback function has been executed on all cpus. The execution of
 * the function can only happen on the remote cpus after they have
 * left the idle function which had been called via pm_idle function
 * pointer. So it's guaranteed that nothing uses the previous pointer
 * anymore.
 */
void kick_all_cpus_sync(void)
{
	/* Make sure the change is visible before we kick the cpus */
	smp_mb();
	smp_call_function(do_nothing, NULL, 1);
}
EXPORT_SYMBOL_GPL(kick_all_cpus_sync);
/**
 * wake_up_all_idle_cpus - break all cpus out of idle
 *
 * Tries to break all online cpus out of idle, including cpus that are
 * idle-polling; cpus that are not idle are left alone.
 */
void wake_up_all_idle_cpus(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		preempt_disable();
		if (cpu != smp_processor_id() && cpu_online(cpu))
			wake_up_if_idle(cpu);
		preempt_enable();
	}
}
EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus);
/**
 * struct smp_call_on_cpu_struct - Call a function on a specific CPU
 * @work: &work_struct
 * @done: &completion to signal
 * @func: function to call
 * @data: function's data argument
 * @ret: return value from @func
 * @cpu: target CPU (%-1 for any CPU)
 *
 * Used to call a function on a specific cpu and wait for it to return.
 * Optionally make sure the call is done on a specified physical cpu via vcpu
 * pinning in order to support virtualized environments.
 */
struct smp_call_on_cpu_struct {
	struct work_struct	work;
	struct completion	done;
	int			(*func)(void *);
	void			*data;
	int			ret;
	int			cpu;
};

static void smp_call_on_cpu_callback(struct work_struct *work)
{
	struct smp_call_on_cpu_struct *sscs;

	sscs = container_of(work, struct smp_call_on_cpu_struct, work);
	if (sscs->cpu >= 0)
		hypervisor_pin_vcpu(sscs->cpu);
	sscs->ret = sscs->func(sscs->data);
	if (sscs->cpu >= 0)
		hypervisor_pin_vcpu(-1);

	complete(&sscs->done);
}

int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys)
{
	struct smp_call_on_cpu_struct sscs = {
		.done = COMPLETION_INITIALIZER_ONSTACK(sscs.done),
		.func = func,
		.data = par,
		.cpu  = phys ? cpu : -1,
	};

	INIT_WORK_ONSTACK(&sscs.work, smp_call_on_cpu_callback);

	if (cpu >= nr_cpu_ids || !cpu_online(cpu))
		return -ENXIO;

	queue_work_on(cpu, system_wq, &sscs.work);
	wait_for_completion(&sscs.done);
	destroy_work_on_stack(&sscs.work);

	return sscs.ret;
}
EXPORT_SYMBOL_GPL(smp_call_on_cpu);
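
/*
 * Illustrative sketch (example-only, hypothetical names): unlike the
 * IPI-based helpers above, smp_call_on_cpu() runs @func from a
 * workqueue worker bound to the target CPU, so @func may sleep. With
 * @phys set, the call is also pinned to the physical CPU on
 * hypervisors that support vCPU pinning.
 */
#if 0
static int example_read_sensor(void *data)
{
	/* May sleep: runs in process context on the target CPU. */
	return 42;
}

static int example_read_sensor_on(unsigned int cpu)
{
	return smp_call_on_cpu(cpu, example_read_sensor, NULL, false);
}
#endif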