kernel/sched.c

   1 /*
   2  *  kernel/sched.c
   3  *
   4  *  Kernel scheduler and related syscalls
   5  *
   6  *  Copyright (C) 1991-2002  Linus Torvalds
   7  *
   8  *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
   9  *              make semaphores SMP safe
  10  *  1998-11-19  Implemented schedule_timeout() and related stuff
  11  *              by Andrea Arcangeli
  12  *  2002-01-04  New ultra-scalable O(1) scheduler by Ingo Molnar:
  13  *              hybrid priority-list and round-robin design with
  14  *              an array-switch method of distributing timeslices
  15  *              and per-CPU runqueues.  Cleanups and useful suggestions
  16  *              by Davide Libenzi, preemptible kernel bits by Robert Love.
  17  *  2003-09-03  Interactivity tuning by Con Kolivas.
  18  *  2004-04-02  Scheduler domains code by Nick Piggin
  19  *  2007-04-15  Work begun on replacing all interactivity tuning with a
  20  *              fair scheduling design by Con Kolivas.
  21  *  2007-05-05  Load balancing (smp-nice) and other improvements
  22  *              by Peter Williams
  23  *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith
  24  *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
  25  */
  26
  27 #include <linux/mm.h>
  28 #include <linux/module.h>
  29 #include <linux/nmi.h>
  30 #include <linux/init.h>
  31 #include <linux/uaccess.h>
  32 #include <linux/highmem.h>
  33 #include <linux/smp_lock.h>
  34 #include <asm/mmu_context.h>
  35 #include <linux/interrupt.h>
  36 #include <linux/capability.h>
  37 #include <linux/completion.h>
  38 #include <linux/kernel_stat.h>
  39 #include <linux/debug_locks.h>
  40 #include <linux/security.h>
  41 #include <linux/notifier.h>
  42 #include <linux/profile.h>
  43 #include <linux/freezer.h>
  44 #include <linux/vmalloc.h>
  45 #include <linux/blkdev.h>
  46 #include <linux/delay.h>
  47 #include <linux/smp.h>
  48 #include <linux/threads.h>
  49 #include <linux/timer.h>
  50 #include <linux/rcupdate.h>
  51 #include <linux/cpu.h>
  52 #include <linux/cpuset.h>
  53 #include <linux/percpu.h>
  54 #include <linux/kthread.h>
  55 #include <linux/seq_file.h>
  56 #include <linux/syscalls.h>
  57 #include <linux/times.h>
  58 #include <linux/tsacct_kern.h>
  59 #include <linux/kprobes.h>
  60 #include <linux/delayacct.h>
  61 #include <linux/reciprocal_div.h>
  62 #include <linux/unistd.h>
  63
  64 #include <asm/tlb.h>
  65
  66 /*
  67  * Scheduler clock - returns current time in nanosec units.
  68  * This is default implementation.
  69  * Architectures and sub-architectures can override this.
  70  */
  71 unsigned long long __attribute__((weak)) sched_clock(void)
  72 {
  73         return (unsigned long long)jiffies * (1000000000 / HZ);
  74 }
  75
  76 /*
  77  * Convert user-nice values [ -20 ... 0 ... 19 ]
  78  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
  79  * and back.
  80  */
  81 #define NICE_TO_PRIO(nice)      (MAX_RT_PRIO + (nice) + 20)
  82 #define PRIO_TO_NICE(prio)      ((prio) - MAX_RT_PRIO - 20)
  83 #define TASK_NICE(p)            PRIO_TO_NICE((p)->static_prio)
  84
  85 /*
  86  * 'User priority' is the nice value converted to something we
  87  * can work with better when scaling various scheduler parameters,
  88  * it's a [ 0 ... 39 ] range.
  89  */
  90 #define USER_PRIO(p)            ((p)-MAX_RT_PRIO)
  91 #define TASK_USER_PRIO(p)       USER_PRIO((p)->static_prio)
  92 #define MAX_USER_PRIO           (USER_PRIO(MAX_PRIO))
  93
  94 /*
  95  * Some helpers for converting nanosecond timing to jiffy resolution
  96  */
  97 #define NS_TO_JIFFIES(TIME)     ((TIME) / (1000000000 / HZ))
  98 #define JIFFIES_TO_NS(TIME)     ((TIME) * (1000000000 / HZ))
  99
 100 #define NICE_0_LOAD             SCHED_LOAD_SCALE
 101 #define NICE_0_SHIFT            SCHED_LOAD_SHIFT
 102
 103 /*
 104  * These are the 'tuning knobs' of the scheduler:
 105  *
 106  * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
 107  * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
 108  * Timeslices get refilled after they expire.
 109  */
 110 #define MIN_TIMESLICE           max(5 * HZ / 1000, 1)
 111 #define DEF_TIMESLICE           (100 * HZ / 1000)
 112
 113 #ifdef CONFIG_SMP
 114 /*
 115  * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
 116  * Since cpu_power is a 'constant', we can use a reciprocal divide.
 117  */
 118 static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
 119 {
 120         return reciprocal_divide(load, sg->reciprocal_cpu_power);
 121 }
 122
 123 /*
 124  * Each time a sched group cpu_power is changed,
 125  * we must compute its reciprocal value
 126  */
 127 static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
 128 {
 129         sg->__cpu_power += val;
 130         sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
 131 }
 132 #endif
 133
 134 #define SCALE_PRIO(x, prio) \
 135         max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
 136
 137 /*
 138  * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
 139  * to time slice values: [800ms ... 100ms ... 5ms]
 140  */
 141 static unsigned int static_prio_timeslice(int static_prio)
 142 {
 143         if (static_prio == NICE_TO_PRIO(19))
 144                 return 1;
 145
 146         if (static_prio < NICE_TO_PRIO(0))
 147                 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
 148         else
 149                 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
 150 }
 151
 152 static inline int rt_policy(int policy)
 153 {
 154         if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
 155                 return 1;
 156         return 0;
 157 }
 158
 159 static inline int task_has_rt_policy(struct task_struct *p)
 160 {
 161         return rt_policy(p->policy);
 162 }
 163
 164 /*
 165  * This is the priority-queue data structure of the RT scheduling class:
 166  */
 167 struct rt_prio_array {
 168         DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
 169         struct list_head queue[MAX_RT_PRIO];
 170 };
 171
 172 struct load_stat {
 173         struct load_weight load;
 174         u64 load_update_start, load_update_last;
 175         unsigned long delta_fair, delta_exec, delta_stat;
 176 };
 177
 178 /* CFS-related fields in a runqueue */
 179 struct cfs_rq {
 180         struct load_weight load;
 181         unsigned long nr_running;
 182
 183         s64 fair_clock;
 184         u64 exec_clock;
 185         s64 wait_runtime;
 186         u64 sleeper_bonus;
 187         unsigned long wait_runtime_overruns, wait_runtime_underruns;
 188
 189         struct rb_root tasks_timeline;
 190         struct rb_node *rb_leftmost;
 191         struct rb_node *rb_load_balance_curr;
 192 #ifdef CONFIG_FAIR_GROUP_SCHED
 193         /* 'curr' points to currently running entity on this cfs_rq.
 194          * It is set to NULL otherwise (i.e when none are currently running).
 195          */
 196         struct sched_entity *curr;
 197         struct rq *rq;  /* cpu runqueue to which this cfs_rq is attached */
 198
 199         /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
 200          * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
 201          * (like users, containers etc.)
 202          *
 203          * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
 204          * list is used during load balance.
 205          */
 206         struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
 207 #endif
 208 };
 209
 210 /* Real-Time classes' related field in a runqueue: */
 211 struct rt_rq {
 212         struct rt_prio_array active;
 213         int rt_load_balance_idx;
 214         struct list_head *rt_load_balance_head, *rt_load_balance_curr;
 215 };
 216
 217 /*
 218  * This is the main, per-CPU runqueue data structure.
 219  *
 220  * Locking rule: those places that want to lock multiple runqueues
 221  * (such as the load balancing or the thread migration code), lock
 222  * acquire operations must be ordered by ascending &runqueue.
 223  */
 224 struct rq {
 225         spinlock_t lock;        /* runqueue lock */
 226
 227         /*
 228          * nr_running and cpu_load should be in the same cacheline because
 229          * remote CPUs use both these fields when doing load calculation.
 230          */
 231         unsigned long nr_running;
 232         #define CPU_LOAD_IDX_MAX 5
 233         unsigned long cpu_load[CPU_LOAD_IDX_MAX];
 234         unsigned char idle_at_tick;
 235 #ifdef CONFIG_NO_HZ
 236         unsigned char in_nohz_recently;
 237 #endif
 238         struct load_stat ls;    /* capture load from *all* tasks on this cpu */
 239         unsigned long nr_load_updates;
 240         u64 nr_switches;
 241
 242         struct cfs_rq cfs;
 243 #ifdef CONFIG_FAIR_GROUP_SCHED
 244         struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */
 245 #endif
 246         struct rt_rq  rt;
 247
 248         /*
 249          * This is part of a global counter where only the total sum
 250          * over all CPUs matters. A task can increase this counter on
 251          * one CPU and if it got migrated afterwards it may decrease
 252          * it on another CPU. Always updated under the runqueue lock:
 253          */
 254         unsigned long nr_uninterruptible;
 255
 256         struct task_struct *curr, *idle;
 257         unsigned long next_balance;
 258         struct mm_struct *prev_mm;
 259
 260         u64 clock, prev_clock_raw;
 261         s64 clock_max_delta;
 262
 263         unsigned int clock_warps, clock_overflows;
 264         unsigned int clock_unstable_events;
 265
 266         struct sched_class *load_balance_class;
 267
 268         atomic_t nr_iowait;
 269
 270 #ifdef CONFIG_SMP
 271         struct sched_domain *sd;
 272
 273         /* For active balancing */
 274         int active_balance;
 275         int push_cpu;
 276         int cpu;                /* cpu of this runqueue */
 277
 278         struct task_struct *migration_thread;
 279         struct list_head migration_queue;
 280 #endif
 281
 282 #ifdef CONFIG_SCHEDSTATS
 283         /* latency stats */
 284         struct sched_info rq_sched_info;
 285
 286         /* sys_sched_yield() stats */
 287         unsigned long yld_exp_empty;
 288         unsigned long yld_act_empty;
 289         unsigned long yld_both_empty;
 290         unsigned long yld_cnt;
 291
 292         /* schedule() stats */
 293         unsigned long sched_switch;
 294         unsigned long sched_cnt;
 295         unsigned long sched_goidle;
 296
 297         /* try_to_wake_up() stats */
 298         unsigned long ttwu_cnt;
 299         unsigned long ttwu_local;
 300 #endif
 301         struct lock_class_key rq_lock_key;
 302 };
 303
 304 static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
 305 static DEFINE_MUTEX(sched_hotcpu_mutex);
 306
 307 static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
 308 {
 309         rq->curr->sched_class->check_preempt_curr(rq, p);
 310 }
 311
 312 static inline int cpu_of(struct rq *rq)
 313 {
 314 #ifdef CONFIG_SMP
 315         return rq->cpu;
 316 #else
 317         return 0;
 318 #endif
 319 }
 320
 321 /*
 322  * Per-runqueue clock, as finegrained as the platform can give us:
 323  */
 324 static unsigned long long __rq_clock(struct rq *rq)
 325 {
 326         u64 prev_raw = rq->prev_clock_raw;
 327         u64 now = sched_clock();
 328         s64 delta = now - prev_raw;
 329         u64 clock = rq->clock;
 330
 331         /*
 332          * Protect against sched_clock() occasionally going backwards:
 333          */
 334         if (unlikely(delta < 0)) {
 335                 clock++;
 336                 rq->clock_warps++;
 337         } else {
 338                 /*
 339                  * Catch too large forward jumps too:
 340                  */
 341                 if (unlikely(delta > 2*TICK_NSEC)) {
 342                         clock++;
 343                         rq->clock_overflows++;
 344                 } else {
 345                         if (unlikely(delta > rq->clock_max_delta))
 346                                 rq->clock_max_delta = delta;
 347                         clock += delta;
 348                 }
 349         }
 350
 351         rq->prev_clock_raw = now;
 352         rq->clock = clock;
 353
 354         return clock;
 355 }
 356
 357 static inline unsigned long long rq_clock(struct rq *rq)
 358 {
 359         int this_cpu = smp_processor_id();
 360
 361         if (this_cpu == cpu_of(rq))
 362                 return __rq_clock(rq);
 363
 364         return rq->clock;
 365 }
 366
 367 /*
 368  * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
 369  * See detach_destroy_domains: synchronize_sched for details.
 370  *
 371  * The domain tree of any CPU may only be accessed from within
 372  * preempt-disabled sections.
 373  */
 374 #define for_each_domain(cpu, __sd) \
 375         for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
 376
 377 #define cpu_rq(cpu)             (&per_cpu(runqueues, (cpu)))
 378 #define this_rq()               (&__get_cpu_var(runqueues))
 379 #define task_rq(p)              cpu_rq(task_cpu(p))
 380 #define cpu_curr(cpu)           (cpu_rq(cpu)->curr)
 381
 382 #ifdef CONFIG_FAIR_GROUP_SCHED
 383 /* Change a task's ->cfs_rq if it moves across CPUs */
 384 static inline void set_task_cfs_rq(struct task_struct *p)
 385 {
 386         p->se.cfs_rq = &task_rq(p)->cfs;
 387 }
 388 #else
 389 static inline void set_task_cfs_rq(struct task_struct *p)
 390 {
 391 }
 392 #endif
 393
 394 #ifndef prepare_arch_switch
 395 # define prepare_arch_switch(next)      do { } while (0)
 396 #endif
 397 #ifndef finish_arch_switch
 398 # define finish_arch_switch(prev)       do { } while (0)
 399 #endif
 400
 401 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
 402 static inline int task_running(struct rq *rq, struct task_struct *p)
 403 {
 404         return rq->curr == p;
 405 }
 406
 407 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
 408 {
 409 }
 410
 411 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 412 {
 413 #ifdef CONFIG_DEBUG_SPINLOCK
 414         /* this is a valid case when another task releases the spinlock */
 415         rq->lock.owner = current;
 416 #endif
 417         /*
 418          * If we are tracking spinlock dependencies then we have to
 419          * fix up the runqueue lock - which gets 'carried over' from
 420          * prev into current:
 421          */
 422         spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
 423
 424         spin_unlock_irq(&rq->lock);
 425 }
 426
 427 #else /* __ARCH_WANT_UNLOCKED_CTXSW */
 428 static inline int task_running(struct rq *rq, struct task_struct *p)
 429 {
 430 #ifdef CONFIG_SMP
 431         return p->oncpu;
 432 #else
 433         return rq->curr == p;
 434 #endif
 435 }
 436
 437 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
 438 {
 439 #ifdef CONFIG_SMP
 440         /*
 441          * We can optimise this out completely for !SMP, because the
 442          * SMP rebalancing from interrupt is the only thing that cares
 443          * here.
 444          */
 445         next->oncpu = 1;
 446 #endif
 447 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
 448         spin_unlock_irq(&rq->lock);
 449 #else
 450         spin_unlock(&rq->lock);
 451 #endif
 452 }
 453
 454 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 455 {
 456 #ifdef CONFIG_SMP
 457         /*
 458          * After ->oncpu is cleared, the task can be moved to a different CPU.
 459          * We must ensure this doesn't happen until the switch is completely
 460          * finished.
 461          */
 462         smp_wmb();
 463         prev->oncpu = 0;
 464 #endif
 465 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
 466         local_irq_enable();
 467 #endif
 468 }
 469 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
 470
 471 /*
 472  * __task_rq_lock - lock the runqueue a given task resides on.
 473  * Must be called interrupts disabled.
 474  */
 475 static inline struct rq *__task_rq_lock(struct task_struct *p)
 476         __acquires(rq->lock)
 477 {
 478         struct rq *rq;
 479
 480 repeat_lock_task:
 481         rq = task_rq(p);
 482         spin_lock(&rq->lock);
 483         if (unlikely(rq != task_rq(p))) {
 484                 spin_unlock(&rq->lock);
 485                 goto repeat_lock_task;
 486         }
 487         return rq;
 488 }
 489
 490 /*
 491  * task_rq_lock - lock the runqueue a given task resides on and disable
 492  * interrupts.  Note the ordering: we can safely lookup the task_rq without
 493  * explicitly disabling preemption.
 494  */
 495 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
 496         __acquires(rq->lock)
 497 {
 498         struct rq *rq;
 499
 500 repeat_lock_task:
 501         local_irq_save(*flags);
 502         rq = task_rq(p);
 503         spin_lock(&rq->lock);
 504         if (unlikely(rq != task_rq(p))) {
 505                 spin_unlock_irqrestore(&rq->lock, *flags);
 506                 goto repeat_lock_task;
 507         }
 508         return rq;
 509 }
 510
 511 static inline void __task_rq_unlock(struct rq *rq)
 512         __releases(rq->lock)
 513 {
 514         spin_unlock(&rq->lock);
 515 }
 516
 517 static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
 518         __releases(rq->lock)
 519 {
 520         spin_unlock_irqrestore(&rq->lock, *flags);
 521 }
 522
 523 /*
 524  * this_rq_lock - lock this runqueue and disable interrupts.
 525  */
 526 static inline struct rq *this_rq_lock(void)
 527         __acquires(rq->lock)
 528 {
 529         struct rq *rq;
 530
 531         local_irq_disable();
 532         rq = this_rq();
 533         spin_lock(&rq->lock);
 534
 535         return rq;
 536 }
 537
 538 /*
 539  * CPU frequency is/was unstable - start new by setting prev_clock_raw:
 540  */
 541 void sched_clock_unstable_event(void)
 542 {
 543         unsigned long flags;
 544         struct rq *rq;
 545
 546         rq = task_rq_lock(current, &flags);
 547         rq->prev_clock_raw = sched_clock();
 548         rq->clock_unstable_events++;
 549         task_rq_unlock(rq, &flags);
 550 }
 551
 552 /*
 553  * resched_task - mark a task 'to be rescheduled now'.
 554  *
 555  * On UP this means the setting of the need_resched flag, on SMP it
 556  * might also involve a cross-CPU call to trigger the scheduler on
 557  * the target CPU.
 558  */
 559 #ifdef CONFIG_SMP
 560
 561 #ifndef tsk_is_polling
 562 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
 563 #endif
 564
 565 static void resched_task(struct task_struct *p)
 566 {
 567         int cpu;
 568
 569         assert_spin_locked(&task_rq(p)->lock);
 570
 571         if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
 572                 return;
 573
 574         set_tsk_thread_flag(p, TIF_NEED_RESCHED);
 575
 576         cpu = task_cpu(p);
 577         if (cpu == smp_processor_id())
 578                 return;
 579
 580         /* NEED_RESCHED must be visible before we test polling */
 581         smp_mb();
 582         if (!tsk_is_polling(p))
 583                 smp_send_reschedule(cpu);
 584 }
 585
 586 static void resched_cpu(int cpu)
 587 {
 588         struct rq *rq = cpu_rq(cpu);
 589         unsigned long flags;
 590
 591         if (!spin_trylock_irqsave(&rq->lock, flags))
 592                 return;
 593         resched_task(cpu_curr(cpu));
 594         spin_unlock_irqrestore(&rq->lock, flags);
 595 }
 596 #else
 597 static inline void resched_task(struct task_struct *p)
 598 {
 599         assert_spin_locked(&task_rq(p)->lock);
 600         set_tsk_need_resched(p);
 601 }
 602 #endif
 603
 604 static u64 div64_likely32(u64 divident, unsigned long divisor)
 605 {
 606 #if BITS_PER_LONG == 32
 607         if (likely(divident <= 0xffffffffULL))
 608                 return (u32)divident / divisor;
 609         do_div(divident, divisor);
 610
 611         return divident;
 612 #else
 613         return divident / divisor;
 614 #endif
 615 }
 616
 617 #if BITS_PER_LONG == 32
 618 # define WMULT_CONST    (~0UL)
 619 #else
 620 # define WMULT_CONST    (1UL << 32)
 621 #endif
 622
 623 #define WMULT_SHIFT     32
 624
 625 static inline unsigned long
 626 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 627                 struct load_weight *lw)
 628 {
 629         u64 tmp;
 630
 631         if (unlikely(!lw->inv_weight))
 632                 lw->inv_weight = WMULT_CONST / lw->weight;
 633
 634         tmp = (u64)delta_exec * weight;
 635         /*
 636          * Check whether we'd overflow the 64-bit multiplication:
 637          */
 638         if (unlikely(tmp > WMULT_CONST)) {
 639                 tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight)
 640                                 >> (WMULT_SHIFT/2);
 641         } else {
 642                 tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;
 643         }
 644
 645         return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit);
 646 }
 647
 648 static inline unsigned long
 649 calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
 650 {
 651         return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
 652 }
 653
 654 static void update_load_add(struct load_weight *lw, unsigned long inc)
 655 {
 656         lw->weight += inc;
 657         lw->inv_weight = 0;
 658 }
 659
 660 static void update_load_sub(struct load_weight *lw, unsigned long dec)
 661 {
 662         lw->weight -= dec;
 663         lw->inv_weight = 0;
 664 }
 665
 666 static void __update_curr_load(struct rq *rq, struct load_stat *ls)
 667 {
 668         if (rq->curr != rq->idle && ls->load.weight) {
 669                 ls->delta_exec += ls->delta_stat;
 670                 ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
 671                 ls->delta_stat = 0;
 672         }
 673 }
 674
 675 /*
 676  * Update delta_exec, delta_fair fields for rq.
 677  *
 678  * delta_fair clock advances at a rate inversely proportional to
 679  * total load (rq->ls.load.weight) on the runqueue, while
 680  * delta_exec advances at the same rate as wall-clock (provided
 681  * cpu is not idle).
 682  *
 683  * delta_exec / delta_fair is a measure of the (smoothened) load on this
 684  * runqueue over any given interval. This (smoothened) load is used
 685  * during load balance.
 686  *
 687  * This function is called /before/ updating rq->ls.load
 688  * and when switching tasks.
 689  */
 690 static void update_curr_load(struct rq *rq, u64 now)
 691 {
 692         struct load_stat *ls = &rq->ls;
 693         u64 start;
 694
 695         start = ls->load_update_start;
 696         ls->load_update_start = now;
 697         ls->delta_stat += now - start;
 698         /*
 699          * Stagger updates to ls->delta_fair. Very frequent updates
 700          * can be expensive.
 701          */
 702         if (ls->delta_stat >= sysctl_sched_stat_granularity)
 703                 __update_curr_load(rq, ls);
 704 }
 705
 706 /*
 707  * To aid in avoiding the subversion of "niceness" due to uneven distribution
 708  * of tasks with abnormal "nice" values across CPUs the contribution that
 709  * each task makes to its run queue's load is weighted according to its
 710  * scheduling class and "nice" value.  For SCHED_NORMAL tasks this is just a
 711  * scaled version of the new time slice allocation that they receive on time
 712  * slice expiry etc.
 713  */
 714
 715 /*
 716  * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
 717  * If static_prio_timeslice() is ever changed to break this assumption then
 718  * this code will need modification
 719  */
 720 #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
 721 #define load_weight(lp) \
 722         (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
 723 #define PRIO_TO_LOAD_WEIGHT(prio) \
 724         load_weight(static_prio_timeslice(prio))
 725 #define RTPRIO_TO_LOAD_WEIGHT(rp) \
 726         (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + load_weight(rp))
 727
 728 #define WEIGHT_IDLEPRIO         2
 729 #define WMULT_IDLEPRIO          (1 << 31)
 730
 731 /*
 732  * Nice levels are multiplicative, with a gentle 10% change for every
 733  * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
 734  * nice 1, it will get ~10% less CPU time than another CPU-bound task
 735  * that remained on nice 0.
 736  *
 737  * The "10% effect" is relative and cumulative: from _any_ nice level,
 738  * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
 739  * it's +10% CPU usage.
 740  */
 741 static const int prio_to_weight[40] = {
 742 /* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921,
 743 /* -10 */  9537,  7629,  6103,  4883,  3906,  3125,  2500,  2000,  1600,  1280,
 744 /*   0 */  NICE_0_LOAD /* 1024 */,
 745 /*   1 */          819,   655,   524,   419,   336,   268,   215,   172,   137,
 746 /*  10 */   110,    87,    70,    56,    45,    36,    29,    23,    18,    15,
 747 };
 748
 749 static const u32 prio_to_wmult[40] = {
 750         48356,   60446,   75558,   94446,  118058,  147573,
 751         184467,  230589,  288233,  360285,  450347,
 752         562979,  703746,  879575, 1099582, 1374389,
 753         717986, 2147483, 2684354, 3355443, 4194304,
 754         244160, 6557201, 8196502, 10250518, 12782640,
 755         16025997, 19976592, 24970740, 31350126, 39045157,
 756         49367440, 61356675, 76695844, 95443717, 119304647,
 757         148102320, 186737708, 238609294, 286331153,
 758 };
 759
 760 static inline void
 761 inc_load(struct rq *rq, const struct task_struct *p, u64 now)
 762 {
 763         update_curr_load(rq, now);
 764         update_load_add(&rq->ls.load, p->se.load.weight);
 765 }
 766
 767 static inline void
 768 dec_load(struct rq *rq, const struct task_struct *p, u64 now)
 769 {
 770         update_curr_load(rq, now);
 771         update_load_sub(&rq->ls.load, p->se.load.weight);
 772 }
 773
 774 static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
 775 {
 776         rq->nr_running++;
 777         inc_load(rq, p, now);
 778 }
 779
 780 static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
 781 {
 782         rq->nr_running--;
 783         dec_load(rq, p, now);
 784 }
 785
 786 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
 787
 788 /*
 789  * runqueue iterator, to support SMP load-balancing between different
 790  * scheduling classes, without having to expose their internal data
 791  * structures to the load-balancing proper:
 792  */
 793 struct rq_iterator {
 794         void *arg;
 795         struct task_struct *(*start)(void *);
 796         struct task_struct *(*next)(void *);
 797 };
 798
 799 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 800                       unsigned long max_nr_move, unsigned long max_load_move,
 801                       struct sched_domain *sd, enum cpu_idle_type idle,
 802                       int *all_pinned, unsigned long *load_moved,
 803                       int this_best_prio, int best_prio, int best_prio_seen,
 804                       struct rq_iterator *iterator);
 805
 806 #include "sched_stats.h"
 807 #include "sched_rt.c"
 808 #include "sched_fair.c"
 809 #include "sched_idletask.c"
 810 #ifdef CONFIG_SCHED_DEBUG
 811 # include "sched_debug.c"
 812 #endif
 813
 814 #define sched_class_highest (&rt_sched_class)
 815
 816 static void set_load_weight(struct task_struct *p)
 817 {
 818         task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
 819         p->se.wait_runtime = 0;
 820
 821         if (task_has_rt_policy(p)) {
 822                 p->se.load.weight = prio_to_weight[0] * 2;
 823                 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
 824                 return;
 825         }
 826
 827         /*
 828          * SCHED_IDLE tasks get minimal weight:
 829          */
 830         if (p->policy == SCHED_IDLE) {
 831                 p->se.load.weight = WEIGHT_IDLEPRIO;
 832                 p->se.load.inv_weight = WMULT_IDLEPRIO;
 833                 return;
 834         }
 835
 836         p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
 837         p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
 838 }
 839
 840 static void
 841 enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
 842 {
 843         sched_info_queued(p);
 844         p->sched_class->enqueue_task(rq, p, wakeup, now);
 845         p->se.on_rq = 1;
 846 }
 847
 848 static void
 849 dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now)
 850 {
 851         p->sched_class->dequeue_task(rq, p, sleep, now);
 852         p->se.on_rq = 0;
 853 }
 854
 855 /*
 856  * __normal_prio - return the priority that is based on the static prio
 857  */
 858 static inline int __normal_prio(struct task_struct *p)
 859 {
 860         return p->static_prio;
 861 }
 862
 863 /*
 864  * Calculate the expected normal priority: i.e. priority
 865  * without taking RT-inheritance into account. Might be
 866  * boosted by interactivity modifiers. Changes upon fork,
 867  * setprio syscalls, and whenever the interactivity
 868  * estimator recalculates.
 869  */
 870 static inline int normal_prio(struct task_struct *p)
 871 {
 872         int prio;
 873
 874         if (task_has_rt_policy(p))
 875                 prio = MAX_RT_PRIO-1 - p->rt_priority;
 876         else
 877                 prio = __normal_prio(p);
 878         return prio;
 879 }
 880
 881 /*
 882  * Calculate the current priority, i.e. the priority
 883  * taken into account by the scheduler. This value might
 884  * be boosted by RT tasks, or might be boosted by
 885  * interactivity modifiers. Will be RT if the task got
 886  * RT-boosted. If not then it returns p->normal_prio.
 887  */
 888 static int effective_prio(struct task_struct *p)
 889 {
 890         p->normal_prio = normal_prio(p);
 891         /*
 892          * If we are RT tasks or we were boosted to RT priority,
 893          * keep the priority unchanged. Otherwise, update priority
 894          * to the normal priority:
 895          */
 896         if (!rt_prio(p->prio))
 897                 return p->normal_prio;
 898         return p->prio;
 899 }
 900
 901 /*
 902  * activate_task - move a task to the runqueue.
 903  */
 904 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
 905 {
 906         u64 now = rq_clock(rq);
 907
 908         if (p->state == TASK_UNINTERRUPTIBLE)
 909                 rq->nr_uninterruptible--;
 910
 911         enqueue_task(rq, p, wakeup, now);
 912         inc_nr_running(p, rq, now);
 913 }
 914
 915 /*
 916  * activate_idle_task - move idle task to the _front_ of runqueue.
 917  */
 918 static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
 919 {
 920         u64 now = rq_clock(rq);
 921
 922         if (p->state == TASK_UNINTERRUPTIBLE)
 923                 rq->nr_uninterruptible--;
 924
 925         enqueue_task(rq, p, 0, now);
 926         inc_nr_running(p, rq, now);
 927 }
 928
 929 /*
 930  * deactivate_task - remove a task from the runqueue.
 931  */
 932 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
 933 {
 934         u64 now = rq_clock(rq);
 935
 936         if (p->state == TASK_UNINTERRUPTIBLE)
 937                 rq->nr_uninterruptible++;
 938
 939         dequeue_task(rq, p, sleep, now);
 940         dec_nr_running(p, rq, now);
 941 }
 942
 943 /**
 944  * task_curr - is this task currently executing on a CPU?
 945  * @p: the task in question.
 946  */
 947 inline int task_curr(const struct task_struct *p)
 948 {
 949         return cpu_curr(task_cpu(p)) == p;
 950 }
 951
 952 /* Used instead of source_load when we know the type == 0 */
 953 unsigned long weighted_cpuload(const int cpu)
 954 {
 955         return cpu_rq(cpu)->ls.load.weight;
 956 }
 957
 958 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 959 {
 960 #ifdef CONFIG_SMP
 961         task_thread_info(p)->cpu = cpu;
 962         set_task_cfs_rq(p);
 963 #endif
 964 }
 965
 966 #ifdef CONFIG_SMP
 967
 968 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 969 {
 970         int old_cpu = task_cpu(p);
 971         struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
 972         u64 clock_offset, fair_clock_offset;
 973
 974         clock_offset = old_rq->clock - new_rq->clock;
 975         fair_clock_offset = old_rq->cfs.fair_clock -
 976                                                  new_rq->cfs.fair_clock;
 977         if (p->se.wait_start)
 978                 p->se.wait_start -= clock_offset;
 979         if (p->se.wait_start_fair)
 980                 p->se.wait_start_fair -= fair_clock_offset;
 981         if (p->se.sleep_start)
 982                 p->se.sleep_start -= clock_offset;
 983         if (p->se.block_start)
 984                 p->se.block_start -= clock_offset;
 985         if (p->se.sleep_start_fair)
 986                 p->se.sleep_start_fair -= fair_clock_offset;
 987
 988         __set_task_cpu(p, new_cpu);
 989 }
 990
/*
 * A request to move @task to @dest_cpu. migrate_task() fills one in and
 * links it onto the migration_queue of the task's runqueue.
 */
struct migration_req {
        /* Link in rq->migration_queue. */
        struct list_head list;

        /* Task to be moved and the cpu it should end up on. */
        struct task_struct *task;
        int dest_cpu;

        /*
         * Initialised by migrate_task(). Presumably completed by the
         * migration thread once the request is served -- confirm.
         */
        struct completion done;
};
 999
1000 /*
1001  * The task's runqueue lock must be held.
1002  * Returns true if you have to wait for migration thread.
1003  */
1004 static int
1005 migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1006 {
1007         struct rq *rq = task_rq(p);
1008
1009         /*
1010          * If the task is not on a runqueue (and not running), then
1011          * it is sufficient to simply update the task's cpu field.
1012          */
1013         if (!p->se.on_rq && !task_running(rq, p)) {
1014                 set_task_cpu(p, dest_cpu);
1015                 return 0;
1016         }
1017
1018         init_completion(&req->done);
1019         req->task = p;
1020         req->dest_cpu = dest_cpu;
1021         list_add(&req->list, &rq->migration_queue);
1022
1023         return 1;
1024 }
1025
/*
 * wait_task_inactive - wait for a thread to unschedule.
 *
 * The caller must ensure that the task *will* unschedule sometime soon,
 * else this function might spin for a *long* time. This function can't
 * be called with interrupts off, or it may introduce deadlock with
 * smp_call_function() if an IPI is sent by the same process we are
 * waiting to become inactive.
 *
 * Returns only once a locked check has seen the task both not running
 * and not on a runqueue.
 */
void wait_task_inactive(struct task_struct *p)
{
        unsigned long flags;
        int running, on_rq;
        struct rq *rq;

repeat:
        /*
         * We do the initial early heuristics without holding
         * any task-queue locks at all. We'll only try to get
         * the runqueue lock when things look like they will
         * work out!
         */
        rq = task_rq(p);

        /*
         * If the task is actively running on another CPU
         * still, just relax and busy-wait without holding
         * any locks.
         *
         * NOTE! Since we don't hold any locks, it's not
         * even sure that "rq" stays as the right runqueue!
         * But we don't care, since "task_running()" will
         * return false if the runqueue has changed and p
         * is actually now running somewhere else!
         */
        while (task_running(rq, p))
                cpu_relax();

        /*
         * Ok, time to look more closely! We need the rq
         * lock now, to be *sure*. If we're wrong, we'll
         * just go back and repeat.
         *
         * Both values are sampled under the lock and acted
         * upon only after the lock has been dropped again.
         */
        rq = task_rq_lock(p, &flags);
        running = task_running(rq, p);
        on_rq = p->se.on_rq;
        task_rq_unlock(rq, &flags);

        /*
         * Was it really running after all now that we
         * checked with the proper locks actually held?
         *
         * Oops. Go back and try again..
         */
        if (unlikely(running)) {
                cpu_relax();
                goto repeat;
        }

        /*
         * It's not enough that it's not actively running,
         * it must be off the runqueue _entirely_, and not
         * preempted!
         *
         * So if it was still runnable (but just not actively
         * running right now), it's preempted, and we should
         * yield - it could be a while.
         */
        if (unlikely(on_rq)) {
                yield();
                goto repeat;
        }

        /*
         * Ahh, all good. It wasn't running, and it wasn't
         * runnable, which means that it will never become
         * running in the future either. We're all done!
         */
}
1105
/***
 * kick_process - kick a running thread to enter/exit the kernel
 * @p: the to-be-kicked thread
 *
 * Cause a process which is running on another CPU to enter
 * kernel-mode, without any delay. (to get signals handled.)
 *
 * A reschedule IPI is sent only when @p's cpu is not the local one and
 * @p is the task currently executing there. Preemption is disabled
 * around the check and the send.
 *
 * NOTE: this function doesn't have to take the runqueue lock,
 * because all it wants to ensure is that the remote task enters
 * the kernel. If the IPI races and the task has been migrated
 * to another CPU then no harm is done and the purpose has been
 * achieved as well.
 */
void kick_process(struct task_struct *p)
{
        int target;

        preempt_disable();

        target = task_cpu(p);
        if (target != smp_processor_id()) {
                if (task_curr(p))
                        smp_send_reschedule(target);
        }

        preempt_enable();
}
1129
1130 /*
1131  * Return a low guess at the load of a migration-source cpu weighted
1132  * according to the scheduling class and "nice" value.
1133  *
1134  * We want to under-estimate the load of migration sources, to
1135  * balance conservatively.
1136  */
1137 static inline unsigned long source_load(int cpu, int type)
1138 {
1139         struct rq *rq = cpu_rq(cpu);
1140         unsigned long total = weighted_cpuload(cpu);
1141
1142         if (type == 0)
1143                 return total;
1144
1145         return min(rq->cpu_load[type-1], total);
1146 }
1147
1148 /*
1149  * Return a high guess at the load of a migration-target cpu weighted
1150  * according to the scheduling class and "nice" value.
1151  */
1152 static inline unsigned long target_load(int cpu, int type)
1153 {
1154         struct rq *rq = cpu_rq(cpu);
1155         unsigned long total = weighted_cpuload(cpu);
1156
1157         if (type == 0)
1158                 return total;
1159
1160         return max(rq->cpu_load[type-1], total);
1161 }
1162
1163 /*
1164  * Return the average load per task on the cpu's run queue
1165  */
1166 static inline unsigned long cpu_avg_load_per_task(int cpu)
1167 {
1168         struct rq *rq = cpu_rq(cpu);
1169         unsigned long total = weighted_cpuload(cpu);
1170         unsigned long n = rq->nr_running;
1171
1172         return n ? total / n : SCHED_LOAD_SCALE;
1173 }
1174
1175 /*
1176  * find_idlest_group finds and returns the least busy CPU group within the
1177  * domain.
1178  */
1179 static struct sched_group *
1180 find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1181 {
1182         struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
1183         unsigned long min_load = ULONG_MAX, this_load = 0;
1184         int load_idx = sd->forkexec_idx;
1185         int imbalance = 100 + (sd->imbalance_pct-100)/2;
1186
1187         do {
1188                 unsigned long load, avg_load;
1189                 int local_group;
1190                 int i;
1191
1192                 /* Skip over this group if it has no CPUs allowed */
1193                 if (!cpus_intersects(group->cpumask, p->cpus_allowed))
1194                         goto nextgroup;
1195
1196                 local_group = cpu_isset(this_cpu, group->cpumask);
1197
1198                 /* Tally up the load of all CPUs in the group */
1199                 avg_load = 0;
1200
1201                 for_each_cpu_mask(i, group->cpumask) {
1202                         /* Bias balancing toward cpus of our domain */
1203                         if (local_group)
1204                                 load = source_load(i, load_idx);
1205                         else
1206                                 load = target_load(i, load_idx);
1207
1208                         avg_load += load;
1209                 }
1210
1211                 /* Adjust by relative CPU power of the group */
1212                 avg_load = sg_div_cpu_power(group,
1213                                 avg_load * SCHED_LOAD_SCALE);
1214
1215                 if (local_group) {
1216                         this_load = avg_load;
1217                         this = group;
1218                 } else if (avg_load < min_load) {
1219                         min_load = avg_load;
1220                         idlest = group;
1221                 }
1222 nextgroup:
1223                 group = group->next;
1224         } while (group != sd->groups);
1225
1226         if (!idlest || 100*this_load < imbalance*min_load)
1227                 return NULL;
1228         return idlest;
1229 }
1230
1231 /*
1232  * find_idlest_cpu - find the idlest cpu among the cpus in group.
1233  */
1234 static int
1235 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1236 {
1237         cpumask_t tmp;
1238         unsigned long load, min_load = ULONG_MAX;
1239         int idlest = -1;
1240         int i;
1241
1242         /* Traverse only the allowed CPUs */
1243         cpus_and(tmp, group->cpumask, p->cpus_allowed);
1244
1245         for_each_cpu_mask(i, tmp) {
1246                 load = weighted_cpuload(i);
1247
1248                 if (load < min_load || (load == min_load && i == this_cpu)) {
1249                         min_load = load;
1250                         idlest = i;
1251                 }
1252         }
1253
1254         return idlest;
1255 }
1256
/*
 * sched_balance_self: balance the current task (running on cpu) in domains
 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
 * SD_BALANCE_EXEC.
 *
 * Balance, ie. select the least loaded group.
 *
 * Returns the target CPU number, or the same CPU if no balancing is needed.
 *
 * preempt must be disabled.
 */
static int sched_balance_self(int cpu, int flag)
{
        struct task_struct *t = current;
        struct sched_domain *tmp, *sd = NULL;

        /*
         * Pick the starting domain: the last domain visited by
         * for_each_domain() that carries 'flag', not looking at or
         * beyond the first power-savings domain.
         */
        for_each_domain(cpu, tmp) {
                /*
                 * If power savings logic is enabled for a domain, stop there.
                 */
                if (tmp->flags & SD_POWERSAVINGS_BALANCE)
                        break;
                if (tmp->flags & flag)
                        sd = tmp;
        }

        /* Descend from the chosen domain until there is none left. */
        while (sd) {
                cpumask_t span;
                struct sched_group *group;
                int new_cpu, weight;

                /* Domains without 'flag' are stepped over. */
                if (!(sd->flags & flag)) {
                        sd = sd->child;
                        continue;
                }

                /* Copy the span now: sd is reassigned further down. */
                span = sd->span;
                group = find_idlest_group(sd, t, cpu);
                if (!group) {
                        sd = sd->child;
                        continue;
                }

                new_cpu = find_idlest_cpu(group, t, cpu);
                if (new_cpu == -1 || new_cpu == cpu) {
                        /* Now try balancing at a lower domain level of cpu */
                        sd = sd->child;
                        continue;
                }

                /* Now try balancing at a lower domain level of new_cpu */
                cpu = new_cpu;
                sd = NULL;
                weight = cpus_weight(span);
                /*
                 * Among new_cpu's domains, keep the last one carrying
                 * 'flag' whose span is strictly smaller than the span
                 * that was just balanced.
                 */
                for_each_domain(cpu, tmp) {
                        if (weight <= cpus_weight(tmp->span))
                                break;
                        if (tmp->flags & flag)
                                sd = tmp;
                }
                /* while loop will break here if sd == NULL */
        }

        return cpu;
}
1322
1323 #endif /* CONFIG_SMP */
1324
/*
 * wake_idle() will wake a task on an idle cpu if task->cpu is
 * not idle and an idle cpu is available.  The span of cpus to
 * search starts with cpus closest then further out as needed,
 * so we always favor a closer, idle cpu.
 *
 * Returns the CPU we should wake onto. Without
 * ARCH_HAS_SCHED_WAKE_IDLE this is always @cpu.
 */
#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
static int wake_idle(int cpu, struct task_struct *p)
{
        struct sched_domain *sd;
        cpumask_t candidates;
        int i;

        /* If it is idle, then it is the best cpu to run this task. */
        if (idle_cpu(cpu))
                return cpu;

        /*
         * This cpu is also the best, if it has more than one task already.
         * Siblings must be also busy (in most cases) as they didn't already
         * pick up the extra load from this cpu and hence we need not check
         * sibling runqueue info. This will avoid the checks and cache miss
         * penalties associated with that.
         */
        if (cpu_rq(cpu)->nr_running > 1)
                return cpu;

        for_each_domain(cpu, sd) {
                /* The search ends at the first domain without SD_WAKE_IDLE. */
                if (!(sd->flags & SD_WAKE_IDLE))
                        break;

                cpus_and(candidates, sd->span, p->cpus_allowed);
                for_each_cpu_mask(i, candidates) {
                        if (idle_cpu(i))
                                return i;
                }
        }

        return cpu;
}
#else
static inline int wake_idle(int cpu, struct task_struct *p)
{
        return cpu;
}
#endif
1371
/***
 * try_to_wake_up - wake up a thread
 * @p: the to-be-woken-up thread
 * @state: the mask of task states that can be woken
 * @sync: do a synchronous wakeup?
 *
 * Put it on the run-queue if it's not already there. The "current"
 * thread is always on the run-queue (except when the actual
 * re-schedule is in progress), and as such you're allowed to do
 * the simpler "current->state = TASK_RUNNING" to mark yourself
 * runnable without the overhead of this.
 *
 * Returns 1 when the task was put on a runqueue by this call. Returns 0
 * when the task's state is not covered by @state (nothing is changed)
 * or when the task was already on a runqueue (its state is still set
 * to TASK_RUNNING).
 */
static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
{
        int cpu, this_cpu, success = 0;
        unsigned long flags;
        long old_state;
        struct rq *rq;
#ifdef CONFIG_SMP
        struct sched_domain *sd, *this_sd = NULL;
        unsigned long load, this_load;
        int new_cpu;
#endif

        rq = task_rq_lock(p, &flags);
        old_state = p->state;
        /* Not in one of the states the caller wants to wake: leave it. */
        if (!(old_state & state))
                goto out;

        /* Already queued: only the state needs to be fixed up. */
        if (p->se.on_rq)
                goto out_running;

        cpu = task_cpu(p);
        this_cpu = smp_processor_id();

#ifdef CONFIG_SMP
        /* Still executing on its cpu: activate in place, no cpu selection. */
        if (unlikely(task_running(rq, p)))
                goto out_activate;

        new_cpu = cpu;

        schedstat_inc(rq, ttwu_cnt);
        if (cpu == this_cpu) {
                schedstat_inc(rq, ttwu_local);
                goto out_set_cpu;
        }

        /* First domain of this_cpu whose span also contains the task's cpu. */
        for_each_domain(this_cpu, sd) {
                if (cpu_isset(cpu, sd->span)) {
                        schedstat_inc(sd, ttwu_wake_remote);
                        this_sd = sd;
                        break;
                }
        }

        /* The task may not run here: keep new_cpu == cpu. */
        if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
                goto out_set_cpu;

        /*
         * Check for affine wakeup and passive balancing possibilities.
         */
        if (this_sd) {
                int idx = this_sd->wake_idx;
                unsigned int imbalance;

                imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;

                load = source_load(cpu, idx);
                this_load = target_load(this_cpu, idx);

                new_cpu = this_cpu; /* Wake to this CPU if we can */

                if (this_sd->flags & SD_WAKE_AFFINE) {
                        unsigned long tl = this_load;
                        unsigned long tl_per_task;

                        tl_per_task = cpu_avg_load_per_task(this_cpu);

                        /*
                         * If sync wakeup then subtract the (maximum possible)
                         * effect of the currently running task from the load
                         * of the current CPU:
                         */
                        if (sync)
                                tl -= current->se.load.weight;

                        if ((tl <= load &&
                                tl + target_load(cpu, idx) <= tl_per_task) ||
                               100*(tl + p->se.load.weight) <= imbalance*load) {
                                /*
                                 * This domain has SD_WAKE_AFFINE and the
                                 * load comparison above found no bad
                                 * imbalance in pulling p to this cpu.
                                 */
                                schedstat_inc(this_sd, ttwu_move_affine);
                                goto out_set_cpu;
                        }
                }

                /*
                 * Start passive balancing when half the imbalance_pct
                 * limit is reached.
                 */
                if (this_sd->flags & SD_WAKE_BALANCE) {
                        if (imbalance*this_load <= 100*load) {
                                schedstat_inc(this_sd, ttwu_move_balance);
                                goto out_set_cpu;
                        }
                }
        }

        new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
out_set_cpu:
        new_cpu = wake_idle(new_cpu, p);
        if (new_cpu != cpu) {
                set_task_cpu(p, new_cpu);
                task_rq_unlock(rq, &flags);
                /* might preempt at this point */
                rq = task_rq_lock(p, &flags);
                /*
                 * The lock was dropped, so the checks made at the
                 * top of the function are repeated.
                 */
                old_state = p->state;
                if (!(old_state & state))
                        goto out;
                if (p->se.on_rq)
                        goto out_running;

                this_cpu = smp_processor_id();
                cpu = task_cpu(p);
        }

out_activate:
#endif /* CONFIG_SMP */
        activate_task(rq, p, 1);
        /*
         * Sync wakeups (i.e. those types of wakeups where the waker
         * has indicated that it will leave the CPU in short order)
         * don't trigger a preemption, if the woken up task will run on
         * this cpu. (in this case the 'I will reschedule' promise of
         * the waker guarantees that the freshly woken up task is going
         * to be considered on this CPU.)
         */
        if (!sync || cpu != this_cpu)
                check_preempt_curr(rq, p);
        success = 1;

out_running:
        p->state = TASK_RUNNING;
out:
        task_rq_unlock(rq, &flags);

        return success;
}
1525
1526 int fastcall wake_up_process(struct task_struct *p)
1527 {
1528         return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1529                                  TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1530 }
1531 EXPORT_SYMBOL(wake_up_process);
1532
/*
 * wake_up_state - wake @p if its state is covered by the mask @state.
 *
 * Non-synchronous wakeup (sync == 0); the return value is that of
 * try_to_wake_up().
 */
int fastcall wake_up_state(struct task_struct *p, unsigned int state)
{
        return try_to_wake_up(p, state, 0);
}
1537
1538 /*
1539  * Perform scheduler related setup for a newly forked process p.
1540  * p is forked by current.
1541  *
1542  * __sched_fork() is basic setup used by init_idle() too:
1543  */
1544 static void __sched_fork(struct task_struct *p)
1545 {
1546         p->se.wait_start_fair           = 0;
1547         p->se.wait_start                = 0;
1548         p->se.exec_start                = 0;
1549         p->se.sum_exec_runtime          = 0;
1550         p->se.delta_exec                = 0;
1551         p->se.delta_fair_run            = 0;
1552         p->se.delta_fair_sleep          = 0;
1553         p->se.wait_runtime              = 0;
1554         p->se.sum_wait_runtime          = 0;
1555         p->se.sum_sleep_runtime         = 0;
1556         p->se.sleep_start               = 0;
1557         p->se.sleep_start_fair          = 0;
1558         p->se.block_start               = 0;
1559         p->se.sleep_max                 = 0;
1560         p->se.block_max                 = 0;
1561         p->se.exec_max                  = 0;
1562         p->se.wait_max                  = 0;
1563         p->se.wait_runtime_overruns     = 0;
1564         p->se.wait_runtime_underruns    = 0;
1565
1566         INIT_LIST_HEAD(&p->run_list);
1567         p->se.on_rq = 0;
1568
1569         /*
1570          * We mark the process as running here, but have not actually
1571          * inserted it onto the runqueue yet. This guarantees that
1572          * nobody will actually run it, and a signal or other external
1573          * event cannot wake it up and insert it on the runqueue either.
1574          */
1575         p->state = TASK_RUNNING;
1576 }
1577
/*
 * fork()/clone()-time setup:
 *
 * Resets the scheduler state of the new task @p, chooses the cpu it is
 * assigned to and initialises its priority from the parent's normal
 * priority. @clone_flags is not used by this function. Everything runs
 * between get_cpu() and put_cpu().
 */
void sched_fork(struct task_struct *p, int clone_flags)
{
        int cpu = get_cpu();

        __sched_fork(p);

#ifdef CONFIG_SMP
        /* Fork-time balancing may pick a cpu other than the parent's. */
        cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
#endif
        __set_task_cpu(p, cpu);

        /*
         * Make sure we do not leak PI boosting priority to the child:
         */
        p->prio = current->normal_prio;

#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
        /* Start the child with empty scheduling statistics. */
        if (likely(sched_info_on()))
                memset(&p->sched_info, 0, sizeof(p->sched_info));
#endif
#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
        p->oncpu = 0;
#endif
#ifdef CONFIG_PREEMPT
        /* Want to start with kernel preemption disabled. */
        task_thread_info(p)->preempt_count = 1;
#endif
        put_cpu();
}
1610
/*
 * After fork, child runs first. (default) If set to 0 then
 * parent will (try to) run first.
 *
 * Read by wake_up_new_task(): when it is 0 the new task is simply
 * activated instead of going through the scheduling class's task_new()
 * hook.
 */
unsigned int __read_mostly sysctl_sched_child_runs_first = 1;
1616
1617 /*
1618  * wake_up_new_task - wake up a newly created task for the first time.
1619  *
1620  * This function will do some initial scheduler statistics housekeeping
1621  * that must be done for every newly created context, then puts the task
1622  * on the runqueue and wakes it.
1623  */
1624 void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1625 {
1626         unsigned long flags;
1627         struct rq *rq;
1628         int this_cpu;
1629
1630         rq = task_rq_lock(p, &flags);
1631         BUG_ON(p->state != TASK_RUNNING);
1632         this_cpu = smp_processor_id(); /* parent's CPU */
1633
1634         p->prio = effective_prio(p);
1635
1636         if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) ||
1637                         task_cpu(p) != this_cpu || !current->se.on_rq) {
1638                 activate_task(rq, p, 0);
1639         } else {
1640                 /*
1641                  * Let the scheduling class do new task startup
1642                  * management (if any):
1643                  */
1644                 p->sched_class->task_new(rq, p);
1645         }
1646         check_preempt_curr(rq, p);
1647         task_rq_unlock(rq, &flags);
1648 }
1649
/**
 * prepare_task_switch - prepare to switch tasks
 * @rq: the runqueue preparing to switch
 * @next: the task we are going to switch to.
 *
 * This is called with the rq lock held and interrupts off. It must
 * be paired with a subsequent finish_task_switch after the context
 * switch.
 *
 * prepare_task_switch sets up locking and calls architecture specific
 * hooks.
 */
static inline void prepare_task_switch(struct rq *rq, struct task_struct *next)
{
        /* Locking side first, then the architecture hook. */
        prepare_lock_switch(rq, next);
        prepare_arch_switch(next);
}
1667
/**
 * finish_task_switch - clean up after a task-switch
 * @rq: runqueue associated with task-switch
 * @prev: the thread we just switched away from.
 *
 * finish_task_switch must be called after the context switch, paired
 * with a prepare_task_switch call before the context switch.
 * finish_task_switch will reconcile locking set up by prepare_task_switch,
 * and do any other architecture-specific cleanup actions.
 *
 * Note that we may have delayed dropping an mm in context_switch(). If
 * so, we finish that here outside of the runqueue lock.  (Doing it
 * with the lock held can cause deadlocks; see schedule() for
 * details.)
 */
static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
        __releases(rq->lock)
{
        /* Take over the mm parked in rq->prev_mm, if any. */
        struct mm_struct *mm = rq->prev_mm;
        long prev_state;

        rq->prev_mm = NULL;

        /*
         * A task struct has one reference for the use as "current".
         * If a task dies, then it sets TASK_DEAD in tsk->state and calls
         * schedule one last time. The schedule call will never return, and
         * the scheduled task must drop that reference.
         * The test for TASK_DEAD must occur while the runqueue locks are
         * still held, otherwise prev could be scheduled on another cpu, die
         * there before we look at prev->state, and then the reference would
         * be dropped twice.
         *              Manfred Spraul <manfred@colorfullife.com>
         */
        prev_state = prev->state;
        finish_arch_switch(prev);
        finish_lock_switch(rq, prev);
        /* From here on only the sampled prev_state is used, not prev->state. */
        if (mm)
                mmdrop(mm);
        if (unlikely(prev_state == TASK_DEAD)) {
                /*
                 * Remove function-return probe instances associated with this
                 * task and put them back on the free list.
                 */
                kprobe_flush_task(prev);
                put_task_struct(prev);
        }
}
1716
/**
 * schedule_tail - first thing a freshly forked thread must call.
 * @prev: the thread we just switched away from.
 *
 * Finishes the task switch on the current cpu's runqueue and, when
 * current->set_child_tid is set, stores the task's pid at that user
 * address.
 */
asmlinkage void schedule_tail(struct task_struct *prev)
        __releases(rq->lock)
{
        struct rq *rq = this_rq();

        finish_task_switch(rq, prev);
#ifdef __ARCH_WANT_UNLOCKED_CTXSW
        /* In this case, finish_task_switch does not reenable preemption */
        preempt_enable();
#endif
        /* The result of put_user() is not checked here. */
        if (current->set_child_tid)
                put_user(current->pid, current->set_child_tid);
}
1734
/*
 * context_switch - switch to the new MM and the new
 * thread's register state.
 *
 * @rq:   runqueue on which the switch takes place
 * @prev: task being switched away from
 * @next: task being switched to
 */
static inline void
context_switch(struct rq *rq, struct task_struct *prev,
               struct task_struct *next)
{
        struct mm_struct *mm, *oldmm;

        prepare_task_switch(rq, next);
        mm = next->mm;
        oldmm = prev->active_mm;
        /*
         * For paravirt, this is coupled with an exit in switch_to to
         * combine the page table reload and the switch backend into
         * one hypercall.
         */
        arch_enter_lazy_cpu_mode();

        /*
         * A next task without an mm of its own borrows prev's active
         * mm and takes a reference on it; otherwise switch address
         * spaces.
         */
        if (unlikely(!mm)) {
                next->active_mm = oldmm;
                atomic_inc(&oldmm->mm_count);
                enter_lazy_tlb(oldmm, next);
        } else
                switch_mm(oldmm, mm, next);

        /*
         * A prev task without an mm of its own gives up the mm it was
         * using; it is parked in rq->prev_mm, which
         * finish_task_switch() picks up and drops.
         */
        if (unlikely(!prev->mm)) {
                prev->active_mm = NULL;
                rq->prev_mm = oldmm;
        }
        /*
         * Since the runqueue lock will be released by the next
         * task (which is an invalid locking op but in the case
         * of the scheduler it's an obvious special-case), so we
         * do an early lockdep release here:
         */
#ifndef __ARCH_WANT_UNLOCKED_CTXSW
        spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
#endif

        /* Here we just switch the register state and the stack. */
        switch_to(prev, next, prev);

        barrier();
        /*
         * this_rq must be evaluated again because prev may have moved
         * CPUs since it called schedule(), thus the 'rq' on its stack
         * frame will be invalid.
         */
        finish_task_switch(this_rq(), prev);
}
1787
1788 /*
1789  * nr_running, nr_uninterruptible and nr_context_switches:
1790  *
1791  * externally visible scheduler statistics: current number of runnable
1792  * threads, current number of uninterruptible-sleeping threads, total
1793  * number of context switches performed since bootup.
1794  */
1795 unsigned long nr_running(void)
1796 {
1797         unsigned long i, sum = 0;
1798
1799         for_each_online_cpu(i)
1800                 sum += cpu_rq(i)->nr_running;
1801
1802         return sum;
1803 }
1804
1805 unsigned long nr_uninterruptible(void)
1806 {
1807         unsigned long i, sum = 0;
1808
1809         for_each_possible_cpu(i)
1810                 sum += cpu_rq(i)->nr_uninterruptible;
1811
1812         /*
1813          * Since we read the counters lockless, it might be slightly
1814          * inaccurate. Do not allow it to go below zero though:
1815          */
1816         if (unlikely((long)sum < 0))
1817                 sum = 0;
1818
1819         return sum;
1820 }
1821
1822 unsigned long long nr_context_switches(void)
1823 {
1824         int i;
1825         unsigned long long sum = 0;
1826
1827         for_each_possible_cpu(i)
1828                 sum += cpu_rq(i)->nr_switches;
1829
1830         return sum;
1831 }
1832
1833 unsigned long nr_iowait(void)
1834 {
1835         unsigned long i, sum = 0;
1836
1837         for_each_possible_cpu(i)
1838                 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1839
1840         return sum;
1841 }
1842
1843 unsigned long nr_active(void)
1844 {
1845         unsigned long i, running = 0, uninterruptible = 0;
1846
1847         for_each_online_cpu(i) {
1848                 running += cpu_rq(i)->nr_running;
1849                 uninterruptible += cpu_rq(i)->nr_uninterruptible;
1850         }
1851
1852         if (unlikely((long)uninterruptible < 0))
1853                 uninterruptible = 0;
1854
1855         return running + uninterruptible;
1856 }
1857
1858 /*
1859  * Update rq->cpu_load[] statistics. This function is usually called every
1860  * scheduler tick (TICK_NSEC).
1861  */
1862 static void update_cpu_load(struct rq *this_rq)
1863 {
1864         u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64;
1865         unsigned long total_load = this_rq->ls.load.weight;
1866         unsigned long this_load =  total_load;
1867         struct load_stat *ls = &this_rq->ls;
1868         u64 now = __rq_clock(this_rq);
1869         int i, scale;
1870
1871         this_rq->nr_load_updates++;
1872         if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD)))
1873                 goto do_avg;
1874
1875         /* Update delta_fair/delta_exec fields first */
1876         update_curr_load(this_rq, now);
1877
1878         fair_delta64 = ls->delta_fair + 1;
1879         ls->delta_fair = 0;
1880
1881         exec_delta64 = ls->delta_exec + 1;
1882         ls->delta_exec = 0;
1883
1884         sample_interval64 = now - ls->load_update_last;
1885         ls->load_update_last = now;
1886
1887         if ((s64)sample_interval64 < (s64)TICK_NSEC)
1888                 sample_interval64 = TICK_NSEC;
1889
1890         if (exec_delta64 > sample_interval64)
1891                 exec_delta64 = sample_interval64;
1892
1893         idle_delta64 = sample_interval64 - exec_delta64;
1894
1895         tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64);
1896         tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64);
1897
1898         this_load = (unsigned long)tmp64;
1899
1900 do_avg:
1901
1902         /* Update our load: */
1903         for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
1904                 unsigned long old_load, new_load;
1905
1906                 /* scale is effectively 1 << i now, and >> i divides by scale */
1907
1908                 old_load = this_rq->cpu_load[i];
1909                 new_load = this_load;
1910
1911                 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
1912         }
1913 }
1914
1915 #ifdef CONFIG_SMP
1916
1917 /*
1918  * double_rq_lock - safely lock two runqueues
1919  *
1920  * Note this does not disable interrupts like task_rq_lock,
1921  * you need to do so manually before calling.
1922  */
1923 static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1924         __acquires(rq1->lock)
1925         __acquires(rq2->lock)
1926 {
1927         BUG_ON(!irqs_disabled());
1928         if (rq1 == rq2) {
1929                 spin_lock(&rq1->lock);
1930                 __acquire(rq2->lock);   /* Fake it out ;) */
1931         } else {
1932                 if (rq1 < rq2) {
1933                         spin_lock(&rq1->lock);
1934                         spin_lock(&rq2->lock);
1935                 } else {
1936                         spin_lock(&rq2->lock);
1937                         spin_lock(&rq1->lock);
1938                 }
1939         }
1940 }
1941
1942 /*
1943  * double_rq_unlock - safely unlock two runqueues
1944  *
1945  * Note this does not restore interrupts like task_rq_unlock,
1946  * you need to do so manually after calling.
1947  */
1948 static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1949         __releases(rq1->lock)
1950         __releases(rq2->lock)
1951 {
1952         spin_unlock(&rq1->lock);
1953         if (rq1 != rq2)
1954                 spin_unlock(&rq2->lock);
1955         else
1956                 __release(rq2->lock);
1957 }
1958
1959 /*
1960  * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1961  */
1962 static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
1963         __releases(this_rq->lock)
1964         __acquires(busiest->lock)
1965         __acquires(this_rq->lock)
1966 {
1967         if (unlikely(!irqs_disabled())) {
1968                 /* printk() doesn't work good under rq->lock */
1969                 spin_unlock(&this_rq->lock);
1970                 BUG_ON(1);
1971         }
1972         if (unlikely(!spin_trylock(&busiest->lock))) {
1973                 if (busiest < this_rq) {
1974                         spin_unlock(&this_rq->lock);
1975                         spin_lock(&busiest->lock);
1976                         spin_lock(&this_rq->lock);
1977                 } else
1978                         spin_lock(&busiest->lock);
1979         }
1980 }
1981
1982 /*
1983  * If dest_cpu is allowed for this process, migrate the task to it.
1984  * This is accomplished by forcing the cpu_allowed mask to only
1985  * allow dest_cpu, which will force the cpu onto dest_cpu.  Then
1986  * the cpu_allowed mask is restored.
1987  */
1988 static void sched_migrate_task(struct task_struct *p, int dest_cpu)
1989 {
1990         struct migration_req req;
1991         unsigned long flags;
1992         struct rq *rq;
1993
1994         rq = task_rq_lock(p, &flags);
1995         if (!cpu_isset(dest_cpu, p->cpus_allowed)
1996             || unlikely(cpu_is_offline(dest_cpu)))
1997                 goto out;
1998
1999         /* force the process onto the specified CPU */
2000         if (migrate_task(p, dest_cpu, &req)) {
2001                 /* Need to wait for migration thread (might exit: take ref). */
2002                 struct task_struct *mt = rq->migration_thread;
2003
2004                 get_task_struct(mt);
2005                 task_rq_unlock(rq, &flags);
2006                 wake_up_process(mt);
2007                 put_task_struct(mt);
2008                 wait_for_completion(&req.done);
2009
2010                 return;
2011         }
2012 out:
2013         task_rq_unlock(rq, &flags);
2014 }
2015
2016 /*
2017  * sched_exec - execve() is a valuable balancing opportunity, because at
2018  * this point the task has the smallest effective memory and cache footprint.
2019  */
2020 void sched_exec(void)
2021 {
2022         int new_cpu, this_cpu = get_cpu();
2023         new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
2024         put_cpu();
2025         if (new_cpu != this_cpu)
2026                 sched_migrate_task(current, new_cpu);
2027 }
2028
/*
 * pull_task - move a task from a remote runqueue to the local runqueue.
 * Both runqueues must be locked.
 *
 * The task is taken off src_rq, re-homed to this_cpu, put on this_rq and
 * finally checked against the task currently running there.
 */
static void
pull_task(struct rq *src_rq, struct task_struct *p, struct rq *this_rq,
          int this_cpu)
{
        deactivate_task(src_rq, p, 0);
        set_task_cpu(p, this_cpu);
        activate_task(this_rq, p, 0);

        /*
         * Idle threads have a prio of MAX_PRIO, so this preemption test
         * is always true for them.
         */
        check_preempt_curr(this_rq, p);
}
2045
2046 /*
2047  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
2048  */
2049 static
2050 int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2051                      struct sched_domain *sd, enum cpu_idle_type idle,
2052                      int *all_pinned)
2053 {
2054         /*
2055          * We do not migrate tasks that are:
2056          * 1) running (obviously), or
2057          * 2) cannot be migrated to this CPU due to cpus_allowed, or
2058          * 3) are cache-hot on their current CPU.
2059          */
2060         if (!cpu_isset(this_cpu, p->cpus_allowed))
2061                 return 0;
2062         *all_pinned = 0;
2063
2064         if (task_running(rq, p))
2065                 return 0;
2066
2067         /*
2068          * Aggressive migration if too many balance attempts have failed:
2069          */
2070         if (sd->nr_balance_failed > sd->cache_nice_tries)
2071                 return 1;
2072
2073         return 1;
2074 }
2075
2076 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2077                       unsigned long max_nr_move, unsigned long max_load_move,
2078                       struct sched_domain *sd, enum cpu_idle_type idle,
2079                       int *all_pinned, unsigned long *load_moved,
2080                       int this_best_prio, int best_prio, int best_prio_seen,
2081                       struct rq_iterator *iterator)
2082 {
2083         int pulled = 0, pinned = 0, skip_for_load;
2084         struct task_struct *p;
2085         long rem_load_move = max_load_move;
2086
2087         if (max_nr_move == 0 || max_load_move == 0)
2088                 goto out;
2089
2090         pinned = 1;
2091
2092         /*
2093          * Start the load-balancing iterator:
2094          */
2095         p = iterator->start(iterator->arg);
2096 next:
2097         if (!p)
2098                 goto out;
2099         /*
2100          * To help distribute high priority tasks accross CPUs we don't
2101          * skip a task if it will be the highest priority task (i.e. smallest
2102          * prio value) on its new queue regardless of its load weight
2103          */
2104         skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
2105                                                          SCHED_LOAD_SCALE_FUZZ;
2106         if (skip_for_load && p->prio < this_best_prio)
2107                 skip_for_load = !best_prio_seen && p->prio == best_prio;
2108         if (skip_for_load ||
2109             !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2110
2111                 best_prio_seen |= p->prio == best_prio;
2112                 p = iterator->next(iterator->arg);
2113                 goto next;
2114         }
2115
2116         pull_task(busiest, p, this_rq, this_cpu);
2117         pulled++;
2118         rem_load_move -= p->se.load.weight;
2119
2120         /*
2121          * We only want to steal up to the prescribed number of tasks
2122          * and the prescribed amount of weighted load.
2123          */
2124         if (pulled < max_nr_move && rem_load_move > 0) {
2125                 if (p->prio < this_best_prio)
2126                         this_best_prio = p->prio;
2127                 p = iterator->next(iterator->arg);
2128                 goto next;
2129         }
2130 out:
2131         /*
2132          * Right now, this is the only place pull_task() is called,
2133          * so we can safely collect pull_task() stats here rather than
2134          * inside pull_task().
2135          */
2136         schedstat_add(sd, lb_gained[idle], pulled);
2137
2138         if (all_pinned)
2139                 *all_pinned = pinned;
2140         *load_moved = max_load_move - rem_load_move;
2141         return pulled;
2142 }
2143
2144 /*
2145  * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
2146  * load from busiest to this_rq, as part of a balancing operation within
2147  * "domain". Returns the number of tasks moved.
2148  *
2149  * Called with both runqueues locked.
2150  */
2151 static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2152                       unsigned long max_nr_move, unsigned long max_load_move,
2153                       struct sched_domain *sd, enum cpu_idle_type idle,
2154                       int *all_pinned)
2155 {
2156         struct sched_class *class = sched_class_highest;
2157         unsigned long load_moved, total_nr_moved = 0, nr_moved;
2158         long rem_load_move = max_load_move;
2159
2160         do {
2161                 nr_moved = class->load_balance(this_rq, this_cpu, busiest,
2162                                 max_nr_move, (unsigned long)rem_load_move,
2163                                 sd, idle, all_pinned, &load_moved);
2164                 total_nr_moved += nr_moved;
2165                 max_nr_move -= nr_moved;
2166                 rem_load_move -= load_moved;
2167                 class = class->next;
2168         } while (class && max_nr_move && rem_load_move > 0);
2169
2170         return total_nr_moved;
2171 }
2172
2173 /*
2174  * find_busiest_group finds and returns the busiest CPU group within the
2175  * domain. It calculates and returns the amount of weighted load which
2176  * should be moved to restore balance via the imbalance parameter.
      *
      * @sd:        domain whose circular list of groups (sd->groups) is walked
      * @this_cpu:  CPU we balance for; the group containing it is the
      *             "local" group
      * @imbalance: out: weighted load that should be moved; it is 0 whenever
      *             the function leaves through the ret label
      * @idle:      selects which of sd->busy_idx, sd->newidle_idx or
      *             sd->idle_idx is used as load index
      * @sd_idle:   in/out: cleared as soon as a non-idle CPU is looked at
      * @cpus:      only CPUs set in this mask are taken into account
      * @balance:   optional out: if non-NULL it is set to 0 when this_cpu is
      *             not the CPU chosen to balance on behalf of the local group
      *
      * Returns the group to pull load from, or NULL if nothing should be
      * pulled. With CONFIG_SCHED_MC or CONFIG_SCHED_SMT the balanced case may
      * instead return the least loaded non-idle group (group_min) for power
      * savings balancing.
2177  */
2178 static struct sched_group *
2179 find_busiest_group(struct sched_domain *sd, int this_cpu,
2180                    unsigned long *imbalance, enum cpu_idle_type idle,
2181                    int *sd_idle, cpumask_t *cpus, int *balance)
2182 {
2183         struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2184         unsigned long max_load, avg_load, total_load, this_load, total_pwr;
2185         unsigned long max_pull;
2186         unsigned long busiest_load_per_task, busiest_nr_running;
2187         unsigned long this_load_per_task, this_nr_running;
2188         int load_idx;
2189 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2190         int power_savings_balance = 1;
2191         unsigned long leader_nr_running = 0, min_load_per_task = 0;
2192         unsigned long min_nr_running = ULONG_MAX;
2193         struct sched_group *group_min = NULL, *group_leader = NULL;
2194 #endif
2195
2196         max_load = this_load = total_load = total_pwr = 0;
2197         busiest_load_per_task = busiest_nr_running = 0;
2198         this_load_per_task = this_nr_running = 0;
             /* Pick the load index that matches the idle state of this_cpu. */
2199         if (idle == CPU_NOT_IDLE)
2200                 load_idx = sd->busy_idx;
2201         else if (idle == CPU_NEWLY_IDLE)
2202                 load_idx = sd->newidle_idx;
2203         else
2204                 load_idx = sd->idle_idx;
2205
             /* Walk every group of the domain exactly once. */
2206         do {
2207                 unsigned long load, group_capacity;
2208                 int local_group;
2209                 int i;
2210                 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2211                 unsigned long sum_nr_running, sum_weighted_load;
2212
2213                 local_group = cpu_isset(this_cpu, group->cpumask);
2214
2215                 if (local_group)
2216                         balance_cpu = first_cpu(group->cpumask);
2217
2218                 /* Tally up the load of all CPUs in the group */
2219                 sum_weighted_load = sum_nr_running = avg_load = 0;
2220
2221                 for_each_cpu_mask(i, group->cpumask) {
2222                         struct rq *rq;
2223
2224                         if (!cpu_isset(i, *cpus))
2225                                 continue;
2226
2227                         rq = cpu_rq(i);
2228
2229                         if (*sd_idle && !idle_cpu(i))
2230                                 *sd_idle = 0;
2231
2232                         /* Bias balancing toward cpus of our domain */
2233                         if (local_group) {
                                     /* The first idle CPU found overrides the default balance_cpu. */
2234                                 if (idle_cpu(i) && !first_idle_cpu) {
2235                                         first_idle_cpu = 1;
2236                                         balance_cpu = i;
2237                                 }
2238
2239                                 load = target_load(i, load_idx);
2240                         } else
2241                                 load = source_load(i, load_idx);
2242
2243                         avg_load += load;
2244                         sum_nr_running += rq->nr_running;
2245                         sum_weighted_load += weighted_cpuload(i);
2246                 }
2247
2248                 /*
2249                  * First idle cpu or the first cpu(busiest) in this sched group
2250                  * is eligible for doing load balancing at this and above
2251                  * domains.
2252                  */
2253                 if (local_group && balance_cpu != this_cpu && balance) {
2254                         *balance = 0;
2255                         goto ret;
2256                 }
2257
2258                 total_load += avg_load;
2259                 total_pwr += group->__cpu_power;
2260
2261                 /* Adjust by relative CPU power of the group */
2262                 avg_load = sg_div_cpu_power(group,
2263                                 avg_load * SCHED_LOAD_SCALE);
2264
2265                 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
2266
                     /*
                      * Remember the local group, and the most loaded remote group
                      * among those running more tasks than their capacity. At this
                      * point the *_load_per_task variables still hold the summed
                      * weighted load; they are divided by nr_running further down.
                      */
2267                 if (local_group) {
2268                         this_load = avg_load;
2269                         this = group;
2270                         this_nr_running = sum_nr_running;
2271                         this_load_per_task = sum_weighted_load;
2272                 } else if (avg_load > max_load &&
2273                            sum_nr_running > group_capacity) {
2274                         max_load = avg_load;
2275                         busiest = group;
2276                         busiest_nr_running = sum_nr_running;
2277                         busiest_load_per_task = sum_weighted_load;
2278                 }
2279
2280 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2281                 /*
2282                  * Busy processors will not participate in power savings
2283                  * balance.
2284                  */
2285                 if (idle == CPU_NOT_IDLE ||
2286                                 !(sd->flags & SD_POWERSAVINGS_BALANCE))
2287                         goto group_next;
2288
2289                 /*
2290                  * If the local group is idle or completely loaded
2291                  * no need to do power savings balance at this domain
2292                  */
2293                 if (local_group && (this_nr_running >= group_capacity ||
2294                                     !this_nr_running))
2295                         power_savings_balance = 0;
2296
2297                 /*
2298                  * If a group is already running at full capacity or idle,
2299                  * don't include that group in power savings calculations
2300                  */
2301                 if (!power_savings_balance || sum_nr_running >= group_capacity
2302                     || !sum_nr_running)
2303                         goto group_next;
2304
2305                 /*
2306                  * Calculate the group which has the least non-idle load.
2307                  * This is the group from where we need to pick up the load
2308                  * for saving power
2309                  */
                     /*
                      * NOTE(review): group_min is only dereferenced when
                      * sum_nr_running == min_nr_running; min_nr_running starts at
                      * ULONG_MAX, so this presumably happens only after group_min
                      * has been assigned - confirm.
                      */
2310                 if ((sum_nr_running < min_nr_running) ||
2311                     (sum_nr_running == min_nr_running &&
2312                      first_cpu(group->cpumask) <
2313                      first_cpu(group_min->cpumask))) {
2314                         group_min = group;
2315                         min_nr_running = sum_nr_running;
2316                         min_load_per_task = sum_weighted_load /
2317                                                 sum_nr_running;
2318                 }
2319
2320                 /*
2321                  * Calculate the group which is almost near its
2322                  * capacity but still has some space to pick up some load
2323                  * from other group and save more power
2324                  */
2325                 if (sum_nr_running <= group_capacity - 1) {
2326                         if (sum_nr_running > leader_nr_running ||
2327                             (sum_nr_running == leader_nr_running &&
2328                              first_cpu(group->cpumask) >
2329                               first_cpu(group_leader->cpumask))) {
2330                                 group_leader = group;
2331                                 leader_nr_running = sum_nr_running;
2332                         }
2333                 }
2334 group_next:
2335 #endif
2336                 group = group->next;
2337         } while (group != sd->groups);
2338
             /*
              * Nothing to pull if no overloaded remote group was found or the
              * local group is at least as loaded as the busiest one.
              */
2339         if (!busiest || this_load >= max_load || busiest_nr_running == 0)
2340                 goto out_balanced;
2341
             /* NOTE(review): assumes total_pwr is non-zero here - confirm. */
2342         avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
2343
2344         if (this_load >= avg_load ||
2345                         100*max_load <= sd->imbalance_pct*this_load)
2346                 goto out_balanced;
2347
             /* Turn the summed weighted load into an average per task. */
2348         busiest_load_per_task /= busiest_nr_running;
2349         /*
2350          * We're trying to get all the cpus to the average_load, so we don't
2351          * want to push ourselves above the average load, nor do we wish to
2352          * reduce the max loaded cpu below the average load, as either of these
2353          * actions would just result in more rebalancing later, and ping-pong
2354          * tasks around. Thus we look for the minimum possible imbalance.
2355          * Negative imbalances (*we* are more loaded than anyone else) will
2356          * be counted as no imbalance for these purposes -- we can't fix that
2357          * by pulling tasks to us.  Be careful of negative numbers as they'll
2358          * appear as very large values with unsigned longs.
2359          */
2360         if (max_load <= busiest_load_per_task)
2361                 goto out_balanced;
2362
2363         /*
2364          * In the presence of smp nice balancing, certain scenarios can have
2365          * max load less than avg load(as we skip the groups at or below
2366          * its cpu_power, while calculating max_load..)
2367          */
2368         if (max_load < avg_load) {
2369                 *imbalance = 0;
2370                 goto small_imbalance;
2371         }
2372
2373         /* Don't want to pull so many tasks that a group would go idle */
2374         max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
2375
2376         /* How much load to actually move to equalise the imbalance */
             /*
              * NOTE(review): "this" is dereferenced from here on; this relies on
              * the local group having been seen in the loop above - confirm.
              */
2377         *imbalance = min(max_pull * busiest->__cpu_power,
2378                                 (avg_load - this_load) * this->__cpu_power)
2379                         / SCHED_LOAD_SCALE;
2380
2381         /*
2382          * if *imbalance is less than the average load per runnable task
2383          * there is no guarantee that any tasks will be moved so we'll have
2384          * a think about bumping its value to force at least one task to be
2385          * moved
2386          */
2387         if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) {
2388                 unsigned long tmp, pwr_now, pwr_move;
2389                 unsigned int imbn;
2390
                     /* Also entered directly by the goto in the max_load < avg_load case. */
2391 small_imbalance:
2392                 pwr_move = pwr_now = 0;
2393                 imbn = 2;
2394                 if (this_nr_running) {
2395                         this_load_per_task /= this_nr_running;
2396                         if (busiest_load_per_task > this_load_per_task)
2397                                 imbn = 1;
2398                 } else
2399                         this_load_per_task = SCHED_LOAD_SCALE;
2400
2401                 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
2402                                         busiest_load_per_task * imbn) {
2403                         *imbalance = busiest_load_per_task;
2404                         return busiest;
2405                 }
2406
2407                 /*
2408                  * OK, we don't have enough imbalance to justify moving tasks,
2409                  * however we may be able to increase total CPU power used by
2410                  * moving them.
2411                  */
2412
2413                 pwr_now += busiest->__cpu_power *
2414                                 min(busiest_load_per_task, max_load);
2415                 pwr_now += this->__cpu_power *
2416                                 min(this_load_per_task, this_load);
2417                 pwr_now /= SCHED_LOAD_SCALE;
2418
2419                 /* Amount of load we'd subtract */
2420                 tmp = sg_div_cpu_power(busiest,
2421                                 busiest_load_per_task * SCHED_LOAD_SCALE);
2422                 if (max_load > tmp)
2423                         pwr_move += busiest->__cpu_power *
2424                                 min(busiest_load_per_task, max_load - tmp);
2425
2426                 /* Amount of load we'd add */
2427                 if (max_load * busiest->__cpu_power <
2428                                 busiest_load_per_task * SCHED_LOAD_SCALE)
2429                         tmp = sg_div_cpu_power(this,
2430                                         max_load * busiest->__cpu_power);
2431                 else
2432                         tmp = sg_div_cpu_power(this,
2433                                 busiest_load_per_task * SCHED_LOAD_SCALE);
2434                 pwr_move += this->__cpu_power *
2435                                 min(this_load_per_task, this_load + tmp);
2436                 pwr_move /= SCHED_LOAD_SCALE;
2437
2438                 /* Move if we gain throughput */
2439                 if (pwr_move <= pwr_now)
2440                         goto out_balanced;
2441
2442                 *imbalance = busiest_load_per_task;
2443         }
2444
2445         return busiest;
2446
2447 out_balanced:
2448 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2449         if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2450                 goto ret;
2451
             /*
              * Power savings: if the local group is the chosen leader, pull one
              * average task's worth of load from the least loaded group.
              */
2452         if (this == group_leader && group_leader != group_min) {
2453                 *imbalance = min_load_per_task;
2454                 return group_min;
2455         }
2456 #endif
2457 ret:
2458         *imbalance = 0;
2459         return NULL;
2460 }
2461
2462 /*
2463  * find_busiest_queue - find the busiest runqueue among the cpus in group.
2464  */
2465 static struct rq *
2466 find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2467                    unsigned long imbalance, cpumask_t *cpus)
2468 {
2469         struct rq *busiest = NULL, *rq;
2470         unsigned long max_load = 0;
2471         int i;
2472
2473         for_each_cpu_mask(i, group->cpumask) {
2474                 unsigned long wl;
2475
2476                 if (!cpu_isset(i, *cpus))
2477                         continue;
2478
2479                 rq = cpu_rq(i);
2480                 wl = weighted_cpuload(i);
2481
2482                 if (rq->nr_running == 1 && wl > imbalance)
2483                         continue;
2484
2485                 if (wl > max_load) {
2486                         max_load = wl;
2487                         busiest = rq;
2488                 }
2489         }
2490
2491         return busiest;
2492 }
2493
2494 /*
2495  * Upper bound for the balance-interval backoff used when pinned tasks are
2496  * encountered. The value is fairly arbitrary; it only has to be large enough.
2497  */
2498 #define MAX_PINNED_INTERVAL     512
2499
/* Return n - 1, except that 0 stays 0 instead of wrapping around. */
static inline unsigned long minus_1_or_zero(unsigned long n)
{
        if (n == 0)
                return 0;

        return n - 1;
}
2504
2505 /*
2506  * Check this_cpu to ensure it is balanced within domain. Attempt to move
2507  * tasks if there is an imbalance.
      *
      * @this_cpu: CPU to balance for
      * @this_rq:  runqueue that tasks are pulled to
      * @sd:       domain to balance within; its balance_interval and
      *            nr_balance_failed fields are updated here
      * @idle:     idle state of this_cpu, also used as index for the
      *            schedstat counters
      * @balance:  out: left at 0 by find_busiest_group() when this_cpu is
      *            not the CPU that should balance this domain
      *
      * Returns the number of tasks moved, or 0 if the domain was found to be
      * balanced. -1 is returned instead when nothing was moved, sd_idle is
      * clear, the domain has SD_SHARE_CPUPOWER set and its parent does not
      * have SD_POWERSAVINGS_BALANCE set.
      *
      * NOTE(review): *balance is dereferenced unconditionally below although
      * find_busiest_group() accepts a NULL balance pointer - callers
      * presumably always pass a valid pointer; confirm.
2508  */
2509 static int load_balance(int this_cpu, struct rq *this_rq,
2510                         struct sched_domain *sd, enum cpu_idle_type idle,
2511                         int *balance)
2512 {
2513         int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2514         struct sched_group *group;
2515         unsigned long imbalance;
2516         struct rq *busiest;
2517         cpumask_t cpus = CPU_MASK_ALL;
2518         unsigned long flags;
2519
2520         /*
2521          * When power savings policy is enabled for the parent domain, idle
2522          * sibling can pick up load irrespective of busy siblings. In this case,
2523          * let the state of idle sibling percolate up as CPU_IDLE, instead of
2524          * portraying it as CPU_NOT_IDLE.
2525          */
2526         if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2527             !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2528                 sd_idle = 1;
2529
2530         schedstat_inc(sd, lb_cnt[idle]);
2531
             /*
              * Re-entered with one more CPU removed from "cpus" each time the
              * chosen busiest runqueue turns out to hold only pinned tasks.
              */
2532 redo:
2533         group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2534                                    &cpus, balance);
2535
2536         if (*balance == 0)
2537                 goto out_balanced;
2538
2539         if (!group) {
2540                 schedstat_inc(sd, lb_nobusyg[idle]);
2541                 goto out_balanced;
2542         }
2543
2544         busiest = find_busiest_queue(group, idle, imbalance, &cpus);
2545         if (!busiest) {
2546                 schedstat_inc(sd, lb_nobusyq[idle]);
2547                 goto out_balanced;
2548         }
2549
2550         BUG_ON(busiest == this_rq);
2551
2552         schedstat_add(sd, lb_imbalance[idle], imbalance);
2553
2554         nr_moved = 0;
2555         if (busiest->nr_running > 1) {
2556                 /*
2557                  * Attempt to move tasks. If find_busiest_group has found
2558                  * an imbalance but busiest->nr_running <= 1, the group is
2559                  * still unbalanced. nr_moved simply stays zero, so it is
2560                  * correctly treated as an imbalance.
2561                  */
                     /* double_rq_lock() requires interrupts to be off already. */
2562                 local_irq_save(flags);
2563                 double_rq_lock(this_rq, busiest);
2564                 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2565                                       minus_1_or_zero(busiest->nr_running),
2566                                       imbalance, sd, idle, &all_pinned);
2567                 double_rq_unlock(this_rq, busiest);
2568                 local_irq_restore(flags);
2569
2570                 /*
2571                  * some other cpu did the load balance for us.
2572                  */
2573                 if (nr_moved && this_cpu != smp_processor_id())
2574                         resched_cpu(this_cpu);
2575
2576                 /* All tasks on this runqueue were pinned by CPU affinity */
2577                 if (unlikely(all_pinned)) {
2578                         cpu_clear(cpu_of(busiest), cpus);
2579                         if (!cpus_empty(cpus))
2580                                 goto redo;
2581                         goto out_balanced;
2582                 }
2583         }
2584
2585         if (!nr_moved) {
2586                 schedstat_inc(sd, lb_failed[idle]);
2587                 sd->nr_balance_failed++;
2588
                     /*
                      * After more than cache_nice_tries+2 failed attempts, ask the
                      * migration thread of the busiest CPU to push a task to us.
                      */
2589                 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
2590
2591                         spin_lock_irqsave(&busiest->lock, flags);
2592
2593                         /* don't kick the migration_thread, if the curr
2594                          * task on busiest cpu can't be moved to this_cpu
2595                          */
2596                         if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
2597                                 spin_unlock_irqrestore(&busiest->lock, flags);
2598                                 all_pinned = 1;
2599                                 goto out_one_pinned;
2600                         }
2601
                             /* Only one active balance request per runqueue at a time. */
2602                         if (!busiest->active_balance) {
2603                                 busiest->active_balance = 1;
2604                                 busiest->push_cpu = this_cpu;
2605                                 active_balance = 1;
2606                         }
2607                         spin_unlock_irqrestore(&busiest->lock, flags);
2608                         if (active_balance)
2609                                 wake_up_process(busiest->migration_thread);
2610
2611                         /*
2612                          * We've kicked active balancing, reset the failure
2613                          * counter.
2614                          */
2615                         sd->nr_balance_failed = sd->cache_nice_tries+1;
2616                 }
2617         } else
2618                 sd->nr_balance_failed = 0;
2619
2620         if (likely(!active_balance)) {
2621                 /* We were unbalanced, so reset the balancing interval */
2622                 sd->balance_interval = sd->min_interval;
2623         } else {
2624                 /*
2625                  * If we've begun active balancing, start to back off. This
2626                  * case may not be covered by the all_pinned logic if there
2627                  * is only 1 task on the busy runqueue (because we don't call
2628                  * move_tasks).
2629                  */
2630                 if (sd->balance_interval < sd->max_interval)
2631                         sd->balance_interval *= 2;
2632         }
2633
2634         if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2635             !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2636                 return -1;
2637         return nr_moved;
2638
2639 out_balanced:
2640         schedstat_inc(sd, lb_balanced[idle]);
2641
2642         sd->nr_balance_failed = 0;
2643
2644 out_one_pinned:
2645         /* tune up the balancing interval */
             /*
              * The interval is doubled while it is below max_interval, or below
              * MAX_PINNED_INTERVAL when pinned tasks were encountered.
              */
2646         if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
2647                         (sd->balance_interval < sd->max_interval))
2648                 sd->balance_interval *= 2;
2649
2650         if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2651             !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2652                 return -1;
2653         return 0;
2654 }
2655
2656 /*
2657  * Check this_cpu to ensure it is balanced within domain. Attempt to move
2658  * tasks if there is an imbalance.
2659  *
2660  * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
2661  * this_rq is locked.
      *
      * Returns the non-zero result of move_tasks() when tasks were pulled, and
      * 0 when nothing was moved or no busier group/runqueue was found.  In the
      * "nothing moved" and "balanced" cases -1 is returned instead of 0 when
      * sd_idle is 0, sd has SD_SHARE_CPUPOWER set and
      * test_sd_parent(sd, SD_POWERSAVINGS_BALANCE) is false.
2662  */
2663 static int
2664 load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2665 {
2666         struct sched_group *group;
2667         struct rq *busiest = NULL;
2668         unsigned long imbalance;
2669         int nr_moved = 0;
2670         int sd_idle = 0;
2671         cpumask_t cpus = CPU_MASK_ALL; /* candidates; failed cpus get cleared */
2672
2673         /*
2674          * When power savings policy is enabled for the parent domain, idle
2675          * sibling can pick up load irrespective of busy siblings. In this case,
2676          * let the state of idle sibling percolate up as IDLE, instead of
2677          * portraying it as CPU_NOT_IDLE.
2678          */
2679         if (sd->flags & SD_SHARE_CPUPOWER &&
2680             !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2681                 sd_idle = 1;
2682
2683         schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]);
2684 redo:
             /* sd_idle and cpus are passed by address to the search helper */
2685         group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
2686                                    &sd_idle, &cpus, NULL);
2687         if (!group) {
2688                 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
2689                 goto out_balanced;
2690         }
2691
2692         busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance,
2693                                 &cpus);
2694         if (!busiest) {
2695                 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
2696                 goto out_balanced;
2697         }
2698
2699         BUG_ON(busiest == this_rq);
2700
2701         schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
2702
2703         nr_moved = 0;
2704         if (busiest->nr_running > 1) {
2705                 /* Attempt to move tasks */
2706                 double_lock_balance(this_rq, busiest);
2707                 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2708                                         minus_1_or_zero(busiest->nr_running),
2709                                         imbalance, sd, CPU_NEWLY_IDLE, NULL);
2710                 spin_unlock(&busiest->lock); /* only busiest is unlocked here */
2711
                     /*
                      * Nothing could be pulled from this runqueue: drop its cpu
                      * from the candidate mask and retry while any cpu is left.
                      */
2712                 if (!nr_moved) {
2713                         cpu_clear(cpu_of(busiest), cpus);
2714                         if (!cpus_empty(cpus))
2715                                 goto redo;
2716                 }
2717         }
2718
2719         if (!nr_moved) {
2720                 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
2721                 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2722                     !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2723                         return -1;
2724         } else
2725                 sd->nr_balance_failed = 0; /* success resets the failure count */
2726
2727         return nr_moved;
2728
2729 out_balanced:
2730         schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
             /* note: the -1 return leaves sd->nr_balance_failed untouched */
2731         if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2732             !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2733                 return -1;
2734         sd->nr_balance_failed = 0;
2735
2736         return 0;
2737 }
2738
2739 /*
2740  * idle_balance is called by schedule() if this_cpu is about to become
2741  * idle. Attempts to pull tasks from other CPUs.
      *
      * Walks the domains of this_cpu that have SD_LOAD_BALANCE set, calling
      * load_balance_newidle() on those that also have SD_BALANCE_NEWIDLE, and
      * tracks the earliest "last_balance + balance_interval" seen (capped at
      * jiffies + HZ) as the candidate for this_rq->next_balance.
2742  */
2743 static void idle_balance(int this_cpu, struct rq *this_rq)
2744 {
2745         struct sched_domain *sd;
2746         int pulled_task = -1;
2747         unsigned long next_balance = jiffies + HZ;
2748
2749         for_each_domain(this_cpu, sd) {
2750                 unsigned long interval;
2751
2752                 if (!(sd->flags & SD_LOAD_BALANCE))
2753                         continue;
2754
2755                 if (sd->flags & SD_BALANCE_NEWIDLE)
2756                         /* If we've pulled tasks over stop searching: */
2757                         pulled_task = load_balance_newidle(this_cpu,
2758                                                                 this_rq, sd);
2759
2760                 interval = msecs_to_jiffies(sd->balance_interval);
2761                 if (time_after(next_balance, sd->last_balance + interval))
2762                         next_balance = sd->last_balance + interval;
                     /*
                      * Any non-zero value stops the walk, including -1.
                      * NOTE(review): pulled_task starts at -1, so the walk
                      * also stops after the first SD_LOAD_BALANCE domain that
                      * lacks SD_BALANCE_NEWIDLE - confirm this is intended.
                      */
2763                 if (pulled_task)
2764                         break;
2765         }
2766         if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
2767                 /*
2768                  * We are going idle. next_balance may be set based on
2769                  * a busy processor. So reset next_balance.
2770                  */
2771                 this_rq->next_balance = next_balance;
2772         }
2773 }
2774
2775 /*
2776  * active_load_balance is run by migration threads. It pushes running tasks
2777  * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
2778  * running on each physical CPU where possible, and avoids physical /
2779  * logical imbalances.
2780  *
2781  * Called with busiest_rq locked.
      *
      * The destination cpu is read from busiest_rq->push_cpu.  The target
      * runqueue lock is taken via double_lock_balance() and released before
      * returning; busiest_rq->lock is not released by this function.
2782  */
2783 static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
2784 {
2785         int target_cpu = busiest_rq->push_cpu;
2786         struct sched_domain *sd;
2787         struct rq *target_rq;
2788
2789         /* Is there any task to move? */
2790         if (busiest_rq->nr_running <= 1)
2791                 return;
2792
2793         target_rq = cpu_rq(target_cpu);
2794
2795         /*
2796          * This condition is "impossible", if it occurs
2797          * we need to fix it.  Originally reported by
2798          * Bjorn Helgaas on a 128-cpu setup.
2799          */
2800         BUG_ON(busiest_rq == target_rq);
2801
2802         /* move a task from busiest_rq to target_rq */
2803         double_lock_balance(busiest_rq, target_rq);
2804
2805         /* Search for an sd spanning us and the target CPU. */
2806         for_each_domain(target_cpu, sd) {
2807                 if ((sd->flags & SD_LOAD_BALANCE) &&
2808                     cpu_isset(busiest_cpu, sd->span))
2809                                 break;
2810         }
2811
             /* sd is tested below, so no matching domain means no push attempt */
2812         if (likely(sd)) {
2813                 schedstat_inc(sd, alb_cnt);
2814
                     /*
                      * Fourth argument is 1: presumably the max number of
                      * tasks to move - TODO confirm against move_tasks().
                      */
2815                 if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
2816                                RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE,
2817                                NULL))
2818                         schedstat_inc(sd, alb_pushed);
2819                 else
2820                         schedstat_inc(sd, alb_failed);
2821         }
2822         spin_unlock(&target_rq->lock);
2823 }
2824
2825 #ifdef CONFIG_NO_HZ
2826 static struct {
2827         atomic_t load_balancer; /* cpu id of the ilb owner, -1 when none */
2828         cpumask_t  cpu_mask;    /* cpus that registered via select_nohz_load_balancer(1) */
2829 } nohz ____cacheline_aligned = {
2830         .load_balancer = ATOMIC_INIT(-1),
2831         .cpu_mask = CPU_MASK_NONE,
2832 };
2833
2834 /*
2835  * This routine will try to nominate the ilb (idle load balancing)
2836  * owner among the cpus whose ticks are stopped. ilb owner will do the idle
2837  * load balancing on behalf of all those cpus. If all the cpus in the system
2838  * go into this tickless mode, then there will be no ilb owner (as there is
2839  * no need for one) and all the cpus will sleep till the next wakeup event
2840  * arrives...
2841  *
2842  * For the ilb owner, tick is not stopped. And this tick will be used
2843  * for idle load balancing. ilb owner will still be part of
2844  * nohz.cpu_mask..
2845  *
2846  * While stopping the tick, this cpu will become the ilb owner if there
2847  * is no other owner. And will be the owner till that cpu becomes busy
2848  * or if all cpus in the system stop their ticks at which point
2849  * there is no need for ilb owner.
2850  *
2851  * When the ilb owner becomes busy, it nominates another owner, during the
2852  * next busy scheduler_tick()
      *
      * @stop_tick: non-zero when the calling cpu is stopping its tick (the cpu
      *             is added to nohz.cpu_mask); zero when it leaves that state
      *             (the cpu is removed from the mask and gives up ownership).
      *
      * Returns 1 when @stop_tick is set and this cpu is, or has just become,
      * the ilb owner; returns 0 in every other case.
2853  */
2854 int select_nohz_load_balancer(int stop_tick)
2855 {
2856         int cpu = smp_processor_id();
2857
2858         if (stop_tick) {
2859                 cpu_set(cpu, nohz.cpu_mask);
2860                 cpu_rq(cpu)->in_nohz_recently = 1;
2861
2862                 /*
2863                  * If we are going offline and still the leader, give up!
2864                  */
2865                 if (cpu_is_offline(cpu) &&
2866                     atomic_read(&nohz.load_balancer) == cpu) {
2867                         if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2868                                 BUG();
2869                         return 0;
2870                 }
2871
2872                 /* time for ilb owner also to sleep */
2873                 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
2874                         if (atomic_read(&nohz.load_balancer) == cpu)
2875                                 atomic_set(&nohz.load_balancer, -1);
2876                         return 0;
2877                 }
2878
2879                 if (atomic_read(&nohz.load_balancer) == -1) {
2880                         /* make me the ilb owner */
                             /* a lost cmpxchg race falls through to return 0 */
2881                         if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
2882                                 return 1;
2883                 } else if (atomic_read(&nohz.load_balancer) == cpu)
2884                         return 1;
2885         } else {
                     /* nothing to undo if this cpu never entered the mask */
2886                 if (!cpu_isset(cpu, nohz.cpu_mask))
2887                         return 0;
2888
2889                 cpu_clear(cpu, nohz.cpu_mask);
2890
2891                 if (atomic_read(&nohz.load_balancer) == cpu)
2892                         if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2893                                 BUG();
2894         }
2895         return 0;
2896 }
2897 #endif
2898
2899 static DEFINE_SPINLOCK(balancing); /* trylocked by rebalance_domains() for SD_SERIALIZE domains */
2900
2901 /*
2902  * It checks each scheduling domain to see if it is due to be balanced,
2903  * and initiates a balancing operation if so.
2904  *
2905  * Balancing parameters are set up in arch_init_sched_domains.
      *
      * @cpu:  cpu whose domains are walked; its runqueue's next_balance is
      *        updated on return.
      * @idle: idle state handed to load_balance(); any value other than
      *        CPU_IDLE multiplies the balance interval by sd->busy_factor.
2906  */
2907 static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
2908 {
2909         int balance = 1;
2910         struct rq *rq = cpu_rq(cpu);
2911         unsigned long interval;
2912         struct sched_domain *sd;
2913         /* Earliest time when we have to do rebalance again */
2914         unsigned long next_balance = jiffies + 60*HZ;
2915
2916         for_each_domain(cpu, sd) {
2917                 if (!(sd->flags & SD_LOAD_BALANCE))
2918                         continue;
2919
2920                 interval = sd->balance_interval;
2921                 if (idle != CPU_IDLE)
2922                         interval *= sd->busy_factor;
2923
2924                 /* scale ms to jiffies */
2925                 interval = msecs_to_jiffies(interval);
                     /* clamp the interval to [1, HZ*NR_CPUS/10] jiffies */
2926                 if (unlikely(!interval))
2927                         interval = 1;
2928                 if (interval > HZ*NR_CPUS/10)
2929                         interval = HZ*NR_CPUS/10;
2930
2931
                     /*
                      * SD_SERIALIZE domains share the global "balancing" lock;
                      * if it is busy, skip balancing this domain for now but
                      * still account for its next due time at "out".
                      */
2932                 if (sd->flags & SD_SERIALIZE) {
2933                         if (!spin_trylock(&balancing))
2934                                 goto out;
2935                 }
2936
2937                 if (time_after_eq(jiffies, sd->last_balance + interval)) {
2938                         if (load_balance(cpu, rq, sd, idle, &balance)) {
2939                                 /*
2940                                  * We've pulled tasks over so either we're no
2941                                  * longer idle, or one of our SMT siblings is
2942                                  * not idle.
2943                                  */
2944                                 idle = CPU_NOT_IDLE;
2945                         }
2946                         sd->last_balance = jiffies;
2947                 }
2948                 if (sd->flags & SD_SERIALIZE)
2949                         spin_unlock(&balancing);
2950 out:
2951                 if (time_after(next_balance, sd->last_balance + interval))
2952                         next_balance = sd->last_balance + interval;
2953
2954                 /*
2955                  * Stop the load balance at this level. There is another
2956                  * CPU in our sched group which is doing load balancing more
2957                  * actively.
2958                  */
                     /* balance is passed by address to load_balance() above */
2959                 if (!balance)
2960                         break;
2961         }
2962         rq->next_balance = next_balance;
2963 }
2964
2965 /*
2966  * run_rebalance_domains is triggered when needed from the scheduler tick.
2967  * In CONFIG_NO_HZ case, the idle load balance owner will do the
2968  * rebalancing for all the cpus for whom scheduler ticks are stopped.
      *
      * @h is not used.  The idle state for this cpu is derived from
      * this_rq->idle_at_tick: CPU_IDLE when set, CPU_NOT_IDLE otherwise.
2969  */
2970 static void run_rebalance_domains(struct softirq_action *h)
2971 {
2972         int this_cpu = smp_processor_id();
2973         struct rq *this_rq = cpu_rq(this_cpu);
2974         enum cpu_idle_type idle = this_rq->idle_at_tick ?
2975                                                 CPU_IDLE : CPU_NOT_IDLE;
2976
2977         rebalance_domains(this_cpu, idle);
2978
2979 #ifdef CONFIG_NO_HZ
2980         /*
2981          * If this cpu is the owner for idle load balancing, then do the
2982          * balancing on behalf of the other idle cpus whose ticks are
2983          * stopped.
2984          */
2985         if (this_rq->idle_at_tick &&
2986             atomic_read(&nohz.load_balancer) == this_cpu) {
2987                 cpumask_t cpus = nohz.cpu_mask;
2988                 struct rq *rq;
2989                 int balance_cpu;
2990
                     /* this cpu was already balanced above */
2991                 cpu_clear(this_cpu, cpus);
2992                 for_each_cpu_mask(balance_cpu, cpus) {
2993                         /*
2994                          * If this cpu gets work to do, stop the load balancing
2995                          * work being done for other cpus. Next load
2996                          * balancing owner will pick it up.
2997                          */
2998                         if (need_resched())
2999                                 break;
3000
                             /*
                              * Balance on behalf of an idle cpu: pass the
                              * cpu_idle_type value CPU_IDLE.  (SCHED_IDLE is a
                              * scheduling policy constant, not an idle type.)
                              */
3001                         rebalance_domains(balance_cpu, CPU_IDLE);
3002
                             /* keep our next_balance no later than the remote one */
3003                         rq = cpu_rq(balance_cpu);
3004                         if (time_after(this_rq->next_balance, rq->next_balance))
3005                                 this_rq->next_balance = rq->next_balance;
3006                 }
3007         }
3008 #endif
3009 }
3010
3011 /*
3012  * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3013  *
3014  * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3015  * idle load balancing owner or decide to stop the periodic load balancing,
3016  * if the whole system is idle.
      *
      * @rq:  runqueue of @cpu; its idle_at_tick, in_nohz_recently and
      *       next_balance fields are consulted.
      * @cpu: cpu on whose behalf the check is made.
3017  */
3018 static inline void trigger_load_balance(struct rq *rq, int cpu)
3019 {
3020 #ifdef CONFIG_NO_HZ
3021         /*
3022          * If we were in the nohz mode recently and busy at the current
3023          * scheduler tick, then check if we need to nominate new idle
3024          * load balancer.
3025          */
3026         if (rq->in_nohz_recently && !rq->idle_at_tick) {
3027                 rq->in_nohz_recently = 0;
3028
                     /* a busy owner resigns and leaves the nohz mask */
3029                 if (atomic_read(&nohz.load_balancer) == cpu) {
3030                         cpu_clear(cpu, nohz.cpu_mask);
3031                         atomic_set(&nohz.load_balancer, -1);
3032                 }
3033
3034                 if (atomic_read(&nohz.load_balancer) == -1) {
3035                         /*
3036                          * simple selection for now: Nominate the
3037                          * first cpu in the nohz list to be the next
3038                          * ilb owner.
3039                          *
3040                          * TBD: Traverse the sched domains and nominate
3041                          * the nearest cpu in the nohz.cpu_mask.
3042                          */
3043                         int ilb = first_cpu(nohz.cpu_mask);
3044
                             /* ilb == NR_CPUS is treated as "no cpu in the mask" */
3045                         if (ilb != NR_CPUS)
3046                                 resched_cpu(ilb);
3047                 }
3048         }
3049
3050         /*
3051          * If this cpu is idle and doing idle load balancing for all the
3052          * cpus with ticks stopped, is it time for that to stop?
3053          */
3054         if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3055             cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
3056                 resched_cpu(cpu);
3057                 return;
3058         }
3059
3060         /*
3061          * If this cpu is idle and the idle load balancing is done by
3062          * someone else, then no need raise the SCHED_SOFTIRQ
3063          */
3064         if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3065             cpu_isset(cpu, nohz.cpu_mask))
3066                 return;
3067 #endif
             /* periodic balancing is due once jiffies reaches rq->next_balance */
3068         if (time_after_eq(jiffies, rq->next_balance))
3069                 raise_softirq(SCHED_SOFTIRQ);
3070 }
3071
3072 #else   /* CONFIG_SMP */
3073
3074 /*
3075  * on UP we do not need to balance between CPUs, so idle_balance() is an
      * empty stub that ignores both of its arguments:
3076  */
3077 static inline void idle_balance(int cpu, struct rq *rq)
3078 {
3079 }
3080
3081 /* Avoid "used but not defined" warning on UP */
     /*
      * UP stub: stores 0 in *load_moved and returns 0; every other argument
      * is ignored.
      */
3082 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3083                       unsigned long max_nr_move, unsigned long max_load_move,
3084                       struct sched_domain *sd, enum cpu_idle_type idle,
3085                       int *all_pinned, unsigned long *load_moved,
3086                       int this_best_prio, int best_prio, int best_prio_seen,
3087                       struct rq_iterator *iterator)
3088 {
3089         *load_moved = 0;
3090
3091         return 0;
3092 }
3093
3094 #endif
3095
3096 DEFINE_PER_CPU(struct kernel_stat, kstat); /* per-cpu kernel statistics */
3097
3098 EXPORT_PER_CPU_SYMBOL(kstat);
3099
3100 /*
3101  * Return p->sum_exec_runtime plus any more ns on the sched_clock
3102  * that have not yet been banked in case the task is currently running.
      *
      * The value is sampled with p's runqueue locked via task_rq_lock().
3103  */
3104 unsigned long long task_sched_runtime(struct task_struct *p)
3105 {
3106         unsigned long flags;
3107         u64 ns, delta_exec;
3108         struct rq *rq;
3109
3110         rq = task_rq_lock(p, &flags);
3111         ns = p->se.sum_exec_runtime;
             /* only the task currently running on rq has unbanked time */
3112         if (rq->curr == p) {
3113                 delta_exec = rq_clock(rq) - p->se.exec_start;
                     /* ignore a delta that is zero or negative when read as s64 */
3114                 if ((s64)delta_exec > 0)
3115                         ns += delta_exec;
3116         }
3117         task_rq_unlock(rq, &flags);
3118
3119         return ns;
3120 }
3121
3122 /*
3123  * Account user cpu time to a process.
3124  * @p: the process that the cpu time gets accounted to
3125  * @cputime: the cpu time spent in user space since the last update
3126  *
      * Adds @cputime to p->utime and to this cpu's cpustat: the "nice" field
      * when TASK_NICE(p) > 0, the "user" field otherwise.
3127  */
3128 void account_user_time(struct task_struct *p, cputime_t cputime)
3129 {
3130         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3131         cputime64_t tmp;
3132
3133         p->utime = cputime_add(p->utime, cputime);
3134
3135         /* Add user time to cpustat. */
3136         tmp = cputime_to_cputime64(cputime);
3137         if (TASK_NICE(p) > 0)
3138                 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3139         else
3140                 cpustat->user = cputime64_add(cpustat->user, tmp);
3141 }
3142
3143 /*
3144  * Account system cpu time to a process.
3145  * @p: the process that the cpu time gets accounted to
3146  * @hardirq_offset: the offset to subtract from hardirq_count()
3147  * @cputime: the cpu time spent in kernel space since the last update
      *
      * Adds @cputime to p->stime and to exactly one field of this cpu's
      * cpustat, chosen in this order: irq, softirq, system, iowait, idle.
3148  */
3149 void account_system_time(struct task_struct *p, int hardirq_offset,
3150                          cputime_t cputime)
3151 {
3152         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3153         struct rq *rq = this_rq();
3154         cputime64_t tmp;
3155
3156         p->stime = cputime_add(p->stime, cputime);
3157
3158         /* Add system time to cpustat. */
3159         tmp = cputime_to_cputime64(cputime);
3160         if (hardirq_count() - hardirq_offset)
3161                 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3162         else if (softirq_count())
3163                 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3164         else if (p != rq->idle)
3165                 cpustat->system = cputime64_add(cpustat->system, tmp);
             /* p is the idle task from here on: split between iowait and idle */
3166         else if (atomic_read(&rq->nr_iowait) > 0)
3167                 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3168         else
3169                 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3170         /* Account for system time used */
3171         acct_update_integrals(p);
3172 }
3173
3174 /*
3175  * Account for involuntary wait time.
3176  * @p: the process from which the cpu time has been stolen
3177  * @steal: the cpu time spent in involuntary wait
      *
      * When @p is this runqueue's idle task the time is added to p->stime and
      * to cpustat iowait (if rq->nr_iowait > 0) or idle; for any other task it
      * is added to cpustat steal only.
3178  */
3179 void account_steal_time(struct task_struct *p, cputime_t steal)
3180 {
3181         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3182         cputime64_t tmp = cputime_to_cputime64(steal);
3183         struct rq *rq = this_rq();
3184
3185         if (p == rq->idle) {
3186                 p->stime = cputime_add(p->stime, steal);
3187                 if (atomic_read(&rq->nr_iowait) > 0)
3188                         cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3189                 else
3190                         cpustat->idle = cputime64_add(cpustat->idle, tmp);
3191         } else
3192                 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3193 }
3194
3195 /*
3196  * This function gets called by the timer code, with HZ frequency.
3197  * We call it with interrupts disabled.
3198  *
3199  * It also gets called by the fork code, when changing the parent's
3200  * timeslices.
      *
      * Under rq->lock it runs the current task's sched_class->task_tick()
      * (skipped for the idle task) and update_cpu_load().  On SMP it then
      * records idle_at_tick and calls trigger_load_balance() after the lock
      * has been dropped.
3201  */
3202 void scheduler_tick(void)
3203 {
3204         int cpu = smp_processor_id();
3205         struct rq *rq = cpu_rq(cpu);
3206         struct task_struct *curr = rq->curr; /* sampled before taking rq->lock */
3207
3208         spin_lock(&rq->lock);
3209         if (curr != rq->idle) /* FIXME: needed? */
3210                 curr->sched_class->task_tick(rq, curr);
3211         update_cpu_load(rq);
3212         spin_unlock(&rq->lock);
3213
3214 #ifdef CONFIG_SMP
3215         rq->idle_at_tick = idle_cpu(cpu);
3216         trigger_load_balance(rq, cpu);
3217 #endif
3218 }
3219
3220 #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
3221
3222 void fastcall add_preempt_count(int val)
3223 {
3224         /*
              * Out-of-line add_preempt_count(), built only when both
              * CONFIG_PREEMPT and CONFIG_DEBUG_PREEMPT are defined: adds @val
              * to preempt_count() with sanity checks around the update.
              *
3225          * Underflow?
              * Bail out without touching the count if it is already negative.
3226          */
3227         if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3228                 return;
3229         preempt_count() += val;
3230         /*
3231          * Spinlock count overflowing soon?
              * Checked after the update: trips once the PREEMPT_MASK bits are
              * within 10 of PREEMPT_MASK.
3232          */
3233         DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3234                                 PREEMPT_MASK - 10);
3235 }
3236 EXPORT_SYMBOL(add_preempt_count);
3237
3238 void fastcall sub_preempt_count(int val)
3239 {
3240         /*
              * Out-of-line sub_preempt_count(), built only when both
              * CONFIG_PREEMPT and CONFIG_DEBUG_PREEMPT are defined: subtracts
              * @val from preempt_count() unless one of the checks below trips,
              * in which case the count is left unchanged.
              *
3241          * Underflow?
3242          */
3243         if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3244                 return;
3245         /*
3246          * Is the spinlock portion underflowing?
              * i.e. @val is below PREEMPT_MASK while the PREEMPT_MASK bits of
              * the count are already all zero.
3247          */
3248         if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3249                         !(preempt_count() & PREEMPT_MASK)))
3250                 return;
3251
3252         preempt_count() -= val;
3253 }
3254 EXPORT_SYMBOL(sub_preempt_count);
3255
3256 #endif
3257
3258 /*
3259  * Print scheduling while atomic bug:
      *
      * Logs @prev's comm, the current preempt_count() and @prev's pid, shows
      * the locks held by @prev, prints irq trace events when interrupts are
      * disabled, and dumps the stack.
3260  */
3261 static noinline void __schedule_bug(struct task_struct *prev)
3262 {
3263         printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n",
3264                 prev->comm, preempt_count(), prev->pid);
3265         debug_show_held_locks(prev);
3266         if (irqs_disabled())
3267                 print_irqtrace_events(prev);
3268         dump_stack();
3269 }
3270
3271 /*
3272  * Various schedule()-time debugging checks and statistics:
      *
      * @prev: the task that is entering schedule().
3273  */
3274 static inline void schedule_debug(struct task_struct *prev)
3275 {
3276         /*
3277          * Test if we are atomic.  Since do_exit() needs to call into
3278          * schedule() atomically, we ignore that path for now.
3279          * Otherwise, whine if we are scheduling when we should not be.
3280          */
3281         if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state))
3282                 __schedule_bug(prev);
3283
             /* record a profiling hit at the address schedule() returns to */
3284         profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3285
3286         schedstat_inc(this_rq(), sched_cnt);
3287 }
3288
3289 /*
3290  * Pick up the highest-prio task:
      *
      * @rq:   runqueue to pick from
      * @prev: not used in the body
      * @now:  timestamp passed through to each class's pick_next_task()
      *
      * Tries the fair class directly when every runnable task on @rq is a
      * cfs task; otherwise (or if that returns NULL) walks the classes from
      * sched_class_highest via ->next until one returns a task.
3291  */
3292 static inline struct task_struct *
3293 pick_next_task(struct rq *rq, struct task_struct *prev, u64 now)
3294 {
3295         struct sched_class *class;
3296         struct task_struct *p;
3297
3298         /*
3299          * Optimization: we know that if all tasks are in
3300          * the fair class we can call that function directly:
3301          */
3302         if (likely(rq->nr_running == rq->cfs.nr_running)) {
3303                 p = fair_sched_class.pick_next_task(rq, now);
3304                 if (likely(p))
3305                         return p;
3306         }
3307
3308         class = sched_class_highest;
3309         for ( ; ; ) {
3310                 p = class->pick_next_task(rq, now);
3311                 if (p)
3312                         return p;
3313                 /*
3314                  * Will never be NULL as the idle class always
3315                  * returns a non-NULL p:
3316                  */
3317                 class = class->next;
3318         }
3319 }
3320
3321 /*
3322  * schedule() is the main scheduler function.
      *
      * With preemption disabled and rq->lock held (irqs off) it deactivates
      * the previous task if it is no longer runnable, tries idle_balance()
      * when the runqueue is empty, picks the next task and context-switches
      * to it.  The whole sequence is repeated while TIF_NEED_RESCHED is set.
3323  */
3324 asmlinkage void __sched schedule(void)
3325 {
3326         struct task_struct *prev, *next;
3327         long *switch_count;
3328         struct rq *rq;
3329         u64 now;
3330         int cpu;
3331
3332 need_resched:
3333         preempt_disable();
3334         cpu = smp_processor_id();
3335         rq = cpu_rq(cpu);
3336         rcu_qsctr_inc(cpu);
3337         prev = rq->curr;
             /* default to the involuntary counter; switched to nvcsw below */
3338         switch_count = &prev->nivcsw;
3339
3340         release_kernel_lock(prev);
3341 need_resched_nonpreemptible:
3342
3343         schedule_debug(prev);
3344
3345         spin_lock_irq(&rq->lock);
3346         clear_tsk_need_resched(prev);
3347
             /*
              * prev is not in state 0 and PREEMPT_ACTIVE is not set in the
              * preempt count: either put it back to TASK_RUNNING because a
              * signal is pending for an interruptible sleep, or take it off
              * the runqueue.  Both cases count as a voluntary switch.
              */
3348         if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3349                 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
3350                                 unlikely(signal_pending(prev)))) {
3351                         prev->state = TASK_RUNNING;
3352                 } else {
3353                         deactivate_task(rq, prev, 1);
3354                 }
3355                 switch_count = &prev->nvcsw;
3356         }
3357
3358         if (unlikely(!rq->nr_running))
3359                 idle_balance(cpu, rq);
3360
3361         now = __rq_clock(rq);
3362         prev->sched_class->put_prev_task(rq, prev, now);
3363         next = pick_next_task(rq, prev, now);
3364
3365         sched_info_switch(prev, next);
3366
3367         if (likely(prev != next)) {
3368                 rq->nr_switches++;
3369                 rq->curr = next;
3370                 ++*switch_count;
3371
3372                 context_switch(rq, prev, next); /* unlocks the rq */
3373         } else
3374                 spin_unlock_irq(&rq->lock);
3375
             /*
              * NOTE(review): on this retry path only cpu and rq are
              * refreshed; prev and switch_count keep their earlier values -
              * confirm that is intended.
              */
3376         if (unlikely(reacquire_kernel_lock(current) < 0)) {
3377                 cpu = smp_processor_id();
3378                 rq = cpu_rq(cpu);
3379                 goto need_resched_nonpreemptible;
3380         }
3381         preempt_enable_no_resched();
3382         if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3383                 goto need_resched;
3384 }
3385 EXPORT_SYMBOL(schedule);
3386
3387 #ifdef CONFIG_PREEMPT
3388 /*
3389  * this is the entry point to schedule() from in-kernel preemption
3390  * off of preempt_enable.  Kernel preemptions off return from interrupt
3391  * occur there and call schedule directly.
      *
      * schedule() is called with PREEMPT_ACTIVE added to the preempt count,
      * and the call is repeated while TIF_NEED_RESCHED is set afterwards.
3392  */
3393 asmlinkage void __sched preempt_schedule(void)
3394 {
3395         struct thread_info *ti = current_thread_info();
3396 #ifdef CONFIG_PREEMPT_BKL
3397         struct task_struct *task = current;
3398         int saved_lock_depth;
3399 #endif
3400         /*
3401          * If there is a non-zero preempt_count or interrupts are disabled,
3402          * we do not want to preempt the current task.  Just return..
3403          */
3404         if (likely(ti->preempt_count || irqs_disabled()))
3405                 return;
3406
3407 need_resched:
3408         add_preempt_count(PREEMPT_ACTIVE);
3409         /*
3410          * We keep the big kernel semaphore locked, but we
3411          * clear ->lock_depth so that schedule() doesnt
3412          * auto-release the semaphore:
3413          */
3414 #ifdef CONFIG_PREEMPT_BKL
3415         saved_lock_depth = task->lock_depth;
3416         task->lock_depth = -1;
3417 #endif
3418         schedule();
3419 #ifdef CONFIG_PREEMPT_BKL
             /* restore the depth saved before schedule() */
3420         task->lock_depth = saved_lock_depth;
3421 #endif
3422         sub_preempt_count(PREEMPT_ACTIVE);
3423
3424         /* we could miss a preemption opportunity between schedule and now */
3425         barrier();
3426         if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3427                 goto need_resched;
3428 }
3429 EXPORT_SYMBOL(preempt_schedule);
3430
3431 /*
3432  * this is the entry point to schedule() from kernel preemption
3433  * off of irq context.
3434  * Note, that this is called and return with irqs disabled. This will
3435  * protect us against recursive calling from irq.
      *
      * Interrupts are enabled only around the schedule() call itself, which
      * runs with PREEMPT_ACTIVE added to the preempt count.
3436  */
3437 asmlinkage void __sched preempt_schedule_irq(void)
3438 {
3439         struct thread_info *ti = current_thread_info();
3440 #ifdef CONFIG_PREEMPT_BKL
3441         struct task_struct *task = current;
3442         int saved_lock_depth;
3443 #endif
3444         /* Catch callers which need to be fixed */
3445         BUG_ON(ti->preempt_count || !irqs_disabled());
3446
3447 need_resched:
3448         add_preempt_count(PREEMPT_ACTIVE);
3449         /*
3450          * We keep the big kernel semaphore locked, but we
3451          * clear ->lock_depth so that schedule() doesnt
3452          * auto-release the semaphore:
3453          */
3454 #ifdef CONFIG_PREEMPT_BKL
3455         saved_lock_depth = task->lock_depth;
3456         task->lock_depth = -1;
3457 #endif
3458         local_irq_enable();
3459         schedule();
3460         local_irq_disable();
3461 #ifdef CONFIG_PREEMPT_BKL
             /* restore the depth saved before schedule() */
3462         task->lock_depth = saved_lock_depth;
3463 #endif
3464         sub_preempt_count(PREEMPT_ACTIVE);
3465
3466         /* we could miss a preemption opportunity between schedule and now */
3467         barrier();
3468         if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3469                 goto need_resched;
3470 }
3471
3472 #endif /* CONFIG_PREEMPT */
3473
3474 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
3475                           void *key)
3476 {
3477         return try_to_wake_up(curr->private, mode, sync);
3478 }
3479 EXPORT_SYMBOL(default_wake_function);
3480
3481 /*
3482  * The core wakeup function.  Non-exclusive wakeups (nr_exclusive == 0) just
3483  * wake everything up.  If it's an exclusive wakeup (nr_exclusive == small +ve
3484  * number) then we wake all the non-exclusive tasks and one exclusive task.
3485  *
3486  * There are circumstances in which we can try to wake a task which has already
3487  * started to run but is not in state TASK_RUNNING.  try_to_wake_up() returns
3488  * zero in this (rare) case, and we handle it by continuing to scan the queue.
3489  */
3490 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3491                              int nr_exclusive, int sync, void *key)
3492 {
3493         struct list_head *tmp, *next;
3494
3495         list_for_each_safe(tmp, next, &q->task_list) {
3496                 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
3497                 unsigned flags = curr->flags;
3498
3499                 if (curr->func(curr, mode, sync, key) &&
3500                                 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
3501                         break;
3502         }
3503 }
3504
3505 /**
3506  * __wake_up - wake up threads blocked on a waitqueue.
3507  * @q: the waitqueue
3508  * @mode: which threads
3509  * @nr_exclusive: how many wake-one or wake-many threads to wake up
3510  * @key: is directly passed to the wakeup function
3511  */
3512 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
3513                         int nr_exclusive, void *key)
3514 {
3515         unsigned long flags;
3516
3517         spin_lock_irqsave(&q->lock, flags);
3518         __wake_up_common(q, mode, nr_exclusive, 0, key);
3519         spin_unlock_irqrestore(&q->lock, flags);
3520 }
3521 EXPORT_SYMBOL(__wake_up);
3522
3523 /*
3524  * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3525  */
3526 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3527 {
3528         __wake_up_common(q, mode, 1, 0, NULL);
3529 }
3530
3531 /**
3532  * __wake_up_sync - wake up threads blocked on a waitqueue.
3533  * @q: the waitqueue
3534  * @mode: which threads
3535  * @nr_exclusive: how many wake-one or wake-many threads to wake up
3536  *
3537  * The sync wakeup differs that the waker knows that it will schedule
3538  * away soon, so while the target thread will be woken up, it will not
3539  * be migrated to another CPU - ie. the two threads are 'synchronized'
3540  * with each other. This can prevent needless bouncing between CPUs.
3541  *
3542  * On UP it can prevent extra preemption.
3543  */
3544 void fastcall
3545 __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3546 {
3547         unsigned long flags;
3548         int sync = 1;
3549
3550         if (unlikely(!q))
3551                 return;
3552
3553         if (unlikely(!nr_exclusive))
3554                 sync = 0;
3555
3556         spin_lock_irqsave(&q->lock, flags);
3557         __wake_up_common(q, mode, nr_exclusive, sync, NULL);
3558         spin_unlock_irqrestore(&q->lock, flags);
3559 }
3560 EXPORT_SYMBOL_GPL(__wake_up_sync);      /* For internal use only */
3561
3562 void fastcall complete(struct completion *x)
3563 {
3564         unsigned long flags;
3565
3566         spin_lock_irqsave(&x->wait.lock, flags);
3567         x->done++;
3568         __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3569                          1, 0, NULL);
3570         spin_unlock_irqrestore(&x->wait.lock, flags);
3571 }
3572 EXPORT_SYMBOL(complete);
3573
3574 void fastcall complete_all(struct completion *x)
3575 {
3576         unsigned long flags;
3577
3578         spin_lock_irqsave(&x->wait.lock, flags);
3579         x->done += UINT_MAX/2;
3580         __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3581                          0, 0, NULL);
3582         spin_unlock_irqrestore(&x->wait.lock, flags);
3583 }
3584 EXPORT_SYMBOL(complete_all);
3585
3586 void fastcall __sched wait_for_completion(struct completion *x)
3587 {
3588         might_sleep();
3589
3590         spin_lock_irq(&x->wait.lock);
3591         if (!x->done) {
3592                 DECLARE_WAITQUEUE(wait, current);
3593
3594                 wait.flags |= WQ_FLAG_EXCLUSIVE;
3595                 __add_wait_queue_tail(&x->wait, &wait);
3596                 do {
3597                         __set_current_state(TASK_UNINTERRUPTIBLE);
3598                         spin_unlock_irq(&x->wait.lock);
3599                         schedule();
3600                         spin_lock_irq(&x->wait.lock);
3601                 } while (!x->done);
3602                 __remove_wait_queue(&x->wait, &wait);
3603         }
3604         x->done--;
3605         spin_unlock_irq(&x->wait.lock);
3606 }
3607 EXPORT_SYMBOL(wait_for_completion);
3608
3609 unsigned long fastcall __sched
3610 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3611 {
3612         might_sleep();
3613
3614         spin_lock_irq(&x->wait.lock);
3615         if (!x->done) {
3616                 DECLARE_WAITQUEUE(wait, current);
3617
3618                 wait.flags |= WQ_FLAG_EXCLUSIVE;
3619                 __add_wait_queue_tail(&x->wait, &wait);
3620                 do {
3621                         __set_current_state(TASK_UNINTERRUPTIBLE);
3622                         spin_unlock_irq(&x->wait.lock);
3623                         timeout = schedule_timeout(timeout);
3624                         spin_lock_irq(&x->wait.lock);
3625                         if (!timeout) {
3626                                 __remove_wait_queue(&x->wait, &wait);
3627                                 goto out;
3628                         }
3629                 } while (!x->done);
3630                 __remove_wait_queue(&x->wait, &wait);
3631         }
3632         x->done--;
3633 out:
3634         spin_unlock_irq(&x->wait.lock);
3635         return timeout;
3636 }
3637 EXPORT_SYMBOL(wait_for_completion_timeout);
3638
3639 int fastcall __sched wait_for_completion_interruptible(struct completion *x)
3640 {
3641         int ret = 0;
3642
3643         might_sleep();
3644
3645         spin_lock_irq(&x->wait.lock);
3646         if (!x->done) {
3647                 DECLARE_WAITQUEUE(wait, current);
3648
3649                 wait.flags |= WQ_FLAG_EXCLUSIVE;
3650                 __add_wait_queue_tail(&x->wait, &wait);
3651                 do {
3652                         if (signal_pending(current)) {
3653                                 ret = -ERESTARTSYS;
3654                                 __remove_wait_queue(&x->wait, &wait);
3655                                 goto out;
3656                         }
3657                         __set_current_state(TASK_INTERRUPTIBLE);
3658                         spin_unlock_irq(&x->wait.lock);
3659                         schedule();
3660                         spin_lock_irq(&x->wait.lock);
3661                 } while (!x->done);
3662                 __remove_wait_queue(&x->wait, &wait);
3663         }
3664         x->done--;
3665 out:
3666         spin_unlock_irq(&x->wait.lock);
3667
3668         return ret;
3669 }
3670 EXPORT_SYMBOL(wait_for_completion_interruptible);
3671
3672 unsigned long fastcall __sched
3673 wait_for_completion_interruptible_timeout(struct completion *x,
3674                                           unsigned long timeout)
3675 {
3676         might_sleep();
3677
3678         spin_lock_irq(&x->wait.lock);
3679         if (!x->done) {
3680                 DECLARE_WAITQUEUE(wait, current);
3681
3682                 wait.flags |= WQ_FLAG_EXCLUSIVE;
3683                 __add_wait_queue_tail(&x->wait, &wait);
3684                 do {
3685                         if (signal_pending(current)) {
3686                                 timeout = -ERESTARTSYS;
3687                                 __remove_wait_queue(&x->wait, &wait);
3688                                 goto out;
3689                         }
3690                         __set_current_state(TASK_INTERRUPTIBLE);
3691                         spin_unlock_irq(&x->wait.lock);
3692                         timeout = schedule_timeout(timeout);
3693                         spin_lock_irq(&x->wait.lock);
3694                         if (!timeout) {
3695                                 __remove_wait_queue(&x->wait, &wait);
3696                                 goto out;
3697                         }
3698                 } while (!x->done);
3699                 __remove_wait_queue(&x->wait, &wait);
3700         }
3701         x->done--;
3702 out:
3703         spin_unlock_irq(&x->wait.lock);
3704         return timeout;
3705 }
3706 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3707
3708 static inline void
3709 sleep_on_head(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
3710 {
3711         spin_lock_irqsave(&q->lock, *flags);
3712         __add_wait_queue(q, wait);
3713         spin_unlock(&q->lock);
3714 }
3715
3716 static inline void
3717 sleep_on_tail(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
3718 {
3719         spin_lock_irq(&q->lock);
3720         __remove_wait_queue(q, wait);
3721         spin_unlock_irqrestore(&q->lock, *flags);
3722 }
3723
3724 void __sched interruptible_sleep_on(wait_queue_head_t *q)
3725 {
3726         unsigned long flags;
3727         wait_queue_t wait;
3728
3729         init_waitqueue_entry(&wait, current);
3730
3731         current->state = TASK_INTERRUPTIBLE;
3732
3733         sleep_on_head(q, &wait, &flags);
3734         schedule();
3735         sleep_on_tail(q, &wait, &flags);
3736 }
3737 EXPORT_SYMBOL(interruptible_sleep_on);
3738
3739 long __sched
3740 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3741 {
3742         unsigned long flags;
3743         wait_queue_t wait;
3744
3745         init_waitqueue_entry(&wait, current);
3746
3747         current->state = TASK_INTERRUPTIBLE;
3748
3749         sleep_on_head(q, &wait, &flags);
3750         timeout = schedule_timeout(timeout);
3751         sleep_on_tail(q, &wait, &flags);
3752
3753         return timeout;
3754 }
3755 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3756
3757 void __sched sleep_on(wait_queue_head_t *q)
3758 {
3759         unsigned long flags;
3760         wait_queue_t wait;
3761
3762         init_waitqueue_entry(&wait, current);
3763
3764         current->state = TASK_UNINTERRUPTIBLE;
3765
3766         sleep_on_head(q, &wait, &flags);
3767         schedule();
3768         sleep_on_tail(q, &wait, &flags);
3769 }
3770 EXPORT_SYMBOL(sleep_on);
3771
3772 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3773 {
3774         unsigned long flags;
3775         wait_queue_t wait;
3776
3777         init_waitqueue_entry(&wait, current);
3778
3779         current->state = TASK_UNINTERRUPTIBLE;
3780
3781         sleep_on_head(q, &wait, &flags);
3782         timeout = schedule_timeout(timeout);
3783         sleep_on_tail(q, &wait, &flags);
3784
3785         return timeout;
3786 }
3787 EXPORT_SYMBOL(sleep_on_timeout);
3788
3789 #ifdef CONFIG_RT_MUTEXES
3790
3791 /*
3792  * rt_mutex_setprio - set the current priority of a task
3793  * @p: task
3794  * @prio: prio value (kernel-internal form)
3795  *
3796  * This function changes the 'effective' priority of a task. It does
3797  * not touch ->normal_prio like __setscheduler().
3798  *
3799  * Used by the rt_mutex code to implement priority inheritance logic.
3800  */
3801 void rt_mutex_setprio(struct task_struct *p, int prio)
3802 {
3803         unsigned long flags;
3804         int oldprio, on_rq;
3805         struct rq *rq;
3806         u64 now;
3807
3808         BUG_ON(prio < 0 || prio > MAX_PRIO);
3809
3810         rq = task_rq_lock(p, &flags);
3811         now = rq_clock(rq);
3812
3813         oldprio = p->prio;
3814         on_rq = p->se.on_rq;
3815         if (on_rq)
3816                 dequeue_task(rq, p, 0, now);
3817
3818         if (rt_prio(prio))
3819                 p->sched_class = &rt_sched_class;
3820         else
3821                 p->sched_class = &fair_sched_class;
3822
3823         p->prio = prio;
3824
3825         if (on_rq) {
3826                 enqueue_task(rq, p, 0, now);
3827                 /*
3828                  * Reschedule if we are currently running on this runqueue and
3829                  * our priority decreased, or if we are not currently running on
3830                  * this runqueue and our priority is higher than the current's
3831                  */
3832                 if (task_running(rq, p)) {
3833                         if (p->prio > oldprio)
3834                                 resched_task(rq->curr);
3835                 } else {
3836                         check_preempt_curr(rq, p);
3837                 }
3838         }
3839         task_rq_unlock(rq, &flags);
3840 }
3841
3842 #endif
3843
3844 void set_user_nice(struct task_struct *p, long nice)
3845 {
3846         int old_prio, delta, on_rq;
3847         unsigned long flags;
3848         struct rq *rq;
3849         u64 now;
3850
3851         if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3852                 return;
3853         /*
3854          * We have to be careful, if called from sys_setpriority(),
3855          * the task might be in the middle of scheduling on another CPU.
3856          */
3857         rq = task_rq_lock(p, &flags);
3858         now = rq_clock(rq);
3859         /*
3860          * The RT priorities are set via sched_setscheduler(), but we still
3861          * allow the 'normal' nice value to be set - but as expected
3862          * it wont have any effect on scheduling until the task is
3863          * SCHED_FIFO/SCHED_RR:
3864          */
3865         if (task_has_rt_policy(p)) {
3866                 p->static_prio = NICE_TO_PRIO(nice);
3867                 goto out_unlock;
3868         }
3869         on_rq = p->se.on_rq;
3870         if (on_rq) {
3871                 dequeue_task(rq, p, 0, now);
3872                 dec_load(rq, p, now);
3873         }
3874
3875         p->static_prio = NICE_TO_PRIO(nice);
3876         set_load_weight(p);
3877         old_prio = p->prio;
3878         p->prio = effective_prio(p);
3879         delta = p->prio - old_prio;
3880
3881         if (on_rq) {
3882                 enqueue_task(rq, p, 0, now);
3883                 inc_load(rq, p, now);
3884                 /*
3885                  * If the task increased its priority or is running and
3886                  * lowered its priority, then reschedule its CPU:
3887                  */
3888                 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3889                         resched_task(rq->curr);
3890         }
3891 out_unlock:
3892         task_rq_unlock(rq, &flags);
3893 }
3894 EXPORT_SYMBOL(set_user_nice);
3895
3896 /*
3897  * can_nice - check if a task can reduce its nice value
3898  * @p: task
3899  * @nice: nice value
3900  */
3901 int can_nice(const struct task_struct *p, const int nice)
3902 {
3903         /* convert nice value [19,-20] to rlimit style value [1,40] */
3904         int nice_rlim = 20 - nice;
3905
3906         return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
3907                 capable(CAP_SYS_NICE));
3908 }
3909
3910 #ifdef __ARCH_WANT_SYS_NICE
3911
3912 /*
3913  * sys_nice - change the priority of the current process.
3914  * @increment: priority increment
3915  *
3916  * sys_setpriority is a more generic, but much slower function that
3917  * does similar things.
3918  */
3919 asmlinkage long sys_nice(int increment)
3920 {
3921         long nice, retval;
3922
3923         /*
3924          * Setpriority might change our priority at the same moment.
3925          * We don't have to worry. Conceptually one call occurs first
3926          * and we have a single winner.
3927          */
3928         if (increment < -40)
3929                 increment = -40;
3930         if (increment > 40)
3931                 increment = 40;
3932
3933         nice = PRIO_TO_NICE(current->static_prio) + increment;
3934         if (nice < -20)
3935                 nice = -20;
3936         if (nice > 19)
3937                 nice = 19;
3938
3939         if (increment < 0 && !can_nice(current, nice))
3940                 return -EPERM;
3941
3942         retval = security_task_setnice(current, nice);
3943         if (retval)
3944                 return retval;
3945
3946         set_user_nice(current, nice);
3947         return 0;
3948 }
3949
3950 #endif
3951
3952 /**
3953  * task_prio - return the priority value of a given task.
3954  * @p: the task in question.
3955  *
3956  * This is the priority value as seen by users in /proc.
3957  * RT tasks are offset by -200. Normal tasks are centered
3958  * around 0, value goes from -16 to +15.
3959  */
3960 int task_prio(const struct task_struct *p)
3961 {
3962         return p->prio - MAX_RT_PRIO;
3963 }
3964
/**
 * task_nice - return the nice value of a given task.
 * @p: the task in question.
 *
 * Thin exported wrapper around the TASK_NICE() macro.
 */
int task_nice(const struct task_struct *p)
{
	int nice = TASK_NICE(p);

	return nice;
}
EXPORT_SYMBOL_GPL(task_nice);
3974
3975 /**
3976  * idle_cpu - is a given cpu idle currently?
3977  * @cpu: the processor in question.
3978  */
3979 int idle_cpu(int cpu)
3980 {
3981         return cpu_curr(cpu) == cpu_rq(cpu)->idle;
3982 }
3983
3984 /**
3985  * idle_task - return the idle task for a given cpu.
3986  * @cpu: the processor in question.
3987  */
3988 struct task_struct *idle_task(int cpu)
3989 {
3990         return cpu_rq(cpu)->idle;
3991 }
3992
3993 /**
3994  * find_process_by_pid - find a process with a matching PID value.
3995  * @pid: the pid in question.
3996  */
3997 static inline struct task_struct *find_process_by_pid(pid_t pid)
3998 {
3999         return pid ? find_task_by_pid(pid) : current;
4000 }
4001
4002 /* Actually do priority change: must hold rq lock. */
4003 static void
4004 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4005 {
4006         BUG_ON(p->se.on_rq);
4007
4008         p->policy = policy;
4009         switch (p->policy) {
4010         case SCHED_NORMAL:
4011         case SCHED_BATCH:
4012         case SCHED_IDLE:
4013                 p->sched_class = &fair_sched_class;
4014                 break;
4015         case SCHED_FIFO:
4016         case SCHED_RR:
4017                 p->sched_class = &rt_sched_class;
4018                 break;
4019         }
4020
4021         p->rt_priority = prio;
4022         p->normal_prio = normal_prio(p);
4023         /* we are holding p->pi_lock already */
4024         p->prio = rt_mutex_getprio(p);
4025         set_load_weight(p);
4026 }
4027
4028 /**
4029  * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
4030  * @p: the task in question.
4031  * @policy: new policy.
4032  * @param: structure containing the new RT priority.
4033  *
4034  * NOTE that the task may be already dead.
4035  */
4036 int sched_setscheduler(struct task_struct *p, int policy,
4037                        struct sched_param *param)
4038 {
4039         int retval, oldprio, oldpolicy = -1, on_rq;
4040         unsigned long flags;
4041         struct rq *rq;
4042
4043         /* may grab non-irq protected spin_locks */
4044         BUG_ON(in_interrupt());
4045 recheck:
4046         /* double check policy once rq lock held */
4047         if (policy < 0)
4048                 policy = oldpolicy = p->policy;
4049         else if (policy != SCHED_FIFO && policy != SCHED_RR &&
4050                         policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4051                         policy != SCHED_IDLE)
4052                 return -EINVAL;
4053         /*
4054          * Valid priorities for SCHED_FIFO and SCHED_RR are
4055          * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
4056          * SCHED_BATCH and SCHED_IDLE is 0.
4057          */
4058         if (param->sched_priority < 0 ||
4059             (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
4060             (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
4061                 return -EINVAL;
4062         if (rt_policy(policy) != (param->sched_priority != 0))
4063                 return -EINVAL;
4064
4065         /*
4066          * Allow unprivileged RT tasks to decrease priority:
4067          */
4068         if (!capable(CAP_SYS_NICE)) {
4069                 if (rt_policy(policy)) {
4070                         unsigned long rlim_rtprio;
4071
4072                         if (!lock_task_sighand(p, &flags))
4073                                 return -ESRCH;
4074                         rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
4075                         unlock_task_sighand(p, &flags);
4076
4077                         /* can't set/change the rt policy */
4078                         if (policy != p->policy && !rlim_rtprio)
4079                                 return -EPERM;
4080
4081                         /* can't increase priority */
4082                         if (param->sched_priority > p->rt_priority &&
4083                             param->sched_priority > rlim_rtprio)
4084                                 return -EPERM;
4085                 }
4086                 /*
4087                  * Like positive nice levels, dont allow tasks to
4088                  * move out of SCHED_IDLE either:
4089                  */
4090                 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
4091                         return -EPERM;
4092
4093                 /* can't change other user's priorities */
4094                 if ((current->euid != p->euid) &&
4095                     (current->euid != p->uid))
4096                         return -EPERM;
4097         }
4098
4099         retval = security_task_setscheduler(p, policy, param);
4100         if (retval)
4101                 return retval;
4102         /*
4103          * make sure no PI-waiters arrive (or leave) while we are
4104          * changing the priority of the task:
4105          */
4106         spin_lock_irqsave(&p->pi_lock, flags);
4107         /*
4108          * To be able to change p->policy safely, the apropriate
4109          * runqueue lock must be held.
4110          */
4111         rq = __task_rq_lock(p);
4112         /* recheck policy now with rq lock held */
4113         if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4114                 policy = oldpolicy = -1;
4115                 __task_rq_unlock(rq);
4116                 spin_unlock_irqrestore(&p->pi_lock, flags);
4117                 goto recheck;
4118         }
4119         on_rq = p->se.on_rq;
4120         if (on_rq)
4121                 deactivate_task(rq, p, 0);
4122         oldprio = p->prio;
4123         __setscheduler(rq, p, policy, param->sched_priority);
4124         if (on_rq) {
4125                 activate_task(rq, p, 0);
4126                 /*
4127                  * Reschedule if we are currently running on this runqueue and
4128                  * our priority decreased, or if we are not currently running on
4129                  * this runqueue and our priority is higher than the current's
4130                  */
4131                 if (task_running(rq, p)) {
4132                         if (p->prio > oldprio)
4133                                 resched_task(rq->curr);
4134                 } else {
4135                         check_preempt_curr(rq, p);
4136                 }
4137         }
4138         __task_rq_unlock(rq);
4139         spin_unlock_irqrestore(&p->pi_lock, flags);
4140
4141         rt_mutex_adjust_pi(p);
4142
4143         return 0;
4144 }
4145 EXPORT_SYMBOL_GPL(sched_setscheduler);
4146
4147 static int
4148 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4149 {
4150         struct sched_param lparam;
4151         struct task_struct *p;
4152         int retval;
4153
4154         if (!param || pid < 0)
4155                 return -EINVAL;
4156         if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4157                 return -EFAULT;
4158
4159         rcu_read_lock();
4160         retval = -ESRCH;
4161         p = find_process_by_pid(pid);
4162         if (p != NULL)
4163                 retval = sched_setscheduler(p, policy, &lparam);
4164         rcu_read_unlock();
4165
4166         return retval;
4167 }
4168
4169 /**
4170  * sys_sched_setscheduler - set/change the scheduler policy and RT priority
4171  * @pid: the pid in question.
4172  * @policy: new policy.
4173  * @param: structure containing the new RT priority.
4174  */
4175 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
4176                                        struct sched_param __user *param)
4177 {
4178         /* negative values for policy are not valid */
4179         if (policy < 0)
4180                 return -EINVAL;
4181
4182         return do_sched_setscheduler(pid, policy, param);
4183 }
4184
4185 /**
4186  * sys_sched_setparam - set/change the RT priority of a thread
4187  * @pid: the pid in question.
4188  * @param: structure containing the new RT priority.
4189  */
4190 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
4191 {
4192         return do_sched_setscheduler(pid, -1, param);
4193 }
4194
4195 /**
4196  * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4197  * @pid: the pid in question.
4198  */
4199 asmlinkage long sys_sched_getscheduler(pid_t pid)
4200 {
4201         struct task_struct *p;
4202         int retval = -EINVAL;
4203
4204         if (pid < 0)
4205                 goto out_nounlock;
4206
4207         retval = -ESRCH;
4208         read_lock(&tasklist_lock);
4209         p = find_process_by_pid(pid);
4210         if (p) {
4211                 retval = security_task_getscheduler(p);
4212                 if (!retval)
4213                         retval = p->policy;
4214         }
4215         read_unlock(&tasklist_lock);
4216
4217 out_nounlock:
4218         return retval;
4219 }
4220
4221 /**
4222  * sys_sched_getscheduler - get the RT priority of a thread
4223  * @pid: the pid in question.
4224  * @param: structure containing the RT priority.
4225  */
4226 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
4227 {
4228         struct sched_param lp;
4229         struct task_struct *p;
4230         int retval = -EINVAL;
4231
4232         if (!param || pid < 0)
4233                 goto out_nounlock;
4234
4235         read_lock(&tasklist_lock);
4236         p = find_process_by_pid(pid);
4237         retval = -ESRCH;
4238         if (!p)
4239                 goto out_unlock;
4240
4241         retval = security_task_getscheduler(p);
4242         if (retval)
4243                 goto out_unlock;
4244
4245         lp.sched_priority = p->rt_priority;
4246         read_unlock(&tasklist_lock);
4247
4248         /*
4249          * This one might sleep, we cannot do it with a spinlock held ...
4250          */
4251         retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4252
4253 out_nounlock:
4254         return retval;
4255
4256 out_unlock:
4257         read_unlock(&tasklist_lock);
4258         return retval;
4259 }
4260
4261 long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4262 {
4263         cpumask_t cpus_allowed;
4264         struct task_struct *p;
4265         int retval;
4266
4267         mutex_lock(&sched_hotcpu_mutex);
4268         read_lock(&tasklist_lock);
4269
4270         p = find_process_by_pid(pid);
4271         if (!p) {
4272                 read_unlock(&tasklist_lock);
4273                 mutex_unlock(&sched_hotcpu_mutex);
4274                 return -ESRCH;
4275         }
4276
4277         /*
4278          * It is not safe to call set_cpus_allowed with the
4279          * tasklist_lock held.  We will bump the task_struct's
4280          * usage count and then drop tasklist_lock.
4281          */
4282         get_task_struct(p);
4283         read_unlock(&tasklist_lock);
4284
4285         retval = -EPERM;
4286         if ((current->euid != p->euid) && (current->euid != p->uid) &&
4287                         !capable(CAP_SYS_NICE))
4288                 goto out_unlock;
4289
4290         retval = security_task_setscheduler(p, 0, NULL);
4291         if (retval)
4292                 goto out_unlock;
4293
4294         cpus_allowed = cpuset_cpus_allowed(p);
4295         cpus_and(new_mask, new_mask, cpus_allowed);
4296         retval = set_cpus_allowed(p, new_mask);
4297
4298 out_unlock:
4299         put_task_struct(p);
4300         mutex_unlock(&sched_hotcpu_mutex);
4301         return retval;
4302 }
4303
4304 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4305                              cpumask_t *new_mask)
4306 {
4307         if (len < sizeof(cpumask_t)) {
4308                 memset(new_mask, 0, sizeof(cpumask_t));
4309         } else if (len > sizeof(cpumask_t)) {
4310                 len = sizeof(cpumask_t);
4311         }
4312         return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4313 }
4314
4315 /**
4316  * sys_sched_setaffinity - set the cpu affinity of a process
4317  * @pid: pid of the process
4318  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4319  * @user_mask_ptr: user-space pointer to the new cpu mask
4320  */
4321 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
4322                                       unsigned long __user *user_mask_ptr)
4323 {
4324         cpumask_t new_mask;
4325         int retval;
4326
4327         retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
4328         if (retval)
4329                 return retval;
4330
4331         return sched_setaffinity(pid, new_mask);
4332 }
4333
/*
 * Represents all CPUs present in the system.
 * On systems capable of hotplug, this map can grow dynamically
 * as new CPUs are detected via a platform-specific method
 * such as ACPI.
 */
4340
4341 cpumask_t cpu_present_map __read_mostly;
4342 EXPORT_SYMBOL(cpu_present_map);
4343
4344 #ifndef CONFIG_SMP
4345 cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
4346 EXPORT_SYMBOL(cpu_online_map);
4347
4348 cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
4349 EXPORT_SYMBOL(cpu_possible_map);
4350 #endif
4351
4352 long sched_getaffinity(pid_t pid, cpumask_t *mask)
4353 {
4354         struct task_struct *p;
4355         int retval;
4356
4357         mutex_lock(&sched_hotcpu_mutex);
4358         read_lock(&tasklist_lock);
4359
4360         retval = -ESRCH;
4361         p = find_process_by_pid(pid);
4362         if (!p)
4363                 goto out_unlock;
4364
4365         retval = security_task_getscheduler(p);
4366         if (retval)
4367                 goto out_unlock;
4368
4369         cpus_and(*mask, p->cpus_allowed, cpu_online_map);
4370
4371 out_unlock:
4372         read_unlock(&tasklist_lock);
4373         mutex_unlock(&sched_hotcpu_mutex);
4374         if (retval)
4375                 return retval;
4376
4377         return 0;
4378 }
4379
4380 /**
4381  * sys_sched_getaffinity - get the cpu affinity of a process
4382  * @pid: pid of the process
4383  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4384  * @user_mask_ptr: user-space pointer to hold the current cpu mask
4385  */
4386 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
4387                                       unsigned long __user *user_mask_ptr)
4388 {
4389         int ret;
4390         cpumask_t mask;
4391
4392         if (len < sizeof(cpumask_t))
4393                 return -EINVAL;
4394
4395         ret = sched_getaffinity(pid, &mask);
4396         if (ret < 0)
4397                 return ret;
4398
4399         if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
4400                 return -EFAULT;
4401
4402         return sizeof(cpumask_t);
4403 }
4404
4405 /**
4406  * sys_sched_yield - yield the current processor to other threads.
4407  *
4408  * This function yields the current CPU to other tasks. If there are no
4409  * other threads running on this CPU then this function will return.
4410  */
4411 asmlinkage long sys_sched_yield(void)
4412 {
4413         struct rq *rq = this_rq_lock();
4414
4415         schedstat_inc(rq, yld_cnt);
4416         if (unlikely(rq->nr_running == 1))
4417                 schedstat_inc(rq, yld_act_empty);
4418         else
4419                 current->sched_class->yield_task(rq, current);
4420
4421         /*
4422          * Since we are going to call schedule() anyway, there's
4423          * no need to preempt or enable interrupts:
4424          */
4425         __release(rq->lock);
4426         spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4427         _raw_spin_unlock(&rq->lock);
4428         preempt_enable_no_resched();
4429
4430         schedule();
4431
4432         return 0;
4433 }
4434
4435 static void __cond_resched(void)
4436 {
4437 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4438         __might_sleep(__FILE__, __LINE__);
4439 #endif
4440         /*
4441          * The BKS might be reacquired before we have dropped
4442          * PREEMPT_ACTIVE, which could trigger a second
4443          * cond_resched() call.
4444          */
4445         do {
4446                 add_preempt_count(PREEMPT_ACTIVE);
4447                 schedule();
4448                 sub_preempt_count(PREEMPT_ACTIVE);
4449         } while (need_resched());
4450 }
4451
4452 int __sched cond_resched(void)
4453 {
4454         if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
4455                                         system_state == SYSTEM_RUNNING) {
4456                 __cond_resched();
4457                 return 1;
4458         }
4459         return 0;
4460 }
4461 EXPORT_SYMBOL(cond_resched);
4462
4463 /*
4464  * cond_resched_lock() - if a reschedule is pending, drop the given lock,
4465  * call schedule, and on return reacquire the lock.
4466  *
4467  * This works OK both with and without CONFIG_PREEMPT.  We do strange low-level
4468  * operations here to prevent schedule() from being called twice (once via
4469  * spin_unlock(), once by hand).
4470  */
4471 int cond_resched_lock(spinlock_t *lock)
4472 {
4473         int ret = 0;
4474
4475         if (need_lockbreak(lock)) {
4476                 spin_unlock(lock);
4477                 cpu_relax();
4478                 ret = 1;
4479                 spin_lock(lock);
4480         }
4481         if (need_resched() && system_state == SYSTEM_RUNNING) {
4482                 spin_release(&lock->dep_map, 1, _THIS_IP_);
4483                 _raw_spin_unlock(lock);
4484                 preempt_enable_no_resched();
4485                 __cond_resched();
4486                 ret = 1;
4487                 spin_lock(lock);
4488         }
4489         return ret;
4490 }
4491 EXPORT_SYMBOL(cond_resched_lock);
4492
4493 int __sched cond_resched_softirq(void)
4494 {
4495         BUG_ON(!in_softirq());
4496
4497         if (need_resched() && system_state == SYSTEM_RUNNING) {
4498                 local_bh_enable();
4499                 __cond_resched();
4500                 local_bh_disable();
4501                 return 1;
4502         }
4503         return 0;
4504 }
4505 EXPORT_SYMBOL(cond_resched_softirq);
4506
4507 /**
4508  * yield - yield the current processor to other threads.
4509  *
4510  * This is a shortcut for kernel-space yielding - it marks the
4511  * thread runnable and calls sys_sched_yield().
4512  */
4513 void __sched yield(void)
4514 {
4515         set_current_state(TASK_RUNNING);
4516         sys_sched_yield();
4517 }
4518 EXPORT_SYMBOL(yield);
4519
4520 /*
4521  * This task is about to go to sleep on IO.  Increment rq->nr_iowait so
4522  * that process accounting knows that this is a task in IO wait state.
4523  *
4524  * But don't do that if it is a deliberate, throttling IO wait (this task
4525  * has set its backing_dev_info: the queue against which it should throttle)
4526  */
4527 void __sched io_schedule(void)
4528 {
4529         struct rq *rq = &__raw_get_cpu_var(runqueues);
4530
4531         delayacct_blkio_start();
4532         atomic_inc(&rq->nr_iowait);
4533         schedule();
4534         atomic_dec(&rq->nr_iowait);
4535         delayacct_blkio_end();
4536 }
4537 EXPORT_SYMBOL(io_schedule);
4538
4539 long __sched io_schedule_timeout(long timeout)
4540 {
4541         struct rq *rq = &__raw_get_cpu_var(runqueues);
4542         long ret;
4543
4544         delayacct_blkio_start();
4545         atomic_inc(&rq->nr_iowait);
4546         ret = schedule_timeout(timeout);
4547         atomic_dec(&rq->nr_iowait);
4548         delayacct_blkio_end();
4549         return ret;
4550 }
4551
4552 /**
4553  * sys_sched_get_priority_max - return maximum RT priority.
4554  * @policy: scheduling class.
4555  *
4556  * this syscall returns the maximum rt_priority that can be used
4557  * by a given scheduling class.
4558  */
4559 asmlinkage long sys_sched_get_priority_max(int policy)
4560 {
4561         int ret = -EINVAL;
4562
4563         switch (policy) {
4564         case SCHED_FIFO:
4565         case SCHED_RR:
4566                 ret = MAX_USER_RT_PRIO-1;
4567                 break;
4568         case SCHED_NORMAL:
4569         case SCHED_BATCH:
4570         case SCHED_IDLE:
4571                 ret = 0;
4572                 break;
4573         }
4574         return ret;
4575 }
4576
4577 /**
4578  * sys_sched_get_priority_min - return minimum RT priority.
4579  * @policy: scheduling class.
4580  *
4581  * this syscall returns the minimum rt_priority that can be used
4582  * by a given scheduling class.
4583  */
4584 asmlinkage long sys_sched_get_priority_min(int policy)
4585 {
4586         int ret = -EINVAL;
4587
4588         switch (policy) {
4589         case SCHED_FIFO:
4590         case SCHED_RR:
4591                 ret = 1;
4592                 break;
4593         case SCHED_NORMAL:
4594         case SCHED_BATCH:
4595         case SCHED_IDLE:
4596                 ret = 0;
4597         }
4598         return ret;
4599 }
4600
4601 /**
4602  * sys_sched_rr_get_interval - return the default timeslice of a process.
4603  * @pid: pid of the process.
4604  * @interval: userspace pointer to the timeslice value.
4605  *
4606  * this syscall writes the default timeslice value of a given process
4607  * into the user-space timespec buffer. A value of '0' means infinity.
4608  */
4609 asmlinkage
4610 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4611 {
4612         struct task_struct *p;
4613         int retval = -EINVAL;
4614         struct timespec t;
4615
4616         if (pid < 0)
4617                 goto out_nounlock;
4618
4619         retval = -ESRCH;
4620         read_lock(&tasklist_lock);
4621         p = find_process_by_pid(pid);
4622         if (!p)
4623                 goto out_unlock;
4624
4625         retval = security_task_getscheduler(p);
4626         if (retval)
4627                 goto out_unlock;
4628
4629         jiffies_to_timespec(p->policy == SCHED_FIFO ?
4630                                 0 : static_prio_timeslice(p->static_prio), &t);
4631         read_unlock(&tasklist_lock);
4632         retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4633 out_nounlock:
4634         return retval;
4635 out_unlock:
4636         read_unlock(&tasklist_lock);
4637         return retval;
4638 }
4639
4640 static const char stat_nam[] = "RSDTtZX";
4641
4642 static void show_task(struct task_struct *p)
4643 {
4644         unsigned long free = 0;
4645         unsigned state;
4646
4647         state = p->state ? __ffs(p->state) + 1 : 0;
4648         printk("%-13.13s %c", p->comm,
4649                 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4650 #if (BITS_PER_LONG == 32)
4651         if (state == TASK_RUNNING)
4652                 printk(" running ");
4653         else
4654                 printk(" %08lX ", thread_saved_pc(p));
4655 #else
4656         if (state == TASK_RUNNING)
4657                 printk("  running task   ");
4658         else
4659                 printk(" %016lx ", thread_saved_pc(p));
4660 #endif
4661 #ifdef CONFIG_DEBUG_STACK_USAGE
4662         {
4663                 unsigned long *n = end_of_stack(p);
4664                 while (!*n)
4665                         n++;
4666                 free = (unsigned long)n - (unsigned long)end_of_stack(p);
4667         }
4668 #endif
4669         printk("%5lu %5d %6d", free, p->pid, p->parent->pid);
4670         if (!p->mm)
4671                 printk(" (L-TLB)\n");
4672         else
4673                 printk(" (NOTLB)\n");
4674
4675         if (state != TASK_RUNNING)
4676                 show_stack(p, NULL);
4677 }
4678
4679 void show_state_filter(unsigned long state_filter)
4680 {
4681         struct task_struct *g, *p;
4682
4683 #if (BITS_PER_LONG == 32)
4684         printk("\n"
4685                "                         free                        sibling\n");
4686         printk("  task             PC    stack   pid father child younger older\n");
4687 #else
4688         printk("\n"
4689                "                                 free                        sibling\n");
4690         printk("  task                 PC        stack   pid father child younger older\n");
4691 #endif
4692         read_lock(&tasklist_lock);
4693         do_each_thread(g, p) {
4694                 /*
4695                  * reset the NMI-timeout, listing all files on a slow
4696                  * console might take alot of time:
4697                  */
4698                 touch_nmi_watchdog();
4699                 if (!state_filter || (p->state & state_filter))
4700                         show_task(p);
4701         } while_each_thread(g, p);
4702
4703         touch_all_softlockup_watchdogs();
4704
4705 #ifdef CONFIG_SCHED_DEBUG
4706         sysrq_sched_debug_show();
4707 #endif
4708         read_unlock(&tasklist_lock);
4709         /*
4710          * Only show locks if all tasks are dumped:
4711          */
4712         if (state_filter == -1)
4713                 debug_show_all_locks();
4714 }
4715
/*
 * init_idle_bootup_task - put @idle into the idle scheduling class.
 * @idle: task whose sched_class is switched to idle_sched_class
 */
void __cpuinit init_idle_bootup_task(struct task_struct *idle)
{
	idle->sched_class = &idle_sched_class;
}
4720
4721 /**
4722  * init_idle - set up an idle thread for a given CPU
4723  * @idle: task in question
4724  * @cpu: cpu the idle task belongs to
4725  *
4726  * NOTE: this function does not set the idle thread's NEED_RESCHED
4727  * flag, to make booting more robust.
4728  */
4729 void __cpuinit init_idle(struct task_struct *idle, int cpu)
4730 {
4731         struct rq *rq = cpu_rq(cpu);
4732         unsigned long flags;
4733
4734         __sched_fork(idle);
4735         idle->se.exec_start = sched_clock();
4736
4737         idle->prio = idle->normal_prio = MAX_PRIO;
4738         idle->cpus_allowed = cpumask_of_cpu(cpu);
4739         __set_task_cpu(idle, cpu);
4740
4741         spin_lock_irqsave(&rq->lock, flags);
4742         rq->curr = rq->idle = idle;
4743 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
4744         idle->oncpu = 1;
4745 #endif
4746         spin_unlock_irqrestore(&rq->lock, flags);
4747
4748         /* Set the preempt count _outside_ the spinlocks! */
4749 #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
4750         task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
4751 #else
4752         task_thread_info(idle)->preempt_count = 0;
4753 #endif
4754         /*
4755          * The idle tasks have their own, simple scheduling class:
4756          */
4757         idle->sched_class = &idle_sched_class;
4758 }
4759
/*
 * In a system that switches off the HZ timer, nohz_cpu_mask
 * indicates which CPUs have entered this state. This is used
 * in the RCU update to wait only for active CPUs. For systems
 * which do not switch off the HZ timer, nohz_cpu_mask should
 * always be CPU_MASK_NONE.
 */
cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4768
4769 /*
4770  * Increase the granularity value when there are more CPUs,
4771  * because with more CPUs the 'effective latency' as visible
4772  * to users decreases. But the relationship is not linear,
4773  * so pick a second-best guess by going with the log2 of the
4774  * number of CPUs.
4775  *
4776  * This idea comes from the SD scheduler of Con Kolivas:
4777  */
4778 static inline void sched_init_granularity(void)
4779 {
4780         unsigned int factor = 1 + ilog2(num_online_cpus());
4781         const unsigned long gran_limit = 10000000;
4782
4783         sysctl_sched_granularity *= factor;
4784         if (sysctl_sched_granularity > gran_limit)
4785                 sysctl_sched_granularity = gran_limit;
4786
4787         sysctl_sched_runtime_limit = sysctl_sched_granularity * 4;
4788         sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
4789 }
4790
4791 #ifdef CONFIG_SMP
4792 /*
4793  * This is how migration works:
4794  *
4795  * 1) we queue a struct migration_req structure in the source CPU's
4796  *    runqueue and wake up that CPU's migration thread.
4797  * 2) we down() the locked semaphore => thread blocks.
4798  * 3) migration thread wakes up (implicitly it forces the migrated
4799  *    thread off the CPU)
4800  * 4) it gets the migration request and checks whether the migrated
4801  *    task is still in the wrong runqueue.
4802  * 5) if it's in the wrong runqueue then the migration thread removes
4803  *    it and puts it into the right queue.
4804  * 6) migration thread up()s the semaphore.
4805  * 7) we wake up and the migration is done.
4806  */
4807
4808 /*
4809  * Change a given task's CPU affinity. Migrate the thread to a
4810  * proper CPU and schedule it away if the CPU it's executing on
4811  * is removed from the allowed bitmask.
4812  *
4813  * NOTE: the caller must have a valid reference to the task, the
4814  * task must not exit() & deallocate itself prematurely.  The
4815  * call is not atomic; no spinlocks may be held.
4816  */
4817 int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
4818 {
4819         struct migration_req req;
4820         unsigned long flags;
4821         struct rq *rq;
4822         int ret = 0;
4823
4824         rq = task_rq_lock(p, &flags);
4825         if (!cpus_intersects(new_mask, cpu_online_map)) {
4826                 ret = -EINVAL;
4827                 goto out;
4828         }
4829
4830         p->cpus_allowed = new_mask;
4831         /* Can the task run on the task's current CPU? If so, we're done */
4832         if (cpu_isset(task_cpu(p), new_mask))
4833                 goto out;
4834
4835         if (migrate_task(p, any_online_cpu(new_mask), &req)) {
4836                 /* Need help from migration thread: drop lock and wait. */
4837                 task_rq_unlock(rq, &flags);
4838                 wake_up_process(rq->migration_thread);
4839                 wait_for_completion(&req.done);
4840                 tlb_migrate_finish(p->mm);
4841                 return 0;
4842         }
4843 out:
4844         task_rq_unlock(rq, &flags);
4845
4846         return ret;
4847 }
4848 EXPORT_SYMBOL_GPL(set_cpus_allowed);
4849
4850 /*
4851  * Move (not current) task off this cpu, onto dest cpu.  We're doing
4852  * this because either it can't run here any more (set_cpus_allowed()
4853  * away from this CPU, or CPU going down), or because we're
4854  * attempting to rebalance this task on exec (sched_exec).
4855  *
4856  * So we race with normal scheduler movements, but that's OK, as long
4857  * as the task is no longer on this CPU.
4858  *
4859  * Returns non-zero if task was successfully migrated.
4860  */
4861 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4862 {
4863         struct rq *rq_dest, *rq_src;
4864         int ret = 0, on_rq;
4865
4866         if (unlikely(cpu_is_offline(dest_cpu)))
4867                 return ret;
4868
4869         rq_src = cpu_rq(src_cpu);
4870         rq_dest = cpu_rq(dest_cpu);
4871
4872         double_rq_lock(rq_src, rq_dest);
4873         /* Already moved. */
4874         if (task_cpu(p) != src_cpu)
4875                 goto out;
4876         /* Affinity changed (again). */
4877         if (!cpu_isset(dest_cpu, p->cpus_allowed))
4878                 goto out;
4879
4880         on_rq = p->se.on_rq;
4881         if (on_rq)
4882                 deactivate_task(rq_src, p, 0);
4883         set_task_cpu(p, dest_cpu);
4884         if (on_rq) {
4885                 activate_task(rq_dest, p, 0);
4886                 check_preempt_curr(rq_dest, p);
4887         }
4888         ret = 1;
4889 out:
4890         double_rq_unlock(rq_src, rq_dest);
4891         return ret;
4892 }
4893
4894 /*
4895  * migration_thread - this is a highprio system thread that performs
4896  * thread migration by bumping thread off CPU then 'pushing' onto
4897  * another runqueue.
4898  */
4899 static int migration_thread(void *data)
4900 {
4901         int cpu = (long)data;
4902         struct rq *rq;
4903
4904         rq = cpu_rq(cpu);
4905         BUG_ON(rq->migration_thread != current);
4906
4907         set_current_state(TASK_INTERRUPTIBLE);
4908         while (!kthread_should_stop()) {
4909                 struct migration_req *req;
4910                 struct list_head *head;
4911
4912                 try_to_freeze();
4913
4914                 spin_lock_irq(&rq->lock);
4915
4916                 if (cpu_is_offline(cpu)) {
4917                         spin_unlock_irq(&rq->lock);
4918                         goto wait_to_die;
4919                 }
4920
4921                 if (rq->active_balance) {
4922                         active_load_balance(rq, cpu);
4923                         rq->active_balance = 0;
4924                 }
4925
4926                 head = &rq->migration_queue;
4927
4928                 if (list_empty(head)) {
4929                         spin_unlock_irq(&rq->lock);
4930                         schedule();
4931                         set_current_state(TASK_INTERRUPTIBLE);
4932                         continue;
4933                 }
4934                 req = list_entry(head->next, struct migration_req, list);
4935                 list_del_init(head->next);
4936
4937                 spin_unlock(&rq->lock);
4938                 __migrate_task(req->task, cpu, req->dest_cpu);
4939                 local_irq_enable();
4940
4941                 complete(&req->done);
4942         }
4943         __set_current_state(TASK_RUNNING);
4944         return 0;
4945
4946 wait_to_die:
4947         /* Wait for kthread_stop */
4948         set_current_state(TASK_INTERRUPTIBLE);
4949         while (!kthread_should_stop()) {
4950                 schedule();
4951                 set_current_state(TASK_INTERRUPTIBLE);
4952         }
4953         __set_current_state(TASK_RUNNING);
4954         return 0;
4955 }
4956
4957 #ifdef CONFIG_HOTPLUG_CPU
4958 /*
4959  * Figure out where task on dead CPU should go, use force if neccessary.
4960  * NOTE: interrupts should be disabled by the caller
4961  */
4962 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
4963 {
4964         unsigned long flags;
4965         cpumask_t mask;
4966         struct rq *rq;
4967         int dest_cpu;
4968
4969 restart:
4970         /* On same node? */
4971         mask = node_to_cpumask(cpu_to_node(dead_cpu));
4972         cpus_and(mask, mask, p->cpus_allowed);
4973         dest_cpu = any_online_cpu(mask);
4974
4975         /* On any allowed CPU? */
4976         if (dest_cpu == NR_CPUS)
4977                 dest_cpu = any_online_cpu(p->cpus_allowed);
4978
4979         /* No more Mr. Nice Guy. */
4980         if (dest_cpu == NR_CPUS) {
4981                 rq = task_rq_lock(p, &flags);
4982                 cpus_setall(p->cpus_allowed);
4983                 dest_cpu = any_online_cpu(p->cpus_allowed);
4984                 task_rq_unlock(rq, &flags);
4985
4986                 /*
4987                  * Don't tell them about moving exiting tasks or
4988                  * kernel threads (both mm NULL), since they never
4989                  * leave kernel.
4990                  */
4991                 if (p->mm && printk_ratelimit())
4992                         printk(KERN_INFO "process %d (%s) no "
4993                                "longer affine to cpu%d\n",
4994                                p->pid, p->comm, dead_cpu);
4995         }
4996         if (!__migrate_task(p, dead_cpu, dest_cpu))
4997                 goto restart;
4998 }
4999
5000 /*
5001  * While a dead CPU has no uninterruptible tasks queued at this point,
5002  * it might still have a nonzero ->nr_uninterruptible counter, because
5003  * for performance reasons the counter is not stricly tracking tasks to
5004  * their home CPUs. So we just add the counter to another CPU's counter,
5005  * to keep the global sum constant after CPU-down:
5006  */
5007 static void migrate_nr_uninterruptible(struct rq *rq_src)
5008 {
5009         struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
5010         unsigned long flags;
5011
5012         local_irq_save(flags);
5013         double_rq_lock(rq_src, rq_dest);
5014         rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5015         rq_src->nr_uninterruptible = 0;
5016         double_rq_unlock(rq_src, rq_dest);
5017         local_irq_restore(flags);
5018 }
5019
5020 /* Run through task list and migrate tasks from the dead cpu. */
5021 static void migrate_live_tasks(int src_cpu)
5022 {
5023         struct task_struct *p, *t;
5024
5025         write_lock_irq(&tasklist_lock);
5026
5027         do_each_thread(t, p) {
5028                 if (p == current)
5029                         continue;
5030
5031                 if (task_cpu(p) == src_cpu)
5032                         move_task_off_dead_cpu(src_cpu, p);
5033         } while_each_thread(t, p);
5034
5035         write_unlock_irq(&tasklist_lock);
5036 }
5037
5038 /*
5039  * Schedules idle task to be the next runnable task on current CPU.
5040  * It does so by boosting its priority to highest possible and adding it to
5041  * the _front_ of the runqueue. Used by CPU offline code.
5042  */
5043 void sched_idle_next(void)
5044 {
5045         int this_cpu = smp_processor_id();
5046         struct rq *rq = cpu_rq(this_cpu);
5047         struct task_struct *p = rq->idle;
5048         unsigned long flags;
5049
5050         /* cpu has to be offline */
5051         BUG_ON(cpu_online(this_cpu));
5052
5053         /*
5054          * Strictly not necessary since rest of the CPUs are stopped by now
5055          * and interrupts disabled on the current cpu.
5056          */
5057         spin_lock_irqsave(&rq->lock, flags);
5058
5059         __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5060
5061         /* Add idle task to the _front_ of its priority queue: */
5062         activate_idle_task(p, rq);
5063
5064         spin_unlock_irqrestore(&rq->lock, flags);
5065 }
5066
5067 /*
5068  * Ensures that the idle task is using init_mm right before its cpu goes
5069  * offline.
5070  */
5071 void idle_task_exit(void)
5072 {
5073         struct mm_struct *mm = current->active_mm;
5074
5075         BUG_ON(cpu_online(smp_processor_id()));
5076
5077         if (mm != &init_mm)
5078                 switch_mm(mm, &init_mm, current);
5079         mmdrop(mm);
5080 }
5081
5082 /* called under rq->lock with disabled interrupts */
5083 static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5084 {
5085         struct rq *rq = cpu_rq(dead_cpu);
5086
5087         /* Must be exiting, otherwise would be on tasklist. */
5088         BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
5089
5090         /* Cannot have done final schedule yet: would have vanished. */
5091         BUG_ON(p->state == TASK_DEAD);
5092
5093         get_task_struct(p);
5094
5095         /*
5096          * Drop lock around migration; if someone else moves it,
5097          * that's OK.  No task can be added to this CPU, so iteration is
5098          * fine.
5099          * NOTE: interrupts should be left disabled  --dev@
5100          */
5101         spin_unlock(&rq->lock);
5102         move_task_off_dead_cpu(dead_cpu, p);
5103         spin_lock(&rq->lock);
5104
5105         put_task_struct(p);
5106 }
5107
5108 /* release_task() removes task from tasklist, so we won't find dead tasks. */
5109 static void migrate_dead_tasks(unsigned int dead_cpu)
5110 {
5111         struct rq *rq = cpu_rq(dead_cpu);
5112         struct task_struct *next;
5113
5114         for ( ; ; ) {
5115                 if (!rq->nr_running)
5116                         break;
5117                 next = pick_next_task(rq, rq->curr, rq_clock(rq));
5118                 if (!next)
5119                         break;
5120                 migrate_dead(dead_cpu, next);
5121         }
5122 }
5123 #endif /* CONFIG_HOTPLUG_CPU */
5124
5125 /*
5126  * migration_call - callback that gets triggered when a CPU is added.
5127  * Here we can start up the necessary migration thread for the new CPU.
5128  */
5129 static int __cpuinit
5130 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5131 {
5132         struct task_struct *p;
5133         int cpu = (long)hcpu;
5134         unsigned long flags;
5135         struct rq *rq;
5136
5137         switch (action) {
5138         case CPU_LOCK_ACQUIRE:
5139                 mutex_lock(&sched_hotcpu_mutex);
5140                 break;
5141
5142         case CPU_UP_PREPARE:
5143         case CPU_UP_PREPARE_FROZEN:
5144                 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
5145                 if (IS_ERR(p))
5146                         return NOTIFY_BAD;
5147                 p->flags |= PF_NOFREEZE;
5148                 kthread_bind(p, cpu);
5149                 /* Must be high prio: stop_machine expects to yield to it. */
5150                 rq = task_rq_lock(p, &flags);
5151                 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5152                 task_rq_unlock(rq, &flags);
5153                 cpu_rq(cpu)->migration_thread = p;
5154                 break;
5155
5156         case CPU_ONLINE:
5157         case CPU_ONLINE_FROZEN:
5158                 /* Strictly unneccessary, as first user will wake it. */
5159                 wake_up_process(cpu_rq(cpu)->migration_thread);
5160                 break;
5161
5162 #ifdef CONFIG_HOTPLUG_CPU
5163         case CPU_UP_CANCELED:
5164         case CPU_UP_CANCELED_FROZEN:
5165                 if (!cpu_rq(cpu)->migration_thread)
5166                         break;
5167                 /* Unbind it from offline cpu so it can run.  Fall thru. */
5168                 kthread_bind(cpu_rq(cpu)->migration_thread,
5169                              any_online_cpu(cpu_online_map));
5170                 kthread_stop(cpu_rq(cpu)->migration_thread);
5171                 cpu_rq(cpu)->migration_thread = NULL;
5172                 break;
5173
5174         case CPU_DEAD:
5175         case CPU_DEAD_FROZEN:
5176                 migrate_live_tasks(cpu);
5177                 rq = cpu_rq(cpu);
5178                 kthread_stop(rq->migration_thread);
5179                 rq->migration_thread = NULL;
5180                 /* Idle task back to normal (off runqueue, low prio) */
5181                 rq = task_rq_lock(rq->idle, &flags);
5182                 deactivate_task(rq, rq->idle, 0);
5183                 rq->idle->static_prio = MAX_PRIO;
5184                 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5185                 rq->idle->sched_class = &idle_sched_class;
5186                 migrate_dead_tasks(cpu);
5187                 task_rq_unlock(rq, &flags);
5188                 migrate_nr_uninterruptible(rq);
5189                 BUG_ON(rq->nr_running != 0);
5190
5191                 /* No need to migrate the tasks: it was best-effort if
5192                  * they didn't take sched_hotcpu_mutex.  Just wake up
5193                  * the requestors. */
5194                 spin_lock_irq(&rq->lock);
5195                 while (!list_empty(&rq->migration_queue)) {
5196                         struct migration_req *req;
5197
5198                         req = list_entry(rq->migration_queue.next,
5199                                          struct migration_req, list);
5200                         list_del_init(&req->list);
5201                         complete(&req->done);
5202                 }
5203                 spin_unlock_irq(&rq->lock);
5204                 break;
5205 #endif
5206         case CPU_LOCK_RELEASE:
5207                 mutex_unlock(&sched_hotcpu_mutex);
5208                 break;
5209         }
5210         return NOTIFY_OK;
5211 }
5212
/*
 * Register at highest priority so that task migration (see the CPU_DEAD
 * handling in migration_call()) happens before everything else.
 */
static struct notifier_block __cpuinitdata migration_notifier = {
        .notifier_call = migration_call,
        .priority = 10
};
5220
5221 int __init migration_init(void)
5222 {
5223         void *cpu = (void *)(long)smp_processor_id();
5224         int err;
5225
5226         /* Start one for the boot CPU: */
5227         err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5228         BUG_ON(err == NOTIFY_BAD);
5229         migration_call(&migration_notifier, CPU_ONLINE, cpu);
5230         register_cpu_notifier(&migration_notifier);
5231
5232         return 0;
5233 }
5234 #endif
5235
5236 #ifdef CONFIG_SMP
5237
/* Number of possible processor ids; statically initialised to NR_CPUS. */
int nr_cpu_ids __read_mostly = NR_CPUS;
EXPORT_SYMBOL(nr_cpu_ids);
5241
5242 #undef SCHED_DOMAIN_DEBUG
5243 #ifdef SCHED_DOMAIN_DEBUG
5244 static void sched_domain_debug(struct sched_domain *sd, int cpu)
5245 {
5246         int level = 0;
5247
5248         if (!sd) {
5249                 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5250                 return;
5251         }
5252
5253         printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5254
5255         do {
5256                 int i;
5257                 char str[NR_CPUS];
5258                 struct sched_group *group = sd->groups;
5259                 cpumask_t groupmask;
5260
5261                 cpumask_scnprintf(str, NR_CPUS, sd->span);
5262                 cpus_clear(groupmask);
5263
5264                 printk(KERN_DEBUG);
5265                 for (i = 0; i < level + 1; i++)
5266                         printk(" ");
5267                 printk("domain %d: ", level);
5268
5269                 if (!(sd->flags & SD_LOAD_BALANCE)) {
5270                         printk("does not load-balance\n");
5271                         if (sd->parent)
5272                                 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5273                                                 " has parent");
5274                         break;
5275                 }
5276
5277                 printk("span %s\n", str);
5278
5279                 if (!cpu_isset(cpu, sd->span))
5280                         printk(KERN_ERR "ERROR: domain->span does not contain "
5281                                         "CPU%d\n", cpu);
5282                 if (!cpu_isset(cpu, group->cpumask))
5283                         printk(KERN_ERR "ERROR: domain->groups does not contain"
5284                                         " CPU%d\n", cpu);
5285
5286                 printk(KERN_DEBUG);
5287                 for (i = 0; i < level + 2; i++)
5288                         printk(" ");
5289                 printk("groups:");
5290                 do {
5291                         if (!group) {
5292                                 printk("\n");
5293                                 printk(KERN_ERR "ERROR: group is NULL\n");
5294                                 break;
5295                         }
5296
5297                         if (!group->__cpu_power) {
5298                                 printk("\n");
5299                                 printk(KERN_ERR "ERROR: domain->cpu_power not "
5300                                                 "set\n");
5301                         }
5302
5303                         if (!cpus_weight(group->cpumask)) {
5304                                 printk("\n");
5305                                 printk(KERN_ERR "ERROR: empty group\n");
5306                         }
5307
5308                         if (cpus_intersects(groupmask, group->cpumask)) {
5309                                 printk("\n");
5310                                 printk(KERN_ERR "ERROR: repeated CPUs\n");
5311                         }
5312
5313                         cpus_or(groupmask, groupmask, group->cpumask);
5314
5315                         cpumask_scnprintf(str, NR_CPUS, group->cpumask);
5316                         printk(" %s", str);
5317
5318                         group = group->next;
5319                 } while (group != sd->groups);
5320                 printk("\n");
5321
5322                 if (!cpus_equal(sd->span, groupmask))
5323                         printk(KERN_ERR "ERROR: groups don't span "
5324                                         "domain->span\n");
5325
5326                 level++;
5327                 sd = sd->parent;
5328                 if (!sd)
5329                         continue;
5330
5331                 if (!cpus_subset(groupmask, sd->span))
5332                         printk(KERN_ERR "ERROR: parent span is not a superset "
5333                                 "of domain->span\n");
5334
5335         } while (sd);
5336 }
5337 #else
5338 # define sched_domain_debug(sd, cpu) do { } while (0)
5339 #endif
5340
5341 static int sd_degenerate(struct sched_domain *sd)
5342 {
5343         if (cpus_weight(sd->span) == 1)
5344                 return 1;
5345
5346         /* Following flags need at least 2 groups */
5347         if (sd->flags & (SD_LOAD_BALANCE |
5348                          SD_BALANCE_NEWIDLE |
5349                          SD_BALANCE_FORK |
5350                          SD_BALANCE_EXEC |
5351                          SD_SHARE_CPUPOWER |
5352                          SD_SHARE_PKG_RESOURCES)) {
5353                 if (sd->groups != sd->groups->next)
5354                         return 0;
5355         }
5356
5357         /* Following flags don't use groups */
5358         if (sd->flags & (SD_WAKE_IDLE |
5359                          SD_WAKE_AFFINE |
5360                          SD_WAKE_BALANCE))
5361                 return 0;
5362
5363         return 1;
5364 }
5365
5366 static int
5367 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5368 {
5369         unsigned long cflags = sd->flags, pflags = parent->flags;
5370
5371         if (sd_degenerate(parent))
5372                 return 1;
5373
5374         if (!cpus_equal(sd->span, parent->span))
5375                 return 0;
5376
5377         /* Does parent contain flags not in child? */
5378         /* WAKE_BALANCE is a subset of WAKE_AFFINE */
5379         if (cflags & SD_WAKE_AFFINE)
5380                 pflags &= ~SD_WAKE_BALANCE;
5381         /* Flags needing groups don't count if only 1 group in parent */
5382         if (parent->groups == parent->groups->next) {
5383                 pflags &= ~(SD_LOAD_BALANCE |
5384                                 SD_BALANCE_NEWIDLE |
5385                                 SD_BALANCE_FORK |
5386                                 SD_BALANCE_EXEC |
5387                                 SD_SHARE_CPUPOWER |
5388                                 SD_SHARE_PKG_RESOURCES);
5389         }
5390         if (~cflags & pflags)
5391                 return 0;
5392
5393         return 1;
5394 }
5395
5396 /*
5397  * Attach the domain 'sd' to 'cpu' as its base domain.  Callers must
5398  * hold the hotplug lock.
5399  */
5400 static void cpu_attach_domain(struct sched_domain *sd, int cpu)
5401 {
5402         struct rq *rq = cpu_rq(cpu);
5403         struct sched_domain *tmp;
5404
5405         /* Remove the sched domains which do not contribute to scheduling. */
5406         for (tmp = sd; tmp; tmp = tmp->parent) {
5407                 struct sched_domain *parent = tmp->parent;
5408                 if (!parent)
5409                         break;
5410                 if (sd_parent_degenerate(tmp, parent)) {
5411                         tmp->parent = parent->parent;
5412                         if (parent->parent)
5413                                 parent->parent->child = tmp;
5414                 }
5415         }
5416
5417         if (sd && sd_degenerate(sd)) {
5418                 sd = sd->parent;
5419                 if (sd)
5420                         sd->child = NULL;
5421         }
5422
5423         sched_domain_debug(sd, cpu);
5424
5425         rcu_assign_pointer(rq->sd, sd);
5426 }
5427
5428 /* cpus with isolated domains */
5429 static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
5430
5431 /* Setup the mask of cpus configured for isolated domains */
5432 static int __init isolated_cpu_setup(char *str)
5433 {
5434         int ints[NR_CPUS], i;
5435
5436         str = get_options(str, ARRAY_SIZE(ints), ints);
5437         cpus_clear(cpu_isolated_map);
5438         for (i = 1; i <= ints[0]; i++)
5439                 if (ints[i] < NR_CPUS)
5440                         cpu_set(ints[i], cpu_isolated_map);
5441         return 1;
5442 }
5443
5444 __setup ("isolcpus=", isolated_cpu_setup);
5445
5446 /*
5447  * init_sched_build_groups takes the cpumask we wish to span, and a pointer
5448  * to a function which identifies what group(along with sched group) a CPU
5449  * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS
5450  * (due to the fact that we keep track of groups covered with a cpumask_t).
5451  *
5452  * init_sched_build_groups will build a circular linked list of the groups
5453  * covered by the given span, and will set each group's ->cpumask correctly,
5454  * and ->cpu_power to 0.
5455  */
5456 static void
5457 init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5458                         int (*group_fn)(int cpu, const cpumask_t *cpu_map,
5459                                         struct sched_group **sg))
5460 {
5461         struct sched_group *first = NULL, *last = NULL;
5462         cpumask_t covered = CPU_MASK_NONE;
5463         int i;
5464
5465         for_each_cpu_mask(i, span) {
5466                 struct sched_group *sg;
5467                 int group = group_fn(i, cpu_map, &sg);
5468                 int j;
5469
5470                 if (cpu_isset(i, covered))
5471                         continue;
5472
5473                 sg->cpumask = CPU_MASK_NONE;
5474                 sg->__cpu_power = 0;
5475
5476                 for_each_cpu_mask(j, span) {
5477                         if (group_fn(j, cpu_map, NULL) != group)
5478                                 continue;
5479
5480                         cpu_set(j, covered);
5481                         cpu_set(j, sg->cpumask);
5482                 }
5483                 if (!first)
5484                         first = sg;
5485                 if (last)
5486                         last->next = sg;
5487                 last = sg;
5488         }
5489         last->next = first;
5490 }
5491
/*
 * Number of nodes (the starting node included) whose cpus
 * sched_domain_node_span() merges into one node-level domain span.
 */
#define SD_NODES_PER_DOMAIN 16
5493
5494 #ifdef CONFIG_NUMA
5495
5496 /**
5497  * find_next_best_node - find the next node to include in a sched_domain
5498  * @node: node whose sched_domain we're building
5499  * @used_nodes: nodes already in the sched_domain
5500  *
5501  * Find the next node to include in a given scheduling domain.  Simply
5502  * finds the closest node not already in the @used_nodes map.
5503  *
5504  * Should use nodemask_t.
5505  */
5506 static int find_next_best_node(int node, unsigned long *used_nodes)
5507 {
5508         int i, n, val, min_val, best_node = 0;
5509
5510         min_val = INT_MAX;
5511
5512         for (i = 0; i < MAX_NUMNODES; i++) {
5513                 /* Start at @node */
5514                 n = (node + i) % MAX_NUMNODES;
5515
5516                 if (!nr_cpus_node(n))
5517                         continue;
5518
5519                 /* Skip already used nodes */
5520                 if (test_bit(n, used_nodes))
5521                         continue;
5522
5523                 /* Simple min distance search */
5524                 val = node_distance(node, n);
5525
5526                 if (val < min_val) {
5527                         min_val = val;
5528                         best_node = n;
5529                 }
5530         }
5531
5532         set_bit(best_node, used_nodes);
5533         return best_node;
5534 }
5535
5536 /**
5537  * sched_domain_node_span - get a cpumask for a node's sched_domain
5538  * @node: node whose cpumask we're constructing
5539  * @size: number of nodes to include in this span
5540  *
5541  * Given a node, construct a good cpumask for its sched_domain to span.  It
5542  * should be one that prevents unnecessary balancing, but also spreads tasks
5543  * out optimally.
5544  */
5545 static cpumask_t sched_domain_node_span(int node)
5546 {
5547         DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
5548         cpumask_t span, nodemask;
5549         int i;
5550
5551         cpus_clear(span);
5552         bitmap_zero(used_nodes, MAX_NUMNODES);
5553
5554         nodemask = node_to_cpumask(node);
5555         cpus_or(span, span, nodemask);
5556         set_bit(node, used_nodes);
5557
5558         for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5559                 int next_node = find_next_best_node(node, used_nodes);
5560
5561                 nodemask = node_to_cpumask(next_node);
5562                 cpus_or(span, span, nodemask);
5563         }
5564
5565         return span;
5566 }
5567 #endif
5568
/*
 * Power-savings switches for SMT and multi-core scheduling.  Both start
 * at 0; sched_power_savings_store() sets the selected one to 0 or 1.
 */
int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
5570
/*
 * SMT sched-domains:
 */
#ifdef CONFIG_SCHED_SMT
static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);

/*
 * Group lookup for the SMT level: every cpu forms its own group, so the
 * group number is the cpu number.  When @sg is non-NULL it receives the
 * address of that cpu's sched_group_cpus entry.  @cpu_map is unused.
 */
static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map,
                            struct sched_group **sg)
{
        if (!sg)
                return cpu;

        *sg = &per_cpu(sched_group_cpus, cpu);
        return cpu;
}
#endif
5586
/*
 * multi-core sched-domains:
 */
#ifdef CONFIG_SCHED_MC
static DEFINE_PER_CPU(struct sched_domain, core_domains);
static DEFINE_PER_CPU(struct sched_group, sched_group_core);
#endif

#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
/*
 * Group lookup for the multi-core level when SMT is configured too: the
 * group is identified by the first cpu among @cpu's siblings that is also
 * in @cpu_map.  When @sg is non-NULL it receives the address of that cpu's
 * sched_group_core entry.  Returns the group number.
 */
static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
                             struct sched_group **sg)
{
        cpumask_t siblings;
        int leader;

        cpus_and(siblings, cpu_sibling_map[cpu], *cpu_map);
        leader = first_cpu(siblings);

        if (sg)
                *sg = &per_cpu(sched_group_core, leader);

        return leader;
}
#elif defined(CONFIG_SCHED_MC)
/*
 * Group lookup for the multi-core level without SMT: every cpu forms its
 * own group.  When @sg is non-NULL it receives the address of that cpu's
 * sched_group_core entry.  @cpu_map is unused.
 */
static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
                             struct sched_group **sg)
{
        if (!sg)
                return cpu;

        *sg = &per_cpu(sched_group_core, cpu);
        return cpu;
}
#endif
5616
5617 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
5618 static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
5619
5620 static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map,
5621                              struct sched_group **sg)
5622 {
5623         int group;
5624 #ifdef CONFIG_SCHED_MC
5625         cpumask_t mask = cpu_coregroup_map(cpu);
5626         cpus_and(mask, mask, *cpu_map);
5627         group = first_cpu(mask);
5628 #elif defined(CONFIG_SCHED_SMT)
5629         cpumask_t mask = cpu_sibling_map[cpu];
5630         cpus_and(mask, mask, *cpu_map);
5631         group = first_cpu(mask);
5632 #else
5633         group = cpu;
5634 #endif
5635         if (sg)
5636                 *sg = &per_cpu(sched_group_phys, group);
5637         return group;
5638 }
5639
5640 #ifdef CONFIG_NUMA
5641 /*
5642  * The init_sched_build_groups can't handle what we want to do with node
5643  * groups, so roll our own. Now each node has its own list of groups which
5644  * gets dynamically allocated.
5645  */
5646 static DEFINE_PER_CPU(struct sched_domain, node_domains);
5647 static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
5648
5649 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
5650 static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
5651
5652 static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
5653                                  struct sched_group **sg)
5654 {
5655         cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
5656         int group;
5657
5658         cpus_and(nodemask, nodemask, *cpu_map);
5659         group = first_cpu(nodemask);
5660
5661         if (sg)
5662                 *sg = &per_cpu(sched_group_allnodes, group);
5663         return group;
5664 }
5665
5666 static void init_numa_sched_groups_power(struct sched_group *group_head)
5667 {
5668         struct sched_group *sg = group_head;
5669         int j;
5670
5671         if (!sg)
5672                 return;
5673 next_sg:
5674         for_each_cpu_mask(j, sg->cpumask) {
5675                 struct sched_domain *sd;
5676
5677                 sd = &per_cpu(phys_domains, j);
5678                 if (j != first_cpu(sd->groups->cpumask)) {
5679                         /*
5680                          * Only add "power" once for each
5681                          * physical package.
5682                          */
5683                         continue;
5684                 }
5685
5686                 sg_inc_cpu_power(sg, sd->groups->__cpu_power);
5687         }
5688         sg = sg->next;
5689         if (sg != group_head)
5690                 goto next_sg;
5691 }
5692 #endif
5693
5694 #ifdef CONFIG_NUMA
5695 /* Free memory allocated for various sched_group structures */
5696 static void free_sched_groups(const cpumask_t *cpu_map)
5697 {
5698         int cpu, i;
5699
5700         for_each_cpu_mask(cpu, *cpu_map) {
5701                 struct sched_group **sched_group_nodes
5702                         = sched_group_nodes_bycpu[cpu];
5703
5704                 if (!sched_group_nodes)
5705                         continue;
5706
5707                 for (i = 0; i < MAX_NUMNODES; i++) {
5708                         cpumask_t nodemask = node_to_cpumask(i);
5709                         struct sched_group *oldsg, *sg = sched_group_nodes[i];
5710
5711                         cpus_and(nodemask, nodemask, *cpu_map);
5712                         if (cpus_empty(nodemask))
5713                                 continue;
5714
5715                         if (sg == NULL)
5716                                 continue;
5717                         sg = sg->next;
5718 next_sg:
5719                         oldsg = sg;
5720                         sg = sg->next;
5721                         kfree(oldsg);
5722                         if (oldsg != sched_group_nodes[i])
5723                                 goto next_sg;
5724                 }
5725                 kfree(sched_group_nodes);
5726                 sched_group_nodes_bycpu[cpu] = NULL;
5727         }
5728 }
5729 #else
5730 static void free_sched_groups(const cpumask_t *cpu_map)
5731 {
5732 }
5733 #endif
5734
5735 /*
5736  * Initialize sched groups cpu_power.
5737  *
5738  * cpu_power indicates the capacity of sched group, which is used while
5739  * distributing the load between different sched groups in a sched domain.
5740  * Typically cpu_power for all the groups in a sched domain will be same unless
5741  * there are asymmetries in the topology. If there are asymmetries, group
5742  * having more cpu_power will pickup more load compared to the group having
5743  * less cpu_power.
5744  *
5745  * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
5746  * the maximum number of tasks a group can handle in the presence of other idle
5747  * or lightly loaded groups in the same sched domain.
5748  */
5749 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5750 {
5751         struct sched_domain *child;
5752         struct sched_group *group;
5753
5754         WARN_ON(!sd || !sd->groups);
5755
5756         if (cpu != first_cpu(sd->groups->cpumask))
5757                 return;
5758
5759         child = sd->child;
5760
5761         sd->groups->__cpu_power = 0;
5762
5763         /*
5764          * For perf policy, if the groups in child domain share resources
5765          * (for example cores sharing some portions of the cache hierarchy
5766          * or SMT), then set this domain groups cpu_power such that each group
5767          * can handle only one task, when there are other idle groups in the
5768          * same sched domain.
5769          */
5770         if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
5771                        (child->flags &
5772                         (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
5773                 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
5774                 return;
5775         }
5776
5777         /*
5778          * add cpu_power of each child group to this groups cpu_power
5779          */
5780         group = child->groups;
5781         do {
5782                 sg_inc_cpu_power(sd->groups, group->__cpu_power);
5783                 group = group->next;
5784         } while (group != child->groups);
5785 }
5786
/*
 * Build sched domains for a given set of cpus and attach the sched domains
 * to the individual cpus
 *
 * The work is done in phases:
 *  1. initialize the per-cpu domain structures for every configured level
 *     and link them through ->parent / ->child;
 *  2. build the circular group lists of each level;
 *  3. compute the cpu_power of the groups, lowest level first;
 *  4. attach the lowest configured level to each cpu.
 *
 * Returns 0 on success.  With CONFIG_NUMA, returns -ENOMEM when an
 * allocation fails; groups allocated so far are released through
 * free_sched_groups() first.
 */
static int build_sched_domains(const cpumask_t *cpu_map)
{
        int i;
#ifdef CONFIG_NUMA
        struct sched_group **sched_group_nodes = NULL;
        int sd_allnodes = 0;

        /*
         * Allocate the per-node list of sched groups
         */
        sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES,
                                           GFP_KERNEL);
        if (!sched_group_nodes) {
                printk(KERN_WARNING "Can not alloc sched group node list\n");
                return -ENOMEM;
        }
        /* Recorded under the map's first cpu; free_sched_groups() reads it */
        sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
#endif

        /*
         * Set up domains for cpus specified by the cpu_map.
         */
        for_each_cpu_mask(i, *cpu_map) {
                struct sched_domain *sd = NULL, *p;
                cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));

                cpus_and(nodemask, nodemask, *cpu_map);

#ifdef CONFIG_NUMA
                /*
                 * An all-nodes level spanning the whole map is added only
                 * when the map holds more cpus than SD_NODES_PER_DOMAIN
                 * times the number of cpus on this cpu's node.
                 */
                if (cpus_weight(*cpu_map) >
                                SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
                        sd = &per_cpu(allnodes_domains, i);
                        *sd = SD_ALLNODES_INIT;
                        sd->span = *cpu_map;
                        cpu_to_allnodes_group(i, cpu_map, &sd->groups);
                        p = sd;
                        sd_allnodes = 1;
                } else
                        p = NULL;

                /* Node level; its groups are assigned further below */
                sd = &per_cpu(node_domains, i);
                *sd = SD_NODE_INIT;
                sd->span = sched_domain_node_span(cpu_to_node(i));
                sd->parent = p;
                if (p)
                        p->child = sd;
                cpus_and(sd->span, sd->span, *cpu_map);
#endif

                /* Physical level: spans the cpus of this cpu's node */
                p = sd;
                sd = &per_cpu(phys_domains, i);
                *sd = SD_CPU_INIT;
                sd->span = nodemask;
                sd->parent = p;
                if (p)
                        p->child = sd;
                cpu_to_phys_group(i, cpu_map, &sd->groups);

#ifdef CONFIG_SCHED_MC
                /* Multi-core level: spans this cpu's core group map */
                p = sd;
                sd = &per_cpu(core_domains, i);
                *sd = SD_MC_INIT;
                sd->span = cpu_coregroup_map(i);
                cpus_and(sd->span, sd->span, *cpu_map);
                sd->parent = p;
                p->child = sd;
                cpu_to_core_group(i, cpu_map, &sd->groups);
#endif

#ifdef CONFIG_SCHED_SMT
                /* SMT level: spans this cpu's sibling map */
                p = sd;
                sd = &per_cpu(cpu_domains, i);
                *sd = SD_SIBLING_INIT;
                sd->span = cpu_sibling_map[i];
                cpus_and(sd->span, sd->span, *cpu_map);
                sd->parent = p;
                p->child = sd;
                cpu_to_cpu_group(i, cpu_map, &sd->groups);
#endif
        }

#ifdef CONFIG_SCHED_SMT
        /* Set up CPU (sibling) groups */
        for_each_cpu_mask(i, *cpu_map) {
                cpumask_t this_sibling_map = cpu_sibling_map[i];
                cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
                /* Build each sibling set once, from its first cpu */
                if (i != first_cpu(this_sibling_map))
                        continue;

                init_sched_build_groups(this_sibling_map, cpu_map,
                                        &cpu_to_cpu_group);
        }
#endif

#ifdef CONFIG_SCHED_MC
        /* Set up multi-core groups */
        for_each_cpu_mask(i, *cpu_map) {
                cpumask_t this_core_map = cpu_coregroup_map(i);
                cpus_and(this_core_map, this_core_map, *cpu_map);
                /* Build each core group once, from its first cpu */
                if (i != first_cpu(this_core_map))
                        continue;
                init_sched_build_groups(this_core_map, cpu_map,
                                        &cpu_to_core_group);
        }
#endif

        /* Set up physical groups */
        for (i = 0; i < MAX_NUMNODES; i++) {
                cpumask_t nodemask = node_to_cpumask(i);

                cpus_and(nodemask, nodemask, *cpu_map);
                if (cpus_empty(nodemask))
                        continue;

                init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
        }

#ifdef CONFIG_NUMA
        /* Set up node groups */
        if (sd_allnodes)
                init_sched_build_groups(*cpu_map, cpu_map,
                                        &cpu_to_allnodes_group);

        for (i = 0; i < MAX_NUMNODES; i++) {
                /* Set up node groups */
                struct sched_group *sg, *prev;
                cpumask_t nodemask = node_to_cpumask(i);
                cpumask_t domainspan;
                cpumask_t covered = CPU_MASK_NONE;
                int j;

                cpus_and(nodemask, nodemask, *cpu_map);
                if (cpus_empty(nodemask)) {
                        sched_group_nodes[i] = NULL;
                        continue;
                }

                domainspan = sched_domain_node_span(i);
                cpus_and(domainspan, domainspan, *cpu_map);

                /* Head of node i's list: a group holding node i's own cpus */
                sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
                if (!sg) {
                        printk(KERN_WARNING "Can not alloc domain group for "
                                "node %d\n", i);
                        goto error;
                }
                sched_group_nodes[i] = sg;
                /* Every cpu of the node uses this list for its node domain */
                for_each_cpu_mask(j, nodemask) {
                        struct sched_domain *sd;

                        sd = &per_cpu(node_domains, j);
                        sd->groups = sg;
                }
                sg->__cpu_power = 0;
                sg->cpumask = nodemask;
                sg->next = sg;
                cpus_or(covered, covered, nodemask);
                prev = sg;

                /*
                 * Append one group per node (walking the node ids from i,
                 * wrapping around) that still contributes uncovered cpus
                 * of the domain span; stop once the span is covered.
                 */
                for (j = 0; j < MAX_NUMNODES; j++) {
                        cpumask_t tmp, notcovered;
                        int n = (i + j) % MAX_NUMNODES;

                        cpus_complement(notcovered, covered);
                        cpus_and(tmp, notcovered, *cpu_map);
                        cpus_and(tmp, tmp, domainspan);
                        if (cpus_empty(tmp))
                                break;

                        nodemask = node_to_cpumask(n);
                        cpus_and(tmp, tmp, nodemask);
                        if (cpus_empty(tmp))
                                continue;

                        sg = kmalloc_node(sizeof(struct sched_group),
                                          GFP_KERNEL, i);
                        if (!sg) {
                                printk(KERN_WARNING
                                "Can not alloc domain group for node %d\n", j);
                                goto error;
                        }
                        sg->__cpu_power = 0;
                        sg->cpumask = tmp;
                        /* Insert after prev, keeping the list circular */
                        sg->next = prev->next;
                        cpus_or(covered, covered, tmp);
                        prev->next = sg;
                        prev = sg;
                }
        }
#endif

        /* Calculate CPU power for physical packages and nodes */
#ifdef CONFIG_SCHED_SMT
        for_each_cpu_mask(i, *cpu_map) {
                struct sched_domain *sd = &per_cpu(cpu_domains, i);

                init_sched_groups_power(i, sd);
        }
#endif
#ifdef CONFIG_SCHED_MC
        for_each_cpu_mask(i, *cpu_map) {
                struct sched_domain *sd = &per_cpu(core_domains, i);

                init_sched_groups_power(i, sd);
        }
#endif

        for_each_cpu_mask(i, *cpu_map) {
                struct sched_domain *sd = &per_cpu(phys_domains, i);

                init_sched_groups_power(i, sd);
        }

#ifdef CONFIG_NUMA
        for (i = 0; i < MAX_NUMNODES; i++)
                init_numa_sched_groups_power(sched_group_nodes[i]);

        if (sd_allnodes) {
                struct sched_group *sg;

                cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg);
                init_numa_sched_groups_power(sg);
        }
#endif

        /* Attach the domains */
        for_each_cpu_mask(i, *cpu_map) {
                struct sched_domain *sd;
                /* The base domain is the lowest level that is configured */
#ifdef CONFIG_SCHED_SMT
                sd = &per_cpu(cpu_domains, i);
#elif defined(CONFIG_SCHED_MC)
                sd = &per_cpu(core_domains, i);
#else
                sd = &per_cpu(phys_domains, i);
#endif
                cpu_attach_domain(sd, i);
        }

        return 0;

#ifdef CONFIG_NUMA
error:
        /* Allocation failure: release whatever was allocated for this map */
        free_sched_groups(cpu_map);
        return -ENOMEM;
#endif
}
6037 /*
6038  * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
6039  */
6040 static int arch_init_sched_domains(const cpumask_t *cpu_map)
6041 {
6042         cpumask_t cpu_default_map;
6043         int err;
6044
6045         /*
6046          * Setup mask for cpus without special case scheduling requirements.
6047          * For now this just excludes isolated cpus, but could be used to
6048          * exclude other special cases in the future.
6049          */
6050         cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
6051
6052         err = build_sched_domains(&cpu_default_map);
6053
6054         return err;
6055 }
6056
/*
 * Release the sched-domain data built for the cpus in @cpu_map.  This only
 * frees the sched groups, through free_sched_groups().
 */
static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
{
        free_sched_groups(cpu_map);
}
6061
6062 /*
6063  * Detach sched domains from a group of cpus specified in cpu_map
6064  * These cpus will now be attached to the NULL domain
6065  */
6066 static void detach_destroy_domains(const cpumask_t *cpu_map)
6067 {
6068         int i;
6069
6070         for_each_cpu_mask(i, *cpu_map)
6071                 cpu_attach_domain(NULL, i);
6072         synchronize_sched();
6073         arch_destroy_sched_domains(cpu_map);
6074 }
6075
6076 /*
6077  * Partition sched domains as specified by the cpumasks below.
6078  * This attaches all cpus from the cpumasks to the NULL domain,
6079  * waits for a RCU quiescent period, recalculates sched
6080  * domain information and then attaches them back to the
6081  * correct sched domains
6082  * Call with hotplug lock held
6083  */
6084 int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
6085 {
6086         cpumask_t change_map;
6087         int err = 0;
6088
6089         cpus_and(*partition1, *partition1, cpu_online_map);
6090         cpus_and(*partition2, *partition2, cpu_online_map);
6091         cpus_or(change_map, *partition1, *partition2);
6092
6093         /* Detach sched domains from all of the affected cpus */
6094         detach_destroy_domains(&change_map);
6095         if (!cpus_empty(*partition1))
6096                 err = build_sched_domains(partition1);
6097         if (!err && !cpus_empty(*partition2))
6098                 err = build_sched_domains(partition2);
6099
6100         return err;
6101 }
6102
6103 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6104 int arch_reinit_sched_domains(void)
6105 {
6106         int err;
6107
6108         mutex_lock(&sched_hotcpu_mutex);
6109         detach_destroy_domains(&cpu_online_map);
6110         err = arch_init_sched_domains(&cpu_online_map);
6111         mutex_unlock(&sched_hotcpu_mutex);
6112
6113         return err;
6114 }
6115
6116 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6117 {
6118         int ret;
6119
6120         if (buf[0] != '0' && buf[0] != '1')
6121                 return -EINVAL;
6122
6123         if (smt)
6124                 sched_smt_power_savings = (buf[0] == '1');
6125         else
6126                 sched_mc_power_savings = (buf[0] == '1');
6127
6128         ret = arch_reinit_sched_domains();
6129
6130         return ret ? ret : count;
6131 }
6132
/*
 * Create the power-savings sysfs files under @cls: the SMT attribute when
 * CONFIG_SCHED_SMT is set and smt_capable() is true, then the multi-core
 * attribute when CONFIG_SCHED_MC is set and mc_capable() is true.
 *
 * Returns 0 on success or the first error from sysfs_create_file(); the
 * multi-core file is not attempted after an SMT failure.
 */
int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
{
        int err = 0;

#ifdef CONFIG_SCHED_SMT
        if (smt_capable()) {
                err = sysfs_create_file(&cls->kset.kobj,
                                        &attr_sched_smt_power_savings.attr);
                if (err)
                        return err;
        }
#endif
#ifdef CONFIG_SCHED_MC
        if (mc_capable())
                err = sysfs_create_file(&cls->kset.kobj,
                                        &attr_sched_mc_power_savings.attr);
#endif
        return err;
}
6149 #endif
6150
6151 #ifdef CONFIG_SCHED_MC
6152 static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
6153 {
6154         return sprintf(page, "%u\n", sched_mc_power_savings);
6155 }
6156 static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
6157                                             const char *buf, size_t count)
6158 {
6159         return sched_power_savings_store(buf, count, 0);
6160 }
6161 SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
6162             sched_mc_power_savings_store);
6163 #endif
6164
6165 #ifdef CONFIG_SCHED_SMT
6166 static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
6167 {
6168         return sprintf(page, "%u\n", sched_smt_power_savings);
6169 }
6170 static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
6171                                              const char *buf, size_t count)
6172 {
6173         return sched_power_savings_store(buf, count, 1);
6174 }
6175 SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
6176             sched_smt_power_savings_store);
6177 #endif
6178
6179 /*
6180  * Force a reinitialization of the sched domains hierarchy.  The domains
6181  * and groups cannot be updated in place without racing with the balancing
6182  * code, so we temporarily attach all running cpus to the NULL domain
6183  * which will prevent rebalancing while the sched domains are recalculated.
6184  */
6185 static int update_sched_domains(struct notifier_block *nfb,
6186                                 unsigned long action, void *hcpu)
6187 {
6188         switch (action) {
6189         case CPU_UP_PREPARE:
6190         case CPU_UP_PREPARE_FROZEN:
6191         case CPU_DOWN_PREPARE:
6192         case CPU_DOWN_PREPARE_FROZEN:
6193                 detach_destroy_domains(&cpu_online_map);
6194                 return NOTIFY_OK;
6195
6196         case CPU_UP_CANCELED:
6197         case CPU_UP_CANCELED_FROZEN:
6198         case CPU_DOWN_FAILED:
6199         case CPU_DOWN_FAILED_FROZEN:
6200         case CPU_ONLINE:
6201         case CPU_ONLINE_FROZEN:
6202         case CPU_DEAD:
6203         case CPU_DEAD_FROZEN:
6204                 /*
6205                  * Fall through and re-initialise the domains.
6206                  */
6207                 break;
6208         default:
6209                 return NOTIFY_DONE;
6210         }
6211
6212         /* The hotplug lock is already held by cpu_up/cpu_down */
6213         arch_init_sched_domains(&cpu_online_map);
6214
6215         return NOTIFY_OK;
6216 }
6217
6218 void __init sched_init_smp(void)
6219 {
6220         cpumask_t non_isolated_cpus;
6221
6222         mutex_lock(&sched_hotcpu_mutex);
6223         arch_init_sched_domains(&cpu_online_map);
6224         cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
6225         if (cpus_empty(non_isolated_cpus))
6226                 cpu_set(smp_processor_id(), non_isolated_cpus);
6227         mutex_unlock(&sched_hotcpu_mutex);
6228         /* XXX: Theoretical race here - CPU may be hotplugged now */
6229         hotcpu_notifier(update_sched_domains, 0);
6230
6231         /* Move init over to a non-isolated CPU */
6232         if (set_cpus_allowed(current, non_isolated_cpus) < 0)
6233                 BUG();
6234         sched_init_granularity();
6235 }
6236 #else
6237 void __init sched_init_smp(void)
6238 {
6239         sched_init_granularity();
6240 }
6241 #endif /* CONFIG_SMP */
6242
/*
 * Return 1 if @addr is accepted by in_lock_functions() or lies in
 * [__sched_text_start, __sched_text_end), and 0 otherwise.
 */
int in_sched_functions(unsigned long addr)
{
        /* Linker adds these: start and end of __sched functions */
        extern char __sched_text_start[], __sched_text_end[];
        unsigned long start = (unsigned long)__sched_text_start;
        unsigned long end = (unsigned long)__sched_text_end;

        if (in_lock_functions(addr))
                return 1;

        return addr >= start && addr < end;
}
6252
6253 static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
6254 {
6255         cfs_rq->tasks_timeline = RB_ROOT;
6256         cfs_rq->fair_clock = 1;
6257 #ifdef CONFIG_FAIR_GROUP_SCHED
6258         cfs_rq->rq = rq;
6259 #endif
6260 }
6261
6262 void __init sched_init(void)
6263 {
6264         u64 now = sched_clock();
6265         int highest_cpu = 0;
6266         int i, j;
6267
6268         /*
6269          * Link up the scheduling class hierarchy:
6270          */
6271         rt_sched_class.next = &fair_sched_class;
6272         fair_sched_class.next = &idle_sched_class;
6273         idle_sched_class.next = NULL;
6274
6275         for_each_possible_cpu(i) {
6276                 struct rt_prio_array *array;
6277                 struct rq *rq;
6278
6279                 rq = cpu_rq(i);
6280                 spin_lock_init(&rq->lock);
6281                 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
6282                 rq->nr_running = 0;
6283                 rq->clock = 1;
6284                 init_cfs_rq(&rq->cfs, rq);
6285 #ifdef CONFIG_FAIR_GROUP_SCHED
6286                 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6287                 list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
6288 #endif
6289                 rq->ls.load_update_last = now;
6290                 rq->ls.load_update_start = now;
6291
6292                 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6293                         rq->cpu_load[j] = 0;
6294 #ifdef CONFIG_SMP
6295                 rq->sd = NULL;
6296                 rq->active_balance = 0;
6297                 rq->next_balance = jiffies;
6298                 rq->push_cpu = 0;
6299                 rq->cpu = i;
6300                 rq->migration_thread = NULL;
6301                 INIT_LIST_HEAD(&rq->migration_queue);
6302 #endif
6303                 atomic_set(&rq->nr_iowait, 0);
6304
6305                 array = &rq->rt.active;
6306                 for (j = 0; j < MAX_RT_PRIO; j++) {
6307                         INIT_LIST_HEAD(array->queue + j);
6308                         __clear_bit(j, array->bitmap);
6309                 }
6310                 highest_cpu = i;
6311                 /* delimiter for bitsearch: */
6312                 __set_bit(MAX_RT_PRIO, array->bitmap);
6313         }
6314
6315         set_load_weight(&init_task);
6316
6317 #ifdef CONFIG_SMP
6318         nr_cpu_ids = highest_cpu + 1;
6319         open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
6320 #endif
6321
6322 #ifdef CONFIG_RT_MUTEXES
6323         plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
6324 #endif
6325
6326         /*
6327          * The boot idle thread does lazy MMU switching as well:
6328          */
6329         atomic_inc(&init_mm.mm_count);
6330         enter_lazy_tlb(&init_mm, current);
6331
6332         /*
6333          * Make us the idle thread. Technically, schedule() should not be
6334          * called from this thread, however somewhere below it might be,
6335          * but because we are the idle thread, we just pick up running again
6336          * when this runqueue becomes "idle".
6337          */
6338         init_idle(current, smp_processor_id());
6339         /*
6340          * During early bootup we pretend to be a normal task:
6341          */
6342         current->sched_class = &fair_sched_class;
6343 }
6344
6345 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
/*
 * Debug check behind might_sleep(): complain when a function that may
 * sleep is reached in atomic context or with interrupts disabled.
 *
 * @file: source file name printed in the report
 * @line: source line number printed in the report
 *
 * Nothing is reported unless the system is in SYSTEM_RUNNING state and no
 * oops is in progress, and reports are rate limited to one per HZ jiffies.
 * The whole check compiles away when in_atomic is not defined.
 */
void __might_sleep(char *file, int line)
{
#ifdef in_atomic
        static unsigned long prev_jiffy;        /* ratelimiting */

        if (!in_atomic() && !irqs_disabled())
                return;
        if (system_state != SYSTEM_RUNNING || oops_in_progress)
                return;

        /* prev_jiffy == 0 means nothing has been reported yet */
        if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
                return;
        prev_jiffy = jiffies;

        printk(KERN_ERR "BUG: sleeping function called from invalid"
                        " context at %s:%d\n", file, line);
        /*
         * Give the detail line the same log level as the headline so the
         * two cannot be split up by console/log level filtering.
         */
        printk(KERN_ERR "in_atomic():%d, irqs_disabled():%d\n",
                in_atomic(), irqs_disabled());
        debug_show_held_locks(current);
        if (irqs_disabled())
                print_irqtrace_events(current);
        dump_stack();
#endif
}
EXPORT_SYMBOL(__might_sleep);
6368 #endif
6369
6370 #ifdef CONFIG_MAGIC_SYSRQ
6371 void normalize_rt_tasks(void)
6372 {
6373         struct task_struct *g, *p;
6374         unsigned long flags;
6375         struct rq *rq;
6376         int on_rq;
6377
6378         read_lock_irq(&tasklist_lock);
6379         do_each_thread(g, p) {
6380                 p->se.fair_key                  = 0;
6381                 p->se.wait_runtime              = 0;
6382                 p->se.wait_start_fair           = 0;
6383                 p->se.wait_start                = 0;
6384                 p->se.exec_start                = 0;
6385                 p->se.sleep_start               = 0;
6386                 p->se.sleep_start_fair          = 0;
6387                 p->se.block_start               = 0;
6388                 task_rq(p)->cfs.fair_clock      = 0;
6389                 task_rq(p)->clock               = 0;
6390
6391                 if (!rt_task(p)) {
6392                         /*
6393                          * Renice negative nice level userspace
6394                          * tasks back to 0:
6395                          */
6396                         if (TASK_NICE(p) < 0 && p->mm)
6397                                 set_user_nice(p, 0);
6398                         continue;
6399                 }
6400
6401                 spin_lock_irqsave(&p->pi_lock, flags);
6402                 rq = __task_rq_lock(p);
6403 #ifdef CONFIG_SMP
6404                 /*
6405                  * Do not touch the migration thread:
6406                  */
6407                 if (p == rq->migration_thread)
6408                         goto out_unlock;
6409 #endif
6410
6411                 on_rq = p->se.on_rq;
6412                 if (on_rq)
6413                         deactivate_task(task_rq(p), p, 0);
6414                 __setscheduler(rq, p, SCHED_NORMAL, 0);
6415                 if (on_rq) {
6416                         activate_task(task_rq(p), p, 0);
6417                         resched_task(rq->curr);
6418                 }
6419 #ifdef CONFIG_SMP
6420  out_unlock:
6421 #endif
6422                 __task_rq_unlock(rq);
6423                 spin_unlock_irqrestore(&p->pi_lock, flags);
6424         } while_each_thread(g, p);
6425
6426         read_unlock_irq(&tasklist_lock);
6427 }
6428
6429 #endif /* CONFIG_MAGIC_SYSRQ */
6430
6431 #ifdef CONFIG_IA64
6432 /*
6433  * These functions are only useful for the IA64 MCA handling.
6434  *
6435  * They can only be called when the whole system has been
6436  * stopped - every CPU needs to be quiescent, and no scheduling
6437  * activity can take place. Using them for anything else would
6438  * be a serious bug, and as a result, they aren't even visible
6439  * under any other configuration.
6440  */
6441
/**
 * curr_task - return the current task for a given cpu.
 * @cpu: the processor in question.
 *
 * Returns the value of cpu_curr() for @cpu.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 */
struct task_struct *curr_task(int cpu)
{
        struct task_struct *task = cpu_curr(cpu);

        return task;
}
6452
6453 /**
6454  * set_curr_task - set the current task for a given cpu.
6455  * @cpu: the processor in question.
6456  * @p: the task pointer to set.
6457  *
6458  * Description: This function must only be used when non-maskable interrupts
6459  * are serviced on a separate stack.  It allows the architecture to switch the
6460  * notion of the current task on a cpu in a non-blocking manner.  This function
6461  * must be called with all CPU's synchronized, and interrupts disabled, the
6462  * and caller must save the original value of the current task (see
6463  * curr_task() above) and restore that value before reenabling interrupts and
6464  * re-starting the system.
6465  *
6466  * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6467  */
6468 void set_curr_task(int cpu, struct task_struct *p)
6469 {
6470         cpu_curr(cpu) = p;
6471 }
6472
6473 #endif