sched: remove interactivity types
[usb.git] / kernel / sched.c
blob 886531c681c1c353f82d9f11ec887881148cfc3f
1 /*
2 * kernel/sched.c
4 * Kernel scheduler and related syscalls
6 * Copyright (C) 1991-2002 Linus Torvalds
8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
9 * make semaphores SMP safe
10 * 1998-11-19 Implemented schedule_timeout() and related stuff
11 * by Andrea Arcangeli
12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
13 * hybrid priority-list and round-robin design with
14 * an array-switch method of distributing timeslices
15 * and per-CPU runqueues. Cleanups and useful suggestions
16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin
21 #include <linux/mm.h>
22 #include <linux/module.h>
23 #include <linux/nmi.h>
24 #include <linux/init.h>
25 #include <linux/uaccess.h>
26 #include <linux/highmem.h>
27 #include <linux/smp_lock.h>
28 #include <asm/mmu_context.h>
29 #include <linux/interrupt.h>
30 #include <linux/capability.h>
31 #include <linux/completion.h>
32 #include <linux/kernel_stat.h>
33 #include <linux/debug_locks.h>
34 #include <linux/security.h>
35 #include <linux/notifier.h>
36 #include <linux/profile.h>
37 #include <linux/freezer.h>
38 #include <linux/vmalloc.h>
39 #include <linux/blkdev.h>
40 #include <linux/delay.h>
41 #include <linux/smp.h>
42 #include <linux/threads.h>
43 #include <linux/timer.h>
44 #include <linux/rcupdate.h>
45 #include <linux/cpu.h>
46 #include <linux/cpuset.h>
47 #include <linux/percpu.h>
48 #include <linux/kthread.h>
49 #include <linux/seq_file.h>
50 #include <linux/syscalls.h>
51 #include <linux/times.h>
52 #include <linux/tsacct_kern.h>
53 #include <linux/kprobes.h>
54 #include <linux/delayacct.h>
55 #include <linux/reciprocal_div.h>
56 #include <linux/unistd.h>
58 #include <asm/tlb.h>
61 * Scheduler clock - returns current time in nanosec units.
62 * This is the default implementation.
63 * Architectures and sub-architectures can override this.
65 unsigned long long __attribute__((weak)) sched_clock(void)
67 return (unsigned long long)jiffies * (1000000000 / HZ);
71 * Convert user-nice values [ -20 ... 0 ... 19 ]
72 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
73 * and back.
75 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
76 #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
77 #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
80 * 'User priority' is the nice value converted to something we
81 * can work with better when scaling various scheduler parameters,
82 * it's a [ 0 ... 39 ] range.
84 #define USER_PRIO(p) ((p)-MAX_RT_PRIO)
85 #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
86 #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
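/*
 * Worked example (illustrative; assumes the usual MAX_RT_PRIO == 100 and
 * MAX_PRIO == 140 of this kernel generation):
 *
 *	NICE_TO_PRIO(-20) == 100 + (-20) + 20 == 100	(highest non-RT prio)
 *	NICE_TO_PRIO(0)   == 100 +   0  + 20 == 120	(default)
 *	NICE_TO_PRIO(19)  == 100 +  19  + 20 == 139	== MAX_PRIO - 1
 *	PRIO_TO_NICE(120) == 120 - 100 - 20  == 0
 *
 * USER_PRIO(120) == 20, so TASK_USER_PRIO() of a nice-0 task is 20 and
 * MAX_USER_PRIO == USER_PRIO(140) == 40.
 */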
89 * Some helpers for converting nanosecond timing to jiffy resolution
91 #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
92 #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
94 #define NICE_0_LOAD SCHED_LOAD_SCALE
95 #define NICE_0_SHIFT SCHED_LOAD_SHIFT
98 * These are the 'tuning knobs' of the scheduler:
100 * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
101 * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
102 * Timeslices get refilled after they expire.
104 #define MIN_TIMESLICE max(5 * HZ / 1000, 1)
105 #define DEF_TIMESLICE (100 * HZ / 1000)
107 #ifdef CONFIG_SMP
109 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
110 * Since cpu_power is a 'constant', we can use a reciprocal divide.
112 static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
114 return reciprocal_divide(load, sg->reciprocal_cpu_power);
118 * Each time a sched group cpu_power is changed,
119 * we must compute its reciprocal value
121 static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
123 sg->__cpu_power += val;
124 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
126 #endif
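/*
 * Minimal sketch of the reciprocal-divide trick used above (illustrative;
 * the real helpers live in <linux/reciprocal_div.h> and may differ in
 * detail).  For a divisor d that changes rarely, precompute R ~= 2^32 / d
 * once (roughly (2^32 + d - 1) / d), then every division becomes a
 * multiply plus a shift:
 *
 *	u32 R = reciprocal_value(d);
 *	q = reciprocal_divide(load, R);		-> (u32)(((u64)load * R) >> 32)
 *
 * This is why sg_inc_cpu_power() must refresh sg->reciprocal_cpu_power
 * whenever sg->__cpu_power changes: the cached R is only valid for one d.
 */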
128 #define SCALE_PRIO(x, prio) \
129 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
132 * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
133 * to time slice values: [800ms ... 100ms ... 5ms]
135 static unsigned int static_prio_timeslice(int static_prio)
137 if (static_prio == NICE_TO_PRIO(19))
138 return 1;
140 if (static_prio < NICE_TO_PRIO(0))
141 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
142 else
143 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
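/*
 * Worked example (illustrative; assumes HZ == 1000, MAX_PRIO == 140 and
 * MAX_USER_PRIO == 40, so DEF_TIMESLICE == 100 jiffies and
 * MIN_TIMESLICE == 5):
 *
 *	nice -20 (prio 100): SCALE_PRIO(400, 100) = max(400*40/20, 5) = 800 ticks
 *	nice   0 (prio 120): SCALE_PRIO(100, 120) = max(100*20/20, 5) = 100 ticks
 *	nice  18 (prio 138): SCALE_PRIO(100, 138) = max(100* 2/20, 5) =  10 ticks
 *	nice  19 (prio 139): special-cased above to a single tick.
 */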
146 static inline int rt_policy(int policy)
148 if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
149 return 1;
150 return 0;
153 static inline int task_has_rt_policy(struct task_struct *p)
155 return rt_policy(p->policy);
159 * This is the priority-queue data structure of the RT scheduling class:
161 struct rt_prio_array {
162 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
163 struct list_head queue[MAX_RT_PRIO];
166 struct load_stat {
167 struct load_weight load;
168 u64 load_update_start, load_update_last;
169 unsigned long delta_fair, delta_exec, delta_stat;
172 /* CFS-related fields in a runqueue */
173 struct cfs_rq {
174 struct load_weight load;
175 unsigned long nr_running;
177 s64 fair_clock;
178 u64 exec_clock;
179 s64 wait_runtime;
180 u64 sleeper_bonus;
181 unsigned long wait_runtime_overruns, wait_runtime_underruns;
183 struct rb_root tasks_timeline;
184 struct rb_node *rb_leftmost;
185 struct rb_node *rb_load_balance_curr;
186 #ifdef CONFIG_FAIR_GROUP_SCHED
187 /* 'curr' points to currently running entity on this cfs_rq.
188 * It is set to NULL otherwise (i.e when none are currently running).
190 struct sched_entity *curr;
191 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
193 /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
194 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
195 * (like users, containers etc.)
197 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
198 * list is used during load balance.
200 struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
201 #endif
204 /* Real-Time classes' related field in a runqueue: */
205 struct rt_rq {
206 struct rt_prio_array active;
207 int rt_load_balance_idx;
208 struct list_head *rt_load_balance_head, *rt_load_balance_curr;
212 * The prio-array type of the old scheduler:
214 struct prio_array {
215 unsigned int nr_active;
216 DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
217 struct list_head queue[MAX_PRIO];
221 * This is the main, per-CPU runqueue data structure.
223 * Locking rule: code paths that lock multiple runqueues (such as the
224 * load balancing or thread migration code) must acquire the locks in
225 * ascending &runqueue order.
227 struct rq {
228 spinlock_t lock; /* runqueue lock */
231 * nr_running and cpu_load should be in the same cacheline because
232 * remote CPUs use both these fields when doing load calculation.
234 unsigned long nr_running;
235 unsigned long raw_weighted_load;
236 #define CPU_LOAD_IDX_MAX 5
237 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
238 unsigned char idle_at_tick;
239 #ifdef CONFIG_NO_HZ
240 unsigned char in_nohz_recently;
241 #endif
242 struct load_stat ls; /* capture load from *all* tasks on this cpu */
243 unsigned long nr_load_updates;
244 u64 nr_switches;
246 struct cfs_rq cfs;
247 #ifdef CONFIG_FAIR_GROUP_SCHED
248 struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */
249 #endif
250 struct rt_rq rt;
253 * This is part of a global counter where only the total sum
254 * over all CPUs matters. A task can increase this counter on
255 * one CPU and if it got migrated afterwards it may decrease
256 * it on another CPU. Always updated under the runqueue lock:
258 unsigned long nr_uninterruptible;
260 unsigned long expired_timestamp;
261 unsigned long long most_recent_timestamp;
263 struct task_struct *curr, *idle;
264 unsigned long next_balance;
265 struct mm_struct *prev_mm;
267 struct prio_array *active, *expired, arrays[2];
268 int best_expired_prio;
270 u64 clock, prev_clock_raw;
271 s64 clock_max_delta;
273 unsigned int clock_warps, clock_overflows;
274 unsigned int clock_unstable_events;
276 struct sched_class *load_balance_class;
278 atomic_t nr_iowait;
280 #ifdef CONFIG_SMP
281 struct sched_domain *sd;
283 /* For active balancing */
284 int active_balance;
285 int push_cpu;
286 int cpu; /* cpu of this runqueue */
288 struct task_struct *migration_thread;
289 struct list_head migration_queue;
290 #endif
292 #ifdef CONFIG_SCHEDSTATS
293 /* latency stats */
294 struct sched_info rq_sched_info;
296 /* sys_sched_yield() stats */
297 unsigned long yld_exp_empty;
298 unsigned long yld_act_empty;
299 unsigned long yld_both_empty;
300 unsigned long yld_cnt;
302 /* schedule() stats */
303 unsigned long sched_switch;
304 unsigned long sched_cnt;
305 unsigned long sched_goidle;
307 /* try_to_wake_up() stats */
308 unsigned long ttwu_cnt;
309 unsigned long ttwu_local;
310 #endif
311 struct lock_class_key rq_lock_key;
314 static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
315 static DEFINE_MUTEX(sched_hotcpu_mutex);
317 static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
319 rq->curr->sched_class->check_preempt_curr(rq, p);
322 static inline int cpu_of(struct rq *rq)
324 #ifdef CONFIG_SMP
325 return rq->cpu;
326 #else
327 return 0;
328 #endif
332 * Per-runqueue clock, as fine-grained as the platform can give us:
334 static unsigned long long __rq_clock(struct rq *rq)
336 u64 prev_raw = rq->prev_clock_raw;
337 u64 now = sched_clock();
338 s64 delta = now - prev_raw;
339 u64 clock = rq->clock;
342 * Protect against sched_clock() occasionally going backwards:
344 if (unlikely(delta < 0)) {
345 clock++;
346 rq->clock_warps++;
347 } else {
349 * Catch too large forward jumps too:
351 if (unlikely(delta > 2*TICK_NSEC)) {
352 clock++;
353 rq->clock_overflows++;
354 } else {
355 if (unlikely(delta > rq->clock_max_delta))
356 rq->clock_max_delta = delta;
357 clock += delta;
361 rq->prev_clock_raw = now;
362 rq->clock = clock;
364 return clock;
367 static inline unsigned long long rq_clock(struct rq *rq)
369 int this_cpu = smp_processor_id();
371 if (this_cpu == cpu_of(rq))
372 return __rq_clock(rq);
374 return rq->clock;
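/*
 * Illustrative behaviour of the clamping in __rq_clock() (with, say,
 * TICK_NSEC == 1,000,000 ns at HZ == 1000):
 *
 *	sched_clock() delta		effect on rq->clock
 *	-----------------------		-----------------------------------
 *	-50 ns (went backwards)		clock += 1, clock_warps++
 *	+300,000 ns (normal)		clock += 300,000, clock_max_delta kept
 *	+5,000,000 ns (> 2 ticks)	clock += 1, clock_overflows++
 *
 * The result is a per-runqueue clock that is monotonic and never jumps
 * forward by more than ~2 ticks at a time, whatever sched_clock() does.
 */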
378 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
379 * See detach_destroy_domains: synchronize_sched for details.
381 * The domain tree of any CPU may only be accessed from within
382 * preempt-disabled sections.
384 #define for_each_domain(cpu, __sd) \
385 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
387 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
388 #define this_rq() (&__get_cpu_var(runqueues))
389 #define task_rq(p) cpu_rq(task_cpu(p))
390 #define cpu_curr(cpu) (cpu_rq(cpu)->curr)
392 #ifdef CONFIG_FAIR_GROUP_SCHED
393 /* Change a task's ->cfs_rq if it moves across CPUs */
394 static inline void set_task_cfs_rq(struct task_struct *p)
396 p->se.cfs_rq = &task_rq(p)->cfs;
398 #else
399 static inline void set_task_cfs_rq(struct task_struct *p)
402 #endif
404 #ifndef prepare_arch_switch
405 # define prepare_arch_switch(next) do { } while (0)
406 #endif
407 #ifndef finish_arch_switch
408 # define finish_arch_switch(prev) do { } while (0)
409 #endif
411 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
412 static inline int task_running(struct rq *rq, struct task_struct *p)
414 return rq->curr == p;
417 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
421 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
423 #ifdef CONFIG_DEBUG_SPINLOCK
424 /* this is a valid case when another task releases the spinlock */
425 rq->lock.owner = current;
426 #endif
428 * If we are tracking spinlock dependencies then we have to
429 * fix up the runqueue lock - which gets 'carried over' from
430 * prev into current:
432 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
434 spin_unlock_irq(&rq->lock);
437 #else /* __ARCH_WANT_UNLOCKED_CTXSW */
438 static inline int task_running(struct rq *rq, struct task_struct *p)
440 #ifdef CONFIG_SMP
441 return p->oncpu;
442 #else
443 return rq->curr == p;
444 #endif
447 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
449 #ifdef CONFIG_SMP
451 * We can optimise this out completely for !SMP, because the
452 * SMP rebalancing from interrupt is the only thing that cares
453 * here.
455 next->oncpu = 1;
456 #endif
457 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
458 spin_unlock_irq(&rq->lock);
459 #else
460 spin_unlock(&rq->lock);
461 #endif
464 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
466 #ifdef CONFIG_SMP
468 * After ->oncpu is cleared, the task can be moved to a different CPU.
469 * We must ensure this doesn't happen until the switch is completely
470 * finished.
472 smp_wmb();
473 prev->oncpu = 0;
474 #endif
475 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
476 local_irq_enable();
477 #endif
479 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
482 * __task_rq_lock - lock the runqueue a given task resides on.
483 * Must be called with interrupts disabled.
485 static inline struct rq *__task_rq_lock(struct task_struct *p)
486 __acquires(rq->lock)
488 struct rq *rq;
490 repeat_lock_task:
491 rq = task_rq(p);
492 spin_lock(&rq->lock);
493 if (unlikely(rq != task_rq(p))) {
494 spin_unlock(&rq->lock);
495 goto repeat_lock_task;
497 return rq;
501 * task_rq_lock - lock the runqueue a given task resides on and disable
502 * interrupts. Note the ordering: we can safely lookup the task_rq without
503 * explicitly disabling preemption.
505 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
506 __acquires(rq->lock)
508 struct rq *rq;
510 repeat_lock_task:
511 local_irq_save(*flags);
512 rq = task_rq(p);
513 spin_lock(&rq->lock);
514 if (unlikely(rq != task_rq(p))) {
515 spin_unlock_irqrestore(&rq->lock, *flags);
516 goto repeat_lock_task;
518 return rq;
521 static inline void __task_rq_unlock(struct rq *rq)
522 __releases(rq->lock)
524 spin_unlock(&rq->lock);
527 static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
528 __releases(rq->lock)
530 spin_unlock_irqrestore(&rq->lock, *flags);
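/*
 * Typical usage sketch for the lock helpers above (illustrative only):
 *
 *	unsigned long flags;
 *	struct rq *rq;
 *
 *	rq = task_rq_lock(p, &flags);
 *	... operate on p, knowing it cannot change runqueues ...
 *	task_rq_unlock(rq, &flags);
 *
 * The retry loop inside task_rq_lock() is needed because p may migrate
 * between reading task_rq(p) and acquiring that runqueue's lock; once the
 * lock is held, the re-check guarantees rq really is p's runqueue.
 */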
534 * this_rq_lock - lock this runqueue and disable interrupts.
536 static inline struct rq *this_rq_lock(void)
537 __acquires(rq->lock)
539 struct rq *rq;
541 local_irq_disable();
542 rq = this_rq();
543 spin_lock(&rq->lock);
545 return rq;
549 * CPU frequency is/was unstable - start anew by setting prev_clock_raw:
551 void sched_clock_unstable_event(void)
553 unsigned long flags;
554 struct rq *rq;
556 rq = task_rq_lock(current, &flags);
557 rq->prev_clock_raw = sched_clock();
558 rq->clock_unstable_events++;
559 task_rq_unlock(rq, &flags);
563 * resched_task - mark a task 'to be rescheduled now'.
565 * On UP this means the setting of the need_resched flag, on SMP it
566 * might also involve a cross-CPU call to trigger the scheduler on
567 * the target CPU.
569 #ifdef CONFIG_SMP
571 #ifndef tsk_is_polling
572 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
573 #endif
575 static void resched_task(struct task_struct *p)
577 int cpu;
579 assert_spin_locked(&task_rq(p)->lock);
581 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
582 return;
584 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
586 cpu = task_cpu(p);
587 if (cpu == smp_processor_id())
588 return;
590 /* NEED_RESCHED must be visible before we test polling */
591 smp_mb();
592 if (!tsk_is_polling(p))
593 smp_send_reschedule(cpu);
596 static void resched_cpu(int cpu)
598 struct rq *rq = cpu_rq(cpu);
599 unsigned long flags;
601 if (!spin_trylock_irqsave(&rq->lock, flags))
602 return;
603 resched_task(cpu_curr(cpu));
604 spin_unlock_irqrestore(&rq->lock, flags);
606 #else
607 static inline void resched_task(struct task_struct *p)
609 assert_spin_locked(&task_rq(p)->lock);
610 set_tsk_need_resched(p);
612 #endif
614 static u64 div64_likely32(u64 dividend, unsigned long divisor)
616 #if BITS_PER_LONG == 32
617 if (likely(dividend <= 0xffffffffULL))
618 return (u32)dividend / divisor;
619 do_div(dividend, divisor);
621 return dividend;
622 #else
623 return dividend / divisor;
624 #endif
627 #if BITS_PER_LONG == 32
628 # define WMULT_CONST (~0UL)
629 #else
630 # define WMULT_CONST (1UL << 32)
631 #endif
633 #define WMULT_SHIFT 32
635 static inline unsigned long
636 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
637 struct load_weight *lw)
639 u64 tmp;
641 if (unlikely(!lw->inv_weight))
642 lw->inv_weight = WMULT_CONST / lw->weight;
644 tmp = (u64)delta_exec * weight;
646 * Check whether we'd overflow the 64-bit multiplication:
648 if (unlikely(tmp > WMULT_CONST)) {
649 tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight)
650 >> (WMULT_SHIFT/2);
651 } else {
652 tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;
655 return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit);
658 static inline unsigned long
659 calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
661 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
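/*
 * Worked example (illustrative): calc_delta_fair() scales wall-clock time
 * by NICE_0_LOAD / lw->weight, i.e. roughly delta_exec * 1024 / weight:
 *
 *	queue weight 1024 (one nice-0 task):	10 ms of execution -> ~10 ms fair
 *	queue weight 2048:			10 ms of execution -> ~ 5 ms fair
 *	queue weight  512:			10 ms of execution -> ~20 ms fair
 *
 * lw->inv_weight caches 2^32 / weight so that the division becomes a
 * 64-bit multiply and shift; the result is clamped to
 * sysctl_sched_runtime_limit.
 */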
664 static void update_load_add(struct load_weight *lw, unsigned long inc)
666 lw->weight += inc;
667 lw->inv_weight = 0;
670 static void update_load_sub(struct load_weight *lw, unsigned long dec)
672 lw->weight -= dec;
673 lw->inv_weight = 0;
676 static void __update_curr_load(struct rq *rq, struct load_stat *ls)
678 if (rq->curr != rq->idle && ls->load.weight) {
679 ls->delta_exec += ls->delta_stat;
680 ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
681 ls->delta_stat = 0;
686 * Update delta_exec, delta_fair fields for rq.
688 * delta_fair clock advances at a rate inversely proportional to
689 * total load (rq->ls.load.weight) on the runqueue, while
690 * delta_exec advances at the same rate as wall-clock (provided
691 * cpu is not idle).
693 * delta_exec / delta_fair is a measure of the (smoothed) load on this
694 * runqueue over any given interval. This (smoothened) load is used
695 * during load balance.
697 * This function is called /before/ updating rq->ls.load
698 * and when switching tasks.
700 static void update_curr_load(struct rq *rq, u64 now)
702 struct load_stat *ls = &rq->ls;
703 u64 start;
705 start = ls->load_update_start;
706 ls->load_update_start = now;
707 ls->delta_stat += now - start;
709 * Stagger updates to ls->delta_fair. Very frequent updates
710 * can be expensive.
712 if (ls->delta_stat >= sysctl_sched_stat_granularity)
713 __update_curr_load(rq, ls);
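/*
 * Worked example (illustrative): over a 10 ms busy interval on a runqueue
 * whose total weight is twice NICE_0_LOAD, delta_exec grows by ~10 ms
 * while delta_fair grows by only ~5 ms (see calc_delta_fair() above), so
 * the delta_exec / delta_fair ratio comes out at ~2 - the smoothed load
 * factor the load balancer is after.
 */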
717 * To aid in avoiding the subversion of "niceness" due to uneven distribution
718 * of tasks with abnormal "nice" values across CPUs, the contribution that
719 * each task makes to its run queue's load is weighted according to its
720 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
721 * scaled version of the new time slice allocation that they receive on time
722 * slice expiry etc.
726 * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
727 * If static_prio_timeslice() is ever changed to break this assumption then
728 * this code will need modification
730 #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
731 #define load_weight(lp) \
732 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
733 #define PRIO_TO_LOAD_WEIGHT(prio) \
734 load_weight(static_prio_timeslice(prio))
735 #define RTPRIO_TO_LOAD_WEIGHT(rp) \
736 (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + load_weight(rp))
738 #define WEIGHT_IDLEPRIO 2
739 #define WMULT_IDLEPRIO (1 << 31)
742 * Nice levels are multiplicative, with a gentle 10% change for every
743 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
744 * nice 1, it will get ~10% less CPU time than another CPU-bound task
745 * that remained on nice 0.
747 * The "10% effect" is relative and cumulative: from _any_ nice level,
748 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
749 * it's +10% CPU usage.
751 static const int prio_to_weight[40] = {
752 /* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921,
753 /* -10 */ 9537, 7629, 6103, 4883, 3906, 3125, 2500, 2000, 1600, 1280,
754 /* 0 */ NICE_0_LOAD /* 1024 */,
755 /* 1 */ 819, 655, 524, 419, 336, 268, 215, 172, 137,
756 /* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15,
759 static const u32 prio_to_wmult[40] = {
760 48356, 60446, 75558, 94446, 118058, 147573,
761 184467, 230589, 288233, 360285, 450347,
762 562979, 703746, 879575, 1099582, 1374389,
763 1717986, 2147483, 2684354, 3355443, 4194304,
764 5244160, 6557201, 8196502, 10250518, 12782640,
765 16025997, 19976592, 24970740, 31350126, 39045157,
766 49367440, 61356675, 76695844, 95443717, 119304647,
767 148102320, 186737708, 238609294, 286331153,
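/*
 * Worked example for the "10% effect" (illustrative): two CPU-bound tasks
 * sharing one CPU, one at nice 0 (weight 1024), one at nice 1 (weight 819):
 *
 *	nice 0 share: 1024 / (1024 + 819) ~= 55.6%
 *	nice 1 share:  819 / (1024 + 819) ~= 44.4%
 *
 * i.e. roughly a 10% swing in CPU time per nice level; and because
 * adjacent weights differ by a constant factor (~1.25) the effect is the
 * same from any starting nice level.  prio_to_wmult[] simply caches
 * 2^32 / weight for the multiply-and-shift in calc_delta_mine().
 */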
770 static inline void
771 inc_load(struct rq *rq, const struct task_struct *p, u64 now)
773 update_curr_load(rq, now);
774 update_load_add(&rq->ls.load, p->se.load.weight);
777 static inline void
778 dec_load(struct rq *rq, const struct task_struct *p, u64 now)
780 update_curr_load(rq, now);
781 update_load_sub(&rq->ls.load, p->se.load.weight);
784 static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
786 rq->nr_running++;
787 inc_load(rq, p, now);
790 static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
792 rq->nr_running--;
793 dec_load(rq, p, now);
796 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
799 * runqueue iterator, to support SMP load-balancing between different
800 * scheduling classes, without having to expose their internal data
801 * structures to the load-balancing proper:
803 struct rq_iterator {
804 void *arg;
805 struct task_struct *(*start)(void *);
806 struct task_struct *(*next)(void *);
809 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
810 unsigned long max_nr_move, unsigned long max_load_move,
811 struct sched_domain *sd, enum cpu_idle_type idle,
812 int *all_pinned, unsigned long *load_moved,
813 int this_best_prio, int best_prio, int best_prio_seen,
814 struct rq_iterator *iterator);
816 #include "sched_stats.h"
817 #include "sched_rt.c"
818 #include "sched_fair.c"
819 #include "sched_idletask.c"
820 #ifdef CONFIG_SCHED_DEBUG
821 # include "sched_debug.c"
822 #endif
824 #define sched_class_highest (&rt_sched_class)
826 static void set_load_weight(struct task_struct *p)
828 task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
829 p->se.wait_runtime = 0;
831 if (task_has_rt_policy(p)) {
832 p->se.load.weight = prio_to_weight[0] * 2;
833 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
834 return;
838 * SCHED_IDLE tasks get minimal weight:
840 if (p->policy == SCHED_IDLE) {
841 p->se.load.weight = WEIGHT_IDLEPRIO;
842 p->se.load.inv_weight = WMULT_IDLEPRIO;
843 return;
846 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
847 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
850 static void
851 enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
853 sched_info_queued(p);
854 p->sched_class->enqueue_task(rq, p, wakeup, now);
855 p->se.on_rq = 1;
858 static void
859 dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now)
861 p->sched_class->dequeue_task(rq, p, sleep, now);
862 p->se.on_rq = 0;
866 * __normal_prio - return the priority that is based on the static prio
868 static inline int __normal_prio(struct task_struct *p)
870 return p->static_prio;
874 * Calculate the expected normal priority: i.e. priority
875 * without taking RT-inheritance into account. Might be
876 * boosted by interactivity modifiers. Changes upon fork,
877 * setprio syscalls, and whenever the interactivity
878 * estimator recalculates.
880 static inline int normal_prio(struct task_struct *p)
882 int prio;
884 if (task_has_rt_policy(p))
885 prio = MAX_RT_PRIO-1 - p->rt_priority;
886 else
887 prio = __normal_prio(p);
888 return prio;
892 * Calculate the current priority, i.e. the priority
893 * taken into account by the scheduler. This value might
894 * be boosted by RT tasks, or might be boosted by
895 * interactivity modifiers. Will be RT if the task got
896 * RT-boosted. If not then it returns p->normal_prio.
898 static int effective_prio(struct task_struct *p)
900 p->normal_prio = normal_prio(p);
902 * If we are RT tasks or we were boosted to RT priority,
903 * keep the priority unchanged. Otherwise, update priority
904 * to the normal priority:
906 if (!rt_prio(p->prio))
907 return p->normal_prio;
908 return p->prio;
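/*
 * Worked example (illustrative; assumes MAX_RT_PRIO == 100):
 *
 *	SCHED_NORMAL, nice 0:		normal_prio == static_prio == 120
 *	SCHED_FIFO, rt_priority 50:	normal_prio == 99 - 50 == 49
 *	SCHED_NORMAL task PI-boosted to prio 30: effective_prio() returns
 *	  the boosted 30 unchanged (rt_prio(30) is true) while recording
 *	  p->normal_prio == 120 for when the boost is dropped.
 */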
912 * activate_task - move a task to the runqueue.
914 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
916 u64 now = rq_clock(rq);
918 if (p->state == TASK_UNINTERRUPTIBLE)
919 rq->nr_uninterruptible--;
921 enqueue_task(rq, p, wakeup, now);
922 inc_nr_running(p, rq, now);
926 * activate_idle_task - move idle task to the _front_ of runqueue.
928 static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
930 u64 now = rq_clock(rq);
932 if (p->state == TASK_UNINTERRUPTIBLE)
933 rq->nr_uninterruptible--;
935 enqueue_task(rq, p, 0, now);
936 inc_nr_running(p, rq, now);
940 * deactivate_task - remove a task from the runqueue.
942 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
944 u64 now = rq_clock(rq);
946 if (p->state == TASK_UNINTERRUPTIBLE)
947 rq->nr_uninterruptible++;
949 dequeue_task(rq, p, sleep, now);
950 dec_nr_running(p, rq, now);
954 * task_curr - is this task currently executing on a CPU?
955 * @p: the task in question.
957 inline int task_curr(const struct task_struct *p)
959 return cpu_curr(task_cpu(p)) == p;
962 /* Used instead of source_load when we know the type == 0 */
963 unsigned long weighted_cpuload(const int cpu)
965 return cpu_rq(cpu)->ls.load.weight;
968 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
970 #ifdef CONFIG_SMP
971 task_thread_info(p)->cpu = cpu;
972 set_task_cfs_rq(p);
973 #endif
976 #ifdef CONFIG_SMP
978 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
980 int old_cpu = task_cpu(p);
981 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
982 u64 clock_offset, fair_clock_offset;
984 clock_offset = old_rq->clock - new_rq->clock;
985 fair_clock_offset = old_rq->cfs.fair_clock -
986 new_rq->cfs.fair_clock;
987 if (p->se.wait_start)
988 p->se.wait_start -= clock_offset;
989 if (p->se.wait_start_fair)
990 p->se.wait_start_fair -= fair_clock_offset;
991 if (p->se.sleep_start)
992 p->se.sleep_start -= clock_offset;
993 if (p->se.block_start)
994 p->se.block_start -= clock_offset;
995 if (p->se.sleep_start_fair)
996 p->se.sleep_start_fair -= fair_clock_offset;
998 __set_task_cpu(p, new_cpu);
1001 struct migration_req {
1002 struct list_head list;
1004 struct task_struct *task;
1005 int dest_cpu;
1007 struct completion done;
1011 * The task's runqueue lock must be held.
1012 * Returns true if you have to wait for migration thread.
1014 static int
1015 migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1017 struct rq *rq = task_rq(p);
1020 * If the task is not on a runqueue (and not running), then
1021 * it is sufficient to simply update the task's cpu field.
1023 if (!p->se.on_rq && !task_running(rq, p)) {
1024 set_task_cpu(p, dest_cpu);
1025 return 0;
1028 init_completion(&req->done);
1029 req->task = p;
1030 req->dest_cpu = dest_cpu;
1031 list_add(&req->list, &rq->migration_queue);
1033 return 1;
1037 * wait_task_inactive - wait for a thread to unschedule.
1039 * The caller must ensure that the task *will* unschedule sometime soon,
1040 * else this function might spin for a *long* time. This function can't
1041 * be called with interrupts off, or it may introduce deadlock with
1042 * smp_call_function() if an IPI is sent by the same process we are
1043 * waiting to become inactive.
1045 void wait_task_inactive(struct task_struct *p)
1047 unsigned long flags;
1048 int running, on_rq;
1049 struct rq *rq;
1051 repeat:
1053 * We do the initial early heuristics without holding
1054 * any task-queue locks at all. We'll only try to get
1055 * the runqueue lock when things look like they will
1056 * work out!
1058 rq = task_rq(p);
1061 * If the task is actively running on another CPU
1062 * still, just relax and busy-wait without holding
1063 * any locks.
1065 * NOTE! Since we don't hold any locks, it's not
1066 * even sure that "rq" stays as the right runqueue!
1067 * But we don't care, since "task_running()" will
1068 * return false if the runqueue has changed and p
1069 * is actually now running somewhere else!
1071 while (task_running(rq, p))
1072 cpu_relax();
1075 * Ok, time to look more closely! We need the rq
1076 * lock now, to be *sure*. If we're wrong, we'll
1077 * just go back and repeat.
1079 rq = task_rq_lock(p, &flags);
1080 running = task_running(rq, p);
1081 on_rq = p->se.on_rq;
1082 task_rq_unlock(rq, &flags);
1085 * Was it really running after all now that we
1086 * checked with the proper locks actually held?
1088 * Oops. Go back and try again..
1090 if (unlikely(running)) {
1091 cpu_relax();
1092 goto repeat;
1096 * It's not enough that it's not actively running,
1097 * it must be off the runqueue _entirely_, and not
1098 * preempted!
1100 * So if it was still runnable (but just not actively
1101 * running right now), it's preempted, and we should
1102 * yield - it could be a while.
1104 if (unlikely(on_rq)) {
1105 yield();
1106 goto repeat;
1110 * Ahh, all good. It wasn't running, and it wasn't
1111 * runnable, which means that it will never become
1112 * running in the future either. We're all done!
1116 /***
1117 * kick_process - kick a running thread to enter/exit the kernel
1118 * @p: the to-be-kicked thread
1120 * Cause a process which is running on another CPU to enter
1121 * kernel-mode, without any delay. (to get signals handled.)
1123 * NOTE: this function doesn't have to take the runqueue lock,
1124 * because all it wants to ensure is that the remote task enters
1125 * the kernel. If the IPI races and the task has been migrated
1126 * to another CPU then no harm is done and the purpose has been
1127 * achieved as well.
1129 void kick_process(struct task_struct *p)
1131 int cpu;
1133 preempt_disable();
1134 cpu = task_cpu(p);
1135 if ((cpu != smp_processor_id()) && task_curr(p))
1136 smp_send_reschedule(cpu);
1137 preempt_enable();
1141 * Return a low guess at the load of a migration-source cpu weighted
1142 * according to the scheduling class and "nice" value.
1144 * We want to under-estimate the load of migration sources, to
1145 * balance conservatively.
1147 static inline unsigned long source_load(int cpu, int type)
1149 struct rq *rq = cpu_rq(cpu);
1150 unsigned long total = weighted_cpuload(cpu);
1152 if (type == 0)
1153 return total;
1155 return min(rq->cpu_load[type-1], total);
1159 * Return a high guess at the load of a migration-target cpu weighted
1160 * according to the scheduling class and "nice" value.
1162 static inline unsigned long target_load(int cpu, int type)
1164 struct rq *rq = cpu_rq(cpu);
1165 unsigned long total = weighted_cpuload(cpu);
1167 if (type == 0)
1168 return total;
1170 return max(rq->cpu_load[type-1], total);
1174 * Return the average load per task on the cpu's run queue
1176 static inline unsigned long cpu_avg_load_per_task(int cpu)
1178 struct rq *rq = cpu_rq(cpu);
1179 unsigned long total = weighted_cpuload(cpu);
1180 unsigned long n = rq->nr_running;
1182 return n ? total / n : SCHED_LOAD_SCALE;
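/*
 * Illustrative example of the three estimates above: suppose cpu 3 has an
 * instantaneous weighted load of 2048 but a decayed history
 * cpu_load[type-1] of 3072, with 2 runnable tasks:
 *
 *	source_load(3, type) == min(3072, 2048) == 2048	(optimistic: a cpu
 *				we might pull from)
 *	target_load(3, type) == max(3072, 2048) == 3072	(pessimistic: a cpu
 *				we might push to)
 *	cpu_avg_load_per_task(3) == 2048 / 2 == 1024
 *
 * Under- and over-estimating this way biases the balancer towards doing
 * nothing when the benefit of a migration would be marginal.
 */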
1186 * find_idlest_group finds and returns the least busy CPU group within the
1187 * domain.
1189 static struct sched_group *
1190 find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1192 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
1193 unsigned long min_load = ULONG_MAX, this_load = 0;
1194 int load_idx = sd->forkexec_idx;
1195 int imbalance = 100 + (sd->imbalance_pct-100)/2;
1197 do {
1198 unsigned long load, avg_load;
1199 int local_group;
1200 int i;
1202 /* Skip over this group if it has no CPUs allowed */
1203 if (!cpus_intersects(group->cpumask, p->cpus_allowed))
1204 goto nextgroup;
1206 local_group = cpu_isset(this_cpu, group->cpumask);
1208 /* Tally up the load of all CPUs in the group */
1209 avg_load = 0;
1211 for_each_cpu_mask(i, group->cpumask) {
1212 /* Bias balancing toward cpus of our domain */
1213 if (local_group)
1214 load = source_load(i, load_idx);
1215 else
1216 load = target_load(i, load_idx);
1218 avg_load += load;
1221 /* Adjust by relative CPU power of the group */
1222 avg_load = sg_div_cpu_power(group,
1223 avg_load * SCHED_LOAD_SCALE);
1225 if (local_group) {
1226 this_load = avg_load;
1227 this = group;
1228 } else if (avg_load < min_load) {
1229 min_load = avg_load;
1230 idlest = group;
1232 nextgroup:
1233 group = group->next;
1234 } while (group != sd->groups);
1236 if (!idlest || 100*this_load < imbalance*min_load)
1237 return NULL;
1238 return idlest;
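/*
 * Worked example (illustrative): with the common imbalance_pct of 125 the
 * local threshold above becomes 100 + 25/2 == 112, so a remote group is
 * only returned if 100 * this_load >= 112 * min_load, i.e. if the local
 * group is at least ~12% busier than the idlest one; smaller differences
 * are ignored to avoid pointless cross-group placement.
 */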
1242 * find_idlest_cpu - find the idlest cpu among the cpus in group.
1244 static int
1245 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1247 cpumask_t tmp;
1248 unsigned long load, min_load = ULONG_MAX;
1249 int idlest = -1;
1250 int i;
1252 /* Traverse only the allowed CPUs */
1253 cpus_and(tmp, group->cpumask, p->cpus_allowed);
1255 for_each_cpu_mask(i, tmp) {
1256 load = weighted_cpuload(i);
1258 if (load < min_load || (load == min_load && i == this_cpu)) {
1259 min_load = load;
1260 idlest = i;
1264 return idlest;
1268 * sched_balance_self: balance the current task (running on cpu) in domains
1269 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1270 * SD_BALANCE_EXEC.
1272 * Balance, ie. select the least loaded group.
1274 * Returns the target CPU number, or the same CPU if no balancing is needed.
1276 * preempt must be disabled.
1278 static int sched_balance_self(int cpu, int flag)
1280 struct task_struct *t = current;
1281 struct sched_domain *tmp, *sd = NULL;
1283 for_each_domain(cpu, tmp) {
1285 * If power savings logic is enabled for a domain, stop there.
1287 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1288 break;
1289 if (tmp->flags & flag)
1290 sd = tmp;
1293 while (sd) {
1294 cpumask_t span;
1295 struct sched_group *group;
1296 int new_cpu, weight;
1298 if (!(sd->flags & flag)) {
1299 sd = sd->child;
1300 continue;
1303 span = sd->span;
1304 group = find_idlest_group(sd, t, cpu);
1305 if (!group) {
1306 sd = sd->child;
1307 continue;
1310 new_cpu = find_idlest_cpu(group, t, cpu);
1311 if (new_cpu == -1 || new_cpu == cpu) {
1312 /* Now try balancing at a lower domain level of cpu */
1313 sd = sd->child;
1314 continue;
1317 /* Now try balancing at a lower domain level of new_cpu */
1318 cpu = new_cpu;
1319 sd = NULL;
1320 weight = cpus_weight(span);
1321 for_each_domain(cpu, tmp) {
1322 if (weight <= cpus_weight(tmp->span))
1323 break;
1324 if (tmp->flags & flag)
1325 sd = tmp;
1327 /* while loop will break here if sd == NULL */
1330 return cpu;
1333 #endif /* CONFIG_SMP */
1336 * wake_idle() will wake a task on an idle cpu if task->cpu is
1337 * not idle and an idle cpu is available. The span of cpus to
1338 * search starts with cpus closest then further out as needed,
1339 * so we always favor a closer, idle cpu.
1341 * Returns the CPU we should wake onto.
1343 #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1344 static int wake_idle(int cpu, struct task_struct *p)
1346 cpumask_t tmp;
1347 struct sched_domain *sd;
1348 int i;
1351 * If it is idle, then it is the best cpu to run this task.
1353 * This cpu is also the best, if it has more than one task already.
1354 * Its siblings must also be busy (in most cases), as they didn't already
1355 * pick up the extra load from this cpu, so we need not check the
1356 * sibling runqueue info. This avoids the checks and cache-miss
1357 * penalties associated with that.
1359 if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
1360 return cpu;
1362 for_each_domain(cpu, sd) {
1363 if (sd->flags & SD_WAKE_IDLE) {
1364 cpus_and(tmp, sd->span, p->cpus_allowed);
1365 for_each_cpu_mask(i, tmp) {
1366 if (idle_cpu(i))
1367 return i;
1370 else
1371 break;
1373 return cpu;
1375 #else
1376 static inline int wake_idle(int cpu, struct task_struct *p)
1378 return cpu;
1380 #endif
1382 /***
1383 * try_to_wake_up - wake up a thread
1384 * @p: the to-be-woken-up thread
1385 * @state: the mask of task states that can be woken
1386 * @sync: do a synchronous wakeup?
1388 * Put it on the run-queue if it's not already there. The "current"
1389 * thread is always on the run-queue (except when the actual
1390 * re-schedule is in progress), and as such you're allowed to do
1391 * the simpler "current->state = TASK_RUNNING" to mark yourself
1392 * runnable without the overhead of this.
1394 * returns failure only if the task is already active.
1396 static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1398 int cpu, this_cpu, success = 0;
1399 unsigned long flags;
1400 long old_state;
1401 struct rq *rq;
1402 #ifdef CONFIG_SMP
1403 struct sched_domain *sd, *this_sd = NULL;
1404 unsigned long load, this_load;
1405 int new_cpu;
1406 #endif
1408 rq = task_rq_lock(p, &flags);
1409 old_state = p->state;
1410 if (!(old_state & state))
1411 goto out;
1413 if (p->se.on_rq)
1414 goto out_running;
1416 cpu = task_cpu(p);
1417 this_cpu = smp_processor_id();
1419 #ifdef CONFIG_SMP
1420 if (unlikely(task_running(rq, p)))
1421 goto out_activate;
1423 new_cpu = cpu;
1425 schedstat_inc(rq, ttwu_cnt);
1426 if (cpu == this_cpu) {
1427 schedstat_inc(rq, ttwu_local);
1428 goto out_set_cpu;
1431 for_each_domain(this_cpu, sd) {
1432 if (cpu_isset(cpu, sd->span)) {
1433 schedstat_inc(sd, ttwu_wake_remote);
1434 this_sd = sd;
1435 break;
1439 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1440 goto out_set_cpu;
1443 * Check for affine wakeup and passive balancing possibilities.
1445 if (this_sd) {
1446 int idx = this_sd->wake_idx;
1447 unsigned int imbalance;
1449 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
1451 load = source_load(cpu, idx);
1452 this_load = target_load(this_cpu, idx);
1454 new_cpu = this_cpu; /* Wake to this CPU if we can */
1456 if (this_sd->flags & SD_WAKE_AFFINE) {
1457 unsigned long tl = this_load;
1458 unsigned long tl_per_task;
1460 tl_per_task = cpu_avg_load_per_task(this_cpu);
1463 * If sync wakeup then subtract the (maximum possible)
1464 * effect of the currently running task from the load
1465 * of the current CPU:
1467 if (sync)
1468 tl -= current->se.load.weight;
1470 if ((tl <= load &&
1471 tl + target_load(cpu, idx) <= tl_per_task) ||
1472 100*(tl + p->se.load.weight) <= imbalance*load) {
1474 * This domain has SD_WAKE_AFFINE and
1475 * p is cache cold in this domain, and
1476 * there is no bad imbalance.
1478 schedstat_inc(this_sd, ttwu_move_affine);
1479 goto out_set_cpu;
1484 * Start passive balancing when half the imbalance_pct
1485 * limit is reached.
1487 if (this_sd->flags & SD_WAKE_BALANCE) {
1488 if (imbalance*this_load <= 100*load) {
1489 schedstat_inc(this_sd, ttwu_move_balance);
1490 goto out_set_cpu;
1495 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1496 out_set_cpu:
1497 new_cpu = wake_idle(new_cpu, p);
1498 if (new_cpu != cpu) {
1499 set_task_cpu(p, new_cpu);
1500 task_rq_unlock(rq, &flags);
1501 /* might preempt at this point */
1502 rq = task_rq_lock(p, &flags);
1503 old_state = p->state;
1504 if (!(old_state & state))
1505 goto out;
1506 if (p->se.on_rq)
1507 goto out_running;
1509 this_cpu = smp_processor_id();
1510 cpu = task_cpu(p);
1513 out_activate:
1514 #endif /* CONFIG_SMP */
1515 activate_task(rq, p, 1);
1517 * Sync wakeups (i.e. those types of wakeups where the waker
1518 * has indicated that it will leave the CPU in short order)
1519 * don't trigger a preemption if the woken-up task will run on
1520 * this cpu. (in this case the 'I will reschedule' promise of
1521 * the waker guarantees that the freshly woken up task is going
1522 * to be considered on this CPU.)
1524 if (!sync || cpu != this_cpu)
1525 check_preempt_curr(rq, p);
1526 success = 1;
1528 out_running:
1529 p->state = TASK_RUNNING;
1530 out:
1531 task_rq_unlock(rq, &flags);
1533 return success;
1536 int fastcall wake_up_process(struct task_struct *p)
1538 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1539 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1541 EXPORT_SYMBOL(wake_up_process);
1543 int fastcall wake_up_state(struct task_struct *p, unsigned int state)
1545 return try_to_wake_up(p, state, 0);
1549 * Perform scheduler related setup for a newly forked process p.
1550 * p is forked by current.
1552 * __sched_fork() is basic setup used by init_idle() too:
1554 static void __sched_fork(struct task_struct *p)
1556 p->se.wait_start_fair = 0;
1557 p->se.wait_start = 0;
1558 p->se.exec_start = 0;
1559 p->se.sum_exec_runtime = 0;
1560 p->se.delta_exec = 0;
1561 p->se.delta_fair_run = 0;
1562 p->se.delta_fair_sleep = 0;
1563 p->se.wait_runtime = 0;
1564 p->se.sum_wait_runtime = 0;
1565 p->se.sum_sleep_runtime = 0;
1566 p->se.sleep_start = 0;
1567 p->se.sleep_start_fair = 0;
1568 p->se.block_start = 0;
1569 p->se.sleep_max = 0;
1570 p->se.block_max = 0;
1571 p->se.exec_max = 0;
1572 p->se.wait_max = 0;
1573 p->se.wait_runtime_overruns = 0;
1574 p->se.wait_runtime_underruns = 0;
1576 INIT_LIST_HEAD(&p->run_list);
1577 p->se.on_rq = 0;
1580 * We mark the process as running here, but have not actually
1581 * inserted it onto the runqueue yet. This guarantees that
1582 * nobody will actually run it, and a signal or other external
1583 * event cannot wake it up and insert it on the runqueue either.
1585 p->state = TASK_RUNNING;
1589 * fork()/clone()-time setup:
1591 void sched_fork(struct task_struct *p, int clone_flags)
1593 int cpu = get_cpu();
1595 __sched_fork(p);
1597 #ifdef CONFIG_SMP
1598 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1599 #endif
1600 __set_task_cpu(p, cpu);
1603 * Make sure we do not leak PI boosting priority to the child:
1605 p->prio = current->normal_prio;
1607 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1608 if (likely(sched_info_on()))
1609 memset(&p->sched_info, 0, sizeof(p->sched_info));
1610 #endif
1611 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
1612 p->oncpu = 0;
1613 #endif
1614 #ifdef CONFIG_PREEMPT
1615 /* Want to start with kernel preemption disabled. */
1616 task_thread_info(p)->preempt_count = 1;
1617 #endif
1618 put_cpu();
1622 * After fork, child runs first. (default) If set to 0 then
1623 * parent will (try to) run first.
1625 unsigned int __read_mostly sysctl_sched_child_runs_first = 1;
1628 * wake_up_new_task - wake up a newly created task for the first time.
1630 * This function will do some initial scheduler statistics housekeeping
1631 * that must be done for every newly created context, then puts the task
1632 * on the runqueue and wakes it.
1634 void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1636 unsigned long flags;
1637 struct rq *rq;
1638 int this_cpu;
1640 rq = task_rq_lock(p, &flags);
1641 BUG_ON(p->state != TASK_RUNNING);
1642 this_cpu = smp_processor_id(); /* parent's CPU */
1644 p->prio = effective_prio(p);
1646 if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) ||
1647 task_cpu(p) != this_cpu || !current->se.on_rq) {
1648 activate_task(rq, p, 0);
1649 } else {
1651 * Let the scheduling class do new task startup
1652 * management (if any):
1654 p->sched_class->task_new(rq, p);
1656 check_preempt_curr(rq, p);
1657 task_rq_unlock(rq, &flags);
1661 * prepare_task_switch - prepare to switch tasks
1662 * @rq: the runqueue preparing to switch
1663 * @next: the task we are going to switch to.
1665 * This is called with the rq lock held and interrupts off. It must
1666 * be paired with a subsequent finish_task_switch after the context
1667 * switch.
1669 * prepare_task_switch sets up locking and calls architecture specific
1670 * hooks.
1672 static inline void prepare_task_switch(struct rq *rq, struct task_struct *next)
1674 prepare_lock_switch(rq, next);
1675 prepare_arch_switch(next);
1679 * finish_task_switch - clean up after a task-switch
1680 * @rq: runqueue associated with task-switch
1681 * @prev: the thread we just switched away from.
1683 * finish_task_switch must be called after the context switch, paired
1684 * with a prepare_task_switch call before the context switch.
1685 * finish_task_switch will reconcile locking set up by prepare_task_switch,
1686 * and do any other architecture-specific cleanup actions.
1688 * Note that we may have delayed dropping an mm in context_switch(). If
1689 * so, we finish that here outside of the runqueue lock. (Doing it
1690 * with the lock held can cause deadlocks; see schedule() for
1691 * details.)
1693 static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
1694 __releases(rq->lock)
1696 struct mm_struct *mm = rq->prev_mm;
1697 long prev_state;
1699 rq->prev_mm = NULL;
1702 * A task struct has one reference for the use as "current".
1703 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
1704 * schedule one last time. The schedule call will never return, and
1705 * the scheduled task must drop that reference.
1706 * The test for TASK_DEAD must occur while the runqueue locks are
1707 * still held, otherwise prev could be scheduled on another cpu, die
1708 * there before we look at prev->state, and then the reference would
1709 * be dropped twice.
1710 * Manfred Spraul <manfred@colorfullife.com>
1712 prev_state = prev->state;
1713 finish_arch_switch(prev);
1714 finish_lock_switch(rq, prev);
1715 if (mm)
1716 mmdrop(mm);
1717 if (unlikely(prev_state == TASK_DEAD)) {
1719 * Remove function-return probe instances associated with this
1720 * task and put them back on the free list.
1722 kprobe_flush_task(prev);
1723 put_task_struct(prev);
1728 * schedule_tail - first thing a freshly forked thread must call.
1729 * @prev: the thread we just switched away from.
1731 asmlinkage void schedule_tail(struct task_struct *prev)
1732 __releases(rq->lock)
1734 struct rq *rq = this_rq();
1736 finish_task_switch(rq, prev);
1737 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
1738 /* In this case, finish_task_switch does not reenable preemption */
1739 preempt_enable();
1740 #endif
1741 if (current->set_child_tid)
1742 put_user(current->pid, current->set_child_tid);
1746 * context_switch - switch to the new MM and the new
1747 * thread's register state.
1749 static inline void
1750 context_switch(struct rq *rq, struct task_struct *prev,
1751 struct task_struct *next)
1753 struct mm_struct *mm, *oldmm;
1755 prepare_task_switch(rq, next);
1756 mm = next->mm;
1757 oldmm = prev->active_mm;
1759 * For paravirt, this is coupled with an exit in switch_to to
1760 * combine the page table reload and the switch backend into
1761 * one hypercall.
1763 arch_enter_lazy_cpu_mode();
1765 if (unlikely(!mm)) {
1766 next->active_mm = oldmm;
1767 atomic_inc(&oldmm->mm_count);
1768 enter_lazy_tlb(oldmm, next);
1769 } else
1770 switch_mm(oldmm, mm, next);
1772 if (unlikely(!prev->mm)) {
1773 prev->active_mm = NULL;
1774 rq->prev_mm = oldmm;
1777 * Since the runqueue lock will be released by the next task
1778 * (which is an invalid locking op, but in the case of the
1779 * scheduler it's an obvious special case), we do an early
1780 * lockdep release here:
1782 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
1783 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
1784 #endif
1786 /* Here we just switch the register state and the stack. */
1787 switch_to(prev, next, prev);
1789 barrier();
1791 * this_rq must be evaluated again because prev may have moved
1792 * CPUs since it called schedule(), thus the 'rq' on its stack
1793 * frame will be invalid.
1795 finish_task_switch(this_rq(), prev);
1799 * nr_running, nr_uninterruptible and nr_context_switches:
1801 * externally visible scheduler statistics: current number of runnable
1802 * threads, current number of uninterruptible-sleeping threads, total
1803 * number of context switches performed since bootup.
1805 unsigned long nr_running(void)
1807 unsigned long i, sum = 0;
1809 for_each_online_cpu(i)
1810 sum += cpu_rq(i)->nr_running;
1812 return sum;
1815 unsigned long nr_uninterruptible(void)
1817 unsigned long i, sum = 0;
1819 for_each_possible_cpu(i)
1820 sum += cpu_rq(i)->nr_uninterruptible;
1823 * Since we read the counters locklessly, the sum might be slightly
1824 * inaccurate. Do not allow it to go below zero though:
1826 if (unlikely((long)sum < 0))
1827 sum = 0;
1829 return sum;
1832 unsigned long long nr_context_switches(void)
1834 int i;
1835 unsigned long long sum = 0;
1837 for_each_possible_cpu(i)
1838 sum += cpu_rq(i)->nr_switches;
1840 return sum;
1843 unsigned long nr_iowait(void)
1845 unsigned long i, sum = 0;
1847 for_each_possible_cpu(i)
1848 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1850 return sum;
1853 unsigned long nr_active(void)
1855 unsigned long i, running = 0, uninterruptible = 0;
1857 for_each_online_cpu(i) {
1858 running += cpu_rq(i)->nr_running;
1859 uninterruptible += cpu_rq(i)->nr_uninterruptible;
1862 if (unlikely((long)uninterruptible < 0))
1863 uninterruptible = 0;
1865 return running + uninterruptible;
1869 * Update rq->cpu_load[] statistics. This function is usually called every
1870 * scheduler tick (TICK_NSEC).
1872 static void update_cpu_load(struct rq *this_rq)
1874 u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64;
1875 unsigned long total_load = this_rq->ls.load.weight;
1876 unsigned long this_load = total_load;
1877 struct load_stat *ls = &this_rq->ls;
1878 u64 now = __rq_clock(this_rq);
1879 int i, scale;
1881 this_rq->nr_load_updates++;
1882 if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD)))
1883 goto do_avg;
1885 /* Update delta_fair/delta_exec fields first */
1886 update_curr_load(this_rq, now);
1888 fair_delta64 = ls->delta_fair + 1;
1889 ls->delta_fair = 0;
1891 exec_delta64 = ls->delta_exec + 1;
1892 ls->delta_exec = 0;
1894 sample_interval64 = now - ls->load_update_last;
1895 ls->load_update_last = now;
1897 if ((s64)sample_interval64 < (s64)TICK_NSEC)
1898 sample_interval64 = TICK_NSEC;
1900 if (exec_delta64 > sample_interval64)
1901 exec_delta64 = sample_interval64;
1903 idle_delta64 = sample_interval64 - exec_delta64;
1905 tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64);
1906 tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64);
1908 this_load = (unsigned long)tmp64;
1910 do_avg:
1912 /* Update our load: */
1913 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
1914 unsigned long old_load, new_load;
1916 /* scale is effectively 1 << i now, and >> i divides by scale */
1918 old_load = this_rq->cpu_load[i];
1919 new_load = this_load;
1921 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
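/*
 * Worked example of the decay above (illustrative): for index i the update
 * is cpu_load[i] = (old * (2^i - 1) + new) / 2^i, so index 0 tracks the
 * instantaneous load and higher indices react more slowly.  Starting from
 * old == 0 with a constant new load of 1024:
 *
 *	i == 0: 1024 after the first tick		(no history)
 *	i == 1:  512, 768, 896, ... -> 1024		(fast decay)
 *	i == 3:  128, 240, 338, ... -> 1024		(much slower)
 *
 * These different horizons are what source_load()/target_load() select
 * via their 'type' argument.
 */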
1925 #ifdef CONFIG_SMP
1928 * double_rq_lock - safely lock two runqueues
1930 * Note this does not disable interrupts like task_rq_lock,
1931 * you need to do so manually before calling.
1933 static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1934 __acquires(rq1->lock)
1935 __acquires(rq2->lock)
1937 BUG_ON(!irqs_disabled());
1938 if (rq1 == rq2) {
1939 spin_lock(&rq1->lock);
1940 __acquire(rq2->lock); /* Fake it out ;) */
1941 } else {
1942 if (rq1 < rq2) {
1943 spin_lock(&rq1->lock);
1944 spin_lock(&rq2->lock);
1945 } else {
1946 spin_lock(&rq2->lock);
1947 spin_lock(&rq1->lock);
1953 * double_rq_unlock - safely unlock two runqueues
1955 * Note this does not restore interrupts like task_rq_unlock,
1956 * you need to do so manually after calling.
1958 static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1959 __releases(rq1->lock)
1960 __releases(rq2->lock)
1962 spin_unlock(&rq1->lock);
1963 if (rq1 != rq2)
1964 spin_unlock(&rq2->lock);
1965 else
1966 __release(rq2->lock);
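/*
 * Usage sketch (illustrative): the address ordering in double_rq_lock() is
 * what makes the "ascending &runqueue" rule from the struct rq comment
 * deadlock-free - every path that needs two runqueue locks takes the
 * lower-addressed one first, so two CPUs can never each hold the lock the
 * other one is waiting for:
 *
 *	local_irq_save(flags);
 *	double_rq_lock(this_rq, busiest);
 *	... move tasks between the two runqueues ...
 *	double_rq_unlock(this_rq, busiest);
 *	local_irq_restore(flags);
 */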
1970 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1972 static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
1973 __releases(this_rq->lock)
1974 __acquires(busiest->lock)
1975 __acquires(this_rq->lock)
1977 if (unlikely(!irqs_disabled())) {
1978 /* printk() doesn't work well under rq->lock */
1979 spin_unlock(&this_rq->lock);
1980 BUG_ON(1);
1982 if (unlikely(!spin_trylock(&busiest->lock))) {
1983 if (busiest < this_rq) {
1984 spin_unlock(&this_rq->lock);
1985 spin_lock(&busiest->lock);
1986 spin_lock(&this_rq->lock);
1987 } else
1988 spin_lock(&busiest->lock);
1993 * If dest_cpu is allowed for this process, migrate the task to it.
1994 * This is accomplished by forcing the cpu_allowed mask to only
1995 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
1996 * the cpu_allowed mask is restored.
1998 static void sched_migrate_task(struct task_struct *p, int dest_cpu)
2000 struct migration_req req;
2001 unsigned long flags;
2002 struct rq *rq;
2004 rq = task_rq_lock(p, &flags);
2005 if (!cpu_isset(dest_cpu, p->cpus_allowed)
2006 || unlikely(cpu_is_offline(dest_cpu)))
2007 goto out;
2009 /* force the process onto the specified CPU */
2010 if (migrate_task(p, dest_cpu, &req)) {
2011 /* Need to wait for migration thread (might exit: take ref). */
2012 struct task_struct *mt = rq->migration_thread;
2014 get_task_struct(mt);
2015 task_rq_unlock(rq, &flags);
2016 wake_up_process(mt);
2017 put_task_struct(mt);
2018 wait_for_completion(&req.done);
2020 return;
2022 out:
2023 task_rq_unlock(rq, &flags);
2027 * sched_exec - execve() is a valuable balancing opportunity, because at
2028 * this point the task has the smallest effective memory and cache footprint.
2030 void sched_exec(void)
2032 int new_cpu, this_cpu = get_cpu();
2033 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
2034 put_cpu();
2035 if (new_cpu != this_cpu)
2036 sched_migrate_task(current, new_cpu);
2040 * pull_task - move a task from a remote runqueue to the local runqueue.
2041 * Both runqueues must be locked.
2043 static void pull_task(struct rq *src_rq, struct task_struct *p,
2044 struct rq *this_rq, int this_cpu)
2046 deactivate_task(src_rq, p, 0);
2047 set_task_cpu(p, this_cpu);
2048 activate_task(this_rq, p, 0);
2050 * Note that idle threads have a prio of MAX_PRIO, so this test
2051 * is always true for them.
2053 check_preempt_curr(this_rq, p);
2057 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
2059 static
2060 int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2061 struct sched_domain *sd, enum cpu_idle_type idle,
2062 int *all_pinned)
2065 * We do not migrate tasks that are:
2066 * 1) running (obviously), or
2067 * 2) cannot be migrated to this CPU due to cpus_allowed, or
2068 * 3) are cache-hot on their current CPU.
2070 if (!cpu_isset(this_cpu, p->cpus_allowed))
2071 return 0;
2072 *all_pinned = 0;
2074 if (task_running(rq, p))
2075 return 0;
2078 * Aggressive migration if too many balance attempts have failed:
2080 if (sd->nr_balance_failed > sd->cache_nice_tries)
2081 return 1;
2083 return 1;
2086 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2087 unsigned long max_nr_move, unsigned long max_load_move,
2088 struct sched_domain *sd, enum cpu_idle_type idle,
2089 int *all_pinned, unsigned long *load_moved,
2090 int this_best_prio, int best_prio, int best_prio_seen,
2091 struct rq_iterator *iterator)
2093 int pulled = 0, pinned = 0, skip_for_load;
2094 struct task_struct *p;
2095 long rem_load_move = max_load_move;
2097 if (max_nr_move == 0 || max_load_move == 0)
2098 goto out;
2100 pinned = 1;
2103 * Start the load-balancing iterator:
2105 p = iterator->start(iterator->arg);
2106 next:
2107 if (!p)
2108 goto out;
2110 * To help distribute high priority tasks across CPUs we don't
2111 * skip a task if it will be the highest priority task (i.e. smallest
2112 * prio value) on its new queue regardless of its load weight
2114 skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
2115 SCHED_LOAD_SCALE_FUZZ;
2116 if (skip_for_load && p->prio < this_best_prio)
2117 skip_for_load = !best_prio_seen && p->prio == best_prio;
2118 if (skip_for_load ||
2119 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2121 best_prio_seen |= p->prio == best_prio;
2122 p = iterator->next(iterator->arg);
2123 goto next;
2126 pull_task(busiest, p, this_rq, this_cpu);
2127 pulled++;
2128 rem_load_move -= p->se.load.weight;
2131 * We only want to steal up to the prescribed number of tasks
2132 * and the prescribed amount of weighted load.
2134 if (pulled < max_nr_move && rem_load_move > 0) {
2135 if (p->prio < this_best_prio)
2136 this_best_prio = p->prio;
2137 p = iterator->next(iterator->arg);
2138 goto next;
2140 out:
2142 * Right now, this is the only place pull_task() is called,
2143 * so we can safely collect pull_task() stats here rather than
2144 * inside pull_task().
2146 schedstat_add(sd, lb_gained[idle], pulled);
2148 if (all_pinned)
2149 *all_pinned = pinned;
2150 *load_moved = max_load_move - rem_load_move;
2151 return pulled;
2155 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
2156 * load from busiest to this_rq, as part of a balancing operation within
2157 * "domain". Returns the number of tasks moved.
2159 * Called with both runqueues locked.
2161 static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2162 unsigned long max_nr_move, unsigned long max_load_move,
2163 struct sched_domain *sd, enum cpu_idle_type idle,
2164 int *all_pinned)
2166 struct sched_class *class = sched_class_highest;
2167 unsigned long load_moved, total_nr_moved = 0, nr_moved;
2168 long rem_load_move = max_load_move;
2170 do {
2171 nr_moved = class->load_balance(this_rq, this_cpu, busiest,
2172 max_nr_move, (unsigned long)rem_load_move,
2173 sd, idle, all_pinned, &load_moved);
2174 total_nr_moved += nr_moved;
2175 max_nr_move -= nr_moved;
2176 rem_load_move -= load_moved;
2177 class = class->next;
2178 } while (class && max_nr_move && rem_load_move > 0);
2180 return total_nr_moved;
2184 * find_busiest_group finds and returns the busiest CPU group within the
2185 * domain. It calculates and returns the amount of weighted load which
2186 * should be moved to restore balance via the imbalance parameter.
2188 static struct sched_group *
2189 find_busiest_group(struct sched_domain *sd, int this_cpu,
2190 unsigned long *imbalance, enum cpu_idle_type idle,
2191 int *sd_idle, cpumask_t *cpus, int *balance)
2193 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2194 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
2195 unsigned long max_pull;
2196 unsigned long busiest_load_per_task, busiest_nr_running;
2197 unsigned long this_load_per_task, this_nr_running;
2198 int load_idx;
2199 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2200 int power_savings_balance = 1;
2201 unsigned long leader_nr_running = 0, min_load_per_task = 0;
2202 unsigned long min_nr_running = ULONG_MAX;
2203 struct sched_group *group_min = NULL, *group_leader = NULL;
2204 #endif
2206 max_load = this_load = total_load = total_pwr = 0;
2207 busiest_load_per_task = busiest_nr_running = 0;
2208 this_load_per_task = this_nr_running = 0;
2209 if (idle == CPU_NOT_IDLE)
2210 load_idx = sd->busy_idx;
2211 else if (idle == CPU_NEWLY_IDLE)
2212 load_idx = sd->newidle_idx;
2213 else
2214 load_idx = sd->idle_idx;
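/*
 * Note: load_idx selects which slot of the per-cpu load history
 * (cpu_load[]) source_load()/target_load() consult below. The
 * busy/newly-idle/idle indices are per-domain tunables that trade
 * responsiveness against resistance to short load spikes.
 */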
2216 do {
2217 unsigned long load, group_capacity;
2218 int local_group;
2219 int i;
2220 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2221 unsigned long sum_nr_running, sum_weighted_load;
2223 local_group = cpu_isset(this_cpu, group->cpumask);
2225 if (local_group)
2226 balance_cpu = first_cpu(group->cpumask);
2228 /* Tally up the load of all CPUs in the group */
2229 sum_weighted_load = sum_nr_running = avg_load = 0;
2231 for_each_cpu_mask(i, group->cpumask) {
2232 struct rq *rq;
2234 if (!cpu_isset(i, *cpus))
2235 continue;
2237 rq = cpu_rq(i);
2239 if (*sd_idle && !idle_cpu(i))
2240 *sd_idle = 0;
2242 /* Bias balancing toward cpus of our domain */
2243 if (local_group) {
2244 if (idle_cpu(i) && !first_idle_cpu) {
2245 first_idle_cpu = 1;
2246 balance_cpu = i;
2249 load = target_load(i, load_idx);
2250 } else
2251 load = source_load(i, load_idx);
2253 avg_load += load;
2254 sum_nr_running += rq->nr_running;
2255 sum_weighted_load += weighted_cpuload(i);
2259 * The first idle cpu, or the first cpu (busiest) in this sched group,
2260 * is eligible for doing load balancing at this domain and the ones
2261 * above it.
2263 if (local_group && balance_cpu != this_cpu && balance) {
2264 *balance = 0;
2265 goto ret;
2268 total_load += avg_load;
2269 total_pwr += group->__cpu_power;
2271 /* Adjust by relative CPU power of the group */
2272 avg_load = sg_div_cpu_power(group,
2273 avg_load * SCHED_LOAD_SCALE);
2275 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
2277 if (local_group) {
2278 this_load = avg_load;
2279 this = group;
2280 this_nr_running = sum_nr_running;
2281 this_load_per_task = sum_weighted_load;
2282 } else if (avg_load > max_load &&
2283 sum_nr_running > group_capacity) {
2284 max_load = avg_load;
2285 busiest = group;
2286 busiest_nr_running = sum_nr_running;
2287 busiest_load_per_task = sum_weighted_load;
2290 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2292 * Busy processors will not participate in power savings
2293 * balance.
2295 if (idle == CPU_NOT_IDLE ||
2296 !(sd->flags & SD_POWERSAVINGS_BALANCE))
2297 goto group_next;
2300 * If the local group is idle or completely loaded, there is
2301 * no need to do power savings balance at this domain.
2303 if (local_group && (this_nr_running >= group_capacity ||
2304 !this_nr_running))
2305 power_savings_balance = 0;
2308 * If a group is already running at full capacity or idle,
2309 * don't include that group in power savings calculations
2311 if (!power_savings_balance || sum_nr_running >= group_capacity
2312 || !sum_nr_running)
2313 goto group_next;
2316 * Calculate the group which has the least non-idle load.
2317 * This is the group from which we need to pick up load
2318 * in order to save power.
2320 if ((sum_nr_running < min_nr_running) ||
2321 (sum_nr_running == min_nr_running &&
2322 first_cpu(group->cpumask) <
2323 first_cpu(group_min->cpumask))) {
2324 group_min = group;
2325 min_nr_running = sum_nr_running;
2326 min_load_per_task = sum_weighted_load /
2327 sum_nr_running;
2331 * Calculate the group which is nearly at its
2332 * capacity but still has some room to pick up load
2333 * from other groups and save more power.
2335 if (sum_nr_running <= group_capacity - 1) {
2336 if (sum_nr_running > leader_nr_running ||
2337 (sum_nr_running == leader_nr_running &&
2338 first_cpu(group->cpumask) >
2339 first_cpu(group_leader->cpumask))) {
2340 group_leader = group;
2341 leader_nr_running = sum_nr_running;
2344 group_next:
2345 #endif
2346 group = group->next;
2347 } while (group != sd->groups);
2349 if (!busiest || this_load >= max_load || busiest_nr_running == 0)
2350 goto out_balanced;
2352 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
2354 if (this_load >= avg_load ||
2355 100*max_load <= sd->imbalance_pct*this_load)
2356 goto out_balanced;
2358 busiest_load_per_task /= busiest_nr_running;
2360 * We're trying to get all the cpus to the average_load, so we don't
2361 * want to push ourselves above the average load, nor do we wish to
2362 * reduce the max loaded cpu below the average load, as either of these
2363 * actions would just result in more rebalancing later, and ping-pong
2364 * tasks around. Thus we look for the minimum possible imbalance.
2365 * Negative imbalances (*we* are more loaded than anyone else) will
2366 * be counted as no imbalance for these purposes -- we can't fix that
2367 * by pulling tasks to us. Be careful of negative numbers as they'll
2368 * appear as very large values with unsigned longs.
2370 if (max_load <= busiest_load_per_task)
2371 goto out_balanced;
2374 * In the presence of smp nice balancing, certain scenarios can have
2375 * max load less than avg load (as we skip groups at or below
2376 * their cpu_power while calculating max_load).
2378 if (max_load < avg_load) {
2379 *imbalance = 0;
2380 goto small_imbalance;
2383 /* Don't want to pull so many tasks that a group would go idle */
2384 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
2386 /* How much load to actually move to equalise the imbalance */
2387 *imbalance = min(max_pull * busiest->__cpu_power,
2388 (avg_load - this_load) * this->__cpu_power)
2389 / SCHED_LOAD_SCALE;
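/*
 * Worked example (illustrative numbers only): with SCHED_LOAD_SCALE of
 * 1024 and two single-cpu groups of __cpu_power == 1024, suppose
 * max_load == 2048, this_load == 1024, avg_load == 1536 and
 * busiest_load_per_task == 1024. Then max_pull == min(512, 1024) == 512
 * and *imbalance == min(512 * 1024, 512 * 1024) / 1024 == 512, i.e. we
 * ask move_tasks() to shift half a nice-0 task's worth of weight.
 */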
2392 * If *imbalance is less than the average load per runnable task,
2393 * there is no guarantee that any tasks will be moved, so we consider
2394 * bumping its value to force at least one task to be
2395 * moved.
2397 if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) {
2398 unsigned long tmp, pwr_now, pwr_move;
2399 unsigned int imbn;
2401 small_imbalance:
2402 pwr_move = pwr_now = 0;
2403 imbn = 2;
2404 if (this_nr_running) {
2405 this_load_per_task /= this_nr_running;
2406 if (busiest_load_per_task > this_load_per_task)
2407 imbn = 1;
2408 } else
2409 this_load_per_task = SCHED_LOAD_SCALE;
2411 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
2412 busiest_load_per_task * imbn) {
2413 *imbalance = busiest_load_per_task;
2414 return busiest;
2418 * OK, we don't have enough imbalance to justify moving tasks,
2419 * however we may be able to increase total CPU power used by
2420 * moving them.
2423 pwr_now += busiest->__cpu_power *
2424 min(busiest_load_per_task, max_load);
2425 pwr_now += this->__cpu_power *
2426 min(this_load_per_task, this_load);
2427 pwr_now /= SCHED_LOAD_SCALE;
2429 /* Amount of load we'd subtract */
2430 tmp = sg_div_cpu_power(busiest,
2431 busiest_load_per_task * SCHED_LOAD_SCALE);
2432 if (max_load > tmp)
2433 pwr_move += busiest->__cpu_power *
2434 min(busiest_load_per_task, max_load - tmp);
2436 /* Amount of load we'd add */
2437 if (max_load * busiest->__cpu_power <
2438 busiest_load_per_task * SCHED_LOAD_SCALE)
2439 tmp = sg_div_cpu_power(this,
2440 max_load * busiest->__cpu_power);
2441 else
2442 tmp = sg_div_cpu_power(this,
2443 busiest_load_per_task * SCHED_LOAD_SCALE);
2444 pwr_move += this->__cpu_power *
2445 min(this_load_per_task, this_load + tmp);
2446 pwr_move /= SCHED_LOAD_SCALE;
2448 /* Move if we gain throughput */
2449 if (pwr_move <= pwr_now)
2450 goto out_balanced;
2452 *imbalance = busiest_load_per_task;
2455 return busiest;
2457 out_balanced:
2458 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2459 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2460 goto ret;
2462 if (this == group_leader && group_leader != group_min) {
2463 *imbalance = min_load_per_task;
2464 return group_min;
2466 #endif
2467 ret:
2468 *imbalance = 0;
2469 return NULL;
2473 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2475 static struct rq *
2476 find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2477 unsigned long imbalance, cpumask_t *cpus)
2479 struct rq *busiest = NULL, *rq;
2480 unsigned long max_load = 0;
2481 int i;
2483 for_each_cpu_mask(i, group->cpumask) {
2484 unsigned long wl;
2486 if (!cpu_isset(i, *cpus))
2487 continue;
2489 rq = cpu_rq(i);
2490 wl = weighted_cpuload(i);
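/*
 * Note: a queue whose single runnable task is heavier than the
 * requested imbalance is skipped below; moving its only task would
 * shift more load than we were asked to, so it is not a useful
 * "busiest" candidate.
 */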
2492 if (rq->nr_running == 1 && wl > imbalance)
2493 continue;
2495 if (wl > max_load) {
2496 max_load = wl;
2497 busiest = rq;
2501 return busiest;
2505 * Max backoff if we encounter pinned tasks. The value is fairly
2506 * arbitrary; it just needs to be large enough.
2508 #define MAX_PINNED_INTERVAL 512
2510 static inline unsigned long minus_1_or_zero(unsigned long n)
2512 return n > 0 ? n - 1 : 0;
2516 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2517 * tasks if there is an imbalance.
2519 static int load_balance(int this_cpu, struct rq *this_rq,
2520 struct sched_domain *sd, enum cpu_idle_type idle,
2521 int *balance)
2523 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2524 struct sched_group *group;
2525 unsigned long imbalance;
2526 struct rq *busiest;
2527 cpumask_t cpus = CPU_MASK_ALL;
2528 unsigned long flags;
2531 * When power savings policy is enabled for the parent domain, idle
2532 * sibling can pick up load irrespective of busy siblings. In this case,
2533 * let the state of idle sibling percolate up as CPU_IDLE, instead of
2534 * portraying it as CPU_NOT_IDLE.
2536 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2537 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2538 sd_idle = 1;
2540 schedstat_inc(sd, lb_cnt[idle]);
2542 redo:
2543 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2544 &cpus, balance);
2546 if (*balance == 0)
2547 goto out_balanced;
2549 if (!group) {
2550 schedstat_inc(sd, lb_nobusyg[idle]);
2551 goto out_balanced;
2554 busiest = find_busiest_queue(group, idle, imbalance, &cpus);
2555 if (!busiest) {
2556 schedstat_inc(sd, lb_nobusyq[idle]);
2557 goto out_balanced;
2560 BUG_ON(busiest == this_rq);
2562 schedstat_add(sd, lb_imbalance[idle], imbalance);
2564 nr_moved = 0;
2565 if (busiest->nr_running > 1) {
2567 * Attempt to move tasks. If find_busiest_group has found
2568 * an imbalance but busiest->nr_running <= 1, the group is
2569 * still unbalanced. nr_moved simply stays zero, so it is
2570 * correctly treated as an imbalance.
2572 local_irq_save(flags);
2573 double_rq_lock(this_rq, busiest);
2574 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2575 minus_1_or_zero(busiest->nr_running),
2576 imbalance, sd, idle, &all_pinned);
2577 double_rq_unlock(this_rq, busiest);
2578 local_irq_restore(flags);
2581 * some other cpu did the load balance for us.
2583 if (nr_moved && this_cpu != smp_processor_id())
2584 resched_cpu(this_cpu);
2586 /* All tasks on this runqueue were pinned by CPU affinity */
2587 if (unlikely(all_pinned)) {
2588 cpu_clear(cpu_of(busiest), cpus);
2589 if (!cpus_empty(cpus))
2590 goto redo;
2591 goto out_balanced;
2595 if (!nr_moved) {
2596 schedstat_inc(sd, lb_failed[idle]);
2597 sd->nr_balance_failed++;
2599 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
2601 spin_lock_irqsave(&busiest->lock, flags);
2603 /* don't kick the migration_thread if the curr
2604 * task on the busiest cpu can't be moved to this_cpu
2606 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
2607 spin_unlock_irqrestore(&busiest->lock, flags);
2608 all_pinned = 1;
2609 goto out_one_pinned;
2612 if (!busiest->active_balance) {
2613 busiest->active_balance = 1;
2614 busiest->push_cpu = this_cpu;
2615 active_balance = 1;
2617 spin_unlock_irqrestore(&busiest->lock, flags);
2618 if (active_balance)
2619 wake_up_process(busiest->migration_thread);
2622 * We've kicked active balancing, so reset the failure
2623 * counter.
2625 sd->nr_balance_failed = sd->cache_nice_tries+1;
2627 } else
2628 sd->nr_balance_failed = 0;
2630 if (likely(!active_balance)) {
2631 /* We were unbalanced, so reset the balancing interval */
2632 sd->balance_interval = sd->min_interval;
2633 } else {
2635 * If we've begun active balancing, start to back off. This
2636 * case may not be covered by the all_pinned logic if there
2637 * is only 1 task on the busy runqueue (because we don't call
2638 * move_tasks).
2640 if (sd->balance_interval < sd->max_interval)
2641 sd->balance_interval *= 2;
2644 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2645 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2646 return -1;
2647 return nr_moved;
2649 out_balanced:
2650 schedstat_inc(sd, lb_balanced[idle]);
2652 sd->nr_balance_failed = 0;
2654 out_one_pinned:
2655 /* tune up the balancing interval */
2656 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
2657 (sd->balance_interval < sd->max_interval))
2658 sd->balance_interval *= 2;
2660 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2661 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2662 return -1;
2663 return 0;
2667 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2668 * tasks if there is an imbalance.
2670 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
2671 * this_rq is locked.
2673 static int
2674 load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2676 struct sched_group *group;
2677 struct rq *busiest = NULL;
2678 unsigned long imbalance;
2679 int nr_moved = 0;
2680 int sd_idle = 0;
2681 cpumask_t cpus = CPU_MASK_ALL;
2684 * When power savings policy is enabled for the parent domain, idle
2685 * sibling can pick up load irrespective of busy siblings. In this case,
2686 * let the state of idle sibling percolate up as CPU_IDLE, instead of
2687 * portraying it as CPU_NOT_IDLE.
2689 if (sd->flags & SD_SHARE_CPUPOWER &&
2690 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2691 sd_idle = 1;
2693 schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]);
2694 redo:
2695 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
2696 &sd_idle, &cpus, NULL);
2697 if (!group) {
2698 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
2699 goto out_balanced;
2702 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance,
2703 &cpus);
2704 if (!busiest) {
2705 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
2706 goto out_balanced;
2709 BUG_ON(busiest == this_rq);
2711 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
2713 nr_moved = 0;
2714 if (busiest->nr_running > 1) {
2715 /* Attempt to move tasks */
2716 double_lock_balance(this_rq, busiest);
2717 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2718 minus_1_or_zero(busiest->nr_running),
2719 imbalance, sd, CPU_NEWLY_IDLE, NULL);
2720 spin_unlock(&busiest->lock);
2722 if (!nr_moved) {
2723 cpu_clear(cpu_of(busiest), cpus);
2724 if (!cpus_empty(cpus))
2725 goto redo;
2729 if (!nr_moved) {
2730 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
2731 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2732 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2733 return -1;
2734 } else
2735 sd->nr_balance_failed = 0;
2737 return nr_moved;
2739 out_balanced:
2740 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
2741 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2742 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2743 return -1;
2744 sd->nr_balance_failed = 0;
2746 return 0;
2750 * idle_balance is called by schedule() if this_cpu is about to become
2751 * idle. Attempts to pull tasks from other CPUs.
2753 static void idle_balance(int this_cpu, struct rq *this_rq)
2755 struct sched_domain *sd;
2756 int pulled_task = -1;
2757 unsigned long next_balance = jiffies + HZ;
2759 for_each_domain(this_cpu, sd) {
2760 unsigned long interval;
2762 if (!(sd->flags & SD_LOAD_BALANCE))
2763 continue;
2765 if (sd->flags & SD_BALANCE_NEWIDLE)
2766 /* If we've pulled tasks over, stop searching: */
2767 pulled_task = load_balance_newidle(this_cpu,
2768 this_rq, sd);
2770 interval = msecs_to_jiffies(sd->balance_interval);
2771 if (time_after(next_balance, sd->last_balance + interval))
2772 next_balance = sd->last_balance + interval;
2773 if (pulled_task)
2774 break;
2776 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
2778 * We are going idle. next_balance may be set based on
2779 * a busy processor. So reset next_balance.
2781 this_rq->next_balance = next_balance;
2786 * active_load_balance is run by migration threads. It pushes running tasks
2787 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
2788 * running on each physical CPU where possible, and avoids physical /
2789 * logical imbalances.
2791 * Called with busiest_rq locked.
2793 static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
2795 int target_cpu = busiest_rq->push_cpu;
2796 struct sched_domain *sd;
2797 struct rq *target_rq;
2799 /* Is there any task to move? */
2800 if (busiest_rq->nr_running <= 1)
2801 return;
2803 target_rq = cpu_rq(target_cpu);
2806 * This condition is "impossible"; if it occurs
2807 * we need to fix it. Originally reported by
2808 * Bjorn Helgaas on a 128-cpu setup.
2810 BUG_ON(busiest_rq == target_rq);
2812 /* move a task from busiest_rq to target_rq */
2813 double_lock_balance(busiest_rq, target_rq);
2815 /* Search for an sd spanning us and the target CPU. */
2816 for_each_domain(target_cpu, sd) {
2817 if ((sd->flags & SD_LOAD_BALANCE) &&
2818 cpu_isset(busiest_cpu, sd->span))
2819 break;
2822 if (likely(sd)) {
2823 schedstat_inc(sd, alb_cnt);
2825 if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
2826 RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE,
2827 NULL))
2828 schedstat_inc(sd, alb_pushed);
2829 else
2830 schedstat_inc(sd, alb_failed);
2832 spin_unlock(&target_rq->lock);
2835 #ifdef CONFIG_NO_HZ
2836 static struct {
2837 atomic_t load_balancer;
2838 cpumask_t cpu_mask;
2839 } nohz ____cacheline_aligned = {
2840 .load_balancer = ATOMIC_INIT(-1),
2841 .cpu_mask = CPU_MASK_NONE,
2845 * This routine will try to nominate the ilb (idle load balancing)
2846 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
2847 * load balancing on behalf of all those cpus. If all the cpus in the system
2848 * go into this tickless mode, then there will be no ilb owner (as there is
2849 * no need for one) and all the cpus will sleep till the next wakeup event
2850 * arrives...
2852 * For the ilb owner the tick is not stopped, and it is used
2853 * for idle load balancing. The ilb owner remains part of
2854 * nohz.cpu_mask.
2856 * While stopping the tick, this cpu becomes the ilb owner if there
2857 * is no other owner, and remains the owner until it becomes busy
2858 * or until all cpus in the system stop their ticks, at which point
2859 * no ilb owner is needed.
2861 * When the ilb owner becomes busy, it nominates another owner, during the
2862 * next busy scheduler_tick()
2864 int select_nohz_load_balancer(int stop_tick)
2866 int cpu = smp_processor_id();
2868 if (stop_tick) {
2869 cpu_set(cpu, nohz.cpu_mask);
2870 cpu_rq(cpu)->in_nohz_recently = 1;
2873 * If we are going offline and still the leader, give up!
2875 if (cpu_is_offline(cpu) &&
2876 atomic_read(&nohz.load_balancer) == cpu) {
2877 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2878 BUG();
2879 return 0;
2882 /* time for ilb owner also to sleep */
2883 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
2884 if (atomic_read(&nohz.load_balancer) == cpu)
2885 atomic_set(&nohz.load_balancer, -1);
2886 return 0;
2889 if (atomic_read(&nohz.load_balancer) == -1) {
2890 /* make me the ilb owner */
2891 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
2892 return 1;
2893 } else if (atomic_read(&nohz.load_balancer) == cpu)
2894 return 1;
2895 } else {
2896 if (!cpu_isset(cpu, nohz.cpu_mask))
2897 return 0;
2899 cpu_clear(cpu, nohz.cpu_mask);
2901 if (atomic_read(&nohz.load_balancer) == cpu)
2902 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2903 BUG();
2905 return 0;
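/*
 * Usage sketch (the caller lives outside this file): the nohz tick code
 * is expected to call select_nohz_load_balancer(1) just before stopping
 * the tick on an idle cpu and select_nohz_load_balancer(0) when the
 * tick is restarted; a return value of 1 tells the caller to keep its
 * tick running so this cpu can act as the idle load balancer.
 */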
2907 #endif
2909 static DEFINE_SPINLOCK(balancing);
2912 * It checks each scheduling domain to see if it is due to be balanced,
2913 * and initiates a balancing operation if so.
2915 * Balancing parameters are set up in arch_init_sched_domains.
2917 static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
2919 int balance = 1;
2920 struct rq *rq = cpu_rq(cpu);
2921 unsigned long interval;
2922 struct sched_domain *sd;
2923 /* Earliest time when we have to do rebalance again */
2924 unsigned long next_balance = jiffies + 60*HZ;
2926 for_each_domain(cpu, sd) {
2927 if (!(sd->flags & SD_LOAD_BALANCE))
2928 continue;
2930 interval = sd->balance_interval;
2931 if (idle != CPU_IDLE)
2932 interval *= sd->busy_factor;
2934 /* scale ms to jiffies */
2935 interval = msecs_to_jiffies(interval);
2936 if (unlikely(!interval))
2937 interval = 1;
2938 if (interval > HZ*NR_CPUS/10)
2939 interval = HZ*NR_CPUS/10;
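/*
 * Illustrative example (values depend on the domain setup): a domain
 * with balance_interval == 64 (msecs) and busy_factor == 32 would be
 * balanced roughly every 2 seconds while the cpu is busy, but every
 * 64ms once it goes idle, since the busy_factor scaling above only
 * applies when idle != CPU_IDLE.
 */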
2942 if (sd->flags & SD_SERIALIZE) {
2943 if (!spin_trylock(&balancing))
2944 goto out;
2947 if (time_after_eq(jiffies, sd->last_balance + interval)) {
2948 if (load_balance(cpu, rq, sd, idle, &balance)) {
2950 * We've pulled tasks over so either we're no
2951 * longer idle, or one of our SMT siblings is
2952 * not idle.
2954 idle = CPU_NOT_IDLE;
2956 sd->last_balance = jiffies;
2958 if (sd->flags & SD_SERIALIZE)
2959 spin_unlock(&balancing);
2960 out:
2961 if (time_after(next_balance, sd->last_balance + interval))
2962 next_balance = sd->last_balance + interval;
2965 * Stop the load balance at this level. There is another
2966 * CPU in our sched group which is doing load balancing more
2967 * actively.
2969 if (!balance)
2970 break;
2972 rq->next_balance = next_balance;
2976 * run_rebalance_domains is triggered when needed from the scheduler tick.
2977 * In the CONFIG_NO_HZ case, the idle load balance owner will do the
2978 * rebalancing for all the cpus for which scheduler ticks are stopped.
2980 static void run_rebalance_domains(struct softirq_action *h)
2982 int this_cpu = smp_processor_id();
2983 struct rq *this_rq = cpu_rq(this_cpu);
2984 enum cpu_idle_type idle = this_rq->idle_at_tick ?
2985 CPU_IDLE : CPU_NOT_IDLE;
2987 rebalance_domains(this_cpu, idle);
2989 #ifdef CONFIG_NO_HZ
2991 * If this cpu is the owner for idle load balancing, then do the
2992 * balancing on behalf of the other idle cpus whose ticks are
2993 * stopped.
2995 if (this_rq->idle_at_tick &&
2996 atomic_read(&nohz.load_balancer) == this_cpu) {
2997 cpumask_t cpus = nohz.cpu_mask;
2998 struct rq *rq;
2999 int balance_cpu;
3001 cpu_clear(this_cpu, cpus);
3002 for_each_cpu_mask(balance_cpu, cpus) {
3004 * If this cpu gets work to do, stop the load balancing
3005 * work being done for other cpus. Next load
3006 * balancing owner will pick it up.
3008 if (need_resched())
3009 break;
3011 rebalance_domains(balance_cpu, CPU_IDLE);
3013 rq = cpu_rq(balance_cpu);
3014 if (time_after(this_rq->next_balance, rq->next_balance))
3015 this_rq->next_balance = rq->next_balance;
3018 #endif
3022 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3024 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3025 * idle load balancing owner or decide to stop the periodic load balancing,
3026 * if the whole system is idle.
3028 static inline void trigger_load_balance(struct rq *rq, int cpu)
3030 #ifdef CONFIG_NO_HZ
3032 * If we were in nohz mode recently and are busy at the current
3033 * scheduler tick, then check if we need to nominate a new idle
3034 * load balancer.
3036 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3037 rq->in_nohz_recently = 0;
3039 if (atomic_read(&nohz.load_balancer) == cpu) {
3040 cpu_clear(cpu, nohz.cpu_mask);
3041 atomic_set(&nohz.load_balancer, -1);
3044 if (atomic_read(&nohz.load_balancer) == -1) {
3046 * simple selection for now: Nominate the
3047 * first cpu in the nohz list to be the next
3048 * ilb owner.
3050 * TBD: Traverse the sched domains and nominate
3051 * the nearest cpu in the nohz.cpu_mask.
3053 int ilb = first_cpu(nohz.cpu_mask);
3055 if (ilb != NR_CPUS)
3056 resched_cpu(ilb);
3061 * If this cpu is idle and doing idle load balancing for all the
3062 * cpus with ticks stopped, is it time for that to stop?
3064 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3065 cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
3066 resched_cpu(cpu);
3067 return;
3071 * If this cpu is idle and the idle load balancing is done by
3072 * someone else, then there is no need to raise SCHED_SOFTIRQ.
3074 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3075 cpu_isset(cpu, nohz.cpu_mask))
3076 return;
3077 #endif
3078 if (time_after_eq(jiffies, rq->next_balance))
3079 raise_softirq(SCHED_SOFTIRQ);
3082 #else /* CONFIG_SMP */
3085 * on UP we do not need to balance between CPUs:
3087 static inline void idle_balance(int cpu, struct rq *rq)
3091 /* Avoid "used but not defined" warning on UP */
3092 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3093 unsigned long max_nr_move, unsigned long max_load_move,
3094 struct sched_domain *sd, enum cpu_idle_type idle,
3095 int *all_pinned, unsigned long *load_moved,
3096 int this_best_prio, int best_prio, int best_prio_seen,
3097 struct rq_iterator *iterator)
3099 *load_moved = 0;
3101 return 0;
3104 #endif
3106 DEFINE_PER_CPU(struct kernel_stat, kstat);
3108 EXPORT_PER_CPU_SYMBOL(kstat);
3111 * Return p->sum_exec_runtime plus any more ns on the sched_clock
3112 * that have not yet been banked in case the task is currently running.
3114 unsigned long long task_sched_runtime(struct task_struct *p)
3116 unsigned long flags;
3117 u64 ns, delta_exec;
3118 struct rq *rq;
3120 rq = task_rq_lock(p, &flags);
3121 ns = p->se.sum_exec_runtime;
3122 if (rq->curr == p) {
3123 delta_exec = rq_clock(rq) - p->se.exec_start;
3124 if ((s64)delta_exec > 0)
3125 ns += delta_exec;
3127 task_rq_unlock(rq, &flags);
3129 return ns;
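/*
 * Note: the delta_exec term above covers the window between the last
 * update of se.sum_exec_runtime and "now", so a caller sampling a task
 * that is currently on a cpu still sees its runtime advance smoothly
 * instead of in scheduler-tick sized steps.
 */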
3133 * Account user cpu time to a process.
3134 * @p: the process that the cpu time gets accounted to
3136 * @cputime: the cpu time spent in user space since the last update
3138 void account_user_time(struct task_struct *p, cputime_t cputime)
3140 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3141 cputime64_t tmp;
3143 p->utime = cputime_add(p->utime, cputime);
3145 /* Add user time to cpustat. */
3146 tmp = cputime_to_cputime64(cputime);
3147 if (TASK_NICE(p) > 0)
3148 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3149 else
3150 cpustat->user = cputime64_add(cpustat->user, tmp);
3154 * Account system cpu time to a process.
3155 * @p: the process that the cpu time gets accounted to
3156 * @hardirq_offset: the offset to subtract from hardirq_count()
3157 * @cputime: the cpu time spent in kernel space since the last update
3159 void account_system_time(struct task_struct *p, int hardirq_offset,
3160 cputime_t cputime)
3162 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3163 struct rq *rq = this_rq();
3164 cputime64_t tmp;
3166 p->stime = cputime_add(p->stime, cputime);
3168 /* Add system time to cpustat. */
3169 tmp = cputime_to_cputime64(cputime);
3170 if (hardirq_count() - hardirq_offset)
3171 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3172 else if (softirq_count())
3173 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3174 else if (p != rq->idle)
3175 cpustat->system = cputime64_add(cpustat->system, tmp);
3176 else if (atomic_read(&rq->nr_iowait) > 0)
3177 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3178 else
3179 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3180 /* Account for system time used */
3181 acct_update_integrals(p);
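/*
 * Note on the classification ladder above: the tick passes a
 * hardirq_offset so that its own interrupt frame is not counted as irq
 * time. A jiffy is charged to cpustat->irq only when it lands inside
 * some other hard interrupt, to softirq while softirqs are being
 * processed, to system for ordinary kernel work, and for the idle task
 * it is split between iowait and idle depending on whether tasks are
 * blocked on I/O.
 */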
3185 * Account for involuntary wait time.
3186 * @p: the process from which the cpu time has been stolen
3187 * @steal: the cpu time spent in involuntary wait
3189 void account_steal_time(struct task_struct *p, cputime_t steal)
3191 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3192 cputime64_t tmp = cputime_to_cputime64(steal);
3193 struct rq *rq = this_rq();
3195 if (p == rq->idle) {
3196 p->stime = cputime_add(p->stime, steal);
3197 if (atomic_read(&rq->nr_iowait) > 0)
3198 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3199 else
3200 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3201 } else
3202 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3206 * This function gets called by the timer code, with HZ frequency.
3207 * We call it with interrupts disabled.
3209 * It also gets called by the fork code, when changing the parent's
3210 * timeslices.
3212 void scheduler_tick(void)
3214 int cpu = smp_processor_id();
3215 struct rq *rq = cpu_rq(cpu);
3216 struct task_struct *curr = rq->curr;
3218 spin_lock(&rq->lock);
3219 if (curr != rq->idle) /* FIXME: needed? */
3220 curr->sched_class->task_tick(rq, curr);
3221 update_cpu_load(rq);
3222 spin_unlock(&rq->lock);
3224 #ifdef CONFIG_SMP
3225 rq->idle_at_tick = idle_cpu(cpu);
3226 trigger_load_balance(rq, cpu);
3227 #endif
3230 #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
3232 void fastcall add_preempt_count(int val)
3235 * Underflow?
3237 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3238 return;
3239 preempt_count() += val;
3241 * Spinlock count overflowing soon?
3243 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3244 PREEMPT_MASK - 10);
3246 EXPORT_SYMBOL(add_preempt_count);
3248 void fastcall sub_preempt_count(int val)
3251 * Underflow?
3253 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3254 return;
3256 * Is the spinlock portion underflowing?
3258 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3259 !(preempt_count() & PREEMPT_MASK)))
3260 return;
3262 preempt_count() -= val;
3264 EXPORT_SYMBOL(sub_preempt_count);
3266 #endif
3269 * Print scheduling while atomic bug:
3271 static noinline void __schedule_bug(struct task_struct *prev)
3273 printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n",
3274 prev->comm, preempt_count(), prev->pid);
3275 debug_show_held_locks(prev);
3276 if (irqs_disabled())
3277 print_irqtrace_events(prev);
3278 dump_stack();
3282 * Various schedule()-time debugging checks and statistics:
3284 static inline void schedule_debug(struct task_struct *prev)
3287 * Test if we are atomic. Since do_exit() needs to call into
3288 * schedule() atomically, we ignore that path for now.
3289 * Otherwise, whine if we are scheduling when we should not be.
3291 if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state))
3292 __schedule_bug(prev);
3294 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3296 schedstat_inc(this_rq(), sched_cnt);
3300 * Pick up the highest-prio task:
3302 static inline struct task_struct *
3303 pick_next_task(struct rq *rq, struct task_struct *prev, u64 now)
3305 struct sched_class *class;
3306 struct task_struct *p;
3309 * Optimization: we know that if all tasks are in
3310 * the fair class we can call that function directly:
3312 if (likely(rq->nr_running == rq->cfs.nr_running)) {
3313 p = fair_sched_class.pick_next_task(rq, now);
3314 if (likely(p))
3315 return p;
3318 class = sched_class_highest;
3319 for ( ; ; ) {
3320 p = class->pick_next_task(rq, now);
3321 if (p)
3322 return p;
3324 * Will never be NULL as the idle class always
3325 * returns a non-NULL p:
3327 class = class->next;
3332 * schedule() is the main scheduler function.
3334 asmlinkage void __sched schedule(void)
3336 struct task_struct *prev, *next;
3337 long *switch_count;
3338 struct rq *rq;
3339 u64 now;
3340 int cpu;
3342 need_resched:
3343 preempt_disable();
3344 cpu = smp_processor_id();
3345 rq = cpu_rq(cpu);
3346 rcu_qsctr_inc(cpu);
3347 prev = rq->curr;
3348 switch_count = &prev->nivcsw;
3350 release_kernel_lock(prev);
3351 need_resched_nonpreemptible:
3353 schedule_debug(prev);
3355 spin_lock_irq(&rq->lock);
3356 clear_tsk_need_resched(prev);
3358 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3359 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
3360 unlikely(signal_pending(prev)))) {
3361 prev->state = TASK_RUNNING;
3362 } else {
3363 deactivate_task(rq, prev, 1);
3365 switch_count = &prev->nvcsw;
3368 if (unlikely(!rq->nr_running))
3369 idle_balance(cpu, rq);
3371 now = __rq_clock(rq);
3372 prev->sched_class->put_prev_task(rq, prev, now);
3373 next = pick_next_task(rq, prev, now);
3375 sched_info_switch(prev, next);
3377 if (likely(prev != next)) {
3378 rq->nr_switches++;
3379 rq->curr = next;
3380 ++*switch_count;
3382 context_switch(rq, prev, next); /* unlocks the rq */
3383 } else
3384 spin_unlock_irq(&rq->lock);
3386 if (unlikely(reacquire_kernel_lock(current) < 0)) {
3387 cpu = smp_processor_id();
3388 rq = cpu_rq(cpu);
3389 goto need_resched_nonpreemptible;
3391 preempt_enable_no_resched();
3392 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3393 goto need_resched;
3395 EXPORT_SYMBOL(schedule);
3397 #ifdef CONFIG_PREEMPT
3399 * this is the entry point to schedule() from in-kernel preemption
3400 * off of preempt_enable. Kernel preemptions off of a return from interrupt
3401 * occur in preempt_schedule_irq() below, which calls schedule() directly.
3403 asmlinkage void __sched preempt_schedule(void)
3405 struct thread_info *ti = current_thread_info();
3406 #ifdef CONFIG_PREEMPT_BKL
3407 struct task_struct *task = current;
3408 int saved_lock_depth;
3409 #endif
3411 * If there is a non-zero preempt_count or interrupts are disabled,
3412 * we do not want to preempt the current task. Just return..
3414 if (likely(ti->preempt_count || irqs_disabled()))
3415 return;
3417 need_resched:
3418 add_preempt_count(PREEMPT_ACTIVE);
3420 * We keep the big kernel semaphore locked, but we
3421 * clear ->lock_depth so that schedule() doesn't
3422 * auto-release the semaphore:
3424 #ifdef CONFIG_PREEMPT_BKL
3425 saved_lock_depth = task->lock_depth;
3426 task->lock_depth = -1;
3427 #endif
3428 schedule();
3429 #ifdef CONFIG_PREEMPT_BKL
3430 task->lock_depth = saved_lock_depth;
3431 #endif
3432 sub_preempt_count(PREEMPT_ACTIVE);
3434 /* we could miss a preemption opportunity between schedule and now */
3435 barrier();
3436 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3437 goto need_resched;
3439 EXPORT_SYMBOL(preempt_schedule);
3442 * this is the entry point to schedule() from kernel preemption
3443 * off of irq context.
3444 * Note that this is called and returns with irqs disabled. This
3445 * protects us against recursive calls from irq context.
3447 asmlinkage void __sched preempt_schedule_irq(void)
3449 struct thread_info *ti = current_thread_info();
3450 #ifdef CONFIG_PREEMPT_BKL
3451 struct task_struct *task = current;
3452 int saved_lock_depth;
3453 #endif
3454 /* Catch callers which need to be fixed */
3455 BUG_ON(ti->preempt_count || !irqs_disabled());
3457 need_resched:
3458 add_preempt_count(PREEMPT_ACTIVE);
3460 * We keep the big kernel semaphore locked, but we
3461 * clear ->lock_depth so that schedule() doesn't
3462 * auto-release the semaphore:
3464 #ifdef CONFIG_PREEMPT_BKL
3465 saved_lock_depth = task->lock_depth;
3466 task->lock_depth = -1;
3467 #endif
3468 local_irq_enable();
3469 schedule();
3470 local_irq_disable();
3471 #ifdef CONFIG_PREEMPT_BKL
3472 task->lock_depth = saved_lock_depth;
3473 #endif
3474 sub_preempt_count(PREEMPT_ACTIVE);
3476 /* we could miss a preemption opportunity between schedule and now */
3477 barrier();
3478 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3479 goto need_resched;
3482 #endif /* CONFIG_PREEMPT */
3484 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
3485 void *key)
3487 return try_to_wake_up(curr->private, mode, sync);
3489 EXPORT_SYMBOL(default_wake_function);
3492 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
3493 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
3494 * number) then we wake all the non-exclusive tasks and one exclusive task.
3496 * There are circumstances in which we can try to wake a task which has already
3497 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
3498 * zero in this (rare) case, and we handle it by continuing to scan the queue.
3500 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3501 int nr_exclusive, int sync, void *key)
3503 struct list_head *tmp, *next;
3505 list_for_each_safe(tmp, next, &q->task_list) {
3506 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
3507 unsigned flags = curr->flags;
3509 if (curr->func(curr, mode, sync, key) &&
3510 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
3511 break;
3516 * __wake_up - wake up threads blocked on a waitqueue.
3517 * @q: the waitqueue
3518 * @mode: which threads
3519 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3520 * @key: is directly passed to the wakeup function
3522 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
3523 int nr_exclusive, void *key)
3525 unsigned long flags;
3527 spin_lock_irqsave(&q->lock, flags);
3528 __wake_up_common(q, mode, nr_exclusive, 0, key);
3529 spin_unlock_irqrestore(&q->lock, flags);
3531 EXPORT_SYMBOL(__wake_up);
3534 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3536 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3538 __wake_up_common(q, mode, 1, 0, NULL);
3542 * __wake_up_sync - wake up threads blocked on a waitqueue.
3543 * @q: the waitqueue
3544 * @mode: which threads
3545 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3547 * The sync wakeup differs in that the waker knows that it will schedule
3548 * away soon, so while the target thread will be woken up, it will not
3549 * be migrated to another CPU - ie. the two threads are 'synchronized'
3550 * with each other. This can prevent needless bouncing between CPUs.
3552 * On UP it can prevent extra preemption.
3554 void fastcall
3555 __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3557 unsigned long flags;
3558 int sync = 1;
3560 if (unlikely(!q))
3561 return;
3563 if (unlikely(!nr_exclusive))
3564 sync = 0;
3566 spin_lock_irqsave(&q->lock, flags);
3567 __wake_up_common(q, mode, nr_exclusive, sync, NULL);
3568 spin_unlock_irqrestore(&q->lock, flags);
3570 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
3572 void fastcall complete(struct completion *x)
3574 unsigned long flags;
3576 spin_lock_irqsave(&x->wait.lock, flags);
3577 x->done++;
3578 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3579 1, 0, NULL);
3580 spin_unlock_irqrestore(&x->wait.lock, flags);
3582 EXPORT_SYMBOL(complete);
3584 void fastcall complete_all(struct completion *x)
3586 unsigned long flags;
3588 spin_lock_irqsave(&x->wait.lock, flags);
3589 x->done += UINT_MAX/2;
3590 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3591 0, 0, NULL);
3592 spin_unlock_irqrestore(&x->wait.lock, flags);
3594 EXPORT_SYMBOL(complete_all);
3596 void fastcall __sched wait_for_completion(struct completion *x)
3598 might_sleep();
3600 spin_lock_irq(&x->wait.lock);
3601 if (!x->done) {
3602 DECLARE_WAITQUEUE(wait, current);
3604 wait.flags |= WQ_FLAG_EXCLUSIVE;
3605 __add_wait_queue_tail(&x->wait, &wait);
3606 do {
3607 __set_current_state(TASK_UNINTERRUPTIBLE);
3608 spin_unlock_irq(&x->wait.lock);
3609 schedule();
3610 spin_lock_irq(&x->wait.lock);
3611 } while (!x->done);
3612 __remove_wait_queue(&x->wait, &wait);
3614 x->done--;
3615 spin_unlock_irq(&x->wait.lock);
3617 EXPORT_SYMBOL(wait_for_completion);
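/*
 * Usage sketch (illustrative, not part of this file): a typical
 * waiter/completer pairing looks like
 *
 *	DECLARE_COMPLETION(done);
 *
 *	waiting side:
 *		wait_for_completion(&done);
 *	completing side (another thread or an interrupt handler):
 *		complete(&done);
 *
 * The _timeout/_interruptible variants below follow the same pattern
 * but bound the sleep or let signals break it.
 */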
3619 unsigned long fastcall __sched
3620 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3622 might_sleep();
3624 spin_lock_irq(&x->wait.lock);
3625 if (!x->done) {
3626 DECLARE_WAITQUEUE(wait, current);
3628 wait.flags |= WQ_FLAG_EXCLUSIVE;
3629 __add_wait_queue_tail(&x->wait, &wait);
3630 do {
3631 __set_current_state(TASK_UNINTERRUPTIBLE);
3632 spin_unlock_irq(&x->wait.lock);
3633 timeout = schedule_timeout(timeout);
3634 spin_lock_irq(&x->wait.lock);
3635 if (!timeout) {
3636 __remove_wait_queue(&x->wait, &wait);
3637 goto out;
3639 } while (!x->done);
3640 __remove_wait_queue(&x->wait, &wait);
3642 x->done--;
3643 out:
3644 spin_unlock_irq(&x->wait.lock);
3645 return timeout;
3647 EXPORT_SYMBOL(wait_for_completion_timeout);
3649 int fastcall __sched wait_for_completion_interruptible(struct completion *x)
3651 int ret = 0;
3653 might_sleep();
3655 spin_lock_irq(&x->wait.lock);
3656 if (!x->done) {
3657 DECLARE_WAITQUEUE(wait, current);
3659 wait.flags |= WQ_FLAG_EXCLUSIVE;
3660 __add_wait_queue_tail(&x->wait, &wait);
3661 do {
3662 if (signal_pending(current)) {
3663 ret = -ERESTARTSYS;
3664 __remove_wait_queue(&x->wait, &wait);
3665 goto out;
3667 __set_current_state(TASK_INTERRUPTIBLE);
3668 spin_unlock_irq(&x->wait.lock);
3669 schedule();
3670 spin_lock_irq(&x->wait.lock);
3671 } while (!x->done);
3672 __remove_wait_queue(&x->wait, &wait);
3674 x->done--;
3675 out:
3676 spin_unlock_irq(&x->wait.lock);
3678 return ret;
3680 EXPORT_SYMBOL(wait_for_completion_interruptible);
3682 unsigned long fastcall __sched
3683 wait_for_completion_interruptible_timeout(struct completion *x,
3684 unsigned long timeout)
3686 might_sleep();
3688 spin_lock_irq(&x->wait.lock);
3689 if (!x->done) {
3690 DECLARE_WAITQUEUE(wait, current);
3692 wait.flags |= WQ_FLAG_EXCLUSIVE;
3693 __add_wait_queue_tail(&x->wait, &wait);
3694 do {
3695 if (signal_pending(current)) {
3696 timeout = -ERESTARTSYS;
3697 __remove_wait_queue(&x->wait, &wait);
3698 goto out;
3700 __set_current_state(TASK_INTERRUPTIBLE);
3701 spin_unlock_irq(&x->wait.lock);
3702 timeout = schedule_timeout(timeout);
3703 spin_lock_irq(&x->wait.lock);
3704 if (!timeout) {
3705 __remove_wait_queue(&x->wait, &wait);
3706 goto out;
3708 } while (!x->done);
3709 __remove_wait_queue(&x->wait, &wait);
3711 x->done--;
3712 out:
3713 spin_unlock_irq(&x->wait.lock);
3714 return timeout;
3716 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3719 #define SLEEP_ON_VAR \
3720 unsigned long flags; \
3721 wait_queue_t wait; \
3722 init_waitqueue_entry(&wait, current);
3724 #define SLEEP_ON_HEAD \
3725 spin_lock_irqsave(&q->lock,flags); \
3726 __add_wait_queue(q, &wait); \
3727 spin_unlock(&q->lock);
3729 #define SLEEP_ON_TAIL \
3730 spin_lock_irq(&q->lock); \
3731 __remove_wait_queue(q, &wait); \
3732 spin_unlock_irqrestore(&q->lock, flags);
3734 void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
3736 SLEEP_ON_VAR
3738 current->state = TASK_INTERRUPTIBLE;
3740 SLEEP_ON_HEAD
3741 schedule();
3742 SLEEP_ON_TAIL
3744 EXPORT_SYMBOL(interruptible_sleep_on);
3746 long fastcall __sched
3747 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3749 SLEEP_ON_VAR
3751 current->state = TASK_INTERRUPTIBLE;
3753 SLEEP_ON_HEAD
3754 timeout = schedule_timeout(timeout);
3755 SLEEP_ON_TAIL
3757 return timeout;
3759 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3761 void fastcall __sched sleep_on(wait_queue_head_t *q)
3763 SLEEP_ON_VAR
3765 current->state = TASK_UNINTERRUPTIBLE;
3767 SLEEP_ON_HEAD
3768 schedule();
3769 SLEEP_ON_TAIL
3771 EXPORT_SYMBOL(sleep_on);
3773 long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3775 SLEEP_ON_VAR
3777 current->state = TASK_UNINTERRUPTIBLE;
3779 SLEEP_ON_HEAD
3780 timeout = schedule_timeout(timeout);
3781 SLEEP_ON_TAIL
3783 return timeout;
3786 EXPORT_SYMBOL(sleep_on_timeout);
3788 #ifdef CONFIG_RT_MUTEXES
3791 * rt_mutex_setprio - set the current priority of a task
3792 * @p: task
3793 * @prio: prio value (kernel-internal form)
3795 * This function changes the 'effective' priority of a task. It does
3796 * not touch ->normal_prio like __setscheduler().
3798 * Used by the rt_mutex code to implement priority inheritance logic.
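/*
 * Illustrative scenario: if a SCHED_NORMAL task holds an rt_mutex that
 * a SCHED_FIFO waiter of rt_priority 50 blocks on, the PI code passes
 * the waiter's prio (49) here, so the holder temporarily runs with the
 * RT scheduling class until it releases the lock and its priority is
 * restored.
 */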
3800 void rt_mutex_setprio(struct task_struct *p, int prio)
3802 unsigned long flags;
3803 int oldprio, on_rq;
3804 struct rq *rq;
3805 u64 now;
3807 BUG_ON(prio < 0 || prio > MAX_PRIO);
3809 rq = task_rq_lock(p, &flags);
3810 now = rq_clock(rq);
3812 oldprio = p->prio;
3813 on_rq = p->se.on_rq;
3814 if (on_rq)
3815 dequeue_task(rq, p, 0, now);
3817 if (rt_prio(prio))
3818 p->sched_class = &rt_sched_class;
3819 else
3820 p->sched_class = &fair_sched_class;
3822 p->prio = prio;
3824 if (on_rq) {
3825 enqueue_task(rq, p, 0, now);
3827 * Reschedule if we are currently running on this runqueue and
3828 * our priority decreased, or if we are not currently running on
3829 * this runqueue and our priority is higher than the current's
3831 if (task_running(rq, p)) {
3832 if (p->prio > oldprio)
3833 resched_task(rq->curr);
3834 } else {
3835 check_preempt_curr(rq, p);
3838 task_rq_unlock(rq, &flags);
3841 #endif
3843 void set_user_nice(struct task_struct *p, long nice)
3845 int old_prio, delta, on_rq;
3846 unsigned long flags;
3847 struct rq *rq;
3848 u64 now;
3850 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3851 return;
3853 * We have to be careful: if called from sys_setpriority(),
3854 * the task might be in the middle of scheduling on another CPU.
3856 rq = task_rq_lock(p, &flags);
3857 now = rq_clock(rq);
3859 * The RT priorities are set via sched_setscheduler(), but we still
3860 * allow the 'normal' nice value to be set - although, as expected,
3861 * it won't have any effect on scheduling until the task leaves
3862 * SCHED_FIFO/SCHED_RR:
3864 if (task_has_rt_policy(p)) {
3865 p->static_prio = NICE_TO_PRIO(nice);
3866 goto out_unlock;
3868 on_rq = p->se.on_rq;
3869 if (on_rq) {
3870 dequeue_task(rq, p, 0, now);
3871 dec_load(rq, p, now);
3874 p->static_prio = NICE_TO_PRIO(nice);
3875 set_load_weight(p);
3876 old_prio = p->prio;
3877 p->prio = effective_prio(p);
3878 delta = p->prio - old_prio;
3880 if (on_rq) {
3881 enqueue_task(rq, p, 0, now);
3882 inc_load(rq, p, now);
3884 * If the task increased its priority or is running and
3885 * lowered its priority, then reschedule its CPU:
3887 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3888 resched_task(rq->curr);
3890 out_unlock:
3891 task_rq_unlock(rq, &flags);
3893 EXPORT_SYMBOL(set_user_nice);
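/*
 * Usage sketch (illustrative): kernel threads that want to stay out of
 * the way of normal work commonly lower their own priority right after
 * starting up, e.g. set_user_nice(current, 19); the rq locking and
 * load-weight bookkeeping above are handled internally.
 */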
3896 * can_nice - check if a task can reduce its nice value
3897 * @p: task
3898 * @nice: nice value
3900 int can_nice(const struct task_struct *p, const int nice)
3902 /* convert nice value [19,-20] to rlimit style value [1,40] */
3903 int nice_rlim = 20 - nice;
3905 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
3906 capable(CAP_SYS_NICE));
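/*
 * Example: asking for nice == -5 maps to nice_rlim == 25, so an
 * unprivileged task needs RLIMIT_NICE >= 25 (or CAP_SYS_NICE) for the
 * request to succeed; callers only consult this check when a task
 * tries to lower its nice value.
 */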
3909 #ifdef __ARCH_WANT_SYS_NICE
3912 * sys_nice - change the priority of the current process.
3913 * @increment: priority increment
3915 * sys_setpriority is a more generic, but much slower function that
3916 * does similar things.
3918 asmlinkage long sys_nice(int increment)
3920 long nice, retval;
3923 * Setpriority might change our priority at the same moment.
3924 * We don't have to worry. Conceptually one call occurs first
3925 * and we have a single winner.
3927 if (increment < -40)
3928 increment = -40;
3929 if (increment > 40)
3930 increment = 40;
3932 nice = PRIO_TO_NICE(current->static_prio) + increment;
3933 if (nice < -20)
3934 nice = -20;
3935 if (nice > 19)
3936 nice = 19;
3938 if (increment < 0 && !can_nice(current, nice))
3939 return -EPERM;
3941 retval = security_task_setnice(current, nice);
3942 if (retval)
3943 return retval;
3945 set_user_nice(current, nice);
3946 return 0;
3949 #endif
3952 * task_prio - return the priority value of a given task.
3953 * @p: the task in question.
3955 * This is the priority value as seen by users in /proc.
3956 * RT tasks are offset by -MAX_RT_PRIO (-100), so they are reported as
3957 * -100..-1; normal tasks map onto 0..39, with nice 0 reported as 20.
3959 int task_prio(const struct task_struct *p)
3961 return p->prio - MAX_RT_PRIO;
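/*
 * Example mapping (MAX_RT_PRIO == 100): a SCHED_FIFO task with
 * rt_priority 50 has p->prio == 49 and is reported as -51, while a
 * nice-0 SCHED_NORMAL task (prio 120) is reported as 20.
 */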
3965 * task_nice - return the nice value of a given task.
3966 * @p: the task in question.
3968 int task_nice(const struct task_struct *p)
3970 return TASK_NICE(p);
3972 EXPORT_SYMBOL_GPL(task_nice);
3975 * idle_cpu - is a given cpu idle currently?
3976 * @cpu: the processor in question.
3978 int idle_cpu(int cpu)
3980 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
3984 * idle_task - return the idle task for a given cpu.
3985 * @cpu: the processor in question.
3987 struct task_struct *idle_task(int cpu)
3989 return cpu_rq(cpu)->idle;
3993 * find_process_by_pid - find a process with a matching PID value.
3994 * @pid: the pid in question.
3996 static inline struct task_struct *find_process_by_pid(pid_t pid)
3998 return pid ? find_task_by_pid(pid) : current;
4001 /* Actually do priority change: must hold rq lock. */
4002 static void
4003 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4005 BUG_ON(p->se.on_rq);
4007 p->policy = policy;
4008 switch (p->policy) {
4009 case SCHED_NORMAL:
4010 case SCHED_BATCH:
4011 case SCHED_IDLE:
4012 p->sched_class = &fair_sched_class;
4013 break;
4014 case SCHED_FIFO:
4015 case SCHED_RR:
4016 p->sched_class = &rt_sched_class;
4017 break;
4020 p->rt_priority = prio;
4021 p->normal_prio = normal_prio(p);
4022 /* we are holding p->pi_lock already */
4023 p->prio = rt_mutex_getprio(p);
4024 set_load_weight(p);
4028 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
4029 * @p: the task in question.
4030 * @policy: new policy.
4031 * @param: structure containing the new RT priority.
4033 * NOTE that the task may already be dead.
4035 int sched_setscheduler(struct task_struct *p, int policy,
4036 struct sched_param *param)
4038 int retval, oldprio, oldpolicy = -1, on_rq;
4039 unsigned long flags;
4040 struct rq *rq;
4042 /* may grab non-irq protected spin_locks */
4043 BUG_ON(in_interrupt());
4044 recheck:
4045 /* double check policy once rq lock held */
4046 if (policy < 0)
4047 policy = oldpolicy = p->policy;
4048 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
4049 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4050 policy != SCHED_IDLE)
4051 return -EINVAL;
4053 * Valid priorities for SCHED_FIFO and SCHED_RR are
4054 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
4055 * SCHED_BATCH and SCHED_IDLE is 0.
4057 if (param->sched_priority < 0 ||
4058 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
4059 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
4060 return -EINVAL;
4061 if (rt_policy(policy) != (param->sched_priority != 0))
4062 return -EINVAL;
4065 * Allow unprivileged RT tasks to decrease priority:
4067 if (!capable(CAP_SYS_NICE)) {
4068 if (rt_policy(policy)) {
4069 unsigned long rlim_rtprio;
4071 if (!lock_task_sighand(p, &flags))
4072 return -ESRCH;
4073 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
4074 unlock_task_sighand(p, &flags);
4076 /* can't set/change the rt policy */
4077 if (policy != p->policy && !rlim_rtprio)
4078 return -EPERM;
4080 /* can't increase priority */
4081 if (param->sched_priority > p->rt_priority &&
4082 param->sched_priority > rlim_rtprio)
4083 return -EPERM;
4086 * Like positive nice levels, don't allow tasks to
4087 * move out of SCHED_IDLE either:
4089 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
4090 return -EPERM;
4092 /* can't change other user's priorities */
4093 if ((current->euid != p->euid) &&
4094 (current->euid != p->uid))
4095 return -EPERM;
4098 retval = security_task_setscheduler(p, policy, param);
4099 if (retval)
4100 return retval;
4102 * make sure no PI-waiters arrive (or leave) while we are
4103 * changing the priority of the task:
4105 spin_lock_irqsave(&p->pi_lock, flags);
4107 * To be able to change p->policy safely, the appropriate
4108 * runqueue lock must be held.
4110 rq = __task_rq_lock(p);
4111 /* recheck policy now with rq lock held */
4112 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4113 policy = oldpolicy = -1;
4114 __task_rq_unlock(rq);
4115 spin_unlock_irqrestore(&p->pi_lock, flags);
4116 goto recheck;
4118 on_rq = p->se.on_rq;
4119 if (on_rq)
4120 deactivate_task(rq, p, 0);
4121 oldprio = p->prio;
4122 __setscheduler(rq, p, policy, param->sched_priority);
4123 if (on_rq) {
4124 activate_task(rq, p, 0);
4126 * Reschedule if we are currently running on this runqueue and
4127 * our priority decreased, or if we are not currently running on
4128 * this runqueue and our priority is higher than the current's
4130 if (task_running(rq, p)) {
4131 if (p->prio > oldprio)
4132 resched_task(rq->curr);
4133 } else {
4134 check_preempt_curr(rq, p);
4137 __task_rq_unlock(rq);
4138 spin_unlock_irqrestore(&p->pi_lock, flags);
4140 rt_mutex_adjust_pi(p);
4142 return 0;
4144 EXPORT_SYMBOL_GPL(sched_setscheduler);
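/*
 * Usage sketch (illustrative): in-kernel callers that need a realtime
 * helper thread typically do something like
 *
 *	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
 *
 *	sched_setscheduler(task, SCHED_FIFO, &param);
 *
 * a pattern used by kernel threads that must not be starved by normal
 * load.
 */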
4146 static int
4147 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4149 struct sched_param lparam;
4150 struct task_struct *p;
4151 int retval;
4153 if (!param || pid < 0)
4154 return -EINVAL;
4155 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4156 return -EFAULT;
4158 rcu_read_lock();
4159 retval = -ESRCH;
4160 p = find_process_by_pid(pid);
4161 if (p != NULL)
4162 retval = sched_setscheduler(p, policy, &lparam);
4163 rcu_read_unlock();
4165 return retval;
4169 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
4170 * @pid: the pid in question.
4171 * @policy: new policy.
4172 * @param: structure containing the new RT priority.
4174 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
4175 struct sched_param __user *param)
4177 /* negative values for policy are not valid */
4178 if (policy < 0)
4179 return -EINVAL;
4181 return do_sched_setscheduler(pid, policy, param);
4185 * sys_sched_setparam - set/change the RT priority of a thread
4186 * @pid: the pid in question.
4187 * @param: structure containing the new RT priority.
4189 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
4191 return do_sched_setscheduler(pid, -1, param);
4195 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4196 * @pid: the pid in question.
4198 asmlinkage long sys_sched_getscheduler(pid_t pid)
4200 struct task_struct *p;
4201 int retval = -EINVAL;
4203 if (pid < 0)
4204 goto out_nounlock;
4206 retval = -ESRCH;
4207 read_lock(&tasklist_lock);
4208 p = find_process_by_pid(pid);
4209 if (p) {
4210 retval = security_task_getscheduler(p);
4211 if (!retval)
4212 retval = p->policy;
4214 read_unlock(&tasklist_lock);
4216 out_nounlock:
4217 return retval;
4221 * sys_sched_getparam - get the RT priority of a thread
4222 * @pid: the pid in question.
4223 * @param: structure containing the RT priority.
4225 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
4227 struct sched_param lp;
4228 struct task_struct *p;
4229 int retval = -EINVAL;
4231 if (!param || pid < 0)
4232 goto out_nounlock;
4234 read_lock(&tasklist_lock);
4235 p = find_process_by_pid(pid);
4236 retval = -ESRCH;
4237 if (!p)
4238 goto out_unlock;
4240 retval = security_task_getscheduler(p);
4241 if (retval)
4242 goto out_unlock;
4244 lp.sched_priority = p->rt_priority;
4245 read_unlock(&tasklist_lock);
4248 * This one might sleep, so we cannot do it with a spinlock held ...
4250 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4252 out_nounlock:
4253 return retval;
4255 out_unlock:
4256 read_unlock(&tasklist_lock);
4257 return retval;
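/*
 * Usage sketch (illustrative, not part of this file): querying policy and
 * RT priority from user space through the glibc wrappers:
 *
 *	struct sched_param sp;
 *	int policy = sched_getscheduler(pid);
 *
 *	if (policy != -1 && sched_getparam(pid, &sp) == 0)
 *		printf("policy %d, rt_priority %d\n", policy, sp.sched_priority);
 */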
4260 long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4262 cpumask_t cpus_allowed;
4263 struct task_struct *p;
4264 int retval;
4266 mutex_lock(&sched_hotcpu_mutex);
4267 read_lock(&tasklist_lock);
4269 p = find_process_by_pid(pid);
4270 if (!p) {
4271 read_unlock(&tasklist_lock);
4272 mutex_unlock(&sched_hotcpu_mutex);
4273 return -ESRCH;
4277 * It is not safe to call set_cpus_allowed with the
4278 * tasklist_lock held. We will bump the task_struct's
4279 * usage count and then drop tasklist_lock.
4281 get_task_struct(p);
4282 read_unlock(&tasklist_lock);
4284 retval = -EPERM;
4285 if ((current->euid != p->euid) && (current->euid != p->uid) &&
4286 !capable(CAP_SYS_NICE))
4287 goto out_unlock;
4289 retval = security_task_setscheduler(p, 0, NULL);
4290 if (retval)
4291 goto out_unlock;
4293 cpus_allowed = cpuset_cpus_allowed(p);
4294 cpus_and(new_mask, new_mask, cpus_allowed);
4295 retval = set_cpus_allowed(p, new_mask);
4297 out_unlock:
4298 put_task_struct(p);
4299 mutex_unlock(&sched_hotcpu_mutex);
4300 return retval;
4303 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4304 cpumask_t *new_mask)
4306 if (len < sizeof(cpumask_t)) {
4307 memset(new_mask, 0, sizeof(cpumask_t));
4308 } else if (len > sizeof(cpumask_t)) {
4309 len = sizeof(cpumask_t);
4311 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4315 * sys_sched_setaffinity - set the cpu affinity of a process
4316 * @pid: pid of the process
4317 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4318 * @user_mask_ptr: user-space pointer to the new cpu mask
4320 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
4321 unsigned long __user *user_mask_ptr)
4323 cpumask_t new_mask;
4324 int retval;
4326 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
4327 if (retval)
4328 return retval;
4330 return sched_setaffinity(pid, new_mask);
4334 * Represents all CPUs present in the system.
4335 * In systems capable of hotplug, this map could dynamically grow
4336 * as new CPUs are detected in the system via any platform-specific
4337 * method, such as ACPI, for example.
4340 cpumask_t cpu_present_map __read_mostly;
4341 EXPORT_SYMBOL(cpu_present_map);
4343 #ifndef CONFIG_SMP
4344 cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
4345 EXPORT_SYMBOL(cpu_online_map);
4347 cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
4348 EXPORT_SYMBOL(cpu_possible_map);
4349 #endif
4351 long sched_getaffinity(pid_t pid, cpumask_t *mask)
4353 struct task_struct *p;
4354 int retval;
4356 mutex_lock(&sched_hotcpu_mutex);
4357 read_lock(&tasklist_lock);
4359 retval = -ESRCH;
4360 p = find_process_by_pid(pid);
4361 if (!p)
4362 goto out_unlock;
4364 retval = security_task_getscheduler(p);
4365 if (retval)
4366 goto out_unlock;
4368 cpus_and(*mask, p->cpus_allowed, cpu_online_map);
4370 out_unlock:
4371 read_unlock(&tasklist_lock);
4372 mutex_unlock(&sched_hotcpu_mutex);
4373 if (retval)
4374 return retval;
4376 return 0;
4380 * sys_sched_getaffinity - get the cpu affinity of a process
4381 * @pid: pid of the process
4382 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4383 * @user_mask_ptr: user-space pointer to hold the current cpu mask
4385 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
4386 unsigned long __user *user_mask_ptr)
4388 int ret;
4389 cpumask_t mask;
4391 if (len < sizeof(cpumask_t))
4392 return -EINVAL;
4394 ret = sched_getaffinity(pid, &mask);
4395 if (ret < 0)
4396 return ret;
4398 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
4399 return -EFAULT;
4401 return sizeof(cpumask_t);
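/*
 * Usage sketch (illustrative, not part of this file): pinning the calling
 * thread to CPU 0 and reading the mask back.  The glibc wrappers hide the
 * "returns sizeof(cpumask_t) on success" detail of the raw syscall above:
 *
 *	#define _GNU_SOURCE
 *	#include <sched.h>
 *
 *	cpu_set_t set;
 *
 *	CPU_ZERO(&set);
 *	CPU_SET(0, &set);
 *	if (sched_setaffinity(0, sizeof(set), &set) == 0)
 *		sched_getaffinity(0, sizeof(set), &set);
 */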
4405 * sys_sched_yield - yield the current processor to other threads.
4407 * This function yields the current CPU to other tasks. If there are no
4408 * other threads running on this CPU then this function will return.
4410 asmlinkage long sys_sched_yield(void)
4412 struct rq *rq = this_rq_lock();
4414 schedstat_inc(rq, yld_cnt);
4415 if (unlikely(rq->nr_running == 1))
4416 schedstat_inc(rq, yld_act_empty);
4417 else
4418 current->sched_class->yield_task(rq, current);
4421 * Since we are going to call schedule() anyway, there's
4422 * no need to preempt or enable interrupts:
4424 __release(rq->lock);
4425 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4426 _raw_spin_unlock(&rq->lock);
4427 preempt_enable_no_resched();
4429 schedule();
4431 return 0;
4434 static void __cond_resched(void)
4436 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4437 __might_sleep(__FILE__, __LINE__);
4438 #endif
4440 * The BKS might be reacquired before we have dropped
4441 * PREEMPT_ACTIVE, which could trigger a second
4442 * cond_resched() call.
4444 do {
4445 add_preempt_count(PREEMPT_ACTIVE);
4446 schedule();
4447 sub_preempt_count(PREEMPT_ACTIVE);
4448 } while (need_resched());
4451 int __sched cond_resched(void)
4453 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
4454 system_state == SYSTEM_RUNNING) {
4455 __cond_resched();
4456 return 1;
4458 return 0;
4460 EXPORT_SYMBOL(cond_resched);
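/*
 * Usage sketch (illustrative): a long-running loop in process context can
 * bound scheduling latency on !CONFIG_PREEMPT kernels by dropping in a
 * voluntary preemption point (process_item() is a hypothetical helper):
 *
 *	for (i = 0; i < nr_items; i++) {
 *		process_item(i);
 *		cond_resched();
 *	}
 */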
4463 * cond_resched_lock() - if a reschedule is pending, drop the given lock,
4464 * call schedule, and on return reacquire the lock.
4466 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
4467 * operations here to prevent schedule() from being called twice (once via
4468 * spin_unlock(), once by hand).
4470 int cond_resched_lock(spinlock_t *lock)
4472 int ret = 0;
4474 if (need_lockbreak(lock)) {
4475 spin_unlock(lock);
4476 cpu_relax();
4477 ret = 1;
4478 spin_lock(lock);
4480 if (need_resched() && system_state == SYSTEM_RUNNING) {
4481 spin_release(&lock->dep_map, 1, _THIS_IP_);
4482 _raw_spin_unlock(lock);
4483 preempt_enable_no_resched();
4484 __cond_resched();
4485 ret = 1;
4486 spin_lock(lock);
4488 return ret;
4490 EXPORT_SYMBOL(cond_resched_lock);
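/*
 * Usage sketch (illustrative): walking a long list under a spinlock while
 * still yielding when needed.  A nonzero return means the lock was dropped
 * and re-taken, so the iteration must be restarted (mylock and mylist are
 * hypothetical):
 *
 *	spin_lock(&mylock);
 * restart:
 *	list_for_each_entry(p, &mylist, list) {
 *		if (cond_resched_lock(&mylock))
 *			goto restart;
 *	}
 *	spin_unlock(&mylock);
 */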
4492 int __sched cond_resched_softirq(void)
4494 BUG_ON(!in_softirq());
4496 if (need_resched() && system_state == SYSTEM_RUNNING) {
4497 local_bh_enable();
4498 __cond_resched();
4499 local_bh_disable();
4500 return 1;
4502 return 0;
4504 EXPORT_SYMBOL(cond_resched_softirq);
4507 * yield - yield the current processor to other threads.
4509 * This is a shortcut for kernel-space yielding - it marks the
4510 * thread runnable and calls sys_sched_yield().
4512 void __sched yield(void)
4514 set_current_state(TASK_RUNNING);
4515 sys_sched_yield();
4517 EXPORT_SYMBOL(yield);
4520 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
4521 * that process accounting knows that this is a task in IO wait state.
4523 * But don't do that if it is a deliberate, throttling IO wait (this task
4524 * has set its backing_dev_info: the queue against which it should throttle)
4526 void __sched io_schedule(void)
4528 struct rq *rq = &__raw_get_cpu_var(runqueues);
4530 delayacct_blkio_start();
4531 atomic_inc(&rq->nr_iowait);
4532 schedule();
4533 atomic_dec(&rq->nr_iowait);
4534 delayacct_blkio_end();
4536 EXPORT_SYMBOL(io_schedule);
4538 long __sched io_schedule_timeout(long timeout)
4540 struct rq *rq = &__raw_get_cpu_var(runqueues);
4541 long ret;
4543 delayacct_blkio_start();
4544 atomic_inc(&rq->nr_iowait);
4545 ret = schedule_timeout(timeout);
4546 atomic_dec(&rq->nr_iowait);
4547 delayacct_blkio_end();
4548 return ret;
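/*
 * Usage sketch (illustrative): a wait-for-I/O loop uses io_schedule() instead
 * of schedule() so the sleep is accounted as iowait (wq, req and
 * io_completed() are hypothetical):
 *
 *	DEFINE_WAIT(wait);
 *
 *	for (;;) {
 *		prepare_to_wait(&wq, &wait, TASK_UNINTERRUPTIBLE);
 *		if (io_completed(req))
 *			break;
 *		io_schedule();
 *	}
 *	finish_wait(&wq, &wait);
 */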
4552 * sys_sched_get_priority_max - return maximum RT priority.
4553 * @policy: scheduling class.
4555 * this syscall returns the maximum rt_priority that can be used
4556 * by a given scheduling class.
4558 asmlinkage long sys_sched_get_priority_max(int policy)
4560 int ret = -EINVAL;
4562 switch (policy) {
4563 case SCHED_FIFO:
4564 case SCHED_RR:
4565 ret = MAX_USER_RT_PRIO-1;
4566 break;
4567 case SCHED_NORMAL:
4568 case SCHED_BATCH:
4569 case SCHED_IDLE:
4570 ret = 0;
4571 break;
4573 return ret;
4577 * sys_sched_get_priority_min - return minimum RT priority.
4578 * @policy: scheduling class.
4580 * this syscall returns the minimum rt_priority that can be used
4581 * by a given scheduling class.
4583 asmlinkage long sys_sched_get_priority_min(int policy)
4585 int ret = -EINVAL;
4587 switch (policy) {
4588 case SCHED_FIFO:
4589 case SCHED_RR:
4590 ret = 1;
4591 break;
4592 case SCHED_NORMAL:
4593 case SCHED_BATCH:
4594 case SCHED_IDLE:
4595 ret = 0;
4597 return ret;
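/*
 * Usage sketch (illustrative): user space should query these limits instead
 * of hard-coding the RT priority range:
 *
 *	int lo = sched_get_priority_min(SCHED_FIFO);
 *	int hi = sched_get_priority_max(SCHED_FIFO);
 *	struct sched_param sp = { .sched_priority = lo + (hi - lo) / 2 };
 */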
4601 * sys_sched_rr_get_interval - return the default timeslice of a process.
4602 * @pid: pid of the process.
4603 * @interval: userspace pointer to the timeslice value.
4605 * this syscall writes the default timeslice value of a given process
4606 * into the user-space timespec buffer. A value of '0' means infinity.
4608 asmlinkage
4609 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4611 struct task_struct *p;
4612 int retval = -EINVAL;
4613 struct timespec t;
4615 if (pid < 0)
4616 goto out_nounlock;
4618 retval = -ESRCH;
4619 read_lock(&tasklist_lock);
4620 p = find_process_by_pid(pid);
4621 if (!p)
4622 goto out_unlock;
4624 retval = security_task_getscheduler(p);
4625 if (retval)
4626 goto out_unlock;
4628 jiffies_to_timespec(p->policy == SCHED_FIFO ?
4629 0 : static_prio_timeslice(p->static_prio), &t);
4630 read_unlock(&tasklist_lock);
4631 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4632 out_nounlock:
4633 return retval;
4634 out_unlock:
4635 read_unlock(&tasklist_lock);
4636 return retval;
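/*
 * Usage sketch (illustrative): querying the timeslice of the calling process
 * from user space:
 *
 *	struct timespec ts;
 *
 *	if (sched_rr_get_interval(0, &ts) == 0)
 *		printf("timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
 */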
4639 static const char stat_nam[] = "RSDTtZX";
4641 static void show_task(struct task_struct *p)
4643 unsigned long free = 0;
4644 unsigned state;
4646 state = p->state ? __ffs(p->state) + 1 : 0;
4647 printk("%-13.13s %c", p->comm,
4648 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4649 #if (BITS_PER_LONG == 32)
4650 if (state == TASK_RUNNING)
4651 printk(" running ");
4652 else
4653 printk(" %08lX ", thread_saved_pc(p));
4654 #else
4655 if (state == TASK_RUNNING)
4656 printk(" running task ");
4657 else
4658 printk(" %016lx ", thread_saved_pc(p));
4659 #endif
4660 #ifdef CONFIG_DEBUG_STACK_USAGE
4662 unsigned long *n = end_of_stack(p);
4663 while (!*n)
4664 n++;
4665 free = (unsigned long)n - (unsigned long)end_of_stack(p);
4667 #endif
4668 printk("%5lu %5d %6d", free, p->pid, p->parent->pid);
4669 if (!p->mm)
4670 printk(" (L-TLB)\n");
4671 else
4672 printk(" (NOTLB)\n");
4674 if (state != TASK_RUNNING)
4675 show_stack(p, NULL);
4678 void show_state_filter(unsigned long state_filter)
4680 struct task_struct *g, *p;
4682 #if (BITS_PER_LONG == 32)
4683 printk("\n"
4684 " free sibling\n");
4685 printk(" task PC stack pid father child younger older\n");
4686 #else
4687 printk("\n"
4688 " free sibling\n");
4689 printk(" task PC stack pid father child younger older\n");
4690 #endif
4691 read_lock(&tasklist_lock);
4692 do_each_thread(g, p) {
4694 * reset the NMI-timeout, listing all tasks on a slow
4695 * console might take a lot of time:
4697 touch_nmi_watchdog();
4698 if (!state_filter || (p->state & state_filter))
4699 show_task(p);
4700 } while_each_thread(g, p);
4702 touch_all_softlockup_watchdogs();
4704 #ifdef CONFIG_SCHED_DEBUG
4705 sysrq_sched_debug_show();
4706 #endif
4707 read_unlock(&tasklist_lock);
4709 * Only show locks if all tasks are dumped:
4711 if (state_filter == -1)
4712 debug_show_all_locks();
4715 void __cpuinit init_idle_bootup_task(struct task_struct *idle)
4717 idle->sched_class = &idle_sched_class;
4721 * init_idle - set up an idle thread for a given CPU
4722 * @idle: task in question
4723 * @cpu: cpu the idle task belongs to
4725 * NOTE: this function does not set the idle thread's NEED_RESCHED
4726 * flag, to make booting more robust.
4728 void __cpuinit init_idle(struct task_struct *idle, int cpu)
4730 struct rq *rq = cpu_rq(cpu);
4731 unsigned long flags;
4733 __sched_fork(idle);
4734 idle->se.exec_start = sched_clock();
4736 idle->prio = idle->normal_prio = MAX_PRIO;
4737 idle->cpus_allowed = cpumask_of_cpu(cpu);
4738 __set_task_cpu(idle, cpu);
4740 spin_lock_irqsave(&rq->lock, flags);
4741 rq->curr = rq->idle = idle;
4742 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
4743 idle->oncpu = 1;
4744 #endif
4745 spin_unlock_irqrestore(&rq->lock, flags);
4747 /* Set the preempt count _outside_ the spinlocks! */
4748 #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
4749 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
4750 #else
4751 task_thread_info(idle)->preempt_count = 0;
4752 #endif
4754 * The idle tasks have their own, simple scheduling class:
4756 idle->sched_class = &idle_sched_class;
4760 * In a system that switches off the HZ timer nohz_cpu_mask
4761 * indicates which cpus entered this state. This is used
4762 * in the rcu update to wait only for active cpus. For systems
4763 * which do not switch off the HZ timer, nohz_cpu_mask should
4764 * always be CPU_MASK_NONE.
4766 cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4769 * Increase the granularity value when there are more CPUs,
4770 * because with more CPUs the 'effective latency' as visible
4771 * to users decreases. But the relationship is not linear,
4772 * so pick a second-best guess by going with the log2 of the
4773 * number of CPUs.
4775 * This idea comes from the SD scheduler of Con Kolivas:
4777 static inline void sched_init_granularity(void)
4779 unsigned int factor = 1 + ilog2(num_online_cpus());
4780 const unsigned long gran_limit = 10000000;
4782 sysctl_sched_granularity *= factor;
4783 if (sysctl_sched_granularity > gran_limit)
4784 sysctl_sched_granularity = gran_limit;
4786 sysctl_sched_runtime_limit = sysctl_sched_granularity * 4;
4787 sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
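/*
 * Worked example (illustrative): with 16 CPUs online, factor is
 * 1 + ilog2(16) = 5, so the base granularity is multiplied by 5 and then
 * clipped to gran_limit (10 ms); the runtime limit becomes four times and
 * the wakeup granularity half of the resulting granularity.
 */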
4790 #ifdef CONFIG_SMP
4792 * This is how migration works:
4794 * 1) we queue a struct migration_req structure in the source CPU's
4795 * runqueue and wake up that CPU's migration thread.
4796 * 2) we down() the locked semaphore => thread blocks.
4797 * 3) migration thread wakes up (implicitly it forces the migrated
4798 * thread off the CPU)
4799 * 4) it gets the migration request and checks whether the migrated
4800 * task is still in the wrong runqueue.
4801 * 5) if it's in the wrong runqueue then the migration thread removes
4802 * it and puts it into the right queue.
4803 * 6) migration thread up()s the semaphore.
4804 * 7) we wake up and the migration is done.
4808 * Change a given task's CPU affinity. Migrate the thread to a
4809 * proper CPU and schedule it away if the CPU it's executing on
4810 * is removed from the allowed bitmask.
4812 * NOTE: the caller must have a valid reference to the task, the
4813 * task must not exit() & deallocate itself prematurely. The
4814 * call is not atomic; no spinlocks may be held.
4816 int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
4818 struct migration_req req;
4819 unsigned long flags;
4820 struct rq *rq;
4821 int ret = 0;
4823 rq = task_rq_lock(p, &flags);
4824 if (!cpus_intersects(new_mask, cpu_online_map)) {
4825 ret = -EINVAL;
4826 goto out;
4829 p->cpus_allowed = new_mask;
4830 /* Can the task run on the task's current CPU? If so, we're done */
4831 if (cpu_isset(task_cpu(p), new_mask))
4832 goto out;
4834 if (migrate_task(p, any_online_cpu(new_mask), &req)) {
4835 /* Need help from migration thread: drop lock and wait. */
4836 task_rq_unlock(rq, &flags);
4837 wake_up_process(rq->migration_thread);
4838 wait_for_completion(&req.done);
4839 tlb_migrate_finish(p->mm);
4840 return 0;
4842 out:
4843 task_rq_unlock(rq, &flags);
4845 return ret;
4847 EXPORT_SYMBOL_GPL(set_cpus_allowed);
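/*
 * Usage sketch (illustrative): a kernel thread restricting itself to a single
 * CPU (target_cpu is hypothetical); the call fails with -EINVAL if no CPU in
 * the new mask is online:
 *
 *	if (set_cpus_allowed(current, cpumask_of_cpu(target_cpu)) < 0)
 *		goto fail;
 */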
4850 * Move (not current) task off this cpu, onto dest cpu. We're doing
4851 * this because either it can't run here any more (it was set_cpus_allowed()
4852 * away from this CPU, or its CPU is going down), or because we're
4853 * attempting to rebalance this task on exec (sched_exec).
4855 * So we race with normal scheduler movements, but that's OK, as long
4856 * as the task is no longer on this CPU.
4858 * Returns non-zero if task was successfully migrated.
4860 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4862 struct rq *rq_dest, *rq_src;
4863 int ret = 0, on_rq;
4865 if (unlikely(cpu_is_offline(dest_cpu)))
4866 return ret;
4868 rq_src = cpu_rq(src_cpu);
4869 rq_dest = cpu_rq(dest_cpu);
4871 double_rq_lock(rq_src, rq_dest);
4872 /* Already moved. */
4873 if (task_cpu(p) != src_cpu)
4874 goto out;
4875 /* Affinity changed (again). */
4876 if (!cpu_isset(dest_cpu, p->cpus_allowed))
4877 goto out;
4879 on_rq = p->se.on_rq;
4880 if (on_rq)
4881 deactivate_task(rq_src, p, 0);
4882 set_task_cpu(p, dest_cpu);
4883 if (on_rq) {
4884 activate_task(rq_dest, p, 0);
4885 check_preempt_curr(rq_dest, p);
4887 ret = 1;
4888 out:
4889 double_rq_unlock(rq_src, rq_dest);
4890 return ret;
4894 * migration_thread - this is a highprio system thread that performs
4895 * thread migration by bumping thread off CPU then 'pushing' onto
4896 * another runqueue.
4898 static int migration_thread(void *data)
4900 int cpu = (long)data;
4901 struct rq *rq;
4903 rq = cpu_rq(cpu);
4904 BUG_ON(rq->migration_thread != current);
4906 set_current_state(TASK_INTERRUPTIBLE);
4907 while (!kthread_should_stop()) {
4908 struct migration_req *req;
4909 struct list_head *head;
4911 try_to_freeze();
4913 spin_lock_irq(&rq->lock);
4915 if (cpu_is_offline(cpu)) {
4916 spin_unlock_irq(&rq->lock);
4917 goto wait_to_die;
4920 if (rq->active_balance) {
4921 active_load_balance(rq, cpu);
4922 rq->active_balance = 0;
4925 head = &rq->migration_queue;
4927 if (list_empty(head)) {
4928 spin_unlock_irq(&rq->lock);
4929 schedule();
4930 set_current_state(TASK_INTERRUPTIBLE);
4931 continue;
4933 req = list_entry(head->next, struct migration_req, list);
4934 list_del_init(head->next);
4936 spin_unlock(&rq->lock);
4937 __migrate_task(req->task, cpu, req->dest_cpu);
4938 local_irq_enable();
4940 complete(&req->done);
4942 __set_current_state(TASK_RUNNING);
4943 return 0;
4945 wait_to_die:
4946 /* Wait for kthread_stop */
4947 set_current_state(TASK_INTERRUPTIBLE);
4948 while (!kthread_should_stop()) {
4949 schedule();
4950 set_current_state(TASK_INTERRUPTIBLE);
4952 __set_current_state(TASK_RUNNING);
4953 return 0;
4956 #ifdef CONFIG_HOTPLUG_CPU
4958 * Figure out where a task on a dead CPU should go; use force if necessary.
4959 * NOTE: interrupts should be disabled by the caller
4961 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
4963 unsigned long flags;
4964 cpumask_t mask;
4965 struct rq *rq;
4966 int dest_cpu;
4968 restart:
4969 /* On same node? */
4970 mask = node_to_cpumask(cpu_to_node(dead_cpu));
4971 cpus_and(mask, mask, p->cpus_allowed);
4972 dest_cpu = any_online_cpu(mask);
4974 /* On any allowed CPU? */
4975 if (dest_cpu == NR_CPUS)
4976 dest_cpu = any_online_cpu(p->cpus_allowed);
4978 /* No more Mr. Nice Guy. */
4979 if (dest_cpu == NR_CPUS) {
4980 rq = task_rq_lock(p, &flags);
4981 cpus_setall(p->cpus_allowed);
4982 dest_cpu = any_online_cpu(p->cpus_allowed);
4983 task_rq_unlock(rq, &flags);
4986 * Don't tell them about moving exiting tasks or
4987 * kernel threads (both mm NULL), since they never
4988 * leave the kernel.
4990 if (p->mm && printk_ratelimit())
4991 printk(KERN_INFO "process %d (%s) no "
4992 "longer affine to cpu%d\n",
4993 p->pid, p->comm, dead_cpu);
4995 if (!__migrate_task(p, dead_cpu, dest_cpu))
4996 goto restart;
5000 * While a dead CPU has no uninterruptible tasks queued at this point,
5001 * it might still have a nonzero ->nr_uninterruptible counter, because
5002 * for performance reasons the counter is not strictly tracking tasks to
5003 * their home CPUs. So we just add the counter to another CPU's counter,
5004 * to keep the global sum constant after CPU-down:
5006 static void migrate_nr_uninterruptible(struct rq *rq_src)
5008 struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
5009 unsigned long flags;
5011 local_irq_save(flags);
5012 double_rq_lock(rq_src, rq_dest);
5013 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5014 rq_src->nr_uninterruptible = 0;
5015 double_rq_unlock(rq_src, rq_dest);
5016 local_irq_restore(flags);
5019 /* Run through task list and migrate tasks from the dead cpu. */
5020 static void migrate_live_tasks(int src_cpu)
5022 struct task_struct *p, *t;
5024 write_lock_irq(&tasklist_lock);
5026 do_each_thread(t, p) {
5027 if (p == current)
5028 continue;
5030 if (task_cpu(p) == src_cpu)
5031 move_task_off_dead_cpu(src_cpu, p);
5032 } while_each_thread(t, p);
5034 write_unlock_irq(&tasklist_lock);
5038 * Schedules idle task to be the next runnable task on current CPU.
5039 * It does so by boosting its priority to highest possible and adding it to
5040 * the _front_ of the runqueue. Used by CPU offline code.
5042 void sched_idle_next(void)
5044 int this_cpu = smp_processor_id();
5045 struct rq *rq = cpu_rq(this_cpu);
5046 struct task_struct *p = rq->idle;
5047 unsigned long flags;
5049 /* cpu has to be offline */
5050 BUG_ON(cpu_online(this_cpu));
5053 * Strictly not necessary since the rest of the CPUs are stopped by now
5054 * and interrupts disabled on the current cpu.
5056 spin_lock_irqsave(&rq->lock, flags);
5058 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5060 /* Add idle task to the _front_ of its priority queue: */
5061 activate_idle_task(p, rq);
5063 spin_unlock_irqrestore(&rq->lock, flags);
5067 * Ensures that the idle task is using init_mm right before its cpu goes
5068 * offline.
5070 void idle_task_exit(void)
5072 struct mm_struct *mm = current->active_mm;
5074 BUG_ON(cpu_online(smp_processor_id()));
5076 if (mm != &init_mm)
5077 switch_mm(mm, &init_mm, current);
5078 mmdrop(mm);
5081 /* called under rq->lock with disabled interrupts */
5082 static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5084 struct rq *rq = cpu_rq(dead_cpu);
5086 /* Must be exiting, otherwise would be on tasklist. */
5087 BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
5089 /* Cannot have done final schedule yet: would have vanished. */
5090 BUG_ON(p->state == TASK_DEAD);
5092 get_task_struct(p);
5095 * Drop lock around migration; if someone else moves it,
5096 * that's OK. No task can be added to this CPU, so iteration is
5097 * fine.
5098 * NOTE: interrupts should be left disabled --dev@
5100 spin_unlock(&rq->lock);
5101 move_task_off_dead_cpu(dead_cpu, p);
5102 spin_lock(&rq->lock);
5104 put_task_struct(p);
5107 /* release_task() removes task from tasklist, so we won't find dead tasks. */
5108 static void migrate_dead_tasks(unsigned int dead_cpu)
5110 struct rq *rq = cpu_rq(dead_cpu);
5111 struct task_struct *next;
5113 for ( ; ; ) {
5114 if (!rq->nr_running)
5115 break;
5116 next = pick_next_task(rq, rq->curr, rq_clock(rq));
5117 if (!next)
5118 break;
5119 migrate_dead(dead_cpu, next);
5122 #endif /* CONFIG_HOTPLUG_CPU */
5125 * migration_call - callback that gets triggered when a CPU is added.
5126 * Here we can start up the necessary migration thread for the new CPU.
5128 static int __cpuinit
5129 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5131 struct task_struct *p;
5132 int cpu = (long)hcpu;
5133 unsigned long flags;
5134 struct rq *rq;
5136 switch (action) {
5137 case CPU_LOCK_ACQUIRE:
5138 mutex_lock(&sched_hotcpu_mutex);
5139 break;
5141 case CPU_UP_PREPARE:
5142 case CPU_UP_PREPARE_FROZEN:
5143 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
5144 if (IS_ERR(p))
5145 return NOTIFY_BAD;
5146 p->flags |= PF_NOFREEZE;
5147 kthread_bind(p, cpu);
5148 /* Must be high prio: stop_machine expects to yield to it. */
5149 rq = task_rq_lock(p, &flags);
5150 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5151 task_rq_unlock(rq, &flags);
5152 cpu_rq(cpu)->migration_thread = p;
5153 break;
5155 case CPU_ONLINE:
5156 case CPU_ONLINE_FROZEN:
5157 /* Strictly unnecessary, as the first user will wake it. */
5158 wake_up_process(cpu_rq(cpu)->migration_thread);
5159 break;
5161 #ifdef CONFIG_HOTPLUG_CPU
5162 case CPU_UP_CANCELED:
5163 case CPU_UP_CANCELED_FROZEN:
5164 if (!cpu_rq(cpu)->migration_thread)
5165 break;
5166 /* Unbind it from offline cpu so it can run. Fall thru. */
5167 kthread_bind(cpu_rq(cpu)->migration_thread,
5168 any_online_cpu(cpu_online_map));
5169 kthread_stop(cpu_rq(cpu)->migration_thread);
5170 cpu_rq(cpu)->migration_thread = NULL;
5171 break;
5173 case CPU_DEAD:
5174 case CPU_DEAD_FROZEN:
5175 migrate_live_tasks(cpu);
5176 rq = cpu_rq(cpu);
5177 kthread_stop(rq->migration_thread);
5178 rq->migration_thread = NULL;
5179 /* Idle task back to normal (off runqueue, low prio) */
5180 rq = task_rq_lock(rq->idle, &flags);
5181 deactivate_task(rq, rq->idle, 0);
5182 rq->idle->static_prio = MAX_PRIO;
5183 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5184 rq->idle->sched_class = &idle_sched_class;
5185 migrate_dead_tasks(cpu);
5186 task_rq_unlock(rq, &flags);
5187 migrate_nr_uninterruptible(rq);
5188 BUG_ON(rq->nr_running != 0);
5190 /* No need to migrate the tasks: it was best-effort if
5191 * they didn't take sched_hotcpu_mutex. Just wake up
5192 * the requestors. */
5193 spin_lock_irq(&rq->lock);
5194 while (!list_empty(&rq->migration_queue)) {
5195 struct migration_req *req;
5197 req = list_entry(rq->migration_queue.next,
5198 struct migration_req, list);
5199 list_del_init(&req->list);
5200 complete(&req->done);
5202 spin_unlock_irq(&rq->lock);
5203 break;
5204 #endif
5205 case CPU_LOCK_RELEASE:
5206 mutex_unlock(&sched_hotcpu_mutex);
5207 break;
5209 return NOTIFY_OK;
5212 /* Register at highest priority so that task migration (migrate_all_tasks)
5213 * happens before everything else.
5215 static struct notifier_block __cpuinitdata migration_notifier = {
5216 .notifier_call = migration_call,
5217 .priority = 10
5220 int __init migration_init(void)
5222 void *cpu = (void *)(long)smp_processor_id();
5223 int err;
5225 /* Start one for the boot CPU: */
5226 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5227 BUG_ON(err == NOTIFY_BAD);
5228 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5229 register_cpu_notifier(&migration_notifier);
5231 return 0;
5233 #endif
5235 #ifdef CONFIG_SMP
5237 /* Number of possible processor ids */
5238 int nr_cpu_ids __read_mostly = NR_CPUS;
5239 EXPORT_SYMBOL(nr_cpu_ids);
5241 #undef SCHED_DOMAIN_DEBUG
5242 #ifdef SCHED_DOMAIN_DEBUG
5243 static void sched_domain_debug(struct sched_domain *sd, int cpu)
5245 int level = 0;
5247 if (!sd) {
5248 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5249 return;
5252 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5254 do {
5255 int i;
5256 char str[NR_CPUS];
5257 struct sched_group *group = sd->groups;
5258 cpumask_t groupmask;
5260 cpumask_scnprintf(str, NR_CPUS, sd->span);
5261 cpus_clear(groupmask);
5263 printk(KERN_DEBUG);
5264 for (i = 0; i < level + 1; i++)
5265 printk(" ");
5266 printk("domain %d: ", level);
5268 if (!(sd->flags & SD_LOAD_BALANCE)) {
5269 printk("does not load-balance\n");
5270 if (sd->parent)
5271 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5272 " has parent");
5273 break;
5276 printk("span %s\n", str);
5278 if (!cpu_isset(cpu, sd->span))
5279 printk(KERN_ERR "ERROR: domain->span does not contain "
5280 "CPU%d\n", cpu);
5281 if (!cpu_isset(cpu, group->cpumask))
5282 printk(KERN_ERR "ERROR: domain->groups does not contain"
5283 " CPU%d\n", cpu);
5285 printk(KERN_DEBUG);
5286 for (i = 0; i < level + 2; i++)
5287 printk(" ");
5288 printk("groups:");
5289 do {
5290 if (!group) {
5291 printk("\n");
5292 printk(KERN_ERR "ERROR: group is NULL\n");
5293 break;
5296 if (!group->__cpu_power) {
5297 printk("\n");
5298 printk(KERN_ERR "ERROR: domain->cpu_power not "
5299 "set\n");
5302 if (!cpus_weight(group->cpumask)) {
5303 printk("\n");
5304 printk(KERN_ERR "ERROR: empty group\n");
5307 if (cpus_intersects(groupmask, group->cpumask)) {
5308 printk("\n");
5309 printk(KERN_ERR "ERROR: repeated CPUs\n");
5312 cpus_or(groupmask, groupmask, group->cpumask);
5314 cpumask_scnprintf(str, NR_CPUS, group->cpumask);
5315 printk(" %s", str);
5317 group = group->next;
5318 } while (group != sd->groups);
5319 printk("\n");
5321 if (!cpus_equal(sd->span, groupmask))
5322 printk(KERN_ERR "ERROR: groups don't span "
5323 "domain->span\n");
5325 level++;
5326 sd = sd->parent;
5327 if (!sd)
5328 continue;
5330 if (!cpus_subset(groupmask, sd->span))
5331 printk(KERN_ERR "ERROR: parent span is not a superset "
5332 "of domain->span\n");
5334 } while (sd);
5336 #else
5337 # define sched_domain_debug(sd, cpu) do { } while (0)
5338 #endif
5340 static int sd_degenerate(struct sched_domain *sd)
5342 if (cpus_weight(sd->span) == 1)
5343 return 1;
5345 /* Following flags need at least 2 groups */
5346 if (sd->flags & (SD_LOAD_BALANCE |
5347 SD_BALANCE_NEWIDLE |
5348 SD_BALANCE_FORK |
5349 SD_BALANCE_EXEC |
5350 SD_SHARE_CPUPOWER |
5351 SD_SHARE_PKG_RESOURCES)) {
5352 if (sd->groups != sd->groups->next)
5353 return 0;
5356 /* Following flags don't use groups */
5357 if (sd->flags & (SD_WAKE_IDLE |
5358 SD_WAKE_AFFINE |
5359 SD_WAKE_BALANCE))
5360 return 0;
5362 return 1;
5365 static int
5366 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5368 unsigned long cflags = sd->flags, pflags = parent->flags;
5370 if (sd_degenerate(parent))
5371 return 1;
5373 if (!cpus_equal(sd->span, parent->span))
5374 return 0;
5376 /* Does parent contain flags not in child? */
5377 /* WAKE_BALANCE is a subset of WAKE_AFFINE */
5378 if (cflags & SD_WAKE_AFFINE)
5379 pflags &= ~SD_WAKE_BALANCE;
5380 /* Flags needing groups don't count if only 1 group in parent */
5381 if (parent->groups == parent->groups->next) {
5382 pflags &= ~(SD_LOAD_BALANCE |
5383 SD_BALANCE_NEWIDLE |
5384 SD_BALANCE_FORK |
5385 SD_BALANCE_EXEC |
5386 SD_SHARE_CPUPOWER |
5387 SD_SHARE_PKG_RESOURCES);
5389 if (~cflags & pflags)
5390 return 0;
5392 return 1;
5396 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
5397 * hold the hotplug lock.
5399 static void cpu_attach_domain(struct sched_domain *sd, int cpu)
5401 struct rq *rq = cpu_rq(cpu);
5402 struct sched_domain *tmp;
5404 /* Remove the sched domains which do not contribute to scheduling. */
5405 for (tmp = sd; tmp; tmp = tmp->parent) {
5406 struct sched_domain *parent = tmp->parent;
5407 if (!parent)
5408 break;
5409 if (sd_parent_degenerate(tmp, parent)) {
5410 tmp->parent = parent->parent;
5411 if (parent->parent)
5412 parent->parent->child = tmp;
5416 if (sd && sd_degenerate(sd)) {
5417 sd = sd->parent;
5418 if (sd)
5419 sd->child = NULL;
5422 sched_domain_debug(sd, cpu);
5424 rcu_assign_pointer(rq->sd, sd);
5427 /* cpus with isolated domains */
5428 static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
5430 /* Setup the mask of cpus configured for isolated domains */
5431 static int __init isolated_cpu_setup(char *str)
5433 int ints[NR_CPUS], i;
5435 str = get_options(str, ARRAY_SIZE(ints), ints);
5436 cpus_clear(cpu_isolated_map);
5437 for (i = 1; i <= ints[0]; i++)
5438 if (ints[i] < NR_CPUS)
5439 cpu_set(ints[i], cpu_isolated_map);
5440 return 1;
5443 __setup ("isolcpus=", isolated_cpu_setup);
5446 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
5447 * to a function which identifies what group (along with sched group) a CPU
5448 * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS
5449 * (due to the fact that we keep track of groups covered with a cpumask_t).
5451 * init_sched_build_groups will build a circular linked list of the groups
5452 * covered by the given span, and will set each group's ->cpumask correctly,
5453 * and ->cpu_power to 0.
5455 static void
5456 init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5457 int (*group_fn)(int cpu, const cpumask_t *cpu_map,
5458 struct sched_group **sg))
5460 struct sched_group *first = NULL, *last = NULL;
5461 cpumask_t covered = CPU_MASK_NONE;
5462 int i;
5464 for_each_cpu_mask(i, span) {
5465 struct sched_group *sg;
5466 int group = group_fn(i, cpu_map, &sg);
5467 int j;
5469 if (cpu_isset(i, covered))
5470 continue;
5472 sg->cpumask = CPU_MASK_NONE;
5473 sg->__cpu_power = 0;
5475 for_each_cpu_mask(j, span) {
5476 if (group_fn(j, cpu_map, NULL) != group)
5477 continue;
5479 cpu_set(j, covered);
5480 cpu_set(j, sg->cpumask);
5482 if (!first)
5483 first = sg;
5484 if (last)
5485 last->next = sg;
5486 last = sg;
5488 last->next = first;
5491 #define SD_NODES_PER_DOMAIN 16
5493 #ifdef CONFIG_NUMA
5496 * find_next_best_node - find the next node to include in a sched_domain
5497 * @node: node whose sched_domain we're building
5498 * @used_nodes: nodes already in the sched_domain
5500 * Find the next node to include in a given scheduling domain. Simply
5501 * finds the closest node not already in the @used_nodes map.
5503 * Should use nodemask_t.
5505 static int find_next_best_node(int node, unsigned long *used_nodes)
5507 int i, n, val, min_val, best_node = 0;
5509 min_val = INT_MAX;
5511 for (i = 0; i < MAX_NUMNODES; i++) {
5512 /* Start at @node */
5513 n = (node + i) % MAX_NUMNODES;
5515 if (!nr_cpus_node(n))
5516 continue;
5518 /* Skip already used nodes */
5519 if (test_bit(n, used_nodes))
5520 continue;
5522 /* Simple min distance search */
5523 val = node_distance(node, n);
5525 if (val < min_val) {
5526 min_val = val;
5527 best_node = n;
5531 set_bit(best_node, used_nodes);
5532 return best_node;
5536 * sched_domain_node_span - get a cpumask for a node's sched_domain
5537 * @node: node whose cpumask we're constructing
5538 * @size: number of nodes to include in this span
5540 * Given a node, construct a good cpumask for its sched_domain to span. It
5541 * should be one that prevents unnecessary balancing, but also spreads tasks
5542 * out optimally.
5544 static cpumask_t sched_domain_node_span(int node)
5546 DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
5547 cpumask_t span, nodemask;
5548 int i;
5550 cpus_clear(span);
5551 bitmap_zero(used_nodes, MAX_NUMNODES);
5553 nodemask = node_to_cpumask(node);
5554 cpus_or(span, span, nodemask);
5555 set_bit(node, used_nodes);
5557 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5558 int next_node = find_next_best_node(node, used_nodes);
5560 nodemask = node_to_cpumask(next_node);
5561 cpus_or(span, span, nodemask);
5564 return span;
5566 #endif
5568 int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
5571 * SMT sched-domains:
5573 #ifdef CONFIG_SCHED_SMT
5574 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
5575 static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
5577 static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map,
5578 struct sched_group **sg)
5580 if (sg)
5581 *sg = &per_cpu(sched_group_cpus, cpu);
5582 return cpu;
5584 #endif
5587 * multi-core sched-domains:
5589 #ifdef CONFIG_SCHED_MC
5590 static DEFINE_PER_CPU(struct sched_domain, core_domains);
5591 static DEFINE_PER_CPU(struct sched_group, sched_group_core);
5592 #endif
5594 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
5595 static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5596 struct sched_group **sg)
5598 int group;
5599 cpumask_t mask = cpu_sibling_map[cpu];
5600 cpus_and(mask, mask, *cpu_map);
5601 group = first_cpu(mask);
5602 if (sg)
5603 *sg = &per_cpu(sched_group_core, group);
5604 return group;
5606 #elif defined(CONFIG_SCHED_MC)
5607 static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5608 struct sched_group **sg)
5610 if (sg)
5611 *sg = &per_cpu(sched_group_core, cpu);
5612 return cpu;
5614 #endif
5616 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
5617 static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
5619 static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map,
5620 struct sched_group **sg)
5622 int group;
5623 #ifdef CONFIG_SCHED_MC
5624 cpumask_t mask = cpu_coregroup_map(cpu);
5625 cpus_and(mask, mask, *cpu_map);
5626 group = first_cpu(mask);
5627 #elif defined(CONFIG_SCHED_SMT)
5628 cpumask_t mask = cpu_sibling_map[cpu];
5629 cpus_and(mask, mask, *cpu_map);
5630 group = first_cpu(mask);
5631 #else
5632 group = cpu;
5633 #endif
5634 if (sg)
5635 *sg = &per_cpu(sched_group_phys, group);
5636 return group;
5639 #ifdef CONFIG_NUMA
5641 * The init_sched_build_groups can't handle what we want to do with node
5642 * groups, so roll our own. Now each node has its own list of groups which
5643 * gets dynamically allocated.
5645 static DEFINE_PER_CPU(struct sched_domain, node_domains);
5646 static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
5648 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
5649 static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
5651 static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
5652 struct sched_group **sg)
5654 cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
5655 int group;
5657 cpus_and(nodemask, nodemask, *cpu_map);
5658 group = first_cpu(nodemask);
5660 if (sg)
5661 *sg = &per_cpu(sched_group_allnodes, group);
5662 return group;
5665 static void init_numa_sched_groups_power(struct sched_group *group_head)
5667 struct sched_group *sg = group_head;
5668 int j;
5670 if (!sg)
5671 return;
5672 next_sg:
5673 for_each_cpu_mask(j, sg->cpumask) {
5674 struct sched_domain *sd;
5676 sd = &per_cpu(phys_domains, j);
5677 if (j != first_cpu(sd->groups->cpumask)) {
5679 * Only add "power" once for each
5680 * physical package.
5682 continue;
5685 sg_inc_cpu_power(sg, sd->groups->__cpu_power);
5687 sg = sg->next;
5688 if (sg != group_head)
5689 goto next_sg;
5691 #endif
5693 #ifdef CONFIG_NUMA
5694 /* Free memory allocated for various sched_group structures */
5695 static void free_sched_groups(const cpumask_t *cpu_map)
5697 int cpu, i;
5699 for_each_cpu_mask(cpu, *cpu_map) {
5700 struct sched_group **sched_group_nodes
5701 = sched_group_nodes_bycpu[cpu];
5703 if (!sched_group_nodes)
5704 continue;
5706 for (i = 0; i < MAX_NUMNODES; i++) {
5707 cpumask_t nodemask = node_to_cpumask(i);
5708 struct sched_group *oldsg, *sg = sched_group_nodes[i];
5710 cpus_and(nodemask, nodemask, *cpu_map);
5711 if (cpus_empty(nodemask))
5712 continue;
5714 if (sg == NULL)
5715 continue;
5716 sg = sg->next;
5717 next_sg:
5718 oldsg = sg;
5719 sg = sg->next;
5720 kfree(oldsg);
5721 if (oldsg != sched_group_nodes[i])
5722 goto next_sg;
5724 kfree(sched_group_nodes);
5725 sched_group_nodes_bycpu[cpu] = NULL;
5728 #else
5729 static void free_sched_groups(const cpumask_t *cpu_map)
5732 #endif
5735 * Initialize sched groups cpu_power.
5737 * cpu_power indicates the capacity of sched group, which is used while
5738 * distributing the load between different sched groups in a sched domain.
5739 * Typically cpu_power for all the groups in a sched domain will be the same
5740 * unless there are asymmetries in the topology. If there are asymmetries, a
5741 * group with more cpu_power will pick up more load than a group with
5742 * less cpu_power.
5744 * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
5745 * the maximum number of tasks a group can handle in the presence of other idle
5746 * or lightly loaded groups in the same sched domain.
5748 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5750 struct sched_domain *child;
5751 struct sched_group *group;
5753 WARN_ON(!sd || !sd->groups);
5755 if (cpu != first_cpu(sd->groups->cpumask))
5756 return;
5758 child = sd->child;
5760 sd->groups->__cpu_power = 0;
5763 * For the performance policy, if the groups in the child domain share
5764 * resources (for example cores sharing some portions of the cache hierarchy
5765 * or SMT), then set this domain's group cpu_power such that each group
5766 * can handle only one task when there are other idle groups in the
5767 * same sched domain.
5769 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
5770 (child->flags &
5771 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
5772 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
5773 return;
5777 * add the cpu_power of each child group to this group's cpu_power
5779 group = child->groups;
5780 do {
5781 sg_inc_cpu_power(sd->groups, group->__cpu_power);
5782 group = group->next;
5783 } while (group != child->groups);
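/*
 * Worked example (illustrative): with the default performance policy, a
 * physical-package group whose child domain is an SMT domain (which sets
 * SD_SHARE_CPUPOWER) gets exactly SCHED_LOAD_SCALE, i.e. one task's worth
 * of capacity, instead of the sum of its sibling groups.  If
 * SD_POWERSAVINGS_BALANCE is set, or the child groups do not share
 * resources, the child group powers are summed instead.
 */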
5787 * Build sched domains for a given set of cpus and attach the sched domains
5788 * to the individual cpus
5790 static int build_sched_domains(const cpumask_t *cpu_map)
5792 int i;
5793 #ifdef CONFIG_NUMA
5794 struct sched_group **sched_group_nodes = NULL;
5795 int sd_allnodes = 0;
5798 * Allocate the per-node list of sched groups
5800 sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES,
5801 GFP_KERNEL);
5802 if (!sched_group_nodes) {
5803 printk(KERN_WARNING "Can not alloc sched group node list\n");
5804 return -ENOMEM;
5806 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
5807 #endif
5810 * Set up domains for cpus specified by the cpu_map.
5812 for_each_cpu_mask(i, *cpu_map) {
5813 struct sched_domain *sd = NULL, *p;
5814 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
5816 cpus_and(nodemask, nodemask, *cpu_map);
5818 #ifdef CONFIG_NUMA
5819 if (cpus_weight(*cpu_map) >
5820 SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
5821 sd = &per_cpu(allnodes_domains, i);
5822 *sd = SD_ALLNODES_INIT;
5823 sd->span = *cpu_map;
5824 cpu_to_allnodes_group(i, cpu_map, &sd->groups);
5825 p = sd;
5826 sd_allnodes = 1;
5827 } else
5828 p = NULL;
5830 sd = &per_cpu(node_domains, i);
5831 *sd = SD_NODE_INIT;
5832 sd->span = sched_domain_node_span(cpu_to_node(i));
5833 sd->parent = p;
5834 if (p)
5835 p->child = sd;
5836 cpus_and(sd->span, sd->span, *cpu_map);
5837 #endif
5839 p = sd;
5840 sd = &per_cpu(phys_domains, i);
5841 *sd = SD_CPU_INIT;
5842 sd->span = nodemask;
5843 sd->parent = p;
5844 if (p)
5845 p->child = sd;
5846 cpu_to_phys_group(i, cpu_map, &sd->groups);
5848 #ifdef CONFIG_SCHED_MC
5849 p = sd;
5850 sd = &per_cpu(core_domains, i);
5851 *sd = SD_MC_INIT;
5852 sd->span = cpu_coregroup_map(i);
5853 cpus_and(sd->span, sd->span, *cpu_map);
5854 sd->parent = p;
5855 p->child = sd;
5856 cpu_to_core_group(i, cpu_map, &sd->groups);
5857 #endif
5859 #ifdef CONFIG_SCHED_SMT
5860 p = sd;
5861 sd = &per_cpu(cpu_domains, i);
5862 *sd = SD_SIBLING_INIT;
5863 sd->span = cpu_sibling_map[i];
5864 cpus_and(sd->span, sd->span, *cpu_map);
5865 sd->parent = p;
5866 p->child = sd;
5867 cpu_to_cpu_group(i, cpu_map, &sd->groups);
5868 #endif
5871 #ifdef CONFIG_SCHED_SMT
5872 /* Set up CPU (sibling) groups */
5873 for_each_cpu_mask(i, *cpu_map) {
5874 cpumask_t this_sibling_map = cpu_sibling_map[i];
5875 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
5876 if (i != first_cpu(this_sibling_map))
5877 continue;
5879 init_sched_build_groups(this_sibling_map, cpu_map,
5880 &cpu_to_cpu_group);
5882 #endif
5884 #ifdef CONFIG_SCHED_MC
5885 /* Set up multi-core groups */
5886 for_each_cpu_mask(i, *cpu_map) {
5887 cpumask_t this_core_map = cpu_coregroup_map(i);
5888 cpus_and(this_core_map, this_core_map, *cpu_map);
5889 if (i != first_cpu(this_core_map))
5890 continue;
5891 init_sched_build_groups(this_core_map, cpu_map,
5892 &cpu_to_core_group);
5894 #endif
5896 /* Set up physical groups */
5897 for (i = 0; i < MAX_NUMNODES; i++) {
5898 cpumask_t nodemask = node_to_cpumask(i);
5900 cpus_and(nodemask, nodemask, *cpu_map);
5901 if (cpus_empty(nodemask))
5902 continue;
5904 init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
5907 #ifdef CONFIG_NUMA
5908 /* Set up node groups */
5909 if (sd_allnodes)
5910 init_sched_build_groups(*cpu_map, cpu_map,
5911 &cpu_to_allnodes_group);
5913 for (i = 0; i < MAX_NUMNODES; i++) {
5914 /* Set up node groups */
5915 struct sched_group *sg, *prev;
5916 cpumask_t nodemask = node_to_cpumask(i);
5917 cpumask_t domainspan;
5918 cpumask_t covered = CPU_MASK_NONE;
5919 int j;
5921 cpus_and(nodemask, nodemask, *cpu_map);
5922 if (cpus_empty(nodemask)) {
5923 sched_group_nodes[i] = NULL;
5924 continue;
5927 domainspan = sched_domain_node_span(i);
5928 cpus_and(domainspan, domainspan, *cpu_map);
5930 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
5931 if (!sg) {
5932 printk(KERN_WARNING "Can not alloc domain group for "
5933 "node %d\n", i);
5934 goto error;
5936 sched_group_nodes[i] = sg;
5937 for_each_cpu_mask(j, nodemask) {
5938 struct sched_domain *sd;
5939 sd = &per_cpu(node_domains, j);
5940 sd->groups = sg;
5942 sg->__cpu_power = 0;
5943 sg->cpumask = nodemask;
5944 sg->next = sg;
5945 cpus_or(covered, covered, nodemask);
5946 prev = sg;
5948 for (j = 0; j < MAX_NUMNODES; j++) {
5949 cpumask_t tmp, notcovered;
5950 int n = (i + j) % MAX_NUMNODES;
5952 cpus_complement(notcovered, covered);
5953 cpus_and(tmp, notcovered, *cpu_map);
5954 cpus_and(tmp, tmp, domainspan);
5955 if (cpus_empty(tmp))
5956 break;
5958 nodemask = node_to_cpumask(n);
5959 cpus_and(tmp, tmp, nodemask);
5960 if (cpus_empty(tmp))
5961 continue;
5963 sg = kmalloc_node(sizeof(struct sched_group),
5964 GFP_KERNEL, i);
5965 if (!sg) {
5966 printk(KERN_WARNING
5967 "Can not alloc domain group for node %d\n", j);
5968 goto error;
5970 sg->__cpu_power = 0;
5971 sg->cpumask = tmp;
5972 sg->next = prev->next;
5973 cpus_or(covered, covered, tmp);
5974 prev->next = sg;
5975 prev = sg;
5978 #endif
5980 /* Calculate CPU power for physical packages and nodes */
5981 #ifdef CONFIG_SCHED_SMT
5982 for_each_cpu_mask(i, *cpu_map) {
5983 struct sched_domain *sd = &per_cpu(cpu_domains, i);
5985 init_sched_groups_power(i, sd);
5987 #endif
5988 #ifdef CONFIG_SCHED_MC
5989 for_each_cpu_mask(i, *cpu_map) {
5990 struct sched_domain *sd = &per_cpu(core_domains, i);
5992 init_sched_groups_power(i, sd);
5994 #endif
5996 for_each_cpu_mask(i, *cpu_map) {
5997 struct sched_domain *sd = &per_cpu(phys_domains, i);
5999 init_sched_groups_power(i, sd);
6002 #ifdef CONFIG_NUMA
6003 for (i = 0; i < MAX_NUMNODES; i++)
6004 init_numa_sched_groups_power(sched_group_nodes[i]);
6006 if (sd_allnodes) {
6007 struct sched_group *sg;
6009 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg);
6010 init_numa_sched_groups_power(sg);
6012 #endif
6014 /* Attach the domains */
6015 for_each_cpu_mask(i, *cpu_map) {
6016 struct sched_domain *sd;
6017 #ifdef CONFIG_SCHED_SMT
6018 sd = &per_cpu(cpu_domains, i);
6019 #elif defined(CONFIG_SCHED_MC)
6020 sd = &per_cpu(core_domains, i);
6021 #else
6022 sd = &per_cpu(phys_domains, i);
6023 #endif
6024 cpu_attach_domain(sd, i);
6027 return 0;
6029 #ifdef CONFIG_NUMA
6030 error:
6031 free_sched_groups(cpu_map);
6032 return -ENOMEM;
6033 #endif
6036 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
6038 static int arch_init_sched_domains(const cpumask_t *cpu_map)
6040 cpumask_t cpu_default_map;
6041 int err;
6044 * Setup mask for cpus without special case scheduling requirements.
6045 * For now this just excludes isolated cpus, but could be used to
6046 * exclude other special cases in the future.
6048 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
6050 err = build_sched_domains(&cpu_default_map);
6052 return err;
6055 static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
6057 free_sched_groups(cpu_map);
6061 * Detach sched domains from a group of cpus specified in cpu_map
6062 * These cpus will now be attached to the NULL domain
6064 static void detach_destroy_domains(const cpumask_t *cpu_map)
6066 int i;
6068 for_each_cpu_mask(i, *cpu_map)
6069 cpu_attach_domain(NULL, i);
6070 synchronize_sched();
6071 arch_destroy_sched_domains(cpu_map);
6075 * Partition sched domains as specified by the cpumasks below.
6076 * This attaches all cpus from the cpumasks to the NULL domain,
6077 * waits for an RCU quiescent period, recalculates sched
6078 * domain information and then attaches them back to the
6079 * correct sched domains.
6080 * Call with hotplug lock held.
6082 int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
6084 cpumask_t change_map;
6085 int err = 0;
6087 cpus_and(*partition1, *partition1, cpu_online_map);
6088 cpus_and(*partition2, *partition2, cpu_online_map);
6089 cpus_or(change_map, *partition1, *partition2);
6091 /* Detach sched domains from all of the affected cpus */
6092 detach_destroy_domains(&change_map);
6093 if (!cpus_empty(*partition1))
6094 err = build_sched_domains(partition1);
6095 if (!err && !cpus_empty(*partition2))
6096 err = build_sched_domains(partition2);
6098 return err;
6101 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6102 int arch_reinit_sched_domains(void)
6104 int err;
6106 mutex_lock(&sched_hotcpu_mutex);
6107 detach_destroy_domains(&cpu_online_map);
6108 err = arch_init_sched_domains(&cpu_online_map);
6109 mutex_unlock(&sched_hotcpu_mutex);
6111 return err;
6114 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6116 int ret;
6118 if (buf[0] != '0' && buf[0] != '1')
6119 return -EINVAL;
6121 if (smt)
6122 sched_smt_power_savings = (buf[0] == '1');
6123 else
6124 sched_mc_power_savings = (buf[0] == '1');
6126 ret = arch_reinit_sched_domains();
6128 return ret ? ret : count;
6131 int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
6133 int err = 0;
6135 #ifdef CONFIG_SCHED_SMT
6136 if (smt_capable())
6137 err = sysfs_create_file(&cls->kset.kobj,
6138 &attr_sched_smt_power_savings.attr);
6139 #endif
6140 #ifdef CONFIG_SCHED_MC
6141 if (!err && mc_capable())
6142 err = sysfs_create_file(&cls->kset.kobj,
6143 &attr_sched_mc_power_savings.attr);
6144 #endif
6145 return err;
6147 #endif
6149 #ifdef CONFIG_SCHED_MC
6150 static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
6152 return sprintf(page, "%u\n", sched_mc_power_savings);
6154 static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
6155 const char *buf, size_t count)
6157 return sched_power_savings_store(buf, count, 0);
6159 SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
6160 sched_mc_power_savings_store);
6161 #endif
6163 #ifdef CONFIG_SCHED_SMT
6164 static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
6166 return sprintf(page, "%u\n", sched_smt_power_savings);
6168 static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
6169 const char *buf, size_t count)
6171 return sched_power_savings_store(buf, count, 1);
6173 SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
6174 sched_smt_power_savings_store);
6175 #endif
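/*
 * Usage sketch (illustrative): these attributes are expected to appear as
 * /sys/devices/system/cpu/sched_mc_power_savings and
 * /sys/devices/system/cpu/sched_smt_power_savings; writing '1' enables the
 * power-savings balancing policy, '0' disables it:
 *
 *	int fd = open("/sys/devices/system/cpu/sched_mc_power_savings", O_WRONLY);
 *
 *	if (fd >= 0) {
 *		write(fd, "1", 1);
 *		close(fd);
 *	}
 */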
6178 * Force a reinitialization of the sched domains hierarchy. The domains
6179 * and groups cannot be updated in place without racing with the balancing
6180 * code, so we temporarily attach all running cpus to the NULL domain
6181 * which will prevent rebalancing while the sched domains are recalculated.
6183 static int update_sched_domains(struct notifier_block *nfb,
6184 unsigned long action, void *hcpu)
6186 switch (action) {
6187 case CPU_UP_PREPARE:
6188 case CPU_UP_PREPARE_FROZEN:
6189 case CPU_DOWN_PREPARE:
6190 case CPU_DOWN_PREPARE_FROZEN:
6191 detach_destroy_domains(&cpu_online_map);
6192 return NOTIFY_OK;
6194 case CPU_UP_CANCELED:
6195 case CPU_UP_CANCELED_FROZEN:
6196 case CPU_DOWN_FAILED:
6197 case CPU_DOWN_FAILED_FROZEN:
6198 case CPU_ONLINE:
6199 case CPU_ONLINE_FROZEN:
6200 case CPU_DEAD:
6201 case CPU_DEAD_FROZEN:
6203 * Fall through and re-initialise the domains.
6205 break;
6206 default:
6207 return NOTIFY_DONE;
6210 /* The hotplug lock is already held by cpu_up/cpu_down */
6211 arch_init_sched_domains(&cpu_online_map);
6213 return NOTIFY_OK;
6216 void __init sched_init_smp(void)
6218 cpumask_t non_isolated_cpus;
6220 mutex_lock(&sched_hotcpu_mutex);
6221 arch_init_sched_domains(&cpu_online_map);
6222 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
6223 if (cpus_empty(non_isolated_cpus))
6224 cpu_set(smp_processor_id(), non_isolated_cpus);
6225 mutex_unlock(&sched_hotcpu_mutex);
6226 /* XXX: Theoretical race here - CPU may be hotplugged now */
6227 hotcpu_notifier(update_sched_domains, 0);
6229 /* Move init over to a non-isolated CPU */
6230 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
6231 BUG();
6232 sched_init_granularity();
6234 #else
6235 void __init sched_init_smp(void)
6237 sched_init_granularity();
6239 #endif /* CONFIG_SMP */
6241 int in_sched_functions(unsigned long addr)
6243 /* Linker adds these: start and end of __sched functions */
6244 extern char __sched_text_start[], __sched_text_end[];
6246 return in_lock_functions(addr) ||
6247 (addr >= (unsigned long)__sched_text_start
6248 && addr < (unsigned long)__sched_text_end);
6251 static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
6253 cfs_rq->tasks_timeline = RB_ROOT;
6254 cfs_rq->fair_clock = 1;
6255 #ifdef CONFIG_FAIR_GROUP_SCHED
6256 cfs_rq->rq = rq;
6257 #endif
6260 void __init sched_init(void)
6262 u64 now = sched_clock();
6263 int highest_cpu = 0;
6264 int i, j;
6267 * Link up the scheduling class hierarchy:
6269 rt_sched_class.next = &fair_sched_class;
6270 fair_sched_class.next = &idle_sched_class;
6271 idle_sched_class.next = NULL;
6273 for_each_possible_cpu(i) {
6274 struct rt_prio_array *array;
6275 struct rq *rq;
6277 rq = cpu_rq(i);
6278 spin_lock_init(&rq->lock);
6279 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
6280 rq->nr_running = 0;
6281 rq->clock = 1;
6282 init_cfs_rq(&rq->cfs, rq);
6283 #ifdef CONFIG_FAIR_GROUP_SCHED
6284 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6285 list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
6286 #endif
6287 rq->ls.load_update_last = now;
6288 rq->ls.load_update_start = now;
6290 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6291 rq->cpu_load[j] = 0;
6292 #ifdef CONFIG_SMP
6293 rq->sd = NULL;
6294 rq->active_balance = 0;
6295 rq->next_balance = jiffies;
6296 rq->push_cpu = 0;
6297 rq->cpu = i;
6298 rq->migration_thread = NULL;
6299 INIT_LIST_HEAD(&rq->migration_queue);
6300 #endif
6301 atomic_set(&rq->nr_iowait, 0);
6303 array = &rq->rt.active;
6304 for (j = 0; j < MAX_RT_PRIO; j++) {
6305 INIT_LIST_HEAD(array->queue + j);
6306 __clear_bit(j, array->bitmap);
6308 highest_cpu = i;
6309 /* delimiter for bitsearch: */
6310 __set_bit(MAX_RT_PRIO, array->bitmap);
6313 set_load_weight(&init_task);
6315 #ifdef CONFIG_SMP
6316 nr_cpu_ids = highest_cpu + 1;
6317 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
6318 #endif
6320 #ifdef CONFIG_RT_MUTEXES
6321 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
6322 #endif
6325 * The boot idle thread does lazy MMU switching as well:
6327 atomic_inc(&init_mm.mm_count);
6328 enter_lazy_tlb(&init_mm, current);
6331 * Make us the idle thread. Technically, schedule() should not be
6332 * called from this thread, however somewhere below it might be,
6333 * but because we are the idle thread, we just pick up running again
6334 * when this runqueue becomes "idle".
6336 init_idle(current, smp_processor_id());
6338 * During early bootup we pretend to be a normal task:
6340 current->sched_class = &fair_sched_class;
6343 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
6344 void __might_sleep(char *file, int line)
6346 #ifdef in_atomic
6347 static unsigned long prev_jiffy; /* ratelimiting */
6349 if ((in_atomic() || irqs_disabled()) &&
6350 system_state == SYSTEM_RUNNING && !oops_in_progress) {
6351 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6352 return;
6353 prev_jiffy = jiffies;
6354 printk(KERN_ERR "BUG: sleeping function called from invalid"
6355 " context at %s:%d\n", file, line);
6356 printk("in_atomic():%d, irqs_disabled():%d\n",
6357 in_atomic(), irqs_disabled());
6358 debug_show_held_locks(current);
6359 if (irqs_disabled())
6360 print_irqtrace_events(current);
6361 dump_stack();
6363 #endif
6365 EXPORT_SYMBOL(__might_sleep);
6366 #endif
6368 #ifdef CONFIG_MAGIC_SYSRQ
6369 void normalize_rt_tasks(void)
6371 struct task_struct *g, *p;
6372 unsigned long flags;
6373 struct rq *rq;
6374 int on_rq;
6376 read_lock_irq(&tasklist_lock);
6377 do_each_thread(g, p) {
6378 p->se.fair_key = 0;
6379 p->se.wait_runtime = 0;
6380 p->se.wait_start_fair = 0;
6381 p->se.wait_start = 0;
6382 p->se.exec_start = 0;
6383 p->se.sleep_start = 0;
6384 p->se.sleep_start_fair = 0;
6385 p->se.block_start = 0;
6386 task_rq(p)->cfs.fair_clock = 0;
6387 task_rq(p)->clock = 0;
6389 if (!rt_task(p)) {
6391 * Renice negative nice level userspace
6392 * tasks back to 0:
6394 if (TASK_NICE(p) < 0 && p->mm)
6395 set_user_nice(p, 0);
6396 continue;
6399 spin_lock_irqsave(&p->pi_lock, flags);
6400 rq = __task_rq_lock(p);
6401 #ifdef CONFIG_SMP
6403 * Do not touch the migration thread:
6405 if (p == rq->migration_thread)
6406 goto out_unlock;
6407 #endif
6409 on_rq = p->se.on_rq;
6410 if (on_rq)
6411 deactivate_task(task_rq(p), p, 0);
6412 __setscheduler(rq, p, SCHED_NORMAL, 0);
6413 if (on_rq) {
6414 activate_task(task_rq(p), p, 0);
6415 resched_task(rq->curr);
6417 #ifdef CONFIG_SMP
6418 out_unlock:
6419 #endif
6420 __task_rq_unlock(rq);
6421 spin_unlock_irqrestore(&p->pi_lock, flags);
6422 } while_each_thread(g, p);
6424 read_unlock_irq(&tasklist_lock);
6427 #endif /* CONFIG_MAGIC_SYSRQ */
6429 #ifdef CONFIG_IA64
6431 * These functions are only useful for the IA64 MCA handling.
6433 * They can only be called when the whole system has been
6434 * stopped - every CPU needs to be quiescent, and no scheduling
6435 * activity can take place. Using them for anything else would
6436 * be a serious bug, and as a result, they aren't even visible
6437 * under any other configuration.
6441 * curr_task - return the current task for a given cpu.
6442 * @cpu: the processor in question.
6444 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6446 struct task_struct *curr_task(int cpu)
6448 return cpu_curr(cpu);
6452 * set_curr_task - set the current task for a given cpu.
6453 * @cpu: the processor in question.
6454 * @p: the task pointer to set.
6456 * Description: This function must only be used when non-maskable interrupts
6457 * are serviced on a separate stack. It allows the architecture to switch the
6458 * notion of the current task on a cpu in a non-blocking manner. This function
6459 * must be called with all CPUs synchronized and interrupts disabled; the
6460 * caller must save the original value of the current task (see
6461 * curr_task() above) and restore that value before reenabling interrupts and
6462 * re-starting the system.
6464 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6466 void set_curr_task(int cpu, struct task_struct *p)
6468 cpu_curr(cpu) = p;
6471 #endif