fix typo in prefetch.h
[wandboard.git] / kernel / sched.c
blob3332bbb5d5cf96d3677cbd5c20737c77a45bdf7f
1 /*
2 * kernel/sched.c
4 * Kernel scheduler and related syscalls
6 * Copyright (C) 1991-2002 Linus Torvalds
8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
9 * make semaphores SMP safe
10 * 1998-11-19 Implemented schedule_timeout() and related stuff
11 * by Andrea Arcangeli
12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
13 * hybrid priority-list and round-robin design with
14 * an array-switch method of distributing timeslices
15 * and per-CPU runqueues. Cleanups and useful suggestions
16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin
19 * 2007-04-15 Work begun on replacing all interactivity tuning with a
20 * fair scheduling design by Con Kolivas.
21 * 2007-05-05 Load balancing (smp-nice) and other improvements
22 * by Peter Williams
23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
27 #include <linux/mm.h>
28 #include <linux/module.h>
29 #include <linux/nmi.h>
30 #include <linux/init.h>
31 #include <linux/uaccess.h>
32 #include <linux/highmem.h>
33 #include <linux/smp_lock.h>
34 #include <asm/mmu_context.h>
35 #include <linux/interrupt.h>
36 #include <linux/capability.h>
37 #include <linux/completion.h>
38 #include <linux/kernel_stat.h>
39 #include <linux/debug_locks.h>
40 #include <linux/security.h>
41 #include <linux/notifier.h>
42 #include <linux/profile.h>
43 #include <linux/freezer.h>
44 #include <linux/vmalloc.h>
45 #include <linux/blkdev.h>
46 #include <linux/delay.h>
47 #include <linux/smp.h>
48 #include <linux/threads.h>
49 #include <linux/timer.h>
50 #include <linux/rcupdate.h>
51 #include <linux/cpu.h>
52 #include <linux/cpuset.h>
53 #include <linux/percpu.h>
54 #include <linux/kthread.h>
55 #include <linux/seq_file.h>
56 #include <linux/syscalls.h>
57 #include <linux/times.h>
58 #include <linux/tsacct_kern.h>
59 #include <linux/kprobes.h>
60 #include <linux/delayacct.h>
61 #include <linux/reciprocal_div.h>
62 #include <linux/unistd.h>
64 #include <asm/tlb.h>
67 * Scheduler clock - returns current time in nanosec units.
68 * This is default implementation.
69 * Architectures and sub-architectures can override this.
71 unsigned long long __attribute__((weak)) sched_clock(void)
73 return (unsigned long long)jiffies * (1000000000 / HZ);
/*
 * Map user-nice values [ -20 ... 0 ... 19 ] onto static priorities
 * [ MAX_RT_PRIO..MAX_PRIO-1 ], and map them back again.
 */
#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p)		PRIO_TO_NICE((p)->static_prio)

/*
 * 'User priority' is the nice value converted to something that is
 * easier to work with when scaling scheduler parameters: a
 * [ 0 ... 39 ] range obtained by rebasing the static priority.
 */
#define USER_PRIO(p)		((p) - MAX_RT_PRIO)
#define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))

/*
 * Helpers for converting between nanosecond and jiffy resolution:
 */
#define NS_TO_JIFFIES(TIME)	((TIME) / (1000000000 / HZ))
#define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))

/* Load weight (and its shift) that corresponds to a nice-0 task. */
#define NICE_0_LOAD		SCHED_LOAD_SCALE
#define NICE_0_SHIFT		SCHED_LOAD_SHIFT

/*
 * These are the 'tuning knobs' of the scheduler:
 *
 * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
 * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
 * Timeslices get refilled after they expire.
 */
#define MIN_TIMESLICE		max(5 * HZ / 1000, 1)
#define DEF_TIMESLICE		(100 * HZ / 1000)
113 #ifdef CONFIG_SMP
115 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
116 * Since cpu_power is a 'constant', we can use a reciprocal divide.
118 static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
120 return reciprocal_divide(load, sg->reciprocal_cpu_power);
124 * Each time a sched group cpu_power is changed,
125 * we must compute its reciprocal value
127 static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
129 sg->__cpu_power += val;
130 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
132 #endif
/*
 * Scale the timeslice 'x' linearly with the distance of 'prio' from
 * MAX_PRIO, never going below MIN_TIMESLICE.
 *
 * Both macro arguments are parenthesized so that compound expressions
 * (e.g. "a + b") expand with the intended precedence.
 */
#define SCALE_PRIO(x, prio) \
	max((x) * (MAX_PRIO - (prio)) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
138 * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
139 * to time slice values: [800ms ... 100ms ... 5ms]
141 static unsigned int static_prio_timeslice(int static_prio)
143 if (static_prio == NICE_TO_PRIO(19))
144 return 1;
146 if (static_prio < NICE_TO_PRIO(0))
147 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
148 else
149 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
152 static inline int rt_policy(int policy)
154 if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
155 return 1;
156 return 0;
159 static inline int task_has_rt_policy(struct task_struct *p)
161 return rt_policy(p->policy);
165 * This is the priority-queue data structure of the RT scheduling class:
167 struct rt_prio_array {
168 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
169 struct list_head queue[MAX_RT_PRIO];
172 struct load_stat {
173 struct load_weight load;
174 u64 load_update_start, load_update_last;
175 unsigned long delta_fair, delta_exec, delta_stat;
178 /* CFS-related fields in a runqueue */
179 struct cfs_rq {
180 struct load_weight load;
181 unsigned long nr_running;
183 s64 fair_clock;
184 u64 exec_clock;
185 s64 wait_runtime;
186 u64 sleeper_bonus;
187 unsigned long wait_runtime_overruns, wait_runtime_underruns;
189 struct rb_root tasks_timeline;
190 struct rb_node *rb_leftmost;
191 struct rb_node *rb_load_balance_curr;
192 #ifdef CONFIG_FAIR_GROUP_SCHED
193 /* 'curr' points to currently running entity on this cfs_rq.
194 * It is set to NULL otherwise (i.e when none are currently running).
196 struct sched_entity *curr;
197 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
199 /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
200 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
201 * (like users, containers etc.)
203 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
204 * list is used during load balance.
206 struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
207 #endif
210 /* Real-Time classes' related field in a runqueue: */
211 struct rt_rq {
212 struct rt_prio_array active;
213 int rt_load_balance_idx;
214 struct list_head *rt_load_balance_head, *rt_load_balance_curr;
218 * This is the main, per-CPU runqueue data structure.
220 * Locking rule: those places that want to lock multiple runqueues
221 * (such as the load balancing or the thread migration code), lock
222 * acquire operations must be ordered by ascending &runqueue.
224 struct rq {
225 spinlock_t lock; /* runqueue lock */
228 * nr_running and cpu_load should be in the same cacheline because
229 * remote CPUs use both these fields when doing load calculation.
231 unsigned long nr_running;
232 #define CPU_LOAD_IDX_MAX 5
233 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
234 unsigned char idle_at_tick;
235 #ifdef CONFIG_NO_HZ
236 unsigned char in_nohz_recently;
237 #endif
238 struct load_stat ls; /* capture load from *all* tasks on this cpu */
239 unsigned long nr_load_updates;
240 u64 nr_switches;
242 struct cfs_rq cfs;
243 #ifdef CONFIG_FAIR_GROUP_SCHED
244 struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */
245 #endif
246 struct rt_rq rt;
249 * This is part of a global counter where only the total sum
250 * over all CPUs matters. A task can increase this counter on
251 * one CPU and if it got migrated afterwards it may decrease
252 * it on another CPU. Always updated under the runqueue lock:
254 unsigned long nr_uninterruptible;
256 struct task_struct *curr, *idle;
257 unsigned long next_balance;
258 struct mm_struct *prev_mm;
260 u64 clock, prev_clock_raw;
261 s64 clock_max_delta;
263 unsigned int clock_warps, clock_overflows;
264 unsigned int clock_unstable_events;
266 struct sched_class *load_balance_class;
268 atomic_t nr_iowait;
270 #ifdef CONFIG_SMP
271 struct sched_domain *sd;
273 /* For active balancing */
274 int active_balance;
275 int push_cpu;
276 int cpu; /* cpu of this runqueue */
278 struct task_struct *migration_thread;
279 struct list_head migration_queue;
280 #endif
282 #ifdef CONFIG_SCHEDSTATS
283 /* latency stats */
284 struct sched_info rq_sched_info;
286 /* sys_sched_yield() stats */
287 unsigned long yld_exp_empty;
288 unsigned long yld_act_empty;
289 unsigned long yld_both_empty;
290 unsigned long yld_cnt;
292 /* schedule() stats */
293 unsigned long sched_switch;
294 unsigned long sched_cnt;
295 unsigned long sched_goidle;
297 /* try_to_wake_up() stats */
298 unsigned long ttwu_cnt;
299 unsigned long ttwu_local;
300 #endif
301 struct lock_class_key rq_lock_key;
304 static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
305 static DEFINE_MUTEX(sched_hotcpu_mutex);
307 static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
309 rq->curr->sched_class->check_preempt_curr(rq, p);
/*
 * Returns the CPU number that runqueue 'rq' belongs to. Without
 * CONFIG_SMP there is only one CPU, so the answer is always 0.
 */
static inline int cpu_of(struct rq *rq)
{
#ifdef CONFIG_SMP
	return rq->cpu;
#else
	return 0;
#endif
}
322 * Per-runqueue clock, as finegrained as the platform can give us:
324 static unsigned long long __rq_clock(struct rq *rq)
326 u64 prev_raw = rq->prev_clock_raw;
327 u64 now = sched_clock();
328 s64 delta = now - prev_raw;
329 u64 clock = rq->clock;
332 * Protect against sched_clock() occasionally going backwards:
334 if (unlikely(delta < 0)) {
335 clock++;
336 rq->clock_warps++;
337 } else {
339 * Catch too large forward jumps too:
341 if (unlikely(delta > 2*TICK_NSEC)) {
342 clock++;
343 rq->clock_overflows++;
344 } else {
345 if (unlikely(delta > rq->clock_max_delta))
346 rq->clock_max_delta = delta;
347 clock += delta;
351 rq->prev_clock_raw = now;
352 rq->clock = clock;
354 return clock;
357 static inline unsigned long long rq_clock(struct rq *rq)
359 int this_cpu = smp_processor_id();
361 if (this_cpu == cpu_of(rq))
362 return __rq_clock(rq);
364 return rq->clock;
/*
 * The domain tree (rq->sd) is protected by RCU's quiescent state
 * transition. See detach_destroy_domains: synchronize_sched for details.
 *
 * The domain tree of any CPU may only be accessed from within
 * preempt-disabled sections.
 */
#define for_each_domain(cpu, __sd) \
	for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)

/* Accessors for per-CPU runqueues and the tasks running on them. */
#define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
#define this_rq()		(&__get_cpu_var(runqueues))
#define task_rq(p)		cpu_rq(task_cpu(p))
#define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
382 #ifdef CONFIG_FAIR_GROUP_SCHED
383 /* Change a task's ->cfs_rq if it moves across CPUs */
384 static inline void set_task_cfs_rq(struct task_struct *p)
386 p->se.cfs_rq = &task_rq(p)->cfs;
388 #else
/* Without CONFIG_FAIR_GROUP_SCHED there is nothing to update. */
static inline void set_task_cfs_rq(struct task_struct *p)
{
}
392 #endif
394 #ifndef prepare_arch_switch
395 # define prepare_arch_switch(next) do { } while (0)
396 #endif
397 #ifndef finish_arch_switch
398 # define finish_arch_switch(prev) do { } while (0)
399 #endif
401 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
402 static inline int task_running(struct rq *rq, struct task_struct *p)
404 return rq->curr == p;
/*
 * Nothing to prepare here: in this configuration the runqueue lock
 * is only dropped in finish_lock_switch().
 */
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
}
411 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
413 #ifdef CONFIG_DEBUG_SPINLOCK
414 /* this is a valid case when another task releases the spinlock */
415 rq->lock.owner = current;
416 #endif
418 * If we are tracking spinlock dependencies then we have to
419 * fix up the runqueue lock - which gets 'carried over' from
420 * prev into current:
422 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
424 spin_unlock_irq(&rq->lock);
427 #else /* __ARCH_WANT_UNLOCKED_CTXSW */
428 static inline int task_running(struct rq *rq, struct task_struct *p)
430 #ifdef CONFIG_SMP
431 return p->oncpu;
432 #else
433 return rq->curr == p;
434 #endif
437 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
439 #ifdef CONFIG_SMP
441 * We can optimise this out completely for !SMP, because the
442 * SMP rebalancing from interrupt is the only thing that cares
443 * here.
445 next->oncpu = 1;
446 #endif
447 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
448 spin_unlock_irq(&rq->lock);
449 #else
450 spin_unlock(&rq->lock);
451 #endif
/*
 * Finish a context switch: clear prev->oncpu (SMP only) and, unless
 * the architecture already runs the switch with interrupts enabled,
 * re-enable interrupts.
 */
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_SMP
	/*
	 * After ->oncpu is cleared, the task can be moved to a different CPU.
	 * We must ensure this doesn't happen until the switch is completely
	 * finished.
	 */
	smp_wmb();
	prev->oncpu = 0;
#endif
#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
	local_irq_enable();
#endif
}
469 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
472 * __task_rq_lock - lock the runqueue a given task resides on.
473 * Must be called interrupts disabled.
475 static inline struct rq *__task_rq_lock(struct task_struct *p)
476 __acquires(rq->lock)
478 struct rq *rq;
480 repeat_lock_task:
481 rq = task_rq(p);
482 spin_lock(&rq->lock);
483 if (unlikely(rq != task_rq(p))) {
484 spin_unlock(&rq->lock);
485 goto repeat_lock_task;
487 return rq;
491 * task_rq_lock - lock the runqueue a given task resides on and disable
492 * interrupts. Note the ordering: we can safely lookup the task_rq without
493 * explicitly disabling preemption.
495 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
496 __acquires(rq->lock)
498 struct rq *rq;
500 repeat_lock_task:
501 local_irq_save(*flags);
502 rq = task_rq(p);
503 spin_lock(&rq->lock);
504 if (unlikely(rq != task_rq(p))) {
505 spin_unlock_irqrestore(&rq->lock, *flags);
506 goto repeat_lock_task;
508 return rq;
511 static inline void __task_rq_unlock(struct rq *rq)
512 __releases(rq->lock)
514 spin_unlock(&rq->lock);
517 static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
518 __releases(rq->lock)
520 spin_unlock_irqrestore(&rq->lock, *flags);
524 * this_rq_lock - lock this runqueue and disable interrupts.
526 static inline struct rq *this_rq_lock(void)
527 __acquires(rq->lock)
529 struct rq *rq;
531 local_irq_disable();
532 rq = this_rq();
533 spin_lock(&rq->lock);
535 return rq;
539 * CPU frequency is/was unstable - start new by setting prev_clock_raw:
541 void sched_clock_unstable_event(void)
543 unsigned long flags;
544 struct rq *rq;
546 rq = task_rq_lock(current, &flags);
547 rq->prev_clock_raw = sched_clock();
548 rq->clock_unstable_events++;
549 task_rq_unlock(rq, &flags);
553 * resched_task - mark a task 'to be rescheduled now'.
555 * On UP this means the setting of the need_resched flag, on SMP it
556 * might also involve a cross-CPU call to trigger the scheduler on
557 * the target CPU.
559 #ifdef CONFIG_SMP
561 #ifndef tsk_is_polling
562 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
563 #endif
565 static void resched_task(struct task_struct *p)
567 int cpu;
569 assert_spin_locked(&task_rq(p)->lock);
571 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
572 return;
574 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
576 cpu = task_cpu(p);
577 if (cpu == smp_processor_id())
578 return;
580 /* NEED_RESCHED must be visible before we test polling */
581 smp_mb();
582 if (!tsk_is_polling(p))
583 smp_send_reschedule(cpu);
586 static void resched_cpu(int cpu)
588 struct rq *rq = cpu_rq(cpu);
589 unsigned long flags;
591 if (!spin_trylock_irqsave(&rq->lock, flags))
592 return;
593 resched_task(cpu_curr(cpu));
594 spin_unlock_irqrestore(&rq->lock, flags);
596 #else
597 static inline void resched_task(struct task_struct *p)
599 assert_spin_locked(&task_rq(p)->lock);
600 set_tsk_need_resched(p);
602 #endif
604 static u64 div64_likely32(u64 divident, unsigned long divisor)
606 #if BITS_PER_LONG == 32
607 if (likely(divident <= 0xffffffffULL))
608 return (u32)divident / divisor;
609 do_div(divident, divisor);
611 return divident;
612 #else
613 return divident / divisor;
614 #endif
/*
 * Fixed-point constants used by calc_delta_mine(): inverse weights are
 * computed as WMULT_CONST / weight and products are shifted right by
 * WMULT_SHIFT bits.
 */
#if BITS_PER_LONG == 32
# define WMULT_CONST	(~0UL)
#else
# define WMULT_CONST	(1UL << 32)
#endif

#define WMULT_SHIFT	32
625 static inline unsigned long
626 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
627 struct load_weight *lw)
629 u64 tmp;
631 if (unlikely(!lw->inv_weight))
632 lw->inv_weight = WMULT_CONST / lw->weight;
634 tmp = (u64)delta_exec * weight;
636 * Check whether we'd overflow the 64-bit multiplication:
638 if (unlikely(tmp > WMULT_CONST)) {
639 tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight)
640 >> (WMULT_SHIFT/2);
641 } else {
642 tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;
645 return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit);
648 static inline unsigned long
649 calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
651 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
654 static void update_load_add(struct load_weight *lw, unsigned long inc)
656 lw->weight += inc;
657 lw->inv_weight = 0;
660 static void update_load_sub(struct load_weight *lw, unsigned long dec)
662 lw->weight -= dec;
663 lw->inv_weight = 0;
666 static void __update_curr_load(struct rq *rq, struct load_stat *ls)
668 if (rq->curr != rq->idle && ls->load.weight) {
669 ls->delta_exec += ls->delta_stat;
670 ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
671 ls->delta_stat = 0;
676 * Update delta_exec, delta_fair fields for rq.
678 * delta_fair clock advances at a rate inversely proportional to
679 * total load (rq->ls.load.weight) on the runqueue, while
680 * delta_exec advances at the same rate as wall-clock (provided
681 * cpu is not idle).
683 * delta_exec / delta_fair is a measure of the (smoothened) load on this
684 * runqueue over any given interval. This (smoothened) load is used
685 * during load balance.
687 * This function is called /before/ updating rq->ls.load
688 * and when switching tasks.
690 static void update_curr_load(struct rq *rq, u64 now)
692 struct load_stat *ls = &rq->ls;
693 u64 start;
695 start = ls->load_update_start;
696 ls->load_update_start = now;
697 ls->delta_stat += now - start;
699 * Stagger updates to ls->delta_fair. Very frequent updates
700 * can be expensive.
702 if (ls->delta_stat >= sysctl_sched_stat_granularity)
703 __update_curr_load(rq, ls);
/*
 * To aid in avoiding the subversion of "niceness" due to uneven
 * distribution of tasks with abnormal "nice" values across CPUs the
 * contribution that each task makes to its run queue's load is weighted
 * according to its scheduling class and "nice" value. For SCHED_NORMAL
 * tasks this is just a scaled version of the new time slice allocation
 * that they receive on time slice expiry etc.
 */

/*
 * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
 * If static_prio_timeslice() is ever changed to break this assumption
 * then this code will need modification.
 */
#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
#define load_weight(lp) \
	(((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
#define PRIO_TO_LOAD_WEIGHT(prio) \
	load_weight(static_prio_timeslice(prio))
#define RTPRIO_TO_LOAD_WEIGHT(rp) \
	(PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + load_weight(rp))

/* Load weight and inverse-weight multiplier used for SCHED_IDLE tasks. */
#define WEIGHT_IDLEPRIO		2
#define WMULT_IDLEPRIO		(1 << 31)
732 * Nice levels are multiplicative, with a gentle 10% change for every
733 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
734 * nice 1, it will get ~10% less CPU time than another CPU-bound task
735 * that remained on nice 0.
737 * The "10% effect" is relative and cumulative: from _any_ nice level,
738 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
739 * it's +10% CPU usage.
741 static const int prio_to_weight[40] = {
742 /* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921,
743 /* -10 */ 9537, 7629, 6103, 4883, 3906, 3125, 2500, 2000, 1600, 1280,
744 /* 0 */ NICE_0_LOAD /* 1024 */,
745 /* 1 */ 819, 655, 524, 419, 336, 268, 215, 172, 137,
746 /* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15,
749 static const u32 prio_to_wmult[40] = {
750 48356, 60446, 75558, 94446, 118058, 147573,
751 184467, 230589, 288233, 360285, 450347,
752 562979, 703746, 879575, 1099582, 1374389,
753 1717986, 2147483, 2684354, 3355443, 4194304,
754 5244160, 6557201, 8196502, 10250518, 12782640,
755 16025997, 19976592, 24970740, 31350126, 39045157,
756 49367440, 61356675, 76695844, 95443717, 119304647,
757 148102320, 186737708, 238609294, 286331153,
760 static inline void
761 inc_load(struct rq *rq, const struct task_struct *p, u64 now)
763 update_curr_load(rq, now);
764 update_load_add(&rq->ls.load, p->se.load.weight);
767 static inline void
768 dec_load(struct rq *rq, const struct task_struct *p, u64 now)
770 update_curr_load(rq, now);
771 update_load_sub(&rq->ls.load, p->se.load.weight);
774 static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
776 rq->nr_running++;
777 inc_load(rq, p, now);
780 static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
782 rq->nr_running--;
783 dec_load(rq, p, now);
786 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
/*
 * runqueue iterator, to support SMP load-balancing between different
 * scheduling classes, without having to expose their internal data
 * structures to the load-balancing proper:
 */
struct rq_iterator {
	void *arg;	/* opaque argument handed to start() and next() */
	struct task_struct *(*start)(void *);
	struct task_struct *(*next)(void *);
};
799 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
800 unsigned long max_nr_move, unsigned long max_load_move,
801 struct sched_domain *sd, enum cpu_idle_type idle,
802 int *all_pinned, unsigned long *load_moved,
803 int this_best_prio, int best_prio, int best_prio_seen,
804 struct rq_iterator *iterator);
806 #include "sched_stats.h"
807 #include "sched_rt.c"
808 #include "sched_fair.c"
809 #include "sched_idletask.c"
810 #ifdef CONFIG_SCHED_DEBUG
811 # include "sched_debug.c"
812 #endif
814 #define sched_class_highest (&rt_sched_class)
816 static void set_load_weight(struct task_struct *p)
818 task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
819 p->se.wait_runtime = 0;
821 if (task_has_rt_policy(p)) {
822 p->se.load.weight = prio_to_weight[0] * 2;
823 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
824 return;
828 * SCHED_IDLE tasks get minimal weight:
830 if (p->policy == SCHED_IDLE) {
831 p->se.load.weight = WEIGHT_IDLEPRIO;
832 p->se.load.inv_weight = WMULT_IDLEPRIO;
833 return;
836 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
837 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
840 static void
841 enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
843 sched_info_queued(p);
844 p->sched_class->enqueue_task(rq, p, wakeup, now);
845 p->se.on_rq = 1;
848 static void
849 dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now)
851 p->sched_class->dequeue_task(rq, p, sleep, now);
852 p->se.on_rq = 0;
856 * __normal_prio - return the priority that is based on the static prio
858 static inline int __normal_prio(struct task_struct *p)
860 return p->static_prio;
864 * Calculate the expected normal priority: i.e. priority
865 * without taking RT-inheritance into account. Might be
866 * boosted by interactivity modifiers. Changes upon fork,
867 * setprio syscalls, and whenever the interactivity
868 * estimator recalculates.
870 static inline int normal_prio(struct task_struct *p)
872 int prio;
874 if (task_has_rt_policy(p))
875 prio = MAX_RT_PRIO-1 - p->rt_priority;
876 else
877 prio = __normal_prio(p);
878 return prio;
882 * Calculate the current priority, i.e. the priority
883 * taken into account by the scheduler. This value might
884 * be boosted by RT tasks, or might be boosted by
885 * interactivity modifiers. Will be RT if the task got
886 * RT-boosted. If not then it returns p->normal_prio.
888 static int effective_prio(struct task_struct *p)
890 p->normal_prio = normal_prio(p);
892 * If we are RT tasks or we were boosted to RT priority,
893 * keep the priority unchanged. Otherwise, update priority
894 * to the normal priority:
896 if (!rt_prio(p->prio))
897 return p->normal_prio;
898 return p->prio;
902 * activate_task - move a task to the runqueue.
904 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
906 u64 now = rq_clock(rq);
908 if (p->state == TASK_UNINTERRUPTIBLE)
909 rq->nr_uninterruptible--;
911 enqueue_task(rq, p, wakeup, now);
912 inc_nr_running(p, rq, now);
916 * activate_idle_task - move idle task to the _front_ of runqueue.
918 static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
920 u64 now = rq_clock(rq);
922 if (p->state == TASK_UNINTERRUPTIBLE)
923 rq->nr_uninterruptible--;
925 enqueue_task(rq, p, 0, now);
926 inc_nr_running(p, rq, now);
930 * deactivate_task - remove a task from the runqueue.
932 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
934 u64 now = rq_clock(rq);
936 if (p->state == TASK_UNINTERRUPTIBLE)
937 rq->nr_uninterruptible++;
939 dequeue_task(rq, p, sleep, now);
940 dec_nr_running(p, rq, now);
/**
 * task_curr - is this task currently executing on a CPU?
 * @p: the task in question.
 *
 * Compares @p against the current task of the CPU it is assigned to.
 */
inline int task_curr(const struct task_struct *p)
{
	int cpu = task_cpu(p);

	return cpu_curr(cpu) == p;
}
952 /* Used instead of source_load when we know the type == 0 */
953 unsigned long weighted_cpuload(const int cpu)
955 return cpu_rq(cpu)->ls.load.weight;
/*
 * Record 'cpu' as the CPU of task 'p' and update the task's cfs_rq
 * to match. This is a no-op without CONFIG_SMP.
 */
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
#ifdef CONFIG_SMP
	task_thread_info(p)->cpu = cpu;
	set_task_cfs_rq(p);
#endif
}
966 #ifdef CONFIG_SMP
968 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
970 int old_cpu = task_cpu(p);
971 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
972 u64 clock_offset, fair_clock_offset;
974 clock_offset = old_rq->clock - new_rq->clock;
975 fair_clock_offset = old_rq->cfs.fair_clock -
976 new_rq->cfs.fair_clock;
977 if (p->se.wait_start)
978 p->se.wait_start -= clock_offset;
979 if (p->se.wait_start_fair)
980 p->se.wait_start_fair -= fair_clock_offset;
981 if (p->se.sleep_start)
982 p->se.sleep_start -= clock_offset;
983 if (p->se.block_start)
984 p->se.block_start -= clock_offset;
985 if (p->se.sleep_start_fair)
986 p->se.sleep_start_fair -= fair_clock_offset;
988 __set_task_cpu(p, new_cpu);
991 struct migration_req {
992 struct list_head list;
994 struct task_struct *task;
995 int dest_cpu;
997 struct completion done;
1001 * The task's runqueue lock must be held.
1002 * Returns true if you have to wait for migration thread.
1004 static int
1005 migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1007 struct rq *rq = task_rq(p);
1010 * If the task is not on a runqueue (and not running), then
1011 * it is sufficient to simply update the task's cpu field.
1013 if (!p->se.on_rq && !task_running(rq, p)) {
1014 set_task_cpu(p, dest_cpu);
1015 return 0;
1018 init_completion(&req->done);
1019 req->task = p;
1020 req->dest_cpu = dest_cpu;
1021 list_add(&req->list, &rq->migration_queue);
1023 return 1;
1027 * wait_task_inactive - wait for a thread to unschedule.
1029 * The caller must ensure that the task *will* unschedule sometime soon,
1030 * else this function might spin for a *long* time. This function can't
1031 * be called with interrupts off, or it may introduce deadlock with
1032 * smp_call_function() if an IPI is sent by the same process we are
1033 * waiting to become inactive.
1035 void wait_task_inactive(struct task_struct *p)
1037 unsigned long flags;
1038 int running, on_rq;
1039 struct rq *rq;
1041 repeat:
1043 * We do the initial early heuristics without holding
1044 * any task-queue locks at all. We'll only try to get
1045 * the runqueue lock when things look like they will
1046 * work out!
1048 rq = task_rq(p);
1051 * If the task is actively running on another CPU
1052 * still, just relax and busy-wait without holding
1053 * any locks.
1055 * NOTE! Since we don't hold any locks, it's not
1056 * even sure that "rq" stays as the right runqueue!
1057 * But we don't care, since "task_running()" will
1058 * return false if the runqueue has changed and p
1059 * is actually now running somewhere else!
1061 while (task_running(rq, p))
1062 cpu_relax();
1065 * Ok, time to look more closely! We need the rq
1066 * lock now, to be *sure*. If we're wrong, we'll
1067 * just go back and repeat.
1069 rq = task_rq_lock(p, &flags);
1070 running = task_running(rq, p);
1071 on_rq = p->se.on_rq;
1072 task_rq_unlock(rq, &flags);
1075 * Was it really running after all now that we
1076 * checked with the proper locks actually held?
1078 * Oops. Go back and try again..
1080 if (unlikely(running)) {
1081 cpu_relax();
1082 goto repeat;
1086 * It's not enough that it's not actively running,
1087 * it must be off the runqueue _entirely_, and not
1088 * preempted!
1090 * So if it wa still runnable (but just not actively
1091 * running right now), it's preempted, and we should
1092 * yield - it could be a while.
1094 if (unlikely(on_rq)) {
1095 yield();
1096 goto repeat;
1100 * Ahh, all good. It wasn't running, and it wasn't
1101 * runnable, which means that it will never become
1102 * running in the future either. We're all done!
/***
 * kick_process - kick a running thread to enter/exit the kernel
 * @p: the to-be-kicked thread
 *
 * Cause a process which is running on another CPU to enter
 * kernel-mode, without any delay. (to get signals handled.)
 *
 * NOTE: this function doesn't have to take the runqueue lock,
 * because all it wants to ensure is that the remote task enters
 * the kernel. If the IPI races and the task has been migrated
 * to another CPU then no harm is done and the purpose has been
 * achieved as well.
 */
void kick_process(struct task_struct *p)
{
	int cpu;

	preempt_disable();

	cpu = task_cpu(p);
	/* Only a task running on some other CPU needs the IPI. */
	if (cpu != smp_processor_id() && task_curr(p))
		smp_send_reschedule(cpu);

	preempt_enable();
}
1131 * Return a low guess at the load of a migration-source cpu weighted
1132 * according to the scheduling class and "nice" value.
1134 * We want to under-estimate the load of migration sources, to
1135 * balance conservatively.
1137 static inline unsigned long source_load(int cpu, int type)
1139 struct rq *rq = cpu_rq(cpu);
1140 unsigned long total = weighted_cpuload(cpu);
1142 if (type == 0)
1143 return total;
1145 return min(rq->cpu_load[type-1], total);
1149 * Return a high guess at the load of a migration-target cpu weighted
1150 * according to the scheduling class and "nice" value.
1152 static inline unsigned long target_load(int cpu, int type)
1154 struct rq *rq = cpu_rq(cpu);
1155 unsigned long total = weighted_cpuload(cpu);
1157 if (type == 0)
1158 return total;
1160 return max(rq->cpu_load[type-1], total);
1164 * Return the average load per task on the cpu's run queue
1166 static inline unsigned long cpu_avg_load_per_task(int cpu)
1168 struct rq *rq = cpu_rq(cpu);
1169 unsigned long total = weighted_cpuload(cpu);
1170 unsigned long n = rq->nr_running;
1172 return n ? total / n : SCHED_LOAD_SCALE;
1176 * find_idlest_group finds and returns the least busy CPU group within the
1177 * domain.
1179 static struct sched_group *
1180 find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1182 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
1183 unsigned long min_load = ULONG_MAX, this_load = 0;
1184 int load_idx = sd->forkexec_idx;
1185 int imbalance = 100 + (sd->imbalance_pct-100)/2;
1187 do {
1188 unsigned long load, avg_load;
1189 int local_group;
1190 int i;
1192 /* Skip over this group if it has no CPUs allowed */
1193 if (!cpus_intersects(group->cpumask, p->cpus_allowed))
1194 goto nextgroup;
1196 local_group = cpu_isset(this_cpu, group->cpumask);
1198 /* Tally up the load of all CPUs in the group */
1199 avg_load = 0;
1201 for_each_cpu_mask(i, group->cpumask) {
1202 /* Bias balancing toward cpus of our domain */
1203 if (local_group)
1204 load = source_load(i, load_idx);
1205 else
1206 load = target_load(i, load_idx);
1208 avg_load += load;
1211 /* Adjust by relative CPU power of the group */
1212 avg_load = sg_div_cpu_power(group,
1213 avg_load * SCHED_LOAD_SCALE);
1215 if (local_group) {
1216 this_load = avg_load;
1217 this = group;
1218 } else if (avg_load < min_load) {
1219 min_load = avg_load;
1220 idlest = group;
1222 nextgroup:
1223 group = group->next;
1224 } while (group != sd->groups);
1226 if (!idlest || 100*this_load < imbalance*min_load)
1227 return NULL;
1228 return idlest;
1232 * find_idlest_cpu - find the idlest cpu among the cpus in group.
1234 static int
1235 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1237 cpumask_t tmp;
1238 unsigned long load, min_load = ULONG_MAX;
1239 int idlest = -1;
1240 int i;
1242 /* Traverse only the allowed CPUs */
1243 cpus_and(tmp, group->cpumask, p->cpus_allowed);
1245 for_each_cpu_mask(i, tmp) {
1246 load = weighted_cpuload(i);
1248 if (load < min_load || (load == min_load && i == this_cpu)) {
1249 min_load = load;
1250 idlest = i;
1254 return idlest;
1258 * sched_balance_self: balance the current task (running on cpu) in domains
1259 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1260 * SD_BALANCE_EXEC.
1262 * Balance, ie. select the least loaded group.
1264 * Returns the target CPU number, or the same CPU if no balancing is needed.
1266 * preempt must be disabled.
1268 static int sched_balance_self(int cpu, int flag)
1270 struct task_struct *t = current;
1271 struct sched_domain *tmp, *sd = NULL;
1273 for_each_domain(cpu, tmp) {
1275 * If power savings logic is enabled for a domain, stop there.
1277 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1278 break;
1279 if (tmp->flags & flag)
1280 sd = tmp;
1283 while (sd) {
1284 cpumask_t span;
1285 struct sched_group *group;
1286 int new_cpu, weight;
1288 if (!(sd->flags & flag)) {
1289 sd = sd->child;
1290 continue;
1293 span = sd->span;
1294 group = find_idlest_group(sd, t, cpu);
1295 if (!group) {
1296 sd = sd->child;
1297 continue;
1300 new_cpu = find_idlest_cpu(group, t, cpu);
1301 if (new_cpu == -1 || new_cpu == cpu) {
1302 /* Now try balancing at a lower domain level of cpu */
1303 sd = sd->child;
1304 continue;
1307 /* Now try balancing at a lower domain level of new_cpu */
1308 cpu = new_cpu;
1309 sd = NULL;
1310 weight = cpus_weight(span);
1311 for_each_domain(cpu, tmp) {
1312 if (weight <= cpus_weight(tmp->span))
1313 break;
1314 if (tmp->flags & flag)
1315 sd = tmp;
1317 /* while loop will break here if sd == NULL */
1320 return cpu;
1323 #endif /* CONFIG_SMP */
/*
 * wake_idle() will wake a task on an idle cpu if task->cpu is
 * not idle and an idle cpu is available.  The span of cpus to
 * search starts with cpus closest then further out as needed,
 * so we always favor a closer, idle cpu.
 *
 * Returns the CPU we should wake onto.
 */
#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
static int wake_idle(int cpu, struct task_struct *p)
{
	struct sched_domain *sd;
	cpumask_t candidates;
	int i;

	/*
	 * An idle cpu is the best place to run this task.
	 *
	 * The cpu is also kept when it already has more than one task:
	 * its siblings must (in most cases) be busy as well, since they did
	 * not pick up the extra load, so their runqueues need not be
	 * inspected.  This avoids the checks and the cache miss penalties
	 * associated with them.
	 */
	if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
		return cpu;

	for_each_domain(cpu, sd) {
		/* Stop at the first domain that does not allow idle wakeups */
		if (!(sd->flags & SD_WAKE_IDLE))
			break;

		cpus_and(candidates, sd->span, p->cpus_allowed);
		for_each_cpu_mask(i, candidates) {
			if (idle_cpu(i))
				return i;
		}
	}

	return cpu;
}
#else
/* Without ARCH_HAS_SCHED_WAKE_IDLE the requested cpu is used as is. */
static inline int wake_idle(int cpu, struct task_struct *p)
{
	return cpu;
}
#endif
1372 /***
1373 * try_to_wake_up - wake up a thread
1374 * @p: the to-be-woken-up thread
1375 * @state: the mask of task states that can be woken
1376 * @sync: do a synchronous wakeup?
1378 * Put it on the run-queue if it's not already there. The "current"
1379 * thread is always on the run-queue (except when the actual
1380 * re-schedule is in progress), and as such you're allowed to do
1381 * the simpler "current->state = TASK_RUNNING" to mark yourself
1382 * runnable without the overhead of this.
1384 * returns failure only if the task is already active.
1386 static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1388 int cpu, this_cpu, success = 0;
1389 unsigned long flags;
1390 long old_state;
1391 struct rq *rq;
1392 #ifdef CONFIG_SMP
1393 struct sched_domain *sd, *this_sd = NULL;
1394 unsigned long load, this_load;
1395 int new_cpu;
1396 #endif
1398 rq = task_rq_lock(p, &flags);
1399 old_state = p->state;
1400 if (!(old_state & state))
1401 goto out;
1403 if (p->se.on_rq)
1404 goto out_running;
1406 cpu = task_cpu(p);
1407 this_cpu = smp_processor_id();
1409 #ifdef CONFIG_SMP
1410 if (unlikely(task_running(rq, p)))
1411 goto out_activate;
1413 new_cpu = cpu;
1415 schedstat_inc(rq, ttwu_cnt);
1416 if (cpu == this_cpu) {
1417 schedstat_inc(rq, ttwu_local);
1418 goto out_set_cpu;
1421 for_each_domain(this_cpu, sd) {
1422 if (cpu_isset(cpu, sd->span)) {
1423 schedstat_inc(sd, ttwu_wake_remote);
1424 this_sd = sd;
1425 break;
1429 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1430 goto out_set_cpu;
1433 * Check for affine wakeup and passive balancing possibilities.
1435 if (this_sd) {
1436 int idx = this_sd->wake_idx;
1437 unsigned int imbalance;
1439 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
1441 load = source_load(cpu, idx);
1442 this_load = target_load(this_cpu, idx);
1444 new_cpu = this_cpu; /* Wake to this CPU if we can */
1446 if (this_sd->flags & SD_WAKE_AFFINE) {
1447 unsigned long tl = this_load;
1448 unsigned long tl_per_task;
1450 tl_per_task = cpu_avg_load_per_task(this_cpu);
1453 * If sync wakeup then subtract the (maximum possible)
1454 * effect of the currently running task from the load
1455 * of the current CPU:
1457 if (sync)
1458 tl -= current->se.load.weight;
1460 if ((tl <= load &&
1461 tl + target_load(cpu, idx) <= tl_per_task) ||
1462 100*(tl + p->se.load.weight) <= imbalance*load) {
1464 * This domain has SD_WAKE_AFFINE and
1465 * p is cache cold in this domain, and
1466 * there is no bad imbalance.
1468 schedstat_inc(this_sd, ttwu_move_affine);
1469 goto out_set_cpu;
1474 * Start passive balancing when half the imbalance_pct
1475 * limit is reached.
1477 if (this_sd->flags & SD_WAKE_BALANCE) {
1478 if (imbalance*this_load <= 100*load) {
1479 schedstat_inc(this_sd, ttwu_move_balance);
1480 goto out_set_cpu;
1485 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1486 out_set_cpu:
1487 new_cpu = wake_idle(new_cpu, p);
1488 if (new_cpu != cpu) {
1489 set_task_cpu(p, new_cpu);
1490 task_rq_unlock(rq, &flags);
1491 /* might preempt at this point */
1492 rq = task_rq_lock(p, &flags);
1493 old_state = p->state;
1494 if (!(old_state & state))
1495 goto out;
1496 if (p->se.on_rq)
1497 goto out_running;
1499 this_cpu = smp_processor_id();
1500 cpu = task_cpu(p);
1503 out_activate:
1504 #endif /* CONFIG_SMP */
1505 activate_task(rq, p, 1);
1507 * Sync wakeups (i.e. those types of wakeups where the waker
1508 * has indicated that it will leave the CPU in short order)
1509 * don't trigger a preemption, if the woken up task will run on
1510 * this cpu. (in this case the 'I will reschedule' promise of
1511 * the waker guarantees that the freshly woken up task is going
1512 * to be considered on this CPU.)
1514 if (!sync || cpu != this_cpu)
1515 check_preempt_curr(rq, p);
1516 success = 1;
1518 out_running:
1519 p->state = TASK_RUNNING;
1520 out:
1521 task_rq_unlock(rq, &flags);
1523 return success;
1526 int fastcall wake_up_process(struct task_struct *p)
1528 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1529 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1531 EXPORT_SYMBOL(wake_up_process);
1533 int fastcall wake_up_state(struct task_struct *p, unsigned int state)
1535 return try_to_wake_up(p, state, 0);
1539 * Perform scheduler related setup for a newly forked process p.
1540 * p is forked by current.
1542 * __sched_fork() is basic setup used by init_idle() too:
1544 static void __sched_fork(struct task_struct *p)
1546 p->se.wait_start_fair = 0;
1547 p->se.wait_start = 0;
1548 p->se.exec_start = 0;
1549 p->se.sum_exec_runtime = 0;
1550 p->se.delta_exec = 0;
1551 p->se.delta_fair_run = 0;
1552 p->se.delta_fair_sleep = 0;
1553 p->se.wait_runtime = 0;
1554 p->se.sum_wait_runtime = 0;
1555 p->se.sum_sleep_runtime = 0;
1556 p->se.sleep_start = 0;
1557 p->se.sleep_start_fair = 0;
1558 p->se.block_start = 0;
1559 p->se.sleep_max = 0;
1560 p->se.block_max = 0;
1561 p->se.exec_max = 0;
1562 p->se.wait_max = 0;
1563 p->se.wait_runtime_overruns = 0;
1564 p->se.wait_runtime_underruns = 0;
1566 INIT_LIST_HEAD(&p->run_list);
1567 p->se.on_rq = 0;
1570 * We mark the process as running here, but have not actually
1571 * inserted it onto the runqueue yet. This guarantees that
1572 * nobody will actually run it, and a signal or other external
1573 * event cannot wake it up and insert it on the runqueue either.
1575 p->state = TASK_RUNNING;
1579 * fork()/clone()-time setup:
1581 void sched_fork(struct task_struct *p, int clone_flags)
1583 int cpu = get_cpu();
1585 __sched_fork(p);
1587 #ifdef CONFIG_SMP
1588 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1589 #endif
1590 __set_task_cpu(p, cpu);
1593 * Make sure we do not leak PI boosting priority to the child:
1595 p->prio = current->normal_prio;
1597 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1598 if (likely(sched_info_on()))
1599 memset(&p->sched_info, 0, sizeof(p->sched_info));
1600 #endif
1601 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
1602 p->oncpu = 0;
1603 #endif
1604 #ifdef CONFIG_PREEMPT
1605 /* Want to start with kernel preemption disabled. */
1606 task_thread_info(p)->preempt_count = 1;
1607 #endif
1608 put_cpu();
1612 * After fork, child runs first. (default) If set to 0 then
1613 * parent will (try to) run first.
1615 unsigned int __read_mostly sysctl_sched_child_runs_first = 1;
1618 * wake_up_new_task - wake up a newly created task for the first time.
1620 * This function will do some initial scheduler statistics housekeeping
1621 * that must be done for every newly created context, then puts the task
1622 * on the runqueue and wakes it.
1624 void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1626 unsigned long flags;
1627 struct rq *rq;
1628 int this_cpu;
1630 rq = task_rq_lock(p, &flags);
1631 BUG_ON(p->state != TASK_RUNNING);
1632 this_cpu = smp_processor_id(); /* parent's CPU */
1634 p->prio = effective_prio(p);
1636 if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) ||
1637 task_cpu(p) != this_cpu || !current->se.on_rq) {
1638 activate_task(rq, p, 0);
1639 } else {
1641 * Let the scheduling class do new task startup
1642 * management (if any):
1644 p->sched_class->task_new(rq, p);
1646 check_preempt_curr(rq, p);
1647 task_rq_unlock(rq, &flags);
/**
 * prepare_task_switch - prepare to switch tasks
 * @rq: the runqueue preparing to switch
 * @next: the task we are going to switch to.
 *
 * This is called with the rq lock held and interrupts off. It must
 * be paired with a subsequent finish_task_switch after the context
 * switch.
 *
 * prepare_task_switch sets up locking and calls architecture specific
 * hooks.
 */
static inline void prepare_task_switch(struct rq *rq, struct task_struct *next)
{
	/* Locking first ... */
	prepare_lock_switch(rq, next);
	/* ... then the architecture hook */
	prepare_arch_switch(next);
}
1669 * finish_task_switch - clean up after a task-switch
1670 * @rq: runqueue associated with task-switch
1671 * @prev: the thread we just switched away from.
1673 * finish_task_switch must be called after the context switch, paired
1674 * with a prepare_task_switch call before the context switch.
1675 * finish_task_switch will reconcile locking set up by prepare_task_switch,
1676 * and do any other architecture-specific cleanup actions.
1678 * Note that we may have delayed dropping an mm in context_switch(). If
1679 * so, we finish that here outside of the runqueue lock. (Doing it
1680 * with the lock held can cause deadlocks; see schedule() for
1681 * details.)
1683 static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
1684 __releases(rq->lock)
1686 struct mm_struct *mm = rq->prev_mm;
1687 long prev_state;
1689 rq->prev_mm = NULL;
1692 * A task struct has one reference for the use as "current".
1693 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
1694 * schedule one last time. The schedule call will never return, and
1695 * the scheduled task must drop that reference.
1696 * The test for TASK_DEAD must occur while the runqueue locks are
1697 * still held, otherwise prev could be scheduled on another cpu, die
1698 * there before we look at prev->state, and then the reference would
1699 * be dropped twice.
1700 * Manfred Spraul <manfred@colorfullife.com>
1702 prev_state = prev->state;
1703 finish_arch_switch(prev);
1704 finish_lock_switch(rq, prev);
1705 if (mm)
1706 mmdrop(mm);
1707 if (unlikely(prev_state == TASK_DEAD)) {
1709 * Remove function-return probe instances associated with this
1710 * task and put them back on the free list.
1712 kprobe_flush_task(prev);
1713 put_task_struct(prev);
1718 * schedule_tail - first thing a freshly forked thread must call.
1719 * @prev: the thread we just switched away from.
1721 asmlinkage void schedule_tail(struct task_struct *prev)
1722 __releases(rq->lock)
1724 struct rq *rq = this_rq();
1726 finish_task_switch(rq, prev);
1727 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
1728 /* In this case, finish_task_switch does not reenable preemption */
1729 preempt_enable();
1730 #endif
1731 if (current->set_child_tid)
1732 put_user(current->pid, current->set_child_tid);
1736 * context_switch - switch to the new MM and the new
1737 * thread's register state.
1739 static inline void
1740 context_switch(struct rq *rq, struct task_struct *prev,
1741 struct task_struct *next)
1743 struct mm_struct *mm, *oldmm;
1745 prepare_task_switch(rq, next);
1746 mm = next->mm;
1747 oldmm = prev->active_mm;
1749 * For paravirt, this is coupled with an exit in switch_to to
1750 * combine the page table reload and the switch backend into
1751 * one hypercall.
1753 arch_enter_lazy_cpu_mode();
1755 if (unlikely(!mm)) {
1756 next->active_mm = oldmm;
1757 atomic_inc(&oldmm->mm_count);
1758 enter_lazy_tlb(oldmm, next);
1759 } else
1760 switch_mm(oldmm, mm, next);
1762 if (unlikely(!prev->mm)) {
1763 prev->active_mm = NULL;
1764 rq->prev_mm = oldmm;
1767 * Since the runqueue lock will be released by the next
1768 * task (which is an invalid locking op but in the case
1769 * of the scheduler it's an obvious special-case), so we
1770 * do an early lockdep release here:
1772 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
1773 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
1774 #endif
1776 /* Here we just switch the register state and the stack. */
1777 switch_to(prev, next, prev);
1779 barrier();
1781 * this_rq must be evaluated again because prev may have moved
1782 * CPUs since it called schedule(), thus the 'rq' on its stack
1783 * frame will be invalid.
1785 finish_task_switch(this_rq(), prev);
1789 * nr_running, nr_uninterruptible and nr_context_switches:
1791 * externally visible scheduler statistics: current number of runnable
1792 * threads, current number of uninterruptible-sleeping threads, total
1793 * number of context switches performed since bootup.
1795 unsigned long nr_running(void)
1797 unsigned long i, sum = 0;
1799 for_each_online_cpu(i)
1800 sum += cpu_rq(i)->nr_running;
1802 return sum;
1805 unsigned long nr_uninterruptible(void)
1807 unsigned long i, sum = 0;
1809 for_each_possible_cpu(i)
1810 sum += cpu_rq(i)->nr_uninterruptible;
1813 * Since we read the counters lockless, it might be slightly
1814 * inaccurate. Do not allow it to go below zero though:
1816 if (unlikely((long)sum < 0))
1817 sum = 0;
1819 return sum;
1822 unsigned long long nr_context_switches(void)
1824 int i;
1825 unsigned long long sum = 0;
1827 for_each_possible_cpu(i)
1828 sum += cpu_rq(i)->nr_switches;
1830 return sum;
1833 unsigned long nr_iowait(void)
1835 unsigned long i, sum = 0;
1837 for_each_possible_cpu(i)
1838 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1840 return sum;
1843 unsigned long nr_active(void)
1845 unsigned long i, running = 0, uninterruptible = 0;
1847 for_each_online_cpu(i) {
1848 running += cpu_rq(i)->nr_running;
1849 uninterruptible += cpu_rq(i)->nr_uninterruptible;
1852 if (unlikely((long)uninterruptible < 0))
1853 uninterruptible = 0;
1855 return running + uninterruptible;
1859 * Update rq->cpu_load[] statistics. This function is usually called every
1860 * scheduler tick (TICK_NSEC).
1862 static void update_cpu_load(struct rq *this_rq)
1864 u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64;
1865 unsigned long total_load = this_rq->ls.load.weight;
1866 unsigned long this_load = total_load;
1867 struct load_stat *ls = &this_rq->ls;
1868 u64 now = __rq_clock(this_rq);
1869 int i, scale;
1871 this_rq->nr_load_updates++;
1872 if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD)))
1873 goto do_avg;
1875 /* Update delta_fair/delta_exec fields first */
1876 update_curr_load(this_rq, now);
1878 fair_delta64 = ls->delta_fair + 1;
1879 ls->delta_fair = 0;
1881 exec_delta64 = ls->delta_exec + 1;
1882 ls->delta_exec = 0;
1884 sample_interval64 = now - ls->load_update_last;
1885 ls->load_update_last = now;
1887 if ((s64)sample_interval64 < (s64)TICK_NSEC)
1888 sample_interval64 = TICK_NSEC;
1890 if (exec_delta64 > sample_interval64)
1891 exec_delta64 = sample_interval64;
1893 idle_delta64 = sample_interval64 - exec_delta64;
1895 tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64);
1896 tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64);
1898 this_load = (unsigned long)tmp64;
1900 do_avg:
1902 /* Update our load: */
1903 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
1904 unsigned long old_load, new_load;
1906 /* scale is effectively 1 << i now, and >> i divides by scale */
1908 old_load = this_rq->cpu_load[i];
1909 new_load = this_load;
1911 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
1915 #ifdef CONFIG_SMP
1918 * double_rq_lock - safely lock two runqueues
1920 * Note this does not disable interrupts like task_rq_lock,
1921 * you need to do so manually before calling.
1923 static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1924 __acquires(rq1->lock)
1925 __acquires(rq2->lock)
1927 BUG_ON(!irqs_disabled());
1928 if (rq1 == rq2) {
1929 spin_lock(&rq1->lock);
1930 __acquire(rq2->lock); /* Fake it out ;) */
1931 } else {
1932 if (rq1 < rq2) {
1933 spin_lock(&rq1->lock);
1934 spin_lock(&rq2->lock);
1935 } else {
1936 spin_lock(&rq2->lock);
1937 spin_lock(&rq1->lock);
1943 * double_rq_unlock - safely unlock two runqueues
1945 * Note this does not restore interrupts like task_rq_unlock,
1946 * you need to do so manually after calling.
1948 static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1949 __releases(rq1->lock)
1950 __releases(rq2->lock)
1952 spin_unlock(&rq1->lock);
1953 if (rq1 != rq2)
1954 spin_unlock(&rq2->lock);
1955 else
1956 __release(rq2->lock);
1960 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1962 static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
1963 __releases(this_rq->lock)
1964 __acquires(busiest->lock)
1965 __acquires(this_rq->lock)
1967 if (unlikely(!irqs_disabled())) {
1968 /* printk() doesn't work good under rq->lock */
1969 spin_unlock(&this_rq->lock);
1970 BUG_ON(1);
1972 if (unlikely(!spin_trylock(&busiest->lock))) {
1973 if (busiest < this_rq) {
1974 spin_unlock(&this_rq->lock);
1975 spin_lock(&busiest->lock);
1976 spin_lock(&this_rq->lock);
1977 } else
1978 spin_lock(&busiest->lock);
1983 * If dest_cpu is allowed for this process, migrate the task to it.
1984 * This is accomplished by forcing the cpu_allowed mask to only
1985 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
1986 * the cpu_allowed mask is restored.
1988 static void sched_migrate_task(struct task_struct *p, int dest_cpu)
1990 struct migration_req req;
1991 unsigned long flags;
1992 struct rq *rq;
1994 rq = task_rq_lock(p, &flags);
1995 if (!cpu_isset(dest_cpu, p->cpus_allowed)
1996 || unlikely(cpu_is_offline(dest_cpu)))
1997 goto out;
1999 /* force the process onto the specified CPU */
2000 if (migrate_task(p, dest_cpu, &req)) {
2001 /* Need to wait for migration thread (might exit: take ref). */
2002 struct task_struct *mt = rq->migration_thread;
2004 get_task_struct(mt);
2005 task_rq_unlock(rq, &flags);
2006 wake_up_process(mt);
2007 put_task_struct(mt);
2008 wait_for_completion(&req.done);
2010 return;
2012 out:
2013 task_rq_unlock(rq, &flags);
2017 * sched_exec - execve() is a valuable balancing opportunity, because at
2018 * this point the task has the smallest effective memory and cache footprint.
2020 void sched_exec(void)
2022 int new_cpu, this_cpu = get_cpu();
2023 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
2024 put_cpu();
2025 if (new_cpu != this_cpu)
2026 sched_migrate_task(current, new_cpu);
/*
 * pull_task - move a task from a remote runqueue to the local runqueue.
 * Both runqueues must be locked.
 */
static void pull_task(struct rq *src_rq, struct task_struct *p,
		      struct rq *this_rq, int this_cpu)
{
	/* Dequeue from the source, retarget, enqueue locally */
	deactivate_task(src_rq, p, 0);
	set_task_cpu(p, this_cpu);
	activate_task(this_rq, p, 0);

	/*
	 * Note that idle threads have a prio of MAX_PRIO, for this test
	 * to be always true for them.
	 */
	check_preempt_curr(this_rq, p);
}
2047 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
2049 static
2050 int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2051 struct sched_domain *sd, enum cpu_idle_type idle,
2052 int *all_pinned)
2055 * We do not migrate tasks that are:
2056 * 1) running (obviously), or
2057 * 2) cannot be migrated to this CPU due to cpus_allowed, or
2058 * 3) are cache-hot on their current CPU.
2060 if (!cpu_isset(this_cpu, p->cpus_allowed))
2061 return 0;
2062 *all_pinned = 0;
2064 if (task_running(rq, p))
2065 return 0;
2068 * Aggressive migration if too many balance attempts have failed:
2070 if (sd->nr_balance_failed > sd->cache_nice_tries)
2071 return 1;
2073 return 1;
2076 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2077 unsigned long max_nr_move, unsigned long max_load_move,
2078 struct sched_domain *sd, enum cpu_idle_type idle,
2079 int *all_pinned, unsigned long *load_moved,
2080 int this_best_prio, int best_prio, int best_prio_seen,
2081 struct rq_iterator *iterator)
2083 int pulled = 0, pinned = 0, skip_for_load;
2084 struct task_struct *p;
2085 long rem_load_move = max_load_move;
2087 if (max_nr_move == 0 || max_load_move == 0)
2088 goto out;
2090 pinned = 1;
2093 * Start the load-balancing iterator:
2095 p = iterator->start(iterator->arg);
2096 next:
2097 if (!p)
2098 goto out;
2100 * To help distribute high priority tasks accross CPUs we don't
2101 * skip a task if it will be the highest priority task (i.e. smallest
2102 * prio value) on its new queue regardless of its load weight
2104 skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
2105 SCHED_LOAD_SCALE_FUZZ;
2106 if (skip_for_load && p->prio < this_best_prio)
2107 skip_for_load = !best_prio_seen && p->prio == best_prio;
2108 if (skip_for_load ||
2109 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2111 best_prio_seen |= p->prio == best_prio;
2112 p = iterator->next(iterator->arg);
2113 goto next;
2116 pull_task(busiest, p, this_rq, this_cpu);
2117 pulled++;
2118 rem_load_move -= p->se.load.weight;
2121 * We only want to steal up to the prescribed number of tasks
2122 * and the prescribed amount of weighted load.
2124 if (pulled < max_nr_move && rem_load_move > 0) {
2125 if (p->prio < this_best_prio)
2126 this_best_prio = p->prio;
2127 p = iterator->next(iterator->arg);
2128 goto next;
2130 out:
2132 * Right now, this is the only place pull_task() is called,
2133 * so we can safely collect pull_task() stats here rather than
2134 * inside pull_task().
2136 schedstat_add(sd, lb_gained[idle], pulled);
2138 if (all_pinned)
2139 *all_pinned = pinned;
2140 *load_moved = max_load_move - rem_load_move;
2141 return pulled;
2145 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
2146 * load from busiest to this_rq, as part of a balancing operation within
2147 * "domain". Returns the number of tasks moved.
2149 * Called with both runqueues locked.
2151 static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2152 unsigned long max_nr_move, unsigned long max_load_move,
2153 struct sched_domain *sd, enum cpu_idle_type idle,
2154 int *all_pinned)
2156 struct sched_class *class = sched_class_highest;
2157 unsigned long load_moved, total_nr_moved = 0, nr_moved;
2158 long rem_load_move = max_load_move;
2160 do {
2161 nr_moved = class->load_balance(this_rq, this_cpu, busiest,
2162 max_nr_move, (unsigned long)rem_load_move,
2163 sd, idle, all_pinned, &load_moved);
2164 total_nr_moved += nr_moved;
2165 max_nr_move -= nr_moved;
2166 rem_load_move -= load_moved;
2167 class = class->next;
2168 } while (class && max_nr_move && rem_load_move > 0);
2170 return total_nr_moved;
2174 * find_busiest_group finds and returns the busiest CPU group within the
2175 * domain. It calculates and returns the amount of weighted load which
2176 * should be moved to restore balance via the imbalance parameter.
2178 static struct sched_group *
2179 find_busiest_group(struct sched_domain *sd, int this_cpu,
2180 unsigned long *imbalance, enum cpu_idle_type idle,
2181 int *sd_idle, cpumask_t *cpus, int *balance)
2183 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2184 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
2185 unsigned long max_pull;
2186 unsigned long busiest_load_per_task, busiest_nr_running;
2187 unsigned long this_load_per_task, this_nr_running;
2188 int load_idx;
2189 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2190 int power_savings_balance = 1;
2191 unsigned long leader_nr_running = 0, min_load_per_task = 0;
2192 unsigned long min_nr_running = ULONG_MAX;
2193 struct sched_group *group_min = NULL, *group_leader = NULL;
2194 #endif
2196 max_load = this_load = total_load = total_pwr = 0;
2197 busiest_load_per_task = busiest_nr_running = 0;
2198 this_load_per_task = this_nr_running = 0;
2199 if (idle == CPU_NOT_IDLE)
2200 load_idx = sd->busy_idx;
2201 else if (idle == CPU_NEWLY_IDLE)
2202 load_idx = sd->newidle_idx;
2203 else
2204 load_idx = sd->idle_idx;
2206 do {
2207 unsigned long load, group_capacity;
2208 int local_group;
2209 int i;
2210 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2211 unsigned long sum_nr_running, sum_weighted_load;
2213 local_group = cpu_isset(this_cpu, group->cpumask);
2215 if (local_group)
2216 balance_cpu = first_cpu(group->cpumask);
2218 /* Tally up the load of all CPUs in the group */
2219 sum_weighted_load = sum_nr_running = avg_load = 0;
2221 for_each_cpu_mask(i, group->cpumask) {
2222 struct rq *rq;
2224 if (!cpu_isset(i, *cpus))
2225 continue;
2227 rq = cpu_rq(i);
2229 if (*sd_idle && !idle_cpu(i))
2230 *sd_idle = 0;
2232 /* Bias balancing toward cpus of our domain */
2233 if (local_group) {
2234 if (idle_cpu(i) && !first_idle_cpu) {
2235 first_idle_cpu = 1;
2236 balance_cpu = i;
2239 load = target_load(i, load_idx);
2240 } else
2241 load = source_load(i, load_idx);
2243 avg_load += load;
2244 sum_nr_running += rq->nr_running;
2245 sum_weighted_load += weighted_cpuload(i);
2249 * First idle cpu or the first cpu(busiest) in this sched group
2250 * is eligible for doing load balancing at this and above
2251 * domains.
2253 if (local_group && balance_cpu != this_cpu && balance) {
2254 *balance = 0;
2255 goto ret;
2258 total_load += avg_load;
2259 total_pwr += group->__cpu_power;
2261 /* Adjust by relative CPU power of the group */
2262 avg_load = sg_div_cpu_power(group,
2263 avg_load * SCHED_LOAD_SCALE);
2265 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
2267 if (local_group) {
2268 this_load = avg_load;
2269 this = group;
2270 this_nr_running = sum_nr_running;
2271 this_load_per_task = sum_weighted_load;
2272 } else if (avg_load > max_load &&
2273 sum_nr_running > group_capacity) {
2274 max_load = avg_load;
2275 busiest = group;
2276 busiest_nr_running = sum_nr_running;
2277 busiest_load_per_task = sum_weighted_load;
2280 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2282 * Busy processors will not participate in power savings
2283 * balance.
2285 if (idle == CPU_NOT_IDLE ||
2286 !(sd->flags & SD_POWERSAVINGS_BALANCE))
2287 goto group_next;
2290 * If the local group is idle or completely loaded
2291 * no need to do power savings balance at this domain
2293 if (local_group && (this_nr_running >= group_capacity ||
2294 !this_nr_running))
2295 power_savings_balance = 0;
2298 * If a group is already running at full capacity or idle,
2299 * don't include that group in power savings calculations
2301 if (!power_savings_balance || sum_nr_running >= group_capacity
2302 || !sum_nr_running)
2303 goto group_next;
2306 * Calculate the group which has the least non-idle load.
2307 * This is the group from where we need to pick up the load
2308 * for saving power
2310 if ((sum_nr_running < min_nr_running) ||
2311 (sum_nr_running == min_nr_running &&
2312 first_cpu(group->cpumask) <
2313 first_cpu(group_min->cpumask))) {
2314 group_min = group;
2315 min_nr_running = sum_nr_running;
2316 min_load_per_task = sum_weighted_load /
2317 sum_nr_running;
2321 * Calculate the group which is almost near its
2322 * capacity but still has some space to pick up some load
2323 * from other group and save more power
2325 if (sum_nr_running <= group_capacity - 1) {
2326 if (sum_nr_running > leader_nr_running ||
2327 (sum_nr_running == leader_nr_running &&
2328 first_cpu(group->cpumask) >
2329 first_cpu(group_leader->cpumask))) {
2330 group_leader = group;
2331 leader_nr_running = sum_nr_running;
2334 group_next:
2335 #endif
2336 group = group->next;
2337 } while (group != sd->groups);
2339 if (!busiest || this_load >= max_load || busiest_nr_running == 0)
2340 goto out_balanced;
2342 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
2344 if (this_load >= avg_load ||
2345 100*max_load <= sd->imbalance_pct*this_load)
2346 goto out_balanced;
2348 busiest_load_per_task /= busiest_nr_running;
2350 * We're trying to get all the cpus to the average_load, so we don't
2351 * want to push ourselves above the average load, nor do we wish to
2352 * reduce the max loaded cpu below the average load, as either of these
2353 * actions would just result in more rebalancing later, and ping-pong
2354 * tasks around. Thus we look for the minimum possible imbalance.
2355 * Negative imbalances (*we* are more loaded than anyone else) will
2356 * be counted as no imbalance for these purposes -- we can't fix that
2357 * by pulling tasks to us. Be careful of negative numbers as they'll
2358 * appear as very large values with unsigned longs.
2360 if (max_load <= busiest_load_per_task)
2361 goto out_balanced;
2364 * In the presence of smp nice balancing, certain scenarios can have
2365 * max load less than avg load(as we skip the groups at or below
2366 * its cpu_power, while calculating max_load..)
2368 if (max_load < avg_load) {
2369 *imbalance = 0;
2370 goto small_imbalance;
2373 /* Don't want to pull so many tasks that a group would go idle */
2374 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
2376 /* How much load to actually move to equalise the imbalance */
2377 *imbalance = min(max_pull * busiest->__cpu_power,
2378 (avg_load - this_load) * this->__cpu_power)
2379 / SCHED_LOAD_SCALE;
2382 * if *imbalance is less than the average load per runnable task
2383 * there is no guarantee that any tasks will be moved so we'll have
2384 * a think about bumping its value to force at least one task to be
2385 * moved
2387 if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) {
2388 unsigned long tmp, pwr_now, pwr_move;
2389 unsigned int imbn;
2391 small_imbalance:
2392 pwr_move = pwr_now = 0;
2393 imbn = 2;
2394 if (this_nr_running) {
2395 this_load_per_task /= this_nr_running;
2396 if (busiest_load_per_task > this_load_per_task)
2397 imbn = 1;
2398 } else
2399 this_load_per_task = SCHED_LOAD_SCALE;
2401 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
2402 busiest_load_per_task * imbn) {
2403 *imbalance = busiest_load_per_task;
2404 return busiest;
2408 * OK, we don't have enough imbalance to justify moving tasks,
2409 * however we may be able to increase total CPU power used by
2410 * moving them.
2413 pwr_now += busiest->__cpu_power *
2414 min(busiest_load_per_task, max_load);
2415 pwr_now += this->__cpu_power *
2416 min(this_load_per_task, this_load);
2417 pwr_now /= SCHED_LOAD_SCALE;
2419 /* Amount of load we'd subtract */
2420 tmp = sg_div_cpu_power(busiest,
2421 busiest_load_per_task * SCHED_LOAD_SCALE);
2422 if (max_load > tmp)
2423 pwr_move += busiest->__cpu_power *
2424 min(busiest_load_per_task, max_load - tmp);
2426 /* Amount of load we'd add */
2427 if (max_load * busiest->__cpu_power <
2428 busiest_load_per_task * SCHED_LOAD_SCALE)
2429 tmp = sg_div_cpu_power(this,
2430 max_load * busiest->__cpu_power);
2431 else
2432 tmp = sg_div_cpu_power(this,
2433 busiest_load_per_task * SCHED_LOAD_SCALE);
2434 pwr_move += this->__cpu_power *
2435 min(this_load_per_task, this_load + tmp);
2436 pwr_move /= SCHED_LOAD_SCALE;
2438 /* Move if we gain throughput */
2439 if (pwr_move <= pwr_now)
2440 goto out_balanced;
2442 *imbalance = busiest_load_per_task;
2445 return busiest;
2447 out_balanced:
2448 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2449 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2450 goto ret;
2452 if (this == group_leader && group_leader != group_min) {
2453 *imbalance = min_load_per_task;
2454 return group_min;
2456 #endif
2457 ret:
2458 *imbalance = 0;
2459 return NULL;
2463 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2465 static struct rq *
2466 find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2467 unsigned long imbalance, cpumask_t *cpus)
2469 struct rq *busiest = NULL, *rq;
2470 unsigned long max_load = 0;
2471 int i;
2473 for_each_cpu_mask(i, group->cpumask) {
2474 unsigned long wl;
2476 if (!cpu_isset(i, *cpus))
2477 continue;
2479 rq = cpu_rq(i);
2480 wl = weighted_cpuload(i);
2482 if (rq->nr_running == 1 && wl > imbalance)
2483 continue;
2485 if (wl > max_load) {
2486 max_load = wl;
2487 busiest = rq;
2491 return busiest;
/*
 * Upper bound for the balancing-interval backoff when we encounter
 * pinned tasks. The value is fairly arbitrary; it only has to be large
 * enough.
 */
#define MAX_PINNED_INTERVAL	512
/* Return n - 1, saturating at zero instead of wrapping around. */
static inline unsigned long minus_1_or_zero(unsigned long n)
{
	if (n == 0)
		return 0;

	return n - 1;
}
2506 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2507 * tasks if there is an imbalance.
2509 static int load_balance(int this_cpu, struct rq *this_rq,
2510 struct sched_domain *sd, enum cpu_idle_type idle,
2511 int *balance)
2513 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2514 struct sched_group *group;
2515 unsigned long imbalance;
2516 struct rq *busiest;
2517 cpumask_t cpus = CPU_MASK_ALL;
2518 unsigned long flags;
2521 * When power savings policy is enabled for the parent domain, idle
2522 * sibling can pick up load irrespective of busy siblings. In this case,
2523 * let the state of idle sibling percolate up as CPU_IDLE, instead of
2524 * portraying it as CPU_NOT_IDLE.
2526 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2527 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2528 sd_idle = 1;
2530 schedstat_inc(sd, lb_cnt[idle]);
2532 redo:
2533 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2534 &cpus, balance);
2536 if (*balance == 0)
2537 goto out_balanced;
2539 if (!group) {
2540 schedstat_inc(sd, lb_nobusyg[idle]);
2541 goto out_balanced;
2544 busiest = find_busiest_queue(group, idle, imbalance, &cpus);
2545 if (!busiest) {
2546 schedstat_inc(sd, lb_nobusyq[idle]);
2547 goto out_balanced;
2550 BUG_ON(busiest == this_rq);
2552 schedstat_add(sd, lb_imbalance[idle], imbalance);
2554 nr_moved = 0;
2555 if (busiest->nr_running > 1) {
2557 * Attempt to move tasks. If find_busiest_group has found
2558 * an imbalance but busiest->nr_running <= 1, the group is
2559 * still unbalanced. nr_moved simply stays zero, so it is
2560 * correctly treated as an imbalance.
2562 local_irq_save(flags);
2563 double_rq_lock(this_rq, busiest);
2564 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2565 minus_1_or_zero(busiest->nr_running),
2566 imbalance, sd, idle, &all_pinned);
2567 double_rq_unlock(this_rq, busiest);
2568 local_irq_restore(flags);
2571 * some other cpu did the load balance for us.
2573 if (nr_moved && this_cpu != smp_processor_id())
2574 resched_cpu(this_cpu);
2576 /* All tasks on this runqueue were pinned by CPU affinity */
2577 if (unlikely(all_pinned)) {
2578 cpu_clear(cpu_of(busiest), cpus);
2579 if (!cpus_empty(cpus))
2580 goto redo;
2581 goto out_balanced;
2585 if (!nr_moved) {
2586 schedstat_inc(sd, lb_failed[idle]);
2587 sd->nr_balance_failed++;
2589 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
2591 spin_lock_irqsave(&busiest->lock, flags);
2593 /* don't kick the migration_thread, if the curr
2594 * task on busiest cpu can't be moved to this_cpu
2596 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
2597 spin_unlock_irqrestore(&busiest->lock, flags);
2598 all_pinned = 1;
2599 goto out_one_pinned;
2602 if (!busiest->active_balance) {
2603 busiest->active_balance = 1;
2604 busiest->push_cpu = this_cpu;
2605 active_balance = 1;
2607 spin_unlock_irqrestore(&busiest->lock, flags);
2608 if (active_balance)
2609 wake_up_process(busiest->migration_thread);
2612 * We've kicked active balancing, reset the failure
2613 * counter.
2615 sd->nr_balance_failed = sd->cache_nice_tries+1;
2617 } else
2618 sd->nr_balance_failed = 0;
2620 if (likely(!active_balance)) {
2621 /* We were unbalanced, so reset the balancing interval */
2622 sd->balance_interval = sd->min_interval;
2623 } else {
2625 * If we've begun active balancing, start to back off. This
2626 * case may not be covered by the all_pinned logic if there
2627 * is only 1 task on the busy runqueue (because we don't call
2628 * move_tasks).
2630 if (sd->balance_interval < sd->max_interval)
2631 sd->balance_interval *= 2;
2634 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2635 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2636 return -1;
2637 return nr_moved;
2639 out_balanced:
2640 schedstat_inc(sd, lb_balanced[idle]);
2642 sd->nr_balance_failed = 0;
2644 out_one_pinned:
2645 /* tune up the balancing interval */
2646 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
2647 (sd->balance_interval < sd->max_interval))
2648 sd->balance_interval *= 2;
2650 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2651 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2652 return -1;
2653 return 0;
2657 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2658 * tasks if there is an imbalance.
2660 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
2661 * this_rq is locked.
2663 static int
2664 load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2666 struct sched_group *group;
2667 struct rq *busiest = NULL;
2668 unsigned long imbalance;
2669 int nr_moved = 0;
2670 int sd_idle = 0;
2671 cpumask_t cpus = CPU_MASK_ALL;
2674 * When power savings policy is enabled for the parent domain, idle
2675 * sibling can pick up load irrespective of busy siblings. In this case,
2676 * let the state of idle sibling percolate up as IDLE, instead of
2677 * portraying it as CPU_NOT_IDLE.
2679 if (sd->flags & SD_SHARE_CPUPOWER &&
2680 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2681 sd_idle = 1;
2683 schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]);
2684 redo:
2685 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
2686 &sd_idle, &cpus, NULL);
2687 if (!group) {
2688 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
2689 goto out_balanced;
2692 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance,
2693 &cpus);
2694 if (!busiest) {
2695 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
2696 goto out_balanced;
2699 BUG_ON(busiest == this_rq);
2701 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
2703 nr_moved = 0;
2704 if (busiest->nr_running > 1) {
2705 /* Attempt to move tasks */
2706 double_lock_balance(this_rq, busiest);
2707 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2708 minus_1_or_zero(busiest->nr_running),
2709 imbalance, sd, CPU_NEWLY_IDLE, NULL);
2710 spin_unlock(&busiest->lock);
2712 if (!nr_moved) {
2713 cpu_clear(cpu_of(busiest), cpus);
2714 if (!cpus_empty(cpus))
2715 goto redo;
2719 if (!nr_moved) {
2720 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
2721 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2722 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2723 return -1;
2724 } else
2725 sd->nr_balance_failed = 0;
2727 return nr_moved;
2729 out_balanced:
2730 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
2731 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2732 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2733 return -1;
2734 sd->nr_balance_failed = 0;
2736 return 0;
2740 * idle_balance is called by schedule() if this_cpu is about to become
2741 * idle. Attempts to pull tasks from other CPUs.
2743 static void idle_balance(int this_cpu, struct rq *this_rq)
2745 struct sched_domain *sd;
2746 int pulled_task = -1;
2747 unsigned long next_balance = jiffies + HZ;
2749 for_each_domain(this_cpu, sd) {
2750 unsigned long interval;
2752 if (!(sd->flags & SD_LOAD_BALANCE))
2753 continue;
2755 if (sd->flags & SD_BALANCE_NEWIDLE)
2756 /* If we've pulled tasks over stop searching: */
2757 pulled_task = load_balance_newidle(this_cpu,
2758 this_rq, sd);
2760 interval = msecs_to_jiffies(sd->balance_interval);
2761 if (time_after(next_balance, sd->last_balance + interval))
2762 next_balance = sd->last_balance + interval;
2763 if (pulled_task)
2764 break;
2766 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
2768 * We are going idle. next_balance may be set based on
2769 * a busy processor. So reset next_balance.
2771 this_rq->next_balance = next_balance;
2776 * active_load_balance is run by migration threads. It pushes running tasks
2777 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
2778 * running on each physical CPU where possible, and avoids physical /
2779 * logical imbalances.
2781 * Called with busiest_rq locked.
2783 static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
2785 int target_cpu = busiest_rq->push_cpu;
2786 struct sched_domain *sd;
2787 struct rq *target_rq;
2789 /* Is there any task to move? */
2790 if (busiest_rq->nr_running <= 1)
2791 return;
2793 target_rq = cpu_rq(target_cpu);
2796 * This condition is "impossible", if it occurs
2797 * we need to fix it. Originally reported by
2798 * Bjorn Helgaas on a 128-cpu setup.
2800 BUG_ON(busiest_rq == target_rq);
2802 /* move a task from busiest_rq to target_rq */
2803 double_lock_balance(busiest_rq, target_rq);
2805 /* Search for an sd spanning us and the target CPU. */
2806 for_each_domain(target_cpu, sd) {
2807 if ((sd->flags & SD_LOAD_BALANCE) &&
2808 cpu_isset(busiest_cpu, sd->span))
2809 break;
2812 if (likely(sd)) {
2813 schedstat_inc(sd, alb_cnt);
2815 if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
2816 RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE,
2817 NULL))
2818 schedstat_inc(sd, alb_pushed);
2819 else
2820 schedstat_inc(sd, alb_failed);
2822 spin_unlock(&target_rq->lock);
2825 #ifdef CONFIG_NO_HZ
2826 static struct {
2827 atomic_t load_balancer;
2828 cpumask_t cpu_mask;
2829 } nohz ____cacheline_aligned = {
2830 .load_balancer = ATOMIC_INIT(-1),
2831 .cpu_mask = CPU_MASK_NONE,
2835 * This routine will try to nominate the ilb (idle load balancing)
2836 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
2837 * load balancing on behalf of all those cpus. If all the cpus in the system
2838 * go into this tickless mode, then there will be no ilb owner (as there is
2839 * no need for one) and all the cpus will sleep till the next wakeup event
2840 * arrives...
2842 * For the ilb owner, tick is not stopped. And this tick will be used
2843 * for idle load balancing. ilb owner will still be part of
2844 * nohz.cpu_mask..
2846 * While stopping the tick, this cpu will become the ilb owner if there
2847 * is no other owner. And will be the owner till that cpu becomes busy
2848 * or if all cpus in the system stop their ticks at which point
2849 * there is no need for ilb owner.
2851 * When the ilb owner becomes busy, it nominates another owner, during the
2852 * next busy scheduler_tick()
2854 int select_nohz_load_balancer(int stop_tick)
2856 int cpu = smp_processor_id();
2858 if (stop_tick) {
2859 cpu_set(cpu, nohz.cpu_mask);
2860 cpu_rq(cpu)->in_nohz_recently = 1;
2863 * If we are going offline and still the leader, give up!
2865 if (cpu_is_offline(cpu) &&
2866 atomic_read(&nohz.load_balancer) == cpu) {
2867 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2868 BUG();
2869 return 0;
2872 /* time for ilb owner also to sleep */
2873 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
2874 if (atomic_read(&nohz.load_balancer) == cpu)
2875 atomic_set(&nohz.load_balancer, -1);
2876 return 0;
2879 if (atomic_read(&nohz.load_balancer) == -1) {
2880 /* make me the ilb owner */
2881 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
2882 return 1;
2883 } else if (atomic_read(&nohz.load_balancer) == cpu)
2884 return 1;
2885 } else {
2886 if (!cpu_isset(cpu, nohz.cpu_mask))
2887 return 0;
2889 cpu_clear(cpu, nohz.cpu_mask);
2891 if (atomic_read(&nohz.load_balancer) == cpu)
2892 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2893 BUG();
2895 return 0;
2897 #endif
2899 static DEFINE_SPINLOCK(balancing);
2902 * It checks each scheduling domain to see if it is due to be balanced,
2903 * and initiates a balancing operation if so.
2905 * Balancing parameters are set up in arch_init_sched_domains.
2907 static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
2909 int balance = 1;
2910 struct rq *rq = cpu_rq(cpu);
2911 unsigned long interval;
2912 struct sched_domain *sd;
2913 /* Earliest time when we have to do rebalance again */
2914 unsigned long next_balance = jiffies + 60*HZ;
2916 for_each_domain(cpu, sd) {
2917 if (!(sd->flags & SD_LOAD_BALANCE))
2918 continue;
2920 interval = sd->balance_interval;
2921 if (idle != CPU_IDLE)
2922 interval *= sd->busy_factor;
2924 /* scale ms to jiffies */
2925 interval = msecs_to_jiffies(interval);
2926 if (unlikely(!interval))
2927 interval = 1;
2928 if (interval > HZ*NR_CPUS/10)
2929 interval = HZ*NR_CPUS/10;
2932 if (sd->flags & SD_SERIALIZE) {
2933 if (!spin_trylock(&balancing))
2934 goto out;
2937 if (time_after_eq(jiffies, sd->last_balance + interval)) {
2938 if (load_balance(cpu, rq, sd, idle, &balance)) {
2940 * We've pulled tasks over so either we're no
2941 * longer idle, or one of our SMT siblings is
2942 * not idle.
2944 idle = CPU_NOT_IDLE;
2946 sd->last_balance = jiffies;
2948 if (sd->flags & SD_SERIALIZE)
2949 spin_unlock(&balancing);
2950 out:
2951 if (time_after(next_balance, sd->last_balance + interval))
2952 next_balance = sd->last_balance + interval;
2955 * Stop the load balance at this level. There is another
2956 * CPU in our sched group which is doing load balancing more
2957 * actively.
2959 if (!balance)
2960 break;
2962 rq->next_balance = next_balance;
2966 * run_rebalance_domains is triggered when needed from the scheduler tick.
2967 * In CONFIG_NO_HZ case, the idle load balance owner will do the
2968 * rebalancing for all the cpus for whom scheduler ticks are stopped.
2970 static void run_rebalance_domains(struct softirq_action *h)
2972 int this_cpu = smp_processor_id();
2973 struct rq *this_rq = cpu_rq(this_cpu);
2974 enum cpu_idle_type idle = this_rq->idle_at_tick ?
2975 CPU_IDLE : CPU_NOT_IDLE;
2977 rebalance_domains(this_cpu, idle);
2979 #ifdef CONFIG_NO_HZ
2981 * If this cpu is the owner for idle load balancing, then do the
2982 * balancing on behalf of the other idle cpus whose ticks are
2983 * stopped.
2985 if (this_rq->idle_at_tick &&
2986 atomic_read(&nohz.load_balancer) == this_cpu) {
2987 cpumask_t cpus = nohz.cpu_mask;
2988 struct rq *rq;
2989 int balance_cpu;
2991 cpu_clear(this_cpu, cpus);
2992 for_each_cpu_mask(balance_cpu, cpus) {
2994 * If this cpu gets work to do, stop the load balancing
2995 * work being done for other cpus. Next load
2996 * balancing owner will pick it up.
2998 if (need_resched())
2999 break;
3001 rebalance_domains(balance_cpu, SCHED_IDLE);
3003 rq = cpu_rq(balance_cpu);
3004 if (time_after(this_rq->next_balance, rq->next_balance))
3005 this_rq->next_balance = rq->next_balance;
3008 #endif
3012 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3014 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3015 * idle load balancing owner or decide to stop the periodic load balancing,
3016 * if the whole system is idle.
3018 static inline void trigger_load_balance(struct rq *rq, int cpu)
3020 #ifdef CONFIG_NO_HZ
3022 * If we were in the nohz mode recently and busy at the current
3023 * scheduler tick, then check if we need to nominate new idle
3024 * load balancer.
3026 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3027 rq->in_nohz_recently = 0;
3029 if (atomic_read(&nohz.load_balancer) == cpu) {
3030 cpu_clear(cpu, nohz.cpu_mask);
3031 atomic_set(&nohz.load_balancer, -1);
3034 if (atomic_read(&nohz.load_balancer) == -1) {
3036 * simple selection for now: Nominate the
3037 * first cpu in the nohz list to be the next
3038 * ilb owner.
3040 * TBD: Traverse the sched domains and nominate
3041 * the nearest cpu in the nohz.cpu_mask.
3043 int ilb = first_cpu(nohz.cpu_mask);
3045 if (ilb != NR_CPUS)
3046 resched_cpu(ilb);
3051 * If this cpu is idle and doing idle load balancing for all the
3052 * cpus with ticks stopped, is it time for that to stop?
3054 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3055 cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
3056 resched_cpu(cpu);
3057 return;
3061 * If this cpu is idle and the idle load balancing is done by
3062 * someone else, then no need raise the SCHED_SOFTIRQ
3064 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3065 cpu_isset(cpu, nohz.cpu_mask))
3066 return;
3067 #endif
3068 if (time_after_eq(jiffies, rq->next_balance))
3069 raise_softirq(SCHED_SOFTIRQ);
3072 #else /* CONFIG_SMP */
/*
 * On UP there are no other CPUs to balance against, so this is
 * deliberately a no-op.
 */
static inline void idle_balance(int cpu, struct rq *rq)
{
}
3081 /* Avoid "used but not defined" warning on UP */
3082 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3083 unsigned long max_nr_move, unsigned long max_load_move,
3084 struct sched_domain *sd, enum cpu_idle_type idle,
3085 int *all_pinned, unsigned long *load_moved,
3086 int this_best_prio, int best_prio, int best_prio_seen,
3087 struct rq_iterator *iterator)
3089 *load_moved = 0;
3091 return 0;
3094 #endif
3096 DEFINE_PER_CPU(struct kernel_stat, kstat);
3098 EXPORT_PER_CPU_SYMBOL(kstat);
3101 * Return p->sum_exec_runtime plus any more ns on the sched_clock
3102 * that have not yet been banked in case the task is currently running.
3104 unsigned long long task_sched_runtime(struct task_struct *p)
3106 unsigned long flags;
3107 u64 ns, delta_exec;
3108 struct rq *rq;
3110 rq = task_rq_lock(p, &flags);
3111 ns = p->se.sum_exec_runtime;
3112 if (rq->curr == p) {
3113 delta_exec = rq_clock(rq) - p->se.exec_start;
3114 if ((s64)delta_exec > 0)
3115 ns += delta_exec;
3117 task_rq_unlock(rq, &flags);
3119 return ns;
3123 * Account user cpu time to a process.
3124 * @p: the process that the cpu time gets accounted to
3125 * @hardirq_offset: the offset to subtract from hardirq_count()
3126 * @cputime: the cpu time spent in user space since the last update
3128 void account_user_time(struct task_struct *p, cputime_t cputime)
3130 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3131 cputime64_t tmp;
3133 p->utime = cputime_add(p->utime, cputime);
3135 /* Add user time to cpustat. */
3136 tmp = cputime_to_cputime64(cputime);
3137 if (TASK_NICE(p) > 0)
3138 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3139 else
3140 cpustat->user = cputime64_add(cpustat->user, tmp);
3144 * Account system cpu time to a process.
3145 * @p: the process that the cpu time gets accounted to
3146 * @hardirq_offset: the offset to subtract from hardirq_count()
3147 * @cputime: the cpu time spent in kernel space since the last update
3149 void account_system_time(struct task_struct *p, int hardirq_offset,
3150 cputime_t cputime)
3152 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3153 struct rq *rq = this_rq();
3154 cputime64_t tmp;
3156 p->stime = cputime_add(p->stime, cputime);
3158 /* Add system time to cpustat. */
3159 tmp = cputime_to_cputime64(cputime);
3160 if (hardirq_count() - hardirq_offset)
3161 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3162 else if (softirq_count())
3163 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3164 else if (p != rq->idle)
3165 cpustat->system = cputime64_add(cpustat->system, tmp);
3166 else if (atomic_read(&rq->nr_iowait) > 0)
3167 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3168 else
3169 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3170 /* Account for system time used */
3171 acct_update_integrals(p);
3175 * Account for involuntary wait time.
3176 * @p: the process from which the cpu time has been stolen
3177 * @steal: the cpu time spent in involuntary wait
3179 void account_steal_time(struct task_struct *p, cputime_t steal)
3181 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3182 cputime64_t tmp = cputime_to_cputime64(steal);
3183 struct rq *rq = this_rq();
3185 if (p == rq->idle) {
3186 p->stime = cputime_add(p->stime, steal);
3187 if (atomic_read(&rq->nr_iowait) > 0)
3188 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3189 else
3190 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3191 } else
3192 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3196 * This function gets called by the timer code, with HZ frequency.
3197 * We call it with interrupts disabled.
3199 * It also gets called by the fork code, when changing the parent's
3200 * timeslices.
3202 void scheduler_tick(void)
3204 int cpu = smp_processor_id();
3205 struct rq *rq = cpu_rq(cpu);
3206 struct task_struct *curr = rq->curr;
3208 spin_lock(&rq->lock);
3209 if (curr != rq->idle) /* FIXME: needed? */
3210 curr->sched_class->task_tick(rq, curr);
3211 update_cpu_load(rq);
3212 spin_unlock(&rq->lock);
3214 #ifdef CONFIG_SMP
3215 rq->idle_at_tick = idle_cpu(cpu);
3216 trigger_load_balance(rq, cpu);
3217 #endif
3220 #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
3222 void fastcall add_preempt_count(int val)
3225 * Underflow?
3227 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3228 return;
3229 preempt_count() += val;
3231 * Spinlock count overflowing soon?
3233 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3234 PREEMPT_MASK - 10);
3236 EXPORT_SYMBOL(add_preempt_count);
3238 void fastcall sub_preempt_count(int val)
3241 * Underflow?
3243 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3244 return;
3246 * Is the spinlock portion underflowing?
3248 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3249 !(preempt_count() & PREEMPT_MASK)))
3250 return;
3252 preempt_count() -= val;
3254 EXPORT_SYMBOL(sub_preempt_count);
3256 #endif
3259 * Print scheduling while atomic bug:
3261 static noinline void __schedule_bug(struct task_struct *prev)
3263 printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n",
3264 prev->comm, preempt_count(), prev->pid);
3265 debug_show_held_locks(prev);
3266 if (irqs_disabled())
3267 print_irqtrace_events(prev);
3268 dump_stack();
3272 * Various schedule()-time debugging checks and statistics:
3274 static inline void schedule_debug(struct task_struct *prev)
3277 * Test if we are atomic. Since do_exit() needs to call into
3278 * schedule() atomically, we ignore that path for now.
3279 * Otherwise, whine if we are scheduling when we should not be.
3281 if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state))
3282 __schedule_bug(prev);
3284 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3286 schedstat_inc(this_rq(), sched_cnt);
3290 * Pick up the highest-prio task:
3292 static inline struct task_struct *
3293 pick_next_task(struct rq *rq, struct task_struct *prev, u64 now)
3295 struct sched_class *class;
3296 struct task_struct *p;
3299 * Optimization: we know that if all tasks are in
3300 * the fair class we can call that function directly:
3302 if (likely(rq->nr_running == rq->cfs.nr_running)) {
3303 p = fair_sched_class.pick_next_task(rq, now);
3304 if (likely(p))
3305 return p;
3308 class = sched_class_highest;
3309 for ( ; ; ) {
3310 p = class->pick_next_task(rq, now);
3311 if (p)
3312 return p;
3314 * Will never be NULL as the idle class always
3315 * returns a non-NULL p:
3317 class = class->next;
3322 * schedule() is the main scheduler function.
3324 asmlinkage void __sched schedule(void)
3326 struct task_struct *prev, *next;
3327 long *switch_count;
3328 struct rq *rq;
3329 u64 now;
3330 int cpu;
3332 need_resched:
3333 preempt_disable();
3334 cpu = smp_processor_id();
3335 rq = cpu_rq(cpu);
3336 rcu_qsctr_inc(cpu);
3337 prev = rq->curr;
3338 switch_count = &prev->nivcsw;
3340 release_kernel_lock(prev);
3341 need_resched_nonpreemptible:
3343 schedule_debug(prev);
3345 spin_lock_irq(&rq->lock);
3346 clear_tsk_need_resched(prev);
3348 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3349 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
3350 unlikely(signal_pending(prev)))) {
3351 prev->state = TASK_RUNNING;
3352 } else {
3353 deactivate_task(rq, prev, 1);
3355 switch_count = &prev->nvcsw;
3358 if (unlikely(!rq->nr_running))
3359 idle_balance(cpu, rq);
3361 now = __rq_clock(rq);
3362 prev->sched_class->put_prev_task(rq, prev, now);
3363 next = pick_next_task(rq, prev, now);
3365 sched_info_switch(prev, next);
3367 if (likely(prev != next)) {
3368 rq->nr_switches++;
3369 rq->curr = next;
3370 ++*switch_count;
3372 context_switch(rq, prev, next); /* unlocks the rq */
3373 } else
3374 spin_unlock_irq(&rq->lock);
3376 if (unlikely(reacquire_kernel_lock(current) < 0)) {
3377 cpu = smp_processor_id();
3378 rq = cpu_rq(cpu);
3379 goto need_resched_nonpreemptible;
3381 preempt_enable_no_resched();
3382 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3383 goto need_resched;
3385 EXPORT_SYMBOL(schedule);
3387 #ifdef CONFIG_PREEMPT
3389 * this is the entry point to schedule() from in-kernel preemption
3390 * off of preempt_enable. Kernel preemptions off return from interrupt
3391 * occur there and call schedule directly.
3393 asmlinkage void __sched preempt_schedule(void)
3395 struct thread_info *ti = current_thread_info();
3396 #ifdef CONFIG_PREEMPT_BKL
3397 struct task_struct *task = current;
3398 int saved_lock_depth;
3399 #endif
3401 * If there is a non-zero preempt_count or interrupts are disabled,
3402 * we do not want to preempt the current task. Just return..
3404 if (likely(ti->preempt_count || irqs_disabled()))
3405 return;
3407 need_resched:
3408 add_preempt_count(PREEMPT_ACTIVE);
3410 * We keep the big kernel semaphore locked, but we
3411 * clear ->lock_depth so that schedule() doesnt
3412 * auto-release the semaphore:
3414 #ifdef CONFIG_PREEMPT_BKL
3415 saved_lock_depth = task->lock_depth;
3416 task->lock_depth = -1;
3417 #endif
3418 schedule();
3419 #ifdef CONFIG_PREEMPT_BKL
3420 task->lock_depth = saved_lock_depth;
3421 #endif
3422 sub_preempt_count(PREEMPT_ACTIVE);
3424 /* we could miss a preemption opportunity between schedule and now */
3425 barrier();
3426 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3427 goto need_resched;
3429 EXPORT_SYMBOL(preempt_schedule);
3432 * this is the entry point to schedule() from kernel preemption
3433 * off of irq context.
3434 * Note, that this is called and return with irqs disabled. This will
3435 * protect us against recursive calling from irq.
3437 asmlinkage void __sched preempt_schedule_irq(void)
3439 struct thread_info *ti = current_thread_info();
3440 #ifdef CONFIG_PREEMPT_BKL
3441 struct task_struct *task = current;
3442 int saved_lock_depth;
3443 #endif
3444 /* Catch callers which need to be fixed */
3445 BUG_ON(ti->preempt_count || !irqs_disabled());
3447 need_resched:
3448 add_preempt_count(PREEMPT_ACTIVE);
3450 * We keep the big kernel semaphore locked, but we
3451 * clear ->lock_depth so that schedule() doesnt
3452 * auto-release the semaphore:
3454 #ifdef CONFIG_PREEMPT_BKL
3455 saved_lock_depth = task->lock_depth;
3456 task->lock_depth = -1;
3457 #endif
3458 local_irq_enable();
3459 schedule();
3460 local_irq_disable();
3461 #ifdef CONFIG_PREEMPT_BKL
3462 task->lock_depth = saved_lock_depth;
3463 #endif
3464 sub_preempt_count(PREEMPT_ACTIVE);
3466 /* we could miss a preemption opportunity between schedule and now */
3467 barrier();
3468 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3469 goto need_resched;
3472 #endif /* CONFIG_PREEMPT */
3474 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
3475 void *key)
3477 return try_to_wake_up(curr->private, mode, sync);
3479 EXPORT_SYMBOL(default_wake_function);
3482 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
3483 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
3484 * number) then we wake all the non-exclusive tasks and one exclusive task.
3486 * There are circumstances in which we can try to wake a task which has already
3487 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
3488 * zero in this (rare) case, and we handle it by continuing to scan the queue.
3490 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3491 int nr_exclusive, int sync, void *key)
3493 struct list_head *tmp, *next;
3495 list_for_each_safe(tmp, next, &q->task_list) {
3496 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
3497 unsigned flags = curr->flags;
3499 if (curr->func(curr, mode, sync, key) &&
3500 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
3501 break;
3506 * __wake_up - wake up threads blocked on a waitqueue.
3507 * @q: the waitqueue
3508 * @mode: which threads
3509 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3510 * @key: is directly passed to the wakeup function
3512 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
3513 int nr_exclusive, void *key)
3515 unsigned long flags;
3517 spin_lock_irqsave(&q->lock, flags);
3518 __wake_up_common(q, mode, nr_exclusive, 0, key);
3519 spin_unlock_irqrestore(&q->lock, flags);
3521 EXPORT_SYMBOL(__wake_up);
3524 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3526 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3528 __wake_up_common(q, mode, 1, 0, NULL);
3532 * __wake_up_sync - wake up threads blocked on a waitqueue.
3533 * @q: the waitqueue
3534 * @mode: which threads
3535 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3537 * The sync wakeup differs that the waker knows that it will schedule
3538 * away soon, so while the target thread will be woken up, it will not
3539 * be migrated to another CPU - ie. the two threads are 'synchronized'
3540 * with each other. This can prevent needless bouncing between CPUs.
3542 * On UP it can prevent extra preemption.
3544 void fastcall
3545 __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3547 unsigned long flags;
3548 int sync = 1;
3550 if (unlikely(!q))
3551 return;
3553 if (unlikely(!nr_exclusive))
3554 sync = 0;
3556 spin_lock_irqsave(&q->lock, flags);
3557 __wake_up_common(q, mode, nr_exclusive, sync, NULL);
3558 spin_unlock_irqrestore(&q->lock, flags);
3560 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
3562 void fastcall complete(struct completion *x)
3564 unsigned long flags;
3566 spin_lock_irqsave(&x->wait.lock, flags);
3567 x->done++;
3568 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3569 1, 0, NULL);
3570 spin_unlock_irqrestore(&x->wait.lock, flags);
3572 EXPORT_SYMBOL(complete);
3574 void fastcall complete_all(struct completion *x)
3576 unsigned long flags;
3578 spin_lock_irqsave(&x->wait.lock, flags);
3579 x->done += UINT_MAX/2;
3580 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3581 0, 0, NULL);
3582 spin_unlock_irqrestore(&x->wait.lock, flags);
3584 EXPORT_SYMBOL(complete_all);
3586 void fastcall __sched wait_for_completion(struct completion *x)
3588 might_sleep();
3590 spin_lock_irq(&x->wait.lock);
3591 if (!x->done) {
3592 DECLARE_WAITQUEUE(wait, current);
3594 wait.flags |= WQ_FLAG_EXCLUSIVE;
3595 __add_wait_queue_tail(&x->wait, &wait);
3596 do {
3597 __set_current_state(TASK_UNINTERRUPTIBLE);
3598 spin_unlock_irq(&x->wait.lock);
3599 schedule();
3600 spin_lock_irq(&x->wait.lock);
3601 } while (!x->done);
3602 __remove_wait_queue(&x->wait, &wait);
3604 x->done--;
3605 spin_unlock_irq(&x->wait.lock);
3607 EXPORT_SYMBOL(wait_for_completion);
3609 unsigned long fastcall __sched
3610 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3612 might_sleep();
3614 spin_lock_irq(&x->wait.lock);
3615 if (!x->done) {
3616 DECLARE_WAITQUEUE(wait, current);
3618 wait.flags |= WQ_FLAG_EXCLUSIVE;
3619 __add_wait_queue_tail(&x->wait, &wait);
3620 do {
3621 __set_current_state(TASK_UNINTERRUPTIBLE);
3622 spin_unlock_irq(&x->wait.lock);
3623 timeout = schedule_timeout(timeout);
3624 spin_lock_irq(&x->wait.lock);
3625 if (!timeout) {
3626 __remove_wait_queue(&x->wait, &wait);
3627 goto out;
3629 } while (!x->done);
3630 __remove_wait_queue(&x->wait, &wait);
3632 x->done--;
3633 out:
3634 spin_unlock_irq(&x->wait.lock);
3635 return timeout;
3637 EXPORT_SYMBOL(wait_for_completion_timeout);
3639 int fastcall __sched wait_for_completion_interruptible(struct completion *x)
3641 int ret = 0;
3643 might_sleep();
3645 spin_lock_irq(&x->wait.lock);
3646 if (!x->done) {
3647 DECLARE_WAITQUEUE(wait, current);
3649 wait.flags |= WQ_FLAG_EXCLUSIVE;
3650 __add_wait_queue_tail(&x->wait, &wait);
3651 do {
3652 if (signal_pending(current)) {
3653 ret = -ERESTARTSYS;
3654 __remove_wait_queue(&x->wait, &wait);
3655 goto out;
3657 __set_current_state(TASK_INTERRUPTIBLE);
3658 spin_unlock_irq(&x->wait.lock);
3659 schedule();
3660 spin_lock_irq(&x->wait.lock);
3661 } while (!x->done);
3662 __remove_wait_queue(&x->wait, &wait);
3664 x->done--;
3665 out:
3666 spin_unlock_irq(&x->wait.lock);
3668 return ret;
3670 EXPORT_SYMBOL(wait_for_completion_interruptible);
3672 unsigned long fastcall __sched
3673 wait_for_completion_interruptible_timeout(struct completion *x,
3674 unsigned long timeout)
3676 might_sleep();
3678 spin_lock_irq(&x->wait.lock);
3679 if (!x->done) {
3680 DECLARE_WAITQUEUE(wait, current);
3682 wait.flags |= WQ_FLAG_EXCLUSIVE;
3683 __add_wait_queue_tail(&x->wait, &wait);
3684 do {
3685 if (signal_pending(current)) {
3686 timeout = -ERESTARTSYS;
3687 __remove_wait_queue(&x->wait, &wait);
3688 goto out;
3690 __set_current_state(TASK_INTERRUPTIBLE);
3691 spin_unlock_irq(&x->wait.lock);
3692 timeout = schedule_timeout(timeout);
3693 spin_lock_irq(&x->wait.lock);
3694 if (!timeout) {
3695 __remove_wait_queue(&x->wait, &wait);
3696 goto out;
3698 } while (!x->done);
3699 __remove_wait_queue(&x->wait, &wait);
3701 x->done--;
3702 out:
3703 spin_unlock_irq(&x->wait.lock);
3704 return timeout;
3706 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3708 static inline void
3709 sleep_on_head(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
3711 spin_lock_irqsave(&q->lock, *flags);
3712 __add_wait_queue(q, wait);
3713 spin_unlock(&q->lock);
3716 static inline void
3717 sleep_on_tail(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
3719 spin_lock_irq(&q->lock);
3720 __remove_wait_queue(q, wait);
3721 spin_unlock_irqrestore(&q->lock, *flags);
3724 void __sched interruptible_sleep_on(wait_queue_head_t *q)
3726 unsigned long flags;
3727 wait_queue_t wait;
3729 init_waitqueue_entry(&wait, current);
3731 current->state = TASK_INTERRUPTIBLE;
3733 sleep_on_head(q, &wait, &flags);
3734 schedule();
3735 sleep_on_tail(q, &wait, &flags);
3737 EXPORT_SYMBOL(interruptible_sleep_on);
3739 long __sched
3740 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3742 unsigned long flags;
3743 wait_queue_t wait;
3745 init_waitqueue_entry(&wait, current);
3747 current->state = TASK_INTERRUPTIBLE;
3749 sleep_on_head(q, &wait, &flags);
3750 timeout = schedule_timeout(timeout);
3751 sleep_on_tail(q, &wait, &flags);
3753 return timeout;
3755 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3757 void __sched sleep_on(wait_queue_head_t *q)
3759 unsigned long flags;
3760 wait_queue_t wait;
3762 init_waitqueue_entry(&wait, current);
3764 current->state = TASK_UNINTERRUPTIBLE;
3766 sleep_on_head(q, &wait, &flags);
3767 schedule();
3768 sleep_on_tail(q, &wait, &flags);
3770 EXPORT_SYMBOL(sleep_on);
3772 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3774 unsigned long flags;
3775 wait_queue_t wait;
3777 init_waitqueue_entry(&wait, current);
3779 current->state = TASK_UNINTERRUPTIBLE;
3781 sleep_on_head(q, &wait, &flags);
3782 timeout = schedule_timeout(timeout);
3783 sleep_on_tail(q, &wait, &flags);
3785 return timeout;
3787 EXPORT_SYMBOL(sleep_on_timeout);
3789 #ifdef CONFIG_RT_MUTEXES
3792 * rt_mutex_setprio - set the current priority of a task
3793 * @p: task
3794 * @prio: prio value (kernel-internal form)
3796 * This function changes the 'effective' priority of a task. It does
3797 * not touch ->normal_prio like __setscheduler().
3799 * Used by the rt_mutex code to implement priority inheritance logic.
3801 void rt_mutex_setprio(struct task_struct *p, int prio)
3803 unsigned long flags;
3804 int oldprio, on_rq;
3805 struct rq *rq;
3806 u64 now;
3808 BUG_ON(prio < 0 || prio > MAX_PRIO);
3810 rq = task_rq_lock(p, &flags);
3811 now = rq_clock(rq);
3813 oldprio = p->prio;
3814 on_rq = p->se.on_rq;
3815 if (on_rq)
3816 dequeue_task(rq, p, 0, now);
3818 if (rt_prio(prio))
3819 p->sched_class = &rt_sched_class;
3820 else
3821 p->sched_class = &fair_sched_class;
3823 p->prio = prio;
3825 if (on_rq) {
3826 enqueue_task(rq, p, 0, now);
3828 * Reschedule if we are currently running on this runqueue and
3829 * our priority decreased, or if we are not currently running on
3830 * this runqueue and our priority is higher than the current's
3832 if (task_running(rq, p)) {
3833 if (p->prio > oldprio)
3834 resched_task(rq->curr);
3835 } else {
3836 check_preempt_curr(rq, p);
3839 task_rq_unlock(rq, &flags);
3842 #endif
3844 void set_user_nice(struct task_struct *p, long nice)
3846 int old_prio, delta, on_rq;
3847 unsigned long flags;
3848 struct rq *rq;
3849 u64 now;
3851 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3852 return;
3854 * We have to be careful, if called from sys_setpriority(),
3855 * the task might be in the middle of scheduling on another CPU.
3857 rq = task_rq_lock(p, &flags);
3858 now = rq_clock(rq);
3860 * The RT priorities are set via sched_setscheduler(), but we still
3861 * allow the 'normal' nice value to be set - but as expected
3862 * it wont have any effect on scheduling until the task is
3863 * SCHED_FIFO/SCHED_RR:
3865 if (task_has_rt_policy(p)) {
3866 p->static_prio = NICE_TO_PRIO(nice);
3867 goto out_unlock;
3869 on_rq = p->se.on_rq;
3870 if (on_rq) {
3871 dequeue_task(rq, p, 0, now);
3872 dec_load(rq, p, now);
3875 p->static_prio = NICE_TO_PRIO(nice);
3876 set_load_weight(p);
3877 old_prio = p->prio;
3878 p->prio = effective_prio(p);
3879 delta = p->prio - old_prio;
3881 if (on_rq) {
3882 enqueue_task(rq, p, 0, now);
3883 inc_load(rq, p, now);
3885 * If the task increased its priority or is running and
3886 * lowered its priority, then reschedule its CPU:
3888 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3889 resched_task(rq->curr);
3891 out_unlock:
3892 task_rq_unlock(rq, &flags);
3894 EXPORT_SYMBOL(set_user_nice);
3897 * can_nice - check if a task can reduce its nice value
3898 * @p: task
3899 * @nice: nice value
3901 int can_nice(const struct task_struct *p, const int nice)
3903 /* convert nice value [19,-20] to rlimit style value [1,40] */
3904 int nice_rlim = 20 - nice;
3906 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
3907 capable(CAP_SYS_NICE));
3910 #ifdef __ARCH_WANT_SYS_NICE
3913 * sys_nice - change the priority of the current process.
3914 * @increment: priority increment
3916 * sys_setpriority is a more generic, but much slower function that
3917 * does similar things.
3919 asmlinkage long sys_nice(int increment)
3921 long nice, retval;
3924 * Setpriority might change our priority at the same moment.
3925 * We don't have to worry. Conceptually one call occurs first
3926 * and we have a single winner.
3928 if (increment < -40)
3929 increment = -40;
3930 if (increment > 40)
3931 increment = 40;
3933 nice = PRIO_TO_NICE(current->static_prio) + increment;
3934 if (nice < -20)
3935 nice = -20;
3936 if (nice > 19)
3937 nice = 19;
3939 if (increment < 0 && !can_nice(current, nice))
3940 return -EPERM;
3942 retval = security_task_setnice(current, nice);
3943 if (retval)
3944 return retval;
3946 set_user_nice(current, nice);
3947 return 0;
3950 #endif
3953 * task_prio - return the priority value of a given task.
3954 * @p: the task in question.
3956 * This is the priority value as seen by users in /proc.
3957 * RT tasks are offset by -200. Normal tasks are centered
3958 * around 0, value goes from -16 to +15.
3960 int task_prio(const struct task_struct *p)
3962 return p->prio - MAX_RT_PRIO;
/**
 * task_nice - return the nice value of a given task.
 * @p: the task in question.
 *
 * Thin exported wrapper around the TASK_NICE() macro.
 */
int task_nice(const struct task_struct *p)
{
	int nice = TASK_NICE(p);

	return nice;
}
EXPORT_SYMBOL_GPL(task_nice);
3976 * idle_cpu - is a given cpu idle currently?
3977 * @cpu: the processor in question.
3979 int idle_cpu(int cpu)
3981 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
3985 * idle_task - return the idle task for a given cpu.
3986 * @cpu: the processor in question.
3988 struct task_struct *idle_task(int cpu)
3990 return cpu_rq(cpu)->idle;
3994 * find_process_by_pid - find a process with a matching PID value.
3995 * @pid: the pid in question.
3997 static inline struct task_struct *find_process_by_pid(pid_t pid)
3999 return pid ? find_task_by_pid(pid) : current;
4002 /* Actually do priority change: must hold rq lock. */
4003 static void
4004 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4006 BUG_ON(p->se.on_rq);
4008 p->policy = policy;
4009 switch (p->policy) {
4010 case SCHED_NORMAL:
4011 case SCHED_BATCH:
4012 case SCHED_IDLE:
4013 p->sched_class = &fair_sched_class;
4014 break;
4015 case SCHED_FIFO:
4016 case SCHED_RR:
4017 p->sched_class = &rt_sched_class;
4018 break;
4021 p->rt_priority = prio;
4022 p->normal_prio = normal_prio(p);
4023 /* we are holding p->pi_lock already */
4024 p->prio = rt_mutex_getprio(p);
4025 set_load_weight(p);
4029 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
4030 * @p: the task in question.
4031 * @policy: new policy.
4032 * @param: structure containing the new RT priority.
4034 * NOTE that the task may be already dead.
4036 int sched_setscheduler(struct task_struct *p, int policy,
4037 struct sched_param *param)
4039 int retval, oldprio, oldpolicy = -1, on_rq;
4040 unsigned long flags;
4041 struct rq *rq;
4043 /* may grab non-irq protected spin_locks */
4044 BUG_ON(in_interrupt());
4045 recheck:
4046 /* double check policy once rq lock held */
4047 if (policy < 0)
4048 policy = oldpolicy = p->policy;
4049 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
4050 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4051 policy != SCHED_IDLE)
4052 return -EINVAL;
4054 * Valid priorities for SCHED_FIFO and SCHED_RR are
4055 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
4056 * SCHED_BATCH and SCHED_IDLE is 0.
4058 if (param->sched_priority < 0 ||
4059 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
4060 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
4061 return -EINVAL;
4062 if (rt_policy(policy) != (param->sched_priority != 0))
4063 return -EINVAL;
4066 * Allow unprivileged RT tasks to decrease priority:
4068 if (!capable(CAP_SYS_NICE)) {
4069 if (rt_policy(policy)) {
4070 unsigned long rlim_rtprio;
4072 if (!lock_task_sighand(p, &flags))
4073 return -ESRCH;
4074 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
4075 unlock_task_sighand(p, &flags);
4077 /* can't set/change the rt policy */
4078 if (policy != p->policy && !rlim_rtprio)
4079 return -EPERM;
4081 /* can't increase priority */
4082 if (param->sched_priority > p->rt_priority &&
4083 param->sched_priority > rlim_rtprio)
4084 return -EPERM;
4087 * Like positive nice levels, dont allow tasks to
4088 * move out of SCHED_IDLE either:
4090 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
4091 return -EPERM;
4093 /* can't change other user's priorities */
4094 if ((current->euid != p->euid) &&
4095 (current->euid != p->uid))
4096 return -EPERM;
4099 retval = security_task_setscheduler(p, policy, param);
4100 if (retval)
4101 return retval;
4103 * make sure no PI-waiters arrive (or leave) while we are
4104 * changing the priority of the task:
4106 spin_lock_irqsave(&p->pi_lock, flags);
4108 * To be able to change p->policy safely, the apropriate
4109 * runqueue lock must be held.
4111 rq = __task_rq_lock(p);
4112 /* recheck policy now with rq lock held */
4113 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4114 policy = oldpolicy = -1;
4115 __task_rq_unlock(rq);
4116 spin_unlock_irqrestore(&p->pi_lock, flags);
4117 goto recheck;
4119 on_rq = p->se.on_rq;
4120 if (on_rq)
4121 deactivate_task(rq, p, 0);
4122 oldprio = p->prio;
4123 __setscheduler(rq, p, policy, param->sched_priority);
4124 if (on_rq) {
4125 activate_task(rq, p, 0);
4127 * Reschedule if we are currently running on this runqueue and
4128 * our priority decreased, or if we are not currently running on
4129 * this runqueue and our priority is higher than the current's
4131 if (task_running(rq, p)) {
4132 if (p->prio > oldprio)
4133 resched_task(rq->curr);
4134 } else {
4135 check_preempt_curr(rq, p);
4138 __task_rq_unlock(rq);
4139 spin_unlock_irqrestore(&p->pi_lock, flags);
4141 rt_mutex_adjust_pi(p);
4143 return 0;
4145 EXPORT_SYMBOL_GPL(sched_setscheduler);
4147 static int
4148 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4150 struct sched_param lparam;
4151 struct task_struct *p;
4152 int retval;
4154 if (!param || pid < 0)
4155 return -EINVAL;
4156 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4157 return -EFAULT;
4159 rcu_read_lock();
4160 retval = -ESRCH;
4161 p = find_process_by_pid(pid);
4162 if (p != NULL)
4163 retval = sched_setscheduler(p, policy, &lparam);
4164 rcu_read_unlock();
4166 return retval;
4170 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
4171 * @pid: the pid in question.
4172 * @policy: new policy.
4173 * @param: structure containing the new RT priority.
4175 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
4176 struct sched_param __user *param)
4178 /* negative values for policy are not valid */
4179 if (policy < 0)
4180 return -EINVAL;
4182 return do_sched_setscheduler(pid, policy, param);
4186 * sys_sched_setparam - set/change the RT priority of a thread
4187 * @pid: the pid in question.
4188 * @param: structure containing the new RT priority.
4190 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
4192 return do_sched_setscheduler(pid, -1, param);
4196 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4197 * @pid: the pid in question.
4199 asmlinkage long sys_sched_getscheduler(pid_t pid)
4201 struct task_struct *p;
4202 int retval = -EINVAL;
4204 if (pid < 0)
4205 goto out_nounlock;
4207 retval = -ESRCH;
4208 read_lock(&tasklist_lock);
4209 p = find_process_by_pid(pid);
4210 if (p) {
4211 retval = security_task_getscheduler(p);
4212 if (!retval)
4213 retval = p->policy;
4215 read_unlock(&tasklist_lock);
4217 out_nounlock:
4218 return retval;
4222 * sys_sched_getscheduler - get the RT priority of a thread
4223 * @pid: the pid in question.
4224 * @param: structure containing the RT priority.
4226 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
4228 struct sched_param lp;
4229 struct task_struct *p;
4230 int retval = -EINVAL;
4232 if (!param || pid < 0)
4233 goto out_nounlock;
4235 read_lock(&tasklist_lock);
4236 p = find_process_by_pid(pid);
4237 retval = -ESRCH;
4238 if (!p)
4239 goto out_unlock;
4241 retval = security_task_getscheduler(p);
4242 if (retval)
4243 goto out_unlock;
4245 lp.sched_priority = p->rt_priority;
4246 read_unlock(&tasklist_lock);
4249 * This one might sleep, we cannot do it with a spinlock held ...
4251 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4253 out_nounlock:
4254 return retval;
4256 out_unlock:
4257 read_unlock(&tasklist_lock);
4258 return retval;
4261 long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4263 cpumask_t cpus_allowed;
4264 struct task_struct *p;
4265 int retval;
4267 mutex_lock(&sched_hotcpu_mutex);
4268 read_lock(&tasklist_lock);
4270 p = find_process_by_pid(pid);
4271 if (!p) {
4272 read_unlock(&tasklist_lock);
4273 mutex_unlock(&sched_hotcpu_mutex);
4274 return -ESRCH;
4278 * It is not safe to call set_cpus_allowed with the
4279 * tasklist_lock held. We will bump the task_struct's
4280 * usage count and then drop tasklist_lock.
4282 get_task_struct(p);
4283 read_unlock(&tasklist_lock);
4285 retval = -EPERM;
4286 if ((current->euid != p->euid) && (current->euid != p->uid) &&
4287 !capable(CAP_SYS_NICE))
4288 goto out_unlock;
4290 retval = security_task_setscheduler(p, 0, NULL);
4291 if (retval)
4292 goto out_unlock;
4294 cpus_allowed = cpuset_cpus_allowed(p);
4295 cpus_and(new_mask, new_mask, cpus_allowed);
4296 retval = set_cpus_allowed(p, new_mask);
4298 out_unlock:
4299 put_task_struct(p);
4300 mutex_unlock(&sched_hotcpu_mutex);
4301 return retval;
4304 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4305 cpumask_t *new_mask)
4307 if (len < sizeof(cpumask_t)) {
4308 memset(new_mask, 0, sizeof(cpumask_t));
4309 } else if (len > sizeof(cpumask_t)) {
4310 len = sizeof(cpumask_t);
4312 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4316 * sys_sched_setaffinity - set the cpu affinity of a process
4317 * @pid: pid of the process
4318 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4319 * @user_mask_ptr: user-space pointer to the new cpu mask
4321 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
4322 unsigned long __user *user_mask_ptr)
4324 cpumask_t new_mask;
4325 int retval;
4327 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
4328 if (retval)
4329 return retval;
4331 return sched_setaffinity(pid, new_mask);
4335 * Represents all cpu's present in the system
4336 * In systems capable of hotplug, this map could dynamically grow
4337 * as new cpu's are detected in the system via any platform specific
4338 * method, such as ACPI for e.g.
4341 cpumask_t cpu_present_map __read_mostly;
4342 EXPORT_SYMBOL(cpu_present_map);
4344 #ifndef CONFIG_SMP
4345 cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
4346 EXPORT_SYMBOL(cpu_online_map);
4348 cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
4349 EXPORT_SYMBOL(cpu_possible_map);
4350 #endif
4352 long sched_getaffinity(pid_t pid, cpumask_t *mask)
4354 struct task_struct *p;
4355 int retval;
4357 mutex_lock(&sched_hotcpu_mutex);
4358 read_lock(&tasklist_lock);
4360 retval = -ESRCH;
4361 p = find_process_by_pid(pid);
4362 if (!p)
4363 goto out_unlock;
4365 retval = security_task_getscheduler(p);
4366 if (retval)
4367 goto out_unlock;
4369 cpus_and(*mask, p->cpus_allowed, cpu_online_map);
4371 out_unlock:
4372 read_unlock(&tasklist_lock);
4373 mutex_unlock(&sched_hotcpu_mutex);
4374 if (retval)
4375 return retval;
4377 return 0;
4381 * sys_sched_getaffinity - get the cpu affinity of a process
4382 * @pid: pid of the process
4383 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4384 * @user_mask_ptr: user-space pointer to hold the current cpu mask
4386 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
4387 unsigned long __user *user_mask_ptr)
4389 int ret;
4390 cpumask_t mask;
4392 if (len < sizeof(cpumask_t))
4393 return -EINVAL;
4395 ret = sched_getaffinity(pid, &mask);
4396 if (ret < 0)
4397 return ret;
4399 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
4400 return -EFAULT;
4402 return sizeof(cpumask_t);
4406 * sys_sched_yield - yield the current processor to other threads.
4408 * This function yields the current CPU to other tasks. If there are no
4409 * other threads running on this CPU then this function will return.
4411 asmlinkage long sys_sched_yield(void)
4413 struct rq *rq = this_rq_lock();
4415 schedstat_inc(rq, yld_cnt);
4416 if (unlikely(rq->nr_running == 1))
4417 schedstat_inc(rq, yld_act_empty);
4418 else
4419 current->sched_class->yield_task(rq, current);
4422 * Since we are going to call schedule() anyway, there's
4423 * no need to preempt or enable interrupts:
4425 __release(rq->lock);
4426 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4427 _raw_spin_unlock(&rq->lock);
4428 preempt_enable_no_resched();
4430 schedule();
4432 return 0;
4435 static void __cond_resched(void)
4437 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4438 __might_sleep(__FILE__, __LINE__);
4439 #endif
4441 * The BKS might be reacquired before we have dropped
4442 * PREEMPT_ACTIVE, which could trigger a second
4443 * cond_resched() call.
4445 do {
4446 add_preempt_count(PREEMPT_ACTIVE);
4447 schedule();
4448 sub_preempt_count(PREEMPT_ACTIVE);
4449 } while (need_resched());
4452 int __sched cond_resched(void)
4454 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
4455 system_state == SYSTEM_RUNNING) {
4456 __cond_resched();
4457 return 1;
4459 return 0;
4461 EXPORT_SYMBOL(cond_resched);
4464 * cond_resched_lock() - if a reschedule is pending, drop the given lock,
4465 * call schedule, and on return reacquire the lock.
4467 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
4468 * operations here to prevent schedule() from being called twice (once via
4469 * spin_unlock(), once by hand).
4471 int cond_resched_lock(spinlock_t *lock)
4473 int ret = 0;
4475 if (need_lockbreak(lock)) {
4476 spin_unlock(lock);
4477 cpu_relax();
4478 ret = 1;
4479 spin_lock(lock);
4481 if (need_resched() && system_state == SYSTEM_RUNNING) {
4482 spin_release(&lock->dep_map, 1, _THIS_IP_);
4483 _raw_spin_unlock(lock);
4484 preempt_enable_no_resched();
4485 __cond_resched();
4486 ret = 1;
4487 spin_lock(lock);
4489 return ret;
4491 EXPORT_SYMBOL(cond_resched_lock);
4493 int __sched cond_resched_softirq(void)
4495 BUG_ON(!in_softirq());
4497 if (need_resched() && system_state == SYSTEM_RUNNING) {
4498 local_bh_enable();
4499 __cond_resched();
4500 local_bh_disable();
4501 return 1;
4503 return 0;
4505 EXPORT_SYMBOL(cond_resched_softirq);
4508 * yield - yield the current processor to other threads.
4510 * This is a shortcut for kernel-space yielding - it marks the
4511 * thread runnable and calls sys_sched_yield().
4513 void __sched yield(void)
4515 set_current_state(TASK_RUNNING);
4516 sys_sched_yield();
4518 EXPORT_SYMBOL(yield);
4521 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
4522 * that process accounting knows that this is a task in IO wait state.
4524 * But don't do that if it is a deliberate, throttling IO wait (this task
4525 * has set its backing_dev_info: the queue against which it should throttle)
4527 void __sched io_schedule(void)
4529 struct rq *rq = &__raw_get_cpu_var(runqueues);
4531 delayacct_blkio_start();
4532 atomic_inc(&rq->nr_iowait);
4533 schedule();
4534 atomic_dec(&rq->nr_iowait);
4535 delayacct_blkio_end();
4537 EXPORT_SYMBOL(io_schedule);
4539 long __sched io_schedule_timeout(long timeout)
4541 struct rq *rq = &__raw_get_cpu_var(runqueues);
4542 long ret;
4544 delayacct_blkio_start();
4545 atomic_inc(&rq->nr_iowait);
4546 ret = schedule_timeout(timeout);
4547 atomic_dec(&rq->nr_iowait);
4548 delayacct_blkio_end();
4549 return ret;
4553 * sys_sched_get_priority_max - return maximum RT priority.
4554 * @policy: scheduling class.
4556 * this syscall returns the maximum rt_priority that can be used
4557 * by a given scheduling class.
4559 asmlinkage long sys_sched_get_priority_max(int policy)
4561 int ret = -EINVAL;
4563 switch (policy) {
4564 case SCHED_FIFO:
4565 case SCHED_RR:
4566 ret = MAX_USER_RT_PRIO-1;
4567 break;
4568 case SCHED_NORMAL:
4569 case SCHED_BATCH:
4570 case SCHED_IDLE:
4571 ret = 0;
4572 break;
4574 return ret;
4578 * sys_sched_get_priority_min - return minimum RT priority.
4579 * @policy: scheduling class.
4581 * this syscall returns the minimum rt_priority that can be used
4582 * by a given scheduling class.
4584 asmlinkage long sys_sched_get_priority_min(int policy)
4586 int ret = -EINVAL;
4588 switch (policy) {
4589 case SCHED_FIFO:
4590 case SCHED_RR:
4591 ret = 1;
4592 break;
4593 case SCHED_NORMAL:
4594 case SCHED_BATCH:
4595 case SCHED_IDLE:
4596 ret = 0;
4598 return ret;
4602 * sys_sched_rr_get_interval - return the default timeslice of a process.
4603 * @pid: pid of the process.
4604 * @interval: userspace pointer to the timeslice value.
4606 * this syscall writes the default timeslice value of a given process
4607 * into the user-space timespec buffer. A value of '0' means infinity.
4609 asmlinkage
4610 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4612 struct task_struct *p;
4613 int retval = -EINVAL;
4614 struct timespec t;
4616 if (pid < 0)
4617 goto out_nounlock;
4619 retval = -ESRCH;
4620 read_lock(&tasklist_lock);
4621 p = find_process_by_pid(pid);
4622 if (!p)
4623 goto out_unlock;
4625 retval = security_task_getscheduler(p);
4626 if (retval)
4627 goto out_unlock;
4629 jiffies_to_timespec(p->policy == SCHED_FIFO ?
4630 0 : static_prio_timeslice(p->static_prio), &t);
4631 read_unlock(&tasklist_lock);
4632 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4633 out_nounlock:
4634 return retval;
4635 out_unlock:
4636 read_unlock(&tasklist_lock);
4637 return retval;
4640 static const char stat_nam[] = "RSDTtZX";
4642 static void show_task(struct task_struct *p)
4644 unsigned long free = 0;
4645 unsigned state;
4647 state = p->state ? __ffs(p->state) + 1 : 0;
4648 printk("%-13.13s %c", p->comm,
4649 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4650 #if BITS_PER_LONG == 32
4651 if (state == TASK_RUNNING)
4652 printk(" running ");
4653 else
4654 printk(" %08lx ", thread_saved_pc(p));
4655 #else
4656 if (state == TASK_RUNNING)
4657 printk(" running task ");
4658 else
4659 printk(" %016lx ", thread_saved_pc(p));
4660 #endif
4661 #ifdef CONFIG_DEBUG_STACK_USAGE
4663 unsigned long *n = end_of_stack(p);
4664 while (!*n)
4665 n++;
4666 free = (unsigned long)n - (unsigned long)end_of_stack(p);
4668 #endif
4669 printk("%5lu %5d %6d\n", free, p->pid, p->parent->pid);
4671 if (state != TASK_RUNNING)
4672 show_stack(p, NULL);
4675 void show_state_filter(unsigned long state_filter)
4677 struct task_struct *g, *p;
4679 #if BITS_PER_LONG == 32
4680 printk(KERN_INFO
4681 " task PC stack pid father\n");
4682 #else
4683 printk(KERN_INFO
4684 " task PC stack pid father\n");
4685 #endif
4686 read_lock(&tasklist_lock);
4687 do_each_thread(g, p) {
4689 * reset the NMI-timeout, listing all files on a slow
4690 * console might take alot of time:
4692 touch_nmi_watchdog();
4693 if (!state_filter || (p->state & state_filter))
4694 show_task(p);
4695 } while_each_thread(g, p);
4697 touch_all_softlockup_watchdogs();
4699 #ifdef CONFIG_SCHED_DEBUG
4700 sysrq_sched_debug_show();
4701 #endif
4702 read_unlock(&tasklist_lock);
4704 * Only show locks if all tasks are dumped:
4706 if (state_filter == -1)
4707 debug_show_all_locks();
4710 void __cpuinit init_idle_bootup_task(struct task_struct *idle)
4712 idle->sched_class = &idle_sched_class;
4716 * init_idle - set up an idle thread for a given CPU
4717 * @idle: task in question
4718 * @cpu: cpu the idle task belongs to
4720 * NOTE: this function does not set the idle thread's NEED_RESCHED
4721 * flag, to make booting more robust.
4723 void __cpuinit init_idle(struct task_struct *idle, int cpu)
4725 struct rq *rq = cpu_rq(cpu);
4726 unsigned long flags;
4728 __sched_fork(idle);
4729 idle->se.exec_start = sched_clock();
4731 idle->prio = idle->normal_prio = MAX_PRIO;
4732 idle->cpus_allowed = cpumask_of_cpu(cpu);
4733 __set_task_cpu(idle, cpu);
4735 spin_lock_irqsave(&rq->lock, flags);
4736 rq->curr = rq->idle = idle;
4737 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
4738 idle->oncpu = 1;
4739 #endif
4740 spin_unlock_irqrestore(&rq->lock, flags);
4742 /* Set the preempt count _outside_ the spinlocks! */
4743 #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
4744 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
4745 #else
4746 task_thread_info(idle)->preempt_count = 0;
4747 #endif
4749 * The idle tasks have their own, simple scheduling class:
4751 idle->sched_class = &idle_sched_class;
4755 * In a system that switches off the HZ timer nohz_cpu_mask
4756 * indicates which cpus entered this state. This is used
4757 * in the rcu update to wait only for active cpus. For system
4758 * which do not switch off the HZ timer nohz_cpu_mask should
4759 * always be CPU_MASK_NONE.
4761 cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4764 * Increase the granularity value when there are more CPUs,
4765 * because with more CPUs the 'effective latency' as visible
4766 * to users decreases. But the relationship is not linear,
4767 * so pick a second-best guess by going with the log2 of the
4768 * number of CPUs.
4770 * This idea comes from the SD scheduler of Con Kolivas:
4772 static inline void sched_init_granularity(void)
4774 unsigned int factor = 1 + ilog2(num_online_cpus());
4775 const unsigned long gran_limit = 100000000;
4777 sysctl_sched_granularity *= factor;
4778 if (sysctl_sched_granularity > gran_limit)
4779 sysctl_sched_granularity = gran_limit;
4781 sysctl_sched_runtime_limit = sysctl_sched_granularity * 4;
4782 sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
4785 #ifdef CONFIG_SMP
/*
 * This is how migration works:
 *
 * 1) we queue a struct migration_req structure in the source CPU's
 *    runqueue and wake up that CPU's migration thread.
 * 2) we wait for the request's completion (req.done) => thread blocks.
 * 3) migration thread wakes up (implicitly it forces the migrated
 *    thread off the CPU)
 * 4) it gets the migration request and checks whether the migrated
 *    task is still in the wrong runqueue.
 * 5) if it's in the wrong runqueue then the migration thread removes
 *    it and puts it into the right queue.
 * 6) migration thread signals that the request is done.
 * 7) we wake up and the migration is done.
 */
4803 * Change a given task's CPU affinity. Migrate the thread to a
4804 * proper CPU and schedule it away if the CPU it's executing on
4805 * is removed from the allowed bitmask.
4807 * NOTE: the caller must have a valid reference to the task, the
4808 * task must not exit() & deallocate itself prematurely. The
4809 * call is not atomic; no spinlocks may be held.
4811 int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
4813 struct migration_req req;
4814 unsigned long flags;
4815 struct rq *rq;
4816 int ret = 0;
4818 rq = task_rq_lock(p, &flags);
4819 if (!cpus_intersects(new_mask, cpu_online_map)) {
4820 ret = -EINVAL;
4821 goto out;
4824 p->cpus_allowed = new_mask;
4825 /* Can the task run on the task's current CPU? If so, we're done */
4826 if (cpu_isset(task_cpu(p), new_mask))
4827 goto out;
4829 if (migrate_task(p, any_online_cpu(new_mask), &req)) {
4830 /* Need help from migration thread: drop lock and wait. */
4831 task_rq_unlock(rq, &flags);
4832 wake_up_process(rq->migration_thread);
4833 wait_for_completion(&req.done);
4834 tlb_migrate_finish(p->mm);
4835 return 0;
4837 out:
4838 task_rq_unlock(rq, &flags);
4840 return ret;
4842 EXPORT_SYMBOL_GPL(set_cpus_allowed);
4845 * Move (not current) task off this cpu, onto dest cpu. We're doing
4846 * this because either it can't run here any more (set_cpus_allowed()
4847 * away from this CPU, or CPU going down), or because we're
4848 * attempting to rebalance this task on exec (sched_exec).
4850 * So we race with normal scheduler movements, but that's OK, as long
4851 * as the task is no longer on this CPU.
4853 * Returns non-zero if task was successfully migrated.
4855 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4857 struct rq *rq_dest, *rq_src;
4858 int ret = 0, on_rq;
4860 if (unlikely(cpu_is_offline(dest_cpu)))
4861 return ret;
4863 rq_src = cpu_rq(src_cpu);
4864 rq_dest = cpu_rq(dest_cpu);
4866 double_rq_lock(rq_src, rq_dest);
4867 /* Already moved. */
4868 if (task_cpu(p) != src_cpu)
4869 goto out;
4870 /* Affinity changed (again). */
4871 if (!cpu_isset(dest_cpu, p->cpus_allowed))
4872 goto out;
4874 on_rq = p->se.on_rq;
4875 if (on_rq)
4876 deactivate_task(rq_src, p, 0);
4877 set_task_cpu(p, dest_cpu);
4878 if (on_rq) {
4879 activate_task(rq_dest, p, 0);
4880 check_preempt_curr(rq_dest, p);
4882 ret = 1;
4883 out:
4884 double_rq_unlock(rq_src, rq_dest);
4885 return ret;
4889 * migration_thread - this is a highprio system thread that performs
4890 * thread migration by bumping thread off CPU then 'pushing' onto
4891 * another runqueue.
4893 static int migration_thread(void *data)
4895 int cpu = (long)data;
4896 struct rq *rq;
4898 rq = cpu_rq(cpu);
4899 BUG_ON(rq->migration_thread != current);
4901 set_current_state(TASK_INTERRUPTIBLE);
4902 while (!kthread_should_stop()) {
4903 struct migration_req *req;
4904 struct list_head *head;
4906 try_to_freeze();
4908 spin_lock_irq(&rq->lock);
4910 if (cpu_is_offline(cpu)) {
4911 spin_unlock_irq(&rq->lock);
4912 goto wait_to_die;
4915 if (rq->active_balance) {
4916 active_load_balance(rq, cpu);
4917 rq->active_balance = 0;
4920 head = &rq->migration_queue;
4922 if (list_empty(head)) {
4923 spin_unlock_irq(&rq->lock);
4924 schedule();
4925 set_current_state(TASK_INTERRUPTIBLE);
4926 continue;
4928 req = list_entry(head->next, struct migration_req, list);
4929 list_del_init(head->next);
4931 spin_unlock(&rq->lock);
4932 __migrate_task(req->task, cpu, req->dest_cpu);
4933 local_irq_enable();
4935 complete(&req->done);
4937 __set_current_state(TASK_RUNNING);
4938 return 0;
4940 wait_to_die:
4941 /* Wait for kthread_stop */
4942 set_current_state(TASK_INTERRUPTIBLE);
4943 while (!kthread_should_stop()) {
4944 schedule();
4945 set_current_state(TASK_INTERRUPTIBLE);
4947 __set_current_state(TASK_RUNNING);
4948 return 0;
4951 #ifdef CONFIG_HOTPLUG_CPU
4953 * Figure out where task on dead CPU should go, use force if neccessary.
4954 * NOTE: interrupts should be disabled by the caller
4956 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
4958 unsigned long flags;
4959 cpumask_t mask;
4960 struct rq *rq;
4961 int dest_cpu;
4963 restart:
4964 /* On same node? */
4965 mask = node_to_cpumask(cpu_to_node(dead_cpu));
4966 cpus_and(mask, mask, p->cpus_allowed);
4967 dest_cpu = any_online_cpu(mask);
4969 /* On any allowed CPU? */
4970 if (dest_cpu == NR_CPUS)
4971 dest_cpu = any_online_cpu(p->cpus_allowed);
4973 /* No more Mr. Nice Guy. */
4974 if (dest_cpu == NR_CPUS) {
4975 rq = task_rq_lock(p, &flags);
4976 cpus_setall(p->cpus_allowed);
4977 dest_cpu = any_online_cpu(p->cpus_allowed);
4978 task_rq_unlock(rq, &flags);
4981 * Don't tell them about moving exiting tasks or
4982 * kernel threads (both mm NULL), since they never
4983 * leave kernel.
4985 if (p->mm && printk_ratelimit())
4986 printk(KERN_INFO "process %d (%s) no "
4987 "longer affine to cpu%d\n",
4988 p->pid, p->comm, dead_cpu);
4990 if (!__migrate_task(p, dead_cpu, dest_cpu))
4991 goto restart;
4995 * While a dead CPU has no uninterruptible tasks queued at this point,
4996 * it might still have a nonzero ->nr_uninterruptible counter, because
4997 * for performance reasons the counter is not stricly tracking tasks to
4998 * their home CPUs. So we just add the counter to another CPU's counter,
4999 * to keep the global sum constant after CPU-down:
5001 static void migrate_nr_uninterruptible(struct rq *rq_src)
5003 struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
5004 unsigned long flags;
5006 local_irq_save(flags);
5007 double_rq_lock(rq_src, rq_dest);
5008 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5009 rq_src->nr_uninterruptible = 0;
5010 double_rq_unlock(rq_src, rq_dest);
5011 local_irq_restore(flags);
5014 /* Run through task list and migrate tasks from the dead cpu. */
5015 static void migrate_live_tasks(int src_cpu)
5017 struct task_struct *p, *t;
5019 write_lock_irq(&tasklist_lock);
5021 do_each_thread(t, p) {
5022 if (p == current)
5023 continue;
5025 if (task_cpu(p) == src_cpu)
5026 move_task_off_dead_cpu(src_cpu, p);
5027 } while_each_thread(t, p);
5029 write_unlock_irq(&tasklist_lock);
5033 * Schedules idle task to be the next runnable task on current CPU.
5034 * It does so by boosting its priority to highest possible and adding it to
5035 * the _front_ of the runqueue. Used by CPU offline code.
5037 void sched_idle_next(void)
5039 int this_cpu = smp_processor_id();
5040 struct rq *rq = cpu_rq(this_cpu);
5041 struct task_struct *p = rq->idle;
5042 unsigned long flags;
5044 /* cpu has to be offline */
5045 BUG_ON(cpu_online(this_cpu));
5048 * Strictly not necessary since rest of the CPUs are stopped by now
5049 * and interrupts disabled on the current cpu.
5051 spin_lock_irqsave(&rq->lock, flags);
5053 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5055 /* Add idle task to the _front_ of its priority queue: */
5056 activate_idle_task(p, rq);
5058 spin_unlock_irqrestore(&rq->lock, flags);
5062 * Ensures that the idle task is using init_mm right before its cpu goes
5063 * offline.
5065 void idle_task_exit(void)
5067 struct mm_struct *mm = current->active_mm;
5069 BUG_ON(cpu_online(smp_processor_id()));
5071 if (mm != &init_mm)
5072 switch_mm(mm, &init_mm, current);
5073 mmdrop(mm);
5076 /* called under rq->lock with disabled interrupts */
5077 static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5079 struct rq *rq = cpu_rq(dead_cpu);
5081 /* Must be exiting, otherwise would be on tasklist. */
5082 BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
5084 /* Cannot have done final schedule yet: would have vanished. */
5085 BUG_ON(p->state == TASK_DEAD);
5087 get_task_struct(p);
5090 * Drop lock around migration; if someone else moves it,
5091 * that's OK. No task can be added to this CPU, so iteration is
5092 * fine.
5093 * NOTE: interrupts should be left disabled --dev@
5095 spin_unlock(&rq->lock);
5096 move_task_off_dead_cpu(dead_cpu, p);
5097 spin_lock(&rq->lock);
5099 put_task_struct(p);
5102 /* release_task() removes task from tasklist, so we won't find dead tasks. */
5103 static void migrate_dead_tasks(unsigned int dead_cpu)
5105 struct rq *rq = cpu_rq(dead_cpu);
5106 struct task_struct *next;
5108 for ( ; ; ) {
5109 if (!rq->nr_running)
5110 break;
5111 next = pick_next_task(rq, rq->curr, rq_clock(rq));
5112 if (!next)
5113 break;
5114 migrate_dead(dead_cpu, next);
5117 #endif /* CONFIG_HOTPLUG_CPU */
5120 * migration_call - callback that gets triggered when a CPU is added.
5121 * Here we can start up the necessary migration thread for the new CPU.
5123 static int __cpuinit
5124 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5126 struct task_struct *p;
5127 int cpu = (long)hcpu;
5128 unsigned long flags;
5129 struct rq *rq;
5131 switch (action) {
5132 case CPU_LOCK_ACQUIRE:
5133 mutex_lock(&sched_hotcpu_mutex);
5134 break;
5136 case CPU_UP_PREPARE:
5137 case CPU_UP_PREPARE_FROZEN:
5138 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
5139 if (IS_ERR(p))
5140 return NOTIFY_BAD;
5141 p->flags |= PF_NOFREEZE;
5142 kthread_bind(p, cpu);
5143 /* Must be high prio: stop_machine expects to yield to it. */
5144 rq = task_rq_lock(p, &flags);
5145 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5146 task_rq_unlock(rq, &flags);
5147 cpu_rq(cpu)->migration_thread = p;
5148 break;
5150 case CPU_ONLINE:
5151 case CPU_ONLINE_FROZEN:
5152 /* Strictly unneccessary, as first user will wake it. */
5153 wake_up_process(cpu_rq(cpu)->migration_thread);
5154 break;
5156 #ifdef CONFIG_HOTPLUG_CPU
5157 case CPU_UP_CANCELED:
5158 case CPU_UP_CANCELED_FROZEN:
5159 if (!cpu_rq(cpu)->migration_thread)
5160 break;
5161 /* Unbind it from offline cpu so it can run. Fall thru. */
5162 kthread_bind(cpu_rq(cpu)->migration_thread,
5163 any_online_cpu(cpu_online_map));
5164 kthread_stop(cpu_rq(cpu)->migration_thread);
5165 cpu_rq(cpu)->migration_thread = NULL;
5166 break;
5168 case CPU_DEAD:
5169 case CPU_DEAD_FROZEN:
5170 migrate_live_tasks(cpu);
5171 rq = cpu_rq(cpu);
5172 kthread_stop(rq->migration_thread);
5173 rq->migration_thread = NULL;
5174 /* Idle task back to normal (off runqueue, low prio) */
5175 rq = task_rq_lock(rq->idle, &flags);
5176 deactivate_task(rq, rq->idle, 0);
5177 rq->idle->static_prio = MAX_PRIO;
5178 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5179 rq->idle->sched_class = &idle_sched_class;
5180 migrate_dead_tasks(cpu);
5181 task_rq_unlock(rq, &flags);
5182 migrate_nr_uninterruptible(rq);
5183 BUG_ON(rq->nr_running != 0);
5185 /* No need to migrate the tasks: it was best-effort if
5186 * they didn't take sched_hotcpu_mutex. Just wake up
5187 * the requestors. */
5188 spin_lock_irq(&rq->lock);
5189 while (!list_empty(&rq->migration_queue)) {
5190 struct migration_req *req;
5192 req = list_entry(rq->migration_queue.next,
5193 struct migration_req, list);
5194 list_del_init(&req->list);
5195 complete(&req->done);
5197 spin_unlock_irq(&rq->lock);
5198 break;
5199 #endif
5200 case CPU_LOCK_RELEASE:
5201 mutex_unlock(&sched_hotcpu_mutex);
5202 break;
5204 return NOTIFY_OK;
/*
 * Register at highest priority so that task migration (migrate_live_tasks)
 * happens before everything else.
 */
5210 static struct notifier_block __cpuinitdata migration_notifier = {
5211 .notifier_call = migration_call,
5212 .priority = 10
5215 int __init migration_init(void)
5217 void *cpu = (void *)(long)smp_processor_id();
5218 int err;
5220 /* Start one for the boot CPU: */
5221 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5222 BUG_ON(err == NOTIFY_BAD);
5223 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5224 register_cpu_notifier(&migration_notifier);
5226 return 0;
5228 #endif
5230 #ifdef CONFIG_SMP
5232 /* Number of possible processor ids */
5233 int nr_cpu_ids __read_mostly = NR_CPUS;
5234 EXPORT_SYMBOL(nr_cpu_ids);
5236 #undef SCHED_DOMAIN_DEBUG
5237 #ifdef SCHED_DOMAIN_DEBUG
5238 static void sched_domain_debug(struct sched_domain *sd, int cpu)
5240 int level = 0;
5242 if (!sd) {
5243 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5244 return;
5247 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5249 do {
5250 int i;
5251 char str[NR_CPUS];
5252 struct sched_group *group = sd->groups;
5253 cpumask_t groupmask;
5255 cpumask_scnprintf(str, NR_CPUS, sd->span);
5256 cpus_clear(groupmask);
5258 printk(KERN_DEBUG);
5259 for (i = 0; i < level + 1; i++)
5260 printk(" ");
5261 printk("domain %d: ", level);
5263 if (!(sd->flags & SD_LOAD_BALANCE)) {
5264 printk("does not load-balance\n");
5265 if (sd->parent)
5266 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5267 " has parent");
5268 break;
5271 printk("span %s\n", str);
5273 if (!cpu_isset(cpu, sd->span))
5274 printk(KERN_ERR "ERROR: domain->span does not contain "
5275 "CPU%d\n", cpu);
5276 if (!cpu_isset(cpu, group->cpumask))
5277 printk(KERN_ERR "ERROR: domain->groups does not contain"
5278 " CPU%d\n", cpu);
5280 printk(KERN_DEBUG);
5281 for (i = 0; i < level + 2; i++)
5282 printk(" ");
5283 printk("groups:");
5284 do {
5285 if (!group) {
5286 printk("\n");
5287 printk(KERN_ERR "ERROR: group is NULL\n");
5288 break;
5291 if (!group->__cpu_power) {
5292 printk("\n");
5293 printk(KERN_ERR "ERROR: domain->cpu_power not "
5294 "set\n");
5297 if (!cpus_weight(group->cpumask)) {
5298 printk("\n");
5299 printk(KERN_ERR "ERROR: empty group\n");
5302 if (cpus_intersects(groupmask, group->cpumask)) {
5303 printk("\n");
5304 printk(KERN_ERR "ERROR: repeated CPUs\n");
5307 cpus_or(groupmask, groupmask, group->cpumask);
5309 cpumask_scnprintf(str, NR_CPUS, group->cpumask);
5310 printk(" %s", str);
5312 group = group->next;
5313 } while (group != sd->groups);
5314 printk("\n");
5316 if (!cpus_equal(sd->span, groupmask))
5317 printk(KERN_ERR "ERROR: groups don't span "
5318 "domain->span\n");
5320 level++;
5321 sd = sd->parent;
5322 if (!sd)
5323 continue;
5325 if (!cpus_subset(groupmask, sd->span))
5326 printk(KERN_ERR "ERROR: parent span is not a superset "
5327 "of domain->span\n");
5329 } while (sd);
5331 #else
5332 # define sched_domain_debug(sd, cpu) do { } while (0)
5333 #endif
5335 static int sd_degenerate(struct sched_domain *sd)
5337 if (cpus_weight(sd->span) == 1)
5338 return 1;
5340 /* Following flags need at least 2 groups */
5341 if (sd->flags & (SD_LOAD_BALANCE |
5342 SD_BALANCE_NEWIDLE |
5343 SD_BALANCE_FORK |
5344 SD_BALANCE_EXEC |
5345 SD_SHARE_CPUPOWER |
5346 SD_SHARE_PKG_RESOURCES)) {
5347 if (sd->groups != sd->groups->next)
5348 return 0;
5351 /* Following flags don't use groups */
5352 if (sd->flags & (SD_WAKE_IDLE |
5353 SD_WAKE_AFFINE |
5354 SD_WAKE_BALANCE))
5355 return 0;
5357 return 1;
5360 static int
5361 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5363 unsigned long cflags = sd->flags, pflags = parent->flags;
5365 if (sd_degenerate(parent))
5366 return 1;
5368 if (!cpus_equal(sd->span, parent->span))
5369 return 0;
5371 /* Does parent contain flags not in child? */
5372 /* WAKE_BALANCE is a subset of WAKE_AFFINE */
5373 if (cflags & SD_WAKE_AFFINE)
5374 pflags &= ~SD_WAKE_BALANCE;
5375 /* Flags needing groups don't count if only 1 group in parent */
5376 if (parent->groups == parent->groups->next) {
5377 pflags &= ~(SD_LOAD_BALANCE |
5378 SD_BALANCE_NEWIDLE |
5379 SD_BALANCE_FORK |
5380 SD_BALANCE_EXEC |
5381 SD_SHARE_CPUPOWER |
5382 SD_SHARE_PKG_RESOURCES);
5384 if (~cflags & pflags)
5385 return 0;
5387 return 1;
5391 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
5392 * hold the hotplug lock.
5394 static void cpu_attach_domain(struct sched_domain *sd, int cpu)
5396 struct rq *rq = cpu_rq(cpu);
5397 struct sched_domain *tmp;
5399 /* Remove the sched domains which do not contribute to scheduling. */
5400 for (tmp = sd; tmp; tmp = tmp->parent) {
5401 struct sched_domain *parent = tmp->parent;
5402 if (!parent)
5403 break;
5404 if (sd_parent_degenerate(tmp, parent)) {
5405 tmp->parent = parent->parent;
5406 if (parent->parent)
5407 parent->parent->child = tmp;
5411 if (sd && sd_degenerate(sd)) {
5412 sd = sd->parent;
5413 if (sd)
5414 sd->child = NULL;
5417 sched_domain_debug(sd, cpu);
5419 rcu_assign_pointer(rq->sd, sd);
5422 /* cpus with isolated domains */
5423 static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
5425 /* Setup the mask of cpus configured for isolated domains */
5426 static int __init isolated_cpu_setup(char *str)
5428 int ints[NR_CPUS], i;
5430 str = get_options(str, ARRAY_SIZE(ints), ints);
5431 cpus_clear(cpu_isolated_map);
5432 for (i = 1; i <= ints[0]; i++)
5433 if (ints[i] < NR_CPUS)
5434 cpu_set(ints[i], cpu_isolated_map);
5435 return 1;
5438 __setup ("isolcpus=", isolated_cpu_setup);
5441 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
5442 * to a function which identifies what group(along with sched group) a CPU
5443 * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS
5444 * (due to the fact that we keep track of groups covered with a cpumask_t).
5446 * init_sched_build_groups will build a circular linked list of the groups
5447 * covered by the given span, and will set each group's ->cpumask correctly,
5448 * and ->cpu_power to 0.
5450 static void
5451 init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5452 int (*group_fn)(int cpu, const cpumask_t *cpu_map,
5453 struct sched_group **sg))
5455 struct sched_group *first = NULL, *last = NULL;
5456 cpumask_t covered = CPU_MASK_NONE;
5457 int i;
5459 for_each_cpu_mask(i, span) {
5460 struct sched_group *sg;
5461 int group = group_fn(i, cpu_map, &sg);
5462 int j;
5464 if (cpu_isset(i, covered))
5465 continue;
5467 sg->cpumask = CPU_MASK_NONE;
5468 sg->__cpu_power = 0;
5470 for_each_cpu_mask(j, span) {
5471 if (group_fn(j, cpu_map, NULL) != group)
5472 continue;
5474 cpu_set(j, covered);
5475 cpu_set(j, sg->cpumask);
5477 if (!first)
5478 first = sg;
5479 if (last)
5480 last->next = sg;
5481 last = sg;
5483 last->next = first;
5486 #define SD_NODES_PER_DOMAIN 16
5488 #ifdef CONFIG_NUMA
5491 * find_next_best_node - find the next node to include in a sched_domain
5492 * @node: node whose sched_domain we're building
5493 * @used_nodes: nodes already in the sched_domain
5495 * Find the next node to include in a given scheduling domain. Simply
5496 * finds the closest node not already in the @used_nodes map.
5498 * Should use nodemask_t.
5500 static int find_next_best_node(int node, unsigned long *used_nodes)
5502 int i, n, val, min_val, best_node = 0;
5504 min_val = INT_MAX;
5506 for (i = 0; i < MAX_NUMNODES; i++) {
5507 /* Start at @node */
5508 n = (node + i) % MAX_NUMNODES;
5510 if (!nr_cpus_node(n))
5511 continue;
5513 /* Skip already used nodes */
5514 if (test_bit(n, used_nodes))
5515 continue;
5517 /* Simple min distance search */
5518 val = node_distance(node, n);
5520 if (val < min_val) {
5521 min_val = val;
5522 best_node = n;
5526 set_bit(best_node, used_nodes);
5527 return best_node;
5531 * sched_domain_node_span - get a cpumask for a node's sched_domain
5532 * @node: node whose cpumask we're constructing
5533 * @size: number of nodes to include in this span
5535 * Given a node, construct a good cpumask for its sched_domain to span. It
5536 * should be one that prevents unnecessary balancing, but also spreads tasks
5537 * out optimally.
5539 static cpumask_t sched_domain_node_span(int node)
5541 DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
5542 cpumask_t span, nodemask;
5543 int i;
5545 cpus_clear(span);
5546 bitmap_zero(used_nodes, MAX_NUMNODES);
5548 nodemask = node_to_cpumask(node);
5549 cpus_or(span, span, nodemask);
5550 set_bit(node, used_nodes);
5552 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5553 int next_node = find_next_best_node(node, used_nodes);
5555 nodemask = node_to_cpumask(next_node);
5556 cpus_or(span, span, nodemask);
5559 return span;
5561 #endif
5563 int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
5566 * SMT sched-domains:
5568 #ifdef CONFIG_SCHED_SMT
5569 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
5570 static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
5572 static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map,
5573 struct sched_group **sg)
5575 if (sg)
5576 *sg = &per_cpu(sched_group_cpus, cpu);
5577 return cpu;
5579 #endif
5582 * multi-core sched-domains:
5584 #ifdef CONFIG_SCHED_MC
5585 static DEFINE_PER_CPU(struct sched_domain, core_domains);
5586 static DEFINE_PER_CPU(struct sched_group, sched_group_core);
5587 #endif
5589 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
5590 static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5591 struct sched_group **sg)
5593 int group;
5594 cpumask_t mask = cpu_sibling_map[cpu];
5595 cpus_and(mask, mask, *cpu_map);
5596 group = first_cpu(mask);
5597 if (sg)
5598 *sg = &per_cpu(sched_group_core, group);
5599 return group;
5601 #elif defined(CONFIG_SCHED_MC)
5602 static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5603 struct sched_group **sg)
5605 if (sg)
5606 *sg = &per_cpu(sched_group_core, cpu);
5607 return cpu;
5609 #endif
5611 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
5612 static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
5614 static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map,
5615 struct sched_group **sg)
5617 int group;
5618 #ifdef CONFIG_SCHED_MC
5619 cpumask_t mask = cpu_coregroup_map(cpu);
5620 cpus_and(mask, mask, *cpu_map);
5621 group = first_cpu(mask);
5622 #elif defined(CONFIG_SCHED_SMT)
5623 cpumask_t mask = cpu_sibling_map[cpu];
5624 cpus_and(mask, mask, *cpu_map);
5625 group = first_cpu(mask);
5626 #else
5627 group = cpu;
5628 #endif
5629 if (sg)
5630 *sg = &per_cpu(sched_group_phys, group);
5631 return group;
5634 #ifdef CONFIG_NUMA
5636 * The init_sched_build_groups can't handle what we want to do with node
5637 * groups, so roll our own. Now each node has its own list of groups which
5638 * gets dynamically allocated.
5640 static DEFINE_PER_CPU(struct sched_domain, node_domains);
5641 static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
5643 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
5644 static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
5646 static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
5647 struct sched_group **sg)
5649 cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
5650 int group;
5652 cpus_and(nodemask, nodemask, *cpu_map);
5653 group = first_cpu(nodemask);
5655 if (sg)
5656 *sg = &per_cpu(sched_group_allnodes, group);
5657 return group;
5660 static void init_numa_sched_groups_power(struct sched_group *group_head)
5662 struct sched_group *sg = group_head;
5663 int j;
5665 if (!sg)
5666 return;
5667 next_sg:
5668 for_each_cpu_mask(j, sg->cpumask) {
5669 struct sched_domain *sd;
5671 sd = &per_cpu(phys_domains, j);
5672 if (j != first_cpu(sd->groups->cpumask)) {
5674 * Only add "power" once for each
5675 * physical package.
5677 continue;
5680 sg_inc_cpu_power(sg, sd->groups->__cpu_power);
5682 sg = sg->next;
5683 if (sg != group_head)
5684 goto next_sg;
5686 #endif
5688 #ifdef CONFIG_NUMA
5689 /* Free memory allocated for various sched_group structures */
5690 static void free_sched_groups(const cpumask_t *cpu_map)
5692 int cpu, i;
5694 for_each_cpu_mask(cpu, *cpu_map) {
5695 struct sched_group **sched_group_nodes
5696 = sched_group_nodes_bycpu[cpu];
5698 if (!sched_group_nodes)
5699 continue;
5701 for (i = 0; i < MAX_NUMNODES; i++) {
5702 cpumask_t nodemask = node_to_cpumask(i);
5703 struct sched_group *oldsg, *sg = sched_group_nodes[i];
5705 cpus_and(nodemask, nodemask, *cpu_map);
5706 if (cpus_empty(nodemask))
5707 continue;
5709 if (sg == NULL)
5710 continue;
5711 sg = sg->next;
5712 next_sg:
5713 oldsg = sg;
5714 sg = sg->next;
5715 kfree(oldsg);
5716 if (oldsg != sched_group_nodes[i])
5717 goto next_sg;
5719 kfree(sched_group_nodes);
5720 sched_group_nodes_bycpu[cpu] = NULL;
5723 #else
/*
 * !CONFIG_NUMA variant: intentionally empty, it only exists so that
 * callers need no #ifdef.
 */
static void free_sched_groups(const cpumask_t *cpu_map)
{
}
5727 #endif
5730 * Initialize sched groups cpu_power.
5732 * cpu_power indicates the capacity of sched group, which is used while
5733 * distributing the load between different sched groups in a sched domain.
5734 * Typically cpu_power for all the groups in a sched domain will be same unless
5735 * there are asymmetries in the topology. If there are asymmetries, group
5736 * having more cpu_power will pickup more load compared to the group having
5737 * less cpu_power.
5739 * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
5740 * the maximum number of tasks a group can handle in the presence of other idle
5741 * or lightly loaded groups in the same sched domain.
5743 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5745 struct sched_domain *child;
5746 struct sched_group *group;
5748 WARN_ON(!sd || !sd->groups);
5750 if (cpu != first_cpu(sd->groups->cpumask))
5751 return;
5753 child = sd->child;
5755 sd->groups->__cpu_power = 0;
5758 * For perf policy, if the groups in child domain share resources
5759 * (for example cores sharing some portions of the cache hierarchy
5760 * or SMT), then set this domain groups cpu_power such that each group
5761 * can handle only one task, when there are other idle groups in the
5762 * same sched domain.
5764 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
5765 (child->flags &
5766 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
5767 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
5768 return;
5772 * add cpu_power of each child group to this groups cpu_power
5774 group = child->groups;
5775 do {
5776 sg_inc_cpu_power(sd->groups, group->__cpu_power);
5777 group = group->next;
5778 } while (group != child->groups);
5782 * Build sched domains for a given set of cpus and attach the sched domains
5783 * to the individual cpus
5785 static int build_sched_domains(const cpumask_t *cpu_map)
5787 int i;
5788 #ifdef CONFIG_NUMA
5789 struct sched_group **sched_group_nodes = NULL;
5790 int sd_allnodes = 0;
5793 * Allocate the per-node list of sched groups
5795 sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES,
5796 GFP_KERNEL);
5797 if (!sched_group_nodes) {
5798 printk(KERN_WARNING "Can not alloc sched group node list\n");
5799 return -ENOMEM;
5801 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
5802 #endif
5805 * Set up domains for cpus specified by the cpu_map.
5807 for_each_cpu_mask(i, *cpu_map) {
5808 struct sched_domain *sd = NULL, *p;
5809 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
5811 cpus_and(nodemask, nodemask, *cpu_map);
5813 #ifdef CONFIG_NUMA
5814 if (cpus_weight(*cpu_map) >
5815 SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
5816 sd = &per_cpu(allnodes_domains, i);
5817 *sd = SD_ALLNODES_INIT;
5818 sd->span = *cpu_map;
5819 cpu_to_allnodes_group(i, cpu_map, &sd->groups);
5820 p = sd;
5821 sd_allnodes = 1;
5822 } else
5823 p = NULL;
5825 sd = &per_cpu(node_domains, i);
5826 *sd = SD_NODE_INIT;
5827 sd->span = sched_domain_node_span(cpu_to_node(i));
5828 sd->parent = p;
5829 if (p)
5830 p->child = sd;
5831 cpus_and(sd->span, sd->span, *cpu_map);
5832 #endif
5834 p = sd;
5835 sd = &per_cpu(phys_domains, i);
5836 *sd = SD_CPU_INIT;
5837 sd->span = nodemask;
5838 sd->parent = p;
5839 if (p)
5840 p->child = sd;
5841 cpu_to_phys_group(i, cpu_map, &sd->groups);
5843 #ifdef CONFIG_SCHED_MC
5844 p = sd;
5845 sd = &per_cpu(core_domains, i);
5846 *sd = SD_MC_INIT;
5847 sd->span = cpu_coregroup_map(i);
5848 cpus_and(sd->span, sd->span, *cpu_map);
5849 sd->parent = p;
5850 p->child = sd;
5851 cpu_to_core_group(i, cpu_map, &sd->groups);
5852 #endif
5854 #ifdef CONFIG_SCHED_SMT
5855 p = sd;
5856 sd = &per_cpu(cpu_domains, i);
5857 *sd = SD_SIBLING_INIT;
5858 sd->span = cpu_sibling_map[i];
5859 cpus_and(sd->span, sd->span, *cpu_map);
5860 sd->parent = p;
5861 p->child = sd;
5862 cpu_to_cpu_group(i, cpu_map, &sd->groups);
5863 #endif
5866 #ifdef CONFIG_SCHED_SMT
5867 /* Set up CPU (sibling) groups */
5868 for_each_cpu_mask(i, *cpu_map) {
5869 cpumask_t this_sibling_map = cpu_sibling_map[i];
5870 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
5871 if (i != first_cpu(this_sibling_map))
5872 continue;
5874 init_sched_build_groups(this_sibling_map, cpu_map,
5875 &cpu_to_cpu_group);
5877 #endif
5879 #ifdef CONFIG_SCHED_MC
5880 /* Set up multi-core groups */
5881 for_each_cpu_mask(i, *cpu_map) {
5882 cpumask_t this_core_map = cpu_coregroup_map(i);
5883 cpus_and(this_core_map, this_core_map, *cpu_map);
5884 if (i != first_cpu(this_core_map))
5885 continue;
5886 init_sched_build_groups(this_core_map, cpu_map,
5887 &cpu_to_core_group);
5889 #endif
5891 /* Set up physical groups */
5892 for (i = 0; i < MAX_NUMNODES; i++) {
5893 cpumask_t nodemask = node_to_cpumask(i);
5895 cpus_and(nodemask, nodemask, *cpu_map);
5896 if (cpus_empty(nodemask))
5897 continue;
5899 init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
5902 #ifdef CONFIG_NUMA
5903 /* Set up node groups */
5904 if (sd_allnodes)
5905 init_sched_build_groups(*cpu_map, cpu_map,
5906 &cpu_to_allnodes_group);
5908 for (i = 0; i < MAX_NUMNODES; i++) {
5909 /* Set up node groups */
5910 struct sched_group *sg, *prev;
5911 cpumask_t nodemask = node_to_cpumask(i);
5912 cpumask_t domainspan;
5913 cpumask_t covered = CPU_MASK_NONE;
5914 int j;
5916 cpus_and(nodemask, nodemask, *cpu_map);
5917 if (cpus_empty(nodemask)) {
5918 sched_group_nodes[i] = NULL;
5919 continue;
5922 domainspan = sched_domain_node_span(i);
5923 cpus_and(domainspan, domainspan, *cpu_map);
5925 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
5926 if (!sg) {
5927 printk(KERN_WARNING "Can not alloc domain group for "
5928 "node %d\n", i);
5929 goto error;
5931 sched_group_nodes[i] = sg;
5932 for_each_cpu_mask(j, nodemask) {
5933 struct sched_domain *sd;
5935 sd = &per_cpu(node_domains, j);
5936 sd->groups = sg;
5938 sg->__cpu_power = 0;
5939 sg->cpumask = nodemask;
5940 sg->next = sg;
5941 cpus_or(covered, covered, nodemask);
5942 prev = sg;
5944 for (j = 0; j < MAX_NUMNODES; j++) {
5945 cpumask_t tmp, notcovered;
5946 int n = (i + j) % MAX_NUMNODES;
5948 cpus_complement(notcovered, covered);
5949 cpus_and(tmp, notcovered, *cpu_map);
5950 cpus_and(tmp, tmp, domainspan);
5951 if (cpus_empty(tmp))
5952 break;
5954 nodemask = node_to_cpumask(n);
5955 cpus_and(tmp, tmp, nodemask);
5956 if (cpus_empty(tmp))
5957 continue;
5959 sg = kmalloc_node(sizeof(struct sched_group),
5960 GFP_KERNEL, i);
5961 if (!sg) {
5962 printk(KERN_WARNING
5963 "Can not alloc domain group for node %d\n", j);
5964 goto error;
5966 sg->__cpu_power = 0;
5967 sg->cpumask = tmp;
5968 sg->next = prev->next;
5969 cpus_or(covered, covered, tmp);
5970 prev->next = sg;
5971 prev = sg;
5974 #endif
5976 /* Calculate CPU power for physical packages and nodes */
5977 #ifdef CONFIG_SCHED_SMT
5978 for_each_cpu_mask(i, *cpu_map) {
5979 struct sched_domain *sd = &per_cpu(cpu_domains, i);
5981 init_sched_groups_power(i, sd);
5983 #endif
5984 #ifdef CONFIG_SCHED_MC
5985 for_each_cpu_mask(i, *cpu_map) {
5986 struct sched_domain *sd = &per_cpu(core_domains, i);
5988 init_sched_groups_power(i, sd);
5990 #endif
5992 for_each_cpu_mask(i, *cpu_map) {
5993 struct sched_domain *sd = &per_cpu(phys_domains, i);
5995 init_sched_groups_power(i, sd);
5998 #ifdef CONFIG_NUMA
5999 for (i = 0; i < MAX_NUMNODES; i++)
6000 init_numa_sched_groups_power(sched_group_nodes[i]);
6002 if (sd_allnodes) {
6003 struct sched_group *sg;
6005 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg);
6006 init_numa_sched_groups_power(sg);
6008 #endif
6010 /* Attach the domains */
6011 for_each_cpu_mask(i, *cpu_map) {
6012 struct sched_domain *sd;
6013 #ifdef CONFIG_SCHED_SMT
6014 sd = &per_cpu(cpu_domains, i);
6015 #elif defined(CONFIG_SCHED_MC)
6016 sd = &per_cpu(core_domains, i);
6017 #else
6018 sd = &per_cpu(phys_domains, i);
6019 #endif
6020 cpu_attach_domain(sd, i);
6023 return 0;
6025 #ifdef CONFIG_NUMA
6026 error:
6027 free_sched_groups(cpu_map);
6028 return -ENOMEM;
6029 #endif
6032 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
6034 static int arch_init_sched_domains(const cpumask_t *cpu_map)
6036 cpumask_t cpu_default_map;
6037 int err;
6040 * Setup mask for cpus without special case scheduling requirements.
6041 * For now this just excludes isolated cpus, but could be used to
6042 * exclude other special cases in the future.
6044 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
6046 err = build_sched_domains(&cpu_default_map);
6048 return err;
6051 static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
6053 free_sched_groups(cpu_map);
6057 * Detach sched domains from a group of cpus specified in cpu_map
6058 * These cpus will now be attached to the NULL domain
6060 static void detach_destroy_domains(const cpumask_t *cpu_map)
6062 int i;
6064 for_each_cpu_mask(i, *cpu_map)
6065 cpu_attach_domain(NULL, i);
6066 synchronize_sched();
6067 arch_destroy_sched_domains(cpu_map);
6071 * Partition sched domains as specified by the cpumasks below.
6072 * This attaches all cpus from the cpumasks to the NULL domain,
6073 * waits for a RCU quiescent period, recalculates sched
6074 * domain information and then attaches them back to the
6075 * correct sched domains
6076 * Call with hotplug lock held
6078 int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
6080 cpumask_t change_map;
6081 int err = 0;
6083 cpus_and(*partition1, *partition1, cpu_online_map);
6084 cpus_and(*partition2, *partition2, cpu_online_map);
6085 cpus_or(change_map, *partition1, *partition2);
6087 /* Detach sched domains from all of the affected cpus */
6088 detach_destroy_domains(&change_map);
6089 if (!cpus_empty(*partition1))
6090 err = build_sched_domains(partition1);
6091 if (!err && !cpus_empty(*partition2))
6092 err = build_sched_domains(partition2);
6094 return err;
6097 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6098 int arch_reinit_sched_domains(void)
6100 int err;
6102 mutex_lock(&sched_hotcpu_mutex);
6103 detach_destroy_domains(&cpu_online_map);
6104 err = arch_init_sched_domains(&cpu_online_map);
6105 mutex_unlock(&sched_hotcpu_mutex);
6107 return err;
6110 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6112 int ret;
6114 if (buf[0] != '0' && buf[0] != '1')
6115 return -EINVAL;
6117 if (smt)
6118 sched_smt_power_savings = (buf[0] == '1');
6119 else
6120 sched_mc_power_savings = (buf[0] == '1');
6122 ret = arch_reinit_sched_domains();
6124 return ret ? ret : count;
/*
 * Create the power savings sysfs files below @cls.
 *
 * The SMT file is created only when CONFIG_SCHED_SMT is set and
 * smt_capable() is true; the MC file only when CONFIG_SCHED_MC is set,
 * no earlier error occurred and mc_capable() is true.
 *
 * Returns 0, or the first non-zero result of sysfs_create_file().
 */
int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
{
	int ret = 0;

#ifdef CONFIG_SCHED_SMT
	if (smt_capable()) {
		ret = sysfs_create_file(&cls->kset.kobj,
					&attr_sched_smt_power_savings.attr);
	}
#endif
#ifdef CONFIG_SCHED_MC
	if (!ret && mc_capable()) {
		ret = sysfs_create_file(&cls->kset.kobj,
					&attr_sched_mc_power_savings.attr);
	}
#endif
	return ret;
}
6143 #endif
6145 #ifdef CONFIG_SCHED_MC
6146 static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
6148 return sprintf(page, "%u\n", sched_mc_power_savings);
6150 static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
6151 const char *buf, size_t count)
6153 return sched_power_savings_store(buf, count, 0);
6155 SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
6156 sched_mc_power_savings_store);
6157 #endif
6159 #ifdef CONFIG_SCHED_SMT
6160 static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
6162 return sprintf(page, "%u\n", sched_smt_power_savings);
6164 static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
6165 const char *buf, size_t count)
6167 return sched_power_savings_store(buf, count, 1);
6169 SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
6170 sched_smt_power_savings_store);
6171 #endif
6174 * Force a reinitialization of the sched domains hierarchy. The domains
6175 * and groups cannot be updated in place without racing with the balancing
6176 * code, so we temporarily attach all running cpus to the NULL domain
6177 * which will prevent rebalancing while the sched domains are recalculated.
6179 static int update_sched_domains(struct notifier_block *nfb,
6180 unsigned long action, void *hcpu)
6182 switch (action) {
6183 case CPU_UP_PREPARE:
6184 case CPU_UP_PREPARE_FROZEN:
6185 case CPU_DOWN_PREPARE:
6186 case CPU_DOWN_PREPARE_FROZEN:
6187 detach_destroy_domains(&cpu_online_map);
6188 return NOTIFY_OK;
6190 case CPU_UP_CANCELED:
6191 case CPU_UP_CANCELED_FROZEN:
6192 case CPU_DOWN_FAILED:
6193 case CPU_DOWN_FAILED_FROZEN:
6194 case CPU_ONLINE:
6195 case CPU_ONLINE_FROZEN:
6196 case CPU_DEAD:
6197 case CPU_DEAD_FROZEN:
6199 * Fall through and re-initialise the domains.
6201 break;
6202 default:
6203 return NOTIFY_DONE;
6206 /* The hotplug lock is already held by cpu_up/cpu_down */
6207 arch_init_sched_domains(&cpu_online_map);
6209 return NOTIFY_OK;
6212 void __init sched_init_smp(void)
6214 cpumask_t non_isolated_cpus;
6216 mutex_lock(&sched_hotcpu_mutex);
6217 arch_init_sched_domains(&cpu_online_map);
6218 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
6219 if (cpus_empty(non_isolated_cpus))
6220 cpu_set(smp_processor_id(), non_isolated_cpus);
6221 mutex_unlock(&sched_hotcpu_mutex);
6222 /* XXX: Theoretical race here - CPU may be hotplugged now */
6223 hotcpu_notifier(update_sched_domains, 0);
6225 /* Move init over to a non-isolated CPU */
6226 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
6227 BUG();
6228 sched_init_granularity();
6230 #else
6231 void __init sched_init_smp(void)
6233 sched_init_granularity();
6235 #endif /* CONFIG_SMP */
/*
 * Report whether @addr lies inside scheduler code.
 *
 * Returns non-zero when in_lock_functions() accepts @addr or when
 * @addr falls in [__sched_text_start, __sched_text_end), 0 otherwise.
 */
int in_sched_functions(unsigned long addr)
{
	/* Linker adds these: start and end of __sched functions */
	extern char __sched_text_start[], __sched_text_end[];
	unsigned long text_lo = (unsigned long)__sched_text_start;
	unsigned long text_hi = (unsigned long)__sched_text_end;

	if (in_lock_functions(addr))
		return 1;

	return addr >= text_lo && addr < text_hi;
}
6247 static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
6249 cfs_rq->tasks_timeline = RB_ROOT;
6250 cfs_rq->fair_clock = 1;
6251 #ifdef CONFIG_FAIR_GROUP_SCHED
6252 cfs_rq->rq = rq;
6253 #endif
6256 void __init sched_init(void)
6258 u64 now = sched_clock();
6259 int highest_cpu = 0;
6260 int i, j;
6263 * Link up the scheduling class hierarchy:
6265 rt_sched_class.next = &fair_sched_class;
6266 fair_sched_class.next = &idle_sched_class;
6267 idle_sched_class.next = NULL;
6269 for_each_possible_cpu(i) {
6270 struct rt_prio_array *array;
6271 struct rq *rq;
6273 rq = cpu_rq(i);
6274 spin_lock_init(&rq->lock);
6275 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
6276 rq->nr_running = 0;
6277 rq->clock = 1;
6278 init_cfs_rq(&rq->cfs, rq);
6279 #ifdef CONFIG_FAIR_GROUP_SCHED
6280 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6281 list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
6282 #endif
6283 rq->ls.load_update_last = now;
6284 rq->ls.load_update_start = now;
6286 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6287 rq->cpu_load[j] = 0;
6288 #ifdef CONFIG_SMP
6289 rq->sd = NULL;
6290 rq->active_balance = 0;
6291 rq->next_balance = jiffies;
6292 rq->push_cpu = 0;
6293 rq->cpu = i;
6294 rq->migration_thread = NULL;
6295 INIT_LIST_HEAD(&rq->migration_queue);
6296 #endif
6297 atomic_set(&rq->nr_iowait, 0);
6299 array = &rq->rt.active;
6300 for (j = 0; j < MAX_RT_PRIO; j++) {
6301 INIT_LIST_HEAD(array->queue + j);
6302 __clear_bit(j, array->bitmap);
6304 highest_cpu = i;
6305 /* delimiter for bitsearch: */
6306 __set_bit(MAX_RT_PRIO, array->bitmap);
6309 set_load_weight(&init_task);
6311 #ifdef CONFIG_SMP
6312 nr_cpu_ids = highest_cpu + 1;
6313 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
6314 #endif
6316 #ifdef CONFIG_RT_MUTEXES
6317 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
6318 #endif
6321 * The boot idle thread does lazy MMU switching as well:
6323 atomic_inc(&init_mm.mm_count);
6324 enter_lazy_tlb(&init_mm, current);
6327 * Make us the idle thread. Technically, schedule() should not be
6328 * called from this thread, however somewhere below it might be,
6329 * but because we are the idle thread, we just pick up running again
6330 * when this runqueue becomes "idle".
6332 init_idle(current, smp_processor_id());
6334 * During early bootup we pretend to be a normal task:
6336 current->sched_class = &fair_sched_class;
6339 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
/*
 * Debug check behind might_sleep(): complain when a function that may
 * sleep is called from atomic context or with interrupts disabled.
 *
 * @file: source file of the call site, printed in the report
 * @line: source line of the call site, printed in the report
 *
 * The check only fires while system_state is SYSTEM_RUNNING and no oops
 * is in progress, and reports are rate limited to one per HZ jiffies.
 * The whole body compiles away when in_atomic is not defined.
 */
void __might_sleep(char *file, int line)
{
#ifdef in_atomic
	static unsigned long prev_jiffy;	/* ratelimiting */

	if ((in_atomic() || irqs_disabled()) &&
	    system_state == SYSTEM_RUNNING && !oops_in_progress) {
		/* prev_jiffy == 0 means nothing was reported yet. */
		if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
			return;
		prev_jiffy = jiffies;
		printk(KERN_ERR "BUG: sleeping function called from invalid"
				" context at %s:%d\n", file, line);
		/*
		 * Same level as the headline, so both lines of the report
		 * pass or fail loglevel filtering together.
		 */
		printk(KERN_ERR "in_atomic():%d, irqs_disabled():%d\n",
			in_atomic(), irqs_disabled());
		debug_show_held_locks(current);
		if (irqs_disabled())
			print_irqtrace_events(current);
		dump_stack();
	}
#endif
}
EXPORT_SYMBOL(__might_sleep);
6362 #endif
6364 #ifdef CONFIG_MAGIC_SYSRQ
6365 void normalize_rt_tasks(void)
6367 struct task_struct *g, *p;
6368 unsigned long flags;
6369 struct rq *rq;
6370 int on_rq;
6372 read_lock_irq(&tasklist_lock);
6373 do_each_thread(g, p) {
6374 p->se.fair_key = 0;
6375 p->se.wait_runtime = 0;
6376 p->se.wait_start_fair = 0;
6377 p->se.wait_start = 0;
6378 p->se.exec_start = 0;
6379 p->se.sleep_start = 0;
6380 p->se.sleep_start_fair = 0;
6381 p->se.block_start = 0;
6382 task_rq(p)->cfs.fair_clock = 0;
6383 task_rq(p)->clock = 0;
6385 if (!rt_task(p)) {
6387 * Renice negative nice level userspace
6388 * tasks back to 0:
6390 if (TASK_NICE(p) < 0 && p->mm)
6391 set_user_nice(p, 0);
6392 continue;
6395 spin_lock_irqsave(&p->pi_lock, flags);
6396 rq = __task_rq_lock(p);
6397 #ifdef CONFIG_SMP
6399 * Do not touch the migration thread:
6401 if (p == rq->migration_thread)
6402 goto out_unlock;
6403 #endif
6405 on_rq = p->se.on_rq;
6406 if (on_rq)
6407 deactivate_task(task_rq(p), p, 0);
6408 __setscheduler(rq, p, SCHED_NORMAL, 0);
6409 if (on_rq) {
6410 activate_task(task_rq(p), p, 0);
6411 resched_task(rq->curr);
6413 #ifdef CONFIG_SMP
6414 out_unlock:
6415 #endif
6416 __task_rq_unlock(rq);
6417 spin_unlock_irqrestore(&p->pi_lock, flags);
6418 } while_each_thread(g, p);
6420 read_unlock_irq(&tasklist_lock);
6423 #endif /* CONFIG_MAGIC_SYSRQ */
6425 #ifdef CONFIG_IA64
6427 * These functions are only useful for the IA64 MCA handling.
6429 * They can only be called when the whole system has been
6430 * stopped - every CPU needs to be quiescent, and no scheduling
6431 * activity can take place. Using them for anything else would
6432 * be a serious bug, and as a result, they aren't even visible
6433 * under any other configuration.
/**
 * curr_task - return the current task for a given cpu.
 * @cpu: the processor in question.
 *
 * Reads cpu_curr() for @cpu.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 */
struct task_struct *curr_task(int cpu)
{
	struct task_struct *running = cpu_curr(cpu);

	return running;
}
6448 * set_curr_task - set the current task for a given cpu.
6449 * @cpu: the processor in question.
6450 * @p: the task pointer to set.
6452 * Description: This function must only be used when non-maskable interrupts
6453 * are serviced on a separate stack. It allows the architecture to switch the
6454 * notion of the current task on a cpu in a non-blocking manner. This function
6455 * must be called with all CPU's synchronized, and interrupts disabled, the
6456 * and caller must save the original value of the current task (see
6457 * curr_task() above) and restore that value before reenabling interrupts and
6458 * re-starting the system.
6460 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6462 void set_curr_task(int cpu, struct task_struct *p)
6464 cpu_curr(cpu) = p;
6467 #endif