sched: remove interactivity types
[usb.git] / kernel / sched.c
blob 886531c681c1c353f82d9f11ec887881148cfc3f
1 /*
2 * kernel/sched.c
4 * Kernel scheduler and related syscalls
6 * Copyright (C) 1991-2002 Linus Torvalds
8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
9 * make semaphores SMP safe
10 * 1998-11-19 Implemented schedule_timeout() and related stuff
11 * by Andrea Arcangeli
12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
13 * hybrid priority-list and round-robin design with
14 * an array-switch method of distributing timeslices
15 * and per-CPU runqueues. Cleanups and useful suggestions
16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin
21 #include <linux/mm.h>
22 #include <linux/module.h>
23 #include <linux/nmi.h>
24 #include <linux/init.h>
25 #include <linux/uaccess.h>
26 #include <linux/highmem.h>
27 #include <linux/smp_lock.h>
28 #include <asm/mmu_context.h>
29 #include <linux/interrupt.h>
30 #include <linux/capability.h>
31 #include <linux/completion.h>
32 #include <linux/kernel_stat.h>
33 #include <linux/debug_locks.h>
34 #include <linux/security.h>
35 #include <linux/notifier.h>
36 #include <linux/profile.h>
37 #include <linux/freezer.h>
38 #include <linux/vmalloc.h>
39 #include <linux/blkdev.h>
40 #include <linux/delay.h>
41 #include <linux/smp.h>
42 #include <linux/threads.h>
43 #include <linux/timer.h>
44 #include <linux/rcupdate.h>
45 #include <linux/cpu.h>
46 #include <linux/cpuset.h>
47 #include <linux/percpu.h>
48 #include <linux/kthread.h>
49 #include <linux/seq_file.h>
50 #include <linux/syscalls.h>
51 #include <linux/times.h>
52 #include <linux/tsacct_kern.h>
53 #include <linux/kprobes.h>
54 #include <linux/delayacct.h>
55 #include <linux/reciprocal_div.h>
56 #include <linux/unistd.h>
58 #include <asm/tlb.h>
61 * Scheduler clock - returns current time in nanosec units.
62 * This is the default implementation.
63 * Architectures and sub-architectures can override this.
65 unsigned long long __attribute__((weak)) sched_clock(void)
67 return (unsigned long long)jiffies * (1000000000 / HZ);
71 * Convert user-nice values [ -20 ... 0 ... 19 ]
72 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
73 * and back.
75 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
76 #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
77 #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
80 * 'User priority' is the nice value converted to something we
81 * can work with better when scaling various scheduler parameters,
82 * it's a [ 0 ... 39 ] range.
84 #define USER_PRIO(p) ((p)-MAX_RT_PRIO)
85 #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
86 #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
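/*
 * Worked example (illustrative; assumes the usual MAX_RT_PRIO == 100 and
 * MAX_PRIO == 140 of this kernel generation):
 *
 *	NICE_TO_PRIO(-20) == 100 + (-20) + 20 == 100	(highest non-RT prio)
 *	NICE_TO_PRIO(0)   == 100 +   0  + 20 == 120	(default)
 *	NICE_TO_PRIO(19)  == 100 +  19  + 20 == 139	== MAX_PRIO - 1
 *	PRIO_TO_NICE(120) == 120 - 100 - 20  == 0
 *
 * USER_PRIO(120) == 20, so TASK_USER_PRIO() of a nice-0 task is 20 and
 * MAX_USER_PRIO == USER_PRIO(140) == 40.
 */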
89 * Some helpers for converting nanosecond timing to jiffy resolution
91 #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
92 #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
94 #define NICE_0_LOAD SCHED_LOAD_SCALE
95 #define NICE_0_SHIFT SCHED_LOAD_SHIFT
98 * These are the 'tuning knobs' of the scheduler:
100 * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
101 * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
102 * Timeslices get refilled after they expire.
104 #define MIN_TIMESLICE max(5 * HZ / 1000, 1)
105 #define DEF_TIMESLICE (100 * HZ / 1000)
107 #ifdef CONFIG_SMP
109 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
110 * Since cpu_power is a 'constant', we can use a reciprocal divide.
112 static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
114 return reciprocal_divide(load, sg->reciprocal_cpu_power);
118 * Each time a sched group cpu_power is changed,
119 * we must compute its reciprocal value
121 static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
123 sg->__cpu_power += val;
124 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
126 #endif
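/*
 * Minimal sketch of the reciprocal-divide trick used above (illustrative;
 * the real helpers live in <linux/reciprocal_div.h> and may differ in
 * detail).  For a divisor d that changes rarely, precompute R ~= 2^32 / d
 * once (roughly (2^32 + d - 1) / d), then every division becomes a
 * multiply plus a shift:
 *
 *	u32 R = reciprocal_value(d);
 *	q = reciprocal_divide(load, R);		-> (u32)(((u64)load * R) >> 32)
 *
 * This is why sg_inc_cpu_power() must refresh sg->reciprocal_cpu_power
 * whenever sg->__cpu_power changes: the cached R is only valid for one d.
 */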
128 #define SCALE_PRIO(x, prio) \
129 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
132 * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
133 * to time slice values: [800ms ... 100ms ... 5ms]
135 static unsigned int static_prio_timeslice(int static_prio)
137 if (static_prio == NICE_TO_PRIO(19))
138 return 1;
140 if (static_prio < NICE_TO_PRIO(0))
141 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
142 else
143 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
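/*
 * Worked example (illustrative; assumes HZ == 1000, MAX_PRIO == 140 and
 * MAX_USER_PRIO == 40, so DEF_TIMESLICE == 100 jiffies and
 * MIN_TIMESLICE == 5):
 *
 *	nice -20 (prio 100): SCALE_PRIO(400, 100) = max(400*40/20, 5) = 800 ticks
 *	nice   0 (prio 120): SCALE_PRIO(100, 120) = max(100*20/20, 5) = 100 ticks
 *	nice  18 (prio 138): SCALE_PRIO(100, 138) = max(100* 2/20, 5) =  10 ticks
 *	nice  19 (prio 139): special-cased above to a single tick.
 */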
146 static inline int rt_policy(int policy)
148 if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
149 return 1;
150 return 0;
153 static inline int task_has_rt_policy(struct task_struct *p)
155 return rt_policy(p->policy);
159 * This is the priority-queue data structure of the RT scheduling class:
161 struct rt_prio_array {
162 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
163 struct list_head queue[MAX_RT_PRIO];
166 struct load_stat {
167 struct load_weight load;
168 u64 load_update_start, load_update_last;
169 unsigned long delta_fair, delta_exec, delta_stat;
172 /* CFS-related fields in a runqueue */
173 struct cfs_rq {
174 struct load_weight load;
175 unsigned long nr_running;
177 s64 fair_clock;
178 u64 exec_clock;
179 s64 wait_runtime;
180 u64 sleeper_bonus;
181 unsigned long wait_runtime_overruns, wait_runtime_underruns;
183 struct rb_root tasks_timeline;
184 struct rb_node *rb_leftmost;
185 struct rb_node *rb_load_balance_curr;
186 #ifdef CONFIG_FAIR_GROUP_SCHED
187 /* 'curr' points to currently running entity on this cfs_rq.
188 * It is set to NULL otherwise (i.e when none are currently running).
190 struct sched_entity *curr;
191 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
193 /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
194 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
195 * (like users, containers etc.)
197 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
198 * list is used during load balance.
200 struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
201 #endif
204 /* Real-Time classes' related field in a runqueue: */
205 struct rt_rq {
206 struct rt_prio_array active;
207 int rt_load_balance_idx;
208 struct list_head *rt_load_balance_head, *rt_load_balance_curr;
212 * The prio-array type of the old scheduler:
214 struct prio_array {
215 unsigned int nr_active;
216 DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
217 struct list_head queue[MAX_PRIO];
221 * This is the main, per-CPU runqueue data structure.
223 * Locking rule: code paths that lock multiple runqueues (such as the
224 * load balancing or thread migration code) must acquire the locks in
225 * ascending &runqueue order.
227 struct rq {
228 spinlock_t lock; /* runqueue lock */
231 * nr_running and cpu_load should be in the same cacheline because
232 * remote CPUs use both these fields when doing load calculation.
234 unsigned long nr_running;
235 unsigned long raw_weighted_load;
236 #define CPU_LOAD_IDX_MAX 5
237 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
238 unsigned char idle_at_tick;
239 #ifdef CONFIG_NO_HZ
240 unsigned char in_nohz_recently;
241 #endif
242 struct load_stat ls; /* capture load from *all* tasks on this cpu */
243 unsigned long nr_load_updates;
244 u64 nr_switches;
246 struct cfs_rq cfs;
247 #ifdef CONFIG_FAIR_GROUP_SCHED
248 struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */
249 #endif
250 struct rt_rq rt;
253 * This is part of a global counter where only the total sum
254 * over all CPUs matters. A task can increase this counter on
255 * one CPU and if it got migrated afterwards it may decrease
256 * it on another CPU. Always updated under the runqueue lock:
258 unsigned long nr_uninterruptible;
260 unsigned long expired_timestamp;
261 unsigned long long most_recent_timestamp;
263 struct task_struct *curr, *idle;
264 unsigned long next_balance;
265 struct mm_struct *prev_mm;
267 struct prio_array *active, *expired, arrays[2];
268 int best_expired_prio;
270 u64 clock, prev_clock_raw;
271 s64 clock_max_delta;
273 unsigned int clock_warps, clock_overflows;
274 unsigned int clock_unstable_events;
276 struct sched_class *load_balance_class;
278 atomic_t nr_iowait;
280 #ifdef CONFIG_SMP
281 struct sched_domain *sd;
283 /* For active balancing */
284 int active_balance;
285 int push_cpu;
286 int cpu; /* cpu of this runqueue */
288 struct task_struct *migration_thread;
289 struct list_head migration_queue;
290 #endif
292 #ifdef CONFIG_SCHEDSTATS
293 /* latency stats */
294 struct sched_info rq_sched_info;
296 /* sys_sched_yield() stats */
297 unsigned long yld_exp_empty;
298 unsigned long yld_act_empty;
299 unsigned long yld_both_empty;
300 unsigned long yld_cnt;
302 /* schedule() stats */
303 unsigned long sched_switch;
304 unsigned long sched_cnt;
305 unsigned long sched_goidle;
307 /* try_to_wake_up() stats */
308 unsigned long ttwu_cnt;
309 unsigned long ttwu_local;
310 #endif
311 struct lock_class_key rq_lock_key;
314 static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
315 static DEFINE_MUTEX(sched_hotcpu_mutex);
317 static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
319 rq->curr->sched_class->check_preempt_curr(rq, p);
322 static inline int cpu_of(struct rq *rq)
324 #ifdef CONFIG_SMP
325 return rq->cpu;
326 #else
327 return 0;
328 #endif
332 * Per-runqueue clock, as fine-grained as the platform can give us:
334 static unsigned long long __rq_clock(struct rq *rq)
336 u64 prev_raw = rq->prev_clock_raw;
337 u64 now = sched_clock();
338 s64 delta = now - prev_raw;
339 u64 clock = rq->clock;
342 * Protect against sched_clock() occasionally going backwards:
344 if (unlikely(delta < 0)) {
345 clock++;
346 rq->clock_warps++;
347 } else {
349 * Catch too large forward jumps too:
351 if (unlikely(delta > 2*TICK_NSEC)) {
352 clock++;
353 rq->clock_overflows++;
354 } else {
355 if (unlikely(delta > rq->clock_max_delta))
356 rq->clock_max_delta = delta;
357 clock += delta;
361 rq->prev_clock_raw = now;
362 rq->clock = clock;
364 return clock;
367 static inline unsigned long long rq_clock(struct rq *rq)
369 int this_cpu = smp_processor_id();
371 if (this_cpu == cpu_of(rq))
372 return __rq_clock(rq);
374 return rq->clock;
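/*
 * Illustrative behaviour of the clamping in __rq_clock() (with, say,
 * TICK_NSEC == 1,000,000 ns at HZ == 1000):
 *
 *	sched_clock() delta		effect on rq->clock
 *	-----------------------		-----------------------------------
 *	-50 ns (went backwards)		clock += 1, clock_warps++
 *	+300,000 ns (normal)		clock += 300,000, clock_max_delta kept
 *	+5,000,000 ns (> 2 ticks)	clock += 1, clock_overflows++
 *
 * The result is a per-runqueue clock that is monotonic and never jumps
 * forward by more than ~2 ticks at a time, whatever sched_clock() does.
 */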
378 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
379 * See detach_destroy_domains: synchronize_sched for details.
381 * The domain tree of any CPU may only be accessed from within
382 * preempt-disabled sections.
384 #define for_each_domain(cpu, __sd) \
385 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
387 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
388 #define this_rq() (&__get_cpu_var(runqueues))
389 #define task_rq(p) cpu_rq(task_cpu(p))
390 #define cpu_curr(cpu) (cpu_rq(cpu)->curr)
392 #ifdef CONFIG_FAIR_GROUP_SCHED
393 /* Change a task's ->cfs_rq if it moves across CPUs */
394 static inline void set_task_cfs_rq(struct task_struct *p)
396 p->se.cfs_rq = &task_rq(p)->cfs;
398 #else
399 static inline void set_task_cfs_rq(struct task_struct *p)
402 #endif
404 #ifndef prepare_arch_switch
405 # define prepare_arch_switch(next) do { } while (0)
406 #endif
407 #ifndef finish_arch_switch
408 # define finish_arch_switch(prev) do { } while (0)
409 #endif
411 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
412 static inline int task_running(struct rq *rq, struct task_struct *p)
414 return rq->curr == p;
417 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
421 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
423 #ifdef CONFIG_DEBUG_SPINLOCK
424 /* this is a valid case when another task releases the spinlock */
425 rq->lock.owner = current;
426 #endif
428 * If we are tracking spinlock dependencies then we have to
429 * fix up the runqueue lock - which gets 'carried over' from
430 * prev into current:
432 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
434 spin_unlock_irq(&rq->lock);
437 #else /* __ARCH_WANT_UNLOCKED_CTXSW */
438 static inline int task_running(struct rq *rq, struct task_struct *p)
440 #ifdef CONFIG_SMP
441 return p->oncpu;
442 #else
443 return rq->curr == p;
444 #endif
447 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
449 #ifdef CONFIG_SMP
451 * We can optimise this out completely for !SMP, because the
452 * SMP rebalancing from interrupt is the only thing that cares
453 * here.
455 next->oncpu = 1;
456 #endif
457 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
458 spin_unlock_irq(&rq->lock);
459 #else
460 spin_unlock(&rq->lock);
461 #endif
464 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
466 #ifdef CONFIG_SMP
468 * After ->oncpu is cleared, the task can be moved to a different CPU.
469 * We must ensure this doesn't happen until the switch is completely
470 * finished.
472 smp_wmb();
473 prev->oncpu = 0;
474 #endif
475 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
476 local_irq_enable();
477 #endif
479 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
482 * __task_rq_lock - lock the runqueue a given task resides on.
483 * Must be called with interrupts disabled.
485 static inline struct rq *__task_rq_lock(struct task_struct *p)
486 __acquires(rq->lock)
488 struct rq *rq;
490 repeat_lock_task:
491 rq = task_rq(p);
492 spin_lock(&rq->lock);
493 if (unlikely(rq != task_rq(p))) {
494 spin_unlock(&rq->lock);
495 goto repeat_lock_task;
497 return rq;
501 * task_rq_lock - lock the runqueue a given task resides on and disable
502 * interrupts. Note the ordering: we can safely lookup the task_rq without
503 * explicitly disabling preemption.
505 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
506 __acquires(rq->lock)
508 struct rq *rq;
510 repeat_lock_task:
511 local_irq_save(*flags);
512 rq = task_rq(p);
513 spin_lock(&rq->lock);
514 if (unlikely(rq != task_rq(p))) {
515 spin_unlock_irqrestore(&rq->lock, *flags);
516 goto repeat_lock_task;
518 return rq;
521 static inline void __task_rq_unlock(struct rq *rq)
522 __releases(rq->lock)
524 spin_unlock(&rq->lock);
527 static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
528 __releases(rq->lock)
530 spin_unlock_irqrestore(&rq->lock, *flags);
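/*
 * Typical usage sketch for the lock helpers above (illustrative only):
 *
 *	unsigned long flags;
 *	struct rq *rq;
 *
 *	rq = task_rq_lock(p, &flags);
 *	... operate on p, knowing it cannot change runqueues ...
 *	task_rq_unlock(rq, &flags);
 *
 * The retry loop inside task_rq_lock() is needed because p may migrate
 * between reading task_rq(p) and acquiring that runqueue's lock; once the
 * lock is held, the re-check guarantees rq really is p's runqueue.
 */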
534 * this_rq_lock - lock this runqueue and disable interrupts.
536 static inline struct rq *this_rq_lock(void)
537 __acquires(rq->lock)
539 struct rq *rq;
541 local_irq_disable();
542 rq = this_rq();
543 spin_lock(&rq->lock);
545 return rq;
549 * CPU frequency is/was unstable - start anew by setting prev_clock_raw:
551 void sched_clock_unstable_event(void)
553 unsigned long flags;
554 struct rq *rq;
556 rq = task_rq_lock(current, &flags);
557 rq->prev_clock_raw = sched_clock();
558 rq->clock_unstable_events++;
559 task_rq_unlock(rq, &flags);
563 * resched_task - mark a task 'to be rescheduled now'.
565 * On UP this means the setting of the need_resched flag, on SMP it
566 * might also involve a cross-CPU call to trigger the scheduler on
567 * the target CPU.
569 #ifdef CONFIG_SMP
571 #ifndef tsk_is_polling
572 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
573 #endif
575 static void resched_task(struct task_struct *p)
577 int cpu;
579 assert_spin_locked(&task_rq(p)->lock);
581 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
582 return;
584 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
586 cpu = task_cpu(p);
587 if (cpu == smp_processor_id())
588 return;
590 /* NEED_RESCHED must be visible before we test polling */
591 smp_mb();
592 if (!tsk_is_polling(p))
593 smp_send_reschedule(cpu);
596 static void resched_cpu(int cpu)
598 struct rq *rq = cpu_rq(cpu);
599 unsigned long flags;
601 if (!spin_trylock_irqsave(&rq->lock, flags))
602 return;
603 resched_task(cpu_curr(cpu));
604 spin_unlock_irqrestore(&rq->lock, flags);
606 #else
607 static inline void resched_task(struct task_struct *p)
609 assert_spin_locked(&task_rq(p)->lock);
610 set_tsk_need_resched(p);
612 #endif
614 static u64 div64_likely32(u64 dividend, unsigned long divisor)
616 #if BITS_PER_LONG == 32
617 if (likely(dividend <= 0xffffffffULL))
618 return (u32)dividend / divisor;
619 do_div(dividend, divisor);
621 return dividend;
622 #else
623 return dividend / divisor;
624 #endif
627 #if BITS_PER_LONG == 32
628 # define WMULT_CONST (~0UL)
629 #else
630 # define WMULT_CONST (1UL << 32)
631 #endif
633 #define WMULT_SHIFT 32
635 static inline unsigned long
636 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
637 struct load_weight *lw)
639 u64 tmp;
641 if (unlikely(!lw->inv_weight))
642 lw->inv_weight = WMULT_CONST / lw->weight;
644 tmp = (u64)delta_exec * weight;
646 * Check whether we'd overflow the 64-bit multiplication:
648 if (unlikely(tmp > WMULT_CONST)) {
649 tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight)
650 >> (WMULT_SHIFT/2);
651 } else {
652 tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;
655 return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit);
658 static inline unsigned long
659 calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
661 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
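/*
 * Worked example (illustrative): calc_delta_fair() scales wall-clock time
 * by NICE_0_LOAD / lw->weight, i.e. roughly delta_exec * 1024 / weight:
 *
 *	queue weight 1024 (one nice-0 task):	10 ms of execution -> ~10 ms fair
 *	queue weight 2048:			10 ms of execution -> ~ 5 ms fair
 *	queue weight  512:			10 ms of execution -> ~20 ms fair
 *
 * lw->inv_weight caches 2^32 / weight so that the division becomes a
 * 64-bit multiply and shift; the result is clamped to
 * sysctl_sched_runtime_limit.
 */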
664 static void update_load_add(struct load_weight *lw, unsigned long inc)
666 lw->weight += inc;
667 lw->inv_weight = 0;
670 static void update_load_sub(struct load_weight *lw, unsigned long dec)
672 lw->weight -= dec;
673 lw->inv_weight = 0;
676 static void __update_curr_load(struct rq *rq, struct load_stat *ls)
678 if (rq->curr != rq->idle && ls->load.weight) {
679 ls->delta_exec += ls->delta_stat;
680 ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
681 ls->delta_stat = 0;
686 * Update delta_exec, delta_fair fields for rq.
688 * delta_fair clock advances at a rate inversely proportional to
689 * total load (rq->ls.load.weight) on the runqueue, while
690 * delta_exec advances at the same rate as wall-clock (provided
691 * cpu is not idle).
693 * delta_exec / delta_fair is a measure of the (smoothed) load on this
694 * runqueue over any given interval. This (smoothened) load is used
695 * during load balance.
697 * This function is called /before/ updating rq->ls.load
698 * and when switching tasks.
700 static void update_curr_load(struct rq *rq, u64 now)
702 struct load_stat *ls = &rq->ls;
703 u64 start;
705 start = ls->load_update_start;
706 ls->load_update_start = now;
707 ls->delta_stat += now - start;
709 * Stagger updates to ls->delta_fair. Very frequent updates
710 * can be expensive.
712 if (ls->delta_stat >= sysctl_sched_stat_granularity)
713 __update_curr_load(rq, ls);
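/*
 * Worked example (illustrative): over a 10 ms busy interval on a runqueue
 * whose total weight is twice NICE_0_LOAD, delta_exec grows by ~10 ms
 * while delta_fair grows by only ~5 ms (see calc_delta_fair() above), so
 * the delta_exec / delta_fair ratio comes out at ~2 - the smoothed load
 * factor the load balancer is after.
 */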
717 * To aid in avoiding the subversion of "niceness" due to uneven distribution
718 * of tasks with abnormal "nice" values across CPUs, the contribution that
719 * each task makes to its run queue's load is weighted according to its
720 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
721 * scaled version of the new time slice allocation that they receive on time
722 * slice expiry etc.
726 * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
727 * If static_prio_timeslice() is ever changed to break this assumption then
728 * this code will need modification
730 #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
731 #define load_weight(lp) \
732 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
733 #define PRIO_TO_LOAD_WEIGHT(prio) \
734 load_weight(static_prio_timeslice(prio))
735 #define RTPRIO_TO_LOAD_WEIGHT(rp) \
736 (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + load_weight(rp))
738 #define WEIGHT_IDLEPRIO 2
739 #define WMULT_IDLEPRIO (1 << 31)
742 * Nice levels are multiplicative, with a gentle 10% change for every
743 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
744 * nice 1, it will get ~10% less CPU time than another CPU-bound task
745 * that remained on nice 0.
747 * The "10% effect" is relative and cumulative: from _any_ nice level,
748 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
749 * it's +10% CPU usage.
751 static const int prio_to_weight[40] = {
752 /* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921,
753 /* -10 */ 9537, 7629, 6103, 4883, 3906, 3125, 2500, 2000, 1600, 1280,
754 /* 0 */ NICE_0_LOAD /* 1024 */,
755 /* 1 */ 819, 655, 524, 419, 336, 268, 215, 172, 137,
756 /* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15,
759 static const u32 prio_to_wmult[40] = {
760 48356, 60446, 75558, 94446, 118058, 147573,
761 184467, 230589, 288233, 360285, 450347,
762 562979, 703746, 879575, 1099582, 1374389,
763 1717986, 2147483, 2684354, 3355443, 4194304,
764 5244160, 6557201, 8196502, 10250518, 12782640,
765 16025997, 19976592, 24970740, 31350126, 39045157,
766 49367440, 61356675, 76695844, 95443717, 119304647,
767 148102320, 186737708, 238609294, 286331153,
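/*
 * Worked example for the "10% effect" (illustrative): two CPU-bound tasks
 * sharing one CPU, one at nice 0 (weight 1024), one at nice 1 (weight 819):
 *
 *	nice 0 share: 1024 / (1024 + 819) ~= 55.6%
 *	nice 1 share:  819 / (1024 + 819) ~= 44.4%
 *
 * i.e. roughly a 10% swing in CPU time per nice level; and because
 * adjacent weights differ by a constant factor (~1.25) the effect is the
 * same from any starting nice level.  prio_to_wmult[] simply caches
 * 2^32 / weight for the multiply-and-shift in calc_delta_mine().
 */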
770 static inline void
771 inc_load(struct rq *rq, const struct task_struct *p, u64 now)
773 update_curr_load(rq, now);
774 update_load_add(&rq->ls.load, p->se.load.weight);
777 static inline void
778 dec_load(struct rq *rq, const struct task_struct *p, u64 now)
780 update_curr_load(rq, now);
781 update_load_sub(&rq->ls.load, p->se.load.weight);
784 static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
786 rq->nr_running++;
787 inc_load(rq, p, now);
790 static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
792 rq->nr_running--;
793 dec_load(rq, p, now);
796 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
799 * runqueue iterator, to support SMP load-balancing between different
800 * scheduling classes, without having to expose their internal data
801 * structures to the load-balancing proper:
803 struct rq_iterator {
804 void *arg;
805 struct task_struct *(*start)(void *);
806 struct task_struct *(*next)(void *);
809 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
810 unsigned long max_nr_move, unsigned long max_load_move,
811 struct sched_domain *sd, enum cpu_idle_type idle,
812 int *all_pinned, unsigned long *load_moved,
813 int this_best_prio, int best_prio, int best_prio_seen,
814 struct rq_iterator *iterator);
816 #include "sched_stats.h"
817 #include "sched_rt.c"
818 #include "sched_fair.c"
819 #include "sched_idletask.c"
820 #ifdef CONFIG_SCHED_DEBUG
821 # include "sched_debug.c"
822 #endif
824 #define sched_class_highest (&rt_sched_class)
826 static void set_load_weight(struct task_struct *p)
828 task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
829 p->se.wait_runtime = 0;
831 if (task_has_rt_policy(p)) {
832 p->se.load.weight = prio_to_weight[0] * 2;
833 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
834 return;
838 * SCHED_IDLE tasks get minimal weight:
840 if (p->policy == SCHED_IDLE) {
841 p->se.load.weight = WEIGHT_IDLEPRIO;
842 p->se.load.inv_weight = WMULT_IDLEPRIO;
843 return;
846 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
847 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
850 static void
851 enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
853 sched_info_queued(p);
854 p->sched_class->enqueue_task(rq, p, wakeup, now);
855 p->se.on_rq = 1;
858 static void
859 dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now)
861 p->sched_class->dequeue_task(rq, p, sleep, now);
862 p->se.on_rq = 0;
866 * __normal_prio - return the priority that is based on the static prio
868 static inline int __normal_prio(struct task_struct *p)
870 return p->static_prio;
874 * Calculate the expected normal priority: i.e. priority
875 * without taking RT-inheritance into account. Might be
876 * boosted by interactivity modifiers. Changes upon fork,
877 * setprio syscalls, and whenever the interactivity
878 * estimator recalculates.
880 static inline int normal_prio(struct task_struct *p)
882 int prio;
884 if (task_has_rt_policy(p))
885 prio = MAX_RT_PRIO-1 - p->rt_priority;
886 else
887 prio = __normal_prio(p);
888 return prio;
892 * Calculate the current priority, i.e. the priority
893 * taken into account by the scheduler. This value might
894 * be boosted by RT tasks, or might be boosted by
895 * interactivity modifiers. Will be RT if the task got
896 * RT-boosted. If not then it returns p->normal_prio.
898 static int effective_prio(struct task_struct *p)
900 p->normal_prio = normal_prio(p);
902 * If we are RT tasks or we were boosted to RT priority,
903 * keep the priority unchanged. Otherwise, update priority
904 * to the normal priority:
906 if (!rt_prio(p->prio))
907 return p->normal_prio;
908 return p->prio;
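/*
 * Worked example (illustrative; assumes MAX_RT_PRIO == 100):
 *
 *	SCHED_NORMAL, nice 0:		normal_prio == static_prio == 120
 *	SCHED_FIFO, rt_priority 50:	normal_prio == 99 - 50 == 49
 *	SCHED_NORMAL task PI-boosted to prio 30: effective_prio() returns
 *	  the boosted 30 unchanged (rt_prio(30) is true) while recording
 *	  p->normal_prio == 120 for when the boost is dropped.
 */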
912 * activate_task - move a task to the runqueue.
914 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
916 u64 now = rq_clock(rq);
918 if (p->state == TASK_UNINTERRUPTIBLE)
919 rq->nr_uninterruptible--;
921 enqueue_task(rq, p, wakeup, now);
922 inc_nr_running(p, rq, now);
926 * activate_idle_task - move idle task to the _front_ of runqueue.
928 static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
930 u64 now = rq_clock(rq);
932 if (p->state == TASK_UNINTERRUPTIBLE)
933 rq->nr_uninterruptible--;
935 enqueue_task(rq, p, 0, now);
936 inc_nr_running(p, rq, now);
940 * deactivate_task - remove a task from the runqueue.
942 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
944 u64 now = rq_clock(rq);
946 if (p->state == TASK_UNINTERRUPTIBLE)
947 rq->nr_uninterruptible++;
949 dequeue_task(rq, p, sleep, now);
950 dec_nr_running(p, rq, now);
954 * task_curr - is this task currently executing on a CPU?
955 * @p: the task in question.
957 inline int task_curr(const struct task_struct *p)
959 return cpu_curr(task_cpu(p)) == p;
962 /* Used instead of source_load when we know the type == 0 */
963 unsigned long weighted_cpuload(const int cpu)
965 return cpu_rq(cpu)->ls.load.weight;
968 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
970 #ifdef CONFIG_SMP
971 task_thread_info(p)->cpu = cpu;
972 set_task_cfs_rq(p);
973 #endif
976 #ifdef CONFIG_SMP
978 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
980 int old_cpu = task_cpu(p);
981 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
982 u64 clock_offset, fair_clock_offset;
984 clock_offset = old_rq->clock - new_rq->clock;
985 fair_clock_offset = old_rq->cfs.fair_clock -
986 new_rq->cfs.fair_clock;
987 if (p->se.wait_start)
988 p->se.wait_start -= clock_offset;
989 if (p->se.wait_start_fair)
990 p->se.wait_start_fair -= fair_clock_offset;
991 if (p->se.sleep_start)
992 p->se.sleep_start -= clock_offset;
993 if (p->se.block_start)
994 p->se.block_start -= clock_offset;
995 if (p->se.sleep_start_fair)
996 p->se.sleep_start_fair -= fair_clock_offset;
998 __set_task_cpu(p, new_cpu);
1001 struct migration_req {
1002 struct list_head list;
1004 struct task_struct *task;
1005 int dest_cpu;
1007 struct completion done;
1011 * The task's runqueue lock must be held.
1012 * Returns true if you have to wait for migration thread.
1014 static int
1015 migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1017 struct rq *rq = task_rq(p);
1020 * If the task is not on a runqueue (and not running), then
1021 * it is sufficient to simply update the task's cpu field.
1023 if (!p->se.on_rq && !task_running(rq, p)) {
1024 set_task_cpu(p, dest_cpu);
1025 return 0;
1028 init_completion(&req->done);
1029 req->task = p;
1030 req->dest_cpu = dest_cpu;
1031 list_add(&req->list, &rq->migration_queue);
1033 return 1;
1037 * wait_task_inactive - wait for a thread to unschedule.
1039 * The caller must ensure that the task *will* unschedule sometime soon,
1040 * else this function might spin for a *long* time. This function can't
1041 * be called with interrupts off, or it may introduce deadlock with
1042 * smp_call_function() if an IPI is sent by the same process we are
1043 * waiting to become inactive.
1045 void wait_task_inactive(struct task_struct *p)
1047 unsigned long flags;
1048 int running, on_rq;
1049 struct rq *rq;
1051 repeat:
1053 * We do the initial early heuristics without holding
1054 * any task-queue locks at all. We'll only try to get
1055 * the runqueue lock when things look like they will
1056 * work out!
1058 rq = task_rq(p);
1061 * If the task is actively running on another CPU
1062 * still, just relax and busy-wait without holding
1063 * any locks.
1065 * NOTE! Since we don't hold any locks, it's not
1066 * even sure that "rq" stays as the right runqueue!
1067 * But we don't care, since "task_running()" will
1068 * return false if the runqueue has changed and p
1069 * is actually now running somewhere else!
1071 while (task_running(rq, p))
1072 cpu_relax();
1075 * Ok, time to look more closely! We need the rq
1076 * lock now, to be *sure*. If we're wrong, we'll
1077 * just go back and repeat.
1079 rq = task_rq_lock(p, &flags);
1080 running = task_running(rq, p);
1081 on_rq = p->se.on_rq;
1082 task_rq_unlock(rq, &flags);
1085 * Was it really running after all now that we
1086 * checked with the proper locks actually held?
1088 * Oops. Go back and try again..
1090 if (unlikely(running)) {
1091 cpu_relax();
1092 goto repeat;
1096 * It's not enough that it's not actively running,
1097 * it must be off the runqueue _entirely_, and not
1098 * preempted!
1100 * So if it was still runnable (but just not actively
1101 * running right now), it's preempted, and we should
1102 * yield - it could be a while.
1104 if (unlikely(on_rq)) {
1105 yield();
1106 goto repeat;
1110 * Ahh, all good. It wasn't running, and it wasn't
1111 * runnable, which means that it will never become
1112 * running in the future either. We're all done!
1116 /***
1117 * kick_process - kick a running thread to enter/exit the kernel
1118 * @p: the to-be-kicked thread
1120 * Cause a process which is running on another CPU to enter
1121 * kernel-mode, without any delay. (to get signals handled.)
1123 * NOTE: this function doesn't have to take the runqueue lock,
1124 * because all it wants to ensure is that the remote task enters
1125 * the kernel. If the IPI races and the task has been migrated
1126 * to another CPU then no harm is done and the purpose has been
1127 * achieved as well.
1129 void kick_process(struct task_struct *p)
1131 int cpu;
1133 preempt_disable();
1134 cpu = task_cpu(p);
1135 if ((cpu != smp_processor_id()) && task_curr(p))
1136 smp_send_reschedule(cpu);
1137 preempt_enable();
1141 * Return a low guess at the load of a migration-source cpu weighted
1142 * according to the scheduling class and "nice" value.
1144 * We want to under-estimate the load of migration sources, to
1145 * balance conservatively.
1147 static inline unsigned long source_load(int cpu, int type)
1149 struct rq *rq = cpu_rq(cpu);
1150 unsigned long total = weighted_cpuload(cpu);
1152 if (type == 0)
1153 return total;
1155 return min(rq->cpu_load[type-1], total);
1159 * Return a high guess at the load of a migration-target cpu weighted
1160 * according to the scheduling class and "nice" value.
1162 static inline unsigned long target_load(int cpu, int type)
1164 struct rq *rq = cpu_rq(cpu);
1165 unsigned long total = weighted_cpuload(cpu);
1167 if (type == 0)
1168 return total;
1170 return max(rq->cpu_load[type-1], total);
1174 * Return the average load per task on the cpu's run queue
1176 static inline unsigned long cpu_avg_load_per_task(int cpu)
1178 struct rq *rq = cpu_rq(cpu);
1179 unsigned long total = weighted_cpuload(cpu);
1180 unsigned long n = rq->nr_running;
1182 return n ? total / n : SCHED_LOAD_SCALE;
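/*
 * Illustrative example of the three estimates above: suppose cpu 3 has an
 * instantaneous weighted load of 2048 but a decayed history
 * cpu_load[type-1] of 3072, with 2 runnable tasks:
 *
 *	source_load(3, type) == min(3072, 2048) == 2048	(optimistic: a cpu
 *				we might pull from)
 *	target_load(3, type) == max(3072, 2048) == 3072	(pessimistic: a cpu
 *				we might push to)
 *	cpu_avg_load_per_task(3) == 2048 / 2 == 1024
 *
 * Under- and over-estimating this way biases the balancer towards doing
 * nothing when the benefit of a migration would be marginal.
 */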
1186 * find_idlest_group finds and returns the least busy CPU group within the
1187 * domain.
1189 static struct sched_group *
1190 find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1192 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
1193 unsigned long min_load = ULONG_MAX, this_load = 0;
1194 int load_idx = sd->forkexec_idx;
1195 int imbalance = 100 + (sd->imbalance_pct-100)/2;
1197 do {
1198 unsigned long load, avg_load;
1199 int local_group;
1200 int i;
1202 /* Skip over this group if it has no CPUs allowed */
1203 if (!cpus_intersects(group->cpumask, p->cpus_allowed))
1204 goto nextgroup;
1206 local_group = cpu_isset(this_cpu, group->cpumask);
1208 /* Tally up the load of all CPUs in the group */
1209 avg_load = 0;
1211 for_each_cpu_mask(i, group->cpumask) {
1212 /* Bias balancing toward cpus of our domain */
1213 if (local_group)
1214 load = source_load(i, load_idx);
1215 else
1216 load = target_load(i, load_idx);
1218 avg_load += load;
1221 /* Adjust by relative CPU power of the group */
1222 avg_load = sg_div_cpu_power(group,
1223 avg_load * SCHED_LOAD_SCALE);
1225 if (local_group) {
1226 this_load = avg_load;
1227 this = group;
1228 } else if (avg_load < min_load) {
1229 min_load = avg_load;
1230 idlest = group;
1232 nextgroup:
1233 group = group->next;
1234 } while (group != sd->groups);
1236 if (!idlest || 100*this_load < imbalance*min_load)
1237 return NULL;
1238 return idlest;
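/*
 * Worked example (illustrative): with the common imbalance_pct of 125 the
 * local threshold above becomes 100 + 25/2 == 112, so a remote group is
 * only returned if 100 * this_load >= 112 * min_load, i.e. if the local
 * group is at least ~12% busier than the idlest one; smaller differences
 * are ignored to avoid pointless cross-group placement.
 */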
1242 * find_idlest_cpu - find the idlest cpu among the cpus in group.
1244 static int
1245 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1247 cpumask_t tmp;
1248 unsigned long load, min_load = ULONG_MAX;
1249 int idlest = -1;
1250 int i;
1252 /* Traverse only the allowed CPUs */
1253 cpus_and(tmp, group->cpumask, p->cpus_allowed);
1255 for_each_cpu_mask(i, tmp) {
1256 load = weighted_cpuload(i);
1258 if (load < min_load || (load == min_load && i == this_cpu)) {
1259 min_load = load;
1260 idlest = i;
1264 return idlest;
1268 * sched_balance_self: balance the current task (running on cpu) in domains
1269 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1270 * SD_BALANCE_EXEC.
1272 * Balance, ie. select the least loaded group.
1274 * Returns the target CPU number, or the same CPU if no balancing is needed.
1276 * preempt must be disabled.
1278 static int sched_balance_self(int cpu, int flag)
1280 struct task_struct *t = current;
1281 struct sched_domain *tmp, *sd = NULL;
1283 for_each_domain(cpu, tmp) {
1285 * If power savings logic is enabled for a domain, stop there.
1287 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1288 break;
1289 if (tmp->flags & flag)
1290 sd = tmp;
1293 while (sd) {
1294 cpumask_t span;
1295 struct sched_group *group;
1296 int new_cpu, weight;
1298 if (!(sd->flags & flag)) {
1299 sd = sd->child;
1300 continue;
1303 span = sd->span;
1304 group = find_idlest_group(sd, t, cpu);
1305 if (!group) {
1306 sd = sd->child;
1307 continue;
1310 new_cpu = find_idlest_cpu(group, t, cpu);
1311 if (new_cpu == -1 || new_cpu == cpu) {
1312 /* Now try balancing at a lower domain level of cpu */
1313 sd = sd->child;
1314 continue;
1317 /* Now try balancing at a lower domain level of new_cpu */
1318 cpu = new_cpu;
1319 sd = NULL;
1320 weight = cpus_weight(span);
1321 for_each_domain(cpu, tmp) {
1322 if (weight <= cpus_weight(tmp->span))
1323 break;
1324 if (tmp->flags & flag)
1325 sd = tmp;
1327 /* while loop will break here if sd == NULL */
1330 return cpu;
1333 #endif /* CONFIG_SMP */
1336 * wake_idle() will wake a task on an idle cpu if task->cpu is
1337 * not idle and an idle cpu is available. The span of cpus to
1338 * search starts with cpus closest then further out as needed,
1339 * so we always favor a closer, idle cpu.
1341 * Returns the CPU we should wake onto.
1343 #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1344 static int wake_idle(int cpu, struct task_struct *p)
1346 cpumask_t tmp;
1347 struct sched_domain *sd;
1348 int i;
1351 * If it is idle, then it is the best cpu to run this task.
1353 * This cpu is also the best, if it has more than one task already.
1354 * Its siblings must also be busy (in most cases), as they didn't already
1355 * pick up the extra load from this cpu, so we need not check the
1356 * sibling runqueue info. This avoids the checks and cache-miss
1357 * penalties associated with that.
1359 if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
1360 return cpu;
1362 for_each_domain(cpu, sd) {
1363 if (sd->flags & SD_WAKE_IDLE) {
1364 cpus_and(tmp, sd->span, p->cpus_allowed);
1365 for_each_cpu_mask(i, tmp) {
1366 if (idle_cpu(i))
1367 return i;
1370 else
1371 break;
1373 return cpu;
1375 #else
1376 static inline int wake_idle(int cpu, struct task_struct *p)
1378 return cpu;
1380 #endif
1382 /***
1383 * try_to_wake_up - wake up a thread
1384 * @p: the to-be-woken-up thread
1385 * @state: the mask of task states that can be woken
1386 * @sync: do a synchronous wakeup?
1388 * Put it on the run-queue if it's not already there. The "current"
1389 * thread is always on the run-queue (except when the actual
1390 * re-schedule is in progress), and as such you're allowed to do
1391 * the simpler "current->state = TASK_RUNNING" to mark yourself
1392 * runnable without the overhead of this.
1394 * returns failure only if the task is already active.
1396 static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1398 int cpu, this_cpu, success = 0;
1399 unsigned long flags;
1400 long old_state;
1401 struct rq *rq;
1402 #ifdef CONFIG_SMP
1403 struct sched_domain *sd, *this_sd = NULL;
1404 unsigned long load, this_load;
1405 int new_cpu;
1406 #endif
1408 rq = task_rq_lock(p, &flags);
1409 old_state = p->state;
1410 if (!(old_state & state))
1411 goto out;
1413 if (p->se.on_rq)
1414 goto out_running;
1416 cpu = task_cpu(p);
1417 this_cpu = smp_processor_id();
1419 #ifdef CONFIG_SMP
1420 if (unlikely(task_running(rq, p)))
1421 goto out_activate;
1423 new_cpu = cpu;
1425 schedstat_inc(rq, ttwu_cnt);
1426 if (cpu == this_cpu) {
1427 schedstat_inc(rq, ttwu_local);
1428 goto out_set_cpu;
1431 for_each_domain(this_cpu, sd) {
1432 if (cpu_isset(cpu, sd->span)) {
1433 schedstat_inc(sd, ttwu_wake_remote);
1434 this_sd = sd;
1435 break;
1439 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1440 goto out_set_cpu;
1443 * Check for affine wakeup and passive balancing possibilities.
1445 if (this_sd) {
1446 int idx = this_sd->wake_idx;
1447 unsigned int imbalance;
1449 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
1451 load = source_load(cpu, idx);
1452 this_load = target_load(this_cpu, idx);
1454 new_cpu = this_cpu; /* Wake to this CPU if we can */
1456 if (this_sd->flags & SD_WAKE_AFFINE) {
1457 unsigned long tl = this_load;
1458 unsigned long tl_per_task;
1460 tl_per_task = cpu_avg_load_per_task(this_cpu);
1463 * If sync wakeup then subtract the (maximum possible)
1464 * effect of the currently running task from the load
1465 * of the current CPU:
1467 if (sync)
1468 tl -= current->se.load.weight;
1470 if ((tl <= load &&
1471 tl + target_load(cpu, idx) <= tl_per_task) ||
1472 100*(tl + p->se.load.weight) <= imbalance*load) {
1474 * This domain has SD_WAKE_AFFINE and
1475 * p is cache cold in this domain, and
1476 * there is no bad imbalance.
1478 schedstat_inc(this_sd, ttwu_move_affine);
1479 goto out_set_cpu;
1484 * Start passive balancing when half the imbalance_pct
1485 * limit is reached.
1487 if (this_sd->flags & SD_WAKE_BALANCE) {
1488 if (imbalance*this_load <= 100*load) {
1489 schedstat_inc(this_sd, ttwu_move_balance);
1490 goto out_set_cpu;
1495 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1496 out_set_cpu:
1497 new_cpu = wake_idle(new_cpu, p);
1498 if (new_cpu != cpu) {
1499 set_task_cpu(p, new_cpu);
1500 task_rq_unlock(rq, &flags);
1501 /* might preempt at this point */
1502 rq = task_rq_lock(p, &flags);
1503 old_state = p->state;
1504 if (!(old_state & state))
1505 goto out;
1506 if (p->se.on_rq)
1507 goto out_running;
1509 this_cpu = smp_processor_id();
1510 cpu = task_cpu(p);
1513 out_activate:
1514 #endif /* CONFIG_SMP */
1515 activate_task(rq, p, 1);
1517 * Sync wakeups (i.e. those types of wakeups where the waker
1518 * has indicated that it will leave the CPU in short order)
1519 * don't trigger a preemption if the woken-up task will run on
1520 * this cpu. (in this case the 'I will reschedule' promise of
1521 * the waker guarantees that the freshly woken up task is going
1522 * to be considered on this CPU.)
1524 if (!sync || cpu != this_cpu)
1525 check_preempt_curr(rq, p);
1526 success = 1;
1528 out_running:
1529 p->state = TASK_RUNNING;
1530 out:
1531 task_rq_unlock(rq, &flags);
1533 return success;
1536 int fastcall wake_up_process(struct task_struct *p)
1538 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1539 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1541 EXPORT_SYMBOL(wake_up_process);
1543 int fastcall wake_up_state(struct task_struct *p, unsigned int state)
1545 return try_to_wake_up(p, state, 0);
1549 * Perform scheduler related setup for a newly forked process p.
1550 * p is forked by current.
1552 * __sched_fork() is basic setup used by init_idle() too:
1554 static void __sched_fork(struct task_struct *p)
1556 p->se.wait_start_fair = 0;
1557 p->se.wait_start = 0;
1558 p->se.exec_start = 0;
1559 p->se.sum_exec_runtime = 0;
1560 p->se.delta_exec = 0;
1561 p->se.delta_fair_run = 0;
1562 p->se.delta_fair_sleep = 0;
1563 p->se.wait_runtime = 0;
1564 p->se.sum_wait_runtime = 0;
1565 p->se.sum_sleep_runtime = 0;
1566 p->se.sleep_start = 0;
1567 p->se.sleep_start_fair = 0;
1568 p->se.block_start = 0;
1569 p->se.sleep_max = 0;
1570 p->se.block_max = 0;
1571 p->se.exec_max = 0;
1572 p->se.wait_max = 0;
1573 p->se.wait_runtime_overruns = 0;
1574 p->se.wait_runtime_underruns = 0;
1576 INIT_LIST_HEAD(&p->run_list);
1577 p->se.on_rq = 0;
1580 * We mark the process as running here, but have not actually
1581 * inserted it onto the runqueue yet. This guarantees that
1582 * nobody will actually run it, and a signal or other external
1583 * event cannot wake it up and insert it on the runqueue either.
1585 p->state = TASK_RUNNING;
1589 * fork()/clone()-time setup:
1591 void sched_fork(struct task_struct *p, int clone_flags)
1593 int cpu = get_cpu();
1595 __sched_fork(p);
1597 #ifdef CONFIG_SMP
1598 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1599 #endif
1600 __set_task_cpu(p, cpu);
1603 * Make sure we do not leak PI boosting priority to the child:
1605 p->prio = current->normal_prio;
1607 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1608 if (likely(sched_info_on()))
1609 memset(&p->sched_info, 0, sizeof(p->sched_info));
1610 #endif
1611 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
1612 p->oncpu = 0;
1613 #endif
1614 #ifdef CONFIG_PREEMPT
1615 /* Want to start with kernel preemption disabled. */
1616 task_thread_info(p)->preempt_count = 1;
1617 #endif
1618 put_cpu();
1622 * After fork, child runs first. (default) If set to 0 then
1623 * parent will (try to) run first.
1625 unsigned int __read_mostly sysctl_sched_child_runs_first = 1;
1628 * wake_up_new_task - wake up a newly created task for the first time.
1630 * This function will do some initial scheduler statistics housekeeping
1631 * that must be done for every newly created context, then puts the task
1632 * on the runqueue and wakes it.
1634 void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1636 unsigned long flags;
1637 struct rq *rq;
1638 int this_cpu;
1640 rq = task_rq_lock(p, &flags);
1641 BUG_ON(p->state != TASK_RUNNING);
1642 this_cpu = smp_processor_id(); /* parent's CPU */
1644 p->prio = effective_prio(p);
1646 if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) ||
1647 task_cpu(p) != this_cpu || !current->se.on_rq) {
1648 activate_task(rq, p, 0);
1649 } else {
1651 * Let the scheduling class do new task startup
1652 * management (if any):
1654 p->sched_class->task_new(rq, p);
1656 check_preempt_curr(rq, p);
1657 task_rq_unlock(rq, &flags);
1661 * prepare_task_switch - prepare to switch tasks
1662 * @rq: the runqueue preparing to switch
1663 * @next: the task we are going to switch to.
1665 * This is called with the rq lock held and interrupts off. It must
1666 * be paired with a subsequent finish_task_switch after the context
1667 * switch.
1669 * prepare_task_switch sets up locking and calls architecture specific
1670 * hooks.
1672 static inline void prepare_task_switch(struct rq *rq, struct task_struct *next)
1674 prepare_lock_switch(rq, next);
1675 prepare_arch_switch(next);
1679 * finish_task_switch - clean up after a task-switch
1680 * @rq: runqueue associated with task-switch
1681 * @prev: the thread we just switched away from.
1683 * finish_task_switch must be called after the context switch, paired
1684 * with a prepare_task_switch call before the context switch.
1685 * finish_task_switch will reconcile locking set up by prepare_task_switch,
1686 * and do any other architecture-specific cleanup actions.
1688 * Note that we may have delayed dropping an mm in context_switch(). If
1689 * so, we finish that here outside of the runqueue lock. (Doing it
1690 * with the lock held can cause deadlocks; see schedule() for
1691 * details.)
1693 static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
1694 __releases(rq->lock)
1696 struct mm_struct *mm = rq->prev_mm;
1697 long prev_state;
1699 rq->prev_mm = NULL;
1702 * A task struct has one reference for the use as "current".
1703 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
1704 * schedule one last time. The schedule call will never return, and
1705 * the scheduled task must drop that reference.
1706 * The test for TASK_DEAD must occur while the runqueue locks are
1707 * still held, otherwise prev could be scheduled on another cpu, die
1708 * there before we look at prev->state, and then the reference would
1709 * be dropped twice.
1710 * Manfred Spraul <manfred@colorfullife.com>
1712 prev_state = prev->state;
1713 finish_arch_switch(prev);
1714 finish_lock_switch(rq, prev);
1715 if (mm)
1716 mmdrop(mm);
1717 if (unlikely(prev_state == TASK_DEAD)) {
1719 * Remove function-return probe instances associated with this
1720 * task and put them back on the free list.
1722 kprobe_flush_task(prev);
1723 put_task_struct(prev);
1728 * schedule_tail - first thing a freshly forked thread must call.
1729 * @prev: the thread we just switched away from.
1731 asmlinkage void schedule_tail(struct task_struct *prev)
1732 __releases(rq->lock)
1734 struct rq *rq = this_rq();
1736 finish_task_switch(rq, prev);
1737 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
1738 /* In this case, finish_task_switch does not reenable preemption */
1739 preempt_enable();
1740 #endif
1741 if (current->set_child_tid)
1742 put_user(current->pid, current->set_child_tid);
1746 * context_switch - switch to the new MM and the new
1747 * thread's register state.
1749 static inline void
1750 context_switch(struct rq *rq, struct task_struct *prev,
1751 struct task_struct *next)
1753 struct mm_struct *mm, *oldmm;
1755 prepare_task_switch(rq, next);
1756 mm = next->mm;
1757 oldmm = prev->active_mm;
1759 * For paravirt, this is coupled with an exit in switch_to to
1760 * combine the page table reload and the switch backend into
1761 * one hypercall.
1763 arch_enter_lazy_cpu_mode();
1765 if (unlikely(!mm)) {
1766 next->active_mm = oldmm;
1767 atomic_inc(&oldmm->mm_count);
1768 enter_lazy_tlb(oldmm, next);
1769 } else
1770 switch_mm(oldmm, mm, next);
1772 if (unlikely(!prev->mm)) {
1773 prev->active_mm = NULL;
1774 rq->prev_mm = oldmm;
1777 * Since the runqueue lock will be released by the next task
1778 * (which is an invalid locking op, but in the case of the
1779 * scheduler it's an obvious special case), we do an early
1780 * lockdep release here:
1782 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
1783 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
1784 #endif
1786 /* Here we just switch the register state and the stack. */
1787 switch_to(prev, next, prev);
1789 barrier();
1791 * this_rq must be evaluated again because prev may have moved
1792 * CPUs since it called schedule(), thus the 'rq' on its stack
1793 * frame will be invalid.
1795 finish_task_switch(this_rq(), prev);
1799 * nr_running, nr_uninterruptible and nr_context_switches:
1801 * externally visible scheduler statistics: current number of runnable
1802 * threads, current number of uninterruptible-sleeping threads, total
1803 * number of context switches performed since bootup.
1805 unsigned long nr_running(void)
1807 unsigned long i, sum = 0;
1809 for_each_online_cpu(i)
1810 sum += cpu_rq(i)->nr_running;
1812 return sum;
1815 unsigned long nr_uninterruptible(void)
1817 unsigned long i, sum = 0;
1819 for_each_possible_cpu(i)
1820 sum += cpu_rq(i)->nr_uninterruptible;
1823 * Since we read the counters locklessly, the sum might be slightly
1824 * inaccurate. Do not allow it to go below zero though:
1826 if (unlikely((long)sum < 0))
1827 sum = 0;
1829 return sum;
1832 unsigned long long nr_context_switches(void)
1834 int i;
1835 unsigned long long sum = 0;
1837 for_each_possible_cpu(i)
1838 sum += cpu_rq(i)->nr_switches;
1840 return sum;
1843 unsigned long nr_iowait(void)
1845 unsigned long i, sum = 0;
1847 for_each_possible_cpu(i)
1848 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1850 return sum;
1853 unsigned long nr_active(void)
1855 unsigned long i, running = 0, uninterruptible = 0;
1857 for_each_online_cpu(i) {
1858 running += cpu_rq(i)->nr_running;
1859 uninterruptible += cpu_rq(i)->nr_uninterruptible;
1862 if (unlikely((long)uninterruptible < 0))
1863 uninterruptible = 0;
1865 return running + uninterruptible;
1869 * Update rq->cpu_load[] statistics. This function is usually called every
1870 * scheduler tick (TICK_NSEC).
1872 static void update_cpu_load(struct rq *this_rq)
1874 u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64;
1875 unsigned long total_load = this_rq->ls.load.weight;
1876 unsigned long this_load = total_load;
1877 struct load_stat *ls = &this_rq->ls;
1878 u64 now = __rq_clock(this_rq);
1879 int i, scale;
1881 this_rq->nr_load_updates++;
1882 if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD)))
1883 goto do_avg;
1885 /* Update delta_fair/delta_exec fields first */
1886 update_curr_load(this_rq, now);
1888 fair_delta64 = ls->delta_fair + 1;
1889 ls->delta_fair = 0;
1891 exec_delta64 = ls->delta_exec + 1;
1892 ls->delta_exec = 0;
1894 sample_interval64 = now - ls->load_update_last;
1895 ls->load_update_last = now;
1897 if ((s64)sample_interval64 < (s64)TICK_NSEC)
1898 sample_interval64 = TICK_NSEC;
1900 if (exec_delta64 > sample_interval64)
1901 exec_delta64 = sample_interval64;
1903 idle_delta64 = sample_interval64 - exec_delta64;
1905 tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64);
1906 tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64);
1908 this_load = (unsigned long)tmp64;
1910 do_avg:
1912 /* Update our load: */
1913 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
1914 unsigned long old_load, new_load;
1916 /* scale is effectively 1 << i now, and >> i divides by scale */
1918 old_load = this_rq->cpu_load[i];
1919 new_load = this_load;
1921 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
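/*
 * Worked example of the decay above (illustrative): for index i the update
 * is cpu_load[i] = (old * (2^i - 1) + new) / 2^i, so index 0 tracks the
 * instantaneous load and higher indices react more slowly.  Starting from
 * old == 0 with a constant new load of 1024:
 *
 *	i == 0: 1024 after the first tick		(no history)
 *	i == 1:  512, 768, 896, ... -> 1024		(fast decay)
 *	i == 3:  128, 240, 338, ... -> 1024		(much slower)
 *
 * These different horizons are what source_load()/target_load() select
 * via their 'type' argument.
 */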
1925 #ifdef CONFIG_SMP
1928 * double_rq_lock - safely lock two runqueues
1930 * Note this does not disable interrupts like task_rq_lock,
1931 * you need to do so manually before calling.
1933 static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1934 __acquires(rq1->lock)
1935 __acquires(rq2->lock)
1937 BUG_ON(!irqs_disabled());
1938 if (rq1 == rq2) {
1939 spin_lock(&rq1->lock);
1940 __acquire(rq2->lock); /* Fake it out ;) */
1941 } else {
1942 if (rq1 < rq2) {
1943 spin_lock(&rq1->lock);
1944 spin_lock(&rq2->lock);
1945 } else {
1946 spin_lock(&rq2->lock);
1947 spin_lock(&rq1->lock);
1953 * double_rq_unlock - safely unlock two runqueues
1955 * Note this does not restore interrupts like task_rq_unlock,
1956 * you need to do so manually after calling.
1958 static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1959 __releases(rq1->lock)
1960 __releases(rq2->lock)
1962 spin_unlock(&rq1->lock);
1963 if (rq1 != rq2)
1964 spin_unlock(&rq2->lock);
1965 else
1966 __release(rq2->lock);
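/*
 * Usage sketch (illustrative): the address ordering in double_rq_lock() is
 * what makes the "ascending &runqueue" rule from the struct rq comment
 * deadlock-free - every path that needs two runqueue locks takes the
 * lower-addressed one first, so two CPUs can never each hold the lock the
 * other one is waiting for:
 *
 *	local_irq_save(flags);
 *	double_rq_lock(this_rq, busiest);
 *	... move tasks between the two runqueues ...
 *	double_rq_unlock(this_rq, busiest);
 *	local_irq_restore(flags);
 */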
1970 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1972 static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
1973 __releases(this_rq->lock)
1974 __acquires(busiest->lock)
1975 __acquires(this_rq->lock)
1977 if (unlikely(!irqs_disabled())) {
1978 /* printk() doesn't work well under rq->lock */
1979 spin_unlock(&this_rq->lock);
1980 BUG_ON(1);
1982 if (unlikely(!spin_trylock(&busiest->lock))) {
1983 if (busiest < this_rq) {
1984 spin_unlock(&this_rq->lock);
1985 spin_lock(&busiest->lock);
1986 spin_lock(&this_rq->lock);
1987 } else
1988 spin_lock(&busiest->lock);
1993 * If dest_cpu is allowed for this process, migrate the task to it.
1994 * This is accomplished by forcing the cpu_allowed mask to only
1995 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
1996 * the cpu_allowed mask is restored.
1998 static void sched_migrate_task(struct task_struct *p, int dest_cpu)
2000 struct migration_req req;
2001 unsigned long flags;
2002 struct rq *rq;
2004 rq = task_rq_lock(p, &flags);
2005 if (!cpu_isset(dest_cpu, p->cpus_allowed)
2006 || unlikely(cpu_is_offline(dest_cpu)))
2007 goto out;
2009 /* force the process onto the specified CPU */
2010 if (migrate_task(p, dest_cpu, &req)) {
2011 /* Need to wait for migration thread (might exit: take ref). */
2012 struct task_struct *mt = rq->migration_thread;
2014 get_task_struct(mt);
2015 task_rq_unlock(rq, &flags);
2016 wake_up_process(mt);
2017 put_task_struct(mt);
2018 wait_for_completion(&req.done);
2020 return;
2022 out:
2023 task_rq_unlock(rq, &flags);
2027 * sched_exec - execve() is a valuable balancing opportunity, because at
2028 * this point the task has the smallest effective memory and cache footprint.
2030 void sched_exec(void)
2032 int new_cpu, this_cpu = get_cpu();
2033 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
2034 put_cpu();
2035 if (new_cpu != this_cpu)
2036 sched_migrate_task(current, new_cpu);
2040 * pull_task - move a task from a remote runqueue to the local runqueue.
2041 * Both runqueues must be locked.
2043 static void pull_task(struct rq *src_rq, struct task_struct *p,
2044 struct rq *this_rq, int this_cpu)
2046 deactivate_task(src_rq, p, 0);
2047 set_task_cpu(p, this_cpu);
2048 activate_task(this_rq, p, 0);
2050 * Note that idle threads have a prio of MAX_PRIO, so this test
2051 * is always true for them.
2053 check_preempt_curr(this_rq, p);
2057 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
2059 static
2060 int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2061 struct sched_domain *sd, enum cpu_idle_type idle,
2062 int *all_pinned)
2065 * We do not migrate tasks that are:
2066 * 1) running (obviously), or
2067 * 2) cannot be migrated to this CPU due to cpus_allowed, or
2068 * 3) are cache-hot on their current CPU.
2070 if (!cpu_isset(this_cpu, p->cpus_allowed))
2071 return 0;
2072 *all_pinned = 0;
2074 if (task_running(rq, p))
2075 return 0;
2078 * Aggressive migration if too many balance attempts have failed:
2080 if (sd->nr_balance_failed > sd->cache_nice_tries)
2081 return 1;
2083 return 1;
2086 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2087 unsigned long max_nr_move, unsigned long max_load_move,
2088 struct sched_domain *sd, enum cpu_idle_type idle,
2089 int *all_pinned, unsigned long *load_moved,
2090 int this_best_prio, int best_prio, int best_prio_seen,
2091 struct rq_iterator *iterator)
2093 int pulled = 0, pinned = 0, skip_for_load;
2094 struct task_struct *p;
2095 long rem_load_move = max_load_move;
2097 if (max_nr_move == 0 || max_load_move == 0)
2098 goto out;
2100 pinned = 1;
2103 * Start the load-balancing iterator:
2105 p = iterator->start(iterator->arg);
2106 next:
2107 if (!p)
2108 goto out;
2110 * To help distribute high priority tasks across CPUs we don't
2111 * skip a task if it will be the highest priority task (i.e. smallest
2112 * prio value) on its new queue regardless of its load weight
2114 skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
2115 SCHED_LOAD_SCALE_FUZZ;
2116 if (skip_for_load && p->prio < this_best_prio)
2117 skip_for_load = !best_prio_seen && p->prio == best_prio;
2118 if (skip_for_load ||
2119 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2121 best_prio_seen |= p->prio == best_prio;
2122 p = iterator->next(iterator->arg);
2123 goto next;
2126 pull_task(busiest, p, this_rq, this_cpu);
2127 pulled++;
2128 rem_load_move -= p->se.load.weight;
2131 * We only want to steal up to the prescribed number of tasks
2132 * and the prescribed amount of weighted load.
2134 if (pulled < max_nr_move && rem_load_move > 0) {
2135 if (p->prio < this_best_prio)
2136 this_best_prio = p->prio;
2137 p = iterator->next(iterator->arg);
2138 goto next;
2140 out:
2142 * Right now, this is the only place pull_task() is called,
2143 * so we can safely collect pull_task() stats here rather than
2144 * inside pull_task().
2146 schedstat_add(sd, lb_gained[idle], pulled);
2148 if (all_pinned)
2149 *all_pinned = pinned;
2150 *load_moved = max_load_move - rem_load_move;
2151 return pulled;
2155 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
2156 * load from busiest to this_rq, as part of a balancing operation within
2157 * "domain". Returns the number of tasks moved.
2159 * Called with both runqueues locked.
2161 static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2162 unsigned long max_nr_move, unsigned long max_load_move,
2163 struct sched_domain *sd, enum cpu_idle_type idle,
2164 int *all_pinned)
2166 struct sched_class *class = sched_class_highest;
2167 unsigned long load_moved, total_nr_moved = 0, nr_moved;
2168 long rem_load_move = max_load_move;
2170 do {
2171 nr_moved = class->load_balance(this_rq, this_cpu, busiest,
2172 max_nr_move, (unsigned long)rem_load_move,
2173 sd, idle, all_pinned, &load_moved);
2174 total_nr_moved += nr_moved;
2175 max_nr_move -= nr_moved;
2176 rem_load_move -= load_moved;
2177 class = class->next;
2178 } while (class && max_nr_move && rem_load_move > 0);
2180 return total_nr_moved;
2184 * find_busiest_group finds and returns the busiest CPU group within the
2185 * domain. It calculates and returns the amount of weighted load which
2186 * should be moved to restore balance via the imbalance parameter.
2188 static struct sched_group *
2189 find_busiest_group(struct sched_domain *sd, int this_cpu,
2190 unsigned long *imbalance, enum cpu_idle_type idle,
2191 int *sd_idle, cpumask_t *cpus, int *balance)
2193 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2194 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
2195 unsigned long max_pull;
2196 unsigned long busiest_load_per_task, busiest_nr_running;
2197 unsigned long this_load_per_task, this_nr_running;
2198 int load_idx;
2199 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2200 int power_savings_balance = 1;
2201 unsigned long leader_nr_running = 0, min_load_per_task = 0;
2202 unsigned long min_nr_running = ULONG_MAX;
2203 struct sched_group *group_min = NULL, *group_leader = NULL;
2204 #endif
2206 max_load = this_load = total_load = total_pwr = 0;
2207 busiest_load_per_task = busiest_nr_running = 0;
2208 this_load_per_task = this_nr_running = 0;
2209 if (idle == CPU_NOT_IDLE)
2210 load_idx = sd->busy_idx;
2211 else if (idle == CPU_NEWLY_IDLE)
2212 load_idx = sd->newidle_idx;
2213 else
2214 load_idx = sd->idle_idx;
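/*
 * Note: load_idx selects which slot of the per-cpu load history
 * (cpu_load[]) source_load()/target_load() consult below. The
 * busy/newly-idle/idle indices are per-domain tunables that trade
 * responsiveness against resistance to short load spikes.
 */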
2216 do {
2217 unsigned long load, group_capacity;
2218 int local_group;
2219 int i;
2220 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2221 unsigned long sum_nr_running, sum_weighted_load;
2223 local_group = cpu_isset(this_cpu, group->cpumask);
2225 if (local_group)
2226 balance_cpu = first_cpu(group->cpumask);
2228 /* Tally up the load of all CPUs in the group */
2229 sum_weighted_load = sum_nr_running = avg_load = 0;
2231 for_each_cpu_mask(i, group->cpumask) {
2232 struct rq *rq;
2234 if (!cpu_isset(i, *cpus))
2235 continue;
2237 rq = cpu_rq(i);
2239 if (*sd_idle && !idle_cpu(i))
2240 *sd_idle = 0;
2242 /* Bias balancing toward cpus of our domain */
2243 if (local_group) {
2244 if (idle_cpu(i) && !first_idle_cpu) {
2245 first_idle_cpu = 1;
2246 balance_cpu = i;
2249 load = target_load(i, load_idx);
2250 } else
2251 load = source_load(i, load_idx);
2253 avg_load += load;
2254 sum_nr_running += rq->nr_running;
2255 sum_weighted_load += weighted_cpuload(i);
2259 * The first idle cpu, or the first cpu (busiest) in this sched group,
2260 * is eligible for doing load balancing at this domain and the ones
2261 * above it.
2263 if (local_group && balance_cpu != this_cpu && balance) {
2264 *balance = 0;
2265 goto ret;
2268 total_load += avg_load;
2269 total_pwr += group->__cpu_power;
2271 /* Adjust by relative CPU power of the group */
2272 avg_load = sg_div_cpu_power(group,
2273 avg_load * SCHED_LOAD_SCALE);
2275 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
2277 if (local_group) {
2278 this_load = avg_load;
2279 this = group;
2280 this_nr_running = sum_nr_running;
2281 this_load_per_task = sum_weighted_load;
2282 } else if (avg_load > max_load &&
2283 sum_nr_running > group_capacity) {
2284 max_load = avg_load;
2285 busiest = group;
2286 busiest_nr_running = sum_nr_running;
2287 busiest_load_per_task = sum_weighted_load;
2290 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2292 * Busy processors will not participate in power savings
2293 * balance.
2295 if (idle == CPU_NOT_IDLE ||
2296 !(sd->flags & SD_POWERSAVINGS_BALANCE))
2297 goto group_next;
2300 * If the local group is idle or completely loaded, there is
2301 * no need to do power savings balance at this domain.
2303 if (local_group && (this_nr_running >= group_capacity ||
2304 !this_nr_running))
2305 power_savings_balance = 0;
2308 * If a group is already running at full capacity or idle,
2309 * don't include that group in power savings calculations
2311 if (!power_savings_balance || sum_nr_running >= group_capacity
2312 || !sum_nr_running)
2313 goto group_next;
2316 * Calculate the group which has the least non-idle load.
2317 * This is the group from which we need to pick up load
2318 * in order to save power.
2320 if ((sum_nr_running < min_nr_running) ||
2321 (sum_nr_running == min_nr_running &&
2322 first_cpu(group->cpumask) <
2323 first_cpu(group_min->cpumask))) {
2324 group_min = group;
2325 min_nr_running = sum_nr_running;
2326 min_load_per_task = sum_weighted_load /
2327 sum_nr_running;
2331 * Calculate the group which is nearly at its
2332 * capacity but still has some room to pick up load
2333 * from other groups and save more power.
2335 if (sum_nr_running <= group_capacity - 1) {
2336 if (sum_nr_running > leader_nr_running ||
2337 (sum_nr_running == leader_nr_running &&
2338 first_cpu(group->cpumask) >
2339 first_cpu(group_leader->cpumask))) {
2340 group_leader = group;
2341 leader_nr_running = sum_nr_running;
2344 group_next:
2345 #endif
2346 group = group->next;
2347 } while (group != sd->groups);
2349 if (!busiest || this_load >= max_load || busiest_nr_running == 0)
2350 goto out_balanced;
2352 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
2354 if (this_load >= avg_load ||
2355 100*max_load <= sd->imbalance_pct*this_load)
2356 goto out_balanced;
2358 busiest_load_per_task /= busiest_nr_running;
2360 * We're trying to get all the cpus to the average_load, so we don't
2361 * want to push ourselves above the average load, nor do we wish to
2362 * reduce the max loaded cpu below the average load, as either of these
2363 * actions would just result in more rebalancing later, and ping-pong
2364 * tasks around. Thus we look for the minimum possible imbalance.
2365 * Negative imbalances (*we* are more loaded than anyone else) will
2366 * be counted as no imbalance for these purposes -- we can't fix that
2367 * by pulling tasks to us. Be careful of negative numbers as they'll
2368 * appear as very large values with unsigned longs.
2370 if (max_load <= busiest_load_per_task)
2371 goto out_balanced;
2374 * In the presence of smp nice balancing, certain scenarios can have
2375 * max load less than avg load (as we skip groups at or below
2376 * their cpu_power while calculating max_load).
2378 if (max_load < avg_load) {
2379 *imbalance = 0;
2380 goto small_imbalance;
2383 /* Don't want to pull so many tasks that a group would go idle */
2384 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
2386 /* How much load to actually move to equalise the imbalance */
2387 *imbalance = min(max_pull * busiest->__cpu_power,
2388 (avg_load - this_load) * this->__cpu_power)
2389 / SCHED_LOAD_SCALE;
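/*
 * Worked example (illustrative numbers only): with SCHED_LOAD_SCALE of
 * 1024 and two single-cpu groups of __cpu_power == 1024, suppose
 * max_load == 2048, this_load == 1024, avg_load == 1536 and
 * busiest_load_per_task == 1024. Then max_pull == min(512, 1024) == 512
 * and *imbalance == min(512 * 1024, 512 * 1024) / 1024 == 512, i.e. we
 * ask move_tasks() to shift half a nice-0 task's worth of weight.
 */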
2392 * If *imbalance is less than the average load per runnable task,
2393 * there is no guarantee that any tasks will be moved, so we consider
2394 * bumping its value to force at least one task to be
2395 * moved.
2397 if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) {
2398 unsigned long tmp, pwr_now, pwr_move;
2399 unsigned int imbn;
2401 small_imbalance:
2402 pwr_move = pwr_now = 0;
2403 imbn = 2;
2404 if (this_nr_running) {
2405 this_load_per_task /= this_nr_running;
2406 if (busiest_load_per_task > this_load_per_task)
2407 imbn = 1;
2408 } else
2409 this_load_per_task = SCHED_LOAD_SCALE;
2411 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
2412 busiest_load_per_task * imbn) {
2413 *imbalance = busiest_load_per_task;
2414 return busiest;
2418 * OK, we don't have enough imbalance to justify moving tasks,
2419 * however we may be able to increase total CPU power used by
2420 * moving them.
2423 pwr_now += busiest->__cpu_power *
2424 min(busiest_load_per_task, max_load);
2425 pwr_now += this->__cpu_power *
2426 min(this_load_per_task, this_load);
2427 pwr_now /= SCHED_LOAD_SCALE;
2429 /* Amount of load we'd subtract */
2430 tmp = sg_div_cpu_power(busiest,
2431 busiest_load_per_task * SCHED_LOAD_SCALE);
2432 if (max_load > tmp)
2433 pwr_move += busiest->__cpu_power *
2434 min(busiest_load_per_task, max_load - tmp);
2436 /* Amount of load we'd add */
2437 if (max_load * busiest->__cpu_power <
2438 busiest_load_per_task * SCHED_LOAD_SCALE)
2439 tmp = sg_div_cpu_power(this,
2440 max_load * busiest->__cpu_power);
2441 else
2442 tmp = sg_div_cpu_power(this,
2443 busiest_load_per_task * SCHED_LOAD_SCALE);
2444 pwr_move += this->__cpu_power *
2445 min(this_load_per_task, this_load + tmp);
2446 pwr_move /= SCHED_LOAD_SCALE;
2448 /* Move if we gain throughput */
2449 if (pwr_move <= pwr_now)
2450 goto out_balanced;
2452 *imbalance = busiest_load_per_task;
2455 return busiest;
2457 out_balanced:
2458 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2459 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2460 goto ret;
2462 if (this == group_leader && group_leader != group_min) {
2463 *imbalance = min_load_per_task;
2464 return group_min;
2466 #endif
2467 ret:
2468 *imbalance = 0;
2469 return NULL;
2473 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2475 static struct rq *
2476 find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2477 unsigned long imbalance, cpumask_t *cpus)
2479 struct rq *busiest = NULL, *rq;
2480 unsigned long max_load = 0;
2481 int i;
2483 for_each_cpu_mask(i, group->cpumask) {
2484 unsigned long wl;
2486 if (!cpu_isset(i, *cpus))
2487 continue;
2489 rq = cpu_rq(i);
2490 wl = weighted_cpuload(i);
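/*
 * Note: a queue whose single runnable task is heavier than the
 * requested imbalance is skipped below; moving its only task would
 * shift more load than we were asked to, so it is not a useful
 * "busiest" candidate.
 */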
2492 if (rq->nr_running == 1 && wl > imbalance)
2493 continue;
2495 if (wl > max_load) {
2496 max_load = wl;
2497 busiest = rq;
2501 return busiest;
2505 * Max backoff if we encounter pinned tasks. The value is fairly
2506 * arbitrary; it just needs to be large enough.
2508 #define MAX_PINNED_INTERVAL 512
2510 static inline unsigned long minus_1_or_zero(unsigned long n)
2512 return n > 0 ? n - 1 : 0;
2516 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2517 * tasks if there is an imbalance.
2519 static int load_balance(int this_cpu, struct rq *this_rq,
2520 struct sched_domain *sd, enum cpu_idle_type idle,
2521 int *balance)
2523 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2524 struct sched_group *group;
2525 unsigned long imbalance;
2526 struct rq *busiest;
2527 cpumask_t cpus = CPU_MASK_ALL;
2528 unsigned long flags;
2531 * When power savings policy is enabled for the parent domain, idle
2532 * sibling can pick up load irrespective of busy siblings. In this case,
2533 * let the state of idle sibling percolate up as CPU_IDLE, instead of
2534 * portraying it as CPU_NOT_IDLE.
2536 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2537 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2538 sd_idle = 1;
2540 schedstat_inc(sd, lb_cnt[idle]);
2542 redo:
2543 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2544 &cpus, balance);
2546 if (*balance == 0)
2547 goto out_balanced;
2549 if (!group) {
2550 schedstat_inc(sd, lb_nobusyg[idle]);
2551 goto out_balanced;
2554 busiest = find_busiest_queue(group, idle, imbalance, &cpus);
2555 if (!busiest) {
2556 schedstat_inc(sd, lb_nobusyq[idle]);
2557 goto out_balanced;
2560 BUG_ON(busiest == this_rq);
2562 schedstat_add(sd, lb_imbalance[idle], imbalance);
2564 nr_moved = 0;
2565 if (busiest->nr_running > 1) {
2567 * Attempt to move tasks. If find_busiest_group has found
2568 * an imbalance but busiest->nr_running <= 1, the group is
2569 * still unbalanced. nr_moved simply stays zero, so it is
2570 * correctly treated as an imbalance.
2572 local_irq_save(flags);
2573 double_rq_lock(this_rq, busiest);
2574 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2575 minus_1_or_zero(busiest->nr_running),
2576 imbalance, sd, idle, &all_pinned);
2577 double_rq_unlock(this_rq, busiest);
2578 local_irq_restore(flags);
2581 * some other cpu did the load balance for us.
2583 if (nr_moved && this_cpu != smp_processor_id())
2584 resched_cpu(this_cpu);
2586 /* All tasks on this runqueue were pinned by CPU affinity */
2587 if (unlikely(all_pinned)) {
2588 cpu_clear(cpu_of(busiest), cpus);
2589 if (!cpus_empty(cpus))
2590 goto redo;
2591 goto out_balanced;
2595 if (!nr_moved) {
2596 schedstat_inc(sd, lb_failed[idle]);
2597 sd->nr_balance_failed++;
2599 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
2601 spin_lock_irqsave(&busiest->lock, flags);
2603 /* don't kick the migration_thread if the curr
2604 * task on the busiest cpu can't be moved to this_cpu
2606 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
2607 spin_unlock_irqrestore(&busiest->lock, flags);
2608 all_pinned = 1;
2609 goto out_one_pinned;
2612 if (!busiest->active_balance) {
2613 busiest->active_balance = 1;
2614 busiest->push_cpu = this_cpu;
2615 active_balance = 1;
2617 spin_unlock_irqrestore(&busiest->lock, flags);
2618 if (active_balance)
2619 wake_up_process(busiest->migration_thread);
2622 * We've kicked active balancing, so reset the failure
2623 * counter.
2625 sd->nr_balance_failed = sd->cache_nice_tries+1;
2627 } else
2628 sd->nr_balance_failed = 0;
2630 if (likely(!active_balance)) {
2631 /* We were unbalanced, so reset the balancing interval */
2632 sd->balance_interval = sd->min_interval;
2633 } else {
2635 * If we've begun active balancing, start to back off. This
2636 * case may not be covered by the all_pinned logic if there
2637 * is only 1 task on the busy runqueue (because we don't call
2638 * move_tasks).
2640 if (sd->balance_interval < sd->max_interval)
2641 sd->balance_interval *= 2;
2644 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2645 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2646 return -1;
2647 return nr_moved;
2649 out_balanced:
2650 schedstat_inc(sd, lb_balanced[idle]);
2652 sd->nr_balance_failed = 0;
2654 out_one_pinned:
2655 /* tune up the balancing interval */
2656 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
2657 (sd->balance_interval < sd->max_interval))
2658 sd->balance_interval *= 2;
2660 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2661 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2662 return -1;
2663 return 0;
2667 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2668 * tasks if there is an imbalance.
2670 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
2671 * this_rq is locked.
2673 static int
2674 load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2676 struct sched_group *group;
2677 struct rq *busiest = NULL;
2678 unsigned long imbalance;
2679 int nr_moved = 0;
2680 int sd_idle = 0;
2681 cpumask_t cpus = CPU_MASK_ALL;
2684 * When power savings policy is enabled for the parent domain, idle
2685 * sibling can pick up load irrespective of busy siblings. In this case,
2686 * let the state of idle sibling percolate up as CPU_IDLE, instead of
2687 * portraying it as CPU_NOT_IDLE.
2689 if (sd->flags & SD_SHARE_CPUPOWER &&
2690 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2691 sd_idle = 1;
2693 schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]);
2694 redo:
2695 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
2696 &sd_idle, &cpus, NULL);
2697 if (!group) {
2698 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
2699 goto out_balanced;
2702 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance,
2703 &cpus);
2704 if (!busiest) {
2705 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
2706 goto out_balanced;
2709 BUG_ON(busiest == this_rq);
2711 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
2713 nr_moved = 0;
2714 if (busiest->nr_running > 1) {
2715 /* Attempt to move tasks */
2716 double_lock_balance(this_rq, busiest);
2717 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2718 minus_1_or_zero(busiest->nr_running),
2719 imbalance, sd, CPU_NEWLY_IDLE, NULL);
2720 spin_unlock(&busiest->lock);
2722 if (!nr_moved) {
2723 cpu_clear(cpu_of(busiest), cpus);
2724 if (!cpus_empty(cpus))
2725 goto redo;
2729 if (!nr_moved) {
2730 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
2731 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2732 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2733 return -1;
2734 } else
2735 sd->nr_balance_failed = 0;
2737 return nr_moved;
2739 out_balanced:
2740 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
2741 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2742 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2743 return -1;
2744 sd->nr_balance_failed = 0;
2746 return 0;
2750 * idle_balance is called by schedule() if this_cpu is about to become
2751 * idle. Attempts to pull tasks from other CPUs.
2753 static void idle_balance(int this_cpu, struct rq *this_rq)
2755 struct sched_domain *sd;
2756 int pulled_task = -1;
2757 unsigned long next_balance = jiffies + HZ;
2759 for_each_domain(this_cpu, sd) {
2760 unsigned long interval;
2762 if (!(sd->flags & SD_LOAD_BALANCE))
2763 continue;
2765 if (sd->flags & SD_BALANCE_NEWIDLE)
2766 /* If we've pulled tasks over, stop searching: */
2767 pulled_task = load_balance_newidle(this_cpu,
2768 this_rq, sd);
2770 interval = msecs_to_jiffies(sd->balance_interval);
2771 if (time_after(next_balance, sd->last_balance + interval))
2772 next_balance = sd->last_balance + interval;
2773 if (pulled_task)
2774 break;
2776 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
2778 * We are going idle. next_balance may be set based on
2779 * a busy processor. So reset next_balance.
2781 this_rq->next_balance = next_balance;
2786 * active_load_balance is run by migration threads. It pushes running tasks
2787 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
2788 * running on each physical CPU where possible, and avoids physical /
2789 * logical imbalances.
2791 * Called with busiest_rq locked.
2793 static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
2795 int target_cpu = busiest_rq->push_cpu;
2796 struct sched_domain *sd;
2797 struct rq *target_rq;
2799 /* Is there any task to move? */
2800 if (busiest_rq->nr_running <= 1)
2801 return;
2803 target_rq = cpu_rq(target_cpu);
2806 * This condition is "impossible"; if it occurs
2807 * we need to fix it. Originally reported by
2808 * Bjorn Helgaas on a 128-cpu setup.
2810 BUG_ON(busiest_rq == target_rq);
2812 /* move a task from busiest_rq to target_rq */
2813 double_lock_balance(busiest_rq, target_rq);
2815 /* Search for an sd spanning us and the target CPU. */
2816 for_each_domain(target_cpu, sd) {
2817 if ((sd->flags & SD_LOAD_BALANCE) &&
2818 cpu_isset(busiest_cpu, sd->span))
2819 break;
2822 if (likely(sd)) {
2823 schedstat_inc(sd, alb_cnt);
2825 if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
2826 RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE,
2827 NULL))
2828 schedstat_inc(sd, alb_pushed);
2829 else
2830 schedstat_inc(sd, alb_failed);
2832 spin_unlock(&target_rq->lock);
2835 #ifdef CONFIG_NO_HZ
2836 static struct {
2837 atomic_t load_balancer;
2838 cpumask_t cpu_mask;
2839 } nohz ____cacheline_aligned = {
2840 .load_balancer = ATOMIC_INIT(-1),
2841 .cpu_mask = CPU_MASK_NONE,
2845 * This routine will try to nominate the ilb (idle load balancing)
2846 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
2847 * load balancing on behalf of all those cpus. If all the cpus in the system
2848 * go into this tickless mode, then there will be no ilb owner (as there is
2849 * no need for one) and all the cpus will sleep till the next wakeup event
2850 * arrives...
2852 * For the ilb owner the tick is not stopped, and it is used
2853 * for idle load balancing. The ilb owner remains part of
2854 * nohz.cpu_mask.
2856 * While stopping the tick, this cpu becomes the ilb owner if there
2857 * is no other owner, and remains the owner until it becomes busy
2858 * or until all cpus in the system stop their ticks, at which point
2859 * no ilb owner is needed.
2861 * When the ilb owner becomes busy, it nominates another owner, during the
2862 * next busy scheduler_tick()
2864 int select_nohz_load_balancer(int stop_tick)
2866 int cpu = smp_processor_id();
2868 if (stop_tick) {
2869 cpu_set(cpu, nohz.cpu_mask);
2870 cpu_rq(cpu)->in_nohz_recently = 1;
2873 * If we are going offline and still the leader, give up!
2875 if (cpu_is_offline(cpu) &&
2876 atomic_read(&nohz.load_balancer) == cpu) {
2877 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2878 BUG();
2879 return 0;
2882 /* time for ilb owner also to sleep */
2883 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
2884 if (atomic_read(&nohz.load_balancer) == cpu)
2885 atomic_set(&nohz.load_balancer, -1);
2886 return 0;
2889 if (atomic_read(&nohz.load_balancer) == -1) {
2890 /* make me the ilb owner */
2891 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
2892 return 1;
2893 } else if (atomic_read(&nohz.load_balancer) == cpu)
2894 return 1;
2895 } else {
2896 if (!cpu_isset(cpu, nohz.cpu_mask))
2897 return 0;
2899 cpu_clear(cpu, nohz.cpu_mask);
2901 if (atomic_read(&nohz.load_balancer) == cpu)
2902 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2903 BUG();
2905 return 0;
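/*
 * Usage sketch (the caller lives outside this file): the nohz tick code
 * is expected to call select_nohz_load_balancer(1) just before stopping
 * the tick on an idle cpu and select_nohz_load_balancer(0) when the
 * tick is restarted; a return value of 1 tells the caller to keep its
 * tick running so this cpu can act as the idle load balancer.
 */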
2907 #endif
2909 static DEFINE_SPINLOCK(balancing);
2912 * It checks each scheduling domain to see if it is due to be balanced,
2913 * and initiates a balancing operation if so.
2915 * Balancing parameters are set up in arch_init_sched_domains.
2917 static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
2919 int balance = 1;
2920 struct rq *rq = cpu_rq(cpu);
2921 unsigned long interval;
2922 struct sched_domain *sd;
2923 /* Earliest time when we have to do rebalance again */
2924 unsigned long next_balance = jiffies + 60*HZ;
2926 for_each_domain(cpu, sd) {
2927 if (!(sd->flags & SD_LOAD_BALANCE))
2928 continue;
2930 interval = sd->balance_interval;
2931 if (idle != CPU_IDLE)
2932 interval *= sd->busy_factor;
2934 /* scale ms to jiffies */
2935 interval = msecs_to_jiffies(interval);
2936 if (unlikely(!interval))
2937 interval = 1;
2938 if (interval > HZ*NR_CPUS/10)
2939 interval = HZ*NR_CPUS/10;
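/*
 * Illustrative example (values depend on the domain setup): a domain
 * with balance_interval == 64 (msecs) and busy_factor == 32 would be
 * balanced roughly every 2 seconds while the cpu is busy, but every
 * 64ms once it goes idle, since the busy_factor scaling above only
 * applies when idle != CPU_IDLE.
 */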
2942 if (sd->flags & SD_SERIALIZE) {
2943 if (!spin_trylock(&balancing))
2944 goto out;
2947 if (time_after_eq(jiffies, sd->last_balance + interval)) {
2948 if (load_balance(cpu, rq, sd, idle, &balance)) {
2950 * We've pulled tasks over so either we're no
2951 * longer idle, or one of our SMT siblings is
2952 * not idle.
2954 idle = CPU_NOT_IDLE;
2956 sd->last_balance = jiffies;
2958 if (sd->flags & SD_SERIALIZE)
2959 spin_unlock(&balancing);
2960 out:
2961 if (time_after(next_balance, sd->last_balance + interval))
2962 next_balance = sd->last_balance + interval;
2965 * Stop the load balance at this level. There is another
2966 * CPU in our sched group which is doing load balancing more
2967 * actively.
2969 if (!balance)
2970 break;
2972 rq->next_balance = next_balance;
2976 * run_rebalance_domains is triggered when needed from the scheduler tick.
2977 * In the CONFIG_NO_HZ case, the idle load balance owner will do the
2978 * rebalancing for all the cpus for which scheduler ticks are stopped.
2980 static void run_rebalance_domains(struct softirq_action *h)
2982 int this_cpu = smp_processor_id();
2983 struct rq *this_rq = cpu_rq(this_cpu);
2984 enum cpu_idle_type idle = this_rq->idle_at_tick ?
2985 CPU_IDLE : CPU_NOT_IDLE;
2987 rebalance_domains(this_cpu, idle);
2989 #ifdef CONFIG_NO_HZ
2991 * If this cpu is the owner for idle load balancing, then do the
2992 * balancing on behalf of the other idle cpus whose ticks are
2993 * stopped.
2995 if (this_rq->idle_at_tick &&
2996 atomic_read(&nohz.load_balancer) == this_cpu) {
2997 cpumask_t cpus = nohz.cpu_mask;
2998 struct rq *rq;
2999 int balance_cpu;
3001 cpu_clear(this_cpu, cpus);
3002 for_each_cpu_mask(balance_cpu, cpus) {
3004 * If this cpu gets work to do, stop the load balancing
3005 * work being done for other cpus. Next load
3006 * balancing owner will pick it up.
3008 if (need_resched())
3009 break;
3011 rebalance_domains(balance_cpu, CPU_IDLE);
3013 rq = cpu_rq(balance_cpu);
3014 if (time_after(this_rq->next_balance, rq->next_balance))
3015 this_rq->next_balance = rq->next_balance;
3018 #endif
3022 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3024 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3025 * idle load balancing owner or decide to stop the periodic load balancing,
3026 * if the whole system is idle.
3028 static inline void trigger_load_balance(struct rq *rq, int cpu)
3030 #ifdef CONFIG_NO_HZ
3032 * If we were in nohz mode recently and are busy at the current
3033 * scheduler tick, then check if we need to nominate a new idle
3034 * load balancer.
3036 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3037 rq->in_nohz_recently = 0;
3039 if (atomic_read(&nohz.load_balancer) == cpu) {
3040 cpu_clear(cpu, nohz.cpu_mask);
3041 atomic_set(&nohz.load_balancer, -1);
3044 if (atomic_read(&nohz.load_balancer) == -1) {
3046 * simple selection for now: Nominate the
3047 * first cpu in the nohz list to be the next
3048 * ilb owner.
3050 * TBD: Traverse the sched domains and nominate
3051 * the nearest cpu in the nohz.cpu_mask.
3053 int ilb = first_cpu(nohz.cpu_mask);
3055 if (ilb != NR_CPUS)
3056 resched_cpu(ilb);
3061 * If this cpu is idle and doing idle load balancing for all the
3062 * cpus with ticks stopped, is it time for that to stop?
3064 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3065 cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
3066 resched_cpu(cpu);
3067 return;
3071 * If this cpu is idle and the idle load balancing is done by
3072 * someone else, then there is no need to raise SCHED_SOFTIRQ.
3074 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3075 cpu_isset(cpu, nohz.cpu_mask))
3076 return;
3077 #endif
3078 if (time_after_eq(jiffies, rq->next_balance))
3079 raise_softirq(SCHED_SOFTIRQ);
3082 #else /* CONFIG_SMP */
3085 * on UP we do not need to balance between CPUs:
3087 static inline void idle_balance(int cpu, struct rq *rq)
3091 /* Avoid "used but not defined" warning on UP */
3092 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3093 unsigned long max_nr_move, unsigned long max_load_move,
3094 struct sched_domain *sd, enum cpu_idle_type idle,
3095 int *all_pinned, unsigned long *load_moved,
3096 int this_best_prio, int best_prio, int best_prio_seen,
3097 struct rq_iterator *iterator)
3099 *load_moved = 0;
3101 return 0;
3104 #endif
3106 DEFINE_PER_CPU(struct kernel_stat, kstat);
3108 EXPORT_PER_CPU_SYMBOL(kstat);
3111 * Return p->sum_exec_runtime plus any more ns on the sched_clock
3112 * that have not yet been banked in case the task is currently running.
3114 unsigned long long task_sched_runtime(struct task_struct *p)
3116 unsigned long flags;
3117 u64 ns, delta_exec;
3118 struct rq *rq;
3120 rq = task_rq_lock(p, &flags);
3121 ns = p->se.sum_exec_runtime;
3122 if (rq->curr == p) {
3123 delta_exec = rq_clock(rq) - p->se.exec_start;
3124 if ((s64)delta_exec > 0)
3125 ns += delta_exec;
3127 task_rq_unlock(rq, &flags);
3129 return ns;
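/*
 * Note: the delta_exec term above covers the window between the last
 * update of se.sum_exec_runtime and "now", so a caller sampling a task
 * that is currently on a cpu still sees its runtime advance smoothly
 * instead of in scheduler-tick sized steps.
 */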
3133 * Account user cpu time to a process.
3134 * @p: the process that the cpu time gets accounted to
3136 * @cputime: the cpu time spent in user space since the last update
3138 void account_user_time(struct task_struct *p, cputime_t cputime)
3140 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3141 cputime64_t tmp;
3143 p->utime = cputime_add(p->utime, cputime);
3145 /* Add user time to cpustat. */
3146 tmp = cputime_to_cputime64(cputime);
3147 if (TASK_NICE(p) > 0)
3148 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3149 else
3150 cpustat->user = cputime64_add(cpustat->user, tmp);
3154 * Account system cpu time to a process.
3155 * @p: the process that the cpu time gets accounted to
3156 * @hardirq_offset: the offset to subtract from hardirq_count()
3157 * @cputime: the cpu time spent in kernel space since the last update
3159 void account_system_time(struct task_struct *p, int hardirq_offset,
3160 cputime_t cputime)
3162 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3163 struct rq *rq = this_rq();
3164 cputime64_t tmp;
3166 p->stime = cputime_add(p->stime, cputime);
3168 /* Add system time to cpustat. */
3169 tmp = cputime_to_cputime64(cputime);
3170 if (hardirq_count() - hardirq_offset)
3171 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3172 else if (softirq_count())
3173 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3174 else if (p != rq->idle)
3175 cpustat->system = cputime64_add(cpustat->system, tmp);
3176 else if (atomic_read(&rq->nr_iowait) > 0)
3177 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3178 else
3179 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3180 /* Account for system time used */
3181 acct_update_integrals(p);
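/*
 * Note on the classification ladder above: the tick passes a
 * hardirq_offset so that its own interrupt frame is not counted as irq
 * time. A jiffy is charged to cpustat->irq only when it lands inside
 * some other hard interrupt, to softirq while softirqs are being
 * processed, to system for ordinary kernel work, and for the idle task
 * it is split between iowait and idle depending on whether tasks are
 * blocked on I/O.
 */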
3185 * Account for involuntary wait time.
3186 * @p: the process from which the cpu time has been stolen
3187 * @steal: the cpu time spent in involuntary wait
3189 void account_steal_time(struct task_struct *p, cputime_t steal)
3191 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3192 cputime64_t tmp = cputime_to_cputime64(steal);
3193 struct rq *rq = this_rq();
3195 if (p == rq->idle) {
3196 p->stime = cputime_add(p->stime, steal);
3197 if (atomic_read(&rq->nr_iowait) > 0)
3198 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3199 else
3200 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3201 } else
3202 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3206 * This function gets called by the timer code, with HZ frequency.
3207 * We call it with interrupts disabled.
3209 * It also gets called by the fork code, when changing the parent's
3210 * timeslices.
3212 void scheduler_tick(void)
3214 int cpu = smp_processor_id();
3215 struct rq *rq = cpu_rq(cpu);
3216 struct task_struct *curr = rq->curr;
3218 spin_lock(&rq->lock);
3219 if (curr != rq->idle) /* FIXME: needed? */
3220 curr->sched_class->task_tick(rq, curr);
3221 update_cpu_load(rq);
3222 spin_unlock(&rq->lock);
3224 #ifdef CONFIG_SMP
3225 rq->idle_at_tick = idle_cpu(cpu);
3226 trigger_load_balance(rq, cpu);
3227 #endif
3230 #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
3232 void fastcall add_preempt_count(int val)
3235 * Underflow?
3237 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3238 return;
3239 preempt_count() += val;
3241 * Spinlock count overflowing soon?
3243 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3244 PREEMPT_MASK - 10);
3246 EXPORT_SYMBOL(add_preempt_count);
3248 void fastcall sub_preempt_count(int val)
3251 * Underflow?
3253 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3254 return;
3256 * Is the spinlock portion underflowing?
3258 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3259 !(preempt_count() & PREEMPT_MASK)))
3260 return;
3262 preempt_count() -= val;
3264 EXPORT_SYMBOL(sub_preempt_count);
3266 #endif
3269 * Print scheduling while atomic bug:
3271 static noinline void __schedule_bug(struct task_struct *prev)
3273 printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n",
3274 prev->comm, preempt_count(), prev->pid);
3275 debug_show_held_locks(prev);
3276 if (irqs_disabled())
3277 print_irqtrace_events(prev);
3278 dump_stack();
3282 * Various schedule()-time debugging checks and statistics:
3284 static inline void schedule_debug(struct task_struct *prev)
3287 * Test if we are atomic. Since do_exit() needs to call into
3288 * schedule() atomically, we ignore that path for now.
3289 * Otherwise, whine if we are scheduling when we should not be.
3291 if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state))
3292 __schedule_bug(prev);
3294 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3296 schedstat_inc(this_rq(), sched_cnt);
3300 * Pick up the highest-prio task:
3302 static inline struct task_struct *
3303 pick_next_task(struct rq *rq, struct task_struct *prev, u64 now)
3305 struct sched_class *class;
3306 struct task_struct *p;
3309 * Optimization: we know that if all tasks are in
3310 * the fair class we can call that function directly:
3312 if (likely(rq->nr_running == rq->cfs.nr_running)) {
3313 p = fair_sched_class.pick_next_task(rq, now);
3314 if (likely(p))
3315 return p;
3318 class = sched_class_highest;
3319 for ( ; ; ) {
3320 p = class->pick_next_task(rq, now);
3321 if (p)
3322 return p;
3324 * Will never be NULL as the idle class always
3325 * returns a non-NULL p:
3327 class = class->next;
3332 * schedule() is the main scheduler function.
3334 asmlinkage void __sched schedule(void)
3336 struct task_struct *prev, *next;
3337 long *switch_count;
3338 struct rq *rq;
3339 u64 now;
3340 int cpu;
3342 need_resched:
3343 preempt_disable();
3344 cpu = smp_processor_id();
3345 rq = cpu_rq(cpu);
3346 rcu_qsctr_inc(cpu);
3347 prev = rq->curr;
3348 switch_count = &prev->nivcsw;
3350 release_kernel_lock(prev);
3351 need_resched_nonpreemptible:
3353 schedule_debug(prev);
3355 spin_lock_irq(&rq->lock);
3356 clear_tsk_need_resched(prev);
3358 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3359 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
3360 unlikely(signal_pending(prev)))) {
3361 prev->state = TASK_RUNNING;
3362 } else {
3363 deactivate_task(rq, prev, 1);
3365 switch_count = &prev->nvcsw;
3368 if (unlikely(!rq->nr_running))
3369 idle_balance(cpu, rq);
3371 now = __rq_clock(rq);
3372 prev->sched_class->put_prev_task(rq, prev, now);
3373 next = pick_next_task(rq, prev, now);
3375 sched_info_switch(prev, next);
3377 if (likely(prev != next)) {
3378 rq->nr_switches++;
3379 rq->curr = next;
3380 ++*switch_count;
3382 context_switch(rq, prev, next); /* unlocks the rq */
3383 } else
3384 spin_unlock_irq(&rq->lock);
3386 if (unlikely(reacquire_kernel_lock(current) < 0)) {
3387 cpu = smp_processor_id();
3388 rq = cpu_rq(cpu);
3389 goto need_resched_nonpreemptible;
3391 preempt_enable_no_resched();
3392 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3393 goto need_resched;
3395 EXPORT_SYMBOL(schedule);
3397 #ifdef CONFIG_PREEMPT
3399 * this is the entry point to schedule() from in-kernel preemption
3400 * off of preempt_enable. Kernel preemptions off of a return from interrupt
3401 * occur in preempt_schedule_irq() below, which calls schedule() directly.
3403 asmlinkage void __sched preempt_schedule(void)
3405 struct thread_info *ti = current_thread_info();
3406 #ifdef CONFIG_PREEMPT_BKL
3407 struct task_struct *task = current;
3408 int saved_lock_depth;
3409 #endif
3411 * If there is a non-zero preempt_count or interrupts are disabled,
3412 * we do not want to preempt the current task. Just return..
3414 if (likely(ti->preempt_count || irqs_disabled()))
3415 return;
3417 need_resched:
3418 add_preempt_count(PREEMPT_ACTIVE);
3420 * We keep the big kernel semaphore locked, but we
3421 * clear ->lock_depth so that schedule() doesn't
3422 * auto-release the semaphore:
3424 #ifdef CONFIG_PREEMPT_BKL
3425 saved_lock_depth = task->lock_depth;
3426 task->lock_depth = -1;
3427 #endif
3428 schedule();
3429 #ifdef CONFIG_PREEMPT_BKL
3430 task->lock_depth = saved_lock_depth;
3431 #endif
3432 sub_preempt_count(PREEMPT_ACTIVE);
3434 /* we could miss a preemption opportunity between schedule and now */
3435 barrier();
3436 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3437 goto need_resched;
3439 EXPORT_SYMBOL(preempt_schedule);
3442 * this is the entry point to schedule() from kernel preemption
3443 * off of irq context.
3444 * Note that this is called and returns with irqs disabled. This
3445 * protects us against recursive calls from irq context.
3447 asmlinkage void __sched preempt_schedule_irq(void)
3449 struct thread_info *ti = current_thread_info();
3450 #ifdef CONFIG_PREEMPT_BKL
3451 struct task_struct *task = current;
3452 int saved_lock_depth;
3453 #endif
3454 /* Catch callers which need to be fixed */
3455 BUG_ON(ti->preempt_count || !irqs_disabled());
3457 need_resched:
3458 add_preempt_count(PREEMPT_ACTIVE);
3460 * We keep the big kernel semaphore locked, but we
3461 * clear ->lock_depth so that schedule() doesn't
3462 * auto-release the semaphore:
3464 #ifdef CONFIG_PREEMPT_BKL
3465 saved_lock_depth = task->lock_depth;
3466 task->lock_depth = -1;
3467 #endif
3468 local_irq_enable();
3469 schedule();
3470 local_irq_disable();
3471 #ifdef CONFIG_PREEMPT_BKL
3472 task->lock_depth = saved_lock_depth;
3473 #endif
3474 sub_preempt_count(PREEMPT_ACTIVE);
3476 /* we could miss a preemption opportunity between schedule and now */
3477 barrier();
3478 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3479 goto need_resched;
3482 #endif /* CONFIG_PREEMPT */
3484 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
3485 void *key)
3487 return try_to_wake_up(curr->private, mode, sync);
3489 EXPORT_SYMBOL(default_wake_function);
3492 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
3493 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
3494 * number) then we wake all the non-exclusive tasks and one exclusive task.
3496 * There are circumstances in which we can try to wake a task which has already
3497 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
3498 * zero in this (rare) case, and we handle it by continuing to scan the queue.
3500 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3501 int nr_exclusive, int sync, void *key)
3503 struct list_head *tmp, *next;
3505 list_for_each_safe(tmp, next, &q->task_list) {
3506 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
3507 unsigned flags = curr->flags;
3509 if (curr->func(curr, mode, sync, key) &&
3510 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
3511 break;
3516 * __wake_up - wake up threads blocked on a waitqueue.
3517 * @q: the waitqueue
3518 * @mode: which threads
3519 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3520 * @key: is directly passed to the wakeup function
3522 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
3523 int nr_exclusive, void *key)
3525 unsigned long flags;
3527 spin_lock_irqsave(&q->lock, flags);
3528 __wake_up_common(q, mode, nr_exclusive, 0, key);
3529 spin_unlock_irqrestore(&q->lock, flags);
3531 EXPORT_SYMBOL(__wake_up);
3534 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3536 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3538 __wake_up_common(q, mode, 1, 0, NULL);
3542 * __wake_up_sync - wake up threads blocked on a waitqueue.
3543 * @q: the waitqueue
3544 * @mode: which threads
3545 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3547 * The sync wakeup differs in that the waker knows that it will schedule
3548 * away soon, so while the target thread will be woken up, it will not
3549 * be migrated to another CPU - ie. the two threads are 'synchronized'
3550 * with each other. This can prevent needless bouncing between CPUs.
3552 * On UP it can prevent extra preemption.
3554 void fastcall
3555 __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3557 unsigned long flags;
3558 int sync = 1;
3560 if (unlikely(!q))
3561 return;
3563 if (unlikely(!nr_exclusive))
3564 sync = 0;
3566 spin_lock_irqsave(&q->lock, flags);
3567 __wake_up_common(q, mode, nr_exclusive, sync, NULL);
3568 spin_unlock_irqrestore(&q->lock, flags);
3570 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
3572 void fastcall complete(struct completion *x)
3574 unsigned long flags;
3576 spin_lock_irqsave(&x->wait.lock, flags);
3577 x->done++;
3578 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3579 1, 0, NULL);
3580 spin_unlock_irqrestore(&x->wait.lock, flags);
3582 EXPORT_SYMBOL(complete);
3584 void fastcall complete_all(struct completion *x)
3586 unsigned long flags;
3588 spin_lock_irqsave(&x->wait.lock, flags);
3589 x->done += UINT_MAX/2;
3590 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3591 0, 0, NULL);
3592 spin_unlock_irqrestore(&x->wait.lock, flags);
3594 EXPORT_SYMBOL(complete_all);
3596 void fastcall __sched wait_for_completion(struct completion *x)
3598 might_sleep();
3600 spin_lock_irq(&x->wait.lock);
3601 if (!x->done) {
3602 DECLARE_WAITQUEUE(wait, current);
3604 wait.flags |= WQ_FLAG_EXCLUSIVE;
3605 __add_wait_queue_tail(&x->wait, &wait);
3606 do {
3607 __set_current_state(TASK_UNINTERRUPTIBLE);
3608 spin_unlock_irq(&x->wait.lock);
3609 schedule();
3610 spin_lock_irq(&x->wait.lock);
3611 } while (!x->done);
3612 __remove_wait_queue(&x->wait, &wait);
3614 x->done--;
3615 spin_unlock_irq(&x->wait.lock);
3617 EXPORT_SYMBOL(wait_for_completion);
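/*
 * Usage sketch (illustrative, not part of this file): a typical
 * waiter/completer pairing looks like
 *
 *	DECLARE_COMPLETION(done);
 *
 *	waiting side:
 *		wait_for_completion(&done);
 *	completing side (another thread or an interrupt handler):
 *		complete(&done);
 *
 * The _timeout/_interruptible variants below follow the same pattern
 * but bound the sleep or let signals break it.
 */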
3619 unsigned long fastcall __sched
3620 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3622 might_sleep();
3624 spin_lock_irq(&x->wait.lock);
3625 if (!x->done) {
3626 DECLARE_WAITQUEUE(wait, current);
3628 wait.flags |= WQ_FLAG_EXCLUSIVE;
3629 __add_wait_queue_tail(&x->wait, &wait);
3630 do {
3631 __set_current_state(TASK_UNINTERRUPTIBLE);
3632 spin_unlock_irq(&x->wait.lock);
3633 timeout = schedule_timeout(timeout);
3634 spin_lock_irq(&x->wait.lock);
3635 if (!timeout) {
3636 __remove_wait_queue(&x->wait, &wait);
3637 goto out;
3639 } while (!x->done);
3640 __remove_wait_queue(&x->wait, &wait);
3642 x->done--;
3643 out:
3644 spin_unlock_irq(&x->wait.lock);
3645 return timeout;
3647 EXPORT_SYMBOL(wait_for_completion_timeout);
3649 int fastcall __sched wait_for_completion_interruptible(struct completion *x)
3651 int ret = 0;
3653 might_sleep();
3655 spin_lock_irq(&x->wait.lock);
3656 if (!x->done) {
3657 DECLARE_WAITQUEUE(wait, current);
3659 wait.flags |= WQ_FLAG_EXCLUSIVE;
3660 __add_wait_queue_tail(&x->wait, &wait);
3661 do {
3662 if (signal_pending(current)) {
3663 ret = -ERESTARTSYS;
3664 __remove_wait_queue(&x->wait, &wait);
3665 goto out;
3667 __set_current_state(TASK_INTERRUPTIBLE);
3668 spin_unlock_irq(&x->wait.lock);
3669 schedule();
3670 spin_lock_irq(&x->wait.lock);
3671 } while (!x->done);
3672 __remove_wait_queue(&x->wait, &wait);
3674 x->done--;
3675 out:
3676 spin_unlock_irq(&x->wait.lock);
3678 return ret;
3680 EXPORT_SYMBOL(wait_for_completion_interruptible);
3682 unsigned long fastcall __sched
3683 wait_for_completion_interruptible_timeout(struct completion *x,
3684 unsigned long timeout)
3686 might_sleep();
3688 spin_lock_irq(&x->wait.lock);
3689 if (!x->done) {
3690 DECLARE_WAITQUEUE(wait, current);
3692 wait.flags |= WQ_FLAG_EXCLUSIVE;
3693 __add_wait_queue_tail(&x->wait, &wait);
3694 do {
3695 if (signal_pending(current)) {
3696 timeout = -ERESTARTSYS;
3697 __remove_wait_queue(&x->wait, &wait);
3698 goto out;
3700 __set_current_state(TASK_INTERRUPTIBLE);
3701 spin_unlock_irq(&x->wait.lock);
3702 timeout = schedule_timeout(timeout);
3703 spin_lock_irq(&x->wait.lock);
3704 if (!timeout) {
3705 __remove_wait_queue(&x->wait, &wait);
3706 goto out;
3708 } while (!x->done);
3709 __remove_wait_queue(&x->wait, &wait);
3711 x->done--;
3712 out:
3713 spin_unlock_irq(&x->wait.lock);
3714 return timeout;
3716 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3719 #define SLEEP_ON_VAR \
3720 unsigned long flags; \
3721 wait_queue_t wait; \
3722 init_waitqueue_entry(&wait, current);
3724 #define SLEEP_ON_HEAD \
3725 spin_lock_irqsave(&q->lock,flags); \
3726 __add_wait_queue(q, &wait); \
3727 spin_unlock(&q->lock);
3729 #define SLEEP_ON_TAIL \
3730 spin_lock_irq(&q->lock); \
3731 __remove_wait_queue(q, &wait); \
3732 spin_unlock_irqrestore(&q->lock, flags);
3734 void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
3736 SLEEP_ON_VAR
3738 current->state = TASK_INTERRUPTIBLE;
3740 SLEEP_ON_HEAD
3741 schedule();
3742 SLEEP_ON_TAIL
3744 EXPORT_SYMBOL(interruptible_sleep_on);
3746 long fastcall __sched
3747 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3749 SLEEP_ON_VAR
3751 current->state = TASK_INTERRUPTIBLE;
3753 SLEEP_ON_HEAD
3754 timeout = schedule_timeout(timeout);
3755 SLEEP_ON_TAIL
3757 return timeout;
3759 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3761 void fastcall __sched sleep_on(wait_queue_head_t *q)
3763 SLEEP_ON_VAR
3765 current->state = TASK_UNINTERRUPTIBLE;
3767 SLEEP_ON_HEAD
3768 schedule();
3769 SLEEP_ON_TAIL
3771 EXPORT_SYMBOL(sleep_on);
3773 long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3775 SLEEP_ON_VAR
3777 current->state = TASK_UNINTERRUPTIBLE;
3779 SLEEP_ON_HEAD
3780 timeout = schedule_timeout(timeout);
3781 SLEEP_ON_TAIL
3783 return timeout;
3786 EXPORT_SYMBOL(sleep_on_timeout);
3788 #ifdef CONFIG_RT_MUTEXES
3791 * rt_mutex_setprio - set the current priority of a task
3792 * @p: task
3793 * @prio: prio value (kernel-internal form)
3795 * This function changes the 'effective' priority of a task. It does
3796 * not touch ->normal_prio like __setscheduler().
3798 * Used by the rt_mutex code to implement priority inheritance logic.
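/*
 * Illustrative scenario: if a SCHED_NORMAL task holds an rt_mutex that
 * a SCHED_FIFO waiter of rt_priority 50 blocks on, the PI code passes
 * the waiter's prio (49) here, so the holder temporarily runs with the
 * RT scheduling class until it releases the lock and its priority is
 * restored.
 */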
3800 void rt_mutex_setprio(struct task_struct *p, int prio)
3802 unsigned long flags;
3803 int oldprio, on_rq;
3804 struct rq *rq;
3805 u64 now;
3807 BUG_ON(prio < 0 || prio > MAX_PRIO);
3809 rq = task_rq_lock(p, &flags);
3810 now = rq_clock(rq);
3812 oldprio = p->prio;
3813 on_rq = p->se.on_rq;
3814 if (on_rq)
3815 dequeue_task(rq, p, 0, now);
3817 if (rt_prio(prio))
3818 p->sched_class = &rt_sched_class;
3819 else
3820 p->sched_class = &fair_sched_class;
3822 p->prio = prio;
3824 if (on_rq) {
3825 enqueue_task(rq, p, 0, now);
3827 * Reschedule if we are currently running on this runqueue and
3828 * our priority decreased, or if we are not currently running on
3829 * this runqueue and our priority is higher than the current's
3831 if (task_running(rq, p)) {
3832 if (p->prio > oldprio)
3833 resched_task(rq->curr);
3834 } else {
3835 check_preempt_curr(rq, p);
3838 task_rq_unlock(rq, &flags);
3841 #endif
3843 void set_user_nice(struct task_struct *p, long nice)
3845 int old_prio, delta, on_rq;
3846 unsigned long flags;
3847 struct rq *rq;
3848 u64 now;
3850 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3851 return;
3853 * We have to be careful: if called from sys_setpriority(),
3854 * the task might be in the middle of scheduling on another CPU.
3856 rq = task_rq_lock(p, &flags);
3857 now = rq_clock(rq);
3859 * The RT priorities are set via sched_setscheduler(), but we still
3860 * allow the 'normal' nice value to be set - although, as expected,
3861 * it won't have any effect on scheduling until the task leaves
3862 * SCHED_FIFO/SCHED_RR:
3864 if (task_has_rt_policy(p)) {
3865 p->static_prio = NICE_TO_PRIO(nice);
3866 goto out_unlock;
3868 on_rq = p->se.on_rq;
3869 if (on_rq) {
3870 dequeue_task(rq, p, 0, now);
3871 dec_load(rq, p, now);
3874 p->static_prio = NICE_TO_PRIO(nice);
3875 set_load_weight(p);
3876 old_prio = p->prio;
3877 p->prio = effective_prio(p);
3878 delta = p->prio - old_prio;
3880 if (on_rq) {
3881 enqueue_task(rq, p, 0, now);
3882 inc_load(rq, p, now);
3884 * If the task increased its priority or is running and
3885 * lowered its priority, then reschedule its CPU:
3887 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3888 resched_task(rq->curr);
3890 out_unlock:
3891 task_rq_unlock(rq, &flags);
3893 EXPORT_SYMBOL(set_user_nice);
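/*
 * Usage sketch (illustrative): kernel threads that want to stay out of
 * the way of normal work commonly lower their own priority right after
 * starting up, e.g. set_user_nice(current, 19); the rq locking and
 * load-weight bookkeeping above are handled internally.
 */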
3896 * can_nice - check if a task can reduce its nice value
3897 * @p: task
3898 * @nice: nice value
3900 int can_nice(const struct task_struct *p, const int nice)
3902 /* convert nice value [19,-20] to rlimit style value [1,40] */
3903 int nice_rlim = 20 - nice;
3905 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
3906 capable(CAP_SYS_NICE));
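/*
 * Example: asking for nice == -5 maps to nice_rlim == 25, so an
 * unprivileged task needs RLIMIT_NICE >= 25 (or CAP_SYS_NICE) for the
 * request to succeed; callers only consult this check when a task
 * tries to lower its nice value.
 */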
3909 #ifdef __ARCH_WANT_SYS_NICE
3912 * sys_nice - change the priority of the current process.
3913 * @increment: priority increment
3915 * sys_setpriority is a more generic, but much slower function that
3916 * does similar things.
3918 asmlinkage long sys_nice(int increment)
3920 long nice, retval;
3923 * Setpriority might change our priority at the same moment.
3924 * We don't have to worry. Conceptually one call occurs first
3925 * and we have a single winner.
3927 if (increment < -40)
3928 increment = -40;
3929 if (increment > 40)
3930 increment = 40;
3932 nice = PRIO_TO_NICE(current->static_prio) + increment;
3933 if (nice < -20)
3934 nice = -20;
3935 if (nice > 19)
3936 nice = 19;
3938 if (increment < 0 && !can_nice(current, nice))
3939 return -EPERM;
3941 retval = security_task_setnice(current, nice);
3942 if (retval)
3943 return retval;
3945 set_user_nice(current, nice);
3946 return 0;
3949 #endif
3952 * task_prio - return the priority value of a given task.
3953 * @p: the task in question.
3955 * This is the priority value as seen by users in /proc.
3956 * RT tasks are offset by -MAX_RT_PRIO (-100), so they are reported as
3957 * -100..-1; normal tasks map onto 0..39, with nice 0 reported as 20.
3959 int task_prio(const struct task_struct *p)
3961 return p->prio - MAX_RT_PRIO;
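/*
 * Example mapping (MAX_RT_PRIO == 100): a SCHED_FIFO task with
 * rt_priority 50 has p->prio == 49 and is reported as -51, while a
 * nice-0 SCHED_NORMAL task (prio 120) is reported as 20.
 */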
3965 * task_nice - return the nice value of a given task.
3966 * @p: the task in question.
3968 int task_nice(const struct task_struct *p)
3970 return TASK_NICE(p);
3972 EXPORT_SYMBOL_GPL(task_nice);
3975 * idle_cpu - is a given cpu idle currently?
3976 * @cpu: the processor in question.
3978 int idle_cpu(int cpu)
3980 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
3984 * idle_task - return the idle task for a given cpu.
3985 * @cpu: the processor in question.
3987 struct task_struct *idle_task(int cpu)
3989 return cpu_rq(cpu)->idle;
3993 * find_process_by_pid - find a process with a matching PID value.
3994 * @pid: the pid in question.
3996 static inline struct task_struct *find_process_by_pid(pid_t pid)
3998 return pid ? find_task_by_pid(pid) : current;
4001 /* Actually do priority change: must hold rq lock. */
4002 static void
4003 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4005 BUG_ON(p->se.on_rq);
4007 p->policy = policy;
4008 switch (p->policy) {
4009 case SCHED_NORMAL:
4010 case SCHED_BATCH:
4011 case SCHED_IDLE:
4012 p->sched_class = &fair_sched_class;
4013 break;
4014 case SCHED_FIFO:
4015 case SCHED_RR:
4016 p->sched_class = &rt_sched_class;
4017 break;
4020 p->rt_priority = prio;
4021 p->normal_prio = normal_prio(p);
4022 /* we are holding p->pi_lock already */
4023 p->prio = rt_mutex_getprio(p);
4024 set_load_weight(p);
4028 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
4029 * @p: the task in question.
4030 * @policy: new policy.
4031 * @param: structure containing the new RT priority.
4033 * NOTE that the task may already be dead.
4035 int sched_setscheduler(struct task_struct *p, int policy,
4036 struct sched_param *param)
4038 int retval, oldprio, oldpolicy = -1, on_rq;
4039 unsigned long flags;
4040 struct rq *rq;
4042 /* may grab non-irq protected spin_locks */
4043 BUG_ON(in_interrupt());
4044 recheck:
4045 /* double check policy once rq lock held */
4046 if (policy < 0)
4047 policy = oldpolicy = p->policy;
4048 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
4049 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4050 policy != SCHED_IDLE)
4051 return -EINVAL;
4053 * Valid priorities for SCHED_FIFO and SCHED_RR are
4054 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
4055 * SCHED_BATCH and SCHED_IDLE is 0.
4057 if (param->sched_priority < 0 ||
4058 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
4059 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
4060 return -EINVAL;
4061 if (rt_policy(policy) != (param->sched_priority != 0))
4062 return -EINVAL;
4065 * Allow unprivileged RT tasks to decrease priority:
4067 if (!capable(CAP_SYS_NICE)) {
4068 if (rt_policy(policy)) {
4069 unsigned long rlim_rtprio;
4071 if (!lock_task_sighand(p, &flags))
4072 return -ESRCH;
4073 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
4074 unlock_task_sighand(p, &flags);
4076 /* can't set/change the rt policy */
4077 if (policy != p->policy && !rlim_rtprio)
4078 return -EPERM;
4080 /* can't increase priority */
4081 if (param->sched_priority > p->rt_priority &&
4082 param->sched_priority > rlim_rtprio)
4083 return -EPERM;
4086 * Like positive nice levels, don't allow tasks to
4087 * move out of SCHED_IDLE either:
4089 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
4090 return -EPERM;
4092 /* can't change other user's priorities */
4093 if ((current->euid != p->euid) &&
4094 (current->euid != p->uid))
4095 return -EPERM;
4098 retval = security_task_setscheduler(p, policy, param);
4099 if (retval)
4100 return retval;
4102 * make sure no PI-waiters arrive (or leave) while we are
4103 * changing the priority of the task:
4105 spin_lock_irqsave(&p->pi_lock, flags);
4107 * To be able to change p->policy safely, the appropriate
4108 * runqueue lock must be held.
4110 rq = __task_rq_lock(p);
4111 /* recheck policy now with rq lock held */
4112 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4113 policy = oldpolicy = -1;
4114 __task_rq_unlock(rq);
4115 spin_unlock_irqrestore(&p->pi_lock, flags);
4116 goto recheck;
4118 on_rq = p->se.on_rq;
4119 if (on_rq)
4120 deactivate_task(rq, p, 0);
4121 oldprio = p->prio;
4122 __setscheduler(rq, p, policy, param->sched_priority);
4123 if (on_rq) {
4124 activate_task(rq, p, 0);
4126 * Reschedule if we are currently running on this runqueue and
4127 * our priority decreased, or if we are not currently running on
4128 * this runqueue and our priority is higher than the current's
4130 if (task_running(rq, p)) {
4131 if (p->prio > oldprio)
4132 resched_task(rq->curr);
4133 } else {
4134 check_preempt_curr(rq, p);
4137 __task_rq_unlock(rq);
4138 spin_unlock_irqrestore(&p->pi_lock, flags);
4140 rt_mutex_adjust_pi(p);
4142 return 0;
4144 EXPORT_SYMBOL_GPL(sched_setscheduler);
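/*
 * Usage sketch (illustrative): in-kernel callers that need a realtime
 * helper thread typically do something like
 *
 *	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
 *
 *	sched_setscheduler(task, SCHED_FIFO, &param);
 *
 * a pattern used by kernel threads that must not be starved by normal
 * load.
 */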
4146 static int
4147 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4149 struct sched_param lparam;
4150 struct task_struct *p;
4151 int retval;
4153 if (!param || pid < 0)
4154 return -EINVAL;
4155 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4156 return -EFAULT;
4158 rcu_read_lock();
4159 retval = -ESRCH;
4160 p = find_process_by_pid(pid);
4161 if (p != NULL)
4162 retval = sched_setscheduler(p, policy, &lparam);
4163 rcu_read_unlock();
4165 return retval;
4169 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
4170 * @pid: the pid in question.
4171 * @policy: new policy.
4172 * @param: structure containing the new RT priority.
4174 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
4175 struct sched_param __user *param)
4177 /* negative values for policy are not valid */
4178 if (policy < 0)
4179 return -EINVAL;
4181 return do_sched_setscheduler(pid, policy, param);
4185 * sys_sched_setparam - set/change the RT priority of a thread
4186 * @pid: the pid in question.
4187 * @param: structure containing the new RT priority.
4189 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
4191 return do_sched_setscheduler(pid, -1, param);
4195 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4196 * @pid: the pid in question.
4198 asmlinkage long sys_sched_getscheduler(pid_t pid)
4200 struct task_struct *p;
4201 int retval = -EINVAL;
4203 if (pid < 0)
4204 goto out_nounlock;
4206 retval = -ESRCH;
4207 read_lock(&tasklist_lock);
4208 p = find_process_by_pid(pid);
4209 if (p) {
4210 retval = security_task_getscheduler(p);
4211 if (!retval)
4212 retval = p->policy;
4214 read_unlock(&tasklist_lock);
4216 out_nounlock:
4217 return retval;
4221 * sys_sched_getparam - get the RT priority of a thread
4222 * @pid: the pid in question.
4223 * @param: structure containing the RT priority.
4225 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
4227 struct sched_param lp;
4228 struct task_struct *p;
4229 int retval = -EINVAL;
4231 if (!param || pid < 0)
4232 goto out_nounlock;
4234 read_lock(&tasklist_lock);
4235 p = find_process_by_pid(pid);
4236 retval = -ESRCH;
4237 if (!p)
4238 goto out_unlock;
4240 retval = security_task_getscheduler(p);
4241 if (retval)
4242 goto out_unlock;
4244 lp.sched_priority = p->rt_priority;
4245 read_unlock(&tasklist_lock);
4248 * This one might sleep, so we cannot do it with a spinlock held ...
4250 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4252 out_nounlock:
4253 return retval;
4255 out_unlock:
4256 read_unlock(&tasklist_lock);
4257 return retval;
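/*
 * Usage sketch (illustrative, not part of this file): querying policy and
 * RT priority from user space through the glibc wrappers:
 *
 *	struct sched_param sp;
 *	int policy = sched_getscheduler(pid);
 *
 *	if (policy != -1 && sched_getparam(pid, &sp) == 0)
 *		printf("policy %d, rt_priority %d\n", policy, sp.sched_priority);
 */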
4260 long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4262 cpumask_t cpus_allowed;
4263 struct task_struct *p;
4264 int retval;
4266 mutex_lock(&sched_hotcpu_mutex);
4267 read_lock(&tasklist_lock);
4269 p = find_process_by_pid(pid);
4270 if (!p) {
4271 read_unlock(&tasklist_lock);
4272 mutex_unlock(&sched_hotcpu_mutex);
4273 return -ESRCH;
4277 * It is not safe to call set_cpus_allowed with the
4278 * tasklist_lock held. We will bump the task_struct's
4279 * usage count and then drop tasklist_lock.
4281 get_task_struct(p);
4282 read_unlock(&tasklist_lock);
4284 retval = -EPERM;
4285 if ((current->euid != p->euid) && (current->euid != p->uid) &&
4286 !capable(CAP_SYS_NICE))
4287 goto out_unlock;
4289 retval = security_task_setscheduler(p, 0, NULL);
4290 if (retval)
4291 goto out_unlock;
4293 cpus_allowed = cpuset_cpus_allowed(p);
4294 cpus_and(new_mask, new_mask, cpus_allowed);
4295 retval = set_cpus_allowed(p, new_mask);
4297 out_unlock:
4298 put_task_struct(p);
4299 mutex_unlock(&sched_hotcpu_mutex);
4300 return retval;
4303 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4304 cpumask_t *new_mask)
4306 if (len < sizeof(cpumask_t)) {
4307 memset(new_mask, 0, sizeof(cpumask_t));
4308 } else if (len > sizeof(cpumask_t)) {
4309 len = sizeof(cpumask_t);
4311 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4315 * sys_sched_setaffinity - set the cpu affinity of a process
4316 * @pid: pid of the process
4317 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4318 * @user_mask_ptr: user-space pointer to the new cpu mask
4320 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
4321 unsigned long __user *user_mask_ptr)
4323 cpumask_t new_mask;
4324 int retval;
4326 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
4327 if (retval)
4328 return retval;
4330 return sched_setaffinity(pid, new_mask);
4334 * Represents all CPUs present in the system.
4335 * In systems capable of hotplug, this map could dynamically grow
4336 * as new CPUs are detected in the system via any platform-specific
4337 * method, such as ACPI, for example.
4340 cpumask_t cpu_present_map __read_mostly;
4341 EXPORT_SYMBOL(cpu_present_map);
4343 #ifndef CONFIG_SMP
4344 cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
4345 EXPORT_SYMBOL(cpu_online_map);
4347 cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
4348 EXPORT_SYMBOL(cpu_possible_map);
4349 #endif
4351 long sched_getaffinity(pid_t pid, cpumask_t *mask)
4353 struct task_struct *p;
4354 int retval;
4356 mutex_lock(&sched_hotcpu_mutex);
4357 read_lock(&tasklist_lock);
4359 retval = -ESRCH;
4360 p = find_process_by_pid(pid);
4361 if (!p)
4362 goto out_unlock;
4364 retval = security_task_getscheduler(p);
4365 if (retval)
4366 goto out_unlock;
4368 cpus_and(*mask, p->cpus_allowed, cpu_online_map);
4370 out_unlock:
4371 read_unlock(&tasklist_lock);
4372 mutex_unlock(&sched_hotcpu_mutex);
4373 if (retval)
4374 return retval;
4376 return 0;
4380 * sys_sched_getaffinity - get the cpu affinity of a process
4381 * @pid: pid of the process
4382 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4383 * @user_mask_ptr: user-space pointer to hold the current cpu mask
4385 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
4386 unsigned long __user *user_mask_ptr)
4388 int ret;
4389 cpumask_t mask;
4391 if (len < sizeof(cpumask_t))
4392 return -EINVAL;
4394 ret = sched_getaffinity(pid, &mask);
4395 if (ret < 0)
4396 return ret;
4398 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
4399 return -EFAULT;
4401 return sizeof(cpumask_t);
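/*
 * Usage sketch (illustrative, not part of this file): pinning the calling
 * thread to CPU 0 and reading the mask back.  The glibc wrappers hide the
 * "returns sizeof(cpumask_t) on success" detail of the raw syscall above:
 *
 *	#define _GNU_SOURCE
 *	#include <sched.h>
 *
 *	cpu_set_t set;
 *
 *	CPU_ZERO(&set);
 *	CPU_SET(0, &set);
 *	if (sched_setaffinity(0, sizeof(set), &set) == 0)
 *		sched_getaffinity(0, sizeof(set), &set);
 */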
4405 * sys_sched_yield - yield the current processor to other threads.
4407 * This function yields the current CPU to other tasks. If there are no
4408 * other threads running on this CPU then this function will return.
4410 asmlinkage long sys_sched_yield(void)
4412 struct rq *rq = this_rq_lock();
4414 schedstat_inc(rq, yld_cnt);
4415 if (unlikely(rq->nr_running == 1))
4416 schedstat_inc(rq, yld_act_empty);
4417 else
4418 current->sched_class->yield_task(rq, current);
4421 * Since we are going to call schedule() anyway, there's
4422 * no need to preempt or enable interrupts:
4424 __release(rq->lock);
4425 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4426 _raw_spin_unlock(&rq->lock);
4427 preempt_enable_no_resched();
4429 schedule();
4431 return 0;
4434 static void __cond_resched(void)
4436 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4437 __might_sleep(__FILE__, __LINE__);
4438 #endif
4440 * The BKS might be reacquired before we have dropped
4441 * PREEMPT_ACTIVE, which could trigger a second
4442 * cond_resched() call.
4444 do {
4445 add_preempt_count(PREEMPT_ACTIVE);
4446 schedule();
4447 sub_preempt_count(PREEMPT_ACTIVE);
4448 } while (need_resched());
4451 int __sched cond_resched(void)
4453 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
4454 system_state == SYSTEM_RUNNING) {
4455 __cond_resched();
4456 return 1;
4458 return 0;
4460 EXPORT_SYMBOL(cond_resched);
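/*
 * Usage sketch (illustrative): a long-running loop in process context can
 * bound scheduling latency on !CONFIG_PREEMPT kernels by dropping in a
 * voluntary preemption point (process_item() is a hypothetical helper):
 *
 *	for (i = 0; i < nr_items; i++) {
 *		process_item(i);
 *		cond_resched();
 *	}
 */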
4463 * cond_resched_lock() - if a reschedule is pending, drop the given lock,
4464 * call schedule, and on return reacquire the lock.
4466 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
4467 * operations here to prevent schedule() from being called twice (once via
4468 * spin_unlock(), once by hand).
4470 int cond_resched_lock(spinlock_t *lock)
4472 int ret = 0;
4474 if (need_lockbreak(lock)) {
4475 spin_unlock(lock);
4476 cpu_relax();
4477 ret = 1;
4478 spin_lock(lock);
4480 if (need_resched() && system_state == SYSTEM_RUNNING) {
4481 spin_release(&lock->dep_map, 1, _THIS_IP_);
4482 _raw_spin_unlock(lock);
4483 preempt_enable_no_resched();
4484 __cond_resched();
4485 ret = 1;
4486 spin_lock(lock);
4488 return ret;
4490 EXPORT_SYMBOL(cond_resched_lock);
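/*
 * Usage sketch (illustrative): walking a long list under a spinlock while
 * still yielding when needed.  A nonzero return means the lock was dropped
 * and re-taken, so the iteration must be restarted (mylock and mylist are
 * hypothetical):
 *
 *	spin_lock(&mylock);
 * restart:
 *	list_for_each_entry(p, &mylist, list) {
 *		if (cond_resched_lock(&mylock))
 *			goto restart;
 *	}
 *	spin_unlock(&mylock);
 */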
4492 int __sched cond_resched_softirq(void)
4494 BUG_ON(!in_softirq());
4496 if (need_resched() && system_state == SYSTEM_RUNNING) {
4497 local_bh_enable();
4498 __cond_resched();
4499 local_bh_disable();
4500 return 1;
4502 return 0;
4504 EXPORT_SYMBOL(cond_resched_softirq);
4507 * yield - yield the current processor to other threads.
4509 * This is a shortcut for kernel-space yielding - it marks the
4510 * thread runnable and calls sys_sched_yield().
4512 void __sched yield(void)
4514 set_current_state(TASK_RUNNING);
4515 sys_sched_yield();
4517 EXPORT_SYMBOL(yield);
4520 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
4521 * that process accounting knows that this is a task in IO wait state.
4523 * But don't do that if it is a deliberate, throttling IO wait (this task
4524 * has set its backing_dev_info: the queue against which it should throttle)
4526 void __sched io_schedule(void)
4528 struct rq *rq = &__raw_get_cpu_var(runqueues);
4530 delayacct_blkio_start();
4531 atomic_inc(&rq->nr_iowait);
4532 schedule();
4533 atomic_dec(&rq->nr_iowait);
4534 delayacct_blkio_end();
4536 EXPORT_SYMBOL(io_schedule);
4538 long __sched io_schedule_timeout(long timeout)
4540 struct rq *rq = &__raw_get_cpu_var(runqueues);
4541 long ret;
4543 delayacct_blkio_start();
4544 atomic_inc(&rq->nr_iowait);
4545 ret = schedule_timeout(timeout);
4546 atomic_dec(&rq->nr_iowait);
4547 delayacct_blkio_end();
4548 return ret;
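/*
 * Usage sketch (illustrative): a wait-for-I/O loop uses io_schedule() instead
 * of schedule() so the sleep is accounted as iowait (wq, req and
 * io_completed() are hypothetical):
 *
 *	DEFINE_WAIT(wait);
 *
 *	for (;;) {
 *		prepare_to_wait(&wq, &wait, TASK_UNINTERRUPTIBLE);
 *		if (io_completed(req))
 *			break;
 *		io_schedule();
 *	}
 *	finish_wait(&wq, &wait);
 */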
4552 * sys_sched_get_priority_max - return maximum RT priority.
4553 * @policy: scheduling class.
4555 * this syscall returns the maximum rt_priority that can be used
4556 * by a given scheduling class.
4558 asmlinkage long sys_sched_get_priority_max(int policy)
4560 int ret = -EINVAL;
4562 switch (policy) {
4563 case SCHED_FIFO:
4564 case SCHED_RR:
4565 ret = MAX_USER_RT_PRIO-1;
4566 break;
4567 case SCHED_NORMAL:
4568 case SCHED_BATCH:
4569 case SCHED_IDLE:
4570 ret = 0;
4571 break;
4573 return ret;
4577 * sys_sched_get_priority_min - return minimum RT priority.
4578 * @policy: scheduling class.
4580 * this syscall returns the minimum rt_priority that can be used
4581 * by a given scheduling class.
4583 asmlinkage long sys_sched_get_priority_min(int policy)
4585 int ret = -EINVAL;
4587 switch (policy) {
4588 case SCHED_FIFO:
4589 case SCHED_RR:
4590 ret = 1;
4591 break;
4592 case SCHED_NORMAL:
4593 case SCHED_BATCH:
4594 case SCHED_IDLE:
4595 ret = 0;
4597 return ret;
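/*
 * Usage sketch (illustrative): user space should query these limits instead
 * of hard-coding the RT priority range:
 *
 *	int lo = sched_get_priority_min(SCHED_FIFO);
 *	int hi = sched_get_priority_max(SCHED_FIFO);
 *	struct sched_param sp = { .sched_priority = lo + (hi - lo) / 2 };
 */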
4601 * sys_sched_rr_get_interval - return the default timeslice of a process.
4602 * @pid: pid of the process.
4603 * @interval: userspace pointer to the timeslice value.
4605 * this syscall writes the default timeslice value of a given process
4606 * into the user-space timespec buffer. A value of '0' means infinity.
4608 asmlinkage
4609 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4611 struct task_struct *p;
4612 int retval = -EINVAL;
4613 struct timespec t;
4615 if (pid < 0)
4616 goto out_nounlock;
4618 retval = -ESRCH;
4619 read_lock(&tasklist_lock);
4620 p = find_process_by_pid(pid);
4621 if (!p)
4622 goto out_unlock;
4624 retval = security_task_getscheduler(p);
4625 if (retval)
4626 goto out_unlock;
4628 jiffies_to_timespec(p->policy == SCHED_FIFO ?
4629 0 : static_prio_timeslice(p->static_prio), &t);
4630 read_unlock(&tasklist_lock);
4631 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4632 out_nounlock:
4633 return retval;
4634 out_unlock:
4635 read_unlock(&tasklist_lock);
4636 return retval;
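/*
 * Usage sketch (illustrative): querying the timeslice of the calling process
 * from user space:
 *
 *	struct timespec ts;
 *
 *	if (sched_rr_get_interval(0, &ts) == 0)
 *		printf("timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
 */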
4639 static const char stat_nam[] = "RSDTtZX";
4641 static void show_task(struct task_struct *p)
4643 unsigned long free = 0;
4644 unsigned state;
4646 state = p->state ? __ffs(p->state) + 1 : 0;
4647 printk("%-13.13s %c", p->comm,
4648 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4649 #if (BITS_PER_LONG == 32)
4650 if (state == TASK_RUNNING)
4651 printk(" running ");
4652 else
4653 printk(" %08lX ", thread_saved_pc(p));
4654 #else
4655 if (state == TASK_RUNNING)
4656 printk(" running task ");
4657 else
4658 printk(" %016lx ", thread_saved_pc(p));
4659 #endif
4660 #ifdef CONFIG_DEBUG_STACK_USAGE
4662 unsigned long *n = end_of_stack(p);
4663 while (!*n)
4664 n++;
4665 free = (unsigned long)n - (unsigned long)end_of_stack(p);
4667 #endif
4668 printk("%5lu %5d %6d", free, p->pid, p->parent->pid);
4669 if (!p->mm)
4670 printk(" (L-TLB)\n");
4671 else
4672 printk(" (NOTLB)\n");
4674 if (state != TASK_RUNNING)
4675 show_stack(p, NULL);
4678 void show_state_filter(unsigned long state_filter)
4680 struct task_struct *g, *p;
4682 #if (BITS_PER_LONG == 32)
4683 printk("\n"
4684 " free sibling\n");
4685 printk(" task PC stack pid father child younger older\n");
4686 #else
4687 printk("\n"
4688 " free sibling\n");
4689 printk(" task PC stack pid father child younger older\n");
4690 #endif
4691 read_lock(&tasklist_lock);
4692 do_each_thread(g, p) {
4694 * reset the NMI-timeout, listing all tasks on a slow
4695 * console might take a lot of time:
4697 touch_nmi_watchdog();
4698 if (!state_filter || (p->state & state_filter))
4699 show_task(p);
4700 } while_each_thread(g, p);
4702 touch_all_softlockup_watchdogs();
4704 #ifdef CONFIG_SCHED_DEBUG
4705 sysrq_sched_debug_show();
4706 #endif
4707 read_unlock(&tasklist_lock);
4709 * Only show locks if all tasks are dumped:
4711 if (state_filter == -1)
4712 debug_show_all_locks();
4715 void __cpuinit init_idle_bootup_task(struct task_struct *idle)
4717 idle->sched_class = &idle_sched_class;
4721 * init_idle - set up an idle thread for a given CPU
4722 * @idle: task in question
4723 * @cpu: cpu the idle task belongs to
4725 * NOTE: this function does not set the idle thread's NEED_RESCHED
4726 * flag, to make booting more robust.
4728 void __cpuinit init_idle(struct task_struct *idle, int cpu)
4730 struct rq *rq = cpu_rq(cpu);
4731 unsigned long flags;
4733 __sched_fork(idle);
4734 idle->se.exec_start = sched_clock();
4736 idle->prio = idle->normal_prio = MAX_PRIO;
4737 idle->cpus_allowed = cpumask_of_cpu(cpu);
4738 __set_task_cpu(idle, cpu);
4740 spin_lock_irqsave(&rq->lock, flags);
4741 rq->curr = rq->idle = idle;
4742 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
4743 idle->oncpu = 1;
4744 #endif
4745 spin_unlock_irqrestore(&rq->lock, flags);
4747 /* Set the preempt count _outside_ the spinlocks! */
4748 #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
4749 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
4750 #else
4751 task_thread_info(idle)->preempt_count = 0;
4752 #endif
4754 * The idle tasks have their own, simple scheduling class:
4756 idle->sched_class = &idle_sched_class;
4760 * In a system that switches off the HZ timer nohz_cpu_mask
4761 * indicates which cpus entered this state. This is used
4762 * in the rcu update to wait only for active cpus. For systems
4763 * which do not switch off the HZ timer, nohz_cpu_mask should
4764 * always be CPU_MASK_NONE.
4766 cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4769 * Increase the granularity value when there are more CPUs,
4770 * because with more CPUs the 'effective latency' as visible
4771 * to users decreases. But the relationship is not linear,
4772 * so pick a second-best guess by going with the log2 of the
4773 * number of CPUs.
4775 * This idea comes from the SD scheduler of Con Kolivas:
4777 static inline void sched_init_granularity(void)
4779 unsigned int factor = 1 + ilog2(num_online_cpus());
4780 const unsigned long gran_limit = 10000000;
4782 sysctl_sched_granularity *= factor;
4783 if (sysctl_sched_granularity > gran_limit)
4784 sysctl_sched_granularity = gran_limit;
4786 sysctl_sched_runtime_limit = sysctl_sched_granularity * 4;
4787 sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
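/*
 * Worked example (illustrative): with 16 CPUs online, factor is
 * 1 + ilog2(16) = 5, so the base granularity is multiplied by 5 and then
 * clipped to gran_limit (10 ms); the runtime limit becomes four times and
 * the wakeup granularity half of the resulting granularity.
 */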
4790 #ifdef CONFIG_SMP
4792 * This is how migration works:
4794 * 1) we queue a struct migration_req structure in the source CPU's
4795 * runqueue and wake up that CPU's migration thread.
4796 * 2) we down() the locked semaphore => thread blocks.
4797 * 3) migration thread wakes up (implicitly it forces the migrated
4798 * thread off the CPU)
4799 * 4) it gets the migration request and checks whether the migrated
4800 * task is still in the wrong runqueue.
4801 * 5) if it's in the wrong runqueue then the migration thread removes
4802 * it and puts it into the right queue.
4803 * 6) migration thread up()s the semaphore.
4804 * 7) we wake up and the migration is done.
4808 * Change a given task's CPU affinity. Migrate the thread to a
4809 * proper CPU and schedule it away if the CPU it's executing on
4810 * is removed from the allowed bitmask.
4812 * NOTE: the caller must have a valid reference to the task, the
4813 * task must not exit() & deallocate itself prematurely. The
4814 * call is not atomic; no spinlocks may be held.
4816 int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
4818 struct migration_req req;
4819 unsigned long flags;
4820 struct rq *rq;
4821 int ret = 0;
4823 rq = task_rq_lock(p, &flags);
4824 if (!cpus_intersects(new_mask, cpu_online_map)) {
4825 ret = -EINVAL;
4826 goto out;
4829 p->cpus_allowed = new_mask;
4830 /* Can the task run on the task's current CPU? If so, we're done */
4831 if (cpu_isset(task_cpu(p), new_mask))
4832 goto out;
4834 if (migrate_task(p, any_online_cpu(new_mask), &req)) {
4835 /* Need help from migration thread: drop lock and wait. */
4836 task_rq_unlock(rq, &flags);
4837 wake_up_process(rq->migration_thread);
4838 wait_for_completion(&req.done);
4839 tlb_migrate_finish(p->mm);
4840 return 0;
4842 out:
4843 task_rq_unlock(rq, &flags);
4845 return ret;
4847 EXPORT_SYMBOL_GPL(set_cpus_allowed);
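/*
 * Usage sketch (illustrative): a kernel thread restricting itself to a single
 * CPU (target_cpu is hypothetical); the call fails with -EINVAL if no CPU in
 * the new mask is online:
 *
 *	if (set_cpus_allowed(current, cpumask_of_cpu(target_cpu)) < 0)
 *		goto fail;
 */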
4850 * Move (not current) task off this cpu, onto dest cpu. We're doing
4851 * this because either it can't run here any more (it was set_cpus_allowed()
4852 * away from this CPU, or its CPU is going down), or because we're
4853 * attempting to rebalance this task on exec (sched_exec).
4855 * So we race with normal scheduler movements, but that's OK, as long
4856 * as the task is no longer on this CPU.
4858 * Returns non-zero if task was successfully migrated.
4860 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4862 struct rq *rq_dest, *rq_src;
4863 int ret = 0, on_rq;
4865 if (unlikely(cpu_is_offline(dest_cpu)))
4866 return ret;
4868 rq_src = cpu_rq(src_cpu);
4869 rq_dest = cpu_rq(dest_cpu);
4871 double_rq_lock(rq_src, rq_dest);
4872 /* Already moved. */
4873 if (task_cpu(p) != src_cpu)
4874 goto out;
4875 /* Affinity changed (again). */
4876 if (!cpu_isset(dest_cpu, p->cpus_allowed))
4877 goto out;
4879 on_rq = p->se.on_rq;
4880 if (on_rq)
4881 deactivate_task(rq_src, p, 0);
4882 set_task_cpu(p, dest_cpu);
4883 if (on_rq) {
4884 activate_task(rq_dest, p, 0);
4885 check_preempt_curr(rq_dest, p);
4887 ret = 1;
4888 out:
4889 double_rq_unlock(rq_src, rq_dest);
4890 return ret;
4894 * migration_thread - this is a highprio system thread that performs
4895 * thread migration by bumping thread off CPU then 'pushing' onto
4896 * another runqueue.
4898 static int migration_thread(void *data)
4900 int cpu = (long)data;
4901 struct rq *rq;
4903 rq = cpu_rq(cpu);
4904 BUG_ON(rq->migration_thread != current);
4906 set_current_state(TASK_INTERRUPTIBLE);
4907 while (!kthread_should_stop()) {
4908 struct migration_req *req;
4909 struct list_head *head;
4911 try_to_freeze();
4913 spin_lock_irq(&rq->lock);
4915 if (cpu_is_offline(cpu)) {
4916 spin_unlock_irq(&rq->lock);
4917 goto wait_to_die;
4920 if (rq->active_balance) {
4921 active_load_balance(rq, cpu);
4922 rq->active_balance = 0;
4925 head = &rq->migration_queue;
4927 if (list_empty(head)) {
4928 spin_unlock_irq(&rq->lock);
4929 schedule();
4930 set_current_state(TASK_INTERRUPTIBLE);
4931 continue;
4933 req = list_entry(head->next, struct migration_req, list);
4934 list_del_init(head->next);
4936 spin_unlock(&rq->lock);
4937 __migrate_task(req->task, cpu, req->dest_cpu);
4938 local_irq_enable();
4940 complete(&req->done);
4942 __set_current_state(TASK_RUNNING);
4943 return 0;
4945 wait_to_die:
4946 /* Wait for kthread_stop */
4947 set_current_state(TASK_INTERRUPTIBLE);
4948 while (!kthread_should_stop()) {
4949 schedule();
4950 set_current_state(TASK_INTERRUPTIBLE);
4952 __set_current_state(TASK_RUNNING);
4953 return 0;
4956 #ifdef CONFIG_HOTPLUG_CPU
4958 * Figure out where a task on a dead CPU should go; use force if necessary.
4959 * NOTE: interrupts should be disabled by the caller
4961 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
4963 unsigned long flags;
4964 cpumask_t mask;
4965 struct rq *rq;
4966 int dest_cpu;
4968 restart:
4969 /* On same node? */
4970 mask = node_to_cpumask(cpu_to_node(dead_cpu));
4971 cpus_and(mask, mask, p->cpus_allowed);
4972 dest_cpu = any_online_cpu(mask);
4974 /* On any allowed CPU? */
4975 if (dest_cpu == NR_CPUS)
4976 dest_cpu = any_online_cpu(p->cpus_allowed);
4978 /* No more Mr. Nice Guy. */
4979 if (dest_cpu == NR_CPUS) {
4980 rq = task_rq_lock(p, &flags);
4981 cpus_setall(p->cpus_allowed);
4982 dest_cpu = any_online_cpu(p->cpus_allowed);
4983 task_rq_unlock(rq, &flags);
4986 * Don't tell them about moving exiting tasks or
4987 * kernel threads (both mm NULL), since they never
4988 * leave the kernel.
4990 if (p->mm && printk_ratelimit())
4991 printk(KERN_INFO "process %d (%s) no "
4992 "longer affine to cpu%d\n",
4993 p->pid, p->comm, dead_cpu);
4995 if (!__migrate_task(p, dead_cpu, dest_cpu))
4996 goto restart;
5000 * While a dead CPU has no uninterruptible tasks queued at this point,
5001 * it might still have a nonzero ->nr_uninterruptible counter, because
5002 * for performance reasons the counter is not strictly tracking tasks to
5003 * their home CPUs. So we just add the counter to another CPU's counter,
5004 * to keep the global sum constant after CPU-down:
5006 static void migrate_nr_uninterruptible(struct rq *rq_src)
5008 struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
5009 unsigned long flags;
5011 local_irq_save(flags);
5012 double_rq_lock(rq_src, rq_dest);
5013 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5014 rq_src->nr_uninterruptible = 0;
5015 double_rq_unlock(rq_src, rq_dest);
5016 local_irq_restore(flags);
5019 /* Run through task list and migrate tasks from the dead cpu. */
5020 static void migrate_live_tasks(int src_cpu)
5022 struct task_struct *p, *t;
5024 write_lock_irq(&tasklist_lock);
5026 do_each_thread(t, p) {
5027 if (p == current)
5028 continue;
5030 if (task_cpu(p) == src_cpu)
5031 move_task_off_dead_cpu(src_cpu, p);
5032 } while_each_thread(t, p);
5034 write_unlock_irq(&tasklist_lock);
5038 * Schedules idle task to be the next runnable task on current CPU.
5039 * It does so by boosting its priority to highest possible and adding it to
5040 * the _front_ of the runqueue. Used by CPU offline code.
5042 void sched_idle_next(void)
5044 int this_cpu = smp_processor_id();
5045 struct rq *rq = cpu_rq(this_cpu);
5046 struct task_struct *p = rq->idle;
5047 unsigned long flags;
5049 /* cpu has to be offline */
5050 BUG_ON(cpu_online(this_cpu));
5053 * Strictly not necessary since the rest of the CPUs are stopped by now
5054 * and interrupts disabled on the current cpu.
5056 spin_lock_irqsave(&rq->lock, flags);
5058 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5060 /* Add idle task to the _front_ of its priority queue: */
5061 activate_idle_task(p, rq);
5063 spin_unlock_irqrestore(&rq->lock, flags);
5067 * Ensures that the idle task is using init_mm right before its cpu goes
5068 * offline.
5070 void idle_task_exit(void)
5072 struct mm_struct *mm = current->active_mm;
5074 BUG_ON(cpu_online(smp_processor_id()));
5076 if (mm != &init_mm)
5077 switch_mm(mm, &init_mm, current);
5078 mmdrop(mm);
5081 /* called under rq->lock with disabled interrupts */
5082 static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5084 struct rq *rq = cpu_rq(dead_cpu);
5086 /* Must be exiting, otherwise would be on tasklist. */
5087 BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
5089 /* Cannot have done final schedule yet: would have vanished. */
5090 BUG_ON(p->state == TASK_DEAD);
5092 get_task_struct(p);
5095 * Drop lock around migration; if someone else moves it,
5096 * that's OK. No task can be added to this CPU, so iteration is
5097 * fine.
5098 * NOTE: interrupts should be left disabled --dev@
5100 spin_unlock(&rq->lock);
5101 move_task_off_dead_cpu(dead_cpu, p);
5102 spin_lock(&rq->lock);
5104 put_task_struct(p);
5107 /* release_task() removes task from tasklist, so we won't find dead tasks. */
5108 static void migrate_dead_tasks(unsigned int dead_cpu)
5110 struct rq *rq = cpu_rq(dead_cpu);
5111 struct task_struct *next;
5113 for ( ; ; ) {
5114 if (!rq->nr_running)
5115 break;
5116 next = pick_next_task(rq, rq->curr, rq_clock(rq));
5117 if (!next)
5118 break;
5119 migrate_dead(dead_cpu, next);
5122 #endif /* CONFIG_HOTPLUG_CPU */
5125 * migration_call - callback that gets triggered when a CPU is added.
5126 * Here we can start up the necessary migration thread for the new CPU.
5128 static int __cpuinit
5129 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5131 struct task_struct *p;
5132 int cpu = (long)hcpu;
5133 unsigned long flags;
5134 struct rq *rq;
5136 switch (action) {
5137 case CPU_LOCK_ACQUIRE:
5138 mutex_lock(&sched_hotcpu_mutex);
5139 break;
5141 case CPU_UP_PREPARE:
5142 case CPU_UP_PREPARE_FROZEN:
5143 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
5144 if (IS_ERR(p))
5145 return NOTIFY_BAD;
5146 p->flags |= PF_NOFREEZE;
5147 kthread_bind(p, cpu);
5148 /* Must be high prio: stop_machine expects to yield to it. */
5149 rq = task_rq_lock(p, &flags);
5150 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5151 task_rq_unlock(rq, &flags);
5152 cpu_rq(cpu)->migration_thread = p;
5153 break;
5155 case CPU_ONLINE:
5156 case CPU_ONLINE_FROZEN:
5157 /* Strictly unnecessary, as the first user will wake it. */
5158 wake_up_process(cpu_rq(cpu)->migration_thread);
5159 break;
5161 #ifdef CONFIG_HOTPLUG_CPU
5162 case CPU_UP_CANCELED:
5163 case CPU_UP_CANCELED_FROZEN:
5164 if (!cpu_rq(cpu)->migration_thread)
5165 break;
5166 /* Unbind it from offline cpu so it can run. Fall thru. */
5167 kthread_bind(cpu_rq(cpu)->migration_thread,
5168 any_online_cpu(cpu_online_map));
5169 kthread_stop(cpu_rq(cpu)->migration_thread);
5170 cpu_rq(cpu)->migration_thread = NULL;
5171 break;
5173 case CPU_DEAD:
5174 case CPU_DEAD_FROZEN:
5175 migrate_live_tasks(cpu);
5176 rq = cpu_rq(cpu);
5177 kthread_stop(rq->migration_thread);
5178 rq->migration_thread = NULL;
5179 /* Idle task back to normal (off runqueue, low prio) */
5180 rq = task_rq_lock(rq->idle, &flags);
5181 deactivate_task(rq, rq->idle, 0);
5182 rq->idle->static_prio = MAX_PRIO;
5183 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5184 rq->idle->sched_class = &idle_sched_class;
5185 migrate_dead_tasks(cpu);
5186 task_rq_unlock(rq, &flags);
5187 migrate_nr_uninterruptible(rq);
5188 BUG_ON(rq->nr_running != 0);
5190 /* No need to migrate the tasks: it was best-effort if
5191 * they didn't take sched_hotcpu_mutex. Just wake up
5192 * the requestors. */
5193 spin_lock_irq(&rq->lock);
5194 while (!list_empty(&rq->migration_queue)) {
5195 struct migration_req *req;
5197 req = list_entry(rq->migration_queue.next,
5198 struct migration_req, list);
5199 list_del_init(&req->list);
5200 complete(&req->done);
5202 spin_unlock_irq(&rq->lock);
5203 break;
5204 #endif
5205 case CPU_LOCK_RELEASE:
5206 mutex_unlock(&sched_hotcpu_mutex);
5207 break;
5209 return NOTIFY_OK;
5212 /* Register at highest priority so that task migration (migrate_all_tasks)
5213 * happens before everything else.
5215 static struct notifier_block __cpuinitdata migration_notifier = {
5216 .notifier_call = migration_call,
5217 .priority = 10
5220 int __init migration_init(void)
5222 void *cpu = (void *)(long)smp_processor_id();
5223 int err;
5225 /* Start one for the boot CPU: */
5226 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5227 BUG_ON(err == NOTIFY_BAD);
5228 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5229 register_cpu_notifier(&migration_notifier);
5231 return 0;
5233 #endif
5235 #ifdef CONFIG_SMP
5237 /* Number of possible processor ids */
5238 int nr_cpu_ids __read_mostly = NR_CPUS;
5239 EXPORT_SYMBOL(nr_cpu_ids);
5241 #undef SCHED_DOMAIN_DEBUG
5242 #ifdef SCHED_DOMAIN_DEBUG
5243 static void sched_domain_debug(struct sched_domain *sd, int cpu)
5245 int level = 0;
5247 if (!sd) {
5248 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5249 return;
5252 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5254 do {
5255 int i;
5256 char str[NR_CPUS];
5257 struct sched_group *group = sd->groups;
5258 cpumask_t groupmask;
5260 cpumask_scnprintf(str, NR_CPUS, sd->span);
5261 cpus_clear(groupmask);
5263 printk(KERN_DEBUG);
5264 for (i = 0; i < level + 1; i++)
5265 printk(" ");
5266 printk("domain %d: ", level);
5268 if (!(sd->flags & SD_LOAD_BALANCE)) {
5269 printk("does not load-balance\n");
5270 if (sd->parent)
5271 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5272 " has parent");
5273 break;
5276 printk("span %s\n", str);
5278 if (!cpu_isset(cpu, sd->span))
5279 printk(KERN_ERR "ERROR: domain->span does not contain "
5280 "CPU%d\n", cpu);
5281 if (!cpu_isset(cpu, group->cpumask))
5282 printk(KERN_ERR "ERROR: domain->groups does not contain"
5283 " CPU%d\n", cpu);
5285 printk(KERN_DEBUG);
5286 for (i = 0; i < level + 2; i++)
5287 printk(" ");
5288 printk("groups:");
5289 do {
5290 if (!group) {
5291 printk("\n");
5292 printk(KERN_ERR "ERROR: group is NULL\n");
5293 break;
5296 if (!group->__cpu_power) {
5297 printk("\n");
5298 printk(KERN_ERR "ERROR: domain->cpu_power not "
5299 "set\n");
5302 if (!cpus_weight(group->cpumask)) {
5303 printk("\n");
5304 printk(KERN_ERR "ERROR: empty group\n");
5307 if (cpus_intersects(groupmask, group->cpumask)) {
5308 printk("\n");
5309 printk(KERN_ERR "ERROR: repeated CPUs\n");
5312 cpus_or(groupmask, groupmask, group->cpumask);
5314 cpumask_scnprintf(str, NR_CPUS, group->cpumask);
5315 printk(" %s", str);
5317 group = group->next;
5318 } while (group != sd->groups);
5319 printk("\n");
5321 if (!cpus_equal(sd->span, groupmask))
5322 printk(KERN_ERR "ERROR: groups don't span "
5323 "domain->span\n");
5325 level++;
5326 sd = sd->parent;
5327 if (!sd)
5328 continue;
5330 if (!cpus_subset(groupmask, sd->span))
5331 printk(KERN_ERR "ERROR: parent span is not a superset "
5332 "of domain->span\n");
5334 } while (sd);
5336 #else
5337 # define sched_domain_debug(sd, cpu) do { } while (0)
5338 #endif
5340 static int sd_degenerate(struct sched_domain *sd)
5342 if (cpus_weight(sd->span) == 1)
5343 return 1;
5345 /* Following flags need at least 2 groups */
5346 if (sd->flags & (SD_LOAD_BALANCE |
5347 SD_BALANCE_NEWIDLE |
5348 SD_BALANCE_FORK |
5349 SD_BALANCE_EXEC |
5350 SD_SHARE_CPUPOWER |
5351 SD_SHARE_PKG_RESOURCES)) {
5352 if (sd->groups != sd->groups->next)
5353 return 0;
5356 /* Following flags don't use groups */
5357 if (sd->flags & (SD_WAKE_IDLE |
5358 SD_WAKE_AFFINE |
5359 SD_WAKE_BALANCE))
5360 return 0;
5362 return 1;
5365 static int
5366 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5368 unsigned long cflags = sd->flags, pflags = parent->flags;
5370 if (sd_degenerate(parent))
5371 return 1;
5373 if (!cpus_equal(sd->span, parent->span))
5374 return 0;
5376 /* Does parent contain flags not in child? */
5377 /* WAKE_BALANCE is a subset of WAKE_AFFINE */
5378 if (cflags & SD_WAKE_AFFINE)
5379 pflags &= ~SD_WAKE_BALANCE;
5380 /* Flags needing groups don't count if only 1 group in parent */
5381 if (parent->groups == parent->groups->next) {
5382 pflags &= ~(SD_LOAD_BALANCE |
5383 SD_BALANCE_NEWIDLE |
5384 SD_BALANCE_FORK |
5385 SD_BALANCE_EXEC |
5386 SD_SHARE_CPUPOWER |
5387 SD_SHARE_PKG_RESOURCES);
5389 if (~cflags & pflags)
5390 return 0;
5392 return 1;
5396 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
5397 * hold the hotplug lock.
5399 static void cpu_attach_domain(struct sched_domain *sd, int cpu)
5401 struct rq *rq = cpu_rq(cpu);
5402 struct sched_domain *tmp;
5404 /* Remove the sched domains which do not contribute to scheduling. */
5405 for (tmp = sd; tmp; tmp = tmp->parent) {
5406 struct sched_domain *parent = tmp->parent;
5407 if (!parent)
5408 break;
5409 if (sd_parent_degenerate(tmp, parent)) {
5410 tmp->parent = parent->parent;
5411 if (parent->parent)
5412 parent->parent->child = tmp;
5416 if (sd && sd_degenerate(sd)) {
5417 sd = sd->parent;
5418 if (sd)
5419 sd->child = NULL;
5422 sched_domain_debug(sd, cpu);
5424 rcu_assign_pointer(rq->sd, sd);
5427 /* cpus with isolated domains */
5428 static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
5430 /* Setup the mask of cpus configured for isolated domains */
5431 static int __init isolated_cpu_setup(char *str)
5433 int ints[NR_CPUS], i;
5435 str = get_options(str, ARRAY_SIZE(ints), ints);
5436 cpus_clear(cpu_isolated_map);
5437 for (i = 1; i <= ints[0]; i++)
5438 if (ints[i] < NR_CPUS)
5439 cpu_set(ints[i], cpu_isolated_map);
5440 return 1;
5443 __setup ("isolcpus=", isolated_cpu_setup);
5446 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
5447 * to a function which identifies what group (along with sched group) a CPU
5448 * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS
5449 * (due to the fact that we keep track of groups covered with a cpumask_t).
5451 * init_sched_build_groups will build a circular linked list of the groups
5452 * covered by the given span, and will set each group's ->cpumask correctly,
5453 * and ->cpu_power to 0.
5455 static void
5456 init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5457 int (*group_fn)(int cpu, const cpumask_t *cpu_map,
5458 struct sched_group **sg))
5460 struct sched_group *first = NULL, *last = NULL;
5461 cpumask_t covered = CPU_MASK_NONE;
5462 int i;
5464 for_each_cpu_mask(i, span) {
5465 struct sched_group *sg;
5466 int group = group_fn(i, cpu_map, &sg);
5467 int j;
5469 if (cpu_isset(i, covered))
5470 continue;
5472 sg->cpumask = CPU_MASK_NONE;
5473 sg->__cpu_power = 0;
5475 for_each_cpu_mask(j, span) {
5476 if (group_fn(j, cpu_map, NULL) != group)
5477 continue;
5479 cpu_set(j, covered);
5480 cpu_set(j, sg->cpumask);
5482 if (!first)
5483 first = sg;
5484 if (last)
5485 last->next = sg;
5486 last = sg;
5488 last->next = first;
5491 #define SD_NODES_PER_DOMAIN 16
5493 #ifdef CONFIG_NUMA
5496 * find_next_best_node - find the next node to include in a sched_domain
5497 * @node: node whose sched_domain we're building
5498 * @used_nodes: nodes already in the sched_domain
5500 * Find the next node to include in a given scheduling domain. Simply
5501 * finds the closest node not already in the @used_nodes map.
5503 * Should use nodemask_t.
5505 static int find_next_best_node(int node, unsigned long *used_nodes)
5507 int i, n, val, min_val, best_node = 0;
5509 min_val = INT_MAX;
5511 for (i = 0; i < MAX_NUMNODES; i++) {
5512 /* Start at @node */
5513 n = (node + i) % MAX_NUMNODES;
5515 if (!nr_cpus_node(n))
5516 continue;
5518 /* Skip already used nodes */
5519 if (test_bit(n, used_nodes))
5520 continue;
5522 /* Simple min distance search */
5523 val = node_distance(node, n);
5525 if (val < min_val) {
5526 min_val = val;
5527 best_node = n;
5531 set_bit(best_node, used_nodes);
5532 return best_node;
5536 * sched_domain_node_span - get a cpumask for a node's sched_domain
5537 * @node: node whose cpumask we're constructing
5538 * @size: number of nodes to include in this span
5540 * Given a node, construct a good cpumask for its sched_domain to span. It
5541 * should be one that prevents unnecessary balancing, but also spreads tasks
5542 * out optimally.
5544 static cpumask_t sched_domain_node_span(int node)
5546 DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
5547 cpumask_t span, nodemask;
5548 int i;
5550 cpus_clear(span);
5551 bitmap_zero(used_nodes, MAX_NUMNODES);
5553 nodemask = node_to_cpumask(node);
5554 cpus_or(span, span, nodemask);
5555 set_bit(node, used_nodes);
5557 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5558 int next_node = find_next_best_node(node, used_nodes);
5560 nodemask = node_to_cpumask(next_node);
5561 cpus_or(span, span, nodemask);
5564 return span;
5566 #endif
5568 int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
5571 * SMT sched-domains:
5573 #ifdef CONFIG_SCHED_SMT
5574 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
5575 static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
5577 static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map,
5578 struct sched_group **sg)
5580 if (sg)
5581 *sg = &per_cpu(sched_group_cpus, cpu);
5582 return cpu;
5584 #endif
5587 * multi-core sched-domains:
5589 #ifdef CONFIG_SCHED_MC
5590 static DEFINE_PER_CPU(struct sched_domain, core_domains);
5591 static DEFINE_PER_CPU(struct sched_group, sched_group_core);
5592 #endif
5594 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
5595 static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5596 struct sched_group **sg)
5598 int group;
5599 cpumask_t mask = cpu_sibling_map[cpu];
5600 cpus_and(mask, mask, *cpu_map);
5601 group = first_cpu(mask);
5602 if (sg)
5603 *sg = &per_cpu(sched_group_core, group);
5604 return group;
5606 #elif defined(CONFIG_SCHED_MC)
5607 static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5608 struct sched_group **sg)
5610 if (sg)
5611 *sg = &per_cpu(sched_group_core, cpu);
5612 return cpu;
5614 #endif
5616 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
5617 static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
5619 static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map,
5620 struct sched_group **sg)
5622 int group;
5623 #ifdef CONFIG_SCHED_MC
5624 cpumask_t mask = cpu_coregroup_map(cpu);
5625 cpus_and(mask, mask, *cpu_map);
5626 group = first_cpu(mask);
5627 #elif defined(CONFIG_SCHED_SMT)
5628 cpumask_t mask = cpu_sibling_map[cpu];
5629 cpus_and(mask, mask, *cpu_map);
5630 group = first_cpu(mask);
5631 #else
5632 group = cpu;
5633 #endif
5634 if (sg)
5635 *sg = &per_cpu(sched_group_phys, group);
5636 return group;
5639 #ifdef CONFIG_NUMA
5641 * The init_sched_build_groups can't handle what we want to do with node
5642 * groups, so roll our own. Now each node has its own list of groups which
5643 * gets dynamically allocated.
5645 static DEFINE_PER_CPU(struct sched_domain, node_domains);
5646 static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
5648 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
5649 static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
5651 static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
5652 struct sched_group **sg)
5654 cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
5655 int group;
5657 cpus_and(nodemask, nodemask, *cpu_map);
5658 group = first_cpu(nodemask);
5660 if (sg)
5661 *sg = &per_cpu(sched_group_allnodes, group);
5662 return group;
5665 static void init_numa_sched_groups_power(struct sched_group *group_head)
5667 struct sched_group *sg = group_head;
5668 int j;
5670 if (!sg)
5671 return;
5672 next_sg:
5673 for_each_cpu_mask(j, sg->cpumask) {
5674 struct sched_domain *sd;
5676 sd = &per_cpu(phys_domains, j);
5677 if (j != first_cpu(sd->groups->cpumask)) {
5679 * Only add "power" once for each
5680 * physical package.
5682 continue;
5685 sg_inc_cpu_power(sg, sd->groups->__cpu_power);
5687 sg = sg->next;
5688 if (sg != group_head)
5689 goto next_sg;
5691 #endif
5693 #ifdef CONFIG_NUMA
5694 /* Free memory allocated for various sched_group structures */
5695 static void free_sched_groups(const cpumask_t *cpu_map)
5697 int cpu, i;
5699 for_each_cpu_mask(cpu, *cpu_map) {
5700 struct sched_group **sched_group_nodes
5701 = sched_group_nodes_bycpu[cpu];
5703 if (!sched_group_nodes)
5704 continue;
5706 for (i = 0; i < MAX_NUMNODES; i++) {
5707 cpumask_t nodemask = node_to_cpumask(i);
5708 struct sched_group *oldsg, *sg = sched_group_nodes[i];
5710 cpus_and(nodemask, nodemask, *cpu_map);
5711 if (cpus_empty(nodemask))
5712 continue;
5714 if (sg == NULL)
5715 continue;
5716 sg = sg->next;
5717 next_sg:
5718 oldsg = sg;
5719 sg = sg->next;
5720 kfree(oldsg);
5721 if (oldsg != sched_group_nodes[i])
5722 goto next_sg;
5724 kfree(sched_group_nodes);
5725 sched_group_nodes_bycpu[cpu] = NULL;
5728 #else
5729 static void free_sched_groups(const cpumask_t *cpu_map)
5732 #endif
5735 * Initialize sched groups cpu_power.
5737 * cpu_power indicates the capacity of sched group, which is used while
5738 * distributing the load between different sched groups in a sched domain.
5739 * Typically cpu_power for all the groups in a sched domain will be the same
5740 * unless there are asymmetries in the topology. If there are asymmetries, a
5741 * group with more cpu_power will pick up more load than a group with
5742 * less cpu_power.
5744 * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
5745 * the maximum number of tasks a group can handle in the presence of other idle
5746 * or lightly loaded groups in the same sched domain.
5748 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5750 struct sched_domain *child;
5751 struct sched_group *group;
5753 WARN_ON(!sd || !sd->groups);
5755 if (cpu != first_cpu(sd->groups->cpumask))
5756 return;
5758 child = sd->child;
5760 sd->groups->__cpu_power = 0;
5763 * For the performance policy, if the groups in the child domain share
5764 * resources (for example cores sharing some portions of the cache hierarchy
5765 * or SMT), then set this domain's group cpu_power such that each group
5766 * can handle only one task when there are other idle groups in the
5767 * same sched domain.
5769 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
5770 (child->flags &
5771 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
5772 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
5773 return;
5777 * add the cpu_power of each child group to this group's cpu_power
5779 group = child->groups;
5780 do {
5781 sg_inc_cpu_power(sd->groups, group->__cpu_power);
5782 group = group->next;
5783 } while (group != child->groups);
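/*
 * Worked example (illustrative): with the default performance policy, a
 * physical-package group whose child domain is an SMT domain (which sets
 * SD_SHARE_CPUPOWER) gets exactly SCHED_LOAD_SCALE, i.e. one task's worth
 * of capacity, instead of the sum of its sibling groups.  If
 * SD_POWERSAVINGS_BALANCE is set, or the child groups do not share
 * resources, the child group powers are summed instead.
 */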
5787 * Build sched domains for a given set of cpus and attach the sched domains
5788 * to the individual cpus
5790 static int build_sched_domains(const cpumask_t *cpu_map)
5792 int i;
5793 #ifdef CONFIG_NUMA
5794 struct sched_group **sched_group_nodes = NULL;
5795 int sd_allnodes = 0;
5798 * Allocate the per-node list of sched groups
5800 sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES,
5801 GFP_KERNEL);
5802 if (!sched_group_nodes) {
5803 printk(KERN_WARNING "Can not alloc sched group node list\n");
5804 return -ENOMEM;
5806 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
5807 #endif
5810 * Set up domains for cpus specified by the cpu_map.
5812 for_each_cpu_mask(i, *cpu_map) {
5813 struct sched_domain *sd = NULL, *p;
5814 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
5816 cpus_and(nodemask, nodemask, *cpu_map);
5818 #ifdef CONFIG_NUMA
5819 if (cpus_weight(*cpu_map) >
5820 SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
5821 sd = &per_cpu(allnodes_domains, i);
5822 *sd = SD_ALLNODES_INIT;
5823 sd->span = *cpu_map;
5824 cpu_to_allnodes_group(i, cpu_map, &sd->groups);
5825 p = sd;
5826 sd_allnodes = 1;
5827 } else
5828 p = NULL;
5830 sd = &per_cpu(node_domains, i);
5831 *sd = SD_NODE_INIT;
5832 sd->span = sched_domain_node_span(cpu_to_node(i));
5833 sd->parent = p;
5834 if (p)
5835 p->child = sd;
5836 cpus_and(sd->span, sd->span, *cpu_map);
5837 #endif
5839 p = sd;
5840 sd = &per_cpu(phys_domains, i);
5841 *sd = SD_CPU_INIT;
5842 sd->span = nodemask;
5843 sd->parent = p;
5844 if (p)
5845 p->child = sd;
5846 cpu_to_phys_group(i, cpu_map, &sd->groups);
5848 #ifdef CONFIG_SCHED_MC
5849 p = sd;
5850 sd = &per_cpu(core_domains, i);
5851 *sd = SD_MC_INIT;
5852 sd->span = cpu_coregroup_map(i);
5853 cpus_and(sd->span, sd->span, *cpu_map);
5854 sd->parent = p;
5855 p->child = sd;
5856 cpu_to_core_group(i, cpu_map, &sd->groups);
5857 #endif
5859 #ifdef CONFIG_SCHED_SMT
5860 p = sd;
5861 sd = &per_cpu(cpu_domains, i);
5862 *sd = SD_SIBLING_INIT;
5863 sd->span = cpu_sibling_map[i];
5864 cpus_and(sd->span, sd->span, *cpu_map);
5865 sd->parent = p;
5866 p->child = sd;
5867 cpu_to_cpu_group(i, cpu_map, &sd->groups);
5868 #endif
5871 #ifdef CONFIG_SCHED_SMT
5872 /* Set up CPU (sibling) groups */
5873 for_each_cpu_mask(i, *cpu_map) {
5874 cpumask_t this_sibling_map = cpu_sibling_map[i];
5875 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
5876 if (i != first_cpu(this_sibling_map))
5877 continue;
5879 init_sched_build_groups(this_sibling_map, cpu_map,
5880 &cpu_to_cpu_group);
5882 #endif
5884 #ifdef CONFIG_SCHED_MC
5885 /* Set up multi-core groups */
5886 for_each_cpu_mask(i, *cpu_map) {
5887 cpumask_t this_core_map = cpu_coregroup_map(i);
5888 cpus_and(this_core_map, this_core_map, *cpu_map);
5889 if (i != first_cpu(this_core_map))
5890 continue;
5891 init_sched_build_groups(this_core_map, cpu_map,
5892 &cpu_to_core_group);
5894 #endif
5896 /* Set up physical groups */
5897 for (i = 0; i < MAX_NUMNODES; i++) {
5898 cpumask_t nodemask = node_to_cpumask(i);
5900 cpus_and(nodemask, nodemask, *cpu_map);
5901 if (cpus_empty(nodemask))
5902 continue;
5904 init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
5907 #ifdef CONFIG_NUMA
5908 /* Set up node groups */
5909 if (sd_allnodes)
5910 init_sched_build_groups(*cpu_map, cpu_map,
5911 &cpu_to_allnodes_group);
5913 for (i = 0; i < MAX_NUMNODES; i++) {
5914 /* Set up node groups */
5915 struct sched_group *sg, *prev;
5916 cpumask_t nodemask = node_to_cpumask(i);
5917 cpumask_t domainspan;
5918 cpumask_t covered = CPU_MASK_NONE;
5919 int j;
5921 cpus_and(nodemask, nodemask, *cpu_map);
5922 if (cpus_empty(nodemask)) {
5923 sched_group_nodes[i] = NULL;
5924 continue;
5927 domainspan = sched_domain_node_span(i);
5928 cpus_and(domainspan, domainspan, *cpu_map);
5930 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
5931 if (!sg) {
5932 printk(KERN_WARNING "Can not alloc domain group for "
5933 "node %d\n", i);
5934 goto error;
5936 sched_group_nodes[i] = sg;
5937 for_each_cpu_mask(j, nodemask) {
5938 struct sched_domain *sd;
5939 sd = &per_cpu(node_domains, j);
5940 sd->groups = sg;
5942 sg->__cpu_power = 0;
5943 sg->cpumask = nodemask;
5944 sg->next = sg;
5945 cpus_or(covered, covered, nodemask);
5946 prev = sg;
5948 for (j = 0; j < MAX_NUMNODES; j++) {
5949 cpumask_t tmp, notcovered;
5950 int n = (i + j) % MAX_NUMNODES;
5952 cpus_complement(notcovered, covered);
5953 cpus_and(tmp, notcovered, *cpu_map);
5954 cpus_and(tmp, tmp, domainspan);
5955 if (cpus_empty(tmp))
5956 break;
5958 nodemask = node_to_cpumask(n);
5959 cpus_and(tmp, tmp, nodemask);
5960 if (cpus_empty(tmp))
5961 continue;
5963 sg = kmalloc_node(sizeof(struct sched_group),
5964 GFP_KERNEL, i);
5965 if (!sg) {
5966 printk(KERN_WARNING
5967 "Can not alloc domain group for node %d\n", j);
5968 goto error;
5970 sg->__cpu_power = 0;
5971 sg->cpumask = tmp;
5972 sg->next = prev->next;
5973 cpus_or(covered, covered, tmp);
5974 prev->next = sg;
5975 prev = sg;
5978 #endif
5980 /* Calculate CPU power for physical packages and nodes */
5981 #ifdef CONFIG_SCHED_SMT
5982 for_each_cpu_mask(i, *cpu_map) {
5983 struct sched_domain *sd = &per_cpu(cpu_domains, i);
5985 init_sched_groups_power(i, sd);
5987 #endif
5988 #ifdef CONFIG_SCHED_MC
5989 for_each_cpu_mask(i, *cpu_map) {
5990 struct sched_domain *sd = &per_cpu(core_domains, i);
5992 init_sched_groups_power(i, sd);
5994 #endif
5996 for_each_cpu_mask(i, *cpu_map) {
5997 struct sched_domain *sd = &per_cpu(phys_domains, i);
5999 init_sched_groups_power(i, sd);
6002 #ifdef CONFIG_NUMA
6003 for (i = 0; i < MAX_NUMNODES; i++)
6004 init_numa_sched_groups_power(sched_group_nodes[i]);
6006 if (sd_allnodes) {
6007 struct sched_group *sg;
6009 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg);
6010 init_numa_sched_groups_power(sg);
6012 #endif
6014 /* Attach the domains */
6015 for_each_cpu_mask(i, *cpu_map) {
6016 struct sched_domain *sd;
6017 #ifdef CONFIG_SCHED_SMT
6018 sd = &per_cpu(cpu_domains, i);
6019 #elif defined(CONFIG_SCHED_MC)
6020 sd = &per_cpu(core_domains, i);
6021 #else
6022 sd = &per_cpu(phys_domains, i);
6023 #endif
6024 cpu_attach_domain(sd, i);
6027 return 0;
6029 #ifdef CONFIG_NUMA
6030 error:
6031 free_sched_groups(cpu_map);
6032 return -ENOMEM;
6033 #endif
6036 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
6038 static int arch_init_sched_domains(const cpumask_t *cpu_map)
6040 cpumask_t cpu_default_map;
6041 int err;
6044 * Setup mask for cpus without special case scheduling requirements.
6045 * For now this just excludes isolated cpus, but could be used to
6046 * exclude other special cases in the future.
6048 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
6050 err = build_sched_domains(&cpu_default_map);
6052 return err;
6055 static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
6057 free_sched_groups(cpu_map);
6061 * Detach sched domains from a group of cpus specified in cpu_map
6062 * These cpus will now be attached to the NULL domain
6064 static void detach_destroy_domains(const cpumask_t *cpu_map)
6066 int i;
6068 for_each_cpu_mask(i, *cpu_map)
6069 cpu_attach_domain(NULL, i);
6070 synchronize_sched();
6071 arch_destroy_sched_domains(cpu_map);
6075 * Partition sched domains as specified by the cpumasks below.
6076 * This attaches all cpus from the cpumasks to the NULL domain,
6077 * waits for an RCU quiescent period, recalculates sched
6078 * domain information and then attaches them back to the
6079 * correct sched domains.
6080 * Call with hotplug lock held.
6082 int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
6084 cpumask_t change_map;
6085 int err = 0;
6087 cpus_and(*partition1, *partition1, cpu_online_map);
6088 cpus_and(*partition2, *partition2, cpu_online_map);
6089 cpus_or(change_map, *partition1, *partition2);
6091 /* Detach sched domains from all of the affected cpus */
6092 detach_destroy_domains(&change_map);
6093 if (!cpus_empty(*partition1))
6094 err = build_sched_domains(partition1);
6095 if (!err && !cpus_empty(*partition2))
6096 err = build_sched_domains(partition2);
6098 return err;
6101 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6102 int arch_reinit_sched_domains(void)
6104 int err;
6106 mutex_lock(&sched_hotcpu_mutex);
6107 detach_destroy_domains(&cpu_online_map);
6108 err = arch_init_sched_domains(&cpu_online_map);
6109 mutex_unlock(&sched_hotcpu_mutex);
6111 return err;
6114 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6116 int ret;
6118 if (buf[0] != '0' && buf[0] != '1')
6119 return -EINVAL;
6121 if (smt)
6122 sched_smt_power_savings = (buf[0] == '1');
6123 else
6124 sched_mc_power_savings = (buf[0] == '1');
6126 ret = arch_reinit_sched_domains();
6128 return ret ? ret : count;
6131 int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
6133 int err = 0;
6135 #ifdef CONFIG_SCHED_SMT
6136 if (smt_capable())
6137 err = sysfs_create_file(&cls->kset.kobj,
6138 &attr_sched_smt_power_savings.attr);
6139 #endif
6140 #ifdef CONFIG_SCHED_MC
6141 if (!err && mc_capable())
6142 err = sysfs_create_file(&cls->kset.kobj,
6143 &attr_sched_mc_power_savings.attr);
6144 #endif
6145 return err;
6147 #endif
6149 #ifdef CONFIG_SCHED_MC
6150 static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
6152 return sprintf(page, "%u\n", sched_mc_power_savings);
6154 static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
6155 const char *buf, size_t count)
6157 return sched_power_savings_store(buf, count, 0);
6159 SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
6160 sched_mc_power_savings_store);
6161 #endif
6163 #ifdef CONFIG_SCHED_SMT
6164 static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
6166 return sprintf(page, "%u\n", sched_smt_power_savings);
6168 static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
6169 const char *buf, size_t count)
6171 return sched_power_savings_store(buf, count, 1);
6173 SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
6174 sched_smt_power_savings_store);
6175 #endif
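/*
 * Usage sketch (illustrative): these attributes are expected to appear as
 * /sys/devices/system/cpu/sched_mc_power_savings and
 * /sys/devices/system/cpu/sched_smt_power_savings; writing '1' enables the
 * power-savings balancing policy, '0' disables it:
 *
 *	int fd = open("/sys/devices/system/cpu/sched_mc_power_savings", O_WRONLY);
 *
 *	if (fd >= 0) {
 *		write(fd, "1", 1);
 *		close(fd);
 *	}
 */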
6178 * Force a reinitialization of the sched domains hierarchy. The domains
6179 * and groups cannot be updated in place without racing with the balancing
6180 * code, so we temporarily attach all running cpus to the NULL domain
6181 * which will prevent rebalancing while the sched domains are recalculated.
6183 static int update_sched_domains(struct notifier_block *nfb,
6184 unsigned long action, void *hcpu)
6186 switch (action) {
6187 case CPU_UP_PREPARE:
6188 case CPU_UP_PREPARE_FROZEN:
6189 case CPU_DOWN_PREPARE:
6190 case CPU_DOWN_PREPARE_FROZEN:
6191 detach_destroy_domains(&cpu_online_map);
6192 return NOTIFY_OK;
6194 case CPU_UP_CANCELED:
6195 case CPU_UP_CANCELED_FROZEN:
6196 case CPU_DOWN_FAILED:
6197 case CPU_DOWN_FAILED_FROZEN:
6198 case CPU_ONLINE:
6199 case CPU_ONLINE_FROZEN:
6200 case CPU_DEAD:
6201 case CPU_DEAD_FROZEN:
6203 * Fall through and re-initialise the domains.
6205 break;
6206 default:
6207 return NOTIFY_DONE;
6210 /* The hotplug lock is already held by cpu_up/cpu_down */
6211 arch_init_sched_domains(&cpu_online_map);
6213 return NOTIFY_OK;
6216 void __init sched_init_smp(void)
6218 cpumask_t non_isolated_cpus;
6220 mutex_lock(&sched_hotcpu_mutex);
6221 arch_init_sched_domains(&cpu_online_map);
6222 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
6223 if (cpus_empty(non_isolated_cpus))
6224 cpu_set(smp_processor_id(), non_isolated_cpus);
6225 mutex_unlock(&sched_hotcpu_mutex);
6226 /* XXX: Theoretical race here - CPU may be hotplugged now */
6227 hotcpu_notifier(update_sched_domains, 0);
6229 /* Move init over to a non-isolated CPU */
6230 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
6231 BUG();
6232 sched_init_granularity();
6234 #else
6235 void __init sched_init_smp(void)
6237 sched_init_granularity();
6239 #endif /* CONFIG_SMP */
6241 int in_sched_functions(unsigned long addr)
6243 /* Linker adds these: start and end of __sched functions */
6244 extern char __sched_text_start[], __sched_text_end[];
6246 return in_lock_functions(addr) ||
6247 (addr >= (unsigned long)__sched_text_start
6248 && addr < (unsigned long)__sched_text_end);
6251 static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
6253 cfs_rq->tasks_timeline = RB_ROOT;
6254 cfs_rq->fair_clock = 1;
6255 #ifdef CONFIG_FAIR_GROUP_SCHED
6256 cfs_rq->rq = rq;
6257 #endif
6260 void __init sched_init(void)
6262 u64 now = sched_clock();
6263 int highest_cpu = 0;
6264 int i, j;
6267 * Link up the scheduling class hierarchy:
6269 rt_sched_class.next = &fair_sched_class;
6270 fair_sched_class.next = &idle_sched_class;
6271 idle_sched_class.next = NULL;
6273 for_each_possible_cpu(i) {
6274 struct rt_prio_array *array;
6275 struct rq *rq;
6277 rq = cpu_rq(i);
6278 spin_lock_init(&rq->lock);
6279 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
6280 rq->nr_running = 0;
6281 rq->clock = 1;
6282 init_cfs_rq(&rq->cfs, rq);
6283 #ifdef CONFIG_FAIR_GROUP_SCHED
6284 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6285 list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
6286 #endif
6287 rq->ls.load_update_last = now;
6288 rq->ls.load_update_start = now;
6290 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6291 rq->cpu_load[j] = 0;
6292 #ifdef CONFIG_SMP
6293 rq->sd = NULL;
6294 rq->active_balance = 0;
6295 rq->next_balance = jiffies;
6296 rq->push_cpu = 0;
6297 rq->cpu = i;
6298 rq->migration_thread = NULL;
6299 INIT_LIST_HEAD(&rq->migration_queue);
6300 #endif
6301 atomic_set(&rq->nr_iowait, 0);
6303 array = &rq->rt.active;
6304 for (j = 0; j < MAX_RT_PRIO; j++) {
6305 INIT_LIST_HEAD(array->queue + j);
6306 __clear_bit(j, array->bitmap);
6308 highest_cpu = i;
6309 /* delimiter for bitsearch: */
6310 __set_bit(MAX_RT_PRIO, array->bitmap);
6313 set_load_weight(&init_task);
6315 #ifdef CONFIG_SMP
6316 nr_cpu_ids = highest_cpu + 1;
6317 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
6318 #endif
6320 #ifdef CONFIG_RT_MUTEXES
6321 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
6322 #endif
6325 * The boot idle thread does lazy MMU switching as well:
6327 atomic_inc(&init_mm.mm_count);
6328 enter_lazy_tlb(&init_mm, current);
6331 * Make us the idle thread. Technically, schedule() should not be
6332 * called from this thread, however somewhere below it might be,
6333 * but because we are the idle thread, we just pick up running again
6334 * when this runqueue becomes "idle".
6336 init_idle(current, smp_processor_id());
6338 * During early bootup we pretend to be a normal task:
6340 current->sched_class = &fair_sched_class;
6343 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
6344 void __might_sleep(char *file, int line)
6346 #ifdef in_atomic
6347 static unsigned long prev_jiffy; /* ratelimiting */
6349 if ((in_atomic() || irqs_disabled()) &&
6350 system_state == SYSTEM_RUNNING && !oops_in_progress) {
6351 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6352 return;
6353 prev_jiffy = jiffies;
6354 printk(KERN_ERR "BUG: sleeping function called from invalid"
6355 " context at %s:%d\n", file, line);
6356 printk("in_atomic():%d, irqs_disabled():%d\n",
6357 in_atomic(), irqs_disabled());
6358 debug_show_held_locks(current);
6359 if (irqs_disabled())
6360 print_irqtrace_events(current);
6361 dump_stack();
6363 #endif
6365 EXPORT_SYMBOL(__might_sleep);
6366 #endif
6368 #ifdef CONFIG_MAGIC_SYSRQ
6369 void normalize_rt_tasks(void)
6371 struct task_struct *g, *p;
6372 unsigned long flags;
6373 struct rq *rq;
6374 int on_rq;
6376 read_lock_irq(&tasklist_lock);
6377 do_each_thread(g, p) {
6378 p->se.fair_key = 0;
6379 p->se.wait_runtime = 0;
6380 p->se.wait_start_fair = 0;
6381 p->se.wait_start = 0;
6382 p->se.exec_start = 0;
6383 p->se.sleep_start = 0;
6384 p->se.sleep_start_fair = 0;
6385 p->se.block_start = 0;
6386 task_rq(p)->cfs.fair_clock = 0;
6387 task_rq(p)->clock = 0;
6389 if (!rt_task(p)) {
6391 * Renice negative nice level userspace
6392 * tasks back to 0:
6394 if (TASK_NICE(p) < 0 && p->mm)
6395 set_user_nice(p, 0);
6396 continue;
6399 spin_lock_irqsave(&p->pi_lock, flags);
6400 rq = __task_rq_lock(p);
6401 #ifdef CONFIG_SMP
6403 * Do not touch the migration thread:
6405 if (p == rq->migration_thread)
6406 goto out_unlock;
6407 #endif
6409 on_rq = p->se.on_rq;
6410 if (on_rq)
6411 deactivate_task(task_rq(p), p, 0);
6412 __setscheduler(rq, p, SCHED_NORMAL, 0);
6413 if (on_rq) {
6414 activate_task(task_rq(p), p, 0);
6415 resched_task(rq->curr);
6417 #ifdef CONFIG_SMP
6418 out_unlock:
6419 #endif
6420 __task_rq_unlock(rq);
6421 spin_unlock_irqrestore(&p->pi_lock, flags);
6422 } while_each_thread(g, p);
6424 read_unlock_irq(&tasklist_lock);
6427 #endif /* CONFIG_MAGIC_SYSRQ */
6429 #ifdef CONFIG_IA64
6431 * These functions are only useful for the IA64 MCA handling.
6433 * They can only be called when the whole system has been
6434 * stopped - every CPU needs to be quiescent, and no scheduling
6435 * activity can take place. Using them for anything else would
6436 * be a serious bug, and as a result, they aren't even visible
6437 * under any other configuration.
6441 * curr_task - return the current task for a given cpu.
6442 * @cpu: the processor in question.
6444 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6446 struct task_struct *curr_task(int cpu)
6448 return cpu_curr(cpu);
6452 * set_curr_task - set the current task for a given cpu.
6453 * @cpu: the processor in question.
6454 * @p: the task pointer to set.
6456 * Description: This function must only be used when non-maskable interrupts
6457 * are serviced on a separate stack. It allows the architecture to switch the
6458 * notion of the current task on a cpu in a non-blocking manner. This function
6459 * must be called with all CPUs synchronized and interrupts disabled; the
6460 * caller must save the original value of the current task (see
6461 * curr_task() above) and restore that value before reenabling interrupts and
6462 * re-starting the system.
6464 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6466 void set_curr_task(int cpu, struct task_struct *p)
6468 cpu_curr(cpu) = p;
6471 #endif