kernel/sched.c

   1 /*
   2  *  kernel/sched.c
   3  *
   4  *  Kernel scheduler and related syscalls
   5  *
   6  *  Copyright (C) 1991-2002  Linus Torvalds
   7  *
   8  *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
   9  *              make semaphores SMP safe
  10  *  1998-11-19  Implemented schedule_timeout() and related stuff
  11  *              by Andrea Arcangeli
  12  *  2002-01-04  New ultra-scalable O(1) scheduler by Ingo Molnar:
  13  *              hybrid priority-list and round-robin design with
  14  *              an array-switch method of distributing timeslices
  15  *              and per-CPU runqueues.  Cleanups and useful suggestions
  16  *              by Davide Libenzi, preemptible kernel bits by Robert Love.
  17  */
  18
  19 #include <linux/mm.h>
  20 #include <linux/nmi.h>
  21 #include <linux/init.h>
  22 #include <asm/uaccess.h>
  23 #include <linux/highmem.h>
  24 #include <linux/smp_lock.h>
  25 #include <asm/mmu_context.h>
  26 #include <linux/interrupt.h>
  27 #include <linux/completion.h>
  28 #include <linux/kernel_stat.h>
  29 #include <linux/security.h>
  30 #include <linux/notifier.h>
  31 #include <linux/blkdev.h>
  32 #include <linux/delay.h>
  33 #include <linux/timer.h>
  34 #include <linux/rcupdate.h>
  35 #include <linux/cpu.h>
  36 #include <linux/percpu.h>
  37
  38 #ifdef CONFIG_NUMA
  39 #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu))
  40 #else
  41 #define cpu_to_node_mask(cpu) (cpu_online_map)
  42 #endif
  43
  44 /*
  45  * Convert user-nice values [ -20 ... 0 ... 19 ]
  46  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
  47  * and back.
  48  */
  49 #define NICE_TO_PRIO(nice)      (MAX_RT_PRIO + (nice) + 20)
  50 #define PRIO_TO_NICE(prio)      ((prio) - MAX_RT_PRIO - 20)
  51 #define TASK_NICE(p)            PRIO_TO_NICE((p)->static_prio)
  52
  53 /*
  54  * 'User priority' is the nice value converted to something we
  55  * can work with better when scaling various scheduler parameters,
  56  * it's a [ 0 ... 39 ] range.
  57  */
  58 #define USER_PRIO(p)            ((p)-MAX_RT_PRIO)
  59 #define TASK_USER_PRIO(p)       USER_PRIO((p)->static_prio)
  60 #define MAX_USER_PRIO           (USER_PRIO(MAX_PRIO))
  61
  62 /*
  63  * These are the 'tuning knobs' of the scheduler:
  64  *
  65  * Minimum timeslice is 10 msecs, default timeslice is 100 msecs,
  66  * maximum timeslice is 200 msecs. Timeslices get refilled after
  67  * they expire.
  68  */
  69 #define MIN_TIMESLICE           ( 10 * HZ / 1000)
  70 #define MAX_TIMESLICE           (200 * HZ / 1000)
  71 #define CHILD_PENALTY           50
  72 #define PARENT_PENALTY          100
  73 #define EXIT_WEIGHT             3
  74 #define PRIO_BONUS_RATIO        25
  75 #define INTERACTIVE_DELTA       2
  76 #define MAX_SLEEP_AVG           (10*HZ)
  77 #define STARVATION_LIMIT        (10*HZ)
  78 #define NODE_THRESHOLD          125
  79
  80 /*
  81  * If a task is 'interactive' then we reinsert it in the active
  82  * array after it has expired its current timeslice. (it will not
  83  * continue to run immediately, it will still roundrobin with
  84  * other interactive tasks.)
  85  *
  86  * This part scales the interactivity limit depending on niceness.
  87  *
  88  * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
  89  * Here are a few examples of different nice levels:
  90  *
  91  *  TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
  92  *  TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
  93  *  TASK_INTERACTIVE(  0): [1,1,1,1,0,0,0,0,0,0,0]
  94  *  TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
  95  *  TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
  96  *
  97  * (the X axis represents the possible -5 ... 0 ... +5 dynamic
  98  *  priority range a task can explore, a value of '1' means the
  99  *  task is rated interactive.)
 100  *
 101  * Ie. nice +19 tasks can never get 'interactive' enough to be
 102  * reinserted into the active array. And only heavily CPU-hog nice -20
 103  * tasks will be expired. Default nice 0 tasks are somewhere between,
 104  * it takes some effort for them to get interactive, but it's not
 105  * too hard.
 106  */
 107
 108 #define SCALE(v1,v1_max,v2_max) \
 109         (v1) * (v2_max) / (v1_max)
 110
 111 #define DELTA(p) \
 112         (SCALE(TASK_NICE(p), 40, MAX_USER_PRIO*PRIO_BONUS_RATIO/100) + \
 113                 INTERACTIVE_DELTA)
 114
 115 #define TASK_INTERACTIVE(p) \
 116         ((p)->prio <= (p)->static_prio - DELTA(p))
 117
 118 /*
 119  * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ]
 120  * to time slice values.
 121  *
 122  * The higher a thread's priority, the bigger timeslices
 123  * it gets during one round of execution. But even the lowest
 124  * priority thread gets MIN_TIMESLICE worth of execution time.
 125  *
 126  * task_timeslice() is the interface that is used by the scheduler.
 127  */
 128
 129 #define BASE_TIMESLICE(p) (MIN_TIMESLICE + \
 130         ((MAX_TIMESLICE - MIN_TIMESLICE) * (MAX_PRIO-1-(p)->static_prio)/(MAX_USER_PRIO - 1)))
 131
 132 static inline unsigned int task_timeslice(task_t *p)
 133 {
 134         return BASE_TIMESLICE(p);
 135 }
 136
 137 /*
 138  * These are the runqueue data structures:
 139  */
 140
 141 #define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
 142
 143 typedef struct runqueue runqueue_t;
 144
 145 struct prio_array {
 146         int nr_active;
 147         unsigned long bitmap[BITMAP_SIZE];
 148         struct list_head queue[MAX_PRIO];
 149 };
 150
 151 /*
 152  * This is the main, per-CPU runqueue data structure.
 153  *
 154  * Locking rule: those places that want to lock multiple runqueues
 155  * (such as the load balancing or the thread migration code), lock
 156  * acquire operations must be ordered by ascending &runqueue.
 157  */
 158 struct runqueue {
 159         spinlock_t lock;
 160         unsigned long nr_running, nr_switches, expired_timestamp,
 161                         nr_uninterruptible;
 162         task_t *curr, *idle;
 163         struct mm_struct *prev_mm;
 164         prio_array_t *active, *expired, arrays[2];
 165         int prev_cpu_load[NR_CPUS];
 166 #ifdef CONFIG_NUMA
 167         atomic_t *node_nr_running;
 168         int prev_node_load[MAX_NUMNODES];
 169 #endif
 170         task_t *migration_thread;
 171         struct list_head migration_queue;
 172
 173         atomic_t nr_iowait;
 174 };
 175
 176 static DEFINE_PER_CPU(struct runqueue, runqueues);
 177
 178 #define cpu_rq(cpu)             (&per_cpu(runqueues, (cpu)))
 179 #define this_rq()               (&__get_cpu_var(runqueues))
 180 #define task_rq(p)              cpu_rq(task_cpu(p))
 181 #define cpu_curr(cpu)           (cpu_rq(cpu)->curr)
 182 #define rt_task(p)              ((p)->prio < MAX_RT_PRIO)
 183
 184 /*
 185  * Default context-switch locking:
 186  */
 187 #ifndef prepare_arch_switch
 188 # define prepare_arch_switch(rq, next)  do { } while(0)
 189 # define finish_arch_switch(rq, next)   spin_unlock_irq(&(rq)->lock)
 190 # define task_running(rq, p)            ((rq)->curr == (p))
 191 #endif
 192
 193 #ifdef CONFIG_NUMA
 194
 195 /*
 196  * Keep track of running tasks.
 197  */
 198
 199 static atomic_t node_nr_running[MAX_NUMNODES] ____cacheline_maxaligned_in_smp =
 200         {[0 ...MAX_NUMNODES-1] = ATOMIC_INIT(0)};
 201
 202 static inline void nr_running_init(struct runqueue *rq)
 203 {
 204         rq->node_nr_running = &node_nr_running[0];
 205 }
 206
 207 static inline void nr_running_inc(runqueue_t *rq)
 208 {
 209         atomic_inc(rq->node_nr_running);
 210         rq->nr_running++;
 211 }
 212
 213 static inline void nr_running_dec(runqueue_t *rq)
 214 {
 215         atomic_dec(rq->node_nr_running);
 216         rq->nr_running--;
 217 }
 218
 219 __init void node_nr_running_init(void)
 220 {
 221         int i;
 222
 223         for (i = 0; i < NR_CPUS; i++) {
 224                 if (cpu_possible(i))
 225                         cpu_rq(i)->node_nr_running =
 226                                 &node_nr_running[cpu_to_node(i)];
 227         }
 228 }
 229
 230 #else /* !CONFIG_NUMA */
 231
 232 # define nr_running_init(rq)   do { } while (0)
 233 # define nr_running_inc(rq)    do { (rq)->nr_running++; } while (0)
 234 # define nr_running_dec(rq)    do { (rq)->nr_running--; } while (0)
 235
 236 #endif /* CONFIG_NUMA */
 237
 238 /*
 239  * task_rq_lock - lock the runqueue a given task resides on and disable
 240  * interrupts.  Note the ordering: we can safely lookup the task_rq without
 241  * explicitly disabling preemption.
 242  */
 243 static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
 244 {
 245         struct runqueue *rq;
 246
 247 repeat_lock_task:
 248         local_irq_save(*flags);
 249         rq = task_rq(p);
 250         spin_lock(&rq->lock);
 251         if (unlikely(rq != task_rq(p))) {
 252                 spin_unlock_irqrestore(&rq->lock, *flags);
 253                 goto repeat_lock_task;
 254         }
 255         return rq;
 256 }
 257
 258 static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
 259 {
 260         spin_unlock_irqrestore(&rq->lock, *flags);
 261 }
 262
 263 /*
 264  * rq_lock - lock a given runqueue and disable interrupts.
 265  */
 266 static inline runqueue_t *this_rq_lock(void)
 267 {
 268         runqueue_t *rq;
 269
 270         local_irq_disable();
 271         rq = this_rq();
 272         spin_lock(&rq->lock);
 273
 274         return rq;
 275 }
 276
 277 static inline void rq_unlock(runqueue_t *rq)
 278 {
 279         spin_unlock_irq(&rq->lock);
 280 }
 281
 282 /*
 283  * Adding/removing a task to/from a priority array:
 284  */
 285 static inline void dequeue_task(struct task_struct *p, prio_array_t *array)
 286 {
 287         array->nr_active--;
 288         list_del(&p->run_list);
 289         if (list_empty(array->queue + p->prio))
 290                 __clear_bit(p->prio, array->bitmap);
 291 }
 292
 293 static inline void enqueue_task(struct task_struct *p, prio_array_t *array)
 294 {
 295         list_add_tail(&p->run_list, array->queue + p->prio);
 296         __set_bit(p->prio, array->bitmap);
 297         array->nr_active++;
 298         p->array = array;
 299 }
 300
 301 /*
 302  * effective_prio - return the priority that is based on the static
 303  * priority but is modified by bonuses/penalties.
 304  *
 305  * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
 306  * into the -5 ... 0 ... +5 bonus/penalty range.
 307  *
 308  * We use 25% of the full 0...39 priority range so that:
 309  *
 310  * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
 311  * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
 312  *
 313  * Both properties are important to certain workloads.
 314  */
 315 static int effective_prio(task_t *p)
 316 {
 317         int bonus, prio;
 318
 319         if (rt_task(p))
 320                 return p->prio;
 321
 322         bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 -
 323                         MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2;
 324
 325         prio = p->static_prio - bonus;
 326         if (prio < MAX_RT_PRIO)
 327                 prio = MAX_RT_PRIO;
 328         if (prio > MAX_PRIO-1)
 329                 prio = MAX_PRIO-1;
 330         return prio;
 331 }
 332
 333 /*
 334  * __activate_task - move a task to the runqueue.
 335  */
 336 static inline void __activate_task(task_t *p, runqueue_t *rq)
 337 {
 338         enqueue_task(p, rq->active);
 339         nr_running_inc(rq);
 340 }
 341
 342 /*
 343  * activate_task - move a task to the runqueue and do priority recalculation
 344  *
 345  * Update all the scheduling statistics stuff. (sleep average
 346  * calculation, priority modifiers, etc.)
 347  */
 348 static inline void activate_task(task_t *p, runqueue_t *rq)
 349 {
 350         long sleep_time = jiffies - p->last_run - 1;
 351
 352         if (sleep_time > 0) {
 353                 int sleep_avg;
 354
 355                 /*
 356                  * This code gives a bonus to interactive tasks.
 357                  *
 358                  * The boost works by updating the 'average sleep time'
 359                  * value here, based on ->last_run. The more time a task
 360                  * spends sleeping, the higher the average gets - and the
 361                  * higher the priority boost gets as well.
 362                  */
 363                 sleep_avg = p->sleep_avg + sleep_time;
 364
 365                 /*
 366                  * 'Overflow' bonus ticks go to the waker as well, so the
 367                  * ticks are not lost. This has the effect of further
 368                  * boosting tasks that are related to maximum-interactive
 369                  * tasks.
 370                  */
 371                 if (sleep_avg > MAX_SLEEP_AVG)
 372                         sleep_avg = MAX_SLEEP_AVG;
 373                 if (p->sleep_avg != sleep_avg) {
 374                         p->sleep_avg = sleep_avg;
 375                         p->prio = effective_prio(p);
 376                 }
 377         }
 378         __activate_task(p, rq);
 379 }
 380
 381 /*
 382  * deactivate_task - remove a task from the runqueue.
 383  */
 384 static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
 385 {
 386         nr_running_dec(rq);
 387         if (p->state == TASK_UNINTERRUPTIBLE)
 388                 rq->nr_uninterruptible++;
 389         dequeue_task(p, p->array);
 390         p->array = NULL;
 391 }
 392
 393 /*
 394  * resched_task - mark a task 'to be rescheduled now'.
 395  *
 396  * On UP this means the setting of the need_resched flag, on SMP it
 397  * might also involve a cross-CPU call to trigger the scheduler on
 398  * the target CPU.
 399  */
 400 static inline void resched_task(task_t *p)
 401 {
 402 #ifdef CONFIG_SMP
 403         int need_resched, nrpolling;
 404
 405         preempt_disable();
 406         /* minimise the chance of sending an interrupt to poll_idle() */
 407         nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
 408         need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED);
 409         nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
 410
 411         if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id()))
 412                 smp_send_reschedule(task_cpu(p));
 413         preempt_enable();
 414 #else
 415         set_tsk_need_resched(p);
 416 #endif
 417 }
 418
 419 #ifdef CONFIG_SMP
 420
 421 /*
 422  * wait_task_inactive - wait for a thread to unschedule.
 423  *
 424  * The caller must ensure that the task *will* unschedule sometime soon,
 425  * else this function might spin for a *long* time. This function can't
 426  * be called with interrupts off, or it may introduce deadlock with
 427  * smp_call_function() if an IPI is sent by the same process we are
 428  * waiting to become inactive.
 429  */
 430 void wait_task_inactive(task_t * p)
 431 {
 432         unsigned long flags;
 433         runqueue_t *rq;
 434
 435 repeat:
 436         preempt_disable();
 437         rq = task_rq(p);
 438         if (unlikely(task_running(rq, p))) {
 439                 cpu_relax();
 440                 /*
 441                  * enable/disable preemption just to make this
 442                  * a preemption point - we are busy-waiting
 443                  * anyway.
 444                  */
 445                 preempt_enable();
 446                 goto repeat;
 447         }
 448         rq = task_rq_lock(p, &flags);
 449         if (unlikely(task_running(rq, p))) {
 450                 task_rq_unlock(rq, &flags);
 451                 preempt_enable();
 452                 goto repeat;
 453         }
 454         task_rq_unlock(rq, &flags);
 455         preempt_enable();
 456 }
 457 #endif
 458
 459 /***
 460  * try_to_wake_up - wake up a thread
 461  * @p: the to-be-woken-up thread
 462  * @state: the mask of task states that can be woken
 463  * @sync: do a synchronous wakeup?
 464  * @kick: kick the CPU if the task is already running?
 465  *
 466  * Put it on the run-queue if it's not already there. The "current"
 467  * thread is always on the run-queue (except when the actual
 468  * re-schedule is in progress), and as such you're allowed to do
 469  * the simpler "current->state = TASK_RUNNING" to mark yourself
 470  * runnable without the overhead of this.
 471  *
 472  * returns failure only if the task is already active.
 473  */
 474 static int try_to_wake_up(task_t * p, unsigned int state, int sync, int kick)
 475 {
 476         unsigned long flags;
 477         int success = 0;
 478         long old_state;
 479         runqueue_t *rq;
 480
 481 repeat_lock_task:
 482         rq = task_rq_lock(p, &flags);
 483         old_state = p->state;
 484         if (old_state & state) {
 485                 if (!p->array) {
 486                         /*
 487                          * Fast-migrate the task if it's not running or runnable
 488                          * currently. Do not violate hard affinity.
 489                          */
 490                         if (unlikely(sync && !task_running(rq, p) &&
 491                                 (task_cpu(p) != smp_processor_id()) &&
 492                                 (p->cpus_allowed & (1UL << smp_processor_id())))) {
 493
 494                                 set_task_cpu(p, smp_processor_id());
 495                                 task_rq_unlock(rq, &flags);
 496                                 goto repeat_lock_task;
 497                         }
 498                         if (old_state == TASK_UNINTERRUPTIBLE)
 499                                 rq->nr_uninterruptible--;
 500                         if (sync)
 501                                 __activate_task(p, rq);
 502                         else {
 503                                 activate_task(p, rq);
 504                                 if (p->prio < rq->curr->prio)
 505                                         resched_task(rq->curr);
 506                         }
 507                         success = 1;
 508                 }
 509 #ifdef CONFIG_SMP
 510                 else
 511                         if (unlikely(kick) && task_running(rq, p) && (task_cpu(p) != smp_processor_id()))
 512                                 smp_send_reschedule(task_cpu(p));
 513 #endif
 514                 p->state = TASK_RUNNING;
 515         }
 516         task_rq_unlock(rq, &flags);
 517
 518         return success;
 519 }
 520
 521 int wake_up_process(task_t * p)
 522 {
 523         return try_to_wake_up(p, TASK_STOPPED | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0, 0);
 524 }
 525
 526 int wake_up_process_kick(task_t * p)
 527 {
 528         return try_to_wake_up(p, TASK_STOPPED | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0, 1);
 529 }
 530
 531 int wake_up_state(task_t *p, unsigned int state)
 532 {
 533         return try_to_wake_up(p, state, 0, 0);
 534 }
 535
 536 /*
 537  * wake_up_forked_process - wake up a freshly forked process.
 538  *
 539  * This function will do some initial scheduler statistics housekeeping
 540  * that must be done for every newly created process.
 541  */
 542 void wake_up_forked_process(task_t * p)
 543 {
 544         unsigned long flags;
 545         runqueue_t *rq = task_rq_lock(current, &flags);
 546
 547         p->state = TASK_RUNNING;
 548         /*
 549          * We decrease the sleep average of forking parents
 550          * and children as well, to keep max-interactive tasks
 551          * from forking tasks that are max-interactive.
 552          */
 553         current->sleep_avg = current->sleep_avg * PARENT_PENALTY / 100;
 554         p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100;
 555         p->prio = effective_prio(p);
 556         set_task_cpu(p, smp_processor_id());
 557
 558         if (unlikely(!current->array))
 559                 __activate_task(p, rq);
 560         else {
 561                 p->prio = current->prio;
 562                 list_add_tail(&p->run_list, &current->run_list);
 563                 p->array = current->array;
 564                 p->array->nr_active++;
 565                 nr_running_inc(rq);
 566         }
 567         task_rq_unlock(rq, &flags);
 568 }
 569
 570 /*
 571  * Potentially available exiting-child timeslices are
 572  * retrieved here - this way the parent does not get
 573  * penalized for creating too many threads.
 574  *
 575  * (this cannot be used to 'generate' timeslices
 576  * artificially, because any timeslice recovered here
 577  * was given away by the parent in the first place.)
 578  */
 579 void sched_exit(task_t * p)
 580 {
 581         unsigned long flags;
 582
 583         local_irq_save(flags);
 584         if (p->first_time_slice) {
 585                 p->parent->time_slice += p->time_slice;
 586                 if (unlikely(p->parent->time_slice > MAX_TIMESLICE))
 587                         p->parent->time_slice = MAX_TIMESLICE;
 588         }
 589         local_irq_restore(flags);
 590         /*
 591          * If the child was a (relative-) CPU hog then decrease
 592          * the sleep_avg of the parent as well.
 593          */
 594         if (p->sleep_avg < p->parent->sleep_avg)
 595                 p->parent->sleep_avg = (p->parent->sleep_avg * EXIT_WEIGHT +
 596                         p->sleep_avg) / (EXIT_WEIGHT + 1);
 597 }
 598
 599 /**
 600  * finish_task_switch - clean up after a task-switch
 601  * @prev: the thread we just switched away from.
 602  *
 603  * We enter this with the runqueue still locked, and finish_arch_switch()
 604  * will unlock it along with doing any other architecture-specific cleanup
 605  * actions.
 606  *
 607  * Note that we may have delayed dropping an mm in context_switch(). If
 608  * so, we finish that here outside of the runqueue lock.  (Doing it
 609  * with the lock held can cause deadlocks; see schedule() for
 610  * details.)
 611  */
 612 static inline void finish_task_switch(task_t *prev)
 613 {
 614         runqueue_t *rq = this_rq();
 615         struct mm_struct *mm = rq->prev_mm;
 616
 617         rq->prev_mm = NULL;
 618         finish_arch_switch(rq, prev);
 619         if (mm)
 620                 mmdrop(mm);
 621         if (prev->state & (TASK_DEAD | TASK_ZOMBIE))
 622                 put_task_struct(prev);
 623 }
 624
 625 /**
 626  * schedule_tail - first thing a freshly forked thread must call.
 627  * @prev: the thread we just switched away from.
 628  */
 629 asmlinkage void schedule_tail(task_t *prev)
 630 {
 631         finish_task_switch(prev);
 632
 633         if (current->set_child_tid)
 634                 put_user(current->pid, current->set_child_tid);
 635 }
 636
 637 /*
 638  * context_switch - switch to the new MM and the new
 639  * thread's register state.
 640  */
 641 static inline task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next)
 642 {
 643         struct mm_struct *mm = next->mm;
 644         struct mm_struct *oldmm = prev->active_mm;
 645
 646         if (unlikely(!mm)) {
 647                 next->active_mm = oldmm;
 648                 atomic_inc(&oldmm->mm_count);
 649                 enter_lazy_tlb(oldmm, next);
 650         } else
 651                 switch_mm(oldmm, mm, next);
 652
 653         if (unlikely(!prev->mm)) {
 654                 prev->active_mm = NULL;
 655                 WARN_ON(rq->prev_mm);
 656                 rq->prev_mm = oldmm;
 657         }
 658
 659         /* Here we just switch the register state and the stack. */
 660         switch_to(prev, next, prev);
 661
 662         return prev;
 663 }
 664
 665 /*
 666  * nr_running, nr_uninterruptible and nr_context_switches:
 667  *
 668  * externally visible scheduler statistics: current number of runnable
 669  * threads, current number of uninterruptible-sleeping threads, total
 670  * number of context switches performed since bootup.
 671  */
 672 unsigned long nr_running(void)
 673 {
 674         unsigned long i, sum = 0;
 675
 676         for (i = 0; i < NR_CPUS; i++)
 677                 sum += cpu_rq(i)->nr_running;
 678
 679         return sum;
 680 }
 681
 682 unsigned long nr_uninterruptible(void)
 683 {
 684         unsigned long i, sum = 0;
 685
 686         for (i = 0; i < NR_CPUS; i++) {
 687                 if (!cpu_online(i))
 688                         continue;
 689                 sum += cpu_rq(i)->nr_uninterruptible;
 690         }
 691         return sum;
 692 }
 693
 694 unsigned long nr_context_switches(void)
 695 {
 696         unsigned long i, sum = 0;
 697
 698         for (i = 0; i < NR_CPUS; i++) {
 699                 if (!cpu_online(i))
 700                         continue;
 701                 sum += cpu_rq(i)->nr_switches;
 702         }
 703         return sum;
 704 }
 705
 706 unsigned long nr_iowait(void)
 707 {
 708         unsigned long i, sum = 0;
 709
 710         for (i = 0; i < NR_CPUS; ++i) {
 711                 if (!cpu_online(i))
 712                         continue;
 713                 sum += atomic_read(&cpu_rq(i)->nr_iowait);
 714         }
 715         return sum;
 716 }
 717
 718 /*
 719  * double_rq_lock - safely lock two runqueues
 720  *
 721  * Note this does not disable interrupts like task_rq_lock,
 722  * you need to do so manually before calling.
 723  */
 724 static inline void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
 725 {
 726         if (rq1 == rq2)
 727                 spin_lock(&rq1->lock);
 728         else {
 729                 if (rq1 < rq2) {
 730                         spin_lock(&rq1->lock);
 731                         spin_lock(&rq2->lock);
 732                 } else {
 733                         spin_lock(&rq2->lock);
 734                         spin_lock(&rq1->lock);
 735                 }
 736         }
 737 }
 738
 739 /*
 740  * double_rq_unlock - safely unlock two runqueues
 741  *
 742  * Note this does not restore interrupts like task_rq_unlock,
 743  * you need to do so manually after calling.
 744  */
 745 static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
 746 {
 747         spin_unlock(&rq1->lock);
 748         if (rq1 != rq2)
 749                 spin_unlock(&rq2->lock);
 750 }
 751
 752 #ifdef CONFIG_NUMA
 753 /*
 754  * If dest_cpu is allowed for this process, migrate the task to it.
 755  * This is accomplished by forcing the cpu_allowed mask to only
 756  * allow dest_cpu, which will force the cpu onto dest_cpu.  Then
 757  * the cpu_allowed mask is restored.
 758  */
 759 static void sched_migrate_task(task_t *p, int dest_cpu)
 760 {
 761         unsigned long old_mask;
 762
 763         old_mask = p->cpus_allowed;
 764         if (!(old_mask & (1UL << dest_cpu)))
 765                 return;
 766         /* force the process onto the specified CPU */
 767         set_cpus_allowed(p, 1UL << dest_cpu);
 768
 769         /* restore the cpus allowed mask */
 770         set_cpus_allowed(p, old_mask);
 771 }
 772
 773 /*
 774  * Find the least loaded CPU.  Slightly favor the current CPU by
 775  * setting its runqueue length as the minimum to start.
 776  */
 777 static int sched_best_cpu(struct task_struct *p)
 778 {
 779         int i, minload, load, best_cpu, node = 0;
 780         unsigned long cpumask;
 781
 782         best_cpu = task_cpu(p);
 783         if (cpu_rq(best_cpu)->nr_running <= 2)
 784                 return best_cpu;
 785
 786         minload = 10000000;
 787         for_each_node_with_cpus(i) {
 788                 /*
 789                  * Node load is always divided by nr_cpus_node to normalise
 790                  * load values in case cpu count differs from node to node.
 791                  * We first multiply node_nr_running by 10 to get a little
 792                  * better resolution.
 793                  */
 794                 load = 10 * atomic_read(&node_nr_running[i]) / nr_cpus_node(i);
 795                 if (load < minload) {
 796                         minload = load;
 797                         node = i;
 798                 }
 799         }
 800
 801         minload = 10000000;
 802         cpumask = node_to_cpumask(node);
 803         for (i = 0; i < NR_CPUS; ++i) {
 804                 if (!(cpumask & (1UL << i)))
 805                         continue;
 806                 if (cpu_rq(i)->nr_running < minload) {
 807                         best_cpu = i;
 808                         minload = cpu_rq(i)->nr_running;
 809                 }
 810         }
 811         return best_cpu;
 812 }
 813
 814 void sched_balance_exec(void)
 815 {
 816         int new_cpu;
 817
 818         if (numnodes > 1) {
 819                 new_cpu = sched_best_cpu(current);
 820                 if (new_cpu != smp_processor_id())
 821                         sched_migrate_task(current, new_cpu);
 822         }
 823 }
 824
 825 /*
 826  * Find the busiest node. All previous node loads contribute with a
 827  * geometrically deccaying weight to the load measure:
 828  *      load_{t} = load_{t-1}/2 + nr_node_running_{t}
 829  * This way sudden load peaks are flattened out a bit.
 830  * Node load is divided by nr_cpus_node() in order to compare nodes
 831  * of different cpu count but also [first] multiplied by 10 to
 832  * provide better resolution.
 833  */
 834 static int find_busiest_node(int this_node)
 835 {
 836         int i, node = -1, load, this_load, maxload;
 837
 838         if (!nr_cpus_node(this_node))
 839                 return node;
 840         this_load = maxload = (this_rq()->prev_node_load[this_node] >> 1)
 841                 + (10 * atomic_read(&node_nr_running[this_node])
 842                 / nr_cpus_node(this_node));
 843         this_rq()->prev_node_load[this_node] = this_load;
 844         for_each_node_with_cpus(i) {
 845                 if (i == this_node)
 846                         continue;
 847                 load = (this_rq()->prev_node_load[i] >> 1)
 848                         + (10 * atomic_read(&node_nr_running[i])
 849                         / nr_cpus_node(i));
 850                 this_rq()->prev_node_load[i] = load;
 851                 if (load > maxload && (100*load > NODE_THRESHOLD*this_load)) {
 852                         maxload = load;
 853                         node = i;
 854                 }
 855         }
 856         return node;
 857 }
 858
 859 #endif /* CONFIG_NUMA */
 860
 861 #ifdef CONFIG_SMP
 862
 863 /*
 864  * double_lock_balance - lock the busiest runqueue
 865  *
 866  * this_rq is locked already. Recalculate nr_running if we have to
 867  * drop the runqueue lock.
 868  */
 869 static inline unsigned int double_lock_balance(runqueue_t *this_rq,
 870         runqueue_t *busiest, int this_cpu, int idle, unsigned int nr_running)
 871 {
 872         if (unlikely(!spin_trylock(&busiest->lock))) {
 873                 if (busiest < this_rq) {
 874                         spin_unlock(&this_rq->lock);
 875                         spin_lock(&busiest->lock);
 876                         spin_lock(&this_rq->lock);
 877                         /* Need to recalculate nr_running */
 878                         if (idle || (this_rq->nr_running > this_rq->prev_cpu_load[this_cpu]))
 879                                 nr_running = this_rq->nr_running;
 880                         else
 881                                 nr_running = this_rq->prev_cpu_load[this_cpu];
 882                 } else
 883                         spin_lock(&busiest->lock);
 884         }
 885         return nr_running;
 886 }
 887
 888 /*
 889  * find_busiest_queue - find the busiest runqueue among the cpus in cpumask.
 890  */
 891 static inline runqueue_t *find_busiest_queue(runqueue_t *this_rq, int this_cpu, int idle, int *imbalance, unsigned long cpumask)
 892 {
 893         int nr_running, load, max_load, i;
 894         runqueue_t *busiest, *rq_src;
 895
 896         /*
 897          * We search all runqueues to find the most busy one.
 898          * We do this lockless to reduce cache-bouncing overhead,
 899          * we re-check the 'best' source CPU later on again, with
 900          * the lock held.
 901          *
 902          * We fend off statistical fluctuations in runqueue lengths by
 903          * saving the runqueue length during the previous load-balancing
 904          * operation and using the smaller one the current and saved lengths.
 905          * If a runqueue is long enough for a longer amount of time then
 906          * we recognize it and pull tasks from it.
 907          *
 908          * The 'current runqueue length' is a statistical maximum variable,
 909          * for that one we take the longer one - to avoid fluctuations in
 910          * the other direction. So for a load-balance to happen it needs
 911          * stable long runqueue on the target CPU and stable short runqueue
 912          * on the local runqueue.
 913          *
 914          * We make an exception if this CPU is about to become idle - in
 915          * that case we are less picky about moving a task across CPUs and
 916          * take what can be taken.
 917          */
 918         if (idle || (this_rq->nr_running > this_rq->prev_cpu_load[this_cpu]))
 919                 nr_running = this_rq->nr_running;
 920         else
 921                 nr_running = this_rq->prev_cpu_load[this_cpu];
 922
 923         busiest = NULL;
 924         max_load = 1;
 925         for (i = 0; i < NR_CPUS; i++) {
 926                 if (!((1UL << i) & cpumask))
 927                         continue;
 928
 929                 rq_src = cpu_rq(i);
 930                 if (idle || (rq_src->nr_running < this_rq->prev_cpu_load[i]))
 931                         load = rq_src->nr_running;
 932                 else
 933                         load = this_rq->prev_cpu_load[i];
 934                 this_rq->prev_cpu_load[i] = rq_src->nr_running;
 935
 936                 if ((load > max_load) && (rq_src != this_rq)) {
 937                         busiest = rq_src;
 938                         max_load = load;
 939                 }
 940         }
 941
 942         if (likely(!busiest))
 943                 goto out;
 944
 945         *imbalance = (max_load - nr_running) / 2;
 946
 947         /* It needs an at least ~25% imbalance to trigger balancing. */
 948         if (!idle && (*imbalance < (max_load + 3)/4)) {
 949                 busiest = NULL;
 950                 goto out;
 951         }
 952
 953         nr_running = double_lock_balance(this_rq, busiest, this_cpu, idle, nr_running);
 954         /*
 955          * Make sure nothing changed since we checked the
 956          * runqueue length.
 957          */
 958         if (busiest->nr_running <= nr_running + 1) {
 959                 spin_unlock(&busiest->lock);
 960                 busiest = NULL;
 961         }
 962 out:
 963         return busiest;
 964 }
 965
 966 /*
 967  * pull_task - move a task from a remote runqueue to the local runqueue.
 968  * Both runqueues must be locked.
 969  */
 970 static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, runqueue_t *this_rq, int this_cpu)
 971 {
 972         dequeue_task(p, src_array);
 973         nr_running_dec(src_rq);
 974         set_task_cpu(p, this_cpu);
 975         nr_running_inc(this_rq);
 976         enqueue_task(p, this_rq->active);
 977         /*
 978          * Note that idle threads have a prio of MAX_PRIO, for this test
 979          * to be always true for them.
 980          */
 981         if (p->prio < this_rq->curr->prio)
 982                 set_need_resched();
 983         else {
 984                 if (p->prio == this_rq->curr->prio &&
 985                                 p->time_slice > this_rq->curr->time_slice)
 986                         set_need_resched();
 987         }
 988 }
 989
 990 /*
 991  * Current runqueue is empty, or rebalance tick: if there is an
 992  * inbalance (current runqueue is too short) then pull from
 993  * busiest runqueue(s).
 994  *
 995  * We call this with the current runqueue locked,
 996  * irqs disabled.
 997  */
 998 static void load_balance(runqueue_t *this_rq, int idle, unsigned long cpumask)
 999 {
1000         int imbalance, idx, this_cpu = smp_processor_id();
1001         runqueue_t *busiest;
1002         prio_array_t *array;
1003         struct list_head *head, *curr;
1004         task_t *tmp;
1005
1006         busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance, cpumask);
1007         if (!busiest)
1008                 goto out;
1009
1010         /*
1011          * We first consider expired tasks. Those will likely not be
1012          * executed in the near future, and they are most likely to
1013          * be cache-cold, thus switching CPUs has the least effect
1014          * on them.
1015          */
1016         if (busiest->expired->nr_active)
1017                 array = busiest->expired;
1018         else
1019                 array = busiest->active;
1020
1021 new_array:
1022         /* Start searching at priority 0: */
1023         idx = 0;
1024 skip_bitmap:
1025         if (!idx)
1026                 idx = sched_find_first_bit(array->bitmap);
1027         else
1028                 idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
1029         if (idx >= MAX_PRIO) {
1030                 if (array == busiest->expired) {
1031                         array = busiest->active;
1032                         goto new_array;
1033                 }
1034                 goto out_unlock;
1035         }
1036
1037         head = array->queue + idx;
1038         curr = head->prev;
1039 skip_queue:
1040         tmp = list_entry(curr, task_t, run_list);
1041
1042         /*
1043          * We do not migrate tasks that are:
1044          * 1) running (obviously), or
1045          * 2) cannot be migrated to this CPU due to cpus_allowed, or
1046          * 3) are cache-hot on their current CPU.
1047          */
1048
1049 #define CAN_MIGRATE_TASK(p,rq,this_cpu)                                 \
1050         ((!idle || (jiffies - (p)->last_run > cache_decay_ticks)) &&    \
1051                 !task_running(rq, p) &&                                 \
1052                         ((p)->cpus_allowed & (1UL << (this_cpu))))
1053
1054         curr = curr->prev;
1055
1056         if (!CAN_MIGRATE_TASK(tmp, busiest, this_cpu)) {
1057                 if (curr != head)
1058                         goto skip_queue;
1059                 idx++;
1060                 goto skip_bitmap;
1061         }
1062         pull_task(busiest, array, tmp, this_rq, this_cpu);
1063         if (!idle && --imbalance) {
1064                 if (curr != head)
1065                         goto skip_queue;
1066                 idx++;
1067                 goto skip_bitmap;
1068         }
1069 out_unlock:
1070         spin_unlock(&busiest->lock);
1071 out:
1072         ;
1073 }
1074
1075 /*
1076  * One of the idle_cpu_tick() and busy_cpu_tick() functions will
1077  * get called every timer tick, on every CPU. Our balancing action
1078  * frequency and balancing agressivity depends on whether the CPU is
1079  * idle or not.
1080  *
1081  * busy-rebalance every 200 msecs. idle-rebalance every 1 msec. (or on
1082  * systems with HZ=100, every 10 msecs.)
1083  *
1084  * On NUMA, do a node-rebalance every 400 msecs.
1085  */
1086 #define IDLE_REBALANCE_TICK (HZ/1000 ?: 1)
1087 #define BUSY_REBALANCE_TICK (HZ/5 ?: 1)
1088 #define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * 5)
1089 #define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * 2)
1090
1091 #ifdef CONFIG_NUMA
1092 static void balance_node(runqueue_t *this_rq, int idle, int this_cpu)
1093 {
1094         int node = find_busiest_node(cpu_to_node(this_cpu));
1095         unsigned long cpumask, this_cpumask = 1UL << this_cpu;
1096
1097         if (node >= 0) {
1098                 cpumask = node_to_cpumask(node) | this_cpumask;
1099                 spin_lock(&this_rq->lock);
1100                 load_balance(this_rq, idle, cpumask);
1101                 spin_unlock(&this_rq->lock);
1102         }
1103 }
1104 #endif
1105
1106 static void rebalance_tick(runqueue_t *this_rq, int idle)
1107 {
1108 #ifdef CONFIG_NUMA
1109         int this_cpu = smp_processor_id();
1110 #endif
1111         unsigned long j = jiffies;
1112
1113         /*
1114          * First do inter-node rebalancing, then intra-node rebalancing,
1115          * if both events happen in the same tick. The inter-node
1116          * rebalancing does not necessarily have to create a perfect
1117          * balance within the node, since we load-balance the most loaded
1118          * node with the current CPU. (ie. other CPUs in the local node
1119          * are not balanced.)
1120          */
1121         if (idle) {
1122 #ifdef CONFIG_NUMA
1123                 if (!(j % IDLE_NODE_REBALANCE_TICK))
1124                         balance_node(this_rq, idle, this_cpu);
1125 #endif
1126                 if (!(j % IDLE_REBALANCE_TICK)) {
1127                         spin_lock(&this_rq->lock);
1128                         load_balance(this_rq, idle, cpu_to_node_mask(this_cpu));
1129                         spin_unlock(&this_rq->lock);
1130                 }
1131                 return;
1132         }
1133 #ifdef CONFIG_NUMA
1134         if (!(j % BUSY_NODE_REBALANCE_TICK))
1135                 balance_node(this_rq, idle, this_cpu);
1136 #endif
1137         if (!(j % BUSY_REBALANCE_TICK)) {
1138                 spin_lock(&this_rq->lock);
1139                 load_balance(this_rq, idle, cpu_to_node_mask(this_cpu));
1140                 spin_unlock(&this_rq->lock);
1141         }
1142 }
1143 #else
1144 /*
1145  * on UP we do not need to balance between CPUs:
1146  */
1147 static inline void rebalance_tick(runqueue_t *this_rq, int idle)
1148 {
1149 }
1150 #endif
1151
1152 DEFINE_PER_CPU(struct kernel_stat, kstat) = { { 0 } };
1153
1154 /*
1155  * We place interactive tasks back into the active array, if possible.
1156  *
1157  * To guarantee that this does not starve expired tasks we ignore the
1158  * interactivity of a task if the first expired task had to wait more
1159  * than a 'reasonable' amount of time. This deadline timeout is
1160  * load-dependent, as the frequency of array switched decreases with
1161  * increasing number of running tasks:
1162  */
1163 #define EXPIRED_STARVING(rq) \
1164                 (STARVATION_LIMIT && ((rq)->expired_timestamp && \
1165                 (jiffies - (rq)->expired_timestamp >= \
1166                         STARVATION_LIMIT * ((rq)->nr_running) + 1)))
1167
1168 /*
1169  * This function gets called by the timer code, with HZ frequency.
1170  * We call it with interrupts disabled.
1171  *
1172  * It also gets called by the fork code, when changing the parent's
1173  * timeslices.
1174  */
1175 void scheduler_tick(int user_ticks, int sys_ticks)
1176 {
1177         int cpu = smp_processor_id();
1178         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
1179         runqueue_t *rq = this_rq();
1180         task_t *p = current;
1181
1182         if (rcu_pending(cpu))
1183                 rcu_check_callbacks(cpu, user_ticks);
1184
1185         if (p == rq->idle) {
1186                 /* note: this timer irq context must be accounted for as well */
1187                 if (irq_count() - HARDIRQ_OFFSET >= SOFTIRQ_OFFSET)
1188                         cpustat->system += sys_ticks;
1189                 else if (atomic_read(&rq->nr_iowait) > 0)
1190                         cpustat->iowait += sys_ticks;
1191                 else
1192                         cpustat->idle += sys_ticks;
1193                 rebalance_tick(rq, 1);
1194                 return;
1195         }
1196         if (TASK_NICE(p) > 0)
1197                 cpustat->nice += user_ticks;
1198         else
1199                 cpustat->user += user_ticks;
1200         cpustat->system += sys_ticks;
1201
1202         /* Task might have expired already, but not scheduled off yet */
1203         if (p->array != rq->active) {
1204                 set_tsk_need_resched(p);
1205                 goto out;
1206         }
1207         spin_lock(&rq->lock);
1208         /*
1209          * The task was running during this tick - update the
1210          * time slice counter and the sleep average. Note: we
1211          * do not update a thread's priority until it either
1212          * goes to sleep or uses up its timeslice. This makes
1213          * it possible for interactive tasks to use up their
1214          * timeslices at their highest priority levels.
1215          */
1216         if (p->sleep_avg)
1217                 p->sleep_avg--;
1218         if (unlikely(rt_task(p))) {
1219                 /*
1220                  * RR tasks need a special form of timeslice management.
1221                  * FIFO tasks have no timeslices.
1222                  */
1223                 if ((p->policy == SCHED_RR) && !--p->time_slice) {
1224                         p->time_slice = task_timeslice(p);
1225                         p->first_time_slice = 0;
1226                         set_tsk_need_resched(p);
1227
1228                         /* put it at the end of the queue: */
1229                         dequeue_task(p, rq->active);
1230                         enqueue_task(p, rq->active);
1231                 }
1232                 goto out_unlock;
1233         }
1234         if (!--p->time_slice) {
1235                 dequeue_task(p, rq->active);
1236                 set_tsk_need_resched(p);
1237                 p->prio = effective_prio(p);
1238                 p->time_slice = task_timeslice(p);
1239                 p->first_time_slice = 0;
1240
1241                 if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
1242                         if (!rq->expired_timestamp)
1243                                 rq->expired_timestamp = jiffies;
1244                         enqueue_task(p, rq->expired);
1245                 } else
1246                         enqueue_task(p, rq->active);
1247         }
1248 out_unlock:
1249         spin_unlock(&rq->lock);
1250 out:
1251         rebalance_tick(rq, 0);
1252 }
1253
1254 void scheduling_functions_start_here(void) { }
1255
1256 /*
1257  * schedule() is the main scheduler function.
1258  */
1259 asmlinkage void schedule(void)
1260 {
1261         task_t *prev, *next;
1262         runqueue_t *rq;
1263         prio_array_t *array;
1264         struct list_head *queue;
1265         int idx;
1266
1267         /*
1268          * Test if we are atomic.  Since do_exit() needs to call into
1269          * schedule() atomically, we ignore that path for now.
1270          * Otherwise, whine if we are scheduling when we should not be.
1271          */
1272         if (likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) {
1273                 if (unlikely(in_atomic())) {
1274                         printk(KERN_ERR "bad: scheduling while atomic!\n");
1275                         dump_stack();
1276                 }
1277         }
1278
1279 need_resched:
1280         preempt_disable();
1281         prev = current;
1282         rq = this_rq();
1283
1284         release_kernel_lock(prev);
1285         prev->last_run = jiffies;
1286         spin_lock_irq(&rq->lock);
1287
1288         /*
1289          * if entering off of a kernel preemption go straight
1290          * to picking the next task.
1291          */
1292         if (unlikely(preempt_count() & PREEMPT_ACTIVE))
1293                 goto pick_next_task;
1294
1295         switch (prev->state) {
1296         case TASK_INTERRUPTIBLE:
1297                 if (unlikely(signal_pending(prev))) {
1298                         prev->state = TASK_RUNNING;
1299                         break;
1300                 }
1301         default:
1302                 deactivate_task(prev, rq);
1303         case TASK_RUNNING:
1304                 ;
1305         }
1306 pick_next_task:
1307         if (unlikely(!rq->nr_running)) {
1308 #ifdef CONFIG_SMP
1309                 load_balance(rq, 1, cpu_to_node_mask(smp_processor_id()));
1310                 if (rq->nr_running)
1311                         goto pick_next_task;
1312 #endif
1313                 next = rq->idle;
1314                 rq->expired_timestamp = 0;
1315                 goto switch_tasks;
1316         }
1317
1318         array = rq->active;
1319         if (unlikely(!array->nr_active)) {
1320                 /*
1321                  * Switch the active and expired arrays.
1322                  */
1323                 rq->active = rq->expired;
1324                 rq->expired = array;
1325                 array = rq->active;
1326                 rq->expired_timestamp = 0;
1327         }
1328
1329         idx = sched_find_first_bit(array->bitmap);
1330         queue = array->queue + idx;
1331         next = list_entry(queue->next, task_t, run_list);
1332
1333 switch_tasks:
1334         prefetch(next);
1335         clear_tsk_need_resched(prev);
1336         RCU_qsctr(task_cpu(prev))++;
1337
1338         if (likely(prev != next)) {
1339                 rq->nr_switches++;
1340                 rq->curr = next;
1341
1342                 prepare_arch_switch(rq, next);
1343                 prev = context_switch(rq, prev, next);
1344                 barrier();
1345
1346                 finish_task_switch(prev);
1347         } else
1348                 spin_unlock_irq(&rq->lock);
1349
1350         reacquire_kernel_lock(current);
1351         preempt_enable_no_resched();
1352         if (test_thread_flag(TIF_NEED_RESCHED))
1353                 goto need_resched;
1354 }
1355
1356 #ifdef CONFIG_PREEMPT
1357 /*
1358  * this is is the entry point to schedule() from in-kernel preemption
1359  * off of preempt_enable.  Kernel preemptions off return from interrupt
1360  * occur there and call schedule directly.
1361  */
1362 asmlinkage void preempt_schedule(void)
1363 {
1364         struct thread_info *ti = current_thread_info();
1365
1366         /*
1367          * If there is a non-zero preempt_count or interrupts are disabled,
1368          * we do not want to preempt the current task.  Just return..
1369          */
1370         if (unlikely(ti->preempt_count || irqs_disabled()))
1371                 return;
1372
1373 need_resched:
1374         ti->preempt_count = PREEMPT_ACTIVE;
1375         schedule();
1376         ti->preempt_count = 0;
1377
1378         /* we could miss a preemption opportunity between schedule and now */
1379         barrier();
1380         if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
1381                 goto need_resched;
1382 }
1383 #endif /* CONFIG_PREEMPT */
1384
1385 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync)
1386 {
1387         task_t *p = curr->task;
1388         return try_to_wake_up(p, mode, sync, 0);
1389 }
1390
1391 /*
1392  * The core wakeup function.  Non-exclusive wakeups (nr_exclusive == 0) just
1393  * wake everything up.  If it's an exclusive wakeup (nr_exclusive == small +ve
1394  * number) then we wake all the non-exclusive tasks and one exclusive task.
1395  *
1396  * There are circumstances in which we can try to wake a task which has already
1397  * started to run but is not in state TASK_RUNNING.  try_to_wake_up() returns
1398  * zero in this (rare) case, and we handle it by continuing to scan the queue.
1399  */
1400 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, int sync)
1401 {
1402         struct list_head *tmp, *next;
1403
1404         list_for_each_safe(tmp, next, &q->task_list) {
1405                 wait_queue_t *curr;
1406                 unsigned flags;
1407                 curr = list_entry(tmp, wait_queue_t, task_list);
1408                 flags = curr->flags;
1409                 if (curr->func(curr, mode, sync) &&
1410                     (flags & WQ_FLAG_EXCLUSIVE) &&
1411                     !--nr_exclusive)
1412                         break;
1413         }
1414 }
1415
1416 /**
1417  * __wake_up - wake up threads blocked on a waitqueue.
1418  * @q: the waitqueue
1419  * @mode: which threads
1420  * @nr_exclusive: how many wake-one or wake-many threads to wake up
1421  */
1422 void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
1423 {
1424         unsigned long flags;
1425
1426         spin_lock_irqsave(&q->lock, flags);
1427         __wake_up_common(q, mode, nr_exclusive, 0);
1428         spin_unlock_irqrestore(&q->lock, flags);
1429 }
1430
1431 /*
1432  * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
1433  */
1434 void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
1435 {
1436         __wake_up_common(q, mode, 1, 0);
1437 }
1438
1439 /**
1440  * __wake_up - sync- wake up threads blocked on a waitqueue.
1441  * @q: the waitqueue
1442  * @mode: which threads
1443  * @nr_exclusive: how many wake-one or wake-many threads to wake up
1444  *
1445  * The sync wakeup differs that the waker knows that it will schedule
1446  * away soon, so while the target thread will be woken up, it will not
1447  * be migrated to another CPU - ie. the two threads are 'synchronized'
1448  * with each other. This can prevent needless bouncing between CPUs.
1449  *
1450  * On UP it can prevent extra preemption.
1451  */
1452 void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
1453 {
1454         unsigned long flags;
1455
1456         if (unlikely(!q))
1457                 return;
1458
1459         spin_lock_irqsave(&q->lock, flags);
1460         if (likely(nr_exclusive))
1461                 __wake_up_common(q, mode, nr_exclusive, 1);
1462         else
1463                 __wake_up_common(q, mode, nr_exclusive, 0);
1464         spin_unlock_irqrestore(&q->lock, flags);
1465 }
1466
1467 void complete(struct completion *x)
1468 {
1469         unsigned long flags;
1470
1471         spin_lock_irqsave(&x->wait.lock, flags);
1472         x->done++;
1473         __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1, 0);
1474         spin_unlock_irqrestore(&x->wait.lock, flags);
1475 }
1476
1477 void complete_all(struct completion *x)
1478 {
1479         unsigned long flags;
1480
1481         spin_lock_irqsave(&x->wait.lock, flags);
1482         x->done += UINT_MAX/2;
1483         __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 0, 0);
1484         spin_unlock_irqrestore(&x->wait.lock, flags);
1485 }
1486
1487 void wait_for_completion(struct completion *x)
1488 {
1489         might_sleep();
1490         spin_lock_irq(&x->wait.lock);
1491         if (!x->done) {
1492                 DECLARE_WAITQUEUE(wait, current);
1493
1494                 wait.flags |= WQ_FLAG_EXCLUSIVE;
1495                 __add_wait_queue_tail(&x->wait, &wait);
1496                 do {
1497                         __set_current_state(TASK_UNINTERRUPTIBLE);
1498                         spin_unlock_irq(&x->wait.lock);
1499                         schedule();
1500                         spin_lock_irq(&x->wait.lock);
1501                 } while (!x->done);
1502                 __remove_wait_queue(&x->wait, &wait);
1503         }
1504         x->done--;
1505         spin_unlock_irq(&x->wait.lock);
1506 }
1507
1508 #define SLEEP_ON_VAR                            \
1509         unsigned long flags;                    \
1510         wait_queue_t wait;                      \
1511         init_waitqueue_entry(&wait, current);
1512
1513 #define SLEEP_ON_HEAD                                   \
1514         spin_lock_irqsave(&q->lock,flags);              \
1515         __add_wait_queue(q, &wait);                     \
1516         spin_unlock(&q->lock);
1517
1518 #define SLEEP_ON_TAIL                                           \
1519         spin_lock_irq(&q->lock);                                \
1520         __remove_wait_queue(q, &wait);                          \
1521         spin_unlock_irqrestore(&q->lock, flags);
1522
1523 void interruptible_sleep_on(wait_queue_head_t *q)
1524 {
1525         SLEEP_ON_VAR
1526
1527         current->state = TASK_INTERRUPTIBLE;
1528
1529         SLEEP_ON_HEAD
1530         schedule();
1531         SLEEP_ON_TAIL
1532 }
1533
1534 long interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
1535 {
1536         SLEEP_ON_VAR
1537
1538         current->state = TASK_INTERRUPTIBLE;
1539
1540         SLEEP_ON_HEAD
1541         timeout = schedule_timeout(timeout);
1542         SLEEP_ON_TAIL
1543
1544         return timeout;
1545 }
1546
1547 void sleep_on(wait_queue_head_t *q)
1548 {
1549         SLEEP_ON_VAR
1550
1551         current->state = TASK_UNINTERRUPTIBLE;
1552
1553         SLEEP_ON_HEAD
1554         schedule();
1555         SLEEP_ON_TAIL
1556 }
1557
1558 long sleep_on_timeout(wait_queue_head_t *q, long timeout)
1559 {
1560         SLEEP_ON_VAR
1561
1562         current->state = TASK_UNINTERRUPTIBLE;
1563
1564         SLEEP_ON_HEAD
1565         timeout = schedule_timeout(timeout);
1566         SLEEP_ON_TAIL
1567
1568         return timeout;
1569 }
1570
1571 void scheduling_functions_end_here(void) { }
1572
1573 void set_user_nice(task_t *p, long nice)
1574 {
1575         unsigned long flags;
1576         prio_array_t *array;
1577         runqueue_t *rq;
1578
1579         if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
1580                 return;
1581         /*
1582          * We have to be careful, if called from sys_setpriority(),
1583          * the task might be in the middle of scheduling on another CPU.
1584          */
1585         rq = task_rq_lock(p, &flags);
1586         if (rt_task(p)) {
1587                 p->static_prio = NICE_TO_PRIO(nice);
1588                 goto out_unlock;
1589         }
1590         array = p->array;
1591         if (array)
1592                 dequeue_task(p, array);
1593         p->static_prio = NICE_TO_PRIO(nice);
1594         p->prio = NICE_TO_PRIO(nice);
1595         if (array) {
1596                 enqueue_task(p, array);
1597                 /*
1598                  * If the task is running and lowered its priority,
1599                  * or increased its priority then reschedule its CPU:
1600                  */
1601                 if ((NICE_TO_PRIO(nice) < p->static_prio) ||
1602                                                         task_running(rq, p))
1603                         resched_task(rq->curr);
1604         }
1605 out_unlock:
1606         task_rq_unlock(rq, &flags);
1607 }
1608
1609 #ifndef __alpha__
1610
1611 /*
1612  * sys_nice - change the priority of the current process.
1613  * @increment: priority increment
1614  *
1615  * sys_setpriority is a more generic, but much slower function that
1616  * does similar things.
1617  */
1618 asmlinkage long sys_nice(int increment)
1619 {
1620         int retval;
1621         long nice;
1622
1623         /*
1624          *      Setpriority might change our priority at the same moment.
1625          *      We don't have to worry. Conceptually one call occurs first
1626          *      and we have a single winner.
1627          */
1628         if (increment < 0) {
1629                 if (!capable(CAP_SYS_NICE))
1630                         return -EPERM;
1631                 if (increment < -40)
1632                         increment = -40;
1633         }
1634         if (increment > 40)
1635                 increment = 40;
1636
1637         nice = PRIO_TO_NICE(current->static_prio) + increment;
1638         if (nice < -20)
1639                 nice = -20;
1640         if (nice > 19)
1641                 nice = 19;
1642
1643         retval = security_task_setnice(current, nice);
1644         if (retval)
1645                 return retval;
1646
1647         set_user_nice(current, nice);
1648         return 0;
1649 }
1650
1651 #endif
1652
1653 /**
1654  * task_prio - return the priority value of a given task.
1655  * @p: the task in question.
1656  *
1657  * This is the priority value as seen by users in /proc.
1658  * RT tasks are offset by -200. Normal tasks are centered
1659  * around 0, value goes from -16 to +15.
1660  */
1661 int task_prio(task_t *p)
1662 {
1663         return p->prio - MAX_RT_PRIO;
1664 }
1665
1666 /**
1667  * task_nice - return the nice value of a given task.
1668  * @p: the task in question.
1669  */
1670 int task_nice(task_t *p)
1671 {
1672         return TASK_NICE(p);
1673 }
1674
1675 /**
1676  * task_curr - is this task currently executing on a CPU?
1677  * @p: the task in question.
1678  */
1679 int task_curr(task_t *p)
1680 {
1681         return cpu_curr(task_cpu(p)) == p;
1682 }
1683
1684 /**
1685  * idle_cpu - is a given cpu idle currently?
1686  * @cpu: the processor in question.
1687  */
1688 int idle_cpu(int cpu)
1689 {
1690         return cpu_curr(cpu) == cpu_rq(cpu)->idle;
1691 }
1692
1693 /**
1694  * find_process_by_pid - find a process with a matching PID value.
1695  * @pid: the pid in question.
1696  */
1697 static inline task_t *find_process_by_pid(pid_t pid)
1698 {
1699         return pid ? find_task_by_pid(pid) : current;
1700 }
1701
1702 /*
1703  * setscheduler - change the scheduling policy and/or RT priority of a thread.
1704  */
1705 static int setscheduler(pid_t pid, int policy, struct sched_param __user *param)
1706 {
1707         struct sched_param lp;
1708         int retval = -EINVAL;
1709         int oldprio;
1710         prio_array_t *array;
1711         unsigned long flags;
1712         runqueue_t *rq;
1713         task_t *p;
1714
1715         if (!param || pid < 0)
1716                 goto out_nounlock;
1717
1718         retval = -EFAULT;
1719         if (copy_from_user(&lp, param, sizeof(struct sched_param)))
1720                 goto out_nounlock;
1721
1722         /*
1723          * We play safe to avoid deadlocks.
1724          */
1725         read_lock_irq(&tasklist_lock);
1726
1727         p = find_process_by_pid(pid);
1728
1729         retval = -ESRCH;
1730         if (!p)
1731                 goto out_unlock_tasklist;
1732
1733         /*
1734          * To be able to change p->policy safely, the apropriate
1735          * runqueue lock must be held.
1736          */
1737         rq = task_rq_lock(p, &flags);
1738
1739         if (policy < 0)
1740                 policy = p->policy;
1741         else {
1742                 retval = -EINVAL;
1743                 if (policy != SCHED_FIFO && policy != SCHED_RR &&
1744                                 policy != SCHED_NORMAL)
1745                         goto out_unlock;
1746         }
1747
1748         /*
1749          * Valid priorities for SCHED_FIFO and SCHED_RR are
1750          * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
1751          */
1752         retval = -EINVAL;
1753         if (lp.sched_priority < 0 || lp.sched_priority > MAX_USER_RT_PRIO-1)
1754                 goto out_unlock;
1755         if ((policy == SCHED_NORMAL) != (lp.sched_priority == 0))
1756                 goto out_unlock;
1757
1758         retval = -EPERM;
1759         if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
1760             !capable(CAP_SYS_NICE))
1761                 goto out_unlock;
1762         if ((current->euid != p->euid) && (current->euid != p->uid) &&
1763             !capable(CAP_SYS_NICE))
1764                 goto out_unlock;
1765
1766         retval = security_task_setscheduler(p, policy, &lp);
1767         if (retval)
1768                 goto out_unlock;
1769
1770         array = p->array;
1771         if (array)
1772                 deactivate_task(p, task_rq(p));
1773         retval = 0;
1774         p->policy = policy;
1775         p->rt_priority = lp.sched_priority;
1776         oldprio = p->prio;
1777         if (policy != SCHED_NORMAL)
1778                 p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority;
1779         else
1780                 p->prio = p->static_prio;
1781         if (array) {
1782                 __activate_task(p, task_rq(p));
1783                 /*
1784                  * Reschedule if we are currently running on this runqueue and
1785                  * our priority decreased, or if we are not currently running on
1786                  * this runqueue and our priority is higher than the current's
1787                  */
1788                 if (rq->curr == p) {
1789                         if (p->prio > oldprio)
1790                                 resched_task(rq->curr);
1791                 } else if (p->prio < rq->curr->prio)
1792                         resched_task(rq->curr);
1793         }
1794
1795 out_unlock:
1796         task_rq_unlock(rq, &flags);
1797 out_unlock_tasklist:
1798         read_unlock_irq(&tasklist_lock);
1799
1800 out_nounlock:
1801         return retval;
1802 }
1803
1804 /**
1805  * sys_sched_setscheduler - set/change the scheduler policy and RT priority
1806  * @pid: the pid in question.
1807  * @policy: new policy
1808  * @param: structure containing the new RT priority.
1809  */
1810 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
1811                                       struct sched_param __user *param)
1812 {
1813         return setscheduler(pid, policy, param);
1814 }
1815
1816 /**
1817  * sys_sched_setparam - set/change the RT priority of a thread
1818  * @pid: the pid in question.
1819  * @param: structure containing the new RT priority.
1820  */
1821 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
1822 {
1823         return setscheduler(pid, -1, param);
1824 }
1825
1826 /**
1827  * sys_sched_getscheduler - get the policy (scheduling class) of a thread
1828  * @pid: the pid in question.
1829  */
1830 asmlinkage long sys_sched_getscheduler(pid_t pid)
1831 {
1832         int retval = -EINVAL;
1833         task_t *p;
1834
1835         if (pid < 0)
1836                 goto out_nounlock;
1837
1838         retval = -ESRCH;
1839         read_lock(&tasklist_lock);
1840         p = find_process_by_pid(pid);
1841         if (p) {
1842                 retval = security_task_getscheduler(p);
1843                 if (!retval)
1844                         retval = p->policy;
1845         }
1846         read_unlock(&tasklist_lock);
1847
1848 out_nounlock:
1849         return retval;
1850 }
1851
1852 /**
1853  * sys_sched_getscheduler - get the RT priority of a thread
1854  * @pid: the pid in question.
1855  * @param: structure containing the RT priority.
1856  */
1857 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
1858 {
1859         struct sched_param lp;
1860         int retval = -EINVAL;
1861         task_t *p;
1862
1863         if (!param || pid < 0)
1864                 goto out_nounlock;
1865
1866         read_lock(&tasklist_lock);
1867         p = find_process_by_pid(pid);
1868         retval = -ESRCH;
1869         if (!p)
1870                 goto out_unlock;
1871
1872         retval = security_task_getscheduler(p);
1873         if (retval)
1874                 goto out_unlock;
1875
1876         lp.sched_priority = p->rt_priority;
1877         read_unlock(&tasklist_lock);
1878
1879         /*
1880          * This one might sleep, we cannot do it with a spinlock held ...
1881          */
1882         retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
1883
1884 out_nounlock:
1885         return retval;
1886
1887 out_unlock:
1888         read_unlock(&tasklist_lock);
1889         return retval;
1890 }
1891
1892 /**
1893  * sys_sched_setaffinity - set the cpu affinity of a process
1894  * @pid: pid of the process
1895  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
1896  * @user_mask_ptr: user-space pointer to the new cpu mask
1897  */
1898 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
1899                                       unsigned long __user *user_mask_ptr)
1900 {
1901         unsigned long new_mask;
1902         int retval;
1903         task_t *p;
1904
1905         if (len < sizeof(new_mask))
1906                 return -EINVAL;
1907
1908         if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask)))
1909                 return -EFAULT;
1910
1911         read_lock(&tasklist_lock);
1912
1913         p = find_process_by_pid(pid);
1914         if (!p) {
1915                 read_unlock(&tasklist_lock);
1916                 return -ESRCH;
1917         }
1918
1919         /*
1920          * It is not safe to call set_cpus_allowed with the
1921          * tasklist_lock held.  We will bump the task_struct's
1922          * usage count and then drop tasklist_lock.
1923          */
1924         get_task_struct(p);
1925         read_unlock(&tasklist_lock);
1926
1927         retval = -EPERM;
1928         if ((current->euid != p->euid) && (current->euid != p->uid) &&
1929                         !capable(CAP_SYS_NICE))
1930                 goto out_unlock;
1931
1932         retval = set_cpus_allowed(p, new_mask);
1933
1934 out_unlock:
1935         put_task_struct(p);
1936         return retval;
1937 }
1938
1939 /**
1940  * sys_sched_getaffinity - get the cpu affinity of a process
1941  * @pid: pid of the process
1942  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
1943  * @user_mask_ptr: user-space pointer to hold the current cpu mask
1944  */
1945 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
1946                                       unsigned long __user *user_mask_ptr)
1947 {
1948         unsigned int real_len;
1949         unsigned long mask;
1950         int retval;
1951         task_t *p;
1952
1953         real_len = sizeof(mask);
1954         if (len < real_len)
1955                 return -EINVAL;
1956
1957         read_lock(&tasklist_lock);
1958
1959         retval = -ESRCH;
1960         p = find_process_by_pid(pid);
1961         if (!p)
1962                 goto out_unlock;
1963
1964         retval = 0;
1965         mask = p->cpus_allowed & cpu_online_map;
1966
1967 out_unlock:
1968         read_unlock(&tasklist_lock);
1969         if (retval)
1970                 return retval;
1971         if (copy_to_user(user_mask_ptr, &mask, real_len))
1972                 return -EFAULT;
1973         return real_len;
1974 }
1975
1976 /**
1977  * sys_sched_yield - yield the current processor to other threads.
1978  *
1979  * this function yields the current CPU by moving the calling thread
1980  * to the expired array. If there are no other threads running on this
1981  * CPU then this function will return.
1982  */
1983 asmlinkage long sys_sched_yield(void)
1984 {
1985         runqueue_t *rq = this_rq_lock();
1986         prio_array_t *array = current->array;
1987
1988         /*
1989          * We implement yielding by moving the task into the expired
1990          * queue.
1991          *
1992          * (special rule: RT tasks will just roundrobin in the active
1993          *  array.)
1994          */
1995         if (likely(!rt_task(current))) {
1996                 dequeue_task(current, array);
1997                 enqueue_task(current, rq->expired);
1998         } else {
1999                 list_del(&current->run_list);
2000                 list_add_tail(&current->run_list, array->queue + current->prio);
2001         }
2002         /*
2003          * Since we are going to call schedule() anyway, there's
2004          * no need to preempt:
2005          */
2006         _raw_spin_unlock(&rq->lock);
2007         preempt_enable_no_resched();
2008
2009         schedule();
2010
2011         return 0;
2012 }
2013
2014 void __cond_resched(void)
2015 {
2016         set_current_state(TASK_RUNNING);
2017         schedule();
2018 }
2019
2020 /**
2021  * yield - yield the current processor to other threads.
2022  *
2023  * this is a shortcut for kernel-space yielding - it marks the
2024  * thread runnable and calls sys_sched_yield().
2025  */
2026 void yield(void)
2027 {
2028         set_current_state(TASK_RUNNING);
2029         sys_sched_yield();
2030 }
2031
2032 /*
2033  * This task is about to go to sleep on IO.  Increment rq->nr_iowait so
2034  * that process accounting knows that this is a task in IO wait state.
2035  *
2036  * But don't do that if it is a deliberate, throttling IO wait (this task
2037  * has set its backing_dev_info: the queue against which it should throttle)
2038  */
2039 void io_schedule(void)
2040 {
2041         struct runqueue *rq = this_rq();
2042
2043         atomic_inc(&rq->nr_iowait);
2044         schedule();
2045         atomic_dec(&rq->nr_iowait);
2046 }
2047
2048 long io_schedule_timeout(long timeout)
2049 {
2050         struct runqueue *rq = this_rq();
2051         long ret;
2052
2053         atomic_inc(&rq->nr_iowait);
2054         ret = schedule_timeout(timeout);
2055         atomic_dec(&rq->nr_iowait);
2056         return ret;
2057 }
2058
2059 /**
2060  * sys_sched_get_priority_max - return maximum RT priority.
2061  * @policy: scheduling class.
2062  *
2063  * this syscall returns the maximum rt_priority that can be used
2064  * by a given scheduling class.
2065  */
2066 asmlinkage long sys_sched_get_priority_max(int policy)
2067 {
2068         int ret = -EINVAL;
2069
2070         switch (policy) {
2071         case SCHED_FIFO:
2072         case SCHED_RR:
2073                 ret = MAX_USER_RT_PRIO-1;
2074                 break;
2075         case SCHED_NORMAL:
2076                 ret = 0;
2077                 break;
2078         }
2079         return ret;
2080 }
2081
2082 /**
2083  * sys_sched_get_priority_mix - return minimum RT priority.
2084  * @policy: scheduling class.
2085  *
2086  * this syscall returns the minimum rt_priority that can be used
2087  * by a given scheduling class.
2088  */
2089 asmlinkage long sys_sched_get_priority_min(int policy)
2090 {
2091         int ret = -EINVAL;
2092
2093         switch (policy) {
2094         case SCHED_FIFO:
2095         case SCHED_RR:
2096                 ret = 1;
2097                 break;
2098         case SCHED_NORMAL:
2099                 ret = 0;
2100         }
2101         return ret;
2102 }
2103
2104 /**
2105  * sys_sched_rr_get_interval - return the default timeslice of a process.
2106  * @pid: pid of the process.
2107  * @interval: userspace pointer to the timeslice value.
2108  *
2109  * this syscall writes the default timeslice value of a given process
2110  * into the user-space timespec buffer. A value of '0' means infinity.
2111  */
2112 asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
2113 {
2114         int retval = -EINVAL;
2115         struct timespec t;
2116         task_t *p;
2117
2118         if (pid < 0)
2119                 goto out_nounlock;
2120
2121         retval = -ESRCH;
2122         read_lock(&tasklist_lock);
2123         p = find_process_by_pid(pid);
2124         if (!p)
2125                 goto out_unlock;
2126
2127         retval = security_task_getscheduler(p);
2128         if (retval)
2129                 goto out_unlock;
2130
2131         jiffies_to_timespec(p->policy & SCHED_FIFO ?
2132                                 0 : task_timeslice(p), &t);
2133         read_unlock(&tasklist_lock);
2134         retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
2135 out_nounlock:
2136         return retval;
2137 out_unlock:
2138         read_unlock(&tasklist_lock);
2139         return retval;
2140 }
2141
2142 static inline struct task_struct *eldest_child(struct task_struct *p)
2143 {
2144         if (list_empty(&p->children)) return NULL;
2145         return list_entry(p->children.next,struct task_struct,sibling);
2146 }
2147
2148 static inline struct task_struct *older_sibling(struct task_struct *p)
2149 {
2150         if (p->sibling.prev==&p->parent->children) return NULL;
2151         return list_entry(p->sibling.prev,struct task_struct,sibling);
2152 }
2153
2154 static inline struct task_struct *younger_sibling(struct task_struct *p)
2155 {
2156         if (p->sibling.next==&p->parent->children) return NULL;
2157         return list_entry(p->sibling.next,struct task_struct,sibling);
2158 }
2159
2160 static void show_task(task_t * p)
2161 {
2162         unsigned long free = 0;
2163         task_t *relative;
2164         int state;
2165         static const char * stat_nam[] = { "R", "S", "D", "T", "Z", "W" };
2166
2167         printk("%-13.13s ", p->comm);
2168         state = p->state ? __ffs(p->state) + 1 : 0;
2169         if (((unsigned) state) < sizeof(stat_nam)/sizeof(char *))
2170                 printk(stat_nam[state]);
2171         else
2172                 printk(" ");
2173 #if (BITS_PER_LONG == 32)
2174         if (p == current)
2175                 printk(" current  ");
2176         else
2177                 printk(" %08lX ", thread_saved_pc(p));
2178 #else
2179         if (p == current)
2180                 printk("   current task   ");
2181         else
2182                 printk(" %016lx ", thread_saved_pc(p));
2183 #endif
2184         {
2185                 unsigned long * n = (unsigned long *) (p->thread_info+1);
2186                 while (!*n)
2187                         n++;
2188                 free = (unsigned long) n - (unsigned long)(p+1);
2189         }
2190         printk("%5lu %5d %6d ", free, p->pid, p->parent->pid);
2191         if ((relative = eldest_child(p)))
2192                 printk("%5d ", relative->pid);
2193         else
2194                 printk("      ");
2195         if ((relative = younger_sibling(p)))
2196                 printk("%7d", relative->pid);
2197         else
2198                 printk("       ");
2199         if ((relative = older_sibling(p)))
2200                 printk(" %5d", relative->pid);
2201         else
2202                 printk("      ");
2203         if (!p->mm)
2204                 printk(" (L-TLB)\n");
2205         else
2206                 printk(" (NOTLB)\n");
2207
2208         show_stack(p, NULL);
2209 }
2210
2211 void show_state(void)
2212 {
2213         task_t *g, *p;
2214
2215 #if (BITS_PER_LONG == 32)
2216         printk("\n"
2217                "                         free                        sibling\n");
2218         printk("  task             PC    stack   pid father child younger older\n");
2219 #else
2220         printk("\n"
2221                "                                 free                        sibling\n");
2222         printk("  task                 PC        stack   pid father child younger older\n");
2223 #endif
2224         read_lock(&tasklist_lock);
2225         do_each_thread(g, p) {
2226                 /*
2227                  * reset the NMI-timeout, listing all files on a slow
2228                  * console might take alot of time:
2229                  */
2230                 touch_nmi_watchdog();
2231                 show_task(p);
2232         } while_each_thread(g, p);
2233
2234         read_unlock(&tasklist_lock);
2235 }
2236
2237 void __init init_idle(task_t *idle, int cpu)
2238 {
2239         runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(task_cpu(idle));
2240         unsigned long flags;
2241
2242         local_irq_save(flags);
2243         double_rq_lock(idle_rq, rq);
2244
2245         idle_rq->curr = idle_rq->idle = idle;
2246         deactivate_task(idle, rq);
2247         idle->array = NULL;
2248         idle->prio = MAX_PRIO;
2249         idle->state = TASK_RUNNING;
2250         set_task_cpu(idle, cpu);
2251         double_rq_unlock(idle_rq, rq);
2252         set_tsk_need_resched(idle);
2253         local_irq_restore(flags);
2254
2255         /* Set the preempt count _outside_ the spinlocks! */
2256 #ifdef CONFIG_PREEMPT
2257         idle->thread_info->preempt_count = (idle->lock_depth >= 0);
2258 #else
2259         idle->thread_info->preempt_count = 0;
2260 #endif
2261 }
2262
2263 #ifdef CONFIG_SMP
2264 /*
2265  * This is how migration works:
2266  *
2267  * 1) we queue a migration_req_t structure in the source CPU's
2268  *    runqueue and wake up that CPU's migration thread.
2269  * 2) we down() the locked semaphore => thread blocks.
2270  * 3) migration thread wakes up (implicitly it forces the migrated
2271  *    thread off the CPU)
2272  * 4) it gets the migration request and checks whether the migrated
2273  *    task is still in the wrong runqueue.
2274  * 5) if it's in the wrong runqueue then the migration thread removes
2275  *    it and puts it into the right queue.
2276  * 6) migration thread up()s the semaphore.
2277  * 7) we wake up and the migration is done.
2278  */
2279
2280 typedef struct {
2281         struct list_head list;
2282         task_t *task;
2283         struct completion done;
2284 } migration_req_t;
2285
2286 /*
2287  * Change a given task's CPU affinity. Migrate the thread to a
2288  * proper CPU and schedule it away if the CPU it's executing on
2289  * is removed from the allowed bitmask.
2290  *
2291  * NOTE: the caller must have a valid reference to the task, the
2292  * task must not exit() & deallocate itself prematurely.  The
2293  * call is not atomic; no spinlocks may be held.
2294  */
2295 int set_cpus_allowed(task_t *p, unsigned long new_mask)
2296 {
2297         unsigned long flags;
2298         migration_req_t req;
2299         runqueue_t *rq;
2300
2301         if (any_online_cpu(new_mask) == NR_CPUS)
2302                 return -EINVAL;
2303
2304         rq = task_rq_lock(p, &flags);
2305         p->cpus_allowed = new_mask;
2306         /*
2307          * Can the task run on the task's current CPU? If not then
2308          * migrate the thread off to a proper CPU.
2309          */
2310         if (new_mask & (1UL << task_cpu(p))) {
2311                 task_rq_unlock(rq, &flags);
2312                 return 0;
2313         }
2314         /*
2315          * If the task is not on a runqueue (and not running), then
2316          * it is sufficient to simply update the task's cpu field.
2317          */
2318         if (!p->array && !task_running(rq, p)) {
2319                 set_task_cpu(p, any_online_cpu(p->cpus_allowed));
2320                 task_rq_unlock(rq, &flags);
2321                 return 0;
2322         }
2323         init_completion(&req.done);
2324         req.task = p;
2325         list_add(&req.list, &rq->migration_queue);
2326         task_rq_unlock(rq, &flags);
2327
2328         wake_up_process(rq->migration_thread);
2329
2330         wait_for_completion(&req.done);
2331         return 0;
2332 }
2333
2334 /* Move (not current) task off this cpu, onto dest cpu. */
2335 static void move_task_away(struct task_struct *p, int dest_cpu)
2336 {
2337         runqueue_t *rq_dest;
2338         unsigned long flags;
2339
2340         rq_dest = cpu_rq(dest_cpu);
2341
2342         local_irq_save(flags);
2343         double_rq_lock(this_rq(), rq_dest);
2344         if (task_cpu(p) != smp_processor_id())
2345                 goto out; /* Already moved */
2346
2347         set_task_cpu(p, dest_cpu);
2348         if (p->array) {
2349                 deactivate_task(p, this_rq());
2350                 activate_task(p, rq_dest);
2351                 if (p->prio < rq_dest->curr->prio)
2352                         resched_task(rq_dest->curr);
2353         }
2354  out:
2355         double_rq_unlock(this_rq(), rq_dest);
2356         local_irq_restore(flags);
2357 }
2358
2359 /*
2360  * migration_thread - this is a highprio system thread that performs
2361  * thread migration by bumping thread off CPU then 'pushing' onto
2362  * another runqueue.
2363  */
2364 static int migration_thread(void * data)
2365 {
2366         /* Marking "param" __user is ok, since we do a set_fs(KERNEL_DS); */
2367         struct sched_param __user param = { .sched_priority = MAX_RT_PRIO-1 };
2368         int cpu = (long) data;
2369         runqueue_t *rq;
2370         int ret;
2371
2372         daemonize("migration/%d", cpu);
2373         set_fs(KERNEL_DS);
2374
2375         /*
2376          * Either we are running on the right CPU, or there's a a
2377          * migration thread on this CPU, guaranteed (we're started
2378          * serially).
2379          */
2380         set_cpus_allowed(current, 1UL << cpu);
2381
2382         ret = setscheduler(0, SCHED_FIFO, &param);
2383
2384         rq = this_rq();
2385         rq->migration_thread = current;
2386
2387         for (;;) {
2388                 struct list_head *head;
2389                 migration_req_t *req;
2390
2391                 spin_lock_irq(&rq->lock);
2392                 head = &rq->migration_queue;
2393                 current->state = TASK_INTERRUPTIBLE;
2394                 if (list_empty(head)) {
2395                         spin_unlock_irq(&rq->lock);
2396                         schedule();
2397                         continue;
2398                 }
2399                 req = list_entry(head->next, migration_req_t, list);
2400                 list_del_init(head->next);
2401                 spin_unlock_irq(&rq->lock);
2402
2403                 move_task_away(req->task,
2404                                any_online_cpu(req->task->cpus_allowed));
2405                 complete(&req->done);
2406         }
2407 }
2408
2409 /*
2410  * migration_call - callback that gets triggered when a CPU is added.
2411  * Here we can start up the necessary migration thread for the new CPU.
2412  */
2413 static int migration_call(struct notifier_block *nfb,
2414                           unsigned long action,
2415                           void *hcpu)
2416 {
2417         switch (action) {
2418         case CPU_ONLINE:
2419                 printk("Starting migration thread for cpu %li\n",
2420                        (long)hcpu);
2421                 kernel_thread(migration_thread, hcpu, CLONE_KERNEL);
2422                 while (!cpu_rq((long)hcpu)->migration_thread)
2423                         yield();
2424                 break;
2425         }
2426         return NOTIFY_OK;
2427 }
2428
2429 static struct notifier_block migration_notifier = { &migration_call, NULL, 0 };
2430
2431 __init int migration_init(void)
2432 {
2433         /* Start one for boot CPU. */
2434         migration_call(&migration_notifier, CPU_ONLINE,
2435                        (void *)(long)smp_processor_id());
2436         register_cpu_notifier(&migration_notifier);
2437         return 0;
2438 }
2439
2440 #endif
2441
2442 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
2443 /*
2444  * The 'big kernel lock'
2445  *
2446  * This spinlock is taken and released recursively by lock_kernel()
2447  * and unlock_kernel().  It is transparently dropped and reaquired
2448  * over schedule().  It is used to protect legacy code that hasn't
2449  * been migrated to a proper locking design yet.
2450  *
2451  * Don't use in new code.
2452  */
2453 spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
2454 #endif
2455
2456 static void kstat_init_cpu(int cpu)
2457 {
2458         /* Add any initialisation to kstat here */
2459         /* Useful when cpu offlining logic is added.. */
2460 }
2461
2462 static int __devinit kstat_cpu_notify(struct notifier_block *self,
2463                                         unsigned long action, void *hcpu)
2464 {
2465         int cpu = (unsigned long)hcpu;
2466         switch(action) {
2467         case CPU_UP_PREPARE:
2468                 kstat_init_cpu(cpu);
2469                 break;
2470         default:
2471                 break;
2472         }
2473         return NOTIFY_OK;
2474 }
2475
2476 static struct notifier_block __devinitdata kstat_nb = {
2477         .notifier_call  = kstat_cpu_notify,
2478         .next           = NULL,
2479 };
2480
2481 __init static void init_kstat(void) {
2482         kstat_cpu_notify(&kstat_nb, (unsigned long)CPU_UP_PREPARE,
2483                         (void *)(long)smp_processor_id());
2484         register_cpu_notifier(&kstat_nb);
2485 }
2486
2487 void __init sched_init(void)
2488 {
2489         runqueue_t *rq;
2490         int i, j, k;
2491
2492         /* Init the kstat counters */
2493         init_kstat();
2494         for (i = 0; i < NR_CPUS; i++) {
2495                 prio_array_t *array;
2496
2497                 rq = cpu_rq(i);
2498                 rq->active = rq->arrays;
2499                 rq->expired = rq->arrays + 1;
2500                 spin_lock_init(&rq->lock);
2501                 INIT_LIST_HEAD(&rq->migration_queue);
2502                 atomic_set(&rq->nr_iowait, 0);
2503                 nr_running_init(rq);
2504
2505                 for (j = 0; j < 2; j++) {
2506                         array = rq->arrays + j;
2507                         for (k = 0; k < MAX_PRIO; k++) {
2508                                 INIT_LIST_HEAD(array->queue + k);
2509                                 __clear_bit(k, array->bitmap);
2510                         }
2511                         // delimiter for bitsearch
2512                         __set_bit(MAX_PRIO, array->bitmap);
2513                 }
2514         }
2515         /*
2516          * We have to do a little magic to get the first
2517          * thread right in SMP mode.
2518          */
2519         rq = this_rq();
2520         rq->curr = current;
2521         rq->idle = current;
2522         set_task_cpu(current, smp_processor_id());
2523         wake_up_forked_process(current);
2524
2525         init_timers();
2526
2527         /*
2528          * The boot idle thread does lazy MMU switching as well:
2529          */
2530         atomic_inc(&init_mm.mm_count);
2531         enter_lazy_tlb(&init_mm, current);
2532 }
2533
2534 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
2535 void __might_sleep(char *file, int line)
2536 {
2537 #if defined(in_atomic)
2538         static unsigned long prev_jiffy;        /* ratelimiting */
2539
2540         if (in_atomic() || irqs_disabled()) {
2541                 if (time_before(jiffies, prev_jiffy + HZ))
2542                         return;
2543                 prev_jiffy = jiffies;
2544                 printk(KERN_ERR "Debug: sleeping function called from illegal"
2545                                 " context at %s:%d\n", file, line);
2546                 dump_stack();
2547         }
2548 #endif
2549 }
2550 #endif
2551
2552
2553 #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
2554 /*
2555  * This could be a long-held lock.  If another CPU holds it for a long time,
2556  * and that CPU is not asked to reschedule then *this* CPU will spin on the
2557  * lock for a long time, even if *this* CPU is asked to reschedule.
2558  *
2559  * So what we do here, in the slow (contended) path is to spin on the lock by
2560  * hand while permitting preemption.
2561  *
2562  * Called inside preempt_disable().
2563  */
2564 void __preempt_spin_lock(spinlock_t *lock)
2565 {
2566         if (preempt_count() > 1) {
2567                 _raw_spin_lock(lock);
2568                 return;
2569         }
2570         do {
2571                 preempt_enable();
2572                 while (spin_is_locked(lock))
2573                         cpu_relax();
2574                 preempt_disable();
2575         } while (!_raw_spin_trylock(lock));
2576 }
2577
2578 void __preempt_write_lock(rwlock_t *lock)
2579 {
2580         if (preempt_count() > 1) {
2581                 _raw_write_lock(lock);
2582                 return;
2583         }
2584
2585         do {
2586                 preempt_enable();
2587                 while (rwlock_is_locked(lock))
2588                         cpu_relax();
2589                 preempt_disable();
2590         } while (!_raw_write_trylock(lock));
2591 }
2592 #endif